rezo 1.0.66 → 1.0.68

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/dist/adapters/entries/curl.d.ts +5 -0
  2. package/dist/adapters/entries/fetch.d.ts +5 -0
  3. package/dist/adapters/entries/http.d.ts +5 -0
  4. package/dist/adapters/entries/http2.d.ts +5 -0
  5. package/dist/adapters/entries/react-native.d.ts +5 -0
  6. package/dist/adapters/entries/xhr.d.ts +5 -0
  7. package/dist/adapters/index.cjs +6 -6
  8. package/dist/cache/index.cjs +9 -9
  9. package/dist/crawler/crawler.cjs +26 -5
  10. package/dist/crawler/crawler.js +26 -5
  11. package/dist/crawler/index.cjs +40 -40
  12. package/dist/crawler.d.ts +10 -0
  13. package/dist/entries/crawler.cjs +4 -4
  14. package/dist/index.cjs +27 -27
  15. package/dist/index.d.ts +5 -0
  16. package/dist/internal/agents/index.cjs +10 -10
  17. package/dist/platform/browser.d.ts +5 -0
  18. package/dist/platform/bun.d.ts +5 -0
  19. package/dist/platform/deno.d.ts +5 -0
  20. package/dist/platform/node.d.ts +5 -0
  21. package/dist/platform/react-native.d.ts +5 -0
  22. package/dist/platform/worker.d.ts +5 -0
  23. package/dist/proxy/index.cjs +4 -4
  24. package/dist/proxy/manager.cjs +1 -1
  25. package/dist/proxy/manager.js +1 -1
  26. package/dist/queue/index.cjs +8 -8
  27. package/dist/queue/queue.cjs +3 -1
  28. package/dist/queue/queue.js +3 -1
  29. package/dist/responses/universal/index.cjs +11 -11
  30. package/dist/wget/asset-extractor.cjs +556 -0
  31. package/dist/wget/asset-extractor.js +553 -0
  32. package/dist/wget/asset-organizer.cjs +230 -0
  33. package/dist/wget/asset-organizer.js +227 -0
  34. package/dist/wget/download-cache.cjs +221 -0
  35. package/dist/wget/download-cache.js +218 -0
  36. package/dist/wget/downloader.cjs +607 -0
  37. package/dist/wget/downloader.js +604 -0
  38. package/dist/wget/file-writer.cjs +349 -0
  39. package/dist/wget/file-writer.js +346 -0
  40. package/dist/wget/filter-lists.cjs +1330 -0
  41. package/dist/wget/filter-lists.js +1330 -0
  42. package/dist/wget/index.cjs +633 -0
  43. package/dist/wget/index.d.ts +8486 -0
  44. package/dist/wget/index.js +614 -0
  45. package/dist/wget/link-converter.cjs +297 -0
  46. package/dist/wget/link-converter.js +294 -0
  47. package/dist/wget/progress.cjs +271 -0
  48. package/dist/wget/progress.js +266 -0
  49. package/dist/wget/resume.cjs +166 -0
  50. package/dist/wget/resume.js +163 -0
  51. package/dist/wget/robots.cjs +303 -0
  52. package/dist/wget/robots.js +300 -0
  53. package/dist/wget/types.cjs +200 -0
  54. package/dist/wget/types.js +197 -0
  55. package/dist/wget/url-filter.cjs +351 -0
  56. package/dist/wget/url-filter.js +348 -0
  57. package/package.json +6 -1
@@ -0,0 +1,163 @@
1
+ import { promises as fs } from "node:fs";
2
+
3
/**
 * Helpers for continuing partially downloaded files and for timestamp-based
 * freshness checks against a local file.
 *
 * The class reads two option flags: `continueDownload` (enables resume) and
 * `timestamping` (enables the If-Modified-Since / mtime comparison helpers).
 */
export class ResumeHandler {
  /** Options object supplied by the caller; see the class description. */
  options;

  /**
   * @param {{ continueDownload?: boolean, timestamping?: boolean }} options
   */
  constructor(options) {
    this.options = options;
  }

  /**
   * Inspect the local file and report whether a download can be resumed.
   *
   * @param {string} localPath - Path of the (possibly partial) local file.
   * @returns {Promise<{path: string, bytesDownloaded: number, mtime: Date,
   *   exists: boolean, canResume: boolean}>} Never rejects; any failure to
   *   stat the path is reported as a non-existent file.
   */
  async getResumeInfo(localPath) {
    try {
      const stats = await fs.stat(localPath);
      return {
        path: localPath,
        bytesDownloaded: stats.size,
        mtime: stats.mtime,
        exists: true,
        // Only a non-empty regular file can be appended to. Directories and
        // other special entries also report a size, so check the type too.
        canResume:
          this.options.continueDownload === true &&
          stats.isFile() &&
          stats.size > 0,
      };
    } catch {
      // Best effort: a path that cannot be inspected is treated as absent.
      return {
        path: localPath,
        bytesDownloaded: 0,
        mtime: new Date(0),
        exists: false,
        canResume: false,
      };
    }
  }

  /**
   * Build the request headers for a ranged (resume) request.
   *
   * NOTE(review): `If-Range` carries the local file's mtime. A server honors
   * it only when it equals its own Last-Modified validator, so this presumably
   * relies on files being stamped with the remote time — confirm in the writer.
   *
   * @param {{canResume: boolean, bytesDownloaded: number, mtime: Date}} info
   * @returns {Record<string, string>} Empty object when resuming is not possible.
   */
  getResumeHeaders(info) {
    if (!info.canResume) {
      return {};
    }
    return {
      Range: `bytes=${info.bytesDownloaded}-`,
      "If-Range": info.mtime.toUTCString(),
    };
  }

  /**
   * Build the conditional-request header used when timestamping is enabled.
   *
   * @param {string} localPath
   * @returns {Promise<Record<string, string>>} `If-Modified-Since` with the
   *   local mtime, or an empty object when timestamping is off or the file
   *   cannot be inspected.
   */
  async getTimestampHeaders(localPath) {
    if (!this.options.timestamping) {
      return {};
    }
    try {
      const stats = await fs.stat(localPath);
      return { "If-Modified-Since": stats.mtime.toUTCString() };
    } catch {
      return {};
    }
  }

  /**
   * Compare the remote modification time with the local file's mtime.
   *
   * Differences of one second or less in either direction count as "same".
   *
   * @param {string} localPath
   * @param {Date | null | undefined} remoteMtime
   * @returns {Promise<{shouldDownload: boolean, reason: string,
   *   localMtime?: Date, remoteMtime?: Date}>}
   */
  async checkTimestamp(localPath, remoteMtime) {
    if (!this.options.timestamping || !remoteMtime) {
      return { shouldDownload: true, reason: "no-timestamp" };
    }
    try {
      const stats = await fs.stat(localPath);
      const localMtime = stats.mtime;
      const timeDiff = remoteMtime.getTime() - localMtime.getTime();
      if (timeDiff > 1000) {
        return { shouldDownload: true, reason: "newer", localMtime, remoteMtime };
      }
      if (timeDiff < -1000) {
        return { shouldDownload: false, reason: "older", localMtime, remoteMtime };
      }
      return { shouldDownload: false, reason: "same", localMtime, remoteMtime };
    } catch {
      // The local file could not be inspected, so it has to be downloaded.
      return { shouldDownload: true, reason: "not-found", remoteMtime };
    }
  }

  /**
   * Check that a response really is the requested partial content.
   *
   * @param {number} statusCode
   * @param {string | null | undefined} contentRange - `Content-Range` header value.
   * @param {number} expectedStart - Byte offset that was requested.
   * @returns {boolean} True only for a 206 whose range starts at `expectedStart`.
   */
  isValidPartialResponse(statusCode, contentRange, expectedStart) {
    if (statusCode !== 206) {
      return false;
    }
    const range = this.parseContentRange(contentRange);
    return range !== null && range.start === expectedStart;
  }

  /**
   * Parse a `Content-Range: bytes <start>-<end>/<total|*>` header value.
   *
   * @param {string | null | undefined} contentRange
   * @returns {{start: number, end: number, total: number | null} | null}
   *   `total` is null when the server sent `*`; the result is null when the
   *   header is missing or not in the expected form.
   */
  parseContentRange(contentRange) {
    if (!contentRange) {
      return null;
    }
    const match = contentRange.match(/bytes\s+(\d+)-(\d+)\/(\d+|\*)/i);
    if (!match) {
      return null;
    }
    const [, start, end, total] = match;
    return {
      start: Number.parseInt(start, 10),
      end: Number.parseInt(end, 10),
      total: total === "*" ? null : Number.parseInt(total, 10),
    };
  }

  /**
   * Interpret an `Accept-Ranges` header value.
   *
   * @param {string | null | undefined} acceptRanges
   * @returns {boolean} False only when the server explicitly says `none`;
   *   a missing header is treated as "ranges may work".
   */
  supportsRanges(acceptRanges) {
    if (!acceptRanges) {
      return true;
    }
    // Trim so that padded values such as " none " are still recognized.
    return acceptRanges.trim().toLowerCase() !== "none";
  }

  /**
   * Parse a `Last-Modified` header value.
   *
   * @param {string | null | undefined} lastModified
   * @returns {Date | null} Null when the value is missing or unparseable.
   */
  parseLastModified(lastModified) {
    if (!lastModified) {
      return null;
    }
    try {
      const date = new Date(lastModified);
      return Number.isNaN(date.getTime()) ? null : date;
    } catch {
      // Values that cannot be coerced to a date are treated as unparseable.
      return null;
    }
  }

  /**
   * Decide what to do with a response to a resume / conditional request.
   *
   * @param {number} statusCode
   * @param {string | null | undefined} contentRange
   * @param {{bytesDownloaded: number}} resumeInfo
   * @returns {"skip" | "resume" | "restart"} `skip` for 304 and 416,
   *   `resume` for a valid 206, `restart` for everything else.
   */
  determineAction(statusCode, contentRange, resumeInfo) {
    switch (statusCode) {
      case 304:
      case 416:
        return "skip";
      case 206:
        return this.isValidPartialResponse(
          statusCode,
          contentRange,
          resumeInfo.bytesDownloaded
        )
          ? "resume"
          : "restart";
      default:
        return "restart";
    }
  }

  /**
   * Shallow-merge new option values over the current ones.
   *
   * @param {object} options
   */
  updateOptions(options) {
    this.options = { ...this.options, ...options };
  }
}
163
+ export default ResumeHandler;
@@ -0,0 +1,303 @@
1
/** Product token used for rule matching when no user agent is configured. */
const DEFAULT_USER_AGENT = "Rezo-Wget";

/**
 * Fetches, parses, caches and evaluates robots.txt files.
 *
 * Parsed results are cached per lowercased hostname (see `getDomain`), and a
 * failed or empty fetch is cached as `null`, which means "no restrictions".
 */
class RobotsHandler {
  /** Caller options; `noRobots`, `robots` and `userAgent` are read here. */
  options;
  /** Lowercased product token used to select rule groups. */
  userAgent;
  /** hostname -> parsed robots.txt, or null when none could be loaded. */
  cache = new Map();
  /** hostname -> in-flight load promise, so concurrent callers share one fetch. */
  pending = new Map();

  /**
   * @param {{ userAgent?: string, noRobots?: boolean, robots?: boolean }} options
   */
  constructor(options) {
    this.options = options;
    this.userAgent = this.extractBotName(options.userAgent || DEFAULT_USER_AGENT);
  }

  /**
   * Reduce a full User-Agent string to its leading product token.
   *
   * @param {string} userAgent - e.g. "Rezo-Wget/1.0 (+https://…)".
   * @returns {string} Lowercased text before the first whitespace or "/".
   */
  extractBotName(userAgent) {
    const match = userAgent.match(/^([^\s\/]+)/);
    return (match ? match[1] : userAgent).toLowerCase();
  }

  /**
   * Load and cache the robots.txt that governs `url`.
   *
   * @param {string} url - Any URL on the target site.
   * @param {(robotsUrl: string) => Promise<string | null | undefined>} fetcher
   *   Callback that returns the robots.txt body, or a falsy value when there is none.
   * @returns {Promise<{rules: object[], sitemaps: string[], raw: string} | null>}
   *   The parsed file, or null when robots handling is disabled, the URL is
   *   invalid, or nothing could be loaded. Never rejects because of `fetcher`.
   */
  async fetch(url, fetcher) {
    if (this.options.noRobots || this.options.robots === false) {
      return null;
    }
    const domain = this.getDomain(url);
    if (!domain) {
      return null;
    }
    if (this.cache.has(domain)) {
      return this.cache.get(domain) ?? null;
    }
    if (this.pending.has(domain)) {
      // Another caller is already loading this host; wait for its result.
      await this.pending.get(domain);
      return this.cache.get(domain) ?? null;
    }
    const robotsUrl = this.getRobotsUrl(url);
    const loading = (async () => {
      try {
        const content = await fetcher(robotsUrl);
        this.cache.set(domain, content ? this.parse(content) : null);
      } catch {
        // Deliberate best effort: an unreachable or unparseable robots.txt
        // is remembered as "no rules" instead of failing the crawl.
        this.cache.set(domain, null);
      } finally {
        this.pending.delete(domain);
      }
    })();
    this.pending.set(domain, loading);
    await loading;
    return this.cache.get(domain) ?? null;
  }

  /**
   * Check a URL against the cached rules for its host.
   *
   * @param {string} url
   * @returns {boolean} True when robots handling is disabled, nothing is
   *   cached for the host, the URL is invalid, or no rule forbids the path.
   */
  isAllowed(url) {
    if (this.options.noRobots || this.options.robots === false) {
      return true;
    }
    const domain = this.getDomain(url);
    if (!domain) {
      return true;
    }
    const parsed = this.cache.get(domain);
    if (!parsed) {
      return true;
    }
    try {
      const { pathname, search } = new URL(url);
      return this.isPathAllowed(pathname + search, parsed.rules);
    } catch {
      return true;
    }
  }

  /**
   * Evaluate a path against the rule groups that apply to this crawler.
   *
   * The matching pattern with the highest specificity wins. Allow patterns
   * are examined first and a later pattern must be strictly more specific
   * to replace the current winner, so Allow wins ties.
   *
   * @param {string} path - Pathname plus query string.
   * @param {object[]} rules - Rule groups as produced by `parse`.
   * @returns {boolean}
   */
  isPathAllowed(path, rules) {
    const groups = this.findMatchingRules(rules);
    let allowed = true;
    let bestSpecificity = -1;
    const consider = (patterns, verdict) => {
      for (const pattern of patterns) {
        if (!this.pathMatches(path, pattern)) {
          continue;
        }
        const specificity = this.getSpecificity(pattern);
        if (specificity > bestSpecificity) {
          bestSpecificity = specificity;
          allowed = verdict;
        }
      }
    };
    for (const group of groups) {
      consider(group.allow, true);
    }
    for (const group of groups) {
      consider(group.disallow, false);
    }
    return allowed;
  }

  /**
   * Select the rule groups that apply to this crawler.
   *
   * Preference order: exact user-agent match, then substring match in either
   * direction, then the `*` wildcard groups.
   *
   * @param {object[]} rules
   * @returns {object[]} Possibly empty list of groups.
   */
  findMatchingRules(rules) {
    const agentOf = (rule) => rule.userAgent.toLowerCase();

    const exact = rules.filter((rule) => agentOf(rule) === this.userAgent);
    if (exact.length > 0) {
      return exact;
    }

    const partial = rules.filter((rule) => {
      const agent = agentOf(rule);
      // Every string contains "", so an empty User-agent value would
      // otherwise match all crawlers.
      return (
        agent !== "" &&
        (this.userAgent.includes(agent) || agent.includes(this.userAgent))
      );
    });
    if (partial.length > 0) {
      return partial;
    }

    return rules.filter((rule) => rule.userAgent === "*");
  }

  /**
   * Test a path against one robots.txt pattern.
   *
   * `*` matches any run of characters and a trailing `$` anchors the pattern
   * at the end of the path; every other character is literal, and the pattern
   * always has to match from the start of the path. Matching is done segment
   * by segment instead of through a generated regular expression, because the
   * patterns are untrusted and many `*` could cause heavy regex backtracking.
   *
   * @param {string} path
   * @param {string} pattern
   * @returns {boolean} False for an empty or missing pattern.
   */
  pathMatches(path, pattern) {
    if (!pattern) {
      return false;
    }
    const anchored = pattern.endsWith("$");
    const segments = (anchored ? pattern.slice(0, -1) : pattern).split("*");
    const head = segments[0];
    if (!path.startsWith(head)) {
      return false;
    }
    if (segments.length === 1) {
      // No wildcard: plain prefix match, or exact match when anchored.
      return !anchored || path.length === head.length;
    }

    let cursor = head.length;
    const lastIndex = segments.length - 1;
    // Leftmost placement of each middle segment leaves the most room for
    // the segments that follow, so a greedy scan is sufficient.
    for (let i = 1; i < lastIndex; i++) {
      const segment = segments[i];
      if (segment === "") {
        continue;
      }
      const found = path.indexOf(segment, cursor);
      if (found === -1) {
        return false;
      }
      cursor = found + segment.length;
    }

    const tail = segments[lastIndex];
    if (anchored) {
      return path.length - tail.length >= cursor && path.endsWith(tail);
    }
    return tail === "" || path.indexOf(tail, cursor) !== -1;
  }

  /**
   * Rank a pattern: its length with the `*` wildcards removed.
   *
   * @param {string} pattern
   * @returns {number}
   */
  getSpecificity(pattern) {
    return pattern.replaceAll("*", "").length;
  }

  /**
   * Normalize a URL or bare hostname to the cache key.
   *
   * @param {string} urlOrDomain
   * @returns {string | null} Lowercased hostname; null for an invalid URL.
   */
  resolveDomain(urlOrDomain) {
    return urlOrDomain.includes("://")
      ? this.getDomain(urlOrDomain)
      : urlOrDomain.toLowerCase();
  }

  /**
   * Crawl delay declared for this crawler on the given host.
   *
   * @param {string} urlOrDomain
   * @returns {number | null} First delay found in the applicable groups, or
   *   null when none is declared or nothing is cached.
   */
  getCrawlDelay(urlOrDomain) {
    const domain = this.resolveDomain(urlOrDomain);
    if (!domain) {
      return null;
    }
    const parsed = this.cache.get(domain);
    if (!parsed) {
      return null;
    }
    const group = this.findMatchingRules(parsed.rules).find(
      (rule) => rule.crawlDelay !== undefined
    );
    return group ? group.crawlDelay : null;
  }

  /**
   * Sitemap URLs listed in the cached robots.txt of the given host.
   *
   * @param {string} urlOrDomain
   * @returns {string[]}
   */
  getSitemaps(urlOrDomain) {
    const domain = this.resolveDomain(urlOrDomain);
    if (!domain) {
      return [];
    }
    const parsed = this.cache.get(domain);
    return parsed ? parsed.sitemaps : [];
  }

  /**
   * Cached parse result for the given host.
   *
   * @param {string} urlOrDomain
   * @returns {object | null}
   */
  getParsed(urlOrDomain) {
    const domain = this.resolveDomain(urlOrDomain);
    if (!domain) {
      return null;
    }
    return this.cache.get(domain) ?? null;
  }

  /**
   * Parse robots.txt text into rule groups and sitemap URLs.
   *
   * Consecutive `User-agent` lines form one group: every agent named in the
   * run receives the directives that follow. A group is kept when it has at
   * least one Allow, Disallow or Crawl-delay directive.
   *
   * @param {string} content - Raw robots.txt body.
   * @returns {{rules: Array<{userAgent: string, allow: string[],
   *   disallow: string[], sitemaps: string[], crawlDelay?: number}>,
   *   sitemaps: string[], raw: string}}
   */
  parse(content) {
    const rules = [];
    const sitemaps = [];
    // Rules opened by the current run of User-agent lines.
    let group = [];
    // Becomes true once the current group has received a directive; the next
    // User-agent line then starts a new group instead of joining this one.
    let groupClosed = false;

    const flushGroup = () => {
      for (const rule of group) {
        const hasDirectives =
          rule.allow.length > 0 ||
          rule.disallow.length > 0 ||
          rule.crawlDelay !== undefined;
        if (hasDirectives) {
          rules.push(rule);
        }
      }
      group = [];
      groupClosed = false;
    };

    for (const rawLine of content.split(/\r\n|\r|\n/)) {
      const hashIndex = rawLine.indexOf("#");
      const line = (hashIndex === -1 ? rawLine : rawLine.slice(0, hashIndex)).trim();
      if (!line) {
        continue;
      }
      const colonIndex = line.indexOf(":");
      if (colonIndex === -1) {
        continue;
      }
      const directive = line.slice(0, colonIndex).trim().toLowerCase();
      const value = line.slice(colonIndex + 1).trim();

      switch (directive) {
        case "user-agent":
          if (groupClosed) {
            flushGroup();
          }
          group.push({ userAgent: value, disallow: [], allow: [], sitemaps: [] });
          break;
        case "disallow":
          for (const rule of group) {
            rule.disallow.push(value);
          }
          groupClosed = group.length > 0;
          break;
        case "allow":
          for (const rule of group) {
            rule.allow.push(value);
          }
          groupClosed = group.length > 0;
          break;
        case "crawl-delay": {
          const delay = Number.parseFloat(value);
          if (!Number.isNaN(delay) && delay >= 0) {
            for (const rule of group) {
              rule.crawlDelay = delay;
            }
          }
          groupClosed = group.length > 0;
          break;
        }
        case "sitemap":
          // Sitemap lines are global and do not belong to any group.
          sitemaps.push(value);
          break;
        default:
          break;
      }
    }
    flushGroup();

    return { rules, sitemaps, raw: content };
  }

  /**
   * Build the robots.txt URL for the origin of `url`.
   *
   * @param {string} url
   * @returns {string} Empty string when `url` cannot be parsed.
   */
  getRobotsUrl(url) {
    try {
      const { protocol, host } = new URL(url);
      return `${protocol}//${host}/robots.txt`;
    } catch {
      return "";
    }
  }

  /**
   * Extract the lowercased hostname used as cache key.
   *
   * @param {string} url
   * @returns {string | null} Null when `url` cannot be parsed.
   */
  getDomain(url) {
    try {
      return new URL(url).hostname.toLowerCase();
    } catch {
      return null;
    }
  }

  /**
   * Whether a load (successful or not) has completed for the host.
   *
   * @param {string} urlOrDomain
   * @returns {boolean}
   */
  hasFetched(urlOrDomain) {
    const domain = this.resolveDomain(urlOrDomain);
    return domain ? this.cache.has(domain) : false;
  }

  /**
   * Drop the cached entry for one host, or the whole cache when called
   * without an argument.
   *
   * @param {string} [urlOrDomain]
   */
  clearCache(urlOrDomain) {
    if (!urlOrDomain) {
      this.cache.clear();
      return;
    }
    const domain = this.resolveDomain(urlOrDomain);
    if (domain) {
      this.cache.delete(domain);
    }
  }

  /**
   * Number of rule groups cached for the host.
   *
   * @param {string} urlOrDomain
   * @returns {number} 0 when nothing is cached.
   */
  getRulesCount(urlOrDomain) {
    const domain = this.resolveDomain(urlOrDomain);
    if (!domain) {
      return 0;
    }
    return this.cache.get(domain)?.rules.length ?? 0;
  }
}
300
+
301
+ exports.RobotsHandler = RobotsHandler;
302
+ exports.default = RobotsHandler;
303
+ module.exports = Object.assign(RobotsHandler, exports);