rezo 1.0.66 → 1.0.68

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/dist/adapters/entries/curl.d.ts +5 -0
  2. package/dist/adapters/entries/fetch.d.ts +5 -0
  3. package/dist/adapters/entries/http.d.ts +5 -0
  4. package/dist/adapters/entries/http2.d.ts +5 -0
  5. package/dist/adapters/entries/react-native.d.ts +5 -0
  6. package/dist/adapters/entries/xhr.d.ts +5 -0
  7. package/dist/adapters/index.cjs +6 -6
  8. package/dist/cache/index.cjs +9 -9
  9. package/dist/crawler/crawler.cjs +26 -5
  10. package/dist/crawler/crawler.js +26 -5
  11. package/dist/crawler/index.cjs +40 -40
  12. package/dist/crawler.d.ts +10 -0
  13. package/dist/entries/crawler.cjs +4 -4
  14. package/dist/index.cjs +27 -27
  15. package/dist/index.d.ts +5 -0
  16. package/dist/internal/agents/index.cjs +10 -10
  17. package/dist/platform/browser.d.ts +5 -0
  18. package/dist/platform/bun.d.ts +5 -0
  19. package/dist/platform/deno.d.ts +5 -0
  20. package/dist/platform/node.d.ts +5 -0
  21. package/dist/platform/react-native.d.ts +5 -0
  22. package/dist/platform/worker.d.ts +5 -0
  23. package/dist/proxy/index.cjs +4 -4
  24. package/dist/proxy/manager.cjs +1 -1
  25. package/dist/proxy/manager.js +1 -1
  26. package/dist/queue/index.cjs +8 -8
  27. package/dist/queue/queue.cjs +3 -1
  28. package/dist/queue/queue.js +3 -1
  29. package/dist/responses/universal/index.cjs +11 -11
  30. package/dist/wget/asset-extractor.cjs +556 -0
  31. package/dist/wget/asset-extractor.js +553 -0
  32. package/dist/wget/asset-organizer.cjs +230 -0
  33. package/dist/wget/asset-organizer.js +227 -0
  34. package/dist/wget/download-cache.cjs +221 -0
  35. package/dist/wget/download-cache.js +218 -0
  36. package/dist/wget/downloader.cjs +607 -0
  37. package/dist/wget/downloader.js +604 -0
  38. package/dist/wget/file-writer.cjs +349 -0
  39. package/dist/wget/file-writer.js +346 -0
  40. package/dist/wget/filter-lists.cjs +1330 -0
  41. package/dist/wget/filter-lists.js +1330 -0
  42. package/dist/wget/index.cjs +633 -0
  43. package/dist/wget/index.d.ts +8486 -0
  44. package/dist/wget/index.js +614 -0
  45. package/dist/wget/link-converter.cjs +297 -0
  46. package/dist/wget/link-converter.js +294 -0
  47. package/dist/wget/progress.cjs +271 -0
  48. package/dist/wget/progress.js +266 -0
  49. package/dist/wget/resume.cjs +166 -0
  50. package/dist/wget/resume.js +163 -0
  51. package/dist/wget/robots.cjs +303 -0
  52. package/dist/wget/robots.js +300 -0
  53. package/dist/wget/types.cjs +200 -0
  54. package/dist/wget/types.js +197 -0
  55. package/dist/wget/url-filter.cjs +351 -0
  56. package/dist/wget/url-filter.js +348 -0
  57. package/package.json +6 -1
@@ -0,0 +1,351 @@
1
/**
 * Decides which URLs a recursive, wget-style download is allowed to fetch.
 *
 * Filters are derived from the options object: host and domain limits, a
 * "no parent" directory boundary, include/exclude directory lists, excluded
 * file extensions, glob accept/reject lists and accept/reject regular
 * expressions. Every check returns either `{ allowed: true }` or
 * `{ allowed: false, reason, message }`.
 */
class UrlFilter {
  /** Options as supplied by the caller (replaced by a merged copy in updateOptions). */
  options;
  /** Start URLs exactly as they were passed to addStartUrl. */
  startUrls = new Set();
  /** Lower-cased hostnames of every start URL. */
  startHosts = new Set();
  /** Host -> directory of the first start URL registered for that host. */
  startPaths = new Map();
  /** Lower-cased domain allow-list, or null when unrestricted. */
  allowedDomains = null;
  /** Lower-cased domain deny-list, or null when unused. */
  excludedDomains = null;
  /** Glob patterns a URL or filename must match, or null when unused. */
  acceptPatterns = null;
  /** Glob patterns a URL or filename must not match, or null when unused. */
  rejectPatterns = null;
  /** RegExp the full URL must match, or null when unused. */
  acceptRegex = null;
  /** RegExp the full URL must not match, or null when unused. */
  rejectRegex = null;
  /** Directories (each starting with "/") a path must live in, or null. */
  includeDirectories = null;
  /** Directories (each starting with "/") a path must not live in, or null. */
  excludeDirectories = null;
  /** Lower-cased extensions (each starting with ".") to skip, or null. */
  excludeExtensions = null;
  /** Glob pattern -> compiled RegExp (null when it could not be compiled). */
  globCache = new Map();

  /**
   * @param {object} [options] - Filter options; a missing value is treated as `{}`.
   */
  constructor(options) {
    this.options = options ?? {};
    this.initializeFilters();
  }

  /**
   * Normalizes a list option given as an array or a comma-separated string.
   * Entries are trimmed and empty entries dropped, so a trailing comma does
   * not create a bogus empty filter.
   *
   * @param {string|Array<string>|null|undefined} value - Raw option value.
   * @returns {string[]|null} Cleaned entries, or null when nothing remains.
   */
  static parseList(value) {
    if (!value) {
      return null;
    }
    const rawItems = Array.isArray(value) ? value : String(value).split(",");
    const items = rawItems
      .map((item) => String(item).trim())
      .filter((item) => item !== "");
    return items.length > 0 ? items : null;
  }

  /**
   * @param {RegExp|string|null|undefined} value - Raw regex option.
   * @returns {RegExp|null} The RegExp itself, a RegExp compiled from the
   *   string, or null when the option is not set.
   * @throws {SyntaxError} When a string value is not a valid pattern.
   */
  static toRegExp(value) {
    if (!value) {
      return null;
    }
    return value instanceof RegExp ? value : new RegExp(value);
  }

  /**
   * Tests `text` against `regex` from the start of the string. Resetting
   * `lastIndex` keeps regexes carrying the `g` or `y` flag from giving
   * alternating results on consecutive calls.
   *
   * @param {RegExp} regex
   * @param {string} text
   * @returns {boolean}
   */
  static regexMatches(regex, text) {
    regex.lastIndex = 0;
    return regex.test(text);
  }

  /**
   * Reports whether `path` is `dir` itself or lies below it. The comparison
   * respects path-segment boundaries, so "/private" does not match
   * "/privateer/x".
   *
   * @param {string} path - URL pathname.
   * @param {string} dir - Directory, with or without a trailing slash.
   * @returns {boolean}
   */
  static isWithinDirectory(path, dir) {
    const base = dir.endsWith("/") ? dir.slice(0, -1) : dir;
    return path === base || path.startsWith(`${base}/`);
  }

  /**
   * Rebuilds every derived filter from `this.options`. Each filter is
   * assigned unconditionally so that an option removed through
   * updateOptions no longer leaves a stale filter behind.
   */
  initializeFilters() {
    const { options } = this;
    const lower = (value) => value.toLowerCase();
    const rooted = (dir) => (dir.startsWith("/") ? dir : `/${dir}`);
    const dotted = (ext) => (ext.startsWith(".") ? ext : `.${ext}`).toLowerCase();

    const allowed = UrlFilter.parseList(options.domains);
    this.allowedDomains = allowed ? new Set(allowed.map(lower)) : null;

    const excluded = UrlFilter.parseList(options.excludeDomains);
    this.excludedDomains = excluded ? new Set(excluded.map(lower)) : null;

    this.acceptPatterns = UrlFilter.parseList(options.accept);
    this.rejectPatterns = UrlFilter.parseList(options.reject);

    this.acceptRegex = UrlFilter.toRegExp(options.acceptRegex);
    this.rejectRegex = UrlFilter.toRegExp(options.rejectRegex);

    this.includeDirectories =
      UrlFilter.parseList(options.includeDirectories)?.map(rooted) ?? null;
    this.excludeDirectories =
      UrlFilter.parseList(options.excludeDirectories)?.map(rooted) ?? null;
    this.excludeExtensions =
      UrlFilter.parseList(options.excludeExtensions)?.map(dotted) ?? null;
  }

  /**
   * Registers a start URL: remembers it, its host, and (for the first URL
   * of each host only) the directory used by the no-parent check.
   *
   * @param {string} url - Absolute start URL; unparsable values are ignored.
   */
  addStartUrl(url) {
    let parsed;
    try {
      parsed = new URL(url);
    } catch {
      // Best effort: an unparsable start URL is skipped instead of aborting setup.
      return;
    }
    const host = parsed.hostname.toLowerCase();
    this.startUrls.add(url);
    this.startHosts.add(host);
    if (this.startPaths.has(host)) {
      return;
    }
    const { pathname } = parsed;
    // "/a/b/page.html" -> "/a/b/"; a path already ending in "/" is kept as is.
    const directory = pathname.endsWith("/")
      ? pathname
      : (pathname.substring(0, pathname.lastIndexOf("/") + 1) || "/");
    this.startPaths.set(host, directory);
  }

  /**
   * Runs every filter against `url` and returns the first rejection.
   *
   * @param {string} url - Absolute URL of the candidate.
   * @param {string} sourceUrl - URL of the page the link was found on.
   * @param {number} depth - Recursion depth of the candidate.
   * @returns {{allowed: boolean, reason?: string, message?: string}}
   */
  shouldDownload(url, sourceUrl, depth) {
    let parsed;
    try {
      parsed = new URL(url);
    } catch {
      return {
        allowed: false,
        reason: "invalid-url",
        message: `Invalid URL: ${url}`
      };
    }

    const { protocol } = parsed;
    const isHttp = protocol === "http:" || protocol === "https:";
    const isAllowedFtp = protocol === "ftp:" && Boolean(this.options.followFTP);
    if (!isHttp && !isAllowedFtp) {
      return {
        allowed: false,
        reason: "unsupported-protocol",
        message: `Unsupported protocol: ${protocol}`
      };
    }

    // Order matters: the first failing check decides the reported reason.
    const checks = [
      () => this.checkDepth(depth),
      () => this.checkHost(parsed, sourceUrl),
      () => this.checkDomain(parsed),
      () => this.checkParent(parsed),
      () => this.checkDirectory(parsed),
      () => this.checkExtension(parsed),
      () => this.checkPatterns(url, parsed)
    ];
    for (const check of checks) {
      const result = check();
      if (!result.allowed) {
        return result;
      }
    }

    // NOTE(review): `options.relativeOnly` is not enforced here. The URL has
    // already been parsed as absolute at this point, so the option presumably
    // has to be applied where links are extracted - confirm against the caller.
    return { allowed: true };
  }

  /**
   * Rejects URLs deeper than the configured maximum. Uses getMaxDepth so
   * that `mirror` and a depth of 0 both mean "unlimited" here as well.
   *
   * @param {number} depth
   * @returns {{allowed: boolean, reason?: string, message?: string}}
   */
  checkDepth(depth) {
    const maxDepth = this.getMaxDepth();
    if (depth > maxDepth) {
      return {
        allowed: false,
        reason: "depth-exceeded",
        message: `Depth ${depth} exceeds maximum ${maxDepth}`
      };
    }
    return { allowed: true };
  }

  /**
   * Allows a URL when host spanning is enabled, when its host is one of the
   * start hosts, or when it shares the host of the page that linked to it.
   *
   * @param {URL} parsed - Parsed candidate URL.
   * @param {string} sourceUrl - Referring page; may be missing or invalid.
   * @returns {{allowed: boolean, reason?: string, message?: string}}
   */
  checkHost(parsed, sourceUrl) {
    const host = parsed.hostname.toLowerCase();
    if (this.options.spanHosts || this.startHosts.has(host)) {
      return { allowed: true };
    }
    try {
      if (new URL(sourceUrl).hostname.toLowerCase() === host) {
        return { allowed: true };
      }
    } catch {
      // Missing or unparsable referrer: treated as a different host below.
    }
    return {
      allowed: false,
      reason: "cross-host",
      message: `Cross-host URL not allowed without --span-hosts: ${host}`
    };
  }

  /**
   * Applies the domain deny-list first, then the allow-list.
   *
   * @param {URL} parsed - Parsed candidate URL.
   * @returns {{allowed: boolean, reason?: string, message?: string}}
   */
  checkDomain(parsed) {
    const host = parsed.hostname.toLowerCase();
    const matchesAny = (domains) => {
      for (const domain of domains) {
        if (this.matchesDomain(host, domain)) {
          return true;
        }
      }
      return false;
    };

    if (this.excludedDomains && matchesAny(this.excludedDomains)) {
      return {
        allowed: false,
        reason: "domain-excluded",
        message: `Domain ${host} is in excluded list`
      };
    }
    if (this.allowedDomains && !matchesAny(this.allowedDomains)) {
      return {
        allowed: false,
        reason: "domain-excluded",
        message: `Domain ${host} not in allowed list`
      };
    }
    return { allowed: true };
  }

  /**
   * @param {string} host - Lower-cased hostname.
   * @param {string} domain - Lower-cased domain.
   * @returns {boolean} True when `host` is `domain` or one of its subdomains.
   */
  matchesDomain(host, domain) {
    return host === domain || host.endsWith(`.${domain}`);
  }

  /**
   * With `noParent` set, rejects paths outside the directory recorded for
   * the URL's host by addStartUrl. Hosts without a recorded directory are
   * not restricted.
   *
   * @param {URL} parsed - Parsed candidate URL.
   * @returns {{allowed: boolean, reason?: string, message?: string}}
   */
  checkParent(parsed) {
    if (!this.options.noParent) {
      return { allowed: true };
    }
    const basePath = this.startPaths.get(parsed.hostname.toLowerCase());
    if (!basePath) {
      return { allowed: true };
    }
    const urlPath = parsed.pathname;
    // Compare with trailing slashes on both sides so that "/docs" counts as
    // the "/docs/" directory itself while "/docs-old/x" does not.
    const normalizedBase = basePath.endsWith("/") ? basePath : `${basePath}/`;
    const normalizedUrl = urlPath.endsWith("/") ? urlPath : `${urlPath}/`;
    if (!normalizedUrl.startsWith(normalizedBase)) {
      return {
        allowed: false,
        reason: "parent-directory",
        message: `URL ${urlPath} goes above parent ${basePath}`
      };
    }
    return { allowed: true };
  }

  /**
   * Applies the excluded-directory list, then the included-directory list.
   *
   * @param {URL} parsed - Parsed candidate URL.
   * @returns {{allowed: boolean, reason?: string, message?: string}}
   */
  checkDirectory(parsed) {
    const path = parsed.pathname;
    if (this.excludeDirectories) {
      const excludeDir = this.excludeDirectories.find((dir) =>
        UrlFilter.isWithinDirectory(path, dir));
      if (excludeDir !== undefined) {
        return {
          allowed: false,
          reason: "directory-excluded",
          message: `Path ${path} is in excluded directory ${excludeDir}`
        };
      }
    }
    if (this.includeDirectories) {
      const matchesInclude = this.includeDirectories.some((dir) =>
        UrlFilter.isWithinDirectory(path, dir));
      if (!matchesInclude) {
        return {
          allowed: false,
          reason: "directory-excluded",
          message: `Path ${path} not in any included directory`
        };
      }
    }
    return { allowed: true };
  }

  /**
   * Rejects URLs whose filename carries an excluded extension. URLs without
   * a filename or without an extension always pass.
   *
   * @param {URL} parsed - Parsed candidate URL.
   * @returns {{allowed: boolean, reason?: string, message?: string}}
   */
  checkExtension(parsed) {
    if (!this.excludeExtensions || this.excludeExtensions.length === 0) {
      return { allowed: true };
    }
    const filename = this.getFilename(parsed);
    const ext = filename ? this.getExtension(filename) : "";
    // getExtension already lower-cases, matching the stored list.
    if (ext && this.excludeExtensions.includes(ext)) {
      return {
        allowed: false,
        reason: "pattern-rejected",
        message: `File extension ${ext} is excluded`
      };
    }
    return { allowed: true };
  }

  /**
   * @param {string} filename
   * @returns {string} Lower-cased extension including the dot, or "" when
   *   the name has no dot or ends with one.
   */
  getExtension(filename) {
    const lastDot = filename.lastIndexOf(".");
    if (lastDot === -1 || lastDot === filename.length - 1) {
      return "";
    }
    return filename.slice(lastDot).toLowerCase();
  }

  /**
   * Applies, in order: accept globs, reject globs, accept regex, reject
   * regex. Globs are tried against both the filename and the full URL; the
   * regexes are tested against the full URL only.
   *
   * @param {string} url - Candidate URL as a string.
   * @param {URL} parsed - The same URL, parsed.
   * @returns {{allowed: boolean, reason?: string, message?: string}}
   */
  checkPatterns(url, parsed) {
    const filename = this.getFilename(parsed);
    const globHit = (pattern) =>
      this.matchGlob(filename, pattern) || this.matchGlob(url, pattern);

    if (this.acceptPatterns && this.acceptPatterns.length > 0 &&
        !this.acceptPatterns.some(globHit)) {
      return {
        allowed: false,
        reason: "pattern-not-accepted",
        message: `URL does not match accept patterns`
      };
    }
    if (this.rejectPatterns && this.rejectPatterns.length > 0 &&
        this.rejectPatterns.some(globHit)) {
      return {
        allowed: false,
        reason: "pattern-rejected",
        message: `URL matches reject pattern`
      };
    }
    if (this.acceptRegex && !UrlFilter.regexMatches(this.acceptRegex, url)) {
      return {
        allowed: false,
        reason: "pattern-not-accepted",
        message: `URL does not match accept regex`
      };
    }
    if (this.rejectRegex && UrlFilter.regexMatches(this.rejectRegex, url)) {
      return {
        allowed: false,
        reason: "pattern-rejected",
        message: `URL matches reject regex`
      };
    }
    return { allowed: true };
  }

  /**
   * @param {URL} parsed
   * @returns {string} Last path segment, or "" when the path ends in "/".
   */
  getFilename(parsed) {
    const path = parsed.pathname;
    const lastSlash = path.lastIndexOf("/");
    if (lastSlash === -1) {
      return "";
    }
    return path.substring(lastSlash + 1);
  }

  /**
   * Case-insensitive glob match over the whole string: `*` matches any run
   * of characters, `?` matches a single character, everything else is
   * literal. Compiled patterns are cached per instance.
   *
   * @param {string} str - Text to test.
   * @param {string} pattern - Glob pattern.
   * @returns {boolean}
   */
  matchGlob(str, pattern) {
    let regex = this.globCache.get(pattern);
    if (regex === undefined) {
      const regexStr = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
      try {
        regex = new RegExp(`^${regexStr}$`, "i");
      } catch {
        regex = null;
      }
      this.globCache.set(pattern, regex);
    }
    return regex !== null && regex.test(str);
  }

  /**
   * @returns {number} Maximum recursion depth: Infinity for `mirror` or a
   *   configured depth of 0, otherwise `depth`, then `maxDepth`, then 5.
   */
  getMaxDepth() {
    if (this.options.mirror) {
      return Infinity;
    }
    const depth = this.options.depth ?? this.options.maxDepth ?? 5;
    return depth === 0 ? Infinity : depth;
  }

  /** @returns {boolean} True when `recursive` or `mirror` is exactly `true`. */
  isRecursive() {
    return this.options.recursive === true || this.options.mirror === true;
  }

  /**
   * @param {string} url
   * @returns {boolean} True when `url` was registered verbatim as a start URL.
   */
  isStartUrl(url) {
    return this.startUrls.has(url);
  }

  /** @returns {Set<string>} The live set of registered start URLs. */
  getStartUrls() {
    return this.startUrls;
  }

  /**
   * Shallow-merges `options` over the current options and rebuilds all
   * derived filters.
   *
   * @param {object} options - Partial options to merge in.
   */
  updateOptions(options) {
    this.options = { ...this.options, ...options };
    this.initializeFilters();
  }
}
348
+
349
+ exports.UrlFilter = UrlFilter;
350
+ exports.default = UrlFilter;
351
+ module.exports = Object.assign(UrlFilter, exports);
@@ -0,0 +1,348 @@
1
/**
 * Decides which URLs a recursive, wget-style download is allowed to fetch.
 *
 * Filters are derived from the options object: host and domain limits, a
 * "no parent" directory boundary, include/exclude directory lists, excluded
 * file extensions, glob accept/reject lists and accept/reject regular
 * expressions. Every check returns either `{ allowed: true }` or
 * `{ allowed: false, reason, message }`.
 */
export class UrlFilter {
  /** Options as supplied by the caller (replaced by a merged copy in updateOptions). */
  options;
  /** Start URLs exactly as they were passed to addStartUrl. */
  startUrls = new Set();
  /** Lower-cased hostnames of every start URL. */
  startHosts = new Set();
  /** Host -> directory of the first start URL registered for that host. */
  startPaths = new Map();
  /** Lower-cased domain allow-list, or null when unrestricted. */
  allowedDomains = null;
  /** Lower-cased domain deny-list, or null when unused. */
  excludedDomains = null;
  /** Glob patterns a URL or filename must match, or null when unused. */
  acceptPatterns = null;
  /** Glob patterns a URL or filename must not match, or null when unused. */
  rejectPatterns = null;
  /** RegExp the full URL must match, or null when unused. */
  acceptRegex = null;
  /** RegExp the full URL must not match, or null when unused. */
  rejectRegex = null;
  /** Directories (each starting with "/") a path must live in, or null. */
  includeDirectories = null;
  /** Directories (each starting with "/") a path must not live in, or null. */
  excludeDirectories = null;
  /** Lower-cased extensions (each starting with ".") to skip, or null. */
  excludeExtensions = null;
  /** Glob pattern -> compiled RegExp (null when it could not be compiled). */
  globCache = new Map();

  /**
   * @param {object} [options] - Filter options; a missing value is treated as `{}`.
   */
  constructor(options) {
    this.options = options ?? {};
    this.initializeFilters();
  }

  /**
   * Normalizes a list option given as an array or a comma-separated string.
   * Entries are trimmed and empty entries dropped, so a trailing comma does
   * not create a bogus empty filter.
   *
   * @param {string|Array<string>|null|undefined} value - Raw option value.
   * @returns {string[]|null} Cleaned entries, or null when nothing remains.
   */
  static parseList(value) {
    if (!value) {
      return null;
    }
    const rawItems = Array.isArray(value) ? value : String(value).split(",");
    const items = rawItems
      .map((item) => String(item).trim())
      .filter((item) => item !== "");
    return items.length > 0 ? items : null;
  }

  /**
   * @param {RegExp|string|null|undefined} value - Raw regex option.
   * @returns {RegExp|null} The RegExp itself, a RegExp compiled from the
   *   string, or null when the option is not set.
   * @throws {SyntaxError} When a string value is not a valid pattern.
   */
  static toRegExp(value) {
    if (!value) {
      return null;
    }
    return value instanceof RegExp ? value : new RegExp(value);
  }

  /**
   * Tests `text` against `regex` from the start of the string. Resetting
   * `lastIndex` keeps regexes carrying the `g` or `y` flag from giving
   * alternating results on consecutive calls.
   *
   * @param {RegExp} regex
   * @param {string} text
   * @returns {boolean}
   */
  static regexMatches(regex, text) {
    regex.lastIndex = 0;
    return regex.test(text);
  }

  /**
   * Reports whether `path` is `dir` itself or lies below it. The comparison
   * respects path-segment boundaries, so "/private" does not match
   * "/privateer/x".
   *
   * @param {string} path - URL pathname.
   * @param {string} dir - Directory, with or without a trailing slash.
   * @returns {boolean}
   */
  static isWithinDirectory(path, dir) {
    const base = dir.endsWith("/") ? dir.slice(0, -1) : dir;
    return path === base || path.startsWith(`${base}/`);
  }

  /**
   * Rebuilds every derived filter from `this.options`. Each filter is
   * assigned unconditionally so that an option removed through
   * updateOptions no longer leaves a stale filter behind.
   */
  initializeFilters() {
    const { options } = this;
    const lower = (value) => value.toLowerCase();
    const rooted = (dir) => (dir.startsWith("/") ? dir : `/${dir}`);
    const dotted = (ext) => (ext.startsWith(".") ? ext : `.${ext}`).toLowerCase();

    const allowed = UrlFilter.parseList(options.domains);
    this.allowedDomains = allowed ? new Set(allowed.map(lower)) : null;

    const excluded = UrlFilter.parseList(options.excludeDomains);
    this.excludedDomains = excluded ? new Set(excluded.map(lower)) : null;

    this.acceptPatterns = UrlFilter.parseList(options.accept);
    this.rejectPatterns = UrlFilter.parseList(options.reject);

    this.acceptRegex = UrlFilter.toRegExp(options.acceptRegex);
    this.rejectRegex = UrlFilter.toRegExp(options.rejectRegex);

    this.includeDirectories =
      UrlFilter.parseList(options.includeDirectories)?.map(rooted) ?? null;
    this.excludeDirectories =
      UrlFilter.parseList(options.excludeDirectories)?.map(rooted) ?? null;
    this.excludeExtensions =
      UrlFilter.parseList(options.excludeExtensions)?.map(dotted) ?? null;
  }

  /**
   * Registers a start URL: remembers it, its host, and (for the first URL
   * of each host only) the directory used by the no-parent check.
   *
   * @param {string} url - Absolute start URL; unparsable values are ignored.
   */
  addStartUrl(url) {
    let parsed;
    try {
      parsed = new URL(url);
    } catch {
      // Best effort: an unparsable start URL is skipped instead of aborting setup.
      return;
    }
    const host = parsed.hostname.toLowerCase();
    this.startUrls.add(url);
    this.startHosts.add(host);
    if (this.startPaths.has(host)) {
      return;
    }
    const { pathname } = parsed;
    // "/a/b/page.html" -> "/a/b/"; a path already ending in "/" is kept as is.
    const directory = pathname.endsWith("/")
      ? pathname
      : (pathname.substring(0, pathname.lastIndexOf("/") + 1) || "/");
    this.startPaths.set(host, directory);
  }

  /**
   * Runs every filter against `url` and returns the first rejection.
   *
   * @param {string} url - Absolute URL of the candidate.
   * @param {string} sourceUrl - URL of the page the link was found on.
   * @param {number} depth - Recursion depth of the candidate.
   * @returns {{allowed: boolean, reason?: string, message?: string}}
   */
  shouldDownload(url, sourceUrl, depth) {
    let parsed;
    try {
      parsed = new URL(url);
    } catch {
      return {
        allowed: false,
        reason: "invalid-url",
        message: `Invalid URL: ${url}`
      };
    }

    const { protocol } = parsed;
    const isHttp = protocol === "http:" || protocol === "https:";
    const isAllowedFtp = protocol === "ftp:" && Boolean(this.options.followFTP);
    if (!isHttp && !isAllowedFtp) {
      return {
        allowed: false,
        reason: "unsupported-protocol",
        message: `Unsupported protocol: ${protocol}`
      };
    }

    // Order matters: the first failing check decides the reported reason.
    const checks = [
      () => this.checkDepth(depth),
      () => this.checkHost(parsed, sourceUrl),
      () => this.checkDomain(parsed),
      () => this.checkParent(parsed),
      () => this.checkDirectory(parsed),
      () => this.checkExtension(parsed),
      () => this.checkPatterns(url, parsed)
    ];
    for (const check of checks) {
      const result = check();
      if (!result.allowed) {
        return result;
      }
    }

    // NOTE(review): `options.relativeOnly` is not enforced here. The URL has
    // already been parsed as absolute at this point, so the option presumably
    // has to be applied where links are extracted - confirm against the caller.
    return { allowed: true };
  }

  /**
   * Rejects URLs deeper than the configured maximum. Uses getMaxDepth so
   * that `mirror` and a depth of 0 both mean "unlimited" here as well.
   *
   * @param {number} depth
   * @returns {{allowed: boolean, reason?: string, message?: string}}
   */
  checkDepth(depth) {
    const maxDepth = this.getMaxDepth();
    if (depth > maxDepth) {
      return {
        allowed: false,
        reason: "depth-exceeded",
        message: `Depth ${depth} exceeds maximum ${maxDepth}`
      };
    }
    return { allowed: true };
  }

  /**
   * Allows a URL when host spanning is enabled, when its host is one of the
   * start hosts, or when it shares the host of the page that linked to it.
   *
   * @param {URL} parsed - Parsed candidate URL.
   * @param {string} sourceUrl - Referring page; may be missing or invalid.
   * @returns {{allowed: boolean, reason?: string, message?: string}}
   */
  checkHost(parsed, sourceUrl) {
    const host = parsed.hostname.toLowerCase();
    if (this.options.spanHosts || this.startHosts.has(host)) {
      return { allowed: true };
    }
    try {
      if (new URL(sourceUrl).hostname.toLowerCase() === host) {
        return { allowed: true };
      }
    } catch {
      // Missing or unparsable referrer: treated as a different host below.
    }
    return {
      allowed: false,
      reason: "cross-host",
      message: `Cross-host URL not allowed without --span-hosts: ${host}`
    };
  }

  /**
   * Applies the domain deny-list first, then the allow-list.
   *
   * @param {URL} parsed - Parsed candidate URL.
   * @returns {{allowed: boolean, reason?: string, message?: string}}
   */
  checkDomain(parsed) {
    const host = parsed.hostname.toLowerCase();
    const matchesAny = (domains) => {
      for (const domain of domains) {
        if (this.matchesDomain(host, domain)) {
          return true;
        }
      }
      return false;
    };

    if (this.excludedDomains && matchesAny(this.excludedDomains)) {
      return {
        allowed: false,
        reason: "domain-excluded",
        message: `Domain ${host} is in excluded list`
      };
    }
    if (this.allowedDomains && !matchesAny(this.allowedDomains)) {
      return {
        allowed: false,
        reason: "domain-excluded",
        message: `Domain ${host} not in allowed list`
      };
    }
    return { allowed: true };
  }

  /**
   * @param {string} host - Lower-cased hostname.
   * @param {string} domain - Lower-cased domain.
   * @returns {boolean} True when `host` is `domain` or one of its subdomains.
   */
  matchesDomain(host, domain) {
    return host === domain || host.endsWith(`.${domain}`);
  }

  /**
   * With `noParent` set, rejects paths outside the directory recorded for
   * the URL's host by addStartUrl. Hosts without a recorded directory are
   * not restricted.
   *
   * @param {URL} parsed - Parsed candidate URL.
   * @returns {{allowed: boolean, reason?: string, message?: string}}
   */
  checkParent(parsed) {
    if (!this.options.noParent) {
      return { allowed: true };
    }
    const basePath = this.startPaths.get(parsed.hostname.toLowerCase());
    if (!basePath) {
      return { allowed: true };
    }
    const urlPath = parsed.pathname;
    // Compare with trailing slashes on both sides so that "/docs" counts as
    // the "/docs/" directory itself while "/docs-old/x" does not.
    const normalizedBase = basePath.endsWith("/") ? basePath : `${basePath}/`;
    const normalizedUrl = urlPath.endsWith("/") ? urlPath : `${urlPath}/`;
    if (!normalizedUrl.startsWith(normalizedBase)) {
      return {
        allowed: false,
        reason: "parent-directory",
        message: `URL ${urlPath} goes above parent ${basePath}`
      };
    }
    return { allowed: true };
  }

  /**
   * Applies the excluded-directory list, then the included-directory list.
   *
   * @param {URL} parsed - Parsed candidate URL.
   * @returns {{allowed: boolean, reason?: string, message?: string}}
   */
  checkDirectory(parsed) {
    const path = parsed.pathname;
    if (this.excludeDirectories) {
      const excludeDir = this.excludeDirectories.find((dir) =>
        UrlFilter.isWithinDirectory(path, dir));
      if (excludeDir !== undefined) {
        return {
          allowed: false,
          reason: "directory-excluded",
          message: `Path ${path} is in excluded directory ${excludeDir}`
        };
      }
    }
    if (this.includeDirectories) {
      const matchesInclude = this.includeDirectories.some((dir) =>
        UrlFilter.isWithinDirectory(path, dir));
      if (!matchesInclude) {
        return {
          allowed: false,
          reason: "directory-excluded",
          message: `Path ${path} not in any included directory`
        };
      }
    }
    return { allowed: true };
  }

  /**
   * Rejects URLs whose filename carries an excluded extension. URLs without
   * a filename or without an extension always pass.
   *
   * @param {URL} parsed - Parsed candidate URL.
   * @returns {{allowed: boolean, reason?: string, message?: string}}
   */
  checkExtension(parsed) {
    if (!this.excludeExtensions || this.excludeExtensions.length === 0) {
      return { allowed: true };
    }
    const filename = this.getFilename(parsed);
    const ext = filename ? this.getExtension(filename) : "";
    // getExtension already lower-cases, matching the stored list.
    if (ext && this.excludeExtensions.includes(ext)) {
      return {
        allowed: false,
        reason: "pattern-rejected",
        message: `File extension ${ext} is excluded`
      };
    }
    return { allowed: true };
  }

  /**
   * @param {string} filename
   * @returns {string} Lower-cased extension including the dot, or "" when
   *   the name has no dot or ends with one.
   */
  getExtension(filename) {
    const lastDot = filename.lastIndexOf(".");
    if (lastDot === -1 || lastDot === filename.length - 1) {
      return "";
    }
    return filename.slice(lastDot).toLowerCase();
  }

  /**
   * Applies, in order: accept globs, reject globs, accept regex, reject
   * regex. Globs are tried against both the filename and the full URL; the
   * regexes are tested against the full URL only.
   *
   * @param {string} url - Candidate URL as a string.
   * @param {URL} parsed - The same URL, parsed.
   * @returns {{allowed: boolean, reason?: string, message?: string}}
   */
  checkPatterns(url, parsed) {
    const filename = this.getFilename(parsed);
    const globHit = (pattern) =>
      this.matchGlob(filename, pattern) || this.matchGlob(url, pattern);

    if (this.acceptPatterns && this.acceptPatterns.length > 0 &&
        !this.acceptPatterns.some(globHit)) {
      return {
        allowed: false,
        reason: "pattern-not-accepted",
        message: `URL does not match accept patterns`
      };
    }
    if (this.rejectPatterns && this.rejectPatterns.length > 0 &&
        this.rejectPatterns.some(globHit)) {
      return {
        allowed: false,
        reason: "pattern-rejected",
        message: `URL matches reject pattern`
      };
    }
    if (this.acceptRegex && !UrlFilter.regexMatches(this.acceptRegex, url)) {
      return {
        allowed: false,
        reason: "pattern-not-accepted",
        message: `URL does not match accept regex`
      };
    }
    if (this.rejectRegex && UrlFilter.regexMatches(this.rejectRegex, url)) {
      return {
        allowed: false,
        reason: "pattern-rejected",
        message: `URL matches reject regex`
      };
    }
    return { allowed: true };
  }

  /**
   * @param {URL} parsed
   * @returns {string} Last path segment, or "" when the path ends in "/".
   */
  getFilename(parsed) {
    const path = parsed.pathname;
    const lastSlash = path.lastIndexOf("/");
    if (lastSlash === -1) {
      return "";
    }
    return path.substring(lastSlash + 1);
  }

  /**
   * Case-insensitive glob match over the whole string: `*` matches any run
   * of characters, `?` matches a single character, everything else is
   * literal. Compiled patterns are cached per instance.
   *
   * @param {string} str - Text to test.
   * @param {string} pattern - Glob pattern.
   * @returns {boolean}
   */
  matchGlob(str, pattern) {
    let regex = this.globCache.get(pattern);
    if (regex === undefined) {
      const regexStr = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
      try {
        regex = new RegExp(`^${regexStr}$`, "i");
      } catch {
        regex = null;
      }
      this.globCache.set(pattern, regex);
    }
    return regex !== null && regex.test(str);
  }

  /**
   * @returns {number} Maximum recursion depth: Infinity for `mirror` or a
   *   configured depth of 0, otherwise `depth`, then `maxDepth`, then 5.
   */
  getMaxDepth() {
    if (this.options.mirror) {
      return Infinity;
    }
    const depth = this.options.depth ?? this.options.maxDepth ?? 5;
    return depth === 0 ? Infinity : depth;
  }

  /** @returns {boolean} True when `recursive` or `mirror` is exactly `true`. */
  isRecursive() {
    return this.options.recursive === true || this.options.mirror === true;
  }

  /**
   * @param {string} url
   * @returns {boolean} True when `url` was registered verbatim as a start URL.
   */
  isStartUrl(url) {
    return this.startUrls.has(url);
  }

  /** @returns {Set<string>} The live set of registered start URLs. */
  getStartUrls() {
    return this.startUrls;
  }

  /**
   * Shallow-merges `options` over the current options and rebuilds all
   * derived filters.
   *
   * @param {object} options - Partial options to merge in.
   */
  updateOptions(options) {
    this.options = { ...this.options, ...options };
    this.initializeFilters();
  }
}
348
+ export default UrlFilter;