rezo 1.0.66 → 1.0.68

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (57):
  1. package/dist/adapters/entries/curl.d.ts +5 -0
  2. package/dist/adapters/entries/fetch.d.ts +5 -0
  3. package/dist/adapters/entries/http.d.ts +5 -0
  4. package/dist/adapters/entries/http2.d.ts +5 -0
  5. package/dist/adapters/entries/react-native.d.ts +5 -0
  6. package/dist/adapters/entries/xhr.d.ts +5 -0
  7. package/dist/adapters/index.cjs +6 -6
  8. package/dist/cache/index.cjs +9 -9
  9. package/dist/crawler/crawler.cjs +26 -5
  10. package/dist/crawler/crawler.js +26 -5
  11. package/dist/crawler/index.cjs +40 -40
  12. package/dist/crawler.d.ts +10 -0
  13. package/dist/entries/crawler.cjs +4 -4
  14. package/dist/index.cjs +27 -27
  15. package/dist/index.d.ts +5 -0
  16. package/dist/internal/agents/index.cjs +10 -10
  17. package/dist/platform/browser.d.ts +5 -0
  18. package/dist/platform/bun.d.ts +5 -0
  19. package/dist/platform/deno.d.ts +5 -0
  20. package/dist/platform/node.d.ts +5 -0
  21. package/dist/platform/react-native.d.ts +5 -0
  22. package/dist/platform/worker.d.ts +5 -0
  23. package/dist/proxy/index.cjs +4 -4
  24. package/dist/proxy/manager.cjs +1 -1
  25. package/dist/proxy/manager.js +1 -1
  26. package/dist/queue/index.cjs +8 -8
  27. package/dist/queue/queue.cjs +3 -1
  28. package/dist/queue/queue.js +3 -1
  29. package/dist/responses/universal/index.cjs +11 -11
  30. package/dist/wget/asset-extractor.cjs +556 -0
  31. package/dist/wget/asset-extractor.js +553 -0
  32. package/dist/wget/asset-organizer.cjs +230 -0
  33. package/dist/wget/asset-organizer.js +227 -0
  34. package/dist/wget/download-cache.cjs +221 -0
  35. package/dist/wget/download-cache.js +218 -0
  36. package/dist/wget/downloader.cjs +607 -0
  37. package/dist/wget/downloader.js +604 -0
  38. package/dist/wget/file-writer.cjs +349 -0
  39. package/dist/wget/file-writer.js +346 -0
  40. package/dist/wget/filter-lists.cjs +1330 -0
  41. package/dist/wget/filter-lists.js +1330 -0
  42. package/dist/wget/index.cjs +633 -0
  43. package/dist/wget/index.d.ts +8486 -0
  44. package/dist/wget/index.js +614 -0
  45. package/dist/wget/link-converter.cjs +297 -0
  46. package/dist/wget/link-converter.js +294 -0
  47. package/dist/wget/progress.cjs +271 -0
  48. package/dist/wget/progress.js +266 -0
  49. package/dist/wget/resume.cjs +166 -0
  50. package/dist/wget/resume.js +163 -0
  51. package/dist/wget/robots.cjs +303 -0
  52. package/dist/wget/robots.js +300 -0
  53. package/dist/wget/types.cjs +200 -0
  54. package/dist/wget/types.js +197 -0
  55. package/dist/wget/url-filter.cjs +351 -0
  56. package/dist/wget/url-filter.js +348 -0
  57. package/package.json +6 -1
@@ -0,0 +1,553 @@
1
+ import { parseHTML, DOMParser } from '../dom/index.js';
2
/**
 * Maps a lowercase HTML tag name to the attributes on that tag that may hold a URL.
 * Tags mapped to an empty list carry no URL attributes of their own but are still
 * visited by the extractor. Key order is significant: it is the order in which
 * tags are scanned and therefore the order of the extracted assets.
 */
const HTML_URL_ATTRIBUTES = {
  // Hyperlinks and document-level references.
  a: ["href"], area: ["href"], link: ["href"], base: ["href"],
  // Images and responsive-image containers.
  img: ["src", "srcset", "data-src", "data-srcset", "data-lazy-src"],
  picture: [],
  source: ["src", "srcset"],
  // Audio/video media.
  video: ["src", "poster"], audio: ["src"], track: ["src"],
  // Scripts and inline stylesheets.
  script: ["src"], style: [],
  // Embedded documents and plugins.
  iframe: ["src"], frame: ["src"], embed: ["src"],
  object: ["data", "codebase"],
  // Forms.
  form: ["action"], input: ["src"], button: ["formaction"],
  // Metadata (only honoured for the properties in META_URL_PROPERTIES).
  meta: ["content"],
  // Legacy presentational backgrounds.
  body: ["background"], table: ["background"], td: ["background"], th: ["background"],
  // Citations.
  blockquote: ["cite"], q: ["cite"], del: ["cite"], ins: ["cite"],
  // Legacy Java applets.
  applet: ["code", "codebase", "archive"]
};

/**
 * `<meta property|name="...">` identifiers (lowercase) whose `content` attribute
 * is treated as a URL.
 */
const META_URL_PROPERTIES = [
  // Open Graph
  "og:image", "og:image:url", "og:image:secure_url",
  "og:video", "og:video:url", "og:video:secure_url",
  "og:audio", "og:audio:url", "og:audio:secure_url",
  "og:url",
  // Twitter cards
  "twitter:image", "twitter:image:src",
  "twitter:player", "twitter:player:stream"
];

/**
 * Substrings of a `<link rel="...">` value that mark the linked resource as a
 * page requisite.
 */
const REQUISITE_LINK_RELS = [
  "stylesheet",
  "icon", "shortcut icon",
  "apple-touch-icon", "apple-touch-icon-precomposed",
  "manifest",
  "preload", "modulepreload"
];
59
/**
 * Classifies a URL found in HTML markup into an asset category.
 *
 * The decision is made in this order:
 *   1. the attribute, where it overrides the tag (`<video poster>` is an image);
 *   2. the tag (script, style, img, video, ...);
 *   3. the `rel` value for `<link>` elements;
 *   4. the file extension of the URL path;
 *   5. a fallback of "document" for `<a>` targets and "other" for everything else.
 *
 * @param {string} url - Resolved URL of the asset.
 * @param {string} tag - Name of the element the URL was found on (any case).
 * @param {string} [attribute] - Name of the attribute the URL was found in.
 * @param {string|null} [rel] - The element's `rel` attribute, if any.
 * @returns {string} One of "script", "stylesheet", "image", "video", "audio",
 *   "iframe", "object", "favicon", "manifest", "font", "document", "data", "other".
 */
function determineAssetType(url, tag, attribute, rel) {
  const lowerTag = tag.toLowerCase();

  // The poster of a <video> is a still image, not a media stream, so the
  // attribute must win over the tag-based classification below.
  if (lowerTag === "video" && typeof attribute === "string" && attribute.toLowerCase() === "poster") {
    return "image";
  }

  switch (lowerTag) {
    case "script":
      return "script";
    case "style":
      return "stylesheet";
    case "img":
    case "picture":
      return "image";
    case "video":
      return "video";
    case "audio":
      return "audio";
    case "iframe":
    case "frame":
      return "iframe";
    case "embed":
    case "object":
      return "object";
    default:
      break;
  }

  if (lowerTag === "link" && rel) {
    const lowerRel = rel.toLowerCase();
    if (lowerRel.includes("stylesheet")) {
      return "stylesheet";
    }
    if (lowerRel.includes("icon")) {
      return "favicon";
    }
    if (lowerRel.includes("manifest")) {
      return "manifest";
    }
    // "modulepreload" contains "preload", so one substring test covers both.
    if (lowerRel.includes("preload")) {
      return "other";
    }
  }

  const ext = getUrlExtension(url.toLowerCase());
  switch (ext) {
    case "css":
      return "stylesheet";
    case "js":
    case "mjs":
    case "cjs":
      return "script";
    case "png":
    case "jpg":
    case "jpeg":
    case "gif":
    case "webp":
    case "avif":
    case "svg":
    case "ico":
    case "bmp":
    case "tiff":
    case "tif":
      return "image";
    case "mp4":
    case "webm":
    case "ogg":
    case "ogv":
    case "mov":
    case "avi":
    case "mkv":
      return "video";
    case "mp3":
    case "wav":
    case "flac":
    case "aac":
    case "m4a":
    case "oga":
      return "audio";
    case "woff":
    case "woff2":
    case "ttf":
    case "otf":
    case "eot":
      return "font";
    case "html":
    case "htm":
    case "xhtml":
    case "php":
    case "asp":
    case "aspx":
    case "jsp":
      return "document";
    case "json":
    case "xml":
      return "data";
    case "webmanifest":
      return "manifest";
    default:
      // An anchor with no recognisable extension most likely points at a page.
      return lowerTag === "a" ? "document" : "other";
  }
}
148
/**
 * Returns the lowercase file extension (without the dot) of a URL's path.
 *
 * The URL is parsed relative to a dummy origin so that relative references work.
 * Only a dot in the last path segment that is followed by at least one character
 * counts as an extension. If the URL cannot be parsed, a regular expression
 * looks for an extension in front of `?`, `#`, or the end of the string.
 *
 * @param {string} url - Absolute or relative URL.
 * @returns {string} The extension, or "" when there is none.
 */
function getUrlExtension(url) {
  let path;
  try {
    path = new URL(url, "http://localhost").pathname;
  } catch {
    // Unparseable URL: fall back to a purely textual guess.
    const fallback = url.match(/\.([a-zA-Z0-9]+)(?:\?|#|$)/);
    return fallback ? fallback[1].toLowerCase() : "";
  }
  const dotAt = path.lastIndexOf(".");
  const inLastSegment = dotAt > path.lastIndexOf("/");
  const hasSuffix = dotAt < path.length - 1;
  return inLastSegment && hasSuffix ? path.slice(dotAt + 1).toLowerCase() : "";
}
163
/**
 * Decides whether an asset is needed to render the page that references it.
 *
 * Stylesheets, scripts, fonts, favicons, manifests and images always are.
 * Otherwise only a `<link>` whose `rel` contains one of REQUISITE_LINK_RELS
 * qualifies.
 *
 * @param {string} type - Asset category.
 * @param {string} tag - Element name the asset was found on (any case).
 * @param {string|null} [rel] - The element's `rel` attribute, if any.
 * @returns {boolean} True when the asset is a page requisite.
 */
function isPageRequisite(type, tag, rel) {
  switch (type) {
    case "stylesheet":
    case "script":
    case "font":
    case "favicon":
    case "manifest":
    case "image":
      return true;
    default:
      break;
  }
  const isLinkWithRel = tag.toLowerCase() === "link" && Boolean(rel);
  if (!isLinkWithRel) {
    return false;
  }
  return REQUISITE_LINK_RELS.some((keyword) => rel.toLowerCase().includes(keyword));
}
175
+
176
/**
 * Finds the URLs referenced by HTML, CSS, XML/SVG and JavaScript content and
 * describes each one as an asset record:
 * `{ url, type, source, tag?, attribute?, required, inline }`.
 */
export class AssetExtractor {
  /**
   * Extracts assets from an HTML document.
   *
   * Scans every tag/attribute pair listed in HTML_URL_ATTRIBUTES, the inline
   * `style` attribute of those same elements, and the text of `<style>` tags.
   * A `<base href>` element, when present, replaces `baseUrl` for resolution.
   * Note that inline `style` attributes are only inspected on elements whose
   * tag appears in HTML_URL_ATTRIBUTES.
   *
   * @param {string} html - HTML source.
   * @param {string} baseUrl - URL that relative references are resolved against.
   * @param {{followTags?: string[], ignoreTags?: string[]}} [options] - Optional
   *   tag allow/deny lists, compared case-insensitively.
   * @returns {object[]} Asset records in document-scan order (not de-duplicated).
   */
  extractFromHTML(html, baseUrl, options) {
    const assets = [];
    const { document } = parseHTML(html);

    let effectiveBase = baseUrl;
    const baseHref = document.querySelector("base[href]")?.getAttribute("href");
    if (baseHref) {
      effectiveBase = this.resolveUrl(baseHref, baseUrl) || baseUrl;
    }

    const followTags = options?.followTags ? new Set(options.followTags.map((t) => t.toLowerCase())) : null;
    const ignoreTags = options?.ignoreTags ? new Set(options.ignoreTags.map((t) => t.toLowerCase())) : null;

    for (const [tag, attributes] of Object.entries(HTML_URL_ATTRIBUTES)) {
      const lowerTag = tag.toLowerCase();
      if (followTags && !followTags.has(lowerTag)) {
        continue;
      }
      if (ignoreTags && ignoreTags.has(lowerTag)) {
        continue;
      }
      for (const element of Array.from(document.querySelectorAll(tag))) {
        const rel = element.getAttribute("rel");
        for (const attr of attributes) {
          const value = element.getAttribute(attr);
          if (!value) {
            continue;
          }
          if (attr === "srcset" || attr === "data-srcset") {
            // A srcset holds several candidates; each becomes its own image asset.
            for (const url of this.parseSrcset(value, effectiveBase)) {
              assets.push({
                url,
                type: "image",
                source: "html",
                tag: lowerTag,
                attribute: attr,
                required: true,
                inline: false
              });
            }
            continue;
          }
          if (lowerTag === "meta" && attr === "content") {
            // <meta content> is only a URL for the known Open Graph / Twitter properties.
            const property = element.getAttribute("property") || element.getAttribute("name");
            if (!property || !META_URL_PROPERTIES.includes(property.toLowerCase())) {
              continue;
            }
          }
          const resolvedUrl = this.resolveUrl(value, effectiveBase);
          if (!resolvedUrl) {
            continue;
          }
          const assetType = determineAssetType(resolvedUrl, lowerTag, attr, rel);
          assets.push({
            url: resolvedUrl,
            type: assetType,
            source: "html",
            tag: lowerTag,
            attribute: attr,
            required: isPageRequisite(assetType, lowerTag, rel),
            inline: false
          });
        }
        const styleAttr = element.getAttribute("style");
        if (styleAttr) {
          for (const cssAsset of this.extractUrlsFromCSSText(styleAttr, effectiveBase)) {
            assets.push({
              ...cssAsset,
              source: "html",
              tag: lowerTag,
              attribute: "style",
              inline: true
            });
          }
        }
      }
    }

    for (const styleTag of Array.from(document.querySelectorAll("style"))) {
      const cssContent = styleTag.textContent;
      if (!cssContent) {
        continue;
      }
      for (const asset of this.extractFromCSS(cssContent, effectiveBase)) {
        assets.push({
          ...asset,
          source: "html",
          tag: "style",
          inline: true
        });
      }
    }
    return assets;
  }

  /**
   * Extracts assets from a stylesheet: `@import` targets (typed "stylesheet")
   * followed by every other `url(...)` reference.
   *
   * An `@import url(...)` target is reported once, as the stylesheet, rather
   * than a second time from the generic `url()` scan.
   *
   * @param {string} css - Stylesheet text.
   * @param {string} baseUrl - URL that relative references are resolved against.
   * @returns {object[]} Asset records.
   */
  extractFromCSS(css, baseUrl) {
    const assets = [];
    const importedUrls = new Set();
    const importRegex = /@import\s+(?:url\s*\(\s*)?['"]?([^'"\)\s;]+)['"]?\s*\)?[^;]*;/gi;
    let match;
    while ((match = importRegex.exec(css)) !== null) {
      const url = this.resolveUrl(match[1], baseUrl);
      if (url) {
        importedUrls.add(url);
        assets.push({
          url,
          type: "stylesheet",
          source: "css",
          required: true,
          inline: false
        });
      }
    }
    for (const asset of this.extractUrlsFromCSSText(css, baseUrl)) {
      if (!importedUrls.has(asset.url)) {
        assets.push(asset);
      }
    }
    return assets;
  }

  /**
   * Extracts every `url(...)` reference from a fragment of CSS text.
   * `data:` URIs and fragment-only references are skipped.
   *
   * @param {string} css - CSS text (a stylesheet or a `style` attribute value).
   * @param {string} baseUrl - URL that relative references are resolved against.
   * @returns {object[]} Asset records with `source: "css"` and `required: true`.
   */
  extractUrlsFromCSSText(css, baseUrl) {
    const assets = [];
    const urlRegex = /url\s*\(\s*(['"]?)([^'"\)\s]+)\1\s*\)/gi;
    let match;
    while ((match = urlRegex.exec(css)) !== null) {
      const urlValue = match[2].trim();
      if (!urlValue || urlValue.startsWith("data:") || urlValue.startsWith("#")) {
        continue;
      }
      const resolvedUrl = this.resolveUrl(urlValue, baseUrl);
      if (!resolvedUrl) {
        continue;
      }
      assets.push({
        url: resolvedUrl,
        type: this.guessAssetTypeFromUrl(resolvedUrl),
        source: "css",
        required: true,
        inline: false
      });
    }
    return assets;
  }

  /**
   * Extracts assets from an XML or SVG document by reading the `href`, `src`
   * and `xlink:href` attributes of every element. Each resolved URL is
   * reported once. Parse failures are logged with `console.warn` and yield
   * whatever was collected so far.
   *
   * @param {string} xml - XML/SVG source.
   * @param {string} baseUrl - URL that relative references are resolved against.
   * @returns {object[]} Asset records with `source` "svg" (root element is
   *   `<svg>`) or "xml".
   */
  extractFromXML(xml, baseUrl) {
    const assets = [];
    // Set membership replaces a linear rescan of `assets` for every attribute.
    const seenUrls = new Set();
    try {
      const doc = new DOMParser().parseFromString(xml, "text/xml");
      const isSVG = doc.documentElement?.tagName.toLowerCase() === "svg";
      const source = isSVG ? "svg" : "xml";
      for (const el of Array.from(doc.querySelectorAll("*"))) {
        for (const attr of ["href", "src", "xlink:href"]) {
          const value = el.getAttribute(attr);
          if (!value || value.startsWith("#") || value.startsWith("data:")) {
            continue;
          }
          const resolvedUrl = this.resolveUrl(value, baseUrl);
          if (!resolvedUrl || seenUrls.has(resolvedUrl)) {
            continue;
          }
          seenUrls.add(resolvedUrl);
          const tagName = el.tagName.toLowerCase();
          // In SVG, <image> and <use> targets are rendered content.
          const isSvgGraphic = isSVG && (tagName === "image" || tagName === "use");
          assets.push({
            url: resolvedUrl,
            type: isSvgGraphic ? "image" : this.guessAssetTypeFromUrl(resolvedUrl),
            source,
            tag: tagName,
            attribute: attr,
            required: isSvgGraphic,
            inline: false
          });
        }
      }
    } catch (error) {
      console.warn("Failed to parse XML/SVG:", error);
    }
    return assets;
  }

  /**
   * Heuristically extracts asset URLs from JavaScript source by scanning quoted
   * strings that look like absolute http(s) URLs, root-relative paths or
   * `./` relative paths, and keeping those with a known asset extension.
   *
   * @param {string} js - JavaScript source.
   * @param {string} baseUrl - URL that relative references are resolved against.
   * @returns {object[]} Asset records with `source: "js"` and `required: false`.
   */
  extractFromJS(js, baseUrl) {
    const assets = [];
    const seen = new Set();
    const keptExtensions = new Set(["js", "css", "png", "jpg", "jpeg", "gif", "svg", "webp", "json", "html"]);
    // Regex literals are created per call, so their /g state never leaks between calls.
    const patterns = [
      /['"`](https?:\/\/[^'"`\s]+)['"`]/gi,
      /['"`](\/[a-zA-Z0-9._\-/]+\.[a-zA-Z0-9]+)['"`]/gi,
      /['"`](\.\/[a-zA-Z0-9._\-/]+\.[a-zA-Z0-9]+)['"`]/gi
    ];
    for (const pattern of patterns) {
      let match;
      while ((match = pattern.exec(js)) !== null) {
        const urlCandidate = match[1];
        if (seen.has(urlCandidate)) {
          continue;
        }
        seen.add(urlCandidate);
        if (urlCandidate.startsWith("data:")) {
          continue;
        }
        const resolvedUrl = this.resolveUrl(urlCandidate, baseUrl);
        if (!resolvedUrl) {
          continue;
        }
        const ext = getUrlExtension(resolvedUrl);
        if (ext && keptExtensions.has(ext)) {
          assets.push({
            url: resolvedUrl,
            type: this.guessAssetTypeFromUrl(resolvedUrl),
            source: "js",
            required: false,
            inline: false
          });
        }
      }
    }
    return assets;
  }

  /**
   * Resolves the URL of every candidate in a `srcset` attribute value.
   *
   * Candidates are split on commas followed by a non-space character, and the
   * first whitespace-delimited token of each candidate is taken as its URL.
   *
   * @param {string} srcset - Raw `srcset` attribute value.
   * @param {string} baseUrl - URL that relative references are resolved against.
   * @returns {string[]} Resolved absolute URLs, in source order.
   */
  parseSrcset(srcset, baseUrl) {
    const urls = [];
    for (const candidate of srcset.split(/,\s*(?=[^\s])/)) {
      const [reference] = candidate.trim().split(/\s+/);
      if (!reference) {
        continue;
      }
      const url = this.resolveUrl(reference, baseUrl);
      if (url) {
        urls.push(url);
      }
    }
    return urls;
  }

  /**
   * Resolves a reference against a base URL.
   *
   * @param {string} url - Reference as found in the content.
   * @param {string} baseUrl - Base URL.
   * @returns {string|null} Absolute http(s) URL, or null when the reference is
   *   empty, not a string, fragment-only, uses a `javascript:`/`data:`/`mailto:`/
   *   `tel:` prefix, cannot be parsed, or resolves to a non-http(s) protocol.
   */
  resolveUrl(url, baseUrl) {
    if (typeof url !== "string") {
      return null;
    }
    const reference = url.trim();
    const skippedPrefixes = ["#", "javascript:", "data:", "mailto:", "tel:"];
    if (!reference || skippedPrefixes.some((prefix) => reference.startsWith(prefix))) {
      return null;
    }
    try {
      const resolved = new URL(reference, baseUrl);
      if (resolved.protocol !== "http:" && resolved.protocol !== "https:") {
        return null;
      }
      return resolved.href;
    } catch {
      // Unparseable reference or base: treat as "no URL".
      return null;
    }
  }

  /**
   * Classifies a URL purely by the extension of its path, covering the same
   * media and manifest extensions that determineAssetType recognises.
   *
   * @param {string} url - URL to classify.
   * @returns {string} "stylesheet", "script", "image", "video", "audio", "font",
   *   "document", "data", "manifest" or "other".
   */
  guessAssetTypeFromUrl(url) {
    switch (getUrlExtension(url)) {
      case "css":
        return "stylesheet";
      case "js":
      case "mjs":
      case "cjs":
        return "script";
      case "png":
      case "jpg":
      case "jpeg":
      case "gif":
      case "webp":
      case "avif":
      case "svg":
      case "ico":
      case "bmp":
      case "tiff":
      case "tif":
        return "image";
      case "mp4":
      case "webm":
      case "ogg":
      case "ogv":
      case "mov":
      case "avi":
      case "mkv":
        return "video";
      case "mp3":
      case "wav":
      case "flac":
      case "aac":
      case "m4a":
      case "oga":
        return "audio";
      case "woff":
      case "woff2":
      case "ttf":
      case "otf":
      case "eot":
        return "font";
      case "html":
      case "htm":
      case "xhtml":
        return "document";
      case "json":
      case "xml":
        return "data";
      case "webmanifest":
        return "manifest";
      default:
        return "other";
    }
  }

  /**
   * Returns the assets that pass every configured filter.
   *
   * Supported options: `acceptAssetTypes` / `rejectAssetTypes` (asset type
   * lists), `followTags` / `ignoreTags` (tag lists, case-insensitive, applied
   * only to assets that carry a tag), `accept` / `reject` (glob patterns as an
   * array or comma-separated string), `acceptRegex` / `rejectRegex` (RegExp or
   * pattern string) and `excludeExtensions` (with or without a leading dot).
   *
   * @param {object[]} assets - Asset records to filter.
   * @param {object} [options] - Filter configuration; omitted means no filtering.
   * @returns {object[]} A new array holding the accepted assets.
   */
  filterAssets(assets, options) {
    const opts = options ?? {};

    // Everything that does not depend on the asset is prepared once up front.
    const hasEntries = (list) => Boolean(list) && list.length > 0;
    const toLowerSet = (list) => (list ? new Set(Array.from(list, (t) => String(t).toLowerCase())) : null);
    const toGlobs = (spec) => {
      if (!spec) {
        return null;
      }
      const patterns = Array.isArray(spec) ? spec : spec.split(",");
      return patterns.map((p) => this.compileGlob(p.trim()));
    };
    const toRegex = (spec) => {
      if (!spec) {
        return null;
      }
      return spec instanceof RegExp ? spec : new RegExp(spec);
    };
    // A /g or /y regex remembers lastIndex between test() calls; reset it so
    // that every asset is matched from the start of its URL.
    const matches = (regex, text) => {
      regex.lastIndex = 0;
      return regex.test(text);
    };

    const acceptTypes = hasEntries(opts.acceptAssetTypes) ? opts.acceptAssetTypes : null;
    const rejectTypes = hasEntries(opts.rejectAssetTypes) ? opts.rejectAssetTypes : null;
    const followTags = toLowerSet(opts.followTags);
    const ignoreTags = toLowerSet(opts.ignoreTags);
    const acceptGlobs = toGlobs(opts.accept);
    const rejectGlobs = toGlobs(opts.reject);
    const acceptRegex = toRegex(opts.acceptRegex);
    const rejectRegex = toRegex(opts.rejectRegex);
    const excludedExtensions = hasEntries(opts.excludeExtensions)
      ? new Set(opts.excludeExtensions.map((e) => (e.startsWith(".") ? e : "." + e).toLowerCase()))
      : null;

    return assets.filter((asset) => {
      if (acceptTypes && !acceptTypes.includes(asset.type)) {
        return false;
      }
      if (rejectTypes && rejectTypes.includes(asset.type)) {
        return false;
      }
      if (asset.tag) {
        const lowerTag = String(asset.tag).toLowerCase();
        if (followTags && !followTags.has(lowerTag)) {
          return false;
        }
        if (ignoreTags && ignoreTags.has(lowerTag)) {
          return false;
        }
      }
      if (acceptGlobs && !acceptGlobs.some((glob) => glob.test(asset.url))) {
        return false;
      }
      if (rejectGlobs && rejectGlobs.some((glob) => glob.test(asset.url))) {
        return false;
      }
      if (acceptRegex && !matches(acceptRegex, asset.url)) {
        return false;
      }
      if (rejectRegex && matches(rejectRegex, asset.url)) {
        return false;
      }
      if (excludedExtensions) {
        const ext = getUrlExtension(asset.url);
        if (ext && excludedExtensions.has("." + ext.toLowerCase())) {
          return false;
        }
      }
      return true;
    });
  }

  /**
   * Compiles a glob pattern (`*` = any run of characters, `?` = any single
   * character) into a case-insensitive regular expression. The expression is
   * not anchored, so it matches when the pattern occurs anywhere in the input.
   *
   * @param {string} pattern - Glob pattern.
   * @returns {RegExp} Non-global, case-insensitive regular expression.
   */
  compileGlob(pattern) {
    const regexStr = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
    // `^X$|X` accepts exactly the same inputs as `X`, so the plain form is used.
    return new RegExp(regexStr, "i");
  }

  /**
   * Tests a URL against a glob pattern (see compileGlob for the syntax).
   *
   * @param {string} url - URL to test.
   * @param {string} pattern - Glob pattern.
   * @returns {boolean} True when the pattern matches anywhere in the URL.
   */
  matchGlob(url, pattern) {
    return this.compileGlob(pattern).test(url);
  }

  /**
   * Dispatches to the extractor that fits the content's MIME type.
   *
   * @param {string} content - Document source.
   * @param {string} mimeType - MIME type (may include parameters such as charset).
   * @param {string} baseUrl - URL that relative references are resolved against.
   * @param {object} [options] - Passed through to extractFromHTML.
   * @returns {object[]} Asset records; empty for unsupported or missing MIME types.
   */
  extract(content, mimeType, baseUrl, options) {
    const lowerMime = String(mimeType ?? "").toLowerCase();
    // "html" also covers "xhtml"; this check must precede the generic XML one
    // because "application/xhtml+xml" contains "xml" as well.
    if (lowerMime.includes("html")) {
      return this.extractFromHTML(content, baseUrl, options);
    }
    if (lowerMime.includes("css")) {
      return this.extractFromCSS(content, baseUrl);
    }
    if (lowerMime.includes("svg") || lowerMime.includes("xml")) {
      return this.extractFromXML(content, baseUrl);
    }
    if (lowerMime.includes("javascript") || lowerMime.includes("ecmascript")) {
      return this.extractFromJS(content, baseUrl);
    }
    return [];
  }
}
553
+ export default AssetExtractor;