rezo 1.0.66 → 1.0.68
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/entries/curl.d.ts +5 -0
- package/dist/adapters/entries/fetch.d.ts +5 -0
- package/dist/adapters/entries/http.d.ts +5 -0
- package/dist/adapters/entries/http2.d.ts +5 -0
- package/dist/adapters/entries/react-native.d.ts +5 -0
- package/dist/adapters/entries/xhr.d.ts +5 -0
- package/dist/adapters/index.cjs +6 -6
- package/dist/cache/index.cjs +9 -9
- package/dist/crawler/crawler.cjs +26 -5
- package/dist/crawler/crawler.js +26 -5
- package/dist/crawler/index.cjs +40 -40
- package/dist/crawler.d.ts +10 -0
- package/dist/entries/crawler.cjs +4 -4
- package/dist/index.cjs +27 -27
- package/dist/index.d.ts +5 -0
- package/dist/internal/agents/index.cjs +10 -10
- package/dist/platform/browser.d.ts +5 -0
- package/dist/platform/bun.d.ts +5 -0
- package/dist/platform/deno.d.ts +5 -0
- package/dist/platform/node.d.ts +5 -0
- package/dist/platform/react-native.d.ts +5 -0
- package/dist/platform/worker.d.ts +5 -0
- package/dist/proxy/index.cjs +4 -4
- package/dist/proxy/manager.cjs +1 -1
- package/dist/proxy/manager.js +1 -1
- package/dist/queue/index.cjs +8 -8
- package/dist/queue/queue.cjs +3 -1
- package/dist/queue/queue.js +3 -1
- package/dist/responses/universal/index.cjs +11 -11
- package/dist/wget/asset-extractor.cjs +556 -0
- package/dist/wget/asset-extractor.js +553 -0
- package/dist/wget/asset-organizer.cjs +230 -0
- package/dist/wget/asset-organizer.js +227 -0
- package/dist/wget/download-cache.cjs +221 -0
- package/dist/wget/download-cache.js +218 -0
- package/dist/wget/downloader.cjs +607 -0
- package/dist/wget/downloader.js +604 -0
- package/dist/wget/file-writer.cjs +349 -0
- package/dist/wget/file-writer.js +346 -0
- package/dist/wget/filter-lists.cjs +1330 -0
- package/dist/wget/filter-lists.js +1330 -0
- package/dist/wget/index.cjs +633 -0
- package/dist/wget/index.d.ts +8486 -0
- package/dist/wget/index.js +614 -0
- package/dist/wget/link-converter.cjs +297 -0
- package/dist/wget/link-converter.js +294 -0
- package/dist/wget/progress.cjs +271 -0
- package/dist/wget/progress.js +266 -0
- package/dist/wget/resume.cjs +166 -0
- package/dist/wget/resume.js +163 -0
- package/dist/wget/robots.cjs +303 -0
- package/dist/wget/robots.js +300 -0
- package/dist/wget/types.cjs +200 -0
- package/dist/wget/types.js +197 -0
- package/dist/wget/url-filter.cjs +351 -0
- package/dist/wget/url-filter.js +348 -0
- package/package.json +6 -1
|
@@ -0,0 +1,553 @@
|
|
|
1
|
+
import { parseHTML, DOMParser } from '../dom/index.js';
|
|
2
|
+
/**
 * Lookup table: HTML tag name -> attributes on that tag that may carry a URL.
 *
 * Key order is significant: extraction walks the entries in insertion order,
 * so the order here determines the order of the extracted assets.
 * Tags mapped to an empty list carry no URL attribute of their own but are
 * still visited (e.g. so their inline `style` attribute can be scanned).
 */
const HTML_URL_ATTRIBUTES = {
  // Hyperlinks and document-level references.
  a: ["href"],
  area: ["href"],
  link: ["href"],
  base: ["href"],

  // Images and media.
  img: ["src", "srcset", "data-src", "data-srcset", "data-lazy-src"],
  picture: [],
  source: ["src", "srcset"],
  video: ["src", "poster"],
  audio: ["src"],
  track: ["src"],

  // Scripts and styles.
  script: ["src"],
  style: [],

  // Embedded content.
  iframe: ["src"],
  frame: ["src"],
  embed: ["src"],
  object: ["data", "codebase"],

  // Forms.
  form: ["action"],
  input: ["src"],
  button: ["formaction"],

  // Metadata (only selected properties are treated as URLs by the extractor).
  meta: ["content"],

  // Legacy presentational attributes.
  body: ["background"],
  table: ["background"],
  td: ["background"],
  th: ["background"],

  // Citations.
  blockquote: ["cite"],
  q: ["cite"],
  del: ["cite"],
  ins: ["cite"],

  // Obsolete Java applets.
  applet: ["code", "codebase", "archive"],
};
|
|
33
|
+
/**
 * `<meta property|name="…">` values whose `content` attribute is treated as a URL.
 * Entries are lower-case; the extractor lower-cases the attribute before matching.
 */
const META_URL_PROPERTIES = [
  // Open Graph images
  "og:image",
  "og:image:url",
  "og:image:secure_url",
  // Open Graph video
  "og:video",
  "og:video:url",
  "og:video:secure_url",
  // Open Graph audio
  "og:audio",
  "og:audio:url",
  "og:audio:secure_url",
  // Open Graph canonical URL
  "og:url",
  // Twitter cards
  "twitter:image",
  "twitter:image:src",
  "twitter:player",
  "twitter:player:stream",
];
|
|
49
|
+
/**
 * `<link rel="…">` keywords that mark the linked resource as a page requisite.
 * Matching is done by substring against the lower-cased `rel` attribute.
 */
const REQUISITE_LINK_RELS = [
  // Styling
  "stylesheet",
  // Icons
  "icon",
  "shortcut icon",
  "apple-touch-icon",
  "apple-touch-icon-precomposed",
  // Web app manifest
  "manifest",
  // Resource hints
  "preload",
  "modulepreload",
];
|
|
59
|
+
/**
 * Classifies a URL found in HTML into an asset type.
 *
 * The decision is made in three steps:
 *   1. by the tag (and, for `<video>`, the attribute) the URL was found on;
 *   2. for `<link>`, by keywords in its `rel` attribute;
 *   3. otherwise by the file extension of the URL path.
 *
 * @param {string} url - Resolved URL of the asset.
 * @param {string} tag - Name of the tag the URL was found on (any case).
 * @param {string} [attribute] - Name of the attribute that held the URL.
 * @param {string|null} [rel] - Value of the element's `rel` attribute, if any.
 * @returns {string} One of "script", "stylesheet", "image", "video", "audio",
 *   "iframe", "object", "favicon", "manifest", "font", "document", "data" or "other".
 */
function determineAssetType(url, tag, attribute, rel) {
  const lowerTag = tag.toLowerCase();
  const lowerUrl = url.toLowerCase();
  if (lowerTag === "script")
    return "script";
  if (lowerTag === "style")
    return "stylesheet";
  if (lowerTag === "img" || lowerTag === "picture")
    return "image";
  if (lowerTag === "video") {
    // The poster of a <video> is a still image, not a video stream.
    const isPoster = typeof attribute === "string" && attribute.toLowerCase() === "poster";
    return isPoster ? "image" : "video";
  }
  if (lowerTag === "audio")
    return "audio";
  if (lowerTag === "iframe" || lowerTag === "frame")
    return "iframe";
  if (lowerTag === "embed" || lowerTag === "object")
    return "object";
  if (lowerTag === "link" && rel) {
    const lowerRel = rel.toLowerCase();
    if (lowerRel.includes("stylesheet"))
      return "stylesheet";
    if (lowerRel.includes("icon"))
      return "favicon";
    if (lowerRel.includes("manifest"))
      return "manifest";
    // "modulepreload" contains "preload", so a single check covers both.
    if (lowerRel.includes("preload")) {
      return "other";
    }
  }
  // No tag-based verdict: fall back to the extension of the URL path.
  const ext = getUrlExtension(lowerUrl);
  switch (ext) {
    case "css":
      return "stylesheet";
    case "js":
    case "mjs":
    case "cjs":
      return "script";
    case "png":
    case "jpg":
    case "jpeg":
    case "gif":
    case "webp":
    case "avif":
    case "svg":
    case "ico":
    case "bmp":
    case "tiff":
    case "tif":
      return "image";
    case "mp4":
    case "webm":
    case "ogg":
    case "ogv":
    case "mov":
    case "avi":
    case "mkv":
      return "video";
    case "mp3":
    case "wav":
    case "flac":
    case "aac":
    case "m4a":
    case "oga":
      return "audio";
    case "woff":
    case "woff2":
    case "ttf":
    case "otf":
    case "eot":
      return "font";
    case "html":
    case "htm":
    case "xhtml":
    case "php":
    case "asp":
    case "aspx":
    case "jsp":
      return "document";
    case "json":
    case "xml":
      return "data";
    case "webmanifest":
      return "manifest";
    default:
      // An anchor with an unknown or missing extension is assumed to link to a page.
      if (lowerTag === "a")
        return "document";
      return "other";
  }
}
|
|
148
|
+
/**
 * Returns the lower-cased file extension of a URL's path, without the dot.
 *
 * The URL is parsed relative to a dummy origin so that relative references
 * work; the query string and fragment are therefore ignored. When the URL
 * cannot be parsed at all, a regular expression is tried on the raw string.
 *
 * @param {string} url - Absolute or relative URL.
 * @returns {string} The extension (e.g. "png"), or "" when there is none.
 */
function getUrlExtension(url) {
  let pathname;
  try {
    pathname = new URL(url, "http://localhost").pathname;
  } catch {
    // Unparseable URL: best-effort match of ".ext" right before ?, # or the end.
    const fallback = url.match(/\.([a-zA-Z0-9]+)(?:\?|#|$)/);
    return fallback ? fallback[1].toLowerCase() : "";
  }
  // Only the last path segment may contribute an extension.
  const fileName = pathname.slice(pathname.lastIndexOf("/") + 1);
  const dotIndex = fileName.lastIndexOf(".");
  const hasExtension = dotIndex !== -1 && dotIndex < fileName.length - 1;
  return hasExtension ? fileName.slice(dotIndex + 1).toLowerCase() : "";
}
|
|
163
|
+
/**
 * Decides whether an asset is needed to render the page it was found on.
 *
 * @param {string} type - Asset type as classified by the extractor.
 * @param {string} tag - Name of the tag the asset was found on (any case).
 * @param {string|null} [rel] - The element's `rel` attribute, if any.
 * @returns {boolean} True for stylesheets, scripts, fonts, favicons, manifests
 *   and images, and for `<link>` elements whose `rel` contains one of
 *   REQUISITE_LINK_RELS; false otherwise.
 */
function isPageRequisite(type, tag, rel) {
  switch (type) {
    case "stylesheet":
    case "script":
    case "font":
    case "favicon":
    case "manifest":
    case "image":
      return true;
    default:
      break;
  }
  // Everything else is only a requisite when a <link> declares it as such.
  if (tag.toLowerCase() !== "link" || !rel) {
    return false;
  }
  return REQUISITE_LINK_RELS.some((keyword) => rel.toLowerCase().includes(keyword));
}
|
|
175
|
+
|
|
176
|
+
/**
 * Finds references to downloadable assets inside HTML, CSS, XML/SVG and
 * JavaScript text and reports them as plain descriptor objects of the shape
 * `{ url, type, source, tag?, attribute?, required, inline }`.
 *
 * Only absolute `http:` / `https:` URLs are ever reported; everything else
 * (fragments, `data:`, `javascript:`, `mailto:`, other schemes) is dropped.
 */
export class AssetExtractor {
  /**
   * Extracts assets from an HTML document.
   *
   * A `<base href>` element, when present, replaces `baseUrl` for resolving
   * relative references. URL-bearing attributes are read from the tags listed
   * in HTML_URL_ATTRIBUTES; inline `style` attributes are scanned on every
   * element; the content of `<style>` elements is parsed as CSS.
   *
   * @param {string} html - HTML source.
   * @param {string} baseUrl - URL the document was loaded from.
   * @param {{followTags?: string[], ignoreTags?: string[]}} [options] -
   *   Optional tag filters (case-insensitive). `followTags` restricts
   *   extraction to the listed tags, `ignoreTags` excludes tags.
   * @returns {object[]} Extracted asset descriptors, in document-walk order.
   */
  extractFromHTML(html, baseUrl, options) {
    const assets = [];
    const { document } = parseHTML(html);

    let effectiveBase = baseUrl;
    const baseElement = document.querySelector("base[href]");
    if (baseElement) {
      const baseHref = baseElement.getAttribute("href");
      if (baseHref) {
        // An unusable <base href> falls back to the document URL.
        effectiveBase = this.resolveUrl(baseHref, baseUrl) || baseUrl;
      }
    }

    const toTagSet = (tags) => (tags ? new Set(tags.map((t) => t.toLowerCase())) : null);
    const followTags = toTagSet(options?.followTags);
    const ignoreTags = toTagSet(options?.ignoreTags);
    const isTagAllowed = (name) =>
      (!followTags || followTags.has(name)) && (!ignoreTags || !ignoreTags.has(name));

    // Reports every url(...) found in an element's inline style attribute.
    const collectInlineStyle = (element, tagName) => {
      const styleAttr = element.getAttribute("style");
      if (!styleAttr) {
        return;
      }
      for (const cssAsset of this.extractUrlsFromCSSText(styleAttr, effectiveBase)) {
        assets.push({
          ...cssAsset,
          source: "html",
          tag: tagName,
          attribute: "style",
          inline: true
        });
      }
    };

    for (const [tag, attributes] of Object.entries(HTML_URL_ATTRIBUTES)) {
      const lowerTag = tag.toLowerCase();
      if (!isTagAllowed(lowerTag)) {
        continue;
      }
      for (const element of Array.from(document.querySelectorAll(tag))) {
        const rel = element.getAttribute("rel");
        for (const attr of attributes) {
          const value = element.getAttribute(attr);
          if (!value) {
            continue;
          }
          if (attr === "srcset" || attr === "data-srcset") {
            for (const url of this.parseSrcset(value, effectiveBase)) {
              assets.push({
                url,
                type: "image",
                source: "html",
                tag: lowerTag,
                attribute: attr,
                required: true,
                inline: false
              });
            }
            continue;
          }
          if (lowerTag === "meta" && attr === "content") {
            // Only a fixed set of <meta> properties carries URLs in `content`.
            const property = element.getAttribute("property") || element.getAttribute("name");
            if (!property || !META_URL_PROPERTIES.includes(property.toLowerCase())) {
              continue;
            }
          }
          const resolvedUrl = this.resolveUrl(value, effectiveBase);
          if (!resolvedUrl) {
            continue;
          }
          const assetType = determineAssetType(resolvedUrl, lowerTag, attr, rel);
          assets.push({
            url: resolvedUrl,
            type: assetType,
            source: "html",
            tag: lowerTag,
            attribute: attr,
            required: isPageRequisite(assetType, lowerTag, rel),
            inline: false
          });
        }
        collectInlineStyle(element, lowerTag);
      }
    }

    // Inline styles can appear on any element (div, span, section, ...), not
    // only on the tags in HTML_URL_ATTRIBUTES, which were handled above.
    for (const element of Array.from(document.querySelectorAll("*"))) {
      const tagName = (element.tagName || "").toLowerCase();
      if (!tagName || Object.prototype.hasOwnProperty.call(HTML_URL_ATTRIBUTES, tagName)) {
        continue;
      }
      if (!isTagAllowed(tagName)) {
        continue;
      }
      collectInlineStyle(element, tagName);
    }

    for (const styleTag of Array.from(document.querySelectorAll("style"))) {
      const cssContent = styleTag.textContent;
      if (!cssContent) {
        continue;
      }
      for (const asset of this.extractFromCSS(cssContent, effectiveBase)) {
        assets.push({
          ...asset,
          source: "html",
          tag: "style",
          inline: true
        });
      }
    }
    return assets;
  }

  /**
   * Extracts assets from a stylesheet: `@import` targets (typed "stylesheet")
   * followed by all other `url(...)` references.
   *
   * @param {string} css - CSS source.
   * @param {string} baseUrl - URL relative references are resolved against.
   * @returns {object[]} Extracted asset descriptors.
   */
  extractFromCSS(css, baseUrl) {
    const assets = [];
    const importedUrls = new Set();
    const importRegex = /@import\s+(?:url\s*\(\s*)?['"]?([^'"\)\s;]+)['"]?\s*\)?[^;]*;/gi;
    let match;
    while ((match = importRegex.exec(css)) !== null) {
      const url = this.resolveUrl(match[1], baseUrl);
      if (url) {
        importedUrls.add(url);
        assets.push({
          url,
          type: "stylesheet",
          source: "css",
          required: true,
          inline: false
        });
      }
    }
    // `@import url(x)` also matches the generic url(...) scan; do not report
    // the same import a second time (possibly with a different type).
    for (const urlAsset of this.extractUrlsFromCSSText(css, baseUrl)) {
      if (!importedUrls.has(urlAsset.url)) {
        assets.push(urlAsset);
      }
    }
    return assets;
  }

  /**
   * Extracts every `url(...)` reference from a piece of CSS text
   * (a stylesheet or the value of a `style` attribute).
   *
   * @param {string} css - CSS text.
   * @param {string} baseUrl - URL relative references are resolved against.
   * @returns {object[]} Extracted asset descriptors, all marked as required.
   */
  extractUrlsFromCSSText(css, baseUrl) {
    const assets = [];
    const urlRegex = /url\s*\(\s*(['"]?)([^'"\)\s]+)\1\s*\)/gi;
    let match;
    while ((match = urlRegex.exec(css)) !== null) {
      const urlValue = match[2].trim();
      // Embedded data and same-document fragments are not downloadable.
      if (!urlValue || urlValue.startsWith("data:") || urlValue.startsWith("#")) {
        continue;
      }
      const resolvedUrl = this.resolveUrl(urlValue, baseUrl);
      if (!resolvedUrl) {
        continue;
      }
      assets.push({
        url: resolvedUrl,
        type: this.guessAssetTypeFromUrl(resolvedUrl),
        source: "css",
        required: true,
        inline: false
      });
    }
    return assets;
  }

  /**
   * Extracts assets from an XML or SVG document by reading the `href`, `src`
   * and `xlink:href` attributes of every element. Each URL is reported once.
   * Parse failures are logged with `console.warn` and yield the assets
   * collected so far.
   *
   * @param {string} xml - XML/SVG source.
   * @param {string} baseUrl - URL relative references are resolved against.
   * @returns {object[]} Extracted asset descriptors.
   */
  extractFromXML(xml, baseUrl) {
    const assets = [];
    const seenUrls = new Set();
    try {
      const parser = new DOMParser();
      const doc = parser.parseFromString(xml, "text/xml");
      const isSVG = doc.documentElement?.tagName.toLowerCase() === "svg";
      const source = isSVG ? "svg" : "xml";
      for (const el of Array.from(doc.querySelectorAll("*"))) {
        for (const attr of ["href", "src", "xlink:href"]) {
          const value = el.getAttribute(attr);
          if (!value || value.startsWith("#") || value.startsWith("data:")) {
            continue;
          }
          const resolvedUrl = this.resolveUrl(value, baseUrl);
          if (!resolvedUrl || seenUrls.has(resolvedUrl)) {
            continue;
          }
          seenUrls.add(resolvedUrl);
          const tagName = el.tagName.toLowerCase();
          // Inside SVG, <image> and <use> always reference image content and
          // are required to render the graphic.
          const isSvgImageRef = isSVG && (tagName === "image" || tagName === "use");
          assets.push({
            url: resolvedUrl,
            type: isSvgImageRef ? "image" : this.guessAssetTypeFromUrl(resolvedUrl),
            source,
            tag: tagName,
            attribute: attr,
            required: isSvgImageRef,
            inline: false
          });
        }
      }
    } catch (error) {
      console.warn("Failed to parse XML/SVG:", error);
    }
    return assets;
  }

  /**
   * Heuristically extracts asset URLs from JavaScript source by scanning
   * string literals that look like absolute URLs or `/`- or `./`-rooted file
   * paths. Only URLs with a small set of known extensions are reported, and
   * none of them is marked as required.
   *
   * @param {string} js - JavaScript source.
   * @param {string} baseUrl - URL relative references are resolved against.
   * @returns {object[]} Extracted asset descriptors.
   */
  extractFromJS(js, baseUrl) {
    const assets = [];
    const seen = new Set();
    const reportedExtensions = new Set(["js", "css", "png", "jpg", "jpeg", "gif", "svg", "webp", "json", "html"]);
    const patterns = [
      /['"`](https?:\/\/[^'"`\s]+)['"`]/gi,
      /['"`](\/[a-zA-Z0-9._\-/]+\.[a-zA-Z0-9]+)['"`]/gi,
      /['"`](\.\/[a-zA-Z0-9._\-/]+\.[a-zA-Z0-9]+)['"`]/gi
    ];
    for (const pattern of patterns) {
      let match;
      while ((match = pattern.exec(js)) !== null) {
        const urlCandidate = match[1];
        if (seen.has(urlCandidate)) {
          continue;
        }
        seen.add(urlCandidate);
        if (urlCandidate.startsWith("data:")) {
          continue;
        }
        const resolvedUrl = this.resolveUrl(urlCandidate, baseUrl);
        if (!resolvedUrl) {
          continue;
        }
        const ext = getUrlExtension(resolvedUrl);
        if (ext && reportedExtensions.has(ext)) {
          assets.push({
            url: resolvedUrl,
            type: this.guessAssetTypeFromUrl(resolvedUrl),
            source: "js",
            required: false,
            inline: false
          });
        }
      }
    }
    return assets;
  }

  /**
   * Parses a `srcset` attribute value and returns the resolved candidate URLs.
   *
   * Follows the shape of the HTML "parse a srcset attribute" algorithm: a URL
   * is a run of non-whitespace characters, so commas inside a URL (CDN
   * transformation parameters, `data:` URIs) do not split it. A comma only
   * separates candidates when it ends the URL token or follows the descriptors.
   *
   * @param {string} srcset - Raw attribute value.
   * @param {string} baseUrl - URL relative references are resolved against.
   * @returns {string[]} Absolute http(s) URLs, in source order.
   */
  parseSrcset(srcset, baseUrl) {
    const urls = [];
    const isSpace = (ch) => ch === " " || ch === "\t" || ch === "\n" || ch === "\f" || ch === "\r";
    const end = srcset.length;
    let pos = 0;
    while (pos < end) {
      // Skip separators before the next candidate.
      while (pos < end && (isSpace(srcset[pos]) || srcset[pos] === ",")) {
        pos += 1;
      }
      if (pos >= end) {
        break;
      }
      const start = pos;
      while (pos < end && !isSpace(srcset[pos])) {
        pos += 1;
      }
      let candidate = srcset.slice(start, pos);
      if (candidate.endsWith(",")) {
        // "a.jpg, b.jpg": the trailing comma ends a descriptor-less candidate.
        candidate = candidate.replace(/,+$/, "");
      } else {
        // Skip the descriptors ("2x", "640w", ...) up to the next top-level comma.
        let depth = 0;
        while (pos < end) {
          const ch = srcset[pos];
          if (ch === "(") {
            depth += 1;
          } else if (ch === ")" && depth > 0) {
            depth -= 1;
          } else if (ch === "," && depth === 0) {
            break;
          }
          pos += 1;
        }
      }
      const url = this.resolveUrl(candidate, baseUrl);
      if (url) {
        urls.push(url);
      }
    }
    return urls;
  }

  /**
   * Resolves a possibly relative reference against a base URL.
   *
   * @param {string} url - Reference as found in the source.
   * @param {string} baseUrl - Base URL for relative references.
   * @returns {string|null} The absolute URL, or null when the reference is
   *   empty, a fragment, uses a non-http(s) scheme, or cannot be parsed.
   */
  resolveUrl(url, baseUrl) {
    if (!url) {
      return null;
    }
    const trimmed = url.trim();
    const skippedPrefixes = ["#", "javascript:", "data:", "mailto:", "tel:"];
    if (!trimmed || skippedPrefixes.some((prefix) => trimmed.startsWith(prefix))) {
      return null;
    }
    try {
      const resolved = new URL(trimmed, baseUrl);
      // The URL parser lower-cases the scheme, so this also rejects e.g. "DATA:".
      if (resolved.protocol !== "http:" && resolved.protocol !== "https:") {
        return null;
      }
      return resolved.href;
    } catch {
      // Unparseable reference (or relative reference without a usable base).
      return null;
    }
  }

  /**
   * Classifies a URL by the file extension of its path only.
   *
   * @param {string} url - URL to classify.
   * @returns {string} "stylesheet", "script", "image", "video", "audio",
   *   "font", "document", "data", "manifest" or "other".
   */
  guessAssetTypeFromUrl(url) {
    const ext = getUrlExtension(url);
    switch (ext) {
      case "css":
        return "stylesheet";
      case "js":
      case "mjs":
      case "cjs":
        return "script";
      case "png":
      case "jpg":
      case "jpeg":
      case "gif":
      case "webp":
      case "avif":
      case "svg":
      case "ico":
      case "bmp":
      case "tiff":
      case "tif":
        return "image";
      case "mp4":
      case "webm":
      case "ogg":
      case "ogv":
      case "mov":
      case "avi":
      case "mkv":
        return "video";
      case "mp3":
      case "wav":
      case "flac":
      case "aac":
      case "m4a":
      case "oga":
        return "audio";
      case "woff":
      case "woff2":
      case "ttf":
      case "otf":
      case "eot":
        return "font";
      case "html":
      case "htm":
      case "xhtml":
        return "document";
      case "json":
      case "xml":
        return "data";
      case "webmanifest":
        return "manifest";
      default:
        return "other";
    }
  }

  /**
   * Filters asset descriptors by type, tag, glob pattern, regular expression
   * and file extension. An asset is kept only if it passes every configured
   * filter.
   *
   * @param {object[]} assets - Asset descriptors to filter.
   * @param {object} options - Filter settings:
   *   `acceptAssetTypes` / `rejectAssetTypes` (string[]),
   *   `followTags` / `ignoreTags` (string[], case-insensitive),
   *   `accept` / `reject` (glob list as string[] or comma-separated string),
   *   `acceptRegex` / `rejectRegex` (RegExp or pattern string),
   *   `excludeExtensions` (string[], with or without leading dot).
   * @returns {object[]} A new array with the assets that passed.
   */
  filterAssets(assets, options) {
    if (assets.length === 0) {
      return [];
    }
    const opts = options ?? {};

    // Everything below is derived once instead of once per asset.
    const toTagSet = (tags) => (tags ? new Set(tags.map((t) => t.toLowerCase())) : null);
    const toGlobList = (value) =>
      value ? (Array.isArray(value) ? value : value.split(",")).map((p) => p.trim()) : null;
    const toRegExp = (value) =>
      value ? (value instanceof RegExp ? value : new RegExp(value)) : null;
    // A caller-supplied /g or /y regex keeps state in lastIndex between
    // test() calls; reset it so every asset is matched from the start.
    const testFromStart = (regex, text) => {
      regex.lastIndex = 0;
      return regex.test(text);
    };

    const acceptTypes = opts.acceptAssetTypes?.length > 0 ? opts.acceptAssetTypes : null;
    const rejectTypes = opts.rejectAssetTypes?.length > 0 ? opts.rejectAssetTypes : null;
    const followTags = toTagSet(opts.followTags);
    const ignoreTags = toTagSet(opts.ignoreTags);
    const acceptGlobs = toGlobList(opts.accept);
    const rejectGlobs = toGlobList(opts.reject);
    const acceptRegex = toRegExp(opts.acceptRegex);
    const rejectRegex = toRegExp(opts.rejectRegex);
    const excludedExtensions = opts.excludeExtensions?.length > 0
      ? new Set(opts.excludeExtensions.map((e) => (e.startsWith(".") ? e.slice(1) : e).toLowerCase()))
      : null;

    return assets.filter((asset) => {
      if (acceptTypes && !acceptTypes.includes(asset.type)) {
        return false;
      }
      if (rejectTypes && rejectTypes.includes(asset.type)) {
        return false;
      }
      if (asset.tag) {
        const tagName = asset.tag.toLowerCase();
        if (followTags && !followTags.has(tagName)) {
          return false;
        }
        if (ignoreTags && ignoreTags.has(tagName)) {
          return false;
        }
      }
      if (acceptGlobs && !acceptGlobs.some((p) => this.matchGlob(asset.url, p))) {
        return false;
      }
      if (rejectGlobs && rejectGlobs.some((p) => this.matchGlob(asset.url, p))) {
        return false;
      }
      if (acceptRegex && !testFromStart(acceptRegex, asset.url)) {
        return false;
      }
      if (rejectRegex && testFromStart(rejectRegex, asset.url)) {
        return false;
      }
      if (excludedExtensions) {
        const ext = getUrlExtension(asset.url);
        if (ext && excludedExtensions.has(ext)) {
          return false;
        }
      }
      return true;
    });
  }

  /**
   * Tests a URL against a glob pattern (`*` = any run of characters,
   * `?` = any single character), case-insensitively. The pattern may match
   * anywhere inside the URL; it is not anchored.
   *
   * @param {string} url - URL to test.
   * @param {string} pattern - Glob pattern.
   * @returns {boolean} True when the pattern matches some part of the URL.
   */
  matchGlob(url, pattern) {
    const regexStr = pattern
      .replace(/[.+^${}()|[\]\\]/g, "\\$&")
      .replace(/\*/g, ".*")
      .replace(/\?/g, ".");
    // An anchored alternative (^…$) would be subsumed by the unanchored one,
    // so the unanchored expression alone gives the same result.
    return new RegExp(regexStr, "i").test(url);
  }

  /**
   * Dispatches to the extractor matching a MIME type.
   *
   * @param {string} content - Document text.
   * @param {string} mimeType - MIME type of the content (e.g. a Content-Type
   *   header value); a missing value yields no assets.
   * @param {string} baseUrl - URL relative references are resolved against.
   * @param {object} [options] - Passed through to extractFromHTML.
   * @returns {object[]} Extracted asset descriptors, or [] for unsupported types.
   */
  extract(content, mimeType, baseUrl, options) {
    const lowerMime = String(mimeType ?? "").toLowerCase();
    // "html" also covers "application/xhtml+xml".
    if (lowerMime.includes("html")) {
      return this.extractFromHTML(content, baseUrl, options);
    }
    if (lowerMime.includes("css")) {
      return this.extractFromCSS(content, baseUrl);
    }
    // "image/svg+xml" and all other XML flavours share one extractor.
    if (lowerMime.includes("svg") || lowerMime.includes("xml")) {
      return this.extractFromXML(content, baseUrl);
    }
    if (lowerMime.includes("javascript") || lowerMime.includes("ecmascript")) {
      return this.extractFromJS(content, baseUrl);
    }
    return [];
  }
}
|
|
553
|
+
// The class is also exposed as the module's default export, in addition to the named export.
export default AssetExtractor;
|