rezo 1.0.66 → 1.0.68
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapters/entries/curl.d.ts +5 -0
- package/dist/adapters/entries/fetch.d.ts +5 -0
- package/dist/adapters/entries/http.d.ts +5 -0
- package/dist/adapters/entries/http2.d.ts +5 -0
- package/dist/adapters/entries/react-native.d.ts +5 -0
- package/dist/adapters/entries/xhr.d.ts +5 -0
- package/dist/adapters/index.cjs +6 -6
- package/dist/cache/index.cjs +9 -9
- package/dist/crawler/crawler.cjs +26 -5
- package/dist/crawler/crawler.js +26 -5
- package/dist/crawler/index.cjs +40 -40
- package/dist/crawler.d.ts +10 -0
- package/dist/entries/crawler.cjs +4 -4
- package/dist/index.cjs +27 -27
- package/dist/index.d.ts +5 -0
- package/dist/internal/agents/index.cjs +10 -10
- package/dist/platform/browser.d.ts +5 -0
- package/dist/platform/bun.d.ts +5 -0
- package/dist/platform/deno.d.ts +5 -0
- package/dist/platform/node.d.ts +5 -0
- package/dist/platform/react-native.d.ts +5 -0
- package/dist/platform/worker.d.ts +5 -0
- package/dist/proxy/index.cjs +4 -4
- package/dist/proxy/manager.cjs +1 -1
- package/dist/proxy/manager.js +1 -1
- package/dist/queue/index.cjs +8 -8
- package/dist/queue/queue.cjs +3 -1
- package/dist/queue/queue.js +3 -1
- package/dist/responses/universal/index.cjs +11 -11
- package/dist/wget/asset-extractor.cjs +556 -0
- package/dist/wget/asset-extractor.js +553 -0
- package/dist/wget/asset-organizer.cjs +230 -0
- package/dist/wget/asset-organizer.js +227 -0
- package/dist/wget/download-cache.cjs +221 -0
- package/dist/wget/download-cache.js +218 -0
- package/dist/wget/downloader.cjs +607 -0
- package/dist/wget/downloader.js +604 -0
- package/dist/wget/file-writer.cjs +349 -0
- package/dist/wget/file-writer.js +346 -0
- package/dist/wget/filter-lists.cjs +1330 -0
- package/dist/wget/filter-lists.js +1330 -0
- package/dist/wget/index.cjs +633 -0
- package/dist/wget/index.d.ts +8486 -0
- package/dist/wget/index.js +614 -0
- package/dist/wget/link-converter.cjs +297 -0
- package/dist/wget/link-converter.js +294 -0
- package/dist/wget/progress.cjs +271 -0
- package/dist/wget/progress.js +266 -0
- package/dist/wget/resume.cjs +166 -0
- package/dist/wget/resume.js +163 -0
- package/dist/wget/robots.cjs +303 -0
- package/dist/wget/robots.js +300 -0
- package/dist/wget/types.cjs +200 -0
- package/dist/wget/types.js +197 -0
- package/dist/wget/url-filter.cjs +351 -0
- package/dist/wget/url-filter.js +348 -0
- package/package.json +6 -1
|
@@ -2128,6 +2128,8 @@ declare class RezoQueue<T = any> {
|
|
|
2128
2128
|
private readonly throughputWindowSize;
|
|
2129
2129
|
private idlePromise?;
|
|
2130
2130
|
private emptyPromise?;
|
|
2131
|
+
/** Tracks if queue has ever had work added - ensures onIdle waits for first task */
|
|
2132
|
+
private hasEverBeenActive;
|
|
2131
2133
|
readonly config: Required<QueueConfig>;
|
|
2132
2134
|
/**
|
|
2133
2135
|
* Create a new RezoQueue
|
|
@@ -2202,6 +2204,9 @@ declare class RezoQueue<T = any> {
|
|
|
2202
2204
|
}) => boolean): number;
|
|
2203
2205
|
/**
|
|
2204
2206
|
* Wait for queue to become idle (no running or pending tasks)
|
|
2207
|
+
*
|
|
2208
|
+
* Unlike a simple "isIdle" check, this properly waits for work to be added
|
|
2209
|
+
* and completed if called before any tasks are queued (matches p-queue behavior).
|
|
2205
2210
|
*/
|
|
2206
2211
|
onIdle(): Promise<void>;
|
|
2207
2212
|
/**
|
package/dist/proxy/index.cjs
CHANGED
|
@@ -1,9 +1,9 @@
|
|
|
1
1
|
const { Agent, HttpProxyAgent, HttpsProxyAgent, SocksProxyAgent } = require('../internal/agents/index.cjs');
|
|
2
2
|
const { parseProxyString } = require('./parse.cjs');
|
|
3
|
-
const
|
|
4
|
-
exports.ProxyManager =
|
|
5
|
-
const
|
|
6
|
-
exports.parseProxyString =
|
|
3
|
+
const _mod_nqcj71 = require('./manager.cjs');
|
|
4
|
+
exports.ProxyManager = _mod_nqcj71.ProxyManager;;
|
|
5
|
+
const _mod_ipg17b = require('./parse.cjs');
|
|
6
|
+
exports.parseProxyString = _mod_ipg17b.parseProxyString;;
|
|
7
7
|
function createOptions(uri, opts) {
|
|
8
8
|
if (uri instanceof URL || typeof uri === "string") {
|
|
9
9
|
return {
|
package/dist/proxy/manager.cjs
CHANGED
package/dist/proxy/manager.js
CHANGED
package/dist/queue/index.cjs
CHANGED
|
@@ -1,8 +1,8 @@
|
|
|
1
|
-
const
|
|
2
|
-
exports.RezoQueue =
|
|
3
|
-
const
|
|
4
|
-
exports.HttpQueue =
|
|
5
|
-
exports.extractDomain =
|
|
6
|
-
const
|
|
7
|
-
exports.Priority =
|
|
8
|
-
exports.HttpMethodPriority =
|
|
1
|
+
const _mod_uimwsu = require('./queue.cjs');
|
|
2
|
+
exports.RezoQueue = _mod_uimwsu.RezoQueue;;
|
|
3
|
+
const _mod_u4qw7d = require('./http-queue.cjs');
|
|
4
|
+
exports.HttpQueue = _mod_u4qw7d.HttpQueue;
|
|
5
|
+
exports.extractDomain = _mod_u4qw7d.extractDomain;;
|
|
6
|
+
const _mod_gb74bt = require('./types.cjs');
|
|
7
|
+
exports.Priority = _mod_gb74bt.Priority;
|
|
8
|
+
exports.HttpMethodPriority = _mod_gb74bt.HttpMethodPriority;;
|
package/dist/queue/queue.cjs
CHANGED
|
@@ -25,6 +25,7 @@ class RezoQueue {
|
|
|
25
25
|
throughputWindowSize = 60;
|
|
26
26
|
idlePromise;
|
|
27
27
|
emptyPromise;
|
|
28
|
+
hasEverBeenActive = false;
|
|
28
29
|
config;
|
|
29
30
|
constructor(config = {}) {
|
|
30
31
|
this.config = {
|
|
@@ -92,6 +93,7 @@ class RezoQueue {
|
|
|
92
93
|
});
|
|
93
94
|
this.insertByPriority(task);
|
|
94
95
|
this.statsData.added++;
|
|
96
|
+
this.hasEverBeenActive = true;
|
|
95
97
|
this.emit("add", { id: task.id, priority: task.priority });
|
|
96
98
|
if (this.config.autoStart && !this.isPausedFlag) {
|
|
97
99
|
this.tryRunNext();
|
|
@@ -156,7 +158,7 @@ class RezoQueue {
|
|
|
156
158
|
return count;
|
|
157
159
|
}
|
|
158
160
|
onIdle() {
|
|
159
|
-
if (this.state.isIdle) {
|
|
161
|
+
if (this.hasEverBeenActive && this.state.isIdle) {
|
|
160
162
|
return Promise.resolve();
|
|
161
163
|
}
|
|
162
164
|
if (!this.idlePromise) {
|
package/dist/queue/queue.js
CHANGED
|
@@ -25,6 +25,7 @@ export class RezoQueue {
|
|
|
25
25
|
throughputWindowSize = 60;
|
|
26
26
|
idlePromise;
|
|
27
27
|
emptyPromise;
|
|
28
|
+
hasEverBeenActive = false;
|
|
28
29
|
config;
|
|
29
30
|
constructor(config = {}) {
|
|
30
31
|
this.config = {
|
|
@@ -92,6 +93,7 @@ export class RezoQueue {
|
|
|
92
93
|
});
|
|
93
94
|
this.insertByPriority(task);
|
|
94
95
|
this.statsData.added++;
|
|
96
|
+
this.hasEverBeenActive = true;
|
|
95
97
|
this.emit("add", { id: task.id, priority: task.priority });
|
|
96
98
|
if (this.config.autoStart && !this.isPausedFlag) {
|
|
97
99
|
this.tryRunNext();
|
|
@@ -156,7 +158,7 @@ export class RezoQueue {
|
|
|
156
158
|
return count;
|
|
157
159
|
}
|
|
158
160
|
onIdle() {
|
|
159
|
-
if (this.state.isIdle) {
|
|
161
|
+
if (this.hasEverBeenActive && this.state.isIdle) {
|
|
160
162
|
return Promise.resolve();
|
|
161
163
|
}
|
|
162
164
|
if (!this.idlePromise) {
|
|
@@ -1,11 +1,11 @@
|
|
|
1
|
-
const
|
|
2
|
-
exports.UniversalEventEmitter =
|
|
3
|
-
const
|
|
4
|
-
exports.UniversalStreamResponse =
|
|
5
|
-
exports.StreamResponse =
|
|
6
|
-
const
|
|
7
|
-
exports.UniversalDownloadResponse =
|
|
8
|
-
exports.DownloadResponse =
|
|
9
|
-
const
|
|
10
|
-
exports.UniversalUploadResponse =
|
|
11
|
-
exports.UploadResponse =
|
|
1
|
+
const _mod_n7nj5e = require('./event-emitter.cjs');
|
|
2
|
+
exports.UniversalEventEmitter = _mod_n7nj5e.UniversalEventEmitter;;
|
|
3
|
+
const _mod_r33s45 = require('./stream.cjs');
|
|
4
|
+
exports.UniversalStreamResponse = _mod_r33s45.UniversalStreamResponse;
|
|
5
|
+
exports.StreamResponse = _mod_r33s45.StreamResponse;;
|
|
6
|
+
const _mod_cvx40y = require('./download.cjs');
|
|
7
|
+
exports.UniversalDownloadResponse = _mod_cvx40y.UniversalDownloadResponse;
|
|
8
|
+
exports.DownloadResponse = _mod_cvx40y.DownloadResponse;;
|
|
9
|
+
const _mod_yoj35o = require('./upload.cjs');
|
|
10
|
+
exports.UniversalUploadResponse = _mod_yoj35o.UniversalUploadResponse;
|
|
11
|
+
exports.UploadResponse = _mod_yoj35o.UploadResponse;;
|
|
@@ -0,0 +1,556 @@
|
|
|
1
|
+
const { parseHTML, DOMParser } = require('../dom/index.cjs');
|
|
2
|
+
/**
 * Lowercase HTML tag name -> attributes on that tag whose values may hold URLs.
 *
 * Key order matters: the extractor walks this table with Object.entries(), so
 * the order here is the order in which assets are reported. Tags mapped to an
 * empty list have no URL attributes of their own but are still visited.
 */
const HTML_URL_ATTRIBUTES = {
  // Hyperlinks and document-level references
  a: ["href"],
  area: ["href"],
  link: ["href"],
  base: ["href"],

  // Images (including common lazy-loading data-* conventions)
  img: ["src", "srcset", "data-src", "data-srcset", "data-lazy-src"],
  picture: [],
  source: ["src", "srcset"],

  // Audio / video
  video: ["src", "poster"],
  audio: ["src"],
  track: ["src"],

  // Scripts and styles
  script: ["src"],
  style: [],

  // Embedded documents and plugins
  iframe: ["src"],
  frame: ["src"],
  embed: ["src"],
  object: ["data", "codebase"],

  // Forms
  form: ["action"],
  input: ["src"],
  button: ["formaction"],

  // <meta content="..."> (only meaningful together with META_URL_PROPERTIES)
  meta: ["content"],

  // Legacy presentational backgrounds
  body: ["background"],
  table: ["background"],
  td: ["background"],
  th: ["background"],

  // Citations
  blockquote: ["cite"],
  q: ["cite"],
  del: ["cite"],
  ins: ["cite"],

  // Legacy Java applets
  applet: ["code", "codebase", "archive"]
};

/**
 * Lowercase `property` / `name` values of <meta> tags whose `content`
 * attribute is treated as a URL (Open Graph and Twitter card entries).
 */
const META_URL_PROPERTIES = [
  // Open Graph
  "og:image", "og:image:url", "og:image:secure_url",
  "og:video", "og:video:url", "og:video:secure_url",
  "og:audio", "og:audio:url", "og:audio:secure_url",
  "og:url",
  // Twitter cards
  "twitter:image", "twitter:image:src",
  "twitter:player", "twitter:player:stream"
];

/**
 * `rel` fragments of <link> elements that mark the linked resource as a page
 * requisite. Matching against these is done by substring, not by token.
 */
const REQUISITE_LINK_RELS = [
  "stylesheet",
  "icon", "shortcut icon",
  "apple-touch-icon", "apple-touch-icon-precomposed",
  "manifest",
  "preload", "modulepreload"
];
|
|
59
|
+
/**
 * Classify a URL found in HTML into an asset type.
 *
 * Resolution order:
 *   1. the attribute, when it alone decides the type (`poster` is always an image);
 *   2. the tag the URL was found on (script, style, img, video, ...);
 *   3. for <link>, the `rel` value (matched by substring, case-insensitively);
 *   4. the file extension of the URL path;
 *   5. fallback: "document" for <a>, otherwise "other".
 *
 * @param url       URL the asset points to (extension is read from its path).
 * @param tag       Tag name the URL was found on; case-insensitive.
 * @param attribute Attribute name the URL was read from; may be null/undefined.
 * @param rel       Value of the element's `rel` attribute; may be null/undefined.
 * @returns One of "script", "stylesheet", "image", "video", "audio", "iframe",
 *          "object", "favicon", "manifest", "font", "document", "data", "other".
 */
function determineAssetType(url, tag, attribute, rel) {
  const lowerTag = tag.toLowerCase();
  const lowerUrl = url.toLowerCase();

  // A poster frame is a still image even though it sits on a <video> element;
  // without this check the tag-based rule below would report it as "video".
  if (attribute && attribute.toLowerCase() === "poster")
    return "image";

  if (lowerTag === "script")
    return "script";
  if (lowerTag === "style")
    return "stylesheet";
  if (lowerTag === "img" || lowerTag === "picture")
    return "image";
  if (lowerTag === "video")
    return "video";
  if (lowerTag === "audio")
    return "audio";
  if (lowerTag === "iframe" || lowerTag === "frame")
    return "iframe";
  if (lowerTag === "embed" || lowerTag === "object")
    return "object";

  if (lowerTag === "link" && rel) {
    const lowerRel = rel.toLowerCase();
    if (lowerRel.includes("stylesheet"))
      return "stylesheet";
    if (lowerRel.includes("icon"))
      return "favicon";
    if (lowerRel.includes("manifest"))
      return "manifest";
    // "modulepreload" contains "preload", so one substring test covers both.
    if (lowerRel.includes("preload"))
      return "other";
  }

  const ext = getUrlExtension(lowerUrl);
  switch (ext) {
    case "css":
      return "stylesheet";
    case "js":
    case "mjs":
    case "cjs":
      return "script";
    case "png":
    case "jpg":
    case "jpeg":
    case "gif":
    case "webp":
    case "avif":
    case "svg":
    case "ico":
    case "bmp":
    case "tiff":
    case "tif":
      return "image";
    case "mp4":
    case "webm":
    case "ogg":
    case "ogv":
    case "mov":
    case "avi":
    case "mkv":
      return "video";
    case "mp3":
    case "wav":
    case "flac":
    case "aac":
    case "m4a":
    case "oga":
      return "audio";
    case "woff":
    case "woff2":
    case "ttf":
    case "otf":
    case "eot":
      return "font";
    case "html":
    case "htm":
    case "xhtml":
    case "php":
    case "asp":
    case "aspx":
    case "jsp":
      return "document";
    case "json":
    case "xml":
      return "data";
    case "webmanifest":
      return "manifest";
    default:
      // An extension-less hyperlink most likely leads to another page.
      if (lowerTag === "a")
        return "document";
      return "other";
  }
}
|
|
148
|
+
/**
 * Return the lowercase file extension (without the dot) of a URL's path,
 * or "" when the last path segment has no extension.
 *
 * The URL is parsed relative to a dummy origin so that relative references
 * work and query strings / fragments are ignored. If the URL cannot be parsed
 * at all, a regex fallback picks an alphanumeric extension that is followed by
 * `?`, `#` or the end of the string.
 *
 * @param url Absolute or relative URL.
 * @returns Extension such as "png", or "" if none was found.
 */
function getUrlExtension(url) {
  let pathname;
  try {
    pathname = new URL(url, "http://localhost").pathname;
  } catch {
    // Unparseable input: best-effort textual match.
    const fallback = url.match(/\.([a-zA-Z0-9]+)(?:\?|#|$)/);
    return fallback ? fallback[1].toLowerCase() : "";
  }

  // Only a dot inside the final path segment counts; a trailing dot does not.
  const fileName = pathname.slice(pathname.lastIndexOf("/") + 1);
  const dotAt = fileName.lastIndexOf(".");
  if (dotAt === -1 || dotAt === fileName.length - 1) {
    return "";
  }
  return fileName.slice(dotAt + 1).toLowerCase();
}
|
|
163
|
+
/**
 * Decide whether an asset is needed to render the page it was found on.
 *
 * Stylesheets, scripts, fonts, favicons, manifests and images are always
 * requisites. Anything else only qualifies when it comes from a <link> whose
 * `rel` contains one of the REQUISITE_LINK_RELS fragments (case-insensitive
 * substring match).
 *
 * @param type Asset type as produced by the asset classifier.
 * @param tag  Tag name the asset was found on; case-insensitive.
 * @param rel  The element's `rel` attribute value; may be null/empty.
 * @returns true when the asset is a page requisite.
 */
function isPageRequisite(type, tag, rel) {
  switch (type) {
    case "stylesheet":
    case "script":
    case "font":
    case "favicon":
    case "manifest":
    case "image":
      return true;
  }

  const fromLinkWithRel = tag.toLowerCase() === "link" && Boolean(rel);
  if (!fromLinkWithRel) {
    return false;
  }

  const relValue = rel.toLowerCase();
  return REQUISITE_LINK_RELS.some((fragment) => relValue.includes(fragment));
}
|
|
175
|
+
|
|
176
|
+
/**
 * Discovers asset URLs referenced from HTML, CSS, XML/SVG and JavaScript text.
 *
 * Every extractor returns plain asset records of the shape
 *   { url, type, source, tag?, attribute?, required, inline }
 * where `url` is an absolute http(s) URL, `source` names the kind of content
 * the URL was found in ("html", "css", "svg", "xml", "js") and `inline` marks
 * URLs that came from inline CSS.
 */
class AssetExtractor {
  /**
   * Extract assets from an HTML document.
   *
   * Honors <base href>, walks every tag/attribute pair in HTML_URL_ATTRIBUTES,
   * expands srcset lists, restricts <meta content> to META_URL_PROPERTIES and
   * scans inline `style` attributes (on any element) and <style> blocks.
   *
   * @param html    HTML source text.
   * @param baseUrl URL the document was loaded from.
   * @param options Optional { followTags?, ignoreTags? } (case-insensitive tag lists).
   * @returns Array of asset records in discovery order.
   */
  extractFromHTML(html, baseUrl, options) {
    const assets = [];
    const { document } = parseHTML(html);

    // <base href> overrides the URL that relative references resolve against.
    const baseElement = document.querySelector("base[href]");
    if (baseElement) {
      const baseHref = baseElement.getAttribute("href");
      if (baseHref) {
        baseUrl = this.resolveUrl(baseHref, baseUrl) || baseUrl;
      }
    }

    const followTags = options?.followTags ? new Set(options.followTags.map((t) => t.toLowerCase())) : null;
    const ignoreTags = options?.ignoreTags ? new Set(options.ignoreTags.map((t) => t.toLowerCase())) : null;
    const isTagAllowed = (name) => {
      if (followTags && !followTags.has(name))
        return false;
      if (ignoreTags && ignoreTags.has(name))
        return false;
      return true;
    };

    // Inline CSS found in a style="" attribute.
    const pushInlineStyleAssets = (element, tagName) => {
      const styleAttr = element.getAttribute("style");
      if (!styleAttr)
        return;
      for (const cssAsset of this.extractUrlsFromCSSText(styleAttr, baseUrl)) {
        assets.push({
          ...cssAsset,
          source: "html",
          tag: tagName,
          attribute: "style",
          inline: true
        });
      }
    };

    for (const [tag, attributes] of Object.entries(HTML_URL_ATTRIBUTES)) {
      const lowerTag = tag.toLowerCase();
      if (!isTagAllowed(lowerTag))
        continue;
      const elements = Array.from(document.querySelectorAll(tag));
      for (const element of elements) {
        const rel = element.getAttribute("rel");
        for (const attr of attributes) {
          const value = element.getAttribute(attr);
          if (!value)
            continue;

          if (attr === "srcset" || attr === "data-srcset") {
            for (const url of this.parseSrcset(value, baseUrl)) {
              assets.push({
                url,
                type: "image",
                source: "html",
                tag: lowerTag,
                attribute: attr,
                required: true,
                inline: false
              });
            }
            continue;
          }

          // <meta content> is only a URL for a known set of properties.
          if (lowerTag === "meta" && attr === "content") {
            const property = element.getAttribute("property") || element.getAttribute("name");
            if (!property || !META_URL_PROPERTIES.includes(property.toLowerCase())) {
              continue;
            }
          }

          const resolvedUrl = this.resolveUrl(value, baseUrl);
          if (!resolvedUrl)
            continue;
          const assetType = determineAssetType(resolvedUrl, lowerTag, attr, rel);
          const required = isPageRequisite(assetType, lowerTag, rel);
          assets.push({
            url: resolvedUrl,
            type: assetType,
            source: "html",
            tag: lowerTag,
            attribute: attr,
            required,
            inline: false
          });
        }
        pushInlineStyleAssets(element, lowerTag);
      }
    }

    // Inline styles on tags that are NOT in HTML_URL_ATTRIBUTES (div, span,
    // section, ...). Without this pass `<div style="background:url(...)">`
    // would be missed. Tags in the table were handled (or filtered) above.
    for (const element of Array.from(document.querySelectorAll("[style]"))) {
      const tagName = String(element.tagName || "").toLowerCase();
      if (!tagName || Object.prototype.hasOwnProperty.call(HTML_URL_ATTRIBUTES, tagName))
        continue;
      if (!isTagAllowed(tagName))
        continue;
      pushInlineStyleAssets(element, tagName);
    }

    // <style> blocks: full CSS, including @import rules.
    const styleTags = Array.from(document.querySelectorAll("style"));
    for (const styleTag of styleTags) {
      const cssContent = styleTag.textContent;
      if (cssContent) {
        for (const asset of this.extractFromCSS(cssContent, baseUrl)) {
          assets.push({
            ...asset,
            source: "html",
            tag: "style",
            inline: true
          });
        }
      }
    }
    return assets;
  }

  /**
   * Extract assets from a stylesheet: `@import` targets (typed "stylesheet")
   * followed by every other `url(...)` reference.
   *
   * A URL already reported as an `@import` target is not reported a second
   * time by the `url(...)` pass.
   *
   * @param css     CSS source text.
   * @param baseUrl URL of the stylesheet (or of the page for inline CSS).
   * @returns Array of asset records with source "css".
   */
  extractFromCSS(css, baseUrl) {
    const assets = [];
    const importedUrls = new Set();
    const importRegex = /@import\s+(?:url\s*\(\s*)?['"]?([^'"\)\s;]+)['"]?\s*\)?[^;]*;/gi;
    let match;
    while ((match = importRegex.exec(css)) !== null) {
      const url = this.resolveUrl(match[1], baseUrl);
      if (url) {
        importedUrls.add(url);
        assets.push({
          url,
          type: "stylesheet",
          source: "css",
          required: true,
          inline: false
        });
      }
    }
    for (const asset of this.extractUrlsFromCSSText(css, baseUrl)) {
      // `@import url("x.css")` also matches the generic url() pattern.
      if (!importedUrls.has(asset.url)) {
        assets.push(asset);
      }
    }
    return assets;
  }

  /**
   * Collect every `url(...)` reference in a piece of CSS text.
   * `data:` URIs and fragment-only references are skipped.
   *
   * @param css     CSS text (a stylesheet or the value of a style attribute).
   * @param baseUrl URL that relative references resolve against.
   * @returns Array of asset records with source "css", typed by file extension.
   */
  extractUrlsFromCSSText(css, baseUrl) {
    const assets = [];
    const urlRegex = /url\s*\(\s*(['"]?)([^'"\)\s]+)\1\s*\)/gi;
    let match;
    while ((match = urlRegex.exec(css)) !== null) {
      const urlValue = match[2].trim();
      if (!urlValue || urlValue.startsWith("data:") || urlValue.startsWith("#")) {
        continue;
      }
      const resolvedUrl = this.resolveUrl(urlValue, baseUrl);
      if (!resolvedUrl)
        continue;
      assets.push({
        url: resolvedUrl,
        type: this.guessAssetTypeFromUrl(resolvedUrl),
        source: "css",
        required: true,
        inline: false
      });
    }
    return assets;
  }

  /**
   * Extract assets from an XML or SVG document by reading the `href`, `src`
   * and `xlink:href` attributes of every element. Each URL is reported once.
   *
   * In SVG documents, <image> and <use> references are typed "image" and
   * marked required. Parse failures are logged with console.warn and whatever
   * was collected up to that point is returned.
   *
   * @param xml     XML/SVG source text.
   * @param baseUrl URL of the document.
   * @returns Array of asset records with source "svg" or "xml".
   */
  extractFromXML(xml, baseUrl) {
    const assets = [];
    const seenUrls = new Set();
    try {
      const parser = new DOMParser();
      const doc = parser.parseFromString(xml, "text/xml");
      const isSVG = doc.documentElement?.tagName.toLowerCase() === "svg";
      const source = isSVG ? "svg" : "xml";
      const allElements = Array.from(doc.querySelectorAll("*"));
      for (const el of allElements) {
        for (const attr of ["href", "src", "xlink:href"]) {
          const value = el.getAttribute(attr);
          if (!value || value.startsWith("#") || value.startsWith("data:"))
            continue;
          const resolvedUrl = this.resolveUrl(value, baseUrl);
          if (!resolvedUrl || seenUrls.has(resolvedUrl))
            continue;
          seenUrls.add(resolvedUrl);
          const tagName = el.tagName.toLowerCase();
          const isSvgGraphic = isSVG && (tagName === "image" || tagName === "use");
          assets.push({
            url: resolvedUrl,
            type: isSvgGraphic ? "image" : this.guessAssetTypeFromUrl(resolvedUrl),
            source,
            tag: tagName,
            attribute: attr,
            required: isSvgGraphic,
            inline: false
          });
        }
      }
    } catch (error) {
      console.warn("Failed to parse XML/SVG:", error);
    }
    return assets;
  }

  /**
   * Heuristically extract asset URLs from JavaScript source by scanning
   * string literals that look like absolute http(s) URLs, root-relative paths
   * or "./"-relative paths, and keeping those with a known asset extension.
   * Results are marked `required: false` because this is guesswork.
   *
   * @param js      JavaScript source text.
   * @param baseUrl URL of the script.
   * @returns Array of asset records with source "js".
   */
  extractFromJS(js, baseUrl) {
    const assets = [];
    const seen = new Set();
    const patterns = [
      /['"`](https?:\/\/[^'"`\s]+)['"`]/gi,
      /['"`](\/[a-zA-Z0-9._\-/]+\.[a-zA-Z0-9]+)['"`]/gi,
      /['"`](\.\/[a-zA-Z0-9._\-/]+\.[a-zA-Z0-9]+)['"`]/gi
    ];
    for (const pattern of patterns) {
      let match;
      while ((match = pattern.exec(js)) !== null) {
        const urlCandidate = match[1];
        if (seen.has(urlCandidate))
          continue;
        seen.add(urlCandidate);
        if (urlCandidate.startsWith("data:"))
          continue;
        const resolvedUrl = this.resolveUrl(urlCandidate, baseUrl);
        if (!resolvedUrl)
          continue;
        const ext = getUrlExtension(resolvedUrl);
        if (ext && ["js", "css", "png", "jpg", "jpeg", "gif", "svg", "webp", "json", "html"].includes(ext)) {
          assets.push({
            url: resolvedUrl,
            type: this.guessAssetTypeFromUrl(resolvedUrl),
            source: "js",
            required: false,
            inline: false
          });
        }
      }
    }
    return assets;
  }

  /**
   * Parse a `srcset` attribute value into absolute URLs.
   *
   * Follows the tokenization of the HTML "parse a srcset attribute"
   * algorithm: a URL is a run of non-whitespace characters, so commas INSIDE
   * a URL (CDN transforms such as `w_100,h_100`, or `data:` URIs) do not
   * split it; only trailing commas and commas after the descriptors separate
   * candidates. Descriptors (`2x`, `480w`) are discarded.
   *
   * @param srcset  Raw attribute value.
   * @param baseUrl URL that relative candidates resolve against.
   * @returns Resolved http(s) URLs in source order; unresolvable ones are dropped.
   */
  parseSrcset(srcset, baseUrl) {
    const urls = [];
    const length = srcset.length;
    let pos = 0;
    while (pos < length) {
      // Skip separators before the next candidate.
      while (pos < length && /[\s,]/.test(srcset[pos]))
        pos++;
      if (pos >= length)
        break;

      // The URL is everything up to the next whitespace.
      const start = pos;
      while (pos < length && !/\s/.test(srcset[pos]))
        pos++;
      let candidate = srcset.slice(start, pos);

      if (candidate.endsWith(",")) {
        // "a.jpg, b.jpg": trailing commas end a descriptor-less candidate.
        candidate = candidate.replace(/,+$/, "");
      } else {
        // Skip descriptors up to the next comma outside parentheses.
        let depth = 0;
        while (pos < length) {
          const ch = srcset[pos];
          pos++;
          if (ch === "(") {
            depth++;
          } else if (ch === ")" && depth > 0) {
            depth--;
          } else if (ch === "," && depth === 0) {
            break;
          }
        }
      }

      const url = this.resolveUrl(candidate, baseUrl);
      if (url) {
        urls.push(url);
      }
    }
    return urls;
  }

  /**
   * Resolve a possibly-relative reference against a base URL.
   *
   * @param url     Raw reference; surrounding whitespace is trimmed.
   * @param baseUrl Base URL for relative references.
   * @returns Absolute href, or null for empty input, fragment-only references,
   *          `javascript:` / `data:` / `mailto:` / `tel:` references, anything
   *          that does not resolve to http(s), and unparseable input.
   */
  resolveUrl(url, baseUrl) {
    if (!url)
      return null;
    url = url.trim();
    if (!url || url.startsWith("#") || url.startsWith("javascript:") || url.startsWith("data:") || url.startsWith("mailto:") || url.startsWith("tel:")) {
      return null;
    }
    try {
      const resolved = new URL(url, baseUrl);
      if (resolved.protocol !== "http:" && resolved.protocol !== "https:") {
        return null;
      }
      return resolved.href;
    } catch {
      return null;
    }
  }

  /**
   * Guess an asset type purely from the URL's file extension.
   * Covers the same media/font/manifest extensions as the tag-aware
   * classifier in this file; unknown extensions yield "other".
   *
   * @param url Absolute or relative URL.
   * @returns "stylesheet", "script", "image", "video", "audio", "font",
   *          "document", "data", "manifest" or "other".
   */
  guessAssetTypeFromUrl(url) {
    const ext = getUrlExtension(url);
    switch (ext) {
      case "css":
        return "stylesheet";
      case "js":
      case "mjs":
      case "cjs":
        return "script";
      case "png":
      case "jpg":
      case "jpeg":
      case "gif":
      case "webp":
      case "avif":
      case "svg":
      case "ico":
      case "bmp":
      case "tiff":
      case "tif":
        return "image";
      case "mp4":
      case "webm":
      case "ogg":
      case "ogv":
      case "mov":
      case "avi":
      case "mkv":
        return "video";
      case "mp3":
      case "wav":
      case "flac":
      case "aac":
      case "m4a":
      case "oga":
        return "audio";
      case "woff":
      case "woff2":
      case "ttf":
      case "otf":
      case "eot":
        return "font";
      case "html":
      case "htm":
      case "xhtml":
        return "document";
      case "json":
      case "xml":
        return "data";
      case "webmanifest":
        return "manifest";
      default:
        return "other";
    }
  }

  /**
   * Filter asset records by type, tag, glob, regex and extension rules.
   * An asset is kept only if it passes every rule that is configured.
   *
   * Tag lists are compared case-insensitively. Glob lists, regexes and the
   * extension set are prepared once per call, and `lastIndex` is reset before
   * every regex test so that caller-supplied `/g` or `/y` regexes give the
   * same answer for every asset.
   *
   * @param assets  Asset records to filter.
   * @param options { acceptAssetTypes?, rejectAssetTypes?, followTags?,
   *                  ignoreTags?, accept?, reject?, acceptRegex?, rejectRegex?,
   *                  excludeExtensions? }. `accept`/`reject` take an array or a
   *                comma-separated string of glob patterns.
   * @returns A new array containing the assets that passed.
   */
  filterAssets(assets, options) {
    const toTagSet = (tags) => tags ? new Set(tags.map((t) => t.toLowerCase())) : null;
    const toGlobList = (value) => value ? (Array.isArray(value) ? value : value.split(",")).map((p) => p.trim()) : null;
    const toRegex = (value) => value ? (value instanceof RegExp ? value : new RegExp(value)) : null;
    const regexMatches = (regex, text) => {
      regex.lastIndex = 0; // global/sticky regexes are stateful across test() calls
      return regex.test(text);
    };

    const followTags = toTagSet(options.followTags);
    const ignoreTags = toTagSet(options.ignoreTags);
    const acceptGlobs = toGlobList(options.accept);
    const rejectGlobs = toGlobList(options.reject);
    const acceptRegex = toRegex(options.acceptRegex);
    const rejectRegex = toRegex(options.rejectRegex);
    const excludedExtensions = options.excludeExtensions && options.excludeExtensions.length > 0
      ? new Set(options.excludeExtensions.map((e) => (e.startsWith(".") ? e : "." + e).toLowerCase()))
      : null;

    return assets.filter((asset) => {
      if (options.acceptAssetTypes && options.acceptAssetTypes.length > 0) {
        if (!options.acceptAssetTypes.includes(asset.type)) {
          return false;
        }
      }
      if (options.rejectAssetTypes && options.rejectAssetTypes.length > 0) {
        if (options.rejectAssetTypes.includes(asset.type)) {
          return false;
        }
      }
      if (asset.tag) {
        const tag = asset.tag.toLowerCase();
        if (followTags && !followTags.has(tag))
          return false;
        if (ignoreTags && ignoreTags.has(tag))
          return false;
      }
      if (acceptGlobs && !acceptGlobs.some((p) => this.matchGlob(asset.url, p)))
        return false;
      if (rejectGlobs && rejectGlobs.some((p) => this.matchGlob(asset.url, p)))
        return false;
      if (acceptRegex && !regexMatches(acceptRegex, asset.url))
        return false;
      if (rejectRegex && regexMatches(rejectRegex, asset.url))
        return false;
      if (excludedExtensions) {
        const ext = getUrlExtension(asset.url);
        if (ext && excludedExtensions.has("." + ext.toLowerCase()))
          return false;
      }
      return true;
    });
  }

  /**
   * Case-insensitive glob test: `*` matches any run of characters, `?` any
   * single character, everything else is literal. The pattern may match
   * anywhere inside the URL (it is not anchored).
   *
   * @param url     URL to test.
   * @param pattern Glob pattern.
   * @returns true when the pattern matches some part of the URL.
   */
  matchGlob(url, pattern) {
    const regexStr = pattern.replace(/[.+^${}()|[\]\\]/g, "\\$&").replace(/\*/g, ".*").replace(/\?/g, ".");
    // An unanchored match subsumes the anchored `^...$` form.
    return new RegExp(regexStr, "i").test(url);
  }

  /**
   * Dispatch to the extractor that fits a MIME type.
   *
   * @param content  Document text.
   * @param mimeType MIME type of the content; matched by substring, case-insensitively.
   * @param baseUrl  URL of the document.
   * @param options  Passed through to the HTML extractor only.
   * @returns Array of asset records; empty for unsupported MIME types.
   */
  extract(content, mimeType, baseUrl, options) {
    const lowerMime = mimeType.toLowerCase();
    // "xhtml" contains "html", so this test covers application/xhtml+xml too.
    if (lowerMime.includes("html")) {
      return this.extractFromHTML(content, baseUrl, options);
    }
    if (lowerMime.includes("css")) {
      return this.extractFromCSS(content, baseUrl);
    }
    // image/svg+xml and every other XML flavor share the XML extractor.
    if (lowerMime.includes("svg") || lowerMime.includes("xml")) {
      return this.extractFromXML(content, baseUrl);
    }
    if (lowerMime.includes("javascript") || lowerMime.includes("ecmascript")) {
      return this.extractFromJS(content, baseUrl);
    }
    return [];
  }
}
|
|
553
|
+
|
|
554
|
+
exports.AssetExtractor = AssetExtractor;
|
|
555
|
+
exports.default = AssetExtractor;
|
|
556
|
+
module.exports = Object.assign(AssetExtractor, exports);
|