cleanscrape 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +90 -0
- package/dist/cli.js +326 -0
- package/dist/extractor.js +642 -0
- package/dist/preview.js +104 -0
- package/dist/types.js +1 -0
- package/package.json +42 -0
package/README.md
ADDED
|
@@ -0,0 +1,90 @@
|
|
|
1
|
+
# cleanscrape
|
|
2
|
+
|
|
3
|
+
`cleanscrape` is a frontend-first alternative to wget/curl for design and product work.
|
|
4
|
+
|
|
5
|
+
Instead of dumping low-quality scraped output, it crawls a live site and exports:
|
|
6
|
+
|
|
7
|
+
- cleaned, formatted HTML
|
|
8
|
+
- merged, editable CSS
|
|
9
|
+
- downloaded image/font/media assets
|
|
10
|
+
- a machine-readable `manifest.json`
|
|
11
|
+
|
|
12
|
+
## Why this exists
|
|
13
|
+
|
|
14
|
+
Traditional download tools optimize for raw bytes. This tool optimizes for **clean editable frontend code**.
|
|
15
|
+
|
|
16
|
+
## Install
|
|
17
|
+
|
|
18
|
+
```bash
|
|
19
|
+
# global install
|
|
20
|
+
npm install -g cleanscrape
|
|
21
|
+
|
|
22
|
+
# local development
|
|
23
|
+
npm install
|
|
24
|
+
npm run build
|
|
25
|
+
npm link
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
## Usage
|
|
29
|
+
|
|
30
|
+
```bash
|
|
31
|
+
# direct command
|
|
32
|
+
cleanscrape https://example.com -o ./output/example
|
|
33
|
+
|
|
34
|
+
# interactive mode (prompts for URL/mode/etc.)
|
|
35
|
+
cleanscrape
|
|
36
|
+
|
|
37
|
+
# set and reuse a default URL (for example a Vercel URL)
|
|
38
|
+
cleanscrape default https://your-app.vercel.app
|
|
39
|
+
|
|
40
|
+
# run the scraped site locally
|
|
41
|
+
cleanscrape run ./output/example --port 4173
|
|
42
|
+
|
|
43
|
+
# everything mode is now default; disable with --no-everything
|
|
44
|
+
cleanscrape https://example.com -o ./output/example --mode clean
|
|
45
|
+
|
|
46
|
+
# strict clean pass for ultra-editable output
|
|
47
|
+
cleanscrape https://example.com -o ./output/example --strict-clean
|
|
48
|
+
|
|
49
|
+
# save this run as your default template
|
|
50
|
+
cleanscrape https://example.com --save-default
|
|
51
|
+
|
|
52
|
+
# whole-site clean crawl (default): follows internal links and strips scripts/tracker junk
|
|
53
|
+
node dist/cli.js frontend https://example.com -o ./output/example --mode clean
|
|
54
|
+
|
|
55
|
+
# mirror mode: keeps script tags and fetches script files when possible
|
|
56
|
+
node dist/cli.js frontend https://example.com -o ./output/example-mirror --mode mirror
|
|
57
|
+
|
|
58
|
+
# tune crawl scope
|
|
59
|
+
node dist/cli.js frontend https://example.com -o ./output/example --depth 4 --max-pages 250
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
## Output structure
|
|
63
|
+
|
|
64
|
+
```text
|
|
65
|
+
output/example/
|
|
66
|
+
manifest.json
|
|
67
|
+
src/
|
|
68
|
+
index.html
|
|
69
|
+
styles.css
|
|
70
|
+
pages/
|
|
71
|
+
about/
|
|
72
|
+
index.html
|
|
73
|
+
styles.css
|
|
74
|
+
pricing/
|
|
75
|
+
index.html
|
|
76
|
+
styles.css
|
|
77
|
+
assets/
|
|
78
|
+
<hostname>/...
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
CLI prints a verification report after each scrape:
|
|
82
|
+
- pages/assets/scripts/styles/fonts/images/others counts
|
|
83
|
+
- `remote_urls_remaining` so you can quickly see if anything external is still referenced
|
|
84
|
+
|
|
85
|
+
## Roadmap
|
|
86
|
+
|
|
87
|
+
- component inference (`Hero`, `Navbar`, `Footer`) into framework templates
|
|
88
|
+
- CSS deduplication and naming normalization
|
|
89
|
+
- JS de-minification and source-map aware rewriting
|
|
90
|
+
- multi-page crawl with route graph export
|
package/dist/cli.js
ADDED
|
@@ -0,0 +1,326 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { fileURLToPath } from "node:url";
|
|
4
|
+
import os from "node:os";
|
|
5
|
+
import { mkdir, readFile, writeFile } from "node:fs/promises";
|
|
6
|
+
import { createInterface } from "node:readline/promises";
|
|
7
|
+
import { stdin as input, stdout as output } from "node:process";
|
|
8
|
+
import { Command } from "commander";
|
|
9
|
+
import { extractFrontend } from "./extractor.js";
|
|
10
|
+
import { startPreviewServer } from "./preview.js";
|
|
11
|
+
const CONFIG_PATH = path.join(os.homedir(), ".cleanscrape", "config.json");
|
|
12
|
+
// Load persisted CLI defaults from ~/.cleanscrape/config.json.
// Returns an empty object when the file is missing or not valid JSON.
async function readConfig() {
  let settings;
  try {
    const raw = await readFile(CONFIG_PATH, "utf8");
    settings = JSON.parse(raw);
  }
  catch {
    settings = {};
  }
  return settings;
}
|
|
21
|
+
// Persist CLI defaults as pretty-printed JSON (with a trailing newline),
// creating the config directory on first use.
async function writeConfig(config) {
  const configDir = path.dirname(CONFIG_PATH);
  await mkdir(configDir, { recursive: true });
  const body = `${JSON.stringify(config, null, 2)}\n`;
  await writeFile(CONFIG_PATH, body, "utf8");
}
|
|
25
|
+
// Store `url` as the default used by interactive mode while keeping every
// other saved setting intact.
async function setDefaultUrl(url) {
  const existing = await readConfig();
  await writeConfig({ ...existing, defaultUrl: url });
}
|
|
30
|
+
// Prompt on stdin/stdout. When `fallback` is non-empty it is shown in
// brackets and returned for an empty answer. The readline interface is
// always closed, even if the prompt throws.
async function askInteractive(promptLabel, fallback = "") {
  const rl = createInterface({ input, output });
  try {
    const hint = fallback ? ` [${fallback}]` : "";
    const raw = await rl.question(`${promptLabel}${hint}: `);
    const trimmed = raw.trim();
    return trimmed === "" ? fallback : trimmed;
  }
  finally {
    rl.close();
  }
}
|
|
41
|
+
// Yes/no prompt: an empty answer returns `fallback`; otherwise only "y" or
// "yes" (case-insensitive) count as true.
async function askYesNo(promptLabel, fallback) {
  const hint = fallback ? "Y/n" : "y/N";
  const reply = await askInteractive(`${promptLabel} (${hint})`, "");
  const normalized = reply.toLowerCase();
  if (normalized === "") {
    return fallback;
  }
  return ["y", "yes"].includes(normalized);
}
|
|
48
|
+
// Resolve the URL and effective options for a scrape run.
//
// Precedence for every option is: explicit CLI flag > saved config > built-in
// default. When `urlArg` is provided the run is non-interactive and the
// precomputed defaults are used directly; otherwise each value is prompted
// for, seeded with the same defaults.
//
// Fix: the non-interactive branch previously re-derived every default inline,
// duplicating the logic above it (and using a subtly different fallback rule
// for `everything`/`strictClean` when the config held non-boolean junk). Both
// paths now share one set of resolved defaults.
async function resolveInteractiveRun(urlArg, opts, config) {
  const modeDefault = String(opts.mode || config.mode || "clean");
  const outDefault = String(opts.out || config.out || "./output");
  const timeoutDefault = String(opts.timeout || config.timeout || "60000");
  const depthDefault = String(opts.depth || config.depth || "3");
  const maxPagesDefault = String(opts.maxPages || config.maxPages || "100");
  const everythingDefault = typeof opts.everything === "boolean" ? opts.everything : typeof config.everything === "boolean" ? config.everything : true;
  const strictDefault = typeof opts.strictClean === "boolean" ? opts.strictClean : typeof config.strictClean === "boolean" ? config.strictClean : false;
  if (urlArg) {
    // Non-interactive path: reuse the defaults computed above.
    return {
      url: urlArg,
      opts: {
        ...opts,
        mode: modeDefault,
        out: outDefault,
        timeout: timeoutDefault,
        depth: depthDefault,
        maxPages: maxPagesDefault,
        everything: everythingDefault,
        strictClean: strictDefault
      }
    };
  }
  // Interactive path: prompt for each value, seeded with the defaults.
  const url = await askInteractive("Website URL (example: https://your-app.vercel.app)", config.defaultUrl || "");
  if (!url) {
    console.error("No URL provided.");
    process.exit(1);
  }
  let mode = modeDefault;
  // Re-ask until the user supplies a valid mode.
  while (mode !== "clean" && mode !== "mirror") {
    mode = await askInteractive("Mode (clean|mirror)", "clean");
  }
  const out = await askInteractive("Output directory", outDefault);
  const timeout = await askInteractive("Timeout (ms)", timeoutDefault);
  const depth = await askInteractive("Crawl depth", depthDefault);
  const maxPages = await askInteractive("Max pages", maxPagesDefault);
  const everything = await askYesNo("Capture everything", everythingDefault);
  const strictClean = await askYesNo("Enable strict clean", strictDefault);
  const saveDefault = await askYesNo("Save these as defaults", false);
  const mergedOpts = {
    ...opts,
    mode,
    out,
    timeout,
    depth,
    maxPages,
    everything,
    strictClean,
    saveDefault
  };
  return { url, opts: mergedOpts };
}
|
|
100
|
+
// Validate one numeric CLI option. Exits the process with an error message
// (matching the original fail-fast behavior) when the value is not a finite
// number >= `min`; otherwise returns the parsed number.
function parseNumericOption(label, raw, min) {
  const value = Number(raw);
  if (!Number.isFinite(value) || value < min) {
    console.error(`Invalid ${label}: ${raw}`);
    process.exit(1);
  }
  return value;
}
// Run a full extraction for `url` with CLI options `opts`: validates inputs,
// invokes the extractor, prints the summary and verification report, and —
// when --save-default was given — persists this run's settings as the new
// defaults. Exits with status 1 on invalid options or extraction failure.
async function runExtraction(url, opts) {
  const mode = String(opts.mode);
  if (mode !== "clean" && mode !== "mirror") {
    console.error(`Invalid mode: ${opts.mode}. Use clean or mirror.`);
    process.exit(1);
  }
  const outDir = path.resolve(String(opts.out));
  // Numeric validation was previously inlined three times verbatim; the
  // shared helper keeps the messages and limits in one place.
  const timeoutMs = parseNumericOption("timeout", opts.timeout, 1000);
  const crawlDepth = parseNumericOption("depth", opts.depth, 0);
  const maxPages = parseNumericOption("max-pages", opts.maxPages, 1);
  console.log(`Extracting ${url}`);
  console.log(`Mode: ${mode}`);
  console.log(`Everything: ${opts.everything !== false ? "yes" : "no"}`);
  console.log(`Strict clean: ${opts.strictClean ? "yes" : "no"}`);
  console.log(`Depth: ${crawlDepth}`);
  console.log(`Max pages: ${maxPages}`);
  console.log(`Output: ${outDir}`);
  try {
    const summary = await extractFrontend({
      url,
      outDir,
      mode,
      everything: opts.everything !== false,
      strictClean: Boolean(opts.strictClean),
      timeoutMs,
      crawlDepth,
      maxPages,
      userAgent: opts.userAgent ? String(opts.userAgent) : undefined
    });
    console.log("\nDone.");
    console.log(`Root HTML: ${summary.htmlPath}`);
    if (summary.cssPath) {
      console.log(`Root CSS: ${summary.cssPath}`);
    }
    console.log(`Pages exported: ${summary.pageCount}`);
    console.log(`Assets downloaded: ${summary.assets.length}`);
    console.log(`Verify: pages=${summary.verification.pages} assets=${summary.verification.assets} scripts=${summary.verification.scripts} styles=${summary.verification.stylesheets} fonts=${summary.verification.fonts} images=${summary.verification.images} others=${summary.verification.others} remote_urls_remaining=${summary.verification.remoteUrlsRemaining}`);
    if (summary.warnings.length > 0) {
      console.log("\nWarnings:");
      for (const warning of summary.warnings) {
        console.log(`- ${warning}`);
      }
    }
    if (opts.saveDefault) {
      // Persist the raw string options (not the parsed numbers) so the saved
      // config round-trips through the same validation on the next run.
      await writeConfig({
        defaultUrl: url,
        mode,
        out: opts.out,
        everything: opts.everything !== false,
        strictClean: Boolean(opts.strictClean),
        timeout: opts.timeout,
        depth: opts.depth,
        maxPages: opts.maxPages
      });
      console.log(`Saved defaults to ${CONFIG_PATH}`);
    }
    console.log(`\nPreview with: cleanscrape run ${outDir}`);
  }
  catch (error) {
    console.error("Extraction failed:");
    console.error(error);
    process.exit(1);
  }
}
|
|
176
|
+
// Serve a previously scraped output directory over HTTP until the process
// receives SIGINT/SIGTERM. Exits with status 1 on an invalid port.
async function runPreview(dirArg, opts) {
  const dir = path.resolve(dirArg || "./output");
  const port = Number(opts.port);
  const portIsValid = Number.isFinite(port) && port >= 1 && port <= 65535;
  if (!portIsValid) {
    console.error(`Invalid port: ${opts.port}`);
    process.exit(1);
  }
  const host = String(opts.host || "127.0.0.1");
  const server = await startPreviewServer({ dir, port, host });
  console.log(`Serving scraped site from ${dir}`);
  console.log(`Open: http://${host}:${port}`);
  console.log("Press Ctrl+C to stop.");
  const shutdown = () => {
    server.close(() => process.exit(0));
  };
  for (const signal of ["SIGINT", "SIGTERM"]) {
    process.on(signal, shutdown);
  }
}
|
|
194
|
+
// Built-in fallback values for every scrape option.
const defaultOptions = {
  out: "./output",
  mode: "clean",
  everything: true,
  timeout: "60000",
  depth: "3",
  maxPages: "100"
};
// Register the shared scrape option set on a Commander command and return it.
// Fix: this list was previously duplicated verbatim in three places (the help
// program, the alias program, and the `frontend` subcommand), inviting drift;
// a single helper keeps them identical. `includeSaveDefault` mirrors the
// original layout — the `frontend` subcommand never exposed --save-default.
function applyScrapeOptions(cmd, includeSaveDefault) {
  cmd
    .option("-o, --out <dir>", "Output directory", defaultOptions.out)
    .option("-m, --mode <mode>", "Export mode: clean or mirror", defaultOptions.mode)
    .option("--everything", "Capture all discoverable assets/code and keep scripts")
    .option("--no-everything", "Disable full capture mode")
    .option("--strict-clean", "Aggressively strip noisy attributes/comments for cleaner manual edits");
  if (includeSaveDefault) {
    cmd.option("--save-default", "Save this run's URL/options as defaults");
  }
  return cmd
    .option("-t, --timeout <ms>", "Timeout in milliseconds", defaultOptions.timeout)
    .option("-d, --depth <n>", "Internal link crawl depth", defaultOptions.depth)
    .option("--max-pages <n>", "Maximum pages to crawl", defaultOptions.maxPages)
    .option("--user-agent <ua>", "Custom user agent");
}
// Determine which entry-point persona is active: when invoked through one of
// the CLI aliases the tool behaves as a single-command scraper with ad-hoc
// "default"/"help"/"run" shims; otherwise it exposes the classic
// `better-wget frontend|run` subcommand layout.
const invokedAs = path.basename(fileURLToPath(import.meta.url));
const argv0 = path.basename(process.argv[1] || "");
const cliAliases = new Set(["scrapify", "scraper", "cleanscrape"]);
const activeCliName = cliAliases.has(argv0) ? argv0 : "cleanscrape";
const isScrapify = cliAliases.has(argv0) || cliAliases.has(invokedAs.replace(/\.js$/, ""));
if (isScrapify) {
  if (process.argv[2] === "default") {
    // `cleanscrape default [url]` — persist the URL used by interactive mode.
    const defaultProgram = new Command();
    defaultProgram
      .name(`${activeCliName} default`)
      .description("Set default URL used by interactive mode")
      .argument("[url]", "Default website URL")
      .action(async (url) => {
        const resolved = typeof url === "string" && url.trim() ? url.trim() : await askInteractive("Default URL");
        if (!resolved) {
          console.error("No default URL provided.");
          process.exit(1);
        }
        await setDefaultUrl(resolved);
        console.log(`Default URL set: ${resolved}`);
        console.log(`Config: ${CONFIG_PATH}`);
      });
    defaultProgram.parseAsync(["node", activeCliName, ...process.argv.slice(3)]).catch((err) => {
      console.error(err);
      process.exit(1);
    });
  }
  else if (process.argv[2] === "help") {
    // `cleanscrape help` — print the full option reference and exit.
    const helpProgram = new Command();
    applyScrapeOptions(
      helpProgram
        .name(activeCliName)
        .description("Scrape a site into clean editable frontend code")
        .argument("[url]", "Website URL"),
      true
    );
    helpProgram.outputHelp();
    process.exit(0);
  }
  else if (process.argv[2] === "run") {
    // `cleanscrape run [dir]` — serve previously scraped output locally.
    const runProgram = new Command();
    runProgram
      .name(`${activeCliName} run`)
      .description("Serve a scraped output folder locally")
      .argument("[dir]", "Scraped output directory", "./output")
      .option("-p, --port <port>", "Port", "4173")
      .option("--host <host>", "Host", "127.0.0.1")
      .action(async (dir, opts) => {
        await runPreview(typeof dir === "string" ? dir : undefined, opts);
      });
    runProgram.parseAsync(["node", activeCliName, ...process.argv.slice(3)]).catch((err) => {
      console.error(err);
      process.exit(1);
    });
  }
  else {
    // Bare `cleanscrape [url]` — scrape directly, interactive when no URL.
    const program = new Command();
    applyScrapeOptions(
      program
        .name(activeCliName)
        .description("Scrape a site into clean editable frontend code")
        .argument("[url]", "Website URL"),
      true
    ).action(async (url, opts) => {
      const config = await readConfig();
      const resolved = await resolveInteractiveRun(typeof url === "string" && url.trim() ? url.trim() : undefined, opts, config);
      await runExtraction(resolved.url, resolved.opts);
    });
    program.parseAsync(process.argv).catch((err) => {
      console.error(err);
      process.exit(1);
    });
  }
}
else {
  // Classic layout: explicit `frontend` and `run` subcommands.
  const program = new Command();
  program
    .name("better-wget")
    .description("Capture clean, editable frontend code from a live website")
    .version("0.1.0");
  applyScrapeOptions(
    program
      .command("frontend")
      .description("Export a frontend snapshot as clean HTML/CSS/assets")
      .argument("<url>", "Website URL"),
    false
  ).action(async (url, opts) => {
    await runExtraction(url, opts);
  });
  program
    .command("run")
    .description("Serve a scraped output folder locally")
    .argument("[dir]", "Scraped output directory", "./output")
    .option("-p, --port <port>", "Port", "4173")
    .option("--host <host>", "Host", "127.0.0.1")
    .action(async (dir, opts) => {
      await runPreview(typeof dir === "string" ? dir : undefined, opts);
    });
  program.parseAsync(process.argv).catch((err) => {
    console.error(err);
    process.exit(1);
  });
}
|
|
@@ -0,0 +1,642 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
import { mkdir, writeFile } from "node:fs/promises";
import path from "node:path";
import { URL } from "node:url";
import { load } from "cheerio";
import prettier from "prettier";
|
|
6
|
+
// Hostname fragments of analytics/tracker services; assets on matching hosts
// are skipped entirely during extraction. Frozen so the shared module-level
// constant cannot be mutated by accident.
const TRACKER_HOST_PATTERNS = Object.freeze([
  "google-analytics.com",
  "googletagmanager.com",
  "doubleclick.net",
  "segment.com",
  "mixpanel.com",
  "hotjar.com",
  "sentry.io"
]);
// File extensions that are never HTML pages; same-origin links ending in one
// of these are excluded from the page crawl (they are treated as assets).
const NON_HTML_EXTENSIONS = new Set([
  ".png", ".jpg", ".jpeg", ".gif", ".webp", ".svg", ".ico", ".avif",
  ".pdf", ".zip", ".gz", ".rar", ".7z",
  ".mp4", ".mov", ".webm", ".mp3", ".wav",
  ".woff", ".woff2", ".ttf", ".otf",
  ".css", ".js", ".mjs", ".json", ".xml"
]);
|
|
44
|
+
// Resolve `raw` against `baseUrl`; returns the absolute URL string, or null
// when the combination is not a valid URL.
function normalizeUrl(raw, baseUrl) {
  let resolved = null;
  try {
    resolved = new URL(raw, baseUrl).toString();
  }
  catch {
    resolved = null;
  }
  return resolved;
}
|
|
52
|
+
// Canonicalize a page URL for crawl dedup: drop fragment and query, and strip
// a single trailing slash from non-root paths. Returns null for invalid input.
function normalizePageUrl(raw) {
  try {
    const page = new URL(raw);
    page.hash = "";
    page.search = "";
    const pathname = page.pathname;
    if (pathname.endsWith("/") && pathname.length > 1) {
      page.pathname = pathname.slice(0, pathname.length - 1);
    }
    return page.toString();
  }
  catch {
    return null;
  }
}
|
|
66
|
+
// Collapse runs of characters outside [a-zA-Z0-9._-] into "-", cap the length
// at 180 characters, and fall back to "asset" when nothing remains.
function toSafeFileName(input) {
  const sanitized = input.replace(/[^a-zA-Z0-9._-]+/g, "-").slice(0, 180);
  return sanitized.length > 0 ? sanitized : "asset";
}
|
|
69
|
+
// Best-effort pretty-printing: returns the prettier-formatted text, or the
// original content unchanged when formatting fails (e.g. unparsable input).
async function formatMaybe(content, parser) {
  let formatted;
  try {
    formatted = await prettier.format(content, { parser });
  }
  catch {
    formatted = content;
  }
  return formatted;
}
|
|
77
|
+
// Write `content` to `filePath` as UTF-8, creating parent directories first.
async function saveTextFile(filePath, content) {
  const parent = path.dirname(filePath);
  await mkdir(parent, { recursive: true });
  await writeFile(filePath, content, "utf8");
}
|
|
81
|
+
// Map an asset URL to a deterministic local path under
// <outputRoot>/assets/<host>/.
//
// Fix: a short digest of the full URL is appended to the basename so that
// distinct assets sharing a name on the same host (e.g. /a/logo.png and
// /b/logo.png) no longer collapse onto the same file and overwrite each
// other. The digest is derived from the URL, so repeat runs stay stable.
function buildAssetFilePath(assetUrl, outputRoot, fallbackExt = "") {
  const urlObj = new URL(assetUrl);
  const ext = path.extname(urlObj.pathname) || fallbackExt;
  const base = toSafeFileName(path.basename(urlObj.pathname, ext) || "asset");
  const host = toSafeFileName(urlObj.hostname);
  const digest = createHash("sha1").update(assetUrl).digest("hex").slice(0, 8);
  const dir = path.join(outputRoot, "assets", host);
  return path.join(dir, `${base}-${digest}${ext}`);
}
|
|
89
|
+
// Classify a binary asset URL by extension: "image", "font", or "other".
function inferBinaryAssetKind(assetUrl) {
  const pathname = new URL(assetUrl).pathname.toLowerCase();
  const imagePattern = /\.(png|jpe?g|gif|webp|svg|ico|avif)$/;
  const fontPattern = /\.(woff2?|ttf|otf|eot)$/;
  if (imagePattern.test(pathname)) {
    return "image";
  }
  return fontPattern.test(pathname) ? "font" : "other";
}
|
|
97
|
+
// Classify any asset URL by extension into one of:
// "stylesheet" | "script" | "image" | "font" | "other".
function inferAssetKind(assetUrl) {
  const pathname = new URL(assetUrl).pathname.toLowerCase();
  if (pathname.endsWith(".css")) {
    return "stylesheet";
  }
  if (pathname.endsWith(".js") || pathname.endsWith(".mjs")) {
    return "script";
  }
  // Binary classification inlined from the image/font helper.
  if (/\.(png|jpe?g|gif|webp|svg|ico|avif)$/.test(pathname)) {
    return "image";
  }
  if (/\.(woff2?|ttf|otf|eot)$/.test(pathname)) {
    return "font";
  }
  return "other";
}
|
|
110
|
+
// True when the URL's host belongs to a known tracker domain.
//
// Fix: matching is now "the domain itself or any subdomain of it". The
// previous plain substring check (`host.includes(pattern)`) also flagged
// unrelated hosts such as "segment.com.example.org" or "mysegment.com".
// Invalid URLs are never trackers.
function isTrackerUrl(assetUrl) {
  try {
    const host = new URL(assetUrl).hostname.toLowerCase();
    return TRACKER_HOST_PATTERNS.some((domain) => host === domain || host.endsWith(`.${domain}`));
  }
  catch {
    return false;
  }
}
|
|
119
|
+
// Rewrite every url(...) reference in a stylesheet through `replacer`,
// leaving data: URIs, fragment references, and empty values untouched.
// The original quote character is preserved; unquoted values are emitted
// with double quotes.
function rewriteCssUrls(cssText, replacer) {
  const urlPattern = /url\(\s*(['"]?)([^'")]+)\1\s*\)/g;
  return cssText.replace(urlPattern, (full, quote, rawUrl) => {
    const target = rawUrl.trim();
    const untouched = target === "" || target.startsWith("data:") || target.startsWith("#");
    if (untouched) {
      return full;
    }
    const q = quote === "" ? '"' : quote;
    return `url(${q}${replacer(target)}${q})`;
  });
}
|
|
129
|
+
// Map a page URL onto its local output locations. The site root lands in
// src/ directly; every other route goes under src/pages/<segments>/, each
// with an index.html and a sibling styles.css.
function getPagePaths(pageUrl, outDir) {
  const segments = new URL(pageUrl).pathname
    .split("/")
    .filter(Boolean)
    .map((s) => toSafeFileName(s));
  const isRoot = segments.length === 0;
  const baseDir = isRoot
    ? path.join(outDir, "src")
    : path.join(outDir, "src", "pages", ...segments);
  return {
    routeKey: isRoot ? "/" : `/${segments.join("/")}`,
    htmlPath: path.join(baseDir, "index.html"),
    cssPath: path.join(baseDir, "styles.css")
  };
}
|
|
149
|
+
// fetch() with an AbortController-based timeout and a desktop-Chrome user
// agent fallback. The abort timer is always cleared, even when the request
// rejects.
async function fetchWithTimeout(url, timeoutMs, userAgent) {
  const controller = new AbortController();
  const abortTimer = setTimeout(() => controller.abort(), timeoutMs);
  const ua = userAgent ||
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36";
  try {
    return await fetch(url, {
      signal: controller.signal,
      headers: { "user-agent": ua }
    });
  }
  finally {
    clearTimeout(abortTimer);
  }
}
|
|
165
|
+
// Fetch a text asset (CSS/JS/etc.); returns its body, or null on any
// non-2xx response.
async function downloadTextAsset(assetUrl, timeoutMs, userAgent) {
  const res = await fetchWithTimeout(assetUrl, timeoutMs, userAgent);
  return res.ok ? await res.text() : null;
}
|
|
171
|
+
// Fetch a binary asset at most once per URL (`assetMap` is the cross-page
// cache of url -> saved relative path) and write it under the output root.
// Returns the out-dir-relative path, or null when the request fails.
async function downloadBinaryAsset(assetUrl, outDir, timeoutMs, userAgent, assetMap) {
  const cached = assetMap.get(assetUrl);
  if (cached) {
    return cached;
  }
  const res = await fetchWithTimeout(assetUrl, timeoutMs, userAgent);
  if (!res.ok) {
    return null;
  }
  const bytes = Buffer.from(await res.arrayBuffer());
  const targetPath = buildAssetFilePath(assetUrl, outDir);
  await mkdir(path.dirname(targetPath), { recursive: true });
  await writeFile(targetPath, bytes);
  const savedPath = path.relative(outDir, targetPath);
  assetMap.set(assetUrl, savedPath);
  return savedPath;
}
|
|
186
|
+
// Append `record` to `assets` unless an identical kind+savedPath pair was
// already recorded (dedupes assets referenced by multiple pages).
function maybeAddAsset(assets, seen, record) {
  const key = `${record.kind}:${record.savedPath}`;
  if (!seen.has(key)) {
    seen.add(key);
    assets.push(record);
  }
}
|
|
193
|
+
// True when `candidateUrl` is an http(s) URL on `rootOrigin` whose path does
// not end in a known non-HTML extension — i.e. a link worth crawling as a
// page. Unparsable URLs are rejected.
function isSameOriginHtmlLink(candidateUrl, rootOrigin) {
  let parsed;
  try {
    parsed = new URL(candidateUrl);
  }
  catch {
    return false;
  }
  const sameOrigin = parsed.origin === rootOrigin;
  const webProtocol = parsed.protocol === "http:" || parsed.protocol === "https:";
  if (!sameOrigin || !webProtocol) {
    return false;
  }
  const ext = path.extname(parsed.pathname.toLowerCase());
  return !(ext && NON_HTML_EXTENSIONS.has(ext));
}
|
|
210
|
+
// Relative web path (forward slashes) from `fromDir` to `toPath`, always
// prefixed so browsers resolve it relatively ("./" for same-dir targets).
function toRelativeWebPath(fromDir, toPath) {
  const segments = path.relative(fromDir, toPath).split(path.sep);
  const rel = segments.join("/");
  switch (true) {
    case rel === "":
      return "./";
    case rel.startsWith("."):
      return rel;
    default:
      return `./${rel}`;
  }
}
|
|
216
|
+
// Strip attributes that add noise for manual editing: every data-* attribute,
// every on* handler, and the security/priority hints nonce, integrity,
// crossorigin, fetchpriority, and referrerpolicy.
function strictCleanDom($) {
  const exactDrops = new Set(["nonce", "integrity", "crossorigin", "fetchpriority", "referrerpolicy"]);
  $("*").each((_, el) => {
    for (const attrName of Object.keys(el.attribs || {})) {
      const lower = attrName.toLowerCase();
      const isNoise = lower.startsWith("data-") || lower.startsWith("on") || exactDrops.has(lower);
      if (isNoise) {
        $(el).removeAttr(attrName);
      }
    }
  });
}
|
|
235
|
+
// Collect every absolute (http/https) or protocol-relative URL-looking token
// left in `content`; used by the verification report to spot remote refs.
function findRemoteUrls(content) {
  const pattern = /(?:https?:)?\/\/[^\s"'()<>]+/g;
  const found = [];
  for (const candidate of content.match(pattern) ?? []) {
    const trimmed = candidate.trim();
    if (trimmed) {
      found.push(trimmed);
    }
  }
  return found;
}
|
|
239
|
+
// Download the asset referenced by `attr` on `el`, rewrite the attribute to a
// page-relative local path, and record the asset. Values that already point
// at local output (relative paths or the assets/src folders), tracker URLs,
// and values that fail to resolve are left untouched; fetch failures are
// reported through `warnings`.
async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings) {
  const value = $(el).attr(attr);
  if (!value) {
    return;
  }
  const localPrefixes = ["./", "../", "assets/", "/assets/", "src/", "/src/"];
  if (localPrefixes.some((prefix) => value.startsWith(prefix))) {
    return;
  }
  const absolute = normalizeUrl(value, baseUrl);
  if (!absolute || isTrackerUrl(absolute)) {
    return;
  }
  const savedPath = await downloadBinaryAsset(absolute, outDir, options.timeoutMs, options.userAgent, assetMap);
  if (!savedPath) {
    warnings.push(`Failed to fetch asset: ${absolute}`);
    return;
  }
  const localPath = path.join(outDir, savedPath);
  $(el).attr(attr, toRelativeWebPath(pageDir, localPath));
  maybeAddAsset(assets, seenAssetRecords, {
    url: absolute,
    kind: inferAssetKind(absolute),
    savedPath
  });
}
|
|
270
|
+
export async function extractFrontend(options) {
|
|
271
|
+
const everything = options.everything === true;
|
|
272
|
+
const strictClean = options.strictClean === true;
|
|
273
|
+
const outDir = path.resolve(options.outDir);
|
|
274
|
+
await mkdir(path.join(outDir, "src"), { recursive: true });
|
|
275
|
+
const normalizedRootUrl = normalizePageUrl(options.url);
|
|
276
|
+
if (!normalizedRootUrl) {
|
|
277
|
+
throw new Error(`Invalid URL: ${options.url}`);
|
|
278
|
+
}
|
|
279
|
+
const rootOrigin = new URL(normalizedRootUrl).origin;
|
|
280
|
+
const queue = [{ url: normalizedRootUrl, depth: 0 }];
|
|
281
|
+
const visited = new Set();
|
|
282
|
+
const assets = [];
|
|
283
|
+
const warnings = [];
|
|
284
|
+
const pages = [];
|
|
285
|
+
const remoteUrlsRemaining = new Set();
|
|
286
|
+
const assetMap = new Map();
|
|
287
|
+
const seenAssetRecords = new Set();
|
|
288
|
+
while (queue.length > 0 && visited.size < options.maxPages) {
|
|
289
|
+
const current = queue.shift();
|
|
290
|
+
if (!current)
|
|
291
|
+
break;
|
|
292
|
+
if (visited.has(current.url))
|
|
293
|
+
continue;
|
|
294
|
+
visited.add(current.url);
|
|
295
|
+
let htmlRes;
|
|
296
|
+
try {
|
|
297
|
+
htmlRes = await fetchWithTimeout(current.url, options.timeoutMs, options.userAgent);
|
|
298
|
+
}
|
|
299
|
+
catch (error) {
|
|
300
|
+
warnings.push(`Failed to fetch page: ${current.url} (${String(error)})`);
|
|
301
|
+
continue;
|
|
302
|
+
}
|
|
303
|
+
if (!htmlRes.ok) {
|
|
304
|
+
warnings.push(`Failed to fetch page: ${current.url} (HTTP ${htmlRes.status})`);
|
|
305
|
+
continue;
|
|
306
|
+
}
|
|
307
|
+
const contentType = htmlRes.headers.get("content-type") || "";
|
|
308
|
+
if (!contentType.includes("text/html")) {
|
|
309
|
+
warnings.push(`Skipped non-HTML page: ${current.url} (${contentType || "unknown content-type"})`);
|
|
310
|
+
continue;
|
|
311
|
+
}
|
|
312
|
+
const domHtml = await htmlRes.text();
|
|
313
|
+
const $ = load(domHtml);
|
|
314
|
+
if (current.depth < options.crawlDepth) {
|
|
315
|
+
const discovered = new Set();
|
|
316
|
+
$("a[href]").each((_, el) => {
|
|
317
|
+
const href = $(el).attr("href");
|
|
318
|
+
if (!href)
|
|
319
|
+
return;
|
|
320
|
+
if (href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
|
|
321
|
+
return;
|
|
322
|
+
}
|
|
323
|
+
const absolute = normalizeUrl(href, current.url);
|
|
324
|
+
if (!absolute)
|
|
325
|
+
return;
|
|
326
|
+
const normalized = normalizePageUrl(absolute);
|
|
327
|
+
if (!normalized)
|
|
328
|
+
return;
|
|
329
|
+
if (!isSameOriginHtmlLink(normalized, rootOrigin))
|
|
330
|
+
return;
|
|
331
|
+
if (visited.has(normalized) || discovered.has(normalized))
|
|
332
|
+
return;
|
|
333
|
+
discovered.add(normalized);
|
|
334
|
+
queue.push({ url: normalized, depth: current.depth + 1 });
|
|
335
|
+
});
|
|
336
|
+
}
|
|
337
|
+
const pagePaths = getPagePaths(current.url, outDir);
|
|
338
|
+
const pageDir = path.dirname(pagePaths.htmlPath);
|
|
339
|
+
const cssDir = path.dirname(pagePaths.cssPath);
|
|
340
|
+
const cssBlocks = [];
|
|
341
|
+
$("style").each((_, el) => {
|
|
342
|
+
const text = $(el).html();
|
|
343
|
+
if (text?.trim())
|
|
344
|
+
cssBlocks.push(text.trim());
|
|
345
|
+
});
|
|
346
|
+
const stylesheetLinks = $("link[rel='stylesheet']").toArray();
|
|
347
|
+
for (const el of stylesheetLinks) {
|
|
348
|
+
const href = $(el).attr("href");
|
|
349
|
+
if (!href)
|
|
350
|
+
continue;
|
|
351
|
+
const absolute = normalizeUrl(href, current.url);
|
|
352
|
+
if (!absolute)
|
|
353
|
+
continue;
|
|
354
|
+
try {
|
|
355
|
+
const rawCss = await downloadTextAsset(absolute, options.timeoutMs, options.userAgent);
|
|
356
|
+
if (!rawCss) {
|
|
357
|
+
warnings.push(`Failed to fetch stylesheet: ${absolute}`);
|
|
358
|
+
continue;
|
|
359
|
+
}
|
|
360
|
+
let cleanedCss = rawCss;
|
|
361
|
+
const importRegex = /@import\s+(?:url\()?['"]?([^'")]+)['"]?\)?\s*;/g;
|
|
362
|
+
const importUrls = Array.from(cleanedCss.matchAll(importRegex)).map((m) => m[1]).filter(Boolean);
|
|
363
|
+
for (const importUrl of importUrls) {
|
|
364
|
+
const importAbs = normalizeUrl(importUrl, absolute);
|
|
365
|
+
if (!importAbs)
|
|
366
|
+
continue;
|
|
367
|
+
const importCss = await downloadTextAsset(importAbs, options.timeoutMs, options.userAgent);
|
|
368
|
+
if (!importCss)
|
|
369
|
+
continue;
|
|
370
|
+
cleanedCss = cleanedCss.replace(`@import url(${importUrl});`, "");
|
|
371
|
+
cleanedCss += `\n\n/* inlined import: ${importAbs} */\n${importCss}`;
|
|
372
|
+
}
|
|
373
|
+
cleanedCss = rewriteCssUrls(cleanedCss, (rawUrl) => {
|
|
374
|
+
const abs = normalizeUrl(rawUrl, absolute);
|
|
375
|
+
if (!abs)
|
|
376
|
+
return rawUrl;
|
|
377
|
+
return abs;
|
|
378
|
+
});
|
|
379
|
+
cssBlocks.push(cleanedCss);
|
|
380
|
+
}
|
|
381
|
+
catch (error) {
|
|
382
|
+
warnings.push(`Failed to fetch stylesheet: ${absolute} (${String(error)})`);
|
|
383
|
+
}
|
|
384
|
+
}
|
|
385
|
+
$("style").remove();
|
|
386
|
+
$("link[rel='stylesheet']").remove();
|
|
387
|
+
const cssHref = toRelativeWebPath(pageDir, pagePaths.cssPath);
|
|
388
|
+
$("head").append(`<link rel="stylesheet" href="${cssHref}">`);
|
|
389
|
+
const mediaNodes = $("img[src], source[src], source[srcset], video[poster]").toArray();
|
|
390
|
+
for (const el of mediaNodes) {
|
|
391
|
+
if ($(el).attr("srcset") !== undefined) {
|
|
392
|
+
const srcsetValue = $(el).attr("srcset") || "";
|
|
393
|
+
const first = srcsetValue
|
|
394
|
+
.split(",")
|
|
395
|
+
.map((item) => item.trim().split(/\s+/)[0])
|
|
396
|
+
.find(Boolean);
|
|
397
|
+
if (first) {
|
|
398
|
+
$(el).attr("src", first);
|
|
399
|
+
}
|
|
400
|
+
$(el).removeAttr("srcset");
|
|
401
|
+
}
|
|
402
|
+
const attr = $(el).attr("src") !== undefined ? "src" : "poster";
|
|
403
|
+
const value = $(el).attr(attr);
|
|
404
|
+
if (!value)
|
|
405
|
+
continue;
|
|
406
|
+
const absolute = normalizeUrl(value, current.url);
|
|
407
|
+
if (!absolute)
|
|
408
|
+
continue;
|
|
409
|
+
try {
|
|
410
|
+
const savedPath = await downloadBinaryAsset(absolute, outDir, options.timeoutMs, options.userAgent, assetMap);
|
|
411
|
+
if (!savedPath) {
|
|
412
|
+
warnings.push(`Failed to fetch media: ${absolute}`);
|
|
413
|
+
continue;
|
|
414
|
+
}
|
|
415
|
+
const localPath = path.join(outDir, savedPath);
|
|
416
|
+
const rel = toRelativeWebPath(pageDir, localPath);
|
|
417
|
+
$(el).attr(attr, rel);
|
|
418
|
+
maybeAddAsset(assets, seenAssetRecords, {
|
|
419
|
+
url: absolute,
|
|
420
|
+
kind: inferBinaryAssetKind(absolute) === "font" ? "font" : "image",
|
|
421
|
+
savedPath
|
|
422
|
+
});
|
|
423
|
+
}
|
|
424
|
+
catch (error) {
|
|
425
|
+
warnings.push(`Failed to fetch media: ${absolute} (${String(error)})`);
|
|
426
|
+
}
|
|
427
|
+
}
|
|
428
|
+
const keepScripts = options.mode !== "clean" || everything;
|
|
429
|
+
if (!keepScripts) {
|
|
430
|
+
$("script").remove();
|
|
431
|
+
$("noscript").remove();
|
|
432
|
+
$("*[data-reactroot], *[data-reactid], *[data-v-app], *[ng-version]").removeAttr("data-reactroot data-reactid data-v-app ng-version");
|
|
433
|
+
}
|
|
434
|
+
else {
|
|
435
|
+
const scripts = $("script[src]").toArray();
|
|
436
|
+
for (const el of scripts) {
|
|
437
|
+
const src = $(el).attr("src");
|
|
438
|
+
if (!src)
|
|
439
|
+
continue;
|
|
440
|
+
const absolute = normalizeUrl(src, current.url);
|
|
441
|
+
if (!absolute)
|
|
442
|
+
continue;
|
|
443
|
+
try {
|
|
444
|
+
const host = new URL(absolute).hostname;
|
|
445
|
+
if (TRACKER_HOST_PATTERNS.some((pattern) => host.includes(pattern))) {
|
|
446
|
+
$(el).remove();
|
|
447
|
+
continue;
|
|
448
|
+
}
|
|
449
|
+
}
|
|
450
|
+
catch {
|
|
451
|
+
// ignore parse failures
|
|
452
|
+
}
|
|
453
|
+
try {
|
|
454
|
+
const rawJs = await downloadTextAsset(absolute, options.timeoutMs, options.userAgent);
|
|
455
|
+
if (!rawJs) {
|
|
456
|
+
warnings.push(`Failed to fetch script: ${absolute}`);
|
|
457
|
+
continue;
|
|
458
|
+
}
|
|
459
|
+
const targetPath = buildAssetFilePath(absolute, outDir, ".js");
|
|
460
|
+
await saveTextFile(targetPath, await formatMaybe(rawJs, "babel"));
|
|
461
|
+
const savedPath = path.relative(outDir, targetPath);
|
|
462
|
+
assetMap.set(absolute, savedPath);
|
|
463
|
+
const localPath = path.join(outDir, savedPath);
|
|
464
|
+
const rel = toRelativeWebPath(pageDir, localPath);
|
|
465
|
+
$(el).attr("src", rel);
|
|
466
|
+
maybeAddAsset(assets, seenAssetRecords, {
|
|
467
|
+
url: absolute,
|
|
468
|
+
kind: "script",
|
|
469
|
+
savedPath
|
|
470
|
+
});
|
|
471
|
+
}
|
|
472
|
+
catch (error) {
|
|
473
|
+
warnings.push(`Failed to fetch script: ${absolute} (${String(error)})`);
|
|
474
|
+
}
|
|
475
|
+
}
|
|
476
|
+
if (everything) {
|
|
477
|
+
let inlineIndex = 0;
|
|
478
|
+
const inlineScripts = $("script:not([src])").toArray();
|
|
479
|
+
for (const el of inlineScripts) {
|
|
480
|
+
const raw = $(el).html();
|
|
481
|
+
if (!raw || !raw.trim())
|
|
482
|
+
continue;
|
|
483
|
+
inlineIndex += 1;
|
|
484
|
+
const targetPath = path.join(outDir, "src", "scripts", `inline-${inlineIndex}.js`);
|
|
485
|
+
await saveTextFile(targetPath, await formatMaybe(raw, "babel"));
|
|
486
|
+
const savedPath = path.relative(outDir, targetPath);
|
|
487
|
+
const rel = toRelativeWebPath(pageDir, targetPath);
|
|
488
|
+
$(el).attr("src", rel);
|
|
489
|
+
$(el).text("");
|
|
490
|
+
maybeAddAsset(assets, seenAssetRecords, {
|
|
491
|
+
url: `${current.url}#inline-script-${inlineIndex}`,
|
|
492
|
+
kind: "script",
|
|
493
|
+
savedPath
|
|
494
|
+
});
|
|
495
|
+
}
|
|
496
|
+
}
|
|
497
|
+
}
|
|
498
|
+
$("a[href]").each((_, el) => {
|
|
499
|
+
const href = $(el).attr("href");
|
|
500
|
+
if (!href)
|
|
501
|
+
return;
|
|
502
|
+
if (href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
|
|
503
|
+
return;
|
|
504
|
+
}
|
|
505
|
+
const absolute = normalizeUrl(href, current.url);
|
|
506
|
+
if (!absolute)
|
|
507
|
+
return;
|
|
508
|
+
const normalized = normalizePageUrl(absolute);
|
|
509
|
+
if (!normalized)
|
|
510
|
+
return;
|
|
511
|
+
if (!isSameOriginHtmlLink(normalized, rootOrigin))
|
|
512
|
+
return;
|
|
513
|
+
const targetPage = getPagePaths(normalized, outDir);
|
|
514
|
+
const targetHtml = targetPage.htmlPath;
|
|
515
|
+
const relHref = toRelativeWebPath(pageDir, targetHtml);
|
|
516
|
+
$(el).attr("href", relHref);
|
|
517
|
+
});
|
|
518
|
+
if (everything) {
|
|
519
|
+
const attrSelectors = [
|
|
520
|
+
{ selector: "img[src]", attr: "src" },
|
|
521
|
+
{ selector: "video[src]", attr: "src" },
|
|
522
|
+
{ selector: "audio[src]", attr: "src" },
|
|
523
|
+
{ selector: "track[src]", attr: "src" },
|
|
524
|
+
{ selector: "iframe[src]", attr: "src" },
|
|
525
|
+
{ selector: "embed[src]", attr: "src" },
|
|
526
|
+
{ selector: "object[data]", attr: "data" },
|
|
527
|
+
{ selector: "input[src]", attr: "src" },
|
|
528
|
+
{ selector: "link[href]", attr: "href" },
|
|
529
|
+
{ selector: "source[src]", attr: "src" },
|
|
530
|
+
{ selector: "image[href]", attr: "href" },
|
|
531
|
+
{ selector: "image[xlink\\:href]", attr: "xlink:href" },
|
|
532
|
+
{ selector: "use[href]", attr: "href" },
|
|
533
|
+
{ selector: "use[xlink\\:href]", attr: "xlink:href" }
|
|
534
|
+
];
|
|
535
|
+
for (const { selector, attr } of attrSelectors) {
|
|
536
|
+
const nodes = $(selector).toArray();
|
|
537
|
+
for (const el of nodes) {
|
|
538
|
+
if (selector === "link[href]" && $(el).attr("rel")?.toLowerCase() === "stylesheet") {
|
|
539
|
+
continue;
|
|
540
|
+
}
|
|
541
|
+
await downloadAndRelinkAttribute($, el, attr, current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings);
|
|
542
|
+
}
|
|
543
|
+
}
|
|
544
|
+
const downloadableAnchors = $("a[href]").toArray();
|
|
545
|
+
for (const el of downloadableAnchors) {
|
|
546
|
+
const href = $(el).attr("href");
|
|
547
|
+
if (!href)
|
|
548
|
+
continue;
|
|
549
|
+
if (href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
|
|
550
|
+
continue;
|
|
551
|
+
}
|
|
552
|
+
const absolute = normalizeUrl(href, current.url);
|
|
553
|
+
if (!absolute)
|
|
554
|
+
continue;
|
|
555
|
+
const normalized = normalizePageUrl(absolute);
|
|
556
|
+
const isInternalHtml = normalized ? isSameOriginHtmlLink(normalized, rootOrigin) : false;
|
|
557
|
+
if (isInternalHtml)
|
|
558
|
+
continue;
|
|
559
|
+
await downloadAndRelinkAttribute($, el, "href", current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings);
|
|
560
|
+
}
|
|
561
|
+
}
|
|
562
|
+
const cssMerged = cssBlocks.join("\n\n");
|
|
563
|
+
const cssUrls = Array.from(cssMerged.matchAll(/url\(\s*(['"]?)([^'")]+)\1\s*\)/g))
|
|
564
|
+
.map((m) => m[2])
|
|
565
|
+
.filter((u) => Boolean(u) && !u.startsWith("data:") && !u.startsWith("#"));
|
|
566
|
+
let rewrittenCss = cssMerged;
|
|
567
|
+
for (const cssUrl of cssUrls) {
|
|
568
|
+
const abs = normalizeUrl(cssUrl, current.url);
|
|
569
|
+
if (!abs)
|
|
570
|
+
continue;
|
|
571
|
+
const savedPath = await downloadBinaryAsset(abs, outDir, options.timeoutMs, options.userAgent, assetMap);
|
|
572
|
+
if (!savedPath)
|
|
573
|
+
continue;
|
|
574
|
+
const localPath = path.join(outDir, savedPath);
|
|
575
|
+
const rel = toRelativeWebPath(cssDir, localPath);
|
|
576
|
+
rewrittenCss = rewrittenCss.split(cssUrl).join(rel);
|
|
577
|
+
maybeAddAsset(assets, seenAssetRecords, {
|
|
578
|
+
url: abs,
|
|
579
|
+
kind: inferBinaryAssetKind(abs) === "font" ? "font" : "image",
|
|
580
|
+
savedPath
|
|
581
|
+
});
|
|
582
|
+
}
|
|
583
|
+
if (strictClean) {
|
|
584
|
+
strictCleanDom($);
|
|
585
|
+
}
|
|
586
|
+
const finalHtml = strictClean ? $.html().replace(/<!--[\s\S]*?-->/g, "") : $.html();
|
|
587
|
+
for (const u of findRemoteUrls(finalHtml)) {
|
|
588
|
+
remoteUrlsRemaining.add(u);
|
|
589
|
+
}
|
|
590
|
+
for (const u of findRemoteUrls(rewrittenCss)) {
|
|
591
|
+
remoteUrlsRemaining.add(u);
|
|
592
|
+
}
|
|
593
|
+
await saveTextFile(pagePaths.htmlPath, await formatMaybe(finalHtml, "html"));
|
|
594
|
+
await saveTextFile(pagePaths.cssPath, await formatMaybe(rewrittenCss, "css"));
|
|
595
|
+
pages.push({
|
|
596
|
+
url: current.url,
|
|
597
|
+
htmlPath: path.relative(outDir, pagePaths.htmlPath),
|
|
598
|
+
cssPath: path.relative(outDir, pagePaths.cssPath)
|
|
599
|
+
});
|
|
600
|
+
}
|
|
601
|
+
const scriptsCount = assets.filter((a) => a.kind === "script").length;
|
|
602
|
+
const stylesheetsCount = assets.filter((a) => a.kind === "stylesheet").length;
|
|
603
|
+
const fontsCount = assets.filter((a) => a.kind === "font").length;
|
|
604
|
+
const imagesCount = assets.filter((a) => a.kind === "image").length;
|
|
605
|
+
const othersCount = assets.filter((a) => a.kind === "other").length;
|
|
606
|
+
const verification = {
|
|
607
|
+
pages: pages.length,
|
|
608
|
+
assets: assets.length,
|
|
609
|
+
scripts: scriptsCount,
|
|
610
|
+
stylesheets: stylesheetsCount,
|
|
611
|
+
fonts: fontsCount,
|
|
612
|
+
images: imagesCount,
|
|
613
|
+
others: othersCount,
|
|
614
|
+
remoteUrlsRemaining: remoteUrlsRemaining.size
|
|
615
|
+
};
|
|
616
|
+
await saveTextFile(path.join(outDir, "manifest.json"), JSON.stringify({
|
|
617
|
+
sourceUrl: options.url,
|
|
618
|
+
mode: options.mode,
|
|
619
|
+
everything,
|
|
620
|
+
strictClean,
|
|
621
|
+
exportedAt: new Date().toISOString(),
|
|
622
|
+
crawlDepth: options.crawlDepth,
|
|
623
|
+
maxPages: options.maxPages,
|
|
624
|
+
pageCount: pages.length,
|
|
625
|
+
pages,
|
|
626
|
+
assetCount: assets.length,
|
|
627
|
+
verification,
|
|
628
|
+
warnings
|
|
629
|
+
}, null, 2));
|
|
630
|
+
const rootPagePaths = getPagePaths(normalizedRootUrl, outDir);
|
|
631
|
+
return {
|
|
632
|
+
url: options.url,
|
|
633
|
+
outDir,
|
|
634
|
+
htmlPath: rootPagePaths.htmlPath,
|
|
635
|
+
cssPath: rootPagePaths.cssPath,
|
|
636
|
+
verification,
|
|
637
|
+
pageCount: pages.length,
|
|
638
|
+
pages,
|
|
639
|
+
assets,
|
|
640
|
+
warnings
|
|
641
|
+
};
|
|
642
|
+
}
|
package/dist/preview.js
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import { readFile } from "node:fs/promises";
|
|
2
|
+
import { createServer } from "node:http";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
// Extension → MIME type table for serving scraped output.
// Unknown extensions fall back to "application/octet-stream".
const CONTENT_TYPES = {
    ".html": "text/html; charset=utf-8",
    ".css": "text/css; charset=utf-8",
    ".js": "application/javascript; charset=utf-8",
    ".mjs": "application/javascript; charset=utf-8",
    ".json": "application/json; charset=utf-8",
    ".svg": "image/svg+xml",
    ".png": "image/png",
    ".jpg": "image/jpeg",
    ".jpeg": "image/jpeg",
    ".gif": "image/gif",
    ".webp": "image/webp",
    ".avif": "image/avif",
    ".ico": "image/x-icon",
    ".woff": "font/woff",
    ".woff2": "font/woff2",
    ".ttf": "font/ttf",
    ".otf": "font/otf",
    // The extractor also downloads <video>/<audio>/<track> sources and
    // anchor-linked documents in "everything" mode; serve those with real
    // types instead of application/octet-stream.
    ".mp4": "video/mp4",
    ".webm": "video/webm",
    ".mp3": "audio/mpeg",
    ".ogg": "audio/ogg",
    ".wav": "audio/wav",
    ".vtt": "text/vtt",
    ".txt": "text/plain; charset=utf-8",
    ".xml": "application/xml; charset=utf-8",
    ".pdf": "application/pdf"
};
/**
 * Resolve the Content-Type header value for a file path by its extension.
 * @param {string} filePath - Path whose (case-insensitive) extension selects the MIME type.
 * @returns {string} MIME type string, or "application/octet-stream" when unknown.
 */
function contentTypeFor(filePath) {
    return CONTENT_TYPES[path.extname(filePath).toLowerCase()] || "application/octet-stream";
}
|
|
24
|
+
/**
 * Read a file and return its contents as a Buffer, or null when the file
 * cannot be read (missing, permission denied, is a directory, ...).
 * @param {string} filePath - Absolute or relative path to read.
 * @returns {Promise<Buffer|null>} File contents, or null on any read error.
 */
function tryRead(filePath) {
    // Swallow the rejection deliberately: callers treat null as "try the
    // next candidate path".
    return readFile(filePath).catch(() => null);
}
|
|
32
|
+
/**
 * Normalize a request pathname into an absolute POSIX path.
 *
 * Percent-decodes the pathname, forces a leading slash, and collapses
 * "."/".." segments via posix normalization. Malformed percent-encoding
 * (e.g. "/%zz") previously made decodeURIComponent throw a URIError inside
 * the request handler; it now falls back to the raw pathname instead.
 * @param {string} pathname - Raw pathname from the request URL.
 * @returns {string} Normalized path that always starts with "/".
 */
function sanitizePathname(pathname) {
    const raw = pathname || "/";
    let decoded;
    try {
        decoded = decodeURIComponent(raw);
    }
    catch {
        // Invalid escape sequence: keep the raw pathname rather than crash.
        decoded = raw;
    }
    const normalized = path.posix.normalize(decoded.startsWith("/") ? decoded : `/${decoded}`);
    return normalized.startsWith("/") ? normalized : `/${normalized}`;
}
|
|
37
|
+
// Map a sanitized request path to an ordered list of candidate files on
// disk. The server tries each candidate in order and serves the first one
// that reads successfully, so the ordering here is the routing contract:
// the "pretty" scraped layout (src/, src/pages/<route>/index.html) is
// preferred, with the literal on-disk path as a fallback.
function resolveFromRoute(rootDir, reqPath) {
    // Literal path under the output root (reqPath always starts with "/").
    const direct = path.join(rootDir, reqPath.slice(1));
    // Site root → the scraped entry page.
    if (reqPath === "/") {
        return [path.join(rootDir, "src", "index.html"), direct];
    }
    // The crawl manifest lives at the output root, not under src/.
    if (reqPath === "/manifest.json") {
        return [path.join(rootDir, "manifest.json"), direct];
    }
    // Asset and source paths are already literal — no remapping needed.
    if (reqPath.startsWith("/assets/") || reqPath.startsWith("/src/")) {
        return [direct];
    }
    // Root stylesheet → the merged CSS for the entry page.
    if (reqPath === "/styles.css") {
        return [path.join(rootDir, "src", "styles.css"), direct];
    }
    // Per-page stylesheet: "/<route>/styles.css" maps to
    // src/pages/<route>/styles.css (empty route falls back to the root CSS).
    if (reqPath.endsWith("/styles.css")) {
        const route = reqPath.slice(1, -"styles.css".length - 1).replace(/\/+$/, "");
        if (!route)
            return [path.join(rootDir, "src", "styles.css"), direct];
        return [path.join(rootDir, "src", "pages", ...route.split("/"), "styles.css"), direct];
    }
    // Explicit .html requests: try the per-page directory layout first,
    // then the same path under src/, then the literal path.
    if (reqPath.endsWith(".html")) {
        if (reqPath === "/index.html")
            return [path.join(rootDir, "src", "index.html"), direct];
        const route = reqPath.slice(1, -".html".length).replace(/\/+$/, "");
        return [
            path.join(rootDir, "src", "pages", ...route.split("/"), "index.html"),
            path.join(rootDir, "src", reqPath.slice(1)),
            direct
        ];
    }
    // Extensionless routes ("/about", "/blog/post/") → the page directory's
    // index.html, mirroring how the scraped site links between pages.
    if (path.extname(reqPath) === "") {
        const route = reqPath.slice(1).replace(/\/+$/, "");
        if (!route)
            return [path.join(rootDir, "src", "index.html"), direct];
        return [path.join(rootDir, "src", "pages", ...route.split("/"), "index.html"), direct];
    }
    // Anything else (other extensions) is served literally.
    return [direct];
}
|
|
75
|
+
/**
 * Start an HTTP server that serves a scraped output directory using the
 * route mapping from resolveFromRoute.
 * @param {{ dir: string, port: number, host?: string }} options
 * @returns {Promise<import("node:http").Server>} The listening server.
 */
export async function startPreviewServer(options) {
    const rootDir = path.resolve(options.dir);
    const handleRequest = async (req, res) => {
        const parsedUrl = new URL(req.url || "/", "http://localhost");
        const reqPath = sanitizePathname(parsedUrl.pathname);
        // Defense in depth: refuse any path that still contains "..".
        if (reqPath.includes("..")) {
            res.statusCode = 400;
            res.end("Bad Request");
            return;
        }
        // Serve the first candidate file that reads successfully.
        for (const candidate of resolveFromRoute(rootDir, reqPath)) {
            const body = await tryRead(candidate);
            if (body === null)
                continue;
            res.statusCode = 200;
            res.setHeader("Content-Type", contentTypeFor(candidate));
            res.end(body);
            return;
        }
        res.statusCode = 404;
        res.setHeader("Content-Type", "text/plain; charset=utf-8");
        res.end("Not found");
    };
    const server = createServer(handleRequest);
    // Resolve once the socket is listening; surface bind errors as rejection.
    await new Promise((resolve, reject) => {
        server.once("error", reject);
        server.listen(options.port, options.host, () => resolve());
    });
    return server;
}
|
package/dist/types.js
ADDED
|
@@ -0,0 +1 @@
|
|
|
1
|
+
// Intentionally empty runtime module: the source types.ts contains only
// type declarations, so nothing exists to export at runtime.
export {};
|
package/package.json
ADDED
|
@@ -0,0 +1,42 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "cleanscrape",
|
|
3
|
+
"version": "0.1.0",
|
|
4
|
+
"description": "Clean frontend extractor: convert live sites into editable HTML/CSS/assets",
|
|
5
|
+
"type": "module",
|
|
6
|
+
"bin": {
|
|
7
|
+
"cleanscrape": "dist/cli.js",
|
|
8
|
+
"scraper": "dist/cli.js",
|
|
9
|
+
"scrapify": "dist/cli.js",
|
|
10
|
+
"better-wget": "dist/cli.js"
|
|
11
|
+
},
|
|
12
|
+
"scripts": {
|
|
13
|
+
"build": "tsc -p tsconfig.json",
|
|
14
|
+
"dev": "tsx src/cli.ts",
|
|
15
|
+
"start": "node dist/cli.js",
|
|
16
|
+
"check": "tsc --noEmit -p tsconfig.json"
|
|
17
|
+
},
|
|
18
|
+
"files": [
|
|
19
|
+
"dist",
|
|
20
|
+
"README.md",
|
|
21
|
+
"LICENSE"
|
|
22
|
+
],
|
|
23
|
+
"keywords": [
|
|
24
|
+
"wget",
|
|
25
|
+
"curl",
|
|
26
|
+
"scraping",
|
|
27
|
+
"frontend",
|
|
28
|
+
"crawler"
|
|
29
|
+
],
|
|
30
|
+
"author": "",
|
|
31
|
+
"license": "MIT",
|
|
32
|
+
"dependencies": {
|
|
33
|
+
"cheerio": "^1.0.0",
|
|
34
|
+
"commander": "^14.0.0",
|
|
35
|
+
"prettier": "^3.6.0"
|
|
36
|
+
},
|
|
37
|
+
"devDependencies": {
|
|
38
|
+
"@types/node": "^24.5.1",
|
|
39
|
+
"tsx": "^4.20.5",
|
|
40
|
+
"typescript": "^5.9.2"
|
|
41
|
+
}
|
|
42
|
+
}
|