cleanscrape 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,90 @@
1
+ # cleanscrape
2
+
3
+ `cleanscrape` is a frontend-first alternative to wget/curl for design and product work.
4
+
5
+ Instead of dumping low-quality scraped output, it crawls a live site and exports:
6
+
7
+ - cleaned, formatted HTML
8
+ - merged, editable CSS
9
+ - downloaded image/font/media assets
10
+ - a machine-readable `manifest.json`
11
+
12
+ ## Why this exists
13
+
14
+ Traditional download tools optimize for raw bytes. This tool optimizes for **clean editable frontend code**.
15
+
16
+ ## Install
17
+
18
+ ```bash
19
+ # global install
20
+ npm install -g cleanscrape
21
+
22
+ # local development
23
+ npm install
24
+ npm run build
25
+ npm link
26
+ ```
27
+
28
+ ## Usage
29
+
30
+ ```bash
31
+ # direct command
32
+ cleanscrape https://example.com -o ./output/example
33
+
34
+ # interactive mode (prompts for URL/mode/etc.)
35
+ cleanscrape
36
+
37
+ # set and reuse a default URL (for example a Vercel URL)
38
+ cleanscrape default https://your-app.vercel.app
39
+
40
+ # run the scraped site locally
41
+ cleanscrape run ./output/example --port 4173
42
+
43
+ # everything mode is now default; disable with --no-everything
44
+ cleanscrape https://example.com -o ./output/example --mode clean
45
+
46
+ # strict clean pass for ultra-editable output
47
+ cleanscrape https://example.com -o ./output/example --strict-clean
48
+
49
+ # save this run as your default template
50
+ cleanscrape https://example.com --save-default
51
+
52
+ # whole-site clean crawl (default): follows internal links and strips scripts/tracker junk
53
+ node dist/cli.js frontend https://example.com -o ./output/example --mode clean
54
+
55
+ # mirror mode: keeps script tags and fetches script files when possible
56
+ node dist/cli.js frontend https://example.com -o ./output/example-mirror --mode mirror
57
+
58
+ # tune crawl scope
59
+ node dist/cli.js frontend https://example.com -o ./output/example --depth 4 --max-pages 250
60
+ ```
61
+
62
+ ## Output structure
63
+
64
+ ```text
65
+ output/example/
66
+ manifest.json
67
+ src/
68
+ index.html
69
+ styles.css
70
+ pages/
71
+ about/
72
+ index.html
73
+ styles.css
74
+ pricing/
75
+ index.html
76
+ styles.css
77
+ assets/
78
+ <hostname>/...
79
+ ```
80
+
81
+ CLI prints a verification report after each scrape:
82
+ - pages/assets/scripts/styles/fonts/images/others counts
83
+ - `remote_urls_remaining` so you can quickly see if anything external is still referenced
84
+
85
+ ## Roadmap
86
+
87
+ - component inference (`Hero`, `Navbar`, `Footer`) into framework templates
88
+ - CSS deduplication and naming normalization
89
+ - JS de-minification and source-map aware rewriting
90
+ - route graph export for the existing multi-page crawl
package/dist/cli.js ADDED
@@ -0,0 +1,326 @@
1
+ #!/usr/bin/env node
2
+ import path from "node:path";
3
+ import { fileURLToPath } from "node:url";
4
+ import os from "node:os";
5
+ import { mkdir, readFile, writeFile } from "node:fs/promises";
6
+ import { createInterface } from "node:readline/promises";
7
+ import { stdin as input, stdout as output } from "node:process";
8
+ import { Command } from "commander";
9
+ import { extractFrontend } from "./extractor.js";
10
+ import { startPreviewServer } from "./preview.js";
11
+ const CONFIG_PATH = path.join(os.homedir(), ".cleanscrape", "config.json");
12
+ async function readConfig() {
13
+ try {
14
+ const raw = await readFile(CONFIG_PATH, "utf8");
15
+ return JSON.parse(raw);
16
+ }
17
+ catch {
18
+ return {};
19
+ }
20
+ }
21
+ async function writeConfig(config) {
22
+ await mkdir(path.dirname(CONFIG_PATH), { recursive: true });
23
+ await writeFile(CONFIG_PATH, JSON.stringify(config, null, 2) + "\n", "utf8");
24
+ }
25
+ async function setDefaultUrl(url) {
26
+ const config = await readConfig();
27
+ config.defaultUrl = url;
28
+ await writeConfig(config);
29
+ }
30
+ async function askInteractive(promptLabel, fallback = "") {
31
+ const rl = createInterface({ input, output });
32
+ try {
33
+ const suffix = fallback ? ` [${fallback}]` : "";
34
+ const answer = (await rl.question(`${promptLabel}${suffix}: `)).trim();
35
+ return answer || fallback;
36
+ }
37
+ finally {
38
+ rl.close();
39
+ }
40
+ }
41
+ async function askYesNo(promptLabel, fallback) {
42
+ const defaultText = fallback ? "Y/n" : "y/N";
43
+ const answer = (await askInteractive(`${promptLabel} (${defaultText})`, "")).toLowerCase();
44
+ if (!answer)
45
+ return fallback;
46
+ return answer === "y" || answer === "yes";
47
+ }
48
+ async function resolveInteractiveRun(urlArg, opts, config) {
49
+ const modeDefault = String(opts.mode || config.mode || "clean");
50
+ const outDefault = String(opts.out || config.out || "./output");
51
+ const timeoutDefault = String(opts.timeout || config.timeout || "60000");
52
+ const depthDefault = String(opts.depth || config.depth || "3");
53
+ const maxPagesDefault = String(opts.maxPages || config.maxPages || "100");
54
+ const everythingDefault = typeof opts.everything === "boolean" ? opts.everything : typeof config.everything === "boolean" ? config.everything : true;
55
+ const strictDefault = typeof opts.strictClean === "boolean" ? opts.strictClean : typeof config.strictClean === "boolean" ? config.strictClean : false;
56
+ if (urlArg) {
57
+ return {
58
+ url: urlArg,
59
+ opts: {
60
+ ...opts,
61
+ mode: String(opts.mode || config.mode || "clean"),
62
+ out: String(opts.out || config.out || "./output"),
63
+ timeout: String(opts.timeout || config.timeout || "60000"),
64
+ depth: String(opts.depth || config.depth || "3"),
65
+ maxPages: String(opts.maxPages || config.maxPages || "100"),
66
+ everything: typeof opts.everything === "boolean" ? opts.everything : config.everything ?? true,
67
+ strictClean: typeof opts.strictClean === "boolean" ? opts.strictClean : config.strictClean ?? false
68
+ }
69
+ };
70
+ }
71
+ const url = await askInteractive("Website URL (example: https://your-app.vercel.app)", config.defaultUrl || "");
72
+ if (!url) {
73
+ console.error("No URL provided.");
74
+ process.exit(1);
75
+ }
76
+ let mode = modeDefault;
77
+ while (mode !== "clean" && mode !== "mirror") {
78
+ mode = await askInteractive("Mode (clean|mirror)", "clean");
79
+ }
80
+ const out = await askInteractive("Output directory", outDefault);
81
+ const timeout = await askInteractive("Timeout (ms)", timeoutDefault);
82
+ const depth = await askInteractive("Crawl depth", depthDefault);
83
+ const maxPages = await askInteractive("Max pages", maxPagesDefault);
84
+ const everything = await askYesNo("Capture everything", everythingDefault);
85
+ const strictClean = await askYesNo("Enable strict clean", strictDefault);
86
+ const saveDefault = await askYesNo("Save these as defaults", false);
87
+ const mergedOpts = {
88
+ ...opts,
89
+ mode,
90
+ out,
91
+ timeout,
92
+ depth,
93
+ maxPages,
94
+ everything,
95
+ strictClean,
96
+ saveDefault
97
+ };
98
+ return { url, opts: mergedOpts };
99
+ }
100
+ async function runExtraction(url, opts) {
101
+ const mode = String(opts.mode);
102
+ if (mode !== "clean" && mode !== "mirror") {
103
+ console.error(`Invalid mode: ${opts.mode}. Use clean or mirror.`);
104
+ process.exit(1);
105
+ }
106
+ const outDir = path.resolve(String(opts.out));
107
+ const timeoutMs = Number(opts.timeout);
108
+ if (!Number.isFinite(timeoutMs) || timeoutMs < 1000) {
109
+ console.error(`Invalid timeout: ${opts.timeout}`);
110
+ process.exit(1);
111
+ }
112
+ const crawlDepth = Number(opts.depth);
113
+ if (!Number.isFinite(crawlDepth) || crawlDepth < 0) {
114
+ console.error(`Invalid depth: ${opts.depth}`);
115
+ process.exit(1);
116
+ }
117
+ const maxPages = Number(opts.maxPages);
118
+ if (!Number.isFinite(maxPages) || maxPages < 1) {
119
+ console.error(`Invalid max-pages: ${opts.maxPages}`);
120
+ process.exit(1);
121
+ }
122
+ console.log(`Extracting ${url}`);
123
+ console.log(`Mode: ${mode}`);
124
+ console.log(`Everything: ${opts.everything !== false ? "yes" : "no"}`);
125
+ console.log(`Strict clean: ${opts.strictClean ? "yes" : "no"}`);
126
+ console.log(`Depth: ${crawlDepth}`);
127
+ console.log(`Max pages: ${maxPages}`);
128
+ console.log(`Output: ${outDir}`);
129
+ try {
130
+ const summary = await extractFrontend({
131
+ url,
132
+ outDir,
133
+ mode,
134
+ everything: opts.everything !== false,
135
+ strictClean: Boolean(opts.strictClean),
136
+ timeoutMs,
137
+ crawlDepth,
138
+ maxPages,
139
+ userAgent: opts.userAgent ? String(opts.userAgent) : undefined
140
+ });
141
+ console.log("\nDone.");
142
+ console.log(`Root HTML: ${summary.htmlPath}`);
143
+ if (summary.cssPath) {
144
+ console.log(`Root CSS: ${summary.cssPath}`);
145
+ }
146
+ console.log(`Pages exported: ${summary.pageCount}`);
147
+ console.log(`Assets downloaded: ${summary.assets.length}`);
148
+ console.log(`Verify: pages=${summary.verification.pages} assets=${summary.verification.assets} scripts=${summary.verification.scripts} styles=${summary.verification.stylesheets} fonts=${summary.verification.fonts} images=${summary.verification.images} others=${summary.verification.others} remote_urls_remaining=${summary.verification.remoteUrlsRemaining}`);
149
+ if (summary.warnings.length > 0) {
150
+ console.log("\nWarnings:");
151
+ for (const warning of summary.warnings) {
152
+ console.log(`- ${warning}`);
153
+ }
154
+ }
155
+ if (opts.saveDefault) {
156
+ await writeConfig({
157
+ defaultUrl: url,
158
+ mode,
159
+ out: opts.out,
160
+ everything: opts.everything !== false,
161
+ strictClean: Boolean(opts.strictClean),
162
+ timeout: opts.timeout,
163
+ depth: opts.depth,
164
+ maxPages: opts.maxPages
165
+ });
166
+ console.log(`Saved defaults to ${CONFIG_PATH}`);
167
+ }
168
+ console.log(`\nPreview with: cleanscrape run ${outDir}`);
169
+ }
170
+ catch (error) {
171
+ console.error("Extraction failed:");
172
+ console.error(error);
173
+ process.exit(1);
174
+ }
175
+ }
176
+ async function runPreview(dirArg, opts) {
177
+ const dir = path.resolve(dirArg || "./output");
178
+ const port = Number(opts.port);
179
+ if (!Number.isFinite(port) || port < 1 || port > 65535) {
180
+ console.error(`Invalid port: ${opts.port}`);
181
+ process.exit(1);
182
+ }
183
+ const host = String(opts.host || "127.0.0.1");
184
+ const server = await startPreviewServer({ dir, port, host });
185
+ console.log(`Serving scraped site from ${dir}`);
186
+ console.log(`Open: http://${host}:${port}`);
187
+ console.log("Press Ctrl+C to stop.");
188
+ const stop = () => {
189
+ server.close(() => process.exit(0));
190
+ };
191
+ process.on("SIGINT", stop);
192
+ process.on("SIGTERM", stop);
193
+ }
194
+ const defaultOptions = {
195
+ out: "./output",
196
+ mode: "clean",
197
+ everything: true,
198
+ timeout: "60000",
199
+ depth: "3",
200
+ maxPages: "100"
201
+ };
202
+ const invokedAs = path.basename(fileURLToPath(import.meta.url));
203
+ const argv0 = path.basename(process.argv[1] || "");
204
+ const cliAliases = new Set(["scrapify", "scraper", "cleanscrape"]);
205
+ const activeCliName = cliAliases.has(argv0) ? argv0 : "cleanscrape";
206
+ const isScrapify = cliAliases.has(argv0) || cliAliases.has(invokedAs.replace(/\.js$/, ""));
207
+ if (isScrapify) {
208
+ if (process.argv[2] === "default") {
209
+ const defaultProgram = new Command();
210
+ defaultProgram
211
+ .name(`${activeCliName} default`)
212
+ .description("Set default URL used by interactive mode")
213
+ .argument("[url]", "Default website URL")
214
+ .action(async (url) => {
215
+ const resolved = typeof url === "string" && url.trim() ? url.trim() : await askInteractive("Default URL");
216
+ if (!resolved) {
217
+ console.error("No default URL provided.");
218
+ process.exit(1);
219
+ }
220
+ await setDefaultUrl(resolved);
221
+ console.log(`Default URL set: ${resolved}`);
222
+ console.log(`Config: ${CONFIG_PATH}`);
223
+ });
224
+ defaultProgram.parseAsync(["node", activeCliName, ...process.argv.slice(3)]).catch((err) => {
225
+ console.error(err);
226
+ process.exit(1);
227
+ });
228
+ }
229
+ else if (process.argv[2] === "help") {
230
+ const helpProgram = new Command();
231
+ helpProgram
232
+ .name(activeCliName)
233
+ .description("Scrape a site into clean editable frontend code")
234
+ .argument("[url]", "Website URL")
235
+ .option("-o, --out <dir>", "Output directory", defaultOptions.out)
236
+ .option("-m, --mode <mode>", "Export mode: clean or mirror", defaultOptions.mode)
237
+ .option("--everything", "Capture all discoverable assets/code and keep scripts")
238
+ .option("--no-everything", "Disable full capture mode")
239
+ .option("--strict-clean", "Aggressively strip noisy attributes/comments for cleaner manual edits")
240
+ .option("--save-default", "Save this run's URL/options as defaults")
241
+ .option("-t, --timeout <ms>", "Timeout in milliseconds", defaultOptions.timeout)
242
+ .option("-d, --depth <n>", "Internal link crawl depth", defaultOptions.depth)
243
+ .option("--max-pages <n>", "Maximum pages to crawl", defaultOptions.maxPages)
244
+ .option("--user-agent <ua>", "Custom user agent");
245
+ helpProgram.outputHelp();
246
+ process.exit(0);
247
+ }
248
+ else if (process.argv[2] === "run") {
249
+ const runProgram = new Command();
250
+ runProgram
251
+ .name(`${activeCliName} run`)
252
+ .description("Serve a scraped output folder locally")
253
+ .argument("[dir]", "Scraped output directory", "./output")
254
+ .option("-p, --port <port>", "Port", "4173")
255
+ .option("--host <host>", "Host", "127.0.0.1")
256
+ .action(async (dir, opts) => {
257
+ await runPreview(typeof dir === "string" ? dir : undefined, opts);
258
+ });
259
+ runProgram.parseAsync(["node", activeCliName, ...process.argv.slice(3)]).catch((err) => {
260
+ console.error(err);
261
+ process.exit(1);
262
+ });
263
+ }
264
+ else {
265
+ const program = new Command();
266
+ program
267
+ .name(activeCliName)
268
+ .description("Scrape a site into clean editable frontend code")
269
+ .argument("[url]", "Website URL")
270
+ .option("-o, --out <dir>", "Output directory", defaultOptions.out)
271
+ .option("-m, --mode <mode>", "Export mode: clean or mirror", defaultOptions.mode)
272
+ .option("--everything", "Capture all discoverable assets/code and keep scripts")
273
+ .option("--no-everything", "Disable full capture mode")
274
+ .option("--strict-clean", "Aggressively strip noisy attributes/comments for cleaner manual edits")
275
+ .option("--save-default", "Save this run's URL/options as defaults")
276
+ .option("-t, --timeout <ms>", "Timeout in milliseconds", defaultOptions.timeout)
277
+ .option("-d, --depth <n>", "Internal link crawl depth", defaultOptions.depth)
278
+ .option("--max-pages <n>", "Maximum pages to crawl", defaultOptions.maxPages)
279
+ .option("--user-agent <ua>", "Custom user agent")
280
+ .action(async (url, opts) => {
281
+ const config = await readConfig();
282
+ const resolved = await resolveInteractiveRun(typeof url === "string" && url.trim() ? url.trim() : undefined, opts, config);
283
+ await runExtraction(resolved.url, resolved.opts);
284
+ });
285
+ program.parseAsync(process.argv).catch((err) => {
286
+ console.error(err);
287
+ process.exit(1);
288
+ });
289
+ }
290
+ }
291
+ else {
292
+ const program = new Command();
293
+ program
294
+ .name("better-wget")
295
+ .description("Capture clean, editable frontend code from a live website")
296
+ .version("0.1.0");
297
+ program
298
+ .command("frontend")
299
+ .description("Export a frontend snapshot as clean HTML/CSS/assets")
300
+ .argument("<url>", "Website URL")
301
+ .option("-o, --out <dir>", "Output directory", defaultOptions.out)
302
+ .option("-m, --mode <mode>", "Export mode: clean or mirror", defaultOptions.mode)
303
+ .option("--everything", "Capture all discoverable assets/code and keep scripts")
304
+ .option("--no-everything", "Disable full capture mode")
305
+ .option("--strict-clean", "Aggressively strip noisy attributes/comments for cleaner manual edits")
306
+ .option("-t, --timeout <ms>", "Timeout in milliseconds", defaultOptions.timeout)
307
+ .option("-d, --depth <n>", "Internal link crawl depth", defaultOptions.depth)
308
+ .option("--max-pages <n>", "Maximum pages to crawl", defaultOptions.maxPages)
309
+ .option("--user-agent <ua>", "Custom user agent")
310
+ .action(async (url, opts) => {
311
+ await runExtraction(url, opts);
312
+ });
313
+ program
314
+ .command("run")
315
+ .description("Serve a scraped output folder locally")
316
+ .argument("[dir]", "Scraped output directory", "./output")
317
+ .option("-p, --port <port>", "Port", "4173")
318
+ .option("--host <host>", "Host", "127.0.0.1")
319
+ .action(async (dir, opts) => {
320
+ await runPreview(typeof dir === "string" ? dir : undefined, opts);
321
+ });
322
+ program.parseAsync(process.argv).catch((err) => {
323
+ console.error(err);
324
+ process.exit(1);
325
+ });
326
+ }
package/dist/extractor.js ADDED
@@ -0,0 +1,642 @@
1
+ import { mkdir, writeFile } from "node:fs/promises";
2
+ import path from "node:path";
3
+ import { URL } from "node:url";
4
+ import { load } from "cheerio";
5
+ import prettier from "prettier";
6
+ const TRACKER_HOST_PATTERNS = [
7
+ "google-analytics.com",
8
+ "googletagmanager.com",
9
+ "doubleclick.net",
10
+ "segment.com",
11
+ "mixpanel.com",
12
+ "hotjar.com",
13
+ "sentry.io"
14
+ ];
15
+ const NON_HTML_EXTENSIONS = new Set([
16
+ ".png",
17
+ ".jpg",
18
+ ".jpeg",
19
+ ".gif",
20
+ ".webp",
21
+ ".svg",
22
+ ".ico",
23
+ ".avif",
24
+ ".pdf",
25
+ ".zip",
26
+ ".gz",
27
+ ".rar",
28
+ ".7z",
29
+ ".mp4",
30
+ ".mov",
31
+ ".webm",
32
+ ".mp3",
33
+ ".wav",
34
+ ".woff",
35
+ ".woff2",
36
+ ".ttf",
37
+ ".otf",
38
+ ".css",
39
+ ".js",
40
+ ".mjs",
41
+ ".json",
42
+ ".xml"
43
+ ]);
44
+ function normalizeUrl(raw, baseUrl) {
45
+ try {
46
+ return new URL(raw, baseUrl).toString();
47
+ }
48
+ catch {
49
+ return null;
50
+ }
51
+ }
52
+ function normalizePageUrl(raw) {
53
+ try {
54
+ const u = new URL(raw);
55
+ u.hash = "";
56
+ u.search = "";
57
+ if (u.pathname.length > 1 && u.pathname.endsWith("/")) {
58
+ u.pathname = u.pathname.slice(0, -1);
59
+ }
60
+ return u.toString();
61
+ }
62
+ catch {
63
+ return null;
64
+ }
65
+ }
66
+ function toSafeFileName(input) {
67
+ return input.replace(/[^a-zA-Z0-9._-]+/g, "-").slice(0, 180) || "asset";
68
+ }
69
+ async function formatMaybe(content, parser) {
70
+ try {
71
+ return await prettier.format(content, { parser });
72
+ }
73
+ catch {
74
+ return content;
75
+ }
76
+ }
77
+ async function saveTextFile(filePath, content) {
78
+ await mkdir(path.dirname(filePath), { recursive: true });
79
+ await writeFile(filePath, content, "utf8");
80
+ }
81
+ function buildAssetFilePath(assetUrl, outputRoot, fallbackExt = "") {
82
+ const urlObj = new URL(assetUrl);
83
+ const ext = path.extname(urlObj.pathname) || fallbackExt;
84
+ const base = toSafeFileName(path.basename(urlObj.pathname, ext) || "asset");
85
+ const host = toSafeFileName(urlObj.hostname);
86
+ const dir = path.join(outputRoot, "assets", host);
87
+ return path.join(dir, `${base}${ext}`);
88
+ }
89
+ function inferBinaryAssetKind(assetUrl) {
90
+ const pathname = new URL(assetUrl).pathname.toLowerCase();
91
+ if (/\.(png|jpe?g|gif|webp|svg|ico|avif)$/.test(pathname))
92
+ return "image";
93
+ if (/\.(woff2?|ttf|otf|eot)$/.test(pathname))
94
+ return "font";
95
+ return "other";
96
+ }
97
+ function inferAssetKind(assetUrl) {
98
+ const pathname = new URL(assetUrl).pathname.toLowerCase();
99
+ if (pathname.endsWith(".css"))
100
+ return "stylesheet";
101
+ if (pathname.endsWith(".js") || pathname.endsWith(".mjs"))
102
+ return "script";
103
+ const binary = inferBinaryAssetKind(assetUrl);
104
+ if (binary === "image")
105
+ return "image";
106
+ if (binary === "font")
107
+ return "font";
108
+ return "other";
109
+ }
110
+ function isTrackerUrl(assetUrl) {
111
+ try {
112
+ const host = new URL(assetUrl).hostname.toLowerCase();
113
+ return TRACKER_HOST_PATTERNS.some((pattern) => host.includes(pattern));
114
+ }
115
+ catch {
116
+ return false;
117
+ }
118
+ }
119
+ function rewriteCssUrls(cssText, replacer) {
120
+ return cssText.replace(/url\(\s*(['"]?)([^'")]+)\1\s*\)/g, (full, quote, rawUrl) => {
121
+ const trimmed = rawUrl.trim();
122
+ if (!trimmed || trimmed.startsWith("data:") || trimmed.startsWith("#"))
123
+ return full;
124
+ const replaced = replacer(trimmed);
125
+ const q = quote || '"';
126
+ return `url(${q}${replaced}${q})`;
127
+ });
128
+ }
129
+ function getPagePaths(pageUrl, outDir) {
130
+ const u = new URL(pageUrl);
131
+ const segments = u.pathname
132
+ .split("/")
133
+ .filter(Boolean)
134
+ .map((s) => toSafeFileName(s));
135
+ if (segments.length === 0) {
136
+ return {
137
+ routeKey: "/",
138
+ htmlPath: path.join(outDir, "src", "index.html"),
139
+ cssPath: path.join(outDir, "src", "styles.css")
140
+ };
141
+ }
142
+ const dir = path.join(outDir, "src", "pages", ...segments);
143
+ return {
144
+ routeKey: `/${segments.join("/")}`,
145
+ htmlPath: path.join(dir, "index.html"),
146
+ cssPath: path.join(dir, "styles.css")
147
+ };
148
+ }
149
+ async function fetchWithTimeout(url, timeoutMs, userAgent) {
150
+ const controller = new AbortController();
151
+ const timeout = setTimeout(() => controller.abort(), timeoutMs);
152
+ try {
153
+ return await fetch(url, {
154
+ signal: controller.signal,
155
+ headers: {
156
+ "user-agent": userAgent ||
157
+ "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36"
158
+ }
159
+ });
160
+ }
161
+ finally {
162
+ clearTimeout(timeout);
163
+ }
164
+ }
165
+ async function downloadTextAsset(assetUrl, timeoutMs, userAgent) {
166
+ const res = await fetchWithTimeout(assetUrl, timeoutMs, userAgent);
167
+ if (!res.ok)
168
+ return null;
169
+ return await res.text();
170
+ }
171
+ async function downloadBinaryAsset(assetUrl, outDir, timeoutMs, userAgent, assetMap) {
172
+ const existingPath = assetMap.get(assetUrl);
173
+ if (existingPath)
174
+ return existingPath;
175
+ const res = await fetchWithTimeout(assetUrl, timeoutMs, userAgent);
176
+ if (!res.ok)
177
+ return null;
178
+ const bytes = Buffer.from(await res.arrayBuffer());
179
+ const targetPath = buildAssetFilePath(assetUrl, outDir);
180
+ await mkdir(path.dirname(targetPath), { recursive: true });
181
+ await writeFile(targetPath, bytes);
182
+ const savedPath = path.relative(outDir, targetPath);
183
+ assetMap.set(assetUrl, savedPath);
184
+ return savedPath;
185
+ }
186
+ function maybeAddAsset(assets, seen, record) {
187
+ const key = `${record.kind}:${record.savedPath}`;
188
+ if (seen.has(key))
189
+ return;
190
+ seen.add(key);
191
+ assets.push(record);
192
+ }
193
+ function isSameOriginHtmlLink(candidateUrl, rootOrigin) {
194
+ let parsed;
195
+ try {
196
+ parsed = new URL(candidateUrl);
197
+ }
198
+ catch {
199
+ return false;
200
+ }
201
+ if (parsed.origin !== rootOrigin)
202
+ return false;
203
+ if (parsed.protocol !== "http:" && parsed.protocol !== "https:")
204
+ return false;
205
+ const ext = path.extname(parsed.pathname.toLowerCase());
206
+ if (ext && NON_HTML_EXTENSIONS.has(ext))
207
+ return false;
208
+ return true;
209
+ }
210
+ function toRelativeWebPath(fromDir, toPath) {
211
+ const rel = path.relative(fromDir, toPath).split(path.sep).join("/");
212
+ if (rel === "")
213
+ return "./";
214
+ return rel.startsWith(".") ? rel : `./${rel}`;
215
+ }
216
+ function strictCleanDom($) {
217
+ $("*").each((_, el) => {
218
+ const attrs = Object.keys(el.attribs || {});
219
+ for (const attrName of attrs) {
220
+ const lower = attrName.toLowerCase();
221
+ if (lower.startsWith("data-") || lower.startsWith("on")) {
222
+ $(el).removeAttr(attrName);
223
+ continue;
224
+ }
225
+ if (lower === "nonce" ||
226
+ lower === "integrity" ||
227
+ lower === "crossorigin" ||
228
+ lower === "fetchpriority" ||
229
+ lower === "referrerpolicy") {
230
+ $(el).removeAttr(attrName);
231
+ }
232
+ }
233
+ });
234
+ }
235
+ function findRemoteUrls(content) {
236
+ const matches = content.match(/(?:https?:)?\/\/[^\s"'()<>]+/g) || [];
237
+ return matches.map((u) => u.trim()).filter(Boolean);
238
+ }
239
+ async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings) {
240
+ const value = $(el).attr(attr);
241
+ if (!value)
242
+ return;
243
+ if (value.startsWith("./") ||
244
+ value.startsWith("../") ||
245
+ value.startsWith("assets/") ||
246
+ value.startsWith("/assets/") ||
247
+ value.startsWith("src/") ||
248
+ value.startsWith("/src/")) {
249
+ return;
250
+ }
251
+ const absolute = normalizeUrl(value, baseUrl);
252
+ if (!absolute)
253
+ return;
254
+ if (isTrackerUrl(absolute))
255
+ return;
256
+ const savedPath = await downloadBinaryAsset(absolute, outDir, options.timeoutMs, options.userAgent, assetMap);
257
+ if (!savedPath) {
258
+ warnings.push(`Failed to fetch asset: ${absolute}`);
259
+ return;
260
+ }
261
+ const localPath = path.join(outDir, savedPath);
262
+ const rel = toRelativeWebPath(pageDir, localPath);
263
+ $(el).attr(attr, rel);
264
+ maybeAddAsset(assets, seenAssetRecords, {
265
+ url: absolute,
266
+ kind: inferAssetKind(absolute),
267
+ savedPath
268
+ });
269
+ }
270
+ export async function extractFrontend(options) {
271
+ const everything = options.everything === true;
272
+ const strictClean = options.strictClean === true;
273
+ const outDir = path.resolve(options.outDir);
274
+ await mkdir(path.join(outDir, "src"), { recursive: true });
275
+ const normalizedRootUrl = normalizePageUrl(options.url);
276
+ if (!normalizedRootUrl) {
277
+ throw new Error(`Invalid URL: ${options.url}`);
278
+ }
279
+ const rootOrigin = new URL(normalizedRootUrl).origin;
280
+ const queue = [{ url: normalizedRootUrl, depth: 0 }];
281
+ const visited = new Set();
282
+ const assets = [];
283
+ const warnings = [];
284
+ const pages = [];
285
+ const remoteUrlsRemaining = new Set();
286
+ const assetMap = new Map();
287
+ const seenAssetRecords = new Set();
288
+ while (queue.length > 0 && visited.size < options.maxPages) {
289
+ const current = queue.shift();
290
+ if (!current)
291
+ break;
292
+ if (visited.has(current.url))
293
+ continue;
294
+ visited.add(current.url);
295
+ let htmlRes;
296
+ try {
297
+ htmlRes = await fetchWithTimeout(current.url, options.timeoutMs, options.userAgent);
298
+ }
299
+ catch (error) {
300
+ warnings.push(`Failed to fetch page: ${current.url} (${String(error)})`);
301
+ continue;
302
+ }
303
+ if (!htmlRes.ok) {
304
+ warnings.push(`Failed to fetch page: ${current.url} (HTTP ${htmlRes.status})`);
305
+ continue;
306
+ }
307
+ const contentType = htmlRes.headers.get("content-type") || "";
308
+ if (!contentType.includes("text/html")) {
309
+ warnings.push(`Skipped non-HTML page: ${current.url} (${contentType || "unknown content-type"})`);
310
+ continue;
311
+ }
312
+ const domHtml = await htmlRes.text();
313
+ const $ = load(domHtml);
314
+ if (current.depth < options.crawlDepth) {
315
+ const discovered = new Set();
316
+ $("a[href]").each((_, el) => {
317
+ const href = $(el).attr("href");
318
+ if (!href)
319
+ return;
320
+ if (href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
321
+ return;
322
+ }
323
+ const absolute = normalizeUrl(href, current.url);
324
+ if (!absolute)
325
+ return;
326
+ const normalized = normalizePageUrl(absolute);
327
+ if (!normalized)
328
+ return;
329
+ if (!isSameOriginHtmlLink(normalized, rootOrigin))
330
+ return;
331
+ if (visited.has(normalized) || discovered.has(normalized))
332
+ return;
333
+ discovered.add(normalized);
334
+ queue.push({ url: normalized, depth: current.depth + 1 });
335
+ });
336
+ }
337
+ const pagePaths = getPagePaths(current.url, outDir);
338
+ const pageDir = path.dirname(pagePaths.htmlPath);
339
+ const cssDir = path.dirname(pagePaths.cssPath);
340
+ const cssBlocks = [];
341
+ $("style").each((_, el) => {
342
+ const text = $(el).html();
343
+ if (text?.trim())
344
+ cssBlocks.push(text.trim());
345
+ });
346
+ const stylesheetLinks = $("link[rel='stylesheet']").toArray();
347
+ for (const el of stylesheetLinks) {
348
+ const href = $(el).attr("href");
349
+ if (!href)
350
+ continue;
351
+ const absolute = normalizeUrl(href, current.url);
352
+ if (!absolute)
353
+ continue;
354
+ try {
355
+ const rawCss = await downloadTextAsset(absolute, options.timeoutMs, options.userAgent);
356
+ if (!rawCss) {
357
+ warnings.push(`Failed to fetch stylesheet: ${absolute}`);
358
+ continue;
359
+ }
360
+ let cleanedCss = rawCss;
361
+ const importRegex = /@import\s+(?:url\()?['"]?([^'")]+)['"]?\)?\s*;/g;
362
+ const importUrls = Array.from(cleanedCss.matchAll(importRegex)).map((m) => m[1]).filter(Boolean);
363
+ for (const importUrl of importUrls) {
364
+ const importAbs = normalizeUrl(importUrl, absolute);
365
+ if (!importAbs)
366
+ continue;
367
+ const importCss = await downloadTextAsset(importAbs, options.timeoutMs, options.userAgent);
368
+ if (!importCss)
369
+ continue;
370
+ cleanedCss = cleanedCss.replace(`@import url(${importUrl});`, "");
371
+ cleanedCss += `\n\n/* inlined import: ${importAbs} */\n${importCss}`;
372
+ }
373
+ cleanedCss = rewriteCssUrls(cleanedCss, (rawUrl) => {
374
+ const abs = normalizeUrl(rawUrl, absolute);
375
+ if (!abs)
376
+ return rawUrl;
377
+ return abs;
378
+ });
379
+ cssBlocks.push(cleanedCss);
380
+ }
381
+ catch (error) {
382
+ warnings.push(`Failed to fetch stylesheet: ${absolute} (${String(error)})`);
383
+ }
384
+ }
385
+ $("style").remove();
386
+ $("link[rel='stylesheet']").remove();
387
+ const cssHref = toRelativeWebPath(pageDir, pagePaths.cssPath);
388
+ $("head").append(`<link rel="stylesheet" href="${cssHref}">`);
389
+ const mediaNodes = $("img[src], source[src], source[srcset], video[poster]").toArray();
390
+ for (const el of mediaNodes) {
391
+ if ($(el).attr("srcset") !== undefined) {
392
+ const srcsetValue = $(el).attr("srcset") || "";
393
+ const first = srcsetValue
394
+ .split(",")
395
+ .map((item) => item.trim().split(/\s+/)[0])
396
+ .find(Boolean);
397
+ if (first) {
398
+ $(el).attr("src", first);
399
+ }
400
+ $(el).removeAttr("srcset");
401
+ }
402
+ const attr = $(el).attr("src") !== undefined ? "src" : "poster";
403
+ const value = $(el).attr(attr);
404
+ if (!value)
405
+ continue;
406
+ const absolute = normalizeUrl(value, current.url);
407
+ if (!absolute)
408
+ continue;
409
+ try {
410
+ const savedPath = await downloadBinaryAsset(absolute, outDir, options.timeoutMs, options.userAgent, assetMap);
411
+ if (!savedPath) {
412
+ warnings.push(`Failed to fetch media: ${absolute}`);
413
+ continue;
414
+ }
415
+ const localPath = path.join(outDir, savedPath);
416
+ const rel = toRelativeWebPath(pageDir, localPath);
417
+ $(el).attr(attr, rel);
418
+ maybeAddAsset(assets, seenAssetRecords, {
419
+ url: absolute,
420
+ kind: inferBinaryAssetKind(absolute) === "font" ? "font" : "image",
421
+ savedPath
422
+ });
423
+ }
424
+ catch (error) {
425
+ warnings.push(`Failed to fetch media: ${absolute} (${String(error)})`);
426
+ }
427
+ }
428
+ const keepScripts = options.mode !== "clean" || everything;
429
+ if (!keepScripts) {
430
+ $("script").remove();
431
+ $("noscript").remove();
432
+ $("*[data-reactroot], *[data-reactid], *[data-v-app], *[ng-version]").removeAttr("data-reactroot data-reactid data-v-app ng-version");
433
+ }
434
+ else {
435
+ const scripts = $("script[src]").toArray();
436
+ for (const el of scripts) {
437
+ const src = $(el).attr("src");
438
+ if (!src)
439
+ continue;
440
+ const absolute = normalizeUrl(src, current.url);
441
+ if (!absolute)
442
+ continue;
443
+ try {
444
+ const host = new URL(absolute).hostname;
445
+ if (TRACKER_HOST_PATTERNS.some((pattern) => host.includes(pattern))) {
446
+ $(el).remove();
447
+ continue;
448
+ }
449
+ }
450
+ catch {
451
+ // ignore parse failures
452
+ }
453
+ try {
454
+ const rawJs = await downloadTextAsset(absolute, options.timeoutMs, options.userAgent);
455
+ if (!rawJs) {
456
+ warnings.push(`Failed to fetch script: ${absolute}`);
457
+ continue;
458
+ }
459
+ const targetPath = buildAssetFilePath(absolute, outDir, ".js");
460
+ await saveTextFile(targetPath, await formatMaybe(rawJs, "babel"));
461
+ const savedPath = path.relative(outDir, targetPath);
462
+ assetMap.set(absolute, savedPath);
463
+ const localPath = path.join(outDir, savedPath);
464
+ const rel = toRelativeWebPath(pageDir, localPath);
465
+ $(el).attr("src", rel);
466
+ maybeAddAsset(assets, seenAssetRecords, {
467
+ url: absolute,
468
+ kind: "script",
469
+ savedPath
470
+ });
471
+ }
472
+ catch (error) {
473
+ warnings.push(`Failed to fetch script: ${absolute} (${String(error)})`);
474
+ }
475
+ }
476
+ if (everything) {
477
+ let inlineIndex = 0;
478
+ const inlineScripts = $("script:not([src])").toArray();
479
+ for (const el of inlineScripts) {
480
+ const raw = $(el).html();
481
+ if (!raw || !raw.trim())
482
+ continue;
483
+ inlineIndex += 1;
484
+ const targetPath = path.join(outDir, "src", "scripts", `inline-${inlineIndex}.js`);
485
+ await saveTextFile(targetPath, await formatMaybe(raw, "babel"));
486
+ const savedPath = path.relative(outDir, targetPath);
487
+ const rel = toRelativeWebPath(pageDir, targetPath);
488
+ $(el).attr("src", rel);
489
+ $(el).text("");
490
+ maybeAddAsset(assets, seenAssetRecords, {
491
+ url: `${current.url}#inline-script-${inlineIndex}`,
492
+ kind: "script",
493
+ savedPath
494
+ });
495
+ }
496
+ }
497
+ }
498
+ $("a[href]").each((_, el) => {
499
+ const href = $(el).attr("href");
500
+ if (!href)
501
+ return;
502
+ if (href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
503
+ return;
504
+ }
505
+ const absolute = normalizeUrl(href, current.url);
506
+ if (!absolute)
507
+ return;
508
+ const normalized = normalizePageUrl(absolute);
509
+ if (!normalized)
510
+ return;
511
+ if (!isSameOriginHtmlLink(normalized, rootOrigin))
512
+ return;
513
+ const targetPage = getPagePaths(normalized, outDir);
514
+ const targetHtml = targetPage.htmlPath;
515
+ const relHref = toRelativeWebPath(pageDir, targetHtml);
516
+ $(el).attr("href", relHref);
517
+ });
518
+ if (everything) {
519
+ const attrSelectors = [
520
+ { selector: "img[src]", attr: "src" },
521
+ { selector: "video[src]", attr: "src" },
522
+ { selector: "audio[src]", attr: "src" },
523
+ { selector: "track[src]", attr: "src" },
524
+ { selector: "iframe[src]", attr: "src" },
525
+ { selector: "embed[src]", attr: "src" },
526
+ { selector: "object[data]", attr: "data" },
527
+ { selector: "input[src]", attr: "src" },
528
+ { selector: "link[href]", attr: "href" },
529
+ { selector: "source[src]", attr: "src" },
530
+ { selector: "image[href]", attr: "href" },
531
+ { selector: "image[xlink\\:href]", attr: "xlink:href" },
532
+ { selector: "use[href]", attr: "href" },
533
+ { selector: "use[xlink\\:href]", attr: "xlink:href" }
534
+ ];
535
+ for (const { selector, attr } of attrSelectors) {
536
+ const nodes = $(selector).toArray();
537
+ for (const el of nodes) {
538
+ if (selector === "link[href]" && $(el).attr("rel")?.toLowerCase() === "stylesheet") {
539
+ continue;
540
+ }
541
+ await downloadAndRelinkAttribute($, el, attr, current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings);
542
+ }
543
+ }
544
+ const downloadableAnchors = $("a[href]").toArray();
545
+ for (const el of downloadableAnchors) {
546
+ const href = $(el).attr("href");
547
+ if (!href)
548
+ continue;
549
+ if (href.startsWith("#") || href.startsWith("mailto:") || href.startsWith("tel:") || href.startsWith("javascript:")) {
550
+ continue;
551
+ }
552
+ const absolute = normalizeUrl(href, current.url);
553
+ if (!absolute)
554
+ continue;
555
+ const normalized = normalizePageUrl(absolute);
556
+ const isInternalHtml = normalized ? isSameOriginHtmlLink(normalized, rootOrigin) : false;
557
+ if (isInternalHtml)
558
+ continue;
559
+ await downloadAndRelinkAttribute($, el, "href", current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings);
560
+ }
561
+ }
562
+ const cssMerged = cssBlocks.join("\n\n");
563
+ const cssUrls = Array.from(cssMerged.matchAll(/url\(\s*(['"]?)([^'")]+)\1\s*\)/g))
564
+ .map((m) => m[2])
565
+ .filter((u) => Boolean(u) && !u.startsWith("data:") && !u.startsWith("#"));
566
+ let rewrittenCss = cssMerged;
567
+ for (const cssUrl of cssUrls) {
568
+ const abs = normalizeUrl(cssUrl, current.url);
569
+ if (!abs)
570
+ continue;
571
+ const savedPath = await downloadBinaryAsset(abs, outDir, options.timeoutMs, options.userAgent, assetMap);
572
+ if (!savedPath)
573
+ continue;
574
+ const localPath = path.join(outDir, savedPath);
575
+ const rel = toRelativeWebPath(cssDir, localPath);
576
+ rewrittenCss = rewrittenCss.split(cssUrl).join(rel);
577
+ maybeAddAsset(assets, seenAssetRecords, {
578
+ url: abs,
579
+ kind: inferBinaryAssetKind(abs) === "font" ? "font" : "image",
580
+ savedPath
581
+ });
582
+ }
583
+ if (strictClean) {
584
+ strictCleanDom($);
585
+ }
586
+ const finalHtml = strictClean ? $.html().replace(/<!--[\s\S]*?-->/g, "") : $.html();
587
+ for (const u of findRemoteUrls(finalHtml)) {
588
+ remoteUrlsRemaining.add(u);
589
+ }
590
+ for (const u of findRemoteUrls(rewrittenCss)) {
591
+ remoteUrlsRemaining.add(u);
592
+ }
593
+ await saveTextFile(pagePaths.htmlPath, await formatMaybe(finalHtml, "html"));
594
+ await saveTextFile(pagePaths.cssPath, await formatMaybe(rewrittenCss, "css"));
595
+ pages.push({
596
+ url: current.url,
597
+ htmlPath: path.relative(outDir, pagePaths.htmlPath),
598
+ cssPath: path.relative(outDir, pagePaths.cssPath)
599
+ });
600
+ }
601
+ const scriptsCount = assets.filter((a) => a.kind === "script").length;
602
+ const stylesheetsCount = assets.filter((a) => a.kind === "stylesheet").length;
603
+ const fontsCount = assets.filter((a) => a.kind === "font").length;
604
+ const imagesCount = assets.filter((a) => a.kind === "image").length;
605
+ const othersCount = assets.filter((a) => a.kind === "other").length;
606
+ const verification = {
607
+ pages: pages.length,
608
+ assets: assets.length,
609
+ scripts: scriptsCount,
610
+ stylesheets: stylesheetsCount,
611
+ fonts: fontsCount,
612
+ images: imagesCount,
613
+ others: othersCount,
614
+ remoteUrlsRemaining: remoteUrlsRemaining.size
615
+ };
616
+ await saveTextFile(path.join(outDir, "manifest.json"), JSON.stringify({
617
+ sourceUrl: options.url,
618
+ mode: options.mode,
619
+ everything,
620
+ strictClean,
621
+ exportedAt: new Date().toISOString(),
622
+ crawlDepth: options.crawlDepth,
623
+ maxPages: options.maxPages,
624
+ pageCount: pages.length,
625
+ pages,
626
+ assetCount: assets.length,
627
+ verification,
628
+ warnings
629
+ }, null, 2));
630
+ const rootPagePaths = getPagePaths(normalizedRootUrl, outDir);
631
+ return {
632
+ url: options.url,
633
+ outDir,
634
+ htmlPath: rootPagePaths.htmlPath,
635
+ cssPath: rootPagePaths.cssPath,
636
+ verification,
637
+ pageCount: pages.length,
638
+ pages,
639
+ assets,
640
+ warnings
641
+ };
642
+ }
@@ -0,0 +1,104 @@
1
+ import { readFile } from "node:fs/promises";
2
+ import { createServer } from "node:http";
3
+ import path from "node:path";
4
/**
 * Maps lowercase file extensions (including the leading dot) to the HTTP
 * Content-Type header the preview server sends. Extensions not listed fall
 * back to "application/octet-stream" in contentTypeFor().
 *
 * Frozen because it is shared, module-level configuration.
 */
const CONTENT_TYPES = Object.freeze({
  ".html": "text/html; charset=utf-8",
  ".css": "text/css; charset=utf-8",
  ".js": "application/javascript; charset=utf-8",
  ".mjs": "application/javascript; charset=utf-8",
  ".json": "application/json; charset=utf-8",
  ".txt": "text/plain; charset=utf-8",
  ".xml": "application/xml; charset=utf-8",
  ".svg": "image/svg+xml",
  ".png": "image/png",
  ".jpg": "image/jpeg",
  ".jpeg": "image/jpeg",
  ".gif": "image/gif",
  ".webp": "image/webp",
  ".avif": "image/avif",
  ".ico": "image/x-icon",
  ".woff": "font/woff",
  ".woff2": "font/woff2",
  ".ttf": "font/ttf",
  ".otf": "font/otf",
  // Media/document types the crawler downloads via <video>/<audio>/<track>
  // and downloadable anchors; without these they were served as
  // application/octet-stream, which breaks in-browser playback/preview.
  ".mp4": "video/mp4",
  ".webm": "video/webm",
  ".mp3": "audio/mpeg",
  ".wav": "audio/wav",
  ".ogg": "audio/ogg",
  ".vtt": "text/vtt",
  ".pdf": "application/pdf"
});

/**
 * Resolve the Content-Type header value for a file path by its extension.
 *
 * @param {string} filePath - Path whose (case-insensitive) extension decides the type.
 * @returns {string} A MIME type string, defaulting to "application/octet-stream".
 */
function contentTypeFor(filePath) {
  return CONTENT_TYPES[path.extname(filePath).toLowerCase()] ?? "application/octet-stream";
}
24
/**
 * Read a file's contents, treating any failure (missing file, directory,
 * permission error) as "not found".
 *
 * @param {string} filePath - Path to read.
 * @returns {Promise<Buffer|null>} The file's bytes, or null when unreadable.
 */
function tryRead(filePath) {
  // Swallow the read error deliberately: callers probe an ordered list of
  // candidate paths and only care whether this one produced data.
  return readFile(filePath).catch(() => null);
}
32
/**
 * Normalize a request pathname to a safe, absolute, decoded form.
 *
 * Percent-decodes the path, forces a leading "/", and collapses "." / ".."
 * segments with path.posix.normalize so traversal attempts resolve inside
 * the root.
 *
 * @param {string} pathname - Raw pathname from the request URL (may be empty).
 * @returns {string} A normalized pathname that always starts with "/".
 */
function sanitizePathname(pathname) {
  const raw = pathname || "/";
  let decoded;
  try {
    decoded = decodeURIComponent(raw);
  } catch {
    // Malformed percent-encoding (e.g. "/%zz") makes decodeURIComponent
    // throw a URIError; previously that escaped the async request handler
    // as an unhandled rejection. Fall back to the undecoded path instead.
    decoded = raw;
  }
  const normalized = path.posix.normalize(decoded.startsWith("/") ? decoded : `/${decoded}`);
  return normalized.startsWith("/") ? normalized : `/${normalized}`;
}
37
/**
 * Compute the ordered list of candidate files to try for a request path.
 *
 * Routes map onto the scraper's output layout: the site root lives in
 * `src/index.html`, per-page output in `src/pages/<route>/index.html` with a
 * sibling `styles.css`, and raw assets under `assets/` or `src/`. The literal
 * filesystem mapping of the URL is kept as a final fallback so unexpected
 * layouts still resolve.
 *
 * @param {string} rootDir - Absolute root directory of the exported site.
 * @param {string} reqPath - Sanitized pathname (always starts with "/").
 * @returns {string[]} Candidate absolute paths, most specific first.
 */
function resolveFromRoute(rootDir, reqPath) {
  // Literal mapping of the URL onto the filesystem; last-resort candidate.
  const fallback = path.join(rootDir, reqPath.slice(1));
  const srcIndex = path.join(rootDir, "src", "index.html");
  const srcStyles = path.join(rootDir, "src", "styles.css");
  const pageFile = (route, file) =>
    path.join(rootDir, "src", "pages", ...route.split("/"), file);

  switch (reqPath) {
    case "/":
      return [srcIndex, fallback];
    case "/manifest.json":
      return [path.join(rootDir, "manifest.json"), fallback];
  }

  // Asset-like prefixes are served verbatim with no routing fallbacks.
  if (reqPath.startsWith("/assets/") || reqPath.startsWith("/src/")) {
    return [fallback];
  }

  if (reqPath === "/styles.css") {
    return [srcStyles, fallback];
  }

  if (reqPath.endsWith("/styles.css")) {
    // Strip the leading "/" and the trailing "/styles.css" to get the route.
    const route = reqPath.slice(1, -("styles.css".length + 1)).replace(/\/+$/, "");
    return route ? [pageFile(route, "styles.css"), fallback] : [srcStyles, fallback];
  }

  if (reqPath.endsWith(".html")) {
    if (reqPath === "/index.html") {
      return [srcIndex, fallback];
    }
    const route = reqPath.slice(1, -".html".length).replace(/\/+$/, "");
    return [pageFile(route, "index.html"), path.join(rootDir, "src", reqPath.slice(1)), fallback];
  }

  // Extensionless paths are treated as page routes ("/about" -> about page).
  if (path.extname(reqPath) === "") {
    const route = reqPath.slice(1).replace(/\/+$/, "");
    return route ? [pageFile(route, "index.html"), fallback] : [srcIndex, fallback];
  }

  return [fallback];
}
75
/**
 * Start a static preview server over a scraped-site output directory.
 *
 * Each request path is sanitized, rejected with 400 if it still contains
 * "..", then resolved to an ordered list of candidate files; the first
 * readable candidate is served with a Content-Type inferred from its
 * extension. Anything else gets a plain-text 404.
 *
 * @param {{ dir: string, port: number, host?: string }} options - Root
 *   directory to serve plus listen address.
 * @returns {Promise<import("node:http").Server>} The listening server.
 */
export async function startPreviewServer(options) {
  const rootDir = path.resolve(options.dir);

  const handleRequest = async (req, res) => {
    const { pathname } = new URL(req.url || "/", "http://localhost");
    const reqPath = sanitizePathname(pathname);

    if (reqPath.includes("..")) {
      res.statusCode = 400;
      res.end("Bad Request");
      return;
    }

    for (const candidate of resolveFromRoute(rootDir, reqPath)) {
      const body = await tryRead(candidate);
      if (!body) continue;
      res.statusCode = 200;
      res.setHeader("Content-Type", contentTypeFor(candidate));
      res.end(body);
      return;
    }

    res.statusCode = 404;
    res.setHeader("Content-Type", "text/plain; charset=utf-8");
    res.end("Not found");
  };

  const server = createServer(handleRequest);

  // Resolve once the socket is bound; a bind failure rejects instead.
  await new Promise((resolve, reject) => {
    server.once("error", reject);
    server.listen(options.port, options.host, () => resolve());
  });

  return server;
}
package/dist/types.js ADDED
@@ -0,0 +1 @@
1
+ export {};
package/package.json ADDED
@@ -0,0 +1,42 @@
1
+ {
2
+ "name": "cleanscrape",
3
+ "version": "0.1.0",
4
+ "description": "Clean frontend extractor: convert live sites into editable HTML/CSS/assets",
5
+ "type": "module",
6
+ "bin": {
7
+ "cleanscrape": "dist/cli.js",
8
+ "scraper": "dist/cli.js",
9
+ "scrapify": "dist/cli.js",
10
+ "better-wget": "dist/cli.js"
11
+ },
12
+ "scripts": {
13
+ "build": "tsc -p tsconfig.json",
14
+ "dev": "tsx src/cli.ts",
15
+ "start": "node dist/cli.js",
16
+ "check": "tsc --noEmit -p tsconfig.json"
17
+ },
18
+ "files": [
19
+ "dist",
20
+ "README.md",
21
+ "LICENSE"
22
+ ],
23
+ "keywords": [
24
+ "wget",
25
+ "curl",
26
+ "scraping",
27
+ "frontend",
28
+ "playwright"
29
+ ],
30
+ "author": "",
31
+ "license": "MIT",
32
+ "dependencies": {
33
+ "cheerio": "^1.0.0",
34
+ "commander": "^14.0.0",
35
+ "prettier": "^3.6.0"
36
+ },
37
+ "devDependencies": {
38
+ "@types/node": "^24.5.1",
39
+ "tsx": "^4.20.5",
40
+ "typescript": "^5.9.2"
41
+ }
42
+ }