cleanscrape 0.1.1 → 0.1.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +3 -0
- package/dist/extractor.js +43 -7
- package/package.json +3 -2
package/README.md
CHANGED
|
@@ -40,6 +40,9 @@ cleanscrape default https://your-app.vercel.app
|
|
|
40
40
|
# run the scraped site locally
|
|
41
41
|
cleanscrape run ./output/example --port 4173
|
|
42
42
|
|
|
43
|
+
# same preview server via npm script
|
|
44
|
+
npm run dev
|
|
45
|
+
|
|
43
46
|
# everything mode is now default; disable with --no-everything
|
|
44
47
|
cleanscrape https://example.com -o ./output/example --mode clean
|
|
45
48
|
|
package/dist/extractor.js
CHANGED
|
@@ -208,6 +208,33 @@ function isSameOriginHtmlLink(candidateUrl, rootOrigin) {
|
|
|
208
208
|
return false;
|
|
209
209
|
return true;
|
|
210
210
|
}
|
|
211
|
+
/**
 * Public suffixes made of two labels under which unrelated parties can
 * register names. Deliberately small and NOT exhaustive; the authoritative
 * source is the Public Suffix List.
 */
const MULTI_LABEL_PUBLIC_SUFFIXES = new Set([
    "co.uk", "org.uk", "ac.uk", "gov.uk",
    "com.au", "net.au", "org.au",
    "co.nz", "co.jp", "co.kr", "co.in", "co.za",
    "com.br", "com.cn", "com.mx", "com.tr", "com.sg",
    "github.io", "vercel.app", "netlify.app", "herokuapp.com",
    "pages.dev", "web.app", "firebaseapp.com",
]);

/**
 * Reduce a hostname to the domain used for "same site" comparisons.
 *
 * - Case-insensitive; empty labels (e.g. a trailing dot) are ignored.
 * - IPv4 literals are returned unchanged: they have no registrable domain,
 *   and keeping only the last two octets would make unrelated addresses
 *   compare equal.
 * - Hosts with one or two labels are returned whole.
 * - Otherwise the last two labels are returned, or the last three when the
 *   last two form a known multi-label public suffix (e.g. `co.uk`).
 *
 * @param {string} hostname - Host name without scheme or port.
 * @returns {string} Lower-cased base domain.
 */
function getBaseDomain(hostname) {
    const lowered = hostname.toLowerCase();
    // Dotted-quad IPv4: compare as a whole, never by its trailing octets.
    if (/^\d{1,3}(?:\.\d{1,3}){3}$/.test(lowered))
        return lowered;
    const labels = lowered.split(".").filter(Boolean);
    if (labels.length <= 2)
        return labels.join(".");
    const lastTwo = labels.slice(-2).join(".");
    // Under a shared public suffix the registrable name needs one more label.
    if (MULTI_LABEL_PUBLIC_SUFFIXES.has(lastTwo))
        return labels.slice(-3).join(".");
    return lastTwo;
}
|
|
217
|
+
/**
 * Decide whether an asset URL is hosted somewhere related to the site being
 * scraped.
 *
 * An asset host is accepted when it is the root host itself, a subdomain of
 * the root host, or shares the same base domain as reported by
 * `getBaseDomain`. Comparison is case-insensitive.
 *
 * @param {string} assetUrl - Absolute URL of the asset.
 * @param {string} rootHost - Hostname of the site being scraped.
 * @returns {boolean} `true` when the host is accepted; `false` otherwise,
 *   including when `assetUrl` cannot be parsed or `rootHost` is not a string.
 */
function isAllowedAssetHost(assetUrl, rootHost) {
    try {
        const assetHost = new URL(assetUrl).hostname.toLowerCase();
        const siteHost = rootHost.toLowerCase();
        if (assetHost === siteHost)
            return true;
        if (assetHost.endsWith(`.${siteHost}`))
            return true;
        // Sibling subdomains (cdn.example.com vs www.example.com) land here.
        return getBaseDomain(assetHost) === getBaseDomain(siteHost);
    }
    catch {
        // Unparseable URL or non-string root host: treat as not allowed.
        return false;
    }
}
|
|
229
|
+
/**
 * Heuristic: does the URL's path end in something that looks like a file
 * name with an extension (e.g. `/img/logo.png`)?
 *
 * Only the path is inspected; query string and fragment are ignored.
 * URL paths always use `/`, so the POSIX flavour of `extname` is used to
 * keep the result independent of the host platform. A bare trailing dot
 * (`/download.`) is not counted as an extension.
 *
 * @param {string} urlStr - Absolute URL to inspect.
 * @returns {boolean} `true` when the last path segment has a non-empty
 *   extension; `false` otherwise or when `urlStr` is not a valid URL.
 */
function looksLikeFileUrl(urlStr) {
    try {
        const { pathname } = new URL(urlStr);
        // extname yields "." for a name ending in a dot; require ".x" or longer.
        return path.posix.extname(pathname).length > 1;
    }
    catch {
        // new URL() throws on relative or malformed input.
        return false;
    }
}
|
|
211
238
|
function toRelativeWebPath(fromDir, toPath) {
|
|
212
239
|
const rel = path.relative(fromDir, toPath).split(path.sep).join("/");
|
|
213
240
|
if (rel === "")
|
|
@@ -237,7 +264,7 @@ function findRemoteUrls(content) {
|
|
|
237
264
|
const matches = content.match(/(?:https?:)?\/\/[^\s"'()<>]+/g) || [];
|
|
238
265
|
return matches.map((u) => u.trim()).filter(Boolean);
|
|
239
266
|
}
|
|
240
|
-
async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, onProgress) {
|
|
267
|
+
async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, onProgress, rootHost) {
|
|
241
268
|
const value = $(el).attr(attr);
|
|
242
269
|
if (!value)
|
|
243
270
|
return;
|
|
@@ -252,8 +279,12 @@ async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir,
|
|
|
252
279
|
const absolute = normalizeUrl(value, baseUrl);
|
|
253
280
|
if (!absolute)
|
|
254
281
|
return;
|
|
282
|
+
if (!looksLikeFileUrl(absolute))
|
|
283
|
+
return;
|
|
255
284
|
if (isTrackerUrl(absolute))
|
|
256
285
|
return;
|
|
286
|
+
if (rootHost && !isAllowedAssetHost(absolute, rootHost))
|
|
287
|
+
return;
|
|
257
288
|
const savedPath = await downloadBinaryAsset(absolute, outDir, options.timeoutMs, options.userAgent, assetMap);
|
|
258
289
|
if (!savedPath) {
|
|
259
290
|
warnings.push(`Failed to fetch asset: ${absolute}`);
|
|
@@ -284,6 +315,7 @@ export async function extractFrontend(options) {
|
|
|
284
315
|
throw new Error(`Invalid URL: ${options.url}`);
|
|
285
316
|
}
|
|
286
317
|
const rootOrigin = new URL(normalizedRootUrl).origin;
|
|
318
|
+
const rootHost = new URL(normalizedRootUrl).hostname;
|
|
287
319
|
const queue = [{ url: normalizedRootUrl, depth: 0 }];
|
|
288
320
|
const visited = new Set();
|
|
289
321
|
const assets = [];
|
|
@@ -540,7 +572,10 @@ export async function extractFrontend(options) {
|
|
|
540
572
|
{ selector: "embed[src]", attr: "src" },
|
|
541
573
|
{ selector: "object[data]", attr: "data" },
|
|
542
574
|
{ selector: "input[src]", attr: "src" },
|
|
543
|
-
{ selector: "link[href]", attr: "href" },
|
|
575
|
+
{ selector: "link[rel~='icon'][href]", attr: "href" },
|
|
576
|
+
{ selector: "link[rel='manifest'][href]", attr: "href" },
|
|
577
|
+
{ selector: "link[rel='preload'][href]", attr: "href" },
|
|
578
|
+
{ selector: "link[rel='prefetch'][href]", attr: "href" },
|
|
544
579
|
{ selector: "source[src]", attr: "src" },
|
|
545
580
|
{ selector: "image[href]", attr: "href" },
|
|
546
581
|
{ selector: "image[xlink\\:href]", attr: "xlink:href" },
|
|
@@ -550,10 +585,7 @@ export async function extractFrontend(options) {
|
|
|
550
585
|
for (const { selector, attr } of attrSelectors) {
|
|
551
586
|
const nodes = $(selector).toArray();
|
|
552
587
|
for (const el of nodes) {
|
|
553
|
-
|
|
554
|
-
continue;
|
|
555
|
-
}
|
|
556
|
-
await downloadAndRelinkAttribute($, el, attr, current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, progress);
|
|
588
|
+
await downloadAndRelinkAttribute($, el, attr, current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, progress, rootHost);
|
|
557
589
|
}
|
|
558
590
|
}
|
|
559
591
|
const downloadableAnchors = $("a[href]").toArray();
|
|
@@ -567,11 +599,15 @@ export async function extractFrontend(options) {
|
|
|
567
599
|
const absolute = normalizeUrl(href, current.url);
|
|
568
600
|
if (!absolute)
|
|
569
601
|
continue;
|
|
602
|
+
if (!looksLikeFileUrl(absolute))
|
|
603
|
+
continue;
|
|
604
|
+
if (!isAllowedAssetHost(absolute, rootHost))
|
|
605
|
+
continue;
|
|
570
606
|
const normalized = normalizePageUrl(absolute);
|
|
571
607
|
const isInternalHtml = normalized ? isSameOriginHtmlLink(normalized, rootOrigin) : false;
|
|
572
608
|
if (isInternalHtml)
|
|
573
609
|
continue;
|
|
574
|
-
await downloadAndRelinkAttribute($, el, "href", current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, progress);
|
|
610
|
+
await downloadAndRelinkAttribute($, el, "href", current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, progress, rootHost);
|
|
575
611
|
}
|
|
576
612
|
}
|
|
577
613
|
const cssMerged = cssBlocks.join("\n\n");
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "cleanscrape",
|
|
3
|
-
"version": "0.1.1",
|
|
3
|
+
"version": "0.1.3",
|
|
4
4
|
"description": "Clean frontend extractor: convert live sites into editable HTML/CSS/assets",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"bin": {
|
|
@@ -11,7 +11,8 @@
|
|
|
11
11
|
},
|
|
12
12
|
"scripts": {
|
|
13
13
|
"build": "tsc -p tsconfig.json",
|
|
14
|
-
"dev": "
|
|
14
|
+
"dev": "node dist/cli.js run ./output --port 4173",
|
|
15
|
+
"dev:cli": "tsx src/cli.ts",
|
|
15
16
|
"start": "node dist/cli.js",
|
|
16
17
|
"check": "tsc --noEmit -p tsconfig.json"
|
|
17
18
|
},
|