cleanscrape 0.1.0 → 0.1.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +3 -0
- package/dist/extractor.js +69 -13
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -133,6 +133,7 @@ async function runExtraction(url, opts) {
|
|
|
133
133
|
mode,
|
|
134
134
|
everything: opts.everything !== false,
|
|
135
135
|
strictClean: Boolean(opts.strictClean),
|
|
136
|
+
onProgress: opts.quiet ? undefined : (message) => console.log(message),
|
|
136
137
|
timeoutMs,
|
|
137
138
|
crawlDepth,
|
|
138
139
|
maxPages,
|
|
@@ -237,6 +238,7 @@ if (isScrapify) {
|
|
|
237
238
|
.option("--everything", "Capture all discoverable assets/code and keep scripts")
|
|
238
239
|
.option("--no-everything", "Disable full capture mode")
|
|
239
240
|
.option("--strict-clean", "Aggressively strip noisy attributes/comments for cleaner manual edits")
|
|
241
|
+
.option("--quiet", "Hide per-file streaming logs")
|
|
240
242
|
.option("--save-default", "Save this run's URL/options as defaults")
|
|
241
243
|
.option("-t, --timeout <ms>", "Timeout in milliseconds", defaultOptions.timeout)
|
|
242
244
|
.option("-d, --depth <n>", "Internal link crawl depth", defaultOptions.depth)
|
|
@@ -272,6 +274,7 @@ if (isScrapify) {
|
|
|
272
274
|
.option("--everything", "Capture all discoverable assets/code and keep scripts")
|
|
273
275
|
.option("--no-everything", "Disable full capture mode")
|
|
274
276
|
.option("--strict-clean", "Aggressively strip noisy attributes/comments for cleaner manual edits")
|
|
277
|
+
.option("--quiet", "Hide per-file streaming logs")
|
|
275
278
|
.option("--save-default", "Save this run's URL/options as defaults")
|
|
276
279
|
.option("-t, --timeout <ms>", "Timeout in milliseconds", defaultOptions.timeout)
|
|
277
280
|
.option("-d, --depth <n>", "Internal link crawl depth", defaultOptions.depth)
|
package/dist/extractor.js
CHANGED
|
@@ -186,9 +186,10 @@ async function downloadBinaryAsset(assetUrl, outDir, timeoutMs, userAgent, asset
|
|
|
186
186
|
function maybeAddAsset(assets, seen, record) {
|
|
187
187
|
const key = `${record.kind}:${record.savedPath}`;
|
|
188
188
|
if (seen.has(key))
|
|
189
|
-
return;
|
|
189
|
+
return false;
|
|
190
190
|
seen.add(key);
|
|
191
191
|
assets.push(record);
|
|
192
|
+
return true;
|
|
192
193
|
}
|
|
193
194
|
function isSameOriginHtmlLink(candidateUrl, rootOrigin) {
|
|
194
195
|
let parsed;
|
|
@@ -207,6 +208,33 @@ function isSameOriginHtmlLink(candidateUrl, rootOrigin) {
|
|
|
207
208
|
return false;
|
|
208
209
|
return true;
|
|
209
210
|
}
|
|
211
|
+
/**
 * Labels that commonly act as a registry-controlled second level beneath a
 * two-letter country-code TLD (for example "co.uk" or "com.au").
 * This is a heuristic only; it does not replace the Public Suffix List.
 */
const MULTI_PART_SUFFIX_LABELS = new Set(["ac", "co", "com", "edu", "gov", "net", "org"]);

/**
 * Approximate the registrable ("base") domain of a hostname so that related
 * hosts such as "cdn.example.com" and "www.example.com" compare equal.
 *
 * @param {string} hostname - Hostname to reduce, e.g. "cdn.example.com".
 * @returns {string} Lower-cased base domain with empty labels (leading,
 *   trailing or doubled dots) removed.
 */
function getBaseDomain(hostname) {
    const labels = hostname.toLowerCase().split(".").filter(Boolean);
    // Re-join the filtered labels so "example.com." and "example.com" agree.
    const normalized = labels.join(".");
    if (labels.length <= 2)
        return normalized;
    // A dotted IPv4 address has no base domain; truncating it to the last two
    // octets would make unrelated addresses look like the same site.
    if (labels.length === 4 && labels.every((label) => /^\d{1,3}$/.test(label)))
        return normalized;
    const topLevel = labels[labels.length - 1];
    const secondLevel = labels[labels.length - 2];
    // Under suffixes like "co.uk" the registrable name is three labels long;
    // keeping only two would treat every "*.co.uk" host as the same site.
    const keep = topLevel.length === 2 && MULTI_PART_SUFFIX_LABELS.has(secondLevel) ? 3 : 2;
    return labels.slice(-keep).join(".");
}
|
|
217
|
+
/**
 * Decide whether an asset URL lives on the crawled site or a related host.
 *
 * A host is accepted when it equals the root host, is a subdomain of it, or
 * reduces to the same base domain via getBaseDomain.
 *
 * @param {string} assetUrl - Absolute URL of the candidate asset.
 * @param {string} rootHost - Hostname of the page the crawl started from.
 * @returns {boolean} True when the asset host is accepted; false otherwise,
 *   including when assetUrl cannot be parsed or rootHost is not a string.
 */
function isAllowedAssetHost(assetUrl, rootHost) {
    try {
        const candidateHost = new URL(assetUrl).hostname.toLowerCase();
        const siteHost = rootHost.toLowerCase();
        if (candidateHost === siteHost)
            return true;
        if (candidateHost.endsWith(`.${siteHost}`))
            return true;
        // Fall back to comparing base domains so sibling subdomains match.
        return getBaseDomain(candidateHost) === getBaseDomain(siteHost);
    }
    catch {
        // Invalid URL or unusable rootHost: treat as not allowed.
        return false;
    }
}
|
|
229
|
+
/**
 * Report whether a URL's path ends in something that looks like a file name,
 * i.e. its last path segment carries a non-empty extension.
 *
 * @param {string} urlStr - Absolute URL to inspect.
 * @returns {boolean} True when the last path segment has an extension;
 *   false otherwise, including when urlStr is not a parseable URL.
 */
function looksLikeFileUrl(urlStr) {
    let pathname;
    try {
        pathname = new URL(urlStr).pathname;
    }
    catch {
        return false;
    }
    // path.extname yields "." for a segment that merely ends in a dot
    // (e.g. "/about."), so require at least one character after the dot.
    return path.extname(pathname.toLowerCase()).length > 1;
}
|
|
210
238
|
function toRelativeWebPath(fromDir, toPath) {
|
|
211
239
|
const rel = path.relative(fromDir, toPath).split(path.sep).join("/");
|
|
212
240
|
if (rel === "")
|
|
@@ -236,7 +264,7 @@ function findRemoteUrls(content) {
|
|
|
236
264
|
const matches = content.match(/(?:https?:)?\/\/[^\s"'()<>]+/g) || [];
|
|
237
265
|
return matches.map((u) => u.trim()).filter(Boolean);
|
|
238
266
|
}
|
|
239
|
-
async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings) {
|
|
267
|
+
async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, onProgress, rootHost) {
|
|
240
268
|
const value = $(el).attr(attr);
|
|
241
269
|
if (!value)
|
|
242
270
|
return;
|
|
@@ -251,8 +279,12 @@ async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir,
|
|
|
251
279
|
const absolute = normalizeUrl(value, baseUrl);
|
|
252
280
|
if (!absolute)
|
|
253
281
|
return;
|
|
282
|
+
if (!looksLikeFileUrl(absolute))
|
|
283
|
+
return;
|
|
254
284
|
if (isTrackerUrl(absolute))
|
|
255
285
|
return;
|
|
286
|
+
if (rootHost && !isAllowedAssetHost(absolute, rootHost))
|
|
287
|
+
return;
|
|
256
288
|
const savedPath = await downloadBinaryAsset(absolute, outDir, options.timeoutMs, options.userAgent, assetMap);
|
|
257
289
|
if (!savedPath) {
|
|
258
290
|
warnings.push(`Failed to fetch asset: ${absolute}`);
|
|
@@ -261,13 +293,19 @@ async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir,
|
|
|
261
293
|
const localPath = path.join(outDir, savedPath);
|
|
262
294
|
const rel = toRelativeWebPath(pageDir, localPath);
|
|
263
295
|
$(el).attr(attr, rel);
|
|
264
|
-
maybeAddAsset(assets, seenAssetRecords, {
|
|
296
|
+
const added = maybeAddAsset(assets, seenAssetRecords, {
|
|
265
297
|
url: absolute,
|
|
266
298
|
kind: inferAssetKind(absolute),
|
|
267
299
|
savedPath
|
|
268
300
|
});
|
|
301
|
+
if (added && onProgress)
|
|
302
|
+
onProgress(`[asset:${inferAssetKind(absolute)}] ${savedPath}`);
|
|
269
303
|
}
|
|
270
304
|
export async function extractFrontend(options) {
|
|
305
|
+
const progress = (msg) => {
|
|
306
|
+
if (options.onProgress)
|
|
307
|
+
options.onProgress(msg);
|
|
308
|
+
};
|
|
271
309
|
const everything = options.everything === true;
|
|
272
310
|
const strictClean = options.strictClean === true;
|
|
273
311
|
const outDir = path.resolve(options.outDir);
|
|
@@ -277,6 +315,7 @@ export async function extractFrontend(options) {
|
|
|
277
315
|
throw new Error(`Invalid URL: ${options.url}`);
|
|
278
316
|
}
|
|
279
317
|
const rootOrigin = new URL(normalizedRootUrl).origin;
|
|
318
|
+
const rootHost = new URL(normalizedRootUrl).hostname;
|
|
280
319
|
const queue = [{ url: normalizedRootUrl, depth: 0 }];
|
|
281
320
|
const visited = new Set();
|
|
282
321
|
const assets = [];
|
|
@@ -292,6 +331,7 @@ export async function extractFrontend(options) {
|
|
|
292
331
|
if (visited.has(current.url))
|
|
293
332
|
continue;
|
|
294
333
|
visited.add(current.url);
|
|
334
|
+
progress(`[page] Fetching ${current.url}`);
|
|
295
335
|
let htmlRes;
|
|
296
336
|
try {
|
|
297
337
|
htmlRes = await fetchWithTimeout(current.url, options.timeoutMs, options.userAgent);
|
|
@@ -311,6 +351,7 @@ export async function extractFrontend(options) {
|
|
|
311
351
|
}
|
|
312
352
|
const domHtml = await htmlRes.text();
|
|
313
353
|
const $ = load(domHtml);
|
|
354
|
+
progress(`[page] Parsing ${current.url}`);
|
|
314
355
|
if (current.depth < options.crawlDepth) {
|
|
315
356
|
const discovered = new Set();
|
|
316
357
|
$("a[href]").each((_, el) => {
|
|
@@ -415,11 +456,13 @@ export async function extractFrontend(options) {
|
|
|
415
456
|
const localPath = path.join(outDir, savedPath);
|
|
416
457
|
const rel = toRelativeWebPath(pageDir, localPath);
|
|
417
458
|
$(el).attr(attr, rel);
|
|
418
|
-
maybeAddAsset(assets, seenAssetRecords, {
|
|
459
|
+
const added = maybeAddAsset(assets, seenAssetRecords, {
|
|
419
460
|
url: absolute,
|
|
420
461
|
kind: inferBinaryAssetKind(absolute) === "font" ? "font" : "image",
|
|
421
462
|
savedPath
|
|
422
463
|
});
|
|
464
|
+
if (added)
|
|
465
|
+
progress(`[asset:image] ${savedPath}`);
|
|
423
466
|
}
|
|
424
467
|
catch (error) {
|
|
425
468
|
warnings.push(`Failed to fetch media: ${absolute} (${String(error)})`);
|
|
@@ -463,11 +506,13 @@ export async function extractFrontend(options) {
|
|
|
463
506
|
const localPath = path.join(outDir, savedPath);
|
|
464
507
|
const rel = toRelativeWebPath(pageDir, localPath);
|
|
465
508
|
$(el).attr("src", rel);
|
|
466
|
-
maybeAddAsset(assets, seenAssetRecords, {
|
|
509
|
+
const added = maybeAddAsset(assets, seenAssetRecords, {
|
|
467
510
|
url: absolute,
|
|
468
511
|
kind: "script",
|
|
469
512
|
savedPath
|
|
470
513
|
});
|
|
514
|
+
if (added)
|
|
515
|
+
progress(`[asset:script] ${savedPath}`);
|
|
471
516
|
}
|
|
472
517
|
catch (error) {
|
|
473
518
|
warnings.push(`Failed to fetch script: ${absolute} (${String(error)})`);
|
|
@@ -487,11 +532,13 @@ export async function extractFrontend(options) {
|
|
|
487
532
|
const rel = toRelativeWebPath(pageDir, targetPath);
|
|
488
533
|
$(el).attr("src", rel);
|
|
489
534
|
$(el).text("");
|
|
490
|
-
maybeAddAsset(assets, seenAssetRecords, {
|
|
535
|
+
const added = maybeAddAsset(assets, seenAssetRecords, {
|
|
491
536
|
url: `${current.url}#inline-script-${inlineIndex}`,
|
|
492
537
|
kind: "script",
|
|
493
538
|
savedPath
|
|
494
539
|
});
|
|
540
|
+
if (added)
|
|
541
|
+
progress(`[asset:script] ${savedPath}`);
|
|
495
542
|
}
|
|
496
543
|
}
|
|
497
544
|
}
|
|
@@ -525,7 +572,10 @@ export async function extractFrontend(options) {
|
|
|
525
572
|
{ selector: "embed[src]", attr: "src" },
|
|
526
573
|
{ selector: "object[data]", attr: "data" },
|
|
527
574
|
{ selector: "input[src]", attr: "src" },
|
|
528
|
-
{ selector: "link[href]", attr: "href" },
|
|
575
|
+
{ selector: "link[rel~='icon'][href]", attr: "href" },
|
|
576
|
+
{ selector: "link[rel='manifest'][href]", attr: "href" },
|
|
577
|
+
{ selector: "link[rel='preload'][href]", attr: "href" },
|
|
578
|
+
{ selector: "link[rel='prefetch'][href]", attr: "href" },
|
|
529
579
|
{ selector: "source[src]", attr: "src" },
|
|
530
580
|
{ selector: "image[href]", attr: "href" },
|
|
531
581
|
{ selector: "image[xlink\\:href]", attr: "xlink:href" },
|
|
@@ -535,10 +585,7 @@ export async function extractFrontend(options) {
|
|
|
535
585
|
for (const { selector, attr } of attrSelectors) {
|
|
536
586
|
const nodes = $(selector).toArray();
|
|
537
587
|
for (const el of nodes) {
|
|
538
|
-
|
|
539
|
-
continue;
|
|
540
|
-
}
|
|
541
|
-
await downloadAndRelinkAttribute($, el, attr, current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings);
|
|
588
|
+
await downloadAndRelinkAttribute($, el, attr, current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, progress, rootHost);
|
|
542
589
|
}
|
|
543
590
|
}
|
|
544
591
|
const downloadableAnchors = $("a[href]").toArray();
|
|
@@ -552,11 +599,15 @@ export async function extractFrontend(options) {
|
|
|
552
599
|
const absolute = normalizeUrl(href, current.url);
|
|
553
600
|
if (!absolute)
|
|
554
601
|
continue;
|
|
602
|
+
if (!looksLikeFileUrl(absolute))
|
|
603
|
+
continue;
|
|
604
|
+
if (!isAllowedAssetHost(absolute, rootHost))
|
|
605
|
+
continue;
|
|
555
606
|
const normalized = normalizePageUrl(absolute);
|
|
556
607
|
const isInternalHtml = normalized ? isSameOriginHtmlLink(normalized, rootOrigin) : false;
|
|
557
608
|
if (isInternalHtml)
|
|
558
609
|
continue;
|
|
559
|
-
await downloadAndRelinkAttribute($, el, "href", current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings);
|
|
610
|
+
await downloadAndRelinkAttribute($, el, "href", current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, progress, rootHost);
|
|
560
611
|
}
|
|
561
612
|
}
|
|
562
613
|
const cssMerged = cssBlocks.join("\n\n");
|
|
@@ -574,11 +625,13 @@ export async function extractFrontend(options) {
|
|
|
574
625
|
const localPath = path.join(outDir, savedPath);
|
|
575
626
|
const rel = toRelativeWebPath(cssDir, localPath);
|
|
576
627
|
rewrittenCss = rewrittenCss.split(cssUrl).join(rel);
|
|
577
|
-
maybeAddAsset(assets, seenAssetRecords, {
|
|
628
|
+
const added = maybeAddAsset(assets, seenAssetRecords, {
|
|
578
629
|
url: abs,
|
|
579
630
|
kind: inferBinaryAssetKind(abs) === "font" ? "font" : "image",
|
|
580
631
|
savedPath
|
|
581
632
|
});
|
|
633
|
+
if (added)
|
|
634
|
+
progress(`[asset:${inferBinaryAssetKind(abs)}] ${savedPath}`);
|
|
582
635
|
}
|
|
583
636
|
if (strictClean) {
|
|
584
637
|
strictCleanDom($);
|
|
@@ -592,6 +645,8 @@ export async function extractFrontend(options) {
|
|
|
592
645
|
}
|
|
593
646
|
await saveTextFile(pagePaths.htmlPath, await formatMaybe(finalHtml, "html"));
|
|
594
647
|
await saveTextFile(pagePaths.cssPath, await formatMaybe(rewrittenCss, "css"));
|
|
648
|
+
progress(`[write] ${path.relative(outDir, pagePaths.htmlPath)}`);
|
|
649
|
+
progress(`[write] ${path.relative(outDir, pagePaths.cssPath)}`);
|
|
595
650
|
pages.push({
|
|
596
651
|
url: current.url,
|
|
597
652
|
htmlPath: path.relative(outDir, pagePaths.htmlPath),
|
|
@@ -627,6 +682,7 @@ export async function extractFrontend(options) {
|
|
|
627
682
|
verification,
|
|
628
683
|
warnings
|
|
629
684
|
}, null, 2));
|
|
685
|
+
progress("[done] Wrote manifest.json");
|
|
630
686
|
const rootPagePaths = getPagePaths(normalizedRootUrl, outDir);
|
|
631
687
|
return {
|
|
632
688
|
url: options.url,
|