cleanscrape 0.1.0 → 0.1.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/cli.js +3 -0
- package/dist/extractor.js +29 -9
- package/package.json +1 -1
package/dist/cli.js
CHANGED
|
@@ -133,6 +133,7 @@ async function runExtraction(url, opts) {
|
|
|
133
133
|
mode,
|
|
134
134
|
everything: opts.everything !== false,
|
|
135
135
|
strictClean: Boolean(opts.strictClean),
|
|
136
|
+
onProgress: opts.quiet ? undefined : (message) => console.log(message),
|
|
136
137
|
timeoutMs,
|
|
137
138
|
crawlDepth,
|
|
138
139
|
maxPages,
|
|
@@ -237,6 +238,7 @@ if (isScrapify) {
|
|
|
237
238
|
.option("--everything", "Capture all discoverable assets/code and keep scripts")
|
|
238
239
|
.option("--no-everything", "Disable full capture mode")
|
|
239
240
|
.option("--strict-clean", "Aggressively strip noisy attributes/comments for cleaner manual edits")
|
|
241
|
+
.option("--quiet", "Hide per-file streaming logs")
|
|
240
242
|
.option("--save-default", "Save this run's URL/options as defaults")
|
|
241
243
|
.option("-t, --timeout <ms>", "Timeout in milliseconds", defaultOptions.timeout)
|
|
242
244
|
.option("-d, --depth <n>", "Internal link crawl depth", defaultOptions.depth)
|
|
@@ -272,6 +274,7 @@ if (isScrapify) {
|
|
|
272
274
|
.option("--everything", "Capture all discoverable assets/code and keep scripts")
|
|
273
275
|
.option("--no-everything", "Disable full capture mode")
|
|
274
276
|
.option("--strict-clean", "Aggressively strip noisy attributes/comments for cleaner manual edits")
|
|
277
|
+
.option("--quiet", "Hide per-file streaming logs")
|
|
275
278
|
.option("--save-default", "Save this run's URL/options as defaults")
|
|
276
279
|
.option("-t, --timeout <ms>", "Timeout in milliseconds", defaultOptions.timeout)
|
|
277
280
|
.option("-d, --depth <n>", "Internal link crawl depth", defaultOptions.depth)
|
package/dist/extractor.js
CHANGED
|
@@ -186,9 +186,10 @@ async function downloadBinaryAsset(assetUrl, outDir, timeoutMs, userAgent, asset
|
|
|
186
186
|
function maybeAddAsset(assets, seen, record) {
|
|
187
187
|
const key = `${record.kind}:${record.savedPath}`;
|
|
188
188
|
if (seen.has(key))
|
|
189
|
-
return;
|
|
189
|
+
return false;
|
|
190
190
|
seen.add(key);
|
|
191
191
|
assets.push(record);
|
|
192
|
+
return true;
|
|
192
193
|
}
|
|
193
194
|
function isSameOriginHtmlLink(candidateUrl, rootOrigin) {
|
|
194
195
|
let parsed;
|
|
@@ -236,7 +237,7 @@ function findRemoteUrls(content) {
|
|
|
236
237
|
const matches = content.match(/(?:https?:)?\/\/[^\s"'()<>]+/g) || [];
|
|
237
238
|
return matches.map((u) => u.trim()).filter(Boolean);
|
|
238
239
|
}
|
|
239
|
-
async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings) {
|
|
240
|
+
async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, onProgress) {
|
|
240
241
|
const value = $(el).attr(attr);
|
|
241
242
|
if (!value)
|
|
242
243
|
return;
|
|
@@ -261,13 +262,19 @@ async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir,
|
|
|
261
262
|
const localPath = path.join(outDir, savedPath);
|
|
262
263
|
const rel = toRelativeWebPath(pageDir, localPath);
|
|
263
264
|
$(el).attr(attr, rel);
|
|
264
|
-
maybeAddAsset(assets, seenAssetRecords, {
|
|
265
|
+
const added = maybeAddAsset(assets, seenAssetRecords, {
|
|
265
266
|
url: absolute,
|
|
266
267
|
kind: inferAssetKind(absolute),
|
|
267
268
|
savedPath
|
|
268
269
|
});
|
|
270
|
+
if (added && onProgress)
|
|
271
|
+
onProgress(`[asset:${inferAssetKind(absolute)}] ${savedPath}`);
|
|
269
272
|
}
|
|
270
273
|
export async function extractFrontend(options) {
|
|
274
|
+
const progress = (msg) => {
|
|
275
|
+
if (options.onProgress)
|
|
276
|
+
options.onProgress(msg);
|
|
277
|
+
};
|
|
271
278
|
const everything = options.everything === true;
|
|
272
279
|
const strictClean = options.strictClean === true;
|
|
273
280
|
const outDir = path.resolve(options.outDir);
|
|
@@ -292,6 +299,7 @@ export async function extractFrontend(options) {
|
|
|
292
299
|
if (visited.has(current.url))
|
|
293
300
|
continue;
|
|
294
301
|
visited.add(current.url);
|
|
302
|
+
progress(`[page] Fetching ${current.url}`);
|
|
295
303
|
let htmlRes;
|
|
296
304
|
try {
|
|
297
305
|
htmlRes = await fetchWithTimeout(current.url, options.timeoutMs, options.userAgent);
|
|
@@ -311,6 +319,7 @@ export async function extractFrontend(options) {
|
|
|
311
319
|
}
|
|
312
320
|
const domHtml = await htmlRes.text();
|
|
313
321
|
const $ = load(domHtml);
|
|
322
|
+
progress(`[page] Parsing ${current.url}`);
|
|
314
323
|
if (current.depth < options.crawlDepth) {
|
|
315
324
|
const discovered = new Set();
|
|
316
325
|
$("a[href]").each((_, el) => {
|
|
@@ -415,11 +424,13 @@ export async function extractFrontend(options) {
|
|
|
415
424
|
const localPath = path.join(outDir, savedPath);
|
|
416
425
|
const rel = toRelativeWebPath(pageDir, localPath);
|
|
417
426
|
$(el).attr(attr, rel);
|
|
418
|
-
maybeAddAsset(assets, seenAssetRecords, {
|
|
427
|
+
const added = maybeAddAsset(assets, seenAssetRecords, {
|
|
419
428
|
url: absolute,
|
|
420
429
|
kind: inferBinaryAssetKind(absolute) === "font" ? "font" : "image",
|
|
421
430
|
savedPath
|
|
422
431
|
});
|
|
432
|
+
if (added)
|
|
433
|
+
progress(`[asset:image] ${savedPath}`);
|
|
423
434
|
}
|
|
424
435
|
catch (error) {
|
|
425
436
|
warnings.push(`Failed to fetch media: ${absolute} (${String(error)})`);
|
|
@@ -463,11 +474,13 @@ export async function extractFrontend(options) {
|
|
|
463
474
|
const localPath = path.join(outDir, savedPath);
|
|
464
475
|
const rel = toRelativeWebPath(pageDir, localPath);
|
|
465
476
|
$(el).attr("src", rel);
|
|
466
|
-
maybeAddAsset(assets, seenAssetRecords, {
|
|
477
|
+
const added = maybeAddAsset(assets, seenAssetRecords, {
|
|
467
478
|
url: absolute,
|
|
468
479
|
kind: "script",
|
|
469
480
|
savedPath
|
|
470
481
|
});
|
|
482
|
+
if (added)
|
|
483
|
+
progress(`[asset:script] ${savedPath}`);
|
|
471
484
|
}
|
|
472
485
|
catch (error) {
|
|
473
486
|
warnings.push(`Failed to fetch script: ${absolute} (${String(error)})`);
|
|
@@ -487,11 +500,13 @@ export async function extractFrontend(options) {
|
|
|
487
500
|
const rel = toRelativeWebPath(pageDir, targetPath);
|
|
488
501
|
$(el).attr("src", rel);
|
|
489
502
|
$(el).text("");
|
|
490
|
-
maybeAddAsset(assets, seenAssetRecords, {
|
|
503
|
+
const added = maybeAddAsset(assets, seenAssetRecords, {
|
|
491
504
|
url: `${current.url}#inline-script-${inlineIndex}`,
|
|
492
505
|
kind: "script",
|
|
493
506
|
savedPath
|
|
494
507
|
});
|
|
508
|
+
if (added)
|
|
509
|
+
progress(`[asset:script] ${savedPath}`);
|
|
495
510
|
}
|
|
496
511
|
}
|
|
497
512
|
}
|
|
@@ -538,7 +553,7 @@ export async function extractFrontend(options) {
|
|
|
538
553
|
if (selector === "link[href]" && $(el).attr("rel")?.toLowerCase() === "stylesheet") {
|
|
539
554
|
continue;
|
|
540
555
|
}
|
|
541
|
-
await downloadAndRelinkAttribute($, el, attr, current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings);
|
|
556
|
+
await downloadAndRelinkAttribute($, el, attr, current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, progress);
|
|
542
557
|
}
|
|
543
558
|
}
|
|
544
559
|
const downloadableAnchors = $("a[href]").toArray();
|
|
@@ -556,7 +571,7 @@ export async function extractFrontend(options) {
|
|
|
556
571
|
const isInternalHtml = normalized ? isSameOriginHtmlLink(normalized, rootOrigin) : false;
|
|
557
572
|
if (isInternalHtml)
|
|
558
573
|
continue;
|
|
559
|
-
await downloadAndRelinkAttribute($, el, "href", current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings);
|
|
574
|
+
await downloadAndRelinkAttribute($, el, "href", current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, progress);
|
|
560
575
|
}
|
|
561
576
|
}
|
|
562
577
|
const cssMerged = cssBlocks.join("\n\n");
|
|
@@ -574,11 +589,13 @@ export async function extractFrontend(options) {
|
|
|
574
589
|
const localPath = path.join(outDir, savedPath);
|
|
575
590
|
const rel = toRelativeWebPath(cssDir, localPath);
|
|
576
591
|
rewrittenCss = rewrittenCss.split(cssUrl).join(rel);
|
|
577
|
-
maybeAddAsset(assets, seenAssetRecords, {
|
|
592
|
+
const added = maybeAddAsset(assets, seenAssetRecords, {
|
|
578
593
|
url: abs,
|
|
579
594
|
kind: inferBinaryAssetKind(abs) === "font" ? "font" : "image",
|
|
580
595
|
savedPath
|
|
581
596
|
});
|
|
597
|
+
if (added)
|
|
598
|
+
progress(`[asset:${inferBinaryAssetKind(abs)}] ${savedPath}`);
|
|
582
599
|
}
|
|
583
600
|
if (strictClean) {
|
|
584
601
|
strictCleanDom($);
|
|
@@ -592,6 +609,8 @@ export async function extractFrontend(options) {
|
|
|
592
609
|
}
|
|
593
610
|
await saveTextFile(pagePaths.htmlPath, await formatMaybe(finalHtml, "html"));
|
|
594
611
|
await saveTextFile(pagePaths.cssPath, await formatMaybe(rewrittenCss, "css"));
|
|
612
|
+
progress(`[write] ${path.relative(outDir, pagePaths.htmlPath)}`);
|
|
613
|
+
progress(`[write] ${path.relative(outDir, pagePaths.cssPath)}`);
|
|
595
614
|
pages.push({
|
|
596
615
|
url: current.url,
|
|
597
616
|
htmlPath: path.relative(outDir, pagePaths.htmlPath),
|
|
@@ -627,6 +646,7 @@ export async function extractFrontend(options) {
|
|
|
627
646
|
verification,
|
|
628
647
|
warnings
|
|
629
648
|
}, null, 2));
|
|
649
|
+
progress("[done] Wrote manifest.json");
|
|
630
650
|
const rootPagePaths = getPagePaths(normalizedRootUrl, outDir);
|
|
631
651
|
return {
|
|
632
652
|
url: options.url,
|