cleanscrape 0.1.0 → 0.1.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -133,6 +133,7 @@ async function runExtraction(url, opts) {
133
133
  mode,
134
134
  everything: opts.everything !== false,
135
135
  strictClean: Boolean(opts.strictClean),
136
+ onProgress: opts.quiet ? undefined : (message) => console.log(message),
136
137
  timeoutMs,
137
138
  crawlDepth,
138
139
  maxPages,
@@ -237,6 +238,7 @@ if (isScrapify) {
237
238
  .option("--everything", "Capture all discoverable assets/code and keep scripts")
238
239
  .option("--no-everything", "Disable full capture mode")
239
240
  .option("--strict-clean", "Aggressively strip noisy attributes/comments for cleaner manual edits")
241
+ .option("--quiet", "Hide per-file streaming logs")
240
242
  .option("--save-default", "Save this run's URL/options as defaults")
241
243
  .option("-t, --timeout <ms>", "Timeout in milliseconds", defaultOptions.timeout)
242
244
  .option("-d, --depth <n>", "Internal link crawl depth", defaultOptions.depth)
@@ -272,6 +274,7 @@ if (isScrapify) {
272
274
  .option("--everything", "Capture all discoverable assets/code and keep scripts")
273
275
  .option("--no-everything", "Disable full capture mode")
274
276
  .option("--strict-clean", "Aggressively strip noisy attributes/comments for cleaner manual edits")
277
+ .option("--quiet", "Hide per-file streaming logs")
275
278
  .option("--save-default", "Save this run's URL/options as defaults")
276
279
  .option("-t, --timeout <ms>", "Timeout in milliseconds", defaultOptions.timeout)
277
280
  .option("-d, --depth <n>", "Internal link crawl depth", defaultOptions.depth)
package/dist/extractor.js CHANGED
@@ -186,9 +186,10 @@ async function downloadBinaryAsset(assetUrl, outDir, timeoutMs, userAgent, asset
186
186
  function maybeAddAsset(assets, seen, record) {
187
187
  const key = `${record.kind}:${record.savedPath}`;
188
188
  if (seen.has(key))
189
- return;
189
+ return false;
190
190
  seen.add(key);
191
191
  assets.push(record);
192
+ return true;
192
193
  }
193
194
  function isSameOriginHtmlLink(candidateUrl, rootOrigin) {
194
195
  let parsed;
@@ -236,7 +237,7 @@ function findRemoteUrls(content) {
236
237
  const matches = content.match(/(?:https?:)?\/\/[^\s"'()<>]+/g) || [];
237
238
  return matches.map((u) => u.trim()).filter(Boolean);
238
239
  }
239
- async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings) {
240
+ async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, onProgress) {
240
241
  const value = $(el).attr(attr);
241
242
  if (!value)
242
243
  return;
@@ -261,13 +262,19 @@ async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir,
261
262
  const localPath = path.join(outDir, savedPath);
262
263
  const rel = toRelativeWebPath(pageDir, localPath);
263
264
  $(el).attr(attr, rel);
264
- maybeAddAsset(assets, seenAssetRecords, {
265
+ const added = maybeAddAsset(assets, seenAssetRecords, {
265
266
  url: absolute,
266
267
  kind: inferAssetKind(absolute),
267
268
  savedPath
268
269
  });
270
+ if (added && onProgress)
271
+ onProgress(`[asset:${inferAssetKind(absolute)}] ${savedPath}`);
269
272
  }
270
273
  export async function extractFrontend(options) {
274
+ const progress = (msg) => {
275
+ if (options.onProgress)
276
+ options.onProgress(msg);
277
+ };
271
278
  const everything = options.everything === true;
272
279
  const strictClean = options.strictClean === true;
273
280
  const outDir = path.resolve(options.outDir);
@@ -292,6 +299,7 @@ export async function extractFrontend(options) {
292
299
  if (visited.has(current.url))
293
300
  continue;
294
301
  visited.add(current.url);
302
+ progress(`[page] Fetching ${current.url}`);
295
303
  let htmlRes;
296
304
  try {
297
305
  htmlRes = await fetchWithTimeout(current.url, options.timeoutMs, options.userAgent);
@@ -311,6 +319,7 @@ export async function extractFrontend(options) {
311
319
  }
312
320
  const domHtml = await htmlRes.text();
313
321
  const $ = load(domHtml);
322
+ progress(`[page] Parsing ${current.url}`);
314
323
  if (current.depth < options.crawlDepth) {
315
324
  const discovered = new Set();
316
325
  $("a[href]").each((_, el) => {
@@ -415,11 +424,13 @@ export async function extractFrontend(options) {
415
424
  const localPath = path.join(outDir, savedPath);
416
425
  const rel = toRelativeWebPath(pageDir, localPath);
417
426
  $(el).attr(attr, rel);
418
- maybeAddAsset(assets, seenAssetRecords, {
427
+ const added = maybeAddAsset(assets, seenAssetRecords, {
419
428
  url: absolute,
420
429
  kind: inferBinaryAssetKind(absolute) === "font" ? "font" : "image",
421
430
  savedPath
422
431
  });
432
+ if (added)
433
+ progress(`[asset:image] ${savedPath}`);
423
434
  }
424
435
  catch (error) {
425
436
  warnings.push(`Failed to fetch media: ${absolute} (${String(error)})`);
@@ -463,11 +474,13 @@ export async function extractFrontend(options) {
463
474
  const localPath = path.join(outDir, savedPath);
464
475
  const rel = toRelativeWebPath(pageDir, localPath);
465
476
  $(el).attr("src", rel);
466
- maybeAddAsset(assets, seenAssetRecords, {
477
+ const added = maybeAddAsset(assets, seenAssetRecords, {
467
478
  url: absolute,
468
479
  kind: "script",
469
480
  savedPath
470
481
  });
482
+ if (added)
483
+ progress(`[asset:script] ${savedPath}`);
471
484
  }
472
485
  catch (error) {
473
486
  warnings.push(`Failed to fetch script: ${absolute} (${String(error)})`);
@@ -487,11 +500,13 @@ export async function extractFrontend(options) {
487
500
  const rel = toRelativeWebPath(pageDir, targetPath);
488
501
  $(el).attr("src", rel);
489
502
  $(el).text("");
490
- maybeAddAsset(assets, seenAssetRecords, {
503
+ const added = maybeAddAsset(assets, seenAssetRecords, {
491
504
  url: `${current.url}#inline-script-${inlineIndex}`,
492
505
  kind: "script",
493
506
  savedPath
494
507
  });
508
+ if (added)
509
+ progress(`[asset:script] ${savedPath}`);
495
510
  }
496
511
  }
497
512
  }
@@ -538,7 +553,7 @@ export async function extractFrontend(options) {
538
553
  if (selector === "link[href]" && $(el).attr("rel")?.toLowerCase() === "stylesheet") {
539
554
  continue;
540
555
  }
541
- await downloadAndRelinkAttribute($, el, attr, current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings);
556
+ await downloadAndRelinkAttribute($, el, attr, current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, progress);
542
557
  }
543
558
  }
544
559
  const downloadableAnchors = $("a[href]").toArray();
@@ -556,7 +571,7 @@ export async function extractFrontend(options) {
556
571
  const isInternalHtml = normalized ? isSameOriginHtmlLink(normalized, rootOrigin) : false;
557
572
  if (isInternalHtml)
558
573
  continue;
559
- await downloadAndRelinkAttribute($, el, "href", current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings);
574
+ await downloadAndRelinkAttribute($, el, "href", current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, progress);
560
575
  }
561
576
  }
562
577
  const cssMerged = cssBlocks.join("\n\n");
@@ -574,11 +589,13 @@ export async function extractFrontend(options) {
574
589
  const localPath = path.join(outDir, savedPath);
575
590
  const rel = toRelativeWebPath(cssDir, localPath);
576
591
  rewrittenCss = rewrittenCss.split(cssUrl).join(rel);
577
- maybeAddAsset(assets, seenAssetRecords, {
592
+ const added = maybeAddAsset(assets, seenAssetRecords, {
578
593
  url: abs,
579
594
  kind: inferBinaryAssetKind(abs) === "font" ? "font" : "image",
580
595
  savedPath
581
596
  });
597
+ if (added)
598
+ progress(`[asset:${inferBinaryAssetKind(abs)}] ${savedPath}`);
582
599
  }
583
600
  if (strictClean) {
584
601
  strictCleanDom($);
@@ -592,6 +609,8 @@ export async function extractFrontend(options) {
592
609
  }
593
610
  await saveTextFile(pagePaths.htmlPath, await formatMaybe(finalHtml, "html"));
594
611
  await saveTextFile(pagePaths.cssPath, await formatMaybe(rewrittenCss, "css"));
612
+ progress(`[write] ${path.relative(outDir, pagePaths.htmlPath)}`);
613
+ progress(`[write] ${path.relative(outDir, pagePaths.cssPath)}`);
595
614
  pages.push({
596
615
  url: current.url,
597
616
  htmlPath: path.relative(outDir, pagePaths.htmlPath),
@@ -627,6 +646,7 @@ export async function extractFrontend(options) {
627
646
  verification,
628
647
  warnings
629
648
  }, null, 2));
649
+ progress("[done] Wrote manifest.json");
630
650
  const rootPagePaths = getPagePaths(normalizedRootUrl, outDir);
631
651
  return {
632
652
  url: options.url,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "cleanscrape",
3
- "version": "0.1.0",
3
+ "version": "0.1.1",
4
4
  "description": "Clean frontend extractor: convert live sites into editable HTML/CSS/assets",
5
5
  "type": "module",
6
6
  "bin": {