cleanscrape 0.1.0 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -133,6 +133,7 @@ async function runExtraction(url, opts) {
133
133
  mode,
134
134
  everything: opts.everything !== false,
135
135
  strictClean: Boolean(opts.strictClean),
136
+ onProgress: opts.quiet ? undefined : (message) => console.log(message),
136
137
  timeoutMs,
137
138
  crawlDepth,
138
139
  maxPages,
@@ -237,6 +238,7 @@ if (isScrapify) {
237
238
  .option("--everything", "Capture all discoverable assets/code and keep scripts")
238
239
  .option("--no-everything", "Disable full capture mode")
239
240
  .option("--strict-clean", "Aggressively strip noisy attributes/comments for cleaner manual edits")
241
+ .option("--quiet", "Hide per-file streaming logs")
240
242
  .option("--save-default", "Save this run's URL/options as defaults")
241
243
  .option("-t, --timeout <ms>", "Timeout in milliseconds", defaultOptions.timeout)
242
244
  .option("-d, --depth <n>", "Internal link crawl depth", defaultOptions.depth)
@@ -272,6 +274,7 @@ if (isScrapify) {
272
274
  .option("--everything", "Capture all discoverable assets/code and keep scripts")
273
275
  .option("--no-everything", "Disable full capture mode")
274
276
  .option("--strict-clean", "Aggressively strip noisy attributes/comments for cleaner manual edits")
277
+ .option("--quiet", "Hide per-file streaming logs")
275
278
  .option("--save-default", "Save this run's URL/options as defaults")
276
279
  .option("-t, --timeout <ms>", "Timeout in milliseconds", defaultOptions.timeout)
277
280
  .option("-d, --depth <n>", "Internal link crawl depth", defaultOptions.depth)
package/dist/extractor.js CHANGED
@@ -186,9 +186,10 @@ async function downloadBinaryAsset(assetUrl, outDir, timeoutMs, userAgent, asset
186
186
  function maybeAddAsset(assets, seen, record) {
187
187
  const key = `${record.kind}:${record.savedPath}`;
188
188
  if (seen.has(key))
189
- return;
189
+ return false;
190
190
  seen.add(key);
191
191
  assets.push(record);
192
+ return true;
192
193
  }
193
194
  function isSameOriginHtmlLink(candidateUrl, rootOrigin) {
194
195
  let parsed;
@@ -207,6 +208,33 @@ function isSameOriginHtmlLink(candidateUrl, rootOrigin) {
207
208
  return false;
208
209
  return true;
209
210
  }
211
+ function getBaseDomain(hostname) {
212
+ const parts = hostname.toLowerCase().split(".").filter(Boolean);
213
+ if (parts.length <= 2)
214
+ return hostname.toLowerCase();
215
+ return `${parts[parts.length - 2]}.${parts[parts.length - 1]}`;
216
+ }
217
+ function isAllowedAssetHost(assetUrl, rootHost) {
218
+ try {
219
+ const host = new URL(assetUrl).hostname.toLowerCase();
220
+ const root = rootHost.toLowerCase();
221
+ const rootBase = getBaseDomain(root);
222
+ const hostBase = getBaseDomain(host);
223
+ return host === root || host.endsWith(`.${root}`) || hostBase === rootBase;
224
+ }
225
+ catch {
226
+ return false;
227
+ }
228
+ }
229
+ function looksLikeFileUrl(urlStr) {
230
+ try {
231
+ const u = new URL(urlStr);
232
+ return path.extname(u.pathname.toLowerCase()).length > 0;
233
+ }
234
+ catch {
235
+ return false;
236
+ }
237
+ }
210
238
  function toRelativeWebPath(fromDir, toPath) {
211
239
  const rel = path.relative(fromDir, toPath).split(path.sep).join("/");
212
240
  if (rel === "")
@@ -236,7 +264,7 @@ function findRemoteUrls(content) {
236
264
  const matches = content.match(/(?:https?:)?\/\/[^\s"'()<>]+/g) || [];
237
265
  return matches.map((u) => u.trim()).filter(Boolean);
238
266
  }
239
- async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings) {
267
+ async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, onProgress, rootHost) {
240
268
  const value = $(el).attr(attr);
241
269
  if (!value)
242
270
  return;
@@ -251,8 +279,12 @@ async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir,
251
279
  const absolute = normalizeUrl(value, baseUrl);
252
280
  if (!absolute)
253
281
  return;
282
+ if (!looksLikeFileUrl(absolute))
283
+ return;
254
284
  if (isTrackerUrl(absolute))
255
285
  return;
286
+ if (rootHost && !isAllowedAssetHost(absolute, rootHost))
287
+ return;
256
288
  const savedPath = await downloadBinaryAsset(absolute, outDir, options.timeoutMs, options.userAgent, assetMap);
257
289
  if (!savedPath) {
258
290
  warnings.push(`Failed to fetch asset: ${absolute}`);
@@ -261,13 +293,19 @@ async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir,
261
293
  const localPath = path.join(outDir, savedPath);
262
294
  const rel = toRelativeWebPath(pageDir, localPath);
263
295
  $(el).attr(attr, rel);
264
- maybeAddAsset(assets, seenAssetRecords, {
296
+ const added = maybeAddAsset(assets, seenAssetRecords, {
265
297
  url: absolute,
266
298
  kind: inferAssetKind(absolute),
267
299
  savedPath
268
300
  });
301
+ if (added && onProgress)
302
+ onProgress(`[asset:${inferAssetKind(absolute)}] ${savedPath}`);
269
303
  }
270
304
  export async function extractFrontend(options) {
305
+ const progress = (msg) => {
306
+ if (options.onProgress)
307
+ options.onProgress(msg);
308
+ };
271
309
  const everything = options.everything === true;
272
310
  const strictClean = options.strictClean === true;
273
311
  const outDir = path.resolve(options.outDir);
@@ -277,6 +315,7 @@ export async function extractFrontend(options) {
277
315
  throw new Error(`Invalid URL: ${options.url}`);
278
316
  }
279
317
  const rootOrigin = new URL(normalizedRootUrl).origin;
318
+ const rootHost = new URL(normalizedRootUrl).hostname;
280
319
  const queue = [{ url: normalizedRootUrl, depth: 0 }];
281
320
  const visited = new Set();
282
321
  const assets = [];
@@ -292,6 +331,7 @@ export async function extractFrontend(options) {
292
331
  if (visited.has(current.url))
293
332
  continue;
294
333
  visited.add(current.url);
334
+ progress(`[page] Fetching ${current.url}`);
295
335
  let htmlRes;
296
336
  try {
297
337
  htmlRes = await fetchWithTimeout(current.url, options.timeoutMs, options.userAgent);
@@ -311,6 +351,7 @@ export async function extractFrontend(options) {
311
351
  }
312
352
  const domHtml = await htmlRes.text();
313
353
  const $ = load(domHtml);
354
+ progress(`[page] Parsing ${current.url}`);
314
355
  if (current.depth < options.crawlDepth) {
315
356
  const discovered = new Set();
316
357
  $("a[href]").each((_, el) => {
@@ -415,11 +456,13 @@ export async function extractFrontend(options) {
415
456
  const localPath = path.join(outDir, savedPath);
416
457
  const rel = toRelativeWebPath(pageDir, localPath);
417
458
  $(el).attr(attr, rel);
418
- maybeAddAsset(assets, seenAssetRecords, {
459
+ const added = maybeAddAsset(assets, seenAssetRecords, {
419
460
  url: absolute,
420
461
  kind: inferBinaryAssetKind(absolute) === "font" ? "font" : "image",
421
462
  savedPath
422
463
  });
464
+ if (added)
465
+ progress(`[asset:image] ${savedPath}`);
423
466
  }
424
467
  catch (error) {
425
468
  warnings.push(`Failed to fetch media: ${absolute} (${String(error)})`);
@@ -463,11 +506,13 @@ export async function extractFrontend(options) {
463
506
  const localPath = path.join(outDir, savedPath);
464
507
  const rel = toRelativeWebPath(pageDir, localPath);
465
508
  $(el).attr("src", rel);
466
- maybeAddAsset(assets, seenAssetRecords, {
509
+ const added = maybeAddAsset(assets, seenAssetRecords, {
467
510
  url: absolute,
468
511
  kind: "script",
469
512
  savedPath
470
513
  });
514
+ if (added)
515
+ progress(`[asset:script] ${savedPath}`);
471
516
  }
472
517
  catch (error) {
473
518
  warnings.push(`Failed to fetch script: ${absolute} (${String(error)})`);
@@ -487,11 +532,13 @@ export async function extractFrontend(options) {
487
532
  const rel = toRelativeWebPath(pageDir, targetPath);
488
533
  $(el).attr("src", rel);
489
534
  $(el).text("");
490
- maybeAddAsset(assets, seenAssetRecords, {
535
+ const added = maybeAddAsset(assets, seenAssetRecords, {
491
536
  url: `${current.url}#inline-script-${inlineIndex}`,
492
537
  kind: "script",
493
538
  savedPath
494
539
  });
540
+ if (added)
541
+ progress(`[asset:script] ${savedPath}`);
495
542
  }
496
543
  }
497
544
  }
@@ -525,7 +572,10 @@ export async function extractFrontend(options) {
525
572
  { selector: "embed[src]", attr: "src" },
526
573
  { selector: "object[data]", attr: "data" },
527
574
  { selector: "input[src]", attr: "src" },
528
- { selector: "link[href]", attr: "href" },
575
+ { selector: "link[rel~='icon'][href]", attr: "href" },
576
+ { selector: "link[rel='manifest'][href]", attr: "href" },
577
+ { selector: "link[rel='preload'][href]", attr: "href" },
578
+ { selector: "link[rel='prefetch'][href]", attr: "href" },
529
579
  { selector: "source[src]", attr: "src" },
530
580
  { selector: "image[href]", attr: "href" },
531
581
  { selector: "image[xlink\\:href]", attr: "xlink:href" },
@@ -535,10 +585,7 @@ export async function extractFrontend(options) {
535
585
  for (const { selector, attr } of attrSelectors) {
536
586
  const nodes = $(selector).toArray();
537
587
  for (const el of nodes) {
538
- if (selector === "link[href]" && $(el).attr("rel")?.toLowerCase() === "stylesheet") {
539
- continue;
540
- }
541
- await downloadAndRelinkAttribute($, el, attr, current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings);
588
+ await downloadAndRelinkAttribute($, el, attr, current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, progress, rootHost);
542
589
  }
543
590
  }
544
591
  const downloadableAnchors = $("a[href]").toArray();
@@ -552,11 +599,15 @@ export async function extractFrontend(options) {
552
599
  const absolute = normalizeUrl(href, current.url);
553
600
  if (!absolute)
554
601
  continue;
602
+ if (!looksLikeFileUrl(absolute))
603
+ continue;
604
+ if (!isAllowedAssetHost(absolute, rootHost))
605
+ continue;
555
606
  const normalized = normalizePageUrl(absolute);
556
607
  const isInternalHtml = normalized ? isSameOriginHtmlLink(normalized, rootOrigin) : false;
557
608
  if (isInternalHtml)
558
609
  continue;
559
- await downloadAndRelinkAttribute($, el, "href", current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings);
610
+ await downloadAndRelinkAttribute($, el, "href", current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, progress, rootHost);
560
611
  }
561
612
  }
562
613
  const cssMerged = cssBlocks.join("\n\n");
@@ -574,11 +625,13 @@ export async function extractFrontend(options) {
574
625
  const localPath = path.join(outDir, savedPath);
575
626
  const rel = toRelativeWebPath(cssDir, localPath);
576
627
  rewrittenCss = rewrittenCss.split(cssUrl).join(rel);
577
- maybeAddAsset(assets, seenAssetRecords, {
628
+ const added = maybeAddAsset(assets, seenAssetRecords, {
578
629
  url: abs,
579
630
  kind: inferBinaryAssetKind(abs) === "font" ? "font" : "image",
580
631
  savedPath
581
632
  });
633
+ if (added)
634
+ progress(`[asset:${inferBinaryAssetKind(abs)}] ${savedPath}`);
582
635
  }
583
636
  if (strictClean) {
584
637
  strictCleanDom($);
@@ -592,6 +645,8 @@ export async function extractFrontend(options) {
592
645
  }
593
646
  await saveTextFile(pagePaths.htmlPath, await formatMaybe(finalHtml, "html"));
594
647
  await saveTextFile(pagePaths.cssPath, await formatMaybe(rewrittenCss, "css"));
648
+ progress(`[write] ${path.relative(outDir, pagePaths.htmlPath)}`);
649
+ progress(`[write] ${path.relative(outDir, pagePaths.cssPath)}`);
595
650
  pages.push({
596
651
  url: current.url,
597
652
  htmlPath: path.relative(outDir, pagePaths.htmlPath),
@@ -627,6 +682,7 @@ export async function extractFrontend(options) {
627
682
  verification,
628
683
  warnings
629
684
  }, null, 2));
685
+ progress("[done] Wrote manifest.json");
630
686
  const rootPagePaths = getPagePaths(normalizedRootUrl, outDir);
631
687
  return {
632
688
  url: options.url,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "cleanscrape",
3
- "version": "0.1.0",
3
+ "version": "0.1.2",
4
4
  "description": "Clean frontend extractor: convert live sites into editable HTML/CSS/assets",
5
5
  "type": "module",
6
6
  "bin": {