cleanscrape 0.1.1 → 0.1.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/dist/extractor.js +43 -7
  2. package/package.json +1 -1
package/dist/extractor.js CHANGED
@@ -208,6 +208,33 @@ function isSameOriginHtmlLink(candidateUrl, rootOrigin) {
208
208
  return false;
209
209
  return true;
210
210
  }
211
/**
 * Reduce a hostname to the "base" domain used when comparing asset hosts.
 *
 * The result is lower-cased and built from the last two dot-separated labels,
 * e.g. `cdn.assets.example.com` -> `example.com`. Empty labels (such as the
 * one produced by a trailing dot in `example.com.`) are dropped so that
 * equivalent spellings of a host reduce to the same value.
 *
 * IPv4 literals are returned whole: their octets are not hierarchical labels,
 * and keeping only the last two would make unrelated addresses compare equal.
 *
 * NOTE: this is a two-label heuristic, not a Public Suffix List lookup, so
 * hosts under multi-label suffixes (`a.co.uk`, `b.co.uk`) still reduce to the
 * shared suffix (`co.uk`).
 *
 * @param {string} hostname - Hostname to reduce (any letter case).
 * @returns {string} Lower-cased base domain, or the full address for IPv4.
 */
function getBaseDomain(hostname) {
  const labels = hostname.toLowerCase().split(".").filter(Boolean);
  // Four all-numeric labels form an IPv4 address, not a domain hierarchy.
  const isIpv4 = labels.length === 4 && labels.every((label) => /^\d{1,3}$/.test(label));
  if (isIpv4) {
    return labels.join(".");
  }
  // slice(-2) keeps short hosts ("localhost", "example.com") intact.
  return labels.slice(-2).join(".");
}
217
/**
 * Decide whether an asset URL lives on a host related to the crawl root.
 *
 * A host is accepted when, compared case-insensitively, it is the root host
 * itself, a subdomain of the root host, or reduces to the same base domain as
 * the root host according to getBaseDomain().
 *
 * @param {string} assetUrl - Absolute URL of the candidate asset.
 * @param {string} rootHost - Hostname of the site being extracted.
 * @returns {boolean} True when the asset host is accepted; false otherwise,
 *   including when the URL cannot be parsed or any step throws.
 */
function isAllowedAssetHost(assetUrl, rootHost) {
  try {
    const assetHost = new URL(assetUrl).hostname.toLowerCase();
    const siteHost = rootHost.toLowerCase();
    if (assetHost === siteHost) {
      return true;
    }
    if (assetHost.endsWith(`.${siteHost}`)) {
      return true;
    }
    // Sibling subdomains (cdn.example.com vs www.example.com) match here.
    return getBaseDomain(assetHost) === getBaseDomain(siteHost);
  }
  catch {
    return false;
  }
}
229
/**
 * Heuristically decide whether a URL addresses a file, judged by whether the
 * last segment of its path carries a file extension.
 *
 * Only the URL's pathname is inspected; the query string and fragment are
 * ignored. Paths ending in "/" are treated as directories, and a bare
 * trailing dot ("/file.") is not counted as an extension.
 *
 * @param {string} urlStr - Absolute URL to inspect.
 * @returns {boolean} True when the path ends in a segment with an extension;
 *   false otherwise, including when `urlStr` cannot be parsed as a URL.
 */
function looksLikeFileUrl(urlStr) {
  try {
    const { pathname } = new URL(urlStr);
    // extname() skips trailing separators, so "/v1.2/" would otherwise
    // report ".2" and a directory URL would be mistaken for a file.
    if (pathname.endsWith("/")) {
      return false;
    }
    // URL paths always use "/", so use the POSIX flavour on every platform.
    // Require more than the dot itself: extname("/file.") is ".".
    return path.posix.extname(pathname.toLowerCase()).length > 1;
  }
  catch {
    return false;
  }
}
211
238
  function toRelativeWebPath(fromDir, toPath) {
212
239
  const rel = path.relative(fromDir, toPath).split(path.sep).join("/");
213
240
  if (rel === "")
@@ -237,7 +264,7 @@ function findRemoteUrls(content) {
237
264
  const matches = content.match(/(?:https?:)?\/\/[^\s"'()<>]+/g) || [];
238
265
  return matches.map((u) => u.trim()).filter(Boolean);
239
266
  }
240
- async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, onProgress) {
267
+ async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, onProgress, rootHost) {
241
268
  const value = $(el).attr(attr);
242
269
  if (!value)
243
270
  return;
@@ -252,8 +279,12 @@ async function downloadAndRelinkAttribute($, el, attr, baseUrl, pageDir, outDir,
252
279
  const absolute = normalizeUrl(value, baseUrl);
253
280
  if (!absolute)
254
281
  return;
282
+ if (!looksLikeFileUrl(absolute))
283
+ return;
255
284
  if (isTrackerUrl(absolute))
256
285
  return;
286
+ if (rootHost && !isAllowedAssetHost(absolute, rootHost))
287
+ return;
257
288
  const savedPath = await downloadBinaryAsset(absolute, outDir, options.timeoutMs, options.userAgent, assetMap);
258
289
  if (!savedPath) {
259
290
  warnings.push(`Failed to fetch asset: ${absolute}`);
@@ -284,6 +315,7 @@ export async function extractFrontend(options) {
284
315
  throw new Error(`Invalid URL: ${options.url}`);
285
316
  }
286
317
  const rootOrigin = new URL(normalizedRootUrl).origin;
318
+ const rootHost = new URL(normalizedRootUrl).hostname;
287
319
  const queue = [{ url: normalizedRootUrl, depth: 0 }];
288
320
  const visited = new Set();
289
321
  const assets = [];
@@ -540,7 +572,10 @@ export async function extractFrontend(options) {
540
572
  { selector: "embed[src]", attr: "src" },
541
573
  { selector: "object[data]", attr: "data" },
542
574
  { selector: "input[src]", attr: "src" },
543
- { selector: "link[href]", attr: "href" },
575
+ { selector: "link[rel~='icon'][href]", attr: "href" },
576
+ { selector: "link[rel='manifest'][href]", attr: "href" },
577
+ { selector: "link[rel='preload'][href]", attr: "href" },
578
+ { selector: "link[rel='prefetch'][href]", attr: "href" },
544
579
  { selector: "source[src]", attr: "src" },
545
580
  { selector: "image[href]", attr: "href" },
546
581
  { selector: "image[xlink\\:href]", attr: "xlink:href" },
@@ -550,10 +585,7 @@ export async function extractFrontend(options) {
550
585
  for (const { selector, attr } of attrSelectors) {
551
586
  const nodes = $(selector).toArray();
552
587
  for (const el of nodes) {
553
- if (selector === "link[href]" && $(el).attr("rel")?.toLowerCase() === "stylesheet") {
554
- continue;
555
- }
556
- await downloadAndRelinkAttribute($, el, attr, current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, progress);
588
+ await downloadAndRelinkAttribute($, el, attr, current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, progress, rootHost);
557
589
  }
558
590
  }
559
591
  const downloadableAnchors = $("a[href]").toArray();
@@ -567,11 +599,15 @@ export async function extractFrontend(options) {
567
599
  const absolute = normalizeUrl(href, current.url);
568
600
  if (!absolute)
569
601
  continue;
602
+ if (!looksLikeFileUrl(absolute))
603
+ continue;
604
+ if (!isAllowedAssetHost(absolute, rootHost))
605
+ continue;
570
606
  const normalized = normalizePageUrl(absolute);
571
607
  const isInternalHtml = normalized ? isSameOriginHtmlLink(normalized, rootOrigin) : false;
572
608
  if (isInternalHtml)
573
609
  continue;
574
- await downloadAndRelinkAttribute($, el, "href", current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, progress);
610
+ await downloadAndRelinkAttribute($, el, "href", current.url, pageDir, outDir, options, assetMap, assets, seenAssetRecords, warnings, progress, rootHost);
575
611
  }
576
612
  }
577
613
  const cssMerged = cssBlocks.join("\n\n");
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "cleanscrape",
3
- "version": "0.1.1",
3
+ "version": "0.1.2",
4
4
  "description": "Clean frontend extractor: convert live sites into editable HTML/CSS/assets",
5
5
  "type": "module",
6
6
  "bin": {