guidelinescraper 1.0.13 → 1.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/crawl.mjs +51 -15
  2. package/package.json +1 -1
  3. package/purge-html.mjs +11 -15
package/crawl.mjs CHANGED
@@ -21,6 +21,8 @@ const { values, positionals } = parseArgs({
21
21
  hub: { type: "string", short: "h" },
22
22
  cookie: { type: "string", short: "c" },
23
23
  limit: { type: "string", short: "l" },
24
+ concurrency: { type: "string" },
25
+ force: { type: "boolean" },
24
26
  help: { type: "boolean" },
25
27
  },
26
28
  allowPositionals: true,
@@ -35,6 +37,8 @@ Options:
35
37
  -h, --hub <id> Hub ID (auto-detected if omitted)
36
38
  -c, --cookie <str> Cookie header for authenticated requests
37
39
  -l, --limit <n> Only crawl the first N pages
40
+ --concurrency <n> Max parallel browsers (default: 3)
41
+ --force Re-crawl pages even if output exists
38
42
  --help Show this help
39
43
 
40
44
  Environment variables (or .env file):
@@ -48,6 +52,8 @@ const inputUrl = values.url || positionals[0] || process.env.URL;
48
52
  const hubId = values.hub || process.env.HUB_ID;
49
53
  const cookie = values.cookie || process.env.COOKIE || "";
50
54
  const limit = values.limit ? Number(values.limit) : Infinity;
55
+ const concurrency = values.concurrency ? Number(values.concurrency) : 3;
56
+ const force = !!values.force;
51
57
 
52
58
  const OUTPUT_DIR = "output";
53
59
 
@@ -104,7 +110,12 @@ function collectPages(nodes, parentDir) {
104
110
 
105
111
  const domainDir = path.join(OUTPUT_DIR, sanitize(siteTree.domain));
106
112
  const allPages = collectPages(siteTree.children, path.join(domainDir, "pdf"));
107
- const pages = allPages.slice(0, limit);
113
+ const limitedPages = allPages.slice(0, limit);
114
+
115
+ const pages = force
116
+ ? limitedPages
117
+ : limitedPages.filter((p) => !fs.existsSync(p.pdfPath));
118
+ const skipped = limitedPages.length - pages.length;
108
119
 
109
120
  const urlToPdf = new Map(pages.map((p) => [p.url, p.pdfPath]));
110
121
  const totalPages = pages.length;
@@ -112,7 +123,11 @@ let completed = 0;
112
123
  let failed = 0;
113
124
  const crawlStart = Date.now();
114
125
 
115
- console.log(`Discovered ${totalPages} pages to crawl.\n`);
126
+ if (skipped > 0) {
127
+ console.log(`Resuming: ${skipped} already done, ${totalPages} remaining.\n`);
128
+ } else {
129
+ console.log(`Discovered ${totalPages} pages to crawl.\n`);
130
+ }
116
131
 
117
132
  for (const { pdfPath } of pages) {
118
133
  fs.mkdirSync(path.dirname(pdfPath), { recursive: true });
@@ -132,9 +147,11 @@ const crawler = new PlaywrightCrawler({
132
147
  launchContext: {
133
148
  launchOptions: { args: ["--disable-dev-shm-usage"] },
134
149
  },
135
- maxConcurrency: 8,
150
+ maxConcurrency: concurrency,
136
151
  maxRequestRetries: 2,
137
152
  navigationTimeoutSecs: 120,
153
+ useSessionPool: true,
154
+ persistCookiesPerSession: true,
138
155
 
139
156
  preNavigationHooks: [
140
157
  async ({ page, request }) => {
@@ -172,10 +189,25 @@ const crawler = new PlaywrightCrawler({
172
189
  return;
173
190
  }
174
191
 
192
+ // Random delay to reduce Cloudflare captcha triggers
193
+ await page.waitForTimeout(1000 + Math.random() * 2000);
194
+
175
195
  await page
176
196
  .waitForLoadState("networkidle", { timeout: 30_000 })
177
197
  .catch(() => {});
178
198
 
199
+ const isCaptcha = await page.evaluate(() => {
200
+ const text = document.body?.textContent || "";
201
+ return /confirm you are\s*human|security check|Cloudflare/i.test(text) ||
202
+ !!document.querySelector("#challenge-running, #challenge-stage, .cf-turnstile");
203
+ });
204
+ if (isCaptcha) {
205
+ log.error(`Cloudflare challenge detected on ${request.url} — skipping.`);
206
+ failed++;
207
+ completed++;
208
+ return;
209
+ }
210
+
179
211
  // Expand all collapsible / accordion content
180
212
  await page.evaluate(async () => {
181
213
  for (const el of document.querySelectorAll("details")) {
@@ -442,15 +474,19 @@ function formatDuration(ms) {
442
474
  return m > 0 ? `${m}m ${s}s` : `${s}s`;
443
475
  }
444
476
 
445
- const discoverEnd = Date.now();
446
- console.log(`\nCrawling ${totalPages} pages…\n`);
447
-
448
- await crawler.run(pages.map((p) => ({ url: p.url, uniqueKey: p.url })));
449
-
450
- const totalMs = Date.now() - crawlStart;
451
- console.log(`\n${"─".repeat(50)}`);
452
- console.log(`Done in ${formatDuration(totalMs)}`);
453
- console.log(
454
- ` ${completed - failed} saved, ${failed} failed, ${totalPages} total`,
455
- );
456
- console.log(` Output: ${domainDir}`);
477
+ if (totalPages === 0) {
478
+ console.log("Nothing to crawl — all pages already saved. Use --force to re-crawl.");
479
+ } else {
480
+ console.log(`\nCrawling ${totalPages} pages…\n`);
481
+
482
+ await crawler.run(pages.map((p) => ({ url: p.url, uniqueKey: p.url })));
483
+
484
+ const totalMs = Date.now() - crawlStart;
485
+ console.log(`\n${"─".repeat(50)}`);
486
+ console.log(`Done in ${formatDuration(totalMs)}`);
487
+ const parts = [`${completed - failed} saved`, `${failed} failed`];
488
+ if (skipped > 0) parts.push(`${skipped} skipped`);
489
+ parts.push(`${limitedPages.length} total`);
490
+ console.log(` ${parts.join(", ")}`);
491
+ console.log(` Output: ${domainDir}`);
492
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "guidelinescraper",
3
- "version": "1.0.13",
3
+ "version": "1.0.15",
4
4
  "type": "module",
5
5
  "description": "Scrape a Frontify brand portal and save every page as PDF and clean HTML",
6
6
  "bin": {
package/purge-html.mjs CHANGED
@@ -99,10 +99,6 @@ export function purge(html) {
99
99
  /Your changes could not be saved\.\s*Please reload the page and try it again\./,
100
100
  /^Describe this color palette here$/i,
101
101
  ];
102
- const COLOR_FORMATS = "HEX|RGB|CMYK|LESS|HSL|HSB|RAL|ORA|PMS|PMS-C|PMS-U|PMS-CP|PMS-PQ|PMS-TCX|CMYK-C|CMYK-U|CMYK-N|NCS|HKS|3M|LAB|PANTONE";
103
- const COLOR_EMPTY_RE = new RegExp(`^(${COLOR_FORMATS})$`);
104
- const COLOR_VALUE_RE = new RegExp(`^(${COLOR_FORMATS})(.+)$`);
105
-
106
102
  const walk = (node) => {
107
103
  for (const child of [...node.childNodes]) {
108
104
  if (child.nodeType === 3) {
@@ -110,16 +106,6 @@ export function purge(html) {
110
106
  for (const pat of NOISE_PATTERNS) {
111
107
  text = text.replace(pat, "");
112
108
  }
113
- // Color palette: remove format-only lines (no value)
114
- if (COLOR_EMPTY_RE.test(text.trim())) {
115
- child.remove();
116
- continue;
117
- }
118
- // Color palette: insert ": " between format label and value
119
- const m = text.trim().match(COLOR_VALUE_RE);
120
- if (m) {
121
- text = m[1] + ": " + m[2].trim();
122
- }
123
109
  if (text.trim()) {
124
110
  child.textContent = text;
125
111
  } else {
@@ -136,7 +122,17 @@ export function purge(html) {
136
122
  const cleanHtml =
137
123
  `<!DOCTYPE html>\n<html lang="${document.documentElement?.getAttribute("lang") || "en"}">\n<head>\n<meta charset="utf-8">\n<title>${title}</title>\n<style>body{max-width:72ch;margin:2rem auto;padding:0 1rem;font:1rem/1.6 monospace}img{max-width:100%;height:auto}pre{background:#f5f5f5;padding:1rem;overflow-x:auto;border-radius:4px}code{font-family:monospace}table{border-collapse:collapse;width:100%}th,td{border:1px solid #ccc;padding:.5rem;text-align:left}th{background:#f5f5f5}</style>\n</head>\n<body>\n${main.innerHTML.trim()}\n</body>\n</html>`;
138
124
 
139
- return cleanHtml.replace(/\n{3,}/g, "\n\n").replace(/[ \t]+\n/g, "\n");
125
+ const COLOR_FORMATS = "HEX|RGB|CMYK|LESS|HSL|HSB|RAL|ORA|PMS|PMS-C|PMS-U|PMS-CP|PMS-PQ|PMS-TCX|CMYK-C|CMYK-U|CMYK-N|NCS|HKS|3M|LAB|PANTONE";
126
+ // Insert ": " when a format label is concatenated with its value
127
+ const concatRe = new RegExp(`(${COLOR_FORMATS})(#|\\d)`, "g");
128
+ // Remove empty format entries (element containing only a format name, no value)
129
+ const emptyTagRe = new RegExp(`<(p|td|li|dd|span)([^>]*)>\\s*(${COLOR_FORMATS})\\s*<\\/\\1>`, "g");
130
+
131
+ return cleanHtml
132
+ .replace(concatRe, "$1: $2")
133
+ .replace(emptyTagRe, "")
134
+ .replace(/\n{3,}/g, "\n\n")
135
+ .replace(/[ \t]+\n/g, "\n");
140
136
  }
141
137
 
142
138
  // CLI mode: run standalone on a directory