npm - guidelinescraper - Versions diffs - 1.0.14 → 1.0.15 - Mend

guidelinescraper 1.0.14 → 1.0.15

This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.

Files changed (2)

package/crawl.mjs +51 -15
package/package.json +1 -1

package/crawl.mjs CHANGED

@@ -21,6 +21,8 @@ const { values, positionals } = parseArgs({
     hub: { type: "string", short: "h" },
     cookie: { type: "string", short: "c" },
     limit: { type: "string", short: "l" },
+    concurrency: { type: "string" },
+    force: { type: "boolean" },
     help: { type: "boolean" },
   },
   allowPositionals: true,
@@ -35,6 +37,8 @@ Options:
   -h, --hub <id>        Hub ID (auto-detected if omitted)
   -c, --cookie <str>    Cookie header for authenticated requests
   -l, --limit <n>       Only crawl the first N pages
+      --concurrency <n> Max parallel browsers (default: 3)
+      --force           Re-crawl pages even if output exists
       --help            Show this help
 Environment variables (or .env file):
@@ -48,6 +52,8 @@ const inputUrl = values.url || positionals[0] || process.env.URL;
 const hubId = values.hub || process.env.HUB_ID;
 const cookie = values.cookie || process.env.COOKIE || "";
 const limit = values.limit ? Number(values.limit) : Infinity;
+const concurrency = values.concurrency ? Number(values.concurrency) : 3;
+const force = !!values.force;
 const OUTPUT_DIR = "output";
@@ -104,7 +110,12 @@ function collectPages(nodes, parentDir) {
 const domainDir = path.join(OUTPUT_DIR, sanitize(siteTree.domain));
 const allPages = collectPages(siteTree.children, path.join(domainDir, "pdf"));
-const pages = allPages.slice(0, limit);
+const limitedPages = allPages.slice(0, limit);
+const pages = force
+  ? limitedPages
+  : limitedPages.filter((p) => !fs.existsSync(p.pdfPath));
+const skipped = limitedPages.length - pages.length;
 const urlToPdf = new Map(pages.map((p) => [p.url, p.pdfPath]));
 const totalPages = pages.length;
@@ -112,7 +123,11 @@ let completed = 0;
 let failed = 0;
 const crawlStart = Date.now();
-console.log(`Discovered ${totalPages} pages to crawl.\n`);
+if (skipped > 0) {
+  console.log(`Resuming: ${skipped} already done, ${totalPages} remaining.\n`);
+} else {
+  console.log(`Discovered ${totalPages} pages to crawl.\n`);
+}
 for (const { pdfPath } of pages) {
   fs.mkdirSync(path.dirname(pdfPath), { recursive: true });
@@ -132,9 +147,11 @@ const crawler = new PlaywrightCrawler({
   launchContext: {
     launchOptions: { args: ["--disable-dev-shm-usage"] },
   },
-  maxConcurrency: 8,
+  maxConcurrency: concurrency,
   maxRequestRetries: 2,
   navigationTimeoutSecs: 120,
+  useSessionPool: true,
+  persistCookiesPerSession: true,
   preNavigationHooks: [
     async ({ page, request }) => {
@@ -172,10 +189,25 @@ const crawler = new PlaywrightCrawler({
       return;
     }
+    // Random delay to reduce Cloudflare captcha triggers
+    await page.waitForTimeout(1000 + Math.random() * 2000);
     await page
       .waitForLoadState("networkidle", { timeout: 30_000 })
       .catch(() => {});
+    const isCaptcha = await page.evaluate(() => {
+      const text = document.body?.textContent || "";
+      return /confirm you are\s*human|security check|Cloudflare/i.test(text) ||
+        !!document.querySelector("#challenge-running, #challenge-stage, .cf-turnstile");
+    });
+    if (isCaptcha) {
+      log.error(`Cloudflare challenge detected on ${request.url} — skipping.`);
+      failed++;
+      completed++;
+      return;
+    }
     // Expand all collapsible / accordion content
     await page.evaluate(async () => {
       for (const el of document.querySelectorAll("details")) {
@@ -442,15 +474,19 @@ function formatDuration(ms) {
   return m > 0 ? `${m}m ${s}s` : `${s}s`;
 }
-const discoverEnd = Date.now();
-console.log(`\nCrawling ${totalPages} pages…\n`);
-await crawler.run(pages.map((p) => ({ url: p.url, uniqueKey: p.url })));
-const totalMs = Date.now() - crawlStart;
-console.log(`\n${"─".repeat(50)}`);
-console.log(`Done in ${formatDuration(totalMs)}`);
-console.log(
-  `  ${completed - failed} saved, ${failed} failed, ${totalPages} total`,
-);
-console.log(`  Output: ${domainDir}`);
+if (totalPages === 0) {
+  console.log("Nothing to crawl — all pages already saved. Use --force to re-crawl.");
+} else {
+  console.log(`\nCrawling ${totalPages} pages…\n`);
+  await crawler.run(pages.map((p) => ({ url: p.url, uniqueKey: p.url })));
+  const totalMs = Date.now() - crawlStart;
+  console.log(`\n${"─".repeat(50)}`);
+  console.log(`Done in ${formatDuration(totalMs)}`);
+  const parts = [`${completed - failed} saved`, `${failed} failed`];
+  if (skipped > 0) parts.push(`${skipped} skipped`);
+  parts.push(`${limitedPages.length} total`);
+  console.log(`  ${parts.join(", ")}`);
+  console.log(`  Output: ${domainDir}`);
+}

package/package.json CHANGED

@@ -1,6 +1,6 @@
 {
   "name": "guidelinescraper",
-  "version": "1.0.14",
+  "version": "1.0.15",
   "type": "module",
   "description": "Scrape a Frontify brand portal and save every page as PDF and clean HTML",
   "bin": {