guidelinescraper 1.0.14 → 1.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/crawl.mjs +51 -15
  2. package/package.json +1 -1
package/crawl.mjs CHANGED
@@ -21,6 +21,8 @@ const { values, positionals } = parseArgs({
21
21
  hub: { type: "string", short: "h" },
22
22
  cookie: { type: "string", short: "c" },
23
23
  limit: { type: "string", short: "l" },
24
+ concurrency: { type: "string" },
25
+ force: { type: "boolean" },
24
26
  help: { type: "boolean" },
25
27
  },
26
28
  allowPositionals: true,
@@ -35,6 +37,8 @@ Options:
35
37
  -h, --hub <id> Hub ID (auto-detected if omitted)
36
38
  -c, --cookie <str> Cookie header for authenticated requests
37
39
  -l, --limit <n> Only crawl the first N pages
40
+ --concurrency <n> Max parallel browsers (default: 3)
41
+ --force Re-crawl pages even if output exists
38
42
  --help Show this help
39
43
 
40
44
  Environment variables (or .env file):
@@ -48,6 +52,8 @@ const inputUrl = values.url || positionals[0] || process.env.URL;
48
52
  const hubId = values.hub || process.env.HUB_ID;
49
53
  const cookie = values.cookie || process.env.COOKIE || "";
50
54
  const limit = values.limit ? Number(values.limit) : Infinity;
55
+ const concurrency = values.concurrency ? Number(values.concurrency) : 3;
56
+ const force = !!values.force;
51
57
 
52
58
  const OUTPUT_DIR = "output";
53
59
 
@@ -104,7 +110,12 @@ function collectPages(nodes, parentDir) {
104
110
 
105
111
  const domainDir = path.join(OUTPUT_DIR, sanitize(siteTree.domain));
106
112
  const allPages = collectPages(siteTree.children, path.join(domainDir, "pdf"));
107
- const pages = allPages.slice(0, limit);
113
+ const limitedPages = allPages.slice(0, limit);
114
+
115
+ const pages = force
116
+ ? limitedPages
117
+ : limitedPages.filter((p) => !fs.existsSync(p.pdfPath));
118
+ const skipped = limitedPages.length - pages.length;
108
119
 
109
120
  const urlToPdf = new Map(pages.map((p) => [p.url, p.pdfPath]));
110
121
  const totalPages = pages.length;
@@ -112,7 +123,11 @@ let completed = 0;
112
123
  let failed = 0;
113
124
  const crawlStart = Date.now();
114
125
 
115
- console.log(`Discovered ${totalPages} pages to crawl.\n`);
126
+ if (skipped > 0) {
127
+ console.log(`Resuming: ${skipped} already done, ${totalPages} remaining.\n`);
128
+ } else {
129
+ console.log(`Discovered ${totalPages} pages to crawl.\n`);
130
+ }
116
131
 
117
132
  for (const { pdfPath } of pages) {
118
133
  fs.mkdirSync(path.dirname(pdfPath), { recursive: true });
@@ -132,9 +147,11 @@ const crawler = new PlaywrightCrawler({
132
147
  launchContext: {
133
148
  launchOptions: { args: ["--disable-dev-shm-usage"] },
134
149
  },
135
- maxConcurrency: 8,
150
+ maxConcurrency: concurrency,
136
151
  maxRequestRetries: 2,
137
152
  navigationTimeoutSecs: 120,
153
+ useSessionPool: true,
154
+ persistCookiesPerSession: true,
138
155
 
139
156
  preNavigationHooks: [
140
157
  async ({ page, request }) => {
@@ -172,10 +189,25 @@ const crawler = new PlaywrightCrawler({
172
189
  return;
173
190
  }
174
191
 
192
+ // Random delay to reduce Cloudflare captcha triggers
193
+ await page.waitForTimeout(1000 + Math.random() * 2000);
194
+
175
195
  await page
176
196
  .waitForLoadState("networkidle", { timeout: 30_000 })
177
197
  .catch(() => {});
178
198
 
199
+ const isCaptcha = await page.evaluate(() => {
200
+ const text = document.body?.textContent || "";
201
+ return /confirm you are\s*human|security check|Cloudflare/i.test(text) ||
202
+ !!document.querySelector("#challenge-running, #challenge-stage, .cf-turnstile");
203
+ });
204
+ if (isCaptcha) {
205
+ log.error(`Cloudflare challenge detected on ${request.url} — skipping.`);
206
+ failed++;
207
+ completed++;
208
+ return;
209
+ }
210
+
179
211
  // Expand all collapsible / accordion content
180
212
  await page.evaluate(async () => {
181
213
  for (const el of document.querySelectorAll("details")) {
@@ -442,15 +474,19 @@ function formatDuration(ms) {
442
474
  return m > 0 ? `${m}m ${s}s` : `${s}s`;
443
475
  }
444
476
 
445
- const discoverEnd = Date.now();
446
- console.log(`\nCrawling ${totalPages} pages…\n`);
447
-
448
- await crawler.run(pages.map((p) => ({ url: p.url, uniqueKey: p.url })));
449
-
450
- const totalMs = Date.now() - crawlStart;
451
- console.log(`\n${"─".repeat(50)}`);
452
- console.log(`Done in ${formatDuration(totalMs)}`);
453
- console.log(
454
- ` ${completed - failed} saved, ${failed} failed, ${totalPages} total`,
455
- );
456
- console.log(` Output: ${domainDir}`);
477
+ if (totalPages === 0) {
478
+ console.log("Nothing to crawl — all pages already saved. Use --force to re-crawl.");
479
+ } else {
480
+ console.log(`\nCrawling ${totalPages} pages…\n`);
481
+
482
+ await crawler.run(pages.map((p) => ({ url: p.url, uniqueKey: p.url })));
483
+
484
+ const totalMs = Date.now() - crawlStart;
485
+ console.log(`\n${"─".repeat(50)}`);
486
+ console.log(`Done in ${formatDuration(totalMs)}`);
487
+ const parts = [`${completed - failed} saved`, `${failed} failed`];
488
+ if (skipped > 0) parts.push(`${skipped} skipped`);
489
+ parts.push(`${limitedPages.length} total`);
490
+ console.log(` ${parts.join(", ")}`);
491
+ console.log(` Output: ${domainDir}`);
492
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "guidelinescraper",
3
- "version": "1.0.14",
3
+ "version": "1.0.15",
4
4
  "type": "module",
5
5
  "description": "Scrape a Frontify brand portal and save every page as PDF and clean HTML",
6
6
  "bin": {