guidelinescraper 1.0.13 → 1.0.15

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3)
  1. package/crawl.mjs +51 -15
  2. package/package.json +1 -1
  3. package/purge-html.mjs +11 -15
package/crawl.mjs CHANGED
@@ -21,6 +21,8 @@ const { values, positionals } = parseArgs({
21
21
  hub: { type: "string", short: "h" },
22
22
  cookie: { type: "string", short: "c" },
23
23
  limit: { type: "string", short: "l" },
24
+ concurrency: { type: "string" },
25
+ force: { type: "boolean" },
24
26
  help: { type: "boolean" },
25
27
  },
26
28
  allowPositionals: true,
@@ -35,6 +37,8 @@ Options:
35
37
  -h, --hub <id> Hub ID (auto-detected if omitted)
36
38
  -c, --cookie <str> Cookie header for authenticated requests
37
39
  -l, --limit <n> Only crawl the first N pages
40
+ --concurrency <n> Max parallel browsers (default: 3)
41
+ --force Re-crawl pages even if output exists
38
42
  --help Show this help
39
43
 
40
44
  Environment variables (or .env file):
@@ -48,6 +52,8 @@ const inputUrl = values.url || positionals[0] || process.env.URL;
48
52
  const hubId = values.hub || process.env.HUB_ID;
49
53
  const cookie = values.cookie || process.env.COOKIE || "";
50
54
  const limit = values.limit ? Number(values.limit) : Infinity;
55
+ const concurrency = values.concurrency ? Number(values.concurrency) : 3;
56
+ const force = !!values.force;
51
57
 
52
58
  const OUTPUT_DIR = "output";
53
59
 
@@ -104,7 +110,12 @@ function collectPages(nodes, parentDir) {
104
110
 
105
111
  const domainDir = path.join(OUTPUT_DIR, sanitize(siteTree.domain));
106
112
  const allPages = collectPages(siteTree.children, path.join(domainDir, "pdf"));
107
- const pages = allPages.slice(0, limit);
113
+ const limitedPages = allPages.slice(0, limit);
114
+
115
+ const pages = force
116
+ ? limitedPages
117
+ : limitedPages.filter((p) => !fs.existsSync(p.pdfPath));
118
+ const skipped = limitedPages.length - pages.length;
108
119
 
109
120
  const urlToPdf = new Map(pages.map((p) => [p.url, p.pdfPath]));
110
121
  const totalPages = pages.length;
@@ -112,7 +123,11 @@ let completed = 0;
112
123
  let failed = 0;
113
124
  const crawlStart = Date.now();
114
125
 
115
- console.log(`Discovered ${totalPages} pages to crawl.\n`);
126
+ if (skipped > 0) {
127
+ console.log(`Resuming: ${skipped} already done, ${totalPages} remaining.\n`);
128
+ } else {
129
+ console.log(`Discovered ${totalPages} pages to crawl.\n`);
130
+ }
116
131
 
117
132
  for (const { pdfPath } of pages) {
118
133
  fs.mkdirSync(path.dirname(pdfPath), { recursive: true });
@@ -132,9 +147,11 @@ const crawler = new PlaywrightCrawler({
132
147
  launchContext: {
133
148
  launchOptions: { args: ["--disable-dev-shm-usage"] },
134
149
  },
135
- maxConcurrency: 8,
150
+ maxConcurrency: concurrency,
136
151
  maxRequestRetries: 2,
137
152
  navigationTimeoutSecs: 120,
153
+ useSessionPool: true,
154
+ persistCookiesPerSession: true,
138
155
 
139
156
  preNavigationHooks: [
140
157
  async ({ page, request }) => {
@@ -172,10 +189,25 @@ const crawler = new PlaywrightCrawler({
172
189
  return;
173
190
  }
174
191
 
192
+ // Random delay to reduce Cloudflare captcha triggers
193
+ await page.waitForTimeout(1000 + Math.random() * 2000);
194
+
175
195
  await page
176
196
  .waitForLoadState("networkidle", { timeout: 30_000 })
177
197
  .catch(() => {});
178
198
 
199
+ const isCaptcha = await page.evaluate(() => {
200
+ const text = document.body?.textContent || "";
201
+ return /confirm you are\s*human|security check|Cloudflare/i.test(text) ||
202
+ !!document.querySelector("#challenge-running, #challenge-stage, .cf-turnstile");
203
+ });
204
+ if (isCaptcha) {
205
+ log.error(`Cloudflare challenge detected on ${request.url} — skipping.`);
206
+ failed++;
207
+ completed++;
208
+ return;
209
+ }
210
+
179
211
  // Expand all collapsible / accordion content
180
212
  await page.evaluate(async () => {
181
213
  for (const el of document.querySelectorAll("details")) {
@@ -442,15 +474,19 @@ function formatDuration(ms) {
442
474
  return m > 0 ? `${m}m ${s}s` : `${s}s`;
443
475
  }
444
476
 
445
- const discoverEnd = Date.now();
446
- console.log(`\nCrawling ${totalPages} pages…\n`);
447
-
448
- await crawler.run(pages.map((p) => ({ url: p.url, uniqueKey: p.url })));
449
-
450
- const totalMs = Date.now() - crawlStart;
451
- console.log(`\n${"─".repeat(50)}`);
452
- console.log(`Done in ${formatDuration(totalMs)}`);
453
- console.log(
454
- ` ${completed - failed} saved, ${failed} failed, ${totalPages} total`,
455
- );
456
- console.log(` Output: ${domainDir}`);
477
+ if (totalPages === 0) {
478
+ console.log("Nothing to crawl — all pages already saved. Use --force to re-crawl.");
479
+ } else {
480
+ console.log(`\nCrawling ${totalPages} pages…\n`);
481
+
482
+ await crawler.run(pages.map((p) => ({ url: p.url, uniqueKey: p.url })));
483
+
484
+ const totalMs = Date.now() - crawlStart;
485
+ console.log(`\n${"─".repeat(50)}`);
486
+ console.log(`Done in ${formatDuration(totalMs)}`);
487
+ const parts = [`${completed - failed} saved`, `${failed} failed`];
488
+ if (skipped > 0) parts.push(`${skipped} skipped`);
489
+ parts.push(`${limitedPages.length} total`);
490
+ console.log(` ${parts.join(", ")}`);
491
+ console.log(` Output: ${domainDir}`);
492
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "guidelinescraper",
3
- "version": "1.0.13",
3
+ "version": "1.0.15",
4
4
  "type": "module",
5
5
  "description": "Scrape a Frontify brand portal and save every page as PDF and clean HTML",
6
6
  "bin": {
package/purge-html.mjs CHANGED
@@ -99,10 +99,6 @@ export function purge(html) {
99
99
  /Your changes could not be saved\.\s*Please reload the page and try it again\./,
100
100
  /^Describe this color palette here$/i,
101
101
  ];
102
- const COLOR_FORMATS = "HEX|RGB|CMYK|LESS|HSL|HSB|RAL|ORA|PMS|PMS-C|PMS-U|PMS-CP|PMS-PQ|PMS-TCX|CMYK-C|CMYK-U|CMYK-N|NCS|HKS|3M|LAB|PANTONE";
103
- const COLOR_EMPTY_RE = new RegExp(`^(${COLOR_FORMATS})$`);
104
- const COLOR_VALUE_RE = new RegExp(`^(${COLOR_FORMATS})(.+)$`);
105
-
106
102
  const walk = (node) => {
107
103
  for (const child of [...node.childNodes]) {
108
104
  if (child.nodeType === 3) {
@@ -110,16 +106,6 @@ export function purge(html) {
110
106
  for (const pat of NOISE_PATTERNS) {
111
107
  text = text.replace(pat, "");
112
108
  }
113
- // Color palette: remove format-only lines (no value)
114
- if (COLOR_EMPTY_RE.test(text.trim())) {
115
- child.remove();
116
- continue;
117
- }
118
- // Color palette: insert ": " between format label and value
119
- const m = text.trim().match(COLOR_VALUE_RE);
120
- if (m) {
121
- text = m[1] + ": " + m[2].trim();
122
- }
123
109
  if (text.trim()) {
124
110
  child.textContent = text;
125
111
  } else {
@@ -136,7 +122,17 @@ export function purge(html) {
136
122
  const cleanHtml =
137
123
  `<!DOCTYPE html>\n<html lang="${document.documentElement?.getAttribute("lang") || "en"}">\n<head>\n<meta charset="utf-8">\n<title>${title}</title>\n<style>body{max-width:72ch;margin:2rem auto;padding:0 1rem;font:1rem/1.6 monospace}img{max-width:100%;height:auto}pre{background:#f5f5f5;padding:1rem;overflow-x:auto;border-radius:4px}code{font-family:monospace}table{border-collapse:collapse;width:100%}th,td{border:1px solid #ccc;padding:.5rem;text-align:left}th{background:#f5f5f5}</style>\n</head>\n<body>\n${main.innerHTML.trim()}\n</body>\n</html>`;
138
124
 
139
- return cleanHtml.replace(/\n{3,}/g, "\n\n").replace(/[ \t]+\n/g, "\n");
125
+ const COLOR_FORMATS = "HEX|RGB|CMYK|LESS|HSL|HSB|RAL|ORA|PMS|PMS-C|PMS-U|PMS-CP|PMS-PQ|PMS-TCX|CMYK-C|CMYK-U|CMYK-N|NCS|HKS|3M|LAB|PANTONE";
126
+ // Insert ": " when a format label is concatenated with its value
127
+ const concatRe = new RegExp(`(${COLOR_FORMATS})(#|\\d)`, "g");
128
+ // Remove empty format entries (element containing only a format name, no value)
129
+ const emptyTagRe = new RegExp(`<(p|td|li|dd|span)([^>]*)>\\s*(${COLOR_FORMATS})\\s*<\\/\\1>`, "g");
130
+
131
+ return cleanHtml
132
+ .replace(concatRe, "$1: $2")
133
+ .replace(emptyTagRe, "")
134
+ .replace(/\n{3,}/g, "\n\n")
135
+ .replace(/[ \t]+\n/g, "\n");
140
136
  }
141
137
 
142
138
  // CLI mode: run standalone on a directory