guidelinescraper 1.0.4 → 1.0.6

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
package/crawl.mjs CHANGED
@@ -20,6 +20,7 @@ const { values, positionals } = parseArgs({
20
20
  url: { type: "string", short: "u" },
21
21
  hub: { type: "string", short: "h" },
22
22
  cookie: { type: "string", short: "c" },
23
+ limit: { type: "string", short: "l" },
23
24
  help: { type: "boolean" },
24
25
  },
25
26
  allowPositionals: true,
@@ -33,6 +34,7 @@ Options:
33
34
  -u, --url <url> Portal domain or URL
34
35
  -h, --hub <id> Hub ID (auto-detected if omitted)
35
36
  -c, --cookie <str> Cookie header for authenticated requests
37
+ -l, --limit <n> Only crawl the first N pages
36
38
  --help Show this help
37
39
 
38
40
  Environment variables (or .env file):
@@ -45,6 +47,7 @@ Environment variables (or .env file):
45
47
  const inputUrl = values.url || positionals[0] || process.env.URL;
46
48
  const hubId = values.hub || process.env.HUB_ID;
47
49
  const cookie = values.cookie || process.env.COOKIE || "";
50
+ const limit = values.limit ? Number(values.limit) : Infinity;
48
51
 
49
52
  const OUTPUT_DIR = "output";
50
53
 
@@ -100,7 +103,8 @@ function collectPages(nodes, parentDir) {
100
103
  }
101
104
 
102
105
  const domainDir = path.join(OUTPUT_DIR, sanitize(siteTree.domain));
103
- const pages = collectPages(siteTree.children, path.join(domainDir, "pdf"));
106
+ const allPages = collectPages(siteTree.children, path.join(domainDir, "pdf"));
107
+ const pages = allPages.slice(0, limit);
104
108
 
105
109
  const urlToPdf = new Map(pages.map((p) => [p.url, p.pdfPath]));
106
110
  const totalPages = pages.length;
@@ -125,7 +129,7 @@ const crawler = new PlaywrightCrawler({
125
129
  launchContext: {
126
130
  launchOptions: { args: ["--disable-dev-shm-usage"] },
127
131
  },
128
- maxConcurrency: 16,
132
+ maxConcurrency: 8,
129
133
  maxRequestRetries: 2,
130
134
  navigationTimeoutSecs: 120,
131
135
 
@@ -165,7 +169,6 @@ const crawler = new PlaywrightCrawler({
165
169
  return;
166
170
  }
167
171
 
168
-
169
172
  await page
170
173
  .waitForLoadState("networkidle", { timeout: 30_000 })
171
174
  .catch(() => {});
@@ -395,11 +398,18 @@ const crawler = new PlaywrightCrawler({
395
398
 
396
399
  fs.writeFileSync(pdfPath, pdfBuffer);
397
400
 
401
+ const rawHtml = await page.content();
402
+
403
+ const rawPath = pdfPath
404
+ .replace(/\/pdf\//, "/raw/")
405
+ .replace(/\.pdf$/, ".html");
406
+ fs.mkdirSync(path.dirname(rawPath), { recursive: true });
407
+ fs.writeFileSync(rawPath, rawHtml);
408
+
398
409
  const htmlPath = pdfPath
399
410
  .replace(/\/pdf\//, "/html/")
400
411
  .replace(/\.pdf$/, ".html");
401
412
  fs.mkdirSync(path.dirname(htmlPath), { recursive: true });
402
- const rawHtml = await page.content();
403
413
  fs.writeFileSync(htmlPath, purge(rawHtml));
404
414
 
405
415
  completed++;
@@ -407,9 +417,12 @@ const crawler = new PlaywrightCrawler({
407
417
  const pct = Math.round((completed / totalPages) * 100);
408
418
  const avgSec = (Date.now() - crawlStart) / 1000 / completed;
409
419
  const remaining = Math.round(avgSec * (totalPages - completed));
410
- const eta = remaining > 0 ? `~${formatDuration(remaining * 1000)} left` : "";
420
+ const eta =
421
+ remaining > 0 ? `~${formatDuration(remaining * 1000)} left` : "";
411
422
  const shortPath = pdfPath.replace(/\.pdf$/, "").replace(/^output\//, "");
412
- log.info(`[${completed}/${totalPages}] ${pct}% · ${elapsed}s ${eta} · ${shortPath}`);
423
+ log.info(
424
+ `[${completed}/${totalPages}] ${pct}% · ${elapsed}s ${eta} · ${shortPath}`,
425
+ );
413
426
  },
414
427
 
415
428
  async failedRequestHandler({ request, log }) {
@@ -434,5 +447,7 @@ await crawler.run(pages.map((p) => ({ url: p.url, uniqueKey: p.url })));
434
447
  const totalMs = Date.now() - crawlStart;
435
448
  console.log(`\n${"─".repeat(50)}`);
436
449
  console.log(`Done in ${formatDuration(totalMs)}`);
437
- console.log(` ${completed - failed} saved, ${failed} failed, ${totalPages} total`);
450
+ console.log(
451
+ ` ${completed - failed} saved, ${failed} failed, ${totalPages} total`,
452
+ );
438
453
  console.log(` Output: ${domainDir}`);
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "guidelinescraper",
3
- "version": "1.0.4",
3
+ "version": "1.0.6",
4
4
  "type": "module",
5
5
  "description": "Scrape a Frontify brand portal and save every page as PDF and clean HTML",
6
6
  "bin": {
package/purge-html.mjs CHANGED
@@ -5,7 +5,7 @@ import { parseHTML } from "linkedom";
5
5
  const REMOVE_TAGS = new Set([
6
6
  "script", "style", "link", "meta", "noscript", "iframe", "svg", "canvas",
7
7
  "video", "audio", "source", "track", "object", "embed", "applet",
8
- "form", "input", "textarea", "select", "button", "template",
8
+ "form", "input", "textarea", "select", "template",
9
9
  ]);
10
10
 
11
11
  const REMOVE_SELECTORS = [
@@ -81,7 +81,7 @@ export function purge(html) {
81
81
  document.querySelector("#classic-theme") ||
82
82
  document.body;
83
83
 
84
- cleanNode(main);
84
+ for (const child of [...main.childNodes]) cleanNode(child);
85
85
 
86
86
  const title = document.querySelector("title")?.textContent?.trim() || "";
87
87
  const cleanHtml =