guidelinescraper 1.0.5 → 1.0.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/crawl.mjs CHANGED
@@ -20,6 +20,7 @@ const { values, positionals } = parseArgs({
20
20
  url: { type: "string", short: "u" },
21
21
  hub: { type: "string", short: "h" },
22
22
  cookie: { type: "string", short: "c" },
23
+ limit: { type: "string", short: "l" },
23
24
  help: { type: "boolean" },
24
25
  },
25
26
  allowPositionals: true,
@@ -33,6 +34,7 @@ Options:
33
34
  -u, --url <url> Portal domain or URL
34
35
  -h, --hub <id> Hub ID (auto-detected if omitted)
35
36
  -c, --cookie <str> Cookie header for authenticated requests
37
+ -l, --limit <n> Only crawl the first N pages
36
38
  --help Show this help
37
39
 
38
40
  Environment variables (or .env file):
@@ -45,6 +47,7 @@ Environment variables (or .env file):
45
47
  const inputUrl = values.url || positionals[0] || process.env.URL;
46
48
  const hubId = values.hub || process.env.HUB_ID;
47
49
  const cookie = values.cookie || process.env.COOKIE || "";
50
+ const limit = values.limit ? Number(values.limit) : Infinity;
48
51
 
49
52
  const OUTPUT_DIR = "output";
50
53
 
@@ -100,7 +103,8 @@ function collectPages(nodes, parentDir) {
100
103
  }
101
104
 
102
105
  const domainDir = path.join(OUTPUT_DIR, sanitize(siteTree.domain));
103
- const pages = collectPages(siteTree.children, path.join(domainDir, "pdf"));
106
+ const allPages = collectPages(siteTree.children, path.join(domainDir, "pdf"));
107
+ const pages = allPages.slice(0, limit);
104
108
 
105
109
  const urlToPdf = new Map(pages.map((p) => [p.url, p.pdfPath]));
106
110
  const totalPages = pages.length;
@@ -394,11 +398,18 @@ const crawler = new PlaywrightCrawler({
394
398
 
395
399
  fs.writeFileSync(pdfPath, pdfBuffer);
396
400
 
401
+ const rawHtml = await page.content();
402
+
403
+ const rawPath = pdfPath
404
+ .replace(/\/pdf\//, "/raw/")
405
+ .replace(/\.pdf$/, ".html");
406
+ fs.mkdirSync(path.dirname(rawPath), { recursive: true });
407
+ fs.writeFileSync(rawPath, rawHtml);
408
+
397
409
  const htmlPath = pdfPath
398
410
  .replace(/\/pdf\//, "/html/")
399
411
  .replace(/\.pdf$/, ".html");
400
412
  fs.mkdirSync(path.dirname(htmlPath), { recursive: true });
401
- const rawHtml = await page.content();
402
413
  fs.writeFileSync(htmlPath, purge(rawHtml));
403
414
 
404
415
  completed++;
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "guidelinescraper",
3
- "version": "1.0.5",
3
+ "version": "1.0.6",
4
4
  "type": "module",
5
5
  "description": "Scrape a Frontify brand portal and save every page as PDF and clean HTML",
6
6
  "bin": {
package/purge-html.mjs CHANGED
@@ -5,7 +5,7 @@ import { parseHTML } from "linkedom";
5
5
  const REMOVE_TAGS = new Set([
6
6
  "script", "style", "link", "meta", "noscript", "iframe", "svg", "canvas",
7
7
  "video", "audio", "source", "track", "object", "embed", "applet",
8
- "form", "input", "textarea", "select", "button", "template",
8
+ "form", "input", "textarea", "select", "template",
9
9
  ]);
10
10
 
11
11
  const REMOVE_SELECTORS = [
@@ -81,7 +81,7 @@ export function purge(html) {
81
81
  document.querySelector("#classic-theme") ||
82
82
  document.body;
83
83
 
84
- cleanNode(main);
84
+ for (const child of [...main.childNodes]) cleanNode(child);
85
85
 
86
86
  const title = document.querySelector("title")?.textContent?.trim() || "";
87
87
  const cleanHtml =