guidelinescraper 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md ADDED
@@ -0,0 +1,97 @@
1
+ # Frontify Guideline Scraper
2
+
3
+ Scrape a Frontify brand portal and save every guideline page as a PDF and clean semantic HTML.
4
+
5
+ ## How it works
6
+
7
+ 1. **Discover** — Queries Frontify's portal and document navigation APIs to build the full site tree (documents, pages, groups, headings, external links).
8
+ 2. **Crawl** — Visits every page with Playwright, expands accordions, forces lazy images to load, dismisses cookie/overlay dialogs, then saves a PDF and raw HTML snapshot.
9
+ 3. **Clean** — Strips the raw HTML down to semantic content (headings, text, images, tables) with no scripts, styles, or navigation chrome.
10
+
11
+ ## Setup
12
+
13
+ ```bash
14
+ npm install
15
+ npx playwright install chromium
16
+ ```
17
+
18
+ ## Usage
19
+
20
+ ```bash
21
+ node crawl.mjs --url brand.uber.com
22
+ ```
23
+
24
+ Or pass a full URL:
25
+
26
+ ```bash
27
+ node crawl.mjs --url https://developer.frontify.com
28
+ ```
29
+
30
+ ### Options
31
+
32
+ | Flag | Short | Description |
33
+ |------|-------|-------------|
34
+ | `--url <url>` | `-u` | Portal domain or full URL |
35
+ | `--hub <id>` | `-h` | Hub ID (auto-detected if omitted) |
36
+ | `--cookie <str>` | `-c` | Cookie header for authenticated portals |
37
+ | `--help` | | Show help |
38
+
39
+ These can also be set via environment variables (to use a `.env` file, launch with `node --env-file=.env crawl.mjs` — the scripts do not read `.env` on their own):
40
+
41
+ ```
42
+ URL=brand.uber.com
43
+ HUB_ID=25
44
+ COOKIE=frontify-session-id=your-session-id
45
+ ```
46
+
47
+ ## Output
48
+
49
+ ```
50
+ output/{domain}/
51
+ pdf/
52
+ Group Name/
53
+ Document Title.pdf
54
+ Document Title/
55
+ Page Title.pdf
56
+ html/
57
+ Group Name/
58
+ Document Title.html
59
+ ...
60
+ ```
61
+
62
+ - **PDF** — Full-page A4 captures with background graphics, expanded accordions, and loaded lazy images.
63
+ - **HTML** — Cleaned semantic HTML: headings, paragraphs, images, tables. No scripts, styles, classes, or navigation elements. Wrapped in minimal readable CSS.
64
+
65
+ ## Discover only
66
+
67
+ Run the discovery step standalone to inspect or save the navigation tree:
68
+
69
+ ```bash
70
+ node discover.mjs --url brand.uber.com --output brand.uber.com.json
71
+ ```
72
+
73
+ This outputs a JSON tree of the portal's structure without crawling any pages.
74
+
75
+ ## Clean HTML only
76
+
77
+ Re-clean previously scraped raw HTML:
78
+
79
+ ```bash
80
+ node purge-html.mjs output/.raw/html output/clean
81
+ ```
82
+
83
+ ## Authenticated portals
84
+
85
+ For portals that require login, grab your session cookie from browser dev tools and pass it:
86
+
87
+ ```bash
88
+ node crawl.mjs --url brand.uber.com --cookie "frontify-session-id=your-session-id"
89
+ ```
90
+
91
+ Or add it to `.env`:
92
+
93
+ ```
94
+ COOKIE=frontify-session-id=your-session-id
95
+ ```
96
+
97
+ See `.env.example` for reference.
package/crawl.mjs ADDED
@@ -0,0 +1,416 @@
1
+ #!/usr/bin/env node
2
+ import { PlaywrightCrawler } from "crawlee";
3
+ import fs from "node:fs";
4
+ import path from "node:path";
5
+ import os from "node:os";
6
+ import { parseArgs } from "node:util";
7
+ import { discover } from "./discover.mjs";
8
+ import { purge } from "./purge-html.mjs";
9
+
10
+ process.env.PLAYWRIGHT_BROWSERS_PATH = path.join(
11
+ os.homedir(),
12
+ "Library",
13
+ "Caches",
14
+ "ms-playwright",
15
+ );
16
+
17
+ // ── CLI args ──
18
+ const { values, positionals } = parseArgs({
19
+ options: {
20
+ url: { type: "string", short: "u" },
21
+ hub: { type: "string", short: "h" },
22
+ cookie: { type: "string", short: "c" },
23
+ help: { type: "boolean" },
24
+ },
25
+ allowPositionals: true,
26
+ strict: false,
27
+ });
28
+
29
+ if (values.help) {
30
+ console.log(`Usage: node crawl.mjs [options] [url]
31
+
32
+ Options:
33
+ -u, --url <url> Portal domain or URL
34
+ -h, --hub <id> Hub ID (auto-detected if omitted)
35
+ -c, --cookie <str> Cookie header for authenticated requests
36
+ --help Show this help
37
+
38
+ Environment variables (or .env file):
39
+ COOKIE Cookie header value
40
+ URL Portal domain or URL
41
+ HUB_ID Hub ID`);
42
+ process.exit(0);
43
+ }
44
+
45
+ const inputUrl = values.url || positionals[0] || process.env.URL;
46
+ const hubId = values.hub || process.env.HUB_ID;
47
+ const cookie = values.cookie || process.env.COOKIE || "";
48
+
49
+ const OUTPUT_DIR = "output";
50
+
51
+ // ── Phase 1: Discover navigation tree ──
52
+ console.log("Discovering portal navigation…\n");
53
+ const siteTree = await discover({ url: inputUrl, hubId, cookie });
54
+
55
+ function sanitize(name) {
56
+ return name.replace(/[<>:"/\\|?*]+/g, "_").trim();
57
+ }
58
+
59
+ function collectPages(nodes, parentDir) {
60
+ const pages = [];
61
+
62
+ for (const node of nodes) {
63
+ if (node.type === "heading") continue;
64
+ if (node.type === "external_link") continue;
65
+
66
+ const name = sanitize(node.title);
67
+
68
+ if (node.type === "group" || node.type === "category") {
69
+ const groupDir = path.join(parentDir, name);
70
+ if (node.children) {
71
+ pages.push(...collectPages(node.children, groupDir));
72
+ }
73
+ continue;
74
+ }
75
+
76
+ if (node.url) {
77
+ const hasPageChildren =
78
+ node.children &&
79
+ node.children.some(
80
+ (c) =>
81
+ c.type === "page" ||
82
+ c.type === "document" ||
83
+ c.type === "group" ||
84
+ c.type === "category",
85
+ );
86
+
87
+ if (hasPageChildren) {
88
+ const dir = path.join(parentDir, name);
89
+ pages.push(...collectPages(node.children, dir));
90
+ } else {
91
+ pages.push({
92
+ url: node.url,
93
+ pdfPath: path.join(parentDir, `${name}.pdf`),
94
+ });
95
+ }
96
+ }
97
+ }
98
+
99
+ return pages;
100
+ }
101
+
102
+ const domainDir = path.join(OUTPUT_DIR, sanitize(siteTree.domain));
103
+ const pages = collectPages(siteTree.children, path.join(domainDir, "pdf"));
104
+
105
+ const urlToPdf = new Map(pages.map((p) => [p.url, p.pdfPath]));
106
+
107
+ console.log(`Discovered ${pages.length} pages to crawl.\n`);
108
+ for (const { url, pdfPath } of pages) {
109
+ console.log(` ${pdfPath} ← ${url}`);
110
+ }
111
+ console.log();
112
+
113
+ for (const { pdfPath } of pages) {
114
+ fs.mkdirSync(path.dirname(pdfPath), { recursive: true });
115
+ const htmlDir = path.dirname(pdfPath.replace(/\/pdf\//, "/html/"));
116
+ fs.mkdirSync(htmlDir, { recursive: true });
117
+ }
118
+
119
+ // ── Phase 2: Crawl pages ──
120
+ const cookieConsentValue =
121
+ '{"isCookieConsentOpen":false,"preferencesScriptsEnabled":true,"statisticsScriptsEnabled":true}';
122
+
123
+ const crawler = new PlaywrightCrawler({
124
+ headless: true,
125
+ launchContext: {
126
+ launchOptions: { args: ["--disable-dev-shm-usage"] },
127
+ },
128
+ maxConcurrency: 16,
129
+ maxRequestRetries: 2,
130
+ navigationTimeoutSecs: 120,
131
+
132
+ preNavigationHooks: [
133
+ async ({ page, request }) => {
134
+ const reqUrl = new URL(request.url);
135
+ const cookies = [
136
+ {
137
+ name: "cookieConsentUserData",
138
+ value: cookieConsentValue,
139
+ domain: reqUrl.hostname,
140
+ path: "/",
141
+ },
142
+ ];
143
+
144
+ if (cookie) {
145
+ for (const pair of cookie.split(";")) {
146
+ const eq = pair.indexOf("=");
147
+ if (eq === -1) continue;
148
+ cookies.push({
149
+ name: pair.slice(0, eq).trim(),
150
+ value: pair.slice(eq + 1).trim(),
151
+ domain: reqUrl.hostname,
152
+ path: "/",
153
+ });
154
+ }
155
+ }
156
+
157
+ await page.context().addCookies(cookies);
158
+ },
159
+ ],
160
+
161
+ async requestHandler({ page, request, log }) {
162
+ const pdfPath = urlToPdf.get(request.url);
163
+ if (!pdfPath) {
164
+ log.warning(`No PDF path mapped for ${request.url}, skipping.`);
165
+ return;
166
+ }
167
+
168
+ log.info(`Saving ${request.url} → ${pdfPath}`);
169
+
170
+ await page
171
+ .waitForLoadState("networkidle", { timeout: 30_000 })
172
+ .catch(() => {});
173
+
174
+ // Expand all collapsible / accordion content
175
+ await page.evaluate(async () => {
176
+ for (const el of document.querySelectorAll("details")) {
177
+ el.setAttribute("open", "");
178
+ }
179
+
180
+ const delay = (ms) => new Promise((r) => setTimeout(r, ms));
181
+
182
+ const skipAncestors =
183
+ "nav, header, [role='navigation'], [role='search'], [class*='search'], [class*='nav'], [class*='header'], [class*='modal'], [class*='overlay']";
184
+
185
+ const triggerSelectors = [
186
+ '[data-state="closed"]',
187
+ ".accordion-trigger",
188
+ ".accordion-header",
189
+ ".accordion-toggle",
190
+ 'button[class*="accordion"]',
191
+ 'button[class*="collapse"]',
192
+ 'button[class*="expand"]',
193
+ 'div[class*="accordion"] > button',
194
+ 'div[class*="accordion"] > [role="button"]',
195
+ 'main [aria-expanded="false"]',
196
+ 'article [aria-expanded="false"]',
197
+ '[role="main"] [aria-expanded="false"]',
198
+ '.content [aria-expanded="false"]',
199
+ ];
200
+
201
+ for (const selector of triggerSelectors) {
202
+ for (const trigger of document.querySelectorAll(selector)) {
203
+ if (trigger.closest(skipAncestors)) continue;
204
+ trigger.click();
205
+ await delay(100);
206
+ }
207
+ }
208
+
209
+ const panelSelectors = [
210
+ '[data-state="closed"]',
211
+ ".accordion-content",
212
+ ".collapse:not(.show)",
213
+ '[class*="accordion-panel"]',
214
+ ];
215
+ for (const el of document.querySelectorAll(panelSelectors.join(", "))) {
216
+ if (el.closest(skipAncestors)) continue;
217
+ el.style.setProperty("display", "block", "important");
218
+ el.style.setProperty("height", "auto", "important");
219
+ el.style.setProperty("max-height", "none", "important");
220
+ el.style.setProperty("overflow", "visible", "important");
221
+ el.style.setProperty("opacity", "1", "important");
222
+ el.style.setProperty("visibility", "visible", "important");
223
+ el.removeAttribute("aria-hidden");
224
+ }
225
+ });
226
+
227
+ await page.evaluate(() => {
228
+ for (const el of document.querySelectorAll(
229
+ '[class*="overlay"], [class*="modal"], [class*="search"][class*="open"], [role="dialog"], [class*="underlay"]',
230
+ )) {
231
+ el.style.setProperty("display", "none", "important");
232
+ }
233
+ });
234
+
235
+ await page.waitForTimeout(1000);
236
+
237
+ await page.evaluate(() => {
238
+ const lazyAttrs = ["data-src", "data-lazy-src", "data-original"];
239
+ const lazySrcsetAttrs = ["data-srcset", "data-lazy-srcset"];
240
+ const realSrcMap = new WeakMap();
241
+
242
+ for (const img of document.querySelectorAll("img")) {
243
+ img.removeAttribute("loading");
244
+ img.removeAttribute("decoding");
245
+
246
+ for (const attr of lazyAttrs) {
247
+ const val = img.getAttribute(attr);
248
+ if (val) {
249
+ img.src = val;
250
+ realSrcMap.set(img, val);
251
+ img.removeAttribute(attr);
252
+ }
253
+ }
254
+ for (const attr of lazySrcsetAttrs) {
255
+ const val = img.getAttribute(attr);
256
+ if (val) {
257
+ img.srcset = val;
258
+ img.removeAttribute(attr);
259
+ }
260
+ }
261
+ }
262
+
263
+ const observer = new MutationObserver((mutations) => {
264
+ for (const m of mutations) {
265
+ if (m.type === "attributes" && m.attributeName === "src") {
266
+ const img = m.target;
267
+ const saved = realSrcMap.get(img);
268
+ if (saved && img.src !== saved && img.src.startsWith("data:")) {
269
+ img.src = saved;
270
+ }
271
+ }
272
+ }
273
+ });
274
+ observer.observe(document.body, {
275
+ attributes: true,
276
+ attributeFilter: ["src"],
277
+ subtree: true,
278
+ });
279
+
280
+ for (const source of document.querySelectorAll("picture source")) {
281
+ for (const attr of [...lazyAttrs, ...lazySrcsetAttrs]) {
282
+ const val = source.getAttribute(attr);
283
+ if (val) {
284
+ if (attr.includes("srcset")) source.srcset = val;
285
+ source.removeAttribute(attr);
286
+ }
287
+ }
288
+ }
289
+
290
+ for (const el of document.querySelectorAll("*")) {
291
+ const bg = getComputedStyle(el).backgroundImage;
292
+ if (bg && bg !== "none" && bg.startsWith("url(")) {
293
+ const url = bg.slice(4, -1).replace(/["']/g, "");
294
+ if (/\.(jpe?g|png|gif|webp|svg|avif)/i.test(url)) {
295
+ const img = document.createElement("img");
296
+ img.src = url;
297
+ img.style.cssText = "width:100%;height:auto;display:block;";
298
+ el.style.backgroundImage = "none";
299
+ el.prepend(img);
300
+ }
301
+ }
302
+ }
303
+ });
304
+
305
+ await page
306
+ .waitForLoadState("networkidle", { timeout: 15_000 })
307
+ .catch(() => {});
308
+
309
+ for (let pass = 0; pass < 2; pass++) {
310
+ await page.evaluate(async () => {
311
+ const delay = (ms) => new Promise((r) => setTimeout(r, ms));
312
+ const step = window.innerHeight;
313
+ const maxScroll = document.body.scrollHeight;
314
+ for (let y = 0; y <= maxScroll; y += step) {
315
+ window.scrollTo(0, y);
316
+ await delay(250);
317
+ }
318
+ window.scrollTo(0, 0);
319
+
320
+ for (const img of document.querySelectorAll("img[data-src]")) {
321
+ img.src = img.dataset.src;
322
+ img.removeAttribute("data-src");
323
+ }
324
+ });
325
+ await page
326
+ .waitForLoadState("networkidle", { timeout: 15_000 })
327
+ .catch(() => {});
328
+ }
329
+
330
+ await page
331
+ .waitForFunction(
332
+ () =>
333
+ [...document.querySelectorAll("img")].every((img) => {
334
+ if (img.src.startsWith("data:")) return true;
335
+ if (img.offsetWidth <= 1 && img.offsetHeight <= 1) return true;
336
+ return img.complete && img.naturalWidth > 0;
337
+ }),
338
+ { timeout: 30_000 },
339
+ )
340
+ .catch(() => {});
341
+
342
+ await page.evaluate(() => {
343
+ const style = document.createElement("style");
344
+ style.textContent = `
345
+ img {
346
+ opacity: 1 !important;
347
+ visibility: visible !important;
348
+ transition: none !important;
349
+ animation: none !important;
350
+ }
351
+ `;
352
+ document.head.appendChild(style);
353
+
354
+ for (const img of document.querySelectorAll("img")) {
355
+ img.style.setProperty("opacity", "1", "important");
356
+ img.style.setProperty("visibility", "visible", "important");
357
+ img.style.setProperty("transition", "none", "important");
358
+ img.style.setProperty("animation", "none", "important");
359
+ }
360
+ });
361
+
362
+ await page.waitForTimeout(1000);
363
+
364
+ await page.evaluate(() => {
365
+ const selectors = [
366
+ '[role="dialog"]',
367
+ '[class*="overlay"]',
368
+ '[class*="underlay"]',
369
+ '[class*="modal"]',
370
+ '[class*="cookie"]',
371
+ '[class*="consent"]',
372
+ '[class*="backdrop"]',
373
+ ];
374
+ for (const el of document.querySelectorAll(selectors.join(", "))) {
375
+ el.remove();
376
+ }
377
+
378
+ for (const el of document.querySelectorAll("*")) {
379
+ const s = getComputedStyle(el);
380
+ if (
381
+ s.position === "fixed" &&
382
+ el.offsetWidth >= window.innerWidth * 0.9 &&
383
+ el.offsetHeight >= window.innerHeight * 0.9 &&
384
+ !el.matches("header, nav, main, article, section, footer")
385
+ ) {
386
+ el.remove();
387
+ }
388
+ }
389
+ });
390
+
391
+ const pdfBuffer = await page.pdf({
392
+ format: "A4",
393
+ printBackground: true,
394
+ margin: { top: "20mm", bottom: "20mm", left: "15mm", right: "15mm" },
395
+ });
396
+
397
+ fs.writeFileSync(pdfPath, pdfBuffer);
398
+
399
+ const htmlPath = pdfPath
400
+ .replace(/\/pdf\//, "/html/")
401
+ .replace(/\.pdf$/, ".html");
402
+ fs.mkdirSync(path.dirname(htmlPath), { recursive: true });
403
+ const rawHtml = await page.content();
404
+ fs.writeFileSync(htmlPath, purge(rawHtml));
405
+
406
+ log.info(`Saved ${pdfPath} + ${htmlPath}`);
407
+ },
408
+
409
+ async failedRequestHandler({ request, log }) {
410
+ log.error(`Failed to crawl ${request.url} after retries.`);
411
+ },
412
+ });
413
+
414
+ await crawler.run(pages.map((p) => ({ url: p.url, uniqueKey: p.url })));
415
+
416
+ console.log("\nDone. Output saved under:", domainDir);
package/discover.mjs ADDED
@@ -0,0 +1,222 @@
1
+ import { writeFileSync } from "node:fs";
2
+ import { parseArgs } from "node:util";
3
+
4
+ /**
5
+ * Discover the full navigation tree for a Frontify portal.
6
+ * @param {{ url?: string, domain?: string, baseUrl?: string, hubId?: string, cookie?: string }} opts
7
+ * @returns {Promise<{ domain: string, hubId: number, baseUrl: string, children: object[] }>}
8
+ */
9
+ export async function discover(opts = {}) {
10
+ const input = opts.url || opts.domain || "developer.frontify.com";
11
+ const isUrl = input.startsWith("https://") || input.startsWith("http://");
12
+ const baseUrl = opts.baseUrl || (isUrl ? new URL(input).origin : `https://${input}`);
13
+ const domain = opts.domain || (isUrl ? new URL(input).hostname : input);
14
+ let hubId = opts.hubId;
15
+ const cookie = opts.cookie || "";
16
+
17
+ const headers = {};
18
+ if (cookie) headers.cookie = cookie;
19
+
20
+ async function fetchJSON(url) {
21
+ const res = await fetch(url, { headers });
22
+ if (!res.ok) throw new Error(`${res.status} ${res.statusText} – ${url}`);
23
+ const json = await res.json();
24
+ if (!json.success) throw new Error(`API error – ${url}`);
25
+ return json.data;
26
+ }
27
+
28
+ async function detectHubId() {
29
+ console.error(`No hub ID provided, detecting from ${baseUrl} …`);
30
+ const res = await fetch(baseUrl, { headers });
31
+ if (!res.ok) throw new Error(`${res.status} fetching ${baseUrl}`);
32
+ const html = await res.text();
33
+ const match = html.match(/"hub_id"\s*:\s*(\d+)/);
34
+ if (!match) throw new Error("Could not find hub_id in page response");
35
+ console.error(`Detected hub_id: ${match[1]}`);
36
+ return match[1];
37
+ }
38
+
39
+ function pageUrl(documentId, categorySlug, pageSlug, headingSlug) {
40
+ let url = `${baseUrl}/document/${documentId}#`;
41
+ url += categorySlug ? `/${categorySlug}` : `/-`;
42
+ if (pageSlug) url += `/${pageSlug}`;
43
+ if (headingSlug) url += `/${headingSlug}`;
44
+ return url;
45
+ }
46
+
47
+ function buildHeadingNode(heading, documentId, categorySlug, pageSlug) {
48
+ const t = heading.translations.default;
49
+ return {
50
+ type: "heading",
51
+ id: heading.id,
52
+ title: t.title,
53
+ slug: t.slug,
54
+ url: pageUrl(documentId, categorySlug, pageSlug, t.slug),
55
+ };
56
+ }
57
+
58
+ function buildPageNode(page, documentId, categorySlug) {
59
+ const t = page.translations.default;
60
+ const node = {
61
+ type: "page",
62
+ id: page.id,
63
+ title: t.title,
64
+ slug: t.slug,
65
+ url: pageUrl(documentId, categorySlug, t.slug),
66
+ children: [],
67
+ };
68
+ if (page.children) {
69
+ for (const child of page.children) {
70
+ if (child.type === "page_heading") {
71
+ node.children.push(buildHeadingNode(child, documentId, categorySlug, t.slug));
72
+ }
73
+ }
74
+ }
75
+ return node;
76
+ }
77
+
78
+ function buildDocNavItems(items, documentId, categorySlug) {
79
+ const nodes = [];
80
+ for (const item of items) {
81
+ switch (item.type) {
82
+ case "page_category": {
83
+ const t = item.translations.default;
84
+ const cat = {
85
+ type: "category",
86
+ id: item.id,
87
+ title: t.title,
88
+ slug: t.slug,
89
+ children: [],
90
+ };
91
+ if (item.children) {
92
+ cat.children = buildDocNavItems(item.children, documentId, t.slug);
93
+ }
94
+ nodes.push(cat);
95
+ break;
96
+ }
97
+ case "page": {
98
+ nodes.push(buildPageNode(item, documentId, categorySlug));
99
+ break;
100
+ }
101
+ case "page_heading": {
102
+ nodes.push(buildHeadingNode(item, documentId, categorySlug, null));
103
+ break;
104
+ }
105
+ }
106
+ }
107
+ return nodes;
108
+ }
109
+
110
+ async function buildDocumentNode(doc) {
111
+ const t = doc.translations.default;
112
+ const node = {
113
+ type: "document",
114
+ id: doc.id,
115
+ title: t.title,
116
+ slug: t.slug,
117
+ url: `${baseUrl}/document/${doc.id}`,
118
+ children: [],
119
+ };
120
+ try {
121
+ const items = await fetchJSON(`${baseUrl}/api/document-navigation/${doc.id}`);
122
+ node.children = buildDocNavItems(items, doc.id, null);
123
+ } catch (err) {
124
+ node.error = err.message;
125
+ }
126
+ return node;
127
+ }
128
+
129
+ function buildLinkNode(doc) {
130
+ const t = doc.translations.default;
131
+ return {
132
+ type: "external_link",
133
+ id: doc.id,
134
+ title: t.title,
135
+ url: doc.url,
136
+ openInNewTab: doc.should_open_in_new_tab,
137
+ };
138
+ }
139
+
140
+ if (!hubId) hubId = await detectHubId();
141
+ console.error(`Crawling portal: ${baseUrl} (hub ${hubId})`);
142
+ const portalNav = await fetchJSON(`${baseUrl}/api/portal-navigation/${hubId}`);
143
+
144
+ const tree = { domain, hubId: Number(hubId), baseUrl, children: [] };
145
+
146
+ for (const item of portalNav) {
147
+ if (item.type === "document") {
148
+ tree.children.push(await buildDocumentNode(item));
149
+ } else if (item.type === "document_group") {
150
+ const group = {
151
+ type: "group",
152
+ id: item.id,
153
+ title: item.translations.default.name,
154
+ children: [],
155
+ };
156
+ const docPromises = item.children.map((child) => {
157
+ if (child.type === "link_document") return buildLinkNode(child);
158
+ return buildDocumentNode(child);
159
+ });
160
+ group.children = await Promise.all(docPromises);
161
+ tree.children.push(group);
162
+ } else if (item.type === "link_document") {
163
+ tree.children.push(buildLinkNode(item));
164
+ }
165
+ }
166
+
167
+ return tree;
168
+ }
169
+
170
+ function countType(node, type) {
171
+ let count = node.type === type ? 1 : 0;
172
+ if (node.children) {
173
+ for (const child of node.children) count += countType(child, type);
174
+ }
175
+ return count;
176
+ }
177
+
178
+ // ── CLI mode ──
179
+ const isCLI = process.argv[1]?.endsWith("discover.mjs");
180
+ if (isCLI) {
181
+ const { values, positionals } = parseArgs({
182
+ options: {
183
+ url: { type: "string", short: "u" },
184
+ hub: { type: "string", short: "h" },
185
+ output: { type: "string", short: "o" },
186
+ cookie: { type: "string", short: "c" },
187
+ help: { type: "boolean" },
188
+ },
189
+ allowPositionals: true,
190
+ strict: false,
191
+ });
192
+
193
+ if (values.help) {
194
+ console.log(`Usage: node discover.mjs [options] [url] [hubId] [output]
195
+
196
+ Options:
197
+ -u, --url <url> Portal domain or URL
198
+ -h, --hub <id> Hub ID (auto-detected if omitted)
199
+ -o, --output <file> Output file (default: {domain}.json)
200
+ -c, --cookie <str> Cookie header for authenticated requests
201
+ --help Show this help
202
+
203
+ Environment variables (or .env file):
204
+ COOKIE Cookie header value
205
+ URL Portal domain or URL
206
+ HUB_ID Hub ID`);
207
+ process.exit(0);
208
+ }
209
+
210
+ const input = values.url || positionals[0] || process.env.URL;
211
+ const hubId = values.hub || positionals[1] || process.env.HUB_ID;
212
+ const cookie = values.cookie || process.env.COOKIE || "";
213
+
214
+ const tree = await discover({ url: input, hubId, cookie });
215
+
216
+ const outputFile = values.output || positionals[2] || `${tree.domain}.json`;
217
+ console.error(
218
+ `Done – ${countType(tree, "document")} documents, ${countType(tree, "category")} categories, ${countType(tree, "page")} pages, ${countType(tree, "heading")} headings, ${countType(tree, "external_link")} external links`
219
+ );
220
+ writeFileSync(outputFile, JSON.stringify(tree, null, 2) + "\n");
221
+ console.error(`Written to ${outputFile}`);
222
+ }
package/package.json ADDED
@@ -0,0 +1,28 @@
1
+ {
2
+ "name": "guidelinescraper",
3
+ "version": "1.0.0",
4
+ "type": "module",
5
+ "description": "Scrape a Frontify brand portal and save every page as PDF and clean HTML",
6
+ "bin": {
7
+ "guidelinescraper": "./crawl.mjs"
8
+ },
9
+ "scripts": {
10
+ "start": "node crawl.mjs"
11
+ },
12
+ "files": [
13
+ "crawl.mjs",
14
+ "discover.mjs",
15
+ "purge-html.mjs"
16
+ ],
17
+ "keywords": [
18
+ "frontify",
19
+ "scraper",
20
+ "pdf",
21
+ "brand-guidelines"
22
+ ],
23
+ "license": "ISC",
24
+ "dependencies": {
25
+ "crawlee": "^3.16.0",
+ "linkedom": "^0.18.5",
+ "playwright": "^1.58.2"
27
+ }
28
+ }
package/purge-html.mjs ADDED
@@ -0,0 +1,134 @@
1
+ import fs from "node:fs";
2
+ import path from "node:path";
3
+ import { parseHTML } from "linkedom";
4
+
5
+ const REMOVE_TAGS = new Set([
6
+ "script", "style", "link", "meta", "noscript", "iframe", "svg", "canvas",
7
+ "video", "audio", "source", "track", "object", "embed", "applet",
8
+ "form", "input", "textarea", "select", "button", "template",
9
+ ]);
10
+
11
+ const REMOVE_SELECTORS = [
12
+ "nav", "header", "footer",
13
+ "[role='navigation']", "[role='banner']", "[role='search']", "[role='dialog']",
14
+ "[class*='overlay']", "[class*='underlay']", "[class*='modal']",
15
+ "[class*='cookie']", "[class*='consent']", "[class*='sidebar']",
16
+ "[class*='nav-']", "[class*='toolbar']", "[class*='skip-to-main']",
17
+ "[class*='search']", "[class*='tooltip']", "[class*='popover']",
18
+ "[class*='dropdown']", "[aria-hidden='true']",
19
+ "[data-test-id='skip-to-main-content-link']",
20
+ ];
21
+
22
+ const KEEP_ATTRS = new Set([
23
+ "src", "href", "alt", "title", "id",
24
+ "colspan", "rowspan", "headers", "scope", "lang", "dir",
25
+ ]);
26
+
27
+ const SEMANTIC_TAGS = new Set([
28
+ "html", "head", "body", "main", "article", "section", "aside",
29
+ "h1", "h2", "h3", "h4", "h5", "h6",
30
+ "p", "blockquote", "pre", "code",
31
+ "ul", "ol", "li", "dl", "dt", "dd",
32
+ "table", "thead", "tbody", "tfoot", "tr", "th", "td", "caption",
33
+ "a", "img", "figure", "figcaption", "picture",
34
+ "strong", "em", "b", "i", "u", "s", "mark", "small", "sub", "sup",
35
+ "br", "hr", "details", "summary", "time", "abbr", "cite", "q",
36
+ ]);
37
+
38
+ function cleanNode(node) {
39
+ if (node.nodeType === 3) return;
40
+ if (node.nodeType !== 1) { node.remove(); return; }
41
+
42
+ const tag = node.tagName?.toLowerCase();
43
+ if (REMOVE_TAGS.has(tag)) { node.remove(); return; }
44
+
45
+ if (node.attributes) {
46
+ const toRemove = [];
47
+ for (const attr of node.attributes) {
48
+ if (!KEEP_ATTRS.has(attr.name)) toRemove.push(attr.name);
49
+ }
50
+ for (const name of toRemove) node.removeAttribute(name);
51
+ }
52
+
53
+ for (const child of [...node.childNodes]) cleanNode(child);
54
+
55
+ if (!SEMANTIC_TAGS.has(tag) && node.children?.length === 0 && !node.textContent?.trim()) {
56
+ node.remove();
57
+ return;
58
+ }
59
+
60
+ if (!SEMANTIC_TAGS.has(tag)) {
61
+ const parent = node.parentNode;
62
+ if (parent) {
63
+ while (node.firstChild) parent.insertBefore(node.firstChild, node);
64
+ node.remove();
65
+ }
66
+ }
67
+ }
68
+
69
+ export function purge(html) {
70
+ const { document } = parseHTML(html);
71
+
72
+ for (const selector of REMOVE_SELECTORS) {
73
+ try { for (const el of document.querySelectorAll(selector)) el.remove(); } catch {}
74
+ }
75
+
76
+ const main =
77
+ document.querySelector("main") ||
78
+ document.querySelector("[role='main']") ||
79
+ document.querySelector("#main") ||
80
+ document.querySelector(".main-content") ||
81
+ document.querySelector("#classic-theme") ||
82
+ document.body;
83
+
84
+ cleanNode(main);
85
+
86
+ const title = document.querySelector("title")?.textContent?.trim() || "";
87
+ const cleanHtml =
88
+ `<!DOCTYPE html>\n<html lang="${document.documentElement?.getAttribute("lang") || "en"}">\n<head>\n<meta charset="utf-8">\n<title>${title}</title>\n<style>body{max-width:72ch;margin:2rem auto;padding:0 1rem;font:1rem/1.6 monospace}img{max-width:100%;height:auto}</style>\n</head>\n<body>\n${main.innerHTML.trim()}\n</body>\n</html>`;
89
+
90
+ return cleanHtml.replace(/\n{3,}/g, "\n\n").replace(/[ \t]+\n/g, "\n");
91
+ }
92
+
93
+ // CLI mode: run standalone on a directory
94
+ const isCLI = process.argv[1]?.endsWith("purge-html.mjs");
95
+ if (isCLI && process.argv[2]) {
96
+ const RAW_DIR = process.argv[2];
97
+ const CLEAN_DIR = process.argv[3] || RAW_DIR.replace(".raw", ".clean");
98
+
99
+ function collectFiles(dir) {
100
+ const files = [];
101
+ for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
102
+ const full = path.join(dir, entry.name);
103
+ if (entry.isDirectory()) files.push(...collectFiles(full));
104
+ else if (entry.name.endsWith(".html")) files.push(full);
105
+ }
106
+ return files;
107
+ }
108
+
109
+ const files = collectFiles(RAW_DIR);
110
+ console.log(`Found ${files.length} HTML files in ${RAW_DIR}`);
111
+
112
+ let totalRawBytes = 0;
113
+ let totalCleanBytes = 0;
114
+
115
+ for (const file of files) {
116
+ const relPath = path.relative(RAW_DIR, file);
117
+ const outPath = path.join(CLEAN_DIR, relPath);
118
+ fs.mkdirSync(path.dirname(outPath), { recursive: true });
119
+
120
+ const raw = fs.readFileSync(file, "utf-8");
121
+ const clean = purge(raw);
122
+ fs.writeFileSync(outPath, clean);
123
+
124
+ totalRawBytes += raw.length;
125
+ totalCleanBytes += clean.length;
126
+
127
+ const pct = ((1 - clean.length / raw.length) * 100).toFixed(0);
128
+ console.log(` ${relPath} ${(raw.length / 1024).toFixed(0)}K → ${(clean.length / 1024).toFixed(0)}K (-${pct}%)`);
129
+ }
130
+
131
+ console.log(`\nDone. ${files.length} files cleaned.`);
132
+ console.log(`Total: ${(totalRawBytes / 1024).toFixed(0)}K → ${(totalCleanBytes / 1024).toFixed(0)}K (-${((1 - totalCleanBytes / totalRawBytes) * 100).toFixed(0)}%)`);
133
+ console.log(`Output: ${CLEAN_DIR}`);
134
+ }