guidelinescraper 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +97 -0
- package/crawl.mjs +416 -0
- package/discover.mjs +222 -0
- package/package.json +28 -0
- package/purge-html.mjs +134 -0
package/README.md
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
1
|
+
# Frontify Guideline Scraper
|
|
2
|
+
|
|
3
|
+
Scrape a Frontify brand portal and save every guideline page as a PDF and clean semantic HTML.
|
|
4
|
+
|
|
5
|
+
## How it works
|
|
6
|
+
|
|
7
|
+
1. **Discover** — Queries Frontify's portal and document navigation APIs to build the full site tree (documents, pages, groups, headings, external links).
|
|
8
|
+
2. **Crawl** — Visits every page with Playwright, expands accordions, forces lazy images to load, dismisses cookie/overlay dialogs, then saves a PDF and raw HTML snapshot.
|
|
9
|
+
3. **Clean** — Strips the raw HTML down to semantic content (headings, text, images, tables) with no scripts, styles, or navigation chrome.
|
|
10
|
+
|
|
11
|
+
## Setup
|
|
12
|
+
|
|
13
|
+
```bash
|
|
14
|
+
npm install
|
|
15
|
+
npx playwright install chromium
|
|
16
|
+
```
|
|
17
|
+
|
|
18
|
+
## Usage
|
|
19
|
+
|
|
20
|
+
```bash
|
|
21
|
+
node crawl.mjs --url brand.uber.com
|
|
22
|
+
```
|
|
23
|
+
|
|
24
|
+
Or pass a full URL:
|
|
25
|
+
|
|
26
|
+
```bash
|
|
27
|
+
node crawl.mjs --url https://developer.frontify.com
|
|
28
|
+
```
|
|
29
|
+
|
|
30
|
+
### Options
|
|
31
|
+
|
|
32
|
+
| Flag | Short | Description |
|
|
33
|
+
|------|-------|-------------|
|
|
34
|
+
| `--url <url>` | `-u` | Portal domain or full URL |
|
|
35
|
+
| `--hub <id>` | `-h` | Hub ID (auto-detected if omitted) |
|
|
36
|
+
| `--cookie <str>` | `-c` | Cookie header for authenticated portals |
|
|
37
|
+
| `--help` | | Show help |
|
|
38
|
+
|
|
39
|
+
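These can also be set via environment variables. The scripts only read `process.env` and do not load a `.env` file themselves, so pass the file to Node explicitly, e.g. `node --env-file=.env crawl.mjs` (Node.js 20.6+):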
|
|
40
|
+
|
|
41
|
+
```
|
|
42
|
+
URL=brand.uber.com
|
|
43
|
+
HUB_ID=25
|
|
44
|
+
COOKIE=frontify-session-id=your-session-id
|
|
45
|
+
```
|
|
46
|
+
|
|
47
|
+
## Output
|
|
48
|
+
|
|
49
|
+
```
|
|
50
|
+
output/{domain}/
|
|
51
|
+
pdf/
|
|
52
|
+
Group Name/
|
|
53
|
+
Document Title.pdf
|
|
54
|
+
Document Title/
|
|
55
|
+
Page Title.pdf
|
|
56
|
+
html/
|
|
57
|
+
Group Name/
|
|
58
|
+
Document Title.html
|
|
59
|
+
...
|
|
60
|
+
```
|
|
61
|
+
|
|
62
|
+
- **PDF** — Full-page A4 captures with background graphics, expanded accordions, and loaded lazy images.
|
|
63
|
+
- **HTML** — Cleaned semantic HTML: headings, paragraphs, images, tables. No scripts, styles, classes, or navigation elements. Wrapped in minimal readable CSS.
|
|
64
|
+
|
|
65
|
+
## Discover only
|
|
66
|
+
|
|
67
|
+
Run the discovery step standalone to inspect or save the navigation tree:
|
|
68
|
+
|
|
69
|
+
```bash
|
|
70
|
+
node discover.mjs --url brand.uber.com --output brand.uber.com.json
|
|
71
|
+
```
|
|
72
|
+
|
|
73
|
+
This outputs a JSON tree of the portal's structure without crawling any pages.
|
|
74
|
+
|
|
75
|
+
## Clean HTML only
|
|
76
|
+
|
|
77
|
+
Re-clean previously scraped raw HTML:
|
|
78
|
+
|
|
79
|
+
```bash
|
|
80
|
+
node purge-html.mjs output/.raw/html output/clean
|
|
81
|
+
```
|
|
82
|
+
|
|
83
|
+
## Authenticated portals
|
|
84
|
+
|
|
85
|
+
For portals that require login, grab your session cookie from browser dev tools and pass it:
|
|
86
|
+
|
|
87
|
+
```bash
|
|
88
|
+
node crawl.mjs --url brand.uber.com --cookie "frontify-session-id=your-session-id"
|
|
89
|
+
```
|
|
90
|
+
|
|
91
|
+
Or add it to `.env`:
|
|
92
|
+
|
|
93
|
+
```
|
|
94
|
+
COOKIE=frontify-session-id=your-session-id
|
|
95
|
+
```
|
|
96
|
+
|
|
97
|
+
See `.env.example` for reference.
|
package/crawl.mjs
ADDED
|
@@ -0,0 +1,416 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
import { PlaywrightCrawler } from "crawlee";
|
|
3
|
+
import fs from "node:fs";
|
|
4
|
+
import path from "node:path";
|
|
5
|
+
import os from "node:os";
|
|
6
|
+
import { parseArgs } from "node:util";
|
|
7
|
+
import { discover } from "./discover.mjs";
|
|
8
|
+
import { purge } from "./purge-html.mjs";
|
|
9
|
+
|
|
10
|
+
// Point Playwright at the per-user browser cache on macOS.
// The "~/Library/Caches/ms-playwright" location only exists on macOS, so the
// variable is left alone on other platforms (forcing it there would make
// Playwright look for browsers in a directory that `playwright install`
// never populated). A value already exported by the user always wins.
if (process.platform === "darwin") {
  process.env.PLAYWRIGHT_BROWSERS_PATH ??= path.join(
    os.homedir(),
    "Library",
    "Caches",
    "ms-playwright",
  );
}
|
|
16
|
+
|
|
17
|
+
// ── CLI args ──
|
|
18
|
+
const { values, positionals } = parseArgs({
|
|
19
|
+
options: {
|
|
20
|
+
url: { type: "string", short: "u" },
|
|
21
|
+
hub: { type: "string", short: "h" },
|
|
22
|
+
cookie: { type: "string", short: "c" },
|
|
23
|
+
help: { type: "boolean" },
|
|
24
|
+
},
|
|
25
|
+
allowPositionals: true,
|
|
26
|
+
strict: false,
|
|
27
|
+
});
|
|
28
|
+
|
|
29
|
+
if (values.help) {
|
|
30
|
+
console.log(`Usage: node crawl.mjs [options] [url]
|
|
31
|
+
|
|
32
|
+
Options:
|
|
33
|
+
-u, --url <url> Portal domain or URL
|
|
34
|
+
-h, --hub <id> Hub ID (auto-detected if omitted)
|
|
35
|
+
-c, --cookie <str> Cookie header for authenticated requests
|
|
36
|
+
--help Show this help
|
|
37
|
+
|
|
38
|
+
Environment variables (or .env file):
|
|
39
|
+
COOKIE Cookie header value
|
|
40
|
+
URL Portal domain or URL
|
|
41
|
+
HUB_ID Hub ID`);
|
|
42
|
+
process.exit(0);
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
const inputUrl = values.url || positionals[0] || process.env.URL;
|
|
46
|
+
const hubId = values.hub || process.env.HUB_ID;
|
|
47
|
+
const cookie = values.cookie || process.env.COOKIE || "";
|
|
48
|
+
|
|
49
|
+
const OUTPUT_DIR = "output";
|
|
50
|
+
|
|
51
|
+
// ── Phase 1: Discover navigation tree ──
|
|
52
|
+
console.log("Discovering portal navigation…\n");
|
|
53
|
+
const siteTree = await discover({ url: inputUrl, hubId, cookie });
|
|
54
|
+
|
|
55
|
+
/**
 * Turn an arbitrary title into a single, safe path segment.
 *
 * Runs of characters that are illegal in file names (`< > : " / \ | ? *`)
 * collapse into one underscore and surrounding whitespace is trimmed.
 * Titles come from a remote API, so values that would produce an empty
 * segment or a relative-path segment (".", "..") are replaced by "untitled"
 * instead of being passed on to `path.join`.
 *
 * @param {unknown} name - Raw title; `null`/`undefined` are treated as empty.
 * @returns {string} A non-empty file-system-safe name.
 */
function sanitize(name) {
  const cleaned = String(name ?? "")
    .replace(/[<>:"/\\|?*]+/g, "_")
    .trim();

  // "", "." and ".." (or any dots-only name) must never become a path segment.
  return /^\.*$/.test(cleaned) ? "untitled" : cleaned;
}
|
|
58
|
+
|
|
59
|
+
/**
 * Flatten a navigation tree into the list of pages to crawl.
 *
 * Headings and external links are ignored. Groups and categories only
 * contribute a directory level. A node with a URL is descended into when it
 * has page-like children (page, document, group, category); otherwise it
 * becomes one `{ url, pdfPath }` entry whose PDF lives in `parentDir`.
 *
 * @param {Iterable<object>} nodes - Sibling nodes of the navigation tree.
 * @param {string} parentDir - Directory that receives this level's PDFs.
 * @returns {{ url: string, pdfPath: string }[]} Pages in tree order.
 */
function collectPages(nodes, parentDir) {
  const ignoredTypes = ["heading", "external_link"];
  const containerTypes = ["group", "category"];
  const nestingTypes = ["page", "document", "group", "category"];
  const collected = [];

  for (const entry of nodes) {
    if (ignoredTypes.includes(entry.type)) continue;

    const safeName = sanitize(entry.title);
    const nestedDir = () => path.join(parentDir, safeName);

    if (containerTypes.includes(entry.type)) {
      // Pure containers: recurse one directory deeper, emit nothing themselves.
      if (entry.children) {
        for (const found of collectPages(entry.children, nestedDir())) {
          collected.push(found);
        }
      }
      continue;
    }

    if (!entry.url) continue;

    const descends =
      Boolean(entry.children) &&
      entry.children.some((child) => nestingTypes.includes(child.type));

    if (!descends) {
      collected.push({
        url: entry.url,
        pdfPath: path.join(parentDir, `${safeName}.pdf`),
      });
      continue;
    }

    // The node acts as a folder for its sub-pages; its own URL is not crawled.
    for (const found of collectPages(entry.children, nestedDir())) {
      collected.push(found);
    }
  }

  return collected;
}
|
|
101
|
+
|
|
102
|
+
const domainDir = path.join(OUTPUT_DIR, sanitize(siteTree.domain));
|
|
103
|
+
const pages = collectPages(siteTree.children, path.join(domainDir, "pdf"));
|
|
104
|
+
|
|
105
|
+
const urlToPdf = new Map(pages.map((p) => [p.url, p.pdfPath]));
|
|
106
|
+
|
|
107
|
+
console.log(`Discovered ${pages.length} pages to crawl.\n`);
|
|
108
|
+
for (const { url, pdfPath } of pages) {
|
|
109
|
+
console.log(` ${pdfPath} ← ${url}`);
|
|
110
|
+
}
|
|
111
|
+
console.log();
|
|
112
|
+
|
|
113
|
+
for (const { pdfPath } of pages) {
|
|
114
|
+
fs.mkdirSync(path.dirname(pdfPath), { recursive: true });
|
|
115
|
+
const htmlDir = path.dirname(pdfPath.replace(/\/pdf\//, "/html/"));
|
|
116
|
+
fs.mkdirSync(htmlDir, { recursive: true });
|
|
117
|
+
}
|
|
118
|
+
|
|
119
|
+
// ── Phase 2: Crawl pages ──
|
|
120
|
+
const cookieConsentValue =
|
|
121
|
+
'{"isCookieConsentOpen":false,"preferencesScriptsEnabled":true,"statisticsScriptsEnabled":true}';
|
|
122
|
+
|
|
123
|
+
const crawler = new PlaywrightCrawler({
|
|
124
|
+
headless: true,
|
|
125
|
+
launchContext: {
|
|
126
|
+
launchOptions: { args: ["--disable-dev-shm-usage"] },
|
|
127
|
+
},
|
|
128
|
+
maxConcurrency: 16,
|
|
129
|
+
maxRequestRetries: 2,
|
|
130
|
+
navigationTimeoutSecs: 120,
|
|
131
|
+
|
|
132
|
+
preNavigationHooks: [
|
|
133
|
+
async ({ page, request }) => {
|
|
134
|
+
const reqUrl = new URL(request.url);
|
|
135
|
+
const cookies = [
|
|
136
|
+
{
|
|
137
|
+
name: "cookieConsentUserData",
|
|
138
|
+
value: cookieConsentValue,
|
|
139
|
+
domain: reqUrl.hostname,
|
|
140
|
+
path: "/",
|
|
141
|
+
},
|
|
142
|
+
];
|
|
143
|
+
|
|
144
|
+
if (cookie) {
|
|
145
|
+
for (const pair of cookie.split(";")) {
|
|
146
|
+
const eq = pair.indexOf("=");
|
|
147
|
+
if (eq === -1) continue;
|
|
148
|
+
cookies.push({
|
|
149
|
+
name: pair.slice(0, eq).trim(),
|
|
150
|
+
value: pair.slice(eq + 1).trim(),
|
|
151
|
+
domain: reqUrl.hostname,
|
|
152
|
+
path: "/",
|
|
153
|
+
});
|
|
154
|
+
}
|
|
155
|
+
}
|
|
156
|
+
|
|
157
|
+
await page.context().addCookies(cookies);
|
|
158
|
+
},
|
|
159
|
+
],
|
|
160
|
+
|
|
161
|
+
async requestHandler({ page, request, log }) {
|
|
162
|
+
const pdfPath = urlToPdf.get(request.url);
|
|
163
|
+
if (!pdfPath) {
|
|
164
|
+
log.warning(`No PDF path mapped for ${request.url}, skipping.`);
|
|
165
|
+
return;
|
|
166
|
+
}
|
|
167
|
+
|
|
168
|
+
log.info(`Saving ${request.url} → ${pdfPath}`);
|
|
169
|
+
|
|
170
|
+
await page
|
|
171
|
+
.waitForLoadState("networkidle", { timeout: 30_000 })
|
|
172
|
+
.catch(() => {});
|
|
173
|
+
|
|
174
|
+
// Expand all collapsible / accordion content
|
|
175
|
+
await page.evaluate(async () => {
|
|
176
|
+
for (const el of document.querySelectorAll("details")) {
|
|
177
|
+
el.setAttribute("open", "");
|
|
178
|
+
}
|
|
179
|
+
|
|
180
|
+
const delay = (ms) => new Promise((r) => setTimeout(r, ms));
|
|
181
|
+
|
|
182
|
+
const skipAncestors =
|
|
183
|
+
"nav, header, [role='navigation'], [role='search'], [class*='search'], [class*='nav'], [class*='header'], [class*='modal'], [class*='overlay']";
|
|
184
|
+
|
|
185
|
+
const triggerSelectors = [
|
|
186
|
+
'[data-state="closed"]',
|
|
187
|
+
".accordion-trigger",
|
|
188
|
+
".accordion-header",
|
|
189
|
+
".accordion-toggle",
|
|
190
|
+
'button[class*="accordion"]',
|
|
191
|
+
'button[class*="collapse"]',
|
|
192
|
+
'button[class*="expand"]',
|
|
193
|
+
'div[class*="accordion"] > button',
|
|
194
|
+
'div[class*="accordion"] > [role="button"]',
|
|
195
|
+
'main [aria-expanded="false"]',
|
|
196
|
+
'article [aria-expanded="false"]',
|
|
197
|
+
'[role="main"] [aria-expanded="false"]',
|
|
198
|
+
'.content [aria-expanded="false"]',
|
|
199
|
+
];
|
|
200
|
+
|
|
201
|
+
for (const selector of triggerSelectors) {
|
|
202
|
+
for (const trigger of document.querySelectorAll(selector)) {
|
|
203
|
+
if (trigger.closest(skipAncestors)) continue;
|
|
204
|
+
trigger.click();
|
|
205
|
+
await delay(100);
|
|
206
|
+
}
|
|
207
|
+
}
|
|
208
|
+
|
|
209
|
+
const panelSelectors = [
|
|
210
|
+
'[data-state="closed"]',
|
|
211
|
+
".accordion-content",
|
|
212
|
+
".collapse:not(.show)",
|
|
213
|
+
'[class*="accordion-panel"]',
|
|
214
|
+
];
|
|
215
|
+
for (const el of document.querySelectorAll(panelSelectors.join(", "))) {
|
|
216
|
+
if (el.closest(skipAncestors)) continue;
|
|
217
|
+
el.style.setProperty("display", "block", "important");
|
|
218
|
+
el.style.setProperty("height", "auto", "important");
|
|
219
|
+
el.style.setProperty("max-height", "none", "important");
|
|
220
|
+
el.style.setProperty("overflow", "visible", "important");
|
|
221
|
+
el.style.setProperty("opacity", "1", "important");
|
|
222
|
+
el.style.setProperty("visibility", "visible", "important");
|
|
223
|
+
el.removeAttribute("aria-hidden");
|
|
224
|
+
}
|
|
225
|
+
});
|
|
226
|
+
|
|
227
|
+
await page.evaluate(() => {
|
|
228
|
+
for (const el of document.querySelectorAll(
|
|
229
|
+
'[class*="overlay"], [class*="modal"], [class*="search"][class*="open"], [role="dialog"], [class*="underlay"]',
|
|
230
|
+
)) {
|
|
231
|
+
el.style.setProperty("display", "none", "important");
|
|
232
|
+
}
|
|
233
|
+
});
|
|
234
|
+
|
|
235
|
+
await page.waitForTimeout(1000);
|
|
236
|
+
|
|
237
|
+
await page.evaluate(() => {
|
|
238
|
+
const lazyAttrs = ["data-src", "data-lazy-src", "data-original"];
|
|
239
|
+
const lazySrcsetAttrs = ["data-srcset", "data-lazy-srcset"];
|
|
240
|
+
const realSrcMap = new WeakMap();
|
|
241
|
+
|
|
242
|
+
for (const img of document.querySelectorAll("img")) {
|
|
243
|
+
img.removeAttribute("loading");
|
|
244
|
+
img.removeAttribute("decoding");
|
|
245
|
+
|
|
246
|
+
for (const attr of lazyAttrs) {
|
|
247
|
+
const val = img.getAttribute(attr);
|
|
248
|
+
if (val) {
|
|
249
|
+
img.src = val;
|
|
250
|
+
realSrcMap.set(img, val);
|
|
251
|
+
img.removeAttribute(attr);
|
|
252
|
+
}
|
|
253
|
+
}
|
|
254
|
+
for (const attr of lazySrcsetAttrs) {
|
|
255
|
+
const val = img.getAttribute(attr);
|
|
256
|
+
if (val) {
|
|
257
|
+
img.srcset = val;
|
|
258
|
+
img.removeAttribute(attr);
|
|
259
|
+
}
|
|
260
|
+
}
|
|
261
|
+
}
|
|
262
|
+
|
|
263
|
+
const observer = new MutationObserver((mutations) => {
|
|
264
|
+
for (const m of mutations) {
|
|
265
|
+
if (m.type === "attributes" && m.attributeName === "src") {
|
|
266
|
+
const img = m.target;
|
|
267
|
+
const saved = realSrcMap.get(img);
|
|
268
|
+
if (saved && img.src !== saved && img.src.startsWith("data:")) {
|
|
269
|
+
img.src = saved;
|
|
270
|
+
}
|
|
271
|
+
}
|
|
272
|
+
}
|
|
273
|
+
});
|
|
274
|
+
observer.observe(document.body, {
|
|
275
|
+
attributes: true,
|
|
276
|
+
attributeFilter: ["src"],
|
|
277
|
+
subtree: true,
|
|
278
|
+
});
|
|
279
|
+
|
|
280
|
+
for (const source of document.querySelectorAll("picture source")) {
|
|
281
|
+
for (const attr of [...lazyAttrs, ...lazySrcsetAttrs]) {
|
|
282
|
+
const val = source.getAttribute(attr);
|
|
283
|
+
if (val) {
|
|
284
|
+
if (attr.includes("srcset")) source.srcset = val;
|
|
285
|
+
source.removeAttribute(attr);
|
|
286
|
+
}
|
|
287
|
+
}
|
|
288
|
+
}
|
|
289
|
+
|
|
290
|
+
for (const el of document.querySelectorAll("*")) {
|
|
291
|
+
const bg = getComputedStyle(el).backgroundImage;
|
|
292
|
+
if (bg && bg !== "none" && bg.startsWith("url(")) {
|
|
293
|
+
const url = bg.slice(4, -1).replace(/["']/g, "");
|
|
294
|
+
if (/\.(jpe?g|png|gif|webp|svg|avif)/i.test(url)) {
|
|
295
|
+
const img = document.createElement("img");
|
|
296
|
+
img.src = url;
|
|
297
|
+
img.style.cssText = "width:100%;height:auto;display:block;";
|
|
298
|
+
el.style.backgroundImage = "none";
|
|
299
|
+
el.prepend(img);
|
|
300
|
+
}
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
});
|
|
304
|
+
|
|
305
|
+
await page
|
|
306
|
+
.waitForLoadState("networkidle", { timeout: 15_000 })
|
|
307
|
+
.catch(() => {});
|
|
308
|
+
|
|
309
|
+
for (let pass = 0; pass < 2; pass++) {
|
|
310
|
+
await page.evaluate(async () => {
|
|
311
|
+
const delay = (ms) => new Promise((r) => setTimeout(r, ms));
|
|
312
|
+
const step = window.innerHeight;
|
|
313
|
+
const maxScroll = document.body.scrollHeight;
|
|
314
|
+
for (let y = 0; y <= maxScroll; y += step) {
|
|
315
|
+
window.scrollTo(0, y);
|
|
316
|
+
await delay(250);
|
|
317
|
+
}
|
|
318
|
+
window.scrollTo(0, 0);
|
|
319
|
+
|
|
320
|
+
for (const img of document.querySelectorAll("img[data-src]")) {
|
|
321
|
+
img.src = img.dataset.src;
|
|
322
|
+
img.removeAttribute("data-src");
|
|
323
|
+
}
|
|
324
|
+
});
|
|
325
|
+
await page
|
|
326
|
+
.waitForLoadState("networkidle", { timeout: 15_000 })
|
|
327
|
+
.catch(() => {});
|
|
328
|
+
}
|
|
329
|
+
|
|
330
|
+
await page
|
|
331
|
+
.waitForFunction(
|
|
332
|
+
() =>
|
|
333
|
+
[...document.querySelectorAll("img")].every((img) => {
|
|
334
|
+
if (img.src.startsWith("data:")) return true;
|
|
335
|
+
if (img.offsetWidth <= 1 && img.offsetHeight <= 1) return true;
|
|
336
|
+
return img.complete && img.naturalWidth > 0;
|
|
337
|
+
}),
|
|
338
|
+
{ timeout: 30_000 },
|
|
339
|
+
)
|
|
340
|
+
.catch(() => {});
|
|
341
|
+
|
|
342
|
+
await page.evaluate(() => {
|
|
343
|
+
const style = document.createElement("style");
|
|
344
|
+
style.textContent = `
|
|
345
|
+
img {
|
|
346
|
+
opacity: 1 !important;
|
|
347
|
+
visibility: visible !important;
|
|
348
|
+
transition: none !important;
|
|
349
|
+
animation: none !important;
|
|
350
|
+
}
|
|
351
|
+
`;
|
|
352
|
+
document.head.appendChild(style);
|
|
353
|
+
|
|
354
|
+
for (const img of document.querySelectorAll("img")) {
|
|
355
|
+
img.style.setProperty("opacity", "1", "important");
|
|
356
|
+
img.style.setProperty("visibility", "visible", "important");
|
|
357
|
+
img.style.setProperty("transition", "none", "important");
|
|
358
|
+
img.style.setProperty("animation", "none", "important");
|
|
359
|
+
}
|
|
360
|
+
});
|
|
361
|
+
|
|
362
|
+
await page.waitForTimeout(1000);
|
|
363
|
+
|
|
364
|
+
await page.evaluate(() => {
|
|
365
|
+
const selectors = [
|
|
366
|
+
'[role="dialog"]',
|
|
367
|
+
'[class*="overlay"]',
|
|
368
|
+
'[class*="underlay"]',
|
|
369
|
+
'[class*="modal"]',
|
|
370
|
+
'[class*="cookie"]',
|
|
371
|
+
'[class*="consent"]',
|
|
372
|
+
'[class*="backdrop"]',
|
|
373
|
+
];
|
|
374
|
+
for (const el of document.querySelectorAll(selectors.join(", "))) {
|
|
375
|
+
el.remove();
|
|
376
|
+
}
|
|
377
|
+
|
|
378
|
+
for (const el of document.querySelectorAll("*")) {
|
|
379
|
+
const s = getComputedStyle(el);
|
|
380
|
+
if (
|
|
381
|
+
s.position === "fixed" &&
|
|
382
|
+
el.offsetWidth >= window.innerWidth * 0.9 &&
|
|
383
|
+
el.offsetHeight >= window.innerHeight * 0.9 &&
|
|
384
|
+
!el.matches("header, nav, main, article, section, footer")
|
|
385
|
+
) {
|
|
386
|
+
el.remove();
|
|
387
|
+
}
|
|
388
|
+
}
|
|
389
|
+
});
|
|
390
|
+
|
|
391
|
+
const pdfBuffer = await page.pdf({
|
|
392
|
+
format: "A4",
|
|
393
|
+
printBackground: true,
|
|
394
|
+
margin: { top: "20mm", bottom: "20mm", left: "15mm", right: "15mm" },
|
|
395
|
+
});
|
|
396
|
+
|
|
397
|
+
fs.writeFileSync(pdfPath, pdfBuffer);
|
|
398
|
+
|
|
399
|
+
const htmlPath = pdfPath
|
|
400
|
+
.replace(/\/pdf\//, "/html/")
|
|
401
|
+
.replace(/\.pdf$/, ".html");
|
|
402
|
+
fs.mkdirSync(path.dirname(htmlPath), { recursive: true });
|
|
403
|
+
const rawHtml = await page.content();
|
|
404
|
+
fs.writeFileSync(htmlPath, purge(rawHtml));
|
|
405
|
+
|
|
406
|
+
log.info(`Saved ${pdfPath} + ${htmlPath}`);
|
|
407
|
+
},
|
|
408
|
+
|
|
409
|
+
async failedRequestHandler({ request, log }) {
|
|
410
|
+
log.error(`Failed to crawl ${request.url} after retries.`);
|
|
411
|
+
},
|
|
412
|
+
});
|
|
413
|
+
|
|
414
|
+
await crawler.run(pages.map((p) => ({ url: p.url, uniqueKey: p.url })));
|
|
415
|
+
|
|
416
|
+
console.log("\nDone. Output saved under:", domainDir);
|
package/discover.mjs
ADDED
|
@@ -0,0 +1,222 @@
|
|
|
1
|
+
import { writeFileSync } from "node:fs";
|
|
2
|
+
import { parseArgs } from "node:util";
|
|
3
|
+
|
|
4
|
+
/**
 * Discover the full navigation tree for a Frontify portal.
 *
 * The portal origin is derived from `opts.url` (or `opts.domain`), which may
 * be a bare domain or a full URL; any path, query or trailing slash is
 * dropped so that API URLs are always built from a clean origin. When no hub
 * ID is supplied it is scraped from the portal's landing page. The portal
 * navigation is then fetched and, for every document, its document
 * navigation (categories, pages, headings).
 *
 * @param {{ url?: string, domain?: string, baseUrl?: string, hubId?: string, cookie?: string }} opts
 * @returns {Promise<{ domain: string, hubId: number, baseUrl: string, children: object[] }>}
 * @throws {TypeError} If the input cannot be parsed as a URL.
 * @throws {Error} If the landing page or the portal navigation cannot be
 *   fetched, or no hub ID can be detected. Failures for a single document are
 *   not thrown; they are recorded on that document node as `error`.
 */
export async function discover(opts = {}) {
  const input = opts.url || opts.domain || "developer.frontify.com";
  const hasScheme = /^https?:\/\//i.test(input);
  // Parsing (instead of string concatenation) strips paths and trailing
  // slashes such as "brand.example.com/", which would otherwise end up in
  // every API URL as "//api/...".
  const parsed = new URL(hasScheme ? input : `https://${input}`);
  const baseUrl = opts.baseUrl || parsed.origin;
  // Full URLs report the hostname only; a bare "host:port" keeps its port.
  const domain = opts.domain || (hasScheme ? parsed.hostname : parsed.host);
  let hubId = opts.hubId;
  const cookie = opts.cookie || "";

  const headers = {};
  if (cookie) headers.cookie = cookie;

  // Fetch a Frontify API endpoint and unwrap its `{ success, data }` envelope.
  async function fetchJSON(url) {
    const res = await fetch(url, { headers });
    if (!res.ok) throw new Error(`${res.status} ${res.statusText} – ${url}`);
    const json = await res.json();
    if (!json.success) throw new Error(`API error – ${url}`);
    return json.data;
  }

  // Scrape the first `"hub_id": <digits>` occurrence from the landing page.
  async function detectHubId() {
    console.error(`No hub ID provided, detecting from ${baseUrl} …`);
    const res = await fetch(baseUrl, { headers });
    if (!res.ok) throw new Error(`${res.status} fetching ${baseUrl}`);
    const html = await res.text();
    const match = html.match(/"hub_id"\s*:\s*(\d+)/);
    if (!match) throw new Error("Could not find hub_id in page response");
    console.error(`Detected hub_id: ${match[1]}`);
    return match[1];
  }

  // Build the hash-routed page URL; "-" stands in for a missing category.
  function pageUrl(documentId, categorySlug, pageSlug, headingSlug) {
    let url = `${baseUrl}/document/${documentId}#`;
    url += categorySlug ? `/${categorySlug}` : `/-`;
    if (pageSlug) url += `/${pageSlug}`;
    if (headingSlug) url += `/${headingSlug}`;
    return url;
  }

  function buildHeadingNode(heading, documentId, categorySlug, pageSlug) {
    const t = heading.translations.default;
    return {
      type: "heading",
      id: heading.id,
      title: t.title,
      slug: t.slug,
      url: pageUrl(documentId, categorySlug, pageSlug, t.slug),
    };
  }

  // A page node; only `page_heading` children are kept.
  function buildPageNode(page, documentId, categorySlug) {
    const t = page.translations.default;
    const node = {
      type: "page",
      id: page.id,
      title: t.title,
      slug: t.slug,
      url: pageUrl(documentId, categorySlug, t.slug),
      children: [],
    };
    if (page.children) {
      for (const child of page.children) {
        if (child.type === "page_heading") {
          node.children.push(buildHeadingNode(child, documentId, categorySlug, t.slug));
        }
      }
    }
    return node;
  }

  // Convert document-navigation items; unknown item types are ignored.
  function buildDocNavItems(items, documentId, categorySlug) {
    const nodes = [];
    for (const item of items) {
      switch (item.type) {
        case "page_category": {
          const t = item.translations.default;
          const cat = {
            type: "category",
            id: item.id,
            title: t.title,
            slug: t.slug,
            children: [],
          };
          if (item.children) {
            // Children of a category are addressed through the category slug.
            cat.children = buildDocNavItems(item.children, documentId, t.slug);
          }
          nodes.push(cat);
          break;
        }
        case "page": {
          nodes.push(buildPageNode(item, documentId, categorySlug));
          break;
        }
        case "page_heading": {
          nodes.push(buildHeadingNode(item, documentId, categorySlug, null));
          break;
        }
      }
    }
    return nodes;
  }

  // A document node with its navigation; a failed navigation request is
  // recorded as `node.error` and leaves `children` empty.
  async function buildDocumentNode(doc) {
    const t = doc.translations.default;
    const node = {
      type: "document",
      id: doc.id,
      title: t.title,
      slug: t.slug,
      url: `${baseUrl}/document/${doc.id}`,
      children: [],
    };
    try {
      const items = await fetchJSON(`${baseUrl}/api/document-navigation/${doc.id}`);
      node.children = buildDocNavItems(items, doc.id, null);
    } catch (err) {
      node.error = err.message;
    }
    return node;
  }

  function buildLinkNode(doc) {
    const t = doc.translations.default;
    return {
      type: "external_link",
      id: doc.id,
      title: t.title,
      url: doc.url,
      openInNewTab: doc.should_open_in_new_tab,
    };
  }

  if (!hubId) hubId = await detectHubId();
  console.error(`Crawling portal: ${baseUrl} (hub ${hubId})`);
  const portalNav = await fetchJSON(`${baseUrl}/api/portal-navigation/${hubId}`);

  const tree = { domain, hubId: Number(hubId), baseUrl, children: [] };

  for (const item of portalNav) {
    if (item.type === "document") {
      tree.children.push(await buildDocumentNode(item));
    } else if (item.type === "document_group") {
      const group = {
        type: "group",
        id: item.id,
        // Groups expose their label as `name`, not `title`.
        title: item.translations.default.name,
        children: [],
      };
      // A group without a `children` array is treated as empty.
      const docPromises = (item.children ?? []).map((child) => {
        if (child.type === "link_document") return buildLinkNode(child);
        return buildDocumentNode(child);
      });
      group.children = await Promise.all(docPromises);
      tree.children.push(group);
    } else if (item.type === "link_document") {
      tree.children.push(buildLinkNode(item));
    }
  }

  return tree;
}
|
|
169
|
+
|
|
170
|
+
/**
 * Count how many nodes in a tree (the root included) have the given `type`.
 *
 * @param {{ type?: string, children?: Iterable<object> }} node - Tree root.
 * @param {string} type - Node type to look for.
 * @returns {number} Number of matching nodes.
 */
function countType(node, type) {
  let total = 0;
  const pending = [node];

  // Explicit work list instead of recursion; visiting order does not affect
  // the count.
  while (pending.length > 0) {
    const current = pending.pop();
    if (current.type === type) total += 1;
    if (current.children) pending.push(...current.children);
  }

  return total;
}
|
|
177
|
+
|
|
178
|
+
// ── CLI mode ──
|
|
179
|
+
const isCLI = process.argv[1]?.endsWith("discover.mjs");
|
|
180
|
+
if (isCLI) {
|
|
181
|
+
const { values, positionals } = parseArgs({
|
|
182
|
+
options: {
|
|
183
|
+
url: { type: "string", short: "u" },
|
|
184
|
+
hub: { type: "string", short: "h" },
|
|
185
|
+
output: { type: "string", short: "o" },
|
|
186
|
+
cookie: { type: "string", short: "c" },
|
|
187
|
+
help: { type: "boolean" },
|
|
188
|
+
},
|
|
189
|
+
allowPositionals: true,
|
|
190
|
+
strict: false,
|
|
191
|
+
});
|
|
192
|
+
|
|
193
|
+
if (values.help) {
|
|
194
|
+
console.log(`Usage: node discover.mjs [options] [url] [hubId] [output]
|
|
195
|
+
|
|
196
|
+
Options:
|
|
197
|
+
-u, --url <url> Portal domain or URL
|
|
198
|
+
-h, --hub <id> Hub ID (auto-detected if omitted)
|
|
199
|
+
-o, --output <file> Output file (default: {domain}.json)
|
|
200
|
+
-c, --cookie <str> Cookie header for authenticated requests
|
|
201
|
+
--help Show this help
|
|
202
|
+
|
|
203
|
+
Environment variables (or .env file):
|
|
204
|
+
COOKIE Cookie header value
|
|
205
|
+
URL Portal domain or URL
|
|
206
|
+
HUB_ID Hub ID`);
|
|
207
|
+
process.exit(0);
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
const input = values.url || positionals[0] || process.env.URL;
|
|
211
|
+
const hubId = values.hub || positionals[1] || process.env.HUB_ID;
|
|
212
|
+
const cookie = values.cookie || process.env.COOKIE || "";
|
|
213
|
+
|
|
214
|
+
const tree = await discover({ url: input, hubId, cookie });
|
|
215
|
+
|
|
216
|
+
const outputFile = values.output || positionals[2] || `${tree.domain}.json`;
|
|
217
|
+
console.error(
|
|
218
|
+
`Done – ${countType(tree, "document")} documents, ${countType(tree, "category")} categories, ${countType(tree, "page")} pages, ${countType(tree, "heading")} headings, ${countType(tree, "external_link")} external links`
|
|
219
|
+
);
|
|
220
|
+
writeFileSync(outputFile, JSON.stringify(tree, null, 2) + "\n");
|
|
221
|
+
console.error(`Written to ${outputFile}`);
|
|
222
|
+
}
|
package/package.json
ADDED
|
@@ -0,0 +1,28 @@
|
|
|
1
|
+
{
|
|
2
|
+
"name": "guidelinescraper",
|
|
3
|
+
"version": "1.0.0",
|
|
4
|
+
"type": "module",
|
|
5
|
+
"description": "Scrape a Frontify brand portal and save every page as PDF and clean HTML",
|
|
6
|
+
"bin": {
|
|
7
|
+
"guidelinescraper": "./crawl.mjs"
|
|
8
|
+
},
|
|
9
|
+
"scripts": {
|
|
10
|
+
"start": "node crawl.mjs"
|
|
11
|
+
},
|
|
12
|
+
"files": [
|
|
13
|
+
"crawl.mjs",
|
|
14
|
+
"discover.mjs",
|
|
15
|
+
"purge-html.mjs"
|
|
16
|
+
],
|
|
17
|
+
"keywords": [
|
|
18
|
+
"frontify",
|
|
19
|
+
"scraper",
|
|
20
|
+
"pdf",
|
|
21
|
+
"brand-guidelines"
|
|
22
|
+
],
|
|
23
|
+
"license": "ISC",
|
|
24
|
+
"dependencies": {
|
|
25
|
+
"crawlee": "^3.16.0",
|
|
26
|
+
"playwright": "^1.58.2"
|
|
27
|
+
}
|
|
28
|
+
}
|
package/purge-html.mjs
ADDED
|
@@ -0,0 +1,134 @@
|
|
|
1
|
+
import fs from "node:fs";
|
|
2
|
+
import path from "node:path";
|
|
3
|
+
import { parseHTML } from "linkedom";
|
|
4
|
+
|
|
5
|
+
const REMOVE_TAGS = new Set([
|
|
6
|
+
"script", "style", "link", "meta", "noscript", "iframe", "svg", "canvas",
|
|
7
|
+
"video", "audio", "source", "track", "object", "embed", "applet",
|
|
8
|
+
"form", "input", "textarea", "select", "button", "template",
|
|
9
|
+
]);
|
|
10
|
+
|
|
11
|
+
const REMOVE_SELECTORS = [
|
|
12
|
+
"nav", "header", "footer",
|
|
13
|
+
"[role='navigation']", "[role='banner']", "[role='search']", "[role='dialog']",
|
|
14
|
+
"[class*='overlay']", "[class*='underlay']", "[class*='modal']",
|
|
15
|
+
"[class*='cookie']", "[class*='consent']", "[class*='sidebar']",
|
|
16
|
+
"[class*='nav-']", "[class*='toolbar']", "[class*='skip-to-main']",
|
|
17
|
+
"[class*='search']", "[class*='tooltip']", "[class*='popover']",
|
|
18
|
+
"[class*='dropdown']", "[aria-hidden='true']",
|
|
19
|
+
"[data-test-id='skip-to-main-content-link']",
|
|
20
|
+
];
|
|
21
|
+
|
|
22
|
+
const KEEP_ATTRS = new Set([
|
|
23
|
+
"src", "href", "alt", "title", "id",
|
|
24
|
+
"colspan", "rowspan", "headers", "scope", "lang", "dir",
|
|
25
|
+
]);
|
|
26
|
+
|
|
27
|
+
const SEMANTIC_TAGS = new Set([
|
|
28
|
+
"html", "head", "body", "main", "article", "section", "aside",
|
|
29
|
+
"h1", "h2", "h3", "h4", "h5", "h6",
|
|
30
|
+
"p", "blockquote", "pre", "code",
|
|
31
|
+
"ul", "ol", "li", "dl", "dt", "dd",
|
|
32
|
+
"table", "thead", "tbody", "tfoot", "tr", "th", "td", "caption",
|
|
33
|
+
"a", "img", "figure", "figcaption", "picture",
|
|
34
|
+
"strong", "em", "b", "i", "u", "s", "mark", "small", "sub", "sup",
|
|
35
|
+
"br", "hr", "details", "summary", "time", "abbr", "cite", "q",
|
|
36
|
+
]);
|
|
37
|
+
|
|
38
|
+
/**
 * Recursively reduce a DOM subtree to semantic markup, in place.
 *
 * - Text nodes are kept untouched; every other non-element node is removed.
 * - Elements listed in REMOVE_TAGS are removed together with their content.
 * - Attributes not listed in KEEP_ATTRS are stripped.
 * - After the children have been cleaned, an element whose tag is not in
 *   SEMANTIC_TAGS is either removed (no child elements and no non-blank
 *   text) or unwrapped: its children take its place in the parent.
 *
 * @param {Node} node - Node to clean; it may be detached from its parent.
 * @returns {void}
 */
function cleanNode(node) {
  const TEXT = 3;
  const ELEMENT = 1;

  if (node.nodeType === TEXT) return;
  if (node.nodeType !== ELEMENT) {
    node.remove();
    return;
  }

  const tag = node.tagName?.toLowerCase();
  if (REMOVE_TAGS.has(tag)) {
    node.remove();
    return;
  }

  if (node.attributes) {
    // Collect first, then remove: the attribute list must not change while
    // it is being iterated.
    const unwanted = Array.from(node.attributes, (attribute) => attribute.name)
      .filter((attributeName) => !KEEP_ATTRS.has(attributeName));
    unwanted.forEach((attributeName) => node.removeAttribute(attributeName));
  }

  // Iterate over a snapshot because cleaning removes and unwraps children.
  Array.from(node.childNodes).forEach((child) => cleanNode(child));

  if (SEMANTIC_TAGS.has(tag)) return;

  const isHollow = node.children?.length === 0 && !node.textContent?.trim();
  if (isHollow) {
    node.remove();
    return;
  }

  const host = node.parentNode;
  if (!host) return;

  // Unwrap: hoist the children in front of the wrapper, then drop it.
  while (node.firstChild) host.insertBefore(node.firstChild, node);
  node.remove();
}
|
|
68
|
+
|
|
69
|
+
/**
 * Reduce a raw HTML page to a small standalone document that contains only
 * the semantic content of its main region.
 *
 * Elements matching REMOVE_SELECTORS are dropped first, then the content
 * root is located (`main`, `[role='main']`, `#main`, `.main-content`,
 * `#classic-theme`, falling back to `<body>`) and its subtree is cleaned with
 * `cleanNode`. The result is wrapped in a fixed template that carries over
 * the original title and `lang` attribute.
 *
 * @param {string} html - Raw HTML markup.
 * @returns {string} Cleaned HTML document.
 */
export function purge(html) {
  // Title and lang are plain text taken from the page; escape them so they
  // cannot break out of the template they are interpolated into.
  const escapeHtml = (text) =>
    text
      .replace(/&/g, "&amp;")
      .replace(/</g, "&lt;")
      .replace(/>/g, "&gt;")
      .replace(/"/g, "&quot;");

  const { document } = parseHTML(html);

  for (const selector of REMOVE_SELECTORS) {
    // Best effort: a selector that cannot be evaluated is skipped on purpose
    // so one unsupported pattern does not abort the whole clean-up.
    try { for (const el of document.querySelectorAll(selector)) el.remove(); } catch {}
  }

  const main =
    document.querySelector("main") ||
    document.querySelector("[role='main']") ||
    document.querySelector("#main") ||
    document.querySelector(".main-content") ||
    document.querySelector("#classic-theme") ||
    document.body;

  // Clean the root's children rather than the root itself: cleanNode()
  // unwraps non-semantic elements, so a <div> root (e.g. "#main") would have
  // its children moved out and `main.innerHTML` below would be empty.
  for (const child of [...main.childNodes]) cleanNode(child);

  const title = document.querySelector("title")?.textContent?.trim() || "";
  const lang = document.documentElement?.getAttribute("lang") || "en";
  const cleanHtml =
    `<!DOCTYPE html>\n<html lang="${escapeHtml(lang)}">\n<head>\n<meta charset="utf-8">\n<title>${escapeHtml(title)}</title>\n<style>body{max-width:72ch;margin:2rem auto;padding:0 1rem;font:1rem/1.6 monospace}img{max-width:100%;height:auto}</style>\n</head>\n<body>\n${main.innerHTML.trim()}\n</body>\n</html>`;

  // Collapse runs of blank lines and strip trailing whitespace.
  return cleanHtml.replace(/\n{3,}/g, "\n\n").replace(/[ \t]+\n/g, "\n");
}
|
|
92
|
+
|
|
93
|
+
// CLI mode: run standalone on a directory.
// Usage: node purge-html.mjs <rawDir> [cleanDir]
// Every *.html file below <rawDir> is cleaned with purge() and written to the
// same relative path below <cleanDir>. When <cleanDir> is omitted it is
// derived from <rawDir> by replacing the first ".raw" with ".clean".
const isCLI = process.argv[1]?.endsWith("purge-html.mjs");
if (isCLI && process.argv[2]) {
  const RAW_DIR = process.argv[2];
  const CLEAN_DIR = process.argv[3] || RAW_DIR.replace(".raw", ".clean");

  // If <rawDir> contains no ".raw" the derived output equals the input and
  // the raw files get replaced; keep going, but say so.
  if (path.resolve(CLEAN_DIR) === path.resolve(RAW_DIR)) {
    console.error(`Warning: output directory is the input directory (${RAW_DIR}); raw files will be overwritten.`);
  }

  // Recursively list all *.html files below `dir`.
  const collectFiles = (dir) => {
    const found = [];
    for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
      const full = path.join(dir, entry.name);
      if (entry.isDirectory()) found.push(...collectFiles(full));
      else if (entry.name.endsWith(".html")) found.push(full);
    }
    return found;
  };

  const toK = (bytes) => (bytes / 1024).toFixed(0);
  // Size reduction in percent; guards against 0/0 for empty files or an
  // empty directory, which would otherwise print "NaN".
  const reduction = (before, after) =>
    before === 0 ? "0" : ((1 - after / before) * 100).toFixed(0);

  const files = collectFiles(RAW_DIR);
  console.log(`Found ${files.length} HTML files in ${RAW_DIR}`);

  let totalRawBytes = 0;
  let totalCleanBytes = 0;

  for (const file of files) {
    const relPath = path.relative(RAW_DIR, file);
    const outPath = path.join(CLEAN_DIR, relPath);
    fs.mkdirSync(path.dirname(outPath), { recursive: true });

    const raw = fs.readFileSync(file, "utf-8");
    const clean = purge(raw);
    fs.writeFileSync(outPath, clean);

    // Measure encoded size; string length counts UTF-16 units, not bytes.
    const rawBytes = Buffer.byteLength(raw, "utf-8");
    const cleanBytes = Buffer.byteLength(clean, "utf-8");
    totalRawBytes += rawBytes;
    totalCleanBytes += cleanBytes;

    console.log(` ${relPath} ${toK(rawBytes)}K → ${toK(cleanBytes)}K (-${reduction(rawBytes, cleanBytes)}%)`);
  }

  console.log(`\nDone. ${files.length} files cleaned.`);
  console.log(`Total: ${toK(totalRawBytes)}K → ${toK(totalCleanBytes)}K (-${reduction(totalRawBytes, totalCleanBytes)}%)`);
  console.log(`Output: ${CLEAN_DIR}`);
}
|