dembrandt 0.7.2 → 0.8.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +24 -0
- package/index.js +69 -1
- package/lib/colors.js +27 -0
- package/lib/discovery.js +247 -0
- package/lib/display.js +4 -0
- package/lib/extractors.js +25 -2
- package/lib/merger.js +311 -0
- package/package.json +1 -1
package/README.md
CHANGED
|
@@ -54,12 +54,36 @@ dembrandt bmw.de --dark-mode # Extract colors from dark mode variant
|
|
|
54
54
|
dembrandt bmw.de --mobile # Use mobile viewport (390x844, iPhone 12/13/14/15) for responsive analysis
|
|
55
55
|
dembrandt bmw.de --slow # 3x longer timeouts (24s hydration) for JavaScript-heavy sites
|
|
56
56
|
dembrandt bmw.de --brand-guide # Generate a brand guide PDF
|
|
57
|
+
dembrandt bmw.de --pages 5 # Analyze 5 pages (homepage + 4 discovered pages), merges results
|
|
58
|
+
dembrandt bmw.de --sitemap # Discover pages from sitemap.xml instead of DOM links
|
|
59
|
+
dembrandt bmw.de --pages 10 --sitemap # Combine: up to 10 pages discovered via sitemap
|
|
57
60
|
dembrandt bmw.de --no-sandbox # Disable Chromium sandbox (required for Docker/CI)
|
|
58
61
|
dembrandt bmw.de --browser=firefox # Use Firefox instead of Chromium (better for Cloudflare bypass)
|
|
59
62
|
```
|
|
60
63
|
|
|
61
64
|
Default: formatted terminal display only. Use `--save-output` to persist results as JSON files. Browser automatically retries in visible mode if headless extraction fails.
|
|
62
65
|
|
|
66
|
+
### Multi-Page Extraction
|
|
67
|
+
|
|
68
|
+
Analyze multiple pages to get a more complete picture of a site's design system. Results are merged into a single unified output with cross-page confidence boosting — colors appearing on multiple pages get higher confidence scores.
|
|
69
|
+
|
|
70
|
+
```bash
|
|
71
|
+
# Analyze homepage + 4 auto-discovered pages (default: 5 total)
|
|
72
|
+
dembrandt stripe.com --pages 5
|
|
73
|
+
|
|
74
|
+
# Use sitemap.xml for page discovery instead of DOM link scraping
|
|
75
|
+
dembrandt stripe.com --sitemap
|
|
76
|
+
|
|
77
|
+
# Combine both: up to 10 pages from sitemap
|
|
78
|
+
dembrandt stripe.com --pages 10 --sitemap
|
|
79
|
+
```
|
|
80
|
+
|
|
81
|
+
**Page discovery** works two ways:
|
|
82
|
+
- **DOM links** (default): Scrapes navigation, header, and footer links from the homepage, prioritizing key pages like /pricing, /about, /features
|
|
83
|
+
- **Sitemap** (`--sitemap`): Parses sitemap.xml (checks robots.txt first), follows sitemapindex references, and scores URLs by importance
|
|
84
|
+
|
|
85
|
+
Pages are crawled sequentially with polite delays. Failed pages are skipped without aborting the run.
|
|
86
|
+
|
|
63
87
|
### Browser Selection
|
|
64
88
|
|
|
65
89
|
By default, dembrandt uses Chromium. If you encounter bot detection or timeouts (especially on sites behind Cloudflare), try Firefox which is often more successful at bypassing these protections:
|
package/index.js
CHANGED
|
@@ -15,13 +15,15 @@ import { extractBranding } from "./lib/extractors.js";
|
|
|
15
15
|
import { displayResults } from "./lib/display.js";
|
|
16
16
|
import { toW3CFormat } from "./lib/w3c-exporter.js";
|
|
17
17
|
import { generatePDF } from "./lib/pdf.js";
|
|
18
|
+
import { parseSitemap } from "./lib/discovery.js";
|
|
19
|
+
import { mergeResults } from "./lib/merger.js";
|
|
18
20
|
import { writeFileSync, mkdirSync } from "fs";
|
|
19
21
|
import { join } from "path";
|
|
20
22
|
|
|
21
23
|
program
|
|
22
24
|
.name("dembrandt")
|
|
23
25
|
.description("Extract design tokens from any website")
|
|
24
|
-
.version("0.7.
|
|
26
|
+
.version("0.7.3")
|
|
25
27
|
.argument("<url>")
|
|
26
28
|
.option("--browser <type>", "Browser to use (chromium|firefox)", "chromium")
|
|
27
29
|
.option("--json-only", "Output raw JSON")
|
|
@@ -34,6 +36,12 @@ program
|
|
|
34
36
|
.option("--no-sandbox", "Disable browser sandbox (needed for Docker/CI)")
|
|
35
37
|
.option("--raw-colors", "Include pre-filter raw colors in JSON output")
|
|
36
38
|
.option("--screenshot <path>", "Save a screenshot of the page")
|
|
39
|
+
.option("--pages <n>", "Analyze up to N total pages including start URL (default: 5)", (v) => {
|
|
40
|
+
const n = parseInt(v, 10);
|
|
41
|
+
if (isNaN(n) || n < 1) throw new Error(`--pages must be a positive integer, got: ${v}`);
|
|
42
|
+
return n;
|
|
43
|
+
})
|
|
44
|
+
.option("--sitemap", "Discover pages from sitemap.xml instead of DOM links")
|
|
37
45
|
.action(async (input, opts) => {
|
|
38
46
|
let url = input;
|
|
39
47
|
if (!url.startsWith("http://") && !url.startsWith("https://")) {
|
|
@@ -67,13 +75,73 @@ program
|
|
|
67
75
|
});
|
|
68
76
|
|
|
69
77
|
try {
|
|
78
|
+
const isMultiPage = opts.pages || opts.sitemap;
|
|
79
|
+
const maxPages = (opts.pages || 5) - 1; // -1 because homepage counts
|
|
70
80
|
result = await extractBranding(url, spinner, browser, {
|
|
71
81
|
navigationTimeout: 90000,
|
|
72
82
|
darkMode: opts.darkMode,
|
|
73
83
|
mobile: opts.mobile,
|
|
74
84
|
slow: opts.slow,
|
|
75
85
|
screenshotPath: opts.screenshot,
|
|
86
|
+
discoverLinks: isMultiPage && !opts.sitemap ? maxPages : null,
|
|
76
87
|
});
|
|
88
|
+
|
|
89
|
+
// Multi-page crawl
|
|
90
|
+
if (isMultiPage && maxPages > 0) {
|
|
91
|
+
spinner.start("Discovering pages...");
|
|
92
|
+
|
|
93
|
+
let additionalUrls;
|
|
94
|
+
if (opts.sitemap) {
|
|
95
|
+
// Try post-redirect URL first, fall back to user-provided URL
|
|
96
|
+
// (sites like spotify.com redirect browser to open.spotify.com
|
|
97
|
+
// but sitemap lives at www.spotify.com)
|
|
98
|
+
additionalUrls = await parseSitemap(result.url, maxPages);
|
|
99
|
+
if (additionalUrls.length === 0 && result.url !== url) {
|
|
100
|
+
additionalUrls = await parseSitemap(url, maxPages);
|
|
101
|
+
}
|
|
102
|
+
} else {
|
|
103
|
+
additionalUrls = result._discoveredLinks || [];
|
|
104
|
+
}
|
|
105
|
+
|
|
106
|
+
delete result._discoveredLinks;
|
|
107
|
+
|
|
108
|
+
if (additionalUrls.length === 0) {
|
|
109
|
+
spinner.warn("No additional pages discovered");
|
|
110
|
+
} else {
|
|
111
|
+
spinner.stop();
|
|
112
|
+
console.log(chalk.dim(` Found ${additionalUrls.length} page(s) to analyze`));
|
|
113
|
+
|
|
114
|
+
const allResults = [result];
|
|
115
|
+
for (let i = 0; i < additionalUrls.length; i++) {
|
|
116
|
+
const pageUrl = additionalUrls[i];
|
|
117
|
+
const pageNum = i + 2;
|
|
118
|
+
const total = additionalUrls.length + 1;
|
|
119
|
+
spinner.start(`Extracting page ${pageNum}/${total}: ${new URL(pageUrl).pathname}`);
|
|
120
|
+
|
|
121
|
+
// Polite delay between pages
|
|
122
|
+
await new Promise(r => setTimeout(r, 2000 + Math.random() * 3000));
|
|
123
|
+
|
|
124
|
+
try {
|
|
125
|
+
const pageResult = await extractBranding(pageUrl, spinner, browser, {
|
|
126
|
+
navigationTimeout: 90000,
|
|
127
|
+
darkMode: opts.darkMode,
|
|
128
|
+
mobile: opts.mobile,
|
|
129
|
+
slow: opts.slow,
|
|
130
|
+
});
|
|
131
|
+
delete pageResult._discoveredLinks;
|
|
132
|
+
allResults.push(pageResult);
|
|
133
|
+
} catch (err) {
|
|
134
|
+
spinner.warn(`Skipping ${pageUrl}: ${String(err?.message || err).slice(0, 80)}`);
|
|
135
|
+
}
|
|
136
|
+
}
|
|
137
|
+
|
|
138
|
+
spinner.stop();
|
|
139
|
+
result = mergeResults(allResults);
|
|
140
|
+
}
|
|
141
|
+
} else {
|
|
142
|
+
delete result._discoveredLinks;
|
|
143
|
+
}
|
|
144
|
+
|
|
77
145
|
break;
|
|
78
146
|
} catch (err) {
|
|
79
147
|
await browser.close();
|
package/lib/colors.js
CHANGED
|
@@ -159,6 +159,33 @@ export function formatOklch(oklch, alpha) {
|
|
|
159
159
|
return `oklch(${l}% ${c} ${h})`;
|
|
160
160
|
}
|
|
161
161
|
|
|
162
|
+
/**
|
|
163
|
+
* Compute CIE76 delta-E perceptual distance between two hex colors.
|
|
164
|
+
* Returns 0 for identical colors, ~100 for maximally different.
|
|
165
|
+
* @param {string} hex1 - Hex color string (e.g. "#ff0000")
|
|
166
|
+
* @param {string} hex2 - Hex color string
|
|
167
|
+
* @returns {number}
|
|
168
|
+
*/
|
|
169
|
+
export function deltaE(hex1, hex2) {
|
|
170
|
+
function toLab(hex) {
|
|
171
|
+
const rgb = hexToRgb(hex);
|
|
172
|
+
if (!rgb) return null;
|
|
173
|
+
const lr = srgbToLinear(rgb.r);
|
|
174
|
+
const lg = srgbToLinear(rgb.g);
|
|
175
|
+
const lb = srgbToLinear(rgb.b);
|
|
176
|
+
const xyz = linearRgbToXyz(lr, lg, lb);
|
|
177
|
+
return xyzToLab(xyz.x, xyz.y, xyz.z);
|
|
178
|
+
}
|
|
179
|
+
const lab1 = toLab(hex1);
|
|
180
|
+
const lab2 = toLab(hex2);
|
|
181
|
+
if (!lab1 || !lab2) return 999;
|
|
182
|
+
return Math.sqrt(
|
|
183
|
+
Math.pow(lab1.l - lab2.l, 2) +
|
|
184
|
+
Math.pow(lab1.a - lab2.a, 2) +
|
|
185
|
+
Math.pow(lab1.b - lab2.b, 2)
|
|
186
|
+
);
|
|
187
|
+
}
|
|
188
|
+
|
|
162
189
|
/**
|
|
163
190
|
* Parse a hex color string and return RGB values
|
|
164
191
|
* @param {string} hex - Hex color (#fff, #ffffff, #ffffffaa)
|
package/lib/discovery.js
ADDED
|
@@ -0,0 +1,247 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Page Discovery
|
|
3
|
+
*
|
|
4
|
+
* Discovers internal pages from a starting URL via DOM link extraction
|
|
5
|
+
* or sitemap.xml parsing.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/**
 * Score a URL path for importance (higher = more important).
 * Prioritizes shallow, navigational paths over blog posts / legal pages.
 * @param {string} pathname - URL path, e.g. "/pricing"
 * @returns {number} importance score, or -1 when the path is excluded noise
 */
export function scoreUrl(pathname) {
  const lc = pathname.toLowerCase();

  // Exclude noise: auth, legal, CMS internals, binary assets, tag/author
  // archives, pagination, and (defensively) query strings.
  const exclude = [
    /\/(login|signin|sign-in|signup|sign-up|register|auth)/,
    /\/(terms|privacy|legal|cookie|gdpr|tos)/,
    /\/(cdn-cgi|wp-admin|wp-content|wp-json)/,
    /\.(pdf|jpg|jpeg|png|gif|svg|webp|zip|tar|gz|mp4|mp3)$/,
    /\/tag\//,
    /\/author\//,
    /\/page\/\d+/,
    /\?/, // query strings (shouldn't appear here but safety)
  ];
  if (exclude.some(r => r.test(lc))) return -1;

  let score = 100;

  // Prefer shallow paths: each path segment costs 10 points.
  const depth = (pathname.match(/\//g) || []).length;
  score -= depth * 10;

  // Boost key navigational pages; only the first matching rule applies.
  const boosts = [
    [/^\/(pricing|plans?|cost)/, 30],
    [/^\/(about|company|team|story)/, 25],
    [/^\/(product|features?|solutions?)/, 25],
    [/^\/(enterprise|business|platform)/, 20],
    [/^\/(contact|demo|trial|start)/, 20],
    [/^\/(docs|documentation|developers?|api)/, 15],
    [/^\/(blog|resources|insights|news)/, 10],
  ];
  for (const [re, boost] of boosts) {
    if (re.test(lc)) { score += boost; break; }
  }

  return score;
}

/**
 * Discover internal links from an already-loaded Playwright page.
 * Call after extractBranding() has loaded and scrolled the page.
 *
 * @param {import('playwright-core').Page} page
 * @param {string} baseUrl - The starting URL (used to determine same-origin)
 * @param {number} maxPages - Maximum number of URLs to return
 * @returns {Promise<string[]>} Ordered list of URLs to crawl (excluding homepage)
 */
export async function discoverLinks(page, baseUrl, maxPages) {
  const origin = new URL(baseUrl).origin;

  // Runs in the browser: collect same-origin links, preferring those found
  // in nav/header/footer containers (locationScore 2) over the rest (1).
  const links = await page.evaluate((origin) => {
    const results = [];
    const navSelectors = [
      'nav a', 'header a', 'footer a',
      '[role="navigation"] a', '[aria-label*="nav"] a',
      '[data-nav] a', '.nav a', '.navbar a', '.header a', '.menu a',
    ];

    const scored = new Map(); // pathname → { href, locationScore }

    function addLinks(selector, locationScore) {
      document.querySelectorAll(selector).forEach(a => {
        try {
          const url = new URL(a.href);
          if (url.origin !== origin) return;
          // Normalize trailing slash so /about and /about/ collapse.
          const pathname = url.pathname.replace(/\/$/, '') || '/';
          if (!scored.has(pathname) || scored.get(pathname).locationScore < locationScore) {
            scored.set(pathname, { href: url.origin + pathname, locationScore });
          }
        } catch {}
      });
    }

    navSelectors.forEach(sel => addLinks(sel, 2));
    addLinks('a[href]', 1);

    scored.forEach(({ href, locationScore }, pathname) => {
      results.push({ href, pathname, locationScore });
    });

    return results;
  }, origin);

  // Remove the homepage itself — it has already been extracted.
  const homepagePath = new URL(baseUrl).pathname.replace(/\/$/, '') || '/';

  // FIX: drop excluded paths (scoreUrl() === -1) BEFORE adding the location
  // bonus. Previously `-1 + locationScore * 5` was >= 0, so login/legal/asset
  // pages slipped past the `score >= 0` filter.
  const ranked = links
    .filter(l => l.pathname !== homepagePath)
    .map(l => ({ ...l, score: scoreUrl(l.pathname) }))
    .filter(l => l.score >= 0)
    .map(l => ({ ...l, score: l.score + l.locationScore * 5 }))
    .sort((a, b) => b.score - a.score);

  return ranked.slice(0, maxPages).map(l => l.href);
}
|
|
108
|
+
|
|
109
|
+
// Shared fetch options: identify ourselves with a stable UA string.
const FETCH_OPTS = {
  headers: { 'User-Agent': 'Mozilla/5.0 (compatible; Dembrandt/1.0)' },
};

/**
 * Build the list of candidate sitemap URLs for an origin.
 * robots.txt "Sitemap:" directives take precedence; when none are found
 * (or robots.txt is unreachable), well-known default paths are returned.
 * @param {string} origin - e.g. "https://example.com"
 * @returns {Promise<string[]>} candidate sitemap URLs, best first
 */
async function findSitemapUrls(origin) {
  try {
    const res = await fetch(`${origin}/robots.txt`, {
      ...FETCH_OPTS,
      signal: AbortSignal.timeout(5000),
    });
    if (res.ok) {
      const body = await res.text();
      const declared = [...body.matchAll(/^Sitemap:\s*(.+)$/gmi)]
        .map((m) => m[1].trim())
        .filter((u) => u.startsWith('http'));
      if (declared.length > 0) return declared;
    }
  } catch {
    // robots.txt unavailable — fall through to the defaults below.
  }

  return [
    '/sitemap.xml',
    '/sitemap_index.xml',
    '/sitemap/sitemap-index.xml',
  ].map((path) => origin + path);
}
|
|
140
|
+
|
|
141
|
+
/**
 * Fetch one sitemap URL.
 * @param {string} url
 * @returns {Promise<string>} the response body when it looks like XML,
 *   otherwise '' (non-2xx status, timeout, network error, or non-markup body)
 */
async function fetchSitemap(url) {
  try {
    const res = await fetch(url, { ...FETCH_OPTS, signal: AbortSignal.timeout(10000) });
    if (!res.ok) return '';
    const body = await res.text();
    // Cheap sanity check: real sitemaps always contain markup.
    if (!body.includes('<')) return '';
    return body;
  } catch {
    return '';
  }
}
|
|
157
|
+
|
|
158
|
+
/**
 * Discover pages from a site's sitemap.xml.
 * Checks robots.txt for Sitemap directives, then tries common paths.
 * Follows sitemapindex references up to two levels deep (index → child
 * index → page sitemap), which covers the deepest hierarchies seen in
 * the wild.
 *
 * @param {string} baseUrl - The starting URL (should be post-redirect)
 * @param {number} maxPages - Maximum number of URLs to return
 * @returns {Promise<string[]>} List of URLs from sitemap (excluding homepage),
 *   highest scoreUrl() first; [] when no sitemap could be fetched
 */
export async function parseSitemap(baseUrl, maxPages) {
  const base = new URL(baseUrl);

  // Find and fetch sitemap(s): stop at the first candidate that returns XML.
  const candidates = await findSitemapUrls(base.origin);
  let xml = '';
  for (const candidate of candidates) {
    xml = await fetchSitemap(candidate);
    if (xml) break;
  }
  if (!xml) return [];

  // Also accept www/non-www variants of the same domain, since sitemaps
  // frequently list the other variant than the one the user typed.
  const acceptedHostnames = new Set([base.hostname]);
  if (base.hostname.startsWith('www.')) {
    acceptedHostnames.add(base.hostname.slice(4));
  } else {
    acceptedHostnames.add('www.' + base.hostname);
  }

  // Expand sitemapindex references up to two levels deep
  let allXml = xml;
  if (xml.includes('<sitemapindex')) {
    // Pull <loc> URLs out of a sitemap document, keeping only accepted hosts.
    const extractLocs = (text) => [...text.matchAll(/<loc>\s*(https?:\/\/[^<]+)\s*<\/loc>/g)]
      .map(m => m[1].trim())
      .filter(u => { try { return acceptedHostnames.has(new URL(u).hostname); } catch { return false; } });

    // Cap child fetches at 10 to bound the number of network round-trips.
    const childUrls = extractLocs(xml).slice(0, 10);
    const childXmls = await Promise.all(childUrls.map(u => fetchSitemap(u)));

    // Check if children are also sitemapindex (e.g. Spotify's 3-level hierarchy)
    const grandchildFetches = [];
    const pageXmls = [];
    for (const childXml of childXmls) {
      if (!childXml) continue;
      if (childXml.includes('<sitemapindex')) {
        // Second-level index: take only the first 3 grandchildren per child.
        const gcUrls = extractLocs(childXml).slice(0, 3);
        grandchildFetches.push(...gcUrls.map(u => fetchSitemap(u)));
      } else {
        pageXmls.push(childXml);
      }
    }
    if (grandchildFetches.length > 0) {
      const gcXmls = await Promise.all(grandchildFetches);
      pageXmls.push(...gcXmls);
    }
    // Concatenate all page-level sitemaps so one regex pass below sees them all.
    allXml = pageXmls.join('\n');
  }

  // Trailing-slash-normalized homepage path, used to drop the start page.
  const homepagePath = base.pathname.replace(/\/$/, '') || '/';

  // Collect candidate page URLs, filter to accepted hosts, and normalize
  // each to origin + trailing-slash-free path (drops query/hash too).
  const urls = [...allXml.matchAll(/<loc>\s*(https?:\/\/[^<]+)\s*<\/loc>/g)]
    .map(m => m[1].trim())
    .filter(u => {
      try { return acceptedHostnames.has(new URL(u).hostname); }
      catch { return false; }
    })
    .map(u => {
      const p = new URL(u);
      return p.origin + (p.pathname.replace(/\/$/, '') || '/');
    });

  // Deduplicate (normalization above can collapse several entries into one)
  const seen = new Set();
  const deduped = [];
  for (const u of urls) {
    const path = new URL(u).pathname;
    if (path === homepagePath) continue;
    if (seen.has(u)) continue;
    seen.add(u);
    deduped.push(u);
  }

  // Score and sort: scoreUrl() rejects noise (-1) and ranks the rest.
  const scored = deduped
    .map(u => ({ href: u, score: scoreUrl(new URL(u).pathname) }))
    .filter(l => l.score >= 0)
    .sort((a, b) => b.score - a.score);

  return scored.slice(0, maxPages).map(l => l.href);
}
|
package/lib/display.js
CHANGED
|
@@ -33,6 +33,10 @@ export function displayResults(data) {
|
|
|
33
33
|
second: '2-digit'
|
|
34
34
|
});
|
|
35
35
|
console.log(chalk.dim('├─') + ' ' + chalk.dim(timeString));
|
|
36
|
+
if (data.pages && data.pages.length > 1) {
|
|
37
|
+
const paths = data.pages.map(p => new URL(p.url).pathname || '/').join(', ');
|
|
38
|
+
console.log(chalk.dim('├─') + ' ' + chalk.dim(`${data.pages.length} pages: ${paths}`));
|
|
39
|
+
}
|
|
36
40
|
console.log(chalk.dim('│'));
|
|
37
41
|
|
|
38
42
|
displayLogo(data.logo);
|
package/lib/extractors.js
CHANGED
|
@@ -7,6 +7,7 @@
|
|
|
7
7
|
|
|
8
8
|
import chalk from "chalk";
|
|
9
9
|
import { convertColor } from "./colors.js";
|
|
10
|
+
import { discoverLinks } from "./discovery.js";
|
|
10
11
|
|
|
11
12
|
/**
|
|
12
13
|
* Main extraction function - orchestrates the entire brand analysis process
|
|
@@ -157,9 +158,22 @@ export async function extractBranding(
|
|
|
157
158
|
300 + Math.random() * 400,
|
|
158
159
|
200 + Math.random() * 300
|
|
159
160
|
);
|
|
160
|
-
|
|
161
|
+
// Scroll through entire page to trigger lazy-loaded content
|
|
162
|
+
await page.evaluate(async () => {
|
|
163
|
+
const delay = (ms) => new Promise(r => setTimeout(r, ms));
|
|
164
|
+
const scrollStep = 600;
|
|
165
|
+
const maxHeight = Math.min(document.body.scrollHeight, 30000);
|
|
166
|
+
let y = 0;
|
|
167
|
+
while (y < maxHeight) {
|
|
168
|
+
y = Math.min(y + scrollStep, maxHeight);
|
|
169
|
+
window.scrollTo(0, y);
|
|
170
|
+
await delay(150 + Math.random() * 100);
|
|
171
|
+
}
|
|
172
|
+
// Scroll back to top
|
|
173
|
+
window.scrollTo(0, 0);
|
|
174
|
+
});
|
|
161
175
|
spinner.stop();
|
|
162
|
-
console.log(chalk.hex('#50FA7B')(` ✓
|
|
176
|
+
console.log(chalk.hex('#50FA7B')(` ✓ Full page scrolled (lazy content triggered)`));
|
|
163
177
|
|
|
164
178
|
// Final hydration wait
|
|
165
179
|
spinner.start("Final content stabilization...");
|
|
@@ -641,6 +655,15 @@ export async function extractBranding(
|
|
|
641
655
|
result.colors.rawColors = colors._raw || [];
|
|
642
656
|
}
|
|
643
657
|
|
|
658
|
+
// Discover internal links for multi-page extraction
|
|
659
|
+
if (options.discoverLinks) {
|
|
660
|
+
try {
|
|
661
|
+
result._discoveredLinks = await discoverLinks(page, page.url(), options.discoverLinks);
|
|
662
|
+
} catch {
|
|
663
|
+
result._discoveredLinks = [];
|
|
664
|
+
}
|
|
665
|
+
}
|
|
666
|
+
|
|
644
667
|
return result;
|
|
645
668
|
} catch (error) {
|
|
646
669
|
spinner.fail("Extraction failed");
|
package/lib/merger.js
ADDED
|
@@ -0,0 +1,311 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Multi-Page Result Merger
|
|
3
|
+
*
|
|
4
|
+
* Merges extraction results from multiple pages into a single
|
|
5
|
+
* unified result that is a superset of the single-page result: all single-page
|
|
6
|
+
* fields are preserved, with additional multi-page metadata (pages array,
|
|
7
|
+
* pageCount on palette entries) added.
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import { deltaE } from './colors.js';
|
|
11
|
+
|
|
12
|
+
// Colors closer than this CIE76 delta-E are treated as the same color
// when clustering palette entries across pages.
const DELTA_E_THRESHOLD = 15;

// Merge the colors section across pages: perceptually cluster palette
// entries (summing counts, tracking how many pages each cluster appears
// on), keep homepage semantic colors (filling gaps from other pages),
// and union CSS variables (first occurrence wins).
function mergeColors(results) {
  const base = results[0].colors;

  // Pool all palette entries with their source page index
  const allColors = [];
  results.forEach((r, pageIdx) => {
    (r.colors?.palette || []).forEach(c => {
      allColors.push({ ...c, _pageIdx: pageIdx });
    });
  });

  // Perceptual dedup across all pages: greedy clustering — each unused
  // color seeds a cluster and absorbs all later colors within threshold.
  const merged = [];
  const used = new Set();

  for (let i = 0; i < allColors.length; i++) {
    if (used.has(i)) continue;

    const c = allColors[i];
    const similar = [c];
    const pagesSeen = new Set([c._pageIdx]);

    for (let j = i + 1; j < allColors.length; j++) {
      if (used.has(j)) continue;
      try {
        if (deltaE(c.normalized, allColors[j].normalized) < DELTA_E_THRESHOLD) {
          similar.push(allColors[j]);
          pagesSeen.add(allColors[j]._pageIdx);
          used.add(j);
        }
      } catch { /* skip unparseable colors */ }
    }
    used.add(i);

    // Keep variant with highest count as canonical
    const best = similar.sort((a, b) => b.count - a.count)[0];
    const totalCount = similar.reduce((s, x) => s + (x.count || 0), 0);
    const pageCount = pagesSeen.size;

    // Boost confidence when color appears on multiple pages.
    // Note the two checks chain: a 'low' color seen on 3+ pages is first
    // promoted to 'medium' and then to 'high'.
    let confidence = best.confidence;
    if (pageCount > 1 && confidence === 'low') confidence = 'medium';
    if (pageCount > 2 && confidence === 'medium') confidence = 'high';

    // Strip the internal page-index marker before emitting.
    const { _pageIdx, ...clean } = best;
    merged.push({ ...clean, count: totalCount, confidence, pageCount });
  }

  // Semantic: homepage wins, fill missing from other pages
  const semantic = { ...base.semantic };
  for (let i = 1; i < results.length; i++) {
    const s = results[i].colors?.semantic || {};
    for (const [k, v] of Object.entries(s)) {
      if (!semantic[k] && v) semantic[k] = v;
    }
  }

  // CSS variables: union, first occurrence wins
  const cssVariables = {};
  results.forEach(r => {
    const vars = r.colors?.cssVariables || {};
    for (const [k, v] of Object.entries(vars)) {
      if (!(k in cssVariables)) cssVariables[k] = v;
    }
  });

  return { ...base, semantic, palette: merged, cssVariables };
}
|
|
82
|
+
|
|
83
|
+
/**
 * Merge the typography section across pages.
 * Styles are deduped by (family, size, weight) and their counts are summed;
 * sources are unioned (arrays merged set-wise, scalars: homepage wins).
 * @param {Object[]} results - per-page extraction results
 * @returns {Object} merged typography section, styles sorted by size desc
 */
function mergeTypography(results) {
  const base = results[0].typography || {};

  // Dedup styles by (family, size, weight) tuple, sum counts.
  // FIX: previously the style's own count was discarded (`count: 1` /
  // `count++` counted occurrences instead of summing). `?? 1` keeps the
  // old occurrence-counting behavior for styles that carry no count.
  const styleMap = new Map();
  results.forEach(r => {
    (r.typography?.styles || []).forEach(s => {
      const key = `${s.family}|${s.size}|${s.weight}`;
      if (!styleMap.has(key)) {
        styleMap.set(key, { ...s, count: s.count ?? 1 });
      } else {
        styleMap.get(key).count += s.count ?? 1;
      }
    });
  });

  // Merge sources: array-valued entries are unioned, scalar entries keep
  // the first (homepage) value.
  const sources = { ...(base.sources || {}) };
  results.slice(1).forEach(r => {
    const s = r.typography?.sources || {};
    for (const [k, v] of Object.entries(s)) {
      if (Array.isArray(v)) {
        sources[k] = [...new Set([...(sources[k] || []), ...v])];
      } else if (v && !sources[k]) {
        sources[k] = v;
      }
    }
  });

  const styles = [...styleMap.values()].sort((a, b) => parseFloat(b.size) - parseFloat(a.size));
  return { ...base, styles, sources };
}
|
|
115
|
+
|
|
116
|
+
/**
 * Fingerprint a component by its visual properties, so visually identical
 * components from different pages collapse into one entry.
 * Buttons and links keep their props under states.default; badges are flat.
 * @param {Object} c - component descriptor
 * @returns {string} pipe-joined key of the six visual properties
 */
function fingerprintComponent(c) {
  const src = c.states?.default || c;
  const visualProps = ['backgroundColor', 'color', 'borderRadius', 'fontSize', 'fontWeight', 'border'];
  return visualProps.map((p) => src[p] || '').join('|');
}
|
|
127
|
+
|
|
128
|
+
/**
 * Flatten per-page component arrays and collapse visually identical entries
 * (same fingerprint), summing their counts.
 * @param {Object[][]} arrays - one component array per page
 * @returns {Object[]} merged components, most frequent first
 */
function mergeComponentArray(arrays) {
  const byFingerprint = new Map();
  for (const item of arrays.flat()) {
    const key = fingerprintComponent(item);
    const existing = byFingerprint.get(key);
    if (existing) {
      existing.count += item.count || 1;
    } else {
      byFingerprint.set(key, { ...item, count: item.count || 1 });
    }
  }
  return [...byFingerprint.values()].sort((a, b) => b.count - a.count);
}
|
|
140
|
+
|
|
141
|
+
/**
 * Merge grouped component objects (inputs: {text, checkbox, …}, badges:
 * {all, byVariant: {…}}) across pages. Grouping keys are preserved and
 * each sub-array is merged independently via mergeComponentArray.
 * @param {Object[]} groupObjects - one group object per page (nullish skipped)
 * @returns {Object} merged group object
 */
function mergeComponentGroups(groupObjects) {
  // Phase 1: collect, per key (and one level of sub-key), the per-page arrays.
  const collected = {};
  for (const obj of groupObjects) {
    if (!obj) continue;
    for (const [key, val] of Object.entries(obj)) {
      if (Array.isArray(val)) {
        if (!collected[key]) collected[key] = [];
        collected[key].push(val);
      } else if (typeof val === 'object' && val !== null) {
        // Nested groups like byVariant: { error: [], warning: [], ... }
        if (!collected[key]) collected[key] = {};
        const bucket = collected[key];
        for (const [subKey, subArr] of Object.entries(val)) {
          if (Array.isArray(subArr)) {
            if (!bucket[subKey]) bucket[subKey] = [];
            bucket[subKey].push(subArr);
          }
        }
      }
    }
  }

  // Phase 2: merge each collected list of arrays.
  const merged = {};
  for (const [key, val] of Object.entries(collected)) {
    if (Array.isArray(val)) {
      merged[key] = mergeComponentArray(val);
    } else {
      merged[key] = {};
      for (const [subKey, subArrays] of Object.entries(val)) {
        merged[key][subKey] = mergeComponentArray(subArrays);
      }
    }
  }
  return merged;
}
|
|
180
|
+
|
|
181
|
+
/**
 * Merge the components section across pages. Flat arrays (buttons, links)
 * and grouped objects (inputs, badges) each use their own merge strategy.
 * @param {Object[]} results - per-page extraction results
 * @returns {{buttons: Object[], inputs: Object, links: Object[], badges: Object}}
 */
function mergeComponents(results) {
  const flat = (pick) =>
    mergeComponentArray(results.map((r) => pick(r.components) || []));
  const grouped = (pick) =>
    mergeComponentGroups(results.map((r) => pick(r.components)).filter(Boolean));

  return {
    buttons: flat((c) => c?.buttons),
    inputs: grouped((c) => c?.inputs),
    links: flat((c) => c?.links),
    badges: grouped((c) => c?.badges),
  };
}
|
|
189
|
+
|
|
190
|
+
/**
 * Generic cross-page merge for arrays of {value-ish, count?, frequency?}
 * entries, keyed by `valueKey`. Counts and frequencies are summed; output
 * is sorted by count (falling back to frequency) descending.
 * @param {Object[]} results - per-page extraction results
 * @param {(r: Object) => Object[]|undefined} getter - selects the array from a result
 * @param {string} [valueKey='value'] - property used as the dedup key
 * @returns {Object[]} merged entries, most frequent first
 */
function mergeValueArrays(results, getter, valueKey = 'value') {
  const map = new Map();
  for (const r of results) {
    for (const item of getter(r) || []) {
      const key = item[valueKey];
      const existing = map.get(key);
      if (!existing) {
        map.set(key, { ...item });
      } else {
        // FIX: treat a countless entry as 1 occurrence on BOTH sides.
        // Previously the stored side used `|| 0`, so two countless
        // duplicates summed to 1 instead of 2. `??` also preserves an
        // explicit count of 0.
        existing.count = (existing.count ?? 1) + (item.count ?? 1);
        // FIX: only touch frequency when either side actually has one —
        // the old code injected a spurious `frequency: 0` field.
        if (existing.frequency !== undefined || item.frequency !== undefined) {
          existing.frequency = (existing.frequency || 0) + (item.frequency || 0);
        }
      }
    }
  }
  return [...map.values()].sort(
    (a, b) => (b.count || b.frequency || 0) - (a.count || a.frequency || 0)
  );
}
|
|
206
|
+
|
|
207
|
+
/**
 * Merge spacing data: keep homepage metadata and aggregate commonValues
 * (keyed by their `px` field) across all pages.
 * @param {Object[]} results - per-page extraction results
 * @returns {Object} merged spacing section
 */
function mergeSpacing(results) {
  const [home] = results;
  return {
    ...(home.spacing || {}),
    commonValues: mergeValueArrays(results, (r) => r.spacing?.commonValues, 'px'),
  };
}
|
|
212
|
+
|
|
213
|
+
/**
 * Merge border-radius data: aggregate values across pages and upgrade each
 * value's confidence based on its aggregated count.
 * @param {Object[]} results - per-page extraction results
 * @returns {Object} merged borderRadius section
 */
function mergeBorderRadius(results) {
  const merged = mergeValueArrays(results, (r) => r.borderRadius?.values);
  // Confidence is recomputed because counts grew during merging.
  merged.forEach((v) => {
    if (v.count > 10) {
      v.confidence = 'high';
    } else if (v.count > 3) {
      v.confidence = 'medium';
    }
  });
  return { ...(results[0].borderRadius || {}), values: merged };
}
|
|
223
|
+
|
|
224
|
+
/**
 * Merge border combinations across pages, keyed by width|style|color.
 * Counts are summed, example-element lists are unioned (capped at 5),
 * and confidence is upgraded as aggregated counts grow.
 * @param {Object[]} results - per-page extraction results
 * @returns {{combinations: Object[]}} merged borders, most frequent first
 */
function mergeBorders(results) {
  const map = new Map();
  results.forEach(r => {
    (r.borders?.combinations || []).forEach(item => {
      const key = `${item.width}|${item.style}|${item.color}`;
      if (!map.has(key)) {
        // FIX: default count to 1 on first insert — otherwise a countless
        // item left `count` undefined and the later `+=` produced NaN,
        // breaking both the confidence thresholds and the final sort.
        map.set(key, { ...item, count: item.count || 1, elements: [...(item.elements || [])] });
      } else {
        const e = map.get(key);
        e.count += (item.count || 1);
        // Union example elements; keep at most 5 for display.
        const elementSet = new Set([...(e.elements || []), ...(item.elements || [])]);
        e.elements = [...elementSet].slice(0, 5);
        if (e.count > 10) e.confidence = 'high';
        else if (e.count > 3) e.confidence = 'medium';
      }
    });
  });
  return {
    combinations: [...map.values()].sort((a, b) => b.count - a.count),
  };
}
|
|
245
|
+
|
|
246
|
+
/**
 * Merge shadow entries across pages. Entries are keyed by their shadow
 * string (falling back to value, then a JSON snapshot), counts are summed,
 * and confidence is recomputed from the aggregated count.
 * @param {Object[]} results - per-page extraction results
 * @returns {Object[]} merged shadows, most frequent first
 */
function mergeShadows(results) {
  const byKey = new Map();
  for (const r of results) {
    for (const s of r.shadows || []) {
      const key = s.shadow || s.value || JSON.stringify(s);
      const existing = byKey.get(key);
      if (existing) {
        existing.count += s.count || 1;
      } else {
        byKey.set(key, { ...s, count: s.count || 1 });
      }
    }
  }

  const merged = [...byKey.values()];
  // Recompute confidence from the aggregated count.
  for (const s of merged) {
    if (s.count > 10) {
      s.confidence = 'high';
    } else if (s.count > 3) {
      s.confidence = 'medium';
    }
  }
  return merged.sort((a, b) => b.count - a.count);
}
|
|
265
|
+
|
|
266
|
+
/**
 * Union arrays of named items across pages, keeping the first occurrence of
 * each identity (name, falling back to library, then a JSON snapshot).
 * @param {Object[]} results - per-page extraction results
 * @param {(r: Object) => Object[]|undefined} getter - selects the array from a result
 * @returns {Object[]} deduplicated items in first-seen order
 */
function mergeByName(results, getter) {
  const seen = new Set();
  const merged = [];
  for (const r of results) {
    for (const item of getter(r) || []) {
      const identity = item.name || item.library || JSON.stringify(item);
      if (seen.has(identity)) continue;
      seen.add(identity);
      merged.push(item);
    }
  }
  return merged;
}
|
|
277
|
+
|
|
278
|
+
/**
 * Merge an array of per-page result objects into a single unified result.
 * The homepage (results[0]) supplies identity fields (url, siteName, logo,
 * favicons); every token category is aggregated across all pages, and a
 * `pages` array records provenance for display.
 * @param {Object[]} results - Array of extractBranding() result objects
 * @returns {Object} Merged result with the same shape as a single-page result
 * @throws {Error} when results is missing or empty
 */
export function mergeResults(results) {
  if (!results || results.length === 0) throw new Error('No results to merge');
  if (results.length === 1) return results[0];

  const home = results[0];

  // Breakpoints: dedupe by px across pages (later pages win the Map slot),
  // widest first. FIX: parseInt now passes an explicit radix.
  const breakpoints = [
    ...new Map(
      results.flatMap(r => r.breakpoints || []).map(b => [b.px, b])
    ).values()
  ].sort((a, b) => parseInt(b.px, 10) - parseInt(a.px, 10));

  return {
    url: home.url,
    extractedAt: home.extractedAt,
    siteName: home.siteName,
    logo: home.logo,
    favicons: home.favicons,
    colors: mergeColors(results),
    typography: mergeTypography(results),
    spacing: mergeSpacing(results),
    borderRadius: mergeBorderRadius(results),
    borders: mergeBorders(results),
    shadows: mergeShadows(results),
    components: mergeComponents(results),
    breakpoints,
    iconSystem: mergeByName(results, r => r.iconSystem),
    frameworks: mergeByName(results, r => r.frameworks),
    // Which pages fed this merged result (used by display.js).
    pages: results.map(r => ({ url: r.url, extractedAt: r.extractedAt })),
  };
}
|