similarbuild 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76) hide show
  1. package/CHANGELOG.md +110 -0
  2. package/LICENSE +21 -0
  3. package/README.md +301 -0
  4. package/bin/install.js +256 -0
  5. package/lib/copy-templates.mjs +52 -0
  6. package/lib/install-deps.mjs +62 -0
  7. package/lib/prompt-config.mjs +83 -0
  8. package/lib/verify-env.mjs +19 -0
  9. package/package.json +63 -0
  10. package/scripts/sync-templates.mjs +71 -0
  11. package/templates/commands/build-page.md +490 -0
  12. package/templates/commands/build-site.md +548 -0
  13. package/templates/commands/clip-section.md +519 -0
  14. package/templates/memory/anti-patterns.md +212 -0
  15. package/templates/memory/design-knowledge.md +225 -0
  16. package/templates/memory/fixes.md +163 -0
  17. package/templates/memory/patterns.md +681 -0
  18. package/templates/presets/shopify-section.yaml +51 -0
  19. package/templates/presets/wp-elementor.yaml +49 -0
  20. package/templates/reports/fixtures/mock-run-1.json +115 -0
  21. package/templates/reports/fixtures/mock-run-2.json +72 -0
  22. package/templates/reports/report-renderer.mjs +218 -0
  23. package/templates/reports/report-template.html +571 -0
  24. package/templates/skills/sb-build-shopify/SKILL.md +104 -0
  25. package/templates/skills/sb-build-shopify/references/shopify-build-rules.md +563 -0
  26. package/templates/skills/sb-build-shopify/scripts/build-shopify.mjs +637 -0
  27. package/templates/skills/sb-build-shopify/scripts/tests/test-build-shopify.mjs +424 -0
  28. package/templates/skills/sb-build-wp/SKILL.md +83 -0
  29. package/templates/skills/sb-build-wp/references/wp-build-rules.md +376 -0
  30. package/templates/skills/sb-build-wp/scripts/build-wp.mjs +327 -0
  31. package/templates/skills/sb-build-wp/scripts/tests/test-build-wp.mjs +224 -0
  32. package/templates/skills/sb-compare-visual/SKILL.md +121 -0
  33. package/templates/skills/sb-compare-visual/scripts/compare-visual.mjs +387 -0
  34. package/templates/skills/sb-compare-visual/scripts/lib/compare-tokens.mjs +273 -0
  35. package/templates/skills/sb-compare-visual/scripts/tests/test-compare-tokens.mjs +350 -0
  36. package/templates/skills/sb-compare-visual/scripts/tests/test-compare-visual.mjs +626 -0
  37. package/templates/skills/sb-crawl-and-list/SKILL.md +99 -0
  38. package/templates/skills/sb-crawl-and-list/scripts/crawl-and-list.mjs +437 -0
  39. package/templates/skills/sb-crawl-and-list/scripts/lib/blocklist-filter.mjs +176 -0
  40. package/templates/skills/sb-crawl-and-list/scripts/lib/fallback-crawler.mjs +107 -0
  41. package/templates/skills/sb-crawl-and-list/scripts/lib/page-classifier.mjs +89 -0
  42. package/templates/skills/sb-crawl-and-list/scripts/lib/sitemap-parser.mjs +118 -0
  43. package/templates/skills/sb-crawl-and-list/scripts/tests/test-blocklist-filter.mjs +204 -0
  44. package/templates/skills/sb-crawl-and-list/scripts/tests/test-crawl-and-list.mjs +276 -0
  45. package/templates/skills/sb-crawl-and-list/scripts/tests/test-fallback-crawler.mjs +243 -0
  46. package/templates/skills/sb-crawl-and-list/scripts/tests/test-page-classifier.mjs +120 -0
  47. package/templates/skills/sb-crawl-and-list/scripts/tests/test-sitemap-parser.mjs +157 -0
  48. package/templates/skills/sb-extract-assets/SKILL.md +112 -0
  49. package/templates/skills/sb-extract-assets/scripts/extract-assets.mjs +484 -0
  50. package/templates/skills/sb-extract-assets/scripts/tests/test-extract-assets.mjs +112 -0
  51. package/templates/skills/sb-inspect-live/SKILL.md +105 -0
  52. package/templates/skills/sb-inspect-live/scripts/inspect-live.mjs +693 -0
  53. package/templates/skills/sb-inspect-live/scripts/tests/test-inspect-live.mjs +181 -0
  54. package/templates/skills/sb-review-checks/SKILL.md +113 -0
  55. package/templates/skills/sb-review-checks/references/review-rules.md +195 -0
  56. package/templates/skills/sb-review-checks/scripts/lib/anti-patterns.mjs +379 -0
  57. package/templates/skills/sb-review-checks/scripts/lib/cross-reference.mjs +115 -0
  58. package/templates/skills/sb-review-checks/scripts/lib/design-quality.mjs +541 -0
  59. package/templates/skills/sb-review-checks/scripts/review-checks.mjs +250 -0
  60. package/templates/skills/sb-review-checks/scripts/tests/test-anti-patterns.mjs +343 -0
  61. package/templates/skills/sb-review-checks/scripts/tests/test-cross-reference.mjs +170 -0
  62. package/templates/skills/sb-review-checks/scripts/tests/test-design-quality.mjs +493 -0
  63. package/templates/skills/sb-review-checks/scripts/tests/test-review-checks.mjs +267 -0
  64. package/templates/skills/sb-tweak/SKILL.md +130 -0
  65. package/templates/skills/sb-tweak/references/tweak-patterns.md +157 -0
  66. package/templates/skills/sb-tweak/scripts/lib/diff-summarizer.mjs +140 -0
  67. package/templates/skills/sb-tweak/scripts/lib/element-locator.mjs +507 -0
  68. package/templates/skills/sb-tweak/scripts/lib/intent-parser.mjs +324 -0
  69. package/templates/skills/sb-tweak/scripts/tests/test-diff-summarizer.mjs +248 -0
  70. package/templates/skills/sb-tweak/scripts/tests/test-element-locator.mjs +418 -0
  71. package/templates/skills/sb-tweak/scripts/tests/test-intent-parser.mjs +496 -0
  72. package/templates/skills/sb-tweak/scripts/tests/test-tweak.mjs +407 -0
  73. package/templates/skills/sb-tweak/scripts/tweak.mjs +656 -0
  74. package/templates/skills/sb-validate-render/SKILL.md +120 -0
  75. package/templates/skills/sb-validate-render/scripts/tests/test-validate-render.mjs +304 -0
  76. package/templates/skills/sb-validate-render/scripts/validate-render.mjs +645 -0
@@ -0,0 +1,176 @@
1
+ // blocklist-filter.mjs — Pure URL filtering. No I/O.
2
+ //
3
+ // Encodes the rule that a SimilarBuild page list must never include:
4
+ //
5
+ // - Auth-gated pages (/account, /login, /admin, etc.) — irrelevant for
6
+ // visual cloning and many will 401/403.
7
+ // - Cart / checkout — dynamic, session-bound, and typically unique enough
8
+ // that a clone would mislead more than it informs.
9
+ // - API / framework internals (/api/**, /_next/**, /_nuxt/**) — not visual.
10
+ // - Tracking-only URLs (?utm_*, ?fbclid, ?gclid, ?mc_eid). These ARE
11
+ // duplicates of the canonical URL with a parameter; we drop the params
12
+ // and let dedup handle the rest.
13
+ // - RSS/Atom feeds — not pages, will not render.
14
+ //
15
+ // Custom blocklist (--blocklist) is additive: user prefixes are layered on
16
+ // top of the defaults, never replace them. That way you can't accidentally
17
+ // allow /admin by passing a too-narrow custom list.
18
+
19
// Built-in path prefixes that are always part of the blocklist. Entries are
// grouped by the reason they are excluded; order is preserved for callers that
// compare against this list.
const DEFAULT_PATH_PREFIXES = [
  // Commerce flows.
  '/cart',
  '/checkout',

  // Account and authentication screens.
  '/account',
  '/login',
  '/register',
  '/signin',
  '/signup',
  '/sign-in',
  '/sign-up',
  '/logout',

  // Admin back-offices.
  '/admin',
  '/wp-admin',
  '/wp-login.php',

  // API endpoints and framework internals.
  '/api/',
  '/_next/',
  '/_nuxt/',

  // Syndication feeds.
  '/feed',
  '/rss',
  '/atom',
]
40
+
41
// Query parameters that only carry click/campaign attribution. They are
// removed during normalization so a tracked link and its untracked twin
// collapse to the same URL.
const TRACKING_PARAMS = new Set([
  // Google Analytics / Ads campaign tags.
  'utm_source',
  'utm_medium',
  'utm_campaign',
  'utm_term',
  'utm_content',
  'utm_id',
  'utm_name',
  // Ad-network click identifiers.
  'fbclid',
  'gclid',
  'gbraid',
  'wbraid',
  // Mailchimp.
  'mc_eid',
  'mc_cid',
  // Other click identifiers.
  'msclkid',
  'yclid',
  '_ga',
  // Referral markers.
  'ref',
  'ref_src',
])

/**
 * Remove tracking parameters from a URL object in place.
 *
 * A key is removed when its lower-cased form is listed in TRACKING_PARAMS or
 * starts with `utm_`.
 *
 * @param {URL} u - URL whose searchParams are mutated.
 * @returns {URL} The same URL instance, for chaining.
 */
function stripTracking(u) {
  const isTrackingKey = (key) => {
    const lowered = key.toLowerCase()
    return TRACKING_PARAMS.has(lowered) || lowered.startsWith('utm_')
  }
  // Snapshot the keys before deleting; removing entries while walking the
  // live iterator would skip some of them.
  const doomed = Array.from(u.searchParams.keys()).filter(isTrackingKey)
  doomed.forEach((key) => u.searchParams.delete(key))
  return u
}
73
+
74
/**
 * Normalize a URL for dedupe + filtering.
 *
 * Steps: reject unparseable, non-http(s) and off-origin URLs; drop the
 * fragment; strip tracking params; sort the remaining query params by key;
 * remove a single trailing slash from non-root pathnames.
 *
 * @param {string} url - Absolute URL to normalize.
 * @param {string} [originHost] - Hostname of the root URL. Compared
 *   case-insensitively; when falsy the origin check is skipped.
 * @returns {string|null} The normalized URL, or null if the URL is invalid,
 *   off-origin, or has an unsupported scheme.
 */
export function normalizeUrl(url, originHost) {
  // Only URL parsing is expected to fail; keep the try narrow so genuine
  // programming errors further down are not silently turned into `null`.
  let u
  try {
    u = new URL(url)
  } catch {
    return null
  }
  if (!/^https?:$/.test(u.protocol)) return null
  // Hostnames are case-insensitive, so don't depend on the caller having
  // lower-cased `originHost` already.
  if (originHost && u.hostname.toLowerCase() !== String(originHost).toLowerCase()) return null

  // Drop fragment — anchors are intra-page navigation, not separate pages.
  u.hash = ''
  // Strip tracking params.
  stripTracking(u)
  // Sort remaining params for stable dedupe ordering.
  const sorted = [...u.searchParams.entries()].sort(([a], [b]) => a.localeCompare(b))
  u.search = ''
  for (const [k, v] of sorted) u.searchParams.append(k, v)

  // Normalize trailing slash on the pathname (except for root). This is
  // dedupe-only: `/foo` and `/foo/` are the same page on 99% of sites.
  if (u.pathname.length > 1 && u.pathname.endsWith('/')) {
    u.pathname = u.pathname.slice(0, -1)
  }
  return u.toString()
}
102
+
103
/**
 * Check whether a URL's pathname falls under any blocklist prefix.
 *
 * Matching is on the lower-cased pathname only, so querystrings can never
 * trigger a block. It is boundary-aware:
 *   - a prefix ending in '/' (e.g. '/api/') blocks everything beneath it;
 *   - any other prefix (e.g. '/login', '/wp-login.php') blocks the exact path
 *     and anything under `prefix + '/'`, so '/login' does not match
 *     '/login-info'.
 *
 * @param {string} url - Absolute URL (normally already normalized).
 * @param {string[]} blocklistPrefixes - Path prefixes to test against; a
 *   nullish value is treated as an empty list.
 * @returns {boolean} True if the URL is blocked; false otherwise, including
 *   when the URL cannot be parsed.
 */
export function matchesBlocklist(url, blocklistPrefixes) {
  let pathname
  try {
    pathname = new URL(url).pathname.toLowerCase()
  } catch {
    return false
  }
  for (const prefix of blocklistPrefixes ?? []) {
    const p = prefix.toLowerCase()
    // Exact hit covers files/endpoints such as /wp-login.php.
    if (pathname === p) return true
    // Otherwise require a path-segment boundary after the prefix.
    const directory = p.endsWith('/') ? p : `${p}/`
    if (pathname.startsWith(directory)) return true
  }
  return false
}
128
+
129
/**
 * Tell whether a URL points at an XML document or feed file.
 *
 * Any pathname ending in `.xml`, `.rss` or `.atom` counts, with one exception:
 * the sitemap the crawler actually consumed (same hostname and same
 * lower-cased pathname as `sitemapUrl`) is never flagged, so it is not dropped
 * if it shows up among <loc> entries.
 *
 * @param {string} url - Absolute URL to inspect.
 * @param {string} [sitemapUrl] - URL of the sitemap in use, if any.
 * @returns {boolean} True for XML/feed URLs; false otherwise, including when
 *   `url` cannot be parsed.
 */
export function isXmlOrFeed(url, sitemapUrl) {
  let target
  try {
    target = new URL(url)
  } catch {
    return false
  }
  const path = target.pathname.toLowerCase()

  if (sitemapUrl) {
    let active = null
    try {
      active = new URL(sitemapUrl)
    } catch {
      /* unparseable sitemap URL: no exemption applies */
    }
    const isActiveSitemap =
      active !== null &&
      active.pathname.toLowerCase() === path &&
      active.hostname === target.hostname
    if (isActiveSitemap) return false
  }

  return ['.xml', '.rss', '.atom'].some((ext) => path.endsWith(ext))
}
152
+
153
/**
 * Build the effective blocklist: the built-in defaults followed by any
 * user-supplied prefixes. Custom entries are additive and never replace the
 * defaults.
 *
 * @param {string} [customBlocklist] - Comma-separated path prefixes, e.g.
 *   '/preview, /internal'. Whitespace and empty entries are ignored.
 * @returns {string[]} A new array: defaults first, then custom prefixes.
 */
export function buildBlocklist(customBlocklist) {
  const custom = (customBlocklist || '')
    .split(',')
    .map((s) => s.trim())
    .filter(Boolean)
    // URL pathnames always begin with '/', so an entry written as "preview"
    // could never match anything. Treat it as "/preview" instead of silently
    // ignoring the user's intent.
    .map((s) => (s.startsWith('/') ? s : `/${s}`))
  return [...DEFAULT_PATH_PREFIXES, ...custom]
}
160
+
161
/**
 * Run one URL through the whole pipeline: normalize, then blocklist check,
 * then XML/feed check.
 *
 * @param {string} url - Candidate URL.
 * @param {object} ctx
 * @param {string} ctx.originHost - Hostname the URL must belong to.
 * @param {string[]} ctx.blocklistPrefixes - Path prefixes to reject.
 * @param {string} [ctx.sitemapUrl] - Active sitemap URL, exempt from the XML check.
 * @returns {{keep: true, url: string} | {keep: false, reason: string}}
 *   `reason` is one of 'invalid-or-off-origin', 'blocklisted', 'xml-or-feed'.
 */
export function filterUrl(url, { originHost, blocklistPrefixes, sitemapUrl }) {
  const drop = (reason) => ({ keep: false, reason })

  const normalized = normalizeUrl(url, originHost)
  if (!normalized) return drop('invalid-or-off-origin')
  if (matchesBlocklist(normalized, blocklistPrefixes)) return drop('blocklisted')
  if (isXmlOrFeed(normalized, sitemapUrl)) return drop('xml-or-feed')

  return { keep: true, url: normalized }
}
175
+
176
+ export const __defaults = { DEFAULT_PATH_PREFIXES, TRACKING_PARAMS }
@@ -0,0 +1,107 @@
1
+ // fallback-crawler.mjs — Same-origin BFS using fetch + cheerio when sitemap
2
+ // is absent or unparseable. Pure-ish: cheerio + fetcher are injected, no
3
+ // process I/O. The crawler stops at maxDepth and maxPages and reports how many
4
+ // URLs it discovered in total (including those still queued past the cap), so
5
+ // the orchestrator can warn.
6
+ //
7
+ // Why fetch + cheerio not chromium:
8
+ // - Discovery doesn't need rendering. Static <a href> tags reveal site
9
+ // structure on > 90% of WordPress / Shopify / Webflow / static sites.
10
+ // - Chromium adds 250MB install + ~5s startup per page; for 100 pages
11
+ // that's a 10-minute crawl when fetch finishes in 30s.
12
+ // - sb-inspect-live owns the chromium budget — we keep it for the actual
13
+ // visual capture.
14
+ //
15
+ // Trade-off accepted: pure-SPA sites (Next.js client-routed, fully
16
+ // JS-rendered) return only the entry shell. We detect that ("0–1 internal
17
+ // links found across the whole crawl") and surface a clear warning so the
18
+ // user knows to supply a sitemap manually.
19
+
20
+ import { filterUrl } from './blocklist-filter.mjs'
21
+
22
// fetcher signature: async (url) => { ok, status, text? }
// loadHtml signature: (html) => $ (cheerio instance)
/**
 * Breadth-first, same-origin crawl starting at `rootUrl`.
 *
 * Every URL is passed through `filterUrl` before it is fetched or enqueued.
 * A page is recorded in `results` once its fetch succeeds; its links are only
 * followed while `depth < maxDepth`. The loop ends when the queue is empty or
 * `results` reaches `maxPages`.
 *
 * @param {object} opts
 * @param {string} opts.rootUrl - Entry URL; its hostname defines the origin.
 * @param {Function} opts.fetcher - async (url) => { ok, status, text? }.
 * @param {Function} opts.loadHtml - (html) => cheerio-style `$`.
 * @param {number} opts.maxDepth - Deepest link level whose pages are recorded.
 * @param {number} opts.maxPages - Maximum number of recorded pages.
 * @param {string[]} opts.blocklistPrefixes - Forwarded to filterUrl.
 * @param {string} [opts.sitemapUrl] - Forwarded to filterUrl.
 * @returns {Promise<{results: Array<{url: string, depth: number}>, warnings: string[], totalDiscovered: number}>}
 *   `totalDiscovered` is recorded pages plus URLs still waiting in the queue.
 * @throws {TypeError} If `rootUrl` is not a valid absolute URL.
 */
export async function fallbackCrawl({
  rootUrl,
  fetcher,
  loadHtml,
  maxDepth,
  maxPages,
  blocklistPrefixes,
  sitemapUrl, // pass-through so isXmlOrFeed knows what to keep
}) {
  const origin = new URL(rootUrl)
  const originHost = origin.hostname.toLowerCase()
  const startUrl = origin.toString()
  const filterCtx = { originHost, blocklistPrefixes, sitemapUrl }

  const visited = new Set()
  const queue = [{ url: startUrl, depth: 0 }]
  // Every URL ever enqueued. The queue grows with each link found on each
  // page (it is NOT capped by maxPages), so membership is tracked in a Set
  // rather than by scanning the array.
  const queued = new Set([startUrl])
  // Read cursor into `queue`; avoids the O(n) cost of Array#shift.
  let head = 0
  const results = [] // { url, depth }
  const warnings = []

  // Link count of the entry page, used by the SPA detector below. Stays null
  // when the entry page's links were never extracted.
  let rootInternalHrefCount = null

  while (head < queue.length && results.length < maxPages) {
    const { url, depth } = queue[head]
    head += 1

    const filtered = filterUrl(url, filterCtx)
    if (!filtered.keep) continue
    const norm = filtered.url
    if (visited.has(norm)) continue
    visited.add(norm)

    // A fetcher that throws (DNS failure, timeout, ...) must not abort the
    // whole crawl; report it the same way as a non-ok response.
    let fetched
    try {
      fetched = await fetcher(norm)
    } catch {
      fetched = { ok: false, status: 0 }
    }
    if (!fetched?.ok) {
      warnings.push(`crawl-fetch-failed:${norm}:${fetched?.status || 0}`)
      continue
    }

    // The page itself counts even if it yields no further links.
    results.push({ url: norm, depth })

    if (depth >= maxDepth) continue

    let $
    try {
      $ = loadHtml(fetched.text || '')
    } catch (err) {
      warnings.push(`crawl-parse-failed:${norm}:${err.message || 'unknown'}`)
      continue
    }

    const hrefs = []
    $('a[href]').each((_, el) => {
      const raw = $(el).attr('href')
      if (raw) hrefs.push(raw)
    })

    if (depth === 0) rootInternalHrefCount = 0

    for (const raw of hrefs) {
      let abs
      try {
        abs = new URL(raw, norm).toString()
      } catch {
        continue
      }
      const f = filterUrl(abs, filterCtx)
      if (!f.keep) continue
      if (depth === 0) rootInternalHrefCount += 1
      if (visited.has(f.url) || queued.has(f.url)) continue
      queued.add(f.url)
      queue.push({ url: f.url, depth: depth + 1 })
    }
  }

  // SPA detector: tiny result set AND the entry page had ≤ 1 internal link.
  // (We only signal the warning — the orchestrator decides what to do.)
  // NOTE(review): a null count (entry page never parsed, e.g. maxDepth 0 or a
  // failed root fetch) is treated as 0 here, so the warning fires then too.
  if (results.length <= 1 && (rootInternalHrefCount ?? 0) <= 1) {
    warnings.push('spa-suspected')
  }

  return { results, warnings, totalDiscovered: results.length + (queue.length - head) }
}
@@ -0,0 +1,89 @@
1
+ // page-classifier.mjs — URL → page-type heuristic via path inspection. Pure.
2
+ //
3
+ // Used by `/build-site` to bucket pages so the user sees a structured table
4
+ // (e.g. "5 PDPs, 3 collections, 1 home") before confirming. The classification
5
+ // drives nothing fatal — `other` is a valid type — but the better the
6
+ // guesses, the less the orchestrator nags the user about reordering.
7
+ //
8
+ // Heuristics are intentionally Shopify-flavoured (most common SimilarBuild
9
+ // target) but match common WordPress/Webflow paths too. Patterns are checked
10
+ // top-down; first match wins. Most rules are literal prefix checks against the
11
+ // lowercased pathname; a few use small anchored regexes.
12
+
13
// True when `p` starts with at least one of the given literal prefixes.
const startsWithAny = (p, prefixes) => prefixes.some((prefix) => p.startsWith(prefix))

// True when `p` is exactly `base` or lives underneath `base + '/'`.
const isOrUnder = (p, base) => p === base || p.startsWith(`${base}/`)

// Ordered classification table: each entry pairs a page type with a predicate
// over the lower-cased pathname. Earlier entries take precedence.
const TYPE_RULES = [
  // Home: the bare root only (an empty path is accepted too).
  { type: 'home', test: (p) => ['/', ''].includes(p) },

  // Product detail pages: `/products/…` or `/product/…`.
  { type: 'pdp', test: (p) => startsWithAny(p, ['/products/', '/product/']) },

  // Collection / category listings.
  {
    type: 'collection',
    test: (p) =>
      startsWithAny(p, ['/collections/', '/collection/', '/category/', '/product-category/']),
  },

  // Contact: listed ahead of any broader /pages/* handling.
  {
    type: 'contact',
    test: (p) => /^\/pages\/contact(\b|\/|-)/.test(p) || isOrUnder(p, '/contact'),
  },

  // About: same shape as contact.
  {
    type: 'about',
    test: (p) => /^\/pages\/about(\b|\/|-)/.test(p) || isOrUnder(p, '/about'),
  },

  // Policy pages: a /policies/ directory, or named pages at the root or
  // under /pages/.
  {
    type: 'policy',
    test: (p) =>
      p.startsWith('/policies/') ||
      /^\/pages\/(privacy|terms|refund|shipping|return)(\b|\/|-)/.test(p) ||
      /^\/(privacy|terms|refund|shipping|return)(\b|\/|-)/.test(p),
  },

  // Blog index and posts: /blogs/…, /blog, /blog/…, /posts/…, /news, /news/….
  {
    type: 'blog',
    test: (p) =>
      startsWithAny(p, ['/blogs/', '/posts/']) || isOrUnder(p, '/blog') || isOrUnder(p, '/news'),
  },
]
66
+
67
/**
 * Reduce a URL (or a bare path) to a lower-cased pathname without a trailing
 * slash.
 *
 * Input that `new URL()` cannot parse — e.g. a path-only string — is used
 * as-is after lower-casing. A lone "/" is kept.
 *
 * @param {string} url - Absolute URL or path-only string.
 * @returns {string} Lower-cased pathname.
 */
function pathnameOf(url) {
  let path
  try {
    path = new URL(url).pathname
  } catch {
    path = String(url)
  }
  const lowered = path.toLowerCase()
  const hasTrailingSlash = lowered.length > 1 && lowered.endsWith('/')
  return hasTrailingSlash ? lowered.slice(0, -1) : lowered
}
82
+
83
/**
 * Classify a URL into a page type by inspecting its pathname.
 *
 * @param {string} url - Absolute URL or path-only string.
 * @returns {string} The `type` of the first matching entry in TYPE_RULES, or
 *   'other' when none matches.
 */
export function classifyUrl(url) {
  const path = pathnameOf(url)
  const matched = TYPE_RULES.find((rule) => rule.test(path))
  return matched ? matched.type : 'other'
}
@@ -0,0 +1,118 @@
1
+ // sitemap-parser.mjs — Pure XML → URL extraction. No I/O. Tested in isolation.
2
+ //
3
+ // Sitemap protocol (sitemaps.org) is small enough that a regex-driven parser
4
+ // beats pulling in xml2js for an install-time cost we don't need. We accept:
5
+ //
6
+ // - <urlset> with <url><loc>…</loc></url> (the common case)
7
+ // - <sitemapindex> with <sitemap><loc>…</loc></sitemap> (recurse one level)
8
+ //
9
+ // We do NOT validate XML well-formedness — sitemaps in the wild are full of
10
+ // stray BOMs, missing closing tags, and CDATA quirks. Strict validation would
11
+ // reject sitemaps Google's crawler happily eats. Instead: extract every <loc>
12
+ // payload, decode entities, trim, and let downstream filtering decide (no dedupe here).
13
+ //
14
+ // Returns: { kind, urlSetUrls, sitemapIndexUrls, malformed }
15
+
16
// The five predefined XML named entities.
const ENTITY_MAP = {
  '&amp;': '&',
  '&lt;': '<',
  '&gt;': '>',
  '&quot;': '"',
  '&apos;': "'",
}

/**
 * Decode XML character/entity references in a string.
 *
 * Handles the five predefined named entities plus decimal (`&#38;`) and
 * hexadecimal (`&#x26;`) character references. Decoding happens in a single
 * pass so that output produced by one reference is never re-interpreted as
 * another (`&amp;#38;` decodes to the literal text `&#38;`, not `&`).
 * Numeric references outside the Unicode range are left untouched rather
 * than throwing.
 *
 * @param {string} s - Raw text, e.g. a <loc> payload.
 * @returns {string} Text with recognised references replaced.
 */
function decodeEntities(s) {
  return s.replace(
    /&(?:(amp|lt|gt|quot|apos)|#(\d+)|#[xX]([0-9a-fA-F]+));/g,
    (match, named, dec, hex) => {
      if (named !== undefined) return ENTITY_MAP[match] ?? match
      const codePoint =
        dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16)
      // String.fromCodePoint throws RangeError above U+10FFFF; real-world
      // sitemaps are messy, so keep the original text instead of crashing.
      return codePoint <= 0x10ffff ? String.fromCodePoint(codePoint) : match
    },
  )
}
30
+
31
/**
 * Pull the text of every <loc> element out of a sitemap document.
 *
 * Surrounding whitespace and an optional CDATA wrapper are removed, empty
 * payloads are skipped, and entity references are decoded. Document order is
 * preserved.
 *
 * @param {string} xml - Sitemap XML text.
 * @returns {string[]} Decoded <loc> payloads.
 */
function extractLocs(xml) {
  // Non-greedy capture of the element body; tolerates attributes on <loc>,
  // whitespace before the closing '>' and CDATA wrapping.
  const locPattern = /<loc\b[^>]*>\s*(?:<!\[CDATA\[)?([\s\S]*?)(?:\]\]>)?\s*<\/loc\s*>/gi
  return Array.from(xml.matchAll(locPattern), (hit) => hit[1].trim())
    .filter((payload) => payload !== '')
    .map((payload) => decodeEntities(payload))
}
43
+
44
/**
 * Decide what sort of sitemap document `xml` is.
 *
 * Probes run in priority order: a <sitemapindex> tag wins over <urlset>, and
 * a document with neither wrapper but at least one <loc> tag is treated as a
 * flat urlset (some hand-rolled sitemaps omit the wrapper).
 *
 * @param {string} xml - Sitemap XML text.
 * @returns {'index'|'urlset'|'unknown'}
 */
function detectKind(xml) {
  const probes = [
    [/<\s*sitemapindex\b/i, 'index'],
    [/<\s*urlset\b/i, 'urlset'],
    [/<loc\b/i, 'urlset'],
  ]
  const hit = probes.find(([pattern]) => pattern.test(xml))
  return hit ? hit[1] : 'unknown'
}
56
+
57
/**
 * Parse sitemap XML into page URLs or child-sitemap URLs.
 *
 * @param {string} xml - Sitemap document text; a leading BOM is ignored.
 * @returns {{kind: string, urlSetUrls: string[], sitemapIndexUrls: string[], malformed: boolean}}
 *   For kind 'urlset' the <loc> values are in `urlSetUrls`; for kind 'index'
 *   they are in `sitemapIndexUrls`. `malformed` is true (with kind 'unknown')
 *   when the input is not a non-empty string or no sitemap markup is found.
 */
export function parseSitemap(xml) {
  const shape = (kind, overrides = {}) => ({
    kind,
    urlSetUrls: [],
    sitemapIndexUrls: [],
    malformed: false,
    ...overrides,
  })

  if (typeof xml !== 'string' || xml.length === 0) {
    return shape('unknown', { malformed: true })
  }

  // CMS-exported sitemaps often start with a byte-order mark.
  const body = xml.charCodeAt(0) === 0xfeff ? xml.slice(1) : xml

  const kind = detectKind(body)
  if (kind === 'unknown') return shape(kind, { malformed: true })

  const locs = extractLocs(body)
  return kind === 'index' ? shape(kind, { sitemapIndexUrls: locs }) : shape(kind, { urlSetUrls: locs })
}
74
+
75
/**
 * Fetch a sitemap and return the page URLs it lists, following a
 * <sitemapindex> one level down.
 *
 * The fetcher is injected so this stays testable. Problems never reject the
 * returned promise; they are reported as warning strings:
 *   - `sitemap-fetch-failed:<status>`        top-level fetch failed
 *   - `sitemap-malformed`                    top-level document unusable
 *   - `sitemap-index-truncated:<total>-of-<maxChildren>`
 *   - `sitemap-child-failed:<url>:<status>`  child fetch failed
 *   - `sitemap-child-malformed:<url>`        child document unusable
 *   - `sitemap-child-nested-index:<url>`     child is itself an index; skipped
 * A fetcher that throws is reported like a failed fetch with status 0.
 *
 * fetcher signature: async (url) => { ok, status, text? }
 *
 * @param {string} url - Sitemap URL.
 * @param {Function} fetcher - See signature above.
 * @param {object} [options]
 * @param {number} [options.maxChildren=50] - Cap on child sitemaps fetched
 *   from an index.
 * @returns {Promise<{urls: string[], warnings: string[]}>}
 */
export async function fetchAndParseSitemap(url, fetcher, { maxChildren = 50 } = {}) {
  const warnings = []
  const collected = []

  // One bad child must not discard URLs already gathered from its siblings,
  // so exceptions from the fetcher are folded into the failed-response shape.
  const safeFetch = async (target) => {
    try {
      return (await fetcher(target)) ?? { ok: false, status: 0 }
    } catch {
      return { ok: false, status: 0 }
    }
  }

  const top = await safeFetch(url)
  if (!top.ok) {
    return { urls: [], warnings: [`sitemap-fetch-failed:${top.status || 0}`] }
  }
  const parsed = parseSitemap(top.text || '')
  if (parsed.malformed) {
    return { urls: [], warnings: ['sitemap-malformed'] }
  }
  if (parsed.kind === 'urlset') {
    return { urls: parsed.urlSetUrls, warnings }
  }

  // sitemapindex — fetch each child sitemap up to maxChildren.
  const children = parsed.sitemapIndexUrls.slice(0, maxChildren)
  if (parsed.sitemapIndexUrls.length > maxChildren) {
    warnings.push(`sitemap-index-truncated:${parsed.sitemapIndexUrls.length}-of-${maxChildren}`)
  }
  for (const childUrl of children) {
    const r = await safeFetch(childUrl)
    if (!r.ok) {
      warnings.push(`sitemap-child-failed:${childUrl}:${r.status || 0}`)
      continue
    }
    const childParsed = parseSitemap(r.text || '')
    if (childParsed.malformed) {
      warnings.push(`sitemap-child-malformed:${childUrl}`)
      continue
    }
    // Nested sitemapindex: we don't recurse a second level, but say so
    // instead of silently contributing zero URLs.
    if (childParsed.kind !== 'urlset') {
      warnings.push(`sitemap-child-nested-index:${childUrl}`)
      continue
    }
    collected.push(...childParsed.urlSetUrls)
  }
  return { urls: collected, warnings }
}
@@ -0,0 +1,204 @@
1
+ #!/usr/bin/env node
2
+ // Tests for lib/blocklist-filter.mjs — pure unit tests.
3
+
4
+ import { strict as assert } from 'node:assert'
5
+ import {
6
+ buildBlocklist,
7
+ filterUrl,
8
+ matchesBlocklist,
9
+ normalizeUrl,
10
+ isXmlOrFeed,
11
+ __defaults,
12
+ } from '../lib/blocklist-filter.mjs'
13
+
14
+ let passed = 0
15
+ let failed = 0
16
+
17
/**
 * Minimal test runner: execute `fn`, print one result line, and bump the
 * module-level `passed` / `failed` counter.
 *
 * @param {string} name - Label printed in the result line.
 * @param {Function} fn - Synchronous test body; throwing marks a failure.
 */
function test(name, fn) {
  const emit = (line) => process.stdout.write(line)
  try {
    fn()
    emit(`ok - ${name}\n`)
    passed += 1
  } catch (err) {
    emit(`not ok - ${name}\n ${err.message}\n`)
    failed += 1
  }
}
27
+
28
+ // ─── normalizeUrl ───────────────────────────────────────────────────────────
29
+
30
+ test('normalizeUrl strips fragment', () => {
31
+ assert.equal(
32
+ normalizeUrl('https://example.com/page#section', 'example.com'),
33
+ 'https://example.com/page',
34
+ )
35
+ })
36
+
37
+ test('normalizeUrl strips utm_* tracking params', () => {
38
+ assert.equal(
39
+ normalizeUrl('https://example.com/page?utm_source=twitter&utm_medium=social', 'example.com'),
40
+ 'https://example.com/page',
41
+ )
42
+ })
43
+
44
+ test('normalizeUrl strips fbclid/gclid/mc_eid', () => {
45
+ assert.equal(
46
+ normalizeUrl('https://example.com/p?fbclid=123', 'example.com'),
47
+ 'https://example.com/p',
48
+ )
49
+ assert.equal(
50
+ normalizeUrl('https://example.com/p?gclid=abc', 'example.com'),
51
+ 'https://example.com/p',
52
+ )
53
+ assert.equal(
54
+ normalizeUrl('https://example.com/p?mc_eid=z', 'example.com'),
55
+ 'https://example.com/p',
56
+ )
57
+ })
58
+
59
+ test('normalizeUrl preserves non-tracking params', () => {
60
+ assert.equal(
61
+ normalizeUrl('https://example.com/products?variant=red', 'example.com'),
62
+ 'https://example.com/products?variant=red',
63
+ )
64
+ })
65
+
66
+ test('normalizeUrl rejects off-origin URLs', () => {
67
+ assert.equal(normalizeUrl('https://other.com/foo', 'example.com'), null)
68
+ })
69
+
70
+ test('normalizeUrl rejects non-http(s) schemes', () => {
71
+ assert.equal(normalizeUrl('mailto:hello@example.com', 'example.com'), null)
72
+ assert.equal(normalizeUrl('ftp://example.com/file', 'example.com'), null)
73
+ })
74
+
75
+ test('normalizeUrl strips trailing slash on non-root pathnames', () => {
76
+ assert.equal(
77
+ normalizeUrl('https://example.com/products/foo/', 'example.com'),
78
+ 'https://example.com/products/foo',
79
+ )
80
+ })
81
+
82
+ test('normalizeUrl preserves root slash', () => {
83
+ assert.equal(normalizeUrl('https://example.com/', 'example.com'), 'https://example.com/')
84
+ })
85
+
86
+ test('normalizeUrl returns null on garbage input', () => {
87
+ assert.equal(normalizeUrl('not-a-url', 'example.com'), null)
88
+ })
89
+
90
+ // ─── matchesBlocklist ───────────────────────────────────────────────────────
91
+
92
+ test('matchesBlocklist matches /admin and /admin/foo', () => {
93
+ const list = ['/admin']
94
+ assert.ok(matchesBlocklist('https://example.com/admin', list))
95
+ assert.ok(matchesBlocklist('https://example.com/admin/users', list))
96
+ })
97
+
98
+ test('matchesBlocklist does NOT falsely match /admin-tools as /admin', () => {
99
+ assert.ok(!matchesBlocklist('https://example.com/admin-tools', ['/admin']))
100
+ })
101
+
102
+ test('matchesBlocklist matches /api/ prefix', () => {
103
+ assert.ok(matchesBlocklist('https://example.com/api/v1/users', ['/api/']))
104
+ })
105
+
106
+ test('matchesBlocklist matches exact /wp-login.php file', () => {
107
+ assert.ok(matchesBlocklist('https://example.com/wp-login.php', ['/wp-login.php']))
108
+ })
109
+
110
+ test('matchesBlocklist is case-insensitive on path', () => {
111
+ assert.ok(matchesBlocklist('https://example.com/Admin', ['/admin']))
112
+ })
113
+
114
+ // ─── isXmlOrFeed ────────────────────────────────────────────────────────────
115
+
116
+ test('isXmlOrFeed flags arbitrary .xml URLs', () => {
117
+ assert.ok(isXmlOrFeed('https://example.com/foo.xml'))
118
+ })
119
+
120
+ test('isXmlOrFeed allows the active sitemap to pass through', () => {
121
+ assert.ok(
122
+ !isXmlOrFeed('https://example.com/sitemap.xml', 'https://example.com/sitemap.xml'),
123
+ )
124
+ })
125
+
126
+ test('isXmlOrFeed flags /feed and rss/atom paths via prefix logic separately (here just .xml)', () => {
127
+ assert.ok(isXmlOrFeed('https://example.com/feed.atom'))
128
+ assert.ok(isXmlOrFeed('https://example.com/posts.rss'))
129
+ })
130
+
131
+ // ─── buildBlocklist ─────────────────────────────────────────────────────────
132
+
133
+ test('buildBlocklist includes all defaults', () => {
134
+ const list = buildBlocklist()
135
+ for (const p of __defaults.DEFAULT_PATH_PREFIXES) {
136
+ assert.ok(list.includes(p), `missing default ${p}`)
137
+ }
138
+ })
139
+
140
+ test('buildBlocklist appends custom prefixes additively', () => {
141
+ const list = buildBlocklist('/preview, /internal,/staging')
142
+ assert.ok(list.includes('/preview'))
143
+ assert.ok(list.includes('/internal'))
144
+ assert.ok(list.includes('/staging'))
145
+ // Defaults still present
146
+ assert.ok(list.includes('/admin'))
147
+ })
148
+
149
+ test('buildBlocklist tolerates empty/whitespace custom string', () => {
150
+ const list = buildBlocklist(' ,, ')
151
+ assert.deepEqual(list, [...__defaults.DEFAULT_PATH_PREFIXES])
152
+ })
153
+
154
+ // ─── filterUrl (full pipeline) ──────────────────────────────────────────────
155
+
156
+ const ctx = {
157
+ originHost: 'example.com',
158
+ blocklistPrefixes: buildBlocklist('/preview'),
159
+ sitemapUrl: 'https://example.com/sitemap.xml',
160
+ }
161
+
162
+ test('filterUrl keeps a clean product URL', () => {
163
+ const r = filterUrl('https://example.com/products/foo', ctx)
164
+ assert.equal(r.keep, true)
165
+ assert.equal(r.url, 'https://example.com/products/foo')
166
+ })
167
+
168
+ test('filterUrl normalizes tracking-laden URL before checking', () => {
169
+ const r = filterUrl('https://example.com/products/foo?utm_source=x#hash', ctx)
170
+ assert.equal(r.keep, true)
171
+ assert.equal(r.url, 'https://example.com/products/foo')
172
+ })
173
+
174
+ test('filterUrl drops /admin via default blocklist', () => {
175
+ const r = filterUrl('https://example.com/admin/users', ctx)
176
+ assert.equal(r.keep, false)
177
+ assert.equal(r.reason, 'blocklisted')
178
+ })
179
+
180
+ test('filterUrl drops /preview via custom blocklist', () => {
181
+ const r = filterUrl('https://example.com/preview/page', ctx)
182
+ assert.equal(r.keep, false)
183
+ assert.equal(r.reason, 'blocklisted')
184
+ })
185
+
186
+ test('filterUrl drops off-origin URL', () => {
187
+ const r = filterUrl('https://other.com/foo', ctx)
188
+ assert.equal(r.keep, false)
189
+ assert.equal(r.reason, 'invalid-or-off-origin')
190
+ })
191
+
192
+ test('filterUrl drops generic *.xml but keeps the active sitemap', () => {
193
+ const r = filterUrl('https://example.com/products.xml', ctx)
194
+ assert.equal(r.keep, false)
195
+ assert.equal(r.reason, 'xml-or-feed')
196
+ // The active sitemap URL is the one exception: isXmlOrFeed returns false for it,
197
+ // so filterUrl keeps it even though it ends in .xml. (It normally shouldn't
198
+ // appear in a URL list at all.)
199
+ const r2 = filterUrl('https://example.com/sitemap.xml', ctx)
200
+ assert.equal(r2.keep, true)
201
+ })
202
+
203
+ process.stdout.write(`\n${passed} passed, ${failed} failed\n`)
204
+ process.exit(failed === 0 ? 0 : 1)