similarbuild 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CHANGELOG.md +110 -0
- package/LICENSE +21 -0
- package/README.md +301 -0
- package/bin/install.js +256 -0
- package/lib/copy-templates.mjs +52 -0
- package/lib/install-deps.mjs +62 -0
- package/lib/prompt-config.mjs +83 -0
- package/lib/verify-env.mjs +19 -0
- package/package.json +63 -0
- package/scripts/sync-templates.mjs +71 -0
- package/templates/commands/build-page.md +490 -0
- package/templates/commands/build-site.md +548 -0
- package/templates/commands/clip-section.md +519 -0
- package/templates/memory/anti-patterns.md +212 -0
- package/templates/memory/design-knowledge.md +225 -0
- package/templates/memory/fixes.md +163 -0
- package/templates/memory/patterns.md +681 -0
- package/templates/presets/shopify-section.yaml +51 -0
- package/templates/presets/wp-elementor.yaml +49 -0
- package/templates/reports/fixtures/mock-run-1.json +115 -0
- package/templates/reports/fixtures/mock-run-2.json +72 -0
- package/templates/reports/report-renderer.mjs +218 -0
- package/templates/reports/report-template.html +571 -0
- package/templates/skills/sb-build-shopify/SKILL.md +104 -0
- package/templates/skills/sb-build-shopify/references/shopify-build-rules.md +563 -0
- package/templates/skills/sb-build-shopify/scripts/build-shopify.mjs +637 -0
- package/templates/skills/sb-build-shopify/scripts/tests/test-build-shopify.mjs +424 -0
- package/templates/skills/sb-build-wp/SKILL.md +83 -0
- package/templates/skills/sb-build-wp/references/wp-build-rules.md +376 -0
- package/templates/skills/sb-build-wp/scripts/build-wp.mjs +327 -0
- package/templates/skills/sb-build-wp/scripts/tests/test-build-wp.mjs +224 -0
- package/templates/skills/sb-compare-visual/SKILL.md +121 -0
- package/templates/skills/sb-compare-visual/scripts/compare-visual.mjs +387 -0
- package/templates/skills/sb-compare-visual/scripts/lib/compare-tokens.mjs +273 -0
- package/templates/skills/sb-compare-visual/scripts/tests/test-compare-tokens.mjs +350 -0
- package/templates/skills/sb-compare-visual/scripts/tests/test-compare-visual.mjs +626 -0
- package/templates/skills/sb-crawl-and-list/SKILL.md +99 -0
- package/templates/skills/sb-crawl-and-list/scripts/crawl-and-list.mjs +437 -0
- package/templates/skills/sb-crawl-and-list/scripts/lib/blocklist-filter.mjs +176 -0
- package/templates/skills/sb-crawl-and-list/scripts/lib/fallback-crawler.mjs +107 -0
- package/templates/skills/sb-crawl-and-list/scripts/lib/page-classifier.mjs +89 -0
- package/templates/skills/sb-crawl-and-list/scripts/lib/sitemap-parser.mjs +118 -0
- package/templates/skills/sb-crawl-and-list/scripts/tests/test-blocklist-filter.mjs +204 -0
- package/templates/skills/sb-crawl-and-list/scripts/tests/test-crawl-and-list.mjs +276 -0
- package/templates/skills/sb-crawl-and-list/scripts/tests/test-fallback-crawler.mjs +243 -0
- package/templates/skills/sb-crawl-and-list/scripts/tests/test-page-classifier.mjs +120 -0
- package/templates/skills/sb-crawl-and-list/scripts/tests/test-sitemap-parser.mjs +157 -0
- package/templates/skills/sb-extract-assets/SKILL.md +112 -0
- package/templates/skills/sb-extract-assets/scripts/extract-assets.mjs +484 -0
- package/templates/skills/sb-extract-assets/scripts/tests/test-extract-assets.mjs +112 -0
- package/templates/skills/sb-inspect-live/SKILL.md +105 -0
- package/templates/skills/sb-inspect-live/scripts/inspect-live.mjs +693 -0
- package/templates/skills/sb-inspect-live/scripts/tests/test-inspect-live.mjs +181 -0
- package/templates/skills/sb-review-checks/SKILL.md +113 -0
- package/templates/skills/sb-review-checks/references/review-rules.md +195 -0
- package/templates/skills/sb-review-checks/scripts/lib/anti-patterns.mjs +379 -0
- package/templates/skills/sb-review-checks/scripts/lib/cross-reference.mjs +115 -0
- package/templates/skills/sb-review-checks/scripts/lib/design-quality.mjs +541 -0
- package/templates/skills/sb-review-checks/scripts/review-checks.mjs +250 -0
- package/templates/skills/sb-review-checks/scripts/tests/test-anti-patterns.mjs +343 -0
- package/templates/skills/sb-review-checks/scripts/tests/test-cross-reference.mjs +170 -0
- package/templates/skills/sb-review-checks/scripts/tests/test-design-quality.mjs +493 -0
- package/templates/skills/sb-review-checks/scripts/tests/test-review-checks.mjs +267 -0
- package/templates/skills/sb-tweak/SKILL.md +130 -0
- package/templates/skills/sb-tweak/references/tweak-patterns.md +157 -0
- package/templates/skills/sb-tweak/scripts/lib/diff-summarizer.mjs +140 -0
- package/templates/skills/sb-tweak/scripts/lib/element-locator.mjs +507 -0
- package/templates/skills/sb-tweak/scripts/lib/intent-parser.mjs +324 -0
- package/templates/skills/sb-tweak/scripts/tests/test-diff-summarizer.mjs +248 -0
- package/templates/skills/sb-tweak/scripts/tests/test-element-locator.mjs +418 -0
- package/templates/skills/sb-tweak/scripts/tests/test-intent-parser.mjs +496 -0
- package/templates/skills/sb-tweak/scripts/tests/test-tweak.mjs +407 -0
- package/templates/skills/sb-tweak/scripts/tweak.mjs +656 -0
- package/templates/skills/sb-validate-render/SKILL.md +120 -0
- package/templates/skills/sb-validate-render/scripts/tests/test-validate-render.mjs +304 -0
- package/templates/skills/sb-validate-render/scripts/validate-render.mjs +645 -0
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
// blocklist-filter.mjs — Pure URL filtering. No I/O.
|
|
2
|
+
//
|
|
3
|
+
// Encodes the rule that a SimilarBuild page list must never include:
|
|
4
|
+
//
|
|
5
|
+
// - Auth-gated pages (/account, /login, /admin, etc.) — irrelevant for
|
|
6
|
+
// visual cloning and many will 401/403.
|
|
7
|
+
// - Cart / checkout — dynamic, session-bound, and typically unique enough
|
|
8
|
+
// that a clone would mislead more than it informs.
|
|
9
|
+
// - API / framework internals (/api/**, /_next/**, /_nuxt/**) — not visual.
|
|
10
|
+
// - Tracking-only URLs (?utm_*, ?fbclid, ?gclid, ?mc_eid). These ARE
|
|
11
|
+
// duplicates of the canonical URL with a parameter; we drop the params
|
|
12
|
+
// and let dedup handle the rest.
|
|
13
|
+
// - RSS/Atom feeds — not pages, will not render.
|
|
14
|
+
//
|
|
15
|
+
// Custom blocklist (--blocklist) is additive: user prefixes are layered on
|
|
16
|
+
// top of the defaults, never replace them. That way you can't accidentally
|
|
17
|
+
// allow /admin by passing a too-narrow custom list.
|
|
18
|
+
|
|
19
|
+
// Built-in path prefixes that are always excluded from a page list. Order is
// significant only in that buildBlocklist returns these first, followed by
// any custom entries.
const DEFAULT_PATH_PREFIXES = [
  // Cart / checkout
  '/cart', '/checkout',
  // Account and authentication
  '/account', '/login', '/register',
  '/signin', '/signup', '/sign-in', '/sign-up', '/logout',
  // Admin surfaces
  '/admin', '/wp-admin', '/wp-login.php',
  // API / framework internals
  '/api/', '/_next/', '/_nuxt/',
  // Feeds
  '/feed', '/rss', '/atom',
]
|
|
40
|
+
|
|
41
|
+
// Tracking params we strip. Any URL whose ONLY querystring content is these
|
|
42
|
+
// gets normalized to the bare path.
|
|
43
|
+
// Query parameter names (lowercase) that are removed during normalization.
// Any parameter whose lowercased name starts with `utm_` is removed as well,
// even if it is not listed here.
const TRACKING_PARAMS = new Set([
  'utm_source',
  'utm_medium',
  'utm_campaign',
  'utm_term',
  'utm_content',
  'utm_id',
  'utm_name',
  'fbclid',
  'gclid',
  'gbraid',
  'wbraid',
  'mc_eid',
  'mc_cid',
  'msclkid',
  'yclid',
  '_ga',
  'ref',
  'ref_src',
])

// Remove tracking parameters from `u` (a URL object) in place and return the
// same object. Matching is case-insensitive on the parameter name.
function stripTracking(u) {
  const params = u.searchParams
  // Iterate over a snapshot of the keys so deletions cannot disturb the loop.
  for (const key of [...params.keys()]) {
    const lower = key.toLowerCase()
    if (TRACKING_PARAMS.has(lower) || lower.startsWith('utm_')) {
      params.delete(key)
    }
  }
  return u
}

// Normalize a URL for dedupe + filtering.
//
//   url         absolute URL string
//   originHost  hostname the URL must belong to; compared case-insensitively.
//               When falsy, the origin check is skipped.
//
// Returns the normalized URL string, or null when the input is not parseable,
// is not http(s), or belongs to a different host.
export function normalizeUrl(url, originHost) {
  try {
    const u = new URL(url)
    if (u.protocol !== 'http:' && u.protocol !== 'https:') return null

    // Lowercase both sides: a caller passing 'Example.com' would otherwise
    // have every URL rejected as off-origin.
    if (originHost && u.hostname.toLowerCase() !== String(originHost).toLowerCase()) {
      return null
    }

    // Fragments are intra-page anchors, not separate pages.
    u.hash = ''
    stripTracking(u)

    // Re-emit the remaining params sorted by key so that `?a=1&b=2` and
    // `?b=2&a=1` normalize to the same string.
    const sorted = [...u.searchParams.entries()].sort(([a], [b]) => a.localeCompare(b))
    u.search = ''
    for (const [key, value] of sorted) u.searchParams.append(key, value)

    // `/foo` and `/foo/` are treated as the same page; the root keeps its slash.
    if (u.pathname.length > 1 && u.pathname.endsWith('/')) {
      u.pathname = u.pathname.slice(0, -1)
    }
    return u.toString()
  } catch {
    return null
  }
}
|
|
102
|
+
|
|
103
|
+
// Check if a normalized URL matches any path prefix in the blocklist. The
|
|
104
|
+
// match is on lowercased pathname only — querystrings can't trigger blocks.
|
|
105
|
+
// Decide whether `url` falls under any blocklist prefix.
//
//   url                absolute URL string (only its pathname is inspected,
//                      so querystrings and fragments can never trigger a block)
//   blocklistPrefixes  iterable of path prefixes such as '/admin' or '/api/';
//                      a missing list is treated as empty
//
// Matching is case-insensitive and segment-aware: a prefix matches the path
// itself and anything below it, but never a longer sibling name ('/admin'
// does not match '/admin-tools'). A trailing slash on the prefix is ignored,
// so '/api/' also blocks the bare '/api' path — which is what a URL like
// '/api/' looks like once its trailing slash has been stripped.
//
// Returns false for unparseable URLs.
export function matchesBlocklist(url, blocklistPrefixes) {
  let pathname
  try {
    pathname = new URL(url).pathname.toLowerCase()
  } catch {
    return false
  }

  for (const prefix of blocklistPrefixes ?? []) {
    const p = String(prefix).toLowerCase()
    // An empty prefix carries no information; skip it rather than block all.
    if (p === '') continue
    const base = p.endsWith('/') ? p.slice(0, -1) : p
    if (pathname === base || pathname.startsWith(`${base}/`)) return true
  }
  return false
}
|
|
128
|
+
|
|
129
|
+
// .xml extensions — block every *.xml EXCEPT the one we used to crawl. The
|
|
130
|
+
// crawler passes the sitemap URL it consumed (if any) so we don't accidentally
|
|
131
|
+
// drop sitemap.xml itself if it appears in <loc> entries (which would be weird
|
|
132
|
+
// but does happen on misconfigured sites).
|
|
133
|
+
// Report whether `url` points at an XML document or feed file, judged purely
// by the extension of its lowercased pathname (.xml, .rss, .atom).
//
// `sitemapUrl` (optional) names the one document that is exempt: a URL with
// the same hostname and the same lowercased pathname always yields false.
// An unparseable `sitemapUrl` simply grants no exemption; an unparseable
// `url` yields false.
export function isXmlOrFeed(url, sitemapUrl) {
  let candidate
  try {
    candidate = new URL(url)
  } catch {
    return false
  }
  const lowerPath = candidate.pathname.toLowerCase()

  if (sitemapUrl) {
    let active = null
    try {
      active = new URL(sitemapUrl)
    } catch {
      /* fall through */
    }
    const isActiveSitemap =
      active !== null &&
      active.pathname.toLowerCase() === lowerPath &&
      active.hostname === candidate.hostname
    if (isActiveSitemap) return false
  }

  return ['.xml', '.rss', '.atom'].some((ext) => lowerPath.endsWith(ext))
}
|
|
152
|
+
|
|
153
|
+
// Build the effective blocklist: every default prefix followed by the
// caller's custom entries. Custom entries are additive only — the defaults
// are always present in the result.
//
//   customBlocklist  comma-separated string ('/preview, /staging') or an
//                    array of prefixes; falsy means "no custom entries"
//
// Each custom entry is trimmed, blanks are dropped, and a leading '/' is
// added when missing: URL pathnames always begin with '/', so an entry
// without one could never match anything.
//
// Returns a new array on every call.
export function buildBlocklist(customBlocklist) {
  const rawEntries = Array.isArray(customBlocklist)
    ? customBlocklist
    : String(customBlocklist || '').split(',')

  const custom = rawEntries
    .map((entry) => String(entry).trim())
    .filter(Boolean)
    .map((entry) => (entry.startsWith('/') ? entry : `/${entry}`))

  return [...DEFAULT_PATH_PREFIXES, ...custom]
}
|
|
160
|
+
|
|
161
|
+
// Run the full pipeline for one URL. Returns:
|
|
162
|
+
// { keep: true, url: normalized } — pass through
|
|
163
|
+
// { keep: false, reason: 'invalid-or-off-origin'|'blocklisted'|'xml-or-feed' } — drop
|
|
164
|
+
// Run one URL through normalization, the prefix blocklist and the XML/feed
// check, in that order. The first stage that rejects the URL decides the
// reason:
//
//   normalizeUrl returned null  -> 'invalid-or-off-origin'
//   matchesBlocklist matched    -> 'blocklisted'
//   isXmlOrFeed matched         -> 'xml-or-feed'
//
// Returns { keep: true, url } with the normalized URL, or
// { keep: false, reason }.
export function filterUrl(url, { originHost, blocklistPrefixes, sitemapUrl }) {
  const normalized = normalizeUrl(url, originHost)

  let reason = null
  if (!normalized) {
    reason = 'invalid-or-off-origin'
  } else if (matchesBlocklist(normalized, blocklistPrefixes)) {
    reason = 'blocklisted'
  } else if (isXmlOrFeed(normalized, sitemapUrl)) {
    reason = 'xml-or-feed'
  }

  return reason === null ? { keep: true, url: normalized } : { keep: false, reason }
}
|
|
175
|
+
|
|
176
|
+
// Exposes the module's private default tables; the unit tests import this to
// assert against the real defaults.
export const __defaults = { DEFAULT_PATH_PREFIXES, TRACKING_PARAMS }
|
|
@@ -0,0 +1,107 @@
|
|
|
1
|
+
// fallback-crawler.mjs — Same-origin BFS using fetch + cheerio when sitemap
|
|
2
|
+
// is absent or unparseable. Pure-ish: cheerio + fetcher are injected, no
|
|
3
|
+
// process I/O. The crawler stops at maxDepth and maxPages and reports every
|
|
4
|
+
// URL it discovered (including the ones over the cap, so the orchestrator
|
|
5
|
+
// can warn).
|
|
6
|
+
//
|
|
7
|
+
// Why fetch + cheerio not chromium:
|
|
8
|
+
// - Discovery doesn't need rendering. Static <a href> tags reveal site
|
|
9
|
+
// structure on > 90% of WordPress / Shopify / Webflow / static sites.
|
|
10
|
+
// - Chromium adds 250MB install + ~5s startup per page; for 100 pages
|
|
11
|
+
// that's a 10-minute crawl when fetch finishes in 30s.
|
|
12
|
+
// - sb-inspect-live owns the chromium budget — we keep it for the actual
|
|
13
|
+
// visual capture.
|
|
14
|
+
//
|
|
15
|
+
// Trade-off accepted: pure-SPA sites (Next.js client-routed, fully
|
|
16
|
+
// JS-rendered) return only the entry shell. We detect that ("0–1 internal
|
|
17
|
+
// links found across the whole crawl") and surface a clear warning so the
|
|
18
|
+
// user knows to supply a sitemap manually.
|
|
19
|
+
|
|
20
|
+
import { filterUrl } from './blocklist-filter.mjs'
|
|
21
|
+
|
|
22
|
+
// fetcher signature: async (url) => { ok, status, text? }
|
|
23
|
+
// loadHtml signature: (html) => $ (cheerio instance)
|
|
24
|
+
// Breadth-first, same-host crawl starting at `rootUrl`.
//
//   rootUrl            absolute URL of the entry page; its hostname defines
//                      the origin every discovered link must match
//   fetcher            async (url) => { ok, status, text? }
//   loadHtml           (html) => $ (cheerio-style selector function)
//   maxDepth           pages at this depth are recorded but not expanded
//   maxPages           the crawl stops once this many pages were recorded
//   blocklistPrefixes  passed through to filterUrl
//   sitemapUrl         passed through to filterUrl
//
// Returns { results, warnings, totalDiscovered } where `results` is a list of
// { url, depth } for every page fetched successfully, `warnings` is a list of
// machine-readable strings, and `totalDiscovered` counts recorded pages plus
// URLs still waiting in the queue when the crawl stopped.
export async function fallbackCrawl({
  rootUrl,
  fetcher,
  loadHtml,
  maxDepth,
  maxPages,
  blocklistPrefixes,
  sitemapUrl, // pass-through so isXmlOrFeed knows what to keep
}) {
  const origin = new URL(rootUrl)
  const filterCtx = {
    originHost: origin.hostname.toLowerCase(),
    blocklistPrefixes,
    sitemapUrl,
  }

  const visited = new Set() // normalized URLs already dequeued
  const queued = new Set() // normalized URLs ever enqueued from a link
  const queue = [{ url: origin.toString(), depth: 0 }]
  let head = 0 // read cursor; avoids O(n) Array.prototype.shift
  const results = [] // { url, depth }
  const warnings = []

  // Number of links on the root page that survived filtering. Stays null
  // when the root page was never parsed (filtered out, fetch or parse
  // failure, or maxDepth of 0), in which case no SPA verdict is possible.
  let rootInternalHrefCount = null

  while (head < queue.length && results.length < maxPages) {
    const { url, depth } = queue[head++]
    const filtered = filterUrl(url, filterCtx)
    if (!filtered.keep) continue
    const norm = filtered.url
    if (visited.has(norm)) continue
    visited.add(norm)

    // A fetcher that rejects (e.g. network error) is reported like any other
    // failed fetch instead of aborting the whole crawl.
    let fetched = null
    try {
      fetched = await fetcher(norm)
    } catch {
      fetched = null
    }
    if (!fetched || !fetched.ok) {
      warnings.push(`crawl-fetch-failed:${norm}:${(fetched && fetched.status) || 0}`)
      continue
    }

    // The page itself counts even if it yields no further links.
    results.push({ url: norm, depth })

    if (depth >= maxDepth) continue

    let $
    try {
      $ = loadHtml(fetched.text || '')
    } catch (err) {
      warnings.push(`crawl-parse-failed:${norm}:${err?.message || 'unknown'}`)
      continue
    }

    const hrefs = []
    $('a[href]').each((_, el) => {
      const raw = $(el).attr('href')
      if (raw) hrefs.push(raw)
    })

    if (depth === 0) rootInternalHrefCount = 0

    for (const raw of hrefs) {
      let abs
      try {
        // Resolve relative hrefs against the page they were found on.
        abs = new URL(raw, norm).toString()
      } catch {
        continue
      }
      const f = filterUrl(abs, filterCtx)
      if (!f.keep) continue
      if (depth === 0) rootInternalHrefCount += 1
      // Set lookups keep dedupe O(1); the queue can grow far beyond maxPages
      // because every surviving link on every page is enqueued.
      if (visited.has(f.url) || queued.has(f.url)) continue
      queued.add(f.url)
      queue.push({ url: f.url, depth: depth + 1 })
    }
  }

  // SPA detector: the root page was parsed, it exposed at most one internal
  // link, and the crawl recorded at most one page. Only a warning is emitted;
  // the caller decides what to do with it.
  if (rootInternalHrefCount !== null && rootInternalHrefCount <= 1 && results.length <= 1) {
    warnings.push('spa-suspected')
  }

  return { results, warnings, totalDiscovered: results.length + (queue.length - head) }
}
|
|
@@ -0,0 +1,89 @@
|
|
|
1
|
+
// page-classifier.mjs — URL → page-type heuristic via path inspection. Pure.
|
|
2
|
+
//
|
|
3
|
+
// Used by `/build-site` to bucket pages so the user sees a structured table
|
|
4
|
+
// (e.g. "5 PDPs, 3 collections, 1 home") before confirming. The classification
|
|
5
|
+
// drives nothing fatal — `other` is a valid type — but the better the
|
|
6
|
+
// guesses, the less the orchestrator nags the user about reordering.
|
|
7
|
+
//
|
|
8
|
+
// Heuristics are intentionally Shopify-flavoured (most common SimilarBuild
|
|
9
|
+
// target) but match common WordPress/Webflow paths too. Patterns are checked
|
|
10
|
+
// top-down; first match wins. Each test is a cheap prefix comparison or a
// small anchored regex against the lowercased pathname.
|
|
12
|
+
|
|
13
|
+
// Ordered classification table. Each rule pairs a page type with a predicate
// over the lowercased pathname; the first rule whose predicate returns true
// decides the type.
const TYPE_RULES = [
  // Home: the bare root only.
  {
    type: 'home',
    test: (p) => p === '' || p === '/',
  },

  // Product detail pages, plural and singular path styles.
  {
    type: 'pdp',
    test: (p) => ['/products/', '/product/'].some((prefix) => p.startsWith(prefix)),
  },

  // Collection / category listings.
  {
    type: 'collection',
    test: (p) =>
      ['/collections/', '/collection/', '/category/', '/product-category/'].some((prefix) =>
        p.startsWith(prefix),
      ),
  },

  // Contact: under /pages/ or at the top level.
  {
    type: 'contact',
    test: (p) => {
      if (p === '/contact' || p.startsWith('/contact/')) return true
      return /^\/pages\/contact(\b|\/|-)/.test(p)
    },
  },

  // About: same shape as contact.
  {
    type: 'about',
    test: (p) => {
      if (p === '/about' || p.startsWith('/about/')) return true
      return /^\/pages\/about(\b|\/|-)/.test(p)
    },
  },

  // Policy pages: a /policies/ tree or well-known policy names.
  {
    type: 'policy',
    test: (p) => {
      if (p.startsWith('/policies/')) return true
      if (/^\/pages\/(privacy|terms|refund|shipping|return)(\b|\/|-)/.test(p)) return true
      return /^\/(privacy|terms|refund|shipping|return)(\b|\/|-)/.test(p)
    },
  },

  // Blog index and posts, including /news.
  {
    type: 'blog',
    test: (p) => {
      if (p === '/blog' || p === '/news') return true
      return ['/blogs/', '/blog/', '/posts/', '/news/'].some((prefix) => p.startsWith(prefix))
    },
  },
]
|
|
66
|
+
|
|
67
|
+
// Normalize a URL string to its lowercase pathname. Falls back to the input
|
|
68
|
+
// if it's not parseable (caller may pass a path-only string for testing).
|
|
69
|
+
// Reduce `url` to a lowercase pathname with no trailing slash (the root "/"
// keeps its slash).
//
// Absolute URLs are parsed with the URL constructor. Anything it rejects is
// treated as a path-only string: the querystring and fragment are cut off so
// that '/contact?ref=nav' is seen as '/contact', same as it would be for an
// absolute URL.
function pathnameOf(url) {
  let path
  try {
    path = new URL(url).pathname
  } catch {
    // Not an absolute URL; keep everything before the first '?' or '#'.
    path = String(url).split(/[?#]/, 1)[0]
  }
  path = path.toLowerCase()
  return path.length > 1 && path.endsWith('/') ? path.slice(0, -1) : path
}
|
|
82
|
+
|
|
83
|
+
// Classify a URL (or path-only string) into a page type. The pathname is
// checked against TYPE_RULES in order and the first matching rule's type is
// returned; 'other' when no rule matches.
export function classifyUrl(url) {
  const path = pathnameOf(url)
  const hit = TYPE_RULES.find((rule) => rule.test(path))
  return hit ? hit.type : 'other'
}
|
|
@@ -0,0 +1,118 @@
|
|
|
1
|
+
// sitemap-parser.mjs — Pure XML → URL extraction. No I/O. Tested in isolation.
|
|
2
|
+
//
|
|
3
|
+
// Sitemap protocol (sitemaps.org) is small enough that a regex-driven parser
|
|
4
|
+
// beats pulling in xml2js for an install-time cost we don't need. We accept:
|
|
5
|
+
//
|
|
6
|
+
// - <urlset> with <url><loc>…</loc></url> (the common case)
|
|
7
|
+
// - <sitemapindex> with <sitemap><loc>…</loc></sitemap> (recurse one level)
|
|
8
|
+
//
|
|
9
|
+
// We do NOT validate XML well-formedness — sitemaps in the wild are full of
|
|
10
|
+
// stray BOMs, missing closing tags, and CDATA quirks. Strict validation would
|
|
11
|
+
// reject sitemaps Google's crawler happily eats. Instead: extract every <loc>
|
|
12
|
+
// payload, decode entities, trim, dedupe, and let downstream filtering decide.
|
|
13
|
+
//
|
|
14
|
+
// parseSitemap returns: { kind, urlSetUrls, sitemapIndexUrls, malformed }
|
|
15
|
+
|
|
16
|
+
// The five predefined XML entities, keyed by entity name (without the
// surrounding ampersand and semicolon).
const ENTITY_MAP = {
  amp: '&',
  lt: '<',
  gt: '>',
  quot: '"',
  apos: "'",
}

// Decode XML character references in `s`: the five predefined named
// entities plus decimal and hexadecimal numeric references.
//
// Decoding is a single pass, so text produced by one reference is never
// re-scanned (an escaped ampersand followed by "#38;" yields the literal
// text "&#38;", not "&"). Named entities are matched case-sensitively.
// Unknown names and numeric references above U+10FFFF are left untouched
// instead of throwing.
function decodeEntities(s) {
  return s.replace(
    /&(?:(amp|lt|gt|quot|apos)|#(\d+)|#x([0-9a-f]+));/gi,
    (match, named, dec, hex) => {
      if (named !== undefined) return ENTITY_MAP[named] ?? match
      const code = dec !== undefined ? Number.parseInt(dec, 10) : Number.parseInt(hex, 16)
      // String.fromCodePoint throws a RangeError beyond the Unicode range.
      return code <= 0x10ffff ? String.fromCodePoint(code) : match
    },
  )
}
|
|
30
|
+
|
|
31
|
+
// Collect the payload of every <loc>…</loc> element in `xml`.
//
// Each payload is trimmed, entity-decoded, and de-duplicated; the result
// keeps first-occurrence order. Empty payloads are skipped. An optional
// CDATA wrapper around the payload is stripped. No well-formedness checks
// are performed.
function extractLocs(xml) {
  // Non-greedy capture between the tags; whitespace and a CDATA wrapper on
  // either side of the payload are excluded from the capture group.
  const locPattern = /<loc\b[^>]*>\s*(?:<!\[CDATA\[)?([\s\S]*?)(?:\]\]>)?\s*<\/loc\s*>/gi
  const unique = new Set() // Set iteration follows insertion order
  for (const match of xml.matchAll(locPattern)) {
    const raw = match[1].trim()
    if (raw) unique.add(decodeEntities(raw))
  }
  return [...unique]
}
|
|
43
|
+
|
|
44
|
+
// Distinguish between <urlset> (page URLs) and <sitemapindex> (more sitemaps).
|
|
45
|
+
// Sitemap protocol allows nested sitemapindex but we recurse only one level —
|
|
46
|
+
// real-world sites that nest deeper are rare and orchestrator can pass a
|
|
47
|
+
// specific child sitemap via --sitemap-path.
|
|
48
|
+
// Classify a sitemap document by the first marker found, checked in this
// order: a <sitemapindex> tag -> 'index'; a <urlset> tag -> 'urlset'; any
// <loc> tag at all (wrapper missing) -> 'urlset'. Otherwise 'unknown'.
// Tag matching is case-insensitive.
function detectKind(xml) {
  const probes = [
    ['index', /<\s*sitemapindex\b/i],
    ['urlset', /<\s*urlset\b/i],
    ['urlset', /<loc\b/i],
  ]
  const hit = probes.find(([, pattern]) => pattern.test(xml))
  return hit ? hit[0] : 'unknown'
}
|
|
56
|
+
|
|
57
|
+
// Parse sitemap XML text into { kind, urlSetUrls, sitemapIndexUrls, malformed }.
//
//   kind              'urlset' | 'index' | 'unknown'
//   urlSetUrls        <loc> values when kind is 'urlset', else []
//   sitemapIndexUrls  <loc> values when kind is 'index', else []
//   malformed         true when the input is not a non-empty string or no
//                     sitemap markers were found
//
// A leading byte-order mark is ignored.
export function parseSitemap(xml) {
  const rejected = (kind) => ({ kind, urlSetUrls: [], sitemapIndexUrls: [], malformed: true })

  if (typeof xml !== 'string' || xml.length === 0) return rejected('unknown')

  // Drop a leading BOM before looking for tags.
  const body = xml.charCodeAt(0) === 0xfeff ? xml.slice(1) : xml

  const kind = detectKind(body)
  if (kind === 'unknown') return rejected(kind)

  const locs = extractLocs(body)
  return kind === 'index'
    ? { kind, urlSetUrls: [], sitemapIndexUrls: locs, malformed: false }
    : { kind, urlSetUrls: locs, sitemapIndexUrls: [], malformed: false }
}
|
|
74
|
+
|
|
75
|
+
// Async fetch + parse with one level of sitemapindex recursion. Fetcher is
|
|
76
|
+
// injected so this stays pure-ish and testable. Returns merged URL list +
|
|
77
|
+
// any per-child errors as warnings.
|
|
78
|
+
//
|
|
79
|
+
// fetcher signature: async (url) => { ok, status, text? }
|
|
80
|
+
// Fetch a sitemap and return its page URLs, following a sitemap index one
// level deep.
//
//   url          sitemap URL to fetch
//   fetcher      async (url) => { ok, status, text? }
//   maxChildren  upper bound on child sitemaps fetched from an index
//                (default 50)
//
// Returns { urls, warnings }. Nothing is thrown for fetch failures: a
// fetcher that rejects or returns nothing is treated as { ok: false } with
// status 0. Warning strings:
//
//   sitemap-fetch-failed:<status>             top-level fetch failed
//   sitemap-malformed                         top-level document unusable
//   sitemap-index-truncated:<total>-of-<max>  index listed more children than
//                                             maxChildren
//   sitemap-child-failed:<url>:<status>       child fetch failed
//   sitemap-child-malformed:<url>             child document unusable
//   sitemap-child-nested-index:<url>          child is itself an index; it is
//                                             not followed
export async function fetchAndParseSitemap(url, fetcher, { maxChildren = 50 } = {}) {
  const tryFetch = async (target) => {
    try {
      return (await fetcher(target)) || { ok: false, status: 0 }
    } catch {
      return { ok: false, status: 0 }
    }
  }

  const warnings = []
  const collected = []

  const top = await tryFetch(url)
  if (!top.ok) {
    return { urls: [], warnings: [`sitemap-fetch-failed:${top.status || 0}`] }
  }
  const parsed = parseSitemap(top.text || '')
  if (parsed.malformed) {
    return { urls: [], warnings: ['sitemap-malformed'] }
  }
  if (parsed.kind === 'urlset') {
    return { urls: parsed.urlSetUrls, warnings }
  }

  // sitemapindex — fetch each child sitemap up to maxChildren.
  const children = parsed.sitemapIndexUrls.slice(0, maxChildren)
  if (parsed.sitemapIndexUrls.length > maxChildren) {
    warnings.push(`sitemap-index-truncated:${parsed.sitemapIndexUrls.length}-of-${maxChildren}`)
  }

  // Children are fetched one at a time, in index order.
  for (const childUrl of children) {
    const child = await tryFetch(childUrl)
    if (!child.ok) {
      warnings.push(`sitemap-child-failed:${childUrl}:${child.status || 0}`)
      continue
    }
    const childParsed = parseSitemap(child.text || '')
    if (childParsed.malformed) {
      warnings.push(`sitemap-child-malformed:${childUrl}`)
      continue
    }
    if (childParsed.kind !== 'urlset') {
      // Only one level of recursion is supported; say so instead of
      // dropping the child's contents silently.
      warnings.push(`sitemap-child-nested-index:${childUrl}`)
      continue
    }
    // Plain loop rather than push(...list): a large child sitemap would
    // otherwise be spread into a single call's argument list.
    for (const pageUrl of childParsed.urlSetUrls) collected.push(pageUrl)
  }
  return { urls: collected, warnings }
}
|
|
@@ -0,0 +1,204 @@
|
|
|
1
|
+
#!/usr/bin/env node
|
|
2
|
+
// Tests for lib/blocklist-filter.mjs — pure unit tests.
|
|
3
|
+
|
|
4
|
+
import { strict as assert } from 'node:assert'
|
|
5
|
+
import {
|
|
6
|
+
buildBlocklist,
|
|
7
|
+
filterUrl,
|
|
8
|
+
matchesBlocklist,
|
|
9
|
+
normalizeUrl,
|
|
10
|
+
isXmlOrFeed,
|
|
11
|
+
__defaults,
|
|
12
|
+
} from '../lib/blocklist-filter.mjs'
|
|
13
|
+
|
|
14
|
+
let passed = 0
|
|
15
|
+
let failed = 0
|
|
16
|
+
|
|
17
|
+
// Minimal synchronous test runner: calls `fn`, prints a TAP-like "ok" /
// "not ok" line for `name`, and bumps the module-level pass/fail counters.
// A test fails when `fn` throws; the thrown value's `message` is printed
// under the "not ok" line.
function test(name, fn) {
  let threw = false
  let thrown
  try {
    fn()
  } catch (err) {
    threw = true
    thrown = err
  }

  if (!threw) {
    process.stdout.write(`ok - ${name}\n`)
    passed++
    return
  }
  process.stdout.write(`not ok - ${name}\n ${thrown.message}\n`)
  failed++
}
|
|
27
|
+
|
|
28
|
+
// ─── normalizeUrl ───────────────────────────────────────────────────────────
|
|
29
|
+
|
|
30
|
+
test('normalizeUrl strips fragment', () => {
|
|
31
|
+
assert.equal(
|
|
32
|
+
normalizeUrl('https://example.com/page#section', 'example.com'),
|
|
33
|
+
'https://example.com/page',
|
|
34
|
+
)
|
|
35
|
+
})
|
|
36
|
+
|
|
37
|
+
test('normalizeUrl strips utm_* tracking params', () => {
|
|
38
|
+
assert.equal(
|
|
39
|
+
normalizeUrl('https://example.com/page?utm_source=twitter&utm_medium=social', 'example.com'),
|
|
40
|
+
'https://example.com/page',
|
|
41
|
+
)
|
|
42
|
+
})
|
|
43
|
+
|
|
44
|
+
test('normalizeUrl strips fbclid/gclid/mc_eid', () => {
|
|
45
|
+
assert.equal(
|
|
46
|
+
normalizeUrl('https://example.com/p?fbclid=123', 'example.com'),
|
|
47
|
+
'https://example.com/p',
|
|
48
|
+
)
|
|
49
|
+
assert.equal(
|
|
50
|
+
normalizeUrl('https://example.com/p?gclid=abc', 'example.com'),
|
|
51
|
+
'https://example.com/p',
|
|
52
|
+
)
|
|
53
|
+
assert.equal(
|
|
54
|
+
normalizeUrl('https://example.com/p?mc_eid=z', 'example.com'),
|
|
55
|
+
'https://example.com/p',
|
|
56
|
+
)
|
|
57
|
+
})
|
|
58
|
+
|
|
59
|
+
test('normalizeUrl preserves non-tracking params', () => {
|
|
60
|
+
assert.equal(
|
|
61
|
+
normalizeUrl('https://example.com/products?variant=red', 'example.com'),
|
|
62
|
+
'https://example.com/products?variant=red',
|
|
63
|
+
)
|
|
64
|
+
})
|
|
65
|
+
|
|
66
|
+
test('normalizeUrl rejects off-origin URLs', () => {
|
|
67
|
+
assert.equal(normalizeUrl('https://other.com/foo', 'example.com'), null)
|
|
68
|
+
})
|
|
69
|
+
|
|
70
|
+
test('normalizeUrl rejects non-http(s) schemes', () => {
|
|
71
|
+
assert.equal(normalizeUrl('mailto:hello@example.com', 'example.com'), null)
|
|
72
|
+
assert.equal(normalizeUrl('ftp://example.com/file', 'example.com'), null)
|
|
73
|
+
})
|
|
74
|
+
|
|
75
|
+
test('normalizeUrl strips trailing slash on non-root pathnames', () => {
|
|
76
|
+
assert.equal(
|
|
77
|
+
normalizeUrl('https://example.com/products/foo/', 'example.com'),
|
|
78
|
+
'https://example.com/products/foo',
|
|
79
|
+
)
|
|
80
|
+
})
|
|
81
|
+
|
|
82
|
+
test('normalizeUrl preserves root slash', () => {
|
|
83
|
+
assert.equal(normalizeUrl('https://example.com/', 'example.com'), 'https://example.com/')
|
|
84
|
+
})
|
|
85
|
+
|
|
86
|
+
test('normalizeUrl returns null on garbage input', () => {
|
|
87
|
+
assert.equal(normalizeUrl('not-a-url', 'example.com'), null)
|
|
88
|
+
})
|
|
89
|
+
|
|
90
|
+
// ─── matchesBlocklist ───────────────────────────────────────────────────────
|
|
91
|
+
|
|
92
|
+
test('matchesBlocklist matches /admin and /admin/foo', () => {
|
|
93
|
+
const list = ['/admin']
|
|
94
|
+
assert.ok(matchesBlocklist('https://example.com/admin', list))
|
|
95
|
+
assert.ok(matchesBlocklist('https://example.com/admin/users', list))
|
|
96
|
+
})
|
|
97
|
+
|
|
98
|
+
test('matchesBlocklist does NOT falsely match /admin-tools as /admin', () => {
|
|
99
|
+
assert.ok(!matchesBlocklist('https://example.com/admin-tools', ['/admin']))
|
|
100
|
+
})
|
|
101
|
+
|
|
102
|
+
test('matchesBlocklist matches /api/ prefix', () => {
|
|
103
|
+
assert.ok(matchesBlocklist('https://example.com/api/v1/users', ['/api/']))
|
|
104
|
+
})
|
|
105
|
+
|
|
106
|
+
test('matchesBlocklist matches exact /wp-login.php file', () => {
|
|
107
|
+
assert.ok(matchesBlocklist('https://example.com/wp-login.php', ['/wp-login.php']))
|
|
108
|
+
})
|
|
109
|
+
|
|
110
|
+
test('matchesBlocklist is case-insensitive on path', () => {
|
|
111
|
+
assert.ok(matchesBlocklist('https://example.com/Admin', ['/admin']))
|
|
112
|
+
})
|
|
113
|
+
|
|
114
|
+
// ─── isXmlOrFeed ────────────────────────────────────────────────────────────
|
|
115
|
+
|
|
116
|
+
test('isXmlOrFeed flags arbitrary .xml URLs', () => {
|
|
117
|
+
assert.ok(isXmlOrFeed('https://example.com/foo.xml'))
|
|
118
|
+
})
|
|
119
|
+
|
|
120
|
+
test('isXmlOrFeed allows the active sitemap to pass through', () => {
|
|
121
|
+
assert.ok(
|
|
122
|
+
!isXmlOrFeed('https://example.com/sitemap.xml', 'https://example.com/sitemap.xml'),
|
|
123
|
+
)
|
|
124
|
+
})
|
|
125
|
+
|
|
126
|
+
// isXmlOrFeed looks at file extensions only; this case covers the two feed
// extensions. Path prefixes such as /feed are a blocklist concern and are
// not exercised here.
test('isXmlOrFeed flags .atom and .rss file extensions', () => {
  assert.ok(isXmlOrFeed('https://example.com/feed.atom'))
  assert.ok(isXmlOrFeed('https://example.com/posts.rss'))
})
|
|
130
|
+
|
|
131
|
+
// ─── buildBlocklist ─────────────────────────────────────────────────────────
|
|
132
|
+
|
|
133
|
+
test('buildBlocklist includes all defaults', () => {
|
|
134
|
+
const list = buildBlocklist()
|
|
135
|
+
for (const p of __defaults.DEFAULT_PATH_PREFIXES) {
|
|
136
|
+
assert.ok(list.includes(p), `missing default ${p}`)
|
|
137
|
+
}
|
|
138
|
+
})
|
|
139
|
+
|
|
140
|
+
test('buildBlocklist appends custom prefixes additively', () => {
|
|
141
|
+
const list = buildBlocklist('/preview, /internal,/staging')
|
|
142
|
+
assert.ok(list.includes('/preview'))
|
|
143
|
+
assert.ok(list.includes('/internal'))
|
|
144
|
+
assert.ok(list.includes('/staging'))
|
|
145
|
+
// Defaults still present
|
|
146
|
+
assert.ok(list.includes('/admin'))
|
|
147
|
+
})
|
|
148
|
+
|
|
149
|
+
test('buildBlocklist tolerates empty/whitespace custom string', () => {
|
|
150
|
+
const list = buildBlocklist(' ,, ')
|
|
151
|
+
assert.deepEqual(list, [...__defaults.DEFAULT_PATH_PREFIXES])
|
|
152
|
+
})
|
|
153
|
+
|
|
154
|
+
// ─── filterUrl (full pipeline) ──────────────────────────────────────────────
|
|
155
|
+
|
|
156
|
+
const ctx = {
|
|
157
|
+
originHost: 'example.com',
|
|
158
|
+
blocklistPrefixes: buildBlocklist('/preview'),
|
|
159
|
+
sitemapUrl: 'https://example.com/sitemap.xml',
|
|
160
|
+
}
|
|
161
|
+
|
|
162
|
+
test('filterUrl keeps a clean product URL', () => {
|
|
163
|
+
const r = filterUrl('https://example.com/products/foo', ctx)
|
|
164
|
+
assert.equal(r.keep, true)
|
|
165
|
+
assert.equal(r.url, 'https://example.com/products/foo')
|
|
166
|
+
})
|
|
167
|
+
|
|
168
|
+
test('filterUrl normalizes tracking-laden URL before checking', () => {
|
|
169
|
+
const r = filterUrl('https://example.com/products/foo?utm_source=x#hash', ctx)
|
|
170
|
+
assert.equal(r.keep, true)
|
|
171
|
+
assert.equal(r.url, 'https://example.com/products/foo')
|
|
172
|
+
})
|
|
173
|
+
|
|
174
|
+
test('filterUrl drops /admin via default blocklist', () => {
|
|
175
|
+
const r = filterUrl('https://example.com/admin/users', ctx)
|
|
176
|
+
assert.equal(r.keep, false)
|
|
177
|
+
assert.equal(r.reason, 'blocklisted')
|
|
178
|
+
})
|
|
179
|
+
|
|
180
|
+
test('filterUrl drops /preview via custom blocklist', () => {
|
|
181
|
+
const r = filterUrl('https://example.com/preview/page', ctx)
|
|
182
|
+
assert.equal(r.keep, false)
|
|
183
|
+
assert.equal(r.reason, 'blocklisted')
|
|
184
|
+
})
|
|
185
|
+
|
|
186
|
+
test('filterUrl drops off-origin URL', () => {
|
|
187
|
+
const r = filterUrl('https://other.com/foo', ctx)
|
|
188
|
+
assert.equal(r.keep, false)
|
|
189
|
+
assert.equal(r.reason, 'invalid-or-off-origin')
|
|
190
|
+
})
|
|
191
|
+
|
|
192
|
+
test('filterUrl drops generic *.xml but keeps the active sitemap', () => {
|
|
193
|
+
const r = filterUrl('https://example.com/products.xml', ctx)
|
|
194
|
+
assert.equal(r.keep, false)
|
|
195
|
+
assert.equal(r.reason, 'xml-or-feed')
|
|
196
|
+
// The active sitemap URL is exempt from the XML check: isXmlOrFeed returns
// false when the URL matches ctx.sitemapUrl, so filterUrl falls through to
// the keep path even though the pathname ends in .xml:
|
|
199
|
+
const r2 = filterUrl('https://example.com/sitemap.xml', ctx)
|
|
200
|
+
assert.equal(r2.keep, true)
|
|
201
|
+
})
|
|
202
|
+
|
|
203
|
+
process.stdout.write(`\n${passed} passed, ${failed} failed\n`)
|
|
204
|
+
process.exit(failed === 0 ? 0 : 1)
|