similarbuild 0.1.0

Files changed (76)
  1. package/CHANGELOG.md +110 -0
  2. package/LICENSE +21 -0
  3. package/README.md +301 -0
  4. package/bin/install.js +256 -0
  5. package/lib/copy-templates.mjs +52 -0
  6. package/lib/install-deps.mjs +62 -0
  7. package/lib/prompt-config.mjs +83 -0
  8. package/lib/verify-env.mjs +19 -0
  9. package/package.json +63 -0
  10. package/scripts/sync-templates.mjs +71 -0
  11. package/templates/commands/build-page.md +490 -0
  12. package/templates/commands/build-site.md +548 -0
  13. package/templates/commands/clip-section.md +519 -0
  14. package/templates/memory/anti-patterns.md +212 -0
  15. package/templates/memory/design-knowledge.md +225 -0
  16. package/templates/memory/fixes.md +163 -0
  17. package/templates/memory/patterns.md +681 -0
  18. package/templates/presets/shopify-section.yaml +51 -0
  19. package/templates/presets/wp-elementor.yaml +49 -0
  20. package/templates/reports/fixtures/mock-run-1.json +115 -0
  21. package/templates/reports/fixtures/mock-run-2.json +72 -0
  22. package/templates/reports/report-renderer.mjs +218 -0
  23. package/templates/reports/report-template.html +571 -0
  24. package/templates/skills/sb-build-shopify/SKILL.md +104 -0
  25. package/templates/skills/sb-build-shopify/references/shopify-build-rules.md +563 -0
  26. package/templates/skills/sb-build-shopify/scripts/build-shopify.mjs +637 -0
  27. package/templates/skills/sb-build-shopify/scripts/tests/test-build-shopify.mjs +424 -0
  28. package/templates/skills/sb-build-wp/SKILL.md +83 -0
  29. package/templates/skills/sb-build-wp/references/wp-build-rules.md +376 -0
  30. package/templates/skills/sb-build-wp/scripts/build-wp.mjs +327 -0
  31. package/templates/skills/sb-build-wp/scripts/tests/test-build-wp.mjs +224 -0
  32. package/templates/skills/sb-compare-visual/SKILL.md +121 -0
  33. package/templates/skills/sb-compare-visual/scripts/compare-visual.mjs +387 -0
  34. package/templates/skills/sb-compare-visual/scripts/lib/compare-tokens.mjs +273 -0
  35. package/templates/skills/sb-compare-visual/scripts/tests/test-compare-tokens.mjs +350 -0
  36. package/templates/skills/sb-compare-visual/scripts/tests/test-compare-visual.mjs +626 -0
  37. package/templates/skills/sb-crawl-and-list/SKILL.md +99 -0
  38. package/templates/skills/sb-crawl-and-list/scripts/crawl-and-list.mjs +437 -0
  39. package/templates/skills/sb-crawl-and-list/scripts/lib/blocklist-filter.mjs +176 -0
  40. package/templates/skills/sb-crawl-and-list/scripts/lib/fallback-crawler.mjs +107 -0
  41. package/templates/skills/sb-crawl-and-list/scripts/lib/page-classifier.mjs +89 -0
  42. package/templates/skills/sb-crawl-and-list/scripts/lib/sitemap-parser.mjs +118 -0
  43. package/templates/skills/sb-crawl-and-list/scripts/tests/test-blocklist-filter.mjs +204 -0
  44. package/templates/skills/sb-crawl-and-list/scripts/tests/test-crawl-and-list.mjs +276 -0
  45. package/templates/skills/sb-crawl-and-list/scripts/tests/test-fallback-crawler.mjs +231 -0
  46. package/templates/skills/sb-crawl-and-list/scripts/tests/test-page-classifier.mjs +120 -0
  47. package/templates/skills/sb-crawl-and-list/scripts/tests/test-sitemap-parser.mjs +157 -0
  48. package/templates/skills/sb-extract-assets/SKILL.md +112 -0
  49. package/templates/skills/sb-extract-assets/scripts/extract-assets.mjs +484 -0
  50. package/templates/skills/sb-extract-assets/scripts/tests/test-extract-assets.mjs +112 -0
  51. package/templates/skills/sb-inspect-live/SKILL.md +105 -0
  52. package/templates/skills/sb-inspect-live/scripts/inspect-live.mjs +693 -0
  53. package/templates/skills/sb-inspect-live/scripts/tests/test-inspect-live.mjs +181 -0
  54. package/templates/skills/sb-review-checks/SKILL.md +113 -0
  55. package/templates/skills/sb-review-checks/references/review-rules.md +195 -0
  56. package/templates/skills/sb-review-checks/scripts/lib/anti-patterns.mjs +379 -0
  57. package/templates/skills/sb-review-checks/scripts/lib/cross-reference.mjs +115 -0
  58. package/templates/skills/sb-review-checks/scripts/lib/design-quality.mjs +541 -0
  59. package/templates/skills/sb-review-checks/scripts/review-checks.mjs +250 -0
  60. package/templates/skills/sb-review-checks/scripts/tests/test-anti-patterns.mjs +343 -0
  61. package/templates/skills/sb-review-checks/scripts/tests/test-cross-reference.mjs +170 -0
  62. package/templates/skills/sb-review-checks/scripts/tests/test-design-quality.mjs +493 -0
  63. package/templates/skills/sb-review-checks/scripts/tests/test-review-checks.mjs +267 -0
  64. package/templates/skills/sb-tweak/SKILL.md +130 -0
  65. package/templates/skills/sb-tweak/references/tweak-patterns.md +157 -0
  66. package/templates/skills/sb-tweak/scripts/lib/diff-summarizer.mjs +140 -0
  67. package/templates/skills/sb-tweak/scripts/lib/element-locator.mjs +507 -0
  68. package/templates/skills/sb-tweak/scripts/lib/intent-parser.mjs +324 -0
  69. package/templates/skills/sb-tweak/scripts/tests/test-diff-summarizer.mjs +248 -0
  70. package/templates/skills/sb-tweak/scripts/tests/test-element-locator.mjs +418 -0
  71. package/templates/skills/sb-tweak/scripts/tests/test-intent-parser.mjs +496 -0
  72. package/templates/skills/sb-tweak/scripts/tests/test-tweak.mjs +407 -0
  73. package/templates/skills/sb-tweak/scripts/tweak.mjs +656 -0
  74. package/templates/skills/sb-validate-render/SKILL.md +120 -0
  75. package/templates/skills/sb-validate-render/scripts/tests/test-validate-render.mjs +304 -0
  76. package/templates/skills/sb-validate-render/scripts/validate-render.mjs +645 -0
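The three test files shown in the hunks below (items 44 to 46 in the list above) share a dependency-free, TAP-style harness: a local test(name, fn) helper runs each case, prints "ok -" / "not ok -" lines, tallies results, and the process exits non-zero if any case failed. A minimal standalone sketch of that pattern (the sample assertion is illustrative, not taken from the package):

    #!/usr/bin/env node
    import { strict as assert } from 'node:assert'

    let passed = 0
    let failed = 0

    // Run one case, report TAP-style, and keep a running tally.
    function test(name, fn) {
      try {
        fn()
        process.stdout.write(`ok - ${name}\n`)
        passed++
      } catch (err) {
        process.stdout.write(`not ok - ${name}\n ${err.message}\n`)
        failed++
      }
    }

    test('example: string concatenation', () => {
      assert.equal('a' + 'b', 'ab') // illustrative assertion only
    })

    process.stdout.write(`\n${passed} passed, ${failed} failed\n`)
    process.exit(failed === 0 ? 0 : 1)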
package/templates/skills/sb-crawl-and-list/scripts/tests/test-crawl-and-list.mjs
@@ -0,0 +1,276 @@
+ #!/usr/bin/env node
+ // test-crawl-and-list.mjs — Smoke tests for the main script.
+ // No network, no cheerio dep needed: we exercise --help, arg validation,
+ // and a local --sitemap-path round-trip (which doesn't touch fetch or cheerio).
+
+ import { spawnSync } from 'node:child_process'
+ import { fileURLToPath } from 'node:url'
+ import { dirname, resolve, join } from 'node:path'
+ import { mkdtempSync, writeFileSync, rmSync, existsSync, readFileSync } from 'node:fs'
+ import { tmpdir } from 'node:os'
+ import { strict as assert } from 'node:assert'
+
+ const here = dirname(fileURLToPath(import.meta.url))
+ const SCRIPT = resolve(here, '..', 'crawl-and-list.mjs')
+
+ let passed = 0
+ let failed = 0
+
+ function test(name, fn) {
+   try {
+     fn()
+     process.stdout.write(`ok - ${name}\n`)
+     passed++
+   } catch (err) {
+     process.stdout.write(`not ok - ${name}\n ${err.message}\n`)
+     failed++
+   }
+ }
+
+ test('--help exits 0 and prints usage', () => {
+   const r = spawnSync('node', [SCRIPT, '--help'], { encoding: 'utf8' })
+   assert.equal(r.status, 0, `exit code was ${r.status}`)
+   assert.match(r.stdout, /crawl-and-list\.mjs/)
+   assert.match(r.stdout, /--root-url/)
+   assert.match(r.stdout, /--output-dir/)
+   assert.match(r.stdout, /--max-depth/)
+   assert.match(r.stdout, /--max-pages/)
+   assert.match(r.stdout, /--blocklist/)
+   assert.match(r.stdout, /--sitemap-path/)
+ })
+
+ test('missing --root-url exits 2', () => {
+   const r = spawnSync('node', [SCRIPT, '--output-dir', '/tmp/sb-crawl-test'], {
+     encoding: 'utf8',
+   })
+   assert.equal(r.status, 2, `exit code was ${r.status}`)
+   assert.match(r.stderr, /missing --root-url/)
+ })
+
+ test('missing --output-dir exits 2', () => {
+   const r = spawnSync('node', [SCRIPT, '--root-url', 'https://example.com'], {
+     encoding: 'utf8',
+   })
+   assert.equal(r.status, 2, `exit code was ${r.status}`)
+   assert.match(r.stderr, /missing --output-dir/)
+ })
+
+ test('invalid --root-url exits 2', () => {
+   const r = spawnSync(
+     'node',
+     [SCRIPT, '--root-url', 'not-a-url', '--output-dir', '/tmp/sb-crawl-test'],
+     { encoding: 'utf8' },
+   )
+   assert.equal(r.status, 2, `exit code was ${r.status}`)
+   assert.match(r.stderr, /not a valid URL/)
+ })
+
+ test('non-http(s) --root-url exits 2', () => {
+   const r = spawnSync(
+     'node',
+     [SCRIPT, '--root-url', 'ftp://example.com', '--output-dir', '/tmp/sb-crawl-test'],
+     { encoding: 'utf8' },
+   )
+   assert.equal(r.status, 2, `exit code was ${r.status}`)
+   assert.match(r.stderr, /must be http\(s\)/)
+ })
+
+ test('non-numeric --max-depth exits 2', () => {
+   const r = spawnSync(
+     'node',
+     [
+       SCRIPT,
+       '--root-url',
+       'https://example.com',
+       '--output-dir',
+       '/tmp/sb-crawl-test',
+       '--max-depth',
+       'foo',
+     ],
+     { encoding: 'utf8' },
+   )
+   assert.equal(r.status, 2, `exit code was ${r.status}`)
+   assert.match(r.stderr, /max-depth/)
+ })
+
+ test('non-numeric --max-pages exits 2', () => {
+   const r = spawnSync(
+     'node',
+     [
+       SCRIPT,
+       '--root-url',
+       'https://example.com',
+       '--output-dir',
+       '/tmp/sb-crawl-test',
+       '--max-pages',
+       'zero',
+     ],
+     { encoding: 'utf8' },
+   )
+   assert.equal(r.status, 2, `exit code was ${r.status}`)
+   assert.match(r.stderr, /max-pages/)
+ })
+
+ test('--sitemap-path that does not exist exits 2', () => {
+   const r = spawnSync(
+     'node',
+     [
+       SCRIPT,
+       '--root-url',
+       'https://example.com',
+       '--output-dir',
+       '/tmp/sb-crawl-test',
+       '--sitemap-path',
+       '/tmp/sb-does-not-exist-xyz.xml',
+     ],
+     { encoding: 'utf8' },
+   )
+   assert.equal(r.status, 2, `exit code was ${r.status}`)
+   assert.match(r.stderr, /not found/)
+ })
+
+ test('valid --sitemap-path produces a pages-list.json with classified pages', () => {
+   // Use the OS temp dir; mkdtempSync's template needs an existing parent directory.
+   const dir = mkdtempSync(join(tmpdir(), 'sb-crawl-'))
+   const sitemapPath = join(dir, 'sitemap.xml')
+   writeFileSync(
+     sitemapPath,
+     `<?xml version="1.0"?>
+ <urlset>
+ <url><loc>https://example.com/</loc></url>
+ <url><loc>https://example.com/products/foo</loc></url>
+ <url><loc>https://example.com/collections/men</loc></url>
+ <url><loc>https://example.com/admin/users</loc></url>
+ <url><loc>https://example.com/cart</loc></url>
+ <url><loc>https://example.com/pages/about</loc></url>
+ <url><loc>https://example.com/products/foo?utm_source=tw</loc></url>
+ </urlset>`,
+     'utf8',
+   )
+   const out = join(dir, 'out')
+   const r = spawnSync(
+     'node',
+     [
+       SCRIPT,
+       '--root-url',
+       'https://example.com',
+       '--output-dir',
+       out,
+       '--sitemap-path',
+       sitemapPath,
+       '--respect-robots-txt',
+       'false',
+     ],
+     { encoding: 'utf8' },
+   )
+   try {
+     assert.equal(r.status, 0, `exit ${r.status}\nstderr:\n${r.stderr}\nstdout:\n${r.stdout}`)
+     const json = JSON.parse(r.stdout)
+     assert.equal(json.rootUrl, 'https://example.com/')
+     assert.equal(json.source, 'sitemap-path')
+
+     const types = json.pages.map((p) => p.type)
+     assert.ok(types.includes('home'), `home missing: ${types}`)
+     assert.ok(types.includes('pdp'))
+     assert.ok(types.includes('collection'))
+     assert.ok(types.includes('about'))
+
+     // /admin and /cart must be filtered
+     const urls = json.pages.map((p) => p.url)
+     assert.ok(!urls.some((u) => u.includes('/admin')), 'admin leaked')
+     assert.ok(!urls.some((u) => u.endsWith('/cart')), 'cart leaked')
+
+     // utm-tagged URL must collapse onto its bare twin
+     const fooHits = urls.filter((u) => u === 'https://example.com/products/foo')
+     assert.equal(fooHits.length, 1, 'utm dedupe failed')
+     assert.ok(json.duplicates >= 1, 'duplicate count not reported')
+
+     // Counts add up: blocked count matches admin + cart (2)
+     assert.equal(json.blocked, 2, `blocked was ${json.blocked}`)
+
+     // pages-list.json file persisted
+     assert.ok(existsSync(join(out, 'pages-list.json')))
+     const fileJson = JSON.parse(readFileSync(join(out, 'pages-list.json'), 'utf8'))
+     assert.equal(fileJson.pageCount, json.pageCount)
+   } finally {
+     rmSync(dir, { recursive: true, force: true })
+   }
+ })
+
+ test('--max-pages cap surfaces a warning and truncates', () => {
+   const dir = mkdtempSync(join(tmpdir(), 'sb-crawl-'))
+   const sitemapPath = join(dir, 'sitemap.xml')
+   const urls = []
+   for (let i = 0; i < 25; i++) urls.push(`<url><loc>https://example.com/products/item-${i}</loc></url>`)
+   writeFileSync(sitemapPath, `<urlset>${urls.join('')}</urlset>`, 'utf8')
+   const out = join(dir, 'out')
+   const r = spawnSync(
+     'node',
+     [
+       SCRIPT,
+       '--root-url',
+       'https://example.com',
+       '--output-dir',
+       out,
+       '--sitemap-path',
+       sitemapPath,
+       '--max-pages',
+       '10',
+       '--respect-robots-txt',
+       'false',
+     ],
+     { encoding: 'utf8' },
+   )
+   try {
+     assert.equal(r.status, 0)
+     const json = JSON.parse(r.stdout)
+     assert.equal(json.pageCount, 10)
+     assert.ok(json.warnings.some((w) => w.startsWith('max-pages-cap:10-of-25')))
+   } finally {
+     rmSync(dir, { recursive: true, force: true })
+   }
+ })
+
+ test('--blocklist additive: custom prefix drops matching URLs', () => {
+   const dir = mkdtempSync(join(tmpdir(), 'sb-crawl-'))
+   const sitemapPath = join(dir, 'sitemap.xml')
+   writeFileSync(
+     sitemapPath,
+     `<urlset>
+ <url><loc>https://example.com/</loc></url>
+ <url><loc>https://example.com/preview/secret</loc></url>
+ <url><loc>https://example.com/products/foo</loc></url>
+ </urlset>`,
+     'utf8',
+   )
+   const out = join(dir, 'out')
+   const r = spawnSync(
+     'node',
+     [
+       SCRIPT,
+       '--root-url',
+       'https://example.com',
+       '--output-dir',
+       out,
+       '--sitemap-path',
+       sitemapPath,
+       '--blocklist',
+       '/preview',
+       '--respect-robots-txt',
+       'false',
+     ],
+     { encoding: 'utf8' },
+   )
+   try {
+     assert.equal(r.status, 0, r.stderr)
+     const json = JSON.parse(r.stdout)
+     const urls = json.pages.map((p) => p.url)
+     assert.ok(!urls.some((u) => u.includes('/preview')), 'preview not blocked')
+     assert.equal(json.blocked, 1)
+   } finally {
+     rmSync(dir, { recursive: true, force: true })
+   }
+ })
+
+ process.stdout.write(`\n${passed} passed, ${failed} failed\n`)
+ process.exit(failed === 0 ? 0 : 1)
package/templates/skills/sb-crawl-and-list/scripts/tests/test-fallback-crawler.mjs
@@ -0,0 +1,231 @@
+ #!/usr/bin/env node
+ // Tests for lib/fallback-crawler.mjs — pure unit tests with injected
+ // fetcher + cheerio-shaped loadHtml. No real network or DOM required.
+ //
+ // We don't import cheerio here — instead we use a tiny shim that returns
+ // $('a[href]').each(cb) results matching what cheerio would yield. The
+ // fallback crawler doesn't depend on cheerio's full API, just the
+ // `$(selector).each` + `.attr('href')` chain.
+
+ import { strict as assert } from 'node:assert'
+ import { fallbackCrawl } from '../lib/fallback-crawler.mjs'
+ import { buildBlocklist } from '../lib/blocklist-filter.mjs'
+
+ let passed = 0
+ let failed = 0
+
+ async function test(name, fn) {
+   try {
+     await fn()
+     process.stdout.write(`ok - ${name}\n`)
+     passed++
+   } catch (err) {
+     process.stdout.write(`not ok - ${name}\n ${err.message}\n`)
+     failed++
+   }
+ }
+
+ // Build a tiny "$" replacement: parses very simple HTML for href attributes
+ // in <a> tags and exposes them via the cheerio-ish API the crawler uses.
+ function makeFakeLoadHtml() {
+   return (html) => {
+     const hrefs = []
+     const re = /<a\b[^>]*\bhref\s*=\s*"([^"]*)"[^>]*>/gi
+     let m
+     while ((m = re.exec(html)) !== null) hrefs.push(m[1])
+     // One function covers both call shapes the crawler uses:
+     // $('a[href]').each((i, el) => ...) takes the selector path, and
+     // $(el).attr('href') takes the element-wrapper path.
+     function dollar(arg) {
+       if (typeof arg === 'string') {
+         // Selector path: iterate the hrefs scraped above.
+         return {
+           each(cb) {
+             hrefs.forEach((h, i) => {
+               cb(i, { __href: h })
+             })
+           },
+         }
+       }
+       // Element-wrapper path: expose the href via .attr().
+       return {
+         attr(name) {
+           if (name === 'href') return arg.__href
+           return undefined
+         },
+       }
+     }
+     return dollar
+   }
+ }
+
+ const blocklistPrefixes = buildBlocklist()
+
+ await test('crawls a single page with internal links and respects max-depth', async () => {
+   const pages = {
+     'https://example.com/': `
+ <html>
+ <a href="/about">About</a>
+ <a href="/products/foo">Foo</a>
+ <a href="https://other.com/external">Out</a>
+ </html>`,
+     'https://example.com/about': '<html>about</html>',
+     'https://example.com/products/foo': '<html>foo</html>',
+   }
+   const fetcher = async (url) => {
+     if (url in pages) return { ok: true, status: 200, text: pages[url] }
+     return { ok: false, status: 404 }
+   }
+   const loadHtml = makeFakeLoadHtml()
+   const r = await fallbackCrawl({
+     rootUrl: 'https://example.com/',
+     fetcher,
+     loadHtml,
+     maxDepth: 2,
+     maxPages: 50,
+     blocklistPrefixes,
+     sitemapUrl: null,
+   })
+   const urls = r.results.map((p) => p.url).sort()
+   assert.deepEqual(urls, [
+     'https://example.com/',
+     'https://example.com/about',
+     'https://example.com/products/foo',
+   ])
+   assert.ok(!r.warnings.includes('spa-suspected'))
+ })
+
+ await test('skips off-origin links', async () => {
+   const fetcher = async (url) => {
+     if (url === 'https://example.com/')
+       return {
+         ok: true,
+         status: 200,
+         text: '<html><a href="https://elsewhere.com/x">x</a></html>',
+       }
+     return { ok: false, status: 404 }
+   }
+   const loadHtml = makeFakeLoadHtml()
+   const r = await fallbackCrawl({
+     rootUrl: 'https://example.com/',
+     fetcher,
+     loadHtml,
+     maxDepth: 2,
+     maxPages: 50,
+     blocklistPrefixes,
+     sitemapUrl: null,
+   })
+   // Only the root survives; spa-suspected fires because the root yielded no internal links.
+   assert.deepEqual(r.results.map((p) => p.url), ['https://example.com/'])
+   assert.ok(r.warnings.includes('spa-suspected'))
+ })
+
+ await test('flags spa-suspected when entry has no internal hrefs', async () => {
+   const fetcher = async () => ({ ok: true, status: 200, text: '<html><script>app()</script></html>' })
+   const loadHtml = makeFakeLoadHtml()
+   const r = await fallbackCrawl({
+     rootUrl: 'https://example.com/',
+     fetcher,
+     loadHtml,
+     maxDepth: 3,
+     maxPages: 200,
+     blocklistPrefixes,
+     sitemapUrl: null,
+   })
+   assert.equal(r.results.length, 1)
+   assert.ok(r.warnings.includes('spa-suspected'))
+ })
+
+ await test('does NOT flag spa-suspected when crawl finds many pages', async () => {
+   const pages = {
+     'https://example.com/': `<a href="/a">A</a><a href="/b">B</a><a href="/c">C</a>`,
+     'https://example.com/a': '<html>a</html>',
+     'https://example.com/b': '<html>b</html>',
+     'https://example.com/c': '<html>c</html>',
+   }
+   const fetcher = async (url) => (pages[url] != null ? { ok: true, status: 200, text: pages[url] } : { ok: false, status: 404 })
+   const loadHtml = makeFakeLoadHtml()
+   const r = await fallbackCrawl({
+     rootUrl: 'https://example.com/',
+     fetcher,
+     loadHtml,
+     maxDepth: 2,
+     maxPages: 50,
+     blocklistPrefixes,
+     sitemapUrl: null,
+   })
+   assert.equal(r.results.length, 4)
+   assert.ok(!r.warnings.includes('spa-suspected'))
+ })
+
+ await test('respects maxPages cap', async () => {
+   const pages = {
+     'https://example.com/': `<a href="/a">A</a><a href="/b">B</a><a href="/c">C</a><a href="/d">D</a>`,
+     'https://example.com/a': '<html>a</html>',
+     'https://example.com/b': '<html>b</html>',
+     'https://example.com/c': '<html>c</html>',
+     'https://example.com/d': '<html>d</html>',
+   }
+   const fetcher = async (url) => (pages[url] != null ? { ok: true, status: 200, text: pages[url] } : { ok: false, status: 404 })
+   const loadHtml = makeFakeLoadHtml()
+   const r = await fallbackCrawl({
+     rootUrl: 'https://example.com/',
+     fetcher,
+     loadHtml,
+     maxDepth: 2,
+     maxPages: 2,
+     blocklistPrefixes,
+     sitemapUrl: null,
+   })
+   assert.equal(r.results.length, 2)
+ })
+
+ await test('does not enqueue blocklisted paths', async () => {
+   const pages = {
+     'https://example.com/': `<a href="/account/profile">Profile</a><a href="/products/foo">Foo</a>`,
+     'https://example.com/products/foo': '<html>foo</html>',
+   }
+   const fetcher = async (url) => (pages[url] != null ? { ok: true, status: 200, text: pages[url] } : { ok: false, status: 404 })
+   const loadHtml = makeFakeLoadHtml()
+   const r = await fallbackCrawl({
+     rootUrl: 'https://example.com/',
+     fetcher,
+     loadHtml,
+     maxDepth: 2,
+     maxPages: 50,
+     blocklistPrefixes,
+     sitemapUrl: null,
+   })
+   const urls = r.results.map((p) => p.url).sort()
+   // /account/profile must NOT appear
+   assert.deepEqual(urls, ['https://example.com/', 'https://example.com/products/foo'])
+ })
+
+ await test('warns on per-page fetch failures but keeps other pages', async () => {
+   const pages = {
+     'https://example.com/': `<a href="/ok">OK</a><a href="/broken">Broken</a>`,
+     'https://example.com/ok': '<html>ok</html>',
+     // /broken will 500
+   }
+   const fetcher = async (url) => {
+     if (url === 'https://example.com/broken') return { ok: false, status: 500 }
+     if (pages[url] != null) return { ok: true, status: 200, text: pages[url] }
+     return { ok: false, status: 404 }
+   }
+   const loadHtml = makeFakeLoadHtml()
+   const r = await fallbackCrawl({
+     rootUrl: 'https://example.com/',
+     fetcher,
+     loadHtml,
+     maxDepth: 2,
+     maxPages: 50,
+     blocklistPrefixes,
+     sitemapUrl: null,
+   })
+   const urls = r.results.map((p) => p.url).sort()
+   assert.deepEqual(urls, ['https://example.com/', 'https://example.com/ok'])
+   assert.ok(r.warnings.some((w) => w.startsWith('crawl-fetch-failed:https://example.com/broken')))
+ })
+
+ process.stdout.write(`\n${passed} passed, ${failed} failed\n`)
+ process.exit(failed === 0 ? 0 : 1)
package/templates/skills/sb-crawl-and-list/scripts/tests/test-page-classifier.mjs
@@ -0,0 +1,120 @@
+ #!/usr/bin/env node
+ // Tests for lib/page-classifier.mjs — pure unit tests.
+
+ import { strict as assert } from 'node:assert'
+ import { classifyUrl } from '../lib/page-classifier.mjs'
+
+ let passed = 0
+ let failed = 0
+
+ function test(name, fn) {
+   try {
+     fn()
+     process.stdout.write(`ok - ${name}\n`)
+     passed++
+   } catch (err) {
+     process.stdout.write(`not ok - ${name}\n ${err.message}\n`)
+     failed++
+   }
+ }
+
+ // Home
+ test('classifies bare root as home', () => {
+   assert.equal(classifyUrl('https://example.com/'), 'home')
+   assert.equal(classifyUrl('https://example.com'), 'home')
+ })
+
+ // PDP
+ test('classifies Shopify /products/ as pdp', () => {
+   assert.equal(classifyUrl('https://example.com/products/foo-bar'), 'pdp')
+   assert.equal(classifyUrl('https://example.com/products/foo-bar/'), 'pdp')
+ })
+
+ test('classifies Woo-style /product/ as pdp', () => {
+   assert.equal(classifyUrl('https://example.com/product/widget'), 'pdp')
+ })
+
+ // Collection
+ test('classifies /collections/ as collection', () => {
+   assert.equal(classifyUrl('https://example.com/collections/men'), 'collection')
+ })
+
+ test('classifies /product-category/ as collection', () => {
+   assert.equal(classifyUrl('https://example.com/product-category/shoes'), 'collection')
+ })
+
+ test('classifies /category/ as collection', () => {
+   assert.equal(classifyUrl('https://example.com/category/dresses'), 'collection')
+ })
+
+ // Contact
+ test('classifies /pages/contact-us as contact', () => {
+   assert.equal(classifyUrl('https://example.com/pages/contact-us'), 'contact')
+ })
+
+ test('classifies /contact as contact', () => {
+   assert.equal(classifyUrl('https://example.com/contact'), 'contact')
+ })
+
+ // About
+ test('classifies /pages/about-us as about', () => {
+   assert.equal(classifyUrl('https://example.com/pages/about-us'), 'about')
+ })
+
+ test('classifies /about as about', () => {
+   assert.equal(classifyUrl('https://example.com/about'), 'about')
+ })
+
+ // Policy
+ test('classifies /policies/privacy-policy as policy', () => {
+   assert.equal(classifyUrl('https://example.com/policies/privacy-policy'), 'policy')
+ })
+
+ test('classifies /pages/terms-of-service as policy', () => {
+   assert.equal(classifyUrl('https://example.com/pages/terms-of-service'), 'policy')
+ })
+
+ test('classifies /pages/refund-policy as policy', () => {
+   assert.equal(classifyUrl('https://example.com/pages/refund-policy'), 'policy')
+ })
+
+ // Blog
+ test('classifies /blogs/news/post-slug as blog', () => {
+   assert.equal(classifyUrl('https://example.com/blogs/news/post-slug'), 'blog')
+ })
+
+ test('classifies WordPress /blog/post-slug as blog', () => {
+   assert.equal(classifyUrl('https://example.com/blog/hello-world'), 'blog')
+ })
+
+ test('classifies /posts/foo as blog', () => {
+   assert.equal(classifyUrl('https://example.com/posts/foo'), 'blog')
+ })
+
+ // Other
+ test('classifies generic /random-path as other', () => {
+   assert.equal(classifyUrl('https://example.com/random-path'), 'other')
+ })
+
+ test('classifies /pages/landing-2024 as other (no specific match)', () => {
+   assert.equal(classifyUrl('https://example.com/pages/landing-2024'), 'other')
+ })
+
+ // Edge cases
+ test('handles invalid URL string by treating it as a path', () => {
+   assert.equal(classifyUrl('/products/foo'), 'pdp')
+   assert.equal(classifyUrl('/'), 'home')
+ })
+
+ test('is case-insensitive for paths', () => {
+   assert.equal(classifyUrl('https://example.com/PRODUCTS/foo'), 'pdp')
+   assert.equal(classifyUrl('https://example.com/Pages/Contact'), 'contact')
+ })
+
+ test('does not misclassify /products-old as pdp (no slash boundary)', () => {
+   // /products-old is "other" because the rule requires /products/ with a slash
+   assert.equal(classifyUrl('https://example.com/products-old'), 'other')
+ })
+
+ process.stdout.write(`\n${passed} passed, ${failed} failed\n`)
+ process.exit(failed === 0 ? 0 : 1)
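
All three suites are plain Node scripts with no test-framework dependency, so each can be run directly with node. A hypothetical convenience runner, not part of the package, with paths assumed from the file list above:

    #!/usr/bin/env node
    // run-crawl-tests.mjs: hypothetical runner, not shipped in similarbuild.
    import { spawnSync } from 'node:child_process'

    const base = 'package/templates/skills/sb-crawl-and-list/scripts/tests'
    const suites = [
      'test-crawl-and-list.mjs',
      'test-fallback-crawler.mjs',
      'test-page-classifier.mjs',
    ]

    let exitCode = 0
    for (const suite of suites) {
      // Inherit stdio so each suite's TAP-style output streams through.
      const r = spawnSync('node', [`${base}/${suite}`], { stdio: 'inherit' })
      if (r.status !== 0) exitCode = 1 // remember any failure, keep going
    }
    process.exit(exitCode)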