similarbuild 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (76)
  1. package/CHANGELOG.md +110 -0
  2. package/LICENSE +21 -0
  3. package/README.md +301 -0
  4. package/bin/install.js +256 -0
  5. package/lib/copy-templates.mjs +52 -0
  6. package/lib/install-deps.mjs +62 -0
  7. package/lib/prompt-config.mjs +83 -0
  8. package/lib/verify-env.mjs +19 -0
  9. package/package.json +63 -0
  10. package/scripts/sync-templates.mjs +71 -0
  11. package/templates/commands/build-page.md +490 -0
  12. package/templates/commands/build-site.md +548 -0
  13. package/templates/commands/clip-section.md +519 -0
  14. package/templates/memory/anti-patterns.md +212 -0
  15. package/templates/memory/design-knowledge.md +225 -0
  16. package/templates/memory/fixes.md +163 -0
  17. package/templates/memory/patterns.md +681 -0
  18. package/templates/presets/shopify-section.yaml +51 -0
  19. package/templates/presets/wp-elementor.yaml +49 -0
  20. package/templates/reports/fixtures/mock-run-1.json +115 -0
  21. package/templates/reports/fixtures/mock-run-2.json +72 -0
  22. package/templates/reports/report-renderer.mjs +218 -0
  23. package/templates/reports/report-template.html +571 -0
  24. package/templates/skills/sb-build-shopify/SKILL.md +104 -0
  25. package/templates/skills/sb-build-shopify/references/shopify-build-rules.md +563 -0
  26. package/templates/skills/sb-build-shopify/scripts/build-shopify.mjs +637 -0
  27. package/templates/skills/sb-build-shopify/scripts/tests/test-build-shopify.mjs +424 -0
  28. package/templates/skills/sb-build-wp/SKILL.md +83 -0
  29. package/templates/skills/sb-build-wp/references/wp-build-rules.md +376 -0
  30. package/templates/skills/sb-build-wp/scripts/build-wp.mjs +327 -0
  31. package/templates/skills/sb-build-wp/scripts/tests/test-build-wp.mjs +224 -0
  32. package/templates/skills/sb-compare-visual/SKILL.md +121 -0
  33. package/templates/skills/sb-compare-visual/scripts/compare-visual.mjs +387 -0
  34. package/templates/skills/sb-compare-visual/scripts/lib/compare-tokens.mjs +273 -0
  35. package/templates/skills/sb-compare-visual/scripts/tests/test-compare-tokens.mjs +350 -0
  36. package/templates/skills/sb-compare-visual/scripts/tests/test-compare-visual.mjs +626 -0
  37. package/templates/skills/sb-crawl-and-list/SKILL.md +99 -0
  38. package/templates/skills/sb-crawl-and-list/scripts/crawl-and-list.mjs +437 -0
  39. package/templates/skills/sb-crawl-and-list/scripts/lib/blocklist-filter.mjs +176 -0
  40. package/templates/skills/sb-crawl-and-list/scripts/lib/fallback-crawler.mjs +107 -0
  41. package/templates/skills/sb-crawl-and-list/scripts/lib/page-classifier.mjs +89 -0
  42. package/templates/skills/sb-crawl-and-list/scripts/lib/sitemap-parser.mjs +118 -0
  43. package/templates/skills/sb-crawl-and-list/scripts/tests/test-blocklist-filter.mjs +204 -0
  44. package/templates/skills/sb-crawl-and-list/scripts/tests/test-crawl-and-list.mjs +276 -0
  45. package/templates/skills/sb-crawl-and-list/scripts/tests/test-fallback-crawler.mjs +243 -0
  46. package/templates/skills/sb-crawl-and-list/scripts/tests/test-page-classifier.mjs +120 -0
  47. package/templates/skills/sb-crawl-and-list/scripts/tests/test-sitemap-parser.mjs +157 -0
  48. package/templates/skills/sb-extract-assets/SKILL.md +112 -0
  49. package/templates/skills/sb-extract-assets/scripts/extract-assets.mjs +484 -0
  50. package/templates/skills/sb-extract-assets/scripts/tests/test-extract-assets.mjs +112 -0
  51. package/templates/skills/sb-inspect-live/SKILL.md +105 -0
  52. package/templates/skills/sb-inspect-live/scripts/inspect-live.mjs +693 -0
  53. package/templates/skills/sb-inspect-live/scripts/tests/test-inspect-live.mjs +181 -0
  54. package/templates/skills/sb-review-checks/SKILL.md +113 -0
  55. package/templates/skills/sb-review-checks/references/review-rules.md +195 -0
  56. package/templates/skills/sb-review-checks/scripts/lib/anti-patterns.mjs +379 -0
  57. package/templates/skills/sb-review-checks/scripts/lib/cross-reference.mjs +115 -0
  58. package/templates/skills/sb-review-checks/scripts/lib/design-quality.mjs +541 -0
  59. package/templates/skills/sb-review-checks/scripts/review-checks.mjs +250 -0
  60. package/templates/skills/sb-review-checks/scripts/tests/test-anti-patterns.mjs +343 -0
  61. package/templates/skills/sb-review-checks/scripts/tests/test-cross-reference.mjs +170 -0
  62. package/templates/skills/sb-review-checks/scripts/tests/test-design-quality.mjs +493 -0
  63. package/templates/skills/sb-review-checks/scripts/tests/test-review-checks.mjs +267 -0
  64. package/templates/skills/sb-tweak/SKILL.md +130 -0
  65. package/templates/skills/sb-tweak/references/tweak-patterns.md +157 -0
  66. package/templates/skills/sb-tweak/scripts/lib/diff-summarizer.mjs +140 -0
  67. package/templates/skills/sb-tweak/scripts/lib/element-locator.mjs +507 -0
  68. package/templates/skills/sb-tweak/scripts/lib/intent-parser.mjs +324 -0
  69. package/templates/skills/sb-tweak/scripts/tests/test-diff-summarizer.mjs +248 -0
  70. package/templates/skills/sb-tweak/scripts/tests/test-element-locator.mjs +418 -0
  71. package/templates/skills/sb-tweak/scripts/tests/test-intent-parser.mjs +496 -0
  72. package/templates/skills/sb-tweak/scripts/tests/test-tweak.mjs +407 -0
  73. package/templates/skills/sb-tweak/scripts/tweak.mjs +656 -0
  74. package/templates/skills/sb-validate-render/SKILL.md +120 -0
  75. package/templates/skills/sb-validate-render/scripts/tests/test-validate-render.mjs +304 -0
  76. package/templates/skills/sb-validate-render/scripts/validate-render.mjs +645 -0
@@ -0,0 +1,99 @@
1
+ ---
2
+ name: sb-crawl-and-list
3
+ description: Discovers a site's pages by trying `sitemap.xml` first, then falling back to a depth-bounded HTML crawl with cheerio, applies a default + custom blocklist, classifies each URL by path heuristic, and emits a dense JSON page list ready for human confirmation. Use when the SimilarBuild orchestrator (`/build-site`) requests page discovery before batching, or when the user asks to 'crawl and list' a site's pages.
4
+ ---
5
+
6
+ # sb-crawl-and-list
7
+
8
+ ## Overview
9
+
10
+ Takes a `--root-url` and produces a normalized page list — `{url, type, title?, depth, source}[]` — that downstream batch orchestration (`/build-site`) renders as a confirmation table before launching N parallel `/build-page` runs. `sitemap.xml` covers most modern sites; cheerio handles static HTML when sitemaps are absent or hostile; SPAs that hide all routes behind JS get a clear warning rather than a fabricated list.
11
+
12
+ Deliberately **chromium-free**. Crawling is URL discovery, not visual inspection — `sb-inspect-live` and `sb-validate-render` already pay the chromium tax. A second headless browser here would balloon install time, memory, and crawl latency for a job that `fetch` + `cheerio` does in seconds. The trade-off is explicit: pure-SPA sites require a manual sitemap. The skill tells the user so rather than trying to fake it.
13
+
14
+ Acts as a discovery gate: the orchestrator commits to building everything in this list, so the list must be **complete enough to confirm** (no missing key pages) and **clean enough to ship** (no admin paths, tracking URLs, or duplicate-by-querystring noise). Determinism is non-negotiable — same root, same sitemap, same list across reruns.
15
+
16
+ ## Inputs
17
+
18
+ | Argument | Required | Default | Notes |
19
+ | ------------------------ | -------- | ------- | ------------------------------------------------------------------------------------------------------ |
20
+ | `root-url` | yes | — | Origin to crawl. Used as base for sitemap probe and as scope for the fallback crawler (same-origin). |
21
+ | `output-dir` | yes | — | Directory for `pages-list.json` (and `sitemap-raw.xml` if a sitemap was downloaded). |
22
+ | `max-depth` | no | `3` | Only used by the fallback crawl — sitemap is one level by definition. |
23
+ | `max-pages` | no | `200` | Hard cap on the emitted list. Excess URLs are dropped and a warning is added. |
24
+ | `respect-robots-txt` | no | `true` | If `false`, `robots.txt` `Disallow:` rules for the same origin are ignored. |
25
+ | `blocklist` | no | (none) | Comma-separated extra path prefixes (e.g. `/internal,/preview`). Adds to the default blocklist. |
26
+ | `sitemap-path` | no | (none) | If set, reads sitemap XML from this local path instead of probing `{root-url}/sitemap.xml`. |
27
+ | `timeout` | no | `10000` | Per-request fetch timeout (ms). Applies to sitemap probe, robots fetch, and each crawl page. |
28
+
29
+ ## Output
30
+
31
+ A single JSON object printed to stdout AND saved to `{output-dir}/pages-list.json`:
32
+
33
+ ```json
34
+ {
35
+ "rootUrl": "https://example.com",
36
+ "source": "sitemap",
37
+ "pageCount": 27,
38
+ "blocked": 12,
39
+ "duplicates": 3,
40
+ "warnings": [],
41
+ "pages": [
42
+ { "url": "https://example.com/", "type": "home", "title": null, "depth": 0, "source": "sitemap" },
43
+ { "url": "https://example.com/products/foo", "type": "pdp", "title": null, "depth": 1, "source": "sitemap" }
44
+ ]
45
+ }
46
+ ```
47
+
48
+ `source` is `"sitemap"` when any URL was harvested from XML, `"crawl"` when the fallback was used, and `"sitemap-path"` when `--sitemap-path` was provided. `pageCount` reflects what's in `pages[]` after blocklist + dedupe + cap. `blocked` and `duplicates` are diagnostics so the orchestrator can show the user how aggressive the filtering was. `warnings[]` is where SPA detection, cap-truncation, robots-blocked-root, and oversized-sitemap messages surface.
49
+
50
+ `title` is reserved — always `null` here. The orchestrator can hydrate it later from individual `sb-inspect-live` runs without re-crawling.
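
Before rendering the confirmation table, the orchestrator can sanity-check this shape. A minimal sketch, assuming the field names shown in the example above (`assertPagesList` is an illustrative helper, not something the skill ships):

```js
// assertPagesList — illustrative guard for the pages-list.json shape documented above.
// Throws with a readable message instead of letting a malformed list reach the table.
export function assertPagesList(result) {
  if (typeof result.rootUrl !== 'string') throw new Error('missing rootUrl')
  if (!['sitemap', 'crawl', 'sitemap-path'].includes(result.source)) {
    throw new Error(`unknown source: ${result.source}`)
  }
  if (!Array.isArray(result.pages)) throw new Error('pages must be an array')
  if (result.pageCount !== result.pages.length) throw new Error('pageCount does not match pages[]')
  for (const page of result.pages) {
    if (typeof page.url !== 'string' || typeof page.type !== 'string') {
      throw new Error(`malformed page entry: ${JSON.stringify(page)}`)
    }
    if (!Number.isInteger(page.depth) || page.depth < 0) throw new Error(`bad depth for ${page.url}`)
  }
  return result
}
```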
51
+
52
+ ## On Activation
53
+
54
+ 1. **Resolve inputs.** Collect `root-url` and `output-dir`. Apply defaults above. Normalize `root-url` (strip trailing slash on the path component).
55
+
56
+ 2. **Ensure `output-dir` exists.** `mkdir -p` it.
57
+
58
+ 3. **Run the script.** From the project root:
59
+
60
+ ```bash
61
+ node {skill-root}/scripts/crawl-and-list.mjs \
62
+ --root-url "{root_url}" \
63
+ --output-dir "{output_dir}" \
64
+ [--max-depth N] [--max-pages N] [--respect-robots-txt false] \
65
+ [--blocklist "/preview,/internal"] \
66
+ [--sitemap-path "{path}"] \
67
+ [--timeout 10000]
68
+ ```
69
+
70
+ The script: optionally honors `robots.txt`, tries `--sitemap-path` → `{root}/sitemap.xml` → `{root}/sitemap_index.xml` (recursing one level), falls back to a same-origin BFS via `fetch` + `cheerio` honoring `--max-depth`, then runs blocklist → dedupe → classifier → cap. See `scripts/crawl-and-list.mjs --help` for the full flag list.
71
+
72
+ 4. **Validate the result.** Parse stdout as JSON (see the sketch after this list). If the exit code is non-zero, surface stderr to the orchestrator and stop — don't fabricate a partial list.
73
+
74
+ 5. **Forward `warnings[]` unchanged.** Cap-exceeded, SPA-suspected, robots-blocked-root, and parse-failure warnings are user-facing — the orchestrator displays them alongside the page table.
75
+
76
+ 6. **Return the JSON unchanged** to the caller. `/build-site` consumes `pages[]`, `pageCount`, and `warnings[]` directly to render the confirmation table.
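
A minimal sketch of steps 3–4 from the orchestrator's side, assuming the exit-code and stdout/stderr conventions described here (the `runCrawlAndList` wrapper and its paths are illustrative, not part of the skill):

```js
import { execFile } from 'node:child_process'
import { promisify } from 'node:util'

const execFileAsync = promisify(execFile)

// Run the skill script, then parse stdout; on a non-zero exit surface stderr instead
// of fabricating a partial list (step 4 above).
async function runCrawlAndList(skillRoot, rootUrl, outputDir) {
  try {
    const { stdout } = await execFileAsync(
      'node',
      [`${skillRoot}/scripts/crawl-and-list.mjs`, '--root-url', rootUrl, '--output-dir', outputDir],
      { maxBuffer: 10 * 1024 * 1024 },
    )
    return JSON.parse(stdout) // stdout carries only the result JSON; progress went to stderr
  } catch (err) {
    throw new Error(`crawl-and-list failed: ${err.stderr || err.message}`)
  }
}
```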
77
+
78
+ ## Failure modes
79
+
80
+ | Symptom | Likely cause | What to surface |
81
+ | ------------------------------------------------ | ----------------------------------------------------- | ------------------------------------------------------------------------ |
82
+ | Script exits non-zero (1) | Network failure to root, write failure, or malformed args surfacing as a runtime error | Pass stderr verbatim. If the message mentions `cheerio`, suggest `npm i cheerio`. |
83
+ | `pageCount: 0` + `warnings: ["spa-suspected"]` | Pure SPA — root HTML has 0–1 internal `<a href>` | Tell the user: site appears to be a SPA; rerun with `--sitemap-path` pointing at a manually-supplied sitemap. |
84
+ | `warnings: ["sitemap-truncated:N"]` | Source sitemap had > 1000 entries | Ask the user if they want to filter (e.g. only `/products/*`) or proceed with the cap. |
85
+ | `warnings: ["robots-disallow-root"]` | `robots.txt` disallows `/` for `*` | Forward unchanged. Either rerun with `--respect-robots-txt false` (with permission) or stop. |
86
+ | `warnings: ["max-pages-cap:N-of-M"]` | Emitted list hit the `--max-pages` cap | Surface both numbers. Suggest raising the cap or filtering before continuing. |
87
+ | HTTP 403 / 401 / 429 to root | Origin blocks Node's UA or rate-limits requests | Error is clear in stderr. (Custom UA support is a future flag — escalate to the user.) |
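
A sketch of how the orchestrator might turn the warnings above into next steps, assuming the warning strings keep the formats listed in the table (the `triageWarnings` helper is illustrative):

```js
// Map each known warning to a user-facing suggestion; anything unrecognized is forwarded verbatim.
function triageWarnings(result) {
  return result.warnings.map((w) => {
    if (w === 'spa-suspected') return 'Site looks like a SPA — supply a sitemap via --sitemap-path.'
    if (w === 'robots-disallow-root') return 'robots.txt blocks / — get permission before using --respect-robots-txt false.'
    if (w.startsWith('sitemap-truncated:')) return `Large sitemap (${w.split(':')[1]} entries) — offer a path filter or proceed with the cap.`
    if (w.startsWith('max-pages-cap:')) return `Page cap hit (${w.split(':')[1]}) — offer to raise --max-pages or filter first.`
    return `Warning (forwarded verbatim): ${w}`
  })
}
```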
88
+
89
+ ## Conventions
90
+
91
+ - Bare paths (e.g. `scripts/crawl-and-list.mjs`) resolve from the skill root.
92
+ - `{skill-root}` resolves to this skill's installed directory.
93
+ - `{project-root}` resolves to the project working directory.
94
+ - The script never invents URLs. If discovery yields nothing, the list is empty and a warning explains why.
95
+ - Stdout is **only** the result JSON. All progress goes to stderr with the `[sb-crawl-and-list]` prefix so the orchestrator can pipe stdout straight into `JSON.parse`.
96
+
97
+ ## Dependencies
98
+
99
+ The host project must have `cheerio` installed (already a dep of `sb-review-checks`). Node ≥ 20 (uses native `fetch`, `URL`, `AbortController`). XML parsing is done with the bundled `lib/sitemap-parser.mjs` — no `xml2js` required. The SimilarBuild installer handles `cheerio`.
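
A preflight sketch matching these requirements — purely illustrative, since the SimilarBuild installer normally handles `cheerio`:

```js
// Check Node >= 20 (native fetch, URL, AbortController) and that cheerio resolves
// from the host project before invoking the skill.
const nodeMajor = Number(process.versions.node.split('.')[0])
if (nodeMajor < 20) {
  console.error(`Node >= 20 required; found ${process.versions.node}`)
  process.exit(1)
}
try {
  await import('cheerio')
} catch {
  console.error('cheerio is not installed in the host project — run: npm i cheerio')
  process.exit(1)
}
```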
@@ -0,0 +1,437 @@
1
+ #!/usr/bin/env node
2
+ // crawl-and-list.mjs — Discover a site's pages and emit a normalized JSON
3
+ // list ready for human confirmation in /build-site.
4
+ //
5
+ // Strategy: try sitemap first (cheap, authoritative), fall back to a
6
+ // same-origin BFS via fetch + cheerio. No chromium. SPA-only sites get a
7
+ // warning, not a fabricated list.
8
+ //
9
+ // Output: JSON to stdout AND pages-list.json in --output-dir. Logs to stderr
10
+ // with the [sb-crawl-and-list] prefix. Exit codes: 0=ok, 1=script error,
11
+ // 2=invalid args.
12
+ //
13
+ // cheerio is imported lazily inside main() so --help and arg validation
14
+ // work without the dep installed.
15
+
16
+ import { parseArgs } from 'node:util'
17
+ import { mkdir, writeFile, readFile, access } from 'node:fs/promises'
18
+ import { join, resolve } from 'node:path'
19
+ import { parseSitemap, fetchAndParseSitemap } from './lib/sitemap-parser.mjs'
20
+ import { buildBlocklist, filterUrl } from './lib/blocklist-filter.mjs'
21
+ import { classifyUrl } from './lib/page-classifier.mjs'
22
+ import { fallbackCrawl } from './lib/fallback-crawler.mjs'
23
+
24
+ const HELP = `
25
+ crawl-and-list.mjs — Discover a site's pages for SimilarBuild batch builds.
26
+
27
+ Required:
28
+ --root-url <url> Origin to crawl (e.g. https://example.com).
29
+ --output-dir <dir> Directory for pages-list.json (and sitemap-raw.xml
30
+ if a sitemap was downloaded).
31
+
32
+ Optional:
33
+ --max-depth <n> Default 3. Used only by the fallback crawler.
34
+ --max-pages <n> Default 200. Hard cap on the emitted list.
35
+ --respect-robots-txt Default true. Pass --respect-robots-txt false to ignore.
36
+ --blocklist <csv> Extra path prefixes, comma-separated. Adds to defaults.
37
+ --sitemap-path <file> Read sitemap from this local path instead of probing.
38
+ --timeout <ms> Default 10000. Per-request fetch timeout.
39
+ --help Show this message.
40
+
41
+ Exit codes: 0=ok, 1=script error, 2=invalid args.
42
+ `
43
+
44
+ const STDERR_PREFIX = '[sb-crawl-and-list]'
45
+
46
+ function fail(msg, code = 2) {
47
+ process.stderr.write(`${STDERR_PREFIX} ${msg}\n`)
48
+ process.exit(code)
49
+ }
50
+
51
+ function log(msg) {
52
+ process.stderr.write(`${STDERR_PREFIX} ${msg}\n`)
53
+ }
54
+
55
+ const { values } = parseArgs({
56
+ options: {
57
+ 'root-url': { type: 'string' },
58
+ 'output-dir': { type: 'string' },
59
+ 'max-depth': { type: 'string', default: '3' },
60
+ 'max-pages': { type: 'string', default: '200' },
61
+ 'respect-robots-txt': { type: 'string', default: 'true' },
62
+ blocklist: { type: 'string' },
63
+ 'sitemap-path': { type: 'string' },
64
+ timeout: { type: 'string', default: '10000' },
65
+ help: { type: 'boolean', default: false },
66
+ },
67
+ strict: false,
68
+ })
69
+
70
+ if (values.help) {
71
+ process.stdout.write(HELP)
72
+ process.exit(0)
73
+ }
74
+
75
+ if (!values['root-url']) fail('missing --root-url')
76
+ if (!values['output-dir']) fail('missing --output-dir')
77
+
78
+ let ROOT_URL
79
+ try {
80
+ const u = new URL(values['root-url'])
81
+ if (!/^https?:$/.test(u.protocol)) fail(`--root-url must be http(s): got "${values['root-url']}"`)
82
+ // Normalize: strip query/hash, keep trailing slash off the pathname
83
+ u.search = ''
84
+ u.hash = ''
85
+ if (u.pathname.length > 1 && u.pathname.endsWith('/')) u.pathname = u.pathname.slice(0, -1)
86
+ ROOT_URL = u.toString()
87
+ } catch (err) {
88
+ if (err?.message?.startsWith('--root-url')) throw err
89
+ fail(`--root-url is not a valid URL: ${err.message}`)
90
+ }
91
+
92
+ const OUTPUT_DIR = resolve(values['output-dir'])
93
+ const MAX_DEPTH = parseInt(values['max-depth'], 10)
94
+ const MAX_PAGES = parseInt(values['max-pages'], 10)
95
+ const TIMEOUT = parseInt(values.timeout, 10)
96
+ const RESPECT_ROBOTS = values['respect-robots-txt'] !== 'false'
97
+ const BLOCKLIST = buildBlocklist(values.blocklist)
98
+ const SITEMAP_PATH = values['sitemap-path'] ? resolve(values['sitemap-path']) : null
99
+
100
+ if (!Number.isFinite(MAX_DEPTH) || MAX_DEPTH < 0) fail('--max-depth must be a non-negative integer')
101
+ if (!Number.isFinite(MAX_PAGES) || MAX_PAGES < 1) fail('--max-pages must be a positive integer')
102
+ if (!Number.isFinite(TIMEOUT) || TIMEOUT < 1) fail('--timeout must be a positive integer (ms)')
103
+
104
+ // ---------- HTTP helpers ----------
105
+
106
+ const USER_AGENT =
107
+ 'Mozilla/5.0 (compatible; SimilarBuild/1.0; +https://similarbuild.local) Chrome/120 Safari/537.36'
108
+
109
+ async function fetchText(url) {
110
+ const ac = new AbortController()
111
+ const t = setTimeout(() => ac.abort(), TIMEOUT)
112
+ try {
113
+ const res = await fetch(url, {
114
+ signal: ac.signal,
115
+ redirect: 'follow',
116
+ headers: {
117
+ 'user-agent': USER_AGENT,
118
+ accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
119
+ },
120
+ })
121
+ if (!res.ok) {
122
+ return { ok: false, status: res.status, reason: res.statusText || `http-${res.status}` }
123
+ }
124
+ const text = await res.text()
125
+ return { ok: true, status: res.status, text }
126
+ } catch (err) {
127
+ return {
128
+ ok: false,
129
+ status: 0,
130
+ reason: err?.name === 'AbortError' ? 'timeout' : err?.message || String(err),
131
+ }
132
+ } finally {
133
+ clearTimeout(t)
134
+ }
135
+ }
136
+
137
+ // ---------- robots.txt ----------
138
+
139
+ // Minimal robots.txt parser — enough to decide "is `/` disallowed for `*`?".
140
+ // We don't try to honor crawl-delay or user-agent-specific allowlists; if the
141
+ // user wants finer control they pass --respect-robots-txt false and own the
142
+ // decision. The check is "does any User-agent: * block disallow exactly /?".
143
+ function isRootDisallowed(robotsTxt) {
144
+ const lines = robotsTxt.split(/\r?\n/)
145
+ let inStarBlock = false
146
+ for (const raw of lines) {
147
+ const line = raw.replace(/#.*$/, '').trim()
148
+ if (!line) {
149
+ // Blank line ends a block.
150
+ inStarBlock = false
151
+ continue
152
+ }
153
+ const m = /^([A-Za-z-]+)\s*:\s*(.*)$/.exec(line)
154
+ if (!m) continue
155
+ const key = m[1].toLowerCase()
156
+ const val = m[2].trim()
157
+ if (key === 'user-agent') {
158
+ inStarBlock = val === '*'
159
+ continue
160
+ }
161
+ if (inStarBlock && key === 'disallow') {
162
+ // Exactly "/" or empty val (= disallow nothing). We only care about
163
+ // total bans on the root.
164
+ if (val === '/') return true
165
+ }
166
+ }
167
+ return false
168
+ }
169
+
170
+ async function checkRobots() {
171
+ if (!RESPECT_ROBOTS) return { allowed: true, warnings: [] }
172
+ const robotsUrl = new URL('/robots.txt', ROOT_URL).toString()
173
+ const r = await fetchText(robotsUrl)
174
+ if (!r.ok) {
175
+ // Missing robots.txt (404) is permission-by-default, not an error.
176
+ return { allowed: true, warnings: [] }
177
+ }
178
+ if (isRootDisallowed(r.text)) {
179
+ return {
180
+ allowed: false,
181
+ warnings: ['robots-disallow-root'],
182
+ }
183
+ }
184
+ return { allowed: true, warnings: [] }
185
+ }
186
+
187
+ // ---------- main ----------
188
+
189
+ async function readLocalSitemap(path) {
190
+ try {
191
+ await access(path)
192
+ } catch {
193
+ fail(`--sitemap-path not found: ${path}`)
194
+ }
195
+ try {
196
+ return await readFile(path, 'utf8')
197
+ } catch (err) {
198
+ fail(`cannot read --sitemap-path: ${err.message}`)
199
+ }
200
+ }
201
+
202
+ async function trySitemapFetch() {
203
+ const candidates = [
204
+ new URL('/sitemap.xml', ROOT_URL).toString(),
205
+ new URL('/sitemap_index.xml', ROOT_URL).toString(),
206
+ ]
207
+ for (const url of candidates) {
208
+ log(`probing ${url}`)
209
+ const r = await fetchText(url)
210
+ if (!r.ok) {
211
+ log(` → ${r.status} ${r.reason}`)
212
+ continue
213
+ }
214
+ return { ok: true, url, text: r.text }
215
+ }
216
+ return { ok: false }
217
+ }
218
+
219
+ async function main() {
220
+ await mkdir(OUTPUT_DIR, { recursive: true })
221
+
222
+ const warnings = []
223
+ let blockedCount = 0
224
+ let duplicateCount = 0
225
+
226
+ // 1. robots.txt gate
227
+ const robots = await checkRobots()
228
+ warnings.push(...robots.warnings)
229
+ if (!robots.allowed) {
230
+ log('robots.txt disallows root for *; emitting empty list')
231
+ return emit({
232
+ rootUrl: ROOT_URL,
233
+ source: 'sitemap',
234
+ pageCount: 0,
235
+ blocked: 0,
236
+ duplicates: 0,
237
+ warnings,
238
+ pages: [],
239
+ })
240
+ }
241
+
242
+ // 2. Sitemap path: explicit --sitemap-path wins, then probe origin
243
+ let rawSitemapXml = null
244
+ let sitemapUrl = null
245
+ let sitemapWarnings = []
246
+ let urls = []
247
+ let source = 'crawl' // updated below if sitemap yields anything
248
+
249
+ if (SITEMAP_PATH) {
250
+ rawSitemapXml = await readLocalSitemap(SITEMAP_PATH)
251
+ log(`reading sitemap from ${SITEMAP_PATH} (${rawSitemapXml.length} bytes)`)
252
+ const parsed = parseSitemap(rawSitemapXml)
253
+ if (parsed.malformed) {
254
+ sitemapWarnings.push('sitemap-malformed-local')
255
+ } else if (parsed.kind === 'index') {
256
+ sitemapWarnings.push('sitemap-path-is-index-not-fetched')
257
+ // Don't recurse into a local sitemapindex — we'd have to fetch its
258
+ // children which defeats the purpose of "use this local file."
259
+ } else {
260
+ urls = parsed.urlSetUrls
261
+ source = 'sitemap-path'
262
+ }
263
+ } else {
264
+ const probed = await trySitemapFetch()
265
+ if (probed.ok) {
266
+ sitemapUrl = probed.url
267
+ rawSitemapXml = probed.text
268
+ log(`got sitemap from ${probed.url} (${probed.text.length} bytes)`)
269
+ // Re-parse via fetchAndParseSitemap so sitemapindex recursion works.
270
+ const result = await fetchAndParseSitemap(probed.url, fetchText, { maxChildren: 50 })
271
+ sitemapWarnings.push(...result.warnings)
272
+ if (result.urls.length > 0) {
273
+ urls = result.urls
274
+ source = 'sitemap'
275
+ }
276
+ }
277
+ }
278
+ warnings.push(...sitemapWarnings)
279
+
280
+ if (urls.length === 0 && !SITEMAP_PATH) {
281
+ log('no sitemap usable, falling back to fetch + cheerio crawl')
282
+ let cheerio
283
+ try {
284
+ cheerio = await import('cheerio')
285
+ } catch (err) {
286
+ process.stderr.write(
287
+ `${STDERR_PREFIX} missing dependency 'cheerio': ${err?.message || err}\n` +
288
+ `Install with: npm i cheerio\n`,
289
+ )
290
+ process.exit(1)
291
+ }
292
+ const loadHtml = (html) => cheerio.load(html, { decodeEntities: false })
293
+
294
+ const crawl = await fallbackCrawl({
295
+ rootUrl: ROOT_URL,
296
+ fetcher: fetchText,
297
+ loadHtml,
298
+ maxDepth: MAX_DEPTH,
299
+ maxPages: MAX_PAGES,
300
+ blocklistPrefixes: BLOCKLIST,
301
+ sitemapUrl,
302
+ })
303
+ warnings.push(...crawl.warnings)
304
+ // Crawl results are { url, depth }; buildResult re-applies the blocklist,
305
+ // dedupes, classifies each URL, and records crawl depth.
306
+ return emit(
307
+ buildResult({
308
+ crawlPages: crawl.results,
309
+ urlsFromSitemap: null,
310
+ sitemapUrl,
311
+ rawSitemapXml,
312
+ source: 'crawl',
313
+ warnings,
314
+ }),
315
+ )
316
+ }
317
+
318
+ // Sitemap yielded URLs — apply blocklist + dedupe + classify + cap.
319
+ if (urls.length > 1000) {
320
+ warnings.push(`sitemap-truncated:${urls.length}`)
321
+ }
322
+
323
+ return emit(
324
+ buildResult({
325
+ crawlPages: null,
326
+ urlsFromSitemap: urls,
327
+ sitemapUrl,
328
+ rawSitemapXml,
329
+ source,
330
+ warnings,
331
+ }),
332
+ )
333
+
334
+ // ---- helpers closed over top-level state ----
335
+
336
+ function buildResult({ crawlPages, urlsFromSitemap, sitemapUrl, rawSitemapXml, source, warnings }) {
337
+ const originHost = new URL(ROOT_URL).hostname.toLowerCase()
338
+ const seen = new Map() // normalized URL → { depth, source }
339
+ let localBlocked = 0
340
+ let localDup = 0
341
+
342
+ function consider(url, depth, perEntrySource) {
343
+ const filtered = filterUrl(url, {
344
+ originHost,
345
+ blocklistPrefixes: BLOCKLIST,
346
+ sitemapUrl,
347
+ })
348
+ if (!filtered.keep) {
349
+ if (filtered.reason === 'blocklisted' || filtered.reason === 'xml-or-feed') {
350
+ localBlocked += 1
351
+ }
352
+ return
353
+ }
354
+ if (seen.has(filtered.url)) {
355
+ localDup += 1
356
+ // Keep the lowest depth seen (closer to root = better signal)
357
+ const existing = seen.get(filtered.url)
358
+ if (depth < existing.depth) existing.depth = depth
359
+ return
360
+ }
361
+ seen.set(filtered.url, { depth, source: perEntrySource })
362
+ }
363
+
364
+ if (urlsFromSitemap) {
365
+ for (const u of urlsFromSitemap) {
366
+ // Sitemap doesn't give us depth; treat root as depth 0 and others as 1.
367
+ const isRoot = (() => {
368
+ try {
369
+ const p = new URL(u).pathname.replace(/\/+$/, '')
370
+ return p === '' || p === '/'
371
+ } catch {
372
+ return false
373
+ }
374
+ })()
375
+ consider(u, isRoot ? 0 : 1, 'sitemap')
376
+ }
377
+ } else if (crawlPages) {
378
+ for (const { url, depth } of crawlPages) {
379
+ consider(url, depth, 'crawl')
380
+ }
381
+ }
382
+
383
+ // Cap
384
+ let pages = [...seen.entries()].map(([url, meta]) => ({
385
+ url,
386
+ type: classifyUrl(url),
387
+ title: null,
388
+ depth: meta.depth,
389
+ source: meta.source,
390
+ }))
391
+
392
+ // Stable order: home first (depth 0), then by depth, then alphabetical.
393
+ pages.sort((a, b) => {
394
+ if (a.depth !== b.depth) return a.depth - b.depth
395
+ return a.url.localeCompare(b.url)
396
+ })
397
+
398
+ if (pages.length > MAX_PAGES) {
399
+ const total = pages.length
400
+ pages = pages.slice(0, MAX_PAGES)
401
+ warnings.push(`max-pages-cap:${MAX_PAGES}-of-${total}`)
402
+ }
403
+
404
+ return {
405
+ rootUrl: ROOT_URL,
406
+ source,
407
+ pageCount: pages.length,
408
+ blocked: localBlocked,
409
+ duplicates: localDup,
410
+ warnings,
411
+ pages,
412
+ _rawSitemapXml: rawSitemapXml, // not part of output; consumed by emit()
413
+ }
414
+ }
415
+
416
+ async function emit(result) {
417
+ const rawSitemapXml = result._rawSitemapXml
418
+ delete result._rawSitemapXml
419
+
420
+ const listPath = join(OUTPUT_DIR, 'pages-list.json')
421
+ await writeFile(listPath, JSON.stringify(result, null, 2), 'utf8')
422
+ log(`wrote ${listPath}`)
423
+ if (rawSitemapXml) {
424
+ const sitemapPath = join(OUTPUT_DIR, 'sitemap-raw.xml')
425
+ await writeFile(sitemapPath, rawSitemapXml, 'utf8')
426
+ log(`wrote ${sitemapPath}`)
427
+ }
428
+ process.stdout.write(JSON.stringify(result))
429
+ process.stdout.write('\n')
430
+ process.exit(0)
431
+ }
432
+ }
433
+
434
+ main().catch((err) => {
435
+ process.stderr.write(`${STDERR_PREFIX} fatal: ${err?.stack || err}\n`)
436
+ process.exit(1)
437
+ })