@readme/cli 0.0.26

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (144) hide show
  1. package/README.md +55 -0
  2. package/bin/readme.js +8 -0
  3. package/package.json +58 -0
  4. package/src/bootstrap.js +97 -0
  5. package/src/cli.js +189 -0
  6. package/src/commands/dev.js +119 -0
  7. package/src/commands/eyes.js +37 -0
  8. package/src/commands/import.js +2565 -0
  9. package/src/commands/lint.js +70 -0
  10. package/src/commands/oas-sync.js +364 -0
  11. package/src/commands/oas-validate.js +208 -0
  12. package/src/commands/play.js +17 -0
  13. package/src/commands/pretty.js +133 -0
  14. package/src/commands/setup.js +256 -0
  15. package/src/commands/versions.js +81 -0
  16. package/src/dev/.next/app-build-manifest.json +20 -0
  17. package/src/dev/.next/build-manifest.json +31 -0
  18. package/src/dev/.next/cache/.rscinfo +1 -0
  19. package/src/dev/.next/cache/next-devtools-config.json +1 -0
  20. package/src/dev/.next/cache/webpack/client-development/0.pack.gz +0 -0
  21. package/src/dev/.next/cache/webpack/client-development/1.pack.gz +0 -0
  22. package/src/dev/.next/cache/webpack/client-development/10.pack.gz +0 -0
  23. package/src/dev/.next/cache/webpack/client-development/11.pack.gz +0 -0
  24. package/src/dev/.next/cache/webpack/client-development/2.pack.gz +0 -0
  25. package/src/dev/.next/cache/webpack/client-development/3.pack.gz +0 -0
  26. package/src/dev/.next/cache/webpack/client-development/3.pack.gz_ +0 -0
  27. package/src/dev/.next/cache/webpack/client-development/4.pack.gz +0 -0
  28. package/src/dev/.next/cache/webpack/client-development/5.pack.gz +0 -0
  29. package/src/dev/.next/cache/webpack/client-development/5.pack.gz_ +0 -0
  30. package/src/dev/.next/cache/webpack/client-development/6.pack.gz +0 -0
  31. package/src/dev/.next/cache/webpack/client-development/7.pack.gz +0 -0
  32. package/src/dev/.next/cache/webpack/client-development/7.pack.gz_ +0 -0
  33. package/src/dev/.next/cache/webpack/client-development/8.pack.gz +0 -0
  34. package/src/dev/.next/cache/webpack/client-development/9.pack.gz +0 -0
  35. package/src/dev/.next/cache/webpack/client-development/index.pack.gz.old +0 -0
  36. package/src/dev/.next/cache/webpack/client-development-fallback/0.pack.gz +0 -0
  37. package/src/dev/.next/cache/webpack/client-development-fallback/1.pack.gz +0 -0
  38. package/src/dev/.next/cache/webpack/client-development-fallback/index.pack.gz +0 -0
  39. package/src/dev/.next/cache/webpack/client-development-fallback/index.pack.gz.old +0 -0
  40. package/src/dev/.next/cache/webpack/edge-server-development/0.pack.gz +0 -0
  41. package/src/dev/.next/cache/webpack/edge-server-development/1.pack.gz +0 -0
  42. package/src/dev/.next/cache/webpack/edge-server-development/index.pack.gz +0 -0
  43. package/src/dev/.next/cache/webpack/edge-server-development/index.pack.gz.old +0 -0
  44. package/src/dev/.next/cache/webpack/server-development/0.pack.gz +0 -0
  45. package/src/dev/.next/cache/webpack/server-development/1.pack.gz +0 -0
  46. package/src/dev/.next/cache/webpack/server-development/10.pack.gz +0 -0
  47. package/src/dev/.next/cache/webpack/server-development/11.pack.gz +0 -0
  48. package/src/dev/.next/cache/webpack/server-development/12.pack.gz +0 -0
  49. package/src/dev/.next/cache/webpack/server-development/13.pack.gz +0 -0
  50. package/src/dev/.next/cache/webpack/server-development/14.pack.gz +0 -0
  51. package/src/dev/.next/cache/webpack/server-development/15.pack.gz +0 -0
  52. package/src/dev/.next/cache/webpack/server-development/2.pack.gz +0 -0
  53. package/src/dev/.next/cache/webpack/server-development/2.pack.gz_ +0 -0
  54. package/src/dev/.next/cache/webpack/server-development/3.pack.gz +0 -0
  55. package/src/dev/.next/cache/webpack/server-development/3.pack.gz_ +0 -0
  56. package/src/dev/.next/cache/webpack/server-development/4.pack.gz +0 -0
  57. package/src/dev/.next/cache/webpack/server-development/5.pack.gz +0 -0
  58. package/src/dev/.next/cache/webpack/server-development/6.pack.gz +0 -0
  59. package/src/dev/.next/cache/webpack/server-development/6.pack.gz_ +0 -0
  60. package/src/dev/.next/cache/webpack/server-development/7.pack.gz +0 -0
  61. package/src/dev/.next/cache/webpack/server-development/7.pack.gz_ +0 -0
  62. package/src/dev/.next/cache/webpack/server-development/8.pack.gz +0 -0
  63. package/src/dev/.next/cache/webpack/server-development/9.pack.gz +0 -0
  64. package/src/dev/.next/cache/webpack/server-development/9.pack.gz_ +0 -0
  65. package/src/dev/.next/cache/webpack/server-development/index.pack.gz +0 -0
  66. package/src/dev/.next/cache/webpack/server-development/index.pack.gz.old +0 -0
  67. package/src/dev/.next/package.json +1 -0
  68. package/src/dev/.next/prerender-manifest.json +11 -0
  69. package/src/dev/.next/react-loadable-manifest.json +1 -0
  70. package/src/dev/.next/routes-manifest.json +1 -0
  71. package/src/dev/.next/server/app/[...slug]/page.js +360 -0
  72. package/src/dev/.next/server/app/[...slug]/page_client-reference-manifest.js +1 -0
  73. package/src/dev/.next/server/app/page.js +349 -0
  74. package/src/dev/.next/server/app/page_client-reference-manifest.js +1 -0
  75. package/src/dev/.next/server/app-paths-manifest.json +3 -0
  76. package/src/dev/.next/server/edge-runtime-webpack.js +1151 -0
  77. package/src/dev/.next/server/interception-route-rewrite-manifest.js +1 -0
  78. package/src/dev/.next/server/middleware-build-manifest.js +33 -0
  79. package/src/dev/.next/server/middleware-manifest.json +32 -0
  80. package/src/dev/.next/server/middleware-react-loadable-manifest.js +1 -0
  81. package/src/dev/.next/server/middleware.js +1113 -0
  82. package/src/dev/.next/server/next-font-manifest.js +1 -0
  83. package/src/dev/.next/server/next-font-manifest.json +1 -0
  84. package/src/dev/.next/server/pages-manifest.json +5 -0
  85. package/src/dev/.next/server/server-reference-manifest.js +1 -0
  86. package/src/dev/.next/server/server-reference-manifest.json +5 -0
  87. package/src/dev/.next/server/static/webpack/633457081244afec._.hot-update.json +1 -0
  88. package/src/dev/.next/server/vendor-chunks/@readme.js +25 -0
  89. package/src/dev/.next/server/vendor-chunks/@swc.js +55 -0
  90. package/src/dev/.next/server/vendor-chunks/next.js +3659 -0
  91. package/src/dev/.next/server/webpack-runtime.js +209 -0
  92. package/src/dev/.next/static/chunks/app/[...slug]/loading.js +28 -0
  93. package/src/dev/.next/static/chunks/app/[...slug]/page.js +28 -0
  94. package/src/dev/.next/static/chunks/app/layout.js +171 -0
  95. package/src/dev/.next/static/chunks/app/page.js +28 -0
  96. package/src/dev/.next/static/chunks/app-pages-internals.js +182 -0
  97. package/src/dev/.next/static/chunks/main-app.js +1882 -0
  98. package/src/dev/.next/static/chunks/polyfills.js +1 -0
  99. package/src/dev/.next/static/chunks/webpack.js +1393 -0
  100. package/src/dev/.next/static/css/app/layout.css +559 -0
  101. package/src/dev/.next/static/development/_buildManifest.js +1 -0
  102. package/src/dev/.next/static/development/_ssgManifest.js +1 -0
  103. package/src/dev/.next/static/webpack/633457081244afec._.hot-update.json +1 -0
  104. package/src/dev/.next/static/webpack/ec52a3fce0f78db0.webpack.hot-update.json +1 -0
  105. package/src/dev/.next/static/webpack/webpack.ec52a3fce0f78db0.hot-update.js +12 -0
  106. package/src/dev/.next/trace +21 -0
  107. package/src/dev/.next/types/app/[...slug]/page.ts +84 -0
  108. package/src/dev/.next/types/app/layout.ts +84 -0
  109. package/src/dev/.next/types/app/page.ts +84 -0
  110. package/src/dev/.next/types/cache-life.d.ts +141 -0
  111. package/src/dev/.next/types/package.json +1 -0
  112. package/src/dev/.next/types/routes.d.ts +55 -0
  113. package/src/dev/app/Sidebar.js +149 -0
  114. package/src/dev/app/[...slug]/loading.js +16 -0
  115. package/src/dev/app/[...slug]/page.js +43 -0
  116. package/src/dev/app/globals.css +167 -0
  117. package/src/dev/app/layout.js +73 -0
  118. package/src/dev/app/page.js +19 -0
  119. package/src/dev/lib/docs.js +337 -0
  120. package/src/dev/middleware.js +7 -0
  121. package/src/dev/next.config.mjs +22 -0
  122. package/src/index.js +12 -0
  123. package/src/prompts/index.js +352 -0
  124. package/src/utils/claude.js +15 -0
  125. package/src/utils/eyes.js +365 -0
  126. package/src/utils/git.js +143 -0
  127. package/src/utils/lint.js +99 -0
  128. package/src/utils/reporter.js +319 -0
  129. package/src/utils/setup-templates.js +323 -0
  130. package/src/utils/styles.js +50 -0
  131. package/src/utils/tamagotchi.js +1139 -0
  132. package/src/utils/tips.js +90 -0
  133. package/src/validators/components.js +230 -0
  134. package/src/validators/content.js +53 -0
  135. package/src/validators/duplicates.js +45 -0
  136. package/src/validators/frontmatter.js +247 -0
  137. package/src/validators/links.js +68 -0
  138. package/src/validators/nesting.js +50 -0
  139. package/src/validators/numbering.js +136 -0
  140. package/src/validators/oas-reference.js +126 -0
  141. package/src/validators/oas-schema.js +106 -0
  142. package/src/validators/ordering.js +121 -0
  143. package/src/validators/recipes.js +143 -0
  144. package/vendor/TOOLS.md +19 -0
@@ -0,0 +1,2565 @@
1
+ import fs from 'node:fs'
2
+ import os from 'node:os'
3
+ import path from 'node:path'
4
+ import { spawn } from 'node:child_process'
5
+ import { createRequire } from 'node:module'
6
+ import { Option } from 'commander'
7
+ import { query } from '@anthropic-ai/claude-agent-sdk'
8
+ import matter from 'gray-matter'
9
+ import * as styles from '../utils/styles.js'
10
+ import { syncOas, extractOperations } from './oas-sync.js'
11
+ import OASNormalize from 'oas-normalize'
12
+ import { slotOrphansPrompt, iconizeNavPrompt, organizeFromSectionsPrompt, organizeFromScratchPrompt, stripCodeFences } from '../prompts/index.js'
13
+
14
// CLI registration metadata for this command module.
export const command = 'import'
// Relative position in the command listing (presumably lower sorts first — confirm in src/cli.js).
export const order = 7
export const description = 'Import content from a URL and package it as a ReadMe zip'
// Hidden from the help listing (NOTE(review): assumed — confirm how the CLI registrar uses this flag).
export const hidden = true
// Opts out of the shared bootstrap step — this command needs no project context (see src/bootstrap.js).
export const skipBootstrap = true
19
+
20
/**
 * Register this command's CLI flags on the commander command instance.
 *
 * `--source` is the only required flag; it accepts either an http(s) URL
 * (docs-site scrape flow) or a local OpenAPI spec path (OAS flow) — the
 * dispatch happens in `importDocs`.
 *
 * @param {object} cmd - Commander command to attach options to (assumed to be a
 *   `commander.Command`; it must support `requiredOption`, `option`, `addOption`).
 */
export function args(cmd) {
  cmd.requiredOption('--source <url-or-file>', 'URL to import from, or path to a local OpenAPI spec (.json/.yaml/.yml)')
  cmd.option('-o, --output <path>', 'Output zip path (defaults to <basename>-readme.zip in cwd)')
  // Third argument is the commander default value: 'sonnet' when --model is omitted.
  cmd.option('--model <name>', 'Claude model alias: haiku, sonnet, opus', 'sonnet')
  cmd.option('--firecrawl-key <key>', 'Firecrawl API key (or set FIRECRAWL_API_KEY env var) — enables JS-rendered sidebar scraping')
  cmd.option('--skip-api-reference', 'Drop pages routed to the API Reference / reference dir. Use when uploading the OAS spec separately.')
  // Internal dev-only flag: skip the zip, keep staging, and boot the dev server
  // against it for quick visual previews. Hidden from --help.
  cmd.addOption(new Option('--test').hideHelp())
  // Dump intermediate pipeline artifacts (llms parse, scraped nav, orphan
  // handling, final organized tree) so we can diff stages when the produced
  // sidebar disagrees with the source.
  cmd.addOption(new Option('--debug').hideHelp())
}
34
+
35
/**
 * Run the importer programmatically. Mirrors the CLI command but throws on
 * fatal errors instead of calling `process.exit`, and returns a result object
 * on success.
 *
 * Pipeline (URL flow): probe llms.txt → probe Mintlify config / scrape the
 * sidebar nav → reconcile scrape vs. llms.txt structure → organize (with
 * Claude only when no usable nav was scraped) → stage frontmatter stubs →
 * zip (or boot the dev server under `--test`).
 *
 * @param {object} options
 * @param {string} options.source URL to import from, or path to a local OAS spec.
 * @param {string} [options.output] Output zip path. Defaults to `<basename>-readme.zip` in cwd.
 * @param {string} [options.model] Claude model alias: 'haiku' | 'sonnet' | 'opus'. Defaults to 'sonnet'.
 * @param {string} [options.firecrawlKey] Firecrawl API key (falls back to FIRECRAWL_API_KEY env var).
 * @param {boolean} [options.skipApiReference] Drop pages routed to the API Reference dir.
 * @param {boolean} [options.test] Skip the zip, keep staging, and boot the dev server.
 * @param {boolean} [options.debug] Dump intermediate pipeline artifacts to a tmp dir.
 * @returns {Promise<{ source: 'url' | 'oas', outputZip?: string, stagingDir?: string, fileCount: number, duration: number, phases: Array<{ label: string, ms: number }> }>}
 * @throws {Error} On an invalid --source URL, or when neither llms.txt nor the
 *   sidebar scrape yields usable structure.
 */
export async function importDocs(options) {
  const startedAt = Date.now()
  // Per-phase wall-clock timings, surfaced in the final "⏱" summary line and
  // returned to programmatic callers.
  const phases = []
  const timePhase = async (label, fn) => {
    const t = Date.now()
    const result = await fn()
    phases.push({ label, ms: Date.now() - t })
    return result
  }

  // Accumulates named JSON snapshots per pipeline stage; written out in one
  // batch after the organize step. Null when --debug is off.
  const debugSnapshots = options.debug ? {} : null

  // Dispatch: http(s) URL → docs-site scrape flow; anything else → local OAS.
  if (!/^https?:\/\//i.test(options.source)) {
    return runOasImport(options.source, options, startedAt, phases, timePhase)
  }

  let sourceUrl
  try {
    sourceUrl = new URL(options.source)
  } catch {
    throw new Error(`Invalid --source URL: ${options.source}`)
  }

  const outputZip = path.resolve(options.output || path.join(process.cwd(), `${sourceUrl.hostname}-readme.zip`))

  console.log()
  styles.info(`Importing from ${styles.bold(sourceUrl.toString())}`)
  if (!options.test) styles.info(`Output: ${styles.bold(outputZip)}`)
  console.log()

  // Build the list of llms.txt URLs to probe, walking up the supplied path
  // from most-specific to root. For `https://mintlify.com/docs/quickstart`
  // we try `/docs/quickstart/llms.txt`, then `/docs/llms.txt`, then root.
  // This catches sites that scope llms.txt to a docs subpath.
  const llmsCandidates = buildLlmsCandidates(sourceUrl)
  styles.info(`Checking for llms.txt (${llmsCandidates.length} candidate${llmsCandidates.length === 1 ? '' : 's'})...`)

  // First candidate that fetches OK wins; failures are logged dimly and probing continues.
  const { llms, llmsUrl } = await timePhase('fetch llms.txt', async () => {
    for (const candidate of llmsCandidates) {
      const res = await fetchLlmsTxt(candidate)
      if (res.ok) return { llms: res, llmsUrl: candidate }
      styles.info(styles.dim(` ${candidate} → ${res.status ? `HTTP ${res.status}` : res.error || 'failed'}`))
    }
    return { llms: null, llmsUrl: null }
  })
  console.log()

  if (!llms) {
    styles.warning(`No llms.txt found at any probed path — falling back to sidebar discovery via scrape.`)
  } else {
    styles.info(styles.dim(`Using ${llmsUrl}.`))
  }

  if (debugSnapshots) {
    debugSnapshots['01-llms-parsed.json'] = { llmsUrl, parsed: llms ? llms.parsed : null }
  }

  // Flattened, deduped list of pages from llms.txt — used both as the "known
  // universe" for orphan slotting and as a structure fallback.
  let knownUrls = []
  if (llms) {
    const totalItems = llms.parsed.sections.reduce((n, s) => n + s.items.length, 0)
    styles.ok(
      `Found llms.txt — ${styles.bold(String(totalItems))} page${totalItems === 1 ? '' : 's'} across ${styles.bold(String(llms.parsed.sections.length))} section${llms.parsed.sections.length === 1 ? '' : 's'}${llms.parsed.title ? ` (${llms.parsed.title})` : ''}.`,
    )

    const rawKnownUrls = llms.parsed.sections.flatMap((s) => s.items.map((i) => ({ title: i.text, url: i.url, description: i.description })))

    // Dedupe llms.txt entries by pathname. Some sites (zod.dev, fumadocs) list
    // every in-page anchor as its own llms.txt row (`/v4?id=wrapping-up`,
    // `/v4?id=metadata`, …) even though they all live on one rendered page.
    // We prefer the "cleanest" URL per path — the shortest one, which is
    // usually the one without a query string or hash.
    const byKnownPath = new Map()
    for (const p of rawKnownUrls) {
      const key = normalizePath(p.url)
      const prev = byKnownPath.get(key)
      if (!prev || p.url.length < prev.url.length) byKnownPath.set(key, p)
    }
    knownUrls = Array.from(byKnownPath.values())
    const dropped = rawKnownUrls.length - knownUrls.length
    if (dropped > 0) {
      styles.info(`${styles.dim(`Collapsed ${dropped} anchor/query duplicates → ${knownUrls.length} unique pages.`)}`)
    }
  }

  console.log()
  const firecrawlKey = options.firecrawlKey || process.env.FIRECRAWL_API_KEY || null

  // Mintlify fast path — the canonical sidebar lives in docs.json/mint.json
  // at origin root. When present it gives us perfect structure with zero
  // HTML parsing, so try it before falling back to generic nav scraping.
  styles.info(`Probing for Mintlify config (docs.json, mint.json)...`)
  const mintlifyStart = Date.now()
  const mintlifyNav = await timePhase('mintlify probe', () => tryMintlifyNav(sourceUrl.toString(), knownUrls, firecrawlKey))
  if (debugSnapshots) {
    // JSON round-trip = cheap deep clone, so later mutations don't alter the snapshot.
    debugSnapshots['02a-mintlify-nav.json'] = mintlifyNav ? JSON.parse(JSON.stringify(mintlifyNav)) : null
  }
  if (mintlifyNav) {
    const pageCount = mintlifyNav.categories.reduce((n, c) => n + c.pages.length, 0)
    styles.ok(
      `Found Mintlify config at ${styles.bold(mintlifyNav.source)} in ${styles.bold(formatDuration(Date.now() - mintlifyStart))} — ${styles.bold(String(mintlifyNav.categories.length))} categor${mintlifyNav.categories.length === 1 ? 'y' : 'ies'}, ${styles.bold(String(pageCount))} pages.`,
    )
  }
  console.log()

  let scraped
  let scrapeStart = Date.now()
  if (mintlifyNav) {
    scraped = { title: mintlifyNav.title, categories: mintlifyNav.categories }
  } else {
    styles.info(`Scraping sidebar nav from ${styles.bold(sourceUrl.toString())}${firecrawlKey ? ' ' + styles.dim('(via Firecrawl)') : ''}...`)
    scrapeStart = Date.now()
    scraped = await timePhase('scrape nav', () => scrapeNavFromSite(sourceUrl.toString(), knownUrls, firecrawlKey))
  }
  if (debugSnapshots) {
    debugSnapshots['02-scraped-raw.json'] = scraped ? JSON.parse(JSON.stringify(scraped)) : null
  }
  // Prefer llms.txt when it has strong multi-section structure and the
  // scrape was a thin snapshot (common on big multi-tab docs — Stripe, AWS,
  // Twilio — where each page only renders its own tab's sidebar). Without
  // this override the 4-category scrape wins over a 25-section llms.txt and
  // hundreds of real pages end up smeared into orphan buckets.
  if (scraped && llms && knownUrls.length > 0) {
    const scrapedPages = scraped.categories.reduce((n, c) => n + c.pages.length, 0)
    const coverage = scrapedPages / knownUrls.length
    const llmsUsable = usableSections(llms.parsed.sections)
    // Thresholds: ≥5 usable llms.txt sections AND scrape covering <50% of
    // known pages — NOTE(review): both constants look tuned empirically.
    if (llmsUsable.length >= 5 && coverage < 0.5) {
      styles.info(
        `Scrape covered ${styles.bold(Math.round(coverage * 100) + '%')} of llms.txt pages; preferring llms.txt's ${styles.bold(String(llmsUsable.length))} sections for structure.`,
      )
      scraped = null
    }
  }

  if (scraped) {
    const directMatches = scraped.categories.reduce((n, c) => n + c.pages.length, 0)
    if (knownUrls.length > 0) {
      // slotOrphansByPath mutates `scraped` in place and returns the pages it
      // could NOT slot (the remaining orphans).
      const slotted = slotOrphansByPath(scraped, knownUrls)
      if (debugSnapshots) {
        debugSnapshots['03-after-slot-by-path.json'] = {
          scraped: JSON.parse(JSON.stringify(scraped)),
          unslottedOrphans: slotted,
        }
      }
      const totalMatched = scraped.categories.reduce((n, c) => n + c.pages.length, 0)
      styles.ok(
        `Scraped nav in ${styles.bold(formatDuration(Date.now() - scrapeStart))} — ${styles.bold(String(scraped.categories.length))} categor${scraped.categories.length === 1 ? 'y' : 'ies'}, ${styles.bold(String(directMatches))} direct matches + ${styles.bold(String(totalMatched - directMatches))} slotted by path = ${styles.bold(String(totalMatched))}/${knownUrls.length}.`,
      )
      // Sweep pass first: any page whose URL has a strong reference segment
      // (e.g. `/api-reference/...`) belongs in the API Reference category,
      // even if the site's sidebar spotlighted it under Developers or similar.
      // Sidebars often surface a few endpoints as "featured" outside the
      // reference section — we respect the URL over the nav here.
      const moved = reclassifyReferencePages(scraped)
      if (moved > 0) {
        styles.info(`Moved ${styles.bold(String(moved))} page${moved === 1 ? '' : 's'} into ${styles.bold('API Reference')} based on URL path.`)
      }

      if (slotted.length > 0) {
        // Mintlify-style docs put API endpoints in a separate tab rooted at
        // /api-reference/* (or /api/*, /reference/*). Collapse remaining such
        // pages into a single "API Reference" category (absorbing the flat
        // category the sweep pass just built, if any) and nest it by resource
        // segment so routeCategory() maps the whole thing to ReadMe's
        // `reference/` top-level dir.
        const apiResult = collectApiReferencePages(slotted, scraped)
        const otherOrphans = apiResult.nonApiOrphans
        if (apiResult.category) scraped.categories.push(apiResult.category)

        const buckets = bucketOrphansByPathType(otherOrphans, scraped)
        for (const b of buckets) scraped.categories.push(b)
        if (debugSnapshots) {
          debugSnapshots['04-after-orphan-buckets.json'] = {
            apiReferenceCollected: apiResult.category
              ? {
                  pageCount: apiResult.category.pages.length,
                  mergedFromScraped: apiResult.mergedScrapedTitles,
                }
              : null,
            buckets,
            scraped: JSON.parse(JSON.stringify(scraped)),
          }
        }
        // Human-readable breakdown of where the orphans landed.
        const parts = []
        if (apiResult.category) {
          parts.push(`${styles.bold(String(apiResult.category.pages.length))} in ${styles.bold('API Reference')}`)
        }
        for (const b of buckets) {
          parts.push(`${styles.bold(String(b.pages.length))} in ${styles.bold(b.title)}`)
        }
        if (parts.length > 0) {
          styles.info(`${styles.bold(String(slotted.length))} orphan page${slotted.length === 1 ? '' : 's'} bucketed by URL type: ${parts.join(', ')}.`)
        }
      }
    } else {
      // Discovery mode — scraped pages ARE our known pages. No orphans.
      // If everything landed in a single flat category (the sidebar had no
      // <h*>/<p> headers to split on), try to re-cluster by URL path
      // structure: pages that share a common prefix often live under the
      // same section in the site's real hierarchy.
      if (scraped.categories.length === 1) {
        const reclustered = clusterByUrlPath(scraped.categories[0].pages)
        if (reclustered) {
          scraped.categories = reclustered
          styles.ok(
            `Scraped nav in ${styles.bold(formatDuration(Date.now() - scrapeStart))} — re-clustered by URL path into ${styles.bold(String(scraped.categories.length))} categor${scraped.categories.length === 1 ? 'y' : 'ies'}, ${styles.bold(String(directMatches))} pages discovered (no llms.txt).`,
          )
        } else {
          styles.ok(
            `Scraped nav in ${styles.bold(formatDuration(Date.now() - scrapeStart))} — ${styles.bold(String(scraped.categories.length))} categor${scraped.categories.length === 1 ? 'y' : 'ies'}, ${styles.bold(String(directMatches))} pages discovered (no llms.txt).`,
          )
        }
      } else {
        styles.ok(
          `Scraped nav in ${styles.bold(formatDuration(Date.now() - scrapeStart))} — ${styles.bold(String(scraped.categories.length))} categor${scraped.categories.length === 1 ? 'y' : 'ies'}, ${styles.bold(String(directMatches))} pages discovered (no llms.txt).`,
        )
      }
    }
  } else if (!llms) {
    // Neither structure source worked — nothing to import from.
    throw new Error(`No llms.txt and the sidebar scrape found no usable structure — can't import ${sourceUrl.toString()}.`)
  } else {
    styles.warning(`Couldn't extract a useful nav — falling back to llms.txt-based organization.`)
  }
  console.log()

  let organized
  const organizeStart = Date.now()
  if (scraped) {
    // No Claude call — icons deferred. Use a neutral placeholder so the tree
    // view still prints cleanly.
    organized = {
      title: (llms && llms.parsed.title) || null,
      categories: scraped.categories.map((c) => ({ title: c.title, icon: null, pages: c.pages })),
    }
  } else {
    // `llms` is guaranteed non-null here: the scraped-null + llms-null case
    // threw above.
    const fastPath = sectionsLookUsable(llms.parsed.sections)
    styles.info(`Organizing with Claude (${styles.bold(options.model)}, ${fastPath ? 'fast path: icons only' : 'full reorg'})...`)
    organized = await timePhase('claude organize', () => organizeWithClaude(llms.parsed, options.model))
  }
  styles.ok(`Organized in ${styles.bold(formatDuration(Date.now() - organizeStart))}.`)
  if (debugSnapshots) {
    debugSnapshots['05-organized.json'] = organized
    const debugDir = path.join(os.tmpdir(), `readme-import-debug-${sourceUrl.hostname}-${Date.now()}`)
    fs.mkdirSync(debugDir, { recursive: true })
    for (const [name, data] of Object.entries(debugSnapshots)) {
      fs.writeFileSync(path.join(debugDir, name), JSON.stringify(data, null, 2))
    }
    styles.info(`${styles.dim(`Debug snapshots → ${debugDir}`)}`)
  }
  console.log()

  // Print the organized tree so the user can eyeball structure before staging.
  console.log(` ${styles.bold(organized.title || '(untitled)')}`)
  for (const cat of organized.categories || []) {
    console.log()
    const iconLabel = cat.icon ? `${styles.brand(`[${cat.icon}]`)} ` : ''
    console.log(` ${iconLabel}${styles.bold(cat.title)}`)
    printPagesTree(cat.pages || [], 2)
  }

  const stagingDir = fs.mkdtempSync(path.join(os.tmpdir(), 'readme-import-'))

  let result
  try {
    styles.info(`Staging frontmatter stubs in ${styles.bold(stagingDir)}...`)
    const stageStart = Date.now()
    const staged = await timePhase('stage stubs', async () => stageOrganized(organized, stagingDir, { skipApiReference: !!options.skipApiReference }))
    ensureDocsLandingPage(stagingDir, organized.title || sourceUrl.hostname)
    styles.ok(
      `Staged ${styles.bold(String(staged.fileCount))} stub${staged.fileCount === 1 ? '' : 's'} across ${styles.bold(String(staged.dirCount))} director${staged.dirCount === 1 ? 'y' : 'ies'} in ${styles.bold(formatDuration(Date.now() - stageStart))}.`,
    )
    if (staged.skippedApiRef > 0) {
      styles.info(`Skipped ${styles.bold(String(staged.skippedApiRef))} API reference page${staged.skippedApiRef === 1 ? '' : 's'} (--skip-api-reference)`)
    }
    console.log()

    if (options.test) {
      // Dev-preview path: no zip, staging dir is intentionally kept alive
      // (the finally below skips cleanup under --test) for the dev server.
      styles.ok(`Done in ${styles.bold(formatDuration(Date.now() - startedAt))}! Staged ${styles.bold(String(staged.fileCount))} files at ${styles.bold(stagingDir)}`)
      console.log()
      styles.info('Starting the dev server for preview...')
      console.log()
      await runDevPreview(stagingDir)
      return { source: 'url', stagingDir, fileCount: staged.fileCount, duration: Date.now() - startedAt, phases }
    }

    if (staged.fileCount === 0) {
      styles.warning('Staging directory is empty — skipping zip.')
      return { source: 'url', fileCount: 0, duration: Date.now() - startedAt, phases }
    }

    styles.info(`Packaging ${styles.bold(String(staged.fileCount))} files into ${styles.bold(outputZip)}...`)
    await timePhase('zip', () => createZip(stagingDir, outputZip))

    console.log()
    styles.ok(`Done in ${styles.bold(formatDuration(Date.now() - startedAt))}! Your ReadMe import is ready at ${styles.bold(outputZip)}`)
    console.log(styles.dim(` ⏱ ${phases.map((p) => `${p.label} ${formatDuration(p.ms)}`).join(' · ')}`))
    result = { source: 'url', outputZip, fileCount: staged.fileCount, duration: Date.now() - startedAt, phases }
  } finally {
    // Always clean up the tmp staging dir — except under --test, where the
    // dev server is still serving out of it.
    if (!options.test) {
      fs.rmSync(stagingDir, { recursive: true, force: true })
    }
  }

  return result
}
354
+
355
/**
 * CLI entrypoint around `importDocs`: reports failures in a friendly way,
 * then terminates the process explicitly. The explicit exit matters because
 * lingering open handles (Claude SDK, fetch keep-alive) would otherwise keep
 * the process alive for 20-30s after the work is finished, making wall-clock
 * time disagree with the "Done in" message.
 */
export async function run(options) {
  let exitCode = 0
  try {
    await importDocs(options)
  } catch (err) {
    styles.error(err.message || String(err))
    exitCode = 1
  }
  process.exit(exitCode)
}
371
+
372
+ /**
373
+ * Import path for local OpenAPI spec files. The spec is copied verbatim into
374
+ * `reference/` — the git-format build pipeline auto-generates endpoint pages
375
+ * from the spec at render time, so we don't need to stub anything here.
376
+ */
377
+ async function runOasImport(sourcePath, options, startedAt, phases, timePhase) {
378
+ const absPath = path.resolve(sourcePath)
379
+ if (!fs.existsSync(absPath) || !fs.statSync(absPath).isFile()) {
380
+ throw new Error(`File not found: ${absPath}`)
381
+ }
382
+ const ext = path.extname(absPath).toLowerCase()
383
+ if (!['.json', '.yaml', '.yml'].includes(ext)) {
384
+ throw new Error(`Unsupported file type ${ext || '(none)'} — expected .json, .yaml, or .yml.`)
385
+ }
386
+
387
+ const basename = path.basename(absPath, ext)
388
+ const outputZip = path.resolve(options.output || path.join(process.cwd(), `${basename}-readme.zip`))
389
+
390
+ console.log()
391
+ styles.info(`Importing OpenAPI spec from ${styles.bold(absPath)}`)
392
+ if (!options.test) styles.info(`Output: ${styles.bold(outputZip)}`)
393
+ console.log()
394
+
395
+ // Parse + sanity-check it's actually an OAS before we stage anything.
396
+ // We do this in two stages: a cheap parse + looks-like-OAS check (fail
397
+ // fast on clearly wrong inputs), then a normalize step that will repair
398
+ // fixable issues (Swagger 2 → OpenAPI 3 conversion, bundling $refs, etc.).
399
+ const { spec, opCount, wasFixed, fixReason } = await timePhase('parse spec', async () => {
400
+ const raw = fs.readFileSync(absPath, 'utf-8')
401
+ let parsed
402
+ try {
403
+ parsed = ext === '.json' ? JSON.parse(raw) : yamlRequire().load(raw)
404
+ } catch (e) {
405
+ throw new Error(`Couldn't parse ${absPath} as ${ext === '.json' ? 'JSON' : 'YAML'}: ${e.message}`)
406
+ }
407
+ if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {
408
+ throw new Error(`File isn't a usable object: ${absPath}`)
409
+ }
410
+ if (!looksLikeOas(parsed)) {
411
+ throw new Error(`Not an OpenAPI/Swagger spec — no top-level openapi / swagger field and no paths section.`)
412
+ }
413
+
414
+ // Normalize via oas-normalize. `bundle()` handles the common fix-ups:
415
+ // Swagger 2 → OpenAPI 3, Postman collection → OpenAPI, inline $ref
416
+ // resolution. We try it first (even on apparently-valid specs) so Postman
417
+ // collections actually get converted. If bundle errors we fall back to
418
+ // the original spec when it passes validate, else fail hard.
419
+ const normalizer = new OASNormalize(parsed)
420
+ try {
421
+ const bundled = await normalizer.bundle()
422
+ const changed = JSON.stringify(bundled) !== JSON.stringify(parsed)
423
+ return {
424
+ spec: bundled,
425
+ opCount: countOperations(bundled),
426
+ wasFixed: changed,
427
+ fixReason: changed ? 'normalized (Swagger 2 → OpenAPI 3, Postman conversion, or $ref inlining)' : null,
428
+ }
429
+ } catch (bundleErr) {
430
+ try {
431
+ await normalizer.validate()
432
+ return { spec: parsed, opCount: countOperations(parsed), wasFixed: false }
433
+ } catch (validateErr) {
434
+ throw new Error(
435
+ `Spec is invalid and couldn't be auto-fixed.\n Validation error: ${validateErr.message.split('\n')[0]}\n Fix attempt error: ${bundleErr.message.split('\n')[0]}`,
436
+ )
437
+ }
438
+ }
439
+ })
440
+
441
+ if (wasFixed) {
442
+ styles.warning(`Spec had issues — auto-fixed (${fixReason}).`)
443
+ }
444
+
445
+ const title = spec.info?.title || basename
446
+ const version = spec.info?.version || null
447
+ styles.ok(`Parsed OpenAPI ${version ? 'v' + version + ' ' : ''}spec — ${styles.bold(title)} (${styles.bold(String(opCount))} operation${opCount === 1 ? '' : 's'}).`)
448
+ console.log()
449
+
450
+ const stagingDir = fs.mkdtempSync(path.join(os.tmpdir(), 'readme-import-'))
451
+ let result
452
+ try {
453
+ const { stagedName } = await timePhase('stage spec', async () => {
454
+ // If we auto-fixed the spec, serialize the fixed version as JSON (always
455
+ // writable, avoids YAML-ambiguity regressions). Otherwise copy the
456
+ // original file verbatim so formatting/comments are preserved.
457
+ const rawName = path.basename(absPath)
458
+ let targetName
459
+ let targetContent
460
+ if (wasFixed) {
461
+ targetName = rawName.replace(/\.(ya?ml|json)$/i, '.json')
462
+ targetContent = JSON.stringify(spec, null, 2)
463
+ } else {
464
+ targetName = rawName
465
+ targetContent = null // signal to copy
466
+ }
467
+ const targetPath = path.join(stagingDir, 'reference', targetName)
468
+ fs.mkdirSync(path.dirname(targetPath), { recursive: true })
469
+ if (targetContent === null) {
470
+ fs.copyFileSync(absPath, targetPath)
471
+ } else {
472
+ fs.writeFileSync(targetPath, targetContent)
473
+ }
474
+
475
+ // syncOas walks reference/ and generates one <operationId>.md per
476
+ // operation, grouped by tag. We pass stagingDir as the "git root" so
477
+ // its refDir lookup lands on stagingDir/reference/.
478
+ syncOas(stagingDir)
479
+ return { stagedName: targetName }
480
+ })
481
+
482
+ // Ensure there's always at least a landing page — OAS-only imports leave
483
+ // docs/ empty, which makes `--test` dev server show "no pages" at /.
484
+ ensureDocsLandingPage(stagingDir, title, opCount)
485
+
486
+ // OAS operation pages don't need an x-import URL — their content is
487
+ // intrinsic to the spec (summary/description live in the OpenAPI doc,
488
+ // and the page's `api:` frontmatter already points back to it).
489
+ const pageCount = countReferencePages(stagingDir, stagedName)
490
+
491
+ styles.ok(`Staged ${styles.bold(stagedName)} and generated ${styles.bold(String(pageCount))} operation page${pageCount === 1 ? '' : 's'} under ${styles.bold('reference/')}.`)
492
+ console.log()
493
+
494
+ if (options.test) {
495
+ styles.ok(`Done in ${styles.bold(formatDuration(Date.now() - startedAt))}! Staged at ${styles.bold(stagingDir)}`)
496
+ console.log()
497
+ styles.info('Starting the dev server for preview...')
498
+ console.log()
499
+ await runDevPreview(stagingDir)
500
+ return { source: 'oas', stagingDir, fileCount: pageCount, duration: Date.now() - startedAt, phases }
501
+ }
502
+
503
+ await timePhase('zip', () => createZip(stagingDir, outputZip))
504
+
505
+ console.log()
506
+ styles.ok(`Done in ${styles.bold(formatDuration(Date.now() - startedAt))}! Your ReadMe import is ready at ${styles.bold(outputZip)}`)
507
+ console.log(styles.dim(` ⏱ ${phases.map((p) => `${p.label} ${formatDuration(p.ms)}`).join(' · ')}`))
508
+ result = { source: 'oas', outputZip, fileCount: pageCount, duration: Date.now() - startedAt, phases }
509
+ } finally {
510
+ if (!options.test) fs.rmSync(stagingDir, { recursive: true, force: true })
511
+ }
512
+ return result
513
+ }
514
+
515
// js-yaml is installed transitively (via oas-sync.js) rather than as a direct
// dependency. Resolve it lazily on first use so the JSON-only code path never
// pays the cost of loading it.
let _yaml = null
function yamlRequire() {
  if (_yaml === null) {
    // createRequire lets this ESM module reach into CJS resolution.
    _yaml = createRequire(import.meta.url)('js-yaml')
  }
  return _yaml
}
525
+
526
/**
 * Cheap first-pass sniff: does this parsed document look like an
 * OpenAPI/Swagger spec (or a Postman collection we can convert)?
 * Accepts anything with a version field OR a paths section — some specs in
 * the wild drop the version; the follow-up oas-normalize pass still catches
 * malformed inputs that slip through here.
 */
function looksLikeOas(obj) {
  if (obj === null || typeof obj !== 'object') return false
  const hasVersionField = typeof obj.openapi === 'string' || typeof obj.swagger === 'string'
  const hasPathsSection = Boolean(obj.paths) && typeof obj.paths === 'object'
  // Postman collections advertise themselves via info.schema — oas-normalize
  // auto-converts these to OpenAPI downstream.
  const looksLikePostman = Boolean(obj.info) && typeof obj.info.schema === 'string' && /getpostman\.com/i.test(obj.info.schema)
  return hasVersionField || hasPathsSection || looksLikePostman
}
540
+
541
/**
 * Count HTTP operations (get/post/put/…) across every path item in the spec.
 * Non-method keys (parameters, servers, $ref, …) are ignored; null path
 * items are tolerated.
 */
function countOperations(spec) {
  const methodRe = /^(get|post|put|patch|delete|options|head|trace)$/i
  const pathItems = Object.values(spec.paths || {})
  return pathItems.reduce(
    (total, item) => total + Object.keys(item || {}).filter((key) => methodRe.test(key)).length,
    0,
  )
}
550
+
551
/**
 * Count the operation pages syncOas generated under reference/ for the given
 * spec file, matched via the `api.file` frontmatter key. Used only for the
 * "generated N pages" success message.
 */
function countReferencePages(stagingDir, specFilename) {
  const refDir = path.join(stagingDir, 'reference')
  if (!fs.existsSync(refDir)) return 0

  let count = 0
  // Iterative depth-first walk over reference/ using an explicit stack.
  const pending = [refDir]
  while (pending.length > 0) {
    const dir = pending.pop()
    for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
      const full = path.join(dir, entry.name)
      if (entry.isDirectory()) {
        pending.push(full)
      } else if (entry.name.endsWith('.md')) {
        const frontmatter = matter(fs.readFileSync(full, 'utf-8')).data || {}
        if (frontmatter.api && frontmatter.api.file === specFilename) count++
      }
    }
  }
  return count
}
577
+
578
/**
 * Write a `docs/Getting Started/getting-started.md` landing page when docs/
 * is missing or empty (typical after an OAS-only import). The body points at
 * the Reference tab so users can find the imported operations — the dev
 * server only shows one sidebar section at a time, so a blank landing on
 * `/docs/...` makes it look like nothing imported.
 */
function ensureDocsLandingPage(stagingDir, siteTitle, opCount = 0) {
  const docsDir = path.join(stagingDir, 'docs')
  const alreadyHasDocs = fs.existsSync(docsDir) && fs.readdirSync(docsDir).length > 0
  if (alreadyHasDocs) return

  const categoryDir = path.join(docsDir, 'Getting Started')
  fs.mkdirSync(categoryDir, { recursive: true })

  const name = siteTitle || 'your API'
  const title = siteTitle ? `Welcome to ${siteTitle}` : 'Getting Started'
  let body
  if (opCount > 0) {
    body = `This import brought in **${opCount} API operation${opCount === 1 ? '' : 's'}** from ${name}.\n\n👉 [Browse the API Reference →](/reference)\n\nThis page is a placeholder landing. Replace or expand it with onboarding content specific to your API.\n`
  } else {
    body = `This is a placeholder landing page. Replace it with your docs.\n`
  }

  const pagePath = path.join(categoryDir, 'getting-started.md')
  fs.writeFileSync(pagePath, matter.stringify(body, { title, icon: formatIconClass('rocket') }))
  fs.writeFileSync(path.join(categoryDir, '_order.yaml'), '- getting-started\n')
  fs.writeFileSync(path.join(docsDir, '_order.yaml'), '- Getting Started\n')
}
602
+
603
/**
 * Run a Claude agent in the staging directory. Streams tool calls + assistant
 * text to the terminal as they arrive. Writes are scoped to the staging dir
 * via `makeStagingGuard(cwd)`.
 *
 * @param {object} opts
 * @param {string} opts.userPrompt - prompt sent to the agent.
 * @param {string} [opts.systemPrompt] - optional system prompt; omitted from options when falsy.
 * @param {string} opts.cwd - working directory; also the write boundary enforced on tools.
 * @param {string} [opts.model] - optional model override; omitted from options when falsy.
 * @returns {Promise<void>} resolves when a successful `result` message arrives.
 * @throws {Error} when the result message has a non-"success" subtype; the
 *   error carries `.subtype` and `.result` so callers can inspect details.
 */
export async function runAgent({ userPrompt, systemPrompt, cwd, model }) {
  for await (const message of query({
    prompt: userPrompt,
    options: {
      cwd,
      // Read-only tools plus Write/Edit; the guard below vetoes writes that
      // would land outside `cwd`.
      allowedTools: ['Read', 'Write', 'Edit', 'Glob', 'Grep'],
      permissionMode: 'acceptEdits',
      canUseTool: makeStagingGuard(cwd),
      // Spread conditionally so undefined keys never reach the SDK.
      ...(systemPrompt ? { systemPrompt } : {}),
      ...(model ? { model } : {}),
    },
  })) {
    if (message.type === 'assistant' && message.message?.content) {
      for (const block of message.message.content) {
        if (block.type === 'text' && block.text?.trim()) {
          // Assistant narration — dimmed so it reads as secondary output.
          console.log(styles.dim(block.text.trim()))
        } else if (block.type === 'tool_use') {
          // One line per tool invocation, e.g. "› Write".
          console.log(`${styles.brand('›')} ${styles.bold(block.name)}`)
        }
      }
    } else if (message.type === 'result') {
      if (message.subtype && message.subtype !== 'success') {
        const err = new Error(`Agent result subtype=${message.subtype}${message.error?.message ? ': ' + message.error.message : ''}`)
        err.subtype = message.subtype
        err.result = message
        throw err
      }
      // Successful result terminates the stream consumption.
      return
    }
  }
}
638
+
639
/**
 * Build a `canUseTool` callback that confines write-capable tools to the
 * staging directory. Read-only tools always pass through.
 */
function makeStagingGuard(stagingDir) {
  const absStaging = path.resolve(stagingDir)
  const WRITE_TOOLS = new Set(['Write', 'Edit', 'NotebookEdit', 'MultiEdit'])

  return async (toolName, input) => {
    if (!WRITE_TOOLS.has(toolName)) return { behavior: 'allow' }

    const fp = input?.file_path
    if (typeof fp !== 'string' || fp === '') {
      return { behavior: 'deny', message: `${toolName}: missing file_path` }
    }

    // Resolve relative paths against the staging dir, then check containment.
    const abs = path.isAbsolute(fp) ? path.resolve(fp) : path.resolve(absStaging, fp)
    const rel = path.relative(absStaging, abs)
    const escapesStaging = rel === '' || rel.startsWith('..') || path.isAbsolute(rel)
    if (escapesStaging) {
      styles.warning(`Blocked ${toolName} outside staging: ${fp}`)
      return {
        behavior: 'deny',
        message: `Writes must stay inside the staging directory ${absStaging}. Refused: ${fp}`,
      }
    }
    return { behavior: 'allow' }
  }
}
662
+
663
/**
 * Hand off to the dev command so the user can preview the staged docs. Reuses
 * the currently-running CLI binary so dev-server fixes ship immediately to
 * users on this version (and local development needs no publish to test).
 * Falls back to `npx @readme/cli` when we can't locate ourselves. Stdout is
 * piped so the first localhost URL can be detected and opened in a browser.
 */
function runDevPreview(stagingDir) {
  return new Promise((resolve, reject) => {
    const ownScript = process.argv[1] && fs.existsSync(process.argv[1]) ? process.argv[1] : null
    let cmd
    let args
    if (ownScript) {
      cmd = process.execPath
      args = [ownScript, 'dev', '--no-check']
    } else {
      cmd = 'npx'
      args = ['--yes', '@readme/cli', 'dev', '--no-check']
    }

    const child = spawn(cmd, args, {
      cwd: stagingDir,
      stdio: ['inherit', 'pipe', 'inherit'],
    })

    let browserOpened = false
    child.stdout.on('data', (chunk) => {
      // Mirror the dev server's output while watching for its URL.
      process.stdout.write(chunk)
      if (browserOpened) return
      const urlMatch = chunk.toString().match(/https?:\/\/localhost:\d+/)
      if (urlMatch) {
        browserOpened = true
        openUrl(urlMatch[0])
      }
    })

    child.on('close', () => resolve())
    child.on('error', reject)
  })
}
694
+
695
/** Best-effort open of `url` in the default browser; failures are ignored. */
function openUrl(url) {
  let cmd
  let args
  if (process.platform === 'darwin') {
    cmd = 'open'
    args = [url]
  } else if (process.platform === 'win32') {
    cmd = 'cmd'
    args = ['/c', 'start', '""', url]
  } else {
    cmd = 'xdg-open'
    args = [url]
  }
  try {
    spawn(cmd, args, { stdio: 'ignore', detached: true }).unref()
  } catch {
    // Best-effort — the URL is still in the terminal output for the user.
  }
}
704
+
705
/**
 * Recursively list files under `dir` as `/`-joined relative paths, skipping
 * dot-prefixed entries. Returns an empty array for a missing directory.
 */
function listFiles(dir, prefix = '') {
  if (!fs.existsSync(dir)) return []
  const collected = []
  for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
    if (entry.name.startsWith('.')) continue
    const rel = prefix === '' ? entry.name : `${prefix}/${entry.name}`
    if (entry.isDirectory()) {
      for (const nested of listFiles(path.join(dir, entry.name), rel)) {
        collected.push(nested)
      }
    } else {
      collected.push(rel)
    }
  }
  return collected
}
719
+
720
/**
 * Zip the contents of `sourceDir` into `outputZip` using the system `zip`
 * binary. Creates the destination directory and replaces any pre-existing
 * archive. Rejects when `zip` exits non-zero or fails to spawn.
 */
function createZip(sourceDir, outputZip) {
  fs.mkdirSync(path.dirname(outputZip), { recursive: true })
  if (fs.existsSync(outputZip)) fs.rmSync(outputZip)

  return new Promise((resolve, reject) => {
    const zipProc = spawn('zip', ['-r', '-q', outputZip, '.'], {
      cwd: sourceDir,
      stdio: 'inherit',
    })
    zipProc.on('error', reject)
    zipProc.on('close', (code) => {
      if (code !== 0) {
        reject(new Error(`zip exited with code ${code}`))
        return
      }
      resolve()
    })
  })
}
736
+
737
+ /**
738
+ * Ask Claude to fold a parsed llms.txt into a category/hierarchy JSON object.
739
+ * Returns { title, categories: [{ title, icon, pages: [{ title, url, description }] }] }.
740
+ *
741
+ * If the input already has sections, Claude is told to use them as starting
742
+ * points and refine. If sections are missing or obviously generic ("Resources",
743
+ * "English"), it invents better ones. Every category gets a FontAwesome icon.
744
+ */
745
/**
 * Mintlify sites ship the canonical sidebar in `docs.json` (v2) or `mint.json`
 * (v1) at the origin root. When present, it's a perfect structural source —
 * no HTML parsing needed. Pages are listed by slug; we enrich titles from
 * llms.txt where available, otherwise fall back to a title derived from the
 * slug.
 *
 * @param {string} sourceUrl - any page URL on the target site; only its origin is used.
 * @param {Array<{url: string, title?: string, description?: string}>} knownPages - pages from llms.txt.
 * @param {string|null|undefined} firecrawlKey - when set, fetch via Firecrawl instead of direct HTTP.
 * @returns {Promise<{source: string, title: string|null, categories: Array}|null>}
 *   null if no Mintlify config is found or parseable.
 */
async function tryMintlifyNav(sourceUrl, knownPages, firecrawlKey) {
  const origin = new URL(sourceUrl).origin
  const fetchHtml = firecrawlKey ? makeFirecrawlFetcher(firecrawlKey) : fetchHtmlDirect

  // Index llms.txt pages by normalized pathname so config slugs can be
  // matched back to known titles/descriptions.
  const byPath = new Map()
  for (const p of knownPages) byPath.set(normalizePath(p.url), p)

  for (const filename of ['docs.json', 'mint.json']) {
    // Fix: the URL previously interpolated a literal `$(unknown)` instead of
    // the candidate config filename, so neither docs.json nor mint.json could
    // ever be fetched.
    const configUrl = `${origin}/${filename}`
    const body = await fetchHtml(configUrl)
    if (!body) continue
    const config = extractMintlifyConfig(body)
    if (!config || !config.navigation) continue
    const parsed = parseMintlifyConfig(config, origin, byPath)
    if (parsed.categories.length > 0) {
      return { source: configUrl, title: parsed.title, categories: parsed.categories }
    }
  }
  return null
}
775
+
776
/**
 * Extract a Mintlify config object from a fetched body. Handles:
 *  - raw JSON (`{ "navigation": ... }`)
 *  - JSON wrapped in HTML (Firecrawl sometimes returns `<pre>...</pre>` or a
 *    formatted view of the JSON body)
 * Returns the parsed object, or null when no JSON span containing a
 * `navigation` key can be recovered.
 */
function extractMintlifyConfig(body) {
  // Fast path: the response body is the raw JSON config itself. Note this
  // returns whatever parses — the caller checks for `.navigation`.
  try {
    return JSON.parse(body)
  } catch {
    // Not raw JSON — fall through to substring extraction.
  }

  const navIdx = body.indexOf('"navigation"')
  if (navIdx === -1) return null

  // Walk candidate `{` starts backwards from the navigation key; for each
  // start, try every `}` end from the back of the document inward until a
  // balanced JSON object with a navigation key parses.
  for (let start = body.lastIndexOf('{', navIdx); start !== -1; start = body.lastIndexOf('{', start - 1)) {
    let end = body.lastIndexOf('}')
    while (end > start) {
      try {
        const parsed = JSON.parse(body.slice(start, end + 1))
        if (parsed && parsed.navigation) return parsed
      } catch {
        // Invalid JSON at this span — keep shrinking the end boundary.
      }
      end = body.lastIndexOf('}', end - 1)
    }
  }
  return null
}
803
+
804
/**
 * Convert a Mintlify config object into our category tree shape.
 *
 * @param {object} config - parsed docs.json / mint.json; `config.navigation`
 *   may be a v1 array of groups or a v2 object (tabs / groups / pages).
 * @param {string} origin - site origin used to build absolute `.md` URLs from slugs.
 * @param {Map<string, object>} byPath - normalized-path → known page (from
 *   llms.txt), used to enrich titles/descriptions.
 * @returns {{title: string|null, categories: Array<{title: string, pages: Array}>}}
 */
function parseMintlifyConfig(config, origin, byPath) {
  const title = config.name || null
  const categories = []

  // "getting-started/quick_start" → "Quick Start" (last path segment,
  // dash/underscore split, Title Case).
  const slugToTitle = (slug) => {
    const base = String(slug).split('/').pop() || slug
    return base
      .split(/[-_]/)
      .filter(Boolean)
      .map((w) => w.charAt(0).toUpperCase() + w.slice(1))
      .join(' ')
  }

  // Normalize one navigation entry into { title, url, [description], [pages] }
  // or null when the shape is unrecognized.
  const pageToEntry = (p) => {
    // Simple slug string → leaf page
    if (typeof p === 'string') {
      const slug = p.replace(/^\//, '')
      const url = `${origin}/${slug}.md`
      const known = byPath.get(normalizePath(url))
      return {
        title: known?.title || slugToTitle(slug),
        url,
        ...(known?.description ? { description: known.description } : {}),
      }
    }
    if (p && typeof p === 'object') {
      // Nested group: recurse into its pages; url:null marks a non-leaf node.
      if (p.group && Array.isArray(p.pages)) {
        return {
          title: p.group,
          url: null,
          pages: p.pages.map(pageToEntry).filter(Boolean),
        }
      }
      // Some v2 shapes: { page: "slug", title?: "..." }
      if (typeof p.page === 'string') {
        const slug = p.page.replace(/^\//, '')
        const url = `${origin}/${slug}.md`
        const known = byPath.get(normalizePath(url))
        return {
          title: p.title || known?.title || slugToTitle(slug),
          url,
          ...(known?.description ? { description: known.description } : {}),
        }
      }
    }
    return null
  }

  // Append a group's pages as a category; groups that yield no usable pages
  // are dropped entirely.
  const pushGroup = (group) => {
    if (!group || !Array.isArray(group.pages)) return
    const pages = group.pages.map(pageToEntry).filter(Boolean)
    if (pages.length === 0) return
    categories.push({ title: group.group || 'Untitled', pages })
  }

  const nav = config.navigation
  // v1: navigation is an array of groups
  if (Array.isArray(nav)) {
    for (const g of nav) pushGroup(g)
  }
  // v2: navigation.tabs[].groups[]
  else if (Array.isArray(nav?.tabs)) {
    for (const tab of nav.tabs) {
      for (const g of tab.groups || []) pushGroup(g)
    }
  }
  // v2: navigation.groups[] (no tabs)
  else if (Array.isArray(nav?.groups)) {
    for (const g of nav.groups) pushGroup(g)
  }
  // v2: navigation.pages[] (flat) — synthesize a single "Documentation" category
  else if (Array.isArray(nav?.pages)) {
    const pages = nav.pages.map(pageToEntry).filter(Boolean)
    if (pages.length > 0) categories.push({ title: 'Documentation', pages })
  }

  return { title, categories }
}
883
+
884
/**
 * Score a parsed nav tree for "sidebar-likeness". A real docs sidebar has
 * multiple section headers (hierarchy) and tens of links; secondary navs
 * (sitemaps, footers, search-index result lists) tend to be flat single-
 * category blobs that are either alphabetized or dumped in insertion order.
 *
 * Flat single-category blocks are penalized so a slightly smaller hierarchical
 * block wins over a bigger flat one. Without this, greenflash.ai's sidebar
 * order was non-deterministic across runs — sometimes Quickstart first (real
 * sidebar won), sometimes last (a flat alphabetized index block won).
 */
function scoreNavTree(tree) {
  const linkCount = tree.categories.reduce((sum, cat) => sum + cat.pages.length, 0)
  const categoryCount = tree.categories.length
  // Hierarchy: reward multiple sections; a flat blob takes a fixed penalty.
  const hierarchyBonus = categoryCount < 2 ? -20 : categoryCount * 5

  // Alphabetical penalty: real sidebars are curated (Overview first, related
  // topics grouped). Auto-generated indexes, search clouds, and footer link
  // lists tend to be strictly alphabetized — categories that arrive in alpha
  // order are almost always noise masquerading as structure.
  let alphaPenalty = 0
  for (const cat of tree.categories) {
    const lowered = (cat.pages || []).map((page) => (page.title || '').toLowerCase())
    if (lowered.length >= 3 && isMonotonicAlpha(lowered)) alphaPenalty -= 15
  }

  return linkCount + hierarchyBonus + alphaPenalty
}

/** True when `titles` is already in non-descending lexicographic order. */
function isMonotonicAlpha(titles) {
  return titles.every((t, i) => i === 0 || titles[i - 1] <= t)
}
917
+
918
/**
 * Fetch the source URL, find the `<nav>` or `<aside>` that contains the most
 * links matching our known llms.txt URLs, and extract its heading/link
 * structure into { title, categories: [{ title, pages: [...] }] }.
 *
 * Generic approach — no site-specific selectors. Works on any docs site that
 * renders its sidebar server-side as <nav>/<aside> with <h*> section headers.
 * Returns null if coverage is too low to be useful.
 *
 * @param {string} sourceUrl - starting page; its branch of the sidebar is visible on first fetch.
 * @param {Array<{url: string}>} knownPages - pages from llms.txt; empty array triggers "discovery mode".
 * @param {string|null|undefined} firecrawlKey - when set, fetch via Firecrawl instead of direct HTTP.
 * @returns {Promise<{title: null, categories: Array}|null>}
 */
async function scrapeNavFromSite(sourceUrl, knownPages, firecrawlKey) {
  // Index known pages by normalized pathname so we can match nav hrefs against them.
  const byPath = new Map()
  for (const p of knownPages) byPath.set(normalizePath(p.url), p)

  const fetchHtml = firecrawlKey ? makeFirecrawlFetcher(firecrawlKey) : fetchHtmlDirect

  const visited = new Set()
  const matched = new Map() // normalizedPath → page
  const placed = new Set() // URLs already placed into some category (prevents cross-category duplication when round-1 visits reshape the tree)
  const categoryByTitle = new Map()
  const categoryOrder = []

  // Fetch one URL, pick its best nav block, and merge anything new into our
  // running tree. Each page on a typical docs site renders the full sidebar
  // with its own branch expanded, so repeated visits into different branches
  // accumulate coverage. Returns the number of newly-added unique pages.
  async function visit(url) {
    const vkey = normalizePath(url)
    if (visited.has(vkey)) return 0
    visited.add(vkey)

    const html = await fetchHtml(url)
    if (!html) return 0

    const base = new URL(url)
    // Best nav candidate seen on this page, by scoreNavTree().
    let best = { score: -Infinity, count: 0, tree: null }

    // Tier 1: <nav>/<aside> elements — semantic markup, almost always the real sidebar.
    const blockRegex = /<(nav|aside)\b[^>]*>([\s\S]*?)<\/\1>/gi
    let m
    while ((m = blockRegex.exec(html)) !== null) {
      const tree = parseNavBlock(m[2], base, byPath)
      const score = scoreNavTree(tree)
      if (score > best.score) {
        const count = tree.categories.reduce((n, c) => n + c.pages.length, 0)
        best = { score, count, tree }
      }
    }

    // Tier 2: <div>/<ul>/<section> containers whose attributes look sidebar-shaped
    // (id="sidebar-group", class*="sidebar", role="navigation", etc.). Catches
    // Mintlify-style stacks (greenflash.ai, mintlify.com clones) where the sidebar
    // lives in a plain <div>. We need balanced-tag extraction since divs nest.
    for (const block of extractSidebarContainers(html)) {
      const tree = parseNavBlock(block, base, byPath)
      const score = scoreNavTree(tree)
      if (score > best.score) {
        const count = tree.categories.reduce((n, c) => n + c.pages.length, 0)
        best = { score, count, tree }
      }
    }

    // Tier 3 (last resort): parse the whole document, filter out noise. Risky —
    // alphabetical link clusters elsewhere on the page (TOCs, footers, indexes)
    // can pollute the result. Only used when earlier tiers didn't find enough.
    if (best.count < 10) {
      const wholeTree = parseNavBlock(html, base, byPath)
      const seen = new Set()
      const filtered = []
      for (const cat of wholeTree.categories) {
        const keptPages = filterDedupePages(cat.pages, seen)
        // Categories with fewer than 2 surviving links are discarded as noise.
        if (keptPages.length >= 2) filtered.push({ ...cat, pages: keptPages })
      }
      const filteredCount = filtered.reduce((n, c) => n + c.pages.length, 0)
      const filteredTree = { title: null, categories: filtered }
      const filteredScore = scoreNavTree(filteredTree)
      // Compare on score, not count — a noisy 20-link alphabetical cluster
      // shouldn't replace a clean 8-link real sidebar.
      if (filteredScore > best.score) {
        best = { score: filteredScore, count: filteredCount, tree: filteredTree }
      }
    }

    if (!best.tree) return 0

    // Merge this page's best tree into the accumulated category order.
    let added = 0
    for (const cat of best.tree.categories) {
      let existing = categoryByTitle.get(cat.title)
      if (!existing) {
        existing = { title: cat.title, pages: [] }
        categoryByTitle.set(cat.title, existing)
        categoryOrder.push(existing)
      }
      added += mergePages(cat.pages, existing, matched)
    }
    return added
  }

  /**
   * Merge `incoming` page tree into `target.pages`, recursing into sub-pages.
   * If a page already exists under `target`, we recurse into it to add any
   * newly-discovered children. If a page is globally `placed` under a
   * different category, we skip it — round-1 visits often reshape the tree
   * and we don't want the same URL to appear in multiple categories.
   * Returns the number of newly-added unique pages across the entire sub-tree.
   */
  function mergePages(incoming, target, matched) {
    let added = 0
    for (const page of incoming) {
      const norm = normalizePath(page.url)
      // Prefer an exact-URL match; fall back to normalized-path equality.
      let existing = target.pages.find((p) => p.url === page.url) || target.pages.find((p) => normalizePath(p.url) === norm)
      if (!existing) {
        // Already lives in a different category — don't add here.
        if (placed.has(norm)) {
          if (page.pages && page.pages.length > 0) {
            // Still merge its children into wherever the canonical page lives.
            const canonical = matched.get(norm)
            if (canonical) added += mergePages(page.pages, canonical, matched)
          }
          continue
        }
        existing = {
          title: page.title,
          url: page.url,
          ...(page.description ? { description: page.description } : {}),
          pages: [],
        }
        target.pages.push(existing)
        placed.add(norm)
        if (!matched.has(norm)) {
          matched.set(norm, existing)
          added++
        }
      }
      if (page.pages && page.pages.length > 0) {
        added += mergePages(page.pages, existing, matched)
      }
    }
    return added
  }

  // Round 0: the source URL itself — reveals top-level + the source page's branch.
  const r0Start = Date.now()
  await visit(sourceUrl)
  const r0Ms = Date.now() - r0Start
  if (categoryOrder.length === 0) return null

  // Round 1 (parallel): visit pages so each branch has a chance to expose
  // its sub-items. Sidebars on most docs sites auto-expand the current
  // page's own branch on render, so visiting a rep per branch is what
  // surfaces those hidden children.
  //
  // With llms.txt (full mode) we already have a trusted page list and the
  // scrape has real category headers — one rep per scraped category is
  // enough. Without llms.txt (discovery mode) everything may have collapsed
  // into a single flat category and we don't yet know which of those pages
  // is a parent; visit all of them up to a cap.
  const isDiscovery = knownPages.length === 0
  const MAX_DISCOVERY_FETCHES = 20
  const r1Urls = isDiscovery
    ? flattenTree(categoryOrder).slice(0, MAX_DISCOVERY_FETCHES)
    : categoryOrder
        .map((c) => c.pages[0])
        .filter(Boolean)
        .map((p) => toBrowsableUrl(p.url))
  const r1Start = Date.now()
  // Firecrawl standard-plan concurrency is 10; 5 leaves headroom for retries.
  // Native HTTP can run hotter since we're hitting our own loopback.
  await visitAllInParallel(r1Urls, visit, firecrawlKey ? 5 : 10)
  const r1Ms = Date.now() - r1Start
  console.log(
    styles.dim(` ⏱ scrape breakdown: round0=${formatDuration(r0Ms)} round1=${formatDuration(r1Ms)} (${r1Urls.length} ${isDiscovery ? 'discovery' : 'category rep'} fetches)`),
  )

  // Accept thresholds — looser in discovery mode (no llms.txt) where even a
  // single flat "Overview" bucket is better than nothing, stricter when we
  // have llms.txt to compare against.
  const categories = categoryOrder.filter((c) => c.pages.length > 0)
  if (isDiscovery) {
    if (categories.length < 1 || matched.size < 5) return null
  } else {
    if (categories.length < 2 || matched.size < 10) return null
  }
  return { title: null, categories }
}
1103
+
1104
+ /**
1105
+ * Walk a nav block in document order, splitting links into categories at each
1106
+ * <h*> heading. Links whose hrefs resolve to a known page land in the current
1107
+ * category (or a leading "Overview" bucket if they appear before any heading).
1108
+ */
1109
/**
 * Find sidebar-shaped container elements (<div>/<ul>/<section>) in `html` and
 * return their inner HTML. Looks for tag attributes like id="sidebar*",
 * class*="sidebar", role="navigation", aria-label*="navigation". Uses
 * balanced-tag walking so nested elements with the same tag don't break the
 * boundaries.
 */
function extractSidebarContainers(html) {
  // Match attribute patterns commonly used for the sidebar. Case-insensitive
  // partial-string matches on id/class so "sidebar-group", "sidebarNav",
  // "DocsSidebar__container" etc. all hit.
  const SIDEBAR_ATTR_RE =
    /\b(?:id|class|aria-label|data-testid)=(?:"[^"]*sidebar[^"]*"|'[^']*sidebar[^']*'|"[^"]*navigation[^"]*"|'[^']*navigation[^']*')|\brole=(?:"navigation"|'navigation')/i

  const containers = []
  for (const tag of ['div', 'ul', 'section']) {
    const openRe = new RegExp(`<${tag}\\b([^>]*)>`, 'gi')
    for (let m = openRe.exec(html); m !== null; m = openRe.exec(html)) {
      if (!SIDEBAR_ATTR_RE.test(m[1])) continue
      const inner = extractBalancedTag(html, tag, m.index + m[0].length)
      // Skip unbalanced markup and trivially small containers.
      if (inner != null && inner.length > 100) containers.push(inner)
    }
  }
  return containers
}

/**
 * Starting at `startIdx` (just past an opening `<tag …>`), walk forward
 * through `html` tracking nested opens/closes of the same tag. Returns the
 * inner HTML up to the matching close tag, or null if unbalanced.
 *
 * Generous about case and whitespace; void-element rules are NOT applied
 * (we only call this for non-void containers: div/ul/section).
 */
function extractBalancedTag(html, tag, startIdx) {
  const tokenRe = new RegExp(`<(/?)${tag}\\b[^>]*>`, 'gi')
  tokenRe.lastIndex = startIdx
  let depth = 1
  for (let m = tokenRe.exec(html); m !== null; m = tokenRe.exec(html)) {
    depth += m[1] === '/' ? -1 : 1
    if (depth === 0) return html.slice(startIdx, m.index)
    if (tokenRe.lastIndex - startIdx > 1_500_000) return null // safety cap on runaway input
  }
  return null
}
1161
+
1162
/**
 * Walk a nav block's HTML in document order, splitting links into categories
 * at each <h*> (or heading-like <p>) token. Links that appear before any
 * heading land in a synthesized leading "Overview" bucket. Nested <ul>s turn
 * the preceding link into a parent, building a page sub-tree.
 *
 * @param {string} blockHtml - inner HTML of a candidate nav container.
 * @param {URL} base - page URL used to resolve relative hrefs.
 * @param {Map<string, object>} byPath - normalized-path → known page (from
 *   llms.txt). When empty, runs in discovery mode and synthesizes page
 *   entries from the links themselves.
 * @returns {{title: null, categories: Array<{title: string, pages: Array}>}}
 *   Only categories that collected at least one page are returned.
 */
function parseNavBlock(blockHtml, base, byPath) {
  // Five alternatives, carefully ordered for regex semantics:
  // 1. <h*>…</h*>       — classic heading
  // 2. <a href="…">…</a> — link (matched greedily as a unit, so inner <p>
  //                        tags inside link text are consumed and NOT treated
  //                        as headings)
  // 3. <p>…</p>          — bare paragraph used as section heading by
  //                        fumadocs (zod.dev) and similar frameworks
  // 4. <ul …>            — start of nested list → subsequent <a>s are
  //                        children of the most recently emitted <a> at the
  //                        outer level
  // 5. </ul>             — close of nested list → pop parent stack
  const tokenRegex = /<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>|<a\b[^>]*\bhref="([^"]+)"[^>]*>([\s\S]*?)<\/a>|<p\b[^>]*>([\s\S]*?)<\/p>|<ul\b[^>]*>|<\/ul>/gi

  const categories = []
  let current = null
  // Synthesized "Overview" bucket for links seen before the first heading.
  let leading = null
  // Stack of parent page objects. When inside a nested <ul>, new links attach
  // to the top of the stack (= the <a> that preceded the opening <ul>).
  const parentStack = []
  // The most recent link we emitted, at the current depth. Becomes the parent
  // if a <ul> opens next.
  let lastLinkAtDepth = null

  const resetCategoryState = () => {
    parentStack.length = 0
    lastLinkAtDepth = null
  }

  let m
  while ((m = tokenRegex.exec(blockHtml)) !== null) {
    const token = m[0]

    if (/^<\/ul\b/i.test(token)) {
      // Leaving a nested list: its parent is no longer the attach point.
      parentStack.pop()
      lastLinkAtDepth = null
      continue
    }
    if (/^<ul\b/i.test(token)) {
      // A <ul> opening right after a link means that link becomes a parent.
      if (lastLinkAtDepth) parentStack.push(lastLinkAtDepth)
      lastLinkAtDepth = null
      continue
    }

    if (m[1]) {
      // <h*> heading — start a new category.
      const title = stripTags(m[2]).trim()
      if (!title) continue
      current = { title, pages: [] }
      categories.push(current)
      resetCategoryState()
      continue
    }
    if (m[5] !== undefined) {
      // <p> pseudo-heading — only short, non-sentence text qualifies (long
      // text or trailing punctuation reads as prose, not a section label).
      const title = stripTags(m[5]).trim()
      if (!title || title.length > 60 || /[.!?]\s*$/.test(title)) continue
      current = { title, pages: [] }
      categories.push(current)
      resetCategoryState()
      continue
    }

    // Link.
    const href = m[3]
    if (!href || href.startsWith('#') || href.startsWith('mailto:') || href.startsWith('javascript:')) continue
    let abs
    try {
      abs = new URL(href, base).toString()
    } catch {
      continue
    }

    // byPath is populated from llms.txt. When it's empty we're in discovery
    // mode — fall back to synthesizing a page entry from the link itself,
    // filtered to same-origin non-asset URLs so we don't slurp every footer,
    // social, or static file link on the page.
    let page = byPath.size > 0 ? byPath.get(normalizePath(abs)) : null
    if (!page && byPath.size === 0) {
      if (!isDiscoverableLink(abs, base)) continue
      const text = stripTags(m[4] || '').trim()
      if (!text || text.length > 150) continue
      page = { title: text, url: abs }
    }
    if (!page) continue

    // Attach point: innermost <ul> parent, else the current category, else
    // the lazily-created leading "Overview" bucket.
    const parent = parentStack.length > 0 ? parentStack[parentStack.length - 1] : current || leading || (leading = { title: 'Overview', pages: [] })
    if (leading && parent === leading && categories[0] !== leading) categories.unshift(leading)

    const existing = parent.pages.find((p) => p.url === page.url)
    if (existing) {
      // Duplicate link — reuse it so a following <ul> still nests correctly.
      lastLinkAtDepth = existing
    } else {
      const newPage = {
        title: page.title,
        url: page.url,
        ...(page.description ? { description: page.description } : {}),
        pages: [],
      }
      parent.pages.push(newPage)
      lastLinkAtDepth = newPage
    }
  }

  return { title: null, categories: categories.filter((c) => c.pages.length > 0) }
}
1265
+
1266
/**
 * Partition orphans into pages under an API-Reference-style URL prefix
 * (`/api-reference/*`, `/api/*`, `/reference/*`) and everything else.
 *
 * API pages are collapsed into one `{ title: "API Reference", pages: [] }`
 * category so `routeCategory()` sends them to ReadMe's `reference/` top-level
 * dir as a tab of their own, matching how Mintlify/Stripe/etc. structure
 * these docs. Any pre-existing "API Reference"-titled category on `scraped`
 * (including variants carrying zero-width prefixes from stray DOM subtrees)
 * is folded in and removed from `scraped.categories`.
 *
 * @returns {{ category: object|null, nonApiOrphans: object[], mergedScrapedTitles: string[] }}
 */
function collectApiReferencePages(orphans, scraped) {
  const API_PREFIX_RE = /^\/(api[-_]?reference|api|reference)(\/|$)/i
  const cleanTitle = (t) => (t || '').replace(/[\u200B-\u200F\uFEFF]/g, '').trim()
  const isApiCategoryTitle = (t) => /^(api[ -]?reference|reference)$/i.test(cleanTitle(t))
  const isApiUrl = (url) => {
    try {
      return API_PREFIX_RE.test(new URL(url).pathname)
    } catch {
      return false
    }
  }

  const seenUrls = new Set()
  const apiPages = []
  const push = (p) => {
    if (!p || !p.url) return
    // Non-page assets (e.g. /api-reference/openapi.json) sometimes appear in
    // llms.txt next to real pages — never import those as docs.
    if (/\.(json|yaml|yml)$/i.test(p.url)) return
    if (seenUrls.has(p.url)) return
    seenUrls.add(p.url)
    apiPages.push(p)
  }

  // Absorb any scraped category that already looks like API Reference — even
  // (especially) a partial, DOM-polluted one — and drop it from `scraped`.
  const mergedScrapedTitles = []
  scraped.categories = scraped.categories.filter((cat) => {
    if (!isApiCategoryTitle(cat.title)) return true
    mergedScrapedTitles.push(cat.title)
    for (const p of cat.pages || []) push(p)
    return false
  })

  const nonApiOrphans = orphans.filter((p) => {
    if (!isApiUrl(p.url)) return true
    push(p)
    return false
  })

  if (apiPages.length === 0) {
    return { category: null, nonApiOrphans, mergedScrapedTitles }
  }

  // Nest by the resource segment after the API prefix, e.g.
  //   /api-reference/analytics/get-interaction → group "analytics"
  //   /api-reference/users/list-users          → group "users"
  //   /api-reference/overview                  → top-level (no resource)
  // First-encountered order is preserved for groups and pages alike so the
  // final sidebar mirrors input order.
  const groupOrder = []
  const groupPages = new Map()
  const topLevel = []
  for (const p of apiPages) {
    let segs = []
    try {
      segs = new URL(p.url).pathname.split('/').filter(Boolean)
    } catch {}
    // segs[0] is the api prefix itself; segs[1] (if present) is the resource.
    const resource = segs.length >= 3 ? segs[1] : null
    if (resource === null) {
      topLevel.push(p)
    } else {
      if (!groupPages.has(resource)) {
        groupPages.set(resource, [])
        groupOrder.push(resource)
      }
      groupPages.get(resource).push(p)
    }
  }

  const titleize = (slug) =>
    String(slug)
      .split(/[-_]/)
      .filter(Boolean)
      .map((word) => word.charAt(0).toUpperCase() + word.slice(1))
      .join(' ')

  const pages = [...topLevel]
  for (const resource of groupOrder) {
    pages.push({ title: titleize(resource), url: null, pages: groupPages.get(resource) })
  }

  return {
    category: { title: 'API Reference', pages },
    nonApiOrphans,
    mergedScrapedTitles,
  }
}
1377
+
1378
/**
 * Group orphans by the "type" URL path segment (e.g. `/main/docs/about` →
 * "docs", `/main/reference/xyz` → "reference") so API reference, changelog,
 * and general docs land in their own top-level categories instead of one big
 * "Other". Orphans merge into same-titled scraped categories (mutated in
 * place) when one exists; only genuinely new buckets are returned, sorted
 * largest-first. Falls back to "Other" when the path has no usable type.
 */
function bucketOrphansByPathType(orphans, scraped) {
  const TYPE_TITLES = {
    reference: 'API Reference',
    api: 'API Reference',
    'api-reference': 'API Reference',
    api_reference: 'API Reference',
    endpoints: 'API Reference',
    endpoint: 'API Reference',
    changelog: 'Changelog',
    release: 'Release Notes',
    releases: 'Release Notes',
    'release-notes': 'Release Notes',
    recipes: 'Recipes',
    recipe: 'Recipes',
    guides: 'Guides',
    docs: 'Other Docs',
    doc: 'Other Docs',
  }
  // Strong top-level type segments. If ANY path segment matches, that's the
  // bucket type — `/api-reference/prompts/archive-prompt` belongs in
  // "API Reference", not in a sub-bucket called "Prompts".
  const STRONG_TYPE = /^(api[-_]?reference|endpoints?|changelog|release[-_]?notes?|releases)$/i
  // Version/locale-looking segments are never category types; walked over
  // from the end until a real type segment is found.
  const VERSION_LOCALE = /^(v?\d+(\.\d+)*|main|master|latest|stable|next|current|ent|enterprise|en|en-us|en_us|fr|de|es|ja|zh|ko|pt)$/i
  // Normalize a title for merge-matching: strip invisibles, trim, lowercase.
  const normTitle = (t) => String(t || '').replace(INVISIBLE_CHARS, '').trim().toLowerCase()

  // Existing scraped categories indexed by normalized title, so orphans merge
  // into them instead of spawning a parallel "(orphans)" bucket that routes
  // to the wrong top-level directory.
  const byNormTitle = new Map(scraped.categories.map((cat) => [normTitle(cat.title), cat]))
  // Buckets created here, so later orphans pile onto the same one.
  const newBuckets = new Map()

  for (const orphan of orphans) {
    let parsed
    try {
      parsed = new URL(orphan.url)
    } catch {
      continue
    }
    // OAS spec endpoints (`/api-reference/openapi.json` etc.) aren't
    // documentation pages and shouldn't become stubs.
    if (/\.(json|ya?ml)$/i.test(parsed.pathname)) continue

    const segs = parsed.pathname.split('/').filter(Boolean)

    // A strong top-level type anywhere in the path wins outright.
    let type = segs.find((seg) => STRONG_TYPE.test(seg))?.toLowerCase() ?? null
    // Fallback: walk backwards from the segment before the slug, skipping
    // version/locale prefixes.
    if (!type) {
      for (let i = segs.length - 2; i >= 0; i--) {
        if (!VERSION_LOCALE.test(segs[i])) {
          type = segs[i].toLowerCase()
          break
        }
      }
    }

    const rawTitle = type ? TYPE_TITLES[type] || titleCase(type) : 'Other'
    const key = normTitle(rawTitle)

    // newBuckets entries are mirrored into byNormTitle, so one lookup covers
    // both pre-existing categories and buckets created earlier in this loop.
    let bucket = byNormTitle.get(key)
    if (!bucket) {
      bucket = { title: rawTitle, pages: [] }
      newBuckets.set(key, bucket)
      byNormTitle.set(key, bucket)
    }

    const entry = { title: orphan.title, url: orphan.url }
    if (orphan.description) entry.description = orphan.description
    bucket.pages.push(entry)
  }

  // Scraped categories were mutated in place; only genuinely new buckets are
  // returned for the caller to append, biggest first.
  return [...newBuckets.values()].sort((a, b) => b.pages.length - a.pages.length)
}
1482
+
1483
/**
 * Convert a slug like "release-notes" or "api_reference" into a display
 * title: runs of dashes/underscores become a single space, and the first
 * letter of each word is uppercased.
 */
function titleCase(s) {
  const spaced = String(s).replace(/[-_]+/g, ' ')
  return spaced.replace(/\b\w/g, (ch) => ch.toUpperCase())
}
1488
+
1489
/**
 * Walk all scraped pages (including nested sub-pages) and move any whose URL
 * contains a strong reference segment (`/api-reference/`, `/endpoints/`, etc.)
 * into a single "API Reference" category. Some docs sites (e.g. greenflash.ai)
 * spotlight a handful of endpoints under "Developers" in the sidebar while the
 * bulk of endpoints live under a separate API Reference section — we favor
 * the URL-path signal over the sidebar placement so all reference pages land
 * together in `reference/` after staging.
 *
 * Returns the number of pages relocated (0 when nothing moved).
 */
function reclassifyReferencePages(scraped) {
  const REFERENCE_SEGMENT = /^(api[-_]?reference|endpoints?)$/i
  const normTitle = (t) =>
    String(t || '')
      .replace(INVISIBLE_CHARS, '')
      .trim()
      .toLowerCase()

  const looksLikeRefUrl = (url) => {
    try {
      const segs = new URL(url).pathname.split('/').filter(Boolean)
      return segs.some((s) => REFERENCE_SEGMENT.test(s))
    } catch {
      return false
    }
  }

  // Find (or create) the canonical API Reference category. Prefer an existing
  // one with a reference-shaped title so we don't end up with duplicates.
  let refCat = scraped.categories.find((c) => /^(api[ -]?reference|reference|api|endpoints?)$/i.test(normTitle(c.title).replace(/\s+/g, ' ')))

  const collected = []
  const filterPages = (pages) => {
    const kept = []
    for (const p of pages || []) {
      if (looksLikeRefUrl(p.url)) {
        // Flatten sub-pages when relocating — API Reference is a flat list.
        collectFlat(p, collected)
        continue
      }
      if (p.pages && p.pages.length > 0) p.pages = filterPages(p.pages)
      kept.push(p)
    }
    return kept
  }

  // Never pull pages out of the reference category itself.
  for (const cat of scraped.categories) {
    if (cat === refCat) continue
    cat.pages = filterPages(cat.pages)
  }

  if (collected.length === 0) return 0

  if (!refCat) {
    refCat = { title: 'API Reference', pages: [] }
    scraped.categories.push(refCat)
  }
  // Dedupe against anything already in the reference category.
  const seen = new Set(refCat.pages.map((p) => normalizePath(p.url)))
  for (const p of collected) {
    const key = normalizePath(p.url)
    if (seen.has(key)) continue
    seen.add(key)
    refCat.pages.push(p)
  }

  // Drop now-empty categories (other than the reference one we may have just created).
  scraped.categories = scraped.categories.filter((c) => c === refCat || (c.pages && c.pages.length > 0))

  // BUG FIX: the old `existedBefore ? collected.length : collected.length`
  // ternary had identical branches (and its comment promised a 0 the code
  // never produced) — just return the relocation count directly.
  return collected.length
}
1564
+
1565
/**
 * Depth-first flatten of a page subtree into `out`, keeping only the fields
 * that matter for a flat listing (title, url, optional description) and
 * discarding the nesting itself.
 */
function collectFlat(page, out) {
  const entry = { title: page.title, url: page.url }
  if (page.description) entry.description = page.description
  out.push(entry)
  for (const child of page.pages || []) collectFlat(child, out)
}
1575
+
1576
/**
 * The scraped nav only contains top-level items; subcategory pages sit behind
 * `>` chevrons and don't render on a cold fetch. For each known page not
 * already present in the scrape, find the scraped page whose URL path is the
 * longest ancestor of it and slot the orphan into that page's category.
 *
 * @returns pages that still have no ancestor match.
 */
function slotOrphansByPath(scraped, knownPages) {
  const matched = new Set()
  const pathToCategory = new Map() // normalizedPath → category
  for (const cat of scraped.categories) {
    for (const page of cat.pages) {
      const norm = normalizePath(page.url)
      matched.add(norm)
      pathToCategory.set(norm, cat)
    }
  }

  const orphans = []
  for (const candidate of knownPages) {
    const norm = normalizePath(candidate.url)
    if (matched.has(norm)) continue

    // Longest-prefix ancestor wins.
    let winner = null
    let winnerLen = -1
    for (const [navPath, cat] of pathToCategory) {
      const isAncestor = navPath && (norm === navPath || norm.startsWith(navPath + '/'))
      if (isAncestor && navPath.length > winnerLen) {
        winner = cat
        winnerLen = navPath.length
      }
    }

    if (!winner) {
      orphans.push(candidate)
      continue
    }
    const entry = { title: candidate.title, url: candidate.url }
    if (candidate.description) entry.description = candidate.description
    winner.pages.push(entry)
    matched.add(norm)
  }
  return orphans
}
1623
+
1624
/**
 * (This comment documents `flattenTree`, defined below after
 * `filterDedupePages`.)
 * Flatten a tree of categories → pages → sub-pages into a linear list of URLs
 * (depth-first, in-order). Used to enumerate every URL we want to re-visit
 * during round 1 of scraping.
 */
1629
/**
 * Walk a tree of pages+sub-pages and drop any whose URL is already in `seen`.
 * First occurrence wins — used by the whole-body fallback to keep the
 * sidebar's first appearance of each page and discard duplicates that show up
 * again under landing-page headings.
 */
function filterDedupePages(pages, seen) {
  const kept = []
  for (const page of pages) {
    const norm = normalizePath(page.url)
    if (seen.has(norm)) continue
    seen.add(norm)
    kept.push({
      ...page,
      pages: page.pages ? filterDedupePages(page.pages, seen) : [],
    })
  }
  return kept
}
1646
+
1647
/**
 * Flatten a tree of categories → pages → sub-pages into a linear list of
 * browsable URLs (depth-first, in-order). Used to enumerate every URL we
 * want to re-visit during round 1 of scraping.
 */
function flattenTree(categories) {
  const urls = []
  const walk = (pages) => {
    for (const page of pages || []) {
      urls.push(toBrowsableUrl(page.url))
      if (page.pages?.length) walk(page.pages)
    }
  }
  for (const category of categories || []) walk(category.pages)
  return urls
}
1658
+
1659
/**
 * llms.txt often lists URLs with a `.md`/`.mdx` extension — those are raw
 * markdown endpoints, not the rendered HTML page carrying the sidebar. Strip
 * the extension so we fetch the human-facing page instead. Unparseable input
 * is returned untouched.
 */
function toBrowsableUrl(url) {
  let parsed
  try {
    parsed = new URL(url)
  } catch {
    return url
  }
  parsed.pathname = parsed.pathname.replace(/\.(md|mdx)$/i, '')
  return parsed.toString()
}
1673
+
1674
/**
 * Native-fetch HTML loader. Resolves to the body string, or '' on any
 * network/HTTP failure (best-effort by design — callers treat '' as a miss).
 */
async function fetchHtmlDirect(url) {
  try {
    const response = await fetch(url, {
      redirect: 'follow',
      headers: { 'User-Agent': 'readme-cli-import' },
    })
    return response.ok ? await response.text() : ''
  } catch {
    return ''
  }
}
1689
+
1690
/**
 * Firecrawl-backed HTML loader. Firecrawl drives a real browser, waits for
 * hydration, and returns the rendered DOM — which is what we need for sites
 * that render their sidebar nav client-side (zod.dev, most Next.js docs).
 *
 * Returns an async (url) → html-string function with the same contract as
 * fetchHtmlDirect, so scrapeNavFromSite doesn't care which backend is in use.
 */
function makeFirecrawlFetcher(apiKey) {
  return async function fetchHtmlViaFirecrawl(url) {
    try {
      const payload = {
        url,
        formats: ['rawHtml'],
        // Give client-side frameworks a moment to hydrate the sidebar.
        waitFor: 2000,
        // Skip common ad/tracking domains so we don't burn time on them.
        blockAds: true,
      }
      const res = await fetch('https://api.firecrawl.dev/v1/scrape', {
        method: 'POST',
        headers: {
          Authorization: `Bearer ${apiKey}`,
          'Content-Type': 'application/json',
        },
        body: JSON.stringify(payload),
      })
      if (!res.ok) {
        styles.warning(`Firecrawl HTTP ${res.status} for ${url}`)
        return ''
      }
      const body = await res.json()
      if (!body.success) {
        styles.warning(`Firecrawl error for ${url}: ${body.error || 'unknown'}`)
        return ''
      }
      return body.data?.rawHtml || body.data?.html || ''
    } catch (e) {
      styles.warning(`Firecrawl fetch failed for ${url}: ${e.message}`)
      return ''
    }
  }
}
1732
+
1733
/**
 * Run `visit(url)` across `urls` with at most `concurrency` requests in
 * flight at once. Completion order doesn't matter — visit() merges its
 * results into shared state.
 */
async function visitAllInParallel(urls, visit, concurrency) {
  let next = 0
  const worker = async () => {
    // `next++` is read-and-bumped synchronously between awaits, so two
    // workers never claim the same index.
    while (next < urls.length) {
      const url = urls[next++]
      await visit(url)
    }
  }
  const workerCount = Math.min(concurrency, urls.length)
  await Promise.all(Array.from({ length: workerCount }, worker))
}
1747
+
1748
/**
 * Namespace = origin + first meaningful path segment (skipping version/locale
 * prefixes like /main, /en, /v1). Pages in different namespaces live under
 * different sidebars on typical multi-product docs sites, so cross-namespace
 * follow-up fetches don't help coverage. Returns '' for unparseable URLs.
 */
function urlNamespace(url) {
  const NS_SKIP = /^(v?\d+(\.\d+)*|main|master|latest|stable|next|current|ent|enterprise|en|en-us|en_us|fr|de|es|ja|zh|ko|pt)$/i
  let parsed
  try {
    parsed = new URL(url)
  } catch {
    return ''
  }
  const segments = parsed.pathname.split('/').filter(Boolean)
  const meaningful = segments.find((seg) => !NS_SKIP.test(seg))
  const ns = meaningful ? meaningful.toLowerCase() : ''
  return `${parsed.origin}/${ns}`
}
1771
+
1772
/**
 * Re-cluster a flat page list by URL path structure. When the sidebar scrape
 * found links but no <h*>/<p> headers to split on, everything ends up in one
 * big bucket. URLs often encode the site's real hierarchy, though — if many
 * pages share `/foo/bar/<slug>.html` and a few others share `/foo/baz/<slug>`,
 * "bar" and "baz" are almost certainly section names.
 *
 * Algorithm:
 *   1. Find the longest path prefix ALL pages share (the "base").
 *   2. The segment immediately after the base is the category key.
 *   3. Group pages by that key; the key (title-cased) names the category.
 *   4. Accept only if that yields >=2 groups with at least one multi-page
 *      group — otherwise every page is its own category and we'd just be
 *      renaming "Overview", so return null and keep the flat shape.
 *
 * Single-page groups are parent pages, not categories, and are pooled into a
 * shared "Documentation" bucket with their sub-trees intact.
 */
function clusterByUrlPath(pages) {
  if (!pages || pages.length < 3) return null

  const segLists = pages.map((p) => {
    try {
      return new URL(p.url).pathname.split('/').filter(Boolean)
    } catch {
      return []
    }
  })
  if (segLists.some((segs) => segs.length === 0)) return null

  // Depth of the longest path prefix shared by ALL pages.
  let depth = 0
  while (depth < segLists[0].length && segLists.every((segs) => segs[depth] === segLists[0][depth])) {
    depth++
  }

  // Group by the segment right after the common base, preserving first-seen
  // order. Pages sitting AT the base have no segment there and are skipped
  // (they'd become "index"-like categories of their own).
  const byKey = new Map()
  const firstSeen = new Map()
  segLists.forEach((segs, i) => {
    const key = segs[depth]
    if (!key) return
    if (!byKey.has(key)) {
      byKey.set(key, [])
      firstSeen.set(key, i)
    }
    byKey.get(key).push(pages[i])
  })

  // Weak clustering → bail: need >=2 groups and >=1 multi-page group.
  if (byKey.size < 2) return null

  const groups = [...byKey.keys()]
    .sort((a, b) => firstSeen.get(a) - firstSeen.get(b))
    .map((key) => ({ title: titleCase(key), pages: byKey.get(key) }))

  const multipage = groups.filter((g) => g.pages.length >= 2)
  if (multipage.length === 0) return null

  // A group with exactly one top-level page is a parent page wrapped in a
  // pseudo-category label — pool those into "Documentation" so they stay
  // top-level siblings.
  const singletons = groups.filter((g) => g.pages.length === 1).flatMap((g) => g.pages)

  const clustered = []
  if (singletons.length > 0) clustered.push({ title: 'Documentation', pages: singletons })
  clustered.push(...multipage)
  return clustered
}
1859
+
1860
/**
 * Used by discovery-mode scraping (no llms.txt) to decide whether a nav link
 * is worth importing as a doc page: same-origin only, no asset file types,
 * no build-artifact paths, and not the bare site root.
 */
function isDiscoverableLink(abs, base) {
  let target
  try {
    target = new URL(abs)
  } catch {
    return false
  }
  if (target.origin !== base.origin) return false

  const path = target.pathname.toLowerCase()
  if (!path || path === '/') return false

  const ASSET_EXT = /\.(png|jpe?g|gif|svg|webp|ico|css|js|pdf|zip|tar|gz|woff2?|ttf|mp4|mp3)$/i
  if (ASSET_EXT.test(path)) return false

  const isBuildArtifact =
    path.startsWith('/_next/') || path.startsWith('/__/') || path.includes('/static/') || path.includes('/assets/')
  return !isBuildArtifact
}
1879
+
1880
// Zero-width spaces, direction marks, word joiner, BOM — some docs sites
// inject these into sidebar headings (docs.greenflash.ai prefixes
// "API Reference" with U+200B), which otherwise defeats downstream
// title-matching regexes like routeCategory's.
const INVISIBLE_CHARS = /[\u200B-\u200F\u202A-\u202E\u2060\uFEFF]/g

/**
 * Strip HTML tags from a fragment, decode common entities, and remove
 * invisible/zero-width characters. Output feeds titles and headings.
 */
function stripTags(s) {
  const noTags = String(s).replace(/<[^>]+>/g, '')
  return decodeEntities(noTags).replace(INVISIBLE_CHARS, '')
}
1888
+
1889
/**
 * Decode the common HTML entities that show up inside <a>/<p>/<h*> tag text.
 * The sidebar/nav scrapers feed their output straight into frontmatter titles
 * and on-page headings, so leaving `&amp;` raw produces titles like
 * "New Features &amp; Upgrade Changes". Covers the named entities we see in
 * practice plus numeric (`&#8212;`) and hex (`&#x2014;`) forms.
 *
 * BUG FIX: decoding now happens in a single regex pass. The old chain of
 * sequential .replace() calls decoded `&amp;` before `&lt;`/`&gt;`/…, so
 * already-escaped text was double-decoded: `&amp;lt;` became `&lt;` and then
 * `<`. A single pass correctly yields the literal text `&lt;`.
 * Unknown entities are left untouched, as before.
 */
function decodeEntities(s) {
  if (!s || s.indexOf('&') === -1) return s
  const NAMED = {
    amp: '&',
    lt: '<',
    gt: '>',
    quot: '"',
    apos: "'",
    nbsp: ' ',
    mdash: '—',
    ndash: '–',
    hellip: '…',
    rsquo: '’',
    lsquo: '‘',
    ldquo: '“',
    rdquo: '”',
  }
  return s.replace(/&(?:#x([0-9a-f]+)|#(\d+)|([a-z]+));/gi, (match, hex, dec, name) => {
    if (hex || dec) {
      const code = hex ? parseInt(hex, 16) : parseInt(dec, 10)
      // Range guard — String.fromCodePoint throws above U+10FFFF.
      return Number.isFinite(code) && code <= 0x10ffff ? String.fromCodePoint(code) : match
    }
    const key = name.toLowerCase()
    // Object.hasOwn guards against prototype keys like "constructor".
    return Object.hasOwn(NAMED, key) ? NAMED[key] : match
  })
}
1918
+
1919
/**
 * Reduce a URL to a comparable pathname: lowercase, trailing slash stripped,
 * and common doc suffixes (.md, .mdx, .html/.htm) removed so `/Foo/bar.md`
 * and `/foo/bar` compare equal. Unparseable input is lowercased whole.
 */
function normalizePath(url) {
  let parsed
  try {
    parsed = new URL(url)
  } catch {
    return String(url).toLowerCase()
  }
  const path = parsed.pathname.replace(/\/$/, '').toLowerCase()
  return path.replace(/\.(md|mdx|html?)$/i, '')
}
1933
+
1934
/**
 * Given scraped categories plus orphan pages that matched neither the nav nor
 * any path ancestor, ask Claude to slot each orphan into an existing category
 * by index. The output is a compact array of indices (-1 for "no fit") so
 * token count stays low. Mutates `scraped.categories[*].pages` in place.
 *
 * @returns the orphans Claude couldn't slot.
 */
async function slotOrphansWithClaude(scraped, orphans, model) {
  const { systemPrompt, userPrompt } = slotOrphansPrompt({
    categories: scraped.categories,
    orphans,
  })
  const indices = await runJsonQuery({ systemPrompt, userPrompt, model })
  // Model didn't cooperate → leave every orphan unassigned.
  if (!Array.isArray(indices)) return orphans

  const unslotted = []
  orphans.forEach((orphan, i) => {
    const idx = Number.isInteger(indices[i]) ? indices[i] : -1
    if (idx < 0 || idx >= scraped.categories.length) {
      unslotted.push(orphan)
      return
    }
    const entry = { title: orphan.title, url: orphan.url }
    if (orphan.description) entry.description = orphan.description
    scraped.categories[idx].pages.push(entry)
  })
  return unslotted
}
1967
+
1968
/**
 * We already have the category structure from the scraped nav; we just need
 * one FontAwesome icon per category. Tiny Claude call, fast. Any category the
 * model skipped falls back to the 'folder' icon.
 */
async function iconizeScrapedNav(scraped, _unused, model, siteTitle) {
  const { systemPrompt, userPrompt } = iconizeNavPrompt({
    categories: scraped.categories,
  })
  const raw = await runJsonQuery({ systemPrompt, userPrompt, model })
  const icons = Array.isArray(raw) ? raw : []
  const categories = scraped.categories.map((cat, i) => ({
    title: cat.title,
    icon: icons[i] || 'folder',
    pages: cat.pages,
  }))
  return { title: siteTitle || null, categories }
}
1987
+
1988
/**
 * Sections are "usable" when the llms.txt already did the hard grouping work
 * for us — meaningful titles, not too many/few, each populated. When usable we
 * take a fast path that only asks Claude for icons + title polish instead of
 * re-bucketing every page, which is the slow part of a full reorg.
 */
// Sections named like these are catch-all buckets — even in richly-structured
// llms.txt files (e.g. Stripe's "Docs" section is where pages go that don't
// fit into a named product tab), so always drop them rather than promote the
// grab-bag contents to a top-level sidebar category.
const GENERIC_SECTION_RE =
  /^(resources?|english|root url|pages?|docs?|documentation|content|available languages.*|site|sitemap|index|home|optional|instructions?(\s|:).*|miscellaneous|misc|other)$/i

/**
 * Return the subset of llms.txt sections that carry real structural signal —
 * drop catch-all buckets ("Docs", "Resources", "Optional"), empty sections,
 * and oversized ones (site-dumps masquerading as sections).
 */
function usableSections(sections) {
  if (!sections) return []
  const carriesSignal = (s) =>
    Boolean(s.title) &&
    !GENERIC_SECTION_RE.test(s.title.trim()) &&
    Boolean(s.items) &&
    s.items.length > 0 &&
    s.items.length <= 200
  return sections.filter(carriesSignal)
}

/**
 * Heuristic gate for the fast path: at least three signal-bearing sections,
 * and not an absurd total (>40 sections reads as a site dump).
 */
function sectionsLookUsable(sections) {
  if (!sections || sections.length > 40) return false
  return usableSections(sections).length >= 3
}
2015
+
2016
/**
 * Entry point for turning parsed llms.txt into a category tree. Takes the
 * fast section-preserving path when the llms.txt sections look usable;
 * otherwise falls back to a full from-scratch reorganization.
 */
async function organizeWithClaude(parsed, model) {
  const fastPath = sectionsLookUsable(parsed.sections)
  return fastPath ? organizeFromSections(parsed, model) : organizeFromScratch(parsed, model)
}
2022
+
2023
/**
 * Fast path: llms.txt sections look good, so keep them 1:1 and ask Claude only
 * for a FontAwesome icon (and optional Title-Case cleanup) per section. Output
 * is O(sections), not O(pages), so this is usually ~5-15s vs. a full reorg.
 *
 * @throws when the model doesn't return a JSON array.
 */
async function organizeFromSections(parsed, model) {
  // Drop generic/empty/oversized sections so they don't pollute the sidebar
  // (e.g. Stripe's llms.txt has a "Docs" catch-all and a 0-item "Instructions
  // for Large Language Model Agents" section — neither is structural signal).
  const sections = usableSections(parsed.sections)

  const { systemPrompt, userPrompt } = organizeFromSectionsPrompt({
    siteTitle: parsed.title,
    sections,
  })
  const raw = await runJsonQuery({ systemPrompt, userPrompt, model })
  if (!Array.isArray(raw)) {
    throw new Error('Fast-path expected a JSON array of {title, icon} entries.')
  }

  const categories = sections.map((section, i) => {
    const meta = raw[i] || {}
    const pages = section.items.map((item) => {
      const page = { title: item.text, url: item.url }
      if (item.description) page.description = item.description
      return page
    })
    return {
      title: meta.title || section.title,
      icon: meta.icon || 'folder',
      pages,
    }
  })

  return { title: parsed.title || null, categories }
}
2058
+
2059
/**
 * Slow path: flatten every llms.txt item into an id-indexed list, ask Claude
 * to design categories from scratch, then rehydrate its pageId references
 * back into full page objects.
 *
 * Fix: Claude sometimes returns pageIds as strings ("12" instead of 12).
 * Array indexing tolerates that, but the Set-based dedupe/leftover tracking
 * does not (`usedIds.has(12)` ≠ `usedIds.has("12")`), which caused every page
 * to be duplicated into "Uncategorized". Ids are now coerced to integers
 * before any bookkeeping.
 *
 * @param {{ title: string|null, sections: Array }} parsed - parseLlmsTxt() output.
 * @param {string} [model] - Optional model override for the Claude call.
 * @returns {Promise<{ title: string|null, categories: Array }>}
 */
async function organizeFromScratch(parsed, model) {
  const items = parsed.sections.flatMap((s) =>
    s.items.map((i) => ({
      section: s.title,
      title: i.text,
      url: i.url,
      description: i.description || undefined,
    })),
  )

  const { systemPrompt, userPrompt } = organizeFromScratchPrompt({
    siteTitle: parsed.title,
    items,
  })
  const raw = await runJsonQuery({ systemPrompt, userPrompt, model })

  // Rehydrate pages from the id references Claude returned.
  const expandedCategories = []
  const usedIds = new Set()
  for (const cat of raw.categories || []) {
    const pages = []
    for (const rawId of cat.pageIds || []) {
      const id = Number(rawId) // tolerate "12" as well as 12
      if (!Number.isInteger(id)) continue // ignore malformed ids
      const item = items[id]
      if (!item) continue // ignore out-of-range ids
      if (usedIds.has(id)) continue // ignore dupes
      usedIds.add(id)
      pages.push({
        title: item.title,
        url: item.url,
        ...(item.description ? { description: item.description } : {}),
      })
    }
    expandedCategories.push({ title: cat.title, icon: cat.icon, pages })
  }

  // Safety net: if Claude dropped any ids, park them under a leftover category
  // so we never silently lose pages.
  const missing = items.map((it, idx) => (usedIds.has(idx) ? null : { id: idx, ...it })).filter(Boolean)
  if (missing.length > 0) {
    expandedCategories.push({
      title: 'Uncategorized',
      icon: 'folder',
      pages: missing.map((it) => ({
        title: it.title,
        url: it.url,
        ...(it.description ? { description: it.description } : {}),
      })),
    })
    styles.warning(`Claude missed ${missing.length} page${missing.length === 1 ? '' : 's'} — parked under "Uncategorized".`)
  }

  return { title: raw.title, categories: expandedCategories }
}
2112
+
2113
/**
 * Shared Claude call for "send a prompt, parse JSON back". Logs the prompts so
 * we can see what went in, runs a heartbeat so silent model latency doesn't
 * look like a hang, and strips stray code fences if the model adds them.
 *
 * @param {object} args
 * @param {string} args.systemPrompt - System prompt forwarded to the query SDK.
 * @param {string} args.userPrompt - User prompt; only its first 80 lines are echoed.
 * @param {string} [args.model] - Optional model override forwarded to the SDK.
 * @returns {Promise<any>} The parsed JSON value Claude returned.
 * @throws {Error} When the SDK reports a non-success result subtype, or when
 *   the (fence-stripped) output is not valid JSON.
 */
async function runJsonQuery({ systemPrompt, userPrompt, model }) {
  // Echo both prompts (user prompt truncated to 80 lines) for debuggability.
  console.log()
  console.log(styles.dim('─ system prompt ─'))
  console.log(styles.dim(systemPrompt))
  console.log(styles.dim('─ user prompt (first 80 lines) ─'))
  console.log(styles.dim(userPrompt.split('\n').slice(0, 80).join('\n')))
  const userLineCount = userPrompt.split('\n').length
  if (userLineCount > 80) {
    console.log(styles.dim(`… (${userLineCount - 80} more lines)`))
  }
  console.log(styles.dim('─'.repeat(40)))
  console.log()

  // Print a dim dot every second while waiting so long model latency is
  // visibly "alive"; cleared in the finally below no matter how we exit.
  const heartbeat = setInterval(() => process.stdout.write(styles.dim('.')), 1000)
  let text = ''
  try {
    for await (const message of query({
      prompt: userPrompt,
      options: {
        systemPrompt,
        allowedTools: [], // pure text generation — no tool use
        ...(model ? { model } : {}),
      },
    })) {
      if (message.type === 'assistant' && message.message?.content) {
        // Accumulate every text block; the model may stream several.
        for (const block of message.message.content) {
          if (block.type === 'text' && block.text) text += block.text
        }
      } else if (message.type === 'result') {
        // 'result' terminates the stream; any subtype other than 'success'
        // is surfaced as an error before we try to parse partial output.
        if (message.subtype && message.subtype !== 'success') {
          throw new Error(`Claude failed: ${message.subtype}${message.error?.message ? ' — ' + message.error.message : ''}`)
        }
        break
      }
    }
  } finally {
    clearInterval(heartbeat)
    process.stdout.write('\n')
  }

  // Models occasionally wrap JSON in ``` fences despite instructions.
  const stripped = stripCodeFences(text)

  try {
    return JSON.parse(stripped)
  } catch (e) {
    // Include head + tail of the output: truncated JSON (output-limit hits)
    // is the common failure and the tail shows where it was cut off.
    throw new Error(
      `Claude returned invalid JSON: ${e.message}\n` +
        `Output length: ${stripped.length} chars. Likely hit the model's output limit — try --model sonnet.\n\n` +
        `First 500 chars:\n${stripped.slice(0, 500)}\n\n` +
        `Last 500 chars:\n${stripped.slice(-500)}`,
    )
  }
}
2171
+
2172
/**
 * Produce the ordered list of llms.txt URLs to probe for a given source URL.
 * Starts at the deepest path the user supplied and walks up one segment at a
 * time, ending at the origin root. Each level gets `<path>/llms.txt` appended.
 *
 * For https://mintlify.com/docs/quickstart:
 *   → https://mintlify.com/docs/quickstart/llms.txt
 *   → https://mintlify.com/docs/llms.txt
 *   → https://mintlify.com/llms.txt
 *
 * @param {URL} sourceUrl - Parsed URL the user supplied.
 * @returns {string[]} Deduped candidate URLs in probe order (deepest first).
 */
function buildLlmsCandidates(sourceUrl) {
  const candidates = []
  const seen = new Set()

  const origin = sourceUrl.origin
  const segments = sourceUrl.pathname.split('/').filter(Boolean)

  // Walk from the full path down to the bare origin, one segment at a time.
  for (let depth = segments.length; depth >= 0; depth--) {
    const prefix = segments.slice(0, depth).join('/')
    const candidate = prefix ? `${origin}/${prefix}/llms.txt` : `${origin}/llms.txt`
    if (seen.has(candidate)) continue
    seen.add(candidate)
    candidates.push(candidate)
  }

  return candidates
}
2202
+
2203
/**
 * Best-effort fetch of a site's /llms.txt. Returns { ok, status, error, parsed }
 * where parsed is { title, sections: [{ title, items: [{ text, url, description }] }] }.
 * Never throws — network and parse failures come back as { ok: false }.
 */
async function fetchLlmsTxt(llmsUrl) {
  let res
  try {
    res = await fetch(llmsUrl, {
      redirect: 'follow',
      headers: { 'User-Agent': 'readme-cli-import' },
    })
  } catch (e) {
    return { ok: false, error: e.message }
  }

  if (!res.ok) return { ok: false, status: res.status }

  try {
    const body = await res.text()
    return { ok: true, status: res.status, parsed: parseLlmsTxt(body) }
  } catch (e) {
    return { ok: false, error: e.message }
  }
}
2220
+
2221
/**
 * Parse the llms.txt format. `##` headings become sections;
 * `- [text](url): description` bullets become items. Items before any `##`
 * land in an implicit "Resources" section. The first `#` heading becomes the
 * document title; later `#` headings are ignored.
 *
 * @param {string} body - Raw llms.txt contents.
 * @returns {{ title: string|null, sections: Array<{ title: string, items: Array }> }}
 */
function parseLlmsTxt(body) {
  let title = null
  const sections = []
  let current = null

  const itemRe = /^\s*-\s*\[([^\]]*)\]\((https?:\/\/[^)\s]+)\)(?:\s*[:—–-]\s*(.+))?/

  const pushItem = (match) => {
    if (!current) {
      // Bullets before the first `##` get an implicit section.
      current = { title: 'Resources', items: [] }
      sections.push(current)
    }
    current.items.push({
      text: match[1].trim(),
      url: match[2].replace(/[.,;]+$/, ''), // strip trailing punctuation glued to the URL
      description: match[3] ? match[3].trim() : null,
    })
  }

  for (const line of body.split(/\r?\n/)) {
    const h1 = line.match(/^#\s+(.+)$/)
    if (h1) {
      if (!title) title = h1[1].trim()
      continue
    }

    const h2 = line.match(/^##\s+(.+)$/)
    if (h2) {
      current = { title: h2[1].trim(), items: [] }
      sections.push(current)
      continue
    }

    const item = line.match(itemRe)
    if (item) pushItem(item)
  }

  return { title, sections }
}
2264
+
2265
/**
 * NOTE: this comment documents stageOrganized(), which is defined below
 * printPagesTree — it was left stranded here when printPagesTree was inserted.
 *
 * Write the organized hierarchy to disk as git-format markdown stubs — just
 * frontmatter, no body yet. docs/ pages go under docs/<Category>/<slug>.md;
 * reference/recipes/custom_pages/custom_blocks get their own top-level dir
 * without a category subfolder (the git-format schema doesn't nest them).
 * Writes _order.yaml per directory so sidebar order matches input order.
 */
2272
/**
 * Recursively print the page tree. Sub-pages are indented under their parent
 * with no leading bullet character, to show them as children of the parent.
 *
 * @param {Array} pages - Page nodes ({ title, url?, description?, pages? }).
 * @param {number} indentLevel - Current nesting depth (one space per level).
 */
function printPagesTree(pages, indentLevel) {
  const pad = ' '.repeat(indentLevel)
  for (const page of pages) {
    const parts = [`${pad}${styles.dim('·')} ${page.title}`]
    if (page.url) parts.push(` ${styles.dim(page.url)}`)
    if (page.description) parts.push(` ${styles.dim('— ' + page.description)}`)
    console.log(parts.join(''))

    const children = page.pages
    if (children && children.length > 0) printPagesTree(children, indentLevel + 1)
  }
}
2287
+
2288
/**
 * Write the organized hierarchy ({ title, categories: [{ title, icon, pages }] })
 * into `stagingDir` as git-format markdown stubs (frontmatter only, no body)
 * plus per-directory _order.yaml files so sidebar order matches input order.
 *
 * @param {object} organized - Claude's category/page hierarchy.
 * @param {string} stagingDir - Root directory to write the staging tree into.
 * @param {object} [opts]
 * @param {boolean} [opts.skipApiReference] - Skip categories routed to reference/.
 * @returns {{ fileCount: number, dirCount: number, skippedApiRef: number }}
 */
function stageOrganized(organized, stagingDir, opts = {}) {
  const pickIcon = makeIconPicker()
  const usedSlugs = new Set() // cross-dir: duplicates validator is global
  const byDir = new Map() // dir → slugs written there, in input order
  const subDirsByTopDir = new Map() // topDir → category subfolders, in input order
  const counts = { fileCount: 0, skippedApiRef: 0 }
  const skipApiReference = !!opts.skipApiReference

  /**
   * Write a page (and its descendants) into `dir`. A page with children gets
   * its own subfolder named after its slug; the parent page lives at
   * `<dir>/<slug>.md` while children live at `<dir>/<slug>/<childSlug>.md`.
   * This matches git-format's on-disk convention for nested sidebars.
   */
  function writePage(page, dir, topDir, isSubPage = false) {
    // Slug is claimed even for group-only nodes: it names their subfolder.
    const slug = resolveSlug(deriveSlug(page.url, page.title), usedSlugs)
    usedSlugs.add(slug)

    // Group-only nodes (e.g. a resource sub-group within API Reference) have
    // no backing page on the source site — they're pure sidebar containers.
    // Skip the stub write but still recurse so their children land in the
    // right subdirectory.
    const isGroupOnly = !page.url

    if (!isGroupOnly) {
      const relFilePath = `${dir}/${slug}.md`
      // Sub-pages don't get icons per design decision.
      const frontmatter = buildFrontmatter(topDir, page, slug, pickIcon, { skipIcon: isSubPage })
      // x-import points at the source URL for this stub. The content-import
      // step reads it to fetch the page body. x-prefixed custom field is the
      // git-format convention for metadata the schema doesn't know about.
      frontmatter['x-import'] = toBrowsableUrl(page.url)
      // hide pages that need import
      frontmatter.hidden = true

      const absPath = path.join(stagingDir, relFilePath)
      fs.mkdirSync(path.dirname(absPath), { recursive: true })
      // Empty body + frontmatter — content arrives in a later import step.
      fs.writeFileSync(absPath, matter.stringify('', frontmatter))
      counts.fileCount++

      if (!byDir.has(dir)) byDir.set(dir, [])
      byDir.get(dir).push(slug)
    }

    const children = page.pages || []
    if (children.length > 0) {
      const subDir = `${dir}/${slug}`
      for (const child of children) writePage(child, subDir, topDir, true)
    }
  }

  for (const cat of organized.categories || []) {
    const { topDir, subDir } = routeCategory(cat.title)
    if (skipApiReference && topDir === 'reference') {
      // Count what we skipped so the caller can report it.
      counts.skippedApiRef += countPagesDeep(cat.pages || [])
      continue
    }
    const dir = subDir ? `${topDir}/${subDir}` : topDir
    if (subDir) {
      if (!subDirsByTopDir.has(topDir)) subDirsByTopDir.set(topDir, [])
      if (!subDirsByTopDir.get(topDir).includes(subDir)) subDirsByTopDir.get(topDir).push(subDir)
    }

    for (const page of cat.pages || []) writePage(page, dir, topDir, false)
  }

  // Per-directory _order.yaml preserves input order in the sidebar.
  for (const [dir, slugs] of byDir) {
    const orderPath = path.join(stagingDir, dir, '_order.yaml')
    const body = slugs.map((s) => `- ${yamlSafeSlug(s)}`).join('\n') + '\n'
    fs.writeFileSync(orderPath, body)
  }

  // Top-level _order.yaml (e.g. docs/_order.yaml) lists category subfolders.
  for (const [topDir, subs] of subDirsByTopDir) {
    const orderPath = path.join(stagingDir, topDir, '_order.yaml')
    const body = subs.map((s) => `- ${yamlSafeSlug(s)}`).join('\n') + '\n'
    fs.writeFileSync(orderPath, body)
  }

  return { fileCount: counts.fileCount, dirCount: byDir.size, skippedApiRef: counts.skippedApiRef }
}
2370
+
2371
/**
 * Count real pages (those with a backing URL) in a page tree, recursing into
 * nested `pages` arrays. Group-only container nodes (no url) are not counted.
 */
function countPagesDeep(pages) {
  if (!pages) return 0
  return pages.reduce((total, page) => {
    const self = page.url ? 1 : 0
    const nested = page.pages && page.pages.length ? countPagesDeep(page.pages) : 0
    return total + self + nested
  }, 0)
}
2379
+
2380
/**
 * Map a category title to the git-format top-level directory + optional
 * category subdir. docs/ is the only top dir that takes a subfolder.
 */
function routeCategory(title) {
  const t = (title || '').trim()

  // Special top-level dirs, matched by well-known category names.
  const topLevelRoutes = [
    ['reference', /^(api[ -]?reference|reference|api|endpoints?)$/i],
    ['recipes', /^(recipes?|cookbook|tutorials?|how[ -]?tos?)$/i],
    ['custom_pages', /^(custom[ -]?pages?|landing( page)?s?)$/i],
    ['custom_blocks', /^(custom[ -]?blocks?|snippets?|reusable( content)?)$/i],
  ]
  for (const [topDir, re] of topLevelRoutes) {
    if (re.test(t)) return { topDir, subDir: null }
  }

  // Everything else becomes a docs/ category subfolder.
  return { topDir: 'docs', subDir: t || 'Documentation' }
}
2392
+
2393
/**
 * Build the frontmatter object for one staged page stub.
 *
 * @param {string} topDir - git-format top-level dir ('docs', 'reference', …).
 * @param {object} page - Page node; `title`/`description` are read.
 * @param {string} slug - Resolved slug (title-cased as a fallback title).
 * @param {Function} pickIcon - (slug, title) → icon name or falsy.
 * @param {object} [opts]
 * @param {boolean} [opts.skipIcon] - Sub-pages skip icons by design.
 * @returns {object} Frontmatter ready for matter.stringify.
 */
function buildFrontmatter(topDir, page, slug, pickIcon, opts = {}) {
  const fm = {}
  const title = (page.title || titleCase(slug)).trim()

  // custom_blocks uses `name` where every other dir uses `title`.
  if (topDir === 'custom_blocks') fm.name = title
  else fm.title = title

  const excerpt = page.description ? page.description.trim() : ''
  if (excerpt) fm.excerpt = excerpt

  // Sub-pages skip icons by design — the parent carries the nav icon, and
  // children render without one.
  if (opts.skipIcon) return fm

  const icon = pickIcon(slug, title)
  if (topDir === 'recipes') {
    // Recipes use `recipe.icon` instead of a top-level `icon` (per git-format schema).
    fm.recipe = { color: '#018ef5', icon: icon || 'book-open' }
  } else if (topDir === 'docs' || topDir === 'reference') {
    fm.icon = formatIconClass(icon)
  }

  return fm
}
2421
+
2422
/**
 * Turn a URL's trailing segment into a filename-safe slug. Strips `.md`, kebabs
 * the result, drops any leading numeric prefix (common in imports).
 */
function deriveSlug(url, fallbackTitle) {
  let segment = ''
  try {
    const parts = new URL(url).pathname.split('/').filter(Boolean)
    if (parts.length > 0) segment = parts[parts.length - 1]
  } catch {
    // Not a parseable URL (e.g. group-only nodes) — fall through to the title.
  }
  segment = segment.replace(/\.(md|mdx|html?)$/i, '').replace(/^\d+[-_.]/, '')
  return kebabCase(segment || fallbackTitle || 'page') || 'page'
}
2436
+
2437
/**
 * If `slug` is already in use anywhere in the staging tree, try `slug-2`,
 * `slug-3`, etc. The duplicates validator flags same-slug collisions across
 * directories, not just within a directory, so uniqueness must be global.
 * Does not mutate `usedSlugs` — the caller records the winner.
 */
function resolveSlug(slug, usedSlugs) {
  if (!usedSlugs.has(slug)) return slug
  for (let n = 2; ; n++) {
    const candidate = `${slug}-${n}`
    if (!usedSlugs.has(candidate)) return candidate
  }
}
2448
+
2449
// Values YAML interprets as non-strings need quoting when used as _order entries.
const YAML_UNSAFE = /^(?:\d+\.?\d*|true|false|yes|no|on|off|null|~)$/i

/**
 * Quote a slug for use as a bare YAML list entry when YAML would otherwise
 * parse it as a number/boolean/null; pass everything else through untouched.
 */
function yamlSafeSlug(slug) {
  if (YAML_UNSAFE.test(slug)) return `"${slug}"`
  return slug
}
2454
+
2455
/**
 * Lowercase-kebab a string: camelCase boundaries become hyphens, any run of
 * non-alphanumerics collapses to one hyphen, edge hyphens are trimmed.
 * Returns 'page' when nothing survives.
 */
function kebabCase(s) {
  const kebabbed = String(s)
    .replace(/([a-z0-9])([A-Z])/g, '$1-$2') // split camelCase boundaries
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '')
  return kebabbed === '' ? 'page' : kebabbed
}
2464
+
2465
// Keyword → FontAwesome icon. Ported from v1 (most-specific entries first).
// Each rule maps to an ordered list of candidates; makeIconPicker() scans the
// rules top-to-bottom against "<slug> <title>" lowercased and takes the first
// candidate not yet used globally by that picker instance, so sibling pages
// don't all share the same icon.
const ICON_RULES = [
  [/\b(getting[- ]?started|quick[- ]?start|intro|introduction|welcome|overview|start)\b/, ['rocket', 'door-open', 'flag', 'star']],
  [/\b(api[- ]?keys?|token|secrets?|credentials?|scopes?)\b/, ['key', 'key-skeleton', 'fingerprint']],
  [/\b(auth|authn|authentication|sign[- ]?in|login|oauth|sso|identity|saml|oidc)\b/, ['lock', 'shield-halved', 'id-badge']],
  [/\b(permissions?|roles?|access|authz|authorization|rbac|acl)\b/, ['user-lock', 'user-shield', 'user-tag']],
  [/\b(users?|accounts?|profiles?|members?|people)\b/, ['user', 'user-gear', 'id-card', 'circle-user']],
  [/\b(groups?|org(anizations?)?|teams?|workspaces?)\b/, ['users', 'people-group', 'user-group']],
  [/\b(sync|syncing|mirror|pipeline|webhooks?)\b/, ['arrows-rotate', 'shuffle', 'bell']],
  [/\b(projects?|apps?|applications?)\b/, ['folder', 'folder-open', 'folder-tree']],
  [/\b(errors?|troubleshoot|debug(ging)?|issues?)\b/, ['triangle-exclamation', 'bug', 'circle-xmark']],
  [/\b(rate[- ]?limits?|throttl|quota|limits?)\b/, ['gauge', 'gauge-high']],
  [/\b(pagination|cursor|paginate)\b/, ['list', 'list-ol', 'ellipsis']],
  [/\b(versioning|versions?|changelog|releases?|release[- ]?notes?)\b/, ['code-branch', 'code-fork', 'timeline']],
  [/\b(sdks?|libraries|clients?|packages?)\b/, ['cube', 'cubes', 'boxes-stacked']],
  [/\b(cli|command[- ]?line|terminal|shell)\b/, ['terminal', 'square-terminal']],
  [/\b(billing|invoices?|subscriptions?|plans?)\b/, ['credit-card', 'file-invoice-dollar']],
  [/\b(security|compliance|privacy|gdpr|soc|hipaa|encryption)\b/, ['shield', 'shield-check', 'lock-keyhole']],
  [/\b(search|query|filters?|lookup)\b/, ['magnifying-glass', 'filter']],
  [/\b(uploads?|files?|storage|assets?|media)\b/, ['cloud-arrow-up', 'file-arrow-up']],
  [/\b(downloads?|exports?)\b/, ['cloud-arrow-down', 'download']],
  [/\b(imports?|ingest|ingestion)\b/, ['file-import', 'inbox-in']],
  [/\b(graphql)\b/, ['diagram-project', 'sitemap']],
  [/\b(sandbox|test(ing)?|staging|preview)\b/, ['flask', 'vial', 'eye']],
  [/\b(analytics|metrics|usage|stats|dashboard|reports?|monitoring|observability)\b/, ['chart-line', 'chart-pie', 'chart-bar']],
  [/\b(integrations?|plugins?|extensions?|connectors?)\b/, ['plug', 'puzzle-piece']],
  [/\b(tutorials?|how[- ]?to|guides?|recipes?|walkthroughs?)\b/, ['book-open', 'book-open-reader', 'graduation-cap']],
  [/\b(reference|endpoints?|api|apis|operations?)\b/, ['code', 'brackets-curly', 'file-code']],
  [/\b(configuration|config|settings?|preferences|admin|administration)\b/, ['sliders', 'gear', 'wrench', 'screwdriver-wrench']],
  [/\b(faq|questions?|answers?|help|support)\b/, ['circle-question', 'circle-info', 'life-ring']],
  [/\b(migration|migrations?|upgrade|upgrades?|migrate)\b/, ['arrow-up-right', 'stairs']],
  [/\b(logs?|logging|audit|audits?|history)\b/, ['file-lines', 'clock-rotate-left', 'scroll']],
  [/\b(data|datasets?|database|db|tables?|schemas?)\b/, ['database', 'table', 'server']],
  [/\b(ai|ml|machine[- ]?learning|llm|models?)\b/, ['robot', 'brain', 'microchip']],
  [/\b(globe|language|locale|i18n|internationalization|translations?)\b/, ['globe', 'language', 'earth-americas']],
]

// Fallback icons handed out (after the rule icons) once no semantic rule
// applies or every matching rule's candidates are already taken.
const DEFAULT_ICON_POOL = ['file-lines', 'file', 'bookmark', 'note-sticky', 'circle', 'square', 'diamond']
2505
+
2506
// ReadMe's hub sidebar renders `<i class="{icon}">` with no normalization, so
// a bare "rocket" matches no CSS. Prefix short names with `fa-solid fa-` so
// readme.com's FontAwesome 6 Pro stylesheet picks them up. Leaves values that
// already include a space or a `fa-` prefix untouched; falsy input passes
// straight through.
function formatIconClass(icon) {
  if (!icon) return icon
  const alreadyQualified = icon.includes(' ') || icon.startsWith('fa-')
  return alreadyQualified ? icon : `fa-solid fa-${icon}`
}
2515
+
2516
/**
 * Create a stateful icon picker. Each call to the returned function claims an
 * icon for one page, preferring a semantic match from ICON_RULES, then any
 * globally-unused pool icon, then round-robin reuse once everything is taken.
 */
function makeIconPicker() {
  const taken = new Set()
  // Every icon we could ever return, deduped so the round-robin fallback
  // spreads evenly. Rule icons come first (semantically meaningful), then
  // defaults — this order is also the cycle order once the pool is exhausted.
  const pool = Array.from(new Set([...ICON_RULES.flatMap(([, icons]) => icons), ...DEFAULT_ICON_POOL]))
  let rotation = 0

  const claim = (icon) => {
    taken.add(icon)
    return icon
  }

  return function pickIcon(slug, title) {
    const haystack = `${slug || ''} ${title || ''}`.toLowerCase()

    // First pass: a semantically-matching rule with an unclaimed candidate.
    for (const [re, candidates] of ICON_RULES) {
      if (!re.test(haystack)) continue
      const free = candidates.find((icon) => !taken.has(icon))
      if (free) return claim(free)
    }

    // No rule matched, or all matching rules' candidates are taken — grab the
    // first globally-unused icon from the full pool.
    const unused = pool.find((icon) => !taken.has(icon))
    if (unused) return claim(unused)

    // Pool exhausted — round-robin so reuse spreads evenly instead of piling
    // every remaining page onto one icon.
    const icon = pool[rotation % pool.length]
    rotation++
    return icon
  }
}
2556
+
2557
/**
 * Human-readable duration: "<n>ms" under one second, otherwise "<m>m <s>s".
 * Negative inputs clamp to zero.
 */
function formatDuration(ms) {
  const clamped = Math.max(0, ms)
  // Show ms under a second so sub-second work doesn't misleadingly read as "0m 0s".
  if (clamped < 1000) return `${Math.round(clamped)}ms`
  const totalSeconds = Math.round(clamped / 1000)
  return `${Math.floor(totalSeconds / 60)}m ${totalSeconds % 60}s`
}