@readme/cli 0.0.26
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +55 -0
- package/bin/readme.js +8 -0
- package/package.json +58 -0
- package/src/bootstrap.js +97 -0
- package/src/cli.js +189 -0
- package/src/commands/dev.js +119 -0
- package/src/commands/eyes.js +37 -0
- package/src/commands/import.js +2565 -0
- package/src/commands/lint.js +70 -0
- package/src/commands/oas-sync.js +364 -0
- package/src/commands/oas-validate.js +208 -0
- package/src/commands/play.js +17 -0
- package/src/commands/pretty.js +133 -0
- package/src/commands/setup.js +256 -0
- package/src/commands/versions.js +81 -0
- package/src/dev/.next/app-build-manifest.json +20 -0
- package/src/dev/.next/build-manifest.json +31 -0
- package/src/dev/.next/cache/.rscinfo +1 -0
- package/src/dev/.next/cache/next-devtools-config.json +1 -0
- package/src/dev/.next/cache/webpack/client-development/0.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/client-development/1.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/client-development/10.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/client-development/11.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/client-development/2.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/client-development/3.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/client-development/3.pack.gz_ +0 -0
- package/src/dev/.next/cache/webpack/client-development/4.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/client-development/5.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/client-development/5.pack.gz_ +0 -0
- package/src/dev/.next/cache/webpack/client-development/6.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/client-development/7.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/client-development/7.pack.gz_ +0 -0
- package/src/dev/.next/cache/webpack/client-development/8.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/client-development/9.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/client-development/index.pack.gz.old +0 -0
- package/src/dev/.next/cache/webpack/client-development-fallback/0.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/client-development-fallback/1.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/client-development-fallback/index.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/client-development-fallback/index.pack.gz.old +0 -0
- package/src/dev/.next/cache/webpack/edge-server-development/0.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/edge-server-development/1.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/edge-server-development/index.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/edge-server-development/index.pack.gz.old +0 -0
- package/src/dev/.next/cache/webpack/server-development/0.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/server-development/1.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/server-development/10.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/server-development/11.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/server-development/12.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/server-development/13.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/server-development/14.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/server-development/15.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/server-development/2.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/server-development/2.pack.gz_ +0 -0
- package/src/dev/.next/cache/webpack/server-development/3.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/server-development/3.pack.gz_ +0 -0
- package/src/dev/.next/cache/webpack/server-development/4.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/server-development/5.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/server-development/6.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/server-development/6.pack.gz_ +0 -0
- package/src/dev/.next/cache/webpack/server-development/7.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/server-development/7.pack.gz_ +0 -0
- package/src/dev/.next/cache/webpack/server-development/8.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/server-development/9.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/server-development/9.pack.gz_ +0 -0
- package/src/dev/.next/cache/webpack/server-development/index.pack.gz +0 -0
- package/src/dev/.next/cache/webpack/server-development/index.pack.gz.old +0 -0
- package/src/dev/.next/package.json +1 -0
- package/src/dev/.next/prerender-manifest.json +11 -0
- package/src/dev/.next/react-loadable-manifest.json +1 -0
- package/src/dev/.next/routes-manifest.json +1 -0
- package/src/dev/.next/server/app/[...slug]/page.js +360 -0
- package/src/dev/.next/server/app/[...slug]/page_client-reference-manifest.js +1 -0
- package/src/dev/.next/server/app/page.js +349 -0
- package/src/dev/.next/server/app/page_client-reference-manifest.js +1 -0
- package/src/dev/.next/server/app-paths-manifest.json +3 -0
- package/src/dev/.next/server/edge-runtime-webpack.js +1151 -0
- package/src/dev/.next/server/interception-route-rewrite-manifest.js +1 -0
- package/src/dev/.next/server/middleware-build-manifest.js +33 -0
- package/src/dev/.next/server/middleware-manifest.json +32 -0
- package/src/dev/.next/server/middleware-react-loadable-manifest.js +1 -0
- package/src/dev/.next/server/middleware.js +1113 -0
- package/src/dev/.next/server/next-font-manifest.js +1 -0
- package/src/dev/.next/server/next-font-manifest.json +1 -0
- package/src/dev/.next/server/pages-manifest.json +5 -0
- package/src/dev/.next/server/server-reference-manifest.js +1 -0
- package/src/dev/.next/server/server-reference-manifest.json +5 -0
- package/src/dev/.next/server/static/webpack/633457081244afec._.hot-update.json +1 -0
- package/src/dev/.next/server/vendor-chunks/@readme.js +25 -0
- package/src/dev/.next/server/vendor-chunks/@swc.js +55 -0
- package/src/dev/.next/server/vendor-chunks/next.js +3659 -0
- package/src/dev/.next/server/webpack-runtime.js +209 -0
- package/src/dev/.next/static/chunks/app/[...slug]/loading.js +28 -0
- package/src/dev/.next/static/chunks/app/[...slug]/page.js +28 -0
- package/src/dev/.next/static/chunks/app/layout.js +171 -0
- package/src/dev/.next/static/chunks/app/page.js +28 -0
- package/src/dev/.next/static/chunks/app-pages-internals.js +182 -0
- package/src/dev/.next/static/chunks/main-app.js +1882 -0
- package/src/dev/.next/static/chunks/polyfills.js +1 -0
- package/src/dev/.next/static/chunks/webpack.js +1393 -0
- package/src/dev/.next/static/css/app/layout.css +559 -0
- package/src/dev/.next/static/development/_buildManifest.js +1 -0
- package/src/dev/.next/static/development/_ssgManifest.js +1 -0
- package/src/dev/.next/static/webpack/633457081244afec._.hot-update.json +1 -0
- package/src/dev/.next/static/webpack/ec52a3fce0f78db0.webpack.hot-update.json +1 -0
- package/src/dev/.next/static/webpack/webpack.ec52a3fce0f78db0.hot-update.js +12 -0
- package/src/dev/.next/trace +21 -0
- package/src/dev/.next/types/app/[...slug]/page.ts +84 -0
- package/src/dev/.next/types/app/layout.ts +84 -0
- package/src/dev/.next/types/app/page.ts +84 -0
- package/src/dev/.next/types/cache-life.d.ts +141 -0
- package/src/dev/.next/types/package.json +1 -0
- package/src/dev/.next/types/routes.d.ts +55 -0
- package/src/dev/app/Sidebar.js +149 -0
- package/src/dev/app/[...slug]/loading.js +16 -0
- package/src/dev/app/[...slug]/page.js +43 -0
- package/src/dev/app/globals.css +167 -0
- package/src/dev/app/layout.js +73 -0
- package/src/dev/app/page.js +19 -0
- package/src/dev/lib/docs.js +337 -0
- package/src/dev/middleware.js +7 -0
- package/src/dev/next.config.mjs +22 -0
- package/src/index.js +12 -0
- package/src/prompts/index.js +352 -0
- package/src/utils/claude.js +15 -0
- package/src/utils/eyes.js +365 -0
- package/src/utils/git.js +143 -0
- package/src/utils/lint.js +99 -0
- package/src/utils/reporter.js +319 -0
- package/src/utils/setup-templates.js +323 -0
- package/src/utils/styles.js +50 -0
- package/src/utils/tamagotchi.js +1139 -0
- package/src/utils/tips.js +90 -0
- package/src/validators/components.js +230 -0
- package/src/validators/content.js +53 -0
- package/src/validators/duplicates.js +45 -0
- package/src/validators/frontmatter.js +247 -0
- package/src/validators/links.js +68 -0
- package/src/validators/nesting.js +50 -0
- package/src/validators/numbering.js +136 -0
- package/src/validators/oas-reference.js +126 -0
- package/src/validators/oas-schema.js +106 -0
- package/src/validators/ordering.js +121 -0
- package/src/validators/recipes.js +143 -0
- package/vendor/TOOLS.md +19 -0
|
@@ -0,0 +1,2565 @@
|
|
|
1
|
+
import fs from 'node:fs'
|
|
2
|
+
import os from 'node:os'
|
|
3
|
+
import path from 'node:path'
|
|
4
|
+
import { spawn } from 'node:child_process'
|
|
5
|
+
import { createRequire } from 'node:module'
|
|
6
|
+
import { Option } from 'commander'
|
|
7
|
+
import { query } from '@anthropic-ai/claude-agent-sdk'
|
|
8
|
+
import matter from 'gray-matter'
|
|
9
|
+
import * as styles from '../utils/styles.js'
|
|
10
|
+
import { syncOas, extractOperations } from './oas-sync.js'
|
|
11
|
+
import OASNormalize from 'oas-normalize'
|
|
12
|
+
import { slotOrphansPrompt, iconizeNavPrompt, organizeFromSectionsPrompt, organizeFromScratchPrompt, stripCodeFences } from '../prompts/index.js'
|
|
13
|
+
|
|
14
|
+
// Command registration metadata — NOTE(review): presumably consumed by the
// CLI loader in src/cli.js; confirm the exact contract there.
export const command = 'import'
// NOTE(review): looks like a sort position within the command listing — verify in cli.js.
export const order = 7
export const description = 'Import content from a URL and package it as a ReadMe zip'
// NOTE(review): presumably excludes this command from --help output — confirm in cli.js.
export const hidden = true
// NOTE(review): presumably opts this command out of src/bootstrap.js — confirm.
export const skipBootstrap = true
|
|
19
|
+
|
|
20
|
+
/**
 * Register this command's flags on the supplied commander command object.
 *
 * @param {import('commander').Command} cmd - The command to attach options to.
 */
export function args(cmd) {
  // Public, help-visible options. Commander's option registrars return the
  // command, so the visible flags are declared as one fluent chain.
  cmd
    .requiredOption('--source <url-or-file>', 'URL to import from, or path to a local OpenAPI spec (.json/.yaml/.yml)')
    .option('-o, --output <path>', 'Output zip path (defaults to <basename>-readme.zip in cwd)')
    .option('--model <name>', 'Claude model alias: haiku, sonnet, opus', 'sonnet')
    .option('--firecrawl-key <key>', 'Firecrawl API key (or set FIRECRAWL_API_KEY env var) — enables JS-rendered sidebar scraping')
    .option('--skip-api-reference', 'Drop pages routed to the API Reference / reference dir. Use when uploading the OAS spec separately.')

  // Internal dev-only flag: skip the zip, keep staging, and boot the dev server
  // against it for quick visual previews. Hidden from --help.
  cmd.addOption(new Option('--test').hideHelp())

  // Dump intermediate pipeline artifacts (llms parse, scraped nav, orphan
  // handling, final organized tree) so we can diff stages when the produced
  // sidebar disagrees with the source.
  cmd.addOption(new Option('--debug').hideHelp())
}
|
|
34
|
+
|
|
35
|
+
/**
 * Run the importer programmatically. Mirrors the CLI command but throws on
 * fatal errors instead of calling `process.exit`, and returns a result object
 * on success.
 *
 * @param {object} options
 * @param {string} options.source URL to import from, or path to a local OAS spec.
 * @param {string} [options.output] Output zip path. Defaults to `<basename>-readme.zip` in cwd.
 * @param {string} [options.model] Claude model alias: 'haiku' | 'sonnet' | 'opus'. Defaults to 'sonnet'.
 * @param {string} [options.firecrawlKey] Firecrawl API key (falls back to FIRECRAWL_API_KEY env var).
 * @param {boolean} [options.skipApiReference] Drop pages routed to the API Reference dir.
 * @param {boolean} [options.test] Skip the zip, keep staging, and boot the dev server.
 * @param {boolean} [options.debug] Dump intermediate pipeline artifacts to a tmp dir.
 * @returns {Promise<{ source: 'url' | 'oas', outputZip?: string, stagingDir?: string, fileCount: number, duration: number, phases: Array<{ label: string, ms: number }> }>}
 * @throws {Error} On an unparsable --source URL, or when neither llms.txt nor a
 *   sidebar scrape yields any usable structure.
 */
export async function importDocs(options) {
  const startedAt = Date.now()
  // Wall-clock per-phase timings, accumulated by timePhase and returned in
  // the result (and printed in the final summary line).
  const phases = []
  const timePhase = async (label, fn) => {
    const t = Date.now()
    const result = await fn()
    phases.push({ label, ms: Date.now() - t })
    return result
  }

  const debugSnapshots = options.debug ? {} : null

  // Dispatch: http(s) URL → docs-site scrape flow; anything else → local OAS.
  if (!/^https?:\/\//i.test(options.source)) {
    return runOasImport(options.source, options, startedAt, phases, timePhase)
  }

  let sourceUrl
  try {
    sourceUrl = new URL(options.source)
  } catch {
    throw new Error(`Invalid --source URL: ${options.source}`)
  }

  const outputZip = path.resolve(options.output || path.join(process.cwd(), `${sourceUrl.hostname}-readme.zip`))

  console.log()
  styles.info(`Importing from ${styles.bold(sourceUrl.toString())}`)
  if (!options.test) styles.info(`Output: ${styles.bold(outputZip)}`)
  console.log()

  // Build the list of llms.txt URLs to probe, walking up the supplied path
  // from most-specific to root. For `https://mintlify.com/docs/quickstart`
  // we try `/docs/quickstart/llms.txt`, then `/docs/llms.txt`, then root.
  // This catches sites that scope llms.txt to a docs subpath.
  const llmsCandidates = buildLlmsCandidates(sourceUrl)
  styles.info(`Checking for llms.txt (${llmsCandidates.length} candidate${llmsCandidates.length === 1 ? '' : 's'})...`)

  // First candidate that responds OK wins; the rest are skipped.
  const { llms, llmsUrl } = await timePhase('fetch llms.txt', async () => {
    for (const candidate of llmsCandidates) {
      const res = await fetchLlmsTxt(candidate)
      if (res.ok) return { llms: res, llmsUrl: candidate }
      styles.info(styles.dim(` ${candidate} → ${res.status ? `HTTP ${res.status}` : res.error || 'failed'}`))
    }
    return { llms: null, llmsUrl: null }
  })
  console.log()

  if (!llms) {
    styles.warning(`No llms.txt found at any probed path — falling back to sidebar discovery via scrape.`)
  } else {
    styles.info(styles.dim(`Using ${llmsUrl}.`))
  }

  if (debugSnapshots) {
    debugSnapshots['01-llms-parsed.json'] = { llmsUrl, parsed: llms ? llms.parsed : null }
  }

  // Flattened { title, url, description } list of every page llms.txt knows
  // about; empty when there was no llms.txt (pure discovery mode).
  let knownUrls = []
  if (llms) {
    const totalItems = llms.parsed.sections.reduce((n, s) => n + s.items.length, 0)
    styles.ok(
      `Found llms.txt — ${styles.bold(String(totalItems))} page${totalItems === 1 ? '' : 's'} across ${styles.bold(String(llms.parsed.sections.length))} section${llms.parsed.sections.length === 1 ? '' : 's'}${llms.parsed.title ? ` (${llms.parsed.title})` : ''}.`,
    )

    const rawKnownUrls = llms.parsed.sections.flatMap((s) => s.items.map((i) => ({ title: i.text, url: i.url, description: i.description })))

    // Dedupe llms.txt entries by pathname. Some sites (zod.dev, fumadocs) list
    // every in-page anchor as its own llms.txt row (`/v4?id=wrapping-up`,
    // `/v4?id=metadata`, …) even though they all live on one rendered page.
    // We prefer the "cleanest" URL per path — the shortest one, which is
    // usually the one without a query string or hash.
    const byKnownPath = new Map()
    for (const p of rawKnownUrls) {
      const key = normalizePath(p.url)
      const prev = byKnownPath.get(key)
      if (!prev || p.url.length < prev.url.length) byKnownPath.set(key, p)
    }
    knownUrls = Array.from(byKnownPath.values())
    const dropped = rawKnownUrls.length - knownUrls.length
    if (dropped > 0) {
      styles.info(`${styles.dim(`Collapsed ${dropped} anchor/query duplicates → ${knownUrls.length} unique pages.`)}`)
    }
  }

  console.log()
  const firecrawlKey = options.firecrawlKey || process.env.FIRECRAWL_API_KEY || null

  // Mintlify fast path — the canonical sidebar lives in docs.json/mint.json
  // at origin root. When present it gives us perfect structure with zero
  // HTML parsing, so try it before falling back to generic nav scraping.
  styles.info(`Probing for Mintlify config (docs.json, mint.json)...`)
  const mintlifyStart = Date.now()
  const mintlifyNav = await timePhase('mintlify probe', () => tryMintlifyNav(sourceUrl.toString(), knownUrls, firecrawlKey))
  if (debugSnapshots) {
    // JSON round-trip = cheap deep clone, so later mutation of the nav
    // doesn't retroactively change the snapshot.
    debugSnapshots['02a-mintlify-nav.json'] = mintlifyNav ? JSON.parse(JSON.stringify(mintlifyNav)) : null
  }
  if (mintlifyNav) {
    const pageCount = mintlifyNav.categories.reduce((n, c) => n + c.pages.length, 0)
    styles.ok(
      `Found Mintlify config at ${styles.bold(mintlifyNav.source)} in ${styles.bold(formatDuration(Date.now() - mintlifyStart))} — ${styles.bold(String(mintlifyNav.categories.length))} categor${mintlifyNav.categories.length === 1 ? 'y' : 'ies'}, ${styles.bold(String(pageCount))} pages.`,
    )
  }
  console.log()

  let scraped
  let scrapeStart = Date.now()
  if (mintlifyNav) {
    scraped = { title: mintlifyNav.title, categories: mintlifyNav.categories }
  } else {
    styles.info(`Scraping sidebar nav from ${styles.bold(sourceUrl.toString())}${firecrawlKey ? ' ' + styles.dim('(via Firecrawl)') : ''}...`)
    scrapeStart = Date.now()
    scraped = await timePhase('scrape nav', () => scrapeNavFromSite(sourceUrl.toString(), knownUrls, firecrawlKey))
  }
  if (debugSnapshots) {
    debugSnapshots['02-scraped-raw.json'] = scraped ? JSON.parse(JSON.stringify(scraped)) : null
  }
  // Prefer llms.txt when it has strong multi-section structure and the
  // scrape was a thin snapshot (common on big multi-tab docs — Stripe, AWS,
  // Twilio — where each page only renders its own tab's sidebar). Without
  // this override the 4-category scrape wins over a 25-section llms.txt and
  // hundreds of real pages end up smeared into orphan buckets.
  if (scraped && llms && knownUrls.length > 0) {
    const scrapedPages = scraped.categories.reduce((n, c) => n + c.pages.length, 0)
    const coverage = scrapedPages / knownUrls.length
    const llmsUsable = usableSections(llms.parsed.sections)
    // Thresholds (≥5 usable sections, <50% coverage) gate the override.
    if (llmsUsable.length >= 5 && coverage < 0.5) {
      styles.info(
        `Scrape covered ${styles.bold(Math.round(coverage * 100) + '%')} of llms.txt pages; preferring llms.txt's ${styles.bold(String(llmsUsable.length))} sections for structure.`,
      )
      // Discarding the scrape forces the llms.txt organization path below.
      scraped = null
    }
  }

  if (scraped) {
    const directMatches = scraped.categories.reduce((n, c) => n + c.pages.length, 0)
    if (knownUrls.length > 0) {
      // slotOrphansByPath mutates `scraped` in place (page counts grow) and
      // returns the llms.txt pages it could NOT place anywhere.
      const slotted = slotOrphansByPath(scraped, knownUrls)
      if (debugSnapshots) {
        debugSnapshots['03-after-slot-by-path.json'] = {
          scraped: JSON.parse(JSON.stringify(scraped)),
          unslottedOrphans: slotted,
        }
      }
      const totalMatched = scraped.categories.reduce((n, c) => n + c.pages.length, 0)
      styles.ok(
        `Scraped nav in ${styles.bold(formatDuration(Date.now() - scrapeStart))} — ${styles.bold(String(scraped.categories.length))} categor${scraped.categories.length === 1 ? 'y' : 'ies'}, ${styles.bold(String(directMatches))} direct matches + ${styles.bold(String(totalMatched - directMatches))} slotted by path = ${styles.bold(String(totalMatched))}/${knownUrls.length}.`,
      )
      // Sweep pass first: any page whose URL has a strong reference segment
      // (e.g. `/api-reference/...`) belongs in the API Reference category,
      // even if the site's sidebar spotlighted it under Developers or similar.
      // Sidebars often surface a few endpoints as "featured" outside the
      // reference section — we respect the URL over the nav here.
      const moved = reclassifyReferencePages(scraped)
      if (moved > 0) {
        styles.info(`Moved ${styles.bold(String(moved))} page${moved === 1 ? '' : 's'} into ${styles.bold('API Reference')} based on URL path.`)
      }

      if (slotted.length > 0) {
        // Mintlify-style docs put API endpoints in a separate tab rooted at
        // /api-reference/* (or /api/*, /reference/*). Collapse remaining such
        // pages into a single "API Reference" category (absorbing the flat
        // category the sweep pass just built, if any) and nest it by resource
        // segment so routeCategory() maps the whole thing to ReadMe's
        // `reference/` top-level dir.
        const apiResult = collectApiReferencePages(slotted, scraped)
        const otherOrphans = apiResult.nonApiOrphans
        if (apiResult.category) scraped.categories.push(apiResult.category)

        // Remaining orphans get grouped into synthetic categories by URL
        // path type and appended after the scraped ones.
        const buckets = bucketOrphansByPathType(otherOrphans, scraped)
        for (const b of buckets) scraped.categories.push(b)
        if (debugSnapshots) {
          debugSnapshots['04-after-orphan-buckets.json'] = {
            apiReferenceCollected: apiResult.category
              ? {
                  pageCount: apiResult.category.pages.length,
                  mergedFromScraped: apiResult.mergedScrapedTitles,
                }
              : null,
            buckets,
            scraped: JSON.parse(JSON.stringify(scraped)),
          }
        }
        const parts = []
        if (apiResult.category) {
          parts.push(`${styles.bold(String(apiResult.category.pages.length))} in ${styles.bold('API Reference')}`)
        }
        for (const b of buckets) {
          parts.push(`${styles.bold(String(b.pages.length))} in ${styles.bold(b.title)}`)
        }
        if (parts.length > 0) {
          styles.info(`${styles.bold(String(slotted.length))} orphan page${slotted.length === 1 ? '' : 's'} bucketed by URL type: ${parts.join(', ')}.`)
        }
      }
    } else {
      // Discovery mode — scraped pages ARE our known pages. No orphans.
      // If everything landed in a single flat category (the sidebar had no
      // <h*>/<p> headers to split on), try to re-cluster by URL path
      // structure: pages that share a common prefix often live under the
      // same section in the site's real hierarchy.
      if (scraped.categories.length === 1) {
        const reclustered = clusterByUrlPath(scraped.categories[0].pages)
        if (reclustered) {
          scraped.categories = reclustered
          styles.ok(
            `Scraped nav in ${styles.bold(formatDuration(Date.now() - scrapeStart))} — re-clustered by URL path into ${styles.bold(String(scraped.categories.length))} categor${scraped.categories.length === 1 ? 'y' : 'ies'}, ${styles.bold(String(directMatches))} pages discovered (no llms.txt).`,
          )
        } else {
          styles.ok(
            `Scraped nav in ${styles.bold(formatDuration(Date.now() - scrapeStart))} — ${styles.bold(String(scraped.categories.length))} categor${scraped.categories.length === 1 ? 'y' : 'ies'}, ${styles.bold(String(directMatches))} pages discovered (no llms.txt).`,
          )
        }
      } else {
        styles.ok(
          `Scraped nav in ${styles.bold(formatDuration(Date.now() - scrapeStart))} — ${styles.bold(String(scraped.categories.length))} categor${scraped.categories.length === 1 ? 'y' : 'ies'}, ${styles.bold(String(directMatches))} pages discovered (no llms.txt).`,
        )
      }
    }
  } else if (!llms) {
    // No structure from either source — nothing we can sensibly import.
    throw new Error(`No llms.txt and the sidebar scrape found no usable structure — can't import ${sourceUrl.toString()}.`)
  } else {
    styles.warning(`Couldn't extract a useful nav — falling back to llms.txt-based organization.`)
  }
  console.log()

  let organized
  const organizeStart = Date.now()
  if (scraped) {
    // No Claude call — icons deferred. Use a neutral placeholder so the tree
    // view still prints cleanly.
    organized = {
      title: (llms && llms.parsed.title) || null,
      categories: scraped.categories.map((c) => ({ title: c.title, icon: null, pages: c.pages })),
    }
  } else {
    // This branch implies `llms` is non-null (the !scraped && !llms case
    // threw above), so llms.parsed is safe to read here.
    const fastPath = sectionsLookUsable(llms.parsed.sections)
    styles.info(`Organizing with Claude (${styles.bold(options.model)}, ${fastPath ? 'fast path: icons only' : 'full reorg'})...`)
    organized = await timePhase('claude organize', () => organizeWithClaude(llms.parsed, options.model))
  }
  styles.ok(`Organized in ${styles.bold(formatDuration(Date.now() - organizeStart))}.`)
  if (debugSnapshots) {
    // Flush every accumulated snapshot to a fresh tmp dir for offline diffing.
    debugSnapshots['05-organized.json'] = organized
    const debugDir = path.join(os.tmpdir(), `readme-import-debug-${sourceUrl.hostname}-${Date.now()}`)
    fs.mkdirSync(debugDir, { recursive: true })
    for (const [name, data] of Object.entries(debugSnapshots)) {
      fs.writeFileSync(path.join(debugDir, name), JSON.stringify(data, null, 2))
    }
    styles.info(`${styles.dim(`Debug snapshots → ${debugDir}`)}`)
  }
  console.log()

  // Pretty-print the final organized tree before staging.
  console.log(` ${styles.bold(organized.title || '(untitled)')}`)
  for (const cat of organized.categories || []) {
    console.log()
    const iconLabel = cat.icon ? `${styles.brand(`[${cat.icon}]`)} ` : ''
    console.log(` ${iconLabel}${styles.bold(cat.title)}`)
    printPagesTree(cat.pages || [], 2)
  }

  const stagingDir = fs.mkdtempSync(path.join(os.tmpdir(), 'readme-import-'))

  let result
  try {
    styles.info(`Staging frontmatter stubs in ${styles.bold(stagingDir)}...`)
    const stageStart = Date.now()
    const staged = await timePhase('stage stubs', async () => stageOrganized(organized, stagingDir, { skipApiReference: !!options.skipApiReference }))
    ensureDocsLandingPage(stagingDir, organized.title || sourceUrl.hostname)
    styles.ok(
      `Staged ${styles.bold(String(staged.fileCount))} stub${staged.fileCount === 1 ? '' : 's'} across ${styles.bold(String(staged.dirCount))} director${staged.dirCount === 1 ? 'y' : 'ies'} in ${styles.bold(formatDuration(Date.now() - stageStart))}.`,
    )
    if (staged.skippedApiRef > 0) {
      styles.info(`Skipped ${styles.bold(String(staged.skippedApiRef))} API reference page${staged.skippedApiRef === 1 ? '' : 's'} (--skip-api-reference)`)
    }
    console.log()

    // --test: no zip. Keep staging alive (finally below is skipped for test
    // mode) and preview via the dev server instead.
    if (options.test) {
      styles.ok(`Done in ${styles.bold(formatDuration(Date.now() - startedAt))}! Staged ${styles.bold(String(staged.fileCount))} files at ${styles.bold(stagingDir)}`)
      console.log()
      styles.info('Starting the dev server for preview...')
      console.log()
      await runDevPreview(stagingDir)
      return { source: 'url', stagingDir, fileCount: staged.fileCount, duration: Date.now() - startedAt, phases }
    }

    if (staged.fileCount === 0) {
      styles.warning('Staging directory is empty — skipping zip.')
      return { source: 'url', fileCount: 0, duration: Date.now() - startedAt, phases }
    }

    styles.info(`Packaging ${styles.bold(String(staged.fileCount))} files into ${styles.bold(outputZip)}...`)
    await timePhase('zip', () => createZip(stagingDir, outputZip))

    console.log()
    styles.ok(`Done in ${styles.bold(formatDuration(Date.now() - startedAt))}! Your ReadMe import is ready at ${styles.bold(outputZip)}`)
    console.log(styles.dim(` ⏱ ${phases.map((p) => `${p.label} ${formatDuration(p.ms)}`).join(' · ')}`))
    result = { source: 'url', outputZip, fileCount: staged.fileCount, duration: Date.now() - startedAt, phases }
  } finally {
    // Staging is a throwaway tmp dir unless --test asked to keep it for the
    // dev-server preview. force:true keeps cleanup from masking the real error.
    if (!options.test) {
      fs.rmSync(stagingDir, { recursive: true, force: true })
    }
  }

  return result
}
|
|
354
|
+
|
|
355
|
+
/**
 * CLI entrypoint for the import command. Delegates to `importDocs`, reports
 * failures in friendly form, and always exits the process explicitly — the
 * Claude SDK and fetch keep-alive both hold open handles that would otherwise
 * idle for 20-30s before timing out, so we force-exit to keep wall-clock time
 * in line with the "Done in" message.
 *
 * @param {object} options Parsed commander options (see `args`).
 */
export async function run(options) {
  let exitCode = 0
  try {
    await importDocs(options)
  } catch (err) {
    styles.error(err.message || String(err))
    exitCode = 1
  }
  process.exit(exitCode)
}
|
|
371
|
+
|
|
372
|
+
/**
 * Import path for local OpenAPI spec files. The spec is copied verbatim into
 * `reference/` — the git-format build pipeline auto-generates endpoint pages
 * from the spec at render time, so we don't need to stub anything here.
 *
 * @param {string} sourcePath - path to a local .json/.yaml/.yml spec file.
 * @param {object} options - CLI options; `output` overrides the zip path and
 *   `test` stages + previews via the dev server instead of zipping.
 * @param {number} startedAt - Date.now() taken at command start, for timing.
 * @param {Array} phases - accumulator of { label, ms } timing entries.
 * @param {Function} timePhase - helper that times an async phase and records
 *   it into `phases`.
 * @returns {Promise<object>} summary: { source: 'oas', ... } — with
 *   `stagingDir` in --test mode, `outputZip` otherwise.
 * @throws on missing/unsupported/unparseable/invalid spec files.
 */
async function runOasImport(sourcePath, options, startedAt, phases, timePhase) {
  const absPath = path.resolve(sourcePath)
  if (!fs.existsSync(absPath) || !fs.statSync(absPath).isFile()) {
    throw new Error(`File not found: ${absPath}`)
  }
  const ext = path.extname(absPath).toLowerCase()
  if (!['.json', '.yaml', '.yml'].includes(ext)) {
    throw new Error(`Unsupported file type ${ext || '(none)'} — expected .json, .yaml, or .yml.`)
  }

  const basename = path.basename(absPath, ext)
  // Default output lands next to the cwd, named after the spec file.
  const outputZip = path.resolve(options.output || path.join(process.cwd(), `${basename}-readme.zip`))

  console.log()
  styles.info(`Importing OpenAPI spec from ${styles.bold(absPath)}`)
  if (!options.test) styles.info(`Output: ${styles.bold(outputZip)}`)
  console.log()

  // Parse + sanity-check it's actually an OAS before we stage anything.
  // We do this in two stages: a cheap parse + looks-like-OAS check (fail
  // fast on clearly wrong inputs), then a normalize step that will repair
  // fixable issues (Swagger 2 → OpenAPI 3 conversion, bundling $refs, etc.).
  const { spec, opCount, wasFixed, fixReason } = await timePhase('parse spec', async () => {
    const raw = fs.readFileSync(absPath, 'utf-8')
    let parsed
    try {
      parsed = ext === '.json' ? JSON.parse(raw) : yamlRequire().load(raw)
    } catch (e) {
      throw new Error(`Couldn't parse ${absPath} as ${ext === '.json' ? 'JSON' : 'YAML'}: ${e.message}`)
    }
    // Arrays/primitives parse fine but can't be a spec document.
    if (!parsed || typeof parsed !== 'object' || Array.isArray(parsed)) {
      throw new Error(`File isn't a usable object: ${absPath}`)
    }
    if (!looksLikeOas(parsed)) {
      throw new Error(`Not an OpenAPI/Swagger spec — no top-level openapi / swagger field and no paths section.`)
    }

    // Normalize via oas-normalize. `bundle()` handles the common fix-ups:
    // Swagger 2 → OpenAPI 3, Postman collection → OpenAPI, inline $ref
    // resolution. We try it first (even on apparently-valid specs) so Postman
    // collections actually get converted. If bundle errors we fall back to
    // the original spec when it passes validate, else fail hard.
    const normalizer = new OASNormalize(parsed)
    try {
      const bundled = await normalizer.bundle()
      // Structural compare via JSON round-trip: any difference means the
      // normalizer changed something worth surfacing to the user.
      const changed = JSON.stringify(bundled) !== JSON.stringify(parsed)
      return {
        spec: bundled,
        opCount: countOperations(bundled),
        wasFixed: changed,
        fixReason: changed ? 'normalized (Swagger 2 → OpenAPI 3, Postman conversion, or $ref inlining)' : null,
      }
    } catch (bundleErr) {
      try {
        await normalizer.validate()
        return { spec: parsed, opCount: countOperations(parsed), wasFixed: false }
      } catch (validateErr) {
        // Surface the first line of both failures: what's wrong and why the
        // auto-fix attempt didn't take.
        throw new Error(
          `Spec is invalid and couldn't be auto-fixed.\n Validation error: ${validateErr.message.split('\n')[0]}\n Fix attempt error: ${bundleErr.message.split('\n')[0]}`,
        )
      }
    }
  })

  if (wasFixed) {
    styles.warning(`Spec had issues — auto-fixed (${fixReason}).`)
  }

  const title = spec.info?.title || basename
  const version = spec.info?.version || null
  styles.ok(`Parsed OpenAPI ${version ? 'v' + version + ' ' : ''}spec — ${styles.bold(title)} (${styles.bold(String(opCount))} operation${opCount === 1 ? '' : 's'}).`)
  console.log()

  // Stage into a temp dir; cleaned up in the finally below unless --test
  // (the dev-server preview needs the files to stay on disk).
  const stagingDir = fs.mkdtempSync(path.join(os.tmpdir(), 'readme-import-'))
  let result
  try {
    const { stagedName } = await timePhase('stage spec', async () => {
      // If we auto-fixed the spec, serialize the fixed version as JSON (always
      // writable, avoids YAML-ambiguity regressions). Otherwise copy the
      // original file verbatim so formatting/comments are preserved.
      const rawName = path.basename(absPath)
      let targetName
      let targetContent
      if (wasFixed) {
        targetName = rawName.replace(/\.(ya?ml|json)$/i, '.json')
        targetContent = JSON.stringify(spec, null, 2)
      } else {
        targetName = rawName
        targetContent = null // signal to copy
      }
      const targetPath = path.join(stagingDir, 'reference', targetName)
      fs.mkdirSync(path.dirname(targetPath), { recursive: true })
      if (targetContent === null) {
        fs.copyFileSync(absPath, targetPath)
      } else {
        fs.writeFileSync(targetPath, targetContent)
      }

      // syncOas walks reference/ and generates one <operationId>.md per
      // operation, grouped by tag. We pass stagingDir as the "git root" so
      // its refDir lookup lands on stagingDir/reference/.
      syncOas(stagingDir)
      return { stagedName: targetName }
    })

    // Ensure there's always at least a landing page — OAS-only imports leave
    // docs/ empty, which makes `--test` dev server show "no pages" at /.
    ensureDocsLandingPage(stagingDir, title, opCount)

    // OAS operation pages don't need an x-import URL — their content is
    // intrinsic to the spec (summary/description live in the OpenAPI doc,
    // and the page's `api:` frontmatter already points back to it).
    const pageCount = countReferencePages(stagingDir, stagedName)

    styles.ok(`Staged ${styles.bold(stagedName)} and generated ${styles.bold(String(pageCount))} operation page${pageCount === 1 ? '' : 's'} under ${styles.bold('reference/')}.`)
    console.log()

    if (options.test) {
      styles.ok(`Done in ${styles.bold(formatDuration(Date.now() - startedAt))}! Staged at ${styles.bold(stagingDir)}`)
      console.log()
      styles.info('Starting the dev server for preview...')
      console.log()
      await runDevPreview(stagingDir)
      // Early return here skips zipping; the finally keeps the staging dir.
      return { source: 'oas', stagingDir, fileCount: pageCount, duration: Date.now() - startedAt, phases }
    }

    await timePhase('zip', () => createZip(stagingDir, outputZip))

    console.log()
    styles.ok(`Done in ${styles.bold(formatDuration(Date.now() - startedAt))}! Your ReadMe import is ready at ${styles.bold(outputZip)}`)
    console.log(styles.dim(` ⏱ ${phases.map((p) => `${p.label} ${formatDuration(p.ms)}`).join(' · ')}`))
    result = { source: 'oas', outputZip, fileCount: pageCount, duration: Date.now() - startedAt, phases }
  } finally {
    // In --test mode the staged files back the running dev server, so keep
    // them; otherwise the zip is the deliverable and the temp dir can go.
    if (!options.test) fs.rmSync(stagingDir, { recursive: true, force: true })
  }
  return result
}
|
|
514
|
+
|
|
515
|
+
// js-yaml is installed transitively (via oas-sync.js) but not in our direct
// deps. Load it lazily on first use so the JSON-only path doesn't pay for it.
let _yaml = null

/** Lazily require js-yaml, caching the loaded module after the first call. */
function yamlRequire() {
  if (_yaml === null) {
    _yaml = createRequire(import.meta.url)('js-yaml')
  }
  return _yaml
}
|
|
525
|
+
|
|
526
|
+
/**
 * Cheap first-pass "does this look like an OpenAPI/Swagger spec?" check.
 * Accepts anything with a version field OR a paths section — some specs in
 * the wild drop the version; the follow-up oas-normalize pass will still
 * catch malformed inputs that slip through here.
 */
function looksLikeOas(obj) {
  if (obj === null || typeof obj !== 'object') return false
  const hasVersionField = typeof obj.openapi === 'string' || typeof obj.swagger === 'string'
  if (hasVersionField) return true
  if (obj.paths && typeof obj.paths === 'object') return true
  // Postman collections — oas-normalize auto-converts these to OpenAPI.
  const collectionSchema = obj.info?.schema
  return typeof collectionSchema === 'string' && /getpostman\.com/i.test(collectionSchema)
}
|
|
540
|
+
|
|
541
|
+
/**
 * Count HTTP operations in an OpenAPI spec: one per method key
 * (get/post/put/patch/delete/options/head/trace, case-insensitive) under
 * each entry of `paths`.
 */
function countOperations(spec) {
  const methodRe = /^(get|post|put|patch|delete|options|head|trace)$/i
  let total = 0
  for (const pathItem of Object.values(spec.paths || {})) {
    total += Object.keys(pathItem || {}).filter((key) => methodRe.test(key)).length
  }
  return total
}
|
|
550
|
+
|
|
551
|
+
/**
 * Count operation pages syncOas just generated under reference/ for the
 * given spec file. Used only for the "generated N pages" success message.
 * A page counts when its frontmatter has `api.file` equal to `specFilename`.
 */
function countReferencePages(stagingDir, specFilename) {
  const refDir = path.join(stagingDir, 'reference')
  if (!fs.existsSync(refDir)) return 0

  // Iterative depth-first walk; traversal order doesn't matter for a count.
  let total = 0
  const pending = [refDir]
  while (pending.length > 0) {
    const dir = pending.pop()
    for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
      const fullPath = path.join(dir, entry.name)
      if (entry.isDirectory()) {
        pending.push(fullPath)
      } else if (entry.name.endsWith('.md')) {
        const frontmatter = matter(fs.readFileSync(fullPath, 'utf-8')).data || {}
        if (frontmatter.api && frontmatter.api.file === specFilename) total++
      }
    }
  }
  return total
}
|
|
577
|
+
|
|
578
|
+
/**
 * Write a `docs/Getting Started/getting-started.md` when there's no other
 * docs content (typical after an OAS-only import). Body includes a pointer
 * to the Reference tab so users know where the operations are — the dev
 * server only shows one sidebar section at a time, so a blank landing on
 * `/docs/...` makes it look like nothing imported.
 */
function ensureDocsLandingPage(stagingDir, siteTitle, opCount = 0) {
  const docsDir = path.join(stagingDir, 'docs')
  const alreadyHasDocs = fs.existsSync(docsDir) && fs.readdirSync(docsDir).length > 0
  if (alreadyHasDocs) return

  const categoryDir = path.join(docsDir, 'Getting Started')
  fs.mkdirSync(categoryDir, { recursive: true })

  const name = siteTitle || 'your API'
  const title = siteTitle ? `Welcome to ${siteTitle}` : 'Getting Started'

  let body
  if (opCount > 0) {
    body = `This import brought in **${opCount} API operation${opCount === 1 ? '' : 's'}** from ${name}.\n\n👉 [Browse the API Reference →](/reference)\n\nThis page is a placeholder landing. Replace or expand it with onboarding content specific to your API.\n`
  } else {
    body = `This is a placeholder landing page. Replace it with your docs.\n`
  }

  fs.writeFileSync(path.join(categoryDir, 'getting-started.md'), matter.stringify(body, { title, icon: formatIconClass('rocket') }))
  // _order.yaml files pin the sidebar ordering for the new category/page.
  fs.writeFileSync(path.join(categoryDir, '_order.yaml'), '- getting-started\n')
  fs.writeFileSync(path.join(docsDir, '_order.yaml'), '- Getting Started\n')
}
|
|
602
|
+
|
|
603
|
+
/**
 * Run a Claude agent in the staging directory. Streams tool calls + assistant
 * text to the terminal. Writes are scoped to the staging dir.
 *
 * @param {object} opts
 * @param {string} opts.userPrompt - prompt sent to the agent.
 * @param {string} [opts.systemPrompt] - optional system prompt; omitted from
 *   the SDK options when falsy so SDK defaults apply.
 * @param {string} opts.cwd - working directory; also the root the write
 *   guard (makeStagingGuard) confines edits to.
 * @param {string} [opts.model] - optional model override.
 * @returns {Promise<void>} resolves when a successful result message arrives.
 * @throws an Error (with `.subtype` and `.result` attached) when the agent's
 *   result message has a non-"success" subtype.
 */
export async function runAgent({ userPrompt, systemPrompt, cwd, model }) {
  for await (const message of query({
    prompt: userPrompt,
    options: {
      cwd,
      allowedTools: ['Read', 'Write', 'Edit', 'Glob', 'Grep'],
      permissionMode: 'acceptEdits',
      // Deny any write tool that targets a path outside `cwd`.
      canUseTool: makeStagingGuard(cwd),
      // Spread-in pattern: only add these keys when values were provided.
      ...(systemPrompt ? { systemPrompt } : {}),
      ...(model ? { model } : {}),
    },
  })) {
    if (message.type === 'assistant' && message.message?.content) {
      // Echo assistant text (dimmed) and tool invocations (name only).
      for (const block of message.message.content) {
        if (block.type === 'text' && block.text?.trim()) {
          console.log(styles.dim(block.text.trim()))
        } else if (block.type === 'tool_use') {
          console.log(`${styles.brand('›')} ${styles.bold(block.name)}`)
        }
      }
    } else if (message.type === 'result') {
      if (message.subtype && message.subtype !== 'success') {
        const err = new Error(`Agent result subtype=${message.subtype}${message.error?.message ? ': ' + message.error.message : ''}`)
        // Attach raw details so callers can inspect the failure.
        err.subtype = message.subtype
        err.result = message
        throw err
      }
      // Success — the stream is finished.
      return
    }
  }
}
|
|
638
|
+
|
|
639
|
+
/**
 * Build a `canUseTool` guard that confines agent writes to `stagingDir`.
 * Read-only tools always pass. Write tools must provide a `file_path` that
 * resolves inside the staging directory (relative paths are resolved against
 * it); anything else is denied with an explanatory message.
 */
function makeStagingGuard(stagingDir) {
  const root = path.resolve(stagingDir)
  const writeTools = new Set(['Write', 'Edit', 'NotebookEdit', 'MultiEdit'])

  // A path is "inside" when its path relative to root is non-empty, doesn't
  // climb out via "..", and isn't absolute (Windows cross-drive case).
  const isInsideRoot = (absolute) => {
    const rel = path.relative(root, absolute)
    return Boolean(rel) && !rel.startsWith('..') && !path.isAbsolute(rel)
  }

  return async (toolName, input) => {
    if (!writeTools.has(toolName)) return { behavior: 'allow' }

    const filePath = input?.file_path
    if (typeof filePath !== 'string' || filePath === '') {
      return { behavior: 'deny', message: `${toolName}: missing file_path` }
    }

    const absolute = path.isAbsolute(filePath) ? path.resolve(filePath) : path.resolve(root, filePath)
    if (!isInsideRoot(absolute)) {
      styles.warning(`Blocked ${toolName} outside staging: ${filePath}`)
      return {
        behavior: 'deny',
        message: `Writes must stay inside the staging directory ${root}. Refused: ${filePath}`,
      }
    }
    return { behavior: 'allow' }
  }
}
|
|
662
|
+
|
|
663
|
+
/**
 * Hand off to the dev command so the user can preview the staged docs. Reuses
 * the currently-running CLI binary so fixes to the dev server ship immediately
 * to users already on this version (and so local development doesn't need a
 * publish to test). Falls back to `npx @readme/cli` if we can't locate
 * ourselves. Stdout is piped so we can detect the server's URL and open it.
 */
function runDevPreview(stagingDir) {
  return new Promise((resolve, reject) => {
    // Locate our own entry script; when missing, run the published CLI.
    const ownBin = process.argv[1] && fs.existsSync(process.argv[1]) ? process.argv[1] : null
    const [command, commandArgs] = ownBin
      ? [process.execPath, [ownBin, 'dev', '--no-check']]
      : ['npx', ['--yes', '@readme/cli', 'dev', '--no-check']]

    const child = spawn(command, commandArgs, {
      cwd: stagingDir,
      stdio: ['inherit', 'pipe', 'inherit'],
    })

    // Mirror the child's stdout while watching (once) for the local URL.
    let browserOpened = false
    child.stdout.on('data', (chunk) => {
      process.stdout.write(chunk)
      if (browserOpened) return
      const urlMatch = chunk.toString().match(/https?:\/\/localhost:\d+/)
      if (urlMatch) {
        browserOpened = true
        openUrl(urlMatch[0])
      }
    })

    child.on('close', () => resolve())
    child.on('error', reject)
  })
}
|
|
694
|
+
|
|
695
|
+
/** Best-effort: open `url` in the platform's default browser; never throws. */
function openUrl(url) {
  let cmd
  let args
  if (process.platform === 'darwin') {
    cmd = 'open'
    args = [url]
  } else if (process.platform === 'win32') {
    // `start` is a cmd built-in; the empty "" is its window-title argument.
    cmd = 'cmd'
    args = ['/c', 'start', '""', url]
  } else {
    cmd = 'xdg-open'
    args = [url]
  }
  try {
    spawn(cmd, args, { stdio: 'ignore', detached: true }).unref()
  } catch {
    // Best-effort — the URL is still in the terminal output for the user.
  }
}
|
|
704
|
+
|
|
705
|
+
/**
 * Recursively list files under `dir` as `/`-joined paths relative to `dir`,
 * skipping dot-prefixed files and directories. Returns [] when `dir` does
 * not exist. `prefix` is internal recursion state — callers omit it.
 */
function listFiles(dir, prefix = '') {
  if (!fs.existsSync(dir)) return []

  const found = []
  for (const entry of fs.readdirSync(dir, { withFileTypes: true })) {
    if (entry.name.startsWith('.')) continue
    const relName = prefix ? `${prefix}/${entry.name}` : entry.name
    if (entry.isDirectory()) {
      for (const nested of listFiles(path.join(dir, entry.name), relName)) {
        found.push(nested)
      }
    } else {
      found.push(relName)
    }
  }
  return found
}
|
|
719
|
+
|
|
720
|
+
/**
 * Zip the contents of `sourceDir` into `outputZip` via the system `zip`
 * binary, replacing any existing archive at that path. Resolves on exit
 * code 0; rejects if the spawn fails or zip exits non-zero.
 */
function createZip(sourceDir, outputZip) {
  fs.mkdirSync(path.dirname(outputZip), { recursive: true })
  if (fs.existsSync(outputZip)) fs.rmSync(outputZip)

  return new Promise((resolve, reject) => {
    // -r: recurse into directories; -q: quiet (we inherit stdio anyway).
    const zipProc = spawn('zip', ['-r', '-q', outputZip, '.'], {
      cwd: sourceDir,
      stdio: 'inherit',
    })
    zipProc.on('error', reject)
    zipProc.on('close', (exitCode) => {
      if (exitCode === 0) {
        resolve()
      } else {
        reject(new Error(`zip exited with code ${exitCode}`))
      }
    })
  })
}
|
|
736
|
+
|
|
737
|
+
/**
|
|
738
|
+
* Ask Claude to fold a parsed llms.txt into a category/hierarchy JSON object.
|
|
739
|
+
* Returns { title, categories: [{ title, icon, pages: [{ title, url, description }] }] }.
|
|
740
|
+
*
|
|
741
|
+
* If the input already has sections, Claude is told to use them as starting
|
|
742
|
+
* points and refine. If sections are missing or obviously generic ("Resources",
|
|
743
|
+
* "English"), it invents better ones. Every category gets a FontAwesome icon.
|
|
744
|
+
*/
|
|
745
|
+
/**
 * Mintlify sites ship the canonical sidebar in `docs.json` (v2) or `mint.json`
 * (v1) at the origin root. When present, it's a perfect structural source —
 * no HTML parsing needed. Pages are listed by slug; we enrich titles from
 * llms.txt where available, otherwise fall back to a title derived from the
 * slug.
 *
 * @param {string} sourceUrl - any URL on the target site; only its origin is used.
 * @param {Array<object>} knownPages - pages from llms.txt, used to enrich titles.
 * @param {string|undefined} firecrawlKey - when set, fetch through Firecrawl.
 * @returns {Promise<object|null>} { source, title, categories } or null if no
 *   Mintlify config is found or parseable.
 */
async function tryMintlifyNav(sourceUrl, knownPages, firecrawlKey) {
  const origin = new URL(sourceUrl).origin
  const fetchHtml = firecrawlKey ? makeFirecrawlFetcher(firecrawlKey) : fetchHtmlDirect

  // Index known pages by normalized pathname so config slugs can be enriched
  // with real titles/descriptions.
  const byPath = new Map()
  for (const p of knownPages) byPath.set(normalizePath(p.url), p)

  for (const filename of ['docs.json', 'mint.json']) {
    // BUG FIX: this previously built `${origin}/$(unknown)` — a literal
    // string (`$(...)` is not template interpolation), so neither config file
    // was ever actually requested. Interpolate the candidate filename.
    const configUrl = `${origin}/${filename}`
    const body = await fetchHtml(configUrl)
    if (!body) continue
    const config = extractMintlifyConfig(body)
    if (!config || !config.navigation) continue
    const parsed = parseMintlifyConfig(config, origin, byPath)
    if (parsed.categories.length > 0) {
      return { source: configUrl, title: parsed.title, categories: parsed.categories }
    }
  }
  return null
}
|
|
775
|
+
|
|
776
|
+
/**
 * Extract a Mintlify config object from a fetched body. Handles:
 * - raw JSON (`{ "navigation": ... }`)
 * - JSON wrapped in HTML (Firecrawl sometimes returns `<pre>...</pre>` or
 *   a formatted view of the JSON body)
 *
 * Returns the parsed object (whole-body JSON is returned as-is; embedded
 * candidates must contain a truthy `navigation` key) or null.
 */
function extractMintlifyConfig(body) {
  // Fast path: the body is the JSON document itself.
  try {
    return JSON.parse(body)
  } catch {
    // Not bare JSON — fall through to the embedded-object scan.
  }

  // Pull out the first balanced `{ ... }` that contains a "navigation" key:
  // for each `{` at or before the key (scanning right-to-left), try every
  // closing `}` after it (widest first) until some slice parses.
  const navIdx = body.indexOf('"navigation"')
  if (navIdx === -1) return null

  let start = body.lastIndexOf('{', navIdx)
  while (start !== -1) {
    for (let end = body.lastIndexOf('}'); end > start; end = body.lastIndexOf('}', end - 1)) {
      const candidate = body.slice(start, end + 1)
      try {
        const parsed = JSON.parse(candidate)
        if (parsed && parsed.navigation) return parsed
      } catch {
        // Not a parseable slice — keep shrinking.
      }
    }
    // BUG FIX: `lastIndexOf('{', start - 1)` with start === 0 clamps the
    // fromIndex to 0 and finds the SAME brace again, spinning forever on
    // malformed bodies whose earliest `{` is at index 0. Stop after trying it.
    if (start === 0) break
    start = body.lastIndexOf('{', start - 1)
  }
  return null
}
|
|
803
|
+
|
|
804
|
+
/**
 * Convert a Mintlify config's `navigation` into our { title, categories }
 * shape. Handles the v1 form (navigation is an array of groups) and the v2
 * forms (navigation.tabs[].groups[], navigation.groups[], navigation.pages[]).
 * Page slugs become `${origin}/<slug>.md` URLs, enriched with titles and
 * descriptions from `byPath` (known llms.txt pages keyed by normalized path)
 * when available, else a title is derived from the slug.
 *
 * @param {object} config - parsed docs.json / mint.json object.
 * @param {string} origin - site origin used to build absolute page URLs.
 * @param {Map<string, object>} byPath - normalized path → known page.
 * @returns {{ title: string|null, categories: Array<object> }}
 */
function parseMintlifyConfig(config, origin, byPath) {
  const title = config.name || null
  const categories = []

  // "getting-started/quick_start" → "Quick Start".
  const slugToTitle = (slug) => {
    const base = String(slug).split('/').pop() || slug
    return base
      .split(/[-_]/)
      .filter(Boolean)
      .map((w) => w.charAt(0).toUpperCase() + w.slice(1))
      .join(' ')
  }

  // Convert one nav entry (string slug, nested group, or { page } object)
  // into a page record; returns null for unrecognized shapes so callers can
  // filter(Boolean).
  const pageToEntry = (p) => {
    // Simple slug string → leaf page
    if (typeof p === 'string') {
      const slug = p.replace(/^\//, '')
      const url = `${origin}/${slug}.md`
      const known = byPath.get(normalizePath(url))
      return {
        title: known?.title || slugToTitle(slug),
        url,
        ...(known?.description ? { description: known.description } : {}),
      }
    }
    if (p && typeof p === 'object') {
      // Nested group: recurse
      if (p.group && Array.isArray(p.pages)) {
        return {
          title: p.group,
          url: null,
          pages: p.pages.map(pageToEntry).filter(Boolean),
        }
      }
      // Some v2 shapes: { page: "slug", title?: "..." }
      if (typeof p.page === 'string') {
        const slug = p.page.replace(/^\//, '')
        const url = `${origin}/${slug}.md`
        const known = byPath.get(normalizePath(url))
        return {
          title: p.title || known?.title || slugToTitle(slug),
          url,
          ...(known?.description ? { description: known.description } : {}),
        }
      }
    }
    return null
  }

  // Append a top-level group as a category, skipping empty/invalid groups.
  const pushGroup = (group) => {
    if (!group || !Array.isArray(group.pages)) return
    const pages = group.pages.map(pageToEntry).filter(Boolean)
    if (pages.length === 0) return
    categories.push({ title: group.group || 'Untitled', pages })
  }

  const nav = config.navigation
  // v1: navigation is an array of groups
  if (Array.isArray(nav)) {
    for (const g of nav) pushGroup(g)
  }
  // v2: navigation.tabs[].groups[]
  else if (Array.isArray(nav?.tabs)) {
    for (const tab of nav.tabs) {
      for (const g of tab.groups || []) pushGroup(g)
    }
  }
  // v2: navigation.groups[] (no tabs)
  else if (Array.isArray(nav?.groups)) {
    for (const g of nav.groups) pushGroup(g)
  }
  // v2: navigation.pages[] (flat)
  else if (Array.isArray(nav?.pages)) {
    const pages = nav.pages.map(pageToEntry).filter(Boolean)
    if (pages.length > 0) categories.push({ title: 'Documentation', pages })
  }

  return { title, categories }
}
|
|
883
|
+
|
|
884
|
+
/**
 * Score a parsed nav tree for "sidebar-likeness". A real docs sidebar has
 * multiple section headers (hierarchy) and tens of links; secondary navs
 * (sitemaps, footers, search-index result lists) tend to be flat single-
 * category blobs that are either alphabetized or dumped in insertion order.
 *
 * Flat single-category blocks are penalized so a slightly smaller hierarchical
 * block wins over a bigger flat one. Without this, greenflash.ai's sidebar
 * order was non-deterministic across runs — sometimes Quickstart first (real
 * sidebar won), sometimes last (a flat alphabetized index block won).
 */
function scoreNavTree(tree) {
  let linkCount = 0
  for (const category of tree.categories) linkCount += category.pages.length
  const categoryCount = tree.categories.length
  const hierarchyBonus = categoryCount >= 2 ? categoryCount * 5 : -20

  // Alphabetical penalty: real sidebars are curated (Overview first, related
  // topics grouped). Auto-generated indexes, search clouds, and footer link
  // lists tend to be strictly alphabetized. When a category's pages come in
  // alpha order it's almost always noise masquerading as structure.
  let alphaPenalty = 0
  for (const category of tree.categories) {
    const loweredTitles = (category.pages || []).map((page) => (page.title || '').toLowerCase())
    if (loweredTitles.length >= 3 && isMonotonicAlpha(loweredTitles)) alphaPenalty -= 15
  }

  return linkCount + hierarchyBonus + alphaPenalty
}

/** True when `titles` is already in non-descending lexicographic order. */
function isMonotonicAlpha(titles) {
  return titles.every((t, i) => i === 0 || titles[i - 1] <= t)
}
|
|
917
|
+
|
|
918
|
+
/**
 * Fetch the source URL, find the `<nav>` or `<aside>` that contains the most
 * links matching our known llms.txt URLs, and extract its heading/link
 * structure into { title, categories: [{ title, pages: [...] }] }.
 *
 * Generic approach — no site-specific selectors. Works on any docs site that
 * renders its sidebar server-side as <nav>/<aside> with <h*> section headers.
 * Returns null if coverage is too low to be useful.
 *
 * @param {string} sourceUrl - docs URL to start from.
 * @param {Array<object>} knownPages - pages from llms.txt; empty array means
 *   "discovery mode" (no trusted page list, looser accept thresholds).
 * @param {string|undefined} firecrawlKey - when set, fetch through Firecrawl.
 * @returns {Promise<object|null>} { title: null, categories } or null.
 */
async function scrapeNavFromSite(sourceUrl, knownPages, firecrawlKey) {
  // Index known pages by normalized pathname so we can match nav hrefs against them.
  const byPath = new Map()
  for (const p of knownPages) byPath.set(normalizePath(p.url), p)

  const fetchHtml = firecrawlKey ? makeFirecrawlFetcher(firecrawlKey) : fetchHtmlDirect

  // Shared mutable state across visits (closure-captured by visit/mergePages):
  const visited = new Set() // normalized URLs already fetched
  const matched = new Map() // normalizedPath → page
  const placed = new Set() // URLs already placed into some category (prevents cross-category duplication when round-1 visits reshape the tree)
  const categoryByTitle = new Map()
  const categoryOrder = [] // insertion-ordered category records

  // Fetch one URL, pick its best nav block, and merge anything new into our
  // running tree. Each page on a typical docs site renders the full sidebar
  // with its own branch expanded, so repeated visits into different branches
  // accumulate coverage. Returns the number of newly-added unique pages.
  async function visit(url) {
    const vkey = normalizePath(url)
    if (visited.has(vkey)) return 0
    visited.add(vkey)

    const html = await fetchHtml(url)
    if (!html) return 0

    const base = new URL(url)
    let best = { score: -Infinity, count: 0, tree: null }

    // Tier 1: <nav>/<aside> elements — semantic markup, almost always the real sidebar.
    // NOTE: fresh regex literal per visit, so /g lastIndex state can't leak.
    const blockRegex = /<(nav|aside)\b[^>]*>([\s\S]*?)<\/\1>/gi
    let m
    while ((m = blockRegex.exec(html)) !== null) {
      const tree = parseNavBlock(m[2], base, byPath)
      const score = scoreNavTree(tree)
      if (score > best.score) {
        const count = tree.categories.reduce((n, c) => n + c.pages.length, 0)
        best = { score, count, tree }
      }
    }

    // Tier 2: <div>/<ul>/<section> containers whose attributes look sidebar-shaped
    // (id="sidebar-group", class*="sidebar", role="navigation", etc.). Catches
    // Mintlify-style stacks (greenflash.ai, mintlify.com clones) where the sidebar
    // lives in a plain <div>. We need balanced-tag extraction since divs nest.
    for (const block of extractSidebarContainers(html)) {
      const tree = parseNavBlock(block, base, byPath)
      const score = scoreNavTree(tree)
      if (score > best.score) {
        const count = tree.categories.reduce((n, c) => n + c.pages.length, 0)
        best = { score, count, tree }
      }
    }

    // Tier 3 (last resort): parse the whole document, filter out noise. Risky —
    // alphabetical link clusters elsewhere on the page (TOCs, footers, indexes)
    // can pollute the result. Only used when earlier tiers didn't find enough.
    if (best.count < 10) {
      const wholeTree = parseNavBlock(html, base, byPath)
      const seen = new Set()
      const filtered = []
      for (const cat of wholeTree.categories) {
        const keptPages = filterDedupePages(cat.pages, seen)
        // Single-link "categories" are almost never real sections — drop them.
        if (keptPages.length >= 2) filtered.push({ ...cat, pages: keptPages })
      }
      const filteredCount = filtered.reduce((n, c) => n + c.pages.length, 0)
      const filteredTree = { title: null, categories: filtered }
      const filteredScore = scoreNavTree(filteredTree)
      // Compare on score, not count — a noisy 20-link alphabetical cluster
      // shouldn't replace a clean 8-link real sidebar.
      if (filteredScore > best.score) {
        best = { score: filteredScore, count: filteredCount, tree: filteredTree }
      }
    }

    if (!best.tree) return 0

    // Fold this page's best tree into the accumulated category structure.
    let added = 0
    for (const cat of best.tree.categories) {
      let existing = categoryByTitle.get(cat.title)
      if (!existing) {
        existing = { title: cat.title, pages: [] }
        categoryByTitle.set(cat.title, existing)
        categoryOrder.push(existing)
      }
      added += mergePages(cat.pages, existing, matched)
    }
    return added
  }

  /**
   * Merge `incoming` page tree into `target.pages`, recursing into sub-pages.
   * If a page already exists under `target`, we recurse into it to add any
   * newly-discovered children. If a page is globally `placed` under a
   * different category, we skip it — round-1 visits often reshape the tree
   * and we don't want the same URL to appear in multiple categories.
   * Returns the number of newly-added unique pages across the entire sub-tree.
   */
  function mergePages(incoming, target, matched) {
    let added = 0
    for (const page of incoming) {
      const norm = normalizePath(page.url)
      // Match by exact URL first, then by normalized path.
      let existing = target.pages.find((p) => p.url === page.url) || target.pages.find((p) => normalizePath(p.url) === norm)
      if (!existing) {
        // Already lives in a different category — don't add here.
        if (placed.has(norm)) {
          if (page.pages && page.pages.length > 0) {
            // Still merge its children into wherever the canonical page lives.
            const canonical = matched.get(norm)
            if (canonical) added += mergePages(page.pages, canonical, matched)
          }
          continue
        }
        existing = {
          title: page.title,
          url: page.url,
          ...(page.description ? { description: page.description } : {}),
          pages: [],
        }
        target.pages.push(existing)
        placed.add(norm)
        if (!matched.has(norm)) {
          matched.set(norm, existing)
          added++
        }
      }
      if (page.pages && page.pages.length > 0) {
        added += mergePages(page.pages, existing, matched)
      }
    }
    return added
  }

  // Round 0: the source URL itself — reveals top-level + the source page's branch.
  const r0Start = Date.now()
  await visit(sourceUrl)
  const r0Ms = Date.now() - r0Start
  if (categoryOrder.length === 0) return null

  // Round 1 (parallel): visit pages so each branch has a chance to expose
  // its sub-items. Sidebars on most docs sites auto-expand the current
  // page's own branch on render, so visiting a rep per branch is what
  // surfaces those hidden children.
  //
  // With llms.txt (full mode) we already have a trusted page list and the
  // scrape has real category headers — one rep per scraped category is
  // enough. Without llms.txt (discovery mode) everything may have collapsed
  // into a single flat category and we don't yet know which of those pages
  // is a parent; visit all of them up to a cap.
  const isDiscovery = knownPages.length === 0
  const MAX_DISCOVERY_FETCHES = 20
  const r1Urls = isDiscovery
    ? flattenTree(categoryOrder).slice(0, MAX_DISCOVERY_FETCHES)
    : categoryOrder
        .map((c) => c.pages[0])
        .filter(Boolean)
        .map((p) => toBrowsableUrl(p.url))
  const r1Start = Date.now()
  // Firecrawl standard-plan concurrency is 10; 5 leaves headroom for retries.
  // Native HTTP can run hotter since we're hitting our own loopback.
  await visitAllInParallel(r1Urls, visit, firecrawlKey ? 5 : 10)
  const r1Ms = Date.now() - r1Start
  console.log(
    styles.dim(` ⏱ scrape breakdown: round0=${formatDuration(r0Ms)} round1=${formatDuration(r1Ms)} (${r1Urls.length} ${isDiscovery ? 'discovery' : 'category rep'} fetches)`),
  )

  // Accept thresholds — looser in discovery mode (no llms.txt) where even a
  // single flat "Overview" bucket is better than nothing, stricter when we
  // have llms.txt to compare against.
  const categories = categoryOrder.filter((c) => c.pages.length > 0)
  if (isDiscovery) {
    if (categories.length < 1 || matched.size < 5) return null
  } else {
    if (categories.length < 2 || matched.size < 10) return null
  }
  return { title: null, categories }
}
|
|
1103
|
+
|
|
1104
|
+
/**
 * (Documents `parseNavBlock`, defined further below — this comment is
 * displaced from its function.) Walk a nav block in document order, splitting
 * links into categories at each <h*> heading. Links whose hrefs resolve to a
 * known page land in the current category (or a leading "Overview" bucket if
 * they appear before any heading).
 */
|
|
1109
|
+
/**
|
|
1110
|
+
* Find sidebar-shaped container elements (<div>/<ul>/<section>) in `html` and
|
|
1111
|
+
* return their inner HTML. Looks for tag attributes like id="sidebar*",
|
|
1112
|
+
* class*="sidebar", role="navigation", aria-label*="navigation". Uses
|
|
1113
|
+
* balanced-tag walking so nested elements with the same tag don't break the
|
|
1114
|
+
* boundaries.
|
|
1115
|
+
*/
|
|
1116
|
+
/**
 * Scan `html` for container elements (<div>/<ul>/<section>) whose attributes
 * look sidebar-shaped — id/class/aria-label/data-testid containing "sidebar"
 * or "navigation", or role="navigation" — and return each container's inner
 * HTML. Balanced-tag walking (via extractBalancedTag) keeps nested same-tag
 * elements from truncating a container early; containers with 100 chars or
 * less of inner HTML are discarded as noise.
 */
function extractSidebarContainers(html) {
  const CONTAINER_TAGS = ['div', 'ul', 'section']
  // Case-insensitive partial matches on id/class/etc. so "sidebar-group",
  // "sidebarNav", "DocsSidebar__container" and friends all qualify.
  const SIDEBAR_ATTR_RE =
    /\b(?:id|class|aria-label|data-testid)=(?:"[^"]*sidebar[^"]*"|'[^']*sidebar[^']*'|"[^"]*navigation[^"]*"|'[^']*navigation[^']*')|\brole=(?:"navigation"|'navigation')/i

  const containers = []
  for (const tag of CONTAINER_TAGS) {
    const openTagRe = new RegExp(`<${tag}\\b([^>]*)>`, 'gi')
    for (let match = openTagRe.exec(html); match !== null; match = openTagRe.exec(html)) {
      // Only the attribute string of the opening tag decides sidebar-ness.
      if (!SIDEBAR_ATTR_RE.test(match[1])) continue
      const body = extractBalancedTag(html, tag, match.index + match[0].length)
      if (body != null && body.length > 100) containers.push(body)
    }
  }
  return containers
}
|
|
1136
|
+
|
|
1137
|
+
/**
|
|
1138
|
+
* Starting at `startIdx` (just past an opening `<tag …>`), walk forward
|
|
1139
|
+
* through `html` tracking nested opens/closes of the same tag. Returns the
|
|
1140
|
+
* inner HTML up to the matching close tag, or null if unbalanced.
|
|
1141
|
+
*
|
|
1142
|
+
* Generous about case and whitespace; void-element rules are NOT applied
|
|
1143
|
+
* (we only call this for non-void containers: div/ul/section).
|
|
1144
|
+
*/
|
|
1145
|
+
/**
 * Given `startIdx` pointing just past an opening `<tag …>`, scan forward
 * through `html`, counting nested opens and closes of the same tag. Returns
 * the inner HTML up to the matching close tag, or null when no balanced
 * close is found (or the scan exceeds a ~1.5 MB safety cap).
 *
 * Case-insensitive and whitespace-tolerant; void-element rules are NOT
 * applied, which is fine for the non-void containers this is used with
 * (div/ul/section).
 */
function extractBalancedTag(html, tag, startIdx) {
  const tagRe = new RegExp(`<(/?)${tag}\\b[^>]*>`, 'gi')
  tagRe.lastIndex = startIdx
  let depth = 1
  for (let match = tagRe.exec(html); match !== null; match = tagRe.exec(html)) {
    if (match[1] === '/') {
      depth -= 1
      if (depth === 0) return html.slice(startIdx, match.index)
    } else {
      depth += 1
    }
    if (tagRe.lastIndex - startIdx > 1_500_000) return null // runaway-input guard
  }
  return null
}
|
|
1161
|
+
|
|
1162
|
+
/**
 * Parse a single nav/sidebar HTML block into categorized page lists.
 * Returns { title: null, categories } where each category is
 * { title, pages } and pages can nest children under their own `pages`.
 *
 * @param {string} blockHtml - inner HTML of the nav container to tokenize
 * @param {string|URL} base - base URL for resolving relative hrefs
 * @param {Map} byPath - normalizedPath → known page (populated from
 *   llms.txt); when empty we are in discovery mode and synthesize page
 *   entries straight from the links themselves
 */
function parseNavBlock(blockHtml, base, byPath) {
  // Five alternatives, carefully ordered for regex semantics:
  // 1. <h*>…</h*> — classic heading
  // 2. <a href="…">…</a> — link (matched greedily as a unit, so inner <p>
  //    tags inside link text are consumed and NOT treated as headings)
  // 3. <p>…</p> — bare paragraph used as section heading by
  //    fumadocs (zod.dev) and similar frameworks
  // 4. <ul …> — start of nested list → subsequent <a>s are
  //    children of the most recently emitted <a> at the outer level
  // 5. </ul> — close of nested list → pop parent stack
  const tokenRegex = /<h([1-6])\b[^>]*>([\s\S]*?)<\/h\1>|<a\b[^>]*\bhref="([^"]+)"[^>]*>([\s\S]*?)<\/a>|<p\b[^>]*>([\s\S]*?)<\/p>|<ul\b[^>]*>|<\/ul>/gi

  const categories = []
  // Category currently receiving links (set by the latest heading token).
  let current = null
  // Lazily-created "Overview" bucket for links seen before any heading.
  let leading = null
  // Stack of parent page objects. When inside a nested <ul>, new links attach
  // to the top of the stack (= the <a> that preceded the opening <ul>).
  const parentStack = []
  // The most recent link we emitted, at the current depth. Becomes the parent
  // if a <ul> opens next.
  let lastLinkAtDepth = null

  const resetCategoryState = () => {
    parentStack.length = 0
    lastLinkAtDepth = null
  }

  let m
  while ((m = tokenRegex.exec(blockHtml)) !== null) {
    const token = m[0]

    // Nested-list close: step back up one nesting level.
    if (/^<\/ul\b/i.test(token)) {
      parentStack.pop()
      lastLinkAtDepth = null
      continue
    }
    if (/^<ul\b/i.test(token)) {
      // A <ul> opening right after a link means that link becomes a parent.
      if (lastLinkAtDepth) parentStack.push(lastLinkAtDepth)
      lastLinkAtDepth = null
      continue
    }

    // Heading token (m[1] = heading level digit): start a new category.
    if (m[1]) {
      const title = stripTags(m[2]).trim()
      if (!title) continue
      current = { title, pages: [] }
      categories.push(current)
      resetCategoryState()
      continue
    }
    // Paragraph token used as a heading. Reject long or sentence-like text
    // (trailing punctuation) so body copy doesn't become a category title.
    if (m[5] !== undefined) {
      const title = stripTags(m[5]).trim()
      if (!title || title.length > 60 || /[.!?]\s*$/.test(title)) continue
      current = { title, pages: [] }
      categories.push(current)
      resetCategoryState()
      continue
    }

    // Link.
    const href = m[3]
    if (!href || href.startsWith('#') || href.startsWith('mailto:') || href.startsWith('javascript:')) continue
    let abs
    try {
      abs = new URL(href, base).toString()
    } catch {
      continue
    }

    // byPath is populated from llms.txt. When it's empty we're in discovery
    // mode — fall back to synthesizing a page entry from the link itself,
    // filtered to same-origin non-asset URLs so we don't slurp every footer,
    // social, or static file link on the page.
    let page = byPath.size > 0 ? byPath.get(normalizePath(abs)) : null
    if (!page && byPath.size === 0) {
      if (!isDiscoverableLink(abs, base)) continue
      const text = stripTags(m[4] || '').trim()
      if (!text || text.length > 150) continue
      page = { title: text, url: abs }
    }
    if (!page) continue

    // Attachment point: innermost open <ul> parent, else the current
    // category, else the lazily-created leading "Overview" bucket.
    const parent = parentStack.length > 0 ? parentStack[parentStack.length - 1] : current || leading || (leading = { title: 'Overview', pages: [] })
    if (leading && parent === leading && categories[0] !== leading) categories.unshift(leading)

    // Re-encountered URL: reuse the existing node so a following <ul> still
    // nests under it instead of creating a duplicate entry.
    const existing = parent.pages.find((p) => p.url === page.url)
    if (existing) {
      lastLinkAtDepth = existing
    } else {
      const newPage = {
        title: page.title,
        url: page.url,
        ...(page.description ? { description: page.description } : {}),
        pages: [],
      }
      parent.pages.push(newPage)
      lastLinkAtDepth = newPage
    }
  }

  // Headings that collected no resolvable links are dropped.
  return { title: null, categories: categories.filter((c) => c.pages.length > 0) }
}
|
|
1265
|
+
|
|
1266
|
+
/**
|
|
1267
|
+
* Partition orphans into pages under an API-Reference-style URL prefix
|
|
1268
|
+
* (`/api-reference/*`, `/api/*`, `/reference/*`) and everything else.
|
|
1269
|
+
*
|
|
1270
|
+
* API pages are collapsed into a single `{ title: "API Reference", pages: [] }`
|
|
1271
|
+
* category so `routeCategory()` sends them to ReadMe's `reference/` top-level
|
|
1272
|
+
* dir as a tab of their own, matching how Mintlify/Stripe/etc. structure
|
|
1273
|
+
* these docs. Any pre-existing "API Reference"-titled category on `scraped`
|
|
1274
|
+
* (including variants with zero-width prefixes from stray DOM subtrees) is
|
|
1275
|
+
* merged in and removed from `scraped.categories`.
|
|
1276
|
+
*
|
|
1277
|
+
* Returns { category, nonApiOrphans, mergedScrapedTitles }.
|
|
1278
|
+
*/
|
|
1279
|
+
/**
 * Partition `orphans` into API-reference pages (URL path starting with
 * /api-reference, /api, or /reference) and everything else.
 *
 * API pages are gathered into a single `{ title: 'API Reference', pages }`
 * category so `routeCategory()` sends them to ReadMe's `reference/` top-level
 * dir as a tab of their own. Any scraped category whose cleaned title already
 * reads as "API Reference"/"Reference" (including variants with zero-width
 * prefixes from stray DOM subtrees) is merged in and removed from
 * `scraped.categories` (mutated in place).
 *
 * Pages are nested one level by the resource segment after the API prefix
 * (e.g. /api-reference/users/list → group "users"); pages with no resource
 * segment stay top-level. First-encountered order is preserved for both
 * groups and their pages. URLs ending in .json/.yaml/.yml are skipped as
 * spec assets, and duplicate URLs are dropped.
 *
 * @returns {{category: ?object, nonApiOrphans: object[], mergedScrapedTitles: string[]}}
 */
function collectApiReferencePages(orphans, scraped) {
  const REF_PREFIX_RE = /^\/(api[-_]?reference|api|reference)(\/|$)/i
  const hasApiPrefix = (url) => {
    try {
      return REF_PREFIX_RE.test(new URL(url).pathname)
    } catch {
      return false
    }
  }
  // Strip zero-width/invisible characters before title comparison.
  const cleanTitle = (t) => (t || '').replace(/[\u200B-\u200F\uFEFF]/g, '').trim()
  const isApiCategoryTitle = (t) => /^(api[ -]?reference|reference)$/i.test(cleanTitle(t))

  const refPages = []
  const dedupe = new Set()
  const addRefPage = (p) => {
    if (!p || !p.url) return
    // Spec assets (e.g. /api-reference/openapi.json) are not doc pages.
    if (/\.(json|yaml|yml)$/i.test(p.url)) return
    if (dedupe.has(p.url)) return
    dedupe.add(p.url)
    refPages.push(p)
  }

  // Absorb any scraped category that already looks like API Reference —
  // even (especially) a partial, DOM-polluted one — and drop it from scraped.
  const mergedScrapedTitles = []
  scraped.categories = scraped.categories.filter((cat) => {
    if (!isApiCategoryTitle(cat.title)) return true
    mergedScrapedTitles.push(cat.title)
    for (const p of cat.pages || []) addRefPage(p)
    return false
  })

  const nonApiOrphans = []
  for (const p of orphans) {
    if (hasApiPrefix(p.url)) addRefPage(p)
    else nonApiOrphans.push(p)
  }

  if (refPages.length === 0) {
    return { category: null, nonApiOrphans, mergedScrapedTitles }
  }

  // Group by the resource segment right after the API prefix:
  //   /api-reference/analytics/get-interaction → group "analytics"
  //   /api-reference/users/list-users          → group "users"
  //   /api-reference/overview                  → top-level (no resource)
  const groupOrder = []
  const groupPages = new Map()
  const topLevel = []
  for (const p of refPages) {
    let segments = []
    try {
      segments = new URL(p.url).pathname.split('/').filter(Boolean)
    } catch {}
    // segments[0] is the api prefix itself; segments[1] (if any) is the resource.
    const resource = segments.length >= 3 ? segments[1] : null
    if (!resource) {
      topLevel.push(p)
      continue
    }
    if (!groupPages.has(resource)) {
      groupPages.set(resource, [])
      groupOrder.push(resource)
    }
    groupPages.get(resource).push(p)
  }

  // "list-users" / "list_users" → "List Users"
  const titleize = (slug) =>
    String(slug)
      .split(/[-_]/)
      .filter(Boolean)
      .map((w) => w.charAt(0).toUpperCase() + w.slice(1))
      .join(' ')

  const pages = [...topLevel]
  for (const resource of groupOrder) {
    pages.push({ title: titleize(resource), url: null, pages: groupPages.get(resource) })
  }

  return {
    category: { title: 'API Reference', pages },
    nonApiOrphans,
    mergedScrapedTitles,
  }
}
|
|
1377
|
+
|
|
1378
|
+
/**
|
|
1379
|
+
* Group orphans by the second-to-last URL path segment (the "type" segment,
|
|
1380
|
+
* e.g. `/main/docs/about` → "docs", `/main/reference/xyz` → "reference"). This
|
|
1381
|
+
* separates API reference, changelog, and general docs into their own top-
|
|
1382
|
+
* level categories instead of dumping them into one big "Other".
|
|
1383
|
+
*
|
|
1384
|
+
* Uses distinctive titles — just "Docs" or "Reference" — rather than the raw
|
|
1385
|
+
* segment name. Falls back to "Other" when the path has no usable type.
|
|
1386
|
+
*/
|
|
1387
|
+
/**
 * Bucket orphan pages by the "type" signal in their URL paths so API
 * reference, changelog, guides, etc. each land in their own top-level
 * category instead of one big "Other".
 *
 * Mutates matching pre-existing categories on `scraped` in place (orphans
 * are appended to them) and returns only the genuinely NEW buckets, sorted
 * by page count descending, for the caller to append.
 */
function bucketOrphansByPathType(orphans, scraped) {
  // Map a recognized path segment to a human-facing category title.
  const TYPE_TITLES = {
    reference: 'API Reference',
    api: 'API Reference',
    'api-reference': 'API Reference',
    api_reference: 'API Reference',
    endpoints: 'API Reference',
    endpoint: 'API Reference',
    changelog: 'Changelog',
    release: 'Release Notes',
    releases: 'Release Notes',
    'release-notes': 'Release Notes',
    recipes: 'Recipes',
    recipe: 'Recipes',
    guides: 'Guides',
    docs: 'Other Docs',
    doc: 'Other Docs',
  }
  // Strong top-level type segments. If ANY path segment matches, treat that
  // as the bucket type — `/api-reference/prompts/archive-prompt` belongs in
  // "API Reference", not in a sub-bucket called "Prompts".
  const STRONG_TYPE = /^(api[-_]?reference|endpoints?|changelog|release[-_]?notes?|releases)$/i
  // Segments that look like version/locale prefixes, not category types.
  // Walked from the end until we find a real category-type segment.
  const VERSION_LOCALE = /^(v?\d+(\.\d+)*|main|master|latest|stable|next|current|ent|enterprise|en|en-us|en_us|fr|de|es|ja|zh|ko|pt)$/i
  // Normalize title for merge-matching: strip invisible chars, lowercase, trim.
  // INVISIBLE_CHARS is a module-level regex not visible in this chunk —
  // presumably it matches zero-width/invisible characters; confirm at its
  // definition.
  const normTitle = (t) =>
    String(t || '')
      .replace(INVISIBLE_CHARS, '')
      .trim()
      .toLowerCase()

  // Map existing scraped categories by normalized title so we can merge
  // orphans into them instead of creating a parallel "(orphans)" bucket that
  // routes to the wrong top-level directory.
  const byNormTitle = new Map()
  for (const cat of scraped.categories) byNormTitle.set(normTitle(cat.title), cat)
  // Buckets we've created so later orphans can pile onto the same one.
  const newBuckets = new Map()

  for (const p of orphans) {
    let url
    try {
      url = new URL(p.url)
    } catch {
      // Unparseable URL — nothing sensible to bucket it by.
      continue
    }
    const pathname = url.pathname
    // Skip OAS spec endpoints — `/api-reference/openapi.json` etc. aren't
    // documentation pages and shouldn't become stubs.
    if (/\.(json|ya?ml)$/i.test(pathname)) continue

    const segs = pathname.split('/').filter(Boolean)
    let type = null
    // First: any strong top-level type anywhere in the path wins.
    for (const seg of segs) {
      if (STRONG_TYPE.test(seg)) {
        type = seg.toLowerCase()
        break
      }
    }
    // Fallback: walk the segment-before-slug backwards, skip version/locale.
    if (!type) {
      for (let i = segs.length - 2; i >= 0; i--) {
        if (!VERSION_LOCALE.test(segs[i])) {
          type = segs[i].toLowerCase()
          break
        }
      }
    }

    // Known types get their curated title; unknown ones are title-cased;
    // no usable type at all falls back to "Other".
    const rawTitle = type ? TYPE_TITLES[type] || titleCase(type) : 'Other'
    const key = normTitle(rawTitle)

    // Prefer an existing scraped category, then an already-created new
    // bucket, then make a fresh one (registered in both maps).
    let bucket = byNormTitle.get(key)
    if (!bucket) {
      bucket = newBuckets.get(key)
      if (!bucket) {
        bucket = { title: rawTitle, pages: [] }
        newBuckets.set(key, bucket)
        byNormTitle.set(key, bucket)
      }
    }

    bucket.pages.push({
      title: p.title,
      url: p.url,
      ...(p.description ? { description: p.description } : {}),
    })
  }

  // Existing scraped categories were mutated in place. Return only genuinely
  // new buckets for the caller to append.
  return Array.from(newBuckets.values()).sort((a, b) => b.pages.length - a.pages.length)
}
|
|
1482
|
+
|
|
1483
|
+
/**
 * Convert a slug-like string ("api-reference", "release_notes") into a
 * spaced, word-capitalized title ("Api Reference", "Release Notes").
 */
function titleCase(s) {
  const spaced = String(s).replace(/[-_]+/g, ' ')
  return spaced.replace(/\b\w/g, (ch) => ch.toUpperCase())
}
|
|
1488
|
+
|
|
1489
|
+
/**
|
|
1490
|
+
* Walk all scraped pages (including nested sub-pages) and move any whose URL
|
|
1491
|
+
* contains a strong reference segment (`/api-reference/`, `/endpoints/`, etc.)
|
|
1492
|
+
* into a single "API Reference" category. Some docs sites (e.g. greenflash.ai)
|
|
1493
|
+
* spotlight a handful of endpoints under "Developers" in the sidebar while the
|
|
1494
|
+
* bulk of endpoints live under a separate API Reference section — we favor
|
|
1495
|
+
* the URL-path signal over the sidebar placement so all reference pages land
|
|
1496
|
+
* together in `reference/` after staging.
|
|
1497
|
+
*
|
|
1498
|
+
* Returns the number of pages relocated.
|
|
1499
|
+
*/
|
|
1500
|
+
/**
 * Walk all scraped pages (including nested sub-pages) and move any whose URL
 * contains a strong reference segment (`/api-reference/`, `/endpoints/`,
 * etc.) into a single "API Reference" category. Some docs sites spotlight a
 * handful of endpoints under another section while the bulk live under a
 * separate API Reference — the URL-path signal wins over sidebar placement
 * so all reference pages land together in `reference/` after staging.
 *
 * Mutates `scraped` in place: relocated pages are flattened (sub-page trees
 * collapsed), duplicates already present in the reference category are not
 * re-added, and categories left empty by the relocation are dropped.
 *
 * @param {{categories: object[]}} scraped - scrape result; mutated in place
 * @returns {number} count of pages pulled out of other categories (pages
 *   skipped by the duplicate check are still counted)
 *
 * Fix vs. previous revision: the final `existedBefore ? collected.length :
 * collected.length` ternary returned the same value on both branches, so the
 * dead conditional and its unused `existedBefore` flag are removed.
 */
function reclassifyReferencePages(scraped) {
  const REFERENCE_SEGMENT = /^(api[-_]?reference|endpoints?)$/i
  // Strip invisible characters, collapse case — mirrors the merge-matching
  // normalization used elsewhere in this file.
  const normTitle = (t) =>
    String(t || '')
      .replace(INVISIBLE_CHARS, '')
      .trim()
      .toLowerCase()

  const looksLikeRefUrl = (url) => {
    try {
      const segs = new URL(url).pathname.split('/').filter(Boolean)
      return segs.some((s) => REFERENCE_SEGMENT.test(s))
    } catch {
      return false
    }
  }

  // Find the canonical API Reference category if one exists, so we don't
  // end up with duplicates; created lazily below only when needed.
  let refCat = scraped.categories.find((c) => /^(api[ -]?reference|reference|api|endpoints?)$/i.test(normTitle(c.title).replace(/\s+/g, ' ')))

  const collected = []
  const filterPages = (pages) => {
    const kept = []
    for (const p of pages || []) {
      if (looksLikeRefUrl(p.url)) {
        // Flatten sub-pages when relocating — API Reference is a flat list.
        collectFlat(p, collected)
        continue
      }
      if (p.pages && p.pages.length > 0) p.pages = filterPages(p.pages)
      kept.push(p)
    }
    return kept
  }

  // Never pull pages out of the reference category itself.
  for (const cat of scraped.categories) {
    if (cat === refCat) continue
    cat.pages = filterPages(cat.pages)
  }

  if (collected.length === 0) return 0

  if (!refCat) {
    refCat = { title: 'API Reference', pages: [] }
    scraped.categories.push(refCat)
  }
  // Dedupe against anything already in the reference category.
  const seen = new Set(refCat.pages.map((p) => normalizePath(p.url)))
  for (const p of collected) {
    const key = normalizePath(p.url)
    if (seen.has(key)) continue
    seen.add(key)
    refCat.pages.push(p)
  }

  // Drop now-empty categories (other than the reference one we may have just created).
  scraped.categories = scraped.categories.filter((c) => c === refCat || (c.pages && c.pages.length > 0))

  return collected.length
}
|
|
1564
|
+
|
|
1565
|
+
/**
 * Depth-first flatten `page` and every nested sub-page into `out` as flat
 * entries carrying only title/url (and description when present) — `pages`
 * arrays are dropped from the copies.
 */
function collectFlat(page, out) {
  const entry = { title: page.title, url: page.url }
  if (page.description) entry.description = page.description
  out.push(entry)
  for (const child of page.pages || []) collectFlat(child, out)
}
|
|
1575
|
+
|
|
1576
|
+
/**
|
|
1577
|
+
* The scraped nav only contains top-level items; subcategory pages sit behind
|
|
1578
|
+
* `>` chevrons and don't render on a cold fetch. For each llms.txt URL not
|
|
1579
|
+
* already in the scrape, find the scraped page whose URL path is the longest
|
|
1580
|
+
* ancestor of it, and drop the orphan into that page's category. Returns any
|
|
1581
|
+
* orphans that still have no ancestor match.
|
|
1582
|
+
*/
|
|
1583
|
+
/**
 * The scraped nav only contains top-level items; subcategory pages sit
 * behind `>` chevrons and don't render on a cold fetch. For each llms.txt
 * page not already in the scrape, find the scraped page whose URL path is
 * the longest ancestor of it and drop the orphan into that page's category
 * (mutating `scraped` in place). Returns the orphans with no ancestor match.
 */
function slotOrphansByPath(scraped, knownPages) {
  const matched = new Set()
  const pathToCategory = new Map() // normalized path → owning category
  for (const cat of scraped.categories) {
    for (const page of cat.pages) {
      const norm = normalizePath(page.url)
      matched.add(norm)
      pathToCategory.set(norm, cat)
    }
  }

  const orphans = []
  for (const candidate of knownPages) {
    const norm = normalizePath(candidate.url)
    if (matched.has(norm)) continue

    // Longest-ancestor-path match wins.
    let bestCat = null
    let bestLen = -1
    for (const [navPath, cat] of pathToCategory) {
      const isAncestor = navPath && (norm === navPath || norm.startsWith(`${navPath}/`))
      if (isAncestor && navPath.length > bestLen) {
        bestCat = cat
        bestLen = navPath.length
      }
    }

    if (!bestCat) {
      orphans.push(candidate)
      continue
    }
    const entry = { title: candidate.title, url: candidate.url }
    if (candidate.description) entry.description = candidate.description
    bestCat.pages.push(entry)
    matched.add(norm)
  }
  return orphans
}
|
|
1623
|
+
|
|
1624
|
+
/**
 * (Documents `flattenTree`, defined below after `filterDedupePages` — this
 * comment is displaced from its function.) Flatten a tree of categories →
 * pages → sub-pages into a linear list of URLs (depth-first, in-order).
 * Used to enumerate every URL we want to re-visit during round 1 of
 * scraping.
 */
|
|
1629
|
+
/**
|
|
1630
|
+
* Walk a tree of pages+sub-pages and drop any whose URL is already in `seen`.
|
|
1631
|
+
* First occurrence wins — this is used by the whole-body fallback to keep the
|
|
1632
|
+
* sidebar's first appearance of each page and drop duplicates that end up
|
|
1633
|
+
* under landing-page headings.
|
|
1634
|
+
*/
|
|
1635
|
+
/**
 * Walk a tree of pages + sub-pages and drop any whose normalized URL is
 * already in `seen` (which is updated as we go). First occurrence wins —
 * used by the whole-body fallback to keep the sidebar's first appearance of
 * each page and discard duplicates under landing-page headings.
 */
function filterDedupePages(pages, seen) {
  const result = []
  for (const page of pages) {
    const norm = normalizePath(page.url)
    if (seen.has(norm)) continue
    seen.add(norm)
    const dedupedChildren = page.pages ? filterDedupePages(page.pages, seen) : []
    result.push({ ...page, pages: dedupedChildren })
  }
  return result
}
|
|
1646
|
+
|
|
1647
|
+
/**
 * Flatten categories → pages → sub-pages into a depth-first, in-order list
 * of browsable URLs (each passed through toBrowsableUrl).
 */
function flattenTree(categories) {
  const urls = []
  const walk = (pages) => {
    for (const page of pages || []) {
      urls.push(toBrowsableUrl(page.url))
      if (page.pages && page.pages.length > 0) walk(page.pages)
    }
  }
  for (const category of categories || []) walk(category.pages)
  return urls
}
|
|
1658
|
+
|
|
1659
|
+
/**
|
|
1660
|
+
* llms.txt often lists URLs with a `.md` extension — those are raw markdown
|
|
1661
|
+
* endpoints, not the rendered HTML page that has the sidebar. Strip the
|
|
1662
|
+
* extension so we fetch the human-facing page instead.
|
|
1663
|
+
*/
|
|
1664
|
+
/**
 * llms.txt often lists URLs with a `.md`/`.mdx` extension — those are raw
 * markdown endpoints, not the rendered HTML page that carries the sidebar.
 * Strip the extension (case-insensitively) so we fetch the human-facing page
 * instead. Unparseable input is returned unchanged.
 */
function toBrowsableUrl(url) {
  try {
    const parsed = new URL(url)
    parsed.pathname = parsed.pathname.replace(/\.(md|mdx)$/i, '')
    return parsed.toString()
  } catch {
    return url
  }
}
|
|
1673
|
+
|
|
1674
|
+
/**
|
|
1675
|
+
* Native-fetch HTML loader. Returns the body string or empty string on failure.
|
|
1676
|
+
*/
|
|
1677
|
+
/**
 * Native-fetch HTML loader. Follows redirects and resolves to the response
 * body on a 2xx status, or '' on any non-OK status or network failure.
 */
async function fetchHtmlDirect(url) {
  const options = {
    redirect: 'follow',
    headers: { 'User-Agent': 'readme-cli-import' },
  }
  try {
    const response = await fetch(url, options)
    return response.ok ? await response.text() : ''
  } catch {
    // Network/DNS errors degrade to an empty body, same as an HTTP failure.
    return ''
  }
}
|
|
1689
|
+
|
|
1690
|
+
/**
|
|
1691
|
+
* Firecrawl-backed HTML loader. Firecrawl runs a real browser, waits for
|
|
1692
|
+
* hydration, and returns the rendered DOM — which is what we need for sites
|
|
1693
|
+
* that render their sidebar nav client-side (zod.dev, most Next.js docs).
|
|
1694
|
+
*
|
|
1695
|
+
* Returns a function with the same (url) → html string contract as fetchHtmlDirect
|
|
1696
|
+
* so scrapeNavFromSite doesn't need to care which backend is in use.
|
|
1697
|
+
*/
|
|
1698
|
+
/**
 * Build a Firecrawl-backed HTML loader. Firecrawl runs a real browser, waits
 * for hydration, and returns the rendered DOM — needed for sites that render
 * their sidebar nav client-side.
 *
 * @param {string} apiKey - Firecrawl API key, sent as a Bearer token
 * @returns {(url: string) => Promise<string>} same (url) → html contract as
 *   fetchHtmlDirect; resolves to '' on any HTTP, API, or network failure
 */
function makeFirecrawlFetcher(apiKey) {
  return async function fetchHtmlViaFirecrawl(url) {
    try {
      const res = await fetch('https://api.firecrawl.dev/v1/scrape', {
        method: 'POST',
        headers: {
          Authorization: `Bearer ${apiKey}`,
          'Content-Type': 'application/json',
        },
        body: JSON.stringify({
          url,
          formats: ['rawHtml'],
          // Wait a bit for client-side frameworks to hydrate the sidebar.
          waitFor: 2000,
          // Block common ad/tracking domains so we don't burn time on them.
          blockAds: true,
        }),
      })
      if (!res.ok) {
        // NOTE(review): elsewhere in this file styles.* results are passed to
        // console.log (e.g. styles.dim); here styles.warning's return value is
        // discarded — confirm styles.warning prints itself, otherwise these
        // warnings are silently dropped.
        styles.warning(`Firecrawl HTTP ${res.status} for ${url}`)
        return ''
      }
      const body = await res.json()
      if (!body.success) {
        styles.warning(`Firecrawl error for ${url}: ${body.error || 'unknown'}`)
        return ''
      }
      // API may return the HTML under either key depending on format support.
      return body.data?.rawHtml || body.data?.html || ''
    } catch (e) {
      styles.warning(`Firecrawl fetch failed for ${url}: ${e.message}`)
      return ''
    }
  }
}
|
|
1732
|
+
|
|
1733
|
+
/**
|
|
1734
|
+
* Run `visit(url)` across `urls` with at most `concurrency` in flight at once.
|
|
1735
|
+
* Order of completion doesn't matter — visit() merges into shared state.
|
|
1736
|
+
*/
|
|
1737
|
+
/**
 * Run `visit(url)` over every entry in `urls` with a bounded worker pool of
 * size `concurrency`. Completion order doesn't matter — visit() merges its
 * results into shared state. Resolves once all URLs have been visited.
 */
async function visitAllInParallel(urls, visit, concurrency) {
  let next = 0
  const drain = async () => {
    while (next < urls.length) {
      const url = urls[next]
      next += 1
      await visit(url)
    }
  }
  const workerCount = Math.min(concurrency, urls.length)
  await Promise.all(Array.from({ length: workerCount }, drain))
}
|
|
1747
|
+
|
|
1748
|
+
/**
|
|
1749
|
+
* Namespace = origin + first meaningful path segment (skipping version/locale
|
|
1750
|
+
* prefixes like /main, /en, /v1). Pages in different namespaces live under
|
|
1751
|
+
* different sidebars on typical multi-product docs sites, so cross-namespace
|
|
1752
|
+
* follow-up fetches don't help coverage.
|
|
1753
|
+
*/
|
|
1754
|
+
/**
 * Namespace = origin + first meaningful path segment, skipping version and
 * locale prefixes like /main, /en, /v1. Pages in different namespaces live
 * under different sidebars on typical multi-product docs sites, so
 * cross-namespace follow-up fetches don't help coverage. Returns '' for
 * unparseable URLs.
 */
function urlNamespace(url) {
  const NS_SKIP = /^(v?\d+(\.\d+)*|main|master|latest|stable|next|current|ent|enterprise|en|en-us|en_us|fr|de|es|ja|zh|ko|pt)$/i
  try {
    const parsed = new URL(url)
    const segments = parsed.pathname.split('/').filter(Boolean)
    const meaningful = segments.find((seg) => !NS_SKIP.test(seg))
    const ns = meaningful ? meaningful.toLowerCase() : ''
    return `${parsed.origin}/${ns}`
  } catch {
    return ''
  }
}
|
|
1771
|
+
|
|
1772
|
+
/**
 * Re-cluster a flat page list by URL path structure. When the sidebar scrape
 * found links but no <h*>/<p> headers to split on, everything ends up in one
 * big bucket. URLs often encode the site's real hierarchy, though — if many
 * pages share `/foo/bar/<slug>.html` and a few others share `/foo/baz/<slug>`,
 * "bar" and "baz" are almost certainly section names.
 *
 * Returns an array of { title, pages } clusters, or null when the URL shapes
 * don't support a meaningful grouping: fewer than 3 pages, any unparseable
 * URL, fewer than 2 groups, or nothing but one-page groups.
 */
function clusterByUrlPath(pages) {
  if (!pages || pages.length < 3) return null

  // Path segments for each page; one unparseable URL disqualifies clustering.
  const segments = pages.map((p) => {
    try {
      return new URL(p.url).pathname.split('/').filter(Boolean)
    } catch {
      return []
    }
  })
  if (segments.some((segs) => segs.length === 0)) return null

  // Depth of the longest path prefix every page shares (the "base").
  let depth = 0
  while (depth < segments[0].length && segments.every((segs) => segs[depth] === segments[0][depth])) {
    depth++
  }

  // The segment immediately past the shared base names the category. Pages
  // sitting exactly AT the base have no such segment and are skipped — they'd
  // only produce an "index"-like category of their own.
  const groups = new Map()
  segments.forEach((segs, i) => {
    const key = segs[depth]
    if (!key) return
    const bucket = groups.get(key)
    if (bucket) {
      bucket.push(pages[i])
    } else {
      groups.set(key, [pages[i]])
    }
  })

  // Reject weak clusterings: need at least 2 groups AND at least one group
  // with multiple pages (otherwise every "category" is a single page, which
  // is just Overview renamed).
  if (groups.size < 2) return null
  if (![...groups.values()].some((bucket) => bucket.length >= 2)) return null

  // Map insertion order already follows first appearance in `pages` (we
  // iterated pages in source order above), so the sidebar reflects it.
  const clusters = [...groups.entries()].map(([key, bucket]) => ({
    title: titleCase(key),
    pages: bucket,
  }))

  // A cluster with exactly one top-level page is NOT a real category — it's
  // a parent page with children wrapped in a pseudo-category label. Pool
  // those singletons into a shared "Documentation" bucket so they're siblings
  // at the top level, each with their own sub-tree intact.
  const multiPage = clusters.filter((c) => c.pages.length >= 2)
  const singles = clusters.filter((c) => c.pages.length === 1).flatMap((c) => c.pages)

  // No multi-page cluster means clustering added no value — keep the
  // original flat shape.
  if (multiPage.length === 0) return null

  const result = singles.length > 0 ? [{ title: 'Documentation', pages: singles }] : []
  result.push(...multiPage)
  return result
}
|
|
1859
|
+
|
|
1860
|
+
/**
 * Used by discovery-mode scraping (no llms.txt) to decide whether a nav
 * link is worth importing as a doc page. Filters out cross-origin links,
 * asset file types, build artifacts, and anchors-on-same-page.
 */
function isDiscoverableLink(abs, base) {
  let parsed
  try {
    parsed = new URL(abs)
  } catch {
    return false
  }
  if (parsed.origin !== base.origin) return false

  const lower = parsed.pathname.toLowerCase()
  if (!lower || lower === '/') return false

  // Static assets by extension.
  if (/\.(png|jpe?g|gif|svg|webp|ico|css|js|pdf|zip|tar|gz|woff2?|ttf|mp4|mp3)$/i.test(lower)) return false

  // Build output / asset directories.
  const inArtifactDir =
    lower.startsWith('/_next/') || lower.startsWith('/__/') || lower.includes('/static/') || lower.includes('/assets/')
  return !inArtifactDir
}
|
|
1879
|
+
|
|
1880
|
+
// Zero-width spaces, direction marks, word joiner, BOM — some docs sites
// inject these into sidebar headings (docs.greenflash.ai prefixes
// "API Reference" with U+200B), which otherwise defeats downstream
// title-matching regexes like routeCategory's.
const INVISIBLE_CHARS = /[\u200B-\u200F\u202A-\u202E\u2060\uFEFF]/g

/**
 * Remove HTML tags from a string, decode entities in what remains, and strip
 * invisible Unicode characters — yielding plain text fit for titles.
 */
function stripTags(s) {
  const withoutTags = String(s).replace(/<[^>]+>/g, '')
  return decodeEntities(withoutTags).replace(INVISIBLE_CHARS, '')
}
|
|
1888
|
+
|
|
1889
|
+
/**
 * Decode the common HTML entities that show up inside <a>/<p>/<h*> tag text.
 * The sidebar/nav scrapers feed their output straight into frontmatter titles
 * and on-page headings, so leaving entities raw produces titles like
 * "New Features &amp; Upgrade Changes". This covers the named entities we
 * see in practice plus numeric and hex forms.
 *
 * @param {string} s - Raw text that may contain HTML entities.
 * @returns {string} Text with known entities decoded; unknown ones untouched.
 */
function decodeEntities(s) {
  // Fast path: nothing that even looks like an entity.
  if (!s || s.indexOf('&') === -1) return s
  return (
    s
      // Numeric (&#65;) and hex (&#x41;) forms. Guard the code-point range so
      // a malformed entity like &#x110000; can't make fromCodePoint throw a
      // RangeError — leave the original text in place instead.
      .replace(/&(?:#x([0-9a-f]+)|#(\d+));/gi, (match, hex, dec) => {
        const code = hex ? parseInt(hex, 16) : parseInt(dec, 10)
        return Number.isFinite(code) && code >= 0 && code <= 0x10ffff ? String.fromCodePoint(code) : match
      })
      .replace(/&lt;/g, '<')
      .replace(/&gt;/g, '>')
      .replace(/&quot;/g, '"')
      .replace(/&apos;/g, "'")
      .replace(/&nbsp;/g, ' ')
      .replace(/&mdash;/g, '—')
      .replace(/&ndash;/g, '–')
      .replace(/&hellip;/g, '…')
      .replace(/&rsquo;/g, '’')
      .replace(/&lsquo;/g, '‘')
      .replace(/&ldquo;/g, '“')
      .replace(/&rdquo;/g, '”')
      // &amp; must decode LAST: double-escaped text like "&amp;lt;" should
      // come out as "&lt;", not be decoded twice into "<".
      .replace(/&amp;/g, '&')
  )
}
|
|
1918
|
+
|
|
1919
|
+
/**
 * Reduce a URL to a comparable pathname: lowercase, strip trailing slash
 * and common suffixes (.md, .html) so `/foo/bar.md` and `/foo/bar` match.
 */
function normalizePath(url) {
  let pathname
  try {
    pathname = new URL(url).pathname
  } catch {
    // Not an absolute URL — compare the raw string, lowercased.
    return String(url).toLowerCase()
  }
  return pathname
    .replace(/\/$/, '')
    .toLowerCase()
    .replace(/\.(md|mdx|html?)$/i, '')
}
|
|
1933
|
+
|
|
1934
|
+
/**
 * Given a set of scraped categories + orphan pages that didn't match the nav
 * or any path ancestor, ask Claude to slot each orphan into an existing
 * category by index. Output is a compact array of indices (or -1 for none),
 * so token count stays low. Mutates `scraped.categories[*].pages`.
 * Returns the orphans Claude couldn't slot.
 */
async function slotOrphansWithClaude(scraped, orphans, model) {
  const { systemPrompt, userPrompt } = slotOrphansPrompt({
    categories: scraped.categories,
    orphans,
  })
  const assignments = await runJsonQuery({ systemPrompt, userPrompt, model })
  // If the model didn't cooperate, return all orphans unassigned.
  if (!Array.isArray(assignments)) return orphans

  const unassigned = []
  for (let i = 0; i < orphans.length; i++) {
    const orphan = orphans[i]
    const target = Number.isInteger(assignments[i]) ? assignments[i] : -1
    const inRange = target >= 0 && target < scraped.categories.length
    if (!inRange) {
      unassigned.push(orphan)
      continue
    }
    scraped.categories[target].pages.push({
      title: orphan.title,
      url: orphan.url,
      ...(orphan.description ? { description: orphan.description } : {}),
    })
  }
  return unassigned
}
|
|
1967
|
+
|
|
1968
|
+
/**
 * We already have the category structure from the scraped nav; we just need
 * one FontAwesome icon per category. Tiny Claude call, fast.
 */
async function iconizeScrapedNav(scraped, _unused, model, siteTitle) {
  const { systemPrompt, userPrompt } = iconizeNavPrompt({ categories: scraped.categories })
  const response = await runJsonQuery({ systemPrompt, userPrompt, model })
  const picked = Array.isArray(response) ? response : []

  const categories = scraped.categories.map((cat, i) => ({
    title: cat.title,
    icon: picked[i] || 'folder',
    pages: cat.pages,
  }))
  return { title: siteTitle || null, categories }
}
|
|
1987
|
+
|
|
1988
|
+
/**
 * Sections are "usable" when the llms.txt already did the hard grouping work
 * for us — meaningful titles, not too many/few, each populated. When usable we
 * take a fast path that only asks Claude for icons + title polish instead of
 * re-bucketing every page, which is the slow part of a full reorg.
 */
// Sections named like these are catch-all buckets — even in richly-structured
// llms.txt files (e.g. Stripe's "Docs" section is where pages go that don't
// fit into a named product tab), so always drop them rather than promote the
// grab-bag contents to a top-level sidebar category.
const GENERIC_SECTION_RE =
  /^(resources?|english|root url|pages?|docs?|documentation|content|available languages.*|site|sitemap|index|home|optional|instructions?(\s|:).*|miscellaneous|misc|other)$/i

/**
 * Return the subset of llms.txt sections that carry real structural signal —
 * drop catch-all buckets ("Docs", "Resources", "Optional"), empty sections,
 * and oversized ones (site-dumps masquerading as sections).
 */
function usableSections(sections) {
  if (!sections) return []
  return sections.filter((section) => {
    if (!section.title || GENERIC_SECTION_RE.test(section.title.trim())) return false
    if (!section.items || section.items.length === 0) return false
    return section.items.length <= 200
  })
}

/**
 * True when the llms.txt structure is rich enough to reuse directly: a sane
 * overall section count and at least 3 usable sections.
 */
function sectionsLookUsable(sections) {
  if (!sections || sections.length > 40) return false
  return usableSections(sections).length >= 3
}
|
|
2015
|
+
|
|
2016
|
+
/**
 * Organize parsed llms.txt content into sidebar categories: the fast
 * section-reuse path when the sections look usable, a full from-scratch
 * reorganization otherwise.
 */
async function organizeWithClaude(parsed, model) {
  return sectionsLookUsable(parsed.sections)
    ? organizeFromSections(parsed, model)
    : organizeFromScratch(parsed, model)
}
|
|
2022
|
+
|
|
2023
|
+
/**
 * Fast path: llms.txt sections look good, so keep them 1:1 and ask Claude only
 * for a FontAwesome icon (and optional Title-Case cleanup) per section. Output
 * is O(sections), not O(pages), so this is usually ~5-15s vs. a full reorg.
 */
async function organizeFromSections(parsed, model) {
  // Drop generic/empty/oversized sections so they don't pollute the sidebar
  // (e.g. Stripe's llms.txt has a "Docs" catch-all and a 0-item "Instructions
  // for Large Language Model Agents" section — neither is structural signal).
  const sections = usableSections(parsed.sections)

  const { systemPrompt, userPrompt } = organizeFromSectionsPrompt({
    siteTitle: parsed.title,
    sections,
  })
  const entries = await runJsonQuery({ systemPrompt, userPrompt, model })
  if (!Array.isArray(entries)) {
    throw new Error('Fast-path expected a JSON array of {title, icon} entries.')
  }

  // Sections map 1:1 onto categories; Claude only supplies title/icon polish.
  const categories = sections.map((section, i) => {
    const meta = entries[i] || {}
    const pages = section.items.map((item) => ({
      title: item.text,
      url: item.url,
      ...(item.description ? { description: item.description } : {}),
    }))
    return {
      title: meta.title || section.title,
      icon: meta.icon || 'folder',
      pages,
    }
  })

  return { title: parsed.title || null, categories }
}
|
|
2058
|
+
|
|
2059
|
+
/**
 * Full reorganization path: flatten every llms.txt item into one id-indexed
 * list, ask Claude to bucket them into categories by id, then rehydrate the
 * page objects from those ids. Ids (not full page objects) keep the model's
 * output small.
 *
 * @param {{ title: string|null, sections: Array }} parsed - Parsed llms.txt.
 * @param {string|undefined} model - Optional model override for the call.
 * @returns {Promise<{ title: any, categories: Array }>} Organized hierarchy.
 */
async function organizeFromScratch(parsed, model) {
  // Flatten sections into one list; array index doubles as the page id that
  // Claude references in its response.
  const items = parsed.sections.flatMap((s) =>
    s.items.map((i) => ({
      section: s.title,
      title: i.text,
      url: i.url,
      description: i.description || undefined,
    })),
  )

  const { systemPrompt, userPrompt } = organizeFromScratchPrompt({
    siteTitle: parsed.title,
    items,
  })
  const raw = await runJsonQuery({ systemPrompt, userPrompt, model })

  // Rehydrate pages from the id references Claude returned.
  const expandedCategories = []
  const usedIds = new Set() // ids already placed — guards against the model repeating one
  for (const cat of raw.categories || []) {
    const pages = []
    for (const id of cat.pageIds || []) {
      const item = items[id]
      if (!item) continue // ignore out-of-range ids
      if (usedIds.has(id)) continue // ignore dupes
      usedIds.add(id)
      pages.push({
        title: item.title,
        url: item.url,
        ...(item.description ? { description: item.description } : {}),
      })
    }
    expandedCategories.push({ title: cat.title, icon: cat.icon, pages })
  }

  // Safety net: if Claude dropped any ids, park them under a leftover category
  // so we never silently lose pages.
  const missing = items.map((it, idx) => (usedIds.has(idx) ? null : { id: idx, ...it })).filter(Boolean)
  if (missing.length > 0) {
    expandedCategories.push({
      title: 'Uncategorized',
      icon: 'folder',
      pages: missing.map((it) => ({
        title: it.title,
        url: it.url,
        ...(it.description ? { description: it.description } : {}),
      })),
    })
    styles.warning(`Claude missed ${missing.length} page${missing.length === 1 ? '' : 's'} — parked under "Uncategorized".`)
  }

  return { title: raw.title, categories: expandedCategories }
}
|
|
2112
|
+
|
|
2113
|
+
/**
 * Shared Claude call for "send a prompt, parse JSON back". Logs the prompts so
 * we can see what went in, runs a heartbeat so silent model latency doesn't
 * look like a hang, and strips stray code fences if the model adds them.
 *
 * @param {{ systemPrompt: string, userPrompt: string, model?: string }} params
 * @returns {Promise<any>} The parsed JSON value the model returned.
 * @throws {Error} When the model reports a non-success result subtype, or its
 *   output is not valid JSON after stripping code fences.
 */
async function runJsonQuery({ systemPrompt, userPrompt, model }) {
  // Echo both prompts (user prompt truncated to 80 lines) for debuggability.
  console.log()
  console.log(styles.dim('─ system prompt ─'))
  console.log(styles.dim(systemPrompt))
  console.log(styles.dim('─ user prompt (first 80 lines) ─'))
  console.log(styles.dim(userPrompt.split('\n').slice(0, 80).join('\n')))
  const userLineCount = userPrompt.split('\n').length
  if (userLineCount > 80) {
    console.log(styles.dim(`… (${userLineCount - 80} more lines)`))
  }
  console.log(styles.dim('─'.repeat(40)))
  console.log()

  // One dim dot per second while the model is working, so long silent calls
  // don't look like a hang.
  const heartbeat = setInterval(() => process.stdout.write(styles.dim('.')), 1000)
  let text = ''
  try {
    for await (const message of query({
      prompt: userPrompt,
      options: {
        systemPrompt,
        allowedTools: [], // plain text generation — no tool use
        ...(model ? { model } : {}),
      },
    })) {
      if (message.type === 'assistant' && message.message?.content) {
        // Accumulate every text block the assistant streams back.
        for (const block of message.message.content) {
          if (block.type === 'text' && block.text) text += block.text
        }
      } else if (message.type === 'result') {
        // Any subtype other than 'success' is an upstream failure.
        if (message.subtype && message.subtype !== 'success') {
          throw new Error(`Claude failed: ${message.subtype}${message.error?.message ? ' — ' + message.error.message : ''}`)
        }
        break
      }
    }
  } finally {
    // Always stop the heartbeat and end the dot line, even when the stream throws.
    clearInterval(heartbeat)
    process.stdout.write('\n')
  }

  const stripped = stripCodeFences(text)

  try {
    return JSON.parse(stripped)
  } catch (e) {
    // Include head and tail of the output — truncated JSON usually means the
    // model hit its output limit mid-array.
    throw new Error(
      `Claude returned invalid JSON: ${e.message}\n` +
        `Output length: ${stripped.length} chars. Likely hit the model's output limit — try --model sonnet.\n\n` +
        `First 500 chars:\n${stripped.slice(0, 500)}\n\n` +
        `Last 500 chars:\n${stripped.slice(-500)}`,
    )
  }
}
|
|
2171
|
+
|
|
2172
|
+
/**
 * Produce the ordered list of llms.txt URLs to probe for a given source URL.
 * Starts at the deepest path the user supplied and walks up one segment at a
 * time, ending at the origin root. Each level gets `<path>/llms.txt` appended.
 *
 * For https://mintlify.com/docs/quickstart:
 *   → https://mintlify.com/docs/quickstart/llms.txt
 *   → https://mintlify.com/docs/llms.txt
 *   → https://mintlify.com/llms.txt
 *
 * Returns deduped URLs in probe order.
 */
function buildLlmsCandidates(sourceUrl) {
  const segments = sourceUrl.pathname.split('/').filter(Boolean)
  const seen = new Set()
  const candidates = []

  // Walk from the deepest path prefix up to the bare origin.
  for (let depth = segments.length; depth >= 0; depth--) {
    const prefix = segments.slice(0, depth).join('/')
    const candidate = `${sourceUrl.origin}${prefix ? '/' + prefix : ''}/llms.txt`
    if (seen.has(candidate)) continue
    seen.add(candidate)
    candidates.push(candidate)
  }
  return candidates
}
|
|
2202
|
+
|
|
2203
|
+
/**
 * Best-effort fetch of a site's /llms.txt. Returns { ok, status, error, parsed }
 * where parsed is { title, sections: [{ title, items: [{ text, url, description }] }] }.
 */
async function fetchLlmsTxt(llmsUrl) {
  try {
    const res = await fetch(llmsUrl, {
      redirect: 'follow',
      headers: { 'User-Agent': 'readme-cli-import' },
    })
    if (!res.ok) {
      return { ok: false, status: res.status }
    }
    const body = await res.text()
    return { ok: true, status: res.status, parsed: parseLlmsTxt(body) }
  } catch (err) {
    // Network/DNS/abort failures — report the message, never throw.
    return { ok: false, error: err.message }
  }
}
|
|
2220
|
+
|
|
2221
|
+
/**
 * Parse the llms.txt format. `##` headings become sections;
 * `- [text](url): description` bullets become items. Items before any `##`
 * land in an implicit "Resources" section.
 */
function parseLlmsTxt(body) {
  // Markdown link bullet with an optional `:`/dash-separated description.
  const itemRe = /^\s*-\s*\[([^\]]*)\]\((https?:\/\/[^)\s]+)\)(?:\s*[:—–-]\s*(.+))?/

  const sections = []
  let title = null
  let current = null

  for (const line of body.split(/\r?\n/)) {
    // First `# ` heading becomes the document title.
    const h1 = line.match(/^#\s+(.+)$/)
    if (h1 && !title) {
      title = h1[1].trim()
      continue
    }

    // Every `## ` heading opens a new section.
    const h2 = line.match(/^##\s+(.+)$/)
    if (h2) {
      current = { title: h2[1].trim(), items: [] }
      sections.push(current)
      continue
    }

    const bullet = line.match(itemRe)
    if (!bullet) continue

    // Bullets before any heading go into an implicit "Resources" section.
    if (!current) {
      current = { title: 'Resources', items: [] }
      sections.push(current)
    }
    current.items.push({
      text: bullet[1].trim(),
      url: bullet[2].replace(/[.,;]+$/, ''),
      description: bullet[3] ? bullet[3].trim() : null,
    })
  }

  return { title, sections }
}
|
|
2264
|
+
|
|
2265
|
+
/**
 * Write the organized hierarchy to disk as git-format markdown stubs — just
 * frontmatter, no body yet. docs/ pages go under docs/<Category>/<slug>.md;
 * reference/recipes/custom_pages/custom_blocks get their own top-level dir
 * without a category subfolder (the git-format schema doesn't nest them).
 * Writes _order.yaml per directory so sidebar order matches input order.
 * (This comment documents stageOrganized, defined below printPagesTree.)
 */
|
|
2272
|
+
/**
 * Recursively print the page tree. Sub-pages are indented under their parent
 * with no leading bullet character, to show them as children of the parent.
 */
function printPagesTree(pages, indentLevel) {
  const pad = ' '.repeat(indentLevel)
  for (const page of pages) {
    const descPart = page.description ? ` ${styles.dim('— ' + page.description)}` : ''
    const urlPart = page.url ? ` ${styles.dim(page.url)}` : ''
    console.log(`${pad}${styles.dim('·')} ${page.title}${urlPart}${descPart}`)

    const children = page.pages
    if (children && children.length > 0) {
      printPagesTree(children, indentLevel + 1)
    }
  }
}
|
|
2287
|
+
|
|
2288
|
+
/**
 * Write the organized hierarchy into `stagingDir` as git-format markdown
 * stubs (frontmatter only; `x-import` records where each body will later be
 * fetched from), plus per-directory _order.yaml files.
 *
 * @param {{ categories: Array }} organized - Category tree from the organize step.
 * @param {string} stagingDir - Root directory the stubs are written under.
 * @param {{ skipApiReference?: boolean }} [opts] - When set, categories routed
 *   to `reference/` are counted but not written.
 * @returns {{ fileCount: number, dirCount: number, skippedApiRef: number }}
 */
function stageOrganized(organized, stagingDir, opts = {}) {
  const pickIcon = makeIconPicker()
  const usedSlugs = new Set() // cross-dir: duplicates validator is global
  const byDir = new Map() // dir → slugs written there, in input order
  const subDirsByTopDir = new Map() // topDir → its category subfolders, in input order
  const counts = { fileCount: 0, skippedApiRef: 0 }
  const skipApiReference = !!opts.skipApiReference

  /**
   * Write a page (and its descendants) into `dir`. A page with children gets
   * its own subfolder named after its slug; the parent page lives at
   * `<dir>/<slug>.md` while children live at `<dir>/<slug>/<childSlug>.md`.
   * This matches git-format's on-disk convention for nested sidebars.
   */
  function writePage(page, dir, topDir, isSubPage = false) {
    const slug = resolveSlug(deriveSlug(page.url, page.title), usedSlugs)
    usedSlugs.add(slug)

    // Group-only nodes (e.g. a resource sub-group within API Reference) have
    // no backing page on the source site — they're pure sidebar containers.
    // Skip the stub write but still recurse so their children land in the
    // right subdirectory.
    const isGroupOnly = !page.url

    if (!isGroupOnly) {
      const relFilePath = `${dir}/${slug}.md`
      // Sub-pages don't get icons per design decision.
      const frontmatter = buildFrontmatter(topDir, page, slug, pickIcon, { skipIcon: isSubPage })
      // x-import points at the source URL for this stub. The content-import
      // step reads it to fetch the page body. x-prefixed custom field is the
      // git-format convention for metadata the schema doesn't know about.
      frontmatter['x-import'] = toBrowsableUrl(page.url)
      // hide pages that need import
      frontmatter.hidden = true

      const absPath = path.join(stagingDir, relFilePath)
      fs.mkdirSync(path.dirname(absPath), { recursive: true })
      // Empty body — frontmatter-only stub.
      fs.writeFileSync(absPath, matter.stringify('', frontmatter))
      counts.fileCount++

      if (!byDir.has(dir)) byDir.set(dir, [])
      byDir.get(dir).push(slug)
    }

    const children = page.pages || []
    if (children.length > 0) {
      // Children live in a subfolder named after the parent's slug.
      const subDir = `${dir}/${slug}`
      for (const child of children) writePage(child, subDir, topDir, true)
    }
  }

  for (const cat of organized.categories || []) {
    const { topDir, subDir } = routeCategory(cat.title)
    if (skipApiReference && topDir === 'reference') {
      // Count what we skipped so the caller can report it.
      counts.skippedApiRef += countPagesDeep(cat.pages || [])
      continue
    }
    const dir = subDir ? `${topDir}/${subDir}` : topDir
    if (subDir) {
      if (!subDirsByTopDir.has(topDir)) subDirsByTopDir.set(topDir, [])
      if (!subDirsByTopDir.get(topDir).includes(subDir)) subDirsByTopDir.get(topDir).push(subDir)
    }

    for (const page of cat.pages || []) writePage(page, dir, topDir, false)
  }

  // Per-directory _order.yaml preserves input order in the sidebar.
  for (const [dir, slugs] of byDir) {
    const orderPath = path.join(stagingDir, dir, '_order.yaml')
    const body = slugs.map((s) => `- ${yamlSafeSlug(s)}`).join('\n') + '\n'
    fs.writeFileSync(orderPath, body)
  }

  // Top-level _order.yaml (e.g. docs/_order.yaml) lists category subfolders.
  for (const [topDir, subs] of subDirsByTopDir) {
    const orderPath = path.join(stagingDir, topDir, '_order.yaml')
    const body = subs.map((s) => `- ${yamlSafeSlug(s)}`).join('\n') + '\n'
    fs.writeFileSync(orderPath, body)
  }

  return { fileCount: counts.fileCount, dirCount: byDir.size, skippedApiRef: counts.skippedApiRef }
}
|
|
2370
|
+
|
|
2371
|
+
/**
 * Count URL-backed pages in a tree. Group-only nodes (no url) don't count
 * themselves, but their descendants do.
 */
function countPagesDeep(pages) {
  return (pages || []).reduce((total, page) => {
    const self = page.url ? 1 : 0
    const descendants = page.pages && page.pages.length ? countPagesDeep(page.pages) : 0
    return total + self + descendants
  }, 0)
}
|
|
2379
|
+
|
|
2380
|
+
/**
 * Map a category title to the git-format top-level directory + optional
 * category subdir. docs/ is the only top dir that takes a subfolder.
 */
function routeCategory(title) {
  const trimmed = (title || '').trim()
  const routes = [
    [/^(api[ -]?reference|reference|api|endpoints?)$/i, 'reference'],
    [/^(recipes?|cookbook|tutorials?|how[ -]?tos?)$/i, 'recipes'],
    [/^(custom[ -]?pages?|landing( page)?s?)$/i, 'custom_pages'],
    [/^(custom[ -]?blocks?|snippets?|reusable( content)?)$/i, 'custom_blocks'],
  ]
  for (const [re, topDir] of routes) {
    if (re.test(trimmed)) return { topDir, subDir: null }
  }
  // Everything else is a docs category subfolder.
  return { topDir: 'docs', subDir: trimmed || 'Documentation' }
}
|
|
2392
|
+
|
|
2393
|
+
/**
 * Build the frontmatter object for a staged stub page. custom_blocks use
 * `name` instead of `title`; recipes carry their icon under `recipe.icon`;
 * docs/reference get a top-level `icon`.
 */
function buildFrontmatter(topDir, page, slug, pickIcon, opts = {}) {
  const title = (page.title || titleCase(slug)).trim()
  const fm = topDir === 'custom_blocks' ? { name: title } : { title }

  const excerpt = page.description && page.description.trim()
  if (excerpt) fm.excerpt = excerpt

  // Sub-pages skip icons by design — the parent carries the nav icon, and
  // children render without one.
  if (opts.skipIcon) return fm

  // Recipes use `recipe.icon` instead of a top-level `icon` (per git-format schema).
  const icon = pickIcon(slug, title)
  if (topDir === 'recipes') {
    fm.recipe = { color: '#018ef5', icon: icon || 'book-open' }
  } else if (topDir === 'docs' || topDir === 'reference') {
    fm.icon = formatIconClass(icon)
  }

  return fm
}
|
|
2421
|
+
|
|
2422
|
+
/**
 * Turn a URL's trailing segment into a filename-safe slug. Strips `.md`, kebabs
 * the result, drops any leading numeric prefix (common in imports).
 */
function deriveSlug(url, fallbackTitle) {
  let lastSegment = ''
  try {
    const segments = new URL(url).pathname.split('/').filter(Boolean)
    lastSegment = segments.at(-1) || ''
  } catch {
    // Unparseable URL — fall through to the title-based fallback.
  }
  const cleaned = lastSegment.replace(/\.(md|mdx|html?)$/i, '').replace(/^\d+[-_.]/, '')
  return kebabCase(cleaned || fallbackTitle || 'page') || 'page'
}
|
|
2436
|
+
|
|
2437
|
+
/**
 * If `slug` is already in use anywhere in the staging tree, try `slug-2`,
 * `slug-3`, etc. The duplicates validator flags same-slug collisions across
 * directories, not just within a directory, so uniqueness must be global.
 */
function resolveSlug(slug, usedSlugs) {
  if (!usedSlugs.has(slug)) return slug
  for (let n = 2; ; n++) {
    const candidate = `${slug}-${n}`
    if (!usedSlugs.has(candidate)) return candidate
  }
}
|
|
2448
|
+
|
|
2449
|
+
// Values YAML interprets as non-strings need quoting when used as _order entries.
const YAML_UNSAFE = /^(?:\d+\.?\d*|true|false|yes|no|on|off|null|~)$/i

/**
 * Quote a slug for use as a YAML list entry when YAML would otherwise parse
 * it as a number, boolean, or null.
 */
function yamlSafeSlug(slug) {
  if (YAML_UNSAFE.test(slug)) return `"${slug}"`
  return slug
}
|
|
2454
|
+
|
|
2455
|
+
/**
 * Convert a string to kebab-case: splits camelCase boundaries, lowercases,
 * collapses non-alphanumeric runs into single hyphens, trims edge hyphens.
 * Falls back to 'page' when nothing survives.
 */
function kebabCase(s) {
  const kebabed = String(s)
    .replace(/([a-z0-9])([A-Z])/g, '$1-$2')
    .toLowerCase()
    .replace(/[^a-z0-9]+/g, '-')
    .replace(/^-+|-+$/g, '')
  return kebabed || 'page'
}
|
|
2464
|
+
|
|
2465
|
+
// Keyword → FontAwesome icon. Ported from v1 (most-specific entries first).
// Each rule maps to an ordered list of candidates; the first unused-in-this-dir
// icon wins so sibling pages don't all share the same icon.
// Each entry is [matcher, candidates]; makeIconPicker tests the matcher
// against the lowercased "<slug> <title>" haystack, in order, first hit wins.
const ICON_RULES = [
  [/\b(getting[- ]?started|quick[- ]?start|intro|introduction|welcome|overview|start)\b/, ['rocket', 'door-open', 'flag', 'star']],
  [/\b(api[- ]?keys?|token|secrets?|credentials?|scopes?)\b/, ['key', 'key-skeleton', 'fingerprint']],
  [/\b(auth|authn|authentication|sign[- ]?in|login|oauth|sso|identity|saml|oidc)\b/, ['lock', 'shield-halved', 'id-badge']],
  [/\b(permissions?|roles?|access|authz|authorization|rbac|acl)\b/, ['user-lock', 'user-shield', 'user-tag']],
  [/\b(users?|accounts?|profiles?|members?|people)\b/, ['user', 'user-gear', 'id-card', 'circle-user']],
  [/\b(groups?|org(anizations?)?|teams?|workspaces?)\b/, ['users', 'people-group', 'user-group']],
  [/\b(sync|syncing|mirror|pipeline|webhooks?)\b/, ['arrows-rotate', 'shuffle', 'bell']],
  [/\b(projects?|apps?|applications?)\b/, ['folder', 'folder-open', 'folder-tree']],
  [/\b(errors?|troubleshoot|debug(ging)?|issues?)\b/, ['triangle-exclamation', 'bug', 'circle-xmark']],
  [/\b(rate[- ]?limits?|throttl|quota|limits?)\b/, ['gauge', 'gauge-high']],
  [/\b(pagination|cursor|paginate)\b/, ['list', 'list-ol', 'ellipsis']],
  [/\b(versioning|versions?|changelog|releases?|release[- ]?notes?)\b/, ['code-branch', 'code-fork', 'timeline']],
  [/\b(sdks?|libraries|clients?|packages?)\b/, ['cube', 'cubes', 'boxes-stacked']],
  [/\b(cli|command[- ]?line|terminal|shell)\b/, ['terminal', 'square-terminal']],
  [/\b(billing|invoices?|subscriptions?|plans?)\b/, ['credit-card', 'file-invoice-dollar']],
  [/\b(security|compliance|privacy|gdpr|soc|hipaa|encryption)\b/, ['shield', 'shield-check', 'lock-keyhole']],
  [/\b(search|query|filters?|lookup)\b/, ['magnifying-glass', 'filter']],
  [/\b(uploads?|files?|storage|assets?|media)\b/, ['cloud-arrow-up', 'file-arrow-up']],
  [/\b(downloads?|exports?)\b/, ['cloud-arrow-down', 'download']],
  [/\b(imports?|ingest|ingestion)\b/, ['file-import', 'inbox-in']],
  [/\b(graphql)\b/, ['diagram-project', 'sitemap']],
  [/\b(sandbox|test(ing)?|staging|preview)\b/, ['flask', 'vial', 'eye']],
  [/\b(analytics|metrics|usage|stats|dashboard|reports?|monitoring|observability)\b/, ['chart-line', 'chart-pie', 'chart-bar']],
  [/\b(integrations?|plugins?|extensions?|connectors?)\b/, ['plug', 'puzzle-piece']],
  [/\b(tutorials?|how[- ]?to|guides?|recipes?|walkthroughs?)\b/, ['book-open', 'book-open-reader', 'graduation-cap']],
  [/\b(reference|endpoints?|api|apis|operations?)\b/, ['code', 'brackets-curly', 'file-code']],
  [/\b(configuration|config|settings?|preferences|admin|administration)\b/, ['sliders', 'gear', 'wrench', 'screwdriver-wrench']],
  [/\b(faq|questions?|answers?|help|support)\b/, ['circle-question', 'circle-info', 'life-ring']],
  [/\b(migration|migrations?|upgrade|upgrades?|migrate)\b/, ['arrow-up-right', 'stairs']],
  [/\b(logs?|logging|audit|audits?|history)\b/, ['file-lines', 'clock-rotate-left', 'scroll']],
  [/\b(data|datasets?|database|db|tables?|schemas?)\b/, ['database', 'table', 'server']],
  [/\b(ai|ml|machine[- ]?learning|llm|models?)\b/, ['robot', 'brain', 'microchip']],
  [/\b(globe|language|locale|i18n|internationalization|translations?)\b/, ['globe', 'language', 'earth-americas']],
]

// Fallback pool used round-robin when no keyword rule matches (or every
// candidate of the matching rules is already taken).
const DEFAULT_ICON_POOL = ['file-lines', 'file', 'bookmark', 'note-sticky', 'circle', 'square', 'diamond']
|
|
2505
|
+
|
|
2506
|
+
// ReadMe's hub sidebar renders `<i class="{icon}">` with no normalization, so
|
|
2507
|
+
// a bare "rocket" matches no CSS. Prefix short names with `fa-solid fa-` so
|
|
2508
|
+
// readme.com's FontAwesome 6 Pro stylesheet picks them up. Leaves values that
|
|
2509
|
+
// already include a space or a `fa-` prefix untouched.
|
|
2510
|
+
function formatIconClass(icon) {
|
|
2511
|
+
if (!icon) return icon
|
|
2512
|
+
if (icon.includes(' ') || icon.startsWith('fa-')) return icon
|
|
2513
|
+
return `fa-solid fa-${icon}`
|
|
2514
|
+
}
|
|
2515
|
+
|
|
2516
|
+
function makeIconPicker() {
|
|
2517
|
+
const used = new Set()
|
|
2518
|
+
// Every icon we could ever return, deduped so the round-robin fallback
|
|
2519
|
+
// spreads evenly. Order puts rule icons first (semantically meaningful)
|
|
2520
|
+
// then defaults — this order is also the cycle order once the pool is
|
|
2521
|
+
// exhausted.
|
|
2522
|
+
const allIcons = Array.from(new Set([...ICON_RULES.flatMap(([, icons]) => icons), ...DEFAULT_ICON_POOL]))
|
|
2523
|
+
let cycleIndex = 0
|
|
2524
|
+
|
|
2525
|
+
return function pickIcon(slug, title) {
|
|
2526
|
+
const haystack = `${slug || ''} ${title || ''}`.toLowerCase()
|
|
2527
|
+
|
|
2528
|
+
// First pass: find a semantically-matching rule whose candidates aren't
|
|
2529
|
+
// all already taken globally.
|
|
2530
|
+
for (const [re, icons] of ICON_RULES) {
|
|
2531
|
+
if (!re.test(haystack)) continue
|
|
2532
|
+
for (const icon of icons) {
|
|
2533
|
+
if (!used.has(icon)) {
|
|
2534
|
+
used.add(icon)
|
|
2535
|
+
return icon
|
|
2536
|
+
}
|
|
2537
|
+
}
|
|
2538
|
+
}
|
|
2539
|
+
|
|
2540
|
+
// No rule matched, or all matching rules' candidates are taken. Take the
|
|
2541
|
+
// first globally-unused icon from the full pool.
|
|
2542
|
+
for (const icon of allIcons) {
|
|
2543
|
+
if (!used.has(icon)) {
|
|
2544
|
+
used.add(icon)
|
|
2545
|
+
return icon
|
|
2546
|
+
}
|
|
2547
|
+
}
|
|
2548
|
+
|
|
2549
|
+
// Pool exhausted — spread reuse evenly via round-robin rather than
|
|
2550
|
+
// piling every remaining page onto one icon.
|
|
2551
|
+
const icon = allIcons[cycleIndex % allIcons.length]
|
|
2552
|
+
cycleIndex++
|
|
2553
|
+
return icon
|
|
2554
|
+
}
|
|
2555
|
+
}
|
|
2556
|
+
|
|
2557
|
+
function formatDuration(ms) {
|
|
2558
|
+
const safe = Math.max(0, ms)
|
|
2559
|
+
// Show ms under a second so sub-second work doesn't misleadingly read as "0m 0s".
|
|
2560
|
+
if (safe < 1000) return `${Math.round(safe)}ms`
|
|
2561
|
+
const totalSeconds = Math.round(safe / 1000)
|
|
2562
|
+
const minutes = Math.floor(totalSeconds / 60)
|
|
2563
|
+
const seconds = totalSeconds % 60
|
|
2564
|
+
return `${minutes}m ${seconds}s`
|
|
2565
|
+
}
|