@nuasite/llm-enhancements 0.0.57
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +360 -0
- package/dist/types/build-processor.d.ts +11 -0
- package/dist/types/build-processor.d.ts.map +1 -0
- package/dist/types/cms-marker.d.ts +19 -0
- package/dist/types/cms-marker.d.ts.map +1 -0
- package/dist/types/dev-middleware.d.ts +7 -0
- package/dist/types/dev-middleware.d.ts.map +1 -0
- package/dist/types/html-to-markdown.d.ts +14 -0
- package/dist/types/html-to-markdown.d.ts.map +1 -0
- package/dist/types/index.d.ts +5 -0
- package/dist/types/index.d.ts.map +1 -0
- package/dist/types/llm-endpoint.d.ts +15 -0
- package/dist/types/llm-endpoint.d.ts.map +1 -0
- package/dist/types/markdown-generator.d.ts +29 -0
- package/dist/types/markdown-generator.d.ts.map +1 -0
- package/dist/types/paths.d.ts +31 -0
- package/dist/types/paths.d.ts.map +1 -0
- package/dist/types/tests/tsconfig.tsbuildinfo +1 -0
- package/dist/types/tsconfig.tsbuildinfo +1 -0
- package/dist/types/types.d.ts +34 -0
- package/dist/types/types.d.ts.map +1 -0
- package/package.json +48 -0
- package/src/build-processor.ts +173 -0
- package/src/cms-marker.ts +56 -0
- package/src/dev-middleware.ts +240 -0
- package/src/html-to-markdown.ts +351 -0
- package/src/index.ts +29 -0
- package/src/llm-endpoint.ts +80 -0
- package/src/llms-txt-endpoint.ts +123 -0
- package/src/markdown-generator.ts +138 -0
- package/src/paths.ts +90 -0
- package/src/tsconfig.json +6 -0
- package/src/types.ts +67 -0
|
@@ -0,0 +1,351 @@
|
|
|
1
|
+
import { type HTMLElement, type Node, NodeType, parse } from 'node-html-parser'
|
|
2
|
+
|
|
3
|
+
/**
 * Lower-case tag names that are dropped, together with their whole subtree,
 * when converting HTML to markdown.
 */
const EXCLUDED_TAGS = new Set([
	// Page chrome
	'nav', 'footer', 'header',
	// Non-content resources and embeds
	'script', 'style', 'noscript', 'svg', 'iframe',
	// Forms and interactive controls
	'form', 'button', 'input', 'select', 'textarea',
])
|
|
19
|
+
|
|
20
|
+
/**
 * Lower-case names of block-level elements, i.e. elements that need newlines
 * around them in the markdown output.
 *
 * NOTE(review): nothing in the visible part of this module reads this set —
 * confirm whether it is still needed.
 */
const BLOCK_ELEMENTS = new Set([
	// Generic containers
	'p', 'div', 'section', 'article', 'main', 'aside',
	// Headings
	'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
	// Lists
	'ul', 'ol', 'li',
	// Quoted and preformatted text
	'blockquote', 'pre',
	// Tables
	'table', 'tr', 'thead', 'tbody',
	// Figures
	'figure', 'figcaption',
	// Breaks
	'hr', 'br',
])
|
|
48
|
+
|
|
49
|
+
/**
 * Pick the element whose content should be converted to markdown.
 *
 * Candidates are tried in order of preference; the first selector that
 * matches wins. `body` is the last candidate, and the root itself is
 * returned when nothing matches at all.
 *
 * @param root - Parsed document (or fragment) to search.
 * @returns The first matching container, or `root` when none matches.
 */
function extractMainContent(root: HTMLElement): HTMLElement {
	const candidates = ['main', 'article', '[role="main"]', '.content', '#content', 'body']

	for (const candidate of candidates) {
		const match = root.querySelector(candidate)
		if (match) {
			return match
		}
	}

	return root
}
|
|
67
|
+
|
|
68
|
+
/**
 * Tell whether an element must be left out of the markdown output.
 *
 * @param element - Element to test.
 * @returns `true` when the element's lower-cased tag name is in
 *   `EXCLUDED_TAGS`; `false` otherwise, including when it has no tag name.
 */
function shouldExclude(element: HTMLElement): boolean {
	const tag = element.tagName?.toLowerCase()
	return tag ? EXCLUDED_TAGS.has(tag) : false
}
|
|
76
|
+
|
|
77
|
+
/**
 * Convert an HTML element and its subtree to markdown.
 *
 * @param element - Element to convert.
 * @param depth - Current nesting depth; forwarded unchanged to
 *   `childNodesToMarkdown`, which increments it for child elements.
 * @returns The markdown for the element, or `''` when the element has no tag
 *   name, is excluded by `shouldExclude`, or is an image without `src` or a
 *   link without text.
 */
function elementToMarkdown(element: HTMLElement, depth = 0): string {
	const tagName = element.tagName?.toLowerCase()

	// Nodes without a tag name and excluded tags contribute nothing
	if (!tagName || shouldExclude(element)) {
		return ''
	}

	// Lazy: several branches (img, hr, br, pre, table) never look at the children
	const childrenMd = () => childNodesToMarkdown(element, depth)

	// Wrap inline content in `marker`. Surrounding whitespace is moved outside
	// the markers (`** x **` is not emphasis in CommonMark) and content that is
	// empty or whitespace-only is returned without markers.
	const wrapInline = (marker: string): string => {
		const inner = childrenMd()
		const start = inner.length - inner.trimStart().length
		const end = inner.trimEnd().length
		if (end <= start) return inner
		return `${inner.slice(0, start)}${marker}${inner.slice(start, end)}${marker}${inner.slice(end)}`
	}

	// Headings: h1-h6 map to one to six leading '#'
	const heading = /^h([1-6])$/.exec(tagName)
	if (heading) {
		return `\n${'#'.repeat(Number(heading[1]))} ${childrenMd().trim()}\n`
	}

	switch (tagName) {
		// Paragraphs
		case 'p':
			return `\n${childrenMd().trim()}\n`

		// Inline formatting
		case 'strong':
		case 'b':
			return wrapInline('**')
		case 'em':
		case 'i':
			return wrapInline('*')
		case 'code':
			return wrapInline('`')
		case 's':
		case 'del':
		case 'strike':
			return wrapInline('~~')

		// Links
		case 'a': {
			const href = element.getAttribute('href') ?? ''
			const text = childrenMd().trim()
			if (!text) return ''
			// An anchor without a target is not a link; keep only its text
			return href ? `[${text}](${href})` : text
		}

		// Images
		case 'img': {
			const src = element.getAttribute('src') ?? ''
			const alt = element.getAttribute('alt') ?? ''
			return src ? `![${alt}](${src})` : ''
		}

		// Lists
		case 'ul':
		case 'ol':
			return `\n${childrenMd()}`
		case 'li': {
			const parent = element.parentNode as HTMLElement | null
			const isOrdered = parent?.tagName?.toLowerCase() === 'ol'
			// Every ordered item uses the same '1. ' prefix
			const prefix = isOrdered ? '1. ' : '- '
			return `${prefix}${childrenMd().trim()}\n`
		}

		// Code blocks
		case 'pre': {
			// node-html-parser treats pre content as text, so we need to re-parse innerHTML
			const innerParsed = parse(element.innerHTML)
			const codeElement = innerParsed.querySelector('code')
			const code = codeElement ? codeElement.textContent : element.textContent
			// The fence language comes from a `language-xxx` class on the inner <code>, if any
			const lang = codeElement?.getAttribute('class')?.match(/language-(\w+)/)?.[1] ?? ''
			return `\n\`\`\`${lang}\n${(code ?? '').trim()}\n\`\`\`\n`
		}

		// Blockquotes: prefix every line of the converted content with '> '
		case 'blockquote': {
			const quoted = childrenMd()
				.trim()
				.split('\n')
				.map((line) => `> ${line}`)
				.join('\n')
			return `\n${quoted}\n`
		}

		// Horizontal rule
		case 'hr':
			return '\n---\n'

		// Line break
		case 'br':
			return '\n'

		// Tables
		case 'table':
			return `\n${convertTable(element)}\n`

		// Figures
		case 'figure':
			return `\n${childrenMd()}`
		case 'figcaption':
			return `\n*${childrenMd().trim()}*\n`

		// Containers (div, section, article, main, aside, span, thead, tbody,
		// tfoot) and any unknown tag: just process the children
		default:
			return childrenMd()
	}
}
|
|
205
|
+
|
|
206
|
+
/**
 * Convert the direct child nodes of an element to markdown and concatenate
 * the results in document order.
 *
 * Text nodes have every whitespace run collapsed to a single space; element
 * nodes are converted with `elementToMarkdown` at `depth + 1`; every other
 * node type is ignored.
 *
 * @param element - Parent whose `childNodes` are converted.
 * @param depth - Nesting depth of `element`.
 * @returns The concatenated markdown of all children.
 */
function childNodesToMarkdown(element: HTMLElement, depth: number): string {
	const pieces: string[] = []

	for (const node of element.childNodes) {
		switch (node.nodeType) {
			case NodeType.TEXT_NODE:
				pieces.push(node.textContent?.replace(/\s+/g, ' ') ?? '')
				break
			case NodeType.ELEMENT_NODE:
				pieces.push(elementToMarkdown(node as HTMLElement, depth + 1))
				break
		}
	}

	return pieces.join('')
}
|
|
224
|
+
|
|
225
|
+
/**
 * Convert an HTML table to a pipe-delimited markdown table.
 *
 * The first row that has cells is used as the header row and is always
 * followed directly by the `---` delimiter row, whether its cells are `th` or
 * `td`. Rows without any `th`/`td` cells are skipped.
 *
 * @param table - The `<table>` element.
 * @returns The markdown table, or `''` when the table has no rows with cells.
 */
function convertTable(table: HTMLElement): string {
	const lines: string[] = []

	for (const row of table.querySelectorAll('tr')) {
		const cells = row.querySelectorAll('th, td').map((cell) =>
			childNodesToMarkdown(cell as HTMLElement, 0)
				// A table row must stay on one line, so fold line breaks into spaces
				.replace(/\s*\n\s*/g, ' ')
				.trim()
				// Escape pipe characters so they are not read as column separators
				.replace(/\|/g, '\\|')
		)

		if (cells.length === 0) continue

		lines.push(`| ${cells.join(' | ')} |`)

		// The delimiter row goes right after the first emitted row
		if (lines.length === 1) {
			lines.push(`| ${cells.map(() => '---').join(' | ')} |`)
		}
	}

	return lines.join('\n')
}
|
|
264
|
+
|
|
265
|
+
/**
 * Extract the page title from a parsed HTML document.
 *
 * Sources are tried in order and the first one that yields non-blank text
 * wins: the `<title>` element, the `og:title` meta tag's `content`
 * attribute, then the first `<h1>`. A source that exists but is blank does
 * not stop the search.
 *
 * @param root - Parsed document to search.
 * @returns The trimmed title, or `undefined` when no source has text.
 */
function extractTitle(root: HTMLElement): string | undefined {
	const fromTitleTag = root.querySelector('title')?.textContent?.trim()
	if (fromTitleTag) {
		return fromTitleTag
	}

	const fromOpenGraph = root.querySelector('meta[property="og:title"]')?.getAttribute('content')?.trim()
	if (fromOpenGraph) {
		return fromOpenGraph
	}

	const fromHeading = root.querySelector('h1')?.textContent?.trim()
	return fromHeading || undefined
}
|
|
289
|
+
|
|
290
|
+
/**
 * Extract the page description from a parsed HTML document.
 *
 * Reads the `content` attribute of `meta[name="description"]` and falls back
 * to `meta[property="og:description"]`. A tag that is present but has no
 * (or only blank) content does not stop the search.
 *
 * @param root - Parsed document to search.
 * @returns The trimmed description, or `undefined` when neither tag has content.
 */
function extractDescription(root: HTMLElement): string | undefined {
	const selectors = ['meta[name="description"]', 'meta[property="og:description"]']

	for (const selector of selectors) {
		const content = root.querySelector(selector)?.getAttribute('content')?.trim()
		if (content) {
			return content
		}
	}

	return undefined
}
|
|
306
|
+
|
|
307
|
+
/** Result of converting an HTML document with `htmlToMarkdown`. */
export interface HtmlToMarkdownResult {
	/** Metadata extracted from the document, intended for frontmatter */
	metadata: {
		/** Page title, when one could be extracted */
		title?: string
		/** Page description, when one could be extracted */
		description?: string
	}
	/** Markdown converted from the document's main content */
	body: string
}
|
|
316
|
+
|
|
317
|
+
/**
 * Convert an HTML string to markdown plus frontmatter metadata.
 *
 * The document is parsed once. Title and description are read from the whole
 * document, while the body is converted from the main content area chosen by
 * `extractMainContent`.
 *
 * @param html - Full HTML document (or fragment) as a string.
 * @returns The extracted metadata and the cleaned-up markdown body.
 */
export function htmlToMarkdown(html: string): HtmlToMarkdownResult {
	const root = parse(html)

	// Extract metadata
	const title = extractTitle(root)
	const description = extractDescription(root)

	// Convert the main content, then clean up the output
	const body = elementToMarkdown(extractMainContent(root))
		// Trim trailing whitespace from each line first, so that lines holding
		// only whitespace count as blank when runs of newlines are collapsed
		.split('\n')
		.map((line) => line.trimEnd())
		.join('\n')
		// Collapse three or more consecutive newlines into one blank line
		.replace(/\n{3,}/g, '\n\n')
		// Trim start and end
		.trim()

	return {
		metadata: {
			title,
			description,
		},
		body,
	}
}
|
package/src/index.ts
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
1
|
+
import type { AstroConfig, AstroIntegration } from 'astro'
|
|
2
|
+
import { processBuildOutput } from './build-processor'
|
|
3
|
+
import { createDevMiddleware } from './dev-middleware'
|
|
4
|
+
import { type PageMarkdownOptions, resolveOptions } from './types'
|
|
5
|
+
|
|
6
|
+
/**
 * Create the `astro-page-markdown` Astro integration.
 *
 * The resolved Astro config is captured in `astro:config:done` and read by
 * the two later hooks: the dev server gets the markdown middleware and a
 * finished build is post-processed with `processBuildOutput`.
 *
 * NOTE(review): this relies on Astro calling `astro:config:done` before the
 * other two hooks — confirm against the Astro integration lifecycle.
 *
 * @param options - User options; defaults are filled in by `resolveOptions`.
 * @returns The integration object to list in the Astro config.
 */
export default function pageMarkdown(options: PageMarkdownOptions = {}): AstroIntegration {
	const resolvedOptions = resolveOptions(options)
	let config: AstroConfig

	const hooks: AstroIntegration['hooks'] = {
		// Remember the final config for the hooks below
		'astro:config:done': (params) => {
			config = params.config
		},

		// Dev: serve markdown endpoints from the dev server
		'astro:server:setup': (params) => {
			createDevMiddleware(params.server, resolvedOptions, config)
			params.logger.info('Markdown endpoints enabled')
		},

		// Build: post-process the generated output
		'astro:build:done': async (params) => {
			await processBuildOutput(params.dir, params.pages, resolvedOptions, params.logger, config)
		},
	}

	return { name: 'astro-page-markdown', hooks }
}
|
|
28
|
+
|
|
29
|
+
export type { LlmEndpointOptions, LlmsTxtOptions, MarkdownOutput, PageMarkdownOptions } from './types'
|
|
@@ -0,0 +1,80 @@
|
|
|
1
|
+
import { getMarkdownUrl } from './paths'
|
|
2
|
+
import type { LlmEndpointOptions } from './types'
|
|
3
|
+
|
|
4
|
+
/** A page that can be listed in the generated `llm.md` document. */
export interface PageEntry {
	/** Page path; used for sorting and turned into the markdown URL */
	pathname: string
	/** Optional title, appended after the page's link when present */
	title?: string
	/** Whether the page is a collection page or a static page */
	type: 'collection' | 'static'
}
|
|
9
|
+
|
|
10
|
+
/**
 * Site-level values used as fallbacks when the matching endpoint option is
 * not set.
 */
export interface SiteMetadata {
	/** Fallback for the site name in the document heading */
	title?: string
	/** Fallback for the description paragraph */
	description?: string
	/** Fallback for the prefix put in front of every generated URL */
	baseUrl?: string
}
|
|
15
|
+
|
|
16
|
+
/**
 * Generate the content for /.well-known/llm.md
 *
 * The document consists of a frontmatter block with the generation time, the
 * site name as heading, an optional description, the list of markdown
 * endpoints (sorted by pathname), a short usage section and any additional
 * content supplied through the options.
 *
 * @param pages - Pages to list; the array itself is not modified.
 * @param siteMetadata - Fallbacks for site name, description and base URL.
 * @param options - Endpoint options; `siteName`, `description` and `baseUrl`
 *   take precedence over `siteMetadata`.
 * @returns The complete document, lines joined with `\n`.
 */
export function generateLlmMarkdown(
	pages: PageEntry[],
	siteMetadata: SiteMetadata,
	options: LlmEndpointOptions,
): string {
	const siteName = options.siteName ?? siteMetadata.title ?? 'Site'
	const description = options.description ?? siteMetadata.description
	// Paths appended below start with '/', so drop trailing slashes from the
	// base URL to avoid 'https://example.com//about.md'
	const baseUrl = (options.baseUrl ?? siteMetadata.baseUrl ?? '').replace(/\/+$/, '')

	const lines: string[] = [
		// Frontmatter
		'---',
		`generatedAt: ${new Date().toISOString()}`,
		'---',
		'',
		// Title
		`# ${siteName}`,
		'',
	]

	// Description
	if (description) {
		lines.push(description, '')
	}

	// Markdown endpoints section
	lines.push('## Markdown Endpoints', '', 'This site exposes page content as markdown at `.md` URLs.', '')

	// Pages section: every page sorted by pathname, on a copy of the input
	if (pages.length > 0) {
		lines.push('### Pages', '')

		const sortedPages = [...pages].sort((a, b) => a.pathname.localeCompare(b.pathname))
		for (const page of sortedPages) {
			const link = `${baseUrl}${getMarkdownUrl(page.pathname)}`
			const titleSuffix = page.title ? ` - ${page.title}` : ''
			lines.push(`- [${link}](${link})${titleSuffix}`)
		}
		lines.push('')
	}

	// Usage section
	lines.push(
		'## Usage',
		'',
		'Append `.md` to any page URL to get the markdown version:',
		`- \`${baseUrl}/about\` → \`${baseUrl}/about.md\``,
		`- \`${baseUrl}/blog/hello\` → \`${baseUrl}/blog/hello.md\``,
	)

	// Additional content
	if (options.additionalContent) {
		lines.push('', options.additionalContent)
	}

	return lines.join('\n')
}
|
|
@@ -0,0 +1,123 @@
|
|
|
1
|
+
import { getMarkdownUrl } from './paths'
|
|
2
|
+
import type { LlmsTxtOptions } from './types'
|
|
3
|
+
|
|
4
|
+
/** A page that can be listed in the generated `llms.txt` document. */
export interface PageEntry {
	/** Page path; used for sorting, as link description and to build the markdown URL */
	pathname: string
	/** Optional link text; the pathname is used when it is missing */
	title?: string
	/** Decides the section: `collection` pages go under "Content", `static` pages under "Pages" */
	type: 'collection' | 'static'
}
|
|
9
|
+
|
|
10
|
+
/**
 * Site-level values used as fallbacks when the matching `llms.txt` option is
 * not set.
 */
export interface SiteMetadata {
	/** Fallback for the site name in the document heading */
	title?: string
	/** Fallback for the blockquoted description */
	description?: string
	/** Fallback for the prefix put in front of every generated URL */
	baseUrl?: string
}
|
|
15
|
+
|
|
16
|
+
/**
 * Generate the content for /llms.txt
 *
 * The llms.txt format follows the convention at https://llmstxt.org/
 * providing a standardized way to communicate site structure to LLMs and crawlers.
 *
 * @param pages - Pages to list; collection pages and static pages get their
 *   own section, each sorted by pathname. The array is not modified.
 * @param siteMetadata - Fallbacks for site name, description and base URL.
 * @param options - llms.txt options; `siteName`, `description` and `baseUrl`
 *   take precedence over `siteMetadata`.
 * @returns The complete document, lines joined with `\n`.
 */
export function generateLlmsTxt(
	pages: PageEntry[],
	siteMetadata: SiteMetadata,
	options: LlmsTxtOptions,
): string {
	const siteName = options.siteName ?? siteMetadata.title ?? 'Site'
	const description = options.description ?? siteMetadata.description
	// Paths appended below start with '/', so drop trailing slashes from the
	// base URL to avoid 'https://example.com//about.md'
	const baseUrl = (options.baseUrl ?? siteMetadata.baseUrl ?? '').replace(/\/+$/, '')
	const crawlingAllowed = options.allowCrawling !== false

	const lines: string[] = []

	// Append one "### heading" section listing `entries` sorted by pathname;
	// nothing is written for an empty list
	const pushPageSection = (heading: string, entries: PageEntry[]): void => {
		if (entries.length === 0) return
		lines.push(`### ${heading}`, '')
		const sorted = [...entries].sort((a, b) => a.pathname.localeCompare(b.pathname))
		for (const page of sorted) {
			const mdUrl = getMarkdownUrl(page.pathname)
			const title = page.title ?? page.pathname
			lines.push(`- [${title}](${baseUrl}${mdUrl}): ${page.pathname}`)
		}
		lines.push('')
	}

	// Title
	lines.push(`# ${siteName}`, '')

	// Description as blockquote (llms.txt convention); every line is prefixed
	// so that a multi-line description stays inside the quote
	if (description) {
		lines.push(...description.split(/\r?\n/).map((line) => `> ${line}`), '')
	}

	// LLM-specific guidance
	if (crawlingAllowed) {
		lines.push('This site provides markdown versions of all pages for LLM consumption.', '')
	}

	// LLM Discovery endpoint
	lines.push(
		'## LLM Discovery',
		'',
		`- [LLM Discovery Endpoint](${baseUrl}/.well-known/llm.md): Full site map with all available markdown endpoints`,
		'',
	)

	// Markdown endpoints section
	lines.push('## Markdown Endpoints', '', 'All pages are available as markdown by appending `.md` to the URL.', '')

	// Separate collection and static pages
	const collectionPages = pages.filter((p) => p.type === 'collection')
	const staticPages = pages.filter((p) => p.type === 'static')

	pushPageSection('Content', collectionPages)
	pushPageSection('Pages', staticPages)

	// If both are empty, show a generic message
	if (collectionPages.length === 0 && staticPages.length === 0) {
		lines.push(
			'Append `.md` to any page URL to get the markdown version.',
			'',
			'Examples:',
			'- `/about` → `/about.md`',
			'- `/blog/post` → `/blog/post.md`',
			'',
		)
	}

	// Crawling permissions
	lines.push('## Permissions', '')
	lines.push(
		crawlingAllowed
			? 'LLMs and crawlers are welcome to access markdown endpoints.'
			: 'Please respect rate limits when crawling.',
	)

	// Additional instructions
	if (options.instructions) {
		lines.push('', options.instructions)
	}

	// Additional content
	if (options.additionalContent) {
		lines.push('', options.additionalContent)
	}

	return lines.join('\n')
}
|
|
@@ -0,0 +1,138 @@
|
|
|
1
|
+
import type { MarkdownOutput } from './types'
|
|
2
|
+
|
|
3
|
+
/** Characters that give a plain YAML scalar a special meaning when they come first */
const YAML_LEADING_INDICATOR = /^(?:[,\[\]{}&*!|>'"%@`]|[-?](?:\s|$))/

/**
 * Serialize a value to YAML format
 *
 * - `null`/`undefined` become `null`.
 * - Strings are written plain when that is safe and double-quoted otherwise:
 *   when they are empty, contain a line break, tab, `:` or `#`, start or end
 *   with a space, or start with a YAML indicator character (for example `@`,
 *   `[`, `"`, `*`, or `- `).
 * - Numbers and booleans use their `String()` form.
 * - Valid `Date`s are written as ISO-8601 timestamps, invalid ones as `null`.
 * - Arrays and objects are written in block style starting with a newline,
 *   each nesting level indented by two more spaces.
 *
 * NOTE(review): strings that merely look like another YAML type (`true`,
 * `42`, `2024-01-01`) are still written unquoted, as before — confirm
 * whether callers rely on that.
 *
 * @param value - Value to serialize.
 * @param depth - Indentation level, in units of two spaces, for the entries
 *   of an array or object. Callers normally leave the default.
 * @returns The YAML text; block collections start with `\n`.
 */
function yamlValue(value: unknown, depth = 1): string {
	if (value === null || value === undefined) {
		return 'null'
	}

	if (typeof value === 'string') {
		const needsQuoting = value === ''
			|| /[\n\r\t:#]/.test(value)
			|| value.startsWith(' ')
			|| value.endsWith(' ')
			|| YAML_LEADING_INDICATOR.test(value)
		if (!needsQuoting) {
			return value
		}
		// Use double quotes and escape; the backslash must be handled first
		const escaped = value
			.replace(/\\/g, '\\\\')
			.replace(/"/g, '\\"')
			.replace(/\n/g, '\\n')
			.replace(/\r/g, '\\r')
			.replace(/\t/g, '\\t')
		return `"${escaped}"`
	}

	if (typeof value === 'number' || typeof value === 'boolean') {
		return String(value)
	}

	// Must come before the generic object branch: a Date has no own
	// enumerable properties and would otherwise serialize as '{}'
	if (value instanceof Date) {
		return Number.isNaN(value.getTime()) ? 'null' : value.toISOString()
	}

	const pad = '  '.repeat(depth)
	// Join a prefix ('-' or 'key:') with a rendered value; nested block
	// collections already start with a newline and take no separating space
	const attach = (prefix: string, rendered: string): string =>
		rendered.startsWith('\n') ? `${pad}${prefix}${rendered}` : `${pad}${prefix} ${rendered}`

	if (Array.isArray(value)) {
		if (value.length === 0) return '[]'
		return '\n' + value.map((item) => attach('-', yamlValue(item, depth + 1))).join('\n')
	}

	if (typeof value === 'object') {
		const entries = Object.entries(value)
		if (entries.length === 0) return '{}'
		return '\n' + entries.map(([key, item]) => attach(`${key}:`, yamlValue(item, depth + 1))).join('\n')
	}

	return String(value)
}
|
|
39
|
+
|
|
40
|
+
/**
 * Render a frontmatter object as YAML, one `key: value` entry per top-level
 * key, in the object's own key order.
 *
 * Values are serialized with `yamlValue`. A serialized value that starts with
 * a newline (a block collection) is attached directly after the colon, so the
 * line does not end in a trailing space.
 *
 * @param frontmatter - Frontmatter keys and values.
 * @returns The YAML text without `---` delimiters; `''` for an empty object.
 */
function frontmatterToYaml(frontmatter: Record<string, unknown>): string {
	return Object.entries(frontmatter)
		.map(([key, value]) => {
			const rendered = yamlValue(value)
			const separator = rendered.startsWith('\n') ? ':' : ': '
			return `${key}${separator}${rendered}`
		})
		.join('\n')
}
|
|
57
|
+
|
|
58
|
+
/** Per-page values that `generateMarkdown` writes into the frontmatter. */
export interface GenerateOptions {
	/** URL path of the page; written as `url` */
	url: string
	/** Kind of content (collection or static); written as `type` */
	type: 'collection' | 'static'
	/** Path to the source file (for collections); written as `source` when set */
	sourcePath?: string
}
|
|
66
|
+
|
|
67
|
+
/**
 * Build the final markdown document for a page, optionally prefixed with a
 * YAML frontmatter block.
 *
 * The frontmatter starts from `output.frontmatter`; `url`, `type` and
 * `generatedAt` (current time, ISO-8601) are then set and replace any
 * same-named keys. `source` is added when `options.sourcePath` is set.
 *
 * @param output - Frontmatter and markdown body of the page.
 * @param options - Page URL, content type and optional source path.
 * @param includeFrontmatter - When `false`, only `output.body` is returned.
 * @returns The markdown document.
 */
export function generateMarkdown(
	output: MarkdownOutput,
	options: GenerateOptions,
	includeFrontmatter = true,
): string {
	const { body } = output

	if (!includeFrontmatter) {
		return body
	}

	// Page frontmatter first, then the generated keys so that they win
	const frontmatter: Record<string, unknown> = Object.assign({}, output.frontmatter, {
		url: options.url,
		type: options.type,
		generatedAt: new Date().toISOString(),
	})

	if (options.sourcePath) {
		frontmatter.source = options.sourcePath
	}

	return ['---', frontmatterToYaml(frontmatter), '---', '', body].join('\n')
}
|
|
95
|
+
|
|
96
|
+
/**
 * Build a `MarkdownOutput` for collection content whose body is already
 * markdown.
 *
 * Each frontmatter entry arrives as `{ value, line }`; only `value` is kept,
 * giving a plain key/value frontmatter object.
 *
 * @param frontmatter - Frontmatter entries keyed by field name.
 * @param body - Markdown body, passed through unchanged.
 * @param sourcePath - Path of the source file, passed through unchanged.
 * @returns The output object holding the flattened frontmatter, body and source path.
 */
export function createCollectionOutput(
	frontmatter: Record<string, { value: string; line: number }>,
	body: string,
	sourcePath: string,
): MarkdownOutput {
	const simpleFrontmatter = Object.entries(frontmatter).reduce<Record<string, unknown>>(
		(flat, [key, entry]) => {
			flat[key] = entry.value
			return flat
		},
		{},
	)

	return { frontmatter: simpleFrontmatter, body, sourcePath }
}
|
|
117
|
+
|
|
118
|
+
/**
 * Build a `MarkdownOutput` from the result of an HTML conversion.
 *
 * `title` and `description` are copied into the frontmatter only when they
 * are non-empty, title first.
 *
 * @param metadata - Title and description extracted from the page.
 * @param body - Converted markdown body, passed through unchanged.
 * @returns The output object holding the frontmatter and the body.
 */
export function createStaticOutput(
	metadata: { title?: string; description?: string },
	body: string,
): MarkdownOutput {
	const { title, description } = metadata

	const frontmatter: Record<string, unknown> = {
		...(title ? { title } : {}),
		...(description ? { description } : {}),
	}

	return { frontmatter, body }
}
|