@nuasite/llm-enhancements 0.0.57

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,351 @@
1
+ import { type HTMLElement, type Node, NodeType, parse } from 'node-html-parser'
2
+
3
/**
 * Lower-case tag names that are left out of the markdown output.
 * Grouped below by the kind of markup they represent.
 */
const EXCLUDED_TAGS = new Set<string>([
  // Site chrome
  'nav',
  'footer',
  'header',
  // Non-content and embedded resources
  'script',
  'style',
  'noscript',
  'svg',
  'iframe',
  // Form controls
  'form',
  'button',
  'input',
  'select',
  'textarea',
])
19
+
20
/**
 * Lower-case tag names of block-level elements, i.e. elements that need
 * newlines around them in the markdown output.
 *
 * NOTE(review): nothing in the visible part of this file reads this set —
 * confirm whether it is still needed.
 */
const BLOCK_ELEMENTS = new Set<string>([
  // Text containers and sectioning
  'p',
  'div',
  'section',
  'article',
  'main',
  'aside',
  // Headings
  'h1',
  'h2',
  'h3',
  'h4',
  'h5',
  'h6',
  // Lists
  'ul',
  'ol',
  'li',
  // Quotes and preformatted text
  'blockquote',
  'pre',
  // Tables
  'table',
  'tr',
  'thead',
  'tbody',
  // Figures
  'figure',
  'figcaption',
  // Separators
  'hr',
  'br',
])
48
+
49
/**
 * Pick the element that holds the page's primary content.
 *
 * @param root Parsed document (or fragment) to search.
 * @returns The first match among the candidate selectors, or `root` itself
 *          when none of them matches.
 */
function extractMainContent(root: HTMLElement): HTMLElement {
  // Ordered by preference; <body> is the last resort before the root itself.
  const candidates = ['main', 'article', '[role="main"]', '.content', '#content', 'body']

  for (const candidate of candidates) {
    const match = root.querySelector(candidate)
    if (match) return match
  }

  return root
}
67
+
68
/**
 * Tell whether an element is left out of the markdown output.
 *
 * @returns `true` when the element's lower-cased tag name is listed in
 *          `EXCLUDED_TAGS`; `false` otherwise, including for elements
 *          that have no tag name.
 */
function shouldExclude(element: HTMLElement): boolean {
  const tag = element.tagName?.toLowerCase()
  return tag ? EXCLUDED_TAGS.has(tag) : false
}
76
+
77
/**
 * Wrap inline content in a markdown delimiter (`**`, `*`, `~~`).
 *
 * Edge whitespace is moved outside the delimiters and empty content is
 * returned untouched: delimiters that hug whitespace (`** bold **`) or
 * nothing at all (`****`) are not parsed as emphasis.
 */
function wrapInline(delimiter: string, content: string): string {
  const inner = content.trim()
  if (!inner) return content

  const leading = content.slice(0, content.length - content.trimStart().length)
  const trailing = content.slice(content.trimEnd().length)
  return `${leading}${delimiter}${inner}${delimiter}${trailing}`
}

/**
 * Render inline code. When the content itself contains backticks, a longer
 * backtick fence padded with spaces is used so the span is not closed early.
 */
function wrapInlineCode(content: string): string {
  if (!content.trim()) return content

  const backtickRuns = content.match(/`+/g) ?? []
  if (backtickRuns.length === 0) return `\`${content}\``

  const fence = '`'.repeat(Math.max(...backtickRuns.map((run) => run.length)) + 1)
  return `${fence} ${content} ${fence}`
}

/** Indent every non-empty line after the first one by `indent`. */
function indentContinuation(text: string, indent: string): string {
  return text
    .split('\n')
    .map((line, index) => (index === 0 || line === '' ? line : indent + line))
    .join('\n')
}

/**
 * Convert an HTML element and its subtree to markdown.
 *
 * @param element Element to convert.
 * @param depth   Nesting depth; it is only forwarded to `childNodesToMarkdown`,
 *                which increments it for each level of descent.
 * @returns Markdown for the element, or `''` for excluded elements.
 */
function elementToMarkdown(element: HTMLElement, depth = 0): string {
  const tagName = element.tagName?.toLowerCase()

  // A tag-less element is the synthetic root returned by `parse`. It is a
  // transparent wrapper, so convert its children instead of dropping the
  // whole document (which happens for fragments without <body>/<main>).
  if (!tagName) {
    return childNodesToMarkdown(element, depth)
  }

  if (shouldExclude(element)) {
    return ''
  }

  // Lazily convert the children; some branches never need them.
  const childrenMd = () => childNodesToMarkdown(element, depth)

  // Headings: h1..h6 map to one '#' per level.
  const heading = /^h([1-6])$/.exec(tagName)
  if (heading) {
    return `\n${'#'.repeat(Number(heading[1]))} ${childrenMd().trim()}\n`
  }

  switch (tagName) {
    // Paragraphs
    case 'p':
      return `\n${childrenMd().trim()}\n`

    // Inline formatting
    case 'strong':
    case 'b':
      return wrapInline('**', childrenMd())
    case 'em':
    case 'i':
      return wrapInline('*', childrenMd())
    case 'code':
      return wrapInlineCode(childrenMd())
    case 's':
    case 'del':
    case 'strike':
      return wrapInline('~~', childrenMd())

    // Links
    case 'a': {
      const text = childrenMd().trim()
      if (!text) return ''
      const href = element.getAttribute('href')?.trim()
      // An anchor without a target would become the broken link `[text]()`.
      return href ? `[${text}](${href})` : text
    }

    // Images
    case 'img': {
      const src = element.getAttribute('src') ?? ''
      const alt = element.getAttribute('alt') ?? ''
      return `![${alt}](${src})`
    }

    // Lists
    case 'ul':
    case 'ol':
      return `\n${childrenMd()}`
    case 'li': {
      const parent = element.parentNode as HTMLElement | null
      const marker = parent?.tagName?.toLowerCase() === 'ol' ? '1. ' : '- '
      const content = childrenMd().trim()
      // Continuation lines (nested lists, extra paragraphs) must be indented
      // to the marker width, otherwise they are parsed as siblings.
      return `${marker}${indentContinuation(content, ' '.repeat(marker.length))}\n`
    }

    // Code blocks
    case 'pre': {
      // node-html-parser treats pre content as text, so we need to re-parse innerHTML
      const innerParsed = parse(element.innerHTML)
      const codeElement = innerParsed.querySelector('code')
      // Without a <code> child, read the text of the re-parsed content so
      // inline markup (e.g. highlighting spans) is not emitted as raw tags.
      const code = (codeElement ?? innerParsed).textContent ?? ''
      const lang = codeElement?.getAttribute('class')?.match(/language-(\w+)/)?.[1] ?? ''
      // Drop leading blank lines and trailing whitespace only; the first code
      // line keeps its indentation.
      const trimmedCode = code.replace(/^(?:[ \t]*\r?\n)+/, '').trimEnd()
      return `\n\`\`\`${lang}\n${trimmedCode}\n\`\`\`\n`
    }

    // Blockquotes
    case 'blockquote': {
      const quoted = childrenMd()
        .trim()
        .split('\n')
        .map((line) => `> ${line}`)
        .join('\n')
      return `\n${quoted}\n`
    }

    // Horizontal rule
    case 'hr':
      return '\n---\n'

    // Line break
    case 'br':
      return '\n'

    // Tables
    case 'table':
      return `\n${convertTable(element)}\n`

    // Figures
    case 'figure':
      return `\n${childrenMd()}`
    case 'figcaption': {
      const caption = childrenMd().trim()
      return caption ? `\n*${caption}*\n` : ''
    }

    // Containers (div, section, article, main, aside, span, thead, tbody,
    // tfoot) and unknown tags are transparent: only their children are emitted.
    default:
      return childrenMd()
  }
}
205
+
206
/**
 * Convert the direct children of `element` to markdown and concatenate them.
 *
 * Text nodes have every whitespace run collapsed to a single space, element
 * nodes are converted with `elementToMarkdown` at `depth + 1`, and any other
 * node type is skipped.
 */
function childNodesToMarkdown(element: HTMLElement, depth: number): string {
  const pieces: string[] = []

  for (const node of element.childNodes) {
    switch (node.nodeType) {
      case NodeType.TEXT_NODE:
        pieces.push(node.textContent?.replace(/\s+/g, ' ') ?? '')
        break
      case NodeType.ELEMENT_NODE:
        pieces.push(elementToMarkdown(node as HTMLElement, depth + 1))
        break
    }
  }

  return pieces.join('')
}
224
+
225
/**
 * Convert an HTML table to a markdown (GFM) table.
 *
 * The first row that has cells becomes the header row and is always followed
 * directly by the `---` delimiter row, because a GFM table is only recognised
 * when the delimiter is its second line. Rows without `th`/`td` cells are
 * skipped.
 *
 * @param table The `<table>` element.
 * @returns The table as markdown lines joined by `\n`, or `''` when the table
 *          has no rows with cells.
 */
function convertTable(table: HTMLElement): string {
  const lines: string[] = []

  for (const row of table.querySelectorAll('tr')) {
    const cells = row.querySelectorAll('th, td').map((cell) =>
      childNodesToMarkdown(cell as HTMLElement, 0)
        // A table row must stay on one physical line, so line breaks coming
        // from <br>, <p>, lists etc. inside a cell are folded into spaces.
        .replace(/\s*\n\s*/g, ' ')
        .trim()
        // Escape pipe characters so they are not read as column separators.
        .replace(/\|/g, '\\|')
    )

    if (cells.length === 0) continue

    lines.push(`| ${cells.join(' | ')} |`)

    // The delimiter row goes right after the first emitted row.
    if (lines.length === 1) {
      lines.push(`| ${cells.map(() => '---').join(' | ')} |`)
    }
  }

  return lines.join('\n')
}
264
+
265
/**
 * Extract the page title from HTML.
 *
 * Sources are tried in order — `<title>`, the `og:title` meta tag, the first
 * `<h1>` — and a source that is missing, empty or whitespace-only is skipped
 * so the next one still gets a chance.
 *
 * @returns The trimmed title, or `undefined` when no source yields text.
 */
function extractTitle(root: HTMLElement): string | undefined {
  const fromTitleTag = root.querySelector('title')?.textContent?.trim()
  if (fromTitleTag) return fromTitleTag

  const fromOpenGraph = root.querySelector('meta[property="og:title"]')?.getAttribute('content')?.trim()
  if (fromOpenGraph) return fromOpenGraph

  const fromHeading = root.querySelector('h1')?.textContent?.trim()
  return fromHeading || undefined
}
289
+
290
/**
 * Extract the page description from HTML.
 *
 * `meta[name="description"]` is preferred, with `og:description` as the
 * fallback. A tag whose `content` attribute is missing, empty or
 * whitespace-only is skipped so the fallback is still consulted.
 *
 * @returns The trimmed description, or `undefined` when neither tag has content.
 */
function extractDescription(root: HTMLElement): string | undefined {
  const selectors = ['meta[name="description"]', 'meta[property="og:description"]']

  for (const selector of selectors) {
    const content = root.querySelector(selector)?.getAttribute('content')?.trim()
    if (content) return content
  }

  return undefined
}
306
+
307
/** Result of converting an HTML document to markdown. */
export interface HtmlToMarkdownResult {
  /** Page-level metadata extracted from the document, intended for frontmatter. */
  metadata: {
    /** Page title, when one was found. */
    title?: string
    /** Page description, when one was found. */
    description?: string
  }
  /** The converted markdown body. */
  body: string
}
316
+
317
/**
 * Convert an HTML string to markdown.
 *
 * @param html Full HTML document or fragment.
 * @returns The extracted title/description plus the markdown body of the
 *          page's main content area.
 */
export function htmlToMarkdown(html: string): HtmlToMarkdownResult {
  const root = parse(html)

  // Extract metadata
  const metadata = {
    title: extractTitle(root),
    description: extractDescription(root),
  }

  // Convert the main content area
  const rawBody = elementToMarkdown(extractMainContent(root))

  // Clean up the output. Trailing whitespace is stripped BEFORE newline runs
  // are collapsed: whitespace-only lines (left behind by text nodes between
  // block elements) would otherwise hide runs of blank lines from the
  // collapse and survive as three or more consecutive newlines.
  // NOTE(review): this cleanup also applies to lines inside fenced code blocks.
  const body = rawBody
    .split('\n')
    .map((line) => line.trimEnd())
    .join('\n')
    .replace(/\n{3,}/g, '\n\n')
    .trim()

  return { metadata, body }
}
package/src/index.ts ADDED
@@ -0,0 +1,29 @@
1
+ import type { AstroConfig, AstroIntegration } from 'astro'
2
+ import { processBuildOutput } from './build-processor'
3
+ import { createDevMiddleware } from './dev-middleware'
4
+ import { type PageMarkdownOptions, resolveOptions } from './types'
5
+
6
/**
 * Astro integration that wires up the markdown endpoints: a middleware on
 * the dev server and a post-processing step once a build has finished.
 *
 * @param options User options; resolved once via `resolveOptions`.
 */
export default function pageMarkdown(options: PageMarkdownOptions = {}): AstroIntegration {
  const settings = resolveOptions(options)
  // Assigned in `astro:config:done`; read by the server-setup and build-done hooks.
  let astroConfig: AstroConfig

  const integration: AstroIntegration = {
    name: 'astro-page-markdown',
    hooks: {
      'astro:config:done': (params) => {
        astroConfig = params.config
      },

      'astro:server:setup': (params) => {
        createDevMiddleware(params.server, settings, astroConfig)
        params.logger.info('Markdown endpoints enabled')
      },

      'astro:build:done': async (params) => {
        await processBuildOutput(params.dir, params.pages, settings, params.logger, astroConfig)
      },
    },
  }

  return integration
}
28
+
29
+ export type { LlmEndpointOptions, LlmsTxtOptions, MarkdownOutput, PageMarkdownOptions } from './types'
@@ -0,0 +1,80 @@
1
+ import { getMarkdownUrl } from './paths'
2
+ import type { LlmEndpointOptions } from './types'
3
+
4
/** A page that can be listed in the generated discovery document. */
export interface PageEntry {
  /** URL pathname of the page; the markdown URL is derived from it. */
  pathname: string
  /** Optional page title, shown next to the link when present. */
  title?: string
  /** Whether the page is a `collection` page or a `static` page. */
  type: 'collection' | 'static'
}
9
+
10
/** Site-wide values used as fallbacks when the options do not provide them. */
export interface SiteMetadata {
  /** Site title. */
  title?: string
  /** Site description. */
  description?: string
  /** Prefix prepended to generated URLs. */
  baseUrl?: string
}
15
+
16
/**
 * Generate the content for /.well-known/llm.md
 *
 * @param pages        Pages to list; they are sorted by pathname, and the
 *                     "Pages" section is omitted when the list is empty.
 * @param siteMetadata Fallbacks for site name, description and base URL.
 * @param options      Explicit overrides plus optional `additionalContent`
 *                     that is appended at the end.
 * @returns The markdown document, lines joined by `\n`. It starts with a
 *          frontmatter block holding the generation timestamp.
 */
export function generateLlmMarkdown(
  pages: PageEntry[],
  siteMetadata: SiteMetadata,
  options: LlmEndpointOptions,
): string {
  const siteName = options.siteName ?? siteMetadata.title ?? 'Site'
  const description = options.description ?? siteMetadata.description
  // Paths are appended with their own leading slash, so a trailing slash on
  // the base URL would produce "//" in every generated URL.
  const baseUrl = (options.baseUrl ?? siteMetadata.baseUrl ?? '').replace(/\/+$/, '')

  const lines: string[] = [
    // Frontmatter
    '---',
    `generatedAt: ${new Date().toISOString()}`,
    '---',
    '',
    // Title
    `# ${siteName}`,
    '',
  ]

  // Description
  if (description) {
    lines.push(description, '')
  }

  // Markdown endpoints section
  lines.push(
    '## Markdown Endpoints',
    '',
    'This site exposes page content as markdown at `.md` URLs.',
    '',
  )

  // Pages section
  if (pages.length > 0) {
    lines.push('### Pages', '')

    // List all pages sorted by pathname (copy first; the input is not mutated)
    const sortedPages = [...pages].sort((a, b) => a.pathname.localeCompare(b.pathname))
    for (const page of sortedPages) {
      const url = `${baseUrl}${getMarkdownUrl(page.pathname)}`
      const titleSuffix = page.title ? ` - ${page.title}` : ''
      lines.push(`- [${url}](${url})${titleSuffix}`)
    }
    lines.push('')
  }

  // Usage section
  lines.push(
    '## Usage',
    '',
    'Append `.md` to any page URL to get the markdown version:',
    `- \`${baseUrl}/about\` → \`${baseUrl}/about.md\``,
    `- \`${baseUrl}/blog/hello\` → \`${baseUrl}/blog/hello.md\``,
  )

  // Additional content
  if (options.additionalContent) {
    lines.push('', options.additionalContent)
  }

  return lines.join('\n')
}
@@ -0,0 +1,123 @@
1
+ import { getMarkdownUrl } from './paths'
2
+ import type { LlmsTxtOptions } from './types'
3
+
4
/** A page that can be listed in the generated llms.txt. */
export interface PageEntry {
  /** URL pathname of the page; the markdown URL is derived from it. */
  pathname: string
  /** Optional page title; the pathname is used as link text when absent. */
  title?: string
  /** Decides the section: `collection` pages under "Content", `static` under "Pages". */
  type: 'collection' | 'static'
}
9
+
10
/** Site-wide values used as fallbacks when the options do not provide them. */
export interface SiteMetadata {
  /** Site title. */
  title?: string
  /** Site description. */
  description?: string
  /** Prefix prepended to generated URLs. */
  baseUrl?: string
}
15
+
16
/**
 * Generate the content for /llms.txt
 *
 * The llms.txt format follows the convention at https://llmstxt.org/
 * providing a standardized way to communicate site structure to LLMs and crawlers.
 *
 * @param pages        Pages to list. Collection pages go under "Content" and
 *                     static pages under "Pages", each sorted by pathname.
 *                     Generic usage examples are emitted when both are empty.
 * @param siteMetadata Fallbacks for site name, description and base URL.
 * @param options      Explicit overrides, the `allowCrawling` switch, and
 *                     optional `instructions` / `additionalContent` that are
 *                     appended at the end.
 * @returns The llms.txt document, lines joined by `\n`.
 */
export function generateLlmsTxt(
  pages: PageEntry[],
  siteMetadata: SiteMetadata,
  options: LlmsTxtOptions,
): string {
  const siteName = options.siteName ?? siteMetadata.title ?? 'Site'
  const description = options.description ?? siteMetadata.description
  // Paths are appended with their own leading slash, so a trailing slash on
  // the base URL would produce "//" in every generated URL.
  const baseUrl = (options.baseUrl ?? siteMetadata.baseUrl ?? '').replace(/\/+$/, '')
  const crawlingAllowed = options.allowCrawling !== false

  // Title
  const lines: string[] = [`# ${siteName}`, '']

  // Description as blockquote (llms.txt convention)
  if (description) {
    lines.push(`> ${description}`, '')
  }

  // LLM-specific guidance
  if (crawlingAllowed) {
    lines.push('This site provides markdown versions of all pages for LLM consumption.', '')
  }

  // LLM Discovery endpoint
  lines.push(
    '## LLM Discovery',
    '',
    `- [LLM Discovery Endpoint](${baseUrl}/.well-known/llm.md): Full site map with all available markdown endpoints`,
    '',
  )

  // Markdown endpoints section
  lines.push(
    '## Markdown Endpoints',
    '',
    'All pages are available as markdown by appending `.md` to the URL.',
    '',
  )

  // Emit one heading plus a pathname-sorted link list; nothing for an empty group.
  const pushPageSection = (heading: string, entries: PageEntry[]): void => {
    if (entries.length === 0) return

    lines.push(heading, '')
    const sorted = [...entries].sort((a, b) => a.pathname.localeCompare(b.pathname))
    for (const page of sorted) {
      const mdUrl = getMarkdownUrl(page.pathname)
      const title = page.title ?? page.pathname
      lines.push(`- [${title}](${baseUrl}${mdUrl}): ${page.pathname}`)
    }
    lines.push('')
  }

  // Separate collection and static pages
  const collectionPages = pages.filter((p) => p.type === 'collection')
  const staticPages = pages.filter((p) => p.type === 'static')

  pushPageSection('### Content', collectionPages)
  pushPageSection('### Pages', staticPages)

  // If both are empty, show a generic message
  if (collectionPages.length === 0 && staticPages.length === 0) {
    lines.push(
      'Append `.md` to any page URL to get the markdown version.',
      '',
      'Examples:',
      '- `/about` → `/about.md`',
      '- `/blog/post` → `/blog/post.md`',
      '',
    )
  }

  // Crawling permissions
  lines.push('## Permissions', '')
  lines.push(
    crawlingAllowed
      ? 'LLMs and crawlers are welcome to access markdown endpoints.'
      : 'Please respect rate limits when crawling.',
  )

  // Additional instructions
  if (options.instructions) {
    lines.push('', options.instructions)
  }

  // Additional content
  if (options.additionalContent) {
    lines.push('', options.additionalContent)
  }

  return lines.join('\n')
}
@@ -0,0 +1,138 @@
1
+ import type { MarkdownOutput } from './types'
2
+
3
/**
 * Serialize a value to YAML format
 *
 * Scalars are returned inline. Non-empty arrays and objects are returned as
 * a block that starts with `\n` and is indented by two spaces, so the caller
 * can append it directly after `key:`. Nested collections are indented one
 * further level per nesting depth.
 *
 * NOTE(review): strings that look like other YAML types (`true`, `123`,
 * `null`) or start with an indicator character (`[`, `"`, `*`, ...) are still
 * emitted unquoted, as before. Callers may rely on that for values taken
 * verbatim from existing frontmatter — confirm before tightening.
 *
 * @param value Any JSON-like value.
 * @returns The YAML text for `value`.
 */
function yamlValue(value: unknown): string {
  if (value === null || value === undefined) {
    return 'null'
  }

  if (typeof value === 'string') {
    // Check if string needs quoting
    const needsQuoting = value === ''
      || value.includes('\n')
      || value.includes('\r')
      || value.includes(':')
      || value.includes('#')
      || value.startsWith(' ')
      || value.endsWith(' ')
    if (!needsQuoting) return value

    // Use double quotes and escape. Backslashes go first so the escapes
    // added afterwards are not escaped again.
    const escaped = value
      .replace(/\\/g, '\\\\')
      .replace(/"/g, '\\"')
      .replace(/\n/g, '\\n')
      .replace(/\r/g, '\\r')
      .replace(/\t/g, '\\t')
    return `"${escaped}"`
  }

  if (typeof value === 'number' || typeof value === 'boolean') {
    return String(value)
  }

  // Render a child of a collection: scalars follow after one space, nested
  // blocks are pushed one level (two spaces) deeper. Shifting line by line is
  // safe because quoted strings never contain a raw newline.
  const renderChild = (child: unknown): string => {
    const rendered = yamlValue(child)
    return rendered.startsWith('\n') ? rendered.replace(/\n/g, '\n  ') : ` ${rendered}`
  }

  if (Array.isArray(value)) {
    if (value.length === 0) return '[]'
    return '\n' + value.map((item) => `  -${renderChild(item)}`).join('\n')
  }

  if (typeof value === 'object') {
    const entries = Object.entries(value)
    if (entries.length === 0) return '{}'
    return '\n' + entries.map(([key, item]) => `  ${key}:${renderChild(item)}`).join('\n')
  }

  return String(value)
}
39
+
40
/**
 * Render a frontmatter object as YAML, one top-level key per entry.
 *
 * Values are serialized with `yamlValue`. A value rendered as a block (its
 * text starts with a newline) is attached directly after `key:`, anything
 * else after `key: `.
 */
function frontmatterToYaml(frontmatter: Record<string, unknown>): string {
  return Object.entries(frontmatter)
    .map(([key, value]) => {
      const rendered = yamlValue(value)
      const separator = rendered.startsWith('\n') ? '' : ' '
      return `${key}:${separator}${rendered}`
    })
    .join('\n')
}
57
+
58
/** Per-page information that is written into the generated frontmatter. */
export interface GenerateOptions {
  /** URL path of the page. */
  url: string
  /** Kind of content the page was generated from: `collection` or `static`. */
  type: 'collection' | 'static'
  /** Path to the source file (for collections). */
  sourcePath?: string
}
66
+
67
/**
 * Assemble the final markdown document for a page.
 *
 * @param output             Body and frontmatter of the page.
 * @param options            Page URL, content type and optional source path.
 * @param includeFrontmatter When `false`, the bare body is returned.
 * @returns The body, preceded by a `---` delimited YAML block unless
 *          frontmatter is disabled. The block holds the page's own
 *          frontmatter, then `url`, `type` and `generatedAt` (which override
 *          same-named keys), then `source` when a source path is given.
 */
export function generateMarkdown(
  output: MarkdownOutput,
  options: GenerateOptions,
  includeFrontmatter = true,
): string {
  if (!includeFrontmatter) return output.body

  const { url, type, sourcePath } = options

  // Later entries win, so the generated metadata overrides the page's own keys.
  const frontmatter: Record<string, unknown> = {
    ...output.frontmatter,
    url,
    type,
    generatedAt: new Date().toISOString(),
    ...(sourcePath ? { source: sourcePath } : {}),
  }

  return ['---', frontmatterToYaml(frontmatter), '---', '', output.body].join('\n')
}
95
+
96
/**
 * Build a `MarkdownOutput` for collection content whose body is already
 * markdown.
 *
 * @param frontmatter Frontmatter entries as `{ value, line }` records; only
 *                    `value` is carried over.
 * @param body        Markdown body, passed through as is.
 * @param sourcePath  Path of the source file, passed through as is.
 */
export function createCollectionOutput(
  frontmatter: Record<string, { value: string; line: number }>,
  body: string,
  sourcePath: string,
): MarkdownOutput {
  // Flatten `{ key: { value, line } }` to `{ key: value }`.
  const flattened = Object.entries(frontmatter).reduce<Record<string, unknown>>((acc, [key, entry]) => {
    acc[key] = entry.value
    return acc
  }, {})

  return { frontmatter: flattened, body, sourcePath }
}
117
+
118
/**
 * Build a `MarkdownOutput` from the result of an HTML conversion.
 *
 * @param metadata Extracted title and description; empty or missing values
 *                 are left out of the frontmatter.
 * @param body     Markdown body, passed through as is.
 */
export function createStaticOutput(
  metadata: { title?: string; description?: string },
  body: string,
): MarkdownOutput {
  const { title, description } = metadata

  // Only truthy values become frontmatter keys; title comes before description.
  const frontmatter: Record<string, unknown> = {
    ...(title ? { title } : {}),
    ...(description ? { description } : {}),
  }

  return { frontmatter, body }
}
+ }