@henryavila/mdprobe 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,247 @@
1
+ import { unified } from 'unified'
2
+ import remarkParse from 'remark-parse'
3
+ import remarkGfm from 'remark-gfm'
4
+ import remarkMath from 'remark-math'
5
+ import remarkFrontmatter from 'remark-frontmatter'
6
+ import remarkRehype from 'remark-rehype'
7
+ import rehypeRaw from 'rehype-raw'
8
+ import rehypeStringify from 'rehype-stringify'
9
+ import { visit } from 'unist-util-visit'
10
+ import hljs from 'highlight.js'
11
+ import yaml from 'js-yaml'
12
+
13
+ // ---------------------------------------------------------------------------
14
+ // Inline element tag names that should get data-source-col in addition to
15
+ // data-source-line.
16
+ // ---------------------------------------------------------------------------
17
// Membership set of tag names treated as inline elements.
const INLINE_TAGS = new Set([
  'strong',
  'em',
  'code',
  'a',
  'del',
  'sup',
  'sub',
  'span',
  'abbr',
  'mark',
  'small',
  'b',
  'i',
  'u',
])
21
+
22
+ // ---------------------------------------------------------------------------
23
+ // Custom remark plugin: extract YAML frontmatter (TOML nodes are not parsed)
24
+ // ---------------------------------------------------------------------------
25
/**
 * Remark plugin that parses YAML frontmatter and stores the result on
 * `file.data.frontmatter`.
 *
 * Only `yaml` nodes are read; other node types are ignored. The stored
 * value is `null` when the tree has no `yaml` node, when parsing throws,
 * or when the parsed document is not an object (for example a bare scalar
 * or an empty document), so consumers only ever see an object or `null`.
 *
 * NOTE(review): `yaml.load` is run on document content; it is only safe by
 * default in js-yaml >= 4 — confirm the installed version.
 *
 * @returns {(tree: object, file: object) => void} unified transformer
 */
function remarkExtractFrontmatter() {
  return (tree, file) => {
    let frontmatter = null
    visit(tree, 'yaml', (node) => {
      try {
        const parsed = yaml.load(node.value)
        // Scalars and empty documents are not usable as frontmatter data.
        frontmatter = parsed !== null && typeof parsed === 'object' ? parsed : null
      } catch {
        // Malformed YAML is treated as "no frontmatter" instead of failing
        // the whole render.
        frontmatter = null
      }
    })
    file.data.frontmatter = frontmatter
  }
}
38
+
39
+ // ---------------------------------------------------------------------------
40
+ // Custom remark plugin: extract TOC from heading nodes
41
+ // ---------------------------------------------------------------------------
42
/**
 * Remark plugin that records every heading in the tree as a flat table of
 * contents on `file.data.toc`.
 *
 * Each entry holds the heading's collected plain text (`heading`), its
 * depth (`level`) and its starting source line (`line`, `0` when the node
 * carries no position).
 *
 * @returns {(tree: object, file: object) => void} unified transformer
 */
function remarkExtractToc() {
  return (tree, file) => {
    const entries = []

    const recordHeading = (headingNode) => {
      const startLine = headingNode.position?.start?.line
      entries.push({
        heading: collectText(headingNode),
        level: headingNode.depth,
        line: startLine ?? 0,
      })
    }

    visit(tree, 'heading', recordHeading)
    file.data.toc = entries
  }
}
56
+
57
/**
 * Recursively collect the plain text of an mdast node.
 *
 * `text`, `inlineCode` and `inlineMath` nodes contribute their `value`
 * (inline math is included so that headings containing `$...$` do not lose
 * that part of their text); parent nodes contribute the concatenation of
 * their children; every other node contributes nothing.
 *
 * @param {object} node - mdast node
 * @returns {string} the collected text, `''` for a missing node
 */
function collectText(node) {
  if (!node) return ''
  if (
    node.type === 'text' ||
    node.type === 'inlineCode' ||
    node.type === 'inlineMath'
  ) {
    return node.value ?? ''
  }
  if (Array.isArray(node.children)) {
    return node.children.map(collectText).join('')
  }
  return ''
}
63
+
64
+ // ---------------------------------------------------------------------------
65
+ // Custom rehype plugin: inject data-source-line / data-source-col attributes
66
+ // from hast node positions (preserved by remark-rehype).
67
+ //
68
+ // IMPORTANT: This runs BEFORE rehype-raw so that elements parsed from raw
69
+ // HTML in markdown do NOT receive source-position attributes — only elements
70
+ // generated by remark-rehype (which carry accurate mdast positions) are
71
+ // annotated.
72
+ // ---------------------------------------------------------------------------
73
/**
 * Rehype plugin that copies each element's start position into its
 * properties.
 *
 * Every element with a start line gets `dataSourceLine` (as a string).
 * Elements whose tag name is in `INLINE_TAGS` additionally get
 * `dataSourceCol` when a start column is available. Elements without a
 * start line are left untouched.
 *
 * @returns {(tree: object) => void} unified transformer
 */
function rehypeSourcePositions() {
  return (tree) => {
    visit(tree, 'element', (element) => {
      const start = element.position?.start
      if (start?.line == null) return

      // Create the property bag on demand.
      element.properties = element.properties || {}
      element.properties.dataSourceLine = String(start.line)

      const wantsColumn =
        INLINE_TAGS.has(element.tagName) && start.column != null
      if (wantsColumn) {
        element.properties.dataSourceCol = String(start.column)
      }
    })
  }
}
88
+
89
+ // ---------------------------------------------------------------------------
90
+ // Custom rehype plugin: syntax highlight code blocks (skip mermaid).
91
+ //
92
+ // Adds the `hljs` class to fenced code blocks so that highlight.js
93
+ // stylesheets apply. Uses hljs to detect/validate the language and
94
+ // replaces the code element's children with the highlight.js HTML output
95
+ // (as a raw node), so the serialized markup contains hljs spans.
96
+ // ---------------------------------------------------------------------------
97
/**
 * Rehype plugin that highlights `<code>` elements whose parent is `<pre>`.
 *
 * The language is read from the first `language-*` class. Blocks tagged
 * `mermaid` or `math` are skipped. When highlight.js knows the language it
 * is used directly; otherwise the language is auto-detected. On success
 * the element gains the `hljs` class plus a `language-<name>` class for
 * the language highlight.js reported, and its children are replaced by a
 * single `raw` node holding the highlighted HTML. If highlighting throws,
 * the element is left unchanged.
 *
 * @returns {(tree: object) => void} unified transformer
 */
function rehypeHighlight() {
  return (tree) => {
    visit(tree, 'element', (codeEl, _index, parent) => {
      const isFencedCode =
        codeEl.tagName === 'code' && parent && parent.tagName === 'pre'
      if (!isFencedCode) return

      const classes = codeEl.properties?.className || []
      const prefixed = classes.find(
        (cls) => typeof cls === 'string' && cls.startsWith('language-'),
      )
      const lang = prefixed ? prefixed.replace('language-', '') : null

      // These block kinds are handled by other plugins / the client.
      if (lang === 'mermaid' || lang === 'math') return

      const source = getTextContent(codeEl)
      let result
      try {
        const isKnown = lang && hljs.getLanguage(lang)
        result = isKnown
          ? hljs.highlight(source, { language: lang })
          : hljs.highlightAuto(source)
      } catch {
        return // leave node unchanged on error
      }

      // Append marker classes without duplicating existing ones.
      const wanted = ['hljs']
      if (result.language) wanted.push(`language-${result.language}`)
      for (const cls of wanted) {
        if (!classes.includes(cls)) classes.push(cls)
      }
      codeEl.properties.className = classes

      // Swap the plain text for the highlighted markup.
      codeEl.children = [{ type: 'raw', value: result.value }]
    })
  }
}
135
+
136
/**
 * Return the concatenated text of a hast node.
 *
 * A `text` node yields its own `value`; a node with children yields the
 * joined text of those children; anything else yields `''`.
 *
 * @param {object} node - hast node
 * @returns {string}
 */
function getTextContent(node) {
  const { type, value, children } = node
  if (type === 'text') return value
  return children
    ? children.map((child) => getTextContent(child)).join('')
    : ''
}
142
+
143
+ // ---------------------------------------------------------------------------
144
+ // Custom rehype plugin: handle mermaid code blocks
145
+ // Replace <pre><code class="language-mermaid">...</code></pre> with
146
+ // <pre class="mermaid" data-language="mermaid">raw source</pre>
147
+ // ---------------------------------------------------------------------------
148
/**
 * Rehype plugin that rewrites mermaid code blocks in place.
 *
 * A `<pre>` whose first `<code>` child has the `language-mermaid` class is
 * turned into `<pre class="mermaid" data-language="mermaid">` containing
 * the diagram source as a single text node. All previous properties of
 * the `<pre>` are dropped, except that `dataSourceLine` is carried over
 * from the `<pre>` (or, failing that, from the `<code>`) when present.
 *
 * @returns {(tree: object) => void} unified transformer
 */
function rehypeMermaid() {
  return (tree) => {
    visit(tree, 'element', (preEl) => {
      const hasChildren = preEl.children && preEl.children.length !== 0
      if (preEl.tagName !== 'pre' || !hasChildren) return

      const codeEl = preEl.children.find(
        (child) => child.type === 'element' && child.tagName === 'code',
      )
      if (!codeEl) return

      const classes = codeEl.properties?.className || []
      if (!classes.includes('language-mermaid')) return

      // Read everything needed before the <pre> is overwritten.
      const diagram = getTextContent(codeEl)
      const sourceLine =
        preEl.properties?.dataSourceLine ?? codeEl.properties?.dataSourceLine

      preEl.properties = {
        className: ['mermaid'],
        dataLanguage: 'mermaid',
        ...(sourceLine != null ? { dataSourceLine: sourceLine } : {}),
      }

      // Keep only the raw diagram text as the element's content.
      preEl.children = [{ type: 'text', value: diagram }]
    })
  }
}
180
+
181
+ // ---------------------------------------------------------------------------
182
+ // Custom rehype plugin: ensure math elements have detectable class markers.
183
+ //
184
+ // remark-math + remark-rehype produces <code class="language-math math-inline">
185
+ // for inline math and <code class="language-math math-display"> for display
186
+ // math. These already contain "math" in the class list, so detection
187
+ // (`class="[^"]*math[^"]*"`) works. This plugin additionally sets a
188
+ // `data-math` attribute on every element that carries such a class marker.
189
+ // ---------------------------------------------------------------------------
190
/**
 * Rehype plugin that sets `dataMath = 'true'` on elements whose class list
 * marks them as math.
 *
 * A class counts as a math marker when it contains `math` or `katex`.
 * `language-*` classes name a code-fence language, so among those only the
 * exact class `language-math` counts; this keeps code blocks such as
 * `language-mathematica` from being tagged as math. Elements that already
 * have a truthy `dataMath` are left as they are.
 *
 * @returns {(tree: object) => void} unified transformer
 */
function rehypeMathClass() {
  const isMathMarker = (cls) => {
    if (typeof cls !== 'string') return false
    if (cls.startsWith('language-')) return cls === 'language-math'
    return cls.includes('math') || cls.includes('katex')
  }

  return (tree) => {
    visit(tree, 'element', (node) => {
      const className = node.properties?.className || []
      // A marker can only be found when `node.properties` exists, so the
      // property writes below are safe.
      if (!className.some(isMathMarker)) return
      if (!node.properties.dataMath) {
        node.properties.dataMath = 'true'
      }
    })
  }
}
204
+
205
+ // ---------------------------------------------------------------------------
206
+ // Build the unified processor
207
+ // ---------------------------------------------------------------------------
208
// Shared unified processor. Plugins are applied in the order listed; each
// entry is `[plugin, ...options]`.
//
// NOTE(review): raw HTML from the markdown is passed through
// (`allowDangerousHtml` + rehype-raw) and no sanitizing plugin appears in
// this chain — confirm that inputs are trusted.
const processor = [
  [remarkParse],
  [remarkGfm],
  [remarkMath],
  [remarkFrontmatter, ['yaml', 'toml']],
  [remarkExtractFrontmatter],
  [remarkExtractToc],
  [remarkRehype, { allowDangerousHtml: true }],
  // Source positions BEFORE rehype-raw so raw HTML elements are not annotated
  [rehypeSourcePositions],
  [rehypeRaw],
  [rehypeHighlight],
  [rehypeMermaid],
  [rehypeMathClass],
  [rehypeStringify, { allowDangerousHtml: true }],
].reduce(
  (pipeline, [plugin, ...options]) => pipeline.use(plugin, ...options),
  unified(),
)
223
+
224
+ // ---------------------------------------------------------------------------
225
+ // Public API
226
+ // ---------------------------------------------------------------------------
227
+
228
/**
 * Convert a markdown string to HTML and return it together with the data
 * the pipeline collected while processing (table of contents and
 * frontmatter).
 *
 * Anything that is not a string with non-whitespace content yields the
 * empty result `{ html: '', toc: [], frontmatter: null }` without running
 * the processor.
 *
 * @param {string} markdown - The markdown source string
 * @returns {{ html: string, toc: Array<{heading: string, level: number, line: number}>, frontmatter: object|null }}
 */
export function render(markdown) {
  const isRenderable = typeof markdown === 'string' && markdown.trim() !== ''
  if (!isRenderable) {
    return { html: '', toc: [], frontmatter: null }
  }

  const vfile = processor.processSync(markdown)
  const { toc, frontmatter } = vfile.data

  return {
    html: String(vfile),
    toc: toc || [],
    frontmatter: frontmatter ?? null,
  }
}