@redpanda-data/docs-extensions-and-macros 4.10.8 → 4.11.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,373 @@
1
+ const path = require('path')
2
+ const os = require('os')
3
+ const TurndownService = require('turndown')
4
+ const turndownPluginGfm = require('turndown-plugin-gfm')
5
+ const { gfm } = turndownPluginGfm
6
+
7
+ module.exports.register = function () {
8
+ const logger = this.getLogger('convert-to-markdown-extension')
9
+ let playbook
10
+
11
+ // Shared Turndown configuration
12
const baseConfig = {
  headingStyle: 'atx',
  codeBlockStyle: 'fenced',
  bulletListMarker: '-',
  linkReferenceStyle: 'full',
}

// Lowercase tag names whose elements are always dropped from the output.
const unwantedTags = new Set(['script', 'style', 'footer', 'nav'])

// Substrings looked for in an element's lowercased class and id attributes.
// A hit in either attribute drops the element.
const unwantedMarkers = [
  'thumbs',
  'back-to-top',
  'contributors-modal',
  'feedback-section',
  'feedback-toast',
  'pagination',
  'footer',
  'nav-expand',
  'banner-container',
]

/**
 * Factory: create a configured Turndown instance.
 *
 * The instance is built from `baseConfig`, has the GFM plugin enabled, and
 * carries a `remove-unwanted` rule that replaces matching elements
 * (footers, modals, feedback widgets, tracking images, ...) with ''.
 *
 * @returns {TurndownService} a new converter instance
 */
function createTurndownBase() {
  const td = new TurndownService(baseConfig)
  td.use(gfm)

  td.addRule('remove-unwanted', {
    filter: (node) => {
      if (!node || !node.getAttribute) return false

      const classAttr = (node.getAttribute('class') || '').toLowerCase()
      const idAttr = (node.getAttribute('id') || '').toLowerCase()
      const tag = node.nodeName.toLowerCase()

      // Remove by tag
      if (unwantedTags.has(tag)) return true

      // Remove tracking or hidden images
      if (
        tag === 'img' &&
        (classAttr.includes('tracking') ||
          idAttr.includes('scarf') ||
          node.getAttribute('role') === 'presentation' ||
          node.style?.display === 'none')
      ) {
        return true
      }

      // Remove by class or id (substring match on either attribute)
      return unwantedMarkers.some(
        (marker) => classAttr.includes(marker) || idAttr.includes(marker)
      )
    },
    replacement: () => '',
  })

  // The former td.keep(['div.openblock.tabs', 'article.doc']) call was
  // dropped: Turndown compares string filters with the lowercased node name,
  // so selector-style strings such as 'div.openblock.tabs' never match.
  return td
}
70
+
71
+ // Factory: create page-specific Turndown converter
72
+ function createTurndownForPage(page) {
73
+ const outerTurndown = createTurndownBase()
74
+ const nestedTurndown = createTurndownBase()
75
+
76
+ // Helper to add custom rules
77
+ function addCustomRules(turndownInstance, isInner = false) {
78
+ // Determine heading depth for tab conversion
79
/**
 * Determine the level (1-6) of the heading that governs `el`.
 *
 * Lookup order:
 *   1. the closest previous sibling of `el` that is an H1-H6 element;
 *   2. walking up the ancestors, the last heading inside the first ancestor
 *      that contains one. When `el` supports `compareDocumentPosition`, only
 *      headings that precede `el` in document order are considered, so a
 *      heading that appears after `el` is not picked up by mistake;
 *   3. the default level 2 when no heading is found.
 *
 * @param {Element} el - element whose surrounding heading level is wanted
 * @returns {number} heading level between 1 and 6
 */
function findNearestHeadingLevel(el) {
  const headingName = /^H[1-6]$/i
  const levelOf = (heading) =>
    Number.parseInt(heading.nodeName.substring(1), 10)

  for (
    let sibling = el.previousElementSibling;
    sibling;
    sibling = sibling.previousElementSibling
  ) {
    if (headingName.test(sibling.nodeName)) return levelOf(sibling)
  }

  // Bit set by compareDocumentPosition when the argument precedes the receiver
  // (Node.DOCUMENT_POSITION_PRECEDING).
  const PRECEDING = 2
  const canCompare = typeof el.compareDocumentPosition === 'function'

  for (let parent = el.parentElement; parent; parent = parent.parentElement) {
    let headings = Array.from(parent.querySelectorAll('h1,h2,h3,h4,h5,h6'))
    if (canCompare) {
      headings = headings.filter(
        (heading) => (el.compareDocumentPosition(heading) & PRECEDING) !== 0
      )
    }
    if (headings.length > 0) return levelOf(headings[headings.length - 1])
  }

  return 2
}
99
+
100
+ // Asciidoctor tab conversion
101
// Asciidoctor tab conversion: each tab of a `div.openblock.tabs` group becomes
// a heading (one level below the nearest heading, capped at 6) followed by the
// Markdown of its panel; tab groups nested in a panel are converted
// recursively and appended after the panel's own content.
turndownInstance.addRule('asciidoctor-tabs', {
  filter: (node) => {
    if (node.nodeName !== 'DIV') return false
    const classAttr = node.getAttribute?.('class') || node.className || ''
    return classAttr.includes('openblock') && classAttr.includes('tabs')
  },
  replacement: function (_, node) {
    // Find a panel by id through an attribute selector. Unlike `#id`, this
    // does not throw when the id is not a valid CSS identifier (for example
    // when it starts with a digit or contains '.' or ':').
    function findPanel(group, panelId) {
      if (!panelId) return null
      const escapedId = String(panelId).replace(/["\\]/g, '\\$&')
      return group.querySelector(`[id="${escapedId}"]`)
    }

    // Convert one tab group; `parentHeadingLevel` is the heading level of the
    // enclosing tab when the group is nested, otherwise null.
    function processTabGroup(group, parentHeadingLevel = null) {
      const contentDiv = group.querySelector('.content') || group
      const tabList = contentDiv.querySelectorAll('li.tab')
      if (!tabList.length) return ''

      const nearestLevel =
        parentHeadingLevel != null
          ? parentHeadingLevel + 1
          : findNearestHeadingLevel(group) + 1
      const tabHeadingLevel = Math.min(nearestLevel, 6)
      const headingPrefix = '#'.repeat(tabHeadingLevel)

      let markdown = ''
      tabList.forEach((tab) => {
        const title =
          tab.querySelector('p')?.textContent.trim() ||
          tab.textContent.trim()

        let panelId = tab.getAttribute('aria-controls')
        if (!panelId && tab.id) panelId = tab.id + '--panel'
        const panel = findPanel(group, panelId)
        if (!panel) return

        const nestedTabs = panel.querySelectorAll('.openblock.tabs')
        let nestedMdCombined = ''
        nestedTabs.forEach((nested) => {
          // The list is a snapshot: groups nested two or more levels deep
          // were already converted and detached by the recursive call below,
          // so skip them instead of emitting their content a second time.
          if (!panel.contains(nested)) return
          nestedMdCombined +=
            '\n' + processTabGroup(nested, tabHeadingLevel) + '\n'
          nested.remove()
        })

        const innerHtml = panel.innerHTML || ''
        let md = ''
        try {
          // Same selection as before: with isInner the instance this rule is
          // registered on is nestedTurndown, so both branches resolve to it.
          const converter = isInner ? nestedTurndown : turndownInstance
          md = converter.turndown(innerHtml)
        } catch (e) {
          logger.warn(`Turndown failed in nested tab: ${e.message}`)
        }

        markdown += `${headingPrefix} ${title}\n\n${md.trim()}\n${nestedMdCombined.trim()}\n\n`
      })

      return markdown.trim()
    }

    return '\n' + processTabGroup(node, null) + '\n'
  },
})
157
+
158
+ // Admonition block conversion
159
// Admonition block conversion: a table holding a `td.icon` and a `td.content`
// cell becomes a blockquote headed by an emoji and the admonition type.
turndownInstance.addRule('admonition', {
  filter: (node) => {
    if (node.nodeName !== 'TABLE') return false
    return Boolean(
      node.querySelector('td.icon') && node.querySelector('td.content')
    )
  },
  replacement: function (_, node) {
    const iconCell = node.querySelector('td.icon')
    const contentCell = node.querySelector('td.content')
    if (!(iconCell && contentCell)) return ''

    // The type comes from the icon's `icon-<type>` class; NOTE is the default.
    const iconEl = iconCell.querySelector('i')
    const typeMatch = (iconEl?.className || '').match(/icon-([a-z]+)/i)
    const type = typeMatch ? typeMatch[1].toUpperCase() : 'NOTE'

    // The title is either an element (use its text) or the icon's title
    // attribute (already a string).
    const titleSource =
      node.querySelector('.title') ||
      contentCell.querySelector('.title') ||
      iconEl?.getAttribute('title')
    let customTitle
    if (typeof titleSource === 'string') {
      customTitle = titleSource.trim()
    } else {
      customTitle = titleSource?.textContent?.trim() || ''
    }

    let emoji
    switch (type) {
      case 'CAUTION':
      case 'WARNING':
        emoji = '⚠️'
        break
      case 'TIP':
        emoji = '💡'
        break
      case 'NOTE':
        emoji = '📝'
        break
      case 'IMPORTANT':
        emoji = '❗'
        break
      default:
        emoji = '📘'
    }

    let innerMd = ''
    try {
      const converter = isInner ? nestedTurndown : turndownInstance
      innerMd = converter.turndown(contentCell.innerHTML || '').trim()
    } catch (e) {
      logger.warn(`Turndown failed in admonition: ${e.message}`)
    }

    // Only show the title when it adds something beyond the type name.
    const hasDistinctTitle =
      customTitle && customTitle.toLowerCase() !== type.toLowerCase()
    const header = hasDistinctTitle
      ? `${emoji} **${type}: ${customTitle}**`
      : `${emoji} **${type}**`

    // Quote every line; lines that already start with '>' are left untouched.
    const quotedLines = []
    for (const line of innerMd.split('\n')) {
      quotedLines.push(line.startsWith('>') ? line : `> ${line}`)
    }

    return `\n> ${header}\n>\n${quotedLines.join('\n')}\n`
  },
})
216
+
217
+ // Markdown table conversion
218
// Markdown table conversion: every <tr> becomes one pipe-delimited row built
// from the whitespace-collapsed text of its cells; the first emitted row is
// followed by the `---` separator row.
turndownInstance.addRule('tables', {
  filter: (node) => {
    if (node.nodeName !== 'TABLE') return false
    // Tables with both a td.icon and a td.content cell are excluded here
    // (the 'admonition' rule matches exactly that shape).
    if (node.querySelector('td.icon') && node.querySelector('td.content'))
      return false
    return true
  },
  replacement: function (content, node) {
    const rows = Array.from(node.querySelectorAll('tr'))
    if (!rows.length) return content

    const tableRows = []
    for (const row of rows) {
      const cellContents = Array.from(row.querySelectorAll('th, td')).map(
        (cell) =>
          (cell.textContent || '')
            .trim()
            .replace(/\s+/g, ' ')
            // A literal pipe would be read as a column boundary.
            .replace(/\|/g, '\\|')
      )
      if (!cellContents.length) continue

      tableRows.push('| ' + cellContents.join(' | ') + ' |')
      // Emit the separator after the first row that actually has cells, so
      // the header is still delimited when the leading <tr> is empty.
      if (tableRows.length === 1) {
        tableRows.push('| ' + cellContents.map(() => '---').join(' | ') + ' |')
      }
    }

    return '\n' + tableRows.join('\n') + '\n'
  },
})
246
+ }
247
+
248
+ addCustomRules(outerTurndown, false)
249
+ addCustomRules(nestedTurndown, true)
250
+ return outerTurndown
251
+ }
252
+
253
+ // Conversion pipeline
254
// On `pagesComposed`: convert the <article class="doc"> part of every page to
// Markdown and store it on the page as `markdownContents` (a UTF-8 Buffer).
// A failure on one page is logged and does not stop the other pages.
this.on('pagesComposed', async ({ playbook: pb, contentCatalog }) => {
  playbook = pb
  const siteUrl = playbook.site?.url || ''
  const htmlStyle = playbook?.urls?.htmlExtensionStyle
  const pages = contentCatalog.getPages()
  logger.info(
    `Converting ${pages.length} pages to Markdown${
      siteUrl ? ` (site.url=${siteUrl})` : ''
    }...`
  )

  // Build the URL of the Markdown twin of `page`; returns '' when site.url or
  // the page's pub.url is missing. May throw if the URL cannot be parsed.
  function buildMarkdownUrl(page) {
    if (!siteUrl || !page.pub?.url) return ''
    const mdUrl = new URL(page.pub.url, siteUrl)
    const pathname = mdUrl.pathname

    if (pathname.endsWith('/')) {
      // Directory-style URL: the Markdown file sits next to index.html.
      mdUrl.pathname = `${pathname}index.md`
    } else if (pathname.endsWith('.html')) {
      mdUrl.pathname = pathname.replace(/\.html$/, '.md')
    } else if (htmlStyle === 'indexify') {
      // Extensionless indexify URL without the trailing slash.
      if (!path.basename(pathname).includes('.')) {
        mdUrl.pathname = `${pathname}/index.md`
      }
    } else if (htmlStyle === 'drop') {
      // With the `drop` style the URL has no .html suffix to swap, while the
      // published file is still <name>.html -> <name>.md.
      mdUrl.pathname = `${pathname}.md`
    }

    return mdUrl.toString()
  }

  let convertedCount = 0

  // Conversion is fully synchronous, so pages are handled one after another
  // (the previous worker pool never yielded and ran sequentially as well).
  for (const page of pages) {
    if (!page?.contents) continue

    try {
      const html = page.contents.toString().trim()
      if (!html) continue

      // Extract only the <article class="doc"> portion
      const match = html.match(
        /<article[^>]*class=["'][^"']*\bdoc\b[^"']*["'][^>]*>([\s\S]*?)<\/article>/i
      )
      if (!match || !match[1]) {
        logger.info(`No <article class="doc"> found for ${page.src?.path}`)
        continue
      }
      const articleHtml = match[1]

      // Convert with Turndown
      const td = createTurndownForPage(page)
      let markdown = td.turndown(articleHtml).trim()

      // Canonical source link (best effort: a bad URL only costs the header)
      let canonicalUrl = ''
      try {
        canonicalUrl = buildMarkdownUrl(page)
      } catch (e) {
        logger.debug(
          `Failed to build canonical URL for ${page.src?.path}: ${e.message}`
        )
      }

      // Prepend Markdown source reference and URL construction hint
      if (canonicalUrl) {
        const urlHint = `<!-- Note for AI: Links in this doc are relative to the current page and use indexify format. Add /index.md to directory-style links for the Markdown version. -->`

        markdown = `<!-- Source: ${canonicalUrl} -->\n${urlHint}\n\n${markdown}`
      }

      // Clean up unnecessary whitespace
      if (markdown) {
        // Remove excessive blank lines (more than 2 consecutive newlines)
        markdown = markdown.replace(/\n{3,}/g, '\n\n')
        // Remove trailing whitespace from lines
        markdown = markdown.replace(/[ \t]+$/gm, '')
        // Remove leading/trailing whitespace from the entire document
        markdown = markdown.trim()
      }

      if (markdown) {
        page.markdownContents = Buffer.from(markdown, 'utf8')
        convertedCount++
      }
    } catch (err) {
      logger.error(
        `Error converting ${page.src?.path || 'unknown'}: ${err.message}`
      )
      logger.debug(err.stack)
    }
  }

  logger.info(`Converted ${convertedCount} Markdown files.`)
})
353
+
354
+ // Add Markdown files to site catalog
355
// On `beforePublish`: add one `.md` file to the site catalog for every page
// that carries `markdownContents`, placed next to the page's HTML output.
this.on('beforePublish', ({ siteCatalog, contentCatalog }) => {
  const pages = contentCatalog.getPages((p) => p.markdownContents)
  if (!pages.length) {
    logger.info('No Markdown files to publish.')
    return
  }
  logger.info(`Adding ${pages.length} Markdown files to site catalog...`)
  for (const page of pages) {
    const htmlOut = page.out?.path
    if (!htmlOut) continue

    // Without a .html suffix the Markdown path would equal the page's own
    // output path, and the two files would collide when published.
    if (!/\.html$/.test(htmlOut)) {
      logger.warn(
        `Skipping Markdown for ${htmlOut}: output path does not end in .html`
      )
      continue
    }

    const mdOutPath = htmlOut.replace(/\.html$/, '.md')
    siteCatalog.addFile({
      contents: page.markdownContents,
      out: { path: mdOutPath },
    })
    logger.debug(`Added Markdown: ${mdOutPath}`)
  }
})
373
+ }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@redpanda-data/docs-extensions-and-macros",
3
- "version": "4.10.8",
3
+ "version": "4.11.0",
4
4
  "description": "Antora extensions and macros developed for Redpanda documentation.",
5
5
  "keywords": [
6
6
  "antora",
@@ -43,6 +43,7 @@
43
43
  "./extensions/process-context-switcher": "./extensions/process-context-switcher.js",
44
44
  "./extensions/archive-attachments": "./extensions/archive-attachments.js",
45
45
  "./extensions/add-pages-to-root": "./extensions/add-pages-to-root.js",
46
+ "./extensions/convert-to-markdown": "./extensions/convert-to-markdown.js",
46
47
  "./extensions/collect-bloblang-samples": "./extensions/collect-bloblang-samples.js",
47
48
  "./extensions/compute-end-of-life": "./extensions/compute-end-of-life.js",
48
49
  "./extensions/generate-rp-connect-categories": "./extensions/generate-rp-connect-categories.js",
@@ -85,8 +86,10 @@
85
86
  "@octokit/core": "^6.1.2",
86
87
  "@octokit/plugin-retry": "^7.1.1",
87
88
  "@octokit/rest": "^21.0.1",
89
+ "@redocly/cli": "^2.2.0",
88
90
  "algoliasearch": "^4.17.0",
89
91
  "chalk": "4.1.2",
92
+ "cheerio": "^1.1.2",
90
93
  "commander": "^14.0.0",
91
94
  "gulp": "^4.0.2",
92
95
  "gulp-connect": "^5.7.0",
@@ -103,9 +106,10 @@
103
106
  "sync-request": "^6.1.0",
104
107
  "tar": "^7.4.3",
105
108
  "tree-sitter": "^0.22.4",
109
+ "turndown": "^7.2.2",
110
+ "turndown-plugin-gfm": "^1.0.2",
106
111
  "yaml": "^2.7.1",
107
- "yargs": "^17.7.2",
108
- "@redocly/cli": "^2.2.0"
112
+ "yargs": "^17.7.2"
109
113
  },
110
114
  "devDependencies": {
111
115
  "@antora/cli": "3.1.4",