confluence-exporter 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. package/.eslintrc.cjs +18 -0
  2. package/.github/copilot-instructions.md +3 -0
  3. package/.github/prompts/analyze.prompt.md +101 -0
  4. package/.github/prompts/clarify.prompt.md +158 -0
  5. package/.github/prompts/constitution.prompt.md +73 -0
  6. package/.github/prompts/implement.prompt.md +56 -0
  7. package/.github/prompts/plan.prompt.md +50 -0
  8. package/.github/prompts/specify.prompt.md +21 -0
  9. package/.github/prompts/tasks.prompt.md +69 -0
  10. package/LICENSE +21 -0
  11. package/README.md +332 -0
  12. package/agents.md +1174 -0
  13. package/dist/api.d.ts +73 -0
  14. package/dist/api.js +387 -0
  15. package/dist/api.js.map +1 -0
  16. package/dist/commands/download.command.d.ts +18 -0
  17. package/dist/commands/download.command.js +257 -0
  18. package/dist/commands/download.command.js.map +1 -0
  19. package/dist/commands/executor.d.ts +22 -0
  20. package/dist/commands/executor.js +52 -0
  21. package/dist/commands/executor.js.map +1 -0
  22. package/dist/commands/help.command.d.ts +8 -0
  23. package/dist/commands/help.command.js +68 -0
  24. package/dist/commands/help.command.js.map +1 -0
  25. package/dist/commands/index.command.d.ts +14 -0
  26. package/dist/commands/index.command.js +95 -0
  27. package/dist/commands/index.command.js.map +1 -0
  28. package/dist/commands/index.d.ts +13 -0
  29. package/dist/commands/index.js +13 -0
  30. package/dist/commands/index.js.map +1 -0
  31. package/dist/commands/plan.command.d.ts +54 -0
  32. package/dist/commands/plan.command.js +272 -0
  33. package/dist/commands/plan.command.js.map +1 -0
  34. package/dist/commands/registry.d.ts +12 -0
  35. package/dist/commands/registry.js +32 -0
  36. package/dist/commands/registry.js.map +1 -0
  37. package/dist/commands/transform.command.d.ts +69 -0
  38. package/dist/commands/transform.command.js +951 -0
  39. package/dist/commands/transform.command.js.map +1 -0
  40. package/dist/commands/types.d.ts +12 -0
  41. package/dist/commands/types.js +5 -0
  42. package/dist/commands/types.js.map +1 -0
  43. package/dist/commands/update.command.d.ts +10 -0
  44. package/dist/commands/update.command.js +201 -0
  45. package/dist/commands/update.command.js.map +1 -0
  46. package/dist/constants.d.ts +1 -0
  47. package/dist/constants.js +2 -0
  48. package/dist/constants.js.map +1 -0
  49. package/dist/index.d.ts +5 -0
  50. package/dist/index.js +110 -0
  51. package/dist/index.js.map +1 -0
  52. package/dist/logger.d.ts +15 -0
  53. package/dist/logger.js +52 -0
  54. package/dist/logger.js.map +1 -0
  55. package/dist/types.d.ts +167 -0
  56. package/dist/types.js +5 -0
  57. package/dist/types.js.map +1 -0
  58. package/dist/utils.d.ts +56 -0
  59. package/dist/utils.js +178 -0
  60. package/dist/utils.js.map +1 -0
  61. package/eslint.config.js +29 -0
  62. package/jest.config.cjs +25 -0
  63. package/migrate-meta.js +132 -0
  64. package/package.json +53 -0
  65. package/src/api.ts +469 -0
  66. package/src/commands/download.command.ts +324 -0
  67. package/src/commands/executor.ts +62 -0
  68. package/src/commands/help.command.ts +72 -0
  69. package/src/commands/index.command.ts +111 -0
  70. package/src/commands/index.ts +14 -0
  71. package/src/commands/plan.command.ts +318 -0
  72. package/src/commands/registry.ts +39 -0
  73. package/src/commands/transform.command.ts +1103 -0
  74. package/src/commands/types.ts +16 -0
  75. package/src/commands/update.command.ts +229 -0
  76. package/src/constants.ts +0 -0
  77. package/src/index.ts +120 -0
  78. package/src/logger.ts +60 -0
  79. package/src/test.sh +66 -0
  80. package/src/types.ts +176 -0
  81. package/src/utils.ts +204 -0
  82. package/tests/commands/README.md +123 -0
  83. package/tests/commands/download.command.test.ts +8 -0
  84. package/tests/commands/help.command.test.ts +8 -0
  85. package/tests/commands/index.command.test.ts +8 -0
  86. package/tests/commands/plan.command.test.ts +15 -0
  87. package/tests/commands/transform.command.test.ts +8 -0
  88. package/tests/fixtures/_index.yaml +38 -0
  89. package/tests/fixtures/mock-pages.ts +62 -0
  90. package/tsconfig.json +25 -0
  91. package/vite.config.ts +45 -0
@@ -0,0 +1,1103 @@
1
+ /**
2
+ * Transform command handler - Transforms HTML files to Markdown
3
+ */
4
+
5
+ import { promises as fs } from 'fs';
6
+ import path from 'path';
7
+ import prettier from 'prettier';
8
+ import { htmlToMarkdown } from "webforai";
9
+ import { ConfluenceApi } from '../api.js';
10
+ import { pagePath, slugify, unslugify } from '../utils.js';
11
+ import type { Page } from '../types.js';
12
+ import { logger } from '../logger.js';
13
+ import type { CommandContext, CommandHandler } from './types.js';
14
+ import type { ConfluenceConfig } from '../types.js';
15
+
16
/**
 * Node of a named tree: child nodes are keyed by string, and each node
 * carries a list of file entries (display name plus relative path).
 */
interface TreeNode {
  name: string;
  children: Record<string, TreeNode>;
  files: { name: string; relativePath: string }[];
}
21
+
22
+ export class TransformCommand implements CommandHandler {
23
/** Placeholder tokens paired with the Markdown that must replace them once conversion is done. */
private pendingIncludes: { placeholder: string; content: string }[] = [];

/** Confluence API client; assigned at the start of `execute`. */
private api!: ConfluenceApi;

/**
 * @param config Exporter configuration, stored as a private property.
 */
constructor(private config: ConfluenceConfig) {}
27
/**
 * Entry point of the transform command.
 *
 * Creates the API client, optionally clears earlier output (`config.clear`),
 * collects the HTML files to convert (only the file returned by `pagePath`
 * when `config.pageId` is set, otherwise every eligible `.html` file found
 * under `config.outputDir`), converts them in parallel batches and finally
 * creates the links structure.
 *
 * @param _context Command context; not read by this command.
 */
async execute(_context: CommandContext): Promise<void> {
  this.api = new ConfluenceApi(this.config);

  logger.info(`Transforming HTML files to Markdown...`);
  logger.info(`Output directory: ${this.config.outputDir}\n`);

  // Clear existing MD files and images if --clear flag is set
  if (this.config.clear) {
    logger.info('Clearing existing .md files and images folders...');
    await this.clearExistingFiles(this.config.outputDir);
    logger.info('✓ Cleared existing files\n');
  }

  const htmlFiles: string[] = [];

  if (this.config.pageId) {
    logger.info(`Processing specific page: ${this.config.pageId}\n`);
    const pageHtmlPath = pagePath(this.config.pageId, this.config);
    logger.info(`HTML path: ${pageHtmlPath}\n`);
    htmlFiles.push(pageHtmlPath);
  } else {
    // Recursively collect .html files. Directories whose name starts with '_'
    // and directories named 'images' are not descended into; files whose name
    // starts with '_' are ignored.
    const findHtmlFiles = async (dir: string, fileList: string[] = []): Promise<string[]> => {
      const entries = await fs.readdir(dir, { withFileTypes: true });

      for (const entry of entries) {
        const fullPath = path.join(dir, entry.name);

        if (entry.isDirectory() && !entry.name.startsWith('_') && entry.name !== 'images') {
          await findHtmlFiles(fullPath, fileList);
        } else if (entry.isFile() && entry.name.endsWith('.html') && !entry.name.startsWith('_')) {
          fileList.push(fullPath);
        }
      }

      return fileList;
    };

    htmlFiles.push(...await findHtmlFiles(this.config.outputDir));
  }

  if (htmlFiles.length === 0) {
    logger.info('No HTML files found to transform.');
    logger.info('Run the "download" command first to download HTML pages.');
    return;
  }

  // Apply limit if specified
  const filesToProcess = this.config.limit ? htmlFiles.slice(0, this.config.limit) : htmlFiles;

  logger.info(`Found ${htmlFiles.length} HTML files`);
  if (this.config.limit && htmlFiles.length > this.config.limit) {
    logger.info(`Limiting to first ${this.config.limit} files\n`);
  } else {
    logger.info();
  }

  // Batch size defaults to 5. It is clamped to a positive integer because a
  // zero or negative step would keep the batching loop below from terminating.
  const batchSize = Math.max(1, Math.floor(this.config.parallel || 5));
  const batches: string[][] = [];
  for (let i = 0; i < filesToProcess.length; i += batchSize) {
    batches.push(filesToProcess.slice(i, i + batchSize));
  }

  // Batches run one after another; files inside a batch run concurrently.
  for (let batchIndex = 0; batchIndex < batches.length; batchIndex++) {
    const batch = batches[batchIndex];
    const batchStart = batchIndex * batchSize + 1;
    const batchEnd = Math.min((batchIndex + 1) * batchSize, filesToProcess.length);
    logger.info(`Processing batch ${batchIndex + 1}/${batches.length} (files ${batchStart}-${batchEnd})`);

    await Promise.all(batch.map(async (htmlFilepath, indexInBatch) => {
      const globalIndex = batchIndex * batchSize + indexInBatch;
      await this.processFile(htmlFilepath, globalIndex + 1, filesToProcess.length);
    }));
  }

  logger.info(`\n✓ Transformation complete!`);
  logger.info(` Processed: ${filesToProcess.length} files in ${batches.length} batches`);
  logger.info(` Note: Files are processed in parallel batches of up to ${batchSize} pages each`);
  logger.info(` Check individual file logs above for skipped/transformed status`);

  // Create links folder and _links.md file
  logger.info('\nCreating links folder and _links.md file...');
  await this.createLinksStructure(this.config.outputDir);
  logger.info('✓ Links structure created');
}
118
+
119
/**
 * Convert one HTML file into a Markdown file written next to it.
 *
 * The Markdown file gets the same base name with an `.md` extension. An
 * existing Markdown file is left untouched unless `config.force` is set.
 * Errors raised during the conversion are logged and not rethrown, so one
 * failing file does not reject the batch it belongs to.
 *
 * @param htmlFilepath Path of the HTML file to convert.
 * @param index 1-based position of this file, used only in log output.
 * @param total Total number of files being processed, used only in log output.
 */
private async processFile(htmlFilepath: string, index: number, total: number): Promise<void> {
  const htmlFile = path.basename(htmlFilepath);
  const dirPath = path.dirname(htmlFilepath);
  // Strip only the trailing extension; a plain string replace would remove the
  // first ".html" found anywhere in the name.
  const baseFilename = htmlFile.replace(/\.html$/, '');
  const mdFilename = `${baseFilename}.md`;
  const mdFilepath = path.join(dirPath, mdFilename);
  // The text before the first '-' of the file name is used as the page id.
  const id = baseFilename.split('-')[0];

  // Show relative path for better readability
  const relativePath = path.relative(this.config.outputDir, htmlFilepath);
  logger.info(`[${index}/${total}] Checking: ${relativePath}`);
  logger.debug(`Processing file ${baseFilename} (ID: ${id})`);

  // Check if MD file already exists
  try {
    await fs.access(mdFilepath);
    if (this.config.force) {
      logger.info(` ⚑ Force: Overwriting existing ${mdFilename}`);
      // If forcing, remove the images folder to avoid stale files.
      // NOTE(review): the folder is `<dirPath>/images`, so it is shared by every
      // page stored in the same directory — confirm that pages never share one.
      try {
        const imagesDir = path.join(dirPath, 'images');
        await fs.rm(imagesDir, { recursive: true, force: true });
        logger.info(` ✓ Removed existing images/ for ${baseFilename}`);
      } catch (err) {
        // Non-fatal if images removal fails
        logger.warn(` ⚠ Could not remove images for ${baseFilename}:`, err instanceof Error ? err.message : err);
      }
    } else {
      logger.info(` ⊘ Skipped: ${mdFilename} already exists`);
      return;
    }
  } catch {
    // MD file doesn't exist, proceed with transformation
  }

  try {
    logger.debug(`Reading HTML content from ${htmlFilepath}`);
    const htmlContent = await fs.readFile(htmlFilepath, 'utf-8');
    logger.debug(`HTML content length: ${htmlContent.length} characters`);

    // Parse the title from filename (reverse slugification is lossy, but best effort)
    const title = unslugify(baseFilename);
    logger.debug(`Parsed title: "${title}"`);

    logger.debug(`Starting HTML to Markdown transformation`);
    const images: Array<{ filename: string; data: Buffer }> = [];
    const markdownBody = await this.htmlToMarkdown(htmlContent, id, images);
    logger.debug(`Transformation complete, markdown length: ${markdownBody.length} characters`);

    // Build original page URL (use baseUrl if available)
    const originalUrl = this.config.baseUrl
      ? `${this.config.baseUrl}/pages/viewpage.action?pageId=${id}`
      : '';
    logger.debug(`Original URL: ${originalUrl || 'none'}`);

    // Front matter values are double-quoted, so both backslashes and quotes in
    // the title have to be escaped (backslashes first).
    const escapedTitle = title.replace(/\\/g, '\\\\').replace(/"/g, '\\"');
    const frontMatter = [
      '---',
      `title: "${escapedTitle}"`,
      `id: "${id}"`,
      originalUrl ? `url: "${originalUrl}"` : '',
      '---'
    ].filter(Boolean).join('\n');
    logger.debug(`Front matter created`);

    // Replace any include placeholders still present in the body.
    // NOTE(review): htmlToMarkdown empties `pendingIncludes` before it returns and
    // the list is instance state shared by files processed in parallel — confirm
    // whether this pass is still needed or whether the list should be per file.
    let finalBody = markdownBody;
    logger.debug(`Processing ${this.pendingIncludes.length} pending includes`);
    for (const include of this.pendingIncludes) {
      // Converters may escape underscores/backslashes, so try the raw, escaped
      // and double-escaped spellings of the placeholder, in that order.
      const escaped = include.placeholder.replace(/_/g, '\\_');
      const doubleEscaped = escaped.replace(/\\/g, '\\\\');
      for (const variant of [include.placeholder, escaped, doubleEscaped]) {
        // A replacer function inserts the content literally; a replacement
        // string would interpret sequences such as "$&" or "$1" in the content.
        finalBody = finalBody.replace(variant, () => include.content);
      }
    }
    logger.debug(`Include placeholders replaced`);

    // Combine front matter and content
    const markdownContent = `${frontMatter}\n\n${finalBody}`;
    logger.debug(`Combined content length: ${markdownContent.length} characters`);

    // Save images if any (in the same directory as the page)
    if (images.length > 0) {
      logger.debug(`Saving ${images.length} images`);
      const imagesDir = path.join(dirPath, 'images');
      await fs.mkdir(imagesDir, { recursive: true });

      for (const image of images) {
        const imagePath = path.join(imagesDir, image.filename);
        await fs.writeFile(imagePath, image.data);
      }
      logger.info(` ✓ Saved ${images.length} image(s) for ${baseFilename}`);
    } else {
      logger.debug(`No images to save`);
    }

    logger.debug(`Performing final cleanup`);
    // Remove escaped bracket/paren characters produced by converters (e.g. \[ \] \( \) )
    const finalMarkdownToWrite = markdownContent.replace(/\\([\[\]\(\)])/g, '$1');
    logger.debug(`Final markdown length: ${finalMarkdownToWrite.length} characters`);

    logger.debug(`Formatting with Prettier`);

    // Only the formatting step is allowed to fail softly. The file is written
    // once afterwards, so a write error is reported as a write error instead of
    // being mistaken for a formatting problem and retried.
    let output = finalMarkdownToWrite;
    let formatted = true;
    try {
      output = await prettier.format(finalMarkdownToWrite, {
        parser: 'markdown',
        printWidth: 120,
        proseWrap: 'preserve',
        tabWidth: 2,
        useTabs: false
      });
    } catch {
      formatted = false;
      logger.warn(` ⚠ Could not format Markdown, saving unformatted`);
    }

    if (formatted) {
      logger.debug(`Writing formatted markdown to ${mdFilepath}`);
      await fs.writeFile(mdFilepath, output, 'utf-8');
      logger.info(` ✓ Transformed: ${mdFilename} (formatted)`);
    } else {
      logger.debug(`Writing unformatted markdown to ${mdFilepath}`);
      await fs.writeFile(mdFilepath, output, 'utf-8');
      logger.info(` ✓ Transformed: ${mdFilename}`);
    }
  } catch (error) {
    logger.error(` ✗ Failed to transform ${htmlFile}:`, error instanceof Error ? error.message : error);
  }
}
255
+
256
/**
 * Convert a page's HTML into Markdown.
 *
 * Steps, in order: flatten lists inside table cells, expand macros, user
 * links, page links and images, drop layout tags, run the `webforai`
 * converter, then post-process the result (table-cell trimming, include
 * placeholders, link unescaping, regex fallbacks for leftover HTML, entity
 * decoding, whitespace cleanup and `cleanMarkdown`).
 *
 * NOTE(review): this method empties `this.pendingIncludes` before returning.
 * `buildIncludeList` calls this method as a fallback and the list is instance
 * state, so a nested or concurrent call can drop placeholders collected for
 * another page — confirm whether the list should be scoped per call.
 *
 * @param html Page HTML to convert.
 * @param pageId Page id passed on to the macro and image transforms.
 * @param images Output array handed to `transformImages`, which appends downloaded images to it.
 * @returns The converted Markdown, trimmed.
 */
private async htmlToMarkdown(html: string, pageId: string, images: Array<{ filename: string; data: Buffer }>): Promise<string> {
  let markdown = html;

  // Preprocess: convert lists inside table cells to inline text to avoid breaking Markdown tables
  // Convert <td>...<ul><li>Item</li>...</ul>...</td> -> <td>...• Item; • Item; ...</td>
  try {
    markdown = markdown.replace(/<td([^>]*)>([\s\S]*?)<\/td>/gi, (full: string, attrs: string, inner: string) => {
      // If there are list tags inside, replace them with inline bullets separated by semicolons
      if (/<ul[^>]*>|<ol[^>]*>/i.test(inner)) {
        const items: string[] = [];
        const liRegex = /<li[^>]*>([\s\S]*?)<\/li>/gi;
        for (const m of Array.from(inner.matchAll(liRegex))) {
          // Strip tags inside li
          const item = (m[1] || '').replace(/<[^>]+>/g, '').trim();
          if (item) items.push(item);
        }

        if (items.length > 0) {
          const replacement = items.map(i => `• ${i}`).join('; ');
          // Remove the original lists from inner and append the inline replacement
          const cleanedInner = inner.replace(/<ul[^>]*>[\s\S]*?<\/ul>/gi, '').replace(/<ol[^>]*>[\s\S]*?<\/ol>/gi, '').trim();
          const spacer = cleanedInner && !cleanedInner.endsWith(' ') ? ' ' : '';
          return `<td${attrs}>${cleanedInner}${spacer}${replacement}</td>`;
        }
      }
      return full;
    });
  } catch (e) {
    // Non-fatal: if preprocessing fails, continue without it
    logger.warn('List-in-table preprocessing failed:', e instanceof Error ? e.message : e);
  }

  // Transform macros to markdown equivalents (with data fetching)
  markdown = await this.transformMacros(markdown, pageId);

  // Transform user links first (before removing ac:link)
  markdown = await this.transformUserLinks(markdown);

  // Transform page links to HTML anchor tags (will be converted to MD links later)
  markdown = await this.transformPageLinks(markdown);

  // Transform images and download attachments
  markdown = await this.transformImages(markdown, pageId, images);

  logger.debug(`Removing layout, time, and other elements`);
  // Remove layout structure tags (they don't add value in markdown)
  markdown = markdown.replace(/<\/?ac:layout[^>]*>/gi, '');
  markdown = markdown.replace(/<\/?ac:layout-section[^>]*>/gi, '\n\n');
  markdown = markdown.replace(/<\/?ac:layout-cell[^>]*>/gi, '\n\n');

  // Time elements: keep only the datetime attribute value
  markdown = markdown.replace(/<time[^>]*datetime="([^"]+)"[^>]*\/?>.*?/gi, '$1');

  logger.debug(`Converting HTML to Markdown using webforai`);
  // Unqualified call: this is the imported converter, not this method.
  markdown = htmlToMarkdown(markdown);

  // Trim whitespace in Markdown table cells
  logger.debug(`Trimming whitespace in Markdown table cells`);
  markdown = markdown.replace(/^\|(.+)\|$/gm, (line) => {
    return line.split('|').map(part => part.trim()).join('|');
  });

  logger.debug(`Post-processing Markdown content (Pending includes, links, cleanup)`);
  // Replace include placeholders with actual content. Converters may escape
  // underscores/backslashes, so the raw, escaped (\_\_INCLUDE_1\_\_) and
  // double-escaped spellings are tried in that order.
  for (const include of this.pendingIncludes) {
    const escaped = include.placeholder.replace(/_/g, '\\_');
    const doubleEscaped = escaped.replace(/\\/g, '\\\\');
    for (const variant of [include.placeholder, escaped, doubleEscaped]) {
      // A replacer function inserts the content literally; a replacement
      // string would interpret sequences such as "$&" or "$1" in the content.
      markdown = markdown.replace(variant, () => include.content);
    }
  }
  this.pendingIncludes = [];
  logger.debug(`Pending includes processed`);

  // Restore page links that were escaped by htmlToMarkdown
  // Pattern: \[Title\](url.md) -> [Title](url.md)
  markdown = markdown.replace(/\\?\[([^\]]+)\\?\]\\?\(([^)]+\.md)\\?\)/g, '[$1]($2)');

  // Unescape image and link bracket escaping produced by converters
  // Example: !\[image.png\](images/image.png) -> ![image.png](images/image.png)
  markdown = markdown.replace(/!\\\[([^\]]+)\\\]\(\s*([^\)]+)\s*\)/g, '![$1]($2)');
  markdown = markdown.replace(/\\\[([^\]]+)\\\]\(\s*([^\)]+)\s*\)/g, '[$1]($2)');

  // Remove remaining ac:link elements
  markdown = markdown.replace(/<ac:link[^>]*>[\s\S]*?<\/ac:link>/g, '');

  logger.debug(`Converting headers`);
  // Headers: <hN>text</hN> -> N '#' characters followed by the text
  for (let level = 1; level <= 6; level++) {
    const headingRegex = new RegExp(`<h${level}[^>]*>(.*?)</h${level}>`, 'gi');
    markdown = markdown.replace(headingRegex, `\n${'#'.repeat(level)} $1\n`);
  }

  logger.debug(`Converting text formatting`);
  // Bold and italic
  markdown = markdown.replace(/<strong[^>]*>(.*?)<\/strong>/gi, '**$1**');
  markdown = markdown.replace(/<b[^>]*>(.*?)<\/b>/gi, '**$1**');
  markdown = markdown.replace(/<em[^>]*>(.*?)<\/em>/gi, '*$1*');
  markdown = markdown.replace(/<i[^>]*>(.*?)<\/i>/gi, '*$1*');

  // Links
  logger.debug(`Converting links`);
  markdown = markdown.replace(/<a[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, '[$2]($1)');

  // Lists
  logger.debug(`Converting lists`);
  markdown = markdown.replace(/<ul[^>]*>/gi, '\n');
  markdown = markdown.replace(/<\/ul>/gi, '\n');
  markdown = markdown.replace(/<ol[^>]*>/gi, '\n');
  markdown = markdown.replace(/<\/ol>/gi, '\n');
  markdown = markdown.replace(/<li[^>]*>(.*?)<\/li>/gi, '- $1\n');

  // Paragraphs
  markdown = markdown.replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n');

  // Code blocks
  markdown = markdown.replace(/<pre[^>]*><code[^>]*>([\s\S]*?)<\/code><\/pre>/gi, '```\n$1\n```\n');
  markdown = markdown.replace(/<code[^>]*>(.*?)<\/code>/gi, '`$1`');

  // Line breaks
  markdown = markdown.replace(/<br\s*\/?>/gi, '\n');

  // Remove remaining HTML tags
  logger.debug(`Removing remaining HTML tags`);
  markdown = markdown.replace(/<[^>]+>/g, '');

  // Clean up HTML entities. `&amp;` is decoded last so that an escaped entity
  // such as `&amp;lt;` becomes the literal text `&lt;` instead of `<`.
  markdown = markdown.replace(/&nbsp;/g, ' ');
  markdown = markdown.replace(/&lt;/g, '<');
  markdown = markdown.replace(/&gt;/g, '>');
  markdown = markdown.replace(/&quot;/g, '"');
  markdown = markdown.replace(/&amp;/g, '&');

  // Clean up extra whitespace
  markdown = markdown.replace(/\n{3,}/g, '\n\n');
  markdown = markdown.trim();

  // Apply markdown cleanup to remove malformed patterns
  logger.debug(`Cleaning up markdown`);
  markdown = this.cleanMarkdown(markdown);

  return markdown;
}
410
+
411
/**
 * Replace image markup with Markdown image references and download the files.
 *
 * Two forms are handled: `<ac:image><ri:attachment ri:filename="…"/></ac:image>`
 * and `<img>` tags whose `src` contains `/download/attachments/`. Each is
 * replaced by `![name](images/<slugified name>)`. When the API client is set,
 * the attachment is downloaded and appended to `images`; download failures
 * are logged as warnings and never thrown.
 *
 * @param content HTML to scan.
 * @param pageId Page the attachments are looked up on (inline `<img>` tags may
 *   override it through `data-linked-resource-container-id`).
 * @param images Output array; downloaded images are pushed onto it.
 * @returns `content` with the matched image markup replaced.
 */
private async transformImages(content: string, pageId: string, images: Array<{ filename: string; data: Buffer }>): Promise<string> {
  let result = content;
  const downloadPromises: Promise<void>[] = [];

  // Slugify the base name and keep the extension (text from the last '.') as is.
  // A leading dot does not count as an extension separator.
  const toImageName = (name: string): string => {
    const dot = name.lastIndexOf('.');
    return dot > 0 ? slugify(name.slice(0, dot)) + name.slice(dot) : slugify(name);
  };

  // Match image attachments: <ac:image><ri:attachment ri:filename="..." /></ac:image>
  const imageRegex = /<ac:image[^>]*><ri:attachment[^>]*ri:filename="([^"]+)"[^>]*\/><\/ac:image>/gi;
  const imageMatches = Array.from(content.matchAll(imageRegex));

  for (const match of imageMatches) {
    const originalFilename = match[1];
    logger.debug(`Processing image attachment: ${originalFilename}`);

    const slugifiedFilename = toImageName(originalFilename);
    const replacement = `![${originalFilename}](images/${slugifiedFilename})`;

    // Download the image if API is available
    if (this.api) {
      downloadPromises.push((async () => {
        try {
          // Try downloading with original filename first (Confluence API may handle encoding internally)
          let imageData = await this.api.downloadAttachment(pageId, originalFilename);

          // If that fails, try with URL-encoded filename
          if (!imageData) {
            const encodedImageName = encodeURIComponent(originalFilename);
            imageData = await this.api.downloadAttachment(pageId, encodedImageName);
          }

          if (imageData) {
            images.push({ filename: slugifiedFilename, data: imageData });
            logger.info(` ✓ Downloaded image: ${originalFilename} -> ${slugifiedFilename}`);
          } else {
            // Image might be on a different page or not exist
            logger.warn(` ⚠ Image not found on this page: ${originalFilename} (may be on parent/child page)`);
          }
        } catch (error) {
          const errorMessage = error instanceof Error ? error.message : String(error);
          if (errorMessage.includes('404')) {
            logger.warn(` ⚠ Image not attached to this page: ${originalFilename}`);
          } else {
            logger.warn(` ⚠ Error downloading image ${originalFilename}:`, errorMessage);
          }
        }
      })());
    }

    logger.debug(`Replacing image tag with markdown: ${replacement}`);
    // Replacer function: the file name is inserted literally even if it
    // contains "$" sequences that a replacement string would interpret.
    result = result.replace(match[0], () => replacement);
  }

  logger.debug(`Processed inline <img> tags that reference /download/attachments/...`);

  // Also handle inline <img> tags that reference /download/attachments/... with optional data-linked-resource-container-id
  // Example: <img class="confluence-embedded-image" src="/download/attachments/715168874/image.png?version=1&api=v2" data-linked-resource-container-id="715168874" />
  const inlineImgRegex = /<img[^>]*src="([^"]*\/download\/attachments\/[^"\s]+)"[^>]*>/gi;
  const inlineImgMatches = Array.from(content.matchAll(inlineImgRegex));

  logger.debug(`Found ${inlineImgMatches.length} inline <img> tags with /download/attachments/ URLs`);

  for (const match of inlineImgMatches) {
    const src = match[1];
    logger.debug(`Processing inline image src: ${src}`);

    // File name = last path segment of the URL, without query parameters
    const filename = (src.split('/').pop() || 'image').split('?')[0];

    // Use the container id from the tag when present, otherwise the current page
    const containerIdMatch = match[0].match(/data-linked-resource-container-id="([^"<>]+)"/i);
    const containerId = containerIdMatch ? containerIdMatch[1] : pageId;

    const slugifiedFilename = toImageName(filename);
    const replacement = `![${filename}](images/${slugifiedFilename})`;

    if (this.api) {
      downloadPromises.push((async () => {
        try {
          logger.debug(`Downloading inline image from container ${containerId} with filename ${filename}`);
          // The API expects the filename as-is; try original filename first
          let imageData = await this.api.downloadAttachment(containerId, filename);

          // Fallback: try URL-decoded filename
          if (!imageData) {
            let decoded = filename;
            try {
              decoded = decodeURIComponent(filename);
            } catch {
              // Malformed percent-escape: keep the raw name so the remaining
              // fallback below still gets a chance.
            }
            if (decoded !== filename) {
              imageData = await this.api.downloadAttachment(containerId, decoded);
            }
          }

          // Another fallback: try removing any appended tokens (some Confluence instances append ids)
          if (!imageData) {
            const simpleName = filename.replace(/^[^a-z0-9]+/i, '').split(/[^a-z0-9.\-_]/i)[0];
            if (simpleName && simpleName !== filename) {
              imageData = await this.api.downloadAttachment(containerId, simpleName);
            }
          }

          if (imageData) {
            images.push({ filename: slugifiedFilename, data: imageData });
            logger.info(` ✓ Downloaded inline image: ${filename} -> ${slugifiedFilename}`);
          } else {
            logger.warn(` ⚠ Inline image not downloaded: ${filename} (container ${containerId})`);
          }
        } catch (error) {
          const errorMessage = error instanceof Error ? error.message : String(error);
          logger.warn(` ⚠ Error downloading inline image ${filename}:`, errorMessage);
        }
      })());
    }

    result = result.replace(match[0], () => replacement);
  }

  // Wait for all downloads to complete
  await Promise.all(downloadPromises);

  logger.debug(`Completed processing inline <img> tags`);
  return result;
}
544
+
545
/**
 * Build the Markdown inserted in place of an include macro.
 *
 * Tried in order:
 *  1. the items of the first `<ul>` or `<ol>` in the page body, as a bullet list;
 *  2. every `<a href>` in the body, as a bullet list of links;
 *  3. a full conversion of the body through `htmlToMarkdown`.
 * The result is always preceded by a `## <title>` heading. If anything throws,
 * a warning is logged and a heading followed by an HTML comment is returned.
 *
 * @param page Included page; only `body` and `id` are read.
 * @param title Heading text, also used as the page id fallback for step 3.
 * @returns Markdown fragment surrounded by blank lines.
 */
private async buildIncludeList(page: Page, title: string): Promise<string> {
  try {
    const html = page.body || '';

    // First list in the document, ordered or unordered. The back-reference
    // makes the closing tag match the kind of the opening tag.
    const listMatch = html.match(/<(ul|ol)\b[^>]*>([\s\S]*?)<\/\1>/i);
    if (listMatch) {
      const itemsHtml = listMatch[2];
      const itemRegex = /<li[^>]*>([\s\S]*?)<\/li>/gi;
      const items: string[] = [];
      for (const m of Array.from(itemsHtml.matchAll(itemRegex))) {
        const text = m[1]
          .trim()
          // Convert <a href> to markdown
          .replace(/<a[^>]*href="([^"]+)"[^>]*>(.*?)<\/a>/gi, '[$2]($1)')
          // Strip remaining tags
          .replace(/<[^>]+>/g, '')
          .trim();
        // Items that are empty once the tags are gone would render as bare bullets
        if (text) items.push(`- ${text}`);
      }
      if (items.length > 0) {
        return `\n\n## ${title}\n\n${items.join('\n')}\n\n`;
      }
    }

    // If no usable list was found, look for anchor links
    const anchorRegex = /<a[^>]*href="([^"]+)"[^>]*>(.*?)<\/a>/gi;
    const anchors = Array.from(html.matchAll(anchorRegex));
    if (anchors.length > 0) {
      const links = anchors.map(a => `- [${a[2].replace(/<[^>]+>/g, '').trim()}](${a[1]})`);
      return `\n\n## ${title}\n\n${links.join('\n')}\n\n`;
    }

    // Fall back to full-page transform; images found there are discarded
    const full = await this.htmlToMarkdown(html, page.id || title, []);
    return `\n\n## ${title}\n\n${full}\n\n`;
  } catch (error) {
    logger.warn(`Failed to build include list for ${title}:`, error);
    return `\n\n## ${title}\n\n<!-- failed to include content -->\n\n`;
  }
}
589
+
590
/**
 * Transform Confluence storage-format macros (`<ac:structured-macro>`) into Markdown.
 *
 * Processing order:
 *  1. `children`      - replaced by a Markdown list of child-page links (API lookup).
 *  2. `list-children` - same, always for the current page.
 *  3. `include`       - replaced by a placeholder; the rendered content is queued in
 *                       `this.pendingIncludes` to be substituted later.
 *  4. `table` / `table-filter` - unwrapped to their rich-text body.
 *  5. code, info, warning, note, panel, excerpt - converted to Markdown text.
 *  6. Known macros without a Markdown equivalent, then any remaining macro,
 *     are replaced by an HTML comment.
 *
 * When `this.api` is not set, the API-backed macros fall back to HTML comment markers.
 *
 * @param content - Page body in Confluence storage format.
 * @param pageId - ID of the page being transformed; used for child-page lookups.
 * @returns The content with macros replaced.
 */
private async transformMacros(content: string, pageId: string): Promise<string> {
  let result = content;

  // A *string* replacement passed to String.prototype.replace has `$&`, `$1`, `$$`, ...
  // expanded even when the search pattern is a plain string. Page titles are arbitrary
  // text, so dynamic replacements go through a replacer function, which inserts verbatim.
  const replaceFirst = (text: string, target: string, replacement: string): string =>
    text.replace(target, () => replacement);

  // Handle children macro - fetch child pages of specified page or current page
  const childrenRegex = /<ac:structured-macro[^>]*ac:name="children"[^>]*>([\s\S]*?)<\/ac:structured-macro>/gis;
  const childrenMatches = Array.from(content.matchAll(childrenRegex));

  for (const match of childrenMatches) {
    let replacement = '<!-- Child Pages -->\n\n';
    const macroContent = match[1];

    if (this.api) {
      try {
        // An optional page parameter redirects the listing to another page (looked up by title).
        const pageParamMatch = macroContent.match(/ri:content-title="([^"]+)"/i);
        let targetPageId = pageId;

        if (pageParamMatch) {
          const targetPage = await this.api.getPageByTitle(this.config.spaceKey, pageParamMatch[1]);
          if (targetPage) {
            targetPageId = targetPage.id;
          }
          // If the title cannot be resolved, the current page's children are listed instead.
        }

        const childPages = await this.api.getChildPages(targetPageId);
        if (childPages.length > 0) {
          replacement = childPages.map(child => `- [${child.title}](${slugify(child.title)}.md)`).join('\n') + '\n\n';
        }
      } catch (error) {
        logger.warn(`Failed to fetch child pages:`, error);
      }
    }

    result = replaceFirst(result, match[0], replacement);
  }

  // Handle list-children macro - fetch actual child pages
  const listChildrenRegex = /<ac:structured-macro[^>]*ac:name="list-children"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis;
  const listChildrenMatches = Array.from(result.matchAll(listChildrenRegex));

  for (const match of listChildrenMatches) {
    let replacement = '<!-- Child Pages List -->\n\n';

    if (this.api) {
      try {
        const childPages = await this.api.getChildPages(pageId);
        if (childPages.length > 0) {
          replacement = childPages.map(child => `- [${child.title}](${slugify(child.title)}.md)`).join('\n') + '\n\n';
        }
      } catch (error) {
        logger.warn(`Failed to fetch child pages for ${pageId}:`, error);
      }
    }

    result = replaceFirst(result, match[0], replacement);
  }

  // Handle include macro - fetch content from included page
  const includeRegex = /<ac:structured-macro[^>]*ac:name="include"[^>]*>([\s\S]*?)<\/ac:structured-macro>/gis;
  const includeMatches = Array.from(result.matchAll(includeRegex));

  for (const match of includeMatches) {
    const macroContent = match[1];
    const titleMatch = macroContent.match(/ri:content-title="([^"]+)"/i);

    if (!titleMatch || !this.api) {
      result = replaceFirst(result, match[0], '<!-- Include macro -->\n\n');
      continue;
    }

    const includeTitle = titleMatch[1];
    try {
      let includedPage: Page | null;
      if (includeTitle === "FCS Useful Links") {
        // NOTE(review): hard-coded page id for one specific page title; this looks
        // site-specific and probably belongs in configuration - confirm with the owner.
        includedPage = await this.api.getPage("167810724");
      } else {
        includedPage = await this.api.getPageByTitle(this.config.spaceKey, includeTitle);
      }

      if (includedPage && includedPage.body) {
        // Build a concise Markdown list from the included page using the API
        const listMd = await this.buildIncludeList(includedPage, includeTitle);

        // Generate a unique placeholder per include to avoid collisions
        const placeholder = `__INCLUDE_${this.pendingIncludes.length + 1}__`;

        // Replace macro with placeholder and remember the content for later
        result = replaceFirst(result, match[0], placeholder);
        this.pendingIncludes.push({ placeholder, content: listMd });
      } else {
        result = replaceFirst(result, match[0], `<!-- Include: ${includeTitle} (page not found) -->\n\n`);
      }
    } catch (error) {
      logger.warn(`Failed to fetch included page "${includeTitle}":`, error);
      result = replaceFirst(result, match[0], `<!-- Include: ${includeTitle} (error) -->\n\n`);
    }
  }

  // Preserve table-like macros: extract the inner rich-text-body so HTML tables
  // inside macros (e.g. table-filter) are retained and later converted to Markdown.
  result = result.replace(/<ac:structured-macro[^>]*ac:name="(?:table|table-filter)"[^>]*>[\s\S]*?<ac:rich-text-body>([\s\S]*?)<\/ac:rich-text-body>[\s\S]*?<\/ac:structured-macro>/gis, '$1\n\n');

  // Apply other macro transformations. Order matters: specific macros first, the
  // catch-all "other macros" rule last.
  result = result
    // Code blocks with language
    .replace(/<ac:structured-macro[^>]*ac:name="code"[^>]*>.*?<ac:parameter[^>]*ac:name="language"[^>]*>(.*?)<\/ac:parameter>.*?<ac:plain-text-body><!\[CDATA\[(.*?)\]\]><\/ac:plain-text-body>.*?<\/ac:structured-macro>/gis, '```$1\n$2\n```\n\n')
    // Code blocks without language
    .replace(/<ac:structured-macro[^>]*ac:name="code"[^>]*>.*?<ac:plain-text-body><!\[CDATA\[(.*?)\]\]><\/ac:plain-text-body>.*?<\/ac:structured-macro>/gis, '```\n$1\n```\n\n')
    // Info panels: emit "[i] <title>" (when a title parameter exists) followed by the
    // rich-text body, which is transformed further by later stages.
    .replace(/<ac:structured-macro[^>]*ac:name="info"[^>]*>([\s\S]*?)<\/ac:structured-macro>/gis, (_match, inner: string) => {
      try {
        // Extract title parameter if present
        const titleMatch = inner.match(/<ac:parameter[^>]*ac:name="title"[^>]*>([\s\S]*?)<\/ac:parameter>/i);
        const title = titleMatch ? titleMatch[1].trim() : '';

        // Extract rich-text-body content
        const bodyMatch = inner.match(/<ac:rich-text-body>([\s\S]*?)<\/ac:rich-text-body>/i);
        const body = bodyMatch ? bodyMatch[1].trim() : '';

        const titleLine = title ? `[i] ${title}\n\n` : '';

        return `${titleLine}${body}\n\n`;
      } catch {
        return '<!-- Info macro -->\n\n';
      }
    })
    // Warning panels. The pattern previously required `</ac:rich-text-body>` twice, so an
    // ordinary warning macro either did not match or over-matched into a later macro.
    .replace(/<ac:structured-macro[^>]*ac:name="warning"[^>]*>.*?<ac:rich-text-body>(.*?)<\/ac:rich-text-body>.*?<\/ac:structured-macro>/gis, '> **Warning:** $1\n\n')
    // Note panels
    .replace(/<ac:structured-macro[^>]*ac:name="note"[^>]*>.*?<ac:rich-text-body>(.*?)<\/ac:rich-text-body>.*?<\/ac:structured-macro>/gis, '> **Note:** $1\n\n')
    // Panel macro - extract content
    .replace(/<ac:structured-macro[^>]*ac:name="panel"[^>]*>.*?<ac:rich-text-body>(.*?)<\/ac:rich-text-body>.*?<\/ac:structured-macro>/gis, '$1\n\n')
    // Excerpt macro - extract content
    .replace(/<ac:structured-macro[^>]*ac:name="excerpt"[^>]*>.*?<ac:rich-text-body>(.*?)<\/ac:rich-text-body>.*?<\/ac:structured-macro>/gis, '$1\n\n')
    // Table of contents
    .replace(/<ac:structured-macro[^>]*ac:name="toc"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Table of Contents -->\n\n')
    // Content by label
    .replace(/<ac:structured-macro[^>]*ac:name="contentbylabel"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Content by Label -->\n\n')
    // Livesearch macro
    .replace(/<ac:structured-macro[^>]*ac:name="livesearch"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Live Search -->\n\n')
    // Jira macro
    .replace(/<ac:structured-macro[^>]*ac:name="jira"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Jira Issues -->\n\n')
    // Recently updated macro
    .replace(/<ac:structured-macro[^>]*ac:name="recently-updated"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Recently Updated Pages -->\n\n')
    // Popular labels macro
    .replace(/<ac:structured-macro[^>]*ac:name="popular-labels"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Popular Labels -->\n\n')
    // Other macros - convert to comments
    .replace(/<ac:structured-macro[^>]*ac:name="([^"]*)"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Confluence Macro: $1 -->\n\n');

  return result;
}
751
+
752
/**
 * Transform Confluence user links (`<ac:link><ri:user .../></ac:link>`) into `@mentions`.
 *
 * Links carrying `ri:username` are resolved through `api.getUserByUsername`, links
 * carrying `ri:userkey` through `api.getUserByKey`. When a lookup returns nothing the
 * mention falls back to `@<username>` or `@user-<last 8 chars of the key>`.
 * Without an API client every user link becomes `@unknown-user`.
 *
 * @param html - Content containing Confluence user links.
 * @returns The content with user links replaced by mentions.
 */
private async transformUserLinks(html: string): Promise<string> {
  if (!this.api) {
    // If no API provided, just remove user links
    return html.replace(/<ac:link[^>]*><ri:user[^>]*\/><\/ac:link>/g, '@unknown-user');
  }

  const api = this.api;
  let result = html;

  // Match user links by username
  const usernameRegex = /<ac:link[^>]*><ri:user[^>]*ri:username="([^"]+)"[^>]*\/><\/ac:link>/gi;
  // Mentions already resolved in this call, so a user referenced many times is looked up once.
  const mentionByUsername = new Map<string, string>();

  for (const match of Array.from(html.matchAll(usernameRegex))) {
    const username = match[1];
    let mention = mentionByUsername.get(username);

    if (mention === undefined) {
      const user = await api.getUserByUsername(username);
      mention = user ? `@${user.displayName}` : `@${username}`;
      mentionByUsername.set(username, mention);
    }

    // Replacer function: display names are inserted verbatim, so `$&`, `$1`, ...
    // inside a name are not expanded as replacement patterns.
    const text = mention;
    result = result.replace(match[0], () => text);
  }

  // Match user links by userkey (searched in `result`, i.e. after the username pass)
  const userkeyRegex = /<ac:link[^>]*><ri:user[^>]*ri:userkey="([^"]+)"[^>]*\/><\/ac:link>/gi;
  const mentionByUserKey = new Map<string, string>();

  for (const match of Array.from(result.matchAll(userkeyRegex))) {
    const userKey = match[1];
    let mention = mentionByUserKey.get(userKey);

    if (mention === undefined) {
      const user = await api.getUserByKey(userKey);
      mention = user ? `@${user.displayName}` : `@user-${userKey.slice(-8)}`;
      mentionByUserKey.set(userKey, mention);
    }

    const text = mention;
    result = result.replace(match[0], () => text);
  }

  return result;
}
795
+
796
/**
 * Transform Confluence page references into Markdown links of the form
 * `[Title](<slugified-title>.md)`.
 *
 * Two shapes are handled, in this order:
 *  1. `<ac:link><ri:page ri:content-title="Title" /></ac:link>`
 *  2. a bare `<ri:page ri:content-title="Title" />` left over after step 1.
 *
 * The method does no asynchronous work itself; it stays `async` so existing callers
 * that await it keep working.
 *
 * @param html - Content containing Confluence page references.
 * @returns The content with page references replaced by Markdown links.
 */
private async transformPageLinks(html: string): Promise<string> {
  const toMarkdownLink = (title: string): string => `[${title}](${slugify(title)}.md)`;

  // Format 1: <ac:link><ri:page ri:content-title="Title" /></ac:link>
  const wrappedPageLink = /<ac:link[^>]*>\s*<ri:page[^>]*ri:content-title="([^"]+)"[^>]*\/>\s*<\/ac:link>/gi;
  // Format 2: Just <ri:page ri:content-title="Title" /> without ac:link wrapper
  const barePageLink = /<ri:page[^>]*ri:content-title="([^"]+)"[^>]*\/>/gi;

  // Replacer callbacks insert the link text verbatim; a string replacement would
  // expand `$&`, `$1`, ... sequences that happen to occur in a page title.
  return html
    .replace(wrappedPageLink, (_whole, title: string) => toMarkdownLink(title))
    .replace(barePageLink, (_whole, title: string) => toMarkdownLink(title));
}
825
+
826
/**
 * Tidy converted Markdown in three passes, then normalize whitespace.
 *
 * The Confluence-specific cleanup runs before and after the general cleanup so that
 * artefacts exposed by the general pass are caught as well. Finally, runs of four or
 * more newlines collapse to three and the text ends with exactly one newline.
 *
 * @param markdown - Markdown produced by the HTML conversion.
 * @returns The cleaned Markdown.
 */
private cleanMarkdown(markdown: string): string {
  logger.debug('Cleaning Confluence-specific markdown patterns');
  const afterFirstPass = this.cleanConfluencePatterns(markdown);

  logger.debug('Cleaning general markdown patterns');
  const afterGeneralPass = this.cleanGeneral(afterFirstPass);

  logger.debug('Cleaning Confluence-specific markdown patterns (second pass)');
  const afterSecondPass = this.cleanConfluencePatterns(afterGeneralPass);

  // Collapse excessive blank lines, then force a single trailing newline.
  const collapsed = afterSecondPass.replace(/\n{4,}/g, '\n\n\n');
  const finalText = `${collapsed.trim()}\n`;

  logger.debug('Final cleanup of excessive whitespace');
  return finalText;
}
851
+
852
/**
 * Strip Markdown artefacts that typically show up in converted Confluence content.
 *
 * The rules are applied one after another, in the listed order, each on the output
 * of the previous one.
 *
 * NOTE(review): the emphasis-pair rules look only at the markers and the whitespace
 * between them, so they also match the closing marker of one emphasized run followed
 * by the opening marker of the next (e.g. the `**\n\n**` between two bold paragraphs).
 * Confirm whether that is intended.
 *
 * @param markdown - Markdown text to clean.
 * @returns The text with the patterns removed.
 */
private cleanConfluencePatterns(markdown: string): string {
  const rules: ReadonlyArray<readonly [pattern: RegExp, replacement: string]> = [
    // Bold markers separated only by a blank line, e.g. "**\n\n**"
    [/\*\*\s*\n\s*\n\s*\*\*/g, ''],
    // Lines consisting solely of "**"
    [/^\s*\*\*\s*$/gm, ''],
    // Headers without any text
    [/^#+\s*$/gm, ''],
    // Bold markers wrapping only whitespace
    [/\*\*\s+\*\*/g, ' '],
    // Italic markers wrapping only whitespace
    [/\*\s+\*/g, ' '],
    // Blockquote lines without content
    [/^>\s*$/gm, ''],
    // Code fences with nothing between them
    [/```\s*\n\s*```/g, ''],
    // A single "-", "*" or "_" alone on a line (malformed horizontal rule)
    [/^[-*_]\s*$/gm, ''],
  ];

  return rules.reduce((text, [pattern, replacement]) => text.replace(pattern, replacement), markdown);
}
885
+
886
/**
 * Generic Markdown cleanup, independent of Confluence specifics.
 *
 * Runs an ordered list of passes; each pass logs its debug message and then rewrites
 * the text produced by the previous pass. The result has no trailing whitespace on
 * any line and ends with exactly one newline.
 *
 * @param markdown - Markdown text to clean.
 * @returns The cleaned text.
 */
private cleanGeneral(markdown: string): string {
  const passes: ReadonlyArray<{ message: string; apply: (text: string) => string }> = [
    {
      // e.g. "## **" or "## *" with nothing else on the line
      message: 'Removing empty headers with only formatting markers',
      apply: text =>
        text
          .replace(/^#+\s*\*\*\s*$/gm, '')
          .replace(/^#+\s*\*\s*$/gm, '')
          .replace(/^#+\s*__\s*$/gm, '')
          .replace(/^#+\s*_\s*$/gm, ''),
    },
    {
      // e.g. "## **\n\n**" - marker pair split over lines with only whitespace between
      message: 'Removing headers with only formatting markers across multiple lines',
      apply: text =>
        text
          .replace(/^(#+)\s*\*\*\s*\n+\s*\*\*\s*$/gm, '')
          .replace(/^(#+)\s*\*\s*\n+\s*\*\s*$/gm, ''),
    },
    {
      message: 'Removing empty bold markers',
      apply: text => text.replace(/\*\*\s*\*\*/g, '').replace(/__\s*__/g, ''),
    },
    {
      message: 'Removing standalone italic markers on their own line',
      apply: text => text.replace(/^\s*\*\s*$/gm, '').replace(/^\s*_\s*$/gm, ''),
    },
    {
      message: 'Removing empty italic markers that span multiple lines',
      apply: text => text.replace(/\*\s*\n+\s*\*/g, '\n\n'),
    },
    {
      message: 'Removing empty links',
      apply: text => text.replace(/\[\s*\]\(\s*\)/g, ''),
    },
    {
      message: 'Removing empty list items',
      apply: text => text.replace(/^[-*+]\s*$/gm, ''),
    },
    {
      // Four or more consecutive newlines collapse to three
      message: 'Cleaning up excessive blank lines',
      apply: text => text.replace(/\n{4,}/g, '\n\n\n'),
    },
    {
      message: 'Removing trailing whitespace from each line',
      apply: text =>
        text
          .split('\n')
          .map(line => line.trimEnd())
          .join('\n'),
    },
    {
      message: 'Ensuring single trailing newline at end of file',
      apply: text => text.trim() + '\n',
    },
  ];

  let cleaned = markdown;
  for (const pass of passes) {
    logger.debug(pass.message);
    cleaned = pass.apply(cleaned);
  }

  return cleaned;
}
942
+
943
/**
 * Create a flat `links/` folder of symlinks to every exported Markdown file and write
 * `_links.md`, a Markdown tree of links to those files.
 *
 * - Any existing `links/` folder is removed and recreated.
 * - Files are collected recursively from `outputDir`. Directories named `images` or
 *   `links`, and any file or directory whose name starts with `_`, are skipped.
 * - Each symlink is named after the file's basename, so files sharing a basename in
 *   different directories collide; the failing ones are logged as warnings and are
 *   not counted in the summary line.
 * - `_links.md` is formatted with prettier; if formatting fails the unformatted
 *   content is written instead.
 *
 * @param outputDir - Root directory of the export.
 */
private async createLinksStructure(outputDir: string): Promise<void> {
  type MdFile = { path: string; relativePath: string };

  const linksDir = path.join(outputDir, 'links');

  // Remove existing links folder if it exists
  try {
    await fs.rm(linksDir, { recursive: true, force: true });
  } catch {
    // Best effort: a stale folder is not fatal, mkdir below is recursive.
  }

  // Create fresh links folder
  await fs.mkdir(linksDir, { recursive: true });

  // Find all MD files recursively; `fileList` is the shared accumulator.
  const findMdFiles = async (dir: string, fileList: MdFile[] = []): Promise<MdFile[]> => {
    const entries = await fs.readdir(dir, { withFileTypes: true });

    for (const entry of entries) {
      const fullPath = path.join(dir, entry.name);

      if (entry.isDirectory() && !entry.name.startsWith('_') && entry.name !== 'images' && entry.name !== 'links') {
        await findMdFiles(fullPath, fileList);
      } else if (entry.isFile() && entry.name.endsWith('.md') && !entry.name.startsWith('_')) {
        fileList.push({ path: fullPath, relativePath: path.relative(outputDir, fullPath) });
      }
    }

    return fileList;
  };

  const mdFiles = await findMdFiles(outputDir);

  // Create symlinks in links folder, counting only those that were actually created
  // so the summary below stays accurate when some of them fail.
  let createdCount = 0;
  for (const file of mdFiles) {
    const linkName = path.basename(file.path);
    const linkPath = path.join(linksDir, linkName);
    // Relative target keeps the links valid if the export directory is moved as a whole.
    const targetPath = path.relative(linksDir, file.path);

    try {
      await fs.symlink(targetPath, linkPath);
      createdCount += 1;
    } catch (error) {
      logger.warn(` ⚠ Failed to create symlink for ${linkName}:`, error instanceof Error ? error.message : error);
    }
  }

  logger.info(` ✓ Created ${createdCount} symlinks in links/`);

  // Build tree structure for _links.md
  const tree = this.buildFileTree(mdFiles);
  const treeMarkdown = this.generateTreeMarkdown(tree, outputDir);

  // Write _links.md
  const linksFilePath = path.join(outputDir, '_links.md');
  const linksContent = `# Documentation Links\n\n${treeMarkdown}`;

  try {
    const formattedContent = await prettier.format(linksContent, {
      parser: 'markdown',
      printWidth: 120,
      proseWrap: 'preserve',
      tabWidth: 2,
      useTabs: false
    });
    await fs.writeFile(linksFilePath, formattedContent, 'utf-8');
  } catch {
    // Fall back to the unformatted content.
    await fs.writeFile(linksFilePath, linksContent, 'utf-8');
  }

  logger.info(` ✓ Created _links.md with tree structure`);
}
1017
+
1018
/**
 * Build a directory tree from a flat list of files.
 *
 * Each file's `relativePath` is split on the platform path separator; every leading
 * segment becomes (or reuses) a directory node and the last segment is recorded as a
 * file of the innermost directory.
 *
 * @param files - Files to arrange; only `relativePath` is read.
 * @returns The root node (empty name) of the tree.
 */
private buildFileTree(files: Array<{ path: string; relativePath: string }>): TreeNode {
  // `children` is keyed by directory name, which is arbitrary text. A null-prototype
  // map keeps names such as "constructor" or "toString" from resolving to inherited
  // Object.prototype members, which would be mistaken for existing child nodes.
  const createNode = (name: string): TreeNode => ({ name, children: Object.create(null), files: [] });

  const root = createNode('');

  for (const file of files) {
    const parts = file.relativePath.split(path.sep);
    let current = root;

    // Navigate/create directory structure
    for (const part of parts.slice(0, -1)) {
      let child = current.children[part];
      if (!child) {
        child = createNode(part);
        current.children[part] = child;
      }
      current = child;
    }

    // Add file to current directory
    current.files.push({
      name: parts[parts.length - 1],
      relativePath: file.relativePath
    });
  }

  return root;
}
1046
+
1047
/**
 * Render a file tree as a nested Markdown list.
 *
 * Directories come first (sorted by name, rendered in bold with a trailing slash),
 * followed by the files of the node (sorted with `localeCompare`) as links to their
 * path relative to the export root.
 *
 * @param node - Tree node to render.
 * @param outputDir - Export root; not used by the rendering itself, passed through on recursion.
 * @param level - Nesting depth of `node`, 0 for the root.
 * @returns Markdown list lines, each terminated by a newline.
 */
private generateTreeMarkdown(node: TreeNode, outputDir: string, level: number = 0): string {
  // Two spaces per level: a nested list item has to be indented to the content column
  // of its parent ("- " is two characters wide), otherwise it is parsed as a sibling.
  const indent = '  '.repeat(level);

  // Sort directories and files alphabetically; copy the files first so the tree
  // passed in by the caller is not reordered as a side effect.
  const sortedDirs = Object.keys(node.children).sort();
  const sortedFiles = [...node.files].sort((a, b) => a.name.localeCompare(b.name));

  let result = '';

  // Add directories first
  for (const dirName of sortedDirs) {
    result += `${indent}- **${dirName}/**\n`;
    result += this.generateTreeMarkdown(node.children[dirName], outputDir, level + 1);
  }

  // Add files
  for (const file of sortedFiles) {
    // Markdown link targets use forward slashes regardless of the platform separator.
    const linkPath = file.relativePath.split(path.sep).join('/');
    result += `${indent}- [${file.name}](${linkPath})\n`;
  }

  return result;
}
1073
+
1074
/**
 * Remove previously exported output below `dir`.
 *
 * - `images` and `links` directories are deleted entirely.
 * - Other directories are processed recursively, except those whose name starts
 *   with `_`, which are left untouched.
 * - `.md` files are deleted unless their name starts with `_`.
 *
 * Any error while processing `dir` is logged as a warning and ends the work on that
 * directory; it is not rethrown.
 *
 * @param dir - Directory to clear.
 */
private async clearExistingFiles(dir: string): Promise<void> {
  try {
    for (const entry of await fs.readdir(dir, { withFileTypes: true })) {
      const target = path.join(dir, entry.name);
      const isUnderscored = entry.name.startsWith('_');

      if (entry.isDirectory()) {
        const isGeneratedFolder = entry.name === 'images' || entry.name === 'links';

        if (isGeneratedFolder) {
          // Generated folders go away as a whole
          await fs.rm(target, { recursive: true, force: true });
          logger.info(` Removed: ${path.relative(this.config.outputDir, target)}/`);
        } else if (!isUnderscored) {
          // Descend into regular subdirectories
          await this.clearExistingFiles(target);
        }
        continue;
      }

      if (entry.isFile() && entry.name.endsWith('.md') && !isUnderscored) {
        await fs.unlink(target);
        logger.info(` Removed: ${path.relative(this.config.outputDir, target)}`);
      }
    }
  } catch (error) {
    logger.warn(`Warning: Could not clear files in ${dir}:`, error instanceof Error ? error.message : error);
  }
}
1103
+ }