confluence-exporter 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (91) hide show
  1. package/.eslintrc.cjs +18 -0
  2. package/.github/copilot-instructions.md +3 -0
  3. package/.github/prompts/analyze.prompt.md +101 -0
  4. package/.github/prompts/clarify.prompt.md +158 -0
  5. package/.github/prompts/constitution.prompt.md +73 -0
  6. package/.github/prompts/implement.prompt.md +56 -0
  7. package/.github/prompts/plan.prompt.md +50 -0
  8. package/.github/prompts/specify.prompt.md +21 -0
  9. package/.github/prompts/tasks.prompt.md +69 -0
  10. package/LICENSE +21 -0
  11. package/README.md +332 -0
  12. package/agents.md +1174 -0
  13. package/dist/api.d.ts +73 -0
  14. package/dist/api.js +387 -0
  15. package/dist/api.js.map +1 -0
  16. package/dist/commands/download.command.d.ts +18 -0
  17. package/dist/commands/download.command.js +257 -0
  18. package/dist/commands/download.command.js.map +1 -0
  19. package/dist/commands/executor.d.ts +22 -0
  20. package/dist/commands/executor.js +52 -0
  21. package/dist/commands/executor.js.map +1 -0
  22. package/dist/commands/help.command.d.ts +8 -0
  23. package/dist/commands/help.command.js +68 -0
  24. package/dist/commands/help.command.js.map +1 -0
  25. package/dist/commands/index.command.d.ts +14 -0
  26. package/dist/commands/index.command.js +95 -0
  27. package/dist/commands/index.command.js.map +1 -0
  28. package/dist/commands/index.d.ts +13 -0
  29. package/dist/commands/index.js +13 -0
  30. package/dist/commands/index.js.map +1 -0
  31. package/dist/commands/plan.command.d.ts +54 -0
  32. package/dist/commands/plan.command.js +272 -0
  33. package/dist/commands/plan.command.js.map +1 -0
  34. package/dist/commands/registry.d.ts +12 -0
  35. package/dist/commands/registry.js +32 -0
  36. package/dist/commands/registry.js.map +1 -0
  37. package/dist/commands/transform.command.d.ts +69 -0
  38. package/dist/commands/transform.command.js +951 -0
  39. package/dist/commands/transform.command.js.map +1 -0
  40. package/dist/commands/types.d.ts +12 -0
  41. package/dist/commands/types.js +5 -0
  42. package/dist/commands/types.js.map +1 -0
  43. package/dist/commands/update.command.d.ts +10 -0
  44. package/dist/commands/update.command.js +201 -0
  45. package/dist/commands/update.command.js.map +1 -0
  46. package/dist/constants.d.ts +1 -0
  47. package/dist/constants.js +2 -0
  48. package/dist/constants.js.map +1 -0
  49. package/dist/index.d.ts +5 -0
  50. package/dist/index.js +110 -0
  51. package/dist/index.js.map +1 -0
  52. package/dist/logger.d.ts +15 -0
  53. package/dist/logger.js +52 -0
  54. package/dist/logger.js.map +1 -0
  55. package/dist/types.d.ts +167 -0
  56. package/dist/types.js +5 -0
  57. package/dist/types.js.map +1 -0
  58. package/dist/utils.d.ts +56 -0
  59. package/dist/utils.js +178 -0
  60. package/dist/utils.js.map +1 -0
  61. package/eslint.config.js +29 -0
  62. package/jest.config.cjs +25 -0
  63. package/migrate-meta.js +132 -0
  64. package/package.json +53 -0
  65. package/src/api.ts +469 -0
  66. package/src/commands/download.command.ts +324 -0
  67. package/src/commands/executor.ts +62 -0
  68. package/src/commands/help.command.ts +72 -0
  69. package/src/commands/index.command.ts +111 -0
  70. package/src/commands/index.ts +14 -0
  71. package/src/commands/plan.command.ts +318 -0
  72. package/src/commands/registry.ts +39 -0
  73. package/src/commands/transform.command.ts +1103 -0
  74. package/src/commands/types.ts +16 -0
  75. package/src/commands/update.command.ts +229 -0
  76. package/src/constants.ts +0 -0
  77. package/src/index.ts +120 -0
  78. package/src/logger.ts +60 -0
  79. package/src/test.sh +66 -0
  80. package/src/types.ts +176 -0
  81. package/src/utils.ts +204 -0
  82. package/tests/commands/README.md +123 -0
  83. package/tests/commands/download.command.test.ts +8 -0
  84. package/tests/commands/help.command.test.ts +8 -0
  85. package/tests/commands/index.command.test.ts +8 -0
  86. package/tests/commands/plan.command.test.ts +15 -0
  87. package/tests/commands/transform.command.test.ts +8 -0
  88. package/tests/fixtures/_index.yaml +38 -0
  89. package/tests/fixtures/mock-pages.ts +62 -0
  90. package/tsconfig.json +25 -0
  91. package/vite.config.ts +45 -0
@@ -0,0 +1,951 @@
1
+ /**
2
+ * Transform command handler - Transforms HTML files to Markdown
3
+ */
4
+ import { promises as fs } from 'fs';
5
+ import path from 'path';
6
+ import prettier from 'prettier';
7
+ import { htmlToMarkdown } from "webforai";
8
+ import { ConfluenceApi } from '../api.js';
9
+ import { pagePath, slugify, unslugify } from '../utils.js';
10
+ import { logger } from '../logger.js';
11
+ export class TransformCommand {
12
+ config;
13
+ pendingIncludes = [];
14
+ api;
15
+ constructor(config) {
16
+ this.config = config;
17
+ }
18
+ async execute(_context) {
19
+ this.api = new ConfluenceApi(this.config);
20
+ logger.info(`Transforming HTML files to Markdown...`);
21
+ logger.info(`Output directory: ${this.config.outputDir}\n`);
22
+ // Clear existing MD files and images if --clear flag is set
23
+ if (this.config.clear) {
24
+ logger.info('Clearing existing .md files and images folders...');
25
+ await this.clearExistingFiles(this.config.outputDir);
26
+ logger.info('✓ Cleared existing files\n');
27
+ }
28
+ let transformedCount = 0;
29
+ let skippedCount = 0;
30
+ let errorCount = 0;
31
+ const htmlFiles = [];
32
+ if (this.config.pageId) {
33
+ logger.info(`Processing specific page: ${this.config.pageId}\n`);
34
+ const pageHtmlPath = pagePath(this.config.pageId, this.config);
35
+ logger.info(`HTML path: ${pageHtmlPath}\n`);
36
+ htmlFiles.push(pageHtmlPath);
37
+ }
38
+ else {
39
+ // Helper function to recursively find HTML files
40
+ const findHtmlFiles = async (dir, fileList = []) => {
41
+ const entries = await fs.readdir(dir, { withFileTypes: true });
42
+ for (const entry of entries) {
43
+ const fullPath = path.join(dir, entry.name);
44
+ if (entry.isDirectory() && !entry.name.startsWith('_') && entry.name !== 'images') {
45
+ // Recursively search subdirectories (skip _index, _queue, etc. and images folder)
46
+ await findHtmlFiles(fullPath, fileList);
47
+ }
48
+ else if (entry.isFile() && entry.name.endsWith('.html') && !entry.name.startsWith('_')) {
49
+ fileList.push(fullPath);
50
+ }
51
+ }
52
+ return fileList;
53
+ };
54
+ // Find all HTML files recursively
55
+ htmlFiles.push(...await findHtmlFiles(this.config.outputDir));
56
+ }
57
+ if (htmlFiles.length === 0) {
58
+ logger.info('No HTML files found to transform.');
59
+ logger.info('Run the "download" command first to download HTML pages.');
60
+ return;
61
+ }
62
+ // Apply limit if specified
63
+ const filesToProcess = this.config.limit ? htmlFiles.slice(0, this.config.limit) : htmlFiles;
64
+ logger.info(`Found ${htmlFiles.length} HTML files`);
65
+ if (this.config.limit && htmlFiles.length > this.config.limit) {
66
+ logger.info(`Limiting to first ${this.config.limit} files\n`);
67
+ }
68
+ else {
69
+ logger.info();
70
+ }
71
+ // Process HTML files in parallel batches
72
+ const batchSize = this.config.parallel || 5;
73
+ const batches = [];
74
+ for (let i = 0; i < filesToProcess.length; i += batchSize) {
75
+ batches.push(filesToProcess.slice(i, i + batchSize));
76
+ }
77
+ for (let batchIndex = 0; batchIndex < batches.length; batchIndex++) {
78
+ const batch = batches[batchIndex];
79
+ const batchStart = batchIndex * batchSize + 1;
80
+ const batchEnd = Math.min((batchIndex + 1) * batchSize, filesToProcess.length);
81
+ logger.info(`Processing batch ${batchIndex + 1}/${batches.length} (files ${batchStart}-${batchEnd})`);
82
+ await Promise.all(batch.map(async (htmlFilepath, indexInBatch) => {
83
+ const globalIndex = batchIndex * batchSize + indexInBatch;
84
+ await this.processFile(htmlFilepath, globalIndex + 1, filesToProcess.length);
85
+ }));
86
+ }
87
+ logger.info(`\n✓ Transformation complete!`);
88
+ logger.info(` Processed: ${filesToProcess.length} files in ${batches.length} batches`);
89
+ logger.info(` Note: Files are processed in parallel batches of up to ${batchSize} pages each`);
90
+ logger.info(` Check individual file logs above for skipped/transformed status`);
91
+ // Create links folder and _links.md file
92
+ logger.info('\nCreating links folder and _links.md file...');
93
+ await this.createLinksStructure(this.config.outputDir);
94
+ logger.info('✓ Links structure created');
95
+ }
96
+ /**
97
+ * Process a single HTML file to Markdown
98
+ */
99
+ async processFile(htmlFilepath, index, total) {
100
+ const htmlFile = path.basename(htmlFilepath);
101
+ const dirPath = path.dirname(htmlFilepath);
102
+ const baseFilename = htmlFile.replace('.html', '');
103
+ const mdFilename = `${baseFilename}.md`;
104
+ const mdFilepath = path.join(dirPath, mdFilename);
105
+ const id = baseFilename.split('-')[0];
106
+ // Show relative path for better readability
107
+ const relativePath = path.relative(this.config.outputDir, htmlFilepath);
108
+ logger.info(`[${index}/${total}] Checking: ${relativePath}`);
109
+ logger.debug(`Processing file ${baseFilename} (ID: ${id})`);
110
+ // Check if MD file already exists
111
+ try {
112
+ await fs.access(mdFilepath);
113
+ if (this.config.force) {
114
+ logger.info(` ⚑ Force: Overwriting existing ${mdFilename}`);
115
+ // If forcing, remove existing images folder for this page to avoid stale files
116
+ try {
117
+ const imagesDir = path.join(dirPath, 'images');
118
+ await fs.rm(imagesDir, { recursive: true, force: true });
119
+ logger.info(` ✓ Removed existing images/ for ${baseFilename}`);
120
+ }
121
+ catch (err) {
122
+ // Non-fatal if images removal fails
123
+ logger.warn(` ⚠ Could not remove images for ${baseFilename}:`, err instanceof Error ? err.message : err);
124
+ }
125
+ }
126
+ else {
127
+ logger.info(` ⊘ Skipped: ${mdFilename} already exists`);
128
+ return;
129
+ }
130
+ }
131
+ catch {
132
+ // MD file doesn't exist, proceed with transformation
133
+ }
134
+ try {
135
+ logger.debug(`Reading HTML content from ${htmlFilepath}`);
136
+ // Read HTML content
137
+ const htmlContent = await fs.readFile(htmlFilepath, 'utf-8');
138
+ logger.debug(`HTML content length: ${htmlContent.length} characters`);
139
+ // Parse the title from filename (reverse slugification is lossy, but best effort)
140
+ const title = unslugify(baseFilename);
141
+ logger.debug(`Parsed title: "${title}"`);
142
+ logger.debug(`Starting HTML to Markdown transformation`);
143
+ // Transform HTML to Markdown
144
+ const images = [];
145
+ const markdownBody = await this.htmlToMarkdown(htmlContent, id, images);
146
+ logger.debug(`Transformation complete, markdown length: ${markdownBody.length} characters`);
147
+ // Build original page URL (use baseUrl if available)
148
+ const originalUrl = this.config.baseUrl
149
+ ? `${this.config.baseUrl}/pages/viewpage.action?pageId=${id}`
150
+ : '';
151
+ logger.debug(`Original URL: ${originalUrl || 'none'}`);
152
+ // Create front matter
153
+ const frontMatter = [
154
+ '---',
155
+ `title: "${title.replace(/"/g, '\\"')}"`,
156
+ `id: "${id}"`,
157
+ originalUrl ? `url: "${originalUrl}"` : '',
158
+ '---'
159
+ ].filter(Boolean).join('\n');
160
+ logger.debug(`Front matter created`);
161
+ // Before finalizing, replace any pending include placeholders inside markdownBody
162
+ let finalBody = markdownBody;
163
+ logger.debug(`Processing ${this.pendingIncludes.length} pending includes`);
164
+ for (const include of this.pendingIncludes) {
165
+ // Replace raw placeholder
166
+ finalBody = finalBody.replace(include.placeholder, include.content);
167
+ // Some converters escape underscores/backslashes; also replace escaped variants
168
+ const escaped = include.placeholder.replace(/_/g, '\\_');
169
+ finalBody = finalBody.replace(escaped, include.content);
170
+ // And double-escaped (e.g. \__INCLUDE_1__)
171
+ const doubleEscaped = escaped.replace(/\\/g, '\\\\');
172
+ finalBody = finalBody.replace(doubleEscaped, include.content);
173
+ }
174
+ logger.debug(`Include placeholders replaced`);
175
+ // Combine front matter and content
176
+ const markdownContent = `${frontMatter}\n\n${finalBody}`;
177
+ logger.debug(`Combined content length: ${markdownContent.length} characters`);
178
+ // Save images if any (in the same directory as the page)
179
+ if (images.length > 0) {
180
+ logger.debug(`Saving ${images.length} images`);
181
+ const imagesDir = path.join(dirPath, 'images');
182
+ await fs.mkdir(imagesDir, { recursive: true });
183
+ for (const image of images) {
184
+ const imagePath = path.join(imagesDir, image.filename);
185
+ await fs.writeFile(imagePath, image.data);
186
+ }
187
+ logger.info(` ✓ Saved ${images.length} image(s) for ${baseFilename}`);
188
+ }
189
+ else {
190
+ logger.debug(`No images to save`);
191
+ }
192
+ logger.debug(`Performing final cleanup`);
193
+ // Final cleanup: unescape any remaining backslashes before [],() produced by converters
194
+ let finalMarkdownToWrite = markdownContent
195
+ // Remove escaped bracket/paren characters produced by converters (e.g. \[ \] \( \) )
196
+ .replace(/\\([\[\]\(\)])/g, '$1');
197
+ logger.debug(`Final markdown length: ${finalMarkdownToWrite.length} characters`);
198
+ logger.debug(`Formatting with Prettier`);
199
+ // Format and write markdown file
200
+ try {
201
+ const formatted = await prettier.format(finalMarkdownToWrite, {
202
+ parser: 'markdown',
203
+ printWidth: 120,
204
+ proseWrap: 'preserve',
205
+ tabWidth: 2,
206
+ useTabs: false
207
+ });
208
+ logger.debug(`Writing formatted markdown to ${mdFilepath}`);
209
+ await fs.writeFile(mdFilepath, formatted, 'utf-8');
210
+ logger.info(` ✓ Transformed: ${mdFilename} (formatted)`);
211
+ }
212
+ catch {
213
+ // If formatting fails, save unformatted markdown
214
+ logger.warn(` ⚠ Could not format Markdown, saving unformatted`);
215
+ logger.debug(`Writing unformatted markdown to ${mdFilepath}`);
216
+ await fs.writeFile(mdFilepath, finalMarkdownToWrite, 'utf-8');
217
+ logger.info(` ✓ Transformed: ${mdFilename}`);
218
+ }
219
+ }
220
+ catch (error) {
221
+ logger.error(` ✗ Failed to transform ${htmlFile}:`, error instanceof Error ? error.message : error);
222
+ }
223
+ }
224
+ /**
225
+ * Basic HTML to Markdown conversion
226
+ */
227
+ async htmlToMarkdown(html, pageId, images) {
228
+ let markdown = html;
229
+ // Preprocess: convert lists inside table cells to inline text to avoid breaking Markdown tables
230
+ // Convert <td>...<ul><li>Item</li>...</ul>...</td> -> <td>...• Item; Item; ...</td>
231
+ try {
232
+ markdown = markdown.replace(/<td([^>]*)>([\s\S]*?)<\/td>/gi, (full, attrs, inner) => {
233
+ // If there are list tags inside, replace them with inline bullets separated by semicolons
234
+ if (/<ul[^>]*>|<ol[^>]*>/i.test(inner)) {
235
+ // Extract list items
236
+ const items = [];
237
+ const liRegex = /<li[^>]*>([\s\S]*?)<\/li>/gi;
238
+ for (const m of Array.from(inner.matchAll(liRegex))) {
239
+ let item = m[1] || '';
240
+ // Strip tags inside li
241
+ item = item.replace(/<[^>]+>/g, '').trim();
242
+ if (item)
243
+ items.push(item);
244
+ }
245
+ if (items.length > 0) {
246
+ const replacement = items.map(i => `• ${i}`).join('; ');
247
+ // Remove the original lists from inner and append the inline replacement
248
+ const cleanedInner = inner.replace(/<ul[^>]*>[\s\S]*?<\/ul>/gi, '').replace(/<ol[^>]*>[\s\S]*?<\/ol>/gi, '').trim();
249
+ const spacer = cleanedInner && !cleanedInner.endsWith(' ') ? ' ' : '';
250
+ return `<td${attrs}>${cleanedInner}${spacer}${replacement}</td>`;
251
+ }
252
+ }
253
+ return full;
254
+ });
255
+ }
256
+ catch (e) {
257
+ // Non-fatal: if preprocessing fails, continue without it
258
+ logger.warn('List-in-table preprocessing failed:', e instanceof Error ? e.message : e);
259
+ }
260
+ // Transform macros to markdown equivalents (with data fetching)
261
+ markdown = await this.transformMacros(markdown, pageId);
262
+ // Transform user links first (before removing ac:link)
263
+ markdown = await this.transformUserLinks(markdown);
264
+ // Transform page links to HTML anchor tags (will be converted to MD links later)
265
+ markdown = await this.transformPageLinks(markdown);
266
+ // Transform images and download attachments
267
+ markdown = await this.transformImages(markdown, pageId, images);
268
+ logger.debug(`Reving layout, time, and other elements`);
269
+ // Remove layout structure tags (they don't add value in markdown)
270
+ markdown = markdown.replace(/<\/?ac:layout[^>]*>/gi, '');
271
+ markdown = markdown.replace(/<\/?ac:layout-section[^>]*>/gi, '\n\n');
272
+ markdown = markdown.replace(/<\/?ac:layout-cell[^>]*>/gi, '\n\n');
273
+ // Time elements
274
+ markdown = markdown.replace(/<time[^>]*datetime="([^"]+)"[^>]*\/?>.*?/gi, '$1');
275
+ logger.debug(`Converting HTML to Markdown using webforai`);
276
+ markdown = htmlToMarkdown(markdown);
277
+ // Trim whitespace in Markdown table cells
278
+ logger.debug(`Trimming whitespace in Markdown table cells`);
279
+ markdown = markdown.replace(/^\|(.+)\|$/gm, (line) => {
280
+ const parts = line.split('|');
281
+ const trimmedParts = parts.map(part => part.trim());
282
+ return trimmedParts.join('|');
283
+ });
284
+ logger.debug(`Post-processing Markdown content (Pending includes, links, cleanup)`);
285
+ // Replace include placeholders with actual content (handle escaped variants)
286
+ for (const include of this.pendingIncludes) {
287
+ // raw
288
+ markdown = markdown.replace(include.placeholder, include.content);
289
+ // escaped underscores (e.g. \_\_INCLUDE_1\_\_)
290
+ const escaped = include.placeholder.replace(/_/g, '\\_');
291
+ markdown = markdown.replace(escaped, include.content);
292
+ // double-escaped (e.g. \\\_\\\_INCLUDE_1\\\_\\\_)
293
+ const doubleEscaped = escaped.replace(/\\/g, '\\\\');
294
+ markdown = markdown.replace(doubleEscaped, include.content);
295
+ }
296
+ this.pendingIncludes = [];
297
+ logger.debug(`Pending includes processed`);
298
+ // Restore page links that were escaped by htmlToMarkdown
299
+ // Pattern: \[Title\](url.md) -> [Title](url.md)
300
+ markdown = markdown.replace(/\\?\[([^\]]+)\\?\]\\?\(([^)]+\.md)\\?\)/g, '[$1]($2)');
301
+ // Unescape image and link bracket escaping produced by converters
302
+ // Example: !\[image.png\]\(images/image.png\) -> ![image.png](images/image.png)
303
+ markdown = markdown.replace(/!\\\[([^\]]+)\\\]\(\s*([^\)]+)\s*\)/g, '![$1]($2)');
304
+ markdown = markdown.replace(/\\\[([^\]]+)\\\]\(\s*([^\)]+)\s*\)/g, '[$1]($2)');
305
+ // Remove remaining ac:link elements
306
+ markdown = markdown.replace(/<ac:link[^>]*>[\s\S]*?<\/ac:link>/g, '');
307
+ logger.debug(`Converting headers`);
308
+ // Headers
309
+ markdown = markdown.replace(/<h1[^>]*>(.*?)<\/h1>/gi, '\n# $1\n');
310
+ markdown = markdown.replace(/<h2[^>]*>(.*?)<\/h2>/gi, '\n## $1\n');
311
+ markdown = markdown.replace(/<h3[^>]*>(.*?)<\/h3>/gi, '\n### $1\n');
312
+ markdown = markdown.replace(/<h4[^>]*>(.*?)<\/h4>/gi, '\n#### $1\n');
313
+ markdown = markdown.replace(/<h5[^>]*>(.*?)<\/h5>/gi, '\n##### $1\n');
314
+ markdown = markdown.replace(/<h6[^>]*>(.*?)<\/h6>/gi, '\n###### $1\n');
315
+ logger.debug(`Converting text formatting`);
316
+ // Bold and italic
317
+ markdown = markdown.replace(/<strong[^>]*>(.*?)<\/strong>/gi, '**$1**');
318
+ markdown = markdown.replace(/<b[^>]*>(.*?)<\/b>/gi, '**$1**');
319
+ markdown = markdown.replace(/<em[^>]*>(.*?)<\/em>/gi, '*$1*');
320
+ markdown = markdown.replace(/<i[^>]*>(.*?)<\/i>/gi, '*$1*');
321
+ // Links
322
+ logger.debug(`Converting links`);
323
+ markdown = markdown.replace(/<a[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, '[$2]($1)');
324
+ // Lists
325
+ logger.debug(`Converting lists`);
326
+ markdown = markdown.replace(/<ul[^>]*>/gi, '\n');
327
+ markdown = markdown.replace(/<\/ul>/gi, '\n');
328
+ markdown = markdown.replace(/<ol[^>]*>/gi, '\n');
329
+ markdown = markdown.replace(/<\/ol>/gi, '\n');
330
+ markdown = markdown.replace(/<li[^>]*>(.*?)<\/li>/gi, '- $1\n');
331
+ // Paragraphs
332
+ markdown = markdown.replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n');
333
+ // Code blocks
334
+ markdown = markdown.replace(/<pre[^>]*><code[^>]*>([\s\S]*?)<\/code><\/pre>/gi, '```\n$1\n```\n');
335
+ markdown = markdown.replace(/<code[^>]*>(.*?)<\/code>/gi, '`$1`');
336
+ // Line breaks
337
+ markdown = markdown.replace(/<br\s*\/?>/gi, '\n');
338
+ // Remove remaining HTML tags
339
+ logger.debug(`Removing remaining HTML tags`);
340
+ markdown = markdown.replace(/<[^>]+>/g, '');
341
+ // Clean up HTML entities
342
+ markdown = markdown.replace(/&nbsp;/g, ' ');
343
+ markdown = markdown.replace(/&amp;/g, '&');
344
+ markdown = markdown.replace(/&lt;/g, '<');
345
+ markdown = markdown.replace(/&gt;/g, '>');
346
+ markdown = markdown.replace(/&quot;/g, '"');
347
+ // Clean up extra whitespace
348
+ markdown = markdown.replace(/\n{3,}/g, '\n\n');
349
+ markdown = markdown.trim();
350
+ // Apply markdown cleanup to remove malformed patterns
351
+ logger.debug(`Cleaning up markdown`);
352
+ markdown = this.cleanMarkdown(markdown);
353
+ return markdown;
354
+ }
355
+ /**
356
+ * Transform images and download attachments
357
+ */
358
+ async transformImages(content, pageId, images) {
359
+ let result = content;
360
+ const downloadPromises = [];
361
+ // Match image attachments: <ac:image><ri:attachment ri:filename="..." /></ac:image>
362
+ const imageRegex = /<ac:image[^>]*><ri:attachment[^>]*ri:filename="([^"]+)"[^>]*\/><\/ac:image>/gi;
363
+ const imageMatches = Array.from(content.matchAll(imageRegex));
364
+ for (const match of imageMatches) {
365
+ const originalFilename = match[1];
366
+ logger.debug(`Processing image attachment: ${originalFilename}`);
367
+ // Extract extension and slugify the base name
368
+ const lastDotIndex = originalFilename.lastIndexOf('.');
369
+ const extension = lastDotIndex > 0 ? originalFilename.slice(lastDotIndex) : '';
370
+ const baseName = lastDotIndex > 0 ? originalFilename.slice(0, lastDotIndex) : originalFilename;
371
+ const slugifiedFilename = slugify(baseName) + extension;
372
+ let replacement = `![${originalFilename}](images/${slugifiedFilename})`;
373
+ // Download the image if API is available
374
+ if (this.api) {
375
+ downloadPromises.push((async () => {
376
+ try {
377
+ // Try downloading with original filename first (Confluence API may handle encoding internally)
378
+ let imageData = await this.api.downloadAttachment(pageId, originalFilename);
379
+ // If that fails, try with URL-encoded filename
380
+ if (!imageData) {
381
+ const encodedImageName = encodeURIComponent(originalFilename);
382
+ imageData = await this.api.downloadAttachment(pageId, encodedImageName);
383
+ }
384
+ if (imageData) {
385
+ images.push({ filename: slugifiedFilename, data: imageData });
386
+ logger.info(` ✓ Downloaded image: ${originalFilename} -> ${slugifiedFilename}`);
387
+ }
388
+ else {
389
+ // Image might be on a different page or not exist
390
+ logger.warn(` ⚠ Image not found on this page: ${originalFilename} (may be on parent/child page)`);
391
+ }
392
+ }
393
+ catch (error) {
394
+ const errorMessage = error instanceof Error ? error.message : String(error);
395
+ if (errorMessage.includes('404')) {
396
+ logger.warn(` ⚠ Image not attached to this page: ${originalFilename}`);
397
+ }
398
+ else {
399
+ logger.warn(` ⚠ Error downloading image ${originalFilename}:`, errorMessage);
400
+ }
401
+ }
402
+ })());
403
+ }
404
+ logger.debug(`Replacing image tag with markdown: ${replacement}`);
405
+ result = result.replace(match[0], replacement);
406
+ }
407
+ logger.debug(`Processed inline <img> tags that reference /download/attachments/...`);
408
+ // Also handle inline <img> tags that reference /download/attachments/... with optional data-linked-resource-container-id
409
+ // Example: <img class="confluence-embedded-image" src="/download/attachments/715168874/image.png?version=1&api=v2" data-linked-resource-container-id="715168874" />
410
+ const inlineImgRegex = /<img[^>]*src="([^"]*\/download\/attachments\/[^"\s]+)"[^>]*>/gi;
411
+ const inlineImgMatches = Array.from(content.matchAll(inlineImgRegex));
412
+ logger.debug(`Found ${inlineImgMatches.length} inline <img> tags with /download/attachments/ URLs`);
413
+ for (const match of inlineImgMatches) {
414
+ const src = match[1];
415
+ logger.debug(`Processing inline image src: ${src}`);
416
+ // Try to extract filename from URL path
417
+ let filename = src.split('/').pop() || 'image';
418
+ // Strip query params if present
419
+ filename = filename.split('?')[0];
420
+ // Try to extract container id from the tag using a secondary regex on the original match
421
+ const fullTag = match[0];
422
+ const containerIdMatch = fullTag.match(/data-linked-resource-container-id="([^"<>]+)"/i);
423
+ const containerId = containerIdMatch ? containerIdMatch[1] : pageId;
424
+ const lastDotIndex = filename.lastIndexOf('.');
425
+ const extension = lastDotIndex > 0 ? filename.slice(lastDotIndex) : '';
426
+ const baseName = lastDotIndex > 0 ? filename.slice(0, lastDotIndex) : filename;
427
+ const slugifiedFilename = slugify(baseName) + extension;
428
+ let replacement = `![${filename}](images/${slugifiedFilename})`;
429
+ if (this.api) {
430
+ downloadPromises.push((async () => {
431
+ try {
432
+ logger.debug(`Downloading inline image from container ${containerId} with filename ${filename}`);
433
+ // The API expects the filename as-is; try original filename first
434
+ let imageData = await this.api.downloadAttachment(containerId, filename);
435
+ // Fallback: try URL-decoded filename
436
+ if (!imageData) {
437
+ const decoded = decodeURIComponent(filename);
438
+ if (decoded !== filename) {
439
+ imageData = await this.api.downloadAttachment(containerId, decoded);
440
+ }
441
+ }
442
+ // Another fallback: try removing any appended tokens (some Confluence instances append ids)
443
+ if (!imageData) {
444
+ const simpleName = filename.replace(/^[^a-z0-9]+/i, '').split(/[^a-z0-9.\-_]/i)[0];
445
+ if (simpleName && simpleName !== filename) {
446
+ imageData = await this.api.downloadAttachment(containerId, simpleName);
447
+ }
448
+ }
449
+ if (imageData) {
450
+ images.push({ filename: slugifiedFilename, data: imageData });
451
+ logger.info(` ✓ Downloaded inline image: ${filename} -> ${slugifiedFilename}`);
452
+ }
453
+ else {
454
+ logger.warn(` ⚠ Inline image not downloaded: ${filename} (container ${containerId})`);
455
+ }
456
+ }
457
+ catch (error) {
458
+ const errorMessage = error instanceof Error ? error.message : String(error);
459
+ logger.warn(` ⚠ Error downloading inline image ${filename}:`, errorMessage);
460
+ }
461
+ })());
462
+ }
463
+ result = result.replace(match[0], replacement);
464
+ }
465
+ // Wait for all downloads to complete
466
+ await Promise.all(downloadPromises);
467
+ logger.debug(`Completed processing inline <img> tags`);
468
+ return result;
469
+ }
470
+ /**
471
+ * Build a Markdown list from an included page's HTML content.
472
+ * Prefer extracting <ul>/<ol> list items and anchor links; fall back to full page transform.
473
+ */
474
+ async buildIncludeList(page, title) {
475
+ try {
476
+ const html = page.body || '';
477
+ // Extract list items inside <ul> or <ol>
478
+ const listRegex = /<ul[^>]*>([\s\S]*?)<\/ul>/i;
479
+ const listMatch = html.match(listRegex);
480
+ if (listMatch) {
481
+ const itemsHtml = listMatch[1];
482
+ const itemRegex = /<li[^>]*>([\s\S]*?)<\/li>/gi;
483
+ const items = [];
484
+ for (const m of Array.from(itemsHtml.matchAll(itemRegex))) {
485
+ let item = m[1].trim();
486
+ // Convert <a href> to markdown
487
+ item = item.replace(/<a[^>]*href="([^"]+)"[^>]*>(.*?)<\/a>/gi, '[$2]($1)');
488
+ // Strip remaining tags
489
+ item = item.replace(/<[^>]+>/g, '').trim();
490
+ items.push(`- ${item}`);
491
+ }
492
+ if (items.length > 0) {
493
+ return `\n\n## ${title}\n\n${items.join('\n')}\n\n`;
494
+ }
495
+ }
496
+ // If no lists found, look for anchor links
497
+ const anchorRegex = /<a[^>]*href="([^"]+)"[^>]*>(.*?)<\/a>/gi;
498
+ const anchors = Array.from(html.matchAll(anchorRegex));
499
+ if (anchors.length > 0) {
500
+ const items = anchors.map(a => `- [${a[2].replace(/<[^>]+>/g, '').trim()}](${a[1]})`);
501
+ return `\n\n## ${title}\n\n${items.join('\n')}\n\n`;
502
+ }
503
+ // Fall back to full-page transform
504
+ const full = await this.htmlToMarkdown(html, page.id || title, []);
505
+ return `\n\n## ${title}\n\n${full}\n\n`;
506
+ }
507
+ catch (error) {
508
+ logger.warn(`Failed to build include list for ${title}:`, error);
509
+ return `\n\n## ${title}\n\n<!-- failed to include content -->\n\n`;
510
+ }
511
+ }
512
+ /**
513
+ * Transform Confluence macros to Markdown
514
+ */
515
+ async transformMacros(content, pageId) {
516
+ let result = content;
517
+ // Handle children macro - fetch child pages of specified page or current page
518
+ const childrenRegex = /<ac:structured-macro[^>]*ac:name="children"[^>]*>([\s\S]*?)<\/ac:structured-macro>/gis;
519
+ const childrenMatches = Array.from(content.matchAll(childrenRegex));
520
+ for (const match of childrenMatches) {
521
+ let replacement = '<!-- Child Pages -->\n\n';
522
+ const macroContent = match[1];
523
+ if (this.api) {
524
+ try {
525
+ // Check if there's a page parameter
526
+ const pageParamMatch = macroContent.match(/ri:content-title="([^"]+)"/i);
527
+ let targetPageId = pageId;
528
+ let targetTitle = '';
529
+ if (pageParamMatch) {
530
+ targetTitle = pageParamMatch[1];
531
+ // Try to find the page by title
532
+ const targetPage = await this.api.getPageByTitle(this.config.spaceKey, targetTitle);
533
+ if (targetPage) {
534
+ targetPageId = targetPage.id;
535
+ }
536
+ }
537
+ const childPages = await this.api.getChildPages(targetPageId);
538
+ if (childPages.length > 0) {
539
+ replacement = childPages.map(child => `- [${child.title}](${slugify(child.title)}.md)`).join('\n') + '\n\n';
540
+ }
541
+ }
542
+ catch (error) {
543
+ logger.warn(`Failed to fetch child pages:`, error);
544
+ }
545
+ }
546
+ result = result.replace(match[0], replacement);
547
+ }
548
+ // Handle list-children macro - fetch actual child pages
549
+ const listChildrenRegex = /<ac:structured-macro[^>]*ac:name="list-children"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis;
550
+ const listChildrenMatches = Array.from(result.matchAll(listChildrenRegex));
551
+ for (const match of listChildrenMatches) {
552
+ let replacement = '<!-- Child Pages List -->\n\n';
553
+ if (this.api) {
554
+ try {
555
+ const childPages = await this.api.getChildPages(pageId);
556
+ if (childPages.length > 0) {
557
+ replacement = childPages.map(child => `- [${child.title}](${slugify(child.title)}.md)`).join('\n') + '\n\n';
558
+ }
559
+ }
560
+ catch (error) {
561
+ logger.warn(`Failed to fetch child pages for ${pageId}:`, error);
562
+ }
563
+ }
564
+ result = result.replace(match[0], replacement);
565
+ }
566
+ // Handle include macro - fetch content from included page
567
+ const includeRegex = /<ac:structured-macro[^>]*ac:name="include"[^>]*>([\s\S]*?)<\/ac:structured-macro>/gis;
568
+ const includeMatches = Array.from(result.matchAll(includeRegex));
569
+ for (const match of includeMatches) {
570
+ const macroContent = match[1];
571
+ const titleMatch = macroContent.match(/ri:content-title="([^"]+)"/i);
572
+ if (titleMatch && this.api) {
573
+ const includeTitle = titleMatch[1];
574
+ try {
575
+ let includedPage;
576
+ if (includeTitle === "FCS Useful Links") {
577
+ // Hardcode the pageId for FCS Useful Links
578
+ includedPage = await this.api.getPage("167810724");
579
+ }
580
+ else {
581
+ includedPage = await this.api.getPageByTitle(this.config.spaceKey, includeTitle);
582
+ }
583
+ if (includedPage && includedPage.body) {
584
+ // Build a concise Markdown list from the included page using the API
585
+ const listMd = await this.buildIncludeList(includedPage, includeTitle);
586
+ // Generate a unique placeholder per include to avoid collisions
587
+ const placeholder = `__INCLUDE_${this.pendingIncludes.length + 1}__`;
588
+ // Replace macro with placeholder and remember the content for later
589
+ result = result.replace(match[0], placeholder);
590
+ this.pendingIncludes.push({ placeholder, content: listMd });
591
+ }
592
+ else {
593
+ result = result.replace(match[0], `<!-- Include: ${includeTitle} (page not found) -->\n\n`);
594
+ }
595
+ }
596
+ catch (error) {
597
+ logger.warn(`Failed to fetch included page "${includeTitle}":`, error);
598
+ result = result.replace(match[0], `<!-- Include: ${includeTitle} (error) -->\n\n`);
599
+ }
600
+ }
601
+ else {
602
+ result = result.replace(match[0], '<!-- Include macro -->\n\n');
603
+ }
604
+ }
605
+ // Preserve table-like macros: extract the inner rich-text-body so HTML tables
606
+ // inside macros (e.g. table-filter) are retained and later converted to Markdown.
607
+ result = result.replace(/<ac:structured-macro[^>]*ac:name="(?:table|table-filter)"[^>]*>[\s\S]*?<ac:rich-text-body>([\s\S]*?)<\/ac:rich-text-body>[\s\S]*?<\/ac:structured-macro>/gis, '$1\n\n');
608
+ // Apply other macro transformations
609
+ result = result
610
+ // Code blocks with language
611
+ .replace(/<ac:structured-macro[^>]*ac:name="code"[^>]*>.*?<ac:parameter[^>]*ac:name="language"[^>]*>(.*?)<\/ac:parameter>.*?<ac:plain-text-body><!\[CDATA\[(.*?)\]\]><\/ac:plain-text-body>.*?<\/ac:structured-macro>/gis, '```$1\n$2\n```\n\n')
612
+ // Code blocks without language
613
+ .replace(/<ac:structured-macro[^>]*ac:name="code"[^>]*>.*?<ac:plain-text-body><!\[CDATA\[(.*?)\]\]><\/ac:plain-text-body>.*?<\/ac:structured-macro>/gis, '```\n$1\n```\n\n')
614
+ // Info panels
615
+ /* Replace info macro with a concise inline marker using the macro title and body.
616
+ Desired output example:
617
+ [i] Here you will find
618
+ <body content...>
619
+ */
620
+ .replace(/<ac:structured-macro[^>]*ac:name="info"[^>]*>([\s\S]*?)<\/ac:structured-macro>/gis, (_match, inner) => {
621
+ try {
622
+ // Extract title parameter if present
623
+ const titleMatch = inner.match(/<ac:parameter[^>]*ac:name="title"[^>]*>([\s\S]*?)<\/ac:parameter>/i);
624
+ const title = titleMatch ? titleMatch[1].trim() : '';
625
+ // Extract rich-text-body content
626
+ const bodyMatch = inner.match(/<ac:rich-text-body>([\s\S]*?)<\/ac:rich-text-body>/i);
627
+ const body = bodyMatch ? bodyMatch[1].trim() : '';
628
+ const titleLine = title ? `[i] ${title}\n\n` : '';
629
+ // Return title marker plus body (body will be further transformed later)
630
+ return `${titleLine}${body}\n\n`;
631
+ }
632
+ catch (e) {
633
+ return '<!-- Info macro -->\n\n';
634
+ }
635
+ })
636
+ // Warning panels
637
+ .replace(/<ac:structured-macro[^>]*ac:name="warning"[^>]*>.*?<ac:rich-text-body>(.*?)<\/ac:rich-text-body>.*?<\/ac:rich-text-body>.*?<\/ac:structured-macro>/gis, '> **Warning:** $1\n\n')
638
+ // Note panels
639
+ .replace(/<ac:structured-macro[^>]*ac:name="note"[^>]*>.*?<ac:rich-text-body>(.*?)<\/ac:rich-text-body>.*?<\/ac:structured-macro>/gis, '> **Note:** $1\n\n')
640
+ // Panel macro - extract content
641
+ .replace(/<ac:structured-macro[^>]*ac:name="panel"[^>]*>.*?<ac:rich-text-body>(.*?)<\/ac:rich-text-body>.*?<\/ac:structured-macro>/gis, '$1\n\n')
642
+ // Excerpt macro - extract content
643
+ .replace(/<ac:structured-macro[^>]*ac:name="excerpt"[^>]*>.*?<ac:rich-text-body>(.*?)<\/ac:rich-text-body>.*?<\/ac:structured-macro>/gis, '$1\n\n')
644
+ // Table of contents
645
+ .replace(/<ac:structured-macro[^>]*ac:name="toc"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Table of Contents -->\n\n')
646
+ // Content by label
647
+ .replace(/<ac:structured-macro[^>]*ac:name="contentbylabel"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Content by Label -->\n\n')
648
+ // Livesearch macro
649
+ .replace(/<ac:structured-macro[^>]*ac:name="livesearch"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Live Search -->\n\n')
650
+ // Jira macro
651
+ .replace(/<ac:structured-macro[^>]*ac:name="jira"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Jira Issues -->\n\n')
652
+ // Recently updated macro
653
+ .replace(/<ac:structured-macro[^>]*ac:name="recently-updated"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Recently Updated Pages -->\n\n')
654
+ // Popular labels macro
655
+ .replace(/<ac:structured-macro[^>]*ac:name="popular-labels"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Popular Labels -->\n\n')
656
+ // Other macros - convert to comments
657
+ .replace(/<ac:structured-macro[^>]*ac:name="([^"]*)"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Confluence Macro: $1 -->\n\n');
658
+ return result;
659
+ }
660
+ /**
661
+ * Transform user links to display names
662
+ */
663
+ async transformUserLinks(html) {
664
+ if (!this.api) {
665
+ // If no API provided, just remove user links
666
+ return html.replace(/<ac:link[^>]*><ri:user[^>]*\/><\/ac:link>/g, '@unknown-user');
667
+ }
668
+ let result = html;
669
+ // Match user links by username
670
+ const usernameRegex = /<ac:link[^>]*><ri:user[^>]*ri:username="([^"]+)"[^>]*\/><\/ac:link>/gi;
671
+ const usernameMatches = Array.from(html.matchAll(usernameRegex));
672
+ for (const match of usernameMatches) {
673
+ const username = match[1];
674
+ const user = await this.api.getUserByUsername(username);
675
+ if (user) {
676
+ result = result.replace(match[0], `@${user.displayName}`);
677
+ }
678
+ else {
679
+ result = result.replace(match[0], `@${username}`);
680
+ }
681
+ }
682
+ // Match user links by userkey
683
+ const userkeyRegex = /<ac:link[^>]*><ri:user[^>]*ri:userkey="([^"]+)"[^>]*\/><\/ac:link>/gi;
684
+ const userkeyMatches = Array.from(result.matchAll(userkeyRegex));
685
+ for (const match of userkeyMatches) {
686
+ const userKey = match[1];
687
+ const user = await this.api.getUserByKey(userKey);
688
+ if (user) {
689
+ result = result.replace(match[0], `@${user.displayName}`);
690
+ }
691
+ else {
692
+ result = result.replace(match[0], `@user-${userKey.slice(-8)}`);
693
+ }
694
+ }
695
+ return result;
696
+ }
697
+ /**
698
+ * Transform page links to markdown links
699
+ */
700
+ async transformPageLinks(html) {
701
+ let result = html;
702
+ // Match page links by content title - various formats
703
+ // Format 1: <ac:link><ri:page ri:content-title="Title" /></ac:link>
704
+ const pageLinkRegex1 = /<ac:link[^>]*>\s*<ri:page[^>]*ri:content-title="([^"]+)"[^>]*\/>\s*<\/ac:link>/gi;
705
+ const matches1 = Array.from(html.matchAll(pageLinkRegex1));
706
+ for (const match of matches1) {
707
+ const title = match[1];
708
+ const link = `[${title}](${slugify(title)}.md)`;
709
+ result = result.replace(match[0], link);
710
+ }
711
+ // Format 2: Just <ri:page ri:content-title="Title" /> without ac:link wrapper
712
+ const pageLinkRegex2 = /<ri:page[^>]*ri:content-title="([^"]+)"[^>]*\/>/gi;
713
+ const matches2 = Array.from(result.matchAll(pageLinkRegex2));
714
+ for (const match of matches2) {
715
+ const title = match[1];
716
+ const link = `[${title}](${slugify(title)}.md)`;
717
+ result = result.replace(match[0], link);
718
+ }
719
+ return result;
720
+ }
721
+ /**
722
+ * Clean up malformed markdown patterns
723
+ */
724
+ cleanMarkdown(markdown) {
725
+ let cleaned = markdown;
726
+ // First pass: clean confluence-specific patterns
727
+ logger.debug('Cleaning Confluence-specific markdown patterns');
728
+ cleaned = this.cleanConfluencePatterns(cleaned);
729
+ // Second pass: general cleanup
730
+ logger.debug('Cleaning general markdown patterns');
731
+ cleaned = this.cleanGeneral(cleaned);
732
+ // Third pass: another round of confluence patterns to catch any new issues
733
+ logger.debug('Cleaning Confluence-specific markdown patterns (second pass)');
734
+ cleaned = this.cleanConfluencePatterns(cleaned);
735
+ // Final cleanup of excessive whitespace
736
+ cleaned = cleaned.replace(/\n{4,}/g, '\n\n\n');
737
+ cleaned = cleaned.trim() + '\n';
738
+ logger.debug('Final cleanup of excessive whitespace');
739
+ return cleaned;
740
+ }
741
+ /**
742
+ * Clean up specific problematic patterns that appear in Confluence exports
743
+ */
744
+ cleanConfluencePatterns(markdown) {
745
+ let cleaned = markdown;
746
+ // Remove standalone bold markers that are not part of content
747
+ // This handles cases like "**\n\n**" or "** **"
748
+ cleaned = cleaned.replace(/\*\*\s*\n\s*\n\s*\*\*/g, '');
749
+ // Remove lines that only contain **
750
+ cleaned = cleaned.replace(/^\s*\*\*\s*$/gm, '');
751
+ // Remove empty headers (headers with no content)
752
+ cleaned = cleaned.replace(/^#+\s*$/gm, '');
753
+ // Remove bold markers around only whitespace
754
+ cleaned = cleaned.replace(/\*\*\s+\*\*/g, ' ');
755
+ // Remove italic markers around only whitespace
756
+ cleaned = cleaned.replace(/\*\s+\*/g, ' ');
757
+ // Clean up malformed blockquotes
758
+ cleaned = cleaned.replace(/^>\s*$/gm, '');
759
+ // Remove empty code blocks
760
+ cleaned = cleaned.replace(/```\s*\n\s*```/g, '');
761
+ // Clean up malformed horizontal rules
762
+ cleaned = cleaned.replace(/^[-*_]\s*$/gm, '');
763
+ return cleaned;
764
+ }
765
+ /**
766
+ * General markdown cleanup
767
+ */
768
+ cleanGeneral(markdown) {
769
+ let cleaned = markdown;
770
+ // Remove empty headers with just bold/italic markers (no content between them)
771
+ // Match: ## ** or ## * (at end of line)
772
+ logger.debug('Removing empty headers with only formatting markers');
773
+ cleaned = cleaned.replace(/^#+\s*\*\*\s*$/gm, '');
774
+ cleaned = cleaned.replace(/^#+\s*\*\s*$/gm, '');
775
+ cleaned = cleaned.replace(/^#+\s*__\s*$/gm, '');
776
+ cleaned = cleaned.replace(/^#+\s*_\s*$/gm, '');
777
+ // Remove headers that only contain bold/italic markers across multiple lines
778
+ // Example: ## **\n\n** (with only whitespace between)
779
+ logger.debug('Removing headers with only formatting markers across multiple lines');
780
+ cleaned = cleaned.replace(/^(#+)\s*\*\*\s*\n+\s*\*\*\s*$/gm, '');
781
+ cleaned = cleaned.replace(/^(#+)\s*\*\s*\n+\s*\*\s*$/gm, '');
782
+ // Remove empty bold markers (no content or only whitespace between)
783
+ logger.debug('Removing empty bold markers');
784
+ cleaned = cleaned.replace(/\*\*\s*\*\*/g, '');
785
+ cleaned = cleaned.replace(/__\s*__/g, '');
786
+ // Remove standalone italic markers on their own line
787
+ logger.debug('Removing standalone italic markers on their own line');
788
+ cleaned = cleaned.replace(/^\s*\*\s*$/gm, '');
789
+ cleaned = cleaned.replace(/^\s*_\s*$/gm, '');
790
+ // Remove empty italic markers that span multiple lines (only if truly empty)
791
+ logger.debug('Removing empty italic markers that span multiple lines');
792
+ cleaned = cleaned.replace(/\*\s*\n+\s*\*/g, '\n\n');
793
+ // Remove empty links
794
+ logger.debug('Removing empty links');
795
+ cleaned = cleaned.replace(/\[\s*\]\(\s*\)/g, '');
796
+ // Remove empty list items
797
+ logger.debug('Removing empty list items');
798
+ cleaned = cleaned.replace(/^[-*+]\s*$/gm, '');
799
+ // Clean up excessive blank lines (more than 3 consecutive)
800
+ logger.debug('Cleaning up excessive blank lines');
801
+ cleaned = cleaned.replace(/\n{4,}/g, '\n\n\n');
802
+ // Remove trailing whitespace from each line
803
+ logger.debug('Removing trailing whitespace from each line');
804
+ cleaned = cleaned.split('\n').map(line => line.trimEnd()).join('\n');
805
+ // Ensure single trailing newline at end of file
806
+ logger.debug('Ensuring single trailing newline at end of file');
807
+ cleaned = cleaned.trim() + '\n';
808
+ return cleaned;
809
+ }
810
+ /**
811
+ * Create links folder with symlinks and _links.md with tree structure
812
+ */
813
+ async createLinksStructure(outputDir) {
814
+ const linksDir = path.join(outputDir, 'links');
815
+ // Remove existing links folder if it exists
816
+ try {
817
+ await fs.rm(linksDir, { recursive: true, force: true });
818
+ }
819
+ catch {
820
+ // Ignore if doesn't exist
821
+ }
822
+ // Create fresh links folder
823
+ await fs.mkdir(linksDir, { recursive: true });
824
+ // Find all MD files recursively
825
+ const findMdFiles = async (dir, fileList = []) => {
826
+ const entries = await fs.readdir(dir, { withFileTypes: true });
827
+ for (const entry of entries) {
828
+ const fullPath = path.join(dir, entry.name);
829
+ if (entry.isDirectory() && !entry.name.startsWith('_') && entry.name !== 'images' && entry.name !== 'links') {
830
+ await findMdFiles(fullPath, fileList);
831
+ }
832
+ else if (entry.isFile() && entry.name.endsWith('.md') && !entry.name.startsWith('_')) {
833
+ const relativePath = path.relative(outputDir, fullPath);
834
+ fileList.push({ path: fullPath, relativePath });
835
+ }
836
+ }
837
+ return fileList;
838
+ };
839
+ const mdFiles = await findMdFiles(outputDir);
840
+ // Create symlinks in links folder
841
+ for (const file of mdFiles) {
842
+ const linkName = path.basename(file.path);
843
+ const linkPath = path.join(linksDir, linkName);
844
+ const targetPath = path.relative(linksDir, file.path);
845
+ try {
846
+ await fs.symlink(targetPath, linkPath);
847
+ }
848
+ catch (error) {
849
+ logger.warn(` ⚠ Failed to create symlink for ${linkName}:`, error instanceof Error ? error.message : error);
850
+ }
851
+ }
852
+ logger.info(` ✓ Created ${mdFiles.length} symlinks in links/`);
853
+ // Build tree structure for _links.md
854
+ const tree = this.buildFileTree(mdFiles);
855
+ const treeMarkdown = this.generateTreeMarkdown(tree, outputDir);
856
+ // Write _links.md
857
+ const linksFilePath = path.join(outputDir, '_links.md');
858
+ const linksContent = `# Documentation Links\n\n${treeMarkdown}`;
859
+ try {
860
+ const formattedContent = await prettier.format(linksContent, {
861
+ parser: 'markdown',
862
+ printWidth: 120,
863
+ proseWrap: 'preserve',
864
+ tabWidth: 2,
865
+ useTabs: false
866
+ });
867
+ await fs.writeFile(linksFilePath, formattedContent, 'utf-8');
868
+ }
869
+ catch {
870
+ await fs.writeFile(linksFilePath, linksContent, 'utf-8');
871
+ }
872
+ logger.info(` ✓ Created _links.md with tree structure`);
873
+ }
874
+ /**
875
+ * Build a tree structure from flat file list
876
+ */
877
+ buildFileTree(files) {
878
+ const root = { name: '', children: {}, files: [] };
879
+ for (const file of files) {
880
+ const parts = file.relativePath.split(path.sep);
881
+ let current = root;
882
+ // Navigate/create directory structure
883
+ for (let i = 0; i < parts.length - 1; i++) {
884
+ const part = parts[i];
885
+ if (!current.children[part]) {
886
+ current.children[part] = { name: part, children: {}, files: [] };
887
+ }
888
+ current = current.children[part];
889
+ }
890
+ // Add file to current directory
891
+ current.files.push({
892
+ name: parts[parts.length - 1],
893
+ relativePath: file.relativePath
894
+ });
895
+ }
896
+ return root;
897
+ }
898
+ /**
899
+ * Generate markdown tree structure
900
+ */
901
+ generateTreeMarkdown(node, outputDir, level = 0) {
902
+ let result = '';
903
+ const indent = ' '.repeat(level);
904
+ // Sort directories and files alphabetically
905
+ const sortedDirs = Object.keys(node.children).sort();
906
+ const sortedFiles = node.files.sort((a, b) => a.name.localeCompare(b.name));
907
+ // Add directories first
908
+ for (const dirName of sortedDirs) {
909
+ const child = node.children[dirName];
910
+ result += `${indent}- **${dirName}/**\n`;
911
+ result += this.generateTreeMarkdown(child, outputDir, level + 1);
912
+ }
913
+ // Add files
914
+ for (const file of sortedFiles) {
915
+ const linkPath = file.relativePath;
916
+ result += `${indent}- [${file.name}](${linkPath})\n`;
917
+ }
918
+ return result;
919
+ }
920
+ /**
921
+ * Recursively clear existing .md files and images folders
922
+ */
923
+ async clearExistingFiles(dir) {
924
+ try {
925
+ const entries = await fs.readdir(dir, { withFileTypes: true });
926
+ for (const entry of entries) {
927
+ const fullPath = path.join(dir, entry.name);
928
+ if (entry.isDirectory()) {
929
+ if (entry.name === 'images' || entry.name === 'links') {
930
+ // Remove entire images and links folders
931
+ await fs.rm(fullPath, { recursive: true, force: true });
932
+ logger.info(` Removed: ${path.relative(this.config.outputDir, fullPath)}/`);
933
+ }
934
+ else if (!entry.name.startsWith('_')) {
935
+ // Recursively clear subdirectories (skip _index, _queue, etc.)
936
+ await this.clearExistingFiles(fullPath);
937
+ }
938
+ }
939
+ else if (entry.isFile() && entry.name.endsWith('.md') && !entry.name.startsWith('_')) {
940
+ // Remove .md files
941
+ await fs.unlink(fullPath);
942
+ logger.info(` Removed: ${path.relative(this.config.outputDir, fullPath)}`);
943
+ }
944
+ }
945
+ }
946
+ catch (error) {
947
+ logger.warn(`Warning: Could not clear files in ${dir}:`, error instanceof Error ? error.message : error);
948
+ }
949
+ }
950
+ }
951
+ //# sourceMappingURL=transform.command.js.map