confluence-exporter 1.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.eslintrc.cjs +18 -0
- package/.github/copilot-instructions.md +3 -0
- package/.github/prompts/analyze.prompt.md +101 -0
- package/.github/prompts/clarify.prompt.md +158 -0
- package/.github/prompts/constitution.prompt.md +73 -0
- package/.github/prompts/implement.prompt.md +56 -0
- package/.github/prompts/plan.prompt.md +50 -0
- package/.github/prompts/specify.prompt.md +21 -0
- package/.github/prompts/tasks.prompt.md +69 -0
- package/LICENSE +21 -0
- package/README.md +332 -0
- package/agents.md +1174 -0
- package/dist/api.d.ts +73 -0
- package/dist/api.js +387 -0
- package/dist/api.js.map +1 -0
- package/dist/commands/download.command.d.ts +18 -0
- package/dist/commands/download.command.js +257 -0
- package/dist/commands/download.command.js.map +1 -0
- package/dist/commands/executor.d.ts +22 -0
- package/dist/commands/executor.js +52 -0
- package/dist/commands/executor.js.map +1 -0
- package/dist/commands/help.command.d.ts +8 -0
- package/dist/commands/help.command.js +68 -0
- package/dist/commands/help.command.js.map +1 -0
- package/dist/commands/index.command.d.ts +14 -0
- package/dist/commands/index.command.js +95 -0
- package/dist/commands/index.command.js.map +1 -0
- package/dist/commands/index.d.ts +13 -0
- package/dist/commands/index.js +13 -0
- package/dist/commands/index.js.map +1 -0
- package/dist/commands/plan.command.d.ts +54 -0
- package/dist/commands/plan.command.js +272 -0
- package/dist/commands/plan.command.js.map +1 -0
- package/dist/commands/registry.d.ts +12 -0
- package/dist/commands/registry.js +32 -0
- package/dist/commands/registry.js.map +1 -0
- package/dist/commands/transform.command.d.ts +69 -0
- package/dist/commands/transform.command.js +951 -0
- package/dist/commands/transform.command.js.map +1 -0
- package/dist/commands/types.d.ts +12 -0
- package/dist/commands/types.js +5 -0
- package/dist/commands/types.js.map +1 -0
- package/dist/commands/update.command.d.ts +10 -0
- package/dist/commands/update.command.js +201 -0
- package/dist/commands/update.command.js.map +1 -0
- package/dist/constants.d.ts +1 -0
- package/dist/constants.js +2 -0
- package/dist/constants.js.map +1 -0
- package/dist/index.d.ts +5 -0
- package/dist/index.js +110 -0
- package/dist/index.js.map +1 -0
- package/dist/logger.d.ts +15 -0
- package/dist/logger.js +52 -0
- package/dist/logger.js.map +1 -0
- package/dist/types.d.ts +167 -0
- package/dist/types.js +5 -0
- package/dist/types.js.map +1 -0
- package/dist/utils.d.ts +56 -0
- package/dist/utils.js +178 -0
- package/dist/utils.js.map +1 -0
- package/eslint.config.js +29 -0
- package/jest.config.cjs +25 -0
- package/migrate-meta.js +132 -0
- package/package.json +53 -0
- package/src/api.ts +469 -0
- package/src/commands/download.command.ts +324 -0
- package/src/commands/executor.ts +62 -0
- package/src/commands/help.command.ts +72 -0
- package/src/commands/index.command.ts +111 -0
- package/src/commands/index.ts +14 -0
- package/src/commands/plan.command.ts +318 -0
- package/src/commands/registry.ts +39 -0
- package/src/commands/transform.command.ts +1103 -0
- package/src/commands/types.ts +16 -0
- package/src/commands/update.command.ts +229 -0
- package/src/constants.ts +0 -0
- package/src/index.ts +120 -0
- package/src/logger.ts +60 -0
- package/src/test.sh +66 -0
- package/src/types.ts +176 -0
- package/src/utils.ts +204 -0
- package/tests/commands/README.md +123 -0
- package/tests/commands/download.command.test.ts +8 -0
- package/tests/commands/help.command.test.ts +8 -0
- package/tests/commands/index.command.test.ts +8 -0
- package/tests/commands/plan.command.test.ts +15 -0
- package/tests/commands/transform.command.test.ts +8 -0
- package/tests/fixtures/_index.yaml +38 -0
- package/tests/fixtures/mock-pages.ts +62 -0
- package/tsconfig.json +25 -0
- package/vite.config.ts +45 -0
|
@@ -0,0 +1,951 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Transform command handler - Transforms HTML files to Markdown
|
|
3
|
+
*/
|
|
4
|
+
import { promises as fs } from 'fs';
|
|
5
|
+
import path from 'path';
|
|
6
|
+
import prettier from 'prettier';
|
|
7
|
+
import { htmlToMarkdown } from "webforai";
|
|
8
|
+
import { ConfluenceApi } from '../api.js';
|
|
9
|
+
import { pagePath, slugify, unslugify } from '../utils.js';
|
|
10
|
+
import { logger } from '../logger.js';
|
|
11
|
+
export class TransformCommand {
|
|
12
|
+
config;
|
|
13
|
+
pendingIncludes = [];
|
|
14
|
+
api;
|
|
15
|
+
constructor(config) {
|
|
16
|
+
this.config = config;
|
|
17
|
+
}
|
|
18
|
+
async execute(_context) {
|
|
19
|
+
this.api = new ConfluenceApi(this.config);
|
|
20
|
+
logger.info(`Transforming HTML files to Markdown...`);
|
|
21
|
+
logger.info(`Output directory: ${this.config.outputDir}\n`);
|
|
22
|
+
// Clear existing MD files and images if --clear flag is set
|
|
23
|
+
if (this.config.clear) {
|
|
24
|
+
logger.info('Clearing existing .md files and images folders...');
|
|
25
|
+
await this.clearExistingFiles(this.config.outputDir);
|
|
26
|
+
logger.info('✓ Cleared existing files\n');
|
|
27
|
+
}
|
|
28
|
+
let transformedCount = 0;
|
|
29
|
+
let skippedCount = 0;
|
|
30
|
+
let errorCount = 0;
|
|
31
|
+
const htmlFiles = [];
|
|
32
|
+
if (this.config.pageId) {
|
|
33
|
+
logger.info(`Processing specific page: ${this.config.pageId}\n`);
|
|
34
|
+
const pageHtmlPath = pagePath(this.config.pageId, this.config);
|
|
35
|
+
logger.info(`HTML path: ${pageHtmlPath}\n`);
|
|
36
|
+
htmlFiles.push(pageHtmlPath);
|
|
37
|
+
}
|
|
38
|
+
else {
|
|
39
|
+
// Helper function to recursively find HTML files
|
|
40
|
+
const findHtmlFiles = async (dir, fileList = []) => {
|
|
41
|
+
const entries = await fs.readdir(dir, { withFileTypes: true });
|
|
42
|
+
for (const entry of entries) {
|
|
43
|
+
const fullPath = path.join(dir, entry.name);
|
|
44
|
+
if (entry.isDirectory() && !entry.name.startsWith('_') && entry.name !== 'images') {
|
|
45
|
+
// Recursively search subdirectories (skip _index, _queue, etc. and images folder)
|
|
46
|
+
await findHtmlFiles(fullPath, fileList);
|
|
47
|
+
}
|
|
48
|
+
else if (entry.isFile() && entry.name.endsWith('.html') && !entry.name.startsWith('_')) {
|
|
49
|
+
fileList.push(fullPath);
|
|
50
|
+
}
|
|
51
|
+
}
|
|
52
|
+
return fileList;
|
|
53
|
+
};
|
|
54
|
+
// Find all HTML files recursively
|
|
55
|
+
htmlFiles.push(...await findHtmlFiles(this.config.outputDir));
|
|
56
|
+
}
|
|
57
|
+
if (htmlFiles.length === 0) {
|
|
58
|
+
logger.info('No HTML files found to transform.');
|
|
59
|
+
logger.info('Run the "download" command first to download HTML pages.');
|
|
60
|
+
return;
|
|
61
|
+
}
|
|
62
|
+
// Apply limit if specified
|
|
63
|
+
const filesToProcess = this.config.limit ? htmlFiles.slice(0, this.config.limit) : htmlFiles;
|
|
64
|
+
logger.info(`Found ${htmlFiles.length} HTML files`);
|
|
65
|
+
if (this.config.limit && htmlFiles.length > this.config.limit) {
|
|
66
|
+
logger.info(`Limiting to first ${this.config.limit} files\n`);
|
|
67
|
+
}
|
|
68
|
+
else {
|
|
69
|
+
logger.info();
|
|
70
|
+
}
|
|
71
|
+
// Process HTML files in parallel batches
|
|
72
|
+
const batchSize = this.config.parallel || 5;
|
|
73
|
+
const batches = [];
|
|
74
|
+
for (let i = 0; i < filesToProcess.length; i += batchSize) {
|
|
75
|
+
batches.push(filesToProcess.slice(i, i + batchSize));
|
|
76
|
+
}
|
|
77
|
+
for (let batchIndex = 0; batchIndex < batches.length; batchIndex++) {
|
|
78
|
+
const batch = batches[batchIndex];
|
|
79
|
+
const batchStart = batchIndex * batchSize + 1;
|
|
80
|
+
const batchEnd = Math.min((batchIndex + 1) * batchSize, filesToProcess.length);
|
|
81
|
+
logger.info(`Processing batch ${batchIndex + 1}/${batches.length} (files ${batchStart}-${batchEnd})`);
|
|
82
|
+
await Promise.all(batch.map(async (htmlFilepath, indexInBatch) => {
|
|
83
|
+
const globalIndex = batchIndex * batchSize + indexInBatch;
|
|
84
|
+
await this.processFile(htmlFilepath, globalIndex + 1, filesToProcess.length);
|
|
85
|
+
}));
|
|
86
|
+
}
|
|
87
|
+
logger.info(`\n✓ Transformation complete!`);
|
|
88
|
+
logger.info(` Processed: ${filesToProcess.length} files in ${batches.length} batches`);
|
|
89
|
+
logger.info(` Note: Files are processed in parallel batches of up to ${batchSize} pages each`);
|
|
90
|
+
logger.info(` Check individual file logs above for skipped/transformed status`);
|
|
91
|
+
// Create links folder and _links.md file
|
|
92
|
+
logger.info('\nCreating links folder and _links.md file...');
|
|
93
|
+
await this.createLinksStructure(this.config.outputDir);
|
|
94
|
+
logger.info('✓ Links structure created');
|
|
95
|
+
}
|
|
96
|
+
/**
|
|
97
|
+
* Process a single HTML file to Markdown
|
|
98
|
+
*/
|
|
99
|
+
async processFile(htmlFilepath, index, total) {
|
|
100
|
+
const htmlFile = path.basename(htmlFilepath);
|
|
101
|
+
const dirPath = path.dirname(htmlFilepath);
|
|
102
|
+
const baseFilename = htmlFile.replace('.html', '');
|
|
103
|
+
const mdFilename = `${baseFilename}.md`;
|
|
104
|
+
const mdFilepath = path.join(dirPath, mdFilename);
|
|
105
|
+
const id = baseFilename.split('-')[0];
|
|
106
|
+
// Show relative path for better readability
|
|
107
|
+
const relativePath = path.relative(this.config.outputDir, htmlFilepath);
|
|
108
|
+
logger.info(`[${index}/${total}] Checking: ${relativePath}`);
|
|
109
|
+
logger.debug(`Processing file ${baseFilename} (ID: ${id})`);
|
|
110
|
+
// Check if MD file already exists
|
|
111
|
+
try {
|
|
112
|
+
await fs.access(mdFilepath);
|
|
113
|
+
if (this.config.force) {
|
|
114
|
+
logger.info(` ⚑ Force: Overwriting existing ${mdFilename}`);
|
|
115
|
+
// If forcing, remove existing images folder for this page to avoid stale files
|
|
116
|
+
try {
|
|
117
|
+
const imagesDir = path.join(dirPath, 'images');
|
|
118
|
+
await fs.rm(imagesDir, { recursive: true, force: true });
|
|
119
|
+
logger.info(` ✓ Removed existing images/ for ${baseFilename}`);
|
|
120
|
+
}
|
|
121
|
+
catch (err) {
|
|
122
|
+
// Non-fatal if images removal fails
|
|
123
|
+
logger.warn(` ⚠ Could not remove images for ${baseFilename}:`, err instanceof Error ? err.message : err);
|
|
124
|
+
}
|
|
125
|
+
}
|
|
126
|
+
else {
|
|
127
|
+
logger.info(` ⊘ Skipped: ${mdFilename} already exists`);
|
|
128
|
+
return;
|
|
129
|
+
}
|
|
130
|
+
}
|
|
131
|
+
catch {
|
|
132
|
+
// MD file doesn't exist, proceed with transformation
|
|
133
|
+
}
|
|
134
|
+
try {
|
|
135
|
+
logger.debug(`Reading HTML content from ${htmlFilepath}`);
|
|
136
|
+
// Read HTML content
|
|
137
|
+
const htmlContent = await fs.readFile(htmlFilepath, 'utf-8');
|
|
138
|
+
logger.debug(`HTML content length: ${htmlContent.length} characters`);
|
|
139
|
+
// Parse the title from filename (reverse slugification is lossy, but best effort)
|
|
140
|
+
const title = unslugify(baseFilename);
|
|
141
|
+
logger.debug(`Parsed title: "${title}"`);
|
|
142
|
+
logger.debug(`Starting HTML to Markdown transformation`);
|
|
143
|
+
// Transform HTML to Markdown
|
|
144
|
+
const images = [];
|
|
145
|
+
const markdownBody = await this.htmlToMarkdown(htmlContent, id, images);
|
|
146
|
+
logger.debug(`Transformation complete, markdown length: ${markdownBody.length} characters`);
|
|
147
|
+
// Build original page URL (use baseUrl if available)
|
|
148
|
+
const originalUrl = this.config.baseUrl
|
|
149
|
+
? `${this.config.baseUrl}/pages/viewpage.action?pageId=${id}`
|
|
150
|
+
: '';
|
|
151
|
+
logger.debug(`Original URL: ${originalUrl || 'none'}`);
|
|
152
|
+
// Create front matter
|
|
153
|
+
const frontMatter = [
|
|
154
|
+
'---',
|
|
155
|
+
`title: "${title.replace(/"/g, '\\"')}"`,
|
|
156
|
+
`id: "${id}"`,
|
|
157
|
+
originalUrl ? `url: "${originalUrl}"` : '',
|
|
158
|
+
'---'
|
|
159
|
+
].filter(Boolean).join('\n');
|
|
160
|
+
logger.debug(`Front matter created`);
|
|
161
|
+
// Before finalizing, replace any pending include placeholders inside markdownBody
|
|
162
|
+
let finalBody = markdownBody;
|
|
163
|
+
logger.debug(`Processing ${this.pendingIncludes.length} pending includes`);
|
|
164
|
+
for (const include of this.pendingIncludes) {
|
|
165
|
+
// Replace raw placeholder
|
|
166
|
+
finalBody = finalBody.replace(include.placeholder, include.content);
|
|
167
|
+
// Some converters escape underscores/backslashes; also replace escaped variants
|
|
168
|
+
const escaped = include.placeholder.replace(/_/g, '\\_');
|
|
169
|
+
finalBody = finalBody.replace(escaped, include.content);
|
|
170
|
+
// And double-escaped (e.g. \__INCLUDE_1__)
|
|
171
|
+
const doubleEscaped = escaped.replace(/\\/g, '\\\\');
|
|
172
|
+
finalBody = finalBody.replace(doubleEscaped, include.content);
|
|
173
|
+
}
|
|
174
|
+
logger.debug(`Include placeholders replaced`);
|
|
175
|
+
// Combine front matter and content
|
|
176
|
+
const markdownContent = `${frontMatter}\n\n${finalBody}`;
|
|
177
|
+
logger.debug(`Combined content length: ${markdownContent.length} characters`);
|
|
178
|
+
// Save images if any (in the same directory as the page)
|
|
179
|
+
if (images.length > 0) {
|
|
180
|
+
logger.debug(`Saving ${images.length} images`);
|
|
181
|
+
const imagesDir = path.join(dirPath, 'images');
|
|
182
|
+
await fs.mkdir(imagesDir, { recursive: true });
|
|
183
|
+
for (const image of images) {
|
|
184
|
+
const imagePath = path.join(imagesDir, image.filename);
|
|
185
|
+
await fs.writeFile(imagePath, image.data);
|
|
186
|
+
}
|
|
187
|
+
logger.info(` ✓ Saved ${images.length} image(s) for ${baseFilename}`);
|
|
188
|
+
}
|
|
189
|
+
else {
|
|
190
|
+
logger.debug(`No images to save`);
|
|
191
|
+
}
|
|
192
|
+
logger.debug(`Performing final cleanup`);
|
|
193
|
+
// Final cleanup: unescape any remaining backslashes before [],() produced by converters
|
|
194
|
+
let finalMarkdownToWrite = markdownContent
|
|
195
|
+
// Remove escaped bracket/paren characters produced by converters (e.g. \[ \] \( \) )
|
|
196
|
+
.replace(/\\([\[\]\(\)])/g, '$1');
|
|
197
|
+
logger.debug(`Final markdown length: ${finalMarkdownToWrite.length} characters`);
|
|
198
|
+
logger.debug(`Formatting with Prettier`);
|
|
199
|
+
// Format and write markdown file
|
|
200
|
+
try {
|
|
201
|
+
const formatted = await prettier.format(finalMarkdownToWrite, {
|
|
202
|
+
parser: 'markdown',
|
|
203
|
+
printWidth: 120,
|
|
204
|
+
proseWrap: 'preserve',
|
|
205
|
+
tabWidth: 2,
|
|
206
|
+
useTabs: false
|
|
207
|
+
});
|
|
208
|
+
logger.debug(`Writing formatted markdown to ${mdFilepath}`);
|
|
209
|
+
await fs.writeFile(mdFilepath, formatted, 'utf-8');
|
|
210
|
+
logger.info(` ✓ Transformed: ${mdFilename} (formatted)`);
|
|
211
|
+
}
|
|
212
|
+
catch {
|
|
213
|
+
// If formatting fails, save unformatted markdown
|
|
214
|
+
logger.warn(` ⚠ Could not format Markdown, saving unformatted`);
|
|
215
|
+
logger.debug(`Writing unformatted markdown to ${mdFilepath}`);
|
|
216
|
+
await fs.writeFile(mdFilepath, finalMarkdownToWrite, 'utf-8');
|
|
217
|
+
logger.info(` ✓ Transformed: ${mdFilename}`);
|
|
218
|
+
}
|
|
219
|
+
}
|
|
220
|
+
catch (error) {
|
|
221
|
+
logger.error(` ✗ Failed to transform ${htmlFile}:`, error instanceof Error ? error.message : error);
|
|
222
|
+
}
|
|
223
|
+
}
|
|
224
|
+
/**
|
|
225
|
+
* Basic HTML to Markdown conversion
|
|
226
|
+
*/
|
|
227
|
+
async htmlToMarkdown(html, pageId, images) {
|
|
228
|
+
let markdown = html;
|
|
229
|
+
// Preprocess: convert lists inside table cells to inline text to avoid breaking Markdown tables
|
|
230
|
+
// Convert <td>...<ul><li>Item</li>...</ul>...</td> -> <td>...• Item; Item; ...</td>
|
|
231
|
+
try {
|
|
232
|
+
markdown = markdown.replace(/<td([^>]*)>([\s\S]*?)<\/td>/gi, (full, attrs, inner) => {
|
|
233
|
+
// If there are list tags inside, replace them with inline bullets separated by semicolons
|
|
234
|
+
if (/<ul[^>]*>|<ol[^>]*>/i.test(inner)) {
|
|
235
|
+
// Extract list items
|
|
236
|
+
const items = [];
|
|
237
|
+
const liRegex = /<li[^>]*>([\s\S]*?)<\/li>/gi;
|
|
238
|
+
for (const m of Array.from(inner.matchAll(liRegex))) {
|
|
239
|
+
let item = m[1] || '';
|
|
240
|
+
// Strip tags inside li
|
|
241
|
+
item = item.replace(/<[^>]+>/g, '').trim();
|
|
242
|
+
if (item)
|
|
243
|
+
items.push(item);
|
|
244
|
+
}
|
|
245
|
+
if (items.length > 0) {
|
|
246
|
+
const replacement = items.map(i => `• ${i}`).join('; ');
|
|
247
|
+
// Remove the original lists from inner and append the inline replacement
|
|
248
|
+
const cleanedInner = inner.replace(/<ul[^>]*>[\s\S]*?<\/ul>/gi, '').replace(/<ol[^>]*>[\s\S]*?<\/ol>/gi, '').trim();
|
|
249
|
+
const spacer = cleanedInner && !cleanedInner.endsWith(' ') ? ' ' : '';
|
|
250
|
+
return `<td${attrs}>${cleanedInner}${spacer}${replacement}</td>`;
|
|
251
|
+
}
|
|
252
|
+
}
|
|
253
|
+
return full;
|
|
254
|
+
});
|
|
255
|
+
}
|
|
256
|
+
catch (e) {
|
|
257
|
+
// Non-fatal: if preprocessing fails, continue without it
|
|
258
|
+
logger.warn('List-in-table preprocessing failed:', e instanceof Error ? e.message : e);
|
|
259
|
+
}
|
|
260
|
+
// Transform macros to markdown equivalents (with data fetching)
|
|
261
|
+
markdown = await this.transformMacros(markdown, pageId);
|
|
262
|
+
// Transform user links first (before removing ac:link)
|
|
263
|
+
markdown = await this.transformUserLinks(markdown);
|
|
264
|
+
// Transform page links to HTML anchor tags (will be converted to MD links later)
|
|
265
|
+
markdown = await this.transformPageLinks(markdown);
|
|
266
|
+
// Transform images and download attachments
|
|
267
|
+
markdown = await this.transformImages(markdown, pageId, images);
|
|
268
|
+
logger.debug(`Reving layout, time, and other elements`);
|
|
269
|
+
// Remove layout structure tags (they don't add value in markdown)
|
|
270
|
+
markdown = markdown.replace(/<\/?ac:layout[^>]*>/gi, '');
|
|
271
|
+
markdown = markdown.replace(/<\/?ac:layout-section[^>]*>/gi, '\n\n');
|
|
272
|
+
markdown = markdown.replace(/<\/?ac:layout-cell[^>]*>/gi, '\n\n');
|
|
273
|
+
// Time elements
|
|
274
|
+
markdown = markdown.replace(/<time[^>]*datetime="([^"]+)"[^>]*\/?>.*?/gi, '$1');
|
|
275
|
+
logger.debug(`Converting HTML to Markdown using webforai`);
|
|
276
|
+
markdown = htmlToMarkdown(markdown);
|
|
277
|
+
// Trim whitespace in Markdown table cells
|
|
278
|
+
logger.debug(`Trimming whitespace in Markdown table cells`);
|
|
279
|
+
markdown = markdown.replace(/^\|(.+)\|$/gm, (line) => {
|
|
280
|
+
const parts = line.split('|');
|
|
281
|
+
const trimmedParts = parts.map(part => part.trim());
|
|
282
|
+
return trimmedParts.join('|');
|
|
283
|
+
});
|
|
284
|
+
logger.debug(`Post-processing Markdown content (Pending includes, links, cleanup)`);
|
|
285
|
+
// Replace include placeholders with actual content (handle escaped variants)
|
|
286
|
+
for (const include of this.pendingIncludes) {
|
|
287
|
+
// raw
|
|
288
|
+
markdown = markdown.replace(include.placeholder, include.content);
|
|
289
|
+
// escaped underscores (e.g. \_\_INCLUDE_1\_\_)
|
|
290
|
+
const escaped = include.placeholder.replace(/_/g, '\\_');
|
|
291
|
+
markdown = markdown.replace(escaped, include.content);
|
|
292
|
+
// double-escaped (e.g. \\\_\\\_INCLUDE_1\\\_\\\_)
|
|
293
|
+
const doubleEscaped = escaped.replace(/\\/g, '\\\\');
|
|
294
|
+
markdown = markdown.replace(doubleEscaped, include.content);
|
|
295
|
+
}
|
|
296
|
+
this.pendingIncludes = [];
|
|
297
|
+
logger.debug(`Pending includes processed`);
|
|
298
|
+
// Restore page links that were escaped by htmlToMarkdown
|
|
299
|
+
// Pattern: \[Title\](url.md) -> [Title](url.md)
|
|
300
|
+
markdown = markdown.replace(/\\?\[([^\]]+)\\?\]\\?\(([^)]+\.md)\\?\)/g, '[$1]($2)');
|
|
301
|
+
// Unescape image and link bracket escaping produced by converters
|
|
302
|
+
// Example: !\[image.png\]\(images/image.png\) -> 
|
|
303
|
+
markdown = markdown.replace(/!\\\[([^\]]+)\\\]\(\s*([^\)]+)\s*\)/g, '');
|
|
304
|
+
markdown = markdown.replace(/\\\[([^\]]+)\\\]\(\s*([^\)]+)\s*\)/g, '[$1]($2)');
|
|
305
|
+
// Remove remaining ac:link elements
|
|
306
|
+
markdown = markdown.replace(/<ac:link[^>]*>[\s\S]*?<\/ac:link>/g, '');
|
|
307
|
+
logger.debug(`Converting headers`);
|
|
308
|
+
// Headers
|
|
309
|
+
markdown = markdown.replace(/<h1[^>]*>(.*?)<\/h1>/gi, '\n# $1\n');
|
|
310
|
+
markdown = markdown.replace(/<h2[^>]*>(.*?)<\/h2>/gi, '\n## $1\n');
|
|
311
|
+
markdown = markdown.replace(/<h3[^>]*>(.*?)<\/h3>/gi, '\n### $1\n');
|
|
312
|
+
markdown = markdown.replace(/<h4[^>]*>(.*?)<\/h4>/gi, '\n#### $1\n');
|
|
313
|
+
markdown = markdown.replace(/<h5[^>]*>(.*?)<\/h5>/gi, '\n##### $1\n');
|
|
314
|
+
markdown = markdown.replace(/<h6[^>]*>(.*?)<\/h6>/gi, '\n###### $1\n');
|
|
315
|
+
logger.debug(`Converting text formatting`);
|
|
316
|
+
// Bold and italic
|
|
317
|
+
markdown = markdown.replace(/<strong[^>]*>(.*?)<\/strong>/gi, '**$1**');
|
|
318
|
+
markdown = markdown.replace(/<b[^>]*>(.*?)<\/b>/gi, '**$1**');
|
|
319
|
+
markdown = markdown.replace(/<em[^>]*>(.*?)<\/em>/gi, '*$1*');
|
|
320
|
+
markdown = markdown.replace(/<i[^>]*>(.*?)<\/i>/gi, '*$1*');
|
|
321
|
+
// Links
|
|
322
|
+
logger.debug(`Converting links`);
|
|
323
|
+
markdown = markdown.replace(/<a[^>]*href="([^"]*)"[^>]*>(.*?)<\/a>/gi, '[$2]($1)');
|
|
324
|
+
// Lists
|
|
325
|
+
logger.debug(`Converting lists`);
|
|
326
|
+
markdown = markdown.replace(/<ul[^>]*>/gi, '\n');
|
|
327
|
+
markdown = markdown.replace(/<\/ul>/gi, '\n');
|
|
328
|
+
markdown = markdown.replace(/<ol[^>]*>/gi, '\n');
|
|
329
|
+
markdown = markdown.replace(/<\/ol>/gi, '\n');
|
|
330
|
+
markdown = markdown.replace(/<li[^>]*>(.*?)<\/li>/gi, '- $1\n');
|
|
331
|
+
// Paragraphs
|
|
332
|
+
markdown = markdown.replace(/<p[^>]*>(.*?)<\/p>/gi, '$1\n\n');
|
|
333
|
+
// Code blocks
|
|
334
|
+
markdown = markdown.replace(/<pre[^>]*><code[^>]*>([\s\S]*?)<\/code><\/pre>/gi, '```\n$1\n```\n');
|
|
335
|
+
markdown = markdown.replace(/<code[^>]*>(.*?)<\/code>/gi, '`$1`');
|
|
336
|
+
// Line breaks
|
|
337
|
+
markdown = markdown.replace(/<br\s*\/?>/gi, '\n');
|
|
338
|
+
// Remove remaining HTML tags
|
|
339
|
+
logger.debug(`Removing remaining HTML tags`);
|
|
340
|
+
markdown = markdown.replace(/<[^>]+>/g, '');
|
|
341
|
+
// Clean up HTML entities
|
|
342
|
+
markdown = markdown.replace(/ /g, ' ');
|
|
343
|
+
markdown = markdown.replace(/&/g, '&');
|
|
344
|
+
markdown = markdown.replace(/</g, '<');
|
|
345
|
+
markdown = markdown.replace(/>/g, '>');
|
|
346
|
+
markdown = markdown.replace(/"/g, '"');
|
|
347
|
+
// Clean up extra whitespace
|
|
348
|
+
markdown = markdown.replace(/\n{3,}/g, '\n\n');
|
|
349
|
+
markdown = markdown.trim();
|
|
350
|
+
// Apply markdown cleanup to remove malformed patterns
|
|
351
|
+
logger.debug(`Cleaning up markdown`);
|
|
352
|
+
markdown = this.cleanMarkdown(markdown);
|
|
353
|
+
return markdown;
|
|
354
|
+
}
|
|
355
|
+
/**
|
|
356
|
+
* Transform images and download attachments
|
|
357
|
+
*/
|
|
358
|
+
async transformImages(content, pageId, images) {
|
|
359
|
+
let result = content;
|
|
360
|
+
const downloadPromises = [];
|
|
361
|
+
// Match image attachments: <ac:image><ri:attachment ri:filename="..." /></ac:image>
|
|
362
|
+
const imageRegex = /<ac:image[^>]*><ri:attachment[^>]*ri:filename="([^"]+)"[^>]*\/><\/ac:image>/gi;
|
|
363
|
+
const imageMatches = Array.from(content.matchAll(imageRegex));
|
|
364
|
+
for (const match of imageMatches) {
|
|
365
|
+
const originalFilename = match[1];
|
|
366
|
+
logger.debug(`Processing image attachment: ${originalFilename}`);
|
|
367
|
+
// Extract extension and slugify the base name
|
|
368
|
+
const lastDotIndex = originalFilename.lastIndexOf('.');
|
|
369
|
+
const extension = lastDotIndex > 0 ? originalFilename.slice(lastDotIndex) : '';
|
|
370
|
+
const baseName = lastDotIndex > 0 ? originalFilename.slice(0, lastDotIndex) : originalFilename;
|
|
371
|
+
const slugifiedFilename = slugify(baseName) + extension;
|
|
372
|
+
let replacement = ``;
|
|
373
|
+
// Download the image if API is available
|
|
374
|
+
if (this.api) {
|
|
375
|
+
downloadPromises.push((async () => {
|
|
376
|
+
try {
|
|
377
|
+
// Try downloading with original filename first (Confluence API may handle encoding internally)
|
|
378
|
+
let imageData = await this.api.downloadAttachment(pageId, originalFilename);
|
|
379
|
+
// If that fails, try with URL-encoded filename
|
|
380
|
+
if (!imageData) {
|
|
381
|
+
const encodedImageName = encodeURIComponent(originalFilename);
|
|
382
|
+
imageData = await this.api.downloadAttachment(pageId, encodedImageName);
|
|
383
|
+
}
|
|
384
|
+
if (imageData) {
|
|
385
|
+
images.push({ filename: slugifiedFilename, data: imageData });
|
|
386
|
+
logger.info(` ✓ Downloaded image: ${originalFilename} -> ${slugifiedFilename}`);
|
|
387
|
+
}
|
|
388
|
+
else {
|
|
389
|
+
// Image might be on a different page or not exist
|
|
390
|
+
logger.warn(` ⚠ Image not found on this page: ${originalFilename} (may be on parent/child page)`);
|
|
391
|
+
}
|
|
392
|
+
}
|
|
393
|
+
catch (error) {
|
|
394
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
395
|
+
if (errorMessage.includes('404')) {
|
|
396
|
+
logger.warn(` ⚠ Image not attached to this page: ${originalFilename}`);
|
|
397
|
+
}
|
|
398
|
+
else {
|
|
399
|
+
logger.warn(` ⚠ Error downloading image ${originalFilename}:`, errorMessage);
|
|
400
|
+
}
|
|
401
|
+
}
|
|
402
|
+
})());
|
|
403
|
+
}
|
|
404
|
+
logger.debug(`Replacing image tag with markdown: ${replacement}`);
|
|
405
|
+
result = result.replace(match[0], replacement);
|
|
406
|
+
}
|
|
407
|
+
logger.debug(`Processed inline <img> tags that reference /download/attachments/...`);
|
|
408
|
+
// Also handle inline <img> tags that reference /download/attachments/... with optional data-linked-resource-container-id
|
|
409
|
+
// Example: <img class="confluence-embedded-image" src="/download/attachments/715168874/image.png?version=1&api=v2" data-linked-resource-container-id="715168874" />
|
|
410
|
+
const inlineImgRegex = /<img[^>]*src="([^"]*\/download\/attachments\/[^"\s]+)"[^>]*>/gi;
|
|
411
|
+
const inlineImgMatches = Array.from(content.matchAll(inlineImgRegex));
|
|
412
|
+
logger.debug(`Found ${inlineImgMatches.length} inline <img> tags with /download/attachments/ URLs`);
|
|
413
|
+
for (const match of inlineImgMatches) {
|
|
414
|
+
const src = match[1];
|
|
415
|
+
logger.debug(`Processing inline image src: ${src}`);
|
|
416
|
+
// Try to extract filename from URL path
|
|
417
|
+
let filename = src.split('/').pop() || 'image';
|
|
418
|
+
// Strip query params if present
|
|
419
|
+
filename = filename.split('?')[0];
|
|
420
|
+
// Try to extract container id from the tag using a secondary regex on the original match
|
|
421
|
+
const fullTag = match[0];
|
|
422
|
+
const containerIdMatch = fullTag.match(/data-linked-resource-container-id="([^"<>]+)"/i);
|
|
423
|
+
const containerId = containerIdMatch ? containerIdMatch[1] : pageId;
|
|
424
|
+
const lastDotIndex = filename.lastIndexOf('.');
|
|
425
|
+
const extension = lastDotIndex > 0 ? filename.slice(lastDotIndex) : '';
|
|
426
|
+
const baseName = lastDotIndex > 0 ? filename.slice(0, lastDotIndex) : filename;
|
|
427
|
+
const slugifiedFilename = slugify(baseName) + extension;
|
|
428
|
+
let replacement = ``;
|
|
429
|
+
if (this.api) {
|
|
430
|
+
downloadPromises.push((async () => {
|
|
431
|
+
try {
|
|
432
|
+
logger.debug(`Downloading inline image from container ${containerId} with filename ${filename}`);
|
|
433
|
+
// The API expects the filename as-is; try original filename first
|
|
434
|
+
let imageData = await this.api.downloadAttachment(containerId, filename);
|
|
435
|
+
// Fallback: try URL-decoded filename
|
|
436
|
+
if (!imageData) {
|
|
437
|
+
const decoded = decodeURIComponent(filename);
|
|
438
|
+
if (decoded !== filename) {
|
|
439
|
+
imageData = await this.api.downloadAttachment(containerId, decoded);
|
|
440
|
+
}
|
|
441
|
+
}
|
|
442
|
+
// Another fallback: try removing any appended tokens (some Confluence instances append ids)
|
|
443
|
+
if (!imageData) {
|
|
444
|
+
const simpleName = filename.replace(/^[^a-z0-9]+/i, '').split(/[^a-z0-9.\-_]/i)[0];
|
|
445
|
+
if (simpleName && simpleName !== filename) {
|
|
446
|
+
imageData = await this.api.downloadAttachment(containerId, simpleName);
|
|
447
|
+
}
|
|
448
|
+
}
|
|
449
|
+
if (imageData) {
|
|
450
|
+
images.push({ filename: slugifiedFilename, data: imageData });
|
|
451
|
+
logger.info(` ✓ Downloaded inline image: ${filename} -> ${slugifiedFilename}`);
|
|
452
|
+
}
|
|
453
|
+
else {
|
|
454
|
+
logger.warn(` ⚠ Inline image not downloaded: ${filename} (container ${containerId})`);
|
|
455
|
+
}
|
|
456
|
+
}
|
|
457
|
+
catch (error) {
|
|
458
|
+
const errorMessage = error instanceof Error ? error.message : String(error);
|
|
459
|
+
logger.warn(` ⚠ Error downloading inline image ${filename}:`, errorMessage);
|
|
460
|
+
}
|
|
461
|
+
})());
|
|
462
|
+
}
|
|
463
|
+
result = result.replace(match[0], replacement);
|
|
464
|
+
}
|
|
465
|
+
// Wait for all downloads to complete
|
|
466
|
+
await Promise.all(downloadPromises);
|
|
467
|
+
logger.debug(`Completed processing inline <img> tags`);
|
|
468
|
+
return result;
|
|
469
|
+
}
|
|
470
|
+
/**
|
|
471
|
+
* Build a Markdown list from an included page's HTML content.
|
|
472
|
+
* Prefer extracting <ul>/<ol> list items and anchor links; fall back to full page transform.
|
|
473
|
+
*/
|
|
474
|
+
async buildIncludeList(page, title) {
|
|
475
|
+
try {
|
|
476
|
+
const html = page.body || '';
|
|
477
|
+
// Extract list items inside <ul> or <ol>
|
|
478
|
+
const listRegex = /<ul[^>]*>([\s\S]*?)<\/ul>/i;
|
|
479
|
+
const listMatch = html.match(listRegex);
|
|
480
|
+
if (listMatch) {
|
|
481
|
+
const itemsHtml = listMatch[1];
|
|
482
|
+
const itemRegex = /<li[^>]*>([\s\S]*?)<\/li>/gi;
|
|
483
|
+
const items = [];
|
|
484
|
+
for (const m of Array.from(itemsHtml.matchAll(itemRegex))) {
|
|
485
|
+
let item = m[1].trim();
|
|
486
|
+
// Convert <a href> to markdown
|
|
487
|
+
item = item.replace(/<a[^>]*href="([^"]+)"[^>]*>(.*?)<\/a>/gi, '[$2]($1)');
|
|
488
|
+
// Strip remaining tags
|
|
489
|
+
item = item.replace(/<[^>]+>/g, '').trim();
|
|
490
|
+
items.push(`- ${item}`);
|
|
491
|
+
}
|
|
492
|
+
if (items.length > 0) {
|
|
493
|
+
return `\n\n## ${title}\n\n${items.join('\n')}\n\n`;
|
|
494
|
+
}
|
|
495
|
+
}
|
|
496
|
+
// If no lists found, look for anchor links
|
|
497
|
+
const anchorRegex = /<a[^>]*href="([^"]+)"[^>]*>(.*?)<\/a>/gi;
|
|
498
|
+
const anchors = Array.from(html.matchAll(anchorRegex));
|
|
499
|
+
if (anchors.length > 0) {
|
|
500
|
+
const items = anchors.map(a => `- [${a[2].replace(/<[^>]+>/g, '').trim()}](${a[1]})`);
|
|
501
|
+
return `\n\n## ${title}\n\n${items.join('\n')}\n\n`;
|
|
502
|
+
}
|
|
503
|
+
// Fall back to full-page transform
|
|
504
|
+
const full = await this.htmlToMarkdown(html, page.id || title, []);
|
|
505
|
+
return `\n\n## ${title}\n\n${full}\n\n`;
|
|
506
|
+
}
|
|
507
|
+
catch (error) {
|
|
508
|
+
logger.warn(`Failed to build include list for ${title}:`, error);
|
|
509
|
+
return `\n\n## ${title}\n\n<!-- failed to include content -->\n\n`;
|
|
510
|
+
}
|
|
511
|
+
}
|
|
512
|
+
/**
|
|
513
|
+
* Transform Confluence macros to Markdown
|
|
514
|
+
*/
|
|
515
|
+
async transformMacros(content, pageId) {
|
|
516
|
+
let result = content;
|
|
517
|
+
// Handle children macro - fetch child pages of specified page or current page
|
|
518
|
+
const childrenRegex = /<ac:structured-macro[^>]*ac:name="children"[^>]*>([\s\S]*?)<\/ac:structured-macro>/gis;
|
|
519
|
+
const childrenMatches = Array.from(content.matchAll(childrenRegex));
|
|
520
|
+
for (const match of childrenMatches) {
|
|
521
|
+
let replacement = '<!-- Child Pages -->\n\n';
|
|
522
|
+
const macroContent = match[1];
|
|
523
|
+
if (this.api) {
|
|
524
|
+
try {
|
|
525
|
+
// Check if there's a page parameter
|
|
526
|
+
const pageParamMatch = macroContent.match(/ri:content-title="([^"]+)"/i);
|
|
527
|
+
let targetPageId = pageId;
|
|
528
|
+
let targetTitle = '';
|
|
529
|
+
if (pageParamMatch) {
|
|
530
|
+
targetTitle = pageParamMatch[1];
|
|
531
|
+
// Try to find the page by title
|
|
532
|
+
const targetPage = await this.api.getPageByTitle(this.config.spaceKey, targetTitle);
|
|
533
|
+
if (targetPage) {
|
|
534
|
+
targetPageId = targetPage.id;
|
|
535
|
+
}
|
|
536
|
+
}
|
|
537
|
+
const childPages = await this.api.getChildPages(targetPageId);
|
|
538
|
+
if (childPages.length > 0) {
|
|
539
|
+
replacement = childPages.map(child => `- [${child.title}](${slugify(child.title)}.md)`).join('\n') + '\n\n';
|
|
540
|
+
}
|
|
541
|
+
}
|
|
542
|
+
catch (error) {
|
|
543
|
+
logger.warn(`Failed to fetch child pages:`, error);
|
|
544
|
+
}
|
|
545
|
+
}
|
|
546
|
+
result = result.replace(match[0], replacement);
|
|
547
|
+
}
|
|
548
|
+
// Handle list-children macro - fetch actual child pages
|
|
549
|
+
const listChildrenRegex = /<ac:structured-macro[^>]*ac:name="list-children"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis;
|
|
550
|
+
const listChildrenMatches = Array.from(result.matchAll(listChildrenRegex));
|
|
551
|
+
for (const match of listChildrenMatches) {
|
|
552
|
+
let replacement = '<!-- Child Pages List -->\n\n';
|
|
553
|
+
if (this.api) {
|
|
554
|
+
try {
|
|
555
|
+
const childPages = await this.api.getChildPages(pageId);
|
|
556
|
+
if (childPages.length > 0) {
|
|
557
|
+
replacement = childPages.map(child => `- [${child.title}](${slugify(child.title)}.md)`).join('\n') + '\n\n';
|
|
558
|
+
}
|
|
559
|
+
}
|
|
560
|
+
catch (error) {
|
|
561
|
+
logger.warn(`Failed to fetch child pages for ${pageId}:`, error);
|
|
562
|
+
}
|
|
563
|
+
}
|
|
564
|
+
result = result.replace(match[0], replacement);
|
|
565
|
+
}
|
|
566
|
+
// Handle include macro - fetch content from included page
|
|
567
|
+
const includeRegex = /<ac:structured-macro[^>]*ac:name="include"[^>]*>([\s\S]*?)<\/ac:structured-macro>/gis;
|
|
568
|
+
const includeMatches = Array.from(result.matchAll(includeRegex));
|
|
569
|
+
for (const match of includeMatches) {
|
|
570
|
+
const macroContent = match[1];
|
|
571
|
+
const titleMatch = macroContent.match(/ri:content-title="([^"]+)"/i);
|
|
572
|
+
if (titleMatch && this.api) {
|
|
573
|
+
const includeTitle = titleMatch[1];
|
|
574
|
+
try {
|
|
575
|
+
let includedPage;
|
|
576
|
+
if (includeTitle === "FCS Useful Links") {
|
|
577
|
+
// Hardcode the pageId for FCS Useful Links
|
|
578
|
+
includedPage = await this.api.getPage("167810724");
|
|
579
|
+
}
|
|
580
|
+
else {
|
|
581
|
+
includedPage = await this.api.getPageByTitle(this.config.spaceKey, includeTitle);
|
|
582
|
+
}
|
|
583
|
+
if (includedPage && includedPage.body) {
|
|
584
|
+
// Build a concise Markdown list from the included page using the API
|
|
585
|
+
const listMd = await this.buildIncludeList(includedPage, includeTitle);
|
|
586
|
+
// Generate a unique placeholder per include to avoid collisions
|
|
587
|
+
const placeholder = `__INCLUDE_${this.pendingIncludes.length + 1}__`;
|
|
588
|
+
// Replace macro with placeholder and remember the content for later
|
|
589
|
+
result = result.replace(match[0], placeholder);
|
|
590
|
+
this.pendingIncludes.push({ placeholder, content: listMd });
|
|
591
|
+
}
|
|
592
|
+
else {
|
|
593
|
+
result = result.replace(match[0], `<!-- Include: ${includeTitle} (page not found) -->\n\n`);
|
|
594
|
+
}
|
|
595
|
+
}
|
|
596
|
+
catch (error) {
|
|
597
|
+
logger.warn(`Failed to fetch included page "${includeTitle}":`, error);
|
|
598
|
+
result = result.replace(match[0], `<!-- Include: ${includeTitle} (error) -->\n\n`);
|
|
599
|
+
}
|
|
600
|
+
}
|
|
601
|
+
else {
|
|
602
|
+
result = result.replace(match[0], '<!-- Include macro -->\n\n');
|
|
603
|
+
}
|
|
604
|
+
}
|
|
605
|
+
// Preserve table-like macros: extract the inner rich-text-body so HTML tables
|
|
606
|
+
// inside macros (e.g. table-filter) are retained and later converted to Markdown.
|
|
607
|
+
result = result.replace(/<ac:structured-macro[^>]*ac:name="(?:table|table-filter)"[^>]*>[\s\S]*?<ac:rich-text-body>([\s\S]*?)<\/ac:rich-text-body>[\s\S]*?<\/ac:structured-macro>/gis, '$1\n\n');
|
|
608
|
+
// Apply other macro transformations
|
|
609
|
+
result = result
|
|
610
|
+
// Code blocks with language
|
|
611
|
+
.replace(/<ac:structured-macro[^>]*ac:name="code"[^>]*>.*?<ac:parameter[^>]*ac:name="language"[^>]*>(.*?)<\/ac:parameter>.*?<ac:plain-text-body><!\[CDATA\[(.*?)\]\]><\/ac:plain-text-body>.*?<\/ac:structured-macro>/gis, '```$1\n$2\n```\n\n')
|
|
612
|
+
// Code blocks without language
|
|
613
|
+
.replace(/<ac:structured-macro[^>]*ac:name="code"[^>]*>.*?<ac:plain-text-body><!\[CDATA\[(.*?)\]\]><\/ac:plain-text-body>.*?<\/ac:structured-macro>/gis, '```\n$1\n```\n\n')
|
|
614
|
+
// Info panels
|
|
615
|
+
/* Replace info macro with a concise inline marker using the macro title and body.
|
|
616
|
+
Desired output example:
|
|
617
|
+
[i] Here you will find
|
|
618
|
+
<body content...>
|
|
619
|
+
*/
|
|
620
|
+
.replace(/<ac:structured-macro[^>]*ac:name="info"[^>]*>([\s\S]*?)<\/ac:structured-macro>/gis, (_match, inner) => {
|
|
621
|
+
try {
|
|
622
|
+
// Extract title parameter if present
|
|
623
|
+
const titleMatch = inner.match(/<ac:parameter[^>]*ac:name="title"[^>]*>([\s\S]*?)<\/ac:parameter>/i);
|
|
624
|
+
const title = titleMatch ? titleMatch[1].trim() : '';
|
|
625
|
+
// Extract rich-text-body content
|
|
626
|
+
const bodyMatch = inner.match(/<ac:rich-text-body>([\s\S]*?)<\/ac:rich-text-body>/i);
|
|
627
|
+
const body = bodyMatch ? bodyMatch[1].trim() : '';
|
|
628
|
+
const titleLine = title ? `[i] ${title}\n\n` : '';
|
|
629
|
+
// Return title marker plus body (body will be further transformed later)
|
|
630
|
+
return `${titleLine}${body}\n\n`;
|
|
631
|
+
}
|
|
632
|
+
catch (e) {
|
|
633
|
+
return '<!-- Info macro -->\n\n';
|
|
634
|
+
}
|
|
635
|
+
})
|
|
636
|
+
// Warning panels
|
|
637
|
+
.replace(/<ac:structured-macro[^>]*ac:name="warning"[^>]*>.*?<ac:rich-text-body>(.*?)<\/ac:rich-text-body>.*?<\/ac:rich-text-body>.*?<\/ac:structured-macro>/gis, '> **Warning:** $1\n\n')
|
|
638
|
+
// Note panels
|
|
639
|
+
.replace(/<ac:structured-macro[^>]*ac:name="note"[^>]*>.*?<ac:rich-text-body>(.*?)<\/ac:rich-text-body>.*?<\/ac:structured-macro>/gis, '> **Note:** $1\n\n')
|
|
640
|
+
// Panel macro - extract content
|
|
641
|
+
.replace(/<ac:structured-macro[^>]*ac:name="panel"[^>]*>.*?<ac:rich-text-body>(.*?)<\/ac:rich-text-body>.*?<\/ac:structured-macro>/gis, '$1\n\n')
|
|
642
|
+
// Excerpt macro - extract content
|
|
643
|
+
.replace(/<ac:structured-macro[^>]*ac:name="excerpt"[^>]*>.*?<ac:rich-text-body>(.*?)<\/ac:rich-text-body>.*?<\/ac:structured-macro>/gis, '$1\n\n')
|
|
644
|
+
// Table of contents
|
|
645
|
+
.replace(/<ac:structured-macro[^>]*ac:name="toc"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Table of Contents -->\n\n')
|
|
646
|
+
// Content by label
|
|
647
|
+
.replace(/<ac:structured-macro[^>]*ac:name="contentbylabel"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Content by Label -->\n\n')
|
|
648
|
+
// Livesearch macro
|
|
649
|
+
.replace(/<ac:structured-macro[^>]*ac:name="livesearch"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Live Search -->\n\n')
|
|
650
|
+
// Jira macro
|
|
651
|
+
.replace(/<ac:structured-macro[^>]*ac:name="jira"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Jira Issues -->\n\n')
|
|
652
|
+
// Recently updated macro
|
|
653
|
+
.replace(/<ac:structured-macro[^>]*ac:name="recently-updated"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Recently Updated Pages -->\n\n')
|
|
654
|
+
// Popular labels macro
|
|
655
|
+
.replace(/<ac:structured-macro[^>]*ac:name="popular-labels"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Popular Labels -->\n\n')
|
|
656
|
+
// Other macros - convert to comments
|
|
657
|
+
.replace(/<ac:structured-macro[^>]*ac:name="([^"]*)"[^>]*(?:\/>|>.*?<\/ac:structured-macro>)/gis, '<!-- Confluence Macro: $1 -->\n\n');
|
|
658
|
+
return result;
|
|
659
|
+
}
|
|
660
|
+
/**
|
|
661
|
+
* Transform user links to display names
|
|
662
|
+
*/
|
|
663
|
+
async transformUserLinks(html) {
|
|
664
|
+
if (!this.api) {
|
|
665
|
+
// If no API provided, just remove user links
|
|
666
|
+
return html.replace(/<ac:link[^>]*><ri:user[^>]*\/><\/ac:link>/g, '@unknown-user');
|
|
667
|
+
}
|
|
668
|
+
let result = html;
|
|
669
|
+
// Match user links by username
|
|
670
|
+
const usernameRegex = /<ac:link[^>]*><ri:user[^>]*ri:username="([^"]+)"[^>]*\/><\/ac:link>/gi;
|
|
671
|
+
const usernameMatches = Array.from(html.matchAll(usernameRegex));
|
|
672
|
+
for (const match of usernameMatches) {
|
|
673
|
+
const username = match[1];
|
|
674
|
+
const user = await this.api.getUserByUsername(username);
|
|
675
|
+
if (user) {
|
|
676
|
+
result = result.replace(match[0], `@${user.displayName}`);
|
|
677
|
+
}
|
|
678
|
+
else {
|
|
679
|
+
result = result.replace(match[0], `@${username}`);
|
|
680
|
+
}
|
|
681
|
+
}
|
|
682
|
+
// Match user links by userkey
|
|
683
|
+
const userkeyRegex = /<ac:link[^>]*><ri:user[^>]*ri:userkey="([^"]+)"[^>]*\/><\/ac:link>/gi;
|
|
684
|
+
const userkeyMatches = Array.from(result.matchAll(userkeyRegex));
|
|
685
|
+
for (const match of userkeyMatches) {
|
|
686
|
+
const userKey = match[1];
|
|
687
|
+
const user = await this.api.getUserByKey(userKey);
|
|
688
|
+
if (user) {
|
|
689
|
+
result = result.replace(match[0], `@${user.displayName}`);
|
|
690
|
+
}
|
|
691
|
+
else {
|
|
692
|
+
result = result.replace(match[0], `@user-${userKey.slice(-8)}`);
|
|
693
|
+
}
|
|
694
|
+
}
|
|
695
|
+
return result;
|
|
696
|
+
}
|
|
697
|
+
/**
|
|
698
|
+
* Transform page links to markdown links
|
|
699
|
+
*/
|
|
700
|
+
async transformPageLinks(html) {
|
|
701
|
+
let result = html;
|
|
702
|
+
// Match page links by content title - various formats
|
|
703
|
+
// Format 1: <ac:link><ri:page ri:content-title="Title" /></ac:link>
|
|
704
|
+
const pageLinkRegex1 = /<ac:link[^>]*>\s*<ri:page[^>]*ri:content-title="([^"]+)"[^>]*\/>\s*<\/ac:link>/gi;
|
|
705
|
+
const matches1 = Array.from(html.matchAll(pageLinkRegex1));
|
|
706
|
+
for (const match of matches1) {
|
|
707
|
+
const title = match[1];
|
|
708
|
+
const link = `[${title}](${slugify(title)}.md)`;
|
|
709
|
+
result = result.replace(match[0], link);
|
|
710
|
+
}
|
|
711
|
+
// Format 2: Just <ri:page ri:content-title="Title" /> without ac:link wrapper
|
|
712
|
+
const pageLinkRegex2 = /<ri:page[^>]*ri:content-title="([^"]+)"[^>]*\/>/gi;
|
|
713
|
+
const matches2 = Array.from(result.matchAll(pageLinkRegex2));
|
|
714
|
+
for (const match of matches2) {
|
|
715
|
+
const title = match[1];
|
|
716
|
+
const link = `[${title}](${slugify(title)}.md)`;
|
|
717
|
+
result = result.replace(match[0], link);
|
|
718
|
+
}
|
|
719
|
+
return result;
|
|
720
|
+
}
|
|
721
|
+
/**
|
|
722
|
+
* Clean up malformed markdown patterns
|
|
723
|
+
*/
|
|
724
|
+
cleanMarkdown(markdown) {
|
|
725
|
+
let cleaned = markdown;
|
|
726
|
+
// First pass: clean confluence-specific patterns
|
|
727
|
+
logger.debug('Cleaning Confluence-specific markdown patterns');
|
|
728
|
+
cleaned = this.cleanConfluencePatterns(cleaned);
|
|
729
|
+
// Second pass: general cleanup
|
|
730
|
+
logger.debug('Cleaning general markdown patterns');
|
|
731
|
+
cleaned = this.cleanGeneral(cleaned);
|
|
732
|
+
// Third pass: another round of confluence patterns to catch any new issues
|
|
733
|
+
logger.debug('Cleaning Confluence-specific markdown patterns (second pass)');
|
|
734
|
+
cleaned = this.cleanConfluencePatterns(cleaned);
|
|
735
|
+
// Final cleanup of excessive whitespace
|
|
736
|
+
cleaned = cleaned.replace(/\n{4,}/g, '\n\n\n');
|
|
737
|
+
cleaned = cleaned.trim() + '\n';
|
|
738
|
+
logger.debug('Final cleanup of excessive whitespace');
|
|
739
|
+
return cleaned;
|
|
740
|
+
}
|
|
741
|
+
/**
|
|
742
|
+
* Clean up specific problematic patterns that appear in Confluence exports
|
|
743
|
+
*/
|
|
744
|
+
cleanConfluencePatterns(markdown) {
|
|
745
|
+
let cleaned = markdown;
|
|
746
|
+
// Remove standalone bold markers that are not part of content
|
|
747
|
+
// This handles cases like "**\n\n**" or "** **"
|
|
748
|
+
cleaned = cleaned.replace(/\*\*\s*\n\s*\n\s*\*\*/g, '');
|
|
749
|
+
// Remove lines that only contain **
|
|
750
|
+
cleaned = cleaned.replace(/^\s*\*\*\s*$/gm, '');
|
|
751
|
+
// Remove empty headers (headers with no content)
|
|
752
|
+
cleaned = cleaned.replace(/^#+\s*$/gm, '');
|
|
753
|
+
// Remove bold markers around only whitespace
|
|
754
|
+
cleaned = cleaned.replace(/\*\*\s+\*\*/g, ' ');
|
|
755
|
+
// Remove italic markers around only whitespace
|
|
756
|
+
cleaned = cleaned.replace(/\*\s+\*/g, ' ');
|
|
757
|
+
// Clean up malformed blockquotes
|
|
758
|
+
cleaned = cleaned.replace(/^>\s*$/gm, '');
|
|
759
|
+
// Remove empty code blocks
|
|
760
|
+
cleaned = cleaned.replace(/```\s*\n\s*```/g, '');
|
|
761
|
+
// Clean up malformed horizontal rules
|
|
762
|
+
cleaned = cleaned.replace(/^[-*_]\s*$/gm, '');
|
|
763
|
+
return cleaned;
|
|
764
|
+
}
|
|
765
|
+
/**
|
|
766
|
+
* General markdown cleanup
|
|
767
|
+
*/
|
|
768
|
+
cleanGeneral(markdown) {
|
|
769
|
+
let cleaned = markdown;
|
|
770
|
+
// Remove empty headers with just bold/italic markers (no content between them)
|
|
771
|
+
// Match: ## ** or ## * (at end of line)
|
|
772
|
+
logger.debug('Removing empty headers with only formatting markers');
|
|
773
|
+
cleaned = cleaned.replace(/^#+\s*\*\*\s*$/gm, '');
|
|
774
|
+
cleaned = cleaned.replace(/^#+\s*\*\s*$/gm, '');
|
|
775
|
+
cleaned = cleaned.replace(/^#+\s*__\s*$/gm, '');
|
|
776
|
+
cleaned = cleaned.replace(/^#+\s*_\s*$/gm, '');
|
|
777
|
+
// Remove headers that only contain bold/italic markers across multiple lines
|
|
778
|
+
// Example: ## **\n\n** (with only whitespace between)
|
|
779
|
+
logger.debug('Removing headers with only formatting markers across multiple lines');
|
|
780
|
+
cleaned = cleaned.replace(/^(#+)\s*\*\*\s*\n+\s*\*\*\s*$/gm, '');
|
|
781
|
+
cleaned = cleaned.replace(/^(#+)\s*\*\s*\n+\s*\*\s*$/gm, '');
|
|
782
|
+
// Remove empty bold markers (no content or only whitespace between)
|
|
783
|
+
logger.debug('Removing empty bold markers');
|
|
784
|
+
cleaned = cleaned.replace(/\*\*\s*\*\*/g, '');
|
|
785
|
+
cleaned = cleaned.replace(/__\s*__/g, '');
|
|
786
|
+
// Remove standalone italic markers on their own line
|
|
787
|
+
logger.debug('Removing standalone italic markers on their own line');
|
|
788
|
+
cleaned = cleaned.replace(/^\s*\*\s*$/gm, '');
|
|
789
|
+
cleaned = cleaned.replace(/^\s*_\s*$/gm, '');
|
|
790
|
+
// Remove empty italic markers that span multiple lines (only if truly empty)
|
|
791
|
+
logger.debug('Removing empty italic markers that span multiple lines');
|
|
792
|
+
cleaned = cleaned.replace(/\*\s*\n+\s*\*/g, '\n\n');
|
|
793
|
+
// Remove empty links
|
|
794
|
+
logger.debug('Removing empty links');
|
|
795
|
+
cleaned = cleaned.replace(/\[\s*\]\(\s*\)/g, '');
|
|
796
|
+
// Remove empty list items
|
|
797
|
+
logger.debug('Removing empty list items');
|
|
798
|
+
cleaned = cleaned.replace(/^[-*+]\s*$/gm, '');
|
|
799
|
+
// Clean up excessive blank lines (more than 3 consecutive)
|
|
800
|
+
logger.debug('Cleaning up excessive blank lines');
|
|
801
|
+
cleaned = cleaned.replace(/\n{4,}/g, '\n\n\n');
|
|
802
|
+
// Remove trailing whitespace from each line
|
|
803
|
+
logger.debug('Removing trailing whitespace from each line');
|
|
804
|
+
cleaned = cleaned.split('\n').map(line => line.trimEnd()).join('\n');
|
|
805
|
+
// Ensure single trailing newline at end of file
|
|
806
|
+
logger.debug('Ensuring single trailing newline at end of file');
|
|
807
|
+
cleaned = cleaned.trim() + '\n';
|
|
808
|
+
return cleaned;
|
|
809
|
+
}
|
|
810
|
+
/**
|
|
811
|
+
* Create links folder with symlinks and _links.md with tree structure
|
|
812
|
+
*/
|
|
813
|
+
async createLinksStructure(outputDir) {
|
|
814
|
+
const linksDir = path.join(outputDir, 'links');
|
|
815
|
+
// Remove existing links folder if it exists
|
|
816
|
+
try {
|
|
817
|
+
await fs.rm(linksDir, { recursive: true, force: true });
|
|
818
|
+
}
|
|
819
|
+
catch {
|
|
820
|
+
// Ignore if doesn't exist
|
|
821
|
+
}
|
|
822
|
+
// Create fresh links folder
|
|
823
|
+
await fs.mkdir(linksDir, { recursive: true });
|
|
824
|
+
// Find all MD files recursively
|
|
825
|
+
const findMdFiles = async (dir, fileList = []) => {
|
|
826
|
+
const entries = await fs.readdir(dir, { withFileTypes: true });
|
|
827
|
+
for (const entry of entries) {
|
|
828
|
+
const fullPath = path.join(dir, entry.name);
|
|
829
|
+
if (entry.isDirectory() && !entry.name.startsWith('_') && entry.name !== 'images' && entry.name !== 'links') {
|
|
830
|
+
await findMdFiles(fullPath, fileList);
|
|
831
|
+
}
|
|
832
|
+
else if (entry.isFile() && entry.name.endsWith('.md') && !entry.name.startsWith('_')) {
|
|
833
|
+
const relativePath = path.relative(outputDir, fullPath);
|
|
834
|
+
fileList.push({ path: fullPath, relativePath });
|
|
835
|
+
}
|
|
836
|
+
}
|
|
837
|
+
return fileList;
|
|
838
|
+
};
|
|
839
|
+
const mdFiles = await findMdFiles(outputDir);
|
|
840
|
+
// Create symlinks in links folder
|
|
841
|
+
for (const file of mdFiles) {
|
|
842
|
+
const linkName = path.basename(file.path);
|
|
843
|
+
const linkPath = path.join(linksDir, linkName);
|
|
844
|
+
const targetPath = path.relative(linksDir, file.path);
|
|
845
|
+
try {
|
|
846
|
+
await fs.symlink(targetPath, linkPath);
|
|
847
|
+
}
|
|
848
|
+
catch (error) {
|
|
849
|
+
logger.warn(` ⚠ Failed to create symlink for ${linkName}:`, error instanceof Error ? error.message : error);
|
|
850
|
+
}
|
|
851
|
+
}
|
|
852
|
+
logger.info(` ✓ Created ${mdFiles.length} symlinks in links/`);
|
|
853
|
+
// Build tree structure for _links.md
|
|
854
|
+
const tree = this.buildFileTree(mdFiles);
|
|
855
|
+
const treeMarkdown = this.generateTreeMarkdown(tree, outputDir);
|
|
856
|
+
// Write _links.md
|
|
857
|
+
const linksFilePath = path.join(outputDir, '_links.md');
|
|
858
|
+
const linksContent = `# Documentation Links\n\n${treeMarkdown}`;
|
|
859
|
+
try {
|
|
860
|
+
const formattedContent = await prettier.format(linksContent, {
|
|
861
|
+
parser: 'markdown',
|
|
862
|
+
printWidth: 120,
|
|
863
|
+
proseWrap: 'preserve',
|
|
864
|
+
tabWidth: 2,
|
|
865
|
+
useTabs: false
|
|
866
|
+
});
|
|
867
|
+
await fs.writeFile(linksFilePath, formattedContent, 'utf-8');
|
|
868
|
+
}
|
|
869
|
+
catch {
|
|
870
|
+
await fs.writeFile(linksFilePath, linksContent, 'utf-8');
|
|
871
|
+
}
|
|
872
|
+
logger.info(` ✓ Created _links.md with tree structure`);
|
|
873
|
+
}
|
|
874
|
+
/**
|
|
875
|
+
* Build a tree structure from flat file list
|
|
876
|
+
*/
|
|
877
|
+
buildFileTree(files) {
|
|
878
|
+
const root = { name: '', children: {}, files: [] };
|
|
879
|
+
for (const file of files) {
|
|
880
|
+
const parts = file.relativePath.split(path.sep);
|
|
881
|
+
let current = root;
|
|
882
|
+
// Navigate/create directory structure
|
|
883
|
+
for (let i = 0; i < parts.length - 1; i++) {
|
|
884
|
+
const part = parts[i];
|
|
885
|
+
if (!current.children[part]) {
|
|
886
|
+
current.children[part] = { name: part, children: {}, files: [] };
|
|
887
|
+
}
|
|
888
|
+
current = current.children[part];
|
|
889
|
+
}
|
|
890
|
+
// Add file to current directory
|
|
891
|
+
current.files.push({
|
|
892
|
+
name: parts[parts.length - 1],
|
|
893
|
+
relativePath: file.relativePath
|
|
894
|
+
});
|
|
895
|
+
}
|
|
896
|
+
return root;
|
|
897
|
+
}
|
|
898
|
+
/**
|
|
899
|
+
* Generate markdown tree structure
|
|
900
|
+
*/
|
|
901
|
+
generateTreeMarkdown(node, outputDir, level = 0) {
|
|
902
|
+
let result = '';
|
|
903
|
+
const indent = ' '.repeat(level);
|
|
904
|
+
// Sort directories and files alphabetically
|
|
905
|
+
const sortedDirs = Object.keys(node.children).sort();
|
|
906
|
+
const sortedFiles = node.files.sort((a, b) => a.name.localeCompare(b.name));
|
|
907
|
+
// Add directories first
|
|
908
|
+
for (const dirName of sortedDirs) {
|
|
909
|
+
const child = node.children[dirName];
|
|
910
|
+
result += `${indent}- **${dirName}/**\n`;
|
|
911
|
+
result += this.generateTreeMarkdown(child, outputDir, level + 1);
|
|
912
|
+
}
|
|
913
|
+
// Add files
|
|
914
|
+
for (const file of sortedFiles) {
|
|
915
|
+
const linkPath = file.relativePath;
|
|
916
|
+
result += `${indent}- [${file.name}](${linkPath})\n`;
|
|
917
|
+
}
|
|
918
|
+
return result;
|
|
919
|
+
}
|
|
920
|
+
/**
|
|
921
|
+
* Recursively clear existing .md files and images folders
|
|
922
|
+
*/
|
|
923
|
+
async clearExistingFiles(dir) {
|
|
924
|
+
try {
|
|
925
|
+
const entries = await fs.readdir(dir, { withFileTypes: true });
|
|
926
|
+
for (const entry of entries) {
|
|
927
|
+
const fullPath = path.join(dir, entry.name);
|
|
928
|
+
if (entry.isDirectory()) {
|
|
929
|
+
if (entry.name === 'images' || entry.name === 'links') {
|
|
930
|
+
// Remove entire images and links folders
|
|
931
|
+
await fs.rm(fullPath, { recursive: true, force: true });
|
|
932
|
+
logger.info(` Removed: ${path.relative(this.config.outputDir, fullPath)}/`);
|
|
933
|
+
}
|
|
934
|
+
else if (!entry.name.startsWith('_')) {
|
|
935
|
+
// Recursively clear subdirectories (skip _index, _queue, etc.)
|
|
936
|
+
await this.clearExistingFiles(fullPath);
|
|
937
|
+
}
|
|
938
|
+
}
|
|
939
|
+
else if (entry.isFile() && entry.name.endsWith('.md') && !entry.name.startsWith('_')) {
|
|
940
|
+
// Remove .md files
|
|
941
|
+
await fs.unlink(fullPath);
|
|
942
|
+
logger.info(` Removed: ${path.relative(this.config.outputDir, fullPath)}`);
|
|
943
|
+
}
|
|
944
|
+
}
|
|
945
|
+
}
|
|
946
|
+
catch (error) {
|
|
947
|
+
logger.warn(`Warning: Could not clear files in ${dir}:`, error instanceof Error ? error.message : error);
|
|
948
|
+
}
|
|
949
|
+
}
|
|
950
|
+
}
|
|
951
|
+
//# sourceMappingURL=transform.command.js.map
|