stegdoc 4.0.0 → 5.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,161 +1,587 @@
1
- const { Document, Paragraph, TextRun, Packer } = require('docx');
2
- const fs = require('fs');
3
- const path = require('path');
4
- const { serializeMetadata, parseMetadata } = require('./metadata');
5
- const { parseXmlFromZip, ensureArray, extractTextContent } = require('./xml-utils');
6
-
7
/**
 * Write a legacy-format DOCX holding serialized metadata and a base64 payload.
 *
 * The document has three single-run paragraphs, in order: the metadata
 * prefixed with `WHITENER_METADATA:`, a `---` separator, and the base64 text.
 *
 * @param {object} options - Options for creating the DOCX
 * @param {string} options.base64Content - Base64 content to store
 * @param {object} options.metadata - Metadata object (passed to serializeMetadata)
 * @param {string} options.outputPath - Output file path
 * @returns {Promise<string>} Path to created file
 */
async function createDocxWithBase64(options) {
  const { base64Content, metadata, outputPath } = options;
  const metadataStr = serializeMetadata(metadata);

  // One entry per paragraph; each paragraph wraps exactly one text run.
  const runSpecs = [
    // Metadata: tiny font so it is unobtrusive to a human reader
    { text: `WHITENER_METADATA:${metadataStr}`, size: 1 },
    // Separator between metadata and payload
    { text: '---', break: 1 },
    // Payload: monospace, size 16 (8pt)
    { text: base64Content, font: 'Courier New', size: 16 },
  ];

  const doc = new Document({
    sections: [
      {
        properties: {},
        children: runSpecs.map(
          (runSpec) => new Paragraph({ children: [new TextRun(runSpec)] })
        ),
      },
    ],
  });

  const packed = await Packer.toBuffer(doc);

  // Create the destination directory on demand before writing.
  const targetDir = path.dirname(outputPath);
  const targetDirExists = fs.existsSync(targetDir);
  if (!targetDirExists) {
    fs.mkdirSync(targetDir, { recursive: true });
  }

  fs.writeFileSync(outputPath, packed);
  return outputPath;
}
74
-
75
/**
 * Read a DOCX file and extract base64 content and metadata.
 *
 * The text of `word/document.xml` is flattened with extractAllText and then
 * split as `WHITENER_METADATA:<metadata>---<base64>`.
 *
 * @param {string} docxPath - Path to DOCX file
 * @returns {Promise<object>} Object containing base64Content and metadata
 * @throws {Error} `DOCX file not found: …` when the path does not exist;
 *   `Failed to read DOCX file: …` for any parse/format failure, with the
 *   underlying error attached as `cause`.
 */
async function readDocxBase64(docxPath) {
  if (!fs.existsSync(docxPath)) {
    throw new Error(`DOCX file not found: ${docxPath}`);
  }

  try {
    const docParsed = parseXmlFromZip(docxPath, 'word/document.xml');
    if (!docParsed) {
      throw new Error('Could not find document.xml in DOCX file');
    }

    const fullText = extractAllText(docParsed);

    const metadataMarker = 'WHITENER_METADATA:';
    const metadataStart = fullText.indexOf(metadataMarker);
    if (metadataStart === -1) {
      throw new Error('No metadata found in DOCX file. This may not be a stegdoc-encoded file.');
    }

    // The first "---" after the marker separates metadata from payload.
    const separatorIndex = fullText.indexOf('---', metadataStart);
    if (separatorIndex === -1) {
      throw new Error('Invalid file format: separator not found');
    }

    const metadataStr = fullText
      .substring(metadataStart + metadataMarker.length, separatorIndex)
      .trim();
    const metadata = parseMetadata(metadataStr);

    // Everything after the 3-character separator is the payload.
    const base64Content = fullText.substring(separatorIndex + 3).trim();

    return {
      base64Content,
      metadata,
    };
  } catch (error) {
    // Keep the original error (and its stack) reachable for debugging.
    throw new Error(`Failed to read DOCX file: ${error.message}`, { cause: error });
  }
}
128
-
129
/**
 * Concatenate the text of every run in every top-level paragraph.
 *
 * Walks `document.body.p[].r[].t` of the parsed document.xml; runs without a
 * `t` property contribute nothing.
 *
 * @param {object} docParsed - Parsed document.xml
 * @returns {string} Concatenated text content ('' when there is no body)
 */
function extractAllText(docParsed) {
  const body = docParsed?.document?.body;
  if (!body) {
    return '';
  }

  let collected = '';
  ensureArray(body.p).forEach((paragraph) => {
    ensureArray(paragraph.r).forEach((run) => {
      if (run.t === undefined) {
        return;
      }
      collected += extractTextContent(run.t);
    });
  });
  return collected;
}
157
-
158
- module.exports = {
159
- createDocxWithBase64,
160
- readDocxBase64,
161
- };
1
+ const {
2
+ Document, Paragraph, TextRun, Packer, Table, TableRow, TableCell,
3
+ AlignmentType, HeadingLevel, WidthType, ShadingType, BorderStyle, PageBreak,
4
+ } = require('docx');
5
+ const fs = require('fs');
6
+ const path = require('path');
7
+ const { serializeMetadata, parseMetadata } = require('./metadata');
8
+ const { parseXmlFromZip, ensureArray, extractTextContent } = require('./xml-utils');
9
+ const { generateIncident, generateHebrewDate } = require('./docx-templates');
10
+ const {
11
+ encodePayloadToLogLines, decodeLogLines, generateLogHeaders, resetTimeState,
12
+ BYTES_PER_DATA_LINE, calculateDataLineCount,
13
+ } = require('./log-generator');
14
+
15
+ // ─── Shared Styles ──────────────────────────────────────────────────────────
16
+
17
// Font descriptor that uses one typeface for all four font slots
// (ascii, complex-script, hAnsi, east-Asian).
const sameFaceEverywhere = (face) => ({ ascii: face, cs: face, hAnsi: face, eastAsia: face });

// Typeface for Hebrew body text and headings.
const FONT_HEBREW = sameFaceEverywhere('Arial');
// Monospace typeface for the embedded log lines.
const FONT_CODE = sameFaceEverywhere('Consolas');

// Font sizes as unit strings; the *_CS value is used for sizeComplexScript.
const SIZE_BODY = '11pt';
const SIZE_BODY_CS = '11pt';
const SIZE_CODE = '8pt';
const SIZE_SMALL = '9pt';
23
+
24
/**
 * Build a bidirectional (RTL) paragraph containing one Hebrew text run.
 *
 * @param {string} text - Paragraph text
 * @param {object} [opts]
 * @param {object} [opts.spacing] - Paragraph spacing (default `{ after: 120, line: 276 }`)
 * @param {string} [opts.size] - Font size, applied to both size and sizeComplexScript
 * @param {boolean} [opts.bold] - Bold flag, applied to both bold and boldComplexScript
 * @param {string} [opts.color] - Run colour
 * @param {*} [opts.alignment] - Explicit alignment; omitted when falsy
 * @param {object} [opts.paragraphOpts] - Extra paragraph options (override bidirectional/spacing)
 * @param {object} [opts.runOpts] - Extra run options (override the run defaults)
 * @returns {Paragraph}
 */
function heParagraph(text, opts = {}) {
  const {
    spacing,
    size,
    bold,
    color,
    alignment,
    runOpts,
    paragraphOpts: extraParagraphOpts,
  } = opts;

  const hebrewRun = new TextRun({
    text,
    rightToLeft: true,
    font: FONT_HEBREW,
    size: size || SIZE_BODY,
    sizeComplexScript: size || SIZE_BODY_CS,
    bold: bold || false,
    boldComplexScript: bold || false,
    color,
    ...runOpts,
  });

  return new Paragraph({
    bidirectional: true,
    spacing: spacing || { after: 120, line: 276 },
    ...extraParagraphOpts,
    children: [hebrewRun],
    // Alignment is only set on request (e.g. CENTER). Leaving it out lets
    // Word apply the natural right-aligned default of an RTL paragraph.
    ...(alignment ? { alignment } : {}),
  });
}
53
+
54
/**
 * Build a Hebrew heading as a manually styled bold RTL paragraph.
 *
 * The built-in `heading` paragraph style is deliberately not used, because
 * built-in heading styles override alignment. The level only selects a font
 * size: 16pt / 14pt / 12pt for HEADING_1 / 2 / 3, and 14pt otherwise.
 *
 * @param {string} text - Heading text
 * @param {*} [level=HeadingLevel.HEADING_1] - Heading level used to pick the size
 * @param {object} [opts]
 * @param {string} [opts.color] - Run colour (default '1F3864')
 * @param {object} [opts.paragraphOpts] - Extra paragraph options
 * @param {object} [opts.runOpts] - Extra run options
 * @returns {Paragraph}
 */
function heHeading(text, level = HeadingLevel.HEADING_1, opts = {}) {
  const sizeByLevel = {
    [HeadingLevel.HEADING_1]: '16pt',
    [HeadingLevel.HEADING_2]: '14pt',
    [HeadingLevel.HEADING_3]: '12pt',
  };
  const headingSize = sizeByLevel[level] || '14pt';

  const headingRun = new TextRun({
    text,
    rightToLeft: true,
    font: FONT_HEBREW,
    size: headingSize,
    sizeComplexScript: headingSize,
    bold: true,
    boldComplexScript: true,
    color: opts.color || '1F3864',
    ...opts.runOpts,
  });

  // No alignment here: the bidi default right-aligns RTL paragraphs.
  return new Paragraph({
    bidirectional: true,
    spacing: { before: 240, after: 120 },
    ...opts.paragraphOpts,
    children: [headingRun],
  });
}
85
+
86
/**
 * Build a left-to-right, left-aligned code/log paragraph: one monospace run
 * on an F2F2F2 shaded background with a small left/right indent.
 *
 * @param {string} text - Line to render
 * @returns {Paragraph}
 */
function codeParagraph(text) {
  const monoRun = new TextRun({
    text,
    rightToLeft: false,
    font: FONT_CODE,
    size: SIZE_CODE,
    sizeComplexScript: SIZE_CODE,
    color: '333333',
  });

  const paragraphSettings = {
    bidirectional: false,
    alignment: AlignmentType.LEFT,
    spacing: { after: 20, line: 240 },
    shading: { type: ShadingType.SOLID, color: 'F2F2F2', fill: 'F2F2F2' },
    indent: { left: 200, right: 200 },
    children: [monoRun],
  };
  return new Paragraph(paragraphSettings);
}
108
+
109
/**
 * Build an empty paragraph used as vertical spacing between sections.
 *
 * @returns {Paragraph} Paragraph with no children and `after: 80` spacing
 */
function spacer() {
  const emptyParagraphSettings = {
    spacing: { after: 80 },
    children: [],
  };
  return new Paragraph(emptyParagraphSettings);
}
115
+
116
+ // ─── v5 Log-Embed DOCX ─────────────────────────────────────────────────────
117
+
118
/**
 * Create a v5 log-embed DOCX file: a Hebrew incident report whose code-styled
 * paragraphs carry the payload as log lines.
 *
 * Report text comes from generateIncident / generateHebrewDate, seeded with
 * `hash` (or 'default' when hash is falsy) and the part number read from the
 * metadata JSON. Log rows come from encodePayloadToLogLines and are written
 * in the order header rows, data rows, filler rows.
 *
 * @param {object} options
 * @param {Buffer} options.payloadBuffer - Encrypted binary payload
 * @param {string} options.encryptionMeta - Packed encryption metadata or ''
 * @param {string} options.metadataJson - Serialized metadata JSON string
 * @param {string} options.outputPath - Output file path
 * @param {string} options.hash - File hash for deterministic template selection
 * @returns {Promise<string>} Path to created file
 */
async function createDocxV5(options) {
  const { payloadBuffer, encryptionMeta, metadataJson, outputPath, hash } = options;

  // Must run before any log lines are generated.
  resetTimeState();

  // Report content is derived from hash + part number.
  const templateSeed = hash || 'default';
  const { partNumber } = JSON.parse(metadataJson);
  const incident = generateIncident(templateSeed, partNumber || 1);
  const dateStr = generateHebrewDate(templateSeed);

  // Payload -> log rows
  const { headerRows, dataRows, fillerRows } = encodePayloadToLogLines(
    payloadBuffer, metadataJson, encryptionMeta
  );

  const report = [];

  // ── Title ──
  report.push(
    heHeading(`דוח תקרית ${incident.title}`, HeadingLevel.HEADING_1),
    heParagraph(`תאריך: ${dateStr}`, { size: SIZE_SMALL, color: '666666' }),
    heParagraph('מסווג: פנימי בלבד', { size: SIZE_SMALL, color: '666666' }),
    spacer()
  );

  // ── Executive Summary ──
  report.push(
    heHeading('תקציר מנהלים', HeadingLevel.HEADING_2),
    heParagraph(incident.summary),
    spacer()
  );

  // ── Timeline ──
  report.push(heHeading('ציר זמן', HeadingLevel.HEADING_2));

  // Two columns (20% time / 80% event): a header row, then one row per entry.
  const timelineHeaderRow = new TableRow({
    tableHeader: true,
    children: [
      createHebrewCell('שעה', true, 20),
      createHebrewCell('אירוע', true, 80),
    ],
  });
  const timelineDataRows = incident.timeline.map((entry) => {
    const timeCell = createHebrewCell(entry.time, false, 20);
    const eventCell = createHebrewCell(entry.desc, false, 80);
    return new TableRow({ children: [timeCell, eventCell] });
  });
  report.push(
    new Table({
      visuallyRightToLeft: true,
      width: { size: 100, type: WidthType.PERCENTAGE },
      rows: [timelineHeaderRow, ...timelineDataRows],
    }),
    spacer()
  );

  // ── Relevant Log Entries ──
  report.push(
    heHeading('רשומות לוג רלוונטיות', HeadingLevel.HEADING_2),
    heParagraph('להלן רשומות הלוג שאותרו כחלק מהחקירה. הרשומות סוננו מתוך מערכת הניטור ומכילות את הבקשות הרלוונטיות לתקרית:'),
    spacer()
  );

  // A "//" label line comes first, followed by every row as a code paragraph.
  const totalRows = headerRows.length + dataRows.length + fillerRows.length;
  report.push(codeParagraph(`// Filtered logs — ${totalRows} entries`));
  for (const row of [...headerRows, ...dataRows, ...fillerRows]) {
    report.push(codeParagraph(formatLogLine(row)));
  }
  report.push(spacer());

  // ── Root Cause ──
  report.push(
    heHeading('ניתוח שורש הבעיה', HeadingLevel.HEADING_2),
    heParagraph(incident.rootCause),
    spacer()
  );

  // ── Recommendations ── (numbered from 1)
  report.push(heHeading('המלצות', HeadingLevel.HEADING_2));
  for (const [index, recommendation] of incident.recommendations.entries()) {
    report.push(heParagraph(`${index + 1}. ${recommendation}`));
  }
  report.push(spacer());

  // ── Footer ──
  report.push(
    heParagraph('—— סוף הדוח ——', {
      alignment: AlignmentType.CENTER,
      size: SIZE_SMALL,
      color: '999999',
    })
  );

  // Document-wide defaults: RTL Hebrew runs. No paragraph alignment is set,
  // so bidi paragraphs default to the right in an RTL context.
  const defaultRunStyle = {
    rightToLeft: true,
    font: FONT_HEBREW,
    size: SIZE_BODY,
    sizeComplexScript: SIZE_BODY_CS,
    language: { bidirectional: 'he-IL' },
  };
  const pageSetup = {
    size: { width: '21cm', height: '29.7cm' }, // A4
    margin: {
      top: '2.54cm',
      bottom: '2.54cm',
      left: '2.54cm',
      right: '2.54cm',
    },
  };

  const doc = new Document({
    styles: {
      default: {
        document: {
          run: defaultRunStyle,
          paragraph: { spacing: { line: 276 } },
        },
      },
    },
    sections: [{
      properties: { page: pageSetup },
      children: report,
    }],
  });

  const packed = await Packer.toBuffer(doc);

  // Create the destination directory on demand before writing.
  const targetDir = path.dirname(outputPath);
  const targetDirExists = fs.existsSync(targetDir);
  if (!targetDirExists) {
    fs.mkdirSync(targetDir, { recursive: true });
  }

  fs.writeFileSync(outputPath, packed);
  return outputPath;
}
277
+
278
/**
 * Render a log row array as one nginx-combined-style line with two extra
 * quoted fields.
 *
 * Row layout: [ip, timestamp, method, request, status, bytes, referer, ua,
 * requestId, traceId]. Index 2 (method) is not written on its own.
 *
 * @param {Array} row - Log row
 * @returns {string} `ip - - timestamp "request" status bytes "referer" "ua" "requestId" "traceId"`
 */
function formatLogLine(row) {
  const prefix = `${row[0]} - - ${row[1]} "${row[3]}" ${row[4]} ${row[5]}`;
  // referer, ua, requestId, traceId: each double-quoted and space-separated
  const quotedTail = [6, 7, 8, 9].map((index) => `"${row[index]}"`).join(' ');
  return `${prefix} ${quotedTail}`;
}
285
+
286
/**
 * Build a table cell holding one small RTL Hebrew paragraph.
 *
 * Header cells get a solid 2F5496 background with bold white text; other
 * cells have no shading and 333333 text.
 *
 * @param {string} text - Cell text
 * @param {boolean} isHeader - Whether to apply header styling
 * @param {number} widthPct - Cell width as a percentage
 * @returns {TableCell}
 */
function createHebrewCell(text, isHeader, widthPct) {
  const shading = isHeader
    ? { type: ShadingType.SOLID, color: '2F5496', fill: '2F5496' }
    : undefined;
  const textColor = isHeader ? 'FFFFFF' : '333333';

  const cellRun = new TextRun({
    text,
    rightToLeft: true,
    font: FONT_HEBREW,
    size: SIZE_SMALL,
    sizeComplexScript: SIZE_SMALL,
    bold: isHeader,
    boldComplexScript: isHeader,
    color: textColor,
  });

  const cellParagraph = new Paragraph({
    bidirectional: true,
    spacing: { before: 40, after: 40 },
    children: [cellRun],
  });

  return new TableCell({
    width: { size: widthPct, type: WidthType.PERCENTAGE },
    shading,
    children: [cellParagraph],
  });
}
313
+
314
+ // ─── v5 DOCX Reader ─────────────────────────────────────────────────────────
315
+
316
/**
 * True when a run declares a Consolas/Courier ascii font in its run properties.
 */
function runUsesMonospaceFont(run) {
  const fonts = run.rPr?.rFonts;
  if (!fonts) {
    return false;
  }
  // The attribute key may or may not keep its namespace prefix.
  const fontName = fonts['@_w:ascii'] || fonts['@_ascii'] || '';
  return /consolas|courier/i.test(fontName);
}

/**
 * True when a paragraph carries the F2F2F2 shading used for code blocks.
 */
function paragraphHasCodeShading(para) {
  const shading = para.pPr?.shd;
  if (!shading) {
    return false;
  }
  const fill = shading['@_w:fill'] || shading['@_fill'] || '';
  return fill === 'F2F2F2' || fill === 'f2f2f2';
}

/**
 * Read a v5 log-embed DOCX file and extract its payload.
 *
 * Scans the top-level paragraphs of `word/document.xml`. A paragraph counts
 * as a code line when any run uses a Consolas/Courier font or the paragraph
 * has F2F2F2 shading. Blank lines and `//` label lines are skipped. The
 * remaining lines are parsed with parseLogLine and handed to decodeLogLines.
 *
 * @param {string} docxPath - Path to DOCX file
 * @returns {Promise<object>} Whatever decodeLogLines returns for the parsed rows
 * @throws {Error} When document.xml is missing, the body is empty, or no log
 *   lines are found
 */
async function readDocxV5(docxPath) {
  const docParsed = parseXmlFromZip(docxPath, 'word/document.xml');
  if (!docParsed) {
    throw new Error('Could not find document.xml in DOCX file');
  }

  const body = docParsed.document?.body;
  if (!body) throw new Error('Empty document body');

  const logLines = [];

  for (const para of ensureArray(body.p)) {
    let paraText = '';
    let isCode = paragraphHasCodeShading(para);

    for (const run of ensureArray(para.r)) {
      if (run.t !== undefined) {
        paraText += extractTextContent(run.t);
      }
      if (runUsesMonospaceFont(run)) {
        isCode = true;
      }
    }

    if (!isCode) continue;

    // Trim first so the blank check, the label check and the stored value
    // all see the same text.
    const line = paraText.trim();
    if (line.length === 0 || line.startsWith('//')) continue;

    logLines.push(line);
  }

  if (logLines.length === 0) {
    throw new Error('No log lines found in DOCX file. This may not be a v5 stegdoc file.');
  }

  // Parse log lines back into row arrays and decode them.
  const rows = logLines.map((line) => parseLogLine(line));
  return decodeLogLines(rows);
}
388
+
389
// nginx combined log format plus two extra quoted fields (requestId, traceId).
// Quoted fields may be empty so that every line formatLogLine can emit,
// including one with an empty referer/UA/id, parses back.
const LOG_LINE_PATTERN = /^(\S+)\s+-\s+-\s+(\[[^\]]+\])\s+"([^"]*)"\s+(\d+)\s+(\d+)\s+"([^"]*)"\s+"([^"]*)"\s+"([^"]*)"\s+"([^"]*)"$/;

/**
 * Parse a formatted log line string back into a row array.
 *
 * Input format:
 * `IP - - [timestamp] "request" status bytes "referer" "ua" "requestId" "traceId"`
 *
 * @param {string} line - One formatted log line
 * @returns {string[]} `[ip, timestamp, method, request, status, bytes, referer,
 *   ua, requestId, traceId]`. `method` is the first word of the request ('' if
 *   none). A line that does not match yields a row with the whole line in the
 *   request slot and '' everywhere else.
 */
function parseLogLine(line) {
  const match = line.match(LOG_LINE_PATTERN);

  if (!match) {
    // Best-effort fallback: keep the raw text in the request slot.
    return ['', '', '', line, '', '', '', '', '', ''];
  }

  const [, ip, timestamp, request, status, bytes, referer, ua, requestId, traceId] = match;

  // The method is not stored separately in the line; recover it from the request.
  const methodMatch = request.match(/^(\w+)\s/);
  const method = methodMatch ? methodMatch[1] : '';

  return [ip, timestamp, method, request, status, bytes, referer, ua, requestId, traceId];
}
411
+
412
/**
 * Detect whether a DOCX file is in v5 (log-embed) or legacy format.
 *
 * A file is reported as 'v5' when any run in a top-level paragraph contains
 * the path fragment `/api/v1/health/`. Everything else is 'legacy', including
 * files that cannot be read or parsed, so callers fall through to the legacy
 * reader.
 *
 * @param {string} docxPath - Path to DOCX file
 * @returns {'v5'|'legacy'}
 */
function detectDocxVersion(docxPath) {
  try {
    const body = parseXmlFromZip(docxPath, 'word/document.xml')?.document?.body;
    if (!body) return 'legacy';

    for (const para of ensureArray(body.p)) {
      for (const run of ensureArray(para.r)) {
        const text = extractTextContent(run.t || '');
        if (text.includes('/api/v1/health/')) {
          return 'v5';
        }
      }
    }

    // No v5 marker found. Whether or not a WHITENER_METADATA marker is
    // present, the answer is 'legacy', so no second scan is needed.
    return 'legacy';
  } catch {
    // Deliberate best effort: unreadable input is treated as legacy.
    return 'legacy';
  }
}
452
+
453
+ // ─── Legacy DOCX (v3/v4) ───────────────────────────────────────────────────
454
+
455
/**
 * Create a legacy DOCX file with base64 content (v3/v4 format).
 *
 * Layout: a size-1 paragraph `WHITENER_METADATA:<serialized metadata>`, a
 * `---` separator paragraph, then the base64 payload in Courier New.
 *
 * @param {object} options
 * @param {string} options.base64Content - Base64 payload text
 * @param {object} options.metadata - Metadata object (passed to serializeMetadata)
 * @param {string} options.outputPath - Output file path
 * @returns {Promise<string>} Path to created file
 */
async function createDocxWithBase64(options) {
  const { base64Content, metadata, outputPath } = options;
  const metadataStr = serializeMetadata(metadata);

  // Every paragraph of the legacy layout holds exactly one run.
  const singleRunParagraph = (runOptions) =>
    new Paragraph({ children: [new TextRun(runOptions)] });

  const legacySection = {
    properties: {},
    children: [
      singleRunParagraph({ text: `WHITENER_METADATA:${metadataStr}`, size: 1 }),
      singleRunParagraph({ text: '---', break: 1 }),
      singleRunParagraph({ text: base64Content, font: 'Courier New', size: 16 }),
    ],
  };
  const doc = new Document({ sections: [legacySection] });

  const packed = await Packer.toBuffer(doc);

  // Create the destination directory on demand before writing.
  const targetDir = path.dirname(outputPath);
  const targetDirExists = fs.existsSync(targetDir);
  if (!targetDirExists) {
    fs.mkdirSync(targetDir, { recursive: true });
  }

  fs.writeFileSync(outputPath, packed);
  return outputPath;
}
503
+
504
+ // ─── Unified Reader ─────────────────────────────────────────────────────────
505
+
506
/**
 * Read a DOCX file, auto-detecting v5 vs legacy format.
 *
 * v5 files are delegated to readDocxV5, and its result is returned with
 * `formatVersion: 'v5'`. Anything else is read as legacy: the flattened
 * document text is split as `WHITENER_METADATA:<metadata>---<base64>`.
 *
 * @param {string} docxPath - Path to DOCX file
 * @returns {Promise<object>} The v5 decode result plus `formatVersion: 'v5'`,
 *   or `{ base64Content, metadata, formatVersion: 'legacy' }`
 * @throws {Error} `DOCX file not found: …` when the path does not exist;
 *   `Failed to read DOCX file: …` for legacy parse/format failures, with the
 *   underlying error attached as `cause`. Errors from readDocxV5 propagate
 *   unchanged.
 */
async function readDocxBase64(docxPath) {
  if (!fs.existsSync(docxPath)) {
    throw new Error(`DOCX file not found: ${docxPath}`);
  }

  if (detectDocxVersion(docxPath) === 'v5') {
    const result = await readDocxV5(docxPath);
    return {
      ...result,
      formatVersion: 'v5',
    };
  }

  // Legacy path
  try {
    const docParsed = parseXmlFromZip(docxPath, 'word/document.xml');
    if (!docParsed) {
      throw new Error('Could not find document.xml in DOCX file');
    }

    const fullText = extractAllText(docParsed);
    const metadataMarker = 'WHITENER_METADATA:';
    const metadataStart = fullText.indexOf(metadataMarker);
    if (metadataStart === -1) {
      throw new Error('No metadata found in DOCX file. This may not be a stegdoc-encoded file.');
    }

    // The first "---" after the marker separates metadata from payload.
    const separatorIndex = fullText.indexOf('---', metadataStart);
    if (separatorIndex === -1) {
      throw new Error('Invalid file format: separator not found');
    }

    const metadataStr = fullText
      .substring(metadataStart + metadataMarker.length, separatorIndex)
      .trim();
    const metadata = parseMetadata(metadataStr);
    // Everything after the 3-character separator is the payload.
    const base64Content = fullText.substring(separatorIndex + 3).trim();

    return {
      base64Content,
      metadata,
      formatVersion: 'legacy',
    };
  } catch (error) {
    // Keep the original error (and its stack) reachable for debugging.
    throw new Error(`Failed to read DOCX file: ${error.message}`, { cause: error });
  }
}
557
+
558
/**
 * Extract all text content from a parsed DOCX document (legacy reader).
 *
 * Concatenates the `t` value of every run in every top-level paragraph
 * (`document.body.p[].r[].t`), in document order.
 *
 * @param {object} docParsed - Parsed document.xml
 * @returns {string} Concatenated text ('' when the document has no body)
 */
function extractAllText(docParsed) {
  const body = docParsed?.document?.body;
  if (!body) return '';

  // Append the text of one paragraph's runs to what has been gathered so far.
  const appendParagraph = (textSoFar, para) =>
    ensureArray(para.r).reduce(
      (text, run) => (run.t !== undefined ? text + extractTextContent(run.t) : text),
      textSoFar
    );

  return ensureArray(body.p).reduce(appendParagraph, '');
}
577
+
578
+ module.exports = {
579
+ // v5
580
+ createDocxV5,
581
+ readDocxV5,
582
+ detectDocxVersion,
583
+ // Legacy
584
+ createDocxWithBase64,
585
+ // Unified
586
+ readDocxBase64,
587
+ };