hwpkit-dev 0.0.2 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.npmignore +4 -2
- package/README.md +39 -2
- package/dist/index.d.mts +41 -14
- package/dist/index.d.ts +41 -14
- package/dist/index.js +3553 -1159
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +3553 -1159
- package/dist/index.mjs.map +1 -1
- package/package.json +2 -1
- package/playground/index.html +346 -0
- package/playground/main.ts +302 -0
- package/playground/vite.config.ts +16 -0
- package/src/contract/decoder.ts +1 -0
- package/src/contract/encoder.ts +6 -1
- package/src/core/BaseDecoder.ts +118 -0
- package/src/core/BaseEncoder.ts +146 -0
- package/src/decoders/docx/DocxDecoder.ts +743 -151
- package/src/decoders/html/HtmlDecoder.ts +366 -0
- package/src/decoders/hwp/HwpScanner.ts +325 -157
- package/src/decoders/hwpx/HwpxDecoder.ts +785 -297
- package/src/decoders/md/MdDecoder.ts +4 -4
- package/src/encoders/docx/DocxEncoder.ts +504 -240
- package/src/encoders/html/HtmlEncoder.ts +17 -19
- package/src/encoders/hwp/HwpEncoder.ts +1466 -859
- package/src/encoders/hwpx/HwpxEncoder.ts +1477 -469
- package/src/encoders/hwpx/constants.ts +148 -0
- package/src/encoders/hwpx/utils.ts +198 -0
- package/src/encoders/md/MdEncoder.ts +20 -15
- package/src/model/builders.ts +4 -4
- package/src/model/doc-props.ts +19 -5
- package/src/model/doc-tree.ts +12 -4
- package/src/pipeline/Pipeline.ts +7 -3
- package/src/pipeline/registry.ts +13 -2
- package/src/safety/StyleBridge.ts +51 -6
- package/src/toolkit/ArchiveKit.ts +56 -0
- package/src/toolkit/StyleMapper.ts +221 -0
- package/src/toolkit/UnitConverter.ts +138 -0
- package/src/toolkit/XmlKit.ts +0 -5
- package/test-styling.ts +210 -0
- package/hwp-analyze.ts +0 -90
- package/inspect-doc.ts +0 -57
- package/output_test.hwp +0 -0
- package/test-docx-to-hwp.ts +0 -45
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import type { Decoder } from '../../contract/decoder';
|
|
2
|
-
import type { DocRoot, ContentNode, ParaNode, SpanNode, ImgNode } from '../../model/doc-tree';
|
|
2
|
+
import type { DocRoot, ContentNode, ParaNode, SpanNode, ImgNode, GridNode } from '../../model/doc-tree';
|
|
3
3
|
import type { Outcome } from '../../contract/result';
|
|
4
4
|
import type { Align, Stroke, StrokeKind, PageDims, TextProps, ParaProps, CellProps, GridProps } from '../../model/doc-props';
|
|
5
5
|
import { succeed, fail } from '../../contract/result';
|
|
6
6
|
import { buildRoot, buildSheet, buildPara, buildSpan, buildGrid, buildRow, buildCell, buildImg } from '../../model/builders';
|
|
7
7
|
import { ShieldedParser } from '../../safety/ShieldedParser';
|
|
8
8
|
import { BinaryKit } from '../../toolkit/BinaryKit';
|
|
9
|
+
import { TextKit } from '../../toolkit/TextKit';
|
|
9
10
|
import { Metric, safeHex, safeFont } from '../../safety/StyleBridge';
|
|
10
11
|
import { registry } from '../../pipeline/registry';
|
|
11
12
|
import { A4 } from '../../model/doc-props';
|
|
@@ -38,10 +39,11 @@ function isTableTag(t: number) { return t === TAG_TABLE_A || t === TAG_TABLE_B;
|
|
|
38
39
|
function isCellTag(t: number) { return t === TAG_CELL_A || t === TAG_CELL_B || t === TAG_LIST_HEADER; }
|
|
39
40
|
|
|
40
41
|
// CTRL_HEADER ctrlId values (UINT32-LE as ASCII)
|
|
41
|
-
const CTRL_TABLE = 0x74626C20; // ' lbt'
|
|
42
|
+
const CTRL_TABLE = 0x74626C20; // ' lbt' = 표(table)
|
|
42
43
|
const CTRL_IMAGE = 0x696D6720; // 'img '
|
|
43
44
|
const CTRL_OBJ = 0x6F626A20; // 'obj '
|
|
44
45
|
const CTRL_FIG = 0x66696720; // 'fig '
|
|
46
|
+
const CTRL_GSO = 0x67736F20; // 'gso ' = 그리기 객체 (drawing object, contains embedded images)
|
|
45
47
|
|
|
46
48
|
/* ═══════════════════════════════════════════════════════════════
|
|
47
49
|
Types
|
|
@@ -70,6 +72,7 @@ interface HwpParaShape {
|
|
|
70
72
|
spaceBefore: number;
|
|
71
73
|
spaceAfter: number;
|
|
72
74
|
lineSpacing: number;
|
|
75
|
+
leftMargin: number;
|
|
73
76
|
indent: number;
|
|
74
77
|
}
|
|
75
78
|
|
|
@@ -221,11 +224,12 @@ function parseCharShape(d: Uint8Array): HwpCharShape {
|
|
|
221
224
|
const ALIGN_TBL: Record<number, Align> = { 0: 'justify', 1: 'left', 2: 'right', 3: 'center', 4: 'justify' };
|
|
222
225
|
|
|
223
226
|
function parseParaShape(d: Uint8Array): HwpParaShape {
|
|
224
|
-
if (d.length < 4) return { align: 'left', spaceBefore: 0, spaceAfter: 0, lineSpacing: 160, indent: 0 };
|
|
227
|
+
if (d.length < 4) return { align: 'left', spaceBefore: 0, spaceAfter: 0, lineSpacing: 160, leftMargin: 0, indent: 0 };
|
|
225
228
|
const attr = BinaryKit.readU32LE(d, 0);
|
|
226
229
|
return {
|
|
227
230
|
align: ALIGN_TBL[(attr >> 2) & 0x7] ?? 'left',
|
|
228
|
-
|
|
231
|
+
leftMargin: d.length >= 8 ? i32(d, 4) : 0, // offset 4: leftMargin (들여쓰기)
|
|
232
|
+
indent: d.length >= 16 ? i32(d, 12) : 0, // offset 12: first-line indent
|
|
229
233
|
spaceBefore: d.length >= 20 ? i32(d, 16) : 0,
|
|
230
234
|
spaceAfter: d.length >= 24 ? i32(d, 20) : 0,
|
|
231
235
|
lineSpacing: d.length >= 28 ? i32(d, 24) : 160,
|
|
@@ -275,8 +279,13 @@ function parseBorderFill(d: Uint8Array): HwpBorderFill {
|
|
|
275
279
|
Body section parsing
|
|
276
280
|
═══════════════════════════════════════════════════════════════ */
|
|
277
281
|
|
|
282
|
+
// gsoCtx: shared mutable counter for 'gso ' drawing objects.
|
|
283
|
+
// Each 'gso ' CTRL_HEADER encountered increments this counter.
|
|
284
|
+
// objectMap is keyed by 0-based gso order = sequential BinData insertion order.
|
|
285
|
+
interface GsoCtx { count: number }
|
|
286
|
+
|
|
278
287
|
function parseBody(
|
|
279
|
-
raw: Uint8Array, compressed: boolean, di: DocInfo, shield: ShieldedParser,
|
|
288
|
+
raw: Uint8Array, compressed: boolean, di: DocInfo, shield: ShieldedParser, gsoCtx: GsoCtx,
|
|
280
289
|
): { content: ContentNode[]; pageDims?: PageDims } {
|
|
281
290
|
const recs = parseRecords(compressed ? tryInflate(raw) : raw);
|
|
282
291
|
const content: ContentNode[] = [];
|
|
@@ -296,7 +305,7 @@ function parseBody(
|
|
|
296
305
|
i++; // already handled above; skip at top level
|
|
297
306
|
} else if (recs[i].tag === TAG_PARA_HEADER) {
|
|
298
307
|
const r = shield.guard(
|
|
299
|
-
() => parseParagraphGroup(recs, i, di, shield),
|
|
308
|
+
() => parseParagraphGroup(recs, i, di, shield, gsoCtx),
|
|
300
309
|
{ nodes: [] as ContentNode[], next: i + 1 },
|
|
301
310
|
`hwp:para@${i}`,
|
|
302
311
|
);
|
|
@@ -312,7 +321,7 @@ function parseBody(
|
|
|
312
321
|
/* ── Paragraph group ────────────────────────────────────────── */
|
|
313
322
|
|
|
314
323
|
function parseParagraphGroup(
|
|
315
|
-
recs: HwpRecord[], start: number, di: DocInfo, shield: ShieldedParser,
|
|
324
|
+
recs: HwpRecord[], start: number, di: DocInfo, shield: ShieldedParser, gsoCtx: GsoCtx,
|
|
316
325
|
): { nodes: ContentNode[]; next: number } {
|
|
317
326
|
const hdr = recs[start];
|
|
318
327
|
const lv = hdr.level;
|
|
@@ -324,7 +333,8 @@ function parseParagraphGroup(
|
|
|
324
333
|
let text: ParaTextResult | null = null;
|
|
325
334
|
let csPairs: [number, number][] = [];
|
|
326
335
|
const grids: ContentNode[] = [];
|
|
327
|
-
|
|
336
|
+
// imgId: for 'gso' uses sequential gsoCtx.count; for others uses flags-based objId
|
|
337
|
+
const ctrlHeaders: { ctrlId: number; imgId: number; wPt: number; hPt: number }[] = [];
|
|
328
338
|
let i = start + 1;
|
|
329
339
|
|
|
330
340
|
while (i < recs.length && recs[i].level > lv) {
|
|
@@ -339,13 +349,23 @@ function parseParagraphGroup(
|
|
|
339
349
|
} else if (r.tag === TAG_CTRL_HEADER && r.level === lv + 1) {
|
|
340
350
|
if (r.data.length >= 4) {
|
|
341
351
|
const ctrlId = BinaryKit.readU32LE(r.data, 0);
|
|
342
|
-
|
|
343
|
-
|
|
344
|
-
|
|
352
|
+
|
|
353
|
+
// HWP 5.0 general-object layout:
|
|
354
|
+
// [0:4] ctrlId [4:4] flags [8:4] xOff [12:4] yOff
|
|
355
|
+
// [16:4] width(HWPUNIT) [20:4] height(HWPUNIT)
|
|
356
|
+
const MAX_HWP = 1_000_000;
|
|
357
|
+
const rawW = r.data.length >= 24 ? BinaryKit.readU32LE(r.data, 16) : 0;
|
|
358
|
+
const rawH = r.data.length >= 28 ? BinaryKit.readU32LE(r.data, 20) : 0;
|
|
359
|
+
const wPt = rawW > 0 && rawW < MAX_HWP ? Metric.hwpToPt(rawW) : 0;
|
|
360
|
+
const hPt = rawH > 0 && rawH < MAX_HWP ? Metric.hwpToPt(rawH) : 0;
|
|
361
|
+
|
|
362
|
+
// 'gso ' (그리기 객체) uses sequential counter; others use flags-based id
|
|
363
|
+
const imgId = ctrlId === CTRL_GSO ? gsoCtx.count++ : (r.data.length >= 6 ? BinaryKit.readU16LE(r.data, 4) : 0);
|
|
364
|
+
ctrlHeaders.push({ ctrlId, imgId, wPt, hPt });
|
|
345
365
|
|
|
346
366
|
if (ctrlId === CTRL_TABLE) {
|
|
347
367
|
const tr = shield.guard(
|
|
348
|
-
() => parseTableCtrl(recs, i, di, shield),
|
|
368
|
+
() => parseTableCtrl(recs, i, di, shield, gsoCtx),
|
|
349
369
|
{ grid: null, next: skipKids(recs, i) },
|
|
350
370
|
`hwp:tbl@${i}`,
|
|
351
371
|
);
|
|
@@ -362,34 +382,29 @@ function parseParagraphGroup(
|
|
|
362
382
|
}
|
|
363
383
|
}
|
|
364
384
|
|
|
365
|
-
// Match extended controls with CTRL_HEADER entries
|
|
366
|
-
if (text && ctrlHeaders.length > 0) {
|
|
367
|
-
for (let ci = 0; ci < text.controls.length; ci++) {
|
|
368
|
-
if (ci < ctrlHeaders.length) {
|
|
369
|
-
text.controls[ci].ctrlId = ctrlHeaders[ci].ctrlId;
|
|
370
|
-
text.controls[ci].matched = true;
|
|
371
|
-
}
|
|
372
|
-
}
|
|
373
|
-
}
|
|
374
|
-
|
|
375
385
|
const nodes: ContentNode[] = [];
|
|
376
386
|
|
|
377
387
|
// Build paragraph from text and inline controls (images)
|
|
378
388
|
if (text && (text.chars.length > 0 || text.controls.length > 0)) {
|
|
379
389
|
const paraContent: (SpanNode | ContentNode)[] = [];
|
|
380
390
|
|
|
381
|
-
// Process text chars and controls together
|
|
382
391
|
if (text.chars.length > 0) {
|
|
383
392
|
const spans = resolveCharShapes(text.chars, csPairs, di);
|
|
384
393
|
paraContent.push(...spans);
|
|
385
394
|
}
|
|
386
395
|
|
|
387
|
-
//
|
|
396
|
+
// Image placeholder spans: only for actual image controls.
|
|
397
|
+
// Non-image controls (footnotes, TOC entries, etc.) are silently skipped.
|
|
388
398
|
if (text.controls.length > 0) {
|
|
389
399
|
for (let ci = 0; ci < text.controls.length; ci++) {
|
|
390
|
-
|
|
391
|
-
//
|
|
392
|
-
|
|
400
|
+
const ch = ctrlHeaders[ci];
|
|
401
|
+
if (!ch) continue; // anchor-only ctrl (gso is sibling, not inline)
|
|
402
|
+
const isImg = ch.ctrlId === CTRL_IMAGE || ch.ctrlId === CTRL_FIG || ch.ctrlId === CTRL_OBJ || ch.ctrlId === CTRL_GSO;
|
|
403
|
+
if (!isImg) continue; // skip footnotes, TOC, page num, etc.
|
|
404
|
+
const dimStr = (ch.wPt > 0 && ch.hPt > 0)
|
|
405
|
+
? `_W${Math.round(ch.wPt)}_H${Math.round(ch.hPt)}`
|
|
406
|
+
: '';
|
|
407
|
+
paraContent.push(buildSpan(`__EXT_${ch.imgId}${dimStr}__`));
|
|
393
408
|
}
|
|
394
409
|
}
|
|
395
410
|
|
|
@@ -511,7 +526,7 @@ function styledSpan(text: string, shapeId: number, di: DocInfo): SpanNode {
|
|
|
511
526
|
/* ── Table control parsing ──────────────────────────────────── */
|
|
512
527
|
|
|
513
528
|
function parseTableCtrl(
|
|
514
|
-
recs: HwpRecord[], ctrlIdx: number, di: DocInfo, shield: ShieldedParser,
|
|
529
|
+
recs: HwpRecord[], ctrlIdx: number, di: DocInfo, shield: ShieldedParser, gsoCtx: GsoCtx,
|
|
515
530
|
): { grid: ContentNode | null; next: number } {
|
|
516
531
|
const ctrlLv = recs[ctrlIdx].level;
|
|
517
532
|
let i = ctrlIdx + 1;
|
|
@@ -567,15 +582,15 @@ function parseTableCtrl(
|
|
|
567
582
|
const rowCnt = tblData.length >= 6 ? BinaryKit.readU16LE(tblData, 4) : 1;
|
|
568
583
|
const colCnt = tblData.length >= 8 ? BinaryKit.readU16LE(tblData, 6) : 1;
|
|
569
584
|
|
|
570
|
-
interface PC { row: number; col: number; cs: number; rs: number; widthHwp: number; props: CellProps;
|
|
585
|
+
interface PC { row: number; col: number; cs: number; rs: number; widthHwp: number; heightHwp?: number; props: CellProps; cellChildren: (ParaNode | GridNode)[] }
|
|
571
586
|
const parsed: PC[] = [];
|
|
572
587
|
|
|
573
588
|
for (let ci = 0; ci < cells.length; ci++) {
|
|
574
589
|
const c = cells[ci];
|
|
575
590
|
const seqIdx = ci;
|
|
576
591
|
const pc = shield.guard(
|
|
577
|
-
() => parseCellRec(c.data, c.tag, recs, c.cStart, c.cEnd, di, shield, seqIdx, colCnt),
|
|
578
|
-
{ row: Math.floor(ci / (colCnt || 1)), col: ci % (colCnt || 1), cs: 1, rs: 1, widthHwp: 0, props: {},
|
|
592
|
+
() => parseCellRec(c.data, c.tag, recs, c.cStart, c.cEnd, di, shield, seqIdx, colCnt, gsoCtx),
|
|
593
|
+
{ row: Math.floor(ci / (colCnt || 1)), col: ci % (colCnt || 1), cs: 1, rs: 1, widthHwp: 0, heightHwp: undefined, props: {}, cellChildren: [buildPara([buildSpan('')])] },
|
|
579
594
|
`hwp:cell@${c.cStart}`,
|
|
580
595
|
);
|
|
581
596
|
parsed.push(pc);
|
|
@@ -602,9 +617,11 @@ function parseTableCtrl(
|
|
|
602
617
|
}
|
|
603
618
|
}
|
|
604
619
|
// Pass 2: for columns still 0, try to derive from multi-span cells
|
|
620
|
+
// Sort by span size ascending so smaller, more precise spans fill widths before larger spans
|
|
605
621
|
const zeroColumns = colWidthsPt.filter(w => w === 0).length;
|
|
606
622
|
if (zeroColumns > 0) {
|
|
607
|
-
|
|
623
|
+
const spanCells = parsed.filter(c => c.cs > 1 && c.widthHwp > 0).sort((a, b) => a.cs - b.cs);
|
|
624
|
+
for (const c of spanCells) {
|
|
608
625
|
if (c.cs > 1 && c.widthHwp > 0) {
|
|
609
626
|
// Subtract known column widths from the span
|
|
610
627
|
let known = 0;
|
|
@@ -624,13 +641,37 @@ function parseTableCtrl(
|
|
|
624
641
|
}
|
|
625
642
|
}
|
|
626
643
|
|
|
644
|
+
// Post-process: clamp near-zero column widths (< 1pt = floating-point artifact) to minimum 1pt
|
|
645
|
+
for (let i = 0; i < colWidthsPt.length; i++) {
|
|
646
|
+
if (colWidthsPt[i] > 0 && colWidthsPt[i] < 1) colWidthsPt[i] = 1;
|
|
647
|
+
}
|
|
648
|
+
|
|
627
649
|
const rows = [];
|
|
628
650
|
for (let r = 0; r < actualRowCnt; r++) {
|
|
629
651
|
const rc = parsed.filter(c => c.row === r).sort((a, b) => a.col - b.col);
|
|
630
652
|
if (rc.length === 0) continue;
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
653
|
+
|
|
654
|
+
// Calculate row height — prefer rs=1 cells (exact per-row height)
|
|
655
|
+
let rowHeightPt: number | undefined = undefined;
|
|
656
|
+
for (const c of rc) {
|
|
657
|
+
if (c.heightHwp && c.heightHwp > 0 && c.rs === 1) {
|
|
658
|
+
const hPt = Metric.hwpToPt(c.heightHwp);
|
|
659
|
+
if (rowHeightPt == null || hPt > rowHeightPt) rowHeightPt = hPt;
|
|
660
|
+
}
|
|
661
|
+
}
|
|
662
|
+
// Fallback: all cells span multiple rows → approximate height per row
|
|
663
|
+
if (rowHeightPt == null) {
|
|
664
|
+
for (const c of rc) {
|
|
665
|
+
if (c.heightHwp && c.heightHwp > 0) {
|
|
666
|
+
const hPt = Metric.hwpToPt(c.heightHwp) / c.rs;
|
|
667
|
+
if (rowHeightPt == null || hPt > rowHeightPt) rowHeightPt = hPt;
|
|
668
|
+
}
|
|
669
|
+
}
|
|
670
|
+
}
|
|
671
|
+
|
|
672
|
+
rows.push(buildRow(rc.map(c => {
|
|
673
|
+
return buildCell(c.cellChildren, { cs: c.cs, rs: c.rs, props: c.props });
|
|
674
|
+
}), rowHeightPt));
|
|
634
675
|
}
|
|
635
676
|
if (rows.length === 0) return { grid: null, next: i };
|
|
636
677
|
|
|
@@ -659,10 +700,11 @@ function parseTableCtrl(
|
|
|
659
700
|
|
|
660
701
|
function parseCellRec(
|
|
661
702
|
d: Uint8Array, tag: number, recs: HwpRecord[], cStart: number, cEnd: number,
|
|
662
|
-
di: DocInfo, shield: ShieldedParser, seqIdx: number, colCnt: number,
|
|
703
|
+
di: DocInfo, shield: ShieldedParser, seqIdx: number, colCnt: number, gsoCtx: GsoCtx,
|
|
663
704
|
) {
|
|
664
705
|
let col: number, row: number, cs = 1, rs = 1;
|
|
665
706
|
let widthHwp = 0;
|
|
707
|
+
let heightHwp = 0;
|
|
666
708
|
const props: CellProps = {};
|
|
667
709
|
|
|
668
710
|
const attr = d.length >= 6 ? BinaryKit.readU32LE(d, 2) : 0;
|
|
@@ -670,57 +712,55 @@ function parseCellRec(
|
|
|
670
712
|
if (va === 1) props.va = 'mid';
|
|
671
713
|
else if (va === 2) props.va = 'bot';
|
|
672
714
|
|
|
715
|
+
const HWP_PAD_LR_DEFAULT = 360;
|
|
716
|
+
const HWP_PAD_TB_DEFAULT = 141;
|
|
717
|
+
|
|
673
718
|
if (tag === TAG_LIST_HEADER && d.length >= 22) {
|
|
674
|
-
// LIST_HEADER with cell-specific fields
|
|
675
|
-
// offset 8: colAddr, offset 10: rowAddr (HWP 5.0 spec)
|
|
676
719
|
col = BinaryKit.readU16LE(d, 8);
|
|
677
720
|
row = BinaryKit.readU16LE(d, 10);
|
|
678
721
|
cs = Math.max(1, BinaryKit.readU16LE(d, 12));
|
|
679
722
|
rs = Math.max(1, BinaryKit.readU16LE(d, 14));
|
|
680
723
|
widthHwp = BinaryKit.readU32LE(d, 16);
|
|
681
|
-
|
|
682
|
-
|
|
683
|
-
|
|
684
|
-
const
|
|
685
|
-
if (
|
|
686
|
-
|
|
687
|
-
|
|
688
|
-
|
|
689
|
-
props.bot = toStroke(bf.borders[3]);
|
|
690
|
-
}
|
|
691
|
-
if (bf.bgColor && bf.bgColor !== 'FFFFFF') props.bg = bf.bgColor;
|
|
724
|
+
heightHwp = d.length >= 24 ? BinaryKit.readU32LE(d, 20) : 0;
|
|
725
|
+
if (d.length >= 32) {
|
|
726
|
+
const pL = BinaryKit.readU16LE(d, 24); const pR = BinaryKit.readU16LE(d, 26);
|
|
727
|
+
const pT = BinaryKit.readU16LE(d, 28); const pB = BinaryKit.readU16LE(d, 30);
|
|
728
|
+
if (pL !== HWP_PAD_LR_DEFAULT) props.padL = Metric.hwpToPt(pL);
|
|
729
|
+
if (pR !== HWP_PAD_LR_DEFAULT) props.padR = Metric.hwpToPt(pR);
|
|
730
|
+
if (pT !== HWP_PAD_TB_DEFAULT) props.padT = Metric.hwpToPt(pT);
|
|
731
|
+
if (pB !== HWP_PAD_TB_DEFAULT) props.padB = Metric.hwpToPt(pB);
|
|
692
732
|
}
|
|
733
|
+
const bfId = d.length >= 34 ? BinaryKit.readU16LE(d, 32) : 0;
|
|
734
|
+
if (bfId > 0 && bfId <= di.borderFills.length) applyCellBorderFill(di.borderFills[bfId - 1], props);
|
|
693
735
|
} else if (tag !== TAG_LIST_HEADER) {
|
|
694
|
-
// Full CELL record with position/span/borderFill
|
|
695
736
|
col = d.length >= 8 ? BinaryKit.readU16LE(d, 6) : seqIdx % (colCnt || 1);
|
|
696
737
|
row = d.length >= 10 ? BinaryKit.readU16LE(d, 8) : Math.floor(seqIdx / (colCnt || 1));
|
|
697
738
|
cs = d.length >= 12 ? Math.max(1, BinaryKit.readU16LE(d, 10)) : 1;
|
|
698
739
|
rs = d.length >= 14 ? Math.max(1, BinaryKit.readU16LE(d, 12)) : 1;
|
|
699
740
|
widthHwp = d.length >= 18 ? BinaryKit.readU32LE(d, 14) : 0;
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
const
|
|
704
|
-
if (
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
props.bot = toStroke(bf.borders[3]);
|
|
709
|
-
}
|
|
710
|
-
if (bf.bgColor && bf.bgColor !== 'FFFFFF') props.bg = bf.bgColor;
|
|
741
|
+
heightHwp = d.length >= 22 ? BinaryKit.readU32LE(d, 18) : 0;
|
|
742
|
+
if (d.length >= 30) {
|
|
743
|
+
const pL = BinaryKit.readU16LE(d, 22); const pR = BinaryKit.readU16LE(d, 24);
|
|
744
|
+
const pT = BinaryKit.readU16LE(d, 26); const pB = BinaryKit.readU16LE(d, 28);
|
|
745
|
+
if (pL !== HWP_PAD_LR_DEFAULT) props.padL = Metric.hwpToPt(pL);
|
|
746
|
+
if (pR !== HWP_PAD_LR_DEFAULT) props.padR = Metric.hwpToPt(pR);
|
|
747
|
+
if (pT !== HWP_PAD_TB_DEFAULT) props.padT = Metric.hwpToPt(pT);
|
|
748
|
+
if (pB !== HWP_PAD_TB_DEFAULT) props.padB = Metric.hwpToPt(pB);
|
|
711
749
|
}
|
|
750
|
+
const bfId = d.length >= 32 ? BinaryKit.readU16LE(d, 30) : 0;
|
|
751
|
+
if (bfId > 0 && bfId <= di.borderFills.length) applyCellBorderFill(di.borderFills[bfId - 1], props);
|
|
712
752
|
} else {
|
|
713
|
-
// Fallback: LIST_HEADER too short, compute sequentially
|
|
714
753
|
row = Math.floor(seqIdx / (colCnt || 1));
|
|
715
754
|
col = seqIdx % (colCnt || 1);
|
|
716
755
|
}
|
|
717
756
|
|
|
718
|
-
|
|
719
|
-
const
|
|
757
|
+
const cellChildren: (ParaNode | GridNode)[] = [];
|
|
758
|
+
const MAX_HWP = 1_000_000;
|
|
720
759
|
let k = cStart;
|
|
760
|
+
|
|
721
761
|
while (k < cEnd) {
|
|
722
762
|
if (recs[k].tag === TAG_PARA_HEADER) {
|
|
723
|
-
//
|
|
763
|
+
// Parse paragraph inside cell — also extracts nested tables within the paragraph
|
|
724
764
|
const r = shield.guard(
|
|
725
765
|
() => {
|
|
726
766
|
const hdr = recs[k];
|
|
@@ -729,24 +769,91 @@ function parseCellRec(
|
|
|
729
769
|
const ps = di.paraShapes[psId];
|
|
730
770
|
let txt: ParaTextResult | null = null;
|
|
731
771
|
let csp: [number, number][] = [];
|
|
772
|
+
const ctrlHdrs: { ctrlId: number; imgId: number; wPt: number; hPt: number }[] = [];
|
|
773
|
+
const innerGrids: GridNode[] = [];
|
|
732
774
|
let j = k + 1;
|
|
733
775
|
while (j < cEnd && recs[j].level > lv) {
|
|
734
776
|
if (recs[j].tag === TAG_PARA_TEXT) { txt = decodeParaText(recs[j].data); j++; }
|
|
735
777
|
else if (recs[j].tag === TAG_PARA_CHAR_SHAPE) { csp = parseCharShapePairs(recs[j].data); j++; }
|
|
778
|
+
else if (recs[j].tag === TAG_CTRL_HEADER && recs[j].level === lv + 1) {
|
|
779
|
+
if (recs[j].data.length >= 4) {
|
|
780
|
+
const ctrlId = BinaryKit.readU32LE(recs[j].data, 0);
|
|
781
|
+
if (ctrlId === CTRL_TABLE) {
|
|
782
|
+
// Nested table inside a cell paragraph — recurse into parseTableCtrl
|
|
783
|
+
const nestedTr = shield.guard(
|
|
784
|
+
() => parseTableCtrl(recs, j, di, shield, gsoCtx),
|
|
785
|
+
{ grid: null, next: skipKids(recs, j) },
|
|
786
|
+
`hwp:innerNestedTbl@${j}`,
|
|
787
|
+
);
|
|
788
|
+
if (nestedTr.grid) innerGrids.push(nestedTr.grid as GridNode);
|
|
789
|
+
j = nestedTr.next;
|
|
790
|
+
} else {
|
|
791
|
+
const rawW = recs[j].data.length >= 24 ? BinaryKit.readU32LE(recs[j].data, 16) : 0;
|
|
792
|
+
const rawH = recs[j].data.length >= 28 ? BinaryKit.readU32LE(recs[j].data, 20) : 0;
|
|
793
|
+
const wPt = rawW > 0 && rawW < MAX_HWP ? Metric.hwpToPt(rawW) : 0;
|
|
794
|
+
const hPt = rawH > 0 && rawH < MAX_HWP ? Metric.hwpToPt(rawH) : 0;
|
|
795
|
+
const imgId = ctrlId === CTRL_GSO ? gsoCtx.count++ : (recs[j].data.length >= 6 ? BinaryKit.readU16LE(recs[j].data, 4) : 0);
|
|
796
|
+
ctrlHdrs.push({ ctrlId, imgId, wPt, hPt });
|
|
797
|
+
j = skipKids(recs, j);
|
|
798
|
+
}
|
|
799
|
+
} else {
|
|
800
|
+
j = skipKids(recs, j);
|
|
801
|
+
}
|
|
802
|
+
}
|
|
736
803
|
else j++;
|
|
737
804
|
}
|
|
738
|
-
const
|
|
739
|
-
|
|
805
|
+
const paraContent: (SpanNode | ContentNode)[] = [];
|
|
806
|
+
if (txt && txt.chars.length > 0) paraContent.push(...resolveCharShapes(txt.chars, csp, di));
|
|
807
|
+
if (txt && txt.controls.length > 0) {
|
|
808
|
+
for (let ci = 0; ci < txt.controls.length; ci++) {
|
|
809
|
+
const ch = ctrlHdrs[ci];
|
|
810
|
+
if (!ch) continue;
|
|
811
|
+
const isImg = ch.ctrlId === CTRL_IMAGE || ch.ctrlId === CTRL_FIG || ch.ctrlId === CTRL_OBJ || ch.ctrlId === CTRL_GSO;
|
|
812
|
+
if (!isImg) continue;
|
|
813
|
+
const dimStr = (ch.wPt > 0 && ch.hPt > 0) ? `_W${Math.round(ch.wPt)}_H${Math.round(ch.hPt)}` : '';
|
|
814
|
+
paraContent.push(buildSpan(`__EXT_${ch.imgId}${dimStr}__`));
|
|
815
|
+
}
|
|
816
|
+
}
|
|
817
|
+
const kids = paraContent.length > 0 ? paraContent as any : [buildSpan('')];
|
|
818
|
+
const items: (ParaNode | GridNode)[] = [buildPara(kids, buildParaProps(ps)), ...innerGrids];
|
|
819
|
+
return { items, next: j };
|
|
740
820
|
},
|
|
741
|
-
{
|
|
821
|
+
{ items: [buildPara([buildSpan('')])] as (ParaNode | GridNode)[], next: k + 1 },
|
|
742
822
|
`hwp:cellP@${k}`,
|
|
743
823
|
);
|
|
744
|
-
|
|
824
|
+
cellChildren.push(...r.items);
|
|
745
825
|
k = r.next;
|
|
826
|
+
} else if (recs[k].tag === TAG_CTRL_HEADER && recs[k].data.length >= 4) {
|
|
827
|
+
// CTRL_HEADER at cell level (sibling of PARA_HEADER) — anchored 'gso' images and outer-level nested tables
|
|
828
|
+
const cellCtrlId = BinaryKit.readU32LE(recs[k].data, 0);
|
|
829
|
+
if (cellCtrlId === CTRL_GSO) {
|
|
830
|
+
const gsoId = gsoCtx.count++;
|
|
831
|
+
const rawW = recs[k].data.length >= 24 ? BinaryKit.readU32LE(recs[k].data, 16) : 0;
|
|
832
|
+
const rawH = recs[k].data.length >= 28 ? BinaryKit.readU32LE(recs[k].data, 20) : 0;
|
|
833
|
+
const wPt = rawW > 0 && rawW < MAX_HWP ? Metric.hwpToPt(rawW) : 0;
|
|
834
|
+
const hPt = rawH > 0 && rawH < MAX_HWP ? Metric.hwpToPt(rawH) : 0;
|
|
835
|
+
const dimStr = (wPt > 0 && hPt > 0) ? `_W${Math.round(wPt)}_H${Math.round(hPt)}` : '';
|
|
836
|
+
cellChildren.push(buildPara([buildSpan(`__EXT_${gsoId}${dimStr}__`)]));
|
|
837
|
+
k = skipKids(recs, k);
|
|
838
|
+
} else if (cellCtrlId === CTRL_TABLE) {
|
|
839
|
+
const tr = shield.guard(
|
|
840
|
+
() => parseTableCtrl(recs, k, di, shield, gsoCtx),
|
|
841
|
+
{ grid: null, next: skipKids(recs, k) },
|
|
842
|
+
`hwp:nestedTbl@${k}`,
|
|
843
|
+
);
|
|
844
|
+
if (tr.grid) cellChildren.push(tr.grid as GridNode);
|
|
845
|
+
k = tr.next;
|
|
846
|
+
} else {
|
|
847
|
+
k = skipKids(recs, k);
|
|
848
|
+
}
|
|
746
849
|
} else { k++; }
|
|
747
850
|
}
|
|
748
851
|
|
|
749
|
-
return {
|
|
852
|
+
return {
|
|
853
|
+
row, col, cs, rs, props, widthHwp,
|
|
854
|
+
heightHwp: heightHwp || undefined,
|
|
855
|
+
cellChildren: cellChildren.length ? cellChildren : [buildPara([buildSpan('')])],
|
|
856
|
+
};
|
|
750
857
|
}
|
|
751
858
|
|
|
752
859
|
/* ── PAGE_DEF ───────────────────────────────────────────────── */
|
|
@@ -788,6 +895,18 @@ function toStroke(b: { type: number; widthPt: number; color: string }): Stroke {
|
|
|
788
895
|
return { kind: BORDER_KIND[b.type] ?? 'solid', pt: b.widthPt, color: b.color };
|
|
789
896
|
}
|
|
790
897
|
|
|
898
|
+
// Apply borderFill to CellProps. Preserve explicit NONE so DOCX tcBorders can
|
|
899
|
+
// override the table-level tblBorders. Filtering NONE would let tblBorders bleed through.
|
|
900
|
+
function applyCellBorderFill(bf: HwpBorderFill, props: CellProps): void {
|
|
901
|
+
if (bf.borders.length >= 4) {
|
|
902
|
+
props.left = toStroke(bf.borders[0]);
|
|
903
|
+
props.right = toStroke(bf.borders[1]);
|
|
904
|
+
props.top = toStroke(bf.borders[2]);
|
|
905
|
+
props.bot = toStroke(bf.borders[3]);
|
|
906
|
+
}
|
|
907
|
+
if (bf.bgColor && bf.bgColor !== 'FFFFFF') props.bg = bf.bgColor;
|
|
908
|
+
}
|
|
909
|
+
|
|
791
910
|
function strokeFromBF(bfId: number, di: DocInfo): Stroke | undefined {
|
|
792
911
|
if (bfId <= 0 || bfId > di.borderFills.length) return undefined;
|
|
793
912
|
const bf = di.borderFills[bfId - 1];
|
|
@@ -803,7 +922,11 @@ function buildParaProps(ps?: HwpParaShape): ParaProps {
|
|
|
803
922
|
if (ps.spaceBefore > 0) p.spaceBefore = Metric.hwpToPt(ps.spaceBefore);
|
|
804
923
|
if (ps.spaceAfter > 0) p.spaceAfter = Metric.hwpToPt(ps.spaceAfter);
|
|
805
924
|
if (ps.lineSpacing > 0 && ps.lineSpacing !== 160) p.lineHeight = ps.lineSpacing / 100;
|
|
806
|
-
|
|
925
|
+
// leftMargin (offset 4) = 문단 몸체 왼쪽 여백 → leftMargin (pt), ensure non-negative
|
|
926
|
+
const leftMarginPt = Math.max(0, Metric.hwpToPt(ps.leftMargin));
|
|
927
|
+
if (leftMarginPt > 0) p.leftMargin = leftMarginPt;
|
|
928
|
+
// indent (offset 12) = 첫 줄 들여쓰기(양수) / 내어쓰기(음수) → firstLineIndentPt
|
|
929
|
+
if (ps.indent !== 0) p.firstLineIndentPt = Metric.hwpToPt(ps.indent);
|
|
807
930
|
return p;
|
|
808
931
|
}
|
|
809
932
|
|
|
@@ -813,6 +936,7 @@ function buildParaProps(ps?: HwpParaShape): ParaProps {
|
|
|
813
936
|
|
|
814
937
|
export class HwpScanner implements Decoder {
|
|
815
938
|
readonly format = 'hwp';
|
|
939
|
+
readonly aliases = ['application/vnd.hancom.hwp'];
|
|
816
940
|
|
|
817
941
|
async decode(data: Uint8Array): Promise<Outcome<DocRoot>> {
|
|
818
942
|
const shield = new ShieldedParser();
|
|
@@ -834,52 +958,36 @@ export class HwpScanner implements Decoder {
|
|
|
834
958
|
di = shield.guard(() => parseDocInfo(diRaw, compressed), di, 'hwp:docInfo');
|
|
835
959
|
}
|
|
836
960
|
|
|
837
|
-
// Extract images from BinData streams
|
|
838
|
-
|
|
839
|
-
|
|
840
|
-
|
|
841
|
-
|
|
842
|
-
|
|
843
|
-
|
|
844
|
-
|
|
961
|
+
// Extract images from BinData streams.
|
|
962
|
+
// HWP duplicates each BinData entry: once as "BinData/BIN0001.jpg" and once as "BIN0001.jpg".
|
|
963
|
+
// We keep only the "BinData/" prefixed versions, sort by BIN number, then assign 0-based keys
|
|
964
|
+
// matching the order 'gso' CTRL_HEADER records are encountered during body parsing.
|
|
965
|
+
const binEntries: { binNum: number; data: Uint8Array }[] = [];
|
|
966
|
+
for (const [path, streamData] of streams) {
|
|
967
|
+
// Match "BinData/BIN0001.jpg" style — the canonical form
|
|
968
|
+
const m = path.match(/^BinData[/\\]BIN(\d+)\.\w+$/i);
|
|
969
|
+
if (m) binEntries.push({ binNum: parseInt(m[1], 10), data: streamData });
|
|
845
970
|
}
|
|
971
|
+
// Sort by BIN number (ascending) so BIN0001→idx0, BIN0002→idx1, …
|
|
972
|
+
binEntries.sort((a, b) => a.binNum - b.binNum);
|
|
846
973
|
|
|
847
|
-
// Create image nodes for each image stream (deduplicated by hash)
|
|
848
974
|
const objectMap = new Map<number, ImgNode>();
|
|
849
|
-
|
|
850
|
-
|
|
851
|
-
|
|
852
|
-
// Determine MIME type from
|
|
853
|
-
let mimeType = 'image/jpeg';
|
|
854
|
-
|
|
855
|
-
if (
|
|
856
|
-
else if (
|
|
857
|
-
|
|
858
|
-
|
|
859
|
-
|
|
860
|
-
|
|
861
|
-
else if (data[0] === 0x47 && data[1] === 0x49 && data[2] === 0x46 && data[3] === 0x3538) mimeType = 'image/gif';
|
|
862
|
-
else if (data[0] === 0x42 && data[1] === 0x4D) mimeType = 'image/bmp';
|
|
863
|
-
|
|
864
|
-
const imgData = Buffer.from(data);
|
|
865
|
-
const base64 = imgData.toString('base64');
|
|
866
|
-
const hash = base64.slice(0, 20); // Use first 20 chars as simple hash
|
|
867
|
-
if (!seenHashes.has(hash)) {
|
|
868
|
-
seenHashes.add(hash);
|
|
869
|
-
objectMap.set(imgIdx++, buildImg(
|
|
870
|
-
base64,
|
|
871
|
-
mimeType as any,
|
|
872
|
-
0, // w
|
|
873
|
-
0, // h
|
|
874
|
-
`Image from ${path}`,
|
|
875
|
-
));
|
|
876
|
-
console.log(`[HwpScanner] Added unique image: ${hash}... (${data.length} bytes)`);
|
|
877
|
-
} else {
|
|
878
|
-
console.log(`[HwpScanner] Duplicate image skipped: ${hash}...`);
|
|
879
|
-
}
|
|
975
|
+
for (let idx = 0; idx < binEntries.length; idx++) {
|
|
976
|
+
const { data: imgData } = binEntries[idx];
|
|
977
|
+
|
|
978
|
+
// Determine MIME type from binary signature first, then fall back to extension
|
|
979
|
+
let mimeType: ImgNode['mime'] = 'image/jpeg';
|
|
980
|
+
if (imgData[0] === 0x89 && imgData[1] === 0x50) mimeType = 'image/png';
|
|
981
|
+
else if (imgData[0] === 0x47 && imgData[1] === 0x49) mimeType = 'image/gif';
|
|
982
|
+
else if (imgData[0] === 0x42 && imgData[1] === 0x4D) mimeType = 'image/bmp';
|
|
983
|
+
|
|
984
|
+
const base64 = TextKit.base64Encode(imgData);
|
|
985
|
+
const { wPt, hPt } = getImageDimsPt(imgData, mimeType);
|
|
986
|
+
objectMap.set(idx, buildImg(base64, mimeType, wPt, hPt));
|
|
880
987
|
}
|
|
881
988
|
|
|
882
|
-
|
|
989
|
+
// gsoCtx tracks sequential 'gso' encounter order — must be shared across all sections
|
|
990
|
+
const gsoCtx: GsoCtx = { count: 0 };
|
|
883
991
|
|
|
884
992
|
// Body sections
|
|
885
993
|
const allContent: ContentNode[] = [];
|
|
@@ -891,7 +999,7 @@ export class HwpScanner implements Decoder {
|
|
|
891
999
|
if (s === 0) {
|
|
892
1000
|
const fb = findBodySection(streams);
|
|
893
1001
|
if (fb) {
|
|
894
|
-
const r = parseBody(fb, compressed, di, shield);
|
|
1002
|
+
const r = parseBody(fb, compressed, di, shield, gsoCtx);
|
|
895
1003
|
allContent.push(...r.content);
|
|
896
1004
|
if (r.pageDims) pageDims = r.pageDims;
|
|
897
1005
|
}
|
|
@@ -899,7 +1007,7 @@ export class HwpScanner implements Decoder {
|
|
|
899
1007
|
break;
|
|
900
1008
|
}
|
|
901
1009
|
const r = shield.guard(
|
|
902
|
-
() => parseBody(sec, compressed, di, shield),
|
|
1010
|
+
() => parseBody(sec, compressed, di, shield, gsoCtx),
|
|
903
1011
|
{ content: [], pageDims: undefined },
|
|
904
1012
|
`hwp:sec${s}`,
|
|
905
1013
|
);
|
|
@@ -907,30 +1015,10 @@ export class HwpScanner implements Decoder {
|
|
|
907
1015
|
if (r.pageDims) pageDims = r.pageDims;
|
|
908
1016
|
}
|
|
909
1017
|
|
|
910
|
-
// Inject images into paragraphs (only if images are available)
|
|
911
|
-
console.log(`[HwpScanner] Before injection: ${allContent.length} nodes, ${objectMap.size} images available`);
|
|
912
1018
|
if (objectMap.size > 0) {
|
|
913
1019
|
injectImagesIntoContent(allContent, objectMap);
|
|
914
|
-
console.log(`[HwpScanner] After injection: ${allContent.length} nodes`);
|
|
915
1020
|
}
|
|
916
1021
|
|
|
917
|
-
// Count images (recursively)
|
|
918
|
-
const countImages = (nodes: ContentNode[]): number => {
|
|
919
|
-
let count = 0;
|
|
920
|
-
for (const node of nodes) {
|
|
921
|
-
if ((node as any).tag === 'img') count++;
|
|
922
|
-
if ((node as any).tag === 'para' && (node as any).kids) count += countImages((node as any).kids);
|
|
923
|
-
if ((node as any).tag === 'grid' && (node as any).kids) {
|
|
924
|
-
for (const row of (node as any).kids) {
|
|
925
|
-
if (row.kids) count += countImages(row.kids);
|
|
926
|
-
}
|
|
927
|
-
}
|
|
928
|
-
}
|
|
929
|
-
return count;
|
|
930
|
-
};
|
|
931
|
-
const imgCount = countImages(allContent);
|
|
932
|
-
console.log(`[HwpScanner] Images in content: ${imgCount}`);
|
|
933
|
-
|
|
934
1022
|
warns.push(...shield.flush());
|
|
935
1023
|
const content = allContent.length > 0 ? allContent : [buildPara([buildSpan('')])];
|
|
936
1024
|
return succeed(buildRoot({}, [buildSheet(content, pageDims)]), warns);
|
|
@@ -947,6 +1035,52 @@ function findBodySection(streams: Map<string, Uint8Array>): Uint8Array | undefin
|
|
|
947
1035
|
return undefined;
|
|
948
1036
|
}
|
|
949
1037
|
|
|
1038
|
+
/* ═══════════════════════════════════════════════════════════════
|
|
1039
|
+
Image dimension extraction from binary headers
|
|
1040
|
+
════════════════════════════════════════════════════════════ */
|
|
1041
|
+
|
|
1042
|
+
/**
 * Reads an image's pixel dimensions from its binary header and converts them
 * to points assuming 96 DPI (1 px = 0.75 pt).
 *
 * Header layouts handled:
 * - PNG:  IHDR width/height, big-endian uint32 at offsets 16 and 20.
 * - JPEG: first SOFn frame header (height, then width, big-endian uint16).
 * - BMP:  DIB header width/height, little-endian int32 at offsets 18 and 22.
 * - GIF:  logical screen width/height, little-endian uint16 at offsets 6 and 8.
 *
 * @param data Raw image bytes.
 * @param mime MIME type selecting which header layout is parsed.
 * @returns `{ wPt, hPt }` in points, or `{ wPt: 72, hPt: 72 }` (one inch square)
 *          when the type is unsupported or no positive size can be read.
 */
function getImageDimsPt(data: Uint8Array, mime: string): { wPt: number; hPt: number } {
  const PX_TO_PT = 0.75; // 96 px per inch → 72 pt per inch
  const fallback = { wPt: 72, hPt: 72 };

  // Converts a pixel size to points; undefined when either side is not positive.
  const toPt = (w: number, h: number): { wPt: number; hPt: number } | undefined =>
    w > 0 && h > 0 ? { wPt: w * PX_TO_PT, hPt: h * PX_TO_PT } : undefined;

  try {
    const view = new DataView(data.buffer, data.byteOffset, data.byteLength);

    if (mime === 'image/png' && data.length >= 24) {
      // sig(8) + chunk length(4) + chunk type(4) + width(4) + height(4)
      const dims = toPt(view.getUint32(16, false), view.getUint32(20, false));
      if (dims) return dims;
    }

    if (mime === 'image/jpeg') {
      let i = 2; // skip the SOI marker
      while (i + 8 < data.length) {
        if (data[i] !== 0xff) { i++; continue; }
        const marker = data[i + 1];

        // 0xFF fill bytes may precede any marker; the real marker code follows.
        if (marker === 0xff) { i++; continue; }

        // Markers without a length field: stuffed 0x00, TEM, RST0–7, SOI, EOI.
        if (marker === 0x00 || marker === 0x01 || (marker >= 0xd0 && marker <= 0xd9)) {
          i += 2;
          continue;
        }

        // SOF0–SOF15, excluding DHT (C4), JPG (C8) and DAC (CC), which share the range.
        const isSof =
          marker >= 0xc0 && marker <= 0xcf &&
          marker !== 0xc4 && marker !== 0xc8 && marker !== 0xcc;
        if (isSof) {
          // marker(2) + length(2) + precision(1) + height(2) + width(2)
          const dims = toPt(view.getUint16(i + 7, false), view.getUint16(i + 5, false));
          if (dims) return dims;
        }

        // The segment length includes its own two bytes; treat anything smaller as 2
        // so a corrupt length can never stall the scan.
        const segLen = view.getUint16(i + 2, false);
        i += 2 + (segLen >= 2 ? segLen : 2);
      }
    }

    if (mime === 'image/bmp' && data.length >= 26) {
      // Both fields are signed; a negative height only means top-down row order.
      const dims = toPt(view.getInt32(18, true), Math.abs(view.getInt32(22, true)));
      if (dims) return dims;
    }

    if (mime === 'image/gif' && data.length >= 10) {
      const dims = toPt(view.getUint16(6, true), view.getUint16(8, true));
      if (dims) return dims;
    }
  } catch {
    // Best effort: a malformed header falls through to the default size.
  }
  return fallback;
}
|
|
1083
|
+
|
|
950
1084
|
/* ═══════════════════════════════════════════════════════════════
|
|
951
1085
|
OLE Object extraction (images)
|
|
952
1086
|
════════════════════════════════════════════════════════════ */
|
|
/**
 * Replaces image placeholder spans in a decoded content tree with the image
 * nodes they refer to. `content` is mutated in place.
 *
 * A placeholder is a span whose first kid is a text node with content
 * `__EXT_<id>__`, `__IMG_<id>__`, or either form with a `_W<wPt>_H<hPt>`
 * suffix before the closing `__`. `<id>` is looked up in `objectMap`; ids with
 * no entry are left as they are. When the suffix carries a positive width and
 * height, a copy of the image node with `w`/`h` overridden is inserted;
 * otherwise the mapped node itself is inserted.
 *
 * Paragraphs and grids are walked recursively, so placeholders are resolved in
 * top-level paragraphs, in table cells, and in grids nested at any depth —
 * whether the nested grid is a direct cell kid or a kid of a paragraph.
 *
 * @param content   Top-level content nodes to process.
 * @param objectMap Image nodes keyed by object id.
 */
function injectImagesIntoContent(
  content: ContentNode[],
  objectMap: Map<number, ImgNode>
): void {
  if (objectMap.size === 0) return;

  // Hoisted so the pattern is compiled once for the whole walk.
  const PLACEHOLDER = /^__(?:IMG|EXT)_(\d+)(?:_W(\d+)_H(\d+))?__$/;

  // Swaps placeholder spans in one kid list for their image nodes.
  const resolveSpans = (kids: any[]): void => {
    for (let i = 0; i < kids.length; i++) {
      const kid = kids[i];
      // Span shape: { tag: 'span', props, kids: [{ tag: 'txt', content }] }
      if (kid?.tag !== 'span' || !Array.isArray(kid.kids) || kid.kids[0]?.tag !== 'txt') continue;

      const text = kid.kids[0].content;
      if (typeof text !== 'string') continue;

      const match = PLACEHOLDER.exec(text);
      if (!match) continue;

      const base = objectMap.get(parseInt(match[1], 10));
      if (!base) continue;

      const wPt = match[2] ? parseInt(match[2], 10) : 0;
      const hPt = match[3] ? parseInt(match[3], 10) : 0;
      // Prefer the encoded display size; otherwise keep the image's own dimensions.
      kids[i] = wPt > 0 && hPt > 0 ? { ...base, w: wPt, h: hPt } : base;
    }
  };

  // Resolves a paragraph's spans, then descends into any grids among its kids.
  const visitPara = (para: any): void => {
    if (!Array.isArray(para.kids)) return;
    resolveSpans(para.kids);
    for (const kid of para.kids) {
      if (kid?.tag === 'grid') visitGrid(kid);
    }
  };

  // Walks grid → rows → cells and visits every paragraph or nested grid in a cell.
  const visitGrid = (grid: any): void => {
    if (!Array.isArray(grid.kids)) return;
    for (const row of grid.kids) {
      if (!Array.isArray(row?.kids)) continue;
      for (const cell of row.kids) {
        if (!Array.isArray(cell?.kids)) continue;
        for (const cellKid of cell.kids) {
          if (cellKid?.tag === 'grid') visitGrid(cellKid);
          else if (cellKid?.tag === 'para') visitPara(cellKid);
        }
      }
    }
  };

  for (const node of content) {
    if (node.tag === 'para') visitPara(node);
    else if (node.tag === 'grid') visitGrid(node);
  }
}
|