hwpkit-dev 0.0.2 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43) hide show
  1. package/.npmignore +4 -2
  2. package/README.md +39 -2
  3. package/dist/index.d.mts +41 -14
  4. package/dist/index.d.ts +41 -14
  5. package/dist/index.js +3553 -1159
  6. package/dist/index.js.map +1 -1
  7. package/dist/index.mjs +3553 -1159
  8. package/dist/index.mjs.map +1 -1
  9. package/package.json +2 -1
  10. package/playground/index.html +346 -0
  11. package/playground/main.ts +302 -0
  12. package/playground/vite.config.ts +16 -0
  13. package/src/contract/decoder.ts +1 -0
  14. package/src/contract/encoder.ts +6 -1
  15. package/src/core/BaseDecoder.ts +118 -0
  16. package/src/core/BaseEncoder.ts +146 -0
  17. package/src/decoders/docx/DocxDecoder.ts +743 -151
  18. package/src/decoders/html/HtmlDecoder.ts +366 -0
  19. package/src/decoders/hwp/HwpScanner.ts +325 -157
  20. package/src/decoders/hwpx/HwpxDecoder.ts +785 -297
  21. package/src/decoders/md/MdDecoder.ts +4 -4
  22. package/src/encoders/docx/DocxEncoder.ts +504 -240
  23. package/src/encoders/html/HtmlEncoder.ts +17 -19
  24. package/src/encoders/hwp/HwpEncoder.ts +1466 -859
  25. package/src/encoders/hwpx/HwpxEncoder.ts +1477 -469
  26. package/src/encoders/hwpx/constants.ts +148 -0
  27. package/src/encoders/hwpx/utils.ts +198 -0
  28. package/src/encoders/md/MdEncoder.ts +20 -15
  29. package/src/model/builders.ts +4 -4
  30. package/src/model/doc-props.ts +19 -5
  31. package/src/model/doc-tree.ts +12 -4
  32. package/src/pipeline/Pipeline.ts +7 -3
  33. package/src/pipeline/registry.ts +13 -2
  34. package/src/safety/StyleBridge.ts +51 -6
  35. package/src/toolkit/ArchiveKit.ts +56 -0
  36. package/src/toolkit/StyleMapper.ts +221 -0
  37. package/src/toolkit/UnitConverter.ts +138 -0
  38. package/src/toolkit/XmlKit.ts +0 -5
  39. package/test-styling.ts +210 -0
  40. package/hwp-analyze.ts +0 -90
  41. package/inspect-doc.ts +0 -57
  42. package/output_test.hwp +0 -0
  43. package/test-docx-to-hwp.ts +0 -45
@@ -1,11 +1,12 @@
1
1
  import type { Decoder } from '../../contract/decoder';
2
- import type { DocRoot, ContentNode, ParaNode, SpanNode, ImgNode } from '../../model/doc-tree';
2
+ import type { DocRoot, ContentNode, ParaNode, SpanNode, ImgNode, GridNode } from '../../model/doc-tree';
3
3
  import type { Outcome } from '../../contract/result';
4
4
  import type { Align, Stroke, StrokeKind, PageDims, TextProps, ParaProps, CellProps, GridProps } from '../../model/doc-props';
5
5
  import { succeed, fail } from '../../contract/result';
6
6
  import { buildRoot, buildSheet, buildPara, buildSpan, buildGrid, buildRow, buildCell, buildImg } from '../../model/builders';
7
7
  import { ShieldedParser } from '../../safety/ShieldedParser';
8
8
  import { BinaryKit } from '../../toolkit/BinaryKit';
9
+ import { TextKit } from '../../toolkit/TextKit';
9
10
  import { Metric, safeHex, safeFont } from '../../safety/StyleBridge';
10
11
  import { registry } from '../../pipeline/registry';
11
12
  import { A4 } from '../../model/doc-props';
@@ -38,10 +39,11 @@ function isTableTag(t: number) { return t === TAG_TABLE_A || t === TAG_TABLE_B;
38
39
  function isCellTag(t: number) { return t === TAG_CELL_A || t === TAG_CELL_B || t === TAG_LIST_HEADER; }
39
40
 
40
41
  // CTRL_HEADER ctrlId values (UINT32-LE as ASCII)
41
- const CTRL_TABLE = 0x74626C20; // ' lbt'
42
+ const CTRL_TABLE = 0x74626C20; // ' lbt' = 표(table)
42
43
  const CTRL_IMAGE = 0x696D6720; // 'img '
43
44
  const CTRL_OBJ = 0x6F626A20; // 'obj '
44
45
  const CTRL_FIG = 0x66696720; // 'fig '
46
+ const CTRL_GSO = 0x67736F20; // 'gso ' = 그리기 객체 (drawing object, contains embedded images)
45
47
 
46
48
  /* ═══════════════════════════════════════════════════════════════
47
49
  Types
@@ -70,6 +72,7 @@ interface HwpParaShape {
70
72
  spaceBefore: number;
71
73
  spaceAfter: number;
72
74
  lineSpacing: number;
75
+ leftMargin: number;
73
76
  indent: number;
74
77
  }
75
78
 
@@ -221,11 +224,12 @@ function parseCharShape(d: Uint8Array): HwpCharShape {
221
224
  const ALIGN_TBL: Record<number, Align> = { 0: 'justify', 1: 'left', 2: 'right', 3: 'center', 4: 'justify' };
222
225
 
223
226
  function parseParaShape(d: Uint8Array): HwpParaShape {
224
- if (d.length < 4) return { align: 'left', spaceBefore: 0, spaceAfter: 0, lineSpacing: 160, indent: 0 };
227
+ if (d.length < 4) return { align: 'left', spaceBefore: 0, spaceAfter: 0, lineSpacing: 160, leftMargin: 0, indent: 0 };
225
228
  const attr = BinaryKit.readU32LE(d, 0);
226
229
  return {
227
230
  align: ALIGN_TBL[(attr >> 2) & 0x7] ?? 'left',
228
- indent: d.length >= 16 ? i32(d, 12) : 0,
231
+ leftMargin: d.length >= 8 ? i32(d, 4) : 0, // offset 4: leftMargin (들여쓰기)
232
+ indent: d.length >= 16 ? i32(d, 12) : 0, // offset 12: first-line indent
229
233
  spaceBefore: d.length >= 20 ? i32(d, 16) : 0,
230
234
  spaceAfter: d.length >= 24 ? i32(d, 20) : 0,
231
235
  lineSpacing: d.length >= 28 ? i32(d, 24) : 160,
@@ -275,8 +279,13 @@ function parseBorderFill(d: Uint8Array): HwpBorderFill {
275
279
  Body section parsing
276
280
  ═══════════════════════════════════════════════════════════════ */
277
281
 
282
+ // gsoCtx: shared mutable counter for 'gso ' drawing objects.
283
+ // Each 'gso ' CTRL_HEADER encountered increments this counter.
284
+ // objectMap is keyed by 0-based gso order = sequential BinData insertion order.
285
+ interface GsoCtx { count: number }
286
+
278
287
  function parseBody(
279
- raw: Uint8Array, compressed: boolean, di: DocInfo, shield: ShieldedParser,
288
+ raw: Uint8Array, compressed: boolean, di: DocInfo, shield: ShieldedParser, gsoCtx: GsoCtx,
280
289
  ): { content: ContentNode[]; pageDims?: PageDims } {
281
290
  const recs = parseRecords(compressed ? tryInflate(raw) : raw);
282
291
  const content: ContentNode[] = [];
@@ -296,7 +305,7 @@ function parseBody(
296
305
  i++; // already handled above; skip at top level
297
306
  } else if (recs[i].tag === TAG_PARA_HEADER) {
298
307
  const r = shield.guard(
299
- () => parseParagraphGroup(recs, i, di, shield),
308
+ () => parseParagraphGroup(recs, i, di, shield, gsoCtx),
300
309
  { nodes: [] as ContentNode[], next: i + 1 },
301
310
  `hwp:para@${i}`,
302
311
  );
@@ -312,7 +321,7 @@ function parseBody(
312
321
  /* ── Paragraph group ────────────────────────────────────────── */
313
322
 
314
323
  function parseParagraphGroup(
315
- recs: HwpRecord[], start: number, di: DocInfo, shield: ShieldedParser,
324
+ recs: HwpRecord[], start: number, di: DocInfo, shield: ShieldedParser, gsoCtx: GsoCtx,
316
325
  ): { nodes: ContentNode[]; next: number } {
317
326
  const hdr = recs[start];
318
327
  const lv = hdr.level;
@@ -324,7 +333,8 @@ function parseParagraphGroup(
324
333
  let text: ParaTextResult | null = null;
325
334
  let csPairs: [number, number][] = [];
326
335
  const grids: ContentNode[] = [];
327
- const ctrlHeaders: { ctrlId: number; objId: number }[] = [];
336
+ // imgId: for 'gso' uses sequential gsoCtx.count; for others uses flags-based objId
337
+ const ctrlHeaders: { ctrlId: number; imgId: number; wPt: number; hPt: number }[] = [];
328
338
  let i = start + 1;
329
339
 
330
340
  while (i < recs.length && recs[i].level > lv) {
@@ -339,13 +349,23 @@ function parseParagraphGroup(
339
349
  } else if (r.tag === TAG_CTRL_HEADER && r.level === lv + 1) {
340
350
  if (r.data.length >= 4) {
341
351
  const ctrlId = BinaryKit.readU32LE(r.data, 0);
342
- // objId at offset 4 (UINT16) - identifies the image/object in BinData
343
- const objId = r.data.length >= 6 ? BinaryKit.readU16LE(r.data, 4) : 0;
344
- ctrlHeaders.push({ ctrlId, objId });
352
+
353
+ // HWP 5.0 general-object layout:
354
+ // [0:4] ctrlId [4:4] flags [8:4] xOff [12:4] yOff
355
+ // [16:4] width(HWPUNIT) [20:4] height(HWPUNIT)
356
+ const MAX_HWP = 1_000_000;
357
+ const rawW = r.data.length >= 24 ? BinaryKit.readU32LE(r.data, 16) : 0;
358
+ const rawH = r.data.length >= 28 ? BinaryKit.readU32LE(r.data, 20) : 0;
359
+ const wPt = rawW > 0 && rawW < MAX_HWP ? Metric.hwpToPt(rawW) : 0;
360
+ const hPt = rawH > 0 && rawH < MAX_HWP ? Metric.hwpToPt(rawH) : 0;
361
+
362
+ // 'gso ' (그리기 객체) uses sequential counter; others use flags-based id
363
+ const imgId = ctrlId === CTRL_GSO ? gsoCtx.count++ : (r.data.length >= 6 ? BinaryKit.readU16LE(r.data, 4) : 0);
364
+ ctrlHeaders.push({ ctrlId, imgId, wPt, hPt });
345
365
 
346
366
  if (ctrlId === CTRL_TABLE) {
347
367
  const tr = shield.guard(
348
- () => parseTableCtrl(recs, i, di, shield),
368
+ () => parseTableCtrl(recs, i, di, shield, gsoCtx),
349
369
  { grid: null, next: skipKids(recs, i) },
350
370
  `hwp:tbl@${i}`,
351
371
  );
@@ -362,34 +382,29 @@ function parseParagraphGroup(
362
382
  }
363
383
  }
364
384
 
365
- // Match extended controls with CTRL_HEADER entries
366
- if (text && ctrlHeaders.length > 0) {
367
- for (let ci = 0; ci < text.controls.length; ci++) {
368
- if (ci < ctrlHeaders.length) {
369
- text.controls[ci].ctrlId = ctrlHeaders[ci].ctrlId;
370
- text.controls[ci].matched = true;
371
- }
372
- }
373
- }
374
-
375
385
  const nodes: ContentNode[] = [];
376
386
 
377
387
  // Build paragraph from text and inline controls (images)
378
388
  if (text && (text.chars.length > 0 || text.controls.length > 0)) {
379
389
  const paraContent: (SpanNode | ContentNode)[] = [];
380
390
 
381
- // Process text chars and controls together
382
391
  if (text.chars.length > 0) {
383
392
  const spans = resolveCharShapes(text.chars, csPairs, di);
384
393
  paraContent.push(...spans);
385
394
  }
386
395
 
387
- // Add placeholder spans for extended controls (images)
396
+ // Image placeholder spans: only for actual image controls.
397
+ // Non-image controls (footnotes, TOC entries, etc.) are silently skipped.
388
398
  if (text.controls.length > 0) {
389
399
  for (let ci = 0; ci < text.controls.length; ci++) {
390
- // Create placeholder for all extended controls
391
- // Image replacement will happen later in injectImagesIntoContent
392
- paraContent.push(buildSpan(`__EXT_${ci}__`));
400
+ const ch = ctrlHeaders[ci];
401
+ if (!ch) continue; // anchor-only ctrl (gso is sibling, not inline)
402
+ const isImg = ch.ctrlId === CTRL_IMAGE || ch.ctrlId === CTRL_FIG || ch.ctrlId === CTRL_OBJ || ch.ctrlId === CTRL_GSO;
403
+ if (!isImg) continue; // skip footnotes, TOC, page num, etc.
404
+ const dimStr = (ch.wPt > 0 && ch.hPt > 0)
405
+ ? `_W${Math.round(ch.wPt)}_H${Math.round(ch.hPt)}`
406
+ : '';
407
+ paraContent.push(buildSpan(`__EXT_${ch.imgId}${dimStr}__`));
393
408
  }
394
409
  }
395
410
 
@@ -511,7 +526,7 @@ function styledSpan(text: string, shapeId: number, di: DocInfo): SpanNode {
511
526
  /* ── Table control parsing ──────────────────────────────────── */
512
527
 
513
528
  function parseTableCtrl(
514
- recs: HwpRecord[], ctrlIdx: number, di: DocInfo, shield: ShieldedParser,
529
+ recs: HwpRecord[], ctrlIdx: number, di: DocInfo, shield: ShieldedParser, gsoCtx: GsoCtx,
515
530
  ): { grid: ContentNode | null; next: number } {
516
531
  const ctrlLv = recs[ctrlIdx].level;
517
532
  let i = ctrlIdx + 1;
@@ -567,15 +582,15 @@ function parseTableCtrl(
567
582
  const rowCnt = tblData.length >= 6 ? BinaryKit.readU16LE(tblData, 4) : 1;
568
583
  const colCnt = tblData.length >= 8 ? BinaryKit.readU16LE(tblData, 6) : 1;
569
584
 
570
- interface PC { row: number; col: number; cs: number; rs: number; widthHwp: number; props: CellProps; paras: ParaNode[] }
585
+ interface PC { row: number; col: number; cs: number; rs: number; widthHwp: number; heightHwp?: number; props: CellProps; cellChildren: (ParaNode | GridNode)[] }
571
586
  const parsed: PC[] = [];
572
587
 
573
588
  for (let ci = 0; ci < cells.length; ci++) {
574
589
  const c = cells[ci];
575
590
  const seqIdx = ci;
576
591
  const pc = shield.guard(
577
- () => parseCellRec(c.data, c.tag, recs, c.cStart, c.cEnd, di, shield, seqIdx, colCnt),
578
- { row: Math.floor(ci / (colCnt || 1)), col: ci % (colCnt || 1), cs: 1, rs: 1, widthHwp: 0, props: {}, paras: [buildPara([buildSpan('')])] },
592
+ () => parseCellRec(c.data, c.tag, recs, c.cStart, c.cEnd, di, shield, seqIdx, colCnt, gsoCtx),
593
+ { row: Math.floor(ci / (colCnt || 1)), col: ci % (colCnt || 1), cs: 1, rs: 1, widthHwp: 0, heightHwp: undefined, props: {}, cellChildren: [buildPara([buildSpan('')])] },
579
594
  `hwp:cell@${c.cStart}`,
580
595
  );
581
596
  parsed.push(pc);
@@ -602,9 +617,11 @@ function parseTableCtrl(
602
617
  }
603
618
  }
604
619
  // Pass 2: for columns still 0, try to derive from multi-span cells
620
+ // Sort by span size ascending so smaller, more precise spans fill widths before larger spans
605
621
  const zeroColumns = colWidthsPt.filter(w => w === 0).length;
606
622
  if (zeroColumns > 0) {
607
- for (const c of parsed) {
623
+ const spanCells = parsed.filter(c => c.cs > 1 && c.widthHwp > 0).sort((a, b) => a.cs - b.cs);
624
+ for (const c of spanCells) {
608
625
  if (c.cs > 1 && c.widthHwp > 0) {
609
626
  // Subtract known column widths from the span
610
627
  let known = 0;
@@ -624,13 +641,37 @@ function parseTableCtrl(
624
641
  }
625
642
  }
626
643
 
644
+ // Post-process: clamp near-zero column widths (< 1pt = floating-point artifact) to minimum 1pt
645
+ for (let i = 0; i < colWidthsPt.length; i++) {
646
+ if (colWidthsPt[i] > 0 && colWidthsPt[i] < 1) colWidthsPt[i] = 1;
647
+ }
648
+
627
649
  const rows = [];
628
650
  for (let r = 0; r < actualRowCnt; r++) {
629
651
  const rc = parsed.filter(c => c.row === r).sort((a, b) => a.col - b.col);
630
652
  if (rc.length === 0) continue;
631
- rows.push(buildRow(rc.map(c =>
632
- buildCell(c.paras.length ? c.paras : [buildPara([buildSpan('')])], { cs: c.cs, rs: c.rs, props: c.props }),
633
- )));
653
+
654
+ // Calculate row height — prefer rs=1 cells (exact per-row height)
655
+ let rowHeightPt: number | undefined = undefined;
656
+ for (const c of rc) {
657
+ if (c.heightHwp && c.heightHwp > 0 && c.rs === 1) {
658
+ const hPt = Metric.hwpToPt(c.heightHwp);
659
+ if (rowHeightPt == null || hPt > rowHeightPt) rowHeightPt = hPt;
660
+ }
661
+ }
662
+ // Fallback: all cells span multiple rows → approximate height per row
663
+ if (rowHeightPt == null) {
664
+ for (const c of rc) {
665
+ if (c.heightHwp && c.heightHwp > 0) {
666
+ const hPt = Metric.hwpToPt(c.heightHwp) / c.rs;
667
+ if (rowHeightPt == null || hPt > rowHeightPt) rowHeightPt = hPt;
668
+ }
669
+ }
670
+ }
671
+
672
+ rows.push(buildRow(rc.map(c => {
673
+ return buildCell(c.cellChildren, { cs: c.cs, rs: c.rs, props: c.props });
674
+ }), rowHeightPt));
634
675
  }
635
676
  if (rows.length === 0) return { grid: null, next: i };
636
677
 
@@ -659,10 +700,11 @@ function parseTableCtrl(
659
700
 
660
701
  function parseCellRec(
661
702
  d: Uint8Array, tag: number, recs: HwpRecord[], cStart: number, cEnd: number,
662
- di: DocInfo, shield: ShieldedParser, seqIdx: number, colCnt: number,
703
+ di: DocInfo, shield: ShieldedParser, seqIdx: number, colCnt: number, gsoCtx: GsoCtx,
663
704
  ) {
664
705
  let col: number, row: number, cs = 1, rs = 1;
665
706
  let widthHwp = 0;
707
+ let heightHwp = 0;
666
708
  const props: CellProps = {};
667
709
 
668
710
  const attr = d.length >= 6 ? BinaryKit.readU32LE(d, 2) : 0;
@@ -670,57 +712,55 @@ function parseCellRec(
670
712
  if (va === 1) props.va = 'mid';
671
713
  else if (va === 2) props.va = 'bot';
672
714
 
715
+ const HWP_PAD_LR_DEFAULT = 360;
716
+ const HWP_PAD_TB_DEFAULT = 141;
717
+
673
718
  if (tag === TAG_LIST_HEADER && d.length >= 22) {
674
- // LIST_HEADER with cell-specific fields
675
- // offset 8: colAddr, offset 10: rowAddr (HWP 5.0 spec)
676
719
  col = BinaryKit.readU16LE(d, 8);
677
720
  row = BinaryKit.readU16LE(d, 10);
678
721
  cs = Math.max(1, BinaryKit.readU16LE(d, 12));
679
722
  rs = Math.max(1, BinaryKit.readU16LE(d, 14));
680
723
  widthHwp = BinaryKit.readU32LE(d, 16);
681
-
682
- const bfId = d.length >= 34 ? BinaryKit.readU16LE(d, 32) : 0;
683
- if (bfId > 0 && bfId <= di.borderFills.length) {
684
- const bf = di.borderFills[bfId - 1];
685
- if (bf.borders.length >= 4) {
686
- props.left = toStroke(bf.borders[0]);
687
- props.right = toStroke(bf.borders[1]);
688
- props.top = toStroke(bf.borders[2]);
689
- props.bot = toStroke(bf.borders[3]);
690
- }
691
- if (bf.bgColor && bf.bgColor !== 'FFFFFF') props.bg = bf.bgColor;
724
+ heightHwp = d.length >= 24 ? BinaryKit.readU32LE(d, 20) : 0;
725
+ if (d.length >= 32) {
726
+ const pL = BinaryKit.readU16LE(d, 24); const pR = BinaryKit.readU16LE(d, 26);
727
+ const pT = BinaryKit.readU16LE(d, 28); const pB = BinaryKit.readU16LE(d, 30);
728
+ if (pL !== HWP_PAD_LR_DEFAULT) props.padL = Metric.hwpToPt(pL);
729
+ if (pR !== HWP_PAD_LR_DEFAULT) props.padR = Metric.hwpToPt(pR);
730
+ if (pT !== HWP_PAD_TB_DEFAULT) props.padT = Metric.hwpToPt(pT);
731
+ if (pB !== HWP_PAD_TB_DEFAULT) props.padB = Metric.hwpToPt(pB);
692
732
  }
733
+ const bfId = d.length >= 34 ? BinaryKit.readU16LE(d, 32) : 0;
734
+ if (bfId > 0 && bfId <= di.borderFills.length) applyCellBorderFill(di.borderFills[bfId - 1], props);
693
735
  } else if (tag !== TAG_LIST_HEADER) {
694
- // Full CELL record with position/span/borderFill
695
736
  col = d.length >= 8 ? BinaryKit.readU16LE(d, 6) : seqIdx % (colCnt || 1);
696
737
  row = d.length >= 10 ? BinaryKit.readU16LE(d, 8) : Math.floor(seqIdx / (colCnt || 1));
697
738
  cs = d.length >= 12 ? Math.max(1, BinaryKit.readU16LE(d, 10)) : 1;
698
739
  rs = d.length >= 14 ? Math.max(1, BinaryKit.readU16LE(d, 12)) : 1;
699
740
  widthHwp = d.length >= 18 ? BinaryKit.readU32LE(d, 14) : 0;
700
-
701
- const bfId = d.length >= 32 ? BinaryKit.readU16LE(d, 30) : 0;
702
- if (bfId > 0 && bfId <= di.borderFills.length) {
703
- const bf = di.borderFills[bfId - 1];
704
- if (bf.borders.length >= 4) {
705
- props.left = toStroke(bf.borders[0]);
706
- props.right = toStroke(bf.borders[1]);
707
- props.top = toStroke(bf.borders[2]);
708
- props.bot = toStroke(bf.borders[3]);
709
- }
710
- if (bf.bgColor && bf.bgColor !== 'FFFFFF') props.bg = bf.bgColor;
741
+ heightHwp = d.length >= 22 ? BinaryKit.readU32LE(d, 18) : 0;
742
+ if (d.length >= 30) {
743
+ const pL = BinaryKit.readU16LE(d, 22); const pR = BinaryKit.readU16LE(d, 24);
744
+ const pT = BinaryKit.readU16LE(d, 26); const pB = BinaryKit.readU16LE(d, 28);
745
+ if (pL !== HWP_PAD_LR_DEFAULT) props.padL = Metric.hwpToPt(pL);
746
+ if (pR !== HWP_PAD_LR_DEFAULT) props.padR = Metric.hwpToPt(pR);
747
+ if (pT !== HWP_PAD_TB_DEFAULT) props.padT = Metric.hwpToPt(pT);
748
+ if (pB !== HWP_PAD_TB_DEFAULT) props.padB = Metric.hwpToPt(pB);
711
749
  }
750
+ const bfId = d.length >= 32 ? BinaryKit.readU16LE(d, 30) : 0;
751
+ if (bfId > 0 && bfId <= di.borderFills.length) applyCellBorderFill(di.borderFills[bfId - 1], props);
712
752
  } else {
713
- // Fallback: LIST_HEADER too short, compute sequentially
714
753
  row = Math.floor(seqIdx / (colCnt || 1));
715
754
  col = seqIdx % (colCnt || 1);
716
755
  }
717
756
 
718
- // Parse cell content paragraphs
719
- const paras: ParaNode[] = [];
757
+ const cellChildren: (ParaNode | GridNode)[] = [];
758
+ const MAX_HWP = 1_000_000;
720
759
  let k = cStart;
760
+
721
761
  while (k < cEnd) {
722
762
  if (recs[k].tag === TAG_PARA_HEADER) {
723
- // For cell paragraphs, they might be at various nesting levels
763
+ // Parse paragraph inside cell — also extracts nested tables within the paragraph
724
764
  const r = shield.guard(
725
765
  () => {
726
766
  const hdr = recs[k];
@@ -729,24 +769,91 @@ function parseCellRec(
729
769
  const ps = di.paraShapes[psId];
730
770
  let txt: ParaTextResult | null = null;
731
771
  let csp: [number, number][] = [];
772
+ const ctrlHdrs: { ctrlId: number; imgId: number; wPt: number; hPt: number }[] = [];
773
+ const innerGrids: GridNode[] = [];
732
774
  let j = k + 1;
733
775
  while (j < cEnd && recs[j].level > lv) {
734
776
  if (recs[j].tag === TAG_PARA_TEXT) { txt = decodeParaText(recs[j].data); j++; }
735
777
  else if (recs[j].tag === TAG_PARA_CHAR_SHAPE) { csp = parseCharShapePairs(recs[j].data); j++; }
778
+ else if (recs[j].tag === TAG_CTRL_HEADER && recs[j].level === lv + 1) {
779
+ if (recs[j].data.length >= 4) {
780
+ const ctrlId = BinaryKit.readU32LE(recs[j].data, 0);
781
+ if (ctrlId === CTRL_TABLE) {
782
+ // Nested table inside a cell paragraph — recurse into parseTableCtrl
783
+ const nestedTr = shield.guard(
784
+ () => parseTableCtrl(recs, j, di, shield, gsoCtx),
785
+ { grid: null, next: skipKids(recs, j) },
786
+ `hwp:innerNestedTbl@${j}`,
787
+ );
788
+ if (nestedTr.grid) innerGrids.push(nestedTr.grid as GridNode);
789
+ j = nestedTr.next;
790
+ } else {
791
+ const rawW = recs[j].data.length >= 24 ? BinaryKit.readU32LE(recs[j].data, 16) : 0;
792
+ const rawH = recs[j].data.length >= 28 ? BinaryKit.readU32LE(recs[j].data, 20) : 0;
793
+ const wPt = rawW > 0 && rawW < MAX_HWP ? Metric.hwpToPt(rawW) : 0;
794
+ const hPt = rawH > 0 && rawH < MAX_HWP ? Metric.hwpToPt(rawH) : 0;
795
+ const imgId = ctrlId === CTRL_GSO ? gsoCtx.count++ : (recs[j].data.length >= 6 ? BinaryKit.readU16LE(recs[j].data, 4) : 0);
796
+ ctrlHdrs.push({ ctrlId, imgId, wPt, hPt });
797
+ j = skipKids(recs, j);
798
+ }
799
+ } else {
800
+ j = skipKids(recs, j);
801
+ }
802
+ }
736
803
  else j++;
737
804
  }
738
- const spans = txt && txt.chars.length > 0 ? resolveCharShapes(txt.chars, csp, di) : [buildSpan('')];
739
- return { para: buildPara(spans, buildParaProps(ps)), next: j };
805
+ const paraContent: (SpanNode | ContentNode)[] = [];
806
+ if (txt && txt.chars.length > 0) paraContent.push(...resolveCharShapes(txt.chars, csp, di));
807
+ if (txt && txt.controls.length > 0) {
808
+ for (let ci = 0; ci < txt.controls.length; ci++) {
809
+ const ch = ctrlHdrs[ci];
810
+ if (!ch) continue;
811
+ const isImg = ch.ctrlId === CTRL_IMAGE || ch.ctrlId === CTRL_FIG || ch.ctrlId === CTRL_OBJ || ch.ctrlId === CTRL_GSO;
812
+ if (!isImg) continue;
813
+ const dimStr = (ch.wPt > 0 && ch.hPt > 0) ? `_W${Math.round(ch.wPt)}_H${Math.round(ch.hPt)}` : '';
814
+ paraContent.push(buildSpan(`__EXT_${ch.imgId}${dimStr}__`));
815
+ }
816
+ }
817
+ const kids = paraContent.length > 0 ? paraContent as any : [buildSpan('')];
818
+ const items: (ParaNode | GridNode)[] = [buildPara(kids, buildParaProps(ps)), ...innerGrids];
819
+ return { items, next: j };
740
820
  },
741
- { para: buildPara([buildSpan('')]), next: k + 1 },
821
+ { items: [buildPara([buildSpan('')])] as (ParaNode | GridNode)[], next: k + 1 },
742
822
  `hwp:cellP@${k}`,
743
823
  );
744
- paras.push(r.para);
824
+ cellChildren.push(...r.items);
745
825
  k = r.next;
826
+ } else if (recs[k].tag === TAG_CTRL_HEADER && recs[k].data.length >= 4) {
827
+ // CTRL_HEADER at cell level (sibling of PARA_HEADER) — anchored 'gso' images and outer-level nested tables
828
+ const cellCtrlId = BinaryKit.readU32LE(recs[k].data, 0);
829
+ if (cellCtrlId === CTRL_GSO) {
830
+ const gsoId = gsoCtx.count++;
831
+ const rawW = recs[k].data.length >= 24 ? BinaryKit.readU32LE(recs[k].data, 16) : 0;
832
+ const rawH = recs[k].data.length >= 28 ? BinaryKit.readU32LE(recs[k].data, 20) : 0;
833
+ const wPt = rawW > 0 && rawW < MAX_HWP ? Metric.hwpToPt(rawW) : 0;
834
+ const hPt = rawH > 0 && rawH < MAX_HWP ? Metric.hwpToPt(rawH) : 0;
835
+ const dimStr = (wPt > 0 && hPt > 0) ? `_W${Math.round(wPt)}_H${Math.round(hPt)}` : '';
836
+ cellChildren.push(buildPara([buildSpan(`__EXT_${gsoId}${dimStr}__`)]));
837
+ k = skipKids(recs, k);
838
+ } else if (cellCtrlId === CTRL_TABLE) {
839
+ const tr = shield.guard(
840
+ () => parseTableCtrl(recs, k, di, shield, gsoCtx),
841
+ { grid: null, next: skipKids(recs, k) },
842
+ `hwp:nestedTbl@${k}`,
843
+ );
844
+ if (tr.grid) cellChildren.push(tr.grid as GridNode);
845
+ k = tr.next;
846
+ } else {
847
+ k = skipKids(recs, k);
848
+ }
746
849
  } else { k++; }
747
850
  }
748
851
 
749
- return { row, col, cs, rs, props, widthHwp, paras: paras.length ? paras : [buildPara([buildSpan('')])] };
852
+ return {
853
+ row, col, cs, rs, props, widthHwp,
854
+ heightHwp: heightHwp || undefined,
855
+ cellChildren: cellChildren.length ? cellChildren : [buildPara([buildSpan('')])],
856
+ };
750
857
  }
751
858
 
752
859
  /* ── PAGE_DEF ───────────────────────────────────────────────── */
@@ -788,6 +895,18 @@ function toStroke(b: { type: number; widthPt: number; color: string }): Stroke {
788
895
  return { kind: BORDER_KIND[b.type] ?? 'solid', pt: b.widthPt, color: b.color };
789
896
  }
790
897
 
898
+ // Apply borderFill to CellProps. Preserve explicit NONE so DOCX tcBorders can
899
+ // override the table-level tblBorders. Filtering NONE would let tblBorders bleed through.
900
+ function applyCellBorderFill(bf: HwpBorderFill, props: CellProps): void {
901
+ if (bf.borders.length >= 4) {
902
+ props.left = toStroke(bf.borders[0]);
903
+ props.right = toStroke(bf.borders[1]);
904
+ props.top = toStroke(bf.borders[2]);
905
+ props.bot = toStroke(bf.borders[3]);
906
+ }
907
+ if (bf.bgColor && bf.bgColor !== 'FFFFFF') props.bg = bf.bgColor;
908
+ }
909
+
791
910
  function strokeFromBF(bfId: number, di: DocInfo): Stroke | undefined {
792
911
  if (bfId <= 0 || bfId > di.borderFills.length) return undefined;
793
912
  const bf = di.borderFills[bfId - 1];
@@ -803,7 +922,11 @@ function buildParaProps(ps?: HwpParaShape): ParaProps {
803
922
  if (ps.spaceBefore > 0) p.spaceBefore = Metric.hwpToPt(ps.spaceBefore);
804
923
  if (ps.spaceAfter > 0) p.spaceAfter = Metric.hwpToPt(ps.spaceAfter);
805
924
  if (ps.lineSpacing > 0 && ps.lineSpacing !== 160) p.lineHeight = ps.lineSpacing / 100;
806
- if (ps.indent > 0) p.indentPt = Metric.hwpToPt(ps.indent);
925
+ // leftMargin (offset 4) = 문단 몸체 왼쪽 여백 → leftMargin (pt), ensure non-negative
926
+ const leftMarginPt = Math.max(0, Metric.hwpToPt(ps.leftMargin));
927
+ if (leftMarginPt > 0) p.leftMargin = leftMarginPt;
928
+ // indent (offset 12) = 첫 줄 들여쓰기(양수) / 내어쓰기(음수) → firstLineIndentPt
929
+ if (ps.indent !== 0) p.firstLineIndentPt = Metric.hwpToPt(ps.indent);
807
930
  return p;
808
931
  }
809
932
 
@@ -813,6 +936,7 @@ function buildParaProps(ps?: HwpParaShape): ParaProps {
813
936
 
814
937
  export class HwpScanner implements Decoder {
815
938
  readonly format = 'hwp';
939
+ readonly aliases = ['application/vnd.hancom.hwp'];
816
940
 
817
941
  async decode(data: Uint8Array): Promise<Outcome<DocRoot>> {
818
942
  const shield = new ShieldedParser();
@@ -834,52 +958,36 @@ export class HwpScanner implements Decoder {
834
958
  di = shield.guard(() => parseDocInfo(diRaw, compressed), di, 'hwp:docInfo');
835
959
  }
836
960
 
837
- // Extract images from BinData streams
838
- const imageStreams: { path: string; data: Uint8Array }[] = [];
839
- for (const [path, data] of streams) {
840
- if ((path.includes('BinData') || path.includes('.jpg') || path.includes('.jpeg') || path.includes('.png') || path.includes('.gif') || path.includes('.bmp'))
841
- && !path.includes('FileHeader') && !path.includes('DocInfo') && !path.includes('BodyText') && !path.includes('Section')) {
842
- imageStreams.push({ path, data });
843
- console.log(`[HwpScanner] Image stream found: ${path} (${data.length} bytes)`);
844
- }
961
+ // Extract images from BinData streams.
962
+ // HWP duplicates each BinData entry: once as "BinData/BIN0001.jpg" and once as "BIN0001.jpg".
963
+ // We keep only the "BinData/" prefixed versions, sort by BIN number, then assign 0-based keys
964
+ // matching the order 'gso' CTRL_HEADER records are encountered during body parsing.
965
+ const binEntries: { binNum: number; data: Uint8Array }[] = [];
966
+ for (const [path, streamData] of streams) {
967
+ // Match "BinData/BIN0001.jpg" style — the canonical form
968
+ const m = path.match(/^BinData[/\\]BIN(\d+)\.\w+$/i);
969
+ if (m) binEntries.push({ binNum: parseInt(m[1], 10), data: streamData });
845
970
  }
971
+ // Sort by BIN number (ascending) so BIN0001→idx0, BIN0002→idx1, …
972
+ binEntries.sort((a, b) => a.binNum - b.binNum);
846
973
 
847
- // Create image nodes for each image stream (deduplicated by hash)
848
974
  const objectMap = new Map<number, ImgNode>();
849
- const seenHashes = new Set<string>();
850
- let imgIdx = 0;
851
- for (const { path, data } of imageStreams) {
852
- // Determine MIME type from extension or signature
853
- let mimeType = 'image/jpeg';
854
- const lowerPath = path.toLowerCase();
855
- if (lowerPath.includes('.png')) mimeType = 'image/png';
856
- else if (lowerPath.includes('.gif')) mimeType = 'image/gif';
857
- else if (lowerPath.includes('.bmp')) mimeType = 'image/bmp';
858
-
859
- // Also check signature
860
- if (data[0] === 0x89 && data[1] === 0x50 && data[2] === 0x4E && data[3] === 0x47) mimeType = 'image/png';
861
- else if (data[0] === 0x47 && data[1] === 0x49 && data[2] === 0x46 && data[3] === 0x3538) mimeType = 'image/gif';
862
- else if (data[0] === 0x42 && data[1] === 0x4D) mimeType = 'image/bmp';
863
-
864
- const imgData = Buffer.from(data);
865
- const base64 = imgData.toString('base64');
866
- const hash = base64.slice(0, 20); // Use first 20 chars as simple hash
867
- if (!seenHashes.has(hash)) {
868
- seenHashes.add(hash);
869
- objectMap.set(imgIdx++, buildImg(
870
- base64,
871
- mimeType as any,
872
- 0, // w
873
- 0, // h
874
- `Image from ${path}`,
875
- ));
876
- console.log(`[HwpScanner] Added unique image: ${hash}... (${data.length} bytes)`);
877
- } else {
878
- console.log(`[HwpScanner] Duplicate image skipped: ${hash}...`);
879
- }
975
+ for (let idx = 0; idx < binEntries.length; idx++) {
976
+ const { data: imgData } = binEntries[idx];
977
+
978
+ // Determine MIME type from the leading signature bytes; defaults to image/jpeg when none match (the file extension is not consulted)
979
+ let mimeType: ImgNode['mime'] = 'image/jpeg';
980
+ if (imgData[0] === 0x89 && imgData[1] === 0x50) mimeType = 'image/png';
981
+ else if (imgData[0] === 0x47 && imgData[1] === 0x49) mimeType = 'image/gif';
982
+ else if (imgData[0] === 0x42 && imgData[1] === 0x4D) mimeType = 'image/bmp';
983
+
984
+ const base64 = TextKit.base64Encode(imgData);
985
+ const { wPt, hPt } = getImageDimsPt(imgData, mimeType);
986
+ objectMap.set(idx, buildImg(base64, mimeType, wPt, hPt));
880
987
  }
881
988
 
882
- console.log(`[HwpScanner] Found ${imageStreams.length} image streams, ${objectMap.size} unique images`);
989
+ // gsoCtx tracks sequential 'gso' encounter order — must be shared across all sections
990
+ const gsoCtx: GsoCtx = { count: 0 };
883
991
 
884
992
  // Body sections
885
993
  const allContent: ContentNode[] = [];
@@ -891,7 +999,7 @@ export class HwpScanner implements Decoder {
891
999
  if (s === 0) {
892
1000
  const fb = findBodySection(streams);
893
1001
  if (fb) {
894
- const r = parseBody(fb, compressed, di, shield);
1002
+ const r = parseBody(fb, compressed, di, shield, gsoCtx);
895
1003
  allContent.push(...r.content);
896
1004
  if (r.pageDims) pageDims = r.pageDims;
897
1005
  }
@@ -899,7 +1007,7 @@ export class HwpScanner implements Decoder {
899
1007
  break;
900
1008
  }
901
1009
  const r = shield.guard(
902
- () => parseBody(sec, compressed, di, shield),
1010
+ () => parseBody(sec, compressed, di, shield, gsoCtx),
903
1011
  { content: [], pageDims: undefined },
904
1012
  `hwp:sec${s}`,
905
1013
  );
@@ -907,30 +1015,10 @@ export class HwpScanner implements Decoder {
907
1015
  if (r.pageDims) pageDims = r.pageDims;
908
1016
  }
909
1017
 
910
- // Inject images into paragraphs (only if images are available)
911
- console.log(`[HwpScanner] Before injection: ${allContent.length} nodes, ${objectMap.size} images available`);
912
1018
  if (objectMap.size > 0) {
913
1019
  injectImagesIntoContent(allContent, objectMap);
914
- console.log(`[HwpScanner] After injection: ${allContent.length} nodes`);
915
1020
  }
916
1021
 
917
- // Count images (recursively)
918
- const countImages = (nodes: ContentNode[]): number => {
919
- let count = 0;
920
- for (const node of nodes) {
921
- if ((node as any).tag === 'img') count++;
922
- if ((node as any).tag === 'para' && (node as any).kids) count += countImages((node as any).kids);
923
- if ((node as any).tag === 'grid' && (node as any).kids) {
924
- for (const row of (node as any).kids) {
925
- if (row.kids) count += countImages(row.kids);
926
- }
927
- }
928
- }
929
- return count;
930
- };
931
- const imgCount = countImages(allContent);
932
- console.log(`[HwpScanner] Images in content: ${imgCount}`);
933
-
934
1022
  warns.push(...shield.flush());
935
1023
  const content = allContent.length > 0 ? allContent : [buildPara([buildSpan('')])];
936
1024
  return succeed(buildRoot({}, [buildSheet(content, pageDims)]), warns);
@@ -947,6 +1035,52 @@ function findBodySection(streams: Map<string, Uint8Array>): Uint8Array | undefin
947
1035
  return undefined;
948
1036
  }
949
1037
 
1038
+ /* ═══════════════════════════════════════════════════════════════
1039
+ Image dimension extraction from binary headers
1040
+ ════════════════════════════════════════════════════════════ */
1041
+
1042
/**
 * Reads an image's pixel dimensions from its binary header and converts them to points.
 *
 * Pixels are converted assuming 96 DPI (1 px = 0.75 pt). Any embedded resolution
 * metadata (PNG pHYs, JFIF density, BMP pels-per-metre) is ignored.
 *
 * Supported formats: PNG (IHDR), JPEG (any SOFn frame header), BMP (BITMAPINFOHEADER
 * layout) and GIF (logical screen descriptor).
 *
 * @param data - Raw image bytes.
 * @param mime - MIME type used to select the header parser.
 * @returns The size in points, or `{ wPt: 72, hPt: 72 }` (one inch square) when the
 *          MIME type is unsupported, the data is truncated, or a dimension is zero.
 */
function getImageDimsPt(data: Uint8Array, mime: string): { wPt: number; hPt: number } {
  const PX_TO_PT = 0.75; // 72 pt per inch / 96 px per inch
  const fallback = { wPt: 72, hPt: 72 };

  // DataView reads are bounds-checked, so every access below is preceded by an
  // explicit length guard instead of relying on a blanket try/catch.
  const view = new DataView(data.buffer, data.byteOffset, data.byteLength);

  const toPt = (wPx: number, hPx: number): { wPt: number; hPt: number } | undefined =>
    wPx > 0 && hPx > 0 ? { wPt: wPx * PX_TO_PT, hPt: hPx * PX_TO_PT } : undefined;

  if (mime === 'image/png') {
    // PNG: signature(8) + chunk length(4) + chunk type(4) + width(4) + height(4), big-endian.
    // Require the first chunk to really be IHDR ("IHDR" = 0x49484452) before trusting it.
    if (data.length >= 24 && view.getUint32(12) === 0x49484452) {
      return toPt(view.getUint32(16), view.getUint32(20)) ?? fallback;
    }
    return fallback;
  }

  if (mime === 'image/jpeg') {
    let pos = 2; // skip the SOI marker (FF D8)
    while (pos + 4 <= data.length) {
      if (view.getUint8(pos) !== 0xff) {
        pos++; // not on a marker boundary: resynchronise byte by byte
        continue;
      }
      const marker = view.getUint8(pos + 1);
      if (marker === 0xff) {
        pos++; // fill byte preceding a marker
        continue;
      }
      if (marker === 0x00 || marker === 0x01 || (marker >= 0xd0 && marker <= 0xd8)) {
        pos += 2; // stuffed byte, TEM, RSTn, SOI: standalone, no length field
        continue;
      }
      if (marker === 0xd9 || marker === 0xda) {
        break; // EOI or SOS: the frame header always precedes the scan data
      }

      // SOFn markers are C0..CF except C4 (DHT), C8 (JPG reserved) and CC (DAC).
      const isSof =
        marker >= 0xc0 && marker <= 0xcf && marker !== 0xc4 && marker !== 0xc8 && marker !== 0xcc;
      if (isSof && pos + 9 <= data.length) {
        // SOF: marker(2) + length(2) + precision(1) + height(2) + width(2), big-endian.
        const dims = toPt(view.getUint16(pos + 7), view.getUint16(pos + 5));
        if (dims) return dims;
      }

      // The segment length includes its own two bytes; a smaller value is corrupt, so
      // advance by the minimum to keep the scan moving forward.
      const segLen = view.getUint16(pos + 2);
      pos += 2 + Math.max(segLen, 2);
    }
    return fallback;
  }

  if (mime === 'image/bmp') {
    // BMP: width at 18 and height at 22, both signed int32 LE.
    // A negative height marks a top-down bitmap.
    if (data.length >= 26) {
      return toPt(view.getInt32(18, true), Math.abs(view.getInt32(22, true))) ?? fallback;
    }
    return fallback;
  }

  if (mime === 'image/gif') {
    // GIF logical screen descriptor: width at 6, height at 8 (uint16 LE).
    if (data.length >= 10) {
      return toPt(view.getUint16(6, true), view.getUint16(8, true)) ?? fallback;
    }
    return fallback;
  }

  return fallback;
}
1083
+
950
1084
  /* ═══════════════════════════════════════════════════════════════
951
1085
  OLE Object extraction (images)
952
1086
  ════════════════════════════════════════════════════════════ */
@@ -994,35 +1128,69 @@ function injectImagesIntoContent(
994
1128
  content: ContentNode[],
995
1129
  objectMap: Map<number, ImgNode>
996
1130
  ): void {
997
- const imageArray = Array.from(objectMap.values());
998
- if (imageArray.length === 0) return;
1131
+ if (objectMap.size === 0) return;
1132
+
1133
+ // Helper function to process a list of kids (spans, images, etc.)
1134
+ const processKids = (kids: any[]) => {
1135
+ for (let i = 0; i < kids.length; i++) {
1136
+ const kid = kids[i];
1137
+ // Span node structure: { tag: 'span', props, kids: [{ tag: 'txt', content }] }
1138
+ if (kid.tag === 'span' && kid.kids && kid.kids[0]?.tag === 'txt') {
1139
+ const text = kid.kids[0].content;
1140
+ // __EXT_N__ or __EXT_N_W<wPt>_H<hPt>__ (with encoded display size)
1141
+ // N is the objId that matches the index in objectMap
1142
+ const match = text.match?.(/^__(?:IMG|EXT)_(\d+)(?:_W(\d+)_H(\d+))?__$/);
1143
+ if (match) {
1144
+ const objId = parseInt(match[1], 10);
1145
+ const base = objectMap.get(objId);
1146
+ if (base) {
1147
+ const wPt = match[2] ? parseInt(match[2], 10) : 0;
1148
+ const hPt = match[3] ? parseInt(match[3], 10) : 0;
1149
+ // Use encoded display size when valid; otherwise keep pixel-based dims
1150
+ kids[i] = (wPt > 0 && hPt > 0) ? { ...base, w: wPt, h: hPt } : base;
1151
+ }
1152
+ }
1153
+ }
1154
+ }
1155
+ };
1156
+
1157
+ // Recursively process a grid (table): resolves image placeholders in all cells,
1158
+ // including nested grids inside cells.
1159
+ const processGridKids = (grid: any) => {
1160
+ if (!grid.kids || !Array.isArray(grid.kids)) return;
999
1161
 
1000
- // Get unique images (deduplicate by base64 content)
1001
- const uniqueImages = Array.from(new Set(imageArray.map(img => img.b64))).map(b64 => {
1002
- return imageArray.find(img => img.b64 === b64)!;
1003
- });
1004
- if (uniqueImages.length === 0) return;
1162
+ for (const row of grid.kids) {
1163
+ if (!row.kids || !Array.isArray(row.kids)) continue;
1164
+
1165
+ for (const cell of row.kids) {
1166
+ if (!cell.kids || !Array.isArray(cell.kids)) continue;
1167
+
1168
+ for (const cellKid of cell.kids) {
1169
+ if (cellKid.tag === 'grid') {
1170
+ // Nested table inside cell — recurse
1171
+ processGridKids(cellKid);
1172
+ } else if (cellKid.tag === 'para' && cellKid.kids) {
1173
+ processKids(cellKid.kids);
1174
+ }
1175
+ }
1176
+ }
1177
+ }
1178
+ };
1005
1179
 
1006
- let imgIdx = 0;
1007
1180
  for (const node of content) {
1008
1181
  if (node.tag === 'para' && node.kids) {
1009
- for (let i = 0; i < node.kids.length; i++) {
1010
- const kid = node.kids[i];
1011
- // Span node structure: { tag: 'span', props, kids: [{ tag: 'txt', content }] }
1012
- if (kid.tag === 'span' && kid.kids && kid.kids[0]?.tag === 'txt') {
1013
- const text = kid.kids[0].content;
1014
- // Support both __IMG_N__ and __EXT_N__ patterns
1015
- const match = text.match?.(/^__(?:IMG|EXT)_(\d+)__$/);
1016
- if (match) {
1017
- // Replace placeholder with next available image (round-robin)
1018
- const imgNode = uniqueImages[imgIdx % uniqueImages.length];
1019
- if (imgNode) {
1020
- node.kids[i] = imgNode;
1021
- imgIdx++;
1022
- }
1023
- }
1182
+ // Process paragraph kids (spans, images, links, grids)
1183
+ processKids(node.kids);
1184
+
1185
+ // Also process any nested grids inside the paragraph
1186
+ for (const kid of node.kids) {
1187
+ if (kid.tag === 'grid') {
1188
+ processGridKids(kid);
1024
1189
  }
1025
1190
  }
1191
+ } else if (node.tag === 'grid') {
1192
+ // Process grid nodes (tables)
1193
+ processGridKids(node);
1026
1194
  }
1027
1195
  }
1028
1196
  }