hwpkit-dev 0.0.2 → 0.0.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (42)
  1. package/.npmignore +4 -1
  2. package/README.md +44 -7
  3. package/dist/index.d.mts +46 -16
  4. package/dist/index.d.ts +46 -16
  5. package/dist/index.js +3964 -1227
  6. package/dist/index.js.map +1 -1
  7. package/dist/index.mjs +3964 -1227
  8. package/dist/index.mjs.map +1 -1
  9. package/package.json +2 -1
  10. package/playground/index.html +346 -0
  11. package/playground/main.ts +302 -0
  12. package/playground/vite.config.ts +16 -0
  13. package/src/contract/decoder.ts +1 -0
  14. package/src/contract/encoder.ts +6 -1
  15. package/src/core/BaseDecoder.ts +118 -0
  16. package/src/core/BaseEncoder.ts +146 -0
  17. package/src/decoders/docx/DocxDecoder.ts +743 -151
  18. package/src/decoders/html/HtmlDecoder.ts +366 -0
  19. package/src/decoders/hwp/HwpScanner.ts +478 -193
  20. package/src/decoders/hwpx/HwpxDecoder.ts +796 -297
  21. package/src/decoders/md/MdDecoder.ts +4 -4
  22. package/src/encoders/docx/DocxEncoder.ts +549 -240
  23. package/src/encoders/html/HtmlEncoder.ts +17 -19
  24. package/src/encoders/hwp/HwpEncoder.ts +1643 -890
  25. package/src/encoders/hwpx/HwpxEncoder.ts +1626 -472
  26. package/src/encoders/hwpx/constants.ts +148 -0
  27. package/src/encoders/hwpx/utils.ts +198 -0
  28. package/src/encoders/md/MdEncoder.ts +20 -15
  29. package/src/model/builders.ts +4 -4
  30. package/src/model/doc-props.ts +24 -10
  31. package/src/model/doc-tree.ts +13 -5
  32. package/src/pipeline/Pipeline.ts +7 -3
  33. package/src/pipeline/registry.ts +13 -2
  34. package/src/safety/StyleBridge.ts +51 -6
  35. package/src/toolkit/ArchiveKit.ts +56 -0
  36. package/src/toolkit/StyleMapper.ts +221 -0
  37. package/src/toolkit/UnitConverter.ts +138 -0
  38. package/src/toolkit/XmlKit.ts +0 -5
  39. package/hwp-analyze.ts +0 -90
  40. package/inspect-doc.ts +0 -57
  41. package/output_test.hwp +0 -0
  42. package/test-docx-to-hwp.ts +0 -45
@@ -1,11 +1,12 @@
1
1
  import type { Decoder } from '../../contract/decoder';
2
- import type { DocRoot, ContentNode, ParaNode, SpanNode, ImgNode } from '../../model/doc-tree';
2
+ import type { DocRoot, ContentNode, ParaNode, SpanNode, ImgNode, GridNode, PageNumNode } from '../../model/doc-tree';
3
3
  import type { Outcome } from '../../contract/result';
4
4
  import type { Align, Stroke, StrokeKind, PageDims, TextProps, ParaProps, CellProps, GridProps } from '../../model/doc-props';
5
5
  import { succeed, fail } from '../../contract/result';
6
- import { buildRoot, buildSheet, buildPara, buildSpan, buildGrid, buildRow, buildCell, buildImg } from '../../model/builders';
6
+ import { buildRoot, buildSheet, buildPara, buildSpan, buildGrid, buildRow, buildCell, buildImg, buildPb, buildPageNum } from '../../model/builders';
7
7
  import { ShieldedParser } from '../../safety/ShieldedParser';
8
8
  import { BinaryKit } from '../../toolkit/BinaryKit';
9
+ import { TextKit } from '../../toolkit/TextKit';
9
10
  import { Metric, safeHex, safeFont } from '../../safety/StyleBridge';
10
11
  import { registry } from '../../pipeline/registry';
11
12
  import { A4 } from '../../model/doc-props';
@@ -38,10 +39,14 @@ function isTableTag(t: number) { return t === TAG_TABLE_A || t === TAG_TABLE_B;
38
39
  function isCellTag(t: number) { return t === TAG_CELL_A || t === TAG_CELL_B || t === TAG_LIST_HEADER; }
39
40
 
40
41
  // CTRL_HEADER ctrlId values (UINT32-LE as ASCII)
41
- const CTRL_TABLE = 0x74626C20; // ' lbt'
42
+ const CTRL_TABLE = 0x74626C20; // 'tbl ' = 표(table)
42
43
  const CTRL_IMAGE = 0x696D6720; // 'img '
43
44
  const CTRL_OBJ = 0x6F626A20; // 'obj '
44
45
  const CTRL_FIG = 0x66696720; // 'fig '
46
+ const CTRL_GSO = 0x67736F20; // 'gso ' = 그리기 객체 (drawing object, contains embedded images)
47
+ const CTRL_HEAD = 0x68656164; // 'head' = 머리말
48
+ const CTRL_FOOT = 0x666F6F74; // 'foot' = 꼬리말
49
+ const CTRL_ATNO = 0x61746E6F; // 'atno' = 자동 번호 (쪽번호 등)
45
50
 
46
51
  /* ═══════════════════════════════════════════════════════════════
47
52
  Types
@@ -64,15 +69,18 @@ interface HwpCharShape {
64
69
  subscript: boolean;
65
70
  textColor: string;
66
71
  }
67
-
68
72
  interface HwpParaShape {
69
73
  align: Align;
70
74
  spaceBefore: number;
71
75
  spaceAfter: number;
72
76
  lineSpacing: number;
77
+ lineSpacingType: 0 | 1 | 2 | 3; // 0=PERCENT, 1=FIXED, 2=BETWEEN_LINES, 3=AT_LEAST
78
+ leftMargin: number;
79
+ rightMargin: number;
73
80
  indent: number;
81
+ verAlign?: 'baseline' | 'top' | 'center' | 'bottom';
82
+ lineWrap?: 'break' | 'squeeze' | 'keep';
74
83
  }
75
-
76
84
  interface HwpBorderFill {
77
85
  borders: { type: number; widthPt: number; color: string }[];
78
86
  bgColor?: string;
@@ -210,7 +218,7 @@ function parseCharShape(d: Uint8Array): HwpCharShape {
210
218
 
211
219
  /* ── PARA_SHAPE ─────────────────────────────────────────────── */
212
220
  /* offset size field
213
- 0 4 attr1 (bits 0-1 = alignment: 0=justify,1=left,2=right,3=center)
221
+ 0 4 attr1 (bits 0-1 = line spacing type, bits 2-4 = alignment)
214
222
  4 4 leftMargin (HWPUNIT)
215
223
  8 4 rightMargin
216
224
  12 4 indent
@@ -218,17 +226,36 @@ function parseCharShape(d: Uint8Array): HwpCharShape {
218
226
  20 4 spaceAfter
219
227
  24 4 lineSpacing */
220
228
 
221
- const ALIGN_TBL: Record<number, Align> = { 0: 'justify', 1: 'left', 2: 'right', 3: 'center', 4: 'justify' };
229
+ const ALIGN_TBL: Record<number, Align> = { 0: 'justify', 1: 'left', 2: 'right', 3: 'center', 4: 'distribute', 5: 'distribute_space' };
222
230
 
223
231
  function parseParaShape(d: Uint8Array): HwpParaShape {
224
- if (d.length < 4) return { align: 'left', spaceBefore: 0, spaceAfter: 0, lineSpacing: 160, indent: 0 };
232
+ if (d.length < 4) return { align: 'left', spaceBefore: 0, spaceAfter: 0, lineSpacing: 160, lineSpacingType: 0, leftMargin: 0, rightMargin: 0, indent: 0 };
225
233
  const attr = BinaryKit.readU32LE(d, 0);
234
+
235
+ // bits 0-1: 줄 간격 종류 (0=PERCENT, 1=FIXED, 2=BETWEEN_LINES, 3=AT_LEAST)
236
+ const lineSpacingType = (attr & 0x3) as 0 | 1 | 2 | 3;
237
+
238
+ // bits 2-4: 정렬 방식 (0=justify,1=left,2=right,3=center,4=distribute,5=split)
239
+ const align = ALIGN_TBL[(attr >> 2) & 0x7] ?? 'left';
240
+
241
+ // 세로 정렬 (Bit 18 ~ Bit 19)
242
+ const vVal = (attr >> 18) & 0x3;
243
+ const verAlign = vVal === 1 ? 'top' : vVal === 2 ? 'center' : vVal === 3 ? 'bottom' : 'baseline';
244
+
245
+ // 줄 바꿈 기준: attr1 에는 별도 비트 없음, 기본값 'break'
246
+ const lineWrap: 'break' = 'break';
247
+
226
248
  return {
227
- align: ALIGN_TBL[(attr >> 2) & 0x7] ?? 'left',
228
- indent: d.length >= 16 ? i32(d, 12) : 0,
249
+ align,
250
+ lineSpacingType,
251
+ leftMargin: d.length >= 8 ? i32(d, 4) : 0, // offset 4: 문단 몸체 왼쪽 여백 (HWPUNIT)
252
+ rightMargin: d.length >= 12 ? i32(d, 8) : 0, // offset 8: 문단 몸체 오른쪽 여백 (HWPUNIT)
253
+ indent: d.length >= 16 ? i32(d, 12) : 0, // offset 12: 첫 줄 들여쓰기 (HWPUNIT)
229
254
  spaceBefore: d.length >= 20 ? i32(d, 16) : 0,
230
255
  spaceAfter: d.length >= 24 ? i32(d, 20) : 0,
231
256
  lineSpacing: d.length >= 28 ? i32(d, 24) : 160,
257
+ verAlign,
258
+ lineWrap,
232
259
  };
233
260
  }
234
261
 
@@ -275,8 +302,17 @@ function parseBorderFill(d: Uint8Array): HwpBorderFill {
275
302
  Body section parsing
276
303
  ═══════════════════════════════════════════════════════════════ */
277
304
 
305
+ // gsoCtx: shared mutable counter for 'gso ' drawing objects.
306
+ // Each 'gso ' CTRL_HEADER encountered increments this counter.
307
+ // objectMap is keyed by 0-based gso order = sequential BinData insertion order.
308
+ interface GsoCtx {
309
+ count: number;
310
+ headers?: ParaNode[];
311
+ footers?: ParaNode[];
312
+ }
313
+
278
314
  function parseBody(
279
- raw: Uint8Array, compressed: boolean, di: DocInfo, shield: ShieldedParser,
315
+ raw: Uint8Array, compressed: boolean, di: DocInfo, shield: ShieldedParser, gsoCtx: GsoCtx,
280
316
  ): { content: ContentNode[]; pageDims?: PageDims } {
281
317
  const recs = parseRecords(compressed ? tryInflate(raw) : raw);
282
318
  const content: ContentNode[] = [];
@@ -296,7 +332,7 @@ function parseBody(
296
332
  i++; // already handled above; skip at top level
297
333
  } else if (recs[i].tag === TAG_PARA_HEADER) {
298
334
  const r = shield.guard(
299
- () => parseParagraphGroup(recs, i, di, shield),
335
+ () => parseParagraphGroup(recs, i, di, shield, gsoCtx),
300
336
  { nodes: [] as ContentNode[], next: i + 1 },
301
337
  `hwp:para@${i}`,
302
338
  );
@@ -312,19 +348,25 @@ function parseBody(
312
348
  /* ── Paragraph group ────────────────────────────────────────── */
313
349
 
314
350
  function parseParagraphGroup(
315
- recs: HwpRecord[], start: number, di: DocInfo, shield: ShieldedParser,
351
+ recs: HwpRecord[], start: number, di: DocInfo, shield: ShieldedParser, gsoCtx: GsoCtx,
316
352
  ): { nodes: ContentNode[]; next: number } {
317
353
  const hdr = recs[start];
318
354
  const lv = hdr.level;
319
355
 
320
- // paraShapeId at offset 8 (UINT16)
321
- const psId = hdr.data.length >= 10 ? BinaryKit.readU16LE(hdr.data, 8) : 0;
322
- const ps = di.paraShapes[psId];
356
+ // P1: PARA_HEADER 레이아웃
357
+ // offset 8-9: paraShapeId (UINT16)
358
+ // offset 10: styleId (UINT8)
359
+ // offset 11: divideSort (UINT8) — 0x04=쪽나누기
360
+ const psId = hdr.data.length >= 10 ? BinaryKit.readU16LE(hdr.data, 8) : 0;
361
+ const hwpStyleId = hdr.data.length >= 11 ? hdr.data[10] : 0;
362
+ const divideSort = hdr.data.length >= 12 ? hdr.data[11] : 0;
363
+ const ps = di.paraShapes[psId];
323
364
 
324
365
  let text: ParaTextResult | null = null;
325
366
  let csPairs: [number, number][] = [];
326
367
  const grids: ContentNode[] = [];
327
- const ctrlHeaders: { ctrlId: number; objId: number }[] = [];
368
+ // imgId: for 'gso' uses sequential gsoCtx.count; for others uses flags-based objId
369
+ const ctrlHeaders: { ctrlId: number; imgId: number; wPt: number; hPt: number; atnoType?: number }[] = [];
328
370
  let i = start + 1;
329
371
 
330
372
  while (i < recs.length && recs[i].level > lv) {
@@ -339,20 +381,60 @@ function parseParagraphGroup(
339
381
  } else if (r.tag === TAG_CTRL_HEADER && r.level === lv + 1) {
340
382
  if (r.data.length >= 4) {
341
383
  const ctrlId = BinaryKit.readU32LE(r.data, 0);
342
- // objId at offset 4 (UINT16) - identifies the image/object in BinData
343
- const objId = r.data.length >= 6 ? BinaryKit.readU16LE(r.data, 4) : 0;
344
- ctrlHeaders.push({ ctrlId, objId });
345
-
346
- if (ctrlId === CTRL_TABLE) {
347
- const tr = shield.guard(
348
- () => parseTableCtrl(recs, i, di, shield),
349
- { grid: null, next: skipKids(recs, i) },
350
- `hwp:tbl@${i}`,
351
- );
352
- if (tr.grid) grids.push(tr.grid);
353
- i = tr.next;
384
+
385
+ if (ctrlId === CTRL_HEAD || ctrlId === CTRL_FOOT) {
386
+ // P8: 머리말/꼬리말 컨트롤 — 자식 문단을 파싱해 gsoCtx에 저장
387
+ const ctrlLv = r.level;
388
+ const hfParas: ParaNode[] = [];
389
+ let j = i + 1;
390
+ while (j < recs.length && recs[j].level > ctrlLv) {
391
+ if (recs[j].tag === TAG_PARA_HEADER) {
392
+ const pr = shield.guard(
393
+ () => parseParagraphGroup(recs, j, di, shield, gsoCtx),
394
+ { nodes: [] as ContentNode[], next: j + 1 },
395
+ `hwp:hf@${j}`,
396
+ );
397
+ hfParas.push(...pr.nodes.filter((n): n is ParaNode => n.tag === 'para'));
398
+ j = pr.next;
399
+ } else {
400
+ j++;
401
+ }
402
+ }
403
+ if (hfParas.length > 0) {
404
+ const key = ctrlId === CTRL_HEAD ? 'headers' : 'footers';
405
+ if (!gsoCtx[key]) gsoCtx[key] = hfParas;
406
+ }
407
+ i = j;
354
408
  } else {
355
- i = skipKids(recs, i);
409
+ // HWP 5.0 general-object layout:
410
+ // [0:4] ctrlId [4:4] flags [8:4] xOff [12:4] yOff
411
+ // [16:4] width(HWPUNIT) [20:4] height(HWPUNIT)
412
+ const MAX_HWP = 1_000_000;
413
+ const rawW = r.data.length >= 24 ? BinaryKit.readU32LE(r.data, 16) : 0;
414
+ const rawH = r.data.length >= 28 ? BinaryKit.readU32LE(r.data, 20) : 0;
415
+ const wPt = rawW > 0 && rawW < MAX_HWP ? Metric.hwpToPt(rawW) : 0;
416
+ const hPt = rawH > 0 && rawH < MAX_HWP ? Metric.hwpToPt(rawH) : 0;
417
+
418
+ // P9: atno — offset 4 u32 하위 4bit = 번호 종류 (0=쪽번호, 6=전체쪽수)
419
+ const atnoType = ctrlId === CTRL_ATNO && r.data.length >= 8
420
+ ? BinaryKit.readU32LE(r.data, 4) & 15
421
+ : undefined;
422
+
423
+ // 'gso ' (그리기 객체) uses sequential counter; others use flags-based id
424
+ const imgId = ctrlId === CTRL_GSO ? gsoCtx.count++ : (r.data.length >= 6 ? BinaryKit.readU16LE(r.data, 4) : 0);
425
+ ctrlHeaders.push({ ctrlId, imgId, wPt, hPt, atnoType });
426
+
427
+ if (ctrlId === CTRL_TABLE) {
428
+ const tr = shield.guard(
429
+ () => parseTableCtrl(recs, i, di, shield, gsoCtx),
430
+ { grid: null, next: skipKids(recs, i) },
431
+ `hwp:tbl@${i}`,
432
+ );
433
+ if (tr.grid) grids.push(tr.grid);
434
+ i = tr.next;
435
+ } else {
436
+ i = skipKids(recs, i);
437
+ }
356
438
  }
357
439
  } else {
358
440
  i = skipKids(recs, i);
@@ -362,43 +444,68 @@ function parseParagraphGroup(
362
444
  }
363
445
  }
364
446
 
365
- // Match extended controls with CTRL_HEADER entries
366
- if (text && ctrlHeaders.length > 0) {
367
- for (let ci = 0; ci < text.controls.length; ci++) {
368
- if (ci < ctrlHeaders.length) {
369
- text.controls[ci].ctrlId = ctrlHeaders[ci].ctrlId;
370
- text.controls[ci].matched = true;
371
- }
372
- }
373
- }
374
-
375
447
  const nodes: ContentNode[] = [];
376
448
 
377
- // Build paragraph from text and inline controls (images)
378
- if (text && (text.chars.length > 0 || text.controls.length > 0)) {
379
- const paraContent: (SpanNode | ContentNode)[] = [];
449
+ {
450
+ const paraContent: Array<SpanNode | GridNode | PageNumNode> = [];
380
451
 
381
- // Process text chars and controls together
382
- if (text.chars.length > 0) {
383
- const spans = resolveCharShapes(text.chars, csPairs, di);
384
- paraContent.push(...spans);
452
+ // P9: atno 컨트롤 위치 수집 (pos 기준 정렬)
453
+ const atnoCtrls: { pos: number; type: number }[] = [];
454
+ if (text && text.controls.length > 0) {
455
+ for (let ci = 0; ci < text.controls.length; ci++) {
456
+ const ch = ctrlHeaders[ci];
457
+ if (ch && ch.ctrlId === CTRL_ATNO)
458
+ atnoCtrls.push({ pos: text.controls[ci].pos, type: ch.atnoType ?? 0 });
459
+ }
460
+ atnoCtrls.sort((a, b) => a.pos - b.pos);
461
+ }
462
+
463
+ // P9: 텍스트 chars를 atno 위치 기준으로 분할하여 PageNumNode 삽입
464
+ if (text && text.chars.length > 0) {
465
+ if (atnoCtrls.length > 0) {
466
+ let k = 0;
467
+ for (const ac of atnoCtrls) {
468
+ const seg: ParsedChar[] = [];
469
+ while (k < text.chars.length && text.chars[k].pos < ac.pos) seg.push(text.chars[k++]);
470
+ if (seg.length > 0) paraContent.push(...resolveCharShapes(seg, csPairs, di));
471
+ paraContent.push(buildPageNum(ac.type === 0 ? 'decimal' : 'total'));
472
+ }
473
+ const rest = text.chars.slice(k);
474
+ if (rest.length > 0) paraContent.push(...resolveCharShapes(rest, csPairs, di));
475
+ } else {
476
+ paraContent.push(...resolveCharShapes(text.chars, csPairs, di));
477
+ }
478
+ } else if (atnoCtrls.length > 0) {
479
+ for (const ac of atnoCtrls) paraContent.push(buildPageNum(ac.type === 0 ? 'decimal' : 'total'));
385
480
  }
386
481
 
387
- // Add placeholder spans for extended controls (images)
388
- if (text.controls.length > 0) {
482
+ // Image placeholder spans: only for actual image controls.
483
+ // Non-image controls (footnotes, TOC entries, etc.) are silently skipped.
484
+ if (text && text.controls.length > 0) {
389
485
  for (let ci = 0; ci < text.controls.length; ci++) {
390
- // Create placeholder for all extended controls
391
- // Image replacement will happen later in injectImagesIntoContent
392
- paraContent.push(buildSpan(`__EXT_${ci}__`));
486
+ const ch = ctrlHeaders[ci];
487
+ if (!ch) continue; // anchor-only ctrl (gso is sibling, not inline)
488
+ const isImg = ch.ctrlId === CTRL_IMAGE || ch.ctrlId === CTRL_FIG || ch.ctrlId === CTRL_OBJ || ch.ctrlId === CTRL_GSO;
489
+ if (!isImg) continue; // skip footnotes, TOC, page num, etc.
490
+ const dimStr = (ch.wPt > 0 && ch.hPt > 0)
491
+ ? `_W${Math.round(ch.wPt)}_H${Math.round(ch.hPt)}`
492
+ : '';
493
+ paraContent.push(buildSpan(`__EXT_${ch.imgId}${dimStr}__`));
393
494
  }
394
495
  }
395
496
 
396
- if (paraContent.length > 0) {
397
- nodes.push(buildPara(paraContent as any, buildParaProps(ps)));
497
+ // P5: 쪽나누기(divideSort & 4) → page-break 문단 먼저 출력
498
+ if (divideSort & 4) {
499
+ nodes.push(buildPara([{ tag: 'span', props: {}, kids: [buildPb()] } as SpanNode]));
398
500
  }
501
+ // P5: 표 → 앵커 문단 순서 (앵커 문단 드롭 금지)
502
+ nodes.push(...grids);
503
+ nodes.push(buildPara(
504
+ paraContent.length > 0 ? paraContent as any : [buildSpan('')],
505
+ buildParaProps(ps, hwpStyleId),
506
+ ));
399
507
  }
400
508
 
401
- nodes.push(...grids);
402
509
  return { nodes, next: i };
403
510
  }
404
511
 
@@ -411,8 +518,8 @@ function skipKids(recs: HwpRecord[], idx: number): number {
411
518
 
412
519
  /* ── PARA_TEXT ───────────────────────────────────────────────── */
413
520
 
414
- // Extended controls: 8 WORDs, associated CTRL_HEADER
415
- const EXT_CTRL = new Set([2, 3, 11, 12, 14, 15]);
521
+ // Extended controls: 8 WORDs, associated CTRL_HEADER (16-25 also skip 16 bytes)
522
+ const EXT_CTRL = new Set([2, 3, 11, 12, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25]);
416
523
  // Inline controls: 8 WORDs, no CTRL_HEADER
417
524
  const INL_CTRL = new Set([4, 5, 6, 7, 8]);
418
525
 
@@ -511,7 +618,7 @@ function styledSpan(text: string, shapeId: number, di: DocInfo): SpanNode {
511
618
  /* ── Table control parsing ──────────────────────────────────── */
512
619
 
513
620
  function parseTableCtrl(
514
- recs: HwpRecord[], ctrlIdx: number, di: DocInfo, shield: ShieldedParser,
621
+ recs: HwpRecord[], ctrlIdx: number, di: DocInfo, shield: ShieldedParser, gsoCtx: GsoCtx,
515
622
  ): { grid: ContentNode | null; next: number } {
516
623
  const ctrlLv = recs[ctrlIdx].level;
517
624
  let i = ctrlIdx + 1;
@@ -567,15 +674,15 @@ function parseTableCtrl(
567
674
  const rowCnt = tblData.length >= 6 ? BinaryKit.readU16LE(tblData, 4) : 1;
568
675
  const colCnt = tblData.length >= 8 ? BinaryKit.readU16LE(tblData, 6) : 1;
569
676
 
570
- interface PC { row: number; col: number; cs: number; rs: number; widthHwp: number; props: CellProps; paras: ParaNode[] }
677
+ interface PC { row: number; col: number; cs: number; rs: number; widthHwp: number; heightHwp?: number; props: CellProps; cellChildren: (ParaNode | GridNode)[] }
571
678
  const parsed: PC[] = [];
572
679
 
573
680
  for (let ci = 0; ci < cells.length; ci++) {
574
681
  const c = cells[ci];
575
682
  const seqIdx = ci;
576
683
  const pc = shield.guard(
577
- () => parseCellRec(c.data, c.tag, recs, c.cStart, c.cEnd, di, shield, seqIdx, colCnt),
578
- { row: Math.floor(ci / (colCnt || 1)), col: ci % (colCnt || 1), cs: 1, rs: 1, widthHwp: 0, props: {}, paras: [buildPara([buildSpan('')])] },
684
+ () => parseCellRec(c.data, c.tag, recs, c.cStart, c.cEnd, di, shield, seqIdx, colCnt, gsoCtx),
685
+ { row: Math.floor(ci / (colCnt || 1)), col: ci % (colCnt || 1), cs: 1, rs: 1, widthHwp: 0, heightHwp: undefined, props: {}, cellChildren: [buildPara([buildSpan('')])] },
579
686
  `hwp:cell@${c.cStart}`,
580
687
  );
581
688
  parsed.push(pc);
@@ -602,9 +709,11 @@ function parseTableCtrl(
602
709
  }
603
710
  }
604
711
  // Pass 2: for columns still 0, try to derive from multi-span cells
712
+ // Sort by span size ascending so smaller, more precise spans fill widths before larger spans
605
713
  const zeroColumns = colWidthsPt.filter(w => w === 0).length;
606
714
  if (zeroColumns > 0) {
607
- for (const c of parsed) {
715
+ const spanCells = parsed.filter(c => c.cs > 1 && c.widthHwp > 0).sort((a, b) => a.cs - b.cs);
716
+ for (const c of spanCells) {
608
717
  if (c.cs > 1 && c.widthHwp > 0) {
609
718
  // Subtract known column widths from the span
610
719
  let known = 0;
@@ -624,13 +733,37 @@ function parseTableCtrl(
624
733
  }
625
734
  }
626
735
 
736
+ // Post-process: clamp near-zero column widths (< 1pt = floating-point artifact) to minimum 1pt
737
+ for (let i = 0; i < colWidthsPt.length; i++) {
738
+ if (colWidthsPt[i] > 0 && colWidthsPt[i] < 1) colWidthsPt[i] = 1;
739
+ }
740
+
627
741
  const rows = [];
628
742
  for (let r = 0; r < actualRowCnt; r++) {
629
743
  const rc = parsed.filter(c => c.row === r).sort((a, b) => a.col - b.col);
630
744
  if (rc.length === 0) continue;
631
- rows.push(buildRow(rc.map(c =>
632
- buildCell(c.paras.length ? c.paras : [buildPara([buildSpan('')])], { cs: c.cs, rs: c.rs, props: c.props }),
633
- )));
745
+
746
+ // Calculate row height prefer rs=1 cells (exact per-row height)
747
+ let rowHeightPt: number | undefined = undefined;
748
+ for (const c of rc) {
749
+ if (c.heightHwp && c.heightHwp > 0 && c.rs === 1) {
750
+ const hPt = Metric.hwpToPt(c.heightHwp);
751
+ if (rowHeightPt == null || hPt > rowHeightPt) rowHeightPt = hPt;
752
+ }
753
+ }
754
+ // Fallback: all cells span multiple rows → approximate height per row
755
+ if (rowHeightPt == null) {
756
+ for (const c of rc) {
757
+ if (c.heightHwp && c.heightHwp > 0) {
758
+ const hPt = Metric.hwpToPt(c.heightHwp) / c.rs;
759
+ if (rowHeightPt == null || hPt > rowHeightPt) rowHeightPt = hPt;
760
+ }
761
+ }
762
+ }
763
+
764
+ rows.push(buildRow(rc.map(c => {
765
+ return buildCell(c.cellChildren, { cs: c.cs, rs: c.rs, props: c.props });
766
+ }), rowHeightPt));
634
767
  }
635
768
  if (rows.length === 0) return { grid: null, next: i };
636
769
 
@@ -659,10 +792,11 @@ function parseTableCtrl(
659
792
 
660
793
  function parseCellRec(
661
794
  d: Uint8Array, tag: number, recs: HwpRecord[], cStart: number, cEnd: number,
662
- di: DocInfo, shield: ShieldedParser, seqIdx: number, colCnt: number,
795
+ di: DocInfo, shield: ShieldedParser, seqIdx: number, colCnt: number, gsoCtx: GsoCtx,
663
796
  ) {
664
797
  let col: number, row: number, cs = 1, rs = 1;
665
798
  let widthHwp = 0;
799
+ let heightHwp = 0;
666
800
  const props: CellProps = {};
667
801
 
668
802
  const attr = d.length >= 6 ? BinaryKit.readU32LE(d, 2) : 0;
@@ -670,88 +804,158 @@ function parseCellRec(
670
804
  if (va === 1) props.va = 'mid';
671
805
  else if (va === 2) props.va = 'bot';
672
806
 
807
+ const HWP_PAD_LR_DEFAULT = 360;
808
+ const HWP_PAD_TB_DEFAULT = 141;
809
+
673
810
  if (tag === TAG_LIST_HEADER && d.length >= 22) {
674
- // LIST_HEADER with cell-specific fields
675
- // offset 8: colAddr, offset 10: rowAddr (HWP 5.0 spec)
676
811
  col = BinaryKit.readU16LE(d, 8);
677
812
  row = BinaryKit.readU16LE(d, 10);
678
813
  cs = Math.max(1, BinaryKit.readU16LE(d, 12));
679
814
  rs = Math.max(1, BinaryKit.readU16LE(d, 14));
680
815
  widthHwp = BinaryKit.readU32LE(d, 16);
681
-
682
- const bfId = d.length >= 34 ? BinaryKit.readU16LE(d, 32) : 0;
683
- if (bfId > 0 && bfId <= di.borderFills.length) {
684
- const bf = di.borderFills[bfId - 1];
685
- if (bf.borders.length >= 4) {
686
- props.left = toStroke(bf.borders[0]);
687
- props.right = toStroke(bf.borders[1]);
688
- props.top = toStroke(bf.borders[2]);
689
- props.bot = toStroke(bf.borders[3]);
690
- }
691
- if (bf.bgColor && bf.bgColor !== 'FFFFFF') props.bg = bf.bgColor;
816
+ heightHwp = d.length >= 24 ? BinaryKit.readU32LE(d, 20) : 0;
817
+ if (d.length >= 32) {
818
+ const pL = BinaryKit.readU16LE(d, 24); const pR = BinaryKit.readU16LE(d, 26);
819
+ const pT = BinaryKit.readU16LE(d, 28); const pB = BinaryKit.readU16LE(d, 30);
820
+ if (pL !== HWP_PAD_LR_DEFAULT) props.padL = Metric.hwpToPt(pL);
821
+ if (pR !== HWP_PAD_LR_DEFAULT) props.padR = Metric.hwpToPt(pR);
822
+ if (pT !== HWP_PAD_TB_DEFAULT) props.padT = Metric.hwpToPt(pT);
823
+ if (pB !== HWP_PAD_TB_DEFAULT) props.padB = Metric.hwpToPt(pB);
692
824
  }
825
+ const bfId = d.length >= 34 ? BinaryKit.readU16LE(d, 32) : 0;
826
+ if (bfId > 0 && bfId <= di.borderFills.length) applyCellBorderFill(di.borderFills[bfId - 1], props);
693
827
  } else if (tag !== TAG_LIST_HEADER) {
694
- // Full CELL record with position/span/borderFill
695
828
  col = d.length >= 8 ? BinaryKit.readU16LE(d, 6) : seqIdx % (colCnt || 1);
696
829
  row = d.length >= 10 ? BinaryKit.readU16LE(d, 8) : Math.floor(seqIdx / (colCnt || 1));
697
830
  cs = d.length >= 12 ? Math.max(1, BinaryKit.readU16LE(d, 10)) : 1;
698
831
  rs = d.length >= 14 ? Math.max(1, BinaryKit.readU16LE(d, 12)) : 1;
699
832
  widthHwp = d.length >= 18 ? BinaryKit.readU32LE(d, 14) : 0;
700
-
701
- const bfId = d.length >= 32 ? BinaryKit.readU16LE(d, 30) : 0;
702
- if (bfId > 0 && bfId <= di.borderFills.length) {
703
- const bf = di.borderFills[bfId - 1];
704
- if (bf.borders.length >= 4) {
705
- props.left = toStroke(bf.borders[0]);
706
- props.right = toStroke(bf.borders[1]);
707
- props.top = toStroke(bf.borders[2]);
708
- props.bot = toStroke(bf.borders[3]);
709
- }
710
- if (bf.bgColor && bf.bgColor !== 'FFFFFF') props.bg = bf.bgColor;
833
+ heightHwp = d.length >= 22 ? BinaryKit.readU32LE(d, 18) : 0;
834
+ if (d.length >= 30) {
835
+ const pL = BinaryKit.readU16LE(d, 22); const pR = BinaryKit.readU16LE(d, 24);
836
+ const pT = BinaryKit.readU16LE(d, 26); const pB = BinaryKit.readU16LE(d, 28);
837
+ if (pL !== HWP_PAD_LR_DEFAULT) props.padL = Metric.hwpToPt(pL);
838
+ if (pR !== HWP_PAD_LR_DEFAULT) props.padR = Metric.hwpToPt(pR);
839
+ if (pT !== HWP_PAD_TB_DEFAULT) props.padT = Metric.hwpToPt(pT);
840
+ if (pB !== HWP_PAD_TB_DEFAULT) props.padB = Metric.hwpToPt(pB);
711
841
  }
842
+ const bfId = d.length >= 32 ? BinaryKit.readU16LE(d, 30) : 0;
843
+ if (bfId > 0 && bfId <= di.borderFills.length) applyCellBorderFill(di.borderFills[bfId - 1], props);
712
844
  } else {
713
- // Fallback: LIST_HEADER too short, compute sequentially
714
845
  row = Math.floor(seqIdx / (colCnt || 1));
715
846
  col = seqIdx % (colCnt || 1);
716
847
  }
717
848
 
718
- // Parse cell content paragraphs
719
- const paras: ParaNode[] = [];
849
+ const cellChildren: (ParaNode | GridNode)[] = [];
850
+ const MAX_HWP = 1_000_000;
720
851
  let k = cStart;
852
+
721
853
  while (k < cEnd) {
722
854
  if (recs[k].tag === TAG_PARA_HEADER) {
723
- // For cell paragraphs, they might be at various nesting levels
855
+ // Parse paragraph inside cell also extracts nested tables within the paragraph
724
856
  const r = shield.guard(
725
857
  () => {
726
858
  const hdr = recs[k];
727
859
  const lv = hdr.level;
728
860
  const psId = hdr.data.length >= 10 ? BinaryKit.readU16LE(hdr.data, 8) : 0;
861
+ // P6: 셀 내부 문단의 styleId / divideSort 읽기
862
+ const cellStyleId = hdr.data.length >= 11 ? hdr.data[10] : 0;
863
+ const cellDivide = hdr.data.length >= 12 ? hdr.data[11] : 0;
729
864
  const ps = di.paraShapes[psId];
730
865
  let txt: ParaTextResult | null = null;
731
866
  let csp: [number, number][] = [];
867
+ const ctrlHdrs: { ctrlId: number; imgId: number; wPt: number; hPt: number }[] = [];
868
+ const innerGrids: GridNode[] = [];
732
869
  let j = k + 1;
733
870
  while (j < cEnd && recs[j].level > lv) {
734
871
  if (recs[j].tag === TAG_PARA_TEXT) { txt = decodeParaText(recs[j].data); j++; }
735
872
  else if (recs[j].tag === TAG_PARA_CHAR_SHAPE) { csp = parseCharShapePairs(recs[j].data); j++; }
873
+ else if (recs[j].tag === TAG_CTRL_HEADER && recs[j].level === lv + 1) {
874
+ if (recs[j].data.length >= 4) {
875
+ const ctrlId = BinaryKit.readU32LE(recs[j].data, 0);
876
+ if (ctrlId === CTRL_TABLE) {
877
+ // Nested table inside a cell paragraph — recurse into parseTableCtrl
878
+ const nestedTr = shield.guard(
879
+ () => parseTableCtrl(recs, j, di, shield, gsoCtx),
880
+ { grid: null, next: skipKids(recs, j) },
881
+ `hwp:innerNestedTbl@${j}`,
882
+ );
883
+ if (nestedTr.grid) innerGrids.push(nestedTr.grid as GridNode);
884
+ j = nestedTr.next;
885
+ } else {
886
+ const rawW = recs[j].data.length >= 24 ? BinaryKit.readU32LE(recs[j].data, 16) : 0;
887
+ const rawH = recs[j].data.length >= 28 ? BinaryKit.readU32LE(recs[j].data, 20) : 0;
888
+ const wPt = rawW > 0 && rawW < MAX_HWP ? Metric.hwpToPt(rawW) : 0;
889
+ const hPt = rawH > 0 && rawH < MAX_HWP ? Metric.hwpToPt(rawH) : 0;
890
+ const imgId = ctrlId === CTRL_GSO ? gsoCtx.count++ : (recs[j].data.length >= 6 ? BinaryKit.readU16LE(recs[j].data, 4) : 0);
891
+ ctrlHdrs.push({ ctrlId, imgId, wPt, hPt });
892
+ j = skipKids(recs, j);
893
+ }
894
+ } else {
895
+ j = skipKids(recs, j);
896
+ }
897
+ }
736
898
  else j++;
737
899
  }
738
- const spans = txt && txt.chars.length > 0 ? resolveCharShapes(txt.chars, csp, di) : [buildSpan('')];
739
- return { para: buildPara(spans, buildParaProps(ps)), next: j };
900
+ const paraContent: (SpanNode | ContentNode)[] = [];
901
+ if (txt && txt.chars.length > 0) paraContent.push(...resolveCharShapes(txt.chars, csp, di));
902
+ if (txt && txt.controls.length > 0) {
903
+ for (let ci = 0; ci < txt.controls.length; ci++) {
904
+ const ch = ctrlHdrs[ci];
905
+ if (!ch) continue;
906
+ const isImg = ch.ctrlId === CTRL_IMAGE || ch.ctrlId === CTRL_FIG || ch.ctrlId === CTRL_OBJ || ch.ctrlId === CTRL_GSO;
907
+ if (!isImg) continue;
908
+ const dimStr = (ch.wPt > 0 && ch.hPt > 0) ? `_W${Math.round(ch.wPt)}_H${Math.round(ch.hPt)}` : '';
909
+ paraContent.push(buildSpan(`__EXT_${ch.imgId}${dimStr}__`));
910
+ }
911
+ }
912
+ const kids = paraContent.length > 0 ? paraContent as any : [buildSpan('')];
913
+ // P6: innerGrids 먼저, 앵커 문단 나중 (P5와 동일한 순서)
914
+ const items: (ParaNode | GridNode)[] = [...innerGrids, buildPara(kids, buildParaProps(ps, cellStyleId))];
915
+ if (cellDivide & 4) items.unshift(buildPara([{ tag: 'span', props: {}, kids: [buildPb()] } as SpanNode]));
916
+ return { items, next: j };
740
917
  },
741
- { para: buildPara([buildSpan('')]), next: k + 1 },
918
+ { items: [buildPara([buildSpan('')])] as (ParaNode | GridNode)[], next: k + 1 },
742
919
  `hwp:cellP@${k}`,
743
920
  );
744
- paras.push(r.para);
921
+ cellChildren.push(...r.items);
745
922
  k = r.next;
923
+ } else if (recs[k].tag === TAG_CTRL_HEADER && recs[k].data.length >= 4) {
924
+ // CTRL_HEADER at cell level (sibling of PARA_HEADER) — anchored 'gso' images and outer-level nested tables
925
+ const cellCtrlId = BinaryKit.readU32LE(recs[k].data, 0);
926
+ if (cellCtrlId === CTRL_GSO) {
927
+ const gsoId = gsoCtx.count++;
928
+ const rawW = recs[k].data.length >= 24 ? BinaryKit.readU32LE(recs[k].data, 16) : 0;
929
+ const rawH = recs[k].data.length >= 28 ? BinaryKit.readU32LE(recs[k].data, 20) : 0;
930
+ const wPt = rawW > 0 && rawW < MAX_HWP ? Metric.hwpToPt(rawW) : 0;
931
+ const hPt = rawH > 0 && rawH < MAX_HWP ? Metric.hwpToPt(rawH) : 0;
932
+ const dimStr = (wPt > 0 && hPt > 0) ? `_W${Math.round(wPt)}_H${Math.round(hPt)}` : '';
933
+ cellChildren.push(buildPara([buildSpan(`__EXT_${gsoId}${dimStr}__`)]));
934
+ k = skipKids(recs, k);
935
+ } else if (cellCtrlId === CTRL_TABLE) {
936
+ const tr = shield.guard(
937
+ () => parseTableCtrl(recs, k, di, shield, gsoCtx),
938
+ { grid: null, next: skipKids(recs, k) },
939
+ `hwp:nestedTbl@${k}`,
940
+ );
941
+ if (tr.grid) cellChildren.push(tr.grid as GridNode);
942
+ k = tr.next;
943
+ } else {
944
+ k = skipKids(recs, k);
945
+ }
746
946
  } else { k++; }
747
947
  }
748
948
 
749
- return { row, col, cs, rs, props, widthHwp, paras: paras.length ? paras : [buildPara([buildSpan('')])] };
949
+ return {
950
+ row, col, cs, rs, props, widthHwp,
951
+ heightHwp: heightHwp || undefined,
952
+ cellChildren: cellChildren.length ? cellChildren : [buildPara([buildSpan('')])],
953
+ };
750
954
  }
751
955
 
752
956
  /* ── PAGE_DEF ───────────────────────────────────────────────── */
753
957
  /* [0:4] width [4:4] height [8:4] ml [12:4] mr
754
- [16:4] mt [20:4] mb [36:4] attr (bit0=landscape) */
958
+ [16:4] mt [20:4] mb [24:4] header [28:4] footer [36:4] attr (bit0=landscape) */
755
959
 
756
960
  function parsePageDef(d: Uint8Array): PageDims {
757
961
  if (d.length < 24) return A4;
@@ -761,11 +965,15 @@ function parsePageDef(d: Uint8Array): PageDims {
761
965
  const mr = BinaryKit.readU32LE(d, 12);
762
966
  const mt = BinaryKit.readU32LE(d, 16);
763
967
  const mb = BinaryKit.readU32LE(d, 20);
968
+ const header = d.length >= 28 ? BinaryKit.readU32LE(d, 24) : 0;
969
+ const footer = d.length >= 32 ? BinaryKit.readU32LE(d, 28) : 0;
764
970
  const at = d.length >= 40 ? BinaryKit.readU32LE(d, 36) : 0;
765
971
  return {
766
972
  wPt: Metric.hwpToPt(w), hPt: Metric.hwpToPt(h),
767
973
  ml: Metric.hwpToPt(ml), mr: Metric.hwpToPt(mr),
768
974
  mt: Metric.hwpToPt(mt), mb: Metric.hwpToPt(mb),
975
+ headerPt: header > 0 ? Metric.hwpToPt(header) : undefined,
976
+ footerPt: footer > 0 ? Metric.hwpToPt(footer) : undefined,
769
977
  orient: (at & 1) ? 'landscape' : 'portrait',
770
978
  };
771
979
  }
@@ -788,6 +996,18 @@ function toStroke(b: { type: number; widthPt: number; color: string }): Stroke {
788
996
  return { kind: BORDER_KIND[b.type] ?? 'solid', pt: b.widthPt, color: b.color };
789
997
  }
790
998
 
999
/**
 * Copies the edge strokes and background colour of an HWP borderFill onto a
 * cell's props (mutates `props` in place).
 *
 * No border kind is filtered out here: an explicit NONE edge is kept so that a
 * DOCX cell-level tcBorders entry can override the table-level tblBorders,
 * which would otherwise bleed through.
 *
 * @param bf     Source borderFill; edges are only applied when it carries at
 *               least four borders, read in the order left, right, top, bottom.
 * @param props  Cell props to update.
 */
function applyCellBorderFill(bf: HwpBorderFill, props: CellProps): void {
  const { borders, bgColor } = bf;

  if (borders.length >= 4) {
    // Index in bf.borders → target property on the cell props.
    const edgeKeys = ['left', 'right', 'top', 'bot'] as const;
    edgeKeys.forEach((edgeKey, slot) => {
      props[edgeKey] = toStroke(borders[slot]);
    });
  }

  // A missing/empty colour and plain white ('FFFFFF') both leave bg untouched.
  const isPlainWhite = bgColor === 'FFFFFF';
  if (bgColor && !isPlainWhite) {
    props.bg = bgColor;
  }
}
1010
+
791
1011
  function strokeFromBF(bfId: number, di: DocInfo): Stroke | undefined {
792
1012
  if (bfId <= 0 || bfId > di.borderFills.length) return undefined;
793
1013
  const bf = di.borderFills[bfId - 1];
@@ -796,14 +1016,30 @@ function strokeFromBF(bfId: number, di: DocInfo): Stroke | undefined {
796
1016
  return { kind: BORDER_KIND[b.type] ?? 'solid', pt: b.widthPt, color: b.color };
797
1017
  }
798
1018
 
799
// buildParaProps: maps an HWP paragraph shape (ps) to ParaProps. This span shows both revisions of the
// function interleaved by the diff: lines marked "-" belong to the removed revision, "+" to the added one.
- function buildParaProps(ps?: HwpParaShape): ParaProps {
800
- if (!ps) return {};
801
- const p: ParaProps = {};
1019
+ function buildParaProps(ps?: HwpParaShape, hwpStyleId?: number): ParaProps {
1020
+ // P2: seed the result with hwpStyleId when it is given (an empty object when it is undefined)
1021
+ const p: ParaProps = hwpStyleId !== undefined ? { hwpStyleId } : {};
1022
+ if (!ps) return p;
802
1023
  if (ps.align && ps.align !== 'left') p.align = ps.align;
803
1024
  if (ps.spaceBefore > 0) p.spaceBefore = Metric.hwpToPt(ps.spaceBefore);
804
1025
  if (ps.spaceAfter > 0) p.spaceAfter = Metric.hwpToPt(ps.spaceAfter);
805
- if (ps.lineSpacing > 0 && ps.lineSpacing !== 160) p.lineHeight = ps.lineSpacing / 100;
806
- if (ps.indent > 0) p.indentPt = Metric.hwpToPt(ps.indent);
1026
+ // Line spacing: type=1 (FIXED) → lineHeightFixed in pt; any other type (type=0, PERCENT) → lineHeight as a ratio
1027
+ if (ps.lineSpacingType === 1) {
1028
+ if (ps.lineSpacing > 0) p.lineHeightFixed = Metric.hwpToPt(ps.lineSpacing);
1029
+ } else {
1030
+ // P10: fixes the bug where 160% (the HWP default) was dropped — lineHeight is now set for any positive lineSpacing
1031
+ if (ps.lineSpacing > 0) p.lineHeight = ps.lineSpacing / 100;
1032
+ }
1033
+ // leftMargin (offset 4) = left margin of the paragraph body → leftMargin (pt), clamped to be non-negative
1034
+ const leftMarginPt = Math.max(0, Metric.hwpToPt(ps.leftMargin));
1035
+ if (leftMarginPt > 0) p.leftMargin = leftMarginPt;
1036
+ // rightMargin (offset 8) = right margin of the paragraph body → indentRightPt (pt), clamped to be non-negative
1037
+ const rightMarginPt = Math.max(0, Metric.hwpToPt(ps.rightMargin));
1038
+ if (rightMarginPt > 0) p.indentRightPt = rightMarginPt;
1039
+ // indent (offset 12) = first-line indent (positive) / hanging indent (negative) → firstLineIndentPt; only zero is skipped
1040
+ if (ps.indent !== 0) p.firstLineIndentPt = Metric.hwpToPt(ps.indent);
1041
+ if (ps.verAlign && ps.verAlign !== 'baseline') p.verAlign = ps.verAlign;
1042
+ if (ps.lineWrap && ps.lineWrap !== 'break') p.lineWrap = ps.lineWrap;
807
1043
  return p;
808
1044
  }
809
1045
 
@@ -813,6 +1049,7 @@ function buildParaProps(ps?: HwpParaShape): ParaProps {
813
1049
 
814
1050
  export class HwpScanner implements Decoder {
815
1051
  readonly format = 'hwp';
1052
+ readonly aliases = ['application/vnd.hancom.hwp'];
816
1053
 
817
1054
  async decode(data: Uint8Array): Promise<Outcome<DocRoot>> {
818
1055
  const shield = new ShieldedParser();
@@ -834,52 +1071,36 @@ export class HwpScanner implements Decoder {
834
1071
  di = shield.guard(() => parseDocInfo(diRaw, compressed), di, 'hwp:docInfo');
835
1072
  }
836
1073
 
837
- // Extract images from BinData streams
838
- const imageStreams: { path: string; data: Uint8Array }[] = [];
839
- for (const [path, data] of streams) {
840
- if ((path.includes('BinData') || path.includes('.jpg') || path.includes('.jpeg') || path.includes('.png') || path.includes('.gif') || path.includes('.bmp'))
841
- && !path.includes('FileHeader') && !path.includes('DocInfo') && !path.includes('BodyText') && !path.includes('Section')) {
842
- imageStreams.push({ path, data });
843
- console.log(`[HwpScanner] Image stream found: ${path} (${data.length} bytes)`);
844
- }
1074
+ // Extract images from BinData streams.
1075
+ // HWP duplicates each BinData entry: once as "BinData/BIN0001.jpg" and once as "BIN0001.jpg".
1076
+ // We keep only the "BinData/" prefixed versions, sort by BIN number, then assign 0-based keys
1077
+ // matching the order 'gso' CTRL_HEADER records are encountered during body parsing.
1078
+ const binEntries: { binNum: number; data: Uint8Array }[] = [];
1079
+ for (const [path, streamData] of streams) {
1080
+ // Match "BinData/BIN0001.jpg" style the canonical form
1081
+ const m = path.match(/^BinData[/\\]BIN(\d+)\.\w+$/i);
1082
+ if (m) binEntries.push({ binNum: parseInt(m[1], 10), data: streamData });
845
1083
  }
1084
+ // Sort by BIN number (ascending) so BIN0001→idx0, BIN0002→idx1, …
1085
+ binEntries.sort((a, b) => a.binNum - b.binNum);
846
1086
 
847
- // Create image nodes for each image stream (deduplicated by hash)
848
1087
  const objectMap = new Map<number, ImgNode>();
849
- const seenHashes = new Set<string>();
850
- let imgIdx = 0;
851
- for (const { path, data } of imageStreams) {
852
- // Determine MIME type from extension or signature
853
- let mimeType = 'image/jpeg';
854
- const lowerPath = path.toLowerCase();
855
- if (lowerPath.includes('.png')) mimeType = 'image/png';
856
- else if (lowerPath.includes('.gif')) mimeType = 'image/gif';
857
- else if (lowerPath.includes('.bmp')) mimeType = 'image/bmp';
858
-
859
- // Also check signature
860
- if (data[0] === 0x89 && data[1] === 0x50 && data[2] === 0x4E && data[3] === 0x47) mimeType = 'image/png';
861
- else if (data[0] === 0x47 && data[1] === 0x49 && data[2] === 0x46 && data[3] === 0x3538) mimeType = 'image/gif';
862
- else if (data[0] === 0x42 && data[1] === 0x4D) mimeType = 'image/bmp';
863
-
864
- const imgData = Buffer.from(data);
865
- const base64 = imgData.toString('base64');
866
- const hash = base64.slice(0, 20); // Use first 20 chars as simple hash
867
- if (!seenHashes.has(hash)) {
868
- seenHashes.add(hash);
869
- objectMap.set(imgIdx++, buildImg(
870
- base64,
871
- mimeType as any,
872
- 0, // w
873
- 0, // h
874
- `Image from ${path}`,
875
- ));
876
- console.log(`[HwpScanner] Added unique image: ${hash}... (${data.length} bytes)`);
877
- } else {
878
- console.log(`[HwpScanner] Duplicate image skipped: ${hash}...`);
879
- }
1088
+ for (let idx = 0; idx < binEntries.length; idx++) {
1089
+ const { data: imgData } = binEntries[idx];
1090
+
1091
+ // Determine MIME type from binary signature first, then fall back to extension
1092
+ let mimeType: ImgNode['mime'] = 'image/jpeg';
1093
+ if (imgData[0] === 0x89 && imgData[1] === 0x50) mimeType = 'image/png';
1094
+ else if (imgData[0] === 0x47 && imgData[1] === 0x49) mimeType = 'image/gif';
1095
+ else if (imgData[0] === 0x42 && imgData[1] === 0x4D) mimeType = 'image/bmp';
1096
+
1097
+ const base64 = TextKit.base64Encode(imgData);
1098
+ const { wPt, hPt } = getImageDimsPt(imgData, mimeType);
1099
+ objectMap.set(idx, buildImg(base64, mimeType, wPt, hPt));
880
1100
  }
881
1101
 
882
- console.log(`[HwpScanner] Found ${imageStreams.length} image streams, ${objectMap.size} unique images`);
1102
+ // gsoCtx tracks sequential 'gso' encounter order — must be shared across all sections
1103
+ const gsoCtx: GsoCtx = { count: 0 };
883
1104
 
884
1105
  // Body sections
885
1106
  const allContent: ContentNode[] = [];
@@ -891,7 +1112,7 @@ export class HwpScanner implements Decoder {
891
1112
  if (s === 0) {
892
1113
  const fb = findBodySection(streams);
893
1114
  if (fb) {
894
- const r = parseBody(fb, compressed, di, shield);
1115
+ const r = parseBody(fb, compressed, di, shield, gsoCtx);
895
1116
  allContent.push(...r.content);
896
1117
  if (r.pageDims) pageDims = r.pageDims;
897
1118
  }
@@ -899,7 +1120,7 @@ export class HwpScanner implements Decoder {
899
1120
  break;
900
1121
  }
901
1122
  const r = shield.guard(
902
- () => parseBody(sec, compressed, di, shield),
1123
+ () => parseBody(sec, compressed, di, shield, gsoCtx),
903
1124
  { content: [], pageDims: undefined },
904
1125
  `hwp:sec${s}`,
905
1126
  );
@@ -907,33 +1128,17 @@ export class HwpScanner implements Decoder {
907
1128
  if (r.pageDims) pageDims = r.pageDims;
908
1129
  }
909
1130
 
910
- // Inject images into paragraphs (only if images are available)
911
- console.log(`[HwpScanner] Before injection: ${allContent.length} nodes, ${objectMap.size} images available`);
912
1131
  if (objectMap.size > 0) {
913
1132
  injectImagesIntoContent(allContent, objectMap);
914
- console.log(`[HwpScanner] After injection: ${allContent.length} nodes`);
915
1133
  }
916
1134
 
917
- // Count images (recursively)
918
- const countImages = (nodes: ContentNode[]): number => {
919
- let count = 0;
920
- for (const node of nodes) {
921
- if ((node as any).tag === 'img') count++;
922
- if ((node as any).tag === 'para' && (node as any).kids) count += countImages((node as any).kids);
923
- if ((node as any).tag === 'grid' && (node as any).kids) {
924
- for (const row of (node as any).kids) {
925
- if (row.kids) count += countImages(row.kids);
926
- }
927
- }
928
- }
929
- return count;
930
- };
931
- const imgCount = countImages(allContent);
932
- console.log(`[HwpScanner] Images in content: ${imgCount}`);
933
-
934
1135
  warns.push(...shield.flush());
935
1136
  const content = allContent.length > 0 ? allContent : [buildPara([buildSpan('')])];
936
- return succeed(buildRoot({}, [buildSheet(content, pageDims)]), warns);
1137
+ // P8: 머리말/꼬리말을 gsoCtx에서 가져와 buildSheet 전달
1138
+ return succeed(buildRoot({}, [buildSheet(content, pageDims, {
1139
+ headers: gsoCtx.headers ? { default: gsoCtx.headers } : undefined,
1140
+ footers: gsoCtx.footers ? { default: gsoCtx.footers } : undefined,
1141
+ })]), warns);
937
1142
  } catch (e: any) {
938
1143
  warns.push(...shield.flush());
939
1144
  return fail(`HWP decode error: ${e?.message ?? String(e)}`, warns);
@@ -947,6 +1152,52 @@ function findBodySection(streams: Map<string, Uint8Array>): Uint8Array | undefin
947
1152
  return undefined;
948
1153
  }
949
1154
 
1155
+ /* ═══════════════════════════════════════════════════════════════
1156
+ Image dimension extraction from binary headers
1157
+ ════════════════════════════════════════════════════════════ */
1158
+
1159
+ // Returns { wPt, hPt } by parsing image headers; falls back to { wPt: 72, hPt: 72 } (1-inch)
1160
+ function getImageDimsPt(data: Uint8Array, mime: string): { wPt: number; hPt: number } {
1161
+ const fallback = { wPt: 72, hPt: 72 };
1162
+ try {
1163
+ if (mime === 'image/png' && data.length >= 24) {
1164
+ // PNG IHDR: sig(8) + length(4) + type(4) + width(4) + height(4) — all big-endian
1165
+ const w = (data[16] << 24 | data[17] << 16 | data[18] << 8 | data[19]) >>> 0;
1166
+ const h = (data[20] << 24 | data[21] << 16 | data[22] << 8 | data[23]) >>> 0;
1167
+ if (w > 0 && h > 0) return { wPt: w * 0.75, hPt: h * 0.75 }; // 96 DPI → pt
1168
+ }
1169
+ if (mime === 'image/jpeg') {
1170
+ // Scan for SOF markers: FF C0 / C1 / C2 / C3
1171
+ let i = 2;
1172
+ while (i + 8 < data.length) {
1173
+ if (data[i] !== 0xFF) { i++; continue; }
1174
+ const marker = data[i + 1];
1175
+ if (marker >= 0xC0 && marker <= 0xC3) {
1176
+ // SOF: 2-byte marker + 2-byte length + 1-byte precision + 2-byte height + 2-byte width
1177
+ const h = (data[i + 5] << 8 | data[i + 6]) >>> 0;
1178
+ const w = (data[i + 7] << 8 | data[i + 8]) >>> 0;
1179
+ if (w > 0 && h > 0) return { wPt: w * 0.75, hPt: h * 0.75 };
1180
+ }
1181
+ const segLen = data[i + 2] << 8 | data[i + 3];
1182
+ i += 2 + (segLen > 0 ? segLen : 2);
1183
+ }
1184
+ }
1185
+ if (mime === 'image/bmp' && data.length >= 26) {
1186
+ // BMP DIB header: width at 18, height at 22 (signed int32 LE; negative = top-down)
1187
+ const w = BinaryKit.readU32LE(data, 18);
1188
+ const h = Math.abs(BinaryKit.readU32LE(data, 22) | 0);
1189
+ if (w > 0 && h > 0) return { wPt: w * 0.75, hPt: h * 0.75 };
1190
+ }
1191
+ if (mime === 'image/gif' && data.length >= 10) {
1192
+ // GIF: width at 6, height at 8 (uint16 LE)
1193
+ const w = data[6] | data[7] << 8;
1194
+ const h = data[8] | data[9] << 8;
1195
+ if (w > 0 && h > 0) return { wPt: w * 0.75, hPt: h * 0.75 };
1196
+ }
1197
+ } catch { /* ignore */ }
1198
+ return fallback;
1199
+ }
1200
+
950
1201
  /* ═══════════════════════════════════════════════════════════════
951
1202
  OLE Object extraction (images)
952
1203
  ════════════════════════════════════════════════════════════ */
@@ -994,35 +1245,69 @@ function injectImagesIntoContent(
994
1245
  content: ContentNode[],
995
1246
  objectMap: Map<number, ImgNode>
996
1247
  ): void {
997
- const imageArray = Array.from(objectMap.values());
998
- if (imageArray.length === 0) return;
1248
+ if (objectMap.size === 0) return;
1249
+
1250
+ // Helper function to process a list of kids (spans, images, etc.)
1251
+ const processKids = (kids: any[]) => {
1252
+ for (let i = 0; i < kids.length; i++) {
1253
+ const kid = kids[i];
1254
+ // Span node structure: { tag: 'span', props, kids: [{ tag: 'txt', content }] }
1255
+ if (kid.tag === 'span' && kid.kids && kid.kids[0]?.tag === 'txt') {
1256
+ const text = kid.kids[0].content;
1257
+ // __EXT_N__ or __EXT_N_W<wPt>_H<hPt>__ (with encoded display size)
1258
+ // N is the objId that matches the index in objectMap
1259
+ const match = text.match?.(/^__(?:IMG|EXT)_(\d+)(?:_W(\d+)_H(\d+))?__$/);
1260
+ if (match) {
1261
+ const objId = parseInt(match[1], 10);
1262
+ const base = objectMap.get(objId);
1263
+ if (base) {
1264
+ const wPt = match[2] ? parseInt(match[2], 10) : 0;
1265
+ const hPt = match[3] ? parseInt(match[3], 10) : 0;
1266
+ // Use encoded display size when valid; otherwise keep pixel-based dims
1267
+ kids[i] = (wPt > 0 && hPt > 0) ? { ...base, w: wPt, h: hPt } : base;
1268
+ }
1269
+ }
1270
+ }
1271
+ }
1272
+ };
999
1273
 
1000
- // Get unique images (deduplicate by base64 content)
1001
- const uniqueImages = Array.from(new Set(imageArray.map(img => img.b64))).map(b64 => {
1002
- return imageArray.find(img => img.b64 === b64)!;
1003
- });
1004
- if (uniqueImages.length === 0) return;
1274
+ // Recursively process a grid (table): resolves image placeholders in all cells,
1275
+ // including nested grids inside cells.
1276
+ const processGridKids = (grid: any) => {
1277
+ if (!grid.kids || !Array.isArray(grid.kids)) return;
1278
+
1279
+ for (const row of grid.kids) {
1280
+ if (!row.kids || !Array.isArray(row.kids)) continue;
1281
+
1282
+ for (const cell of row.kids) {
1283
+ if (!cell.kids || !Array.isArray(cell.kids)) continue;
1284
+
1285
+ for (const cellKid of cell.kids) {
1286
+ if (cellKid.tag === 'grid') {
1287
+ // Nested table inside cell — recurse
1288
+ processGridKids(cellKid);
1289
+ } else if (cellKid.tag === 'para' && cellKid.kids) {
1290
+ processKids(cellKid.kids);
1291
+ }
1292
+ }
1293
+ }
1294
+ }
1295
+ };
1005
1296
 
1006
- let imgIdx = 0;
1007
1297
  for (const node of content) {
1008
1298
  if (node.tag === 'para' && node.kids) {
1009
- for (let i = 0; i < node.kids.length; i++) {
1010
- const kid = node.kids[i];
1011
- // Span node structure: { tag: 'span', props, kids: [{ tag: 'txt', content }] }
1012
- if (kid.tag === 'span' && kid.kids && kid.kids[0]?.tag === 'txt') {
1013
- const text = kid.kids[0].content;
1014
- // Support both __IMG_N__ and __EXT_N__ patterns
1015
- const match = text.match?.(/^__(?:IMG|EXT)_(\d+)__$/);
1016
- if (match) {
1017
- // Replace placeholder with next available image (round-robin)
1018
- const imgNode = uniqueImages[imgIdx % uniqueImages.length];
1019
- if (imgNode) {
1020
- node.kids[i] = imgNode;
1021
- imgIdx++;
1022
- }
1023
- }
1299
+ // Process paragraph kids (spans, images, links, grids)
1300
+ processKids(node.kids);
1301
+
1302
+ // Also process any nested grids inside the paragraph
1303
+ for (const kid of node.kids) {
1304
+ if (kid.tag === 'grid') {
1305
+ processGridKids(kid);
1024
1306
  }
1025
1307
  }
1308
+ } else if (node.tag === 'grid') {
1309
+ // Process grid nodes (tables)
1310
+ processGridKids(node);
1026
1311
  }
1027
1312
  }
1028
1313
  }