hwpkit-dev 0.0.1 → 0.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/ .npmignore +4 -1
  2. package/README.md +39 -2
  3. package/dist/index.d.mts +74 -16
  4. package/dist/index.d.ts +70 -16
  5. package/dist/index.js +4985 -698
  6. package/dist/index.js.map +1 -1
  7. package/dist/index.mjs +4981 -698
  8. package/dist/index.mjs.map +1 -1
  9. package/package.json +4 -1
  10. package/playground/index.html +346 -0
  11. package/playground/main.ts +302 -0
  12. package/playground/vite.config.ts +16 -0
  13. package/src/contract/decoder.ts +1 -0
  14. package/src/contract/encoder.ts +6 -1
  15. package/src/core/BaseDecoder.ts +118 -0
  16. package/src/core/BaseEncoder.ts +146 -0
  17. package/src/decoders/docx/DocxDecoder.ts +867 -150
  18. package/src/decoders/html/HtmlDecoder.ts +366 -0
  19. package/src/decoders/hwp/HwpScanner.ts +477 -88
  20. package/src/decoders/hwpx/HwpxDecoder.ts +789 -293
  21. package/src/decoders/md/MdDecoder.ts +4 -4
  22. package/src/encoders/docx/DocxEncoder.ts +600 -295
  23. package/src/encoders/html/HtmlEncoder.ts +203 -0
  24. package/src/encoders/hwp/HwpEncoder.ts +1647 -398
  25. package/src/encoders/hwpx/HwpxEncoder.ts +1512 -444
  26. package/src/encoders/hwpx/constants.ts +148 -0
  27. package/src/encoders/hwpx/utils.ts +198 -0
  28. package/src/encoders/md/MdEncoder.ts +117 -30
  29. package/src/index.ts +1 -0
  30. package/src/model/builders.ts +8 -6
  31. package/src/model/doc-props.ts +19 -5
  32. package/src/model/doc-tree.ts +13 -5
  33. package/src/pipeline/Pipeline.ts +21 -4
  34. package/src/pipeline/registry.ts +13 -2
  35. package/src/safety/StyleBridge.ts +52 -7
  36. package/src/toolkit/ArchiveKit.ts +56 -0
  37. package/src/toolkit/StyleMapper.ts +221 -0
  38. package/src/toolkit/UnitConverter.ts +138 -0
  39. package/src/toolkit/XmlKit.ts +0 -5
  40. package/test-styling.ts +210 -0
@@ -1,11 +1,12 @@
1
1
  import type { Decoder } from '../../contract/decoder';
2
- import type { DocRoot, ContentNode, ParaNode, SpanNode } from '../../model/doc-tree';
2
+ import type { DocRoot, ContentNode, ParaNode, SpanNode, ImgNode, GridNode } from '../../model/doc-tree';
3
3
  import type { Outcome } from '../../contract/result';
4
4
  import type { Align, Stroke, StrokeKind, PageDims, TextProps, ParaProps, CellProps, GridProps } from '../../model/doc-props';
5
5
  import { succeed, fail } from '../../contract/result';
6
- import { buildRoot, buildSheet, buildPara, buildSpan, buildGrid, buildRow, buildCell } from '../../model/builders';
6
+ import { buildRoot, buildSheet, buildPara, buildSpan, buildGrid, buildRow, buildCell, buildImg } from '../../model/builders';
7
7
  import { ShieldedParser } from '../../safety/ShieldedParser';
8
8
  import { BinaryKit } from '../../toolkit/BinaryKit';
9
+ import { TextKit } from '../../toolkit/TextKit';
9
10
  import { Metric, safeHex, safeFont } from '../../safety/StyleBridge';
10
11
  import { registry } from '../../pipeline/registry';
11
12
  import { A4 } from '../../model/doc-props';
@@ -37,8 +38,12 @@ const TAG_CELL_B = HWPTAG_BEGIN + 65; // 81
37
38
  function isTableTag(t: number) { return t === TAG_TABLE_A || t === TAG_TABLE_B; }
38
39
  function isCellTag(t: number) { return t === TAG_CELL_A || t === TAG_CELL_B || t === TAG_LIST_HEADER; }
39
40
 
40
- // CTRL_HEADER ctrlId for table: ' lbt' as UINT32-LE = 0x74626C20
41
- const CTRL_TABLE = 0x74626C20;
41
+ // CTRL_HEADER ctrlId values (UINT32-LE as ASCII)
42
+ const CTRL_TABLE = 0x74626C20; // ' lbt' = 표(table)
43
+ const CTRL_IMAGE = 0x696D6720; // 'img '
44
+ const CTRL_OBJ = 0x6F626A20; // 'obj '
45
+ const CTRL_FIG = 0x66696720; // 'fig '
46
+ const CTRL_GSO = 0x67736F20; // 'gso ' = 그리기 객체 (drawing object, contains embedded images)
42
47
 
43
48
  /* ═══════════════════════════════════════════════════════════════
44
49
  Types
@@ -67,6 +72,7 @@ interface HwpParaShape {
67
72
  spaceBefore: number;
68
73
  spaceAfter: number;
69
74
  lineSpacing: number;
75
+ leftMargin: number;
70
76
  indent: number;
71
77
  }
72
78
 
@@ -83,7 +89,14 @@ interface DocInfo {
83
89
  }
84
90
 
85
91
  interface ParsedChar { pos: number; ch: string }
86
- interface ParaTextResult { chars: ParsedChar[]; controlPositions: number[] }
92
+ interface ParsedCtrl { pos: number; ctrlId: number; objId: number; matched: boolean }
93
+ interface ParaTextResult { chars: ParsedChar[]; controls: ParsedCtrl[] }
94
+
95
+ interface OleObject {
96
+ id: number;
97
+ data: Uint8Array;
98
+ mimeType: string;
99
+ }
87
100
 
88
101
  /* ═══════════════════════════════════════════════════════════════
89
102
  Low-level record parsing
@@ -111,7 +124,9 @@ function parseRecords(data: Uint8Array): HwpRecord[] {
111
124
  }
112
125
 
113
126
  function tryInflate(data: Uint8Array): Uint8Array {
114
- try { return pako.inflateRaw(data); } catch { return data; }
127
+ try { return pako.inflate(data); } catch {
128
+ try { return pako.inflateRaw(data); } catch { return data; }
129
+ }
115
130
  }
116
131
 
117
132
  /* ═══════════════════════════════════════════════════════════════
@@ -209,11 +224,12 @@ function parseCharShape(d: Uint8Array): HwpCharShape {
209
224
  const ALIGN_TBL: Record<number, Align> = { 0: 'justify', 1: 'left', 2: 'right', 3: 'center', 4: 'justify' };
210
225
 
211
226
  function parseParaShape(d: Uint8Array): HwpParaShape {
212
- if (d.length < 4) return { align: 'left', spaceBefore: 0, spaceAfter: 0, lineSpacing: 160, indent: 0 };
227
+ if (d.length < 4) return { align: 'left', spaceBefore: 0, spaceAfter: 0, lineSpacing: 160, leftMargin: 0, indent: 0 };
213
228
  const attr = BinaryKit.readU32LE(d, 0);
214
229
  return {
215
- align: ALIGN_TBL[attr & 0x7] ?? 'left',
216
- indent: d.length >= 16 ? i32(d, 12) : 0,
230
+ align: ALIGN_TBL[(attr >> 2) & 0x7] ?? 'left',
231
+ leftMargin: d.length >= 8 ? i32(d, 4) : 0, // offset 4: leftMargin (paragraph-body left margin, not the first-line indent)
232
+ indent: d.length >= 16 ? i32(d, 12) : 0, // offset 12: first-line indent
217
233
  spaceBefore: d.length >= 20 ? i32(d, 16) : 0,
218
234
  spaceAfter: d.length >= 24 ? i32(d, 20) : 0,
219
235
  lineSpacing: d.length >= 28 ? i32(d, 24) : 160,
@@ -228,20 +244,30 @@ function parseParaShape(d: Uint8Array): HwpParaShape {
228
244
  [36:4] faceColor (bgColor for solid fill) */
229
245
 
230
246
  const BORDER_W_PT = [0.28, 0.34, 0.43, 0.57, 0.71, 0.85, 1.13, 1.42, 1.70, 1.98, 2.84, 4.25, 5.67, 8.50, 11.34, 14.17];
231
- const BORDER_KIND: Record<number, StrokeKind> = { 0:'none',1:'solid',2:'dash',3:'dot',4:'dash',5:'dash',6:'dash',7:'dot',8:'double',9:'double',10:'double' };
247
+ const BORDER_KIND: Record<number, StrokeKind> = { 0:'solid',1:'dash',2:'dash',3:'dot',4:'dash',5:'dash',6:'dash',7:'double',8:'double',9:'double',10:'none' };
232
248
 
233
249
  function parseBorderFill(d: Uint8Array): HwpBorderFill {
250
+ // Spec grouped format (표 23):
251
+ // [0:2] attr
252
+ // [2:4] 4 border types (left, right, top, bottom) — 1 byte each
253
+ // [6:4] 4 border widths (left, right, top, bottom) — 1 byte each (index into BORDER_W_PT)
254
+ // [10:16] 4 border colors (left, right, top, bottom) — 4 bytes each (COLORREF)
255
+ // [26:6] diagonal: type(1) + width(1) + color(4) = 6 bytes
256
+ // [32:4] fillType
257
+ // [36:4] faceColor (bgColor for solid fill)
234
258
  const borders: HwpBorderFill['borders'] = [];
259
+ const BASE_TYPE = 2; // 4 type bytes
260
+ const BASE_WIDTH = 6; // 4 width bytes
261
+ const BASE_COLOR = 10; // 4 × 4-byte colors
235
262
  for (let i = 0; i < 4; i++) {
236
- const b = 2 + i * 6;
237
- if (b + 6 <= d.length) {
238
- borders.push({ type: d[b], widthPt: BORDER_W_PT[d[b + 1]] ?? 0.5, color: colorRef(d, b + 2) });
239
- } else {
240
- borders.push({ type: 0, widthPt: 0.5, color: '000000' });
241
- }
263
+ const type = BASE_TYPE + i < d.length ? d[BASE_TYPE + i] : 0;
264
+ const widthPt = BASE_WIDTH + i < d.length ? (BORDER_W_PT[d[BASE_WIDTH + i]] ?? 0.5) : 0.5;
265
+ const color = BASE_COLOR + i * 4 + 4 <= d.length ? colorRef(d, BASE_COLOR + i * 4) : '000000';
266
+ borders.push({ type, widthPt, color });
242
267
  }
243
268
  let bgColor: string | undefined;
244
- const fOff = 32; // after attr(2) + 5 borders(30)
269
+ // after attr(2) + 4 types(4) + 4 widths(4) + 4 colors(16) + diagonal(6) = offset 32
270
+ const fOff = 32;
245
271
  if (d.length >= fOff + 8) {
246
272
  const ft = BinaryKit.readU32LE(d, fOff);
247
273
  if (ft & 1) bgColor = colorRef(d, fOff + 4);
@@ -253,21 +279,33 @@ function parseBorderFill(d: Uint8Array): HwpBorderFill {
253
279
  Body section parsing
254
280
  ═══════════════════════════════════════════════════════════════ */
255
281
 
282
+ // gsoCtx: shared mutable counter for 'gso ' drawing objects.
283
+ // Each 'gso ' CTRL_HEADER encountered increments this counter.
284
+ // objectMap is keyed by 0-based gso order = sequential BinData insertion order.
285
+ interface GsoCtx { count: number }
286
+
256
287
  function parseBody(
257
- raw: Uint8Array, compressed: boolean, di: DocInfo, shield: ShieldedParser,
288
+ raw: Uint8Array, compressed: boolean, di: DocInfo, shield: ShieldedParser, gsoCtx: GsoCtx,
258
289
  ): { content: ContentNode[]; pageDims?: PageDims } {
259
290
  const recs = parseRecords(compressed ? tryInflate(raw) : raw);
260
291
  const content: ContentNode[] = [];
261
292
  let pageDims: PageDims | undefined;
262
- let i = 0;
263
293
 
294
+ // Pre-scan for PAGE_DEF at any nesting level (real HWP stores it at level 2 inside section ctrl)
295
+ for (const r of recs) {
296
+ if (r.tag === TAG_PAGE_DEF) {
297
+ pageDims = shield.guard(() => parsePageDef(r.data), A4, 'hwp:pageDef');
298
+ break;
299
+ }
300
+ }
301
+
302
+ let i = 0;
264
303
  while (i < recs.length) {
265
304
  if (recs[i].tag === TAG_PAGE_DEF) {
266
- pageDims = shield.guard(() => parsePageDef(recs[i].data), A4, 'hwp:pageDef');
267
- i++;
305
+ i++; // already handled above; skip at top level
268
306
  } else if (recs[i].tag === TAG_PARA_HEADER) {
269
307
  const r = shield.guard(
270
- () => parseParagraphGroup(recs, i, di, shield),
308
+ () => parseParagraphGroup(recs, i, di, shield, gsoCtx),
271
309
  { nodes: [] as ContentNode[], next: i + 1 },
272
310
  `hwp:para@${i}`,
273
311
  );
@@ -283,7 +321,7 @@ function parseBody(
283
321
  /* ── Paragraph group ────────────────────────────────────────── */
284
322
 
285
323
  function parseParagraphGroup(
286
- recs: HwpRecord[], start: number, di: DocInfo, shield: ShieldedParser,
324
+ recs: HwpRecord[], start: number, di: DocInfo, shield: ShieldedParser, gsoCtx: GsoCtx,
287
325
  ): { nodes: ContentNode[]; next: number } {
288
326
  const hdr = recs[start];
289
327
  const lv = hdr.level;
@@ -295,6 +333,8 @@ function parseParagraphGroup(
295
333
  let text: ParaTextResult | null = null;
296
334
  let csPairs: [number, number][] = [];
297
335
  const grids: ContentNode[] = [];
336
+ // imgId: for 'gso' uses sequential gsoCtx.count; for others uses flags-based objId
337
+ const ctrlHeaders: { ctrlId: number; imgId: number; wPt: number; hPt: number }[] = [];
298
338
  let i = start + 1;
299
339
 
300
340
  while (i < recs.length && recs[i].level > lv) {
@@ -307,14 +347,33 @@ function parseParagraphGroup(
307
347
  csPairs = parseCharShapePairs(r.data);
308
348
  i++;
309
349
  } else if (r.tag === TAG_CTRL_HEADER && r.level === lv + 1) {
310
- if (r.data.length >= 4 && BinaryKit.readU32LE(r.data, 0) === CTRL_TABLE) {
311
- const tr = shield.guard(
312
- () => parseTableCtrl(recs, i, di, shield),
313
- { grid: null, next: skipKids(recs, i) },
314
- `hwp:tbl@${i}`,
315
- );
316
- if (tr.grid) grids.push(tr.grid);
317
- i = tr.next;
350
+ if (r.data.length >= 4) {
351
+ const ctrlId = BinaryKit.readU32LE(r.data, 0);
352
+
353
+ // HWP 5.0 general-object layout:
354
+ // [0:4] ctrlId [4:4] flags [8:4] xOff [12:4] yOff
355
+ // [16:4] width(HWPUNIT) [20:4] height(HWPUNIT)
356
+ const MAX_HWP = 1_000_000;
357
+ const rawW = r.data.length >= 24 ? BinaryKit.readU32LE(r.data, 16) : 0;
358
+ const rawH = r.data.length >= 28 ? BinaryKit.readU32LE(r.data, 20) : 0;
359
+ const wPt = rawW > 0 && rawW < MAX_HWP ? Metric.hwpToPt(rawW) : 0;
360
+ const hPt = rawH > 0 && rawH < MAX_HWP ? Metric.hwpToPt(rawH) : 0;
361
+
362
+ // 'gso ' (그리기 객체) uses sequential counter; others use flags-based id
363
+ const imgId = ctrlId === CTRL_GSO ? gsoCtx.count++ : (r.data.length >= 6 ? BinaryKit.readU16LE(r.data, 4) : 0);
364
+ ctrlHeaders.push({ ctrlId, imgId, wPt, hPt });
365
+
366
+ if (ctrlId === CTRL_TABLE) {
367
+ const tr = shield.guard(
368
+ () => parseTableCtrl(recs, i, di, shield, gsoCtx),
369
+ { grid: null, next: skipKids(recs, i) },
370
+ `hwp:tbl@${i}`,
371
+ );
372
+ if (tr.grid) grids.push(tr.grid);
373
+ i = tr.next;
374
+ } else {
375
+ i = skipKids(recs, i);
376
+ }
318
377
  } else {
319
378
  i = skipKids(recs, i);
320
379
  }
@@ -325,12 +384,32 @@ function parseParagraphGroup(
325
384
 
326
385
  const nodes: ContentNode[] = [];
327
386
 
328
- // Build paragraph from text
329
- if (text && text.chars.length > 0) {
330
- const joined = text.chars.map(c => c.ch).join('');
331
- if (joined.trim()) {
387
+ // Build paragraph from text and inline controls (images)
388
+ if (text && (text.chars.length > 0 || text.controls.length > 0)) {
389
+ const paraContent: (SpanNode | ContentNode)[] = [];
390
+
391
+ if (text.chars.length > 0) {
332
392
  const spans = resolveCharShapes(text.chars, csPairs, di);
333
- nodes.push(buildPara(spans, buildParaProps(ps)));
393
+ paraContent.push(...spans);
394
+ }
395
+
396
+ // Image placeholder spans: only for actual image controls.
397
+ // Non-image controls (footnotes, TOC entries, etc.) are silently skipped.
398
+ if (text.controls.length > 0) {
399
+ for (let ci = 0; ci < text.controls.length; ci++) {
400
+ const ch = ctrlHeaders[ci];
401
+ if (!ch) continue; // anchor-only ctrl (gso is sibling, not inline)
402
+ const isImg = ch.ctrlId === CTRL_IMAGE || ch.ctrlId === CTRL_FIG || ch.ctrlId === CTRL_OBJ || ch.ctrlId === CTRL_GSO;
403
+ if (!isImg) continue; // skip footnotes, TOC, page num, etc.
404
+ const dimStr = (ch.wPt > 0 && ch.hPt > 0)
405
+ ? `_W${Math.round(ch.wPt)}_H${Math.round(ch.hPt)}`
406
+ : '';
407
+ paraContent.push(buildSpan(`__EXT_${ch.imgId}${dimStr}__`));
408
+ }
409
+ }
410
+
411
+ if (paraContent.length > 0) {
412
+ nodes.push(buildPara(paraContent as any, buildParaProps(ps)));
334
413
  }
335
414
  }
336
415
 
@@ -354,7 +433,7 @@ const INL_CTRL = new Set([4, 5, 6, 7, 8]);
354
433
 
355
434
  function decodeParaText(d: Uint8Array): ParaTextResult {
356
435
  const chars: ParsedChar[] = [];
357
- const controlPositions: number[] = [];
436
+ const controls: ParsedCtrl[] = [];
358
437
  let i = 0, pos = 0;
359
438
 
360
439
  while (i + 1 < d.length) {
@@ -364,8 +443,14 @@ function decodeParaText(d: Uint8Array): ParaTextResult {
364
443
  if (c === 10) { chars.push({ pos, ch: '\n' }); i += 2; pos++; continue; }
365
444
 
366
445
  if (EXT_CTRL.has(c)) {
367
- controlPositions.push(pos);
368
- i += 16; pos += 8; continue; // 8 WORDs
446
+ // Extended control: 8 WORDs (16 bytes)
447
+ // WORD 4 contains objId (for images, charts, etc.)
448
+ let objId = 0;
449
+ if (i + 16 <= d.length) {
450
+ objId = BinaryKit.readU16LE(d, i + 8); // WORD 4 (0-based; byte offset 8) contains objId
451
+ }
452
+ controls.push({ pos, ctrlId: 0, objId, matched: false });
453
+ i += 16; pos += 8; continue;
369
454
  }
370
455
  if (INL_CTRL.has(c)) {
371
456
  i += 16; pos += 8; continue;
@@ -379,7 +464,7 @@ function decodeParaText(d: Uint8Array): ParaTextResult {
379
464
  chars.push({ pos, ch: String.fromCharCode(c) });
380
465
  i += 2; pos++;
381
466
  }
382
- return { chars, controlPositions };
467
+ return { chars, controls };
383
468
  }
384
469
 
385
470
  /* ── PARA_CHAR_SHAPE ────────────────────────────────────────── */
@@ -441,7 +526,7 @@ function styledSpan(text: string, shapeId: number, di: DocInfo): SpanNode {
441
526
  /* ── Table control parsing ──────────────────────────────────── */
442
527
 
443
528
  function parseTableCtrl(
444
- recs: HwpRecord[], ctrlIdx: number, di: DocInfo, shield: ShieldedParser,
529
+ recs: HwpRecord[], ctrlIdx: number, di: DocInfo, shield: ShieldedParser, gsoCtx: GsoCtx,
445
530
  ): { grid: ContentNode | null; next: number } {
446
531
  const ctrlLv = recs[ctrlIdx].level;
447
532
  let i = ctrlIdx + 1;
@@ -497,15 +582,15 @@ function parseTableCtrl(
497
582
  const rowCnt = tblData.length >= 6 ? BinaryKit.readU16LE(tblData, 4) : 1;
498
583
  const colCnt = tblData.length >= 8 ? BinaryKit.readU16LE(tblData, 6) : 1;
499
584
 
500
- interface PC { row: number; col: number; cs: number; rs: number; widthHwp: number; props: CellProps; paras: ParaNode[] }
585
+ interface PC { row: number; col: number; cs: number; rs: number; widthHwp: number; heightHwp?: number; props: CellProps; cellChildren: (ParaNode | GridNode)[] }
501
586
  const parsed: PC[] = [];
502
587
 
503
588
  for (let ci = 0; ci < cells.length; ci++) {
504
589
  const c = cells[ci];
505
590
  const seqIdx = ci;
506
591
  const pc = shield.guard(
507
- () => parseCellRec(c.data, c.tag, recs, c.cStart, c.cEnd, di, shield, seqIdx, colCnt),
508
- { row: Math.floor(ci / (colCnt || 1)), col: ci % (colCnt || 1), cs: 1, rs: 1, widthHwp: 0, props: {}, paras: [buildPara([buildSpan('')])] },
592
+ () => parseCellRec(c.data, c.tag, recs, c.cStart, c.cEnd, di, shield, seqIdx, colCnt, gsoCtx),
593
+ { row: Math.floor(ci / (colCnt || 1)), col: ci % (colCnt || 1), cs: 1, rs: 1, widthHwp: 0, heightHwp: undefined, props: {}, cellChildren: [buildPara([buildSpan('')])] },
509
594
  `hwp:cell@${c.cStart}`,
510
595
  );
511
596
  parsed.push(pc);
@@ -532,9 +617,11 @@ function parseTableCtrl(
532
617
  }
533
618
  }
534
619
  // Pass 2: for columns still 0, try to derive from multi-span cells
620
+ // Sort by span size ascending so smaller, more precise spans fill widths before larger spans
535
621
  const zeroColumns = colWidthsPt.filter(w => w === 0).length;
536
622
  if (zeroColumns > 0) {
537
- for (const c of parsed) {
623
+ const spanCells = parsed.filter(c => c.cs > 1 && c.widthHwp > 0).sort((a, b) => a.cs - b.cs);
624
+ for (const c of spanCells) {
538
625
  if (c.cs > 1 && c.widthHwp > 0) {
539
626
  // Subtract known column widths from the span
540
627
  let known = 0;
@@ -554,13 +641,37 @@ function parseTableCtrl(
554
641
  }
555
642
  }
556
643
 
644
+ // Post-process: clamp near-zero column widths (< 1pt = floating-point artifact) to minimum 1pt
645
+ for (let i = 0; i < colWidthsPt.length; i++) {
646
+ if (colWidthsPt[i] > 0 && colWidthsPt[i] < 1) colWidthsPt[i] = 1;
647
+ }
648
+
557
649
  const rows = [];
558
650
  for (let r = 0; r < actualRowCnt; r++) {
559
651
  const rc = parsed.filter(c => c.row === r).sort((a, b) => a.col - b.col);
560
652
  if (rc.length === 0) continue;
561
- rows.push(buildRow(rc.map(c =>
562
- buildCell(c.paras.length ? c.paras : [buildPara([buildSpan('')])], { cs: c.cs, rs: c.rs, props: c.props }),
563
- )));
653
+
654
+ // Calculate row height — prefer rs=1 cells (exact per-row height)
655
+ let rowHeightPt: number | undefined = undefined;
656
+ for (const c of rc) {
657
+ if (c.heightHwp && c.heightHwp > 0 && c.rs === 1) {
658
+ const hPt = Metric.hwpToPt(c.heightHwp);
659
+ if (rowHeightPt == null || hPt > rowHeightPt) rowHeightPt = hPt;
660
+ }
661
+ }
662
+ // Fallback: all cells span multiple rows → approximate height per row
663
+ if (rowHeightPt == null) {
664
+ for (const c of rc) {
665
+ if (c.heightHwp && c.heightHwp > 0) {
666
+ const hPt = Metric.hwpToPt(c.heightHwp) / c.rs;
667
+ if (rowHeightPt == null || hPt > rowHeightPt) rowHeightPt = hPt;
668
+ }
669
+ }
670
+ }
671
+
672
+ rows.push(buildRow(rc.map(c => {
673
+ return buildCell(c.cellChildren, { cs: c.cs, rs: c.rs, props: c.props });
674
+ }), rowHeightPt));
564
675
  }
565
676
  if (rows.length === 0) return { grid: null, next: i };
566
677
 
@@ -589,10 +700,11 @@ function parseTableCtrl(
589
700
 
590
701
  function parseCellRec(
591
702
  d: Uint8Array, tag: number, recs: HwpRecord[], cStart: number, cEnd: number,
592
- di: DocInfo, shield: ShieldedParser, seqIdx: number, colCnt: number,
703
+ di: DocInfo, shield: ShieldedParser, seqIdx: number, colCnt: number, gsoCtx: GsoCtx,
593
704
  ) {
594
705
  let col: number, row: number, cs = 1, rs = 1;
595
706
  let widthHwp = 0;
707
+ let heightHwp = 0;
596
708
  const props: CellProps = {};
597
709
 
598
710
  const attr = d.length >= 6 ? BinaryKit.readU32LE(d, 2) : 0;
@@ -600,57 +712,55 @@ function parseCellRec(
600
712
  if (va === 1) props.va = 'mid';
601
713
  else if (va === 2) props.va = 'bot';
602
714
 
715
+ const HWP_PAD_LR_DEFAULT = 360;
716
+ const HWP_PAD_TB_DEFAULT = 141;
717
+
603
718
  if (tag === TAG_LIST_HEADER && d.length >= 22) {
604
- // LIST_HEADER with cell-specific fields
605
- // offset 8: colAddr, offset 10: rowAddr (HWP 5.0 spec)
606
719
  col = BinaryKit.readU16LE(d, 8);
607
720
  row = BinaryKit.readU16LE(d, 10);
608
- rs = Math.max(1, BinaryKit.readU16LE(d, 12));
609
- cs = Math.max(1, BinaryKit.readU16LE(d, 14));
721
+ cs = Math.max(1, BinaryKit.readU16LE(d, 12));
722
+ rs = Math.max(1, BinaryKit.readU16LE(d, 14));
610
723
  widthHwp = BinaryKit.readU32LE(d, 16);
611
-
612
- const bfId = d.length >= 34 ? BinaryKit.readU16LE(d, 32) : 0;
613
- if (bfId > 0 && bfId <= di.borderFills.length) {
614
- const bf = di.borderFills[bfId - 1];
615
- if (bf.borders.length >= 4) {
616
- props.left = toStroke(bf.borders[0]);
617
- props.right = toStroke(bf.borders[1]);
618
- props.top = toStroke(bf.borders[2]);
619
- props.bot = toStroke(bf.borders[3]);
620
- }
621
- if (bf.bgColor && bf.bgColor !== 'FFFFFF') props.bg = bf.bgColor;
724
+ heightHwp = d.length >= 24 ? BinaryKit.readU32LE(d, 20) : 0;
725
+ if (d.length >= 32) {
726
+ const pL = BinaryKit.readU16LE(d, 24); const pR = BinaryKit.readU16LE(d, 26);
727
+ const pT = BinaryKit.readU16LE(d, 28); const pB = BinaryKit.readU16LE(d, 30);
728
+ if (pL !== HWP_PAD_LR_DEFAULT) props.padL = Metric.hwpToPt(pL);
729
+ if (pR !== HWP_PAD_LR_DEFAULT) props.padR = Metric.hwpToPt(pR);
730
+ if (pT !== HWP_PAD_TB_DEFAULT) props.padT = Metric.hwpToPt(pT);
731
+ if (pB !== HWP_PAD_TB_DEFAULT) props.padB = Metric.hwpToPt(pB);
622
732
  }
733
+ const bfId = d.length >= 34 ? BinaryKit.readU16LE(d, 32) : 0;
734
+ if (bfId > 0 && bfId <= di.borderFills.length) applyCellBorderFill(di.borderFills[bfId - 1], props);
623
735
  } else if (tag !== TAG_LIST_HEADER) {
624
- // Full CELL record with position/span/borderFill
625
736
  col = d.length >= 8 ? BinaryKit.readU16LE(d, 6) : seqIdx % (colCnt || 1);
626
737
  row = d.length >= 10 ? BinaryKit.readU16LE(d, 8) : Math.floor(seqIdx / (colCnt || 1));
627
738
  cs = d.length >= 12 ? Math.max(1, BinaryKit.readU16LE(d, 10)) : 1;
628
739
  rs = d.length >= 14 ? Math.max(1, BinaryKit.readU16LE(d, 12)) : 1;
629
740
  widthHwp = d.length >= 18 ? BinaryKit.readU32LE(d, 14) : 0;
630
-
631
- const bfId = d.length >= 32 ? BinaryKit.readU16LE(d, 30) : 0;
632
- if (bfId > 0 && bfId <= di.borderFills.length) {
633
- const bf = di.borderFills[bfId - 1];
634
- if (bf.borders.length >= 4) {
635
- props.left = toStroke(bf.borders[0]);
636
- props.right = toStroke(bf.borders[1]);
637
- props.top = toStroke(bf.borders[2]);
638
- props.bot = toStroke(bf.borders[3]);
639
- }
640
- if (bf.bgColor && bf.bgColor !== 'FFFFFF') props.bg = bf.bgColor;
741
+ heightHwp = d.length >= 22 ? BinaryKit.readU32LE(d, 18) : 0;
742
+ if (d.length >= 30) {
743
+ const pL = BinaryKit.readU16LE(d, 22); const pR = BinaryKit.readU16LE(d, 24);
744
+ const pT = BinaryKit.readU16LE(d, 26); const pB = BinaryKit.readU16LE(d, 28);
745
+ if (pL !== HWP_PAD_LR_DEFAULT) props.padL = Metric.hwpToPt(pL);
746
+ if (pR !== HWP_PAD_LR_DEFAULT) props.padR = Metric.hwpToPt(pR);
747
+ if (pT !== HWP_PAD_TB_DEFAULT) props.padT = Metric.hwpToPt(pT);
748
+ if (pB !== HWP_PAD_TB_DEFAULT) props.padB = Metric.hwpToPt(pB);
641
749
  }
750
+ const bfId = d.length >= 32 ? BinaryKit.readU16LE(d, 30) : 0;
751
+ if (bfId > 0 && bfId <= di.borderFills.length) applyCellBorderFill(di.borderFills[bfId - 1], props);
642
752
  } else {
643
- // Fallback: LIST_HEADER too short, compute sequentially
644
753
  row = Math.floor(seqIdx / (colCnt || 1));
645
754
  col = seqIdx % (colCnt || 1);
646
755
  }
647
756
 
648
- // Parse cell content paragraphs
649
- const paras: ParaNode[] = [];
757
+ const cellChildren: (ParaNode | GridNode)[] = [];
758
+ const MAX_HWP = 1_000_000;
650
759
  let k = cStart;
760
+
651
761
  while (k < cEnd) {
652
762
  if (recs[k].tag === TAG_PARA_HEADER) {
653
- // For cell paragraphs, they might be at various nesting levels
763
+ // Parse paragraph inside cell — also extracts nested tables within the paragraph
654
764
  const r = shield.guard(
655
765
  () => {
656
766
  const hdr = recs[k];
@@ -659,24 +769,91 @@ function parseCellRec(
659
769
  const ps = di.paraShapes[psId];
660
770
  let txt: ParaTextResult | null = null;
661
771
  let csp: [number, number][] = [];
772
+ const ctrlHdrs: { ctrlId: number; imgId: number; wPt: number; hPt: number }[] = [];
773
+ const innerGrids: GridNode[] = [];
662
774
  let j = k + 1;
663
775
  while (j < cEnd && recs[j].level > lv) {
664
776
  if (recs[j].tag === TAG_PARA_TEXT) { txt = decodeParaText(recs[j].data); j++; }
665
777
  else if (recs[j].tag === TAG_PARA_CHAR_SHAPE) { csp = parseCharShapePairs(recs[j].data); j++; }
778
+ else if (recs[j].tag === TAG_CTRL_HEADER && recs[j].level === lv + 1) {
779
+ if (recs[j].data.length >= 4) {
780
+ const ctrlId = BinaryKit.readU32LE(recs[j].data, 0);
781
+ if (ctrlId === CTRL_TABLE) {
782
+ // Nested table inside a cell paragraph — recurse into parseTableCtrl
783
+ const nestedTr = shield.guard(
784
+ () => parseTableCtrl(recs, j, di, shield, gsoCtx),
785
+ { grid: null, next: skipKids(recs, j) },
786
+ `hwp:innerNestedTbl@${j}`,
787
+ );
788
+ if (nestedTr.grid) innerGrids.push(nestedTr.grid as GridNode);
789
+ j = nestedTr.next;
790
+ } else {
791
+ const rawW = recs[j].data.length >= 24 ? BinaryKit.readU32LE(recs[j].data, 16) : 0;
792
+ const rawH = recs[j].data.length >= 28 ? BinaryKit.readU32LE(recs[j].data, 20) : 0;
793
+ const wPt = rawW > 0 && rawW < MAX_HWP ? Metric.hwpToPt(rawW) : 0;
794
+ const hPt = rawH > 0 && rawH < MAX_HWP ? Metric.hwpToPt(rawH) : 0;
795
+ const imgId = ctrlId === CTRL_GSO ? gsoCtx.count++ : (recs[j].data.length >= 6 ? BinaryKit.readU16LE(recs[j].data, 4) : 0);
796
+ ctrlHdrs.push({ ctrlId, imgId, wPt, hPt });
797
+ j = skipKids(recs, j);
798
+ }
799
+ } else {
800
+ j = skipKids(recs, j);
801
+ }
802
+ }
666
803
  else j++;
667
804
  }
668
- const spans = txt && txt.chars.length > 0 ? resolveCharShapes(txt.chars, csp, di) : [buildSpan('')];
669
- return { para: buildPara(spans, buildParaProps(ps)), next: j };
805
+ const paraContent: (SpanNode | ContentNode)[] = [];
806
+ if (txt && txt.chars.length > 0) paraContent.push(...resolveCharShapes(txt.chars, csp, di));
807
+ if (txt && txt.controls.length > 0) {
808
+ for (let ci = 0; ci < txt.controls.length; ci++) {
809
+ const ch = ctrlHdrs[ci];
810
+ if (!ch) continue;
811
+ const isImg = ch.ctrlId === CTRL_IMAGE || ch.ctrlId === CTRL_FIG || ch.ctrlId === CTRL_OBJ || ch.ctrlId === CTRL_GSO;
812
+ if (!isImg) continue;
813
+ const dimStr = (ch.wPt > 0 && ch.hPt > 0) ? `_W${Math.round(ch.wPt)}_H${Math.round(ch.hPt)}` : '';
814
+ paraContent.push(buildSpan(`__EXT_${ch.imgId}${dimStr}__`));
815
+ }
816
+ }
817
+ const kids = paraContent.length > 0 ? paraContent as any : [buildSpan('')];
818
+ const items: (ParaNode | GridNode)[] = [buildPara(kids, buildParaProps(ps)), ...innerGrids];
819
+ return { items, next: j };
670
820
  },
671
- { para: buildPara([buildSpan('')]), next: k + 1 },
821
+ { items: [buildPara([buildSpan('')])] as (ParaNode | GridNode)[], next: k + 1 },
672
822
  `hwp:cellP@${k}`,
673
823
  );
674
- paras.push(r.para);
824
+ cellChildren.push(...r.items);
675
825
  k = r.next;
826
+ } else if (recs[k].tag === TAG_CTRL_HEADER && recs[k].data.length >= 4) {
827
+ // CTRL_HEADER at cell level (sibling of PARA_HEADER) — anchored 'gso' images and outer-level nested tables
828
+ const cellCtrlId = BinaryKit.readU32LE(recs[k].data, 0);
829
+ if (cellCtrlId === CTRL_GSO) {
830
+ const gsoId = gsoCtx.count++;
831
+ const rawW = recs[k].data.length >= 24 ? BinaryKit.readU32LE(recs[k].data, 16) : 0;
832
+ const rawH = recs[k].data.length >= 28 ? BinaryKit.readU32LE(recs[k].data, 20) : 0;
833
+ const wPt = rawW > 0 && rawW < MAX_HWP ? Metric.hwpToPt(rawW) : 0;
834
+ const hPt = rawH > 0 && rawH < MAX_HWP ? Metric.hwpToPt(rawH) : 0;
835
+ const dimStr = (wPt > 0 && hPt > 0) ? `_W${Math.round(wPt)}_H${Math.round(hPt)}` : '';
836
+ cellChildren.push(buildPara([buildSpan(`__EXT_${gsoId}${dimStr}__`)]));
837
+ k = skipKids(recs, k);
838
+ } else if (cellCtrlId === CTRL_TABLE) {
839
+ const tr = shield.guard(
840
+ () => parseTableCtrl(recs, k, di, shield, gsoCtx),
841
+ { grid: null, next: skipKids(recs, k) },
842
+ `hwp:nestedTbl@${k}`,
843
+ );
844
+ if (tr.grid) cellChildren.push(tr.grid as GridNode);
845
+ k = tr.next;
846
+ } else {
847
+ k = skipKids(recs, k);
848
+ }
676
849
  } else { k++; }
677
850
  }
678
851
 
679
- return { row, col, cs, rs, props, widthHwp, paras: paras.length ? paras : [buildPara([buildSpan('')])] };
852
+ return {
853
+ row, col, cs, rs, props, widthHwp,
854
+ heightHwp: heightHwp || undefined,
855
+ cellChildren: cellChildren.length ? cellChildren : [buildPara([buildSpan('')])],
856
+ };
680
857
  }
681
858
 
682
859
  /* ── PAGE_DEF ───────────────────────────────────────────────── */
@@ -718,6 +895,18 @@ function toStroke(b: { type: number; widthPt: number; color: string }): Stroke {
718
895
  return { kind: BORDER_KIND[b.type] ?? 'solid', pt: b.widthPt, color: b.color };
719
896
  }
720
897
 
898
+ // Apply borderFill to CellProps. Preserve explicit NONE so DOCX tcBorders can
899
+ // override the table-level tblBorders. Filtering NONE would let tblBorders bleed through.
900
+ function applyCellBorderFill(bf: HwpBorderFill, props: CellProps): void {
901
+ if (bf.borders.length >= 4) {
902
+ props.left = toStroke(bf.borders[0]);
903
+ props.right = toStroke(bf.borders[1]);
904
+ props.top = toStroke(bf.borders[2]);
905
+ props.bot = toStroke(bf.borders[3]);
906
+ }
907
+ if (bf.bgColor && bf.bgColor !== 'FFFFFF') props.bg = bf.bgColor;
908
+ }
909
+
721
910
  function strokeFromBF(bfId: number, di: DocInfo): Stroke | undefined {
722
911
  if (bfId <= 0 || bfId > di.borderFills.length) return undefined;
723
912
  const bf = di.borderFills[bfId - 1];
@@ -733,7 +922,11 @@ function buildParaProps(ps?: HwpParaShape): ParaProps {
733
922
  if (ps.spaceBefore > 0) p.spaceBefore = Metric.hwpToPt(ps.spaceBefore);
734
923
  if (ps.spaceAfter > 0) p.spaceAfter = Metric.hwpToPt(ps.spaceAfter);
735
924
  if (ps.lineSpacing > 0 && ps.lineSpacing !== 160) p.lineHeight = ps.lineSpacing / 100;
736
- if (ps.indent > 0) p.indentPt = Metric.hwpToPt(ps.indent);
925
+ // leftMargin (offset 4) = 문단 몸체 왼쪽 여백 → leftMargin (pt), ensure non-negative
926
+ const leftMarginPt = Math.max(0, Metric.hwpToPt(ps.leftMargin));
927
+ if (leftMarginPt > 0) p.leftMargin = leftMarginPt;
928
+ // indent (offset 12) = 첫 줄 들여쓰기(양수) / 내어쓰기(음수) → firstLineIndentPt
929
+ if (ps.indent !== 0) p.firstLineIndentPt = Metric.hwpToPt(ps.indent);
737
930
  return p;
738
931
  }
739
932
 
@@ -743,6 +936,7 @@ function buildParaProps(ps?: HwpParaShape): ParaProps {
743
936
 
744
937
  export class HwpScanner implements Decoder {
745
938
  readonly format = 'hwp';
939
+ readonly aliases = ['application/vnd.hancom.hwp'];
746
940
 
747
941
  async decode(data: Uint8Array): Promise<Outcome<DocRoot>> {
748
942
  const shield = new ShieldedParser();
@@ -764,6 +958,37 @@ export class HwpScanner implements Decoder {
764
958
  di = shield.guard(() => parseDocInfo(diRaw, compressed), di, 'hwp:docInfo');
765
959
  }
766
960
 
961
+ // Extract images from BinData streams.
962
+ // HWP duplicates each BinData entry: once as "BinData/BIN0001.jpg" and once as "BIN0001.jpg".
963
+ // We keep only the "BinData/" prefixed versions, sort by BIN number, then assign 0-based keys
964
+ // matching the order 'gso' CTRL_HEADER records are encountered during body parsing.
965
+ const binEntries: { binNum: number; data: Uint8Array }[] = [];
966
+ for (const [path, streamData] of streams) {
967
+ // Match "BinData/BIN0001.jpg" style — the canonical form
968
+ const m = path.match(/^BinData[/\\]BIN(\d+)\.\w+$/i);
969
+ if (m) binEntries.push({ binNum: parseInt(m[1], 10), data: streamData });
970
+ }
971
+ // Sort by BIN number (ascending) so BIN0001→idx0, BIN0002→idx1, …
972
+ binEntries.sort((a, b) => a.binNum - b.binNum);
973
+
974
+ const objectMap = new Map<number, ImgNode>();
975
+ for (let idx = 0; idx < binEntries.length; idx++) {
976
+ const { data: imgData } = binEntries[idx];
977
+
978
+ // Determine MIME type from binary signature first, then fall back to extension
979
+ let mimeType: ImgNode['mime'] = 'image/jpeg';
980
+ if (imgData[0] === 0x89 && imgData[1] === 0x50) mimeType = 'image/png';
981
+ else if (imgData[0] === 0x47 && imgData[1] === 0x49) mimeType = 'image/gif';
982
+ else if (imgData[0] === 0x42 && imgData[1] === 0x4D) mimeType = 'image/bmp';
983
+
984
+ const base64 = TextKit.base64Encode(imgData);
985
+ const { wPt, hPt } = getImageDimsPt(imgData, mimeType);
986
+ objectMap.set(idx, buildImg(base64, mimeType, wPt, hPt));
987
+ }
988
+
989
+ // gsoCtx tracks sequential 'gso' encounter order — must be shared across all sections
990
+ const gsoCtx: GsoCtx = { count: 0 };
991
+
767
992
  // Body sections
768
993
  const allContent: ContentNode[] = [];
769
994
  let pageDims: PageDims = A4;
@@ -774,7 +999,7 @@ export class HwpScanner implements Decoder {
774
999
  if (s === 0) {
775
1000
  const fb = findBodySection(streams);
776
1001
  if (fb) {
777
- const r = parseBody(fb, compressed, di, shield);
1002
+ const r = parseBody(fb, compressed, di, shield, gsoCtx);
778
1003
  allContent.push(...r.content);
779
1004
  if (r.pageDims) pageDims = r.pageDims;
780
1005
  }
@@ -782,7 +1007,7 @@ export class HwpScanner implements Decoder {
782
1007
  break;
783
1008
  }
784
1009
  const r = shield.guard(
785
- () => parseBody(sec, compressed, di, shield),
1010
+ () => parseBody(sec, compressed, di, shield, gsoCtx),
786
1011
  { content: [], pageDims: undefined },
787
1012
  `hwp:sec${s}`,
788
1013
  );
@@ -790,6 +1015,10 @@ export class HwpScanner implements Decoder {
790
1015
  if (r.pageDims) pageDims = r.pageDims;
791
1016
  }
792
1017
 
1018
+ if (objectMap.size > 0) {
1019
+ injectImagesIntoContent(allContent, objectMap);
1020
+ }
1021
+
793
1022
  warns.push(...shield.flush());
794
1023
  const content = allContent.length > 0 ? allContent : [buildPara([buildSpan('')])];
795
1024
  return succeed(buildRoot({}, [buildSheet(content, pageDims)]), warns);
@@ -806,4 +1035,164 @@ function findBodySection(streams: Map<string, Uint8Array>): Uint8Array | undefin
806
1035
  return undefined;
807
1036
  }
808
1037
 
1038
+ /* ═══════════════════════════════════════════════════════════════
1039
+ Image dimension extraction from binary headers
1040
+ ════════════════════════════════════════════════════════════ */
1041
+
1042
/**
 * Reads the pixel dimensions from an image's binary header and converts them to points.
 *
 * Pixels are converted at 96 DPI (1 px = 0.75 pt). Supported containers are PNG, JPEG,
 * BMP and GIF, selected by `mime`. Any other MIME type, a truncated header, or a
 * zero width/height yields the 1-inch fallback `{ wPt: 72, hPt: 72 }`.
 *
 * @param data Raw image bytes (may be a view into a larger buffer).
 * @param mime MIME type that decides which header layout is parsed.
 * @returns Width and height in points.
 */
function getImageDimsPt(data: Uint8Array, mime: string): { wPt: number; hPt: number } {
  const PT_PER_PX = 0.75; // 96 DPI → pt
  const fallback = { wPt: 72, hPt: 72 };
  const toPt = (w: number, h: number) => ({ wPt: w * PT_PER_PX, hPt: h * PT_PER_PX });

  // SOF0..SOF15 carry the frame size; 0xC4 (DHT), 0xC8 (JPG) and 0xCC (DAC) share the
  // numeric range but are not frame headers.
  const isSofMarker = (m: number): boolean =>
    m >= 0xc0 && m <= 0xcf && m !== 0xc4 && m !== 0xc8 && m !== 0xcc;
  // Markers that stand alone, i.e. are NOT followed by a 2-byte segment length.
  const isStandaloneMarker = (m: number): boolean =>
    m === 0x00 || m === 0x01 || (m >= 0xd0 && m <= 0xd9);

  try {
    // Honour byteOffset so that subarray views are read correctly.
    const view = new DataView(data.buffer, data.byteOffset, data.byteLength);

    if (mime === 'image/png' && data.length >= 24) {
      // PNG IHDR: sig(8) + length(4) + type(4) + width(4) + height(4) — all big-endian
      const w = view.getUint32(16);
      const h = view.getUint32(20);
      if (w > 0 && h > 0) return toPt(w, h);
    }

    if (mime === 'image/jpeg') {
      let i = 2; // skip SOI (FF D8)
      while (i + 3 < data.length) {
        if (view.getUint8(i) !== 0xff) { i++; continue; }
        const marker = view.getUint8(i + 1);
        // FF FF: the first FF is a fill byte; the real marker starts one byte later.
        if (marker === 0xff) { i++; continue; }
        if (isStandaloneMarker(marker)) { i += 2; continue; }
        if (isSofMarker(marker)) {
          // SOF: 2-byte marker + 2-byte length + 1-byte precision + 2-byte height + 2-byte width
          if (i + 8 >= data.length) break;
          const h = view.getUint16(i + 5);
          const w = view.getUint16(i + 7);
          if (w > 0 && h > 0) return toPt(w, h);
        }
        // Segment length includes its own two bytes; guard against corrupt values
        // so the scan always advances.
        const segLen = view.getUint16(i + 2);
        i += 2 + (segLen >= 2 ? segLen : 2);
      }
    }

    if (mime === 'image/bmp' && data.length >= 26) {
      // BMP DIB header: width at 18, height at 22 (signed int32 LE; negative = top-down)
      const w = view.getUint32(18, true);
      const h = Math.abs(view.getInt32(22, true));
      if (w > 0 && h > 0) return toPt(w, h);
    }

    if (mime === 'image/gif' && data.length >= 10) {
      // GIF logical screen descriptor: width at 6, height at 8 (uint16 LE)
      const w = view.getUint16(6, true);
      const h = view.getUint16(8, true);
      if (w > 0 && h > 0) return toPt(w, h);
    }
  } catch {
    // Best effort: an out-of-range read on malformed data falls through to the fallback.
  }
  return fallback;
}
1083
+
1084
+ /* ═══════════════════════════════════════════════════════════════
1085
+ OLE Object extraction (images)
1086
+ ════════════════════════════════════════════════════════════ */
1087
+
1088
/**
 * Splits a concatenated object stream into individual objects and sniffs each
 * payload's MIME type from its leading bytes.
 *
 * Record layout as parsed here: a 16-byte header (uint32 LE object id, uint32 LE
 * payload size, 8 further bytes that are skipped) followed by `size` payload bytes.
 * Parsing stops at the first record with a zero id or zero size, or whose header or
 * payload would run past the end of `data`.
 *
 * @param data Raw stream bytes (may be a view into a larger buffer).
 * @returns The objects found, in stream order; each `data` is a view into the input,
 *          not a copy. Unrecognised payloads get 'application/octet-stream'.
 */
function extractImagesFromOleObjectLink(data: Uint8Array): OleObject[] {
  const HEADER_SIZE = 16;

  // Identify the payload by its magic number.
  const sniffMime = (bytes: Uint8Array): string => {
    if (bytes[0] === 0xff && bytes[1] === 0xd8 && bytes[2] === 0xff) return 'image/jpeg';
    if (bytes[0] === 0x89 && bytes[1] === 0x50 && bytes[2] === 0x4e && bytes[3] === 0x47) {
      return 'image/png';
    }
    // "GIF8" — covers both GIF87a and GIF89a
    if (bytes[0] === 0x47 && bytes[1] === 0x49 && bytes[2] === 0x46 && bytes[3] === 0x38) {
      return 'image/gif';
    }
    if (bytes[0] === 0x42 && bytes[1] === 0x4d) return 'image/bmp';
    return 'application/octet-stream';
  };

  const objects: OleObject[] = [];
  // Honour byteOffset so that subarray views are read correctly.
  const view = new DataView(data.buffer, data.byteOffset, data.byteLength);
  let off = 0;

  // Require the whole header to be present before reading any of its fields.
  while (off + HEADER_SIZE <= data.length) {
    const objId = view.getUint32(off, true);
    const dataSize = view.getUint32(off + 4, true);

    if (objId === 0 || dataSize === 0) break;

    const objOff = off + HEADER_SIZE;
    if (objOff + dataSize > data.length) break;

    const objData = data.subarray(objOff, objOff + dataSize);
    objects.push({ id: objId, data: objData, mimeType: sniffMime(objData) });
    off = objOff + dataSize;
  }

  return objects;
}
1122
+
1123
+ /* ═══════════════════════════════════════════════════════════════
1124
+ Helper to inject images into paragraph content
1125
+ ════════════════════════════════════════════════════════════ */
1126
+
1127
/**
 * Replaces image placeholder spans in `content` with the matching image nodes, in place.
 *
 * A placeholder is a span whose first child is a text node whose content is exactly
 * `__IMG_N__` / `__EXT_N__`, optionally carrying a display size as
 * `__EXT_N_W<wPt>_H<hPt>__`. `N` is looked up in `objectMap`; placeholders with no
 * entry are left untouched. When both encoded sizes are positive, a shallow copy of
 * the mapped node with `w`/`h` overridden is inserted; otherwise the mapped node
 * itself is inserted.
 *
 * Paragraphs and grids are walked recursively, so placeholders inside table cells,
 * tables nested directly in cells, and tables anchored inside a cell's paragraph are
 * all resolved.
 *
 * @param content   Top-level content nodes; their `kids` arrays are mutated.
 * @param objectMap Image nodes keyed by placeholder index.
 */
function injectImagesIntoContent(
  content: ContentNode[],
  objectMap: Map<number, ImgNode>
): void {
  if (objectMap.size === 0) return;

  // Hoisted so the pattern is compiled once rather than per visited span.
  const PLACEHOLDER = /^__(?:IMG|EXT)_(\d+)(?:_W(\d+)_H(\d+))?__$/;

  // The tree is walked structurally by `tag`, so the helpers take loosely typed nodes.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  function resolvePlaceholder(kid: any): ImgNode | undefined {
    // Span node structure: { tag: 'span', props, kids: [{ tag: 'txt', content }] }
    if (kid?.tag !== 'span' || !kid.kids || kid.kids[0]?.tag !== 'txt') return undefined;
    const text = kid.kids[0].content;
    if (typeof text !== 'string') return undefined;

    const match = PLACEHOLDER.exec(text);
    if (!match) return undefined;

    const base = objectMap.get(parseInt(match[1], 10));
    if (!base) return undefined;

    const wPt = match[2] ? parseInt(match[2], 10) : 0;
    const hPt = match[3] ? parseInt(match[3], 10) : 0;
    // Use encoded display size when valid; otherwise keep pixel-based dims
    return wPt > 0 && hPt > 0 ? { ...base, w: wPt, h: hPt } : base;
  }

  // Resolves placeholders among a paragraph's kids, then descends into any grid kid.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  function processParaKids(kids: any[]): void {
    for (let i = 0; i < kids.length; i++) {
      const img = resolvePlaceholder(kids[i]);
      if (img) kids[i] = img;
    }
    for (const kid of kids) {
      if (kid?.tag === 'grid') processGrid(kid);
    }
  }

  // Walks grid → rows → cells and handles each cell's paragraphs and nested grids.
  // eslint-disable-next-line @typescript-eslint/no-explicit-any
  function processGrid(grid: any): void {
    if (!Array.isArray(grid.kids)) return;
    for (const row of grid.kids) {
      if (!Array.isArray(row?.kids)) continue;
      for (const cell of row.kids) {
        if (!Array.isArray(cell?.kids)) continue;
        for (const cellKid of cell.kids) {
          if (cellKid?.tag === 'grid') {
            processGrid(cellKid);
          } else if (cellKid?.tag === 'para' && Array.isArray(cellKid.kids)) {
            // Same treatment as a top-level paragraph, including grids among its kids.
            processParaKids(cellKid.kids);
          }
        }
      }
    }
  }

  for (const node of content) {
    if (node.tag === 'para' && node.kids) {
      processParaKids(node.kids);
    } else if (node.tag === 'grid') {
      processGrid(node);
    }
  }
}
1197
+
809
1198
  // Register an HwpScanner instance with the decoder registry as this top-level statement runs.
  registry.registerDecoder(new HwpScanner());