hwpkit-dev 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,809 @@
1
+ import type { Decoder } from '../../contract/decoder';
2
+ import type { DocRoot, ContentNode, ParaNode, SpanNode } from '../../model/doc-tree';
3
+ import type { Outcome } from '../../contract/result';
4
+ import type { Align, Stroke, StrokeKind, PageDims, TextProps, ParaProps, CellProps, GridProps } from '../../model/doc-props';
5
+ import { succeed, fail } from '../../contract/result';
6
+ import { buildRoot, buildSheet, buildPara, buildSpan, buildGrid, buildRow, buildCell } from '../../model/builders';
7
+ import { ShieldedParser } from '../../safety/ShieldedParser';
8
+ import { BinaryKit } from '../../toolkit/BinaryKit';
9
+ import { Metric, safeHex, safeFont } from '../../safety/StyleBridge';
10
+ import { registry } from '../../pipeline/registry';
11
+ import { A4 } from '../../model/doc-props';
12
+ import pako from 'pako';
13
+
14
+ /* ═══════════════════════════════════════════════════════════════
15
+ HWP 5.0 Tag Constants
16
+ ═══════════════════════════════════════════════════════════════ */
17
+
18
/** First tag id used by HWP 5.0 records; every tag below is expressed as an offset from it. */
const HWPTAG_BEGIN = 16;

// Record tags handled while reading the DocInfo stream.
const TAG_FACE_NAME = HWPTAG_BEGIN + 3; // 19
const TAG_BORDER_FILL = HWPTAG_BEGIN + 4; // 20
const TAG_CHAR_SHAPE = HWPTAG_BEGIN + 5; // 21
const TAG_PARA_SHAPE = HWPTAG_BEGIN + 9; // 25

// Record tags handled while reading body sections.
const TAG_PARA_HEADER = HWPTAG_BEGIN + 50; // 66
const TAG_PARA_TEXT = HWPTAG_BEGIN + 51; // 67
const TAG_PARA_CHAR_SHAPE = HWPTAG_BEGIN + 52; // 68
const TAG_CTRL_HEADER = HWPTAG_BEGIN + 55; // 71
const TAG_PAGE_DEF = HWPTAG_BEGIN + 57; // 73

// TABLE / CELL tags vary by HWP version, so two candidates of each are accepted.
const TAG_LIST_HEADER = HWPTAG_BEGIN + 56; // 72
const TAG_TABLE_A = HWPTAG_BEGIN + 61; // 77
const TAG_CELL_A = HWPTAG_BEGIN + 62; // 78
const TAG_TABLE_B = HWPTAG_BEGIN + 64; // 80
const TAG_CELL_B = HWPTAG_BEGIN + 65; // 81

/** True when `t` is one of the two tags accepted as a TABLE record. */
function isTableTag(t: number) {
  switch (t) {
    case TAG_TABLE_A:
    case TAG_TABLE_B:
      return true;
    default:
      return false;
  }
}

/** True when `t` is a tag accepted as a table cell (either CELL variant or LIST_HEADER). */
function isCellTag(t: number) {
  switch (t) {
    case TAG_CELL_A:
    case TAG_CELL_B:
    case TAG_LIST_HEADER:
      return true;
    default:
      return false;
  }
}

// CTRL_HEADER control id of a table: the characters 't','b','l',' ' packed most
// significant byte first (0x74 0x62 0x6C 0x20). Read as a little-endian UINT32,
// the bytes on disk therefore appear in the order ' lbt'.
const CTRL_TABLE = 0x74626C20;
42
+
43
+ /* ═══════════════════════════════════════════════════════════════
44
+ Types
45
+ ═══════════════════════════════════════════════════════════════ */
46
+
47
/** One record of an HWP stream, as produced by `parseRecords`. */
interface HwpRecord {
  /** Record tag (low 10 bits of the record header). */
  tag: number;
  /** Nesting level of the record (next 10 bits of the record header). */
  level: number;
  /** Payload bytes, a view into the stream buffer. */
  data: Uint8Array;
}

/** Decoded CHAR_SHAPE record. */
interface HwpCharShape {
  /** Seven face ids read from the start of the record. */
  faceIds: number[];
  /** Font height in HWP units (100 = 1pt). */
  height: number;
  bold: boolean;
  italic: boolean;
  underline: boolean;
  strikeout: boolean;
  superscript: boolean;
  subscript: boolean;
  /** Text colour as an upper-case `RRGGBB` hex string. */
  textColor: string;
}

/** Decoded PARA_SHAPE record; the numeric fields keep the raw values read from the record. */
interface HwpParaShape {
  align: Align;
  spaceBefore: number;
  spaceAfter: number;
  lineSpacing: number;
  indent: number;
}

/** Decoded BORDER_FILL record. */
interface HwpBorderFill {
  /** Border lines in the order left, right, top, bottom. */
  borders: { type: number; widthPt: number; color: string }[];
  /** Background colour (`RRGGBB`), present only when the fill-type flag bit 0 is set. */
  bgColor?: string;
}

/** Lookup tables collected from the DocInfo stream, in record order. */
interface DocInfo {
  faceNames: string[];
  charShapes: HwpCharShape[];
  paraShapes: HwpParaShape[];
  borderFills: HwpBorderFill[];
}

/** A visible character together with its WORD position inside the paragraph text. */
interface ParsedChar {
  pos: number;
  ch: string;
}

/** Result of decoding a PARA_TEXT record. */
interface ParaTextResult {
  chars: ParsedChar[];
  /** WORD positions at which an extended control was encountered. */
  controlPositions: number[];
}
87
+
88
+ /* ═══════════════════════════════════════════════════════════════
89
+ Low-level record parsing
90
+ ═══════════════════════════════════════════════════════════════ */
91
+
92
/**
 * Splits a record stream into its individual records.
 *
 * Each record starts with a 32-bit little-endian header: bits 0-9 hold the tag,
 * bits 10-19 the level and bits 20-31 the payload size. A size field of 0xFFF
 * means the real size follows as an extra UINT32.
 *
 * Parsing stops silently at the first header or payload that runs past the end
 * of `data`; records decoded up to that point are returned.
 */
function parseRecords(data: Uint8Array): HwpRecord[] {
  const records: HwpRecord[] = [];
  const total = data.length;
  let cursor = 0;

  for (;;) {
    if (cursor + 4 > total) break;
    const header = BinaryKit.readU32LE(data, cursor);
    cursor += 4;

    let payloadLen = (header >> 20) & 0xFFF;
    if (payloadLen === 0xFFF) {
      // Escape value: the actual length is stored in the following UINT32.
      if (cursor + 4 > total) break;
      payloadLen = BinaryKit.readU32LE(data, cursor);
      cursor += 4;
    }

    const payloadEnd = cursor + payloadLen;
    if (payloadEnd > total) break;

    records.push({
      tag: header & 0x3FF,
      level: (header >> 10) & 0x3FF,
      data: data.subarray(cursor, payloadEnd),
    });
    cursor = payloadEnd;
  }

  return records;
}
112
+
113
/**
 * Attempts a raw-deflate decompression of `data`.
 * If pako throws, the input bytes are returned unchanged (best-effort).
 */
function tryInflate(data: Uint8Array): Uint8Array {
  let result: Uint8Array = data;
  try {
    result = pako.inflateRaw(data);
  } catch {
    // Inflation failed: fall through and hand back the original bytes.
  }
  return result;
}
116
+
117
+ /* ═══════════════════════════════════════════════════════════════
118
+ FileHeader
119
+ ═══════════════════════════════════════════════════════════════ */
120
+
121
/**
 * Reads the compression and encryption flags from the FileHeader stream.
 * The flags are bits 0 and 1 of the UINT32 at offset 36. A header shorter than
 * 40 bytes is treated as "compressed, not encrypted".
 */
function parseFileHeader(buf: Uint8Array) {
  if (buf.length >= 40) {
    const flags = BinaryKit.readU32LE(buf, 36);
    const compressed = (flags & 0x1) !== 0;
    const encrypted = (flags & 0x2) !== 0;
    return { compressed, encrypted };
  }
  return { compressed: true, encrypted: false };
}
126
+
127
+ /* ═══════════════════════════════════════════════════════════════
128
+ DocInfo parsing
129
+ ═══════════════════════════════════════════════════════════════ */
130
+
131
/**
 * Builds the DocInfo lookup tables (face names, char shapes, para shapes,
 * border fills) from the DocInfo stream.
 *
 * @param data        Raw stream bytes.
 * @param compressed  When true the stream is inflated first (best-effort).
 * @returns Tables filled in record order; a record whose parser throws is skipped.
 */
function parseDocInfo(data: Uint8Array, compressed: boolean): DocInfo {
  const info: DocInfo = { faceNames: [], charShapes: [], paraShapes: [], borderFills: [] };
  const stream = compressed ? tryInflate(data) : data;

  for (const { tag, data: payload } of parseRecords(stream)) {
    try {
      switch (tag) {
        case TAG_FACE_NAME:
          info.faceNames.push(parseFaceName(payload));
          break;
        case TAG_CHAR_SHAPE:
          info.charShapes.push(parseCharShape(payload));
          break;
        case TAG_PARA_SHAPE:
          info.paraShapes.push(parseParaShape(payload));
          break;
        case TAG_BORDER_FILL:
          info.borderFills.push(parseBorderFill(payload));
          break;
        default:
          break;
      }
    } catch {
      /* skip malformed record */
    }
  }

  return info;
}
146
+
147
+ /* ── FACE_NAME ──────────────────────────────────────────────── */
148
+
149
/**
 * Decodes the font name of a FACE_NAME record.
 * Byte 0 is skipped, bytes 1-2 hold the UTF-16 character count and the name
 * follows from byte 3. Returns '' when the record is too short for its own count.
 */
function parseFaceName(d: Uint8Array): string {
  const NAME_OFFSET = 3;
  if (d.length < NAME_OFFSET) return '';

  const charCount = BinaryKit.readU16LE(d, 1); // UTF-16 code units, not bytes
  const nameEnd = NAME_OFFSET + charCount * 2;
  if (nameEnd > d.length) return '';

  const utf16 = new TextDecoder('utf-16le');
  return utf16.decode(d.subarray(NAME_OFFSET, nameEnd));
}
155
+
156
+ /* ── CHAR_SHAPE ─────────────────────────────────────────────── */
157
+ /* offset size field
158
+ 0 14 faceId[7] (UINT16 × 7)
159
+ 14 7 ratio[7]
160
+ 21 7 spacing[7]
161
+ 28 7 relSize[7]
162
+ 35 7 offset[7]
163
+ 42 4 height (UINT32, HWP-units 100 = 1pt)
164
+ 46 4 attr (UINT32, bit flags)
165
+ 50 1 shadowX
166
+ 51 1 shadowY
167
+ 52 4 textColor (COLORREF R,G,B,0) */
168
+
169
/**
 * Decodes a CHAR_SHAPE record into the subset of attributes the decoder uses.
 *
 * Fields that fall outside a truncated record get defaults: face id 0,
 * height 1000 (10pt), no attributes, black text.
 *
 * @param d  Record payload.
 * @returns The decoded character shape.
 */
function parseCharShape(d: Uint8Array): HwpCharShape {
  const faceIds: number[] = [];
  for (let i = 0; i < 7; i++) faceIds.push(d.length >= (i + 1) * 2 ? BinaryKit.readU16LE(d, i * 2) : 0);

  const height = d.length >= 46 ? BinaryKit.readU32LE(d, 42) : 1000;
  const attr = d.length >= 50 ? BinaryKit.readU32LE(d, 46) : 0;

  // attr bit layout (HWP 5.0 character shape attribute table):
  //   0: italic, 1: bold, 2-3: underline type, 4-7: underline shape,
  //   8-10: outline, 11-12: shadow, 13: emboss, 14: engrave,
  //   15: superscript, 16: subscript, 17: reserved, 18-20: strikeout,
  //   21-24: emphasis mark, 25: useFontSpace, 26-29: strikeout shape, 30: kerning
  const ulType = (attr >> 2) & 0x3; // 2 bits at 2-3; bit 4 already belongs to the underline shape
  const skType = (attr >> 18) & 0x7; // 3 bits at 18-20
  const isSuper = ((attr >> 15) & 1) !== 0;
  const isSub = ((attr >> 16) & 1) !== 0;

  return {
    faceIds,
    // Heights outside (0, 100000) are treated as corrupt and replaced by 10pt.
    height: (height > 0 && height < 100000) ? height : 1000,
    italic: (attr & 1) !== 0,
    bold: ((attr >> 1) & 1) !== 0,
    underline: ulType !== 0,
    strikeout: skType !== 0,
    superscript: isSuper,
    // Keep the two flags mutually exclusive, as callers previously observed.
    subscript: isSub && !isSuper,
    textColor: d.length >= 56 ? colorRef(d, 52) : '000000',
  };
}
198
+
199
+ /* ── PARA_SHAPE ─────────────────────────────────────────────── */
200
/* offset size field
   0   4  attr1 (bits 0-1 = line-spacing type,
                 bits 2-4 = alignment: 0=justify,1=left,2=right,3=center,
                                       4=distribute,5=divide)
   4   4  leftMargin (HWPUNIT)
   8   4  rightMargin
   12  4  indent
   16  4  spaceBefore
   20  4  spaceAfter
   24  4  lineSpacing */

// "distribute" and "divide" have no direct equivalent in Align; both are rendered as justify.
const ALIGN_TBL: Record<number, Align> = { 0: 'justify', 1: 'left', 2: 'right', 3: 'center', 4: 'justify', 5: 'justify' };

/**
 * Decodes a PARA_SHAPE record.
 *
 * @param d  Record payload.
 * @returns Alignment plus the raw indent / spacing values. Fields beyond the end
 *          of a truncated record default to 0 (line spacing to 160); a record
 *          shorter than 4 bytes yields a left-aligned default shape.
 */
function parseParaShape(d: Uint8Array): HwpParaShape {
  if (d.length < 4) return { align: 'left', spaceBefore: 0, spaceAfter: 0, lineSpacing: 160, indent: 0 };
  const attr = BinaryKit.readU32LE(d, 0);
  // Alignment occupies bits 2-4; bits 0-1 are the line-spacing type and must be shifted out.
  const alignCode = (attr >> 2) & 0x7;
  return {
    align: ALIGN_TBL[alignCode] ?? 'left',
    indent: d.length >= 16 ? i32(d, 12) : 0,
    spaceBefore: d.length >= 20 ? i32(d, 16) : 0,
    spaceAfter: d.length >= 24 ? i32(d, 20) : 0,
    lineSpacing: d.length >= 28 ? i32(d, 24) : 160,
  };
}
222
+
223
+ /* ── BORDER_FILL ────────────────────────────────────────────── */
224
+ /* [0:2] attr
225
+ For each of 5 borders (left,right,top,bottom,diagonal): 6 bytes
226
+ +0 type(BYTE) +1 widthIdx(BYTE) +2 color(COLORREF)
227
+ [32:4] fillType
228
+ [36:4] faceColor (bgColor for solid fill) */
229
+
230
/** Border width in points, indexed by the width index stored in a BORDER_FILL record. */
const BORDER_W_PT = [0.28, 0.34, 0.43, 0.57, 0.71, 0.85, 1.13, 1.42, 1.70, 1.98, 2.84, 4.25, 5.67, 8.50, 11.34, 14.17];

/** Maps the border type byte of a BORDER_FILL record onto the model's stroke kinds. */
const BORDER_KIND: Record<number, StrokeKind> = {
  0: 'none',
  1: 'solid',
  2: 'dash',
  3: 'dot',
  4: 'dash',
  5: 'dash',
  6: 'dash',
  7: 'dot',
  8: 'double',
  9: 'double',
  10: 'double',
};

/**
 * Decodes a BORDER_FILL record: the first four border lines (left, right, top,
 * bottom) and, when the fill-type flag bit 0 is set, the background colour.
 * A border that lies beyond the end of the record becomes type 0 / 0.5pt / black.
 */
function parseBorderFill(d: Uint8Array): HwpBorderFill {
  const ATTR_BYTES = 2;
  const BORDER_BYTES = 6; // type(1) + widthIdx(1) + COLORREF(4)

  const borders: HwpBorderFill['borders'] = Array.from({ length: 4 }, (_unused, side) => {
    const base = ATTR_BYTES + side * BORDER_BYTES;
    if (base + BORDER_BYTES > d.length) return { type: 0, widthPt: 0.5, color: '000000' };
    return { type: d[base], widthPt: BORDER_W_PT[d[base + 1]] ?? 0.5, color: colorRef(d, base + 2) };
  });

  // Fill info starts after attr(2) + 5 borders(30), the fifth being the diagonal.
  const fillOffset = 32;
  let bgColor: string | undefined;
  if (d.length >= fillOffset + 8) {
    const fillType = BinaryKit.readU32LE(d, fillOffset);
    if ((fillType & 1) !== 0) bgColor = colorRef(d, fillOffset + 4);
  }

  return { borders, bgColor };
}
251
+
252
+ /* ═══════════════════════════════════════════════════════════════
253
+ Body section parsing
254
+ ═══════════════════════════════════════════════════════════════ */
255
+
256
/**
 * Parses one body section stream into content nodes.
 *
 * Only PAGE_DEF and PARA_HEADER records are acted on at this level; every other
 * record is stepped over. Each parse step runs under `shield.guard` with a
 * fallback value, labelled with the record index.
 *
 * @returns The collected nodes and the last page definition seen, if any.
 */
function parseBody(
  raw: Uint8Array, compressed: boolean, di: DocInfo, shield: ShieldedParser,
): { content: ContentNode[]; pageDims?: PageDims } {
  const stream = compressed ? tryInflate(raw) : raw;
  const recs = parseRecords(stream);

  const content: ContentNode[] = [];
  let pageDims: PageDims | undefined;

  let idx = 0;
  while (idx < recs.length) {
    const rec = recs[idx];
    switch (rec.tag) {
      case TAG_PAGE_DEF:
        pageDims = shield.guard(() => parsePageDef(rec.data), A4, 'hwp:pageDef');
        idx += 1;
        break;

      case TAG_PARA_HEADER: {
        const at = idx;
        const group = shield.guard(
          () => parseParagraphGroup(recs, at, di, shield),
          { nodes: [] as ContentNode[], next: at + 1 },
          `hwp:para@${at}`,
        );
        content.push(...group.nodes);
        idx = group.next; // the group reports where its subtree ends
        break;
      }

      default:
        idx += 1;
    }
  }

  return { content, pageDims };
}
282
+
283
+ /* ── Paragraph group ────────────────────────────────────────── */
284
+
285
/**
 * Parses the PARA_HEADER at `start` together with its descendant records.
 *
 * Direct children (level + 1) supply the paragraph text, the char-shape runs and
 * control headers; table controls are turned into grid nodes, other controls
 * are skipped with their subtree. Deeper records are stepped over one by one.
 *
 * @returns The nodes produced (paragraph first, then any tables) and the index
 *          of the first record that no longer belongs to this paragraph.
 */
function parseParagraphGroup(
  recs: HwpRecord[], start: number, di: DocInfo, shield: ShieldedParser,
): { nodes: ContentNode[]; next: number } {
  const header = recs[start];
  const baseLevel = header.level;
  const childLevel = baseLevel + 1;

  // paraShapeId is the UINT16 at offset 8 of the paragraph header.
  const shapeId = header.data.length < 10 ? 0 : BinaryKit.readU16LE(header.data, 8);
  const shape = di.paraShapes[shapeId];

  let text: ParaTextResult | null = null;
  let shapePairs: [number, number][] = [];
  const tables: ContentNode[] = [];

  let cur = start + 1;
  while (cur < recs.length && recs[cur].level > baseLevel) {
    const rec = recs[cur];

    if (rec.level !== childLevel) {
      cur++;
      continue;
    }

    switch (rec.tag) {
      case TAG_PARA_TEXT:
        text = decodeParaText(rec.data);
        cur++;
        break;

      case TAG_PARA_CHAR_SHAPE:
        shapePairs = parseCharShapePairs(rec.data);
        cur++;
        break;

      case TAG_CTRL_HEADER: {
        const isTable = rec.data.length >= 4 && BinaryKit.readU32LE(rec.data, 0) === CTRL_TABLE;
        if (!isTable) {
          cur = skipKids(recs, cur);
          break;
        }
        const at = cur;
        const parsedTable = shield.guard(
          () => parseTableCtrl(recs, at, di, shield),
          { grid: null, next: skipKids(recs, at) },
          `hwp:tbl@${at}`,
        );
        if (parsedTable.grid) tables.push(parsedTable.grid);
        cur = parsedTable.next;
        break;
      }

      default:
        cur++;
    }
  }

  const nodes: ContentNode[] = [];

  // A paragraph node is emitted only when the text has non-whitespace content.
  if (text !== null && text.chars.length > 0) {
    const plain = text.chars.map(c => c.ch).join('');
    if (plain.trim().length > 0) {
      const spans = resolveCharShapes(text.chars, shapePairs, di);
      nodes.push(buildPara(spans, buildParaProps(shape)));
    }
  }

  nodes.push(...tables);
  return { nodes, next: cur };
}
340
+
341
/**
 * Returns the index of the first record after `idx` whose level is not deeper
 * than that of `recs[idx]`, i.e. the end of the subtree rooted at `idx`.
 * Yields `recs.length` when the subtree runs to the end of the list.
 */
function skipKids(recs: HwpRecord[], idx: number): number {
  const parentLevel = recs[idx].level;
  for (let n = idx + 1; n < recs.length; n++) {
    if (recs[n].level <= parentLevel) return n;
  }
  return recs.length;
}
347
+
348
+ /* ── PARA_TEXT ───────────────────────────────────────────────── */
349
+
350
// Extended controls: occupy 8 WORDs in the text and have an associated CTRL_HEADER.
// (1 reserved, 2 section/column def, 3 field start, 11 drawing object/table, 12 reserved,
//  14 reserved, 15 hidden comment, 16 header/footer, 17 footnote/endnote, 18 auto number,
//  21 page control, 22 bookmark/index mark, 23 comment overlap)
const EXT_CTRL = new Set([1, 2, 3, 11, 12, 14, 15, 16, 17, 18, 21, 22, 23]);
// Inline controls: occupy 8 WORDs, no CTRL_HEADER. Tab (9) is also inline but is
// handled separately because it produces a visible character.
const INL_CTRL = new Set([4, 5, 6, 7, 8, 19, 20]);

/**
 * Decodes a PARA_TEXT record (UTF-16LE code units mixed with control codes).
 *
 * Control codes below 32 are classified by size: extended and inline controls
 * take 8 WORDs including their payload, the remaining ones take a single WORD.
 * Treating an 8-WORD control as 1 WORD would turn its payload into bogus text,
 * so every such code must be listed in EXT_CTRL / INL_CTRL.
 *
 * @param d  Record payload; a trailing odd byte is ignored.
 * @returns Visible characters with their WORD position (positions count control
 *          payload too, so they line up with PARA_CHAR_SHAPE positions) and the
 *          positions of extended controls. Decoding stops at the paragraph-end
 *          code (13).
 */
function decodeParaText(d: Uint8Array): ParaTextResult {
  const chars: ParsedChar[] = [];
  const controlPositions: number[] = [];
  let i = 0, pos = 0;

  while (i + 1 < d.length) {
    const c = d[i] | (d[i + 1] << 8);

    if (c >= 32) {
      // Ordinary code unit; surrogate halves are re-joined when callers concatenate `ch`.
      chars.push({ pos, ch: String.fromCharCode(c) });
      i += 2; pos++;
      continue;
    }

    if (c === 13) break; // paragraph end

    if (c === 9) { // tab (inline, 8 WORDs)
      chars.push({ pos, ch: '\t' });
      i += 16; pos += 8;
      continue;
    }
    if (EXT_CTRL.has(c)) {
      controlPositions.push(pos);
      i += 16; pos += 8;
      continue;
    }
    if (INL_CTRL.has(c)) {
      i += 16; pos += 8;
      continue;
    }

    // Single-WORD character controls. Some of them stand for visible text.
    if (c === 10) chars.push({ pos, ch: '\n' }); // line break
    else if (c === 24) chars.push({ pos, ch: '-' }); // hyphen
    else if (c === 30) chars.push({ pos, ch: '\u00A0' }); // non-breaking space
    else if (c === 31) chars.push({ pos, ch: ' ' }); // fixed-width space
    // 0 and 25-29 carry no text and are dropped.
    i += 2; pos++;
  }

  return { chars, controlPositions };
}
384
+
385
+ /* ── PARA_CHAR_SHAPE ────────────────────────────────────────── */
386
+
387
/**
 * Decodes a PARA_CHAR_SHAPE record into `[position, charShapeId]` pairs.
 * Each pair is two consecutive UINT32 values; trailing bytes that do not form a
 * complete 8-byte pair are ignored.
 */
function parseCharShapePairs(d: Uint8Array): [number, number][] {
  const PAIR_BYTES = 8;
  const pairCount = Math.floor(d.length / PAIR_BYTES);
  return Array.from({ length: pairCount }, (_unused, n): [number, number] => {
    const base = n * PAIR_BYTES;
    return [BinaryKit.readU32LE(d, base), BinaryKit.readU32LE(d, base + 4)];
  });
}
393
+
394
+ /* ── Char-shape → SpanNode resolution ───────────────────────── */
395
+
396
/**
 * Groups characters into spans of equal char-shape id.
 *
 * The shape for a character is the id of the last pair whose start position is
 * not greater than the character's position (pairs are scanned in order and the
 * scan stops at the first pair that starts later). Before the first pair, the
 * first pair's id is used, or 0 when there are no pairs.
 *
 * @returns One styled span per run; a single empty span for empty input.
 */
function resolveCharShapes(chars: ParsedChar[], pairs: [number, number][], di: DocInfo): SpanNode[] {
  if (chars.length === 0) return [buildSpan('')];

  const fallbackId = pairs.length > 0 ? pairs[0][1] : 0;

  const shapeAt = (pos: number): number => {
    let found = fallbackId;
    for (const [startPos, shapeId] of pairs) {
      if (startPos > pos) break;
      found = shapeId;
    }
    return found;
  };

  const spans: SpanNode[] = [];
  let runId = shapeAt(chars[0].pos);
  let runText = '';

  chars.forEach((c, n) => {
    if (n > 0) {
      const id = shapeAt(c.pos);
      if (id !== runId) {
        // Shape changed: close the current run and start a new one.
        spans.push(styledSpan(runText, runId, di));
        runText = '';
        runId = id;
      }
    }
    runText += c.ch;
  });

  if (runText) spans.push(styledSpan(runText, runId, di));
  return spans;
}
419
+
420
/**
 * Creates a span for `text`, styled from char shape `shapeId`.
 * An unknown shape id yields an unstyled span. Only attributes that are set
 * are written to the props; black text colour is left implicit.
 */
function styledSpan(text: string, shapeId: number, di: DocInfo): SpanNode {
  const shape = di.charShapes[shapeId];
  if (!shape) return buildSpan(text);

  const props: TextProps = {};

  // The first face id selects the font name.
  const faceId = shape.faceIds[0] ?? 0;
  const faceName = faceId < di.faceNames.length ? di.faceNames[faceId] : undefined;
  if (faceName) props.font = safeFont(faceName);

  if (shape.height > 0) props.pt = Metric.hwpToPt(shape.height);

  if (shape.bold) { props.b = true; }
  if (shape.italic) { props.i = true; }
  if (shape.underline) { props.u = true; }
  if (shape.strikeout) { props.s = true; }
  if (shape.superscript) { props.sup = true; }
  if (shape.subscript) { props.sub = true; }

  const colour = safeHex(shape.textColor);
  if (colour && colour !== '000000') props.color = colour;

  return buildSpan(text, props);
}
440
+
441
+ /* ── Table control parsing ──────────────────────────────────── */
442
+
443
/**
 * Parses a table control (the CTRL_HEADER at `ctrlIdx`) into a grid node.
 *
 * Steps: collect the TABLE record and the cell records one level below the
 * control, decode each cell, validate cell positions (falling back to a
 * sequential row-major layout when any is invalid), derive column widths and
 * finally build rows, cells and grid properties.
 *
 * @param recs     All records of the section.
 * @param ctrlIdx  Index of the table's CTRL_HEADER record.
 * @param di       DocInfo lookup tables.
 * @param shield   Guard used to isolate failures of individual cells.
 * @returns The grid (or null when no TABLE record or no cell was found) and the
 *          index of the first record after the control's subtree.
 */
function parseTableCtrl(
  recs: HwpRecord[], ctrlIdx: number, di: DocInfo, shield: ShieldedParser,
): { grid: ContentNode | null; next: number } {
  const ctrlLv = recs[ctrlIdx].level;
  const tblLevel = ctrlLv + 1; // TABLE and cell records sit one level below the control
  let i = ctrlIdx + 1;

  let tblData: Uint8Array | null = null;
  const cells: { data: Uint8Array; tag: number; cStart: number; cEnd: number }[] = [];

  // Collect TABLE and cell records within this control's scope
  while (i < recs.length && recs[i].level > ctrlLv) {
    const r = recs[i];

    if (isTableTag(r.tag) && r.level === tblLevel) {
      tblData = r.data;
      i++;
    } else if (r.tag === TAG_LIST_HEADER && r.level === tblLevel) {
      // LIST_HEADER as cell: paraCount tells how many paragraphs follow
      const cellData = r.data;
      const paraCount = cellData.length >= 2 ? BinaryKit.readU16LE(cellData, 0) : 0;
      i++;
      const cStart = i;
      // Consume exactly paraCount paragraphs (each with its child records)
      let consumed = 0;
      while (i < recs.length && consumed < paraCount) {
        if (recs[i].tag === TAG_PARA_HEADER && recs[i].level === tblLevel) {
          consumed++;
          i++;
          // Skip child records of this paragraph
          while (i < recs.length && recs[i].level > tblLevel) i++;
        } else if (recs[i].level > tblLevel) {
          i++;
        } else {
          break; // hit next sibling at same level
        }
      }
      cells.push({ data: cellData, tag: TAG_LIST_HEADER, cStart, cEnd: i });
    } else if (isCellTag(r.tag) && r.level === tblLevel) {
      // Full CELL record (with cell-specific fields); its content is its subtree
      const cellData = r.data;
      const cellTag = r.tag;
      i++;
      const cStart = i;
      while (i < recs.length && recs[i].level > tblLevel) i++;
      cells.push({ data: cellData, tag: cellTag, cStart, cEnd: i });
    } else {
      i++;
    }
  }

  if (!tblData || cells.length === 0) return { grid: null, next: i };

  const rowCnt = tblData.length >= 6 ? BinaryKit.readU16LE(tblData, 4) : 1;
  // A column count of 0 would turn every `/ colCnt` and `% colCnt` below into
  // Infinity/NaN and silently discard the table, so treat it as a single column.
  const colCnt = Math.max(1, tblData.length >= 8 ? BinaryKit.readU16LE(tblData, 6) : 1);

  interface PC { row: number; col: number; cs: number; rs: number; widthHwp: number; props: CellProps; paras: ParaNode[] }
  const parsed: PC[] = [];

  for (let ci = 0; ci < cells.length; ci++) {
    const c = cells[ci];
    const seqIdx = ci;
    const pc = shield.guard(
      () => parseCellRec(c.data, c.tag, recs, c.cStart, c.cEnd, di, shield, seqIdx, colCnt),
      // Fallback: empty cell at its sequential (row-major) position
      { row: Math.floor(ci / colCnt), col: ci % colCnt, cs: 1, rs: 1, widthHwp: 0, props: {}, paras: [buildPara([buildSpan('')])] },
      `hwp:cell@${c.cStart}`,
    );
    parsed.push(pc);
  }

  // Validate cell positions; fallback to sequential layout if invalid
  const posValid = parsed.every(c => c.row >= 0 && c.col >= 0 && c.col < colCnt);
  if (!posValid) {
    let idx = 0;
    for (const c of parsed) { c.row = Math.floor(idx / colCnt); c.col = idx % colCnt; idx++; }
  }

  // Compute column widths in points from cell widths
  const colWidthsPt: number[] = new Array(colCnt).fill(0);
  // Pass 1: use cells with cs=1 for exact column widths
  for (const c of parsed) {
    if (c.cs === 1 && c.widthHwp > 0) {
      const wPt = Metric.hwpToPt(c.widthHwp);
      if (wPt > colWidthsPt[c.col]) colWidthsPt[c.col] = wPt;
    }
  }
  // Pass 2: for columns still 0, try to derive from multi-span cells
  const zeroColumns = colWidthsPt.filter(w => w === 0).length;
  if (zeroColumns > 0) {
    for (const c of parsed) {
      if (c.cs > 1 && c.widthHwp > 0) {
        // Subtract known column widths from the span
        let known = 0;
        let unknownCols = 0;
        for (let ci = c.col; ci < c.col + c.cs && ci < colCnt; ci++) {
          if (colWidthsPt[ci] > 0) known += colWidthsPt[ci];
          else unknownCols++;
        }
        if (unknownCols > 0) {
          // Share what is left of the span evenly between the unknown columns
          const remaining = Metric.hwpToPt(c.widthHwp) - known;
          const each = remaining > 0 ? remaining / unknownCols : 0;
          for (let ci = c.col; ci < c.col + c.cs && ci < colCnt; ci++) {
            if (colWidthsPt[ci] === 0 && each > 0) colWidthsPt[ci] = each;
          }
        }
      }
    }
  }

  // Group cells by their final row index. Building rows from the cells themselves
  // (instead of looping up to a row count computed before the fallback above)
  // guarantees that no cell is dropped, whatever the TABLE header claims.
  const byRow = new Map<number, PC[]>();
  for (const c of parsed) {
    const bucket = byRow.get(c.row);
    if (bucket) bucket.push(c);
    else byRow.set(c.row, [c]);
  }
  const rowIndices = [...byRow.keys()].sort((a, b) => a - b);
  const rows = rowIndices.map(r => {
    const rc = (byRow.get(r) ?? []).sort((a, b) => a.col - b.col);
    return buildRow(rc.map(c =>
      buildCell(c.paras.length ? c.paras : [buildPara([buildSpan('')])], { cs: c.cs, rs: c.rs, props: c.props }),
    ));
  });
  if (rows.length === 0) return { grid: null, next: i };

  // Table-level default stroke: the border-fill id follows the per-row size
  // array, which starts at offset 18 and holds one UINT16 per declared row.
  let defStroke: Stroke | undefined;
  const bfOff = 18 + rowCnt * 2;
  if (tblData.length >= bfOff + 2) {
    const bfId = BinaryKit.readU16LE(tblData, bfOff);
    defStroke = strokeFromBF(bfId, di);
  }

  const gp: GridProps = {};
  if (defStroke) gp.defaultStroke = defStroke;
  const hasWidths = colWidthsPt.some(w => w > 0);
  if (hasWidths) gp.colWidths = colWidthsPt;
  return { grid: buildGrid(rows, gp), next: i };
}
581
+
582
+ /* ── Cell record ────────────────────────────────────────────── */
583
/* LIST_HEADER for cells (HWP 5.0/5.1):
   [0:2]  paraCount      [2:4]  attr (bits 6-7 read as vertAlign)
   [6:2]  unknown        [8:2]  colAddr        [10:2] rowAddr
   [12:2] colSpan        [14:2] rowSpan
   [16:4] width(HWPUNIT) [20:4] height(HWPUNIT)
   [24:8] padding[4]     [32:2] borderFillId */

/** Copies the four edge strokes and the background colour of border fill `bfId` (1-based) into `props`. */
function applyCellBorderFill(props: CellProps, bfId: number, di: DocInfo): void {
  if (!(bfId > 0 && bfId <= di.borderFills.length)) return;
  const bf = di.borderFills[bfId - 1];
  if (bf.borders.length >= 4) {
    props.left = toStroke(bf.borders[0]);
    props.right = toStroke(bf.borders[1]);
    props.top = toStroke(bf.borders[2]);
    props.bot = toStroke(bf.borders[3]);
  }
  // White is the implicit background, so it is not recorded.
  if (bf.bgColor && bf.bgColor !== 'FFFFFF') props.bg = bf.bgColor;
}

/** Parses the paragraph whose PARA_HEADER is at `at`, reading no further than `cEnd`. */
function parseCellParagraph(
  recs: HwpRecord[], at: number, cEnd: number, di: DocInfo,
): { para: ParaNode; next: number } {
  const hdr = recs[at];
  const lv = hdr.level;
  const psId = hdr.data.length >= 10 ? BinaryKit.readU16LE(hdr.data, 8) : 0;
  const ps = di.paraShapes[psId];
  let txt: ParaTextResult | null = null;
  let csp: [number, number][] = [];
  let j = at + 1;
  // NOTE(review): text / char-shape records are accepted at any depth below the
  // header, so records of nested paragraphs overwrite the outer ones — confirm intended.
  while (j < cEnd && recs[j].level > lv) {
    const r = recs[j];
    if (r.tag === TAG_PARA_TEXT) txt = decodeParaText(r.data);
    else if (r.tag === TAG_PARA_CHAR_SHAPE) csp = parseCharShapePairs(r.data);
    j++;
  }
  const spans = txt && txt.chars.length > 0 ? resolveCharShapes(txt.chars, csp, di) : [buildSpan('')];
  return { para: buildPara(spans, buildParaProps(ps)), next: j };
}

/**
 * Decodes one table cell: position, spans, width, border/background props and
 * the paragraphs found in `recs[cStart .. cEnd)`.
 *
 * @param d       Payload of the cell's LIST_HEADER (or CELL) record.
 * @param tag     Tag of that record; selects which field layout is used.
 * @param seqIdx  Zero-based index of the cell within the table, used for a
 *                row-major position when the record is too short to carry one.
 * @param colCnt  Column count of the table (0 is treated as 1).
 * @returns Cell description; `paras` always contains at least one paragraph.
 */
function parseCellRec(
  d: Uint8Array, tag: number, recs: HwpRecord[], cStart: number, cEnd: number,
  di: DocInfo, shield: ShieldedParser, seqIdx: number, colCnt: number,
) {
  const props: CellProps = {};

  // NOTE(review): vertical alignment is taken from bits 6-7 of the UINT32 at
  // offset 2; other readers place the flags at offset 4 / bits 5-6 — confirm.
  const attr = d.length >= 6 ? BinaryKit.readU32LE(d, 2) : 0;
  const va = (attr >> 6) & 0x3;
  if (va === 1) props.va = 'mid';
  else if (va === 2) props.va = 'bot';

  // Default position: sequential, row-major. Overridden below when the record carries one.
  const cols = colCnt || 1;
  let row = Math.floor(seqIdx / cols);
  let col = seqIdx % cols;
  let cs = 1, rs = 1;
  let widthHwp = 0;

  if (tag === TAG_LIST_HEADER) {
    if (d.length >= 22) {
      // LIST_HEADER with cell-specific fields, in the order:
      // colAddr, rowAddr, colSpan, rowSpan, width.
      col = BinaryKit.readU16LE(d, 8);
      row = BinaryKit.readU16LE(d, 10);
      cs = Math.max(1, BinaryKit.readU16LE(d, 12));
      rs = Math.max(1, BinaryKit.readU16LE(d, 14));
      widthHwp = BinaryKit.readU32LE(d, 16);
      applyCellBorderFill(props, d.length >= 34 ? BinaryKit.readU16LE(d, 32) : 0, di);
    }
    // else: LIST_HEADER too short, keep the sequential position
  } else {
    // Full CELL record with position/span/borderFill (same field order, 2 bytes earlier)
    if (d.length >= 8) col = BinaryKit.readU16LE(d, 6);
    if (d.length >= 10) row = BinaryKit.readU16LE(d, 8);
    cs = d.length >= 12 ? Math.max(1, BinaryKit.readU16LE(d, 10)) : 1;
    rs = d.length >= 14 ? Math.max(1, BinaryKit.readU16LE(d, 12)) : 1;
    widthHwp = d.length >= 18 ? BinaryKit.readU32LE(d, 14) : 0;
    applyCellBorderFill(props, d.length >= 32 ? BinaryKit.readU16LE(d, 30) : 0, di);
  }

  // Parse cell content paragraphs; headers may appear at various nesting levels
  const paras: ParaNode[] = [];
  let k = cStart;
  while (k < cEnd) {
    if (recs[k].tag !== TAG_PARA_HEADER) {
      k++;
      continue;
    }
    const at = k;
    const r = shield.guard(
      () => parseCellParagraph(recs, at, cEnd, di),
      { para: buildPara([buildSpan('')]), next: at + 1 },
      `hwp:cellP@${at}`,
    );
    paras.push(r.para);
    k = r.next;
  }

  return { row, col, cs, rs, props, widthHwp, paras: paras.length ? paras : [buildPara([buildSpan('')])] };
}
681
+
682
+ /* ── PAGE_DEF ───────────────────────────────────────────────── */
683
+ /* [0:4] width [4:4] height [8:4] ml [12:4] mr
684
+ [16:4] mt [20:4] mb [36:4] attr (bit0=landscape) */
685
+
686
/**
 * Decodes a PAGE_DEF record into page dimensions in points.
 * Six UINT32 values (width, height, left, right, top, bottom margin) are read
 * from offset 0; bit 0 of the UINT32 at offset 36 selects landscape.
 * A record shorter than 24 bytes yields the A4 default.
 */
function parsePageDef(d: Uint8Array): PageDims {
  if (d.length < 24) return A4;

  const [w, h, ml, mr, mt, mb] = [0, 4, 8, 12, 16, 20].map(off => BinaryKit.readU32LE(d, off));
  const attr = d.length >= 40 ? BinaryKit.readU32LE(d, 36) : 0;
  const landscape = (attr & 1) !== 0;

  return {
    wPt: Metric.hwpToPt(w),
    hPt: Metric.hwpToPt(h),
    ml: Metric.hwpToPt(ml),
    mr: Metric.hwpToPt(mr),
    mt: Metric.hwpToPt(mt),
    mb: Metric.hwpToPt(mb),
    orient: landscape ? 'landscape' : 'portrait',
  };
}
702
+
703
+ /* ═══════════════════════════════════════════════════════════════
704
+ Helpers
705
+ ═══════════════════════════════════════════════════════════════ */
706
+
707
/** Reads the little-endian 32-bit value at offset `o` and reinterprets it as signed. */
function i32(d: Uint8Array, o: number): number {
  const raw = BinaryKit.readU32LE(d, o);
  if (raw <= 0x7FFFFFFF) return raw;
  return raw - 0x100000000; // upper half of the unsigned range maps to negatives
}
711
+
712
/**
 * Formats the three bytes at offset `o` (first byte most significant) as an
 * upper-case six-digit hex string. Returns '000000' when fewer than three
 * bytes are available.
 */
function colorRef(d: Uint8Array, o: number): string {
  if (d.length < o + 3) return '000000';
  const hi = d[o] << 16;
  const mid = d[o + 1] << 8;
  const lo = d[o + 2];
  const packed = hi | mid | lo;
  return packed.toString(16).toUpperCase().padStart(6, '0');
}
716
+
717
/** Converts a decoded border line into a model stroke; unknown border types become 'solid'. */
function toStroke(b: { type: number; widthPt: number; color: string }): Stroke {
  const { type, widthPt, color } = b;
  const kind = BORDER_KIND[type] ?? 'solid';
  return { kind, pt: widthPt, color };
}
720
+
721
/**
 * Returns the stroke of the first border of border fill `bfId` (1-based), or
 * undefined when the id is out of range or the border fill has no borders.
 */
function strokeFromBF(bfId: number, di: DocInfo): Stroke | undefined {
  if (bfId <= 0 || bfId > di.borderFills.length) return undefined;

  const { borders } = di.borderFills[bfId - 1];
  if (borders.length === 0) return undefined;

  const first = borders[0];
  const kind = BORDER_KIND[first.type] ?? 'solid';
  return { kind, pt: first.widthPt, color: first.color };
}
728
+
729
/**
 * Translates a decoded paragraph shape into model paragraph props.
 * Defaults are left out: left alignment, non-positive spacing or indent, and a
 * line spacing of 160 (or any non-positive value) produce no property.
 */
function buildParaProps(ps?: HwpParaShape): ParaProps {
  const out: ParaProps = {};
  if (!ps) return out;

  const { align, spaceBefore, spaceAfter, lineSpacing, indent } = ps;

  if (align && align !== 'left') {
    out.align = align;
  }
  if (spaceBefore > 0) {
    out.spaceBefore = Metric.hwpToPt(spaceBefore);
  }
  if (spaceAfter > 0) {
    out.spaceAfter = Metric.hwpToPt(spaceAfter);
  }
  if (lineSpacing > 0 && lineSpacing !== 160) {
    out.lineHeight = lineSpacing / 100;
  }
  if (indent > 0) {
    out.indentPt = Metric.hwpToPt(indent);
  }
  return out;
}
739
+
740
+ /* ═══════════════════════════════════════════════════════════════
741
+ Decoder class
742
+ ═══════════════════════════════════════════════════════════════ */
743
+
744
/**
 * Decoder for HWP 5.0 binary documents (OLE2 compound files).
 *
 * Reads the FileHeader flags, the DocInfo tables and then the body sections in
 * order, producing a document with a single sheet. Failures inside individual
 * parse steps are absorbed by a ShieldedParser and reported as warnings.
 */
export class HwpScanner implements Decoder {
  readonly format = 'hwp';

  /**
   * Decodes an HWP file.
   *
   * @param data  Complete file contents.
   * @returns A successful outcome with the document and collected warnings, or
   *          a failure for a non-OLE2 input, an encrypted file, or an
   *          unexpected error outside the guarded parse steps.
   */
  async decode(data: Uint8Array): Promise<Outcome<DocRoot>> {
    const shield = new ShieldedParser();
    const warns: string[] = [];

    try {
      if (!BinaryKit.isOle2(data)) return fail('HWP: Invalid OLE2 signature');
      const streams = BinaryKit.parseCfb(data);

      // FileHeader (a missing header is treated as compressed, not encrypted)
      const fh = streams.get('FileHeader');
      const { compressed, encrypted } = fh ? parseFileHeader(fh) : { compressed: true, encrypted: false };
      if (encrypted) return fail('HWP: 암호화된 파일은 지원하지 않습니다');

      // DocInfo
      const diRaw = streams.get('DocInfo');
      let di: DocInfo = { faceNames: [], charShapes: [], paraShapes: [], borderFills: [] };
      if (diRaw) {
        di = shield.guard(() => parseDocInfo(diRaw, compressed), di, 'hwp:docInfo');
      }

      // Body sections
      const MAX_SECTIONS = 100;
      const allContent: ContentNode[] = [];
      let pageDims: PageDims = A4;

      // Parses one section under the shield so that a broken section costs only
      // its own content; the page definition of the latest section wins.
      const absorb = (bytes: Uint8Array, label: string): void => {
        const r = shield.guard(
          () => parseBody(bytes, compressed, di, shield),
          { content: [] as ContentNode[], pageDims: undefined as PageDims | undefined },
          label,
        );
        allContent.push(...r.content);
        if (r.pageDims) pageDims = r.pageDims;
      };

      for (let s = 0; s < MAX_SECTIONS; s++) {
        const sec = streams.get(`BodyText/Section${s}`) ?? streams.get(`Section${s}`);
        if (!sec) {
          // No conventionally named first section: fall back to any stream that
          // looks like a section. This path is guarded like the regular one.
          if (s === 0) {
            const fb = findBodySection(streams);
            if (fb) absorb(fb, 'hwp:sec0');
          }
          break;
        }
        absorb(sec, `hwp:sec${s}`);
      }

      warns.push(...shield.flush());
      const content = allContent.length > 0 ? allContent : [buildPara([buildSpan('')])];
      return succeed(buildRoot({}, [buildSheet(content, pageDims)]), warns);
    } catch (e: unknown) {
      warns.push(...shield.flush());
      // Prefer a `message` property when the thrown value has one, else stringify it.
      const detail = (e as { message?: unknown } | null | undefined)?.message ?? String(e);
      return fail(`HWP decode error: ${detail}`, warns);
    }
  }
}
802
+
803
/**
 * Returns the first stream (in map iteration order) whose name contains
 * 'Section' but neither 'Header' nor 'Info', or undefined when there is none.
 */
function findBodySection(streams: Map<string, Uint8Array>): Uint8Array | undefined {
  const excluded = ['Header', 'Info'];
  for (const [name, bytes] of streams.entries()) {
    if (!name.includes('Section')) continue;
    if (excluded.some(part => name.includes(part))) continue;
    return bytes;
  }
  return undefined;
}
808
+
809
+ registry.registerDecoder(new HwpScanner());