hwpkit-dev 0.0.1 → 0.0.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.npmignore +4 -1
- package/README.md +39 -2
- package/dist/index.d.mts +74 -16
- package/dist/index.d.ts +70 -16
- package/dist/index.js +4985 -698
- package/dist/index.js.map +1 -1
- package/dist/index.mjs +4981 -698
- package/dist/index.mjs.map +1 -1
- package/package.json +4 -1
- package/playground/index.html +346 -0
- package/playground/main.ts +302 -0
- package/playground/vite.config.ts +16 -0
- package/src/contract/decoder.ts +1 -0
- package/src/contract/encoder.ts +6 -1
- package/src/core/BaseDecoder.ts +118 -0
- package/src/core/BaseEncoder.ts +146 -0
- package/src/decoders/docx/DocxDecoder.ts +867 -150
- package/src/decoders/html/HtmlDecoder.ts +366 -0
- package/src/decoders/hwp/HwpScanner.ts +477 -88
- package/src/decoders/hwpx/HwpxDecoder.ts +789 -293
- package/src/decoders/md/MdDecoder.ts +4 -4
- package/src/encoders/docx/DocxEncoder.ts +600 -295
- package/src/encoders/html/HtmlEncoder.ts +203 -0
- package/src/encoders/hwp/HwpEncoder.ts +1647 -398
- package/src/encoders/hwpx/HwpxEncoder.ts +1512 -444
- package/src/encoders/hwpx/constants.ts +148 -0
- package/src/encoders/hwpx/utils.ts +198 -0
- package/src/encoders/md/MdEncoder.ts +117 -30
- package/src/index.ts +1 -0
- package/src/model/builders.ts +8 -6
- package/src/model/doc-props.ts +19 -5
- package/src/model/doc-tree.ts +13 -5
- package/src/pipeline/Pipeline.ts +21 -4
- package/src/pipeline/registry.ts +13 -2
- package/src/safety/StyleBridge.ts +52 -7
- package/src/toolkit/ArchiveKit.ts +56 -0
- package/src/toolkit/StyleMapper.ts +221 -0
- package/src/toolkit/UnitConverter.ts +138 -0
- package/src/toolkit/XmlKit.ts +0 -5
- package/test-styling.ts +210 -0
|
@@ -1,11 +1,12 @@
|
|
|
1
1
|
import type { Decoder } from '../../contract/decoder';
|
|
2
|
-
import type { DocRoot, ContentNode, ParaNode, SpanNode } from '../../model/doc-tree';
|
|
2
|
+
import type { DocRoot, ContentNode, ParaNode, SpanNode, ImgNode, GridNode } from '../../model/doc-tree';
|
|
3
3
|
import type { Outcome } from '../../contract/result';
|
|
4
4
|
import type { Align, Stroke, StrokeKind, PageDims, TextProps, ParaProps, CellProps, GridProps } from '../../model/doc-props';
|
|
5
5
|
import { succeed, fail } from '../../contract/result';
|
|
6
|
-
import { buildRoot, buildSheet, buildPara, buildSpan, buildGrid, buildRow, buildCell } from '../../model/builders';
|
|
6
|
+
import { buildRoot, buildSheet, buildPara, buildSpan, buildGrid, buildRow, buildCell, buildImg } from '../../model/builders';
|
|
7
7
|
import { ShieldedParser } from '../../safety/ShieldedParser';
|
|
8
8
|
import { BinaryKit } from '../../toolkit/BinaryKit';
|
|
9
|
+
import { TextKit } from '../../toolkit/TextKit';
|
|
9
10
|
import { Metric, safeHex, safeFont } from '../../safety/StyleBridge';
|
|
10
11
|
import { registry } from '../../pipeline/registry';
|
|
11
12
|
import { A4 } from '../../model/doc-props';
|
|
@@ -37,8 +38,12 @@ const TAG_CELL_B = HWPTAG_BEGIN + 65; // 81
|
|
|
37
38
|
function isTableTag(t: number) { return t === TAG_TABLE_A || t === TAG_TABLE_B; }
|
|
38
39
|
function isCellTag(t: number) { return t === TAG_CELL_A || t === TAG_CELL_B || t === TAG_LIST_HEADER; }
|
|
39
40
|
|
|
40
|
-
// CTRL_HEADER ctrlId
|
|
41
|
-
const CTRL_TABLE = 0x74626C20;
|
|
41
|
+
// CTRL_HEADER ctrlId values (UINT32-LE as ASCII)
|
|
42
|
+
const CTRL_TABLE = 0x74626C20; // ' lbt' = 표(table)
|
|
43
|
+
const CTRL_IMAGE = 0x696D6720; // 'img '
|
|
44
|
+
const CTRL_OBJ = 0x6F626A20; // 'obj '
|
|
45
|
+
const CTRL_FIG = 0x66696720; // 'fig '
|
|
46
|
+
const CTRL_GSO = 0x67736F20; // 'gso ' = 그리기 객체 (drawing object, contains embedded images)
|
|
42
47
|
|
|
43
48
|
/* ═══════════════════════════════════════════════════════════════
|
|
44
49
|
Types
|
|
@@ -67,6 +72,7 @@ interface HwpParaShape {
|
|
|
67
72
|
spaceBefore: number;
|
|
68
73
|
spaceAfter: number;
|
|
69
74
|
lineSpacing: number;
|
|
75
|
+
leftMargin: number;
|
|
70
76
|
indent: number;
|
|
71
77
|
}
|
|
72
78
|
|
|
@@ -83,7 +89,14 @@ interface DocInfo {
|
|
|
83
89
|
}
|
|
84
90
|
|
|
85
91
|
interface ParsedChar { pos: number; ch: string }
|
|
86
|
-
interface
|
|
92
|
+
interface ParsedCtrl { pos: number; ctrlId: number; objId: number; matched: boolean }
|
|
93
|
+
interface ParaTextResult { chars: ParsedChar[]; controls: ParsedCtrl[] }
|
|
94
|
+
|
|
95
|
+
interface OleObject {
|
|
96
|
+
id: number;
|
|
97
|
+
data: Uint8Array;
|
|
98
|
+
mimeType: string;
|
|
99
|
+
}
|
|
87
100
|
|
|
88
101
|
/* ═══════════════════════════════════════════════════════════════
|
|
89
102
|
Low-level record parsing
|
|
@@ -111,7 +124,9 @@ function parseRecords(data: Uint8Array): HwpRecord[] {
|
|
|
111
124
|
}
|
|
112
125
|
|
|
113
126
|
function tryInflate(data: Uint8Array): Uint8Array {
|
|
114
|
-
try { return pako.
|
|
127
|
+
try { return pako.inflate(data); } catch {
|
|
128
|
+
try { return pako.inflateRaw(data); } catch { return data; }
|
|
129
|
+
}
|
|
115
130
|
}
|
|
116
131
|
|
|
117
132
|
/* ═══════════════════════════════════════════════════════════════
|
|
@@ -209,11 +224,12 @@ function parseCharShape(d: Uint8Array): HwpCharShape {
|
|
|
209
224
|
const ALIGN_TBL: Record<number, Align> = { 0: 'justify', 1: 'left', 2: 'right', 3: 'center', 4: 'justify' };
|
|
210
225
|
|
|
211
226
|
function parseParaShape(d: Uint8Array): HwpParaShape {
|
|
212
|
-
if (d.length < 4) return { align: 'left', spaceBefore: 0, spaceAfter: 0, lineSpacing: 160, indent: 0 };
|
|
227
|
+
if (d.length < 4) return { align: 'left', spaceBefore: 0, spaceAfter: 0, lineSpacing: 160, leftMargin: 0, indent: 0 };
|
|
213
228
|
const attr = BinaryKit.readU32LE(d, 0);
|
|
214
229
|
return {
|
|
215
|
-
align: ALIGN_TBL[attr & 0x7] ?? 'left',
|
|
216
|
-
|
|
230
|
+
align: ALIGN_TBL[(attr >> 2) & 0x7] ?? 'left',
|
|
231
|
+
leftMargin: d.length >= 8 ? i32(d, 4) : 0, // offset 4: leftMargin (들여쓰기)
|
|
232
|
+
indent: d.length >= 16 ? i32(d, 12) : 0, // offset 12: first-line indent
|
|
217
233
|
spaceBefore: d.length >= 20 ? i32(d, 16) : 0,
|
|
218
234
|
spaceAfter: d.length >= 24 ? i32(d, 20) : 0,
|
|
219
235
|
lineSpacing: d.length >= 28 ? i32(d, 24) : 160,
|
|
@@ -228,20 +244,30 @@ function parseParaShape(d: Uint8Array): HwpParaShape {
|
|
|
228
244
|
[36:4] faceColor (bgColor for solid fill) */
|
|
229
245
|
|
|
230
246
|
const BORDER_W_PT = [0.28, 0.34, 0.43, 0.57, 0.71, 0.85, 1.13, 1.42, 1.70, 1.98, 2.84, 4.25, 5.67, 8.50, 11.34, 14.17];
|
|
231
|
-
const BORDER_KIND: Record<number, StrokeKind> = { 0:'
|
|
247
|
+
const BORDER_KIND: Record<number, StrokeKind> = { 0:'solid',1:'dash',2:'dash',3:'dot',4:'dash',5:'dash',6:'dash',7:'double',8:'double',9:'double',10:'none' };
|
|
232
248
|
|
|
233
249
|
function parseBorderFill(d: Uint8Array): HwpBorderFill {
|
|
250
|
+
// Spec grouped format (표 23):
|
|
251
|
+
// [0:2] attr
|
|
252
|
+
// [2:4] 4 border types (left, right, top, bottom) — 1 byte each
|
|
253
|
+
// [6:4] 4 border widths (left, right, top, bottom) — 1 byte each (index into BORDER_W_PT)
|
|
254
|
+
// [10:16] 4 border colors (left, right, top, bottom) — 4 bytes each (COLORREF)
|
|
255
|
+
// [26:3] diagonal: type(1) + width(1) + color(4) = 6 bytes actually [26:6]
|
|
256
|
+
// [32:4] fillType
|
|
257
|
+
// [36:4] faceColor (bgColor for solid fill)
|
|
234
258
|
const borders: HwpBorderFill['borders'] = [];
|
|
259
|
+
const BASE_TYPE = 2; // 4 type bytes
|
|
260
|
+
const BASE_WIDTH = 6; // 4 width bytes
|
|
261
|
+
const BASE_COLOR = 10; // 4 × 4-byte colors
|
|
235
262
|
for (let i = 0; i < 4; i++) {
|
|
236
|
-
const
|
|
237
|
-
|
|
238
|
-
|
|
239
|
-
|
|
240
|
-
borders.push({ type: 0, widthPt: 0.5, color: '000000' });
|
|
241
|
-
}
|
|
263
|
+
const type = BASE_TYPE + i < d.length ? d[BASE_TYPE + i] : 0;
|
|
264
|
+
const widthPt = BASE_WIDTH + i < d.length ? (BORDER_W_PT[d[BASE_WIDTH + i]] ?? 0.5) : 0.5;
|
|
265
|
+
const color = BASE_COLOR + i * 4 + 4 <= d.length ? colorRef(d, BASE_COLOR + i * 4) : '000000';
|
|
266
|
+
borders.push({ type, widthPt, color });
|
|
242
267
|
}
|
|
243
268
|
let bgColor: string | undefined;
|
|
244
|
-
|
|
269
|
+
// after attr(2) + 4 types(4) + 4 widths(4) + 4 colors(16) + diagonal(6) = offset 32
|
|
270
|
+
const fOff = 32;
|
|
245
271
|
if (d.length >= fOff + 8) {
|
|
246
272
|
const ft = BinaryKit.readU32LE(d, fOff);
|
|
247
273
|
if (ft & 1) bgColor = colorRef(d, fOff + 4);
|
|
@@ -253,21 +279,33 @@ function parseBorderFill(d: Uint8Array): HwpBorderFill {
|
|
|
253
279
|
Body section parsing
|
|
254
280
|
═══════════════════════════════════════════════════════════════ */
|
|
255
281
|
|
|
282
|
+
// gsoCtx: shared mutable counter for 'gso ' drawing objects.
|
|
283
|
+
// Each 'gso ' CTRL_HEADER encountered increments this counter.
|
|
284
|
+
// objectMap is keyed by 0-based gso order = sequential BinData insertion order.
|
|
285
|
+
interface GsoCtx { count: number }
|
|
286
|
+
|
|
256
287
|
function parseBody(
|
|
257
|
-
raw: Uint8Array, compressed: boolean, di: DocInfo, shield: ShieldedParser,
|
|
288
|
+
raw: Uint8Array, compressed: boolean, di: DocInfo, shield: ShieldedParser, gsoCtx: GsoCtx,
|
|
258
289
|
): { content: ContentNode[]; pageDims?: PageDims } {
|
|
259
290
|
const recs = parseRecords(compressed ? tryInflate(raw) : raw);
|
|
260
291
|
const content: ContentNode[] = [];
|
|
261
292
|
let pageDims: PageDims | undefined;
|
|
262
|
-
let i = 0;
|
|
263
293
|
|
|
294
|
+
// Pre-scan for PAGE_DEF at any nesting level (real HWP stores it at level 2 inside section ctrl)
|
|
295
|
+
for (const r of recs) {
|
|
296
|
+
if (r.tag === TAG_PAGE_DEF) {
|
|
297
|
+
pageDims = shield.guard(() => parsePageDef(r.data), A4, 'hwp:pageDef');
|
|
298
|
+
break;
|
|
299
|
+
}
|
|
300
|
+
}
|
|
301
|
+
|
|
302
|
+
let i = 0;
|
|
264
303
|
while (i < recs.length) {
|
|
265
304
|
if (recs[i].tag === TAG_PAGE_DEF) {
|
|
266
|
-
|
|
267
|
-
i++;
|
|
305
|
+
i++; // already handled above; skip at top level
|
|
268
306
|
} else if (recs[i].tag === TAG_PARA_HEADER) {
|
|
269
307
|
const r = shield.guard(
|
|
270
|
-
() => parseParagraphGroup(recs, i, di, shield),
|
|
308
|
+
() => parseParagraphGroup(recs, i, di, shield, gsoCtx),
|
|
271
309
|
{ nodes: [] as ContentNode[], next: i + 1 },
|
|
272
310
|
`hwp:para@${i}`,
|
|
273
311
|
);
|
|
@@ -283,7 +321,7 @@ function parseBody(
|
|
|
283
321
|
/* ── Paragraph group ────────────────────────────────────────── */
|
|
284
322
|
|
|
285
323
|
function parseParagraphGroup(
|
|
286
|
-
recs: HwpRecord[], start: number, di: DocInfo, shield: ShieldedParser,
|
|
324
|
+
recs: HwpRecord[], start: number, di: DocInfo, shield: ShieldedParser, gsoCtx: GsoCtx,
|
|
287
325
|
): { nodes: ContentNode[]; next: number } {
|
|
288
326
|
const hdr = recs[start];
|
|
289
327
|
const lv = hdr.level;
|
|
@@ -295,6 +333,8 @@ function parseParagraphGroup(
|
|
|
295
333
|
let text: ParaTextResult | null = null;
|
|
296
334
|
let csPairs: [number, number][] = [];
|
|
297
335
|
const grids: ContentNode[] = [];
|
|
336
|
+
// imgId: for 'gso' uses sequential gsoCtx.count; for others uses flags-based objId
|
|
337
|
+
const ctrlHeaders: { ctrlId: number; imgId: number; wPt: number; hPt: number }[] = [];
|
|
298
338
|
let i = start + 1;
|
|
299
339
|
|
|
300
340
|
while (i < recs.length && recs[i].level > lv) {
|
|
@@ -307,14 +347,33 @@ function parseParagraphGroup(
|
|
|
307
347
|
csPairs = parseCharShapePairs(r.data);
|
|
308
348
|
i++;
|
|
309
349
|
} else if (r.tag === TAG_CTRL_HEADER && r.level === lv + 1) {
|
|
310
|
-
if (r.data.length >= 4
|
|
311
|
-
const
|
|
312
|
-
|
|
313
|
-
|
|
314
|
-
|
|
315
|
-
)
|
|
316
|
-
|
|
317
|
-
|
|
350
|
+
if (r.data.length >= 4) {
|
|
351
|
+
const ctrlId = BinaryKit.readU32LE(r.data, 0);
|
|
352
|
+
|
|
353
|
+
// HWP 5.0 general-object layout:
|
|
354
|
+
// [0:4] ctrlId [4:4] flags [8:4] xOff [12:4] yOff
|
|
355
|
+
// [16:4] width(HWPUNIT) [20:4] height(HWPUNIT)
|
|
356
|
+
const MAX_HWP = 1_000_000;
|
|
357
|
+
const rawW = r.data.length >= 24 ? BinaryKit.readU32LE(r.data, 16) : 0;
|
|
358
|
+
const rawH = r.data.length >= 28 ? BinaryKit.readU32LE(r.data, 20) : 0;
|
|
359
|
+
const wPt = rawW > 0 && rawW < MAX_HWP ? Metric.hwpToPt(rawW) : 0;
|
|
360
|
+
const hPt = rawH > 0 && rawH < MAX_HWP ? Metric.hwpToPt(rawH) : 0;
|
|
361
|
+
|
|
362
|
+
// 'gso ' (그리기 객체) uses sequential counter; others use flags-based id
|
|
363
|
+
const imgId = ctrlId === CTRL_GSO ? gsoCtx.count++ : (r.data.length >= 6 ? BinaryKit.readU16LE(r.data, 4) : 0);
|
|
364
|
+
ctrlHeaders.push({ ctrlId, imgId, wPt, hPt });
|
|
365
|
+
|
|
366
|
+
if (ctrlId === CTRL_TABLE) {
|
|
367
|
+
const tr = shield.guard(
|
|
368
|
+
() => parseTableCtrl(recs, i, di, shield, gsoCtx),
|
|
369
|
+
{ grid: null, next: skipKids(recs, i) },
|
|
370
|
+
`hwp:tbl@${i}`,
|
|
371
|
+
);
|
|
372
|
+
if (tr.grid) grids.push(tr.grid);
|
|
373
|
+
i = tr.next;
|
|
374
|
+
} else {
|
|
375
|
+
i = skipKids(recs, i);
|
|
376
|
+
}
|
|
318
377
|
} else {
|
|
319
378
|
i = skipKids(recs, i);
|
|
320
379
|
}
|
|
@@ -325,12 +384,32 @@ function parseParagraphGroup(
|
|
|
325
384
|
|
|
326
385
|
const nodes: ContentNode[] = [];
|
|
327
386
|
|
|
328
|
-
// Build paragraph from text
|
|
329
|
-
if (text && text.chars.length > 0) {
|
|
330
|
-
const
|
|
331
|
-
|
|
387
|
+
// Build paragraph from text and inline controls (images)
|
|
388
|
+
if (text && (text.chars.length > 0 || text.controls.length > 0)) {
|
|
389
|
+
const paraContent: (SpanNode | ContentNode)[] = [];
|
|
390
|
+
|
|
391
|
+
if (text.chars.length > 0) {
|
|
332
392
|
const spans = resolveCharShapes(text.chars, csPairs, di);
|
|
333
|
-
|
|
393
|
+
paraContent.push(...spans);
|
|
394
|
+
}
|
|
395
|
+
|
|
396
|
+
// Image placeholder spans: only for actual image controls.
|
|
397
|
+
// Non-image controls (footnotes, TOC entries, etc.) are silently skipped.
|
|
398
|
+
if (text.controls.length > 0) {
|
|
399
|
+
for (let ci = 0; ci < text.controls.length; ci++) {
|
|
400
|
+
const ch = ctrlHeaders[ci];
|
|
401
|
+
if (!ch) continue; // anchor-only ctrl (gso is sibling, not inline)
|
|
402
|
+
const isImg = ch.ctrlId === CTRL_IMAGE || ch.ctrlId === CTRL_FIG || ch.ctrlId === CTRL_OBJ || ch.ctrlId === CTRL_GSO;
|
|
403
|
+
if (!isImg) continue; // skip footnotes, TOC, page num, etc.
|
|
404
|
+
const dimStr = (ch.wPt > 0 && ch.hPt > 0)
|
|
405
|
+
? `_W${Math.round(ch.wPt)}_H${Math.round(ch.hPt)}`
|
|
406
|
+
: '';
|
|
407
|
+
paraContent.push(buildSpan(`__EXT_${ch.imgId}${dimStr}__`));
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
|
|
411
|
+
if (paraContent.length > 0) {
|
|
412
|
+
nodes.push(buildPara(paraContent as any, buildParaProps(ps)));
|
|
334
413
|
}
|
|
335
414
|
}
|
|
336
415
|
|
|
@@ -354,7 +433,7 @@ const INL_CTRL = new Set([4, 5, 6, 7, 8]);
|
|
|
354
433
|
|
|
355
434
|
function decodeParaText(d: Uint8Array): ParaTextResult {
|
|
356
435
|
const chars: ParsedChar[] = [];
|
|
357
|
-
const
|
|
436
|
+
const controls: ParsedCtrl[] = [];
|
|
358
437
|
let i = 0, pos = 0;
|
|
359
438
|
|
|
360
439
|
while (i + 1 < d.length) {
|
|
@@ -364,8 +443,14 @@ function decodeParaText(d: Uint8Array): ParaTextResult {
|
|
|
364
443
|
if (c === 10) { chars.push({ pos, ch: '\n' }); i += 2; pos++; continue; }
|
|
365
444
|
|
|
366
445
|
if (EXT_CTRL.has(c)) {
|
|
367
|
-
|
|
368
|
-
|
|
446
|
+
// Extended control: 8 WORDs (16 bytes)
|
|
447
|
+
// WORD 4 contains objId (for images, charts, etc.)
|
|
448
|
+
let objId = 0;
|
|
449
|
+
if (i + 16 <= d.length) {
|
|
450
|
+
objId = BinaryKit.readU16LE(d, i + 8); // 4th WORD (offset 8) contains objId
|
|
451
|
+
}
|
|
452
|
+
controls.push({ pos, ctrlId: 0, objId, matched: false });
|
|
453
|
+
i += 16; pos += 8; continue;
|
|
369
454
|
}
|
|
370
455
|
if (INL_CTRL.has(c)) {
|
|
371
456
|
i += 16; pos += 8; continue;
|
|
@@ -379,7 +464,7 @@ function decodeParaText(d: Uint8Array): ParaTextResult {
|
|
|
379
464
|
chars.push({ pos, ch: String.fromCharCode(c) });
|
|
380
465
|
i += 2; pos++;
|
|
381
466
|
}
|
|
382
|
-
return { chars,
|
|
467
|
+
return { chars, controls };
|
|
383
468
|
}
|
|
384
469
|
|
|
385
470
|
/* ── PARA_CHAR_SHAPE ────────────────────────────────────────── */
|
|
@@ -441,7 +526,7 @@ function styledSpan(text: string, shapeId: number, di: DocInfo): SpanNode {
|
|
|
441
526
|
/* ── Table control parsing ──────────────────────────────────── */
|
|
442
527
|
|
|
443
528
|
function parseTableCtrl(
|
|
444
|
-
recs: HwpRecord[], ctrlIdx: number, di: DocInfo, shield: ShieldedParser,
|
|
529
|
+
recs: HwpRecord[], ctrlIdx: number, di: DocInfo, shield: ShieldedParser, gsoCtx: GsoCtx,
|
|
445
530
|
): { grid: ContentNode | null; next: number } {
|
|
446
531
|
const ctrlLv = recs[ctrlIdx].level;
|
|
447
532
|
let i = ctrlIdx + 1;
|
|
@@ -497,15 +582,15 @@ function parseTableCtrl(
|
|
|
497
582
|
const rowCnt = tblData.length >= 6 ? BinaryKit.readU16LE(tblData, 4) : 1;
|
|
498
583
|
const colCnt = tblData.length >= 8 ? BinaryKit.readU16LE(tblData, 6) : 1;
|
|
499
584
|
|
|
500
|
-
interface PC { row: number; col: number; cs: number; rs: number; widthHwp: number; props: CellProps;
|
|
585
|
+
interface PC { row: number; col: number; cs: number; rs: number; widthHwp: number; heightHwp?: number; props: CellProps; cellChildren: (ParaNode | GridNode)[] }
|
|
501
586
|
const parsed: PC[] = [];
|
|
502
587
|
|
|
503
588
|
for (let ci = 0; ci < cells.length; ci++) {
|
|
504
589
|
const c = cells[ci];
|
|
505
590
|
const seqIdx = ci;
|
|
506
591
|
const pc = shield.guard(
|
|
507
|
-
() => parseCellRec(c.data, c.tag, recs, c.cStart, c.cEnd, di, shield, seqIdx, colCnt),
|
|
508
|
-
{ row: Math.floor(ci / (colCnt || 1)), col: ci % (colCnt || 1), cs: 1, rs: 1, widthHwp: 0, props: {},
|
|
592
|
+
() => parseCellRec(c.data, c.tag, recs, c.cStart, c.cEnd, di, shield, seqIdx, colCnt, gsoCtx),
|
|
593
|
+
{ row: Math.floor(ci / (colCnt || 1)), col: ci % (colCnt || 1), cs: 1, rs: 1, widthHwp: 0, heightHwp: undefined, props: {}, cellChildren: [buildPara([buildSpan('')])] },
|
|
509
594
|
`hwp:cell@${c.cStart}`,
|
|
510
595
|
);
|
|
511
596
|
parsed.push(pc);
|
|
@@ -532,9 +617,11 @@ function parseTableCtrl(
|
|
|
532
617
|
}
|
|
533
618
|
}
|
|
534
619
|
// Pass 2: for columns still 0, try to derive from multi-span cells
|
|
620
|
+
// Sort by span size ascending so smaller, more precise spans fill widths before larger spans
|
|
535
621
|
const zeroColumns = colWidthsPt.filter(w => w === 0).length;
|
|
536
622
|
if (zeroColumns > 0) {
|
|
537
|
-
|
|
623
|
+
const spanCells = parsed.filter(c => c.cs > 1 && c.widthHwp > 0).sort((a, b) => a.cs - b.cs);
|
|
624
|
+
for (const c of spanCells) {
|
|
538
625
|
if (c.cs > 1 && c.widthHwp > 0) {
|
|
539
626
|
// Subtract known column widths from the span
|
|
540
627
|
let known = 0;
|
|
@@ -554,13 +641,37 @@ function parseTableCtrl(
|
|
|
554
641
|
}
|
|
555
642
|
}
|
|
556
643
|
|
|
644
|
+
// Post-process: clamp near-zero column widths (< 1pt = floating-point artifact) to minimum 1pt
|
|
645
|
+
for (let i = 0; i < colWidthsPt.length; i++) {
|
|
646
|
+
if (colWidthsPt[i] > 0 && colWidthsPt[i] < 1) colWidthsPt[i] = 1;
|
|
647
|
+
}
|
|
648
|
+
|
|
557
649
|
const rows = [];
|
|
558
650
|
for (let r = 0; r < actualRowCnt; r++) {
|
|
559
651
|
const rc = parsed.filter(c => c.row === r).sort((a, b) => a.col - b.col);
|
|
560
652
|
if (rc.length === 0) continue;
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
653
|
+
|
|
654
|
+
// Calculate row height — prefer rs=1 cells (exact per-row height)
|
|
655
|
+
let rowHeightPt: number | undefined = undefined;
|
|
656
|
+
for (const c of rc) {
|
|
657
|
+
if (c.heightHwp && c.heightHwp > 0 && c.rs === 1) {
|
|
658
|
+
const hPt = Metric.hwpToPt(c.heightHwp);
|
|
659
|
+
if (rowHeightPt == null || hPt > rowHeightPt) rowHeightPt = hPt;
|
|
660
|
+
}
|
|
661
|
+
}
|
|
662
|
+
// Fallback: all cells span multiple rows → approximate height per row
|
|
663
|
+
if (rowHeightPt == null) {
|
|
664
|
+
for (const c of rc) {
|
|
665
|
+
if (c.heightHwp && c.heightHwp > 0) {
|
|
666
|
+
const hPt = Metric.hwpToPt(c.heightHwp) / c.rs;
|
|
667
|
+
if (rowHeightPt == null || hPt > rowHeightPt) rowHeightPt = hPt;
|
|
668
|
+
}
|
|
669
|
+
}
|
|
670
|
+
}
|
|
671
|
+
|
|
672
|
+
rows.push(buildRow(rc.map(c => {
|
|
673
|
+
return buildCell(c.cellChildren, { cs: c.cs, rs: c.rs, props: c.props });
|
|
674
|
+
}), rowHeightPt));
|
|
564
675
|
}
|
|
565
676
|
if (rows.length === 0) return { grid: null, next: i };
|
|
566
677
|
|
|
@@ -589,10 +700,11 @@ function parseTableCtrl(
|
|
|
589
700
|
|
|
590
701
|
function parseCellRec(
|
|
591
702
|
d: Uint8Array, tag: number, recs: HwpRecord[], cStart: number, cEnd: number,
|
|
592
|
-
di: DocInfo, shield: ShieldedParser, seqIdx: number, colCnt: number,
|
|
703
|
+
di: DocInfo, shield: ShieldedParser, seqIdx: number, colCnt: number, gsoCtx: GsoCtx,
|
|
593
704
|
) {
|
|
594
705
|
let col: number, row: number, cs = 1, rs = 1;
|
|
595
706
|
let widthHwp = 0;
|
|
707
|
+
let heightHwp = 0;
|
|
596
708
|
const props: CellProps = {};
|
|
597
709
|
|
|
598
710
|
const attr = d.length >= 6 ? BinaryKit.readU32LE(d, 2) : 0;
|
|
@@ -600,57 +712,55 @@ function parseCellRec(
|
|
|
600
712
|
if (va === 1) props.va = 'mid';
|
|
601
713
|
else if (va === 2) props.va = 'bot';
|
|
602
714
|
|
|
715
|
+
const HWP_PAD_LR_DEFAULT = 360;
|
|
716
|
+
const HWP_PAD_TB_DEFAULT = 141;
|
|
717
|
+
|
|
603
718
|
if (tag === TAG_LIST_HEADER && d.length >= 22) {
|
|
604
|
-
// LIST_HEADER with cell-specific fields
|
|
605
|
-
// offset 8: colAddr, offset 10: rowAddr (HWP 5.0 spec)
|
|
606
719
|
col = BinaryKit.readU16LE(d, 8);
|
|
607
720
|
row = BinaryKit.readU16LE(d, 10);
|
|
608
|
-
|
|
609
|
-
|
|
721
|
+
cs = Math.max(1, BinaryKit.readU16LE(d, 12));
|
|
722
|
+
rs = Math.max(1, BinaryKit.readU16LE(d, 14));
|
|
610
723
|
widthHwp = BinaryKit.readU32LE(d, 16);
|
|
611
|
-
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
const
|
|
615
|
-
if (
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
props.bot = toStroke(bf.borders[3]);
|
|
620
|
-
}
|
|
621
|
-
if (bf.bgColor && bf.bgColor !== 'FFFFFF') props.bg = bf.bgColor;
|
|
724
|
+
heightHwp = d.length >= 24 ? BinaryKit.readU32LE(d, 20) : 0;
|
|
725
|
+
if (d.length >= 32) {
|
|
726
|
+
const pL = BinaryKit.readU16LE(d, 24); const pR = BinaryKit.readU16LE(d, 26);
|
|
727
|
+
const pT = BinaryKit.readU16LE(d, 28); const pB = BinaryKit.readU16LE(d, 30);
|
|
728
|
+
if (pL !== HWP_PAD_LR_DEFAULT) props.padL = Metric.hwpToPt(pL);
|
|
729
|
+
if (pR !== HWP_PAD_LR_DEFAULT) props.padR = Metric.hwpToPt(pR);
|
|
730
|
+
if (pT !== HWP_PAD_TB_DEFAULT) props.padT = Metric.hwpToPt(pT);
|
|
731
|
+
if (pB !== HWP_PAD_TB_DEFAULT) props.padB = Metric.hwpToPt(pB);
|
|
622
732
|
}
|
|
733
|
+
const bfId = d.length >= 34 ? BinaryKit.readU16LE(d, 32) : 0;
|
|
734
|
+
if (bfId > 0 && bfId <= di.borderFills.length) applyCellBorderFill(di.borderFills[bfId - 1], props);
|
|
623
735
|
} else if (tag !== TAG_LIST_HEADER) {
|
|
624
|
-
// Full CELL record with position/span/borderFill
|
|
625
736
|
col = d.length >= 8 ? BinaryKit.readU16LE(d, 6) : seqIdx % (colCnt || 1);
|
|
626
737
|
row = d.length >= 10 ? BinaryKit.readU16LE(d, 8) : Math.floor(seqIdx / (colCnt || 1));
|
|
627
738
|
cs = d.length >= 12 ? Math.max(1, BinaryKit.readU16LE(d, 10)) : 1;
|
|
628
739
|
rs = d.length >= 14 ? Math.max(1, BinaryKit.readU16LE(d, 12)) : 1;
|
|
629
740
|
widthHwp = d.length >= 18 ? BinaryKit.readU32LE(d, 14) : 0;
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
const
|
|
634
|
-
if (
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
props.bot = toStroke(bf.borders[3]);
|
|
639
|
-
}
|
|
640
|
-
if (bf.bgColor && bf.bgColor !== 'FFFFFF') props.bg = bf.bgColor;
|
|
741
|
+
heightHwp = d.length >= 22 ? BinaryKit.readU32LE(d, 18) : 0;
|
|
742
|
+
if (d.length >= 30) {
|
|
743
|
+
const pL = BinaryKit.readU16LE(d, 22); const pR = BinaryKit.readU16LE(d, 24);
|
|
744
|
+
const pT = BinaryKit.readU16LE(d, 26); const pB = BinaryKit.readU16LE(d, 28);
|
|
745
|
+
if (pL !== HWP_PAD_LR_DEFAULT) props.padL = Metric.hwpToPt(pL);
|
|
746
|
+
if (pR !== HWP_PAD_LR_DEFAULT) props.padR = Metric.hwpToPt(pR);
|
|
747
|
+
if (pT !== HWP_PAD_TB_DEFAULT) props.padT = Metric.hwpToPt(pT);
|
|
748
|
+
if (pB !== HWP_PAD_TB_DEFAULT) props.padB = Metric.hwpToPt(pB);
|
|
641
749
|
}
|
|
750
|
+
const bfId = d.length >= 32 ? BinaryKit.readU16LE(d, 30) : 0;
|
|
751
|
+
if (bfId > 0 && bfId <= di.borderFills.length) applyCellBorderFill(di.borderFills[bfId - 1], props);
|
|
642
752
|
} else {
|
|
643
|
-
// Fallback: LIST_HEADER too short, compute sequentially
|
|
644
753
|
row = Math.floor(seqIdx / (colCnt || 1));
|
|
645
754
|
col = seqIdx % (colCnt || 1);
|
|
646
755
|
}
|
|
647
756
|
|
|
648
|
-
|
|
649
|
-
const
|
|
757
|
+
const cellChildren: (ParaNode | GridNode)[] = [];
|
|
758
|
+
const MAX_HWP = 1_000_000;
|
|
650
759
|
let k = cStart;
|
|
760
|
+
|
|
651
761
|
while (k < cEnd) {
|
|
652
762
|
if (recs[k].tag === TAG_PARA_HEADER) {
|
|
653
|
-
//
|
|
763
|
+
// Parse paragraph inside cell — also extracts nested tables within the paragraph
|
|
654
764
|
const r = shield.guard(
|
|
655
765
|
() => {
|
|
656
766
|
const hdr = recs[k];
|
|
@@ -659,24 +769,91 @@ function parseCellRec(
|
|
|
659
769
|
const ps = di.paraShapes[psId];
|
|
660
770
|
let txt: ParaTextResult | null = null;
|
|
661
771
|
let csp: [number, number][] = [];
|
|
772
|
+
const ctrlHdrs: { ctrlId: number; imgId: number; wPt: number; hPt: number }[] = [];
|
|
773
|
+
const innerGrids: GridNode[] = [];
|
|
662
774
|
let j = k + 1;
|
|
663
775
|
while (j < cEnd && recs[j].level > lv) {
|
|
664
776
|
if (recs[j].tag === TAG_PARA_TEXT) { txt = decodeParaText(recs[j].data); j++; }
|
|
665
777
|
else if (recs[j].tag === TAG_PARA_CHAR_SHAPE) { csp = parseCharShapePairs(recs[j].data); j++; }
|
|
778
|
+
else if (recs[j].tag === TAG_CTRL_HEADER && recs[j].level === lv + 1) {
|
|
779
|
+
if (recs[j].data.length >= 4) {
|
|
780
|
+
const ctrlId = BinaryKit.readU32LE(recs[j].data, 0);
|
|
781
|
+
if (ctrlId === CTRL_TABLE) {
|
|
782
|
+
// Nested table inside a cell paragraph — recurse into parseTableCtrl
|
|
783
|
+
const nestedTr = shield.guard(
|
|
784
|
+
() => parseTableCtrl(recs, j, di, shield, gsoCtx),
|
|
785
|
+
{ grid: null, next: skipKids(recs, j) },
|
|
786
|
+
`hwp:innerNestedTbl@${j}`,
|
|
787
|
+
);
|
|
788
|
+
if (nestedTr.grid) innerGrids.push(nestedTr.grid as GridNode);
|
|
789
|
+
j = nestedTr.next;
|
|
790
|
+
} else {
|
|
791
|
+
const rawW = recs[j].data.length >= 24 ? BinaryKit.readU32LE(recs[j].data, 16) : 0;
|
|
792
|
+
const rawH = recs[j].data.length >= 28 ? BinaryKit.readU32LE(recs[j].data, 20) : 0;
|
|
793
|
+
const wPt = rawW > 0 && rawW < MAX_HWP ? Metric.hwpToPt(rawW) : 0;
|
|
794
|
+
const hPt = rawH > 0 && rawH < MAX_HWP ? Metric.hwpToPt(rawH) : 0;
|
|
795
|
+
const imgId = ctrlId === CTRL_GSO ? gsoCtx.count++ : (recs[j].data.length >= 6 ? BinaryKit.readU16LE(recs[j].data, 4) : 0);
|
|
796
|
+
ctrlHdrs.push({ ctrlId, imgId, wPt, hPt });
|
|
797
|
+
j = skipKids(recs, j);
|
|
798
|
+
}
|
|
799
|
+
} else {
|
|
800
|
+
j = skipKids(recs, j);
|
|
801
|
+
}
|
|
802
|
+
}
|
|
666
803
|
else j++;
|
|
667
804
|
}
|
|
668
|
-
const
|
|
669
|
-
|
|
805
|
+
const paraContent: (SpanNode | ContentNode)[] = [];
|
|
806
|
+
if (txt && txt.chars.length > 0) paraContent.push(...resolveCharShapes(txt.chars, csp, di));
|
|
807
|
+
if (txt && txt.controls.length > 0) {
|
|
808
|
+
for (let ci = 0; ci < txt.controls.length; ci++) {
|
|
809
|
+
const ch = ctrlHdrs[ci];
|
|
810
|
+
if (!ch) continue;
|
|
811
|
+
const isImg = ch.ctrlId === CTRL_IMAGE || ch.ctrlId === CTRL_FIG || ch.ctrlId === CTRL_OBJ || ch.ctrlId === CTRL_GSO;
|
|
812
|
+
if (!isImg) continue;
|
|
813
|
+
const dimStr = (ch.wPt > 0 && ch.hPt > 0) ? `_W${Math.round(ch.wPt)}_H${Math.round(ch.hPt)}` : '';
|
|
814
|
+
paraContent.push(buildSpan(`__EXT_${ch.imgId}${dimStr}__`));
|
|
815
|
+
}
|
|
816
|
+
}
|
|
817
|
+
const kids = paraContent.length > 0 ? paraContent as any : [buildSpan('')];
|
|
818
|
+
const items: (ParaNode | GridNode)[] = [buildPara(kids, buildParaProps(ps)), ...innerGrids];
|
|
819
|
+
return { items, next: j };
|
|
670
820
|
},
|
|
671
|
-
{
|
|
821
|
+
{ items: [buildPara([buildSpan('')])] as (ParaNode | GridNode)[], next: k + 1 },
|
|
672
822
|
`hwp:cellP@${k}`,
|
|
673
823
|
);
|
|
674
|
-
|
|
824
|
+
cellChildren.push(...r.items);
|
|
675
825
|
k = r.next;
|
|
826
|
+
} else if (recs[k].tag === TAG_CTRL_HEADER && recs[k].data.length >= 4) {
|
|
827
|
+
// CTRL_HEADER at cell level (sibling of PARA_HEADER) — anchored 'gso' images and outer-level nested tables
|
|
828
|
+
const cellCtrlId = BinaryKit.readU32LE(recs[k].data, 0);
|
|
829
|
+
if (cellCtrlId === CTRL_GSO) {
|
|
830
|
+
const gsoId = gsoCtx.count++;
|
|
831
|
+
const rawW = recs[k].data.length >= 24 ? BinaryKit.readU32LE(recs[k].data, 16) : 0;
|
|
832
|
+
const rawH = recs[k].data.length >= 28 ? BinaryKit.readU32LE(recs[k].data, 20) : 0;
|
|
833
|
+
const wPt = rawW > 0 && rawW < MAX_HWP ? Metric.hwpToPt(rawW) : 0;
|
|
834
|
+
const hPt = rawH > 0 && rawH < MAX_HWP ? Metric.hwpToPt(rawH) : 0;
|
|
835
|
+
const dimStr = (wPt > 0 && hPt > 0) ? `_W${Math.round(wPt)}_H${Math.round(hPt)}` : '';
|
|
836
|
+
cellChildren.push(buildPara([buildSpan(`__EXT_${gsoId}${dimStr}__`)]));
|
|
837
|
+
k = skipKids(recs, k);
|
|
838
|
+
} else if (cellCtrlId === CTRL_TABLE) {
|
|
839
|
+
const tr = shield.guard(
|
|
840
|
+
() => parseTableCtrl(recs, k, di, shield, gsoCtx),
|
|
841
|
+
{ grid: null, next: skipKids(recs, k) },
|
|
842
|
+
`hwp:nestedTbl@${k}`,
|
|
843
|
+
);
|
|
844
|
+
if (tr.grid) cellChildren.push(tr.grid as GridNode);
|
|
845
|
+
k = tr.next;
|
|
846
|
+
} else {
|
|
847
|
+
k = skipKids(recs, k);
|
|
848
|
+
}
|
|
676
849
|
} else { k++; }
|
|
677
850
|
}
|
|
678
851
|
|
|
679
|
-
return {
|
|
852
|
+
return {
|
|
853
|
+
row, col, cs, rs, props, widthHwp,
|
|
854
|
+
heightHwp: heightHwp || undefined,
|
|
855
|
+
cellChildren: cellChildren.length ? cellChildren : [buildPara([buildSpan('')])],
|
|
856
|
+
};
|
|
680
857
|
}
|
|
681
858
|
|
|
682
859
|
/* ── PAGE_DEF ───────────────────────────────────────────────── */
|
|
@@ -718,6 +895,18 @@ function toStroke(b: { type: number; widthPt: number; color: string }): Stroke {
|
|
|
718
895
|
return { kind: BORDER_KIND[b.type] ?? 'solid', pt: b.widthPt, color: b.color };
|
|
719
896
|
}
|
|
720
897
|
|
|
898
|
+
// Apply borderFill to CellProps. Preserve explicit NONE so DOCX tcBorders can
|
|
899
|
+
// override the table-level tblBorders. Filtering NONE would let tblBorders bleed through.
|
|
900
|
+
function applyCellBorderFill(bf: HwpBorderFill, props: CellProps): void {
|
|
901
|
+
if (bf.borders.length >= 4) {
|
|
902
|
+
props.left = toStroke(bf.borders[0]);
|
|
903
|
+
props.right = toStroke(bf.borders[1]);
|
|
904
|
+
props.top = toStroke(bf.borders[2]);
|
|
905
|
+
props.bot = toStroke(bf.borders[3]);
|
|
906
|
+
}
|
|
907
|
+
if (bf.bgColor && bf.bgColor !== 'FFFFFF') props.bg = bf.bgColor;
|
|
908
|
+
}
|
|
909
|
+
|
|
721
910
|
function strokeFromBF(bfId: number, di: DocInfo): Stroke | undefined {
|
|
722
911
|
if (bfId <= 0 || bfId > di.borderFills.length) return undefined;
|
|
723
912
|
const bf = di.borderFills[bfId - 1];
|
|
@@ -733,7 +922,11 @@ function buildParaProps(ps?: HwpParaShape): ParaProps {
|
|
|
733
922
|
if (ps.spaceBefore > 0) p.spaceBefore = Metric.hwpToPt(ps.spaceBefore);
|
|
734
923
|
if (ps.spaceAfter > 0) p.spaceAfter = Metric.hwpToPt(ps.spaceAfter);
|
|
735
924
|
if (ps.lineSpacing > 0 && ps.lineSpacing !== 160) p.lineHeight = ps.lineSpacing / 100;
|
|
736
|
-
|
|
925
|
+
// leftMargin (offset 4) = 문단 몸체 왼쪽 여백 → leftMargin (pt), ensure non-negative
|
|
926
|
+
const leftMarginPt = Math.max(0, Metric.hwpToPt(ps.leftMargin));
|
|
927
|
+
if (leftMarginPt > 0) p.leftMargin = leftMarginPt;
|
|
928
|
+
// indent (offset 12) = first-line indent (positive) / hanging indent (negative) → firstLineIndentPt
|
|
929
|
+
if (ps.indent !== 0) p.firstLineIndentPt = Metric.hwpToPt(ps.indent);
|
|
737
930
|
return p;
|
|
738
931
|
}
|
|
739
932
|
|
|
@@ -743,6 +936,7 @@ function buildParaProps(ps?: HwpParaShape): ParaProps {
|
|
|
743
936
|
|
|
744
937
|
export class HwpScanner implements Decoder {
|
|
745
938
|
readonly format = 'hwp';
|
|
939
|
+
readonly aliases = ['application/vnd.hancom.hwp'];
|
|
746
940
|
|
|
747
941
|
async decode(data: Uint8Array): Promise<Outcome<DocRoot>> {
|
|
748
942
|
const shield = new ShieldedParser();
|
|
@@ -764,6 +958,37 @@ export class HwpScanner implements Decoder {
|
|
|
764
958
|
di = shield.guard(() => parseDocInfo(diRaw, compressed), di, 'hwp:docInfo');
|
|
765
959
|
}
|
|
766
960
|
|
|
961
|
+
// Extract images from BinData streams.
|
|
962
|
+
// HWP duplicates each BinData entry: once as "BinData/BIN0001.jpg" and once as "BIN0001.jpg".
|
|
963
|
+
// We keep only the "BinData/" prefixed versions, sort by BIN number, then assign 0-based keys
|
|
964
|
+
// matching the order 'gso' CTRL_HEADER records are encountered during body parsing.
|
|
965
|
+
const binEntries: { binNum: number; data: Uint8Array }[] = [];
|
|
966
|
+
for (const [path, streamData] of streams) {
|
|
967
|
+
// Match "BinData/BIN0001.jpg" style — the canonical form
|
|
968
|
+
const m = path.match(/^BinData[/\\]BIN(\d+)\.\w+$/i);
|
|
969
|
+
if (m) binEntries.push({ binNum: parseInt(m[1], 10), data: streamData });
|
|
970
|
+
}
|
|
971
|
+
// Sort by BIN number (ascending) so BIN0001→idx0, BIN0002→idx1, …
|
|
972
|
+
binEntries.sort((a, b) => a.binNum - b.binNum);
|
|
973
|
+
|
|
974
|
+
const objectMap = new Map<number, ImgNode>();
|
|
975
|
+
for (let idx = 0; idx < binEntries.length; idx++) {
|
|
976
|
+
const { data: imgData } = binEntries[idx];
|
|
977
|
+
|
|
978
|
+
// Determine MIME type from the binary signature; default to JPEG when no known signature matches
|
|
979
|
+
let mimeType: ImgNode['mime'] = 'image/jpeg';
|
|
980
|
+
if (imgData[0] === 0x89 && imgData[1] === 0x50) mimeType = 'image/png';
|
|
981
|
+
else if (imgData[0] === 0x47 && imgData[1] === 0x49) mimeType = 'image/gif';
|
|
982
|
+
else if (imgData[0] === 0x42 && imgData[1] === 0x4D) mimeType = 'image/bmp';
|
|
983
|
+
|
|
984
|
+
const base64 = TextKit.base64Encode(imgData);
|
|
985
|
+
const { wPt, hPt } = getImageDimsPt(imgData, mimeType);
|
|
986
|
+
objectMap.set(idx, buildImg(base64, mimeType, wPt, hPt));
|
|
987
|
+
}
|
|
988
|
+
|
|
989
|
+
// gsoCtx tracks sequential 'gso' encounter order — must be shared across all sections
|
|
990
|
+
const gsoCtx: GsoCtx = { count: 0 };
|
|
991
|
+
|
|
767
992
|
// Body sections
|
|
768
993
|
const allContent: ContentNode[] = [];
|
|
769
994
|
let pageDims: PageDims = A4;
|
|
@@ -774,7 +999,7 @@ export class HwpScanner implements Decoder {
|
|
|
774
999
|
if (s === 0) {
|
|
775
1000
|
const fb = findBodySection(streams);
|
|
776
1001
|
if (fb) {
|
|
777
|
-
const r = parseBody(fb, compressed, di, shield);
|
|
1002
|
+
const r = parseBody(fb, compressed, di, shield, gsoCtx);
|
|
778
1003
|
allContent.push(...r.content);
|
|
779
1004
|
if (r.pageDims) pageDims = r.pageDims;
|
|
780
1005
|
}
|
|
@@ -782,7 +1007,7 @@ export class HwpScanner implements Decoder {
|
|
|
782
1007
|
break;
|
|
783
1008
|
}
|
|
784
1009
|
const r = shield.guard(
|
|
785
|
-
() => parseBody(sec, compressed, di, shield),
|
|
1010
|
+
() => parseBody(sec, compressed, di, shield, gsoCtx),
|
|
786
1011
|
{ content: [], pageDims: undefined },
|
|
787
1012
|
`hwp:sec${s}`,
|
|
788
1013
|
);
|
|
@@ -790,6 +1015,10 @@ export class HwpScanner implements Decoder {
|
|
|
790
1015
|
if (r.pageDims) pageDims = r.pageDims;
|
|
791
1016
|
}
|
|
792
1017
|
|
|
1018
|
+
if (objectMap.size > 0) {
|
|
1019
|
+
injectImagesIntoContent(allContent, objectMap);
|
|
1020
|
+
}
|
|
1021
|
+
|
|
793
1022
|
warns.push(...shield.flush());
|
|
794
1023
|
const content = allContent.length > 0 ? allContent : [buildPara([buildSpan('')])];
|
|
795
1024
|
return succeed(buildRoot({}, [buildSheet(content, pageDims)]), warns);
|
|
@@ -806,4 +1035,164 @@ function findBodySection(streams: Map<string, Uint8Array>): Uint8Array | undefin
|
|
|
806
1035
|
return undefined;
|
|
807
1036
|
}
|
|
808
1037
|
|
|
1038
|
+
/* ═══════════════════════════════════════════════════════════════
|
|
1039
|
+
Image dimension extraction from binary headers
|
|
1040
|
+
════════════════════════════════════════════════════════════ */
|
|
1041
|
+
|
|
1042
|
+
/**
 * Reads an image's pixel dimensions from its binary header and converts them
 * to points, assuming 96 DPI (1 px = 0.75 pt).
 *
 * @param data Raw image bytes.
 * @param mime MIME type selecting the header layout to parse: 'image/png',
 *             'image/jpeg', 'image/bmp' or 'image/gif'. Any other value
 *             yields the fallback.
 * @returns Width and height in points, or `{ wPt: 72, hPt: 72 }` (one inch
 *          square) when the dimensions cannot be determined.
 */
function getImageDimsPt(data: Uint8Array, mime: string): { wPt: number; hPt: number } {
  const fallback = { wPt: 72, hPt: 72 };
  try {
    if (mime === 'image/png' && data.length >= 24) {
      // PNG IHDR: sig(8) + length(4) + type(4) + width(4) + height(4) — all big-endian
      const w = (data[16] << 24 | data[17] << 16 | data[18] << 8 | data[19]) >>> 0;
      const h = (data[20] << 24 | data[21] << 16 | data[22] << 8 | data[23]) >>> 0;
      if (w > 0 && h > 0) return { wPt: w * 0.75, hPt: h * 0.75 }; // 96 DPI → pt
    }
    if (mime === 'image/jpeg') {
      // Walk the marker segments (starting after SOI) looking for a frame header.
      let i = 2;
      while (i + 8 < data.length) {
        if (data[i] !== 0xFF) { i++; continue; }
        const marker = data[i + 1];
        // Markers may be preceded by 0xFF fill bytes; step one byte so the
        // real marker code is examined next instead of a bogus length.
        if (marker === 0xFF) { i++; continue; }
        // TEM, RSTn, SOI and EOI are stand-alone markers with no length field.
        if (marker === 0x01 || (marker >= 0xD0 && marker <= 0xD9)) { i += 2; continue; }
        // SOF0–SOF15 share one frame-header layout; 0xC4 (DHT), 0xC8 (JPG)
        // and 0xCC (DAC) sit in the same range but are not frame headers.
        const isFrameHeader =
          marker >= 0xC0 && marker <= 0xCF &&
          marker !== 0xC4 && marker !== 0xC8 && marker !== 0xCC;
        if (isFrameHeader) {
          // SOF: 2-byte marker + 2-byte length + 1-byte precision + 2-byte height + 2-byte width
          const h = (data[i + 5] << 8 | data[i + 6]) >>> 0;
          const w = (data[i + 7] << 8 | data[i + 8]) >>> 0;
          if (w > 0 && h > 0) return { wPt: w * 0.75, hPt: h * 0.75 };
        }
        // Skip the segment: the length field counts itself but not the marker.
        const segLen = data[i + 2] << 8 | data[i + 3];
        i += 2 + (segLen > 0 ? segLen : 2);
      }
    }
    if (mime === 'image/bmp' && data.length >= 26) {
      // BMP DIB header: width at 18, height at 22 (signed int32 LE; negative = top-down)
      const w = BinaryKit.readU32LE(data, 18);
      const h = Math.abs(BinaryKit.readU32LE(data, 22) | 0);
      if (w > 0 && h > 0) return { wPt: w * 0.75, hPt: h * 0.75 };
    }
    if (mime === 'image/gif' && data.length >= 10) {
      // GIF: width at 6, height at 8 (uint16 LE)
      const w = data[6] | data[7] << 8;
      const h = data[8] | data[9] << 8;
      if (w > 0 && h > 0) return { wPt: w * 0.75, hPt: h * 0.75 };
    }
  } catch {
    // Best effort: any failure while reading a header falls through to the default size.
  }
  return fallback;
}
|
|
1083
|
+
|
|
1084
|
+
/* ═══════════════════════════════════════════════════════════════
|
|
1085
|
+
OLE Object extraction (images)
|
|
1086
|
+
════════════════════════════════════════════════════════════ */
|
|
1087
|
+
|
|
1088
|
+
/**
 * Walks a buffer of back-to-back object records and returns each payload
 * together with a MIME type sniffed from its leading bytes.
 *
 * Record layout as read here: a 16-byte header — object id (uint32 LE at +0),
 * payload size (uint32 LE at +4), 8 further bytes that are skipped — followed
 * by `dataSize` payload bytes. Parsing stops at the first record whose id or
 * size is zero, or whose header or payload would run past the buffer end.
 *
 * @param data Raw bytes containing zero or more records.
 * @returns One entry per complete record. `data` is a view (`subarray`) into
 *          the input, not a copy. `mimeType` is 'application/octet-stream'
 *          when no known image signature matches.
 */
function extractImagesFromOleObjectLink(data: Uint8Array): OleObject[] {
  const HEADER_SIZE = 16;
  const objects: OleObject[] = [];
  let off = 0;

  // Require the whole header to be present before reading any of its fields.
  while (off + HEADER_SIZE <= data.length) {
    const objId = BinaryKit.readU32LE(data, off);
    const dataSize = BinaryKit.readU32LE(data, off + 4);

    if (objId === 0 || dataSize === 0) break;

    const objOff = off + HEADER_SIZE;
    if (objOff + dataSize > data.length) break;

    const objData = data.subarray(objOff, objOff + dataSize);

    // Detect MIME type from signature
    let mimeType = 'application/octet-stream';
    if (objData[0] === 0xFF && objData[1] === 0xD8 && objData[2] === 0xFF) {
      mimeType = 'image/jpeg';
    } else if (objData[0] === 0x89 && objData[1] === 0x50 && objData[2] === 0x4E && objData[3] === 0x47) {
      mimeType = 'image/png';
    } else if (objData[0] === 0x47 && objData[1] === 0x49 && objData[2] === 0x46 && objData[3] === 0x38) {
      // "GIF8" — shared prefix of the GIF87a and GIF89a signatures
      mimeType = 'image/gif';
    } else if (objData[0] === 0x42 && objData[1] === 0x4D) {
      mimeType = 'image/bmp';
    }

    objects.push({ id: objId, data: objData, mimeType });
    off = objOff + dataSize;
  }

  return objects;
}
|
|
1122
|
+
|
|
1123
|
+
/* ═══════════════════════════════════════════════════════════════
|
|
1124
|
+
Helper to inject images into paragraph content
|
|
1125
|
+
════════════════════════════════════════════════════════════ */
|
|
1126
|
+
|
|
1127
|
+
/**
 * Replaces image placeholder spans in a content tree with their image nodes,
 * mutating the tree in place.
 *
 * A placeholder is a span whose first child is a text node with content of the
 * form `__EXT_N__`, `__IMG_N__`, or either with a `_W<w>_H<h>` suffix before
 * the closing underscores. `N` is looked up as a key in `objectMap`;
 * placeholders whose key is absent are left untouched. When both suffix values
 * are positive they override the image's `w`/`h` on a copy of the node;
 * otherwise the mapped node itself is inserted.
 *
 * The walk covers top-level paragraphs and grids, every cell of a grid, and
 * grids nested inside paragraphs at any depth, including paragraphs that live
 * in table cells.
 *
 * @param content   Top-level content nodes to scan.
 * @param objectMap Image nodes keyed by the number embedded in the placeholder.
 */
function injectImagesIntoContent(
  content: ContentNode[],
  objectMap: Map<number, ImgNode>
): void {
  if (objectMap.size === 0) return;

  // __EXT_N__ or __EXT_N_W<wPt>_H<hPt>__ (with encoded display size); IMG is accepted as well.
  const PLACEHOLDER = /^__(?:IMG|EXT)_(\d+)(?:_W(\d+)_H(\d+))?__$/;

  // The tree is walked structurally by `tag`, so nodes are handled loosely typed.
  // Returns the image for a placeholder span, or undefined for anything else.
  const resolvePlaceholder = (kid: any): ImgNode | undefined => {
    // Span node structure: { tag: 'span', props, kids: [{ tag: 'txt', content }] }
    if (kid.tag !== 'span' || !kid.kids || kid.kids[0]?.tag !== 'txt') return undefined;
    const text = kid.kids[0].content;
    if (typeof text !== 'string') return undefined;
    const match = PLACEHOLDER.exec(text);
    if (!match) return undefined;

    const base = objectMap.get(parseInt(match[1], 10));
    if (!base) return undefined;

    const wPt = match[2] ? parseInt(match[2], 10) : 0;
    const hPt = match[3] ? parseInt(match[3], 10) : 0;
    // Use encoded display size when valid; otherwise keep pixel-based dims
    return (wPt > 0 && hPt > 0) ? { ...base, w: wPt, h: hPt } : base;
  };

  // Inline children of a paragraph: swap placeholders, descend into nested grids.
  const processKids = (kids: any[]): void => {
    for (let i = 0; i < kids.length; i++) {
      const kid = kids[i];
      if (kid.tag === 'grid') {
        processGrid(kid);
        continue;
      }
      const img = resolvePlaceholder(kid);
      if (img) kids[i] = img;
    }
  };

  // Block-level nodes (top-level content or the content of one cell).
  const processBlocks = (nodes: any[]): void => {
    for (const node of nodes) {
      if (node.tag === 'grid') {
        processGrid(node);
      } else if (node.tag === 'para' && node.kids) {
        processKids(node.kids);
      }
    }
  };

  // Grid → rows → cells; each cell's content is handled like top-level content.
  const processGrid = (grid: any): void => {
    if (!Array.isArray(grid.kids)) return;
    for (const row of grid.kids) {
      if (!Array.isArray(row.kids)) continue;
      for (const cell of row.kids) {
        if (!Array.isArray(cell.kids)) continue;
        processBlocks(cell.kids);
      }
    }
  };

  processBlocks(content);
}
|
|
1197
|
+
|
|
809
1198
|
// Module-level side effect: hands a new HwpScanner instance to registry.registerDecoder
// when this module is evaluated.
registry.registerDecoder(new HwpScanner());
|