hwpkit-dev 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,9 +1,9 @@
1
1
  import type { Decoder } from '../../contract/decoder';
2
- import type { DocRoot, ContentNode, ParaNode, SpanNode } from '../../model/doc-tree';
2
+ import type { DocRoot, ContentNode, ParaNode, SpanNode, ImgNode } from '../../model/doc-tree';
3
3
  import type { Outcome } from '../../contract/result';
4
4
  import type { Align, Stroke, StrokeKind, PageDims, TextProps, ParaProps, CellProps, GridProps } from '../../model/doc-props';
5
5
  import { succeed, fail } from '../../contract/result';
6
- import { buildRoot, buildSheet, buildPara, buildSpan, buildGrid, buildRow, buildCell } from '../../model/builders';
6
+ import { buildRoot, buildSheet, buildPara, buildSpan, buildGrid, buildRow, buildCell, buildImg } from '../../model/builders';
7
7
  import { ShieldedParser } from '../../safety/ShieldedParser';
8
8
  import { BinaryKit } from '../../toolkit/BinaryKit';
9
9
  import { Metric, safeHex, safeFont } from '../../safety/StyleBridge';
@@ -37,8 +37,11 @@ const TAG_CELL_B = HWPTAG_BEGIN + 65; // 81
37
37
  function isTableTag(t: number) { return t === TAG_TABLE_A || t === TAG_TABLE_B; }
38
38
  function isCellTag(t: number) { return t === TAG_CELL_A || t === TAG_CELL_B || t === TAG_LIST_HEADER; }
39
39
 
40
- // CTRL_HEADER ctrlId for table: ' lbt' as UINT32-LE = 0x74626C20
41
- const CTRL_TABLE = 0x74626C20;
40
+ // CTRL_HEADER ctrlId values (UINT32-LE as ASCII)
41
+ const CTRL_TABLE = 0x74626C20; // ' lbt'
42
+ const CTRL_IMAGE = 0x696D6720; // 'img '
43
+ const CTRL_OBJ = 0x6F626A20; // 'obj '
44
+ const CTRL_FIG = 0x66696720; // 'fig '
42
45
 
43
46
  /* ═══════════════════════════════════════════════════════════════
44
47
  Types
@@ -83,7 +86,14 @@ interface DocInfo {
83
86
  }
84
87
 
85
88
  interface ParsedChar { pos: number; ch: string }
86
- interface ParaTextResult { chars: ParsedChar[]; controlPositions: number[] }
89
+ interface ParsedCtrl { pos: number; ctrlId: number; objId: number; matched: boolean }
90
+ interface ParaTextResult { chars: ParsedChar[]; controls: ParsedCtrl[] }
91
+
92
+ interface OleObject {
93
+ id: number;
94
+ data: Uint8Array;
95
+ mimeType: string;
96
+ }
87
97
 
88
98
  /* ═══════════════════════════════════════════════════════════════
89
99
  Low-level record parsing
@@ -111,7 +121,9 @@ function parseRecords(data: Uint8Array): HwpRecord[] {
111
121
  }
112
122
 
113
123
  function tryInflate(data: Uint8Array): Uint8Array {
114
- try { return pako.inflateRaw(data); } catch { return data; }
124
+ try { return pako.inflate(data); } catch {
125
+ try { return pako.inflateRaw(data); } catch { return data; }
126
+ }
115
127
  }
116
128
 
117
129
  /* ═══════════════════════════════════════════════════════════════
@@ -212,7 +224,7 @@ function parseParaShape(d: Uint8Array): HwpParaShape {
212
224
  if (d.length < 4) return { align: 'left', spaceBefore: 0, spaceAfter: 0, lineSpacing: 160, indent: 0 };
213
225
  const attr = BinaryKit.readU32LE(d, 0);
214
226
  return {
215
- align: ALIGN_TBL[attr & 0x7] ?? 'left',
227
+ align: ALIGN_TBL[(attr >> 2) & 0x7] ?? 'left',
216
228
  indent: d.length >= 16 ? i32(d, 12) : 0,
217
229
  spaceBefore: d.length >= 20 ? i32(d, 16) : 0,
218
230
  spaceAfter: d.length >= 24 ? i32(d, 20) : 0,
@@ -228,20 +240,30 @@ function parseParaShape(d: Uint8Array): HwpParaShape {
228
240
  [36:4] faceColor (bgColor for solid fill) */
229
241
 
230
242
  const BORDER_W_PT = [0.28, 0.34, 0.43, 0.57, 0.71, 0.85, 1.13, 1.42, 1.70, 1.98, 2.84, 4.25, 5.67, 8.50, 11.34, 14.17];
231
- const BORDER_KIND: Record<number, StrokeKind> = { 0:'none',1:'solid',2:'dash',3:'dot',4:'dash',5:'dash',6:'dash',7:'dot',8:'double',9:'double',10:'double' };
243
+ const BORDER_KIND: Record<number, StrokeKind> = { 0:'solid',1:'dash',2:'dash',3:'dot',4:'dash',5:'dash',6:'dash',7:'double',8:'double',9:'double',10:'none' };
232
244
 
233
245
  function parseBorderFill(d: Uint8Array): HwpBorderFill {
246
+ // Spec grouped format (표 23):
247
+ // [0:2] attr
248
+ // [2:4] 4 border types (left, right, top, bottom) — 1 byte each
249
+ // [6:4] 4 border widths (left, right, top, bottom) — 1 byte each (index into BORDER_W_PT)
250
+ // [10:16] 4 border colors (left, right, top, bottom) — 4 bytes each (COLORREF)
251
+ // [26:6] diagonal: type(1) + width(1) + color(4) = 6 bytes
252
+ // [32:4] fillType
253
+ // [36:4] faceColor (bgColor for solid fill)
234
254
  const borders: HwpBorderFill['borders'] = [];
255
+ const BASE_TYPE = 2; // 4 type bytes
256
+ const BASE_WIDTH = 6; // 4 width bytes
257
+ const BASE_COLOR = 10; // 4 × 4-byte colors
235
258
  for (let i = 0; i < 4; i++) {
236
- const b = 2 + i * 6;
237
- if (b + 6 <= d.length) {
238
- borders.push({ type: d[b], widthPt: BORDER_W_PT[d[b + 1]] ?? 0.5, color: colorRef(d, b + 2) });
239
- } else {
240
- borders.push({ type: 0, widthPt: 0.5, color: '000000' });
241
- }
259
+ const type = BASE_TYPE + i < d.length ? d[BASE_TYPE + i] : 0;
260
+ const widthPt = BASE_WIDTH + i < d.length ? (BORDER_W_PT[d[BASE_WIDTH + i]] ?? 0.5) : 0.5;
261
+ const color = BASE_COLOR + i * 4 + 4 <= d.length ? colorRef(d, BASE_COLOR + i * 4) : '000000';
262
+ borders.push({ type, widthPt, color });
242
263
  }
243
264
  let bgColor: string | undefined;
244
- const fOff = 32; // after attr(2) + 5 borders(30)
265
+ // after attr(2) + 4 types(4) + 4 widths(4) + 4 colors(16) + diagonal(6) = offset 32
266
+ const fOff = 32;
245
267
  if (d.length >= fOff + 8) {
246
268
  const ft = BinaryKit.readU32LE(d, fOff);
247
269
  if (ft & 1) bgColor = colorRef(d, fOff + 4);
@@ -259,12 +281,19 @@ function parseBody(
259
281
  const recs = parseRecords(compressed ? tryInflate(raw) : raw);
260
282
  const content: ContentNode[] = [];
261
283
  let pageDims: PageDims | undefined;
262
- let i = 0;
263
284
 
285
+ // Pre-scan for PAGE_DEF at any nesting level (real HWP stores it at level 2 inside section ctrl)
286
+ for (const r of recs) {
287
+ if (r.tag === TAG_PAGE_DEF) {
288
+ pageDims = shield.guard(() => parsePageDef(r.data), A4, 'hwp:pageDef');
289
+ break;
290
+ }
291
+ }
292
+
293
+ let i = 0;
264
294
  while (i < recs.length) {
265
295
  if (recs[i].tag === TAG_PAGE_DEF) {
266
- pageDims = shield.guard(() => parsePageDef(recs[i].data), A4, 'hwp:pageDef');
267
- i++;
296
+ i++; // already handled above; skip at top level
268
297
  } else if (recs[i].tag === TAG_PARA_HEADER) {
269
298
  const r = shield.guard(
270
299
  () => parseParagraphGroup(recs, i, di, shield),
@@ -295,6 +324,7 @@ function parseParagraphGroup(
295
324
  let text: ParaTextResult | null = null;
296
325
  let csPairs: [number, number][] = [];
297
326
  const grids: ContentNode[] = [];
327
+ const ctrlHeaders: { ctrlId: number; objId: number }[] = [];
298
328
  let i = start + 1;
299
329
 
300
330
  while (i < recs.length && recs[i].level > lv) {
@@ -307,14 +337,23 @@ function parseParagraphGroup(
307
337
  csPairs = parseCharShapePairs(r.data);
308
338
  i++;
309
339
  } else if (r.tag === TAG_CTRL_HEADER && r.level === lv + 1) {
310
- if (r.data.length >= 4 && BinaryKit.readU32LE(r.data, 0) === CTRL_TABLE) {
311
- const tr = shield.guard(
312
- () => parseTableCtrl(recs, i, di, shield),
313
- { grid: null, next: skipKids(recs, i) },
314
- `hwp:tbl@${i}`,
315
- );
316
- if (tr.grid) grids.push(tr.grid);
317
- i = tr.next;
340
+ if (r.data.length >= 4) {
341
+ const ctrlId = BinaryKit.readU32LE(r.data, 0);
342
+ // objId at offset 4 (UINT16) - identifies the image/object in BinData
343
+ const objId = r.data.length >= 6 ? BinaryKit.readU16LE(r.data, 4) : 0;
344
+ ctrlHeaders.push({ ctrlId, objId });
345
+
346
+ if (ctrlId === CTRL_TABLE) {
347
+ const tr = shield.guard(
348
+ () => parseTableCtrl(recs, i, di, shield),
349
+ { grid: null, next: skipKids(recs, i) },
350
+ `hwp:tbl@${i}`,
351
+ );
352
+ if (tr.grid) grids.push(tr.grid);
353
+ i = tr.next;
354
+ } else {
355
+ i = skipKids(recs, i);
356
+ }
318
357
  } else {
319
358
  i = skipKids(recs, i);
320
359
  }
@@ -323,14 +362,39 @@ function parseParagraphGroup(
323
362
  }
324
363
  }
325
364
 
365
+ // Match extended controls with CTRL_HEADER entries
366
+ if (text && ctrlHeaders.length > 0) {
367
+ for (let ci = 0; ci < text.controls.length; ci++) {
368
+ if (ci < ctrlHeaders.length) {
369
+ text.controls[ci].ctrlId = ctrlHeaders[ci].ctrlId;
370
+ text.controls[ci].matched = true;
371
+ }
372
+ }
373
+ }
374
+
326
375
  const nodes: ContentNode[] = [];
327
376
 
328
- // Build paragraph from text
329
- if (text && text.chars.length > 0) {
330
- const joined = text.chars.map(c => c.ch).join('');
331
- if (joined.trim()) {
377
+ // Build paragraph from text and inline controls (images)
378
+ if (text && (text.chars.length > 0 || text.controls.length > 0)) {
379
+ const paraContent: (SpanNode | ContentNode)[] = [];
380
+
381
+ // Process text chars and controls together
382
+ if (text.chars.length > 0) {
332
383
  const spans = resolveCharShapes(text.chars, csPairs, di);
333
- nodes.push(buildPara(spans, buildParaProps(ps)));
384
+ paraContent.push(...spans);
385
+ }
386
+
387
+ // Add placeholder spans for extended controls (images)
388
+ if (text.controls.length > 0) {
389
+ for (let ci = 0; ci < text.controls.length; ci++) {
390
+ // Create placeholder for all extended controls
391
+ // Image replacement will happen later in injectImagesIntoContent
392
+ paraContent.push(buildSpan(`__EXT_${ci}__`));
393
+ }
394
+ }
395
+
396
+ if (paraContent.length > 0) {
397
+ nodes.push(buildPara(paraContent as any, buildParaProps(ps)));
334
398
  }
335
399
  }
336
400
 
@@ -354,7 +418,7 @@ const INL_CTRL = new Set([4, 5, 6, 7, 8]);
354
418
 
355
419
  function decodeParaText(d: Uint8Array): ParaTextResult {
356
420
  const chars: ParsedChar[] = [];
357
- const controlPositions: number[] = [];
421
+ const controls: ParsedCtrl[] = [];
358
422
  let i = 0, pos = 0;
359
423
 
360
424
  while (i + 1 < d.length) {
@@ -364,8 +428,14 @@ function decodeParaText(d: Uint8Array): ParaTextResult {
364
428
  if (c === 10) { chars.push({ pos, ch: '\n' }); i += 2; pos++; continue; }
365
429
 
366
430
  if (EXT_CTRL.has(c)) {
367
- controlPositions.push(pos);
368
- i += 16; pos += 8; continue; // 8 WORDs
431
+ // Extended control: 8 WORDs (16 bytes)
432
+ // WORD 4 contains objId (for images, charts, etc.)
433
+ let objId = 0;
434
+ if (i + 16 <= d.length) {
435
+ objId = BinaryKit.readU16LE(d, i + 8); // 4th WORD (offset 8) contains objId
436
+ }
437
+ controls.push({ pos, ctrlId: 0, objId, matched: false });
438
+ i += 16; pos += 8; continue;
369
439
  }
370
440
  if (INL_CTRL.has(c)) {
371
441
  i += 16; pos += 8; continue;
@@ -379,7 +449,7 @@ function decodeParaText(d: Uint8Array): ParaTextResult {
379
449
  chars.push({ pos, ch: String.fromCharCode(c) });
380
450
  i += 2; pos++;
381
451
  }
382
- return { chars, controlPositions };
452
+ return { chars, controls };
383
453
  }
384
454
 
385
455
  /* ── PARA_CHAR_SHAPE ────────────────────────────────────────── */
@@ -605,8 +675,8 @@ function parseCellRec(
605
675
  // offset 8: colAddr, offset 10: rowAddr (HWP 5.0 spec)
606
676
  col = BinaryKit.readU16LE(d, 8);
607
677
  row = BinaryKit.readU16LE(d, 10);
608
- rs = Math.max(1, BinaryKit.readU16LE(d, 12));
609
- cs = Math.max(1, BinaryKit.readU16LE(d, 14));
678
+ cs = Math.max(1, BinaryKit.readU16LE(d, 12));
679
+ rs = Math.max(1, BinaryKit.readU16LE(d, 14));
610
680
  widthHwp = BinaryKit.readU32LE(d, 16);
611
681
 
612
682
  const bfId = d.length >= 34 ? BinaryKit.readU16LE(d, 32) : 0;
@@ -764,6 +834,53 @@ export class HwpScanner implements Decoder {
764
834
  di = shield.guard(() => parseDocInfo(diRaw, compressed), di, 'hwp:docInfo');
765
835
  }
766
836
 
837
+ // Extract images from BinData streams
838
+ const imageStreams: { path: string; data: Uint8Array }[] = [];
839
+ for (const [path, data] of streams) {
840
+ if ((path.includes('BinData') || path.includes('.jpg') || path.includes('.jpeg') || path.includes('.png') || path.includes('.gif') || path.includes('.bmp'))
841
+ && !path.includes('FileHeader') && !path.includes('DocInfo') && !path.includes('BodyText') && !path.includes('Section')) {
842
+ imageStreams.push({ path, data });
843
+ console.log(`[HwpScanner] Image stream found: ${path} (${data.length} bytes)`);
844
+ }
845
+ }
846
+
847
+ // Create image nodes for each image stream (deduplicated by hash)
848
+ const objectMap = new Map<number, ImgNode>();
849
+ const seenHashes = new Set<string>();
850
+ let imgIdx = 0;
851
+ for (const { path, data } of imageStreams) {
852
+ // Determine MIME type from extension or signature
853
+ let mimeType = 'image/jpeg';
854
+ const lowerPath = path.toLowerCase();
855
+ if (lowerPath.includes('.png')) mimeType = 'image/png';
856
+ else if (lowerPath.includes('.gif')) mimeType = 'image/gif';
857
+ else if (lowerPath.includes('.bmp')) mimeType = 'image/bmp';
858
+
859
+ // Also check signature
860
+ if (data[0] === 0x89 && data[1] === 0x50 && data[2] === 0x4E && data[3] === 0x47) mimeType = 'image/png';
861
+ else if (data[0] === 0x47 && data[1] === 0x49 && data[2] === 0x46 && data[3] === 0x3538) mimeType = 'image/gif';
862
+ else if (data[0] === 0x42 && data[1] === 0x4D) mimeType = 'image/bmp';
863
+
864
+ const imgData = Buffer.from(data);
865
+ const base64 = imgData.toString('base64');
866
+ const hash = base64.slice(0, 20); // Use first 20 chars as simple hash
867
+ if (!seenHashes.has(hash)) {
868
+ seenHashes.add(hash);
869
+ objectMap.set(imgIdx++, buildImg(
870
+ base64,
871
+ mimeType as any,
872
+ 0, // w
873
+ 0, // h
874
+ `Image from ${path}`,
875
+ ));
876
+ console.log(`[HwpScanner] Added unique image: ${hash}... (${data.length} bytes)`);
877
+ } else {
878
+ console.log(`[HwpScanner] Duplicate image skipped: ${hash}...`);
879
+ }
880
+ }
881
+
882
+ console.log(`[HwpScanner] Found ${imageStreams.length} image streams, ${objectMap.size} unique images`);
883
+
767
884
  // Body sections
768
885
  const allContent: ContentNode[] = [];
769
886
  let pageDims: PageDims = A4;
@@ -790,6 +907,30 @@ export class HwpScanner implements Decoder {
790
907
  if (r.pageDims) pageDims = r.pageDims;
791
908
  }
792
909
 
910
+ // Inject images into paragraphs (only if images are available)
911
+ console.log(`[HwpScanner] Before injection: ${allContent.length} nodes, ${objectMap.size} images available`);
912
+ if (objectMap.size > 0) {
913
+ injectImagesIntoContent(allContent, objectMap);
914
+ console.log(`[HwpScanner] After injection: ${allContent.length} nodes`);
915
+ }
916
+
917
+ // Count images (recursively)
918
+ const countImages = (nodes: ContentNode[]): number => {
919
+ let count = 0;
920
+ for (const node of nodes) {
921
+ if ((node as any).tag === 'img') count++;
922
+ if ((node as any).tag === 'para' && (node as any).kids) count += countImages((node as any).kids);
923
+ if ((node as any).tag === 'grid' && (node as any).kids) {
924
+ for (const row of (node as any).kids) {
925
+ if (row.kids) count += countImages(row.kids);
926
+ }
927
+ }
928
+ }
929
+ return count;
930
+ };
931
+ const imgCount = countImages(allContent);
932
+ console.log(`[HwpScanner] Images in content: ${imgCount}`);
933
+
793
934
  warns.push(...shield.flush());
794
935
  const content = allContent.length > 0 ? allContent : [buildPara([buildSpan('')])];
795
936
  return succeed(buildRoot({}, [buildSheet(content, pageDims)]), warns);
@@ -806,4 +947,84 @@ function findBodySection(streams: Map<string, Uint8Array>): Uint8Array | undefin
806
947
  return undefined;
807
948
  }
808
949
 
950
+ /* ═══════════════════════════════════════════════════════════════
951
+ OLE Object extraction (images)
952
+ ════════════════════════════════════════════════════════════ */
953
+
954
/**
 * Splits a buffer of back-to-back object records into individual objects.
 *
 * Record layout as read by this function (little-endian):
 *   [0:4]   object id
 *   [4:4]   payload size in bytes
 *   [8:8]   not interpreted here
 *   [16:n]  payload
 *
 * Scanning stops at the first record whose id or size is zero, or whose
 * 16-byte header or payload would extend past the end of `data`.
 *
 * @param data raw bytes holding zero or more records
 * @returns the extracted objects in buffer order; each `data` field is a view
 *          (`subarray`) onto the input, not a copy
 */
function extractImagesFromOleObjectLink(data: Uint8Array): OleObject[] {
  const HEADER_SIZE = 16;
  // Leading magic bytes used to recognise the payload type.
  const SIGNATURES: ReadonlyArray<readonly [string, readonly number[]]> = [
    ['image/jpeg', [0xFF, 0xD8, 0xFF]],
    ['image/png', [0x89, 0x50, 0x4E, 0x47]],
    ['image/gif', [0x47, 0x49, 0x46, 0x38]], // "GIF8" — each entry is a single byte
    ['image/bmp', [0x42, 0x4D]],
  ];

  const objects: OleObject[] = [];
  let off = 0;

  // Require the whole header up front so no field is read past the buffer end.
  while (off + HEADER_SIZE <= data.length) {
    const objId = BinaryKit.readU32LE(data, off);
    const dataSize = BinaryKit.readU32LE(data, off + 4);

    if (objId === 0 || dataSize === 0) break;

    const objOff = off + HEADER_SIZE;
    if (objOff + dataSize > data.length) break;

    const objData = data.subarray(objOff, objOff + dataSize);

    // Payloads shorter than a signature simply fail the comparison.
    const hit = SIGNATURES.find(([, magic]) => magic.every((b, i) => objData[i] === b));
    const mimeType = hit ? hit[0] : 'application/octet-stream';

    objects.push({ id: objId, data: objData, mimeType });
    off = objOff + dataSize;
  }

  return objects;
}
988
+
989
+ /* ═══════════════════════════════════════════════════════════════
990
+ Helper to inject images into paragraph content
991
+ ════════════════════════════════════════════════════════════ */
992
+
993
/**
 * Replaces placeholder spans in paragraphs with image nodes, in place.
 *
 * A placeholder is a span whose first child is a text node with content
 * exactly `__IMG_<n>__` or `__EXT_<n>__`. The number inside the placeholder
 * is not used to pick an image: images are handed out in document order and
 * wrap around once every distinct image has been used.
 *
 * Only `para` nodes directly inside `content` are scanned; nested containers
 * are not visited.
 *
 * @param content   nodes to scan; matching `kids` entries are overwritten
 * @param objectMap available images; entries sharing the same `b64` payload
 *                  collapse to the first one encountered
 */
function injectImagesIntoContent(
  content: ContentNode[],
  objectMap: Map<number, ImgNode>
): void {
  // De-duplicate by payload in a single pass, keeping first occurrences in order.
  const byPayload = new Map<string, ImgNode>();
  for (const img of objectMap.values()) {
    if (!byPayload.has(img.b64)) byPayload.set(img.b64, img);
  }
  const uniqueImages = Array.from(byPayload.values());
  if (uniqueImages.length === 0) return;

  const PLACEHOLDER = /^__(?:IMG|EXT)_(\d+)__$/;
  let imgIdx = 0;

  for (const node of content) {
    if (node.tag === 'para' && node.kids) {
      for (let i = 0; i < node.kids.length; i++) {
        const kid = node.kids[i];
        // Span node structure: { tag: 'span', props, kids: [{ tag: 'txt', content }] }
        if (kid.tag === 'span' && kid.kids && kid.kids[0]?.tag === 'txt') {
          const text = kid.kids[0].content;
          // Skip non-string content instead of throwing on it.
          if (typeof text === 'string' && PLACEHOLDER.test(text)) {
            // Round-robin over the distinct images.
            node.kids[i] = uniqueImages[imgIdx % uniqueImages.length];
            imgIdx++;
          }
        }
      }
    }
  }
}
1029
+
809
1030
  registry.registerDecoder(new HwpScanner());
@@ -715,7 +715,15 @@ function decodeGrid(tbl: any, ctx: DecCtx): GridNode {
715
715
  { cs, rs, props: cellProps },
716
716
  );
717
717
  });
718
- return buildRow(cellNodes);
718
+ // Row height: read from the first cell's cellSz height
719
+ let rowHeightPt: number | undefined;
720
+ const firstCellForH = cellArr[0];
721
+ if (firstCellForH) {
722
+ const hSz = firstCellForH?.['hp:cellSz']?.[0]?._attr ?? {};
723
+ const hVal = Number(hSz.height ?? 0);
724
+ if (hVal > 0) rowHeightPt = Metric.hwpToPt(hVal);
725
+ }
726
+ return buildRow(cellNodes, rowHeightPt);
719
727
  });
720
728
  return buildGrid(rowNodes, gridProps);
721
729
  }