kordoc 2.0.2 → 2.0.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -138,7 +138,7 @@ import { inflateRawSync } from "zlib";
138
138
  import { DOMParser } from "@xmldom/xmldom";
139
139
 
140
140
  // src/utils.ts
141
- var VERSION = true ? "2.0.2" : "0.0.0-dev";
141
+ var VERSION = true ? "2.0.3" : "0.0.0-dev";
142
142
  function toArrayBuffer(buf) {
143
143
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
144
144
  return buf.buffer;
@@ -327,6 +327,47 @@ function sanitizeText(text) {
327
327
  }
328
328
  return result;
329
329
  }
330
+ function flattenLayoutTables(blocks) {
331
+ const result = [];
332
+ for (const block of blocks) {
333
+ if (block.type !== "table" || !block.table) {
334
+ result.push(block);
335
+ continue;
336
+ }
337
+ const { rows: numRows, cols: numCols, cells } = block.table;
338
+ if (numRows === 1 && numCols === 1) {
339
+ result.push(block);
340
+ continue;
341
+ }
342
+ if (numRows <= 3) {
343
+ let totalNewlines = 0;
344
+ let totalTextLen = 0;
345
+ for (let r = 0; r < numRows; r++) {
346
+ for (let c = 0; c < numCols; c++) {
347
+ const t = cells[r]?.[c]?.text || "";
348
+ totalNewlines += (t.match(/\n/g) || []).length;
349
+ totalTextLen += t.length;
350
+ }
351
+ }
352
+ if (totalNewlines > 5 || numRows <= 2 && totalTextLen > 300) {
353
+ for (let r = 0; r < numRows; r++) {
354
+ for (let c = 0; c < numCols; c++) {
355
+ const cellText = cells[r]?.[c]?.text?.trim();
356
+ if (!cellText) continue;
357
+ for (const line of cellText.split("\n")) {
358
+ const trimmed = line.trim();
359
+ if (!trimmed) continue;
360
+ result.push({ type: "paragraph", text: trimmed, pageNumber: block.pageNumber });
361
+ }
362
+ }
363
+ }
364
+ continue;
365
+ }
366
+ }
367
+ result.push(block);
368
+ }
369
+ return result;
370
+ }
330
371
  function blocksToMarkdown(blocks) {
331
372
  const lines = [];
332
373
  for (let i = 0; i < blocks.length; i++) {
@@ -1225,8 +1266,9 @@ var TAG_CHAR_SHAPE = 68;
1225
1266
  var TAG_CTRL_HEADER = 71;
1226
1267
  var TAG_LIST_HEADER = 72;
1227
1268
  var TAG_TABLE = 77;
1228
- var TAG_DOC_CHAR_SHAPE = 55;
1229
- var TAG_DOC_STYLE = 58;
1269
+ var TAG_DOC_CHAR_SHAPE = 21;
1270
+ var TAG_DOC_PARA_SHAPE = 25;
1271
+ var TAG_DOC_STYLE = 26;
1230
1272
  var CHAR_LINE = 0;
1231
1273
  var CHAR_SECTION_BREAK = 10;
1232
1274
  var CHAR_PARA = 13;
@@ -1282,8 +1324,14 @@ function parseFileHeader(data) {
1282
1324
  }
1283
1325
  function parseDocInfo(records) {
1284
1326
  const charShapes = [];
1327
+ const paraShapes = [];
1285
1328
  const styles = [];
1286
1329
  for (const rec of records) {
1330
+ if (rec.tagId === TAG_DOC_PARA_SHAPE && rec.data.length >= 4) {
1331
+ const flags = rec.data.readUInt32LE(0);
1332
+ const outlineLevel = flags >> 25 & 7;
1333
+ paraShapes.push({ outlineLevel });
1334
+ }
1287
1335
  if (rec.tagId === TAG_DOC_CHAR_SHAPE && rec.data.length >= 18) {
1288
1336
  if (rec.data.length >= 50) {
1289
1337
  const fontSize = rec.data.readUInt32LE(42);
@@ -1323,7 +1371,7 @@ function parseDocInfo(records) {
1323
1371
  }
1324
1372
  }
1325
1373
  }
1326
- return { charShapes, styles };
1374
+ return { charShapes, paraShapes, styles };
1327
1375
  }
1328
1376
  function extractText(data) {
1329
1377
  let result = "";
@@ -2334,12 +2382,13 @@ function parseHwp5Document(buffer, options) {
2334
2382
  }
2335
2383
  }
2336
2384
  const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
2385
+ const flatBlocks = flattenLayoutTables(blocks);
2337
2386
  if (docInfo) {
2338
- detectHwp5Headings(blocks, docInfo);
2387
+ detectHwp5Headings(flatBlocks, docInfo);
2339
2388
  }
2340
- const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
2341
- const markdown = blocksToMarkdown(blocks);
2342
- return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
2389
+ const outline = flatBlocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
2390
+ const markdown = blocksToMarkdown(flatBlocks);
2391
+ return { markdown, blocks: flatBlocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
2343
2392
  }
2344
2393
  function parseDocInfoStream(cfb, compressed) {
2345
2394
  try {
@@ -2390,16 +2439,21 @@ function detectHwp5Headings(blocks, docInfo) {
2390
2439
  }
2391
2440
  if (baseFontSize <= 0) return;
2392
2441
  for (const block of blocks) {
2393
- if (block.type !== "paragraph" || !block.text || !block.style?.fontSize) continue;
2442
+ if (block.type === "heading") continue;
2443
+ if (block.type !== "paragraph" || !block.text) continue;
2394
2444
  const text = block.text.trim();
2395
2445
  if (text.length === 0 || text.length > 200) continue;
2396
2446
  if (/^\d+$/.test(text)) continue;
2397
- const ratio = block.style.fontSize / baseFontSize;
2398
2447
  let level = 0;
2399
- if (ratio >= HEADING_RATIO_H1) level = 1;
2400
- else if (ratio >= HEADING_RATIO_H2) level = 2;
2401
- else if (ratio >= HEADING_RATIO_H3) level = 3;
2402
- if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
2448
+ if (block.style?.fontSize && baseFontSize > 0) {
2449
+ const ratio = block.style.fontSize / baseFontSize;
2450
+ if (ratio >= HEADING_RATIO_H1) level = 1;
2451
+ else if (ratio >= HEADING_RATIO_H2) level = 2;
2452
+ else if (ratio >= HEADING_RATIO_H3) level = 3;
2453
+ }
2454
+ if (/^제\d+[장절편]\s/.test(text) && text.length <= 50) {
2455
+ if (level === 0) level = 2;
2456
+ } else if (/^제\d+(조의?\d*)\s*[\((]/.test(text) && text.length <= 80) {
2403
2457
  if (level === 0) level = 3;
2404
2458
  }
2405
2459
  if (level > 0) {
@@ -2631,13 +2685,20 @@ function parseSection(records, docInfo, warnings, sectionNum) {
2631
2685
  while (i < records.length) {
2632
2686
  const rec = records[i];
2633
2687
  if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
2634
- const { paragraph, tables, nextIdx, charShapeIds } = parseParagraphWithTables(records, i);
2688
+ const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
2635
2689
  if (paragraph) {
2636
2690
  const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
2637
2691
  if (docInfo && charShapeIds.length > 0) {
2638
2692
  const style = resolveCharStyle(charShapeIds, docInfo);
2639
2693
  if (style) block.style = style;
2640
2694
  }
2695
+ if (docInfo && paraShapeId >= 0 && paraShapeId < docInfo.paraShapes.length) {
2696
+ const ol = docInfo.paraShapes[paraShapeId].outlineLevel;
2697
+ if (ol >= 1 && ol <= 6) {
2698
+ block.type = "heading";
2699
+ block.level = ol;
2700
+ }
2701
+ }
2641
2702
  blocks.push(block);
2642
2703
  }
2643
2704
  for (const t of tables) blocks.push({ type: "table", table: t, pageNumber: sectionNum });
@@ -2757,6 +2818,8 @@ function parseParagraphWithTables(records, startIdx) {
2757
2818
  let text = "";
2758
2819
  const tables = [];
2759
2820
  const charShapeIds = [];
2821
+ const paraHeaderData = records[startIdx].data;
2822
+ const paraShapeId = paraHeaderData.length >= 10 ? paraHeaderData.readUInt16LE(8) : -1;
2760
2823
  let i = startIdx + 1;
2761
2824
  while (i < records.length) {
2762
2825
  const rec = records[i];
@@ -2781,7 +2844,7 @@ function parseParagraphWithTables(records, startIdx) {
2781
2844
  i++;
2782
2845
  }
2783
2846
  const trimmed = text.trim();
2784
- return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds };
2847
+ return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
2785
2848
  }
2786
2849
  function parseTableBlock(records, startIdx) {
2787
2850
  const tableLevel = records[startIdx].level;