kordoc 2.0.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -63,6 +63,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
63
63
  blocks.push({ type: "paragraph", text: text.trim(), pageNumber: i });
64
64
  }
65
65
  } catch {
66
+ blocks.push({ type: "paragraph", text: `[OCR \uC2E4\uD328: \uD398\uC774\uC9C0 ${i}]` });
66
67
  }
67
68
  }
68
69
  return blocks;
@@ -138,7 +139,7 @@ import { inflateRawSync } from "zlib";
138
139
  import { DOMParser } from "@xmldom/xmldom";
139
140
 
140
141
  // src/utils.ts
141
- var VERSION = true ? "2.0.2" : "0.0.0-dev";
142
+ var VERSION = true ? "2.1.0" : "0.0.0-dev";
142
143
  function toArrayBuffer(buf) {
143
144
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
144
145
  return buf.buffer;
@@ -327,6 +328,47 @@ function sanitizeText(text) {
327
328
  }
328
329
  return result;
329
330
  }
331
/**
 * Replaces "layout" tables (small tables whose cells hold long, multi-line
 * text) with one paragraph block per non-empty line of cell text.
 *
 * A table is flattened when it has at most `maxRows` rows and either
 *   - its cells contain more than `newlineThreshold` line breaks in total, or
 *   - it has at most `shortTableMaxRows` rows and more than
 *     `textLengthThreshold` characters of cell text.
 * Non-table blocks, 1x1 tables and tables without a `cells` grid are passed
 * through untouched.
 *
 * @param {Array<object>} blocks - Blocks to scan. The array is not mutated.
 * @param {object} [options] - Optional threshold overrides.
 * @param {number} [options.maxRows=3]
 * @param {number} [options.newlineThreshold=5]
 * @param {number} [options.shortTableMaxRows=2]
 * @param {number} [options.textLengthThreshold=300]
 * @returns {Array<object>} New array with layout tables replaced by
 *   `{ type: "paragraph", text, pageNumber }` blocks (cells are visited
 *   row by row, left to right).
 */
function flattenLayoutTables(blocks, options = {}) {
  const {
    maxRows = 3,
    newlineThreshold = 5,
    shortTableMaxRows = 2,
    textLengthThreshold = 300,
  } = options;
  const result = [];
  for (const block of blocks) {
    const table = block.type === "table" ? block.table : null;
    // A table without a cell grid cannot be inspected; keep it rather than throw.
    if (!table || !Array.isArray(table.cells)) {
      result.push(block);
      continue;
    }
    const { rows: numRows, cols: numCols, cells } = table;
    const isSingleCell = numRows === 1 && numCols === 1;
    if (isSingleCell || !(numRows <= maxRows)) {
      result.push(block);
      continue;
    }
    // Gather every cell's text once, in row-major order, while measuring it.
    const cellTexts = [];
    let totalNewlines = 0;
    let totalTextLen = 0;
    for (let r = 0; r < numRows; r++) {
      for (let c = 0; c < numCols; c++) {
        const text = cells[r]?.[c]?.text || "";
        cellTexts.push(text);
        totalNewlines += (text.match(/\n/g) || []).length;
        totalTextLen += text.length;
      }
    }
    const looksLikeLayout =
      totalNewlines > newlineThreshold ||
      (numRows <= shortTableMaxRows && totalTextLen > textLengthThreshold);
    if (!looksLikeLayout) {
      result.push(block);
      continue;
    }
    for (const text of cellTexts) {
      for (const line of text.trim().split("\n")) {
        const trimmed = line.trim();
        if (!trimmed) continue;
        result.push({ type: "paragraph", text: trimmed, pageNumber: block.pageNumber });
      }
    }
  }
  return result;
}
330
372
  function blocksToMarkdown(blocks) {
331
373
  const lines = [];
332
374
  for (let i = 0; i < blocks.length; i++) {
@@ -427,6 +469,9 @@ function tableToMarkdown(table) {
427
469
  if (dr === 0 && dc === 0) continue;
428
470
  if (r + dr < numRows && c + dc < numCols) {
429
471
  skip.add(`${r + dr},${c + dc}`);
472
+ if (dr === 0) {
473
+ display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
474
+ }
430
475
  }
431
476
  }
432
477
  }
@@ -522,7 +567,12 @@ function parseCharProperties(doc, map) {
522
567
  if (!id) continue;
523
568
  const prop = {};
524
569
  const height = el.getAttribute("height");
525
- if (height) prop.fontSize = parseInt(height, 10) / 100;
570
+ if (height) {
571
+ const parsedHeight = parseInt(height, 10);
572
+ if (!isNaN(parsedHeight) && parsedHeight > 0) {
573
+ prop.fontSize = parsedHeight / 100;
574
+ }
575
+ }
526
576
  const bold = el.getAttribute("bold");
527
577
  if (bold === "true" || bold === "1") prop.bold = true;
528
578
  const italic = el.getAttribute("italic");
@@ -662,7 +712,7 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
662
712
  const data = await file.async("uint8array");
663
713
  decompressed.total += data.length;
664
714
  if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
665
- const ext = ref.includes(".") ? ref.split(".").pop() : "png";
715
+ const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
666
716
  const mimeType = imageExtToMime(ext);
667
717
  imageIndex++;
668
718
  const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
@@ -956,8 +1006,10 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
956
1006
  break;
957
1007
  case "cellSpan":
958
1008
  if (tableCtx?.cell) {
959
- const cs = parseInt(el.getAttribute("colSpan") || "1", 10);
960
- const rs = parseInt(el.getAttribute("rowSpan") || "1", 10);
1009
+ const rawCs = parseInt(el.getAttribute("colSpan") || "1", 10);
1010
+ const cs = isNaN(rawCs) ? 1 : rawCs;
1011
+ const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
1012
+ const rs = isNaN(rawRs) ? 1 : rawRs;
961
1013
  tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
962
1014
  tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
963
1015
  }
@@ -1049,6 +1101,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
1049
1101
  extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
1050
1102
  } else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
1051
1103
  walkChildren(el, d + 1);
1104
+ } else if (localTag === "run") {
1105
+ tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
1052
1106
  }
1053
1107
  }
1054
1108
  };
@@ -1225,8 +1279,9 @@ var TAG_CHAR_SHAPE = 68;
1225
1279
  var TAG_CTRL_HEADER = 71;
1226
1280
  var TAG_LIST_HEADER = 72;
1227
1281
  var TAG_TABLE = 77;
1228
- var TAG_DOC_CHAR_SHAPE = 55;
1229
- var TAG_DOC_STYLE = 58;
1282
+ var TAG_DOC_CHAR_SHAPE = 21;
1283
+ var TAG_DOC_PARA_SHAPE = 25;
1284
+ var TAG_DOC_STYLE = 26;
1230
1285
  var CHAR_LINE = 0;
1231
1286
  var CHAR_SECTION_BREAK = 10;
1232
1287
  var CHAR_PARA = 13;
@@ -1282,8 +1337,14 @@ function parseFileHeader(data) {
1282
1337
  }
1283
1338
  function parseDocInfo(records) {
1284
1339
  const charShapes = [];
1340
+ const paraShapes = [];
1285
1341
  const styles = [];
1286
1342
  for (const rec of records) {
1343
+ if (rec.tagId === TAG_DOC_PARA_SHAPE && rec.data.length >= 4) {
1344
+ const flags = rec.data.readUInt32LE(0);
1345
+ const outlineLevel = flags >> 25 & 7;
1346
+ paraShapes.push({ outlineLevel });
1347
+ }
1287
1348
  if (rec.tagId === TAG_DOC_CHAR_SHAPE && rec.data.length >= 18) {
1288
1349
  if (rec.data.length >= 50) {
1289
1350
  const fontSize = rec.data.readUInt32LE(42);
@@ -1323,7 +1384,7 @@ function parseDocInfo(records) {
1323
1384
  }
1324
1385
  }
1325
1386
  }
1326
- return { charShapes, styles };
1387
+ return { charShapes, paraShapes, styles };
1327
1388
  }
1328
1389
  function extractText(data) {
1329
1390
  let result = "";
@@ -2334,12 +2395,13 @@ function parseHwp5Document(buffer, options) {
2334
2395
  }
2335
2396
  }
2336
2397
  const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
2398
+ const flatBlocks = flattenLayoutTables(blocks);
2337
2399
  if (docInfo) {
2338
- detectHwp5Headings(blocks, docInfo);
2400
+ detectHwp5Headings(flatBlocks, docInfo);
2339
2401
  }
2340
- const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
2341
- const markdown = blocksToMarkdown(blocks);
2342
- return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
2402
+ const outline = flatBlocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
2403
+ const markdown = blocksToMarkdown(flatBlocks);
2404
+ return { markdown, blocks: flatBlocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
2343
2405
  }
2344
2406
  function parseDocInfoStream(cfb, compressed) {
2345
2407
  try {
@@ -2390,16 +2452,21 @@ function detectHwp5Headings(blocks, docInfo) {
2390
2452
  }
2391
2453
  if (baseFontSize <= 0) return;
2392
2454
  for (const block of blocks) {
2393
- if (block.type !== "paragraph" || !block.text || !block.style?.fontSize) continue;
2455
+ if (block.type === "heading") continue;
2456
+ if (block.type !== "paragraph" || !block.text) continue;
2394
2457
  const text = block.text.trim();
2395
2458
  if (text.length === 0 || text.length > 200) continue;
2396
2459
  if (/^\d+$/.test(text)) continue;
2397
- const ratio = block.style.fontSize / baseFontSize;
2398
2460
  let level = 0;
2399
- if (ratio >= HEADING_RATIO_H1) level = 1;
2400
- else if (ratio >= HEADING_RATIO_H2) level = 2;
2401
- else if (ratio >= HEADING_RATIO_H3) level = 3;
2402
- if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
2461
+ if (block.style?.fontSize && baseFontSize > 0) {
2462
+ const ratio = block.style.fontSize / baseFontSize;
2463
+ if (ratio >= HEADING_RATIO_H1) level = 1;
2464
+ else if (ratio >= HEADING_RATIO_H2) level = 2;
2465
+ else if (ratio >= HEADING_RATIO_H3) level = 3;
2466
+ }
2467
+ if (/^제\d+[장절편]\s/.test(text) && text.length <= 50) {
2468
+ if (level === 0) level = 2;
2469
+ } else if (/^제\d+(조의?\d*)\s*[\((]/.test(text) && text.length <= 80) {
2403
2470
  if (level === 0) level = 3;
2404
2471
  }
2405
2472
  if (level > 0) {
@@ -2631,13 +2698,20 @@ function parseSection(records, docInfo, warnings, sectionNum) {
2631
2698
  while (i < records.length) {
2632
2699
  const rec = records[i];
2633
2700
  if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
2634
- const { paragraph, tables, nextIdx, charShapeIds } = parseParagraphWithTables(records, i);
2701
+ const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
2635
2702
  if (paragraph) {
2636
2703
  const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
2637
2704
  if (docInfo && charShapeIds.length > 0) {
2638
2705
  const style = resolveCharStyle(charShapeIds, docInfo);
2639
2706
  if (style) block.style = style;
2640
2707
  }
2708
+ if (docInfo && paraShapeId >= 0 && paraShapeId < docInfo.paraShapes.length) {
2709
+ const ol = docInfo.paraShapes[paraShapeId].outlineLevel;
2710
+ if (ol >= 1 && ol <= 6) {
2711
+ block.type = "heading";
2712
+ block.level = ol;
2713
+ }
2714
+ }
2641
2715
  blocks.push(block);
2642
2716
  }
2643
2717
  for (const t of tables) blocks.push({ type: "table", table: t, pageNumber: sectionNum });
@@ -2757,6 +2831,8 @@ function parseParagraphWithTables(records, startIdx) {
2757
2831
  let text = "";
2758
2832
  const tables = [];
2759
2833
  const charShapeIds = [];
2834
+ const paraHeaderData = records[startIdx].data;
2835
+ const paraShapeId = paraHeaderData.length >= 10 ? paraHeaderData.readUInt16LE(8) : -1;
2760
2836
  let i = startIdx + 1;
2761
2837
  while (i < records.length) {
2762
2838
  const rec = records[i];
@@ -2781,7 +2857,7 @@ function parseParagraphWithTables(records, startIdx) {
2781
2857
  i++;
2782
2858
  }
2783
2859
  const trimmed = text.trim();
2784
- return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds };
2860
+ return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
2785
2861
  }
2786
2862
  function parseTableBlock(records, startIdx) {
2787
2863
  const tableLevel = records[startIdx].level;
@@ -2894,10 +2970,33 @@ var MIN_LINE_LENGTH = 10;
2894
2970
  var COORD_MERGE_TOL = 3;
2895
2971
  var CONNECT_TOL = 5;
2896
2972
  var CELL_PADDING = 2;
2973
+ var MAX_LINE_WIDTH = 5;
2974
+ var IDENTITY = [1, 0, 0, 1, 0, 0];
2975
/**
 * Composes two 2D affine matrices stored as six-element arrays.
 *
 * With the point convention x' = m[0]*x + m[2]*y + m[4],
 * y' = m[1]*x + m[3]*y + m[5], the result transforms a point by `m2` first
 * and then by `m1`.
 *
 * @param {number[]} m1 - Left-hand matrix.
 * @param {number[]} m2 - Right-hand matrix.
 * @returns {number[]} New six-element matrix; neither input is modified.
 */
function matMultiply(m1, m2) {
  const [a1, b1, c1, d1, e1, f1] = m1;
  const [a2, b2, c2, d2, e2, f2] = m2;
  const a = a1 * a2 + c1 * b2;
  const b = b1 * a2 + d1 * b2;
  const c = a1 * c2 + c1 * d2;
  const d = b1 * c2 + d1 * d2;
  const e = a1 * e2 + c1 * f2 + e1;
  const f = b1 * e2 + d1 * f2 + f1;
  return [a, b, c, d, e, f];
}
2985
/**
 * Applies a six-element 2D affine matrix to a point.
 *
 * @param {number[]} m - Matrix; indices 4 and 5 are added as the offset.
 * @param {number} x - Input x coordinate.
 * @param {number} y - Input y coordinate.
 * @returns {[number, number]} The transformed `[x, y]` pair.
 */
function matTransformPoint(m, x, y) {
  const outX = m[0] * x + m[2] * y + m[4];
  const outY = m[1] * x + m[3] * y + m[5];
  return [outX, outY];
}
2988
/**
 * Returns a single scale factor for a six-element matrix: the larger of the
 * Euclidean norms of (m[1], m[3]) and (m[0], m[2]). The offset entries
 * (indices 4 and 5) are ignored.
 *
 * @param {number[]} m - Matrix to measure.
 * @returns {number} The larger of the two norms.
 */
function matScale(m) {
  const normOdd = Math.sqrt(m[1] * m[1] + m[3] * m[3]);
  const normEven = Math.sqrt(m[0] * m[0] + m[2] * m[2]);
  return Math.max(normOdd, normEven);
}
2897
2994
  function extractLines(fnArray, argsArray) {
2898
2995
  const horizontals = [];
2899
2996
  const verticals = [];
2997
+ let ctm = [...IDENTITY];
2900
2998
  let lineWidth = 1;
2999
+ const stateStack = [];
2901
3000
  let currentPath = [];
2902
3001
  let pathStartX = 0, pathStartY = 0;
2903
3002
  let curX = 0, curY = 0;
@@ -2915,13 +3014,53 @@ function extractLines(fnArray, argsArray) {
2915
3014
  );
2916
3015
  }
2917
3016
  }
2918
- function flushPath(isStroke) {
2919
- if (!isStroke) {
3017
/**
 * Collapses a short closed path into its bounding shape, in place.
 *
 * The path must hold 3 to 5 segments and end within 1 unit of where it
 * starts. Depending on the bounding box, `path` is emptied and refilled with
 * a single horizontal centre line, a single vertical centre line, or the
 * output of `pushRectangle`.
 *
 * @param {Array<{x1:number,y1:number,x2:number,y2:number}>} path - Segments
 *   to inspect; mutated only when the function returns true.
 * @returns {boolean} True when `path` was rewritten, false when left as is.
 */
function tryConvertLinesToRectangle(path) {
  const segmentCount = path.length;
  if (segmentCount < 3 || segmentCount > 5) return false;

  const head = path[0];
  const tail = path[segmentCount - 1];
  const endsMeet = Math.abs(head.x1 - tail.x2) < 1 && Math.abs(head.y1 - tail.y2) < 1;
  if (!endsMeet) return false;

  // Bounding box over both endpoints of every segment.
  const xs = path.flatMap((seg) => [seg.x1, seg.x2]);
  const ys = path.flatMap((seg) => [seg.y1, seg.y2]);
  const minX = Math.min(Infinity, ...xs);
  const minY = Math.min(Infinity, ...ys);
  const maxX = Math.max(-Infinity, ...xs);
  const maxY = Math.max(-Infinity, ...ys);
  const w = maxX - minX;
  const h = maxY - minY;
  if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;

  path.length = 0;
  const flat = h < ORIENTATION_TOL * 2 || (w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH);
  const narrow = w < ORIENTATION_TOL * 2 || (h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH);
  if (flat) {
    // Thin horizontal box: keep only its horizontal centre line.
    const midY = (minY + maxY) / 2;
    path.push({ x1: minX, y1: midY, x2: maxX, y2: midY });
  } else if (narrow) {
    // Thin vertical box: keep only its vertical centre line.
    const midX = (minX + maxX) / 2;
    path.push({ x1: midX, y1: minY, x2: midX, y2: maxY });
  } else {
    pushRectangle(path, minX, minY, w, h);
  }
  return true;
}
3041
/**
 * Emits the segments collected in `currentPath` as line candidates and then
 * resets the path.
 *
 * - Neither stroked nor filled: the path is discarded.
 * - Fill only, with at least 3 segments: the path is first offered to
 *   `tryConvertLinesToRectangle`.
 * - Stroke only, with a scaled line width above `MAX_LINE_WIDTH`: the path
 *   is discarded.
 * Otherwise every segment is transformed by `ctm` and handed to
 * `classifyAndAdd` together with the scaled line width.
 *
 * @param {boolean} isStroke - Whether the paint operation strokes the path.
 * @param {boolean} isFill - Whether the paint operation fills the path.
 */
function flushPath(isStroke, isFill) {
  if (isStroke || isFill) {
    const fillOnly = isFill && !isStroke;
    const strokeOnly = isStroke && !isFill;
    if (fillOnly && currentPath.length >= 3) {
      tryConvertLinesToRectangle(currentPath);
    }
    // Line width is tracked in untransformed units; scale it by the matrix.
    const effectiveLW = lineWidth * matScale(ctm);
    const tooThick = effectiveLW > MAX_LINE_WIDTH && strokeOnly;
    if (!tooThick) {
      for (const seg of currentPath) {
        const [startX, startY] = matTransformPoint(ctm, seg.x1, seg.y1);
        const [endX, endY] = matTransformPoint(ctm, seg.x2, seg.y2);
        const placed = { x1: startX, y1: startY, x2: endX, y2: endY };
        classifyAndAdd(placed, effectiveLW, horizontals, verticals);
      }
    }
  }
  currentPath = [];
}
@@ -2929,9 +3068,28 @@ function extractLines(fnArray, argsArray) {
2929
3068
  const op = fnArray[i];
2930
3069
  const args = argsArray[i];
2931
3070
  switch (op) {
3071
+ // ── Graphics State ──
3072
+ case OPS.save:
3073
+ stateStack.push({ ctm: [...ctm], lineWidth });
3074
+ break;
3075
+ case OPS.restore:
3076
+ if (stateStack.length > 0) {
3077
+ const state = stateStack.pop();
3078
+ ctm = state.ctm;
3079
+ lineWidth = state.lineWidth;
3080
+ }
3081
+ break;
3082
+ case OPS.transform: {
3083
+ const m = args;
3084
+ if (m.length >= 6) {
3085
+ ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
3086
+ }
3087
+ break;
3088
+ }
2932
3089
  case OPS.setLineWidth:
2933
3090
  lineWidth = args[0] || 1;
2934
3091
  break;
3092
+ // ── Path Construction ──
2935
3093
  case OPS.constructPath: {
2936
3094
  const arg0 = args[0];
2937
3095
  if (Array.isArray(arg0)) {
@@ -2999,34 +3157,60 @@ function extractLines(fnArray, argsArray) {
2999
3157
  }
3000
3158
  }
3001
3159
  }
3002
- if (afterOp === OPS.stroke || afterOp === OPS.closeStroke) {
3003
- flushPath(true);
3004
- } else if (afterOp === OPS.fill || afterOp === OPS.eoFill || afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke) {
3005
- flushPath(true);
3160
+ const isStroke5 = afterOp === OPS.stroke || afterOp === OPS.closeStroke;
3161
+ const isFill5 = afterOp === OPS.fill || afterOp === OPS.eoFill;
3162
+ const isBoth5 = afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke;
3163
+ if (isStroke5 || isFill5 || isBoth5) {
3164
+ flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
3006
3165
  } else if (afterOp === OPS.endPath) {
3007
- flushPath(false);
3166
+ flushPath(false, false);
3008
3167
  }
3009
3168
  }
3010
3169
  break;
3011
3170
  }
3171
+ // ── Paint Operations ──
3012
3172
  case OPS.stroke:
3013
3173
  case OPS.closeStroke:
3014
- flushPath(true);
3174
+ flushPath(true, false);
3015
3175
  break;
3016
3176
  case OPS.fill:
3017
3177
  case OPS.eoFill:
3178
+ flushPath(false, true);
3179
+ break;
3018
3180
  case OPS.fillStroke:
3019
3181
  case OPS.eoFillStroke:
3020
3182
  case OPS.closeFillStroke:
3021
3183
  case OPS.closeEOFillStroke:
3022
- flushPath(true);
3184
+ flushPath(true, true);
3023
3185
  break;
3024
3186
  case OPS.endPath:
3025
- flushPath(false);
3187
+ flushPath(false, false);
3026
3188
  break;
3027
3189
  }
3028
3190
  }
3029
- return { horizontals, verticals };
3191
+ return {
3192
+ horizontals: deduplicateLines(horizontals),
3193
+ verticals: deduplicateLines(verticals)
3194
+ };
3195
+ }
3196
/**
 * Drops lines whose four coordinates all lie within `COORD_MERGE_TOL` of a
 * line that was kept earlier. When a duplicate is dropped, the kept line
 * takes the larger of the two `lineWidth` values (the kept object is
 * mutated).
 *
 * @param {Array<{x1:number,y1:number,x2:number,y2:number,lineWidth:number}>} lines
 * @returns {Array<object>} The input array itself when it has at most one
 *   entry, otherwise a new array of the surviving line objects.
 */
function deduplicateLines(lines) {
  if (lines.length <= 1) return lines;
  const isClose = (a, b) => Math.abs(a - b) <= COORD_MERGE_TOL;
  const kept = [];
  for (const candidate of lines) {
    const twin = kept.find(
      (other) =>
        isClose(candidate.y1, other.y1) &&
        isClose(candidate.y2, other.y2) &&
        isClose(candidate.x1, other.x1) &&
        isClose(candidate.x2, other.x2)
    );
    if (twin === undefined) {
      kept.push(candidate);
      continue;
    }
    if (candidate.lineWidth > twin.lineWidth) {
      twin.lineWidth = candidate.lineWidth;
    }
  }
  return kept;
}
3031
3215
  function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
3032
3216
  const dx = Math.abs(seg.x2 - seg.x1);
@@ -3622,6 +3806,7 @@ async function parsePdfDocument(buffer, options) {
3622
3806
  const medianFontSize = computeMedianFontSize(allFontSizes);
3623
3807
  if (medianFontSize > 0) {
3624
3808
  detectHeadings(blocks, medianFontSize);
3809
+ mergeAdjacentHeadings(blocks);
3625
3810
  }
3626
3811
  detectMarkerHeadings(blocks);
3627
3812
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
@@ -3696,6 +3881,46 @@ function detectHeadings(blocks, medianFontSize) {
3696
3881
  }
3697
3882
  }
3698
3883
  }
3884
/**
 * Merges consecutive heading blocks that belong together, in place.
 *
 * Two neighbouring headings are merged when both have a bbox and text, share
 * the same level and page, and their vertical anchors (bbox.y plus font size,
 * falling back to bbox height) differ by less than 1.5x the larger font size.
 *
 * Text order: fragments on the same visual line (anchor difference below half
 * the larger font size) are joined left to right by bbox.x. Fragments on
 * different lines, such as a wrapped heading, keep their existing block
 * order, so a second line that starts further left is not moved in front.
 *
 * The surviving block gets the union bounding box of both fragments. The
 * merged block is re-examined against its new neighbour, so runs of three or
 * more fragments collapse into one heading.
 *
 * @param {Array<object>} blocks - Block list; mutated (texts joined, bboxes
 *   replaced, merged entries spliced out).
 * @returns {void}
 */
function mergeAdjacentHeadings(blocks) {
  let i = 0;
  while (i < blocks.length - 1) {
    const curr = blocks[i];
    const next = blocks[i + 1];
    const bothHeadings = curr.type === "heading" && next.type === "heading";
    if (!bothHeadings || !curr.bbox || !next.bbox || !curr.text || !next.text) {
      i++;
      continue;
    }
    const currAnchor = curr.bbox.y + (curr.style?.fontSize || curr.bbox.height);
    const nextAnchor = next.bbox.y + (next.style?.fontSize || next.bbox.height);
    const yDiff = Math.abs(currAnchor - nextAnchor);
    const maxFs = Math.max(curr.style?.fontSize || 12, next.style?.fontSize || 12);
    const closeEnough = curr.bbox.page === next.bbox.page && yDiff < maxFs * 1.5;
    if (!closeEnough || curr.level !== next.level) {
      i++;
      continue;
    }
    // Only reorder by x when the fragments really share a line; for stacked
    // lines the existing block order is the reading order.
    const onSameLine = yDiff < maxFs * 0.5;
    const nextReadsFirst = onSameLine && next.bbox.x < curr.bbox.x;
    curr.text = nextReadsFirst ? next.text + " " + curr.text : curr.text + " " + next.text;

    const left = Math.min(curr.bbox.x, next.bbox.x);
    const top = Math.min(curr.bbox.y, next.bbox.y);
    const right = Math.max(curr.bbox.x + curr.bbox.width, next.bbox.x + next.bbox.width);
    const bottom = Math.max(curr.bbox.y + curr.bbox.height, next.bbox.y + next.bbox.height);
    curr.bbox = {
      page: curr.bbox.page,
      x: left,
      y: top,
      width: right - left,
      height: bottom - top
    };
    // Do not advance: the merged block may also absorb the following one.
    blocks.splice(i + 1, 1);
  }
}
3699
3924
  function collapseEvenSpacing(text) {
3700
3925
  const tokens = text.split(" ");
3701
3926
  const singleCharCount = tokens.filter((t) => t.length === 1).length;
@@ -3704,6 +3929,169 @@ function collapseEvenSpacing(text) {
3704
3929
  }
3705
3930
  return text;
3706
3931
  }
3932
/**
 * Builds paragraph blocks for a page by ordering its text items with
 * `xyCutOrder` and turning every y-grouped line of each group into one block.
 *
 * The gap threshold passed to `xyCutOrder` is 3% of the vertical spread of
 * the items' `y` values, with a floor of 15. The spread is found with a
 * linear scan instead of spreading all values into `Math.max`/`Math.min`,
 * which can exceed the engine's argument limit on very dense pages.
 *
 * @param {Array<object>} items - Text items; only `y` is read directly here.
 * @param {number} pageNum - Page number stamped on every produced block.
 * @returns {Array<object>|null} Paragraph blocks with text, bbox and style,
 *   or null when there are no items or no line yields non-blank text.
 */
function buildXyCutBlocks(items, pageNum) {
  if (items.length === 0) return null;

  let minY = Infinity;
  let maxY = -Infinity;
  for (const item of items) {
    if (item.y < minY) minY = item.y;
    if (item.y > maxY) maxY = item.y;
  }
  const ySpread = maxY - minY;
  const gapThreshold = Math.max(15, ySpread * 0.03);

  const blocks = [];
  for (const group of xyCutOrder(items, gapThreshold)) {
    if (group.length === 0) continue;
    for (const line of groupByY(group)) {
      const text = mergeLineSimple(line);
      if (!text.trim()) continue;
      blocks.push({
        type: "paragraph",
        text,
        pageNumber: pageNum,
        bbox: computeBBox(line, pageNum),
        style: dominantStyle(line)
      });
    }
  }
  return blocks.length > 0 ? blocks : null;
}
3955
/**
 * Re-derives blocks for a grid table that captured too little structure.
 *
 * A table counts as under-segmented when it is 1x1, or when at most two
 * cells contain text and either those cells hold 8+ text lines or the table
 * area contains 6+ text items. For such tables the function tries, in order:
 *   1. `buildXyCutBlocks` if `hasMultiColumnLayout(items)` is true,
 *   2. `buildTableFromTextLayout`,
 *   3. `detectClusterTables`, emitting the detected tables plus one
 *      paragraph per unused non-blank item, sorted by descending
 *      `bbox.y + bbox.height`.
 *
 * @param {object} table - Table with `rows`, `cols` and a `cells` grid.
 * @param {Array<object>} items - Text items located inside the table area.
 * @param {number} pageNum - Page number for produced blocks.
 * @param {object} bbox - Bounding box forwarded to `buildTableFromTextLayout`.
 * @returns {Array<object>|null} Replacement blocks, or null when the table is
 *   not under-segmented or no strategy produced anything.
 */
function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
  // One pass over the grid: count cells with text and their text lines.
  let filledCells = 0;
  let textLines = 0;
  for (const row of table.cells) {
    for (const cell of row) {
      if (!cell.text.trim()) continue;
      filledCells += 1;
      textLines += cell.text.split("\n").length;
    }
  }
  const isSingleCell = table.rows === 1 && table.cols === 1;
  const isSparse = filledCells <= 2 && (textLines >= 8 || items.length >= 6);
  if (!isSingleCell && !isSparse) return null;

  if (hasMultiColumnLayout(items)) return buildXyCutBlocks(items, pageNum);

  const rebuilt = buildTableFromTextLayout(items, pageNum, bbox);
  if (rebuilt) return rebuilt;

  const clusterItems = items.map(({ text, x, y, w, h, fontSize, fontName }) => ({
    text,
    x,
    y,
    w,
    h,
    fontSize,
    fontName
  }));
  const clusterResults = detectClusterTables(clusterItems, pageNum);
  if (!(clusterResults.length > 0)) return null;

  // clusterItems[k] mirrors items[k], so positions found here index `items`.
  const positionOf = new Map(clusterItems.map((ci, idx) => [ci, idx]));
  const consumed = new Set();
  const produced = [];
  for (const found of clusterResults) {
    for (const used of found.usedItems) {
      const idx = positionOf.get(used);
      if (idx !== void 0) consumed.add(idx);
    }
    produced.push({ type: "table", table: found.table, pageNumber: pageNum, bbox: found.bbox });
  }
  items.forEach((item, idx) => {
    if (consumed.has(idx)) return;
    if (!item.text.trim()) return;
    produced.push({
      type: "paragraph",
      text: item.text,
      pageNumber: pageNum,
      bbox: computeBBox([item], pageNum),
      style: { fontSize: item.fontSize, fontName: item.fontName }
    });
  });
  const sortKey = (block) => (block.bbox ? block.bbox.y + block.bbox.height : 0);
  produced.sort((a, b) => sortKey(b) - sortKey(a));
  return produced.length > 0 ? produced : null;
}
4005
/**
 * Reconstructs a table purely from the positions of text items.
 *
 * Steps:
 *   1. Group items into rows by `y` (tolerance 3, highest `y` first).
 *   2. In every row, record the midpoint of each horizontal gap that is at
 *      least 1.5x the row's average font size.
 *   3. Cluster gap midpoints lying within 15 units of the running cluster
 *      mean; clusters seen at least twice become column boundaries.
 *   4. Assign each item to a column by its horizontal centre. A row whose
 *      first column is empty is folded into the previous row.
 *   5. Reject the result if fewer than 2 rows remain or fewer than 30% of
 *      the cells contain text.
 *
 * Leading bullet markers are stripped from cells in every column but the
 * first. A hyphen directly followed by a digit is kept, so negative numbers
 * such as "-5" retain their sign.
 *
 * @param {Array<object>} items - Text items with `text`, `x`, `y`, `w` and
 *   `fontSize`.
 * @param {number} pageNum - Page number for the produced block.
 * @param {object} bbox - Bounding box attached to the produced block as is.
 * @returns {Array<object>|null} A one-element array holding the table block,
 *   or null when no convincing table structure is found.
 */
function buildTableFromTextLayout(items, pageNum, bbox) {
  if (items.length < 4) return null;

  // Step 1: rows, anchored on the y of the first item of each row.
  const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
  const yTol = 3;
  const rows = [];
  let curRow = [sorted[0]];
  let curY = sorted[0].y;
  for (let i = 1; i < sorted.length; i++) {
    if (Math.abs(sorted[i].y - curY) <= yTol) {
      curRow.push(sorted[i]);
    } else {
      rows.push(curRow);
      curRow = [sorted[i]];
      curY = sorted[i].y;
    }
  }
  rows.push(curRow);
  if (rows.length < 2) return null;

  // Step 2: midpoints of wide gaps between neighbouring items of a row.
  const gapPositions = [];
  for (const row of rows) {
    if (row.length < 2) continue;
    const sortedX = [...row].sort((a, b) => a.x - b.x);
    const avgFs = sortedX.reduce((s, i) => s + i.fontSize, 0) / sortedX.length;
    for (let j = 1; j < sortedX.length; j++) {
      const prevRight = sortedX[j - 1].x + sortedX[j - 1].w;
      const gap = sortedX[j].x - prevRight;
      if (gap >= avgFs * 1.5) {
        gapPositions.push(prevRight + gap / 2);
      }
    }
  }
  if (gapPositions.length < 2) return null;

  // Step 3: recurring gap positions become column boundaries.
  gapPositions.sort((a, b) => a - b);
  const colBoundaries = [];
  let clusterSum = gapPositions[0];
  let clusterCount = 1;
  for (let i = 1; i < gapPositions.length; i++) {
    const avg = clusterSum / clusterCount;
    if (Math.abs(gapPositions[i] - avg) <= 15) {
      clusterSum += gapPositions[i];
      clusterCount++;
    } else {
      if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
      clusterSum = gapPositions[i];
      clusterCount = 1;
    }
  }
  if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
  if (colBoundaries.length === 0) return null;

  // Step 4: distribute items over columns.
  const numCols = colBoundaries.length + 1;
  const tableRows = [];
  for (const row of rows) {
    const cells = Array(numCols).fill("");
    const sortedX = [...row].sort((a, b) => a.x - b.x);
    for (const item of sortedX) {
      const cx = item.x + item.w / 2;
      let col = 0;
      for (let b = 0; b < colBoundaries.length; b++) {
        if (cx > colBoundaries[b]) col = b + 1;
      }
      cells[col] = cells[col] ? cells[col] + " " + item.text : item.text;
    }
    if (cells[0].trim() === "" && tableRows.length > 0) {
      // No text in the first column: append this row to the previous one.
      const prevCells = tableRows[tableRows.length - 1].cells;
      for (let c = 0; c < numCols; c++) {
        if (cells[c].trim()) {
          prevCells[c] = prevCells[c] ? prevCells[c] + " " + cells[c].trim() : cells[c].trim();
        }
      }
    } else {
      tableRows.push({ cells });
    }
  }

  // Step 5: sanity checks.
  if (tableRows.length < 2) return null;
  const nonEmptyCount = tableRows.reduce((sum, r) => sum + r.cells.filter((c) => c.trim()).length, 0);
  const totalCount = tableRows.length * numCols;
  if (nonEmptyCount < totalCount * 0.3) return null;

  // A hyphen followed by a digit is a sign, not a bullet.
  const leadingBullet = /^(?:[•○·]|-(?!\d))\s*/;
  const irCells = tableRows.map(
    (r) => r.cells.map((text, colIdx) => {
      const trimmed = text.trim();
      const cleaned = colIdx > 0 ? trimmed.replace(leadingBullet, "") : trimmed;
      return { text: cleaned, colSpan: 1, rowSpan: 1 };
    })
  );
  const irTable = {
    rows: tableRows.length,
    cols: numCols,
    cells: irCells,
    hasHeader: tableRows.length > 1
  };
  return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
}
3707
4095
  function shouldDemoteTable(table) {
3708
4096
  const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
3709
4097
  const allText = allCells.join(" ");
@@ -3750,6 +4138,32 @@ function detectMarkerHeadings(blocks) {
3750
4138
  }
3751
4139
  }
3752
4140
  }
4141
/**
 * Heuristically decides whether a set of text items is laid out in two
 * side-by-side columns.
 *
 * All of the following must hold:
 *   - at least 30 items, spanning at least 200 units horizontally;
 *   - the widest gap between x-sorted neighbours is at least 20 units;
 *   - the middle of that gap lies between 35% and 65% of the horizontal span;
 *   - at least 15 items have their centre on each side of the gap, and the
 *     smaller side holds at least 35% as many items as the larger one.
 *
 * @param {Array<{x:number,w:number}>} items - Text items; not mutated.
 * @returns {boolean} True when the items look like a two-column layout.
 */
function hasMultiColumnLayout(items) {
  if (items.length < 30) return false;

  const byX = [...items].sort((a, b) => a.x - b.x);
  const minX = byX[0].x;
  const maxX = byX.reduce((edge, item) => (item.x + item.w > edge ? item.x + item.w : edge), minX);
  const pageWidth = maxX - minX;
  if (pageWidth < 200) return false;

  // Widest horizontal gap between consecutive items and its midpoint.
  let bestGap = 0;
  let bestSplit = 0;
  for (let k = 1; k < byX.length; k++) {
    const prevRight = byX[k - 1].x + byX[k - 1].w;
    const gap = byX[k].x - prevRight;
    if (gap > bestGap) {
      bestGap = gap;
      bestSplit = (prevRight + byX[k].x) / 2;
    }
  }
  if (bestGap < 20) return false;

  const splitRatio = (bestSplit - minX) / pageWidth;
  if (splitRatio < 0.35 || splitRatio > 0.65) return false;

  let leftCount = 0;
  let rightCount = 0;
  for (const item of items) {
    const centre = item.x + item.w / 2;
    if (centre < bestSplit) leftCount++;
    else if (centre >= bestSplit) rightCount++;
  }
  if (leftCount < 15 || rightCount < 15) return false;

  const balance = Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount);
  return !(balance < 0.35);
}
3753
4167
  var MAX_XYCUT_DEPTH = 50;
3754
4168
  function xyCutOrder(items, gapThreshold, depth = 0) {
3755
4169
  if (items.length === 0) return [];
@@ -3880,6 +4294,11 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
3880
4294
  width: grid.bbox.x2 - grid.bbox.x1,
3881
4295
  height: grid.bbox.y2 - grid.bbox.y1
3882
4296
  };
4297
+ const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
4298
+ if (normalized) {
4299
+ blocks.push(...normalized);
4300
+ continue;
4301
+ }
3883
4302
  if (shouldDemoteTable(irTable)) {
3884
4303
  const demoted = demoteTableToText(irTable);
3885
4304
  if (demoted) {
@@ -3925,6 +4344,10 @@ function mergeAdjacentTableBlocks(blocks) {
3925
4344
  }
3926
4345
  function extractPageBlocksFallback(items, pageNum) {
3927
4346
  if (items.length === 0) return [];
4347
+ if (hasMultiColumnLayout(items)) {
4348
+ const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
4349
+ return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
4350
+ }
3928
4351
  const blocks = [];
3929
4352
  const allYLines = groupByY(items);
3930
4353
  const columns = detectColumns(allYLines);
@@ -3942,7 +4365,7 @@ function extractPageBlocksFallback(items, pageNum) {
3942
4365
  fontSize: i.fontSize,
3943
4366
  fontName: i.fontName
3944
4367
  }));
3945
- const clusterResults = detectClusterTables(clusterItems, pageNum);
4368
+ const clusterResults = hasMultiColumnLayout(items) ? [] : detectClusterTables(clusterItems, pageNum);
3946
4369
  if (clusterResults.length > 0) {
3947
4370
  const ciToIdx = /* @__PURE__ */ new Map();
3948
4371
  for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);