kordoc 2.0.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -85,6 +85,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
85
85
  blocks.push({ type: "paragraph", text: text.trim(), pageNumber: i });
86
86
  }
87
87
  } catch {
88
+ blocks.push({ type: "paragraph", text: `[OCR \uC2E4\uD328: \uD398\uC774\uC9C0 ${i}]` });
88
89
  }
89
90
  }
90
91
  return blocks;
@@ -182,7 +183,7 @@ var import_zlib = require("zlib");
182
183
  var import_xmldom = require("@xmldom/xmldom");
183
184
 
184
185
  // src/utils.ts
185
- var VERSION = true ? "2.0.2" : "0.0.0-dev";
186
+ var VERSION = true ? "2.1.0" : "0.0.0-dev";
186
187
  function toArrayBuffer(buf) {
187
188
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
188
189
  return buf.buffer;
@@ -371,6 +372,47 @@ function sanitizeText(text) {
371
372
  }
372
373
  return result;
373
374
  }
375
/**
 * Replaces short tables that appear to hold running text with plain paragraphs.
 *
 * A table block is flattened only when it has at most 3 rows, is not a single
 * 1x1 cell, and either contains more than 5 newline characters across its cells
 * or has at most 2 rows with more than 300 characters of text. Every other
 * block is passed through untouched (same object reference).
 *
 * @param {Array<object>} blocks - Parsed blocks; table blocks carry `table.rows`,
 *   `table.cols` and `table.cells[r][c].text`.
 * @returns {Array<object>} A new array; flattened tables become one
 *   `{ type: "paragraph", text, pageNumber }` per non-empty trimmed line.
 */
function flattenLayoutTables(blocks) {
  const flattened = [];

  for (const entry of blocks) {
    const table = entry.type === "table" ? entry.table : null;
    if (!table) {
      flattened.push(entry);
      continue;
    }

    const { rows: rowCount, cols: colCount, cells } = table;
    const isSingleCell = rowCount === 1 && colCount === 1;
    // `!(rowCount <= 3)` also keeps tables whose row count is not a number.
    if (isSingleCell || !(rowCount <= 3)) {
      flattened.push(entry);
      continue;
    }

    // Measure how much multi-line / long text the table holds.
    let newlineCount = 0;
    let charCount = 0;
    for (let row = 0; row < rowCount; row += 1) {
      for (let col = 0; col < colCount; col += 1) {
        const raw = cells[row]?.[col]?.text || "";
        newlineCount += raw.split("\n").length - 1;
        charCount += raw.length;
      }
    }

    const looksLikeLayout = newlineCount > 5 || (rowCount <= 2 && charCount > 300);
    if (!looksLikeLayout) {
      flattened.push(entry);
      continue;
    }

    // Emit the cell contents row by row, one paragraph per non-blank line.
    for (let row = 0; row < rowCount; row += 1) {
      for (let col = 0; col < colCount; col += 1) {
        const content = cells[row]?.[col]?.text?.trim();
        if (!content) continue;
        const pieces = content
          .split("\n")
          .map((piece) => piece.trim())
          .filter((piece) => piece);
        for (const piece of pieces) {
          flattened.push({ type: "paragraph", text: piece, pageNumber: entry.pageNumber });
        }
      }
    }
  }

  return flattened;
}
374
416
  function blocksToMarkdown(blocks) {
375
417
  const lines = [];
376
418
  for (let i = 0; i < blocks.length; i++) {
@@ -471,6 +513,9 @@ function tableToMarkdown(table) {
471
513
  if (dr === 0 && dc === 0) continue;
472
514
  if (r + dr < numRows && c + dc < numCols) {
473
515
  skip.add(`${r + dr},${c + dc}`);
516
+ if (dr === 0) {
517
+ display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
518
+ }
474
519
  }
475
520
  }
476
521
  }
@@ -566,7 +611,12 @@ function parseCharProperties(doc, map) {
566
611
  if (!id) continue;
567
612
  const prop = {};
568
613
  const height = el.getAttribute("height");
569
- if (height) prop.fontSize = parseInt(height, 10) / 100;
614
+ if (height) {
615
+ const parsedHeight = parseInt(height, 10);
616
+ if (!isNaN(parsedHeight) && parsedHeight > 0) {
617
+ prop.fontSize = parsedHeight / 100;
618
+ }
619
+ }
570
620
  const bold = el.getAttribute("bold");
571
621
  if (bold === "true" || bold === "1") prop.bold = true;
572
622
  const italic = el.getAttribute("italic");
@@ -706,7 +756,7 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
706
756
  const data = await file.async("uint8array");
707
757
  decompressed.total += data.length;
708
758
  if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
709
- const ext = ref.includes(".") ? ref.split(".").pop() : "png";
759
+ const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
710
760
  const mimeType = imageExtToMime(ext);
711
761
  imageIndex++;
712
762
  const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
@@ -1000,8 +1050,10 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
1000
1050
  break;
1001
1051
  case "cellSpan":
1002
1052
  if (tableCtx?.cell) {
1003
- const cs = parseInt(el.getAttribute("colSpan") || "1", 10);
1004
- const rs = parseInt(el.getAttribute("rowSpan") || "1", 10);
1053
+ const rawCs = parseInt(el.getAttribute("colSpan") || "1", 10);
1054
+ const cs = isNaN(rawCs) ? 1 : rawCs;
1055
+ const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
1056
+ const rs = isNaN(rawRs) ? 1 : rawRs;
1005
1057
  tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
1006
1058
  tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
1007
1059
  }
@@ -1093,6 +1145,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
1093
1145
  extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
1094
1146
  } else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
1095
1147
  walkChildren(el, d + 1);
1148
+ } else if (localTag === "run") {
1149
+ tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
1096
1150
  }
1097
1151
  }
1098
1152
  };
@@ -1269,8 +1323,9 @@ var TAG_CHAR_SHAPE = 68;
1269
1323
  var TAG_CTRL_HEADER = 71;
1270
1324
  var TAG_LIST_HEADER = 72;
1271
1325
  var TAG_TABLE = 77;
1272
- var TAG_DOC_CHAR_SHAPE = 55;
1273
- var TAG_DOC_STYLE = 58;
1326
+ var TAG_DOC_CHAR_SHAPE = 21;
1327
+ var TAG_DOC_PARA_SHAPE = 25;
1328
+ var TAG_DOC_STYLE = 26;
1274
1329
  var CHAR_LINE = 0;
1275
1330
  var CHAR_SECTION_BREAK = 10;
1276
1331
  var CHAR_PARA = 13;
@@ -1326,8 +1381,14 @@ function parseFileHeader(data) {
1326
1381
  }
1327
1382
  function parseDocInfo(records) {
1328
1383
  const charShapes = [];
1384
+ const paraShapes = [];
1329
1385
  const styles = [];
1330
1386
  for (const rec of records) {
1387
+ if (rec.tagId === TAG_DOC_PARA_SHAPE && rec.data.length >= 4) {
1388
+ const flags = rec.data.readUInt32LE(0);
1389
+ const outlineLevel = flags >> 25 & 7;
1390
+ paraShapes.push({ outlineLevel });
1391
+ }
1331
1392
  if (rec.tagId === TAG_DOC_CHAR_SHAPE && rec.data.length >= 18) {
1332
1393
  if (rec.data.length >= 50) {
1333
1394
  const fontSize = rec.data.readUInt32LE(42);
@@ -1367,7 +1428,7 @@ function parseDocInfo(records) {
1367
1428
  }
1368
1429
  }
1369
1430
  }
1370
- return { charShapes, styles };
1431
+ return { charShapes, paraShapes, styles };
1371
1432
  }
1372
1433
  function extractText(data) {
1373
1434
  let result = "";
@@ -2379,12 +2440,13 @@ function parseHwp5Document(buffer, options) {
2379
2440
  }
2380
2441
  }
2381
2442
  const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
2443
+ const flatBlocks = flattenLayoutTables(blocks);
2382
2444
  if (docInfo) {
2383
- detectHwp5Headings(blocks, docInfo);
2445
+ detectHwp5Headings(flatBlocks, docInfo);
2384
2446
  }
2385
- const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
2386
- const markdown = blocksToMarkdown(blocks);
2387
- return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
2447
+ const outline = flatBlocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
2448
+ const markdown = blocksToMarkdown(flatBlocks);
2449
+ return { markdown, blocks: flatBlocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
2388
2450
  }
2389
2451
  function parseDocInfoStream(cfb, compressed) {
2390
2452
  try {
@@ -2435,16 +2497,21 @@ function detectHwp5Headings(blocks, docInfo) {
2435
2497
  }
2436
2498
  if (baseFontSize <= 0) return;
2437
2499
  for (const block of blocks) {
2438
- if (block.type !== "paragraph" || !block.text || !block.style?.fontSize) continue;
2500
+ if (block.type === "heading") continue;
2501
+ if (block.type !== "paragraph" || !block.text) continue;
2439
2502
  const text = block.text.trim();
2440
2503
  if (text.length === 0 || text.length > 200) continue;
2441
2504
  if (/^\d+$/.test(text)) continue;
2442
- const ratio = block.style.fontSize / baseFontSize;
2443
2505
  let level = 0;
2444
- if (ratio >= HEADING_RATIO_H1) level = 1;
2445
- else if (ratio >= HEADING_RATIO_H2) level = 2;
2446
- else if (ratio >= HEADING_RATIO_H3) level = 3;
2447
- if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
2506
+ if (block.style?.fontSize && baseFontSize > 0) {
2507
+ const ratio = block.style.fontSize / baseFontSize;
2508
+ if (ratio >= HEADING_RATIO_H1) level = 1;
2509
+ else if (ratio >= HEADING_RATIO_H2) level = 2;
2510
+ else if (ratio >= HEADING_RATIO_H3) level = 3;
2511
+ }
2512
+ if (/^제\d+[장절편]\s/.test(text) && text.length <= 50) {
2513
+ if (level === 0) level = 2;
2514
+ } else if (/^제\d+(조의?\d*)\s*[\((]/.test(text) && text.length <= 80) {
2448
2515
  if (level === 0) level = 3;
2449
2516
  }
2450
2517
  if (level > 0) {
@@ -2676,13 +2743,20 @@ function parseSection(records, docInfo, warnings, sectionNum) {
2676
2743
  while (i < records.length) {
2677
2744
  const rec = records[i];
2678
2745
  if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
2679
- const { paragraph, tables, nextIdx, charShapeIds } = parseParagraphWithTables(records, i);
2746
+ const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
2680
2747
  if (paragraph) {
2681
2748
  const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
2682
2749
  if (docInfo && charShapeIds.length > 0) {
2683
2750
  const style = resolveCharStyle(charShapeIds, docInfo);
2684
2751
  if (style) block.style = style;
2685
2752
  }
2753
+ if (docInfo && paraShapeId >= 0 && paraShapeId < docInfo.paraShapes.length) {
2754
+ const ol = docInfo.paraShapes[paraShapeId].outlineLevel;
2755
+ if (ol >= 1 && ol <= 6) {
2756
+ block.type = "heading";
2757
+ block.level = ol;
2758
+ }
2759
+ }
2686
2760
  blocks.push(block);
2687
2761
  }
2688
2762
  for (const t of tables) blocks.push({ type: "table", table: t, pageNumber: sectionNum });
@@ -2802,6 +2876,8 @@ function parseParagraphWithTables(records, startIdx) {
2802
2876
  let text = "";
2803
2877
  const tables = [];
2804
2878
  const charShapeIds = [];
2879
+ const paraHeaderData = records[startIdx].data;
2880
+ const paraShapeId = paraHeaderData.length >= 10 ? paraHeaderData.readUInt16LE(8) : -1;
2805
2881
  let i = startIdx + 1;
2806
2882
  while (i < records.length) {
2807
2883
  const rec = records[i];
@@ -2826,7 +2902,7 @@ function parseParagraphWithTables(records, startIdx) {
2826
2902
  i++;
2827
2903
  }
2828
2904
  const trimmed = text.trim();
2829
- return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds };
2905
+ return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
2830
2906
  }
2831
2907
  function parseTableBlock(records, startIdx) {
2832
2908
  const tableLevel = records[startIdx].level;
@@ -2939,10 +3015,33 @@ var MIN_LINE_LENGTH = 10;
2939
3015
  var COORD_MERGE_TOL = 3;
2940
3016
  var CONNECT_TOL = 5;
2941
3017
  var CELL_PADDING = 2;
3018
+ var MAX_LINE_WIDTH = 5;
3019
+ var IDENTITY = [1, 0, 0, 1, 0, 0];
3020
/**
 * Multiplies two 2D affine matrices given in six-element `[a, b, c, d, e, f]` form.
 *
 * @param {number[]} m1 - Left-hand matrix.
 * @param {number[]} m2 - Right-hand matrix.
 * @returns {number[]} A new six-element array holding the product.
 */
function matMultiply(m1, m2) {
  const a1 = m1[0];
  const b1 = m1[1];
  const c1 = m1[2];
  const d1 = m1[3];
  const a2 = m2[0];
  const b2 = m2[1];
  const c2 = m2[2];
  const d2 = m2[3];
  const e2 = m2[4];
  const f2 = m2[5];

  const product = [];
  // Linear part.
  product.push(a1 * a2 + c1 * b2);
  product.push(b1 * a2 + d1 * b2);
  product.push(a1 * c2 + c1 * d2);
  product.push(b1 * c2 + d1 * d2);
  // Translation part: m2's offset mapped through m1, plus m1's own offset.
  product.push(a1 * e2 + c1 * f2 + m1[4]);
  product.push(b1 * e2 + d1 * f2 + m1[5]);
  return product;
}
3030
/**
 * Applies a six-element affine matrix `[a, b, c, d, e, f]` to the point (x, y).
 *
 * @param {number[]} m - Transformation matrix.
 * @param {number} x - Source x coordinate.
 * @param {number} y - Source y coordinate.
 * @returns {[number, number]} The transformed `[x, y]` pair.
 */
function matTransformPoint(m, x, y) {
  const mappedX = m[0] * x + m[2] * y + m[4];
  const mappedY = m[1] * x + m[3] * y + m[5];
  return [mappedX, mappedY];
}
3033
/**
 * Returns a single scale factor for a six-element affine matrix: the larger of
 * the Euclidean lengths of (m[1], m[3]) and (m[0], m[2]).
 *
 * @param {number[]} m - Transformation matrix `[a, b, c, d, e, f]`.
 * @returns {number} The larger of the two lengths.
 */
function matScale(m) {
  const [a, b, c, d] = [m[0], m[1], m[2], m[3]];
  const lengthBD = Math.sqrt(b * b + d * d);
  const lengthAC = Math.sqrt(a * a + c * c);
  return Math.max(lengthBD, lengthAC);
}
2942
3039
  function extractLines(fnArray, argsArray) {
2943
3040
  const horizontals = [];
2944
3041
  const verticals = [];
3042
+ let ctm = [...IDENTITY];
2945
3043
  let lineWidth = 1;
3044
+ const stateStack = [];
2946
3045
  let currentPath = [];
2947
3046
  let pathStartX = 0, pathStartY = 0;
2948
3047
  let curX = 0, curY = 0;
@@ -2960,13 +3059,53 @@ function extractLines(fnArray, argsArray) {
2960
3059
  );
2961
3060
  }
2962
3061
  }
2963
- function flushPath(isStroke) {
2964
- if (!isStroke) {
3062
/**
 * Collapses a small closed path (3 to 5 segments) into its bounding shape,
 * rewriting `path` in place.
 *
 * The path counts as closed when the first segment's start lies within 1 unit
 * of the last segment's end on both axes. Bounding boxes smaller than
 * MIN_LINE_LENGTH in both dimensions are left alone. Otherwise the path is
 * replaced by a horizontal centre line (flat box), a vertical centre line
 * (thin box), or whatever `pushRectangle` appends for a full rectangle.
 *
 * @param {Array<{x1:number,y1:number,x2:number,y2:number}>} path - Segments; mutated on success.
 * @returns {boolean} `true` when the path was rewritten, `false` when left untouched.
 */
function tryConvertLinesToRectangle(path) {
  const segmentCount = path.length;
  if (segmentCount < 3 || segmentCount > 5) return false;

  const start = path[0];
  const end = path[segmentCount - 1];
  const isClosed = Math.abs(start.x1 - end.x2) < 1 && Math.abs(start.y1 - end.y2) < 1;
  if (!isClosed) return false;

  // Bounding box over every endpoint.
  const xs = path.flatMap((segment) => [segment.x1, segment.x2]);
  const ys = path.flatMap((segment) => [segment.y1, segment.y2]);
  const left = Math.min(...xs);
  const right = Math.max(...xs);
  const bottom = Math.min(...ys);
  const top = Math.max(...ys);
  const width = right - left;
  const height = top - bottom;
  if (width < MIN_LINE_LENGTH && height < MIN_LINE_LENGTH) return false;

  const isFlat = height < ORIENTATION_TOL * 2 || (width > MIN_LINE_LENGTH && height <= MAX_LINE_WIDTH);
  const isThin = width < ORIENTATION_TOL * 2 || (height > MIN_LINE_LENGTH && width <= MAX_LINE_WIDTH);

  path.splice(0, path.length);
  if (isFlat) {
    const midY = (bottom + top) / 2;
    path.push({ x1: left, y1: midY, x2: right, y2: midY });
  } else if (isThin) {
    const midX = (left + right) / 2;
    path.push({ x1: midX, y1: bottom, x2: midX, y2: top });
  } else {
    pushRectangle(path, left, bottom, width, height);
  }
  return true;
}
3086
/**
 * Finishes the path collected in the enclosing scope's `currentPath` and
 * resets it to an empty array.
 *
 * - Neither stroked nor filled: the path is discarded.
 * - Fill-only paths with at least 3 segments are first offered to
 *   `tryConvertLinesToRectangle`.
 * - Stroke-only paths whose effective width (`lineWidth` times the CTM scale)
 *   exceeds MAX_LINE_WIDTH are discarded.
 * - Otherwise every segment is mapped through `ctm` and handed to
 *   `classifyAndAdd` together with the effective width.
 *
 * @param {boolean} isStroke - Whether the paint operator strokes the path.
 * @param {boolean} isFill - Whether the paint operator fills the path.
 */
function flushPath(isStroke, isFill) {
  const isPainted = isStroke || isFill;
  if (isPainted) {
    const fillOnly = isFill && !isStroke;
    if (fillOnly && currentPath.length >= 3) {
      tryConvertLinesToRectangle(currentPath);
    }

    const effectiveWidth = lineWidth * matScale(ctm);
    const strokeOnly = isStroke && !isFill;
    const tooThick = strokeOnly && effectiveWidth > MAX_LINE_WIDTH;
    if (!tooThick) {
      currentPath.forEach((segment) => {
        const [startX, startY] = matTransformPoint(ctm, segment.x1, segment.y1);
        const [endX, endY] = matTransformPoint(ctm, segment.x2, segment.y2);
        const mapped = { x1: startX, y1: startY, x2: endX, y2: endY };
        classifyAndAdd(mapped, effectiveWidth, horizontals, verticals);
      });
    }
  }
  currentPath = [];
}
@@ -2974,9 +3113,28 @@ function extractLines(fnArray, argsArray) {
2974
3113
  const op = fnArray[i];
2975
3114
  const args = argsArray[i];
2976
3115
  switch (op) {
3116
+ // ── Graphics State ──
3117
+ case import_pdf.OPS.save:
3118
+ stateStack.push({ ctm: [...ctm], lineWidth });
3119
+ break;
3120
+ case import_pdf.OPS.restore:
3121
+ if (stateStack.length > 0) {
3122
+ const state = stateStack.pop();
3123
+ ctm = state.ctm;
3124
+ lineWidth = state.lineWidth;
3125
+ }
3126
+ break;
3127
+ case import_pdf.OPS.transform: {
3128
+ const m = args;
3129
+ if (m.length >= 6) {
3130
+ ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
3131
+ }
3132
+ break;
3133
+ }
2977
3134
  case import_pdf.OPS.setLineWidth:
2978
3135
  lineWidth = args[0] || 1;
2979
3136
  break;
3137
+ // ── Path Construction ──
2980
3138
  case import_pdf.OPS.constructPath: {
2981
3139
  const arg0 = args[0];
2982
3140
  if (Array.isArray(arg0)) {
@@ -3044,34 +3202,60 @@ function extractLines(fnArray, argsArray) {
3044
3202
  }
3045
3203
  }
3046
3204
  }
3047
- if (afterOp === import_pdf.OPS.stroke || afterOp === import_pdf.OPS.closeStroke) {
3048
- flushPath(true);
3049
- } else if (afterOp === import_pdf.OPS.fill || afterOp === import_pdf.OPS.eoFill || afterOp === import_pdf.OPS.fillStroke || afterOp === import_pdf.OPS.eoFillStroke || afterOp === import_pdf.OPS.closeFillStroke || afterOp === import_pdf.OPS.closeEOFillStroke) {
3050
- flushPath(true);
3205
+ const isStroke5 = afterOp === import_pdf.OPS.stroke || afterOp === import_pdf.OPS.closeStroke;
3206
+ const isFill5 = afterOp === import_pdf.OPS.fill || afterOp === import_pdf.OPS.eoFill;
3207
+ const isBoth5 = afterOp === import_pdf.OPS.fillStroke || afterOp === import_pdf.OPS.eoFillStroke || afterOp === import_pdf.OPS.closeFillStroke || afterOp === import_pdf.OPS.closeEOFillStroke;
3208
+ if (isStroke5 || isFill5 || isBoth5) {
3209
+ flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
3051
3210
  } else if (afterOp === import_pdf.OPS.endPath) {
3052
- flushPath(false);
3211
+ flushPath(false, false);
3053
3212
  }
3054
3213
  }
3055
3214
  break;
3056
3215
  }
3216
+ // ── Paint Operations ──
3057
3217
  case import_pdf.OPS.stroke:
3058
3218
  case import_pdf.OPS.closeStroke:
3059
- flushPath(true);
3219
+ flushPath(true, false);
3060
3220
  break;
3061
3221
  case import_pdf.OPS.fill:
3062
3222
  case import_pdf.OPS.eoFill:
3223
+ flushPath(false, true);
3224
+ break;
3063
3225
  case import_pdf.OPS.fillStroke:
3064
3226
  case import_pdf.OPS.eoFillStroke:
3065
3227
  case import_pdf.OPS.closeFillStroke:
3066
3228
  case import_pdf.OPS.closeEOFillStroke:
3067
- flushPath(true);
3229
+ flushPath(true, true);
3068
3230
  break;
3069
3231
  case import_pdf.OPS.endPath:
3070
- flushPath(false);
3232
+ flushPath(false, false);
3071
3233
  break;
3072
3234
  }
3073
3235
  }
3074
- return { horizontals, verticals };
3236
+ return {
3237
+ horizontals: deduplicateLines(horizontals),
3238
+ verticals: deduplicateLines(verticals)
3239
+ };
3240
+ }
3241
/**
 * Removes lines whose four coordinates all lie within COORD_MERGE_TOL of an
 * already-kept line. The first occurrence is kept, and its `lineWidth` is
 * raised to the duplicate's when the duplicate is wider.
 *
 * @param {Array<{x1:number,y1:number,x2:number,y2:number,lineWidth:number}>} lines
 * @returns {Array} The input array itself when it has at most one entry,
 *   otherwise a new array of the kept line objects (kept objects may be mutated).
 */
function deduplicateLines(lines) {
  if (lines.length <= 1) return lines;

  const isClose = (p, q) => Math.abs(p - q) <= COORD_MERGE_TOL;
  const isSameLine = (candidate, kept) =>
    isClose(candidate.y1, kept.y1) &&
    isClose(candidate.y2, kept.y2) &&
    isClose(candidate.x1, kept.x1) &&
    isClose(candidate.x2, kept.x2);

  const unique = [];
  for (const candidate of lines) {
    const twin = unique.find((kept) => isSameLine(candidate, kept));
    if (twin === undefined) {
      unique.push(candidate);
    } else if (candidate.lineWidth > twin.lineWidth) {
      twin.lineWidth = candidate.lineWidth;
    }
  }
  return unique;
}
3076
3260
  function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
3077
3261
  const dx = Math.abs(seg.x2 - seg.x1);
@@ -3667,6 +3851,7 @@ async function parsePdfDocument(buffer, options) {
3667
3851
  const medianFontSize = computeMedianFontSize(allFontSizes);
3668
3852
  if (medianFontSize > 0) {
3669
3853
  detectHeadings(blocks, medianFontSize);
3854
+ mergeAdjacentHeadings(blocks);
3670
3855
  }
3671
3856
  detectMarkerHeadings(blocks);
3672
3857
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
@@ -3741,6 +3926,46 @@ function detectHeadings(blocks, medianFontSize) {
3741
3926
  }
3742
3927
  }
3743
3928
  }
3929
/**
 * Merges neighbouring heading blocks that sit on the same page, at the same
 * level, and whose reference y positions differ by less than 1.5 times the
 * larger font size (12 when a block has no font size). `blocks` is edited in
 * place.
 *
 * The merged text puts the block with the smaller `bbox.x` first, joined by a
 * single space. The surviving block's bbox becomes the horizontal union of
 * both boxes, with the smaller y and the larger height. Blocks lacking `bbox`
 * or `text` are never merged.
 *
 * @param {Array<object>} blocks - Block list; merged-away entries are removed.
 * @returns {void}
 */
function mergeAdjacentHeadings(blocks) {
  const isCandidate = (block) => block.type === "heading";
  const hasGeometryAndText = (block) => Boolean(block.bbox) && Boolean(block.text);
  const referenceY = (block) => block.bbox.y + (block.style?.fontSize || block.bbox.height);
  const fontSizeOf = (block) => block.style?.fontSize || 12;

  let index = 0;
  while (index < blocks.length - 1) {
    const head = blocks[index];
    const follower = blocks[index + 1];

    const comparable =
      isCandidate(head) &&
      isCandidate(follower) &&
      hasGeometryAndText(head) &&
      hasGeometryAndText(follower);
    if (!comparable) {
      index += 1;
      continue;
    }

    const verticalGap = Math.abs(referenceY(head) - referenceY(follower));
    const tolerance = Math.max(fontSizeOf(head), fontSizeOf(follower)) * 1.5;
    const onSameLine = head.bbox.page === follower.bbox.page && verticalGap < tolerance;
    if (!(onSameLine && head.level === follower.level)) {
      index += 1;
      continue;
    }

    // Keep reading order: whichever block starts further left comes first.
    head.text = head.bbox.x <= follower.bbox.x
      ? head.text + " " + follower.text
      : follower.text + " " + head.text;

    const left = Math.min(head.bbox.x, follower.bbox.x);
    const right = Math.max(head.bbox.x + head.bbox.width, follower.bbox.x + follower.bbox.width);
    head.bbox = {
      page: head.bbox.page,
      x: left,
      y: Math.min(head.bbox.y, follower.bbox.y),
      width: right - left,
      height: Math.max(head.bbox.height, follower.bbox.height)
    };

    // Stay on the same index so the merged block can absorb the next neighbour too.
    blocks.splice(index + 1, 1);
  }
}
3744
3969
  function collapseEvenSpacing(text) {
3745
3970
  const tokens = text.split(" ");
3746
3971
  const singleCharCount = tokens.filter((t) => t.length === 1).length;
@@ -3749,6 +3974,169 @@ function collapseEvenSpacing(text) {
3749
3974
  }
3750
3975
  return text;
3751
3976
  }
3977
/**
 * Builds paragraph blocks for a page by ordering its text items with
 * `xyCutOrder` and turning each y-grouped line into one paragraph.
 *
 * The gap threshold passed to `xyCutOrder` is 3% of the spread between the
 * smallest and largest item `y`, but never less than 15.
 *
 * @param {Array<object>} items - Text items; each must expose a numeric `y`.
 * @param {number} pageNum - Page number stored on every produced block.
 * @returns {Array<object>|null} Paragraph blocks (`text`, `pageNumber`, `bbox`,
 *   `style`), or `null` when no line yields non-blank text.
 */
function buildXyCutBlocks(items, pageNum) {
  const ys = items.map((item) => item.y);
  const verticalSpread = Math.max(...ys) - Math.min(...ys);
  const gapThreshold = Math.max(15, verticalSpread * 0.03);

  const paragraphs = [];
  for (const group of xyCutOrder(items, gapThreshold)) {
    if (group.length === 0) continue;
    for (const line of groupByY(group)) {
      const text = mergeLineSimple(line);
      if (text.trim()) {
        paragraphs.push({
          type: "paragraph",
          text,
          pageNumber: pageNum,
          bbox: computeBBox(line, pageNum),
          style: dominantStyle(line)
        });
      }
    }
  }

  return paragraphs.length > 0 ? paragraphs : null;
}
4000
/**
 * Re-derives the structure of a grid table that captured too little of its
 * content, returning replacement blocks or `null` to keep the table as is.
 *
 * A table counts as under-segmented when it is 1x1, or when at most two cells
 * hold text and either those cells span 8+ text lines or 6+ text items belong
 * to the table. For such tables the strategies are tried in order:
 *   1. `buildXyCutBlocks` when `hasMultiColumnLayout(items)` is true
 *      (its result is returned even if it is `null`);
 *   2. `buildTableFromTextLayout`;
 *   3. `detectClusterTables`, with leftover non-blank items emitted as
 *      paragraphs and all blocks sorted by descending `bbox.y + bbox.height`.
 *
 * @param {object} table - IR table with `rows`, `cols`, `cells[r][c].text`.
 * @param {Array<object>} items - Text items located inside the table area.
 * @param {number} pageNum - Page number for produced blocks.
 * @param {object} bbox - Bounding box forwarded to `buildTableFromTextLayout`.
 * @returns {Array<object>|null} Replacement blocks, or `null`.
 */
function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
  let filledCells = 0;
  let textLines = 0;
  for (const row of table.cells) {
    for (const cell of row) {
      if (!cell.text.trim()) continue;
      filledCells += 1;
      textLines += cell.text.split("\n").length;
    }
  }

  const isSingleCell = table.rows === 1 && table.cols === 1;
  const underSegmented =
    isSingleCell || (filledCells <= 2 && (textLines >= 8 || items.length >= 6));
  if (!underSegmented) return null;

  if (hasMultiColumnLayout(items)) {
    return buildXyCutBlocks(items, pageNum);
  }

  const rebuilt = buildTableFromTextLayout(items, pageNum, bbox);
  if (rebuilt) return rebuilt;

  // Cluster detection works on plain copies of the items.
  const clusterItems = items.map(({ text, x, y, w, h, fontSize, fontName }) => ({
    text,
    x,
    y,
    w,
    h,
    fontSize,
    fontName
  }));
  const clusterResults = detectClusterTables(clusterItems, pageNum);
  if (!(clusterResults.length > 0)) return null;

  // Map each copy back to its position so consumed originals can be skipped.
  const positionOf = new Map(clusterItems.map((copy, position) => [copy, position]));
  const consumed = new Set();
  const produced = [];
  for (const result of clusterResults) {
    for (const used of result.usedItems) {
      const position = positionOf.get(used);
      if (position !== undefined) consumed.add(position);
    }
    produced.push({ type: "table", table: result.table, pageNumber: pageNum, bbox: result.bbox });
  }

  items.forEach((item, position) => {
    if (consumed.has(position)) return;
    if (!item.text.trim()) return;
    produced.push({
      type: "paragraph",
      text: item.text,
      pageNumber: pageNum,
      bbox: computeBBox([item], pageNum),
      style: { fontSize: item.fontSize, fontName: item.fontName }
    });
  });

  const upperEdge = (block) => (block.bbox ? block.bbox.y + block.bbox.height : 0);
  produced.sort((first, second) => upperEdge(second) - upperEdge(first));
  return produced.length > 0 ? produced : null;
}
4050
/**
 * Reconstructs a table purely from the positions of loose text items.
 *
 * Steps:
 *   1. Sort items by descending `y` (then ascending `x`) and group them into
 *      rows; an item joins the current row when its `y` is within 3 units of
 *      the row's first item.
 *   2. In every row with 2+ items, record the midpoint of each horizontal gap
 *      that is at least 1.5 times the row's average font size.
 *   3. Cluster those midpoints (15-unit tolerance against the running mean);
 *      each cluster seen at least twice becomes a column boundary.
 *   4. Assign items to columns by their horizontal centre. A row whose first
 *      column is empty is treated as a continuation and merged into the
 *      previous row.
 *   5. Reject the result when fewer than 2 rows remain or fewer than 30% of
 *      the cells hold text.
 *
 * Leading bullet marks (•, ○, ·, -) are stripped from cells outside the first
 * column. A hyphen directly followed by a digit (or ".digit") is kept, because
 * that is a negative number rather than a bullet.
 *
 * @param {Array<{text:string,x:number,y:number,w:number,fontSize:number}>} items
 * @param {number} pageNum - Page number stored on the produced block.
 * @param {object} bbox - Bounding box stored on the produced block as-is.
 * @returns {Array<object>|null} A one-element array holding the table block,
 *   or `null` when no convincing table layout is found.
 */
function buildTableFromTextLayout(items, pageNum, bbox) {
  if (items.length < 4) return null;

  // Bullet glyphs with optional trailing whitespace; "-" only counts as a
  // bullet when it does not start a number such as "-5" or "-.5".
  const LEADING_BULLET = /^(?:[•○·]|-(?!\d|\.\d))\s*/;

  // 1. Row grouping.
  const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
  const yTol = 3;
  const rows = [];
  let curRow = [sorted[0]];
  let curY = sorted[0].y;
  for (let i = 1; i < sorted.length; i++) {
    if (Math.abs(sorted[i].y - curY) <= yTol) {
      curRow.push(sorted[i]);
    } else {
      rows.push(curRow);
      curRow = [sorted[i]];
      curY = sorted[i].y;
    }
  }
  rows.push(curRow);
  if (rows.length < 2) return null;

  // 2. Candidate column gaps.
  const gapPositions = [];
  for (const row of rows) {
    if (row.length < 2) continue;
    const sortedX = [...row].sort((a, b) => a.x - b.x);
    const avgFs = sortedX.reduce((s, it) => s + it.fontSize, 0) / sortedX.length;
    for (let j = 1; j < sortedX.length; j++) {
      const prevRight = sortedX[j - 1].x + sortedX[j - 1].w;
      const gap = sortedX[j].x - prevRight;
      if (gap >= avgFs * 1.5) {
        gapPositions.push(prevRight + gap / 2);
      }
    }
  }
  if (gapPositions.length < 2) return null;

  // 3. Cluster gap midpoints into column boundaries.
  gapPositions.sort((a, b) => a - b);
  const colBoundaries = [];
  let clusterSum = gapPositions[0];
  let clusterCount = 1;
  for (let i = 1; i < gapPositions.length; i++) {
    const avg = clusterSum / clusterCount;
    if (Math.abs(gapPositions[i] - avg) <= 15) {
      clusterSum += gapPositions[i];
      clusterCount++;
    } else {
      // A gap seen in only one row is not trusted as a column boundary.
      if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
      clusterSum = gapPositions[i];
      clusterCount = 1;
    }
  }
  if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
  if (colBoundaries.length === 0) return null;

  // 4. Fill the grid.
  const numCols = colBoundaries.length + 1;
  const tableRows = [];
  for (const row of rows) {
    const cells = Array(numCols).fill("");
    const sortedX = [...row].sort((a, b) => a.x - b.x);
    for (const item of sortedX) {
      const cx = item.x + item.w / 2;
      let col = 0;
      for (let b = 0; b < colBoundaries.length; b++) {
        if (cx > colBoundaries[b]) col = b + 1;
      }
      cells[col] = cells[col] ? cells[col] + " " + item.text : item.text;
    }
    if (cells[0].trim() === "" && tableRows.length > 0) {
      // Continuation line: append its text to the matching cells of the previous row.
      const prevCells = tableRows[tableRows.length - 1].cells;
      for (let c = 0; c < numCols; c++) {
        if (cells[c].trim()) {
          prevCells[c] = prevCells[c] ? prevCells[c] + " " + cells[c].trim() : cells[c].trim();
        }
      }
    } else {
      tableRows.push({ cells });
    }
  }

  // 5. Sanity checks.
  if (tableRows.length < 2) return null;
  const nonEmptyCount = tableRows.reduce((sum, r) => sum + r.cells.filter((c) => c.trim()).length, 0);
  const totalCount = tableRows.length * numCols;
  if (nonEmptyCount < totalCount * 0.3) return null;

  const irCells = tableRows.map(
    (r) => r.cells.map((text, colIdx) => {
      let cleaned = text.trim();
      if (colIdx > 0) cleaned = cleaned.replace(LEADING_BULLET, "");
      return { text: cleaned, colSpan: 1, rowSpan: 1 };
    })
  );
  const irTable = {
    rows: tableRows.length,
    cols: numCols,
    cells: irCells,
    hasHeader: tableRows.length > 1
  };
  return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
}
3752
4140
  function shouldDemoteTable(table) {
3753
4141
  const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
3754
4142
  const allText = allCells.join(" ");
@@ -3795,6 +4183,32 @@ function detectMarkerHeadings(blocks) {
3795
4183
  }
3796
4184
  }
3797
4185
  }
4186
+ function hasMultiColumnLayout(items) {
4187
+ if (items.length < 30) return false;
4188
+ const sorted = [...items].sort((a, b) => a.x - b.x);
4189
+ const minX = sorted[0].x;
4190
+ let maxX = minX;
4191
+ for (const i of sorted) if (i.x + i.w > maxX) maxX = i.x + i.w;
4192
+ const pageWidth = maxX - minX;
4193
+ if (pageWidth < 200) return false;
4194
+ let bestGap = 0;
4195
+ let bestSplit = 0;
4196
+ for (let j = 1; j < sorted.length; j++) {
4197
+ const gap = sorted[j].x - (sorted[j - 1].x + sorted[j - 1].w);
4198
+ if (gap > bestGap) {
4199
+ bestGap = gap;
4200
+ bestSplit = (sorted[j - 1].x + sorted[j - 1].w + sorted[j].x) / 2;
4201
+ }
4202
+ }
4203
+ if (bestGap < 20) return false;
4204
+ const splitRatio = (bestSplit - minX) / pageWidth;
4205
+ if (splitRatio < 0.35 || splitRatio > 0.65) return false;
4206
+ const leftCount = items.filter((i) => i.x + i.w / 2 < bestSplit).length;
4207
+ const rightCount = items.filter((i) => i.x + i.w / 2 >= bestSplit).length;
4208
+ if (leftCount < 15 || rightCount < 15) return false;
4209
+ if (Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount) < 0.35) return false;
4210
+ return true;
4211
+ }
3798
4212
  var MAX_XYCUT_DEPTH = 50;
3799
4213
  function xyCutOrder(items, gapThreshold, depth = 0) {
3800
4214
  if (items.length === 0) return [];
@@ -3925,6 +4339,11 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
3925
4339
  width: grid.bbox.x2 - grid.bbox.x1,
3926
4340
  height: grid.bbox.y2 - grid.bbox.y1
3927
4341
  };
4342
+ const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
4343
+ if (normalized) {
4344
+ blocks.push(...normalized);
4345
+ continue;
4346
+ }
3928
4347
  if (shouldDemoteTable(irTable)) {
3929
4348
  const demoted = demoteTableToText(irTable);
3930
4349
  if (demoted) {
@@ -3970,6 +4389,10 @@ function mergeAdjacentTableBlocks(blocks) {
3970
4389
  }
3971
4390
  function extractPageBlocksFallback(items, pageNum) {
3972
4391
  if (items.length === 0) return [];
4392
+ if (hasMultiColumnLayout(items)) {
4393
+ const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
4394
+ return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
4395
+ }
3973
4396
  const blocks = [];
3974
4397
  const allYLines = groupByY(items);
3975
4398
  const columns = detectColumns(allYLines);
@@ -3987,7 +4410,7 @@ function extractPageBlocksFallback(items, pageNum) {
3987
4410
  fontSize: i.fontSize,
3988
4411
  fontName: i.fontName
3989
4412
  }));
3990
- const clusterResults = detectClusterTables(clusterItems, pageNum);
4413
+ const clusterResults = hasMultiColumnLayout(items) ? [] : detectClusterTables(clusterItems, pageNum);
3991
4414
  if (clusterResults.length > 0) {
3992
4415
  const ciToIdx = /* @__PURE__ */ new Map();
3993
4416
  for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);