kordoc 2.0.2 → 2.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -6,7 +6,7 @@ import {
6
6
  precheckZipSize,
7
7
  sanitizeHref,
8
8
  toArrayBuffer
9
- } from "./chunk-EVWOJ4T5.js";
9
+ } from "./chunk-PKIJLEV6.js";
10
10
  import {
11
11
  parsePageRange
12
12
  } from "./chunk-MOL7MDBG.js";
@@ -163,6 +163,47 @@ function sanitizeText(text) {
163
163
  }
164
164
  return result;
165
165
  }
166
/**
 * Replaces tables that look like page-layout containers with plain paragraphs.
 *
 * A table block is flattened when it has at most 3 rows, is not a 1x1 table,
 * and either
 *   - its cells contain more than 5 newline characters in total, or
 *   - it has at most 2 rows and more than 300 characters of cell text.
 * Every non-blank line of every cell then becomes its own paragraph block that
 * inherits the table's pageNumber. All other blocks are passed through as-is.
 *
 * @param {Array<object>} blocks - Blocks to scan; the array is not mutated.
 * @returns {Array<object>} New array with layout tables expanded into paragraphs.
 */
function flattenLayoutTables(blocks) {
  const result = [];
  for (const block of blocks) {
    if (block.type !== "table" || !block.table) {
      result.push(block);
      continue;
    }
    const { rows: numRows, cols: numCols, cells } = block.table;
    // A table without a cell grid cannot be inspected; keep it untouched
    // instead of throwing while indexing into `cells`.
    if (!Array.isArray(cells)) {
      result.push(block);
      continue;
    }
    const isSingleCell = numRows === 1 && numCols === 1;
    if (isSingleCell || !(numRows <= 3)) {
      result.push(block);
      continue;
    }

    // Collect the text of every grid position once (row-major order).
    const cellTexts = [];
    for (let r = 0; r < numRows; r++) {
      for (let c = 0; c < numCols; c++) {
        cellTexts.push(cells[r]?.[c]?.text || "");
      }
    }
    let totalNewlines = 0;
    let totalTextLen = 0;
    for (const t of cellTexts) {
      totalNewlines += t.split("\n").length - 1;
      totalTextLen += t.length;
    }

    const looksLikeLayout = totalNewlines > 5 || (numRows <= 2 && totalTextLen > 300);
    if (!looksLikeLayout) {
      result.push(block);
      continue;
    }

    for (const t of cellTexts) {
      for (const line of t.trim().split("\n")) {
        const trimmed = line.trim();
        if (!trimmed) continue;
        result.push({ type: "paragraph", text: trimmed, pageNumber: block.pageNumber });
      }
    }
  }
  return result;
}
166
207
  function blocksToMarkdown(blocks) {
167
208
  const lines = [];
168
209
  for (let i = 0; i < blocks.length; i++) {
@@ -263,6 +304,9 @@ function tableToMarkdown(table) {
263
304
  if (dr === 0 && dc === 0) continue;
264
305
  if (r + dr < numRows && c + dc < numCols) {
265
306
  skip.add(`${r + dr},${c + dc}`);
307
+ if (dr === 0) {
308
+ display[r][c + dc] = cell.text.replace(/\n/g, "<br>");
309
+ }
266
310
  }
267
311
  }
268
312
  }
@@ -362,7 +406,12 @@ function parseCharProperties(doc, map) {
362
406
  if (!id) continue;
363
407
  const prop = {};
364
408
  const height = el.getAttribute("height");
365
- if (height) prop.fontSize = parseInt(height, 10) / 100;
409
+ if (height) {
410
+ const parsedHeight = parseInt(height, 10);
411
+ if (!isNaN(parsedHeight) && parsedHeight > 0) {
412
+ prop.fontSize = parsedHeight / 100;
413
+ }
414
+ }
366
415
  const bold = el.getAttribute("bold");
367
416
  if (bold === "true" || bold === "1") prop.bold = true;
368
417
  const italic = el.getAttribute("italic");
@@ -502,7 +551,7 @@ async function extractImagesFromZip(zip, blocks, decompressed, warnings) {
502
551
  const data = await file.async("uint8array");
503
552
  decompressed.total += data.length;
504
553
  if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
505
- const ext = ref.includes(".") ? ref.split(".").pop() : "png";
554
+ const ext = ref.includes(".") ? ref.split(".").pop() || "png" : "png";
506
555
  const mimeType = imageExtToMime(ext);
507
556
  imageIndex++;
508
557
  const filename = `image_${String(imageIndex).padStart(3, "0")}.${mimeToExt(mimeType)}`;
@@ -809,8 +858,10 @@ function walkSection(node, blocks, tableCtx, tableStack, styleMap, warnings, sec
809
858
  break;
810
859
  case "cellSpan":
811
860
  if (tableCtx?.cell) {
812
- const cs = parseInt(el.getAttribute("colSpan") || "1", 10);
813
- const rs = parseInt(el.getAttribute("rowSpan") || "1", 10);
861
+ const rawCs = parseInt(el.getAttribute("colSpan") || "1", 10);
862
+ const cs = isNaN(rawCs) ? 1 : rawCs;
863
+ const rawRs = parseInt(el.getAttribute("rowSpan") || "1", 10);
864
+ const rs = isNaN(rawRs) ? 1 : rawRs;
814
865
  tableCtx.cell.colSpan = clampSpan(cs, MAX_COLS);
815
866
  tableCtx.cell.rowSpan = clampSpan(rs, MAX_ROWS);
816
867
  }
@@ -902,6 +953,8 @@ function walkParagraphChildren(node, blocks, tableCtx, tableStack, styleMap, war
902
953
  extractDrawTextBlocks(el, blocks, styleMap, sectionNum);
903
954
  } else if (localTag === "r" || localTag === "run" || localTag === "ctrl" || localTag === "rect" || localTag === "ellipse" || localTag === "polygon" || localTag === "line" || localTag === "arc" || localTag === "curve" || localTag === "connectLine" || localTag === "container") {
904
955
  walkChildren(el, d + 1);
956
+ } else if (localTag === "run") {
957
+ tableCtx = walkParagraphChildren(el, blocks, tableCtx, tableStack, styleMap, warnings, sectionNum, depth + 1);
905
958
  }
906
959
  }
907
960
  };
@@ -1078,8 +1131,9 @@ var TAG_CHAR_SHAPE = 68;
1078
1131
  var TAG_CTRL_HEADER = 71;
1079
1132
  var TAG_LIST_HEADER = 72;
1080
1133
  var TAG_TABLE = 77;
1081
- var TAG_DOC_CHAR_SHAPE = 55;
1082
- var TAG_DOC_STYLE = 58;
1134
+ var TAG_DOC_CHAR_SHAPE = 21;
1135
+ var TAG_DOC_PARA_SHAPE = 25;
1136
+ var TAG_DOC_STYLE = 26;
1083
1137
  var CHAR_LINE = 0;
1084
1138
  var CHAR_SECTION_BREAK = 10;
1085
1139
  var CHAR_PARA = 13;
@@ -1135,8 +1189,14 @@ function parseFileHeader(data) {
1135
1189
  }
1136
1190
  function parseDocInfo(records) {
1137
1191
  const charShapes = [];
1192
+ const paraShapes = [];
1138
1193
  const styles = [];
1139
1194
  for (const rec of records) {
1195
+ if (rec.tagId === TAG_DOC_PARA_SHAPE && rec.data.length >= 4) {
1196
+ const flags = rec.data.readUInt32LE(0);
1197
+ const outlineLevel = flags >> 25 & 7;
1198
+ paraShapes.push({ outlineLevel });
1199
+ }
1140
1200
  if (rec.tagId === TAG_DOC_CHAR_SHAPE && rec.data.length >= 18) {
1141
1201
  if (rec.data.length >= 50) {
1142
1202
  const fontSize = rec.data.readUInt32LE(42);
@@ -1176,7 +1236,7 @@ function parseDocInfo(records) {
1176
1236
  }
1177
1237
  }
1178
1238
  }
1179
- return { charShapes, styles };
1239
+ return { charShapes, paraShapes, styles };
1180
1240
  }
1181
1241
  function extractText(data) {
1182
1242
  let result = "";
@@ -2186,12 +2246,13 @@ function parseHwp5Document(buffer, options) {
2186
2246
  }
2187
2247
  }
2188
2248
  const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
2249
+ const flatBlocks = flattenLayoutTables(blocks);
2189
2250
  if (docInfo) {
2190
- detectHwp5Headings(blocks, docInfo);
2251
+ detectHwp5Headings(flatBlocks, docInfo);
2191
2252
  }
2192
- const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
2193
- const markdown = blocksToMarkdown(blocks);
2194
- return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
2253
+ const outline = flatBlocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
2254
+ const markdown = blocksToMarkdown(flatBlocks);
2255
+ return { markdown, blocks: flatBlocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
2195
2256
  }
2196
2257
  function parseDocInfoStream(cfb, compressed) {
2197
2258
  try {
@@ -2242,16 +2303,21 @@ function detectHwp5Headings(blocks, docInfo) {
2242
2303
  }
2243
2304
  if (baseFontSize <= 0) return;
2244
2305
  for (const block of blocks) {
2245
- if (block.type !== "paragraph" || !block.text || !block.style?.fontSize) continue;
2306
+ if (block.type === "heading") continue;
2307
+ if (block.type !== "paragraph" || !block.text) continue;
2246
2308
  const text = block.text.trim();
2247
2309
  if (text.length === 0 || text.length > 200) continue;
2248
2310
  if (/^\d+$/.test(text)) continue;
2249
- const ratio = block.style.fontSize / baseFontSize;
2250
2311
  let level = 0;
2251
- if (ratio >= HEADING_RATIO_H1) level = 1;
2252
- else if (ratio >= HEADING_RATIO_H2) level = 2;
2253
- else if (ratio >= HEADING_RATIO_H3) level = 3;
2254
- if (/^제\d+[조장절편]/.test(text) && text.length <= 50) {
2312
+ if (block.style?.fontSize && baseFontSize > 0) {
2313
+ const ratio = block.style.fontSize / baseFontSize;
2314
+ if (ratio >= HEADING_RATIO_H1) level = 1;
2315
+ else if (ratio >= HEADING_RATIO_H2) level = 2;
2316
+ else if (ratio >= HEADING_RATIO_H3) level = 3;
2317
+ }
2318
+ if (/^제\d+[장절편]\s/.test(text) && text.length <= 50) {
2319
+ if (level === 0) level = 2;
2320
+ } else if (/^제\d+(조의?\d*)\s*[\((]/.test(text) && text.length <= 80) {
2255
2321
  if (level === 0) level = 3;
2256
2322
  }
2257
2323
  if (level > 0) {
@@ -2497,13 +2563,20 @@ function parseSection(records, docInfo, warnings, sectionNum) {
2497
2563
  while (i < records.length) {
2498
2564
  const rec = records[i];
2499
2565
  if (rec.tagId === TAG_PARA_HEADER && rec.level === 0) {
2500
- const { paragraph, tables, nextIdx, charShapeIds } = parseParagraphWithTables(records, i);
2566
+ const { paragraph, tables, nextIdx, charShapeIds, paraShapeId } = parseParagraphWithTables(records, i);
2501
2567
  if (paragraph) {
2502
2568
  const block = { type: "paragraph", text: paragraph, pageNumber: sectionNum };
2503
2569
  if (docInfo && charShapeIds.length > 0) {
2504
2570
  const style = resolveCharStyle(charShapeIds, docInfo);
2505
2571
  if (style) block.style = style;
2506
2572
  }
2573
+ if (docInfo && paraShapeId >= 0 && paraShapeId < docInfo.paraShapes.length) {
2574
+ const ol = docInfo.paraShapes[paraShapeId].outlineLevel;
2575
+ if (ol >= 1 && ol <= 6) {
2576
+ block.type = "heading";
2577
+ block.level = ol;
2578
+ }
2579
+ }
2507
2580
  blocks.push(block);
2508
2581
  }
2509
2582
  for (const t of tables) blocks.push({ type: "table", table: t, pageNumber: sectionNum });
@@ -2623,6 +2696,8 @@ function parseParagraphWithTables(records, startIdx) {
2623
2696
  let text = "";
2624
2697
  const tables = [];
2625
2698
  const charShapeIds = [];
2699
+ const paraHeaderData = records[startIdx].data;
2700
+ const paraShapeId = paraHeaderData.length >= 10 ? paraHeaderData.readUInt16LE(8) : -1;
2626
2701
  let i = startIdx + 1;
2627
2702
  while (i < records.length) {
2628
2703
  const rec = records[i];
@@ -2647,7 +2722,7 @@ function parseParagraphWithTables(records, startIdx) {
2647
2722
  i++;
2648
2723
  }
2649
2724
  const trimmed = text.trim();
2650
- return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds };
2725
+ return { paragraph: trimmed || null, tables, nextIdx: i, charShapeIds, paraShapeId };
2651
2726
  }
2652
2727
  function parseTableBlock(records, startIdx) {
2653
2728
  const tableLevel = records[startIdx].level;
@@ -2757,10 +2832,33 @@ var MIN_LINE_LENGTH = 10;
2757
2832
  var COORD_MERGE_TOL = 3;
2758
2833
  var CONNECT_TOL = 5;
2759
2834
  var CELL_PADDING = 2;
2835
+ var MAX_LINE_WIDTH = 5;
2836
+ var IDENTITY = [1, 0, 0, 1, 0, 0];
2837
/**
 * Composes two 2D affine transforms given as 6-element arrays
 * [a, b, c, d, e, f].
 *
 * @param {number[]} m1 - Left-hand matrix.
 * @param {number[]} m2 - Right-hand matrix.
 * @returns {number[]} New 6-element array holding the product.
 */
function matMultiply(m1, m2) {
  const a1 = m1[0], b1 = m1[1], c1 = m1[2], d1 = m1[3], e1 = m1[4], f1 = m1[5];
  const a2 = m2[0], b2 = m2[1], c2 = m2[2], d2 = m2[3], e2 = m2[4], f2 = m2[5];
  const product = [];
  product.push(a1 * a2 + c1 * b2);
  product.push(b1 * a2 + d1 * b2);
  product.push(a1 * c2 + c1 * d2);
  product.push(b1 * c2 + d1 * d2);
  product.push(a1 * e2 + c1 * f2 + e1);
  product.push(b1 * e2 + d1 * f2 + f1);
  return product;
}
2847
/**
 * Applies a 6-element affine matrix [a, b, c, d, e, f] to the point (x, y).
 *
 * @param {number[]} m - Transform matrix.
 * @param {number} x - Point x coordinate.
 * @param {number} y - Point y coordinate.
 * @returns {number[]} The transformed [x, y] pair.
 */
function matTransformPoint(m, x, y) {
  const outX = m[0] * x + m[2] * y + m[4];
  const outY = m[1] * x + m[3] * y + m[5];
  return [outX, outY];
}
2850
/**
 * Returns a single scale factor for a 6-element affine matrix: the larger of
 * sqrt(m[1]^2 + m[3]^2) and sqrt(m[0]^2 + m[2]^2). The translation entries
 * m[4] and m[5] are not read.
 *
 * @param {number[]} m - Transform matrix.
 * @returns {number} The larger of the two magnitudes.
 */
function matScale(m) {
  const magnitudeBD = Math.sqrt(m[1] * m[1] + m[3] * m[3]);
  const magnitudeAC = Math.sqrt(m[0] * m[0] + m[2] * m[2]);
  return Math.max(magnitudeBD, magnitudeAC);
}
2760
2856
  function extractLines(fnArray, argsArray) {
2761
2857
  const horizontals = [];
2762
2858
  const verticals = [];
2859
+ let ctm = [...IDENTITY];
2763
2860
  let lineWidth = 1;
2861
+ const stateStack = [];
2764
2862
  let currentPath = [];
2765
2863
  let pathStartX = 0, pathStartY = 0;
2766
2864
  let curX = 0, curY = 0;
@@ -2778,13 +2876,53 @@ function extractLines(fnArray, argsArray) {
2778
2876
  );
2779
2877
  }
2780
2878
  }
2781
- function flushPath(isStroke) {
2782
- if (!isStroke) {
2879
/**
 * Rewrites, in place, a closed path of 3 to 5 segments into either a single
 * centre line or a rectangle derived from the path's bounding box.
 *
 * The path is left untouched (and false is returned) when it has the wrong
 * number of segments, when its last end point is not within 1 unit of its
 * first start point, or when both bounding-box sides are shorter than
 * MIN_LINE_LENGTH.
 *
 * @param {Array<{x1:number,y1:number,x2:number,y2:number}>} path - Segments; mutated on success.
 * @returns {boolean} true when the path was replaced, false otherwise.
 */
function tryConvertLinesToRectangle(path) {
  const count = path.length;
  if (count < 3 || count > 5) return false;

  const head = path[0];
  const tail = path[count - 1];
  if (!(Math.abs(head.x1 - tail.x2) < 1) || !(Math.abs(head.y1 - tail.y2) < 1)) {
    return false;
  }

  // Bounding box over both end points of every segment.
  const xs = path.flatMap((seg) => [seg.x1, seg.x2]);
  const ys = path.flatMap((seg) => [seg.y1, seg.y2]);
  const minX = Math.min(...xs);
  const maxX = Math.max(...xs);
  const minY = Math.min(...ys);
  const maxY = Math.max(...ys);
  const w = maxX - minX;
  const h = maxY - minY;
  if (w < MIN_LINE_LENGTH && h < MIN_LINE_LENGTH) return false;

  path.length = 0;
  const thinHorizontally = h < ORIENTATION_TOL * 2 || (w > MIN_LINE_LENGTH && h <= MAX_LINE_WIDTH);
  if (thinHorizontally) {
    // Flat box: keep only its horizontal centre line.
    const midY = (minY + maxY) / 2;
    path.push({ x1: minX, y1: midY, x2: maxX, y2: midY });
    return true;
  }
  const thinVertically = w < ORIENTATION_TOL * 2 || (h > MIN_LINE_LENGTH && w <= MAX_LINE_WIDTH);
  if (thinVertically) {
    // Narrow box: keep only its vertical centre line.
    const midX = (minX + maxX) / 2;
    path.push({ x1: midX, y1: minY, x2: midX, y2: maxY });
    return true;
  }
  pushRectangle(path, minX, minY, w, h);
  return true;
}
2903
+ function flushPath(isStroke, isFill) {
2904
+ if (!isStroke && !isFill) {
2905
+ currentPath = [];
2906
+ return;
2907
+ }
2908
+ if (isFill && !isStroke && currentPath.length >= 3) {
2909
+ tryConvertLinesToRectangle(currentPath);
2910
+ }
2911
+ const scale = matScale(ctm);
2912
+ const effectiveLW = lineWidth * scale;
2913
+ if (effectiveLW > MAX_LINE_WIDTH && isStroke && !isFill) {
2783
2914
  currentPath = [];
2784
2915
  return;
2785
2916
  }
2786
2917
  for (const seg of currentPath) {
2787
- classifyAndAdd(seg, lineWidth, horizontals, verticals);
2918
+ const [px1, py1] = matTransformPoint(ctm, seg.x1, seg.y1);
2919
+ const [px2, py2] = matTransformPoint(ctm, seg.x2, seg.y2);
2920
+ classifyAndAdd(
2921
+ { x1: px1, y1: py1, x2: px2, y2: py2 },
2922
+ effectiveLW,
2923
+ horizontals,
2924
+ verticals
2925
+ );
2788
2926
  }
2789
2927
  currentPath = [];
2790
2928
  }
@@ -2792,9 +2930,28 @@ function extractLines(fnArray, argsArray) {
2792
2930
  const op = fnArray[i];
2793
2931
  const args = argsArray[i];
2794
2932
  switch (op) {
2933
+ // ── Graphics State ──
2934
+ case OPS.save:
2935
+ stateStack.push({ ctm: [...ctm], lineWidth });
2936
+ break;
2937
+ case OPS.restore:
2938
+ if (stateStack.length > 0) {
2939
+ const state = stateStack.pop();
2940
+ ctm = state.ctm;
2941
+ lineWidth = state.lineWidth;
2942
+ }
2943
+ break;
2944
+ case OPS.transform: {
2945
+ const m = args;
2946
+ if (m.length >= 6) {
2947
+ ctm = matMultiply(ctm, [m[0], m[1], m[2], m[3], m[4], m[5]]);
2948
+ }
2949
+ break;
2950
+ }
2795
2951
  case OPS.setLineWidth:
2796
2952
  lineWidth = args[0] || 1;
2797
2953
  break;
2954
+ // ── Path Construction ──
2798
2955
  case OPS.constructPath: {
2799
2956
  const arg0 = args[0];
2800
2957
  if (Array.isArray(arg0)) {
@@ -2862,34 +3019,60 @@ function extractLines(fnArray, argsArray) {
2862
3019
  }
2863
3020
  }
2864
3021
  }
2865
- if (afterOp === OPS.stroke || afterOp === OPS.closeStroke) {
2866
- flushPath(true);
2867
- } else if (afterOp === OPS.fill || afterOp === OPS.eoFill || afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke) {
2868
- flushPath(true);
3022
+ const isStroke5 = afterOp === OPS.stroke || afterOp === OPS.closeStroke;
3023
+ const isFill5 = afterOp === OPS.fill || afterOp === OPS.eoFill;
3024
+ const isBoth5 = afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke;
3025
+ if (isStroke5 || isFill5 || isBoth5) {
3026
+ flushPath(isStroke5 || isBoth5, isFill5 || isBoth5);
2869
3027
  } else if (afterOp === OPS.endPath) {
2870
- flushPath(false);
3028
+ flushPath(false, false);
2871
3029
  }
2872
3030
  }
2873
3031
  break;
2874
3032
  }
3033
+ // ── Paint Operations ──
2875
3034
  case OPS.stroke:
2876
3035
  case OPS.closeStroke:
2877
- flushPath(true);
3036
+ flushPath(true, false);
2878
3037
  break;
2879
3038
  case OPS.fill:
2880
3039
  case OPS.eoFill:
3040
+ flushPath(false, true);
3041
+ break;
2881
3042
  case OPS.fillStroke:
2882
3043
  case OPS.eoFillStroke:
2883
3044
  case OPS.closeFillStroke:
2884
3045
  case OPS.closeEOFillStroke:
2885
- flushPath(true);
3046
+ flushPath(true, true);
2886
3047
  break;
2887
3048
  case OPS.endPath:
2888
- flushPath(false);
3049
+ flushPath(false, false);
2889
3050
  break;
2890
3051
  }
2891
3052
  }
2892
- return { horizontals, verticals };
3053
+ return {
3054
+ horizontals: deduplicateLines(horizontals),
3055
+ verticals: deduplicateLines(verticals)
3056
+ };
3057
+ }
3058
/**
 * Drops line segments whose four coordinates all lie within COORD_MERGE_TOL of
 * a segment that was kept earlier. The kept segment takes over the larger
 * lineWidth of the two.
 *
 * @param {Array<{x1:number,y1:number,x2:number,y2:number,lineWidth:number}>} lines
 *   Segments to filter. Kept segments are the same object instances and may
 *   have their lineWidth updated.
 * @returns {Array<object>} The input array itself when it holds at most one
 *   segment, otherwise a new array of the kept segments in first-seen order.
 */
function deduplicateLines(lines) {
  if (lines.length <= 1) return lines;

  const isNear = (a, b) => Math.abs(a - b) <= COORD_MERGE_TOL;
  const kept = [];
  for (const candidate of lines) {
    const twin = kept.find(
      (prior) =>
        isNear(candidate.y1, prior.y1) &&
        isNear(candidate.y2, prior.y2) &&
        isNear(candidate.x1, prior.x1) &&
        isNear(candidate.x2, prior.x2)
    );
    if (twin === undefined) {
      kept.push(candidate);
      continue;
    }
    // Duplicate: remember the widest stroke seen at this position.
    if (candidate.lineWidth > twin.lineWidth) {
      twin.lineWidth = candidate.lineWidth;
    }
  }
  return kept;
}
2894
3077
  function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
2895
3078
  const dx = Math.abs(seg.x2 - seg.x1);
@@ -3465,7 +3648,7 @@ async function parsePdfDocument(buffer, options) {
3465
3648
  if (totalChars / Math.max(parsedPageCount, 1) < 10) {
3466
3649
  if (options?.ocr) {
3467
3650
  try {
3468
- const { ocrPages } = await import("./provider-A4FHJSID.js");
3651
+ const { ocrPages } = await import("./provider-7H4CPZYS.js");
3469
3652
  const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
3470
3653
  if (ocrBlocks.length > 0) {
3471
3654
  const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
@@ -3485,6 +3668,7 @@ async function parsePdfDocument(buffer, options) {
3485
3668
  const medianFontSize = computeMedianFontSize(allFontSizes);
3486
3669
  if (medianFontSize > 0) {
3487
3670
  detectHeadings(blocks, medianFontSize);
3671
+ mergeAdjacentHeadings(blocks);
3488
3672
  }
3489
3673
  detectMarkerHeadings(blocks);
3490
3674
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
@@ -3570,6 +3754,46 @@ function detectHeadings(blocks, medianFontSize) {
3570
3754
  }
3571
3755
  }
3572
3756
  }
3757
/**
 * Merges consecutive heading blocks that belong together, in place.
 *
 * Two neighbouring headings are merged when both have text and a bbox, share
 * the same page and level, and their reference lines (bbox.y plus the font
 * size, or the bbox height when no font size is known) are less than
 * 1.5 x the larger font size apart. The first block keeps the joined text and
 * the union bounding box; the second block is removed from the array.
 *
 * Text order: fragments that sit on the same line (reference lines less than
 * half a font size apart) are joined left-to-right by bbox.x. Fragments on
 * different lines keep their order in `blocks`, so a wrapped heading whose
 * second line starts further left is not reversed.
 *
 * @param {Array<object>} blocks - Blocks to process; mutated in place.
 * @returns {void}
 */
function mergeAdjacentHeadings(blocks) {
  let i = 0;
  while (i < blocks.length - 1) {
    const curr = blocks[i];
    const next = blocks[i + 1];
    const bothHeadings = curr.type === "heading" && next.type === "heading";
    if (!bothHeadings || !curr.bbox || !next.bbox || !curr.text || !next.text) {
      i++;
      continue;
    }

    const currBaseline = curr.bbox.y + (curr.style?.fontSize || curr.bbox.height);
    const nextBaseline = next.bbox.y + (next.style?.fontSize || next.bbox.height);
    const yDiff = Math.abs(currBaseline - nextBaseline);
    const maxFs = Math.max(curr.style?.fontSize || 12, next.style?.fontSize || 12);
    const closeEnough = curr.bbox.page === next.bbox.page && yDiff < maxFs * 1.5;
    if (!closeEnough || curr.level !== next.level) {
      i++;
      continue;
    }

    // Only reorder by x when both fragments really share a line; otherwise the
    // existing block order is the reading order.
    const onSameLine = yDiff < maxFs * 0.5;
    const nextComesFirst = onSameLine && curr.bbox.x > next.bbox.x;
    curr.text = nextComesFirst ? `${next.text} ${curr.text}` : `${curr.text} ${next.text}`;

    // Union of both boxes on both axes (the height must cover stacked lines).
    const left = Math.min(curr.bbox.x, next.bbox.x);
    const low = Math.min(curr.bbox.y, next.bbox.y);
    const right = Math.max(curr.bbox.x + curr.bbox.width, next.bbox.x + next.bbox.width);
    const high = Math.max(curr.bbox.y + curr.bbox.height, next.bbox.y + next.bbox.height);
    curr.bbox = {
      page: curr.bbox.page,
      x: left,
      y: low,
      width: right - left,
      height: high - low
    };

    // Stay on the same index so a third fragment can be merged as well.
    blocks.splice(i + 1, 1);
  }
}
3573
3797
  function collapseEvenSpacing(text) {
3574
3798
  const tokens = text.split(" ");
3575
3799
  const singleCharCount = tokens.filter((t) => t.length === 1).length;
@@ -3578,6 +3802,169 @@ function collapseEvenSpacing(text) {
3578
3802
  }
3579
3803
  return text;
3580
3804
  }
3805
/**
 * Turns the text items of one page into paragraph blocks, one per text line,
 * visiting the groups in the order returned by xyCutOrder.
 *
 * The gap threshold handed to xyCutOrder is 3% of the vertical extent of the
 * items (largest y minus smallest y), but never less than 15.
 *
 * @param {Array<object>} items - Text items; each must expose a numeric `y`.
 * @param {number} pageNum - Page number stored on every produced block.
 * @returns {Array<object>|null} Paragraph blocks, or null when none were produced.
 */
function buildXyCutBlocks(items, pageNum) {
  // Track the extent with a loop: spreading every item into
  // Math.max(...)/Math.min(...) passes one argument per item and can exceed
  // the engine's argument limit on very dense pages.
  let minY = Infinity;
  let maxY = -Infinity;
  for (const item of items) {
    minY = Math.min(minY, item.y);
    maxY = Math.max(maxY, item.y);
  }
  const pageHeight = maxY - minY;
  const gapThreshold = Math.max(15, pageHeight * 0.03);

  const blocks = [];
  for (const group of xyCutOrder(items, gapThreshold)) {
    if (group.length === 0) continue;
    for (const line of groupByY(group)) {
      const text = mergeLineSimple(line);
      if (!text.trim()) continue;
      blocks.push({
        type: "paragraph",
        text,
        pageNumber: pageNum,
        bbox: computeBBox(line, pageNum),
        style: dominantStyle(line)
      });
    }
  }
  return blocks.length > 0 ? blocks : null;
}
3828
/**
 * Re-analyses a grid table that appears to have swallowed too much text.
 *
 * A table counts as under-segmented when it is 1x1, or when it has at most two
 * non-blank cells and either at least 8 text lines in those cells or at least
 * 6 text items. For such a table the replacement is looked up in this order:
 *   1. hasMultiColumnLayout(items)  -> result of buildXyCutBlocks
 *   2. buildTableFromTextLayout     -> its result when truthy
 *   3. detectClusterTables          -> the detected tables plus one paragraph
 *      per non-blank item they did not use, sorted by descending
 *      bbox.y + bbox.height
 *
 * @param {object} table - Table with `rows`, `cols` and a `cells` grid of `{ text }`.
 * @param {Array<object>} items - Text items that fall inside the table.
 * @param {number} pageNum - Page number stored on produced blocks.
 * @param {object} bbox - Bounding box forwarded to buildTableFromTextLayout.
 * @returns {Array<object>|null} Replacement blocks, or null to keep the table.
 */
function normalizeUnderSegmentedTable(table, items, pageNum, bbox) {
  let filledCells = 0;
  let textLines = 0;
  for (const row of table.cells) {
    for (const cell of row) {
      if (!cell.text.trim()) continue;
      filledCells += 1;
      textLines += cell.text.split("\n").length;
    }
  }

  const isSingleCell = table.rows === 1 && table.cols === 1;
  const isSparse = filledCells <= 2;
  if (!isSingleCell && !(isSparse && (textLines >= 8 || items.length >= 6))) {
    return null;
  }

  if (hasMultiColumnLayout(items)) {
    return buildXyCutBlocks(items, pageNum);
  }

  const fromLayout = buildTableFromTextLayout(items, pageNum, bbox);
  if (fromLayout) return fromLayout;

  const clusterItems = items.map(({ text, x, y, w, h, fontSize, fontName }) => ({
    text,
    x,
    y,
    w,
    h,
    fontSize,
    fontName
  }));
  const clusterResults = detectClusterTables(clusterItems, pageNum);
  if (clusterResults.length === 0) return null;

  // Map each cluster item back to the index of the item it was copied from.
  const indexOfClusterItem = new Map(clusterItems.map((ci, idx) => [ci, idx]));
  const consumed = new Set();
  const replacement = [];
  for (const { usedItems, table: clusterTable, bbox: clusterBbox } of clusterResults) {
    for (const used of usedItems) {
      const idx = indexOfClusterItem.get(used);
      if (idx !== void 0) consumed.add(idx);
    }
    replacement.push({ type: "table", table: clusterTable, pageNumber: pageNum, bbox: clusterBbox });
  }

  // Every item no cluster table claimed becomes its own paragraph.
  items.forEach((item, idx) => {
    if (consumed.has(idx) || !item.text.trim()) return;
    replacement.push({
      type: "paragraph",
      text: item.text,
      pageNumber: pageNum,
      bbox: computeBBox([item], pageNum),
      style: { fontSize: item.fontSize, fontName: item.fontName }
    });
  });

  const upperEdge = (block) => (block.bbox ? block.bbox.y + block.bbox.height : 0);
  replacement.sort((a, b) => upperEdge(b) - upperEdge(a));
  return replacement.length > 0 ? replacement : null;
}
3878
/**
 * Tries to reconstruct a table purely from the positions of text items.
 *
 * Steps:
 *   1. Items are sorted by descending y (then ascending x) and grouped into
 *      rows; an item joins the current row while its y is within 3 units of
 *      the row's first item.
 *   2. Inside every row, horizontal gaps of at least 1.5 x the row's average
 *      font size are recorded by their midpoint.
 *   3. Midpoints within 15 units of a running cluster average are clustered;
 *      clusters with at least two members become column boundaries.
 *   4. Items are assigned to columns by their horizontal centre. A row whose
 *      first column is blank is folded into the previous row.
 *   5. The result is rejected when fewer than 30% of the cells hold text.
 *
 * Leading bullet markers (•, ○, ·, -) are stripped from cells outside the
 * first column. A leading "-" that is directly followed by a digit or "." is
 * kept, so negative numbers such as "-5" are not turned into "5".
 *
 * @param {Array<object>} items - Text items with `text`, `x`, `y`, `w`, `fontSize`.
 * @param {number} pageNum - Page number stored on the produced block.
 * @param {object} bbox - Bounding box stored on the produced block.
 * @returns {Array<object>|null} A one-element array holding the table block,
 *   or null when no convincing table structure was found.
 */
function buildTableFromTextLayout(items, pageNum, bbox) {
  if (items.length < 4) return null;

  // 1. Group items into rows, walking in descending y order.
  const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
  const yTol = 3;
  const rows = [];
  let curRow = [sorted[0]];
  let curY = sorted[0].y;
  for (let i = 1; i < sorted.length; i++) {
    if (Math.abs(sorted[i].y - curY) <= yTol) {
      curRow.push(sorted[i]);
    } else {
      rows.push(curRow);
      curRow = [sorted[i]];
      curY = sorted[i].y;
    }
  }
  rows.push(curRow);
  if (rows.length < 2) return null;

  // 2. Record the midpoint of every wide horizontal gap.
  const gapPositions = [];
  for (const row of rows) {
    if (row.length < 2) continue;
    const sortedX = [...row].sort((a, b) => a.x - b.x);
    const avgFs = sortedX.reduce((s, item) => s + item.fontSize, 0) / sortedX.length;
    for (let j = 1; j < sortedX.length; j++) {
      const prevRight = sortedX[j - 1].x + sortedX[j - 1].w;
      const gap = sortedX[j].x - prevRight;
      if (gap >= avgFs * 1.5) {
        gapPositions.push(prevRight + gap / 2);
      }
    }
  }
  if (gapPositions.length < 2) return null;

  // 3. Cluster the midpoints; a boundary needs support from at least two gaps.
  gapPositions.sort((a, b) => a - b);
  const colBoundaries = [];
  let clusterSum = gapPositions[0];
  let clusterCount = 1;
  for (let i = 1; i < gapPositions.length; i++) {
    const avg = clusterSum / clusterCount;
    if (Math.abs(gapPositions[i] - avg) <= 15) {
      clusterSum += gapPositions[i];
      clusterCount++;
    } else {
      if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
      clusterSum = gapPositions[i];
      clusterCount = 1;
    }
  }
  if (clusterCount >= 2) colBoundaries.push(clusterSum / clusterCount);
  if (colBoundaries.length === 0) return null;

  // 4. Place every item into the column its horizontal centre falls in.
  const numCols = colBoundaries.length + 1;
  const tableRows = [];
  for (const row of rows) {
    const cells = Array(numCols).fill("");
    const sortedX = [...row].sort((a, b) => a.x - b.x);
    for (const item of sortedX) {
      const cx = item.x + item.w / 2;
      let col = 0;
      for (let b = 0; b < colBoundaries.length; b++) {
        if (cx > colBoundaries[b]) col = b + 1;
      }
      cells[col] = cells[col] ? `${cells[col]} ${item.text}` : item.text;
    }
    if (cells[0].trim() === "" && tableRows.length > 0) {
      // No text in the first column: treat as a continuation of the row above.
      const prevCells = tableRows[tableRows.length - 1].cells;
      for (let c = 0; c < numCols; c++) {
        const extra = cells[c].trim();
        if (extra) {
          prevCells[c] = prevCells[c] ? `${prevCells[c]} ${extra}` : extra;
        }
      }
    } else {
      tableRows.push({ cells });
    }
  }
  if (tableRows.length < 2) return null;

  // 5. Reject grids that are mostly empty.
  const nonEmptyCount = tableRows.reduce((sum, r) => sum + r.cells.filter((c) => c.trim()).length, 0);
  const totalCount = tableRows.length * numCols;
  if (nonEmptyCount < totalCount * 0.3) return null;

  // The lookahead keeps the minus sign of numeric values ("-5", "-.5").
  const leadingBullet = /^(?:[•○·]|-(?![\d.]))\s*/;
  const irCells = tableRows.map(
    (r) => r.cells.map((text, colIdx) => {
      let cleaned = text.trim();
      if (colIdx > 0) cleaned = cleaned.replace(leadingBullet, "");
      return { text: cleaned, colSpan: 1, rowSpan: 1 };
    })
  );
  const irTable = {
    rows: tableRows.length,
    cols: numCols,
    cells: irCells,
    // Always true here: tables with fewer than two rows were rejected above.
    hasHeader: tableRows.length > 1
  };
  return [{ type: "table", table: irTable, pageNumber: pageNum, bbox }];
}
3581
3968
  function shouldDemoteTable(table) {
3582
3969
  const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
3583
3970
  const allText = allCells.join(" ");
@@ -3624,6 +4011,32 @@ function detectMarkerHeadings(blocks) {
3624
4011
  }
3625
4012
  }
3626
4013
  }
4014
/**
 * Heuristically decides whether the items form two side-by-side columns.
 *
 * All of the following must hold:
 *   - there are at least 30 items spanning at least 200 units horizontally;
 *   - with the items ordered by x, the widest gap between one item's right
 *     edge and the next item's left edge is at least 20 units;
 *   - the middle of that gap lies between 35% and 65% of the horizontal span;
 *   - each side of the split holds at least 15 item centres, and the smaller
 *     side has at least 35% as many as the larger one.
 *
 * @param {Array<{x:number,w:number}>} items - Text items of one page or region.
 * @returns {boolean} true when a two-column layout is detected.
 */
function hasMultiColumnLayout(items) {
  if (items.length < 30) return false;

  const byLeft = [...items].sort((a, b) => a.x - b.x);
  const minX = byLeft[0].x;
  const maxX = byLeft.reduce((edge, item) => (item.x + item.w > edge ? item.x + item.w : edge), minX);
  const span = maxX - minX;
  if (span < 200) return false;

  // Widest gap between neighbours in x order, and the midpoint of that gap.
  let widestGap = 0;
  let splitX = 0;
  for (let k = 1; k < byLeft.length; k++) {
    const prevRight = byLeft[k - 1].x + byLeft[k - 1].w;
    const gap = byLeft[k].x - prevRight;
    if (gap > widestGap) {
      widestGap = gap;
      splitX = (prevRight + byLeft[k].x) / 2;
    }
  }
  if (widestGap < 20) return false;

  const relativeSplit = (splitX - minX) / span;
  if (relativeSplit < 0.35 || relativeSplit > 0.65) return false;

  let leftCount = 0;
  let rightCount = 0;
  for (const item of items) {
    const centre = item.x + item.w / 2;
    if (centre < splitX) leftCount += 1;
    else if (centre >= splitX) rightCount += 1;
  }
  if (leftCount < 15 || rightCount < 15) return false;

  const balance = Math.min(leftCount, rightCount) / Math.max(leftCount, rightCount);
  return !(balance < 0.35);
}
3627
4040
  var MAX_XYCUT_DEPTH = 50;
3628
4041
  function xyCutOrder(items, gapThreshold, depth = 0) {
3629
4042
  if (items.length === 0) return [];
@@ -3754,6 +4167,11 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
3754
4167
  width: grid.bbox.x2 - grid.bbox.x1,
3755
4168
  height: grid.bbox.y2 - grid.bbox.y1
3756
4169
  };
4170
+ const normalized = normalizeUnderSegmentedTable(irTable, tableItems, pageNum, tableBbox);
4171
+ if (normalized) {
4172
+ blocks.push(...normalized);
4173
+ continue;
4174
+ }
3757
4175
  if (shouldDemoteTable(irTable)) {
3758
4176
  const demoted = demoteTableToText(irTable);
3759
4177
  if (demoted) {
@@ -3799,6 +4217,10 @@ function mergeAdjacentTableBlocks(blocks) {
3799
4217
  }
3800
4218
  function extractPageBlocksFallback(items, pageNum) {
3801
4219
  if (items.length === 0) return [];
4220
+ if (hasMultiColumnLayout(items)) {
4221
+ const xyBlocks = buildXyCutBlocks(items, pageNum) || [];
4222
+ return detectSpecialKoreanTables(detectListBlocks(xyBlocks));
4223
+ }
3802
4224
  const blocks = [];
3803
4225
  const allYLines = groupByY(items);
3804
4226
  const columns = detectColumns(allYLines);
@@ -3816,7 +4238,7 @@ function extractPageBlocksFallback(items, pageNum) {
3816
4238
  fontSize: i.fontSize,
3817
4239
  fontName: i.fontName
3818
4240
  }));
3819
- const clusterResults = detectClusterTables(clusterItems, pageNum);
4241
+ const clusterResults = hasMultiColumnLayout(items) ? [] : detectClusterTables(clusterItems, pageNum);
3820
4242
  if (clusterResults.length > 0) {
3821
4243
  const ciToIdx = /* @__PURE__ */ new Map();
3822
4244
  for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
@@ -5446,4 +5868,4 @@ export {
5446
5868
  extractFormFields,
5447
5869
  parse
5448
5870
  };
5449
- //# sourceMappingURL=chunk-XJYM2AUA.js.map
5871
+ //# sourceMappingURL=chunk-GJ2S6IMC.js.map