kordoc 2.9.1 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40)
  1. package/README.md +28 -0
  2. package/dist/-K5SLEFZD.js +71 -0
  3. package/dist/-K5SLEFZD.js.map +1 -0
  4. package/dist/{chunk-GQQNAYZA.js → chunk-326STEDU.js} +6684 -4061
  5. package/dist/chunk-326STEDU.js.map +1 -0
  6. package/dist/{chunk-FWAXCTSX.cjs → chunk-3WRJQQIO.cjs} +185 -16
  7. package/dist/chunk-3WRJQQIO.cjs.map +1 -0
  8. package/dist/chunk-MUOQXDZ4.cjs.map +1 -1
  9. package/dist/{chunk-Z6TLTWYK.js → chunk-NHXKJWR7.js} +182 -13
  10. package/dist/chunk-NHXKJWR7.js.map +1 -0
  11. package/dist/{chunk-ODF24QXC.js → chunk-SA2PERJ5.js} +182 -13
  12. package/dist/chunk-SA2PERJ5.js.map +1 -0
  13. package/dist/cli.js +42 -3
  14. package/dist/cli.js.map +1 -1
  15. package/dist/formula-XGG6ZP42.cjs.map +1 -1
  16. package/dist/index.cjs +3247 -822
  17. package/dist/index.cjs.map +1 -1
  18. package/dist/index.d.cts +61 -2
  19. package/dist/index.d.ts +61 -2
  20. package/dist/index.js +3025 -600
  21. package/dist/index.js.map +1 -1
  22. package/dist/mcp.js +3 -3
  23. package/dist/page-range-3C7UGGEK.cjs.map +1 -1
  24. package/dist/{parser-BKYM3LKN.js → parser-4IVYHKSL.js} +677 -85
  25. package/dist/parser-4IVYHKSL.js.map +1 -0
  26. package/dist/{parser-BTIPAEDZ.cjs → parser-5KHU732L.cjs} +689 -97
  27. package/dist/parser-5KHU732L.cjs.map +1 -0
  28. package/dist/{parser-FJNQEW7K.js → parser-AU2NLC44.js} +677 -85
  29. package/dist/parser-AU2NLC44.js.map +1 -0
  30. package/dist/provider-SNONEZNW.cjs.map +1 -1
  31. package/dist/{watch-SBLSWHL7.js → watch-5DDN4BUI.js} +3 -3
  32. package/package.json +1 -1
  33. package/dist/chunk-FWAXCTSX.cjs.map +0 -1
  34. package/dist/chunk-GQQNAYZA.js.map +0 -1
  35. package/dist/chunk-ODF24QXC.js.map +0 -1
  36. package/dist/chunk-Z6TLTWYK.js.map +0 -1
  37. package/dist/parser-BKYM3LKN.js.map +0 -1
  38. package/dist/parser-BTIPAEDZ.cjs.map +0 -1
  39. package/dist/parser-FJNQEW7K.js.map +0 -1
  40. package/dist/{watch-SBLSWHL7.js.map → watch-5DDN4BUI.js.map} +0 -0
@@ -6,7 +6,7 @@
6
6
 
7
7
 
8
8
 
9
- var _chunkFWAXCTSXcjs = require('./chunk-FWAXCTSX.cjs');
9
+ var _chunk3WRJQQIOcjs = require('./chunk-3WRJQQIO.cjs');
10
10
 
11
11
 
12
12
  var _chunkMUOQXDZ4cjs = require('./chunk-MUOQXDZ4.cjs');
@@ -156,6 +156,55 @@ function extractLines(fnArray, argsArray) {
156
156
  }
157
157
  return { horizontals, verticals };
158
158
  }
159
+ function multiplyTransform(m, t) {
160
+ return [
161
+ m[0] * t[0] + m[2] * t[1],
162
+ m[1] * t[0] + m[3] * t[1],
163
+ m[0] * t[2] + m[2] * t[3],
164
+ m[1] * t[2] + m[3] * t[3],
165
+ m[0] * t[4] + m[2] * t[5] + m[4],
166
+ m[1] * t[4] + m[3] * t[5] + m[5]
167
+ ];
168
+ }
169
+ function extractImageRegions(fnArray, argsArray) {
170
+ const regions = [];
171
+ let ctm = [1, 0, 0, 1, 0, 0];
172
+ const stack = [];
173
+ for (let i = 0; i < fnArray.length; i++) {
174
+ const op = fnArray[i];
175
+ switch (op) {
176
+ case _pdfmjs.OPS.save:
177
+ stack.push(ctm);
178
+ break;
179
+ case _pdfmjs.OPS.restore:
180
+ ctm = stack.pop() || [1, 0, 0, 1, 0, 0];
181
+ break;
182
+ case _pdfmjs.OPS.transform: {
183
+ const t = argsArray[i];
184
+ if (Array.isArray(t) && t.length >= 6) ctm = multiplyTransform(ctm, t);
185
+ break;
186
+ }
187
+ case _pdfmjs.OPS.paintImageXObject:
188
+ case _pdfmjs.OPS.paintInlineImageXObject:
189
+ case _pdfmjs.OPS.paintImageMaskXObject:
190
+ case _pdfmjs.OPS.paintImageXObjectRepeat: {
191
+ const corners = [[0, 0], [1, 0], [0, 1], [1, 1]];
192
+ let x1 = Infinity, y1 = Infinity, x2 = -Infinity, y2 = -Infinity;
193
+ for (const [u, v] of corners) {
194
+ const x = ctm[0] * u + ctm[2] * v + ctm[4];
195
+ const y = ctm[1] * u + ctm[3] * v + ctm[5];
196
+ if (x < x1) x1 = x;
197
+ if (x > x2) x2 = x;
198
+ if (y < y1) y1 = y;
199
+ if (y > y2) y2 = y;
200
+ }
201
+ if (x2 - x1 > 0 && y2 - y1 > 0) regions.push({ x1, y1, x2, y2 });
202
+ break;
203
+ }
204
+ }
205
+ }
206
+ return regions;
207
+ }
159
208
  function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
160
209
  const dx = Math.abs(seg.x2 - seg.x1);
161
210
  const dy = Math.abs(seg.y2 - seg.y1);
@@ -541,6 +590,10 @@ function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
541
590
  }
542
591
  return false;
543
592
  }
593
+ var SPACE_GAP_RATIO = 0.17;
594
+ function spaceGapThreshold(fontSize) {
595
+ return Math.max(fontSize * SPACE_GAP_RATIO, 1);
596
+ }
544
597
  function mapTextToCells(items, cells) {
545
598
  const result = /* @__PURE__ */ new Map();
546
599
  for (const cell of cells) {
@@ -600,14 +653,12 @@ function cellTextToString(items) {
600
653
  }
601
654
  const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
602
655
  const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
603
- const prevIsKorean = /[가-힣]$/.test(result);
604
- const currIsKorean = /^[가-힣]/.test(s[j].text);
605
- if (gap < avgFs * 0.15) {
606
- result += s[j].text;
607
- } else if (gap < avgFs * 0.35 && (prevIsKorean || currIsKorean)) {
608
- result += s[j].text;
609
- } else {
656
+ if (s[j].hasSpaceBefore && gap >= avgFs * 0.05) {
657
+ result += " " + s[j].text;
658
+ } else if (gap > spaceGapThreshold(avgFs)) {
610
659
  result += " " + s[j].text;
660
+ } else {
661
+ result += s[j].text;
611
662
  }
612
663
  }
613
664
  return result;
@@ -620,6 +671,11 @@ function detectEvenSpacedItems(items) {
620
671
  let runStart = -1;
621
672
  for (let i = 0; i < items.length; i++) {
622
673
  const isShortKorean = /^[가-힣]{1}$/.test(items[i].text) || /^[\d]{1}$/.test(items[i].text);
674
+ if (isShortKorean && runStart >= 0 && items[i].hasSpaceBefore) {
675
+ if (i - runStart >= 3) markEvenRun(items, result, runStart, i);
676
+ runStart = i;
677
+ continue;
678
+ }
623
679
  if (isShortKorean && runStart >= 0 && i > 0) {
624
680
  const gap = items[i].x - (items[i - 1].x + items[i - 1].w);
625
681
  const maxRunGap = Math.max(items[i].fontSize * 3, 30);
@@ -662,6 +718,119 @@ function markEvenRun(items, result, start, end) {
662
718
  }
663
719
  }
664
720
  }
721
+ var MAX_UNDERSEGMENTED_ROWS = 2;
722
+ var MIN_UNDERSEGMENTED_COLUMNS = 3;
723
+ var MIN_UNDERSEGMENTED_TEXT_LINES = 8;
724
+ var MIN_ROW_BAND_MISMATCH = 2;
725
+ var MIN_ROW_BAND_EPSILON = 3;
726
+ var ROW_BAND_EPSILON_RATIO = 0.6;
727
+ function itemCenterY(item) {
728
+ return item.y + (item.h > 0 ? item.h : item.fontSize) / 2;
729
+ }
730
+ function itemHeight(item) {
731
+ return item.h > 0 ? item.h : item.fontSize;
732
+ }
733
+ function findColumnIndex(item, colXs) {
734
+ const cx = item.x + item.w / 2;
735
+ for (let c = 0; c < colXs.length - 1; c++) {
736
+ if (cx >= colXs[c] && cx <= colXs[c + 1]) return c;
737
+ }
738
+ let best = 0;
739
+ let bestDist = Infinity;
740
+ for (let c = 0; c < colXs.length - 1; c++) {
741
+ const center = (colXs[c] + colXs[c + 1]) / 2;
742
+ const d = Math.abs(cx - center);
743
+ if (d < bestDist) {
744
+ bestDist = d;
745
+ best = c;
746
+ }
747
+ }
748
+ return best;
749
+ }
750
+ function groupItemsToVisualLines(items) {
751
+ if (items.length === 0) return [];
752
+ const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
753
+ const lines = [];
754
+ let cur = [sorted[0]];
755
+ let curY = sorted[0].y;
756
+ for (let i = 1; i < sorted.length; i++) {
757
+ const tol = Math.max(3, Math.min(sorted[i].fontSize, cur[0].fontSize) * 0.6);
758
+ if (Math.abs(sorted[i].y - curY) <= tol) {
759
+ cur.push(sorted[i]);
760
+ } else {
761
+ lines.push(cur);
762
+ cur = [sorted[i]];
763
+ curY = sorted[i].y;
764
+ }
765
+ }
766
+ lines.push(cur);
767
+ return lines;
768
+ }
769
+ function normalizeUndersegmentedTable(originalCells, colXs, items) {
770
+ const numRows = originalCells.length;
771
+ const numCols = colXs.length - 1;
772
+ if (numRows > MAX_UNDERSEGMENTED_ROWS || numCols < MIN_UNDERSEGMENTED_COLUMNS) return null;
773
+ if (items.length === 0) return null;
774
+ const itemsByCol = Array.from({ length: numCols }, () => []);
775
+ for (const item of items) {
776
+ if (!item.text.trim()) continue;
777
+ itemsByCol[findColumnIndex(item, colXs)].push(item);
778
+ }
779
+ let denseColumns = 0;
780
+ for (const colItems of itemsByCol) {
781
+ if (groupItemsToVisualLines(colItems).length >= MIN_UNDERSEGMENTED_TEXT_LINES) denseColumns++;
782
+ }
783
+ if (denseColumns < 2) return null;
784
+ const allLines = groupItemsToVisualLines(items.filter((i) => i.text.trim()));
785
+ const bands = [];
786
+ for (const line of allLines) {
787
+ let cy = 0, h = 0;
788
+ for (const it of line) {
789
+ cy += itemCenterY(it);
790
+ h += itemHeight(it);
791
+ }
792
+ cy /= line.length;
793
+ h /= line.length;
794
+ const top = cy + h / 2;
795
+ const bottom = cy - h / 2;
796
+ let matched = null;
797
+ for (const band of bands) {
798
+ const epsilon = Math.max(MIN_ROW_BAND_EPSILON, Math.min(band.avgHeight, h) * ROW_BAND_EPSILON_RATIO);
799
+ if (Math.abs(band.centerY - cy) <= epsilon || bottom <= band.topY && top >= band.bottomY) {
800
+ matched = band;
801
+ break;
802
+ }
803
+ }
804
+ if (!matched) {
805
+ matched = { centerY: 0, avgHeight: 0, topY: -Infinity, bottomY: Infinity, lineCount: 0, itemsByCol: Array.from({ length: numCols }, () => []) };
806
+ bands.push(matched);
807
+ }
808
+ matched.centerY = (matched.centerY * matched.lineCount + cy) / (matched.lineCount + 1);
809
+ matched.avgHeight = (matched.avgHeight * matched.lineCount + h) / (matched.lineCount + 1);
810
+ matched.topY = Math.max(matched.topY, top);
811
+ matched.bottomY = Math.min(matched.bottomY, bottom);
812
+ matched.lineCount++;
813
+ for (const it of line) {
814
+ matched.itemsByCol[findColumnIndex(it, colXs)].push(it);
815
+ }
816
+ }
817
+ if (bands.length < numRows + MIN_ROW_BAND_MISMATCH) return null;
818
+ bands.sort((a, b) => b.centerY - a.centerY);
819
+ const rebuilt = bands.map(
820
+ (band) => band.itemsByCol.map((colItems) => colItems.length > 0 ? cellTextToString(colItems) : "")
821
+ );
822
+ const countNonEmptyRows = (cells) => cells.filter((row) => row.some((c) => (typeof c === "string" ? c : c.text).trim() !== "")).length;
823
+ const countNonEmptyCols = (cells, cols) => {
824
+ let n = 0;
825
+ for (let c = 0; c < cols; c++) {
826
+ if (cells.some((row) => row[c] != null && (typeof row[c] === "string" ? row[c] : row[c].text).trim() !== "")) n++;
827
+ }
828
+ return n;
829
+ };
830
+ if (countNonEmptyRows(rebuilt) <= countNonEmptyRows(originalCells)) return null;
831
+ if (countNonEmptyCols(rebuilt, numCols) < countNonEmptyCols(originalCells, numCols)) return null;
832
+ return rebuilt;
833
+ }
665
834
  function mergeCellTextLines(textLines) {
666
835
  if (textLines.length <= 1) return textLines[0] || "";
667
836
  const merged = [textLines[0]];
@@ -694,7 +863,7 @@ var MIN_COL_FILL_RATIO = 0.4;
694
863
  function detectClusterTables(items, pageNum) {
695
864
  if (items.length < MIN_ROWS * MIN_COLS) return [];
696
865
  const { merged, originMap } = mergeEvenSpacedClusters(items);
697
- const rows = groupByBaseline(merged);
866
+ const rows = mergeOverlappingRows(groupByBaseline(merged));
698
867
  if (rows.length < MIN_ROWS) return [];
699
868
  const results = [];
700
869
  const headerResult = detectHeaderRow(rows);
@@ -743,6 +912,7 @@ function mergeEvenSpacedClusters(items) {
743
912
  if (/^[가-힣\d]$/.test(sorted[i].text)) {
744
913
  let runEnd = i + 1;
745
914
  while (runEnd < sorted.length && /^[가-힣\d]$/.test(sorted[runEnd].text)) {
915
+ if (sorted[runEnd].hasSpaceBefore) break;
746
916
  const gap = sorted[runEnd].x - (sorted[runEnd - 1].x + sorted[runEnd - 1].w);
747
917
  const fs = sorted[runEnd].fontSize;
748
918
  if (gap < fs * 0.1 || gap > fs * 3) break;
@@ -833,6 +1003,38 @@ function detectHeaderRow(rows) {
833
1003
  }
834
1004
  return null;
835
1005
  }
1006
+ function mergeOverlappingRows(rows) {
1007
+ if (rows.length <= 1) return rows;
1008
+ const result = [rows[0]];
1009
+ for (let i = 1; i < rows.length; i++) {
1010
+ const prev = result[result.length - 1];
1011
+ const curr = rows[i];
1012
+ const a = rowBand(prev);
1013
+ const b = rowBand(curr);
1014
+ const overlap = Math.min(a.top, b.top) - Math.max(a.bottom, b.bottom);
1015
+ const prevIsFrag = isFragmentRow(prev) && a.height <= b.height * 0.8 && overlap >= a.height * 0.5;
1016
+ const currIsFrag = isFragmentRow(curr) && b.height <= a.height * 0.8 && overlap >= b.height * 0.5;
1017
+ if (prevIsFrag || currIsFrag) {
1018
+ const baseY = prevIsFrag ? curr.y : prev.y;
1019
+ result[result.length - 1] = { y: baseY, items: [...prev.items, ...curr.items] };
1020
+ } else {
1021
+ result.push(curr);
1022
+ }
1023
+ }
1024
+ return result;
1025
+ }
1026
+ function isFragmentRow(row) {
1027
+ return row.items.length <= 3 && row.items.every((i) => i.text.length <= 8);
1028
+ }
1029
+ function rowBand(row) {
1030
+ let bottom = Infinity, top = -Infinity;
1031
+ for (const i of row.items) {
1032
+ const h = i.h > 0 ? i.h : i.fontSize;
1033
+ if (i.y < bottom) bottom = i.y;
1034
+ if (i.y + h > top) top = i.y + h;
1035
+ }
1036
+ return { bottom, top, height: top - bottom };
1037
+ }
836
1038
  function mergeMultiLineRows(rows, columns) {
837
1039
  if (rows.length <= 1) return rows;
838
1040
  const result = [rows[0]];
@@ -1293,7 +1495,7 @@ async function loadPdfWithTimeout(buffer) {
1293
1495
  new Promise((_, reject) => {
1294
1496
  timer = setTimeout(() => {
1295
1497
  loadingTask.destroy();
1296
- reject(new (0, _chunkFWAXCTSXcjs.KordocError)("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
1498
+ reject(new (0, _chunk3WRJQQIOcjs.KordocError)("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
1297
1499
  }, PDF_LOAD_TIMEOUT_MS);
1298
1500
  })
1299
1501
  ]);
@@ -1306,7 +1508,7 @@ async function parsePdfDocument(buffer, options) {
1306
1508
  const doc = await loadPdfWithTimeout(buffer);
1307
1509
  try {
1308
1510
  const pageCount = doc.numPages;
1309
- if (pageCount === 0) throw new (0, _chunkFWAXCTSXcjs.KordocError)("PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
1511
+ if (pageCount === 0) throw new (0, _chunk3WRJQQIOcjs.KordocError)("PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
1310
1512
  const metadata = { pageCount };
1311
1513
  await extractPdfMetadata(doc, metadata);
1312
1514
  const blocks = [];
@@ -1319,6 +1521,8 @@ async function parsePdfDocument(buffer, options) {
1319
1521
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
1320
1522
  const fontSizeFreq = /* @__PURE__ */ new Map();
1321
1523
  const pageHeights = /* @__PURE__ */ new Map();
1524
+ const pagesWithLargeImage = /* @__PURE__ */ new Set();
1525
+ const skippedImagePages = /* @__PURE__ */ new Map();
1322
1526
  let parsedPages = 0;
1323
1527
  for (let i = 1; i <= effectivePageCount; i++) {
1324
1528
  if (pageFilter && !pageFilter.has(i)) continue;
@@ -1337,6 +1541,23 @@ async function parsePdfDocument(buffer, options) {
1337
1541
  if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
1338
1542
  }
1339
1543
  const opList = await page.getOperatorList();
1544
+ const pageArea = viewport.width * viewport.height;
1545
+ if (pageArea > 0) {
1546
+ const imageRegions = extractImageRegions(opList.fnArray, opList.argsArray);
1547
+ let uncovered = 0;
1548
+ for (const r of imageRegions) {
1549
+ const area = (r.x2 - r.x1) * (r.y2 - r.y1);
1550
+ if (area < pageArea * 0.05) continue;
1551
+ pagesWithLargeImage.add(i);
1552
+ const hasText = visible.some((it) => {
1553
+ const cx = it.x + it.w / 2;
1554
+ const cy = it.y + (it.h || it.fontSize) / 2;
1555
+ return cx >= r.x1 && cx <= r.x2 && cy >= r.y1 && cy <= r.y2;
1556
+ });
1557
+ if (!hasText) uncovered++;
1558
+ }
1559
+ if (uncovered > 0) skippedImagePages.set(i, uncovered);
1560
+ }
1340
1561
  const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
1341
1562
  for (const b of pageBlocks) blocks.push(b);
1342
1563
  let pageText = "";
@@ -1347,15 +1568,16 @@ async function parsePdfDocument(buffer, options) {
1347
1568
  pageText += pageText ? "\n" + t : t;
1348
1569
  }
1349
1570
  pageQuality.push(computePageQuality(i, pageText));
1350
- if (totalTextBytes > MAX_TOTAL_TEXT) throw new (0, _chunkFWAXCTSXcjs.KordocError)("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
1571
+ if (totalTextBytes > MAX_TOTAL_TEXT) throw new (0, _chunk3WRJQQIOcjs.KordocError)("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
1351
1572
  parsedPages++;
1352
1573
  _optionalChain([options, 'optionalAccess', _12 => _12.onProgress, 'optionalCall', _13 => _13(parsedPages, totalTarget)]);
1353
1574
  } catch (pageErr) {
1354
- if (pageErr instanceof _chunkFWAXCTSXcjs.KordocError) throw pageErr;
1575
+ if (pageErr instanceof _chunk3WRJQQIOcjs.KordocError) throw pageErr;
1355
1576
  warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
1356
1577
  }
1357
1578
  }
1358
1579
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
1580
+ let isImageBased = false;
1359
1581
  if (totalChars / Math.max(parsedPageCount, 1) < 10) {
1360
1582
  if (_optionalChain([options, 'optionalAccess', _14 => _14.ocr])) {
1361
1583
  try {
@@ -1368,7 +1590,29 @@ async function parsePdfDocument(buffer, options) {
1368
1590
  } catch (e2) {
1369
1591
  }
1370
1592
  }
1371
- throw Object.assign(new (0, _chunkFWAXCTSXcjs.KordocError)(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
1593
+ isImageBased = true;
1594
+ warnings.push({
1595
+ message: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, \uD14D\uC2A4\uD2B8 ${totalChars}\uC790) \u2014 \uD14D\uC2A4\uD2B8 \uB808\uC774\uC5B4\uAC00 \uC5C6\uC5B4 OCR\uC774 \uD544\uC694\uD569\uB2C8\uB2E4`,
1596
+ code: "NEEDS_OCR"
1597
+ });
1598
+ }
1599
+ if (!isImageBased) {
1600
+ const OCR_REASON_MESSAGES = {
1601
+ low_text: "\uD14D\uC2A4\uD2B8\uAC00 \uAC70\uC758 \uC5C6\uB294 \uD398\uC774\uC9C0 (\uC2A4\uCE94/\uC774\uBBF8\uC9C0 \uCD94\uC815)",
1602
+ high_pua: "\uAE00\uAF34 \uB9E4\uD551 \uC2E4\uD328 (PUA \uBE44\uC728 \uB192\uC74C) \u2014 \uCD94\uCD9C \uD14D\uC2A4\uD2B8 \uC2E0\uB8B0 \uBD88\uAC00",
1603
+ high_control: "\uC81C\uC5B4\uBB38\uC790 \uBE44\uC728 \uB192\uC74C \u2014 \uCD94\uCD9C \uD14D\uC2A4\uD2B8 \uC2E0\uB8B0 \uBD88\uAC00",
1604
+ high_replacement: "\uB300\uCCB4\uBB38\uC790(U+FFFD) \uBE44\uC728 \uB192\uC74C \u2014 \uCD94\uCD9C \uD14D\uC2A4\uD2B8 \uC2E0\uB8B0 \uBD88\uAC00"
1605
+ };
1606
+ for (const pq of pageQuality) {
1607
+ if (!pq.needsOcr || !pq.ocrReason) continue;
1608
+ if (pq.ocrReason === "low_text" && !pagesWithLargeImage.has(pq.page)) continue;
1609
+ warnings.push({ page: pq.page, message: `${OCR_REASON_MESSAGES[pq.ocrReason]} \u2014 OCR \uAC80\uD1A0 \uD544\uC694`, code: "NEEDS_OCR" });
1610
+ }
1611
+ }
1612
+ if (!isImageBased) {
1613
+ for (const [page, count] of [...skippedImagePages.entries()].sort((a, b) => a[0] - b[0])) {
1614
+ warnings.push({ page, message: `${count}\uAC1C \uC774\uBBF8\uC9C0 \uC601\uC5ED\uC5D0 \uCD94\uCD9C \uAC00\uB2A5\uD55C \uD14D\uC2A4\uD2B8 \uC5C6\uC74C (\uADF8\uB9BC/\uCC28\uD2B8/\uB3C4\uC7A5 \uB0B4\uC6A9 \uB204\uB77D \uAC00\uB2A5)`, code: "SKIPPED_IMAGE" });
1615
+ }
1372
1616
  }
1373
1617
  if (_optionalChain([options, 'optionalAccess', _15 => _15.removeHeaderFooter]) !== false && parsedPageCount >= 3) {
1374
1618
  const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
@@ -1376,6 +1620,7 @@ async function parsePdfDocument(buffer, options) {
1376
1620
  blocks.splice(removed[ri], 1);
1377
1621
  }
1378
1622
  }
1623
+ mergeCrossPageTables(blocks);
1379
1624
  if (_optionalChain([options, 'optionalAccess', _16 => _16.formulaOcr]) && formulaBuffer) {
1380
1625
  try {
1381
1626
  await applyFormulaOcr(formulaBuffer, blocks, pageFilter, effectivePageCount, warnings, options.onProgress);
@@ -1391,15 +1636,18 @@ async function parsePdfDocument(buffer, options) {
1391
1636
  detectHeadings(blocks, medianFontSize);
1392
1637
  }
1393
1638
  detectMarkerHeadings(blocks);
1639
+ detectTableCaptions(blocks);
1640
+ detectKoreanListBlocks(blocks);
1394
1641
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
1395
1642
  sanitizeBlockControlChars(blocks);
1396
- let markdown = cleanPdfText(_chunkFWAXCTSXcjs.blocksToMarkdown.call(void 0, blocks));
1643
+ let markdown = cleanPdfText(_chunk3WRJQQIOcjs.blocksToMarkdown.call(void 0, blocks));
1397
1644
  return {
1398
1645
  markdown,
1399
1646
  blocks,
1400
1647
  metadata,
1401
1648
  outline: outline.length > 0 ? outline : void 0,
1402
1649
  warnings: warnings.length > 0 ? warnings : void 0,
1650
+ isImageBased: isImageBased || void 0,
1403
1651
  pageQuality,
1404
1652
  qualitySummary: summarizeDocumentQuality(pageQuality)
1405
1653
  };
@@ -1480,9 +1728,9 @@ function detectHeadings(blocks, medianFontSize) {
1480
1728
  if (/^\d+$/.test(text)) continue;
1481
1729
  const ratio = block.style.fontSize / medianFontSize;
1482
1730
  let level = 0;
1483
- if (ratio >= _chunkFWAXCTSXcjs.HEADING_RATIO_H1) level = 1;
1484
- else if (ratio >= _chunkFWAXCTSXcjs.HEADING_RATIO_H2) level = 2;
1485
- else if (ratio >= _chunkFWAXCTSXcjs.HEADING_RATIO_H3) level = 3;
1731
+ if (ratio >= _chunk3WRJQQIOcjs.HEADING_RATIO_H1) level = 1;
1732
+ else if (ratio >= _chunk3WRJQQIOcjs.HEADING_RATIO_H2) level = 2;
1733
+ else if (ratio >= _chunk3WRJQQIOcjs.HEADING_RATIO_H3) level = 3;
1486
1734
  if (level > 0) {
1487
1735
  block.type = "heading";
1488
1736
  block.level = level;
@@ -1555,79 +1803,218 @@ function detectMarkerHeadings(blocks) {
1555
1803
  }
1556
1804
  }
1557
1805
  var MAX_XYCUT_DEPTH = 50;
1806
+ var XYCUT_MIN_GAP = 5;
1807
+ var CROSS_LAYOUT_BETA = 2;
1808
+ var CROSS_OVERLAP_RATIO = 0.1;
1809
+ var CROSS_MIN_OVERLAPS = 2;
1810
+ var CROSS_MAX_MASK_RATIO = 0.2;
1811
+ var NARROW_ELEMENT_WIDTH_RATIO = 0.1;
1558
1812
  function xyCutOrder(items, gapThreshold, depth = 0) {
1559
1813
  if (items.length === 0) return [];
1560
1814
  if (items.length <= 2 || depth >= MAX_XYCUT_DEPTH) return [items];
1561
- const region = computeRegion(items);
1562
- const ySplit = findYSplit(items, region, gapThreshold);
1563
- if (ySplit !== null) {
1564
- const upper = items.filter((i) => i.y > ySplit);
1565
- const lower = items.filter((i) => i.y <= ySplit);
1815
+ if (depth === 0 && items.length >= 3) {
1816
+ const cross = identifyCrossLayoutItems(items);
1817
+ if (cross.size > 0 && cross.size <= items.length * CROSS_MAX_MASK_RATIO) {
1818
+ const rest = items.filter((i) => !cross.has(i));
1819
+ if (rest.length > 0) {
1820
+ const groups = xyCutOrder(rest, gapThreshold, 1);
1821
+ return mergeCrossLayoutGroups(groups, [...cross]);
1822
+ }
1823
+ }
1824
+ }
1825
+ const minGap = Math.max(XYCUT_MIN_GAP, gapThreshold);
1826
+ const hCut = findHorizontalCut(items);
1827
+ const vCut = findVerticalCutWithOutlierFilter(items, minGap);
1828
+ const hValid = hCut.gap >= minGap;
1829
+ const vValid = vCut.gap >= minGap;
1830
+ let useHorizontal;
1831
+ if (hValid && vValid) useHorizontal = vCut.gap <= hCut.gap * 1.5;
1832
+ else if (hValid) useHorizontal = true;
1833
+ else if (vValid) useHorizontal = false;
1834
+ else return [items];
1835
+ if (useHorizontal) {
1836
+ const upper = items.filter((i) => i.y > hCut.position);
1837
+ const lower = items.filter((i) => i.y <= hCut.position);
1566
1838
  if (upper.length > 0 && lower.length > 0 && upper.length < items.length) {
1567
1839
  return [...xyCutOrder(upper, gapThreshold, depth + 1), ...xyCutOrder(lower, gapThreshold, depth + 1)];
1568
1840
  }
1569
- }
1570
- const xSplit = findXSplit(items, region, gapThreshold);
1571
- if (xSplit !== null) {
1572
- const left = items.filter((i) => i.x + i.w / 2 < xSplit);
1573
- const right = items.filter((i) => i.x + i.w / 2 >= xSplit);
1841
+ } else {
1842
+ const left = items.filter((i) => i.x + i.w / 2 < vCut.position);
1843
+ const right = items.filter((i) => i.x + i.w / 2 >= vCut.position);
1574
1844
  if (left.length > 0 && right.length > 0 && left.length < items.length) {
1575
1845
  return [...xyCutOrder(left, gapThreshold, depth + 1), ...xyCutOrder(right, gapThreshold, depth + 1)];
1576
1846
  }
1577
1847
  }
1578
1848
  return [items];
1579
1849
  }
1580
- function computeRegion(items) {
1581
- let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
1850
+ function identifyCrossLayoutItems(items) {
1851
+ const cross = /* @__PURE__ */ new Set();
1852
+ if (items.length < 3) return cross;
1853
+ let maxWidth = 0;
1582
1854
  for (const i of items) {
1583
- if (i.x < minX) minX = i.x;
1584
- if (i.y < minY) minY = i.y;
1585
- if (i.x + i.w > maxX) maxX = i.x + i.w;
1586
- if (i.y + i.h > maxY) maxY = i.y + i.h;
1855
+ if (i.w > maxWidth) maxWidth = i.w;
1587
1856
  }
1588
- return { items, minX, minY, maxX, maxY };
1857
+ const threshold = CROSS_LAYOUT_BETA * maxWidth;
1858
+ for (const item of items) {
1859
+ if (item.w < threshold) continue;
1860
+ let overlaps = 0;
1861
+ for (const other of items) {
1862
+ if (other === item) continue;
1863
+ const left = Math.max(item.x, other.x);
1864
+ const right = Math.min(item.x + item.w, other.x + other.w);
1865
+ const overlapW = right - left;
1866
+ if (overlapW <= 0) continue;
1867
+ const smaller = Math.min(item.w, other.w);
1868
+ if (smaller > 0 && overlapW / smaller >= CROSS_OVERLAP_RATIO) {
1869
+ overlaps++;
1870
+ if (overlaps >= CROSS_MIN_OVERLAPS) break;
1871
+ }
1872
+ }
1873
+ if (overlaps >= CROSS_MIN_OVERLAPS) cross.add(item);
1874
+ }
1875
+ return cross;
1589
1876
  }
1590
- function findYSplit(items, _region, gapThreshold) {
1877
+ function mergeCrossLayoutGroups(groups, cross) {
1878
+ if (cross.length === 0) return groups;
1879
+ const sortedCross = [...cross].sort((a, b) => b.y + b.h - (a.y + a.h) || a.x - b.x);
1880
+ const groupTop = (g2) => {
1881
+ let top = -Infinity;
1882
+ for (const i of g2) {
1883
+ const t = i.y + i.h;
1884
+ if (t > top) top = t;
1885
+ }
1886
+ return top;
1887
+ };
1888
+ const result = [];
1889
+ let gi = 0, ci = 0;
1890
+ while (gi < groups.length || ci < sortedCross.length) {
1891
+ if (ci >= sortedCross.length) {
1892
+ result.push(groups[gi++]);
1893
+ continue;
1894
+ }
1895
+ if (gi >= groups.length) {
1896
+ result.push([sortedCross[ci++]]);
1897
+ continue;
1898
+ }
1899
+ const crossTop = sortedCross[ci].y + sortedCross[ci].h;
1900
+ if (crossTop >= groupTop(groups[gi])) result.push([sortedCross[ci++]]);
1901
+ else result.push(groups[gi++]);
1902
+ }
1903
+ return result;
1904
+ }
1905
+ function findHorizontalCut(items) {
1906
+ if (items.length < 2) return { position: 0, gap: 0 };
1591
1907
  const sorted = [...items].sort((a, b) => b.y - a.y);
1592
- let bestGap = gapThreshold;
1593
- let bestSplit = null;
1908
+ let largestGap = 0;
1909
+ let position = 0;
1594
1910
  for (let i = 1; i < sorted.length; i++) {
1595
1911
  const prevBottom = sorted[i - 1].y - sorted[i - 1].h;
1596
1912
  const currTop = sorted[i].y;
1597
1913
  const gap = prevBottom - currTop;
1598
- if (gap > bestGap) {
1599
- bestGap = gap;
1600
- bestSplit = (prevBottom + currTop) / 2;
1914
+ if (gap > largestGap) {
1915
+ largestGap = gap;
1916
+ position = (prevBottom + currTop) / 2;
1601
1917
  }
1602
1918
  }
1603
- return bestSplit;
1919
+ return { position, gap: largestGap };
1604
1920
  }
1605
- function findXSplit(items, _region, gapThreshold) {
1606
- const sorted = [...items].sort((a, b) => a.x - b.x);
1607
- let bestGap = gapThreshold;
1608
- let bestSplit = null;
1609
- for (let i = 1; i < sorted.length; i++) {
1610
- const prevRight = sorted[i - 1].x + sorted[i - 1].w;
1611
- const currLeft = sorted[i].x;
1612
- const gap = currLeft - prevRight;
1613
- if (gap > bestGap) {
1614
- bestGap = gap;
1615
- bestSplit = (prevRight + currLeft) / 2;
1921
+ function findVerticalCutWithOutlierFilter(items, minGap) {
1922
+ const edgeCut = findVerticalCut(items);
1923
+ if (edgeCut.gap >= minGap) return edgeCut;
1924
+ if (items.length >= 3) {
1925
+ let minX = Infinity, maxX = -Infinity;
1926
+ for (const i of items) {
1927
+ if (i.x < minX) minX = i.x;
1928
+ const r = i.x + i.w;
1929
+ if (r > maxX) maxX = r;
1930
+ }
1931
+ const narrowThreshold = (maxX - minX) * NARROW_ELEMENT_WIDTH_RATIO;
1932
+ const filtered = items.filter((i) => i.w >= narrowThreshold);
1933
+ if (filtered.length >= 2 && filtered.length < items.length && filtered.length >= items.length * 0.7) {
1934
+ const filteredCut = findVerticalCut(filtered);
1935
+ if (filteredCut.gap > edgeCut.gap && filteredCut.gap >= minGap) {
1936
+ return filteredCut;
1937
+ }
1616
1938
  }
1617
1939
  }
1618
- return bestSplit;
1940
+ return edgeCut;
1941
+ }
1942
+ function findVerticalCut(items) {
1943
+ if (items.length < 2) return { position: 0, gap: 0 };
1944
+ const sorted = [...items].sort((a, b) => a.x - b.x || a.x + a.w - (b.x + b.w));
1945
+ let largestGap = 0;
1946
+ let position = 0;
1947
+ let prevRight = null;
1948
+ for (const it of sorted) {
1949
+ const left = it.x;
1950
+ const right = it.x + it.w;
1951
+ if (prevRight !== null && left > prevRight) {
1952
+ const gap = left - prevRight;
1953
+ if (gap > largestGap) {
1954
+ largestGap = gap;
1955
+ position = (prevRight + left) / 2;
1956
+ }
1957
+ }
1958
+ prevRight = prevRight === null ? right : Math.max(prevRight, right);
1959
+ }
1960
+ return { position, gap: largestGap };
1619
1961
  }
1620
1962
  function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeight) {
1621
1963
  if (items.length === 0) return [];
1622
1964
  let { horizontals, verticals } = extractLines(opList.fnArray, opList.argsArray);
1623
1965
  ({ horizontals, verticals } = filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight));
1624
1966
  ({ horizontals, verticals } = preprocessLines(horizontals, verticals));
1967
+ markStrikethroughItems(items, horizontals);
1968
+ wrapStrikethroughRuns(items);
1625
1969
  const grids = buildTableGrids(horizontals, verticals);
1626
1970
  if (grids.length > 0) {
1627
1971
  return extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals);
1628
1972
  }
1629
1973
  return extractPageBlocksFallback(items, pageNum);
1630
1974
  }
1975
+ var STRIKE_MAX_THICKNESS = 2;
1976
+ var STRIKE_MAX_THICKNESS_RATIO = 0.25;
1977
+ var STRIKE_CENTER_TOLERANCE = 0.25;
1978
+ var STRIKE_MIN_OVERLAP_RATIO = 0.8;
1979
+ var STRIKE_MAX_LINE_TO_TEXT_RATIO = 1.5;
1980
+ function markStrikethroughItems(items, horizontals) {
1981
+ if (items.length === 0 || horizontals.length === 0) return;
1982
+ for (const line of horizontals) {
1983
+ if (line.lineWidth > STRIKE_MAX_THICKNESS) continue;
1984
+ const matches = [];
1985
+ for (const item of items) {
1986
+ const h = item.h > 0 ? item.h : item.fontSize;
1987
+ if (h <= 0 || item.w <= 0) continue;
1988
+ if (line.lineWidth > h * STRIKE_MAX_THICKNESS_RATIO) continue;
1989
+ const centerY = item.y + h * 0.4;
1990
+ if (Math.abs(line.y1 - centerY) > h * STRIKE_CENTER_TOLERANCE) continue;
1991
+ const overlap = Math.min(line.x2, item.x + item.w) - Math.max(line.x1, item.x);
1992
+ if (overlap / item.w < STRIKE_MIN_OVERLAP_RATIO) continue;
1993
+ matches.push(item);
1994
+ }
1995
+ if (matches.length === 0) continue;
1996
+ let totalW = 0;
1997
+ for (const m of matches) totalW += m.w;
1998
+ if (totalW <= 0 || (line.x2 - line.x1) / totalW > STRIKE_MAX_LINE_TO_TEXT_RATIO) continue;
1999
+ for (const m of matches) m.strike = true;
2000
+ }
2001
+ }
2002
+ function wrapStrikethroughRuns(items) {
2003
+ const struck = items.filter((i) => i.strike);
2004
+ if (struck.length === 0) return;
2005
+ const lines = /* @__PURE__ */ new Map();
2006
+ for (const item of struck) {
2007
+ const key = Math.round(item.y / 3);
2008
+ const arr = lines.get(key) || [];
2009
+ arr.push(item);
2010
+ lines.set(key, arr);
2011
+ }
2012
+ for (const arr of lines.values()) {
2013
+ arr.sort((a, b) => a.x - b.x);
2014
+ arr[0].text = "~~" + arr[0].text;
2015
+ arr[arr.length - 1].text = arr[arr.length - 1].text + "~~";
2016
+ }
2017
+ }
1631
2018
  function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
1632
2019
  const blocks = [];
1633
2020
  const usedItems = /* @__PURE__ */ new Set();
@@ -1657,7 +2044,8 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
1657
2044
  w: i.w,
1658
2045
  h: i.h,
1659
2046
  fontSize: i.fontSize,
1660
- fontName: i.fontName
2047
+ fontName: i.fontName,
2048
+ hasSpaceBefore: i.hasSpaceBefore
1661
2049
  }));
1662
2050
  const cellTextMap = mapTextToCells(textItems, cells);
1663
2051
  const numRows = grid.rowYs.length - 1;
@@ -1677,13 +2065,29 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
1677
2065
  rowSpan: cell.rowSpan
1678
2066
  };
1679
2067
  }
2068
+ let finalGrid = irGrid;
2069
+ let finalRows = numRows;
2070
+ if (numRows <= 2 && numCols >= 3) {
2071
+ const rebuilt = normalizeUndersegmentedTable(irGrid, grid.colXs, textItems);
2072
+ if (rebuilt) {
2073
+ finalGrid = rebuilt.map((row) => row.map((rawText) => {
2074
+ const cleaned = rawText.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
2075
+ return {
2076
+ text: cleaned.split("\n").map((line) => collapseEvenSpacing(line)).join("\n"),
2077
+ colSpan: 1,
2078
+ rowSpan: 1
2079
+ };
2080
+ }));
2081
+ finalRows = finalGrid.length;
2082
+ }
2083
+ }
1680
2084
  const irTable = {
1681
- rows: numRows,
2085
+ rows: finalRows,
1682
2086
  cols: numCols,
1683
- cells: irGrid,
1684
- hasHeader: numRows > 1
2087
+ cells: finalGrid,
2088
+ hasHeader: finalRows > 1
1685
2089
  };
1686
- const hasContent = irGrid.some((row) => row.some((cell) => cell.text.trim() !== ""));
2090
+ const hasContent = finalGrid.some((row) => row.some((cell) => cell.text.trim() !== ""));
1687
2091
  if (!hasContent) continue;
1688
2092
  const tableBbox = {
1689
2093
  page: pageNum,
@@ -1712,7 +2116,8 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
1712
2116
  w: i.w,
1713
2117
  h: i.h,
1714
2118
  fontSize: i.fontSize,
1715
- fontName: i.fontName
2119
+ fontName: i.fontName,
2120
+ hasSpaceBefore: i.hasSpaceBefore
1716
2121
  }));
1717
2122
  const clusterResults = detectClusterTables(clusterItems, pageNum);
1718
2123
  if (clusterResults.length > 0) {
@@ -1730,7 +2135,7 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
1730
2135
  }
1731
2136
  if (remaining.length > 0) {
1732
2137
  const allY = remaining.map((i) => i.y);
1733
- const pageH = _chunkFWAXCTSXcjs.safeMax.call(void 0, allY) - _chunkFWAXCTSXcjs.safeMin.call(void 0, allY);
2138
+ const pageH = _chunk3WRJQQIOcjs.safeMax.call(void 0, allY) - _chunk3WRJQQIOcjs.safeMin.call(void 0, allY);
1734
2139
  const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
1735
2140
  const textBlocks = [];
1736
2141
  for (const group of groups) {
@@ -1750,6 +2155,46 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
1750
2155
  }
1751
2156
  return mergeAdjacentTableBlocks(blocks);
1752
2157
  }
2158
// Maximum left/right edge drift between two tables, as a fraction of the wider table's width.
var NEIGHBOR_TABLE_EPSILON = 0.2;

/**
 * Joins a table block with the table block that directly follows it when the
 * second one sits on the next page, has the same column count and roughly the
 * same left and right edges. `blocks` is edited in place.
 *
 * When the follower has more than one row and its first row repeats the first
 * row of the earlier table (per `rowTextsEqual`), that repeated row is dropped
 * before the rows are appended. The merged block keeps every property of the
 * earlier block except `table`.
 *
 * @param {Array<object>} blocks
 * @returns {void}
 */
function mergeCrossPageTables(blocks) {
  const continuesOnNextPage = (upper, lower) => {
    if (upper.type !== "table" || lower.type !== "table") return false;
    if (!upper.table || !lower.table) return false;
    if (!upper.pageNumber || !lower.pageNumber) return false;
    if (lower.pageNumber !== upper.pageNumber + 1) return false;
    if (upper.table.cols !== lower.table.cols) return false;
    if (!upper.bbox || !lower.bbox) return false;
    const tolerance = Math.max(upper.bbox.width, lower.bbox.width, 1) * NEIGHBOR_TABLE_EPSILON;
    const leftShift = Math.abs(upper.bbox.x - lower.bbox.x);
    const rightShift = Math.abs(upper.bbox.x + upper.bbox.width - (lower.bbox.x + lower.bbox.width));
    return !(leftShift > tolerance || rightShift > tolerance);
  };

  // Walk backwards so removing the follower never disturbs indices still to visit.
  for (let at = blocks.length - 2; at >= 0; at -= 1) {
    const upper = blocks[at];
    const lower = blocks[at + 1];
    if (!continuesOnNextPage(upper, lower)) continue;

    let extraRows = lower.table.cells;
    const repeatsHeader =
      extraRows.length > 1 &&
      upper.table.cells.length > 0 &&
      rowTextsEqual(upper.table.cells[0], extraRows[0]);
    if (repeatsHeader) {
      extraRows = extraRows.slice(1);
    }

    if (extraRows.length !== 0) {
      blocks[at] = {
        ...upper,
        table: {
          rows: upper.table.rows + extraRows.length,
          cols: upper.table.cols,
          cells: [...upper.table.cells, ...extraRows],
          hasHeader: upper.table.hasHeader,
          caption: upper.table.caption
        }
      };
    }
    // The follower is removed whether or not it contributed rows.
    blocks.splice(at + 1, 1);
  }
}
2190
/**
 * Tells whether two table rows carry the same text cell by cell, ignoring all
 * whitespace, and contain at least one non-blank cell.
 *
 * @param {Array<{text:string}>} a
 * @param {Array<{text:string}>} b
 * @returns {boolean} false when lengths differ, any cell differs, or row `a` is entirely blank
 */
function rowTextsEqual(a, b) {
  if (a.length !== b.length) return false;
  const squash = (value) => value.replace(/\s+/g, "");
  for (const [col, cell] of a.entries()) {
    if (squash(cell.text) !== squash(b[col].text)) return false;
  }
  // Two all-blank rows are not considered a match.
  return a.some((cell) => cell.text.trim() !== "");
}
1753
2198
  function mergeAdjacentTableBlocks(blocks) {
1754
2199
  if (blocks.length <= 1) return blocks;
1755
2200
  const result = [blocks[0]];
@@ -1780,7 +2225,8 @@ function extractPageBlocksFallback(items, pageNum) {
1780
2225
  w: i.w,
1781
2226
  h: i.h,
1782
2227
  fontSize: i.fontSize,
1783
- fontName: i.fontName
2228
+ fontName: i.fontName,
2229
+ hasSpaceBefore: i.hasSpaceBefore
1784
2230
  }));
1785
2231
  const clusterResults = detectClusterTables(clusterItems, pageNum);
1786
2232
  if (clusterResults.length > 0) {
@@ -1796,7 +2242,7 @@ function extractPageBlocksFallback(items, pageNum) {
1796
2242
  }
1797
2243
  const remaining = items.filter((_, idx) => !usedIndices.has(idx));
1798
2244
  if (remaining.length > 0) {
1799
- const yLines = groupByY(remaining);
2245
+ const yLines = mergeSuperscriptLines(groupByY(remaining));
1800
2246
  for (const line of yLines) {
1801
2247
  const text = mergeLineSimple(line);
1802
2248
  if (!text.trim()) continue;
@@ -1810,7 +2256,7 @@ function extractPageBlocksFallback(items, pageNum) {
1810
2256
  return by - ay;
1811
2257
  });
1812
2258
  } else {
1813
- const allYLines = groupByY(items);
2259
+ const allYLines = mergeSuperscriptLines(groupByY(items));
1814
2260
  const columns = detectColumns(allYLines);
1815
2261
  if (columns && columns.length >= 3) {
1816
2262
  const tableText = extractWithColumns(allYLines, columns);
@@ -1818,12 +2264,12 @@ function extractPageBlocksFallback(items, pageNum) {
1818
2264
  blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
1819
2265
  } else {
1820
2266
  const allY = items.map((i) => i.y);
1821
- const pageHeight = _chunkFWAXCTSXcjs.safeMax.call(void 0, allY) - _chunkFWAXCTSXcjs.safeMin.call(void 0, allY);
2267
+ const pageHeight = _chunk3WRJQQIOcjs.safeMax.call(void 0, allY) - _chunk3WRJQQIOcjs.safeMin.call(void 0, allY);
1822
2268
  const gapThreshold = Math.max(15, pageHeight * 0.03);
1823
2269
  const orderedGroups = xyCutOrder(items, gapThreshold);
1824
2270
  for (const group of orderedGroups) {
1825
2271
  if (group.length === 0) continue;
1826
- const yLines = groupByY(group);
2272
+ const yLines = mergeSuperscriptLines(groupByY(group));
1827
2273
  const groupColumns = detectColumns(yLines);
1828
2274
  if (groupColumns && groupColumns.length >= 3) {
1829
2275
  const tableText = extractWithColumns(yLines, groupColumns);
@@ -1915,16 +2361,16 @@ function normalizeItems(rawItems) {
1915
2361
  if (!isDup) deduped.push(sorted[i]);
1916
2362
  }
1917
2363
  if (spacePositions.length > 0) {
1918
- for (const item of deduped) {
1919
- for (const sp of spacePositions) {
1920
- if (Math.abs(sp.y - item.y) <= 3) {
1921
- const dist = item.x - sp.x;
1922
- if (dist >= 0 && dist <= 20) {
1923
- item.hasSpaceBefore = true;
1924
- break;
1925
- }
2364
+ for (const sp of spacePositions) {
2365
+ let nearest = null;
2366
+ for (const item of deduped) {
2367
+ if (Math.abs(sp.y - item.y) > 3) continue;
2368
+ const dist = item.x - sp.x;
2369
+ if (dist >= -1 && dist <= 20 && (!nearest || item.x < nearest.x)) {
2370
+ nearest = item;
1926
2371
  }
1927
2372
  }
2373
+ if (nearest) nearest.hasSpaceBefore = true;
1928
2374
  }
1929
2375
  }
1930
2376
  return deduped;
@@ -1958,6 +2404,35 @@ function groupByY(items) {
1958
2404
  if (curLine.length > 0) lines.push(curLine);
1959
2405
  return lines;
1960
2406
  }
2407
/**
 * Folds short, vertically overlapping fragment lines into their neighbouring
 * line.
 *
 * A line counts as a fragment of its neighbour when it has at most three
 * items, each with trimmed text of at most eight characters, its vertical
 * extent is no more than 80% of the neighbour's, and at least half of its own
 * extent overlaps the neighbour. When either the accumulated previous line or
 * the current line is such a fragment of the other, the two are concatenated
 * (previous items first).
 *
 * @param {Array<Array<{y:number,h:number,fontSize:number,text:string}>>} lines
 * @returns {Array<Array<object>>} the same array when it has 0 or 1 lines, otherwise a new array
 */
function mergeSuperscriptLines(lines) {
  if (lines.length <= 1) return lines;

  // Vertical extent covered by every item of a line.
  const extentOf = (line) => {
    let low = Infinity;
    let high = -Infinity;
    for (const item of line) {
      const size = item.h > 0 ? item.h : item.fontSize;
      low = item.y < low ? item.y : low;
      high = item.y + size > high ? item.y + size : high;
    }
    return { bottom: low, top: high, height: high - low };
  };

  const isShortFragment = (line) =>
    line.length <= 3 && line.every((item) => item.text.trim().length <= 8);

  const ridesOn = (line, own, other, shared) =>
    isShortFragment(line) && own.height <= other.height * 0.8 && shared >= own.height * 0.5;

  const merged = [lines[0]];
  for (const current of lines.slice(1)) {
    const lastIdx = merged.length - 1;
    const previous = merged[lastIdx];
    const prevExtent = extentOf(previous);
    const currExtent = extentOf(current);
    const shared =
      Math.min(prevExtent.top, currExtent.top) - Math.max(prevExtent.bottom, currExtent.bottom);
    const previousRides = ridesOn(previous, prevExtent, currExtent, shared);
    const currentRides = ridesOn(current, currExtent, prevExtent, shared);
    if (previousRides || currentRides) {
      merged[lastIdx] = [...previous, ...current];
    } else {
      merged.push(current);
    }
  }
  return merged;
}
1961
2436
  function isProseSpread(items) {
1962
2437
  if (items.length < 4) return false;
1963
2438
  const sorted = [...items].sort((a, b) => a.x - b.x);
@@ -1965,14 +2440,14 @@ function isProseSpread(items) {
1965
2440
  for (let i = 1; i < sorted.length; i++) {
1966
2441
  gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
1967
2442
  }
1968
- const maxGap = _chunkFWAXCTSXcjs.safeMax.call(void 0, gaps);
2443
+ const maxGap = _chunk3WRJQQIOcjs.safeMax.call(void 0, gaps);
1969
2444
  const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
1970
2445
  return maxGap < 40 && avgLen < 5;
1971
2446
  }
1972
2447
  function detectColumns(yLines) {
1973
2448
  const allItems = yLines.flat();
1974
2449
  if (allItems.length === 0) return null;
1975
- const pageWidth = _chunkFWAXCTSXcjs.safeMax.call(void 0, allItems.map((i) => i.x + i.w)) - _chunkFWAXCTSXcjs.safeMin.call(void 0, allItems.map((i) => i.x));
2450
+ const pageWidth = _chunk3WRJQQIOcjs.safeMax.call(void 0, allItems.map((i) => i.x + i.w)) - _chunk3WRJQQIOcjs.safeMin.call(void 0, allItems.map((i) => i.x));
1976
2451
  if (pageWidth < 100) return null;
1977
2452
  let bigoLineIdx = -1;
1978
2453
  for (let i = 0; i < yLines.length; i++) {
@@ -2199,9 +2674,7 @@ function mergeLineSimple(items) {
2199
2674
  result += sorted[i].text;
2200
2675
  continue;
2201
2676
  }
2202
- if (gap < avgFs * 0.15) {
2203
- } else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
2204
- } else if (gap > 3) result += " ";
2677
+ if (gap > spaceGapThreshold(avgFs)) result += " ";
2205
2678
  result += sorted[i].text;
2206
2679
  }
2207
2680
  return result;
@@ -2225,7 +2698,7 @@ function cleanPdfText(text) {
2225
2698
  ).replace(/^(?!\| ---).*$/gm, (line) => {
2226
2699
  if (/^\s*\${1,2}.+\${1,2}\s*$/.test(line)) return line;
2227
2700
  return collapseEvenSpacing(line);
2228
- }).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
2701
+ }).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\\~\\~/g, "~~").replace(/~~~~/g, "").replace(/\n{3,}/g, "\n\n").trim();
2229
2702
  }
2230
2703
  function startsWithMarker(line) {
2231
2704
  const t = line.trimStart();
@@ -2234,6 +2707,134 @@ function startsWithMarker(line) {
2234
2707
  function isStandaloneHeader(line) {
2235
2708
  return /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,7}$/.test(line.trim());
2236
2709
  }
2710
// Leading caption label such as "<표 1>", "Table 2." or "Fig. 3:" (case-insensitive).
var TABLE_CAPTION_RE = /^[<\[(【〈]?\s*(표|그림|도표|Table|Figure|Fig\.?)\s*[\d①-⑮][\d.\-]*\s*[\])】〉>]?[.:]?\s*/i;
// Captions longer than this many characters are rejected.
var CAPTION_MAX_LENGTH = 100;
// Largest vertical gap allowed between a caption box and its table box.
var CAPTION_MAX_GAP = 30;

/**
 * Attaches caption paragraphs to adjacent tables. `blocks` is edited in place:
 * the caption text is stored on `block.table.caption` and the paragraph block
 * is removed.
 *
 * For every table without a caption, the block directly before it is tried
 * first, then the block directly after it. A paragraph qualifies when it is on
 * the same page, is a single line of at most CAPTION_MAX_LENGTH characters
 * starting with a caption label, and — when both blocks carry a bbox — lies
 * within CAPTION_MAX_GAP vertically and overlaps the table horizontally by at
 * least 30% of the narrower of the two boxes.
 *
 * @param {Array<object>} blocks
 * @returns {void}
 */
function detectTableCaptions(blocks) {
  const looksLikeCaptionText = (raw) => {
    const text = raw.trim();
    if (!text || text.length > CAPTION_MAX_LENGTH || text.includes("\n")) return false;
    return TABLE_CAPTION_RE.test(text);
  };

  const sitsNextTo = (capBox, tblBox) => {
    const capTop = capBox.y + capBox.height;
    const capBottom = capBox.y;
    const tblTop = tblBox.y + tblBox.height;
    const tblBottom = tblBox.y;
    // Distance from the caption to whichever table edge it faces.
    const gap = capBottom >= tblTop ? capBottom - tblTop : tblBottom - capTop;
    if (gap > CAPTION_MAX_GAP) return false;
    const sharedWidth =
      Math.min(capBox.x + capBox.width, tblBox.x + tblBox.width) - Math.max(capBox.x, tblBox.x);
    if (sharedWidth < Math.min(capBox.width, tblBox.width) * 0.3) return false;
    return true;
  };

  const qualifies = (candidate, table) => {
    if (!candidate || candidate.type !== "paragraph" || !candidate.text) return false;
    if (candidate.pageNumber !== table.pageNumber) return false;
    if (!looksLikeCaptionText(candidate.text)) return false;
    if (candidate.bbox && table.bbox) return sitsNextTo(candidate.bbox, table.bbox);
    return true;
  };

  let at = 0;
  while (at < blocks.length) {
    const block = blocks[at];
    if (block.type === "table" && block.table && !block.table.caption) {
      const before = blocks[at - 1];
      if (qualifies(before, block)) {
        block.table.caption = before.text.trim();
        blocks.splice(at - 1, 1);
        // The table shifted one slot to the left.
        at -= 1;
      } else {
        const after = blocks[at + 1];
        if (qualifies(after, block)) {
          block.table.caption = after.text.trim();
          blocks.splice(at + 1, 1);
        }
      }
    }
    at += 1;
  }
}
2745
// Syllables used as ordered Korean list labels; position in the string + 1 is the ordinal.
var KOREAN_LIST_SEQ = "\uAC00\uB098\uB2E4\uB77C\uB9C8\uBC14\uC0AC\uC544\uC790\uCC28\uCE74\uD0C0\uD30C\uD558";

/**
 * Recognises a list label at the start of `text` and reports its family and
 * ordinal.
 *
 * Families are tried in this order: "arabicDot" (`1. `), "korDot" (`가. `),
 * "arabicParen" (`1)`), "korParen" (`가)`), "circled" (`①`). The dotted forms
 * require whitespace after the dot, and "arabicDot" rejects a digit right
 * after the dot. Korean labels only count when the syllable is in
 * KOREAN_LIST_SEQ.
 *
 * @param {string} text
 * @returns {{family: string, ord: number} | null} null when no label is recognised
 */
function parseListLabel(text) {
  const decimal = (digits) => parseInt(digits, 10);
  const koreanOrdinal = (syllable) => {
    const pos = KOREAN_LIST_SEQ.indexOf(syllable);
    return pos >= 0 ? pos + 1 : null;
  };
  const rules = [
    { family: "arabicDot", pattern: /^(\d{1,2})\.(?!\d)\s+/, toOrd: decimal },
    { family: "korDot", pattern: /^([가-하])\.\s+/, toOrd: koreanOrdinal },
    { family: "arabicParen", pattern: /^(\d{1,2})\)\s*/, toOrd: decimal },
    { family: "korParen", pattern: /^([가-하])\)\s*/, toOrd: koreanOrdinal },
    // 9312 is the code unit of "①".
    { family: "circled", pattern: /^([①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮])\s*/, toOrd: (ch) => ch.charCodeAt(0) - 9312 + 1 }
  ];
  for (const { family, pattern, toOrd } of rules) {
    const hit = text.match(pattern);
    if (!hit) continue;
    const ord = toOrd(hit[1]);
    // A Korean syllable outside the sequence falls through to the later rules.
    if (ord !== null) return { family, ord };
  }
  return null;
}
2765
// Matches an attachment line: "붙임" (optionally spaced), an optional number, then whitespace.
var ATTACHMENT_RE = /^붙\s*임\s*(\d+[.:]?)?\s/;

/**
 * Converts labelled paragraphs into list blocks. `blocks` is edited in place.
 *
 * Step 1 parses a label (via `parseListLabel`) from every paragraph/list block
 * that has text and groups the hits by label family. Step 2 keeps only labels
 * that belong to a run of at least two consecutive ordinals in the same family
 * whose block indices are at most 20 apart. Step 3 walks the blocks again:
 * attachment paragraphs become unordered lists; confirmed labels become list
 * blocks ("ordered" for the "arabicDot" family, otherwise "unordered"). The
 * first family seen since the last table/heading/separator is depth 0; a block
 * at any deeper level is pushed into `children` of the most recent depth-0
 * list block and removed from `blocks`.
 *
 * @param {Array<object>} blocks
 * @returns {void}
 */
function detectKoreanListBlocks(blocks) {
  const hasListableText = (blk) =>
    (blk.type === "paragraph" || blk.type === "list") && Boolean(blk.text);

  // Step 1: collect label hits per family, in document order.
  const hitsByFamily = /* @__PURE__ */ new Map();
  for (let pos = 0; pos < blocks.length; pos += 1) {
    const blk = blocks[pos];
    if (!hasListableText(blk)) continue;
    const parsed = parseListLabel(blk.text.trim());
    if (!parsed) continue;
    if (!hitsByFamily.has(parsed.family)) hitsByFamily.set(parsed.family, []);
    hitsByFamily.get(parsed.family).push({ pos, ord: parsed.ord });
  }

  // Step 2: confirm hits that form a consecutive run of two or more.
  const confirmed = /* @__PURE__ */ new Set();
  const commitRun = (run) => {
    if (run.length < 2) return;
    for (const hit of run) confirmed.add(hit.pos);
  };
  for (const hits of hitsByFamily.values()) {
    let run = [];
    for (const hit of hits) {
      const tail = run[run.length - 1];
      const extendsRun =
        tail !== undefined && hit.ord === tail.ord + 1 && hit.pos - tail.pos <= 20;
      if (extendsRun) {
        run.push(hit);
      } else {
        commitRun(run);
        run = [hit];
      }
    }
    commitRun(run);
  }

  // Step 3: rewrite blocks and nest deeper levels under the current top-level list.
  let openFamilies = [];
  let topLevel = null;
  const absorbed = [];
  for (let pos = 0; pos < blocks.length; pos += 1) {
    const blk = blocks[pos];
    if (blk.type === "table" || blk.type === "heading" || blk.type === "separator") {
      // Structural blocks end any list context.
      openFamilies = [];
      topLevel = null;
      continue;
    }
    if (!hasListableText(blk)) continue;
    const trimmed = blk.text.trim();
    if (blk.type === "paragraph" && ATTACHMENT_RE.test(trimmed)) {
      blocks[pos] = { ...blk, type: "list", listType: "unordered" };
      continue;
    }
    if (!confirmed.has(pos)) continue;

    const { family } = parseListLabel(trimmed);
    const known = openFamilies.indexOf(family);
    let depth;
    if (known < 0) {
      openFamilies.push(family);
      depth = openFamilies.length - 1;
    } else {
      // Returning to an outer family closes everything nested below it.
      openFamilies = openFamilies.slice(0, known + 1);
      depth = known;
    }

    const entry = { ...blk, type: "list", listType: family === "arabicDot" ? "ordered" : "unordered" };
    if (depth !== 0 && topLevel) {
      if (!topLevel.children) topLevel.children = [];
      topLevel.children.push(entry);
      absorbed.push(pos);
    } else {
      blocks[pos] = entry;
      topLevel = entry;
    }
  }

  // Positions were recorded in ascending order; delete from the back so they stay valid.
  for (let k = absorbed.length - 1; k >= 0; k -= 1) {
    blocks.splice(absorbed[k], 1);
  }
}
2237
2838
  function detectListBlocks(blocks) {
2238
2839
  const result = [];
2239
2840
  for (let i = 0; i < blocks.length; i++) {
@@ -2343,7 +2944,6 @@ function detectSpecialKoreanTables(blocks) {
2343
2944
  function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
2344
2945
  const ZONE_RATIO = 0.12;
2345
2946
  const MIN_REPEAT = 3;
2346
- const Y_BUCKET = 5;
2347
2947
  const topEntries = [];
2348
2948
  const bottomEntries = [];
2349
2949
  for (let bi = 0; bi < blocks.length; bi++) {
@@ -2353,7 +2953,7 @@ function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
2353
2953
  if (!ph) continue;
2354
2954
  const blockTop = ph - (b.bbox.y + b.bbox.height);
2355
2955
  const blockBottom = ph - b.bbox.y;
2356
- const entry = { blockIdx: bi, page: b.pageNumber, y: b.bbox.y, text: b.text.trim() };
2956
+ const entry = { blockIdx: bi, page: b.pageNumber, text: b.text.trim() };
2357
2957
  if (blockBottom <= ph * ZONE_RATIO) bottomEntries.push(entry);
2358
2958
  else if (blockTop >= ph * (1 - ZONE_RATIO)) topEntries.push(entry);
2359
2959
  }
@@ -2375,21 +2975,9 @@ function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
2375
2975
  repeatedPatterns.add(p);
2376
2976
  }
2377
2977
  }
2378
- const bucketPages = /* @__PURE__ */ new Map();
2379
- for (const e of entries) {
2380
- const bucket = Math.round(e.y / Y_BUCKET);
2381
- const pages = bucketPages.get(bucket) || /* @__PURE__ */ new Set();
2382
- pages.add(e.page);
2383
- bucketPages.set(bucket, pages);
2384
- }
2385
- const repeatedBuckets = /* @__PURE__ */ new Set();
2386
- for (const [b, pages] of bucketPages) {
2387
- if (pages.size >= MIN_REPEAT) repeatedBuckets.add(b);
2388
- }
2389
2978
  for (const e of entries) {
2390
2979
  const norm = e.text.replace(/\d+/g, "#");
2391
- const bucket = Math.round(e.y / Y_BUCKET);
2392
- if (repeatedPatterns.has(norm) || repeatedBuckets.has(bucket)) {
2980
+ if (repeatedPatterns.has(norm)) {
2393
2981
  removeSet.add(e.blockIdx);
2394
2982
  }
2395
2983
  }
@@ -2550,5 +3138,9 @@ function formatMb(bytes) {
2550
3138
 
2551
3139
 
2552
3140
 
2553
- exports.cleanPdfText = cleanPdfText; exports.extractPdfMetadataOnly = extractPdfMetadataOnly; exports.parsePdfDocument = parsePdfDocument;
2554
- //# sourceMappingURL=parser-BTIPAEDZ.cjs.map
3141
+
3142
+
3143
+
3144
+
3145
+ exports.cleanPdfText = cleanPdfText; exports.detectKoreanListBlocks = detectKoreanListBlocks; exports.detectTableCaptions = detectTableCaptions; exports.extractPdfMetadataOnly = extractPdfMetadataOnly; exports.mergeCrossPageTables = mergeCrossPageTables; exports.parsePdfDocument = parsePdfDocument; exports.removeHeaderFooterBlocks = removeHeaderFooterBlocks;
3146
+ //# sourceMappingURL=parser-5KHU732L.cjs.map