kordoc 2.9.1 → 3.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (40) hide show
  1. package/README.md +28 -0
  2. package/dist/-K5SLEFZD.js +71 -0
  3. package/dist/-K5SLEFZD.js.map +1 -0
  4. package/dist/{chunk-GQQNAYZA.js → chunk-326STEDU.js} +6684 -4061
  5. package/dist/chunk-326STEDU.js.map +1 -0
  6. package/dist/{chunk-FWAXCTSX.cjs → chunk-3WRJQQIO.cjs} +185 -16
  7. package/dist/chunk-3WRJQQIO.cjs.map +1 -0
  8. package/dist/chunk-MUOQXDZ4.cjs.map +1 -1
  9. package/dist/{chunk-Z6TLTWYK.js → chunk-NHXKJWR7.js} +182 -13
  10. package/dist/chunk-NHXKJWR7.js.map +1 -0
  11. package/dist/{chunk-ODF24QXC.js → chunk-SA2PERJ5.js} +182 -13
  12. package/dist/chunk-SA2PERJ5.js.map +1 -0
  13. package/dist/cli.js +42 -3
  14. package/dist/cli.js.map +1 -1
  15. package/dist/formula-XGG6ZP42.cjs.map +1 -1
  16. package/dist/index.cjs +3247 -822
  17. package/dist/index.cjs.map +1 -1
  18. package/dist/index.d.cts +61 -2
  19. package/dist/index.d.ts +61 -2
  20. package/dist/index.js +3025 -600
  21. package/dist/index.js.map +1 -1
  22. package/dist/mcp.js +3 -3
  23. package/dist/page-range-3C7UGGEK.cjs.map +1 -1
  24. package/dist/{parser-BKYM3LKN.js → parser-4IVYHKSL.js} +677 -85
  25. package/dist/parser-4IVYHKSL.js.map +1 -0
  26. package/dist/{parser-BTIPAEDZ.cjs → parser-5KHU732L.cjs} +689 -97
  27. package/dist/parser-5KHU732L.cjs.map +1 -0
  28. package/dist/{parser-FJNQEW7K.js → parser-AU2NLC44.js} +677 -85
  29. package/dist/parser-AU2NLC44.js.map +1 -0
  30. package/dist/provider-SNONEZNW.cjs.map +1 -1
  31. package/dist/{watch-SBLSWHL7.js → watch-5DDN4BUI.js} +3 -3
  32. package/package.json +1 -1
  33. package/dist/chunk-FWAXCTSX.cjs.map +0 -1
  34. package/dist/chunk-GQQNAYZA.js.map +0 -1
  35. package/dist/chunk-ODF24QXC.js.map +0 -1
  36. package/dist/chunk-Z6TLTWYK.js.map +0 -1
  37. package/dist/parser-BKYM3LKN.js.map +0 -1
  38. package/dist/parser-BTIPAEDZ.cjs.map +0 -1
  39. package/dist/parser-FJNQEW7K.js.map +0 -1
  40. /package/dist/{watch-SBLSWHL7.js.map → watch-5DDN4BUI.js.map} +0 -0
@@ -7,7 +7,7 @@ import {
7
7
  blocksToMarkdown,
8
8
  safeMax,
9
9
  safeMin
10
- } from "./chunk-ODF24QXC.js";
10
+ } from "./chunk-SA2PERJ5.js";
11
11
  import {
12
12
  parsePageRange
13
13
  } from "./chunk-MOL7MDBG.js";
@@ -157,6 +157,55 @@ function extractLines(fnArray, argsArray) {
157
157
  }
158
158
  return { horizontals, verticals };
159
159
  }
160
+ function multiplyTransform(m, t) {
161
+ return [
162
+ m[0] * t[0] + m[2] * t[1],
163
+ m[1] * t[0] + m[3] * t[1],
164
+ m[0] * t[2] + m[2] * t[3],
165
+ m[1] * t[2] + m[3] * t[3],
166
+ m[0] * t[4] + m[2] * t[5] + m[4],
167
+ m[1] * t[4] + m[3] * t[5] + m[5]
168
+ ];
169
+ }
170
+ function extractImageRegions(fnArray, argsArray) {
171
+ const regions = [];
172
+ let ctm = [1, 0, 0, 1, 0, 0];
173
+ const stack = [];
174
+ for (let i = 0; i < fnArray.length; i++) {
175
+ const op = fnArray[i];
176
+ switch (op) {
177
+ case OPS.save:
178
+ stack.push(ctm);
179
+ break;
180
+ case OPS.restore:
181
+ ctm = stack.pop() || [1, 0, 0, 1, 0, 0];
182
+ break;
183
+ case OPS.transform: {
184
+ const t = argsArray[i];
185
+ if (Array.isArray(t) && t.length >= 6) ctm = multiplyTransform(ctm, t);
186
+ break;
187
+ }
188
+ case OPS.paintImageXObject:
189
+ case OPS.paintInlineImageXObject:
190
+ case OPS.paintImageMaskXObject:
191
+ case OPS.paintImageXObjectRepeat: {
192
+ const corners = [[0, 0], [1, 0], [0, 1], [1, 1]];
193
+ let x1 = Infinity, y1 = Infinity, x2 = -Infinity, y2 = -Infinity;
194
+ for (const [u, v] of corners) {
195
+ const x = ctm[0] * u + ctm[2] * v + ctm[4];
196
+ const y = ctm[1] * u + ctm[3] * v + ctm[5];
197
+ if (x < x1) x1 = x;
198
+ if (x > x2) x2 = x;
199
+ if (y < y1) y1 = y;
200
+ if (y > y2) y2 = y;
201
+ }
202
+ if (x2 - x1 > 0 && y2 - y1 > 0) regions.push({ x1, y1, x2, y2 });
203
+ break;
204
+ }
205
+ }
206
+ }
207
+ return regions;
208
+ }
160
209
  function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
161
210
  const dx = Math.abs(seg.x2 - seg.x1);
162
211
  const dy = Math.abs(seg.y2 - seg.y1);
@@ -542,6 +591,10 @@ function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
542
591
  }
543
592
  return false;
544
593
  }
594
+ var SPACE_GAP_RATIO = 0.17;
595
+ function spaceGapThreshold(fontSize) {
596
+ return Math.max(fontSize * SPACE_GAP_RATIO, 1);
597
+ }
545
598
  function mapTextToCells(items, cells) {
546
599
  const result = /* @__PURE__ */ new Map();
547
600
  for (const cell of cells) {
@@ -601,14 +654,12 @@ function cellTextToString(items) {
601
654
  }
602
655
  const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
603
656
  const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
604
- const prevIsKorean = /[가-힣]$/.test(result);
605
- const currIsKorean = /^[가-힣]/.test(s[j].text);
606
- if (gap < avgFs * 0.15) {
607
- result += s[j].text;
608
- } else if (gap < avgFs * 0.35 && (prevIsKorean || currIsKorean)) {
609
- result += s[j].text;
610
- } else {
657
+ if (s[j].hasSpaceBefore && gap >= avgFs * 0.05) {
611
658
  result += " " + s[j].text;
659
+ } else if (gap > spaceGapThreshold(avgFs)) {
660
+ result += " " + s[j].text;
661
+ } else {
662
+ result += s[j].text;
612
663
  }
613
664
  }
614
665
  return result;
@@ -621,6 +672,11 @@ function detectEvenSpacedItems(items) {
621
672
  let runStart = -1;
622
673
  for (let i = 0; i < items.length; i++) {
623
674
  const isShortKorean = /^[가-힣]{1}$/.test(items[i].text) || /^[\d]{1}$/.test(items[i].text);
675
+ if (isShortKorean && runStart >= 0 && items[i].hasSpaceBefore) {
676
+ if (i - runStart >= 3) markEvenRun(items, result, runStart, i);
677
+ runStart = i;
678
+ continue;
679
+ }
624
680
  if (isShortKorean && runStart >= 0 && i > 0) {
625
681
  const gap = items[i].x - (items[i - 1].x + items[i - 1].w);
626
682
  const maxRunGap = Math.max(items[i].fontSize * 3, 30);
@@ -663,6 +719,119 @@ function markEvenRun(items, result, start, end) {
663
719
  }
664
720
  }
665
721
  }
722
+ var MAX_UNDERSEGMENTED_ROWS = 2;
723
+ var MIN_UNDERSEGMENTED_COLUMNS = 3;
724
+ var MIN_UNDERSEGMENTED_TEXT_LINES = 8;
725
+ var MIN_ROW_BAND_MISMATCH = 2;
726
+ var MIN_ROW_BAND_EPSILON = 3;
727
+ var ROW_BAND_EPSILON_RATIO = 0.6;
728
+ function itemCenterY(item) {
729
+ return item.y + (item.h > 0 ? item.h : item.fontSize) / 2;
730
+ }
731
+ function itemHeight(item) {
732
+ return item.h > 0 ? item.h : item.fontSize;
733
+ }
734
+ function findColumnIndex(item, colXs) {
735
+ const cx = item.x + item.w / 2;
736
+ for (let c = 0; c < colXs.length - 1; c++) {
737
+ if (cx >= colXs[c] && cx <= colXs[c + 1]) return c;
738
+ }
739
+ let best = 0;
740
+ let bestDist = Infinity;
741
+ for (let c = 0; c < colXs.length - 1; c++) {
742
+ const center = (colXs[c] + colXs[c + 1]) / 2;
743
+ const d = Math.abs(cx - center);
744
+ if (d < bestDist) {
745
+ bestDist = d;
746
+ best = c;
747
+ }
748
+ }
749
+ return best;
750
+ }
751
+ function groupItemsToVisualLines(items) {
752
+ if (items.length === 0) return [];
753
+ const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
754
+ const lines = [];
755
+ let cur = [sorted[0]];
756
+ let curY = sorted[0].y;
757
+ for (let i = 1; i < sorted.length; i++) {
758
+ const tol = Math.max(3, Math.min(sorted[i].fontSize, cur[0].fontSize) * 0.6);
759
+ if (Math.abs(sorted[i].y - curY) <= tol) {
760
+ cur.push(sorted[i]);
761
+ } else {
762
+ lines.push(cur);
763
+ cur = [sorted[i]];
764
+ curY = sorted[i].y;
765
+ }
766
+ }
767
+ lines.push(cur);
768
+ return lines;
769
+ }
770
+ function normalizeUndersegmentedTable(originalCells, colXs, items) {
771
+ const numRows = originalCells.length;
772
+ const numCols = colXs.length - 1;
773
+ if (numRows > MAX_UNDERSEGMENTED_ROWS || numCols < MIN_UNDERSEGMENTED_COLUMNS) return null;
774
+ if (items.length === 0) return null;
775
+ const itemsByCol = Array.from({ length: numCols }, () => []);
776
+ for (const item of items) {
777
+ if (!item.text.trim()) continue;
778
+ itemsByCol[findColumnIndex(item, colXs)].push(item);
779
+ }
780
+ let denseColumns = 0;
781
+ for (const colItems of itemsByCol) {
782
+ if (groupItemsToVisualLines(colItems).length >= MIN_UNDERSEGMENTED_TEXT_LINES) denseColumns++;
783
+ }
784
+ if (denseColumns < 2) return null;
785
+ const allLines = groupItemsToVisualLines(items.filter((i) => i.text.trim()));
786
+ const bands = [];
787
+ for (const line of allLines) {
788
+ let cy = 0, h = 0;
789
+ for (const it of line) {
790
+ cy += itemCenterY(it);
791
+ h += itemHeight(it);
792
+ }
793
+ cy /= line.length;
794
+ h /= line.length;
795
+ const top = cy + h / 2;
796
+ const bottom = cy - h / 2;
797
+ let matched = null;
798
+ for (const band of bands) {
799
+ const epsilon = Math.max(MIN_ROW_BAND_EPSILON, Math.min(band.avgHeight, h) * ROW_BAND_EPSILON_RATIO);
800
+ if (Math.abs(band.centerY - cy) <= epsilon || bottom <= band.topY && top >= band.bottomY) {
801
+ matched = band;
802
+ break;
803
+ }
804
+ }
805
+ if (!matched) {
806
+ matched = { centerY: 0, avgHeight: 0, topY: -Infinity, bottomY: Infinity, lineCount: 0, itemsByCol: Array.from({ length: numCols }, () => []) };
807
+ bands.push(matched);
808
+ }
809
+ matched.centerY = (matched.centerY * matched.lineCount + cy) / (matched.lineCount + 1);
810
+ matched.avgHeight = (matched.avgHeight * matched.lineCount + h) / (matched.lineCount + 1);
811
+ matched.topY = Math.max(matched.topY, top);
812
+ matched.bottomY = Math.min(matched.bottomY, bottom);
813
+ matched.lineCount++;
814
+ for (const it of line) {
815
+ matched.itemsByCol[findColumnIndex(it, colXs)].push(it);
816
+ }
817
+ }
818
+ if (bands.length < numRows + MIN_ROW_BAND_MISMATCH) return null;
819
+ bands.sort((a, b) => b.centerY - a.centerY);
820
+ const rebuilt = bands.map(
821
+ (band) => band.itemsByCol.map((colItems) => colItems.length > 0 ? cellTextToString(colItems) : "")
822
+ );
823
+ const countNonEmptyRows = (cells) => cells.filter((row) => row.some((c) => (typeof c === "string" ? c : c.text).trim() !== "")).length;
824
+ const countNonEmptyCols = (cells, cols) => {
825
+ let n = 0;
826
+ for (let c = 0; c < cols; c++) {
827
+ if (cells.some((row) => row[c] != null && (typeof row[c] === "string" ? row[c] : row[c].text).trim() !== "")) n++;
828
+ }
829
+ return n;
830
+ };
831
+ if (countNonEmptyRows(rebuilt) <= countNonEmptyRows(originalCells)) return null;
832
+ if (countNonEmptyCols(rebuilt, numCols) < countNonEmptyCols(originalCells, numCols)) return null;
833
+ return rebuilt;
834
+ }
666
835
  function mergeCellTextLines(textLines) {
667
836
  if (textLines.length <= 1) return textLines[0] || "";
668
837
  const merged = [textLines[0]];
@@ -695,7 +864,7 @@ var MIN_COL_FILL_RATIO = 0.4;
695
864
  function detectClusterTables(items, pageNum) {
696
865
  if (items.length < MIN_ROWS * MIN_COLS) return [];
697
866
  const { merged, originMap } = mergeEvenSpacedClusters(items);
698
- const rows = groupByBaseline(merged);
867
+ const rows = mergeOverlappingRows(groupByBaseline(merged));
699
868
  if (rows.length < MIN_ROWS) return [];
700
869
  const results = [];
701
870
  const headerResult = detectHeaderRow(rows);
@@ -744,6 +913,7 @@ function mergeEvenSpacedClusters(items) {
744
913
  if (/^[가-힣\d]$/.test(sorted[i].text)) {
745
914
  let runEnd = i + 1;
746
915
  while (runEnd < sorted.length && /^[가-힣\d]$/.test(sorted[runEnd].text)) {
916
+ if (sorted[runEnd].hasSpaceBefore) break;
747
917
  const gap = sorted[runEnd].x - (sorted[runEnd - 1].x + sorted[runEnd - 1].w);
748
918
  const fs = sorted[runEnd].fontSize;
749
919
  if (gap < fs * 0.1 || gap > fs * 3) break;
@@ -834,6 +1004,38 @@ function detectHeaderRow(rows) {
834
1004
  }
835
1005
  return null;
836
1006
  }
1007
+ function mergeOverlappingRows(rows) {
1008
+ if (rows.length <= 1) return rows;
1009
+ const result = [rows[0]];
1010
+ for (let i = 1; i < rows.length; i++) {
1011
+ const prev = result[result.length - 1];
1012
+ const curr = rows[i];
1013
+ const a = rowBand(prev);
1014
+ const b = rowBand(curr);
1015
+ const overlap = Math.min(a.top, b.top) - Math.max(a.bottom, b.bottom);
1016
+ const prevIsFrag = isFragmentRow(prev) && a.height <= b.height * 0.8 && overlap >= a.height * 0.5;
1017
+ const currIsFrag = isFragmentRow(curr) && b.height <= a.height * 0.8 && overlap >= b.height * 0.5;
1018
+ if (prevIsFrag || currIsFrag) {
1019
+ const baseY = prevIsFrag ? curr.y : prev.y;
1020
+ result[result.length - 1] = { y: baseY, items: [...prev.items, ...curr.items] };
1021
+ } else {
1022
+ result.push(curr);
1023
+ }
1024
+ }
1025
+ return result;
1026
+ }
1027
+ function isFragmentRow(row) {
1028
+ return row.items.length <= 3 && row.items.every((i) => i.text.length <= 8);
1029
+ }
1030
+ function rowBand(row) {
1031
+ let bottom = Infinity, top = -Infinity;
1032
+ for (const i of row.items) {
1033
+ const h = i.h > 0 ? i.h : i.fontSize;
1034
+ if (i.y < bottom) bottom = i.y;
1035
+ if (i.y + h > top) top = i.y + h;
1036
+ }
1037
+ return { bottom, top, height: top - bottom };
1038
+ }
837
1039
  function mergeMultiLineRows(rows, columns) {
838
1040
  if (rows.length <= 1) return rows;
839
1041
  const result = [rows[0]];
@@ -1320,6 +1522,8 @@ async function parsePdfDocument(buffer, options) {
1320
1522
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
1321
1523
  const fontSizeFreq = /* @__PURE__ */ new Map();
1322
1524
  const pageHeights = /* @__PURE__ */ new Map();
1525
+ const pagesWithLargeImage = /* @__PURE__ */ new Set();
1526
+ const skippedImagePages = /* @__PURE__ */ new Map();
1323
1527
  let parsedPages = 0;
1324
1528
  for (let i = 1; i <= effectivePageCount; i++) {
1325
1529
  if (pageFilter && !pageFilter.has(i)) continue;
@@ -1338,6 +1542,23 @@ async function parsePdfDocument(buffer, options) {
1338
1542
  if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
1339
1543
  }
1340
1544
  const opList = await page.getOperatorList();
1545
+ const pageArea = viewport.width * viewport.height;
1546
+ if (pageArea > 0) {
1547
+ const imageRegions = extractImageRegions(opList.fnArray, opList.argsArray);
1548
+ let uncovered = 0;
1549
+ for (const r of imageRegions) {
1550
+ const area = (r.x2 - r.x1) * (r.y2 - r.y1);
1551
+ if (area < pageArea * 0.05) continue;
1552
+ pagesWithLargeImage.add(i);
1553
+ const hasText = visible.some((it) => {
1554
+ const cx = it.x + it.w / 2;
1555
+ const cy = it.y + (it.h || it.fontSize) / 2;
1556
+ return cx >= r.x1 && cx <= r.x2 && cy >= r.y1 && cy <= r.y2;
1557
+ });
1558
+ if (!hasText) uncovered++;
1559
+ }
1560
+ if (uncovered > 0) skippedImagePages.set(i, uncovered);
1561
+ }
1341
1562
  const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
1342
1563
  for (const b of pageBlocks) blocks.push(b);
1343
1564
  let pageText = "";
@@ -1357,6 +1578,7 @@ async function parsePdfDocument(buffer, options) {
1357
1578
  }
1358
1579
  }
1359
1580
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
1581
+ let isImageBased = false;
1360
1582
  if (totalChars / Math.max(parsedPageCount, 1) < 10) {
1361
1583
  if (options?.ocr) {
1362
1584
  try {
@@ -1369,7 +1591,29 @@ async function parsePdfDocument(buffer, options) {
1369
1591
  } catch {
1370
1592
  }
1371
1593
  }
1372
- throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
1594
+ isImageBased = true;
1595
+ warnings.push({
1596
+ message: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, \uD14D\uC2A4\uD2B8 ${totalChars}\uC790) \u2014 \uD14D\uC2A4\uD2B8 \uB808\uC774\uC5B4\uAC00 \uC5C6\uC5B4 OCR\uC774 \uD544\uC694\uD569\uB2C8\uB2E4`,
1597
+ code: "NEEDS_OCR"
1598
+ });
1599
+ }
1600
+ if (!isImageBased) {
1601
+ const OCR_REASON_MESSAGES = {
1602
+ low_text: "\uD14D\uC2A4\uD2B8\uAC00 \uAC70\uC758 \uC5C6\uB294 \uD398\uC774\uC9C0 (\uC2A4\uCE94/\uC774\uBBF8\uC9C0 \uCD94\uC815)",
1603
+ high_pua: "\uAE00\uAF34 \uB9E4\uD551 \uC2E4\uD328 (PUA \uBE44\uC728 \uB192\uC74C) \u2014 \uCD94\uCD9C \uD14D\uC2A4\uD2B8 \uC2E0\uB8B0 \uBD88\uAC00",
1604
+ high_control: "\uC81C\uC5B4\uBB38\uC790 \uBE44\uC728 \uB192\uC74C \u2014 \uCD94\uCD9C \uD14D\uC2A4\uD2B8 \uC2E0\uB8B0 \uBD88\uAC00",
1605
+ high_replacement: "\uB300\uCCB4\uBB38\uC790(U+FFFD) \uBE44\uC728 \uB192\uC74C \u2014 \uCD94\uCD9C \uD14D\uC2A4\uD2B8 \uC2E0\uB8B0 \uBD88\uAC00"
1606
+ };
1607
+ for (const pq of pageQuality) {
1608
+ if (!pq.needsOcr || !pq.ocrReason) continue;
1609
+ if (pq.ocrReason === "low_text" && !pagesWithLargeImage.has(pq.page)) continue;
1610
+ warnings.push({ page: pq.page, message: `${OCR_REASON_MESSAGES[pq.ocrReason]} \u2014 OCR \uAC80\uD1A0 \uD544\uC694`, code: "NEEDS_OCR" });
1611
+ }
1612
+ }
1613
+ if (!isImageBased) {
1614
+ for (const [page, count] of [...skippedImagePages.entries()].sort((a, b) => a[0] - b[0])) {
1615
+ warnings.push({ page, message: `${count}\uAC1C \uC774\uBBF8\uC9C0 \uC601\uC5ED\uC5D0 \uCD94\uCD9C \uAC00\uB2A5\uD55C \uD14D\uC2A4\uD2B8 \uC5C6\uC74C (\uADF8\uB9BC/\uCC28\uD2B8/\uB3C4\uC7A5 \uB0B4\uC6A9 \uB204\uB77D \uAC00\uB2A5)`, code: "SKIPPED_IMAGE" });
1616
+ }
1373
1617
  }
1374
1618
  if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
1375
1619
  const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
@@ -1377,6 +1621,7 @@ async function parsePdfDocument(buffer, options) {
1377
1621
  blocks.splice(removed[ri], 1);
1378
1622
  }
1379
1623
  }
1624
+ mergeCrossPageTables(blocks);
1380
1625
  if (options?.formulaOcr && formulaBuffer) {
1381
1626
  try {
1382
1627
  await applyFormulaOcr(formulaBuffer, blocks, pageFilter, effectivePageCount, warnings, options.onProgress);
@@ -1392,6 +1637,8 @@ async function parsePdfDocument(buffer, options) {
1392
1637
  detectHeadings(blocks, medianFontSize);
1393
1638
  }
1394
1639
  detectMarkerHeadings(blocks);
1640
+ detectTableCaptions(blocks);
1641
+ detectKoreanListBlocks(blocks);
1395
1642
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
1396
1643
  sanitizeBlockControlChars(blocks);
1397
1644
  let markdown = cleanPdfText(blocksToMarkdown(blocks));
@@ -1401,6 +1648,7 @@ async function parsePdfDocument(buffer, options) {
1401
1648
  metadata,
1402
1649
  outline: outline.length > 0 ? outline : void 0,
1403
1650
  warnings: warnings.length > 0 ? warnings : void 0,
1651
+ isImageBased: isImageBased || void 0,
1404
1652
  pageQuality,
1405
1653
  qualitySummary: summarizeDocumentQuality(pageQuality)
1406
1654
  };
@@ -1556,79 +1804,218 @@ function detectMarkerHeadings(blocks) {
1556
1804
  }
1557
1805
  }
1558
1806
  var MAX_XYCUT_DEPTH = 50;
1807
+ var XYCUT_MIN_GAP = 5;
1808
+ var CROSS_LAYOUT_BETA = 2;
1809
+ var CROSS_OVERLAP_RATIO = 0.1;
1810
+ var CROSS_MIN_OVERLAPS = 2;
1811
+ var CROSS_MAX_MASK_RATIO = 0.2;
1812
+ var NARROW_ELEMENT_WIDTH_RATIO = 0.1;
1559
1813
  function xyCutOrder(items, gapThreshold, depth = 0) {
1560
1814
  if (items.length === 0) return [];
1561
1815
  if (items.length <= 2 || depth >= MAX_XYCUT_DEPTH) return [items];
1562
- const region = computeRegion(items);
1563
- const ySplit = findYSplit(items, region, gapThreshold);
1564
- if (ySplit !== null) {
1565
- const upper = items.filter((i) => i.y > ySplit);
1566
- const lower = items.filter((i) => i.y <= ySplit);
1816
+ if (depth === 0 && items.length >= 3) {
1817
+ const cross = identifyCrossLayoutItems(items);
1818
+ if (cross.size > 0 && cross.size <= items.length * CROSS_MAX_MASK_RATIO) {
1819
+ const rest = items.filter((i) => !cross.has(i));
1820
+ if (rest.length > 0) {
1821
+ const groups = xyCutOrder(rest, gapThreshold, 1);
1822
+ return mergeCrossLayoutGroups(groups, [...cross]);
1823
+ }
1824
+ }
1825
+ }
1826
+ const minGap = Math.max(XYCUT_MIN_GAP, gapThreshold);
1827
+ const hCut = findHorizontalCut(items);
1828
+ const vCut = findVerticalCutWithOutlierFilter(items, minGap);
1829
+ const hValid = hCut.gap >= minGap;
1830
+ const vValid = vCut.gap >= minGap;
1831
+ let useHorizontal;
1832
+ if (hValid && vValid) useHorizontal = vCut.gap <= hCut.gap * 1.5;
1833
+ else if (hValid) useHorizontal = true;
1834
+ else if (vValid) useHorizontal = false;
1835
+ else return [items];
1836
+ if (useHorizontal) {
1837
+ const upper = items.filter((i) => i.y > hCut.position);
1838
+ const lower = items.filter((i) => i.y <= hCut.position);
1567
1839
  if (upper.length > 0 && lower.length > 0 && upper.length < items.length) {
1568
1840
  return [...xyCutOrder(upper, gapThreshold, depth + 1), ...xyCutOrder(lower, gapThreshold, depth + 1)];
1569
1841
  }
1570
- }
1571
- const xSplit = findXSplit(items, region, gapThreshold);
1572
- if (xSplit !== null) {
1573
- const left = items.filter((i) => i.x + i.w / 2 < xSplit);
1574
- const right = items.filter((i) => i.x + i.w / 2 >= xSplit);
1842
+ } else {
1843
+ const left = items.filter((i) => i.x + i.w / 2 < vCut.position);
1844
+ const right = items.filter((i) => i.x + i.w / 2 >= vCut.position);
1575
1845
  if (left.length > 0 && right.length > 0 && left.length < items.length) {
1576
1846
  return [...xyCutOrder(left, gapThreshold, depth + 1), ...xyCutOrder(right, gapThreshold, depth + 1)];
1577
1847
  }
1578
1848
  }
1579
1849
  return [items];
1580
1850
  }
1581
- function computeRegion(items) {
1582
- let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
1851
+ function identifyCrossLayoutItems(items) {
1852
+ const cross = /* @__PURE__ */ new Set();
1853
+ if (items.length < 3) return cross;
1854
+ let maxWidth = 0;
1583
1855
  for (const i of items) {
1584
- if (i.x < minX) minX = i.x;
1585
- if (i.y < minY) minY = i.y;
1586
- if (i.x + i.w > maxX) maxX = i.x + i.w;
1587
- if (i.y + i.h > maxY) maxY = i.y + i.h;
1856
+ if (i.w > maxWidth) maxWidth = i.w;
1588
1857
  }
1589
- return { items, minX, minY, maxX, maxY };
1858
+ const threshold = CROSS_LAYOUT_BETA * maxWidth;
1859
+ for (const item of items) {
1860
+ if (item.w < threshold) continue;
1861
+ let overlaps = 0;
1862
+ for (const other of items) {
1863
+ if (other === item) continue;
1864
+ const left = Math.max(item.x, other.x);
1865
+ const right = Math.min(item.x + item.w, other.x + other.w);
1866
+ const overlapW = right - left;
1867
+ if (overlapW <= 0) continue;
1868
+ const smaller = Math.min(item.w, other.w);
1869
+ if (smaller > 0 && overlapW / smaller >= CROSS_OVERLAP_RATIO) {
1870
+ overlaps++;
1871
+ if (overlaps >= CROSS_MIN_OVERLAPS) break;
1872
+ }
1873
+ }
1874
+ if (overlaps >= CROSS_MIN_OVERLAPS) cross.add(item);
1875
+ }
1876
+ return cross;
1590
1877
  }
1591
- function findYSplit(items, _region, gapThreshold) {
1878
+ function mergeCrossLayoutGroups(groups, cross) {
1879
+ if (cross.length === 0) return groups;
1880
+ const sortedCross = [...cross].sort((a, b) => b.y + b.h - (a.y + a.h) || a.x - b.x);
1881
+ const groupTop = (g2) => {
1882
+ let top = -Infinity;
1883
+ for (const i of g2) {
1884
+ const t = i.y + i.h;
1885
+ if (t > top) top = t;
1886
+ }
1887
+ return top;
1888
+ };
1889
+ const result = [];
1890
+ let gi = 0, ci = 0;
1891
+ while (gi < groups.length || ci < sortedCross.length) {
1892
+ if (ci >= sortedCross.length) {
1893
+ result.push(groups[gi++]);
1894
+ continue;
1895
+ }
1896
+ if (gi >= groups.length) {
1897
+ result.push([sortedCross[ci++]]);
1898
+ continue;
1899
+ }
1900
+ const crossTop = sortedCross[ci].y + sortedCross[ci].h;
1901
+ if (crossTop >= groupTop(groups[gi])) result.push([sortedCross[ci++]]);
1902
+ else result.push(groups[gi++]);
1903
+ }
1904
+ return result;
1905
+ }
1906
+ function findHorizontalCut(items) {
1907
+ if (items.length < 2) return { position: 0, gap: 0 };
1592
1908
  const sorted = [...items].sort((a, b) => b.y - a.y);
1593
- let bestGap = gapThreshold;
1594
- let bestSplit = null;
1909
+ let largestGap = 0;
1910
+ let position = 0;
1595
1911
  for (let i = 1; i < sorted.length; i++) {
1596
1912
  const prevBottom = sorted[i - 1].y - sorted[i - 1].h;
1597
1913
  const currTop = sorted[i].y;
1598
1914
  const gap = prevBottom - currTop;
1599
- if (gap > bestGap) {
1600
- bestGap = gap;
1601
- bestSplit = (prevBottom + currTop) / 2;
1915
+ if (gap > largestGap) {
1916
+ largestGap = gap;
1917
+ position = (prevBottom + currTop) / 2;
1602
1918
  }
1603
1919
  }
1604
- return bestSplit;
1920
+ return { position, gap: largestGap };
1605
1921
  }
1606
- function findXSplit(items, _region, gapThreshold) {
1607
- const sorted = [...items].sort((a, b) => a.x - b.x);
1608
- let bestGap = gapThreshold;
1609
- let bestSplit = null;
1610
- for (let i = 1; i < sorted.length; i++) {
1611
- const prevRight = sorted[i - 1].x + sorted[i - 1].w;
1612
- const currLeft = sorted[i].x;
1613
- const gap = currLeft - prevRight;
1614
- if (gap > bestGap) {
1615
- bestGap = gap;
1616
- bestSplit = (prevRight + currLeft) / 2;
1922
+ function findVerticalCutWithOutlierFilter(items, minGap) {
1923
+ const edgeCut = findVerticalCut(items);
1924
+ if (edgeCut.gap >= minGap) return edgeCut;
1925
+ if (items.length >= 3) {
1926
+ let minX = Infinity, maxX = -Infinity;
1927
+ for (const i of items) {
1928
+ if (i.x < minX) minX = i.x;
1929
+ const r = i.x + i.w;
1930
+ if (r > maxX) maxX = r;
1931
+ }
1932
+ const narrowThreshold = (maxX - minX) * NARROW_ELEMENT_WIDTH_RATIO;
1933
+ const filtered = items.filter((i) => i.w >= narrowThreshold);
1934
+ if (filtered.length >= 2 && filtered.length < items.length && filtered.length >= items.length * 0.7) {
1935
+ const filteredCut = findVerticalCut(filtered);
1936
+ if (filteredCut.gap > edgeCut.gap && filteredCut.gap >= minGap) {
1937
+ return filteredCut;
1938
+ }
1939
+ }
1940
+ }
1941
+ return edgeCut;
1942
+ }
1943
+ function findVerticalCut(items) {
1944
+ if (items.length < 2) return { position: 0, gap: 0 };
1945
+ const sorted = [...items].sort((a, b) => a.x - b.x || a.x + a.w - (b.x + b.w));
1946
+ let largestGap = 0;
1947
+ let position = 0;
1948
+ let prevRight = null;
1949
+ for (const it of sorted) {
1950
+ const left = it.x;
1951
+ const right = it.x + it.w;
1952
+ if (prevRight !== null && left > prevRight) {
1953
+ const gap = left - prevRight;
1954
+ if (gap > largestGap) {
1955
+ largestGap = gap;
1956
+ position = (prevRight + left) / 2;
1957
+ }
1617
1958
  }
1959
+ prevRight = prevRight === null ? right : Math.max(prevRight, right);
1618
1960
  }
1619
- return bestSplit;
1961
+ return { position, gap: largestGap };
1620
1962
  }
1621
1963
  function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeight) {
1622
1964
  if (items.length === 0) return [];
1623
1965
  let { horizontals, verticals } = extractLines(opList.fnArray, opList.argsArray);
1624
1966
  ({ horizontals, verticals } = filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight));
1625
1967
  ({ horizontals, verticals } = preprocessLines(horizontals, verticals));
1968
+ markStrikethroughItems(items, horizontals);
1969
+ wrapStrikethroughRuns(items);
1626
1970
  const grids = buildTableGrids(horizontals, verticals);
1627
1971
  if (grids.length > 0) {
1628
1972
  return extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals);
1629
1973
  }
1630
1974
  return extractPageBlocksFallback(items, pageNum);
1631
1975
  }
1976
+ var STRIKE_MAX_THICKNESS = 2;
1977
+ var STRIKE_MAX_THICKNESS_RATIO = 0.25;
1978
+ var STRIKE_CENTER_TOLERANCE = 0.25;
1979
+ var STRIKE_MIN_OVERLAP_RATIO = 0.8;
1980
+ var STRIKE_MAX_LINE_TO_TEXT_RATIO = 1.5;
1981
+ function markStrikethroughItems(items, horizontals) {
1982
+ if (items.length === 0 || horizontals.length === 0) return;
1983
+ for (const line of horizontals) {
1984
+ if (line.lineWidth > STRIKE_MAX_THICKNESS) continue;
1985
+ const matches = [];
1986
+ for (const item of items) {
1987
+ const h = item.h > 0 ? item.h : item.fontSize;
1988
+ if (h <= 0 || item.w <= 0) continue;
1989
+ if (line.lineWidth > h * STRIKE_MAX_THICKNESS_RATIO) continue;
1990
+ const centerY = item.y + h * 0.4;
1991
+ if (Math.abs(line.y1 - centerY) > h * STRIKE_CENTER_TOLERANCE) continue;
1992
+ const overlap = Math.min(line.x2, item.x + item.w) - Math.max(line.x1, item.x);
1993
+ if (overlap / item.w < STRIKE_MIN_OVERLAP_RATIO) continue;
1994
+ matches.push(item);
1995
+ }
1996
+ if (matches.length === 0) continue;
1997
+ let totalW = 0;
1998
+ for (const m of matches) totalW += m.w;
1999
+ if (totalW <= 0 || (line.x2 - line.x1) / totalW > STRIKE_MAX_LINE_TO_TEXT_RATIO) continue;
2000
+ for (const m of matches) m.strike = true;
2001
+ }
2002
+ }
2003
+ function wrapStrikethroughRuns(items) {
2004
+ const struck = items.filter((i) => i.strike);
2005
+ if (struck.length === 0) return;
2006
+ const lines = /* @__PURE__ */ new Map();
2007
+ for (const item of struck) {
2008
+ const key = Math.round(item.y / 3);
2009
+ const arr = lines.get(key) || [];
2010
+ arr.push(item);
2011
+ lines.set(key, arr);
2012
+ }
2013
+ for (const arr of lines.values()) {
2014
+ arr.sort((a, b) => a.x - b.x);
2015
+ arr[0].text = "~~" + arr[0].text;
2016
+ arr[arr.length - 1].text = arr[arr.length - 1].text + "~~";
2017
+ }
2018
+ }
1632
2019
  function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
1633
2020
  const blocks = [];
1634
2021
  const usedItems = /* @__PURE__ */ new Set();
@@ -1658,7 +2045,8 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
1658
2045
  w: i.w,
1659
2046
  h: i.h,
1660
2047
  fontSize: i.fontSize,
1661
- fontName: i.fontName
2048
+ fontName: i.fontName,
2049
+ hasSpaceBefore: i.hasSpaceBefore
1662
2050
  }));
1663
2051
  const cellTextMap = mapTextToCells(textItems, cells);
1664
2052
  const numRows = grid.rowYs.length - 1;
@@ -1678,13 +2066,29 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
1678
2066
  rowSpan: cell.rowSpan
1679
2067
  };
1680
2068
  }
2069
+ let finalGrid = irGrid;
2070
+ let finalRows = numRows;
2071
+ if (numRows <= 2 && numCols >= 3) {
2072
+ const rebuilt = normalizeUndersegmentedTable(irGrid, grid.colXs, textItems);
2073
+ if (rebuilt) {
2074
+ finalGrid = rebuilt.map((row) => row.map((rawText) => {
2075
+ const cleaned = rawText.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
2076
+ return {
2077
+ text: cleaned.split("\n").map((line) => collapseEvenSpacing(line)).join("\n"),
2078
+ colSpan: 1,
2079
+ rowSpan: 1
2080
+ };
2081
+ }));
2082
+ finalRows = finalGrid.length;
2083
+ }
2084
+ }
1681
2085
  const irTable = {
1682
- rows: numRows,
2086
+ rows: finalRows,
1683
2087
  cols: numCols,
1684
- cells: irGrid,
1685
- hasHeader: numRows > 1
2088
+ cells: finalGrid,
2089
+ hasHeader: finalRows > 1
1686
2090
  };
1687
- const hasContent = irGrid.some((row) => row.some((cell) => cell.text.trim() !== ""));
2091
+ const hasContent = finalGrid.some((row) => row.some((cell) => cell.text.trim() !== ""));
1688
2092
  if (!hasContent) continue;
1689
2093
  const tableBbox = {
1690
2094
  page: pageNum,
@@ -1713,7 +2117,8 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
1713
2117
  w: i.w,
1714
2118
  h: i.h,
1715
2119
  fontSize: i.fontSize,
1716
- fontName: i.fontName
2120
+ fontName: i.fontName,
2121
+ hasSpaceBefore: i.hasSpaceBefore
1717
2122
  }));
1718
2123
  const clusterResults = detectClusterTables(clusterItems, pageNum);
1719
2124
  if (clusterResults.length > 0) {
@@ -1751,6 +2156,46 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
1751
2156
  }
1752
2157
  return mergeAdjacentTableBlocks(blocks);
1753
2158
  }
2159
// Maximum horizontal drift of a table's left/right edge, expressed as a
// fraction of the wider table's width, for two tables to count as aligned.
var NEIGHBOR_TABLE_EPSILON = 0.2;

/**
 * Joins a table with the table that immediately follows it in `blocks` when
 * the second one sits on the next page, has the same column count and has
 * roughly the same left/right edges. The list is modified in place.
 *
 * The walk runs from the end of the list towards the start, so a table that
 * spans several pages collapses step by step into the block of its first page.
 *
 * @param {Array<object>} blocks - Document blocks; table blocks are expected
 *   to carry `type`, `pageNumber`, `bbox` ({ x, width }) and `table`
 *   ({ rows, cols, cells, hasHeader, caption }).
 * @returns {void}
 */
function mergeCrossPageTables(blocks) {
  // True when `lower` qualifies as the continuation of `upper`.
  const continuesOnNextPage = (upper, lower) => {
    const bothTables =
      upper.type === "table" && lower.type === "table" && Boolean(upper.table) && Boolean(lower.table);
    if (!bothTables) return false;
    const consecutivePages =
      Boolean(upper.pageNumber) && Boolean(lower.pageNumber) && lower.pageNumber === upper.pageNumber + 1;
    if (!consecutivePages) return false;
    if (upper.table.cols !== lower.table.cols) return false;
    if (!(upper.bbox && lower.bbox)) return false;
    const tolerance = Math.max(upper.bbox.width, lower.bbox.width, 1) * NEIGHBOR_TABLE_EPSILON;
    const leftShift = Math.abs(upper.bbox.x - lower.bbox.x);
    const rightShift = Math.abs(upper.bbox.x + upper.bbox.width - (lower.bbox.x + lower.bbox.width));
    // Written as a negated "too far" test so non-numeric geometry is treated
    // the same way as a plain `>` comparison would treat it.
    return !(leftShift > tolerance || rightShift > tolerance);
  };

  for (let upperIdx = blocks.length - 2; upperIdx >= 0; upperIdx -= 1) {
    const lowerIdx = upperIdx + 1;
    const upper = blocks[upperIdx];
    const lower = blocks[lowerIdx];
    if (!continuesOnNextPage(upper, lower)) continue;

    const upperRows = upper.table.cells;
    const lowerRows = lower.table.cells;
    // A continuation page often repeats the header row; drop that copy.
    const repeatsHeader =
      lowerRows.length > 1 && upperRows.length > 0 && rowTextsEqual(upperRows[0], lowerRows[0]);
    const carriedRows = repeatsHeader ? lowerRows.slice(1) : lowerRows;

    if (carriedRows.length > 0) {
      blocks[upperIdx] = {
        ...upper,
        table: {
          rows: upper.table.rows + carriedRows.length,
          cols: upper.table.cols,
          cells: [...upperRows, ...carriedRows],
          hasHeader: upper.table.hasHeader,
          caption: upper.table.caption
        }
      };
    }
    // The continuation block is removed whether or not it contributed rows.
    blocks.splice(lowerIdx, 1);
  }
}
2191
/**
 * Tells whether two table rows hold the same text cell by cell, ignoring all
 * whitespace. Rows whose cells are all blank never count as equal.
 *
 * @param {Array<{text: string}>} a - First row.
 * @param {Array<{text: string}>} b - Second row.
 * @returns {boolean} True when the rows have equal length, every cell matches
 *   after whitespace removal, and `a` has at least one non-blank cell.
 */
function rowTextsEqual(a, b) {
  if (a.length !== b.length) return false;
  const squeeze = (value) => value.replace(/\s+/g, "");
  const cellsMatch = a.every((cell, col) => squeeze(cell.text) === squeeze(b[col].text));
  if (!cellsMatch) return false;
  return a.some((cell) => cell.text.trim() !== "");
}
1754
2199
  function mergeAdjacentTableBlocks(blocks) {
1755
2200
  if (blocks.length <= 1) return blocks;
1756
2201
  const result = [blocks[0]];
@@ -1781,7 +2226,8 @@ function extractPageBlocksFallback(items, pageNum) {
1781
2226
  w: i.w,
1782
2227
  h: i.h,
1783
2228
  fontSize: i.fontSize,
1784
- fontName: i.fontName
2229
+ fontName: i.fontName,
2230
+ hasSpaceBefore: i.hasSpaceBefore
1785
2231
  }));
1786
2232
  const clusterResults = detectClusterTables(clusterItems, pageNum);
1787
2233
  if (clusterResults.length > 0) {
@@ -1797,7 +2243,7 @@ function extractPageBlocksFallback(items, pageNum) {
1797
2243
  }
1798
2244
  const remaining = items.filter((_, idx) => !usedIndices.has(idx));
1799
2245
  if (remaining.length > 0) {
1800
- const yLines = groupByY(remaining);
2246
+ const yLines = mergeSuperscriptLines(groupByY(remaining));
1801
2247
  for (const line of yLines) {
1802
2248
  const text = mergeLineSimple(line);
1803
2249
  if (!text.trim()) continue;
@@ -1811,7 +2257,7 @@ function extractPageBlocksFallback(items, pageNum) {
1811
2257
  return by - ay;
1812
2258
  });
1813
2259
  } else {
1814
- const allYLines = groupByY(items);
2260
+ const allYLines = mergeSuperscriptLines(groupByY(items));
1815
2261
  const columns = detectColumns(allYLines);
1816
2262
  if (columns && columns.length >= 3) {
1817
2263
  const tableText = extractWithColumns(allYLines, columns);
@@ -1824,7 +2270,7 @@ function extractPageBlocksFallback(items, pageNum) {
1824
2270
  const orderedGroups = xyCutOrder(items, gapThreshold);
1825
2271
  for (const group of orderedGroups) {
1826
2272
  if (group.length === 0) continue;
1827
- const yLines = groupByY(group);
2273
+ const yLines = mergeSuperscriptLines(groupByY(group));
1828
2274
  const groupColumns = detectColumns(yLines);
1829
2275
  if (groupColumns && groupColumns.length >= 3) {
1830
2276
  const tableText = extractWithColumns(yLines, groupColumns);
@@ -1916,16 +2362,16 @@ function normalizeItems(rawItems) {
1916
2362
  if (!isDup) deduped.push(sorted[i]);
1917
2363
  }
1918
2364
  if (spacePositions.length > 0) {
1919
- for (const item of deduped) {
1920
- for (const sp of spacePositions) {
1921
- if (Math.abs(sp.y - item.y) <= 3) {
1922
- const dist = item.x - sp.x;
1923
- if (dist >= 0 && dist <= 20) {
1924
- item.hasSpaceBefore = true;
1925
- break;
1926
- }
2365
+ for (const sp of spacePositions) {
2366
+ let nearest = null;
2367
+ for (const item of deduped) {
2368
+ if (Math.abs(sp.y - item.y) > 3) continue;
2369
+ const dist = item.x - sp.x;
2370
+ if (dist >= -1 && dist <= 20 && (!nearest || item.x < nearest.x)) {
2371
+ nearest = item;
1927
2372
  }
1928
2373
  }
2374
+ if (nearest) nearest.hasSpaceBefore = true;
1929
2375
  }
1930
2376
  }
1931
2377
  return deduped;
@@ -1959,6 +2405,35 @@ function groupByY(items) {
1959
2405
  if (curLine.length > 0) lines.push(curLine);
1960
2406
  return lines;
1961
2407
  }
2408
/**
 * Folds short, small-height line fragments into the neighbouring line they
 * vertically overlap, so that they are not emitted as separate lines.
 *
 * A line is absorbed when it has at most 3 items of at most 8 trimmed
 * characters each, its vertical extent is no more than 80% of the
 * neighbour's, and at least half of its own extent overlaps the neighbour.
 * The test is applied in both directions between each line and the last line
 * kept so far.
 *
 * @param {Array<Array<object>>} lines - Lines of text items; each item
 *   provides `y`, `h`, `fontSize` and `text`.
 * @returns {Array<Array<object>>} The input itself when it has 0 or 1 lines,
 *   otherwise a new list in which absorbed fragments are concatenated onto
 *   their host line.
 */
function mergeSuperscriptLines(lines) {
  if (lines.length <= 1) return lines;

  // Vertical span of a line; an item's height falls back to its font size
  // when `h` is not positive.
  const verticalExtent = (line) => {
    const span = line.reduce(
      (acc, item) => {
        const glyphHeight = item.h > 0 ? item.h : item.fontSize;
        const itemTop = item.y + glyphHeight;
        return {
          bottom: item.y < acc.bottom ? item.y : acc.bottom,
          top: itemTop > acc.top ? itemTop : acc.top
        };
      },
      { bottom: Infinity, top: -Infinity }
    );
    return { bottom: span.bottom, top: span.top, height: span.top - span.bottom };
  };

  const isShortFragment = (line) =>
    line.length <= 3 && line.every((item) => item.text.trim().length <= 8);

  const merged = [lines[0]];
  for (const line of lines.slice(1)) {
    const tailIdx = merged.length - 1;
    const tail = merged[tailIdx];
    const tailBox = verticalExtent(tail);
    const lineBox = verticalExtent(line);
    const shared = Math.min(tailBox.top, lineBox.top) - Math.max(tailBox.bottom, lineBox.bottom);

    const tailIsFragment =
      isShortFragment(tail) && tailBox.height <= lineBox.height * 0.8 && shared >= tailBox.height * 0.5;
    const lineIsFragment =
      isShortFragment(line) && lineBox.height <= tailBox.height * 0.8 && shared >= lineBox.height * 0.5;

    if (tailIsFragment || lineIsFragment) {
      merged[tailIdx] = [...tail, ...line];
    } else {
      merged.push(line);
    }
  }
  return merged;
}
1962
2437
  function isProseSpread(items) {
1963
2438
  if (items.length < 4) return false;
1964
2439
  const sorted = [...items].sort((a, b) => a.x - b.x);
@@ -2200,9 +2675,7 @@ function mergeLineSimple(items) {
2200
2675
  result += sorted[i].text;
2201
2676
  continue;
2202
2677
  }
2203
- if (gap < avgFs * 0.15) {
2204
- } else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
2205
- } else if (gap > 3) result += " ";
2678
+ if (gap > spaceGapThreshold(avgFs)) result += " ";
2206
2679
  result += sorted[i].text;
2207
2680
  }
2208
2681
  return result;
@@ -2226,7 +2699,7 @@ function cleanPdfText(text) {
2226
2699
  ).replace(/^(?!\| ---).*$/gm, (line) => {
2227
2700
  if (/^\s*\${1,2}.+\${1,2}\s*$/.test(line)) return line;
2228
2701
  return collapseEvenSpacing(line);
2229
- }).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
2702
+ }).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\\~\\~/g, "~~").replace(/~~~~/g, "").replace(/\n{3,}/g, "\n\n").trim();
2230
2703
  }
2231
2704
  function startsWithMarker(line) {
2232
2705
  const t = line.trimStart();
@@ -2235,6 +2708,134 @@ function startsWithMarker(line) {
2235
2708
  function isStandaloneHeader(line) {
2236
2709
  return /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,7}$/.test(line.trim());
2237
2710
  }
2711
// Leading caption label: an optional opening bracket, a keyword
// (표/그림/도표/Table/Figure/Fig.), a number, then optional closing
// bracket and punctuation.
var TABLE_CAPTION_RE = /^[<\[(【〈]?\s*(표|그림|도표|Table|Figure|Fig\.?)\s*[\d①-⑮][\d.\-]*\s*[\])】〉>]?[.:]?\s*/i;
// Longest trimmed paragraph text still accepted as a caption.
var CAPTION_MAX_LENGTH = 100;
// Largest vertical distance between caption and table, in bbox units.
var CAPTION_MAX_GAP = 30;

/**
 * Attaches caption paragraphs to the tables they sit next to. For every table
 * block without a caption, the block directly before it is tried first, then
 * the block directly after it. A matching paragraph's trimmed text becomes
 * `table.caption` and the paragraph is removed from `blocks` (in place).
 *
 * @param {Array<object>} blocks - Document blocks with `type`, `text`,
 *   `pageNumber`, optional `bbox` ({ x, y, width, height }) and, for tables,
 *   `table`.
 * @returns {void}
 */
function detectTableCaptions(blocks) {
  // Decides whether `candidate` is a caption paragraph belonging to `tableBlock`.
  const looksLikeCaptionFor = (candidate, tableBlock) => {
    if (!candidate || candidate.type !== "paragraph" || !candidate.text) return false;
    if (candidate.pageNumber !== tableBlock.pageNumber) return false;

    const label = candidate.text.trim();
    const isShortSingleLine = label !== "" && label.length <= CAPTION_MAX_LENGTH && !label.includes("\n");
    if (!isShortSingleLine) return false;
    if (!TABLE_CAPTION_RE.test(label)) return false;

    const capBox = candidate.bbox;
    const tblBox = tableBlock.bbox;
    // Without geometry on both sides the textual match alone decides.
    if (!capBox || !tblBox) return true;

    // bbox.y is used as the lower edge and y + height as the upper edge.
    const capLower = capBox.y;
    const capUpper = capBox.y + capBox.height;
    const tblLower = tblBox.y;
    const tblUpper = tblBox.y + tblBox.height;
    const verticalGap = capLower >= tblUpper ? capLower - tblUpper : tblLower - capUpper;
    if (verticalGap > CAPTION_MAX_GAP) return false;

    // Require the horizontal overlap to cover 30% of the narrower box.
    const sharedWidth =
      Math.min(capBox.x + capBox.width, tblBox.x + tblBox.width) - Math.max(capBox.x, tblBox.x);
    const narrower = Math.min(capBox.width, tblBox.width);
    return !(sharedWidth < narrower * 0.3);
  };

  let pos = 0;
  while (pos < blocks.length) {
    const block = blocks[pos];
    const needsCaption = block.type === "table" && block.table && !block.table.caption;
    if (needsCaption) {
      const before = blocks[pos - 1];
      const after = blocks[pos + 1];
      if (looksLikeCaptionFor(before, block)) {
        block.table.caption = before.text.trim();
        blocks.splice(pos - 1, 1);
        // The table moved one slot towards the front.
        pos -= 1;
      } else if (looksLikeCaptionFor(after, block)) {
        block.table.caption = after.text.trim();
        blocks.splice(pos + 1, 1);
      }
    }
    pos += 1;
  }
}
2746
// Korean syllables used as ordered list markers; a syllable's position in
// this string (1-based) is its ordinal.
var KOREAN_LIST_SEQ = "\uAC00\uB098\uB2E4\uB77C\uB9C8\uBC14\uC0AC\uC544\uC790\uCC28\uCE74\uD0C0\uD30C\uD558";

/**
 * Recognises a list marker at the very start of `text`.
 *
 * Marker families are tried in this order: "arabicDot" (`1. `), "korDot"
 * (`가. `), "arabicParen" (`1)`), "korParen" (`가)`), "circled" (`①`).
 * A Korean syllable that is not part of KOREAN_LIST_SEQ does not count as
 * a marker.
 *
 * @param {string} text - Text to inspect (callers pass it already trimmed).
 * @returns {{family: string, ord: number} | null} The marker family and its
 *   ordinal, or null when no marker is found.
 */
function parseListLabel(text) {
  const fromDigits = (digits) => parseInt(digits, 10);
  // Returns null for syllables outside the marker sequence.
  const fromKorean = (syllable) => {
    const at = KOREAN_LIST_SEQ.indexOf(syllable);
    return at >= 0 ? at + 1 : null;
  };
  // 9312 is the code unit of "①".
  const fromCircled = (glyph) => glyph.charCodeAt(0) - 9312 + 1;

  const matchers = [
    { family: "arabicDot", pattern: /^(\d{1,2})\.(?!\d)\s+/, toOrd: fromDigits },
    { family: "korDot", pattern: /^([가-하])\.\s+/, toOrd: fromKorean },
    { family: "arabicParen", pattern: /^(\d{1,2})\)\s*/, toOrd: fromDigits },
    { family: "korParen", pattern: /^([가-하])\)\s*/, toOrd: fromKorean },
    { family: "circled", pattern: /^([①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮])\s*/, toOrd: fromCircled }
  ];

  for (const { family, pattern, toOrd } of matchers) {
    const found = text.match(pattern);
    if (!found) continue;
    const ord = toOrd(found[1]);
    if (ord !== null) return { family, ord };
  }
  return null;
}
2766
// Matches an attachment line: "붙임" (optionally spaced), an optional number
// with "." or ":", followed by one whitespace character.
var ATTACHMENT_RE = /^붙\s*임\s*(\d+[.:]?)?\s/;

/**
 * Converts numbered/lettered paragraphs into list blocks, in place.
 *
 * Step 1: every paragraph/list block whose trimmed text starts with a marker
 * recognised by parseListLabel is recorded per marker family. Within a
 * family, only runs of at least two blocks with consecutive ordinals that lie
 * at most 20 block positions apart are accepted.
 *
 * Step 2: accepted blocks become `type: "list"` blocks ("ordered" for the
 * "arabicDot" family, "unordered" otherwise). The first family seen after a
 * table/heading/separator is the top level; blocks of any other family are
 * appended to `children` of the most recent top-level list block and removed
 * from `blocks`. Paragraphs matching ATTACHMENT_RE become unordered list
 * blocks regardless of step 1.
 *
 * @param {Array<object>} blocks - Document blocks with `type` and `text`.
 * @returns {void}
 */
function detectKoreanListBlocks(blocks) {
  const isTextual = (blk) => (blk.type === "paragraph" || blk.type === "list") && Boolean(blk.text);

  // Step 1a: collect marker occurrences grouped by family, in document order.
  const occurrencesByFamily = new Map();
  blocks.forEach((blk, idx) => {
    if (!isTextual(blk)) return;
    const label = parseListLabel(blk.text.trim());
    if (!label) return;
    if (!occurrencesByFamily.has(label.family)) occurrencesByFamily.set(label.family, []);
    occurrencesByFamily.get(label.family).push({ idx, ord: label.ord });
  });

  // Step 1b: keep only runs of two or more consecutive ordinals.
  const validated = new Set();
  const acceptRun = (run) => {
    if (run.length >= 2) run.forEach((idx) => validated.add(idx));
  };
  for (const occurrences of occurrencesByFamily.values()) {
    let run = [];
    let last = null;
    for (const occurrence of occurrences) {
      const continuesRun =
        last !== null && occurrence.ord === last.ord + 1 && occurrence.idx - last.idx <= 20;
      if (!continuesRun) {
        acceptRun(run);
        run = [];
      }
      run.push(occurrence.idx);
      last = occurrence;
    }
    acceptRun(run);
  }

  // Step 2: rewrite accepted blocks and nest non-top-level families.
  const resetters = ["table", "heading", "separator"];
  let familyStack = [];
  let topLevelList = null;
  const nestedIndices = [];
  for (const [i, b] of blocks.entries()) {
    if (resetters.includes(b.type)) {
      // Structural blocks end the current list context.
      familyStack = [];
      topLevelList = null;
      continue;
    }
    if (!isTextual(b)) continue;

    const text = b.text.trim();
    if (b.type === "paragraph" && ATTACHMENT_RE.test(text)) {
      blocks[i] = { ...b, type: "list", listType: "unordered" };
      continue;
    }
    if (!validated.has(i)) continue;

    const { family } = parseListLabel(text);
    const known = familyStack.indexOf(family);
    let depth;
    if (known === -1) {
      familyStack.push(family);
      depth = familyStack.length - 1;
    } else {
      // Returning to an outer family discards the deeper ones.
      familyStack = familyStack.slice(0, known + 1);
      depth = known;
    }

    const listBlock = { ...b, type: "list", listType: family === "arabicDot" ? "ordered" : "unordered" };
    if (depth > 0 && topLevelList) {
      if (!topLevelList.children) topLevelList.children = [];
      topLevelList.children.push(listBlock);
      nestedIndices.push(i);
    } else {
      blocks[i] = listBlock;
      topLevelList = listBlock;
    }
  }

  // Indices were collected in ascending order; remove from the back so the
  // remaining indices stay valid.
  for (let k = nestedIndices.length - 1; k >= 0; k -= 1) {
    blocks.splice(nestedIndices[k], 1);
  }
}
2238
2839
  function detectListBlocks(blocks) {
2239
2840
  const result = [];
2240
2841
  for (let i = 0; i < blocks.length; i++) {
@@ -2344,7 +2945,6 @@ function detectSpecialKoreanTables(blocks) {
2344
2945
  function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
2345
2946
  const ZONE_RATIO = 0.12;
2346
2947
  const MIN_REPEAT = 3;
2347
- const Y_BUCKET = 5;
2348
2948
  const topEntries = [];
2349
2949
  const bottomEntries = [];
2350
2950
  for (let bi = 0; bi < blocks.length; bi++) {
@@ -2354,7 +2954,7 @@ function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
2354
2954
  if (!ph) continue;
2355
2955
  const blockTop = ph - (b.bbox.y + b.bbox.height);
2356
2956
  const blockBottom = ph - b.bbox.y;
2357
- const entry = { blockIdx: bi, page: b.pageNumber, y: b.bbox.y, text: b.text.trim() };
2957
+ const entry = { blockIdx: bi, page: b.pageNumber, text: b.text.trim() };
2358
2958
  if (blockBottom <= ph * ZONE_RATIO) bottomEntries.push(entry);
2359
2959
  else if (blockTop >= ph * (1 - ZONE_RATIO)) topEntries.push(entry);
2360
2960
  }
@@ -2376,21 +2976,9 @@ function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
2376
2976
  repeatedPatterns.add(p);
2377
2977
  }
2378
2978
  }
2379
- const bucketPages = /* @__PURE__ */ new Map();
2380
- for (const e of entries) {
2381
- const bucket = Math.round(e.y / Y_BUCKET);
2382
- const pages = bucketPages.get(bucket) || /* @__PURE__ */ new Set();
2383
- pages.add(e.page);
2384
- bucketPages.set(bucket, pages);
2385
- }
2386
- const repeatedBuckets = /* @__PURE__ */ new Set();
2387
- for (const [b, pages] of bucketPages) {
2388
- if (pages.size >= MIN_REPEAT) repeatedBuckets.add(b);
2389
- }
2390
2979
  for (const e of entries) {
2391
2980
  const norm = e.text.replace(/\d+/g, "#");
2392
- const bucket = Math.round(e.y / Y_BUCKET);
2393
- if (repeatedPatterns.has(norm) || repeatedBuckets.has(bucket)) {
2981
+ if (repeatedPatterns.has(norm)) {
2394
2982
  removeSet.add(e.blockIdx);
2395
2983
  }
2396
2984
  }
@@ -2549,7 +3137,11 @@ function formatMb(bytes) {
2549
3137
  }
2550
3138
  export {
2551
3139
  cleanPdfText,
3140
+ detectKoreanListBlocks,
3141
+ detectTableCaptions,
2552
3142
  extractPdfMetadataOnly,
2553
- parsePdfDocument
3143
+ mergeCrossPageTables,
3144
+ parsePdfDocument,
3145
+ removeHeaderFooterBlocks
2554
3146
  };
2555
- //# sourceMappingURL=parser-FJNQEW7K.js.map
3147
+ //# sourceMappingURL=parser-AU2NLC44.js.map