kordoc 2.9.0 → 3.0.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +66 -8
- package/dist/-K5SLEFZD.js +71 -0
- package/dist/-K5SLEFZD.js.map +1 -0
- package/dist/{chunk-M24KMDAR.js → chunk-326STEDU.js} +6684 -4061
- package/dist/chunk-326STEDU.js.map +1 -0
- package/dist/{chunk-QB7CS534.cjs → chunk-3WRJQQIO.cjs} +185 -16
- package/dist/chunk-3WRJQQIO.cjs.map +1 -0
- package/dist/chunk-MUOQXDZ4.cjs.map +1 -1
- package/dist/{chunk-RXZLTACX.js → chunk-NHXKJWR7.js} +182 -13
- package/dist/chunk-NHXKJWR7.js.map +1 -0
- package/dist/{chunk-SJ5TPMBT.js → chunk-SA2PERJ5.js} +182 -13
- package/dist/chunk-SA2PERJ5.js.map +1 -0
- package/dist/cli.js +42 -3
- package/dist/cli.js.map +1 -1
- package/dist/formula-XGG6ZP42.cjs.map +1 -1
- package/dist/index.cjs +3247 -822
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +61 -2
- package/dist/index.d.ts +61 -2
- package/dist/index.js +3025 -600
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/page-range-3C7UGGEK.cjs.map +1 -1
- package/dist/{parser-OMPBVEFU.js → parser-4IVYHKSL.js} +677 -85
- package/dist/parser-4IVYHKSL.js.map +1 -0
- package/dist/{parser-EL5YETUA.cjs → parser-5KHU732L.cjs} +689 -97
- package/dist/parser-5KHU732L.cjs.map +1 -0
- package/dist/{parser-XBYGROQB.js → parser-AU2NLC44.js} +677 -85
- package/dist/parser-AU2NLC44.js.map +1 -0
- package/dist/provider-SNONEZNW.cjs.map +1 -1
- package/dist/{watch-ULLLK7ID.js → watch-5DDN4BUI.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-M24KMDAR.js.map +0 -1
- package/dist/chunk-QB7CS534.cjs.map +0 -1
- package/dist/chunk-RXZLTACX.js.map +0 -1
- package/dist/chunk-SJ5TPMBT.js.map +0 -1
- package/dist/parser-EL5YETUA.cjs.map +0 -1
- package/dist/parser-OMPBVEFU.js.map +0 -1
- package/dist/parser-XBYGROQB.js.map +0 -1
- /package/dist/{watch-ULLLK7ID.js.map → watch-5DDN4BUI.js.map} +0 -0
|
@@ -7,7 +7,7 @@ import {
|
|
|
7
7
|
blocksToMarkdown,
|
|
8
8
|
safeMax,
|
|
9
9
|
safeMin
|
|
10
|
-
} from "./chunk-
|
|
10
|
+
} from "./chunk-SA2PERJ5.js";
|
|
11
11
|
import {
|
|
12
12
|
parsePageRange
|
|
13
13
|
} from "./chunk-MOL7MDBG.js";
|
|
@@ -157,6 +157,55 @@ function extractLines(fnArray, argsArray) {
|
|
|
157
157
|
}
|
|
158
158
|
return { horizontals, verticals };
|
|
159
159
|
}
|
|
160
|
+
function multiplyTransform(m, t) {
|
|
161
|
+
return [
|
|
162
|
+
m[0] * t[0] + m[2] * t[1],
|
|
163
|
+
m[1] * t[0] + m[3] * t[1],
|
|
164
|
+
m[0] * t[2] + m[2] * t[3],
|
|
165
|
+
m[1] * t[2] + m[3] * t[3],
|
|
166
|
+
m[0] * t[4] + m[2] * t[5] + m[4],
|
|
167
|
+
m[1] * t[4] + m[3] * t[5] + m[5]
|
|
168
|
+
];
|
|
169
|
+
}
|
|
170
|
+
function extractImageRegions(fnArray, argsArray) {
|
|
171
|
+
const regions = [];
|
|
172
|
+
let ctm = [1, 0, 0, 1, 0, 0];
|
|
173
|
+
const stack = [];
|
|
174
|
+
for (let i = 0; i < fnArray.length; i++) {
|
|
175
|
+
const op = fnArray[i];
|
|
176
|
+
switch (op) {
|
|
177
|
+
case OPS.save:
|
|
178
|
+
stack.push(ctm);
|
|
179
|
+
break;
|
|
180
|
+
case OPS.restore:
|
|
181
|
+
ctm = stack.pop() || [1, 0, 0, 1, 0, 0];
|
|
182
|
+
break;
|
|
183
|
+
case OPS.transform: {
|
|
184
|
+
const t = argsArray[i];
|
|
185
|
+
if (Array.isArray(t) && t.length >= 6) ctm = multiplyTransform(ctm, t);
|
|
186
|
+
break;
|
|
187
|
+
}
|
|
188
|
+
case OPS.paintImageXObject:
|
|
189
|
+
case OPS.paintInlineImageXObject:
|
|
190
|
+
case OPS.paintImageMaskXObject:
|
|
191
|
+
case OPS.paintImageXObjectRepeat: {
|
|
192
|
+
const corners = [[0, 0], [1, 0], [0, 1], [1, 1]];
|
|
193
|
+
let x1 = Infinity, y1 = Infinity, x2 = -Infinity, y2 = -Infinity;
|
|
194
|
+
for (const [u, v] of corners) {
|
|
195
|
+
const x = ctm[0] * u + ctm[2] * v + ctm[4];
|
|
196
|
+
const y = ctm[1] * u + ctm[3] * v + ctm[5];
|
|
197
|
+
if (x < x1) x1 = x;
|
|
198
|
+
if (x > x2) x2 = x;
|
|
199
|
+
if (y < y1) y1 = y;
|
|
200
|
+
if (y > y2) y2 = y;
|
|
201
|
+
}
|
|
202
|
+
if (x2 - x1 > 0 && y2 - y1 > 0) regions.push({ x1, y1, x2, y2 });
|
|
203
|
+
break;
|
|
204
|
+
}
|
|
205
|
+
}
|
|
206
|
+
}
|
|
207
|
+
return regions;
|
|
208
|
+
}
|
|
160
209
|
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
161
210
|
const dx = Math.abs(seg.x2 - seg.x1);
|
|
162
211
|
const dy = Math.abs(seg.y2 - seg.y1);
|
|
@@ -542,6 +591,10 @@ function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
|
|
|
542
591
|
}
|
|
543
592
|
return false;
|
|
544
593
|
}
|
|
594
|
+
var SPACE_GAP_RATIO = 0.17;
|
|
595
|
+
function spaceGapThreshold(fontSize) {
|
|
596
|
+
return Math.max(fontSize * SPACE_GAP_RATIO, 1);
|
|
597
|
+
}
|
|
545
598
|
function mapTextToCells(items, cells) {
|
|
546
599
|
const result = /* @__PURE__ */ new Map();
|
|
547
600
|
for (const cell of cells) {
|
|
@@ -601,14 +654,12 @@ function cellTextToString(items) {
|
|
|
601
654
|
}
|
|
602
655
|
const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
|
|
603
656
|
const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
|
|
604
|
-
|
|
605
|
-
const currIsKorean = /^[가-힣]/.test(s[j].text);
|
|
606
|
-
if (gap < avgFs * 0.15) {
|
|
607
|
-
result += s[j].text;
|
|
608
|
-
} else if (gap < avgFs * 0.35 && (prevIsKorean || currIsKorean)) {
|
|
609
|
-
result += s[j].text;
|
|
610
|
-
} else {
|
|
657
|
+
if (s[j].hasSpaceBefore && gap >= avgFs * 0.05) {
|
|
611
658
|
result += " " + s[j].text;
|
|
659
|
+
} else if (gap > spaceGapThreshold(avgFs)) {
|
|
660
|
+
result += " " + s[j].text;
|
|
661
|
+
} else {
|
|
662
|
+
result += s[j].text;
|
|
612
663
|
}
|
|
613
664
|
}
|
|
614
665
|
return result;
|
|
@@ -621,6 +672,11 @@ function detectEvenSpacedItems(items) {
|
|
|
621
672
|
let runStart = -1;
|
|
622
673
|
for (let i = 0; i < items.length; i++) {
|
|
623
674
|
const isShortKorean = /^[가-힣]{1}$/.test(items[i].text) || /^[\d]{1}$/.test(items[i].text);
|
|
675
|
+
if (isShortKorean && runStart >= 0 && items[i].hasSpaceBefore) {
|
|
676
|
+
if (i - runStart >= 3) markEvenRun(items, result, runStart, i);
|
|
677
|
+
runStart = i;
|
|
678
|
+
continue;
|
|
679
|
+
}
|
|
624
680
|
if (isShortKorean && runStart >= 0 && i > 0) {
|
|
625
681
|
const gap = items[i].x - (items[i - 1].x + items[i - 1].w);
|
|
626
682
|
const maxRunGap = Math.max(items[i].fontSize * 3, 30);
|
|
@@ -663,6 +719,119 @@ function markEvenRun(items, result, start, end) {
|
|
|
663
719
|
}
|
|
664
720
|
}
|
|
665
721
|
}
|
|
722
|
+
var MAX_UNDERSEGMENTED_ROWS = 2;
|
|
723
|
+
var MIN_UNDERSEGMENTED_COLUMNS = 3;
|
|
724
|
+
var MIN_UNDERSEGMENTED_TEXT_LINES = 8;
|
|
725
|
+
var MIN_ROW_BAND_MISMATCH = 2;
|
|
726
|
+
var MIN_ROW_BAND_EPSILON = 3;
|
|
727
|
+
var ROW_BAND_EPSILON_RATIO = 0.6;
|
|
728
|
+
function itemCenterY(item) {
|
|
729
|
+
return item.y + (item.h > 0 ? item.h : item.fontSize) / 2;
|
|
730
|
+
}
|
|
731
|
+
function itemHeight(item) {
|
|
732
|
+
return item.h > 0 ? item.h : item.fontSize;
|
|
733
|
+
}
|
|
734
|
+
function findColumnIndex(item, colXs) {
|
|
735
|
+
const cx = item.x + item.w / 2;
|
|
736
|
+
for (let c = 0; c < colXs.length - 1; c++) {
|
|
737
|
+
if (cx >= colXs[c] && cx <= colXs[c + 1]) return c;
|
|
738
|
+
}
|
|
739
|
+
let best = 0;
|
|
740
|
+
let bestDist = Infinity;
|
|
741
|
+
for (let c = 0; c < colXs.length - 1; c++) {
|
|
742
|
+
const center = (colXs[c] + colXs[c + 1]) / 2;
|
|
743
|
+
const d = Math.abs(cx - center);
|
|
744
|
+
if (d < bestDist) {
|
|
745
|
+
bestDist = d;
|
|
746
|
+
best = c;
|
|
747
|
+
}
|
|
748
|
+
}
|
|
749
|
+
return best;
|
|
750
|
+
}
|
|
751
|
+
function groupItemsToVisualLines(items) {
|
|
752
|
+
if (items.length === 0) return [];
|
|
753
|
+
const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
|
|
754
|
+
const lines = [];
|
|
755
|
+
let cur = [sorted[0]];
|
|
756
|
+
let curY = sorted[0].y;
|
|
757
|
+
for (let i = 1; i < sorted.length; i++) {
|
|
758
|
+
const tol = Math.max(3, Math.min(sorted[i].fontSize, cur[0].fontSize) * 0.6);
|
|
759
|
+
if (Math.abs(sorted[i].y - curY) <= tol) {
|
|
760
|
+
cur.push(sorted[i]);
|
|
761
|
+
} else {
|
|
762
|
+
lines.push(cur);
|
|
763
|
+
cur = [sorted[i]];
|
|
764
|
+
curY = sorted[i].y;
|
|
765
|
+
}
|
|
766
|
+
}
|
|
767
|
+
lines.push(cur);
|
|
768
|
+
return lines;
|
|
769
|
+
}
|
|
770
|
+
function normalizeUndersegmentedTable(originalCells, colXs, items) {
|
|
771
|
+
const numRows = originalCells.length;
|
|
772
|
+
const numCols = colXs.length - 1;
|
|
773
|
+
if (numRows > MAX_UNDERSEGMENTED_ROWS || numCols < MIN_UNDERSEGMENTED_COLUMNS) return null;
|
|
774
|
+
if (items.length === 0) return null;
|
|
775
|
+
const itemsByCol = Array.from({ length: numCols }, () => []);
|
|
776
|
+
for (const item of items) {
|
|
777
|
+
if (!item.text.trim()) continue;
|
|
778
|
+
itemsByCol[findColumnIndex(item, colXs)].push(item);
|
|
779
|
+
}
|
|
780
|
+
let denseColumns = 0;
|
|
781
|
+
for (const colItems of itemsByCol) {
|
|
782
|
+
if (groupItemsToVisualLines(colItems).length >= MIN_UNDERSEGMENTED_TEXT_LINES) denseColumns++;
|
|
783
|
+
}
|
|
784
|
+
if (denseColumns < 2) return null;
|
|
785
|
+
const allLines = groupItemsToVisualLines(items.filter((i) => i.text.trim()));
|
|
786
|
+
const bands = [];
|
|
787
|
+
for (const line of allLines) {
|
|
788
|
+
let cy = 0, h = 0;
|
|
789
|
+
for (const it of line) {
|
|
790
|
+
cy += itemCenterY(it);
|
|
791
|
+
h += itemHeight(it);
|
|
792
|
+
}
|
|
793
|
+
cy /= line.length;
|
|
794
|
+
h /= line.length;
|
|
795
|
+
const top = cy + h / 2;
|
|
796
|
+
const bottom = cy - h / 2;
|
|
797
|
+
let matched = null;
|
|
798
|
+
for (const band of bands) {
|
|
799
|
+
const epsilon = Math.max(MIN_ROW_BAND_EPSILON, Math.min(band.avgHeight, h) * ROW_BAND_EPSILON_RATIO);
|
|
800
|
+
if (Math.abs(band.centerY - cy) <= epsilon || bottom <= band.topY && top >= band.bottomY) {
|
|
801
|
+
matched = band;
|
|
802
|
+
break;
|
|
803
|
+
}
|
|
804
|
+
}
|
|
805
|
+
if (!matched) {
|
|
806
|
+
matched = { centerY: 0, avgHeight: 0, topY: -Infinity, bottomY: Infinity, lineCount: 0, itemsByCol: Array.from({ length: numCols }, () => []) };
|
|
807
|
+
bands.push(matched);
|
|
808
|
+
}
|
|
809
|
+
matched.centerY = (matched.centerY * matched.lineCount + cy) / (matched.lineCount + 1);
|
|
810
|
+
matched.avgHeight = (matched.avgHeight * matched.lineCount + h) / (matched.lineCount + 1);
|
|
811
|
+
matched.topY = Math.max(matched.topY, top);
|
|
812
|
+
matched.bottomY = Math.min(matched.bottomY, bottom);
|
|
813
|
+
matched.lineCount++;
|
|
814
|
+
for (const it of line) {
|
|
815
|
+
matched.itemsByCol[findColumnIndex(it, colXs)].push(it);
|
|
816
|
+
}
|
|
817
|
+
}
|
|
818
|
+
if (bands.length < numRows + MIN_ROW_BAND_MISMATCH) return null;
|
|
819
|
+
bands.sort((a, b) => b.centerY - a.centerY);
|
|
820
|
+
const rebuilt = bands.map(
|
|
821
|
+
(band) => band.itemsByCol.map((colItems) => colItems.length > 0 ? cellTextToString(colItems) : "")
|
|
822
|
+
);
|
|
823
|
+
const countNonEmptyRows = (cells) => cells.filter((row) => row.some((c) => (typeof c === "string" ? c : c.text).trim() !== "")).length;
|
|
824
|
+
const countNonEmptyCols = (cells, cols) => {
|
|
825
|
+
let n = 0;
|
|
826
|
+
for (let c = 0; c < cols; c++) {
|
|
827
|
+
if (cells.some((row) => row[c] != null && (typeof row[c] === "string" ? row[c] : row[c].text).trim() !== "")) n++;
|
|
828
|
+
}
|
|
829
|
+
return n;
|
|
830
|
+
};
|
|
831
|
+
if (countNonEmptyRows(rebuilt) <= countNonEmptyRows(originalCells)) return null;
|
|
832
|
+
if (countNonEmptyCols(rebuilt, numCols) < countNonEmptyCols(originalCells, numCols)) return null;
|
|
833
|
+
return rebuilt;
|
|
834
|
+
}
|
|
666
835
|
function mergeCellTextLines(textLines) {
|
|
667
836
|
if (textLines.length <= 1) return textLines[0] || "";
|
|
668
837
|
const merged = [textLines[0]];
|
|
@@ -695,7 +864,7 @@ var MIN_COL_FILL_RATIO = 0.4;
|
|
|
695
864
|
function detectClusterTables(items, pageNum) {
|
|
696
865
|
if (items.length < MIN_ROWS * MIN_COLS) return [];
|
|
697
866
|
const { merged, originMap } = mergeEvenSpacedClusters(items);
|
|
698
|
-
const rows = groupByBaseline(merged);
|
|
867
|
+
const rows = mergeOverlappingRows(groupByBaseline(merged));
|
|
699
868
|
if (rows.length < MIN_ROWS) return [];
|
|
700
869
|
const results = [];
|
|
701
870
|
const headerResult = detectHeaderRow(rows);
|
|
@@ -744,6 +913,7 @@ function mergeEvenSpacedClusters(items) {
|
|
|
744
913
|
if (/^[가-힣\d]$/.test(sorted[i].text)) {
|
|
745
914
|
let runEnd = i + 1;
|
|
746
915
|
while (runEnd < sorted.length && /^[가-힣\d]$/.test(sorted[runEnd].text)) {
|
|
916
|
+
if (sorted[runEnd].hasSpaceBefore) break;
|
|
747
917
|
const gap = sorted[runEnd].x - (sorted[runEnd - 1].x + sorted[runEnd - 1].w);
|
|
748
918
|
const fs = sorted[runEnd].fontSize;
|
|
749
919
|
if (gap < fs * 0.1 || gap > fs * 3) break;
|
|
@@ -834,6 +1004,38 @@ function detectHeaderRow(rows) {
|
|
|
834
1004
|
}
|
|
835
1005
|
return null;
|
|
836
1006
|
}
|
|
1007
|
+
function mergeOverlappingRows(rows) {
|
|
1008
|
+
if (rows.length <= 1) return rows;
|
|
1009
|
+
const result = [rows[0]];
|
|
1010
|
+
for (let i = 1; i < rows.length; i++) {
|
|
1011
|
+
const prev = result[result.length - 1];
|
|
1012
|
+
const curr = rows[i];
|
|
1013
|
+
const a = rowBand(prev);
|
|
1014
|
+
const b = rowBand(curr);
|
|
1015
|
+
const overlap = Math.min(a.top, b.top) - Math.max(a.bottom, b.bottom);
|
|
1016
|
+
const prevIsFrag = isFragmentRow(prev) && a.height <= b.height * 0.8 && overlap >= a.height * 0.5;
|
|
1017
|
+
const currIsFrag = isFragmentRow(curr) && b.height <= a.height * 0.8 && overlap >= b.height * 0.5;
|
|
1018
|
+
if (prevIsFrag || currIsFrag) {
|
|
1019
|
+
const baseY = prevIsFrag ? curr.y : prev.y;
|
|
1020
|
+
result[result.length - 1] = { y: baseY, items: [...prev.items, ...curr.items] };
|
|
1021
|
+
} else {
|
|
1022
|
+
result.push(curr);
|
|
1023
|
+
}
|
|
1024
|
+
}
|
|
1025
|
+
return result;
|
|
1026
|
+
}
|
|
1027
|
+
function isFragmentRow(row) {
|
|
1028
|
+
return row.items.length <= 3 && row.items.every((i) => i.text.length <= 8);
|
|
1029
|
+
}
|
|
1030
|
+
function rowBand(row) {
|
|
1031
|
+
let bottom = Infinity, top = -Infinity;
|
|
1032
|
+
for (const i of row.items) {
|
|
1033
|
+
const h = i.h > 0 ? i.h : i.fontSize;
|
|
1034
|
+
if (i.y < bottom) bottom = i.y;
|
|
1035
|
+
if (i.y + h > top) top = i.y + h;
|
|
1036
|
+
}
|
|
1037
|
+
return { bottom, top, height: top - bottom };
|
|
1038
|
+
}
|
|
837
1039
|
function mergeMultiLineRows(rows, columns) {
|
|
838
1040
|
if (rows.length <= 1) return rows;
|
|
839
1041
|
const result = [rows[0]];
|
|
@@ -1320,6 +1522,8 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1320
1522
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
1321
1523
|
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
1322
1524
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
1525
|
+
const pagesWithLargeImage = /* @__PURE__ */ new Set();
|
|
1526
|
+
const skippedImagePages = /* @__PURE__ */ new Map();
|
|
1323
1527
|
let parsedPages = 0;
|
|
1324
1528
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
1325
1529
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
@@ -1338,6 +1542,23 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1338
1542
|
if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
|
|
1339
1543
|
}
|
|
1340
1544
|
const opList = await page.getOperatorList();
|
|
1545
|
+
const pageArea = viewport.width * viewport.height;
|
|
1546
|
+
if (pageArea > 0) {
|
|
1547
|
+
const imageRegions = extractImageRegions(opList.fnArray, opList.argsArray);
|
|
1548
|
+
let uncovered = 0;
|
|
1549
|
+
for (const r of imageRegions) {
|
|
1550
|
+
const area = (r.x2 - r.x1) * (r.y2 - r.y1);
|
|
1551
|
+
if (area < pageArea * 0.05) continue;
|
|
1552
|
+
pagesWithLargeImage.add(i);
|
|
1553
|
+
const hasText = visible.some((it) => {
|
|
1554
|
+
const cx = it.x + it.w / 2;
|
|
1555
|
+
const cy = it.y + (it.h || it.fontSize) / 2;
|
|
1556
|
+
return cx >= r.x1 && cx <= r.x2 && cy >= r.y1 && cy <= r.y2;
|
|
1557
|
+
});
|
|
1558
|
+
if (!hasText) uncovered++;
|
|
1559
|
+
}
|
|
1560
|
+
if (uncovered > 0) skippedImagePages.set(i, uncovered);
|
|
1561
|
+
}
|
|
1341
1562
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
1342
1563
|
for (const b of pageBlocks) blocks.push(b);
|
|
1343
1564
|
let pageText = "";
|
|
@@ -1357,6 +1578,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1357
1578
|
}
|
|
1358
1579
|
}
|
|
1359
1580
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
1581
|
+
let isImageBased = false;
|
|
1360
1582
|
if (totalChars / Math.max(parsedPageCount, 1) < 10) {
|
|
1361
1583
|
if (options?.ocr) {
|
|
1362
1584
|
try {
|
|
@@ -1369,7 +1591,29 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1369
1591
|
} catch {
|
|
1370
1592
|
}
|
|
1371
1593
|
}
|
|
1372
|
-
|
|
1594
|
+
isImageBased = true;
|
|
1595
|
+
warnings.push({
|
|
1596
|
+
message: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, \uD14D\uC2A4\uD2B8 ${totalChars}\uC790) \u2014 \uD14D\uC2A4\uD2B8 \uB808\uC774\uC5B4\uAC00 \uC5C6\uC5B4 OCR\uC774 \uD544\uC694\uD569\uB2C8\uB2E4`,
|
|
1597
|
+
code: "NEEDS_OCR"
|
|
1598
|
+
});
|
|
1599
|
+
}
|
|
1600
|
+
if (!isImageBased) {
|
|
1601
|
+
const OCR_REASON_MESSAGES = {
|
|
1602
|
+
low_text: "\uD14D\uC2A4\uD2B8\uAC00 \uAC70\uC758 \uC5C6\uB294 \uD398\uC774\uC9C0 (\uC2A4\uCE94/\uC774\uBBF8\uC9C0 \uCD94\uC815)",
|
|
1603
|
+
high_pua: "\uAE00\uAF34 \uB9E4\uD551 \uC2E4\uD328 (PUA \uBE44\uC728 \uB192\uC74C) \u2014 \uCD94\uCD9C \uD14D\uC2A4\uD2B8 \uC2E0\uB8B0 \uBD88\uAC00",
|
|
1604
|
+
high_control: "\uC81C\uC5B4\uBB38\uC790 \uBE44\uC728 \uB192\uC74C \u2014 \uCD94\uCD9C \uD14D\uC2A4\uD2B8 \uC2E0\uB8B0 \uBD88\uAC00",
|
|
1605
|
+
high_replacement: "\uB300\uCCB4\uBB38\uC790(U+FFFD) \uBE44\uC728 \uB192\uC74C \u2014 \uCD94\uCD9C \uD14D\uC2A4\uD2B8 \uC2E0\uB8B0 \uBD88\uAC00"
|
|
1606
|
+
};
|
|
1607
|
+
for (const pq of pageQuality) {
|
|
1608
|
+
if (!pq.needsOcr || !pq.ocrReason) continue;
|
|
1609
|
+
if (pq.ocrReason === "low_text" && !pagesWithLargeImage.has(pq.page)) continue;
|
|
1610
|
+
warnings.push({ page: pq.page, message: `${OCR_REASON_MESSAGES[pq.ocrReason]} \u2014 OCR \uAC80\uD1A0 \uD544\uC694`, code: "NEEDS_OCR" });
|
|
1611
|
+
}
|
|
1612
|
+
}
|
|
1613
|
+
if (!isImageBased) {
|
|
1614
|
+
for (const [page, count] of [...skippedImagePages.entries()].sort((a, b) => a[0] - b[0])) {
|
|
1615
|
+
warnings.push({ page, message: `${count}\uAC1C \uC774\uBBF8\uC9C0 \uC601\uC5ED\uC5D0 \uCD94\uCD9C \uAC00\uB2A5\uD55C \uD14D\uC2A4\uD2B8 \uC5C6\uC74C (\uADF8\uB9BC/\uCC28\uD2B8/\uB3C4\uC7A5 \uB0B4\uC6A9 \uB204\uB77D \uAC00\uB2A5)`, code: "SKIPPED_IMAGE" });
|
|
1616
|
+
}
|
|
1373
1617
|
}
|
|
1374
1618
|
if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
|
|
1375
1619
|
const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
|
|
@@ -1377,6 +1621,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1377
1621
|
blocks.splice(removed[ri], 1);
|
|
1378
1622
|
}
|
|
1379
1623
|
}
|
|
1624
|
+
mergeCrossPageTables(blocks);
|
|
1380
1625
|
if (options?.formulaOcr && formulaBuffer) {
|
|
1381
1626
|
try {
|
|
1382
1627
|
await applyFormulaOcr(formulaBuffer, blocks, pageFilter, effectivePageCount, warnings, options.onProgress);
|
|
@@ -1392,6 +1637,8 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1392
1637
|
detectHeadings(blocks, medianFontSize);
|
|
1393
1638
|
}
|
|
1394
1639
|
detectMarkerHeadings(blocks);
|
|
1640
|
+
detectTableCaptions(blocks);
|
|
1641
|
+
detectKoreanListBlocks(blocks);
|
|
1395
1642
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
1396
1643
|
sanitizeBlockControlChars(blocks);
|
|
1397
1644
|
let markdown = cleanPdfText(blocksToMarkdown(blocks));
|
|
@@ -1401,6 +1648,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1401
1648
|
metadata,
|
|
1402
1649
|
outline: outline.length > 0 ? outline : void 0,
|
|
1403
1650
|
warnings: warnings.length > 0 ? warnings : void 0,
|
|
1651
|
+
isImageBased: isImageBased || void 0,
|
|
1404
1652
|
pageQuality,
|
|
1405
1653
|
qualitySummary: summarizeDocumentQuality(pageQuality)
|
|
1406
1654
|
};
|
|
@@ -1556,79 +1804,218 @@ function detectMarkerHeadings(blocks) {
|
|
|
1556
1804
|
}
|
|
1557
1805
|
}
|
|
1558
1806
|
var MAX_XYCUT_DEPTH = 50;
|
|
1807
|
+
var XYCUT_MIN_GAP = 5;
|
|
1808
|
+
var CROSS_LAYOUT_BETA = 2;
|
|
1809
|
+
var CROSS_OVERLAP_RATIO = 0.1;
|
|
1810
|
+
var CROSS_MIN_OVERLAPS = 2;
|
|
1811
|
+
var CROSS_MAX_MASK_RATIO = 0.2;
|
|
1812
|
+
var NARROW_ELEMENT_WIDTH_RATIO = 0.1;
|
|
1559
1813
|
function xyCutOrder(items, gapThreshold, depth = 0) {
|
|
1560
1814
|
if (items.length === 0) return [];
|
|
1561
1815
|
if (items.length <= 2 || depth >= MAX_XYCUT_DEPTH) return [items];
|
|
1562
|
-
|
|
1563
|
-
|
|
1564
|
-
|
|
1565
|
-
|
|
1566
|
-
|
|
1816
|
+
if (depth === 0 && items.length >= 3) {
|
|
1817
|
+
const cross = identifyCrossLayoutItems(items);
|
|
1818
|
+
if (cross.size > 0 && cross.size <= items.length * CROSS_MAX_MASK_RATIO) {
|
|
1819
|
+
const rest = items.filter((i) => !cross.has(i));
|
|
1820
|
+
if (rest.length > 0) {
|
|
1821
|
+
const groups = xyCutOrder(rest, gapThreshold, 1);
|
|
1822
|
+
return mergeCrossLayoutGroups(groups, [...cross]);
|
|
1823
|
+
}
|
|
1824
|
+
}
|
|
1825
|
+
}
|
|
1826
|
+
const minGap = Math.max(XYCUT_MIN_GAP, gapThreshold);
|
|
1827
|
+
const hCut = findHorizontalCut(items);
|
|
1828
|
+
const vCut = findVerticalCutWithOutlierFilter(items, minGap);
|
|
1829
|
+
const hValid = hCut.gap >= minGap;
|
|
1830
|
+
const vValid = vCut.gap >= minGap;
|
|
1831
|
+
let useHorizontal;
|
|
1832
|
+
if (hValid && vValid) useHorizontal = vCut.gap <= hCut.gap * 1.5;
|
|
1833
|
+
else if (hValid) useHorizontal = true;
|
|
1834
|
+
else if (vValid) useHorizontal = false;
|
|
1835
|
+
else return [items];
|
|
1836
|
+
if (useHorizontal) {
|
|
1837
|
+
const upper = items.filter((i) => i.y > hCut.position);
|
|
1838
|
+
const lower = items.filter((i) => i.y <= hCut.position);
|
|
1567
1839
|
if (upper.length > 0 && lower.length > 0 && upper.length < items.length) {
|
|
1568
1840
|
return [...xyCutOrder(upper, gapThreshold, depth + 1), ...xyCutOrder(lower, gapThreshold, depth + 1)];
|
|
1569
1841
|
}
|
|
1570
|
-
}
|
|
1571
|
-
|
|
1572
|
-
|
|
1573
|
-
const left = items.filter((i) => i.x + i.w / 2 < xSplit);
|
|
1574
|
-
const right = items.filter((i) => i.x + i.w / 2 >= xSplit);
|
|
1842
|
+
} else {
|
|
1843
|
+
const left = items.filter((i) => i.x + i.w / 2 < vCut.position);
|
|
1844
|
+
const right = items.filter((i) => i.x + i.w / 2 >= vCut.position);
|
|
1575
1845
|
if (left.length > 0 && right.length > 0 && left.length < items.length) {
|
|
1576
1846
|
return [...xyCutOrder(left, gapThreshold, depth + 1), ...xyCutOrder(right, gapThreshold, depth + 1)];
|
|
1577
1847
|
}
|
|
1578
1848
|
}
|
|
1579
1849
|
return [items];
|
|
1580
1850
|
}
|
|
1581
|
-
function
|
|
1582
|
-
|
|
1851
|
+
function identifyCrossLayoutItems(items) {
|
|
1852
|
+
const cross = /* @__PURE__ */ new Set();
|
|
1853
|
+
if (items.length < 3) return cross;
|
|
1854
|
+
let maxWidth = 0;
|
|
1583
1855
|
for (const i of items) {
|
|
1584
|
-
if (i.
|
|
1585
|
-
if (i.y < minY) minY = i.y;
|
|
1586
|
-
if (i.x + i.w > maxX) maxX = i.x + i.w;
|
|
1587
|
-
if (i.y + i.h > maxY) maxY = i.y + i.h;
|
|
1856
|
+
if (i.w > maxWidth) maxWidth = i.w;
|
|
1588
1857
|
}
|
|
1589
|
-
|
|
1858
|
+
const threshold = CROSS_LAYOUT_BETA * maxWidth;
|
|
1859
|
+
for (const item of items) {
|
|
1860
|
+
if (item.w < threshold) continue;
|
|
1861
|
+
let overlaps = 0;
|
|
1862
|
+
for (const other of items) {
|
|
1863
|
+
if (other === item) continue;
|
|
1864
|
+
const left = Math.max(item.x, other.x);
|
|
1865
|
+
const right = Math.min(item.x + item.w, other.x + other.w);
|
|
1866
|
+
const overlapW = right - left;
|
|
1867
|
+
if (overlapW <= 0) continue;
|
|
1868
|
+
const smaller = Math.min(item.w, other.w);
|
|
1869
|
+
if (smaller > 0 && overlapW / smaller >= CROSS_OVERLAP_RATIO) {
|
|
1870
|
+
overlaps++;
|
|
1871
|
+
if (overlaps >= CROSS_MIN_OVERLAPS) break;
|
|
1872
|
+
}
|
|
1873
|
+
}
|
|
1874
|
+
if (overlaps >= CROSS_MIN_OVERLAPS) cross.add(item);
|
|
1875
|
+
}
|
|
1876
|
+
return cross;
|
|
1590
1877
|
}
|
|
1591
|
-
function
|
|
1878
|
+
function mergeCrossLayoutGroups(groups, cross) {
|
|
1879
|
+
if (cross.length === 0) return groups;
|
|
1880
|
+
const sortedCross = [...cross].sort((a, b) => b.y + b.h - (a.y + a.h) || a.x - b.x);
|
|
1881
|
+
const groupTop = (g2) => {
|
|
1882
|
+
let top = -Infinity;
|
|
1883
|
+
for (const i of g2) {
|
|
1884
|
+
const t = i.y + i.h;
|
|
1885
|
+
if (t > top) top = t;
|
|
1886
|
+
}
|
|
1887
|
+
return top;
|
|
1888
|
+
};
|
|
1889
|
+
const result = [];
|
|
1890
|
+
let gi = 0, ci = 0;
|
|
1891
|
+
while (gi < groups.length || ci < sortedCross.length) {
|
|
1892
|
+
if (ci >= sortedCross.length) {
|
|
1893
|
+
result.push(groups[gi++]);
|
|
1894
|
+
continue;
|
|
1895
|
+
}
|
|
1896
|
+
if (gi >= groups.length) {
|
|
1897
|
+
result.push([sortedCross[ci++]]);
|
|
1898
|
+
continue;
|
|
1899
|
+
}
|
|
1900
|
+
const crossTop = sortedCross[ci].y + sortedCross[ci].h;
|
|
1901
|
+
if (crossTop >= groupTop(groups[gi])) result.push([sortedCross[ci++]]);
|
|
1902
|
+
else result.push(groups[gi++]);
|
|
1903
|
+
}
|
|
1904
|
+
return result;
|
|
1905
|
+
}
|
|
1906
|
+
function findHorizontalCut(items) {
|
|
1907
|
+
if (items.length < 2) return { position: 0, gap: 0 };
|
|
1592
1908
|
const sorted = [...items].sort((a, b) => b.y - a.y);
|
|
1593
|
-
let
|
|
1594
|
-
let
|
|
1909
|
+
let largestGap = 0;
|
|
1910
|
+
let position = 0;
|
|
1595
1911
|
for (let i = 1; i < sorted.length; i++) {
|
|
1596
1912
|
const prevBottom = sorted[i - 1].y - sorted[i - 1].h;
|
|
1597
1913
|
const currTop = sorted[i].y;
|
|
1598
1914
|
const gap = prevBottom - currTop;
|
|
1599
|
-
if (gap >
|
|
1600
|
-
|
|
1601
|
-
|
|
1915
|
+
if (gap > largestGap) {
|
|
1916
|
+
largestGap = gap;
|
|
1917
|
+
position = (prevBottom + currTop) / 2;
|
|
1602
1918
|
}
|
|
1603
1919
|
}
|
|
1604
|
-
return
|
|
1920
|
+
return { position, gap: largestGap };
|
|
1605
1921
|
}
|
|
1606
|
-
function
|
|
1607
|
-
const
|
|
1608
|
-
|
|
1609
|
-
|
|
1610
|
-
|
|
1611
|
-
const
|
|
1612
|
-
|
|
1613
|
-
|
|
1614
|
-
|
|
1615
|
-
|
|
1616
|
-
|
|
1922
|
+
function findVerticalCutWithOutlierFilter(items, minGap) {
|
|
1923
|
+
const edgeCut = findVerticalCut(items);
|
|
1924
|
+
if (edgeCut.gap >= minGap) return edgeCut;
|
|
1925
|
+
if (items.length >= 3) {
|
|
1926
|
+
let minX = Infinity, maxX = -Infinity;
|
|
1927
|
+
for (const i of items) {
|
|
1928
|
+
if (i.x < minX) minX = i.x;
|
|
1929
|
+
const r = i.x + i.w;
|
|
1930
|
+
if (r > maxX) maxX = r;
|
|
1931
|
+
}
|
|
1932
|
+
const narrowThreshold = (maxX - minX) * NARROW_ELEMENT_WIDTH_RATIO;
|
|
1933
|
+
const filtered = items.filter((i) => i.w >= narrowThreshold);
|
|
1934
|
+
if (filtered.length >= 2 && filtered.length < items.length && filtered.length >= items.length * 0.7) {
|
|
1935
|
+
const filteredCut = findVerticalCut(filtered);
|
|
1936
|
+
if (filteredCut.gap > edgeCut.gap && filteredCut.gap >= minGap) {
|
|
1937
|
+
return filteredCut;
|
|
1938
|
+
}
|
|
1939
|
+
}
|
|
1940
|
+
}
|
|
1941
|
+
return edgeCut;
|
|
1942
|
+
}
|
|
1943
|
+
function findVerticalCut(items) {
|
|
1944
|
+
if (items.length < 2) return { position: 0, gap: 0 };
|
|
1945
|
+
const sorted = [...items].sort((a, b) => a.x - b.x || a.x + a.w - (b.x + b.w));
|
|
1946
|
+
let largestGap = 0;
|
|
1947
|
+
let position = 0;
|
|
1948
|
+
let prevRight = null;
|
|
1949
|
+
for (const it of sorted) {
|
|
1950
|
+
const left = it.x;
|
|
1951
|
+
const right = it.x + it.w;
|
|
1952
|
+
if (prevRight !== null && left > prevRight) {
|
|
1953
|
+
const gap = left - prevRight;
|
|
1954
|
+
if (gap > largestGap) {
|
|
1955
|
+
largestGap = gap;
|
|
1956
|
+
position = (prevRight + left) / 2;
|
|
1957
|
+
}
|
|
1617
1958
|
}
|
|
1959
|
+
prevRight = prevRight === null ? right : Math.max(prevRight, right);
|
|
1618
1960
|
}
|
|
1619
|
-
return
|
|
1961
|
+
return { position, gap: largestGap };
|
|
1620
1962
|
}
|
|
1621
1963
|
function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeight) {
|
|
1622
1964
|
if (items.length === 0) return [];
|
|
1623
1965
|
let { horizontals, verticals } = extractLines(opList.fnArray, opList.argsArray);
|
|
1624
1966
|
({ horizontals, verticals } = filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight));
|
|
1625
1967
|
({ horizontals, verticals } = preprocessLines(horizontals, verticals));
|
|
1968
|
+
markStrikethroughItems(items, horizontals);
|
|
1969
|
+
wrapStrikethroughRuns(items);
|
|
1626
1970
|
const grids = buildTableGrids(horizontals, verticals);
|
|
1627
1971
|
if (grids.length > 0) {
|
|
1628
1972
|
return extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals);
|
|
1629
1973
|
}
|
|
1630
1974
|
return extractPageBlocksFallback(items, pageNum);
|
|
1631
1975
|
}
|
|
1976
|
+
var STRIKE_MAX_THICKNESS = 2;
|
|
1977
|
+
var STRIKE_MAX_THICKNESS_RATIO = 0.25;
|
|
1978
|
+
var STRIKE_CENTER_TOLERANCE = 0.25;
|
|
1979
|
+
var STRIKE_MIN_OVERLAP_RATIO = 0.8;
|
|
1980
|
+
var STRIKE_MAX_LINE_TO_TEXT_RATIO = 1.5;
|
|
1981
|
+
function markStrikethroughItems(items, horizontals) {
|
|
1982
|
+
if (items.length === 0 || horizontals.length === 0) return;
|
|
1983
|
+
for (const line of horizontals) {
|
|
1984
|
+
if (line.lineWidth > STRIKE_MAX_THICKNESS) continue;
|
|
1985
|
+
const matches = [];
|
|
1986
|
+
for (const item of items) {
|
|
1987
|
+
const h = item.h > 0 ? item.h : item.fontSize;
|
|
1988
|
+
if (h <= 0 || item.w <= 0) continue;
|
|
1989
|
+
if (line.lineWidth > h * STRIKE_MAX_THICKNESS_RATIO) continue;
|
|
1990
|
+
const centerY = item.y + h * 0.4;
|
|
1991
|
+
if (Math.abs(line.y1 - centerY) > h * STRIKE_CENTER_TOLERANCE) continue;
|
|
1992
|
+
const overlap = Math.min(line.x2, item.x + item.w) - Math.max(line.x1, item.x);
|
|
1993
|
+
if (overlap / item.w < STRIKE_MIN_OVERLAP_RATIO) continue;
|
|
1994
|
+
matches.push(item);
|
|
1995
|
+
}
|
|
1996
|
+
if (matches.length === 0) continue;
|
|
1997
|
+
let totalW = 0;
|
|
1998
|
+
for (const m of matches) totalW += m.w;
|
|
1999
|
+
if (totalW <= 0 || (line.x2 - line.x1) / totalW > STRIKE_MAX_LINE_TO_TEXT_RATIO) continue;
|
|
2000
|
+
for (const m of matches) m.strike = true;
|
|
2001
|
+
}
|
|
2002
|
+
}
|
|
2003
|
+
function wrapStrikethroughRuns(items) {
|
|
2004
|
+
const struck = items.filter((i) => i.strike);
|
|
2005
|
+
if (struck.length === 0) return;
|
|
2006
|
+
const lines = /* @__PURE__ */ new Map();
|
|
2007
|
+
for (const item of struck) {
|
|
2008
|
+
const key = Math.round(item.y / 3);
|
|
2009
|
+
const arr = lines.get(key) || [];
|
|
2010
|
+
arr.push(item);
|
|
2011
|
+
lines.set(key, arr);
|
|
2012
|
+
}
|
|
2013
|
+
for (const arr of lines.values()) {
|
|
2014
|
+
arr.sort((a, b) => a.x - b.x);
|
|
2015
|
+
arr[0].text = "~~" + arr[0].text;
|
|
2016
|
+
arr[arr.length - 1].text = arr[arr.length - 1].text + "~~";
|
|
2017
|
+
}
|
|
2018
|
+
}
|
|
1632
2019
|
function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
1633
2020
|
const blocks = [];
|
|
1634
2021
|
const usedItems = /* @__PURE__ */ new Set();
|
|
@@ -1658,7 +2045,8 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
1658
2045
|
w: i.w,
|
|
1659
2046
|
h: i.h,
|
|
1660
2047
|
fontSize: i.fontSize,
|
|
1661
|
-
fontName: i.fontName
|
|
2048
|
+
fontName: i.fontName,
|
|
2049
|
+
hasSpaceBefore: i.hasSpaceBefore
|
|
1662
2050
|
}));
|
|
1663
2051
|
const cellTextMap = mapTextToCells(textItems, cells);
|
|
1664
2052
|
const numRows = grid.rowYs.length - 1;
|
|
@@ -1678,13 +2066,29 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
1678
2066
|
rowSpan: cell.rowSpan
|
|
1679
2067
|
};
|
|
1680
2068
|
}
|
|
2069
|
+
let finalGrid = irGrid;
|
|
2070
|
+
let finalRows = numRows;
|
|
2071
|
+
if (numRows <= 2 && numCols >= 3) {
|
|
2072
|
+
const rebuilt = normalizeUndersegmentedTable(irGrid, grid.colXs, textItems);
|
|
2073
|
+
if (rebuilt) {
|
|
2074
|
+
finalGrid = rebuilt.map((row) => row.map((rawText) => {
|
|
2075
|
+
const cleaned = rawText.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
|
|
2076
|
+
return {
|
|
2077
|
+
text: cleaned.split("\n").map((line) => collapseEvenSpacing(line)).join("\n"),
|
|
2078
|
+
colSpan: 1,
|
|
2079
|
+
rowSpan: 1
|
|
2080
|
+
};
|
|
2081
|
+
}));
|
|
2082
|
+
finalRows = finalGrid.length;
|
|
2083
|
+
}
|
|
2084
|
+
}
|
|
1681
2085
|
const irTable = {
|
|
1682
|
-
rows:
|
|
2086
|
+
rows: finalRows,
|
|
1683
2087
|
cols: numCols,
|
|
1684
|
-
cells:
|
|
1685
|
-
hasHeader:
|
|
2088
|
+
cells: finalGrid,
|
|
2089
|
+
hasHeader: finalRows > 1
|
|
1686
2090
|
};
|
|
1687
|
-
const hasContent =
|
|
2091
|
+
const hasContent = finalGrid.some((row) => row.some((cell) => cell.text.trim() !== ""));
|
|
1688
2092
|
if (!hasContent) continue;
|
|
1689
2093
|
const tableBbox = {
|
|
1690
2094
|
page: pageNum,
|
|
@@ -1713,7 +2117,8 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
1713
2117
|
w: i.w,
|
|
1714
2118
|
h: i.h,
|
|
1715
2119
|
fontSize: i.fontSize,
|
|
1716
|
-
fontName: i.fontName
|
|
2120
|
+
fontName: i.fontName,
|
|
2121
|
+
hasSpaceBefore: i.hasSpaceBefore
|
|
1717
2122
|
}));
|
|
1718
2123
|
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
1719
2124
|
if (clusterResults.length > 0) {
|
|
@@ -1751,6 +2156,46 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
1751
2156
|
}
|
|
1752
2157
|
return mergeAdjacentTableBlocks(blocks);
|
|
1753
2158
|
}
|
|
2159
|
+
// Maximum left/right edge drift, as a fraction of the wider table's width,
// for two tables to be treated as one table continued on the next page.
var NEIGHBOR_TABLE_EPSILON = 0.2;

/**
 * Joins a table with the table block directly after it when the second one
 * continues the first on the following page. Operates on `blocks` in place.
 *
 * Two neighbouring blocks are merged when both are tables with a `table`
 * payload, the second sits on the page right after the first, both have the
 * same column count, both carry a `bbox`, and their left and right edges
 * differ by no more than `NEIGHBOR_TABLE_EPSILON` of the wider width.
 * If the continuation has more than one row and its first row repeats the
 * first row of the leading table, that repeated row is dropped.
 * The list is walked from the end so chains of three or more pages collapse.
 *
 * @param {Array<object>} blocks Document blocks; mutated in place.
 * @returns {void}
 */
function mergeCrossPageTables(blocks) {
  for (let idx = blocks.length - 2; idx >= 0; idx -= 1) {
    const upper = blocks[idx];
    const lower = blocks[idx + 1];

    if (upper.type !== "table" || lower.type !== "table") continue;
    if (!upper.table || !lower.table) continue;
    if (!upper.pageNumber || !lower.pageNumber) continue;
    if (lower.pageNumber !== upper.pageNumber + 1) continue;
    if (upper.table.cols !== lower.table.cols) continue;
    if (!upper.bbox || !lower.bbox) continue;

    // Horizontal alignment check relative to the wider of the two tables.
    const tolerance = Math.max(upper.bbox.width, lower.bbox.width, 1) * NEIGHBOR_TABLE_EPSILON;
    const leftDrift = Math.abs(upper.bbox.x - lower.bbox.x);
    const rightDrift = Math.abs(upper.bbox.x + upper.bbox.width - (lower.bbox.x + lower.bbox.width));
    if (leftDrift > tolerance || rightDrift > tolerance) continue;

    const upperRows = upper.table.cells;
    const lowerRows = lower.table.cells;
    const repeatsHeader =
      lowerRows.length > 1 && upperRows.length > 0 && rowTextsEqual(upperRows[0], lowerRows[0]);
    const appended = repeatsHeader ? lowerRows.slice(1) : lowerRows;

    if (appended.length > 0) {
      blocks[idx] = {
        ...upper,
        table: {
          rows: upper.table.rows + appended.length,
          cols: upper.table.cols,
          cells: [...upperRows, ...appended],
          hasHeader: upper.table.hasHeader,
          caption: upper.table.caption
        }
      };
    }
    // The continuation is dropped either way: merged above, or it had no rows.
    blocks.splice(idx + 1, 1);
  }
}
|
|
2191
|
+
/**
 * Tells whether two table rows carry the same text, ignoring all whitespace.
 *
 * Rows of different length never match, and a row whose cells are all blank
 * never matches either, so empty rows are not mistaken for a repeated header.
 *
 * @param {Array<{text: string}>} a First row of cells.
 * @param {Array<{text: string}>} b Second row of cells.
 * @returns {boolean} `true` when every cell pair matches and `a` has content.
 */
function rowTextsEqual(a, b) {
  if (a.length !== b.length) return false;
  const squash = (value) => value.replace(/\s+/g, "");
  const everyCellMatches = a.every((cell, col) => squash(cell.text) === squash(b[col].text));
  if (!everyCellMatches) return false;
  return a.some((cell) => cell.text.trim() !== "");
}
|
|
1754
2199
|
function mergeAdjacentTableBlocks(blocks) {
|
|
1755
2200
|
if (blocks.length <= 1) return blocks;
|
|
1756
2201
|
const result = [blocks[0]];
|
|
@@ -1781,7 +2226,8 @@ function extractPageBlocksFallback(items, pageNum) {
|
|
|
1781
2226
|
w: i.w,
|
|
1782
2227
|
h: i.h,
|
|
1783
2228
|
fontSize: i.fontSize,
|
|
1784
|
-
fontName: i.fontName
|
|
2229
|
+
fontName: i.fontName,
|
|
2230
|
+
hasSpaceBefore: i.hasSpaceBefore
|
|
1785
2231
|
}));
|
|
1786
2232
|
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
1787
2233
|
if (clusterResults.length > 0) {
|
|
@@ -1797,7 +2243,7 @@ function extractPageBlocksFallback(items, pageNum) {
|
|
|
1797
2243
|
}
|
|
1798
2244
|
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
1799
2245
|
if (remaining.length > 0) {
|
|
1800
|
-
const yLines = groupByY(remaining);
|
|
2246
|
+
const yLines = mergeSuperscriptLines(groupByY(remaining));
|
|
1801
2247
|
for (const line of yLines) {
|
|
1802
2248
|
const text = mergeLineSimple(line);
|
|
1803
2249
|
if (!text.trim()) continue;
|
|
@@ -1811,7 +2257,7 @@ function extractPageBlocksFallback(items, pageNum) {
|
|
|
1811
2257
|
return by - ay;
|
|
1812
2258
|
});
|
|
1813
2259
|
} else {
|
|
1814
|
-
const allYLines = groupByY(items);
|
|
2260
|
+
const allYLines = mergeSuperscriptLines(groupByY(items));
|
|
1815
2261
|
const columns = detectColumns(allYLines);
|
|
1816
2262
|
if (columns && columns.length >= 3) {
|
|
1817
2263
|
const tableText = extractWithColumns(allYLines, columns);
|
|
@@ -1824,7 +2270,7 @@ function extractPageBlocksFallback(items, pageNum) {
|
|
|
1824
2270
|
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
1825
2271
|
for (const group of orderedGroups) {
|
|
1826
2272
|
if (group.length === 0) continue;
|
|
1827
|
-
const yLines = groupByY(group);
|
|
2273
|
+
const yLines = mergeSuperscriptLines(groupByY(group));
|
|
1828
2274
|
const groupColumns = detectColumns(yLines);
|
|
1829
2275
|
if (groupColumns && groupColumns.length >= 3) {
|
|
1830
2276
|
const tableText = extractWithColumns(yLines, groupColumns);
|
|
@@ -1916,16 +2362,16 @@ function normalizeItems(rawItems) {
|
|
|
1916
2362
|
if (!isDup) deduped.push(sorted[i]);
|
|
1917
2363
|
}
|
|
1918
2364
|
if (spacePositions.length > 0) {
|
|
1919
|
-
for (const
|
|
1920
|
-
|
|
1921
|
-
|
|
1922
|
-
|
|
1923
|
-
|
|
1924
|
-
|
|
1925
|
-
|
|
1926
|
-
}
|
|
2365
|
+
for (const sp of spacePositions) {
|
|
2366
|
+
let nearest = null;
|
|
2367
|
+
for (const item of deduped) {
|
|
2368
|
+
if (Math.abs(sp.y - item.y) > 3) continue;
|
|
2369
|
+
const dist = item.x - sp.x;
|
|
2370
|
+
if (dist >= -1 && dist <= 20 && (!nearest || item.x < nearest.x)) {
|
|
2371
|
+
nearest = item;
|
|
1927
2372
|
}
|
|
1928
2373
|
}
|
|
2374
|
+
if (nearest) nearest.hasSpaceBefore = true;
|
|
1929
2375
|
}
|
|
1930
2376
|
}
|
|
1931
2377
|
return deduped;
|
|
@@ -1959,6 +2405,35 @@ function groupByY(items) {
|
|
|
1959
2405
|
if (curLine.length > 0) lines.push(curLine);
|
|
1960
2406
|
return lines;
|
|
1961
2407
|
}
|
|
2408
|
+
/**
 * Folds small fragment lines (for example superscripts split off by Y
 * grouping) into the neighbouring line they vertically overlap.
 *
 * A line counts as a fragment when it has at most three items, each with
 * trimmed text of at most eight characters, its vertical band is no taller
 * than 80% of the neighbour's band, and the two bands overlap by at least
 * half of the fragment's own height. Each line is compared with the last
 * line kept so far, so several fragments can accumulate on one line.
 *
 * @param {Array<Array<object>>} lines Lines of text items, in reading order.
 * @returns {Array<Array<object>>} The input itself when it has at most one
 *   line, otherwise a new array of (possibly merged) lines.
 */
function mergeSuperscriptLines(lines) {
  if (lines.length <= 1) return lines;

  // Vertical extent of a line; items without a positive height use fontSize.
  const measure = (line) => {
    let bottom = Infinity;
    let top = -Infinity;
    line.forEach((item) => {
      const extent = item.h > 0 ? item.h : item.fontSize;
      const upperEdge = item.y + extent;
      if (item.y < bottom) bottom = item.y;
      if (upperEdge > top) top = upperEdge;
    });
    return { bottom, top, height: top - bottom };
  };

  const isShortFragment = (line) =>
    line.length <= 3 && line.every((item) => item.text.trim().length <= 8);

  const qualifies = (line, own, other, shared) =>
    isShortFragment(line) && own.height <= other.height * 0.8 && shared >= own.height * 0.5;

  const merged = [lines[0]];
  for (const current of lines.slice(1)) {
    const lastIdx = merged.length - 1;
    const previous = merged[lastIdx];
    const prevBand = measure(previous);
    const currBand = measure(current);
    const shared = Math.min(prevBand.top, currBand.top) - Math.max(prevBand.bottom, currBand.bottom);

    const previousIsFragment = qualifies(previous, prevBand, currBand, shared);
    const currentIsFragment = qualifies(current, currBand, prevBand, shared);

    if (previousIsFragment || currentIsFragment) {
      merged[lastIdx] = [...previous, ...current];
    } else {
      merged.push(current);
    }
  }
  return merged;
}
|
|
1962
2437
|
function isProseSpread(items) {
|
|
1963
2438
|
if (items.length < 4) return false;
|
|
1964
2439
|
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
@@ -2200,9 +2675,7 @@ function mergeLineSimple(items) {
|
|
|
2200
2675
|
result += sorted[i].text;
|
|
2201
2676
|
continue;
|
|
2202
2677
|
}
|
|
2203
|
-
if (gap
|
|
2204
|
-
} else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
|
|
2205
|
-
} else if (gap > 3) result += " ";
|
|
2678
|
+
if (gap > spaceGapThreshold(avgFs)) result += " ";
|
|
2206
2679
|
result += sorted[i].text;
|
|
2207
2680
|
}
|
|
2208
2681
|
return result;
|
|
@@ -2226,7 +2699,7 @@ function cleanPdfText(text) {
|
|
|
2226
2699
|
).replace(/^(?!\| ---).*$/gm, (line) => {
|
|
2227
2700
|
if (/^\s*\${1,2}.+\${1,2}\s*$/.test(line)) return line;
|
|
2228
2701
|
return collapseEvenSpacing(line);
|
|
2229
|
-
}).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
|
|
2702
|
+
}).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\\~\\~/g, "~~").replace(/~~~~/g, "").replace(/\n{3,}/g, "\n\n").trim();
|
|
2230
2703
|
}
|
|
2231
2704
|
function startsWithMarker(line) {
|
|
2232
2705
|
const t = line.trimStart();
|
|
@@ -2235,6 +2708,134 @@ function startsWithMarker(line) {
|
|
|
2235
2708
|
function isStandaloneHeader(line) {
|
|
2236
2709
|
return /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,7}$/.test(line.trim());
|
|
2237
2710
|
}
|
|
2711
|
+
// Caption prefix such as "표 1.", "<그림 2>", "Table 3:" or "Fig. 4",
// optionally wrapped in brackets.
var TABLE_CAPTION_RE = /^[<\[(【〈]?\s*(표|그림|도표|Table|Figure|Fig\.?)\s*[\d①-⑮][\d.\-]*\s*[\])】〉>]?[.:]?\s*/i;
// Longest trimmed paragraph text still accepted as a caption.
var CAPTION_MAX_LENGTH = 100;
// Largest vertical gap (in bbox units) allowed between caption and table.
var CAPTION_MAX_GAP = 30;

/**
 * Attaches caption paragraphs to the tables they sit next to, in place.
 *
 * For every table block without a caption, the block right before it is
 * tried first and the block right after it second. A neighbour qualifies when
 * it is a single-line paragraph on the same page, at most
 * `CAPTION_MAX_LENGTH` characters long, starting with a caption prefix, and
 * (only when both blocks have a `bbox`) lying within `CAPTION_MAX_GAP`
 * vertically while sharing at least 30% of the narrower block's width.
 * A matched paragraph is stored as `table.caption` and removed from `blocks`.
 *
 * @param {Array<object>} blocks Document blocks; mutated in place.
 * @returns {void}
 */
function detectTableCaptions(blocks) {
  const qualifies = (candidate, tableBlock) => {
    if (!candidate) return false;
    if (candidate.type !== "paragraph" || !candidate.text) return false;
    if (candidate.pageNumber !== tableBlock.pageNumber) return false;

    const label = candidate.text.trim();
    if (!label) return false;
    if (label.length > CAPTION_MAX_LENGTH || label.includes("\n")) return false;
    if (!TABLE_CAPTION_RE.test(label)) return false;

    const capBox = candidate.bbox;
    const tblBox = tableBlock.bbox;
    // Without geometry on both sides the textual checks alone decide.
    if (!capBox || !tblBox) return true;

    const capLow = capBox.y;
    const capHigh = capBox.y + capBox.height;
    const tblLow = tblBox.y;
    const tblHigh = tblBox.y + tblBox.height;
    const distance = capLow >= tblHigh ? capLow - tblHigh : tblLow - capHigh;
    if (distance > CAPTION_MAX_GAP) return false;

    const sharedWidth =
      Math.min(capBox.x + capBox.width, tblBox.x + tblBox.width) - Math.max(capBox.x, tblBox.x);
    return !(sharedWidth < Math.min(capBox.width, tblBox.width) * 0.3);
  };

  for (let pos = 0; pos < blocks.length; pos += 1) {
    const current = blocks[pos];
    if (current.type !== "table" || !current.table || current.table.caption) continue;

    const before = blocks[pos - 1];
    if (qualifies(before, current)) {
      current.table.caption = before.text.trim();
      blocks.splice(pos - 1, 1);
      // The table moved one slot to the left; keep the cursor on it.
      pos -= 1;
      continue;
    }

    const after = blocks[pos + 1];
    if (qualifies(after, current)) {
      current.table.caption = after.text.trim();
      blocks.splice(pos + 1, 1);
    }
  }
}
|
|
2746
|
+
// Ordered Korean list markers (가, 나, 다, … 하); position + 1 is the ordinal.
var KOREAN_LIST_SEQ = "\uAC00\uB098\uB2E4\uB77C\uB9C8\uBC14\uC0AC\uC544\uC790\uCC28\uCE74\uD0C0\uD30C\uD558";

/**
 * Recognises a list marker at the start of `text`.
 *
 * Marker families are tried in a fixed order: "1. " (`arabicDot`), "가. "
 * (`korDot`), "1)" (`arabicParen`), "가)" (`korParen`) and circled digits
 * ①–⑮ (`circled`). A Hangul syllable that is not one of the markers in
 * `KOREAN_LIST_SEQ` does not count, and later families are still tried.
 *
 * @param {string} text Text to inspect; callers are expected to trim it.
 * @returns {{family: string, ord: number} | null} Marker family and 1-based
 *   ordinal (the parsed number for the arabic families), or `null`.
 */
function parseListLabel(text) {
  const toNumber = (digits) => parseInt(digits, 10);
  const toKoreanOrdinal = (syllable) => {
    const position = KOREAN_LIST_SEQ.indexOf(syllable);
    return position >= 0 ? position + 1 : null;
  };
  const toCircledOrdinal = (glyph) => glyph.charCodeAt(0) - 9312 + 1;

  // Order matters: the first family that yields an ordinal wins.
  const families = [
    ["arabicDot", /^(\d{1,2})\.(?!\d)\s+/, toNumber],
    ["korDot", /^([가-하])\.\s+/, toKoreanOrdinal],
    ["arabicParen", /^(\d{1,2})\)\s*/, toNumber],
    ["korParen", /^([가-하])\)\s*/, toKoreanOrdinal],
    ["circled", /^([①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮])\s*/, toCircledOrdinal]
  ];

  for (const [family, pattern, toOrdinal] of families) {
    const hit = text.match(pattern);
    if (!hit) continue;
    const ord = toOrdinal(hit[1]);
    if (ord !== null) return { family, ord };
  }
  return null;
}
|
|
2766
|
+
// "붙임" (attachment) lines, optionally numbered, e.g. "붙임 1. …" or "붙 임 …".
var ATTACHMENT_RE = /^붙\s*임\s*(\d+[.:]?)?\s/;

/**
 * Converts labelled paragraphs into list blocks and nests sub-lists, in place.
 *
 * Pass 1 collects every paragraph/list block whose trimmed text starts with a
 * marker recognised by `parseListLabel`, grouped by marker family.
 * Pass 2 validates markers: within a family, only chains of at least two
 * entries with consecutive ordinals and at most 20 blocks between neighbours
 * are accepted, so an isolated "1." is left untouched.
 * Pass 3 walks the blocks in order. Tables, headings and separators reset the
 * nesting state. Paragraphs matching `ATTACHMENT_RE` become unordered lists.
 * Validated blocks become lists (`ordered` only for the `arabicDot` family);
 * the first family seen is depth 0, and blocks of deeper families are moved
 * into the `children` of the most recent depth-0 list and removed from
 * `blocks`.
 *
 * @param {Array<object>} blocks Document blocks; mutated in place.
 * @returns {void}
 */
function detectKoreanListBlocks(blocks) {
  const isTextual = (blk) => (blk.type === "paragraph" || blk.type === "list") && Boolean(blk.text);

  // Pass 1: labelled blocks per marker family, in document order.
  const familyRuns = /* @__PURE__ */ new Map();
  for (let idx = 0; idx < blocks.length; idx += 1) {
    const blk = blocks[idx];
    if (!isTextual(blk)) continue;
    const label = parseListLabel(blk.text.trim());
    if (!label) continue;
    const entry = { idx, label };
    const run = familyRuns.get(label.family);
    if (run) run.push(entry);
    else familyRuns.set(label.family, [entry]);
  }

  // Pass 2: keep only markers that belong to a consecutive chain of length >= 2.
  const validated = /* @__PURE__ */ new Set();
  const commit = (chain) => {
    if (chain.length < 2) return;
    chain.forEach((entry) => validated.add(entry.idx));
  };
  for (const run of familyRuns.values()) {
    let chain = [];
    for (const entry of run) {
      const tail = chain[chain.length - 1];
      const continuesChain =
        tail && entry.label.ord === tail.label.ord + 1 && entry.idx - tail.idx <= 20;
      if (continuesChain) {
        chain.push(entry);
        continue;
      }
      commit(chain);
      chain = [entry];
    }
    commit(chain);
  }

  // Pass 3: rewrite blocks and hang nested entries under the last top-level list.
  let familyStack = [];
  let lastTopLevelList = null;
  const nestedIndices = [];
  for (let idx = 0; idx < blocks.length; idx += 1) {
    const blk = blocks[idx];
    const breaksList = blk.type === "table" || blk.type === "heading" || blk.type === "separator";
    if (breaksList) {
      familyStack = [];
      lastTopLevelList = null;
      continue;
    }
    if (!isTextual(blk)) continue;

    const text = blk.text.trim();
    if (blk.type === "paragraph" && ATTACHMENT_RE.test(text)) {
      blocks[idx] = { ...blk, type: "list", listType: "unordered" };
      continue;
    }
    if (!validated.has(idx)) continue;

    const { family } = parseListLabel(text);
    const known = familyStack.indexOf(family);
    let depth;
    if (known >= 0) {
      // Returning to an outer family closes every deeper level.
      depth = known;
      familyStack = familyStack.slice(0, known + 1);
    } else {
      familyStack.push(family);
      depth = familyStack.length - 1;
    }

    const listBlock = {
      ...blk,
      type: "list",
      listType: family === "arabicDot" ? "ordered" : "unordered"
    };

    const parent = depth === 0 ? null : lastTopLevelList;
    if (parent) {
      if (!parent.children) parent.children = [];
      parent.children.push(listBlock);
      nestedIndices.push(idx);
    } else {
      blocks[idx] = listBlock;
      lastTopLevelList = listBlock;
    }
  }

  // Indices were gathered in ascending order; delete back to front so the
  // remaining indices stay valid.
  for (let k = nestedIndices.length - 1; k >= 0; k -= 1) {
    blocks.splice(nestedIndices[k], 1);
  }
}
|
|
2238
2839
|
function detectListBlocks(blocks) {
|
|
2239
2840
|
const result = [];
|
|
2240
2841
|
for (let i = 0; i < blocks.length; i++) {
|
|
@@ -2344,7 +2945,6 @@ function detectSpecialKoreanTables(blocks) {
|
|
|
2344
2945
|
function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
|
|
2345
2946
|
const ZONE_RATIO = 0.12;
|
|
2346
2947
|
const MIN_REPEAT = 3;
|
|
2347
|
-
const Y_BUCKET = 5;
|
|
2348
2948
|
const topEntries = [];
|
|
2349
2949
|
const bottomEntries = [];
|
|
2350
2950
|
for (let bi = 0; bi < blocks.length; bi++) {
|
|
@@ -2354,7 +2954,7 @@ function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
|
|
|
2354
2954
|
if (!ph) continue;
|
|
2355
2955
|
const blockTop = ph - (b.bbox.y + b.bbox.height);
|
|
2356
2956
|
const blockBottom = ph - b.bbox.y;
|
|
2357
|
-
const entry = { blockIdx: bi, page: b.pageNumber,
|
|
2957
|
+
const entry = { blockIdx: bi, page: b.pageNumber, text: b.text.trim() };
|
|
2358
2958
|
if (blockBottom <= ph * ZONE_RATIO) bottomEntries.push(entry);
|
|
2359
2959
|
else if (blockTop >= ph * (1 - ZONE_RATIO)) topEntries.push(entry);
|
|
2360
2960
|
}
|
|
@@ -2376,21 +2976,9 @@ function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
|
|
|
2376
2976
|
repeatedPatterns.add(p);
|
|
2377
2977
|
}
|
|
2378
2978
|
}
|
|
2379
|
-
const bucketPages = /* @__PURE__ */ new Map();
|
|
2380
|
-
for (const e of entries) {
|
|
2381
|
-
const bucket = Math.round(e.y / Y_BUCKET);
|
|
2382
|
-
const pages = bucketPages.get(bucket) || /* @__PURE__ */ new Set();
|
|
2383
|
-
pages.add(e.page);
|
|
2384
|
-
bucketPages.set(bucket, pages);
|
|
2385
|
-
}
|
|
2386
|
-
const repeatedBuckets = /* @__PURE__ */ new Set();
|
|
2387
|
-
for (const [b, pages] of bucketPages) {
|
|
2388
|
-
if (pages.size >= MIN_REPEAT) repeatedBuckets.add(b);
|
|
2389
|
-
}
|
|
2390
2979
|
for (const e of entries) {
|
|
2391
2980
|
const norm = e.text.replace(/\d+/g, "#");
|
|
2392
|
-
|
|
2393
|
-
if (repeatedPatterns.has(norm) || repeatedBuckets.has(bucket)) {
|
|
2981
|
+
if (repeatedPatterns.has(norm)) {
|
|
2394
2982
|
removeSet.add(e.blockIdx);
|
|
2395
2983
|
}
|
|
2396
2984
|
}
|
|
@@ -2549,7 +3137,11 @@ function formatMb(bytes) {
|
|
|
2549
3137
|
}
|
|
2550
3138
|
export {
|
|
2551
3139
|
cleanPdfText,
|
|
3140
|
+
detectKoreanListBlocks,
|
|
3141
|
+
detectTableCaptions,
|
|
2552
3142
|
extractPdfMetadataOnly,
|
|
2553
|
-
|
|
3143
|
+
mergeCrossPageTables,
|
|
3144
|
+
parsePdfDocument,
|
|
3145
|
+
removeHeaderFooterBlocks
|
|
2554
3146
|
};
|
|
2555
|
-
//# sourceMappingURL=parser-
|
|
3147
|
+
//# sourceMappingURL=parser-AU2NLC44.js.map
|