kordoc 2.9.1 → 3.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +37 -0
- package/dist/-5BWAV4ZY.js +73 -0
- package/dist/-5BWAV4ZY.js.map +1 -0
- package/dist/chunk-MUOQXDZ4.cjs.map +1 -1
- package/dist/{chunk-FWAXCTSX.cjs → chunk-NBJB6TJB.cjs} +185 -16
- package/dist/chunk-NBJB6TJB.cjs.map +1 -0
- package/dist/{chunk-ODF24QXC.js → chunk-O5P6EG5L.js} +182 -13
- package/dist/chunk-O5P6EG5L.js.map +1 -0
- package/dist/{chunk-Z6TLTWYK.js → chunk-X3SCCO5Q.js} +182 -13
- package/dist/chunk-X3SCCO5Q.js.map +1 -0
- package/dist/{chunk-GQQNAYZA.js → chunk-X7VQVMXQ.js} +7453 -3997
- package/dist/chunk-X7VQVMXQ.js.map +1 -0
- package/dist/cli.js +44 -3
- package/dist/cli.js.map +1 -1
- package/dist/formula-XGG6ZP42.cjs.map +1 -1
- package/dist/index.cjs +4087 -829
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +87 -2
- package/dist/index.d.ts +87 -2
- package/dist/index.js +3867 -609
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/page-range-3C7UGGEK.cjs.map +1 -1
- package/dist/{parser-FJNQEW7K.js → parser-3N6FZSKU.js} +677 -85
- package/dist/parser-3N6FZSKU.js.map +1 -0
- package/dist/{parser-BTIPAEDZ.cjs → parser-5FZJVLQL.cjs} +689 -97
- package/dist/parser-5FZJVLQL.cjs.map +1 -0
- package/dist/{parser-BKYM3LKN.js → parser-LZH7ZELV.js} +677 -85
- package/dist/parser-LZH7ZELV.js.map +1 -0
- package/dist/provider-SNONEZNW.cjs.map +1 -1
- package/dist/{watch-SBLSWHL7.js → watch-4FMRS7QU.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-FWAXCTSX.cjs.map +0 -1
- package/dist/chunk-GQQNAYZA.js.map +0 -1
- package/dist/chunk-ODF24QXC.js.map +0 -1
- package/dist/chunk-Z6TLTWYK.js.map +0 -1
- package/dist/parser-BKYM3LKN.js.map +0 -1
- package/dist/parser-BTIPAEDZ.cjs.map +0 -1
- package/dist/parser-FJNQEW7K.js.map +0 -1
- package/dist/{watch-SBLSWHL7.js.map → watch-4FMRS7QU.js.map} +0 -0
|
@@ -6,7 +6,7 @@ import {
|
|
|
6
6
|
blocksToMarkdown,
|
|
7
7
|
safeMax,
|
|
8
8
|
safeMin
|
|
9
|
-
} from "./chunk-
|
|
9
|
+
} from "./chunk-X3SCCO5Q.js";
|
|
10
10
|
import {
|
|
11
11
|
parsePageRange
|
|
12
12
|
} from "./chunk-SBVRCJFH.js";
|
|
@@ -156,6 +156,55 @@ function extractLines(fnArray, argsArray) {
|
|
|
156
156
|
}
|
|
157
157
|
return { horizontals, verticals };
|
|
158
158
|
}
|
|
159
|
+
/**
 * Compose two 2-D affine transforms given in six-element
 * `[a, b, c, d, e, f]` form.
 *
 * The result maps a point through `t` first and then through `m`.
 *
 * @param {number[]} m - Outer (already accumulated) transform.
 * @param {number[]} t - Inner transform to append.
 * @returns {number[]} A new six-element matrix; neither input is modified.
 */
function multiplyTransform(m, t) {
  const ma = m[0];
  const mb = m[1];
  const mc = m[2];
  const md = m[3];
  const linear = (u, v) => [ma * u + mc * v, mb * u + md * v];
  const [a, b] = linear(t[0], t[1]);
  const [c, d] = linear(t[2], t[3]);
  // The translation column also picks up m's own offset.
  const e = ma * t[4] + mc * t[5] + m[4];
  const f = mb * t[4] + md * t[5] + m[5];
  return [a, b, c, d, e, f];
}
|
|
169
|
+
/**
 * Walk a page operator list and collect the axis-aligned bounding box of
 * every painted image, in the coordinate space of the accumulated
 * transformation matrix (CTM).
 *
 * Images are painted into the unit square, so each region is the bounding box
 * of the unit square's four corners mapped through the CTM that is current at
 * the paint operator. Degenerate boxes (zero width or height) are dropped.
 *
 * Form XObjects are tracked as well: `paintFormXObjectBegin` carries
 * `[matrix, bbox]` and implies a graphics-state save plus a transform by
 * `matrix`, and `paintFormXObjectEnd` implies the matching restore. Without
 * this, images nested in a form would get wrong boxes and any transform set
 * inside a form would leak into the rest of the page.
 *
 * @param {number[]} fnArray - Operator codes (compared against `OPS`).
 * @param {Array} argsArray - Operator arguments, parallel to `fnArray`.
 * @returns {{x1:number,y1:number,x2:number,y2:number}[]} One box per image.
 */
function extractImageRegions(fnArray, argsArray) {
  const IDENTITY = [1, 0, 0, 1, 0, 0];
  const UNIT_CORNERS = [[0, 0], [1, 0], [0, 1], [1, 1]];
  const regions = [];
  const stack = [];
  // `ctm` is only ever reassigned, never mutated, so pushing the reference is safe.
  let ctm = IDENTITY;
  for (let i = 0; i < fnArray.length; i++) {
    switch (fnArray[i]) {
      case OPS.save:
        stack.push(ctm);
        break;
      case OPS.restore:
        // An unbalanced restore falls back to the identity matrix.
        ctm = stack.pop() || IDENTITY;
        break;
      case OPS.transform: {
        const t = argsArray[i];
        if (Array.isArray(t) && t.length >= 6) ctm = multiplyTransform(ctm, t);
        break;
      }
      case OPS.paintFormXObjectBegin: {
        stack.push(ctm);
        // The matrix may be absent (null) or a typed array, so test it as an array-like.
        const matrix = argsArray[i]?.[0];
        if (matrix != null && matrix.length >= 6) ctm = multiplyTransform(ctm, matrix);
        break;
      }
      case OPS.paintFormXObjectEnd:
        ctm = stack.pop() || IDENTITY;
        break;
      case OPS.paintImageXObject:
      case OPS.paintInlineImageXObject:
      case OPS.paintImageMaskXObject:
      case OPS.paintImageXObjectRepeat: {
        let x1 = Infinity;
        let y1 = Infinity;
        let x2 = -Infinity;
        let y2 = -Infinity;
        for (const [u, v] of UNIT_CORNERS) {
          const x = ctm[0] * u + ctm[2] * v + ctm[4];
          const y = ctm[1] * u + ctm[3] * v + ctm[5];
          if (x < x1) x1 = x;
          if (x > x2) x2 = x;
          if (y < y1) y1 = y;
          if (y > y2) y2 = y;
        }
        if (x2 - x1 > 0 && y2 - y1 > 0) regions.push({ x1, y1, x2, y2 });
        break;
      }
    }
  }
  return regions;
}
|
|
159
208
|
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
|
|
160
209
|
const dx = Math.abs(seg.x2 - seg.x1);
|
|
161
210
|
const dy = Math.abs(seg.y2 - seg.y1);
|
|
@@ -541,6 +590,10 @@ function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
|
|
|
541
590
|
}
|
|
542
591
|
return false;
|
|
543
592
|
}
|
|
593
|
+
// Fraction of the font size used as the horizontal-gap threshold.
var SPACE_GAP_RATIO = 0.17;
/**
 * Gap threshold for a given font size: the font size scaled by
 * SPACE_GAP_RATIO, but never below 1.
 *
 * @param {number} fontSize - Font size in the same units as the gap.
 * @returns {number} The threshold.
 */
function spaceGapThreshold(fontSize) {
  const scaled = fontSize * SPACE_GAP_RATIO;
  return Math.max(scaled, 1);
}
|
|
544
597
|
function mapTextToCells(items, cells) {
|
|
545
598
|
const result = /* @__PURE__ */ new Map();
|
|
546
599
|
for (const cell of cells) {
|
|
@@ -600,14 +653,12 @@ function cellTextToString(items) {
|
|
|
600
653
|
}
|
|
601
654
|
const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
|
|
602
655
|
const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
|
|
603
|
-
|
|
604
|
-
const currIsKorean = /^[가-힣]/.test(s[j].text);
|
|
605
|
-
if (gap < avgFs * 0.15) {
|
|
606
|
-
result += s[j].text;
|
|
607
|
-
} else if (gap < avgFs * 0.35 && (prevIsKorean || currIsKorean)) {
|
|
608
|
-
result += s[j].text;
|
|
609
|
-
} else {
|
|
656
|
+
if (s[j].hasSpaceBefore && gap >= avgFs * 0.05) {
|
|
610
657
|
result += " " + s[j].text;
|
|
658
|
+
} else if (gap > spaceGapThreshold(avgFs)) {
|
|
659
|
+
result += " " + s[j].text;
|
|
660
|
+
} else {
|
|
661
|
+
result += s[j].text;
|
|
611
662
|
}
|
|
612
663
|
}
|
|
613
664
|
return result;
|
|
@@ -620,6 +671,11 @@ function detectEvenSpacedItems(items) {
|
|
|
620
671
|
let runStart = -1;
|
|
621
672
|
for (let i = 0; i < items.length; i++) {
|
|
622
673
|
const isShortKorean = /^[가-힣]{1}$/.test(items[i].text) || /^[\d]{1}$/.test(items[i].text);
|
|
674
|
+
if (isShortKorean && runStart >= 0 && items[i].hasSpaceBefore) {
|
|
675
|
+
if (i - runStart >= 3) markEvenRun(items, result, runStart, i);
|
|
676
|
+
runStart = i;
|
|
677
|
+
continue;
|
|
678
|
+
}
|
|
623
679
|
if (isShortKorean && runStart >= 0 && i > 0) {
|
|
624
680
|
const gap = items[i].x - (items[i - 1].x + items[i - 1].w);
|
|
625
681
|
const maxRunGap = Math.max(items[i].fontSize * 3, 30);
|
|
@@ -662,6 +718,119 @@ function markEvenRun(items, result, start, end) {
|
|
|
662
718
|
}
|
|
663
719
|
}
|
|
664
720
|
}
|
|
721
|
+
// Tuning constants read by normalizeUndersegmentedTable.

// A grid with more rows than this is never rebuilt.
var MAX_UNDERSEGMENTED_ROWS = 2;
// A grid with fewer columns than this is never rebuilt.
var MIN_UNDERSEGMENTED_COLUMNS = 3;
// A column counts as "dense" once it holds at least this many visual text lines.
var MIN_UNDERSEGMENTED_TEXT_LINES = 8;
// The text must form at least (grid rows + this) row bands for a rebuild.
var MIN_ROW_BAND_MISMATCH = 2;
// Lower bound on the centre-distance tolerance used when matching a line to a band.
var MIN_ROW_BAND_EPSILON = 3;
// That tolerance is otherwise this fraction of the smaller of the two heights.
var ROW_BAND_EPSILON_RATIO = 0.6;
|
|
727
|
+
/**
 * Vertical midpoint of a text item: `y` plus half of its height.
 * The height is `item.h` when positive, otherwise `item.fontSize`.
 *
 * @param {{y:number,h:number,fontSize:number}} item
 * @returns {number}
 */
function itemCenterY(item) {
  const height = item.h > 0 ? item.h : item.fontSize;
  return item.y + height / 2;
}
|
|
730
|
+
/**
 * Height of a text item: `item.h` when it is positive, otherwise
 * `item.fontSize`.
 *
 * @param {{h:number,fontSize:number}} item
 * @returns {number}
 */
function itemHeight(item) {
  if (item.h > 0) return item.h;
  return item.fontSize;
}
|
|
733
|
+
/**
 * Pick the column an item belongs to, judged by its horizontal midpoint.
 *
 * `colXs` lists the column boundaries, so column `c` spans
 * `colXs[c]..colXs[c + 1]` (both ends inclusive; the first match wins on a
 * shared boundary). If the midpoint lies in no column, the column whose
 * centre is nearest is returned, defaulting to 0.
 *
 * @param {{x:number,w:number}} item
 * @param {number[]} colXs - Column boundary x-coordinates.
 * @returns {number} Zero-based column index.
 */
function findColumnIndex(item, colXs) {
  const midX = item.x + item.w / 2;
  const columnCount = colXs.length - 1;
  let col = 0;
  while (col < columnCount) {
    if (colXs[col] <= midX && midX <= colXs[col + 1]) return col;
    col++;
  }
  // Outside every column: fall back to the nearest column centre.
  let nearest = 0;
  let nearestDist = Infinity;
  for (col = 0; col < columnCount; col++) {
    const dist = Math.abs(midX - (colXs[col] + colXs[col + 1]) / 2);
    if (dist < nearestDist) {
      nearestDist = dist;
      nearest = col;
    }
  }
  return nearest;
}
|
|
750
|
+
/**
 * Bucket text items into visual lines.
 *
 * Items are ordered by descending `y`, then ascending `x`. An item joins the
 * current line when its `y` is within a tolerance of the `y` of the item
 * that opened the line; otherwise it opens a new line. The tolerance is
 * 60% of the smaller of the two font sizes, with a floor of 3.
 *
 * @param {{x:number,y:number,fontSize:number}[]} items - Not modified.
 * @returns {Array<Array<object>>} Lines in the sorted order described above.
 */
function groupItemsToVisualLines(items) {
  if (items.length === 0) return [];
  const ordered = items.slice().sort((a, b) => b.y - a.y || a.x - b.x);
  let current = [ordered[0]];
  let anchorY = ordered[0].y;
  const lines = [current];
  for (const item of ordered.slice(1)) {
    const tolerance = Math.max(3, Math.min(item.fontSize, current[0].fontSize) * 0.6);
    const sameLine = Math.abs(item.y - anchorY) <= tolerance;
    if (sameLine) {
      current.push(item);
    } else {
      current = [item];
      anchorY = item.y;
      lines.push(current);
    }
  }
  return lines;
}
|
|
769
|
+
/**
 * Try to rebuild a table whose ruled grid has too few rows for the text it
 * contains, by deriving rows from the text's own visual lines.
 *
 * Returns `null` (keep the original grid) unless all of these hold:
 *  - the grid has at most MAX_UNDERSEGMENTED_ROWS rows and at least
 *    MIN_UNDERSEGMENTED_COLUMNS columns, and there is at least one item;
 *  - at least two columns each contain MIN_UNDERSEGMENTED_TEXT_LINES or more
 *    visual lines;
 *  - the text forms at least `numRows + MIN_ROW_BAND_MISMATCH` row bands;
 *  - the rebuilt grid has strictly more non-empty rows, and no fewer
 *    non-empty columns, than the original.
 *
 * @param {Array<Array<string|{text:string}>>} originalCells - Current grid rows.
 * @param {number[]} colXs - Column boundary x-coordinates.
 * @param {object[]} items - Text items inside the table.
 * @returns {string[][]|null} Rebuilt rows of cell text (top band first), or null.
 */
function normalizeUndersegmentedTable(originalCells, colXs, items) {
  const numRows = originalCells.length;
  const numCols = colXs.length - 1;
  const shapeQualifies = numRows <= MAX_UNDERSEGMENTED_ROWS && numCols >= MIN_UNDERSEGMENTED_COLUMNS;
  if (!shapeQualifies) return null;
  if (items.length === 0) return null;

  const emptyColumns = () => Array.from({ length: numCols }, () => []);
  const visibleItems = items.filter((it) => it.text.trim());

  // Density gate: at least two columns must each hold many visual lines.
  const perColumn = emptyColumns();
  for (const it of visibleItems) {
    perColumn[findColumnIndex(it, colXs)].push(it);
  }
  const denseColumns = perColumn.filter(
    (colItems) => groupItemsToVisualLines(colItems).length >= MIN_UNDERSEGMENTED_TEXT_LINES
  ).length;
  if (denseColumns < 2) return null;

  // Fold every visual line into a row band, matching on centre distance or vertical overlap.
  const bands = [];
  for (const line of groupItemsToVisualLines(visibleItems)) {
    let centerSum = 0;
    let heightSum = 0;
    for (const it of line) {
      centerSum += itemCenterY(it);
      heightSum += itemHeight(it);
    }
    const cy = centerSum / line.length;
    const h = heightSum / line.length;
    const top = cy + h / 2;
    const bottom = cy - h / 2;

    let band = bands.find((candidate) => {
      const epsilon = Math.max(MIN_ROW_BAND_EPSILON, Math.min(candidate.avgHeight, h) * ROW_BAND_EPSILON_RATIO);
      const centersClose = Math.abs(candidate.centerY - cy) <= epsilon;
      return centersClose || (bottom <= candidate.topY && top >= candidate.bottomY);
    });
    if (!band) {
      band = { centerY: 0, avgHeight: 0, topY: -Infinity, bottomY: Infinity, lineCount: 0, itemsByCol: emptyColumns() };
      bands.push(band);
    }

    // Running averages over the lines folded into this band so far.
    const seen = band.lineCount;
    band.centerY = (band.centerY * seen + cy) / (seen + 1);
    band.avgHeight = (band.avgHeight * seen + h) / (seen + 1);
    band.topY = Math.max(band.topY, top);
    band.bottomY = Math.min(band.bottomY, bottom);
    band.lineCount = seen + 1;
    for (const it of line) {
      band.itemsByCol[findColumnIndex(it, colXs)].push(it);
    }
  }
  if (bands.length < numRows + MIN_ROW_BAND_MISMATCH) return null;

  bands.sort((a, b) => b.centerY - a.centerY);
  const rebuilt = bands.map((band) =>
    band.itemsByCol.map((colItems) => (colItems.length > 0 ? cellTextToString(colItems) : ""))
  );

  // Cells are either plain strings (rebuilt) or objects carrying `.text` (original).
  const textOf = (cell) => (typeof cell === "string" ? cell : cell.text);
  const countNonEmptyRows = (cells) =>
    cells.filter((row) => row.some((cell) => textOf(cell).trim() !== "")).length;
  const countNonEmptyCols = (cells, cols) => {
    let filled = 0;
    for (let c = 0; c < cols; c++) {
      if (cells.some((row) => row[c] != null && textOf(row[c]).trim() !== "")) filled++;
    }
    return filled;
  };

  // Accept the rebuild only if it adds rows without losing any column.
  if (countNonEmptyRows(rebuilt) <= countNonEmptyRows(originalCells)) return null;
  if (countNonEmptyCols(rebuilt, numCols) < countNonEmptyCols(originalCells, numCols)) return null;
  return rebuilt;
}
|
|
665
834
|
function mergeCellTextLines(textLines) {
|
|
666
835
|
if (textLines.length <= 1) return textLines[0] || "";
|
|
667
836
|
const merged = [textLines[0]];
|
|
@@ -694,7 +863,7 @@ var MIN_COL_FILL_RATIO = 0.4;
|
|
|
694
863
|
function detectClusterTables(items, pageNum) {
|
|
695
864
|
if (items.length < MIN_ROWS * MIN_COLS) return [];
|
|
696
865
|
const { merged, originMap } = mergeEvenSpacedClusters(items);
|
|
697
|
-
const rows = groupByBaseline(merged);
|
|
866
|
+
const rows = mergeOverlappingRows(groupByBaseline(merged));
|
|
698
867
|
if (rows.length < MIN_ROWS) return [];
|
|
699
868
|
const results = [];
|
|
700
869
|
const headerResult = detectHeaderRow(rows);
|
|
@@ -743,6 +912,7 @@ function mergeEvenSpacedClusters(items) {
|
|
|
743
912
|
if (/^[가-힣\d]$/.test(sorted[i].text)) {
|
|
744
913
|
let runEnd = i + 1;
|
|
745
914
|
while (runEnd < sorted.length && /^[가-힣\d]$/.test(sorted[runEnd].text)) {
|
|
915
|
+
if (sorted[runEnd].hasSpaceBefore) break;
|
|
746
916
|
const gap = sorted[runEnd].x - (sorted[runEnd - 1].x + sorted[runEnd - 1].w);
|
|
747
917
|
const fs = sorted[runEnd].fontSize;
|
|
748
918
|
if (gap < fs * 0.1 || gap > fs * 3) break;
|
|
@@ -833,6 +1003,38 @@ function detectHeaderRow(rows) {
|
|
|
833
1003
|
}
|
|
834
1004
|
return null;
|
|
835
1005
|
}
|
|
1006
|
+
/**
 * Fold small "fragment" rows into a vertically overlapping neighbour.
 *
 * Each row is compared with the last row kept so far. One of the two is
 * absorbed when it is a fragment row (see isFragmentRow), its band height is
 * at most 80% of the other's, and the two bands overlap by at least half of
 * the fragment's height. The merged row takes the `y` of the non-fragment
 * row (the previous row's `y` when only the current row is the fragment) and
 * the items of both, previous first.
 *
 * @param {{y:number,items:object[]}[]} rows
 * @returns {{y:number,items:object[]}[]} The same array when it has 0 or 1
 *   rows, otherwise a new array.
 */
function mergeOverlappingRows(rows) {
  if (rows.length <= 1) return rows;
  const merged = [rows[0]];
  for (const curr of rows.slice(1)) {
    const lastIdx = merged.length - 1;
    const prev = merged[lastIdx];
    const prevBand = rowBand(prev);
    const currBand = rowBand(curr);
    const overlap = Math.min(prevBand.top, currBand.top) - Math.max(prevBand.bottom, currBand.bottom);
    const isAbsorbable = (row, own, other) =>
      isFragmentRow(row) && own.height <= other.height * 0.8 && overlap >= own.height * 0.5;
    const prevIsFrag = isAbsorbable(prev, prevBand, currBand);
    const currIsFrag = isAbsorbable(curr, currBand, prevBand);
    if (!prevIsFrag && !currIsFrag) {
      merged.push(curr);
      continue;
    }
    merged[lastIdx] = {
      y: prevIsFrag ? curr.y : prev.y,
      items: [...prev.items, ...curr.items]
    };
  }
  return merged;
}
|
|
1026
|
+
/**
 * A row is a "fragment" when it has at most three items and every item's
 * text is at most eight characters long.
 *
 * @param {{items:{text:string}[]}} row
 * @returns {boolean}
 */
function isFragmentRow(row) {
  const { items } = row;
  if (items.length > 3) return false;
  return !items.some((item) => item.text.length > 8);
}
|
|
1029
|
+
/**
 * Vertical extent of a row: the lowest `y` among its items, the highest
 * `y + height`, and the distance between them. An item's height is `h` when
 * positive, otherwise `fontSize`.
 *
 * For a row with no items the result is
 * `{ bottom: Infinity, top: -Infinity, height: -Infinity }`.
 *
 * @param {{items:{y:number,h:number,fontSize:number}[]}} row
 * @returns {{bottom:number,top:number,height:number}}
 */
function rowBand(row) {
  const extent = { bottom: Infinity, top: -Infinity };
  row.items.forEach((item) => {
    const height = item.h > 0 ? item.h : item.fontSize;
    const upper = item.y + height;
    if (item.y < extent.bottom) extent.bottom = item.y;
    if (upper > extent.top) extent.top = upper;
  });
  return { bottom: extent.bottom, top: extent.top, height: extent.top - extent.bottom };
}
|
|
836
1038
|
function mergeMultiLineRows(rows, columns) {
|
|
837
1039
|
if (rows.length <= 1) return rows;
|
|
838
1040
|
const result = [rows[0]];
|
|
@@ -1319,6 +1521,8 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1319
1521
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
1320
1522
|
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
1321
1523
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
1524
|
+
const pagesWithLargeImage = /* @__PURE__ */ new Set();
|
|
1525
|
+
const skippedImagePages = /* @__PURE__ */ new Map();
|
|
1322
1526
|
let parsedPages = 0;
|
|
1323
1527
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
1324
1528
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
@@ -1337,6 +1541,23 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1337
1541
|
if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
|
|
1338
1542
|
}
|
|
1339
1543
|
const opList = await page.getOperatorList();
|
|
1544
|
+
const pageArea = viewport.width * viewport.height;
|
|
1545
|
+
if (pageArea > 0) {
|
|
1546
|
+
const imageRegions = extractImageRegions(opList.fnArray, opList.argsArray);
|
|
1547
|
+
let uncovered = 0;
|
|
1548
|
+
for (const r of imageRegions) {
|
|
1549
|
+
const area = (r.x2 - r.x1) * (r.y2 - r.y1);
|
|
1550
|
+
if (area < pageArea * 0.05) continue;
|
|
1551
|
+
pagesWithLargeImage.add(i);
|
|
1552
|
+
const hasText = visible.some((it) => {
|
|
1553
|
+
const cx = it.x + it.w / 2;
|
|
1554
|
+
const cy = it.y + (it.h || it.fontSize) / 2;
|
|
1555
|
+
return cx >= r.x1 && cx <= r.x2 && cy >= r.y1 && cy <= r.y2;
|
|
1556
|
+
});
|
|
1557
|
+
if (!hasText) uncovered++;
|
|
1558
|
+
}
|
|
1559
|
+
if (uncovered > 0) skippedImagePages.set(i, uncovered);
|
|
1560
|
+
}
|
|
1340
1561
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
1341
1562
|
for (const b of pageBlocks) blocks.push(b);
|
|
1342
1563
|
let pageText = "";
|
|
@@ -1356,6 +1577,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1356
1577
|
}
|
|
1357
1578
|
}
|
|
1358
1579
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
1580
|
+
let isImageBased = false;
|
|
1359
1581
|
if (totalChars / Math.max(parsedPageCount, 1) < 10) {
|
|
1360
1582
|
if (options?.ocr) {
|
|
1361
1583
|
try {
|
|
@@ -1368,7 +1590,29 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1368
1590
|
} catch {
|
|
1369
1591
|
}
|
|
1370
1592
|
}
|
|
1371
|
-
|
|
1593
|
+
isImageBased = true;
|
|
1594
|
+
warnings.push({
|
|
1595
|
+
message: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, \uD14D\uC2A4\uD2B8 ${totalChars}\uC790) \u2014 \uD14D\uC2A4\uD2B8 \uB808\uC774\uC5B4\uAC00 \uC5C6\uC5B4 OCR\uC774 \uD544\uC694\uD569\uB2C8\uB2E4`,
|
|
1596
|
+
code: "NEEDS_OCR"
|
|
1597
|
+
});
|
|
1598
|
+
}
|
|
1599
|
+
if (!isImageBased) {
|
|
1600
|
+
const OCR_REASON_MESSAGES = {
|
|
1601
|
+
low_text: "\uD14D\uC2A4\uD2B8\uAC00 \uAC70\uC758 \uC5C6\uB294 \uD398\uC774\uC9C0 (\uC2A4\uCE94/\uC774\uBBF8\uC9C0 \uCD94\uC815)",
|
|
1602
|
+
high_pua: "\uAE00\uAF34 \uB9E4\uD551 \uC2E4\uD328 (PUA \uBE44\uC728 \uB192\uC74C) \u2014 \uCD94\uCD9C \uD14D\uC2A4\uD2B8 \uC2E0\uB8B0 \uBD88\uAC00",
|
|
1603
|
+
high_control: "\uC81C\uC5B4\uBB38\uC790 \uBE44\uC728 \uB192\uC74C \u2014 \uCD94\uCD9C \uD14D\uC2A4\uD2B8 \uC2E0\uB8B0 \uBD88\uAC00",
|
|
1604
|
+
high_replacement: "\uB300\uCCB4\uBB38\uC790(U+FFFD) \uBE44\uC728 \uB192\uC74C \u2014 \uCD94\uCD9C \uD14D\uC2A4\uD2B8 \uC2E0\uB8B0 \uBD88\uAC00"
|
|
1605
|
+
};
|
|
1606
|
+
for (const pq of pageQuality) {
|
|
1607
|
+
if (!pq.needsOcr || !pq.ocrReason) continue;
|
|
1608
|
+
if (pq.ocrReason === "low_text" && !pagesWithLargeImage.has(pq.page)) continue;
|
|
1609
|
+
warnings.push({ page: pq.page, message: `${OCR_REASON_MESSAGES[pq.ocrReason]} \u2014 OCR \uAC80\uD1A0 \uD544\uC694`, code: "NEEDS_OCR" });
|
|
1610
|
+
}
|
|
1611
|
+
}
|
|
1612
|
+
if (!isImageBased) {
|
|
1613
|
+
for (const [page, count] of [...skippedImagePages.entries()].sort((a, b) => a[0] - b[0])) {
|
|
1614
|
+
warnings.push({ page, message: `${count}\uAC1C \uC774\uBBF8\uC9C0 \uC601\uC5ED\uC5D0 \uCD94\uCD9C \uAC00\uB2A5\uD55C \uD14D\uC2A4\uD2B8 \uC5C6\uC74C (\uADF8\uB9BC/\uCC28\uD2B8/\uB3C4\uC7A5 \uB0B4\uC6A9 \uB204\uB77D \uAC00\uB2A5)`, code: "SKIPPED_IMAGE" });
|
|
1615
|
+
}
|
|
1372
1616
|
}
|
|
1373
1617
|
if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
|
|
1374
1618
|
const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
|
|
@@ -1376,6 +1620,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1376
1620
|
blocks.splice(removed[ri], 1);
|
|
1377
1621
|
}
|
|
1378
1622
|
}
|
|
1623
|
+
mergeCrossPageTables(blocks);
|
|
1379
1624
|
if (options?.formulaOcr && formulaBuffer) {
|
|
1380
1625
|
try {
|
|
1381
1626
|
await applyFormulaOcr(formulaBuffer, blocks, pageFilter, effectivePageCount, warnings, options.onProgress);
|
|
@@ -1391,6 +1636,8 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1391
1636
|
detectHeadings(blocks, medianFontSize);
|
|
1392
1637
|
}
|
|
1393
1638
|
detectMarkerHeadings(blocks);
|
|
1639
|
+
detectTableCaptions(blocks);
|
|
1640
|
+
detectKoreanListBlocks(blocks);
|
|
1394
1641
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
1395
1642
|
sanitizeBlockControlChars(blocks);
|
|
1396
1643
|
let markdown = cleanPdfText(blocksToMarkdown(blocks));
|
|
@@ -1400,6 +1647,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1400
1647
|
metadata,
|
|
1401
1648
|
outline: outline.length > 0 ? outline : void 0,
|
|
1402
1649
|
warnings: warnings.length > 0 ? warnings : void 0,
|
|
1650
|
+
isImageBased: isImageBased || void 0,
|
|
1403
1651
|
pageQuality,
|
|
1404
1652
|
qualitySummary: summarizeDocumentQuality(pageQuality)
|
|
1405
1653
|
};
|
|
@@ -1555,79 +1803,218 @@ function detectMarkerHeadings(blocks) {
|
|
|
1555
1803
|
}
|
|
1556
1804
|
}
|
|
1557
1805
|
// Recursion limit for xyCutOrder.
var MAX_XYCUT_DEPTH = 50;
// Smallest gap that xyCutOrder accepts as a cut, whatever gapThreshold says.
var XYCUT_MIN_GAP = 5;
// Width multiplier used by identifyCrossLayoutItems.
var CROSS_LAYOUT_BETA = 2;
// Minimum overlap, relative to the narrower item, for two items to count as overlapping.
var CROSS_OVERLAP_RATIO = 0.1;
// Number of overlapping neighbours an item needs to be treated as cross-layout.
var CROSS_MIN_OVERLAPS = 2;
// Cross-layout masking is skipped when it would remove more than this share of the items.
var CROSS_MAX_MASK_RATIO = 0.2;
// Items narrower than this share of the total x-extent are ignored when retrying a vertical cut.
var NARROW_ELEMENT_WIDTH_RATIO = 0.1;
/**
 * Recursively split items into ordered groups using horizontal and vertical
 * whitespace cuts.
 *
 * At depth 0, items flagged by identifyCrossLayoutItems are set aside (when
 * they make up at most CROSS_MAX_MASK_RATIO of the input), the remainder is
 * cut on its own, and the flagged items are merged back with
 * mergeCrossLayoutGroups.
 *
 * Otherwise the best horizontal and vertical cuts are compared. A cut is
 * valid when its gap is at least `max(XYCUT_MIN_GAP, gapThreshold)`. When
 * both are valid the horizontal cut is used unless the vertical gap is more
 * than 1.5 times the horizontal gap. If no cut is valid, or the chosen cut
 * leaves one side empty, the items come back as a single group.
 *
 * @param {object[]} items - Items with `x`, `y`, `w`, `h`.
 * @param {number} gapThreshold - Caller-supplied minimum gap.
 * @param {number} [depth=0] - Current recursion depth.
 * @returns {object[][]} Groups of items in order.
 */
function xyCutOrder(items, gapThreshold, depth = 0) {
  if (items.length === 0) return [];
  if (items.length <= 2 || depth >= MAX_XYCUT_DEPTH) return [items];

  if (depth === 0 && items.length >= 3) {
    const cross = identifyCrossLayoutItems(items);
    const maskable = cross.size > 0 && cross.size <= items.length * CROSS_MAX_MASK_RATIO;
    if (maskable) {
      const rest = items.filter((i) => !cross.has(i));
      if (rest.length > 0) {
        // Recurse at depth 1 so the masking step is not repeated.
        return mergeCrossLayoutGroups(xyCutOrder(rest, gapThreshold, 1), [...cross]);
      }
    }
  }

  const minGap = Math.max(XYCUT_MIN_GAP, gapThreshold);
  const hCut = findHorizontalCut(items);
  const vCut = findVerticalCutWithOutlierFilter(items, minGap);
  const hValid = hCut.gap >= minGap;
  const vValid = vCut.gap >= minGap;
  if (!hValid && !vValid) return [items];

  const useHorizontal = hValid && (!vValid || vCut.gap <= hCut.gap * 1.5);
  const first = useHorizontal
    ? items.filter((i) => i.y > hCut.position)
    : items.filter((i) => i.x + i.w / 2 < vCut.position);
  const second = useHorizontal
    ? items.filter((i) => i.y <= hCut.position)
    : items.filter((i) => i.x + i.w / 2 >= vCut.position);

  if (first.length > 0 && second.length > 0 && first.length < items.length) {
    return [...xyCutOrder(first, gapThreshold, depth + 1), ...xyCutOrder(second, gapThreshold, depth + 1)];
  }
  return [items];
}
|
|
1580
|
-
function
|
|
1581
|
-
|
|
1850
|
+
/**
 * Find "cross-layout" items: items much wider than is typical for the page
 * that horizontally overlap several other items (for example a title
 * spanning two columns).
 *
 * An item qualifies when
 *  - its width is at least `CROSS_LAYOUT_BETA` times the median item width, and
 *  - it overlaps at least `CROSS_MIN_OVERLAPS` other items, where an overlap
 *    counts only if it covers at least `CROSS_OVERLAP_RATIO` of the narrower
 *    of the two widths.
 *
 * The baseline is the median width. Comparing against the maximum width with
 * a multiplier of 2 can never succeed, because no item is wider than twice
 * the widest item, so that form always returned an empty set.
 *
 * @param {{x:number,w:number}[]} items
 * @returns {Set<object>} The qualifying items (empty for fewer than 3 items).
 */
function identifyCrossLayoutItems(items) {
  const cross = /* @__PURE__ */ new Set();
  if (items.length < 3) return cross;

  const widths = items.map((i) => i.w).sort((a, b) => a - b);
  const mid = Math.floor(widths.length / 2);
  const medianWidth = widths.length % 2 === 1 ? widths[mid] : (widths[mid - 1] + widths[mid]) / 2;
  // Without a positive baseline every item would qualify; treat that as "no cross items".
  if (!(medianWidth > 0)) return cross;

  const threshold = CROSS_LAYOUT_BETA * medianWidth;
  for (const item of items) {
    if (item.w < threshold) continue;
    let overlaps = 0;
    for (const other of items) {
      if (other === item) continue;
      const left = Math.max(item.x, other.x);
      const right = Math.min(item.x + item.w, other.x + other.w);
      const overlapW = right - left;
      if (overlapW <= 0) continue;
      const smaller = Math.min(item.w, other.w);
      if (smaller > 0 && overlapW / smaller >= CROSS_OVERLAP_RATIO) {
        overlaps++;
        // Enough evidence; no need to scan the remaining items.
        if (overlaps >= CROSS_MIN_OVERLAPS) break;
      }
    }
    if (overlaps >= CROSS_MIN_OVERLAPS) cross.add(item);
  }
  return cross;
}
|
|
1590
|
-
function
|
|
1877
|
+
/**
 * Interleave cross-layout items with already ordered groups.
 *
 * Cross items are sorted by descending `y + h` (ties by ascending `x`) and
 * each becomes its own single-item group. At every step the next cross item
 * is emitted if its `y + h` is at least the largest `y + h` in the next
 * group; otherwise that group is emitted. Leftovers from either side are
 * appended in order.
 *
 * @param {object[][]} groups - Ordered groups of items.
 * @param {object[]} cross - Cross-layout items (not modified).
 * @returns {object[][]} `groups` itself when `cross` is empty, else a new array.
 */
function mergeCrossLayoutGroups(groups, cross) {
  if (cross.length === 0) return groups;
  const topOf = (item) => item.y + item.h;
  const pending = [...cross].sort((a, b) => topOf(b) - topOf(a) || a.x - b.x);
  const groupTop = (group) => {
    let top = -Infinity;
    for (const member of group) {
      const t = topOf(member);
      if (t > top) top = t;
    }
    return top;
  };

  const merged = [];
  let gi = 0;
  let ci = 0;
  while (gi < groups.length && ci < pending.length) {
    if (topOf(pending[ci]) >= groupTop(groups[gi])) {
      merged.push([pending[ci++]]);
    } else {
      merged.push(groups[gi++]);
    }
  }
  while (gi < groups.length) merged.push(groups[gi++]);
  while (ci < pending.length) merged.push([pending[ci++]]);
  return merged;
}
|
|
1905
|
+
/**
 * Find the largest vertical gap between consecutive items once they are
 * ordered by descending `y`.
 *
 * For each adjacent pair the gap is `(prev.y - prev.h) - curr.y`. The cut
 * position is the midpoint of the widest such gap.
 *
 * @param {{y:number,h:number}[]} items - Not modified.
 * @returns {{position:number,gap:number}} `{ position: 0, gap: 0 }` when
 *   there are fewer than two items or no positive gap exists.
 */
function findHorizontalCut(items) {
  const best = { position: 0, gap: 0 };
  if (items.length < 2) return best;
  const ordered = items.slice().sort((a, b) => b.y - a.y);
  ordered.reduce((prev, curr) => {
    const prevBottom = prev.y - prev.h;
    const currTop = curr.y;
    const gap = prevBottom - currTop;
    if (gap > best.gap) {
      best.gap = gap;
      best.position = (prevBottom + currTop) / 2;
    }
    return curr;
  });
  return best;
}
|
|
1605
|
-
function
|
|
1606
|
-
const
|
|
1607
|
-
|
|
1608
|
-
|
|
1609
|
-
|
|
1610
|
-
const
|
|
1611
|
-
|
|
1612
|
-
|
|
1613
|
-
|
|
1614
|
-
|
|
1615
|
-
|
|
1921
|
+
/**
 * Find a vertical cut, retrying without narrow items when the plain cut is
 * too small.
 *
 * The cut over all items is returned if its gap reaches `minGap`. Otherwise,
 * with three or more items, items narrower than NARROW_ELEMENT_WIDTH_RATIO
 * of the overall x-extent are dropped. The retry happens only if that
 * removes something while keeping at least two items and at least 70% of the
 * input. The retried cut is used only when it is both wider than the plain
 * cut and at least `minGap`.
 *
 * @param {{x:number,w:number}[]} items
 * @param {number} minGap - Smallest acceptable gap.
 * @returns {{position:number,gap:number}}
 */
function findVerticalCutWithOutlierFilter(items, minGap) {
  const edgeCut = findVerticalCut(items);
  if (edgeCut.gap >= minGap || items.length < 3) return edgeCut;

  let minX = Infinity;
  let maxX = -Infinity;
  for (const item of items) {
    if (item.x < minX) minX = item.x;
    const right = item.x + item.w;
    if (right > maxX) maxX = right;
  }
  const narrowThreshold = (maxX - minX) * NARROW_ELEMENT_WIDTH_RATIO;
  const wideItems = items.filter((i) => i.w >= narrowThreshold);

  const removedSome = wideItems.length < items.length;
  const keptEnough = wideItems.length >= 2 && wideItems.length >= items.length * 0.7;
  if (!removedSome || !keptEnough) return edgeCut;

  const filteredCut = findVerticalCut(wideItems);
  const improved = filteredCut.gap > edgeCut.gap && filteredCut.gap >= minGap;
  return improved ? filteredCut : edgeCut;
}
|
|
1942
|
+
/**
 * Find the widest horizontal gap that no item spans.
 *
 * Items are swept left to right (by `x`, then by right edge) while tracking
 * the furthest right edge seen so far. A gap exists wherever an item starts
 * beyond that edge. The cut position is the midpoint of the widest gap.
 *
 * @param {{x:number,w:number}[]} items - Not modified.
 * @returns {{position:number,gap:number}} `{ position: 0, gap: 0 }` when
 *   there are fewer than two items or the items leave no gap.
 */
function findVerticalCut(items) {
  if (items.length < 2) return { position: 0, gap: 0 };
  const ordered = items.slice().sort((a, b) => a.x - b.x || a.x + a.w - (b.x + b.w));
  let bestGap = 0;
  let bestPosition = 0;
  let coveredRight = null;
  for (const { x, w } of ordered) {
    const right = x + w;
    if (coveredRight === null) {
      coveredRight = right;
      continue;
    }
    if (x > coveredRight) {
      const gap = x - coveredRight;
      if (gap > bestGap) {
        bestGap = gap;
        bestPosition = (coveredRight + x) / 2;
      }
    }
    coveredRight = Math.max(coveredRight, right);
  }
  return { position: bestPosition, gap: bestGap };
}
|
|
1620
1962
|
/**
 * Build the blocks for one page, using ruled lines from the operator list
 * to detect tables.
 *
 * Steps, in order: extract line segments, drop page-border lines, preprocess
 * the rest, mark and wrap struck-through text items (this mutates `items`),
 * then build table grids. Pages with at least one grid go through
 * extractBlocksWithGrids; all others use extractPageBlocksFallback.
 *
 * @param {object[]} items - Text items of the page.
 * @param {number} pageNum - Page number passed on to the block builders.
 * @param {{fnArray:Array,argsArray:Array}} opList - Page operator list.
 * @param {number} pageWidth
 * @param {number} pageHeight
 * @returns {object[]} Page blocks; empty when there are no items.
 */
function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeight) {
  if (items.length === 0) return [];
  const raw = extractLines(opList.fnArray, opList.argsArray);
  const unbordered = filterPageBorderLines(raw.horizontals, raw.verticals, pageWidth, pageHeight);
  const { horizontals, verticals } = preprocessLines(unbordered.horizontals, unbordered.verticals);
  // Strikethrough detection runs on the cleaned horizontals, before table detection.
  markStrikethroughItems(items, horizontals);
  wrapStrikethroughRuns(items);
  const grids = buildTableGrids(horizontals, verticals);
  return grids.length > 0
    ? extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals)
    : extractPageBlocksFallback(items, pageNum);
}
|
|
1975
|
+
// Absolute upper bound on a strike line's width.
var STRIKE_MAX_THICKNESS = 2;
// A strike line may be at most this fraction of the text height thick.
var STRIKE_MAX_THICKNESS_RATIO = 0.25;
// Allowed distance between the line and the expected strike height, as a fraction of text height.
var STRIKE_CENTER_TOLERANCE = 0.25;
// Share of an item's width that the line must cover.
var STRIKE_MIN_OVERLAP_RATIO = 0.8;
// The line may be at most this many times longer than the text it strikes.
var STRIKE_MAX_LINE_TO_TEXT_RATIO = 1.5;
/**
 * Flag text items that a thin horizontal line runs through by setting
 * `item.strike = true`.
 *
 * For each line no thicker than STRIKE_MAX_THICKNESS, an item matches when
 *  - it has positive height (`h`, else `fontSize`) and positive width,
 *  - the line is no thicker than STRIKE_MAX_THICKNESS_RATIO of that height,
 *  - the line's `y1` lies within STRIKE_CENTER_TOLERANCE x height of
 *    `item.y + 0.4 * height`, and
 *  - the line covers at least STRIKE_MIN_OVERLAP_RATIO of the item's width.
 * The matches are marked only if the line is at most
 * STRIKE_MAX_LINE_TO_TEXT_RATIO times as long as their combined width, so a
 * long rule that merely crosses some text is ignored.
 *
 * @param {object[]} items - Text items; matching ones are mutated.
 * @param {{x1:number,x2:number,y1:number,lineWidth:number}[]} horizontals
 * @returns {void}
 */
function markStrikethroughItems(items, horizontals) {
  if (items.length === 0 || horizontals.length === 0) return;
  for (const line of horizontals) {
    if (line.lineWidth > STRIKE_MAX_THICKNESS) continue;
    const matches = items.filter((item) => {
      const h = item.h > 0 ? item.h : item.fontSize;
      if (h <= 0 || item.w <= 0) return false;
      if (line.lineWidth > h * STRIKE_MAX_THICKNESS_RATIO) return false;
      const centerY = item.y + h * 0.4;
      if (Math.abs(line.y1 - centerY) > h * STRIKE_CENTER_TOLERANCE) return false;
      const overlap = Math.min(line.x2, item.x + item.w) - Math.max(line.x1, item.x);
      return !(overlap / item.w < STRIKE_MIN_OVERLAP_RATIO);
    });
    if (matches.length === 0) continue;
    const totalW = matches.reduce((sum, m) => sum + m.w, 0);
    if (totalW <= 0 || (line.x2 - line.x1) / totalW > STRIKE_MAX_LINE_TO_TEXT_RATIO) continue;
    matches.forEach((m) => {
      m.strike = true;
    });
  }
}
|
|
2002
|
+
/**
 * Wrap struck-through text in `~~ ... ~~` markers by editing item text in place.
 *
 * Items are bucketed into lines by `Math.round(y / 3)` and ordered by `x`.
 * Within a line, every maximal run of consecutive struck items gets an
 * opening `~~` on its first item and a closing `~~` on its last. An unstruck
 * item ends the current run, so text lying between two struck spans, or a
 * whole table cell between two struck cells, is never swallowed and each
 * run's markers stay balanced.
 *
 * Empty or whitespace-only items are transparent: they neither start nor
 * end a run and are never given markers.
 *
 * @param {{text:string,x:number,y:number,strike?:boolean}[]} items - All text
 *   items of the page; struck items are mutated.
 * @returns {void}
 */
function wrapStrikethroughRuns(items) {
  if (!items.some((i) => i.strike)) return;

  // Bucket every item, struck or not, so that unstruck neighbours can delimit runs.
  const lines = /* @__PURE__ */ new Map();
  for (const item of items) {
    const key = Math.round(item.y / 3);
    const bucket = lines.get(key);
    if (bucket) bucket.push(item);
    else lines.set(key, [item]);
  }

  for (const line of lines.values()) {
    if (!line.some((i) => i.strike)) continue;
    line.sort((a, b) => a.x - b.x);
    let first = null;
    let last = null;
    const closeRun = () => {
      if (first !== null) {
        first.text = "~~" + first.text;
        last.text = last.text + "~~";
      }
      first = null;
      last = null;
    };
    for (const item of line) {
      if (!item.text || item.text.trim() === "") continue;
      if (item.strike) {
        if (first === null) first = item;
        last = item;
      } else {
        closeRun();
      }
    }
    closeRun();
  }
}
|
|
1631
2018
|
function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
1632
2019
|
const blocks = [];
|
|
1633
2020
|
const usedItems = /* @__PURE__ */ new Set();
|
|
@@ -1657,7 +2044,8 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
1657
2044
|
w: i.w,
|
|
1658
2045
|
h: i.h,
|
|
1659
2046
|
fontSize: i.fontSize,
|
|
1660
|
-
fontName: i.fontName
|
|
2047
|
+
fontName: i.fontName,
|
|
2048
|
+
hasSpaceBefore: i.hasSpaceBefore
|
|
1661
2049
|
}));
|
|
1662
2050
|
const cellTextMap = mapTextToCells(textItems, cells);
|
|
1663
2051
|
const numRows = grid.rowYs.length - 1;
|
|
@@ -1677,13 +2065,29 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
1677
2065
|
rowSpan: cell.rowSpan
|
|
1678
2066
|
};
|
|
1679
2067
|
}
|
|
2068
|
+
let finalGrid = irGrid;
|
|
2069
|
+
let finalRows = numRows;
|
|
2070
|
+
if (numRows <= 2 && numCols >= 3) {
|
|
2071
|
+
const rebuilt = normalizeUndersegmentedTable(irGrid, grid.colXs, textItems);
|
|
2072
|
+
if (rebuilt) {
|
|
2073
|
+
finalGrid = rebuilt.map((row) => row.map((rawText) => {
|
|
2074
|
+
const cleaned = rawText.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
|
|
2075
|
+
return {
|
|
2076
|
+
text: cleaned.split("\n").map((line) => collapseEvenSpacing(line)).join("\n"),
|
|
2077
|
+
colSpan: 1,
|
|
2078
|
+
rowSpan: 1
|
|
2079
|
+
};
|
|
2080
|
+
}));
|
|
2081
|
+
finalRows = finalGrid.length;
|
|
2082
|
+
}
|
|
2083
|
+
}
|
|
1680
2084
|
const irTable = {
|
|
1681
|
-
rows:
|
|
2085
|
+
rows: finalRows,
|
|
1682
2086
|
cols: numCols,
|
|
1683
|
-
cells:
|
|
1684
|
-
hasHeader:
|
|
2087
|
+
cells: finalGrid,
|
|
2088
|
+
hasHeader: finalRows > 1
|
|
1685
2089
|
};
|
|
1686
|
-
const hasContent =
|
|
2090
|
+
const hasContent = finalGrid.some((row) => row.some((cell) => cell.text.trim() !== ""));
|
|
1687
2091
|
if (!hasContent) continue;
|
|
1688
2092
|
const tableBbox = {
|
|
1689
2093
|
page: pageNum,
|
|
@@ -1712,7 +2116,8 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
1712
2116
|
w: i.w,
|
|
1713
2117
|
h: i.h,
|
|
1714
2118
|
fontSize: i.fontSize,
|
|
1715
|
-
fontName: i.fontName
|
|
2119
|
+
fontName: i.fontName,
|
|
2120
|
+
hasSpaceBefore: i.hasSpaceBefore
|
|
1716
2121
|
}));
|
|
1717
2122
|
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
1718
2123
|
if (clusterResults.length > 0) {
|
|
@@ -1750,6 +2155,46 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
|
|
|
1750
2155
|
}
|
|
1751
2156
|
return mergeAdjacentTableBlocks(blocks);
|
|
1752
2157
|
}
|
|
2158
|
+
// Maximum left/right edge mismatch, as a fraction of the wider table's width,
// for two tables on consecutive pages to count as one continued table.
var NEIGHBOR_TABLE_EPSILON = 0.2;

/**
 * Merges a table with the table that directly follows it on the next page,
 * mutating `blocks` in place.
 *
 * Two adjacent blocks are merged when both are tables, the second is on the
 * page immediately after the first, they have the same column count, and
 * their left and right bbox edges differ by at most NEIGHBOR_TABLE_EPSILON of
 * the wider table's width. If the continuation has more than one row and its
 * first row equals the first row of the earlier table (per `rowTextsEqual`),
 * that repeated header row is dropped before merging.
 *
 * The merged block keeps every field of the earlier block (page number, bbox,
 * …) and replaces only its `table`. The caption is taken from the earlier
 * table and falls back to the continuation's caption, so a caption that was
 * attached only to the second half is not lost.
 *
 * The list is walked back-to-front: removing `blocks[i + 1]` never shifts an
 * index still to be visited, and a table already merged at `i` is compared
 * with `blocks[i - 1]` next, so tables spanning three or more pages collapse
 * into a single block.
 *
 * @param {Array<object>} blocks Document blocks in reading order.
 * @returns {void}
 */
function mergeCrossPageTables(blocks) {
  for (let i = blocks.length - 2; i >= 0; i--) {
    const prev = blocks[i];
    const curr = blocks[i + 1];
    if (prev.type !== "table" || curr.type !== "table" || !prev.table || !curr.table) continue;
    if (!prev.pageNumber || !curr.pageNumber || curr.pageNumber !== prev.pageNumber + 1) continue;
    if (prev.table.cols !== curr.table.cols) continue;
    if (!prev.bbox || !curr.bbox) continue;

    // Horizontal alignment check; the `1` floor avoids a zero tolerance.
    const width = Math.max(prev.bbox.width, curr.bbox.width, 1);
    const tolerance = width * NEIGHBOR_TABLE_EPSILON;
    const leftDiff = Math.abs(prev.bbox.x - curr.bbox.x);
    const rightDiff = Math.abs(prev.bbox.x + prev.bbox.width - (curr.bbox.x + curr.bbox.width));
    if (leftDiff > tolerance || rightDiff > tolerance) continue;

    // Drop a header row repeated at the top of the continuation page.
    let currCells = curr.table.cells;
    if (currCells.length > 1 && prev.table.cells.length > 0 && rowTextsEqual(prev.table.cells[0], currCells[0])) {
      currCells = currCells.slice(1);
    }
    if (currCells.length === 0) {
      // Nothing to append; the empty continuation block is simply removed.
      blocks.splice(i + 1, 1);
      continue;
    }

    const merged = {
      rows: prev.table.rows + currCells.length,
      cols: prev.table.cols,
      cells: [...prev.table.cells, ...currCells],
      hasHeader: prev.table.hasHeader,
      // Prefer the first page's caption, but do not discard one that only the
      // continuation carries.
      caption: prev.table.caption || curr.table.caption
    };
    blocks[i] = { ...prev, table: merged };
    blocks.splice(i + 1, 1);
  }
}
|
|
2190
|
+
/**
 * Tells whether two table rows carry the same text, ignoring all whitespace.
 *
 * Rows of different length never match. A row pair in which every cell of
 * `a` is blank is reported as NOT equal, so two empty rows do not count as a
 * repeated header.
 *
 * @param {Array<{text: string}>} a First row.
 * @param {Array<{text: string}>} b Second row.
 * @returns {boolean} True when the rows match cell-by-cell and `a` has content.
 */
function rowTextsEqual(a, b) {
  if (a.length !== b.length) return false;
  const squash = (cell) => cell.text.replace(/\s+/g, "");
  let col = 0;
  while (col < a.length) {
    if (squash(a[col]) !== squash(b[col])) return false;
    col += 1;
  }
  // Identical but entirely blank rows are not a meaningful match.
  return a.some((cell) => cell.text.trim() !== "");
}
|
|
1753
2198
|
function mergeAdjacentTableBlocks(blocks) {
|
|
1754
2199
|
if (blocks.length <= 1) return blocks;
|
|
1755
2200
|
const result = [blocks[0]];
|
|
@@ -1780,7 +2225,8 @@ function extractPageBlocksFallback(items, pageNum) {
|
|
|
1780
2225
|
w: i.w,
|
|
1781
2226
|
h: i.h,
|
|
1782
2227
|
fontSize: i.fontSize,
|
|
1783
|
-
fontName: i.fontName
|
|
2228
|
+
fontName: i.fontName,
|
|
2229
|
+
hasSpaceBefore: i.hasSpaceBefore
|
|
1784
2230
|
}));
|
|
1785
2231
|
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
1786
2232
|
if (clusterResults.length > 0) {
|
|
@@ -1796,7 +2242,7 @@ function extractPageBlocksFallback(items, pageNum) {
|
|
|
1796
2242
|
}
|
|
1797
2243
|
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
1798
2244
|
if (remaining.length > 0) {
|
|
1799
|
-
const yLines = groupByY(remaining);
|
|
2245
|
+
const yLines = mergeSuperscriptLines(groupByY(remaining));
|
|
1800
2246
|
for (const line of yLines) {
|
|
1801
2247
|
const text = mergeLineSimple(line);
|
|
1802
2248
|
if (!text.trim()) continue;
|
|
@@ -1810,7 +2256,7 @@ function extractPageBlocksFallback(items, pageNum) {
|
|
|
1810
2256
|
return by - ay;
|
|
1811
2257
|
});
|
|
1812
2258
|
} else {
|
|
1813
|
-
const allYLines = groupByY(items);
|
|
2259
|
+
const allYLines = mergeSuperscriptLines(groupByY(items));
|
|
1814
2260
|
const columns = detectColumns(allYLines);
|
|
1815
2261
|
if (columns && columns.length >= 3) {
|
|
1816
2262
|
const tableText = extractWithColumns(allYLines, columns);
|
|
@@ -1823,7 +2269,7 @@ function extractPageBlocksFallback(items, pageNum) {
|
|
|
1823
2269
|
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
1824
2270
|
for (const group of orderedGroups) {
|
|
1825
2271
|
if (group.length === 0) continue;
|
|
1826
|
-
const yLines = groupByY(group);
|
|
2272
|
+
const yLines = mergeSuperscriptLines(groupByY(group));
|
|
1827
2273
|
const groupColumns = detectColumns(yLines);
|
|
1828
2274
|
if (groupColumns && groupColumns.length >= 3) {
|
|
1829
2275
|
const tableText = extractWithColumns(yLines, groupColumns);
|
|
@@ -1915,16 +2361,16 @@ function normalizeItems(rawItems) {
|
|
|
1915
2361
|
if (!isDup) deduped.push(sorted[i]);
|
|
1916
2362
|
}
|
|
1917
2363
|
if (spacePositions.length > 0) {
|
|
1918
|
-
for (const
|
|
1919
|
-
|
|
1920
|
-
|
|
1921
|
-
|
|
1922
|
-
|
|
1923
|
-
|
|
1924
|
-
|
|
1925
|
-
}
|
|
2364
|
+
for (const sp of spacePositions) {
|
|
2365
|
+
let nearest = null;
|
|
2366
|
+
for (const item of deduped) {
|
|
2367
|
+
if (Math.abs(sp.y - item.y) > 3) continue;
|
|
2368
|
+
const dist = item.x - sp.x;
|
|
2369
|
+
if (dist >= -1 && dist <= 20 && (!nearest || item.x < nearest.x)) {
|
|
2370
|
+
nearest = item;
|
|
1926
2371
|
}
|
|
1927
2372
|
}
|
|
2373
|
+
if (nearest) nearest.hasSpaceBefore = true;
|
|
1928
2374
|
}
|
|
1929
2375
|
}
|
|
1930
2376
|
return deduped;
|
|
@@ -1958,6 +2404,35 @@ function groupByY(items) {
|
|
|
1958
2404
|
if (curLine.length > 0) lines.push(curLine);
|
|
1959
2405
|
return lines;
|
|
1960
2406
|
}
|
|
2407
|
+
/**
 * Folds short, small "fragment" lines (e.g. superscripts or subscripts that
 * were grouped onto their own y-line) into the neighbouring full line.
 *
 * Each line's vertical band runs from the lowest `y` to the highest
 * `y + height`, where height is `h` when positive and `fontSize` otherwise.
 * A line is a fragment when it has at most 3 items, each with at most 8
 * trimmed characters. A fragment is merged with its neighbour when its band
 * is at most 80% as tall as the neighbour's and at least half of its own
 * height overlaps the neighbour's band. Merged items are appended in
 * encounter order, not re-sorted.
 *
 * @param {Array<Array<object>>} lines Lines of text items, in order.
 * @returns {Array<Array<object>>} The same array when it has 0 or 1 lines,
 *   otherwise a new array of (possibly merged) lines.
 */
function mergeSuperscriptLines(lines) {
  if (lines.length <= 1) return lines;

  const measure = (line) => {
    let bottom = Infinity;
    let top = -Infinity;
    for (const item of line) {
      const height = item.h > 0 ? item.h : item.fontSize;
      if (item.y < bottom) bottom = item.y;
      if (item.y + height > top) top = item.y + height;
    }
    return { bottom, top, height: top - bottom };
  };
  const isShortFragment = (line) => line.length <= 3 && line.every((item) => item.text.trim().length <= 8);
  // `small` rides on `big` when it is a short fragment, clearly shorter, and
  // mostly inside the shared vertical overlap.
  const ridesOn = (line, small, big, shared) =>
    isShortFragment(line) && small.height <= big.height * 0.8 && shared >= small.height * 0.5;

  const merged = [lines[0]];
  for (let n = 1; n < lines.length; n++) {
    const last = merged.length - 1;
    const prev = merged[last];
    const curr = lines[n];
    const prevBand = measure(prev);
    const currBand = measure(curr);
    const shared = Math.min(prevBand.top, currBand.top) - Math.max(prevBand.bottom, currBand.bottom);
    const prevRides = ridesOn(prev, prevBand, currBand, shared);
    const currRides = ridesOn(curr, currBand, prevBand, shared);
    if (prevRides || currRides) {
      merged[last] = [...prev, ...curr];
    } else {
      merged.push(curr);
    }
  }
  return merged;
}
|
|
1961
2436
|
function isProseSpread(items) {
|
|
1962
2437
|
if (items.length < 4) return false;
|
|
1963
2438
|
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
@@ -2199,9 +2674,7 @@ function mergeLineSimple(items) {
|
|
|
2199
2674
|
result += sorted[i].text;
|
|
2200
2675
|
continue;
|
|
2201
2676
|
}
|
|
2202
|
-
if (gap
|
|
2203
|
-
} else if (gap < avgFs * 0.35 && (/[가-힣]$/.test(result) || /^[가-힣]/.test(sorted[i].text))) {
|
|
2204
|
-
} else if (gap > 3) result += " ";
|
|
2677
|
+
if (gap > spaceGapThreshold(avgFs)) result += " ";
|
|
2205
2678
|
result += sorted[i].text;
|
|
2206
2679
|
}
|
|
2207
2680
|
return result;
|
|
@@ -2225,7 +2698,7 @@ function cleanPdfText(text) {
|
|
|
2225
2698
|
).replace(/^(?!\| ---).*$/gm, (line) => {
|
|
2226
2699
|
if (/^\s*\${1,2}.+\${1,2}\s*$/.test(line)) return line;
|
|
2227
2700
|
return collapseEvenSpacing(line);
|
|
2228
|
-
}).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
|
|
2701
|
+
}).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\\~\\~/g, "~~").replace(/~~~~/g, "").replace(/\n{3,}/g, "\n\n").trim();
|
|
2229
2702
|
}
|
|
2230
2703
|
function startsWithMarker(line) {
|
|
2231
2704
|
const t = line.trimStart();
|
|
@@ -2234,6 +2707,134 @@ function startsWithMarker(line) {
|
|
|
2234
2707
|
function isStandaloneHeader(line) {
|
|
2235
2708
|
return /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,7}$/.test(line.trim());
|
|
2236
2709
|
}
|
|
2710
|
+
// Matches a caption lead-in such as "Table 1.", "<표 2>", "Fig. 3:" or
// "[그림 ①]": optional opening bracket, a keyword, a number, optional closing
// bracket and punctuation.
var TABLE_CAPTION_RE = /^[<\[(【〈]?\s*(표|그림|도표|Table|Figure|Fig\.?)\s*[\d①-⑮][\d.\-]*\s*[\])】〉>]?[.:]?\s*/i;
// Longest trimmed paragraph text still accepted as a caption.
var CAPTION_MAX_LENGTH = 100;
// Largest vertical distance (bbox units) between caption and table.
var CAPTION_MAX_GAP = 30;

/**
 * Attaches caption paragraphs to the tables they sit next to, mutating
 * `blocks` in place.
 *
 * For every table block without a caption, the block immediately before it
 * is tried first, then the block immediately after. A neighbour qualifies
 * when it is a single-line paragraph on the same page, no longer than
 * CAPTION_MAX_LENGTH, and starts with a TABLE_CAPTION_RE lead-in. When both
 * blocks have a bbox, the vertical gap must not exceed CAPTION_MAX_GAP and
 * the horizontal overlap must be at least 30% of the narrower box. The
 * matched paragraph's trimmed text becomes `table.caption` and the paragraph
 * is removed from `blocks`.
 *
 * @param {Array<object>} blocks Document blocks in reading order.
 * @returns {void}
 */
function detectTableCaptions(blocks) {
  const geometryAllows = (cap, tbl) => {
    const capLow = cap.y;
    const capHigh = cap.y + cap.height;
    const tblLow = tbl.y;
    const tblHigh = tbl.y + tbl.height;
    // Distance from the table edge facing the caption.
    const gap = capLow >= tblHigh ? capLow - tblHigh : tblLow - capHigh;
    if (gap > CAPTION_MAX_GAP) return false;
    const sharedWidth = Math.min(cap.x + cap.width, tbl.x + tbl.width) - Math.max(cap.x, tbl.x);
    return !(sharedWidth < Math.min(cap.width, tbl.width) * 0.3);
  };

  const isCaptionFor = (candidate, tableBlock) => {
    if (!candidate || candidate.type !== "paragraph" || !candidate.text) return false;
    if (candidate.pageNumber !== tableBlock.pageNumber) return false;
    const text = candidate.text.trim();
    if (!text || text.length > CAPTION_MAX_LENGTH || text.includes("\n")) return false;
    if (!TABLE_CAPTION_RE.test(text)) return false;
    // Without both boxes the textual match alone decides.
    if (!(candidate.bbox && tableBlock.bbox)) return true;
    return geometryAllows(candidate.bbox, tableBlock.bbox);
  };

  let i = 0;
  while (i < blocks.length) {
    const block = blocks[i];
    const needsCaption = block.type === "table" && block.table && !block.table.caption;
    if (needsCaption) {
      if (isCaptionFor(blocks[i - 1], block)) {
        block.table.caption = blocks[i - 1].text.trim();
        blocks.splice(i - 1, 1);
        // The table slid to i - 1, so index i already names the next block.
        continue;
      }
      if (isCaptionFor(blocks[i + 1], block)) {
        block.table.caption = blocks[i + 1].text.trim();
        blocks.splice(i + 1, 1);
      }
    }
    i += 1;
  }
}
|
|
2745
|
+
// Hangul syllables used as ordered list labels, in label order:
// position 0 is ordinal 1.
var KOREAN_LIST_SEQ = "\uAC00\uB098\uB2E4\uB77C\uB9C8\uBC14\uC0AC\uC544\uC790\uCC28\uCE74\uD0C0\uD30C\uD558";

/**
 * Recognises a list label at the very start of `text`.
 *
 * Label families are tried in this order; the first one that yields an
 * ordinal wins:
 *   - "arabicDot":   1-2 digits, ".", not followed by a digit, then whitespace
 *   - "korDot":      a KOREAN_LIST_SEQ syllable, ".", then whitespace
 *   - "arabicParen": 1-2 digits, ")"
 *   - "korParen":    a KOREAN_LIST_SEQ syllable, ")"
 *   - "circled":     one of ① … ⑮
 * A Hangul syllable that is in the regex range but not in KOREAN_LIST_SEQ is
 * not a label for that family and the remaining families are still tried.
 *
 * @param {string} text Text to inspect; callers pass it already trimmed.
 * @returns {{family: string, ord: number} | null} Family name and ordinal
 *   (1-based for Hangul and circled labels, the parsed number for digits),
 *   or null when no label is found.
 */
function parseListLabel(text) {
  const parseDecimal = (digits) => parseInt(digits, 10);
  const hangulOrdinal = (syllable) => {
    const position = KOREAN_LIST_SEQ.indexOf(syllable);
    return position >= 0 ? position + 1 : null;
  };
  // 9312 is the code unit of "①".
  const circledOrdinal = (glyph) => glyph.charCodeAt(0) - 9312 + 1;

  const rules = [
    ["arabicDot", /^(\d{1,2})\.(?!\d)\s+/, parseDecimal],
    ["korDot", /^([가-하])\.\s+/, hangulOrdinal],
    ["arabicParen", /^(\d{1,2})\)\s*/, parseDecimal],
    ["korParen", /^([가-하])\)\s*/, hangulOrdinal],
    ["circled", /^([①②③④⑤⑥⑦⑧⑨⑩⑪⑫⑬⑭⑮])\s*/, circledOrdinal]
  ];

  for (const [family, pattern, toOrdinal] of rules) {
    const found = text.match(pattern);
    if (!found) continue;
    const ord = toOrdinal(found[1]);
    if (ord !== null) return { family, ord };
  }
  return null;
}
|
|
2765
|
+
// Matches an attachment line: "붙임" (optionally spaced), an optional number
// with "." or ":", then whitespace.
var ATTACHMENT_RE = /^붙\s*임\s*(\d+[.:]?)?\s/;

/**
 * Converts labelled paragraphs into list blocks, mutating `blocks` in place.
 *
 * Step 1: every paragraph/list block with text is probed with
 * `parseListLabel` and grouped by label family.
 * Step 2: within a family, blocks form a chain while each ordinal is exactly
 * one more than the previous and the two blocks are at most 20 indices apart.
 * Only chains of two or more blocks are validated, so a lone "1." is left
 * as a paragraph.
 * Step 3: blocks are walked in order. Tables, headings and separators reset
 * the nesting state. Paragraphs matching ATTACHMENT_RE become unordered list
 * blocks. Validated blocks become list blocks ("ordered" for the "arabicDot"
 * family, "unordered" otherwise). The first family seen after a reset is the
 * top level; blocks of any other family are moved into the `children` of the
 * most recent top-level list block and removed from `blocks`.
 *
 * @param {Array<object>} blocks Document blocks in reading order.
 * @returns {void}
 */
function detectKoreanListBlocks(blocks) {
  const carriesText = (b) => (b.type === "paragraph" || b.type === "list") && Boolean(b.text);

  // Step 1: collect labelled blocks per family, in document order.
  const byFamily = new Map();
  for (const [idx, b] of blocks.entries()) {
    if (!carriesText(b)) continue;
    const label = parseListLabel(b.text.trim());
    if (!label) continue;
    const members = byFamily.get(label.family);
    if (members) members.push({ idx, label });
    else byFamily.set(label.family, [{ idx, label }]);
  }

  // Step 2: keep only consecutive-ordinal chains of length >= 2.
  const validated = new Set();
  const commit = (chain) => {
    if (chain.length < 2) return;
    for (const link of chain) validated.add(link.idx);
  };
  for (const members of byFamily.values()) {
    let chain = [];
    for (const item of members) {
      const tail = chain[chain.length - 1];
      const continues = tail && item.label.ord === tail.label.ord + 1 && item.idx - tail.idx <= 20;
      if (!continues) {
        commit(chain);
        chain = [];
      }
      chain.push(item);
    }
    commit(chain);
  }

  // Step 3: rewrite blocks and nest sub-level items.
  let familyStack = [];
  let lastTopLevelList = null;
  const nestedIndices = [];
  for (let i = 0; i < blocks.length; i++) {
    const b = blocks[i];
    if (b.type === "table" || b.type === "heading" || b.type === "separator") {
      familyStack = [];
      lastTopLevelList = null;
      continue;
    }
    if (!carriesText(b)) continue;
    const text = b.text.trim();
    if (b.type === "paragraph" && ATTACHMENT_RE.test(text)) {
      blocks[i] = { ...b, type: "list", listType: "unordered" };
      continue;
    }
    if (!validated.has(i)) continue;

    const { family } = parseListLabel(text);
    const known = familyStack.indexOf(family);
    let depth;
    if (known >= 0) {
      // Returning to a shallower family closes every deeper level.
      depth = known;
      familyStack = familyStack.slice(0, known + 1);
    } else {
      familyStack.push(family);
      depth = familyStack.length - 1;
    }

    const listBlock = { ...b, type: "list", listType: family === "arabicDot" ? "ordered" : "unordered" };
    if (depth > 0 && lastTopLevelList) {
      if (!lastTopLevelList.children) lastTopLevelList.children = [];
      lastTopLevelList.children.push(listBlock);
      nestedIndices.push(i);
    } else {
      blocks[i] = listBlock;
      lastTopLevelList = listBlock;
    }
  }

  // Indices were recorded in ascending order; delete from the back so the
  // earlier ones stay valid.
  for (let k = nestedIndices.length - 1; k >= 0; k--) {
    blocks.splice(nestedIndices[k], 1);
  }
}
|
|
2237
2838
|
function detectListBlocks(blocks) {
|
|
2238
2839
|
const result = [];
|
|
2239
2840
|
for (let i = 0; i < blocks.length; i++) {
|
|
@@ -2343,7 +2944,6 @@ function detectSpecialKoreanTables(blocks) {
|
|
|
2343
2944
|
function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
|
|
2344
2945
|
const ZONE_RATIO = 0.12;
|
|
2345
2946
|
const MIN_REPEAT = 3;
|
|
2346
|
-
const Y_BUCKET = 5;
|
|
2347
2947
|
const topEntries = [];
|
|
2348
2948
|
const bottomEntries = [];
|
|
2349
2949
|
for (let bi = 0; bi < blocks.length; bi++) {
|
|
@@ -2353,7 +2953,7 @@ function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
|
|
|
2353
2953
|
if (!ph) continue;
|
|
2354
2954
|
const blockTop = ph - (b.bbox.y + b.bbox.height);
|
|
2355
2955
|
const blockBottom = ph - b.bbox.y;
|
|
2356
|
-
const entry = { blockIdx: bi, page: b.pageNumber,
|
|
2956
|
+
const entry = { blockIdx: bi, page: b.pageNumber, text: b.text.trim() };
|
|
2357
2957
|
if (blockBottom <= ph * ZONE_RATIO) bottomEntries.push(entry);
|
|
2358
2958
|
else if (blockTop >= ph * (1 - ZONE_RATIO)) topEntries.push(entry);
|
|
2359
2959
|
}
|
|
@@ -2375,21 +2975,9 @@ function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
|
|
|
2375
2975
|
repeatedPatterns.add(p);
|
|
2376
2976
|
}
|
|
2377
2977
|
}
|
|
2378
|
-
const bucketPages = /* @__PURE__ */ new Map();
|
|
2379
|
-
for (const e of entries) {
|
|
2380
|
-
const bucket = Math.round(e.y / Y_BUCKET);
|
|
2381
|
-
const pages = bucketPages.get(bucket) || /* @__PURE__ */ new Set();
|
|
2382
|
-
pages.add(e.page);
|
|
2383
|
-
bucketPages.set(bucket, pages);
|
|
2384
|
-
}
|
|
2385
|
-
const repeatedBuckets = /* @__PURE__ */ new Set();
|
|
2386
|
-
for (const [b, pages] of bucketPages) {
|
|
2387
|
-
if (pages.size >= MIN_REPEAT) repeatedBuckets.add(b);
|
|
2388
|
-
}
|
|
2389
2978
|
for (const e of entries) {
|
|
2390
2979
|
const norm = e.text.replace(/\d+/g, "#");
|
|
2391
|
-
|
|
2392
|
-
if (repeatedPatterns.has(norm) || repeatedBuckets.has(bucket)) {
|
|
2980
|
+
if (repeatedPatterns.has(norm)) {
|
|
2393
2981
|
removeSet.add(e.blockIdx);
|
|
2394
2982
|
}
|
|
2395
2983
|
}
|
|
@@ -2548,7 +3136,11 @@ function formatMb(bytes) {
|
|
|
2548
3136
|
}
|
|
2549
3137
|
export {
|
|
2550
3138
|
cleanPdfText,
|
|
3139
|
+
detectKoreanListBlocks,
|
|
3140
|
+
detectTableCaptions,
|
|
2551
3141
|
extractPdfMetadataOnly,
|
|
2552
|
-
|
|
3142
|
+
mergeCrossPageTables,
|
|
3143
|
+
parsePdfDocument,
|
|
3144
|
+
removeHeaderFooterBlocks
|
|
2553
3145
|
};
|
|
2554
|
-
//# sourceMappingURL=parser-
|
|
3146
|
+
//# sourceMappingURL=parser-LZH7ZELV.js.map
|