kordoc 2.2.3 → 2.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +42 -3
- package/dist/chunk-JH5XLWJQ.js +457 -0
- package/dist/chunk-JH5XLWJQ.js.map +1 -0
- package/dist/chunk-MUOQXDZ4.cjs +33 -0
- package/dist/chunk-MUOQXDZ4.cjs.map +1 -0
- package/dist/chunk-OJ4QR33V.cjs +450 -0
- package/dist/chunk-OJ4QR33V.cjs.map +1 -0
- package/dist/{chunk-AIG7SDWU.js → chunk-RQWICKON.js} +964 -2732
- package/dist/chunk-RQWICKON.js.map +1 -0
- package/dist/chunk-SBVRCJFH.js +33 -0
- package/dist/chunk-SBVRCJFH.js.map +1 -0
- package/dist/chunk-UU2O6D3R.js +450 -0
- package/dist/chunk-UU2O6D3R.js.map +1 -0
- package/dist/cli.js +154 -7
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1095 -3324
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +98 -8
- package/dist/index.d.ts +98 -8
- package/dist/index.js +917 -3100
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +140 -14
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-3C7UGGEK.cjs +7 -0
- package/dist/page-range-3C7UGGEK.cjs.map +1 -0
- package/dist/page-range-H35FN3OQ.js +7 -0
- package/dist/page-range-H35FN3OQ.js.map +1 -0
- package/dist/parser-CYBX5MP4.cjs +2278 -0
- package/dist/parser-CYBX5MP4.cjs.map +1 -0
- package/dist/parser-OIRWPKIQ.js +2278 -0
- package/dist/parser-OIRWPKIQ.js.map +1 -0
- package/dist/parser-PXD73E4H.js +2279 -0
- package/dist/parser-PXD73E4H.js.map +1 -0
- package/dist/provider-WPIYEALY.js +37 -0
- package/dist/provider-WPIYEALY.js.map +1 -0
- package/dist/provider-YN2SSK4X.cjs +37 -0
- package/dist/provider-YN2SSK4X.cjs.map +1 -0
- package/dist/{watch-H672QAW2.js → watch-NSBABJ4A.js} +6 -4
- package/dist/{watch-H672QAW2.js.map → watch-NSBABJ4A.js.map} +1 -1
- package/package.json +1 -1
- package/dist/chunk-AIG7SDWU.js.map +0 -1
|
@@ -0,0 +1,2278 @@
|
|
|
1
|
+
// Transpiler-emitted CommonJS preamble (kept on one line exactly as generated):
//  - marks `exports` as an ES-module namespace via the `__esModule` flag;
//  - _interopRequireWildcard(obj): returns `obj` unchanged when it carries `__esModule`,
//    otherwise copies its own enumerable properties onto a new object and sets `default` to `obj`;
//  - _optionalChain(ops): evaluates a lowered optional-chaining expression. `ops[0]` is the
//    starting value, followed by (kind, fn) pairs; an 'optionalAccess'/'optionalCall' step
//    short-circuits to `undefined` when the running value is null/undefined, and call steps
//    invoke the value with the receiver of the preceding access as `this`;
//  - declares `_class` without initialising it (its use is outside this excerpt).
"use strict";Object.defineProperty(exports, "__esModule", {value: true}); function _interopRequireWildcard(obj) { if (obj && obj.__esModule) { return obj; } else { var newObj = {}; if (obj != null) { for (var key in obj) { if (Object.prototype.hasOwnProperty.call(obj, key)) { newObj[key] = obj[key]; } } } newObj.default = obj; return newObj; } } function _optionalChain(ops) { let lastAccessLHS = undefined; let value = ops[0]; let i = 1; while (i < ops.length) { const op = ops[i]; const fn = ops[i + 1]; i += 2; if ((op === 'optionalAccess' || op === 'optionalCall') && value == null) { return undefined; } if (op === 'access' || op === 'optionalAccess') { lastAccessLHS = value; value = fn(value); } else if (op === 'call' || op === 'optionalCall') { value = fn((...args) => value.call(lastAccessLHS, ...args)); lastAccessLHS = undefined; } } return value; } var _class;
|
|
2
|
+
|
|
3
|
+
|
|
4
|
+
|
|
5
|
+
|
|
6
|
+
|
|
7
|
+
|
|
8
|
+
|
|
9
|
+
var _chunkOJ4QR33Vcjs = require('./chunk-OJ4QR33V.cjs');
|
|
10
|
+
|
|
11
|
+
|
|
12
|
+
var _chunkMUOQXDZ4cjs = require('./chunk-MUOQXDZ4.cjs');
|
|
13
|
+
|
|
14
|
+
// src/pdf/line-detector.ts
|
|
15
|
+
var _pdfmjs = require('pdfjs-dist/legacy/build/pdf.mjs');
|
|
16
|
+
/** Largest |dy| (resp. |dx|) for which classifyAndAdd treats a segment as horizontal (resp. vertical). */
const ORIENTATION_TOL = 2;
/** Segments shorter than this are discarded by classifyAndAdd. */
const MIN_LINE_LENGTH = 15;
/** Lines whose recorded lineWidth exceeds this are dropped by preprocessLines. */
const MAX_LINE_WIDTH = 5;
/** Slack used when testing whether two lines touch or cross. */
const CONNECT_TOL = 5;
/** Amount by which a cell box is grown on every side when matching text items to cells. */
const CELL_PADDING = 2;
/** Minimum column width passed to enforceMinWidth by buildTableGrids. */
const MIN_COL_WIDTH = 15;
/** Minimum row height passed to enforceMinHeight by buildTableGrids. */
const MIN_ROW_HEIGHT = 6;
/** Multiplier applied to a vertex radius to derive merge and matching tolerances. */
const VERTEX_MERGE_FACTOR = 4;
/** Lower bound of the coordinate-clustering tolerance used by buildTableGrids. */
const MIN_COORD_MERGE_TOL = 8;
|
|
25
|
+
/**
 * Scans a pdf.js operator list and collects the straight segments of painted paths,
 * sorted into horizontal and vertical lines by classifyAndAdd.
 *
 * Two `constructPath` argument layouts are understood:
 *  - `args[0]` is an array of sub-operator codes and `args[1]` a flat coordinate list;
 *    the path is painted later by a separate stroke/fill/endPath operator;
 *  - otherwise `args[0]` is the paint operator to apply and `args[1][0]` holds draw-op
 *    codes interleaved with their coordinates (0 moveTo, 1 lineTo, 2 curveTo,
 *    3 quadraticCurveTo, 4 closePath).
 *
 * Curves never produce segments, but they do move the current point to the curve's
 * end point so that a following lineTo/closePath starts from the right place.
 * Fill operators are treated like strokes; endPath discards the pending path.
 *
 * @param {ArrayLike<number>} fnArray - Operator codes.
 * @param {ArrayLike<Array>} argsArray - Operator arguments, parallel to `fnArray`.
 * @returns {{horizontals: Array<object>, verticals: Array<object>}} Classified lines.
 */
function extractLines(fnArray, argsArray) {
  const horizontals = [];
  const verticals = [];
  let lineWidth = 1;
  let currentPath = [];
  let pathStartX = 0, pathStartY = 0;
  let curX = 0, curY = 0;

  // A thin rectangle is recorded as its centre line, anything else as its four edges.
  function pushRectangle(path, rx, ry, rw, rh) {
    if (Math.abs(rh) < ORIENTATION_TOL * 2) {
      path.push({ x1: rx, y1: ry + rh / 2, x2: rx + rw, y2: ry + rh / 2 });
    } else if (Math.abs(rw) < ORIENTATION_TOL * 2) {
      path.push({ x1: rx + rw / 2, y1: ry, x2: rx + rw / 2, y2: ry + rh });
    } else {
      path.push(
        { x1: rx, y1: ry, x2: rx + rw, y2: ry },
        { x1: rx + rw, y1: ry, x2: rx + rw, y2: ry + rh },
        { x1: rx + rw, y1: ry + rh, x2: rx, y2: ry + rh },
        { x1: rx, y1: ry + rh, x2: rx, y2: ry }
      );
    }
  }

  // Emits (isStroke) or drops (!isStroke) the pending segments, then resets the path.
  function flushPath(isStroke) {
    if (isStroke) {
      for (const seg of currentPath) {
        classifyAndAdd(seg, lineWidth, horizontals, verticals);
      }
    }
    currentPath = [];
  }

  // Closes the current sub-path with a segment back to its start, if not already there.
  function closeSubPath() {
    if (curX !== pathStartX || curY !== pathStartY) {
      currentPath.push({ x1: curX, y1: curY, x2: pathStartX, y2: pathStartY });
    }
    curX = pathStartX;
    curY = pathStartY;
  }

  function isPaintOp(op) {
    return op === _pdfmjs.OPS.stroke || op === _pdfmjs.OPS.closeStroke || op === _pdfmjs.OPS.fill || op === _pdfmjs.OPS.eoFill || op === _pdfmjs.OPS.fillStroke || op === _pdfmjs.OPS.eoFillStroke || op === _pdfmjs.OPS.closeFillStroke || op === _pdfmjs.OPS.closeEOFillStroke;
  }

  for (let i = 0; i < fnArray.length; i++) {
    const op = fnArray[i];
    const args = argsArray[i];
    switch (op) {
      case _pdfmjs.OPS.setLineWidth:
        lineWidth = args[0] || 1;
        break;
      case _pdfmjs.OPS.constructPath: {
        const arg0 = args[0];
        if (Array.isArray(arg0)) {
          // Layout 1: sub-operator codes + flat coordinates.
          const subOps = arg0;
          const coords = args[1];
          let ci = 0;
          for (const subOp of subOps) {
            if (subOp === _pdfmjs.OPS.moveTo) {
              curX = coords[ci++];
              curY = coords[ci++];
              pathStartX = curX;
              pathStartY = curY;
            } else if (subOp === _pdfmjs.OPS.lineTo) {
              const x2 = coords[ci++], y2 = coords[ci++];
              currentPath.push({ x1: curX, y1: curY, x2, y2 });
              curX = x2;
              curY = y2;
            } else if (subOp === _pdfmjs.OPS.rectangle) {
              const rx = coords[ci++], ry = coords[ci++];
              const rw = coords[ci++], rh = coords[ci++];
              pushRectangle(currentPath, rx, ry, rw, rh);
            } else if (subOp === _pdfmjs.OPS.closePath) {
              closeSubPath();
            } else if (subOp === _pdfmjs.OPS.curveTo) {
              // Six operands; the last pair is the end point of the curve.
              curX = coords[ci + 4];
              curY = coords[ci + 5];
              ci += 6;
            } else if (subOp === _pdfmjs.OPS.curveTo2 || subOp === _pdfmjs.OPS.curveTo3) {
              // Four operands; the last pair is the end point of the curve.
              curX = coords[ci + 2];
              curY = coords[ci + 3];
              ci += 4;
            }
          }
        } else {
          // Layout 2: paint operator + interleaved draw-ops and coordinates.
          const afterOp = arg0;
          const dataArr = args[1];
          const pathData = dataArr == null ? undefined : dataArr[0];
          if (pathData && typeof pathData === "object") {
            const len = Object.keys(pathData).length;
            let di = 0;
            while (di < len) {
              const drawOp = pathData[di++];
              if (drawOp === 0 /* moveTo */) {
                curX = pathData[di++];
                curY = pathData[di++];
                pathStartX = curX;
                pathStartY = curY;
              } else if (drawOp === 1 /* lineTo */) {
                const x2 = pathData[di++], y2 = pathData[di++];
                currentPath.push({ x1: curX, y1: curY, x2, y2 });
                curX = x2;
                curY = y2;
              } else if (drawOp === 2 /* curveTo */) {
                curX = pathData[di + 4];
                curY = pathData[di + 5];
                di += 6;
              } else if (drawOp === 3 /* quadraticCurveTo */) {
                curX = pathData[di + 2];
                curY = pathData[di + 3];
                di += 4;
              } else if (drawOp === 4 /* closePath */) {
                closeSubPath();
              } else {
                // Unknown draw-op: operand count is unknown, so stop decoding this path.
                break;
              }
            }
          }
          if (isPaintOp(afterOp)) {
            flushPath(true);
          } else if (afterOp === _pdfmjs.OPS.endPath) {
            flushPath(false);
          }
        }
        break;
      }
      case _pdfmjs.OPS.stroke:
      case _pdfmjs.OPS.closeStroke:
      case _pdfmjs.OPS.fill:
      case _pdfmjs.OPS.eoFill:
      case _pdfmjs.OPS.fillStroke:
      case _pdfmjs.OPS.eoFillStroke:
      case _pdfmjs.OPS.closeFillStroke:
      case _pdfmjs.OPS.closeEOFillStroke:
        flushPath(true);
        break;
      case _pdfmjs.OPS.endPath:
        flushPath(false);
        break;
    }
  }
  return { horizontals, verticals };
}
|
|
159
|
+
/**
 * Classifies one segment as horizontal or vertical and appends a normalised copy
 * (ordered end points, averaged cross coordinate) to the matching list.
 * Segments shorter than MIN_LINE_LENGTH, or not axis-aligned within ORIENTATION_TOL,
 * are ignored.
 *
 * @param {{x1:number,y1:number,x2:number,y2:number}} seg - Segment to classify.
 * @param {number} lineWidth - Width stored on the emitted line.
 * @param {Array<object>} horizontals - Receives horizontal lines.
 * @param {Array<object>} verticals - Receives vertical lines.
 */
function classifyAndAdd(seg, lineWidth, horizontals, verticals) {
  const spanX = Math.abs(seg.x2 - seg.x1);
  const spanY = Math.abs(seg.y2 - seg.y1);
  if (Math.sqrt(spanX * spanX + spanY * spanY) < MIN_LINE_LENGTH) {
    return;
  }
  if (spanY <= ORIENTATION_TOL) {
    const midY = (seg.y1 + seg.y2) / 2;
    horizontals.push({
      x1: Math.min(seg.x1, seg.x2),
      y1: midY,
      x2: Math.max(seg.x1, seg.x2),
      y2: midY,
      lineWidth
    });
    return;
  }
  if (spanX <= ORIENTATION_TOL) {
    const midX = (seg.x1 + seg.x2) / 2;
    verticals.push({
      x1: midX,
      y1: Math.min(seg.y1, seg.y2),
      x2: midX,
      y2: Math.max(seg.y1, seg.y2),
      lineWidth
    });
  }
}
|
|
176
|
+
/**
 * Drops lines wider than MAX_LINE_WIDTH and merges near-duplicate parallel lines.
 *
 * @param {Array<object>} horizontals - Horizontal lines.
 * @param {Array<object>} verticals - Vertical lines.
 * @returns {{horizontals: Array<object>, verticals: Array<object>}} Cleaned lines.
 */
function preprocessLines(horizontals, verticals) {
  const isThinEnough = (line) => line.lineWidth <= MAX_LINE_WIDTH;
  const thinHorizontals = horizontals.filter(isThinEnough);
  const thinVerticals = verticals.filter(isThinEnough);
  const mergedHorizontals = mergeParallelLines(thinHorizontals, "h");
  const mergedVerticals = mergeParallelLines(thinVerticals, "v");
  return { horizontals: mergedHorizontals, verticals: mergedVerticals };
}
|
|
183
|
+
/**
 * Merges parallel lines that lie within 3 units of each other across their axis and
 * overlap by more than 30% of the shorter one along it. A merged line spans both
 * inputs, sits at the average cross position and keeps the larger lineWidth.
 *
 * The input line objects are never modified: when there is more than one line the
 * result is built from copies. (With zero or one line the input array is returned.)
 *
 * @param {Array<object>} lines - Lines of a single orientation.
 * @param {string} dir - "h" for horizontal lines; any other value is handled as vertical.
 * @returns {Array<object>} Merged lines, ordered by cross position then start.
 */
function mergeParallelLines(lines, dir) {
  if (lines.length <= 1) return lines;
  const sorted = [...lines].sort((a, b) => {
    const posA = dir === "h" ? a.y1 : a.x1;
    const posB = dir === "h" ? b.y1 : b.x1;
    if (Math.abs(posA - posB) > 0.1) return posA - posB;
    return dir === "h" ? a.x1 - b.x1 : a.y1 - b.y1;
  });
  const MERGE_TOL = 3;
  // Copy before accumulating so merging never writes into caller-owned objects.
  const result = [{ ...sorted[0] }];
  for (let i = 1; i < sorted.length; i++) {
    const prev = result[result.length - 1];
    const curr = sorted[i];
    const prevPos = dir === "h" ? prev.y1 : prev.x1;
    const currPos = dir === "h" ? curr.y1 : curr.x1;
    if (Math.abs(prevPos - currPos) <= MERGE_TOL) {
      const prevStart = dir === "h" ? prev.x1 : prev.y1;
      const prevEnd = dir === "h" ? prev.x2 : prev.y2;
      const currStart = dir === "h" ? curr.x1 : curr.y1;
      const currEnd = dir === "h" ? curr.x2 : curr.y2;
      const overlap = Math.min(prevEnd, currEnd) - Math.max(prevStart, currStart);
      const minLen = Math.min(prevEnd - prevStart, currEnd - currStart);
      if (overlap > minLen * 0.3) {
        if (dir === "h") {
          prev.x1 = Math.min(prev.x1, curr.x1);
          prev.x2 = Math.max(prev.x2, curr.x2);
          prev.y1 = (prev.y1 + curr.y1) / 2;
          prev.y2 = prev.y1;
        } else {
          prev.y1 = Math.min(prev.y1, curr.y1);
          prev.y2 = Math.max(prev.y2, curr.y2);
          prev.x1 = (prev.x1 + curr.x1) / 2;
          prev.x2 = prev.x1;
        }
        prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
        continue;
      }
    }
    result.push({ ...curr });
  }
  return result;
}
|
|
225
|
+
/**
 * Removes lines that look like a page frame: a line within 5 units of coordinate 0
 * or of the page extent that also covers at least 90% of the page in its own direction.
 *
 * @param {Array<object>} horizontals - Horizontal lines.
 * @param {Array<object>} verticals - Vertical lines.
 * @param {number} pageWidth - Page width.
 * @param {number} pageHeight - Page height.
 * @returns {{horizontals: Array<object>, verticals: Array<object>}} Remaining lines.
 */
function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
  const margin = 5;
  const hugsEdge = (pos, extent) => Math.abs(pos) < margin || Math.abs(pos - extent) < margin;
  const isHorizontalFrame = (l) => hugsEdge(l.y1, pageHeight) && !(l.x2 - l.x1 < pageWidth * 0.9);
  const isVerticalFrame = (l) => hugsEdge(l.x1, pageWidth) && !(l.y2 - l.y1 < pageHeight * 0.9);
  return {
    horizontals: horizontals.filter((l) => !isHorizontalFrame(l)),
    verticals: verticals.filter((l) => !isVerticalFrame(l))
  };
}
|
|
236
|
+
/**
 * Produces one vertex for every horizontal/vertical pair that cross or nearly touch
 * (within CONNECT_TOL). The vertex sits at the vertical's x and the horizontal's y;
 * its radius is the larger of the two line widths, but at least 1.
 *
 * @param {Array<object>} horizontals - Horizontal lines.
 * @param {Array<object>} verticals - Vertical lines.
 * @returns {Array<{x:number,y:number,radius:number}>} Vertices, grouped by horizontal.
 */
function buildVertices(horizontals, verticals) {
  const slack = CONNECT_TOL;
  const meets = (h, v) => v.x1 >= h.x1 - slack && v.x1 <= h.x2 + slack && h.y1 >= v.y1 - slack && h.y1 <= v.y2 + slack;
  return horizontals.flatMap((h) =>
    verticals
      .filter((v) => meets(h, v))
      .map((v) => ({ x: v.x1, y: h.y1, radius: Math.max(h.lineWidth, v.lineWidth, 1) }))
  );
}
|
|
249
|
+
/**
 * Collapses vertices that lie close together into their centroid.
 * Each not-yet-consumed vertex acts as an anchor; every later vertex whose x and y both
 * lie within VERTEX_MERGE_FACTOR * max(radius so far, its radius) of the anchor joins it.
 *
 * @param {Array<{x:number,y:number,radius:number}>} vertices - Raw vertices.
 * @returns {Array<{x:number,y:number,radius:number}>} Merged vertices (the input array
 *   itself when it holds at most one vertex).
 */
function mergeVertices(vertices) {
  const total = vertices.length;
  if (total <= 1) return vertices;
  const consumed = new Set();
  const out = [];
  for (let i = 0; i < total; i += 1) {
    if (consumed.has(i)) continue;
    const anchor = vertices[i];
    let xTotal = anchor.x;
    let yTotal = anchor.y;
    let radius = anchor.radius;
    let members = 1;
    for (let j = i + 1; j < total; j += 1) {
      if (consumed.has(j)) continue;
      const other = vertices[j];
      const reach = VERTEX_MERGE_FACTOR * Math.max(radius, other.radius);
      const isNear = Math.abs(anchor.x - other.x) <= reach && Math.abs(anchor.y - other.y) <= reach;
      if (!isNear) continue;
      xTotal += other.x;
      yTotal += other.y;
      radius = Math.max(radius, other.radius);
      members += 1;
      consumed.add(j);
    }
    out.push({ x: xTotal / members, y: yTotal / members, radius });
  }
  return out;
}
|
|
273
|
+
/**
 * Derives table grids (row/column boundary coordinates) from ruling lines.
 *
 * Lines are split into connected groups; each group with at least two lines of each
 * orientation yields clustered row Ys (descending) and column Xs (ascending), taken
 * from its line positions plus the merged vertices inside its padded bounding box.
 * Boundaries closer than MIN_COL_WIDTH / MIN_ROW_HEIGHT are thinned out, and the
 * resulting grids are finally passed through mergeAdjacentGrids.
 *
 * @param {Array<object>} horizontals - Horizontal lines.
 * @param {Array<object>} verticals - Vertical lines.
 * @returns {Array<{rowYs:number[],colXs:number[],bbox:object,vertexRadius:number}>} Grids.
 */
function buildTableGrids(horizontals, verticals) {
  if (horizontals.length < 2 || verticals.length < 2) return [];
  const vertices = mergeVertices(buildVertices(horizontals, verticals));
  if (vertices.length < 4) return [];
  const largestRadius = (list) => list.reduce((max, v) => Math.max(max, v.radius), 1);
  const globalRadius = largestRadius(vertices);
  const taggedLines = [
    ...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
    ...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
  ];
  const smallest = (list, pick) => list.reduce((m, l) => (pick(l) < m ? pick(l) : m), Infinity);
  const largest = (list, pick) => list.reduce((m, l) => (pick(l) > m ? pick(l) : m), -Infinity);
  const grids = [];
  for (const group of groupConnectedLines(taggedLines)) {
    const hLines = group.filter((l) => l.type === "h");
    const vLines = group.filter((l) => l.type === "v");
    if (hLines.length < 2 || vLines.length < 2) continue;
    // Bounding box of the group's line positions, padded by the connection tolerance.
    const region = {
      x1: smallest(vLines, (l) => l.x1) - CONNECT_TOL,
      y1: smallest(hLines, (l) => l.y1) - CONNECT_TOL,
      x2: largest(vLines, (l) => l.x1) + CONNECT_TOL,
      y2: largest(hLines, (l) => l.y1) + CONNECT_TOL
    };
    const groupVertices = vertices.filter(
      (v) => v.x >= region.x1 && v.x <= region.x2 && v.y >= region.y1 && v.y <= region.y2
    );
    const groupRadius = groupVertices.length > 0 ? largestRadius(groupVertices) : globalRadius;
    const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
    const rowCandidates = hLines.map((l) => l.y1).concat(groupVertices.map((v) => v.y));
    const rowYs = clusterCoordinates(rowCandidates, coordMergeTol).sort((a, b) => b - a);
    const colCandidates = vLines.map((l) => l.x1).concat(groupVertices.map((v) => v.x));
    const colXs = clusterCoordinates(colCandidates, coordMergeTol).sort((a, b) => a - b);
    if (rowYs.length < 2 || colXs.length < 2) continue;
    const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
    const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
    if (validRowYs.length < 2 || validColXs.length < 2) continue;
    grids.push({
      rowYs: validRowYs,
      colXs: validColXs,
      bbox: {
        x1: validColXs[0],
        y1: validRowYs[validRowYs.length - 1],
        x2: validColXs[validColXs.length - 1],
        y2: validRowYs[0]
      },
      vertexRadius: groupRadius
    });
  }
  return mergeAdjacentGrids(grids);
}
|
|
333
|
+
/**
 * Thins out ascending column boundaries so that kept neighbours are at least
 * `minWidth` apart. The first and the last boundary are always kept.
 *
 * @param {number[]} colXs - Column boundaries in ascending order.
 * @param {number} minWidth - Minimum distance between kept boundaries.
 * @returns {number[]} Filtered boundaries (the input array when it has at most two entries).
 */
function enforceMinWidth(colXs, minWidth) {
  if (colXs.length <= 2) return colXs;
  const lastIndex = colXs.length - 1;
  const kept = [colXs[0]];
  colXs.slice(1).forEach((x, offset) => {
    const index = offset + 1;
    const tooNarrow = x - kept[kept.length - 1] < minWidth;
    if (!tooNarrow || index >= lastIndex) {
      kept.push(x);
    }
  });
  return kept;
}
|
|
345
|
+
/**
 * Thins out descending row boundaries so that kept neighbours are at least
 * `minHeight` apart. The first and the last boundary are always kept.
 *
 * @param {number[]} rowYs - Row boundaries in descending order.
 * @param {number} minHeight - Minimum distance between kept boundaries.
 * @returns {number[]} Filtered boundaries (the input array when it has at most two entries).
 */
function enforceMinHeight(rowYs, minHeight) {
  if (rowYs.length <= 2) return rowYs;
  const lastIndex = rowYs.length - 1;
  const kept = [rowYs[0]];
  rowYs.slice(1).forEach((y, offset) => {
    const index = offset + 1;
    const tooShort = kept[kept.length - 1] - y < minHeight;
    if (!tooShort || index >= lastIndex) {
      kept.push(y);
    }
  });
  return kept;
}
|
|
357
|
+
/**
 * Stitches together grids that are stacked directly on top of each other.
 * Grids are visited by descending `bbox.y2`; a grid is folded into the previous one
 * when both have the same number of column boundaries, every boundary pair matches
 * within a radius-derived tolerance, and the gap between them is between
 * -CONNECT_TOL and 20.
 *
 * @param {Array<object>} grids - Grids with `rowYs`, `colXs`, `bbox`, `vertexRadius`.
 * @returns {Array<object>} Merged grids (the input array when it has at most one grid).
 */
function mergeAdjacentGrids(grids) {
  if (grids.length <= 1) return grids;
  const canStack = (upper, lower) => {
    if (upper.colXs.length !== lower.colXs.length) return false;
    const colTol = Math.max(VERTEX_MERGE_FACTOR * Math.max(upper.vertexRadius, lower.vertexRadius), 6) * 3;
    const columnsAlign = upper.colXs.every((x, idx) => Math.abs(x - lower.colXs[idx]) <= colTol);
    const gap = upper.bbox.y1 - lower.bbox.y2;
    return columnsAlign && gap >= -CONNECT_TOL && gap <= 20;
  };
  const topDown = grids.slice().sort((a, b) => b.bbox.y2 - a.bbox.y2);
  const out = [];
  for (const grid of topDown) {
    const upper = out[out.length - 1];
    if (upper === undefined || !canStack(upper, grid)) {
      out.push(grid);
      continue;
    }
    const unionRows = Array.from(new Set(upper.rowYs.concat(grid.rowYs))).sort((a, b) => b - a);
    out[out.length - 1] = {
      rowYs: unionRows,
      colXs: upper.colXs,
      bbox: {
        x1: Math.min(upper.bbox.x1, grid.bbox.x1),
        y1: Math.min(upper.bbox.y1, grid.bbox.y1),
        x2: Math.max(upper.bbox.x2, grid.bbox.x2),
        y2: Math.max(upper.bbox.y2, grid.bbox.y2)
      },
      vertexRadius: Math.max(upper.vertexRadius, grid.vertexRadius)
    };
  }
  return out;
}
|
|
388
|
+
/**
 * Groups numbers into clusters and returns each cluster's mean, in ascending order.
 * Values are visited in ascending order; a value joins the latest cluster when it is
 * within `tolerance` of that cluster's running mean, otherwise it starts a new one.
 *
 * @param {number[]} values - Coordinates in any order.
 * @param {number} tolerance - Maximum distance from the running mean.
 * @returns {number[]} One mean per cluster.
 */
function clusterCoordinates(values, tolerance) {
  if (values.length === 0) return [];
  const ascending = values.slice().sort((a, b) => a - b);
  const buckets = [];
  for (const value of ascending) {
    const tail = buckets[buckets.length - 1];
    if (tail !== undefined && Math.abs(value - tail.sum / tail.count) <= tolerance) {
      tail.sum += value;
      tail.count += 1;
    } else {
      buckets.push({ sum: value, count: 1 });
    }
  }
  return buckets.map((bucket) => bucket.sum / bucket.count);
}
|
|
404
|
+
/**
 * Partitions lines into connected components, where two lines are connected when
 * linesIntersect reports true for them (directly or through other lines).
 * Implemented with a union-find over line indices.
 *
 * @param {Array<object>} lines - Lines tagged with a `type`.
 * @returns {Array<Array<object>>} Components; lines keep their input order inside each one.
 */
function groupConnectedLines(lines) {
  const count = lines.length;
  const parent = lines.map((_, idx) => idx);
  const rootOf = (node) => {
    let cursor = node;
    while (parent[cursor] !== cursor) {
      // Path halving: point the node at its grandparent while walking up.
      parent[cursor] = parent[parent[cursor]];
      cursor = parent[cursor];
    }
    return cursor;
  };
  for (let a = 0; a < count; a += 1) {
    for (let b = a + 1; b < count; b += 1) {
      if (!linesIntersect(lines[a], lines[b])) continue;
      const rootA = rootOf(a);
      const rootB = rootOf(b);
      if (rootA !== rootB) parent[rootA] = rootB;
    }
  }
  const buckets = new Map();
  for (let idx = 0; idx < count; idx += 1) {
    const root = rootOf(idx);
    const bucket = buckets.get(root);
    if (bucket) {
      bucket.push(lines[idx]);
    } else {
      buckets.set(root, [lines[idx]]);
    }
  }
  return Array.from(buckets.values());
}
|
|
432
|
+
/**
 * Tells whether two typed lines are connected, allowing CONNECT_TOL of slack.
 *  - Same orientation: their cross positions differ by at most the slack and their
 *    extents overlap or nearly touch.
 *  - Different orientation: the vertical's x falls within the horizontal's extent and
 *    the horizontal's y within the vertical's extent.
 *
 * @param {object} a - Line with `type` ("h" for horizontal; anything else is treated as vertical).
 * @param {object} b - Line with `type`.
 * @returns {boolean} True when the lines are considered connected.
 */
function linesIntersect(a, b) {
  if (a.type !== b.type) {
    const aIsHorizontal = a.type === "h";
    const h = aIsHorizontal ? a : b;
    const v = aIsHorizontal ? b : a;
    const slack = CONNECT_TOL;
    return v.x1 >= h.x1 - slack && v.x1 <= h.x2 + slack && h.y1 >= v.y1 - slack && h.y1 <= v.y2 + slack;
  }
  if (a.type === "h") {
    return Math.abs(a.y1 - b.y1) > CONNECT_TOL
      ? false
      : Math.min(a.x2, b.x2) >= Math.max(a.x1, b.x1) - CONNECT_TOL;
  }
  return Math.abs(a.x1 - b.x1) > CONNECT_TOL
    ? false
    : Math.min(a.y2, b.y2) >= Math.max(a.y1, b.y1) - CONNECT_TOL;
}
|
|
447
|
+
/**
 * Turns a grid into cells, merging neighbouring grid slots that are not separated by
 * a ruling line into one spanning cell.
 *
 * For each free slot (row-major order) the cell first grows to the right while the
 * vertical border on the slot's own row is missing, then grows downwards while no
 * horizontal border exists under any of its columns. Row 0 corresponds to `rowYs[0]`.
 *
 * @param {{rowYs:number[],colXs:number[],vertexRadius:number}} grid - Grid boundaries.
 * @param {Array<object>} horizontals - Horizontal lines.
 * @param {Array<object>} verticals - Vertical lines.
 * @returns {Array<{row:number,col:number,rowSpan:number,colSpan:number,bbox:object}>} Cells.
 */
function extractCells(grid, horizontals, verticals) {
  const { rowYs, colXs } = grid;
  const rowCount = rowYs.length - 1;
  const colCount = colXs.length - 1;
  if (rowCount <= 0 || colCount <= 0) return [];

  // vBorders[r][c]: is there a vertical line at column boundary c alongside row r?
  const vBorders = [];
  for (let r = 0; r < rowCount; r += 1) {
    const flags = [];
    for (let c = 0; c <= colCount; c += 1) {
      flags.push(hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius));
    }
    vBorders.push(flags);
  }
  // hBorders[r][c]: is there a horizontal line at row boundary r across column c?
  const hBorders = [];
  for (let r = 0; r <= rowCount; r += 1) {
    const flags = [];
    for (let c = 0; c < colCount; c += 1) {
      flags.push(hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius));
    }
    hBorders.push(flags);
  }

  const taken = Array.from({ length: rowCount }, () => new Array(colCount).fill(false));
  const cells = [];
  for (let r = 0; r < rowCount; r += 1) {
    for (let c = 0; c < colCount; c += 1) {
      if (taken[r][c]) continue;
      let colSpan = 1;
      while (c + colSpan < colCount && !vBorders[r][c + colSpan]) {
        colSpan += 1;
      }
      let rowSpan = 1;
      while (r + rowSpan < rowCount && !hBorders[r + rowSpan].slice(c, c + colSpan).some(Boolean)) {
        rowSpan += 1;
      }
      for (let dr = 0; dr < rowSpan; dr += 1) {
        taken[r + dr].fill(true, c, c + colSpan);
      }
      cells.push({
        row: r,
        col: c,
        rowSpan,
        colSpan,
        bbox: { x1: colXs[c], y1: rowYs[r + rowSpan], x2: colXs[c + colSpan], y2: rowYs[r] }
      });
    }
  }
  return cells;
}
|
|
516
|
+
/**
 * Checks whether some vertical line runs along x between two row boundaries.
 * A line qualifies when its x is within max(VERTEX_MERGE_FACTOR * vertexRadius, 4)
 * of `x` and it covers at least 75% of the span between `topY` and `botY`.
 *
 * @param {Array<object>} verticals - Vertical lines (`x1`, `y1` <= `y2`).
 * @param {number} x - Column boundary to test.
 * @param {number} topY - Upper row boundary (the larger y).
 * @param {number} botY - Lower row boundary (the smaller y).
 * @param {number} vertexRadius - Radius used to scale the x tolerance.
 * @returns {boolean} True when such a line exists.
 */
function hasVerticalLine(verticals, x, topY, botY, vertexRadius) {
  const xTol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
  const span = Math.abs(topY - botY);
  // A degenerate span can never be matched.
  if (span < 0.1) return false;
  const required = span * 0.75;
  return verticals.some((line) => {
    if (!(Math.abs(line.x1 - x) <= xTol)) return false;
    const covered = Math.min(line.y2, topY) - Math.max(line.y1, botY);
    return covered >= required;
  });
}
|
|
530
|
+
/**
 * Checks whether some horizontal line runs along y between two column boundaries.
 * A line qualifies when its y is within max(VERTEX_MERGE_FACTOR * vertexRadius, 4)
 * of `y` and it covers at least 75% of the span between `leftX` and `rightX`.
 *
 * @param {Array<object>} horizontals - Horizontal lines (`y1`, `x1` <= `x2`).
 * @param {number} y - Row boundary to test.
 * @param {number} leftX - Left column boundary.
 * @param {number} rightX - Right column boundary.
 * @param {number} vertexRadius - Radius used to scale the y tolerance.
 * @returns {boolean} True when such a line exists.
 */
function hasHorizontalLine(horizontals, y, leftX, rightX, vertexRadius) {
  const yTol = Math.max(VERTEX_MERGE_FACTOR * vertexRadius, 4);
  const span = Math.abs(rightX - leftX);
  // A degenerate span can never be matched.
  if (span < 0.1) return false;
  const required = span * 0.75;
  return horizontals.some((line) => {
    if (!(Math.abs(line.y1 - y) <= yTol)) return false;
    const covered = Math.min(line.x2, rightX) - Math.max(line.x1, leftX);
    return covered >= required;
  });
}
|
|
544
|
+
/**
 * Assigns each text item to the cell that contains the largest share of its box.
 * Cell boxes are grown by CELL_PADDING on every side; an item is only assigned when
 * more than 30% of its area (width x height, each at least 1) lies in the winning cell.
 * The item height is `item.h`, falling back to `item.fontSize` when `h` is falsy.
 *
 * @param {Array<object>} items - Text items with `x`, `y`, `w`, `h`/`fontSize`.
 * @param {Array<object>} cells - Cells with a `bbox`.
 * @returns {Map<object, Array<object>>} Every cell mapped to its (possibly empty) item list.
 */
function mapTextToCells(items, cells) {
  const byCell = new Map();
  cells.forEach((cell) => {
    byCell.set(cell, []);
  });
  for (const item of items) {
    const pad = CELL_PADDING;
    const height = item.h || item.fontSize;
    let winner = null;
    let winnerShare = 0;
    for (const cell of cells) {
      const box = cell.bbox;
      const left = Math.max(item.x, box.x1 - pad);
      const right = Math.min(item.x + item.w, box.x2 + pad);
      const low = Math.max(item.y, box.y1 - pad);
      const high = Math.min(item.y + height, box.y2 + pad);
      if (left >= right || low >= high) continue;
      const sharedArea = (right - left) * (high - low);
      const ownArea = Math.max(item.w, 1) * Math.max(height, 1);
      const share = sharedArea / ownArea;
      if (share > winnerShare) {
        winnerShare = share;
        winner = cell;
      }
    }
    if (winner && winnerShare > 0.3) {
      byCell.get(winner).push(item);
    }
  }
  return byCell;
}
|
|
573
|
+
/**
 * Renders the text items of one cell as a string.
 * Items are ordered by descending y then ascending x and split into visual lines
 * (an item stays on the current line when its y is within max(3, 0.6 * smaller font
 * size) of the line's first item). Within a line, neighbours are joined without a
 * space when they belong to an evenly spaced run, when the gap is below 15% of the
 * mean font size, or below 35% with Hangul on either side; otherwise with one space.
 * The lines are finally combined by mergeCellTextLines.
 *
 * @param {Array<object>} items - Text items with `text`, `x`, `y`, `w`, `fontSize`.
 * @returns {string} Cell text.
 */
function cellTextToString(items) {
  if (items.length === 0) return "";
  if (items.length === 1) return items[0].text;
  const ordered = items.slice().sort((a, b) => b.y - a.y || a.x - b.x);
  const visualLines = [];
  let openLine = [ordered[0]];
  let anchorY = ordered[0].y;
  for (const item of ordered.slice(1)) {
    const yTol = Math.max(3, Math.min(item.fontSize, openLine[0].fontSize) * 0.6);
    if (Math.abs(item.y - anchorY) <= yTol) {
      openLine.push(item);
      continue;
    }
    visualLines.push(openLine);
    openLine = [item];
    anchorY = item.y;
  }
  visualLines.push(openLine);
  const rendered = visualLines.map((line) => {
    const run = line.sort((a, b) => a.x - b.x);
    if (run.length === 1) return run[0].text;
    const evenSpaced = detectEvenSpacedItems(run);
    let text = run[0].text;
    for (let k = 1; k < run.length; k += 1) {
      const piece = run[k].text;
      if (evenSpaced[k]) {
        text += piece;
        continue;
      }
      const before = run[k - 1];
      const gap = run[k].x - (before.x + before.w);
      const meanSize = (run[k].fontSize + before.fontSize) / 2;
      const touchesHangul = /[가-힣]$/.test(text) || /^[가-힣]/.test(piece);
      const joinTight = gap < meanSize * 0.15 || (gap < meanSize * 0.35 && touchesHangul);
      text += joinTight ? piece : " " + piece;
    }
    return text;
  });
  return mergeCellTextLines(rendered);
}
|
|
617
|
+
/**
 * Flags items that belong to a run of single-character items (one Hangul syllable or
 * one digit each). A run is broken by any other item, or by a gap larger than
 * max(3 * fontSize, 30); runs of three or more items are handed to markEvenRun,
 * which decides whether to set the flags.
 *
 * @param {Array<object>} items - Items of one line, ordered by x.
 * @returns {boolean[]} One flag per item (all false for fewer than three items).
 */
function detectEvenSpacedItems(items) {
  const flags = new Array(items.length).fill(false);
  if (items.length < 3) return flags;
  let runStart = -1;
  // Hands the run [runStart, endExclusive) to markEvenRun when it is long enough.
  const closeRun = (endExclusive) => {
    if (runStart >= 0 && endExclusive - runStart >= 3) {
      markEvenRun(items, flags, runStart, endExclusive);
    }
  };
  items.forEach((item, i) => {
    const isSingleChar = /^[가-힣]{1}$/.test(item.text) || /^[\d]{1}$/.test(item.text);
    if (!isSingleChar) {
      closeRun(i);
      runStart = -1;
      return;
    }
    if (runStart < 0) {
      runStart = i;
      return;
    }
    const before = items[i - 1];
    const gap = item.x - (before.x + before.w);
    if (gap > Math.max(item.fontSize * 3, 30)) {
      closeRun(i);
      runStart = i;
    }
  });
  closeRun(items.length);
  return flags;
}
|
|
646
|
+
/**
 * Marks items `start + 1 .. end - 1` in `result` when the run is evenly spaced.
 * Only positive gaps between neighbours are considered and at least two are needed.
 * With fs = items[start].fontSize the run qualifies when the smallest gap is at least
 * 0.1 * fs, the largest at most 3 * fs, and largest / max(smallest, 0.1) is at most 3.
 *
 * @param {Array<object>} items - Items ordered by x, with `x`, `w`, `fontSize`.
 * @param {boolean[]} result - Flag array, modified in place.
 * @param {number} start - Index of the first item of the run.
 * @param {number} end - Index one past the last item of the run.
 */
function markEvenRun(items, result, start, end) {
  const positiveGaps = [];
  for (let i = start + 1; i < end; i += 1) {
    const before = items[i - 1];
    const gap = items[i].x - (before.x + before.w);
    if (gap > 0) positiveGaps.push(gap);
  }
  if (positiveGaps.length < 2) return;
  const narrowest = positiveGaps.reduce((m, g) => (g < m ? g : m), Infinity);
  const widest = positiveGaps.reduce((m, g) => (g > m ? g : m), -Infinity);
  const fontSize = items[start].fontSize;
  const isEven = narrowest >= fontSize * 0.1 && widest <= fontSize * 3 && widest / Math.max(narrowest, 0.1) <= 3;
  if (!isEven) return;
  for (let i = start + 1; i < end; i += 1) {
    result[i] = true;
  }
}
|
|
665
|
+
/**
 * Joins the text lines of a cell with "\n", first gluing a line onto the previous
 * one (without a separator) when it looks like a wrapped continuation:
 *  1. previous ends with Hangul and the line is at most 8 Hangul characters;
 *  2. the trimmed line is at most 3 characters and starts with `)`, `]`, `%` or `}`;
 *  3. the trimmed previous line ends with `,` or `(` and the trimmed line is at most 15 characters;
 *  4. previous ends with a digit or comma and the trimmed line is at most 10 characters
 *     of digits/commas, optionally closed by `)` or `]`.
 *
 * @param {string[]} textLines - Lines in reading order.
 * @returns {string} Combined text ("" for an empty list).
 */
function mergeCellTextLines(textLines) {
  if (textLines.length <= 1) return textLines[0] || "";
  // Returns the text to append to `prev`, or null when `curr` must stay on its own line.
  const continuationOf = (prev, curr) => {
    if (/[가-힣]$/.test(prev) && /^[가-힣]+$/.test(curr) && curr.length <= 8 && !curr.includes(" ")) {
      return curr;
    }
    const trimmed = curr.trim();
    if (trimmed.length <= 3 && /^[)\]%}]/.test(trimmed)) return trimmed;
    if (/[,(]$/.test(prev.trim()) && trimmed.length <= 15) return trimmed;
    if (/[\d,]$/.test(prev) && /^[\d,]+[)\]]?$/.test(trimmed) && trimmed.length <= 10) return trimmed;
    return null;
  };
  const out = [textLines[0]];
  for (const line of textLines.slice(1)) {
    const lastIndex = out.length - 1;
    const tail = continuationOf(out[lastIndex], line);
    if (tail === null) {
      out.push(line);
    } else {
      out[lastIndex] = out[lastIndex] + tail;
    }
  }
  return out.join("\n");
}
|
|
685
|
+
|
|
686
|
+
// src/pdf/cluster-detector.ts
|
|
687
|
+
// Tuning values for the cluster-based (borderless) table detector.
// NOTE(review): only MIN_ROWS and MIN_COLS are used in the part of the file visible
// here; the roles of the other values are inferred from their names — confirm against
// the helpers that read them.
/** Presumably the y tolerance used when grouping items into rows — TODO confirm. */
const Y_TOL = 3;
/** Presumably the x tolerance used when clustering column positions — TODO confirm. */
const COL_CLUSTER_TOL = 15;
/** Minimum number of rows: detectClusterTables gives up below this many rows. */
const MIN_ROWS = 3;
/** Minimum number of columns required by detectClusterTables. */
const MIN_COLS = 2;
/** Presumably a multiplier for deciding when a horizontal gap is suspicious — TODO confirm. */
const MIN_GAP_FACTOR = 2;
/** Presumably an absolute gap threshold paired with MIN_GAP_FACTOR — TODO confirm. */
const MIN_GAP_ABSOLUTE = 20;
/** Presumably the minimum share of filled rows for a column to count — TODO confirm. */
const MIN_COL_FILL_RATIO = 0.4;
|
|
694
|
+
/**
 * Detects tables from text-item layout alone, in two stages:
 *  1. header-driven: when detectHeaderRow finds a header, the rows from the header
 *     downwards are merged and split into regions by findTableRegionsByHeader;
 *  2. gap-driven fallback (only when stage 1 produced nothing): rows flagged by
 *     hasSuspiciousGaps provide column clusters, and findTableRegions splits all rows.
 * Every table built has its `usedItems` expanded through the origin map returned by
 * mergeEvenSpacedClusters.
 *
 * @param {Array<object>} items - Text items of one page.
 * @param {number} pageNum - Page number forwarded to buildClusterTable.
 * @returns {Array<object>} Detected tables (empty when there are too few items or rows).
 */
function detectClusterTables(items, pageNum) {
  if (items.length < MIN_ROWS * MIN_COLS) return [];
  const { merged, originMap } = mergeEvenSpacedClusters(items);
  const rows = groupByBaseline(merged);
  if (rows.length < MIN_ROWS) return [];
  const results = [];
  // Records a successfully built table; a falsy table is ignored.
  const keep = (table) => {
    if (!table) return;
    expandUsedItems(table.usedItems, originMap);
    results.push(table);
  };

  const headerResult = detectHeaderRow(rows);
  if (headerResult) {
    const { columns, headerIdx } = headerResult;
    const headerItems = [...rows[headerIdx].items].sort((a, b) => a.x - b.x);
    const mergedRows = mergeMultiLineRows(rows.slice(headerIdx), columns);
    for (const region of findTableRegionsByHeader(mergedRows, columns, headerItems)) {
      keep(buildClusterTable(region.rows, columns, pageNum));
    }
  }
  if (results.length !== 0) return results;

  const suspiciousRows = rows.filter((row) => hasSuspiciousGaps(row));
  if (suspiciousRows.length < MIN_ROWS) return results;
  const columns = extractColumnClusters(suspiciousRows);
  if (columns.length < MIN_COLS) return results;
  for (const region of findTableRegions(rows, columns)) {
    const mergedRows = mergeMultiLineRows(region.rows, columns);
    keep(buildClusterTable(mergedRows, columns, pageNum));
  }
  return results;
}
|
|
735
|
+
/**
 * Fuses runs of evenly spaced single-glyph items (one Hangul syllable or one
 * digit each) on the same baseline into a single synthetic item.
 *
 * A run needs at least 3 glyphs, every gap between neighbours within
 * [0.1, 3] x the right glyph's font size, a positive smallest gap, and a
 * widest/narrowest gap ratio of at most 3.
 *
 * @param {Array<Object>} items - Text items with text, x, y, w, h, fontSize, fontName.
 * @returns {{merged: Array<Object>, originMap: Map<Object, Array<Object>>}}
 *   `merged` lists the items row by row (left to right) with runs replaced by
 *   synthetic items; `originMap` maps each synthetic item to its source glyphs.
 */
function mergeEvenSpacedClusters(items) {
  const SINGLE_GLYPH = /^[가-힣\d]$/;
  const originMap = /* @__PURE__ */ new Map();
  const merged = [];
  for (const baselineRow of groupByBaseline(items)) {
    const line = Array.from(baselineRow.items).sort((l, r) => l.x - r.x);
    // Horizontal space between item k and its left neighbour.
    const gapBefore = (k) => line[k].x - (line[k - 1].x + line[k - 1].w);
    let pos = 0;
    while (pos < line.length) {
      // `stop` ends up one past the last glyph of the candidate run.
      let stop = pos;
      if (SINGLE_GLYPH.test(line[pos].text)) {
        stop = pos + 1;
        for (; stop < line.length && SINGLE_GLYPH.test(line[stop].text); stop++) {
          const gap = gapBefore(stop);
          const size = line[stop].fontSize;
          if (gap < size * 0.1 || gap > size * 3) break;
        }
      }
      if (stop - pos >= 3) {
        let narrowest = Infinity;
        let widest = -Infinity;
        for (let k = pos + 1; k < stop; k++) {
          const gap = gapBefore(k);
          if (gap < narrowest) narrowest = gap;
          if (gap > widest) widest = gap;
        }
        if (narrowest > 0 && widest / narrowest <= 3) {
          const run = line.slice(pos, stop);
          const first = run[0];
          const last = run[run.length - 1];
          const fused = {
            text: run.map((glyph) => glyph.text).join(""),
            x: first.x,
            y: first.y,
            w: last.x + last.w - first.x,
            h: first.h,
            fontSize: first.fontSize,
            fontName: first.fontName
          };
          originMap.set(fused, run);
          merged.push(fused);
          pos = stop;
          continue;
        }
      }
      // Not part of a fusable run: keep the item untouched.
      merged.push(line[pos]);
      pos += 1;
    }
  }
  return { merged, originMap };
}
|
|
787
|
+
/**
 * Adds to `usedItems` the source items behind every entry that has a record
 * in `originMap`. The set is mutated in place; existing entries are kept.
 *
 * @param {Set<Object>} usedItems - Items consumed by a table.
 * @param {Map<Object, Iterable<Object>>} originMap - Synthetic item -> source items.
 * @returns {void}
 */
function expandUsedItems(usedItems, originMap) {
  // Snapshot first so that newly added sources are not themselves visited.
  const extras = Array.from(usedItems).flatMap((entry) => {
    const sources = originMap.get(entry);
    return sources ? Array.from(sources) : [];
  });
  extras.forEach((source) => usedItems.add(source));
}
|
|
795
|
+
/**
 * Finds the first row that looks like a table header.
 *
 * A header candidate has between MIN_COLS and 6 items, no item longer than
 * 8 characters, at least one Hangul character, no item starting with a
 * bullet-like marker, spans at least 40% of the horizontal extent of all
 * items, and contains a gap of at least 2.5 x its average font size. It is
 * accepted when at least MIN_ROWS of the following rows hit MIN_COLS or more
 * of its column ranges (scanning stops after MIN_ROWS + 2 hits).
 *
 * @param {Array<{y: number, items: Array<Object>}>} rows - Baseline rows, top to bottom.
 * @returns {{columns: Array<{x: number, count: number}>, headerIdx: number}|null}
 *   Column anchors (header item x positions) and the header's row index, or null.
 */
function detectHeaderRow(rows) {
  let leftEdge = Infinity;
  let rightEdge = -Infinity;
  let seen = 0;
  for (const row of rows) {
    for (const it of row.items) {
      seen += 1;
      if (it.x < leftEdge) leftEdge = it.x;
      const end = it.x + it.w;
      if (end > rightEdge) rightEdge = end;
    }
  }
  if (seen === 0) return null;
  const pageSpan = rightEdge - leftEdge;
  if (pageSpan <= 0) return null;

  const bulletStart = /^[□■○●·※▶▷◆◇\-]/;
  for (const [idx, candidate] of rows.entries()) {
    const cells = candidate.items;
    if (cells.length < MIN_COLS || cells.length > 6) continue;
    if (cells.some((c) => c.text.length > 8)) continue;
    if (cells.every((c) => !/[가-힣]/.test(c.text))) continue;
    if (cells.some((c) => bulletStart.test(c.text))) continue;

    const ordered = Array.from(cells).sort((l, r) => l.x - r.x);
    const tail = ordered[ordered.length - 1];
    const xSpan = tail.x + tail.w - ordered[0].x;
    if (xSpan / pageSpan < 0.4) continue;

    const meanSize = ordered.reduce((sum, c) => sum + c.fontSize, 0) / ordered.length;
    const hasWideGap = ordered.some(
      (c, k) => k > 0 && c.x - (ordered[k - 1].x + ordered[k - 1].w) >= meanSize * 2.5
    );
    if (!hasWideGap) continue;

    const columns = ordered.map((c) => ({ x: c.x, count: 0 }));
    let hits = 0;
    for (let below = idx + 1; below < rows.length && hits < MIN_ROWS + 2; below++) {
      if (countMatchedColumnsRange(rows[below], columns, ordered) >= MIN_COLS) hits++;
    }
    if (hits >= MIN_ROWS) return { columns, headerIdx: idx };
  }
  return null;
}
|
|
836
|
+
/**
 * Folds continuation lines into the row above them.
 *
 * A row is treated as a continuation when it lies within 1.8 x the average
 * font size (12 if there are no items) of the previous output row, has at
 * most 2 items, and either has a single item or matches fewer than MIN_COLS
 * columns. The merged row keeps the upper row's y.
 *
 * @param {Array<{y: number, items: Array<Object>}>} rows - Rows in reading order.
 * @param {Array<{x: number}>} columns - Column anchors for countMatchedColumns.
 * @returns {Array<{y: number, items: Array<Object>}>} New row list (the input
 *   array itself when it has 0 or 1 rows).
 */
function mergeMultiLineRows(rows, columns) {
  if (rows.length <= 1) return rows;

  let sizeSum = 0;
  let sizeCount = 0;
  for (const row of rows) {
    for (const it of row.items) {
      sizeSum += it.fontSize;
      sizeCount += 1;
    }
  }
  const typicalSize = sizeCount > 0 ? sizeSum / sizeCount : 12;
  const maxLineGap = typicalSize * 1.8;

  const [head, ...rest] = rows;
  const out = [head];
  for (const row of rest) {
    const anchor = out[out.length - 1];
    const closeEnough = Math.abs(anchor.y - row.y) < maxLineGap;
    const sparse = row.items.length <= 2;
    const columnHits = countMatchedColumns(row, columns);
    const continuation = columnHits < MIN_COLS || row.items.length === 1;
    if (closeEnough && sparse && continuation) {
      out[out.length - 1] = { y: anchor.y, items: anchor.items.concat(row.items) };
    } else {
      out.push(row);
    }
  }
  return out;
}
|
|
857
|
+
/**
 * Groups items into rows by y position.
 *
 * Items are ordered by descending y (ties by ascending x); an item joins the
 * current row while its y is within Y_TOL of the row's first item, otherwise
 * it opens a new row.
 *
 * @param {Array<{x: number, y: number}>} items - Text items.
 * @returns {Array<{y: number, items: Array<Object>}>} Rows in descending-y
 *   order; each row's y is the y of its first item.
 */
function groupByBaseline(items) {
  if (items.length === 0) return [];
  const ordered = Array.from(items).sort((l, r) => r.y - l.y || l.x - r.x);
  const rows = [];
  let open = null;
  for (const it of ordered) {
    if (open !== null && Math.abs(it.y - open.y) <= Y_TOL) {
      open.items.push(it);
    } else {
      open = { y: it.y, items: [it] };
      rows.push(open);
    }
  }
  return rows;
}
|
|
875
|
+
/**
 * Tells whether a row contains a horizontal gap wide enough to suggest
 * column separation.
 *
 * The threshold is max(average font size * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE).
 * Rows with fewer than 2 items never qualify, nor do 2-item rows whose
 * right-hand item has more than 20 characters.
 *
 * @param {{items: Array<{x: number, w: number, text: string, fontSize: number}>}} row
 * @returns {boolean}
 */
function hasSuspiciousGaps(row) {
  const count = row.items.length;
  if (count < 2) return false;
  const ordered = Array.from(row.items).sort((l, r) => l.x - r.x);
  if (count === 2 && ordered[1].text.length > 20) return false;
  const meanSize = ordered.reduce((sum, it) => sum + it.fontSize, 0) / count;
  const threshold = Math.max(meanSize * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);
  return ordered.some(
    (it, k) => k > 0 && it.x - (ordered[k - 1].x + ordered[k - 1].w) >= threshold
  );
}
|
|
887
|
+
/**
 * Derives column anchors by clustering the left x positions of all items.
 *
 * Sorted x values are chained into one cluster while consecutive values are
 * at most COL_CLUSTER_TOL apart. Each cluster is summarised by its rounded
 * mean x; clusters with fewer than max(2, floor(rows.length *
 * MIN_COL_FILL_RATIO)) members are dropped.
 *
 * @param {Array<{items: Array<{x: number}>}>} rows
 * @returns {Array<{x: number, count: number}>} Clusters in ascending x order.
 */
function extractColumnClusters(rows) {
  const xs = [];
  for (const row of rows) {
    for (const it of row.items) xs.push(it.x);
  }
  if (xs.length === 0) return [];
  xs.sort((a, b) => a - b);

  const summarise = (bucket) => ({
    x: Math.round(bucket.reduce((sum, v) => sum + v, 0) / bucket.length),
    count: bucket.length
  });

  const clusters = [];
  let bucket = [xs[0]];
  for (const x of xs.slice(1)) {
    if (x - bucket[bucket.length - 1] > COL_CLUSTER_TOL) {
      clusters.push(summarise(bucket));
      bucket = [x];
    } else {
      bucket.push(x);
    }
  }
  clusters.push(summarise(bucket));

  const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
  return clusters.filter((cluster) => cluster.count >= minCount).sort((l, r) => l.x - r.x);
}
|
|
907
|
+
/**
 * Cuts a row sequence into table regions using the header's column ranges.
 *
 * Rows hitting at least MIN_COLS header ranges extend the open region. A
 * non-matching row is still tolerated inside an open region when it has at
 * most 2 items or when the previous row matched; anything else closes the
 * region. On close, trailing non-matching rows are trimmed and the region is
 * kept only if at least MIN_ROWS rows remain. The row that closes a region
 * is not carried over into the next one.
 *
 * @param {Array<{items: Array<Object>}>} allRows - Candidate rows, header first.
 * @param {Array<Object>} columns - Passed through to countMatchedColumnsRange.
 * @param {Array<{x: number, w: number}>} headerItems - Header items, left to right.
 * @returns {Array<{rows: Array<Object>}>}
 */
function findTableRegionsByHeader(allRows, columns, headerItems) {
  const regions = [];
  const fits = (row) => countMatchedColumnsRange(row, columns, headerItems) >= MIN_COLS;
  let pending = [];
  let misses = 0;

  // Closes the open region: trim tolerated rows off the end, keep if large enough.
  const flush = () => {
    while (pending.length > 0 && !fits(pending[pending.length - 1])) {
      pending.pop();
    }
    if (pending.length >= MIN_ROWS) regions.push({ rows: pending });
    pending = [];
    misses = 0;
  };

  for (const row of allRows) {
    if (fits(row)) {
      pending.push(row);
      misses = 0;
      continue;
    }
    const tolerated = pending.length > 0 && (row.items.length <= 2 || misses === 0);
    if (tolerated) {
      pending.push(row);
      misses += 1;
    } else {
      flush();
    }
  }
  flush();
  return regions;
}
|
|
942
|
+
/**
 * Cuts a row sequence into table regions using column anchors.
 *
 * Rows matching at least MIN_COLS columns extend the open region; a
 * single-item row is absorbed only when a region is already open; any other
 * row closes the region. Regions with fewer than MIN_ROWS rows are dropped.
 *
 * @param {Array<{items: Array<Object>}>} allRows
 * @param {Array<{x: number}>} columns - Column anchors for countMatchedColumns.
 * @returns {Array<{rows: Array<Object>}>}
 */
function findTableRegions(allRows, columns) {
  const regions = [];
  let pending = [];
  const flush = () => {
    if (pending.length >= MIN_ROWS) regions.push({ rows: pending });
    pending = [];
  };
  for (const row of allRows) {
    if (countMatchedColumns(row, columns) >= MIN_COLS) {
      pending.push(row);
    } else if (row.items.length !== 1) {
      flush();
    } else if (pending.length > 0) {
      pending.push(row);
    }
  }
  flush();
  return regions;
}
|
|
965
|
+
/**
 * Counts how many distinct columns are hit by the row's items. An item hits
 * the first column whose anchor x lies within 2 x COL_CLUSTER_TOL of the
 * item's x.
 *
 * @param {{items: Array<{x: number}>}} row
 * @param {Array<{x: number}>} columns
 * @returns {number} Number of distinct columns hit.
 */
function countMatchedColumns(row, columns) {
  const reach = COL_CLUSTER_TOL * 2;
  const hit = /* @__PURE__ */ new Set();
  for (const it of row.items) {
    const idx = columns.findIndex((col) => Math.abs(it.x - col.x) <= reach);
    if (idx !== -1) hit.add(idx);
  }
  return hit.size;
}
|
|
977
|
+
/**
 * Counts how many header column ranges are hit by the row's items.
 *
 * Each header item owns the half-open x range between the midpoints to its
 * neighbours (midpoint of the left neighbour's right edge and this item's
 * x). The first range starts at 0 and the last one is unbounded on the
 * right, so items with negative x hit nothing.
 *
 * @param {{items: Array<{x: number}>}} row
 * @param {Array<Object>} columns - Not used by this function.
 * @param {Array<{x: number, w: number}>} headerItems - Header items, left to right.
 * @returns {number} Number of distinct ranges hit.
 */
function countMatchedColumnsRange(row, columns, headerItems) {
  const lastIdx = headerItems.length - 1;
  const bands = headerItems.map((header, k) => ({
    left: k === 0 ? 0 : (headerItems[k - 1].x + headerItems[k - 1].w + header.x) / 2,
    right: k === lastIdx ? Infinity : (header.x + header.w + headerItems[k + 1].x) / 2
  }));
  const hit = /* @__PURE__ */ new Set();
  for (const it of row.items) {
    const k = bands.findIndex((band) => it.x >= band.left && it.x < band.right);
    if (k >= 0) hit.add(k);
  }
  return hit.size;
}
|
|
995
|
+
/**
 * Splits the items of one row into groups at its widest gaps and assigns
 * each group to a column.
 *
 * Steps:
 *  1. sort items by x and measure the gap before each item;
 *  2. keep gaps of at least 12 (or 2.5 x the median gap when the row has more
 *     than numCols + 1 items), take the numCols - 1 widest and cut there;
 *  3. pair groups with columns greedily by ascending distance between the
 *     group's horizontal centre and the column's x, one group per column;
 *  4. any group left over goes to its nearest column.
 *
 * @param {Array<{x: number, w: number}>} items - Items of a single row.
 * @param {Array<{x: number}>} columns - Column anchors.
 * @param {number} numCols - Number of columns to assign to.
 * @returns {Array<{col: number, items: Array<Object>}>} Assignments in the
 *   order they were made (greedy matches first, then leftovers).
 */
function assignRowItems(items, columns, numCols) {
  if (items.length === 0) return [];
  const ordered = Array.from(items).sort((l, r) => l.x - r.x);
  const columnXs = columns.map((col) => col.x);

  // gaps[k] describes the space in front of ordered[k + 1].
  const gaps = ordered.slice(1).map((it, k) => {
    const before = ordered[k];
    return { idx: k + 1, size: it.x - (before.x + before.w) };
  });
  const ascendingSizes = gaps.map((gap) => gap.size).sort((a, b) => a - b);
  const medianGap = ascendingSizes.length > 0 ? ascendingSizes[Math.floor(ascendingSizes.length / 2)] : 0;
  const gapThreshold = ordered.length <= numCols + 1 ? 12 : Math.max(medianGap * 2.5, 12);

  const cutPoints = gaps
    .filter((gap) => gap.size >= gapThreshold)
    .sort((l, r) => r.size - l.size)
    .slice(0, numCols - 1)
    .map((gap) => gap.idx)
    .sort((a, b) => a - b);

  // Consecutive cut points delimit the groups; the last group runs to the end.
  const groups = [0, ...cutPoints].map((start, k, starts) => ordered.slice(start, starts[k + 1]));

  const centers = groups.map((group) => {
    let lo = Infinity;
    let hi = -Infinity;
    for (const it of group) {
      if (it.x < lo) lo = it.x;
      const end = it.x + it.w;
      if (end > hi) hi = end;
    }
    return (lo + hi) / 2;
  });

  const pairs = [];
  centers.forEach((center, gi) => {
    for (let ci = 0; ci < numCols; ci++) {
      pairs.push({ gi, ci, dist: Math.abs(center - columnXs[ci]) });
    }
  });
  pairs.sort((l, r) => l.dist - r.dist);

  const result = [];
  const placedGroups = /* @__PURE__ */ new Set();
  const takenCols = /* @__PURE__ */ new Set();
  for (const { gi, ci } of pairs) {
    if (placedGroups.has(gi) || takenCols.has(ci)) continue;
    result.push({ col: ci, items: groups[gi] });
    placedGroups.add(gi);
    takenCols.add(ci);
  }

  // Groups that found no free column share the closest one.
  groups.forEach((group, gi) => {
    if (placedGroups.has(gi)) return;
    let bestCol = 0;
    let bestDist = Infinity;
    for (let ci = 0; ci < numCols; ci++) {
      const dist = Math.abs(centers[gi] - columnXs[ci]);
      if (dist < bestDist) {
        bestDist = dist;
        bestCol = ci;
      }
    }
    result.push({ col: bestCol, items: group });
  });
  return result;
}
|
|
1053
|
+
/**
 * Builds a table from rows and column anchors.
 *
 * Pipeline:
 *  1. create a numRows x numCols grid of empty cells;
 *  2. fill it: a single-item row becomes one cell spanning all columns,
 *     other rows are distributed with assignRowItems;
 *  3. reject (null) when more than half of the rows are empty or any column
 *     has no value at all;
 *  4. bottom-up, fold rows whose only content sits in one non-first column
 *     (and does not start with a bullet-like marker) into the nearest
 *     non-empty row above;
 *  5. top-down, for rows with content only in the first and last column,
 *     fold a following row that has an empty first column into them;
 *  6. drop empty rows and reject when fewer than MIN_ROWS remain.
 *
 * @param {Array<{items: Array<Object>}>} rows - Rows of the region.
 * @param {Array<{x: number}>} columns - Column anchors.
 * @param {number} pageNum - Page number recorded in the bbox.
 * @returns {{table: Object, bbox: Object, usedItems: Set<Object>}|null}
 *   The table (rows, cols, cells, hasHeader), the bounding box over all row
 *   items, and the set of items placed into cells; null when rejected.
 */
function buildClusterTable(rows, columns, pageNum) {
  const numCols = columns.length;
  const numRows = rows.length;
  if (numRows < MIN_ROWS || numCols < MIN_COLS) return null;

  const filled = (cell) => Boolean(cell.text.trim());
  const appendText = (base, extra) => (base ? base + " " + extra : extra);
  const wipe = (gridRow) => {
    for (const cell of gridRow) cell.text = "";
  };

  const cells = [];
  for (let r = 0; r < numRows; r++) {
    const gridRow = [];
    for (let c = 0; c < numCols; c++) gridRow.push({ text: "", colSpan: 1, rowSpan: 1 });
    cells.push(gridRow);
  }

  const usedItems = /* @__PURE__ */ new Set();
  rows.forEach((row, r) => {
    if (row.items.length === 1 && numCols > 1) {
      // A lone item is rendered as a full-width cell.
      const only = row.items[0];
      cells[r][0] = { text: only.text, colSpan: numCols, rowSpan: 1 };
      usedItems.add(only);
      return;
    }
    for (const { col, items } of assignRowItems(row.items, columns, numCols)) {
      const cell = cells[r][col];
      cell.text = appendText(cell.text, items.map((it) => it.text).join(" "));
      items.forEach((it) => usedItems.add(it));
    }
  });

  const blankRowCount = cells.filter((gridRow) => gridRow.every((cell) => cell.text === "")).length;
  if (blankRowCount > numRows * 0.5) return null;
  for (let c = 0; c < numCols; c++) {
    if (cells.every((gridRow) => gridRow[c].text === "")) return null;
  }

  // Pass 1 (bottom-up): fold single-cell continuation rows into the row above.
  const bulletLead = /^[○●▶\-·]/;
  for (let r = numRows - 1; r >= 1; r--) {
    const occupied = cells[r].filter(filled);
    if (occupied.length !== 1 || cells[r][0].text.trim() !== "") continue;
    if (bulletLead.test(occupied[0].text.trim())) continue;
    let target = r - 1;
    while (target >= 0 && !cells[target].some(filled)) target--;
    if (target < 0) continue;
    for (let c = 0; c < numCols; c++) {
      const addition = cells[r][c].text.trim();
      if (addition) cells[target][c].text = appendText(cells[target][c].text.trim(), addition);
    }
    wipe(cells[r]);
  }

  // Pass 2 (top-down): rows filled only at both ends absorb a following row
  // that has no first-column content.
  for (let r = 0; r + 1 < cells.length; r++) {
    const current = cells[r];
    const hasFirst = filled(current[0]);
    const hasLast = numCols > 1 && filled(current[numCols - 1]);
    const middleBlank = current.slice(1, numCols - 1).every((cell) => !filled(cell));
    if (!(hasFirst && hasLast && middleBlank)) continue;
    const following = cells[r + 1];
    if (filled(following[0]) || !following.some(filled)) continue;
    for (let c = 1; c < numCols; c++) {
      const addition = following[c].text.trim();
      if (addition) current[c].text = appendText(current[c].text.trim(), addition);
    }
    wipe(following);
  }

  const keptCells = cells.filter((gridRow) => gridRow.some(filled));
  const keptCount = keptCells.length;
  if (keptCount < MIN_ROWS) return null;

  let minX = Infinity;
  let minY = Infinity;
  let maxX = -Infinity;
  let maxY = -Infinity;
  for (const row of rows) {
    for (const it of row.items) {
      if (it.x < minX) minX = it.x;
      if (it.y < minY) minY = it.y;
      if (it.x + it.w > maxX) maxX = it.x + it.w;
      // Items without a positive height fall back to their font size.
      const height = it.h > 0 ? it.h : it.fontSize;
      if (it.y + height > maxY) maxY = it.y + height;
    }
  }

  return {
    table: {
      rows: keptCount,
      cols: numCols,
      cells: keptCells,
      hasHeader: keptCount > 1
    },
    bbox: { page: pageNum, x: minX, y: minY, width: maxX - minX, height: maxY - minY },
    usedItems
  };
}
|
|
1144
|
+
|
|
1145
|
+
// src/pdf/polyfill.ts
|
|
1146
|
+
var _pdfworkermjs = require('pdfjs-dist/legacy/build/pdf.worker.mjs'); var pdfjsWorker = _interopRequireWildcard(_pdfworkermjs);
|
|
1147
|
+
// Minimal stand-ins for browser globals, installed only when they are
// missing. NOTE(review): presumably pdf.js touches DOMMatrix / Path2D when
// loaded outside a browser — confirm against the pdfjs-dist version in use.
var g = globalThis;
if (typeof g.DOMMatrix === "undefined") {
  // Stub that only stores a 6-element matrix in `m` (identity by default).
  _class = class DOMMatrix {
    __init() {
      this.m = [1, 0, 0, 1, 0, 0];
    }
    constructor(init) {
      _class.prototype.__init.call(this);
      if (init) this.m = init;
    }
  };
  g.DOMMatrix = _class;
}
if (typeof g.Path2D === "undefined") {
  // Empty placeholder class; it provides no drawing behaviour.
  g.Path2D = class Path2D {
  };
}
// Expose the bundled worker module on the global object.
g.pdfjsWorker = pdfjsWorker;
|
|
1161
|
+
|
|
1162
|
+
// src/pdf/parser.ts
|
|
1163
|
+
|
|
1164
|
+
// No worker script URL is configured.
// NOTE(review): presumably pdf.js then uses the worker module exposed on
// globalThis.pdfjsWorker — confirm.
_pdfmjs.GlobalWorkerOptions.workerSrc = "";
// Upper bound on the number of pages parsePdfDocument will walk.
var MAX_PAGES = 5000;
// Limit for accumulated block text, compared against 2 bytes per UTF-16 unit.
var MAX_TOTAL_TEXT = 100 * 1024 * 1024;
// Milliseconds loadPdfWithTimeout waits before giving up on loading.
var PDF_LOAD_TIMEOUT_MS = 30000;
|
|
1168
|
+
/**
 * Loads a PDF with pdf.js and fails if loading takes longer than
 * PDF_LOAD_TIMEOUT_MS.
 *
 * On timeout the loading task is destroyed on a best-effort basis: a
 * synchronous throw or an asynchronous rejection from `destroy()` is ignored
 * so that the timeout error is always delivered and no unhandled rejection
 * is left behind.
 *
 * @param {ArrayBuffer|ArrayLike<number>} buffer - Raw PDF bytes.
 * @returns {Promise<Object>} The loaded pdf.js document.
 * @throws {KordocError} When loading does not finish in time.
 * @throws Whatever the pdf.js loading task rejects with.
 */
async function loadPdfWithTimeout(buffer) {
  const loadingTask = _pdfmjs.getDocument.call(void 0, {
    data: new Uint8Array(buffer),
    useSystemFonts: true,
    disableFontFace: true,
    isEvalSupported: false
  });
  let timer;
  const timeout = new Promise((_, reject) => {
    timer = setTimeout(() => {
      try {
        // destroy() may return a promise; swallow its rejection explicitly.
        Promise.resolve(loadingTask.destroy()).catch(() => {
        });
      } catch (destroyErr) {
        // Cleanup failure must not prevent the timeout from being reported.
      }
      reject(new (0, _chunkOJ4QR33Vcjs.KordocError)("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
    }, PDF_LOAD_TIMEOUT_MS);
  });
  try {
    return await Promise.race([loadingTask.promise, timeout]);
  } finally {
    if (timer !== void 0) clearTimeout(timer);
  }
}
|
|
1190
|
+
async function parsePdfDocument(buffer, options) {
|
|
1191
|
+
const doc = await loadPdfWithTimeout(buffer);
|
|
1192
|
+
try {
|
|
1193
|
+
const pageCount = doc.numPages;
|
|
1194
|
+
if (pageCount === 0) throw new (0, _chunkOJ4QR33Vcjs.KordocError)("PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
|
|
1195
|
+
const metadata = { pageCount };
|
|
1196
|
+
await extractPdfMetadata(doc, metadata);
|
|
1197
|
+
const blocks = [];
|
|
1198
|
+
const warnings = [];
|
|
1199
|
+
let totalChars = 0;
|
|
1200
|
+
let totalTextBytes = 0;
|
|
1201
|
+
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
1202
|
+
const pageFilter = _optionalChain([options, 'optionalAccess', _10 => _10.pages]) ? _chunkMUOQXDZ4cjs.parsePageRange.call(void 0, options.pages, effectivePageCount) : null;
|
|
1203
|
+
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
1204
|
+
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
1205
|
+
const pageHeights = /* @__PURE__ */ new Map();
|
|
1206
|
+
let parsedPages = 0;
|
|
1207
|
+
for (let i = 1; i <= effectivePageCount; i++) {
|
|
1208
|
+
if (pageFilter && !pageFilter.has(i)) continue;
|
|
1209
|
+
try {
|
|
1210
|
+
const page = await doc.getPage(i);
|
|
1211
|
+
const tc = await page.getTextContent();
|
|
1212
|
+
const viewport = page.getViewport({ scale: 1 });
|
|
1213
|
+
pageHeights.set(i, viewport.height);
|
|
1214
|
+
const rawItems = tc.items;
|
|
1215
|
+
const items = normalizeItems(rawItems);
|
|
1216
|
+
const { visible, hiddenCount } = filterHiddenText(items, viewport.width, viewport.height);
|
|
1217
|
+
if (hiddenCount > 0) {
|
|
1218
|
+
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
1219
|
+
}
|
|
1220
|
+
for (const item of visible) {
|
|
1221
|
+
if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
|
|
1222
|
+
}
|
|
1223
|
+
const opList = await page.getOperatorList();
|
|
1224
|
+
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
1225
|
+
for (const b of pageBlocks) blocks.push(b);
|
|
1226
|
+
for (const b of pageBlocks) {
|
|
1227
|
+
const t = b.text || "";
|
|
1228
|
+
totalChars += t.replace(/\s/g, "").length;
|
|
1229
|
+
totalTextBytes += t.length * 2;
|
|
1230
|
+
}
|
|
1231
|
+
if (totalTextBytes > MAX_TOTAL_TEXT) throw new (0, _chunkOJ4QR33Vcjs.KordocError)("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
|
|
1232
|
+
parsedPages++;
|
|
1233
|
+
_optionalChain([options, 'optionalAccess', _11 => _11.onProgress, 'optionalCall', _12 => _12(parsedPages, totalTarget)]);
|
|
1234
|
+
} catch (pageErr) {
|
|
1235
|
+
if (pageErr instanceof _chunkOJ4QR33Vcjs.KordocError) throw pageErr;
|
|
1236
|
+
warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
1237
|
+
}
|
|
1238
|
+
}
|
|
1239
|
+
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
1240
|
+
if (totalChars / Math.max(parsedPageCount, 1) < 10) {
|
|
1241
|
+
if (_optionalChain([options, 'optionalAccess', _13 => _13.ocr])) {
|
|
1242
|
+
try {
|
|
1243
|
+
const { ocrPages } = await Promise.resolve().then(() => _interopRequireWildcard(require("./provider-YN2SSK4X.cjs")));
|
|
1244
|
+
const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
|
|
1245
|
+
if (ocrBlocks.length > 0) {
|
|
1246
|
+
const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
|
|
1247
|
+
return { markdown: ocrMarkdown, blocks: ocrBlocks, metadata, warnings, isImageBased: true };
|
|
1248
|
+
}
|
|
1249
|
+
} catch (e) {
|
|
1250
|
+
}
|
|
1251
|
+
}
|
|
1252
|
+
throw Object.assign(new (0, _chunkOJ4QR33Vcjs.KordocError)(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
|
|
1253
|
+
}
|
|
1254
|
+
if (_optionalChain([options, 'optionalAccess', _14 => _14.removeHeaderFooter]) !== false && parsedPageCount >= 3) {
|
|
1255
|
+
const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
|
|
1256
|
+
for (let ri = removed.length - 1; ri >= 0; ri--) {
|
|
1257
|
+
blocks.splice(removed[ri], 1);
|
|
1258
|
+
}
|
|
1259
|
+
}
|
|
1260
|
+
const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
|
|
1261
|
+
if (medianFontSize > 0) {
|
|
1262
|
+
detectHeadings(blocks, medianFontSize);
|
|
1263
|
+
}
|
|
1264
|
+
detectMarkerHeadings(blocks);
|
|
1265
|
+
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
1266
|
+
let markdown = cleanPdfText(_chunkOJ4QR33Vcjs.blocksToMarkdown.call(void 0, blocks));
|
|
1267
|
+
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
1268
|
+
} finally {
|
|
1269
|
+
await doc.destroy().catch(() => {
|
|
1270
|
+
});
|
|
1271
|
+
}
|
|
1272
|
+
}
|
|
1273
|
+
/**
 * Copies document information from pdf.js into `metadata` (mutated in place).
 *
 * Title, Author, Creator and Subject are copied (trimmed) to title, author,
 * creator and description when they are non-blank strings; Keywords is split
 * on `,`/`;` into a list of non-empty trimmed entries; CreationDate and
 * ModDate are converted with parsePdfDate. Any error is ignored, leaving
 * whatever had been written before it occurred.
 *
 * @param {{getMetadata: Function}} doc - pdf.js document.
 * @param {Object} metadata - Target object.
 * @returns {Promise<void>}
 */
async function extractPdfMetadata(doc, metadata) {
  // Trimmed value for non-blank strings, null for everything else.
  const nonBlank = (value) => (typeof value === "string" && value.trim() ? value.trim() : null);
  const directFields = [
    ["Title", "title"],
    ["Author", "author"],
    ["Creator", "creator"],
    ["Subject", "description"]
  ];
  try {
    const result = await doc.getMetadata();
    if (!result || !result.info) return;
    const info = result.info;
    for (const [source, target] of directFields) {
      const value = nonBlank(info[source]);
      if (value !== null) metadata[target] = value;
    }
    if (nonBlank(info.Keywords) !== null) {
      metadata.keywords = info.Keywords.split(/[,;]/).map((k) => k.trim()).filter(Boolean);
    }
    if (typeof info.CreationDate === "string") metadata.createdAt = parsePdfDate(info.CreationDate);
    if (typeof info.ModDate === "string") metadata.modifiedAt = parsePdfDate(info.ModDate);
  } catch (e2) {
    // Metadata is optional: failures here are ignored.
  }
}
|
|
1290
|
+
/**
 * Converts a PDF date string (`D:YYYYMMDDHHmmSS...`) to
 * `YYYY-MM-DDTHH:mm:ss`.
 *
 * Only the year is required; missing month/day default to "01" and missing
 * time parts to "00". Anything after the seconds (such as a UTC offset) is
 * not included in the result.
 *
 * @param {string} dateStr
 * @returns {string|undefined} The formatted date, or undefined when no
 *   `D:` followed by four digits is found.
 */
function parsePdfDate(dateStr) {
  const parts = dateStr.match(/D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?/);
  if (!parts) return void 0;
  // Capture group k, or the fallback when the group did not participate.
  const pick = (k, fallback) => (parts[k] === undefined ? fallback : parts[k]);
  const date = [parts[1], pick(2, "01"), pick(3, "01")].join("-");
  const time = [pick(4, "00"), pick(5, "00"), pick(6, "00")].join(":");
  return `${date}T${time}`;
}
|
|
1296
|
+
/**
 * Loads a PDF only to read its page count and document information.
 *
 * @param {ArrayBuffer|ArrayLike<number>} buffer - Raw PDF bytes.
 * @returns {Promise<Object>} Metadata with `pageCount` plus whatever
 *   extractPdfMetadata adds.
 */
async function extractPdfMetadataOnly(buffer) {
  const pdf = await loadPdfWithTimeout(buffer);
  const ignoreCleanupError = () => {
  };
  try {
    const summary = { pageCount: pdf.numPages };
    await extractPdfMetadata(pdf, summary);
    return summary;
  } finally {
    // The document is released whether or not extraction succeeded.
    await pdf.destroy().catch(ignoreCleanupError);
  }
}
|
|
1307
|
+
/**
 * Separates visible text items from hidden ones.
 *
 * An item counts as hidden when its `isHidden` flag is truthy or when its
 * x/y lies outside the page by more than 10% of the larger page dimension.
 *
 * @param {Array<{x: number, y: number, isHidden?: boolean}>} items
 * @param {number} pageWidth
 * @param {number} pageHeight
 * @returns {{visible: Array<Object>, hiddenCount: number}} Visible items in
 *   their original order and the number of items removed.
 */
function filterHiddenText(items, pageWidth, pageHeight) {
  const slack = Math.max(pageWidth, pageHeight) * 0.1;
  const offPage = (it) =>
    it.x < -slack || it.x > pageWidth + slack || it.y < -slack || it.y > pageHeight + slack;
  const all = Array.from(items);
  const visible = all.filter((it) => !it.isHidden && !offPage(it));
  return { visible, hiddenCount: all.length - visible.length };
}
|
|
1324
|
+
/**
 * Computes the median font size from a size -> occurrence-count histogram.
 *
 * Sizes are walked in ascending order and the first size whose cumulative
 * count exceeds floor(total / 2) is returned.
 *
 * @param {Map<number, number>} freq - Font size -> number of items.
 * @returns {number} The median size, or 0 for an empty histogram.
 */
function computeMedianFontSizeFromFreq(freq) {
  if (freq.size === 0) return 0;
  let total = 0;
  freq.forEach((count) => {
    total += count;
  });
  const ascending = Array.from(freq).sort(([sizeA], [sizeB]) => sizeA - sizeB);
  const half = Math.floor(total / 2);
  let seen = 0;
  const hit = ascending.find(([, count]) => (seen += count) > half);
  // Fallback for histograms whose counts never exceed the midpoint.
  return hit ? hit[0] : ascending[ascending.length - 1][0];
}
|
|
1337
|
+
/**
 * Promotes paragraphs to headings based on font size relative to the median.
 *
 * Only paragraphs with text and a truthy `style.fontSize` are considered;
 * blank text, text longer than 200 characters and purely numeric text are
 * skipped. The ratio fontSize / medianFontSize is compared against
 * HEADING_RATIO_H1/H2/H3 to choose level 1-3. Promoted blocks are mutated in
 * place: type, level, and text (trimmed and passed through
 * collapseEvenSpacing).
 *
 * @param {Array<Object>} blocks - Blocks to update in place.
 * @param {number} medianFontSize - Reference font size.
 * @returns {void}
 */
function detectHeadings(blocks, medianFontSize) {
  for (const candidate of blocks) {
    if (candidate.type !== "paragraph" || !candidate.text) continue;
    if (candidate.style == null || !candidate.style.fontSize) continue;

    const trimmed = candidate.text.trim();
    const unsuitable = trimmed.length === 0 || trimmed.length > 200 || /^\d+$/.test(trimmed);
    if (unsuitable) continue;

    const ratio = candidate.style.fontSize / medianFontSize;
    const level = ratio >= _chunkOJ4QR33Vcjs.HEADING_RATIO_H1 ? 1
      : ratio >= _chunkOJ4QR33Vcjs.HEADING_RATIO_H2 ? 2
      : ratio >= _chunkOJ4QR33Vcjs.HEADING_RATIO_H3 ? 3
      : 0;
    if (level === 0) continue;

    candidate.type = "heading";
    candidate.level = level;
    candidate.text = collapseEvenSpacing(trimmed);
  }
}
|
|
1355
|
+
/**
 * Removes letter-spacing blanks from text such as "제 1 장".
 *
 * When the text has at least 3 space-separated tokens and 70% or more of
 * them are single characters, every space is removed. Otherwise only runs
 * of a Hangul syllable followed by two or more " <Hangul or digit>" pairs
 * (not adjacent to further Hangul) are collapsed.
 *
 * @param {string} text
 * @returns {string}
 */
function collapseEvenSpacing(text) {
  const tokens = text.split(" ");
  const singles = tokens.reduce((n, token) => (token.length === 1 ? n + 1 : n), 0);
  const mostlySingles = tokens.length >= 3 && singles / tokens.length >= 0.7;
  if (mostlySingles) return tokens.join("");
  const spacedRun = /(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g;
  return text.replace(spacedRun, (run) => run.split(" ").join(""));
}
|
|
1366
|
+
/**
 * Decides whether a detected table should be rendered as plain text.
 *
 * Small tables (at most 3x3) are demoted when 30% or more of their cells are
 * blank, or when their text contains a bullet-like marker or an
 * angle-bracket tag. Tables with more than 200 characters of text are
 * otherwise kept. The remaining rules demote tables of up to 3 rows
 * containing a marker, tables of up to 2 rows that are more than half
 * blank, and single-row tables without a 2-digit number.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {boolean} true when the table should be demoted.
 */
function shouldDemoteTable(table) {
  const texts = [];
  for (const row of table.cells) {
    for (const cell of row) {
      const trimmed = cell.text.trim();
      if (trimmed) texts.push(trimmed);
    }
  }
  const joined = texts.join(" ");
  const cellTotal = table.rows * table.cols;
  const blankTotal = cellTotal - texts.length;

  const isSmall = table.rows <= 3 && table.cols <= 3;
  if (isSmall) {
    if (blankTotal >= cellTotal * 0.3) return true;
    if (/[□■◆○●▶ㅇ]/.test(joined) || /<[^>]+>/.test(joined)) return true;
  }
  if (joined.length > 200) return false;
  if (table.rows <= 3 && /[□■◆○●▶]/.test(joined)) return true;
  if (table.rows <= 2 && blankTotal > cellTotal * 0.5) return true;
  return table.rows === 1 && !/\d{2,}/.test(joined);
}
|
|
1384
|
+
/**
 * Flattens a table into newline-separated text, one line per non-empty row.
 *
 * Blank cells are dropped. In a two-column table a row with both cells filled
 * is written as "key : value"; every other row is its cell texts joined by a
 * single space.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 * @returns {string} The flattened text ("" when every row is blank).
 */
function demoteTableToText(table) {
  const out = [];
  for (let rowIdx = 0; rowIdx < table.rows; rowIdx += 1) {
    const texts = [];
    for (const cell of table.cells[rowIdx]) {
      const trimmed = cell.text.trim();
      if (trimmed) texts.push(trimmed);
    }
    if (texts.length === 0) continue;
    const isKeyValueRow = table.cols === 2 && texts.length === 2;
    out.push(isKeyValueRow ? `${texts[0]} : ${texts[1]}` : texts.join(" "));
  }
  return out.join("\n");
}
|
|
1397
|
+
/**
 * Promotes heading-like paragraphs to headings, mutating the blocks in place.
 *
 * - A paragraph shorter than 50 characters that starts with one of □■◆◇▶
 *   followed (after optional whitespace) by Hangul becomes a level-4 heading.
 * - A paragraph that is only 2-6 Hangul syllables and has a style font size
 *   becomes a level-3 heading when its predecessor is missing or is a
 *   table/heading/separator, or its successor is missing, a table/heading, or
 *   a paragraph starting with one of □■◆○●.
 *
 * @param {Array<object>} blocks - Page blocks; `type` and `level` are updated in place.
 * @returns {void}
 */
function detectMarkerHeadings(blocks) {
  const bulletHeading = /^[□■◆◇▶]\s*[가-힣]/;
  const shortHangulLabel = /^[가-힣]{2,6}$/;
  const isTableOrHeading = (b) => b.type === "table" || b.type === "heading";

  for (const [idx, block] of blocks.entries()) {
    if (block.type !== "paragraph" || !block.text) continue;
    const trimmed = block.text.trim();

    if (trimmed.length < 50 && bulletHeading.test(trimmed)) {
      block.type = "heading";
      block.level = 4;
      continue;
    }

    const hasFontSize = block.style != null && block.style.fontSize;
    if (!shortHangulLabel.test(trimmed) || !hasFontSize) continue;

    // Blocks promoted earlier in this pass already count as headings here.
    const before = blocks[idx - 1];
    const after = blocks[idx + 1];
    const beforeIsStructural = !before || isTableOrHeading(before) || before.type === "separator";
    const afterIsStructural =
      !after ||
      isTableOrHeading(after) ||
      (after.type === "paragraph" && after.text && /^[□■◆○●]/.test(after.text.trim()));
    if (beforeIsStructural || afterIsStructural) {
      block.type = "heading";
      block.level = 3;
    }
  }
}
|
|
1419
|
+
// Recursion limit for xyCutOrder: once `depth` reaches this value the current
// items are returned as a single group instead of being split further.
var MAX_XYCUT_DEPTH = 50;
|
|
1420
|
+
/**
 * Recursively partitions items into ordered groups using alternating cuts.
 *
 * A horizontal cut (by y) is tried first; items with `y` above the cut come
 * before those at or below it. If no usable horizontal cut exists, a vertical
 * cut is tried, assigning items by the x coordinate of their centre, left
 * before right. Recursion stops for two or fewer items, at MAX_XYCUT_DEPTH, or
 * when neither cut separates the items.
 *
 * @param {Array<{x: number, y: number, w: number, h: number}>} items
 * @param {number} gapThreshold - Minimum gap a cut has to exceed.
 * @param {number} [depth=0] - Current recursion depth.
 * @returns {Array<Array<object>>} Groups in reading order.
 */
function xyCutOrder(items, gapThreshold, depth = 0) {
  if (items.length === 0) return [];
  if (items.length <= 2 || depth >= MAX_XYCUT_DEPTH) return [items];

  const region = computeRegion(items);
  const recurse = (subset) => xyCutOrder(subset, gapThreshold, depth + 1);
  // A cut is only usable when it really separates the items into two non-empty parts.
  const separates = (first, second) => first.length > 0 && second.length > 0 && first.length < items.length;

  const cutY = findYSplit(items, region, gapThreshold);
  if (cutY !== null) {
    const above = [];
    const below = [];
    for (const it of items) {
      if (it.y > cutY) above.push(it);
      else if (it.y <= cutY) below.push(it);
    }
    if (separates(above, below)) return [...recurse(above), ...recurse(below)];
  }

  const cutX = findXSplit(items, region, gapThreshold);
  if (cutX !== null) {
    const leftSide = [];
    const rightSide = [];
    for (const it of items) {
      const centre = it.x + it.w / 2;
      if (centre < cutX) leftSide.push(it);
      else if (centre >= cutX) rightSide.push(it);
    }
    if (separates(leftSide, rightSide)) return [...recurse(leftSide), ...recurse(rightSide)];
  }

  return [items];
}
|
|
1442
|
+
/**
 * Computes the bounding extents of a set of items.
 *
 * @param {Array<{x: number, y: number, w: number, h: number}>} items
 * @returns {{items: Array<object>, minX: number, minY: number, maxX: number, maxY: number}}
 *   The input array plus its extents. For an empty input the minimums stay at
 *   Infinity and the maximums at -Infinity.
 */
function computeRegion(items) {
  const extent = { minX: Infinity, minY: Infinity, maxX: -Infinity, maxY: -Infinity };
  for (const { x, y, w, h } of items) {
    const right = x + w;
    const top = y + h;
    if (x < extent.minX) extent.minX = x;
    if (y < extent.minY) extent.minY = y;
    if (right > extent.maxX) extent.maxX = right;
    if (top > extent.maxY) extent.maxY = top;
  }
  return { items, minX: extent.minX, minY: extent.minY, maxX: extent.maxX, maxY: extent.maxY };
}
|
|
1452
|
+
/**
 * Finds the y coordinate of the widest vertical gap between consecutive items.
 *
 * Items are ordered by descending `y`. For each neighbouring pair the gap is
 * `(upper.y - upper.h) - lower.y`; the largest gap that strictly exceeds
 * `gapThreshold` wins and the cut is placed at its midpoint.
 *
 * @param {Array<{y: number, h: number}>} items
 * @param {object} _region - Unused.
 * @param {number} gapThreshold - Gap that has to be exceeded.
 * @returns {number|null} The cut position, or null when no gap is wide enough.
 */
function findYSplit(items, _region, gapThreshold) {
  const topDown = [...items].sort((p, q) => q.y - p.y);
  let widestGap = gapThreshold;
  let cut = null;
  let upper = topDown[0];
  for (const lower of topDown.slice(1)) {
    const upperEdge = upper.y - upper.h;
    const gap = upperEdge - lower.y;
    if (gap > widestGap) {
      widestGap = gap;
      cut = (upperEdge + lower.y) / 2;
    }
    upper = lower;
  }
  return cut;
}
|
|
1467
|
+
/**
 * Finds the x coordinate of the widest horizontal gap between consecutive items.
 *
 * Items are ordered by ascending `x`. For each neighbouring pair the gap is
 * `next.x - (prev.x + prev.w)`; the largest gap that strictly exceeds
 * `gapThreshold` wins and the cut is placed at its midpoint.
 *
 * @param {Array<{x: number, w: number}>} items
 * @param {object} _region - Unused.
 * @param {number} gapThreshold - Gap that has to be exceeded.
 * @returns {number|null} The cut position, or null when no gap is wide enough.
 */
function findXSplit(items, _region, gapThreshold) {
  const leftToRight = [...items].sort((p, q) => p.x - q.x);
  let widestGap = gapThreshold;
  let cut = null;
  let previous = leftToRight[0];
  for (const next of leftToRight.slice(1)) {
    const previousRight = previous.x + previous.w;
    const gap = next.x - previousRight;
    if (gap > widestGap) {
      widestGap = gap;
      cut = (previousRight + next.x) / 2;
    }
    previous = next;
  }
  return cut;
}
|
|
1482
|
+
/**
 * Extracts the blocks of one page, using drawn ruling lines for table
 * detection when they form at least one grid.
 *
 * Lines are extracted from the operator list, passed through
 * filterPageBorderLines and preprocessLines, and turned into grids. With
 * grids, extraction is grid-based; without, it falls back to text-only
 * extraction.
 *
 * @param {Array<object>} items - Normalised text items of the page.
 * @param {number} pageNum - Page number stamped onto the produced blocks.
 * @param {{fnArray: Array, argsArray: Array}} opList - Page operator list.
 * @param {number} pageWidth
 * @param {number} pageHeight
 * @returns {Array<object>} The page's blocks ([] when there are no items).
 */
function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeight) {
  if (items.length === 0) return [];
  const rawLines = extractLines(opList.fnArray, opList.argsArray);
  const withoutBorders = filterPageBorderLines(rawLines.horizontals, rawLines.verticals, pageWidth, pageHeight);
  const cleaned = preprocessLines(withoutBorders.horizontals, withoutBorders.verticals);
  const tableGrids = buildTableGrids(cleaned.horizontals, cleaned.verticals);
  return tableGrids.length > 0
    ? extractBlocksWithGrids(items, pageNum, tableGrids, cleaned.horizontals, cleaned.verticals)
    : extractPageBlocksFallback(items, pageNum);
}
|
|
1493
|
+
/**
 * Builds the blocks of a page that has ruling-line grids: one table (or
 * demoted paragraph) per usable grid, plus blocks for the text outside grids.
 *
 * Fix over the previous version: a grid's cells are resolved BEFORE any text
 * item is claimed for it. Previously items inside the grid's bbox were marked
 * as used first, so a grid for which extractCells returned nothing silently
 * removed that text from the page output.
 *
 * @param {Array<object>} items - Normalised text items of the page.
 * @param {number} pageNum - Page number stamped onto the produced blocks.
 * @param {Array<object>} grids - Grids with `bbox`, `rowYs` and `colXs`.
 * @param {Array<object>} horizontals - Horizontal ruling lines.
 * @param {Array<object>} verticals - Vertical ruling lines.
 * @returns {Array<object>} Blocks after mergeAdjacentTableBlocks.
 */
function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
  const blocks = [];
  const usedItems = /* @__PURE__ */ new Set();
  // Plain copy of the fields handed to the cell-mapping / cluster helpers.
  const toTextItem = (i) => ({
    text: i.text,
    x: i.x,
    y: i.y,
    w: i.w,
    h: i.h,
    fontSize: i.fontSize,
    fontName: i.fontName
  });
  // Larger bbox.y2 first.
  const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
  for (const grid of sortedGrids) {
    const numRows = grid.rowYs.length - 1;
    const numCols = grid.colXs.length - 1;
    // Single-row grids with several columns are not treated as tables.
    if (numRows === 1 && numCols >= 2) continue;
    // Resolve cells first: a grid without cells must not consume any text.
    const cells = extractCells(grid, horizontals, verticals);
    if (cells.length === 0) continue;
    const tableItems = [];
    const pad = 3;
    const gridW = grid.bbox.x2 - grid.bbox.x1;
    for (const item of items) {
      if (usedItems.has(item)) continue;
      if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
      if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
      // In narrow grids (< 120 wide) skip items whose right edge reaches the right border.
      if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
      tableItems.push(item);
      usedItems.add(item);
    }
    const cellTextMap = mapTextToCells(tableItems.map(toTextItem), cells);
    const irGrid = Array.from(
      { length: numRows },
      () => Array.from({ length: numCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
    );
    for (const cell of cells) {
      const cellItems = cellTextMap.get(cell) || [];
      let text = cellTextToString(cellItems);
      // Drop lines that are only a number wrapped in dashes (presumably page-number marks).
      text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
      text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
      irGrid[cell.row][cell.col] = {
        text,
        colSpan: cell.colSpan,
        rowSpan: cell.rowSpan
      };
    }
    const irTable = {
      rows: numRows,
      cols: numCols,
      cells: irGrid,
      hasHeader: numRows > 1
    };
    // NOTE: items claimed by a grid whose cells all end up empty stay consumed (unchanged behavior).
    const hasContent = irGrid.some((row) => row.some((cell) => cell.text.trim() !== ""));
    if (!hasContent) continue;
    const tableBbox = {
      page: pageNum,
      x: grid.bbox.x1,
      y: grid.bbox.y1,
      width: grid.bbox.x2 - grid.bbox.x1,
      height: grid.bbox.y2 - grid.bbox.y1
    };
    if (shouldDemoteTable(irTable)) {
      const demoted = demoteTableToText(irTable);
      if (demoted) {
        const text = numRows === 1 ? "\n" + demoted + "\n" : demoted;
        blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
      }
      continue;
    }
    blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
  }

  let remaining = items.filter((i) => !usedItems.has(i));
  if (remaining.length === 0) return mergeAdjacentTableBlocks(blocks);

  remaining.sort((a, b) => b.y - a.y || a.x - b.x);
  const clusterItems = remaining.map(toTextItem);
  const clusterResults = detectClusterTables(clusterItems, pageNum);
  if (clusterResults.length > 0) {
    // Map the copies back to their index so the originals can be filtered out.
    const ciToIdx = /* @__PURE__ */ new Map();
    for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
    const usedClusterIndices = /* @__PURE__ */ new Set();
    for (const cr of clusterResults) {
      for (const ci of cr.usedItems) {
        const idx = ciToIdx.get(ci);
        if (idx !== void 0) usedClusterIndices.add(idx);
      }
      blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
    }
    remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
  }
  if (remaining.length > 0) {
    const allY = remaining.map((i) => i.y);
    const pageH = _chunkOJ4QR33Vcjs.safeMax.call(void 0, allY) - _chunkOJ4QR33Vcjs.safeMin.call(void 0, allY);
    const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
    const textBlocks = [];
    for (const group of groups) {
      if (group.length === 0) continue;
      const groupBlocks = extractPageBlocksFallback(group, pageNum);
      for (const b of groupBlocks) textBlocks.push(b);
    }
    const finalTextBlocks = detectListBlocks(textBlocks);
    for (const b of finalTextBlocks) blocks.push(b);
  }
  // Order by the bbox edge y + height, largest first; blocks without a bbox count as 0.
  blocks.sort((a, b) => {
    const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
    const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
    return by - ay;
  });
  return mergeAdjacentTableBlocks(blocks);
}
|
|
1614
|
+
/**
 * Merges consecutive table blocks that have the same number of columns.
 *
 * The merged block keeps the first block's fields, concatenates the rows of
 * both tables, and keeps the first table's `hasHeader`. When both blocks carry
 * a bbox, the merged bbox is their union; previously the first block's bbox
 * was kept as-is, so it no longer covered the rows taken from the second
 * table. Input blocks are not mutated.
 *
 * @param {Array<object>} blocks - Blocks in output order.
 * @returns {Array<object>} The blocks with adjacent compatible tables merged
 *   (the same array when it has at most one block).
 */
function mergeAdjacentTableBlocks(blocks) {
  if (blocks.length <= 1) return blocks;

  // Smallest box covering both inputs; other fields (e.g. page) come from `a`.
  const unionBBox = (a, b) => {
    const left = Math.min(a.x, b.x);
    const bottom = Math.min(a.y, b.y);
    const right = Math.max(a.x + a.width, b.x + b.width);
    const top = Math.max(a.y + a.height, b.y + b.height);
    return { ...a, x: left, y: bottom, width: right - left, height: top - bottom };
  };

  const result = [blocks[0]];
  for (let i = 1; i < blocks.length; i++) {
    const prev = result[result.length - 1];
    const curr = blocks[i];
    const mergeable =
      prev.type === "table" &&
      curr.type === "table" &&
      prev.table &&
      curr.table &&
      prev.table.cols === curr.table.cols;
    if (!mergeable) {
      result.push(curr);
      continue;
    }
    const table = {
      rows: prev.table.rows + curr.table.rows,
      cols: prev.table.cols,
      cells: [...prev.table.cells, ...curr.table.cells],
      hasHeader: prev.table.hasHeader
    };
    const mergedBlock = { ...prev, table };
    if (prev.bbox && curr.bbox) mergedBlock.bbox = unionBBox(prev.bbox, curr.bbox);
    result[result.length - 1] = mergedBlock;
  }
  return result;
}
|
|
1634
|
+
/**
 * Extracts blocks from text items without using ruling lines.
 *
 * If detectClusterTables finds tables, they become table blocks and the
 * leftover items become one paragraph per y-line, ordered by the bbox edge
 * `y + height`, largest first. Otherwise, if the whole item set shows three
 * or more columns it is emitted as a single paragraph built by
 * extractWithColumns; failing that, the items are XY-cut into groups and each
 * group is handled the same way (column paragraph or per-line paragraphs).
 * The result is passed through detectSpecialKoreanTables.
 *
 * @param {Array<object>} items - Normalised text items.
 * @param {number} pageNum - Page number stamped onto the produced blocks.
 * @returns {Array<object>} Extracted blocks ([] when there are no items).
 */
function extractPageBlocksFallback(items, pageNum) {
  if (items.length === 0) return [];
  const blocks = [];
  const upperEdge = (block) => (block.bbox ? block.bbox.y + block.bbox.height : 0);

  // One paragraph per y-line; lines that merge to blank text are skipped.
  const pushLineParagraphs = (lines) => {
    for (const line of lines) {
      const text = mergeLineSimple(line);
      if (!text.trim()) continue;
      const bbox = computeBBox(line, pageNum);
      blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
    }
  };
  // A single paragraph holding the column-aware rendering of `source`.
  const pushColumnParagraph = (lines, cols, source) => {
    const tableText = extractWithColumns(lines, cols);
    const bbox = computeBBox(source, pageNum);
    blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(source) });
  };

  const clusterItems = items.map((i) => ({
    text: i.text,
    x: i.x,
    y: i.y,
    w: i.w,
    h: i.h,
    fontSize: i.fontSize,
    fontName: i.fontName
  }));
  const clusterResults = detectClusterTables(clusterItems, pageNum);

  if (clusterResults.length > 0) {
    // Map the copies back to their index so the originals can be filtered out.
    const indexOfCopy = /* @__PURE__ */ new Map(clusterItems.map((copy, idx) => [copy, idx]));
    const usedIndices = /* @__PURE__ */ new Set();
    for (const cr of clusterResults) {
      for (const copy of cr.usedItems) {
        const idx = indexOfCopy.get(copy);
        if (idx !== void 0) usedIndices.add(idx);
      }
      blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
    }
    const remaining = items.filter((_, idx) => !usedIndices.has(idx));
    if (remaining.length > 0) pushLineParagraphs(groupByY(remaining));
    blocks.sort((a, b) => upperEdge(b) - upperEdge(a));
    return detectSpecialKoreanTables(blocks);
  }

  const allYLines = groupByY(items);
  const columns = detectColumns(allYLines);
  if (columns && columns.length >= 3) {
    pushColumnParagraph(allYLines, columns, items);
    return detectSpecialKoreanTables(blocks);
  }

  const { safeMax, safeMin } = _chunkOJ4QR33Vcjs;
  const allY = items.map((i) => i.y);
  const pageHeight = safeMax(allY) - safeMin(allY);
  const gapThreshold = Math.max(15, pageHeight * 0.03);
  for (const group of xyCutOrder(items, gapThreshold)) {
    if (group.length === 0) continue;
    const yLines = groupByY(group);
    const groupColumns = detectColumns(yLines);
    if (groupColumns && groupColumns.length >= 3) {
      pushColumnParagraph(yLines, groupColumns, group);
    } else {
      pushLineParagraphs(yLines);
    }
  }
  return detectSpecialKoreanTables(blocks);
}
|
|
1706
|
+
/**
 * Computes the bounding box of a set of text items.
 *
 * An item's vertical extent is `h` when positive and its `fontSize` otherwise.
 *
 * @param {Array<{x: number, y: number, w: number, h: number, fontSize: number}>} items
 * @param {number} pageNum - Stored as `page` in the result.
 * @returns {{page: number, x: number, y: number, width: number, height: number}}
 */
function computeBBox(items, pageNum) {
  let left = Infinity;
  let bottom = Infinity;
  let right = -Infinity;
  let top = -Infinity;
  for (const it of items) {
    const itemRight = it.x + it.w;
    const itemTop = it.y + (it.h > 0 ? it.h : it.fontSize);
    if (it.x < left) left = it.x;
    if (it.y < bottom) bottom = it.y;
    if (itemRight > right) right = itemRight;
    if (itemTop > top) top = itemTop;
  }
  return { page: pageNum, x: left, y: bottom, width: right - left, height: top - bottom };
}
|
|
1717
|
+
/**
 * Returns the most frequent positive font size among the items, together with
 * the font name of the first item using that size.
 *
 * Ties go to the size that reaches the top count first. Items with a font
 * size of zero or less are ignored.
 *
 * @param {Array<{fontSize: number, fontName?: string}>} items
 * @returns {{fontSize: number, fontName: (string|undefined)}|undefined}
 *   Undefined when there are no items or none has a usable size; `fontName`
 *   is undefined when the matching item's name is empty.
 */
function dominantStyle(items) {
  if (items.length === 0) return void 0;
  const tally = /* @__PURE__ */ new Map();
  let topCount = 0;
  let topSize = 0;
  for (const { fontSize } of items) {
    if (fontSize <= 0) continue;
    const seen = (tally.get(fontSize) || 0) + 1;
    tally.set(fontSize, seen);
    if (seen > topCount) {
      topCount = seen;
      topSize = fontSize;
    }
  }
  if (topSize === 0) return void 0;
  const owner = items.find((it) => it.fontSize === topSize);
  const fontName = (owner == null ? void 0 : owner.fontName) || void 0;
  return { fontSize: topSize, fontName };
}
|
|
1734
|
+
/**
 * Converts raw text-content items into normalised, de-duplicated items sorted
 * by descending y, then ascending x.
 *
 * - Items whose `str` is not a string are ignored.
 * - Whitespace-only items are not emitted; their rounded position is kept and
 *   later used to flag a following item with `hasSpaceBefore`.
 * - Position comes from transform[4]/[5]; font size is the larger absolute
 *   value of transform[0] and transform[3]; all are rounded.
 * - Text made only of digits, whitespace and `-().·,☎` that contains a digit
 *   and a space has its spaces removed.
 * - Evenly spaced single-character text may be split into per-character items
 *   by splitEvenSpacedItem.
 * - An item is a duplicate when a kept item with the same text lies within 3
 *   units in both x and y.
 *
 * @param {Array<object>} rawItems - Items with `str`, `transform`, `width`,
 *   `height` and optional `fontName` (presumably PDF.js text items — confirm).
 * @returns {Array<object>} Normalised items.
 */
function normalizeItems(rawItems) {
  const items = [];
  const spacePositions = [];
  const numericWithSeparators = /^[\d\s\-().·,☎]+$/;

  for (const raw of rawItems) {
    if (typeof raw.str !== "string") continue;
    const x = Math.round(raw.transform[4]);
    const y = Math.round(raw.transform[5]);
    let text = raw.str.trim();
    if (!text) {
      spacePositions.push({ x, y });
      continue;
    }
    const fontSize = Math.round(Math.max(Math.abs(raw.transform[3]), Math.abs(raw.transform[0])));
    const w = Math.round(raw.width);
    const h = Math.round(raw.height);
    // The text is known to be non-blank here, so zero size or zero width is enough.
    const isHidden = fontSize === 0 || raw.width === 0;
    if (numericWithSeparators.test(text) && /\d/.test(text) && / /.test(text)) {
      text = text.replace(/ /g, "");
    }
    const fontName = raw.fontName || "";
    const pieces = splitEvenSpacedItem(text, x, w, fontSize);
    if (pieces) {
      for (const piece of pieces) {
        items.push({ text: piece.text, x: piece.x, y, w: piece.w, h, fontSize, fontName, isHidden });
      }
    } else {
      items.push({ text, x, y, w, h, fontSize, fontName, isHidden });
    }
  }

  items.sort((a, b) => b.y - a.y || a.x - b.x);

  const deduped = [];
  for (const candidate of items) {
    let duplicate = false;
    // Kept items are in descending y, so the scan can stop once they are too far above.
    for (let j = deduped.length - 1; j >= 0; j--) {
      const kept = deduped[j];
      const dy = kept.y - candidate.y;
      if (dy > 3) break;
      if (Math.abs(dy) <= 3 && kept.text === candidate.text && Math.abs(kept.x - candidate.x) <= 3) {
        duplicate = true;
        break;
      }
    }
    if (!duplicate) deduped.push(candidate);
  }

  for (const item of deduped) {
    // A whitespace item on the same line, 0-20 units left of this item's x.
    const precededBySpace = spacePositions.some((sp) => {
      if (!(Math.abs(sp.y - item.y) <= 3)) return false;
      const dist = item.x - sp.x;
      return dist >= 0 && dist <= 20;
    });
    if (precededBySpace) item.hasSpaceBefore = true;
  }
  return deduped;
}
|
|
1793
|
+
/**
 * Splits an evenly letter-spaced item ("가 나 다") into one item per character.
 *
 * Applies only when the text is three or more single Hangul syllables/digits
 * separated by single spaces and the resulting per-character pitch is at most
 * twice the font size.
 *
 * @param {string} text - Item text.
 * @param {number} itemX - Left x of the item.
 * @param {number} itemW - Width of the item.
 * @param {number} fontSize - Font size of the item.
 * @returns {Array<{text: string, x: number, w: number}>|null} Per-character
 *   pieces, or null when the item should not be split.
 */
function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
  const evenlySpaced = /^[가-힣\d](?: [가-힣\d]){2,}$/;
  if (!evenlySpaced.test(text)) return null;
  const glyphs = text.split(" ");
  if (glyphs.length < 3) return null;
  const pitch = itemW / glyphs.length;
  if (pitch > fontSize * 2) return null;
  const pieces = [];
  for (let k = 0; k < glyphs.length; k += 1) {
    pieces.push({
      text: glyphs[k],
      x: Math.round(itemX + k * pitch),
      // The actual glyph width is narrower than the spacing pitch.
      w: Math.round(pitch * 0.8)
    });
  }
  return pieces;
}
|
|
1806
|
+
/**
 * Groups consecutive items into lines by their y coordinate.
 *
 * Items are processed in the given order. A new line starts whenever an
 * item's y differs by more than 3 from the y of the item that opened the
 * current line.
 *
 * @param {Array<{y: number}>} items - Items, expected to arrive line by line.
 * @returns {Array<Array<object>>} Lines in input order.
 */
function groupByY(items) {
  if (items.length === 0) return [];
  const rows = [[items[0]]];
  let anchorY = items[0].y;
  for (let k = 1; k < items.length; k += 1) {
    const it = items[k];
    if (Math.abs(it.y - anchorY) > 3) {
      rows.push([]);
      anchorY = it.y;
    }
    rows[rows.length - 1].push(it);
  }
  return rows;
}
|
|
1822
|
+
/**
 * Heuristic: does this line look like ordinary prose broken into many short
 * fragments, rather than a row of table cells?
 *
 * True when the line has at least four items, the largest horizontal gap
 * between x-adjacent items is below 40, and the average text length is below
 * five characters.
 *
 * @param {Array<{x: number, w: number, text: string}>} items - Items of one line.
 * @returns {boolean}
 */
function isProseSpread(items) {
  if (items.length < 4) return false;
  const byX = [...items].sort((p, q) => p.x - q.x);
  const gaps = byX.slice(1).map((it, k) => it.x - (byX[k].x + byX[k].w));
  const widestGap = _chunkOJ4QR33Vcjs.safeMax.call(void 0, gaps);
  let totalChars = 0;
  for (const it of items) totalChars += it.text.length;
  return widestGap < 40 && totalChars / items.length < 5;
}
|
|
1833
|
+
/**
 * Detects column start positions from the left edges of the items.
 *
 * Steps: give up when the items span less than 100 units horizontally; ignore
 * everything from the first short line (at most two items) containing the
 * literal "비고" onward; cluster item x positions (tolerance 22), skipping
 * lines judged to be prose; keep clusters with at least three members; merge
 * clusters whose left edges are closer than 40; finally drop columns that
 * start less than 30 units after the previously kept one.
 *
 * @param {Array<Array<{x: number, w: number, text: string}>>} yLines - Items grouped into lines.
 * @returns {Array<number>|null} Ascending column left edges (three or more),
 *   or null when no column structure is found.
 */
function detectColumns(yLines) {
  const everyItem = yLines.flat();
  if (everyItem.length === 0) return null;
  const rightEdges = everyItem.map((i) => i.x + i.w);
  const leftEdges = everyItem.map((i) => i.x);
  const span = _chunkOJ4QR33Vcjs.safeMax.call(void 0, rightEdges) - _chunkOJ4QR33Vcjs.safeMin.call(void 0, leftEdges);
  if (span < 100) return null;

  const remarksIdx = yLines.findIndex(
    (line) => line.length <= 2 && line.some((item) => item.text === "\uBE44\uACE0")
  );
  const candidateLines = remarksIdx >= 0 ? yLines.slice(0, remarksIdx) : yLines;

  const CLUSTER_TOL = 22;
  const clusters = [];
  for (const line of candidateLines) {
    if (isProseSpread(line)) continue;
    for (const { x } of line) {
      const home = clusters.find((c) => Math.abs(x - c.center) <= CLUSTER_TOL);
      if (home === void 0) {
        clusters.push({ center: x, count: 1, minX: x });
        continue;
      }
      // Running (rounded) mean of the member positions.
      home.center = Math.round((home.center * home.count + x) / (home.count + 1));
      home.minX = Math.min(home.minX, x);
      home.count += 1;
    }
  }

  const peaks = clusters.filter((c) => c.count >= 3).sort((p, q) => p.minX - q.minX);
  if (peaks.length < 3) return null;

  const MERGE_TOL = 40;
  const merged = [peaks[0]];
  for (const peak of peaks.slice(1)) {
    const last = merged[merged.length - 1];
    if (!(peak.minX - last.minX < MERGE_TOL)) {
      merged.push({ ...peak });
      continue;
    }
    // The more populated cluster decides the centre of the merged peak.
    if (peak.count > last.count) last.center = peak.center;
    last.count += peak.count;
    last.minX = Math.min(last.minX, peak.minX);
  }

  const rawColumns = merged.filter((c) => c.count >= 3).map((c) => c.minX);
  if (rawColumns.length < 3) return null;

  const MIN_DETECT_COL_WIDTH = 30;
  const columns = [rawColumns[0]];
  for (const candidate of rawColumns.slice(1)) {
    if (candidate - columns[columns.length - 1] < MIN_DETECT_COL_WIDTH) continue;
    columns.push(candidate);
  }
  return columns.length >= 3 ? columns : null;
}
|
|
1892
|
+
/**
 * Returns the index of the right-most column whose start, minus a 10-unit
 * tolerance, is at or left of `x`.
 *
 * @param {number} x - Horizontal position to classify.
 * @param {Array<number>} columns - Ascending column start positions.
 * @returns {number} Column index; 0 when `x` lies left of every column.
 */
function findColumn(x, columns) {
  let idx = columns.length - 1;
  while (idx >= 0 && !(x >= columns[idx] - 10)) idx -= 1;
  return idx >= 0 ? idx : 0;
}
|
|
1898
|
+
/**
 * Renders lines as text, turning the part that follows the detected columns
 * into grid tables.
 *
 * Layout of the output:
 * 1. lines before the first line that touches three or more columns are
 *    written as plain merged lines;
 * 2. from there up to the "비고" line (or the end), consecutive lines that
 *    lie within the column range and are not prose are batched and passed to
 *    buildGridTable, while other lines are written as plain merged lines;
 * 3. if a short line (at most two items) containing the literal "비고"
 *    exists, an empty line and then that line and everything after it follow
 *    as plain merged lines.
 *
 * @param {Array<Array<object>>} yLines - Items grouped into lines.
 * @param {Array<number>} columns - Ascending column start positions.
 * @returns {string} The rendered text, parts joined by newlines.
 */
function extractWithColumns(yLines, columns) {
  const out = [];
  const firstCol = columns[0];
  const lastCol = columns[columns.length - 1];

  const remarksIdx = yLines.findIndex(
    (line) => line.length <= 2 && line.some((item) => item.text === "\uBE44\uACE0")
  );
  const tableEnd = remarksIdx >= 0 ? remarksIdx : yLines.length;

  let tableStart = -1;
  for (let i = 0; i < tableEnd; i += 1) {
    const touchedCols = new Set(yLines[i].map((item) => findColumn(item.x, columns)));
    if (touchedCols.size >= 3) {
      tableStart = i;
      break;
    }
  }

  const preambleEnd = tableStart >= 0 ? tableStart : tableEnd;
  for (let i = 0; i < preambleEnd; i += 1) {
    out.push(mergeLineSimple(yLines[i]));
  }

  if (tableStart >= 0) {
    let pending = [];
    for (const line of yLines.slice(tableStart, tableEnd)) {
      const inRange = line.some((item) => item.x >= firstCol - 20 && item.x <= lastCol + 200);
      if (inRange && !isProseSpread(line)) {
        pending.push(line);
        continue;
      }
      // A non-table line ends the current batch of grid lines.
      if (pending.length > 0) {
        out.push(buildGridTable(pending, columns));
        pending = [];
      }
      out.push(mergeLineSimple(line));
    }
    if (pending.length > 0) {
      out.push(buildGridTable(pending, columns));
    }
  }

  if (remarksIdx >= 0) {
    out.push("");
    for (let i = remarksIdx; i < yLines.length; i += 1) {
      out.push(mergeLineSimple(yLines[i]));
    }
  }
  return out.join("\n");
}
|
|
1949
|
+
/**
 * Builds a markdown table (or plain text) from lines and column positions.
 *
 * Each line becomes a row by assigning items to columns via findColumn. A row
 * is merged into the previous one as continuation text unless it looks like a
 * new record: its first cell has three or more characters, its second cell is
 * filled, or both it and the previous row have content in the data columns
 * (from index max(2, floor(cols / 2)) onward). Leading rows without any digit
 * in their data columns are folded into a single header row. Cell text is run
 * through collapseEvenSpacing.
 *
 * Plain text (non-empty cells joined by spaces, one row per line) is returned
 * instead of a table when fewer than two rows remain, fewer than 35% of the
 * cells are filled, or the table has at most three rows but seven or more
 * columns.
 *
 * @param {Array<Array<{x: number, text: string}>>} lines - Items grouped into lines.
 * @param {Array<number>} columns - Ascending column start positions.
 * @returns {string} Markdown table or plain text.
 */
function buildGridTable(lines, columns) {
  const numCols = columns.length;
  const asPlainText = (rows) => rows.map((r) => r.filter((c) => c).join(" ")).join("\n");
  const appendText = (existing, addition) => (existing ? existing + " " + addition : addition);

  const yRows = lines.map((lineItems) => {
    const row = Array(numCols).fill("");
    for (const item of lineItems) {
      const col = findColumn(item.x, columns);
      row[col] = appendText(row[col], item.text);
    }
    return row;
  });

  const dataColStart = Math.max(2, Math.floor(numCols / 2));
  const hasDataCell = (row) => row.slice(dataColStart).some((c) => c !== "");

  const merged = [];
  for (const row of yRows) {
    if (row.every((c) => c === "")) continue;
    if (merged.length === 0) {
      merged.push([...row]);
      continue;
    }
    const prev = merged[merged.length - 1];
    const filledCount = row.filter((c) => c).length;
    const lead = row[0];

    let isNewRow = Boolean(lead && lead.length >= 3);
    if (!isNewRow && numCols > 1 && row[1]) isNewRow = true;
    if (!isNewRow && hasDataCell(row) && hasDataCell(prev)) isNewRow = true;
    // A lone short first cell is treated as a continuation of the previous row.
    if (isNewRow && filledCount === 1 && lead && lead.length <= 2) isNewRow = false;

    if (isNewRow) {
      merged.push([...row]);
      continue;
    }
    row.forEach((cell, c) => {
      if (cell) prev[c] = appendText(prev[c], cell);
    });
  }

  if (merged.length < 2) return asPlainText(merged);

  // Header rows are the leading rows without a digit in their data columns.
  const firstDataRow = merged.findIndex((row) => row.slice(dataColStart).some((c) => c && /\d/.test(c)));
  const headerEnd = firstDataRow === -1 ? merged.length : firstDataRow;
  if (headerEnd > 1) {
    const headerRow = Array(numCols).fill("");
    for (const row of merged.slice(0, headerEnd)) {
      row.forEach((cell, c) => {
        if (cell) headerRow[c] = appendText(headerRow[c], cell);
      });
    }
    merged.splice(0, headerEnd, headerRow);
  }

  for (const row of merged) {
    for (let c = 0; c < row.length; c += 1) {
      if (row[c]) row[c] = collapseEvenSpacing(row[c]);
    }
  }

  const totalCells = merged.length * numCols;
  const filledCells = merged.reduce((sum, row) => sum + row.filter((c) => c).length, 0);
  const tooSparse = filledCells < totalCells * 0.35;
  const tooWideAndShort = merged.length <= 3 && numCols >= 7;
  if (tooSparse || merged.length < 2 || tooWideAndShort) return asPlainText(merged);

  const toMarkdownRow = (cells) => "| " + cells.join(" | ") + " |";
  return [
    toMarkdownRow(merged[0]),
    toMarkdownRow(merged[0].map(() => "---")),
    ...merged.slice(1).map(toMarkdownRow)
  ].join("\n");
}
|
|
2035
|
+
/**
 * Joins the items of one line into a string, deciding per gap whether a space
 * belongs between neighbouring items.
 *
 * Items are ordered by x. For each neighbouring pair, with `gap` the distance
 * from the left item's right edge and `avgFs` the mean of both font sizes,
 * the first matching rule applies:
 * 1. gap > max(2 * avgFs, 30): space;
 * 2. the item is flagged by detectEvenSpacedItems: no space;
 * 3. the item has `hasSpaceBefore` and gap >= 0.05 * avgFs: space;
 * 4. the left item ends with a bullet-like marker, the right one starts with
 *    Hangul and gap > 1: space;
 * 5. gap < 0.15 * avgFs: no space;
 * 6. gap < 0.35 * avgFs next to Hangul on either side: no space;
 * 7. otherwise a space only when gap > 3.
 *
 * @param {Array<object>} items - Items of a single line.
 * @returns {string} The merged text ("" for an empty line).
 */
function mergeLineSimple(items) {
  if (items.length <= 1) {
    const only = items[0];
    return (only == null ? void 0 : only.text) || "";
  }
  const ordered = [...items].sort((p, q) => p.x - q.x);
  const evenSpaced = detectEvenSpacedItems(ordered);
  let merged = ordered[0].text;
  for (let k = 1; k < ordered.length; k += 1) {
    const left = ordered[k - 1];
    const right = ordered[k];
    const gap = right.x - (left.x + left.w);
    const avgFs = (right.fontSize + left.fontSize) / 2;
    const tabThreshold = Math.max(avgFs * 2, 30);

    const wantsSpace = () => {
      if (gap > tabThreshold) return true;
      if (evenSpaced[k]) return false;
      if (right.hasSpaceBefore && gap >= avgFs * 0.05) return true;
      if (/[□■○●▶◆◇ㅇ]$/.test(left.text) && /^[가-힣]/.test(right.text) && gap > 1) return true;
      if (gap < avgFs * 0.15) return false;
      if (gap < avgFs * 0.35 && (/[가-힣]$/.test(merged) || /^[가-힣]/.test(right.text))) return false;
      return gap > 3;
    };

    merged += wantsSpace() ? " " + right.text : right.text;
  }
  return merged;
}
|
|
2070
|
+
/**
 * Post-processes extracted text.
 *
 * In order: removes lines that look like page numbers (a leading or trailing
 * bare 1-4 digit line, dash-wrapped numbers, "n / m" lines, bare 1-4 digit
 * lines in the middle, headings consisting only of a number); passes the
 * result through mergeKoreanLines; collapses letter-spacing on every line
 * that does not start with "| ---"; joins a bullet-like marker followed by
 * two space-separated Hangul syllables; limits consecutive newlines to two;
 * trims.
 *
 * @param {string} text - Raw extracted text.
 * @returns {string} Cleaned text.
 */
function cleanPdfText(text) {
  const withoutPageNumbers = text
    .replace(/^\d{1,4}\n/, "")
    .replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "")
    .replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "")
    .replace(/\n\d{1,4}\n/g, "\n")
    .replace(/\n\d{1,4}$/, "")
    .replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "");
  const rejoined = mergeKoreanLines(withoutPageNumbers);
  // Markdown separator rows ("| --- ...") are left untouched.
  const despaced = rejoined.replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line));
  const bulletsJoined = despaced.replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3");
  return bulletsJoined.replace(/\n{3,}/g, "\n\n").trim();
}
|
|
2075
|
+
/**
 * Tells whether a line (ignoring leading whitespace) begins with a list or
 * clause marker: a Hangul syllable/consonant or a number followed by "." or
 * ")", a parenthesised Hangul/number label, a bullet symbol followed by
 * whitespace, or a "제<number>" clause label ending in 조/항/호/장/절.
 *
 * @param {string} line
 * @returns {boolean}
 */
function startsWithMarker(line) {
  const head = line.trimStart();
  const markerPatterns = [
    /^[가-힣ㄱ-ㅎ][.)]/,
    /^\d+[.)]/,
    /^\([가-힣ㄱ-ㅎ\d]+\)/,
    /^[○●※▶▷◆◇■□★☆\-·]\s/,
    /^제\d+[조항호장절]/
  ];
  return markerPatterns.some((pattern) => pattern.test(head));
}
|
|
2079
|
+
/**
 * Tells whether a trimmed line consists solely of a "제N조/항/호/장/절" label,
 * an optional parenthesised part directly after it, and at most seven further
 * whitespace-separated tokens.
 *
 * @param {string} line - The line to inspect.
 * @returns {boolean} True when the whole trimmed line matches that shape.
 */
function isStandaloneHeader(line) {
  const headerPattern = /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,7}$/;
  const candidate = line.trim();
  return headerPattern.test(candidate);
}
|
|
2082
|
+
/**
 * Re-tags paragraph blocks that look like list items.
 *
 * A paragraph whose trimmed text starts with "<digits>. " becomes an ordered
 * list block; one that starts with a bullet glyph followed by whitespace
 * becomes an unordered list block. Every other block is passed through as the
 * same object reference. The input array is not modified.
 *
 * @param {Array<object>} blocks - Blocks to scan.
 * @returns {Array<object>} A new array of the same length.
 */
function detectListBlocks(blocks) {
  // Returns "ordered" / "unordered" for list-like paragraphs, otherwise null.
  const classify = (block) => {
    if (block.type !== "paragraph" || !block.text) return null;
    const trimmed = block.text.trim();
    if (/^\d+\.\s/.test(trimmed)) return "ordered";
    if (/^[○●·※▶▷◆◇\-]\s/.test(trimmed)) return "unordered";
    return null;
  };

  const output = [];
  for (const block of blocks) {
    const listType = classify(block);
    // The untrimmed text is kept on the converted block.
    output.push(listType ? { ...block, type: "list", listType, text: block.text } : block);
  }
  return output;
}
|
|
2101
|
+
// Matches text that begins with one of a fixed set of Korean label words,
// optionally wrapped in parentheses, immediately followed by a colon or a
// whitespace character.
var KOREAN_TABLE_HEADER_RE = /^\(?(구분|항목|종류|분류|유형|대상|내용|기간|금액|비율|방법|절차|요건|조건|근거|목적|범위|기준)\)?[:\s]/;
|
|
2102
|
+
// Matches a colon between digits (such as "10:30" or "3:1") or a "://"
// sequence anywhere in the text.
// NOTE(review): the first alternative is already covered by the last one
// (`\d+:\d+`), so it looks redundant.
var KV_FALSE_POSITIVE_RE = /\d{1,2}:\d{2}|:\/\/|\d+:\d+/;
|
|
2103
|
+
/**
 * Folds runs of consecutive "key: value" style paragraphs into two-column
 * table blocks.
 *
 * A paragraph joins the current run when its trimmed text matches
 * KOREAN_TABLE_HEADER_RE, or when a run is already open and the text is an
 * acceptable "key: value" continuation line. Any other block closes the run.
 * A closed run of two or more entries is emitted as a single table block that
 * takes its pageNumber and bbox from the run's first block; a shorter run is
 * emitted as its original blocks.
 *
 * @param {Array<object>} blocks - Blocks to scan.
 * @returns {Array<object>} A new array with qualifying runs replaced by tables.
 */
function detectSpecialKoreanTables(blocks) {
  const output = [];
  let pending = [];

  // Builds one table row. An entry with an empty value gets a key cell with
  // colSpan 2 followed by an empty cell.
  const toRow = ({ key, value }) => {
    if (value) {
      return [
        { text: key, colSpan: 1, rowSpan: 1 },
        { text: value, colSpan: 1, rowSpan: 1 }
      ];
    }
    return [
      { text: key, colSpan: 2, rowSpan: 1 },
      { text: "", colSpan: 1, rowSpan: 1 }
    ];
  };

  // Emits the open run (as a table or as its original blocks) and resets it.
  const flushPending = () => {
    const run = pending;
    pending = [];
    if (run.length < 2) {
      run.forEach((entry) => output.push(entry.block));
      return;
    }
    const rows = run.map(toRow);
    const anchor = run[0].block;
    output.push({
      type: "table",
      table: { rows: rows.length, cols: 2, cells: rows, hasHeader: true },
      pageNumber: anchor.pageNumber,
      bbox: anchor.bbox
    });
  };

  // Splits text around the single character at index `at`.
  const splitEntry = (text, at, block) => ({
    key: text.slice(0, at).trim(),
    value: text.slice(at + 1).trim(),
    block
  });

  // Header-style line: split at the first colon, else at the first whitespace.
  const parseHeaderLine = (text, block) => {
    const colonAt = text.indexOf(":");
    if (colonAt >= 0) return splitEntry(text, colonAt, block);
    const spaceAt = text.search(/\s/);
    if (spaceAt > 0) return splitEntry(text, spaceAt, block);
    return { key: text, value: "", block };
  };

  // Continuation line: needs a colon, no false-positive pattern, no
  // parentheses, and a key made of 2-8 Hangul syllables. Returns null otherwise.
  const parseContinuationLine = (text, block) => {
    if (!text.includes(":")) return null;
    if (KV_FALSE_POSITIVE_RE.test(text) || text.includes("(") || text.includes(")")) return null;
    const colonAt = text.indexOf(":");
    const key = text.slice(0, colonAt).trim();
    if (!/^[가-힣]+$/.test(key) || key.length < 2 || key.length > 8) return null;
    return { key, value: text.slice(colonAt + 1).trim(), block };
  };

  for (const block of blocks) {
    if (block.type === "paragraph" && block.text) {
      const text = block.text.trim();
      let entry = null;
      if (KOREAN_TABLE_HEADER_RE.test(text)) {
        entry = parseHeaderLine(text, block);
      } else if (pending.length > 0) {
        entry = parseContinuationLine(text, block);
      }
      if (entry) {
        pending.push(entry);
        continue;
      }
    }
    // Anything that did not join the run terminates it.
    flushPending();
    output.push(block);
  }
  flushPending();
  return output;
}
|
|
2188
|
+
/**
 * Finds blocks that look like repeated page headers/footers.
 *
 * A block is a candidate when it has a bbox, a pageNumber, non-blank text and
 * a known page height, and lies inside one of the two edge zones (each
 * ZONE_RATIO of the page height). Candidate text is normalised by replacing
 * every digit run with "#", so "Page 1" and "Page 2" share one pattern.
 * A pattern counts as repeated when it occurs on at least MIN_REPEAT
 * *distinct pages* within the same zone. Counting raw occurrences instead
 * would flag, for example, three numeric cells sitting in the edge zone of a
 * single page.
 *
 * Nothing is removed here; the caller receives the indices to drop.
 *
 * @param {Array<object>} blocks - Blocks to inspect (not modified).
 * @param {Map<number, number>} pageHeights - Page height keyed by page number.
 * @param {Array<object>} warnings - Receives one warning entry when at least
 *   one block is flagged.
 * @returns {number[]} Ascending indices into `blocks` of the flagged blocks.
 */
function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
  const ZONE_RATIO = 0.1;
  const MIN_REPEAT = 3;

  // Returns { zone, pattern } for a block inside an edge zone, otherwise null.
  // NOTE(review): the zone labels follow the original variable names; which
  // physical page edge each one is depends on the bbox coordinate convention,
  // which is not visible in this function.
  const classify = (b) => {
    if (!b.bbox || !b.pageNumber || b.text == null) return null;
    const trimmed = b.text.trim();
    if (!trimmed) return null;
    const ph = pageHeights.get(b.bbox.page) || pageHeights.get(b.pageNumber);
    if (!ph) return null;
    const blockTop = ph - (b.bbox.y + b.bbox.height);
    const blockBottom = ph - b.bbox.y;
    let zone = null;
    if (blockBottom <= ph * ZONE_RATIO) {
      zone = "footer";
    } else if (blockTop >= ph * (1 - ZONE_RATIO)) {
      zone = "header";
    }
    if (zone === null) return null;
    return { zone, pattern: trimmed.replace(/\d+/g, "#") };
  };

  const candidates = blocks.map(classify);

  // zone -> pattern -> set of page numbers on which the pattern occurs.
  const pagesByZone = { header: new Map(), footer: new Map() };
  candidates.forEach((candidate, bi) => {
    if (!candidate) return;
    const pagesByPattern = pagesByZone[candidate.zone];
    let pages = pagesByPattern.get(candidate.pattern);
    if (!pages) {
      pages = new Set();
      pagesByPattern.set(candidate.pattern, pages);
    }
    pages.add(blocks[bi].pageNumber);
  });

  const repeatedPatterns = new Set();
  for (const pagesByPattern of [pagesByZone.header, pagesByZone.footer]) {
    for (const [pattern, pages] of pagesByPattern) {
      if (pages.size >= MIN_REPEAT) repeatedPatterns.add(pattern);
    }
  }
  if (repeatedPatterns.size === 0) return [];

  // A repeated pattern is flagged in either zone, regardless of where it repeated.
  const removeIndices = [];
  candidates.forEach((candidate, bi) => {
    if (candidate && repeatedPatterns.has(candidate.pattern)) removeIndices.push(bi);
  });

  if (removeIndices.length > 0) {
    warnings.push({ message: `${removeIndices.length}\uAC1C \uBA38\uB9AC\uAE00/\uBC14\uB2E5\uAE00 \uC694\uC18C \uC81C\uAC70\uB428`, code: "HIDDEN_TEXT_FILTERED" });
  }
  return removeIndices;
}
|
|
2244
|
+
/**
 * Re-joins lines that were broken mid-sentence in extracted text.
 *
 * Each line after the first is compared with the last kept entry and handled
 * by the first rule that applies:
 *  1. markdown heading on either side, or a line starting with "|" or "---":
 *     kept as a separate line;
 *  2. previous entry ends with "," and the line is not blank: appended to the
 *     previous entry with a newline, so it stays visually separate but is
 *     treated as part of that entry by later rules;
 *  3. line starts with "(※": appended to the previous entry after a space;
 *  4. previous entry ends with Hangul, "·", "," or "-", the line starts with
 *     Hangul or "(", and neither side is a marker line or a standalone header:
 *     appended after a space;
 *  5. otherwise: kept as a separate line.
 *
 * @param {string} text - Text with newline-separated lines.
 * @returns {string} The text with qualifying lines merged; "" for falsy input.
 */
function mergeKoreanLines(text) {
  if (!text) return "";
  const [firstLine, ...remaining] = text.split("\n");
  if (remaining.length === 0) return text;

  const headingRe = /^#{1,6}\s/;
  const merged = [firstLine];

  for (const line of remaining) {
    const lastIdx = merged.length - 1;
    const prev = merged[lastIdx];
    const lineTrimmed = line.trim();

    const isStructural =
      headingRe.test(prev) ||
      headingRe.test(line) ||
      /^\|/.test(lineTrimmed) ||
      /^---/.test(lineTrimmed);
    if (isStructural) {
      merged.push(line);
      continue;
    }

    if (/,$/.test(prev.trim()) && lineTrimmed.length > 0) {
      merged[lastIdx] = `${prev}\n${line}`;
      continue;
    }

    if (/^\(※/.test(lineTrimmed)) {
      merged[lastIdx] = `${prev} ${lineTrimmed}`;
      continue;
    }

    const continuesSentence =
      /[가-힣·,\-]$/.test(prev) &&
      /^[가-힣(]/.test(line) &&
      !startsWithMarker(line) &&
      !isStandaloneHeader(prev) &&
      !startsWithMarker(prev);
    if (continuesSentence) {
      merged[lastIdx] = `${prev} ${line}`;
    } else {
      merged.push(line);
    }
  }

  return merged.join("\n");
}
|
|
2273
|
+
|
|
2274
|
+
|
|
2275
|
+
|
|
2276
|
+
|
|
2277
|
+
exports.cleanPdfText = cleanPdfText; exports.extractPdfMetadataOnly = extractPdfMetadataOnly; exports.parsePdfDocument = parsePdfDocument;
|
|
2278
|
+
//# sourceMappingURL=parser-CYBX5MP4.cjs.map
|