kordoc 2.2.3 → 2.2.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +42 -3
- package/dist/chunk-JH5XLWJQ.js +457 -0
- package/dist/chunk-JH5XLWJQ.js.map +1 -0
- package/dist/chunk-MUOQXDZ4.cjs +33 -0
- package/dist/chunk-MUOQXDZ4.cjs.map +1 -0
- package/dist/chunk-OJ4QR33V.cjs +450 -0
- package/dist/chunk-OJ4QR33V.cjs.map +1 -0
- package/dist/{chunk-AIG7SDWU.js → chunk-RQWICKON.js} +964 -2732
- package/dist/chunk-RQWICKON.js.map +1 -0
- package/dist/chunk-SBVRCJFH.js +33 -0
- package/dist/chunk-SBVRCJFH.js.map +1 -0
- package/dist/chunk-UU2O6D3R.js +450 -0
- package/dist/chunk-UU2O6D3R.js.map +1 -0
- package/dist/cli.js +154 -7
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1095 -3324
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +98 -8
- package/dist/index.d.ts +98 -8
- package/dist/index.js +917 -3100
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +140 -14
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-3C7UGGEK.cjs +7 -0
- package/dist/page-range-3C7UGGEK.cjs.map +1 -0
- package/dist/page-range-H35FN3OQ.js +7 -0
- package/dist/page-range-H35FN3OQ.js.map +1 -0
- package/dist/parser-CYBX5MP4.cjs +2278 -0
- package/dist/parser-CYBX5MP4.cjs.map +1 -0
- package/dist/parser-OIRWPKIQ.js +2278 -0
- package/dist/parser-OIRWPKIQ.js.map +1 -0
- package/dist/parser-PXD73E4H.js +2279 -0
- package/dist/parser-PXD73E4H.js.map +1 -0
- package/dist/provider-WPIYEALY.js +37 -0
- package/dist/provider-WPIYEALY.js.map +1 -0
- package/dist/provider-YN2SSK4X.cjs +37 -0
- package/dist/provider-YN2SSK4X.cjs.map +1 -0
- package/dist/{watch-H672QAW2.js → watch-NSBABJ4A.js} +6 -4
- package/dist/{watch-H672QAW2.js.map → watch-NSBABJ4A.js.map} +1 -1
- package/package.json +1 -1
- package/dist/chunk-AIG7SDWU.js.map +0 -1
|
@@ -3,448 +3,32 @@ import {
|
|
|
3
3
|
detectFormat,
|
|
4
4
|
detectZipFormat
|
|
5
5
|
} from "./chunk-5Y2Q3BRW.js";
|
|
6
|
+
import {
|
|
7
|
+
HEADING_RATIO_H1,
|
|
8
|
+
HEADING_RATIO_H2,
|
|
9
|
+
HEADING_RATIO_H3,
|
|
10
|
+
KordocError,
|
|
11
|
+
MAX_COLS,
|
|
12
|
+
MAX_ROWS,
|
|
13
|
+
blocksToMarkdown,
|
|
14
|
+
buildTable,
|
|
15
|
+
classifyError,
|
|
16
|
+
convertTableToText,
|
|
17
|
+
flattenLayoutTables,
|
|
18
|
+
isPathTraversal,
|
|
19
|
+
precheckZipSize,
|
|
20
|
+
sanitizeHref,
|
|
21
|
+
stripDtd,
|
|
22
|
+
toArrayBuffer
|
|
23
|
+
} from "./chunk-JH5XLWJQ.js";
|
|
6
24
|
import {
|
|
7
25
|
parsePageRange
|
|
8
26
|
} from "./chunk-MOL7MDBG.js";
|
|
9
27
|
|
|
10
|
-
// src/utils.ts
|
|
11
|
-
var VERSION = true ? "2.2.3" : "0.0.0-dev";
|
|
12
|
-
function toArrayBuffer(buf) {
|
|
13
|
-
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
14
|
-
return buf.buffer;
|
|
15
|
-
}
|
|
16
|
-
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
17
|
-
}
|
|
18
|
-
var KordocError = class extends Error {
|
|
19
|
-
constructor(message) {
|
|
20
|
-
super(message);
|
|
21
|
-
this.name = "KordocError";
|
|
22
|
-
}
|
|
23
|
-
};
|
|
24
|
-
function sanitizeError(err) {
|
|
25
|
-
if (err instanceof KordocError) return err.message;
|
|
26
|
-
return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
|
|
27
|
-
}
|
|
28
|
-
function isPathTraversal(name) {
|
|
29
|
-
if (name.includes("\0")) return true;
|
|
30
|
-
const normalized = name.replace(/\\/g, "/");
|
|
31
|
-
const segments = normalized.split("/");
|
|
32
|
-
return segments.some((s) => s === "..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
33
|
-
}
|
|
34
|
-
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
35
|
-
try {
|
|
36
|
-
const data = new DataView(buffer);
|
|
37
|
-
const len = buffer.byteLength;
|
|
38
|
-
let eocdOffset = -1;
|
|
39
|
-
for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
|
|
40
|
-
if (data.getUint32(i, true) === 101010256) {
|
|
41
|
-
eocdOffset = i;
|
|
42
|
-
break;
|
|
43
|
-
}
|
|
44
|
-
}
|
|
45
|
-
if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
|
|
46
|
-
const entryCount = data.getUint16(eocdOffset + 10, true);
|
|
47
|
-
if (entryCount > maxEntries) {
|
|
48
|
-
throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
|
|
49
|
-
}
|
|
50
|
-
const cdSize = data.getUint32(eocdOffset + 12, true);
|
|
51
|
-
const cdOffset = data.getUint32(eocdOffset + 16, true);
|
|
52
|
-
if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
|
|
53
|
-
let totalUncompressed = 0;
|
|
54
|
-
let pos = cdOffset;
|
|
55
|
-
for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
|
|
56
|
-
if (data.getUint32(pos, true) !== 33639248) break;
|
|
57
|
-
totalUncompressed += data.getUint32(pos + 24, true);
|
|
58
|
-
const nameLen = data.getUint16(pos + 28, true);
|
|
59
|
-
const extraLen = data.getUint16(pos + 30, true);
|
|
60
|
-
const commentLen = data.getUint16(pos + 32, true);
|
|
61
|
-
pos += 46 + nameLen + extraLen + commentLen;
|
|
62
|
-
}
|
|
63
|
-
if (totalUncompressed > maxUncompressedSize) {
|
|
64
|
-
throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
|
|
65
|
-
}
|
|
66
|
-
return { totalUncompressed, entryCount };
|
|
67
|
-
} catch (err) {
|
|
68
|
-
if (err instanceof KordocError) throw err;
|
|
69
|
-
return { totalUncompressed: 0, entryCount: 0 };
|
|
70
|
-
}
|
|
71
|
-
}
|
|
72
|
-
function stripDtd(xml) {
|
|
73
|
-
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
74
|
-
}
|
|
75
|
-
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
76
|
-
function sanitizeHref(href) {
|
|
77
|
-
const trimmed = href.trim();
|
|
78
|
-
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
79
|
-
return trimmed;
|
|
80
|
-
}
|
|
81
|
-
function safeMin(arr) {
|
|
82
|
-
let min = Infinity;
|
|
83
|
-
for (let i = 0; i < arr.length; i++) if (arr[i] < min) min = arr[i];
|
|
84
|
-
return min;
|
|
85
|
-
}
|
|
86
|
-
function safeMax(arr) {
|
|
87
|
-
let max = -Infinity;
|
|
88
|
-
for (let i = 0; i < arr.length; i++) if (arr[i] > max) max = arr[i];
|
|
89
|
-
return max;
|
|
90
|
-
}
|
|
91
|
-
function classifyError(err) {
|
|
92
|
-
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
93
|
-
const msg = err.message;
|
|
94
|
-
if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
|
|
95
|
-
if (msg.includes("DRM")) return "DRM_PROTECTED";
|
|
96
|
-
if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
|
|
97
|
-
if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
|
|
98
|
-
if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
|
|
99
|
-
if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
|
|
100
|
-
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
101
|
-
return "PARSE_ERROR";
|
|
102
|
-
}
|
|
103
|
-
|
|
104
|
-
// src/table/builder.ts
|
|
105
|
-
var MAX_COLS = 200;
|
|
106
|
-
var MAX_ROWS = 1e4;
|
|
107
|
-
function buildTable(rows) {
|
|
108
|
-
if (rows.length > MAX_ROWS) rows = rows.slice(0, MAX_ROWS);
|
|
109
|
-
const numRows = rows.length;
|
|
110
|
-
const hasAddr = rows.some((row) => row.some((c) => c.colAddr !== void 0 && c.rowAddr !== void 0));
|
|
111
|
-
if (hasAddr) return buildTableDirect(rows, numRows);
|
|
112
|
-
let maxCols = 0;
|
|
113
|
-
const tempOccupied = Array.from({ length: numRows }, () => []);
|
|
114
|
-
for (let rowIdx = 0; rowIdx < numRows; rowIdx++) {
|
|
115
|
-
let colIdx = 0;
|
|
116
|
-
for (const cell of rows[rowIdx]) {
|
|
117
|
-
while (colIdx < MAX_COLS && tempOccupied[rowIdx][colIdx]) colIdx++;
|
|
118
|
-
if (colIdx >= MAX_COLS) break;
|
|
119
|
-
for (let r = rowIdx; r < Math.min(rowIdx + cell.rowSpan, numRows); r++) {
|
|
120
|
-
for (let c = colIdx; c < Math.min(colIdx + cell.colSpan, MAX_COLS); c++) {
|
|
121
|
-
tempOccupied[r][c] = true;
|
|
122
|
-
}
|
|
123
|
-
}
|
|
124
|
-
colIdx += cell.colSpan;
|
|
125
|
-
if (colIdx > maxCols) maxCols = colIdx;
|
|
126
|
-
}
|
|
127
|
-
}
|
|
128
|
-
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
129
|
-
const grid = Array.from(
|
|
130
|
-
{ length: numRows },
|
|
131
|
-
() => Array.from({ length: maxCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
|
|
132
|
-
);
|
|
133
|
-
const occupied = Array.from({ length: numRows }, () => Array(maxCols).fill(false));
|
|
134
|
-
for (let rowIdx = 0; rowIdx < numRows; rowIdx++) {
|
|
135
|
-
let colIdx = 0;
|
|
136
|
-
let cellIdx = 0;
|
|
137
|
-
while (colIdx < maxCols && cellIdx < rows[rowIdx].length) {
|
|
138
|
-
while (colIdx < maxCols && occupied[rowIdx][colIdx]) colIdx++;
|
|
139
|
-
if (colIdx >= maxCols) break;
|
|
140
|
-
const cell = rows[rowIdx][cellIdx];
|
|
141
|
-
grid[rowIdx][colIdx] = {
|
|
142
|
-
text: cell.text.trim(),
|
|
143
|
-
colSpan: cell.colSpan,
|
|
144
|
-
rowSpan: cell.rowSpan
|
|
145
|
-
};
|
|
146
|
-
for (let r = rowIdx; r < Math.min(rowIdx + cell.rowSpan, numRows); r++) {
|
|
147
|
-
for (let c = colIdx; c < Math.min(colIdx + cell.colSpan, maxCols); c++) {
|
|
148
|
-
occupied[r][c] = true;
|
|
149
|
-
}
|
|
150
|
-
}
|
|
151
|
-
colIdx += cell.colSpan;
|
|
152
|
-
cellIdx++;
|
|
153
|
-
}
|
|
154
|
-
}
|
|
155
|
-
return trimAndReturn(grid, numRows, maxCols);
|
|
156
|
-
}
|
|
157
|
-
function buildTableDirect(rows, numRows) {
|
|
158
|
-
let maxCols = 0;
|
|
159
|
-
for (const row of rows) {
|
|
160
|
-
for (const cell of row) {
|
|
161
|
-
const end = (cell.colAddr ?? 0) + cell.colSpan;
|
|
162
|
-
if (end > maxCols) maxCols = end;
|
|
163
|
-
}
|
|
164
|
-
}
|
|
165
|
-
if (maxCols > MAX_COLS) maxCols = MAX_COLS;
|
|
166
|
-
if (maxCols === 0) return { rows: 0, cols: 0, cells: [], hasHeader: false };
|
|
167
|
-
const grid = Array.from(
|
|
168
|
-
{ length: numRows },
|
|
169
|
-
() => Array.from({ length: maxCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
|
|
170
|
-
);
|
|
171
|
-
for (const row of rows) {
|
|
172
|
-
for (const cell of row) {
|
|
173
|
-
const r = cell.rowAddr ?? 0;
|
|
174
|
-
const c = cell.colAddr ?? 0;
|
|
175
|
-
if (r >= numRows || c >= maxCols || r < 0 || c < 0) continue;
|
|
176
|
-
grid[r][c] = { text: cell.text.trim(), colSpan: cell.colSpan, rowSpan: cell.rowSpan };
|
|
177
|
-
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
178
|
-
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
179
|
-
if (dr === 0 && dc === 0) continue;
|
|
180
|
-
if (r + dr < numRows && c + dc < maxCols) {
|
|
181
|
-
grid[r + dr][c + dc] = { text: "", colSpan: 1, rowSpan: 1 };
|
|
182
|
-
}
|
|
183
|
-
}
|
|
184
|
-
}
|
|
185
|
-
}
|
|
186
|
-
}
|
|
187
|
-
return trimAndReturn(grid, numRows, maxCols);
|
|
188
|
-
}
|
|
189
|
-
function trimAndReturn(grid, numRows, maxCols) {
|
|
190
|
-
let effectiveCols = maxCols;
|
|
191
|
-
while (effectiveCols > 0) {
|
|
192
|
-
const colEmpty = grid.every((row) => !row[effectiveCols - 1]?.text?.trim());
|
|
193
|
-
if (!colEmpty) break;
|
|
194
|
-
effectiveCols--;
|
|
195
|
-
}
|
|
196
|
-
if (effectiveCols < maxCols && effectiveCols > 0) {
|
|
197
|
-
const trimmed = grid.map((row) => row.slice(0, effectiveCols));
|
|
198
|
-
return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
|
|
199
|
-
}
|
|
200
|
-
return { rows: numRows, cols: maxCols, cells: grid, hasHeader: numRows > 1 };
|
|
201
|
-
}
|
|
202
|
-
function convertTableToText(rows) {
|
|
203
|
-
return rows.map(
|
|
204
|
-
(row) => row.map((c) => c.text.trim().replace(/\n/g, " ").replace(/\|/g, "\\|")).filter(Boolean).join(" / ")
|
|
205
|
-
).filter(Boolean).join("\n");
|
|
206
|
-
}
|
|
207
|
-
function escapeGfm(text) {
|
|
208
|
-
return text.replace(/~/g, "\\~");
|
|
209
|
-
}
|
|
210
|
-
var HWP_SHAPE_ALT_TEXT_RE = /(?:모서리가 둥근 |둥근 )?(?:사각형|직사각형|정사각형|원|타원|삼각형|이등변 삼각형|직각 삼각형|선|직선|곡선|화살표|굵은 화살표|이중 화살표|오각형|육각형|팔각형|별|[4-8]점별|십자|십자형|구름|구름형|마름모|도넛|평행사변형|사다리꼴|부채꼴|호|반원|물결|번개|하트|빗금|블록 화살표|수식|표|그림|개체|그리기\s?개체|묶음\s?개체|글상자|수식\s?개체|OLE\s?개체)\s?입니다\.?/g;
|
|
211
|
-
function sanitizeText(text) {
|
|
212
|
-
let result = text.replace(/[\u{F0000}-\u{FFFFD}]/gu, "").replace(HWP_SHAPE_ALT_TEXT_RE, "").replace(/ +/g, " ").trim();
|
|
213
|
-
if (result.length <= 30 && result.includes(" ")) {
|
|
214
|
-
const tokens = result.split(" ");
|
|
215
|
-
const koreanSingleCharCount = tokens.filter((t) => t.length === 1 && /[\uAC00-\uD7AF\u3131-\u318E]/.test(t)).length;
|
|
216
|
-
if (tokens.length >= 3 && koreanSingleCharCount / tokens.length >= 0.7) {
|
|
217
|
-
result = tokens.join("");
|
|
218
|
-
}
|
|
219
|
-
}
|
|
220
|
-
return result;
|
|
221
|
-
}
|
|
222
|
-
function flattenLayoutTables(blocks) {
|
|
223
|
-
const result = [];
|
|
224
|
-
for (const block of blocks) {
|
|
225
|
-
if (block.type !== "table" || !block.table) {
|
|
226
|
-
result.push(block);
|
|
227
|
-
continue;
|
|
228
|
-
}
|
|
229
|
-
const { rows: numRows, cols: numCols, cells } = block.table;
|
|
230
|
-
if (numRows === 1 && numCols === 1) {
|
|
231
|
-
result.push(block);
|
|
232
|
-
continue;
|
|
233
|
-
}
|
|
234
|
-
if (numRows <= 3) {
|
|
235
|
-
let totalNewlines = 0;
|
|
236
|
-
let totalTextLen = 0;
|
|
237
|
-
for (let r = 0; r < numRows; r++) {
|
|
238
|
-
for (let c = 0; c < numCols; c++) {
|
|
239
|
-
const t = cells[r]?.[c]?.text || "";
|
|
240
|
-
totalNewlines += (t.match(/\n/g) || []).length;
|
|
241
|
-
totalTextLen += t.length;
|
|
242
|
-
}
|
|
243
|
-
}
|
|
244
|
-
if (totalNewlines > 5 || numRows <= 2 && totalTextLen > 300) {
|
|
245
|
-
for (let r = 0; r < numRows; r++) {
|
|
246
|
-
for (let c = 0; c < numCols; c++) {
|
|
247
|
-
const cellText = cells[r]?.[c]?.text?.trim();
|
|
248
|
-
if (!cellText) continue;
|
|
249
|
-
for (const line of cellText.split("\n")) {
|
|
250
|
-
const trimmed = line.trim();
|
|
251
|
-
if (!trimmed) continue;
|
|
252
|
-
result.push({ type: "paragraph", text: trimmed, pageNumber: block.pageNumber });
|
|
253
|
-
}
|
|
254
|
-
}
|
|
255
|
-
}
|
|
256
|
-
continue;
|
|
257
|
-
}
|
|
258
|
-
}
|
|
259
|
-
result.push(block);
|
|
260
|
-
}
|
|
261
|
-
return result;
|
|
262
|
-
}
|
|
263
|
-
function blocksToMarkdown(blocks) {
|
|
264
|
-
const lines = [];
|
|
265
|
-
for (let i = 0; i < blocks.length; i++) {
|
|
266
|
-
const block = blocks[i];
|
|
267
|
-
if (block.type === "heading" && block.text) {
|
|
268
|
-
const prefix = "#".repeat(Math.min(block.level || 2, 6));
|
|
269
|
-
const headingText = sanitizeText(block.text);
|
|
270
|
-
if (headingText) lines.push("", `${prefix} ${headingText}`, "");
|
|
271
|
-
continue;
|
|
272
|
-
}
|
|
273
|
-
if (block.type === "image" && block.text) {
|
|
274
|
-
lines.push("", ``, "");
|
|
275
|
-
continue;
|
|
276
|
-
}
|
|
277
|
-
if (block.type === "separator") {
|
|
278
|
-
lines.push("", "---", "");
|
|
279
|
-
continue;
|
|
280
|
-
}
|
|
281
|
-
if (block.type === "list" && block.text) {
|
|
282
|
-
const listText = sanitizeText(block.text);
|
|
283
|
-
if (!listText) continue;
|
|
284
|
-
const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(listText);
|
|
285
|
-
const prefix = alreadyNumbered ? "" : block.listType === "ordered" ? "1. " : "- ";
|
|
286
|
-
lines.push(`${prefix}${listText}`);
|
|
287
|
-
if (block.children) {
|
|
288
|
-
for (const child of block.children) {
|
|
289
|
-
const childPrefix = child.listType === "ordered" ? "1." : "-";
|
|
290
|
-
lines.push(` ${childPrefix} ${child.text || ""}`);
|
|
291
|
-
}
|
|
292
|
-
}
|
|
293
|
-
continue;
|
|
294
|
-
}
|
|
295
|
-
if (block.type === "paragraph" && block.text) {
|
|
296
|
-
let text = sanitizeText(block.text);
|
|
297
|
-
if (!text) continue;
|
|
298
|
-
if (/^\[별표\s*\d+/.test(text)) {
|
|
299
|
-
const nextBlock = blocks[i + 1];
|
|
300
|
-
if (nextBlock?.type === "paragraph" && nextBlock.text && /관련\)?$/.test(nextBlock.text)) {
|
|
301
|
-
lines.push("", `## ${text} ${nextBlock.text}`, "");
|
|
302
|
-
i++;
|
|
303
|
-
} else {
|
|
304
|
-
lines.push("", `## ${text}`, "");
|
|
305
|
-
}
|
|
306
|
-
continue;
|
|
307
|
-
}
|
|
308
|
-
if (/^\([^)]*조[^)]*관련\)$/.test(text)) {
|
|
309
|
-
lines.push(`*${text}*`, "");
|
|
310
|
-
continue;
|
|
311
|
-
}
|
|
312
|
-
if (block.href) {
|
|
313
|
-
const href = sanitizeHref(block.href);
|
|
314
|
-
if (href) text = `[${text}](${href})`;
|
|
315
|
-
}
|
|
316
|
-
if (block.footnoteText) {
|
|
317
|
-
text += ` (\uC8FC: ${block.footnoteText})`;
|
|
318
|
-
}
|
|
319
|
-
lines.push(escapeGfm(text), "");
|
|
320
|
-
} else if (block.type === "table" && block.table) {
|
|
321
|
-
if (lines.length > 0 && lines[lines.length - 1] !== "") {
|
|
322
|
-
lines.push("");
|
|
323
|
-
}
|
|
324
|
-
const tableMd = tableToMarkdown(block.table);
|
|
325
|
-
if (tableMd) {
|
|
326
|
-
lines.push(tableMd);
|
|
327
|
-
lines.push("");
|
|
328
|
-
}
|
|
329
|
-
}
|
|
330
|
-
}
|
|
331
|
-
return lines.join("\n").trim();
|
|
332
|
-
}
|
|
333
|
-
function hasMergedCells(table) {
|
|
334
|
-
for (const row of table.cells) {
|
|
335
|
-
for (const cell of row) {
|
|
336
|
-
if (cell.colSpan > 1 || cell.rowSpan > 1) return true;
|
|
337
|
-
}
|
|
338
|
-
}
|
|
339
|
-
return false;
|
|
340
|
-
}
|
|
341
|
-
function tableToHtml(table) {
|
|
342
|
-
const { cells, rows: numRows, cols: numCols } = table;
|
|
343
|
-
const skip = /* @__PURE__ */ new Set();
|
|
344
|
-
const lines = ["<table>"];
|
|
345
|
-
for (let r = 0; r < numRows; r++) {
|
|
346
|
-
const tag = r === 0 ? "th" : "td";
|
|
347
|
-
const rowHtml = [];
|
|
348
|
-
for (let c = 0; c < numCols; c++) {
|
|
349
|
-
if (skip.has(`${r},${c}`)) continue;
|
|
350
|
-
const cell = cells[r]?.[c];
|
|
351
|
-
if (!cell) continue;
|
|
352
|
-
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
353
|
-
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
354
|
-
if (dr === 0 && dc === 0) continue;
|
|
355
|
-
if (r + dr < numRows && c + dc < numCols) skip.add(`${r + dr},${c + dc}`);
|
|
356
|
-
}
|
|
357
|
-
}
|
|
358
|
-
const text = sanitizeText(cell.text).replace(/\n/g, "<br>");
|
|
359
|
-
const attrs = [];
|
|
360
|
-
if (cell.colSpan > 1) attrs.push(`colspan="${cell.colSpan}"`);
|
|
361
|
-
if (cell.rowSpan > 1) attrs.push(`rowspan="${cell.rowSpan}"`);
|
|
362
|
-
const attrStr = attrs.length ? " " + attrs.join(" ") : "";
|
|
363
|
-
rowHtml.push(`<${tag}${attrStr}>${text}</${tag}>`);
|
|
364
|
-
}
|
|
365
|
-
if (rowHtml.length) lines.push(`<tr>${rowHtml.join("")}</tr>`);
|
|
366
|
-
}
|
|
367
|
-
lines.push("</table>");
|
|
368
|
-
return lines.join("\n");
|
|
369
|
-
}
|
|
370
|
-
function tableToMarkdown(table) {
|
|
371
|
-
if (table.rows === 0 || table.cols === 0) return "";
|
|
372
|
-
const { cells, rows: numRows, cols: numCols } = table;
|
|
373
|
-
if (hasMergedCells(table)) return tableToHtml(table);
|
|
374
|
-
if (numRows === 1 && numCols === 1) {
|
|
375
|
-
const content = sanitizeText(cells[0][0].text);
|
|
376
|
-
if (!content) return "";
|
|
377
|
-
return content.split(/\n/).map((line) => {
|
|
378
|
-
const trimmed = line.trim();
|
|
379
|
-
if (!trimmed) return "";
|
|
380
|
-
if (/^\d+\.\s/.test(trimmed)) return `**${escapeGfm(trimmed)}**`;
|
|
381
|
-
if (/^[가-힣]\.\s/.test(trimmed)) return ` ${escapeGfm(trimmed)}`;
|
|
382
|
-
return escapeGfm(trimmed);
|
|
383
|
-
}).filter(Boolean).join("\n");
|
|
384
|
-
}
|
|
385
|
-
if (numCols === 1 && numRows >= 2) {
|
|
386
|
-
return cells.map((row) => escapeGfm(sanitizeText(row[0].text)).replace(/\n/g, " ")).filter(Boolean).join("\n");
|
|
387
|
-
}
|
|
388
|
-
const display = Array.from({ length: numRows }, () => Array(numCols).fill(""));
|
|
389
|
-
const skip = /* @__PURE__ */ new Set();
|
|
390
|
-
for (let r = 0; r < numRows; r++) {
|
|
391
|
-
for (let c = 0; c < numCols; c++) {
|
|
392
|
-
if (skip.has(`${r},${c}`)) continue;
|
|
393
|
-
const cell = cells[r]?.[c];
|
|
394
|
-
if (!cell) continue;
|
|
395
|
-
display[r][c] = escapeGfm(sanitizeText(cell.text)).replace(/\|/g, "\\|").replace(/\n/g, "<br>");
|
|
396
|
-
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
397
|
-
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
398
|
-
if (dr === 0 && dc === 0) continue;
|
|
399
|
-
if (r + dr < numRows && c + dc < numCols) {
|
|
400
|
-
skip.add(`${r + dr},${c + dc}`);
|
|
401
|
-
}
|
|
402
|
-
}
|
|
403
|
-
}
|
|
404
|
-
c += cell.colSpan - 1;
|
|
405
|
-
}
|
|
406
|
-
}
|
|
407
|
-
const uniqueRows = [];
|
|
408
|
-
let pendingFirstCol = "";
|
|
409
|
-
for (let r = 0; r < display.length; r++) {
|
|
410
|
-
const row = display[r];
|
|
411
|
-
const isEmptyPlaceholder = row.every((cell) => cell === "");
|
|
412
|
-
if (isEmptyPlaceholder) continue;
|
|
413
|
-
const nonEmptyCols = row.filter((cell) => cell !== "");
|
|
414
|
-
const hasSkipInRow = row.some((_, c) => skip.has(`${r},${c}`));
|
|
415
|
-
if (!hasSkipInRow && nonEmptyCols.length === 1 && row[0] !== "" && row.slice(1).every((c) => c === "")) {
|
|
416
|
-
pendingFirstCol = row[0];
|
|
417
|
-
continue;
|
|
418
|
-
}
|
|
419
|
-
if (pendingFirstCol && row[0] === "") {
|
|
420
|
-
row[0] = pendingFirstCol;
|
|
421
|
-
pendingFirstCol = "";
|
|
422
|
-
} else {
|
|
423
|
-
pendingFirstCol = "";
|
|
424
|
-
}
|
|
425
|
-
uniqueRows.push(row);
|
|
426
|
-
}
|
|
427
|
-
if (uniqueRows.length === 0) return "";
|
|
428
|
-
const md = [];
|
|
429
|
-
md.push("| " + uniqueRows[0].join(" | ") + " |");
|
|
430
|
-
md.push("| " + uniqueRows[0].map(() => "---").join(" | ") + " |");
|
|
431
|
-
for (let i = 1; i < uniqueRows.length; i++) {
|
|
432
|
-
md.push("| " + uniqueRows[i].join(" | ") + " |");
|
|
433
|
-
}
|
|
434
|
-
return md.join("\n");
|
|
435
|
-
}
|
|
436
|
-
|
|
437
28
|
// src/hwpx/parser.ts
|
|
438
29
|
import JSZip from "jszip";
|
|
439
30
|
import { inflateRawSync } from "zlib";
|
|
440
31
|
import { DOMParser } from "@xmldom/xmldom";
|
|
441
|
-
|
|
442
|
-
// src/types.ts
|
|
443
|
-
var HEADING_RATIO_H1 = 1.5;
|
|
444
|
-
var HEADING_RATIO_H2 = 1.3;
|
|
445
|
-
var HEADING_RATIO_H3 = 1.15;
|
|
446
|
-
|
|
447
|
-
// src/hwpx/parser.ts
|
|
448
32
|
var MAX_DECOMPRESS_SIZE = 100 * 1024 * 1024;
|
|
449
33
|
var MAX_ZIP_ENTRIES = 500;
|
|
450
34
|
function clampSpan(val, max) {
|
|
@@ -2929,2272 +2513,1040 @@ function arrangeCells(rows, cols, cells) {
|
|
|
2929
2513
|
return grid.map((row) => row.map((c) => c || { text: "", colSpan: 1, rowSpan: 1 }));
|
|
2930
2514
|
}
|
|
2931
2515
|
|
|
2932
|
-
// src/
|
|
2933
|
-
|
|
2934
|
-
|
|
2935
|
-
|
|
2936
|
-
|
|
2937
|
-
|
|
2938
|
-
|
|
2939
|
-
|
|
2940
|
-
|
|
2941
|
-
|
|
2942
|
-
|
|
2943
|
-
|
|
2944
|
-
|
|
2945
|
-
|
|
2946
|
-
|
|
2947
|
-
|
|
2948
|
-
|
|
2949
|
-
|
|
2950
|
-
|
|
2951
|
-
|
|
2952
|
-
|
|
2953
|
-
|
|
2954
|
-
|
|
2955
|
-
|
|
2956
|
-
|
|
2957
|
-
|
|
2958
|
-
|
|
2959
|
-
|
|
2960
|
-
|
|
2961
|
-
|
|
2962
|
-
|
|
2516
|
+
// src/form/recognize.ts
|
|
2517
|
+
var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
|
|
2518
|
+
"\uC131\uBA85",
|
|
2519
|
+
"\uC774\uB984",
|
|
2520
|
+
"\uC8FC\uC18C",
|
|
2521
|
+
"\uC804\uD654",
|
|
2522
|
+
"\uC804\uD654\uBC88\uD638",
|
|
2523
|
+
"\uD734\uB300\uD3F0",
|
|
2524
|
+
"\uD578\uB4DC\uD3F0",
|
|
2525
|
+
"\uC5F0\uB77D\uCC98",
|
|
2526
|
+
"\uC0DD\uB144\uC6D4\uC77C",
|
|
2527
|
+
"\uC8FC\uBBFC\uB4F1\uB85D\uBC88\uD638",
|
|
2528
|
+
"\uC18C\uC18D",
|
|
2529
|
+
"\uC9C1\uC704",
|
|
2530
|
+
"\uC9C1\uAE09",
|
|
2531
|
+
"\uBD80\uC11C",
|
|
2532
|
+
"\uC774\uBA54\uC77C",
|
|
2533
|
+
"\uD329\uC2A4",
|
|
2534
|
+
"\uD559\uAD50",
|
|
2535
|
+
"\uD559\uB144",
|
|
2536
|
+
"\uBC18",
|
|
2537
|
+
"\uBC88\uD638",
|
|
2538
|
+
"\uC2E0\uCCAD\uC778",
|
|
2539
|
+
"\uB300\uD45C\uC790",
|
|
2540
|
+
"\uB2F4\uB2F9\uC790",
|
|
2541
|
+
"\uC791\uC131\uC790",
|
|
2542
|
+
"\uD655\uC778\uC790",
|
|
2543
|
+
"\uC2B9\uC778\uC790",
|
|
2544
|
+
"\uC77C\uC2DC",
|
|
2545
|
+
"\uB0A0\uC9DC",
|
|
2546
|
+
"\uAE30\uAC04",
|
|
2547
|
+
"\uC7A5\uC18C",
|
|
2548
|
+
"\uBAA9\uC801",
|
|
2549
|
+
"\uC0AC\uC720",
|
|
2550
|
+
"\uBE44\uACE0",
|
|
2551
|
+
"\uAE08\uC561",
|
|
2552
|
+
"\uC218\uB7C9",
|
|
2553
|
+
"\uB2E8\uAC00",
|
|
2554
|
+
"\uD569\uACC4",
|
|
2555
|
+
"\uACC4",
|
|
2556
|
+
"\uC18C\uACC4",
|
|
2557
|
+
"\uB4F1\uB85D\uAE30\uC900\uC9C0",
|
|
2558
|
+
"\uBCF8\uC801",
|
|
2559
|
+
"\uC704\uC784\uC778",
|
|
2560
|
+
"\uCCAD\uAD6C\uC0AC\uC720",
|
|
2561
|
+
"\uC18C\uBA85\uC790\uB8CC"
|
|
2562
|
+
]);
|
|
2563
|
+
function isLabelCell(text) {
|
|
2564
|
+
const trimmed = text.trim().replace(/[¹²³⁴⁵⁶⁷⁸⁹⁰*※]+$/g, "").trim();
|
|
2565
|
+
if (!trimmed || trimmed.length > 30) return false;
|
|
2566
|
+
for (const kw of LABEL_KEYWORDS) {
|
|
2567
|
+
if (trimmed.includes(kw)) return true;
|
|
2963
2568
|
}
|
|
2964
|
-
|
|
2965
|
-
|
|
2966
|
-
|
|
2967
|
-
|
|
2569
|
+
if (/^[가-힣\s()()·::]+$/.test(trimmed) && trimmed.replace(/\s/g, "").length >= 2 && trimmed.replace(/\s/g, "").length <= 8 && !/\d/.test(trimmed)) return true;
|
|
2570
|
+
if (/^[가-힣A-Za-z\s]+[::]$/.test(trimmed)) return true;
|
|
2571
|
+
return false;
|
|
2572
|
+
}
|
|
2573
|
+
function extractFormFields(blocks) {
|
|
2574
|
+
const fields = [];
|
|
2575
|
+
let totalTables = 0;
|
|
2576
|
+
let formTables = 0;
|
|
2577
|
+
for (const block of blocks) {
|
|
2578
|
+
if (block.type !== "table" || !block.table) continue;
|
|
2579
|
+
totalTables++;
|
|
2580
|
+
const tableFields = extractFromTable(block.table);
|
|
2581
|
+
if (tableFields.length > 0) {
|
|
2582
|
+
formTables++;
|
|
2583
|
+
fields.push(...tableFields);
|
|
2968
2584
|
}
|
|
2969
|
-
|
|
2970
|
-
|
|
2585
|
+
}
|
|
2586
|
+
for (const block of blocks) {
|
|
2587
|
+
if (block.type === "paragraph" && block.text) {
|
|
2588
|
+
const inlineFields = extractInlineFields(block.text);
|
|
2589
|
+
fields.push(...inlineFields);
|
|
2971
2590
|
}
|
|
2972
|
-
currentPath = [];
|
|
2973
2591
|
}
|
|
2974
|
-
|
|
2975
|
-
|
|
2976
|
-
|
|
2977
|
-
|
|
2978
|
-
|
|
2979
|
-
|
|
2980
|
-
|
|
2981
|
-
|
|
2982
|
-
const
|
|
2983
|
-
|
|
2984
|
-
|
|
2985
|
-
|
|
2986
|
-
|
|
2987
|
-
|
|
2988
|
-
|
|
2989
|
-
|
|
2990
|
-
|
|
2991
|
-
pathStartX = curX;
|
|
2992
|
-
pathStartY = curY;
|
|
2993
|
-
} else if (subOp === OPS.lineTo) {
|
|
2994
|
-
const x2 = coords[ci++], y2 = coords[ci++];
|
|
2995
|
-
currentPath.push({ x1: curX, y1: curY, x2, y2 });
|
|
2996
|
-
curX = x2;
|
|
2997
|
-
curY = y2;
|
|
2998
|
-
} else if (subOp === OPS.rectangle) {
|
|
2999
|
-
const rx = coords[ci++], ry = coords[ci++];
|
|
3000
|
-
const rw = coords[ci++], rh = coords[ci++];
|
|
3001
|
-
pushRectangle(currentPath, rx, ry, rw, rh);
|
|
3002
|
-
} else if (subOp === OPS.closePath) {
|
|
3003
|
-
if (curX !== pathStartX || curY !== pathStartY) {
|
|
3004
|
-
currentPath.push({ x1: curX, y1: curY, x2: pathStartX, y2: pathStartY });
|
|
3005
|
-
}
|
|
3006
|
-
curX = pathStartX;
|
|
3007
|
-
curY = pathStartY;
|
|
3008
|
-
} else if (subOp === OPS.curveTo) {
|
|
3009
|
-
ci += 6;
|
|
3010
|
-
} else if (subOp === OPS.curveTo2 || subOp === OPS.curveTo3) {
|
|
3011
|
-
ci += 4;
|
|
3012
|
-
}
|
|
3013
|
-
}
|
|
3014
|
-
} else {
|
|
3015
|
-
const afterOp = arg0;
|
|
3016
|
-
const dataArr = args[1];
|
|
3017
|
-
const pathData = dataArr?.[0];
|
|
3018
|
-
if (pathData && typeof pathData === "object") {
|
|
3019
|
-
const len = Object.keys(pathData).length;
|
|
3020
|
-
let di = 0;
|
|
3021
|
-
while (di < len) {
|
|
3022
|
-
const drawOp = pathData[di++];
|
|
3023
|
-
if (drawOp === 0 /* moveTo */) {
|
|
3024
|
-
curX = pathData[di++];
|
|
3025
|
-
curY = pathData[di++];
|
|
3026
|
-
pathStartX = curX;
|
|
3027
|
-
pathStartY = curY;
|
|
3028
|
-
} else if (drawOp === 1 /* lineTo */) {
|
|
3029
|
-
const x2 = pathData[di++], y2 = pathData[di++];
|
|
3030
|
-
currentPath.push({ x1: curX, y1: curY, x2, y2 });
|
|
3031
|
-
curX = x2;
|
|
3032
|
-
curY = y2;
|
|
3033
|
-
} else if (drawOp === 2 /* curveTo */) {
|
|
3034
|
-
di += 6;
|
|
3035
|
-
} else if (drawOp === 3 /* quadraticCurveTo */) {
|
|
3036
|
-
di += 4;
|
|
3037
|
-
} else if (drawOp === 4 /* closePath */) {
|
|
3038
|
-
if (curX !== pathStartX || curY !== pathStartY) {
|
|
3039
|
-
currentPath.push({ x1: curX, y1: curY, x2: pathStartX, y2: pathStartY });
|
|
3040
|
-
}
|
|
3041
|
-
curX = pathStartX;
|
|
3042
|
-
curY = pathStartY;
|
|
3043
|
-
} else {
|
|
3044
|
-
break;
|
|
3045
|
-
}
|
|
3046
|
-
}
|
|
3047
|
-
}
|
|
3048
|
-
if (afterOp === OPS.stroke || afterOp === OPS.closeStroke) {
|
|
3049
|
-
flushPath(true);
|
|
3050
|
-
} else if (afterOp === OPS.fill || afterOp === OPS.eoFill || afterOp === OPS.fillStroke || afterOp === OPS.eoFillStroke || afterOp === OPS.closeFillStroke || afterOp === OPS.closeEOFillStroke) {
|
|
3051
|
-
flushPath(true);
|
|
3052
|
-
} else if (afterOp === OPS.endPath) {
|
|
3053
|
-
flushPath(false);
|
|
3054
|
-
}
|
|
2592
|
+
const confidence = totalTables > 0 ? formTables / totalTables : fields.length > 0 ? 0.3 : 0;
|
|
2593
|
+
return { fields, confidence: Math.min(confidence, 1) };
|
|
2594
|
+
}
|
|
2595
|
+
function extractFromTable(table) {
|
|
2596
|
+
const fields = [];
|
|
2597
|
+
if (table.cols >= 2) {
|
|
2598
|
+
for (let r = 0; r < table.rows; r++) {
|
|
2599
|
+
for (let c = 0; c < table.cols - 1; c++) {
|
|
2600
|
+
const labelCell = table.cells[r][c];
|
|
2601
|
+
const valueCell = table.cells[r][c + 1];
|
|
2602
|
+
if (isLabelCell(labelCell.text)) {
|
|
2603
|
+
fields.push({
|
|
2604
|
+
label: labelCell.text.trim().replace(/[::]\s*$/, ""),
|
|
2605
|
+
value: valueCell.text.trim(),
|
|
2606
|
+
row: r,
|
|
2607
|
+
col: c
|
|
2608
|
+
});
|
|
3055
2609
|
}
|
|
3056
|
-
break;
|
|
3057
2610
|
}
|
|
3058
|
-
case OPS.stroke:
|
|
3059
|
-
case OPS.closeStroke:
|
|
3060
|
-
flushPath(true);
|
|
3061
|
-
break;
|
|
3062
|
-
case OPS.fill:
|
|
3063
|
-
case OPS.eoFill:
|
|
3064
|
-
case OPS.fillStroke:
|
|
3065
|
-
case OPS.eoFillStroke:
|
|
3066
|
-
case OPS.closeFillStroke:
|
|
3067
|
-
case OPS.closeEOFillStroke:
|
|
3068
|
-
flushPath(true);
|
|
3069
|
-
break;
|
|
3070
|
-
case OPS.endPath:
|
|
3071
|
-
flushPath(false);
|
|
3072
|
-
break;
|
|
3073
2611
|
}
|
|
3074
2612
|
}
|
|
3075
|
-
|
|
3076
|
-
|
|
3077
|
-
|
|
3078
|
-
|
|
3079
|
-
|
|
3080
|
-
|
|
3081
|
-
|
|
3082
|
-
|
|
3083
|
-
|
|
3084
|
-
|
|
3085
|
-
|
|
3086
|
-
|
|
3087
|
-
|
|
3088
|
-
|
|
3089
|
-
const y1 = Math.min(seg.y1, seg.y2);
|
|
3090
|
-
const y2 = Math.max(seg.y1, seg.y2);
|
|
3091
|
-
verticals.push({ x1: x, y1, x2: x, y2, lineWidth });
|
|
3092
|
-
}
|
|
3093
|
-
}
|
|
3094
|
-
function preprocessLines(horizontals, verticals) {
|
|
3095
|
-
let h = horizontals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3096
|
-
let v = verticals.filter((l) => l.lineWidth <= MAX_LINE_WIDTH);
|
|
3097
|
-
h = mergeParallelLines(h, "h");
|
|
3098
|
-
v = mergeParallelLines(v, "v");
|
|
3099
|
-
return { horizontals: h, verticals: v };
|
|
3100
|
-
}
|
|
3101
|
-
function mergeParallelLines(lines, dir) {
|
|
3102
|
-
if (lines.length <= 1) return lines;
|
|
3103
|
-
const sorted = [...lines].sort((a, b) => {
|
|
3104
|
-
const posA = dir === "h" ? a.y1 : a.x1;
|
|
3105
|
-
const posB = dir === "h" ? b.y1 : b.x1;
|
|
3106
|
-
if (Math.abs(posA - posB) > 0.1) return posA - posB;
|
|
3107
|
-
return dir === "h" ? a.x1 - b.x1 : a.y1 - b.y1;
|
|
3108
|
-
});
|
|
3109
|
-
const MERGE_TOL = 3;
|
|
3110
|
-
const result = [sorted[0]];
|
|
3111
|
-
for (let i = 1; i < sorted.length; i++) {
|
|
3112
|
-
const prev = result[result.length - 1];
|
|
3113
|
-
const curr = sorted[i];
|
|
3114
|
-
const prevPos = dir === "h" ? prev.y1 : prev.x1;
|
|
3115
|
-
const currPos = dir === "h" ? curr.y1 : curr.x1;
|
|
3116
|
-
if (Math.abs(prevPos - currPos) <= MERGE_TOL) {
|
|
3117
|
-
const prevStart = dir === "h" ? prev.x1 : prev.y1;
|
|
3118
|
-
const prevEnd = dir === "h" ? prev.x2 : prev.y2;
|
|
3119
|
-
const currStart = dir === "h" ? curr.x1 : curr.y1;
|
|
3120
|
-
const currEnd = dir === "h" ? curr.x2 : curr.y2;
|
|
3121
|
-
const overlap = Math.min(prevEnd, currEnd) - Math.max(prevStart, currStart);
|
|
3122
|
-
const minLen = Math.min(prevEnd - prevStart, currEnd - currStart);
|
|
3123
|
-
if (overlap > minLen * 0.3) {
|
|
3124
|
-
if (dir === "h") {
|
|
3125
|
-
prev.x1 = Math.min(prev.x1, curr.x1);
|
|
3126
|
-
prev.x2 = Math.max(prev.x2, curr.x2);
|
|
3127
|
-
prev.y1 = (prev.y1 + curr.y1) / 2;
|
|
3128
|
-
prev.y2 = prev.y1;
|
|
3129
|
-
} else {
|
|
3130
|
-
prev.y1 = Math.min(prev.y1, curr.y1);
|
|
3131
|
-
prev.y2 = Math.max(prev.y2, curr.y2);
|
|
3132
|
-
prev.x1 = (prev.x1 + curr.x1) / 2;
|
|
3133
|
-
prev.x2 = prev.x1;
|
|
2613
|
+
if (fields.length === 0 && table.rows >= 2 && table.cols >= 2) {
|
|
2614
|
+
const headerRow = table.cells[0];
|
|
2615
|
+
const allLabels = headerRow.every((cell) => {
|
|
2616
|
+
const t = cell.text.trim();
|
|
2617
|
+
return t.length > 0 && t.length <= 20;
|
|
2618
|
+
});
|
|
2619
|
+
if (allLabels) {
|
|
2620
|
+
for (let r = 1; r < table.rows; r++) {
|
|
2621
|
+
for (let c = 0; c < table.cols; c++) {
|
|
2622
|
+
const label = headerRow[c].text.trim();
|
|
2623
|
+
const value = table.cells[r][c].text.trim();
|
|
2624
|
+
if (label && value) {
|
|
2625
|
+
fields.push({ label, value, row: r, col: c });
|
|
2626
|
+
}
|
|
3134
2627
|
}
|
|
3135
|
-
prev.lineWidth = Math.max(prev.lineWidth, curr.lineWidth);
|
|
3136
|
-
continue;
|
|
3137
2628
|
}
|
|
3138
2629
|
}
|
|
3139
|
-
result.push(curr);
|
|
3140
2630
|
}
|
|
3141
|
-
return
|
|
3142
|
-
}
|
|
3143
|
-
function filterPageBorderLines(horizontals, verticals, pageWidth, pageHeight) {
|
|
3144
|
-
const margin = 5;
|
|
3145
|
-
return {
|
|
3146
|
-
horizontals: horizontals.filter(
|
|
3147
|
-
(l) => !(Math.abs(l.y1) < margin || Math.abs(l.y1 - pageHeight) < margin) || l.x2 - l.x1 < pageWidth * 0.9
|
|
3148
|
-
),
|
|
3149
|
-
verticals: verticals.filter(
|
|
3150
|
-
(l) => !(Math.abs(l.x1) < margin || Math.abs(l.x1 - pageWidth) < margin) || l.y2 - l.y1 < pageHeight * 0.9
|
|
3151
|
-
)
|
|
3152
|
-
};
|
|
2631
|
+
return fields;
|
|
3153
2632
|
}
|
|
3154
|
-
function
|
|
3155
|
-
const
|
|
3156
|
-
const
|
|
3157
|
-
|
|
3158
|
-
|
|
3159
|
-
|
|
3160
|
-
|
|
3161
|
-
|
|
3162
|
-
}
|
|
2633
|
+
/**
 * Extracts inline `label: value` pairs from free-running text.
 *
 * A label is 2-10 Hangul or ASCII letters followed by an ASCII or full-width
 * colon; the value is the rest of the same line up to the next `,` or `;`
 * (at most 100 characters).
 *
 * @param {string} text - Text to scan.
 * @returns {Array<{label: string, value: string, row: number, col: number}>}
 *   One entry per non-empty value; `row`/`col` are always -1 because inline
 *   fields have no table position.
 */
function extractInlineFields(text) {
  // After the colon only horizontal whitespace may be skipped. A plain `\s*`
  // also matches "\n", which made an empty "label:" line take the whole next
  // line as its value and hid that line's own field.
  // The colon class accepts both ":" and the full-width "\uFF1A".
  const pattern = /([가-힣A-Za-z]{2,10})\s*[:\uFF1A][^\S\n]*([^\n,;]{1,100})/g;
  const fields = [];
  for (const match of text.matchAll(pattern)) {
    const label = match[1].trim();
    const value = match[2].trim();
    // A value made only of blanks is an unfilled field, not a result.
    if (value) {
      fields.push({ label, value, row: -1, col: -1 });
    }
  }
  return fields;
}
|
|
2646
|
+
|
|
2647
|
+
// src/form/match.ts
|
|
2648
|
+
/**
 * Reduces a form label to its comparison key by deleting every colon,
 * whitespace character, parenthesis and middle dot (U+00B7).
 *
 * Both the ASCII and the full-width forms of the colon and parentheses are
 * removed: this module writes the full-width colon (U+FF1A) when it fills
 * `(keyword: )` placeholders, so labels carrying it must normalize to the
 * same key as their ASCII spelling.
 *
 * @param {string} label - Raw label text.
 * @returns {string} The label with separators removed (may be empty).
 */
function normalizeLabel(label) {
  // `\s` already covers everything `trim()` would strip, so no separate trim.
  return label.replace(/[:\uFF1A\s()\uFF08\uFF09\u00B7]/g, "");
}
|
|
2651
|
+
/**
 * Picks the key of `values` that best corresponds to a normalized cell label.
 *
 * An exact hit wins immediately. Otherwise a key qualifies when one string is
 * a prefix of the other and the shorter one is at least 60% as long as the
 * longer one; among qualifying keys the one with the longest shared prefix
 * wins, and ties keep the earliest key in iteration order.
 *
 * @param {string} cellLabel - Normalized label read from the document.
 * @param {Map<string, *>} values - Candidate keys (already normalized).
 * @returns {string|undefined} The chosen key, or `undefined` if none qualifies.
 */
function findMatchingKey(cellLabel, values) {
  if (values.has(cellLabel)) {
    return cellLabel;
  }

  const MIN_LENGTH_RATIO = 0.6;
  let winner;
  let winnerScore = 0;

  for (const candidate of values.keys()) {
    let shorter;
    let longer;
    if (cellLabel.startsWith(candidate)) {
      shorter = candidate;
      longer = cellLabel;
    } else if (candidate.startsWith(cellLabel)) {
      shorter = cellLabel;
      longer = candidate;
    } else {
      continue;
    }

    // The shared prefix is the shorter string; it doubles as the score.
    const longEnough = shorter.length >= longer.length * MIN_LENGTH_RATIO;
    if (longEnough && shorter.length > winnerScore) {
      winnerScore = shorter.length;
      winner = candidate;
    }
  }

  return winner;
}
|
|
3191
|
-
function
|
|
3192
|
-
|
|
3193
|
-
|
|
3194
|
-
const
|
|
3195
|
-
|
|
3196
|
-
const globalRadius = vertices.reduce((max, v) => Math.max(max, v.radius), 1);
|
|
3197
|
-
const allLines = [
|
|
3198
|
-
...horizontals.map((l, i) => ({ ...l, type: "h", id: i })),
|
|
3199
|
-
...verticals.map((l, i) => ({ ...l, type: "v", id: i + horizontals.length }))
|
|
3200
|
-
];
|
|
3201
|
-
const groups = groupConnectedLines(allLines);
|
|
3202
|
-
const grids = [];
|
|
3203
|
-
for (const group of groups) {
|
|
3204
|
-
const hLines = group.filter((l) => l.type === "h");
|
|
3205
|
-
const vLines = group.filter((l) => l.type === "v");
|
|
3206
|
-
if (hLines.length < 2 || vLines.length < 2) continue;
|
|
3207
|
-
let gx1 = Infinity, gy1 = Infinity, gx2 = -Infinity, gy2 = -Infinity;
|
|
3208
|
-
for (const l of vLines) {
|
|
3209
|
-
if (l.x1 < gx1) gx1 = l.x1;
|
|
3210
|
-
if (l.x1 > gx2) gx2 = l.x1;
|
|
3211
|
-
}
|
|
3212
|
-
for (const l of hLines) {
|
|
3213
|
-
if (l.y1 < gy1) gy1 = l.y1;
|
|
3214
|
-
if (l.y1 > gy2) gy2 = l.y1;
|
|
3215
|
-
}
|
|
3216
|
-
const groupBbox = {
|
|
3217
|
-
x1: gx1 - CONNECT_TOL,
|
|
3218
|
-
y1: gy1 - CONNECT_TOL,
|
|
3219
|
-
x2: gx2 + CONNECT_TOL,
|
|
3220
|
-
y2: gy2 + CONNECT_TOL
|
|
3221
|
-
};
|
|
3222
|
-
const groupVertices = vertices.filter(
|
|
3223
|
-
(v) => v.x >= groupBbox.x1 && v.x <= groupBbox.x2 && v.y >= groupBbox.y1 && v.y <= groupBbox.y2
|
|
3224
|
-
);
|
|
3225
|
-
const groupRadius = groupVertices.length > 0 ? groupVertices.reduce((max, v) => Math.max(max, v.radius), 1) : globalRadius;
|
|
3226
|
-
const coordMergeTol = Math.max(VERTEX_MERGE_FACTOR * groupRadius, MIN_COORD_MERGE_TOL);
|
|
3227
|
-
const rawYs = [
|
|
3228
|
-
...hLines.map((l) => l.y1),
|
|
3229
|
-
...groupVertices.map((v) => v.y)
|
|
3230
|
-
];
|
|
3231
|
-
const rowYs = clusterCoordinates(rawYs, coordMergeTol).sort((a, b) => b - a);
|
|
3232
|
-
const rawXs = [
|
|
3233
|
-
...vLines.map((l) => l.x1),
|
|
3234
|
-
...groupVertices.map((v) => v.x)
|
|
3235
|
-
];
|
|
3236
|
-
const colXs = clusterCoordinates(rawXs, coordMergeTol).sort((a, b) => a - b);
|
|
3237
|
-
if (rowYs.length < 2 || colXs.length < 2) continue;
|
|
3238
|
-
const validColXs = enforceMinWidth(colXs, MIN_COL_WIDTH);
|
|
3239
|
-
const validRowYs = enforceMinHeight(rowYs, MIN_ROW_HEIGHT);
|
|
3240
|
-
if (validRowYs.length < 2 || validColXs.length < 2) continue;
|
|
3241
|
-
const bbox = {
|
|
3242
|
-
x1: validColXs[0],
|
|
3243
|
-
y1: validRowYs[validRowYs.length - 1],
|
|
3244
|
-
x2: validColXs[validColXs.length - 1],
|
|
3245
|
-
y2: validRowYs[0]
|
|
3246
|
-
};
|
|
3247
|
-
grids.push({ rowYs: validRowYs, colXs: validColXs, bbox, vertexRadius: groupRadius });
|
|
2670
|
+
/**
 * Tells whether a cell's text looks like a label because it contains one of
 * the known label keywords.
 *
 * Trailing footnote markers (superscript digits, `*`, `※`) are ignored, and
 * text that is empty or longer than 15 characters after cleanup is never a
 * keyword label.
 *
 * @param {string} text - Raw cell text.
 * @returns {boolean} `true` when the cleaned text contains an entry of
 *   `LABEL_KEYWORDS`.
 */
function isKeywordLabel(text) {
  const withoutMarkers = text.trim().replace(/[¹²³⁴⁵⁶⁷⁸⁹⁰*※]+$/g, "");
  const candidate = withoutMarkers.trim();

  const withinLength = candidate.length > 0 && candidate.length <= 15;
  if (!withinLength) {
    return false;
  }

  for (const keyword of LABEL_KEYWORDS) {
    if (candidate.includes(keyword)) {
      return true;
    }
  }
  return false;
}
|
|
3251
|
-
function
|
|
3252
|
-
|
|
3253
|
-
const
|
|
3254
|
-
|
|
3255
|
-
|
|
3256
|
-
|
|
3257
|
-
|
|
2678
|
+
/**
 * Fills placeholders that sit inside the text of a single cell.
 *
 * Three placeholder shapes are handled, in this order:
 *   1. `label(   )suffix` -> `label(value)suffix`. The key is the normalized
 *      `label + suffix`, falling back to the normalized `label` alone.
 *   2. `□keyword` -> `☑keyword`, only when the supplied value is blank or one
 *      of the recognized "checked" markers.
 *   3. `(keyword:   )` -> `(keyword：value)`. Either colon form is accepted on
 *      input; the full-width colon is written on output.
 *
 * @param {string} cellText - Current cell text.
 * @param {Map<string, *>} values - Values keyed by normalized label.
 * @param {Set<string>} matchedLabels - Receives every key that was used
 *   (mutated in place).
 * @returns {{text: string, matches: Array<{key: string, label: string, value: *}>}|null}
 *   The rewritten text and what was filled, or `null` when nothing matched.
 */
function fillInCellPatterns(cellText, values, matchedLabels) {
  // A cell without text has nothing to fill; report "no match", don't crash.
  if (typeof cellText !== "string") return null;

  const CHECKED_MARKERS = new Set([
    "\u2611", "\u2713", "\u2714", "v", "V", "true", "1", "yes", "o", "O",
  ]);
  const matches = [];
  const keyIfKnown = (candidate) => (values.has(candidate) ? candidate : undefined);
  const record = (key, label, value) => {
    matchedLabels.add(key);
    matches.push({ key, label, value });
  };

  // 1. Blank parentheses attached to a word: 금액(   )원
  let text = cellText.replace(
    /([가-힣A-Za-z]+)\(\s{1,}\)([가-힣A-Za-z]*)/g,
    (match, prefix, suffix) => {
      const label = prefix + suffix;
      const matchKey = keyIfKnown(normalizeLabel(label)) ?? keyIfKnown(normalizeLabel(prefix));
      if (matchKey === undefined) return match;
      const newValue = values.get(matchKey);
      record(matchKey, label, newValue);
      return `${prefix}(${newValue})${suffix}`;
    },
  );

  // 2. Check boxes: □keyword
  text = text.replace(/□([가-힣A-Za-z]+)/g, (match, keyword) => {
    const matchKey = keyIfKnown(normalizeLabel(keyword));
    if (matchKey === undefined) return match;
    const raw = values.get(matchKey);
    // Values come from caller-supplied data and may be booleans or numbers;
    // calling .trim() on them directly would throw. null/undefined = unchecked.
    if (raw == null) return match;
    const marker = String(raw).trim();
    if (marker !== "" && !CHECKED_MARKERS.has(marker)) return match;
    record(matchKey, `\u25A1${keyword}`, "\u2611");
    return `\u2611${keyword}`;
  });

  // 3. Parenthesized "keyword: <blank>". The output uses the full-width colon,
  //    so the input class must accept it too (it previously held ":" twice).
  text = text.replace(/\(([가-힣A-Za-z]+)[:\uFF1A]\s{1,}\)/g, (match, keyword) => {
    const matchKey = keyIfKnown(normalizeLabel(keyword));
    if (matchKey === undefined) return match;
    const newValue = values.get(matchKey);
    record(matchKey, keyword, newValue);
    return `(${keyword}\uFF1A${newValue})`;
  });

  return matches.length > 0 ? { text, matches } : null;
}
|
|
3306
|
-
function
|
|
3307
|
-
|
|
3308
|
-
const
|
|
3309
|
-
|
|
3310
|
-
for (let i = 1; i < sorted.length; i++) {
|
|
3311
|
-
const last = clusters[clusters.length - 1];
|
|
3312
|
-
const avg = last.sum / last.count;
|
|
3313
|
-
if (Math.abs(sorted[i] - avg) <= tolerance) {
|
|
3314
|
-
last.sum += sorted[i];
|
|
3315
|
-
last.count++;
|
|
3316
|
-
} else {
|
|
3317
|
-
clusters.push({ sum: sorted[i], count: 1 });
|
|
3318
|
-
}
|
|
2722
|
+
/**
 * Re-keys a plain `{label: value}` object by normalized label.
 *
 * When two labels normalize to the same key, the later entry's value
 * replaces the earlier one.
 *
 * @param {Object<string, *>} values - Caller-supplied values keyed by label.
 * @returns {Map<string, *>} Values keyed by `normalizeLabel(label)`.
 */
function normalizeValues(values) {
  const normalizedEntries = Object.entries(values).map(
    ([rawLabel, fieldValue]) => [normalizeLabel(rawLabel), fieldValue],
  );
  return new Map(normalizedEntries);
}
|
|
3322
|
-
function
|
|
3323
|
-
|
|
3324
|
-
|
|
3325
|
-
|
|
3326
|
-
parent[x] = parent[parent[x]];
|
|
3327
|
-
x = parent[x];
|
|
3328
|
-
}
|
|
3329
|
-
return x;
|
|
3330
|
-
}
|
|
3331
|
-
function union(a, b) {
|
|
3332
|
-
const ra = find(a), rb = find(b);
|
|
3333
|
-
if (ra !== rb) parent[ra] = rb;
|
|
3334
|
-
}
|
|
3335
|
-
for (let i = 0; i < lines.length; i++) {
|
|
3336
|
-
for (let j = i + 1; j < lines.length; j++) {
|
|
3337
|
-
if (linesIntersect(lines[i], lines[j])) {
|
|
3338
|
-
union(i, j);
|
|
3339
|
-
}
|
|
2729
|
+
/**
 * Lists the caller's labels that were never used while filling.
 *
 * Each unused normalized key is reported as the first original label that
 * normalizes to it; if no original label does, the normalized key itself is
 * reported.
 *
 * @param {Map<string, *>} normalizedValues - Values keyed by normalized label.
 * @param {Set<string>} matchedLabels - Normalized keys that were consumed.
 * @param {Object<string, *>} originalValues - The values object as supplied.
 * @returns {string[]} Unused labels, in `normalizedValues` key order.
 */
function resolveUnmatched(normalizedValues, matchedLabels, originalValues) {
  // normalized key -> first original spelling that produces it
  const originalByKey = new Map();
  for (const originalLabel of Object.keys(originalValues)) {
    const normalizedKey = normalizeLabel(originalLabel);
    if (!originalByKey.has(normalizedKey)) {
      originalByKey.set(normalizedKey, originalLabel);
    }
  }

  const unused = [];
  for (const normalizedKey of normalizedValues.keys()) {
    if (matchedLabels.has(normalizedKey)) {
      continue;
    }
    unused.push(
      originalByKey.has(normalizedKey) ? originalByKey.get(normalizedKey) : normalizedKey,
    );
  }
  return unused;
}
|
|
3350
|
-
|
|
3351
|
-
|
|
3352
|
-
|
|
3353
|
-
|
|
3354
|
-
|
|
3355
|
-
|
|
3356
|
-
|
|
3357
|
-
|
|
3358
|
-
|
|
3359
|
-
|
|
3360
|
-
|
|
3361
|
-
|
|
3362
|
-
|
|
3363
|
-
|
|
3364
|
-
|
|
3365
|
-
|
|
3366
|
-
|
|
3367
|
-
|
|
3368
|
-
|
|
3369
|
-
|
|
3370
|
-
const vBorders = Array.from(
|
|
3371
|
-
{ length: numRows },
|
|
3372
|
-
(_, r) => Array.from(
|
|
3373
|
-
{ length: numCols + 1 },
|
|
3374
|
-
(_2, c) => hasVerticalLine(verticals, colXs[c], rowYs[r], rowYs[r + 1], grid.vertexRadius)
|
|
3375
|
-
)
|
|
3376
|
-
);
|
|
3377
|
-
const hBorders = Array.from(
|
|
3378
|
-
{ length: numRows + 1 },
|
|
3379
|
-
(_, r) => Array.from(
|
|
3380
|
-
{ length: numCols },
|
|
3381
|
-
(_2, c) => hasHorizontalLine(horizontals, rowYs[r], colXs[c], colXs[c + 1], grid.vertexRadius)
|
|
3382
|
-
)
|
|
3383
|
-
);
|
|
3384
|
-
const occupied = Array.from({ length: numRows }, () => Array(numCols).fill(false));
|
|
3385
|
-
const cells = [];
|
|
3386
|
-
for (let r = 0; r < numRows; r++) {
|
|
3387
|
-
for (let c = 0; c < numCols; c++) {
|
|
3388
|
-
if (occupied[r][c]) continue;
|
|
3389
|
-
let colSpan = 1;
|
|
3390
|
-
let rowSpan = 1;
|
|
3391
|
-
while (c + colSpan < numCols && !vBorders[r][c + colSpan]) {
|
|
3392
|
-
let canExpand = true;
|
|
3393
|
-
for (let dr = 0; dr < rowSpan; dr++) {
|
|
3394
|
-
if (vBorders[r + dr][c + colSpan]) {
|
|
3395
|
-
canExpand = false;
|
|
3396
|
-
break;
|
|
3397
|
-
}
|
|
3398
|
-
}
|
|
3399
|
-
if (!canExpand) break;
|
|
3400
|
-
colSpan++;
|
|
3401
|
-
}
|
|
3402
|
-
while (r + rowSpan < numRows) {
|
|
3403
|
-
let hasLine = false;
|
|
3404
|
-
for (let dc = 0; dc < colSpan; dc++) {
|
|
3405
|
-
if (hBorders[r + rowSpan][c + dc]) {
|
|
3406
|
-
hasLine = true;
|
|
3407
|
-
break;
|
|
2737
|
+
|
|
2738
|
+
// src/form/filler.ts
|
|
2739
|
+
/**
 * Fills a parsed document's form fields from a `{label: value}` object.
 *
 * Works on a deep copy of `blocks`; the input is left untouched. Filling runs
 * in three passes over the copy:
 *   1. in-cell placeholders in every table cell (`fillInCellPatterns`),
 *   2. label/value cells and header-row tables (`fillTable`),
 *   3. inline `label: value` text in paragraphs (`fillInlineFields`).
 * Pass 1 finishes for all tables before pass 2 starts, so labels consumed by
 * a placeholder anywhere are already marked when `fillTable` runs.
 *
 * @param {Array<object>} blocks - Document blocks; tables carry
 *   `{type: "table", table: {rows, cols, cells}}`, paragraphs
 *   `{type: "paragraph", text}`.
 * @param {Object<string, *>} values - Values keyed by (un-normalized) label.
 * @returns {{blocks: Array<object>, filled: Array<object>, unmatched: string[]}}
 *   The filled copy, a log of every fill, and the labels that were never used.
 */
function fillFormFields(blocks, values) {
  const cloned = structuredClone(blocks);
  const filled = [];
  const matchedLabels = new Set();
  const normalizedValues = normalizeValues(values);

  const tables = [];
  for (const block of cloned) {
    if (block.type === "table" && block.table) tables.push(block.table);
  }

  // Cells touched by pass 1 are tracked PER TABLE. A single shared set of
  // "r,c" strings would make a placeholder fill in one table change how
  // fillTable treats the cell at the same coordinates in every other table
  // (prepend instead of replace).
  const patternCellsByTable = new Map();

  // Pass 1: placeholders inside cell text.
  for (const table of tables) {
    const patternCells = new Set();
    patternCellsByTable.set(table, patternCells);
    for (let r = 0; r < table.rows; r++) {
      for (let c = 0; c < table.cols; c++) {
        const cell = table.cells[r]?.[c];
        if (!cell) continue;
        const result = fillInCellPatterns(cell.text, normalizedValues, matchedLabels);
        if (!result) continue;
        cell.text = result.text;
        patternCells.add(`${r},${c}`);
        for (const m of result.matches) {
          filled.push({ label: m.label, value: m.value, row: r, col: c });
        }
      }
    }
  }

  // Pass 2: label cell -> neighbouring value cell, and header-row tables.
  for (const table of tables) {
    fillTable(table, normalizedValues, filled, matchedLabels, patternCellsByTable.get(table));
  }

  // Pass 3: "label: value" text in paragraphs.
  for (const block of cloned) {
    if (block.type !== "paragraph" || !block.text) continue;
    block.text = fillInlineFields(block.text, normalizedValues, filled, matchedLabels);
  }

  const unmatched = resolveUnmatched(normalizedValues, matchedLabels, values);
  return { blocks: cloned, filled, unmatched };
}
|
|
3491
|
-
function
|
|
3492
|
-
if (
|
|
3493
|
-
|
|
3494
|
-
|
|
3495
|
-
|
|
3496
|
-
|
|
3497
|
-
|
|
3498
|
-
|
|
3499
|
-
|
|
3500
|
-
|
|
3501
|
-
|
|
3502
|
-
|
|
3503
|
-
|
|
3504
|
-
|
|
3505
|
-
|
|
3506
|
-
|
|
3507
|
-
}
|
|
3508
|
-
lines.push(curLine);
|
|
3509
|
-
const textLines = lines.map((line) => {
|
|
3510
|
-
const s = line.sort((a, b) => a.x - b.x);
|
|
3511
|
-
if (s.length === 1) return s[0].text;
|
|
3512
|
-
const evenSpaced = detectEvenSpacedItems(s);
|
|
3513
|
-
let result = s[0].text;
|
|
3514
|
-
for (let j = 1; j < s.length; j++) {
|
|
3515
|
-
if (evenSpaced[j]) {
|
|
3516
|
-
result += s[j].text;
|
|
3517
|
-
continue;
|
|
3518
|
-
}
|
|
3519
|
-
const gap = s[j].x - (s[j - 1].x + s[j - 1].w);
|
|
3520
|
-
const avgFs = (s[j].fontSize + s[j - 1].fontSize) / 2;
|
|
3521
|
-
const prevIsKorean = /[가-힣]$/.test(result);
|
|
3522
|
-
const currIsKorean = /^[가-힣]/.test(s[j].text);
|
|
3523
|
-
if (gap < avgFs * 0.15) {
|
|
3524
|
-
result += s[j].text;
|
|
3525
|
-
} else if (gap < avgFs * 0.35 && (prevIsKorean || currIsKorean)) {
|
|
3526
|
-
result += s[j].text;
|
|
2774
|
+
/**
 * Fills one table in place using two strategies.
 *
 * Pass 1 (label -> right neighbour): for each pair of horizontally adjacent
 * cells where the left one is a label and the right one is not itself a
 * keyword label, the right cell receives the value matched to the label. If
 * the right cell was already filled by an in-cell placeholder (its `"r,c"` is
 * in `patternFilledCells`), the value is prepended instead of replacing it.
 *
 * Pass 2 (header row): when the table has at least two rows and every cell of
 * row 0 is a short label, each data cell receives the value matched to its
 * column header, skipping keys that have already been used.
 *
 * @param {{rows: number, cols: number, cells: Array<Array<{text: string}>>}} table
 *   Table to modify.
 * @param {Map<string, *>} values - Values keyed by normalized label.
 * @param {Array<object>} filled - Fill log; one `{label, value, row, col}` is
 *   appended per fill.
 * @param {Set<string>} matchedLabels - Keys used so far (read and mutated).
 * @param {Set<string>} [patternFilledCells] - `"r,c"` coordinates of cells in
 *   this table that were already filled by an in-cell placeholder.
 * @returns {void}
 */
function fillTable(table, values, filled, matchedLabels, patternFilledCells) {
  if (table.cols < 2) return;

  // Pass 1: label cell followed by its value cell.
  for (let r = 0; r < table.rows; r++) {
    // `rows` can exceed the rows actually present; skip the gaps instead of
    // dereferencing undefined.
    const row = table.cells[r];
    if (!row) continue;
    for (let c = 0; c < table.cols - 1; c++) {
      const labelCell = row[c];
      const valueCell = row[c + 1];
      if (!labelCell || !valueCell) continue;
      if (!isLabelCell(labelCell.text)) continue;
      // Two labels side by side: the right one is not a value slot.
      if (isKeywordLabel(valueCell.text)) continue;

      const normalizedCellLabel = normalizeLabel(labelCell.text);
      if (!normalizedCellLabel) continue;
      const matchKey = findMatchingKey(normalizedCellLabel, values);
      if (matchKey === undefined) continue;

      const newValue = values.get(matchKey);
      // Keep text an in-cell placeholder already produced; otherwise overwrite.
      valueCell.text = patternFilledCells?.has(`${r},${c + 1}`)
        ? `${newValue} ${valueCell.text}`
        : newValue;

      matchedLabels.add(matchKey);
      filled.push({
        // Strip a trailing ASCII or full-width colon from the reported label.
        label: labelCell.text.trim().replace(/[:\uFF1A]\s*$/, ""),
        value: newValue,
        row: r,
        col: c,
      });
    }
  }

  // Pass 2: header-row tables (cols >= 2 is already guaranteed above).
  if (table.rows < 2) return;
  const headerRow = table.cells[0];
  if (!Array.isArray(headerRow)) return;

  const allLabels = headerRow.every((cell) => {
    const t = typeof cell?.text === "string" ? cell.text.trim() : "";
    return t.length > 0 && t.length <= 20 && isLabelCell(t);
  });
  if (!allLabels) return;

  for (let r = 1; r < table.rows; r++) {
    for (let c = 0; c < table.cols; c++) {
      const headerCell = headerRow[c];
      const dataCell = table.cells[r]?.[c];
      if (!headerCell || !dataCell) continue;

      const matchKey = findMatchingKey(normalizeLabel(headerCell.text), values);
      if (matchKey === undefined) continue;
      // Each key is used once, so only the first open row per column is filled.
      if (matchedLabels.has(matchKey)) continue;

      const newValue = values.get(matchKey);
      dataCell.text = newValue;
      matchedLabels.add(matchKey);
      filled.push({
        label: headerCell.text.trim(),
        value: newValue,
        row: r,
        col: c,
      });
    }
  }
}
|
|
3564
|
-
function
|
|
3565
|
-
|
|
3566
|
-
|
|
3567
|
-
|
|
3568
|
-
|
|
3569
|
-
|
|
3570
|
-
|
|
3571
|
-
|
|
3572
|
-
|
|
3573
|
-
|
|
3574
|
-
|
|
3575
|
-
|
|
3576
|
-
|
|
3577
|
-
|
|
3578
|
-
|
|
3579
|
-
|
|
3580
|
-
}
|
|
3581
|
-
}
|
|
3582
|
-
}
|
|
3583
|
-
function mergeCellTextLines(textLines) {
|
|
3584
|
-
if (textLines.length <= 1) return textLines[0] || "";
|
|
3585
|
-
const merged = [textLines[0]];
|
|
3586
|
-
for (let i = 1; i < textLines.length; i++) {
|
|
3587
|
-
const prev = merged[merged.length - 1];
|
|
3588
|
-
const curr = textLines[i];
|
|
3589
|
-
if (/[가-힣]$/.test(prev) && /^[가-힣]+$/.test(curr) && curr.length <= 8 && !curr.includes(" ")) {
|
|
3590
|
-
merged[merged.length - 1] = prev + curr;
|
|
3591
|
-
} else if (curr.trim().length <= 3 && /^[)\]%}]/.test(curr.trim())) {
|
|
3592
|
-
merged[merged.length - 1] = prev + curr.trim();
|
|
3593
|
-
} else if (/[,(]$/.test(prev.trim()) && curr.trim().length <= 15) {
|
|
3594
|
-
merged[merged.length - 1] = prev + curr.trim();
|
|
3595
|
-
} else if (/[\d,]$/.test(prev) && /^[\d,]+[)\]]?$/.test(curr.trim()) && curr.trim().length <= 10) {
|
|
3596
|
-
merged[merged.length - 1] = prev + curr.trim();
|
|
3597
|
-
} else {
|
|
3598
|
-
merged.push(curr);
|
|
2828
|
+
/**
 * Rewrites inline `label: value` pairs in a paragraph with supplied values.
 *
 * A label is 2-10 Hangul or ASCII letters followed by an ASCII or full-width
 * colon. The existing value (the rest of the line up to the next `,` or `;`,
 * at most 100 characters, possibly empty) is replaced; the pair is re-emitted
 * as `label: value` with an ASCII colon and one space. Pairs whose label has
 * no matching key are left exactly as they were.
 *
 * @param {string} text - Paragraph text.
 * @param {Map<string, *>} values - Values keyed by normalized label.
 * @param {Array<object>} filled - Fill log; one `{label, value, row: -1,
 *   col: -1}` is appended per replacement.
 * @param {Set<string>} matchedLabels - Receives every key that was used.
 * @returns {string} The text with matched fields filled in.
 */
function fillInlineFields(text, values, filled, matchedLabels) {
  // Only horizontal whitespace may follow the colon. With a plain `\s*` an
  // empty "label:" at the end of a line reached across the newline, took the
  // next line as its "old value" and overwrote it.
  const inlineField = /([가-힣A-Za-z]{2,10})\s*[:\uFF1A][^\S\n]*([^\n,;]{0,100})/g;

  return text.replace(inlineField, (match, rawLabel) => {
    const matchKey = findMatchingKey(normalizeLabel(rawLabel), values);
    if (matchKey === undefined) return match;

    const newValue = values.get(matchKey);
    matchedLabels.add(matchKey);
    filled.push({ label: rawLabel.trim(), value: newValue, row: -1, col: -1 });
    return `${rawLabel}: ${newValue}`;
  });
}
|
|
3603
2847
|
|
|
3604
|
-
// src/
|
|
3605
|
-
|
|
3606
|
-
|
|
3607
|
-
|
|
3608
|
-
|
|
3609
|
-
|
|
3610
|
-
|
|
3611
|
-
|
|
3612
|
-
|
|
3613
|
-
if (
|
|
3614
|
-
|
|
3615
|
-
|
|
3616
|
-
|
|
3617
|
-
const
|
|
3618
|
-
const
|
|
3619
|
-
|
|
3620
|
-
|
|
3621
|
-
const
|
|
3622
|
-
const
|
|
3623
|
-
|
|
3624
|
-
|
|
3625
|
-
const
|
|
3626
|
-
|
|
3627
|
-
|
|
3628
|
-
|
|
3629
|
-
|
|
3630
|
-
|
|
2848
|
+
// src/form/filler-hwpx.ts
|
|
2849
|
+
import JSZip2 from "jszip";
|
|
2850
|
+
import { DOMParser as DOMParser2, XMLSerializer } from "@xmldom/xmldom";
|
|
2851
|
+
async function fillHwpx(hwpxBuffer, values) {
|
|
2852
|
+
const zip = await JSZip2.loadAsync(hwpxBuffer);
|
|
2853
|
+
const filled = [];
|
|
2854
|
+
const matchedLabels = /* @__PURE__ */ new Set();
|
|
2855
|
+
const normalizedValues = normalizeValues(values);
|
|
2856
|
+
const sectionFiles = Object.keys(zip.files).filter((name) => /[Ss]ection\d+\.xml$/i.test(name)).sort();
|
|
2857
|
+
if (sectionFiles.length === 0) {
|
|
2858
|
+
throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
2859
|
+
}
|
|
2860
|
+
const xmlParser = new DOMParser2();
|
|
2861
|
+
const xmlSerializer = new XMLSerializer();
|
|
2862
|
+
for (const sectionPath of sectionFiles) {
|
|
2863
|
+
const zipEntry = zip.file(sectionPath);
|
|
2864
|
+
if (!zipEntry) continue;
|
|
2865
|
+
const rawXml = await zipEntry.async("text");
|
|
2866
|
+
const doc = xmlParser.parseFromString(stripDtd(rawXml), "text/xml");
|
|
2867
|
+
if (!doc.documentElement) continue;
|
|
2868
|
+
let modified = false;
|
|
2869
|
+
const tables = findAllElements(doc.documentElement, "tbl");
|
|
2870
|
+
const cellPatternApplied = /* @__PURE__ */ new Set();
|
|
2871
|
+
for (const tblEl of tables) {
|
|
2872
|
+
const allCells = findAllElements(tblEl, "tc");
|
|
2873
|
+
for (const tcEl of allCells) {
|
|
2874
|
+
const tNodes = collectCellTextNodes(tcEl);
|
|
2875
|
+
const fullText = tNodes.map((n) => n.text).join("");
|
|
2876
|
+
const result = fillInCellPatterns(fullText, normalizedValues, matchedLabels);
|
|
2877
|
+
if (!result) continue;
|
|
2878
|
+
applyTextReplacements(tNodes, fullText, result.text);
|
|
2879
|
+
cellPatternApplied.add(tcEl);
|
|
2880
|
+
for (const m of result.matches) {
|
|
2881
|
+
filled.push({ label: m.label, value: m.value, row: -1, col: -1 });
|
|
2882
|
+
}
|
|
2883
|
+
modified = true;
|
|
3631
2884
|
}
|
|
3632
2885
|
}
|
|
3633
|
-
|
|
3634
|
-
|
|
3635
|
-
|
|
3636
|
-
|
|
3637
|
-
|
|
3638
|
-
|
|
3639
|
-
|
|
3640
|
-
|
|
3641
|
-
const
|
|
3642
|
-
const
|
|
3643
|
-
if (
|
|
3644
|
-
|
|
3645
|
-
|
|
2886
|
+
for (const tblEl of tables) {
|
|
2887
|
+
const rows = findDirectChildren(tblEl, "tr");
|
|
2888
|
+
for (let rowIdx = 0; rowIdx < rows.length; rowIdx++) {
|
|
2889
|
+
const trEl = rows[rowIdx];
|
|
2890
|
+
const cells = findDirectChildren(trEl, "tc");
|
|
2891
|
+
for (let colIdx = 0; colIdx < cells.length - 1; colIdx++) {
|
|
2892
|
+
const labelText = extractCellText(cells[colIdx]);
|
|
2893
|
+
if (!isLabelCell(labelText)) continue;
|
|
2894
|
+
const valueCell = cells[colIdx + 1];
|
|
2895
|
+
const valueText = extractCellText(valueCell);
|
|
2896
|
+
if (isKeywordLabel(valueText)) continue;
|
|
2897
|
+
const normalizedCellLabel = normalizeLabel(labelText);
|
|
2898
|
+
if (!normalizedCellLabel) continue;
|
|
2899
|
+
const matchKey = findMatchingKey(normalizedCellLabel, normalizedValues);
|
|
2900
|
+
if (matchKey === void 0) continue;
|
|
2901
|
+
const newValue = normalizedValues.get(matchKey);
|
|
2902
|
+
if (cellPatternApplied.has(valueCell)) {
|
|
2903
|
+
prependCellText(valueCell, newValue);
|
|
2904
|
+
} else {
|
|
2905
|
+
replaceCellText(valueCell, newValue);
|
|
3646
2906
|
}
|
|
2907
|
+
matchedLabels.add(matchKey);
|
|
2908
|
+
filled.push({
|
|
2909
|
+
label: labelText.trim().replace(/[::]\s*$/, ""),
|
|
2910
|
+
value: newValue,
|
|
2911
|
+
row: rowIdx,
|
|
2912
|
+
col: colIdx
|
|
2913
|
+
});
|
|
2914
|
+
modified = true;
|
|
3647
2915
|
}
|
|
3648
2916
|
}
|
|
3649
|
-
|
|
3650
|
-
|
|
3651
|
-
|
|
3652
|
-
|
|
3653
|
-
|
|
3654
|
-
|
|
3655
|
-
|
|
3656
|
-
|
|
3657
|
-
|
|
3658
|
-
|
|
3659
|
-
|
|
3660
|
-
|
|
3661
|
-
|
|
3662
|
-
|
|
3663
|
-
|
|
3664
|
-
|
|
3665
|
-
|
|
3666
|
-
|
|
3667
|
-
|
|
3668
|
-
|
|
3669
|
-
|
|
3670
|
-
|
|
3671
|
-
|
|
3672
|
-
|
|
3673
|
-
|
|
3674
|
-
let minG = Infinity, maxG = -Infinity;
|
|
3675
|
-
for (const g2 of gaps) {
|
|
3676
|
-
if (g2 < minG) minG = g2;
|
|
3677
|
-
if (g2 > maxG) maxG = g2;
|
|
3678
|
-
}
|
|
3679
|
-
if (minG > 0 && maxG / minG <= 3) {
|
|
3680
|
-
const run = sorted.slice(i, runEnd);
|
|
3681
|
-
const text = run.map((r) => r.text).join("");
|
|
3682
|
-
const first = run[0], last = run[runEnd - i - 1];
|
|
3683
|
-
const item = {
|
|
3684
|
-
text,
|
|
3685
|
-
x: first.x,
|
|
3686
|
-
y: first.y,
|
|
3687
|
-
w: last.x + last.w - first.x,
|
|
3688
|
-
h: first.h,
|
|
3689
|
-
fontSize: first.fontSize,
|
|
3690
|
-
fontName: first.fontName
|
|
3691
|
-
};
|
|
3692
|
-
originMap.set(item, run);
|
|
3693
|
-
merged.push(item);
|
|
3694
|
-
i = runEnd;
|
|
3695
|
-
continue;
|
|
2917
|
+
if (rows.length >= 2) {
|
|
2918
|
+
const headerCells = findDirectChildren(rows[0], "tc");
|
|
2919
|
+
const allLabels = headerCells.every((cell) => {
|
|
2920
|
+
const t = extractCellText(cell).trim();
|
|
2921
|
+
return t.length > 0 && t.length <= 20 && isLabelCell(t);
|
|
2922
|
+
});
|
|
2923
|
+
if (allLabels) {
|
|
2924
|
+
for (let rowIdx = 1; rowIdx < rows.length; rowIdx++) {
|
|
2925
|
+
const dataCells = findDirectChildren(rows[rowIdx], "tc");
|
|
2926
|
+
for (let colIdx = 0; colIdx < Math.min(headerCells.length, dataCells.length); colIdx++) {
|
|
2927
|
+
const headerLabel = normalizeLabel(extractCellText(headerCells[colIdx]));
|
|
2928
|
+
const matchKey = findMatchingKey(headerLabel, normalizedValues);
|
|
2929
|
+
if (matchKey === void 0) continue;
|
|
2930
|
+
if (matchedLabels.has(matchKey)) continue;
|
|
2931
|
+
const newValue = normalizedValues.get(matchKey);
|
|
2932
|
+
replaceCellText(dataCells[colIdx], newValue);
|
|
2933
|
+
matchedLabels.add(matchKey);
|
|
2934
|
+
filled.push({
|
|
2935
|
+
label: extractCellText(headerCells[colIdx]).trim(),
|
|
2936
|
+
value: newValue,
|
|
2937
|
+
row: rowIdx,
|
|
2938
|
+
col: colIdx
|
|
2939
|
+
});
|
|
2940
|
+
modified = true;
|
|
2941
|
+
}
|
|
3696
2942
|
}
|
|
3697
2943
|
}
|
|
3698
2944
|
}
|
|
3699
|
-
merged.push(sorted[i]);
|
|
3700
|
-
i++;
|
|
3701
2945
|
}
|
|
3702
|
-
|
|
3703
|
-
|
|
3704
|
-
|
|
3705
|
-
|
|
3706
|
-
|
|
3707
|
-
|
|
3708
|
-
|
|
3709
|
-
|
|
3710
|
-
|
|
3711
|
-
|
|
3712
|
-
|
|
3713
|
-
|
|
3714
|
-
|
|
3715
|
-
|
|
3716
|
-
|
|
3717
|
-
|
|
3718
|
-
|
|
3719
|
-
|
|
3720
|
-
|
|
3721
|
-
}
|
|
3722
|
-
const pageSpan = allMaxX - allMinX;
|
|
3723
|
-
if (pageSpan <= 0) return null;
|
|
3724
|
-
for (let ri = 0; ri < rows.length; ri++) {
|
|
3725
|
-
const row = rows[ri];
|
|
3726
|
-
if (row.items.length < MIN_COLS || row.items.length > 6) continue;
|
|
3727
|
-
if (row.items.some((i) => i.text.length > 8)) continue;
|
|
3728
|
-
if (!row.items.some((i) => /[가-힣]/.test(i.text))) continue;
|
|
3729
|
-
if (row.items.some((i) => /^[□■○●·※▶▷◆◇\-]/.test(i.text))) continue;
|
|
3730
|
-
const sorted = [...row.items].sort((a, b) => a.x - b.x);
|
|
3731
|
-
const xSpan = sorted[sorted.length - 1].x + sorted[sorted.length - 1].w - sorted[0].x;
|
|
3732
|
-
if (xSpan / pageSpan < 0.4) continue;
|
|
3733
|
-
const avgFs = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
|
|
3734
|
-
let hasLargeGap = false;
|
|
3735
|
-
for (let i = 1; i < sorted.length; i++) {
|
|
3736
|
-
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
3737
|
-
if (gap >= avgFs * 2.5) {
|
|
3738
|
-
hasLargeGap = true;
|
|
2946
|
+
const allParagraphs = findAllElements(doc.documentElement, "p");
|
|
2947
|
+
for (const pEl of allParagraphs) {
|
|
2948
|
+
if (isInsideTable(pEl)) continue;
|
|
2949
|
+
const tNodes = collectTextNodes(pEl);
|
|
2950
|
+
const fullText = tNodes.map((n) => n.text).join("");
|
|
2951
|
+
const pattern = /([가-힣A-Za-z]{2,10})\s*[::]\s*([^\n,;]{0,100})/g;
|
|
2952
|
+
let match;
|
|
2953
|
+
while ((match = pattern.exec(fullText)) !== null) {
|
|
2954
|
+
const rawLabel = match[1];
|
|
2955
|
+
const normalized = normalizeLabel(rawLabel);
|
|
2956
|
+
const matchKey = findMatchingKey(normalized, normalizedValues);
|
|
2957
|
+
if (matchKey === void 0) continue;
|
|
2958
|
+
const newValue = normalizedValues.get(matchKey);
|
|
2959
|
+
const valueStart = match.index + match[0].length - match[2].length;
|
|
2960
|
+
const valueEnd = match.index + match[0].length;
|
|
2961
|
+
replaceTextRange(tNodes, valueStart, valueEnd, newValue);
|
|
2962
|
+
matchedLabels.add(matchKey);
|
|
2963
|
+
filled.push({ label: rawLabel.trim(), value: newValue, row: -1, col: -1 });
|
|
2964
|
+
modified = true;
|
|
3739
2965
|
break;
|
|
3740
2966
|
}
|
|
3741
2967
|
}
|
|
3742
|
-
if (
|
|
3743
|
-
|
|
3744
|
-
|
|
3745
|
-
for (let j = ri + 1; j < rows.length && matchCount < MIN_ROWS + 2; j++) {
|
|
3746
|
-
const matched = countMatchedColumnsRange(rows[j], columns, sorted);
|
|
3747
|
-
if (matched >= MIN_COLS) matchCount++;
|
|
3748
|
-
}
|
|
3749
|
-
if (matchCount < MIN_ROWS) continue;
|
|
3750
|
-
return { columns, headerIdx: ri };
|
|
3751
|
-
}
|
|
3752
|
-
return null;
|
|
3753
|
-
}
|
|
3754
|
-
function mergeMultiLineRows(rows, columns) {
|
|
3755
|
-
if (rows.length <= 1) return rows;
|
|
3756
|
-
const result = [rows[0]];
|
|
3757
|
-
const allFontSizes = rows.flatMap((r) => r.items).map((i) => i.fontSize);
|
|
3758
|
-
const avgFontSize = allFontSizes.length > 0 ? allFontSizes.reduce((s, v) => s + v, 0) / allFontSizes.length : 12;
|
|
3759
|
-
for (let i = 1; i < rows.length; i++) {
|
|
3760
|
-
const prev = result[result.length - 1];
|
|
3761
|
-
const curr = rows[i];
|
|
3762
|
-
const yGap = Math.abs(prev.y - curr.y);
|
|
3763
|
-
const matchedCols = countMatchedColumns(curr, columns);
|
|
3764
|
-
if (yGap < avgFontSize * 1.8 && curr.items.length <= 2 && (matchedCols < MIN_COLS || curr.items.length === 1)) {
|
|
3765
|
-
result[result.length - 1] = {
|
|
3766
|
-
y: prev.y,
|
|
3767
|
-
items: [...prev.items, ...curr.items]
|
|
3768
|
-
};
|
|
3769
|
-
} else {
|
|
3770
|
-
result.push(curr);
|
|
3771
|
-
}
|
|
3772
|
-
}
|
|
3773
|
-
return result;
|
|
3774
|
-
}
|
|
3775
|
-
function groupByBaseline(items) {
|
|
3776
|
-
if (items.length === 0) return [];
|
|
3777
|
-
const sorted = [...items].sort((a, b) => b.y - a.y || a.x - b.x);
|
|
3778
|
-
const rows = [];
|
|
3779
|
-
let curItems = [sorted[0]];
|
|
3780
|
-
let curY = sorted[0].y;
|
|
3781
|
-
for (let i = 1; i < sorted.length; i++) {
|
|
3782
|
-
if (Math.abs(sorted[i].y - curY) <= Y_TOL) {
|
|
3783
|
-
curItems.push(sorted[i]);
|
|
3784
|
-
} else {
|
|
3785
|
-
rows.push({ y: curY, items: curItems });
|
|
3786
|
-
curItems = [sorted[i]];
|
|
3787
|
-
curY = sorted[i].y;
|
|
3788
|
-
}
|
|
3789
|
-
}
|
|
3790
|
-
if (curItems.length > 0) rows.push({ y: curY, items: curItems });
|
|
3791
|
-
return rows;
|
|
3792
|
-
}
|
|
3793
|
-
function hasSuspiciousGaps(row) {
|
|
3794
|
-
if (row.items.length < 2) return false;
|
|
3795
|
-
const sorted = [...row.items].sort((a, b) => a.x - b.x);
|
|
3796
|
-
if (sorted.length === 2 && sorted[1].text.length > 20) return false;
|
|
3797
|
-
const avgFontSize = sorted.reduce((s, i) => s + i.fontSize, 0) / sorted.length;
|
|
3798
|
-
const minGap = Math.max(avgFontSize * MIN_GAP_FACTOR, MIN_GAP_ABSOLUTE);
|
|
3799
|
-
for (let i = 1; i < sorted.length; i++) {
|
|
3800
|
-
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
3801
|
-
if (gap >= minGap) return true;
|
|
3802
|
-
}
|
|
3803
|
-
return false;
|
|
3804
|
-
}
|
|
3805
|
-
function extractColumnClusters(rows) {
|
|
3806
|
-
const allX = [];
|
|
3807
|
-
for (const row of rows) {
|
|
3808
|
-
for (const item of row.items) allX.push(item.x);
|
|
3809
|
-
}
|
|
3810
|
-
if (allX.length === 0) return [];
|
|
3811
|
-
allX.sort((a, b) => a - b);
|
|
3812
|
-
const clusters = [];
|
|
3813
|
-
let clusterStart = 0;
|
|
3814
|
-
for (let i = 1; i <= allX.length; i++) {
|
|
3815
|
-
if (i === allX.length || allX[i] - allX[i - 1] > COL_CLUSTER_TOL) {
|
|
3816
|
-
const slice = allX.slice(clusterStart, i);
|
|
3817
|
-
const avg = Math.round(slice.reduce((s, v) => s + v, 0) / slice.length);
|
|
3818
|
-
clusters.push({ x: avg, count: slice.length });
|
|
3819
|
-
clusterStart = i;
|
|
3820
|
-
}
|
|
3821
|
-
}
|
|
3822
|
-
const minCount = Math.max(2, Math.floor(rows.length * MIN_COL_FILL_RATIO));
|
|
3823
|
-
return clusters.filter((c) => c.count >= minCount).sort((a, b) => a.x - b.x);
|
|
3824
|
-
}
|
|
3825
|
-
function findTableRegionsByHeader(allRows, columns, headerItems) {
|
|
3826
|
-
const regions = [];
|
|
3827
|
-
let currentRegion = [];
|
|
3828
|
-
let missStreak = 0;
|
|
3829
|
-
for (const row of allRows) {
|
|
3830
|
-
const matchedCols = countMatchedColumnsRange(row, columns, headerItems);
|
|
3831
|
-
if (matchedCols >= MIN_COLS) {
|
|
3832
|
-
currentRegion.push(row);
|
|
3833
|
-
missStreak = 0;
|
|
3834
|
-
} else if (currentRegion.length > 0 && (row.items.length <= 2 || missStreak === 0)) {
|
|
3835
|
-
currentRegion.push(row);
|
|
3836
|
-
missStreak++;
|
|
3837
|
-
} else {
|
|
3838
|
-
while (currentRegion.length > 0) {
|
|
3839
|
-
const last = currentRegion[currentRegion.length - 1];
|
|
3840
|
-
if (countMatchedColumnsRange(last, columns, headerItems) >= MIN_COLS) break;
|
|
3841
|
-
currentRegion.pop();
|
|
3842
|
-
}
|
|
3843
|
-
if (currentRegion.length >= MIN_ROWS) {
|
|
3844
|
-
regions.push({ rows: [...currentRegion] });
|
|
3845
|
-
}
|
|
3846
|
-
currentRegion = [];
|
|
3847
|
-
missStreak = 0;
|
|
3848
|
-
}
|
|
3849
|
-
}
|
|
3850
|
-
while (currentRegion.length > 0) {
|
|
3851
|
-
const last = currentRegion[currentRegion.length - 1];
|
|
3852
|
-
if (countMatchedColumnsRange(last, columns, headerItems) >= MIN_COLS) break;
|
|
3853
|
-
currentRegion.pop();
|
|
3854
|
-
}
|
|
3855
|
-
if (currentRegion.length >= MIN_ROWS) {
|
|
3856
|
-
regions.push({ rows: currentRegion });
|
|
3857
|
-
}
|
|
3858
|
-
return regions;
|
|
3859
|
-
}
|
|
3860
|
-
function findTableRegions(allRows, columns) {
|
|
3861
|
-
const regions = [];
|
|
3862
|
-
let currentRegion = [];
|
|
3863
|
-
for (const row of allRows) {
|
|
3864
|
-
const matchedCols = countMatchedColumns(row, columns);
|
|
3865
|
-
if (matchedCols >= MIN_COLS) {
|
|
3866
|
-
currentRegion.push(row);
|
|
3867
|
-
} else if (row.items.length === 1) {
|
|
3868
|
-
if (currentRegion.length > 0) {
|
|
3869
|
-
currentRegion.push(row);
|
|
3870
|
-
}
|
|
3871
|
-
} else {
|
|
3872
|
-
if (currentRegion.length >= MIN_ROWS) {
|
|
3873
|
-
regions.push({ rows: [...currentRegion] });
|
|
3874
|
-
}
|
|
3875
|
-
currentRegion = [];
|
|
3876
|
-
}
|
|
3877
|
-
}
|
|
3878
|
-
if (currentRegion.length >= MIN_ROWS) {
|
|
3879
|
-
regions.push({ rows: currentRegion });
|
|
3880
|
-
}
|
|
3881
|
-
return regions;
|
|
3882
|
-
}
|
|
3883
|
-
function countMatchedColumns(row, columns) {
|
|
3884
|
-
const matched = /* @__PURE__ */ new Set();
|
|
3885
|
-
for (const item of row.items) {
|
|
3886
|
-
for (let ci = 0; ci < columns.length; ci++) {
|
|
3887
|
-
if (Math.abs(item.x - columns[ci].x) <= COL_CLUSTER_TOL * 2) {
|
|
3888
|
-
matched.add(ci);
|
|
3889
|
-
break;
|
|
3890
|
-
}
|
|
2968
|
+
if (modified) {
|
|
2969
|
+
const newXml = xmlSerializer.serializeToString(doc);
|
|
2970
|
+
zip.file(sectionPath, newXml);
|
|
3891
2971
|
}
|
|
3892
2972
|
}
|
|
3893
|
-
|
|
2973
|
+
const unmatched = resolveUnmatched(normalizedValues, matchedLabels, values);
|
|
2974
|
+
const buffer = await zip.generateAsync({ type: "arraybuffer" });
|
|
2975
|
+
return { buffer, filled, unmatched };
|
|
3894
2976
|
}
|
|
3895
|
-
function
|
|
3896
|
-
|
|
3897
|
-
for (let ci = 0; ci < headerItems.length; ci++) {
|
|
3898
|
-
const left = ci === 0 ? 0 : (headerItems[ci - 1].x + headerItems[ci - 1].w + headerItems[ci].x) / 2;
|
|
3899
|
-
const right = ci === headerItems.length - 1 ? Infinity : (headerItems[ci].x + headerItems[ci].w + headerItems[ci + 1].x) / 2;
|
|
3900
|
-
boundaries.push({ left, right });
|
|
3901
|
-
}
|
|
3902
|
-
const matched = /* @__PURE__ */ new Set();
|
|
3903
|
-
for (const item of row.items) {
|
|
3904
|
-
for (let ci = 0; ci < boundaries.length; ci++) {
|
|
3905
|
-
if (item.x >= boundaries[ci].left && item.x < boundaries[ci].right) {
|
|
3906
|
-
matched.add(ci);
|
|
3907
|
-
break;
|
|
3908
|
-
}
|
|
3909
|
-
}
|
|
3910
|
-
}
|
|
3911
|
-
return matched.size;
|
|
2977
|
+
function localName(el) {
|
|
2978
|
+
return (el.tagName || el.localName || "").replace(/^[^:]+:/, "");
|
|
3912
2979
|
}
|
|
3913
|
-
function
|
|
3914
|
-
if (items.length === 0) return [];
|
|
3915
|
-
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
3916
|
-
const colCenters = columns.map((c) => c.x);
|
|
3917
|
-
const gaps = [];
|
|
3918
|
-
for (let i = 1; i < sorted.length; i++) {
|
|
3919
|
-
gaps.push({ idx: i, size: sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w) });
|
|
3920
|
-
}
|
|
3921
|
-
const gapSizes = gaps.map((g2) => g2.size).sort((a, b) => a - b);
|
|
3922
|
-
const medianGap = gapSizes.length > 0 ? gapSizes[Math.floor(gapSizes.length / 2)] : 0;
|
|
3923
|
-
const gapThreshold = sorted.length <= numCols + 1 ? 12 : Math.max(medianGap * 2.5, 12);
|
|
3924
|
-
const significantGaps = gaps.filter((g2) => g2.size >= gapThreshold).sort((a, b) => b.size - a.size).slice(0, numCols - 1).sort((a, b) => a.idx - b.idx);
|
|
3925
|
-
const groups = [];
|
|
3926
|
-
let start = 0;
|
|
3927
|
-
for (const gap of significantGaps) {
|
|
3928
|
-
groups.push(sorted.slice(start, gap.idx));
|
|
3929
|
-
start = gap.idx;
|
|
3930
|
-
}
|
|
3931
|
-
groups.push(sorted.slice(start));
|
|
2980
|
+
function findAllElements(node, tagLocalName) {
|
|
3932
2981
|
const result = [];
|
|
3933
|
-
const
|
|
3934
|
-
|
|
3935
|
-
|
|
3936
|
-
for (
|
|
3937
|
-
|
|
3938
|
-
|
|
3939
|
-
if (
|
|
3940
|
-
|
|
3941
|
-
return (minX + maxX) / 2;
|
|
3942
|
-
});
|
|
3943
|
-
const assignments = [];
|
|
3944
|
-
for (let gi = 0; gi < groups.length; gi++) {
|
|
3945
|
-
for (let ci = 0; ci < numCols; ci++) {
|
|
3946
|
-
assignments.push({ gi, ci, dist: Math.abs(groupCenters[gi] - colCenters[ci]) });
|
|
3947
|
-
}
|
|
3948
|
-
}
|
|
3949
|
-
assignments.sort((a, b) => a.dist - b.dist);
|
|
3950
|
-
const assignedGroups = /* @__PURE__ */ new Set();
|
|
3951
|
-
for (const { gi, ci } of assignments) {
|
|
3952
|
-
if (assignedGroups.has(gi) || usedCols.has(ci)) continue;
|
|
3953
|
-
result.push({ col: ci, items: groups[gi] });
|
|
3954
|
-
assignedGroups.add(gi);
|
|
3955
|
-
usedCols.add(ci);
|
|
3956
|
-
}
|
|
3957
|
-
for (let gi = 0; gi < groups.length; gi++) {
|
|
3958
|
-
if (assignedGroups.has(gi)) continue;
|
|
3959
|
-
let bestCol = 0, bestDist = Infinity;
|
|
3960
|
-
for (let ci = 0; ci < numCols; ci++) {
|
|
3961
|
-
const d = Math.abs(groupCenters[gi] - colCenters[ci]);
|
|
3962
|
-
if (d < bestDist) {
|
|
3963
|
-
bestDist = d;
|
|
3964
|
-
bestCol = ci;
|
|
3965
|
-
}
|
|
3966
|
-
}
|
|
3967
|
-
result.push({ col: bestCol, items: groups[gi] });
|
|
3968
|
-
}
|
|
3969
|
-
return result;
|
|
3970
|
-
}
|
|
3971
|
-
function buildClusterTable(rows, columns, pageNum) {
|
|
3972
|
-
const numCols = columns.length;
|
|
3973
|
-
const numRows = rows.length;
|
|
3974
|
-
if (numRows < MIN_ROWS || numCols < MIN_COLS) return null;
|
|
3975
|
-
const cells = Array.from(
|
|
3976
|
-
{ length: numRows },
|
|
3977
|
-
() => Array.from({ length: numCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
|
|
3978
|
-
);
|
|
3979
|
-
const usedItems = /* @__PURE__ */ new Set();
|
|
3980
|
-
for (let r = 0; r < numRows; r++) {
|
|
3981
|
-
const row = rows[r];
|
|
3982
|
-
if (row.items.length === 1 && numCols > 1) {
|
|
3983
|
-
cells[r][0] = { text: row.items[0].text, colSpan: numCols, rowSpan: 1 };
|
|
3984
|
-
usedItems.add(row.items[0]);
|
|
3985
|
-
continue;
|
|
3986
|
-
}
|
|
3987
|
-
const assignments = assignRowItems(row.items, columns, numCols);
|
|
3988
|
-
for (const { col, items } of assignments) {
|
|
3989
|
-
const text = items.map((i) => i.text).join(" ");
|
|
3990
|
-
const existing = cells[r][col].text;
|
|
3991
|
-
cells[r][col].text = existing ? existing + " " + text : text;
|
|
3992
|
-
for (const item of items) usedItems.add(item);
|
|
3993
|
-
}
|
|
3994
|
-
}
|
|
3995
|
-
let emptyRows = 0;
|
|
3996
|
-
for (const row of cells) {
|
|
3997
|
-
if (row.every((c) => c.text === "")) emptyRows++;
|
|
3998
|
-
}
|
|
3999
|
-
if (emptyRows > numRows * 0.5) return null;
|
|
4000
|
-
for (let c = 0; c < numCols; c++) {
|
|
4001
|
-
const hasValue = cells.some((row) => row[c].text !== "");
|
|
4002
|
-
if (!hasValue) return null;
|
|
4003
|
-
}
|
|
4004
|
-
for (let r = numRows - 1; r >= 1; r--) {
|
|
4005
|
-
const nonEmptyCols = cells[r].filter((c) => c.text.trim()).length;
|
|
4006
|
-
if (nonEmptyCols !== 1) continue;
|
|
4007
|
-
if (cells[r][0].text.trim() !== "") continue;
|
|
4008
|
-
const contentText = cells[r].find((c) => c.text.trim())?.text.trim() || "";
|
|
4009
|
-
if (/^[○●▶\-·]/.test(contentText)) continue;
|
|
4010
|
-
for (let pr = r - 1; pr >= 0; pr--) {
|
|
4011
|
-
if (cells[pr].some((c) => c.text.trim())) {
|
|
4012
|
-
for (let c = 0; c < numCols; c++) {
|
|
4013
|
-
const prev = cells[pr][c].text.trim();
|
|
4014
|
-
const curr = cells[r][c].text.trim();
|
|
4015
|
-
if (curr) cells[pr][c].text = prev ? prev + " " + curr : curr;
|
|
4016
|
-
}
|
|
4017
|
-
for (let c = 0; c < numCols; c++) cells[r][c].text = "";
|
|
4018
|
-
break;
|
|
4019
|
-
}
|
|
4020
|
-
}
|
|
4021
|
-
}
|
|
4022
|
-
for (let r = 0; r < cells.length - 1; r++) {
|
|
4023
|
-
const row = cells[r];
|
|
4024
|
-
const hasCol0 = row[0].text.trim() !== "";
|
|
4025
|
-
const hasColLast = numCols > 1 && row[numCols - 1].text.trim() !== "";
|
|
4026
|
-
const midEmpty = row.slice(1, numCols - 1).every((c) => c.text.trim() === "");
|
|
4027
|
-
if (hasCol0 && hasColLast && midEmpty) {
|
|
4028
|
-
const next = cells[r + 1];
|
|
4029
|
-
if (next[0].text.trim() === "" && next.some((c) => c.text.trim())) {
|
|
4030
|
-
for (let c = 1; c < numCols; c++) {
|
|
4031
|
-
const curr = next[c].text.trim();
|
|
4032
|
-
if (curr) row[c].text = row[c].text.trim() ? row[c].text.trim() + " " + curr : curr;
|
|
4033
|
-
}
|
|
4034
|
-
for (let c = 0; c < numCols; c++) next[c].text = "";
|
|
4035
|
-
}
|
|
4036
|
-
}
|
|
4037
|
-
}
|
|
4038
|
-
const filteredCells = cells.filter((row) => row.some((c) => c.text.trim()));
|
|
4039
|
-
const finalRowCount = filteredCells.length;
|
|
4040
|
-
if (finalRowCount < MIN_ROWS) return null;
|
|
4041
|
-
const irTable = {
|
|
4042
|
-
rows: finalRowCount,
|
|
4043
|
-
cols: numCols,
|
|
4044
|
-
cells: filteredCells,
|
|
4045
|
-
hasHeader: finalRowCount > 1
|
|
4046
|
-
};
|
|
4047
|
-
const allItems = rows.flatMap((r) => r.items);
|
|
4048
|
-
let minX = Infinity, minY = Infinity, maxX = -Infinity, maxY = -Infinity;
|
|
4049
|
-
for (const i of allItems) {
|
|
4050
|
-
if (i.x < minX) minX = i.x;
|
|
4051
|
-
if (i.y < minY) minY = i.y;
|
|
4052
|
-
if (i.x + i.w > maxX) maxX = i.x + i.w;
|
|
4053
|
-
const h = i.h > 0 ? i.h : i.fontSize;
|
|
4054
|
-
if (i.y + h > maxY) maxY = i.y + h;
|
|
4055
|
-
}
|
|
4056
|
-
return {
|
|
4057
|
-
table: irTable,
|
|
4058
|
-
bbox: { page: pageNum, x: minX, y: minY, width: maxX - minX, height: maxY - minY },
|
|
4059
|
-
usedItems
|
|
4060
|
-
};
|
|
4061
|
-
}
|
|
4062
|
-
|
|
4063
|
-
// src/pdf/polyfill.ts
|
|
4064
|
-
import * as pdfjsWorker from "pdfjs-dist/legacy/build/pdf.worker.mjs";
|
|
4065
|
-
var g = globalThis;
|
|
4066
|
-
if (typeof g.DOMMatrix === "undefined") {
|
|
4067
|
-
g.DOMMatrix = class DOMMatrix {
|
|
4068
|
-
m = [1, 0, 0, 1, 0, 0];
|
|
4069
|
-
constructor(init) {
|
|
4070
|
-
if (init) this.m = init;
|
|
2982
|
+
const walk = (n) => {
|
|
2983
|
+
const children = n.childNodes;
|
|
2984
|
+
if (!children) return;
|
|
2985
|
+
for (let i = 0; i < children.length; i++) {
|
|
2986
|
+
const child = children[i];
|
|
2987
|
+
if (child.nodeType !== 1) continue;
|
|
2988
|
+
if (localName(child) === tagLocalName) result.push(child);
|
|
2989
|
+
walk(child);
|
|
4071
2990
|
}
|
|
4072
2991
|
};
|
|
4073
|
-
|
|
4074
|
-
if (typeof g.Path2D === "undefined") {
|
|
4075
|
-
g.Path2D = class Path2D {
|
|
4076
|
-
};
|
|
4077
|
-
}
|
|
4078
|
-
g.pdfjsWorker = pdfjsWorker;
|
|
4079
|
-
|
|
4080
|
-
// src/pdf/parser.ts
|
|
4081
|
-
import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
4082
|
-
GlobalWorkerOptions.workerSrc = "";
|
|
4083
|
-
var MAX_PAGES = 5e3;
|
|
4084
|
-
var MAX_TOTAL_TEXT = 100 * 1024 * 1024;
|
|
4085
|
-
var PDF_LOAD_TIMEOUT_MS = 3e4;
|
|
4086
|
-
async function loadPdfWithTimeout(buffer) {
|
|
4087
|
-
const loadingTask = getDocument({
|
|
4088
|
-
data: new Uint8Array(buffer),
|
|
4089
|
-
useSystemFonts: true,
|
|
4090
|
-
disableFontFace: true,
|
|
4091
|
-
isEvalSupported: false
|
|
4092
|
-
});
|
|
4093
|
-
let timer;
|
|
4094
|
-
try {
|
|
4095
|
-
return await Promise.race([
|
|
4096
|
-
loadingTask.promise,
|
|
4097
|
-
new Promise((_, reject) => {
|
|
4098
|
-
timer = setTimeout(() => {
|
|
4099
|
-
loadingTask.destroy();
|
|
4100
|
-
reject(new KordocError("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
|
|
4101
|
-
}, PDF_LOAD_TIMEOUT_MS);
|
|
4102
|
-
})
|
|
4103
|
-
]);
|
|
4104
|
-
} finally {
|
|
4105
|
-
if (timer !== void 0) clearTimeout(timer);
|
|
4106
|
-
}
|
|
4107
|
-
}
|
|
4108
|
-
async function parsePdfDocument(buffer, options) {
|
|
4109
|
-
const doc = await loadPdfWithTimeout(buffer);
|
|
4110
|
-
try {
|
|
4111
|
-
const pageCount = doc.numPages;
|
|
4112
|
-
if (pageCount === 0) throw new KordocError("PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
|
|
4113
|
-
const metadata = { pageCount };
|
|
4114
|
-
await extractPdfMetadata(doc, metadata);
|
|
4115
|
-
const blocks = [];
|
|
4116
|
-
const warnings = [];
|
|
4117
|
-
let totalChars = 0;
|
|
4118
|
-
let totalTextBytes = 0;
|
|
4119
|
-
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
4120
|
-
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
4121
|
-
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
4122
|
-
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
4123
|
-
const pageHeights = /* @__PURE__ */ new Map();
|
|
4124
|
-
let parsedPages = 0;
|
|
4125
|
-
for (let i = 1; i <= effectivePageCount; i++) {
|
|
4126
|
-
if (pageFilter && !pageFilter.has(i)) continue;
|
|
4127
|
-
try {
|
|
4128
|
-
const page = await doc.getPage(i);
|
|
4129
|
-
const tc = await page.getTextContent();
|
|
4130
|
-
const viewport = page.getViewport({ scale: 1 });
|
|
4131
|
-
pageHeights.set(i, viewport.height);
|
|
4132
|
-
const rawItems = tc.items;
|
|
4133
|
-
const items = normalizeItems(rawItems);
|
|
4134
|
-
const { visible, hiddenCount } = filterHiddenText(items, viewport.width, viewport.height);
|
|
4135
|
-
if (hiddenCount > 0) {
|
|
4136
|
-
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
4137
|
-
}
|
|
4138
|
-
for (const item of visible) {
|
|
4139
|
-
if (item.fontSize > 0) fontSizeFreq.set(item.fontSize, (fontSizeFreq.get(item.fontSize) || 0) + 1);
|
|
4140
|
-
}
|
|
4141
|
-
const opList = await page.getOperatorList();
|
|
4142
|
-
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
4143
|
-
for (const b of pageBlocks) blocks.push(b);
|
|
4144
|
-
for (const b of pageBlocks) {
|
|
4145
|
-
const t = b.text || "";
|
|
4146
|
-
totalChars += t.replace(/\s/g, "").length;
|
|
4147
|
-
totalTextBytes += t.length * 2;
|
|
4148
|
-
}
|
|
4149
|
-
if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
|
|
4150
|
-
parsedPages++;
|
|
4151
|
-
options?.onProgress?.(parsedPages, totalTarget);
|
|
4152
|
-
} catch (pageErr) {
|
|
4153
|
-
if (pageErr instanceof KordocError) throw pageErr;
|
|
4154
|
-
warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
4155
|
-
}
|
|
4156
|
-
}
|
|
4157
|
-
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
4158
|
-
if (totalChars / Math.max(parsedPageCount, 1) < 10) {
|
|
4159
|
-
if (options?.ocr) {
|
|
4160
|
-
try {
|
|
4161
|
-
const { ocrPages } = await import("./provider-7H4CPZYS.js");
|
|
4162
|
-
const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
|
|
4163
|
-
if (ocrBlocks.length > 0) {
|
|
4164
|
-
const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
|
|
4165
|
-
return { markdown: ocrMarkdown, blocks: ocrBlocks, metadata, warnings, isImageBased: true };
|
|
4166
|
-
}
|
|
4167
|
-
} catch {
|
|
4168
|
-
}
|
|
4169
|
-
}
|
|
4170
|
-
throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
|
|
4171
|
-
}
|
|
4172
|
-
if (options?.removeHeaderFooter !== false && parsedPageCount >= 3) {
|
|
4173
|
-
const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
|
|
4174
|
-
for (let ri = removed.length - 1; ri >= 0; ri--) {
|
|
4175
|
-
blocks.splice(removed[ri], 1);
|
|
4176
|
-
}
|
|
4177
|
-
}
|
|
4178
|
-
const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
|
|
4179
|
-
if (medianFontSize > 0) {
|
|
4180
|
-
detectHeadings(blocks, medianFontSize);
|
|
4181
|
-
}
|
|
4182
|
-
detectMarkerHeadings(blocks);
|
|
4183
|
-
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
4184
|
-
let markdown = cleanPdfText(blocksToMarkdown(blocks));
|
|
4185
|
-
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
4186
|
-
} finally {
|
|
4187
|
-
await doc.destroy().catch(() => {
|
|
4188
|
-
});
|
|
4189
|
-
}
|
|
4190
|
-
}
|
|
4191
|
-
async function extractPdfMetadata(doc, metadata) {
|
|
4192
|
-
try {
|
|
4193
|
-
const result = await doc.getMetadata();
|
|
4194
|
-
if (!result?.info) return;
|
|
4195
|
-
const info = result.info;
|
|
4196
|
-
if (typeof info.Title === "string" && info.Title.trim()) metadata.title = info.Title.trim();
|
|
4197
|
-
if (typeof info.Author === "string" && info.Author.trim()) metadata.author = info.Author.trim();
|
|
4198
|
-
if (typeof info.Creator === "string" && info.Creator.trim()) metadata.creator = info.Creator.trim();
|
|
4199
|
-
if (typeof info.Subject === "string" && info.Subject.trim()) metadata.description = info.Subject.trim();
|
|
4200
|
-
if (typeof info.Keywords === "string" && info.Keywords.trim()) {
|
|
4201
|
-
metadata.keywords = info.Keywords.split(/[,;]/).map((k) => k.trim()).filter(Boolean);
|
|
4202
|
-
}
|
|
4203
|
-
if (typeof info.CreationDate === "string") metadata.createdAt = parsePdfDate(info.CreationDate);
|
|
4204
|
-
if (typeof info.ModDate === "string") metadata.modifiedAt = parsePdfDate(info.ModDate);
|
|
4205
|
-
} catch {
|
|
4206
|
-
}
|
|
4207
|
-
}
|
|
4208
|
-
function parsePdfDate(dateStr) {
|
|
4209
|
-
const m = dateStr.match(/D:(\d{4})(\d{2})?(\d{2})?(\d{2})?(\d{2})?(\d{2})?/);
|
|
4210
|
-
if (!m) return void 0;
|
|
4211
|
-
const [, year, month = "01", day = "01", hour = "00", min = "00", sec = "00"] = m;
|
|
4212
|
-
return `${year}-${month}-${day}T${hour}:${min}:${sec}`;
|
|
4213
|
-
}
|
|
4214
|
-
async function extractPdfMetadataOnly(buffer) {
|
|
4215
|
-
const doc = await loadPdfWithTimeout(buffer);
|
|
4216
|
-
try {
|
|
4217
|
-
const metadata = { pageCount: doc.numPages };
|
|
4218
|
-
await extractPdfMetadata(doc, metadata);
|
|
4219
|
-
return metadata;
|
|
4220
|
-
} finally {
|
|
4221
|
-
await doc.destroy().catch(() => {
|
|
4222
|
-
});
|
|
4223
|
-
}
|
|
4224
|
-
}
|
|
4225
|
-
function filterHiddenText(items, pageWidth, pageHeight) {
|
|
4226
|
-
let hiddenCount = 0;
|
|
4227
|
-
const visible = [];
|
|
4228
|
-
for (const item of items) {
|
|
4229
|
-
if (item.isHidden) {
|
|
4230
|
-
hiddenCount++;
|
|
4231
|
-
continue;
|
|
4232
|
-
}
|
|
4233
|
-
const margin = Math.max(pageWidth, pageHeight) * 0.1;
|
|
4234
|
-
if (item.x < -margin || item.x > pageWidth + margin || item.y < -margin || item.y > pageHeight + margin) {
|
|
4235
|
-
hiddenCount++;
|
|
4236
|
-
continue;
|
|
4237
|
-
}
|
|
4238
|
-
visible.push(item);
|
|
4239
|
-
}
|
|
4240
|
-
return { visible, hiddenCount };
|
|
4241
|
-
}
|
|
4242
|
-
function computeMedianFontSizeFromFreq(freq) {
|
|
4243
|
-
if (freq.size === 0) return 0;
|
|
4244
|
-
let total = 0;
|
|
4245
|
-
for (const count of freq.values()) total += count;
|
|
4246
|
-
const sorted = [...freq.entries()].sort((a, b) => a[0] - b[0]);
|
|
4247
|
-
const mid = Math.floor(total / 2);
|
|
4248
|
-
let cumulative = 0;
|
|
4249
|
-
for (const [size, count] of sorted) {
|
|
4250
|
-
cumulative += count;
|
|
4251
|
-
if (cumulative > mid) return size;
|
|
4252
|
-
}
|
|
4253
|
-
return sorted[sorted.length - 1][0];
|
|
4254
|
-
}
|
|
4255
|
-
function detectHeadings(blocks, medianFontSize) {
|
|
4256
|
-
for (const block of blocks) {
|
|
4257
|
-
if (block.type !== "paragraph" || !block.text || !block.style?.fontSize) continue;
|
|
4258
|
-
const text = block.text.trim();
|
|
4259
|
-
if (text.length === 0 || text.length > 200) continue;
|
|
4260
|
-
if (/^\d+$/.test(text)) continue;
|
|
4261
|
-
const ratio = block.style.fontSize / medianFontSize;
|
|
4262
|
-
let level = 0;
|
|
4263
|
-
if (ratio >= HEADING_RATIO_H1) level = 1;
|
|
4264
|
-
else if (ratio >= HEADING_RATIO_H2) level = 2;
|
|
4265
|
-
else if (ratio >= HEADING_RATIO_H3) level = 3;
|
|
4266
|
-
if (level > 0) {
|
|
4267
|
-
block.type = "heading";
|
|
4268
|
-
block.level = level;
|
|
4269
|
-
block.text = collapseEvenSpacing(text);
|
|
4270
|
-
}
|
|
4271
|
-
}
|
|
4272
|
-
}
|
|
4273
|
-
function collapseEvenSpacing(text) {
|
|
4274
|
-
const tokens = text.split(" ");
|
|
4275
|
-
const singleCharCount = tokens.filter((t) => t.length === 1).length;
|
|
4276
|
-
if (tokens.length >= 3 && singleCharCount / tokens.length >= 0.7) {
|
|
4277
|
-
return tokens.join("");
|
|
4278
|
-
}
|
|
4279
|
-
return text.replace(
|
|
4280
|
-
/(?<![가-힣])[가-힣](?: [가-힣\d]){2,}(?![가-힣])/g,
|
|
4281
|
-
(match) => match.replace(/ /g, "")
|
|
4282
|
-
);
|
|
4283
|
-
}
|
|
4284
|
-
function shouldDemoteTable(table) {
|
|
4285
|
-
const allCells = table.cells.flatMap((row) => row.map((c) => c.text.trim())).filter(Boolean);
|
|
4286
|
-
const allText = allCells.join(" ");
|
|
4287
|
-
if (table.rows <= 3 && table.cols <= 3) {
|
|
4288
|
-
const totalCells2 = table.rows * table.cols;
|
|
4289
|
-
const emptyCells2 = totalCells2 - allCells.length;
|
|
4290
|
-
if (emptyCells2 >= totalCells2 * 0.3) return true;
|
|
4291
|
-
if (/[□■◆○●▶ㅇ]/.test(allText)) return true;
|
|
4292
|
-
if (/<[^>]+>/.test(allText)) return true;
|
|
4293
|
-
}
|
|
4294
|
-
if (allText.length > 200) return false;
|
|
4295
|
-
if (/[□■◆○●▶]/.test(allText) && table.rows <= 3) return true;
|
|
4296
|
-
const totalCells = table.rows * table.cols;
|
|
4297
|
-
const emptyCells = totalCells - allCells.length;
|
|
4298
|
-
if (table.rows <= 2 && emptyCells > totalCells * 0.5) return true;
|
|
4299
|
-
if (table.rows === 1 && !/\d{2,}/.test(allText)) return true;
|
|
4300
|
-
return false;
|
|
4301
|
-
}
|
|
4302
|
-
function demoteTableToText(table) {
|
|
4303
|
-
const lines = [];
|
|
4304
|
-
for (let r = 0; r < table.rows; r++) {
|
|
4305
|
-
const cells = table.cells[r].map((c) => c.text.trim()).filter(Boolean);
|
|
4306
|
-
if (cells.length === 0) continue;
|
|
4307
|
-
if (table.cols === 2 && cells.length === 2) {
|
|
4308
|
-
lines.push(`${cells[0]} : ${cells[1]}`);
|
|
4309
|
-
} else {
|
|
4310
|
-
lines.push(cells.join(" "));
|
|
4311
|
-
}
|
|
4312
|
-
}
|
|
4313
|
-
return lines.join("\n");
|
|
4314
|
-
}
|
|
4315
|
-
function detectMarkerHeadings(blocks) {
|
|
4316
|
-
for (let i = 0; i < blocks.length; i++) {
|
|
4317
|
-
const block = blocks[i];
|
|
4318
|
-
if (block.type !== "paragraph" || !block.text) continue;
|
|
4319
|
-
const text = block.text.trim();
|
|
4320
|
-
if (text.length < 50 && /^[□■◆◇▶]\s*[가-힣]/.test(text)) {
|
|
4321
|
-
block.type = "heading";
|
|
4322
|
-
block.level = 4;
|
|
4323
|
-
continue;
|
|
4324
|
-
}
|
|
4325
|
-
if (/^[가-힣]{2,6}$/.test(text) && block.style?.fontSize) {
|
|
4326
|
-
const prev = blocks[i - 1];
|
|
4327
|
-
const next = blocks[i + 1];
|
|
4328
|
-
const prevIsStructural = !prev || prev.type === "table" || prev.type === "heading" || prev.type === "separator";
|
|
4329
|
-
const nextIsStructural = !next || next.type === "table" || next.type === "heading" || next.type === "paragraph" && next.text && /^[□■◆○●]/.test(next.text.trim());
|
|
4330
|
-
if (prevIsStructural || nextIsStructural) {
|
|
4331
|
-
block.type = "heading";
|
|
4332
|
-
block.level = 3;
|
|
4333
|
-
}
|
|
4334
|
-
}
|
|
4335
|
-
}
|
|
4336
|
-
}
|
|
4337
|
-
// Recursion limit for xyCutOrder; at this depth a group is returned unsplit.
var MAX_XYCUT_DEPTH = 50;
/**
 * Recursively partitions items into ordered groups, trying a horizontal cut
 * (findYSplit) first and a vertical cut (findXSplit) only when no usable
 * horizontal cut exists.
 *
 * @param {Array<{x: number, y: number, w: number, h: number}>} items
 * @param {number} gapThreshold Passed through to the split finders.
 * @param {number} [depth=0] Current recursion depth.
 * @returns {Array<Array<object>>} Groups in order: larger-y side before
 *   smaller-y side, left side before right side. Empty input yields [].
 */
function xyCutOrder(items, gapThreshold, depth = 0) {
  if (items.length === 0) return [];
  if (items.length <= 2 || depth >= MAX_XYCUT_DEPTH) return [items];

  const descend = (subset) => xyCutOrder(subset, gapThreshold, depth + 1);
  // A cut only counts when both sides are non-empty and the first side is
  // strictly smaller than the input, which guarantees progress.
  const isProperCut = (first, second) =>
    first.length > 0 && second.length > 0 && first.length < items.length;

  const region = computeRegion(items);

  const ySplit = findYSplit(items, region, gapThreshold);
  if (ySplit !== null) {
    const above = items.filter((item) => item.y > ySplit);
    const below = items.filter((item) => item.y <= ySplit);
    if (isProperCut(above, below)) {
      return descend(above).concat(descend(below));
    }
  }

  const xSplit = findXSplit(items, region, gapThreshold);
  if (xSplit !== null) {
    // Items are assigned to a side by their horizontal centre.
    const leftSide = items.filter((item) => item.x + item.w / 2 < xSplit);
    const rightSide = items.filter((item) => item.x + item.w / 2 >= xSplit);
    if (isProperCut(leftSide, rightSide)) {
      return descend(leftSide).concat(descend(rightSide));
    }
  }

  return [items];
}
|
|
4360
|
-
/**
 * Computes the extent of a set of items.
 *
 * @param {Iterable<{x: number, y: number, w: number, h: number}>} items
 * @returns {{items: *, minX: number, minY: number, maxX: number, maxY: number}}
 *   The input plus the smallest x / y and the largest x + w / y + h seen.
 *   For empty input the minima stay Infinity and the maxima -Infinity.
 */
function computeRegion(items) {
  const bounds = { minX: Infinity, minY: Infinity, maxX: -Infinity, maxY: -Infinity };
  for (const { x, y, w, h } of items) {
    const farX = x + w;
    const farY = y + h;
    bounds.minX = x < bounds.minX ? x : bounds.minX;
    bounds.minY = y < bounds.minY ? y : bounds.minY;
    bounds.maxX = farX > bounds.maxX ? farX : bounds.maxX;
    bounds.maxY = farY > bounds.maxY ? farY : bounds.maxY;
  }
  return { items, ...bounds };
}
|
|
4370
|
-
/**
 * Finds the y coordinate of the widest vertical gap between consecutive
 * items, when items are ordered by descending y.
 *
 * The gap between two neighbours is `(upper.y - upper.h) - lower.y`; only a
 * gap strictly larger than `gapThreshold` (and than any earlier candidate)
 * is accepted.
 *
 * @param {Array<{y: number, h: number}>} items Not mutated.
 * @param {*} _region Unused.
 * @param {number} gapThreshold Minimum gap that must be exceeded.
 * @returns {number|null} Midpoint of the winning gap, or null if none.
 */
function findYSplit(items, _region, gapThreshold) {
  const topDown = items.slice().sort((first, second) => second.y - first.y);
  const best = { gap: gapThreshold, split: null };
  topDown.slice(1).forEach((lower, offset) => {
    const upper = topDown[offset];
    const upperBottom = upper.y - upper.h;
    const gap = upperBottom - lower.y;
    if (gap > best.gap) {
      best.gap = gap;
      best.split = (upperBottom + lower.y) / 2;
    }
  });
  return best.split;
}
|
|
4385
|
-
/**
 * Finds the x coordinate of the widest horizontal gap between consecutive
 * items, when items are ordered by ascending x.
 *
 * The gap between two neighbours is `right.x - (left.x + left.w)`; only a
 * gap strictly larger than `gapThreshold` (and than any earlier candidate)
 * is accepted.
 *
 * @param {Array<{x: number, w: number}>} items Not mutated.
 * @param {*} _region Unused.
 * @param {number} gapThreshold Minimum gap that must be exceeded.
 * @returns {number|null} Midpoint of the winning gap, or null if none.
 */
function findXSplit(items, _region, gapThreshold) {
  const leftToRight = items.slice().sort((first, second) => first.x - second.x);
  const best = { gap: gapThreshold, split: null };
  leftToRight.slice(1).forEach((right, offset) => {
    const left = leftToRight[offset];
    const leftEdgeEnd = left.x + left.w;
    const gap = right.x - leftEdgeEnd;
    if (gap > best.gap) {
      best.gap = gap;
      best.split = (leftEdgeEnd + right.x) / 2;
    }
  });
  return best.split;
}
|
|
4400
|
-
/**
 * Extracts blocks for one page, using drawn lines to find table grids.
 *
 * Lines are taken from the operator list, passed through
 * filterPageBorderLines and preprocessLines, and turned into grids. With at
 * least one grid the grid-based extractor is used; otherwise the fallback
 * extractor handles the page.
 *
 * @param {Array<object>} items Text items of the page.
 * @param {number} pageNum Page number forwarded to the extractors.
 * @param {{fnArray: *, argsArray: *}} opList Operator list of the page.
 * @param {number} pageWidth Forwarded to filterPageBorderLines.
 * @param {number} pageHeight Forwarded to filterPageBorderLines.
 * @returns {Array<object>} Extracted blocks; [] when there are no items.
 */
function extractPageBlocksWithLines(items, pageNum, opList, pageWidth, pageHeight) {
  if (items.length === 0) return [];

  const drawn = extractLines(opList.fnArray, opList.argsArray);
  const withoutBorder = filterPageBorderLines(drawn.horizontals, drawn.verticals, pageWidth, pageHeight);
  const { horizontals, verticals } = preprocessLines(withoutBorder.horizontals, withoutBorder.verticals);

  const grids = buildTableGrids(horizontals, verticals);
  return grids.length > 0
    ? extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals)
    : extractPageBlocksFallback(items, pageNum);
}
|
|
4411
|
-
/**
 * Builds the blocks of one page when line-based table grids were found.
 *
 * For every grid (processed by descending bbox.y2) the text items inside
 * the grid's bounding box are assigned to cells and emitted either as a
 * table block or, when shouldDemoteTable says so, as a paragraph. Items not
 * consumed by any grid are then run through cluster-table detection and the
 * XY-cut / fallback text extraction.
 *
 * Change from the previous version: extractCells is evaluated before items
 * are claimed. Previously the items inside a grid were marked as used first,
 * so a grid that produced no cells swallowed its text without emitting any
 * block. Such text now stays available to later grids and to the plain-text
 * path.
 *
 * @param {Array<object>} items Text items of the page (x, y, w, h, text, fontSize, fontName).
 * @param {number} pageNum Page number stored on every emitted block.
 * @param {Array<{rowYs: number[], colXs: number[], bbox: {x1: number, y1: number, x2: number, y2: number}}>} grids
 * @param {Array<object>} horizontals Horizontal lines, forwarded to extractCells.
 * @param {Array<object>} verticals Vertical lines, forwarded to extractCells.
 * @returns {Array<object>} Blocks after mergeAdjacentTableBlocks.
 */
function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
  // Detached copy of the fields the cell / cluster helpers work with.
  const toPlainItem = (i) => ({
    text: i.text,
    x: i.x,
    y: i.y,
    w: i.w,
    h: i.h,
    fontSize: i.fontSize,
    fontName: i.fontName
  });
  const blocks = [];
  const usedItems = /* @__PURE__ */ new Set();
  const sortedGrids = [...grids].sort((a, b) => b.bbox.y2 - a.bbox.y2);
  for (const grid of sortedGrids) {
    const numRows = grid.rowYs.length - 1;
    const numCols = grid.colXs.length - 1;
    // Single-row grids with two or more columns are never treated as tables.
    if (numRows === 1 && numCols >= 2) continue;
    // Resolve cells before claiming any text so that an unusable grid
    // leaves its items to the remaining-text path below.
    const cells = extractCells(grid, horizontals, verticals);
    if (cells.length === 0) continue;
    const tableItems = [];
    const pad = 3;
    const gridW = grid.bbox.x2 - grid.bbox.x1;
    for (const item of items) {
      if (usedItems.has(item)) continue;
      if (item.y < grid.bbox.y1 - pad || item.y > grid.bbox.y2 + pad) continue;
      if (item.x < grid.bbox.x1 - pad || item.x + item.w > grid.bbox.x2 + pad) continue;
      // For grids narrower than 120 units, items reaching the right edge are left out.
      if (gridW < 120 && item.x + item.w > grid.bbox.x2 - 2) continue;
      tableItems.push(item);
      usedItems.add(item);
    }
    const cellTextMap = mapTextToCells(tableItems.map(toPlainItem), cells);
    const irGrid = Array.from(
      { length: numRows },
      () => Array.from({ length: numCols }, () => ({ text: "", colSpan: 1, rowSpan: 1 }))
    );
    for (const cell of cells) {
      const cellItems = cellTextMap.get(cell) || [];
      let text = cellTextToString(cellItems);
      // Drop lines that consist only of a dash-number-dash pattern such as "- 3 -".
      text = text.replace(/^[\s]*[-–—]\s*\d+\s*[-–—][\s]*$/gm, "").trim();
      text = text.split("\n").map((line) => collapseEvenSpacing(line)).join("\n");
      irGrid[cell.row][cell.col] = {
        text,
        colSpan: cell.colSpan,
        rowSpan: cell.rowSpan
      };
    }
    const irTable = {
      rows: numRows,
      cols: numCols,
      cells: irGrid,
      hasHeader: numRows > 1
    };
    const hasContent = irGrid.some((row) => row.some((cell) => cell.text.trim() !== ""));
    if (!hasContent) continue;
    const tableBbox = {
      page: pageNum,
      x: grid.bbox.x1,
      y: grid.bbox.y1,
      width: grid.bbox.x2 - grid.bbox.x1,
      height: grid.bbox.y2 - grid.bbox.y1
    };
    if (shouldDemoteTable(irTable)) {
      const demoted = demoteTableToText(irTable);
      if (demoted) {
        const text = numRows === 1 ? "\n" + demoted + "\n" : demoted;
        blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox: tableBbox, style: dominantStyle(tableItems) });
      }
      continue;
    }
    blocks.push({ type: "table", table: irTable, pageNumber: pageNum, bbox: tableBbox });
  }
  let remaining = items.filter((i) => !usedItems.has(i));
  if (remaining.length > 0) {
    remaining.sort((a, b) => b.y - a.y || a.x - b.x);
    const clusterItems = remaining.map(toPlainItem);
    const clusterResults = detectClusterTables(clusterItems, pageNum);
    if (clusterResults.length > 0) {
      // clusterItems[k] is a copy of remaining[k]; map copies back to indices
      // so the originals consumed by a cluster table can be removed.
      const ciToIdx = /* @__PURE__ */ new Map();
      for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
      const usedClusterIndices = /* @__PURE__ */ new Set();
      for (const cr of clusterResults) {
        for (const ci of cr.usedItems) {
          const idx = ciToIdx.get(ci);
          if (idx !== void 0) usedClusterIndices.add(idx);
        }
        blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
      }
      remaining = remaining.filter((_, idx) => !usedClusterIndices.has(idx));
    }
    if (remaining.length > 0) {
      const allY = remaining.map((i) => i.y);
      const pageH = safeMax(allY) - safeMin(allY);
      const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
      const textBlocks = [];
      for (const group of groups) {
        if (group.length === 0) continue;
        const groupBlocks = extractPageBlocksFallback(group, pageNum);
        for (const b of groupBlocks) textBlocks.push(b);
      }
      const finalTextBlocks = detectListBlocks(textBlocks);
      for (const b of finalTextBlocks) blocks.push(b);
    }
    // Order by bbox.y + bbox.height, largest first; blocks without a bbox sort as 0.
    blocks.sort((a, b) => {
      const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
      const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
      return by - ay;
    });
  }
  return mergeAdjacentTableBlocks(blocks);
}
|
|
4532
|
-
/**
 * Collapses runs of consecutive table blocks that share a column count into
 * a single table block.
 *
 * The merged block is a copy of the first block of the run with a new
 * `table` object: rows summed, cells concatenated, `cols` and `hasHeader`
 * taken from the first table. Every other property of the first block
 * (including its bbox) is kept unchanged. Input blocks are not mutated.
 *
 * @param {Array<object>} blocks
 * @returns {Array<object>} The same array when it has at most one element,
 *   otherwise a new array.
 */
function mergeAdjacentTableBlocks(blocks) {
  if (blocks.length <= 1) return blocks;

  const isMergeable = (left, right) =>
    left.type === "table" &&
    right.type === "table" &&
    left.table &&
    right.table &&
    left.table.cols === right.table.cols;

  const [head, ...tail] = blocks;
  const output = [head];
  for (const block of tail) {
    const lastIndex = output.length - 1;
    const last = output[lastIndex];
    if (!isMergeable(last, block)) {
      output.push(block);
      continue;
    }
    output[lastIndex] = {
      ...last,
      table: {
        rows: last.table.rows + block.table.rows,
        cols: last.table.cols,
        cells: last.table.cells.concat(block.table.cells),
        hasHeader: last.table.hasHeader
      }
    };
  }
  return output;
}
|
|
4552
|
-
function
|
|
4553
|
-
|
|
4554
|
-
const
|
|
4555
|
-
|
|
4556
|
-
|
|
4557
|
-
|
|
4558
|
-
|
|
4559
|
-
|
|
4560
|
-
h: i.h,
|
|
4561
|
-
fontSize: i.fontSize,
|
|
4562
|
-
fontName: i.fontName
|
|
4563
|
-
}));
|
|
4564
|
-
const clusterResults = detectClusterTables(clusterItems, pageNum);
|
|
4565
|
-
if (clusterResults.length > 0) {
|
|
4566
|
-
const ciToIdx = /* @__PURE__ */ new Map();
|
|
4567
|
-
for (let ci = 0; ci < clusterItems.length; ci++) ciToIdx.set(clusterItems[ci], ci);
|
|
4568
|
-
const usedIndices = /* @__PURE__ */ new Set();
|
|
4569
|
-
for (const cr of clusterResults) {
|
|
4570
|
-
for (const ci of cr.usedItems) {
|
|
4571
|
-
const idx = ciToIdx.get(ci);
|
|
4572
|
-
if (idx !== void 0) usedIndices.add(idx);
|
|
4573
|
-
}
|
|
4574
|
-
blocks.push({ type: "table", table: cr.table, pageNumber: pageNum, bbox: cr.bbox });
|
|
4575
|
-
}
|
|
4576
|
-
const remaining = items.filter((_, idx) => !usedIndices.has(idx));
|
|
4577
|
-
if (remaining.length > 0) {
|
|
4578
|
-
const yLines = groupByY(remaining);
|
|
4579
|
-
for (const line of yLines) {
|
|
4580
|
-
const text = mergeLineSimple(line);
|
|
4581
|
-
if (!text.trim()) continue;
|
|
4582
|
-
const bbox = computeBBox(line, pageNum);
|
|
4583
|
-
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
|
|
4584
|
-
}
|
|
4585
|
-
}
|
|
4586
|
-
blocks.sort((a, b) => {
|
|
4587
|
-
const ay = a.bbox ? a.bbox.y + a.bbox.height : 0;
|
|
4588
|
-
const by = b.bbox ? b.bbox.y + b.bbox.height : 0;
|
|
4589
|
-
return by - ay;
|
|
4590
|
-
});
|
|
4591
|
-
} else {
|
|
4592
|
-
const allYLines = groupByY(items);
|
|
4593
|
-
const columns = detectColumns(allYLines);
|
|
4594
|
-
if (columns && columns.length >= 3) {
|
|
4595
|
-
const tableText = extractWithColumns(allYLines, columns);
|
|
4596
|
-
const bbox = computeBBox(items, pageNum);
|
|
4597
|
-
blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
|
|
4598
|
-
} else {
|
|
4599
|
-
const allY = items.map((i) => i.y);
|
|
4600
|
-
const pageHeight = safeMax(allY) - safeMin(allY);
|
|
4601
|
-
const gapThreshold = Math.max(15, pageHeight * 0.03);
|
|
4602
|
-
const orderedGroups = xyCutOrder(items, gapThreshold);
|
|
4603
|
-
for (const group of orderedGroups) {
|
|
4604
|
-
if (group.length === 0) continue;
|
|
4605
|
-
const yLines = groupByY(group);
|
|
4606
|
-
const groupColumns = detectColumns(yLines);
|
|
4607
|
-
if (groupColumns && groupColumns.length >= 3) {
|
|
4608
|
-
const tableText = extractWithColumns(yLines, groupColumns);
|
|
4609
|
-
const bbox = computeBBox(group, pageNum);
|
|
4610
|
-
blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(group) });
|
|
4611
|
-
} else {
|
|
4612
|
-
for (const line of yLines) {
|
|
4613
|
-
const text = mergeLineSimple(line);
|
|
4614
|
-
if (!text.trim()) continue;
|
|
4615
|
-
const bbox = computeBBox(line, pageNum);
|
|
4616
|
-
blocks.push({ type: "paragraph", text, pageNumber: pageNum, bbox, style: dominantStyle(line) });
|
|
4617
|
-
}
|
|
4618
|
-
}
|
|
4619
|
-
}
|
|
2995
|
+
/**
 * Returns the element children of `parent` whose local name equals
 * `tagLocalName`. Only direct children are inspected.
 *
 * @param {{childNodes?: ArrayLike<object>}} parent
 * @param {string} tagLocalName
 * @returns {Array<object>} Matching children in document order; [] when
 *   `parent` has no childNodes.
 */
function findDirectChildren(parent, tagLocalName) {
  const { childNodes } = parent;
  if (!childNodes) return [];
  // nodeType 1 is an element node; other node kinds are never matched.
  return Array.from(childNodes).filter(
    (candidate) => candidate.nodeType === 1 && localName(candidate) === tagLocalName,
  );
}
|
|
4624
|
-
function
|
|
4625
|
-
let
|
|
4626
|
-
|
|
4627
|
-
if (
|
|
4628
|
-
|
|
4629
|
-
if (i.x + i.w > maxX) maxX = i.x + i.w;
|
|
4630
|
-
const effectiveH = i.h > 0 ? i.h : i.fontSize;
|
|
4631
|
-
if (i.y + effectiveH > maxY) maxY = i.y + effectiveH;
|
|
3007
|
+
/**
 * Reports whether any ancestor of `el` is an element whose local name is
 * "tbl". The element itself is not checked.
 *
 * @param {{parentNode?: object|null}} el
 * @returns {boolean}
 */
function isInsideTable(el) {
  for (let ancestor = el.parentNode; ancestor; ancestor = ancestor.parentNode) {
    if (ancestor.nodeType === 1 && localName(ancestor) === "tbl") return true;
  }
  return false;
}
|
|
4635
|
-
function
|
|
4636
|
-
|
|
4637
|
-
const
|
|
4638
|
-
|
|
4639
|
-
|
|
4640
|
-
|
|
4641
|
-
|
|
4642
|
-
|
|
4643
|
-
|
|
4644
|
-
|
|
4645
|
-
|
|
4646
|
-
|
|
4647
|
-
|
|
4648
|
-
|
|
4649
|
-
|
|
4650
|
-
return { fontSize: dominantSize, fontName };
|
|
4651
|
-
}
|
|
4652
|
-
/**
 * Converts raw text items into normalized, de-duplicated items.
 *
 * Steps, in order:
 *  1. Entries whose `str` is not a string are ignored; whitespace-only
 *     entries are remembered only as blank positions.
 *  2. Geometry is rounded to integers; fontSize is the rounded larger of
 *     |transform[0]| and |transform[3]|.
 *  3. Text made only of digits, whitespace and the listed punctuation, and
 *     containing a digit and a space, has its spaces removed.
 *  4. splitEvenSpacedItem may split one entry into several items.
 *  5. Items are sorted by descending y, then ascending x, and an item is
 *     dropped when an already kept item has the same text within 3 units
 *     on both axes.
 *  6. An item gets `hasSpaceBefore = true` when a blank position lies on
 *     the same line (within 3 units in y) and 0-20 units to its left.
 *
 * @param {Iterable<{str: *, transform: number[], width: number, height: number, fontName?: string}>} rawItems
 * @returns {Array<{text: string, x: number, y: number, w: number, h: number,
 *   fontSize: number, fontName: string, isHidden: boolean, hasSpaceBefore?: boolean}>}
 */
function normalizeItems(rawItems) {
  const textItems = [];
  const blanks = [];

  for (const raw of rawItems) {
    if (typeof raw.str !== "string") continue;
    const x = Math.round(raw.transform[4]);
    const y = Math.round(raw.transform[5]);
    let text = raw.str.trim();
    if (!text) {
      blanks.push({ x, y });
      continue;
    }
    const fontSize = Math.round(Math.max(Math.abs(raw.transform[3]), Math.abs(raw.transform[0])));
    const w = Math.round(raw.width);
    const h = Math.round(raw.height);
    // Text is known to be non-empty here, so a zero font size or a zero
    // (unrounded) width is what marks the item as hidden.
    const isHidden = fontSize === 0 || raw.width === 0;
    const fontName = raw.fontName || "";
    if (/^[\d\s\-().·,☎]+$/.test(text) && /\d/.test(text) && / /.test(text)) {
      text = text.replace(/ /g, "");
    }
    const pieces = splitEvenSpacedItem(text, x, w, fontSize);
    if (pieces) {
      for (const piece of pieces) {
        textItems.push({ text: piece.text, x: piece.x, y, w: piece.w, h, fontSize, fontName, isHidden });
      }
    } else {
      textItems.push({ text, x, y, w, h, fontSize, fontName, isHidden });
    }
  }

  textItems.sort((a, b) => b.y - a.y || a.x - b.x);

  const isRepeat = (kept, candidate) =>
    Math.abs(kept.y - candidate.y) <= 3 &&
    kept.text === candidate.text &&
    Math.abs(kept.x - candidate.x) <= 3;
  const unique = [];
  for (const candidate of textItems) {
    let duplicate = false;
    // Walk back through kept items; stop once they are more than 3 units above.
    for (let k = unique.length - 1; k >= 0 && !duplicate; k -= 1) {
      const kept = unique[k];
      if (kept.y - candidate.y > 3) break;
      duplicate = isRepeat(kept, candidate);
    }
    if (!duplicate) unique.push(candidate);
  }

  for (const item of unique) {
    const followsBlank = blanks.some((blank) => {
      if (Math.abs(blank.y - item.y) > 3) return false;
      const distance = item.x - blank.x;
      return distance >= 0 && distance <= 20;
    });
    if (followsBlank) item.hasSpaceBefore = true;
  }
  return unique;
}
|
|
4711
|
-
/**
 * Splits a text item made of single Hangul syllables or digits separated by
 * single spaces (at least three of them) into one piece per character,
 * spreading the pieces evenly across the item's width.
 *
 * @param {string} text Item text.
 * @param {number} itemX Left edge of the item.
 * @param {number} itemW Width of the item.
 * @param {number} fontSize Font size of the item.
 * @returns {Array<{text: string, x: number, w: number}>|null} The pieces, or
 *   null when the text does not match the pattern or the per-character
 *   pitch exceeds twice the font size.
 */
function splitEvenSpacedItem(text, itemX, itemW, fontSize) {
  const matchesPattern = /^[가-힣\d](?: [가-힣\d]){2,}$/.test(text);
  if (!matchesPattern) return null;
  const glyphs = text.split(" ");
  if (glyphs.length < 3) return null;
  const pitch = itemW / glyphs.length;
  if (pitch > fontSize * 2) return null;
  const pieces = [];
  glyphs.forEach((glyph, position) => {
    pieces.push({
      text: glyph,
      x: Math.round(itemX + position * pitch),
      // The actual glyph is narrower than the spacing between glyphs.
      w: Math.round(pitch * 0.8)
    });
  });
  return pieces;
}
|
|
4724
|
-
/**
 * Groups items into lines by their y coordinate, preserving input order.
 *
 * A new line starts whenever an item's y differs by more than 3 from the y
 * of the FIRST item of the current line (the reference does not drift).
 *
 * @param {Array<{y: number}>} items Items in the order they should be scanned.
 * @returns {Array<Array<object>>} Lines in encounter order; [] for no items.
 */
function groupByY(items) {
  if (items.length === 0) return [];
  const [first, ...rest] = items;
  const rows = [[first]];
  let anchorY = first.y;
  for (const item of rest) {
    if (Math.abs(item.y - anchorY) > 3) {
      rows.push([]);
      anchorY = item.y;
    }
    rows[rows.length - 1].push(item);
  }
  return rows;
}
|
|
4740
|
-
/**
 * Heuristic used by column detection to recognise a line of short,
 * closely spaced fragments.
 *
 * @param {Array<{x: number, w: number, text: string}>} items Items of one line; not mutated.
 * @returns {boolean} True when there are at least 4 items, the largest
 *   horizontal gap between x-ordered neighbours is below 40 and the average
 *   text length is below 5 characters.
 */
function isProseSpread(items) {
  if (items.length < 4) return false;
  const leftToRight = items.slice().sort((first, second) => first.x - second.x);
  const gaps = leftToRight
    .slice(1)
    .map((item, offset) => item.x - (leftToRight[offset].x + leftToRight[offset].w));
  const widestGap = safeMax(gaps);
  let totalChars = 0;
  for (const item of items) totalChars += item.text.length;
  const averageLength = totalChars / items.length;
  return widestGap < 40 && averageLength < 5;
}
|
|
4751
|
-
function detectColumns(yLines) {
|
|
4752
|
-
const allItems = yLines.flat();
|
|
4753
|
-
if (allItems.length === 0) return null;
|
|
4754
|
-
const pageWidth = safeMax(allItems.map((i) => i.x + i.w)) - safeMin(allItems.map((i) => i.x));
|
|
4755
|
-
if (pageWidth < 100) return null;
|
|
4756
|
-
let bigoLineIdx = -1;
|
|
4757
|
-
for (let i = 0; i < yLines.length; i++) {
|
|
4758
|
-
if (yLines[i].length <= 2 && yLines[i].some((item) => item.text === "\uBE44\uACE0")) {
|
|
4759
|
-
bigoLineIdx = i;
|
|
4760
|
-
break;
|
|
4761
|
-
}
|
|
4762
|
-
}
|
|
4763
|
-
const tableYLines = bigoLineIdx >= 0 ? yLines.slice(0, bigoLineIdx) : yLines;
|
|
4764
|
-
const CLUSTER_TOL = 22;
|
|
4765
|
-
const xClusters = [];
|
|
4766
|
-
for (const line of tableYLines) {
|
|
4767
|
-
if (isProseSpread(line)) continue;
|
|
4768
|
-
for (const item of line) {
|
|
4769
|
-
let found = false;
|
|
4770
|
-
for (const c of xClusters) {
|
|
4771
|
-
if (Math.abs(item.x - c.center) <= CLUSTER_TOL) {
|
|
4772
|
-
c.center = Math.round((c.center * c.count + item.x) / (c.count + 1));
|
|
4773
|
-
c.minX = Math.min(c.minX, item.x);
|
|
4774
|
-
c.count++;
|
|
4775
|
-
found = true;
|
|
4776
|
-
break;
|
|
4777
|
-
}
|
|
4778
|
-
}
|
|
4779
|
-
if (!found) {
|
|
4780
|
-
xClusters.push({ center: item.x, count: 1, minX: item.x });
|
|
3015
|
+
/**
 * Concatenates the text found inside a table-cell element.
 *
 * Text nodes are taken verbatim. Elements named "t", "run", "r", "p" or
 * "subList" are descended into, "tab" contributes a single space, "br" a
 * newline, and every other element is ignored together with its subtree.
 * Nothing is inserted between paragraphs.
 *
 * @param {object} tcEl Cell element (or any node exposing childNodes).
 * @returns {string}
 */
function extractCellText(tcEl) {
  const pieces = [];
  const visit = (node) => {
    const kids = node.childNodes;
    if (!kids) return;
    for (let idx = 0; idx < kids.length; idx += 1) {
      const kid = kids[idx];
      if (kid.nodeType === 3) {
        pieces.push(kid.textContent || "");
        continue;
      }
      if (kid.nodeType !== 1) continue;
      switch (localName(kid)) {
        case "t":
        case "run":
        case "r":
        case "p":
        case "subList":
          visit(kid);
          break;
        case "tab":
          pieces.push(" ");
          break;
        case "br":
          pieces.push("\n");
          break;
        default:
          break;
      }
    }
  };
  visit(tcEl);
  return pieces.join("");
}
|
|
3036
|
+
/**
 * Puts `text`, followed by one space, in front of the content of the first
 * "t" element found inside the cell. Does nothing when the cell has no "t"
 * element.
 *
 * @param {object} tcEl Cell element.
 * @param {string} text Text to insert at the front.
 * @returns {void}
 */
function prependCellText(tcEl, text) {
  const target = findAllElements(tcEl, "t")[0];
  if (target === undefined) return;
  // Read the current content before it is removed.
  const previous = target.textContent || "";
  clearChildren(target);
  const replacement = target.ownerDocument.createTextNode(`${text} ${previous}`);
  target.appendChild(replacement);
}
|
|
3044
|
+
/**
 * Replaces the whole text of a cell with `newValue`.
 *
 * The value is written into the first paragraph: into its first run when it
 * has "run"/"r" elements (the remaining runs are blanked), otherwise into
 * its first "t" element (the remaining ones are emptied). The text of every
 * further paragraph in the cell is cleared. Elements themselves are kept;
 * only text content is changed. Does nothing when the cell has no "p".
 *
 * @param {object} tcEl Cell element.
 * @param {string} newValue Text the cell should contain afterwards.
 * @returns {void}
 */
function replaceCellText(tcEl, newValue) {
  const runsOf = (container) =>
    findAllElements(container, "run").concat(findAllElements(container, "r"));

  const [firstParagraph, ...otherParagraphs] = findAllElements(tcEl, "p");
  if (firstParagraph === undefined) return;

  const [leadRun, ...restRuns] = runsOf(firstParagraph);
  if (leadRun !== undefined) {
    setRunText(leadRun, newValue);
    restRuns.forEach((run) => setRunText(run, ""));
  } else {
    // No runs: write straight into the paragraph's "t" elements.
    const [leadT, ...restTs] = findAllElements(firstParagraph, "t");
    if (leadT !== undefined) {
      clearChildren(leadT);
      leadT.appendChild(leadT.ownerDocument.createTextNode(newValue));
      restTs.forEach((t) => clearChildren(t));
    }
  }

  for (const paragraph of otherParagraphs) {
    if (!paragraph.parentNode) continue;
    runsOf(paragraph).forEach((run) => setRunText(run, ""));
    findAllElements(paragraph, "t").forEach((t) => clearChildren(t));
  }
}
|
|
4810
|
-
function
|
|
4811
|
-
|
|
4812
|
-
|
|
3074
|
+
/**
 * Sets the text of a run: the first "t" element inside it receives `text`,
 * every further "t" element is emptied. A run without any "t" element is
 * left untouched.
 *
 * @param {object} runEl Run element (or any element containing "t" elements).
 * @param {string} text New text.
 * @returns {void}
 */
function setRunText(runEl, text) {
  const [lead, ...others] = findAllElements(runEl, "t");
  if (lead === undefined) return;
  clearChildren(lead);
  lead.appendChild(lead.ownerDocument.createTextNode(text));
  others.forEach((extra) => clearChildren(extra));
}
|
|
4816
|
-
function
|
|
3084
|
+
/**
 * Removes every child node of `el`.
 *
 * @param {{firstChild: object|null, removeChild: Function}} el
 * @returns {void}
 */
function clearChildren(el) {
  let child = el.firstChild;
  while (child) {
    el.removeChild(child);
    child = el.firstChild;
  }
}
|
|
3087
|
+
/**
 * Lists the "t" elements inside `pEl` together with their text and the
 * position of that text within the concatenation of all of them.
 *
 * @param {object} pEl Element to search.
 * @returns {Array<{element: object, text: string, offset: number}>} One
 *   entry per "t" element, in the order findAllElements returns them;
 *   `offset` is the summed text length of all earlier entries.
 */
function collectTextNodes(pEl) {
  let cursor = 0;
  return findAllElements(pEl, "t").map((element) => {
    const text = element.textContent || "";
    const entry = { element, text, offset: cursor };
    cursor += text.length;
    return entry;
  });
}
|
|
3098
|
+
/**
 * Replaces the characters [globalStart, globalEnd) of the text formed by
 * concatenating all entries of `tNodes` with `newValue`, rewriting the
 * affected "t" elements.
 *
 * The first entry overlapping the range receives `newValue` in place of its
 * part of the range; every later overlapping entry just loses its part.
 *
 * A pure insertion (globalStart === globalEnd) is handled separately: the
 * overlap test below can never match an empty range that sits on an entry
 * boundary, so text inserted at the very start, at the very end, or exactly
 * between two entries used to be dropped. The insertion now goes into the
 * first entry whose span contains the position (boundaries included).
 *
 * @param {Array<{element: object, text: string, offset: number}>} tNodes
 *   Entries as produced by collectTextNodes. The `text` / `offset` fields
 *   are not updated by this function.
 * @param {number} globalStart Start of the range in the concatenated text.
 * @param {number} globalEnd End (exclusive) of the range.
 * @param {string} newValue Replacement text.
 * @returns {void}
 */
function replaceTextRange(tNodes, globalStart, globalEnd, newValue) {
  const writeText = (node, value) => {
    clearChildren(node.element);
    node.element.appendChild(node.element.ownerDocument.createTextNode(value));
  };

  if (globalStart === globalEnd) {
    // Nothing to remove and nothing to add: leave the elements untouched.
    if (newValue === "") return;
    const host = tNodes.find(
      (node) => node.offset <= globalStart && globalStart <= node.offset + node.text.length,
    );
    if (host) {
      const at = globalStart - host.offset;
      writeText(host, host.text.slice(0, at) + newValue + host.text.slice(at));
    }
    return;
  }

  let replaced = false;
  for (const node of tNodes) {
    const nodeStart = node.offset;
    const nodeEnd = node.offset + node.text.length;
    // Skip entries that do not overlap the range at all.
    if (nodeEnd <= globalStart || nodeStart >= globalEnd) continue;
    const localStart = Math.max(0, globalStart - nodeStart);
    const localEnd = Math.min(node.text.length, globalEnd - nodeStart);
    const before = node.text.slice(0, localStart);
    const after = node.text.slice(localEnd);
    if (!replaced) {
      writeText(node, before + newValue + after);
      replaced = true;
    } else {
      writeText(node, before + after);
    }
  }
}
|
|
3122
|
+
/**
 * Lists the "t" elements inside a table cell together with their text and
 * running offset.
 *
 * The implementation was a line-for-line copy of collectTextNodes; it now
 * delegates to it so both entry points cannot drift apart.
 *
 * @param {object} tcEl Cell element to search.
 * @returns {Array<{element: object, text: string, offset: number}>} One
 *   entry per "t" element; `offset` is the summed text length of all
 *   earlier entries.
 */
function collectCellTextNodes(tcEl) {
  return collectTextNodes(tcEl);
}
|
|
3133
|
+
/**
 * Makes the "t" elements described by `tNodes` reflect `replacedFull`
 * instead of `originalFull`.
 *
 * With a single entry the element's content is simply replaced. With
 * several entries the common prefix and common suffix of the two strings
 * are trimmed and only the differing middle part is handed to
 * replaceTextRange. Identical strings are a no-op.
 *
 * @param {Array<{element: object, text: string, offset: number}>} tNodes
 * @param {string} originalFull Concatenated text before the change.
 * @param {string} replacedFull Concatenated text after the change.
 * @returns {void}
 */
function applyTextReplacements(tNodes, originalFull, replacedFull) {
  if (originalFull === replacedFull) return;

  if (tNodes.length === 1) {
    const { element } = tNodes[0];
    clearChildren(element);
    element.appendChild(element.ownerDocument.createTextNode(replacedFull));
    return;
  }

  const shorter = Math.min(originalFull.length, replacedFull.length);
  let prefix = 0;
  while (prefix < shorter && originalFull[prefix] === replacedFull[prefix]) {
    prefix += 1;
  }
  // The suffix may not reach back into the prefix of either string.
  let suffix = 0;
  while (
    suffix < shorter - prefix &&
    originalFull[originalFull.length - 1 - suffix] === replacedFull[replacedFull.length - 1 - suffix]
  ) {
    suffix += 1;
  }

  const changedEnd = originalFull.length - suffix;
  const insertion = replacedFull.slice(prefix, replacedFull.length - suffix);
  replaceTextRange(tNodes, prefix, changedEnd, insertion);
}
|
|
4867
|
-
|
|
4868
|
-
|
|
4869
|
-
|
|
4870
|
-
|
|
4871
|
-
|
|
4872
|
-
|
|
4873
|
-
|
|
4874
|
-
|
|
4875
|
-
|
|
4876
|
-
|
|
4877
|
-
|
|
4878
|
-
|
|
4879
|
-
|
|
4880
|
-
|
|
4881
|
-
|
|
4882
|
-
|
|
3155
|
+
|
|
3156
|
+
// src/hwpx/generator.ts
|
|
3157
|
+
import JSZip3 from "jszip";
|
|
3158
|
+
// Namespace / identifier URIs, kept as plain string constants.
// NOTE(review): presumably emitted as xmlns attributes by the XML generators
// further down this module — confirm against their bodies.
var NS_SECTION = "http://www.hancom.co.kr/hwpml/2011/section";
var NS_PARA = "http://www.hancom.co.kr/hwpml/2011/paragraph";
var NS_HEAD = "http://www.hancom.co.kr/hwpml/2011/head";
var NS_OPF = "http://www.idpf.org/2007/opf/";
var NS_HPF = "http://www.hancom.co.kr/schema/2011/hpf";
var NS_OCF = "urn:oasis:names:tc:opendocument:xmlns:container";
// Numeric identifiers 0-8, one per inline text style.
// NOTE(review): presumably indices of character-style definitions written by
// generateHeaderXml — confirm; the numbering must stay in sync with it.
var CHAR_NORMAL = 0;
var CHAR_BOLD = 1;
var CHAR_ITALIC = 2;
var CHAR_BOLD_ITALIC = 3;
var CHAR_CODE = 4;
var CHAR_H1 = 5;
var CHAR_H2 = 6;
var CHAR_H3 = 7;
var CHAR_H4 = 8;
// Numeric identifiers 0-7, one per paragraph style.
// NOTE(review): presumably indices of paragraph-style definitions written by
// generateHeaderXml — confirm.
var PARA_NORMAL = 0;
var PARA_H1 = 1;
var PARA_H2 = 2;
var PARA_H3 = 3;
var PARA_H4 = 4;
var PARA_CODE = 5;
var PARA_QUOTE = 6;
var PARA_LIST = 7;
|
|
3181
|
+
/**
 * Converts Markdown text into an HWPX package.
 *
 * The Markdown is parsed into blocks, rendered to the section XML, and
 * zipped together with the generated container, manifest and header files.
 * The "mimetype" entry is added first and stored without compression.
 *
 * @param {string} markdown Markdown source.
 * @returns {Promise<ArrayBuffer>} The zipped package.
 */
async function markdownToHwpx(markdown) {
  const sectionXml = blocksToSectionXml(parseMarkdownToBlocks(markdown));

  const archive = new JSZip3();
  archive.file("mimetype", "application/hwp+zip", { compression: "STORE" });
  // Entries are added in this fixed order; each renderer runs right before
  // its entry is written.
  const parts = [
    ["META-INF/container.xml", () => generateContainerXml()],
    ["Contents/content.hpf", () => generateManifest()],
    ["Contents/header.xml", () => generateHeaderXml()],
    ["Contents/section0.xml", () => sectionXml]
  ];
  for (const [path, render] of parts) {
    archive.file(path, render());
  }

  const buffer = await archive.generateAsync({ type: "arraybuffer" });
  return buffer;
}
|
|
3192
|
+
function parseMarkdownToBlocks(md) {
|
|
3193
|
+
const lines = md.split("\n");
|
|
3194
|
+
const blocks = [];
|
|
3195
|
+
let i = 0;
|
|
3196
|
+
while (i < lines.length) {
|
|
3197
|
+
const line = lines[i];
|
|
3198
|
+
if (!line.trim()) {
|
|
3199
|
+
i++;
|
|
4883
3200
|
continue;
|
|
4884
3201
|
}
|
|
4885
|
-
const
|
|
4886
|
-
|
|
4887
|
-
|
|
4888
|
-
|
|
4889
|
-
|
|
4890
|
-
|
|
4891
|
-
|
|
4892
|
-
|
|
4893
|
-
|
|
4894
|
-
}
|
|
4895
|
-
if (!isNewRow) {
|
|
4896
|
-
const hasData = row.slice(dataColStart).some((c) => c !== "");
|
|
4897
|
-
const prevHasData = prev.slice(dataColStart).some((c) => c !== "");
|
|
4898
|
-
if (hasData && prevHasData) {
|
|
4899
|
-
isNewRow = true;
|
|
3202
|
+
const fenceMatch = line.match(/^(`{3,}|~{3,})(.*)$/);
|
|
3203
|
+
if (fenceMatch) {
|
|
3204
|
+
const fence = fenceMatch[1];
|
|
3205
|
+
const lang = fenceMatch[2].trim();
|
|
3206
|
+
const codeLines = [];
|
|
3207
|
+
i++;
|
|
3208
|
+
while (i < lines.length && !lines[i].startsWith(fence)) {
|
|
3209
|
+
codeLines.push(lines[i]);
|
|
3210
|
+
i++;
|
|
4900
3211
|
}
|
|
3212
|
+
if (i < lines.length) i++;
|
|
3213
|
+
blocks.push({ type: "code_block", text: codeLines.join("\n"), lang });
|
|
3214
|
+
continue;
|
|
4901
3215
|
}
|
|
4902
|
-
if (
|
|
4903
|
-
|
|
3216
|
+
if (/^(\*{3,}|-{3,}|_{3,})\s*$/.test(line.trim())) {
|
|
3217
|
+
blocks.push({ type: "hr" });
|
|
3218
|
+
i++;
|
|
3219
|
+
continue;
|
|
4904
3220
|
}
|
|
4905
|
-
|
|
4906
|
-
|
|
4907
|
-
|
|
4908
|
-
|
|
4909
|
-
|
|
4910
|
-
prev[c] = prev[c] ? prev[c] + " " + row[c] : row[c];
|
|
4911
|
-
}
|
|
4912
|
-
}
|
|
3221
|
+
const headingMatch = line.match(/^(#{1,6})\s+(.+)$/);
|
|
3222
|
+
if (headingMatch) {
|
|
3223
|
+
blocks.push({ type: "heading", text: headingMatch[2].trim(), level: headingMatch[1].length });
|
|
3224
|
+
i++;
|
|
3225
|
+
continue;
|
|
4913
3226
|
}
|
|
4914
|
-
|
|
4915
|
-
|
|
4916
|
-
|
|
4917
|
-
|
|
4918
|
-
|
|
4919
|
-
|
|
4920
|
-
|
|
4921
|
-
if (hasDataValues) break;
|
|
4922
|
-
headerEnd = r + 1;
|
|
4923
|
-
}
|
|
4924
|
-
if (headerEnd > 1) {
|
|
4925
|
-
const headerRow = Array(numCols).fill("");
|
|
4926
|
-
for (let r = 0; r < headerEnd; r++) {
|
|
4927
|
-
for (let c = 0; c < numCols; c++) {
|
|
4928
|
-
if (merged[r][c]) {
|
|
4929
|
-
headerRow[c] = headerRow[c] ? headerRow[c] + " " + merged[r][c] : merged[r][c];
|
|
3227
|
+
if (line.trimStart().startsWith("|")) {
|
|
3228
|
+
const tableRows = [];
|
|
3229
|
+
while (i < lines.length && lines[i].trimStart().startsWith("|")) {
|
|
3230
|
+
const row = lines[i];
|
|
3231
|
+
if (/^[\s|:\-]+$/.test(row)) {
|
|
3232
|
+
i++;
|
|
3233
|
+
continue;
|
|
4930
3234
|
}
|
|
3235
|
+
const cells = row.split("|").slice(1, -1).map((c) => c.trim());
|
|
3236
|
+
if (cells.length > 0) tableRows.push(cells);
|
|
3237
|
+
i++;
|
|
4931
3238
|
}
|
|
4932
|
-
|
|
4933
|
-
merged.splice(0, headerEnd, headerRow);
|
|
4934
|
-
}
|
|
4935
|
-
for (const row of merged) {
|
|
4936
|
-
for (let c = 0; c < row.length; c++) {
|
|
4937
|
-
if (row[c]) row[c] = collapseEvenSpacing(row[c]);
|
|
4938
|
-
}
|
|
4939
|
-
}
|
|
4940
|
-
const totalCells = merged.length * numCols;
|
|
4941
|
-
const filledCells = merged.reduce((s, row) => s + row.filter((c) => c).length, 0);
|
|
4942
|
-
if (filledCells < totalCells * 0.35 || merged.length < 2 || merged.length <= 3 && numCols >= 7) {
|
|
4943
|
-
return merged.map((r) => r.filter((c) => c).join(" ")).join("\n");
|
|
4944
|
-
}
|
|
4945
|
-
const md = [];
|
|
4946
|
-
md.push("| " + merged[0].join(" | ") + " |");
|
|
4947
|
-
md.push("| " + merged[0].map(() => "---").join(" | ") + " |");
|
|
4948
|
-
for (let r = 1; r < merged.length; r++) {
|
|
4949
|
-
md.push("| " + merged[r].join(" | ") + " |");
|
|
4950
|
-
}
|
|
4951
|
-
return md.join("\n");
|
|
4952
|
-
}
|
|
4953
|
-
function mergeLineSimple(items) {
|
|
4954
|
-
if (items.length <= 1) return items[0]?.text || "";
|
|
4955
|
-
const sorted = [...items].sort((a, b) => a.x - b.x);
|
|
4956
|
-
const isEvenSpaced = detectEvenSpacedItems(sorted);
|
|
4957
|
-
let result = sorted[0].text;
|
|
4958
|
-
for (let i = 1; i < sorted.length; i++) {
|
|
4959
|
-
const gap = sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w);
|
|
4960
|
-
const avgFs = (sorted[i].fontSize + sorted[i - 1].fontSize) / 2;
|
|
4961
|
-
const tabThreshold = Math.max(avgFs * 2, 30);
|
|
4962
|
-
if (gap > tabThreshold) {
|
|
4963
|
-
result += " ";
|
|
4964
|
-
result += sorted[i].text;
|
|
4965
|
-
continue;
|
|
4966
|
-
}
|
|
4967
|
-
if (isEvenSpaced[i]) {
|
|
4968
|
-
result += sorted[i].text;
|
|
3239
|
+
if (tableRows.length > 0) blocks.push({ type: "table", rows: tableRows });
|
|
4969
3240
|
continue;
|
|
4970
3241
|
}
|
|
4971
|
-
if (
|
|
4972
|
-
|
|
4973
|
-
|
|
3242
|
+
if (line.trimStart().startsWith("> ")) {
|
|
3243
|
+
const quoteLines = [];
|
|
3244
|
+
while (i < lines.length && (lines[i].trimStart().startsWith("> ") || lines[i].trimStart().startsWith(">"))) {
|
|
3245
|
+
quoteLines.push(lines[i].replace(/^>\s?/, ""));
|
|
3246
|
+
i++;
|
|
3247
|
+
}
|
|
3248
|
+
for (const ql of quoteLines) {
|
|
3249
|
+
blocks.push({ type: "blockquote", text: ql.trim() || "" });
|
|
3250
|
+
}
|
|
4974
3251
|
continue;
|
|
4975
3252
|
}
|
|
4976
|
-
|
|
4977
|
-
|
|
4978
|
-
|
|
3253
|
+
const listMatch = line.match(/^(\s*)([-*+]|\d+[.)]) (.+)$/);
|
|
3254
|
+
if (listMatch) {
|
|
3255
|
+
const indent = Math.floor(listMatch[1].length / 2);
|
|
3256
|
+
const ordered = /\d/.test(listMatch[2]);
|
|
3257
|
+
blocks.push({ type: "list_item", text: listMatch[3].trim(), ordered, indent });
|
|
3258
|
+
i++;
|
|
4979
3259
|
continue;
|
|
4980
3260
|
}
|
|
4981
|
-
|
|
4982
|
-
|
|
4983
|
-
} else if (gap > 3) result += " ";
|
|
4984
|
-
result += sorted[i].text;
|
|
3261
|
+
blocks.push({ type: "paragraph", text: line.trim() });
|
|
3262
|
+
i++;
|
|
4985
3263
|
}
|
|
4986
|
-
return
|
|
4987
|
-
}
|
|
4988
|
-
function cleanPdfText(text) {
|
|
4989
|
-
return mergeKoreanLines(
|
|
4990
|
-
text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
|
|
4991
|
-
).replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line)).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
|
|
4992
|
-
}
|
|
4993
|
-
function startsWithMarker(line) {
|
|
4994
|
-
const t = line.trimStart();
|
|
4995
|
-
return /^[가-힣ㄱ-ㅎ][.)]/.test(t) || /^\d+[.)]/.test(t) || /^\([가-힣ㄱ-ㅎ\d]+\)/.test(t) || /^[○●※▶▷◆◇■□★☆\-·]\s/.test(t) || /^제\d+[조항호장절]/.test(t);
|
|
4996
|
-
}
|
|
4997
|
-
function isStandaloneHeader(line) {
|
|
4998
|
-
return /^제\d+[조항호장절](\([^)]*\))?(\s+\S+){0,7}$/.test(line.trim());
|
|
3264
|
+
return blocks;
|
|
4999
3265
|
}
|
|
5000
|
-
function
|
|
5001
|
-
|
|
5002
|
-
|
|
5003
|
-
|
|
5004
|
-
|
|
5005
|
-
|
|
5006
|
-
|
|
5007
|
-
|
|
5008
|
-
|
|
3266
|
+
/**
 * Split one line of Markdown into a flat list of styled text spans.
 *
 * Images and links are reduced to their visible label (a link with an empty
 * label falls back to its URL) and strikethrough markers are dropped before
 * emphasis is scanned. Emphasis does not nest: each match of the pattern
 * becomes exactly one span.
 *
 * @param {string} text - Raw inline Markdown.
 * @returns {{text: string, bold: boolean, italic: boolean, code: boolean}[]}
 *   Spans in document order; always at least one span, even for empty input.
 */
function parseInlineMarkdown(text) {
  const plain = text
    .replace(/!\[([^\]]*)\]\([^)]*\)/g, "$1")
    .replace(/\[([^\]]*)\]\(([^)]*)\)/g, (_, label, url) => label || url)
    .replace(/~~([^~]+)~~/g, "$1");

  const spans = [];
  // Longer delimiters must come before shorter ones. The `_{3}` alternative is
  // required for `___bold italic___`: without it the `__` alternative matches
  // one character late and leaves stray underscores in the output.
  const emphasis = /(`[^`]+`|\*{3}[^*]+\*{3}|\*{2}[^*]+\*{2}|\*[^*]+\*|_{3}[^_]+_{3}|_{2}[^_]+_{2}|_[^_]+_)/g;
  let lastIdx = 0;

  for (const match of plain.matchAll(emphasis)) {
    const idx = match.index;
    if (idx > lastIdx) {
      spans.push({ text: plain.slice(lastIdx, idx), bold: false, italic: false, code: false });
    }
    const raw = match[0];
    if (raw.startsWith("`")) {
      spans.push({ text: raw.slice(1, -1), bold: false, italic: false, code: true });
    } else if (raw.startsWith("***") || raw.startsWith("___")) {
      spans.push({ text: raw.slice(3, -3), bold: true, italic: true, code: false });
    } else if (raw.startsWith("**") || raw.startsWith("__")) {
      spans.push({ text: raw.slice(2, -2), bold: true, italic: false, code: false });
    } else {
      spans.push({ text: raw.slice(1, -1), bold: false, italic: true, code: false });
    }
    lastIdx = idx + raw.length;
  }

  if (lastIdx < plain.length) {
    spans.push({ text: plain.slice(lastIdx), bold: false, italic: false, code: false });
  }
  // Empty input yields no matches and no tail; callers still expect one span.
  if (spans.length === 0) {
    spans.push({ text: plain, bold: false, italic: false, code: false });
  }
  return spans;
}
|
|
3298
|
+
/**
 * Map a styled span to the character-property id used for its run.
 * Code takes precedence over bold/italic.
 *
 * @param {{bold: boolean, italic: boolean, code: boolean}} span
 * @returns {*} One of the CHAR_* constants.
 */
function spanToCharPrId(span) {
  if (span.code) {
    return CHAR_CODE;
  }
  if (span.bold) {
    return span.italic ? CHAR_BOLD_ITALIC : CHAR_BOLD;
  }
  return span.italic ? CHAR_ITALIC : CHAR_NORMAL;
}
|
|
3305
|
+
/**
 * Escape text for use in XML character data or a double-quoted attribute.
 *
 * @param {string} text - Raw text.
 * @returns {string} Text with `&`, `<`, `>` and `"` replaced by entity references.
 */
function escapeXml(text) {
  // `&` must be handled first so the ampersands introduced by the other
  // replacements are not escaped a second time.
  return text
    .replace(/&/g, "&amp;")
    .replace(/</g, "&lt;")
    .replace(/>/g, "&gt;")
    .replace(/"/g, "&quot;");
}
|
|
3308
|
+
/**
 * Render inline Markdown as a sequence of `<hp:run>` elements.
 *
 * @param {string} text - Inline Markdown.
 * @param {*} [defaultCharPr=CHAR_NORMAL] - Character-property id for spans
 *   that carry no code/bold/italic styling.
 * @returns {string} Concatenated run XML with no separators.
 */
function generateRuns(text, defaultCharPr = CHAR_NORMAL) {
  let runsXml = "";
  for (const span of parseInlineMarkdown(text)) {
    const isStyled = span.code || span.bold || span.italic;
    const charId = isStyled ? spanToCharPrId(span) : defaultCharPr;
    runsXml += `<hp:run charPrIDRef="${charId}"><hp:t>${escapeXml(span.text)}</hp:t></hp:run>`;
  }
  return runsXml;
}
|
|
3315
|
+
/**
 * Render one `<hp:p>` paragraph.
 *
 * Code paragraphs bypass inline-Markdown parsing: the text is escaped and
 * emitted verbatim in a single CHAR_CODE run.
 *
 * @param {string} text - Paragraph text (inline Markdown unless code).
 * @param {*} [paraPrId=PARA_NORMAL] - Paragraph-property id.
 * @param {*} [charPrId=CHAR_NORMAL] - Default character-property id for unstyled spans.
 * @returns {string} Paragraph XML.
 */
function generateParagraph(text, paraPrId = PARA_NORMAL, charPrId = CHAR_NORMAL) {
  const content = paraPrId === PARA_CODE
    ? `<hp:run charPrIDRef="${CHAR_CODE}"><hp:t>${escapeXml(text)}</hp:t></hp:run>`
    : generateRuns(text, charPrId);
  return `<hp:p paraPrIDRef="${paraPrId}" styleIDRef="0">${content}</hp:p>`;
}
|
|
3322
|
+
/**
 * Paragraph-property id for a heading level.
 * Levels other than 1-3 share the H4 style.
 *
 * @param {number} level - Heading level.
 * @returns {*} One of PARA_H1..PARA_H4.
 */
function headingParaPrId(level) {
  switch (level) {
    case 1:
      return PARA_H1;
    case 2:
      return PARA_H2;
    case 3:
      return PARA_H3;
    default:
      return PARA_H4;
  }
}
|
|
3328
|
+
/**
 * Character-property id for a heading level.
 * Levels other than 1-3 share the H4 style.
 *
 * @param {number} level - Heading level.
 * @returns {*} One of CHAR_H1..CHAR_H4.
 */
function headingCharPrId(level) {
  switch (level) {
    case 1:
      return CHAR_H1;
    case 2:
      return CHAR_H2;
    case 3:
      return CHAR_H3;
    default:
      return CHAR_H4;
  }
}
|
|
3334
|
+
/**
 * Build the OCF container document that names `Contents/content.hpf` as the
 * package root file.
 *
 * @returns {string} Container XML.
 */
function generateContainerXml() {
  const lines = [
    `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>`,
    `<ocf:container xmlns:ocf="${NS_OCF}" xmlns:hpf="${NS_HPF}">`,
    `<ocf:rootfiles>`,
    `<ocf:rootfile full-path="Contents/content.hpf" media-type="application/hwpml-package+xml"/>`,
    `</ocf:rootfiles>`,
    `</ocf:container>`,
  ];
  return lines.join("\n");
}
|
|
3342
|
+
/**
 * Build the OPF package document: a manifest listing the header and the
 * single section, and a spine in which only the section is linear.
 *
 * @returns {string} Package XML.
 */
function generateManifest() {
  const lines = [
    `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>`,
    `<opf:package xmlns:opf="${NS_OPF}" xmlns:hpf="${NS_HPF}" xmlns:hh="${NS_HEAD}">`,
    `<opf:manifest>`,
    `<opf:item id="header" href="Contents/header.xml" media-type="application/xml"/>`,
    `<opf:item id="section0" href="Contents/section0.xml" media-type="application/xml"/>`,
    `</opf:manifest>`,
    `<opf:spine>`,
    `<opf:itemref idref="header" linear="no"/>`,
    `<opf:itemref idref="section0" linear="yes"/>`,
    `</opf:spine>`,
    `</opf:package>`,
  ];
  return lines.join("\n");
}
|
|
3355
|
+
/**
 * Build one `<hh:charPr>` definition.
 *
 * The same `fontId` is written for every language slot of `<hh:fontRef>`;
 * ratio/relSz are fixed at 100 and spacing/offset at 0.
 *
 * @param {number} id - Character-property id.
 * @param {number} height - Value written to the `height` attribute.
 * @param {boolean} bold - Emit `bold="1"` when truthy.
 * @param {boolean} italic - Emit `italic="1"` when truthy.
 * @param {number} [fontId=0] - Font id referenced by every language slot.
 * @returns {string} charPr XML.
 */
function charPr(id, height, bold, italic, fontId = 0) {
  const slots = ["hangul", "latin", "hanja", "japanese", "other", "symbol", "user"];
  const perSlot = (value) => slots.map((slot) => `${slot}="${value}"`).join(" ");

  let styleAttrs = "";
  if (bold) {
    styleAttrs += ` bold="1"`;
  }
  if (italic) {
    styleAttrs += ` italic="1"`;
  }

  return [
    ` <hh:charPr id="${id}" height="${height}" textColor="#000000" shadeColor="none" useFontSpace="0" useKerning="0" symMark="NONE" borderFillIDRef="0"${styleAttrs}>`,
    `<hh:fontRef ${perSlot(fontId)}/>`,
    `<hh:ratio ${perSlot(100)}/>`,
    `<hh:spacing ${perSlot(0)}/>`,
    `<hh:relSz ${perSlot(100)}/>`,
    `<hh:offset ${perSlot(0)}/>`,
    `</hh:charPr>`,
  ].join("\n");
}
|
|
3366
|
+
/**
 * Build one `<hh:paraPr>` definition.
 *
 * @param {number} id - Paragraph-property id.
 * @param {object} [opts]
 * @param {string} [opts.align="JUSTIFY"] - Horizontal alignment.
 * @param {number} [opts.spaceBefore=0] - Written to the margin `prev` attribute.
 * @param {number} [opts.spaceAfter=0] - Written to the margin `next` attribute.
 * @param {number} [opts.lineSpacing=160] - Percent line spacing.
 * @param {number} [opts.indent=0] - Written to the margin `indent` attribute.
 * @returns {string} paraPr XML.
 */
function paraPr(id, opts = {}) {
  const {
    align = "JUSTIFY",
    spaceBefore = 0,
    spaceAfter = 0,
    lineSpacing = 160,
    indent = 0,
  } = opts;

  const parts = [];
  parts.push(` <hh:paraPr id="${id}" tabPrIDRef="0" condense="0" fontLineHeight="0" snapToGrid="1" suppressLineNumbers="0" checked="0" textDir="AUTO">`);
  parts.push(`<hh:align horizontal="${align}" vertical="BASELINE"/>`);
  parts.push(`<hh:heading type="NONE" idRef="0" level="0"/>`);
  parts.push(`<hh:breakSetting breakLatinWord="KEEP_WORD" breakNonLatinWord="BREAK_WORD" widowOrphan="0" keepWithNext="0" keepLines="0" pageBreakBefore="0" lineWrap="BREAK"/>`);
  parts.push(`<hh:autoSpacing eAsianEng="0" eAsianNum="0"/>`);
  parts.push(`<hh:margin indent="${indent}" left="0" right="0" prev="${spaceBefore}" next="${spaceAfter}"/>`);
  parts.push(`<hh:lineSpacing type="PERCENT" value="${lineSpacing}"/>`);
  parts.push(`<hh:border borderFillIDRef="0" offsetLeft="0" offsetRight="0" offsetTop="0" offsetBottom="0" connect="0" ignoreMargin="0"/>`);
  parts.push(`</hh:paraPr>`);
  return parts.join("\n");
}
|
|
3378
|
+
/**
 * Build `Contents/header.xml`: the font faces, the single border fill, the
 * character and paragraph property tables, and the one base paragraph style.
 *
 * Character properties 0-3 are the 1000-height normal/bold/italic/bold-italic
 * variants on font 0; 4-8 use font 1 (heights 900, 1800, 1400, 1200, 1100,
 * with 5-8 bold). Paragraph properties 0-7 are emitted in id order.
 *
 * NOTE(review): charPr applies its font id to every language slot, but only
 * the HANGUL and LATIN faces declare a font with id 1 here — confirm the
 * consuming application tolerates that for ids 4-8.
 *
 * @returns {string} Header XML.
 */
function generateHeaderXml() {
  const charProperties = [
    charPr(0, 1e3, false, false),
    charPr(1, 1e3, true, false),
    charPr(2, 1e3, false, true),
    charPr(3, 1e3, true, true),
    charPr(4, 900, false, false, 1),
    charPr(5, 1800, true, false, 1),
    charPr(6, 1400, true, false, 1),
    charPr(7, 1200, true, false, 1),
    charPr(8, 1100, true, false, 1),
  ].join("\n");

  const paraProperties = [
    paraPr(0),
    paraPr(1, { align: "LEFT", spaceBefore: 800, spaceAfter: 200, lineSpacing: 180 }),
    paraPr(2, { align: "LEFT", spaceBefore: 600, spaceAfter: 150, lineSpacing: 170 }),
    paraPr(3, { align: "LEFT", spaceBefore: 400, spaceAfter: 100, lineSpacing: 160 }),
    paraPr(4, { align: "LEFT", spaceBefore: 300, spaceAfter: 100, lineSpacing: 160 }),
    paraPr(5, { align: "LEFT", lineSpacing: 130, indent: 400 }),
    paraPr(6, { align: "LEFT", lineSpacing: 150, indent: 600 }),
    paraPr(7, { align: "LEFT", lineSpacing: 160, indent: 600 }),
  ].join("\n");

  return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
<hh:head xmlns:hh="${NS_HEAD}" xmlns:hp="${NS_PARA}" version="1.4" secCnt="1">
<hh:beginNum page="1" footnote="1" endnote="1" pic="1" tbl="1" equation="1"/>
<hh:refList>
<hh:fontfaces itemCnt="7">
<hh:fontface lang="HANGUL" fontCnt="2">
<hh:font id="0" face="\uD568\uCD08\uB86C\uBC14\uD0D5" type="TTF" isEmbedded="0">
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="4" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
</hh:font>
<hh:font id="1" face="\uD568\uCD08\uB86C\uB3CB\uC6C0" type="TTF" isEmbedded="0">
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="4" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
</hh:font>
</hh:fontface>
<hh:fontface lang="LATIN" fontCnt="2">
<hh:font id="0" face="Times New Roman" type="TTF" isEmbedded="0">
<hh:typeInfo familyType="FCAT_OLDSTYLE" weight="5" proportion="4" contrast="2" strokeVariation="0" armStyle="0" letterform="0" midline="0" xHeight="4"/>
</hh:font>
<hh:font id="1" face="Consolas" type="TTF" isEmbedded="0">
<hh:typeInfo familyType="FCAT_MODERN" weight="5" proportion="0" contrast="0" strokeVariation="0" armStyle="0" letterform="0" midline="0" xHeight="0"/>
</hh:font>
</hh:fontface>
<hh:fontface lang="HANJA" fontCnt="1">
<hh:font id="0" face="\uD568\uCD08\uB86C\uBC14\uD0D5" type="TTF" isEmbedded="0">
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="4" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
</hh:font>
</hh:fontface>
<hh:fontface lang="JAPANESE" fontCnt="1">
<hh:font id="0" face="\uAD74\uB9BC" type="TTF" isEmbedded="0">
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
</hh:font>
</hh:fontface>
<hh:fontface lang="OTHER" fontCnt="1">
<hh:font id="0" face="\uAD74\uB9BC" type="TTF" isEmbedded="0">
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
</hh:font>
</hh:fontface>
<hh:fontface lang="SYMBOL" fontCnt="1">
<hh:font id="0" face="Symbol" type="TTF" isEmbedded="0">
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
</hh:font>
</hh:fontface>
<hh:fontface lang="USER" fontCnt="1">
<hh:font id="0" face="\uAD74\uB9BC" type="TTF" isEmbedded="0">
<hh:typeInfo familyType="FCAT_GOTHIC" weight="6" proportion="0" contrast="0" strokeVariation="1" armStyle="1" letterform="1" midline="1" xHeight="1"/>
</hh:font>
</hh:fontface>
</hh:fontfaces>
<hh:borderFills itemCnt="1">
<hh:borderFill id="0" threeD="0" shadow="0" centerLine="0" breakCellSeparateLine="0">
<hh:slash type="NONE" Crooked="0" isCounter="0"/>
<hh:backSlash type="NONE" Crooked="0" isCounter="0"/>
<hh:leftBorder type="NONE" width="0.1mm" color="#000000"/>
<hh:rightBorder type="NONE" width="0.1mm" color="#000000"/>
<hh:topBorder type="NONE" width="0.1mm" color="#000000"/>
<hh:bottomBorder type="NONE" width="0.1mm" color="#000000"/>
<hh:diagonal type="NONE" width="0.1mm" color="#000000"/>
<hh:fillInfo/>
</hh:borderFill>
</hh:borderFills>
<hh:charProperties itemCnt="9">
${charProperties}
</hh:charProperties>
<hh:tabProperties itemCnt="0"/>
<hh:numberings itemCnt="0"/>
<hh:bullets itemCnt="0"/>
<hh:paraProperties itemCnt="8">
${paraProperties}
</hh:paraProperties>
<hh:styles itemCnt="1">
<hh:style id="0" type="PARA" name="\uBC14\uD0D5\uAE00" engName="Normal" paraPrIDRef="0" charPrIDRef="0" nextStyleIDRef="0" langIDRef="1042" lockForm="0"/>
</hh:styles>
</hh:refList>
<hh:compatibleDocument targetProgram="HWP2018"/>
</hh:head>`;
}
|
|
3469
|
+
/**
 * Build the constant `<hp:secPr>` element (grid, numbering start, visibility,
 * page size and margins, footnote and endnote settings) that is embedded in
 * the first run of a section.
 *
 * @returns {string} secPr XML on a single line.
 */
function generateSecPr() {
  const open = '<hp:secPr textDirection="HORIZONTAL" spaceColumns="1134" tabStop="8000" outlineShapeIDRef="0" memoShapeIDRef="0" textVerticalWidthHead="0" masterPageCnt="0">';
  const grid = '<hp:grid lineGrid="0" charGrid="0" wonggojiFormat="0"/>';
  const startNum = '<hp:startNum pageStartsOn="BOTH" page="0" pic="0" tbl="0" equation="0"/>';
  const visibility = '<hp:visibility hideFirstHeader="0" hideFirstFooter="0" hideFirstMasterPage="0" border="SHOW_ALL" fill="SHOW_ALL" hideFirstPageNum="0" hideFirstEmptyLine="0" showLineNumber="0"/>';
  const page = '<hp:pagePr landscape="WIDELY" width="59528" height="84188" gutterType="LEFT_ONLY"><hp:margin header="2835" footer="2835" gutter="0" left="5670" right="4252" top="8504" bottom="4252"/></hp:pagePr>';
  const footNote = '<hp:footNotePr><hp:autoNumFormat type="DIGIT" userChar="" prefixChar="" suffixChar=")" supscript="0"/><hp:noteLine length="-1" type="SOLID" width="0.12 mm" color="#000000"/><hp:noteSpacing betweenNotes="283" belowLine="567" aboveLine="850"/><hp:numbering type="CONTINUOUS" newNum="1"/><hp:placement place="EACH_COLUMN" beneathText="0"/></hp:footNotePr>';
  const endNote = '<hp:endNotePr><hp:autoNumFormat type="DIGIT" userChar="" prefixChar="" suffixChar=")" supscript="0"/><hp:noteLine length="14692344" type="SOLID" width="0.12 mm" color="#000000"/><hp:noteSpacing betweenNotes="0" belowLine="567" aboveLine="850"/><hp:numbering type="CONTINUOUS" newNum="1"/><hp:placement place="END_OF_DOCUMENT" beneathText="0"/></hp:endNotePr>';
  const close = "</hp:secPr>";
  return [open, grid, startNum, visibility, page, footNote, endNote, close].join("");
}
|
|
3472
|
+
/**
 * Render a table as `<hp:tbl>` with one `<hp:tr>` per row and one 1x1
 * `<hp:tc>` per cell. Cell text is treated as inline Markdown.
 *
 * @param {string[][]} rows - Table rows, each an array of cell strings.
 * @returns {string} Table XML.
 */
function generateTable(rows) {
  const renderCell = (cell) =>
    `<hp:tc><hp:cellSpan colSpan="1" rowSpan="1"/><hp:p paraPrIDRef="0" styleIDRef="0">${generateRuns(cell)}</hp:p></hp:tc>`;
  const renderRow = (row) => `<hp:tr>${row.map(renderCell).join("")}</hp:tr>`;
  const body = rows.map(renderRow).join("");
  return `<hp:tbl>${body}</hp:tbl>`;
}
|
|
3482
|
+
/**
 * Render parsed Markdown blocks as the section XML document.
 *
 * The section properties (`generateSecPr()`) must appear once, inside the
 * first run of the section. For a non-table first block they are spliced into
 * that block's first run; when the first rendered block is a table, an empty
 * carrier paragraph is emitted before it. An input that renders nothing still
 * produces one empty paragraph carrying the section properties.
 *
 * Ordered list items are numbered with a running counter per indent level.
 * A counter continues across deeper nested items, restarts after any
 * non-list block, and restarts at a level when an unordered item appears at
 * that level or shallower. Numbering always starts at 1, because the blocks
 * do not carry the source list's own start number.
 *
 * @param {Array<{type: string, text?: string, level?: number, ordered?: boolean,
 *   indent?: number, rows?: string[][]}>} blocks - Blocks in document order.
 * @returns {string} Section XML.
 */
function blocksToSectionXml(blocks) {
  const paraXmls = [];
  let isFirst = true;
  // orderedCounters[depth] = number most recently issued at that indent depth.
  const orderedCounters = [];

  for (const block of blocks) {
    let xml = "";
    if (block.type !== "list_item") {
      // Any non-list block ends the current list, so numbering restarts.
      orderedCounters.length = 0;
    }

    switch (block.type) {
      case "heading": {
        const pId = headingParaPrId(block.level || 1);
        const cId = headingCharPrId(block.level || 1);
        xml = generateParagraph(block.text || "", pId, cId);
        break;
      }
      case "paragraph":
        xml = generateParagraph(block.text || "");
        break;
      case "code_block": {
        // One paragraph per source line; a blank line becomes a single space.
        const codeLines = (block.text || "").split("\n");
        xml = codeLines.map((line) => generateParagraph(line || " ", PARA_CODE)).join("\n ");
        break;
      }
      case "blockquote":
        xml = generateParagraph(block.text || "", PARA_QUOTE);
        break;
      case "list_item": {
        const depth = block.indent || 0;
        let marker = "\xB7 ";
        if (block.ordered) {
          // Drop counters of deeper levels, then advance this level.
          orderedCounters.length = depth + 1;
          orderedCounters[depth] = (orderedCounters[depth] || 0) + 1;
          marker = `${orderedCounters[depth]}. `;
        } else if (orderedCounters.length > depth) {
          // A bullet at this depth interrupts ordered numbering here and below.
          orderedCounters.length = depth;
        }
        const indentPrefix = " ".repeat(depth);
        xml = generateParagraph(indentPrefix + marker + (block.text || ""), PARA_LIST);
        break;
      }
      case "hr":
        xml = `<hp:p paraPrIDRef="0" styleIDRef="0"><hp:run charPrIDRef="0"><hp:t>\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500\u2500</hp:t></hp:run></hp:p>`;
        break;
      case "table":
        if (block.rows) {
          if (isFirst) {
            // A table has no leading run to host the section properties.
            const secRun = `<hp:run charPrIDRef="0">${generateSecPr()}<hp:t></hp:t></hp:run>`;
            paraXmls.push(`<hp:p paraPrIDRef="0" styleIDRef="0">${secRun}</hp:p>`);
            isFirst = false;
          }
          xml = generateTable(block.rows);
        }
        break;
    }

    if (!xml) continue;
    if (isFirst && block.type !== "table") {
      // Non-global regex: only the very first run receives the section properties.
      xml = xml.replace(
        /<hp:run charPrIDRef="(\d+)">/,
        `<hp:run charPrIDRef="$1">${generateSecPr()}`
      );
      isFirst = false;
    }
    paraXmls.push(xml);
  }

  if (paraXmls.length === 0) {
    paraXmls.push(`<hp:p paraPrIDRef="0" styleIDRef="0"><hp:run charPrIDRef="0">${generateSecPr()}<hp:t></hp:t></hp:run></hp:p>`);
  }
  return `<?xml version="1.0" encoding="UTF-8" standalone="yes" ?>
<hs:sec xmlns:hs="${NS_SECTION}" xmlns:hp="${NS_PARA}">
${paraXmls.join("\n ")}
</hs:sec>`;
}
|
|
5191
3543
|
|
|
5192
3544
|
// src/index.ts
|
|
5193
3545
|
import { readFile } from "fs/promises";
|
|
5194
3546
|
|
|
5195
3547
|
// src/xlsx/parser.ts
|
|
5196
|
-
import
|
|
5197
|
-
import { DOMParser as
|
|
3548
|
+
import JSZip4 from "jszip";
|
|
3549
|
+
import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
|
|
5198
3550
|
var MAX_SHEETS = 100;
|
|
5199
3551
|
var MAX_DECOMPRESS_SIZE3 = 100 * 1024 * 1024;
|
|
5200
3552
|
var MAX_ROWS2 = 1e4;
|
|
@@ -5231,7 +3583,7 @@ function getTextContent(el) {
|
|
|
5231
3583
|
return el.textContent?.trim() ?? "";
|
|
5232
3584
|
}
|
|
5233
3585
|
/**
 * Parse an XML string into a DOM document after passing it through `stripDtd`.
 *
 * @param {string} text - XML source.
 * @returns {Document} Parsed document.
 */
function parseXml(text) {
  const parser = new DOMParser3();
  const withoutDtd = stripDtd(text);
  return parser.parseFromString(withoutDtd, "text/xml");
}
|
|
5236
3588
|
function parseSharedStrings(xml) {
|
|
5237
3589
|
const doc = parseXml(xml);
|
|
@@ -5384,7 +3736,7 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
5384
3736
|
}
|
|
5385
3737
|
async function parseXlsxDocument(buffer, options) {
|
|
5386
3738
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
|
|
5387
|
-
const zip = await
|
|
3739
|
+
const zip = await JSZip4.loadAsync(buffer);
|
|
5388
3740
|
const warnings = [];
|
|
5389
3741
|
const workbookFile = zip.file("xl/workbook.xml");
|
|
5390
3742
|
if (!workbookFile) {
|
|
@@ -5474,24 +3826,24 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
5474
3826
|
}
|
|
5475
3827
|
|
|
5476
3828
|
// src/docx/parser.ts
|
|
5477
|
-
import
|
|
5478
|
-
import { DOMParser as
|
|
3829
|
+
import JSZip5 from "jszip";
|
|
3830
|
+
import { DOMParser as DOMParser4 } from "@xmldom/xmldom";
|
|
5479
3831
|
var MAX_DECOMPRESS_SIZE4 = 100 * 1024 * 1024;
|
|
5480
|
-
function getChildElements(parent,
|
|
3832
|
+
/**
 * Collect the direct element children of `parent` with the given local name.
 *
 * A child matches when its `localName` equals the name, or when its `tagName`
 * ends with `:<name>`.
 *
 * @param {Node} parent - Node whose `childNodes` are scanned (not recursive).
 * @param {string} localName2 - Local element name to match.
 * @returns {Element[]} Matching children in document order.
 */
function getChildElements(parent, localName2) {
  const matches = [];
  const nodes = parent.childNodes;
  for (let idx = 0; idx < nodes.length; idx += 1) {
    const candidate = nodes[idx];
    if (candidate.nodeType !== 1) {
      continue;
    }
    const nameMatches =
      candidate.localName === localName2 || candidate.tagName?.endsWith(`:${localName2}`);
    if (nameMatches) {
      matches.push(candidate);
    }
  }
  return matches;
}
|
|
5494
|
-
function findElements(parent,
|
|
3846
|
+
function findElements(parent, localName2) {
|
|
5495
3847
|
const result = [];
|
|
5496
3848
|
const walk = (node) => {
|
|
5497
3849
|
const children = node.childNodes;
|
|
@@ -5499,7 +3851,7 @@ function findElements(parent, localName) {
|
|
|
5499
3851
|
const child = children[i];
|
|
5500
3852
|
if (child.nodeType === 1) {
|
|
5501
3853
|
const el = child;
|
|
5502
|
-
if (el.localName ===
|
|
3854
|
+
if (el.localName === localName2 || el.tagName?.endsWith(`:${localName2}`)) {
|
|
5503
3855
|
result.push(el);
|
|
5504
3856
|
}
|
|
5505
3857
|
walk(el);
|
|
@@ -5509,16 +3861,16 @@ function findElements(parent, localName) {
|
|
|
5509
3861
|
walk(parent);
|
|
5510
3862
|
return result;
|
|
5511
3863
|
}
|
|
5512
|
-
function getAttr(el,
|
|
3864
|
+
/**
 * Look up an attribute value by local name or by full attribute name.
 *
 * @param {Element} el - Element whose `attributes` are scanned in order.
 * @param {string} localName2 - Name compared against each attribute's
 *   `localName` and `name`.
 * @returns {string|null} Value of the first matching attribute, or null.
 */
function getAttr(el, localName2) {
  const list = el.attributes;
  for (let idx = 0; idx < list.length; idx += 1) {
    const candidate = list[idx];
    const matched = candidate.localName === localName2 || candidate.name === localName2;
    if (matched) {
      return candidate.value;
    }
  }
  return null;
}
|
|
5520
3872
|
/**
 * Parse an XML string into a DOM document after passing it through `stripDtd`.
 *
 * @param {string} text - XML source.
 * @returns {Document} Parsed document.
 */
function parseXml2(text) {
  const parser = new DOMParser4();
  const withoutDtd = stripDtd(text);
  return parser.parseFromString(withoutDtd, "text/xml");
}
|
|
5523
3875
|
function parseStyles(xml) {
|
|
5524
3876
|
const doc = parseXml2(xml);
|
|
@@ -5812,7 +4164,7 @@ async function extractImages(zip, rels, doc) {
|
|
|
5812
4164
|
}
|
|
5813
4165
|
async function parseDocxDocument(buffer, options) {
|
|
5814
4166
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
|
|
5815
|
-
const zip = await
|
|
4167
|
+
const zip = await JSZip5.loadAsync(buffer);
|
|
5816
4168
|
const warnings = [];
|
|
5817
4169
|
const docFile = zip.file("word/document.xml");
|
|
5818
4170
|
if (!docFile) {
|
|
@@ -5860,11 +4212,11 @@ async function parseDocxDocument(buffer, options) {
|
|
|
5860
4212
|
const node = children[i];
|
|
5861
4213
|
if (node.nodeType !== 1) continue;
|
|
5862
4214
|
const el = node;
|
|
5863
|
-
const
|
|
5864
|
-
if (
|
|
4215
|
+
const localName2 = el.localName ?? el.tagName?.split(":").pop();
|
|
4216
|
+
if (localName2 === "p") {
|
|
5865
4217
|
const block = parseParagraph(el, styles, numbering, footnotes, rels);
|
|
5866
4218
|
if (block) blocks.push(block);
|
|
5867
|
-
} else if (
|
|
4219
|
+
} else if (localName2 === "tbl") {
|
|
5868
4220
|
const block = parseTable(el, styles, numbering, footnotes, rels);
|
|
5869
4221
|
if (block) blocks.push(block);
|
|
5870
4222
|
}
|
|
@@ -5902,135 +4254,6 @@ async function parseDocxDocument(buffer, options) {
|
|
|
5902
4254
|
};
|
|
5903
4255
|
}
|
|
5904
4256
|
|
|
5905
|
-
// src/form/recognize.ts
|
|
5906
|
-
var LABEL_KEYWORDS = /* @__PURE__ */ new Set([
|
|
5907
|
-
"\uC131\uBA85",
|
|
5908
|
-
"\uC774\uB984",
|
|
5909
|
-
"\uC8FC\uC18C",
|
|
5910
|
-
"\uC804\uD654",
|
|
5911
|
-
"\uC804\uD654\uBC88\uD638",
|
|
5912
|
-
"\uD734\uB300\uD3F0",
|
|
5913
|
-
"\uD578\uB4DC\uD3F0",
|
|
5914
|
-
"\uC5F0\uB77D\uCC98",
|
|
5915
|
-
"\uC0DD\uB144\uC6D4\uC77C",
|
|
5916
|
-
"\uC8FC\uBBFC\uB4F1\uB85D\uBC88\uD638",
|
|
5917
|
-
"\uC18C\uC18D",
|
|
5918
|
-
"\uC9C1\uC704",
|
|
5919
|
-
"\uC9C1\uAE09",
|
|
5920
|
-
"\uBD80\uC11C",
|
|
5921
|
-
"\uC774\uBA54\uC77C",
|
|
5922
|
-
"\uD329\uC2A4",
|
|
5923
|
-
"\uD559\uAD50",
|
|
5924
|
-
"\uD559\uB144",
|
|
5925
|
-
"\uBC18",
|
|
5926
|
-
"\uBC88\uD638",
|
|
5927
|
-
"\uC2E0\uCCAD\uC778",
|
|
5928
|
-
"\uB300\uD45C\uC790",
|
|
5929
|
-
"\uB2F4\uB2F9\uC790",
|
|
5930
|
-
"\uC791\uC131\uC790",
|
|
5931
|
-
"\uD655\uC778\uC790",
|
|
5932
|
-
"\uC2B9\uC778\uC790",
|
|
5933
|
-
"\uC77C\uC2DC",
|
|
5934
|
-
"\uB0A0\uC9DC",
|
|
5935
|
-
"\uAE30\uAC04",
|
|
5936
|
-
"\uC7A5\uC18C",
|
|
5937
|
-
"\uBAA9\uC801",
|
|
5938
|
-
"\uC0AC\uC720",
|
|
5939
|
-
"\uBE44\uACE0",
|
|
5940
|
-
"\uAE08\uC561",
|
|
5941
|
-
"\uC218\uB7C9",
|
|
5942
|
-
"\uB2E8\uAC00",
|
|
5943
|
-
"\uD569\uACC4",
|
|
5944
|
-
"\uACC4",
|
|
5945
|
-
"\uC18C\uACC4"
|
|
5946
|
-
]);
|
|
5947
|
-
function isLabelCell(text) {
|
|
5948
|
-
const trimmed = text.trim();
|
|
5949
|
-
if (!trimmed || trimmed.length > 30) return false;
|
|
5950
|
-
for (const kw of LABEL_KEYWORDS) {
|
|
5951
|
-
if (trimmed.includes(kw)) return true;
|
|
5952
|
-
}
|
|
5953
|
-
if (/^[가-힣\s()·:]{2,8}$/.test(trimmed) && !/\d/.test(trimmed)) return true;
|
|
5954
|
-
if (/^[가-힣A-Za-z\s]+[::]$/.test(trimmed)) return true;
|
|
5955
|
-
return false;
|
|
5956
|
-
}
|
|
5957
|
-
function extractFormFields(blocks) {
|
|
5958
|
-
const fields = [];
|
|
5959
|
-
let totalTables = 0;
|
|
5960
|
-
let formTables = 0;
|
|
5961
|
-
for (const block of blocks) {
|
|
5962
|
-
if (block.type !== "table" || !block.table) continue;
|
|
5963
|
-
totalTables++;
|
|
5964
|
-
const tableFields = extractFromTable(block.table);
|
|
5965
|
-
if (tableFields.length > 0) {
|
|
5966
|
-
formTables++;
|
|
5967
|
-
fields.push(...tableFields);
|
|
5968
|
-
}
|
|
5969
|
-
}
|
|
5970
|
-
for (const block of blocks) {
|
|
5971
|
-
if (block.type === "paragraph" && block.text) {
|
|
5972
|
-
const inlineFields = extractInlineFields(block.text);
|
|
5973
|
-
fields.push(...inlineFields);
|
|
5974
|
-
}
|
|
5975
|
-
}
|
|
5976
|
-
const confidence = totalTables > 0 ? formTables / totalTables : fields.length > 0 ? 0.3 : 0;
|
|
5977
|
-
return { fields, confidence: Math.min(confidence, 1) };
|
|
5978
|
-
}
|
|
5979
|
-
function extractFromTable(table) {
|
|
5980
|
-
const fields = [];
|
|
5981
|
-
if (table.cols >= 2) {
|
|
5982
|
-
for (let r = 0; r < table.rows; r++) {
|
|
5983
|
-
for (let c = 0; c < table.cols - 1; c++) {
|
|
5984
|
-
const labelCell = table.cells[r][c];
|
|
5985
|
-
const valueCell = table.cells[r][c + 1];
|
|
5986
|
-
if (isLabelCell(labelCell.text) && valueCell.text.trim()) {
|
|
5987
|
-
fields.push({
|
|
5988
|
-
label: labelCell.text.trim().replace(/[::]\s*$/, ""),
|
|
5989
|
-
value: valueCell.text.trim(),
|
|
5990
|
-
row: r,
|
|
5991
|
-
col: c
|
|
5992
|
-
});
|
|
5993
|
-
}
|
|
5994
|
-
}
|
|
5995
|
-
}
|
|
5996
|
-
}
|
|
5997
|
-
if (fields.length === 0 && table.rows >= 2 && table.cols >= 2) {
|
|
5998
|
-
const headerRow = table.cells[0];
|
|
5999
|
-
const allLabels = headerRow.every((cell) => {
|
|
6000
|
-
const t = cell.text.trim();
|
|
6001
|
-
return t.length > 0 && t.length <= 20;
|
|
6002
|
-
});
|
|
6003
|
-
if (allLabels) {
|
|
6004
|
-
for (let r = 1; r < table.rows; r++) {
|
|
6005
|
-
for (let c = 0; c < table.cols; c++) {
|
|
6006
|
-
const label = headerRow[c].text.trim();
|
|
6007
|
-
const value = table.cells[r][c].text.trim();
|
|
6008
|
-
if (label && value) {
|
|
6009
|
-
fields.push({ label, value, row: r, col: c });
|
|
6010
|
-
}
|
|
6011
|
-
}
|
|
6012
|
-
}
|
|
6013
|
-
}
|
|
6014
|
-
}
|
|
6015
|
-
return fields;
|
|
6016
|
-
}
|
|
6017
|
-
function extractInlineFields(text) {
|
|
6018
|
-
const fields = [];
|
|
6019
|
-
const pattern = /([가-힣A-Za-z]{2,10})\s*[::]\s*([^\n,;]{1,100})/g;
|
|
6020
|
-
let match;
|
|
6021
|
-
while ((match = pattern.exec(text)) !== null) {
|
|
6022
|
-
const label = match[1].trim();
|
|
6023
|
-
const value = match[2].trim();
|
|
6024
|
-
if (value) {
|
|
6025
|
-
fields.push({ label, value, row: -1, col: -1 });
|
|
6026
|
-
}
|
|
6027
|
-
}
|
|
6028
|
-
return fields;
|
|
6029
|
-
}
|
|
6030
|
-
|
|
6031
|
-
// src/hwpx/generator.ts
|
|
6032
|
-
import JSZip4 from "jszip";
|
|
6033
|
-
|
|
6034
4257
|
// src/index.ts
|
|
6035
4258
|
async function parse(input, options) {
|
|
6036
4259
|
let buffer;
|
|
@@ -6083,6 +4306,18 @@ async function parseHwp(buffer, options) {
|
|
|
6083
4306
|
}
|
|
6084
4307
|
}
|
|
6085
4308
|
async function parsePdf(buffer, options) {
|
|
4309
|
+
let parsePdfDocument;
|
|
4310
|
+
try {
|
|
4311
|
+
const mod = await import("./parser-PXD73E4H.js");
|
|
4312
|
+
parsePdfDocument = mod.parsePdfDocument;
|
|
4313
|
+
} catch {
|
|
4314
|
+
return {
|
|
4315
|
+
success: false,
|
|
4316
|
+
fileType: "pdf",
|
|
4317
|
+
error: "PDF \uD30C\uC2F1\uC5D0 pdfjs-dist\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4. \uC124\uCE58: npm install pdfjs-dist",
|
|
4318
|
+
code: "MISSING_DEPENDENCY"
|
|
4319
|
+
};
|
|
4320
|
+
}
|
|
6086
4321
|
try {
|
|
6087
4322
|
const { markdown, blocks, metadata, outline, warnings, isImageBased } = await parsePdfDocument(buffer, options);
|
|
6088
4323
|
return { success: true, fileType: "pdf", markdown, blocks, metadata, outline, warnings, isImageBased };
|
|
@@ -6286,16 +4521,13 @@ function diffTableCells(a, b) {
|
|
|
6286
4521
|
}
|
|
6287
4522
|
|
|
6288
4523
|
export {
|
|
6289
|
-
VERSION,
|
|
6290
|
-
toArrayBuffer,
|
|
6291
|
-
KordocError,
|
|
6292
|
-
sanitizeError,
|
|
6293
|
-
blocksToMarkdown,
|
|
6294
4524
|
extractHwpxMetadataOnly,
|
|
6295
4525
|
extractHwp5MetadataOnly,
|
|
6296
|
-
extractPdfMetadataOnly,
|
|
6297
|
-
compare,
|
|
6298
4526
|
extractFormFields,
|
|
4527
|
+
fillFormFields,
|
|
4528
|
+
fillHwpx,
|
|
4529
|
+
markdownToHwpx,
|
|
4530
|
+
compare,
|
|
6299
4531
|
parse
|
|
6300
4532
|
};
|
|
6301
|
-
//# sourceMappingURL=chunk-
|
|
4533
|
+
//# sourceMappingURL=chunk-RQWICKON.js.map
|