kordoc 2.8.0 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +469 -450
- package/dist/{chunk-3QA624ON.js → chunk-M24KMDAR.js} +6 -6
- package/dist/chunk-M24KMDAR.js.map +1 -0
- package/dist/{chunk-5CJGKKMZ.js → chunk-MEPHGCPQ.js} +1 -1
- package/dist/chunk-MEPHGCPQ.js.map +1 -0
- package/dist/chunk-MOL7MDBG.js +0 -0
- package/dist/chunk-MUOQXDZ4.cjs.map +1 -1
- package/dist/{chunk-HXWPJPRO.cjs → chunk-QB7CS534.cjs} +2 -2
- package/dist/chunk-QB7CS534.cjs.map +1 -0
- package/dist/{chunk-DLQY6FJH.js → chunk-RXZLTACX.js} +2 -2
- package/dist/chunk-RXZLTACX.js.map +1 -0
- package/dist/{chunk-XSF3N6GU.js → chunk-SJ5TPMBT.js} +2 -2
- package/dist/chunk-SJ5TPMBT.js.map +1 -0
- package/dist/cli.js +4 -4
- package/dist/cli.js.map +1 -1
- package/dist/{detect-PJZMUL2Z.js → detect-RI2MQ33K.js} +2 -2
- package/dist/formula-JCNF43NE.js +0 -0
- package/dist/formula-XGG6ZP42.cjs.map +1 -1
- package/dist/index.cjs +99 -99
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +28 -0
- package/dist/index.d.ts +28 -0
- package/dist/index.js +4 -4
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -5
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-3C7UGGEK.cjs.map +1 -1
- package/dist/page-range-737B4EZW.js +0 -0
- package/dist/{parser-LKF6PGPD.cjs → parser-EL5YETUA.cjs} +159 -19
- package/dist/parser-EL5YETUA.cjs.map +1 -0
- package/dist/{parser-ZQQM6J7T.js → parser-OMPBVEFU.js} +146 -6
- package/dist/parser-OMPBVEFU.js.map +1 -0
- package/dist/{parser-UCO6WPUW.js → parser-XBYGROQB.js} +146 -6
- package/dist/parser-XBYGROQB.js.map +1 -0
- package/dist/{provider-WPIYEALY.js → provider-2SEHU2FM.js} +1 -1
- package/dist/provider-2SEHU2FM.js.map +1 -0
- package/dist/{provider-7H4CPZYS.js → provider-AKROB7WQ.js} +1 -1
- package/dist/provider-AKROB7WQ.js.map +1 -0
- package/dist/{provider-YN2SSK4X.cjs → provider-SNONEZNW.cjs} +1 -1
- package/dist/provider-SNONEZNW.cjs.map +1 -0
- package/dist/setup-57FB3LSP.js +0 -0
- package/dist/{watch-MRHNFJPC.js → watch-ULLLK7ID.js} +4 -4
- package/dist/watch-ULLLK7ID.js.map +1 -0
- package/package.json +98 -98
- package/dist/chunk-3QA624ON.js.map +0 -1
- package/dist/chunk-5CJGKKMZ.js.map +0 -1
- package/dist/chunk-DLQY6FJH.js.map +0 -1
- package/dist/chunk-HXWPJPRO.cjs.map +0 -1
- package/dist/chunk-XSF3N6GU.js.map +0 -1
- package/dist/parser-LKF6PGPD.cjs.map +0 -1
- package/dist/parser-UCO6WPUW.js.map +0 -1
- package/dist/parser-ZQQM6J7T.js.map +0 -1
- package/dist/provider-7H4CPZYS.js.map +0 -1
- package/dist/provider-WPIYEALY.js.map +0 -1
- package/dist/provider-YN2SSK4X.cjs.map +0 -1
- package/dist/watch-MRHNFJPC.js.map +0 -1
- /package/dist/{detect-PJZMUL2Z.js.map → detect-RI2MQ33K.js.map} +0 -0
|
@@ -6,7 +6,7 @@ import {
|
|
|
6
6
|
blocksToMarkdown,
|
|
7
7
|
safeMax,
|
|
8
8
|
safeMin
|
|
9
|
-
} from "./chunk-DLQY6FJH.js";
|
|
9
|
+
} from "./chunk-RXZLTACX.js";
|
|
10
10
|
import {
|
|
11
11
|
parsePageRange
|
|
12
12
|
} from "./chunk-SBVRCJFH.js";
|
|
@@ -1142,6 +1142,120 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
1142
1142
|
};
|
|
1143
1143
|
}
|
|
1144
1144
|
|
|
1145
|
+
// src/pdf/quality.ts
|
|
1146
|
+
/**
 * Compute text-quality metrics for the text extracted from one page and
 * decide whether the page looks like it needs OCR.
 *
 * Characters are counted per Unicode code point, so a character outside the
 * Basic Multilingual Plane (stored as a surrogate pair) counts once rather
 * than twice. Space, tab, LF and CR are ignored entirely.
 *
 * @param {number} page - Page identifier, returned unchanged in the result.
 * @param {string} text - Text extracted from the page.
 * @returns {{page: number, textChars: number, hangulRatio: number,
 *   controlCharRatio: number, replacementCharRatio: number, puaRatio: number,
 *   needsOcr: boolean, ocrReason: (string|undefined)}} Ratios are relative to
 *   `textChars` (or to 1 when the page has no counted characters).
 *   `ocrReason` is one of "low_text", "high_pua", "high_control",
 *   "high_replacement", or undefined when `needsOcr` is false.
 */
function computePageQuality(page, text) {
  let total = 0;
  let hangul = 0;
  let control = 0;
  let replacement = 0;
  let pua = 0;
  // String iteration yields whole code points; an unpaired surrogate is
  // yielded on its own and reported by codePointAt as the surrogate value.
  for (const ch of text) {
    const code = ch.codePointAt(0);
    if (code === 0x20 || code === 0x09 || code === 0x0a || code === 0x0d) continue;
    total++;
    if (code < 0x20 || (code >= 0x7f && code <= 0x9f)) {
      // C0 controls, DEL and C1 controls.
      control++;
    } else if (code === 0xfffd) {
      // U+FFFD REPLACEMENT CHARACTER.
      replacement++;
    } else if (code >= 0xac00 && code <= 0xd7a3) {
      // Hangul Syllables block.
      hangul++;
    } else if (
      (code >= 0xe000 && code <= 0xf8ff) || // BMP Private Use Area
      (code >= 0xf0000 && code <= 0x10ffff) || // planes 15-16 (supplementary private use)
      (code >= 0xdb80 && code <= 0xdbff) // unpaired private-use high surrogate
    ) {
      pua++;
    }
  }
  const denom = total || 1; // avoid dividing by zero on an empty page
  const puaRatio = pua / denom;
  const controlCharRatio = control / denom;
  const replacementCharRatio = replacement / denom;
  // The first matching rule wins, so the order below fixes the reported reason.
  let needsOcr = false;
  let ocrReason;
  if (total < LOW_TEXT_THRESHOLD) {
    needsOcr = true;
    ocrReason = "low_text";
  } else if (puaRatio >= HIGH_PUA_THRESHOLD) {
    needsOcr = true;
    ocrReason = "high_pua";
  } else if (controlCharRatio >= HIGH_CONTROL_THRESHOLD) {
    needsOcr = true;
    ocrReason = "high_control";
  } else if (replacementCharRatio >= HIGH_REPLACEMENT_THRESHOLD) {
    needsOcr = true;
    ocrReason = "high_replacement";
  }
  return {
    page,
    textChars: total,
    hangulRatio: hangul / denom,
    controlCharRatio,
    replacementCharRatio,
    puaRatio,
    needsOcr,
    ocrReason
  };
}
|
|
1203
|
+
// Thresholds for the page/document quality heuristics.

/** A page with fewer counted (non-whitespace) characters than this is flagged "low_text". */
var LOW_TEXT_THRESHOLD = 20;

/** A page whose private-use character ratio is at or above this is flagged "high_pua". */
var HIGH_PUA_THRESHOLD = 0.2;

/** A page whose control-character ratio is at or above this is flagged "high_control". */
var HIGH_CONTROL_THRESHOLD = 0.05;

/** A page whose U+FFFD ratio is at or above this is flagged "high_replacement". */
var HIGH_REPLACEMENT_THRESHOLD = 0.05;

/** The document needs OCR when at least this fraction of its pages are OCR candidates. */
var DOC_NEEDS_OCR_PAGE_RATIO = 0.3;
|
|
1208
|
+
/**
 * Remove control characters from a string.
 *
 * Deletes C0 controls other than tab, LF and CR, plus DEL and the C1
 * controls (0x80-0x9F). All other characters are left untouched.
 *
 * @param {string} text - Input text.
 * @returns {string} The text with the matched characters removed.
 */
function stripControlChars(text) {
  const controlCharPattern = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x80-\x9F]/g;
  const cleaned = text.replace(controlCharPattern, "");
  return cleaned;
}
|
|
1211
|
+
/**
 * Roll per-page quality records up into a document-level summary.
 *
 * Ratio averages are plain (unweighted) means over the pages. The document
 * is marked as needing OCR when the share of pages with `needsOcr` set
 * reaches DOC_NEEDS_OCR_PAGE_RATIO. An empty page list yields an all-zero
 * summary with `needsOcr` false.
 *
 * @param {Array<object>} pages - Records shaped like the return value of
 *   computePageQuality (page, textChars, the four ratios, needsOcr).
 * @returns {object} Summary with totalPages, totalTextChars, avg*Ratio
 *   fields, lowTextPageCount, highPuaPageCount, needsOcr and
 *   ocrCandidatePages (the `page` values of pages with `needsOcr` set).
 */
function summarizeDocumentQuality(pages) {
  const pageCount = pages.length;
  if (pageCount === 0) {
    return {
      totalPages: 0,
      totalTextChars: 0,
      avgHangulRatio: 0,
      avgControlCharRatio: 0,
      avgReplacementCharRatio: 0,
      avgPuaRatio: 0,
      lowTextPageCount: 0,
      highPuaPageCount: 0,
      needsOcr: false,
      ocrCandidatePages: []
    };
  }
  // Sum one numeric field across pages, in page order.
  const sumOf = (field) => pages.reduce((acc, entry) => acc + entry[field], 0);
  // Count the pages satisfying a predicate.
  const countWhere = (predicate) => pages.filter(predicate).length;
  const ocrCandidatePages = pages.filter((entry) => entry.needsOcr).map((entry) => entry.page);
  return {
    totalPages: pageCount,
    totalTextChars: sumOf("textChars"),
    avgHangulRatio: sumOf("hangulRatio") / pageCount,
    avgControlCharRatio: sumOf("controlCharRatio") / pageCount,
    avgReplacementCharRatio: sumOf("replacementCharRatio") / pageCount,
    avgPuaRatio: sumOf("puaRatio") / pageCount,
    lowTextPageCount: countWhere((entry) => entry.textChars < LOW_TEXT_THRESHOLD),
    highPuaPageCount: countWhere((entry) => entry.puaRatio >= HIGH_PUA_THRESHOLD),
    needsOcr: ocrCandidatePages.length / pageCount >= DOC_NEEDS_OCR_PAGE_RATIO,
    ocrCandidatePages
  };
}
|
|
1258
|
+
|
|
1145
1259
|
// src/pdf/polyfill.ts
|
|
1146
1260
|
import * as pdfjsWorker from "pdfjs-dist/legacy/build/pdf.worker.mjs";
|
|
1147
1261
|
var g = globalThis;
|
|
@@ -1197,6 +1311,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1197
1311
|
await extractPdfMetadata(doc, metadata);
|
|
1198
1312
|
const blocks = [];
|
|
1199
1313
|
const warnings = [];
|
|
1314
|
+
const pageQuality = [];
|
|
1200
1315
|
let totalChars = 0;
|
|
1201
1316
|
let totalTextBytes = 0;
|
|
1202
1317
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
@@ -1224,11 +1339,14 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1224
1339
|
const opList = await page.getOperatorList();
|
|
1225
1340
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
1226
1341
|
for (const b of pageBlocks) blocks.push(b);
|
|
1342
|
+
let pageText = "";
|
|
1227
1343
|
for (const b of pageBlocks) {
|
|
1228
1344
|
const t = b.text || "";
|
|
1229
1345
|
totalChars += t.replace(/\s/g, "").length;
|
|
1230
1346
|
totalTextBytes += t.length * 2;
|
|
1347
|
+
pageText += pageText ? "\n" + t : t;
|
|
1231
1348
|
}
|
|
1349
|
+
pageQuality.push(computePageQuality(i, pageText));
|
|
1232
1350
|
if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
|
|
1233
1351
|
parsedPages++;
|
|
1234
1352
|
options?.onProgress?.(parsedPages, totalTarget);
|
|
@@ -1241,11 +1359,11 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1241
1359
|
if (totalChars / Math.max(parsedPageCount, 1) < 10) {
|
|
1242
1360
|
if (options?.ocr) {
|
|
1243
1361
|
try {
|
|
1244
|
-
const { ocrPages } = await import("./provider-WPIYEALY.js");
|
|
1362
|
+
const { ocrPages } = await import("./provider-2SEHU2FM.js");
|
|
1245
1363
|
const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
|
|
1246
1364
|
if (ocrBlocks.length > 0) {
|
|
1247
1365
|
const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
|
|
1248
|
-
return { markdown: ocrMarkdown, blocks: ocrBlocks, metadata, warnings, isImageBased: true };
|
|
1366
|
+
return { markdown: ocrMarkdown, blocks: ocrBlocks, metadata, warnings, isImageBased: true, pageQuality, qualitySummary: summarizeDocumentQuality(pageQuality) };
|
|
1249
1367
|
}
|
|
1250
1368
|
} catch {
|
|
1251
1369
|
}
|
|
@@ -1274,8 +1392,17 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1274
1392
|
}
|
|
1275
1393
|
detectMarkerHeadings(blocks);
|
|
1276
1394
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
1395
|
+
sanitizeBlockControlChars(blocks);
|
|
1277
1396
|
let markdown = cleanPdfText(blocksToMarkdown(blocks));
|
|
1278
|
-
return {
|
|
1397
|
+
return {
|
|
1398
|
+
markdown,
|
|
1399
|
+
blocks,
|
|
1400
|
+
metadata,
|
|
1401
|
+
outline: outline.length > 0 ? outline : void 0,
|
|
1402
|
+
warnings: warnings.length > 0 ? warnings : void 0,
|
|
1403
|
+
pageQuality,
|
|
1404
|
+
qualitySummary: summarizeDocumentQuality(pageQuality)
|
|
1405
|
+
};
|
|
1279
1406
|
} finally {
|
|
1280
1407
|
await doc.destroy().catch(() => {
|
|
1281
1408
|
});
|
|
@@ -2079,9 +2206,22 @@ function mergeLineSimple(items) {
|
|
|
2079
2206
|
}
|
|
2080
2207
|
return result;
|
|
2081
2208
|
}
|
|
2209
|
+
/**
 * Strip control characters, in place, from every text field in a block tree.
 *
 * For each block this cleans `text`, the `text` of every cell in
 * `table.cells`, and then recurses into `children`. Empty or missing text
 * values are left as they are.
 *
 * @param {Array<object>} blocks - Blocks to clean; mutated in place.
 * @returns {void}
 */
function sanitizeBlockControlChars(blocks) {
  for (const block of blocks) {
    if (block.text) {
      block.text = stripControlChars(block.text);
    }
    // A block without a table contributes no rows.
    const tableRows = block.table ? block.table.cells : [];
    for (const cells of tableRows) {
      for (const cell of cells) {
        if (!cell.text) continue;
        cell.text = stripControlChars(cell.text);
      }
    }
    if (block.children) {
      sanitizeBlockControlChars(block.children);
    }
  }
}
|
|
2082
2222
|
function cleanPdfText(text) {
|
|
2083
2223
|
return mergeKoreanLines(
|
|
2084
|
-
text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
|
|
2224
|
+
stripControlChars(text).replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
|
|
2085
2225
|
).replace(/^(?!\| ---).*$/gm, (line) => {
|
|
2086
2226
|
if (/^\s*\${1,2}.+\${1,2}\s*$/.test(line)) return line;
|
|
2087
2227
|
return collapseEvenSpacing(line);
|
|
@@ -2411,4 +2551,4 @@ export {
|
|
|
2411
2551
|
extractPdfMetadataOnly,
|
|
2412
2552
|
parsePdfDocument
|
|
2413
2553
|
};
|
|
2414
|
-
//# sourceMappingURL=parser-ZQQM6J7T.js.map
|
|
2554
|
+
//# sourceMappingURL=parser-OMPBVEFU.js.map
|