kordoc 2.8.0 → 2.9.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +469 -450
- package/dist/{chunk-3QA624ON.js → chunk-M24KMDAR.js} +6 -6
- package/dist/chunk-M24KMDAR.js.map +1 -0
- package/dist/{chunk-5CJGKKMZ.js → chunk-MEPHGCPQ.js} +1 -1
- package/dist/chunk-MEPHGCPQ.js.map +1 -0
- package/dist/chunk-MOL7MDBG.js +0 -0
- package/dist/chunk-MUOQXDZ4.cjs.map +1 -1
- package/dist/{chunk-HXWPJPRO.cjs → chunk-QB7CS534.cjs} +2 -2
- package/dist/chunk-QB7CS534.cjs.map +1 -0
- package/dist/{chunk-DLQY6FJH.js → chunk-RXZLTACX.js} +2 -2
- package/dist/chunk-RXZLTACX.js.map +1 -0
- package/dist/{chunk-XSF3N6GU.js → chunk-SJ5TPMBT.js} +2 -2
- package/dist/chunk-SJ5TPMBT.js.map +1 -0
- package/dist/cli.js +4 -4
- package/dist/cli.js.map +1 -1
- package/dist/{detect-PJZMUL2Z.js → detect-RI2MQ33K.js} +2 -2
- package/dist/formula-JCNF43NE.js +0 -0
- package/dist/formula-XGG6ZP42.cjs.map +1 -1
- package/dist/index.cjs +99 -99
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +28 -0
- package/dist/index.d.ts +28 -0
- package/dist/index.js +4 -4
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -5
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-3C7UGGEK.cjs.map +1 -1
- package/dist/page-range-737B4EZW.js +0 -0
- package/dist/{parser-LKF6PGPD.cjs → parser-EL5YETUA.cjs} +159 -19
- package/dist/parser-EL5YETUA.cjs.map +1 -0
- package/dist/{parser-ZQQM6J7T.js → parser-OMPBVEFU.js} +146 -6
- package/dist/parser-OMPBVEFU.js.map +1 -0
- package/dist/{parser-UCO6WPUW.js → parser-XBYGROQB.js} +146 -6
- package/dist/parser-XBYGROQB.js.map +1 -0
- package/dist/{provider-WPIYEALY.js → provider-2SEHU2FM.js} +1 -1
- package/dist/provider-2SEHU2FM.js.map +1 -0
- package/dist/{provider-7H4CPZYS.js → provider-AKROB7WQ.js} +1 -1
- package/dist/provider-AKROB7WQ.js.map +1 -0
- package/dist/{provider-YN2SSK4X.cjs → provider-SNONEZNW.cjs} +1 -1
- package/dist/provider-SNONEZNW.cjs.map +1 -0
- package/dist/setup-57FB3LSP.js +0 -0
- package/dist/{watch-MRHNFJPC.js → watch-ULLLK7ID.js} +4 -4
- package/dist/watch-ULLLK7ID.js.map +1 -0
- package/package.json +98 -98
- package/dist/chunk-3QA624ON.js.map +0 -1
- package/dist/chunk-5CJGKKMZ.js.map +0 -1
- package/dist/chunk-DLQY6FJH.js.map +0 -1
- package/dist/chunk-HXWPJPRO.cjs.map +0 -1
- package/dist/chunk-XSF3N6GU.js.map +0 -1
- package/dist/parser-LKF6PGPD.cjs.map +0 -1
- package/dist/parser-UCO6WPUW.js.map +0 -1
- package/dist/parser-ZQQM6J7T.js.map +0 -1
- package/dist/provider-7H4CPZYS.js.map +0 -1
- package/dist/provider-WPIYEALY.js.map +0 -1
- package/dist/provider-YN2SSK4X.cjs.map +0 -1
- package/dist/watch-MRHNFJPC.js.map +0 -1
- package/dist/{detect-PJZMUL2Z.js.map → detect-RI2MQ33K.js.map} +0 -0
|
@@ -7,7 +7,7 @@ import {
|
|
|
7
7
|
blocksToMarkdown,
|
|
8
8
|
safeMax,
|
|
9
9
|
safeMin
|
|
10
|
-
} from "./chunk-XSF3N6GU.js";
|
|
10
|
+
} from "./chunk-SJ5TPMBT.js";
|
|
11
11
|
import {
|
|
12
12
|
parsePageRange
|
|
13
13
|
} from "./chunk-MOL7MDBG.js";
|
|
@@ -1143,6 +1143,120 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
1143
1143
|
};
|
|
1144
1144
|
}
|
|
1145
1145
|
|
|
1146
|
+
// src/pdf/quality.ts
|
|
1147
|
+
function computePageQuality(page, text) {
|
|
1148
|
+
let total = 0;
|
|
1149
|
+
let hangul = 0;
|
|
1150
|
+
let control = 0;
|
|
1151
|
+
let replacement = 0;
|
|
1152
|
+
let pua = 0;
|
|
1153
|
+
for (let i = 0; i < text.length; i++) {
|
|
1154
|
+
const code = text.charCodeAt(i);
|
|
1155
|
+
if (code === 32 || code === 9 || code === 10 || code === 13) continue;
|
|
1156
|
+
total++;
|
|
1157
|
+
if (code < 32 || code === 127 || code >= 128 && code <= 159) {
|
|
1158
|
+
control++;
|
|
1159
|
+
continue;
|
|
1160
|
+
}
|
|
1161
|
+
if (code === 65533) {
|
|
1162
|
+
replacement++;
|
|
1163
|
+
continue;
|
|
1164
|
+
}
|
|
1165
|
+
if (code >= 44032 && code <= 55203) {
|
|
1166
|
+
hangul++;
|
|
1167
|
+
continue;
|
|
1168
|
+
}
|
|
1169
|
+
if (code >= 57344 && code <= 63743 || code >= 56192 && code <= 56319) {
|
|
1170
|
+
pua++;
|
|
1171
|
+
continue;
|
|
1172
|
+
}
|
|
1173
|
+
}
|
|
1174
|
+
const denom = total || 1;
|
|
1175
|
+
const puaRatio = pua / denom;
|
|
1176
|
+
const controlCharRatio = control / denom;
|
|
1177
|
+
const replacementCharRatio = replacement / denom;
|
|
1178
|
+
let needsOcr = false;
|
|
1179
|
+
let ocrReason;
|
|
1180
|
+
if (total < LOW_TEXT_THRESHOLD) {
|
|
1181
|
+
needsOcr = true;
|
|
1182
|
+
ocrReason = "low_text";
|
|
1183
|
+
} else if (puaRatio >= HIGH_PUA_THRESHOLD) {
|
|
1184
|
+
needsOcr = true;
|
|
1185
|
+
ocrReason = "high_pua";
|
|
1186
|
+
} else if (controlCharRatio >= HIGH_CONTROL_THRESHOLD) {
|
|
1187
|
+
needsOcr = true;
|
|
1188
|
+
ocrReason = "high_control";
|
|
1189
|
+
} else if (replacementCharRatio >= HIGH_REPLACEMENT_THRESHOLD) {
|
|
1190
|
+
needsOcr = true;
|
|
1191
|
+
ocrReason = "high_replacement";
|
|
1192
|
+
}
|
|
1193
|
+
return {
|
|
1194
|
+
page,
|
|
1195
|
+
textChars: total,
|
|
1196
|
+
hangulRatio: hangul / denom,
|
|
1197
|
+
controlCharRatio,
|
|
1198
|
+
replacementCharRatio,
|
|
1199
|
+
puaRatio,
|
|
1200
|
+
needsOcr,
|
|
1201
|
+
ocrReason
|
|
1202
|
+
};
|
|
1203
|
+
}
|
|
1204
|
+
var LOW_TEXT_THRESHOLD = 20;
|
|
1205
|
+
var HIGH_PUA_THRESHOLD = 0.2;
|
|
1206
|
+
var HIGH_CONTROL_THRESHOLD = 0.05;
|
|
1207
|
+
var HIGH_REPLACEMENT_THRESHOLD = 0.05;
|
|
1208
|
+
var DOC_NEEDS_OCR_PAGE_RATIO = 0.3;
|
|
1209
|
+
function stripControlChars(text) {
|
|
1210
|
+
return text.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x80-\x9F]/g, "");
|
|
1211
|
+
}
|
|
1212
|
+
function summarizeDocumentQuality(pages) {
|
|
1213
|
+
if (pages.length === 0) {
|
|
1214
|
+
return {
|
|
1215
|
+
totalPages: 0,
|
|
1216
|
+
totalTextChars: 0,
|
|
1217
|
+
avgHangulRatio: 0,
|
|
1218
|
+
avgControlCharRatio: 0,
|
|
1219
|
+
avgReplacementCharRatio: 0,
|
|
1220
|
+
avgPuaRatio: 0,
|
|
1221
|
+
lowTextPageCount: 0,
|
|
1222
|
+
highPuaPageCount: 0,
|
|
1223
|
+
needsOcr: false,
|
|
1224
|
+
ocrCandidatePages: []
|
|
1225
|
+
};
|
|
1226
|
+
}
|
|
1227
|
+
let textChars = 0;
|
|
1228
|
+
let hangul = 0;
|
|
1229
|
+
let control = 0;
|
|
1230
|
+
let replacement = 0;
|
|
1231
|
+
let pua = 0;
|
|
1232
|
+
let lowText = 0;
|
|
1233
|
+
let highPua = 0;
|
|
1234
|
+
const ocrCandidatePages = [];
|
|
1235
|
+
for (const p of pages) {
|
|
1236
|
+
textChars += p.textChars;
|
|
1237
|
+
hangul += p.hangulRatio;
|
|
1238
|
+
control += p.controlCharRatio;
|
|
1239
|
+
replacement += p.replacementCharRatio;
|
|
1240
|
+
pua += p.puaRatio;
|
|
1241
|
+
if (p.textChars < LOW_TEXT_THRESHOLD) lowText++;
|
|
1242
|
+
if (p.puaRatio >= HIGH_PUA_THRESHOLD) highPua++;
|
|
1243
|
+
if (p.needsOcr) ocrCandidatePages.push(p.page);
|
|
1244
|
+
}
|
|
1245
|
+
const n = pages.length;
|
|
1246
|
+
return {
|
|
1247
|
+
totalPages: n,
|
|
1248
|
+
totalTextChars: textChars,
|
|
1249
|
+
avgHangulRatio: hangul / n,
|
|
1250
|
+
avgControlCharRatio: control / n,
|
|
1251
|
+
avgReplacementCharRatio: replacement / n,
|
|
1252
|
+
avgPuaRatio: pua / n,
|
|
1253
|
+
lowTextPageCount: lowText,
|
|
1254
|
+
highPuaPageCount: highPua,
|
|
1255
|
+
needsOcr: ocrCandidatePages.length / n >= DOC_NEEDS_OCR_PAGE_RATIO,
|
|
1256
|
+
ocrCandidatePages
|
|
1257
|
+
};
|
|
1258
|
+
}
|
|
1259
|
+
|
|
1146
1260
|
// src/pdf/polyfill.ts
|
|
1147
1261
|
import * as pdfjsWorker from "pdfjs-dist/legacy/build/pdf.worker.mjs";
|
|
1148
1262
|
var g = globalThis;
|
|
@@ -1198,6 +1312,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1198
1312
|
await extractPdfMetadata(doc, metadata);
|
|
1199
1313
|
const blocks = [];
|
|
1200
1314
|
const warnings = [];
|
|
1315
|
+
const pageQuality = [];
|
|
1201
1316
|
let totalChars = 0;
|
|
1202
1317
|
let totalTextBytes = 0;
|
|
1203
1318
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
@@ -1225,11 +1340,14 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1225
1340
|
const opList = await page.getOperatorList();
|
|
1226
1341
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
1227
1342
|
for (const b of pageBlocks) blocks.push(b);
|
|
1343
|
+
let pageText = "";
|
|
1228
1344
|
for (const b of pageBlocks) {
|
|
1229
1345
|
const t = b.text || "";
|
|
1230
1346
|
totalChars += t.replace(/\s/g, "").length;
|
|
1231
1347
|
totalTextBytes += t.length * 2;
|
|
1348
|
+
pageText += pageText ? "\n" + t : t;
|
|
1232
1349
|
}
|
|
1350
|
+
pageQuality.push(computePageQuality(i, pageText));
|
|
1233
1351
|
if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
|
|
1234
1352
|
parsedPages++;
|
|
1235
1353
|
options?.onProgress?.(parsedPages, totalTarget);
|
|
@@ -1242,11 +1360,11 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1242
1360
|
if (totalChars / Math.max(parsedPageCount, 1) < 10) {
|
|
1243
1361
|
if (options?.ocr) {
|
|
1244
1362
|
try {
|
|
1245
|
-
const { ocrPages } = await import("./provider-7H4CPZYS.js");
|
|
1363
|
+
const { ocrPages } = await import("./provider-AKROB7WQ.js");
|
|
1246
1364
|
const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
|
|
1247
1365
|
if (ocrBlocks.length > 0) {
|
|
1248
1366
|
const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
|
|
1249
|
-
return { markdown: ocrMarkdown, blocks: ocrBlocks, metadata, warnings, isImageBased: true };
|
|
1367
|
+
return { markdown: ocrMarkdown, blocks: ocrBlocks, metadata, warnings, isImageBased: true, pageQuality, qualitySummary: summarizeDocumentQuality(pageQuality) };
|
|
1250
1368
|
}
|
|
1251
1369
|
} catch {
|
|
1252
1370
|
}
|
|
@@ -1275,8 +1393,17 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1275
1393
|
}
|
|
1276
1394
|
detectMarkerHeadings(blocks);
|
|
1277
1395
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
1396
|
+
sanitizeBlockControlChars(blocks);
|
|
1278
1397
|
let markdown = cleanPdfText(blocksToMarkdown(blocks));
|
|
1279
|
-
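return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };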
|
|
1398
|
+
return {
|
|
1399
|
+
markdown,
|
|
1400
|
+
blocks,
|
|
1401
|
+
metadata,
|
|
1402
|
+
outline: outline.length > 0 ? outline : void 0,
|
|
1403
|
+
warnings: warnings.length > 0 ? warnings : void 0,
|
|
1404
|
+
pageQuality,
|
|
1405
|
+
qualitySummary: summarizeDocumentQuality(pageQuality)
|
|
1406
|
+
};
|
|
1280
1407
|
} finally {
|
|
1281
1408
|
await doc.destroy().catch(() => {
|
|
1282
1409
|
});
|
|
@@ -2080,9 +2207,22 @@ function mergeLineSimple(items) {
|
|
|
2080
2207
|
}
|
|
2081
2208
|
return result;
|
|
2082
2209
|
}
|
|
2210
|
+
function sanitizeBlockControlChars(blocks) {
|
|
2211
|
+
for (const b of blocks) {
|
|
2212
|
+
if (b.text) b.text = stripControlChars(b.text);
|
|
2213
|
+
if (b.table) {
|
|
2214
|
+
for (const row of b.table.cells) {
|
|
2215
|
+
for (const cell of row) {
|
|
2216
|
+
if (cell.text) cell.text = stripControlChars(cell.text);
|
|
2217
|
+
}
|
|
2218
|
+
}
|
|
2219
|
+
}
|
|
2220
|
+
if (b.children) sanitizeBlockControlChars(b.children);
|
|
2221
|
+
}
|
|
2222
|
+
}
|
|
2083
2223
|
function cleanPdfText(text) {
|
|
2084
2224
|
return mergeKoreanLines(
|
|
2085
|
-
text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
|
|
2225
|
+
stripControlChars(text).replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
|
|
2086
2226
|
).replace(/^(?!\| ---).*$/gm, (line) => {
|
|
2087
2227
|
if (/^\s*\${1,2}.+\${1,2}\s*$/.test(line)) return line;
|
|
2088
2228
|
return collapseEvenSpacing(line);
|
|
@@ -2412,4 +2552,4 @@ export {
|
|
|
2412
2552
|
extractPdfMetadataOnly,
|
|
2413
2553
|
parsePdfDocument
|
|
2414
2554
|
};
|
|
2415
|
-
//# sourceMappingURL=parser-UCO6WPUW.js.map
|
|
2555
|
+
//# sourceMappingURL=parser-XBYGROQB.js.map
|