kordoc 2.8.0 → 2.9.0

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (57)
  1. package/README.md +469 -450
  2. package/dist/{chunk-3QA624ON.js → chunk-M24KMDAR.js} +6 -6
  3. package/dist/chunk-M24KMDAR.js.map +1 -0
  4. package/dist/{chunk-5CJGKKMZ.js → chunk-MEPHGCPQ.js} +1 -1
  5. package/dist/chunk-MEPHGCPQ.js.map +1 -0
  6. package/dist/chunk-MOL7MDBG.js +0 -0
  7. package/dist/chunk-MUOQXDZ4.cjs.map +1 -1
  8. package/dist/{chunk-HXWPJPRO.cjs → chunk-QB7CS534.cjs} +2 -2
  9. package/dist/chunk-QB7CS534.cjs.map +1 -0
  10. package/dist/{chunk-DLQY6FJH.js → chunk-RXZLTACX.js} +2 -2
  11. package/dist/chunk-RXZLTACX.js.map +1 -0
  12. package/dist/{chunk-XSF3N6GU.js → chunk-SJ5TPMBT.js} +2 -2
  13. package/dist/chunk-SJ5TPMBT.js.map +1 -0
  14. package/dist/cli.js +4 -4
  15. package/dist/cli.js.map +1 -1
  16. package/dist/{detect-PJZMUL2Z.js → detect-RI2MQ33K.js} +2 -2
  17. package/dist/formula-JCNF43NE.js +0 -0
  18. package/dist/formula-XGG6ZP42.cjs.map +1 -1
  19. package/dist/index.cjs +99 -99
  20. package/dist/index.cjs.map +1 -1
  21. package/dist/index.d.cts +28 -0
  22. package/dist/index.d.ts +28 -0
  23. package/dist/index.js +4 -4
  24. package/dist/index.js.map +1 -1
  25. package/dist/mcp.js +5 -5
  26. package/dist/mcp.js.map +1 -1
  27. package/dist/page-range-3C7UGGEK.cjs.map +1 -1
  28. package/dist/page-range-737B4EZW.js +0 -0
  29. package/dist/{parser-LKF6PGPD.cjs → parser-EL5YETUA.cjs} +159 -19
  30. package/dist/parser-EL5YETUA.cjs.map +1 -0
  31. package/dist/{parser-ZQQM6J7T.js → parser-OMPBVEFU.js} +146 -6
  32. package/dist/parser-OMPBVEFU.js.map +1 -0
  33. package/dist/{parser-UCO6WPUW.js → parser-XBYGROQB.js} +146 -6
  34. package/dist/parser-XBYGROQB.js.map +1 -0
  35. package/dist/{provider-WPIYEALY.js → provider-2SEHU2FM.js} +1 -1
  36. package/dist/provider-2SEHU2FM.js.map +1 -0
  37. package/dist/{provider-7H4CPZYS.js → provider-AKROB7WQ.js} +1 -1
  38. package/dist/provider-AKROB7WQ.js.map +1 -0
  39. package/dist/{provider-YN2SSK4X.cjs → provider-SNONEZNW.cjs} +1 -1
  40. package/dist/provider-SNONEZNW.cjs.map +1 -0
  41. package/dist/setup-57FB3LSP.js +0 -0
  42. package/dist/{watch-MRHNFJPC.js → watch-ULLLK7ID.js} +4 -4
  43. package/dist/watch-ULLLK7ID.js.map +1 -0
  44. package/package.json +98 -98
  45. package/dist/chunk-3QA624ON.js.map +0 -1
  46. package/dist/chunk-5CJGKKMZ.js.map +0 -1
  47. package/dist/chunk-DLQY6FJH.js.map +0 -1
  48. package/dist/chunk-HXWPJPRO.cjs.map +0 -1
  49. package/dist/chunk-XSF3N6GU.js.map +0 -1
  50. package/dist/parser-LKF6PGPD.cjs.map +0 -1
  51. package/dist/parser-UCO6WPUW.js.map +0 -1
  52. package/dist/parser-ZQQM6J7T.js.map +0 -1
  53. package/dist/provider-7H4CPZYS.js.map +0 -1
  54. package/dist/provider-WPIYEALY.js.map +0 -1
  55. package/dist/provider-YN2SSK4X.cjs.map +0 -1
  56. package/dist/watch-MRHNFJPC.js.map +0 -1
  57. package/dist/{detect-PJZMUL2Z.js.map → detect-RI2MQ33K.js.map} +0 -0
@@ -6,7 +6,7 @@ import {
6
6
  blocksToMarkdown,
7
7
  safeMax,
8
8
  safeMin
9
- } from "./chunk-DLQY6FJH.js";
9
+ } from "./chunk-RXZLTACX.js";
10
10
  import {
11
11
  parsePageRange
12
12
  } from "./chunk-SBVRCJFH.js";
@@ -1142,6 +1142,120 @@ function buildClusterTable(rows, columns, pageNum) {
1142
1142
  };
1143
1143
  }
1144
1144
 
1145
+ // src/pdf/quality.ts
1146
+ function computePageQuality(page, text) {
1147
+ let total = 0;
1148
+ let hangul = 0;
1149
+ let control = 0;
1150
+ let replacement = 0;
1151
+ let pua = 0;
1152
+ for (let i = 0; i < text.length; i++) {
1153
+ const code = text.charCodeAt(i);
1154
+ if (code === 32 || code === 9 || code === 10 || code === 13) continue;
1155
+ total++;
1156
+ if (code < 32 || code === 127 || code >= 128 && code <= 159) {
1157
+ control++;
1158
+ continue;
1159
+ }
1160
+ if (code === 65533) {
1161
+ replacement++;
1162
+ continue;
1163
+ }
1164
+ if (code >= 44032 && code <= 55203) {
1165
+ hangul++;
1166
+ continue;
1167
+ }
1168
+ if (code >= 57344 && code <= 63743 || code >= 56192 && code <= 56319) {
1169
+ pua++;
1170
+ continue;
1171
+ }
1172
+ }
1173
+ const denom = total || 1;
1174
+ const puaRatio = pua / denom;
1175
+ const controlCharRatio = control / denom;
1176
+ const replacementCharRatio = replacement / denom;
1177
+ let needsOcr = false;
1178
+ let ocrReason;
1179
+ if (total < LOW_TEXT_THRESHOLD) {
1180
+ needsOcr = true;
1181
+ ocrReason = "low_text";
1182
+ } else if (puaRatio >= HIGH_PUA_THRESHOLD) {
1183
+ needsOcr = true;
1184
+ ocrReason = "high_pua";
1185
+ } else if (controlCharRatio >= HIGH_CONTROL_THRESHOLD) {
1186
+ needsOcr = true;
1187
+ ocrReason = "high_control";
1188
+ } else if (replacementCharRatio >= HIGH_REPLACEMENT_THRESHOLD) {
1189
+ needsOcr = true;
1190
+ ocrReason = "high_replacement";
1191
+ }
1192
+ return {
1193
+ page,
1194
+ textChars: total,
1195
+ hangulRatio: hangul / denom,
1196
+ controlCharRatio,
1197
+ replacementCharRatio,
1198
+ puaRatio,
1199
+ needsOcr,
1200
+ ocrReason
1201
+ };
1202
+ }
1203
+ var LOW_TEXT_THRESHOLD = 20;
1204
+ var HIGH_PUA_THRESHOLD = 0.2;
1205
+ var HIGH_CONTROL_THRESHOLD = 0.05;
1206
+ var HIGH_REPLACEMENT_THRESHOLD = 0.05;
1207
+ var DOC_NEEDS_OCR_PAGE_RATIO = 0.3;
1208
+ function stripControlChars(text) {
1209
+ return text.replace(/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x80-\x9F]/g, "");
1210
+ }
1211
+ function summarizeDocumentQuality(pages) {
1212
+ if (pages.length === 0) {
1213
+ return {
1214
+ totalPages: 0,
1215
+ totalTextChars: 0,
1216
+ avgHangulRatio: 0,
1217
+ avgControlCharRatio: 0,
1218
+ avgReplacementCharRatio: 0,
1219
+ avgPuaRatio: 0,
1220
+ lowTextPageCount: 0,
1221
+ highPuaPageCount: 0,
1222
+ needsOcr: false,
1223
+ ocrCandidatePages: []
1224
+ };
1225
+ }
1226
+ let textChars = 0;
1227
+ let hangul = 0;
1228
+ let control = 0;
1229
+ let replacement = 0;
1230
+ let pua = 0;
1231
+ let lowText = 0;
1232
+ let highPua = 0;
1233
+ const ocrCandidatePages = [];
1234
+ for (const p of pages) {
1235
+ textChars += p.textChars;
1236
+ hangul += p.hangulRatio;
1237
+ control += p.controlCharRatio;
1238
+ replacement += p.replacementCharRatio;
1239
+ pua += p.puaRatio;
1240
+ if (p.textChars < LOW_TEXT_THRESHOLD) lowText++;
1241
+ if (p.puaRatio >= HIGH_PUA_THRESHOLD) highPua++;
1242
+ if (p.needsOcr) ocrCandidatePages.push(p.page);
1243
+ }
1244
+ const n = pages.length;
1245
+ return {
1246
+ totalPages: n,
1247
+ totalTextChars: textChars,
1248
+ avgHangulRatio: hangul / n,
1249
+ avgControlCharRatio: control / n,
1250
+ avgReplacementCharRatio: replacement / n,
1251
+ avgPuaRatio: pua / n,
1252
+ lowTextPageCount: lowText,
1253
+ highPuaPageCount: highPua,
1254
+ needsOcr: ocrCandidatePages.length / n >= DOC_NEEDS_OCR_PAGE_RATIO,
1255
+ ocrCandidatePages
1256
+ };
1257
+ }
1258
+
1145
1259
  // src/pdf/polyfill.ts
1146
1260
  import * as pdfjsWorker from "pdfjs-dist/legacy/build/pdf.worker.mjs";
1147
1261
  var g = globalThis;
@@ -1197,6 +1311,7 @@ async function parsePdfDocument(buffer, options) {
1197
1311
  await extractPdfMetadata(doc, metadata);
1198
1312
  const blocks = [];
1199
1313
  const warnings = [];
1314
+ const pageQuality = [];
1200
1315
  let totalChars = 0;
1201
1316
  let totalTextBytes = 0;
1202
1317
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
@@ -1224,11 +1339,14 @@ async function parsePdfDocument(buffer, options) {
1224
1339
  const opList = await page.getOperatorList();
1225
1340
  const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
1226
1341
  for (const b of pageBlocks) blocks.push(b);
1342
+ let pageText = "";
1227
1343
  for (const b of pageBlocks) {
1228
1344
  const t = b.text || "";
1229
1345
  totalChars += t.replace(/\s/g, "").length;
1230
1346
  totalTextBytes += t.length * 2;
1347
+ pageText += pageText ? "\n" + t : t;
1231
1348
  }
1349
+ pageQuality.push(computePageQuality(i, pageText));
1232
1350
  if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
1233
1351
  parsedPages++;
1234
1352
  options?.onProgress?.(parsedPages, totalTarget);
@@ -1241,11 +1359,11 @@ async function parsePdfDocument(buffer, options) {
1241
1359
  if (totalChars / Math.max(parsedPageCount, 1) < 10) {
1242
1360
  if (options?.ocr) {
1243
1361
  try {
1244
- const { ocrPages } = await import("./provider-WPIYEALY.js");
1362
+ const { ocrPages } = await import("./provider-2SEHU2FM.js");
1245
1363
  const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
1246
1364
  if (ocrBlocks.length > 0) {
1247
1365
  const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
1248
- return { markdown: ocrMarkdown, blocks: ocrBlocks, metadata, warnings, isImageBased: true };
1366
+ return { markdown: ocrMarkdown, blocks: ocrBlocks, metadata, warnings, isImageBased: true, pageQuality, qualitySummary: summarizeDocumentQuality(pageQuality) };
1249
1367
  }
1250
1368
  } catch {
1251
1369
  }
@@ -1274,8 +1392,17 @@ async function parsePdfDocument(buffer, options) {
1274
1392
  }
1275
1393
  detectMarkerHeadings(blocks);
1276
1394
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
1395
+ sanitizeBlockControlChars(blocks);
1277
1396
  let markdown = cleanPdfText(blocksToMarkdown(blocks));
1278
- return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
1397
+ return {
1398
+ markdown,
1399
+ blocks,
1400
+ metadata,
1401
+ outline: outline.length > 0 ? outline : void 0,
1402
+ warnings: warnings.length > 0 ? warnings : void 0,
1403
+ pageQuality,
1404
+ qualitySummary: summarizeDocumentQuality(pageQuality)
1405
+ };
1279
1406
  } finally {
1280
1407
  await doc.destroy().catch(() => {
1281
1408
  });
@@ -2079,9 +2206,22 @@ function mergeLineSimple(items) {
2079
2206
  }
2080
2207
  return result;
2081
2208
  }
2209
+ function sanitizeBlockControlChars(blocks) {
2210
+ for (const b of blocks) {
2211
+ if (b.text) b.text = stripControlChars(b.text);
2212
+ if (b.table) {
2213
+ for (const row of b.table.cells) {
2214
+ for (const cell of row) {
2215
+ if (cell.text) cell.text = stripControlChars(cell.text);
2216
+ }
2217
+ }
2218
+ }
2219
+ if (b.children) sanitizeBlockControlChars(b.children);
2220
+ }
2221
+ }
2082
2222
  function cleanPdfText(text) {
2083
2223
  return mergeKoreanLines(
2084
- text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
2224
+ stripControlChars(text).replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
2085
2225
  ).replace(/^(?!\| ---).*$/gm, (line) => {
2086
2226
  if (/^\s*\${1,2}.+\${1,2}\s*$/.test(line)) return line;
2087
2227
  return collapseEvenSpacing(line);
@@ -2411,4 +2551,4 @@ export {
2411
2551
  extractPdfMetadataOnly,
2412
2552
  parsePdfDocument
2413
2553
  };
2414
- //# sourceMappingURL=parser-ZQQM6J7T.js.map
2554
+ //# sourceMappingURL=parser-OMPBVEFU.js.map