kordoc 2.7.2 → 2.9.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -7,7 +7,7 @@ import {
7
7
  blocksToMarkdown,
8
8
  safeMax,
9
9
  safeMin
10
- } from "./chunk-4SK2PDMQ.js";
10
+ } from "./chunk-SJ5TPMBT.js";
11
11
  import {
12
12
  parsePageRange
13
13
  } from "./chunk-MOL7MDBG.js";
@@ -1143,6 +1143,120 @@ function buildClusterTable(rows, columns, pageNum) {
1143
1143
  };
1144
1144
  }
1145
1145
 
1146
+ // src/pdf/quality.ts
1147
/**
 * Compute text-quality statistics for one page and decide whether the page
 * looks like it needs OCR.
 *
 * Space, tab, LF and CR are ignored. Every other character is counted in
 * `textChars` and classified as at most one of: control character (C0 except
 * the skipped whitespace, DEL, C1), U+FFFD replacement character, Hangul
 * syllable (U+AC00–U+D7A3), or private-use character.
 *
 * Characters are counted per Unicode code point, not per UTF-16 code unit, so
 * a supplementary-plane character (surrogate pair) counts exactly once. This
 * keeps `puaRatio` correct for plane 15/16 private-use characters and keeps
 * `textChars` from being doubled by astral characters.
 *
 * @param {*} page - Page identifier; copied unchanged into the result.
 * @param {string} text - Extracted page text. `null`/`undefined` is treated as empty.
 * @returns {{page: *, textChars: number, hangulRatio: number,
 *   controlCharRatio: number, replacementCharRatio: number, puaRatio: number,
 *   needsOcr: boolean, ocrReason: (string|undefined)}}
 *   `ocrReason` is one of "low_text", "high_pua", "high_control",
 *   "high_replacement", or undefined when `needsOcr` is false.
 */
function computePageQuality(page, text) {
  let total = 0;
  let hangul = 0;
  let control = 0;
  let replacement = 0;
  let pua = 0;
  // String iteration yields whole code points (surrogate pairs stay together).
  for (const ch of text ?? "") {
    const code = ch.codePointAt(0);
    // Plain whitespace is not counted as text at all.
    if (code === 0x20 || code === 0x09 || code === 0x0a || code === 0x0d) continue;
    total++;
    if (code < 0x20 || code === 0x7f || (code >= 0x80 && code <= 0x9f)) {
      control++;
    } else if (code === 0xfffd) {
      replacement++;
    } else if (code >= 0xac00 && code <= 0xd7a3) {
      hangul++;
    } else if (
      (code >= 0xe000 && code <= 0xf8ff) || // BMP private use area
      code >= 0xf0000 || // supplementary private use planes 15 and 16
      (code >= 0xdb80 && code <= 0xdbff) // unpaired private-use high surrogate (malformed input)
    ) {
      pua++;
    }
  }
  // Avoid dividing by zero on pages with no countable characters.
  const denom = total || 1;
  const puaRatio = pua / denom;
  const controlCharRatio = control / denom;
  const replacementCharRatio = replacement / denom;
  let needsOcr = false;
  let ocrReason;
  // The first matching rule wins, so the order of these checks matters.
  if (total < LOW_TEXT_THRESHOLD) {
    needsOcr = true;
    ocrReason = "low_text";
  } else if (puaRatio >= HIGH_PUA_THRESHOLD) {
    needsOcr = true;
    ocrReason = "high_pua";
  } else if (controlCharRatio >= HIGH_CONTROL_THRESHOLD) {
    needsOcr = true;
    ocrReason = "high_control";
  } else if (replacementCharRatio >= HIGH_REPLACEMENT_THRESHOLD) {
    needsOcr = true;
    ocrReason = "high_replacement";
  }
  return {
    page,
    textChars: total,
    hangulRatio: hangul / denom,
    controlCharRatio,
    replacementCharRatio,
    puaRatio,
    needsOcr,
    ocrReason
  };
}
1204
// Thresholds for the PDF text-quality heuristics.

// A page with fewer counted (non-whitespace) characters than this is "low_text".
var LOW_TEXT_THRESHOLD = 20;

// A page whose private-use character ratio is at least this is "high_pua".
var HIGH_PUA_THRESHOLD = 0.2;

// A page whose control character ratio is at least this is "high_control".
var HIGH_CONTROL_THRESHOLD = 0.05;

// A page whose U+FFFD ratio is at least this is "high_replacement".
var HIGH_REPLACEMENT_THRESHOLD = 0.05;

// The document is flagged for OCR when at least this fraction of its pages are OCR candidates.
var DOC_NEEDS_OCR_PAGE_RATIO = 0.3;
1209
/**
 * Remove control characters from a string.
 *
 * Deletes C0 controls except tab, LF and CR, plus DEL (U+007F) and the C1
 * range (U+0080–U+009F). All other characters are left untouched.
 *
 * @param {string} text - Input text.
 * @returns {string} The text with the matched control characters removed.
 */
function stripControlChars(text) {
  const controlChars = /[\x00-\x08\x0B\x0C\x0E-\x1F\x7F\x80-\x9F]/g;
  const cleaned = text.replace(controlChars, "");
  return cleaned;
}
1212
/**
 * Aggregate per-page quality records into a document-level summary.
 *
 * The `avg*` fields are unweighted means of the per-page ratios (each page
 * counts equally, regardless of how much text it has). The document is
 * flagged `needsOcr` when the share of pages with `needsOcr` set reaches
 * DOC_NEEDS_OCR_PAGE_RATIO.
 *
 * @param {Array<object>} pages - Records shaped like the return value of
 *   computePageQuality (page, textChars, hangulRatio, controlCharRatio,
 *   replacementCharRatio, puaRatio, needsOcr).
 * @returns {object} Summary with totalPages, totalTextChars, the four avg*
 *   ratios, lowTextPageCount, highPuaPageCount, needsOcr and
 *   ocrCandidatePages (the `page` values of pages flagged for OCR).
 */
function summarizeDocumentQuality(pages) {
  const pageCount = pages.length;

  // No pages: return an all-zero summary instead of dividing by zero.
  if (pageCount === 0) {
    return {
      totalPages: 0,
      totalTextChars: 0,
      avgHangulRatio: 0,
      avgControlCharRatio: 0,
      avgReplacementCharRatio: 0,
      avgPuaRatio: 0,
      lowTextPageCount: 0,
      highPuaPageCount: 0,
      needsOcr: false,
      ocrCandidatePages: []
    };
  }

  const sums = { chars: 0, hangul: 0, control: 0, replacement: 0, pua: 0 };
  const counts = { lowText: 0, highPua: 0 };
  const candidates = [];

  for (const entry of pages) {
    sums.chars += entry.textChars;
    sums.hangul += entry.hangulRatio;
    sums.control += entry.controlCharRatio;
    sums.replacement += entry.replacementCharRatio;
    sums.pua += entry.puaRatio;
    if (entry.textChars < LOW_TEXT_THRESHOLD) {
      counts.lowText += 1;
    }
    if (entry.puaRatio >= HIGH_PUA_THRESHOLD) {
      counts.highPua += 1;
    }
    if (entry.needsOcr) {
      candidates.push(entry.page);
    }
  }

  const candidateShare = candidates.length / pageCount;
  return {
    totalPages: pageCount,
    totalTextChars: sums.chars,
    avgHangulRatio: sums.hangul / pageCount,
    avgControlCharRatio: sums.control / pageCount,
    avgReplacementCharRatio: sums.replacement / pageCount,
    avgPuaRatio: sums.pua / pageCount,
    lowTextPageCount: counts.lowText,
    highPuaPageCount: counts.highPua,
    needsOcr: candidateShare >= DOC_NEEDS_OCR_PAGE_RATIO,
    ocrCandidatePages: candidates
  };
}
1259
+
1146
1260
  // src/pdf/polyfill.ts
1147
1261
  import * as pdfjsWorker from "pdfjs-dist/legacy/build/pdf.worker.mjs";
1148
1262
  var g = globalThis;
@@ -1198,6 +1312,7 @@ async function parsePdfDocument(buffer, options) {
1198
1312
  await extractPdfMetadata(doc, metadata);
1199
1313
  const blocks = [];
1200
1314
  const warnings = [];
1315
+ const pageQuality = [];
1201
1316
  let totalChars = 0;
1202
1317
  let totalTextBytes = 0;
1203
1318
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
@@ -1225,11 +1340,14 @@ async function parsePdfDocument(buffer, options) {
1225
1340
  const opList = await page.getOperatorList();
1226
1341
  const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
1227
1342
  for (const b of pageBlocks) blocks.push(b);
1343
+ let pageText = "";
1228
1344
  for (const b of pageBlocks) {
1229
1345
  const t = b.text || "";
1230
1346
  totalChars += t.replace(/\s/g, "").length;
1231
1347
  totalTextBytes += t.length * 2;
1348
+ pageText += pageText ? "\n" + t : t;
1232
1349
  }
1350
+ pageQuality.push(computePageQuality(i, pageText));
1233
1351
  if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
1234
1352
  parsedPages++;
1235
1353
  options?.onProgress?.(parsedPages, totalTarget);
@@ -1246,7 +1364,7 @@ async function parsePdfDocument(buffer, options) {
1246
1364
  const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
1247
1365
  if (ocrBlocks.length > 0) {
1248
1366
  const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
1249
- return { markdown: ocrMarkdown, blocks: ocrBlocks, metadata, warnings, isImageBased: true };
1367
+ return { markdown: ocrMarkdown, blocks: ocrBlocks, metadata, warnings, isImageBased: true, pageQuality, qualitySummary: summarizeDocumentQuality(pageQuality) };
1250
1368
  }
1251
1369
  } catch {
1252
1370
  }
@@ -1275,8 +1393,17 @@ async function parsePdfDocument(buffer, options) {
1275
1393
  }
1276
1394
  detectMarkerHeadings(blocks);
1277
1395
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
1396
+ sanitizeBlockControlChars(blocks);
1278
1397
  let markdown = cleanPdfText(blocksToMarkdown(blocks));
1279
- return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
1398
+ return {
1399
+ markdown,
1400
+ blocks,
1401
+ metadata,
1402
+ outline: outline.length > 0 ? outline : void 0,
1403
+ warnings: warnings.length > 0 ? warnings : void 0,
1404
+ pageQuality,
1405
+ qualitySummary: summarizeDocumentQuality(pageQuality)
1406
+ };
1280
1407
  } finally {
1281
1408
  await doc.destroy().catch(() => {
1282
1409
  });
@@ -2080,9 +2207,22 @@ function mergeLineSimple(items) {
2080
2207
  }
2081
2208
  return result;
2082
2209
  }
2210
/**
 * Strip control characters, in place, from every text field in a block tree.
 *
 * For each block this cleans `block.text`, the `text` of every cell in
 * `block.table.cells` (iterated as rows of cells), and then recurses into
 * `block.children`. Empty or missing text values are left as they are.
 *
 * @param {Iterable<object>} blocks - Blocks to clean; mutated in place.
 * @returns {void}
 */
function sanitizeBlockControlChars(blocks) {
  // Only truthy text is rewritten, so "", null and undefined stay unchanged.
  const scrub = (holder) => {
    if (holder.text) holder.text = stripControlChars(holder.text);
  };
  for (const block of blocks) {
    scrub(block);
    const rows = block.table ? block.table.cells : [];
    for (const cells of rows) {
      for (const cell of cells) scrub(cell);
    }
    if (block.children) sanitizeBlockControlChars(block.children);
  }
}
2083
2223
  function cleanPdfText(text) {
2084
2224
  return mergeKoreanLines(
2085
- text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
2225
+ stripControlChars(text).replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
2086
2226
  ).replace(/^(?!\| ---).*$/gm, (line) => {
2087
2227
  if (/^\s*\${1,2}.+\${1,2}\s*$/.test(line)) return line;
2088
2228
  return collapseEvenSpacing(line);
@@ -2412,4 +2552,4 @@ export {
2412
2552
  extractPdfMetadataOnly,
2413
2553
  parsePdfDocument
2414
2554
  };
2415
- //# sourceMappingURL=parser-QMMQ7Y7R.js.map
2555
+ //# sourceMappingURL=parser-XBYGROQB.js.map