kordoc 2.5.1 → 2.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/README.md +21 -2
  2. package/dist/chunk-5CJGKKMZ.js +266 -0
  3. package/dist/chunk-5CJGKKMZ.js.map +1 -0
  4. package/dist/{chunk-OCVWJSG7.js → chunk-GNN6MHH4.js} +14 -3
  5. package/dist/chunk-GNN6MHH4.js.map +1 -0
  6. package/dist/{chunk-KO7DKAXW.js → chunk-LA66FVBN.js} +14 -3
  7. package/dist/chunk-LA66FVBN.js.map +1 -0
  8. package/dist/chunk-OBSPVJ6A.js +18947 -0
  9. package/dist/chunk-OBSPVJ6A.js.map +1 -0
  10. package/dist/{chunk-TTSFPEDM.cjs → chunk-RFGEEHI4.cjs} +14 -3
  11. package/dist/{chunk-TTSFPEDM.cjs.map → chunk-RFGEEHI4.cjs.map} +1 -1
  12. package/dist/cli.js +60 -5
  13. package/dist/cli.js.map +1 -1
  14. package/dist/{detect-I7YIS4Q6.js → detect-PJZMUL2Z.js} +6 -2
  15. package/dist/formula-3AQUUIRF.js +1151 -0
  16. package/dist/formula-3AQUUIRF.js.map +1 -0
  17. package/dist/formula-JCNF43NE.js +1153 -0
  18. package/dist/formula-JCNF43NE.js.map +1 -0
  19. package/dist/formula-XGG6ZP42.cjs +1151 -0
  20. package/dist/formula-XGG6ZP42.cjs.map +1 -0
  21. package/dist/index.cjs +14743 -465
  22. package/dist/index.cjs.map +1 -1
  23. package/dist/index.d.cts +73 -2
  24. package/dist/index.d.ts +73 -2
  25. package/dist/index.js +14615 -337
  26. package/dist/index.js.map +1 -1
  27. package/dist/mcp.js +5 -5
  28. package/dist/{parser-DA3CGOZF.js → parser-5CJGXQCJ.js} +135 -3
  29. package/dist/parser-5CJGXQCJ.js.map +1 -0
  30. package/dist/{parser-NZFDRZLS.js → parser-6L6DZCOB.js} +135 -3
  31. package/dist/parser-6L6DZCOB.js.map +1 -0
  32. package/dist/{parser-BOIVVDYI.cjs → parser-SRI2TIZX.cjs} +159 -27
  33. package/dist/{parser-BOIVVDYI.cjs.map → parser-SRI2TIZX.cjs.map} +1 -1
  34. package/dist/{watch-HWN6Y6Q2.js → watch-7CTGUDQB.js} +4 -4
  35. package/package.json +25 -4
  36. package/dist/chunk-KO7DKAXW.js.map +0 -1
  37. package/dist/chunk-M3E3C5GS.js +0 -59
  38. package/dist/chunk-M3E3C5GS.js.map +0 -1
  39. package/dist/chunk-OCVWJSG7.js.map +0 -1
  40. package/dist/chunk-QEZ4CUF7.js +0 -5022
  41. package/dist/chunk-QEZ4CUF7.js.map +0 -1
  42. package/dist/parser-DA3CGOZF.js.map +0 -1
  43. package/dist/parser-NZFDRZLS.js.map +0 -1
  44. package/dist/{detect-I7YIS4Q6.js.map → detect-PJZMUL2Z.js.map} +0 -0
  45. package/dist/{watch-HWN6Y6Q2.js.map → watch-7CTGUDQB.js.map} +0 -0
@@ -6,7 +6,7 @@ import {
6
6
  blocksToMarkdown,
7
7
  safeMax,
8
8
  safeMin
9
- } from "./chunk-OCVWJSG7.js";
9
+ } from "./chunk-GNN6MHH4.js";
10
10
  import {
11
11
  parsePageRange
12
12
  } from "./chunk-SBVRCJFH.js";
@@ -1188,6 +1188,7 @@ async function loadPdfWithTimeout(buffer) {
1188
1188
  }
1189
1189
  }
1190
1190
  async function parsePdfDocument(buffer, options) {
1191
+ const formulaBuffer = options?.formulaOcr ? buffer.slice(0) : null;
1191
1192
  const doc = await loadPdfWithTimeout(buffer);
1192
1193
  try {
1193
1194
  const pageCount = doc.numPages;
@@ -1257,6 +1258,16 @@ async function parsePdfDocument(buffer, options) {
1257
1258
  blocks.splice(removed[ri], 1);
1258
1259
  }
1259
1260
  }
1261
+ if (options?.formulaOcr && formulaBuffer) {
1262
+ try {
1263
+ await applyFormulaOcr(formulaBuffer, blocks, pageFilter, effectivePageCount, warnings, options.onProgress);
1264
+ } catch (e) {
1265
+ warnings.push({
1266
+ message: `\uC218\uC2DD OCR \uC2E4\uD328: ${e instanceof Error ? e.message : String(e)}`,
1267
+ code: "PARTIAL_PARSE"
1268
+ });
1269
+ }
1270
+ }
1260
1271
  const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
1261
1272
  if (medianFontSize > 0) {
1262
1273
  detectHeadings(blocks, medianFontSize);
@@ -2071,7 +2082,10 @@ function mergeLineSimple(items) {
2071
2082
  function cleanPdfText(text) {
2072
2083
  return mergeKoreanLines(
2073
2084
  text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
2074
- ).replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line)).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
2085
+ ).replace(/^(?!\| ---).*$/gm, (line) => {
2086
+ if (/^\s*\${1,2}.+\${1,2}\s*$/.test(line)) return line;
2087
+ return collapseEvenSpacing(line);
2088
+ }).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
2075
2089
  }
2076
2090
  function startsWithMarker(line) {
2077
2091
  const t = line.trimStart();
@@ -2274,9 +2288,127 @@ function mergeKoreanLines(text) {
2274
2288
  }
2275
2289
  return result.join("\n");
2276
2290
  }
2291
/**
 * Runs formula OCR over a PDF buffer and merges the recognised formulas into
 * `blocks` in place.
 *
 * For every page returned by the pipeline this:
 *   1. wraps each non-empty LaTeX region as `$…$` or `$$…$$` (for `kind === "display"`),
 *   2. removes existing non-table blocks that are mostly covered by a formula region,
 *   3. inserts the formula paragraphs into the page's block run, ordered by vertical position.
 *
 * A one-line summary is written to stderr when anything was inserted or removed.
 *
 * @param {*} buffer - PDF bytes, passed through to `pipeline.runOnBuffer`.
 * @param {Array<object>} blocks - Parsed blocks; mutated in place.
 * @param {*} pageFilter - Passed through to `pipeline.runOnBuffer`.
 * @param {*} effectivePageCount - Accepted for signature compatibility; not used here.
 * @param {*} warnings - Accepted for signature compatibility; not used here.
 * @param {*} _onProgress - Accepted for signature compatibility; not used here.
 * @returns {Promise<void>}
 * @throws Whatever the formula module, model preparation or pipeline run throws.
 */
async function applyFormulaOcr(buffer, blocks, pageFilter, effectivePageCount, warnings, _onProgress) {
  // Imported dynamically so the formula module is only loaded when this function actually runs.
  const { FormulaPipeline, ensureFormulaModels } = await import("./formula-3AQUUIRF.js");
  await ensureFormulaModels(reportFormulaModelProgress);
  const pipeline = await FormulaPipeline.create();
  try {
    const pagesResult = await pipeline.runOnBuffer(buffer, pageFilter);
    if (pagesResult.length === 0) return;
    let insertedCount = 0;
    let removedDupCount = 0;
    for (const page of pagesResult) {
      const candidates = buildFormulaCandidates(page);
      if (candidates.length === 0) continue;
      removedDupCount += removeBlocksCoveredByFormulas(blocks, page.pageNumber, candidates);
      // Highest centre first, so every insertion sees the formulas above it already in place.
      candidates.sort((a, b) => b.centerY - a.centerY);
      for (const candidate of candidates) {
        const insertAt = findFormulaInsertIndex(blocks, page.pageNumber, candidate.centerY);
        blocks.splice(insertAt, 0, candidate.block);
        insertedCount++;
      }
    }
    if (insertedCount > 0 || removedDupCount > 0) {
      process.stderr.write(
        `[kordoc-formula] ${insertedCount}\uAC1C \uC218\uC2DD \uC0BD\uC785, ${removedDupCount}\uAC1C \uC911\uBCF5 block \uC81C\uAC70 (${pagesResult.length}\uAC1C \uD398\uC774\uC9C0)\n`
      );
    }
  } finally {
    // Best-effort cleanup: a failure while tearing the pipeline down must not mask the OCR outcome.
    await pipeline.destroy().catch(() => {
    });
  }
}

/** Writes model download / verify / done progress to stderr; every other phase is silent. */
function reportFormulaModelProgress(p) {
  switch (p.phase) {
    case "download": {
      if (!p.total) break;
      const pct = Math.floor(p.downloaded / p.total * 100);
      // "\r" rewrites the same terminal line until the download completes.
      process.stderr.write(`\r[kordoc-formula] ${p.spec.name} ${pct}% (${formatMb(p.downloaded)}/${formatMb(p.total)})`);
      if (p.downloaded >= p.total) process.stderr.write("\n");
      break;
    }
    case "verify":
      process.stderr.write(`[kordoc-formula] ${p.spec.name} SHA-256 \uAC80\uC99D \uC911...\n`);
      break;
    case "done":
      process.stderr.write(`[kordoc-formula] ${p.spec.name} \uC900\uBE44 \uC644\uB8CC\n`);
      break;
    default:
      break;
  }
}

/** Converts one page's OCR regions into paragraph blocks with bboxes scaled to PDF units. */
function buildFormulaCandidates(page) {
  const { pageNumber, pdfHeight } = page;
  // Rendered-size → PDF-size factors; 0.5 is the fallback when the rendered size is not positive.
  const scaleX = page.renderedWidth > 0 ? page.pdfWidth / page.renderedWidth : 0.5;
  const scaleY = page.renderedHeight > 0 ? page.pdfHeight / page.renderedHeight : 0.5;
  const candidates = [];
  for (const r of page.regions) {
    if (!r.latex || !r.latex.trim()) continue;
    const wrapped = r.kind === "display" ? `$$${r.latex}$$` : `$${r.latex}$`;
    const x1 = r.bbox.x1 * scaleX;
    const x2 = r.bbox.x2 * scaleX;
    // The y axis is flipped (pdfHeight - y); presumably region boxes grow downward
    // while block boxes grow upward - confirm against the pipeline's output.
    const yTop = pdfHeight - r.bbox.y1 * scaleY;
    const yBottom = pdfHeight - r.bbox.y2 * scaleY;
    candidates.push({
      block: {
        type: "paragraph",
        text: wrapped,
        pageNumber,
        bbox: { page: pageNumber, x: x1, y: yBottom, width: x2 - x1, height: yTop - yBottom }
      },
      pdfBbox: { x1, x2, yTop, yBottom },
      centerY: (yTop + yBottom) / 2
    });
  }
  return candidates;
}

/**
 * Removes this page's non-table blocks whose own area is at least 60% covered by a
 * single formula region, and returns how many were removed.
 */
function removeBlocksCoveredByFormulas(blocks, pageNumber, candidates) {
  const OVERLAP_THRESHOLD = 0.6;
  let removed = 0;
  // Walk backwards so splicing never shifts an index that is still to be visited.
  for (let i = blocks.length - 1; i >= 0; i--) {
    const b = blocks[i];
    if (b.pageNumber !== pageNumber || b.type === "table") continue;
    if (!b.bbox || b.bbox.width <= 0 || b.bbox.height <= 0) continue;
    const blockArea = b.bbox.width * b.bbox.height;
    if (blockArea <= 0) continue;
    const covered = candidates.some((c) => {
      const ox1 = Math.max(b.bbox.x, c.pdfBbox.x1);
      const ox2 = Math.min(b.bbox.x + b.bbox.width, c.pdfBbox.x2);
      const oy1 = Math.max(b.bbox.y, c.pdfBbox.yBottom);
      const oy2 = Math.min(b.bbox.y + b.bbox.height, c.pdfBbox.yTop);
      const interArea = Math.max(0, ox2 - ox1) * Math.max(0, oy2 - oy1);
      return interArea / blockArea >= OVERLAP_THRESHOLD;
    });
    if (covered) {
      blocks.splice(i, 1);
      removed++;
    }
  }
  return removed;
}

/**
 * Picks the index at which a formula with vertical centre `centerY` is inserted:
 *   - before the first same-page block (with a bbox) whose centre lies below it;
 *   - otherwise right after the page's last block;
 *   - otherwise, when the page has no blocks left, before the first block of a later
 *     page, so the formula is not appended after the following pages' content
 *     (assumes `blocks` is in ascending page order - confirm against the caller);
 *   - otherwise at the end of `blocks`.
 */
function findFormulaInsertIndex(blocks, pageNumber, centerY) {
  let pageLastIdx = -1;
  let nextPageIdx = -1;
  for (let i = 0; i < blocks.length; i++) {
    const b = blocks[i];
    if (b.pageNumber !== pageNumber) {
      if (nextPageIdx === -1 && b.pageNumber > pageNumber) nextPageIdx = i;
      continue;
    }
    pageLastIdx = i;
    if (!b.bbox) continue;
    const blockCenter = b.bbox.y + b.bbox.height / 2;
    if (blockCenter < centerY) return i;
  }
  if (pageLastIdx !== -1) return pageLastIdx + 1;
  if (nextPageIdx !== -1) return nextPageIdx;
  return blocks.length;
}
2406
/**
 * Formats a byte count as a megabyte string (1 MB = 1024 * 1024 bytes),
 * e.g. `formatMb(1572864)` returns "1.5MB".
 *
 * @param {number} bytes - Size in bytes.
 * @param {number} [fractionDigits=1] - Digits after the decimal point; the default
 *   reproduces the original fixed one-decimal output.
 * @returns {string} The size followed by the "MB" suffix.
 */
function formatMb(bytes, fractionDigits = 1) {
  const BYTES_PER_KB = 1024;
  const megabytes = bytes / BYTES_PER_KB / BYTES_PER_KB;
  return `${megabytes.toFixed(fractionDigits)}MB`;
}
2277
2409
  export {
2278
2410
  cleanPdfText,
2279
2411
  extractPdfMetadataOnly,
2280
2412
  parsePdfDocument
2281
2413
  };
2282
- //# sourceMappingURL=parser-NZFDRZLS.js.map
2414
+ //# sourceMappingURL=parser-6L6DZCOB.js.map