kordoc 2.5.2 → 2.7.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (63)
  1. package/README.md +450 -431
  2. package/dist/chunk-4NWDJGAU.js +18955 -0
  3. package/dist/chunk-4NWDJGAU.js.map +1 -0
  4. package/dist/{chunk-NKKLA43G.js → chunk-4SK2PDMQ.js} +14 -3
  5. package/dist/chunk-4SK2PDMQ.js.map +1 -0
  6. package/dist/{chunk-24NKFRB4.js → chunk-LB7E2KDF.js} +14 -3
  7. package/dist/chunk-LB7E2KDF.js.map +1 -0
  8. package/dist/chunk-MEPHGCPQ.js +266 -0
  9. package/dist/chunk-MEPHGCPQ.js.map +1 -0
  10. package/dist/chunk-MOL7MDBG.js +0 -0
  11. package/dist/chunk-MUOQXDZ4.cjs.map +1 -1
  12. package/dist/{chunk-Z65OQP3H.cjs → chunk-Y476BOHI.cjs} +14 -3
  13. package/dist/chunk-Y476BOHI.cjs.map +1 -0
  14. package/dist/cli.js +60 -5
  15. package/dist/cli.js.map +1 -1
  16. package/dist/{detect-I7YIS4Q6.js → detect-RI2MQ33K.js} +6 -2
  17. package/dist/formula-3AQUUIRF.js +1151 -0
  18. package/dist/formula-3AQUUIRF.js.map +1 -0
  19. package/dist/formula-JCNF43NE.js +1153 -0
  20. package/dist/formula-JCNF43NE.js.map +1 -0
  21. package/dist/formula-XGG6ZP42.cjs +1151 -0
  22. package/dist/formula-XGG6ZP42.cjs.map +1 -0
  23. package/dist/index.cjs +14706 -450
  24. package/dist/index.cjs.map +1 -1
  25. package/dist/index.d.cts +73 -2
  26. package/dist/index.d.ts +73 -2
  27. package/dist/index.js +14583 -327
  28. package/dist/index.js.map +1 -1
  29. package/dist/mcp.js +5 -5
  30. package/dist/mcp.js.map +1 -1
  31. package/dist/page-range-3C7UGGEK.cjs.map +1 -1
  32. package/dist/page-range-737B4EZW.js +0 -0
  33. package/dist/{parser-AZYPOKAR.cjs → parser-7OFQ67QL.cjs} +160 -28
  34. package/dist/parser-7OFQ67QL.cjs.map +1 -0
  35. package/dist/{parser-BQKQOIJU.js → parser-DJCMY3OO.js} +136 -4
  36. package/dist/parser-DJCMY3OO.js.map +1 -0
  37. package/dist/{parser-FRROKAB7.js → parser-QMMQ7Y7R.js} +136 -4
  38. package/dist/parser-QMMQ7Y7R.js.map +1 -0
  39. package/dist/{provider-WPIYEALY.js → provider-2SEHU2FM.js} +1 -1
  40. package/dist/provider-2SEHU2FM.js.map +1 -0
  41. package/dist/{provider-7H4CPZYS.js → provider-AKROB7WQ.js} +1 -1
  42. package/dist/provider-AKROB7WQ.js.map +1 -0
  43. package/dist/{provider-YN2SSK4X.cjs → provider-SNONEZNW.cjs} +1 -1
  44. package/dist/provider-SNONEZNW.cjs.map +1 -0
  45. package/dist/setup-57FB3LSP.js +0 -0
  46. package/dist/{watch-ZJAUWUAE.js → watch-FVMVIZ5Q.js} +4 -4
  47. package/dist/watch-FVMVIZ5Q.js.map +1 -0
  48. package/package.json +98 -77
  49. package/dist/chunk-24NKFRB4.js.map +0 -1
  50. package/dist/chunk-2CAJSQK5.js +0 -5052
  51. package/dist/chunk-2CAJSQK5.js.map +0 -1
  52. package/dist/chunk-M3E3C5GS.js +0 -59
  53. package/dist/chunk-M3E3C5GS.js.map +0 -1
  54. package/dist/chunk-NKKLA43G.js.map +0 -1
  55. package/dist/chunk-Z65OQP3H.cjs.map +0 -1
  56. package/dist/parser-AZYPOKAR.cjs.map +0 -1
  57. package/dist/parser-BQKQOIJU.js.map +0 -1
  58. package/dist/parser-FRROKAB7.js.map +0 -1
  59. package/dist/provider-7H4CPZYS.js.map +0 -1
  60. package/dist/provider-WPIYEALY.js.map +0 -1
  61. package/dist/provider-YN2SSK4X.cjs.map +0 -1
  62. package/dist/watch-ZJAUWUAE.js.map +0 -1
  63. package/dist/{detect-I7YIS4Q6.js.map → detect-RI2MQ33K.js.map} +0 -0
@@ -7,7 +7,7 @@ import {
7
7
  blocksToMarkdown,
8
8
  safeMax,
9
9
  safeMin
10
- } from "./chunk-NKKLA43G.js";
10
+ } from "./chunk-4SK2PDMQ.js";
11
11
  import {
12
12
  parsePageRange
13
13
  } from "./chunk-MOL7MDBG.js";
@@ -1189,6 +1189,7 @@ async function loadPdfWithTimeout(buffer) {
1189
1189
  }
1190
1190
  }
1191
1191
  async function parsePdfDocument(buffer, options) {
1192
+ const formulaBuffer = options?.formulaOcr ? buffer.slice(0) : null;
1192
1193
  const doc = await loadPdfWithTimeout(buffer);
1193
1194
  try {
1194
1195
  const pageCount = doc.numPages;
@@ -1241,7 +1242,7 @@ async function parsePdfDocument(buffer, options) {
1241
1242
  if (totalChars / Math.max(parsedPageCount, 1) < 10) {
1242
1243
  if (options?.ocr) {
1243
1244
  try {
1244
- const { ocrPages } = await import("./provider-7H4CPZYS.js");
1245
+ const { ocrPages } = await import("./provider-AKROB7WQ.js");
1245
1246
  const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
1246
1247
  if (ocrBlocks.length > 0) {
1247
1248
  const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
@@ -1258,6 +1259,16 @@ async function parsePdfDocument(buffer, options) {
1258
1259
  blocks.splice(removed[ri], 1);
1259
1260
  }
1260
1261
  }
1262
+ if (options?.formulaOcr && formulaBuffer) {
1263
+ try {
1264
+ await applyFormulaOcr(formulaBuffer, blocks, pageFilter, effectivePageCount, warnings, options.onProgress);
1265
+ } catch (e) {
1266
+ warnings.push({
1267
+ message: `\uC218\uC2DD OCR \uC2E4\uD328: ${e instanceof Error ? e.message : String(e)}`,
1268
+ code: "PARTIAL_PARSE"
1269
+ });
1270
+ }
1271
+ }
1261
1272
  const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
1262
1273
  if (medianFontSize > 0) {
1263
1274
  detectHeadings(blocks, medianFontSize);
@@ -2072,7 +2083,10 @@ function mergeLineSimple(items) {
2072
2083
  function cleanPdfText(text) {
2073
2084
  return mergeKoreanLines(
2074
2085
  text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
2075
- ).replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line)).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
2086
+ ).replace(/^(?!\| ---).*$/gm, (line) => {
2087
+ if (/^\s*\${1,2}.+\${1,2}\s*$/.test(line)) return line;
2088
+ return collapseEvenSpacing(line);
2089
+ }).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
2076
2090
  }
2077
2091
  function startsWithMarker(line) {
2078
2092
  const t = line.trimStart();
@@ -2275,9 +2289,127 @@ function mergeKoreanLines(text) {
2275
2289
  }
2276
2290
  return result.join("\n");
2277
2291
  }
2292
/**
 * Runs the formula OCR pipeline over a PDF buffer and merges the recognised
 * formulas into `blocks` in place.
 *
 * For every page returned by the pipeline this:
 *   1. converts each recognised region into a paragraph block whose text is
 *      the LaTeX wrapped in `$...$` (inline) or `$$...$$` (display),
 *   2. removes existing non-table blocks on that page that are mostly covered
 *      by a formula region, and
 *   3. inserts the formula blocks among the page's remaining blocks by
 *      vertical position.
 *
 * A one-line summary is written to stderr when anything was inserted or removed.
 *
 * @param {*} buffer - PDF bytes handed to `pipeline.runOnBuffer`.
 * @param {Array<object>} blocks - Parsed blocks; mutated in place.
 * @param {*} pageFilter - Forwarded unchanged to `pipeline.runOnBuffer`.
 * @param {number} effectivePageCount - Not read by this function; kept for the caller's signature.
 * @param {Array<object>} warnings - Not read by this function; kept for the caller's signature.
 * @param {Function} [_onProgress] - Not read by this function; kept for the caller's signature.
 * @returns {Promise<void>}
 */
async function applyFormulaOcr(buffer, blocks, pageFilter, effectivePageCount, warnings, _onProgress) {
  const { FormulaPipeline, ensureFormulaModels } = await import("./formula-JCNF43NE.js");
  await ensureFormulaModels(reportFormulaModelProgress);
  const pipeline = await FormulaPipeline.create();
  try {
    const pagesResult = await pipeline.runOnBuffer(buffer, pageFilter);
    if (pagesResult.length === 0) return;

    let insertedCount = 0;
    let removedDupCount = 0;
    for (const page of pagesResult) {
      const candidates = buildFormulaCandidates(page);
      if (candidates.length === 0) continue;

      removedDupCount += removeBlocksCoveredByFormulas(blocks, page.pageNumber, candidates);

      // Larger centerY first. NOTE(review): presumably this is top-of-page
      // first in PDF coordinates — confirm against the block extractor.
      candidates.sort((a, b) => b.centerY - a.centerY);
      for (const candidate of candidates) {
        insertFormulaBlock(blocks, page.pageNumber, candidate);
        insertedCount++;
      }
    }

    if (insertedCount > 0 || removedDupCount > 0) {
      process.stderr.write(
        `[kordoc-formula] ${insertedCount}\uAC1C \uC218\uC2DD \uC0BD\uC785, ${removedDupCount}\uAC1C \uC911\uBCF5 block \uC81C\uAC70 (${pagesResult.length}\uAC1C \uD398\uC774\uC9C0)\n`
      );
    }
  } finally {
    // Best-effort cleanup: a failing destroy() must not mask the real outcome.
    await pipeline.destroy().catch(() => {
    });
  }
}

/**
 * Progress callback for `ensureFormulaModels`: writes download / verify / done
 * status lines to stderr. Any other phase is ignored.
 *
 * @param {object} p - Progress event; reads `phase`, `spec.name`, `downloaded`, `total`.
 */
function reportFormulaModelProgress(p) {
  if (p.phase === "download" && p.total) {
    const pct = Math.floor(p.downloaded / p.total * 100);
    // "\r" rewrites the same terminal line until the download completes.
    process.stderr.write(`\r[kordoc-formula] ${p.spec.name} ${pct}% (${formatMb(p.downloaded)}/${formatMb(p.total)})`);
    if (p.downloaded >= p.total) process.stderr.write("\n");
  } else if (p.phase === "verify") {
    process.stderr.write(`[kordoc-formula] ${p.spec.name} SHA-256 \uAC80\uC99D \uC911...\n`);
  } else if (p.phase === "done") {
    process.stderr.write(`[kordoc-formula] ${p.spec.name} \uC900\uBE44 \uC644\uB8CC\n`);
  }
}

/**
 * Converts one page's recognised regions into insertable formula candidates.
 * Regions with empty LaTeX or without a bounding box are skipped.
 *
 * @param {object} page - Pipeline page result; reads `pageNumber`, `pdfWidth`,
 *   `pdfHeight`, `renderedWidth`, `renderedHeight`, `regions`.
 * @returns {Array<{block: object, pdfBbox: object, centerY: number}>}
 */
function buildFormulaCandidates(page) {
  const { pageNumber, pdfHeight } = page;
  // Rendered-image units -> PDF units. Falls back to 0.5 when the rendered
  // size is missing. NOTE(review): 0.5 presumably matches a 2x render scale — confirm.
  const scaleX = page.renderedWidth > 0 ? page.pdfWidth / page.renderedWidth : 0.5;
  const scaleY = page.renderedHeight > 0 ? page.pdfHeight / page.renderedHeight : 0.5;

  const candidates = [];
  for (const r of page.regions) {
    if (!r.latex || !r.latex.trim()) continue;
    // A region without geometry cannot be placed; skip it instead of throwing
    // and aborting the whole formula pass.
    if (!r.bbox) continue;

    const wrapped = r.kind === "display" ? `$$${r.latex}$$` : `$${r.latex}$`;
    const x1 = r.bbox.x1 * scaleX;
    const x2 = r.bbox.x2 * scaleX;
    // The Y axis is flipped relative to the rendered image (pdfHeight - y).
    const yTop = pdfHeight - r.bbox.y1 * scaleY;
    const yBottom = pdfHeight - r.bbox.y2 * scaleY;

    candidates.push({
      block: {
        type: "paragraph",
        text: wrapped,
        pageNumber,
        bbox: { page: pageNumber, x: x1, y: yBottom, width: x2 - x1, height: yTop - yBottom }
      },
      pdfBbox: { x1, x2, yTop, yBottom },
      centerY: (yTop + yBottom) / 2
    });
  }
  return candidates;
}

/**
 * Removes, in place, every non-table block on `pageNumber` whose bounding box
 * is covered by a single formula candidate for at least 60% of the block's area.
 *
 * @param {Array<object>} blocks - Parsed blocks; mutated in place.
 * @param {number} pageNumber - Page whose blocks are examined.
 * @param {Array<{pdfBbox: object}>} candidates - Formula candidates for that page.
 * @returns {number} Number of blocks removed.
 */
function removeBlocksCoveredByFormulas(blocks, pageNumber, candidates) {
  const OVERLAP_THRESHOLD = 0.6;
  let removed = 0;
  // Walk backwards so splicing never shifts an index that is still to be visited.
  for (let i = blocks.length - 1; i >= 0; i--) {
    const b = blocks[i];
    if (b.pageNumber !== pageNumber) continue;
    if (b.type === "table") continue;
    if (!b.bbox || b.bbox.width <= 0 || b.bbox.height <= 0) continue;
    const blockArea = b.bbox.width * b.bbox.height;
    if (blockArea <= 0) continue;

    const isCovered = candidates.some((c) => {
      const ox1 = Math.max(b.bbox.x, c.pdfBbox.x1);
      const ox2 = Math.min(b.bbox.x + b.bbox.width, c.pdfBbox.x2);
      const oy1 = Math.max(b.bbox.y, c.pdfBbox.yBottom);
      const oy2 = Math.min(b.bbox.y + b.bbox.height, c.pdfBbox.yTop);
      const interArea = Math.max(0, ox2 - ox1) * Math.max(0, oy2 - oy1);
      return interArea / blockArea >= OVERLAP_THRESHOLD;
    });
    if (isCovered) {
      blocks.splice(i, 1);
      removed++;
    }
  }
  return removed;
}

/**
 * Inserts one formula block into `blocks`: before the first block on the page
 * whose vertical centre is below the candidate's centre, otherwise after the
 * page's last block, otherwise (no blocks on that page) at the end of the list.
 *
 * @param {Array<object>} blocks - Parsed blocks; mutated in place.
 * @param {number} pageNumber - Page the candidate belongs to.
 * @param {{block: object, centerY: number}} candidate - Formula to insert.
 */
function insertFormulaBlock(blocks, pageNumber, candidate) {
  let lastIndexOnPage = -1;
  for (let i = 0; i < blocks.length; i++) {
    const b = blocks[i];
    if (b.pageNumber !== pageNumber) continue;
    lastIndexOnPage = i;
    if (!b.bbox) continue;
    const blockCenter = b.bbox.y + b.bbox.height / 2;
    if (blockCenter < candidate.centerY) {
      blocks.splice(i, 0, candidate.block);
      return;
    }
  }
  if (lastIndexOnPage !== -1) {
    blocks.splice(lastIndexOnPage + 1, 0, candidate.block);
  } else {
    blocks.push(candidate.block);
  }
}
2407
/**
 * Formats a byte count as a megabyte label with one decimal place,
 * e.g. 1572864 -> "1.5MB". One "MB" here is 1024 * 1024 bytes.
 *
 * @param {number} bytes - Byte count. A non-finite value (undefined, NaN,
 *   Infinity) is treated as 0 so progress output never shows "NaNMB".
 * @returns {string} The formatted size, such as "0.0MB".
 */
function formatMb(bytes) {
  const safeBytes = Number.isFinite(bytes) ? bytes : 0;
  return `${(safeBytes / 1024 / 1024).toFixed(1)}MB`;
}
2278
2410
  export {
2279
2411
  cleanPdfText,
2280
2412
  extractPdfMetadataOnly,
2281
2413
  parsePdfDocument
2282
2414
  };
2283
- //# sourceMappingURL=parser-FRROKAB7.js.map
2415
+ //# sourceMappingURL=parser-QMMQ7Y7R.js.map