kordoc 2.5.2 → 2.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45) hide show
  1. package/README.md +21 -2
  2. package/dist/chunk-5CJGKKMZ.js +266 -0
  3. package/dist/chunk-5CJGKKMZ.js.map +1 -0
  4. package/dist/{chunk-24NKFRB4.js → chunk-GNN6MHH4.js} +14 -3
  5. package/dist/chunk-GNN6MHH4.js.map +1 -0
  6. package/dist/{chunk-NKKLA43G.js → chunk-LA66FVBN.js} +14 -3
  7. package/dist/chunk-LA66FVBN.js.map +1 -0
  8. package/dist/chunk-OBSPVJ6A.js +18947 -0
  9. package/dist/chunk-OBSPVJ6A.js.map +1 -0
  10. package/dist/{chunk-Z65OQP3H.cjs → chunk-RFGEEHI4.cjs} +14 -3
  11. package/dist/{chunk-Z65OQP3H.cjs.map → chunk-RFGEEHI4.cjs.map} +1 -1
  12. package/dist/cli.js +60 -5
  13. package/dist/cli.js.map +1 -1
  14. package/dist/{detect-I7YIS4Q6.js → detect-PJZMUL2Z.js} +6 -2
  15. package/dist/formula-3AQUUIRF.js +1151 -0
  16. package/dist/formula-3AQUUIRF.js.map +1 -0
  17. package/dist/formula-JCNF43NE.js +1153 -0
  18. package/dist/formula-JCNF43NE.js.map +1 -0
  19. package/dist/formula-XGG6ZP42.cjs +1151 -0
  20. package/dist/formula-XGG6ZP42.cjs.map +1 -0
  21. package/dist/index.cjs +14703 -455
  22. package/dist/index.cjs.map +1 -1
  23. package/dist/index.d.cts +73 -2
  24. package/dist/index.d.ts +73 -2
  25. package/dist/index.js +14575 -327
  26. package/dist/index.js.map +1 -1
  27. package/dist/mcp.js +5 -5
  28. package/dist/{parser-FRROKAB7.js → parser-5CJGXQCJ.js} +135 -3
  29. package/dist/parser-5CJGXQCJ.js.map +1 -0
  30. package/dist/{parser-BQKQOIJU.js → parser-6L6DZCOB.js} +135 -3
  31. package/dist/parser-6L6DZCOB.js.map +1 -0
  32. package/dist/{parser-AZYPOKAR.cjs → parser-SRI2TIZX.cjs} +159 -27
  33. package/dist/{parser-AZYPOKAR.cjs.map → parser-SRI2TIZX.cjs.map} +1 -1
  34. package/dist/{watch-ZJAUWUAE.js → watch-7CTGUDQB.js} +4 -4
  35. package/package.json +25 -4
  36. package/dist/chunk-24NKFRB4.js.map +0 -1
  37. package/dist/chunk-2CAJSQK5.js +0 -5052
  38. package/dist/chunk-2CAJSQK5.js.map +0 -1
  39. package/dist/chunk-M3E3C5GS.js +0 -59
  40. package/dist/chunk-M3E3C5GS.js.map +0 -1
  41. package/dist/chunk-NKKLA43G.js.map +0 -1
  42. package/dist/parser-BQKQOIJU.js.map +0 -1
  43. package/dist/parser-FRROKAB7.js.map +0 -1
  44. /package/dist/{detect-I7YIS4Q6.js.map → detect-PJZMUL2Z.js.map} +0 -0
  45. /package/dist/{watch-ZJAUWUAE.js.map → watch-7CTGUDQB.js.map} +0 -0
package/dist/mcp.js CHANGED
@@ -8,18 +8,18 @@ import {
8
8
  fillHwpx,
9
9
  markdownToHwpx,
10
10
  parse
11
- } from "./chunk-2CAJSQK5.js";
11
+ } from "./chunk-OBSPVJ6A.js";
12
12
  import {
13
13
  detectFormat,
14
14
  detectZipFormat
15
- } from "./chunk-M3E3C5GS.js";
15
+ } from "./chunk-5CJGKKMZ.js";
16
16
  import {
17
17
  KordocError,
18
18
  VERSION,
19
19
  blocksToMarkdown,
20
20
  sanitizeError,
21
21
  toArrayBuffer
22
- } from "./chunk-NKKLA43G.js";
22
+ } from "./chunk-LA66FVBN.js";
23
23
  import "./chunk-MOL7MDBG.js";
24
24
 
25
25
  // src/mcp.ts
@@ -178,7 +178,7 @@ server.tool(
178
178
  let metadata;
179
179
  let effectiveFormat = format;
180
180
  if (format === "hwpx") {
181
- const { detectZipFormat: detectZipFormat2 } = await import("./detect-I7YIS4Q6.js");
181
+ const { detectZipFormat: detectZipFormat2 } = await import("./detect-PJZMUL2Z.js");
182
182
  const zipFormat = await detectZipFormat2(buffer);
183
183
  if (zipFormat === "xlsx" || zipFormat === "docx") effectiveFormat = zipFormat;
184
184
  }
@@ -191,7 +191,7 @@ server.tool(
191
191
  break;
192
192
  case "pdf":
193
193
  try {
194
- const { extractPdfMetadataOnly } = await import("./parser-FRROKAB7.js");
194
+ const { extractPdfMetadataOnly } = await import("./parser-5CJGXQCJ.js");
195
195
  metadata = await extractPdfMetadataOnly(buffer);
196
196
  } catch {
197
197
  metadata = void 0;
@@ -7,7 +7,7 @@ import {
7
7
  blocksToMarkdown,
8
8
  safeMax,
9
9
  safeMin
10
- } from "./chunk-NKKLA43G.js";
10
+ } from "./chunk-LA66FVBN.js";
11
11
  import {
12
12
  parsePageRange
13
13
  } from "./chunk-MOL7MDBG.js";
@@ -1189,6 +1189,7 @@ async function loadPdfWithTimeout(buffer) {
1189
1189
  }
1190
1190
  }
1191
1191
  async function parsePdfDocument(buffer, options) {
1192
+ const formulaBuffer = options?.formulaOcr ? buffer.slice(0) : null;
1192
1193
  const doc = await loadPdfWithTimeout(buffer);
1193
1194
  try {
1194
1195
  const pageCount = doc.numPages;
@@ -1258,6 +1259,16 @@ async function parsePdfDocument(buffer, options) {
1258
1259
  blocks.splice(removed[ri], 1);
1259
1260
  }
1260
1261
  }
1262
+ if (options?.formulaOcr && formulaBuffer) {
1263
+ try {
1264
+ await applyFormulaOcr(formulaBuffer, blocks, pageFilter, effectivePageCount, warnings, options.onProgress);
1265
+ } catch (e) {
1266
+ warnings.push({
1267
+ message: `\uC218\uC2DD OCR \uC2E4\uD328: ${e instanceof Error ? e.message : String(e)}`,
1268
+ code: "PARTIAL_PARSE"
1269
+ });
1270
+ }
1271
+ }
1261
1272
  const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
1262
1273
  if (medianFontSize > 0) {
1263
1274
  detectHeadings(blocks, medianFontSize);
@@ -2072,7 +2083,10 @@ function mergeLineSimple(items) {
2072
2083
  function cleanPdfText(text) {
2073
2084
  return mergeKoreanLines(
2074
2085
  text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
2075
- ).replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line)).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
2086
+ ).replace(/^(?!\| ---).*$/gm, (line) => {
2087
+ if (/^\s*\${1,2}.+\${1,2}\s*$/.test(line)) return line;
2088
+ return collapseEvenSpacing(line);
2089
+ }).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
2076
2090
  }
2077
2091
  function startsWithMarker(line) {
2078
2092
  const t = line.trimStart();
@@ -2275,9 +2289,127 @@ function mergeKoreanLines(text) {
2275
2289
  }
2276
2290
  return result.join("\n");
2277
2291
  }
2292
+ async function applyFormulaOcr(buffer, blocks, pageFilter, effectivePageCount, warnings, _onProgress) {
2293
+ const formulaMod = await import("./formula-JCNF43NE.js");
2294
+ const { FormulaPipeline, ensureFormulaModels } = formulaMod;
2295
+ await ensureFormulaModels((p) => {
2296
+ if (p.phase === "download" && p.total) {
2297
+ const pct = Math.floor(p.downloaded / p.total * 100);
2298
+ process.stderr.write(`\r[kordoc-formula] ${p.spec.name} ${pct}% (${formatMb(p.downloaded)}/${formatMb(p.total)})`);
2299
+ if (p.downloaded >= p.total) process.stderr.write("\n");
2300
+ } else if (p.phase === "verify") {
2301
+ process.stderr.write(`[kordoc-formula] ${p.spec.name} SHA-256 \uAC80\uC99D \uC911...
2302
+ `);
2303
+ } else if (p.phase === "done") {
2304
+ process.stderr.write(`[kordoc-formula] ${p.spec.name} \uC900\uBE44 \uC644\uB8CC
2305
+ `);
2306
+ } else if (p.phase === "skip") {
2307
+ }
2308
+ });
2309
+ const pipeline = await FormulaPipeline.create();
2310
+ try {
2311
+ const pagesResult = await pipeline.runOnBuffer(buffer, pageFilter);
2312
+ if (pagesResult.length === 0) return;
2313
+ let insertedCount = 0;
2314
+ let removedDupCount = 0;
2315
+ for (const page of pagesResult) {
2316
+ const pageNumber = page.pageNumber;
2317
+ const pdfHeight = page.pdfHeight;
2318
+ const scaleX = page.renderedWidth > 0 ? page.pdfWidth / page.renderedWidth : 0.5;
2319
+ const scaleY = page.renderedHeight > 0 ? page.pdfHeight / page.renderedHeight : 0.5;
2320
+ const candidates = [];
2321
+ for (const r of page.regions) {
2322
+ if (!r.latex || !r.latex.trim()) continue;
2323
+ const wrapped = r.kind === "display" ? `$$${r.latex}$$` : `$${r.latex}$`;
2324
+ const x1 = r.bbox.x1 * scaleX;
2325
+ const x2 = r.bbox.x2 * scaleX;
2326
+ const yTop = pdfHeight - r.bbox.y1 * scaleY;
2327
+ const yBottom = pdfHeight - r.bbox.y2 * scaleY;
2328
+ const centerY = (yTop + yBottom) / 2;
2329
+ const width = x2 - x1;
2330
+ const height = yTop - yBottom;
2331
+ candidates.push({
2332
+ block: {
2333
+ type: "paragraph",
2334
+ text: wrapped,
2335
+ pageNumber,
2336
+ bbox: { page: pageNumber, x: x1, y: yBottom, width, height }
2337
+ },
2338
+ pdfBbox: { x1, x2, yTop, yBottom },
2339
+ centerY
2340
+ });
2341
+ }
2342
+ if (candidates.length === 0) continue;
2343
+ const OVERLAP_THRESHOLD = 0.6;
2344
+ const indicesToRemove = /* @__PURE__ */ new Set();
2345
+ for (let i = 0; i < blocks.length; i++) {
2346
+ const b = blocks[i];
2347
+ if (b.pageNumber !== pageNumber) continue;
2348
+ if (b.type === "table") continue;
2349
+ if (!b.bbox || b.bbox.width <= 0 || b.bbox.height <= 0) continue;
2350
+ const blockArea = b.bbox.width * b.bbox.height;
2351
+ if (blockArea <= 0) continue;
2352
+ for (const c of candidates) {
2353
+ const ox1 = Math.max(b.bbox.x, c.pdfBbox.x1);
2354
+ const ox2 = Math.min(b.bbox.x + b.bbox.width, c.pdfBbox.x2);
2355
+ const oy1 = Math.max(b.bbox.y, c.pdfBbox.yBottom);
2356
+ const oy2 = Math.min(b.bbox.y + b.bbox.height, c.pdfBbox.yTop);
2357
+ const interArea = Math.max(0, ox2 - ox1) * Math.max(0, oy2 - oy1);
2358
+ if (interArea / blockArea >= OVERLAP_THRESHOLD) {
2359
+ indicesToRemove.add(i);
2360
+ break;
2361
+ }
2362
+ }
2363
+ }
2364
+ if (indicesToRemove.size > 0) {
2365
+ const sorted = [...indicesToRemove].sort((a, b) => b - a);
2366
+ for (const idx of sorted) blocks.splice(idx, 1);
2367
+ removedDupCount += indicesToRemove.size;
2368
+ }
2369
+ candidates.sort((a, b) => b.centerY - a.centerY);
2370
+ for (const c of candidates) {
2371
+ let insertIdx = -1;
2372
+ let pageFirstIdx = -1;
2373
+ let pageLastIdx = -1;
2374
+ for (let i = 0; i < blocks.length; i++) {
2375
+ const b = blocks[i];
2376
+ if (b.pageNumber !== pageNumber) continue;
2377
+ if (pageFirstIdx === -1) pageFirstIdx = i;
2378
+ pageLastIdx = i;
2379
+ if (!b.bbox) continue;
2380
+ const blockCenter = b.bbox.y + b.bbox.height / 2;
2381
+ if (blockCenter < c.centerY) {
2382
+ insertIdx = i;
2383
+ break;
2384
+ }
2385
+ }
2386
+ if (insertIdx !== -1) {
2387
+ blocks.splice(insertIdx, 0, c.block);
2388
+ } else if (pageLastIdx !== -1) {
2389
+ blocks.splice(pageLastIdx + 1, 0, c.block);
2390
+ } else {
2391
+ blocks.push(c.block);
2392
+ }
2393
+ insertedCount++;
2394
+ }
2395
+ }
2396
+ if (insertedCount > 0 || removedDupCount > 0) {
2397
+ process.stderr.write(
2398
+ `[kordoc-formula] ${insertedCount}\uAC1C \uC218\uC2DD \uC0BD\uC785, ${removedDupCount}\uAC1C \uC911\uBCF5 block \uC81C\uAC70 (${pagesResult.length}\uAC1C \uD398\uC774\uC9C0)
2399
+ `
2400
+ );
2401
+ }
2402
+ } finally {
2403
+ await pipeline.destroy().catch(() => {
2404
+ });
2405
+ }
2406
+ }
2407
+ function formatMb(bytes) {
2408
+ return `${(bytes / 1024 / 1024).toFixed(1)}MB`;
2409
+ }
2278
2410
  export {
2279
2411
  cleanPdfText,
2280
2412
  extractPdfMetadataOnly,
2281
2413
  parsePdfDocument
2282
2414
  };
2283
- //# sourceMappingURL=parser-FRROKAB7.js.map
2415
+ //# sourceMappingURL=parser-5CJGXQCJ.js.map