@clazic/kordoc 2.3.1 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/dist/{batch-provider-PCT4I4LK.js → batch-provider-PNDCSGQW.js} +3 -4
  2. package/dist/batch-provider-PNDCSGQW.js.map +1 -0
  3. package/dist/{chunk-ZOEUKD77.js → chunk-2GFJFTKS.js} +193 -49
  4. package/dist/chunk-2GFJFTKS.js.map +1 -0
  5. package/dist/chunk-4PP34NVQ.js +121 -0
  6. package/dist/chunk-4PP34NVQ.js.map +1 -0
  7. package/dist/{tesseract-provider-WCVJWBUT.js → chunk-7FMKAV4P.js} +4 -4
  8. package/dist/{tesseract-provider-WCVJWBUT.js.map → chunk-7FMKAV4P.js.map} +1 -1
  9. package/dist/chunk-JOGAFNIL.js +153 -0
  10. package/dist/chunk-JOGAFNIL.js.map +1 -0
  11. package/dist/{chunk-W5KUC23B.js → chunk-STIKJGEA.js} +2 -2
  12. package/dist/cli.js +8 -5
  13. package/dist/cli.js.map +1 -1
  14. package/dist/index.cjs +217 -70
  15. package/dist/index.cjs.map +1 -1
  16. package/dist/index.d.cts +11 -6
  17. package/dist/index.d.ts +11 -6
  18. package/dist/index.js +217 -70
  19. package/dist/index.js.map +1 -1
  20. package/dist/mcp.js +5 -2
  21. package/dist/mcp.js.map +1 -1
  22. package/dist/{provider-WYHC4NHI.js → provider-HE727F7Z.js} +19 -131
  23. package/dist/provider-HE727F7Z.js.map +1 -0
  24. package/dist/resolve-QA3VACUP.js +111 -0
  25. package/dist/resolve-QA3VACUP.js.map +1 -0
  26. package/dist/tesseract-provider-MNMZPSGF.js +11 -0
  27. package/dist/{utils-HSF5HI5T.js → utils-FFUQJTTI.js} +2 -2
  28. package/dist/utils-FFUQJTTI.js.map +1 -0
  29. package/dist/{watch-R2JHXDGF.js → watch-2O32L6IF.js} +6 -3
  30. package/dist/{watch-R2JHXDGF.js.map → watch-2O32L6IF.js.map} +1 -1
  31. package/package.json +1 -1
  32. package/dist/batch-provider-PCT4I4LK.js.map +0 -1
  33. package/dist/chunk-ZOEUKD77.js.map +0 -1
  34. package/dist/provider-WYHC4NHI.js.map +0 -1
  35. package/dist/resolve-4FSAQF2S.js +0 -247
  36. package/dist/resolve-4FSAQF2S.js.map +0 -1
  37. /package/dist/{chunk-W5KUC23B.js.map → chunk-STIKJGEA.js.map} +0 -0
  38. /package/dist/{utils-HSF5HI5T.js.map → tesseract-provider-MNMZPSGF.js.map} +0 -0
package/dist/index.cjs CHANGED
@@ -1993,8 +1993,8 @@ function getTesseractFallbackMessage() {
1993
1993
  "\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 \uB0B4\uC7A5 tesseract.js\uB85C OCR\uC744 \uC218\uD589\uD569\uB2C8\uB2E4.",
1994
1994
  "\uB354 \uB098\uC740 \uD488\uC9C8(\uD14C\uC774\uBE14/\uD5E4\uB529 \uAD6C\uC870 \uBCF4\uC874)\uC744 \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
1995
1995
  "",
1996
- " [\uAD8C\uC7A5] Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
1997
- " Codex CLI: npm install -g @openai/codex",
1996
+ " [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
1997
+ " Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
1998
1998
  " Claude CLI: npm install -g @anthropic-ai/claude-code",
1999
1999
  " Ollama: brew install ollama (+ ollama pull gemma4:27b)"
2000
2000
  ].join("\n");
@@ -2004,7 +2004,7 @@ var init_auto_detect = __esm({
2004
2004
  "src/ocr/auto-detect.ts"() {
2005
2005
  "use strict";
2006
2006
  import_child_process = require("child_process");
2007
- CLI_PRIORITY = ["gemini", "codex", "claude", "ollama"];
2007
+ CLI_PRIORITY = ["codex", "gemini", "claude", "ollama"];
2008
2008
  }
2009
2009
  });
2010
2010
 
@@ -2043,7 +2043,7 @@ function callCli(mode, imagePath) {
2043
2043
  const args = buildCliArgs(mode, imagePath);
2044
2044
  const result = (0, import_child_process2.spawnSync)(mode, args, {
2045
2045
  encoding: "utf-8",
2046
- timeout: 18e4,
2046
+ timeout: 6e5,
2047
2047
  maxBuffer: 10 * 1024 * 1024,
2048
2048
  // claude: /tmp에서 실행하여 프로젝트 CLAUDE.md의 규칙 간섭 방지
2049
2049
  ...mode === "claude" ? { cwd: (0, import_os.tmpdir)() } : {}
@@ -2137,7 +2137,7 @@ async function callOllamaApi(imagePath) {
2137
2137
  return data.message?.content || "";
2138
2138
  }
2139
2139
  function stripCodeFence(text) {
2140
- const match = text.match(/^```(?:markdown|md)?\s*\n([\s\S]*?)\n```\s*$/m);
2140
+ const match = text.match(/^```(?:markdown|md)?\s*([\s\S]*?)```\s*$/m);
2141
2141
  return match ? match[1].trim() : text;
2142
2142
  }
2143
2143
  var import_child_process2, import_fs, import_path, import_os, OCR_PROMPT, _tempDir;
@@ -2148,7 +2148,15 @@ var init_cli_provider = __esm({
2148
2148
  import_fs = require("fs");
2149
2149
  import_path = require("path");
2150
2150
  import_os = require("os");
2151
- OCR_PROMPT = "\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.\n\uADDC\uCE59:\n- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)\n- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC\n- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9\n- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9\n- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC\n- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0\n- ```\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825";
2151
+ OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.
2152
+ \uADDC\uCE59:
2153
+ - \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)
2154
+ - \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC
2155
+ - \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9
2156
+ - \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9
2157
+ - \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC
2158
+ - \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0
2159
+ - \`\`\`\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825`;
2152
2160
  _tempDir = null;
2153
2161
  }
2154
2162
  });
@@ -2314,9 +2322,8 @@ async function callBatchCli(mode, imagePaths) {
2314
2322
  ${fileRefs}`;
2315
2323
  let args;
2316
2324
  if (mode === "gemini") {
2317
- args = ["--prompt", prompt, "--yolo"];
2318
- const model = process.env.KORDOC_GEMINI_MODEL;
2319
- if (model) args.push("--model", model);
2325
+ const model = process.env.KORDOC_GEMINI_MODEL ?? "gemini-2.5-flash";
2326
+ args = ["--prompt", prompt, "--yolo", "--model", model];
2320
2327
  } else {
2321
2328
  args = ["--print", prompt];
2322
2329
  const model = process.env.KORDOC_CLAUDE_MODEL;
@@ -2664,22 +2671,22 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
2664
2671
  if (pageFilter && !pageFilter.has(i)) continue;
2665
2672
  pageNumbers.push(i);
2666
2673
  }
2667
- const pageImages = [];
2668
- for (const pageNum of pageNumbers) {
2669
- const page = await doc.getPage(pageNum);
2670
- const image = await renderPageToPng(page);
2671
- pageImages.push({ image, pageNum });
2672
- }
2673
- const batches = [];
2674
- for (let i = 0; i < pageImages.length; i += provider.batchSize) {
2675
- batches.push(pageImages.slice(i, i + provider.batchSize));
2674
+ const pageBatches = [];
2675
+ for (let i = 0; i < pageNumbers.length; i += provider.batchSize) {
2676
+ pageBatches.push(pageNumbers.slice(i, i + provider.batchSize));
2676
2677
  }
2677
2678
  let processed = 0;
2678
- const batchTasks = batches.map((batch, batchIdx) => async () => {
2679
+ const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
2679
2680
  const pageBlocks = [];
2680
2681
  try {
2681
- const results = await provider.processBatch(batch);
2682
- for (const { pageNum } of batch) {
2682
+ const batchImages = [];
2683
+ for (const pageNum of batchPageNums) {
2684
+ const page = await doc.getPage(pageNum);
2685
+ const image = await renderPageToPng(page);
2686
+ batchImages.push({ image, pageNum });
2687
+ }
2688
+ const results = await provider.processBatch(batchImages);
2689
+ for (const { pageNum } of batchImages) {
2683
2690
  const result = results.get(pageNum);
2684
2691
  pageBlocks.push({
2685
2692
  pageNum,
@@ -2687,16 +2694,16 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
2687
2694
  });
2688
2695
  }
2689
2696
  } catch (err) {
2690
- const range = `${batch[0].pageNum}-${batch[batch.length - 1].pageNum}`;
2697
+ const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
2691
2698
  warnings?.push({
2692
2699
  message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2693
2700
  code: "OCR_PAGE_FAILED"
2694
2701
  });
2695
- for (const { pageNum } of batch) {
2702
+ for (const pageNum of batchPageNums) {
2696
2703
  pageBlocks.push({ pageNum, blocks: [] });
2697
2704
  }
2698
2705
  }
2699
- processed += batch.length;
2706
+ processed += batchPageNums.length;
2700
2707
  onProgress?.(processed, pageNumbers.length);
2701
2708
  return { batchIdx, pageBlocks };
2702
2709
  });
@@ -2772,24 +2779,29 @@ function isPdfFile(buffer) {
2772
2779
  const b = magicBytes(buffer);
2773
2780
  return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
2774
2781
  }
2782
+ function isPngFile(buffer) {
2783
+ const b = magicBytes(buffer);
2784
+ return b[0] === 137 && b[1] === 80 && b[2] === 78 && b[3] === 71;
2785
+ }
2775
2786
  function detectFormat(buffer) {
2776
2787
  if (buffer.byteLength < 4) return "unknown";
2777
2788
  if (isZipFile(buffer)) return "hwpx";
2778
2789
  if (isOldHwpFile(buffer)) return "hwp";
2779
2790
  if (isPdfFile(buffer)) return "pdf";
2791
+ if (isPngFile(buffer)) return "image";
2780
2792
  return "unknown";
2781
2793
  }
2782
2794
  async function detectZipFormat(buffer) {
2783
2795
  try {
2784
2796
  const zip = await import_jszip.default.loadAsync(buffer);
2785
- if (zip.file("xl/workbook.xml")) return "xlsx";
2786
- if (zip.file("word/document.xml")) return "docx";
2787
- if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
2797
+ if (zip.file("xl/workbook.xml")) return { format: "xlsx", zip };
2798
+ if (zip.file("word/document.xml")) return { format: "docx", zip };
2799
+ if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return { format: "hwpx", zip };
2788
2800
  const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
2789
- if (hasSection) return "hwpx";
2790
- return "unknown";
2801
+ if (hasSection) return { format: "hwpx", zip };
2802
+ return { format: "unknown", zip: null };
2791
2803
  } catch {
2792
- return "unknown";
2804
+ return { format: "unknown", zip: null };
2793
2805
  }
2794
2806
  }
2795
2807
 
@@ -2798,7 +2810,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
2798
2810
  var import_xmldom = require("@xmldom/xmldom");
2799
2811
 
2800
2812
  // src/utils.ts
2801
- var VERSION = true ? "2.3.1" : "0.0.0-dev";
2813
+ var VERSION = true ? "2.3.2" : "0.0.0-dev";
2802
2814
  function toArrayBuffer(buf) {
2803
2815
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2804
2816
  return buf.buffer;
@@ -2958,12 +2970,16 @@ function buildTableDirect(rows, numRows) {
2958
2970
  return trimAndReturn(grid, numRows, maxCols);
2959
2971
  }
2960
2972
  function trimAndReturn(grid, numRows, maxCols) {
2961
- let effectiveCols = maxCols;
2962
- while (effectiveCols > 0) {
2963
- const colEmpty = grid.every((row) => !row[effectiveCols - 1]?.text?.trim());
2964
- if (!colEmpty) break;
2965
- effectiveCols--;
2973
+ let effectiveCols = 0;
2974
+ for (const row of grid) {
2975
+ for (let c = row.length - 1; c >= effectiveCols; c--) {
2976
+ if (row[c]?.text?.trim()) {
2977
+ effectiveCols = c + 1;
2978
+ break;
2979
+ }
2980
+ }
2966
2981
  }
2982
+ if (effectiveCols === 0) effectiveCols = maxCols;
2967
2983
  if (effectiveCols < maxCols && effectiveCols > 0) {
2968
2984
  const trimmed = grid.map((row) => row.slice(0, effectiveCols));
2969
2985
  return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
@@ -3220,11 +3236,11 @@ function parseStyleElements(doc, map) {
3220
3236
  function stripDtd(xml) {
3221
3237
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
3222
3238
  }
3223
- async function parseHwpxDocument(buffer, options) {
3239
+ async function parseHwpxDocument(buffer, options, existingZip) {
3224
3240
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
3225
3241
  let zip;
3226
3242
  try {
3227
- zip = await import_jszip2.default.loadAsync(buffer);
3243
+ zip = existingZip ?? await import_jszip2.default.loadAsync(buffer);
3228
3244
  } catch {
3229
3245
  return await extractFromBrokenZip(buffer);
3230
3246
  }
@@ -6236,8 +6252,15 @@ var import_pdf2 = require("pdfjs-dist/legacy/build/pdf.mjs");
6236
6252
  import_pdf2.GlobalWorkerOptions.workerSrc = "";
6237
6253
  var MAX_PAGES = 5e3;
6238
6254
  var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
6239
- var PDF_LOAD_TIMEOUT_MS = 3e4;
6255
+ function calcPdfTimeout(bufferSize) {
6256
+ const base = 3e4;
6257
+ const perMb = 500;
6258
+ const mb = bufferSize / (1024 * 1024);
6259
+ return Math.min(base + Math.ceil(mb * perMb), 3e5);
6260
+ }
6240
6261
  async function loadPdfWithTimeout(buffer) {
6262
+ const timeoutMs = calcPdfTimeout(buffer.byteLength);
6263
+ const timeoutSec = Math.round(timeoutMs / 1e3);
6241
6264
  const loadingTask = (0, import_pdf2.getDocument)({
6242
6265
  data: new Uint8Array(buffer),
6243
6266
  useSystemFonts: true,
@@ -6251,8 +6274,8 @@ async function loadPdfWithTimeout(buffer) {
6251
6274
  new Promise((_, reject) => {
6252
6275
  timer = setTimeout(() => {
6253
6276
  loadingTask.destroy();
6254
- reject(new KordocError("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
6255
- }, PDF_LOAD_TIMEOUT_MS);
6277
+ reject(new KordocError(`PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (${timeoutSec}\uCD08 \uCD08\uACFC)`));
6278
+ }, timeoutMs);
6256
6279
  })
6257
6280
  ]);
6258
6281
  } finally {
@@ -6273,11 +6296,15 @@ async function parsePdfDocument(buffer, options) {
6273
6296
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
6274
6297
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
6275
6298
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
6276
- const allFontSizes = [];
6299
+ const fontSizeFreq = /* @__PURE__ */ new Map();
6277
6300
  const pageHeights = /* @__PURE__ */ new Map();
6278
- let parsedPages = 0;
6301
+ const targetPageNums = [];
6279
6302
  for (let i = 1; i <= effectivePageCount; i++) {
6280
6303
  if (pageFilter && !pageFilter.has(i)) continue;
6304
+ targetPageNums.push(i);
6305
+ }
6306
+ let parsedPages = 0;
6307
+ const parseSinglePage = async (i) => {
6281
6308
  try {
6282
6309
  const page = await doc.getPage(i);
6283
6310
  const tc = await page.getTextContent();
@@ -6290,7 +6317,10 @@ async function parsePdfDocument(buffer, options) {
6290
6317
  warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
6291
6318
  }
6292
6319
  for (const item of visible) {
6293
- if (item.fontSize > 0) allFontSizes.push(item.fontSize);
6320
+ if (item.fontSize > 0) {
6321
+ const rounded = Math.round(item.fontSize * 10) / 10;
6322
+ fontSizeFreq.set(rounded, (fontSizeFreq.get(rounded) || 0) + 1);
6323
+ }
6294
6324
  }
6295
6325
  const opList = await page.getOperatorList();
6296
6326
  const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
@@ -6307,12 +6337,23 @@ async function parsePdfDocument(buffer, options) {
6307
6337
  if (pageErr instanceof KordocError) throw pageErr;
6308
6338
  warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
6309
6339
  }
6340
+ };
6341
+ const sampleCount = Math.min(5, targetPageNums.length);
6342
+ for (let si = 0; si < sampleCount; si++) {
6343
+ await parseSinglePage(targetPageNums[si]);
6344
+ }
6345
+ const sampleParsed = parsedPages || sampleCount;
6346
+ const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
6347
+ if (!isImageBased) {
6348
+ for (let si = sampleCount; si < targetPageNums.length; si++) {
6349
+ await parseSinglePage(targetPageNums[si]);
6350
+ }
6310
6351
  }
6311
6352
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
6312
- if (totalChars / Math.max(parsedPageCount, 1) < 10) {
6353
+ if (isImageBased) {
6313
6354
  let ocrProvider = options?.ocr ?? null;
6314
- const ocrMode = options?.ocrMode;
6315
- if (!ocrProvider && ocrMode && ocrMode !== "off") {
6355
+ const ocrMode = options?.ocrMode ?? "auto";
6356
+ if (!ocrProvider && ocrMode !== "off") {
6316
6357
  try {
6317
6358
  const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
6318
6359
  const concurrency = options?.ocrConcurrency ?? 1;
@@ -6364,7 +6405,7 @@ async function parsePdfDocument(buffer, options) {
6364
6405
  blocks.splice(removed[ri], 1);
6365
6406
  }
6366
6407
  }
6367
- const medianFontSize = computeMedianFontSize(allFontSizes);
6408
+ const medianFontSize = computeMedianFromFreq(fontSizeFreq);
6368
6409
  if (medianFontSize > 0) {
6369
6410
  detectHeadings(blocks, medianFontSize);
6370
6411
  }
@@ -6417,11 +6458,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
6417
6458
  }
6418
6459
  return { visible, hiddenCount };
6419
6460
  }
6420
- function computeMedianFontSize(sizes) {
6421
- if (sizes.length === 0) return 0;
6422
- const sorted = [...sizes].sort((a, b) => a - b);
6423
- const mid = Math.floor(sorted.length / 2);
6424
- return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
6461
+ function computeMedianFromFreq(freq) {
6462
+ if (freq.size === 0) return 0;
6463
+ const entries = [...freq.entries()].sort((a, b) => a[0] - b[0]);
6464
+ let total = 0;
6465
+ for (const [, count] of entries) total += count;
6466
+ const mid = total / 2;
6467
+ let cumulative = 0;
6468
+ for (const [size, count] of entries) {
6469
+ cumulative += count;
6470
+ if (cumulative >= mid) return size;
6471
+ }
6472
+ return 0;
6425
6473
  }
6426
6474
  function detectHeadings(blocks, medianFontSize) {
6427
6475
  for (const block of blocks) {
@@ -7224,6 +7272,7 @@ var MAX_SHEETS = 100;
7224
7272
  var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
7225
7273
  var MAX_ROWS2 = 1e4;
7226
7274
  var MAX_COLS2 = 200;
7275
+ var MAX_TOTAL_CELLS = 2e6;
7227
7276
  function cleanNumericValue(raw) {
7228
7277
  if (!/^-?\d+\.\d+$/.test(raw)) return raw;
7229
7278
  const num = parseFloat(raw);
@@ -7407,9 +7456,9 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
7407
7456
  }
7408
7457
  return blocks;
7409
7458
  }
7410
- async function parseXlsxDocument(buffer, options) {
7459
+ async function parseXlsxDocument(buffer, options, existingZip) {
7411
7460
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
7412
- const zip = await import_jszip3.default.loadAsync(buffer);
7461
+ const zip = existingZip ?? await import_jszip3.default.loadAsync(buffer);
7413
7462
  const warnings = [];
7414
7463
  const workbookFile = zip.file("xl/workbook.xml");
7415
7464
  if (!workbookFile) {
@@ -7436,6 +7485,7 @@ async function parseXlsxDocument(buffer, options) {
7436
7485
  }
7437
7486
  const blocks = [];
7438
7487
  const processedSheets = Math.min(sheets.length, MAX_SHEETS);
7488
+ let totalCells = 0;
7439
7489
  for (let i = 0; i < processedSheets; i++) {
7440
7490
  if (pageFilter && !pageFilter.has(i + 1)) continue;
7441
7491
  const sheet = sheets[i];
@@ -7462,6 +7512,11 @@ async function parseXlsxDocument(buffer, options) {
7462
7512
  try {
7463
7513
  const sheetXml = await sheetFile.async("text");
7464
7514
  const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
7515
+ totalCells += maxRow * maxCol;
7516
+ if (totalCells > MAX_TOTAL_CELLS) {
7517
+ warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
7518
+ break;
7519
+ }
7465
7520
  const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
7466
7521
  blocks.push(...sheetBlocks);
7467
7522
  } catch (err) {
@@ -7545,10 +7600,35 @@ function getAttr(el, localName) {
7545
7600
  function parseXml2(text) {
7546
7601
  return new import_xmldom3.DOMParser().parseFromString(text, "text/xml");
7547
7602
  }
7603
+ function buildElementIndex(root) {
7604
+ const index = /* @__PURE__ */ new Map();
7605
+ const walk = (node) => {
7606
+ const children = node.childNodes;
7607
+ for (let i = 0; i < children.length; i++) {
7608
+ const child = children[i];
7609
+ if (child.nodeType === 1) {
7610
+ const el = child;
7611
+ const name = el.localName ?? "";
7612
+ if (name) {
7613
+ let list = index.get(name);
7614
+ if (!list) {
7615
+ list = [];
7616
+ index.set(name, list);
7617
+ }
7618
+ list.push(el);
7619
+ }
7620
+ walk(el);
7621
+ }
7622
+ }
7623
+ };
7624
+ walk(root);
7625
+ return index;
7626
+ }
7548
7627
  function parseStyles(xml) {
7549
7628
  const doc = parseXml2(xml);
7550
7629
  const styles = /* @__PURE__ */ new Map();
7551
- const styleElements = findElements(doc, "style");
7630
+ const idx = buildElementIndex(doc);
7631
+ const styleElements = idx.get("style") ?? [];
7552
7632
  for (const el of styleElements) {
7553
7633
  const styleId = getAttr(el, "styleId");
7554
7634
  if (!styleId) continue;
@@ -7576,7 +7656,8 @@ function parseStyles(xml) {
7576
7656
  function parseNumbering(xml) {
7577
7657
  const doc = parseXml2(xml);
7578
7658
  const abstractNums = /* @__PURE__ */ new Map();
7579
- const abstractElements = findElements(doc, "abstractNum");
7659
+ const idx = buildElementIndex(doc);
7660
+ const abstractElements = idx.get("abstractNum") ?? [];
7580
7661
  for (const el of abstractElements) {
7581
7662
  const abstractNumId = getAttr(el, "abstractNumId");
7582
7663
  if (!abstractNumId) continue;
@@ -7591,7 +7672,7 @@ function parseNumbering(xml) {
7591
7672
  abstractNums.set(abstractNumId, levels);
7592
7673
  }
7593
7674
  const nums = /* @__PURE__ */ new Map();
7594
- const numElements = findElements(doc, "num");
7675
+ const numElements = idx.get("num") ?? [];
7595
7676
  for (const el of numElements) {
7596
7677
  const numId = getAttr(el, "numId");
7597
7678
  if (!numId) continue;
@@ -7835,9 +7916,9 @@ async function extractImages(zip, rels, doc) {
7835
7916
  }
7836
7917
  return { blocks, images };
7837
7918
  }
7838
- async function parseDocxDocument(buffer, options) {
7919
+ async function parseDocxDocument(buffer, options, existingZip) {
7839
7920
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
7840
- const zip = await import_jszip4.default.loadAsync(buffer);
7921
+ const zip = existingZip ?? await import_jszip4.default.loadAsync(buffer);
7841
7922
  const warnings = [];
7842
7923
  const docFile = zip.file("word/document.xml");
7843
7924
  if (!docFile) {
@@ -7927,6 +8008,11 @@ async function parseDocxDocument(buffer, options) {
7927
8008
  };
7928
8009
  }
7929
8010
 
8011
+ // src/index.ts
8012
+ init_cli_provider();
8013
+ init_tesseract_provider();
8014
+ init_markdown_to_blocks();
8015
+
7930
8016
  // src/diff/text-diff.ts
7931
8017
  function similarity(a, b) {
7932
8018
  if (a === b) return 1;
@@ -10443,25 +10529,86 @@ async function parse2(input, options) {
10443
10529
  if (!buffer || buffer.byteLength === 0) {
10444
10530
  return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
10445
10531
  }
10532
+ const MAX_FILE_SIZE = 500 * 1024 * 1024;
10533
+ if (buffer.byteLength > MAX_FILE_SIZE) {
10534
+ return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
10535
+ }
10446
10536
  const format = detectFormat(buffer);
10447
10537
  switch (format) {
10448
10538
  case "hwpx": {
10449
- const zipFormat = await detectZipFormat(buffer);
10450
- if (zipFormat === "xlsx") return parseXlsx(buffer, options);
10451
- if (zipFormat === "docx") return parseDocx(buffer, options);
10452
- return parseHwpx(buffer, options);
10539
+ const { format: zipFormat, zip } = await detectZipFormat(buffer);
10540
+ if (zipFormat === "xlsx") return parseXlsx(buffer, options, zip ?? void 0);
10541
+ if (zipFormat === "docx") return parseDocx(buffer, options, zip ?? void 0);
10542
+ return parseHwpx(buffer, options, zip ?? void 0);
10453
10543
  }
10454
10544
  case "hwp":
10455
10545
  return parseHwp(buffer, options);
10456
10546
  case "pdf":
10457
10547
  return parsePdf(buffer, options);
10548
+ case "image":
10549
+ return parseImage(buffer, options);
10458
10550
  default:
10459
10551
  return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
10460
10552
  }
10461
10553
  }
10462
- async function parseHwpx(buffer, options) {
10554
+ async function parseImage(buffer, options) {
10555
+ const ocrMode = options?.ocrMode || "auto";
10556
+ if (ocrMode === "off") {
10557
+ return { success: false, fileType: "image", error: "OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC774\uBBF8\uC9C0 \uD30C\uC77C\uC744 \uCC98\uB9AC\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "IMAGE_BASED_PDF" };
10558
+ }
10559
+ let ocrProvider;
10560
+ let actualOcrMode = "auto";
10561
+ try {
10562
+ if (ocrMode === "gemini" || ocrMode === "claude" || ocrMode === "codex" || ocrMode === "ollama") {
10563
+ ocrProvider = createCliOcrProvider(ocrMode);
10564
+ actualOcrMode = ocrMode;
10565
+ } else if (ocrMode === "tesseract") {
10566
+ ocrProvider = await createTesseractProvider();
10567
+ actualOcrMode = ocrMode;
10568
+ } else if (ocrMode === "auto") {
10569
+ const modesToTry = ["gemini", "claude", "codex", "ollama"];
10570
+ for (const mode of modesToTry) {
10571
+ try {
10572
+ ocrProvider = createCliOcrProvider(mode);
10573
+ actualOcrMode = mode;
10574
+ break;
10575
+ } catch (e) {
10576
+ console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
10577
+ }
10578
+ }
10579
+ if (!ocrProvider) {
10580
+ ocrProvider = await createTesseractProvider();
10581
+ actualOcrMode = "tesseract";
10582
+ }
10583
+ }
10584
+ if (!ocrProvider) {
10585
+ return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
10586
+ }
10587
+ const imageUint8Array = new Uint8Array(buffer);
10588
+ const ocrResult = await ocrProvider(imageUint8Array, 1, "image/png");
10589
+ if (ocrProvider.terminate) {
10590
+ await ocrProvider.terminate();
10591
+ }
10592
+ const markdown = typeof ocrResult === "string" ? ocrResult : ocrResult.markdown;
10593
+ const blocks = markdownToBlocks(markdown, 1);
10594
+ return {
10595
+ success: true,
10596
+ fileType: "image",
10597
+ markdown,
10598
+ blocks,
10599
+ isImageBased: true,
10600
+ warnings: [{ message: `OCR \uCC98\uB9AC\uB428 (${actualOcrMode})`, code: "OCR_FALLBACK" }]
10601
+ };
10602
+ } catch (err) {
10603
+ if (ocrProvider && ocrProvider.terminate) {
10604
+ await ocrProvider.terminate();
10605
+ }
10606
+ return { success: false, fileType: "image", error: err instanceof Error ? err.message : "\uC774\uBBF8\uC9C0 OCR \uC2E4\uD328", code: classifyError(err) };
10607
+ }
10608
+ }
10609
+ async function parseHwpx(buffer, options, zip) {
10463
10610
  try {
10464
- const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
10611
+ const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
10465
10612
  return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
10466
10613
  } catch (err) {
10467
10614
  return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
@@ -10484,17 +10631,17 @@ async function parsePdf(buffer, options) {
10484
10631
  return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
10485
10632
  }
10486
10633
  }
10487
- async function parseXlsx(buffer, options) {
10634
+ async function parseXlsx(buffer, options, zip) {
10488
10635
  try {
10489
- const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options);
10636
+ const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
10490
10637
  return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
10491
10638
  } catch (err) {
10492
10639
  return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
10493
10640
  }
10494
10641
  }
10495
- async function parseDocx(buffer, options) {
10642
+ async function parseDocx(buffer, options, zip) {
10496
10643
  try {
10497
- const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
10644
+ const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
10498
10645
  return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
10499
10646
  } catch (err) {
10500
10647
  return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };