@clazic/kordoc 2.3.0 → 2.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/dist/{batch-provider-FUCIIS4M.js → batch-provider-PNDCSGQW.js} +59 -30
  2. package/dist/batch-provider-PNDCSGQW.js.map +1 -0
  3. package/dist/{chunk-2ZGLFZCN.js → chunk-2GFJFTKS.js} +193 -49
  4. package/dist/chunk-2GFJFTKS.js.map +1 -0
  5. package/dist/chunk-4PP34NVQ.js +121 -0
  6. package/dist/chunk-4PP34NVQ.js.map +1 -0
  7. package/dist/{tesseract-provider-WCVJWBUT.js → chunk-7FMKAV4P.js} +4 -4
  8. package/dist/{tesseract-provider-WCVJWBUT.js.map → chunk-7FMKAV4P.js.map} +1 -1
  9. package/dist/chunk-JOGAFNIL.js +153 -0
  10. package/dist/chunk-JOGAFNIL.js.map +1 -0
  11. package/dist/{chunk-WWILSVMJ.js → chunk-STIKJGEA.js} +2 -2
  12. package/dist/cli.js +10 -5
  13. package/dist/cli.js.map +1 -1
  14. package/dist/index.cjs +291 -103
  15. package/dist/index.cjs.map +1 -1
  16. package/dist/index.d.cts +11 -6
  17. package/dist/index.d.ts +11 -6
  18. package/dist/index.js +292 -104
  19. package/dist/index.js.map +1 -1
  20. package/dist/mcp.js +5 -2
  21. package/dist/mcp.js.map +1 -1
  22. package/dist/{provider-OBY3XFSZ.js → provider-HE727F7Z.js} +38 -139
  23. package/dist/provider-HE727F7Z.js.map +1 -0
  24. package/dist/resolve-QA3VACUP.js +111 -0
  25. package/dist/resolve-QA3VACUP.js.map +1 -0
  26. package/dist/tesseract-provider-MNMZPSGF.js +11 -0
  27. package/dist/{utils-QAK24RJS.js → utils-FFUQJTTI.js} +2 -2
  28. package/dist/utils-FFUQJTTI.js.map +1 -0
  29. package/dist/{watch-MPHX3QIH.js → watch-2O32L6IF.js} +6 -3
  30. package/dist/{watch-MPHX3QIH.js.map → watch-2O32L6IF.js.map} +1 -1
  31. package/package.json +1 -1
  32. package/dist/batch-provider-FUCIIS4M.js.map +0 -1
  33. package/dist/chunk-2ZGLFZCN.js.map +0 -1
  34. package/dist/provider-OBY3XFSZ.js.map +0 -1
  35. package/dist/resolve-LBFYRHJI.js +0 -247
  36. package/dist/resolve-LBFYRHJI.js.map +0 -1
  37. /package/dist/{chunk-WWILSVMJ.js.map → chunk-STIKJGEA.js.map} +0 -0
  38. /package/dist/{utils-QAK24RJS.js.map → tesseract-provider-MNMZPSGF.js.map} +0 -0
@@ -2,7 +2,7 @@
2
2
  import "./chunk-ZWE3DS7E.js";
3
3
 
4
4
  // src/ocr/batch-provider.ts
5
- import { spawnSync } from "child_process";
5
+ import { spawn } from "child_process";
6
6
  import { writeFileSync, readFileSync, unlinkSync, mkdirSync } from "fs";
7
7
  import { join } from "path";
8
8
  import { tmpdir } from "os";
@@ -36,9 +36,9 @@ function createBatchCliProvider(mode, batchSize) {
36
36
  }
37
37
  let output;
38
38
  if (mode === "codex") {
39
- output = callBatchCodexCli(tempFiles);
39
+ output = await callBatchCodexCli(tempFiles);
40
40
  } else {
41
- output = callBatchCli(mode, tempFiles);
41
+ output = await callBatchCli(mode, tempFiles);
42
42
  }
43
43
  const cleaned = stripCodeFence(output.trim());
44
44
  const parts = cleaned.split(/<!--\s*PAGE_BREAK\s*-->/).map((p) => p.trim()).filter((p) => p.length > 0);
@@ -60,40 +60,74 @@ function createBatchCliProvider(mode, batchSize) {
60
60
  }
61
61
  };
62
62
  }
63
- function callBatchCli(mode, imagePaths) {
63
/**
 * Runs an external command without blocking the event loop and collects its
 * output.
 *
 * @param {string} cmd - Executable to spawn.
 * @param {string[]} args - Arguments passed to the executable.
 * @param {{ timeoutMs: number, cwd?: string, stdin?: string }} opts
 *   `timeoutMs` is the delay before the child is sent SIGTERM; `cwd` is the
 *   optional working directory; `stdin`, when given, is written to the child's
 *   stdin before it is closed.
 * @returns {Promise<{ stdout: string, stderr: string, exitCode: number }>}
 *   Resolves once the child's stdio has closed. `exitCode` is 1 when the child
 *   reported no exit code (for example, it was terminated by a signal).
 * @throws {Error} Rejects with a timeout error when the timer fired, or with
 *   the spawn error (for example ENOENT) when the process could not be started.
 */
function spawnAsync(cmd, args, opts) {
  // Grace period between SIGTERM and SIGKILL for children that ignore SIGTERM.
  const FORCE_KILL_DELAY_MS = 5e3;
  return new Promise((resolve, reject) => {
    const child = spawn(cmd, args, {
      cwd: opts.cwd,
      env: process.env,
      stdio: ["pipe", "pipe", "pipe"]
    });
    let stdout = "";
    let stderr = "";
    let timedOut = false;
    let settled = false;
    let timer;
    let forceKillTimer;

    // "error" may be followed by "close"; settle exactly once and always
    // release both timers so they cannot keep the event loop alive.
    const settle = (fn, value) => {
      if (settled) return;
      settled = true;
      clearTimeout(timer);
      clearTimeout(forceKillTimer);
      fn(value);
    };

    child.stdout.setEncoding("utf-8");
    child.stderr.setEncoding("utf-8");
    child.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });

    // A child that exits (or never starts) before consuming stdin raises an
    // error on the pipe; without a listener that is an uncaught exception.
    // The outcome is still reported through "close" / "error" below.
    child.stdin.on("error", () => {});

    timer = setTimeout(() => {
      timedOut = true;
      child.kill("SIGTERM");
      // Escalate if the child ignores SIGTERM, otherwise "close" never fires
      // and the promise would hang past the timeout.
      forceKillTimer = setTimeout(() => {
        child.kill("SIGKILL");
      }, FORCE_KILL_DELAY_MS);
    }, opts.timeoutMs);

    if (opts.stdin !== void 0) {
      child.stdin.end(opts.stdin);
    } else {
      child.stdin.end();
    }

    child.on("close", (code) => {
      if (timedOut) {
        settle(reject, new Error(`\uD0C0\uC784\uC544\uC6C3 (${Math.round(opts.timeoutMs / 1e3)}\uCD08)`));
      } else {
        settle(resolve, { stdout, stderr, exitCode: code ?? 1 });
      }
    });
    child.on("error", (err) => {
      settle(reject, err);
    });
  });
}
104
/**
 * Runs one batch OCR request through the `gemini` or `claude` CLI.
 *
 * Every image path is appended to the prompt as an `@path` reference, one per
 * line. The timeout is 60 s plus 20 s per image.
 *
 * @param {"gemini" | "claude"} mode - CLI executable to invoke.
 * @param {string[]} imagePaths - Paths of the page images to reference.
 * @returns {Promise<string>} The CLI's stdout (empty string when none).
 * @throws {Error} When the CLI cannot be started, times out, or exits with a
 *   non-zero code. All three cases carry the same `${mode} ... OCR` failure
 *   prefix; start/timeout failures keep the underlying error as `cause`.
 */
async function callBatchCli(mode, imagePaths) {
  const fileRefs = imagePaths.map((p) => `@${p}`).join("\n");
  const prompt = `${BATCH_OCR_PROMPT}\n\n${fileRefs}`;

  let args;
  if (mode === "gemini") {
    // `||` rather than `??`: an empty KORDOC_GEMINI_MODEL must fall back to
    // the default instead of being passed to the CLI as `--model ""`.
    const model = process.env.KORDOC_GEMINI_MODEL || "gemini-2.5-flash";
    args = ["--prompt", prompt, "--yolo", "--model", model];
  } else {
    args = ["--print", prompt];
    const model = process.env.KORDOC_CLAUDE_MODEL;
    if (model) args.push("--model", model);
  }

  const timeoutMs = 6e4 + imagePaths.length * 2e4;
  let result;
  try {
    result = await spawnAsync(mode, args, {
      timeoutMs,
      // claude is run from the OS temp directory; gemini keeps the current cwd.
      ...(mode === "claude" ? { cwd: tmpdir() } : {})
    });
  } catch (err) {
    // Spawn failures and timeouts get the same prefix as non-zero exits so
    // callers see which CLI failed.
    const reason = err instanceof Error ? err.message : String(err);
    throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${reason}`, { cause: err });
  }

  if (result.exitCode !== 0) {
    const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
    throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
  }
  return result.stdout || "";
}
95
- function callBatchCodexCli(imagePaths) {
96
- const outPath = join(tmpdir(), `kordoc-codex-batch-${Date.now()}.txt`);
129
+ async function callBatchCodexCli(imagePaths) {
130
+ const outPath = join(tmpdir(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`);
97
131
  try {
98
132
  const args = ["exec", BATCH_OCR_PROMPT];
99
133
  for (const p of imagePaths) {
@@ -103,17 +137,12 @@ function callBatchCodexCli(imagePaths) {
103
137
  const model = process.env.KORDOC_CODEX_MODEL;
104
138
  if (model) args.push("--model", model);
105
139
  const timeoutMs = 6e4 + imagePaths.length * 2e4;
106
- const result = spawnSync("codex", args, {
107
- encoding: "utf-8",
108
- timeout: timeoutMs,
109
- maxBuffer: 50 * 1024 * 1024,
110
- input: ""
140
+ const result = await spawnAsync("codex", args, {
141
+ timeoutMs,
142
+ stdin: ""
111
143
  });
112
- if (result.error) {
113
- throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${result.error.message}`);
114
- }
115
- if (result.status !== 0) {
116
- const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
144
+ if (result.exitCode !== 0) {
145
+ const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
117
146
  throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
118
147
  }
119
148
  try {
@@ -136,4 +165,4 @@ export {
136
165
  DEFAULT_BATCH_SIZES,
137
166
  createBatchCliProvider
138
167
  };
139
- //# sourceMappingURL=batch-provider-FUCIIS4M.js.map
168
+ //# sourceMappingURL=batch-provider-PNDCSGQW.js.map
@@ -0,0 +1 @@
1
+ {"version":3,"sources":["../src/ocr/batch-provider.ts"],"sourcesContent":["/**\n * CLI 배치 OCR 프로바이더\n *\n * 여러 페이지 이미지를 단일 CLI 호출로 처리하여 API 호출 수를 대폭 감소.\n * gemini/claude: @file 멀티 참조, codex: --image 멀티 플래그\n *\n * 299페이지 기준:\n * - 기존: CLI 299회 호출 (~30분)\n * - 배치: CLI 3~6회 호출 (~3분)\n */\n\nimport { spawn } from \"child_process\"\nimport { writeFileSync, readFileSync, unlinkSync, mkdirSync } from \"fs\"\nimport { join } from \"path\"\nimport { tmpdir } from \"os\"\nimport type { StructuredOcrResult, BatchOcrProvider } from \"../types.js\"\n\n/** 배치 OCR 프롬프트 */\nconst BATCH_OCR_PROMPT =\n \"다음 문서 페이지 이미지들을 OCR하여 순수 Markdown으로 변환하세요.\\n\\n\" +\n \"규칙:\\n\" +\n \"- 각 페이지 결과 사이에 반드시 이 구분자를 삽입: <!-- PAGE_BREAK -->\\n\" +\n \"- 테이블은 Markdown 테이블 문법 사용 (| 구분, |---|---| 헤더 구분선 포함)\\n\" +\n \"- 병합된 셀은 해당 위치에 내용 기재\\n\" +\n \"- 헤딩은 글자 크기에 따라 ## ~ ###### 사용\\n\" +\n \"- 리스트는 - 또는 1. 사용\\n\" +\n \"- 이미지, 도형 등 비텍스트 요소는 무시\\n\" +\n \"- 원문의 읽기 순서와 구조를 유지\\n\" +\n \"- ```로 감싸지 말고 순수 Markdown만 출력\"\n\n/** 모드별 기본 배치 크기 (CLI 내부 타임아웃 + 실측 기반)\n *\n * gemini CLI: 10장 이상에서 AbortError 발생 (내부 타임아웃).\n * 5장 배치가 안정적으로 동작 확인 (35초/배치).\n * 299페이지 = 60배치 = 기존 299회 대비 80% 감소.\n */\nexport const DEFAULT_BATCH_SIZES: Record<string, number> = {\n gemini: 5,\n claude: 5,\n codex: 10,\n}\n\n/** 임시 디렉토리 — gemini CLI는 cwd 하위 + gitignore 밖만 @참조 가능 */\nlet _batchTempDir: string | null = null\nfunction getBatchTempDir(): string {\n if (!_batchTempDir) {\n _batchTempDir = join(process.cwd(), \"_kordoc_ocr_tmp\")\n mkdirSync(_batchTempDir, { recursive: true })\n }\n return _batchTempDir\n}\n\n/**\n * 배치 CLI 프로바이더 생성\n */\nexport function createBatchCliProvider(\n mode: \"gemini\" | \"claude\" | \"codex\",\n batchSize: number\n): BatchOcrProvider {\n return {\n __batch: true as const,\n batchSize,\n async processBatch(pages) {\n const results = new Map<number, StructuredOcrResult>()\n const tempDir = getBatchTempDir()\n const tempFiles: string[] = []\n\n try {\n // 1. 
Write all page images to temp files\n for (const { image, pageNum } of pages) {\n const path = join(tempDir, `batch-p${pageNum}.png`)\n writeFileSync(path, image)\n tempFiles.push(path)\n }\n\n // 2. Call CLI with all file references (비동기 — 병렬 배치 실행 가능)\n let output: string\n if (mode === \"codex\") {\n output = await callBatchCodexCli(tempFiles)\n } else {\n output = await callBatchCli(mode, tempFiles)\n }\n\n // 3. Parse response by PAGE_BREAK separator\n const cleaned = stripCodeFence(output.trim())\n const parts = cleaned.split(/<!--\\s*PAGE_BREAK\\s*-->/)\n .map(p => p.trim())\n .filter(p => p.length > 0)\n\n // 4. Map results to page numbers (best-effort if count mismatch)\n for (let i = 0; i < pages.length; i++) {\n const pageNum = pages[i].pageNum\n if (i < parts.length) {\n results.set(pageNum, { markdown: parts[i] })\n }\n // If fewer parts than pages, remaining pages get no result\n }\n } finally {\n // 5. Clean up temp files\n for (const f of tempFiles) {\n try { unlinkSync(f) } catch { /* ignore */ }\n }\n }\n\n return results\n },\n }\n}\n\n/**\n * 비동기 CLI 실행 헬퍼 — spawn + Promise 래핑.\n * spawnSync는 이벤트 루프를 차단하여 병렬 배치 실행 불가.\n */\nfunction spawnAsync(\n cmd: string,\n args: string[],\n opts: { timeoutMs: number; cwd?: string; stdin?: string }\n): Promise<{ stdout: string; stderr: string; exitCode: number }> {\n return new Promise((resolve, reject) => {\n const child = spawn(cmd, args, {\n cwd: opts.cwd,\n env: process.env,\n stdio: [\"pipe\", \"pipe\", \"pipe\"],\n })\n\n let stdout = \"\"\n let stderr = \"\"\n let killed = false\n\n child.stdout.setEncoding(\"utf-8\")\n child.stderr.setEncoding(\"utf-8\")\n child.stdout.on(\"data\", (d: string) => { stdout += d })\n child.stderr.on(\"data\", (d: string) => { stderr += d })\n\n const timer = setTimeout(() => {\n killed = true\n child.kill(\"SIGTERM\")\n }, opts.timeoutMs)\n\n if (opts.stdin !== undefined) {\n child.stdin.end(opts.stdin)\n } else {\n child.stdin.end()\n }\n\n child.on(\"close\", (code) 
=> {\n clearTimeout(timer)\n if (killed) {\n reject(new Error(`타임아웃 (${Math.round(opts.timeoutMs / 1000)}초)`))\n } else {\n resolve({ stdout, stderr, exitCode: code ?? 1 })\n }\n })\n child.on(\"error\", (err) => {\n clearTimeout(timer)\n reject(err)\n })\n })\n}\n\n/** gemini/claude 배치 호출 (비동기) */\nasync function callBatchCli(mode: \"gemini\" | \"claude\", imagePaths: string[]): Promise<string> {\n const fileRefs = imagePaths.map(p => `@${p}`).join(\"\\n\")\n const prompt = `${BATCH_OCR_PROMPT}\\n\\n${fileRefs}`\n\n let args: string[]\n if (mode === \"gemini\") {\n const model = process.env.KORDOC_GEMINI_MODEL ?? \"gemini-2.5-flash\"\n args = [\"--prompt\", prompt, \"--yolo\", \"--model\", model]\n } else {\n args = [\"--print\", prompt]\n const model = process.env.KORDOC_CLAUDE_MODEL\n if (model) args.push(\"--model\", model)\n }\n\n const timeoutMs = 60_000 + imagePaths.length * 20_000\n const result = await spawnAsync(mode, args, {\n timeoutMs,\n ...(mode === \"claude\" ? { cwd: tmpdir() } : {}),\n })\n\n if (result.exitCode !== 0) {\n const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`\n throw new Error(`${mode} 배치 OCR 실패: ${errMsg}`)\n }\n\n return result.stdout || \"\"\n}\n\n/** codex 배치 호출 (비동기) — --image를 여러 번 지정 */\nasync function callBatchCodexCli(imagePaths: string[]): Promise<string> {\n const outPath = join(tmpdir(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`)\n try {\n const args = [\"exec\", BATCH_OCR_PROMPT]\n for (const p of imagePaths) {\n args.push(\"--image\", p)\n }\n args.push(\"--output-last-message\", outPath)\n const model = process.env.KORDOC_CODEX_MODEL\n if (model) args.push(\"--model\", model)\n\n const timeoutMs = 60_000 + imagePaths.length * 20_000\n const result = await spawnAsync(\"codex\", args, {\n timeoutMs,\n stdin: \"\",\n })\n\n if (result.exitCode !== 0) {\n const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`\n throw new Error(`codex 배치 OCR 실패: 
${errMsg}`)\n }\n\n try {\n return readFileSync(outPath, \"utf-8\")\n } catch {\n return result.stdout || \"\"\n }\n } finally {\n try { unlinkSync(outPath) } catch { /* ignore */ }\n }\n}\n\n/** LLM 출력에서 코드 펜스 제거 (cli-provider.ts와 동일 로직) */\nfunction stripCodeFence(text: string): string {\n const match = text.match(/^```(?:markdown|md)?\\s*\\n([\\s\\S]*?)\\n```\\s*$/m)\n return match ? match[1].trim() : text\n}\n"],"mappings":";;;;AAWA,SAAS,aAAa;AACtB,SAAS,eAAe,cAAc,YAAY,iBAAiB;AACnE,SAAS,YAAY;AACrB,SAAS,cAAc;AAIvB,IAAM,mBACJ;AAiBK,IAAM,sBAA8C;AAAA,EACzD,QAAQ;AAAA,EACR,QAAQ;AAAA,EACR,OAAO;AACT;AAGA,IAAI,gBAA+B;AACnC,SAAS,kBAA0B;AACjC,MAAI,CAAC,eAAe;AAClB,oBAAgB,KAAK,QAAQ,IAAI,GAAG,iBAAiB;AACrD,cAAU,eAAe,EAAE,WAAW,KAAK,CAAC;AAAA,EAC9C;AACA,SAAO;AACT;AAKO,SAAS,uBACd,MACA,WACkB;AAClB,SAAO;AAAA,IACL,SAAS;AAAA,IACT;AAAA,IACA,MAAM,aAAa,OAAO;AACxB,YAAM,UAAU,oBAAI,IAAiC;AACrD,YAAM,UAAU,gBAAgB;AAChC,YAAM,YAAsB,CAAC;AAE7B,UAAI;AAEF,mBAAW,EAAE,OAAO,QAAQ,KAAK,OAAO;AACtC,gBAAM,OAAO,KAAK,SAAS,UAAU,OAAO,MAAM;AAClD,wBAAc,MAAM,KAAK;AACzB,oBAAU,KAAK,IAAI;AAAA,QACrB;AAGA,YAAI;AACJ,YAAI,SAAS,SAAS;AACpB,mBAAS,MAAM,kBAAkB,SAAS;AAAA,QAC5C,OAAO;AACL,mBAAS,MAAM,aAAa,MAAM,SAAS;AAAA,QAC7C;AAGA,cAAM,UAAU,eAAe,OAAO,KAAK,CAAC;AAC5C,cAAM,QAAQ,QAAQ,MAAM,yBAAyB,EAClD,IAAI,OAAK,EAAE,KAAK,CAAC,EACjB,OAAO,OAAK,EAAE,SAAS,CAAC;AAG3B,iBAAS,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;AACrC,gBAAM,UAAU,MAAM,CAAC,EAAE;AACzB,cAAI,IAAI,MAAM,QAAQ;AACpB,oBAAQ,IAAI,SAAS,EAAE,UAAU,MAAM,CAAC,EAAE,CAAC;AAAA,UAC7C;AAAA,QAEF;AAAA,MACF,UAAE;AAEA,mBAAW,KAAK,WAAW;AACzB,cAAI;AAAE,uBAAW,CAAC;AAAA,UAAE,QAAQ;AAAA,UAAe;AAAA,QAC7C;AAAA,MACF;AAEA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAMA,SAAS,WACP,KACA,MACA,MAC+D;AAC/D,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,QAAQ,MAAM,KAAK,MAAM;AAAA,MAC7B,KAAK,KAAK;AAAA,MACV,KAAK,QAAQ;AAAA,MACb,OAAO,CAAC,QAAQ,QAAQ,MAAM;AAAA,IAChC,CAAC;AAED,QAAI,SAAS;AACb,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,UAAM,OAAO,YAAY,OAAO;AAChC,UAAM,OAAO,YAAY,OAAO;AAChC,UAAM,OAAO,GAAG,QAAQ,CAAC,MAAc;AAAE,gBAAU;AAAA,IAAE,CAAC;AACtD,UAAM,OAAO,GAAG,QA
AQ,CAAC,MAAc;AAAE,gBAAU;AAAA,IAAE,CAAC;AAEtD,UAAM,QAAQ,WAAW,MAAM;AAC7B,eAAS;AACT,YAAM,KAAK,SAAS;AAAA,IACtB,GAAG,KAAK,SAAS;AAEjB,QAAI,KAAK,UAAU,QAAW;AAC5B,YAAM,MAAM,IAAI,KAAK,KAAK;AAAA,IAC5B,OAAO;AACL,YAAM,MAAM,IAAI;AAAA,IAClB;AAEA,UAAM,GAAG,SAAS,CAAC,SAAS;AAC1B,mBAAa,KAAK;AAClB,UAAI,QAAQ;AACV,eAAO,IAAI,MAAM,6BAAS,KAAK,MAAM,KAAK,YAAY,GAAI,CAAC,SAAI,CAAC;AAAA,MAClE,OAAO;AACL,gBAAQ,EAAE,QAAQ,QAAQ,UAAU,QAAQ,EAAE,CAAC;AAAA,MACjD;AAAA,IACF,CAAC;AACD,UAAM,GAAG,SAAS,CAAC,QAAQ;AACzB,mBAAa,KAAK;AAClB,aAAO,GAAG;AAAA,IACZ,CAAC;AAAA,EACH,CAAC;AACH;AAGA,eAAe,aAAa,MAA2B,YAAuC;AAC5F,QAAM,WAAW,WAAW,IAAI,OAAK,IAAI,CAAC,EAAE,EAAE,KAAK,IAAI;AACvD,QAAM,SAAS,GAAG,gBAAgB;AAAA;AAAA,EAAO,QAAQ;AAEjD,MAAI;AACJ,MAAI,SAAS,UAAU;AACrB,UAAM,QAAQ,QAAQ,IAAI,uBAAuB;AACjD,WAAO,CAAC,YAAY,QAAQ,UAAU,WAAW,KAAK;AAAA,EACxD,OAAO;AACL,WAAO,CAAC,WAAW,MAAM;AACzB,UAAM,QAAQ,QAAQ,IAAI;AAC1B,QAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AAAA,EACvC;AAEA,QAAM,YAAY,MAAS,WAAW,SAAS;AAC/C,QAAM,SAAS,MAAM,WAAW,MAAM,MAAM;AAAA,IAC1C;AAAA,IACA,GAAI,SAAS,WAAW,EAAE,KAAK,OAAO,EAAE,IAAI,CAAC;AAAA,EAC/C,CAAC;AAED,MAAI,OAAO,aAAa,GAAG;AACzB,UAAM,SAAS,OAAO,QAAQ,KAAK,KAAK,aAAa,OAAO,QAAQ;AACpE,UAAM,IAAI,MAAM,GAAG,IAAI,mCAAe,MAAM,EAAE;AAAA,EAChD;AAEA,SAAO,OAAO,UAAU;AAC1B;AAGA,eAAe,kBAAkB,YAAuC;AACtE,QAAM,UAAU,KAAK,OAAO,GAAG,sBAAsB,KAAK,IAAI,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,CAAC,CAAC,MAAM;AAC5G,MAAI;AACF,UAAM,OAAO,CAAC,QAAQ,gBAAgB;AACtC,eAAW,KAAK,YAAY;AAC1B,WAAK,KAAK,WAAW,CAAC;AAAA,IACxB;AACA,SAAK,KAAK,yBAAyB,OAAO;AAC1C,UAAM,QAAQ,QAAQ,IAAI;AAC1B,QAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AAErC,UAAM,YAAY,MAAS,WAAW,SAAS;AAC/C,UAAM,SAAS,MAAM,WAAW,SAAS,MAAM;AAAA,MAC7C;AAAA,MACA,OAAO;AAAA,IACT,CAAC;AAED,QAAI,OAAO,aAAa,GAAG;AACzB,YAAM,SAAS,OAAO,QAAQ,KAAK,KAAK,aAAa,OAAO,QAAQ;AACpE,YAAM,IAAI,MAAM,wCAAoB,MAAM,EAAE;AAAA,IAC9C;AAEA,QAAI;AACF,aAAO,aAAa,SAAS,OAAO;AAAA,IACtC,QAAQ;AACN,aAAO,OAAO,UAAU;AAAA,IAC1B;AAAA,EACF,UAAE;AACA,QAAI;AAAE,iBAAW,OAAO;AAAA,IAAE,QAAQ;AAAA,IAAe;AAAA,EACnD;AACF;AAGA,SAAS,eAAe,MAAsB;AAC5C,QAAM,QAAQ,KAAK,MAAM,+CAA+C;AACxE,SAAO,QAAQ,MAAM,CAA
C,EAAE,KAAK,IAAI;AACnC;","names":[]}
@@ -6,10 +6,19 @@ import {
6
6
  precheckZipSize,
7
7
  sanitizeHref,
8
8
  toArrayBuffer
9
- } from "./chunk-WWILSVMJ.js";
9
+ } from "./chunk-STIKJGEA.js";
10
10
  import {
11
11
  parsePageRange
12
12
  } from "./chunk-MOL7MDBG.js";
13
+ import {
14
+ createTesseractProvider
15
+ } from "./chunk-7FMKAV4P.js";
16
+ import {
17
+ createCliOcrProvider
18
+ } from "./chunk-JOGAFNIL.js";
19
+ import {
20
+ markdownToBlocks
21
+ } from "./chunk-4PP34NVQ.js";
13
22
  import {
14
23
  __commonJS,
15
24
  __require,
@@ -1918,24 +1927,29 @@ function isPdfFile(buffer) {
1918
1927
  const b = magicBytes(buffer);
1919
1928
  return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
1920
1929
  }
1930
/**
 * Reports whether the buffer starts with the PNG magic prefix
 * (0x89 followed by ASCII "PNG").
 *
 * @param {*} buffer - Input handed to `magicBytes`.
 * @returns {boolean} True when the first four magic bytes match.
 */
function isPngFile(buffer) {
  const PNG_PREFIX = [137, 80, 78, 71];
  const head = magicBytes(buffer);
  return PNG_PREFIX.every((expected, i) => head[i] === expected);
}
1921
1934
  function detectFormat(buffer) {
1922
1935
  if (buffer.byteLength < 4) return "unknown";
1923
1936
  if (isZipFile(buffer)) return "hwpx";
1924
1937
  if (isOldHwpFile(buffer)) return "hwp";
1925
1938
  if (isPdfFile(buffer)) return "pdf";
1939
+ if (isPngFile(buffer)) return "image";
1926
1940
  return "unknown";
1927
1941
  }
1928
1942
  async function detectZipFormat(buffer) {
1929
1943
  try {
1930
1944
  const zip = await JSZip.loadAsync(buffer);
1931
- if (zip.file("xl/workbook.xml")) return "xlsx";
1932
- if (zip.file("word/document.xml")) return "docx";
1933
- if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
1945
+ if (zip.file("xl/workbook.xml")) return { format: "xlsx", zip };
1946
+ if (zip.file("word/document.xml")) return { format: "docx", zip };
1947
+ if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return { format: "hwpx", zip };
1934
1948
  const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
1935
- if (hasSection) return "hwpx";
1936
- return "unknown";
1949
+ if (hasSection) return { format: "hwpx", zip };
1950
+ return { format: "unknown", zip: null };
1937
1951
  } catch {
1938
- return "unknown";
1952
+ return { format: "unknown", zip: null };
1939
1953
  }
1940
1954
  }
1941
1955
 
@@ -2024,12 +2038,16 @@ function buildTableDirect(rows, numRows) {
2024
2038
  return trimAndReturn(grid, numRows, maxCols);
2025
2039
  }
2026
2040
  function trimAndReturn(grid, numRows, maxCols) {
2027
- let effectiveCols = maxCols;
2028
- while (effectiveCols > 0) {
2029
- const colEmpty = grid.every((row) => !row[effectiveCols - 1]?.text?.trim());
2030
- if (!colEmpty) break;
2031
- effectiveCols--;
2041
+ let effectiveCols = 0;
2042
+ for (const row of grid) {
2043
+ for (let c = row.length - 1; c >= effectiveCols; c--) {
2044
+ if (row[c]?.text?.trim()) {
2045
+ effectiveCols = c + 1;
2046
+ break;
2047
+ }
2048
+ }
2032
2049
  }
2050
+ if (effectiveCols === 0) effectiveCols = maxCols;
2033
2051
  if (effectiveCols < maxCols && effectiveCols > 0) {
2034
2052
  const trimmed = grid.map((row) => row.slice(0, effectiveCols));
2035
2053
  return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
@@ -2289,11 +2307,11 @@ function parseStyleElements(doc, map) {
2289
2307
  function stripDtd(xml) {
2290
2308
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
2291
2309
  }
2292
- async function parseHwpxDocument(buffer, options) {
2310
+ async function parseHwpxDocument(buffer, options, existingZip) {
2293
2311
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
2294
2312
  let zip;
2295
2313
  try {
2296
- zip = await JSZip2.loadAsync(buffer);
2314
+ zip = existingZip ?? await JSZip2.loadAsync(buffer);
2297
2315
  } catch {
2298
2316
  return await extractFromBrokenZip(buffer);
2299
2317
  }
@@ -5328,8 +5346,15 @@ import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mj
5328
5346
  GlobalWorkerOptions.workerSrc = "";
5329
5347
  var MAX_PAGES = 5e3;
5330
5348
  var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
5331
- var PDF_LOAD_TIMEOUT_MS = 3e4;
5349
/**
 * Computes the PDF load timeout from the input size: 30 s base plus 500 ms
 * per MiB, capped at 300 s.
 *
 * @param {number} bufferSize - Input size in bytes. Missing, NaN, or
 *   non-positive values are treated as 0 so the result is never NaN (a NaN
 *   delay makes `setTimeout` fire immediately).
 * @returns {number} Timeout in milliseconds, between 30000 and 300000.
 */
function calcPdfTimeout(bufferSize) {
  const BASE_MS = 3e4;
  const PER_MB_MS = 500;
  const MAX_MS = 3e5;
  // `x > 0` is false for NaN and undefined, so invalid sizes fall back to 0.
  const bytes = bufferSize > 0 ? bufferSize : 0;
  const mb = bytes / (1024 * 1024);
  return Math.min(BASE_MS + Math.ceil(mb * PER_MB_MS), MAX_MS);
}
5332
5355
  async function loadPdfWithTimeout(buffer) {
5356
+ const timeoutMs = calcPdfTimeout(buffer.byteLength);
5357
+ const timeoutSec = Math.round(timeoutMs / 1e3);
5333
5358
  const loadingTask = getDocument({
5334
5359
  data: new Uint8Array(buffer),
5335
5360
  useSystemFonts: true,
@@ -5343,8 +5368,8 @@ async function loadPdfWithTimeout(buffer) {
5343
5368
  new Promise((_, reject) => {
5344
5369
  timer = setTimeout(() => {
5345
5370
  loadingTask.destroy();
5346
- reject(new KordocError("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
5347
- }, PDF_LOAD_TIMEOUT_MS);
5371
+ reject(new KordocError(`PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (${timeoutSec}\uCD08 \uCD08\uACFC)`));
5372
+ }, timeoutMs);
5348
5373
  })
5349
5374
  ]);
5350
5375
  } finally {
@@ -5365,11 +5390,15 @@ async function parsePdfDocument(buffer, options) {
5365
5390
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
5366
5391
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
5367
5392
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
5368
- const allFontSizes = [];
5393
+ const fontSizeFreq = /* @__PURE__ */ new Map();
5369
5394
  const pageHeights = /* @__PURE__ */ new Map();
5370
- let parsedPages = 0;
5395
+ const targetPageNums = [];
5371
5396
  for (let i = 1; i <= effectivePageCount; i++) {
5372
5397
  if (pageFilter && !pageFilter.has(i)) continue;
5398
+ targetPageNums.push(i);
5399
+ }
5400
+ let parsedPages = 0;
5401
+ const parseSinglePage = async (i) => {
5373
5402
  try {
5374
5403
  const page = await doc.getPage(i);
5375
5404
  const tc = await page.getTextContent();
@@ -5382,7 +5411,10 @@ async function parsePdfDocument(buffer, options) {
5382
5411
  warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
5383
5412
  }
5384
5413
  for (const item of visible) {
5385
- if (item.fontSize > 0) allFontSizes.push(item.fontSize);
5414
+ if (item.fontSize > 0) {
5415
+ const rounded = Math.round(item.fontSize * 10) / 10;
5416
+ fontSizeFreq.set(rounded, (fontSizeFreq.get(rounded) || 0) + 1);
5417
+ }
5386
5418
  }
5387
5419
  const opList = await page.getOperatorList();
5388
5420
  const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
@@ -5399,14 +5431,25 @@ async function parsePdfDocument(buffer, options) {
5399
5431
  if (pageErr instanceof KordocError) throw pageErr;
5400
5432
  warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
5401
5433
  }
5434
+ };
5435
+ const sampleCount = Math.min(5, targetPageNums.length);
5436
+ for (let si = 0; si < sampleCount; si++) {
5437
+ await parseSinglePage(targetPageNums[si]);
5438
+ }
5439
+ const sampleParsed = parsedPages || sampleCount;
5440
+ const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
5441
+ if (!isImageBased) {
5442
+ for (let si = sampleCount; si < targetPageNums.length; si++) {
5443
+ await parseSinglePage(targetPageNums[si]);
5444
+ }
5402
5445
  }
5403
5446
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
5404
- if (totalChars / Math.max(parsedPageCount, 1) < 10) {
5447
+ if (isImageBased) {
5405
5448
  let ocrProvider = options?.ocr ?? null;
5406
- const ocrMode = options?.ocrMode;
5407
- if (!ocrProvider && ocrMode && ocrMode !== "off") {
5449
+ const ocrMode = options?.ocrMode ?? "auto";
5450
+ if (!ocrProvider && ocrMode !== "off") {
5408
5451
  try {
5409
- const { resolveOcrProvider } = await import("./resolve-LBFYRHJI.js");
5452
+ const { resolveOcrProvider } = await import("./resolve-QA3VACUP.js");
5410
5453
  const concurrency = options?.ocrConcurrency ?? 1;
5411
5454
  const batchSize = options?.ocrBatchSize;
5412
5455
  ocrProvider = await resolveOcrProvider(ocrMode, warnings, concurrency, batchSize);
@@ -5422,7 +5465,7 @@ async function parsePdfDocument(buffer, options) {
5422
5465
  if (ocrProvider) {
5423
5466
  let ocrBlocks = [];
5424
5467
  try {
5425
- const { ocrPages } = await import("./provider-OBY3XFSZ.js");
5468
+ const { ocrPages } = await import("./provider-HE727F7Z.js");
5426
5469
  const concurrency = options?.ocrConcurrency ?? 1;
5427
5470
  ocrBlocks = await ocrPages(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
5428
5471
  } catch {
@@ -5456,7 +5499,7 @@ async function parsePdfDocument(buffer, options) {
5456
5499
  blocks.splice(removed[ri], 1);
5457
5500
  }
5458
5501
  }
5459
- const medianFontSize = computeMedianFontSize(allFontSizes);
5502
+ const medianFontSize = computeMedianFromFreq(fontSizeFreq);
5460
5503
  if (medianFontSize > 0) {
5461
5504
  detectHeadings(blocks, medianFontSize);
5462
5505
  }
@@ -5520,11 +5563,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
5520
5563
  }
5521
5564
  return { visible, hiddenCount };
5522
5565
  }
5523
- function computeMedianFontSize(sizes) {
5524
- if (sizes.length === 0) return 0;
5525
- const sorted = [...sizes].sort((a, b) => a - b);
5526
- const mid = Math.floor(sorted.length / 2);
5527
- return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
5566
/**
 * Computes the median of a multiset stored as a value-to-count map.
 *
 * @param {Map<number, number>} freq - Maps each value to how many times it
 *   occurred. Entries with a non-positive count are ignored.
 * @returns {number} The median value, or 0 when there are no samples. For an
 *   even number of samples whose two middle samples fall in different
 *   buckets, the midpoint of those two values is returned.
 */
function computeMedianFromFreq(freq) {
  if (freq.size === 0) return 0;
  const buckets = [...freq.entries()]
    .filter(([, count]) => count > 0)
    .sort((a, b) => a[0] - b[0]);

  let total = 0;
  for (const [, count] of buckets) total += count;
  if (total === 0) return 0;

  const half = total / 2;
  let cumulative = 0;
  for (let i = 0; i < buckets.length; i++) {
    const [size, count] = buckets[i];
    cumulative += count;
    // Both middle samples (or the single middle one) lie in this bucket.
    if (cumulative > half) return size;
    if (cumulative === half) {
      // Even total: the lower middle sample is the last of this bucket and
      // the upper middle sample is the first of the next one.
      const next = buckets[i + 1];
      return next ? (size + next[0]) / 2 : size;
    }
  }
  return 0;
}
5529
5579
  function detectHeadings(blocks, medianFontSize) {
5530
5580
  for (const block of blocks) {
@@ -6330,6 +6380,7 @@ var MAX_SHEETS = 100;
6330
6380
  var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
6331
6381
  var MAX_ROWS2 = 1e4;
6332
6382
  var MAX_COLS2 = 200;
6383
+ var MAX_TOTAL_CELLS = 2e6;
6333
6384
  function cleanNumericValue(raw) {
6334
6385
  if (!/^-?\d+\.\d+$/.test(raw)) return raw;
6335
6386
  const num = parseFloat(raw);
@@ -6513,9 +6564,9 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
6513
6564
  }
6514
6565
  return blocks;
6515
6566
  }
6516
- async function parseXlsxDocument(buffer, options) {
6567
+ async function parseXlsxDocument(buffer, options, existingZip) {
6517
6568
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
6518
- const zip = await JSZip3.loadAsync(buffer);
6569
+ const zip = existingZip ?? await JSZip3.loadAsync(buffer);
6519
6570
  const warnings = [];
6520
6571
  const workbookFile = zip.file("xl/workbook.xml");
6521
6572
  if (!workbookFile) {
@@ -6542,6 +6593,7 @@ async function parseXlsxDocument(buffer, options) {
6542
6593
  }
6543
6594
  const blocks = [];
6544
6595
  const processedSheets = Math.min(sheets.length, MAX_SHEETS);
6596
+ let totalCells = 0;
6545
6597
  for (let i = 0; i < processedSheets; i++) {
6546
6598
  if (pageFilter && !pageFilter.has(i + 1)) continue;
6547
6599
  const sheet = sheets[i];
@@ -6568,6 +6620,11 @@ async function parseXlsxDocument(buffer, options) {
6568
6620
  try {
6569
6621
  const sheetXml = await sheetFile.async("text");
6570
6622
  const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
6623
+ totalCells += maxRow * maxCol;
6624
+ if (totalCells > MAX_TOTAL_CELLS) {
6625
+ warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
6626
+ break;
6627
+ }
6571
6628
  const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
6572
6629
  blocks.push(...sheetBlocks);
6573
6630
  } catch (err) {
@@ -6651,10 +6708,35 @@ function getAttr(el, localName) {
6651
6708
  function parseXml2(text) {
6652
6709
  return new DOMParser3().parseFromString(text, "text/xml");
6653
6710
  }
6711
/**
 * Builds a lookup from element local name to every descendant element with
 * that name, in document (pre-order) order.
 *
 * The traversal uses an explicit stack rather than recursion, so deeply
 * nested XML cannot overflow the call stack.
 *
 * @param {{ childNodes: ArrayLike<object> }} root - Node whose descendants
 *   are indexed. The root itself is not added to the index.
 * @returns {Map<string, object[]>} Elements grouped by `localName`. Elements
 *   with an empty or missing `localName` are traversed but not indexed.
 */
function buildElementIndex(root) {
  const ELEMENT_NODE = 1;
  const index = new Map();
  const pending = [];

  // Push element children in reverse so they are popped in document order.
  const pushElementChildren = (node) => {
    const children = node.childNodes;
    if (!children) return;
    for (let i = children.length - 1; i >= 0; i--) {
      const child = children[i];
      if (child && child.nodeType === ELEMENT_NODE) pending.push(child);
    }
  };

  pushElementChildren(root);
  while (pending.length > 0) {
    const el = pending.pop();
    const name = el.localName ?? "";
    if (name) {
      const bucket = index.get(name);
      if (bucket) {
        bucket.push(el);
      } else {
        index.set(name, [el]);
      }
    }
    pushElementChildren(el);
  }
  return index;
}
6654
6735
  function parseStyles(xml) {
6655
6736
  const doc = parseXml2(xml);
6656
6737
  const styles = /* @__PURE__ */ new Map();
6657
- const styleElements = findElements(doc, "style");
6738
+ const idx = buildElementIndex(doc);
6739
+ const styleElements = idx.get("style") ?? [];
6658
6740
  for (const el of styleElements) {
6659
6741
  const styleId = getAttr(el, "styleId");
6660
6742
  if (!styleId) continue;
@@ -6682,7 +6764,8 @@ function parseStyles(xml) {
6682
6764
  function parseNumbering(xml) {
6683
6765
  const doc = parseXml2(xml);
6684
6766
  const abstractNums = /* @__PURE__ */ new Map();
6685
- const abstractElements = findElements(doc, "abstractNum");
6767
+ const idx = buildElementIndex(doc);
6768
+ const abstractElements = idx.get("abstractNum") ?? [];
6686
6769
  for (const el of abstractElements) {
6687
6770
  const abstractNumId = getAttr(el, "abstractNumId");
6688
6771
  if (!abstractNumId) continue;
@@ -6697,7 +6780,7 @@ function parseNumbering(xml) {
6697
6780
  abstractNums.set(abstractNumId, levels);
6698
6781
  }
6699
6782
  const nums = /* @__PURE__ */ new Map();
6700
- const numElements = findElements(doc, "num");
6783
+ const numElements = idx.get("num") ?? [];
6701
6784
  for (const el of numElements) {
6702
6785
  const numId = getAttr(el, "numId");
6703
6786
  if (!numId) continue;
@@ -6941,9 +7024,9 @@ async function extractImages(zip, rels, doc) {
6941
7024
  }
6942
7025
  return { blocks, images };
6943
7026
  }
6944
- async function parseDocxDocument(buffer, options) {
7027
+ async function parseDocxDocument(buffer, options, existingZip) {
6945
7028
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
6946
- const zip = await JSZip4.loadAsync(buffer);
7029
+ const zip = existingZip ?? await JSZip4.loadAsync(buffer);
6947
7030
  const warnings = [];
6948
7031
  const docFile = zip.file("word/document.xml");
6949
7032
  if (!docFile) {
@@ -9378,25 +9461,86 @@ async function parse2(input, options) {
9378
9461
  if (!buffer || buffer.byteLength === 0) {
9379
9462
  return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
9380
9463
  }
9464
+ const MAX_FILE_SIZE = 500 * 1024 * 1024;
9465
+ if (buffer.byteLength > MAX_FILE_SIZE) {
9466
+ return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
9467
+ }
9381
9468
  const format = detectFormat(buffer);
9382
9469
  switch (format) {
9383
9470
  case "hwpx": {
9384
- const zipFormat = await detectZipFormat(buffer);
9385
- if (zipFormat === "xlsx") return parseXlsx(buffer, options);
9386
- if (zipFormat === "docx") return parseDocx(buffer, options);
9387
- return parseHwpx(buffer, options);
9471
+ const { format: zipFormat, zip } = await detectZipFormat(buffer);
9472
+ if (zipFormat === "xlsx") return parseXlsx(buffer, options, zip ?? void 0);
9473
+ if (zipFormat === "docx") return parseDocx(buffer, options, zip ?? void 0);
9474
+ return parseHwpx(buffer, options, zip ?? void 0);
9388
9475
  }
9389
9476
  case "hwp":
9390
9477
  return parseHwp(buffer, options);
9391
9478
  case "pdf":
9392
9479
  return parsePdf(buffer, options);
9480
+ case "image":
9481
+ return parseImage(buffer, options);
9393
9482
  default:
9394
9483
  return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
9395
9484
  }
9396
9485
  }
9397
- async function parseHwpx(buffer, options) {
9486
/**
 * Run OCR over a standalone image buffer and wrap the recognized text in a
 * parse result object. Failures are reported through `success: false` rather
 * than by throwing.
 *
 * Provider selection by `options.ocrMode` (defaults to "auto"):
 *   - "off": no OCR is attempted; an error result is returned.
 *   - "gemini" | "claude" | "codex" | "ollama": `createCliOcrProvider(mode)`.
 *   - "tesseract": `createTesseractProvider()`.
 *   - "auto": the CLI modes are tried in the order above, then Tesseract.
 *   - anything else: no provider is created and an error result is returned.
 *
 * If the provider exposes `terminate`, it is called exactly once, after the
 * result (success or failure) has been decided.
 *
 * @param {ArrayBuffer} buffer Raw image bytes.
 * @param {{ ocrMode?: string }} [options] Parse options; only `ocrMode` is read here.
 * @returns {Promise<object>} Result with `success`, `fileType: "image"` and either
 *   `markdown`/`blocks`/`warnings` or `error`/`code`.
 */
async function parseImage(buffer, options) {
  const ocrMode = options?.ocrMode || "auto";
  if (ocrMode === "off") {
    // NOTE(review): the code says "IMAGE_BASED_PDF" although the input is an image;
    // kept as-is because callers may already match on it.
    return { success: false, fileType: "image", error: "OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC774\uBBF8\uC9C0 \uD30C\uC77C\uC744 \uCC98\uB9AC\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "IMAGE_BASED_PDF" };
  }
  let ocrProvider;
  let actualOcrMode = "auto";
  try {
    if (ocrMode === "gemini" || ocrMode === "claude" || ocrMode === "codex" || ocrMode === "ollama") {
      ocrProvider = createCliOcrProvider(ocrMode);
      actualOcrMode = ocrMode;
    } else if (ocrMode === "tesseract") {
      ocrProvider = await createTesseractProvider();
      actualOcrMode = ocrMode;
    } else if (ocrMode === "auto") {
      const modesToTry = ["gemini", "claude", "codex", "ollama"];
      for (const mode of modesToTry) {
        try {
          ocrProvider = createCliOcrProvider(mode);
          actualOcrMode = mode;
          break;
        } catch (e) {
          console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
        }
      }
      if (!ocrProvider) {
        // No CLI provider could be created; fall back to Tesseract.
        ocrProvider = await createTesseractProvider();
        actualOcrMode = "tesseract";
      }
    }
    if (!ocrProvider) {
      return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
    }
    const imageUint8Array = new Uint8Array(buffer);
    // NOTE(review): the third argument is always "image/png" regardless of the
    // actual image format; confirm whether providers rely on it.
    const ocrResult = await ocrProvider(imageUint8Array, 1, "image/png");
    // Providers may return either a plain string or an object with `markdown`.
    const markdown = typeof ocrResult === "string" ? ocrResult : ocrResult.markdown;
    const blocks = markdownToBlocks(markdown, 1);
    return {
      success: true,
      fileType: "image",
      markdown,
      blocks,
      isImageBased: true,
      warnings: [{ message: `OCR \uCC98\uB9AC\uB428 (${actualOcrMode})`, code: "OCR_FALLBACK" }]
    };
  } catch (err) {
    return { success: false, fileType: "image", error: err instanceof Error ? err.message : "\uC774\uBBF8\uC9C0 OCR \uC2E4\uD328", code: classifyError(err) };
  } finally {
    // Single cleanup point: runs once for success and failure alike, and a
    // failing terminate() must not replace the result decided above.
    if (ocrProvider?.terminate) {
      try {
        await ocrProvider.terminate();
      } catch (terminateErr) {
        console.warn("[kordoc] OCR provider cleanup failed.", terminateErr);
      }
    }
  }
}
9541
+ async function parseHwpx(buffer, options, zip) {
9398
9542
  try {
9399
- const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
9543
+ const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
9400
9544
  return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
9401
9545
  } catch (err) {
9402
9546
  return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
@@ -9419,17 +9563,17 @@ async function parsePdf(buffer, options) {
9419
9563
  return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
9420
9564
  }
9421
9565
  }
9422
- async function parseXlsx(buffer, options) {
9566
/**
 * Parse an XLSX workbook, converting any thrown error into a failure result.
 *
 * All three arguments are forwarded unchanged to `parseXlsxDocument`.
 *
 * @param {ArrayBuffer} buffer Raw workbook bytes.
 * @param {object} [options] Parse options.
 * @param {object} [zip] Optional archive object passed through by the caller.
 * @returns {Promise<object>} `{ success: true, fileType: "xlsx", markdown, blocks,
 *   metadata, warnings }` on success, otherwise `{ success: false, fileType: "xlsx",
 *   error, code }`.
 */
async function parseXlsx(buffer, options, zip) {
  const fileType = "xlsx";
  try {
    const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
    return { success: true, fileType, markdown, blocks, metadata, warnings };
  } catch (failure) {
    // Non-Error throwables carry no usable message, so fall back to the fixed text.
    let error;
    if (failure instanceof Error) {
      error = failure.message;
    } else {
      error = "XLSX \uD30C\uC2F1 \uC2E4\uD328";
    }
    const code = classifyError(failure);
    return { success: false, fileType, error, code };
  }
}
9430
- async function parseDocx(buffer, options) {
9574
+ async function parseDocx(buffer, options, zip) {
9431
9575
  try {
9432
- const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
9576
+ const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
9433
9577
  return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
9434
9578
  } catch (err) {
9435
9579
  return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
@@ -9624,4 +9768,4 @@ export {
9624
9768
  cfb/cfb.js:
9625
9769
  (*! crc32.js (C) 2014-present SheetJS -- http://sheetjs.com *)
9626
9770
  */
9627
- //# sourceMappingURL=chunk-2ZGLFZCN.js.map
9771
+ //# sourceMappingURL=chunk-2GFJFTKS.js.map