@clazic/kordoc 2.4.9 → 2.4.11
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{batch-provider-VX7CY6UH.js → batch-provider-5BFJRKAZ.js} +15 -4
- package/dist/{batch-provider-VX7CY6UH.js.map → batch-provider-5BFJRKAZ.js.map} +1 -1
- package/dist/{chunk-YC2MEB7R.js → chunk-34WIGIQC.js} +16 -5
- package/dist/chunk-34WIGIQC.js.map +1 -0
- package/dist/{chunk-MPMKWVV2.js → chunk-JGMLDBW5.js} +4 -4
- package/dist/{chunk-SHM3PYVA.js → chunk-PJSXZBZB.js} +2 -2
- package/dist/{chunk-SHM3PYVA.js.map → chunk-PJSXZBZB.js.map} +1 -1
- package/dist/cli.js +6 -6
- package/dist/index.cjs +30 -8
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +30 -8
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +3 -3
- package/dist/{resolve-BGOGWG6E.js → resolve-4I65IGMM.js} +4 -4
- package/dist/{utils-IUWGWQWV.js → utils-HKVOS2O3.js} +2 -2
- package/dist/{watch-CRENPZU5.js → watch-EYOGF3HY.js} +4 -4
- package/package.json +1 -1
- package/dist/chunk-YC2MEB7R.js.map +0 -1
- /package/dist/{chunk-MPMKWVV2.js.map → chunk-JGMLDBW5.js.map} +0 -0
- /package/dist/{resolve-BGOGWG6E.js.map → resolve-4I65IGMM.js.map} +0 -0
- /package/dist/{utils-IUWGWQWV.js.map → utils-HKVOS2O3.js.map} +0 -0
- /package/dist/{watch-CRENPZU5.js.map → watch-EYOGF3HY.js.map} +0 -0
|
@@ -135,7 +135,9 @@ ${fileRefs}`;
|
|
|
135
135
|
const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
|
|
136
136
|
throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
|
|
137
137
|
}
|
|
138
|
-
|
|
138
|
+
const output = result.stdout || "";
|
|
139
|
+
checkForLimitError(output, mode);
|
|
140
|
+
return output;
|
|
139
141
|
}
|
|
140
142
|
async function callBatchCodexCli(imagePaths) {
|
|
141
143
|
const outPath = join(tmpdir(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`);
|
|
@@ -156,11 +158,14 @@ async function callBatchCodexCli(imagePaths) {
|
|
|
156
158
|
const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
|
|
157
159
|
throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
|
|
158
160
|
}
|
|
161
|
+
let text;
|
|
159
162
|
try {
|
|
160
|
-
|
|
163
|
+
text = readFileSync(outPath, "utf-8");
|
|
161
164
|
} catch {
|
|
162
|
-
|
|
165
|
+
text = result.stdout || "";
|
|
163
166
|
}
|
|
167
|
+
checkForLimitError(text, "codex");
|
|
168
|
+
return text;
|
|
164
169
|
} finally {
|
|
165
170
|
try {
|
|
166
171
|
unlinkSync(outPath);
|
|
@@ -168,6 +173,12 @@ async function callBatchCodexCli(imagePaths) {
|
|
|
168
173
|
}
|
|
169
174
|
}
|
|
170
175
|
}
|
|
176
|
+
function checkForLimitError(output, mode) {
|
|
177
|
+
const lower = output.toLowerCase();
|
|
178
|
+
if (lower.includes("usage limit") || lower.includes("rate limit")) {
|
|
179
|
+
throw new Error(`${mode} \uC0AC\uC6A9\uB7C9/\uC18D\uB3C4 \uC81C\uD55C: ${output.trim().slice(0, 200)}`);
|
|
180
|
+
}
|
|
181
|
+
}
|
|
171
182
|
function stripCodeFence(text) {
|
|
172
183
|
const match = text.match(/^```(?:markdown|md)?\s*\n([\s\S]*?)\n```\s*$/m);
|
|
173
184
|
return match ? match[1].trim() : text;
|
|
@@ -176,4 +187,4 @@ export {
|
|
|
176
187
|
DEFAULT_BATCH_SIZES,
|
|
177
188
|
createBatchCliProvider
|
|
178
189
|
};
|
|
179
|
-
//# sourceMappingURL=batch-provider-VX7CY6UH.js.map
|
|
190
|
+
//# sourceMappingURL=batch-provider-5BFJRKAZ.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/ocr/batch-provider.ts"],"sourcesContent":["/**\n * CLI 배치 OCR 프로바이더\n *\n * 여러 페이지 이미지를 단일 CLI 호출로 처리하여 API 호출 수를 대폭 감소.\n * gemini/claude: @file 멀티 참조, codex: --image 멀티 플래그\n *\n * 299페이지 기준:\n * - 기존: CLI 299회 호출 (~30분)\n * - 배치: CLI 3~6회 호출 (~3분)\n */\n\nimport { spawn, execSync } from \"child_process\"\nimport { writeFileSync, readFileSync, unlinkSync, mkdirSync } from \"fs\"\nimport { join } from \"path\"\nimport { tmpdir } from \"os\"\nimport type { StructuredOcrResult, BatchOcrProvider } from \"../types.js\"\n\n/** 배치 OCR 프롬프트 */\nconst BATCH_OCR_PROMPT =\n \"다음 문서 페이지 이미지들을 OCR하여 순수 Markdown으로 변환하세요.\\n\\n\" +\n \"규칙:\\n\" +\n \"- 각 페이지 결과 사이에 반드시 이 구분자를 삽입: <!-- PAGE_BREAK -->\\n\" +\n \"- 테이블은 Markdown 테이블 문법 사용 (| 구분, |---|---| 헤더 구분선 포함)\\n\" +\n \"- 병합된 셀은 해당 위치에 내용 기재\\n\" +\n \"- 헤딩은 글자 크기에 따라 ## ~ ###### 사용\\n\" +\n \"- 리스트는 - 또는 1. 사용\\n\" +\n \"- 이미지, 도형 등 비텍스트 요소는 무시\\n\" +\n \"- 원문의 읽기 순서와 구조를 유지\\n\" +\n \"- ```로 감싸지 말고 순수 Markdown만 출력\"\n\n/** 모드별 기본 배치 크기 (CLI 내부 타임아웃 + 실측 기반)\n *\n * gemini CLI: 10장 이상에서 AbortError 발생 (내부 타임아웃).\n * 5장 배치가 안정적으로 동작 확인 (35초/배치).\n * 299페이지 = 60배치 = 기존 299회 대비 80% 감소.\n */\nexport const DEFAULT_BATCH_SIZES: Record<string, number> = {\n gemini: 5,\n claude: 5,\n codex: 10,\n}\n\n/**\n * 임시 디렉토리 — gemini CLI는 cwd 하위 + gitignore 밖만 @참조 가능\n *\n * 숨김 처리:\n * - macOS/Linux: '.' 접두사로 기본 숨김 (ls -a 로만 표시)\n * - Windows: '.' 
접두사 + attrib +h 로 숨김 속성 부여\n */\nlet _batchTempDir: string | null = null\nfunction getBatchTempDir(): string {\n if (!_batchTempDir) {\n _batchTempDir = join(process.cwd(), \".kordoc_ocr_tmp\")\n mkdirSync(_batchTempDir, { recursive: true })\n // Windows: dot-prefix만으로 숨김 처리 불충분 → attrib +h 추가\n if (process.platform === \"win32\") {\n try { execSync(`attrib +h \"${_batchTempDir}\"`, { stdio: \"ignore\" }) } catch { /* ignore */ }\n }\n }\n return _batchTempDir\n}\n\n/**\n * 배치 CLI 프로바이더 생성\n */\nexport function createBatchCliProvider(\n mode: \"gemini\" | \"claude\" | \"codex\",\n batchSize: number\n): BatchOcrProvider {\n return {\n __batch: true as const,\n batchSize,\n async processBatch(pages) {\n const results = new Map<number, StructuredOcrResult>()\n const tempDir = getBatchTempDir()\n const tempFiles: string[] = []\n\n try {\n // 1. Write all page images to temp files\n for (const { image, pageNum } of pages) {\n const path = join(tempDir, `batch-p${pageNum}.png`)\n writeFileSync(path, image)\n tempFiles.push(path)\n }\n\n // 2. Call CLI with all file references (비동기 — 병렬 배치 실행 가능)\n let output: string\n if (mode === \"codex\") {\n output = await callBatchCodexCli(tempFiles)\n } else {\n output = await callBatchCli(mode, tempFiles)\n }\n\n // 3. Parse response by PAGE_BREAK separator\n const cleaned = stripCodeFence(output.trim())\n const parts = cleaned.split(/<!--\\s*PAGE_BREAK\\s*-->/)\n .map(p => p.trim())\n .filter(p => p.length > 0)\n\n // 4. Map results to page numbers (best-effort if count mismatch)\n for (let i = 0; i < pages.length; i++) {\n const pageNum = pages[i].pageNum\n if (i < parts.length) {\n results.set(pageNum, { markdown: parts[i] })\n }\n // If fewer parts than pages, remaining pages get no result\n }\n } finally {\n // 5. 
Clean up temp files\n for (const f of tempFiles) {\n try { unlinkSync(f) } catch { /* ignore */ }\n }\n }\n\n return results\n },\n }\n}\n\n/**\n * 비동기 CLI 실행 헬퍼 — spawn + Promise 래핑.\n * spawnSync는 이벤트 루프를 차단하여 병렬 배치 실행 불가.\n */\nfunction spawnAsync(\n cmd: string,\n args: string[],\n opts: { timeoutMs: number; cwd?: string; stdin?: string }\n): Promise<{ stdout: string; stderr: string; exitCode: number }> {\n return new Promise((resolve, reject) => {\n const child = spawn(cmd, args, {\n cwd: opts.cwd,\n env: process.env,\n stdio: [\"pipe\", \"pipe\", \"pipe\"],\n shell: process.platform === \"win32\",\n })\n\n let stdout = \"\"\n let stderr = \"\"\n let killed = false\n\n child.stdout.setEncoding(\"utf-8\")\n child.stderr.setEncoding(\"utf-8\")\n child.stdout.on(\"data\", (d: string) => { stdout += d })\n child.stderr.on(\"data\", (d: string) => { stderr += d })\n\n const timer = setTimeout(() => {\n killed = true\n if (process.platform === \"win32\") {\n child.kill()\n } else {\n child.kill(\"SIGTERM\")\n }\n }, opts.timeoutMs)\n\n if (opts.stdin !== undefined) {\n child.stdin.end(opts.stdin)\n } else {\n child.stdin.end()\n }\n\n child.on(\"close\", (code) => {\n clearTimeout(timer)\n if (killed) {\n reject(new Error(`타임아웃 (${Math.round(opts.timeoutMs / 1000)}초)`))\n } else {\n resolve({ stdout, stderr, exitCode: code ?? 1 })\n }\n })\n child.on(\"error\", (err) => {\n clearTimeout(timer)\n reject(err)\n })\n })\n}\n\n/** gemini/claude 배치 호출 (비동기) */\nasync function callBatchCli(mode: \"gemini\" | \"claude\", imagePaths: string[]): Promise<string> {\n const fileRefs = imagePaths.map(p => `@${p.replace(/\\\\/g, \"/\")}`).join(\"\\n\")\n const prompt = `${BATCH_OCR_PROMPT}\\n\\n${fileRefs}`\n\n let args: string[]\n if (mode === \"gemini\") {\n const model = process.env.KORDOC_GEMINI_MODEL ?? 
\"gemini-2.5-flash\"\n args = [\"--prompt\", prompt, \"--yolo\", \"--model\", model]\n } else {\n args = [\"--print\", prompt]\n const model = process.env.KORDOC_CLAUDE_MODEL\n if (model) args.push(\"--model\", model)\n }\n\n const timeoutMs = 60_000 + imagePaths.length * 20_000\n const result = await spawnAsync(mode, args, {\n timeoutMs,\n ...(mode === \"claude\" ? { cwd: tmpdir() } : {}),\n })\n\n if (result.exitCode !== 0) {\n const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`\n throw new Error(`${mode} 배치 OCR 실패: ${errMsg}`)\n }\n\n return result.stdout || \"\"\n}\n\n/** codex 배치 호출 (비동기) — --image를 여러 번 지정 */\nasync function callBatchCodexCli(imagePaths: string[]): Promise<string> {\n const outPath = join(tmpdir(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`)\n try {\n const args = [\"exec\", BATCH_OCR_PROMPT]\n for (const p of imagePaths) {\n args.push(\"--image\", p)\n }\n args.push(\"--output-last-message\", outPath)\n const model = process.env.KORDOC_CODEX_MODEL\n if (model) args.push(\"--model\", model)\n\n const timeoutMs = 60_000 + imagePaths.length * 20_000\n const result = await spawnAsync(\"codex\", args, {\n timeoutMs,\n stdin: \"\",\n })\n\n if (result.exitCode !== 0) {\n const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`\n throw new Error(`codex 배치 OCR 실패: ${errMsg}`)\n }\n\n try {\n return readFileSync(outPath, \"utf-8\")\n } catch {\n return result.stdout || \"\"\n }\n } finally {\n try { unlinkSync(outPath) } catch { /* ignore */ }\n }\n}\n\n/** LLM 출력에서 코드 펜스 제거 (cli-provider.ts와 동일 로직) */\nfunction stripCodeFence(text: string): string {\n const match = text.match(/^```(?:markdown|md)?\\s*\\n([\\s\\S]*?)\\n```\\s*$/m)\n return match ? 
match[1].trim() : text\n}\n"],"mappings":";;;;AAWA,SAAS,OAAO,gBAAgB;AAChC,SAAS,eAAe,cAAc,YAAY,iBAAiB;AACnE,SAAS,YAAY;AACrB,SAAS,cAAc;AAIvB,IAAM,mBACJ;AAiBK,IAAM,sBAA8C;AAAA,EACzD,QAAQ;AAAA,EACR,QAAQ;AAAA,EACR,OAAO;AACT;AASA,IAAI,gBAA+B;AACnC,SAAS,kBAA0B;AACjC,MAAI,CAAC,eAAe;AAClB,oBAAgB,KAAK,QAAQ,IAAI,GAAG,iBAAiB;AACrD,cAAU,eAAe,EAAE,WAAW,KAAK,CAAC;AAE5C,QAAI,QAAQ,aAAa,SAAS;AAChC,UAAI;AAAE,iBAAS,cAAc,aAAa,KAAK,EAAE,OAAO,SAAS,CAAC;AAAA,MAAE,QAAQ;AAAA,MAAe;AAAA,IAC7F;AAAA,EACF;AACA,SAAO;AACT;AAKO,SAAS,uBACd,MACA,WACkB;AAClB,SAAO;AAAA,IACL,SAAS;AAAA,IACT;AAAA,IACA,MAAM,aAAa,OAAO;AACxB,YAAM,UAAU,oBAAI,IAAiC;AACrD,YAAM,UAAU,gBAAgB;AAChC,YAAM,YAAsB,CAAC;AAE7B,UAAI;AAEF,mBAAW,EAAE,OAAO,QAAQ,KAAK,OAAO;AACtC,gBAAM,OAAO,KAAK,SAAS,UAAU,OAAO,MAAM;AAClD,wBAAc,MAAM,KAAK;AACzB,oBAAU,KAAK,IAAI;AAAA,QACrB;AAGA,YAAI;AACJ,YAAI,SAAS,SAAS;AACpB,mBAAS,MAAM,kBAAkB,SAAS;AAAA,QAC5C,OAAO;AACL,mBAAS,MAAM,aAAa,MAAM,SAAS;AAAA,QAC7C;AAGA,cAAM,UAAU,eAAe,OAAO,KAAK,CAAC;AAC5C,cAAM,QAAQ,QAAQ,MAAM,yBAAyB,EAClD,IAAI,OAAK,EAAE,KAAK,CAAC,EACjB,OAAO,OAAK,EAAE,SAAS,CAAC;AAG3B,iBAAS,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;AACrC,gBAAM,UAAU,MAAM,CAAC,EAAE;AACzB,cAAI,IAAI,MAAM,QAAQ;AACpB,oBAAQ,IAAI,SAAS,EAAE,UAAU,MAAM,CAAC,EAAE,CAAC;AAAA,UAC7C;AAAA,QAEF;AAAA,MACF,UAAE;AAEA,mBAAW,KAAK,WAAW;AACzB,cAAI;AAAE,uBAAW,CAAC;AAAA,UAAE,QAAQ;AAAA,UAAe;AAAA,QAC7C;AAAA,MACF;AAEA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAMA,SAAS,WACP,KACA,MACA,MAC+D;AAC/D,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,QAAQ,MAAM,KAAK,MAAM;AAAA,MAC7B,KAAK,KAAK;AAAA,MACV,KAAK,QAAQ;AAAA,MACb,OAAO,CAAC,QAAQ,QAAQ,MAAM;AAAA,MAC9B,OAAO,QAAQ,aAAa;AAAA,IAC9B,CAAC;AAED,QAAI,SAAS;AACb,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,UAAM,OAAO,YAAY,OAAO;AAChC,UAAM,OAAO,YAAY,OAAO;AAChC,UAAM,OAAO,GAAG,QAAQ,CAAC,MAAc;AAAE,gBAAU;AAAA,IAAE,CAAC;AACtD,UAAM,OAAO,GAAG,QAAQ,CAAC,MAAc;AAAE,gBAAU;AAAA,IAAE,CAAC;AAEtD,UAAM,QAAQ,WAAW,MAAM;AAC7B,eAAS;AACT,UAAI,QAAQ,aAAa,SAAS;AAChC,cAAM,KAAK;AAAA,MACb,OAAO;AACL,cAAM,KAAK,SAAS;AAAA,MACtB;AAAA,IACF,GAAG,KAAK,SAAS;AAEjB,QAAI,KAAK,UAAU,QAAW;AAC5B,YAAM,MAAM,IAAI,KAAK
,KAAK;AAAA,IAC5B,OAAO;AACL,YAAM,MAAM,IAAI;AAAA,IAClB;AAEA,UAAM,GAAG,SAAS,CAAC,SAAS;AAC1B,mBAAa,KAAK;AAClB,UAAI,QAAQ;AACV,eAAO,IAAI,MAAM,6BAAS,KAAK,MAAM,KAAK,YAAY,GAAI,CAAC,SAAI,CAAC;AAAA,MAClE,OAAO;AACL,gBAAQ,EAAE,QAAQ,QAAQ,UAAU,QAAQ,EAAE,CAAC;AAAA,MACjD;AAAA,IACF,CAAC;AACD,UAAM,GAAG,SAAS,CAAC,QAAQ;AACzB,mBAAa,KAAK;AAClB,aAAO,GAAG;AAAA,IACZ,CAAC;AAAA,EACH,CAAC;AACH;AAGA,eAAe,aAAa,MAA2B,YAAuC;AAC5F,QAAM,WAAW,WAAW,IAAI,OAAK,IAAI,EAAE,QAAQ,OAAO,GAAG,CAAC,EAAE,EAAE,KAAK,IAAI;AAC3E,QAAM,SAAS,GAAG,gBAAgB;AAAA;AAAA,EAAO,QAAQ;AAEjD,MAAI;AACJ,MAAI,SAAS,UAAU;AACrB,UAAM,QAAQ,QAAQ,IAAI,uBAAuB;AACjD,WAAO,CAAC,YAAY,QAAQ,UAAU,WAAW,KAAK;AAAA,EACxD,OAAO;AACL,WAAO,CAAC,WAAW,MAAM;AACzB,UAAM,QAAQ,QAAQ,IAAI;AAC1B,QAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AAAA,EACvC;AAEA,QAAM,YAAY,MAAS,WAAW,SAAS;AAC/C,QAAM,SAAS,MAAM,WAAW,MAAM,MAAM;AAAA,IAC1C;AAAA,IACA,GAAI,SAAS,WAAW,EAAE,KAAK,OAAO,EAAE,IAAI,CAAC;AAAA,EAC/C,CAAC;AAED,MAAI,OAAO,aAAa,GAAG;AACzB,UAAM,SAAS,OAAO,QAAQ,KAAK,KAAK,aAAa,OAAO,QAAQ;AACpE,UAAM,IAAI,MAAM,GAAG,IAAI,mCAAe,MAAM,EAAE;AAAA,EAChD;AAEA,SAAO,OAAO,UAAU;AAC1B;AAGA,eAAe,kBAAkB,YAAuC;AACtE,QAAM,UAAU,KAAK,OAAO,GAAG,sBAAsB,KAAK,IAAI,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,CAAC,CAAC,MAAM;AAC5G,MAAI;AACF,UAAM,OAAO,CAAC,QAAQ,gBAAgB;AACtC,eAAW,KAAK,YAAY;AAC1B,WAAK,KAAK,WAAW,CAAC;AAAA,IACxB;AACA,SAAK,KAAK,yBAAyB,OAAO;AAC1C,UAAM,QAAQ,QAAQ,IAAI;AAC1B,QAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AAErC,UAAM,YAAY,MAAS,WAAW,SAAS;AAC/C,UAAM,SAAS,MAAM,WAAW,SAAS,MAAM;AAAA,MAC7C;AAAA,MACA,OAAO;AAAA,IACT,CAAC;AAED,QAAI,OAAO,aAAa,GAAG;AACzB,YAAM,SAAS,OAAO,QAAQ,KAAK,KAAK,aAAa,OAAO,QAAQ;AACpE,YAAM,IAAI,MAAM,wCAAoB,MAAM,EAAE;AAAA,IAC9C;AAEA,QAAI;AACF,aAAO,aAAa,SAAS,OAAO;AAAA,IACtC,QAAQ;AACN,aAAO,OAAO,UAAU;AAAA,IAC1B;AAAA,EACF,UAAE;AACA,QAAI;AAAE,iBAAW,OAAO;AAAA,IAAE,QAAQ;AAAA,IAAe;AAAA,EACnD;AACF;AAGA,SAAS,eAAe,MAAsB;AAC5C,QAAM,QAAQ,KAAK,MAAM,+CAA+C;AACxE,SAAO,QAAQ,MAAM,CAAC,EAAE,KAAK,IAAI;AACnC;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/ocr/batch-provider.ts"],"sourcesContent":["/**\n * CLI 배치 OCR 프로바이더\n *\n * 여러 페이지 이미지를 단일 CLI 호출로 처리하여 API 호출 수를 대폭 감소.\n * gemini/claude: @file 멀티 참조, codex: --image 멀티 플래그\n *\n * 299페이지 기준:\n * - 기존: CLI 299회 호출 (~30분)\n * - 배치: CLI 3~6회 호출 (~3분)\n */\n\nimport { spawn, execSync } from \"child_process\"\nimport { writeFileSync, readFileSync, unlinkSync, mkdirSync } from \"fs\"\nimport { join } from \"path\"\nimport { tmpdir } from \"os\"\nimport type { StructuredOcrResult, BatchOcrProvider } from \"../types.js\"\n\n/** 배치 OCR 프롬프트 */\nconst BATCH_OCR_PROMPT =\n \"다음 문서 페이지 이미지들을 OCR하여 순수 Markdown으로 변환하세요.\\n\\n\" +\n \"규칙:\\n\" +\n \"- 각 페이지 결과 사이에 반드시 이 구분자를 삽입: <!-- PAGE_BREAK -->\\n\" +\n \"- 테이블은 Markdown 테이블 문법 사용 (| 구분, |---|---| 헤더 구분선 포함)\\n\" +\n \"- 병합된 셀은 해당 위치에 내용 기재\\n\" +\n \"- 헤딩은 글자 크기에 따라 ## ~ ###### 사용\\n\" +\n \"- 리스트는 - 또는 1. 사용\\n\" +\n \"- 이미지, 도형 등 비텍스트 요소는 무시\\n\" +\n \"- 원문의 읽기 순서와 구조를 유지\\n\" +\n \"- ```로 감싸지 말고 순수 Markdown만 출력\"\n\n/** 모드별 기본 배치 크기 (CLI 내부 타임아웃 + 실측 기반)\n *\n * gemini CLI: 10장 이상에서 AbortError 발생 (내부 타임아웃).\n * 5장 배치가 안정적으로 동작 확인 (35초/배치).\n * 299페이지 = 60배치 = 기존 299회 대비 80% 감소.\n */\nexport const DEFAULT_BATCH_SIZES: Record<string, number> = {\n gemini: 5,\n claude: 5,\n codex: 10,\n}\n\n/**\n * 임시 디렉토리 — gemini CLI는 cwd 하위 + gitignore 밖만 @참조 가능\n *\n * 숨김 처리:\n * - macOS/Linux: '.' 접두사로 기본 숨김 (ls -a 로만 표시)\n * - Windows: '.' 
접두사 + attrib +h 로 숨김 속성 부여\n */\nlet _batchTempDir: string | null = null\nfunction getBatchTempDir(): string {\n if (!_batchTempDir) {\n _batchTempDir = join(process.cwd(), \".kordoc_ocr_tmp\")\n mkdirSync(_batchTempDir, { recursive: true })\n // Windows: dot-prefix만으로 숨김 처리 불충분 → attrib +h 추가\n if (process.platform === \"win32\") {\n try { execSync(`attrib +h \"${_batchTempDir}\"`, { stdio: \"ignore\" }) } catch { /* ignore */ }\n }\n }\n return _batchTempDir\n}\n\n/**\n * 배치 CLI 프로바이더 생성\n */\nexport function createBatchCliProvider(\n mode: \"gemini\" | \"claude\" | \"codex\",\n batchSize: number\n): BatchOcrProvider {\n return {\n __batch: true as const,\n batchSize,\n async processBatch(pages) {\n const results = new Map<number, StructuredOcrResult>()\n const tempDir = getBatchTempDir()\n const tempFiles: string[] = []\n\n try {\n // 1. Write all page images to temp files\n for (const { image, pageNum } of pages) {\n const path = join(tempDir, `batch-p${pageNum}.png`)\n writeFileSync(path, image)\n tempFiles.push(path)\n }\n\n // 2. Call CLI with all file references (비동기 — 병렬 배치 실행 가능)\n let output: string\n if (mode === \"codex\") {\n output = await callBatchCodexCli(tempFiles)\n } else {\n output = await callBatchCli(mode, tempFiles)\n }\n\n // 3. Parse response by PAGE_BREAK separator\n const cleaned = stripCodeFence(output.trim())\n const parts = cleaned.split(/<!--\\s*PAGE_BREAK\\s*-->/)\n .map(p => p.trim())\n .filter(p => p.length > 0)\n\n // 4. Map results to page numbers (best-effort if count mismatch)\n for (let i = 0; i < pages.length; i++) {\n const pageNum = pages[i].pageNum\n if (i < parts.length) {\n results.set(pageNum, { markdown: parts[i] })\n }\n // If fewer parts than pages, remaining pages get no result\n }\n } finally {\n // 5. 
Clean up temp files\n for (const f of tempFiles) {\n try { unlinkSync(f) } catch { /* ignore */ }\n }\n }\n\n return results\n },\n }\n}\n\n/**\n * 비동기 CLI 실행 헬퍼 — spawn + Promise 래핑.\n * spawnSync는 이벤트 루프를 차단하여 병렬 배치 실행 불가.\n */\nfunction spawnAsync(\n cmd: string,\n args: string[],\n opts: { timeoutMs: number; cwd?: string; stdin?: string }\n): Promise<{ stdout: string; stderr: string; exitCode: number }> {\n return new Promise((resolve, reject) => {\n const child = spawn(cmd, args, {\n cwd: opts.cwd,\n env: process.env,\n stdio: [\"pipe\", \"pipe\", \"pipe\"],\n shell: process.platform === \"win32\",\n })\n\n let stdout = \"\"\n let stderr = \"\"\n let killed = false\n\n child.stdout.setEncoding(\"utf-8\")\n child.stderr.setEncoding(\"utf-8\")\n child.stdout.on(\"data\", (d: string) => { stdout += d })\n child.stderr.on(\"data\", (d: string) => { stderr += d })\n\n const timer = setTimeout(() => {\n killed = true\n if (process.platform === \"win32\") {\n child.kill()\n } else {\n child.kill(\"SIGTERM\")\n }\n }, opts.timeoutMs)\n\n if (opts.stdin !== undefined) {\n child.stdin.end(opts.stdin)\n } else {\n child.stdin.end()\n }\n\n child.on(\"close\", (code) => {\n clearTimeout(timer)\n if (killed) {\n reject(new Error(`타임아웃 (${Math.round(opts.timeoutMs / 1000)}초)`))\n } else {\n resolve({ stdout, stderr, exitCode: code ?? 1 })\n }\n })\n child.on(\"error\", (err) => {\n clearTimeout(timer)\n reject(err)\n })\n })\n}\n\n/** gemini/claude 배치 호출 (비동기) */\nasync function callBatchCli(mode: \"gemini\" | \"claude\", imagePaths: string[]): Promise<string> {\n const fileRefs = imagePaths.map(p => `@${p.replace(/\\\\/g, \"/\")}`).join(\"\\n\")\n const prompt = `${BATCH_OCR_PROMPT}\\n\\n${fileRefs}`\n\n let args: string[]\n if (mode === \"gemini\") {\n const model = process.env.KORDOC_GEMINI_MODEL ?? 
\"gemini-2.5-flash\"\n args = [\"--prompt\", prompt, \"--yolo\", \"--model\", model]\n } else {\n args = [\"--print\", prompt]\n const model = process.env.KORDOC_CLAUDE_MODEL\n if (model) args.push(\"--model\", model)\n }\n\n const timeoutMs = 60_000 + imagePaths.length * 20_000\n const result = await spawnAsync(mode, args, {\n timeoutMs,\n ...(mode === \"claude\" ? { cwd: tmpdir() } : {}),\n })\n\n if (result.exitCode !== 0) {\n const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`\n throw new Error(`${mode} 배치 OCR 실패: ${errMsg}`)\n }\n\n const output = result.stdout || \"\"\n checkForLimitError(output, mode)\n return output\n}\n\n/** codex 배치 호출 (비동기) — --image를 여러 번 지정 */\nasync function callBatchCodexCli(imagePaths: string[]): Promise<string> {\n const outPath = join(tmpdir(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`)\n try {\n const args = [\"exec\", BATCH_OCR_PROMPT]\n for (const p of imagePaths) {\n args.push(\"--image\", p)\n }\n args.push(\"--output-last-message\", outPath)\n const model = process.env.KORDOC_CODEX_MODEL\n if (model) args.push(\"--model\", model)\n\n const timeoutMs = 60_000 + imagePaths.length * 20_000\n const result = await spawnAsync(\"codex\", args, {\n timeoutMs,\n stdin: \"\",\n })\n\n if (result.exitCode !== 0) {\n const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`\n throw new Error(`codex 배치 OCR 실패: ${errMsg}`)\n }\n\n let text: string\n try {\n text = readFileSync(outPath, \"utf-8\")\n } catch {\n text = result.stdout || \"\"\n }\n checkForLimitError(text, \"codex\")\n return text\n } finally {\n try { unlinkSync(outPath) } catch { /* ignore */ }\n }\n}\n\n/**\n * 출력 텍스트에서 사용량·속도 제한 에러 감지.\n * 해당 메시지가 포함된 경우 throw하여 다음 엔진으로 fallback 트리거.\n */\nfunction checkForLimitError(output: string, mode: string): void {\n const lower = output.toLowerCase()\n if (lower.includes(\"usage limit\") || lower.includes(\"rate limit\")) {\n throw new Error(`${mode} 사용량/속도 제한: 
${output.trim().slice(0, 200)}`)\n }\n}\n\n/** LLM 출력에서 코드 펜스 제거 (cli-provider.ts와 동일 로직) */\nfunction stripCodeFence(text: string): string {\n const match = text.match(/^```(?:markdown|md)?\\s*\\n([\\s\\S]*?)\\n```\\s*$/m)\n return match ? match[1].trim() : text\n}\n"],"mappings":";;;;AAWA,SAAS,OAAO,gBAAgB;AAChC,SAAS,eAAe,cAAc,YAAY,iBAAiB;AACnE,SAAS,YAAY;AACrB,SAAS,cAAc;AAIvB,IAAM,mBACJ;AAiBK,IAAM,sBAA8C;AAAA,EACzD,QAAQ;AAAA,EACR,QAAQ;AAAA,EACR,OAAO;AACT;AASA,IAAI,gBAA+B;AACnC,SAAS,kBAA0B;AACjC,MAAI,CAAC,eAAe;AAClB,oBAAgB,KAAK,QAAQ,IAAI,GAAG,iBAAiB;AACrD,cAAU,eAAe,EAAE,WAAW,KAAK,CAAC;AAE5C,QAAI,QAAQ,aAAa,SAAS;AAChC,UAAI;AAAE,iBAAS,cAAc,aAAa,KAAK,EAAE,OAAO,SAAS,CAAC;AAAA,MAAE,QAAQ;AAAA,MAAe;AAAA,IAC7F;AAAA,EACF;AACA,SAAO;AACT;AAKO,SAAS,uBACd,MACA,WACkB;AAClB,SAAO;AAAA,IACL,SAAS;AAAA,IACT;AAAA,IACA,MAAM,aAAa,OAAO;AACxB,YAAM,UAAU,oBAAI,IAAiC;AACrD,YAAM,UAAU,gBAAgB;AAChC,YAAM,YAAsB,CAAC;AAE7B,UAAI;AAEF,mBAAW,EAAE,OAAO,QAAQ,KAAK,OAAO;AACtC,gBAAM,OAAO,KAAK,SAAS,UAAU,OAAO,MAAM;AAClD,wBAAc,MAAM,KAAK;AACzB,oBAAU,KAAK,IAAI;AAAA,QACrB;AAGA,YAAI;AACJ,YAAI,SAAS,SAAS;AACpB,mBAAS,MAAM,kBAAkB,SAAS;AAAA,QAC5C,OAAO;AACL,mBAAS,MAAM,aAAa,MAAM,SAAS;AAAA,QAC7C;AAGA,cAAM,UAAU,eAAe,OAAO,KAAK,CAAC;AAC5C,cAAM,QAAQ,QAAQ,MAAM,yBAAyB,EAClD,IAAI,OAAK,EAAE,KAAK,CAAC,EACjB,OAAO,OAAK,EAAE,SAAS,CAAC;AAG3B,iBAAS,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;AACrC,gBAAM,UAAU,MAAM,CAAC,EAAE;AACzB,cAAI,IAAI,MAAM,QAAQ;AACpB,oBAAQ,IAAI,SAAS,EAAE,UAAU,MAAM,CAAC,EAAE,CAAC;AAAA,UAC7C;AAAA,QAEF;AAAA,MACF,UAAE;AAEA,mBAAW,KAAK,WAAW;AACzB,cAAI;AAAE,uBAAW,CAAC;AAAA,UAAE,QAAQ;AAAA,UAAe;AAAA,QAC7C;AAAA,MACF;AAEA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAMA,SAAS,WACP,KACA,MACA,MAC+D;AAC/D,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,QAAQ,MAAM,KAAK,MAAM;AAAA,MAC7B,KAAK,KAAK;AAAA,MACV,KAAK,QAAQ;AAAA,MACb,OAAO,CAAC,QAAQ,QAAQ,MAAM;AAAA,MAC9B,OAAO,QAAQ,aAAa;AAAA,IAC9B,CAAC;AAED,QAAI,SAAS;AACb,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,UAAM,OAAO,YAAY,OAAO;AAChC,UAAM,OAAO,YAAY,OAAO;AAChC,UAAM,OAAO,GAAG,QAAQ,CAAC,MAAc;AAAE,gBAAU;AAAA,IAAE,CAAC;AACtD,UAAM,OAAO,GAAG,
QAAQ,CAAC,MAAc;AAAE,gBAAU;AAAA,IAAE,CAAC;AAEtD,UAAM,QAAQ,WAAW,MAAM;AAC7B,eAAS;AACT,UAAI,QAAQ,aAAa,SAAS;AAChC,cAAM,KAAK;AAAA,MACb,OAAO;AACL,cAAM,KAAK,SAAS;AAAA,MACtB;AAAA,IACF,GAAG,KAAK,SAAS;AAEjB,QAAI,KAAK,UAAU,QAAW;AAC5B,YAAM,MAAM,IAAI,KAAK,KAAK;AAAA,IAC5B,OAAO;AACL,YAAM,MAAM,IAAI;AAAA,IAClB;AAEA,UAAM,GAAG,SAAS,CAAC,SAAS;AAC1B,mBAAa,KAAK;AAClB,UAAI,QAAQ;AACV,eAAO,IAAI,MAAM,6BAAS,KAAK,MAAM,KAAK,YAAY,GAAI,CAAC,SAAI,CAAC;AAAA,MAClE,OAAO;AACL,gBAAQ,EAAE,QAAQ,QAAQ,UAAU,QAAQ,EAAE,CAAC;AAAA,MACjD;AAAA,IACF,CAAC;AACD,UAAM,GAAG,SAAS,CAAC,QAAQ;AACzB,mBAAa,KAAK;AAClB,aAAO,GAAG;AAAA,IACZ,CAAC;AAAA,EACH,CAAC;AACH;AAGA,eAAe,aAAa,MAA2B,YAAuC;AAC5F,QAAM,WAAW,WAAW,IAAI,OAAK,IAAI,EAAE,QAAQ,OAAO,GAAG,CAAC,EAAE,EAAE,KAAK,IAAI;AAC3E,QAAM,SAAS,GAAG,gBAAgB;AAAA;AAAA,EAAO,QAAQ;AAEjD,MAAI;AACJ,MAAI,SAAS,UAAU;AACrB,UAAM,QAAQ,QAAQ,IAAI,uBAAuB;AACjD,WAAO,CAAC,YAAY,QAAQ,UAAU,WAAW,KAAK;AAAA,EACxD,OAAO;AACL,WAAO,CAAC,WAAW,MAAM;AACzB,UAAM,QAAQ,QAAQ,IAAI;AAC1B,QAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AAAA,EACvC;AAEA,QAAM,YAAY,MAAS,WAAW,SAAS;AAC/C,QAAM,SAAS,MAAM,WAAW,MAAM,MAAM;AAAA,IAC1C;AAAA,IACA,GAAI,SAAS,WAAW,EAAE,KAAK,OAAO,EAAE,IAAI,CAAC;AAAA,EAC/C,CAAC;AAED,MAAI,OAAO,aAAa,GAAG;AACzB,UAAM,SAAS,OAAO,QAAQ,KAAK,KAAK,aAAa,OAAO,QAAQ;AACpE,UAAM,IAAI,MAAM,GAAG,IAAI,mCAAe,MAAM,EAAE;AAAA,EAChD;AAEA,QAAM,SAAS,OAAO,UAAU;AAChC,qBAAmB,QAAQ,IAAI;AAC/B,SAAO;AACT;AAGA,eAAe,kBAAkB,YAAuC;AACtE,QAAM,UAAU,KAAK,OAAO,GAAG,sBAAsB,KAAK,IAAI,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,CAAC,CAAC,MAAM;AAC5G,MAAI;AACF,UAAM,OAAO,CAAC,QAAQ,gBAAgB;AACtC,eAAW,KAAK,YAAY;AAC1B,WAAK,KAAK,WAAW,CAAC;AAAA,IACxB;AACA,SAAK,KAAK,yBAAyB,OAAO;AAC1C,UAAM,QAAQ,QAAQ,IAAI;AAC1B,QAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AAErC,UAAM,YAAY,MAAS,WAAW,SAAS;AAC/C,UAAM,SAAS,MAAM,WAAW,SAAS,MAAM;AAAA,MAC7C;AAAA,MACA,OAAO;AAAA,IACT,CAAC;AAED,QAAI,OAAO,aAAa,GAAG;AACzB,YAAM,SAAS,OAAO,QAAQ,KAAK,KAAK,aAAa,OAAO,QAAQ;AACpE,YAAM,IAAI,MAAM,wCAAoB,MAAM,EAAE;AAAA,IAC9C;AAEA,QAAI;AACJ,QAAI;AACF,aAAO,aAAa,SAAS,OAAO;AAAA,IACtC,QAAQ;AACN,aAAO,OAAO,UAAU;AAAA,IAC1B;AACA,uBAAmB,MAAM,OAAO;AA
ChC,WAAO;AAAA,EACT,UAAE;AACA,QAAI;AAAE,iBAAW,OAAO;AAAA,IAAE,QAAQ;AAAA,IAAe;AAAA,EACnD;AACF;AAMA,SAAS,mBAAmB,QAAgB,MAAoB;AAC9D,QAAM,QAAQ,OAAO,YAAY;AACjC,MAAI,MAAM,SAAS,aAAa,KAAK,MAAM,SAAS,YAAY,GAAG;AACjE,UAAM,IAAI,MAAM,GAAG,IAAI,kDAAe,OAAO,KAAK,EAAE,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,EACrE;AACF;AAGA,SAAS,eAAe,MAAsB;AAC5C,QAAM,QAAQ,KAAK,MAAM,+CAA+C;AACxE,SAAO,QAAQ,MAAM,CAAC,EAAE,KAAK,IAAI;AACnC;","names":[]}
|
|
@@ -17,7 +17,7 @@ var OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \
|
|
|
17
17
|
var _tempDir = null;
|
|
18
18
|
function getTempDir() {
|
|
19
19
|
if (!_tempDir) {
|
|
20
|
-
_tempDir = join(process.cwd(), "
|
|
20
|
+
_tempDir = join(process.cwd(), ".kordoc_ocr_tmp");
|
|
21
21
|
mkdirSync(_tempDir, { recursive: true });
|
|
22
22
|
}
|
|
23
23
|
return _tempDir;
|
|
@@ -42,6 +42,12 @@ function createCliOcrProvider(mode) {
|
|
|
42
42
|
}
|
|
43
43
|
};
|
|
44
44
|
}
|
|
45
|
+
function checkForLimitError(output, mode) {
|
|
46
|
+
const lower = output.toLowerCase();
|
|
47
|
+
if (lower.includes("usage limit") || lower.includes("rate limit")) {
|
|
48
|
+
throw new Error(`${mode} \uC0AC\uC6A9\uB7C9/\uC18D\uB3C4 \uC81C\uD55C: ${output.trim().slice(0, 200)}`);
|
|
49
|
+
}
|
|
50
|
+
}
|
|
45
51
|
function callCli(mode, imagePath) {
|
|
46
52
|
if (mode === "codex") {
|
|
47
53
|
return callCodexCli(imagePath);
|
|
@@ -62,7 +68,9 @@ function callCli(mode, imagePath) {
|
|
|
62
68
|
const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
|
|
63
69
|
throw new Error(`${mode} OCR \uC2E4\uD328: ${errMsg}`);
|
|
64
70
|
}
|
|
65
|
-
|
|
71
|
+
const output = result.stdout || "";
|
|
72
|
+
checkForLimitError(output, mode);
|
|
73
|
+
return output;
|
|
66
74
|
}
|
|
67
75
|
function callCodexCli(imagePath) {
|
|
68
76
|
const outPath = join(tmpdir(), `kordoc-codex-out-${Date.now()}.txt`);
|
|
@@ -85,11 +93,14 @@ function callCodexCli(imagePath) {
|
|
|
85
93
|
const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
|
|
86
94
|
throw new Error(`codex OCR \uC2E4\uD328: ${errMsg}`);
|
|
87
95
|
}
|
|
96
|
+
let text;
|
|
88
97
|
try {
|
|
89
|
-
|
|
98
|
+
text = readFileSync(outPath, "utf-8");
|
|
90
99
|
} catch {
|
|
91
|
-
|
|
100
|
+
text = result.stdout || "";
|
|
92
101
|
}
|
|
102
|
+
checkForLimitError(text, "codex");
|
|
103
|
+
return text;
|
|
93
104
|
} finally {
|
|
94
105
|
try {
|
|
95
106
|
unlinkSync(outPath);
|
|
@@ -153,4 +164,4 @@ function stripCodeFence(text) {
|
|
|
153
164
|
export {
|
|
154
165
|
createCliOcrProvider
|
|
155
166
|
};
|
|
156
|
-
//# sourceMappingURL=chunk-YC2MEB7R.js.map
|
|
167
|
+
//# sourceMappingURL=chunk-34WIGIQC.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/ocr/cli-provider.ts"],"sourcesContent":["/**\n * CLI 기반 OCR 프로바이더\n *\n * gemini / claude / codex / ollama CLI를 subprocess로 호출하여\n * PDF 페이지 이미지를 Markdown으로 변환.\n *\n * 이미지 전달 방식:\n * - gemini: -p \"프롬프트 @이미지경로\" (@ 파일 참조)\n * - claude: -p \"프롬프트 @이미지경로\" (@ 파일 참조, --print 모드)\n * - codex: exec -i 이미지경로 \"프롬프트\" (-i/--image 플래그)\n * - ollama: REST API (localhost:11434) — CLI는 이미지 입력 미지원\n */\n\nimport { spawnSync } from \"child_process\"\nimport { writeFileSync, readFileSync, unlinkSync, mkdirSync } from \"fs\"\nimport { join } from \"path\"\nimport { tmpdir } from \"os\"\nimport type { OcrMode, StructuredOcrResult } from \"../types.js\"\n\n/** OCR 프롬프트 — 모든 CLI 공통 */\nconst OCR_PROMPT = `이 PDF 페이지 이미지에서 텍스트와 테이블을 추출하여 순수 Markdown으로 변환하세요.\n규칙:\n- 테이블은 Markdown 테이블 문법 사용 (| 구분, |---|---| 헤더 구분선 포함)\n- 병합된 셀은 해당 위치에 내용 기재\n- 헤딩은 글자 크기에 따라 ## ~ ###### 사용\n- 리스트는 - 또는 1. 사용\n- 이미지, 도형 등 비텍스트 요소는 무시\n- 원문의 읽기 순서와 구조를 유지\n- \\`\\`\\`로 감싸지 말고 순수 Markdown만 출력`\n\n/** 임시 디렉토리 (프로세스당 1회 생성)\n *\n * gemini CLI는 /tmp/ 등 시스템 임시 디렉토리를 워크스페이스 외부로 간주하여\n * @파일참조 시 접근을 거부할 수 있음. cwd 하위 폴더를 사용하면 모든 CLI에서 접근 가능.\n *\n * ⚠️ .gitignore에 포함된 경로(예: .kordoc-tmp/)는 gemini CLI가 무시하므로\n * 반드시 gitignore되지 않는 이름 사용. 
파일은 try/finally로 즉시 정리.\n */\nlet _tempDir: string | null = null\nfunction getTempDir(): string {\n if (!_tempDir) {\n _tempDir = join(process.cwd(), \".kordoc_ocr_tmp\")\n mkdirSync(_tempDir, { recursive: true })\n }\n return _tempDir\n}\n\n/**\n * CLI OcrProvider 생성.\n *\n * @param mode - 사용할 CLI (gemini, claude, codex, ollama)\n * @returns OcrProvider 함수 (StructuredOcrResult 반환)\n */\nexport function createCliOcrProvider(\n mode: Exclude<OcrMode, \"auto\" | \"off\" | \"tesseract\">\n): (pageImage: Uint8Array, pageNumber: number, mimeType: \"image/png\") => Promise<StructuredOcrResult> {\n return async (pageImage: Uint8Array, pageNumber: number): Promise<StructuredOcrResult> => {\n const tempPath = join(getTempDir(), `page-${pageNumber}.png`)\n\n try {\n writeFileSync(tempPath, pageImage)\n\n let output: string\n if (mode === \"ollama\") {\n output = await callOllamaApi(tempPath)\n } else {\n output = callCli(mode, tempPath)\n }\n\n return { markdown: stripCodeFence(output.trim()) }\n } finally {\n try { unlinkSync(tempPath) } catch { /* 임시 파일 정리 실패 무시 */ }\n }\n }\n}\n\n/**\n * 출력 텍스트에서 사용량·속도 제한 에러 감지.\n * 해당 메시지가 포함된 경우 throw하여 다음 엔진으로 fallback 트리거.\n */\nfunction checkForLimitError(output: string, mode: string): void {\n const lower = output.toLowerCase()\n if (lower.includes(\"usage limit\") || lower.includes(\"rate limit\")) {\n throw new Error(`${mode} 사용량/속도 제한: ${output.trim().slice(0, 200)}`)\n }\n}\n\n/**\n * CLI 실행 — gemini / claude / codex\n *\n * @throws CLI 실행 실패 또는 타임아웃(180초) 시 Error\n */\nfunction callCli(mode: string, imagePath: string): string {\n // codex는 --output-last-message로 대화 헤더 없는 깔끔한 출력 사용\n if (mode === \"codex\") {\n return callCodexCli(imagePath)\n }\n\n const args = buildCliArgs(mode, imagePath)\n\n const result = spawnSync(mode, args, {\n encoding: \"utf-8\",\n timeout: 600_000,\n maxBuffer: 10 * 1024 * 1024,\n shell: process.platform === \"win32\",\n // claude: /tmp에서 실행하여 프로젝트 CLAUDE.md의 규칙 간섭 방지\n ...(mode === \"claude\" ? 
{ cwd: tmpdir() } : {}),\n })\n\n if (result.error) {\n throw new Error(`${mode} CLI 실행 실패: ${result.error.message}`)\n }\n if (result.status !== 0) {\n const errMsg = result.stderr?.trim() || `exit code ${result.status}`\n throw new Error(`${mode} OCR 실패: ${errMsg}`)\n }\n\n const output = result.stdout || \"\"\n checkForLimitError(output, mode)\n return output\n}\n\n/**\n * codex exec 실행 — --output-last-message로 대화 헤더 없는 깔끔한 출력.\n * 인자 순서: `codex exec <prompt> --image <file> --output-last-message <outfile>`\n */\nfunction callCodexCli(imagePath: string): string {\n // 출력 파일은 /tmp/ 사용 — codex sandbox는 cwd 내 쓰기를 막을 수 있음\n const outPath = join(tmpdir(), `kordoc-codex-out-${Date.now()}.txt`)\n try {\n const args = [\"exec\", OCR_PROMPT, \"--image\", imagePath, \"--output-last-message\", outPath]\n const model = process.env.KORDOC_CODEX_MODEL\n if (model) args.push(\"--model\", model)\n\n const result = spawnSync(\"codex\", args, {\n encoding: \"utf-8\",\n timeout: 180_000,\n maxBuffer: 10 * 1024 * 1024,\n input: \"\", // stdin EOF 즉시 전달 (대화형 입력 차단)\n shell: process.platform === \"win32\",\n })\n\n if (result.error) {\n throw new Error(`codex CLI 실행 실패: ${result.error.message}`)\n }\n if (result.status !== 0) {\n const errMsg = result.stderr?.trim() || `exit code ${result.status}`\n throw new Error(`codex OCR 실패: ${errMsg}`)\n }\n\n // --output-last-message 파일에서 읽기 (없으면 stdout 폴백)\n let text: string\n try {\n text = readFileSync(outPath, \"utf-8\")\n } catch {\n text = result.stdout || \"\"\n }\n checkForLimitError(text, \"codex\")\n return text\n } finally {\n try { unlinkSync(outPath) } catch { /* 무시 */ }\n }\n}\n\n/**\n * CLI별 인자 배열 생성.\n *\n * gemini: [\"--prompt\", \"프롬프트 @이미지경로\", \"--yolo\"]\n * - -y/--yolo: 자동 승인 (OCR은 도구 사용 없으므로 실질적 영향 없음)\n * - @ 파일 참조로 이미지를 컨텍스트에 포함\n *\n * claude: [\"--print\", \"프롬프트 @이미지경로\"]\n * - --print(-p): 비대화형 출력 모드\n * - @ 파일 참조로 이미지를 컨텍스트에 포함\n *\n * codex: callCodexCli()에서 별도 처리\n * - `codex exec <prompt> --image <file> 
--output-last-message <outfile>`\n * - 프롬프트가 --image보다 앞에 위치해야 함 (인자 순서 중요)\n *\n * ⚠️ CLI 버전에 따라 문법이 다를 수 있음. 업데이트 시 --help 재확인 필요.\n */\nfunction buildCliArgs(mode: string, imagePath: string): string[] {\n const normalizedPath = imagePath.replace(/\\\\/g, \"/\")\n const promptWithImage = `${OCR_PROMPT}\n\n이미지: @${normalizedPath}`\n\n switch (mode) {\n case \"gemini\": {\n const args = [\"--prompt\", promptWithImage, \"--yolo\"]\n const model = process.env.KORDOC_GEMINI_MODEL\n if (model) args.push(\"--model\", model)\n return args\n }\n\n case \"claude\": {\n const args = [\"--print\", promptWithImage]\n const model = process.env.KORDOC_CLAUDE_MODEL\n if (model) args.push(\"--model\", model)\n return args\n }\n\n default:\n throw new Error(`지원하지 않는 CLI: ${mode}`)\n }\n}\n\n/**\n * Ollama REST API 호출 — CLI는 이미지 입력을 지원하지 않으므로 API 직접 사용.\n *\n * 기본 모델: KORDOC_OLLAMA_MODEL 환경변수 또는 \"gemma4:27b\"\n * 기본 호스트: KORDOC_OLLAMA_HOST 환경변수 또는 \"http://localhost:11434\"\n *\n * @throws Ollama 서버 미실행 또는 응답 오류 시 Error\n */\nasync function callOllamaApi(imagePath: string): Promise<string> {\n const { readFileSync } = await import(\"fs\")\n const imageBase64 = readFileSync(imagePath).toString(\"base64\")\n\n const model = process.env.KORDOC_OLLAMA_MODEL || \"qwen3-vl:8b\"\n const host = process.env.KORDOC_OLLAMA_HOST || \"http://localhost:11434\"\n const timeoutMs = Number(process.env.KORDOC_OLLAMA_TIMEOUT) || 120_000\n\n const response = await fetch(`${host}/api/chat`, {\n method: \"POST\",\n headers: { \"Content-Type\": \"application/json\" },\n body: JSON.stringify({\n model,\n messages: [{\n role: \"user\",\n content: OCR_PROMPT,\n images: [imageBase64],\n }],\n stream: false,\n }),\n signal: AbortSignal.timeout(timeoutMs),\n })\n\n if (!response.ok) {\n throw new Error(`Ollama API 오류: ${response.status} ${response.statusText}`)\n }\n\n const data = await response.json() as { message?: { content?: string } }\n return data.message?.content || \"\"\n}\n\n/**\n * LLM 출력에서 코드 펜스 
제거.\n * LLM이 가끔 결과를 ```markdown ... ``` 으로 감싸는 경우 처리.\n */\nfunction stripCodeFence(text: string): string {\n const match = text.match(/^```(?:markdown|md)?\\s*([\\s\\S]*?)```\\s*$/m)\n return match ? match[1].trim() : text\n}\n"],"mappings":";;;AAaA,SAAS,iBAAiB;AAC1B,SAAS,eAAe,cAAc,YAAY,iBAAiB;AACnE,SAAS,YAAY;AACrB,SAAS,cAAc;AAIvB,IAAM,aAAa;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAAA;AAkBnB,IAAI,WAA0B;AAC9B,SAAS,aAAqB;AAC5B,MAAI,CAAC,UAAU;AACb,eAAW,KAAK,QAAQ,IAAI,GAAG,iBAAiB;AAChD,cAAU,UAAU,EAAE,WAAW,KAAK,CAAC;AAAA,EACzC;AACA,SAAO;AACT;AAQO,SAAS,qBACd,MACoG;AACpG,SAAO,OAAO,WAAuB,eAAqD;AACxF,UAAM,WAAW,KAAK,WAAW,GAAG,QAAQ,UAAU,MAAM;AAE5D,QAAI;AACF,oBAAc,UAAU,SAAS;AAEjC,UAAI;AACJ,UAAI,SAAS,UAAU;AACrB,iBAAS,MAAM,cAAc,QAAQ;AAAA,MACvC,OAAO;AACL,iBAAS,QAAQ,MAAM,QAAQ;AAAA,MACjC;AAEA,aAAO,EAAE,UAAU,eAAe,OAAO,KAAK,CAAC,EAAE;AAAA,IACnD,UAAE;AACA,UAAI;AAAE,mBAAW,QAAQ;AAAA,MAAE,QAAQ;AAAA,MAAuB;AAAA,IAC5D;AAAA,EACF;AACF;AAMA,SAAS,mBAAmB,QAAgB,MAAoB;AAC9D,QAAM,QAAQ,OAAO,YAAY;AACjC,MAAI,MAAM,SAAS,aAAa,KAAK,MAAM,SAAS,YAAY,GAAG;AACjE,UAAM,IAAI,MAAM,GAAG,IAAI,kDAAe,OAAO,KAAK,EAAE,MAAM,GAAG,GAAG,CAAC,EAAE;AAAA,EACrE;AACF;AAOA,SAAS,QAAQ,MAAc,WAA2B;AAExD,MAAI,SAAS,SAAS;AACpB,WAAO,aAAa,SAAS;AAAA,EAC/B;AAEA,QAAM,OAAO,aAAa,MAAM,SAAS;AAEzC,QAAM,SAAS,UAAU,MAAM,MAAM;AAAA,IACnC,UAAU;AAAA,IACV,SAAS;AAAA,IACT,WAAW,KAAK,OAAO;AAAA,IACvB,OAAO,QAAQ,aAAa;AAAA;AAAA,IAE5B,GAAI,SAAS,WAAW,EAAE,KAAK,OAAO,EAAE,IAAI,CAAC;AAAA,EAC/C,CAAC;AAED,MAAI,OAAO,OAAO;AAChB,UAAM,IAAI,MAAM,GAAG,IAAI,mCAAe,OAAO,MAAM,OAAO,EAAE;AAAA,EAC9D;AACA,MAAI,OAAO,WAAW,GAAG;AACvB,UAAM,SAAS,OAAO,QAAQ,KAAK,KAAK,aAAa,OAAO,MAAM;AAClE,UAAM,IAAI,MAAM,GAAG,IAAI,sBAAY,MAAM,EAAE;AAAA,EAC7C;AAEA,QAAM,SAAS,OAAO,UAAU;AAChC,qBAAmB,QAAQ,IAAI;AAC/B,SAAO;AACT;AAMA,SAAS,aAAa,WAA2B;AAE/C,QAAM,UAAU,KAAK,OAAO,GAAG,oBAAoB,KAAK,IAAI,CAAC,MAAM;AACnE,MAAI;AACF,UAAM,OAAO,CAAC,QAAQ,YAAY,WAAW,WAAW,yBAAyB,OAAO;AACxF,UAAM,QAAQ,QAAQ,IAAI;AAC1B,QAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AAErC,UAAM,SAAS,UAAU,SAAS,MAAM;AAAA,MACtC,UAAU;AAAA,MACV,SAAS;AAAA,MACT,WAAW,KAAK,OAAO;AAAA,MAC
vB,OAAO;AAAA;AAAA,MACP,OAAO,QAAQ,aAAa;AAAA,IAC9B,CAAC;AAED,QAAI,OAAO,OAAO;AAChB,YAAM,IAAI,MAAM,wCAAoB,OAAO,MAAM,OAAO,EAAE;AAAA,IAC5D;AACA,QAAI,OAAO,WAAW,GAAG;AACvB,YAAM,SAAS,OAAO,QAAQ,KAAK,KAAK,aAAa,OAAO,MAAM;AAClE,YAAM,IAAI,MAAM,2BAAiB,MAAM,EAAE;AAAA,IAC3C;AAGA,QAAI;AACJ,QAAI;AACF,aAAO,aAAa,SAAS,OAAO;AAAA,IACtC,QAAQ;AACN,aAAO,OAAO,UAAU;AAAA,IAC1B;AACA,uBAAmB,MAAM,OAAO;AAChC,WAAO;AAAA,EACT,UAAE;AACA,QAAI;AAAE,iBAAW,OAAO;AAAA,IAAE,QAAQ;AAAA,IAAW;AAAA,EAC/C;AACF;AAmBA,SAAS,aAAa,MAAc,WAA6B;AAC/D,QAAM,iBAAiB,UAAU,QAAQ,OAAO,GAAG;AACnD,QAAM,kBAAkB,GAAG,UAAU;AAAA;AAAA,uBAE/B,cAAc;AAEpB,UAAQ,MAAM;AAAA,IACZ,KAAK,UAAU;AACb,YAAM,OAAO,CAAC,YAAY,iBAAiB,QAAQ;AACnD,YAAM,QAAQ,QAAQ,IAAI;AAC1B,UAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AACrC,aAAO;AAAA,IACT;AAAA,IAEA,KAAK,UAAU;AACb,YAAM,OAAO,CAAC,WAAW,eAAe;AACxC,YAAM,QAAQ,QAAQ,IAAI;AAC1B,UAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AACrC,aAAO;AAAA,IACT;AAAA,IAEA;AACE,YAAM,IAAI,MAAM,8CAAgB,IAAI,EAAE;AAAA,EAC1C;AACF;AAUA,eAAe,cAAc,WAAoC;AAC/D,QAAM,EAAE,cAAAA,cAAa,IAAI,MAAM,OAAO,IAAI;AAC1C,QAAM,cAAcA,cAAa,SAAS,EAAE,SAAS,QAAQ;AAE7D,QAAM,QAAQ,QAAQ,IAAI,uBAAuB;AACjD,QAAM,OAAO,QAAQ,IAAI,sBAAsB;AAC/C,QAAM,YAAY,OAAO,QAAQ,IAAI,qBAAqB,KAAK;AAE/D,QAAM,WAAW,MAAM,MAAM,GAAG,IAAI,aAAa;AAAA,IAC/C,QAAQ;AAAA,IACR,SAAS,EAAE,gBAAgB,mBAAmB;AAAA,IAC9C,MAAM,KAAK,UAAU;AAAA,MACnB;AAAA,MACA,UAAU,CAAC;AAAA,QACT,MAAM;AAAA,QACN,SAAS;AAAA,QACT,QAAQ,CAAC,WAAW;AAAA,MACtB,CAAC;AAAA,MACD,QAAQ;AAAA,IACV,CAAC;AAAA,IACD,QAAQ,YAAY,QAAQ,SAAS;AAAA,EACvC,CAAC;AAED,MAAI,CAAC,SAAS,IAAI;AAChB,UAAM,IAAI,MAAM,4BAAkB,SAAS,MAAM,IAAI,SAAS,UAAU,EAAE;AAAA,EAC5E;AAEA,QAAM,OAAO,MAAM,SAAS,KAAK;AACjC,SAAO,KAAK,SAAS,WAAW;AAClC;AAMA,SAAS,eAAe,MAAsB;AAC5C,QAAM,QAAQ,KAAK,MAAM,2CAA2C;AACpE,SAAO,QAAQ,MAAM,CAAC,EAAE,KAAK,IAAI;AACnC;","names":["readFileSync"]}
|
|
@@ -9,7 +9,7 @@ import {
|
|
|
9
9
|
precheckZipSize,
|
|
10
10
|
sanitizeHref,
|
|
11
11
|
toArrayBuffer
|
|
12
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-PJSXZBZB.js";
|
|
13
13
|
import {
|
|
14
14
|
parsePageRange
|
|
15
15
|
} from "./chunk-MOL7MDBG.js";
|
|
@@ -18,7 +18,7 @@ import {
|
|
|
18
18
|
} from "./chunk-7FMKAV4P.js";
|
|
19
19
|
import {
|
|
20
20
|
createCliOcrProvider
|
|
21
|
-
} from "./chunk-
|
|
21
|
+
} from "./chunk-34WIGIQC.js";
|
|
22
22
|
import {
|
|
23
23
|
__commonJS,
|
|
24
24
|
__require,
|
|
@@ -5462,7 +5462,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5462
5462
|
if (ocrMode === "off") {
|
|
5463
5463
|
throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
|
|
5464
5464
|
}
|
|
5465
|
-
const { resolveOcrProvider } = await import("./resolve-
|
|
5465
|
+
const { resolveOcrProvider } = await import("./resolve-4I65IGMM.js");
|
|
5466
5466
|
const { ocrPages } = await import("./provider-PYZL2VNN.js");
|
|
5467
5467
|
const tryProvider = async (provider, filter) => {
|
|
5468
5468
|
try {
|
|
@@ -9813,4 +9813,4 @@ export {
|
|
|
9813
9813
|
cfb/cfb.js:
|
|
9814
9814
|
(*! crc32.js (C) 2014-present SheetJS -- http://sheetjs.com *)
|
|
9815
9815
|
*/
|
|
9816
|
-
//# sourceMappingURL=chunk-
|
|
9816
|
+
//# sourceMappingURL=chunk-JGMLDBW5.js.map
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/utils.ts
|
|
4
|
-
var VERSION = true ? "2.4.
|
|
4
|
+
var VERSION = true ? "2.4.11" : "0.0.0-dev";
|
|
5
5
|
function toArrayBuffer(buf) {
|
|
6
6
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
7
7
|
return buf.buffer;
|
|
@@ -90,4 +90,4 @@ export {
|
|
|
90
90
|
sanitizeHref,
|
|
91
91
|
classifyError
|
|
92
92
|
};
|
|
93
|
-
//# sourceMappingURL=chunk-
|
|
93
|
+
//# sourceMappingURL=chunk-PJSXZBZB.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/utils.ts"],"sourcesContent":["/** kordoc 공용 유틸리티 */\n\n/** 빌드 타임에 tsup define으로 주입되는 버전 */\ndeclare const __KORDOC_VERSION__: string\nexport const VERSION: string = typeof __KORDOC_VERSION__ !== \"undefined\" ? __KORDOC_VERSION__ : \"0.0.0-dev\"\n\n/**\n * Node.js Buffer → ArrayBuffer 변환\n * pool Buffer의 공유 ArrayBuffer 문제를 안전하게 처리.\n * offset=0이고 전체 ArrayBuffer를 차지하면 복사 없이 직접 반환.\n */\nexport function toArrayBuffer(buf: Buffer): ArrayBuffer {\n if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {\n return buf.buffer as ArrayBuffer\n }\n return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength) as ArrayBuffer\n}\n\n/**\n * kordoc 내부 에러 클래스 — 사용자에게 노출해도 안전한 메시지만 포함.\n * MCP 에러 정제에서 instanceof로 판별하여 allowlist 패턴 매칭 없이 안전하게 통과.\n */\nexport class KordocError extends Error {\n constructor(message: string) {\n super(message)\n this.name = \"KordocError\"\n }\n}\n\n/**\n * 에러 메시지 정제 — KordocError는 그대로, 나머지는 일반 메시지로 대체.\n * 파일시스템 경로, 스택 트레이스 등 내부 정보 노출 방지.\n */\nexport function sanitizeError(err: unknown): string {\n if (err instanceof KordocError) return err.message\n return \"문서 처리 중 오류가 발생했습니다\"\n}\n\n/**\n * ZIP 엔트리 경로의 경로 순회 여부 판별.\n * 백슬래시 정규화, .., 절대경로, Windows 드라이브 문자 모두 차단.\n */\nexport function isPathTraversal(name: string): boolean {\n if (name.includes(\"\\x00\")) return true\n const normalized = name.replace(/\\\\/g, \"/\")\n return normalized.includes(\"..\") || normalized.startsWith(\"/\") || /^[A-Za-z]:/.test(normalized)\n}\n\n// ─── ZIP 안전 로딩 (ZIP bomb 방지) ────────────────────\n\n/**\n * ZIP bomb 사전 검사 — Central Directory에서 비압축 합계와 엔트리 수 확인.\n * HWPX/XLSX/DOCX 등 모든 ZIP 기반 포맷에서 공통 사용.\n */\nexport function precheckZipSize(\n buffer: ArrayBuffer,\n maxUncompressedSize = 100 * 1024 * 1024,\n maxEntries = 500,\n): { totalUncompressed: number; entryCount: number } {\n try {\n const data = new DataView(buffer)\n const len = buffer.byteLength\n // EOCD 시그니처 역방향 스캔\n let eocdOffset = -1\n for (let i 
= len - 22; i >= Math.max(0, len - 65557); i--) {\n if (data.getUint32(i, true) === 0x06054b50) { eocdOffset = i; break }\n }\n if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 }\n\n const entryCount = data.getUint16(eocdOffset + 10, true)\n if (entryCount > maxEntries) {\n throw new KordocError(`ZIP 엔트리 수 초과: ${entryCount} (최대 ${maxEntries})`)\n }\n\n const cdSize = data.getUint32(eocdOffset + 12, true)\n const cdOffset = data.getUint32(eocdOffset + 16, true)\n if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount }\n\n let totalUncompressed = 0\n let pos = cdOffset\n for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {\n if (data.getUint32(pos, true) !== 0x02014b50) break\n totalUncompressed += data.getUint32(pos + 24, true)\n const nameLen = data.getUint16(pos + 28, true)\n const extraLen = data.getUint16(pos + 30, true)\n const commentLen = data.getUint16(pos + 32, true)\n pos += 46 + nameLen + extraLen + commentLen\n }\n\n if (totalUncompressed > maxUncompressedSize) {\n throw new KordocError(`ZIP 비압축 크기 초과: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (최대 ${maxUncompressedSize / 1024 / 1024}MB)`)\n }\n\n return { totalUncompressed, entryCount }\n } catch (err) {\n if (err instanceof KordocError) throw err\n return { totalUncompressed: 0, entryCount: 0 }\n }\n}\n\n/** 하이퍼링크 URL 살균 — javascript: 등 XSS 위험 스킴 차단 */\nconst SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i\nexport function sanitizeHref(href: string): string | null {\n const trimmed = href.trim()\n if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null\n return trimmed\n}\n\n// ─── 에러 분류 ──────────────────────────────────────\n\nimport type { ErrorCode } from \"./types.js\"\n\n/** 에러를 구조화된 ErrorCode로 분류 — KordocError 메시지 패턴 매칭 */\nexport function classifyError(err: unknown): ErrorCode {\n if (!(err instanceof Error)) return \"PARSE_ERROR\"\n const msg = err.message\n if (msg.includes(\"암호화\")) return \"ENCRYPTED\"\n if (msg.includes(\"DRM\")) 
return \"DRM_PROTECTED\"\n if (msg.includes(\"ZIP bomb\") || msg.includes(\"ZIP 비압축 크기 초과\") || msg.includes(\"ZIP 엔트리 수 초과\")) return \"ZIP_BOMB\"\n if (msg.includes(\"bomb\") || msg.includes(\"크기 초과\") || msg.includes(\"압축 해제\")) return \"DECOMPRESSION_BOMB\"\n if (msg.includes(\"이미지 기반\")) return \"IMAGE_BASED_PDF\"\n if (msg.includes(\"섹션\") && (msg.includes(\"찾을 수 없\") || msg.includes(\"없음\"))) return \"NO_SECTIONS\"\n if (msg.includes(\"시그니처\") || msg.includes(\"복구할 수 없\")) return \"CORRUPTED\"\n return \"PARSE_ERROR\"\n}\n"],"mappings":";;;AAIO,IAAM,UAAkB,OAA4C,
|
|
1
|
+
{"version":3,"sources":["../src/utils.ts"],"sourcesContent":["/** kordoc 공용 유틸리티 */\n\n/** 빌드 타임에 tsup define으로 주입되는 버전 */\ndeclare const __KORDOC_VERSION__: string\nexport const VERSION: string = typeof __KORDOC_VERSION__ !== \"undefined\" ? __KORDOC_VERSION__ : \"0.0.0-dev\"\n\n/**\n * Node.js Buffer → ArrayBuffer 변환\n * pool Buffer의 공유 ArrayBuffer 문제를 안전하게 처리.\n * offset=0이고 전체 ArrayBuffer를 차지하면 복사 없이 직접 반환.\n */\nexport function toArrayBuffer(buf: Buffer): ArrayBuffer {\n if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {\n return buf.buffer as ArrayBuffer\n }\n return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength) as ArrayBuffer\n}\n\n/**\n * kordoc 내부 에러 클래스 — 사용자에게 노출해도 안전한 메시지만 포함.\n * MCP 에러 정제에서 instanceof로 판별하여 allowlist 패턴 매칭 없이 안전하게 통과.\n */\nexport class KordocError extends Error {\n constructor(message: string) {\n super(message)\n this.name = \"KordocError\"\n }\n}\n\n/**\n * 에러 메시지 정제 — KordocError는 그대로, 나머지는 일반 메시지로 대체.\n * 파일시스템 경로, 스택 트레이스 등 내부 정보 노출 방지.\n */\nexport function sanitizeError(err: unknown): string {\n if (err instanceof KordocError) return err.message\n return \"문서 처리 중 오류가 발생했습니다\"\n}\n\n/**\n * ZIP 엔트리 경로의 경로 순회 여부 판별.\n * 백슬래시 정규화, .., 절대경로, Windows 드라이브 문자 모두 차단.\n */\nexport function isPathTraversal(name: string): boolean {\n if (name.includes(\"\\x00\")) return true\n const normalized = name.replace(/\\\\/g, \"/\")\n return normalized.includes(\"..\") || normalized.startsWith(\"/\") || /^[A-Za-z]:/.test(normalized)\n}\n\n// ─── ZIP 안전 로딩 (ZIP bomb 방지) ────────────────────\n\n/**\n * ZIP bomb 사전 검사 — Central Directory에서 비압축 합계와 엔트리 수 확인.\n * HWPX/XLSX/DOCX 등 모든 ZIP 기반 포맷에서 공통 사용.\n */\nexport function precheckZipSize(\n buffer: ArrayBuffer,\n maxUncompressedSize = 100 * 1024 * 1024,\n maxEntries = 500,\n): { totalUncompressed: number; entryCount: number } {\n try {\n const data = new DataView(buffer)\n const len = buffer.byteLength\n // EOCD 시그니처 역방향 스캔\n let eocdOffset = -1\n for (let i 
= len - 22; i >= Math.max(0, len - 65557); i--) {\n if (data.getUint32(i, true) === 0x06054b50) { eocdOffset = i; break }\n }\n if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 }\n\n const entryCount = data.getUint16(eocdOffset + 10, true)\n if (entryCount > maxEntries) {\n throw new KordocError(`ZIP 엔트리 수 초과: ${entryCount} (최대 ${maxEntries})`)\n }\n\n const cdSize = data.getUint32(eocdOffset + 12, true)\n const cdOffset = data.getUint32(eocdOffset + 16, true)\n if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount }\n\n let totalUncompressed = 0\n let pos = cdOffset\n for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {\n if (data.getUint32(pos, true) !== 0x02014b50) break\n totalUncompressed += data.getUint32(pos + 24, true)\n const nameLen = data.getUint16(pos + 28, true)\n const extraLen = data.getUint16(pos + 30, true)\n const commentLen = data.getUint16(pos + 32, true)\n pos += 46 + nameLen + extraLen + commentLen\n }\n\n if (totalUncompressed > maxUncompressedSize) {\n throw new KordocError(`ZIP 비압축 크기 초과: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (최대 ${maxUncompressedSize / 1024 / 1024}MB)`)\n }\n\n return { totalUncompressed, entryCount }\n } catch (err) {\n if (err instanceof KordocError) throw err\n return { totalUncompressed: 0, entryCount: 0 }\n }\n}\n\n/** 하이퍼링크 URL 살균 — javascript: 등 XSS 위험 스킴 차단 */\nconst SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i\nexport function sanitizeHref(href: string): string | null {\n const trimmed = href.trim()\n if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null\n return trimmed\n}\n\n// ─── 에러 분류 ──────────────────────────────────────\n\nimport type { ErrorCode } from \"./types.js\"\n\n/** 에러를 구조화된 ErrorCode로 분류 — KordocError 메시지 패턴 매칭 */\nexport function classifyError(err: unknown): ErrorCode {\n if (!(err instanceof Error)) return \"PARSE_ERROR\"\n const msg = err.message\n if (msg.includes(\"암호화\")) return \"ENCRYPTED\"\n if (msg.includes(\"DRM\")) 
return \"DRM_PROTECTED\"\n if (msg.includes(\"ZIP bomb\") || msg.includes(\"ZIP 비압축 크기 초과\") || msg.includes(\"ZIP 엔트리 수 초과\")) return \"ZIP_BOMB\"\n if (msg.includes(\"bomb\") || msg.includes(\"크기 초과\") || msg.includes(\"압축 해제\")) return \"DECOMPRESSION_BOMB\"\n if (msg.includes(\"이미지 기반\")) return \"IMAGE_BASED_PDF\"\n if (msg.includes(\"섹션\") && (msg.includes(\"찾을 수 없\") || msg.includes(\"없음\"))) return \"NO_SECTIONS\"\n if (msg.includes(\"시그니처\") || msg.includes(\"복구할 수 없\")) return \"CORRUPTED\"\n return \"PARSE_ERROR\"\n}\n"],"mappings":";;;AAIO,IAAM,UAAkB,OAA4C,WAAqB;AAOzF,SAAS,cAAc,KAA0B;AACtD,MAAI,IAAI,eAAe,KAAK,IAAI,eAAe,IAAI,OAAO,YAAY;AACpE,WAAO,IAAI;AAAA,EACb;AACA,SAAO,IAAI,OAAO,MAAM,IAAI,YAAY,IAAI,aAAa,IAAI,UAAU;AACzE;AAMO,IAAM,cAAN,cAA0B,MAAM;AAAA,EACrC,YAAY,SAAiB;AAC3B,UAAM,OAAO;AACb,SAAK,OAAO;AAAA,EACd;AACF;AAMO,SAAS,cAAc,KAAsB;AAClD,MAAI,eAAe,YAAa,QAAO,IAAI;AAC3C,SAAO;AACT;AAMO,SAAS,gBAAgB,MAAuB;AACrD,MAAI,KAAK,SAAS,IAAM,EAAG,QAAO;AAClC,QAAM,aAAa,KAAK,QAAQ,OAAO,GAAG;AAC1C,SAAO,WAAW,SAAS,IAAI,KAAK,WAAW,WAAW,GAAG,KAAK,aAAa,KAAK,UAAU;AAChG;AAQO,SAAS,gBACd,QACA,sBAAsB,MAAM,OAAO,MACnC,aAAa,KACsC;AACnD,MAAI;AACF,UAAM,OAAO,IAAI,SAAS,MAAM;AAChC,UAAM,MAAM,OAAO;AAEnB,QAAI,aAAa;AACjB,aAAS,IAAI,MAAM,IAAI,KAAK,KAAK,IAAI,GAAG,MAAM,KAAK,GAAG,KAAK;AACzD,UAAI,KAAK,UAAU,GAAG,IAAI,MAAM,WAAY;AAAE,qBAAa;AAAG;AAAA,MAAM;AAAA,IACtE;AACA,QAAI,aAAa,EAAG,QAAO,EAAE,mBAAmB,GAAG,YAAY,EAAE;AAEjE,UAAM,aAAa,KAAK,UAAU,aAAa,IAAI,IAAI;AACvD,QAAI,aAAa,YAAY;AAC3B,YAAM,IAAI,YAAY,+CAAiB,UAAU,kBAAQ,UAAU,GAAG;AAAA,IACxE;AAEA,UAAM,SAAS,KAAK,UAAU,aAAa,IAAI,IAAI;AACnD,UAAM,WAAW,KAAK,UAAU,aAAa,IAAI,IAAI;AACrD,QAAI,WAAW,SAAS,IAAK,QAAO,EAAE,mBAAmB,GAAG,WAAW;AAEvE,QAAI,oBAAoB;AACxB,QAAI,MAAM;AACV,aAAS,IAAI,GAAG,IAAI,cAAc,MAAM,MAAM,WAAW,QAAQ,KAAK;AACpE,UAAI,KAAK,UAAU,KAAK,IAAI,MAAM,SAAY;AAC9C,2BAAqB,KAAK,UAAU,MAAM,IAAI,IAAI;AAClD,YAAM,UAAU,KAAK,UAAU,MAAM,IAAI,IAAI;AAC7C,YAAM,WAAW,KAAK,UAAU,MAAM,IAAI,IAAI;AAC9C,YAAM,aAAa,KAAK,UAAU,MAAM,IAAI,IAAI;AAChD,aAAO,KAAK,UAAU,WAAW;AAAA,IACnC;AAEA,QAAI,oBAAoB,
qBAAqB;AAC3C,YAAM,IAAI,YAAY,sDAAmB,oBAAoB,OAAO,MAAM,QAAQ,CAAC,CAAC,oBAAU,sBAAsB,OAAO,IAAI,KAAK;AAAA,IACtI;AAEA,WAAO,EAAE,mBAAmB,WAAW;AAAA,EACzC,SAAS,KAAK;AACZ,QAAI,eAAe,YAAa,OAAM;AACtC,WAAO,EAAE,mBAAmB,GAAG,YAAY,EAAE;AAAA,EAC/C;AACF;AAGA,IAAM,eAAe;AACd,SAAS,aAAa,MAA6B;AACxD,QAAM,UAAU,KAAK,KAAK;AAC1B,MAAI,CAAC,WAAW,CAAC,aAAa,KAAK,OAAO,EAAG,QAAO;AACpD,SAAO;AACT;AAOO,SAAS,cAAc,KAAyB;AACrD,MAAI,EAAE,eAAe,OAAQ,QAAO;AACpC,QAAM,MAAM,IAAI;AAChB,MAAI,IAAI,SAAS,oBAAK,EAAG,QAAO;AAChC,MAAI,IAAI,SAAS,KAAK,EAAG,QAAO;AAChC,MAAI,IAAI,SAAS,UAAU,KAAK,IAAI,SAAS,kDAAe,KAAK,IAAI,SAAS,4CAAc,EAAG,QAAO;AACtG,MAAI,IAAI,SAAS,MAAM,KAAK,IAAI,SAAS,2BAAO,KAAK,IAAI,SAAS,2BAAO,EAAG,QAAO;AACnF,MAAI,IAAI,SAAS,iCAAQ,EAAG,QAAO;AACnC,MAAI,IAAI,SAAS,cAAI,MAAM,IAAI,SAAS,4BAAQ,KAAK,IAAI,SAAS,cAAI,GAAI,QAAO;AACjF,MAAI,IAAI,SAAS,0BAAM,KAAK,IAAI,SAAS,kCAAS,EAAG,QAAO;AAC5D,SAAO;AACT;","names":[]}
|
package/dist/cli.js
CHANGED
|
@@ -4,15 +4,15 @@ import {
|
|
|
4
4
|
markdownToHwpx,
|
|
5
5
|
markdownToXlsx,
|
|
6
6
|
parse
|
|
7
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-JGMLDBW5.js";
|
|
8
8
|
import "./chunk-YW5G6BCJ.js";
|
|
9
9
|
import {
|
|
10
10
|
VERSION,
|
|
11
11
|
toArrayBuffer
|
|
12
|
-
} from "./chunk-
|
|
12
|
+
} from "./chunk-PJSXZBZB.js";
|
|
13
13
|
import "./chunk-MOL7MDBG.js";
|
|
14
14
|
import "./chunk-7FMKAV4P.js";
|
|
15
|
-
import "./chunk-
|
|
15
|
+
import "./chunk-34WIGIQC.js";
|
|
16
16
|
import "./chunk-ZWE3DS7E.js";
|
|
17
17
|
|
|
18
18
|
// src/cli.ts
|
|
@@ -141,7 +141,7 @@ async function runParse(files, opts) {
|
|
|
141
141
|
saveImages(absPath);
|
|
142
142
|
}
|
|
143
143
|
} catch (err) {
|
|
144
|
-
const { sanitizeError } = await import("./utils-
|
|
144
|
+
const { sanitizeError } = await import("./utils-HKVOS2O3.js");
|
|
145
145
|
process.stderr.write(`
|
|
146
146
|
[kordoc] ERROR: ${fileName} \u2014 ${sanitizeError(err)}
|
|
147
147
|
`);
|
|
@@ -225,7 +225,7 @@ program.command("convert <input>").description("\uB9C8\uD06C\uB2E4\uC6B4 \uD30C\
|
|
|
225
225
|
`));
|
|
226
226
|
}
|
|
227
227
|
} catch (err) {
|
|
228
|
-
const { sanitizeError } = await import("./utils-
|
|
228
|
+
const { sanitizeError } = await import("./utils-HKVOS2O3.js");
|
|
229
229
|
process.stderr.write(` FAIL
|
|
230
230
|
`);
|
|
231
231
|
process.stderr.write(` \u2192 ${sanitizeError(err)}
|
|
@@ -234,7 +234,7 @@ program.command("convert <input>").description("\uB9C8\uD06C\uB2E4\uC6B4 \uD30C\
|
|
|
234
234
|
}
|
|
235
235
|
});
|
|
236
236
|
program.command("watch <dir>").description("\uB514\uB809\uD1A0\uB9AC \uAC10\uC2DC \u2014 \uC0C8 \uBB38\uC11C \uC790\uB3D9 \uBCC0\uD658").option("--webhook <url>", "\uACB0\uACFC \uC804\uC1A1 \uC6F9\uD6C5 URL").option("-d, --out-dir <dir>", "\uBCC0\uD658 \uACB0\uACFC \uCD9C\uB825 \uB514\uB809\uD1A0\uB9AC").option("-p, --pages <range>", "\uD398\uC774\uC9C0/\uC139\uC158 \uBC94\uC704").option("--format <type>", "\uCD9C\uB825 \uD615\uC2DD: markdown \uB610\uB294 json", "markdown").option("--silent", "\uC9C4\uD589 \uBA54\uC2DC\uC9C0 \uC228\uAE30\uAE30").action(async (dir, opts) => {
|
|
237
|
-
const { watchDirectory } = await import("./watch-
|
|
237
|
+
const { watchDirectory } = await import("./watch-EYOGF3HY.js");
|
|
238
238
|
await watchDirectory({
|
|
239
239
|
dir,
|
|
240
240
|
outDir: opts.outDir,
|
package/dist/index.cjs
CHANGED
|
@@ -2026,7 +2026,7 @@ var init_auto_detect = __esm({
|
|
|
2026
2026
|
// src/ocr/cli-provider.ts
|
|
2027
2027
|
function getTempDir() {
|
|
2028
2028
|
if (!_tempDir) {
|
|
2029
|
-
_tempDir = (0, import_path.join)(process.cwd(), "
|
|
2029
|
+
_tempDir = (0, import_path.join)(process.cwd(), ".kordoc_ocr_tmp");
|
|
2030
2030
|
(0, import_fs.mkdirSync)(_tempDir, { recursive: true });
|
|
2031
2031
|
}
|
|
2032
2032
|
return _tempDir;
|
|
@@ -2051,6 +2051,12 @@ function createCliOcrProvider(mode) {
|
|
|
2051
2051
|
}
|
|
2052
2052
|
};
|
|
2053
2053
|
}
|
|
2054
|
+
function checkForLimitError(output, mode) {
|
|
2055
|
+
const lower = output.toLowerCase();
|
|
2056
|
+
if (lower.includes("usage limit") || lower.includes("rate limit")) {
|
|
2057
|
+
throw new Error(`${mode} \uC0AC\uC6A9\uB7C9/\uC18D\uB3C4 \uC81C\uD55C: ${output.trim().slice(0, 200)}`);
|
|
2058
|
+
}
|
|
2059
|
+
}
|
|
2054
2060
|
function callCli(mode, imagePath) {
|
|
2055
2061
|
if (mode === "codex") {
|
|
2056
2062
|
return callCodexCli(imagePath);
|
|
@@ -2071,7 +2077,9 @@ function callCli(mode, imagePath) {
|
|
|
2071
2077
|
const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
|
|
2072
2078
|
throw new Error(`${mode} OCR \uC2E4\uD328: ${errMsg}`);
|
|
2073
2079
|
}
|
|
2074
|
-
|
|
2080
|
+
const output = result.stdout || "";
|
|
2081
|
+
checkForLimitError(output, mode);
|
|
2082
|
+
return output;
|
|
2075
2083
|
}
|
|
2076
2084
|
function callCodexCli(imagePath) {
|
|
2077
2085
|
const outPath = (0, import_path.join)((0, import_os.tmpdir)(), `kordoc-codex-out-${Date.now()}.txt`);
|
|
@@ -2094,11 +2102,14 @@ function callCodexCli(imagePath) {
|
|
|
2094
2102
|
const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
|
|
2095
2103
|
throw new Error(`codex OCR \uC2E4\uD328: ${errMsg}`);
|
|
2096
2104
|
}
|
|
2105
|
+
let text;
|
|
2097
2106
|
try {
|
|
2098
|
-
|
|
2107
|
+
text = (0, import_fs.readFileSync)(outPath, "utf-8");
|
|
2099
2108
|
} catch {
|
|
2100
|
-
|
|
2109
|
+
text = result.stdout || "";
|
|
2101
2110
|
}
|
|
2111
|
+
checkForLimitError(text, "codex");
|
|
2112
|
+
return text;
|
|
2102
2113
|
} finally {
|
|
2103
2114
|
try {
|
|
2104
2115
|
(0, import_fs.unlinkSync)(outPath);
|
|
@@ -2367,7 +2378,9 @@ ${fileRefs}`;
|
|
|
2367
2378
|
const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
|
|
2368
2379
|
throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
|
|
2369
2380
|
}
|
|
2370
|
-
|
|
2381
|
+
const output = result.stdout || "";
|
|
2382
|
+
checkForLimitError2(output, mode);
|
|
2383
|
+
return output;
|
|
2371
2384
|
}
|
|
2372
2385
|
async function callBatchCodexCli(imagePaths) {
|
|
2373
2386
|
const outPath = (0, import_path2.join)((0, import_os2.tmpdir)(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`);
|
|
@@ -2388,11 +2401,14 @@ async function callBatchCodexCli(imagePaths) {
|
|
|
2388
2401
|
const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
|
|
2389
2402
|
throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
|
|
2390
2403
|
}
|
|
2404
|
+
let text;
|
|
2391
2405
|
try {
|
|
2392
|
-
|
|
2406
|
+
text = (0, import_fs2.readFileSync)(outPath, "utf-8");
|
|
2393
2407
|
} catch {
|
|
2394
|
-
|
|
2408
|
+
text = result.stdout || "";
|
|
2395
2409
|
}
|
|
2410
|
+
checkForLimitError2(text, "codex");
|
|
2411
|
+
return text;
|
|
2396
2412
|
} finally {
|
|
2397
2413
|
try {
|
|
2398
2414
|
(0, import_fs2.unlinkSync)(outPath);
|
|
@@ -2400,6 +2416,12 @@ async function callBatchCodexCli(imagePaths) {
|
|
|
2400
2416
|
}
|
|
2401
2417
|
}
|
|
2402
2418
|
}
|
|
2419
|
+
function checkForLimitError2(output, mode) {
|
|
2420
|
+
const lower = output.toLowerCase();
|
|
2421
|
+
if (lower.includes("usage limit") || lower.includes("rate limit")) {
|
|
2422
|
+
throw new Error(`${mode} \uC0AC\uC6A9\uB7C9/\uC18D\uB3C4 \uC81C\uD55C: ${output.trim().slice(0, 200)}`);
|
|
2423
|
+
}
|
|
2424
|
+
}
|
|
2403
2425
|
function stripCodeFence2(text) {
|
|
2404
2426
|
const match = text.match(/^```(?:markdown|md)?\s*\n([\s\S]*?)\n```\s*$/m);
|
|
2405
2427
|
return match ? match[1].trim() : text;
|
|
@@ -2854,7 +2876,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
|
|
|
2854
2876
|
var import_xmldom = require("@xmldom/xmldom");
|
|
2855
2877
|
|
|
2856
2878
|
// src/utils.ts
|
|
2857
|
-
var VERSION = true ? "2.4.
|
|
2879
|
+
var VERSION = true ? "2.4.11" : "0.0.0-dev";
|
|
2858
2880
|
function toArrayBuffer(buf) {
|
|
2859
2881
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2860
2882
|
return buf.buffer;
|