@clazic/kordoc 2.3.1 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/dist/{batch-provider-PCT4I4LK.js → batch-provider-PNDCSGQW.js} +3 -4
  2. package/dist/batch-provider-PNDCSGQW.js.map +1 -0
  3. package/dist/{chunk-ZOEUKD77.js → chunk-2GFJFTKS.js} +193 -49
  4. package/dist/chunk-2GFJFTKS.js.map +1 -0
  5. package/dist/chunk-4PP34NVQ.js +121 -0
  6. package/dist/chunk-4PP34NVQ.js.map +1 -0
  7. package/dist/{tesseract-provider-WCVJWBUT.js → chunk-7FMKAV4P.js} +4 -4
  8. package/dist/{tesseract-provider-WCVJWBUT.js.map → chunk-7FMKAV4P.js.map} +1 -1
  9. package/dist/chunk-JOGAFNIL.js +153 -0
  10. package/dist/chunk-JOGAFNIL.js.map +1 -0
  11. package/dist/{chunk-W5KUC23B.js → chunk-STIKJGEA.js} +2 -2
  12. package/dist/cli.js +8 -5
  13. package/dist/cli.js.map +1 -1
  14. package/dist/index.cjs +217 -70
  15. package/dist/index.cjs.map +1 -1
  16. package/dist/index.d.cts +11 -6
  17. package/dist/index.d.ts +11 -6
  18. package/dist/index.js +217 -70
  19. package/dist/index.js.map +1 -1
  20. package/dist/mcp.js +5 -2
  21. package/dist/mcp.js.map +1 -1
  22. package/dist/{provider-WYHC4NHI.js → provider-HE727F7Z.js} +19 -131
  23. package/dist/provider-HE727F7Z.js.map +1 -0
  24. package/dist/resolve-QA3VACUP.js +111 -0
  25. package/dist/resolve-QA3VACUP.js.map +1 -0
  26. package/dist/tesseract-provider-MNMZPSGF.js +11 -0
  27. package/dist/{utils-HSF5HI5T.js → utils-FFUQJTTI.js} +2 -2
  28. package/dist/utils-FFUQJTTI.js.map +1 -0
  29. package/dist/{watch-R2JHXDGF.js → watch-2O32L6IF.js} +6 -3
  30. package/dist/{watch-R2JHXDGF.js.map → watch-2O32L6IF.js.map} +1 -1
  31. package/package.json +7 -8
  32. package/dist/batch-provider-PCT4I4LK.js.map +0 -1
  33. package/dist/chunk-ZOEUKD77.js.map +0 -1
  34. package/dist/provider-WYHC4NHI.js.map +0 -1
  35. package/dist/resolve-4FSAQF2S.js +0 -247
  36. package/dist/resolve-4FSAQF2S.js.map +0 -1
  37. /package/dist/{chunk-W5KUC23B.js.map → chunk-STIKJGEA.js.map} +0 -0
  38. /package/dist/{utils-HSF5HI5T.js.map → tesseract-provider-MNMZPSGF.js.map} +0 -0
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@clazic/kordoc",
3
- "version": "2.3.1",
3
+ "version": "2.4.0",
4
4
  "description": "Parse Korean documents (HWP, HWPX, PDF, XLSX, DOCX) to Markdown",
5
5
  "type": "module",
6
6
  "exports": {
@@ -55,22 +55,21 @@
55
55
  "node": ">=18"
56
56
  },
57
57
  "dependencies": {
58
- "@modelcontextprotocol/sdk": "^1.28.0",
58
+ "@modelcontextprotocol/sdk": "^1.29.0",
59
59
  "@napi-rs/canvas": "^0.1.97",
60
- "@xmldom/xmldom": "^0.9.8",
60
+ "@xmldom/xmldom": "^0.9.9",
61
61
  "cfb": "1.2.2",
62
- "commander": "^13.0.0",
62
+ "commander": "^14.0.3",
63
63
  "exceljs": "^4.4.0",
64
64
  "jszip": "^3.10.1",
65
- "pdfjs-dist": "^4.10.38",
66
65
  "tesseract.js": "^7.0.0",
67
- "zod": "^3.23.0"
66
+ "zod": "^4.3.6"
68
67
  },
69
68
  "devDependencies": {
70
69
  "@types/node": "^18.19.130",
71
- "pdfjs-dist": "^4.10.38",
70
+ "pdfjs-dist": "^5.6.205",
72
71
  "tsup": "^8.4.0",
73
72
  "tsx": "^4.21.0",
74
- "typescript": "^5.9.0"
73
+ "typescript": "^6.0.2"
75
74
  }
76
75
  }
@@ -1 +0,0 @@
1
- {"version":3,"sources":["../src/ocr/batch-provider.ts"],"sourcesContent":["/**\n * CLI 배치 OCR 프로바이더\n *\n * 여러 페이지 이미지를 단일 CLI 호출로 처리하여 API 호출 수를 대폭 감소.\n * gemini/claude: @file 멀티 참조, codex: --image 멀티 플래그\n *\n * 299페이지 기준:\n * - 기존: CLI 299회 호출 (~30분)\n * - 배치: CLI 3~6회 호출 (~3분)\n */\n\nimport { spawn } from \"child_process\"\nimport { writeFileSync, readFileSync, unlinkSync, mkdirSync } from \"fs\"\nimport { join } from \"path\"\nimport { tmpdir } from \"os\"\nimport type { StructuredOcrResult, BatchOcrProvider } from \"../types.js\"\n\n/** 배치 OCR 프롬프트 */\nconst BATCH_OCR_PROMPT =\n \"다음 문서 페이지 이미지들을 OCR하여 순수 Markdown으로 변환하세요.\\n\\n\" +\n \"규칙:\\n\" +\n \"- 각 페이지 결과 사이에 반드시 이 구분자를 삽입: <!-- PAGE_BREAK -->\\n\" +\n \"- 테이블은 Markdown 테이블 문법 사용 (| 구분, |---|---| 헤더 구분선 포함)\\n\" +\n \"- 병합된 셀은 해당 위치에 내용 기재\\n\" +\n \"- 헤딩은 글자 크기에 따라 ## ~ ###### 사용\\n\" +\n \"- 리스트는 - 또는 1. 사용\\n\" +\n \"- 이미지, 도형 등 비텍스트 요소는 무시\\n\" +\n \"- 원문의 읽기 순서와 구조를 유지\\n\" +\n \"- ```로 감싸지 말고 순수 Markdown만 출력\"\n\n/** 모드별 기본 배치 크기 (CLI 내부 타임아웃 + 실측 기반)\n *\n * gemini CLI: 10장 이상에서 AbortError 발생 (내부 타임아웃).\n * 5장 배치가 안정적으로 동작 확인 (35초/배치).\n * 299페이지 = 60배치 = 기존 299회 대비 80% 감소.\n */\nexport const DEFAULT_BATCH_SIZES: Record<string, number> = {\n gemini: 5,\n claude: 5,\n codex: 10,\n}\n\n/** 임시 디렉토리 — gemini CLI는 cwd 하위 + gitignore 밖만 @참조 가능 */\nlet _batchTempDir: string | null = null\nfunction getBatchTempDir(): string {\n if (!_batchTempDir) {\n _batchTempDir = join(process.cwd(), \"_kordoc_ocr_tmp\")\n mkdirSync(_batchTempDir, { recursive: true })\n }\n return _batchTempDir\n}\n\n/**\n * 배치 CLI 프로바이더 생성\n */\nexport function createBatchCliProvider(\n mode: \"gemini\" | \"claude\" | \"codex\",\n batchSize: number\n): BatchOcrProvider {\n return {\n __batch: true as const,\n batchSize,\n async processBatch(pages) {\n const results = new Map<number, StructuredOcrResult>()\n const tempDir = getBatchTempDir()\n const tempFiles: string[] = []\n\n try {\n // 1. Write all page images to temp files\n for (const { image, pageNum } of pages) {\n const path = join(tempDir, `batch-p${pageNum}.png`)\n writeFileSync(path, image)\n tempFiles.push(path)\n }\n\n // 2. Call CLI with all file references (비동기 — 병렬 배치 실행 가능)\n let output: string\n if (mode === \"codex\") {\n output = await callBatchCodexCli(tempFiles)\n } else {\n output = await callBatchCli(mode, tempFiles)\n }\n\n // 3. Parse response by PAGE_BREAK separator\n const cleaned = stripCodeFence(output.trim())\n const parts = cleaned.split(/<!--\\s*PAGE_BREAK\\s*-->/)\n .map(p => p.trim())\n .filter(p => p.length > 0)\n\n // 4. Map results to page numbers (best-effort if count mismatch)\n for (let i = 0; i < pages.length; i++) {\n const pageNum = pages[i].pageNum\n if (i < parts.length) {\n results.set(pageNum, { markdown: parts[i] })\n }\n // If fewer parts than pages, remaining pages get no result\n }\n } finally {\n // 5. Clean up temp files\n for (const f of tempFiles) {\n try { unlinkSync(f) } catch { /* ignore */ }\n }\n }\n\n return results\n },\n }\n}\n\n/**\n * 비동기 CLI 실행 헬퍼 — spawn + Promise 래핑.\n * spawnSync는 이벤트 루프를 차단하여 병렬 배치 실행 불가.\n */\nfunction spawnAsync(\n cmd: string,\n args: string[],\n opts: { timeoutMs: number; cwd?: string; stdin?: string }\n): Promise<{ stdout: string; stderr: string; exitCode: number }> {\n return new Promise((resolve, reject) => {\n const child = spawn(cmd, args, {\n cwd: opts.cwd,\n env: process.env,\n stdio: [\"pipe\", \"pipe\", \"pipe\"],\n })\n\n let stdout = \"\"\n let stderr = \"\"\n let killed = false\n\n child.stdout.setEncoding(\"utf-8\")\n child.stderr.setEncoding(\"utf-8\")\n child.stdout.on(\"data\", (d: string) => { stdout += d })\n child.stderr.on(\"data\", (d: string) => { stderr += d })\n\n const timer = setTimeout(() => {\n killed = true\n child.kill(\"SIGTERM\")\n }, opts.timeoutMs)\n\n if (opts.stdin !== undefined) {\n child.stdin.end(opts.stdin)\n } else {\n child.stdin.end()\n }\n\n child.on(\"close\", (code) => {\n clearTimeout(timer)\n if (killed) {\n reject(new Error(`타임아웃 (${Math.round(opts.timeoutMs / 1000)}초)`))\n } else {\n resolve({ stdout, stderr, exitCode: code ?? 1 })\n }\n })\n child.on(\"error\", (err) => {\n clearTimeout(timer)\n reject(err)\n })\n })\n}\n\n/** gemini/claude 배치 호출 (비동기) */\nasync function callBatchCli(mode: \"gemini\" | \"claude\", imagePaths: string[]): Promise<string> {\n const fileRefs = imagePaths.map(p => `@${p}`).join(\"\\n\")\n const prompt = `${BATCH_OCR_PROMPT}\\n\\n${fileRefs}`\n\n let args: string[]\n if (mode === \"gemini\") {\n args = [\"--prompt\", prompt, \"--yolo\"]\n const model = process.env.KORDOC_GEMINI_MODEL\n if (model) args.push(\"--model\", model)\n } else {\n args = [\"--print\", prompt]\n const model = process.env.KORDOC_CLAUDE_MODEL\n if (model) args.push(\"--model\", model)\n }\n\n const timeoutMs = 60_000 + imagePaths.length * 20_000\n const result = await spawnAsync(mode, args, {\n timeoutMs,\n ...(mode === \"claude\" ? { cwd: tmpdir() } : {}),\n })\n\n if (result.exitCode !== 0) {\n const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`\n throw new Error(`${mode} 배치 OCR 실패: ${errMsg}`)\n }\n\n return result.stdout || \"\"\n}\n\n/** codex 배치 호출 (비동기) — --image를 여러 번 지정 */\nasync function callBatchCodexCli(imagePaths: string[]): Promise<string> {\n const outPath = join(tmpdir(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`)\n try {\n const args = [\"exec\", BATCH_OCR_PROMPT]\n for (const p of imagePaths) {\n args.push(\"--image\", p)\n }\n args.push(\"--output-last-message\", outPath)\n const model = process.env.KORDOC_CODEX_MODEL\n if (model) args.push(\"--model\", model)\n\n const timeoutMs = 60_000 + imagePaths.length * 20_000\n const result = await spawnAsync(\"codex\", args, {\n timeoutMs,\n stdin: \"\",\n })\n\n if (result.exitCode !== 0) {\n const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`\n throw new Error(`codex 배치 OCR 실패: ${errMsg}`)\n }\n\n try {\n return readFileSync(outPath, \"utf-8\")\n } catch {\n return result.stdout || \"\"\n }\n } finally {\n try { unlinkSync(outPath) } catch { /* ignore */ }\n }\n}\n\n/** LLM 출력에서 코드 펜스 제거 (cli-provider.ts와 동일 로직) */\nfunction stripCodeFence(text: string): string {\n const match = text.match(/^```(?:markdown|md)?\\s*\\n([\\s\\S]*?)\\n```\\s*$/m)\n return match ? match[1].trim() : text\n}\n"],"mappings":";;;;AAWA,SAAS,aAAa;AACtB,SAAS,eAAe,cAAc,YAAY,iBAAiB;AACnE,SAAS,YAAY;AACrB,SAAS,cAAc;AAIvB,IAAM,mBACJ;AAiBK,IAAM,sBAA8C;AAAA,EACzD,QAAQ;AAAA,EACR,QAAQ;AAAA,EACR,OAAO;AACT;AAGA,IAAI,gBAA+B;AACnC,SAAS,kBAA0B;AACjC,MAAI,CAAC,eAAe;AAClB,oBAAgB,KAAK,QAAQ,IAAI,GAAG,iBAAiB;AACrD,cAAU,eAAe,EAAE,WAAW,KAAK,CAAC;AAAA,EAC9C;AACA,SAAO;AACT;AAKO,SAAS,uBACd,MACA,WACkB;AAClB,SAAO;AAAA,IACL,SAAS;AAAA,IACT;AAAA,IACA,MAAM,aAAa,OAAO;AACxB,YAAM,UAAU,oBAAI,IAAiC;AACrD,YAAM,UAAU,gBAAgB;AAChC,YAAM,YAAsB,CAAC;AAE7B,UAAI;AAEF,mBAAW,EAAE,OAAO,QAAQ,KAAK,OAAO;AACtC,gBAAM,OAAO,KAAK,SAAS,UAAU,OAAO,MAAM;AAClD,wBAAc,MAAM,KAAK;AACzB,oBAAU,KAAK,IAAI;AAAA,QACrB;AAGA,YAAI;AACJ,YAAI,SAAS,SAAS;AACpB,mBAAS,MAAM,kBAAkB,SAAS;AAAA,QAC5C,OAAO;AACL,mBAAS,MAAM,aAAa,MAAM,SAAS;AAAA,QAC7C;AAGA,cAAM,UAAU,eAAe,OAAO,KAAK,CAAC;AAC5C,cAAM,QAAQ,QAAQ,MAAM,yBAAyB,EAClD,IAAI,OAAK,EAAE,KAAK,CAAC,EACjB,OAAO,OAAK,EAAE,SAAS,CAAC;AAG3B,iBAAS,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;AACrC,gBAAM,UAAU,MAAM,CAAC,EAAE;AACzB,cAAI,IAAI,MAAM,QAAQ;AACpB,oBAAQ,IAAI,SAAS,EAAE,UAAU,MAAM,CAAC,EAAE,CAAC;AAAA,UAC7C;AAAA,QAEF;AAAA,MACF,UAAE;AAEA,mBAAW,KAAK,WAAW;AACzB,cAAI;AAAE,uBAAW,CAAC;AAAA,UAAE,QAAQ;AAAA,UAAe;AAAA,QAC7C;AAAA,MACF;AAEA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAMA,SAAS,WACP,KACA,MACA,MAC+D;AAC/D,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,QAAQ,MAAM,KAAK,MAAM;AAAA,MAC7B,KAAK,KAAK;AAAA,MACV,KAAK,QAAQ;AAAA,MACb,OAAO,CAAC,QAAQ,QAAQ,MAAM;AAAA,IAChC,CAAC;AAED,QAAI,SAAS;AACb,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,UAAM,OAAO,YAAY,OAAO;AAChC,UAAM,OAAO,YAAY,OAAO;AAChC,UAAM,OAAO,GAAG,QAAQ,CAAC,MAAc;AAAE,gBAAU;AAAA,IAAE,CAAC;AACtD,UAAM,OAAO,GAAG,QAAQ,CAAC,MAAc;AAAE,gBAAU;AAAA,IAAE,CAAC;AAEtD,UAAM,QAAQ,WAAW,MAAM;AAC7B,eAAS;AACT,YAAM,KAAK,SAAS;AAAA,IACtB,GAAG,KAAK,SAAS;AAEjB,QAAI,KAAK,UAAU,QAAW;AAC5B,YAAM,MAAM,IAAI,KAAK,KAAK;AAAA,IAC5B,OAAO;AACL,YAAM,MAAM,IAAI;AAAA,IAClB;AAEA,UAAM,GAAG,SAAS,CAAC,SAAS;AAC1B,mBAAa,KAAK;AAClB,UAAI,QAAQ;AACV,eAAO,IAAI,MAAM,6BAAS,KAAK,MAAM,KAAK,YAAY,GAAI,CAAC,SAAI,CAAC;AAAA,MAClE,OAAO;AACL,gBAAQ,EAAE,QAAQ,QAAQ,UAAU,QAAQ,EAAE,CAAC;AAAA,MACjD;AAAA,IACF,CAAC;AACD,UAAM,GAAG,SAAS,CAAC,QAAQ;AACzB,mBAAa,KAAK;AAClB,aAAO,GAAG;AAAA,IACZ,CAAC;AAAA,EACH,CAAC;AACH;AAGA,eAAe,aAAa,MAA2B,YAAuC;AAC5F,QAAM,WAAW,WAAW,IAAI,OAAK,IAAI,CAAC,EAAE,EAAE,KAAK,IAAI;AACvD,QAAM,SAAS,GAAG,gBAAgB;AAAA;AAAA,EAAO,QAAQ;AAEjD,MAAI;AACJ,MAAI,SAAS,UAAU;AACrB,WAAO,CAAC,YAAY,QAAQ,QAAQ;AACpC,UAAM,QAAQ,QAAQ,IAAI;AAC1B,QAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AAAA,EACvC,OAAO;AACL,WAAO,CAAC,WAAW,MAAM;AACzB,UAAM,QAAQ,QAAQ,IAAI;AAC1B,QAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AAAA,EACvC;AAEA,QAAM,YAAY,MAAS,WAAW,SAAS;AAC/C,QAAM,SAAS,MAAM,WAAW,MAAM,MAAM;AAAA,IAC1C;AAAA,IACA,GAAI,SAAS,WAAW,EAAE,KAAK,OAAO,EAAE,IAAI,CAAC;AAAA,EAC/C,CAAC;AAED,MAAI,OAAO,aAAa,GAAG;AACzB,UAAM,SAAS,OAAO,QAAQ,KAAK,KAAK,aAAa,OAAO,QAAQ;AACpE,UAAM,IAAI,MAAM,GAAG,IAAI,mCAAe,MAAM,EAAE;AAAA,EAChD;AAEA,SAAO,OAAO,UAAU;AAC1B;AAGA,eAAe,kBAAkB,YAAuC;AACtE,QAAM,UAAU,KAAK,OAAO,GAAG,sBAAsB,KAAK,IAAI,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,CAAC,CAAC,MAAM;AAC5G,MAAI;AACF,UAAM,OAAO,CAAC,QAAQ,gBAAgB;AACtC,eAAW,KAAK,YAAY;AAC1B,WAAK,KAAK,WAAW,CAAC;AAAA,IACxB;AACA,SAAK,KAAK,yBAAyB,OAAO;AAC1C,UAAM,QAAQ,QAAQ,IAAI;AAC1B,QAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AAErC,UAAM,YAAY,MAAS,WAAW,SAAS;AAC/C,UAAM,SAAS,MAAM,WAAW,SAAS,MAAM;AAAA,MAC7C;AAAA,MACA,OAAO;AAAA,IACT,CAAC;AAED,QAAI,OAAO,aAAa,GAAG;AACzB,YAAM,SAAS,OAAO,QAAQ,KAAK,KAAK,aAAa,OAAO,QAAQ;AACpE,YAAM,IAAI,MAAM,wCAAoB,MAAM,EAAE;AAAA,IAC9C;AAEA,QAAI;AACF,aAAO,aAAa,SAAS,OAAO;AAAA,IACtC,QAAQ;AACN,aAAO,OAAO,UAAU;AAAA,IAC1B;AAAA,EACF,UAAE;AACA,QAAI;AAAE,iBAAW,OAAO;AAAA,IAAE,QAAQ;AAAA,IAAe;AAAA,EACnD;AACF;AAGA,SAAS,eAAe,MAAsB;AAC5C,QAAM,QAAQ,KAAK,MAAM,+CAA+C;AACxE,SAAO,QAAQ,MAAM,CAAC,EAAE,KAAK,IAAI;AACnC;","names":[]}