@clazic/kordoc 2.3.0 → 2.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{batch-provider-FUCIIS4M.js → batch-provider-PCT4I4LK.js} +57 -27
- package/dist/batch-provider-PCT4I4LK.js.map +1 -0
- package/dist/{chunk-WWILSVMJ.js → chunk-W5KUC23B.js} +2 -2
- package/dist/{chunk-2ZGLFZCN.js → chunk-ZOEUKD77.js} +4 -4
- package/dist/cli.js +7 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +78 -37
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +79 -38
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +2 -2
- package/dist/{provider-OBY3XFSZ.js → provider-WYHC4NHI.js} +23 -12
- package/dist/provider-WYHC4NHI.js.map +1 -0
- package/dist/{resolve-LBFYRHJI.js → resolve-4FSAQF2S.js} +3 -3
- package/dist/{utils-QAK24RJS.js → utils-HSF5HI5T.js} +2 -2
- package/dist/{watch-MPHX3QIH.js → watch-R2JHXDGF.js} +3 -3
- package/package.json +1 -1
- package/dist/batch-provider-FUCIIS4M.js.map +0 -1
- package/dist/provider-OBY3XFSZ.js.map +0 -1
- /package/dist/{chunk-WWILSVMJ.js.map → chunk-W5KUC23B.js.map} +0 -0
- /package/dist/{chunk-2ZGLFZCN.js.map → chunk-ZOEUKD77.js.map} +0 -0
- /package/dist/{resolve-LBFYRHJI.js.map → resolve-4FSAQF2S.js.map} +0 -0
- /package/dist/{utils-QAK24RJS.js.map → utils-HSF5HI5T.js.map} +0 -0
- /package/dist/{watch-MPHX3QIH.js.map → watch-R2JHXDGF.js.map} +0 -0
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
import "./chunk-ZWE3DS7E.js";
|
|
3
3
|
|
|
4
4
|
// src/ocr/batch-provider.ts
|
|
5
|
-
import {
|
|
5
|
+
import { spawn } from "child_process";
|
|
6
6
|
import { writeFileSync, readFileSync, unlinkSync, mkdirSync } from "fs";
|
|
7
7
|
import { join } from "path";
|
|
8
8
|
import { tmpdir } from "os";
|
|
@@ -36,9 +36,9 @@ function createBatchCliProvider(mode, batchSize) {
|
|
|
36
36
|
}
|
|
37
37
|
let output;
|
|
38
38
|
if (mode === "codex") {
|
|
39
|
-
output = callBatchCodexCli(tempFiles);
|
|
39
|
+
output = await callBatchCodexCli(tempFiles);
|
|
40
40
|
} else {
|
|
41
|
-
output = callBatchCli(mode, tempFiles);
|
|
41
|
+
output = await callBatchCli(mode, tempFiles);
|
|
42
42
|
}
|
|
43
43
|
const cleaned = stripCodeFence(output.trim());
|
|
44
44
|
const parts = cleaned.split(/<!--\s*PAGE_BREAK\s*-->/).map((p) => p.trim()).filter((p) => p.length > 0);
|
|
@@ -60,7 +60,48 @@ function createBatchCliProvider(mode, batchSize) {
|
|
|
60
60
|
}
|
|
61
61
|
};
|
|
62
62
|
}
|
|
63
|
-
function
|
|
63
|
+
function spawnAsync(cmd, args, opts) {
|
|
64
|
+
return new Promise((resolve, reject) => {
|
|
65
|
+
const child = spawn(cmd, args, {
|
|
66
|
+
cwd: opts.cwd,
|
|
67
|
+
env: process.env,
|
|
68
|
+
stdio: ["pipe", "pipe", "pipe"]
|
|
69
|
+
});
|
|
70
|
+
let stdout = "";
|
|
71
|
+
let stderr = "";
|
|
72
|
+
let killed = false;
|
|
73
|
+
child.stdout.setEncoding("utf-8");
|
|
74
|
+
child.stderr.setEncoding("utf-8");
|
|
75
|
+
child.stdout.on("data", (d) => {
|
|
76
|
+
stdout += d;
|
|
77
|
+
});
|
|
78
|
+
child.stderr.on("data", (d) => {
|
|
79
|
+
stderr += d;
|
|
80
|
+
});
|
|
81
|
+
const timer = setTimeout(() => {
|
|
82
|
+
killed = true;
|
|
83
|
+
child.kill("SIGTERM");
|
|
84
|
+
}, opts.timeoutMs);
|
|
85
|
+
if (opts.stdin !== void 0) {
|
|
86
|
+
child.stdin.end(opts.stdin);
|
|
87
|
+
} else {
|
|
88
|
+
child.stdin.end();
|
|
89
|
+
}
|
|
90
|
+
child.on("close", (code) => {
|
|
91
|
+
clearTimeout(timer);
|
|
92
|
+
if (killed) {
|
|
93
|
+
reject(new Error(`\uD0C0\uC784\uC544\uC6C3 (${Math.round(opts.timeoutMs / 1e3)}\uCD08)`));
|
|
94
|
+
} else {
|
|
95
|
+
resolve({ stdout, stderr, exitCode: code ?? 1 });
|
|
96
|
+
}
|
|
97
|
+
});
|
|
98
|
+
child.on("error", (err) => {
|
|
99
|
+
clearTimeout(timer);
|
|
100
|
+
reject(err);
|
|
101
|
+
});
|
|
102
|
+
});
|
|
103
|
+
}
|
|
104
|
+
async function callBatchCli(mode, imagePaths) {
|
|
64
105
|
const fileRefs = imagePaths.map((p) => `@${p}`).join("\n");
|
|
65
106
|
const prompt = `${BATCH_OCR_PROMPT}
|
|
66
107
|
|
|
@@ -76,24 +117,18 @@ ${fileRefs}`;
|
|
|
76
117
|
if (model) args.push("--model", model);
|
|
77
118
|
}
|
|
78
119
|
const timeoutMs = 6e4 + imagePaths.length * 2e4;
|
|
79
|
-
const result =
|
|
80
|
-
|
|
81
|
-
timeout: timeoutMs,
|
|
82
|
-
maxBuffer: 50 * 1024 * 1024,
|
|
83
|
-
// 50MB (large batch output)
|
|
120
|
+
const result = await spawnAsync(mode, args, {
|
|
121
|
+
timeoutMs,
|
|
84
122
|
...mode === "claude" ? { cwd: tmpdir() } : {}
|
|
85
123
|
});
|
|
86
|
-
if (result.
|
|
87
|
-
|
|
88
|
-
}
|
|
89
|
-
if (result.status !== 0) {
|
|
90
|
-
const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
|
|
124
|
+
if (result.exitCode !== 0) {
|
|
125
|
+
const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
|
|
91
126
|
throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
|
|
92
127
|
}
|
|
93
128
|
return result.stdout || "";
|
|
94
129
|
}
|
|
95
|
-
function callBatchCodexCli(imagePaths) {
|
|
96
|
-
const outPath = join(tmpdir(), `kordoc-codex-batch-${Date.now()}.txt`);
|
|
130
|
+
async function callBatchCodexCli(imagePaths) {
|
|
131
|
+
const outPath = join(tmpdir(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`);
|
|
97
132
|
try {
|
|
98
133
|
const args = ["exec", BATCH_OCR_PROMPT];
|
|
99
134
|
for (const p of imagePaths) {
|
|
@@ -103,17 +138,12 @@ function callBatchCodexCli(imagePaths) {
|
|
|
103
138
|
const model = process.env.KORDOC_CODEX_MODEL;
|
|
104
139
|
if (model) args.push("--model", model);
|
|
105
140
|
const timeoutMs = 6e4 + imagePaths.length * 2e4;
|
|
106
|
-
const result =
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
maxBuffer: 50 * 1024 * 1024,
|
|
110
|
-
input: ""
|
|
141
|
+
const result = await spawnAsync("codex", args, {
|
|
142
|
+
timeoutMs,
|
|
143
|
+
stdin: ""
|
|
111
144
|
});
|
|
112
|
-
if (result.
|
|
113
|
-
|
|
114
|
-
}
|
|
115
|
-
if (result.status !== 0) {
|
|
116
|
-
const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
|
|
145
|
+
if (result.exitCode !== 0) {
|
|
146
|
+
const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
|
|
117
147
|
throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
|
|
118
148
|
}
|
|
119
149
|
try {
|
|
@@ -136,4 +166,4 @@ export {
|
|
|
136
166
|
DEFAULT_BATCH_SIZES,
|
|
137
167
|
createBatchCliProvider
|
|
138
168
|
};
|
|
139
|
-
//# sourceMappingURL=batch-provider-
|
|
169
|
+
//# sourceMappingURL=batch-provider-PCT4I4LK.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/ocr/batch-provider.ts"],"sourcesContent":["/**\n * CLI 배치 OCR 프로바이더\n *\n * 여러 페이지 이미지를 단일 CLI 호출로 처리하여 API 호출 수를 대폭 감소.\n * gemini/claude: @file 멀티 참조, codex: --image 멀티 플래그\n *\n * 299페이지 기준:\n * - 기존: CLI 299회 호출 (~30분)\n * - 배치: CLI 3~6회 호출 (~3분)\n */\n\nimport { spawn } from \"child_process\"\nimport { writeFileSync, readFileSync, unlinkSync, mkdirSync } from \"fs\"\nimport { join } from \"path\"\nimport { tmpdir } from \"os\"\nimport type { StructuredOcrResult, BatchOcrProvider } from \"../types.js\"\n\n/** 배치 OCR 프롬프트 */\nconst BATCH_OCR_PROMPT =\n \"다음 문서 페이지 이미지들을 OCR하여 순수 Markdown으로 변환하세요.\\n\\n\" +\n \"규칙:\\n\" +\n \"- 각 페이지 결과 사이에 반드시 이 구분자를 삽입: <!-- PAGE_BREAK -->\\n\" +\n \"- 테이블은 Markdown 테이블 문법 사용 (| 구분, |---|---| 헤더 구분선 포함)\\n\" +\n \"- 병합된 셀은 해당 위치에 내용 기재\\n\" +\n \"- 헤딩은 글자 크기에 따라 ## ~ ###### 사용\\n\" +\n \"- 리스트는 - 또는 1. 사용\\n\" +\n \"- 이미지, 도형 등 비텍스트 요소는 무시\\n\" +\n \"- 원문의 읽기 순서와 구조를 유지\\n\" +\n \"- ```로 감싸지 말고 순수 Markdown만 출력\"\n\n/** 모드별 기본 배치 크기 (CLI 내부 타임아웃 + 실측 기반)\n *\n * gemini CLI: 10장 이상에서 AbortError 발생 (내부 타임아웃).\n * 5장 배치가 안정적으로 동작 확인 (35초/배치).\n * 299페이지 = 60배치 = 기존 299회 대비 80% 감소.\n */\nexport const DEFAULT_BATCH_SIZES: Record<string, number> = {\n gemini: 5,\n claude: 5,\n codex: 10,\n}\n\n/** 임시 디렉토리 — gemini CLI는 cwd 하위 + gitignore 밖만 @참조 가능 */\nlet _batchTempDir: string | null = null\nfunction getBatchTempDir(): string {\n if (!_batchTempDir) {\n _batchTempDir = join(process.cwd(), \"_kordoc_ocr_tmp\")\n mkdirSync(_batchTempDir, { recursive: true })\n }\n return _batchTempDir\n}\n\n/**\n * 배치 CLI 프로바이더 생성\n */\nexport function createBatchCliProvider(\n mode: \"gemini\" | \"claude\" | \"codex\",\n batchSize: number\n): BatchOcrProvider {\n return {\n __batch: true as const,\n batchSize,\n async processBatch(pages) {\n const results = new Map<number, StructuredOcrResult>()\n const tempDir = getBatchTempDir()\n const tempFiles: string[] = []\n\n try {\n // 1. Write all page images to temp files\n for (const { image, pageNum } of pages) {\n const path = join(tempDir, `batch-p${pageNum}.png`)\n writeFileSync(path, image)\n tempFiles.push(path)\n }\n\n // 2. Call CLI with all file references (비동기 — 병렬 배치 실행 가능)\n let output: string\n if (mode === \"codex\") {\n output = await callBatchCodexCli(tempFiles)\n } else {\n output = await callBatchCli(mode, tempFiles)\n }\n\n // 3. Parse response by PAGE_BREAK separator\n const cleaned = stripCodeFence(output.trim())\n const parts = cleaned.split(/<!--\\s*PAGE_BREAK\\s*-->/)\n .map(p => p.trim())\n .filter(p => p.length > 0)\n\n // 4. Map results to page numbers (best-effort if count mismatch)\n for (let i = 0; i < pages.length; i++) {\n const pageNum = pages[i].pageNum\n if (i < parts.length) {\n results.set(pageNum, { markdown: parts[i] })\n }\n // If fewer parts than pages, remaining pages get no result\n }\n } finally {\n // 5. Clean up temp files\n for (const f of tempFiles) {\n try { unlinkSync(f) } catch { /* ignore */ }\n }\n }\n\n return results\n },\n }\n}\n\n/**\n * 비동기 CLI 실행 헬퍼 — spawn + Promise 래핑.\n * spawnSync는 이벤트 루프를 차단하여 병렬 배치 실행 불가.\n */\nfunction spawnAsync(\n cmd: string,\n args: string[],\n opts: { timeoutMs: number; cwd?: string; stdin?: string }\n): Promise<{ stdout: string; stderr: string; exitCode: number }> {\n return new Promise((resolve, reject) => {\n const child = spawn(cmd, args, {\n cwd: opts.cwd,\n env: process.env,\n stdio: [\"pipe\", \"pipe\", \"pipe\"],\n })\n\n let stdout = \"\"\n let stderr = \"\"\n let killed = false\n\n child.stdout.setEncoding(\"utf-8\")\n child.stderr.setEncoding(\"utf-8\")\n child.stdout.on(\"data\", (d: string) => { stdout += d })\n child.stderr.on(\"data\", (d: string) => { stderr += d })\n\n const timer = setTimeout(() => {\n killed = true\n child.kill(\"SIGTERM\")\n }, opts.timeoutMs)\n\n if (opts.stdin !== undefined) {\n child.stdin.end(opts.stdin)\n } else {\n child.stdin.end()\n }\n\n child.on(\"close\", (code) => {\n clearTimeout(timer)\n if (killed) {\n reject(new Error(`타임아웃 (${Math.round(opts.timeoutMs / 1000)}초)`))\n } else {\n resolve({ stdout, stderr, exitCode: code ?? 1 })\n }\n })\n child.on(\"error\", (err) => {\n clearTimeout(timer)\n reject(err)\n })\n })\n}\n\n/** gemini/claude 배치 호출 (비동기) */\nasync function callBatchCli(mode: \"gemini\" | \"claude\", imagePaths: string[]): Promise<string> {\n const fileRefs = imagePaths.map(p => `@${p}`).join(\"\\n\")\n const prompt = `${BATCH_OCR_PROMPT}\\n\\n${fileRefs}`\n\n let args: string[]\n if (mode === \"gemini\") {\n args = [\"--prompt\", prompt, \"--yolo\"]\n const model = process.env.KORDOC_GEMINI_MODEL\n if (model) args.push(\"--model\", model)\n } else {\n args = [\"--print\", prompt]\n const model = process.env.KORDOC_CLAUDE_MODEL\n if (model) args.push(\"--model\", model)\n }\n\n const timeoutMs = 60_000 + imagePaths.length * 20_000\n const result = await spawnAsync(mode, args, {\n timeoutMs,\n ...(mode === \"claude\" ? { cwd: tmpdir() } : {}),\n })\n\n if (result.exitCode !== 0) {\n const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`\n throw new Error(`${mode} 배치 OCR 실패: ${errMsg}`)\n }\n\n return result.stdout || \"\"\n}\n\n/** codex 배치 호출 (비동기) — --image를 여러 번 지정 */\nasync function callBatchCodexCli(imagePaths: string[]): Promise<string> {\n const outPath = join(tmpdir(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`)\n try {\n const args = [\"exec\", BATCH_OCR_PROMPT]\n for (const p of imagePaths) {\n args.push(\"--image\", p)\n }\n args.push(\"--output-last-message\", outPath)\n const model = process.env.KORDOC_CODEX_MODEL\n if (model) args.push(\"--model\", model)\n\n const timeoutMs = 60_000 + imagePaths.length * 20_000\n const result = await spawnAsync(\"codex\", args, {\n timeoutMs,\n stdin: \"\",\n })\n\n if (result.exitCode !== 0) {\n const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`\n throw new Error(`codex 배치 OCR 실패: ${errMsg}`)\n }\n\n try {\n return readFileSync(outPath, \"utf-8\")\n } catch {\n return result.stdout || \"\"\n }\n } finally {\n try { unlinkSync(outPath) } catch { /* ignore */ }\n }\n}\n\n/** LLM 출력에서 코드 펜스 제거 (cli-provider.ts와 동일 로직) */\nfunction stripCodeFence(text: string): string {\n const match = text.match(/^```(?:markdown|md)?\\s*\\n([\\s\\S]*?)\\n```\\s*$/m)\n return match ? match[1].trim() : text\n}\n"],"mappings":";;;;AAWA,SAAS,aAAa;AACtB,SAAS,eAAe,cAAc,YAAY,iBAAiB;AACnE,SAAS,YAAY;AACrB,SAAS,cAAc;AAIvB,IAAM,mBACJ;AAiBK,IAAM,sBAA8C;AAAA,EACzD,QAAQ;AAAA,EACR,QAAQ;AAAA,EACR,OAAO;AACT;AAGA,IAAI,gBAA+B;AACnC,SAAS,kBAA0B;AACjC,MAAI,CAAC,eAAe;AAClB,oBAAgB,KAAK,QAAQ,IAAI,GAAG,iBAAiB;AACrD,cAAU,eAAe,EAAE,WAAW,KAAK,CAAC;AAAA,EAC9C;AACA,SAAO;AACT;AAKO,SAAS,uBACd,MACA,WACkB;AAClB,SAAO;AAAA,IACL,SAAS;AAAA,IACT;AAAA,IACA,MAAM,aAAa,OAAO;AACxB,YAAM,UAAU,oBAAI,IAAiC;AACrD,YAAM,UAAU,gBAAgB;AAChC,YAAM,YAAsB,CAAC;AAE7B,UAAI;AAEF,mBAAW,EAAE,OAAO,QAAQ,KAAK,OAAO;AACtC,gBAAM,OAAO,KAAK,SAAS,UAAU,OAAO,MAAM;AAClD,wBAAc,MAAM,KAAK;AACzB,oBAAU,KAAK,IAAI;AAAA,QACrB;AAGA,YAAI;AACJ,YAAI,SAAS,SAAS;AACpB,mBAAS,MAAM,kBAAkB,SAAS;AAAA,QAC5C,OAAO;AACL,mBAAS,MAAM,aAAa,MAAM,SAAS;AAAA,QAC7C;AAGA,cAAM,UAAU,eAAe,OAAO,KAAK,CAAC;AAC5C,cAAM,QAAQ,QAAQ,MAAM,yBAAyB,EAClD,IAAI,OAAK,EAAE,KAAK,CAAC,EACjB,OAAO,OAAK,EAAE,SAAS,CAAC;AAG3B,iBAAS,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;AACrC,gBAAM,UAAU,MAAM,CAAC,EAAE;AACzB,cAAI,IAAI,MAAM,QAAQ;AACpB,oBAAQ,IAAI,SAAS,EAAE,UAAU,MAAM,CAAC,EAAE,CAAC;AAAA,UAC7C;AAAA,QAEF;AAAA,MACF,UAAE;AAEA,mBAAW,KAAK,WAAW;AACzB,cAAI;AAAE,uBAAW,CAAC;AAAA,UAAE,QAAQ;AAAA,UAAe;AAAA,QAC7C;AAAA,MACF;AAEA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAMA,SAAS,WACP,KACA,MACA,MAC+D;AAC/D,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,QAAQ,MAAM,KAAK,MAAM;AAAA,MAC7B,KAAK,KAAK;AAAA,MACV,KAAK,QAAQ;AAAA,MACb,OAAO,CAAC,QAAQ,QAAQ,MAAM;AAAA,IAChC,CAAC;AAED,QAAI,SAAS;AACb,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,UAAM,OAAO,YAAY,OAAO;AAChC,UAAM,OAAO,YAAY,OAAO;AAChC,UAAM,OAAO,GAAG,QAAQ,CAAC,MAAc;AAAE,gBAAU;AAAA,IAAE,CAAC;AACtD,UAAM,OAAO,GAAG,QAAQ,CAAC,MAAc;AAAE,gBAAU;AAAA,IAAE,CAAC;AAEtD,UAAM,QAAQ,WAAW,MAAM;AAC7B,eAAS;AACT,YAAM,KAAK,SAAS;AAAA,IACtB,GAAG,KAAK,SAAS;AAEjB,QAAI,KAAK,UAAU,QAAW;AAC5B,YAAM,MAAM,IAAI,KAAK,KAAK;AAAA,IAC5B,OAAO;AACL,YAAM,MAAM,IAAI;AAAA,IAClB;AAEA,UAAM,GAAG,SAAS,CAAC,SAAS;AAC1B,mBAAa,KAAK;AAClB,UAAI,QAAQ;AACV,eAAO,IAAI,MAAM,6BAAS,KAAK,MAAM,KAAK,YAAY,GAAI,CAAC,SAAI,CAAC;AAAA,MAClE,OAAO;AACL,gBAAQ,EAAE,QAAQ,QAAQ,UAAU,QAAQ,EAAE,CAAC;AAAA,MACjD;AAAA,IACF,CAAC;AACD,UAAM,GAAG,SAAS,CAAC,QAAQ;AACzB,mBAAa,KAAK;AAClB,aAAO,GAAG;AAAA,IACZ,CAAC;AAAA,EACH,CAAC;AACH;AAGA,eAAe,aAAa,MAA2B,YAAuC;AAC5F,QAAM,WAAW,WAAW,IAAI,OAAK,IAAI,CAAC,EAAE,EAAE,KAAK,IAAI;AACvD,QAAM,SAAS,GAAG,gBAAgB;AAAA;AAAA,EAAO,QAAQ;AAEjD,MAAI;AACJ,MAAI,SAAS,UAAU;AACrB,WAAO,CAAC,YAAY,QAAQ,QAAQ;AACpC,UAAM,QAAQ,QAAQ,IAAI;AAC1B,QAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AAAA,EACvC,OAAO;AACL,WAAO,CAAC,WAAW,MAAM;AACzB,UAAM,QAAQ,QAAQ,IAAI;AAC1B,QAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AAAA,EACvC;AAEA,QAAM,YAAY,MAAS,WAAW,SAAS;AAC/C,QAAM,SAAS,MAAM,WAAW,MAAM,MAAM;AAAA,IAC1C;AAAA,IACA,GAAI,SAAS,WAAW,EAAE,KAAK,OAAO,EAAE,IAAI,CAAC;AAAA,EAC/C,CAAC;AAED,MAAI,OAAO,aAAa,GAAG;AACzB,UAAM,SAAS,OAAO,QAAQ,KAAK,KAAK,aAAa,OAAO,QAAQ;AACpE,UAAM,IAAI,MAAM,GAAG,IAAI,mCAAe,MAAM,EAAE;AAAA,EAChD;AAEA,SAAO,OAAO,UAAU;AAC1B;AAGA,eAAe,kBAAkB,YAAuC;AACtE,QAAM,UAAU,KAAK,OAAO,GAAG,sBAAsB,KAAK,IAAI,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,CAAC,CAAC,MAAM;AAC5G,MAAI;AACF,UAAM,OAAO,CAAC,QAAQ,gBAAgB;AACtC,eAAW,KAAK,YAAY;AAC1B,WAAK,KAAK,WAAW,CAAC;AAAA,IACxB;AACA,SAAK,KAAK,yBAAyB,OAAO;AAC1C,UAAM,QAAQ,QAAQ,IAAI;AAC1B,QAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AAErC,UAAM,YAAY,MAAS,WAAW,SAAS;AAC/C,UAAM,SAAS,MAAM,WAAW,SAAS,MAAM;AAAA,MAC7C;AAAA,MACA,OAAO;AAAA,IACT,CAAC;AAED,QAAI,OAAO,aAAa,GAAG;AACzB,YAAM,SAAS,OAAO,QAAQ,KAAK,KAAK,aAAa,OAAO,QAAQ;AACpE,YAAM,IAAI,MAAM,wCAAoB,MAAM,EAAE;AAAA,IAC9C;AAEA,QAAI;AACF,aAAO,aAAa,SAAS,OAAO;AAAA,IACtC,QAAQ;AACN,aAAO,OAAO,UAAU;AAAA,IAC1B;AAAA,EACF,UAAE;AACA,QAAI;AAAE,iBAAW,OAAO;AAAA,IAAE,QAAQ;AAAA,IAAe;AAAA,EACnD;AACF;AAGA,SAAS,eAAe,MAAsB;AAC5C,QAAM,QAAQ,KAAK,MAAM,+CAA+C;AACxE,SAAO,QAAQ,MAAM,CAAC,EAAE,KAAK,IAAI;AACnC;","names":[]}
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
#!/usr/bin/env node
|
|
2
2
|
|
|
3
3
|
// src/utils.ts
|
|
4
|
-
var VERSION = true ? "2.3.
|
|
4
|
+
var VERSION = true ? "2.3.1" : "0.0.0-dev";
|
|
5
5
|
function toArrayBuffer(buf) {
|
|
6
6
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
7
7
|
return buf.buffer;
|
|
@@ -90,4 +90,4 @@ export {
|
|
|
90
90
|
sanitizeHref,
|
|
91
91
|
classifyError
|
|
92
92
|
};
|
|
93
|
-
//# sourceMappingURL=chunk-
|
|
93
|
+
//# sourceMappingURL=chunk-W5KUC23B.js.map
|
|
@@ -6,7 +6,7 @@ import {
|
|
|
6
6
|
precheckZipSize,
|
|
7
7
|
sanitizeHref,
|
|
8
8
|
toArrayBuffer
|
|
9
|
-
} from "./chunk-
|
|
9
|
+
} from "./chunk-W5KUC23B.js";
|
|
10
10
|
import {
|
|
11
11
|
parsePageRange
|
|
12
12
|
} from "./chunk-MOL7MDBG.js";
|
|
@@ -5406,7 +5406,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5406
5406
|
const ocrMode = options?.ocrMode;
|
|
5407
5407
|
if (!ocrProvider && ocrMode && ocrMode !== "off") {
|
|
5408
5408
|
try {
|
|
5409
|
-
const { resolveOcrProvider } = await import("./resolve-
|
|
5409
|
+
const { resolveOcrProvider } = await import("./resolve-4FSAQF2S.js");
|
|
5410
5410
|
const concurrency = options?.ocrConcurrency ?? 1;
|
|
5411
5411
|
const batchSize = options?.ocrBatchSize;
|
|
5412
5412
|
ocrProvider = await resolveOcrProvider(ocrMode, warnings, concurrency, batchSize);
|
|
@@ -5422,7 +5422,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5422
5422
|
if (ocrProvider) {
|
|
5423
5423
|
let ocrBlocks = [];
|
|
5424
5424
|
try {
|
|
5425
|
-
const { ocrPages } = await import("./provider-
|
|
5425
|
+
const { ocrPages } = await import("./provider-WYHC4NHI.js");
|
|
5426
5426
|
const concurrency = options?.ocrConcurrency ?? 1;
|
|
5427
5427
|
ocrBlocks = await ocrPages(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
5428
5428
|
} catch {
|
|
@@ -9624,4 +9624,4 @@ export {
|
|
|
9624
9624
|
cfb/cfb.js:
|
|
9625
9625
|
(*! crc32.js (C) 2014-present SheetJS -- http://sheetjs.com *)
|
|
9626
9626
|
*/
|
|
9627
|
-
//# sourceMappingURL=chunk-
|
|
9627
|
+
//# sourceMappingURL=chunk-ZOEUKD77.js.map
|
package/dist/cli.js
CHANGED
|
@@ -4,11 +4,11 @@ import {
|
|
|
4
4
|
markdownToHwpx,
|
|
5
5
|
markdownToXlsx,
|
|
6
6
|
parse
|
|
7
|
-
} from "./chunk-
|
|
7
|
+
} from "./chunk-ZOEUKD77.js";
|
|
8
8
|
import {
|
|
9
9
|
VERSION,
|
|
10
10
|
toArrayBuffer
|
|
11
|
-
} from "./chunk-
|
|
11
|
+
} from "./chunk-W5KUC23B.js";
|
|
12
12
|
import "./chunk-MOL7MDBG.js";
|
|
13
13
|
import "./chunk-ZWE3DS7E.js";
|
|
14
14
|
|
|
@@ -66,6 +66,8 @@ async function runParse(files, opts) {
|
|
|
66
66
|
if (n > 0) parseOptions.ocrConcurrency = n;
|
|
67
67
|
} else if (parseOptions.ocrMode === "tesseract") {
|
|
68
68
|
parseOptions.ocrConcurrency = cpus().length;
|
|
69
|
+
} else {
|
|
70
|
+
parseOptions.ocrConcurrency = 4;
|
|
69
71
|
}
|
|
70
72
|
if (opts.ocrBatchSize) {
|
|
71
73
|
const n = parseInt(opts.ocrBatchSize, 10);
|
|
@@ -132,7 +134,7 @@ async function runParse(files, opts) {
|
|
|
132
134
|
saveImages(absPath);
|
|
133
135
|
}
|
|
134
136
|
} catch (err) {
|
|
135
|
-
const { sanitizeError } = await import("./utils-
|
|
137
|
+
const { sanitizeError } = await import("./utils-HSF5HI5T.js");
|
|
136
138
|
process.stderr.write(`
|
|
137
139
|
[kordoc] ERROR: ${fileName} \u2014 ${sanitizeError(err)}
|
|
138
140
|
`);
|
|
@@ -216,7 +218,7 @@ program.command("convert <input>").description("\uB9C8\uD06C\uB2E4\uC6B4 \uD30C\
|
|
|
216
218
|
`));
|
|
217
219
|
}
|
|
218
220
|
} catch (err) {
|
|
219
|
-
const { sanitizeError } = await import("./utils-
|
|
221
|
+
const { sanitizeError } = await import("./utils-HSF5HI5T.js");
|
|
220
222
|
process.stderr.write(` FAIL
|
|
221
223
|
`);
|
|
222
224
|
process.stderr.write(` \u2192 ${sanitizeError(err)}
|
|
@@ -225,7 +227,7 @@ program.command("convert <input>").description("\uB9C8\uD06C\uB2E4\uC6B4 \uD30C\
|
|
|
225
227
|
}
|
|
226
228
|
});
|
|
227
229
|
program.command("watch <dir>").description("\uB514\uB809\uD1A0\uB9AC \uAC10\uC2DC \u2014 \uC0C8 \uBB38\uC11C \uC790\uB3D9 \uBCC0\uD658").option("--webhook <url>", "\uACB0\uACFC \uC804\uC1A1 \uC6F9\uD6C5 URL").option("-d, --out-dir <dir>", "\uBCC0\uD658 \uACB0\uACFC \uCD9C\uB825 \uB514\uB809\uD1A0\uB9AC").option("-p, --pages <range>", "\uD398\uC774\uC9C0/\uC139\uC158 \uBC94\uC704").option("--format <type>", "\uCD9C\uB825 \uD615\uC2DD: markdown \uB610\uB294 json", "markdown").option("--silent", "\uC9C4\uD589 \uBA54\uC2DC\uC9C0 \uC228\uAE30\uAE30").action(async (dir, opts) => {
|
|
228
|
-
const { watchDirectory } = await import("./watch-
|
|
230
|
+
const { watchDirectory } = await import("./watch-R2JHXDGF.js");
|
|
229
231
|
await watchDirectory({
|
|
230
232
|
dir,
|
|
231
233
|
outDir: opts.outDir,
|
package/dist/cli.js.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["/** kordoc CLI — 모두 파싱해버리겠다 */\n\nimport { readFileSync, writeFileSync, mkdirSync, statSync, existsSync, readdirSync } from \"fs\"\nimport { basename, resolve, extname } from \"path\"\nimport { cpus } from \"os\"\nimport { Command } from \"commander\"\nimport { parse, detectFormat, markdownToHwpx, markdownToXlsx } from \"./index.js\"\nimport type { ParseOptions, OcrMode } from \"./types.js\"\nimport { VERSION, toArrayBuffer } from \"./utils.js\"\n\nconst program = new Command()\n\n/** 공통 parse 옵션 타입 */\ninterface ParseOpts {\n output?: string\n outDir?: string\n pages?: string\n format: string\n headerFooter: boolean\n imageDir?: string\n silent?: boolean\n ocr?: string\n ocrJobs?: string\n ocrBatchSize?: string\n}\n\n/** parse 액션 공통 구현 — 루트 커맨드와 `parse` 서브커맨드가 공유 */\nasync function runParse(files: string[], opts: ParseOpts) {\n const validFormats = [\"markdown\", \"json\"]\n if (!validFormats.includes(opts.format)) {\n process.stderr.write(`[kordoc] 지원하지 않는 형식: ${opts.format} (markdown 또는 json)\\n`)\n process.exit(1)\n }\n for (let fi = 0; fi < files.length; fi++) {\n const filePath = files[fi]\n const absPath = resolve(filePath)\n const fileName = basename(absPath)\n const filePrefix = files.length > 1 ? `[${fi + 1}/${files.length}] ` : \"\"\n\n try {\n const fileSize = statSync(absPath).size\n if (fileSize > 500 * 1024 * 1024) {\n process.stderr.write(`\\n[kordoc] SKIP: ${fileName} — 파일이 너무 큽니다 (${(fileSize / 1024 / 1024).toFixed(1)}MB)\\n`)\n process.exitCode = 1\n continue\n }\n const buffer = readFileSync(absPath)\n const arrayBuffer = toArrayBuffer(buffer)\n const format = detectFormat(arrayBuffer)\n\n if (!opts.silent) {\n process.stderr.write(`[kordoc] ${filePrefix}${fileName} (${format}) ...`)\n }\n\n const parseOptions: ParseOptions = {}\n if (opts.pages) parseOptions.pages = opts.pages as string\n if (opts.headerFooter === false) parseOptions.removeHeaderFooter = false\n\n // OCR 모드: CLI 기본값 \"auto\" (라이브러리 API는 undefined 유지)\n const validOcrModes = [\"auto\", \"gemini\", \"claude\", \"codex\", \"ollama\", \"tesseract\", \"off\"]\n if (opts.ocr) {\n if (!validOcrModes.includes(opts.ocr)) {\n process.stderr.write(`[kordoc] 지원하지 않는 OCR 모드: ${opts.ocr}\\n`)\n process.stderr.write(` 사용 가능: ${validOcrModes.join(\", \")}\\n`)\n process.exit(1)\n }\n parseOptions.ocrMode = opts.ocr as OcrMode\n } else {\n parseOptions.ocrMode = \"auto\"\n }\n\n // OCR 병렬 처리 수 (--ocr-jobs): tesseract 기본값은 CPU 코어 수\n if (opts.ocrJobs) {\n const n = parseInt(opts.ocrJobs, 10)\n if (n > 0) parseOptions.ocrConcurrency = n\n } else if (parseOptions.ocrMode === \"tesseract\") {\n parseOptions.ocrConcurrency = cpus().length\n }\n\n // OCR 배치 크기 (--ocr-batch-size)\n if (opts.ocrBatchSize) {\n const n = parseInt(opts.ocrBatchSize, 10)\n if (n > 0) parseOptions.ocrBatchSize = n\n }\n\n if (!opts.silent) {\n parseOptions.onProgress = (current: number, total: number) => {\n process.stderr.write(`\\r[kordoc] ${filePrefix}${fileName} (${format}) [${current}/${total}]`)\n }\n }\n const result = await parse(arrayBuffer, parseOptions)\n\n if (!result.success) {\n process.stderr.write(` FAIL\\n`)\n process.stderr.write(` → ${result.error}\\n`)\n process.exitCode = 1\n continue\n }\n\n if (!opts.silent) process.stderr.write(` OK\\n`)\n\n // 이미지 기반 PDF OCR 결과 표시\n if (!opts.silent && result.success && result.isImageBased) {\n process.stderr.write(` → 이미지 기반 PDF — OCR 처리됨\\n`)\n }\n\n // 경고 표시\n if (!opts.silent && result.success && result.warnings?.length) {\n for (const w of result.warnings) {\n process.stderr.write(` ⚠ ${w.message}\\n`)\n }\n }\n\n let markdown = result.markdown\n // --out-dir 시 이미지 참조 경로에 images/ 접두사 추가\n if (opts.outDir && result.images?.length) {\n markdown = markdown.replace(/!\\[image\\]\\(image_/g, \"\n }\n const output = opts.format === \"json\"\n ? JSON.stringify(result, null, 2)\n : markdown\n\n // 이미지 저장: 출력 MD 파일 기준 폴더 사용 (convert와 일치)\n const saveImages = (outFilePath: string) => {\n if (!result.images?.length) return\n const stem = basename(outFilePath).replace(/\\.[^.]+$/, \"\")\n const defaultDir = resolve(outFilePath, \"..\", stem + \"_images\")\n const imgDir = opts.imageDir ? resolve(opts.imageDir) : defaultDir\n mkdirSync(imgDir, { recursive: true })\n for (const img of result.images) {\n writeFileSync(resolve(imgDir, img.filename), img.data)\n }\n if (!opts.silent) process.stderr.write(` → ${result.images.length}개 이미지 → ${imgDir}\\n`)\n }\n\n if (opts.output && files.length === 1) {\n writeFileSync(opts.output, output, \"utf-8\")\n if (!opts.silent) process.stderr.write(` → ${opts.output}\\n`)\n saveImages(resolve(opts.output))\n } else if (opts.outDir) {\n mkdirSync(opts.outDir, { recursive: true })\n const outExt = opts.format === \"json\" ? \".json\" : \".md\"\n const outPath = resolve(opts.outDir, fileName.replace(/\\.[^.]+$/, outExt))\n writeFileSync(outPath, output, \"utf-8\")\n if (!opts.silent) process.stderr.write(` → ${outPath}\\n`)\n saveImages(outPath)\n } else {\n process.stdout.write(output + \"\\n\")\n saveImages(absPath) // stdout 출력 시 입력 파일 기준\n }\n } catch (err) {\n const { sanitizeError } = await import(\"./utils.js\")\n process.stderr.write(`\\n[kordoc] ERROR: ${fileName} — ${sanitizeError(err)}\\n`)\n process.exitCode = 1\n }\n }\n}\n\n/** 공통 parse 옵션 등록 헬퍼 */\nfunction addParseOptions(cmd: Command): Command {\n return cmd\n .option(\"-o, --output <path>\", \"출력 파일 경로 (단일 파일 시)\")\n .option(\"-d, --out-dir <dir>\", \"출력 디렉토리 (다중 파일 시)\")\n .option(\"-p, --pages <range>\", \"페이지/섹션 범위 (예: 1-3, 1,3,5)\")\n .option(\"--format <type>\", \"출력 형식: markdown (기본) 또는 json\", \"markdown\")\n .option(\"--no-header-footer\", \"PDF 머리글/바닥글 자동 제거\")\n .option(\"--image-dir <dir>\", \"이미지 저장 폴더 (기본: 입력 파일명_images 폴더)\")\n .option(\"--silent\", \"진행 메시지 숨기기\")\n .option(\"--ocr <mode>\", \"OCR 모드: auto(기본), gemini, claude, codex, ollama, tesseract, off\")\n .option(\"--ocr-jobs <n>\", \"OCR 병렬 처리 수 (기본: CPU 코어 수, tesseract 전용)\")\n .option(\"--ocr-batch-size <n>\", \"OCR 배치 크기 — CLI당 페이지 수 (기본: gemini/claude 50, codex 100)\")\n}\n\nprogram\n .enablePositionalOptions()\n .name(\"kordoc\")\n .description(\"모두 파싱해버리겠다 — HWP, HWPX, PDF, XLSX, DOCX → Markdown\")\n .version(VERSION)\n\n// `kordoc parse <files>` 서브커맨드 (권장)\naddParseOptions(\n program\n .command(\"parse\")\n .description(\"파일을 마크다운으로 파싱 (HWP, HWPX, PDF, XLSX, DOCX)\")\n .argument(\"<files...>\", \"변환할 파일 경로\")\n).action(runParse)\n\n// `kordoc <files>` 루트 커맨드 (하위 호환)\naddParseOptions(\n program\n .argument(\"<files...>\", \"변환할 파일 경로 (HWP, HWPX, PDF, XLSX, DOCX)\")\n).action(runParse)\n\nprogram\n .command(\"convert <input>\")\n .description(\"마크다운 파일을 HWPX 또는 XLSX로 변환\")\n .option(\"-f, --format <type>\", \"출력 포맷: hwpx | xlsx\", \"hwpx\")\n .option(\"-o, --output <path>\", \"출력 파일 경로 (기본: 입력명 + 포맷 확장자)\")\n .option(\"--image-dir <dir>\", \"이미지 폴더 경로 (기본: 입력 MD 파일명_images 폴더)\")\n .option(\"--images\", \"이미지 포함 (기본: 생략, 레이아웃 문제 방지)\")\n .option(\"--template <path>\", \"HWPX 템플릿 파일 경로 (hwpx 전용)\")\n .option(\"--silent\", \"진행 메시지 숨기기\")\n .action(async (input: string, opts) => {\n const validFormats = [\"hwpx\", \"xlsx\"]\n if (!validFormats.includes(opts.format)) {\n process.stderr.write(`[kordoc] 지원하지 않는 포맷: ${opts.format} (hwpx 또는 xlsx)\\n`)\n process.exit(1)\n }\n\n const absInput = resolve(input)\n if (!existsSync(absInput)) {\n process.stderr.write(`[kordoc] 파일을 찾을 수 없습니다: ${input}\\n`)\n process.exit(1)\n }\n\n const stem = basename(absInput).replace(/\\.[^.]+$/, \"\")\n const outPath = opts.output\n ? resolve(opts.output)\n : resolve(absInput, \"..\", `${stem}.${opts.format}`)\n\n if (!opts.silent) process.stderr.write(`[kordoc] ${basename(absInput)} → ${basename(outPath)} ...`)\n\n try {\n const markdown = readFileSync(absInput, \"utf-8\")\n\n // 이미지 폴더에서 이미지 로드 (--images 플래그 필요)\n const imgDir = opts.imageDir ? resolve(opts.imageDir) : resolve(absInput, \"..\", stem + \"_images\")\n const images: import(\"./types.js\").ExtractedImage[] = []\n if (opts.images && existsSync(imgDir)) {\n const mimeMap: Record<string, string> = {\n png: \"image/png\", jpg: \"image/jpeg\", jpeg: \"image/jpeg\",\n gif: \"image/gif\", bmp: \"image/bmp\",\n }\n for (const entry of readdirSync(imgDir, { withFileTypes: true })) {\n if (!entry.isFile()) continue\n const fname = entry.name\n const ext = extname(fname).slice(1).toLowerCase()\n if (!mimeMap[ext]) continue\n const data = readFileSync(resolve(imgDir, fname))\n images.push({ filename: fname, data: new Uint8Array(data), mimeType: mimeMap[ext] })\n }\n if (!opts.silent) process.stderr.write(` → 이미지 ${images.length}개 로드\\n`)\n }\n\n const warnings: string[] = []\n\n let buf: ArrayBuffer\n if (opts.format === \"xlsx\") {\n if (opts.template && !opts.silent) {\n process.stderr.write(`\\n[kordoc] 경고: --template은 hwpx 전용입니다. 무시됩니다.\\n`)\n }\n buf = await markdownToXlsx(markdown, { warnings, images: images.length ? images : undefined })\n } else {\n let templateArrayBuffer: ArrayBuffer | undefined\n if (opts.template) {\n const tmplBuf = readFileSync(resolve(opts.template))\n templateArrayBuffer = tmplBuf.buffer.slice(tmplBuf.byteOffset, tmplBuf.byteOffset + tmplBuf.byteLength)\n }\n buf = await markdownToHwpx(markdown, { warnings, images: images.length ? images : undefined, templateArrayBuffer })\n }\n\n writeFileSync(outPath, Buffer.from(buf))\n\n if (!opts.silent) {\n process.stderr.write(` OK\\n`)\n process.stderr.write(` → ${outPath}\\n`)\n if (warnings.length) warnings.forEach(w => process.stderr.write(` ${w}\\n`))\n }\n } catch (err) {\n const { sanitizeError } = await import(\"./utils.js\")\n process.stderr.write(` FAIL\\n`)\n process.stderr.write(` → ${sanitizeError(err)}\\n`)\n process.exit(1)\n }\n })\n\nprogram\n .command(\"watch <dir>\")\n .description(\"디렉토리 감시 — 새 문서 자동 변환\")\n .option(\"--webhook <url>\", \"결과 전송 웹훅 URL\")\n .option(\"-d, --out-dir <dir>\", \"변환 결과 출력 디렉토리\")\n .option(\"-p, --pages <range>\", \"페이지/섹션 범위\")\n .option(\"--format <type>\", \"출력 형식: markdown 또는 json\", \"markdown\")\n .option(\"--silent\", \"진행 메시지 숨기기\")\n .action(async (dir: string, opts) => {\n const { watchDirectory } = await import(\"./watch.js\")\n await watchDirectory({\n dir,\n outDir: opts.outDir,\n webhook: opts.webhook,\n format: opts.format,\n pages: opts.pages,\n silent: opts.silent,\n })\n })\n\nprogram.parse()\n"],"mappings":";;;;;;;;;;;;;;;AAEA,SAAS,cAAc,eAAe,WAAW,UAAU,YAAY,mBAAmB;AAC1F,SAAS,UAAU,SAAS,eAAe;AAC3C,SAAS,YAAY;AACrB,SAAS,eAAe;AAKxB,IAAM,UAAU,IAAI,QAAQ;AAiB5B,eAAe,SAAS,OAAiB,MAAiB;AACxD,QAAM,eAAe,CAAC,YAAY,MAAM;AACxC,MAAI,CAAC,aAAa,SAAS,KAAK,MAAM,GAAG;AACvC,YAAQ,OAAO,MAAM,gEAAwB,KAAK,MAAM;AAAA,CAAuB;AAC/E,YAAQ,KAAK,CAAC;AAAA,EAChB;AACA,WAAS,KAAK,GAAG,KAAK,MAAM,QAAQ,MAAM;AACxC,UAAM,WAAW,MAAM,EAAE;AACzB,UAAM,UAAU,QAAQ,QAAQ;AAChC,UAAM,WAAW,SAAS,OAAO;AACjC,UAAM,aAAa,MAAM,SAAS,IAAI,IAAI,KAAK,CAAC,IAAI,MAAM,MAAM,OAAO;AAEvE,QAAI;AACF,YAAM,WAAW,SAAS,OAAO,EAAE;AACnC,UAAI,WAAW,MAAM,OAAO,MAAM;AAChC,gBAAQ,OAAO,MAAM;AAAA,iBAAoB,QAAQ,gEAAmB,WAAW,OAAO,MAAM,QAAQ,CAAC,CAAC;AAAA,CAAO;AAC7G,gBAAQ,WAAW;AACnB;AAAA,MACF;AACA,YAAM,SAAS,aAAa,OAAO;AACnC,YAAM,cAAc,cAAc,MAAM;AACxC,YAAM,SAAS,aAAa,WAAW;AAEvC,UAAI,CAAC,KAAK,QAAQ;AAChB,gBAAQ,OAAO,MAAM,YAAY,UAAU,GAAG,QAAQ,KAAK,MAAM,OAAO;AAAA,MAC1E;AAEA,YAAM,eAA6B,CAAC;AACpC,UAAI,KAAK,MAAO,cAAa,QAAQ,KAAK;AAC1C,UAAI,KAAK,iBAAiB,MAAO,cAAa,qBAAqB;AAGnE,YAAM,gBAAgB,CAAC,QAAQ,UAAU,UAAU,SAAS,UAAU,aAAa,KAAK;AACxF,UAAI,KAAK,KAAK;AACZ,YAAI,CAAC,cAAc,SAAS,KAAK,GAAG,GAAG;AACrC,kBAAQ,OAAO,MAAM,oEAA4B,KAAK,GAAG;AAAA,CAAI;AAC7D,kBAAQ,OAAO,MAAM,gCAAY,cAAc,KAAK,IAAI,CAAC;AAAA,CAAI;AAC7D,kBAAQ,KAAK,CAAC;AAAA,QAChB;AACA,qBAAa,UAAU,KAAK;AAAA,MAC9B,OAAO;AACL,qBAAa,UAAU;AAAA,MACzB;AAGA,UAAI,KAAK,SAAS;AAChB,cAAM,IAAI,SAAS,KAAK,SAAS,EAAE;AACnC,YAAI,IAAI,EAAG,cAAa,iBAAiB;AAAA,MAC3C,WAAW,aAAa,YAAY,aAAa;AAC/C,qBAAa,iBAAiB,KAAK,EAAE;AAAA,MACvC;AAGA,UAAI,KAAK,cAAc;AACrB,cAAM,IAAI,SAAS,KAAK,cAAc,EAAE;AACxC,YAAI,IAAI,EAAG,cAAa,eAAe;AAAA,MACzC;AAEA,UAAI,CAAC,KAAK,QAAQ;AAChB,qBAAa,aAAa,CAAC,SAAiB,UAAkB;AAC5D,kBAAQ,OAAO,MAAM,cAAc,UAAU,GAAG,QAAQ,KAAK,MAAM,MAAM,OAAO,IAAI,KAAK,GAAG;AAAA,QAC9F;AAAA,MACF;AACA,YAAM,SAAS,MAAM,MAAM,aAAa,YAAY;AAEpD,UAAI,CAAC,OAAO,SAAS;AACnB,gBAAQ,OAAO,MAAM;AAAA,CAAS;AAC9B,gBAAQ,OAAO,MAAM,YAAO,OAAO,KAAK;AAAA,CAAI;AAC5C,gBAAQ,WAAW;AACnB;AAAA,MACF;AAEA,UAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM;AAAA,CAAO;AAG9C,UAAI,CAAC,KAAK,UAAU,OAAO,WAAW,OAAO,cAAc;AACzD,gBAAQ,OAAO,MAAM;AAAA,CAA4B;AAAA,MACnD;AAGA,UAAI,CAAC,KAAK,UAAU,OAAO,WAAW,OAAO,UAAU,QAAQ;AAC7D,mBAAW,KAAK,OAAO,UAAU;AAC/B,kBAAQ,OAAO,MAAM,YAAO,EAAE,OAAO;AAAA,CAAI;AAAA,QAC3C;AAAA,MACF;AAEA,UAAI,WAAW,OAAO;AAEtB,UAAI,KAAK,UAAU,OAAO,QAAQ,QAAQ;AACxC,mBAAW,SAAS,QAAQ,uBAAuB,wBAAwB;AAAA,MAC7E;AACA,YAAM,SAAS,KAAK,WAAW,SAC3B,KAAK,UAAU,QAAQ,MAAM,CAAC,IAC9B;AAGJ,YAAM,aAAa,CAAC,gBAAwB;AAC1C,YAAI,CAAC,OAAO,QAAQ,OAAQ;AAC5B,cAAM,OAAO,SAAS,WAAW,EAAE,QAAQ,YAAY,EAAE;AACzD,cAAM,aAAa,QAAQ,aAAa,MAAM,OAAO,SAAS;AAC9D,cAAM,SAAS,KAAK,WAAW,QAAQ,KAAK,QAAQ,IAAI;AACxD,kBAAU,QAAQ,EAAE,WAAW,KAAK,CAAC;AACrC,mBAAW,OAAO,OAAO,QAAQ;AAC/B,wBAAc,QAAQ,QAAQ,IAAI,QAAQ,GAAG,IAAI,IAAI;AAAA,QACvD;AACA,YAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM,YAAO,OAAO,OAAO,MAAM,oCAAW,MAAM;AAAA,CAAI;AAAA,MACzF;AAEA,UAAI,KAAK,UAAU,MAAM,WAAW,GAAG;AACrC,sBAAc,KAAK,QAAQ,QAAQ,OAAO;AAC1C,YAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM,YAAO,KAAK,MAAM;AAAA,CAAI;AAC7D,mBAAW,QAAQ,KAAK,MAAM,CAAC;AAAA,MACjC,WAAW,KAAK,QAAQ;AACtB,kBAAU,KAAK,QAAQ,EAAE,WAAW,KAAK,CAAC;AAC1C,cAAM,SAAS,KAAK,WAAW,SAAS,UAAU;AAClD,cAAM,UAAU,QAAQ,KAAK,QAAQ,SAAS,QAAQ,YAAY,MAAM,CAAC;AACzE,sBAAc,SAAS,QAAQ,OAAO;AACtC,YAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM,YAAO,OAAO;AAAA,CAAI;AACzD,mBAAW,OAAO;AAAA,MACpB,OAAO;AACL,gBAAQ,OAAO,MAAM,SAAS,IAAI;AAClC,mBAAW,OAAO;AAAA,MACpB;AAAA,IACF,SAAS,KAAK;AACZ,YAAM,EAAE,cAAc,IAAI,MAAM,OAAO,qBAAY;AACnD,cAAQ,OAAO,MAAM;AAAA,kBAAqB,QAAQ,WAAM,cAAc,GAAG,CAAC;AAAA,CAAI;AAC9E,cAAQ,WAAW;AAAA,IACrB;AAAA,EACF;AACF;AAGA,SAAS,gBAAgB,KAAuB;AAC9C,SAAO,IACJ,OAAO,uBAAuB,2EAAoB,EAClD,OAAO,uBAAuB,0EAAmB,EACjD,OAAO,uBAAuB,mEAA2B,EACzD,OAAO,mBAAmB,wEAAgC,UAAU,EACpE,OAAO,sBAAsB,qEAAmB,EAChD,OAAO,qBAAqB,kHAAkC,EAC9D,OAAO,YAAY,oDAAY,EAC/B,OAAO,gBAAgB,qFAAiE,EACxF,OAAO,kBAAkB,sGAA0C,EACnE,OAAO,wBAAwB,sHAA0D;AAC9F;AAEA,QACG,wBAAwB,EACxB,KAAK,QAAQ,EACb,YAAY,2GAAoD,EAChE,QAAQ,OAAO;AAGlB;AAAA,EACE,QACG,QAAQ,OAAO,EACf,YAAY,mGAA4C,EACxD,SAAS,cAAc,8CAAW;AACvC,EAAE,OAAO,QAAQ;AAGjB;AAAA,EACE,QACG,SAAS,cAAc,2EAAwC;AACpE,EAAE,OAAO,QAAQ;AAEjB,QACG,QAAQ,iBAAiB,EACzB,YAAY,uFAA2B,EACvC,OAAO,uBAAuB,0CAAsB,MAAM,EAC1D,OAAO,uBAAuB,6GAA6B,EAC3D,OAAO,qBAAqB,qHAAqC,EACjE,OAAO,YAAY,kHAA6B,EAChD,OAAO,qBAAqB,uEAA0B,EACtD,OAAO,YAAY,oDAAY,EAC/B,OAAO,OAAO,OAAe,SAAS;AACrC,QAAM,eAAe,CAAC,QAAQ,MAAM;AACpC,MAAI,CAAC,aAAa,SAAS,KAAK,MAAM,GAAG;AACvC,YAAQ,OAAO,MAAM,gEAAwB,KAAK,MAAM;AAAA,CAAmB;AAC3E,YAAQ,KAAK,CAAC;AAAA,EAChB;AAEA,QAAM,WAAW,QAAQ,KAAK;AAC9B,MAAI,CAAC,WAAW,QAAQ,GAAG;AACzB,YAAQ,OAAO,MAAM,6EAA2B,KAAK;AAAA,CAAI;AACzD,YAAQ,KAAK,CAAC;AAAA,EAChB;AAEA,QAAM,OAAO,SAAS,QAAQ,EAAE,QAAQ,YAAY,EAAE;AACtD,QAAM,UAAU,KAAK,SACjB,QAAQ,KAAK,MAAM,IACnB,QAAQ,UAAU,MAAM,GAAG,IAAI,IAAI,KAAK,MAAM,EAAE;AAEpD,MAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM,YAAY,SAAS,QAAQ,CAAC,WAAM,SAAS,OAAO,CAAC,MAAM;AAElG,MAAI;AACF,UAAM,WAAW,aAAa,UAAU,OAAO;AAG/C,UAAM,SAAS,KAAK,WAAW,QAAQ,KAAK,QAAQ,IAAI,QAAQ,UAAU,MAAM,OAAO,SAAS;AAChG,UAAM,SAAgD,CAAC;AACvD,QAAI,KAAK,UAAU,WAAW,MAAM,GAAG;AACrC,YAAM,UAAkC;AAAA,QACtC,KAAK;AAAA,QAAa,KAAK;AAAA,QAAc,MAAM;AAAA,QAC3C,KAAK;AAAA,QAAa,KAAK;AAAA,MACzB;AACA,iBAAW,SAAS,YAAY,QAAQ,EAAE,eAAe,KAAK,CAAC,GAAG;AAChE,YAAI,CAAC,MAAM,OAAO,EAAG;AACrB,cAAM,QAAQ,MAAM;AACpB,cAAM,MAAM,QAAQ,KAAK,EAAE,MAAM,CAAC,EAAE,YAAY;AAChD,YAAI,CAAC,QAAQ,GAAG,EAAG;AACnB,cAAM,OAAO,aAAa,QAAQ,QAAQ,KAAK,CAAC;AAChD,eAAO,KAAK,EAAE,UAAU,OAAO,MAAM,IAAI,WAAW,IAAI,GAAG,UAAU,QAAQ,GAAG,EAAE,CAAC;AAAA,MACrF;AACA,UAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM,+BAAW,OAAO,MAAM;AAAA,CAAQ;AAAA,IACzE;AAEA,UAAM,WAAqB,CAAC;AAE5B,QAAI;AACJ,QAAI,KAAK,WAAW,QAAQ;AAC1B,UAAI,KAAK,YAAY,CAAC,KAAK,QAAQ;AACjC,gBAAQ,OAAO,MAAM;AAAA;AAAA,CAAiD;AAAA,MACxE;AACA,YAAM,MAAM,eAAe,UAAU,EAAE,UAAU,QAAQ,OAAO,SAAS,SAAS,OAAU,CAAC;AAAA,IAC/F,OAAO;AACL,UAAI;AACJ,UAAI,KAAK,UAAU;AACjB,cAAM,UAAU,aAAa,QAAQ,KAAK,QAAQ,CAAC;AACnD,8BAAsB,QAAQ,OAAO,MAAM,QAAQ,YAAY,QAAQ,aAAa,QAAQ,UAAU;AAAA,MACxG;AACA,YAAM,MAAM,eAAe,UAAU,EAAE,UAAU,QAAQ,OAAO,SAAS,SAAS,QAAW,oBAAoB,CAAC;AAAA,IACpH;AAEA,kBAAc,SAAS,OAAO,KAAK,GAAG,CAAC;AAEvC,QAAI,CAAC,KAAK,QAAQ;AAChB,cAAQ,OAAO,MAAM;AAAA,CAAO;AAC5B,cAAQ,OAAO,MAAM,YAAO,OAAO;AAAA,CAAI;AACvC,UAAI,SAAS,OAAQ,UAAS,QAAQ,OAAK,QAAQ,OAAO,MAAM,KAAK,CAAC;AAAA,CAAI,CAAC;AAAA,IAC7E;AAAA,EACF,SAAS,KAAK;AACZ,UAAM,EAAE,cAAc,IAAI,MAAM,OAAO,qBAAY;AACnD,YAAQ,OAAO,MAAM;AAAA,CAAS;AAC9B,YAAQ,OAAO,MAAM,YAAO,cAAc,GAAG,CAAC;AAAA,CAAI;AAClD,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF,CAAC;AAEH,QACG,QAAQ,aAAa,EACrB,YAAY,4FAAsB,EAClC,OAAO,mBAAmB,4CAAc,EACxC,OAAO,uBAAuB,iEAAe,EAC7C,OAAO,uBAAuB,8CAAW,EACzC,OAAO,mBAAmB,yDAA2B,UAAU,EAC/D,OAAO,YAAY,oDAAY,EAC/B,OAAO,OAAO,KAAa,SAAS;AACnC,QAAM,EAAE,eAAe,IAAI,MAAM,OAAO,qBAAY;AACpD,QAAM,eAAe;AAAA,IACnB;AAAA,IACA,QAAQ,KAAK;AAAA,IACb,SAAS,KAAK;AAAA,IACd,QAAQ,KAAK;AAAA,IACb,OAAO,KAAK;AAAA,IACZ,QAAQ,KAAK;AAAA,EACf,CAAC;AACH,CAAC;AAEH,QAAQ,MAAM;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/cli.ts"],"sourcesContent":["/** kordoc CLI — 모두 파싱해버리겠다 */\n\nimport { readFileSync, writeFileSync, mkdirSync, statSync, existsSync, readdirSync } from \"fs\"\nimport { basename, resolve, extname } from \"path\"\nimport { cpus } from \"os\"\nimport { Command } from \"commander\"\nimport { parse, detectFormat, markdownToHwpx, markdownToXlsx } from \"./index.js\"\nimport type { ParseOptions, OcrMode } from \"./types.js\"\nimport { VERSION, toArrayBuffer } from \"./utils.js\"\n\nconst program = new Command()\n\n/** 공통 parse 옵션 타입 */\ninterface ParseOpts {\n output?: string\n outDir?: string\n pages?: string\n format: string\n headerFooter: boolean\n imageDir?: string\n silent?: boolean\n ocr?: string\n ocrJobs?: string\n ocrBatchSize?: string\n}\n\n/** parse 액션 공통 구현 — 루트 커맨드와 `parse` 서브커맨드가 공유 */\nasync function runParse(files: string[], opts: ParseOpts) {\n const validFormats = [\"markdown\", \"json\"]\n if (!validFormats.includes(opts.format)) {\n process.stderr.write(`[kordoc] 지원하지 않는 형식: ${opts.format} (markdown 또는 json)\\n`)\n process.exit(1)\n }\n for (let fi = 0; fi < files.length; fi++) {\n const filePath = files[fi]\n const absPath = resolve(filePath)\n const fileName = basename(absPath)\n const filePrefix = files.length > 1 ? `[${fi + 1}/${files.length}] ` : \"\"\n\n try {\n const fileSize = statSync(absPath).size\n if (fileSize > 500 * 1024 * 1024) {\n process.stderr.write(`\\n[kordoc] SKIP: ${fileName} — 파일이 너무 큽니다 (${(fileSize / 1024 / 1024).toFixed(1)}MB)\\n`)\n process.exitCode = 1\n continue\n }\n const buffer = readFileSync(absPath)\n const arrayBuffer = toArrayBuffer(buffer)\n const format = detectFormat(arrayBuffer)\n\n if (!opts.silent) {\n process.stderr.write(`[kordoc] ${filePrefix}${fileName} (${format}) ...`)\n }\n\n const parseOptions: ParseOptions = {}\n if (opts.pages) parseOptions.pages = opts.pages as string\n if (opts.headerFooter === false) parseOptions.removeHeaderFooter = false\n\n // OCR 모드: CLI 기본값 \"auto\" (라이브러리 API는 undefined 유지)\n const validOcrModes = [\"auto\", \"gemini\", \"claude\", \"codex\", \"ollama\", \"tesseract\", \"off\"]\n if (opts.ocr) {\n if (!validOcrModes.includes(opts.ocr)) {\n process.stderr.write(`[kordoc] 지원하지 않는 OCR 모드: ${opts.ocr}\\n`)\n process.stderr.write(` 사용 가능: ${validOcrModes.join(\", \")}\\n`)\n process.exit(1)\n }\n parseOptions.ocrMode = opts.ocr as OcrMode\n } else {\n parseOptions.ocrMode = \"auto\"\n }\n\n // OCR 병렬 처리 수 (--ocr-jobs)\n // - tesseract: CPU 코어 수 (워커 스레드 병렬)\n // - gemini/claude/codex/auto: 4 (배치 병렬 실행)\n if (opts.ocrJobs) {\n const n = parseInt(opts.ocrJobs, 10)\n if (n > 0) parseOptions.ocrConcurrency = n\n } else if (parseOptions.ocrMode === \"tesseract\") {\n parseOptions.ocrConcurrency = cpus().length\n } else {\n parseOptions.ocrConcurrency = 4\n }\n\n // OCR 배치 크기 (--ocr-batch-size)\n if (opts.ocrBatchSize) {\n const n = parseInt(opts.ocrBatchSize, 10)\n if (n > 0) parseOptions.ocrBatchSize = n\n }\n\n if (!opts.silent) {\n parseOptions.onProgress = (current: number, total: number) => {\n process.stderr.write(`\\r[kordoc] ${filePrefix}${fileName} (${format}) [${current}/${total}]`)\n }\n }\n const result = await parse(arrayBuffer, parseOptions)\n\n if (!result.success) {\n process.stderr.write(` FAIL\\n`)\n process.stderr.write(` → ${result.error}\\n`)\n process.exitCode = 1\n continue\n }\n\n if (!opts.silent) process.stderr.write(` OK\\n`)\n\n // 이미지 기반 PDF OCR 결과 표시\n if (!opts.silent && result.success && result.isImageBased) {\n process.stderr.write(` → 이미지 기반 PDF — OCR 처리됨\\n`)\n }\n\n // 경고 표시\n if (!opts.silent && result.success && result.warnings?.length) {\n for (const w of result.warnings) {\n process.stderr.write(` ⚠ ${w.message}\\n`)\n }\n }\n\n let markdown = result.markdown\n // --out-dir 시 이미지 참조 경로에 images/ 접두사 추가\n if (opts.outDir && result.images?.length) {\n markdown = markdown.replace(/!\\[image\\]\\(image_/g, \"\n }\n const output = opts.format === \"json\"\n ? JSON.stringify(result, null, 2)\n : markdown\n\n // 이미지 저장: 출력 MD 파일 기준 폴더 사용 (convert와 일치)\n const saveImages = (outFilePath: string) => {\n if (!result.images?.length) return\n const stem = basename(outFilePath).replace(/\\.[^.]+$/, \"\")\n const defaultDir = resolve(outFilePath, \"..\", stem + \"_images\")\n const imgDir = opts.imageDir ? resolve(opts.imageDir) : defaultDir\n mkdirSync(imgDir, { recursive: true })\n for (const img of result.images) {\n writeFileSync(resolve(imgDir, img.filename), img.data)\n }\n if (!opts.silent) process.stderr.write(` → ${result.images.length}개 이미지 → ${imgDir}\\n`)\n }\n\n if (opts.output && files.length === 1) {\n writeFileSync(opts.output, output, \"utf-8\")\n if (!opts.silent) process.stderr.write(` → ${opts.output}\\n`)\n saveImages(resolve(opts.output))\n } else if (opts.outDir) {\n mkdirSync(opts.outDir, { recursive: true })\n const outExt = opts.format === \"json\" ? \".json\" : \".md\"\n const outPath = resolve(opts.outDir, fileName.replace(/\\.[^.]+$/, outExt))\n writeFileSync(outPath, output, \"utf-8\")\n if (!opts.silent) process.stderr.write(` → ${outPath}\\n`)\n saveImages(outPath)\n } else {\n process.stdout.write(output + \"\\n\")\n saveImages(absPath) // stdout 출력 시 입력 파일 기준\n }\n } catch (err) {\n const { sanitizeError } = await import(\"./utils.js\")\n process.stderr.write(`\\n[kordoc] ERROR: ${fileName} — ${sanitizeError(err)}\\n`)\n process.exitCode = 1\n }\n }\n}\n\n/** 공통 parse 옵션 등록 헬퍼 */\nfunction addParseOptions(cmd: Command): Command {\n return cmd\n .option(\"-o, --output <path>\", \"출력 파일 경로 (단일 파일 시)\")\n .option(\"-d, --out-dir <dir>\", \"출력 디렉토리 (다중 파일 시)\")\n .option(\"-p, --pages <range>\", \"페이지/섹션 범위 (예: 1-3, 1,3,5)\")\n .option(\"--format <type>\", \"출력 형식: markdown (기본) 또는 json\", \"markdown\")\n .option(\"--no-header-footer\", \"PDF 머리글/바닥글 자동 제거\")\n .option(\"--image-dir <dir>\", \"이미지 저장 폴더 (기본: 입력 파일명_images 폴더)\")\n .option(\"--silent\", \"진행 메시지 숨기기\")\n .option(\"--ocr <mode>\", \"OCR 모드: auto(기본), gemini, claude, codex, ollama, tesseract, off\")\n .option(\"--ocr-jobs <n>\", \"OCR 병렬 처리 수 (기본: CPU 코어 수, tesseract 전용)\")\n .option(\"--ocr-batch-size <n>\", \"OCR 배치 크기 — CLI당 페이지 수 (기본: gemini/claude 50, codex 100)\")\n}\n\nprogram\n .enablePositionalOptions()\n .name(\"kordoc\")\n .description(\"모두 파싱해버리겠다 — HWP, HWPX, PDF, XLSX, DOCX → Markdown\")\n .version(VERSION)\n\n// `kordoc parse <files>` 서브커맨드 (권장)\naddParseOptions(\n program\n .command(\"parse\")\n .description(\"파일을 마크다운으로 파싱 (HWP, HWPX, PDF, XLSX, DOCX)\")\n .argument(\"<files...>\", \"변환할 파일 경로\")\n).action(runParse)\n\n// `kordoc <files>` 루트 커맨드 (하위 호환)\naddParseOptions(\n program\n .argument(\"<files...>\", \"변환할 파일 경로 (HWP, HWPX, PDF, XLSX, DOCX)\")\n).action(runParse)\n\nprogram\n .command(\"convert <input>\")\n .description(\"마크다운 파일을 HWPX 또는 XLSX로 변환\")\n .option(\"-f, --format <type>\", \"출력 포맷: hwpx | xlsx\", \"hwpx\")\n .option(\"-o, --output <path>\", \"출력 파일 경로 (기본: 입력명 + 포맷 확장자)\")\n .option(\"--image-dir <dir>\", \"이미지 폴더 경로 (기본: 입력 MD 파일명_images 폴더)\")\n .option(\"--images\", \"이미지 포함 (기본: 생략, 레이아웃 문제 방지)\")\n .option(\"--template <path>\", \"HWPX 템플릿 파일 경로 (hwpx 전용)\")\n .option(\"--silent\", \"진행 메시지 숨기기\")\n .action(async (input: string, opts) => {\n const validFormats = [\"hwpx\", \"xlsx\"]\n if (!validFormats.includes(opts.format)) {\n process.stderr.write(`[kordoc] 지원하지 않는 포맷: ${opts.format} (hwpx 또는 xlsx)\\n`)\n process.exit(1)\n }\n\n const absInput = resolve(input)\n if (!existsSync(absInput)) {\n process.stderr.write(`[kordoc] 파일을 찾을 수 없습니다: ${input}\\n`)\n process.exit(1)\n }\n\n const stem = basename(absInput).replace(/\\.[^.]+$/, \"\")\n const outPath = opts.output\n ? resolve(opts.output)\n : resolve(absInput, \"..\", `${stem}.${opts.format}`)\n\n if (!opts.silent) process.stderr.write(`[kordoc] ${basename(absInput)} → ${basename(outPath)} ...`)\n\n try {\n const markdown = readFileSync(absInput, \"utf-8\")\n\n // 이미지 폴더에서 이미지 로드 (--images 플래그 필요)\n const imgDir = opts.imageDir ? resolve(opts.imageDir) : resolve(absInput, \"..\", stem + \"_images\")\n const images: import(\"./types.js\").ExtractedImage[] = []\n if (opts.images && existsSync(imgDir)) {\n const mimeMap: Record<string, string> = {\n png: \"image/png\", jpg: \"image/jpeg\", jpeg: \"image/jpeg\",\n gif: \"image/gif\", bmp: \"image/bmp\",\n }\n for (const entry of readdirSync(imgDir, { withFileTypes: true })) {\n if (!entry.isFile()) continue\n const fname = entry.name\n const ext = extname(fname).slice(1).toLowerCase()\n if (!mimeMap[ext]) continue\n const data = readFileSync(resolve(imgDir, fname))\n images.push({ filename: fname, data: new Uint8Array(data), mimeType: mimeMap[ext] })\n }\n if (!opts.silent) process.stderr.write(` → 이미지 ${images.length}개 로드\\n`)\n }\n\n const warnings: string[] = []\n\n let buf: ArrayBuffer\n if (opts.format === \"xlsx\") {\n if (opts.template && !opts.silent) {\n process.stderr.write(`\\n[kordoc] 경고: --template은 hwpx 전용입니다. 무시됩니다.\\n`)\n }\n buf = await markdownToXlsx(markdown, { warnings, images: images.length ? images : undefined })\n } else {\n let templateArrayBuffer: ArrayBuffer | undefined\n if (opts.template) {\n const tmplBuf = readFileSync(resolve(opts.template))\n templateArrayBuffer = tmplBuf.buffer.slice(tmplBuf.byteOffset, tmplBuf.byteOffset + tmplBuf.byteLength)\n }\n buf = await markdownToHwpx(markdown, { warnings, images: images.length ? images : undefined, templateArrayBuffer })\n }\n\n writeFileSync(outPath, Buffer.from(buf))\n\n if (!opts.silent) {\n process.stderr.write(` OK\\n`)\n process.stderr.write(` → ${outPath}\\n`)\n if (warnings.length) warnings.forEach(w => process.stderr.write(` ${w}\\n`))\n }\n } catch (err) {\n const { sanitizeError } = await import(\"./utils.js\")\n process.stderr.write(` FAIL\\n`)\n process.stderr.write(` → ${sanitizeError(err)}\\n`)\n process.exit(1)\n }\n })\n\nprogram\n .command(\"watch <dir>\")\n .description(\"디렉토리 감시 — 새 문서 자동 변환\")\n .option(\"--webhook <url>\", \"결과 전송 웹훅 URL\")\n .option(\"-d, --out-dir <dir>\", \"변환 결과 출력 디렉토리\")\n .option(\"-p, --pages <range>\", \"페이지/섹션 범위\")\n .option(\"--format <type>\", \"출력 형식: markdown 또는 json\", \"markdown\")\n .option(\"--silent\", \"진행 메시지 숨기기\")\n .action(async (dir: string, opts) => {\n const { watchDirectory } = await import(\"./watch.js\")\n await watchDirectory({\n dir,\n outDir: opts.outDir,\n webhook: opts.webhook,\n format: opts.format,\n pages: opts.pages,\n silent: opts.silent,\n })\n })\n\nprogram.parse()\n"],"mappings":";;;;;;;;;;;;;;;AAEA,SAAS,cAAc,eAAe,WAAW,UAAU,YAAY,mBAAmB;AAC1F,SAAS,UAAU,SAAS,eAAe;AAC3C,SAAS,YAAY;AACrB,SAAS,eAAe;AAKxB,IAAM,UAAU,IAAI,QAAQ;AAiB5B,eAAe,SAAS,OAAiB,MAAiB;AACxD,QAAM,eAAe,CAAC,YAAY,MAAM;AACxC,MAAI,CAAC,aAAa,SAAS,KAAK,MAAM,GAAG;AACvC,YAAQ,OAAO,MAAM,gEAAwB,KAAK,MAAM;AAAA,CAAuB;AAC/E,YAAQ,KAAK,CAAC;AAAA,EAChB;AACA,WAAS,KAAK,GAAG,KAAK,MAAM,QAAQ,MAAM;AACxC,UAAM,WAAW,MAAM,EAAE;AACzB,UAAM,UAAU,QAAQ,QAAQ;AAChC,UAAM,WAAW,SAAS,OAAO;AACjC,UAAM,aAAa,MAAM,SAAS,IAAI,IAAI,KAAK,CAAC,IAAI,MAAM,MAAM,OAAO;AAEvE,QAAI;AACF,YAAM,WAAW,SAAS,OAAO,EAAE;AACnC,UAAI,WAAW,MAAM,OAAO,MAAM;AAChC,gBAAQ,OAAO,MAAM;AAAA,iBAAoB,QAAQ,gEAAmB,WAAW,OAAO,MAAM,QAAQ,CAAC,CAAC;AAAA,CAAO;AAC7G,gBAAQ,WAAW;AACnB;AAAA,MACF;AACA,YAAM,SAAS,aAAa,OAAO;AACnC,YAAM,cAAc,cAAc,MAAM;AACxC,YAAM,SAAS,aAAa,WAAW;AAEvC,UAAI,CAAC,KAAK,QAAQ;AAChB,gBAAQ,OAAO,MAAM,YAAY,UAAU,GAAG,QAAQ,KAAK,MAAM,OAAO;AAAA,MAC1E;AAEA,YAAM,eAA6B,CAAC;AACpC,UAAI,KAAK,MAAO,cAAa,QAAQ,KAAK;AAC1C,UAAI,KAAK,iBAAiB,MAAO,cAAa,qBAAqB;AAGnE,YAAM,gBAAgB,CAAC,QAAQ,UAAU,UAAU,SAAS,UAAU,aAAa,KAAK;AACxF,UAAI,KAAK,KAAK;AACZ,YAAI,CAAC,cAAc,SAAS,KAAK,GAAG,GAAG;AACrC,kBAAQ,OAAO,MAAM,oEAA4B,KAAK,GAAG;AAAA,CAAI;AAC7D,kBAAQ,OAAO,MAAM,gCAAY,cAAc,KAAK,IAAI,CAAC;AAAA,CAAI;AAC7D,kBAAQ,KAAK,CAAC;AAAA,QAChB;AACA,qBAAa,UAAU,KAAK;AAAA,MAC9B,OAAO;AACL,qBAAa,UAAU;AAAA,MACzB;AAKA,UAAI,KAAK,SAAS;AAChB,cAAM,IAAI,SAAS,KAAK,SAAS,EAAE;AACnC,YAAI,IAAI,EAAG,cAAa,iBAAiB;AAAA,MAC3C,WAAW,aAAa,YAAY,aAAa;AAC/C,qBAAa,iBAAiB,KAAK,EAAE;AAAA,MACvC,OAAO;AACL,qBAAa,iBAAiB;AAAA,MAChC;AAGA,UAAI,KAAK,cAAc;AACrB,cAAM,IAAI,SAAS,KAAK,cAAc,EAAE;AACxC,YAAI,IAAI,EAAG,cAAa,eAAe;AAAA,MACzC;AAEA,UAAI,CAAC,KAAK,QAAQ;AAChB,qBAAa,aAAa,CAAC,SAAiB,UAAkB;AAC5D,kBAAQ,OAAO,MAAM,cAAc,UAAU,GAAG,QAAQ,KAAK,MAAM,MAAM,OAAO,IAAI,KAAK,GAAG;AAAA,QAC9F;AAAA,MACF;AACA,YAAM,SAAS,MAAM,MAAM,aAAa,YAAY;AAEpD,UAAI,CAAC,OAAO,SAAS;AACnB,gBAAQ,OAAO,MAAM;AAAA,CAAS;AAC9B,gBAAQ,OAAO,MAAM,YAAO,OAAO,KAAK;AAAA,CAAI;AAC5C,gBAAQ,WAAW;AACnB;AAAA,MACF;AAEA,UAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM;AAAA,CAAO;AAG9C,UAAI,CAAC,KAAK,UAAU,OAAO,WAAW,OAAO,cAAc;AACzD,gBAAQ,OAAO,MAAM;AAAA,CAA4B;AAAA,MACnD;AAGA,UAAI,CAAC,KAAK,UAAU,OAAO,WAAW,OAAO,UAAU,QAAQ;AAC7D,mBAAW,KAAK,OAAO,UAAU;AAC/B,kBAAQ,OAAO,MAAM,YAAO,EAAE,OAAO;AAAA,CAAI;AAAA,QAC3C;AAAA,MACF;AAEA,UAAI,WAAW,OAAO;AAEtB,UAAI,KAAK,UAAU,OAAO,QAAQ,QAAQ;AACxC,mBAAW,SAAS,QAAQ,uBAAuB,wBAAwB;AAAA,MAC7E;AACA,YAAM,SAAS,KAAK,WAAW,SAC3B,KAAK,UAAU,QAAQ,MAAM,CAAC,IAC9B;AAGJ,YAAM,aAAa,CAAC,gBAAwB;AAC1C,YAAI,CAAC,OAAO,QAAQ,OAAQ;AAC5B,cAAM,OAAO,SAAS,WAAW,EAAE,QAAQ,YAAY,EAAE;AACzD,cAAM,aAAa,QAAQ,aAAa,MAAM,OAAO,SAAS;AAC9D,cAAM,SAAS,KAAK,WAAW,QAAQ,KAAK,QAAQ,IAAI;AACxD,kBAAU,QAAQ,EAAE,WAAW,KAAK,CAAC;AACrC,mBAAW,OAAO,OAAO,QAAQ;AAC/B,wBAAc,QAAQ,QAAQ,IAAI,QAAQ,GAAG,IAAI,IAAI;AAAA,QACvD;AACA,YAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM,YAAO,OAAO,OAAO,MAAM,oCAAW,MAAM;AAAA,CAAI;AAAA,MACzF;AAEA,UAAI,KAAK,UAAU,MAAM,WAAW,GAAG;AACrC,sBAAc,KAAK,QAAQ,QAAQ,OAAO;AAC1C,YAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM,YAAO,KAAK,MAAM;AAAA,CAAI;AAC7D,mBAAW,QAAQ,KAAK,MAAM,CAAC;AAAA,MACjC,WAAW,KAAK,QAAQ;AACtB,kBAAU,KAAK,QAAQ,EAAE,WAAW,KAAK,CAAC;AAC1C,cAAM,SAAS,KAAK,WAAW,SAAS,UAAU;AAClD,cAAM,UAAU,QAAQ,KAAK,QAAQ,SAAS,QAAQ,YAAY,MAAM,CAAC;AACzE,sBAAc,SAAS,QAAQ,OAAO;AACtC,YAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM,YAAO,OAAO;AAAA,CAAI;AACzD,mBAAW,OAAO;AAAA,MACpB,OAAO;AACL,gBAAQ,OAAO,MAAM,SAAS,IAAI;AAClC,mBAAW,OAAO;AAAA,MACpB;AAAA,IACF,SAAS,KAAK;AACZ,YAAM,EAAE,cAAc,IAAI,MAAM,OAAO,qBAAY;AACnD,cAAQ,OAAO,MAAM;AAAA,kBAAqB,QAAQ,WAAM,cAAc,GAAG,CAAC;AAAA,CAAI;AAC9E,cAAQ,WAAW;AAAA,IACrB;AAAA,EACF;AACF;AAGA,SAAS,gBAAgB,KAAuB;AAC9C,SAAO,IACJ,OAAO,uBAAuB,2EAAoB,EAClD,OAAO,uBAAuB,0EAAmB,EACjD,OAAO,uBAAuB,mEAA2B,EACzD,OAAO,mBAAmB,wEAAgC,UAAU,EACpE,OAAO,sBAAsB,qEAAmB,EAChD,OAAO,qBAAqB,kHAAkC,EAC9D,OAAO,YAAY,oDAAY,EAC/B,OAAO,gBAAgB,qFAAiE,EACxF,OAAO,kBAAkB,sGAA0C,EACnE,OAAO,wBAAwB,sHAA0D;AAC9F;AAEA,QACG,wBAAwB,EACxB,KAAK,QAAQ,EACb,YAAY,2GAAoD,EAChE,QAAQ,OAAO;AAGlB;AAAA,EACE,QACG,QAAQ,OAAO,EACf,YAAY,mGAA4C,EACxD,SAAS,cAAc,8CAAW;AACvC,EAAE,OAAO,QAAQ;AAGjB;AAAA,EACE,QACG,SAAS,cAAc,2EAAwC;AACpE,EAAE,OAAO,QAAQ;AAEjB,QACG,QAAQ,iBAAiB,EACzB,YAAY,uFAA2B,EACvC,OAAO,uBAAuB,0CAAsB,MAAM,EAC1D,OAAO,uBAAuB,6GAA6B,EAC3D,OAAO,qBAAqB,qHAAqC,EACjE,OAAO,YAAY,kHAA6B,EAChD,OAAO,qBAAqB,uEAA0B,EACtD,OAAO,YAAY,oDAAY,EAC/B,OAAO,OAAO,OAAe,SAAS;AACrC,QAAM,eAAe,CAAC,QAAQ,MAAM;AACpC,MAAI,CAAC,aAAa,SAAS,KAAK,MAAM,GAAG;AACvC,YAAQ,OAAO,MAAM,gEAAwB,KAAK,MAAM;AAAA,CAAmB;AAC3E,YAAQ,KAAK,CAAC;AAAA,EAChB;AAEA,QAAM,WAAW,QAAQ,KAAK;AAC9B,MAAI,CAAC,WAAW,QAAQ,GAAG;AACzB,YAAQ,OAAO,MAAM,6EAA2B,KAAK;AAAA,CAAI;AACzD,YAAQ,KAAK,CAAC;AAAA,EAChB;AAEA,QAAM,OAAO,SAAS,QAAQ,EAAE,QAAQ,YAAY,EAAE;AACtD,QAAM,UAAU,KAAK,SACjB,QAAQ,KAAK,MAAM,IACnB,QAAQ,UAAU,MAAM,GAAG,IAAI,IAAI,KAAK,MAAM,EAAE;AAEpD,MAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM,YAAY,SAAS,QAAQ,CAAC,WAAM,SAAS,OAAO,CAAC,MAAM;AAElG,MAAI;AACF,UAAM,WAAW,aAAa,UAAU,OAAO;AAG/C,UAAM,SAAS,KAAK,WAAW,QAAQ,KAAK,QAAQ,IAAI,QAAQ,UAAU,MAAM,OAAO,SAAS;AAChG,UAAM,SAAgD,CAAC;AACvD,QAAI,KAAK,UAAU,WAAW,MAAM,GAAG;AACrC,YAAM,UAAkC;AAAA,QACtC,KAAK;AAAA,QAAa,KAAK;AAAA,QAAc,MAAM;AAAA,QAC3C,KAAK;AAAA,QAAa,KAAK;AAAA,MACzB;AACA,iBAAW,SAAS,YAAY,QAAQ,EAAE,eAAe,KAAK,CAAC,GAAG;AAChE,YAAI,CAAC,MAAM,OAAO,EAAG;AACrB,cAAM,QAAQ,MAAM;AACpB,cAAM,MAAM,QAAQ,KAAK,EAAE,MAAM,CAAC,EAAE,YAAY;AAChD,YAAI,CAAC,QAAQ,GAAG,EAAG;AACnB,cAAM,OAAO,aAAa,QAAQ,QAAQ,KAAK,CAAC;AAChD,eAAO,KAAK,EAAE,UAAU,OAAO,MAAM,IAAI,WAAW,IAAI,GAAG,UAAU,QAAQ,GAAG,EAAE,CAAC;AAAA,MACrF;AACA,UAAI,CAAC,KAAK,OAAQ,SAAQ,OAAO,MAAM,+BAAW,OAAO,MAAM;AAAA,CAAQ;AAAA,IACzE;AAEA,UAAM,WAAqB,CAAC;AAE5B,QAAI;AACJ,QAAI,KAAK,WAAW,QAAQ;AAC1B,UAAI,KAAK,YAAY,CAAC,KAAK,QAAQ;AACjC,gBAAQ,OAAO,MAAM;AAAA;AAAA,CAAiD;AAAA,MACxE;AACA,YAAM,MAAM,eAAe,UAAU,EAAE,UAAU,QAAQ,OAAO,SAAS,SAAS,OAAU,CAAC;AAAA,IAC/F,OAAO;AACL,UAAI;AACJ,UAAI,KAAK,UAAU;AACjB,cAAM,UAAU,aAAa,QAAQ,KAAK,QAAQ,CAAC;AACnD,8BAAsB,QAAQ,OAAO,MAAM,QAAQ,YAAY,QAAQ,aAAa,QAAQ,UAAU;AAAA,MACxG;AACA,YAAM,MAAM,eAAe,UAAU,EAAE,UAAU,QAAQ,OAAO,SAAS,SAAS,QAAW,oBAAoB,CAAC;AAAA,IACpH;AAEA,kBAAc,SAAS,OAAO,KAAK,GAAG,CAAC;AAEvC,QAAI,CAAC,KAAK,QAAQ;AAChB,cAAQ,OAAO,MAAM;AAAA,CAAO;AAC5B,cAAQ,OAAO,MAAM,YAAO,OAAO;AAAA,CAAI;AACvC,UAAI,SAAS,OAAQ,UAAS,QAAQ,OAAK,QAAQ,OAAO,MAAM,KAAK,CAAC;AAAA,CAAI,CAAC;AAAA,IAC7E;AAAA,EACF,SAAS,KAAK;AACZ,UAAM,EAAE,cAAc,IAAI,MAAM,OAAO,qBAAY;AACnD,YAAQ,OAAO,MAAM;AAAA,CAAS;AAC9B,YAAQ,OAAO,MAAM,YAAO,cAAc,GAAG,CAAC;AAAA,CAAI;AAClD,YAAQ,KAAK,CAAC;AAAA,EAChB;AACF,CAAC;AAEH,QACG,QAAQ,aAAa,EACrB,YAAY,4FAAsB,EAClC,OAAO,mBAAmB,4CAAc,EACxC,OAAO,uBAAuB,iEAAe,EAC7C,OAAO,uBAAuB,8CAAW,EACzC,OAAO,mBAAmB,yDAA2B,UAAU,EAC/D,OAAO,YAAY,oDAAY,EAC/B,OAAO,OAAO,KAAa,SAAS;AACnC,QAAM,EAAE,eAAe,IAAI,MAAM,OAAO,qBAAY;AACpD,QAAM,eAAe;AAAA,IACnB;AAAA,IACA,QAAQ,KAAK;AAAA,IACb,SAAS,KAAK;AAAA,IACd,QAAQ,KAAK;AAAA,IACb,OAAO,KAAK;AAAA,IACZ,QAAQ,KAAK;AAAA,EACf,CAAC;AACH,CAAC;AAEH,QAAQ,MAAM;","names":[]}
|
package/dist/index.cjs
CHANGED
|
@@ -2242,9 +2242,9 @@ function createBatchCliProvider(mode, batchSize) {
|
|
|
2242
2242
|
}
|
|
2243
2243
|
let output;
|
|
2244
2244
|
if (mode === "codex") {
|
|
2245
|
-
output = callBatchCodexCli(tempFiles);
|
|
2245
|
+
output = await callBatchCodexCli(tempFiles);
|
|
2246
2246
|
} else {
|
|
2247
|
-
output = callBatchCli(mode, tempFiles);
|
|
2247
|
+
output = await callBatchCli(mode, tempFiles);
|
|
2248
2248
|
}
|
|
2249
2249
|
const cleaned = stripCodeFence2(output.trim());
|
|
2250
2250
|
const parts = cleaned.split(/<!--\s*PAGE_BREAK\s*-->/).map((p) => p.trim()).filter((p) => p.length > 0);
|
|
@@ -2266,7 +2266,48 @@ function createBatchCliProvider(mode, batchSize) {
|
|
|
2266
2266
|
}
|
|
2267
2267
|
};
|
|
2268
2268
|
}
|
|
2269
|
-
function
|
|
2269
|
+
function spawnAsync(cmd, args, opts) {
|
|
2270
|
+
return new Promise((resolve, reject) => {
|
|
2271
|
+
const child = (0, import_child_process3.spawn)(cmd, args, {
|
|
2272
|
+
cwd: opts.cwd,
|
|
2273
|
+
env: process.env,
|
|
2274
|
+
stdio: ["pipe", "pipe", "pipe"]
|
|
2275
|
+
});
|
|
2276
|
+
let stdout = "";
|
|
2277
|
+
let stderr = "";
|
|
2278
|
+
let killed = false;
|
|
2279
|
+
child.stdout.setEncoding("utf-8");
|
|
2280
|
+
child.stderr.setEncoding("utf-8");
|
|
2281
|
+
child.stdout.on("data", (d) => {
|
|
2282
|
+
stdout += d;
|
|
2283
|
+
});
|
|
2284
|
+
child.stderr.on("data", (d) => {
|
|
2285
|
+
stderr += d;
|
|
2286
|
+
});
|
|
2287
|
+
const timer = setTimeout(() => {
|
|
2288
|
+
killed = true;
|
|
2289
|
+
child.kill("SIGTERM");
|
|
2290
|
+
}, opts.timeoutMs);
|
|
2291
|
+
if (opts.stdin !== void 0) {
|
|
2292
|
+
child.stdin.end(opts.stdin);
|
|
2293
|
+
} else {
|
|
2294
|
+
child.stdin.end();
|
|
2295
|
+
}
|
|
2296
|
+
child.on("close", (code) => {
|
|
2297
|
+
clearTimeout(timer);
|
|
2298
|
+
if (killed) {
|
|
2299
|
+
reject(new Error(`\uD0C0\uC784\uC544\uC6C3 (${Math.round(opts.timeoutMs / 1e3)}\uCD08)`));
|
|
2300
|
+
} else {
|
|
2301
|
+
resolve({ stdout, stderr, exitCode: code ?? 1 });
|
|
2302
|
+
}
|
|
2303
|
+
});
|
|
2304
|
+
child.on("error", (err) => {
|
|
2305
|
+
clearTimeout(timer);
|
|
2306
|
+
reject(err);
|
|
2307
|
+
});
|
|
2308
|
+
});
|
|
2309
|
+
}
|
|
2310
|
+
async function callBatchCli(mode, imagePaths) {
|
|
2270
2311
|
const fileRefs = imagePaths.map((p) => `@${p}`).join("\n");
|
|
2271
2312
|
const prompt = `${BATCH_OCR_PROMPT}
|
|
2272
2313
|
|
|
@@ -2282,24 +2323,18 @@ ${fileRefs}`;
|
|
|
2282
2323
|
if (model) args.push("--model", model);
|
|
2283
2324
|
}
|
|
2284
2325
|
const timeoutMs = 6e4 + imagePaths.length * 2e4;
|
|
2285
|
-
const result =
|
|
2286
|
-
|
|
2287
|
-
timeout: timeoutMs,
|
|
2288
|
-
maxBuffer: 50 * 1024 * 1024,
|
|
2289
|
-
// 50MB (large batch output)
|
|
2326
|
+
const result = await spawnAsync(mode, args, {
|
|
2327
|
+
timeoutMs,
|
|
2290
2328
|
...mode === "claude" ? { cwd: (0, import_os2.tmpdir)() } : {}
|
|
2291
2329
|
});
|
|
2292
|
-
if (result.
|
|
2293
|
-
|
|
2294
|
-
}
|
|
2295
|
-
if (result.status !== 0) {
|
|
2296
|
-
const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
|
|
2330
|
+
if (result.exitCode !== 0) {
|
|
2331
|
+
const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
|
|
2297
2332
|
throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
|
|
2298
2333
|
}
|
|
2299
2334
|
return result.stdout || "";
|
|
2300
2335
|
}
|
|
2301
|
-
function callBatchCodexCli(imagePaths) {
|
|
2302
|
-
const outPath = (0, import_path2.join)((0, import_os2.tmpdir)(), `kordoc-codex-batch-${Date.now()}.txt`);
|
|
2336
|
+
async function callBatchCodexCli(imagePaths) {
|
|
2337
|
+
const outPath = (0, import_path2.join)((0, import_os2.tmpdir)(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`);
|
|
2303
2338
|
try {
|
|
2304
2339
|
const args = ["exec", BATCH_OCR_PROMPT];
|
|
2305
2340
|
for (const p of imagePaths) {
|
|
@@ -2309,17 +2344,12 @@ function callBatchCodexCli(imagePaths) {
|
|
|
2309
2344
|
const model = process.env.KORDOC_CODEX_MODEL;
|
|
2310
2345
|
if (model) args.push("--model", model);
|
|
2311
2346
|
const timeoutMs = 6e4 + imagePaths.length * 2e4;
|
|
2312
|
-
const result =
|
|
2313
|
-
|
|
2314
|
-
|
|
2315
|
-
maxBuffer: 50 * 1024 * 1024,
|
|
2316
|
-
input: ""
|
|
2347
|
+
const result = await spawnAsync("codex", args, {
|
|
2348
|
+
timeoutMs,
|
|
2349
|
+
stdin: ""
|
|
2317
2350
|
});
|
|
2318
|
-
if (result.
|
|
2319
|
-
|
|
2320
|
-
}
|
|
2321
|
-
if (result.status !== 0) {
|
|
2322
|
-
const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
|
|
2351
|
+
if (result.exitCode !== 0) {
|
|
2352
|
+
const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
|
|
2323
2353
|
throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
|
|
2324
2354
|
}
|
|
2325
2355
|
try {
|
|
@@ -2581,7 +2611,7 @@ function isBatchProvider(p) {
|
|
|
2581
2611
|
async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
|
|
2582
2612
|
const blocks = [];
|
|
2583
2613
|
if (isBatchProvider(provider)) {
|
|
2584
|
-
return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, onProgress);
|
|
2614
|
+
return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, onProgress);
|
|
2585
2615
|
}
|
|
2586
2616
|
if (concurrency <= 1) {
|
|
2587
2617
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
@@ -2628,8 +2658,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
|
|
|
2628
2658
|
}
|
|
2629
2659
|
return blocks;
|
|
2630
2660
|
}
|
|
2631
|
-
async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, onProgress) {
|
|
2632
|
-
const blocks = [];
|
|
2661
|
+
async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
|
|
2633
2662
|
const pageNumbers = [];
|
|
2634
2663
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
2635
2664
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
@@ -2646,16 +2675,16 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
|
|
|
2646
2675
|
batches.push(pageImages.slice(i, i + provider.batchSize));
|
|
2647
2676
|
}
|
|
2648
2677
|
let processed = 0;
|
|
2649
|
-
|
|
2678
|
+
const batchTasks = batches.map((batch, batchIdx) => async () => {
|
|
2679
|
+
const pageBlocks = [];
|
|
2650
2680
|
try {
|
|
2651
2681
|
const results = await provider.processBatch(batch);
|
|
2652
2682
|
for (const { pageNum } of batch) {
|
|
2653
2683
|
const result = results.get(pageNum);
|
|
2654
|
-
|
|
2655
|
-
|
|
2656
|
-
|
|
2657
|
-
|
|
2658
|
-
onProgress?.(processed, pageNumbers.length);
|
|
2684
|
+
pageBlocks.push({
|
|
2685
|
+
pageNum,
|
|
2686
|
+
blocks: result ? ocrResultToBlocks(result, pageNum) : []
|
|
2687
|
+
});
|
|
2659
2688
|
}
|
|
2660
2689
|
} catch (err) {
|
|
2661
2690
|
const range = `${batch[0].pageNum}-${batch[batch.length - 1].pageNum}`;
|
|
@@ -2663,8 +2692,20 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
|
|
|
2663
2692
|
message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
|
|
2664
2693
|
code: "OCR_PAGE_FAILED"
|
|
2665
2694
|
});
|
|
2666
|
-
|
|
2667
|
-
|
|
2695
|
+
for (const { pageNum } of batch) {
|
|
2696
|
+
pageBlocks.push({ pageNum, blocks: [] });
|
|
2697
|
+
}
|
|
2698
|
+
}
|
|
2699
|
+
processed += batch.length;
|
|
2700
|
+
onProgress?.(processed, pageNumbers.length);
|
|
2701
|
+
return { batchIdx, pageBlocks };
|
|
2702
|
+
});
|
|
2703
|
+
const effectiveConcurrency = Math.max(1, concurrency);
|
|
2704
|
+
const batchResults = await runWithConcurrency(batchTasks, effectiveConcurrency);
|
|
2705
|
+
const blocks = [];
|
|
2706
|
+
for (const result of batchResults) {
|
|
2707
|
+
for (const { blocks: pageBlks } of result.pageBlocks) {
|
|
2708
|
+
for (const b of pageBlks) blocks.push(b);
|
|
2668
2709
|
}
|
|
2669
2710
|
}
|
|
2670
2711
|
return blocks;
|
|
@@ -2757,7 +2798,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
|
|
|
2757
2798
|
var import_xmldom = require("@xmldom/xmldom");
|
|
2758
2799
|
|
|
2759
2800
|
// src/utils.ts
|
|
2760
|
-
var VERSION = true ? "2.3.
|
|
2801
|
+
var VERSION = true ? "2.3.1" : "0.0.0-dev";
|
|
2761
2802
|
function toArrayBuffer(buf) {
|
|
2762
2803
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2763
2804
|
return buf.buffer;
|