@clazic/kordoc 2.3.1 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{batch-provider-PCT4I4LK.js → batch-provider-PNDCSGQW.js} +3 -4
- package/dist/batch-provider-PNDCSGQW.js.map +1 -0
- package/dist/{chunk-ZOEUKD77.js → chunk-2GFJFTKS.js} +193 -49
- package/dist/chunk-2GFJFTKS.js.map +1 -0
- package/dist/chunk-4PP34NVQ.js +121 -0
- package/dist/chunk-4PP34NVQ.js.map +1 -0
- package/dist/{tesseract-provider-WCVJWBUT.js → chunk-7FMKAV4P.js} +4 -4
- package/dist/{tesseract-provider-WCVJWBUT.js.map → chunk-7FMKAV4P.js.map} +1 -1
- package/dist/chunk-JOGAFNIL.js +153 -0
- package/dist/chunk-JOGAFNIL.js.map +1 -0
- package/dist/{chunk-W5KUC23B.js → chunk-STIKJGEA.js} +2 -2
- package/dist/cli.js +8 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +217 -70
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +11 -6
- package/dist/index.d.ts +11 -6
- package/dist/index.js +217 -70
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -2
- package/dist/mcp.js.map +1 -1
- package/dist/{provider-WYHC4NHI.js → provider-HE727F7Z.js} +19 -131
- package/dist/provider-HE727F7Z.js.map +1 -0
- package/dist/resolve-QA3VACUP.js +111 -0
- package/dist/resolve-QA3VACUP.js.map +1 -0
- package/dist/tesseract-provider-MNMZPSGF.js +11 -0
- package/dist/{utils-HSF5HI5T.js → utils-FFUQJTTI.js} +2 -2
- package/dist/utils-FFUQJTTI.js.map +1 -0
- package/dist/{watch-R2JHXDGF.js → watch-2O32L6IF.js} +6 -3
- package/dist/{watch-R2JHXDGF.js.map → watch-2O32L6IF.js.map} +1 -1
- package/package.json +7 -8
- package/dist/batch-provider-PCT4I4LK.js.map +0 -1
- package/dist/chunk-ZOEUKD77.js.map +0 -1
- package/dist/provider-WYHC4NHI.js.map +0 -1
- package/dist/resolve-4FSAQF2S.js +0 -247
- package/dist/resolve-4FSAQF2S.js.map +0 -1
- /package/dist/{chunk-W5KUC23B.js.map → chunk-STIKJGEA.js.map} +0 -0
- /package/dist/{utils-HSF5HI5T.js.map → tesseract-provider-MNMZPSGF.js.map} +0 -0
|
@@ -108,9 +108,8 @@ async function callBatchCli(mode, imagePaths) {
|
|
|
108
108
|
${fileRefs}`;
|
|
109
109
|
let args;
|
|
110
110
|
if (mode === "gemini") {
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
if (model) args.push("--model", model);
|
|
111
|
+
const model = process.env.KORDOC_GEMINI_MODEL ?? "gemini-2.5-flash";
|
|
112
|
+
args = ["--prompt", prompt, "--yolo", "--model", model];
|
|
114
113
|
} else {
|
|
115
114
|
args = ["--print", prompt];
|
|
116
115
|
const model = process.env.KORDOC_CLAUDE_MODEL;
|
|
@@ -166,4 +165,4 @@ export {
|
|
|
166
165
|
DEFAULT_BATCH_SIZES,
|
|
167
166
|
createBatchCliProvider
|
|
168
167
|
};
|
|
169
|
-
//# sourceMappingURL=batch-provider-
|
|
168
|
+
//# sourceMappingURL=batch-provider-PNDCSGQW.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/ocr/batch-provider.ts"],"sourcesContent":["/**\n * CLI 배치 OCR 프로바이더\n *\n * 여러 페이지 이미지를 단일 CLI 호출로 처리하여 API 호출 수를 대폭 감소.\n * gemini/claude: @file 멀티 참조, codex: --image 멀티 플래그\n *\n * 299페이지 기준:\n * - 기존: CLI 299회 호출 (~30분)\n * - 배치: CLI 3~6회 호출 (~3분)\n */\n\nimport { spawn } from \"child_process\"\nimport { writeFileSync, readFileSync, unlinkSync, mkdirSync } from \"fs\"\nimport { join } from \"path\"\nimport { tmpdir } from \"os\"\nimport type { StructuredOcrResult, BatchOcrProvider } from \"../types.js\"\n\n/** 배치 OCR 프롬프트 */\nconst BATCH_OCR_PROMPT =\n \"다음 문서 페이지 이미지들을 OCR하여 순수 Markdown으로 변환하세요.\\n\\n\" +\n \"규칙:\\n\" +\n \"- 각 페이지 결과 사이에 반드시 이 구분자를 삽입: <!-- PAGE_BREAK -->\\n\" +\n \"- 테이블은 Markdown 테이블 문법 사용 (| 구분, |---|---| 헤더 구분선 포함)\\n\" +\n \"- 병합된 셀은 해당 위치에 내용 기재\\n\" +\n \"- 헤딩은 글자 크기에 따라 ## ~ ###### 사용\\n\" +\n \"- 리스트는 - 또는 1. 사용\\n\" +\n \"- 이미지, 도형 등 비텍스트 요소는 무시\\n\" +\n \"- 원문의 읽기 순서와 구조를 유지\\n\" +\n \"- ```로 감싸지 말고 순수 Markdown만 출력\"\n\n/** 모드별 기본 배치 크기 (CLI 내부 타임아웃 + 실측 기반)\n *\n * gemini CLI: 10장 이상에서 AbortError 발생 (내부 타임아웃).\n * 5장 배치가 안정적으로 동작 확인 (35초/배치).\n * 299페이지 = 60배치 = 기존 299회 대비 80% 감소.\n */\nexport const DEFAULT_BATCH_SIZES: Record<string, number> = {\n gemini: 5,\n claude: 5,\n codex: 10,\n}\n\n/** 임시 디렉토리 — gemini CLI는 cwd 하위 + gitignore 밖만 @참조 가능 */\nlet _batchTempDir: string | null = null\nfunction getBatchTempDir(): string {\n if (!_batchTempDir) {\n _batchTempDir = join(process.cwd(), \"_kordoc_ocr_tmp\")\n mkdirSync(_batchTempDir, { recursive: true })\n }\n return _batchTempDir\n}\n\n/**\n * 배치 CLI 프로바이더 생성\n */\nexport function createBatchCliProvider(\n mode: \"gemini\" | \"claude\" | \"codex\",\n batchSize: number\n): BatchOcrProvider {\n return {\n __batch: true as const,\n batchSize,\n async processBatch(pages) {\n const results = new Map<number, StructuredOcrResult>()\n const tempDir = getBatchTempDir()\n const tempFiles: string[] = []\n\n try {\n // 1. Write all page images to temp files\n for (const { image, pageNum } of pages) {\n const path = join(tempDir, `batch-p${pageNum}.png`)\n writeFileSync(path, image)\n tempFiles.push(path)\n }\n\n // 2. Call CLI with all file references (비동기 — 병렬 배치 실행 가능)\n let output: string\n if (mode === \"codex\") {\n output = await callBatchCodexCli(tempFiles)\n } else {\n output = await callBatchCli(mode, tempFiles)\n }\n\n // 3. Parse response by PAGE_BREAK separator\n const cleaned = stripCodeFence(output.trim())\n const parts = cleaned.split(/<!--\\s*PAGE_BREAK\\s*-->/)\n .map(p => p.trim())\n .filter(p => p.length > 0)\n\n // 4. Map results to page numbers (best-effort if count mismatch)\n for (let i = 0; i < pages.length; i++) {\n const pageNum = pages[i].pageNum\n if (i < parts.length) {\n results.set(pageNum, { markdown: parts[i] })\n }\n // If fewer parts than pages, remaining pages get no result\n }\n } finally {\n // 5. Clean up temp files\n for (const f of tempFiles) {\n try { unlinkSync(f) } catch { /* ignore */ }\n }\n }\n\n return results\n },\n }\n}\n\n/**\n * 비동기 CLI 실행 헬퍼 — spawn + Promise 래핑.\n * spawnSync는 이벤트 루프를 차단하여 병렬 배치 실행 불가.\n */\nfunction spawnAsync(\n cmd: string,\n args: string[],\n opts: { timeoutMs: number; cwd?: string; stdin?: string }\n): Promise<{ stdout: string; stderr: string; exitCode: number }> {\n return new Promise((resolve, reject) => {\n const child = spawn(cmd, args, {\n cwd: opts.cwd,\n env: process.env,\n stdio: [\"pipe\", \"pipe\", \"pipe\"],\n })\n\n let stdout = \"\"\n let stderr = \"\"\n let killed = false\n\n child.stdout.setEncoding(\"utf-8\")\n child.stderr.setEncoding(\"utf-8\")\n child.stdout.on(\"data\", (d: string) => { stdout += d })\n child.stderr.on(\"data\", (d: string) => { stderr += d })\n\n const timer = setTimeout(() => {\n killed = true\n child.kill(\"SIGTERM\")\n }, opts.timeoutMs)\n\n if (opts.stdin !== undefined) {\n child.stdin.end(opts.stdin)\n } else {\n child.stdin.end()\n }\n\n child.on(\"close\", (code) => {\n clearTimeout(timer)\n if (killed) {\n reject(new Error(`타임아웃 (${Math.round(opts.timeoutMs / 1000)}초)`))\n } else {\n resolve({ stdout, stderr, exitCode: code ?? 1 })\n }\n })\n child.on(\"error\", (err) => {\n clearTimeout(timer)\n reject(err)\n })\n })\n}\n\n/** gemini/claude 배치 호출 (비동기) */\nasync function callBatchCli(mode: \"gemini\" | \"claude\", imagePaths: string[]): Promise<string> {\n const fileRefs = imagePaths.map(p => `@${p}`).join(\"\\n\")\n const prompt = `${BATCH_OCR_PROMPT}\\n\\n${fileRefs}`\n\n let args: string[]\n if (mode === \"gemini\") {\n const model = process.env.KORDOC_GEMINI_MODEL ?? \"gemini-2.5-flash\"\n args = [\"--prompt\", prompt, \"--yolo\", \"--model\", model]\n } else {\n args = [\"--print\", prompt]\n const model = process.env.KORDOC_CLAUDE_MODEL\n if (model) args.push(\"--model\", model)\n }\n\n const timeoutMs = 60_000 + imagePaths.length * 20_000\n const result = await spawnAsync(mode, args, {\n timeoutMs,\n ...(mode === \"claude\" ? { cwd: tmpdir() } : {}),\n })\n\n if (result.exitCode !== 0) {\n const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`\n throw new Error(`${mode} 배치 OCR 실패: ${errMsg}`)\n }\n\n return result.stdout || \"\"\n}\n\n/** codex 배치 호출 (비동기) — --image를 여러 번 지정 */\nasync function callBatchCodexCli(imagePaths: string[]): Promise<string> {\n const outPath = join(tmpdir(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`)\n try {\n const args = [\"exec\", BATCH_OCR_PROMPT]\n for (const p of imagePaths) {\n args.push(\"--image\", p)\n }\n args.push(\"--output-last-message\", outPath)\n const model = process.env.KORDOC_CODEX_MODEL\n if (model) args.push(\"--model\", model)\n\n const timeoutMs = 60_000 + imagePaths.length * 20_000\n const result = await spawnAsync(\"codex\", args, {\n timeoutMs,\n stdin: \"\",\n })\n\n if (result.exitCode !== 0) {\n const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`\n throw new Error(`codex 배치 OCR 실패: ${errMsg}`)\n }\n\n try {\n return readFileSync(outPath, \"utf-8\")\n } catch {\n return result.stdout || \"\"\n }\n } finally {\n try { unlinkSync(outPath) } catch { /* ignore */ }\n }\n}\n\n/** LLM 출력에서 코드 펜스 제거 (cli-provider.ts와 동일 로직) */\nfunction stripCodeFence(text: string): string {\n const match = text.match(/^```(?:markdown|md)?\\s*\\n([\\s\\S]*?)\\n```\\s*$/m)\n return match ? match[1].trim() : text\n}\n"],"mappings":";;;;AAWA,SAAS,aAAa;AACtB,SAAS,eAAe,cAAc,YAAY,iBAAiB;AACnE,SAAS,YAAY;AACrB,SAAS,cAAc;AAIvB,IAAM,mBACJ;AAiBK,IAAM,sBAA8C;AAAA,EACzD,QAAQ;AAAA,EACR,QAAQ;AAAA,EACR,OAAO;AACT;AAGA,IAAI,gBAA+B;AACnC,SAAS,kBAA0B;AACjC,MAAI,CAAC,eAAe;AAClB,oBAAgB,KAAK,QAAQ,IAAI,GAAG,iBAAiB;AACrD,cAAU,eAAe,EAAE,WAAW,KAAK,CAAC;AAAA,EAC9C;AACA,SAAO;AACT;AAKO,SAAS,uBACd,MACA,WACkB;AAClB,SAAO;AAAA,IACL,SAAS;AAAA,IACT;AAAA,IACA,MAAM,aAAa,OAAO;AACxB,YAAM,UAAU,oBAAI,IAAiC;AACrD,YAAM,UAAU,gBAAgB;AAChC,YAAM,YAAsB,CAAC;AAE7B,UAAI;AAEF,mBAAW,EAAE,OAAO,QAAQ,KAAK,OAAO;AACtC,gBAAM,OAAO,KAAK,SAAS,UAAU,OAAO,MAAM;AAClD,wBAAc,MAAM,KAAK;AACzB,oBAAU,KAAK,IAAI;AAAA,QACrB;AAGA,YAAI;AACJ,YAAI,SAAS,SAAS;AACpB,mBAAS,MAAM,kBAAkB,SAAS;AAAA,QAC5C,OAAO;AACL,mBAAS,MAAM,aAAa,MAAM,SAAS;AAAA,QAC7C;AAGA,cAAM,UAAU,eAAe,OAAO,KAAK,CAAC;AAC5C,cAAM,QAAQ,QAAQ,MAAM,yBAAyB,EAClD,IAAI,OAAK,EAAE,KAAK,CAAC,EACjB,OAAO,OAAK,EAAE,SAAS,CAAC;AAG3B,iBAAS,IAAI,GAAG,IAAI,MAAM,QAAQ,KAAK;AACrC,gBAAM,UAAU,MAAM,CAAC,EAAE;AACzB,cAAI,IAAI,MAAM,QAAQ;AACpB,oBAAQ,IAAI,SAAS,EAAE,UAAU,MAAM,CAAC,EAAE,CAAC;AAAA,UAC7C;AAAA,QAEF;AAAA,MACF,UAAE;AAEA,mBAAW,KAAK,WAAW;AACzB,cAAI;AAAE,uBAAW,CAAC;AAAA,UAAE,QAAQ;AAAA,UAAe;AAAA,QAC7C;AAAA,MACF;AAEA,aAAO;AAAA,IACT;AAAA,EACF;AACF;AAMA,SAAS,WACP,KACA,MACA,MAC+D;AAC/D,SAAO,IAAI,QAAQ,CAAC,SAAS,WAAW;AACtC,UAAM,QAAQ,MAAM,KAAK,MAAM;AAAA,MAC7B,KAAK,KAAK;AAAA,MACV,KAAK,QAAQ;AAAA,MACb,OAAO,CAAC,QAAQ,QAAQ,MAAM;AAAA,IAChC,CAAC;AAED,QAAI,SAAS;AACb,QAAI,SAAS;AACb,QAAI,SAAS;AAEb,UAAM,OAAO,YAAY,OAAO;AAChC,UAAM,OAAO,YAAY,OAAO;AAChC,UAAM,OAAO,GAAG,QAAQ,CAAC,MAAc;AAAE,gBAAU;AAAA,IAAE,CAAC;AACtD,UAAM,OAAO,GAAG,QAAQ,CAAC,MAAc;AAAE,gBAAU;AAAA,IAAE,CAAC;AAEtD,UAAM,QAAQ,WAAW,MAAM;AAC7B,eAAS;AACT,YAAM,KAAK,SAAS;AAAA,IACtB,GAAG,KAAK,SAAS;AAEjB,QAAI,KAAK,UAAU,QAAW;AAC5B,YAAM,MAAM,IAAI,KAAK,KAAK;AAAA,IAC5B,OAAO;AACL,YAAM,MAAM,IAAI;AAAA,IAClB;AAEA,UAAM,GAAG,SAAS,CAAC,SAAS;AAC1B,mBAAa,KAAK;AAClB,UAAI,QAAQ;AACV,eAAO,IAAI,MAAM,6BAAS,KAAK,MAAM,KAAK,YAAY,GAAI,CAAC,SAAI,CAAC;AAAA,MAClE,OAAO;AACL,gBAAQ,EAAE,QAAQ,QAAQ,UAAU,QAAQ,EAAE,CAAC;AAAA,MACjD;AAAA,IACF,CAAC;AACD,UAAM,GAAG,SAAS,CAAC,QAAQ;AACzB,mBAAa,KAAK;AAClB,aAAO,GAAG;AAAA,IACZ,CAAC;AAAA,EACH,CAAC;AACH;AAGA,eAAe,aAAa,MAA2B,YAAuC;AAC5F,QAAM,WAAW,WAAW,IAAI,OAAK,IAAI,CAAC,EAAE,EAAE,KAAK,IAAI;AACvD,QAAM,SAAS,GAAG,gBAAgB;AAAA;AAAA,EAAO,QAAQ;AAEjD,MAAI;AACJ,MAAI,SAAS,UAAU;AACrB,UAAM,QAAQ,QAAQ,IAAI,uBAAuB;AACjD,WAAO,CAAC,YAAY,QAAQ,UAAU,WAAW,KAAK;AAAA,EACxD,OAAO;AACL,WAAO,CAAC,WAAW,MAAM;AACzB,UAAM,QAAQ,QAAQ,IAAI;AAC1B,QAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AAAA,EACvC;AAEA,QAAM,YAAY,MAAS,WAAW,SAAS;AAC/C,QAAM,SAAS,MAAM,WAAW,MAAM,MAAM;AAAA,IAC1C;AAAA,IACA,GAAI,SAAS,WAAW,EAAE,KAAK,OAAO,EAAE,IAAI,CAAC;AAAA,EAC/C,CAAC;AAED,MAAI,OAAO,aAAa,GAAG;AACzB,UAAM,SAAS,OAAO,QAAQ,KAAK,KAAK,aAAa,OAAO,QAAQ;AACpE,UAAM,IAAI,MAAM,GAAG,IAAI,mCAAe,MAAM,EAAE;AAAA,EAChD;AAEA,SAAO,OAAO,UAAU;AAC1B;AAGA,eAAe,kBAAkB,YAAuC;AACtE,QAAM,UAAU,KAAK,OAAO,GAAG,sBAAsB,KAAK,IAAI,CAAC,IAAI,KAAK,OAAO,EAAE,SAAS,EAAE,EAAE,MAAM,CAAC,CAAC,MAAM;AAC5G,MAAI;AACF,UAAM,OAAO,CAAC,QAAQ,gBAAgB;AACtC,eAAW,KAAK,YAAY;AAC1B,WAAK,KAAK,WAAW,CAAC;AAAA,IACxB;AACA,SAAK,KAAK,yBAAyB,OAAO;AAC1C,UAAM,QAAQ,QAAQ,IAAI;AAC1B,QAAI,MAAO,MAAK,KAAK,WAAW,KAAK;AAErC,UAAM,YAAY,MAAS,WAAW,SAAS;AAC/C,UAAM,SAAS,MAAM,WAAW,SAAS,MAAM;AAAA,MAC7C;AAAA,MACA,OAAO;AAAA,IACT,CAAC;AAED,QAAI,OAAO,aAAa,GAAG;AACzB,YAAM,SAAS,OAAO,QAAQ,KAAK,KAAK,aAAa,OAAO,QAAQ;AACpE,YAAM,IAAI,MAAM,wCAAoB,MAAM,EAAE;AAAA,IAC9C;AAEA,QAAI;AACF,aAAO,aAAa,SAAS,OAAO;AAAA,IACtC,QAAQ;AACN,aAAO,OAAO,UAAU;AAAA,IAC1B;AAAA,EACF,UAAE;AACA,QAAI;AAAE,iBAAW,OAAO;AAAA,IAAE,QAAQ;AAAA,IAAe;AAAA,EACnD;AACF;AAGA,SAAS,eAAe,MAAsB;AAC5C,QAAM,QAAQ,KAAK,MAAM,+CAA+C;AACxE,SAAO,QAAQ,MAAM,CAAC,EAAE,KAAK,IAAI;AACnC;","names":[]}
|
|
@@ -6,10 +6,19 @@ import {
|
|
|
6
6
|
precheckZipSize,
|
|
7
7
|
sanitizeHref,
|
|
8
8
|
toArrayBuffer
|
|
9
|
-
} from "./chunk-
|
|
9
|
+
} from "./chunk-STIKJGEA.js";
|
|
10
10
|
import {
|
|
11
11
|
parsePageRange
|
|
12
12
|
} from "./chunk-MOL7MDBG.js";
|
|
13
|
+
import {
|
|
14
|
+
createTesseractProvider
|
|
15
|
+
} from "./chunk-7FMKAV4P.js";
|
|
16
|
+
import {
|
|
17
|
+
createCliOcrProvider
|
|
18
|
+
} from "./chunk-JOGAFNIL.js";
|
|
19
|
+
import {
|
|
20
|
+
markdownToBlocks
|
|
21
|
+
} from "./chunk-4PP34NVQ.js";
|
|
13
22
|
import {
|
|
14
23
|
__commonJS,
|
|
15
24
|
__require,
|
|
@@ -1918,24 +1927,29 @@ function isPdfFile(buffer) {
|
|
|
1918
1927
|
const b = magicBytes(buffer);
|
|
1919
1928
|
return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
|
|
1920
1929
|
}
|
|
1930
|
+
function isPngFile(buffer) {
|
|
1931
|
+
const b = magicBytes(buffer);
|
|
1932
|
+
return b[0] === 137 && b[1] === 80 && b[2] === 78 && b[3] === 71;
|
|
1933
|
+
}
|
|
1921
1934
|
function detectFormat(buffer) {
|
|
1922
1935
|
if (buffer.byteLength < 4) return "unknown";
|
|
1923
1936
|
if (isZipFile(buffer)) return "hwpx";
|
|
1924
1937
|
if (isOldHwpFile(buffer)) return "hwp";
|
|
1925
1938
|
if (isPdfFile(buffer)) return "pdf";
|
|
1939
|
+
if (isPngFile(buffer)) return "image";
|
|
1926
1940
|
return "unknown";
|
|
1927
1941
|
}
|
|
1928
1942
|
async function detectZipFormat(buffer) {
|
|
1929
1943
|
try {
|
|
1930
1944
|
const zip = await JSZip.loadAsync(buffer);
|
|
1931
|
-
if (zip.file("xl/workbook.xml")) return "xlsx";
|
|
1932
|
-
if (zip.file("word/document.xml")) return "docx";
|
|
1933
|
-
if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
|
|
1945
|
+
if (zip.file("xl/workbook.xml")) return { format: "xlsx", zip };
|
|
1946
|
+
if (zip.file("word/document.xml")) return { format: "docx", zip };
|
|
1947
|
+
if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return { format: "hwpx", zip };
|
|
1934
1948
|
const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
|
|
1935
|
-
if (hasSection) return "hwpx";
|
|
1936
|
-
return "unknown";
|
|
1949
|
+
if (hasSection) return { format: "hwpx", zip };
|
|
1950
|
+
return { format: "unknown", zip: null };
|
|
1937
1951
|
} catch {
|
|
1938
|
-
return "unknown";
|
|
1952
|
+
return { format: "unknown", zip: null };
|
|
1939
1953
|
}
|
|
1940
1954
|
}
|
|
1941
1955
|
|
|
@@ -2024,12 +2038,16 @@ function buildTableDirect(rows, numRows) {
|
|
|
2024
2038
|
return trimAndReturn(grid, numRows, maxCols);
|
|
2025
2039
|
}
|
|
2026
2040
|
function trimAndReturn(grid, numRows, maxCols) {
|
|
2027
|
-
let effectiveCols =
|
|
2028
|
-
|
|
2029
|
-
|
|
2030
|
-
|
|
2031
|
-
|
|
2041
|
+
let effectiveCols = 0;
|
|
2042
|
+
for (const row of grid) {
|
|
2043
|
+
for (let c = row.length - 1; c >= effectiveCols; c--) {
|
|
2044
|
+
if (row[c]?.text?.trim()) {
|
|
2045
|
+
effectiveCols = c + 1;
|
|
2046
|
+
break;
|
|
2047
|
+
}
|
|
2048
|
+
}
|
|
2032
2049
|
}
|
|
2050
|
+
if (effectiveCols === 0) effectiveCols = maxCols;
|
|
2033
2051
|
if (effectiveCols < maxCols && effectiveCols > 0) {
|
|
2034
2052
|
const trimmed = grid.map((row) => row.slice(0, effectiveCols));
|
|
2035
2053
|
return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
|
|
@@ -2289,11 +2307,11 @@ function parseStyleElements(doc, map) {
|
|
|
2289
2307
|
function stripDtd(xml) {
|
|
2290
2308
|
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
2291
2309
|
}
|
|
2292
|
-
async function parseHwpxDocument(buffer, options) {
|
|
2310
|
+
async function parseHwpxDocument(buffer, options, existingZip) {
|
|
2293
2311
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
2294
2312
|
let zip;
|
|
2295
2313
|
try {
|
|
2296
|
-
zip = await JSZip2.loadAsync(buffer);
|
|
2314
|
+
zip = existingZip ?? await JSZip2.loadAsync(buffer);
|
|
2297
2315
|
} catch {
|
|
2298
2316
|
return await extractFromBrokenZip(buffer);
|
|
2299
2317
|
}
|
|
@@ -5328,8 +5346,15 @@ import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mj
|
|
|
5328
5346
|
GlobalWorkerOptions.workerSrc = "";
|
|
5329
5347
|
var MAX_PAGES = 5e3;
|
|
5330
5348
|
var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
|
|
5331
|
-
|
|
5349
|
+
function calcPdfTimeout(bufferSize) {
|
|
5350
|
+
const base = 3e4;
|
|
5351
|
+
const perMb = 500;
|
|
5352
|
+
const mb = bufferSize / (1024 * 1024);
|
|
5353
|
+
return Math.min(base + Math.ceil(mb * perMb), 3e5);
|
|
5354
|
+
}
|
|
5332
5355
|
async function loadPdfWithTimeout(buffer) {
|
|
5356
|
+
const timeoutMs = calcPdfTimeout(buffer.byteLength);
|
|
5357
|
+
const timeoutSec = Math.round(timeoutMs / 1e3);
|
|
5333
5358
|
const loadingTask = getDocument({
|
|
5334
5359
|
data: new Uint8Array(buffer),
|
|
5335
5360
|
useSystemFonts: true,
|
|
@@ -5343,8 +5368,8 @@ async function loadPdfWithTimeout(buffer) {
|
|
|
5343
5368
|
new Promise((_, reject) => {
|
|
5344
5369
|
timer = setTimeout(() => {
|
|
5345
5370
|
loadingTask.destroy();
|
|
5346
|
-
reject(new KordocError(
|
|
5347
|
-
},
|
|
5371
|
+
reject(new KordocError(`PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (${timeoutSec}\uCD08 \uCD08\uACFC)`));
|
|
5372
|
+
}, timeoutMs);
|
|
5348
5373
|
})
|
|
5349
5374
|
]);
|
|
5350
5375
|
} finally {
|
|
@@ -5365,11 +5390,15 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5365
5390
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
5366
5391
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
5367
5392
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
5368
|
-
const
|
|
5393
|
+
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
5369
5394
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
5370
|
-
|
|
5395
|
+
const targetPageNums = [];
|
|
5371
5396
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
5372
5397
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
5398
|
+
targetPageNums.push(i);
|
|
5399
|
+
}
|
|
5400
|
+
let parsedPages = 0;
|
|
5401
|
+
const parseSinglePage = async (i) => {
|
|
5373
5402
|
try {
|
|
5374
5403
|
const page = await doc.getPage(i);
|
|
5375
5404
|
const tc = await page.getTextContent();
|
|
@@ -5382,7 +5411,10 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5382
5411
|
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
5383
5412
|
}
|
|
5384
5413
|
for (const item of visible) {
|
|
5385
|
-
if (item.fontSize > 0)
|
|
5414
|
+
if (item.fontSize > 0) {
|
|
5415
|
+
const rounded = Math.round(item.fontSize * 10) / 10;
|
|
5416
|
+
fontSizeFreq.set(rounded, (fontSizeFreq.get(rounded) || 0) + 1);
|
|
5417
|
+
}
|
|
5386
5418
|
}
|
|
5387
5419
|
const opList = await page.getOperatorList();
|
|
5388
5420
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
@@ -5399,14 +5431,25 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5399
5431
|
if (pageErr instanceof KordocError) throw pageErr;
|
|
5400
5432
|
warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
5401
5433
|
}
|
|
5434
|
+
};
|
|
5435
|
+
const sampleCount = Math.min(5, targetPageNums.length);
|
|
5436
|
+
for (let si = 0; si < sampleCount; si++) {
|
|
5437
|
+
await parseSinglePage(targetPageNums[si]);
|
|
5438
|
+
}
|
|
5439
|
+
const sampleParsed = parsedPages || sampleCount;
|
|
5440
|
+
const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
|
|
5441
|
+
if (!isImageBased) {
|
|
5442
|
+
for (let si = sampleCount; si < targetPageNums.length; si++) {
|
|
5443
|
+
await parseSinglePage(targetPageNums[si]);
|
|
5444
|
+
}
|
|
5402
5445
|
}
|
|
5403
5446
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
5404
|
-
if (
|
|
5447
|
+
if (isImageBased) {
|
|
5405
5448
|
let ocrProvider = options?.ocr ?? null;
|
|
5406
|
-
const ocrMode = options?.ocrMode;
|
|
5407
|
-
if (!ocrProvider && ocrMode
|
|
5449
|
+
const ocrMode = options?.ocrMode ?? "auto";
|
|
5450
|
+
if (!ocrProvider && ocrMode !== "off") {
|
|
5408
5451
|
try {
|
|
5409
|
-
const { resolveOcrProvider } = await import("./resolve-
|
|
5452
|
+
const { resolveOcrProvider } = await import("./resolve-QA3VACUP.js");
|
|
5410
5453
|
const concurrency = options?.ocrConcurrency ?? 1;
|
|
5411
5454
|
const batchSize = options?.ocrBatchSize;
|
|
5412
5455
|
ocrProvider = await resolveOcrProvider(ocrMode, warnings, concurrency, batchSize);
|
|
@@ -5422,7 +5465,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5422
5465
|
if (ocrProvider) {
|
|
5423
5466
|
let ocrBlocks = [];
|
|
5424
5467
|
try {
|
|
5425
|
-
const { ocrPages } = await import("./provider-
|
|
5468
|
+
const { ocrPages } = await import("./provider-HE727F7Z.js");
|
|
5426
5469
|
const concurrency = options?.ocrConcurrency ?? 1;
|
|
5427
5470
|
ocrBlocks = await ocrPages(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
5428
5471
|
} catch {
|
|
@@ -5456,7 +5499,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
5456
5499
|
blocks.splice(removed[ri], 1);
|
|
5457
5500
|
}
|
|
5458
5501
|
}
|
|
5459
|
-
const medianFontSize =
|
|
5502
|
+
const medianFontSize = computeMedianFromFreq(fontSizeFreq);
|
|
5460
5503
|
if (medianFontSize > 0) {
|
|
5461
5504
|
detectHeadings(blocks, medianFontSize);
|
|
5462
5505
|
}
|
|
@@ -5520,11 +5563,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
|
|
|
5520
5563
|
}
|
|
5521
5564
|
return { visible, hiddenCount };
|
|
5522
5565
|
}
|
|
5523
|
-
function
|
|
5524
|
-
if (
|
|
5525
|
-
const
|
|
5526
|
-
|
|
5527
|
-
|
|
5566
|
+
function computeMedianFromFreq(freq) {
|
|
5567
|
+
if (freq.size === 0) return 0;
|
|
5568
|
+
const entries = [...freq.entries()].sort((a, b) => a[0] - b[0]);
|
|
5569
|
+
let total = 0;
|
|
5570
|
+
for (const [, count] of entries) total += count;
|
|
5571
|
+
const mid = total / 2;
|
|
5572
|
+
let cumulative = 0;
|
|
5573
|
+
for (const [size, count] of entries) {
|
|
5574
|
+
cumulative += count;
|
|
5575
|
+
if (cumulative >= mid) return size;
|
|
5576
|
+
}
|
|
5577
|
+
return 0;
|
|
5528
5578
|
}
|
|
5529
5579
|
function detectHeadings(blocks, medianFontSize) {
|
|
5530
5580
|
for (const block of blocks) {
|
|
@@ -6330,6 +6380,7 @@ var MAX_SHEETS = 100;
|
|
|
6330
6380
|
var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
|
|
6331
6381
|
var MAX_ROWS2 = 1e4;
|
|
6332
6382
|
var MAX_COLS2 = 200;
|
|
6383
|
+
var MAX_TOTAL_CELLS = 2e6;
|
|
6333
6384
|
function cleanNumericValue(raw) {
|
|
6334
6385
|
if (!/^-?\d+\.\d+$/.test(raw)) return raw;
|
|
6335
6386
|
const num = parseFloat(raw);
|
|
@@ -6513,9 +6564,9 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
6513
6564
|
}
|
|
6514
6565
|
return blocks;
|
|
6515
6566
|
}
|
|
6516
|
-
async function parseXlsxDocument(buffer, options) {
|
|
6567
|
+
async function parseXlsxDocument(buffer, options, existingZip) {
|
|
6517
6568
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
|
|
6518
|
-
const zip = await JSZip3.loadAsync(buffer);
|
|
6569
|
+
const zip = existingZip ?? await JSZip3.loadAsync(buffer);
|
|
6519
6570
|
const warnings = [];
|
|
6520
6571
|
const workbookFile = zip.file("xl/workbook.xml");
|
|
6521
6572
|
if (!workbookFile) {
|
|
@@ -6542,6 +6593,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
6542
6593
|
}
|
|
6543
6594
|
const blocks = [];
|
|
6544
6595
|
const processedSheets = Math.min(sheets.length, MAX_SHEETS);
|
|
6596
|
+
let totalCells = 0;
|
|
6545
6597
|
for (let i = 0; i < processedSheets; i++) {
|
|
6546
6598
|
if (pageFilter && !pageFilter.has(i + 1)) continue;
|
|
6547
6599
|
const sheet = sheets[i];
|
|
@@ -6568,6 +6620,11 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
6568
6620
|
try {
|
|
6569
6621
|
const sheetXml = await sheetFile.async("text");
|
|
6570
6622
|
const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
|
|
6623
|
+
totalCells += maxRow * maxCol;
|
|
6624
|
+
if (totalCells > MAX_TOTAL_CELLS) {
|
|
6625
|
+
warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
|
|
6626
|
+
break;
|
|
6627
|
+
}
|
|
6571
6628
|
const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
|
|
6572
6629
|
blocks.push(...sheetBlocks);
|
|
6573
6630
|
} catch (err) {
|
|
@@ -6651,10 +6708,35 @@ function getAttr(el, localName) {
|
|
|
6651
6708
|
function parseXml2(text) {
|
|
6652
6709
|
return new DOMParser3().parseFromString(text, "text/xml");
|
|
6653
6710
|
}
|
|
6711
|
+
function buildElementIndex(root) {
|
|
6712
|
+
const index = /* @__PURE__ */ new Map();
|
|
6713
|
+
const walk = (node) => {
|
|
6714
|
+
const children = node.childNodes;
|
|
6715
|
+
for (let i = 0; i < children.length; i++) {
|
|
6716
|
+
const child = children[i];
|
|
6717
|
+
if (child.nodeType === 1) {
|
|
6718
|
+
const el = child;
|
|
6719
|
+
const name = el.localName ?? "";
|
|
6720
|
+
if (name) {
|
|
6721
|
+
let list = index.get(name);
|
|
6722
|
+
if (!list) {
|
|
6723
|
+
list = [];
|
|
6724
|
+
index.set(name, list);
|
|
6725
|
+
}
|
|
6726
|
+
list.push(el);
|
|
6727
|
+
}
|
|
6728
|
+
walk(el);
|
|
6729
|
+
}
|
|
6730
|
+
}
|
|
6731
|
+
};
|
|
6732
|
+
walk(root);
|
|
6733
|
+
return index;
|
|
6734
|
+
}
|
|
6654
6735
|
function parseStyles(xml) {
|
|
6655
6736
|
const doc = parseXml2(xml);
|
|
6656
6737
|
const styles = /* @__PURE__ */ new Map();
|
|
6657
|
-
const
|
|
6738
|
+
const idx = buildElementIndex(doc);
|
|
6739
|
+
const styleElements = idx.get("style") ?? [];
|
|
6658
6740
|
for (const el of styleElements) {
|
|
6659
6741
|
const styleId = getAttr(el, "styleId");
|
|
6660
6742
|
if (!styleId) continue;
|
|
@@ -6682,7 +6764,8 @@ function parseStyles(xml) {
|
|
|
6682
6764
|
function parseNumbering(xml) {
|
|
6683
6765
|
const doc = parseXml2(xml);
|
|
6684
6766
|
const abstractNums = /* @__PURE__ */ new Map();
|
|
6685
|
-
const
|
|
6767
|
+
const idx = buildElementIndex(doc);
|
|
6768
|
+
const abstractElements = idx.get("abstractNum") ?? [];
|
|
6686
6769
|
for (const el of abstractElements) {
|
|
6687
6770
|
const abstractNumId = getAttr(el, "abstractNumId");
|
|
6688
6771
|
if (!abstractNumId) continue;
|
|
@@ -6697,7 +6780,7 @@ function parseNumbering(xml) {
|
|
|
6697
6780
|
abstractNums.set(abstractNumId, levels);
|
|
6698
6781
|
}
|
|
6699
6782
|
const nums = /* @__PURE__ */ new Map();
|
|
6700
|
-
const numElements =
|
|
6783
|
+
const numElements = idx.get("num") ?? [];
|
|
6701
6784
|
for (const el of numElements) {
|
|
6702
6785
|
const numId = getAttr(el, "numId");
|
|
6703
6786
|
if (!numId) continue;
|
|
@@ -6941,9 +7024,9 @@ async function extractImages(zip, rels, doc) {
|
|
|
6941
7024
|
}
|
|
6942
7025
|
return { blocks, images };
|
|
6943
7026
|
}
|
|
6944
|
-
async function parseDocxDocument(buffer, options) {
|
|
7027
|
+
async function parseDocxDocument(buffer, options, existingZip) {
|
|
6945
7028
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
|
|
6946
|
-
const zip = await JSZip4.loadAsync(buffer);
|
|
7029
|
+
const zip = existingZip ?? await JSZip4.loadAsync(buffer);
|
|
6947
7030
|
const warnings = [];
|
|
6948
7031
|
const docFile = zip.file("word/document.xml");
|
|
6949
7032
|
if (!docFile) {
|
|
@@ -9378,25 +9461,86 @@ async function parse2(input, options) {
|
|
|
9378
9461
|
if (!buffer || buffer.byteLength === 0) {
|
|
9379
9462
|
return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
|
|
9380
9463
|
}
|
|
9464
|
+
const MAX_FILE_SIZE = 500 * 1024 * 1024;
|
|
9465
|
+
if (buffer.byteLength > MAX_FILE_SIZE) {
|
|
9466
|
+
return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
|
|
9467
|
+
}
|
|
9381
9468
|
const format = detectFormat(buffer);
|
|
9382
9469
|
switch (format) {
|
|
9383
9470
|
case "hwpx": {
|
|
9384
|
-
const zipFormat = await detectZipFormat(buffer);
|
|
9385
|
-
if (zipFormat === "xlsx") return parseXlsx(buffer, options);
|
|
9386
|
-
if (zipFormat === "docx") return parseDocx(buffer, options);
|
|
9387
|
-
return parseHwpx(buffer, options);
|
|
9471
|
+
const { format: zipFormat, zip } = await detectZipFormat(buffer);
|
|
9472
|
+
if (zipFormat === "xlsx") return parseXlsx(buffer, options, zip ?? void 0);
|
|
9473
|
+
if (zipFormat === "docx") return parseDocx(buffer, options, zip ?? void 0);
|
|
9474
|
+
return parseHwpx(buffer, options, zip ?? void 0);
|
|
9388
9475
|
}
|
|
9389
9476
|
case "hwp":
|
|
9390
9477
|
return parseHwp(buffer, options);
|
|
9391
9478
|
case "pdf":
|
|
9392
9479
|
return parsePdf(buffer, options);
|
|
9480
|
+
case "image":
|
|
9481
|
+
return parseImage(buffer, options);
|
|
9393
9482
|
default:
|
|
9394
9483
|
return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
|
|
9395
9484
|
}
|
|
9396
9485
|
}
|
|
9397
|
-
async function
|
|
9486
|
+
async function parseImage(buffer, options) {
|
|
9487
|
+
const ocrMode = options?.ocrMode || "auto";
|
|
9488
|
+
if (ocrMode === "off") {
|
|
9489
|
+
return { success: false, fileType: "image", error: "OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC774\uBBF8\uC9C0 \uD30C\uC77C\uC744 \uCC98\uB9AC\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "IMAGE_BASED_PDF" };
|
|
9490
|
+
}
|
|
9491
|
+
let ocrProvider;
|
|
9492
|
+
let actualOcrMode = "auto";
|
|
9493
|
+
try {
|
|
9494
|
+
if (ocrMode === "gemini" || ocrMode === "claude" || ocrMode === "codex" || ocrMode === "ollama") {
|
|
9495
|
+
ocrProvider = createCliOcrProvider(ocrMode);
|
|
9496
|
+
actualOcrMode = ocrMode;
|
|
9497
|
+
} else if (ocrMode === "tesseract") {
|
|
9498
|
+
ocrProvider = await createTesseractProvider();
|
|
9499
|
+
actualOcrMode = ocrMode;
|
|
9500
|
+
} else if (ocrMode === "auto") {
|
|
9501
|
+
const modesToTry = ["gemini", "claude", "codex", "ollama"];
|
|
9502
|
+
for (const mode of modesToTry) {
|
|
9503
|
+
try {
|
|
9504
|
+
ocrProvider = createCliOcrProvider(mode);
|
|
9505
|
+
actualOcrMode = mode;
|
|
9506
|
+
break;
|
|
9507
|
+
} catch (e) {
|
|
9508
|
+
console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
|
|
9509
|
+
}
|
|
9510
|
+
}
|
|
9511
|
+
if (!ocrProvider) {
|
|
9512
|
+
ocrProvider = await createTesseractProvider();
|
|
9513
|
+
actualOcrMode = "tesseract";
|
|
9514
|
+
}
|
|
9515
|
+
}
|
|
9516
|
+
if (!ocrProvider) {
|
|
9517
|
+
return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
|
|
9518
|
+
}
|
|
9519
|
+
const imageUint8Array = new Uint8Array(buffer);
|
|
9520
|
+
const ocrResult = await ocrProvider(imageUint8Array, 1, "image/png");
|
|
9521
|
+
if (ocrProvider.terminate) {
|
|
9522
|
+
await ocrProvider.terminate();
|
|
9523
|
+
}
|
|
9524
|
+
const markdown = typeof ocrResult === "string" ? ocrResult : ocrResult.markdown;
|
|
9525
|
+
const blocks = markdownToBlocks(markdown, 1);
|
|
9526
|
+
return {
|
|
9527
|
+
success: true,
|
|
9528
|
+
fileType: "image",
|
|
9529
|
+
markdown,
|
|
9530
|
+
blocks,
|
|
9531
|
+
isImageBased: true,
|
|
9532
|
+
warnings: [{ message: `OCR \uCC98\uB9AC\uB428 (${actualOcrMode})`, code: "OCR_FALLBACK" }]
|
|
9533
|
+
};
|
|
9534
|
+
} catch (err) {
|
|
9535
|
+
if (ocrProvider && ocrProvider.terminate) {
|
|
9536
|
+
await ocrProvider.terminate();
|
|
9537
|
+
}
|
|
9538
|
+
return { success: false, fileType: "image", error: err instanceof Error ? err.message : "\uC774\uBBF8\uC9C0 OCR \uC2E4\uD328", code: classifyError(err) };
|
|
9539
|
+
}
|
|
9540
|
+
}
|
|
9541
|
+
async function parseHwpx(buffer, options, zip) {
|
|
9398
9542
|
try {
|
|
9399
|
-
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
|
|
9543
|
+
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
|
|
9400
9544
|
return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
9401
9545
|
} catch (err) {
|
|
9402
9546
|
return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
@@ -9419,17 +9563,17 @@ async function parsePdf(buffer, options) {
|
|
|
9419
9563
|
return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
|
|
9420
9564
|
}
|
|
9421
9565
|
}
|
|
9422
|
-
async function parseXlsx(buffer, options) {
|
|
9566
|
+
async function parseXlsx(buffer, options, zip) {
|
|
9423
9567
|
try {
|
|
9424
|
-
const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options);
|
|
9568
|
+
const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
|
|
9425
9569
|
return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
|
|
9426
9570
|
} catch (err) {
|
|
9427
9571
|
return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
9428
9572
|
}
|
|
9429
9573
|
}
|
|
9430
|
-
async function parseDocx(buffer, options) {
|
|
9574
|
+
async function parseDocx(buffer, options, zip) {
|
|
9431
9575
|
try {
|
|
9432
|
-
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
|
|
9576
|
+
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
|
|
9433
9577
|
return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
9434
9578
|
} catch (err) {
|
|
9435
9579
|
return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
@@ -9624,4 +9768,4 @@ export {
|
|
|
9624
9768
|
cfb/cfb.js:
|
|
9625
9769
|
(*! crc32.js (C) 2014-present SheetJS -- http://sheetjs.com *)
|
|
9626
9770
|
*/
|
|
9627
|
-
//# sourceMappingURL=chunk-
|
|
9771
|
+
//# sourceMappingURL=chunk-2GFJFTKS.js.map
|