@clazic/kordoc 2.2.0 → 2.2.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +2 -2
- package/dist/{chunk-JJMA5HGQ.js → chunk-64QPUEYH.js} +4 -4
- package/dist/{chunk-XWET7ONC.js → chunk-UPJWEES3.js} +2 -2
- package/dist/cli.js +5 -5
- package/dist/index.cjs +30 -51
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +28 -49
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +2 -2
- package/dist/{provider-XVKP5OGI.js → provider-EPHXUWRL.js} +2 -8
- package/dist/{provider-XVKP5OGI.js.map → provider-EPHXUWRL.js.map} +1 -1
- package/dist/{resolve-Y3KMGD3R.js → resolve-Z4DEPDUS.js} +27 -35
- package/dist/resolve-Z4DEPDUS.js.map +1 -0
- package/dist/{tesseract-provider-MZ37ZKQW.js → tesseract-provider-UNJOI25M.js} +3 -10
- package/dist/tesseract-provider-UNJOI25M.js.map +1 -0
- package/dist/{utils-4NP2VUFW.js → utils-TPAR37RJ.js} +2 -2
- package/dist/{watch-4VVWG2WC.js → watch-FEW5NWVC.js} +3 -3
- package/package.json +3 -1
- package/dist/resolve-Y3KMGD3R.js.map +0 -1
- package/dist/tesseract-provider-MZ37ZKQW.js.map +0 -1
- /package/dist/{chunk-JJMA5HGQ.js.map → chunk-64QPUEYH.js.map} +0 -0
- /package/dist/{chunk-XWET7ONC.js.map → chunk-UPJWEES3.js.map} +0 -0
- /package/dist/{utils-4NP2VUFW.js.map → utils-TPAR37RJ.js.map} +0 -0
- /package/dist/{watch-4VVWG2WC.js.map → watch-FEW5NWVC.js.map} +0 -0
package/dist/mcp.js
CHANGED
|
@@ -10,13 +10,13 @@ import {
|
|
|
10
10
|
markdownToHwpx,
|
|
11
11
|
markdownToXlsx,
|
|
12
12
|
parse
|
|
13
|
-
} from "./chunk-JJMA5HGQ.js";
|
|
13
|
+
} from "./chunk-64QPUEYH.js";
|
|
14
14
|
import {
|
|
15
15
|
KordocError,
|
|
16
16
|
VERSION,
|
|
17
17
|
sanitizeError,
|
|
18
18
|
toArrayBuffer
|
|
19
|
-
} from "./chunk-XWET7ONC.js";
|
|
19
|
+
} from "./chunk-UPJWEES3.js";
|
|
20
20
|
import "./chunk-MOL7MDBG.js";
|
|
21
21
|
import "./chunk-ZWE3DS7E.js";
|
|
22
22
|
|
|
@@ -147,13 +147,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings)
|
|
|
147
147
|
return blocks;
|
|
148
148
|
}
|
|
149
149
|
async function renderPageToPng(page) {
|
|
150
|
-
|
|
151
|
-
try {
|
|
152
|
-
const canvasModule = await import("canvas");
|
|
153
|
-
createCanvas = canvasModule.createCanvas;
|
|
154
|
-
} catch {
|
|
155
|
-
throw new Error("OCR\uC744 \uC0AC\uC6A9\uD558\uB824\uBA74 'canvas' \uD328\uD0A4\uC9C0\uB97C \uC124\uCE58\uD558\uC138\uC694: npm install canvas");
|
|
156
|
-
}
|
|
150
|
+
const { createCanvas } = await import("@napi-rs/canvas");
|
|
157
151
|
const scale = 2;
|
|
158
152
|
const viewport = page.getViewport({ scale });
|
|
159
153
|
const canvas = createCanvas(Math.floor(viewport.width), Math.floor(viewport.height));
|
|
@@ -164,4 +158,4 @@ async function renderPageToPng(page) {
|
|
|
164
158
|
export {
|
|
165
159
|
ocrPages
|
|
166
160
|
};
|
|
167
|
-
//# sourceMappingURL=provider-XVKP5OGI.js.map
|
|
161
|
+
//# sourceMappingURL=provider-EPHXUWRL.js.map
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/ocr/markdown-to-blocks.ts","../src/ocr/provider.ts"],"sourcesContent":["/**\n * Markdown → IRBlock[] 역파싱\n *\n * Vision LLM(gemini/claude/codex 등)이 반환한 Markdown 문자열을\n * kordoc의 IRBlock[] 중간 표현으로 변환.\n * 기존 blocksToMarkdown()의 역방향 처리.\n */\n\nimport type { IRBlock, IRTable, IRCell } from \"../types.js\"\n\n/**\n * Markdown 문자열을 IRBlock[] 배열로 변환.\n *\n * 지원 요소:\n * - 헤딩: # ~ ######\n * - 테이블: | col1 | col2 | (파이프 구분, |---|---| 구분선 포함)\n * - 순서/비순서 리스트: - / 1.\n * - 구분선: ---, ***, ___\n * - 일반 텍스트 (paragraph)\n */\nexport function markdownToBlocks(markdown: string, pageNumber: number): IRBlock[] {\n const blocks: IRBlock[] = []\n const lines = markdown.split(\"\\n\")\n let i = 0\n\n while (i < lines.length) {\n const line = lines[i]\n\n // 빈 줄 스킵\n if (line.trim() === \"\") {\n i++\n continue\n }\n\n // 1. 헤딩: # ~ ######\n const headingMatch = line.match(/^(#{1,6})\\s+(.+)$/)\n if (headingMatch) {\n blocks.push({\n type: \"heading\",\n level: headingMatch[1].length,\n text: headingMatch[2].trim(),\n pageNumber,\n })\n i++\n continue\n }\n\n // 2. 구분선: ---, ***, ___\n if (/^[-*_]{3,}\\s*$/.test(line.trim())) {\n blocks.push({ type: \"separator\", pageNumber })\n i++\n continue\n }\n\n // 3. 테이블: | 로 시작하는 연속 행 수집\n if (line.trim().startsWith(\"|\")) {\n const tableLines: string[] = []\n while (i < lines.length && lines[i].trim().startsWith(\"|\")) {\n tableLines.push(lines[i])\n i++\n }\n const table = parseMarkdownTable(tableLines)\n if (table) {\n blocks.push({ type: \"table\", table, pageNumber })\n }\n continue\n }\n\n // 4. 비순서 리스트: -, *, +\n const ulMatch = line.match(/^(\\s*)[-*+]\\s+(.+)$/)\n if (ulMatch) {\n blocks.push({\n type: \"list\",\n listType: \"unordered\",\n text: ulMatch[2].trim(),\n pageNumber,\n })\n i++\n continue\n }\n\n // 5. 
순서 리스트: 1.\n const olMatch = line.match(/^(\\s*)\\d+\\.\\s+(.+)$/)\n if (olMatch) {\n blocks.push({\n type: \"list\",\n listType: \"ordered\",\n text: olMatch[2].trim(),\n pageNumber,\n })\n i++\n continue\n }\n\n // 6. 일반 텍스트 — 구조적 행이 나올 때까지 병합\n const paraLines: string[] = []\n while (i < lines.length && lines[i].trim() !== \"\" && !isStructuralLine(lines[i])) {\n paraLines.push(lines[i].trim())\n i++\n }\n if (paraLines.length > 0) {\n blocks.push({\n type: \"paragraph\",\n text: paraLines.join(\"\\n\"),\n pageNumber,\n })\n }\n }\n\n return blocks\n}\n\n/**\n * 구조적 행 판별 — paragraph 병합 중단 트리거.\n */\nfunction isStructuralLine(line: string): boolean {\n if (/^#{1,6}\\s+/.test(line)) return true\n if (line.trim().startsWith(\"|\")) return true\n if (/^[-*_]{3,}\\s*$/.test(line.trim())) return true\n if (/^\\s*[-*+]\\s+/.test(line)) return true\n if (/^\\s*\\d+\\.\\s+/.test(line)) return true\n return false\n}\n\n/**\n * Markdown 테이블 행 배열을 IRTable로 변환.\n *\n * 구분선 행(|---|---|)은 제거 후 데이터 행만 파싱.\n * hasHeader: 구분선이 있었으면 true.\n */\nfunction parseMarkdownTable(lines: string[]): IRTable | null {\n const hasSeparator = lines.some(line => /^\\|[\\s:|-]+\\|$/.test(line.trim()))\n\n const rows: IRCell[][] = []\n let maxCols = 0\n\n for (const line of lines) {\n // 구분선 행 스킵: |---|---| 패턴\n if (/^\\|\\s*:?-+:?\\s*(\\|\\s*:?-+:?\\s*)+\\|?\\s*$/.test(line.trim())) continue\n\n const parts = line.split(\"|\")\n // 앞뒤 빈 요소 제거 (| 로 시작/종료하는 행)\n const cells: IRCell[] = parts\n .slice(1, parts[parts.length - 1].trim() === \"\" ? 
-1 : undefined)\n .map(cell => ({\n text: cell.trim(),\n colSpan: 1,\n rowSpan: 1,\n }))\n\n if (cells.length > 0) {\n rows.push(cells)\n maxCols = Math.max(maxCols, cells.length)\n }\n }\n\n if (rows.length === 0) return null\n\n // 열 수 통일 (부족한 셀은 빈 셀로 채움)\n for (const row of rows) {\n while (row.length < maxCols) {\n row.push({ text: \"\", colSpan: 1, rowSpan: 1 })\n }\n }\n\n return {\n rows: rows.length,\n cols: maxCols,\n cells: rows,\n hasHeader: hasSeparator && rows.length > 1,\n }\n}\n","/**\n * OCR 프로바이더 브릿지 — PDF 페이지를 이미지로 렌더링하여 OCR 호출\n *\n * kordoc은 OCR 라이브러리를 번들하지 않음.\n * 사용자가 OcrProvider 함수를 제공하면 이미지 기반 PDF도 텍스트 추출 가능.\n *\n * @example\n * ```ts\n * import { parse } from \"kordoc\"\n *\n * const result = await parse(buffer, {\n * ocr: async (pageImage, pageNumber, mimeType) => {\n * // Tesseract, Claude Vision, Google Vision 등 사용\n * return await myOcrService.recognize(pageImage)\n * }\n * })\n * ```\n */\n\nimport type { OcrProvider, IRBlock, ParseWarning, StructuredOcrResult } from \"../types.js\"\nimport { markdownToBlocks } from \"./markdown-to-blocks.js\"\n\n/**\n * 이미지 기반 PDF 페이지에 OCR을 적용하여 IRBlock[] 반환.\n *\n * pdfjs page 객체에서 viewport + render를 통해 PNG 생성 후\n * 사용자 제공 OcrProvider 호출.\n *\n * - string 반환: 단순 텍스트 → paragraph 블록\n * - StructuredOcrResult 반환: Markdown → markdownToBlocks()로 구조화\n *\n * canvas 미설치 시 pdfjs render 불가하므로 에러 반환.\n */\nexport async function ocrPages(\n doc: { numPages: number; getPage(n: number): Promise<PdfPageProxy> },\n provider: OcrProvider,\n pageFilter: Set<number> | null,\n effectivePageCount: number,\n warnings?: ParseWarning[]\n): Promise<IRBlock[]> {\n const blocks: IRBlock[] = []\n\n for (let i = 1; i <= effectivePageCount; i++) {\n if (pageFilter && !pageFilter.has(i)) continue\n const page = await doc.getPage(i)\n try {\n const imageData = await renderPageToPng(page)\n const result = await provider(imageData, i, \"image/png\")\n\n if (typeof result === \"string\") {\n // 기존 동작: 순수 텍스트 → paragraph 블록\n if 
(result.trim()) {\n blocks.push({ type: \"paragraph\", text: result.trim(), pageNumber: i })\n }\n } else if (result && typeof result === \"object\" && \"markdown\" in result) {\n // 신규: 구조화된 결과 → Markdown → IRBlock[]\n const structured = result as StructuredOcrResult\n if (structured.markdown.trim()) {\n const pageBlocks = markdownToBlocks(structured.markdown, i)\n for (const b of pageBlocks) blocks.push(b)\n }\n }\n } catch (err) {\n // 개별 페이지 실패 시 경고 발행 후 계속 진행\n warnings?.push({\n page: i,\n message: `페이지 ${i} OCR 실패: ${err instanceof Error ? err.message : \"알 수 없는 오류\"}`,\n code: \"OCR_PAGE_FAILED\",\n })\n }\n }\n\n return blocks\n}\n\ninterface PdfPageProxy {\n getViewport(params: { scale: number }): { width: number; height: number }\n render(params: { canvasContext: unknown; viewport: unknown }): { promise: Promise<void> }\n}\n\n/**\n * PDF 페이지를 PNG로 렌더링.\n * node-canvas가 설치되어 있어야 동작.\n * 미설치 시 에러 throw → 호출측에서 catch.\n */\nasync function renderPageToPng(page: PdfPageProxy): Promise<Uint8Array> {\n // node-canvas 동적 로드 (선택적 의존성)\n let createCanvas: (w: number, h: number) => { getContext(t: string): unknown; toBuffer(t: string): Buffer }\n try {\n const canvasModule = await import(\"canvas\")\n createCanvas = canvasModule.createCanvas\n } catch {\n throw new Error(\"OCR을 사용하려면 'canvas' 패키지를 설치하세요: npm install canvas\")\n }\n\n const scale = 2.0 // 300 DPI 근사\n const viewport = page.getViewport({ scale })\n const canvas = createCanvas(Math.floor(viewport.width), Math.floor(viewport.height))\n const ctx = canvas.getContext(\"2d\")\n\n await page.render({ canvasContext: ctx, viewport }).promise\n return new 
Uint8Array(canvas.toBuffer(\"image/png\"))\n}\n"],"mappings":";;;;AAoBO,SAAS,iBAAiB,UAAkB,YAA+B;AAChF,QAAM,SAAoB,CAAC;AAC3B,QAAM,QAAQ,SAAS,MAAM,IAAI;AACjC,MAAI,IAAI;AAER,SAAO,IAAI,MAAM,QAAQ;AACvB,UAAM,OAAO,MAAM,CAAC;AAGpB,QAAI,KAAK,KAAK,MAAM,IAAI;AACtB;AACA;AAAA,IACF;AAGA,UAAM,eAAe,KAAK,MAAM,mBAAmB;AACnD,QAAI,cAAc;AAChB,aAAO,KAAK;AAAA,QACV,MAAM;AAAA,QACN,OAAO,aAAa,CAAC,EAAE;AAAA,QACvB,MAAM,aAAa,CAAC,EAAE,KAAK;AAAA,QAC3B;AAAA,MACF,CAAC;AACD;AACA;AAAA,IACF;AAGA,QAAI,iBAAiB,KAAK,KAAK,KAAK,CAAC,GAAG;AACtC,aAAO,KAAK,EAAE,MAAM,aAAa,WAAW,CAAC;AAC7C;AACA;AAAA,IACF;AAGA,QAAI,KAAK,KAAK,EAAE,WAAW,GAAG,GAAG;AAC/B,YAAM,aAAuB,CAAC;AAC9B,aAAO,IAAI,MAAM,UAAU,MAAM,CAAC,EAAE,KAAK,EAAE,WAAW,GAAG,GAAG;AAC1D,mBAAW,KAAK,MAAM,CAAC,CAAC;AACxB;AAAA,MACF;AACA,YAAM,QAAQ,mBAAmB,UAAU;AAC3C,UAAI,OAAO;AACT,eAAO,KAAK,EAAE,MAAM,SAAS,OAAO,WAAW,CAAC;AAAA,MAClD;AACA;AAAA,IACF;AAGA,UAAM,UAAU,KAAK,MAAM,qBAAqB;AAChD,QAAI,SAAS;AACX,aAAO,KAAK;AAAA,QACV,MAAM;AAAA,QACN,UAAU;AAAA,QACV,MAAM,QAAQ,CAAC,EAAE,KAAK;AAAA,QACtB;AAAA,MACF,CAAC;AACD;AACA;AAAA,IACF;AAGA,UAAM,UAAU,KAAK,MAAM,qBAAqB;AAChD,QAAI,SAAS;AACX,aAAO,KAAK;AAAA,QACV,MAAM;AAAA,QACN,UAAU;AAAA,QACV,MAAM,QAAQ,CAAC,EAAE,KAAK;AAAA,QACtB;AAAA,MACF,CAAC;AACD;AACA;AAAA,IACF;AAGA,UAAM,YAAsB,CAAC;AAC7B,WAAO,IAAI,MAAM,UAAU,MAAM,CAAC,EAAE,KAAK,MAAM,MAAM,CAAC,iBAAiB,MAAM,CAAC,CAAC,GAAG;AAChF,gBAAU,KAAK,MAAM,CAAC,EAAE,KAAK,CAAC;AAC9B;AAAA,IACF;AACA,QAAI,UAAU,SAAS,GAAG;AACxB,aAAO,KAAK;AAAA,QACV,MAAM;AAAA,QACN,MAAM,UAAU,KAAK,IAAI;AAAA,QACzB;AAAA,MACF,CAAC;AAAA,IACH;AAAA,EACF;AAEA,SAAO;AACT;AAKA,SAAS,iBAAiB,MAAuB;AAC/C,MAAI,aAAa,KAAK,IAAI,EAAG,QAAO;AACpC,MAAI,KAAK,KAAK,EAAE,WAAW,GAAG,EAAG,QAAO;AACxC,MAAI,iBAAiB,KAAK,KAAK,KAAK,CAAC,EAAG,QAAO;AAC/C,MAAI,eAAe,KAAK,IAAI,EAAG,QAAO;AACtC,MAAI,eAAe,KAAK,IAAI,EAAG,QAAO;AACtC,SAAO;AACT;AAQA,SAAS,mBAAmB,OAAiC;AAC3D,QAAM,eAAe,MAAM,KAAK,UAAQ,iBAAiB,KAAK,KAAK,KAAK,CAAC,CAAC;AAE1E,QAAM,OAAmB,CAAC;AAC1B,MAAI,UAAU;AAEd,aAAW,QAAQ,OAAO;AAExB,QAAI,0CAA0C,KAAK,KAAK,KAAK,CAAC,EAAG;AAEjE,UAAM,QAAQ,KAAK,MAAM,GAAG;AAE5B,UAAM,QAAkB,MACrB,MAA
M,GAAG,MAAM,MAAM,SAAS,CAAC,EAAE,KAAK,MAAM,KAAK,KAAK,MAAS,EAC/D,IAAI,WAAS;AAAA,MACZ,MAAM,KAAK,KAAK;AAAA,MAChB,SAAS;AAAA,MACT,SAAS;AAAA,IACX,EAAE;AAEJ,QAAI,MAAM,SAAS,GAAG;AACpB,WAAK,KAAK,KAAK;AACf,gBAAU,KAAK,IAAI,SAAS,MAAM,MAAM;AAAA,IAC1C;AAAA,EACF;AAEA,MAAI,KAAK,WAAW,EAAG,QAAO;AAG9B,aAAW,OAAO,MAAM;AACtB,WAAO,IAAI,SAAS,SAAS;AAC3B,UAAI,KAAK,EAAE,MAAM,IAAI,SAAS,GAAG,SAAS,EAAE,CAAC;AAAA,IAC/C;AAAA,EACF;AAEA,SAAO;AAAA,IACL,MAAM,KAAK;AAAA,IACX,MAAM;AAAA,IACN,OAAO;AAAA,IACP,WAAW,gBAAgB,KAAK,SAAS;AAAA,EAC3C;AACF;;;AC1IA,eAAsB,SACpB,KACA,UACA,YACA,oBACA,UACoB;AACpB,QAAM,SAAoB,CAAC;AAE3B,WAAS,IAAI,GAAG,KAAK,oBAAoB,KAAK;AAC5C,QAAI,cAAc,CAAC,WAAW,IAAI,CAAC,EAAG;AACtC,UAAM,OAAO,MAAM,IAAI,QAAQ,CAAC;AAChC,QAAI;AACF,YAAM,YAAY,MAAM,gBAAgB,IAAI;AAC5C,YAAM,SAAS,MAAM,SAAS,WAAW,GAAG,WAAW;AAEvD,UAAI,OAAO,WAAW,UAAU;AAE9B,YAAI,OAAO,KAAK,GAAG;AACjB,iBAAO,KAAK,EAAE,MAAM,aAAa,MAAM,OAAO,KAAK,GAAG,YAAY,EAAE,CAAC;AAAA,QACvE;AAAA,MACF,WAAW,UAAU,OAAO,WAAW,YAAY,cAAc,QAAQ;AAEvE,cAAM,aAAa;AACnB,YAAI,WAAW,SAAS,KAAK,GAAG;AAC9B,gBAAM,aAAa,iBAAiB,WAAW,UAAU,CAAC;AAC1D,qBAAW,KAAK,WAAY,QAAO,KAAK,CAAC;AAAA,QAC3C;AAAA,MACF;AAAA,IACF,SAAS,KAAK;AAEZ,gBAAU,KAAK;AAAA,QACb,MAAM;AAAA,QACN,SAAS,sBAAO,CAAC,sBAAY,eAAe,QAAQ,IAAI,UAAU,yCAAW;AAAA,QAC7E,MAAM;AAAA,MACR,CAAC;AAAA,IACH;AAAA,EACF;AAEA,SAAO;AACT;AAYA,eAAe,gBAAgB,MAAyC;AAEtE,MAAI;AACJ,MAAI;AACF,UAAM,eAAe,MAAM,OAAO,QAAQ;AAC1C,mBAAe,aAAa;AAAA,EAC9B,QAAQ;AACN,UAAM,IAAI,MAAM,+HAAoD;AAAA,EACtE;AAEA,QAAM,QAAQ;AACd,QAAM,WAAW,KAAK,YAAY,EAAE,MAAM,CAAC;AAC3C,QAAM,SAAS,aAAa,KAAK,MAAM,SAAS,KAAK,GAAG,KAAK,MAAM,SAAS,MAAM,CAAC;AACnF,QAAM,MAAM,OAAO,WAAW,IAAI;AAElC,QAAM,KAAK,OAAO,EAAE,eAAe,KAAK,SAAS,CAAC,EAAE;AACpD,SAAO,IAAI,WAAW,OAAO,SAAS,WAAW,CAAC;AACpD;","names":[]}
|
|
1
|
+
{"version":3,"sources":["../src/ocr/markdown-to-blocks.ts","../src/ocr/provider.ts"],"sourcesContent":["/**\n * Markdown → IRBlock[] 역파싱\n *\n * Vision LLM(gemini/claude/codex 등)이 반환한 Markdown 문자열을\n * kordoc의 IRBlock[] 중간 표현으로 변환.\n * 기존 blocksToMarkdown()의 역방향 처리.\n */\n\nimport type { IRBlock, IRTable, IRCell } from \"../types.js\"\n\n/**\n * Markdown 문자열을 IRBlock[] 배열로 변환.\n *\n * 지원 요소:\n * - 헤딩: # ~ ######\n * - 테이블: | col1 | col2 | (파이프 구분, |---|---| 구분선 포함)\n * - 순서/비순서 리스트: - / 1.\n * - 구분선: ---, ***, ___\n * - 일반 텍스트 (paragraph)\n */\nexport function markdownToBlocks(markdown: string, pageNumber: number): IRBlock[] {\n const blocks: IRBlock[] = []\n const lines = markdown.split(\"\\n\")\n let i = 0\n\n while (i < lines.length) {\n const line = lines[i]\n\n // 빈 줄 스킵\n if (line.trim() === \"\") {\n i++\n continue\n }\n\n // 1. 헤딩: # ~ ######\n const headingMatch = line.match(/^(#{1,6})\\s+(.+)$/)\n if (headingMatch) {\n blocks.push({\n type: \"heading\",\n level: headingMatch[1].length,\n text: headingMatch[2].trim(),\n pageNumber,\n })\n i++\n continue\n }\n\n // 2. 구분선: ---, ***, ___\n if (/^[-*_]{3,}\\s*$/.test(line.trim())) {\n blocks.push({ type: \"separator\", pageNumber })\n i++\n continue\n }\n\n // 3. 테이블: | 로 시작하는 연속 행 수집\n if (line.trim().startsWith(\"|\")) {\n const tableLines: string[] = []\n while (i < lines.length && lines[i].trim().startsWith(\"|\")) {\n tableLines.push(lines[i])\n i++\n }\n const table = parseMarkdownTable(tableLines)\n if (table) {\n blocks.push({ type: \"table\", table, pageNumber })\n }\n continue\n }\n\n // 4. 비순서 리스트: -, *, +\n const ulMatch = line.match(/^(\\s*)[-*+]\\s+(.+)$/)\n if (ulMatch) {\n blocks.push({\n type: \"list\",\n listType: \"unordered\",\n text: ulMatch[2].trim(),\n pageNumber,\n })\n i++\n continue\n }\n\n // 5. 
순서 리스트: 1.\n const olMatch = line.match(/^(\\s*)\\d+\\.\\s+(.+)$/)\n if (olMatch) {\n blocks.push({\n type: \"list\",\n listType: \"ordered\",\n text: olMatch[2].trim(),\n pageNumber,\n })\n i++\n continue\n }\n\n // 6. 일반 텍스트 — 구조적 행이 나올 때까지 병합\n const paraLines: string[] = []\n while (i < lines.length && lines[i].trim() !== \"\" && !isStructuralLine(lines[i])) {\n paraLines.push(lines[i].trim())\n i++\n }\n if (paraLines.length > 0) {\n blocks.push({\n type: \"paragraph\",\n text: paraLines.join(\"\\n\"),\n pageNumber,\n })\n }\n }\n\n return blocks\n}\n\n/**\n * 구조적 행 판별 — paragraph 병합 중단 트리거.\n */\nfunction isStructuralLine(line: string): boolean {\n if (/^#{1,6}\\s+/.test(line)) return true\n if (line.trim().startsWith(\"|\")) return true\n if (/^[-*_]{3,}\\s*$/.test(line.trim())) return true\n if (/^\\s*[-*+]\\s+/.test(line)) return true\n if (/^\\s*\\d+\\.\\s+/.test(line)) return true\n return false\n}\n\n/**\n * Markdown 테이블 행 배열을 IRTable로 변환.\n *\n * 구분선 행(|---|---|)은 제거 후 데이터 행만 파싱.\n * hasHeader: 구분선이 있었으면 true.\n */\nfunction parseMarkdownTable(lines: string[]): IRTable | null {\n const hasSeparator = lines.some(line => /^\\|[\\s:|-]+\\|$/.test(line.trim()))\n\n const rows: IRCell[][] = []\n let maxCols = 0\n\n for (const line of lines) {\n // 구분선 행 스킵: |---|---| 패턴\n if (/^\\|\\s*:?-+:?\\s*(\\|\\s*:?-+:?\\s*)+\\|?\\s*$/.test(line.trim())) continue\n\n const parts = line.split(\"|\")\n // 앞뒤 빈 요소 제거 (| 로 시작/종료하는 행)\n const cells: IRCell[] = parts\n .slice(1, parts[parts.length - 1].trim() === \"\" ? 
-1 : undefined)\n .map(cell => ({\n text: cell.trim(),\n colSpan: 1,\n rowSpan: 1,\n }))\n\n if (cells.length > 0) {\n rows.push(cells)\n maxCols = Math.max(maxCols, cells.length)\n }\n }\n\n if (rows.length === 0) return null\n\n // 열 수 통일 (부족한 셀은 빈 셀로 채움)\n for (const row of rows) {\n while (row.length < maxCols) {\n row.push({ text: \"\", colSpan: 1, rowSpan: 1 })\n }\n }\n\n return {\n rows: rows.length,\n cols: maxCols,\n cells: rows,\n hasHeader: hasSeparator && rows.length > 1,\n }\n}\n","/**\n * OCR 프로바이더 브릿지 — PDF 페이지를 이미지로 렌더링하여 OCR 호출\n *\n * kordoc은 OCR 라이브러리를 번들하지 않음.\n * 사용자가 OcrProvider 함수를 제공하면 이미지 기반 PDF도 텍스트 추출 가능.\n *\n * @example\n * ```ts\n * import { parse } from \"kordoc\"\n *\n * const result = await parse(buffer, {\n * ocr: async (pageImage, pageNumber, mimeType) => {\n * // Tesseract, Claude Vision, Google Vision 등 사용\n * return await myOcrService.recognize(pageImage)\n * }\n * })\n * ```\n */\n\nimport type { OcrProvider, IRBlock, ParseWarning, StructuredOcrResult } from \"../types.js\"\nimport { markdownToBlocks } from \"./markdown-to-blocks.js\"\n\n/**\n * 이미지 기반 PDF 페이지에 OCR을 적용하여 IRBlock[] 반환.\n *\n * pdfjs page 객체에서 viewport + render를 통해 PNG 생성 후\n * 사용자 제공 OcrProvider 호출.\n *\n * - string 반환: 단순 텍스트 → paragraph 블록\n * - StructuredOcrResult 반환: Markdown → markdownToBlocks()로 구조화\n *\n * canvas 미설치 시 pdfjs render 불가하므로 에러 반환.\n */\nexport async function ocrPages(\n doc: { numPages: number; getPage(n: number): Promise<PdfPageProxy> },\n provider: OcrProvider,\n pageFilter: Set<number> | null,\n effectivePageCount: number,\n warnings?: ParseWarning[]\n): Promise<IRBlock[]> {\n const blocks: IRBlock[] = []\n\n for (let i = 1; i <= effectivePageCount; i++) {\n if (pageFilter && !pageFilter.has(i)) continue\n const page = await doc.getPage(i)\n try {\n const imageData = await renderPageToPng(page)\n const result = await provider(imageData, i, \"image/png\")\n\n if (typeof result === \"string\") {\n // 기존 동작: 순수 텍스트 → paragraph 블록\n if 
(result.trim()) {\n blocks.push({ type: \"paragraph\", text: result.trim(), pageNumber: i })\n }\n } else if (result && typeof result === \"object\" && \"markdown\" in result) {\n // 신규: 구조화된 결과 → Markdown → IRBlock[]\n const structured = result as StructuredOcrResult\n if (structured.markdown.trim()) {\n const pageBlocks = markdownToBlocks(structured.markdown, i)\n for (const b of pageBlocks) blocks.push(b)\n }\n }\n } catch (err) {\n // 개별 페이지 실패 시 경고 발행 후 계속 진행\n warnings?.push({\n page: i,\n message: `페이지 ${i} OCR 실패: ${err instanceof Error ? err.message : \"알 수 없는 오류\"}`,\n code: \"OCR_PAGE_FAILED\",\n })\n }\n }\n\n return blocks\n}\n\ninterface PdfPageProxy {\n getViewport(params: { scale: number }): { width: number; height: number }\n render(params: { canvasContext: unknown; viewport: unknown }): { promise: Promise<void> }\n}\n\n/**\n * PDF 페이지를 PNG로 렌더링.\n * @napi-rs/canvas 사용 (kordoc 번들 의존성, 별도 설치 불필요)\n */\nasync function renderPageToPng(page: PdfPageProxy): Promise<Uint8Array> {\n const { createCanvas } = await import(\"@napi-rs/canvas\")\n\n const scale = 2.0 // 300 DPI 근사\n const viewport = page.getViewport({ scale })\n const canvas = createCanvas(Math.floor(viewport.width), Math.floor(viewport.height))\n const ctx = canvas.getContext(\"2d\")\n\n await page.render({ canvasContext: ctx as unknown, viewport }).promise\n return new 
Uint8Array(canvas.toBuffer(\"image/png\"))\n}\n"],"mappings":";;;;AAoBO,SAAS,iBAAiB,UAAkB,YAA+B;AAChF,QAAM,SAAoB,CAAC;AAC3B,QAAM,QAAQ,SAAS,MAAM,IAAI;AACjC,MAAI,IAAI;AAER,SAAO,IAAI,MAAM,QAAQ;AACvB,UAAM,OAAO,MAAM,CAAC;AAGpB,QAAI,KAAK,KAAK,MAAM,IAAI;AACtB;AACA;AAAA,IACF;AAGA,UAAM,eAAe,KAAK,MAAM,mBAAmB;AACnD,QAAI,cAAc;AAChB,aAAO,KAAK;AAAA,QACV,MAAM;AAAA,QACN,OAAO,aAAa,CAAC,EAAE;AAAA,QACvB,MAAM,aAAa,CAAC,EAAE,KAAK;AAAA,QAC3B;AAAA,MACF,CAAC;AACD;AACA;AAAA,IACF;AAGA,QAAI,iBAAiB,KAAK,KAAK,KAAK,CAAC,GAAG;AACtC,aAAO,KAAK,EAAE,MAAM,aAAa,WAAW,CAAC;AAC7C;AACA;AAAA,IACF;AAGA,QAAI,KAAK,KAAK,EAAE,WAAW,GAAG,GAAG;AAC/B,YAAM,aAAuB,CAAC;AAC9B,aAAO,IAAI,MAAM,UAAU,MAAM,CAAC,EAAE,KAAK,EAAE,WAAW,GAAG,GAAG;AAC1D,mBAAW,KAAK,MAAM,CAAC,CAAC;AACxB;AAAA,MACF;AACA,YAAM,QAAQ,mBAAmB,UAAU;AAC3C,UAAI,OAAO;AACT,eAAO,KAAK,EAAE,MAAM,SAAS,OAAO,WAAW,CAAC;AAAA,MAClD;AACA;AAAA,IACF;AAGA,UAAM,UAAU,KAAK,MAAM,qBAAqB;AAChD,QAAI,SAAS;AACX,aAAO,KAAK;AAAA,QACV,MAAM;AAAA,QACN,UAAU;AAAA,QACV,MAAM,QAAQ,CAAC,EAAE,KAAK;AAAA,QACtB;AAAA,MACF,CAAC;AACD;AACA;AAAA,IACF;AAGA,UAAM,UAAU,KAAK,MAAM,qBAAqB;AAChD,QAAI,SAAS;AACX,aAAO,KAAK;AAAA,QACV,MAAM;AAAA,QACN,UAAU;AAAA,QACV,MAAM,QAAQ,CAAC,EAAE,KAAK;AAAA,QACtB;AAAA,MACF,CAAC;AACD;AACA;AAAA,IACF;AAGA,UAAM,YAAsB,CAAC;AAC7B,WAAO,IAAI,MAAM,UAAU,MAAM,CAAC,EAAE,KAAK,MAAM,MAAM,CAAC,iBAAiB,MAAM,CAAC,CAAC,GAAG;AAChF,gBAAU,KAAK,MAAM,CAAC,EAAE,KAAK,CAAC;AAC9B;AAAA,IACF;AACA,QAAI,UAAU,SAAS,GAAG;AACxB,aAAO,KAAK;AAAA,QACV,MAAM;AAAA,QACN,MAAM,UAAU,KAAK,IAAI;AAAA,QACzB;AAAA,MACF,CAAC;AAAA,IACH;AAAA,EACF;AAEA,SAAO;AACT;AAKA,SAAS,iBAAiB,MAAuB;AAC/C,MAAI,aAAa,KAAK,IAAI,EAAG,QAAO;AACpC,MAAI,KAAK,KAAK,EAAE,WAAW,GAAG,EAAG,QAAO;AACxC,MAAI,iBAAiB,KAAK,KAAK,KAAK,CAAC,EAAG,QAAO;AAC/C,MAAI,eAAe,KAAK,IAAI,EAAG,QAAO;AACtC,MAAI,eAAe,KAAK,IAAI,EAAG,QAAO;AACtC,SAAO;AACT;AAQA,SAAS,mBAAmB,OAAiC;AAC3D,QAAM,eAAe,MAAM,KAAK,UAAQ,iBAAiB,KAAK,KAAK,KAAK,CAAC,CAAC;AAE1E,QAAM,OAAmB,CAAC;AAC1B,MAAI,UAAU;AAEd,aAAW,QAAQ,OAAO;AAExB,QAAI,0CAA0C,KAAK,KAAK,KAAK,CAAC,EAAG;AAEjE,UAAM,QAAQ,KAAK,MAAM,GAAG;AAE5B,UAAM,QAAkB,MACrB,MAA
M,GAAG,MAAM,MAAM,SAAS,CAAC,EAAE,KAAK,MAAM,KAAK,KAAK,MAAS,EAC/D,IAAI,WAAS;AAAA,MACZ,MAAM,KAAK,KAAK;AAAA,MAChB,SAAS;AAAA,MACT,SAAS;AAAA,IACX,EAAE;AAEJ,QAAI,MAAM,SAAS,GAAG;AACpB,WAAK,KAAK,KAAK;AACf,gBAAU,KAAK,IAAI,SAAS,MAAM,MAAM;AAAA,IAC1C;AAAA,EACF;AAEA,MAAI,KAAK,WAAW,EAAG,QAAO;AAG9B,aAAW,OAAO,MAAM;AACtB,WAAO,IAAI,SAAS,SAAS;AAC3B,UAAI,KAAK,EAAE,MAAM,IAAI,SAAS,GAAG,SAAS,EAAE,CAAC;AAAA,IAC/C;AAAA,EACF;AAEA,SAAO;AAAA,IACL,MAAM,KAAK;AAAA,IACX,MAAM;AAAA,IACN,OAAO;AAAA,IACP,WAAW,gBAAgB,KAAK,SAAS;AAAA,EAC3C;AACF;;;AC1IA,eAAsB,SACpB,KACA,UACA,YACA,oBACA,UACoB;AACpB,QAAM,SAAoB,CAAC;AAE3B,WAAS,IAAI,GAAG,KAAK,oBAAoB,KAAK;AAC5C,QAAI,cAAc,CAAC,WAAW,IAAI,CAAC,EAAG;AACtC,UAAM,OAAO,MAAM,IAAI,QAAQ,CAAC;AAChC,QAAI;AACF,YAAM,YAAY,MAAM,gBAAgB,IAAI;AAC5C,YAAM,SAAS,MAAM,SAAS,WAAW,GAAG,WAAW;AAEvD,UAAI,OAAO,WAAW,UAAU;AAE9B,YAAI,OAAO,KAAK,GAAG;AACjB,iBAAO,KAAK,EAAE,MAAM,aAAa,MAAM,OAAO,KAAK,GAAG,YAAY,EAAE,CAAC;AAAA,QACvE;AAAA,MACF,WAAW,UAAU,OAAO,WAAW,YAAY,cAAc,QAAQ;AAEvE,cAAM,aAAa;AACnB,YAAI,WAAW,SAAS,KAAK,GAAG;AAC9B,gBAAM,aAAa,iBAAiB,WAAW,UAAU,CAAC;AAC1D,qBAAW,KAAK,WAAY,QAAO,KAAK,CAAC;AAAA,QAC3C;AAAA,MACF;AAAA,IACF,SAAS,KAAK;AAEZ,gBAAU,KAAK;AAAA,QACb,MAAM;AAAA,QACN,SAAS,sBAAO,CAAC,sBAAY,eAAe,QAAQ,IAAI,UAAU,yCAAW;AAAA,QAC7E,MAAM;AAAA,MACR,CAAC;AAAA,IACH;AAAA,EACF;AAEA,SAAO;AACT;AAWA,eAAe,gBAAgB,MAAyC;AACtE,QAAM,EAAE,aAAa,IAAI,MAAM,OAAO,iBAAiB;AAEvD,QAAM,QAAQ;AACd,QAAM,WAAW,KAAK,YAAY,EAAE,MAAM,CAAC;AAC3C,QAAM,SAAS,aAAa,KAAK,MAAM,SAAS,KAAK,GAAG,KAAK,MAAM,SAAS,MAAM,CAAC;AACnF,QAAM,MAAM,OAAO,WAAW,IAAI;AAElC,QAAM,KAAK,OAAO,EAAE,eAAe,KAAgB,SAAS,CAAC,EAAE;AAC/D,SAAO,IAAI,WAAW,OAAO,SAAS,WAAW,CAAC;AACpD;","names":[]}
|
|
@@ -3,14 +3,12 @@ import "./chunk-ZWE3DS7E.js";
|
|
|
3
3
|
|
|
4
4
|
// src/ocr/auto-detect.ts
|
|
5
5
|
import { execSync } from "child_process";
|
|
6
|
-
import { createRequire } from "module";
|
|
7
6
|
var CLI_PRIORITY = ["gemini", "claude", "codex", "ollama"];
|
|
8
7
|
function detectAvailableOcr() {
|
|
9
8
|
for (const cli of CLI_PRIORITY) {
|
|
10
9
|
if (isCliInstalled(cli)) return cli;
|
|
11
10
|
}
|
|
12
|
-
|
|
13
|
-
return null;
|
|
11
|
+
return "tesseract";
|
|
14
12
|
}
|
|
15
13
|
function isCliInstalled(name) {
|
|
16
14
|
try {
|
|
@@ -21,25 +19,8 @@ function isCliInstalled(name) {
|
|
|
21
19
|
return false;
|
|
22
20
|
}
|
|
23
21
|
}
|
|
24
|
-
function isTesseractAvailable() {
|
|
25
|
-
try {
|
|
26
|
-
const require2 = createRequire(import.meta.url);
|
|
27
|
-
require2.resolve("tesseract.js");
|
|
28
|
-
return true;
|
|
29
|
-
} catch {
|
|
30
|
-
return false;
|
|
31
|
-
}
|
|
32
|
-
}
|
|
33
22
|
function validateOcrMode(mode) {
|
|
34
|
-
if (mode === "auto" || mode === "off") return;
|
|
35
|
-
if (mode === "tesseract") {
|
|
36
|
-
if (!isTesseractAvailable()) {
|
|
37
|
-
throw new Error(
|
|
38
|
-
"tesseract.js\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4.\n\uC124\uCE58: npm install tesseract.js"
|
|
39
|
-
);
|
|
40
|
-
}
|
|
41
|
-
return;
|
|
42
|
-
}
|
|
23
|
+
if (mode === "auto" || mode === "off" || mode === "tesseract") return;
|
|
43
24
|
if (!isCliInstalled(mode)) {
|
|
44
25
|
throw new Error(`'${mode}' CLI\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4.
|
|
45
26
|
${getInstallGuide(mode)}`);
|
|
@@ -54,6 +35,17 @@ function getInstallGuide(mode) {
|
|
|
54
35
|
};
|
|
55
36
|
return guides[mode] || `'${mode}'\uC744(\uB97C) \uC124\uCE58\uD574\uC8FC\uC138\uC694.`;
|
|
56
37
|
}
|
|
38
|
+
function getTesseractFallbackMessage() {
|
|
39
|
+
return [
|
|
40
|
+
"\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 \uB0B4\uC7A5 tesseract.js\uB85C OCR\uC744 \uC218\uD589\uD569\uB2C8\uB2E4.",
|
|
41
|
+
"\uB354 \uB098\uC740 \uD488\uC9C8(\uD14C\uC774\uBE14/\uD5E4\uB529 \uAD6C\uC870 \uBCF4\uC874)\uC744 \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
|
|
42
|
+
"",
|
|
43
|
+
" [\uAD8C\uC7A5] Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
|
|
44
|
+
" Claude CLI: npm install -g @anthropic-ai/claude-code",
|
|
45
|
+
" Codex CLI: npm install -g @openai/codex",
|
|
46
|
+
" Ollama: brew install ollama (+ ollama pull gemma4:27b)"
|
|
47
|
+
].join("\n");
|
|
48
|
+
}
|
|
57
49
|
|
|
58
50
|
// src/ocr/cli-provider.ts
|
|
59
51
|
import { spawnSync } from "child_process";
|
|
@@ -156,27 +148,27 @@ async function resolveOcrProvider(mode, warnings) {
|
|
|
156
148
|
if (mode !== "auto") {
|
|
157
149
|
validateOcrMode(mode);
|
|
158
150
|
if (mode === "tesseract") {
|
|
159
|
-
const { createTesseractProvider } = await import("./tesseract-provider-MZ37ZKQW.js");
|
|
151
|
+
const { createTesseractProvider } = await import("./tesseract-provider-UNJOI25M.js");
|
|
160
152
|
return createTesseractProvider();
|
|
161
153
|
}
|
|
162
154
|
return createCliOcrProvider(mode);
|
|
163
155
|
}
|
|
164
156
|
const detected = detectAvailableOcr();
|
|
165
|
-
if (!detected) {
|
|
166
|
-
throw new Error("\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uB3C4\uAD6C\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.");
|
|
167
|
-
}
|
|
168
157
|
if (detected !== "gemini") {
|
|
169
|
-
|
|
170
|
-
|
|
171
|
-
|
|
172
|
-
|
|
158
|
+
if (detected === "tesseract") {
|
|
159
|
+
warnings?.push({
|
|
160
|
+
message: getTesseractFallbackMessage(),
|
|
161
|
+
code: "OCR_CLI_FALLBACK"
|
|
162
|
+
});
|
|
163
|
+
} else {
|
|
164
|
+
warnings?.push({
|
|
165
|
+
message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (gemini CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 gemini CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
|
|
166
|
+
code: "OCR_CLI_FALLBACK"
|
|
167
|
+
});
|
|
168
|
+
}
|
|
173
169
|
}
|
|
174
170
|
if (detected === "tesseract") {
|
|
175
|
-
|
|
176
|
-
message: "tesseract.js\uB294 \uD14C\uC774\uBE14 \uAD6C\uC870\uB97C \uBCF5\uC6D0\uD558\uC9C0 \uBABB\uD569\uB2C8\uB2E4. Vision LLM CLI(gemini/claude/codex) \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.",
|
|
177
|
-
code: "OCR_CLI_FALLBACK"
|
|
178
|
-
});
|
|
179
|
-
const { createTesseractProvider } = await import("./tesseract-provider-MZ37ZKQW.js");
|
|
171
|
+
const { createTesseractProvider } = await import("./tesseract-provider-UNJOI25M.js");
|
|
180
172
|
return createTesseractProvider();
|
|
181
173
|
}
|
|
182
174
|
return createCliOcrProvider(detected);
|
|
@@ -184,4 +176,4 @@ async function resolveOcrProvider(mode, warnings) {
|
|
|
184
176
|
export {
|
|
185
177
|
resolveOcrProvider
|
|
186
178
|
};
|
|
187
|
-
//# sourceMappingURL=resolve-Y3KMGD3R.js.map
|
|
179
|
+
//# sourceMappingURL=resolve-Z4DEPDUS.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/ocr/auto-detect.ts","../src/ocr/cli-provider.ts","../src/ocr/resolve.ts"],"sourcesContent":["/**\n * OCR CLI 자동 탐색\n *\n * 탐색 순서: gemini → claude → codex → ollama → tesseract.js\n * CLI는 which(unix) / where(win) 명령어로 PATH 존재 확인.\n * tesseract.js는 bundled 의존성이므로 항상 사용 가능 (최후 fallback).\n */\n\nimport { execSync } from \"child_process\"\nimport type { OcrMode } from \"../types.js\"\n\n/** CLI 탐색 우선순위 */\nconst CLI_PRIORITY = [\"gemini\", \"claude\", \"codex\", \"ollama\"] as const\n\n/**\n * 시스템에 설치된 OCR 도구를 우선순위대로 탐색.\n * tesseract.js는 bundled 의존성이므로 CLI를 찾지 못해도 항상 \"tesseract\" 반환.\n * @returns 사용 가능한 OcrMode (null 반환 없음)\n */\nexport function detectAvailableOcr(): OcrMode {\n // 1. CLI 프로그램 탐색 (gemini → claude → codex → ollama)\n for (const cli of CLI_PRIORITY) {\n if (isCliInstalled(cli)) return cli\n }\n\n // 2. tesseract.js — bundled 의존성, 항상 사용 가능\n return \"tesseract\"\n}\n\n/**\n * 특정 CLI가 시스템 PATH에 있는지 확인.\n * which(unix) 또는 where(win32) 사용.\n */\nfunction isCliInstalled(name: string): boolean {\n try {\n const cmd = process.platform === \"win32\" ? 
\"where\" : \"which\"\n execSync(`${cmd} ${name}`, { stdio: \"ignore\", timeout: 3000 })\n return true\n } catch {\n return false\n }\n}\n\n/**\n * 수동 지정된 OcrMode 유효성 검증.\n * --ocr gemini 등 강제 지정 시 호출.\n * @throws 해당 CLI가 설치되지 않은 경우 Error (tesseract는 항상 통과)\n */\nexport function validateOcrMode(mode: OcrMode): void {\n if (mode === \"auto\" || mode === \"off\" || mode === \"tesseract\") return\n\n if (!isCliInstalled(mode)) {\n throw new Error(`'${mode}' CLI가 설치되지 않았습니다.\\n${getInstallGuide(mode)}`)\n }\n}\n\n/** CLI별 설치 안내 메시지 */\nfunction getInstallGuide(mode: string): string {\n const guides: Record<string, string> = {\n gemini: \"설치: https://ai.google.dev/gemini-api/docs/cli\",\n claude: \"설치: npm install -g @anthropic-ai/claude-code 또는 https://claude.ai/code\",\n codex: \"설치: npm install -g @openai/codex 또는 https://github.com/openai/codex\",\n ollama: \"설치: brew install ollama 또는 https://ollama.com/download\",\n }\n return guides[mode] || `'${mode}'을(를) 설치해주세요.`\n}\n\n/**\n * AI CLI가 없어 tesseract.js로 fallback할 때 표시할 안내 메시지.\n */\nexport function getTesseractFallbackMessage(): string {\n return [\n \"설치된 AI CLI가 없어 내장 tesseract.js로 OCR을 수행합니다.\",\n \"더 나은 품질(테이블/헤딩 구조 보존)을 위해 AI CLI 설치를 권장합니다:\",\n \"\",\n \" [권장] Gemini CLI: https://ai.google.dev/gemini-api/docs/cli\",\n \" Claude CLI: npm install -g @anthropic-ai/claude-code\",\n \" Codex CLI: npm install -g @openai/codex\",\n \" Ollama: brew install ollama (+ ollama pull gemma4:27b)\",\n ].join(\"\\n\")\n}\n","/**\n * CLI 기반 OCR 프로바이더\n *\n * gemini / claude / codex / ollama CLI를 subprocess로 호출하여\n * PDF 페이지 이미지를 Markdown으로 변환.\n *\n * 이미지 전달 방식:\n * - gemini: -p \"프롬프트 @이미지경로\" (@ 파일 참조)\n * - claude: -p \"프롬프트 @이미지경로\" (@ 파일 참조, --print 모드)\n * - codex: exec -i 이미지경로 \"프롬프트\" (-i/--image 플래그)\n * - ollama: REST API (localhost:11434) — CLI는 이미지 입력 미지원\n */\n\nimport { spawnSync } from \"child_process\"\nimport { writeFileSync, unlinkSync, mkdtempSync } from \"fs\"\nimport { join } from \"path\"\nimport { 
tmpdir } from \"os\"\nimport type { OcrMode, StructuredOcrResult } from \"../types.js\"\n\n/** OCR 프롬프트 — 모든 CLI 공통 */\nconst OCR_PROMPT =\n \"이 PDF 페이지 이미지에서 텍스트와 테이블을 추출하여 순수 Markdown으로 변환하세요.\\n\" +\n \"규칙:\\n\" +\n \"- 테이블은 Markdown 테이블 문법 사용 (| 구분, |---|---| 헤더 구분선 포함)\\n\" +\n \"- 병합된 셀은 해당 위치에 내용 기재\\n\" +\n \"- 헤딩은 글자 크기에 따라 ## ~ ###### 사용\\n\" +\n \"- 리스트는 - 또는 1. 사용\\n\" +\n \"- 이미지, 도형 등 비텍스트 요소는 무시\\n\" +\n \"- 원문의 읽기 순서와 구조를 유지\\n\" +\n \"- ```로 감싸지 말고 순수 Markdown만 출력\"\n\n/** 임시 디렉토리 (프로세스당 1회 생성) */\nlet _tempDir: string | null = null\nfunction getTempDir(): string {\n if (!_tempDir) _tempDir = mkdtempSync(join(tmpdir(), \"kordoc-ocr-\"))\n return _tempDir\n}\n\n/**\n * CLI OcrProvider 생성.\n *\n * @param mode - 사용할 CLI (gemini, claude, codex, ollama)\n * @returns OcrProvider 함수 (StructuredOcrResult 반환)\n */\nexport function createCliOcrProvider(\n mode: Exclude<OcrMode, \"auto\" | \"off\" | \"tesseract\">\n): (pageImage: Uint8Array, pageNumber: number, mimeType: \"image/png\") => Promise<StructuredOcrResult> {\n return async (pageImage: Uint8Array, pageNumber: number): Promise<StructuredOcrResult> => {\n const tempPath = join(getTempDir(), `page-${pageNumber}.png`)\n\n try {\n writeFileSync(tempPath, pageImage)\n\n let output: string\n if (mode === \"ollama\") {\n output = await callOllamaApi(tempPath)\n } else {\n output = callCli(mode, tempPath)\n }\n\n return { markdown: stripCodeFence(output.trim()) }\n } finally {\n try { unlinkSync(tempPath) } catch { /* 임시 파일 정리 실패 무시 */ }\n }\n }\n}\n\n/**\n * CLI 실행 — gemini / claude / codex\n *\n * @throws CLI 실행 실패 또는 타임아웃(60초) 시 Error\n */\nfunction callCli(mode: string, imagePath: string): string {\n const args = buildCliArgs(mode, imagePath)\n\n const result = spawnSync(mode === \"codex\" ? 
\"codex\" : mode, args, {\n encoding: \"utf-8\",\n timeout: 60_000,\n maxBuffer: 10 * 1024 * 1024,\n })\n\n if (result.error) {\n throw new Error(`${mode} CLI 실행 실패: ${result.error.message}`)\n }\n if (result.status !== 0) {\n const errMsg = result.stderr?.trim() || `exit code ${result.status}`\n throw new Error(`${mode} OCR 실패: ${errMsg}`)\n }\n\n return result.stdout || \"\"\n}\n\n/**\n * CLI별 인자 배열 생성.\n *\n * gemini: [\"--prompt\", \"프롬프트 @이미지경로\", \"--yolo\"]\n * - -y/--yolo: 자동 승인 (OCR은 도구 사용 없으므로 실질적 영향 없음)\n * - @ 파일 참조로 이미지를 컨텍스트에 포함\n *\n * claude: [\"--print\", \"프롬프트 @이미지경로\"]\n * - --print(-p): 비대화형 출력 모드\n * - @ 파일 참조로 이미지를 컨텍스트에 포함\n *\n * codex: [\"exec\", \"--image\", 이미지경로, \"프롬프트\"]\n * - exec: 비대화형 실행 서브커맨드\n * - -i/--image: 이미지 첨부 플래그 (codex exec --help 로 확인됨)\n *\n * ⚠️ CLI 버전에 따라 문법이 다를 수 있음. 업데이트 시 --help 재확인 필요.\n */\nfunction buildCliArgs(mode: string, imagePath: string): string[] {\n const promptWithImage = `${OCR_PROMPT}\\n\\n이미지: @${imagePath}`\n const promptOnly = OCR_PROMPT\n\n switch (mode) {\n case \"gemini\":\n return [\"--prompt\", promptWithImage, \"--yolo\"]\n\n case \"claude\":\n return [\"--print\", promptWithImage]\n\n case \"codex\":\n return [\"exec\", \"--image\", imagePath, promptOnly]\n\n default:\n throw new Error(`지원하지 않는 CLI: ${mode}`)\n }\n}\n\n/**\n * Ollama REST API 호출 — CLI는 이미지 입력을 지원하지 않으므로 API 직접 사용.\n *\n * 기본 모델: KORDOC_OLLAMA_MODEL 환경변수 또는 \"gemma4:27b\"\n * 기본 호스트: KORDOC_OLLAMA_HOST 환경변수 또는 \"http://localhost:11434\"\n *\n * @throws Ollama 서버 미실행 또는 응답 오류 시 Error\n */\nasync function callOllamaApi(imagePath: string): Promise<string> {\n const { readFileSync } = await import(\"fs\")\n const imageBase64 = readFileSync(imagePath).toString(\"base64\")\n\n const model = process.env.KORDOC_OLLAMA_MODEL || \"gemma4:27b\"\n const host = process.env.KORDOC_OLLAMA_HOST || \"http://localhost:11434\"\n\n const response = await fetch(`${host}/api/chat`, {\n method: \"POST\",\n headers: { \"Content-Type\": 
\"application/json\" },\n body: JSON.stringify({\n model,\n messages: [{\n role: \"user\",\n content: OCR_PROMPT,\n images: [imageBase64],\n }],\n stream: false,\n }),\n signal: AbortSignal.timeout(60_000),\n })\n\n if (!response.ok) {\n throw new Error(`Ollama API 오류: ${response.status} ${response.statusText}`)\n }\n\n const data = await response.json() as { message?: { content?: string } }\n return data.message?.content || \"\"\n}\n\n/**\n * LLM 출력에서 코드 펜스 제거.\n * LLM이 가끔 결과를 ```markdown ... ``` 으로 감싸는 경우 처리.\n */\nfunction stripCodeFence(text: string): string {\n const match = text.match(/^```(?:markdown|md)?\\s*\\n([\\s\\S]*?)\\n```\\s*$/m)\n return match ? match[1].trim() : text\n}\n","/**\n * OCR 프로바이더 팩토리\n *\n * ocrMode에 따라 적절한 OcrProvider를 생성하여 반환.\n * - \"auto\": 설치된 CLI 자동 탐색 (gemini → claude → codex → ollama → tesseract)\n * tesseract.js는 bundled 의존성이므로 항상 사용 가능 (null 반환 없음)\n * - 특정 CLI: 해당 CLI 사용 (미설치 시 에러)\n * - \"tesseract\": 내장 tesseract.js 직접 사용\n * - \"off\": 에러 throw\n */\n\nimport type { OcrMode, OcrProvider, ParseWarning } from \"../types.js\"\nimport { detectAvailableOcr, validateOcrMode, getTesseractFallbackMessage } from \"./auto-detect.js\"\nimport { createCliOcrProvider } from \"./cli-provider.js\"\n\n/**\n * ocrMode에 따라 OcrProvider를 생성.\n *\n * @param mode - OCR 모드\n * @param warnings - 경고 수집 배열 (fallback 발생 시 경고 추가)\n * @returns OcrProvider 함수\n * @throws mode=\"off\"이거나 지정 CLI 미설치 시 Error\n */\nexport async function resolveOcrProvider(\n mode: OcrMode,\n warnings?: ParseWarning[]\n): Promise<OcrProvider> {\n if (mode === \"off\") {\n throw new Error(\"OCR이 비활성화되어 있습니다 (--ocr off).\")\n }\n\n // ── 수동 지정 모드 ──────────────────────────────────\n if (mode !== \"auto\") {\n validateOcrMode(mode) // tesseract는 항상 통과\n\n if (mode === \"tesseract\") {\n const { createTesseractProvider } = await import(\"./tesseract-provider.js\")\n return createTesseractProvider()\n }\n\n return createCliOcrProvider(mode)\n }\n\n // ── 자동 탐색 모드 
───────────────────────────────────\n // detectAvailableOcr()는 항상 값을 반환 (tesseract fallback으로 null 없음)\n const detected = detectAvailableOcr()\n\n // gemini가 아닌 경우 fallback 경고\n if (detected !== \"gemini\") {\n if (detected === \"tesseract\") {\n // 내장 tesseract로 fallback — 구조 복원 제한 안내\n warnings?.push({\n message: getTesseractFallbackMessage(),\n code: \"OCR_CLI_FALLBACK\",\n })\n } else {\n warnings?.push({\n message: `OCR: '${detected}' 사용 중 (gemini CLI가 없어 fallback). 더 나은 품질을 위해 gemini CLI 설치를 권장합니다.`,\n code: \"OCR_CLI_FALLBACK\",\n })\n }\n }\n\n if (detected === \"tesseract\") {\n const { createTesseractProvider } = await import(\"./tesseract-provider.js\")\n return createTesseractProvider()\n }\n\n return createCliOcrProvider(detected)\n}\n"],"mappings":";;;;AAQA,SAAS,gBAAgB;AAIzB,IAAM,eAAe,CAAC,UAAU,UAAU,SAAS,QAAQ;AAOpD,SAAS,qBAA8B;AAE5C,aAAW,OAAO,cAAc;AAC9B,QAAI,eAAe,GAAG,EAAG,QAAO;AAAA,EAClC;AAGA,SAAO;AACT;AAMA,SAAS,eAAe,MAAuB;AAC7C,MAAI;AACF,UAAM,MAAM,QAAQ,aAAa,UAAU,UAAU;AACrD,aAAS,GAAG,GAAG,IAAI,IAAI,IAAI,EAAE,OAAO,UAAU,SAAS,IAAK,CAAC;AAC7D,WAAO;AAAA,EACT,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAOO,SAAS,gBAAgB,MAAqB;AACnD,MAAI,SAAS,UAAU,SAAS,SAAS,SAAS,YAAa;AAE/D,MAAI,CAAC,eAAe,IAAI,GAAG;AACzB,UAAM,IAAI,MAAM,IAAI,IAAI;AAAA,EAAuB,gBAAgB,IAAI,CAAC,EAAE;AAAA,EACxE;AACF;AAGA,SAAS,gBAAgB,MAAsB;AAC7C,QAAM,SAAiC;AAAA,IACrC,QAAQ;AAAA,IACR,QAAQ;AAAA,IACR,OAAQ;AAAA,IACR,QAAQ;AAAA,EACV;AACA,SAAO,OAAO,IAAI,KAAK,IAAI,IAAI;AACjC;AAKO,SAAS,8BAAsC;AACpD,SAAO;AAAA,IACL;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,IACA;AAAA,EACF,EAAE,KAAK,IAAI;AACb;;;ACnEA,SAAS,iBAAiB;AAC1B,SAAS,eAAe,YAAY,mBAAmB;AACvD,SAAS,YAAY;AACrB,SAAS,cAAc;AAIvB,IAAM,aACJ;AAWF,IAAI,WAA0B;AAC9B,SAAS,aAAqB;AAC5B,MAAI,CAAC,SAAU,YAAW,YAAY,KAAK,OAAO,GAAG,aAAa,CAAC;AACnE,SAAO;AACT;AAQO,SAAS,qBACd,MACoG;AACpG,SAAO,OAAO,WAAuB,eAAqD;AACxF,UAAM,WAAW,KAAK,WAAW,GAAG,QAAQ,UAAU,MAAM;AAE5D,QAAI;AACF,oBAAc,UAAU,SAAS;AAEjC,UAAI;AACJ,UAAI,SAAS,UAAU;AACrB,iBAAS,MAAM,cAAc,QAAQ;AAAA,MACvC,OAAO;AACL,iBAAS,QAAQ,MAAM,QAAQ;A
AAA,MACjC;AAEA,aAAO,EAAE,UAAU,eAAe,OAAO,KAAK,CAAC,EAAE;AAAA,IACnD,UAAE;AACA,UAAI;AAAE,mBAAW,QAAQ;AAAA,MAAE,QAAQ;AAAA,MAAuB;AAAA,IAC5D;AAAA,EACF;AACF;AAOA,SAAS,QAAQ,MAAc,WAA2B;AACxD,QAAM,OAAO,aAAa,MAAM,SAAS;AAEzC,QAAM,SAAS,UAAU,SAAS,UAAU,UAAU,MAAM,MAAM;AAAA,IAChE,UAAU;AAAA,IACV,SAAS;AAAA,IACT,WAAW,KAAK,OAAO;AAAA,EACzB,CAAC;AAED,MAAI,OAAO,OAAO;AAChB,UAAM,IAAI,MAAM,GAAG,IAAI,mCAAe,OAAO,MAAM,OAAO,EAAE;AAAA,EAC9D;AACA,MAAI,OAAO,WAAW,GAAG;AACvB,UAAM,SAAS,OAAO,QAAQ,KAAK,KAAK,aAAa,OAAO,MAAM;AAClE,UAAM,IAAI,MAAM,GAAG,IAAI,sBAAY,MAAM,EAAE;AAAA,EAC7C;AAEA,SAAO,OAAO,UAAU;AAC1B;AAmBA,SAAS,aAAa,MAAc,WAA6B;AAC/D,QAAM,kBAAkB,GAAG,UAAU;AAAA;AAAA,uBAAa,SAAS;AAC3D,QAAM,aAAa;AAEnB,UAAQ,MAAM;AAAA,IACZ,KAAK;AACH,aAAO,CAAC,YAAY,iBAAiB,QAAQ;AAAA,IAE/C,KAAK;AACH,aAAO,CAAC,WAAW,eAAe;AAAA,IAEpC,KAAK;AACH,aAAO,CAAC,QAAQ,WAAW,WAAW,UAAU;AAAA,IAElD;AACE,YAAM,IAAI,MAAM,8CAAgB,IAAI,EAAE;AAAA,EAC1C;AACF;AAUA,eAAe,cAAc,WAAoC;AAC/D,QAAM,EAAE,aAAa,IAAI,MAAM,OAAO,IAAI;AAC1C,QAAM,cAAc,aAAa,SAAS,EAAE,SAAS,QAAQ;AAE7D,QAAM,QAAQ,QAAQ,IAAI,uBAAuB;AACjD,QAAM,OAAO,QAAQ,IAAI,sBAAsB;AAE/C,QAAM,WAAW,MAAM,MAAM,GAAG,IAAI,aAAa;AAAA,IAC/C,QAAQ;AAAA,IACR,SAAS,EAAE,gBAAgB,mBAAmB;AAAA,IAC9C,MAAM,KAAK,UAAU;AAAA,MACnB;AAAA,MACA,UAAU,CAAC;AAAA,QACT,MAAM;AAAA,QACN,SAAS;AAAA,QACT,QAAQ,CAAC,WAAW;AAAA,MACtB,CAAC;AAAA,MACD,QAAQ;AAAA,IACV,CAAC;AAAA,IACD,QAAQ,YAAY,QAAQ,GAAM;AAAA,EACpC,CAAC;AAED,MAAI,CAAC,SAAS,IAAI;AAChB,UAAM,IAAI,MAAM,4BAAkB,SAAS,MAAM,IAAI,SAAS,UAAU,EAAE;AAAA,EAC5E;AAEA,QAAM,OAAO,MAAM,SAAS,KAAK;AACjC,SAAO,KAAK,SAAS,WAAW;AAClC;AAMA,SAAS,eAAe,MAAsB;AAC5C,QAAM,QAAQ,KAAK,MAAM,+CAA+C;AACxE,SAAO,QAAQ,MAAM,CAAC,EAAE,KAAK,IAAI;AACnC;;;ACtJA,eAAsB,mBACpB,MACA,UACsB;AACtB,MAAI,SAAS,OAAO;AAClB,UAAM,IAAI,MAAM,sFAA+B;AAAA,EACjD;AAGA,MAAI,SAAS,QAAQ;AACnB,oBAAgB,IAAI;AAEpB,QAAI,SAAS,aAAa;AACxB,YAAM,EAAE,wBAAwB,IAAI,MAAM,OAAO,kCAAyB;AAC1E,aAAO,wBAAwB;AAAA,IACjC;AAEA,WAAO,qBAAqB,IAAI;AAAA,EAClC;AAIA,QAAM,WAAW,mBAAmB;AAGpC,MAAI,aAAa,UAAU;AACzB,QAAI,aAAa,aAAa;AAE5B,gBAAU,KAAK;AAAA,QACb,SAAS,4BAA4B;AAAA,QACrC,MAAM;AAAA,MACR,CAAC;AAAA
,IACH,OAAO;AACL,gBAAU,KAAK;AAAA,QACb,SAAS,SAAS,QAAQ;AAAA,QAC1B,MAAM;AAAA,MACR,CAAC;AAAA,IACH;AAAA,EACF;AAEA,MAAI,aAAa,aAAa;AAC5B,UAAM,EAAE,wBAAwB,IAAI,MAAM,OAAO,kCAAyB;AAC1E,WAAO,wBAAwB;AAAA,EACjC;AAEA,SAAO,qBAAqB,QAAQ;AACtC;","names":[]}
|
|
@@ -2,16 +2,9 @@
|
|
|
2
2
|
import "./chunk-ZWE3DS7E.js";
|
|
3
3
|
|
|
4
4
|
// src/ocr/tesseract-provider.ts
|
|
5
|
+
import { createWorker } from "tesseract.js";
|
|
5
6
|
async function createTesseractProvider() {
|
|
6
|
-
|
|
7
|
-
try {
|
|
8
|
-
tesseract = await import("tesseract.js");
|
|
9
|
-
} catch {
|
|
10
|
-
throw new Error(
|
|
11
|
-
"tesseract.js\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4.\n\uC124\uCE58: npm install tesseract.js"
|
|
12
|
-
);
|
|
13
|
-
}
|
|
14
|
-
const worker = await tesseract.createWorker("kor+eng");
|
|
7
|
+
const worker = await createWorker("kor+eng");
|
|
15
8
|
let terminated = false;
|
|
16
9
|
const provider = async (pageImage, _pageNumber, _mimeType) => {
|
|
17
10
|
const { data } = await worker.recognize(pageImage);
|
|
@@ -28,4 +21,4 @@ async function createTesseractProvider() {
|
|
|
28
21
|
export {
|
|
29
22
|
createTesseractProvider
|
|
30
23
|
};
|
|
31
|
-
//# sourceMappingURL=tesseract-provider-
|
|
24
|
+
//# sourceMappingURL=tesseract-provider-UNJOI25M.js.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"sources":["../src/ocr/tesseract-provider.ts"],"sourcesContent":["/**\n * Tesseract.js 기반 OCR 프로바이더\n *\n * tesseract.js는 bundled 의존성으로 별도 설치 불필요.\n * Vision LLM CLI가 없을 때의 최후 fallback으로 자동 사용.\n *\n * 특성:\n * - 순수 텍스트만 반환 (테이블/헤딩 구조 복원 불가)\n * - 한글 인식률 약 85-90% (깨끗한 이미지 기준)\n * - 완전 오프라인 동작 (API 키 불필요)\n */\n\nimport { createWorker } from \"tesseract.js\"\nimport type { OcrProvider } from \"../types.js\"\n\n/**\n * Tesseract.js OcrProvider 생성.\n *\n * 워커를 1회 생성하여 재사용 (매 페이지마다 초기화 방지).\n * 문서 처리 완료 후 terminate()로 워커 정리.\n *\n * @returns OcrProvider 함수 (+ terminate 메서드)\n */\nexport async function createTesseractProvider(): Promise<OcrProvider & { terminate: () => Promise<void> }> {\n // kor+eng: 한글 + 영문 동시 인식 (한국 공문서 특성)\n const worker = await createWorker(\"kor+eng\")\n let terminated = false\n\n const provider = async (\n pageImage: Uint8Array,\n _pageNumber: number,\n _mimeType: \"image/png\"\n ): Promise<string> => {\n const { data } = await worker.recognize(pageImage)\n return data.text\n }\n\n ;(provider as OcrProvider & { terminate: () => Promise<void> }).terminate = async () => {\n if (!terminated) {\n await worker.terminate()\n terminated = true\n }\n }\n\n return provider as OcrProvider & { terminate: () => Promise<void> }\n}\n"],"mappings":";;;;AAYA,SAAS,oBAAoB;AAW7B,eAAsB,0BAAqF;AAEzG,QAAM,SAAS,MAAM,aAAa,SAAS;AAC3C,MAAI,aAAa;AAEjB,QAAM,WAAW,OACf,WACA,aACA,cACoB;AACpB,UAAM,EAAE,KAAK,IAAI,MAAM,OAAO,UAAU,SAAS;AACjD,WAAO,KAAK;AAAA,EACd;AAEC,EAAC,SAA8D,YAAY,YAAY;AACtF,QAAI,CAAC,YAAY;AACf,YAAM,OAAO,UAAU;AACvB,mBAAa;AAAA,IACf;AAAA,EACF;AAEA,SAAO;AACT;","names":[]}
|
|
@@ -8,7 +8,7 @@ import {
|
|
|
8
8
|
sanitizeError,
|
|
9
9
|
sanitizeHref,
|
|
10
10
|
toArrayBuffer
|
|
11
|
-
} from "./chunk-
|
|
11
|
+
} from "./chunk-UPJWEES3.js";
|
|
12
12
|
import "./chunk-ZWE3DS7E.js";
|
|
13
13
|
export {
|
|
14
14
|
KordocError,
|
|
@@ -20,4 +20,4 @@ export {
|
|
|
20
20
|
sanitizeHref,
|
|
21
21
|
toArrayBuffer
|
|
22
22
|
};
|
|
23
|
-
//# sourceMappingURL=utils-
|
|
23
|
+
//# sourceMappingURL=utils-TPAR37RJ.js.map
|
|
@@ -2,10 +2,10 @@
|
|
|
2
2
|
import {
|
|
3
3
|
detectFormat,
|
|
4
4
|
parse
|
|
5
|
-
} from "./chunk-
|
|
5
|
+
} from "./chunk-64QPUEYH.js";
|
|
6
6
|
import {
|
|
7
7
|
toArrayBuffer
|
|
8
|
-
} from "./chunk-
|
|
8
|
+
} from "./chunk-UPJWEES3.js";
|
|
9
9
|
import "./chunk-MOL7MDBG.js";
|
|
10
10
|
import "./chunk-ZWE3DS7E.js";
|
|
11
11
|
|
|
@@ -126,4 +126,4 @@ async function sendWebhook(url, payload) {
|
|
|
126
126
|
export {
|
|
127
127
|
watchDirectory
|
|
128
128
|
};
|
|
129
|
-
//# sourceMappingURL=watch-
|
|
129
|
+
//# sourceMappingURL=watch-FEW5NWVC.js.map
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@clazic/kordoc",
|
|
3
|
-
"version": "2.2.
|
|
3
|
+
"version": "2.2.2",
|
|
4
4
|
"description": "Parse Korean documents (HWP, HWPX, PDF, XLSX, DOCX) to Markdown",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"exports": {
|
|
@@ -56,12 +56,14 @@
|
|
|
56
56
|
},
|
|
57
57
|
"dependencies": {
|
|
58
58
|
"@modelcontextprotocol/sdk": "^1.28.0",
|
|
59
|
+
"@napi-rs/canvas": "^0.1.97",
|
|
59
60
|
"@xmldom/xmldom": "^0.9.8",
|
|
60
61
|
"cfb": "1.2.2",
|
|
61
62
|
"commander": "^13.0.0",
|
|
62
63
|
"exceljs": "^4.4.0",
|
|
63
64
|
"jszip": "^3.10.1",
|
|
64
65
|
"pdfjs-dist": "^4.10.38",
|
|
66
|
+
"tesseract.js": "^7.0.0",
|
|
65
67
|
"zod": "^3.23.0"
|
|
66
68
|
},
|
|
67
69
|
"devDependencies": {
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/ocr/auto-detect.ts","../src/ocr/cli-provider.ts","../src/ocr/resolve.ts"],"sourcesContent":["/**\n * OCR CLI 자동 탐색\n *\n * 탐색 순서: gemini → claude → codex → ollama → tesseract.js\n * CLI는 which(unix) / where(win) 명령어로 PATH 존재 확인.\n * tesseract.js는 createRequire로 resolve 시도.\n */\n\nimport { execSync } from \"child_process\"\nimport { createRequire } from \"module\"\nimport type { OcrMode } from \"../types.js\"\n\n/** CLI 탐색 우선순위 */\nconst CLI_PRIORITY = [\"gemini\", \"claude\", \"codex\", \"ollama\"] as const\n\n/**\n * 시스템에 설치된 OCR 도구를 우선순위대로 탐색.\n * @returns 사용 가능한 OcrMode 또는 null\n */\nexport function detectAvailableOcr(): OcrMode | null {\n // 1. CLI 프로그램 탐색 (gemini → claude → codex → ollama)\n for (const cli of CLI_PRIORITY) {\n if (isCliInstalled(cli)) return cli\n }\n\n // 2. tesseract.js npm 패키지 탐색\n if (isTesseractAvailable()) return \"tesseract\"\n\n return null\n}\n\n/**\n * 특정 CLI가 시스템 PATH에 있는지 확인.\n * which(unix) 또는 where(win32) 사용.\n */\nfunction isCliInstalled(name: string): boolean {\n try {\n const cmd = process.platform === \"win32\" ? 
\"where\" : \"which\"\n execSync(`${cmd} ${name}`, { stdio: \"ignore\", timeout: 3000 })\n return true\n } catch {\n return false\n }\n}\n\n/**\n * tesseract.js npm 패키지가 설치되어 있는지 확인.\n * ESM 환경에서 createRequire를 사용하여 resolve 시도.\n */\nfunction isTesseractAvailable(): boolean {\n try {\n const require = createRequire(import.meta.url)\n require.resolve(\"tesseract.js\")\n return true\n } catch {\n return false\n }\n}\n\n/**\n * 수동 지정된 OcrMode 유효성 검증.\n * --ocr gemini 등 강제 지정 시 호출.\n * @throws 해당 CLI/패키지가 설치되지 않은 경우 Error\n */\nexport function validateOcrMode(mode: OcrMode): void {\n if (mode === \"auto\" || mode === \"off\") return\n\n if (mode === \"tesseract\") {\n if (!isTesseractAvailable()) {\n throw new Error(\n \"tesseract.js가 설치되지 않았습니다.\\n\" +\n \"설치: npm install tesseract.js\"\n )\n }\n return\n }\n\n if (!isCliInstalled(mode)) {\n throw new Error(`'${mode}' CLI가 설치되지 않았습니다.\\n${getInstallGuide(mode)}`)\n }\n}\n\n/** CLI별 설치 안내 메시지 */\nfunction getInstallGuide(mode: string): string {\n const guides: Record<string, string> = {\n gemini: \"설치: https://ai.google.dev/gemini-api/docs/cli\",\n claude: \"설치: npm install -g @anthropic-ai/claude-code 또는 https://claude.ai/code\",\n codex: \"설치: npm install -g @openai/codex 또는 https://github.com/openai/codex\",\n ollama: \"설치: brew install ollama 또는 https://ollama.com/download\",\n }\n return guides[mode] || `'${mode}'을(를) 설치해주세요.`\n}\n\n/**\n * OCR 도구를 하나도 찾지 못했을 때 표시할 에러 메시지.\n */\nexport function getNoOcrErrorMessage(): string {\n return [\n \"이미지 기반 PDF입니다. 
OCR을 위해 다음 중 하나를 설치하세요:\",\n \"\",\n \" [권장] Gemini CLI: https://ai.google.dev/gemini-api/docs/cli\",\n \" Claude CLI: npm install -g @anthropic-ai/claude-code\",\n \" Codex CLI: npm install -g @openai/codex\",\n \" Ollama: brew install ollama (+ ollama pull gemma4:27b)\",\n \" Tesseract: npm install tesseract.js\",\n \"\",\n \"설치 후 다시 실행하면 자동으로 감지됩니다.\",\n \"특정 도구 지정: kordoc parse <파일> --ocr gemini\",\n ].join(\"\\n\")\n}\n","/**\n * CLI 기반 OCR 프로바이더\n *\n * gemini / claude / codex / ollama CLI를 subprocess로 호출하여\n * PDF 페이지 이미지를 Markdown으로 변환.\n *\n * 이미지 전달 방식:\n * - gemini: -p \"프롬프트 @이미지경로\" (@ 파일 참조)\n * - claude: -p \"프롬프트 @이미지경로\" (@ 파일 참조, --print 모드)\n * - codex: exec -i 이미지경로 \"프롬프트\" (-i/--image 플래그)\n * - ollama: REST API (localhost:11434) — CLI는 이미지 입력 미지원\n */\n\nimport { spawnSync } from \"child_process\"\nimport { writeFileSync, unlinkSync, mkdtempSync } from \"fs\"\nimport { join } from \"path\"\nimport { tmpdir } from \"os\"\nimport type { OcrMode, StructuredOcrResult } from \"../types.js\"\n\n/** OCR 프롬프트 — 모든 CLI 공통 */\nconst OCR_PROMPT =\n \"이 PDF 페이지 이미지에서 텍스트와 테이블을 추출하여 순수 Markdown으로 변환하세요.\\n\" +\n \"규칙:\\n\" +\n \"- 테이블은 Markdown 테이블 문법 사용 (| 구분, |---|---| 헤더 구분선 포함)\\n\" +\n \"- 병합된 셀은 해당 위치에 내용 기재\\n\" +\n \"- 헤딩은 글자 크기에 따라 ## ~ ###### 사용\\n\" +\n \"- 리스트는 - 또는 1. 
사용\\n\" +\n \"- 이미지, 도형 등 비텍스트 요소는 무시\\n\" +\n \"- 원문의 읽기 순서와 구조를 유지\\n\" +\n \"- ```로 감싸지 말고 순수 Markdown만 출력\"\n\n/** 임시 디렉토리 (프로세스당 1회 생성) */\nlet _tempDir: string | null = null\nfunction getTempDir(): string {\n if (!_tempDir) _tempDir = mkdtempSync(join(tmpdir(), \"kordoc-ocr-\"))\n return _tempDir\n}\n\n/**\n * CLI OcrProvider 생성.\n *\n * @param mode - 사용할 CLI (gemini, claude, codex, ollama)\n * @returns OcrProvider 함수 (StructuredOcrResult 반환)\n */\nexport function createCliOcrProvider(\n mode: Exclude<OcrMode, \"auto\" | \"off\" | \"tesseract\">\n): (pageImage: Uint8Array, pageNumber: number, mimeType: \"image/png\") => Promise<StructuredOcrResult> {\n return async (pageImage: Uint8Array, pageNumber: number): Promise<StructuredOcrResult> => {\n const tempPath = join(getTempDir(), `page-${pageNumber}.png`)\n\n try {\n writeFileSync(tempPath, pageImage)\n\n let output: string\n if (mode === \"ollama\") {\n output = await callOllamaApi(tempPath)\n } else {\n output = callCli(mode, tempPath)\n }\n\n return { markdown: stripCodeFence(output.trim()) }\n } finally {\n try { unlinkSync(tempPath) } catch { /* 임시 파일 정리 실패 무시 */ }\n }\n }\n}\n\n/**\n * CLI 실행 — gemini / claude / codex\n *\n * @throws CLI 실행 실패 또는 타임아웃(60초) 시 Error\n */\nfunction callCli(mode: string, imagePath: string): string {\n const args = buildCliArgs(mode, imagePath)\n\n const result = spawnSync(mode === \"codex\" ? 
\"codex\" : mode, args, {\n encoding: \"utf-8\",\n timeout: 60_000,\n maxBuffer: 10 * 1024 * 1024,\n })\n\n if (result.error) {\n throw new Error(`${mode} CLI 실행 실패: ${result.error.message}`)\n }\n if (result.status !== 0) {\n const errMsg = result.stderr?.trim() || `exit code ${result.status}`\n throw new Error(`${mode} OCR 실패: ${errMsg}`)\n }\n\n return result.stdout || \"\"\n}\n\n/**\n * CLI별 인자 배열 생성.\n *\n * gemini: [\"--prompt\", \"프롬프트 @이미지경로\", \"--yolo\"]\n * - -y/--yolo: 자동 승인 (OCR은 도구 사용 없으므로 실질적 영향 없음)\n * - @ 파일 참조로 이미지를 컨텍스트에 포함\n *\n * claude: [\"--print\", \"프롬프트 @이미지경로\"]\n * - --print(-p): 비대화형 출력 모드\n * - @ 파일 참조로 이미지를 컨텍스트에 포함\n *\n * codex: [\"exec\", \"--image\", 이미지경로, \"프롬프트\"]\n * - exec: 비대화형 실행 서브커맨드\n * - -i/--image: 이미지 첨부 플래그 (codex exec --help 로 확인됨)\n *\n * ⚠️ CLI 버전에 따라 문법이 다를 수 있음. 업데이트 시 --help 재확인 필요.\n */\nfunction buildCliArgs(mode: string, imagePath: string): string[] {\n const promptWithImage = `${OCR_PROMPT}\\n\\n이미지: @${imagePath}`\n const promptOnly = OCR_PROMPT\n\n switch (mode) {\n case \"gemini\":\n return [\"--prompt\", promptWithImage, \"--yolo\"]\n\n case \"claude\":\n return [\"--print\", promptWithImage]\n\n case \"codex\":\n return [\"exec\", \"--image\", imagePath, promptOnly]\n\n default:\n throw new Error(`지원하지 않는 CLI: ${mode}`)\n }\n}\n\n/**\n * Ollama REST API 호출 — CLI는 이미지 입력을 지원하지 않으므로 API 직접 사용.\n *\n * 기본 모델: KORDOC_OLLAMA_MODEL 환경변수 또는 \"gemma4:27b\"\n * 기본 호스트: KORDOC_OLLAMA_HOST 환경변수 또는 \"http://localhost:11434\"\n *\n * @throws Ollama 서버 미실행 또는 응답 오류 시 Error\n */\nasync function callOllamaApi(imagePath: string): Promise<string> {\n const { readFileSync } = await import(\"fs\")\n const imageBase64 = readFileSync(imagePath).toString(\"base64\")\n\n const model = process.env.KORDOC_OLLAMA_MODEL || \"gemma4:27b\"\n const host = process.env.KORDOC_OLLAMA_HOST || \"http://localhost:11434\"\n\n const response = await fetch(`${host}/api/chat`, {\n method: \"POST\",\n headers: { \"Content-Type\": 
\"application/json\" },\n body: JSON.stringify({\n model,\n messages: [{\n role: \"user\",\n content: OCR_PROMPT,\n images: [imageBase64],\n }],\n stream: false,\n }),\n signal: AbortSignal.timeout(60_000),\n })\n\n if (!response.ok) {\n throw new Error(`Ollama API 오류: ${response.status} ${response.statusText}`)\n }\n\n const data = await response.json() as { message?: { content?: string } }\n return data.message?.content || \"\"\n}\n\n/**\n * LLM 출력에서 코드 펜스 제거.\n * LLM이 가끔 결과를 ```markdown ... ``` 으로 감싸는 경우 처리.\n */\nfunction stripCodeFence(text: string): string {\n const match = text.match(/^```(?:markdown|md)?\\s*\\n([\\s\\S]*?)\\n```\\s*$/m)\n return match ? match[1].trim() : text\n}\n","/**\n * OCR 프로바이더 팩토리\n *\n * ocrMode에 따라 적절한 OcrProvider를 생성하여 반환.\n * - \"auto\": 설치된 CLI 자동 탐색 (gemini → claude → codex → ollama → tesseract)\n * - 특정 CLI: 해당 CLI 사용 (미설치 시 에러)\n * - \"off\": 에러 throw\n */\n\nimport type { OcrMode, OcrProvider, ParseWarning } from \"../types.js\"\nimport { detectAvailableOcr, validateOcrMode } from \"./auto-detect.js\"\nimport { createCliOcrProvider } from \"./cli-provider.js\"\n\n/**\n * ocrMode에 따라 OcrProvider를 생성.\n *\n * @param mode - OCR 모드\n * @param warnings - 경고 수집 배열 (fallback 발생 시 경고 추가)\n * @returns OcrProvider 함수\n * @throws 사용 가능한 OCR 도구가 없거나 mode=\"off\"인 경우\n */\nexport async function resolveOcrProvider(\n mode: OcrMode,\n warnings?: ParseWarning[]\n): Promise<OcrProvider> {\n if (mode === \"off\") {\n throw new Error(\"OCR이 비활성화되어 있습니다 (--ocr off).\")\n }\n\n // ── 수동 지정 모드 ──────────────────────────────────\n if (mode !== \"auto\") {\n validateOcrMode(mode)\n\n if (mode === \"tesseract\") {\n const { createTesseractProvider } = await import(\"./tesseract-provider.js\")\n return createTesseractProvider()\n }\n\n return createCliOcrProvider(mode)\n }\n\n // ── 자동 탐색 모드 ───────────────────────────────────\n const detected = detectAvailableOcr()\n\n if (!detected) {\n throw new Error(\"사용 가능한 OCR 도구를 찾을 수 없습니다.\")\n }\n\n // 
gemini가 아닌 경우 fallback 경고\n if (detected !== \"gemini\") {\n warnings?.push({\n message: `OCR: '${detected}' 사용 중 (gemini CLI가 없어 fallback). 더 나은 품질을 위해 gemini CLI 설치를 권장합니다.`,\n code: \"OCR_CLI_FALLBACK\",\n })\n }\n\n if (detected === \"tesseract\") {\n warnings?.push({\n message: \"tesseract.js는 테이블 구조를 복원하지 못합니다. Vision LLM CLI(gemini/claude/codex) 설치를 권장합니다.\",\n code: \"OCR_CLI_FALLBACK\",\n })\n const { createTesseractProvider } = await import(\"./tesseract-provider.js\")\n return createTesseractProvider()\n }\n\n return createCliOcrProvider(detected)\n}\n"],"mappings":";;;;AAQA,SAAS,gBAAgB;AACzB,SAAS,qBAAqB;AAI9B,IAAM,eAAe,CAAC,UAAU,UAAU,SAAS,QAAQ;AAMpD,SAAS,qBAAqC;AAEnD,aAAW,OAAO,cAAc;AAC9B,QAAI,eAAe,GAAG,EAAG,QAAO;AAAA,EAClC;AAGA,MAAI,qBAAqB,EAAG,QAAO;AAEnC,SAAO;AACT;AAMA,SAAS,eAAe,MAAuB;AAC7C,MAAI;AACF,UAAM,MAAM,QAAQ,aAAa,UAAU,UAAU;AACrD,aAAS,GAAG,GAAG,IAAI,IAAI,IAAI,EAAE,OAAO,UAAU,SAAS,IAAK,CAAC;AAC7D,WAAO;AAAA,EACT,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAMA,SAAS,uBAAgC;AACvC,MAAI;AACF,UAAMA,WAAU,cAAc,YAAY,GAAG;AAC7C,IAAAA,SAAQ,QAAQ,cAAc;AAC9B,WAAO;AAAA,EACT,QAAQ;AACN,WAAO;AAAA,EACT;AACF;AAOO,SAAS,gBAAgB,MAAqB;AACnD,MAAI,SAAS,UAAU,SAAS,MAAO;AAEvC,MAAI,SAAS,aAAa;AACxB,QAAI,CAAC,qBAAqB,GAAG;AAC3B,YAAM,IAAI;AAAA,QACR;AAAA,MAEF;AAAA,IACF;AACA;AAAA,EACF;AAEA,MAAI,CAAC,eAAe,IAAI,GAAG;AACzB,UAAM,IAAI,MAAM,IAAI,IAAI;AAAA,EAAuB,gBAAgB,IAAI,CAAC,EAAE;AAAA,EACxE;AACF;AAGA,SAAS,gBAAgB,MAAsB;AAC7C,QAAM,SAAiC;AAAA,IACrC,QAAQ;AAAA,IACR,QAAQ;AAAA,IACR,OAAQ;AAAA,IACR,QAAQ;AAAA,EACV;AACA,SAAO,OAAO,IAAI,KAAK,IAAI,IAAI;AACjC;;;AC9EA,SAAS,iBAAiB;AAC1B,SAAS,eAAe,YAAY,mBAAmB;AACvD,SAAS,YAAY;AACrB,SAAS,cAAc;AAIvB,IAAM,aACJ;AAWF,IAAI,WAA0B;AAC9B,SAAS,aAAqB;AAC5B,MAAI,CAAC,SAAU,YAAW,YAAY,KAAK,OAAO,GAAG,aAAa,CAAC;AACnE,SAAO;AACT;AAQO,SAAS,qBACd,MACoG;AACpG,SAAO,OAAO,WAAuB,eAAqD;AACxF,UAAM,WAAW,KAAK,WAAW,GAAG,QAAQ,UAAU,MAAM;AAE5D,QAAI;AACF,oBAAc,UAAU,SAAS;AAEjC,UAAI;AACJ,UAAI,SAAS,UAAU;AACrB,iBAAS,MAAM,cAAc,QAAQ;AAAA,MACvC,OAAO;AACL,iBAAS,QAAQ,MAAM,QAAQ;AAAA,MACjC;AAEA,aAAO,EAAE,UAAU,eAAe,
OAAO,KAAK,CAAC,EAAE;AAAA,IACnD,UAAE;AACA,UAAI;AAAE,mBAAW,QAAQ;AAAA,MAAE,QAAQ;AAAA,MAAuB;AAAA,IAC5D;AAAA,EACF;AACF;AAOA,SAAS,QAAQ,MAAc,WAA2B;AACxD,QAAM,OAAO,aAAa,MAAM,SAAS;AAEzC,QAAM,SAAS,UAAU,SAAS,UAAU,UAAU,MAAM,MAAM;AAAA,IAChE,UAAU;AAAA,IACV,SAAS;AAAA,IACT,WAAW,KAAK,OAAO;AAAA,EACzB,CAAC;AAED,MAAI,OAAO,OAAO;AAChB,UAAM,IAAI,MAAM,GAAG,IAAI,mCAAe,OAAO,MAAM,OAAO,EAAE;AAAA,EAC9D;AACA,MAAI,OAAO,WAAW,GAAG;AACvB,UAAM,SAAS,OAAO,QAAQ,KAAK,KAAK,aAAa,OAAO,MAAM;AAClE,UAAM,IAAI,MAAM,GAAG,IAAI,sBAAY,MAAM,EAAE;AAAA,EAC7C;AAEA,SAAO,OAAO,UAAU;AAC1B;AAmBA,SAAS,aAAa,MAAc,WAA6B;AAC/D,QAAM,kBAAkB,GAAG,UAAU;AAAA;AAAA,uBAAa,SAAS;AAC3D,QAAM,aAAa;AAEnB,UAAQ,MAAM;AAAA,IACZ,KAAK;AACH,aAAO,CAAC,YAAY,iBAAiB,QAAQ;AAAA,IAE/C,KAAK;AACH,aAAO,CAAC,WAAW,eAAe;AAAA,IAEpC,KAAK;AACH,aAAO,CAAC,QAAQ,WAAW,WAAW,UAAU;AAAA,IAElD;AACE,YAAM,IAAI,MAAM,8CAAgB,IAAI,EAAE;AAAA,EAC1C;AACF;AAUA,eAAe,cAAc,WAAoC;AAC/D,QAAM,EAAE,aAAa,IAAI,MAAM,OAAO,IAAI;AAC1C,QAAM,cAAc,aAAa,SAAS,EAAE,SAAS,QAAQ;AAE7D,QAAM,QAAQ,QAAQ,IAAI,uBAAuB;AACjD,QAAM,OAAO,QAAQ,IAAI,sBAAsB;AAE/C,QAAM,WAAW,MAAM,MAAM,GAAG,IAAI,aAAa;AAAA,IAC/C,QAAQ;AAAA,IACR,SAAS,EAAE,gBAAgB,mBAAmB;AAAA,IAC9C,MAAM,KAAK,UAAU;AAAA,MACnB;AAAA,MACA,UAAU,CAAC;AAAA,QACT,MAAM;AAAA,QACN,SAAS;AAAA,QACT,QAAQ,CAAC,WAAW;AAAA,MACtB,CAAC;AAAA,MACD,QAAQ;AAAA,IACV,CAAC;AAAA,IACD,QAAQ,YAAY,QAAQ,GAAM;AAAA,EACpC,CAAC;AAED,MAAI,CAAC,SAAS,IAAI;AAChB,UAAM,IAAI,MAAM,4BAAkB,SAAS,MAAM,IAAI,SAAS,UAAU,EAAE;AAAA,EAC5E;AAEA,QAAM,OAAO,MAAM,SAAS,KAAK;AACjC,SAAO,KAAK,SAAS,WAAW;AAClC;AAMA,SAAS,eAAe,MAAsB;AAC5C,QAAM,QAAQ,KAAK,MAAM,+CAA+C;AACxE,SAAO,QAAQ,MAAM,CAAC,EAAE,KAAK,IAAI;AACnC;;;ACxJA,eAAsB,mBACpB,MACA,UACsB;AACtB,MAAI,SAAS,OAAO;AAClB,UAAM,IAAI,MAAM,sFAA+B;AAAA,EACjD;AAGA,MAAI,SAAS,QAAQ;AACnB,oBAAgB,IAAI;AAEpB,QAAI,SAAS,aAAa;AACxB,YAAM,EAAE,wBAAwB,IAAI,MAAM,OAAO,kCAAyB;AAC1E,aAAO,wBAAwB;AAAA,IACjC;AAEA,WAAO,qBAAqB,IAAI;AAAA,EAClC;AAGA,QAAM,WAAW,mBAAmB;AAEpC,MAAI,CAAC,UAAU;AACb,UAAM,IAAI,MAAM,sGAA2B;AAAA,EAC7C;AAGA,MAAI,aAAa,UAAU;AACzB,cAAU,KAAK;AAAA,MACb,SAAS,SAAS,QAAQ;AAAA,MAC1B,MAAM;AAAA,IACR,CAAC;
AAAA,EACH;AAEA,MAAI,aAAa,aAAa;AAC5B,cAAU,KAAK;AAAA,MACb,SAAS;AAAA,MACT,MAAM;AAAA,IACR,CAAC;AACD,UAAM,EAAE,wBAAwB,IAAI,MAAM,OAAO,kCAAyB;AAC1E,WAAO,wBAAwB;AAAA,EACjC;AAEA,SAAO,qBAAqB,QAAQ;AACtC;","names":["require"]}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/ocr/tesseract-provider.ts"],"sourcesContent":["/**\n * Tesseract.js 기반 OCR 프로바이더\n *\n * tesseract.js를 선택적 의존성으로 동적 import하여 텍스트 추출.\n * Vision LLM CLI가 없을 때의 최후 fallback.\n *\n * 제한사항:\n * - 순수 텍스트만 반환 (테이블/헤딩 구조 복원 불가)\n * - 한글 인식률 약 85-90% (깨끗한 이미지 기준)\n * - 설치 필요: npm install tesseract.js\n */\n\nimport type { OcrProvider } from \"../types.js\"\n\n/**\n * Tesseract.js OcrProvider 생성.\n *\n * 워커를 1회 생성하여 재사용 (매 페이지마다 초기화 방지).\n * 문서 처리 완료 후 terminate()로 워커 정리 필요.\n *\n * @returns OcrProvider 함수\n * @throws tesseract.js 미설치 시 Error\n */\nexport async function createTesseractProvider(): Promise<OcrProvider & { terminate: () => Promise<void> }> {\n // 동적 import — 미설치 시 명확한 에러\n let tesseract: typeof import(\"tesseract.js\")\n try {\n tesseract = await import(\"tesseract.js\")\n } catch {\n throw new Error(\n \"tesseract.js가 설치되지 않았습니다.\\n\" +\n \"설치: npm install tesseract.js\"\n )\n }\n\n // kor+eng: 한글 + 영문 동시 인식 (한국 공문서 특성)\n const worker = await tesseract.createWorker(\"kor+eng\")\n let terminated = false\n\n const provider = async (\n pageImage: Uint8Array,\n _pageNumber: number,\n _mimeType: \"image/png\"\n ): Promise<string> => {\n const { data } = await worker.recognize(pageImage)\n return data.text\n }\n\n // 외부에서 호출하여 워커 정리\n ;(provider as OcrProvider & { terminate: () => Promise<void> }).terminate = async () => {\n if (!terminated) {\n await worker.terminate()\n terminated = true\n }\n }\n\n return provider as OcrProvider & { terminate: () => Promise<void> }\n}\n"],"mappings":";;;;AAuBA,eAAsB,0BAAqF;AAEzG,MAAI;AACJ,MAAI;AACF,gBAAY,MAAM,OAAO,cAAc;AAAA,EACzC,QAAQ;AACN,UAAM,IAAI;AAAA,MACR;AAAA,IAEF;AAAA,EACF;AAGA,QAAM,SAAS,MAAM,UAAU,aAAa,SAAS;AACrD,MAAI,aAAa;AAEjB,QAAM,WAAW,OACf,WACA,aACA,cACoB;AACpB,UAAM,EAAE,KAAK,IAAI,MAAM,OAAO,UAAU,SAAS;AACjD,WAAO,KAAK;AAAA,EACd;AAGC,EAAC,SAA8D,YAAY,YAAY;AACtF,QAAI,CAAC,YAAY;AACf,YAAM,OAAO,UAAU;AACvB,mBAAa;AAAA,IACf;AAAA,EACF;AAEA,SAAO;AACT;","names":[]}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|
|
File without changes
|