@clazic/kordoc 2.4.11 → 2.4.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -0
- package/dist/{chunk-PJSXZBZB.js → chunk-5R37N6KE.js} +19 -4
- package/dist/chunk-5R37N6KE.js.map +1 -0
- package/dist/chunk-I6YC6ZGK.js +219 -0
- package/dist/chunk-I6YC6ZGK.js.map +1 -0
- package/dist/{chunk-JGMLDBW5.js → chunk-KJEZPVEK.js} +680 -301
- package/dist/chunk-KJEZPVEK.js.map +1 -0
- package/dist/cli.js +68 -8
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1678 -329
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +121 -1
- package/dist/index.d.ts +121 -1
- package/dist/index.js +1656 -310
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +11 -2
- package/dist/mcp.js.map +1 -1
- package/dist/{provider-PYZL2VNN.js → provider-T2D5XRTI.js} +30 -2
- package/dist/provider-T2D5XRTI.js.map +1 -0
- package/dist/{resolve-4I65IGMM.js → resolve-673XFZQ6.js} +18 -1
- package/dist/resolve-673XFZQ6.js.map +1 -0
- package/dist/{utils-HKVOS2O3.js → utils-XLLXVB7V.js} +4 -2
- package/dist/{watch-EYOGF3HY.js → watch-SOMS2KR7.js} +4 -3
- package/dist/{watch-EYOGF3HY.js.map → watch-SOMS2KR7.js.map} +1 -1
- package/package.json +2 -1
- package/dist/chunk-JGMLDBW5.js.map +0 -1
- package/dist/chunk-PJSXZBZB.js.map +0 -1
- package/dist/provider-PYZL2VNN.js.map +0 -1
- package/dist/resolve-4I65IGMM.js.map +0 -1
- /package/dist/{utils-HKVOS2O3.js.map → utils-XLLXVB7V.js.map} +0 -0
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/utils.ts"],"sourcesContent":["/** kordoc 공용 유틸리티 */\n\n/** 빌드 타임에 tsup define으로 주입되는 버전 */\ndeclare const __KORDOC_VERSION__: string\nexport const VERSION: string = typeof __KORDOC_VERSION__ !== \"undefined\" ? __KORDOC_VERSION__ : \"0.0.0-dev\"\n\n/**\n * Node.js Buffer → ArrayBuffer 변환\n * pool Buffer의 공유 ArrayBuffer 문제를 안전하게 처리.\n * offset=0이고 전체 ArrayBuffer를 차지하면 복사 없이 직접 반환.\n */\nexport function toArrayBuffer(buf: Buffer): ArrayBuffer {\n if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {\n return buf.buffer as ArrayBuffer\n }\n return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength) as ArrayBuffer\n}\n\n/**\n * kordoc 내부 에러 클래스 — 사용자에게 노출해도 안전한 메시지만 포함.\n * MCP 에러 정제에서 instanceof로 판별하여 allowlist 패턴 매칭 없이 안전하게 통과.\n */\nexport class KordocError extends Error {\n constructor(message: string) {\n super(message)\n this.name = \"KordocError\"\n }\n}\n\n/**\n * 에러 메시지 정제 — KordocError는 그대로, 나머지는 일반 메시지로 대체.\n * 파일시스템 경로, 스택 트레이스 등 내부 정보 노출 방지.\n */\nexport function sanitizeError(err: unknown): string {\n if (err instanceof KordocError) return err.message\n return \"문서 처리 중 오류가 발생했습니다\"\n}\n\n/**\n * ZIP 엔트리 경로의 경로 순회 여부 판별.\n * 백슬래시 정규화, .., 절대경로, Windows 드라이브 문자 모두 차단.\n */\nexport function isPathTraversal(name: string): boolean {\n if (name.includes(\"\\x00\")) return true\n const normalized = name.replace(/\\\\/g, \"/\")\n return normalized.includes(\"..\") || normalized.startsWith(\"/\") || /^[A-Za-z]:/.test(normalized)\n}\n\n// ─── ZIP 안전 로딩 (ZIP bomb 방지) ────────────────────\n\n/**\n * ZIP bomb 사전 검사 — Central Directory에서 비압축 합계와 엔트리 수 확인.\n * HWPX/XLSX/DOCX 등 모든 ZIP 기반 포맷에서 공통 사용.\n */\nexport function precheckZipSize(\n buffer: ArrayBuffer,\n maxUncompressedSize = 100 * 1024 * 1024,\n maxEntries = 500,\n): { totalUncompressed: number; entryCount: number } {\n try {\n const data = new DataView(buffer)\n const len = buffer.byteLength\n // EOCD 시그니처 역방향 스캔\n let eocdOffset = -1\n for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {\n if (data.getUint32(i, true) === 0x06054b50) { eocdOffset = i; break }\n }\n if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 }\n\n const entryCount = data.getUint16(eocdOffset + 10, true)\n if (entryCount > maxEntries) {\n throw new KordocError(`ZIP 엔트리 수 초과: ${entryCount} (최대 ${maxEntries})`)\n }\n\n const cdSize = data.getUint32(eocdOffset + 12, true)\n const cdOffset = data.getUint32(eocdOffset + 16, true)\n if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount }\n\n let totalUncompressed = 0\n let pos = cdOffset\n for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {\n if (data.getUint32(pos, true) !== 0x02014b50) break\n totalUncompressed += data.getUint32(pos + 24, true)\n const nameLen = data.getUint16(pos + 28, true)\n const extraLen = data.getUint16(pos + 30, true)\n const commentLen = data.getUint16(pos + 32, true)\n pos += 46 + nameLen + extraLen + commentLen\n }\n\n if (totalUncompressed > maxUncompressedSize) {\n throw new KordocError(`ZIP 비압축 크기 초과: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (최대 ${maxUncompressedSize / 1024 / 1024}MB)`)\n }\n\n return { totalUncompressed, entryCount }\n } catch (err) {\n if (err instanceof KordocError) throw err\n return { totalUncompressed: 0, entryCount: 0 }\n }\n}\n\n/** 하이퍼링크 URL 살균 — javascript: 등 XSS 위험 스킴 차단 */\nconst SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i\nexport function sanitizeHref(href: string): string | null {\n const trimmed = href.trim()\n if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null\n return trimmed\n}\n\n// ─── 에러 분류 ──────────────────────────────────────\n\nimport type { ErrorCode } from \"./types.js\"\n\n/** 에러를 구조화된 ErrorCode로 분류 — KordocError 메시지 패턴 매칭 */\nexport function classifyError(err: unknown): ErrorCode {\n if (!(err instanceof Error)) return \"PARSE_ERROR\"\n const msg = err.message\n if (msg.includes(\"암호화\")) return \"ENCRYPTED\"\n if (msg.includes(\"DRM\")) return \"DRM_PROTECTED\"\n if (msg.includes(\"ZIP bomb\") || msg.includes(\"ZIP 비압축 크기 초과\") || msg.includes(\"ZIP 엔트리 수 초과\")) return \"ZIP_BOMB\"\n if (msg.includes(\"bomb\") || msg.includes(\"크기 초과\") || msg.includes(\"압축 해제\")) return \"DECOMPRESSION_BOMB\"\n if (msg.includes(\"이미지 기반\")) return \"IMAGE_BASED_PDF\"\n if (msg.includes(\"섹션\") && (msg.includes(\"찾을 수 없\") || msg.includes(\"없음\"))) return \"NO_SECTIONS\"\n if (msg.includes(\"시그니처\") || msg.includes(\"복구할 수 없\")) return \"CORRUPTED\"\n return \"PARSE_ERROR\"\n}\n"],"mappings":";;;AAIO,IAAM,UAAkB,OAA4C,WAAqB;AAOzF,SAAS,cAAc,KAA0B;AACtD,MAAI,IAAI,eAAe,KAAK,IAAI,eAAe,IAAI,OAAO,YAAY;AACpE,WAAO,IAAI;AAAA,EACb;AACA,SAAO,IAAI,OAAO,MAAM,IAAI,YAAY,IAAI,aAAa,IAAI,UAAU;AACzE;AAMO,IAAM,cAAN,cAA0B,MAAM;AAAA,EACrC,YAAY,SAAiB;AAC3B,UAAM,OAAO;AACb,SAAK,OAAO;AAAA,EACd;AACF;AAMO,SAAS,cAAc,KAAsB;AAClD,MAAI,eAAe,YAAa,QAAO,IAAI;AAC3C,SAAO;AACT;AAMO,SAAS,gBAAgB,MAAuB;AACrD,MAAI,KAAK,SAAS,IAAM,EAAG,QAAO;AAClC,QAAM,aAAa,KAAK,QAAQ,OAAO,GAAG;AAC1C,SAAO,WAAW,SAAS,IAAI,KAAK,WAAW,WAAW,GAAG,KAAK,aAAa,KAAK,UAAU;AAChG;AAQO,SAAS,gBACd,QACA,sBAAsB,MAAM,OAAO,MACnC,aAAa,KACsC;AACnD,MAAI;AACF,UAAM,OAAO,IAAI,SAAS,MAAM;AAChC,UAAM,MAAM,OAAO;AAEnB,QAAI,aAAa;AACjB,aAAS,IAAI,MAAM,IAAI,KAAK,KAAK,IAAI,GAAG,MAAM,KAAK,GAAG,KAAK;AACzD,UAAI,KAAK,UAAU,GAAG,IAAI,MAAM,WAAY;AAAE,qBAAa;AAAG;AAAA,MAAM;AAAA,IACtE;AACA,QAAI,aAAa,EAAG,QAAO,EAAE,mBAAmB,GAAG,YAAY,EAAE;AAEjE,UAAM,aAAa,KAAK,UAAU,aAAa,IAAI,IAAI;AACvD,QAAI,aAAa,YAAY;AAC3B,YAAM,IAAI,YAAY,+CAAiB,UAAU,kBAAQ,UAAU,GAAG;AAAA,IACxE;AAEA,UAAM,SAAS,KAAK,UAAU,aAAa,IAAI,IAAI;AACnD,UAAM,WAAW,KAAK,UAAU,aAAa,IAAI,IAAI;AACrD,QAAI,WAAW,SAAS,IAAK,QAAO,EAAE,mBAAmB,GAAG,WAAW;AAEvE,QAAI,oBAAoB;AACxB,QAAI,MAAM;AACV,aAAS,IAAI,GAAG,IAAI,cAAc,MAAM,MAAM,WAAW,QAAQ,KAAK;AACpE,UAAI,KAAK,UAAU,KAAK,IAAI,MAAM,SAAY;AAC9C,2BAAqB,KAAK,UAAU,MAAM,IAAI,IAAI;AAClD,YAAM,UAAU,KAAK,UAAU,MAAM,IAAI,IAAI;AAC7C,YAAM,WAAW,KAAK,UAAU,MAAM,IAAI,IAAI;AAC9C,YAAM,aAAa,KAAK,UAAU,MAAM,IAAI,IAAI;AAChD,aAAO,KAAK,UAAU,WAAW;AAAA,IACnC;AAEA,QAAI,oBAAoB,qBAAqB;AAC3C,YAAM,IAAI,YAAY,sDAAmB,oBAAoB,OAAO,MAAM,QAAQ,CAAC,CAAC,oBAAU,sBAAsB,OAAO,IAAI,KAAK;AAAA,IACtI;AAEA,WAAO,EAAE,mBAAmB,WAAW;AAAA,EACzC,SAAS,KAAK;AACZ,QAAI,eAAe,YAAa,OAAM;AACtC,WAAO,EAAE,mBAAmB,GAAG,YAAY,EAAE;AAAA,EAC/C;AACF;AAGA,IAAM,eAAe;AACd,SAAS,aAAa,MAA6B;AACxD,QAAM,UAAU,KAAK,KAAK;AAC1B,MAAI,CAAC,WAAW,CAAC,aAAa,KAAK,OAAO,EAAG,QAAO;AACpD,SAAO;AACT;AAOO,SAAS,cAAc,KAAyB;AACrD,MAAI,EAAE,eAAe,OAAQ,QAAO;AACpC,QAAM,MAAM,IAAI;AAChB,MAAI,IAAI,SAAS,oBAAK,EAAG,QAAO;AAChC,MAAI,IAAI,SAAS,KAAK,EAAG,QAAO;AAChC,MAAI,IAAI,SAAS,UAAU,KAAK,IAAI,SAAS,kDAAe,KAAK,IAAI,SAAS,4CAAc,EAAG,QAAO;AACtG,MAAI,IAAI,SAAS,MAAM,KAAK,IAAI,SAAS,2BAAO,KAAK,IAAI,SAAS,2BAAO,EAAG,QAAO;AACnF,MAAI,IAAI,SAAS,iCAAQ,EAAG,QAAO;AACnC,MAAI,IAAI,SAAS,cAAI,MAAM,IAAI,SAAS,4BAAQ,KAAK,IAAI,SAAS,cAAI,GAAI,QAAO;AACjF,MAAI,IAAI,SAAS,0BAAM,KAAK,IAAI,SAAS,kCAAS,EAAG,QAAO;AAC5D,SAAO;AACT;","names":[]}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/ocr/provider.ts"],"sourcesContent":["/**\n * OCR 프로바이더 브릿지 — PDF 페이지를 이미지로 렌더링하여 OCR 호출\n *\n * kordoc은 OCR 라이브러리를 번들하지 않음.\n * 사용자가 OcrProvider 함수를 제공하면 이미지 기반 PDF도 텍스트 추출 가능.\n *\n * @example\n * ```ts\n * import { parse } from \"kordoc\"\n *\n * const result = await parse(buffer, {\n * ocr: async (pageImage, pageNumber, mimeType) => {\n * // Tesseract, Claude Vision, Google Vision 등 사용\n * return await myOcrService.recognize(pageImage)\n * }\n * })\n * ```\n */\n\nimport type { OcrProvider, IRBlock, ParseWarning, StructuredOcrResult, BatchOcrProvider } from \"../types.js\"\nimport { markdownToBlocks } from \"./markdown-to-blocks.js\"\n\n/**\n * 동시 실행 수를 제한한 병렬 태스크 실행 헬퍼.\n *\n * limit개의 워커를 만들어 tasks 배열을 순서대로 처리.\n * 각 워커는 완료되는 즉시 다음 태스크를 가져가므로 순서가 보존됨.\n *\n * @param tasks - 실행할 비동기 함수 배열\n * @param limit - 최대 동시 실행 수\n * @returns 입력 순서와 동일한 결과 배열\n */\nasync function runWithConcurrency<T>(\n tasks: (() => Promise<T>)[],\n limit: number\n): Promise<T[]> {\n const results: T[] = new Array(tasks.length)\n let nextIndex = 0\n\n // 각 워커는 처리할 태스크가 없을 때까지 반복\n async function worker() {\n while (nextIndex < tasks.length) {\n const idx = nextIndex++\n results[idx] = await tasks[idx]()\n }\n }\n\n // limit개 워커를 동시 실행 (tasks가 limit보다 적으면 tasks 수만큼)\n await Promise.all(Array.from({ length: Math.min(limit, tasks.length) }, () => worker()))\n return results\n}\n\n/**\n * OCR 결과(string | StructuredOcrResult)를 IRBlock[]으로 변환.\n */\nfunction ocrResultToBlocks(result: string | StructuredOcrResult, pageNum: number): IRBlock[] {\n const pageBlocks: IRBlock[] = []\n if (typeof result === \"string\") {\n // 순수 텍스트 → paragraph 블록\n if (result.trim()) {\n pageBlocks.push({ type: \"paragraph\", text: result.trim(), pageNumber: pageNum })\n }\n } else if (result && typeof result === \"object\" && \"markdown\" in result) {\n // 구조화된 결과 → Markdown → IRBlock[]\n const structured = result as StructuredOcrResult\n if (structured.markdown.trim()) {\n const converted = markdownToBlocks(structured.markdown, pageNum)\n for (const b of converted) pageBlocks.push(b)\n }\n }\n return pageBlocks\n}\n\n/** BatchOcrProvider 타입 가드 */\nfunction isBatchProvider(p: unknown): p is BatchOcrProvider {\n return !!p && typeof p === \"object\" && \"__batch\" in p && (p as BatchOcrProvider).__batch === true\n}\n\n/**\n * 이미지 기반 PDF 페이지에 OCR을 적용하여 IRBlock[] 반환.\n *\n * pdfjs page 객체에서 viewport + render를 통해 PNG 생성 후\n * 사용자 제공 OcrProvider 호출.\n *\n * - string 반환: 단순 텍스트 → paragraph 블록\n * - StructuredOcrResult 반환: Markdown → markdownToBlocks()로 구조화\n * - concurrency > 1: 병렬 처리 (워커 풀 프로바이더 권장)\n *\n * canvas 미설치 시 pdfjs render 불가하므로 에러 반환.\n */\nexport async function ocrPages(\n doc: { numPages: number; getPage(n: number): Promise<PdfPageProxy> },\n provider: OcrProvider | BatchOcrProvider,\n pageFilter: Set<number> | null,\n effectivePageCount: number,\n warnings?: ParseWarning[],\n concurrency: number = 1, // 기본값 1 = 순차 처리 (하위 호환)\n onProgress?: (current: number, total: number) => void\n): Promise<IRBlock[]> {\n const blocks: IRBlock[] = []\n\n // ── 배치 처리 (BatchOcrProvider) ────────────────────\n if (isBatchProvider(provider)) {\n return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, onProgress)\n }\n\n // ── 순차 처리 (concurrency === 1) ────────────────────\n if (concurrency <= 1) {\n for (let i = 1; i <= effectivePageCount; i++) {\n if (pageFilter && !pageFilter.has(i)) continue\n const page = await doc.getPage(i)\n try {\n const imageData = await renderPageToPng(page)\n const result = await provider(imageData, i, \"image/png\")\n for (const b of ocrResultToBlocks(result, i)) blocks.push(b)\n } catch (err) {\n // 개별 페이지 실패 시 경고 발행 후 계속 진행\n warnings?.push({\n page: i,\n message: `페이지 ${i} OCR 실패: ${err instanceof Error ? err.message : \"알 수 없는 오류\"}`,\n code: \"OCR_PAGE_FAILED\",\n })\n }\n }\n return blocks\n }\n\n // ── 병렬 처리 (concurrency > 1) ──────────────────────\n // 처리 대상 페이지 번호 수집\n const pageNumbers: number[] = []\n for (let i = 1; i <= effectivePageCount; i++) {\n if (pageFilter && !pageFilter.has(i)) continue\n pageNumbers.push(i)\n }\n\n // 각 페이지에 대한 태스크 생성 (에러는 개별 캐치)\n const tasks = pageNumbers.map(pageNum => async (): Promise<{ pageNum: number; pageBlocks: IRBlock[] } | null> => {\n try {\n const page = await doc.getPage(pageNum)\n const imageData = await renderPageToPng(page)\n const result = await provider(imageData, pageNum, \"image/png\")\n return { pageNum, pageBlocks: ocrResultToBlocks(result, pageNum) }\n } catch (err) {\n // 개별 페이지 실패 시 경고 발행 후 null 반환\n warnings?.push({\n page: pageNum,\n message: `페이지 ${pageNum} OCR 실패: ${err instanceof Error ? err.message : \"알 수 없는 오류\"}`,\n code: \"OCR_PAGE_FAILED\",\n })\n return null\n }\n })\n\n // 병렬 실행 — concurrency 수만큼 동시 처리\n const taskResults = await runWithConcurrency(tasks, concurrency)\n\n // 결과를 페이지 번호 순서대로 합산 (pageNumbers 순서 = 오름차순 보장)\n for (const item of taskResults) {\n if (!item) continue\n for (const b of item.pageBlocks) blocks.push(b)\n }\n\n return blocks\n}\n\n/**\n * 배치 OCR 처리 — BatchOcrProvider를 사용하여 N페이지씩 묶어 처리.\n *\n * concurrency > 1이면 여러 배치를 동시에 실행하여 속도 향상.\n * 예: 5페이지/배치 × 4 동시 = 20페이지 동시 처리.\n */\nasync function ocrPagesBatch(\n doc: { numPages: number; getPage(n: number): Promise<PdfPageProxy> },\n provider: BatchOcrProvider,\n pageFilter: Set<number> | null,\n effectivePageCount: number,\n warnings?: ParseWarning[],\n concurrency: number = 1,\n onProgress?: (current: number, total: number) => void\n): Promise<IRBlock[]> {\n // 1. 대상 페이지 번호 수집\n const pageNumbers: number[] = []\n for (let i = 1; i <= effectivePageCount; i++) {\n if (pageFilter && !pageFilter.has(i)) continue\n pageNumbers.push(i)\n }\n\n // 2. 페이지 번호를 batchSize 단위로 분할\n const pageBatches: number[][] = []\n for (let i = 0; i < pageNumbers.length; i += provider.batchSize) {\n pageBatches.push(pageNumbers.slice(i, i + provider.batchSize))\n }\n\n // 3. 배치 태스크 생성 — 각 배치 내에서 렌더링→처리→해제\n let processed = 0\n type BatchResult = { batchIdx: number; pageBlocks: Array<{pageNum: number; blocks: IRBlock[]}> }\n\n const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async (): Promise<BatchResult> => {\n const pageBlocks: Array<{pageNum: number; blocks: IRBlock[]}> = []\n\n // 렌더링은 try 바깥에서 수행 — 배치 실패 시 단일 재시도에 재사용\n const batchImages: Array<{image: Uint8Array, pageNum: number}> = []\n try {\n for (const pageNum of batchPageNums) {\n const page = await doc.getPage(pageNum)\n const image = await renderPageToPng(page)\n batchImages.push({ image, pageNum })\n }\n } catch (renderErr) {\n // 렌더링 자체 실패 → 해당 페이지 이후 빈 결과\n const rendered = new Set(batchImages.map(b => b.pageNum))\n for (const pageNum of batchPageNums) {\n if (!rendered.has(pageNum)) pageBlocks.push({ pageNum, blocks: [] })\n }\n }\n\n if (batchImages.length > 0) {\n try {\n // provider.processBatch() 호출\n const results = await provider.processBatch(batchImages)\n for (const { pageNum } of batchImages) {\n const result = results.get(pageNum)\n pageBlocks.push({\n pageNum,\n blocks: result ? ocrResultToBlocks(result, pageNum) : [],\n })\n }\n } catch (err) {\n const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`\n warnings?.push({\n message: `배치 OCR 실패 (페이지 ${range}): ${err instanceof Error ? err.message : \"알 수 없는 오류\"} — 단일 페이지로 재시도`,\n code: \"OCR_PAGE_FAILED\",\n })\n // 배치 실패 시 단일 페이지씩 재시도 (같은 엔진)\n for (const { image, pageNum } of batchImages) {\n try {\n const singleResult = await provider.processBatch([{ image, pageNum }])\n const r = singleResult.get(pageNum)\n pageBlocks.push({ pageNum, blocks: r ? ocrResultToBlocks(r, pageNum) : [] })\n } catch {\n pageBlocks.push({ pageNum, blocks: [] })\n }\n }\n }\n }\n // 진행률 갱신 (병렬 실행 중 atomic하지 않지만 표시용으로 충분)\n processed += batchPageNums.length\n onProgress?.(processed, pageNumbers.length)\n return { batchIdx, pageBlocks }\n })\n\n // 5. 병렬 실행 — concurrency개 배치를 동시 처리\n const effectiveConcurrency = Math.max(1, concurrency)\n const batchResults = await runWithConcurrency(batchTasks, effectiveConcurrency)\n\n // 6. 배치 순서대로 블록 합산 (페이지 순서 보존)\n const blocks: IRBlock[] = []\n for (const result of batchResults) {\n for (const { blocks: pageBlks } of result.pageBlocks) {\n for (const b of pageBlks) blocks.push(b)\n }\n }\n\n return blocks\n}\n\ninterface PdfPageProxy {\n getViewport(params: { scale: number }): { width: number; height: number }\n render(params: { canvasContext: unknown; viewport: unknown }): { promise: Promise<void> }\n}\n\n/**\n * PDF 페이지를 PNG로 렌더링.\n * @napi-rs/canvas 사용 (kordoc 번들 의존성, 별도 설치 불필요)\n */\nasync function renderPageToPng(page: PdfPageProxy): Promise<Uint8Array> {\n const { createCanvas } = await import(\"@napi-rs/canvas\")\n\n const scale = 2.0 // 300 DPI 근사\n const viewport = page.getViewport({ scale })\n const canvas = createCanvas(Math.floor(viewport.width), Math.floor(viewport.height))\n const ctx = canvas.getContext(\"2d\")\n\n await page.render({ canvasContext: ctx as unknown, viewport }).promise\n return new Uint8Array(canvas.toBuffer(\"image/png\"))\n}\n"],"mappings":";;;;;;;AAgCA,eAAe,mBACb,OACA,OACc;AACd,QAAM,UAAe,IAAI,MAAM,MAAM,MAAM;AAC3C,MAAI,YAAY;AAGhB,iBAAe,SAAS;AACtB,WAAO,YAAY,MAAM,QAAQ;AAC/B,YAAM,MAAM;AACZ,cAAQ,GAAG,IAAI,MAAM,MAAM,GAAG,EAAE;AAAA,IAClC;AAAA,EACF;AAGA,QAAM,QAAQ,IAAI,MAAM,KAAK,EAAE,QAAQ,KAAK,IAAI,OAAO,MAAM,MAAM,EAAE,GAAG,MAAM,OAAO,CAAC,CAAC;AACvF,SAAO;AACT;AAKA,SAAS,kBAAkB,QAAsC,SAA4B;AAC3F,QAAM,aAAwB,CAAC;AAC/B,MAAI,OAAO,WAAW,UAAU;AAE9B,QAAI,OAAO,KAAK,GAAG;AACjB,iBAAW,KAAK,EAAE,MAAM,aAAa,MAAM,OAAO,KAAK,GAAG,YAAY,QAAQ,CAAC;AAAA,IACjF;AAAA,EACF,WAAW,UAAU,OAAO,WAAW,YAAY,cAAc,QAAQ;AAEvE,UAAM,aAAa;AACnB,QAAI,WAAW,SAAS,KAAK,GAAG;AAC9B,YAAM,YAAY,iBAAiB,WAAW,UAAU,OAAO;AAC/D,iBAAW,KAAK,UAAW,YAAW,KAAK,CAAC;AAAA,IAC9C;AAAA,EACF;AACA,SAAO;AACT;AAGA,SAAS,gBAAgB,GAAmC;AAC1D,SAAO,CAAC,CAAC,KAAK,OAAO,MAAM,YAAY,aAAa,KAAM,EAAuB,YAAY;AAC/F;AAcA,eAAsB,SACpB,KACA,UACA,YACA,oBACA,UACA,cAAsB,GACtB,YACoB;AACpB,QAAM,SAAoB,CAAC;AAG3B,MAAI,gBAAgB,QAAQ,GAAG;AAC7B,WAAO,cAAc,KAAK,UAAU,YAAY,oBAAoB,UAAU,aAAa,UAAU;AAAA,EACvG;AAGA,MAAI,eAAe,GAAG;AACpB,aAAS,IAAI,GAAG,KAAK,oBAAoB,KAAK;AAC5C,UAAI,cAAc,CAAC,WAAW,IAAI,CAAC,EAAG;AACtC,YAAM,OAAO,MAAM,IAAI,QAAQ,CAAC;AAChC,UAAI;AACF,cAAM,YAAY,MAAM,gBAAgB,IAAI;AAC5C,cAAM,SAAS,MAAM,SAAS,WAAW,GAAG,WAAW;AACvD,mBAAW,KAAK,kBAAkB,QAAQ,CAAC,EAAG,QAAO,KAAK,CAAC;AAAA,MAC7D,SAAS,KAAK;AAEZ,kBAAU,KAAK;AAAA,UACb,MAAM;AAAA,UACN,SAAS,sBAAO,CAAC,sBAAY,eAAe,QAAQ,IAAI,UAAU,yCAAW;AAAA,UAC7E,MAAM;AAAA,QACR,CAAC;AAAA,MACH;AAAA,IACF;AACA,WAAO;AAAA,EACT;AAIA,QAAM,cAAwB,CAAC;AAC/B,WAAS,IAAI,GAAG,KAAK,oBAAoB,KAAK;AAC5C,QAAI,cAAc,CAAC,WAAW,IAAI,CAAC,EAAG;AACtC,gBAAY,KAAK,CAAC;AAAA,EACpB;AAGA,QAAM,QAAQ,YAAY,IAAI,aAAW,YAAwE;AAC/G,QAAI;AACF,YAAM,OAAO,MAAM,IAAI,QAAQ,OAAO;AACtC,YAAM,YAAY,MAAM,gBAAgB,IAAI;AAC5C,YAAM,SAAS,MAAM,SAAS,WAAW,SAAS,WAAW;AAC7D,aAAO,EAAE,SAAS,YAAY,kBAAkB,QAAQ,OAAO,EAAE;AAAA,IACnE,SAAS,KAAK;AAEZ,gBAAU,KAAK;AAAA,QACb,MAAM;AAAA,QACN,SAAS,sBAAO,OAAO,sBAAY,eAAe,QAAQ,IAAI,UAAU,yCAAW;AAAA,QACnF,MAAM;AAAA,MACR,CAAC;AACD,aAAO;AAAA,IACT;AAAA,EACF,CAAC;AAGD,QAAM,cAAc,MAAM,mBAAmB,OAAO,WAAW;AAG/D,aAAW,QAAQ,aAAa;AAC9B,QAAI,CAAC,KAAM;AACX,eAAW,KAAK,KAAK,WAAY,QAAO,KAAK,CAAC;AAAA,EAChD;AAEA,SAAO;AACT;AAQA,eAAe,cACb,KACA,UACA,YACA,oBACA,UACA,cAAsB,GACtB,YACoB;AAEpB,QAAM,cAAwB,CAAC;AAC/B,WAAS,IAAI,GAAG,KAAK,oBAAoB,KAAK;AAC5C,QAAI,cAAc,CAAC,WAAW,IAAI,CAAC,EAAG;AACtC,gBAAY,KAAK,CAAC;AAAA,EACpB;AAGA,QAAM,cAA0B,CAAC;AACjC,WAAS,IAAI,GAAG,IAAI,YAAY,QAAQ,KAAK,SAAS,WAAW;AAC/D,gBAAY,KAAK,YAAY,MAAM,GAAG,IAAI,SAAS,SAAS,CAAC;AAAA,EAC/D;AAGA,MAAI,YAAY;AAGhB,QAAM,aAAa,YAAY,IAAI,CAAC,eAAe,aAAa,YAAkC;AAChG,UAAM,aAA0D,CAAC;AAGjE,UAAM,cAA2D,CAAC;AAClE,QAAI;AACF,iBAAW,WAAW,eAAe;AACnC,cAAM,OAAO,MAAM,IAAI,QAAQ,OAAO;AACtC,cAAM,QAAQ,MAAM,gBAAgB,IAAI;AACxC,oBAAY,KAAK,EAAE,OAAO,QAAQ,CAAC;AAAA,MACrC;AAAA,IACF,SAAS,WAAW;AAElB,YAAM,WAAW,IAAI,IAAI,YAAY,IAAI,OAAK,EAAE,OAAO,CAAC;AACxD,iBAAW,WAAW,eAAe;AACnC,YAAI,CAAC,SAAS,IAAI,OAAO,EAAG,YAAW,KAAK,EAAE,SAAS,QAAQ,CAAC,EAAE,CAAC;AAAA,MACrE;AAAA,IACF;AAEA,QAAI,YAAY,SAAS,GAAG;AAC1B,UAAI;AAEF,cAAM,UAAU,MAAM,SAAS,aAAa,WAAW;AACvD,mBAAW,EAAE,QAAQ,KAAK,aAAa;AACrC,gBAAM,SAAS,QAAQ,IAAI,OAAO;AAClC,qBAAW,KAAK;AAAA,YACd;AAAA,YACA,QAAQ,SAAS,kBAAkB,QAAQ,OAAO,IAAI,CAAC;AAAA,UACzD,CAAC;AAAA,QACH;AAAA,MACF,SAAS,KAAK;AACZ,cAAM,QAAQ,GAAG,cAAc,CAAC,CAAC,IAAI,cAAc,cAAc,SAAS,CAAC,CAAC;AAC5E,kBAAU,KAAK;AAAA,UACb,SAAS,qDAAkB,KAAK,MAAM,eAAe,QAAQ,IAAI,UAAU,yCAAW;AAAA,UACtF,MAAM;AAAA,QACR,CAAC;AAED,mBAAW,EAAE,OAAO,QAAQ,KAAK,aAAa;AAC5C,cAAI;AACF,kBAAM,eAAe,MAAM,SAAS,aAAa,CAAC,EAAE,OAAO,QAAQ,CAAC,CAAC;AACrE,kBAAM,IAAI,aAAa,IAAI,OAAO;AAClC,uBAAW,KAAK,EAAE,SAAS,QAAQ,IAAI,kBAAkB,GAAG,OAAO,IAAI,CAAC,EAAE,CAAC;AAAA,UAC7E,QAAQ;AACN,uBAAW,KAAK,EAAE,SAAS,QAAQ,CAAC,EAAE,CAAC;AAAA,UACzC;AAAA,QACF;AAAA,MACF;AAAA,IACF;AAEA,iBAAa,cAAc;AAC3B,iBAAa,WAAW,YAAY,MAAM;AAC1C,WAAO,EAAE,UAAU,WAAW;AAAA,EAChC,CAAC;AAGD,QAAM,uBAAuB,KAAK,IAAI,GAAG,WAAW;AACpD,QAAM,eAAe,MAAM,mBAAmB,YAAY,oBAAoB;AAG9E,QAAM,SAAoB,CAAC;AAC3B,aAAW,UAAU,cAAc;AACjC,eAAW,EAAE,QAAQ,SAAS,KAAK,OAAO,YAAY;AACpD,iBAAW,KAAK,SAAU,QAAO,KAAK,CAAC;AAAA,IACzC;AAAA,EACF;AAEA,SAAO;AACT;AAWA,eAAe,gBAAgB,MAAyC;AACtE,QAAM,EAAE,aAAa,IAAI,MAAM,OAAO,iBAAiB;AAEvD,QAAM,QAAQ;AACd,QAAM,WAAW,KAAK,YAAY,EAAE,MAAM,CAAC;AAC3C,QAAM,SAAS,aAAa,KAAK,MAAM,SAAS,KAAK,GAAG,KAAK,MAAM,SAAS,MAAM,CAAC;AACnF,QAAM,MAAM,OAAO,WAAW,IAAI;AAElC,QAAM,KAAK,OAAO,EAAE,eAAe,KAAgB,SAAS,CAAC,EAAE;AAC/D,SAAO,IAAI,WAAW,OAAO,SAAS,WAAW,CAAC;AACpD;","names":[]}
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/ocr/resolve.ts"],"sourcesContent":["/**\n * OCR 프로바이더 팩토리\n *\n * ocrMode에 따라 적절한 OcrProvider를 생성하여 반환.\n * - \"auto\": 설치된 CLI 자동 탐색 (gemini → claude → codex → ollama → tesseract)\n * tesseract.js는 bundled 의존성이므로 항상 사용 가능 (null 반환 없음)\n * - 특정 CLI: 해당 CLI 사용 (미설치 시 에러)\n * - \"tesseract\": 내장 tesseract.js 직접 사용\n * - \"off\": 에러 throw\n */\n\nimport type { OcrMode, OcrProvider, ParseWarning, BatchOcrProvider } from \"../types.js\"\nimport { detectAvailableOcr, validateOcrMode, getTesseractFallbackMessage } from \"./auto-detect.js\"\nimport { createCliOcrProvider } from \"./cli-provider.js\"\n\n/**\n * ocrMode에 따라 OcrProvider를 생성.\n *\n * @param mode - OCR 모드\n * @param warnings - 경고 수집 배열 (fallback 발생 시 경고 추가)\n * @param concurrency - 병렬 처리 수 (tesseract 전용, 기본: 1=순차)\n * @returns OcrProvider 함수\n * @throws mode=\"off\"이거나 지정 CLI 미설치 시 Error\n */\nexport async function resolveOcrProvider(\n mode: OcrMode,\n warnings?: ParseWarning[],\n concurrency?: number,\n batchSize?: number\n): Promise<OcrProvider | BatchOcrProvider> {\n if (mode === \"off\") {\n throw new Error(\"OCR이 비활성화되어 있습니다 (--ocr off).\")\n }\n\n // ── 수동 지정 모드 ──────────────────────────────────\n if (mode !== \"auto\") {\n validateOcrMode(mode) // tesseract는 항상 통과\n\n if (mode === \"tesseract\") {\n const { createTesseractProvider, createTesseractPoolProvider } = await import(\"./tesseract-provider.js\")\n // concurrency > 1이면 워커 풀 사용, 그 외 단일 워커 사용\n if (concurrency && concurrency > 1) {\n return createTesseractPoolProvider(concurrency)\n }\n return createTesseractProvider()\n }\n\n // gemini/claude/codex: 배치 크기 > 1이면 배치 프로바이더 사용\n if (mode === \"gemini\" || mode === \"claude\" || mode === \"codex\") {\n const { createBatchCliProvider, DEFAULT_BATCH_SIZES } = await import(\"./batch-provider.js\")\n const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES[mode]\n if (effectiveBatch > 1) {\n return createBatchCliProvider(mode, effectiveBatch)\n }\n return createCliOcrProvider(mode)\n }\n\n // CLI 프로바이더는 rate limit 보호를 위해 concurrency 무시 (항상 순차)\n return createCliOcrProvider(mode)\n }\n\n // ── 자동 탐색 모드 ───────────────────────────────────\n // detectAvailableOcr()는 항상 값을 반환 (tesseract fallback으로 null 없음)\n const detected = detectAvailableOcr()\n\n // codex가 아닌 경우 fallback 경고\n if (detected !== \"codex\") {\n if (detected === \"tesseract\") {\n // 내장 tesseract로 fallback — 구조 복원 제한 안내\n warnings?.push({\n message: getTesseractFallbackMessage(),\n code: \"OCR_CLI_FALLBACK\",\n })\n } else {\n warnings?.push({\n message: `OCR: '${detected}' 사용 중 (codex CLI가 없어 fallback). 더 나은 품질을 위해 codex CLI 설치를 권장합니다.`,\n code: \"OCR_CLI_FALLBACK\",\n })\n }\n }\n\n if (detected === \"tesseract\") {\n const { createTesseractProvider, createTesseractPoolProvider } = await import(\"./tesseract-provider.js\")\n // concurrency > 1이면 워커 풀 사용, 그 외 단일 워커 사용\n if (concurrency && concurrency > 1) {\n return createTesseractPoolProvider(concurrency)\n }\n return createTesseractProvider()\n }\n\n // gemini/claude/codex: 배치 크기 > 1이면 배치 프로바이더 사용\n if (detected === \"gemini\" || detected === \"codex\" || detected === \"claude\") {\n const { createBatchCliProvider, DEFAULT_BATCH_SIZES } = await import(\"./batch-provider.js\")\n const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES[detected]\n if (effectiveBatch > 1) {\n return createBatchCliProvider(detected, effectiveBatch)\n }\n return createCliOcrProvider(detected)\n }\n\n // CLI 프로바이더는 rate limit 보호를 위해 concurrency 무시 (항상 순차)\n return createCliOcrProvider(detected)\n}\n"],"mappings":";;;;;;;;;;;;AAwBA,eAAsB,mBACpB,MACA,UACA,aACA,WACyC;AACzC,MAAI,SAAS,OAAO;AAClB,UAAM,IAAI,MAAM,sFAA+B;AAAA,EACjD;AAGA,MAAI,SAAS,QAAQ;AACnB,oBAAgB,IAAI;AAEpB,QAAI,SAAS,aAAa;AACxB,YAAM,EAAE,yBAAyB,4BAA4B,IAAI,MAAM,OAAO,kCAAyB;AAEvG,UAAI,eAAe,cAAc,GAAG;AAClC,eAAO,4BAA4B,WAAW;AAAA,MAChD;AACA,aAAO,wBAAwB;AAAA,IACjC;AAGA,QAAI,SAAS,YAAY,SAAS,YAAY,SAAS,SAAS;AAC9D,YAAM,EAAE,wBAAwB,oBAAoB,IAAI,MAAM,OAAO,8BAAqB;AAC1F,YAAM,iBAAiB,aAAa,oBAAoB,IAAI;AAC5D,UAAI,iBAAiB,GAAG;AACtB,eAAO,uBAAuB,MAAM,cAAc;AAAA,MACpD;AACA,aAAO,qBAAqB,IAAI;AAAA,IAClC;AAGA,WAAO,qBAAqB,IAAI;AAAA,EAClC;AAIA,QAAM,WAAW,mBAAmB;AAGpC,MAAI,aAAa,SAAS;AACxB,QAAI,aAAa,aAAa;AAE5B,gBAAU,KAAK;AAAA,QACb,SAAS,4BAA4B;AAAA,QACrC,MAAM;AAAA,MACR,CAAC;AAAA,IACH,OAAO;AACL,gBAAU,KAAK;AAAA,QACb,SAAS,SAAS,QAAQ;AAAA,QAC1B,MAAM;AAAA,MACR,CAAC;AAAA,IACH;AAAA,EACF;AAEA,MAAI,aAAa,aAAa;AAC5B,UAAM,EAAE,yBAAyB,4BAA4B,IAAI,MAAM,OAAO,kCAAyB;AAEvG,QAAI,eAAe,cAAc,GAAG;AAClC,aAAO,4BAA4B,WAAW;AAAA,IAChD;AACA,WAAO,wBAAwB;AAAA,EACjC;AAGA,MAAI,aAAa,YAAY,aAAa,WAAW,aAAa,UAAU;AAC1E,UAAM,EAAE,wBAAwB,oBAAoB,IAAI,MAAM,OAAO,8BAAqB;AAC1F,UAAM,iBAAiB,aAAa,oBAAoB,QAAQ;AAChE,QAAI,iBAAiB,GAAG;AACtB,aAAO,uBAAuB,UAAU,cAAc;AAAA,IACxD;AACA,WAAO,qBAAqB,QAAQ;AAAA,EACtC;AAGA,SAAO,qBAAqB,QAAQ;AACtC;","names":[]}
|
|
File without changes
|