@clazic/kordoc 2.1.5 → 2.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +67 -4
- package/dist/chunk-JJMA5HGQ.js +9617 -0
- package/dist/chunk-JJMA5HGQ.js.map +1 -0
- package/dist/{chunk-LP7HUOZB.js → chunk-XWET7ONC.js} +2 -2
- package/dist/chunk-ZWE3DS7E.js +39 -0
- package/dist/cli.js +114 -11
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +3134 -149
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +36 -4
- package/dist/index.d.ts +36 -4
- package/dist/index.js +3158 -148
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +100 -7
- package/dist/mcp.js.map +1 -1
- package/dist/{page-range-737B4EZW.js → page-range-ALIRXAL5.js} +2 -1
- package/dist/provider-XVKP5OGI.js +167 -0
- package/dist/provider-XVKP5OGI.js.map +1 -0
- package/dist/resolve-Y3KMGD3R.js +187 -0
- package/dist/resolve-Y3KMGD3R.js.map +1 -0
- package/dist/tesseract-provider-MZ37ZKQW.js +31 -0
- package/dist/tesseract-provider-MZ37ZKQW.js.map +1 -0
- package/dist/{utils-EHWBYPP7.js → utils-4NP2VUFW.js} +3 -2
- package/dist/utils-4NP2VUFW.js.map +1 -0
- package/dist/{watch-TNLNQF2I.js → watch-4VVWG2WC.js} +4 -3
- package/dist/{watch-TNLNQF2I.js.map → watch-4VVWG2WC.js.map} +1 -1
- package/package.json +4 -2
- package/dist/chunk-A7G6BYLH.js +0 -5494
- package/dist/chunk-A7G6BYLH.js.map +0 -1
- package/dist/provider-A4FHJSID.js +0 -38
- package/dist/provider-A4FHJSID.js.map +0 -1
- /package/dist/{chunk-LP7HUOZB.js.map → chunk-XWET7ONC.js.map} +0 -0
- /package/dist/{page-range-737B4EZW.js.map → chunk-ZWE3DS7E.js.map} +0 -0
- /package/dist/{utils-EHWBYPP7.js.map → page-range-ALIRXAL5.js.map} +0 -0
|
@@ -1,38 +0,0 @@
|
|
|
1
|
-
#!/usr/bin/env node
|
|
2
|
-
|
|
3
|
-
// src/ocr/provider.ts
|
|
4
|
-
/**
 * Run OCR over the pages of a PDF document and collect the recognised text.
 *
 * Pages 1..effectivePageCount are visited in order. Each selected page is
 * rendered to a PNG via `renderPageToPng` and handed to the caller-supplied
 * `provider`; non-blank results become one paragraph block per page.
 *
 * @param {{ getPage(n: number): Promise<object> }} doc
 *   Document whose `getPage` resolves to a page exposing `getViewport` and
 *   `render` (and, for pdf.js page proxies, `cleanup`).
 * @param {(image: Uint8Array, pageNumber: number, mimeType: string) => Promise<string> | string} provider
 *   OCR callback; receives the PNG bytes, the 1-based page number and the
 *   MIME type `"image/png"`, and returns the recognised text.
 * @param {{ has(n: number): boolean } | null | undefined} pageFilter
 *   When truthy, only page numbers for which `has` returns true are processed.
 * @param {number} effectivePageCount
 *   Highest 1-based page number to visit.
 * @returns {Promise<Array<{ type: "paragraph", text: string, pageNumber: number }>>}
 *   Blocks in page order; pages that failed or produced only whitespace are
 *   omitted.
 */
async function ocrPages(doc, provider, pageFilter, effectivePageCount) {
  const blocks = [];
  for (let pageNumber = 1; pageNumber <= effectivePageCount; pageNumber++) {
    if (pageFilter && !pageFilter.has(pageNumber)) continue;

    // Loading the page stays outside the try block: a document that cannot
    // hand out its pages is reported to the caller instead of being skipped.
    const page = await doc.getPage(pageNumber);
    try {
      const imageData = await renderPageToPng(page);
      const text = (await provider(imageData, pageNumber, "image/png")).trim();
      if (text) {
        blocks.push({ type: "paragraph", text, pageNumber });
      }
    } catch {
      // Best-effort by design: a page whose rendering or OCR fails is skipped
      // so that one bad page does not discard the text of all the others.
    } finally {
      // Release the per-page resources before moving on so that memory does
      // not grow with the page count. Guarded because the page is duck-typed
      // and only pdf.js page proxies are known to provide cleanup().
      try {
        if (typeof page?.cleanup === "function") page.cleanup();
      } catch {
        // A failed cleanup must not turn a successfully OCR'd page into an error.
      }
    }
  }
  return blocks;
}
|
|
20
|
-
async function renderPageToPng(page) {
|
|
21
|
-
let createCanvas;
|
|
22
|
-
try {
|
|
23
|
-
const canvasModule = await import("canvas");
|
|
24
|
-
createCanvas = canvasModule.createCanvas;
|
|
25
|
-
} catch {
|
|
26
|
-
throw new Error("OCR\uC744 \uC0AC\uC6A9\uD558\uB824\uBA74 'canvas' \uD328\uD0A4\uC9C0\uB97C \uC124\uCE58\uD558\uC138\uC694: npm install canvas");
|
|
27
|
-
}
|
|
28
|
-
const scale = 2;
|
|
29
|
-
const viewport = page.getViewport({ scale });
|
|
30
|
-
const canvas = createCanvas(Math.floor(viewport.width), Math.floor(viewport.height));
|
|
31
|
-
const ctx = canvas.getContext("2d");
|
|
32
|
-
await page.render({ canvasContext: ctx, viewport }).promise;
|
|
33
|
-
return new Uint8Array(canvas.toBuffer("image/png"));
|
|
34
|
-
}
|
|
35
|
-
export {
|
|
36
|
-
ocrPages
|
|
37
|
-
};
|
|
38
|
-
//# sourceMappingURL=provider-A4FHJSID.js.map
|
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"sources":["../src/ocr/provider.ts"],"sourcesContent":["/**\n * OCR 프로바이더 브릿지 — PDF 페이지를 이미지로 렌더링하여 OCR 호출\n *\n * kordoc은 OCR 라이브러리를 번들하지 않음.\n * 사용자가 OcrProvider 함수를 제공하면 이미지 기반 PDF도 텍스트 추출 가능.\n *\n * @example\n * ```ts\n * import { parse } from \"kordoc\"\n *\n * const result = await parse(buffer, {\n * ocr: async (pageImage, pageNumber, mimeType) => {\n * // Tesseract, Claude Vision, Google Vision 등 사용\n * return await myOcrService.recognize(pageImage)\n * }\n * })\n * ```\n */\n\nimport type { OcrProvider, IRBlock } from \"../types.js\"\n\n/**\n * 이미지 기반 PDF 페이지에 OCR을 적용하여 IRBlock[] 반환.\n *\n * pdfjs page 객체에서 viewport + render를 통해 PNG 생성 후\n * 사용자 제공 OcrProvider 호출.\n *\n * canvas 미설치 시 pdfjs render 불가하므로 에러 반환.\n */\nexport async function ocrPages(\n doc: { numPages: number; getPage(n: number): Promise<PdfPageProxy> },\n provider: OcrProvider,\n pageFilter: Set<number> | null,\n effectivePageCount: number\n): Promise<IRBlock[]> {\n const blocks: IRBlock[] = []\n\n for (let i = 1; i <= effectivePageCount; i++) {\n if (pageFilter && !pageFilter.has(i)) continue\n const page = await doc.getPage(i)\n try {\n const imageData = await renderPageToPng(page)\n const text = await provider(imageData, i, \"image/png\")\n if (text.trim()) {\n blocks.push({ type: \"paragraph\", text: text.trim(), pageNumber: i })\n }\n } catch {\n // OCR 실패한 페이지는 건너뜀\n }\n }\n\n return blocks\n}\n\ninterface PdfPageProxy {\n getViewport(params: { scale: number }): { width: number; height: number }\n render(params: { canvasContext: unknown; viewport: unknown }): { promise: Promise<void> }\n}\n\n/**\n * PDF 페이지를 PNG로 렌더링.\n * node-canvas가 설치되어 있어야 동작.\n * 미설치 시 에러 throw → 호출측에서 catch.\n */\nasync function renderPageToPng(page: PdfPageProxy): Promise<Uint8Array> {\n // node-canvas 동적 로드 (선택적 의존성)\n let createCanvas: (w: number, h: number) => { getContext(t: string): unknown; toBuffer(t: string): Buffer }\n try {\n const canvasModule = await import(\"canvas\")\n createCanvas = 
canvasModule.createCanvas\n } catch {\n throw new Error(\"OCR을 사용하려면 'canvas' 패키지를 설치하세요: npm install canvas\")\n }\n\n const scale = 2.0 // 300 DPI 근사\n const viewport = page.getViewport({ scale })\n const canvas = createCanvas(Math.floor(viewport.width), Math.floor(viewport.height))\n const ctx = canvas.getContext(\"2d\")\n\n await page.render({ canvasContext: ctx, viewport }).promise\n return new Uint8Array(canvas.toBuffer(\"image/png\"))\n}\n"],"mappings":";;;AA6BA,eAAsB,SACpB,KACA,UACA,YACA,oBACoB;AACpB,QAAM,SAAoB,CAAC;AAE3B,WAAS,IAAI,GAAG,KAAK,oBAAoB,KAAK;AAC5C,QAAI,cAAc,CAAC,WAAW,IAAI,CAAC,EAAG;AACtC,UAAM,OAAO,MAAM,IAAI,QAAQ,CAAC;AAChC,QAAI;AACF,YAAM,YAAY,MAAM,gBAAgB,IAAI;AAC5C,YAAM,OAAO,MAAM,SAAS,WAAW,GAAG,WAAW;AACrD,UAAI,KAAK,KAAK,GAAG;AACf,eAAO,KAAK,EAAE,MAAM,aAAa,MAAM,KAAK,KAAK,GAAG,YAAY,EAAE,CAAC;AAAA,MACrE;AAAA,IACF,QAAQ;AAAA,IAER;AAAA,EACF;AAEA,SAAO;AACT;AAYA,eAAe,gBAAgB,MAAyC;AAEtE,MAAI;AACJ,MAAI;AACF,UAAM,eAAe,MAAM,OAAO,QAAQ;AAC1C,mBAAe,aAAa;AAAA,EAC9B,QAAQ;AACN,UAAM,IAAI,MAAM,+HAAoD;AAAA,EACtE;AAEA,QAAM,QAAQ;AACd,QAAM,WAAW,KAAK,YAAY,EAAE,MAAM,CAAC;AAC3C,QAAM,SAAS,aAAa,KAAK,MAAM,SAAS,KAAK,GAAG,KAAK,MAAM,SAAS,MAAM,CAAC;AACnF,QAAM,MAAM,OAAO,WAAW,IAAI;AAElC,QAAM,KAAK,OAAO,EAAE,eAAe,KAAK,SAAS,CAAC,EAAE;AACpD,SAAO,IAAI,WAAW,OAAO,SAAS,WAAW,CAAC;AACpD;","names":[]}
|
|
File without changes
|
|
File without changes
|
|
File without changes
|