@clazic/kordoc 2.1.6 → 2.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (34)
  1. package/README.md +67 -4
  2. package/dist/chunk-3NF22UFF.js +9617 -0
  3. package/dist/chunk-3NF22UFF.js.map +1 -0
  4. package/dist/{chunk-TFYOEQE2.js → chunk-7MXQWWUW.js} +2 -2
  5. package/dist/chunk-ZWE3DS7E.js +39 -0
  6. package/dist/cli.js +114 -11
  7. package/dist/cli.js.map +1 -1
  8. package/dist/index.cjs +2985 -179
  9. package/dist/index.cjs.map +1 -1
  10. package/dist/index.d.cts +36 -4
  11. package/dist/index.d.ts +36 -4
  12. package/dist/index.js +3009 -178
  13. package/dist/index.js.map +1 -1
  14. package/dist/mcp.js +100 -7
  15. package/dist/mcp.js.map +1 -1
  16. package/dist/{page-range-737B4EZW.js → page-range-ALIRXAL5.js} +2 -1
  17. package/dist/provider-XVKP5OGI.js +167 -0
  18. package/dist/provider-XVKP5OGI.js.map +1 -0
  19. package/dist/resolve-Z4DEPDUS.js +179 -0
  20. package/dist/resolve-Z4DEPDUS.js.map +1 -0
  21. package/dist/tesseract-provider-UNJOI25M.js +24 -0
  22. package/dist/tesseract-provider-UNJOI25M.js.map +1 -0
  23. package/dist/{utils-7JE5SKSL.js → utils-I4UIMOH7.js} +3 -2
  24. package/dist/utils-I4UIMOH7.js.map +1 -0
  25. package/dist/{watch-XALC6VOR.js → watch-XPLMUIZB.js} +4 -3
  26. package/dist/{watch-XALC6VOR.js.map → watch-XPLMUIZB.js.map} +1 -1
  27. package/package.json +5 -2
  28. package/dist/chunk-H7HMKSLX.js +0 -5494
  29. package/dist/chunk-H7HMKSLX.js.map +0 -1
  30. package/dist/provider-A4FHJSID.js +0 -38
  31. package/dist/provider-A4FHJSID.js.map +0 -1
  32. /package/dist/{chunk-TFYOEQE2.js.map → chunk-7MXQWWUW.js.map} +0 -0
  33. /package/dist/{page-range-737B4EZW.js.map → chunk-ZWE3DS7E.js.map} +0 -0
  34. /package/dist/{utils-7JE5SKSL.js.map → page-range-ALIRXAL5.js.map} +0 -0
package/dist/index.d.cts CHANGED
@@ -102,6 +102,14 @@ interface ParseOptions {
102
102
  pages?: number[] | string;
103
103
  /** 이미지 기반 PDF용 OCR 프로바이더 (선택) */
104
104
  ocr?: OcrProvider;
105
+ /**
106
+ * OCR 모드 (CLI 자동 탐색용).
107
+ * - "auto": 설치된 CLI 자동 탐색 (gemini→claude→codex→ollama→tesseract)
108
+ * - "gemini"|"claude"|"codex"|"ollama"|"tesseract": 특정 도구 강제 지정
109
+ * - "off": OCR 비활성화 (이미지 기반 PDF면 에러)
110
+ * - undefined: 라이브러리 API 기존 동작 유지 (자동 탐색 안 함)
111
+ */
112
+ ocrMode?: OcrMode;
105
113
  /** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
106
114
  onProgress?: (current: number, total: number) => void;
107
115
  /** PDF 머리글/바닥글 자동 제거 */
@@ -116,7 +124,7 @@ interface ParseWarning {
116
124
  /** 구조화된 경고 코드 */
117
125
  code: WarningCode;
118
126
  }
119
- type WarningCode = "SKIPPED_IMAGE" | "SKIPPED_OLE" | "TRUNCATED_TABLE" | "OCR_FALLBACK" | "UNSUPPORTED_ELEMENT" | "BROKEN_ZIP_RECOVERY" | "HIDDEN_TEXT_FILTERED" | "MALFORMED_XML" | "PARTIAL_PARSE" | "LENIENT_CFB_RECOVERY";
127
+ type WarningCode = "SKIPPED_IMAGE" | "SKIPPED_OLE" | "TRUNCATED_TABLE" | "OCR_FALLBACK" | "UNSUPPORTED_ELEMENT" | "BROKEN_ZIP_RECOVERY" | "HIDDEN_TEXT_FILTERED" | "MALFORMED_XML" | "PARTIAL_PARSE" | "LENIENT_CFB_RECOVERY" | "OCR_PAGE_FAILED" | "OCR_CLI_FALLBACK";
120
128
  /** 문서 구조 (헤딩 트리) */
121
129
  interface OutlineItem {
122
130
  level: number;
@@ -204,8 +212,15 @@ interface FormResult {
204
212
  /** 양식 확신도 (0-1) */
205
213
  confidence: number;
206
214
  }
207
- /** 사용자 제공 OCR 함수 페이지 이미지를 받아 텍스트 반환 */
208
- type OcrProvider = (pageImage: Uint8Array, pageNumber: number, mimeType: "image/png") => Promise<string>;
215
+ /** Vision LLM이 반환하는 구조화된 OCR 결과 */
216
+ interface StructuredOcrResult {
217
+ /** 구조화된 Markdown (테이블/헤딩/리스트 포함) */
218
+ markdown: string;
219
+ }
220
+ /** OCR 모드 — CLI --ocr 옵션 허용값 */
221
+ type OcrMode = "auto" | "gemini" | "claude" | "codex" | "ollama" | "tesseract" | "off";
222
+ /** 사용자 제공 OCR 함수 — 페이지 이미지를 받아 텍스트 또는 구조화된 결과 반환 */
223
+ type OcrProvider = (pageImage: Uint8Array, pageNumber: number, mimeType: "image/png") => Promise<string | StructuredOcrResult>;
209
224
  interface WatchOptions {
210
225
  dir: string;
211
226
  outDir?: string;
@@ -252,6 +267,23 @@ interface MarkdownToHwpxOptions {
252
267
  */
253
268
  declare function markdownToHwpx(markdown: string, options?: MarkdownToHwpxOptions | ArrayBuffer): Promise<ArrayBuffer>;
254
269
 
270
+ /**
271
+ * Markdown → XLSX 변환기
272
+ *
273
+ * 지원: 헤딩, 단락, 코드, blockquote, hr, 테이블(별도 시트), 이미지
274
+ */
275
+
276
+ interface MarkdownToXlsxOptions {
277
+ warnings?: string[];
278
+ images?: ExtractedImage[];
279
+ }
280
+ /**
281
+ * 마크다운 텍스트를 XLSX (ArrayBuffer)로 변환.
282
+ * @param markdown 마크다운 텍스트
283
+ * @param options 경고 수집, 이미지 데이터 등
284
+ */
285
+ declare function markdownToXlsx(markdown: string, options?: MarkdownToXlsxOptions): Promise<ArrayBuffer>;
286
+
255
287
  /** 매직 바이트 기반 파일 포맷 감지 */
256
288
 
257
289
  /** ZIP 파일 여부: PK\x03\x04 */
@@ -307,4 +339,4 @@ declare function parseXlsx(buffer: ArrayBuffer, options?: ParseOptions): Promise
307
339
  /** DOCX 파일을 Markdown으로 변환 */
308
340
  declare function parseDocx(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
309
341
 
310
- export { type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx };
342
+ export { type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type MarkdownToXlsxOptions, type OcrMode, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, type StructuredOcrResult, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, markdownToXlsx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx };
package/dist/index.d.ts CHANGED
@@ -102,6 +102,14 @@ interface ParseOptions {
102
102
  pages?: number[] | string;
103
103
  /** 이미지 기반 PDF용 OCR 프로바이더 (선택) */
104
104
  ocr?: OcrProvider;
105
+ /**
106
+ * OCR 모드 (CLI 자동 탐색용).
107
+ * - "auto": 설치된 CLI 자동 탐색 (gemini→claude→codex→ollama→tesseract)
108
+ * - "gemini"|"claude"|"codex"|"ollama"|"tesseract": 특정 도구 강제 지정
109
+ * - "off": OCR 비활성화 (이미지 기반 PDF면 에러)
110
+ * - undefined: 라이브러리 API 기존 동작 유지 (자동 탐색 안 함)
111
+ */
112
+ ocrMode?: OcrMode;
105
113
  /** 진행률 콜백 — current: 현재 페이지/섹션, total: 전체 수 */
106
114
  onProgress?: (current: number, total: number) => void;
107
115
  /** PDF 머리글/바닥글 자동 제거 */
@@ -116,7 +124,7 @@ interface ParseWarning {
116
124
  /** 구조화된 경고 코드 */
117
125
  code: WarningCode;
118
126
  }
119
- type WarningCode = "SKIPPED_IMAGE" | "SKIPPED_OLE" | "TRUNCATED_TABLE" | "OCR_FALLBACK" | "UNSUPPORTED_ELEMENT" | "BROKEN_ZIP_RECOVERY" | "HIDDEN_TEXT_FILTERED" | "MALFORMED_XML" | "PARTIAL_PARSE" | "LENIENT_CFB_RECOVERY";
127
+ type WarningCode = "SKIPPED_IMAGE" | "SKIPPED_OLE" | "TRUNCATED_TABLE" | "OCR_FALLBACK" | "UNSUPPORTED_ELEMENT" | "BROKEN_ZIP_RECOVERY" | "HIDDEN_TEXT_FILTERED" | "MALFORMED_XML" | "PARTIAL_PARSE" | "LENIENT_CFB_RECOVERY" | "OCR_PAGE_FAILED" | "OCR_CLI_FALLBACK";
120
128
  /** 문서 구조 (헤딩 트리) */
121
129
  interface OutlineItem {
122
130
  level: number;
@@ -204,8 +212,15 @@ interface FormResult {
204
212
  /** 양식 확신도 (0-1) */
205
213
  confidence: number;
206
214
  }
207
- /** 사용자 제공 OCR 함수 페이지 이미지를 받아 텍스트 반환 */
208
- type OcrProvider = (pageImage: Uint8Array, pageNumber: number, mimeType: "image/png") => Promise<string>;
215
+ /** Vision LLM이 반환하는 구조화된 OCR 결과 */
216
+ interface StructuredOcrResult {
217
+ /** 구조화된 Markdown (테이블/헤딩/리스트 포함) */
218
+ markdown: string;
219
+ }
220
+ /** OCR 모드 — CLI --ocr 옵션 허용값 */
221
+ type OcrMode = "auto" | "gemini" | "claude" | "codex" | "ollama" | "tesseract" | "off";
222
+ /** 사용자 제공 OCR 함수 — 페이지 이미지를 받아 텍스트 또는 구조화된 결과 반환 */
223
+ type OcrProvider = (pageImage: Uint8Array, pageNumber: number, mimeType: "image/png") => Promise<string | StructuredOcrResult>;
209
224
  interface WatchOptions {
210
225
  dir: string;
211
226
  outDir?: string;
@@ -252,6 +267,23 @@ interface MarkdownToHwpxOptions {
252
267
  */
253
268
  declare function markdownToHwpx(markdown: string, options?: MarkdownToHwpxOptions | ArrayBuffer): Promise<ArrayBuffer>;
254
269
 
270
+ /**
271
+ * Markdown → XLSX 변환기
272
+ *
273
+ * 지원: 헤딩, 단락, 코드, blockquote, hr, 테이블(별도 시트), 이미지
274
+ */
275
+
276
+ interface MarkdownToXlsxOptions {
277
+ warnings?: string[];
278
+ images?: ExtractedImage[];
279
+ }
280
+ /**
281
+ * 마크다운 텍스트를 XLSX (ArrayBuffer)로 변환.
282
+ * @param markdown 마크다운 텍스트
283
+ * @param options 경고 수집, 이미지 데이터 등
284
+ */
285
+ declare function markdownToXlsx(markdown: string, options?: MarkdownToXlsxOptions): Promise<ArrayBuffer>;
286
+
255
287
  /** 매직 바이트 기반 파일 포맷 감지 */
256
288
 
257
289
  /** ZIP 파일 여부: PK\x03\x04 */
@@ -307,4 +339,4 @@ declare function parseXlsx(buffer: ArrayBuffer, options?: ParseOptions): Promise
307
339
  /** DOCX 파일을 Markdown으로 변환 */
308
340
  declare function parseDocx(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
309
341
 
310
- export { type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx };
342
+ export { type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type MarkdownToXlsxOptions, type OcrMode, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, type StructuredOcrResult, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, markdownToXlsx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx };