@clazic/kordoc 2.3.1 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{batch-provider-PCT4I4LK.js → batch-provider-PNDCSGQW.js} +3 -4
- package/dist/batch-provider-PNDCSGQW.js.map +1 -0
- package/dist/{chunk-ZOEUKD77.js → chunk-2GFJFTKS.js} +193 -49
- package/dist/chunk-2GFJFTKS.js.map +1 -0
- package/dist/chunk-4PP34NVQ.js +121 -0
- package/dist/chunk-4PP34NVQ.js.map +1 -0
- package/dist/{tesseract-provider-WCVJWBUT.js → chunk-7FMKAV4P.js} +4 -4
- package/dist/{tesseract-provider-WCVJWBUT.js.map → chunk-7FMKAV4P.js.map} +1 -1
- package/dist/chunk-JOGAFNIL.js +153 -0
- package/dist/chunk-JOGAFNIL.js.map +1 -0
- package/dist/{chunk-W5KUC23B.js → chunk-STIKJGEA.js} +2 -2
- package/dist/cli.js +8 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +217 -70
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +11 -6
- package/dist/index.d.ts +11 -6
- package/dist/index.js +217 -70
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -2
- package/dist/mcp.js.map +1 -1
- package/dist/{provider-WYHC4NHI.js → provider-HE727F7Z.js} +19 -131
- package/dist/provider-HE727F7Z.js.map +1 -0
- package/dist/resolve-QA3VACUP.js +111 -0
- package/dist/resolve-QA3VACUP.js.map +1 -0
- package/dist/tesseract-provider-MNMZPSGF.js +11 -0
- package/dist/{utils-HSF5HI5T.js → utils-FFUQJTTI.js} +2 -2
- package/dist/utils-FFUQJTTI.js.map +1 -0
- package/dist/{watch-R2JHXDGF.js → watch-2O32L6IF.js} +6 -3
- package/dist/{watch-R2JHXDGF.js.map → watch-2O32L6IF.js.map} +1 -1
- package/package.json +7 -8
- package/dist/batch-provider-PCT4I4LK.js.map +0 -1
- package/dist/chunk-ZOEUKD77.js.map +0 -1
- package/dist/provider-WYHC4NHI.js.map +0 -1
- package/dist/resolve-4FSAQF2S.js +0 -247
- package/dist/resolve-4FSAQF2S.js.map +0 -1
- /package/dist/{chunk-W5KUC23B.js.map → chunk-STIKJGEA.js.map} +0 -0
- /package/dist/{utils-HSF5HI5T.js.map → tesseract-provider-MNMZPSGF.js.map} +0 -0
package/dist/index.d.cts
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import JSZip from 'jszip';
|
|
2
|
+
|
|
1
3
|
/** kordoc 공통 타입 정의 */
|
|
2
4
|
interface CellContext {
|
|
3
5
|
text: string;
|
|
@@ -141,8 +143,8 @@ interface OutlineItem {
|
|
|
141
143
|
pageNumber?: number;
|
|
142
144
|
}
|
|
143
145
|
/** 구조화된 에러 코드 — 프로그래밍적 에러 핸들링용 */
|
|
144
|
-
type ErrorCode = "EMPTY_INPUT" | "UNSUPPORTED_FORMAT" | "ENCRYPTED" | "DRM_PROTECTED" | "CORRUPTED" | "DECOMPRESSION_BOMB" | "ZIP_BOMB" | "IMAGE_BASED_PDF" | "NO_SECTIONS" | "PARSE_ERROR";
|
|
145
|
-
type FileType = "hwpx" | "hwp" | "pdf" | "xlsx" | "docx" | "unknown";
|
|
146
|
+
type ErrorCode = "EMPTY_INPUT" | "UNSUPPORTED_FORMAT" | "ENCRYPTED" | "DRM_PROTECTED" | "CORRUPTED" | "DECOMPRESSION_BOMB" | "ZIP_BOMB" | "IMAGE_BASED_PDF" | "NO_SECTIONS" | "PARSE_ERROR" | "FILE_TOO_LARGE";
|
|
147
|
+
type FileType = "hwpx" | "hwp" | "pdf" | "xlsx" | "docx" | "image" | "unknown";
|
|
146
148
|
interface ParseResultBase {
|
|
147
149
|
fileType: FileType;
|
|
148
150
|
/** 페이지/섹션 수 — PDF: 실제 페이지 수, HWP/HWPX: 섹션 수, XLSX: 시트 수 */
|
|
@@ -309,7 +311,10 @@ declare function detectFormat(buffer: ArrayBuffer): FileType;
|
|
|
309
311
|
* ZIP 내부 구조 기반 포맷 세분화.
|
|
310
312
|
* HWPX, XLSX, DOCX 모두 ZIP이므로 내부 파일로 구분.
|
|
311
313
|
*/
|
|
312
|
-
declare function detectZipFormat(buffer: ArrayBuffer): Promise<
|
|
314
|
+
declare function detectZipFormat(buffer: ArrayBuffer): Promise<{
|
|
315
|
+
format: "hwpx" | "xlsx" | "docx" | "unknown";
|
|
316
|
+
zip: JSZip | null;
|
|
317
|
+
}>;
|
|
313
318
|
|
|
314
319
|
/** 2-pass colSpan/rowSpan 테이블 빌더 및 Markdown 변환 */
|
|
315
320
|
|
|
@@ -338,14 +343,14 @@ declare const VERSION: string;
|
|
|
338
343
|
*/
|
|
339
344
|
declare function parse(input: string | ArrayBuffer | Buffer, options?: ParseOptions): Promise<ParseResult>;
|
|
340
345
|
/** HWPX 파일을 Markdown으로 변환 */
|
|
341
|
-
declare function parseHwpx(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
|
|
346
|
+
declare function parseHwpx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JSZip): Promise<ParseResult>;
|
|
342
347
|
/** HWP 5.x 바이너리 파일을 Markdown으로 변환 */
|
|
343
348
|
declare function parseHwp(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
|
|
344
349
|
/** PDF 파일에서 텍스트를 추출하여 Markdown으로 변환 */
|
|
345
350
|
declare function parsePdf(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
|
|
346
351
|
/** XLSX 파일을 Markdown으로 변환 */
|
|
347
|
-
declare function parseXlsx(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
|
|
352
|
+
declare function parseXlsx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JSZip): Promise<ParseResult>;
|
|
348
353
|
/** DOCX 파일을 Markdown으로 변환 */
|
|
349
|
-
declare function parseDocx(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
|
|
354
|
+
declare function parseDocx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JSZip): Promise<ParseResult>;
|
|
350
355
|
|
|
351
356
|
export { type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type MarkdownToXlsxOptions, type OcrMode, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, type StructuredOcrResult, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, markdownToXlsx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx };
|
package/dist/index.d.ts
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
import JSZip from 'jszip';
|
|
2
|
+
|
|
1
3
|
/** kordoc 공통 타입 정의 */
|
|
2
4
|
interface CellContext {
|
|
3
5
|
text: string;
|
|
@@ -141,8 +143,8 @@ interface OutlineItem {
|
|
|
141
143
|
pageNumber?: number;
|
|
142
144
|
}
|
|
143
145
|
/** 구조화된 에러 코드 — 프로그래밍적 에러 핸들링용 */
|
|
144
|
-
type ErrorCode = "EMPTY_INPUT" | "UNSUPPORTED_FORMAT" | "ENCRYPTED" | "DRM_PROTECTED" | "CORRUPTED" | "DECOMPRESSION_BOMB" | "ZIP_BOMB" | "IMAGE_BASED_PDF" | "NO_SECTIONS" | "PARSE_ERROR";
|
|
145
|
-
type FileType = "hwpx" | "hwp" | "pdf" | "xlsx" | "docx" | "unknown";
|
|
146
|
+
type ErrorCode = "EMPTY_INPUT" | "UNSUPPORTED_FORMAT" | "ENCRYPTED" | "DRM_PROTECTED" | "CORRUPTED" | "DECOMPRESSION_BOMB" | "ZIP_BOMB" | "IMAGE_BASED_PDF" | "NO_SECTIONS" | "PARSE_ERROR" | "FILE_TOO_LARGE";
|
|
147
|
+
type FileType = "hwpx" | "hwp" | "pdf" | "xlsx" | "docx" | "image" | "unknown";
|
|
146
148
|
interface ParseResultBase {
|
|
147
149
|
fileType: FileType;
|
|
148
150
|
/** 페이지/섹션 수 — PDF: 실제 페이지 수, HWP/HWPX: 섹션 수, XLSX: 시트 수 */
|
|
@@ -309,7 +311,10 @@ declare function detectFormat(buffer: ArrayBuffer): FileType;
|
|
|
309
311
|
* ZIP 내부 구조 기반 포맷 세분화.
|
|
310
312
|
* HWPX, XLSX, DOCX 모두 ZIP이므로 내부 파일로 구분.
|
|
311
313
|
*/
|
|
312
|
-
declare function detectZipFormat(buffer: ArrayBuffer): Promise<
|
|
314
|
+
declare function detectZipFormat(buffer: ArrayBuffer): Promise<{
|
|
315
|
+
format: "hwpx" | "xlsx" | "docx" | "unknown";
|
|
316
|
+
zip: JSZip | null;
|
|
317
|
+
}>;
|
|
313
318
|
|
|
314
319
|
/** 2-pass colSpan/rowSpan 테이블 빌더 및 Markdown 변환 */
|
|
315
320
|
|
|
@@ -338,14 +343,14 @@ declare const VERSION: string;
|
|
|
338
343
|
*/
|
|
339
344
|
declare function parse(input: string | ArrayBuffer | Buffer, options?: ParseOptions): Promise<ParseResult>;
|
|
340
345
|
/** HWPX 파일을 Markdown으로 변환 */
|
|
341
|
-
declare function parseHwpx(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
|
|
346
|
+
declare function parseHwpx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JSZip): Promise<ParseResult>;
|
|
342
347
|
/** HWP 5.x 바이너리 파일을 Markdown으로 변환 */
|
|
343
348
|
declare function parseHwp(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
|
|
344
349
|
/** PDF 파일에서 텍스트를 추출하여 Markdown으로 변환 */
|
|
345
350
|
declare function parsePdf(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
|
|
346
351
|
/** XLSX 파일을 Markdown으로 변환 */
|
|
347
|
-
declare function parseXlsx(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
|
|
352
|
+
declare function parseXlsx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JSZip): Promise<ParseResult>;
|
|
348
353
|
/** DOCX 파일을 Markdown으로 변환 */
|
|
349
|
-
declare function parseDocx(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
|
|
354
|
+
declare function parseDocx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JSZip): Promise<ParseResult>;
|
|
350
355
|
|
|
351
356
|
export { type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type MarkdownToXlsxOptions, type OcrMode, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, type StructuredOcrResult, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, markdownToXlsx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx };
|
package/dist/index.js
CHANGED
|
@@ -1998,8 +1998,8 @@ function getTesseractFallbackMessage() {
|
|
|
1998
1998
|
"\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 \uB0B4\uC7A5 tesseract.js\uB85C OCR\uC744 \uC218\uD589\uD569\uB2C8\uB2E4.",
|
|
1999
1999
|
"\uB354 \uB098\uC740 \uD488\uC9C8(\uD14C\uC774\uBE14/\uD5E4\uB529 \uAD6C\uC870 \uBCF4\uC874)\uC744 \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
|
|
2000
2000
|
"",
|
|
2001
|
-
" [\uAD8C\uC7A5]
|
|
2002
|
-
"
|
|
2001
|
+
" [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
|
|
2002
|
+
" Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
|
|
2003
2003
|
" Claude CLI: npm install -g @anthropic-ai/claude-code",
|
|
2004
2004
|
" Ollama: brew install ollama (+ ollama pull gemma4:27b)"
|
|
2005
2005
|
].join("\n");
|
|
@@ -2008,7 +2008,7 @@ var CLI_PRIORITY;
|
|
|
2008
2008
|
var init_auto_detect = __esm({
|
|
2009
2009
|
"src/ocr/auto-detect.ts"() {
|
|
2010
2010
|
"use strict";
|
|
2011
|
-
CLI_PRIORITY = ["
|
|
2011
|
+
CLI_PRIORITY = ["codex", "gemini", "claude", "ollama"];
|
|
2012
2012
|
}
|
|
2013
2013
|
});
|
|
2014
2014
|
|
|
@@ -2051,7 +2051,7 @@ function callCli(mode, imagePath) {
|
|
|
2051
2051
|
const args = buildCliArgs(mode, imagePath);
|
|
2052
2052
|
const result = spawnSync(mode, args, {
|
|
2053
2053
|
encoding: "utf-8",
|
|
2054
|
-
timeout:
|
|
2054
|
+
timeout: 6e5,
|
|
2055
2055
|
maxBuffer: 10 * 1024 * 1024,
|
|
2056
2056
|
// claude: /tmp에서 실행하여 프로젝트 CLAUDE.md의 규칙 간섭 방지
|
|
2057
2057
|
...mode === "claude" ? { cwd: tmpdir() } : {}
|
|
@@ -2145,14 +2145,22 @@ async function callOllamaApi(imagePath) {
|
|
|
2145
2145
|
return data.message?.content || "";
|
|
2146
2146
|
}
|
|
2147
2147
|
function stripCodeFence(text) {
|
|
2148
|
-
const match = text.match(/^```(?:markdown|md)?\s
|
|
2148
|
+
const match = text.match(/^```(?:markdown|md)?\s*([\s\S]*?)```\s*$/m);
|
|
2149
2149
|
return match ? match[1].trim() : text;
|
|
2150
2150
|
}
|
|
2151
2151
|
var OCR_PROMPT, _tempDir;
|
|
2152
2152
|
var init_cli_provider = __esm({
|
|
2153
2153
|
"src/ocr/cli-provider.ts"() {
|
|
2154
2154
|
"use strict";
|
|
2155
|
-
OCR_PROMPT =
|
|
2155
|
+
OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.
|
|
2156
|
+
\uADDC\uCE59:
|
|
2157
|
+
- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)
|
|
2158
|
+
- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC
|
|
2159
|
+
- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9
|
|
2160
|
+
- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9
|
|
2161
|
+
- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC
|
|
2162
|
+
- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0
|
|
2163
|
+
- \`\`\`\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825`;
|
|
2156
2164
|
_tempDir = null;
|
|
2157
2165
|
}
|
|
2158
2166
|
});
|
|
@@ -2321,9 +2329,8 @@ async function callBatchCli(mode, imagePaths) {
|
|
|
2321
2329
|
${fileRefs}`;
|
|
2322
2330
|
let args;
|
|
2323
2331
|
if (mode === "gemini") {
|
|
2324
|
-
|
|
2325
|
-
|
|
2326
|
-
if (model) args.push("--model", model);
|
|
2332
|
+
const model = process.env.KORDOC_GEMINI_MODEL ?? "gemini-2.5-flash";
|
|
2333
|
+
args = ["--prompt", prompt, "--yolo", "--model", model];
|
|
2327
2334
|
} else {
|
|
2328
2335
|
args = ["--print", prompt];
|
|
2329
2336
|
const model = process.env.KORDOC_CLAUDE_MODEL;
|
|
@@ -2667,22 +2674,22 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
|
|
|
2667
2674
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
2668
2675
|
pageNumbers.push(i);
|
|
2669
2676
|
}
|
|
2670
|
-
const
|
|
2671
|
-
for (
|
|
2672
|
-
|
|
2673
|
-
const image = await renderPageToPng(page);
|
|
2674
|
-
pageImages.push({ image, pageNum });
|
|
2675
|
-
}
|
|
2676
|
-
const batches = [];
|
|
2677
|
-
for (let i = 0; i < pageImages.length; i += provider.batchSize) {
|
|
2678
|
-
batches.push(pageImages.slice(i, i + provider.batchSize));
|
|
2677
|
+
const pageBatches = [];
|
|
2678
|
+
for (let i = 0; i < pageNumbers.length; i += provider.batchSize) {
|
|
2679
|
+
pageBatches.push(pageNumbers.slice(i, i + provider.batchSize));
|
|
2679
2680
|
}
|
|
2680
2681
|
let processed = 0;
|
|
2681
|
-
const batchTasks =
|
|
2682
|
+
const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
|
|
2682
2683
|
const pageBlocks = [];
|
|
2683
2684
|
try {
|
|
2684
|
-
const
|
|
2685
|
-
for (const
|
|
2685
|
+
const batchImages = [];
|
|
2686
|
+
for (const pageNum of batchPageNums) {
|
|
2687
|
+
const page = await doc.getPage(pageNum);
|
|
2688
|
+
const image = await renderPageToPng(page);
|
|
2689
|
+
batchImages.push({ image, pageNum });
|
|
2690
|
+
}
|
|
2691
|
+
const results = await provider.processBatch(batchImages);
|
|
2692
|
+
for (const { pageNum } of batchImages) {
|
|
2686
2693
|
const result = results.get(pageNum);
|
|
2687
2694
|
pageBlocks.push({
|
|
2688
2695
|
pageNum,
|
|
@@ -2690,16 +2697,16 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
|
|
|
2690
2697
|
});
|
|
2691
2698
|
}
|
|
2692
2699
|
} catch (err) {
|
|
2693
|
-
const range = `${
|
|
2700
|
+
const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
|
|
2694
2701
|
warnings?.push({
|
|
2695
2702
|
message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
|
|
2696
2703
|
code: "OCR_PAGE_FAILED"
|
|
2697
2704
|
});
|
|
2698
|
-
for (const
|
|
2705
|
+
for (const pageNum of batchPageNums) {
|
|
2699
2706
|
pageBlocks.push({ pageNum, blocks: [] });
|
|
2700
2707
|
}
|
|
2701
2708
|
}
|
|
2702
|
-
processed +=
|
|
2709
|
+
processed += batchPageNums.length;
|
|
2703
2710
|
onProgress?.(processed, pageNumbers.length);
|
|
2704
2711
|
return { batchIdx, pageBlocks };
|
|
2705
2712
|
});
|
|
@@ -2752,24 +2759,29 @@ function isPdfFile(buffer) {
|
|
|
2752
2759
|
const b = magicBytes(buffer);
|
|
2753
2760
|
return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
|
|
2754
2761
|
}
|
|
2762
|
+
function isPngFile(buffer) {
|
|
2763
|
+
const b = magicBytes(buffer);
|
|
2764
|
+
return b[0] === 137 && b[1] === 80 && b[2] === 78 && b[3] === 71;
|
|
2765
|
+
}
|
|
2755
2766
|
function detectFormat(buffer) {
|
|
2756
2767
|
if (buffer.byteLength < 4) return "unknown";
|
|
2757
2768
|
if (isZipFile(buffer)) return "hwpx";
|
|
2758
2769
|
if (isOldHwpFile(buffer)) return "hwp";
|
|
2759
2770
|
if (isPdfFile(buffer)) return "pdf";
|
|
2771
|
+
if (isPngFile(buffer)) return "image";
|
|
2760
2772
|
return "unknown";
|
|
2761
2773
|
}
|
|
2762
2774
|
async function detectZipFormat(buffer) {
|
|
2763
2775
|
try {
|
|
2764
2776
|
const zip = await JSZip.loadAsync(buffer);
|
|
2765
|
-
if (zip.file("xl/workbook.xml")) return "xlsx";
|
|
2766
|
-
if (zip.file("word/document.xml")) return "docx";
|
|
2767
|
-
if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
|
|
2777
|
+
if (zip.file("xl/workbook.xml")) return { format: "xlsx", zip };
|
|
2778
|
+
if (zip.file("word/document.xml")) return { format: "docx", zip };
|
|
2779
|
+
if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return { format: "hwpx", zip };
|
|
2768
2780
|
const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
|
|
2769
|
-
if (hasSection) return "hwpx";
|
|
2770
|
-
return "unknown";
|
|
2781
|
+
if (hasSection) return { format: "hwpx", zip };
|
|
2782
|
+
return { format: "unknown", zip: null };
|
|
2771
2783
|
} catch {
|
|
2772
|
-
return "unknown";
|
|
2784
|
+
return { format: "unknown", zip: null };
|
|
2773
2785
|
}
|
|
2774
2786
|
}
|
|
2775
2787
|
|
|
@@ -2778,7 +2790,7 @@ import JSZip2 from "jszip";
|
|
|
2778
2790
|
import { DOMParser } from "@xmldom/xmldom";
|
|
2779
2791
|
|
|
2780
2792
|
// src/utils.ts
|
|
2781
|
-
var VERSION = true ? "2.3.
|
|
2793
|
+
var VERSION = true ? "2.3.2" : "0.0.0-dev";
|
|
2782
2794
|
function toArrayBuffer(buf) {
|
|
2783
2795
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2784
2796
|
return buf.buffer;
|
|
@@ -2938,12 +2950,16 @@ function buildTableDirect(rows, numRows) {
|
|
|
2938
2950
|
return trimAndReturn(grid, numRows, maxCols);
|
|
2939
2951
|
}
|
|
2940
2952
|
function trimAndReturn(grid, numRows, maxCols) {
|
|
2941
|
-
let effectiveCols =
|
|
2942
|
-
|
|
2943
|
-
|
|
2944
|
-
|
|
2945
|
-
|
|
2953
|
+
let effectiveCols = 0;
|
|
2954
|
+
for (const row of grid) {
|
|
2955
|
+
for (let c = row.length - 1; c >= effectiveCols; c--) {
|
|
2956
|
+
if (row[c]?.text?.trim()) {
|
|
2957
|
+
effectiveCols = c + 1;
|
|
2958
|
+
break;
|
|
2959
|
+
}
|
|
2960
|
+
}
|
|
2946
2961
|
}
|
|
2962
|
+
if (effectiveCols === 0) effectiveCols = maxCols;
|
|
2947
2963
|
if (effectiveCols < maxCols && effectiveCols > 0) {
|
|
2948
2964
|
const trimmed = grid.map((row) => row.slice(0, effectiveCols));
|
|
2949
2965
|
return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
|
|
@@ -3200,11 +3216,11 @@ function parseStyleElements(doc, map) {
|
|
|
3200
3216
|
function stripDtd(xml) {
|
|
3201
3217
|
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
3202
3218
|
}
|
|
3203
|
-
async function parseHwpxDocument(buffer, options) {
|
|
3219
|
+
async function parseHwpxDocument(buffer, options, existingZip) {
|
|
3204
3220
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
3205
3221
|
let zip;
|
|
3206
3222
|
try {
|
|
3207
|
-
zip = await JSZip2.loadAsync(buffer);
|
|
3223
|
+
zip = existingZip ?? await JSZip2.loadAsync(buffer);
|
|
3208
3224
|
} catch {
|
|
3209
3225
|
return await extractFromBrokenZip(buffer);
|
|
3210
3226
|
}
|
|
@@ -6216,8 +6232,15 @@ import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mj
|
|
|
6216
6232
|
GlobalWorkerOptions.workerSrc = "";
|
|
6217
6233
|
var MAX_PAGES = 5e3;
|
|
6218
6234
|
var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
|
|
6219
|
-
|
|
6235
|
+
function calcPdfTimeout(bufferSize) {
|
|
6236
|
+
const base = 3e4;
|
|
6237
|
+
const perMb = 500;
|
|
6238
|
+
const mb = bufferSize / (1024 * 1024);
|
|
6239
|
+
return Math.min(base + Math.ceil(mb * perMb), 3e5);
|
|
6240
|
+
}
|
|
6220
6241
|
async function loadPdfWithTimeout(buffer) {
|
|
6242
|
+
const timeoutMs = calcPdfTimeout(buffer.byteLength);
|
|
6243
|
+
const timeoutSec = Math.round(timeoutMs / 1e3);
|
|
6221
6244
|
const loadingTask = getDocument({
|
|
6222
6245
|
data: new Uint8Array(buffer),
|
|
6223
6246
|
useSystemFonts: true,
|
|
@@ -6231,8 +6254,8 @@ async function loadPdfWithTimeout(buffer) {
|
|
|
6231
6254
|
new Promise((_, reject) => {
|
|
6232
6255
|
timer = setTimeout(() => {
|
|
6233
6256
|
loadingTask.destroy();
|
|
6234
|
-
reject(new KordocError(
|
|
6235
|
-
},
|
|
6257
|
+
reject(new KordocError(`PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (${timeoutSec}\uCD08 \uCD08\uACFC)`));
|
|
6258
|
+
}, timeoutMs);
|
|
6236
6259
|
})
|
|
6237
6260
|
]);
|
|
6238
6261
|
} finally {
|
|
@@ -6253,11 +6276,15 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6253
6276
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
6254
6277
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
6255
6278
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
6256
|
-
const
|
|
6279
|
+
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
6257
6280
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
6258
|
-
|
|
6281
|
+
const targetPageNums = [];
|
|
6259
6282
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
6260
6283
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
6284
|
+
targetPageNums.push(i);
|
|
6285
|
+
}
|
|
6286
|
+
let parsedPages = 0;
|
|
6287
|
+
const parseSinglePage = async (i) => {
|
|
6261
6288
|
try {
|
|
6262
6289
|
const page = await doc.getPage(i);
|
|
6263
6290
|
const tc = await page.getTextContent();
|
|
@@ -6270,7 +6297,10 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6270
6297
|
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
6271
6298
|
}
|
|
6272
6299
|
for (const item of visible) {
|
|
6273
|
-
if (item.fontSize > 0)
|
|
6300
|
+
if (item.fontSize > 0) {
|
|
6301
|
+
const rounded = Math.round(item.fontSize * 10) / 10;
|
|
6302
|
+
fontSizeFreq.set(rounded, (fontSizeFreq.get(rounded) || 0) + 1);
|
|
6303
|
+
}
|
|
6274
6304
|
}
|
|
6275
6305
|
const opList = await page.getOperatorList();
|
|
6276
6306
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
@@ -6287,12 +6317,23 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6287
6317
|
if (pageErr instanceof KordocError) throw pageErr;
|
|
6288
6318
|
warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
6289
6319
|
}
|
|
6320
|
+
};
|
|
6321
|
+
const sampleCount = Math.min(5, targetPageNums.length);
|
|
6322
|
+
for (let si = 0; si < sampleCount; si++) {
|
|
6323
|
+
await parseSinglePage(targetPageNums[si]);
|
|
6324
|
+
}
|
|
6325
|
+
const sampleParsed = parsedPages || sampleCount;
|
|
6326
|
+
const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
|
|
6327
|
+
if (!isImageBased) {
|
|
6328
|
+
for (let si = sampleCount; si < targetPageNums.length; si++) {
|
|
6329
|
+
await parseSinglePage(targetPageNums[si]);
|
|
6330
|
+
}
|
|
6290
6331
|
}
|
|
6291
6332
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
6292
|
-
if (
|
|
6333
|
+
if (isImageBased) {
|
|
6293
6334
|
let ocrProvider = options?.ocr ?? null;
|
|
6294
|
-
const ocrMode = options?.ocrMode;
|
|
6295
|
-
if (!ocrProvider && ocrMode
|
|
6335
|
+
const ocrMode = options?.ocrMode ?? "auto";
|
|
6336
|
+
if (!ocrProvider && ocrMode !== "off") {
|
|
6296
6337
|
try {
|
|
6297
6338
|
const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
|
|
6298
6339
|
const concurrency = options?.ocrConcurrency ?? 1;
|
|
@@ -6344,7 +6385,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6344
6385
|
blocks.splice(removed[ri], 1);
|
|
6345
6386
|
}
|
|
6346
6387
|
}
|
|
6347
|
-
const medianFontSize =
|
|
6388
|
+
const medianFontSize = computeMedianFromFreq(fontSizeFreq);
|
|
6348
6389
|
if (medianFontSize > 0) {
|
|
6349
6390
|
detectHeadings(blocks, medianFontSize);
|
|
6350
6391
|
}
|
|
@@ -6397,11 +6438,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
|
|
|
6397
6438
|
}
|
|
6398
6439
|
return { visible, hiddenCount };
|
|
6399
6440
|
}
|
|
6400
|
-
function
|
|
6401
|
-
if (
|
|
6402
|
-
const
|
|
6403
|
-
|
|
6404
|
-
|
|
6441
|
+
function computeMedianFromFreq(freq) {
|
|
6442
|
+
if (freq.size === 0) return 0;
|
|
6443
|
+
const entries = [...freq.entries()].sort((a, b) => a[0] - b[0]);
|
|
6444
|
+
let total = 0;
|
|
6445
|
+
for (const [, count] of entries) total += count;
|
|
6446
|
+
const mid = total / 2;
|
|
6447
|
+
let cumulative = 0;
|
|
6448
|
+
for (const [size, count] of entries) {
|
|
6449
|
+
cumulative += count;
|
|
6450
|
+
if (cumulative >= mid) return size;
|
|
6451
|
+
}
|
|
6452
|
+
return 0;
|
|
6405
6453
|
}
|
|
6406
6454
|
function detectHeadings(blocks, medianFontSize) {
|
|
6407
6455
|
for (const block of blocks) {
|
|
@@ -7204,6 +7252,7 @@ var MAX_SHEETS = 100;
|
|
|
7204
7252
|
var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
|
|
7205
7253
|
var MAX_ROWS2 = 1e4;
|
|
7206
7254
|
var MAX_COLS2 = 200;
|
|
7255
|
+
var MAX_TOTAL_CELLS = 2e6;
|
|
7207
7256
|
function cleanNumericValue(raw) {
|
|
7208
7257
|
if (!/^-?\d+\.\d+$/.test(raw)) return raw;
|
|
7209
7258
|
const num = parseFloat(raw);
|
|
@@ -7387,9 +7436,9 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
7387
7436
|
}
|
|
7388
7437
|
return blocks;
|
|
7389
7438
|
}
|
|
7390
|
-
async function parseXlsxDocument(buffer, options) {
|
|
7439
|
+
async function parseXlsxDocument(buffer, options, existingZip) {
|
|
7391
7440
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
|
|
7392
|
-
const zip = await JSZip3.loadAsync(buffer);
|
|
7441
|
+
const zip = existingZip ?? await JSZip3.loadAsync(buffer);
|
|
7393
7442
|
const warnings = [];
|
|
7394
7443
|
const workbookFile = zip.file("xl/workbook.xml");
|
|
7395
7444
|
if (!workbookFile) {
|
|
@@ -7416,6 +7465,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
7416
7465
|
}
|
|
7417
7466
|
const blocks = [];
|
|
7418
7467
|
const processedSheets = Math.min(sheets.length, MAX_SHEETS);
|
|
7468
|
+
let totalCells = 0;
|
|
7419
7469
|
for (let i = 0; i < processedSheets; i++) {
|
|
7420
7470
|
if (pageFilter && !pageFilter.has(i + 1)) continue;
|
|
7421
7471
|
const sheet = sheets[i];
|
|
@@ -7442,6 +7492,11 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
7442
7492
|
try {
|
|
7443
7493
|
const sheetXml = await sheetFile.async("text");
|
|
7444
7494
|
const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
|
|
7495
|
+
totalCells += maxRow * maxCol;
|
|
7496
|
+
if (totalCells > MAX_TOTAL_CELLS) {
|
|
7497
|
+
warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
|
|
7498
|
+
break;
|
|
7499
|
+
}
|
|
7445
7500
|
const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
|
|
7446
7501
|
blocks.push(...sheetBlocks);
|
|
7447
7502
|
} catch (err) {
|
|
@@ -7525,10 +7580,35 @@ function getAttr(el, localName) {
|
|
|
7525
7580
|
function parseXml2(text) {
|
|
7526
7581
|
return new DOMParser3().parseFromString(text, "text/xml");
|
|
7527
7582
|
}
|
|
7583
|
+
function buildElementIndex(root) {
|
|
7584
|
+
const index = /* @__PURE__ */ new Map();
|
|
7585
|
+
const walk = (node) => {
|
|
7586
|
+
const children = node.childNodes;
|
|
7587
|
+
for (let i = 0; i < children.length; i++) {
|
|
7588
|
+
const child = children[i];
|
|
7589
|
+
if (child.nodeType === 1) {
|
|
7590
|
+
const el = child;
|
|
7591
|
+
const name = el.localName ?? "";
|
|
7592
|
+
if (name) {
|
|
7593
|
+
let list = index.get(name);
|
|
7594
|
+
if (!list) {
|
|
7595
|
+
list = [];
|
|
7596
|
+
index.set(name, list);
|
|
7597
|
+
}
|
|
7598
|
+
list.push(el);
|
|
7599
|
+
}
|
|
7600
|
+
walk(el);
|
|
7601
|
+
}
|
|
7602
|
+
}
|
|
7603
|
+
};
|
|
7604
|
+
walk(root);
|
|
7605
|
+
return index;
|
|
7606
|
+
}
|
|
7528
7607
|
function parseStyles(xml) {
|
|
7529
7608
|
const doc = parseXml2(xml);
|
|
7530
7609
|
const styles = /* @__PURE__ */ new Map();
|
|
7531
|
-
const
|
|
7610
|
+
const idx = buildElementIndex(doc);
|
|
7611
|
+
const styleElements = idx.get("style") ?? [];
|
|
7532
7612
|
for (const el of styleElements) {
|
|
7533
7613
|
const styleId = getAttr(el, "styleId");
|
|
7534
7614
|
if (!styleId) continue;
|
|
@@ -7556,7 +7636,8 @@ function parseStyles(xml) {
|
|
|
7556
7636
|
function parseNumbering(xml) {
|
|
7557
7637
|
const doc = parseXml2(xml);
|
|
7558
7638
|
const abstractNums = /* @__PURE__ */ new Map();
|
|
7559
|
-
const
|
|
7639
|
+
const idx = buildElementIndex(doc);
|
|
7640
|
+
const abstractElements = idx.get("abstractNum") ?? [];
|
|
7560
7641
|
for (const el of abstractElements) {
|
|
7561
7642
|
const abstractNumId = getAttr(el, "abstractNumId");
|
|
7562
7643
|
if (!abstractNumId) continue;
|
|
@@ -7571,7 +7652,7 @@ function parseNumbering(xml) {
|
|
|
7571
7652
|
abstractNums.set(abstractNumId, levels);
|
|
7572
7653
|
}
|
|
7573
7654
|
const nums = /* @__PURE__ */ new Map();
|
|
7574
|
-
const numElements =
|
|
7655
|
+
const numElements = idx.get("num") ?? [];
|
|
7575
7656
|
for (const el of numElements) {
|
|
7576
7657
|
const numId = getAttr(el, "numId");
|
|
7577
7658
|
if (!numId) continue;
|
|
@@ -7815,9 +7896,9 @@ async function extractImages(zip, rels, doc) {
|
|
|
7815
7896
|
}
|
|
7816
7897
|
return { blocks, images };
|
|
7817
7898
|
}
|
|
7818
|
-
async function parseDocxDocument(buffer, options) {
|
|
7899
|
+
async function parseDocxDocument(buffer, options, existingZip) {
|
|
7819
7900
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
|
|
7820
|
-
const zip = await JSZip4.loadAsync(buffer);
|
|
7901
|
+
const zip = existingZip ?? await JSZip4.loadAsync(buffer);
|
|
7821
7902
|
const warnings = [];
|
|
7822
7903
|
const docFile = zip.file("word/document.xml");
|
|
7823
7904
|
if (!docFile) {
|
|
@@ -7907,6 +7988,11 @@ async function parseDocxDocument(buffer, options) {
|
|
|
7907
7988
|
};
|
|
7908
7989
|
}
|
|
7909
7990
|
|
|
7991
|
+
// src/index.ts
|
|
7992
|
+
init_cli_provider();
|
|
7993
|
+
init_tesseract_provider();
|
|
7994
|
+
init_markdown_to_blocks();
|
|
7995
|
+
|
|
7910
7996
|
// src/diff/text-diff.ts
|
|
7911
7997
|
function similarity(a, b) {
|
|
7912
7998
|
if (a === b) return 1;
|
|
@@ -10423,25 +10509,86 @@ async function parse2(input, options) {
|
|
|
10423
10509
|
if (!buffer || buffer.byteLength === 0) {
|
|
10424
10510
|
return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
|
|
10425
10511
|
}
|
|
10512
|
+
const MAX_FILE_SIZE = 500 * 1024 * 1024;
|
|
10513
|
+
if (buffer.byteLength > MAX_FILE_SIZE) {
|
|
10514
|
+
return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
|
|
10515
|
+
}
|
|
10426
10516
|
const format = detectFormat(buffer);
|
|
10427
10517
|
switch (format) {
|
|
10428
10518
|
case "hwpx": {
|
|
10429
|
-
const zipFormat = await detectZipFormat(buffer);
|
|
10430
|
-
if (zipFormat === "xlsx") return parseXlsx(buffer, options);
|
|
10431
|
-
if (zipFormat === "docx") return parseDocx(buffer, options);
|
|
10432
|
-
return parseHwpx(buffer, options);
|
|
10519
|
+
const { format: zipFormat, zip } = await detectZipFormat(buffer);
|
|
10520
|
+
if (zipFormat === "xlsx") return parseXlsx(buffer, options, zip ?? void 0);
|
|
10521
|
+
if (zipFormat === "docx") return parseDocx(buffer, options, zip ?? void 0);
|
|
10522
|
+
return parseHwpx(buffer, options, zip ?? void 0);
|
|
10433
10523
|
}
|
|
10434
10524
|
case "hwp":
|
|
10435
10525
|
return parseHwp(buffer, options);
|
|
10436
10526
|
case "pdf":
|
|
10437
10527
|
return parsePdf(buffer, options);
|
|
10528
|
+
case "image":
|
|
10529
|
+
return parseImage(buffer, options);
|
|
10438
10530
|
default:
|
|
10439
10531
|
return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
|
|
10440
10532
|
}
|
|
10441
10533
|
}
|
|
10442
|
-
async function parseHwpx(buffer, options) {
|
|
10534
|
+
async function parseImage(buffer, options) {
|
|
10535
|
+
const ocrMode = options?.ocrMode || "auto";
|
|
10536
|
+
if (ocrMode === "off") {
|
|
10537
|
+
return { success: false, fileType: "image", error: "OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC774\uBBF8\uC9C0 \uD30C\uC77C\uC744 \uCC98\uB9AC\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "IMAGE_BASED_PDF" };
|
|
10538
|
+
}
|
|
10539
|
+
let ocrProvider;
|
|
10540
|
+
let actualOcrMode = "auto";
|
|
10541
|
+
try {
|
|
10542
|
+
if (ocrMode === "gemini" || ocrMode === "claude" || ocrMode === "codex" || ocrMode === "ollama") {
|
|
10543
|
+
ocrProvider = createCliOcrProvider(ocrMode);
|
|
10544
|
+
actualOcrMode = ocrMode;
|
|
10545
|
+
} else if (ocrMode === "tesseract") {
|
|
10546
|
+
ocrProvider = await createTesseractProvider();
|
|
10547
|
+
actualOcrMode = ocrMode;
|
|
10548
|
+
} else if (ocrMode === "auto") {
|
|
10549
|
+
const modesToTry = ["gemini", "claude", "codex", "ollama"];
|
|
10550
|
+
for (const mode of modesToTry) {
|
|
10551
|
+
try {
|
|
10552
|
+
ocrProvider = createCliOcrProvider(mode);
|
|
10553
|
+
actualOcrMode = mode;
|
|
10554
|
+
break;
|
|
10555
|
+
} catch (e) {
|
|
10556
|
+
console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
|
|
10557
|
+
}
|
|
10558
|
+
}
|
|
10559
|
+
if (!ocrProvider) {
|
|
10560
|
+
ocrProvider = await createTesseractProvider();
|
|
10561
|
+
actualOcrMode = "tesseract";
|
|
10562
|
+
}
|
|
10563
|
+
}
|
|
10564
|
+
if (!ocrProvider) {
|
|
10565
|
+
return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
|
|
10566
|
+
}
|
|
10567
|
+
const imageUint8Array = new Uint8Array(buffer);
|
|
10568
|
+
const ocrResult = await ocrProvider(imageUint8Array, 1, "image/png");
|
|
10569
|
+
if (ocrProvider.terminate) {
|
|
10570
|
+
await ocrProvider.terminate();
|
|
10571
|
+
}
|
|
10572
|
+
const markdown = typeof ocrResult === "string" ? ocrResult : ocrResult.markdown;
|
|
10573
|
+
const blocks = markdownToBlocks(markdown, 1);
|
|
10574
|
+
return {
|
|
10575
|
+
success: true,
|
|
10576
|
+
fileType: "image",
|
|
10577
|
+
markdown,
|
|
10578
|
+
blocks,
|
|
10579
|
+
isImageBased: true,
|
|
10580
|
+
warnings: [{ message: `OCR \uCC98\uB9AC\uB428 (${actualOcrMode})`, code: "OCR_FALLBACK" }]
|
|
10581
|
+
};
|
|
10582
|
+
} catch (err) {
|
|
10583
|
+
if (ocrProvider && ocrProvider.terminate) {
|
|
10584
|
+
await ocrProvider.terminate();
|
|
10585
|
+
}
|
|
10586
|
+
return { success: false, fileType: "image", error: err instanceof Error ? err.message : "\uC774\uBBF8\uC9C0 OCR \uC2E4\uD328", code: classifyError(err) };
|
|
10587
|
+
}
|
|
10588
|
+
}
|
|
10589
|
+
async function parseHwpx(buffer, options, zip) {
|
|
10443
10590
|
try {
|
|
10444
|
-
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
|
|
10591
|
+
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
|
|
10445
10592
|
return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
10446
10593
|
} catch (err) {
|
|
10447
10594
|
return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
@@ -10464,17 +10611,17 @@ async function parsePdf(buffer, options) {
|
|
|
10464
10611
|
return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
|
|
10465
10612
|
}
|
|
10466
10613
|
}
|
|
10467
|
-
async function parseXlsx(buffer, options) {
|
|
10614
|
+
async function parseXlsx(buffer, options, zip) {
|
|
10468
10615
|
try {
|
|
10469
|
-
const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options);
|
|
10616
|
+
const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
|
|
10470
10617
|
return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
|
|
10471
10618
|
} catch (err) {
|
|
10472
10619
|
return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
10473
10620
|
}
|
|
10474
10621
|
}
|
|
10475
|
-
async function parseDocx(buffer, options) {
|
|
10622
|
+
async function parseDocx(buffer, options, zip) {
|
|
10476
10623
|
try {
|
|
10477
|
-
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
|
|
10624
|
+
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
|
|
10478
10625
|
return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
10479
10626
|
} catch (err) {
|
|
10480
10627
|
return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|