@clazic/kordoc 2.3.1 → 2.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38) hide show
  1. package/dist/{batch-provider-PCT4I4LK.js → batch-provider-PNDCSGQW.js} +3 -4
  2. package/dist/batch-provider-PNDCSGQW.js.map +1 -0
  3. package/dist/{chunk-ZOEUKD77.js → chunk-2GFJFTKS.js} +193 -49
  4. package/dist/chunk-2GFJFTKS.js.map +1 -0
  5. package/dist/chunk-4PP34NVQ.js +121 -0
  6. package/dist/chunk-4PP34NVQ.js.map +1 -0
  7. package/dist/{tesseract-provider-WCVJWBUT.js → chunk-7FMKAV4P.js} +4 -4
  8. package/dist/{tesseract-provider-WCVJWBUT.js.map → chunk-7FMKAV4P.js.map} +1 -1
  9. package/dist/chunk-JOGAFNIL.js +153 -0
  10. package/dist/chunk-JOGAFNIL.js.map +1 -0
  11. package/dist/{chunk-W5KUC23B.js → chunk-STIKJGEA.js} +2 -2
  12. package/dist/cli.js +8 -5
  13. package/dist/cli.js.map +1 -1
  14. package/dist/index.cjs +217 -70
  15. package/dist/index.cjs.map +1 -1
  16. package/dist/index.d.cts +11 -6
  17. package/dist/index.d.ts +11 -6
  18. package/dist/index.js +217 -70
  19. package/dist/index.js.map +1 -1
  20. package/dist/mcp.js +5 -2
  21. package/dist/mcp.js.map +1 -1
  22. package/dist/{provider-WYHC4NHI.js → provider-HE727F7Z.js} +19 -131
  23. package/dist/provider-HE727F7Z.js.map +1 -0
  24. package/dist/resolve-QA3VACUP.js +111 -0
  25. package/dist/resolve-QA3VACUP.js.map +1 -0
  26. package/dist/tesseract-provider-MNMZPSGF.js +11 -0
  27. package/dist/{utils-HSF5HI5T.js → utils-FFUQJTTI.js} +2 -2
  28. package/dist/utils-FFUQJTTI.js.map +1 -0
  29. package/dist/{watch-R2JHXDGF.js → watch-2O32L6IF.js} +6 -3
  30. package/dist/{watch-R2JHXDGF.js.map → watch-2O32L6IF.js.map} +1 -1
  31. package/package.json +7 -8
  32. package/dist/batch-provider-PCT4I4LK.js.map +0 -1
  33. package/dist/chunk-ZOEUKD77.js.map +0 -1
  34. package/dist/provider-WYHC4NHI.js.map +0 -1
  35. package/dist/resolve-4FSAQF2S.js +0 -247
  36. package/dist/resolve-4FSAQF2S.js.map +0 -1
  37. /package/dist/{chunk-W5KUC23B.js.map → chunk-STIKJGEA.js.map} +0 -0
  38. /package/dist/{utils-HSF5HI5T.js.map → tesseract-provider-MNMZPSGF.js.map} +0 -0
package/dist/index.d.cts CHANGED
@@ -1,3 +1,5 @@
1
+ import JSZip from 'jszip';
2
+
1
3
  /** kordoc 공통 타입 정의 */
2
4
  interface CellContext {
3
5
  text: string;
@@ -141,8 +143,8 @@ interface OutlineItem {
141
143
  pageNumber?: number;
142
144
  }
143
145
  /** 구조화된 에러 코드 — 프로그래밍적 에러 핸들링용 */
144
- type ErrorCode = "EMPTY_INPUT" | "UNSUPPORTED_FORMAT" | "ENCRYPTED" | "DRM_PROTECTED" | "CORRUPTED" | "DECOMPRESSION_BOMB" | "ZIP_BOMB" | "IMAGE_BASED_PDF" | "NO_SECTIONS" | "PARSE_ERROR";
145
- type FileType = "hwpx" | "hwp" | "pdf" | "xlsx" | "docx" | "unknown";
146
+ type ErrorCode = "EMPTY_INPUT" | "UNSUPPORTED_FORMAT" | "ENCRYPTED" | "DRM_PROTECTED" | "CORRUPTED" | "DECOMPRESSION_BOMB" | "ZIP_BOMB" | "IMAGE_BASED_PDF" | "NO_SECTIONS" | "PARSE_ERROR" | "FILE_TOO_LARGE";
147
+ type FileType = "hwpx" | "hwp" | "pdf" | "xlsx" | "docx" | "image" | "unknown";
146
148
  interface ParseResultBase {
147
149
  fileType: FileType;
148
150
  /** 페이지/섹션 수 — PDF: 실제 페이지 수, HWP/HWPX: 섹션 수, XLSX: 시트 수 */
@@ -309,7 +311,10 @@ declare function detectFormat(buffer: ArrayBuffer): FileType;
309
311
  * ZIP 내부 구조 기반 포맷 세분화.
310
312
  * HWPX, XLSX, DOCX 모두 ZIP이므로 내부 파일로 구분.
311
313
  */
312
- declare function detectZipFormat(buffer: ArrayBuffer): Promise<"hwpx" | "xlsx" | "docx" | "unknown">;
314
+ declare function detectZipFormat(buffer: ArrayBuffer): Promise<{
315
+ format: "hwpx" | "xlsx" | "docx" | "unknown";
316
+ zip: JSZip | null;
317
+ }>;
313
318
 
314
319
  /** 2-pass colSpan/rowSpan 테이블 빌더 및 Markdown 변환 */
315
320
 
@@ -338,14 +343,14 @@ declare const VERSION: string;
338
343
  */
339
344
  declare function parse(input: string | ArrayBuffer | Buffer, options?: ParseOptions): Promise<ParseResult>;
340
345
  /** HWPX 파일을 Markdown으로 변환 */
341
- declare function parseHwpx(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
346
+ declare function parseHwpx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JSZip): Promise<ParseResult>;
342
347
  /** HWP 5.x 바이너리 파일을 Markdown으로 변환 */
343
348
  declare function parseHwp(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
344
349
  /** PDF 파일에서 텍스트를 추출하여 Markdown으로 변환 */
345
350
  declare function parsePdf(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
346
351
  /** XLSX 파일을 Markdown으로 변환 */
347
- declare function parseXlsx(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
352
+ declare function parseXlsx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JSZip): Promise<ParseResult>;
348
353
  /** DOCX 파일을 Markdown으로 변환 */
349
- declare function parseDocx(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
354
+ declare function parseDocx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JSZip): Promise<ParseResult>;
350
355
 
351
356
  export { type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type MarkdownToXlsxOptions, type OcrMode, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, type StructuredOcrResult, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, markdownToXlsx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx };
package/dist/index.d.ts CHANGED
@@ -1,3 +1,5 @@
1
+ import JSZip from 'jszip';
2
+
1
3
  /** kordoc 공통 타입 정의 */
2
4
  interface CellContext {
3
5
  text: string;
@@ -141,8 +143,8 @@ interface OutlineItem {
141
143
  pageNumber?: number;
142
144
  }
143
145
  /** 구조화된 에러 코드 — 프로그래밍적 에러 핸들링용 */
144
- type ErrorCode = "EMPTY_INPUT" | "UNSUPPORTED_FORMAT" | "ENCRYPTED" | "DRM_PROTECTED" | "CORRUPTED" | "DECOMPRESSION_BOMB" | "ZIP_BOMB" | "IMAGE_BASED_PDF" | "NO_SECTIONS" | "PARSE_ERROR";
145
- type FileType = "hwpx" | "hwp" | "pdf" | "xlsx" | "docx" | "unknown";
146
+ type ErrorCode = "EMPTY_INPUT" | "UNSUPPORTED_FORMAT" | "ENCRYPTED" | "DRM_PROTECTED" | "CORRUPTED" | "DECOMPRESSION_BOMB" | "ZIP_BOMB" | "IMAGE_BASED_PDF" | "NO_SECTIONS" | "PARSE_ERROR" | "FILE_TOO_LARGE";
147
+ type FileType = "hwpx" | "hwp" | "pdf" | "xlsx" | "docx" | "image" | "unknown";
146
148
  interface ParseResultBase {
147
149
  fileType: FileType;
148
150
  /** 페이지/섹션 수 — PDF: 실제 페이지 수, HWP/HWPX: 섹션 수, XLSX: 시트 수 */
@@ -309,7 +311,10 @@ declare function detectFormat(buffer: ArrayBuffer): FileType;
309
311
  * ZIP 내부 구조 기반 포맷 세분화.
310
312
  * HWPX, XLSX, DOCX 모두 ZIP이므로 내부 파일로 구분.
311
313
  */
312
- declare function detectZipFormat(buffer: ArrayBuffer): Promise<"hwpx" | "xlsx" | "docx" | "unknown">;
314
+ declare function detectZipFormat(buffer: ArrayBuffer): Promise<{
315
+ format: "hwpx" | "xlsx" | "docx" | "unknown";
316
+ zip: JSZip | null;
317
+ }>;
313
318
 
314
319
  /** 2-pass colSpan/rowSpan 테이블 빌더 및 Markdown 변환 */
315
320
 
@@ -338,14 +343,14 @@ declare const VERSION: string;
338
343
  */
339
344
  declare function parse(input: string | ArrayBuffer | Buffer, options?: ParseOptions): Promise<ParseResult>;
340
345
  /** HWPX 파일을 Markdown으로 변환 */
341
- declare function parseHwpx(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
346
+ declare function parseHwpx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JSZip): Promise<ParseResult>;
342
347
  /** HWP 5.x 바이너리 파일을 Markdown으로 변환 */
343
348
  declare function parseHwp(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
344
349
  /** PDF 파일에서 텍스트를 추출하여 Markdown으로 변환 */
345
350
  declare function parsePdf(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
346
351
  /** XLSX 파일을 Markdown으로 변환 */
347
- declare function parseXlsx(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
352
+ declare function parseXlsx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JSZip): Promise<ParseResult>;
348
353
  /** DOCX 파일을 Markdown으로 변환 */
349
- declare function parseDocx(buffer: ArrayBuffer, options?: ParseOptions): Promise<ParseResult>;
354
+ declare function parseDocx(buffer: ArrayBuffer, options?: ParseOptions, zip?: JSZip): Promise<ParseResult>;
350
355
 
351
356
  export { type BlockDiff, type BoundingBox, type CellContext, type CellDiff, type DiffChangeType, type DiffResult, type DocumentMetadata, type ErrorCode, type ExtractedImage, type FileType, type FormField, type FormResult, type IRBlock, type IRBlockType, type IRCell, type IRTable, type ImageData, type InlineStyle, type MarkdownToXlsxOptions, type OcrMode, type OcrProvider, type OutlineItem, type ParseFailure, type ParseOptions, type ParseResult, type ParseSuccess, type ParseWarning, type StructuredOcrResult, VERSION, type WarningCode, type WatchOptions, blocksToMarkdown, compare, detectFormat, detectZipFormat, diffBlocks, extractFormFields, isHwpxFile, isOldHwpFile, isPdfFile, isZipFile, markdownToHwpx, markdownToXlsx, parse, parseDocx, parseHwp, parseHwpx, parsePdf, parseXlsx };
package/dist/index.js CHANGED
@@ -1998,8 +1998,8 @@ function getTesseractFallbackMessage() {
1998
1998
  "\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 \uB0B4\uC7A5 tesseract.js\uB85C OCR\uC744 \uC218\uD589\uD569\uB2C8\uB2E4.",
1999
1999
  "\uB354 \uB098\uC740 \uD488\uC9C8(\uD14C\uC774\uBE14/\uD5E4\uB529 \uAD6C\uC870 \uBCF4\uC874)\uC744 \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
2000
2000
  "",
2001
- " [\uAD8C\uC7A5] Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
2002
- " Codex CLI: npm install -g @openai/codex",
2001
+ " [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
2002
+ " Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
2003
2003
  " Claude CLI: npm install -g @anthropic-ai/claude-code",
2004
2004
  " Ollama: brew install ollama (+ ollama pull gemma4:27b)"
2005
2005
  ].join("\n");
@@ -2008,7 +2008,7 @@ var CLI_PRIORITY;
2008
2008
  var init_auto_detect = __esm({
2009
2009
  "src/ocr/auto-detect.ts"() {
2010
2010
  "use strict";
2011
- CLI_PRIORITY = ["gemini", "codex", "claude", "ollama"];
2011
+ CLI_PRIORITY = ["codex", "gemini", "claude", "ollama"];
2012
2012
  }
2013
2013
  });
2014
2014
 
@@ -2051,7 +2051,7 @@ function callCli(mode, imagePath) {
2051
2051
  const args = buildCliArgs(mode, imagePath);
2052
2052
  const result = spawnSync(mode, args, {
2053
2053
  encoding: "utf-8",
2054
- timeout: 18e4,
2054
+ timeout: 6e5,
2055
2055
  maxBuffer: 10 * 1024 * 1024,
2056
2056
  // claude: /tmp에서 실행하여 프로젝트 CLAUDE.md의 규칙 간섭 방지
2057
2057
  ...mode === "claude" ? { cwd: tmpdir() } : {}
@@ -2145,14 +2145,22 @@ async function callOllamaApi(imagePath) {
2145
2145
  return data.message?.content || "";
2146
2146
  }
2147
2147
  function stripCodeFence(text) {
2148
- const match = text.match(/^```(?:markdown|md)?\s*\n([\s\S]*?)\n```\s*$/m);
2148
+ const match = text.match(/^```(?:markdown|md)?\s*([\s\S]*?)```\s*$/m);
2149
2149
  return match ? match[1].trim() : text;
2150
2150
  }
2151
2151
  var OCR_PROMPT, _tempDir;
2152
2152
  var init_cli_provider = __esm({
2153
2153
  "src/ocr/cli-provider.ts"() {
2154
2154
  "use strict";
2155
- OCR_PROMPT = "\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.\n\uADDC\uCE59:\n- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)\n- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC\n- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9\n- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9\n- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC\n- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0\n- ```\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825";
2155
+ OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.
2156
+ \uADDC\uCE59:
2157
+ - \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)
2158
+ - \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC
2159
+ - \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9
2160
+ - \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9
2161
+ - \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC
2162
+ - \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0
2163
+ - \`\`\`\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825`;
2156
2164
  _tempDir = null;
2157
2165
  }
2158
2166
  });
@@ -2321,9 +2329,8 @@ async function callBatchCli(mode, imagePaths) {
2321
2329
  ${fileRefs}`;
2322
2330
  let args;
2323
2331
  if (mode === "gemini") {
2324
- args = ["--prompt", prompt, "--yolo"];
2325
- const model = process.env.KORDOC_GEMINI_MODEL;
2326
- if (model) args.push("--model", model);
2332
+ const model = process.env.KORDOC_GEMINI_MODEL ?? "gemini-2.5-flash";
2333
+ args = ["--prompt", prompt, "--yolo", "--model", model];
2327
2334
  } else {
2328
2335
  args = ["--print", prompt];
2329
2336
  const model = process.env.KORDOC_CLAUDE_MODEL;
@@ -2667,22 +2674,22 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
2667
2674
  if (pageFilter && !pageFilter.has(i)) continue;
2668
2675
  pageNumbers.push(i);
2669
2676
  }
2670
- const pageImages = [];
2671
- for (const pageNum of pageNumbers) {
2672
- const page = await doc.getPage(pageNum);
2673
- const image = await renderPageToPng(page);
2674
- pageImages.push({ image, pageNum });
2675
- }
2676
- const batches = [];
2677
- for (let i = 0; i < pageImages.length; i += provider.batchSize) {
2678
- batches.push(pageImages.slice(i, i + provider.batchSize));
2677
+ const pageBatches = [];
2678
+ for (let i = 0; i < pageNumbers.length; i += provider.batchSize) {
2679
+ pageBatches.push(pageNumbers.slice(i, i + provider.batchSize));
2679
2680
  }
2680
2681
  let processed = 0;
2681
- const batchTasks = batches.map((batch, batchIdx) => async () => {
2682
+ const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
2682
2683
  const pageBlocks = [];
2683
2684
  try {
2684
- const results = await provider.processBatch(batch);
2685
- for (const { pageNum } of batch) {
2685
+ const batchImages = [];
2686
+ for (const pageNum of batchPageNums) {
2687
+ const page = await doc.getPage(pageNum);
2688
+ const image = await renderPageToPng(page);
2689
+ batchImages.push({ image, pageNum });
2690
+ }
2691
+ const results = await provider.processBatch(batchImages);
2692
+ for (const { pageNum } of batchImages) {
2686
2693
  const result = results.get(pageNum);
2687
2694
  pageBlocks.push({
2688
2695
  pageNum,
@@ -2690,16 +2697,16 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
2690
2697
  });
2691
2698
  }
2692
2699
  } catch (err) {
2693
- const range = `${batch[0].pageNum}-${batch[batch.length - 1].pageNum}`;
2700
+ const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
2694
2701
  warnings?.push({
2695
2702
  message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2696
2703
  code: "OCR_PAGE_FAILED"
2697
2704
  });
2698
- for (const { pageNum } of batch) {
2705
+ for (const pageNum of batchPageNums) {
2699
2706
  pageBlocks.push({ pageNum, blocks: [] });
2700
2707
  }
2701
2708
  }
2702
- processed += batch.length;
2709
+ processed += batchPageNums.length;
2703
2710
  onProgress?.(processed, pageNumbers.length);
2704
2711
  return { batchIdx, pageBlocks };
2705
2712
  });
@@ -2752,24 +2759,29 @@ function isPdfFile(buffer) {
2752
2759
  const b = magicBytes(buffer);
2753
2760
  return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
2754
2761
  }
2762
+ function isPngFile(buffer) {
2763
+ const b = magicBytes(buffer);
2764
+ return b[0] === 137 && b[1] === 80 && b[2] === 78 && b[3] === 71;
2765
+ }
2755
2766
  function detectFormat(buffer) {
2756
2767
  if (buffer.byteLength < 4) return "unknown";
2757
2768
  if (isZipFile(buffer)) return "hwpx";
2758
2769
  if (isOldHwpFile(buffer)) return "hwp";
2759
2770
  if (isPdfFile(buffer)) return "pdf";
2771
+ if (isPngFile(buffer)) return "image";
2760
2772
  return "unknown";
2761
2773
  }
2762
2774
  async function detectZipFormat(buffer) {
2763
2775
  try {
2764
2776
  const zip = await JSZip.loadAsync(buffer);
2765
- if (zip.file("xl/workbook.xml")) return "xlsx";
2766
- if (zip.file("word/document.xml")) return "docx";
2767
- if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
2777
+ if (zip.file("xl/workbook.xml")) return { format: "xlsx", zip };
2778
+ if (zip.file("word/document.xml")) return { format: "docx", zip };
2779
+ if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return { format: "hwpx", zip };
2768
2780
  const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
2769
- if (hasSection) return "hwpx";
2770
- return "unknown";
2781
+ if (hasSection) return { format: "hwpx", zip };
2782
+ return { format: "unknown", zip: null };
2771
2783
  } catch {
2772
- return "unknown";
2784
+ return { format: "unknown", zip: null };
2773
2785
  }
2774
2786
  }
2775
2787
 
@@ -2778,7 +2790,7 @@ import JSZip2 from "jszip";
2778
2790
  import { DOMParser } from "@xmldom/xmldom";
2779
2791
 
2780
2792
  // src/utils.ts
2781
- var VERSION = true ? "2.3.1" : "0.0.0-dev";
2793
+ var VERSION = true ? "2.3.2" : "0.0.0-dev";
2782
2794
  function toArrayBuffer(buf) {
2783
2795
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2784
2796
  return buf.buffer;
@@ -2938,12 +2950,16 @@ function buildTableDirect(rows, numRows) {
2938
2950
  return trimAndReturn(grid, numRows, maxCols);
2939
2951
  }
2940
2952
  function trimAndReturn(grid, numRows, maxCols) {
2941
- let effectiveCols = maxCols;
2942
- while (effectiveCols > 0) {
2943
- const colEmpty = grid.every((row) => !row[effectiveCols - 1]?.text?.trim());
2944
- if (!colEmpty) break;
2945
- effectiveCols--;
2953
+ let effectiveCols = 0;
2954
+ for (const row of grid) {
2955
+ for (let c = row.length - 1; c >= effectiveCols; c--) {
2956
+ if (row[c]?.text?.trim()) {
2957
+ effectiveCols = c + 1;
2958
+ break;
2959
+ }
2960
+ }
2946
2961
  }
2962
+ if (effectiveCols === 0) effectiveCols = maxCols;
2947
2963
  if (effectiveCols < maxCols && effectiveCols > 0) {
2948
2964
  const trimmed = grid.map((row) => row.slice(0, effectiveCols));
2949
2965
  return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
@@ -3200,11 +3216,11 @@ function parseStyleElements(doc, map) {
3200
3216
  function stripDtd(xml) {
3201
3217
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
3202
3218
  }
3203
- async function parseHwpxDocument(buffer, options) {
3219
+ async function parseHwpxDocument(buffer, options, existingZip) {
3204
3220
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
3205
3221
  let zip;
3206
3222
  try {
3207
- zip = await JSZip2.loadAsync(buffer);
3223
+ zip = existingZip ?? await JSZip2.loadAsync(buffer);
3208
3224
  } catch {
3209
3225
  return await extractFromBrokenZip(buffer);
3210
3226
  }
@@ -6216,8 +6232,15 @@ import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mj
6216
6232
  GlobalWorkerOptions.workerSrc = "";
6217
6233
  var MAX_PAGES = 5e3;
6218
6234
  var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
6219
- var PDF_LOAD_TIMEOUT_MS = 3e4;
6235
+ function calcPdfTimeout(bufferSize) {
6236
+ const base = 3e4;
6237
+ const perMb = 500;
6238
+ const mb = bufferSize / (1024 * 1024);
6239
+ return Math.min(base + Math.ceil(mb * perMb), 3e5);
6240
+ }
6220
6241
  async function loadPdfWithTimeout(buffer) {
6242
+ const timeoutMs = calcPdfTimeout(buffer.byteLength);
6243
+ const timeoutSec = Math.round(timeoutMs / 1e3);
6221
6244
  const loadingTask = getDocument({
6222
6245
  data: new Uint8Array(buffer),
6223
6246
  useSystemFonts: true,
@@ -6231,8 +6254,8 @@ async function loadPdfWithTimeout(buffer) {
6231
6254
  new Promise((_, reject) => {
6232
6255
  timer = setTimeout(() => {
6233
6256
  loadingTask.destroy();
6234
- reject(new KordocError("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
6235
- }, PDF_LOAD_TIMEOUT_MS);
6257
+ reject(new KordocError(`PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (${timeoutSec}\uCD08 \uCD08\uACFC)`));
6258
+ }, timeoutMs);
6236
6259
  })
6237
6260
  ]);
6238
6261
  } finally {
@@ -6253,11 +6276,15 @@ async function parsePdfDocument(buffer, options) {
6253
6276
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
6254
6277
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
6255
6278
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
6256
- const allFontSizes = [];
6279
+ const fontSizeFreq = /* @__PURE__ */ new Map();
6257
6280
  const pageHeights = /* @__PURE__ */ new Map();
6258
- let parsedPages = 0;
6281
+ const targetPageNums = [];
6259
6282
  for (let i = 1; i <= effectivePageCount; i++) {
6260
6283
  if (pageFilter && !pageFilter.has(i)) continue;
6284
+ targetPageNums.push(i);
6285
+ }
6286
+ let parsedPages = 0;
6287
+ const parseSinglePage = async (i) => {
6261
6288
  try {
6262
6289
  const page = await doc.getPage(i);
6263
6290
  const tc = await page.getTextContent();
@@ -6270,7 +6297,10 @@ async function parsePdfDocument(buffer, options) {
6270
6297
  warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
6271
6298
  }
6272
6299
  for (const item of visible) {
6273
- if (item.fontSize > 0) allFontSizes.push(item.fontSize);
6300
+ if (item.fontSize > 0) {
6301
+ const rounded = Math.round(item.fontSize * 10) / 10;
6302
+ fontSizeFreq.set(rounded, (fontSizeFreq.get(rounded) || 0) + 1);
6303
+ }
6274
6304
  }
6275
6305
  const opList = await page.getOperatorList();
6276
6306
  const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
@@ -6287,12 +6317,23 @@ async function parsePdfDocument(buffer, options) {
6287
6317
  if (pageErr instanceof KordocError) throw pageErr;
6288
6318
  warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
6289
6319
  }
6320
+ };
6321
+ const sampleCount = Math.min(5, targetPageNums.length);
6322
+ for (let si = 0; si < sampleCount; si++) {
6323
+ await parseSinglePage(targetPageNums[si]);
6324
+ }
6325
+ const sampleParsed = parsedPages || sampleCount;
6326
+ const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
6327
+ if (!isImageBased) {
6328
+ for (let si = sampleCount; si < targetPageNums.length; si++) {
6329
+ await parseSinglePage(targetPageNums[si]);
6330
+ }
6290
6331
  }
6291
6332
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
6292
- if (totalChars / Math.max(parsedPageCount, 1) < 10) {
6333
+ if (isImageBased) {
6293
6334
  let ocrProvider = options?.ocr ?? null;
6294
- const ocrMode = options?.ocrMode;
6295
- if (!ocrProvider && ocrMode && ocrMode !== "off") {
6335
+ const ocrMode = options?.ocrMode ?? "auto";
6336
+ if (!ocrProvider && ocrMode !== "off") {
6296
6337
  try {
6297
6338
  const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
6298
6339
  const concurrency = options?.ocrConcurrency ?? 1;
@@ -6344,7 +6385,7 @@ async function parsePdfDocument(buffer, options) {
6344
6385
  blocks.splice(removed[ri], 1);
6345
6386
  }
6346
6387
  }
6347
- const medianFontSize = computeMedianFontSize(allFontSizes);
6388
+ const medianFontSize = computeMedianFromFreq(fontSizeFreq);
6348
6389
  if (medianFontSize > 0) {
6349
6390
  detectHeadings(blocks, medianFontSize);
6350
6391
  }
@@ -6397,11 +6438,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
6397
6438
  }
6398
6439
  return { visible, hiddenCount };
6399
6440
  }
6400
- function computeMedianFontSize(sizes) {
6401
- if (sizes.length === 0) return 0;
6402
- const sorted = [...sizes].sort((a, b) => a - b);
6403
- const mid = Math.floor(sorted.length / 2);
6404
- return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
6441
+ function computeMedianFromFreq(freq) {
6442
+ if (freq.size === 0) return 0;
6443
+ const entries = [...freq.entries()].sort((a, b) => a[0] - b[0]);
6444
+ let total = 0;
6445
+ for (const [, count] of entries) total += count;
6446
+ const mid = total / 2;
6447
+ let cumulative = 0;
6448
+ for (const [size, count] of entries) {
6449
+ cumulative += count;
6450
+ if (cumulative >= mid) return size;
6451
+ }
6452
+ return 0;
6405
6453
  }
6406
6454
  function detectHeadings(blocks, medianFontSize) {
6407
6455
  for (const block of blocks) {
@@ -7204,6 +7252,7 @@ var MAX_SHEETS = 100;
7204
7252
  var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
7205
7253
  var MAX_ROWS2 = 1e4;
7206
7254
  var MAX_COLS2 = 200;
7255
+ var MAX_TOTAL_CELLS = 2e6;
7207
7256
  function cleanNumericValue(raw) {
7208
7257
  if (!/^-?\d+\.\d+$/.test(raw)) return raw;
7209
7258
  const num = parseFloat(raw);
@@ -7387,9 +7436,9 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
7387
7436
  }
7388
7437
  return blocks;
7389
7438
  }
7390
- async function parseXlsxDocument(buffer, options) {
7439
+ async function parseXlsxDocument(buffer, options, existingZip) {
7391
7440
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
7392
- const zip = await JSZip3.loadAsync(buffer);
7441
+ const zip = existingZip ?? await JSZip3.loadAsync(buffer);
7393
7442
  const warnings = [];
7394
7443
  const workbookFile = zip.file("xl/workbook.xml");
7395
7444
  if (!workbookFile) {
@@ -7416,6 +7465,7 @@ async function parseXlsxDocument(buffer, options) {
7416
7465
  }
7417
7466
  const blocks = [];
7418
7467
  const processedSheets = Math.min(sheets.length, MAX_SHEETS);
7468
+ let totalCells = 0;
7419
7469
  for (let i = 0; i < processedSheets; i++) {
7420
7470
  if (pageFilter && !pageFilter.has(i + 1)) continue;
7421
7471
  const sheet = sheets[i];
@@ -7442,6 +7492,11 @@ async function parseXlsxDocument(buffer, options) {
7442
7492
  try {
7443
7493
  const sheetXml = await sheetFile.async("text");
7444
7494
  const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
7495
+ totalCells += maxRow * maxCol;
7496
+ if (totalCells > MAX_TOTAL_CELLS) {
7497
+ warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
7498
+ break;
7499
+ }
7445
7500
  const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
7446
7501
  blocks.push(...sheetBlocks);
7447
7502
  } catch (err) {
@@ -7525,10 +7580,35 @@ function getAttr(el, localName) {
7525
7580
  function parseXml2(text) {
7526
7581
  return new DOMParser3().parseFromString(text, "text/xml");
7527
7582
  }
7583
+ function buildElementIndex(root) {
7584
+ const index = /* @__PURE__ */ new Map();
7585
+ const walk = (node) => {
7586
+ const children = node.childNodes;
7587
+ for (let i = 0; i < children.length; i++) {
7588
+ const child = children[i];
7589
+ if (child.nodeType === 1) {
7590
+ const el = child;
7591
+ const name = el.localName ?? "";
7592
+ if (name) {
7593
+ let list = index.get(name);
7594
+ if (!list) {
7595
+ list = [];
7596
+ index.set(name, list);
7597
+ }
7598
+ list.push(el);
7599
+ }
7600
+ walk(el);
7601
+ }
7602
+ }
7603
+ };
7604
+ walk(root);
7605
+ return index;
7606
+ }
7528
7607
  function parseStyles(xml) {
7529
7608
  const doc = parseXml2(xml);
7530
7609
  const styles = /* @__PURE__ */ new Map();
7531
- const styleElements = findElements(doc, "style");
7610
+ const idx = buildElementIndex(doc);
7611
+ const styleElements = idx.get("style") ?? [];
7532
7612
  for (const el of styleElements) {
7533
7613
  const styleId = getAttr(el, "styleId");
7534
7614
  if (!styleId) continue;
@@ -7556,7 +7636,8 @@ function parseStyles(xml) {
7556
7636
  function parseNumbering(xml) {
7557
7637
  const doc = parseXml2(xml);
7558
7638
  const abstractNums = /* @__PURE__ */ new Map();
7559
- const abstractElements = findElements(doc, "abstractNum");
7639
+ const idx = buildElementIndex(doc);
7640
+ const abstractElements = idx.get("abstractNum") ?? [];
7560
7641
  for (const el of abstractElements) {
7561
7642
  const abstractNumId = getAttr(el, "abstractNumId");
7562
7643
  if (!abstractNumId) continue;
@@ -7571,7 +7652,7 @@ function parseNumbering(xml) {
7571
7652
  abstractNums.set(abstractNumId, levels);
7572
7653
  }
7573
7654
  const nums = /* @__PURE__ */ new Map();
7574
- const numElements = findElements(doc, "num");
7655
+ const numElements = idx.get("num") ?? [];
7575
7656
  for (const el of numElements) {
7576
7657
  const numId = getAttr(el, "numId");
7577
7658
  if (!numId) continue;
@@ -7815,9 +7896,9 @@ async function extractImages(zip, rels, doc) {
7815
7896
  }
7816
7897
  return { blocks, images };
7817
7898
  }
7818
- async function parseDocxDocument(buffer, options) {
7899
+ async function parseDocxDocument(buffer, options, existingZip) {
7819
7900
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
7820
- const zip = await JSZip4.loadAsync(buffer);
7901
+ const zip = existingZip ?? await JSZip4.loadAsync(buffer);
7821
7902
  const warnings = [];
7822
7903
  const docFile = zip.file("word/document.xml");
7823
7904
  if (!docFile) {
@@ -7907,6 +7988,11 @@ async function parseDocxDocument(buffer, options) {
7907
7988
  };
7908
7989
  }
7909
7990
 
7991
+ // src/index.ts
7992
+ init_cli_provider();
7993
+ init_tesseract_provider();
7994
+ init_markdown_to_blocks();
7995
+
7910
7996
  // src/diff/text-diff.ts
7911
7997
  function similarity(a, b) {
7912
7998
  if (a === b) return 1;
@@ -10423,25 +10509,86 @@ async function parse2(input, options) {
10423
10509
  if (!buffer || buffer.byteLength === 0) {
10424
10510
  return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
10425
10511
  }
10512
+ const MAX_FILE_SIZE = 500 * 1024 * 1024;
10513
+ if (buffer.byteLength > MAX_FILE_SIZE) {
10514
+ return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
10515
+ }
10426
10516
  const format = detectFormat(buffer);
10427
10517
  switch (format) {
10428
10518
  case "hwpx": {
10429
- const zipFormat = await detectZipFormat(buffer);
10430
- if (zipFormat === "xlsx") return parseXlsx(buffer, options);
10431
- if (zipFormat === "docx") return parseDocx(buffer, options);
10432
- return parseHwpx(buffer, options);
10519
+ const { format: zipFormat, zip } = await detectZipFormat(buffer);
10520
+ if (zipFormat === "xlsx") return parseXlsx(buffer, options, zip ?? void 0);
10521
+ if (zipFormat === "docx") return parseDocx(buffer, options, zip ?? void 0);
10522
+ return parseHwpx(buffer, options, zip ?? void 0);
10433
10523
  }
10434
10524
  case "hwp":
10435
10525
  return parseHwp(buffer, options);
10436
10526
  case "pdf":
10437
10527
  return parsePdf(buffer, options);
10528
+ case "image":
10529
+ return parseImage(buffer, options);
10438
10530
  default:
10439
10531
  return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
10440
10532
  }
10441
10533
  }
10442
- async function parseHwpx(buffer, options) {
10534
+ async function parseImage(buffer, options) {
10535
+ const ocrMode = options?.ocrMode || "auto";
10536
+ if (ocrMode === "off") {
10537
+ return { success: false, fileType: "image", error: "OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC774\uBBF8\uC9C0 \uD30C\uC77C\uC744 \uCC98\uB9AC\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "IMAGE_BASED_PDF" };
10538
+ }
10539
+ let ocrProvider;
10540
+ let actualOcrMode = "auto";
10541
+ try {
10542
+ if (ocrMode === "gemini" || ocrMode === "claude" || ocrMode === "codex" || ocrMode === "ollama") {
10543
+ ocrProvider = createCliOcrProvider(ocrMode);
10544
+ actualOcrMode = ocrMode;
10545
+ } else if (ocrMode === "tesseract") {
10546
+ ocrProvider = await createTesseractProvider();
10547
+ actualOcrMode = ocrMode;
10548
+ } else if (ocrMode === "auto") {
10549
+ const modesToTry = ["gemini", "claude", "codex", "ollama"];
10550
+ for (const mode of modesToTry) {
10551
+ try {
10552
+ ocrProvider = createCliOcrProvider(mode);
10553
+ actualOcrMode = mode;
10554
+ break;
10555
+ } catch (e) {
10556
+ console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
10557
+ }
10558
+ }
10559
+ if (!ocrProvider) {
10560
+ ocrProvider = await createTesseractProvider();
10561
+ actualOcrMode = "tesseract";
10562
+ }
10563
+ }
10564
+ if (!ocrProvider) {
10565
+ return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
10566
+ }
10567
+ const imageUint8Array = new Uint8Array(buffer);
10568
+ const ocrResult = await ocrProvider(imageUint8Array, 1, "image/png");
10569
+ if (ocrProvider.terminate) {
10570
+ await ocrProvider.terminate();
10571
+ }
10572
+ const markdown = typeof ocrResult === "string" ? ocrResult : ocrResult.markdown;
10573
+ const blocks = markdownToBlocks(markdown, 1);
10574
+ return {
10575
+ success: true,
10576
+ fileType: "image",
10577
+ markdown,
10578
+ blocks,
10579
+ isImageBased: true,
10580
+ warnings: [{ message: `OCR \uCC98\uB9AC\uB428 (${actualOcrMode})`, code: "OCR_FALLBACK" }]
10581
+ };
10582
+ } catch (err) {
10583
+ if (ocrProvider && ocrProvider.terminate) {
10584
+ await ocrProvider.terminate();
10585
+ }
10586
+ return { success: false, fileType: "image", error: err instanceof Error ? err.message : "\uC774\uBBF8\uC9C0 OCR \uC2E4\uD328", code: classifyError(err) };
10587
+ }
10588
+ }
10589
+ async function parseHwpx(buffer, options, zip) {
10443
10590
  try {
10444
- const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
10591
+ const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
10445
10592
  return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
10446
10593
  } catch (err) {
10447
10594
  return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
@@ -10464,17 +10611,17 @@ async function parsePdf(buffer, options) {
10464
10611
  return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
10465
10612
  }
10466
10613
  }
10467
- async function parseXlsx(buffer, options) {
10614
+ async function parseXlsx(buffer, options, zip) {
10468
10615
  try {
10469
- const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options);
10616
+ const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
10470
10617
  return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
10471
10618
  } catch (err) {
10472
10619
  return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
10473
10620
  }
10474
10621
  }
10475
- async function parseDocx(buffer, options) {
10622
+ async function parseDocx(buffer, options, zip) {
10476
10623
  try {
10477
- const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
10624
+ const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
10478
10625
  return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
10479
10626
  } catch (err) {
10480
10627
  return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };