@clazic/kordoc 2.3.1 → 2.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/dist/{batch-provider-PCT4I4LK.js → batch-provider-PNDCSGQW.js} +3 -4
  2. package/dist/batch-provider-PNDCSGQW.js.map +1 -0
  3. package/dist/chunk-4PP34NVQ.js +121 -0
  4. package/dist/chunk-4PP34NVQ.js.map +1 -0
  5. package/dist/{tesseract-provider-WCVJWBUT.js → chunk-7FMKAV4P.js} +4 -4
  6. package/dist/{tesseract-provider-WCVJWBUT.js.map → chunk-7FMKAV4P.js.map} +1 -1
  7. package/dist/chunk-JOGAFNIL.js +153 -0
  8. package/dist/chunk-JOGAFNIL.js.map +1 -0
  9. package/dist/{chunk-W5KUC23B.js → chunk-NU3KFVVZ.js} +2 -2
  10. package/dist/{chunk-ZOEUKD77.js → chunk-UDFKY7CH.js} +204 -49
  11. package/dist/chunk-UDFKY7CH.js.map +1 -0
  12. package/dist/cli.js +8 -5
  13. package/dist/cli.js.map +1 -1
  14. package/dist/index.cjs +230 -72
  15. package/dist/index.cjs.map +1 -1
  16. package/dist/index.d.cts +11 -6
  17. package/dist/index.d.ts +11 -6
  18. package/dist/index.js +230 -72
  19. package/dist/index.js.map +1 -1
  20. package/dist/mcp.js +5 -2
  21. package/dist/mcp.js.map +1 -1
  22. package/dist/{provider-WYHC4NHI.js → provider-HE727F7Z.js} +19 -131
  23. package/dist/provider-HE727F7Z.js.map +1 -0
  24. package/dist/resolve-UOAOPQ4H.js +111 -0
  25. package/dist/resolve-UOAOPQ4H.js.map +1 -0
  26. package/dist/tesseract-provider-MNMZPSGF.js +11 -0
  27. package/dist/{utils-HSF5HI5T.js → utils-STJT6CFC.js} +2 -2
  28. package/dist/utils-STJT6CFC.js.map +1 -0
  29. package/dist/{watch-R2JHXDGF.js → watch-PRQGLOW3.js} +6 -3
  30. package/dist/{watch-R2JHXDGF.js.map → watch-PRQGLOW3.js.map} +1 -1
  31. package/package.json +8 -8
  32. package/dist/batch-provider-PCT4I4LK.js.map +0 -1
  33. package/dist/chunk-ZOEUKD77.js.map +0 -1
  34. package/dist/provider-WYHC4NHI.js.map +0 -1
  35. package/dist/resolve-4FSAQF2S.js +0 -247
  36. package/dist/resolve-4FSAQF2S.js.map +0 -1
  37. /package/dist/{chunk-W5KUC23B.js.map → chunk-NU3KFVVZ.js.map} +0 -0
  38. /package/dist/{utils-HSF5HI5T.js.map → tesseract-provider-MNMZPSGF.js.map} +0 -0
@@ -6,10 +6,19 @@ import {
6
6
  precheckZipSize,
7
7
  sanitizeHref,
8
8
  toArrayBuffer
9
- } from "./chunk-W5KUC23B.js";
9
+ } from "./chunk-NU3KFVVZ.js";
10
10
  import {
11
11
  parsePageRange
12
12
  } from "./chunk-MOL7MDBG.js";
13
+ import {
14
+ createTesseractProvider
15
+ } from "./chunk-7FMKAV4P.js";
16
+ import {
17
+ createCliOcrProvider
18
+ } from "./chunk-JOGAFNIL.js";
19
+ import {
20
+ markdownToBlocks
21
+ } from "./chunk-4PP34NVQ.js";
13
22
  import {
14
23
  __commonJS,
15
24
  __require,
@@ -1918,24 +1927,29 @@ function isPdfFile(buffer) {
1918
1927
  const b = magicBytes(buffer);
1919
1928
  return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
1920
1929
  }
1930
+ function isPngFile(buffer) {
1931
+ const b = magicBytes(buffer);
1932
+ return b[0] === 137 && b[1] === 80 && b[2] === 78 && b[3] === 71;
1933
+ }
1921
1934
  function detectFormat(buffer) {
1922
1935
  if (buffer.byteLength < 4) return "unknown";
1923
1936
  if (isZipFile(buffer)) return "hwpx";
1924
1937
  if (isOldHwpFile(buffer)) return "hwp";
1925
1938
  if (isPdfFile(buffer)) return "pdf";
1939
+ if (isPngFile(buffer)) return "image";
1926
1940
  return "unknown";
1927
1941
  }
1928
1942
  async function detectZipFormat(buffer) {
1929
1943
  try {
1930
1944
  const zip = await JSZip.loadAsync(buffer);
1931
- if (zip.file("xl/workbook.xml")) return "xlsx";
1932
- if (zip.file("word/document.xml")) return "docx";
1933
- if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
1945
+ if (zip.file("xl/workbook.xml")) return { format: "xlsx", zip };
1946
+ if (zip.file("word/document.xml")) return { format: "docx", zip };
1947
+ if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return { format: "hwpx", zip };
1934
1948
  const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
1935
- if (hasSection) return "hwpx";
1936
- return "unknown";
1949
+ if (hasSection) return { format: "hwpx", zip };
1950
+ return { format: "unknown", zip: null };
1937
1951
  } catch {
1938
- return "unknown";
1952
+ return { format: "unknown", zip: null };
1939
1953
  }
1940
1954
  }
1941
1955
 
@@ -2024,12 +2038,16 @@ function buildTableDirect(rows, numRows) {
2024
2038
  return trimAndReturn(grid, numRows, maxCols);
2025
2039
  }
2026
2040
  function trimAndReturn(grid, numRows, maxCols) {
2027
- let effectiveCols = maxCols;
2028
- while (effectiveCols > 0) {
2029
- const colEmpty = grid.every((row) => !row[effectiveCols - 1]?.text?.trim());
2030
- if (!colEmpty) break;
2031
- effectiveCols--;
2041
+ let effectiveCols = 0;
2042
+ for (const row of grid) {
2043
+ for (let c = row.length - 1; c >= effectiveCols; c--) {
2044
+ if (row[c]?.text?.trim()) {
2045
+ effectiveCols = c + 1;
2046
+ break;
2047
+ }
2048
+ }
2032
2049
  }
2050
+ if (effectiveCols === 0) effectiveCols = maxCols;
2033
2051
  if (effectiveCols < maxCols && effectiveCols > 0) {
2034
2052
  const trimmed = grid.map((row) => row.slice(0, effectiveCols));
2035
2053
  return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
@@ -2289,11 +2307,11 @@ function parseStyleElements(doc, map) {
2289
2307
  function stripDtd(xml) {
2290
2308
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
2291
2309
  }
2292
- async function parseHwpxDocument(buffer, options) {
2310
+ async function parseHwpxDocument(buffer, options, existingZip) {
2293
2311
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
2294
2312
  let zip;
2295
2313
  try {
2296
- zip = await JSZip2.loadAsync(buffer);
2314
+ zip = existingZip ?? await JSZip2.loadAsync(buffer);
2297
2315
  } catch {
2298
2316
  return await extractFromBrokenZip(buffer);
2299
2317
  }
@@ -5328,8 +5346,15 @@ import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mj
5328
5346
  GlobalWorkerOptions.workerSrc = "";
5329
5347
  var MAX_PAGES = 5e3;
5330
5348
  var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
5331
- var PDF_LOAD_TIMEOUT_MS = 3e4;
5349
+ function calcPdfTimeout(bufferSize) {
5350
+ const base = 3e4;
5351
+ const perMb = 500;
5352
+ const mb = bufferSize / (1024 * 1024);
5353
+ return Math.min(base + Math.ceil(mb * perMb), 3e5);
5354
+ }
5332
5355
  async function loadPdfWithTimeout(buffer) {
5356
+ const timeoutMs = calcPdfTimeout(buffer.byteLength);
5357
+ const timeoutSec = Math.round(timeoutMs / 1e3);
5333
5358
  const loadingTask = getDocument({
5334
5359
  data: new Uint8Array(buffer),
5335
5360
  useSystemFonts: true,
@@ -5343,8 +5368,8 @@ async function loadPdfWithTimeout(buffer) {
5343
5368
  new Promise((_, reject) => {
5344
5369
  timer = setTimeout(() => {
5345
5370
  loadingTask.destroy();
5346
- reject(new KordocError("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
5347
- }, PDF_LOAD_TIMEOUT_MS);
5371
+ reject(new KordocError(`PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (${timeoutSec}\uCD08 \uCD08\uACFC)`));
5372
+ }, timeoutMs);
5348
5373
  })
5349
5374
  ]);
5350
5375
  } finally {
@@ -5365,11 +5390,15 @@ async function parsePdfDocument(buffer, options) {
5365
5390
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
5366
5391
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
5367
5392
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
5368
- const allFontSizes = [];
5393
+ const fontSizeFreq = /* @__PURE__ */ new Map();
5369
5394
  const pageHeights = /* @__PURE__ */ new Map();
5370
- let parsedPages = 0;
5395
+ const targetPageNums = [];
5371
5396
  for (let i = 1; i <= effectivePageCount; i++) {
5372
5397
  if (pageFilter && !pageFilter.has(i)) continue;
5398
+ targetPageNums.push(i);
5399
+ }
5400
+ let parsedPages = 0;
5401
+ const parseSinglePage = async (i) => {
5373
5402
  try {
5374
5403
  const page = await doc.getPage(i);
5375
5404
  const tc = await page.getTextContent();
@@ -5382,7 +5411,10 @@ async function parsePdfDocument(buffer, options) {
5382
5411
  warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
5383
5412
  }
5384
5413
  for (const item of visible) {
5385
- if (item.fontSize > 0) allFontSizes.push(item.fontSize);
5414
+ if (item.fontSize > 0) {
5415
+ const rounded = Math.round(item.fontSize * 10) / 10;
5416
+ fontSizeFreq.set(rounded, (fontSizeFreq.get(rounded) || 0) + 1);
5417
+ }
5386
5418
  }
5387
5419
  const opList = await page.getOperatorList();
5388
5420
  const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
@@ -5399,14 +5431,36 @@ async function parsePdfDocument(buffer, options) {
5399
5431
  if (pageErr instanceof KordocError) throw pageErr;
5400
5432
  warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
5401
5433
  }
5434
+ };
5435
+ const SAMPLE_SIZE = Math.min(10, targetPageNums.length);
5436
+ const sampledIndices = /* @__PURE__ */ new Set();
5437
+ if (targetPageNums.length <= SAMPLE_SIZE) {
5438
+ for (let i = 0; i < targetPageNums.length; i++) sampledIndices.add(i);
5439
+ } else {
5440
+ for (let i = 0; i < SAMPLE_SIZE; i++) {
5441
+ const idx = Math.round(i * (targetPageNums.length - 1) / (SAMPLE_SIZE - 1));
5442
+ sampledIndices.add(idx);
5443
+ }
5444
+ }
5445
+ for (const si of sampledIndices) {
5446
+ await parseSinglePage(targetPageNums[si]);
5447
+ }
5448
+ const sampleParsed = parsedPages || sampledIndices.size;
5449
+ const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
5450
+ if (!isImageBased) {
5451
+ for (let si = 0; si < targetPageNums.length; si++) {
5452
+ if (!sampledIndices.has(si)) {
5453
+ await parseSinglePage(targetPageNums[si]);
5454
+ }
5455
+ }
5402
5456
  }
5403
5457
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
5404
- if (totalChars / Math.max(parsedPageCount, 1) < 10) {
5458
+ if (isImageBased) {
5405
5459
  let ocrProvider = options?.ocr ?? null;
5406
- const ocrMode = options?.ocrMode;
5407
- if (!ocrProvider && ocrMode && ocrMode !== "off") {
5460
+ const ocrMode = options?.ocrMode ?? "auto";
5461
+ if (!ocrProvider && ocrMode !== "off") {
5408
5462
  try {
5409
- const { resolveOcrProvider } = await import("./resolve-4FSAQF2S.js");
5463
+ const { resolveOcrProvider } = await import("./resolve-UOAOPQ4H.js");
5410
5464
  const concurrency = options?.ocrConcurrency ?? 1;
5411
5465
  const batchSize = options?.ocrBatchSize;
5412
5466
  ocrProvider = await resolveOcrProvider(ocrMode, warnings, concurrency, batchSize);
@@ -5422,7 +5476,7 @@ async function parsePdfDocument(buffer, options) {
5422
5476
  if (ocrProvider) {
5423
5477
  let ocrBlocks = [];
5424
5478
  try {
5425
- const { ocrPages } = await import("./provider-WYHC4NHI.js");
5479
+ const { ocrPages } = await import("./provider-HE727F7Z.js");
5426
5480
  const concurrency = options?.ocrConcurrency ?? 1;
5427
5481
  ocrBlocks = await ocrPages(doc, ocrProvider, pageFilter, effectivePageCount, warnings, concurrency, options?.onProgress);
5428
5482
  } catch {
@@ -5456,7 +5510,7 @@ async function parsePdfDocument(buffer, options) {
5456
5510
  blocks.splice(removed[ri], 1);
5457
5511
  }
5458
5512
  }
5459
- const medianFontSize = computeMedianFontSize(allFontSizes);
5513
+ const medianFontSize = computeMedianFromFreq(fontSizeFreq);
5460
5514
  if (medianFontSize > 0) {
5461
5515
  detectHeadings(blocks, medianFontSize);
5462
5516
  }
@@ -5520,11 +5574,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
5520
5574
  }
5521
5575
  return { visible, hiddenCount };
5522
5576
  }
5523
- function computeMedianFontSize(sizes) {
5524
- if (sizes.length === 0) return 0;
5525
- const sorted = [...sizes].sort((a, b) => a - b);
5526
- const mid = Math.floor(sorted.length / 2);
5527
- return sorted.length % 2 === 0 ? (sorted[mid - 1] + sorted[mid]) / 2 : sorted[mid];
5577
+ function computeMedianFromFreq(freq) {
5578
+ if (freq.size === 0) return 0;
5579
+ const entries = [...freq.entries()].sort((a, b) => a[0] - b[0]);
5580
+ let total = 0;
5581
+ for (const [, count] of entries) total += count;
5582
+ const mid = total / 2;
5583
+ let cumulative = 0;
5584
+ for (const [size, count] of entries) {
5585
+ cumulative += count;
5586
+ if (cumulative >= mid) return size;
5587
+ }
5588
+ return 0;
5528
5589
  }
5529
5590
  function detectHeadings(blocks, medianFontSize) {
5530
5591
  for (const block of blocks) {
@@ -6330,6 +6391,7 @@ var MAX_SHEETS = 100;
6330
6391
  var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
6331
6392
  var MAX_ROWS2 = 1e4;
6332
6393
  var MAX_COLS2 = 200;
6394
+ var MAX_TOTAL_CELLS = 2e6;
6333
6395
  function cleanNumericValue(raw) {
6334
6396
  if (!/^-?\d+\.\d+$/.test(raw)) return raw;
6335
6397
  const num = parseFloat(raw);
@@ -6513,9 +6575,9 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
6513
6575
  }
6514
6576
  return blocks;
6515
6577
  }
6516
- async function parseXlsxDocument(buffer, options) {
6578
+ async function parseXlsxDocument(buffer, options, existingZip) {
6517
6579
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
6518
- const zip = await JSZip3.loadAsync(buffer);
6580
+ const zip = existingZip ?? await JSZip3.loadAsync(buffer);
6519
6581
  const warnings = [];
6520
6582
  const workbookFile = zip.file("xl/workbook.xml");
6521
6583
  if (!workbookFile) {
@@ -6542,6 +6604,7 @@ async function parseXlsxDocument(buffer, options) {
6542
6604
  }
6543
6605
  const blocks = [];
6544
6606
  const processedSheets = Math.min(sheets.length, MAX_SHEETS);
6607
+ let totalCells = 0;
6545
6608
  for (let i = 0; i < processedSheets; i++) {
6546
6609
  if (pageFilter && !pageFilter.has(i + 1)) continue;
6547
6610
  const sheet = sheets[i];
@@ -6568,6 +6631,11 @@ async function parseXlsxDocument(buffer, options) {
6568
6631
  try {
6569
6632
  const sheetXml = await sheetFile.async("text");
6570
6633
  const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
6634
+ totalCells += maxRow * maxCol;
6635
+ if (totalCells > MAX_TOTAL_CELLS) {
6636
+ warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
6637
+ break;
6638
+ }
6571
6639
  const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
6572
6640
  blocks.push(...sheetBlocks);
6573
6641
  } catch (err) {
@@ -6651,10 +6719,35 @@ function getAttr(el, localName) {
6651
6719
  function parseXml2(text) {
6652
6720
  return new DOMParser3().parseFromString(text, "text/xml");
6653
6721
  }
6722
+ function buildElementIndex(root) {
6723
+ const index = /* @__PURE__ */ new Map();
6724
+ const walk = (node) => {
6725
+ const children = node.childNodes;
6726
+ for (let i = 0; i < children.length; i++) {
6727
+ const child = children[i];
6728
+ if (child.nodeType === 1) {
6729
+ const el = child;
6730
+ const name = el.localName ?? "";
6731
+ if (name) {
6732
+ let list = index.get(name);
6733
+ if (!list) {
6734
+ list = [];
6735
+ index.set(name, list);
6736
+ }
6737
+ list.push(el);
6738
+ }
6739
+ walk(el);
6740
+ }
6741
+ }
6742
+ };
6743
+ walk(root);
6744
+ return index;
6745
+ }
6654
6746
  function parseStyles(xml) {
6655
6747
  const doc = parseXml2(xml);
6656
6748
  const styles = /* @__PURE__ */ new Map();
6657
- const styleElements = findElements(doc, "style");
6749
+ const idx = buildElementIndex(doc);
6750
+ const styleElements = idx.get("style") ?? [];
6658
6751
  for (const el of styleElements) {
6659
6752
  const styleId = getAttr(el, "styleId");
6660
6753
  if (!styleId) continue;
@@ -6682,7 +6775,8 @@ function parseStyles(xml) {
6682
6775
  function parseNumbering(xml) {
6683
6776
  const doc = parseXml2(xml);
6684
6777
  const abstractNums = /* @__PURE__ */ new Map();
6685
- const abstractElements = findElements(doc, "abstractNum");
6778
+ const idx = buildElementIndex(doc);
6779
+ const abstractElements = idx.get("abstractNum") ?? [];
6686
6780
  for (const el of abstractElements) {
6687
6781
  const abstractNumId = getAttr(el, "abstractNumId");
6688
6782
  if (!abstractNumId) continue;
@@ -6697,7 +6791,7 @@ function parseNumbering(xml) {
6697
6791
  abstractNums.set(abstractNumId, levels);
6698
6792
  }
6699
6793
  const nums = /* @__PURE__ */ new Map();
6700
- const numElements = findElements(doc, "num");
6794
+ const numElements = idx.get("num") ?? [];
6701
6795
  for (const el of numElements) {
6702
6796
  const numId = getAttr(el, "numId");
6703
6797
  if (!numId) continue;
@@ -6941,9 +7035,9 @@ async function extractImages(zip, rels, doc) {
6941
7035
  }
6942
7036
  return { blocks, images };
6943
7037
  }
6944
- async function parseDocxDocument(buffer, options) {
7038
+ async function parseDocxDocument(buffer, options, existingZip) {
6945
7039
  precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
6946
- const zip = await JSZip4.loadAsync(buffer);
7040
+ const zip = existingZip ?? await JSZip4.loadAsync(buffer);
6947
7041
  const warnings = [];
6948
7042
  const docFile = zip.file("word/document.xml");
6949
7043
  if (!docFile) {
@@ -9378,25 +9472,86 @@ async function parse2(input, options) {
9378
9472
  if (!buffer || buffer.byteLength === 0) {
9379
9473
  return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
9380
9474
  }
9475
+ const MAX_FILE_SIZE = 500 * 1024 * 1024;
9476
+ if (buffer.byteLength > MAX_FILE_SIZE) {
9477
+ return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
9478
+ }
9381
9479
  const format = detectFormat(buffer);
9382
9480
  switch (format) {
9383
9481
  case "hwpx": {
9384
- const zipFormat = await detectZipFormat(buffer);
9385
- if (zipFormat === "xlsx") return parseXlsx(buffer, options);
9386
- if (zipFormat === "docx") return parseDocx(buffer, options);
9387
- return parseHwpx(buffer, options);
9482
+ const { format: zipFormat, zip } = await detectZipFormat(buffer);
9483
+ if (zipFormat === "xlsx") return parseXlsx(buffer, options, zip ?? void 0);
9484
+ if (zipFormat === "docx") return parseDocx(buffer, options, zip ?? void 0);
9485
+ return parseHwpx(buffer, options, zip ?? void 0);
9388
9486
  }
9389
9487
  case "hwp":
9390
9488
  return parseHwp(buffer, options);
9391
9489
  case "pdf":
9392
9490
  return parsePdf(buffer, options);
9491
+ case "image":
9492
+ return parseImage(buffer, options);
9393
9493
  default:
9394
9494
  return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
9395
9495
  }
9396
9496
  }
9397
- async function parseHwpx(buffer, options) {
9497
+ async function parseImage(buffer, options) {
9498
+ const ocrMode = options?.ocrMode || "auto";
9499
+ if (ocrMode === "off") {
9500
+ return { success: false, fileType: "image", error: "OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC774\uBBF8\uC9C0 \uD30C\uC77C\uC744 \uCC98\uB9AC\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "IMAGE_BASED_PDF" };
9501
+ }
9502
+ let ocrProvider;
9503
+ let actualOcrMode = "auto";
9504
+ try {
9505
+ if (ocrMode === "gemini" || ocrMode === "claude" || ocrMode === "codex" || ocrMode === "ollama") {
9506
+ ocrProvider = createCliOcrProvider(ocrMode);
9507
+ actualOcrMode = ocrMode;
9508
+ } else if (ocrMode === "tesseract") {
9509
+ ocrProvider = await createTesseractProvider();
9510
+ actualOcrMode = ocrMode;
9511
+ } else if (ocrMode === "auto") {
9512
+ const modesToTry = ["gemini", "claude", "codex", "ollama"];
9513
+ for (const mode of modesToTry) {
9514
+ try {
9515
+ ocrProvider = createCliOcrProvider(mode);
9516
+ actualOcrMode = mode;
9517
+ break;
9518
+ } catch (e) {
9519
+ console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
9520
+ }
9521
+ }
9522
+ if (!ocrProvider) {
9523
+ ocrProvider = await createTesseractProvider();
9524
+ actualOcrMode = "tesseract";
9525
+ }
9526
+ }
9527
+ if (!ocrProvider) {
9528
+ return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
9529
+ }
9530
+ const imageUint8Array = new Uint8Array(buffer);
9531
+ const ocrResult = await ocrProvider(imageUint8Array, 1, "image/png");
9532
+ if (ocrProvider.terminate) {
9533
+ await ocrProvider.terminate();
9534
+ }
9535
+ const markdown = typeof ocrResult === "string" ? ocrResult : ocrResult.markdown;
9536
+ const blocks = markdownToBlocks(markdown, 1);
9537
+ return {
9538
+ success: true,
9539
+ fileType: "image",
9540
+ markdown,
9541
+ blocks,
9542
+ isImageBased: true,
9543
+ warnings: [{ message: `OCR \uCC98\uB9AC\uB428 (${actualOcrMode})`, code: "OCR_FALLBACK" }]
9544
+ };
9545
+ } catch (err) {
9546
+ if (ocrProvider && ocrProvider.terminate) {
9547
+ await ocrProvider.terminate();
9548
+ }
9549
+ return { success: false, fileType: "image", error: err instanceof Error ? err.message : "\uC774\uBBF8\uC9C0 OCR \uC2E4\uD328", code: classifyError(err) };
9550
+ }
9551
+ }
9552
+ async function parseHwpx(buffer, options, zip) {
9398
9553
  try {
9399
- const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
9554
+ const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
9400
9555
  return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
9401
9556
  } catch (err) {
9402
9557
  return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
@@ -9419,17 +9574,17 @@ async function parsePdf(buffer, options) {
9419
9574
  return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
9420
9575
  }
9421
9576
  }
9422
- async function parseXlsx(buffer, options) {
9577
+ async function parseXlsx(buffer, options, zip) {
9423
9578
  try {
9424
- const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options);
9579
+ const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
9425
9580
  return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
9426
9581
  } catch (err) {
9427
9582
  return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
9428
9583
  }
9429
9584
  }
9430
- async function parseDocx(buffer, options) {
9585
+ async function parseDocx(buffer, options, zip) {
9431
9586
  try {
9432
- const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
9587
+ const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
9433
9588
  return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
9434
9589
  } catch (err) {
9435
9590
  return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
@@ -9624,4 +9779,4 @@ export {
9624
9779
  cfb/cfb.js:
9625
9780
  (*! crc32.js (C) 2014-present SheetJS -- http://sheetjs.com *)
9626
9781
  */
9627
- //# sourceMappingURL=chunk-ZOEUKD77.js.map
9782
+ //# sourceMappingURL=chunk-UDFKY7CH.js.map