@clazic/kordoc 2.3.2 → 2.3.3

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects the differences between those package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -2425,7 +2425,7 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2425
2425
  return createCliOcrProvider(mode);
2426
2426
  }
2427
2427
  const detected = detectAvailableOcr();
2428
- if (detected !== "gemini") {
2428
+ if (detected !== "codex") {
2429
2429
  if (detected === "tesseract") {
2430
2430
  warnings?.push({
2431
2431
  message: getTesseractFallbackMessage(),
@@ -2433,7 +2433,7 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2433
2433
  });
2434
2434
  } else {
2435
2435
  warnings?.push({
2436
- message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (gemini CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 gemini CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
2436
+ message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (codex CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 codex CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
2437
2437
  code: "OCR_CLI_FALLBACK"
2438
2438
  });
2439
2439
  }
@@ -2790,7 +2790,7 @@ import JSZip2 from "jszip";
2790
2790
  import { DOMParser } from "@xmldom/xmldom";
2791
2791
 
2792
2792
  // src/utils.ts
2793
- var VERSION = true ? "2.3.2" : "0.0.0-dev";
2793
+ var VERSION = true ? "2.3.3" : "0.0.0-dev";
2794
2794
  function toArrayBuffer(buf) {
2795
2795
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2796
2796
  return buf.buffer;
@@ -6318,15 +6318,26 @@ async function parsePdfDocument(buffer, options) {
6318
6318
  warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
6319
6319
  }
6320
6320
  };
6321
- const sampleCount = Math.min(5, targetPageNums.length);
6322
- for (let si = 0; si < sampleCount; si++) {
6321
+ const SAMPLE_SIZE = Math.min(10, targetPageNums.length);
6322
+ const sampledIndices = /* @__PURE__ */ new Set();
6323
+ if (targetPageNums.length <= SAMPLE_SIZE) {
6324
+ for (let i = 0; i < targetPageNums.length; i++) sampledIndices.add(i);
6325
+ } else {
6326
+ for (let i = 0; i < SAMPLE_SIZE; i++) {
6327
+ const idx = Math.round(i * (targetPageNums.length - 1) / (SAMPLE_SIZE - 1));
6328
+ sampledIndices.add(idx);
6329
+ }
6330
+ }
6331
+ for (const si of sampledIndices) {
6323
6332
  await parseSinglePage(targetPageNums[si]);
6324
6333
  }
6325
- const sampleParsed = parsedPages || sampleCount;
6334
+ const sampleParsed = parsedPages || sampledIndices.size;
6326
6335
  const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
6327
6336
  if (!isImageBased) {
6328
- for (let si = sampleCount; si < targetPageNums.length; si++) {
6329
- await parseSinglePage(targetPageNums[si]);
6337
+ for (let si = 0; si < targetPageNums.length; si++) {
6338
+ if (!sampledIndices.has(si)) {
6339
+ await parseSinglePage(targetPageNums[si]);
6340
+ }
6330
6341
  }
6331
6342
  }
6332
6343
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);