@clazic/kordoc 2.3.1 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{batch-provider-PCT4I4LK.js → batch-provider-PNDCSGQW.js} +3 -4
- package/dist/batch-provider-PNDCSGQW.js.map +1 -0
- package/dist/chunk-4PP34NVQ.js +121 -0
- package/dist/chunk-4PP34NVQ.js.map +1 -0
- package/dist/{tesseract-provider-WCVJWBUT.js → chunk-7FMKAV4P.js} +4 -4
- package/dist/{tesseract-provider-WCVJWBUT.js.map → chunk-7FMKAV4P.js.map} +1 -1
- package/dist/chunk-JOGAFNIL.js +153 -0
- package/dist/chunk-JOGAFNIL.js.map +1 -0
- package/dist/{chunk-W5KUC23B.js → chunk-NU3KFVVZ.js} +2 -2
- package/dist/{chunk-ZOEUKD77.js → chunk-UDFKY7CH.js} +204 -49
- package/dist/chunk-UDFKY7CH.js.map +1 -0
- package/dist/cli.js +8 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +230 -72
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +11 -6
- package/dist/index.d.ts +11 -6
- package/dist/index.js +230 -72
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -2
- package/dist/mcp.js.map +1 -1
- package/dist/{provider-WYHC4NHI.js → provider-HE727F7Z.js} +19 -131
- package/dist/provider-HE727F7Z.js.map +1 -0
- package/dist/resolve-UOAOPQ4H.js +111 -0
- package/dist/resolve-UOAOPQ4H.js.map +1 -0
- package/dist/tesseract-provider-MNMZPSGF.js +11 -0
- package/dist/{utils-HSF5HI5T.js → utils-STJT6CFC.js} +2 -2
- package/dist/utils-STJT6CFC.js.map +1 -0
- package/dist/{watch-R2JHXDGF.js → watch-PRQGLOW3.js} +6 -3
- package/dist/{watch-R2JHXDGF.js.map → watch-PRQGLOW3.js.map} +1 -1
- package/package.json +8 -8
- package/dist/batch-provider-PCT4I4LK.js.map +0 -1
- package/dist/chunk-ZOEUKD77.js.map +0 -1
- package/dist/provider-WYHC4NHI.js.map +0 -1
- package/dist/resolve-4FSAQF2S.js +0 -247
- package/dist/resolve-4FSAQF2S.js.map +0 -1
- /package/dist/{chunk-W5KUC23B.js.map → chunk-NU3KFVVZ.js.map} +0 -0
- /package/dist/{utils-HSF5HI5T.js.map → tesseract-provider-MNMZPSGF.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -1993,8 +1993,8 @@ function getTesseractFallbackMessage() {
|
|
|
1993
1993
|
"\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 \uB0B4\uC7A5 tesseract.js\uB85C OCR\uC744 \uC218\uD589\uD569\uB2C8\uB2E4.",
|
|
1994
1994
|
"\uB354 \uB098\uC740 \uD488\uC9C8(\uD14C\uC774\uBE14/\uD5E4\uB529 \uAD6C\uC870 \uBCF4\uC874)\uC744 \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
|
|
1995
1995
|
"",
|
|
1996
|
-
" [\uAD8C\uC7A5]
|
|
1997
|
-
"
|
|
1996
|
+
" [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
|
|
1997
|
+
" Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
|
|
1998
1998
|
" Claude CLI: npm install -g @anthropic-ai/claude-code",
|
|
1999
1999
|
" Ollama: brew install ollama (+ ollama pull gemma4:27b)"
|
|
2000
2000
|
].join("\n");
|
|
@@ -2004,7 +2004,7 @@ var init_auto_detect = __esm({
|
|
|
2004
2004
|
"src/ocr/auto-detect.ts"() {
|
|
2005
2005
|
"use strict";
|
|
2006
2006
|
import_child_process = require("child_process");
|
|
2007
|
-
CLI_PRIORITY = ["
|
|
2007
|
+
CLI_PRIORITY = ["codex", "gemini", "claude", "ollama"];
|
|
2008
2008
|
}
|
|
2009
2009
|
});
|
|
2010
2010
|
|
|
@@ -2043,7 +2043,7 @@ function callCli(mode, imagePath) {
|
|
|
2043
2043
|
const args = buildCliArgs(mode, imagePath);
|
|
2044
2044
|
const result = (0, import_child_process2.spawnSync)(mode, args, {
|
|
2045
2045
|
encoding: "utf-8",
|
|
2046
|
-
timeout:
|
|
2046
|
+
timeout: 6e5,
|
|
2047
2047
|
maxBuffer: 10 * 1024 * 1024,
|
|
2048
2048
|
// claude: /tmp에서 실행하여 프로젝트 CLAUDE.md의 규칙 간섭 방지
|
|
2049
2049
|
...mode === "claude" ? { cwd: (0, import_os.tmpdir)() } : {}
|
|
@@ -2137,7 +2137,7 @@ async function callOllamaApi(imagePath) {
|
|
|
2137
2137
|
return data.message?.content || "";
|
|
2138
2138
|
}
|
|
2139
2139
|
function stripCodeFence(text) {
|
|
2140
|
-
const match = text.match(/^```(?:markdown|md)?\s
|
|
2140
|
+
const match = text.match(/^```(?:markdown|md)?\s*([\s\S]*?)```\s*$/m);
|
|
2141
2141
|
return match ? match[1].trim() : text;
|
|
2142
2142
|
}
|
|
2143
2143
|
var import_child_process2, import_fs, import_path, import_os, OCR_PROMPT, _tempDir;
|
|
@@ -2148,7 +2148,15 @@ var init_cli_provider = __esm({
|
|
|
2148
2148
|
import_fs = require("fs");
|
|
2149
2149
|
import_path = require("path");
|
|
2150
2150
|
import_os = require("os");
|
|
2151
|
-
OCR_PROMPT =
|
|
2151
|
+
OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.
|
|
2152
|
+
\uADDC\uCE59:
|
|
2153
|
+
- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)
|
|
2154
|
+
- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC
|
|
2155
|
+
- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9
|
|
2156
|
+
- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9
|
|
2157
|
+
- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC
|
|
2158
|
+
- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0
|
|
2159
|
+
- \`\`\`\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825`;
|
|
2152
2160
|
_tempDir = null;
|
|
2153
2161
|
}
|
|
2154
2162
|
});
|
|
@@ -2314,9 +2322,8 @@ async function callBatchCli(mode, imagePaths) {
|
|
|
2314
2322
|
${fileRefs}`;
|
|
2315
2323
|
let args;
|
|
2316
2324
|
if (mode === "gemini") {
|
|
2317
|
-
|
|
2318
|
-
|
|
2319
|
-
if (model) args.push("--model", model);
|
|
2325
|
+
const model = process.env.KORDOC_GEMINI_MODEL ?? "gemini-2.5-flash";
|
|
2326
|
+
args = ["--prompt", prompt, "--yolo", "--model", model];
|
|
2320
2327
|
} else {
|
|
2321
2328
|
args = ["--print", prompt];
|
|
2322
2329
|
const model = process.env.KORDOC_CLAUDE_MODEL;
|
|
@@ -2415,7 +2422,7 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
|
2415
2422
|
return createCliOcrProvider(mode);
|
|
2416
2423
|
}
|
|
2417
2424
|
const detected = detectAvailableOcr();
|
|
2418
|
-
if (detected !== "
|
|
2425
|
+
if (detected !== "codex") {
|
|
2419
2426
|
if (detected === "tesseract") {
|
|
2420
2427
|
warnings?.push({
|
|
2421
2428
|
message: getTesseractFallbackMessage(),
|
|
@@ -2423,7 +2430,7 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
|
2423
2430
|
});
|
|
2424
2431
|
} else {
|
|
2425
2432
|
warnings?.push({
|
|
2426
|
-
message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (
|
|
2433
|
+
message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (codex CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 codex CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
|
|
2427
2434
|
code: "OCR_CLI_FALLBACK"
|
|
2428
2435
|
});
|
|
2429
2436
|
}
|
|
@@ -2664,22 +2671,22 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
|
|
|
2664
2671
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
2665
2672
|
pageNumbers.push(i);
|
|
2666
2673
|
}
|
|
2667
|
-
const
|
|
2668
|
-
for (
|
|
2669
|
-
|
|
2670
|
-
const image = await renderPageToPng(page);
|
|
2671
|
-
pageImages.push({ image, pageNum });
|
|
2672
|
-
}
|
|
2673
|
-
const batches = [];
|
|
2674
|
-
for (let i = 0; i < pageImages.length; i += provider.batchSize) {
|
|
2675
|
-
batches.push(pageImages.slice(i, i + provider.batchSize));
|
|
2674
|
+
const pageBatches = [];
|
|
2675
|
+
for (let i = 0; i < pageNumbers.length; i += provider.batchSize) {
|
|
2676
|
+
pageBatches.push(pageNumbers.slice(i, i + provider.batchSize));
|
|
2676
2677
|
}
|
|
2677
2678
|
let processed = 0;
|
|
2678
|
-
const batchTasks =
|
|
2679
|
+
const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
|
|
2679
2680
|
const pageBlocks = [];
|
|
2680
2681
|
try {
|
|
2681
|
-
const
|
|
2682
|
-
for (const
|
|
2682
|
+
const batchImages = [];
|
|
2683
|
+
for (const pageNum of batchPageNums) {
|
|
2684
|
+
const page = await doc.getPage(pageNum);
|
|
2685
|
+
const image = await renderPageToPng(page);
|
|
2686
|
+
batchImages.push({ image, pageNum });
|
|
2687
|
+
}
|
|
2688
|
+
const results = await provider.processBatch(batchImages);
|
|
2689
|
+
for (const { pageNum } of batchImages) {
|
|
2683
2690
|
const result = results.get(pageNum);
|
|
2684
2691
|
pageBlocks.push({
|
|
2685
2692
|
pageNum,
|
|
@@ -2687,16 +2694,16 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
|
|
|
2687
2694
|
});
|
|
2688
2695
|
}
|
|
2689
2696
|
} catch (err) {
|
|
2690
|
-
const range = `${
|
|
2697
|
+
const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
|
|
2691
2698
|
warnings?.push({
|
|
2692
2699
|
message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
|
|
2693
2700
|
code: "OCR_PAGE_FAILED"
|
|
2694
2701
|
});
|
|
2695
|
-
for (const
|
|
2702
|
+
for (const pageNum of batchPageNums) {
|
|
2696
2703
|
pageBlocks.push({ pageNum, blocks: [] });
|
|
2697
2704
|
}
|
|
2698
2705
|
}
|
|
2699
|
-
processed +=
|
|
2706
|
+
processed += batchPageNums.length;
|
|
2700
2707
|
onProgress?.(processed, pageNumbers.length);
|
|
2701
2708
|
return { batchIdx, pageBlocks };
|
|
2702
2709
|
});
|
|
@@ -2772,24 +2779,29 @@ function isPdfFile(buffer) {
|
|
|
2772
2779
|
const b = magicBytes(buffer);
|
|
2773
2780
|
return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
|
|
2774
2781
|
}
|
|
2782
|
+
/**
 * Reports whether the buffer begins with the PNG magic prefix
 * (0x89 followed by ASCII "PNG").
 *
 * Only the first four bytes are compared, mirroring the other magic-byte
 * checks in this module.
 *
 * @param buffer - Raw file contents, handed to `magicBytes` unchanged.
 * @returns {boolean} True when the four leading bytes match.
 */
function isPngFile(buffer) {
  const PNG_PREFIX = [137, 80, 78, 71];
  const head = magicBytes(buffer);
  return PNG_PREFIX.every((expected, offset) => head[offset] === expected);
}
|
|
2775
2786
|
function detectFormat(buffer) {
|
|
2776
2787
|
if (buffer.byteLength < 4) return "unknown";
|
|
2777
2788
|
if (isZipFile(buffer)) return "hwpx";
|
|
2778
2789
|
if (isOldHwpFile(buffer)) return "hwp";
|
|
2779
2790
|
if (isPdfFile(buffer)) return "pdf";
|
|
2791
|
+
if (isPngFile(buffer)) return "image";
|
|
2780
2792
|
return "unknown";
|
|
2781
2793
|
}
|
|
2782
2794
|
async function detectZipFormat(buffer) {
|
|
2783
2795
|
try {
|
|
2784
2796
|
const zip = await import_jszip.default.loadAsync(buffer);
|
|
2785
|
-
if (zip.file("xl/workbook.xml")) return "xlsx";
|
|
2786
|
-
if (zip.file("word/document.xml")) return "docx";
|
|
2787
|
-
if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
|
|
2797
|
+
if (zip.file("xl/workbook.xml")) return { format: "xlsx", zip };
|
|
2798
|
+
if (zip.file("word/document.xml")) return { format: "docx", zip };
|
|
2799
|
+
if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return { format: "hwpx", zip };
|
|
2788
2800
|
const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
|
|
2789
|
-
if (hasSection) return "hwpx";
|
|
2790
|
-
return "unknown";
|
|
2801
|
+
if (hasSection) return { format: "hwpx", zip };
|
|
2802
|
+
return { format: "unknown", zip: null };
|
|
2791
2803
|
} catch {
|
|
2792
|
-
return "unknown";
|
|
2804
|
+
return { format: "unknown", zip: null };
|
|
2793
2805
|
}
|
|
2794
2806
|
}
|
|
2795
2807
|
|
|
@@ -2798,7 +2810,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
|
|
|
2798
2810
|
var import_xmldom = require("@xmldom/xmldom");
|
|
2799
2811
|
|
|
2800
2812
|
// src/utils.ts
|
|
2801
|
-
var VERSION = true ? "2.3.
|
|
2813
|
+
var VERSION = true ? "2.3.3" : "0.0.0-dev";
|
|
2802
2814
|
function toArrayBuffer(buf) {
|
|
2803
2815
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2804
2816
|
return buf.buffer;
|
|
@@ -2958,12 +2970,16 @@ function buildTableDirect(rows, numRows) {
|
|
|
2958
2970
|
return trimAndReturn(grid, numRows, maxCols);
|
|
2959
2971
|
}
|
|
2960
2972
|
function trimAndReturn(grid, numRows, maxCols) {
|
|
2961
|
-
let effectiveCols =
|
|
2962
|
-
|
|
2963
|
-
|
|
2964
|
-
|
|
2965
|
-
|
|
2973
|
+
let effectiveCols = 0;
|
|
2974
|
+
for (const row of grid) {
|
|
2975
|
+
for (let c = row.length - 1; c >= effectiveCols; c--) {
|
|
2976
|
+
if (row[c]?.text?.trim()) {
|
|
2977
|
+
effectiveCols = c + 1;
|
|
2978
|
+
break;
|
|
2979
|
+
}
|
|
2980
|
+
}
|
|
2966
2981
|
}
|
|
2982
|
+
if (effectiveCols === 0) effectiveCols = maxCols;
|
|
2967
2983
|
if (effectiveCols < maxCols && effectiveCols > 0) {
|
|
2968
2984
|
const trimmed = grid.map((row) => row.slice(0, effectiveCols));
|
|
2969
2985
|
return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
|
|
@@ -3220,11 +3236,11 @@ function parseStyleElements(doc, map) {
|
|
|
3220
3236
|
function stripDtd(xml) {
|
|
3221
3237
|
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
3222
3238
|
}
|
|
3223
|
-
async function parseHwpxDocument(buffer, options) {
|
|
3239
|
+
async function parseHwpxDocument(buffer, options, existingZip) {
|
|
3224
3240
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
3225
3241
|
let zip;
|
|
3226
3242
|
try {
|
|
3227
|
-
zip = await import_jszip2.default.loadAsync(buffer);
|
|
3243
|
+
zip = existingZip ?? await import_jszip2.default.loadAsync(buffer);
|
|
3228
3244
|
} catch {
|
|
3229
3245
|
return await extractFromBrokenZip(buffer);
|
|
3230
3246
|
}
|
|
@@ -6236,8 +6252,15 @@ var import_pdf2 = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
|
6236
6252
|
import_pdf2.GlobalWorkerOptions.workerSrc = "";
|
|
6237
6253
|
var MAX_PAGES = 5e3;
|
|
6238
6254
|
var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
|
|
6239
|
-
|
|
6255
|
+
/**
 * Computes the PDF loading timeout from the size of the input.
 *
 * The timeout starts at 30 s, grows by 500 ms per MiB of input and is capped
 * at 300 s.
 *
 * A size that is NaN, not numeric, zero or negative yields the 30 s base.
 * Without that guard a NaN size would produce a NaN delay, which timers treat
 * as a near-zero delay, so the load would be reported as timed out at once.
 *
 * @param {number} bufferSize - Input size in bytes.
 * @returns {number} Timeout in milliseconds, between 30000 and 300000.
 */
function calcPdfTimeout(bufferSize) {
  const BASE_MS = 3e4;
  const PER_MB_MS = 500;
  const MAX_MS = 3e5;
  const BYTES_PER_MB = 1024 * 1024;

  const size = Number(bufferSize);
  if (Number.isNaN(size) || size <= 0) return BASE_MS;

  const mb = size / BYTES_PER_MB;
  return Math.min(BASE_MS + Math.ceil(mb * PER_MB_MS), MAX_MS);
}
|
|
6240
6261
|
async function loadPdfWithTimeout(buffer) {
|
|
6262
|
+
const timeoutMs = calcPdfTimeout(buffer.byteLength);
|
|
6263
|
+
const timeoutSec = Math.round(timeoutMs / 1e3);
|
|
6241
6264
|
const loadingTask = (0, import_pdf2.getDocument)({
|
|
6242
6265
|
data: new Uint8Array(buffer),
|
|
6243
6266
|
useSystemFonts: true,
|
|
@@ -6251,8 +6274,8 @@ async function loadPdfWithTimeout(buffer) {
|
|
|
6251
6274
|
new Promise((_, reject) => {
|
|
6252
6275
|
timer = setTimeout(() => {
|
|
6253
6276
|
loadingTask.destroy();
|
|
6254
|
-
reject(new KordocError(
|
|
6255
|
-
},
|
|
6277
|
+
reject(new KordocError(`PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (${timeoutSec}\uCD08 \uCD08\uACFC)`));
|
|
6278
|
+
}, timeoutMs);
|
|
6256
6279
|
})
|
|
6257
6280
|
]);
|
|
6258
6281
|
} finally {
|
|
@@ -6273,11 +6296,15 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6273
6296
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
6274
6297
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
6275
6298
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
6276
|
-
const
|
|
6299
|
+
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
6277
6300
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
6278
|
-
|
|
6301
|
+
const targetPageNums = [];
|
|
6279
6302
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
6280
6303
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
6304
|
+
targetPageNums.push(i);
|
|
6305
|
+
}
|
|
6306
|
+
let parsedPages = 0;
|
|
6307
|
+
const parseSinglePage = async (i) => {
|
|
6281
6308
|
try {
|
|
6282
6309
|
const page = await doc.getPage(i);
|
|
6283
6310
|
const tc = await page.getTextContent();
|
|
@@ -6290,7 +6317,10 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6290
6317
|
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
6291
6318
|
}
|
|
6292
6319
|
for (const item of visible) {
|
|
6293
|
-
if (item.fontSize > 0)
|
|
6320
|
+
if (item.fontSize > 0) {
|
|
6321
|
+
const rounded = Math.round(item.fontSize * 10) / 10;
|
|
6322
|
+
fontSizeFreq.set(rounded, (fontSizeFreq.get(rounded) || 0) + 1);
|
|
6323
|
+
}
|
|
6294
6324
|
}
|
|
6295
6325
|
const opList = await page.getOperatorList();
|
|
6296
6326
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
@@ -6307,12 +6337,34 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6307
6337
|
if (pageErr instanceof KordocError) throw pageErr;
|
|
6308
6338
|
warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
6309
6339
|
}
|
|
6340
|
+
};
|
|
6341
|
+
const SAMPLE_SIZE = Math.min(10, targetPageNums.length);
|
|
6342
|
+
const sampledIndices = /* @__PURE__ */ new Set();
|
|
6343
|
+
if (targetPageNums.length <= SAMPLE_SIZE) {
|
|
6344
|
+
for (let i = 0; i < targetPageNums.length; i++) sampledIndices.add(i);
|
|
6345
|
+
} else {
|
|
6346
|
+
for (let i = 0; i < SAMPLE_SIZE; i++) {
|
|
6347
|
+
const idx = Math.round(i * (targetPageNums.length - 1) / (SAMPLE_SIZE - 1));
|
|
6348
|
+
sampledIndices.add(idx);
|
|
6349
|
+
}
|
|
6350
|
+
}
|
|
6351
|
+
for (const si of sampledIndices) {
|
|
6352
|
+
await parseSinglePage(targetPageNums[si]);
|
|
6353
|
+
}
|
|
6354
|
+
const sampleParsed = parsedPages || sampledIndices.size;
|
|
6355
|
+
const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
|
|
6356
|
+
if (!isImageBased) {
|
|
6357
|
+
for (let si = 0; si < targetPageNums.length; si++) {
|
|
6358
|
+
if (!sampledIndices.has(si)) {
|
|
6359
|
+
await parseSinglePage(targetPageNums[si]);
|
|
6360
|
+
}
|
|
6361
|
+
}
|
|
6310
6362
|
}
|
|
6311
6363
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
6312
|
-
if (
|
|
6364
|
+
if (isImageBased) {
|
|
6313
6365
|
let ocrProvider = options?.ocr ?? null;
|
|
6314
|
-
const ocrMode = options?.ocrMode;
|
|
6315
|
-
if (!ocrProvider && ocrMode
|
|
6366
|
+
const ocrMode = options?.ocrMode ?? "auto";
|
|
6367
|
+
if (!ocrProvider && ocrMode !== "off") {
|
|
6316
6368
|
try {
|
|
6317
6369
|
const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
|
|
6318
6370
|
const concurrency = options?.ocrConcurrency ?? 1;
|
|
@@ -6364,7 +6416,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6364
6416
|
blocks.splice(removed[ri], 1);
|
|
6365
6417
|
}
|
|
6366
6418
|
}
|
|
6367
|
-
const medianFontSize =
|
|
6419
|
+
const medianFontSize = computeMedianFromFreq(fontSizeFreq);
|
|
6368
6420
|
if (medianFontSize > 0) {
|
|
6369
6421
|
detectHeadings(blocks, medianFontSize);
|
|
6370
6422
|
}
|
|
@@ -6417,11 +6469,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
|
|
|
6417
6469
|
}
|
|
6418
6470
|
return { visible, hiddenCount };
|
|
6419
6471
|
}
|
|
6420
|
-
function
|
|
6421
|
-
if (
|
|
6422
|
-
const
|
|
6423
|
-
|
|
6424
|
-
|
|
6472
|
+
/**
 * Finds the median of a multiset described by a value-to-count map.
 *
 * Values are visited in ascending order while their counts are accumulated;
 * the first value whose running count reaches half of the total is returned.
 * For an even total this is the lower of the two middle values.
 *
 * @param {Map<number, number>} freq - Occurrence count per value.
 * @returns {number} The median value, or 0 when the map is empty.
 */
function computeMedianFromFreq(freq) {
  if (freq.size === 0) return 0;

  const ascending = Array.from(freq).sort(([left], [right]) => left - right);
  const halfTotal = ascending.reduce((sum, [, occurrences]) => sum + occurrences, 0) / 2;

  let running = 0;
  for (let i = 0; i < ascending.length; i += 1) {
    running += ascending[i][1];
    if (running >= halfTotal) return ascending[i][0];
  }
  return 0;
}
|
|
6426
6485
|
function detectHeadings(blocks, medianFontSize) {
|
|
6427
6486
|
for (const block of blocks) {
|
|
@@ -7224,6 +7283,7 @@ var MAX_SHEETS = 100;
|
|
|
7224
7283
|
var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
|
|
7225
7284
|
var MAX_ROWS2 = 1e4;
|
|
7226
7285
|
var MAX_COLS2 = 200;
|
|
7286
|
+
var MAX_TOTAL_CELLS = 2e6;
|
|
7227
7287
|
function cleanNumericValue(raw) {
|
|
7228
7288
|
if (!/^-?\d+\.\d+$/.test(raw)) return raw;
|
|
7229
7289
|
const num = parseFloat(raw);
|
|
@@ -7407,9 +7467,9 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
7407
7467
|
}
|
|
7408
7468
|
return blocks;
|
|
7409
7469
|
}
|
|
7410
|
-
async function parseXlsxDocument(buffer, options) {
|
|
7470
|
+
async function parseXlsxDocument(buffer, options, existingZip) {
|
|
7411
7471
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
|
|
7412
|
-
const zip = await import_jszip3.default.loadAsync(buffer);
|
|
7472
|
+
const zip = existingZip ?? await import_jszip3.default.loadAsync(buffer);
|
|
7413
7473
|
const warnings = [];
|
|
7414
7474
|
const workbookFile = zip.file("xl/workbook.xml");
|
|
7415
7475
|
if (!workbookFile) {
|
|
@@ -7436,6 +7496,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
7436
7496
|
}
|
|
7437
7497
|
const blocks = [];
|
|
7438
7498
|
const processedSheets = Math.min(sheets.length, MAX_SHEETS);
|
|
7499
|
+
let totalCells = 0;
|
|
7439
7500
|
for (let i = 0; i < processedSheets; i++) {
|
|
7440
7501
|
if (pageFilter && !pageFilter.has(i + 1)) continue;
|
|
7441
7502
|
const sheet = sheets[i];
|
|
@@ -7462,6 +7523,11 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
7462
7523
|
try {
|
|
7463
7524
|
const sheetXml = await sheetFile.async("text");
|
|
7464
7525
|
const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
|
|
7526
|
+
totalCells += maxRow * maxCol;
|
|
7527
|
+
if (totalCells > MAX_TOTAL_CELLS) {
|
|
7528
|
+
warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
|
|
7529
|
+
break;
|
|
7530
|
+
}
|
|
7465
7531
|
const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
|
|
7466
7532
|
blocks.push(...sheetBlocks);
|
|
7467
7533
|
} catch (err) {
|
|
@@ -7545,10 +7611,35 @@ function getAttr(el, localName) {
|
|
|
7545
7611
|
function parseXml2(text) {
|
|
7546
7612
|
return new import_xmldom3.DOMParser().parseFromString(text, "text/xml");
|
|
7547
7613
|
}
|
|
7614
|
+
/**
 * Indexes every descendant element of `root` by its local name.
 *
 * The tree is traversed once, so callers can look up several element kinds
 * without re-scanning the document. Elements are listed in document
 * (pre-order) order. `root` itself is never indexed, and elements with an
 * empty or missing local name are skipped but still descended into.
 *
 * The traversal uses an explicit stack instead of recursion, so a very deeply
 * nested document cannot exhaust the call stack.
 *
 * @param root - Node whose descendants are indexed (a DOM document or element).
 * @returns {Map<string, Array>} Local name mapped to the matching elements.
 */
function buildElementIndex(root) {
  const index = new Map();
  const pending = [root];

  while (pending.length > 0) {
    const node = pending.pop();

    if (node !== root) {
      const name = node.localName ?? "";
      if (name) {
        const bucket = index.get(name);
        if (bucket) {
          bucket.push(node);
        } else {
          index.set(name, [node]);
        }
      }
    }

    const children = node.childNodes;
    if (!children) continue;
    // Push in reverse so the first child is popped first (document order).
    for (let i = children.length - 1; i >= 0; i--) {
      const child = children[i];
      // nodeType 1 is an element node; text, comments etc. are ignored.
      if (child.nodeType === 1) pending.push(child);
    }
  }

  return index;
}
|
|
7548
7638
|
function parseStyles(xml) {
|
|
7549
7639
|
const doc = parseXml2(xml);
|
|
7550
7640
|
const styles = /* @__PURE__ */ new Map();
|
|
7551
|
-
const
|
|
7641
|
+
const idx = buildElementIndex(doc);
|
|
7642
|
+
const styleElements = idx.get("style") ?? [];
|
|
7552
7643
|
for (const el of styleElements) {
|
|
7553
7644
|
const styleId = getAttr(el, "styleId");
|
|
7554
7645
|
if (!styleId) continue;
|
|
@@ -7576,7 +7667,8 @@ function parseStyles(xml) {
|
|
|
7576
7667
|
function parseNumbering(xml) {
|
|
7577
7668
|
const doc = parseXml2(xml);
|
|
7578
7669
|
const abstractNums = /* @__PURE__ */ new Map();
|
|
7579
|
-
const
|
|
7670
|
+
const idx = buildElementIndex(doc);
|
|
7671
|
+
const abstractElements = idx.get("abstractNum") ?? [];
|
|
7580
7672
|
for (const el of abstractElements) {
|
|
7581
7673
|
const abstractNumId = getAttr(el, "abstractNumId");
|
|
7582
7674
|
if (!abstractNumId) continue;
|
|
@@ -7591,7 +7683,7 @@ function parseNumbering(xml) {
|
|
|
7591
7683
|
abstractNums.set(abstractNumId, levels);
|
|
7592
7684
|
}
|
|
7593
7685
|
const nums = /* @__PURE__ */ new Map();
|
|
7594
|
-
const numElements =
|
|
7686
|
+
const numElements = idx.get("num") ?? [];
|
|
7595
7687
|
for (const el of numElements) {
|
|
7596
7688
|
const numId = getAttr(el, "numId");
|
|
7597
7689
|
if (!numId) continue;
|
|
@@ -7835,9 +7927,9 @@ async function extractImages(zip, rels, doc) {
|
|
|
7835
7927
|
}
|
|
7836
7928
|
return { blocks, images };
|
|
7837
7929
|
}
|
|
7838
|
-
async function parseDocxDocument(buffer, options) {
|
|
7930
|
+
async function parseDocxDocument(buffer, options, existingZip) {
|
|
7839
7931
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
|
|
7840
|
-
const zip = await import_jszip4.default.loadAsync(buffer);
|
|
7932
|
+
const zip = existingZip ?? await import_jszip4.default.loadAsync(buffer);
|
|
7841
7933
|
const warnings = [];
|
|
7842
7934
|
const docFile = zip.file("word/document.xml");
|
|
7843
7935
|
if (!docFile) {
|
|
@@ -7927,6 +8019,11 @@ async function parseDocxDocument(buffer, options) {
|
|
|
7927
8019
|
};
|
|
7928
8020
|
}
|
|
7929
8021
|
|
|
8022
|
+
// src/index.ts
|
|
8023
|
+
init_cli_provider();
|
|
8024
|
+
init_tesseract_provider();
|
|
8025
|
+
init_markdown_to_blocks();
|
|
8026
|
+
|
|
7930
8027
|
// src/diff/text-diff.ts
|
|
7931
8028
|
function similarity(a, b) {
|
|
7932
8029
|
if (a === b) return 1;
|
|
@@ -10443,25 +10540,86 @@ async function parse2(input, options) {
|
|
|
10443
10540
|
if (!buffer || buffer.byteLength === 0) {
|
|
10444
10541
|
return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
|
|
10445
10542
|
}
|
|
10543
|
+
const MAX_FILE_SIZE = 500 * 1024 * 1024;
|
|
10544
|
+
if (buffer.byteLength > MAX_FILE_SIZE) {
|
|
10545
|
+
return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
|
|
10546
|
+
}
|
|
10446
10547
|
const format = detectFormat(buffer);
|
|
10447
10548
|
switch (format) {
|
|
10448
10549
|
case "hwpx": {
|
|
10449
|
-
const zipFormat = await detectZipFormat(buffer);
|
|
10450
|
-
if (zipFormat === "xlsx") return parseXlsx(buffer, options);
|
|
10451
|
-
if (zipFormat === "docx") return parseDocx(buffer, options);
|
|
10452
|
-
return parseHwpx(buffer, options);
|
|
10550
|
+
const { format: zipFormat, zip } = await detectZipFormat(buffer);
|
|
10551
|
+
if (zipFormat === "xlsx") return parseXlsx(buffer, options, zip ?? void 0);
|
|
10552
|
+
if (zipFormat === "docx") return parseDocx(buffer, options, zip ?? void 0);
|
|
10553
|
+
return parseHwpx(buffer, options, zip ?? void 0);
|
|
10453
10554
|
}
|
|
10454
10555
|
case "hwp":
|
|
10455
10556
|
return parseHwp(buffer, options);
|
|
10456
10557
|
case "pdf":
|
|
10457
10558
|
return parsePdf(buffer, options);
|
|
10559
|
+
case "image":
|
|
10560
|
+
return parseImage(buffer, options);
|
|
10458
10561
|
default:
|
|
10459
10562
|
return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
|
|
10460
10563
|
}
|
|
10461
10564
|
}
|
|
10462
|
-
async function
|
|
10565
|
+
/**
 * Runs OCR on a standalone image and converts the recognised Markdown to blocks.
 *
 * Provider selection follows `options.ocrMode` (default "auto"):
 *   - "off": returns a failure result without attempting OCR
 *   - "gemini" | "claude" | "codex" | "ollama": CLI provider for that tool
 *   - "tesseract": the tesseract provider
 *   - "auto": the first CLI provider that can be created, otherwise tesseract
 *   - anything else: failure result, no provider available
 *
 * Errors raised while selecting or running the provider are returned as a
 * `{ success: false }` result rather than thrown. A provider exposing
 * `terminate()` is released exactly once in `finally`, on both the success
 * and the failure path, and a failing `terminate()` cannot replace the result.
 *
 * @param buffer - Image bytes; always passed to the provider as "image/png".
 * @param options - Parse options; only `ocrMode` is read here.
 * @returns Parse result with `fileType: "image"`.
 */
async function parseImage(buffer, options) {
  const ocrMode = options?.ocrMode || "auto";
  if (ocrMode === "off") {
    return { success: false, fileType: "image", error: "OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC774\uBBF8\uC9C0 \uD30C\uC77C\uC744 \uCC98\uB9AC\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "IMAGE_BASED_PDF" };
  }

  // NOTE(review): this order differs from CLI_PRIORITY (codex first) used by
  // the PDF OCR path - confirm which order is intended.
  const cliModes = ["gemini", "claude", "codex", "ollama"];
  let ocrProvider;
  let actualOcrMode = "auto";

  try {
    if (cliModes.includes(ocrMode)) {
      ocrProvider = createCliOcrProvider(ocrMode);
      actualOcrMode = ocrMode;
    } else if (ocrMode === "tesseract") {
      ocrProvider = await createTesseractProvider();
      actualOcrMode = ocrMode;
    } else if (ocrMode === "auto") {
      // NOTE(review): this fallback only advances when createCliOcrProvider
      // throws; if it does not probe for the CLI, tesseract is never reached
      // - verify against its implementation.
      for (const mode of cliModes) {
        try {
          ocrProvider = createCliOcrProvider(mode);
          actualOcrMode = mode;
          break;
        } catch (e) {
          console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
        }
      }
      if (!ocrProvider) {
        ocrProvider = await createTesseractProvider();
        actualOcrMode = "tesseract";
      }
    }

    if (!ocrProvider) {
      return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
    }

    // A standalone image is treated as page 1.
    const ocrResult = await ocrProvider(new Uint8Array(buffer), 1, "image/png");
    // Providers may return either a plain string or an object carrying `markdown`.
    const markdown = typeof ocrResult === "string" ? ocrResult : ocrResult.markdown;
    const blocks = markdownToBlocks(markdown, 1);
    return {
      success: true,
      fileType: "image",
      markdown,
      blocks,
      isImageBased: true,
      warnings: [{ message: `OCR \uCC98\uB9AC\uB428 (${actualOcrMode})`, code: "OCR_FALLBACK" }]
    };
  } catch (err) {
    return { success: false, fileType: "image", error: err instanceof Error ? err.message : "\uC774\uBBF8\uC9C0 OCR \uC2E4\uD328", code: classifyError(err) };
  } finally {
    // Single cleanup point: runs after success, failure and early return.
    if (ocrProvider?.terminate) {
      try {
        await ocrProvider.terminate();
      } catch (terminateErr) {
        // Best-effort cleanup: report it, but keep the OCR outcome.
        console.warn("[kordoc] OCR provider terminate failed.", terminateErr);
      }
    }
  }
}
|
|
10620
|
+
async function parseHwpx(buffer, options, zip) {
|
|
10463
10621
|
try {
|
|
10464
|
-
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
|
|
10622
|
+
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
|
|
10465
10623
|
return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
10466
10624
|
} catch (err) {
|
|
10467
10625
|
return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
@@ -10484,17 +10642,17 @@ async function parsePdf(buffer, options) {
|
|
|
10484
10642
|
return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
|
|
10485
10643
|
}
|
|
10486
10644
|
}
|
|
10487
|
-
async function parseXlsx(buffer, options) {
|
|
10645
|
+
async function parseXlsx(buffer, options, zip) {
|
|
10488
10646
|
try {
|
|
10489
|
-
const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options);
|
|
10647
|
+
const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
|
|
10490
10648
|
return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
|
|
10491
10649
|
} catch (err) {
|
|
10492
10650
|
return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
10493
10651
|
}
|
|
10494
10652
|
}
|
|
10495
|
-
async function parseDocx(buffer, options) {
|
|
10653
|
+
async function parseDocx(buffer, options, zip) {
|
|
10496
10654
|
try {
|
|
10497
|
-
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
|
|
10655
|
+
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
|
|
10498
10656
|
return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
10499
10657
|
} catch (err) {
|
|
10500
10658
|
return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|