@clazic/kordoc 2.3.1 → 2.4.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{batch-provider-PCT4I4LK.js → batch-provider-PNDCSGQW.js} +3 -4
- package/dist/batch-provider-PNDCSGQW.js.map +1 -0
- package/dist/{chunk-ZOEUKD77.js → chunk-2GFJFTKS.js} +193 -49
- package/dist/chunk-2GFJFTKS.js.map +1 -0
- package/dist/chunk-4PP34NVQ.js +121 -0
- package/dist/chunk-4PP34NVQ.js.map +1 -0
- package/dist/{tesseract-provider-WCVJWBUT.js → chunk-7FMKAV4P.js} +4 -4
- package/dist/{tesseract-provider-WCVJWBUT.js.map → chunk-7FMKAV4P.js.map} +1 -1
- package/dist/chunk-JOGAFNIL.js +153 -0
- package/dist/chunk-JOGAFNIL.js.map +1 -0
- package/dist/{chunk-W5KUC23B.js → chunk-STIKJGEA.js} +2 -2
- package/dist/cli.js +8 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +217 -70
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +11 -6
- package/dist/index.d.ts +11 -6
- package/dist/index.js +217 -70
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -2
- package/dist/mcp.js.map +1 -1
- package/dist/{provider-WYHC4NHI.js → provider-HE727F7Z.js} +19 -131
- package/dist/provider-HE727F7Z.js.map +1 -0
- package/dist/resolve-QA3VACUP.js +111 -0
- package/dist/resolve-QA3VACUP.js.map +1 -0
- package/dist/tesseract-provider-MNMZPSGF.js +11 -0
- package/dist/{utils-HSF5HI5T.js → utils-FFUQJTTI.js} +2 -2
- package/dist/utils-FFUQJTTI.js.map +1 -0
- package/dist/{watch-R2JHXDGF.js → watch-2O32L6IF.js} +6 -3
- package/dist/{watch-R2JHXDGF.js.map → watch-2O32L6IF.js.map} +1 -1
- package/package.json +7 -8
- package/dist/batch-provider-PCT4I4LK.js.map +0 -1
- package/dist/chunk-ZOEUKD77.js.map +0 -1
- package/dist/provider-WYHC4NHI.js.map +0 -1
- package/dist/resolve-4FSAQF2S.js +0 -247
- package/dist/resolve-4FSAQF2S.js.map +0 -1
- /package/dist/{chunk-W5KUC23B.js.map → chunk-STIKJGEA.js.map} +0 -0
- /package/dist/{utils-HSF5HI5T.js.map → tesseract-provider-MNMZPSGF.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -1993,8 +1993,8 @@ function getTesseractFallbackMessage() {
|
|
|
1993
1993
|
"\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 \uB0B4\uC7A5 tesseract.js\uB85C OCR\uC744 \uC218\uD589\uD569\uB2C8\uB2E4.",
|
|
1994
1994
|
"\uB354 \uB098\uC740 \uD488\uC9C8(\uD14C\uC774\uBE14/\uD5E4\uB529 \uAD6C\uC870 \uBCF4\uC874)\uC744 \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
|
|
1995
1995
|
"",
|
|
1996
|
-
" [\uAD8C\uC7A5]
|
|
1997
|
-
"
|
|
1996
|
+
" [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
|
|
1997
|
+
" Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
|
|
1998
1998
|
" Claude CLI: npm install -g @anthropic-ai/claude-code",
|
|
1999
1999
|
" Ollama: brew install ollama (+ ollama pull gemma4:27b)"
|
|
2000
2000
|
].join("\n");
|
|
@@ -2004,7 +2004,7 @@ var init_auto_detect = __esm({
|
|
|
2004
2004
|
"src/ocr/auto-detect.ts"() {
|
|
2005
2005
|
"use strict";
|
|
2006
2006
|
import_child_process = require("child_process");
|
|
2007
|
-
CLI_PRIORITY = ["
|
|
2007
|
+
CLI_PRIORITY = ["codex", "gemini", "claude", "ollama"];
|
|
2008
2008
|
}
|
|
2009
2009
|
});
|
|
2010
2010
|
|
|
@@ -2043,7 +2043,7 @@ function callCli(mode, imagePath) {
|
|
|
2043
2043
|
const args = buildCliArgs(mode, imagePath);
|
|
2044
2044
|
const result = (0, import_child_process2.spawnSync)(mode, args, {
|
|
2045
2045
|
encoding: "utf-8",
|
|
2046
|
-
timeout:
|
|
2046
|
+
timeout: 6e5,
|
|
2047
2047
|
maxBuffer: 10 * 1024 * 1024,
|
|
2048
2048
|
// claude: /tmp에서 실행하여 프로젝트 CLAUDE.md의 규칙 간섭 방지
|
|
2049
2049
|
...mode === "claude" ? { cwd: (0, import_os.tmpdir)() } : {}
|
|
@@ -2137,7 +2137,7 @@ async function callOllamaApi(imagePath) {
|
|
|
2137
2137
|
return data.message?.content || "";
|
|
2138
2138
|
}
|
|
2139
2139
|
function stripCodeFence(text) {
|
|
2140
|
-
const match = text.match(/^```(?:markdown|md)?\s
|
|
2140
|
+
const match = text.match(/^```(?:markdown|md)?\s*([\s\S]*?)```\s*$/m);
|
|
2141
2141
|
return match ? match[1].trim() : text;
|
|
2142
2142
|
}
|
|
2143
2143
|
var import_child_process2, import_fs, import_path, import_os, OCR_PROMPT, _tempDir;
|
|
@@ -2148,7 +2148,15 @@ var init_cli_provider = __esm({
|
|
|
2148
2148
|
import_fs = require("fs");
|
|
2149
2149
|
import_path = require("path");
|
|
2150
2150
|
import_os = require("os");
|
|
2151
|
-
OCR_PROMPT =
|
|
2151
|
+
OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.
|
|
2152
|
+
\uADDC\uCE59:
|
|
2153
|
+
- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)
|
|
2154
|
+
- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC
|
|
2155
|
+
- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9
|
|
2156
|
+
- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9
|
|
2157
|
+
- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC
|
|
2158
|
+
- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0
|
|
2159
|
+
- \`\`\`\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825`;
|
|
2152
2160
|
_tempDir = null;
|
|
2153
2161
|
}
|
|
2154
2162
|
});
|
|
@@ -2314,9 +2322,8 @@ async function callBatchCli(mode, imagePaths) {
|
|
|
2314
2322
|
${fileRefs}`;
|
|
2315
2323
|
let args;
|
|
2316
2324
|
if (mode === "gemini") {
|
|
2317
|
-
|
|
2318
|
-
|
|
2319
|
-
if (model) args.push("--model", model);
|
|
2325
|
+
const model = process.env.KORDOC_GEMINI_MODEL ?? "gemini-2.5-flash";
|
|
2326
|
+
args = ["--prompt", prompt, "--yolo", "--model", model];
|
|
2320
2327
|
} else {
|
|
2321
2328
|
args = ["--print", prompt];
|
|
2322
2329
|
const model = process.env.KORDOC_CLAUDE_MODEL;
|
|
@@ -2664,22 +2671,22 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
|
|
|
2664
2671
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
2665
2672
|
pageNumbers.push(i);
|
|
2666
2673
|
}
|
|
2667
|
-
const
|
|
2668
|
-
for (
|
|
2669
|
-
|
|
2670
|
-
const image = await renderPageToPng(page);
|
|
2671
|
-
pageImages.push({ image, pageNum });
|
|
2672
|
-
}
|
|
2673
|
-
const batches = [];
|
|
2674
|
-
for (let i = 0; i < pageImages.length; i += provider.batchSize) {
|
|
2675
|
-
batches.push(pageImages.slice(i, i + provider.batchSize));
|
|
2674
|
+
const pageBatches = [];
|
|
2675
|
+
for (let i = 0; i < pageNumbers.length; i += provider.batchSize) {
|
|
2676
|
+
pageBatches.push(pageNumbers.slice(i, i + provider.batchSize));
|
|
2676
2677
|
}
|
|
2677
2678
|
let processed = 0;
|
|
2678
|
-
const batchTasks =
|
|
2679
|
+
const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
|
|
2679
2680
|
const pageBlocks = [];
|
|
2680
2681
|
try {
|
|
2681
|
-
const
|
|
2682
|
-
for (const
|
|
2682
|
+
const batchImages = [];
|
|
2683
|
+
for (const pageNum of batchPageNums) {
|
|
2684
|
+
const page = await doc.getPage(pageNum);
|
|
2685
|
+
const image = await renderPageToPng(page);
|
|
2686
|
+
batchImages.push({ image, pageNum });
|
|
2687
|
+
}
|
|
2688
|
+
const results = await provider.processBatch(batchImages);
|
|
2689
|
+
for (const { pageNum } of batchImages) {
|
|
2683
2690
|
const result = results.get(pageNum);
|
|
2684
2691
|
pageBlocks.push({
|
|
2685
2692
|
pageNum,
|
|
@@ -2687,16 +2694,16 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
|
|
|
2687
2694
|
});
|
|
2688
2695
|
}
|
|
2689
2696
|
} catch (err) {
|
|
2690
|
-
const range = `${
|
|
2697
|
+
const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
|
|
2691
2698
|
warnings?.push({
|
|
2692
2699
|
message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
|
|
2693
2700
|
code: "OCR_PAGE_FAILED"
|
|
2694
2701
|
});
|
|
2695
|
-
for (const
|
|
2702
|
+
for (const pageNum of batchPageNums) {
|
|
2696
2703
|
pageBlocks.push({ pageNum, blocks: [] });
|
|
2697
2704
|
}
|
|
2698
2705
|
}
|
|
2699
|
-
processed +=
|
|
2706
|
+
processed += batchPageNums.length;
|
|
2700
2707
|
onProgress?.(processed, pageNumbers.length);
|
|
2701
2708
|
return { batchIdx, pageBlocks };
|
|
2702
2709
|
});
|
|
@@ -2772,24 +2779,29 @@ function isPdfFile(buffer) {
|
|
|
2772
2779
|
const b = magicBytes(buffer);
|
|
2773
2780
|
return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
|
|
2774
2781
|
}
|
|
2782
|
+
function isPngFile(buffer) {
|
|
2783
|
+
const b = magicBytes(buffer);
|
|
2784
|
+
return b[0] === 137 && b[1] === 80 && b[2] === 78 && b[3] === 71;
|
|
2785
|
+
}
|
|
2775
2786
|
function detectFormat(buffer) {
|
|
2776
2787
|
if (buffer.byteLength < 4) return "unknown";
|
|
2777
2788
|
if (isZipFile(buffer)) return "hwpx";
|
|
2778
2789
|
if (isOldHwpFile(buffer)) return "hwp";
|
|
2779
2790
|
if (isPdfFile(buffer)) return "pdf";
|
|
2791
|
+
if (isPngFile(buffer)) return "image";
|
|
2780
2792
|
return "unknown";
|
|
2781
2793
|
}
|
|
2782
2794
|
async function detectZipFormat(buffer) {
|
|
2783
2795
|
try {
|
|
2784
2796
|
const zip = await import_jszip.default.loadAsync(buffer);
|
|
2785
|
-
if (zip.file("xl/workbook.xml")) return "xlsx";
|
|
2786
|
-
if (zip.file("word/document.xml")) return "docx";
|
|
2787
|
-
if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
|
|
2797
|
+
if (zip.file("xl/workbook.xml")) return { format: "xlsx", zip };
|
|
2798
|
+
if (zip.file("word/document.xml")) return { format: "docx", zip };
|
|
2799
|
+
if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return { format: "hwpx", zip };
|
|
2788
2800
|
const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
|
|
2789
|
-
if (hasSection) return "hwpx";
|
|
2790
|
-
return "unknown";
|
|
2801
|
+
if (hasSection) return { format: "hwpx", zip };
|
|
2802
|
+
return { format: "unknown", zip: null };
|
|
2791
2803
|
} catch {
|
|
2792
|
-
return "unknown";
|
|
2804
|
+
return { format: "unknown", zip: null };
|
|
2793
2805
|
}
|
|
2794
2806
|
}
|
|
2795
2807
|
|
|
@@ -2798,7 +2810,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
|
|
|
2798
2810
|
var import_xmldom = require("@xmldom/xmldom");
|
|
2799
2811
|
|
|
2800
2812
|
// src/utils.ts
|
|
2801
|
-
var VERSION = true ? "2.3.
|
|
2813
|
+
var VERSION = true ? "2.3.2" : "0.0.0-dev";
|
|
2802
2814
|
function toArrayBuffer(buf) {
|
|
2803
2815
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2804
2816
|
return buf.buffer;
|
|
@@ -2958,12 +2970,16 @@ function buildTableDirect(rows, numRows) {
|
|
|
2958
2970
|
return trimAndReturn(grid, numRows, maxCols);
|
|
2959
2971
|
}
|
|
2960
2972
|
function trimAndReturn(grid, numRows, maxCols) {
|
|
2961
|
-
let effectiveCols =
|
|
2962
|
-
|
|
2963
|
-
|
|
2964
|
-
|
|
2965
|
-
|
|
2973
|
+
let effectiveCols = 0;
|
|
2974
|
+
for (const row of grid) {
|
|
2975
|
+
for (let c = row.length - 1; c >= effectiveCols; c--) {
|
|
2976
|
+
if (row[c]?.text?.trim()) {
|
|
2977
|
+
effectiveCols = c + 1;
|
|
2978
|
+
break;
|
|
2979
|
+
}
|
|
2980
|
+
}
|
|
2966
2981
|
}
|
|
2982
|
+
if (effectiveCols === 0) effectiveCols = maxCols;
|
|
2967
2983
|
if (effectiveCols < maxCols && effectiveCols > 0) {
|
|
2968
2984
|
const trimmed = grid.map((row) => row.slice(0, effectiveCols));
|
|
2969
2985
|
return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
|
|
@@ -3220,11 +3236,11 @@ function parseStyleElements(doc, map) {
|
|
|
3220
3236
|
function stripDtd(xml) {
|
|
3221
3237
|
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
3222
3238
|
}
|
|
3223
|
-
async function parseHwpxDocument(buffer, options) {
|
|
3239
|
+
async function parseHwpxDocument(buffer, options, existingZip) {
|
|
3224
3240
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
3225
3241
|
let zip;
|
|
3226
3242
|
try {
|
|
3227
|
-
zip = await import_jszip2.default.loadAsync(buffer);
|
|
3243
|
+
zip = existingZip ?? await import_jszip2.default.loadAsync(buffer);
|
|
3228
3244
|
} catch {
|
|
3229
3245
|
return await extractFromBrokenZip(buffer);
|
|
3230
3246
|
}
|
|
@@ -6236,8 +6252,15 @@ var import_pdf2 = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
|
6236
6252
|
import_pdf2.GlobalWorkerOptions.workerSrc = "";
|
|
6237
6253
|
var MAX_PAGES = 5e3;
|
|
6238
6254
|
var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
|
|
6239
|
-
|
|
6255
|
+
function calcPdfTimeout(bufferSize) {
|
|
6256
|
+
const base = 3e4;
|
|
6257
|
+
const perMb = 500;
|
|
6258
|
+
const mb = bufferSize / (1024 * 1024);
|
|
6259
|
+
return Math.min(base + Math.ceil(mb * perMb), 3e5);
|
|
6260
|
+
}
|
|
6240
6261
|
async function loadPdfWithTimeout(buffer) {
|
|
6262
|
+
const timeoutMs = calcPdfTimeout(buffer.byteLength);
|
|
6263
|
+
const timeoutSec = Math.round(timeoutMs / 1e3);
|
|
6241
6264
|
const loadingTask = (0, import_pdf2.getDocument)({
|
|
6242
6265
|
data: new Uint8Array(buffer),
|
|
6243
6266
|
useSystemFonts: true,
|
|
@@ -6251,8 +6274,8 @@ async function loadPdfWithTimeout(buffer) {
|
|
|
6251
6274
|
new Promise((_, reject) => {
|
|
6252
6275
|
timer = setTimeout(() => {
|
|
6253
6276
|
loadingTask.destroy();
|
|
6254
|
-
reject(new KordocError(
|
|
6255
|
-
},
|
|
6277
|
+
reject(new KordocError(`PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (${timeoutSec}\uCD08 \uCD08\uACFC)`));
|
|
6278
|
+
}, timeoutMs);
|
|
6256
6279
|
})
|
|
6257
6280
|
]);
|
|
6258
6281
|
} finally {
|
|
@@ -6273,11 +6296,15 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6273
6296
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
6274
6297
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
6275
6298
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
6276
|
-
const
|
|
6299
|
+
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
6277
6300
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
6278
|
-
|
|
6301
|
+
const targetPageNums = [];
|
|
6279
6302
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
6280
6303
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
6304
|
+
targetPageNums.push(i);
|
|
6305
|
+
}
|
|
6306
|
+
let parsedPages = 0;
|
|
6307
|
+
const parseSinglePage = async (i) => {
|
|
6281
6308
|
try {
|
|
6282
6309
|
const page = await doc.getPage(i);
|
|
6283
6310
|
const tc = await page.getTextContent();
|
|
@@ -6290,7 +6317,10 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6290
6317
|
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
6291
6318
|
}
|
|
6292
6319
|
for (const item of visible) {
|
|
6293
|
-
if (item.fontSize > 0)
|
|
6320
|
+
if (item.fontSize > 0) {
|
|
6321
|
+
const rounded = Math.round(item.fontSize * 10) / 10;
|
|
6322
|
+
fontSizeFreq.set(rounded, (fontSizeFreq.get(rounded) || 0) + 1);
|
|
6323
|
+
}
|
|
6294
6324
|
}
|
|
6295
6325
|
const opList = await page.getOperatorList();
|
|
6296
6326
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
@@ -6307,12 +6337,23 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6307
6337
|
if (pageErr instanceof KordocError) throw pageErr;
|
|
6308
6338
|
warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
6309
6339
|
}
|
|
6340
|
+
};
|
|
6341
|
+
const sampleCount = Math.min(5, targetPageNums.length);
|
|
6342
|
+
for (let si = 0; si < sampleCount; si++) {
|
|
6343
|
+
await parseSinglePage(targetPageNums[si]);
|
|
6344
|
+
}
|
|
6345
|
+
const sampleParsed = parsedPages || sampleCount;
|
|
6346
|
+
const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
|
|
6347
|
+
if (!isImageBased) {
|
|
6348
|
+
for (let si = sampleCount; si < targetPageNums.length; si++) {
|
|
6349
|
+
await parseSinglePage(targetPageNums[si]);
|
|
6350
|
+
}
|
|
6310
6351
|
}
|
|
6311
6352
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
6312
|
-
if (
|
|
6353
|
+
if (isImageBased) {
|
|
6313
6354
|
let ocrProvider = options?.ocr ?? null;
|
|
6314
|
-
const ocrMode = options?.ocrMode;
|
|
6315
|
-
if (!ocrProvider && ocrMode
|
|
6355
|
+
const ocrMode = options?.ocrMode ?? "auto";
|
|
6356
|
+
if (!ocrProvider && ocrMode !== "off") {
|
|
6316
6357
|
try {
|
|
6317
6358
|
const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
|
|
6318
6359
|
const concurrency = options?.ocrConcurrency ?? 1;
|
|
@@ -6364,7 +6405,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6364
6405
|
blocks.splice(removed[ri], 1);
|
|
6365
6406
|
}
|
|
6366
6407
|
}
|
|
6367
|
-
const medianFontSize =
|
|
6408
|
+
const medianFontSize = computeMedianFromFreq(fontSizeFreq);
|
|
6368
6409
|
if (medianFontSize > 0) {
|
|
6369
6410
|
detectHeadings(blocks, medianFontSize);
|
|
6370
6411
|
}
|
|
@@ -6417,11 +6458,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
|
|
|
6417
6458
|
}
|
|
6418
6459
|
return { visible, hiddenCount };
|
|
6419
6460
|
}
|
|
6420
|
-
function
|
|
6421
|
-
if (
|
|
6422
|
-
const
|
|
6423
|
-
|
|
6424
|
-
|
|
6461
|
+
function computeMedianFromFreq(freq) {
|
|
6462
|
+
if (freq.size === 0) return 0;
|
|
6463
|
+
const entries = [...freq.entries()].sort((a, b) => a[0] - b[0]);
|
|
6464
|
+
let total = 0;
|
|
6465
|
+
for (const [, count] of entries) total += count;
|
|
6466
|
+
const mid = total / 2;
|
|
6467
|
+
let cumulative = 0;
|
|
6468
|
+
for (const [size, count] of entries) {
|
|
6469
|
+
cumulative += count;
|
|
6470
|
+
if (cumulative >= mid) return size;
|
|
6471
|
+
}
|
|
6472
|
+
return 0;
|
|
6425
6473
|
}
|
|
6426
6474
|
function detectHeadings(blocks, medianFontSize) {
|
|
6427
6475
|
for (const block of blocks) {
|
|
@@ -7224,6 +7272,7 @@ var MAX_SHEETS = 100;
|
|
|
7224
7272
|
var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
|
|
7225
7273
|
var MAX_ROWS2 = 1e4;
|
|
7226
7274
|
var MAX_COLS2 = 200;
|
|
7275
|
+
var MAX_TOTAL_CELLS = 2e6;
|
|
7227
7276
|
function cleanNumericValue(raw) {
|
|
7228
7277
|
if (!/^-?\d+\.\d+$/.test(raw)) return raw;
|
|
7229
7278
|
const num = parseFloat(raw);
|
|
@@ -7407,9 +7456,9 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
7407
7456
|
}
|
|
7408
7457
|
return blocks;
|
|
7409
7458
|
}
|
|
7410
|
-
async function parseXlsxDocument(buffer, options) {
|
|
7459
|
+
async function parseXlsxDocument(buffer, options, existingZip) {
|
|
7411
7460
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
|
|
7412
|
-
const zip = await import_jszip3.default.loadAsync(buffer);
|
|
7461
|
+
const zip = existingZip ?? await import_jszip3.default.loadAsync(buffer);
|
|
7413
7462
|
const warnings = [];
|
|
7414
7463
|
const workbookFile = zip.file("xl/workbook.xml");
|
|
7415
7464
|
if (!workbookFile) {
|
|
@@ -7436,6 +7485,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
7436
7485
|
}
|
|
7437
7486
|
const blocks = [];
|
|
7438
7487
|
const processedSheets = Math.min(sheets.length, MAX_SHEETS);
|
|
7488
|
+
let totalCells = 0;
|
|
7439
7489
|
for (let i = 0; i < processedSheets; i++) {
|
|
7440
7490
|
if (pageFilter && !pageFilter.has(i + 1)) continue;
|
|
7441
7491
|
const sheet = sheets[i];
|
|
@@ -7462,6 +7512,11 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
7462
7512
|
try {
|
|
7463
7513
|
const sheetXml = await sheetFile.async("text");
|
|
7464
7514
|
const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
|
|
7515
|
+
totalCells += maxRow * maxCol;
|
|
7516
|
+
if (totalCells > MAX_TOTAL_CELLS) {
|
|
7517
|
+
warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
|
|
7518
|
+
break;
|
|
7519
|
+
}
|
|
7465
7520
|
const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
|
|
7466
7521
|
blocks.push(...sheetBlocks);
|
|
7467
7522
|
} catch (err) {
|
|
@@ -7545,10 +7600,35 @@ function getAttr(el, localName) {
|
|
|
7545
7600
|
function parseXml2(text) {
|
|
7546
7601
|
return new import_xmldom3.DOMParser().parseFromString(text, "text/xml");
|
|
7547
7602
|
}
|
|
7603
|
+
function buildElementIndex(root) {
|
|
7604
|
+
const index = /* @__PURE__ */ new Map();
|
|
7605
|
+
const walk = (node) => {
|
|
7606
|
+
const children = node.childNodes;
|
|
7607
|
+
for (let i = 0; i < children.length; i++) {
|
|
7608
|
+
const child = children[i];
|
|
7609
|
+
if (child.nodeType === 1) {
|
|
7610
|
+
const el = child;
|
|
7611
|
+
const name = el.localName ?? "";
|
|
7612
|
+
if (name) {
|
|
7613
|
+
let list = index.get(name);
|
|
7614
|
+
if (!list) {
|
|
7615
|
+
list = [];
|
|
7616
|
+
index.set(name, list);
|
|
7617
|
+
}
|
|
7618
|
+
list.push(el);
|
|
7619
|
+
}
|
|
7620
|
+
walk(el);
|
|
7621
|
+
}
|
|
7622
|
+
}
|
|
7623
|
+
};
|
|
7624
|
+
walk(root);
|
|
7625
|
+
return index;
|
|
7626
|
+
}
|
|
7548
7627
|
function parseStyles(xml) {
|
|
7549
7628
|
const doc = parseXml2(xml);
|
|
7550
7629
|
const styles = /* @__PURE__ */ new Map();
|
|
7551
|
-
const
|
|
7630
|
+
const idx = buildElementIndex(doc);
|
|
7631
|
+
const styleElements = idx.get("style") ?? [];
|
|
7552
7632
|
for (const el of styleElements) {
|
|
7553
7633
|
const styleId = getAttr(el, "styleId");
|
|
7554
7634
|
if (!styleId) continue;
|
|
@@ -7576,7 +7656,8 @@ function parseStyles(xml) {
|
|
|
7576
7656
|
function parseNumbering(xml) {
|
|
7577
7657
|
const doc = parseXml2(xml);
|
|
7578
7658
|
const abstractNums = /* @__PURE__ */ new Map();
|
|
7579
|
-
const
|
|
7659
|
+
const idx = buildElementIndex(doc);
|
|
7660
|
+
const abstractElements = idx.get("abstractNum") ?? [];
|
|
7580
7661
|
for (const el of abstractElements) {
|
|
7581
7662
|
const abstractNumId = getAttr(el, "abstractNumId");
|
|
7582
7663
|
if (!abstractNumId) continue;
|
|
@@ -7591,7 +7672,7 @@ function parseNumbering(xml) {
|
|
|
7591
7672
|
abstractNums.set(abstractNumId, levels);
|
|
7592
7673
|
}
|
|
7593
7674
|
const nums = /* @__PURE__ */ new Map();
|
|
7594
|
-
const numElements =
|
|
7675
|
+
const numElements = idx.get("num") ?? [];
|
|
7595
7676
|
for (const el of numElements) {
|
|
7596
7677
|
const numId = getAttr(el, "numId");
|
|
7597
7678
|
if (!numId) continue;
|
|
@@ -7835,9 +7916,9 @@ async function extractImages(zip, rels, doc) {
|
|
|
7835
7916
|
}
|
|
7836
7917
|
return { blocks, images };
|
|
7837
7918
|
}
|
|
7838
|
-
async function parseDocxDocument(buffer, options) {
|
|
7919
|
+
async function parseDocxDocument(buffer, options, existingZip) {
|
|
7839
7920
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
|
|
7840
|
-
const zip = await import_jszip4.default.loadAsync(buffer);
|
|
7921
|
+
const zip = existingZip ?? await import_jszip4.default.loadAsync(buffer);
|
|
7841
7922
|
const warnings = [];
|
|
7842
7923
|
const docFile = zip.file("word/document.xml");
|
|
7843
7924
|
if (!docFile) {
|
|
@@ -7927,6 +8008,11 @@ async function parseDocxDocument(buffer, options) {
|
|
|
7927
8008
|
};
|
|
7928
8009
|
}
|
|
7929
8010
|
|
|
8011
|
+
// src/index.ts
|
|
8012
|
+
init_cli_provider();
|
|
8013
|
+
init_tesseract_provider();
|
|
8014
|
+
init_markdown_to_blocks();
|
|
8015
|
+
|
|
7930
8016
|
// src/diff/text-diff.ts
|
|
7931
8017
|
function similarity(a, b) {
|
|
7932
8018
|
if (a === b) return 1;
|
|
@@ -10443,25 +10529,86 @@ async function parse2(input, options) {
|
|
|
10443
10529
|
if (!buffer || buffer.byteLength === 0) {
|
|
10444
10530
|
return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
|
|
10445
10531
|
}
|
|
10532
|
+
const MAX_FILE_SIZE = 500 * 1024 * 1024;
|
|
10533
|
+
if (buffer.byteLength > MAX_FILE_SIZE) {
|
|
10534
|
+
return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
|
|
10535
|
+
}
|
|
10446
10536
|
const format = detectFormat(buffer);
|
|
10447
10537
|
switch (format) {
|
|
10448
10538
|
case "hwpx": {
|
|
10449
|
-
const zipFormat = await detectZipFormat(buffer);
|
|
10450
|
-
if (zipFormat === "xlsx") return parseXlsx(buffer, options);
|
|
10451
|
-
if (zipFormat === "docx") return parseDocx(buffer, options);
|
|
10452
|
-
return parseHwpx(buffer, options);
|
|
10539
|
+
const { format: zipFormat, zip } = await detectZipFormat(buffer);
|
|
10540
|
+
if (zipFormat === "xlsx") return parseXlsx(buffer, options, zip ?? void 0);
|
|
10541
|
+
if (zipFormat === "docx") return parseDocx(buffer, options, zip ?? void 0);
|
|
10542
|
+
return parseHwpx(buffer, options, zip ?? void 0);
|
|
10453
10543
|
}
|
|
10454
10544
|
case "hwp":
|
|
10455
10545
|
return parseHwp(buffer, options);
|
|
10456
10546
|
case "pdf":
|
|
10457
10547
|
return parsePdf(buffer, options);
|
|
10548
|
+
case "image":
|
|
10549
|
+
return parseImage(buffer, options);
|
|
10458
10550
|
default:
|
|
10459
10551
|
return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
|
|
10460
10552
|
}
|
|
10461
10553
|
}
|
|
10462
|
-
async function
|
|
10554
|
+
async function parseImage(buffer, options) {
|
|
10555
|
+
const ocrMode = options?.ocrMode || "auto";
|
|
10556
|
+
if (ocrMode === "off") {
|
|
10557
|
+
return { success: false, fileType: "image", error: "OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC774\uBBF8\uC9C0 \uD30C\uC77C\uC744 \uCC98\uB9AC\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "IMAGE_BASED_PDF" };
|
|
10558
|
+
}
|
|
10559
|
+
let ocrProvider;
|
|
10560
|
+
let actualOcrMode = "auto";
|
|
10561
|
+
try {
|
|
10562
|
+
if (ocrMode === "gemini" || ocrMode === "claude" || ocrMode === "codex" || ocrMode === "ollama") {
|
|
10563
|
+
ocrProvider = createCliOcrProvider(ocrMode);
|
|
10564
|
+
actualOcrMode = ocrMode;
|
|
10565
|
+
} else if (ocrMode === "tesseract") {
|
|
10566
|
+
ocrProvider = await createTesseractProvider();
|
|
10567
|
+
actualOcrMode = ocrMode;
|
|
10568
|
+
} else if (ocrMode === "auto") {
|
|
10569
|
+
const modesToTry = ["gemini", "claude", "codex", "ollama"];
|
|
10570
|
+
for (const mode of modesToTry) {
|
|
10571
|
+
try {
|
|
10572
|
+
ocrProvider = createCliOcrProvider(mode);
|
|
10573
|
+
actualOcrMode = mode;
|
|
10574
|
+
break;
|
|
10575
|
+
} catch (e) {
|
|
10576
|
+
console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
|
|
10577
|
+
}
|
|
10578
|
+
}
|
|
10579
|
+
if (!ocrProvider) {
|
|
10580
|
+
ocrProvider = await createTesseractProvider();
|
|
10581
|
+
actualOcrMode = "tesseract";
|
|
10582
|
+
}
|
|
10583
|
+
}
|
|
10584
|
+
if (!ocrProvider) {
|
|
10585
|
+
return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
|
|
10586
|
+
}
|
|
10587
|
+
const imageUint8Array = new Uint8Array(buffer);
|
|
10588
|
+
const ocrResult = await ocrProvider(imageUint8Array, 1, "image/png");
|
|
10589
|
+
if (ocrProvider.terminate) {
|
|
10590
|
+
await ocrProvider.terminate();
|
|
10591
|
+
}
|
|
10592
|
+
const markdown = typeof ocrResult === "string" ? ocrResult : ocrResult.markdown;
|
|
10593
|
+
const blocks = markdownToBlocks(markdown, 1);
|
|
10594
|
+
return {
|
|
10595
|
+
success: true,
|
|
10596
|
+
fileType: "image",
|
|
10597
|
+
markdown,
|
|
10598
|
+
blocks,
|
|
10599
|
+
isImageBased: true,
|
|
10600
|
+
warnings: [{ message: `OCR \uCC98\uB9AC\uB428 (${actualOcrMode})`, code: "OCR_FALLBACK" }]
|
|
10601
|
+
};
|
|
10602
|
+
} catch (err) {
|
|
10603
|
+
if (ocrProvider && ocrProvider.terminate) {
|
|
10604
|
+
await ocrProvider.terminate();
|
|
10605
|
+
}
|
|
10606
|
+
return { success: false, fileType: "image", error: err instanceof Error ? err.message : "\uC774\uBBF8\uC9C0 OCR \uC2E4\uD328", code: classifyError(err) };
|
|
10607
|
+
}
|
|
10608
|
+
}
|
|
10609
|
+
async function parseHwpx(buffer, options, zip) {
|
|
10463
10610
|
try {
|
|
10464
|
-
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
|
|
10611
|
+
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
|
|
10465
10612
|
return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
10466
10613
|
} catch (err) {
|
|
10467
10614
|
return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
@@ -10484,17 +10631,17 @@ async function parsePdf(buffer, options) {
|
|
|
10484
10631
|
return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
|
|
10485
10632
|
}
|
|
10486
10633
|
}
|
|
10487
|
-
async function parseXlsx(buffer, options) {
|
|
10634
|
+
async function parseXlsx(buffer, options, zip) {
|
|
10488
10635
|
try {
|
|
10489
|
-
const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options);
|
|
10636
|
+
const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
|
|
10490
10637
|
return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
|
|
10491
10638
|
} catch (err) {
|
|
10492
10639
|
return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
10493
10640
|
}
|
|
10494
10641
|
}
|
|
10495
|
-
async function parseDocx(buffer, options) {
|
|
10642
|
+
async function parseDocx(buffer, options, zip) {
|
|
10496
10643
|
try {
|
|
10497
|
-
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
|
|
10644
|
+
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
|
|
10498
10645
|
return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
10499
10646
|
} catch (err) {
|
|
10500
10647
|
return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|