@clazic/kordoc 2.3.1 → 2.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{batch-provider-PCT4I4LK.js → batch-provider-PNDCSGQW.js} +3 -4
- package/dist/batch-provider-PNDCSGQW.js.map +1 -0
- package/dist/chunk-4PP34NVQ.js +121 -0
- package/dist/chunk-4PP34NVQ.js.map +1 -0
- package/dist/{tesseract-provider-WCVJWBUT.js → chunk-7FMKAV4P.js} +4 -4
- package/dist/{tesseract-provider-WCVJWBUT.js.map → chunk-7FMKAV4P.js.map} +1 -1
- package/dist/chunk-JOGAFNIL.js +153 -0
- package/dist/chunk-JOGAFNIL.js.map +1 -0
- package/dist/{chunk-W5KUC23B.js → chunk-NU3KFVVZ.js} +2 -2
- package/dist/{chunk-ZOEUKD77.js → chunk-UDFKY7CH.js} +204 -49
- package/dist/chunk-UDFKY7CH.js.map +1 -0
- package/dist/cli.js +8 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +230 -72
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +11 -6
- package/dist/index.d.ts +11 -6
- package/dist/index.js +230 -72
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -2
- package/dist/mcp.js.map +1 -1
- package/dist/{provider-WYHC4NHI.js → provider-HE727F7Z.js} +19 -131
- package/dist/provider-HE727F7Z.js.map +1 -0
- package/dist/resolve-UOAOPQ4H.js +111 -0
- package/dist/resolve-UOAOPQ4H.js.map +1 -0
- package/dist/tesseract-provider-MNMZPSGF.js +11 -0
- package/dist/{utils-HSF5HI5T.js → utils-STJT6CFC.js} +2 -2
- package/dist/utils-STJT6CFC.js.map +1 -0
- package/dist/{watch-R2JHXDGF.js → watch-PRQGLOW3.js} +6 -3
- package/dist/{watch-R2JHXDGF.js.map → watch-PRQGLOW3.js.map} +1 -1
- package/package.json +8 -8
- package/dist/batch-provider-PCT4I4LK.js.map +0 -1
- package/dist/chunk-ZOEUKD77.js.map +0 -1
- package/dist/provider-WYHC4NHI.js.map +0 -1
- package/dist/resolve-4FSAQF2S.js +0 -247
- package/dist/resolve-4FSAQF2S.js.map +0 -1
- /package/dist/{chunk-W5KUC23B.js.map → chunk-NU3KFVVZ.js.map} +0 -0
- /package/dist/{utils-HSF5HI5T.js.map → tesseract-provider-MNMZPSGF.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -1998,8 +1998,8 @@ function getTesseractFallbackMessage() {
|
|
|
1998
1998
|
"\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 \uB0B4\uC7A5 tesseract.js\uB85C OCR\uC744 \uC218\uD589\uD569\uB2C8\uB2E4.",
|
|
1999
1999
|
"\uB354 \uB098\uC740 \uD488\uC9C8(\uD14C\uC774\uBE14/\uD5E4\uB529 \uAD6C\uC870 \uBCF4\uC874)\uC744 \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
|
|
2000
2000
|
"",
|
|
2001
|
-
" [\uAD8C\uC7A5]
|
|
2002
|
-
"
|
|
2001
|
+
" [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
|
|
2002
|
+
" Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
|
|
2003
2003
|
" Claude CLI: npm install -g @anthropic-ai/claude-code",
|
|
2004
2004
|
" Ollama: brew install ollama (+ ollama pull gemma4:27b)"
|
|
2005
2005
|
].join("\n");
|
|
@@ -2008,7 +2008,7 @@ var CLI_PRIORITY;
|
|
|
2008
2008
|
var init_auto_detect = __esm({
|
|
2009
2009
|
"src/ocr/auto-detect.ts"() {
|
|
2010
2010
|
"use strict";
|
|
2011
|
-
CLI_PRIORITY = ["
|
|
2011
|
+
CLI_PRIORITY = ["codex", "gemini", "claude", "ollama"];
|
|
2012
2012
|
}
|
|
2013
2013
|
});
|
|
2014
2014
|
|
|
@@ -2051,7 +2051,7 @@ function callCli(mode, imagePath) {
|
|
|
2051
2051
|
const args = buildCliArgs(mode, imagePath);
|
|
2052
2052
|
const result = spawnSync(mode, args, {
|
|
2053
2053
|
encoding: "utf-8",
|
|
2054
|
-
timeout:
|
|
2054
|
+
timeout: 6e5,
|
|
2055
2055
|
maxBuffer: 10 * 1024 * 1024,
|
|
2056
2056
|
// claude: /tmp에서 실행하여 프로젝트 CLAUDE.md의 규칙 간섭 방지
|
|
2057
2057
|
...mode === "claude" ? { cwd: tmpdir() } : {}
|
|
@@ -2145,14 +2145,22 @@ async function callOllamaApi(imagePath) {
|
|
|
2145
2145
|
return data.message?.content || "";
|
|
2146
2146
|
}
|
|
2147
2147
|
function stripCodeFence(text) {
|
|
2148
|
-
const match = text.match(/^```(?:markdown|md)?\s
|
|
2148
|
+
const match = text.match(/^```(?:markdown|md)?\s*([\s\S]*?)```\s*$/m);
|
|
2149
2149
|
return match ? match[1].trim() : text;
|
|
2150
2150
|
}
|
|
2151
2151
|
var OCR_PROMPT, _tempDir;
|
|
2152
2152
|
var init_cli_provider = __esm({
|
|
2153
2153
|
"src/ocr/cli-provider.ts"() {
|
|
2154
2154
|
"use strict";
|
|
2155
|
-
OCR_PROMPT =
|
|
2155
|
+
OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.
|
|
2156
|
+
\uADDC\uCE59:
|
|
2157
|
+
- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)
|
|
2158
|
+
- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC
|
|
2159
|
+
- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9
|
|
2160
|
+
- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9
|
|
2161
|
+
- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC
|
|
2162
|
+
- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0
|
|
2163
|
+
- \`\`\`\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825`;
|
|
2156
2164
|
_tempDir = null;
|
|
2157
2165
|
}
|
|
2158
2166
|
});
|
|
@@ -2321,9 +2329,8 @@ async function callBatchCli(mode, imagePaths) {
|
|
|
2321
2329
|
${fileRefs}`;
|
|
2322
2330
|
let args;
|
|
2323
2331
|
if (mode === "gemini") {
|
|
2324
|
-
|
|
2325
|
-
|
|
2326
|
-
if (model) args.push("--model", model);
|
|
2332
|
+
const model = process.env.KORDOC_GEMINI_MODEL ?? "gemini-2.5-flash";
|
|
2333
|
+
args = ["--prompt", prompt, "--yolo", "--model", model];
|
|
2327
2334
|
} else {
|
|
2328
2335
|
args = ["--print", prompt];
|
|
2329
2336
|
const model = process.env.KORDOC_CLAUDE_MODEL;
|
|
@@ -2418,7 +2425,7 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
|
2418
2425
|
return createCliOcrProvider(mode);
|
|
2419
2426
|
}
|
|
2420
2427
|
const detected = detectAvailableOcr();
|
|
2421
|
-
if (detected !== "
|
|
2428
|
+
if (detected !== "codex") {
|
|
2422
2429
|
if (detected === "tesseract") {
|
|
2423
2430
|
warnings?.push({
|
|
2424
2431
|
message: getTesseractFallbackMessage(),
|
|
@@ -2426,7 +2433,7 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
|
2426
2433
|
});
|
|
2427
2434
|
} else {
|
|
2428
2435
|
warnings?.push({
|
|
2429
|
-
message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (
|
|
2436
|
+
message: `OCR: '${detected}' \uC0AC\uC6A9 \uC911 (codex CLI\uAC00 \uC5C6\uC5B4 fallback). \uB354 \uB098\uC740 \uD488\uC9C8\uC744 \uC704\uD574 codex CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4.`,
|
|
2430
2437
|
code: "OCR_CLI_FALLBACK"
|
|
2431
2438
|
});
|
|
2432
2439
|
}
|
|
@@ -2667,22 +2674,22 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
|
|
|
2667
2674
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
2668
2675
|
pageNumbers.push(i);
|
|
2669
2676
|
}
|
|
2670
|
-
const
|
|
2671
|
-
for (
|
|
2672
|
-
|
|
2673
|
-
const image = await renderPageToPng(page);
|
|
2674
|
-
pageImages.push({ image, pageNum });
|
|
2675
|
-
}
|
|
2676
|
-
const batches = [];
|
|
2677
|
-
for (let i = 0; i < pageImages.length; i += provider.batchSize) {
|
|
2678
|
-
batches.push(pageImages.slice(i, i + provider.batchSize));
|
|
2677
|
+
const pageBatches = [];
|
|
2678
|
+
for (let i = 0; i < pageNumbers.length; i += provider.batchSize) {
|
|
2679
|
+
pageBatches.push(pageNumbers.slice(i, i + provider.batchSize));
|
|
2679
2680
|
}
|
|
2680
2681
|
let processed = 0;
|
|
2681
|
-
const batchTasks =
|
|
2682
|
+
const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
|
|
2682
2683
|
const pageBlocks = [];
|
|
2683
2684
|
try {
|
|
2684
|
-
const
|
|
2685
|
-
for (const
|
|
2685
|
+
const batchImages = [];
|
|
2686
|
+
for (const pageNum of batchPageNums) {
|
|
2687
|
+
const page = await doc.getPage(pageNum);
|
|
2688
|
+
const image = await renderPageToPng(page);
|
|
2689
|
+
batchImages.push({ image, pageNum });
|
|
2690
|
+
}
|
|
2691
|
+
const results = await provider.processBatch(batchImages);
|
|
2692
|
+
for (const { pageNum } of batchImages) {
|
|
2686
2693
|
const result = results.get(pageNum);
|
|
2687
2694
|
pageBlocks.push({
|
|
2688
2695
|
pageNum,
|
|
@@ -2690,16 +2697,16 @@ async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warn
|
|
|
2690
2697
|
});
|
|
2691
2698
|
}
|
|
2692
2699
|
} catch (err) {
|
|
2693
|
-
const range = `${
|
|
2700
|
+
const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
|
|
2694
2701
|
warnings?.push({
|
|
2695
2702
|
message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
|
|
2696
2703
|
code: "OCR_PAGE_FAILED"
|
|
2697
2704
|
});
|
|
2698
|
-
for (const
|
|
2705
|
+
for (const pageNum of batchPageNums) {
|
|
2699
2706
|
pageBlocks.push({ pageNum, blocks: [] });
|
|
2700
2707
|
}
|
|
2701
2708
|
}
|
|
2702
|
-
processed +=
|
|
2709
|
+
processed += batchPageNums.length;
|
|
2703
2710
|
onProgress?.(processed, pageNumbers.length);
|
|
2704
2711
|
return { batchIdx, pageBlocks };
|
|
2705
2712
|
});
|
|
@@ -2752,24 +2759,29 @@ function isPdfFile(buffer) {
|
|
|
2752
2759
|
const b = magicBytes(buffer);
|
|
2753
2760
|
return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
|
|
2754
2761
|
}
|
|
2762
|
+
function isPngFile(buffer) {
|
|
2763
|
+
const b = magicBytes(buffer);
|
|
2764
|
+
return b[0] === 137 && b[1] === 80 && b[2] === 78 && b[3] === 71;
|
|
2765
|
+
}
|
|
2755
2766
|
function detectFormat(buffer) {
|
|
2756
2767
|
if (buffer.byteLength < 4) return "unknown";
|
|
2757
2768
|
if (isZipFile(buffer)) return "hwpx";
|
|
2758
2769
|
if (isOldHwpFile(buffer)) return "hwp";
|
|
2759
2770
|
if (isPdfFile(buffer)) return "pdf";
|
|
2771
|
+
if (isPngFile(buffer)) return "image";
|
|
2760
2772
|
return "unknown";
|
|
2761
2773
|
}
|
|
2762
2774
|
async function detectZipFormat(buffer) {
|
|
2763
2775
|
try {
|
|
2764
2776
|
const zip = await JSZip.loadAsync(buffer);
|
|
2765
|
-
if (zip.file("xl/workbook.xml")) return "xlsx";
|
|
2766
|
-
if (zip.file("word/document.xml")) return "docx";
|
|
2767
|
-
if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
|
|
2777
|
+
if (zip.file("xl/workbook.xml")) return { format: "xlsx", zip };
|
|
2778
|
+
if (zip.file("word/document.xml")) return { format: "docx", zip };
|
|
2779
|
+
if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return { format: "hwpx", zip };
|
|
2768
2780
|
const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
|
|
2769
|
-
if (hasSection) return "hwpx";
|
|
2770
|
-
return "unknown";
|
|
2781
|
+
if (hasSection) return { format: "hwpx", zip };
|
|
2782
|
+
return { format: "unknown", zip: null };
|
|
2771
2783
|
} catch {
|
|
2772
|
-
return "unknown";
|
|
2784
|
+
return { format: "unknown", zip: null };
|
|
2773
2785
|
}
|
|
2774
2786
|
}
|
|
2775
2787
|
|
|
@@ -2778,7 +2790,7 @@ import JSZip2 from "jszip";
|
|
|
2778
2790
|
import { DOMParser } from "@xmldom/xmldom";
|
|
2779
2791
|
|
|
2780
2792
|
// src/utils.ts
|
|
2781
|
-
var VERSION = true ? "2.3.
|
|
2793
|
+
var VERSION = true ? "2.3.3" : "0.0.0-dev";
|
|
2782
2794
|
function toArrayBuffer(buf) {
|
|
2783
2795
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2784
2796
|
return buf.buffer;
|
|
@@ -2938,12 +2950,16 @@ function buildTableDirect(rows, numRows) {
|
|
|
2938
2950
|
return trimAndReturn(grid, numRows, maxCols);
|
|
2939
2951
|
}
|
|
2940
2952
|
function trimAndReturn(grid, numRows, maxCols) {
|
|
2941
|
-
let effectiveCols =
|
|
2942
|
-
|
|
2943
|
-
|
|
2944
|
-
|
|
2945
|
-
|
|
2953
|
+
let effectiveCols = 0;
|
|
2954
|
+
for (const row of grid) {
|
|
2955
|
+
for (let c = row.length - 1; c >= effectiveCols; c--) {
|
|
2956
|
+
if (row[c]?.text?.trim()) {
|
|
2957
|
+
effectiveCols = c + 1;
|
|
2958
|
+
break;
|
|
2959
|
+
}
|
|
2960
|
+
}
|
|
2946
2961
|
}
|
|
2962
|
+
if (effectiveCols === 0) effectiveCols = maxCols;
|
|
2947
2963
|
if (effectiveCols < maxCols && effectiveCols > 0) {
|
|
2948
2964
|
const trimmed = grid.map((row) => row.slice(0, effectiveCols));
|
|
2949
2965
|
return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
|
|
@@ -3200,11 +3216,11 @@ function parseStyleElements(doc, map) {
|
|
|
3200
3216
|
function stripDtd(xml) {
|
|
3201
3217
|
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
3202
3218
|
}
|
|
3203
|
-
async function parseHwpxDocument(buffer, options) {
|
|
3219
|
+
async function parseHwpxDocument(buffer, options, existingZip) {
|
|
3204
3220
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
3205
3221
|
let zip;
|
|
3206
3222
|
try {
|
|
3207
|
-
zip = await JSZip2.loadAsync(buffer);
|
|
3223
|
+
zip = existingZip ?? await JSZip2.loadAsync(buffer);
|
|
3208
3224
|
} catch {
|
|
3209
3225
|
return await extractFromBrokenZip(buffer);
|
|
3210
3226
|
}
|
|
@@ -6216,8 +6232,15 @@ import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mj
|
|
|
6216
6232
|
GlobalWorkerOptions.workerSrc = "";
|
|
6217
6233
|
var MAX_PAGES = 5e3;
|
|
6218
6234
|
var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
|
|
6219
|
-
|
|
6235
|
+
function calcPdfTimeout(bufferSize) {
|
|
6236
|
+
const base = 3e4;
|
|
6237
|
+
const perMb = 500;
|
|
6238
|
+
const mb = bufferSize / (1024 * 1024);
|
|
6239
|
+
return Math.min(base + Math.ceil(mb * perMb), 3e5);
|
|
6240
|
+
}
|
|
6220
6241
|
async function loadPdfWithTimeout(buffer) {
|
|
6242
|
+
const timeoutMs = calcPdfTimeout(buffer.byteLength);
|
|
6243
|
+
const timeoutSec = Math.round(timeoutMs / 1e3);
|
|
6221
6244
|
const loadingTask = getDocument({
|
|
6222
6245
|
data: new Uint8Array(buffer),
|
|
6223
6246
|
useSystemFonts: true,
|
|
@@ -6231,8 +6254,8 @@ async function loadPdfWithTimeout(buffer) {
|
|
|
6231
6254
|
new Promise((_, reject) => {
|
|
6232
6255
|
timer = setTimeout(() => {
|
|
6233
6256
|
loadingTask.destroy();
|
|
6234
|
-
reject(new KordocError(
|
|
6235
|
-
},
|
|
6257
|
+
reject(new KordocError(`PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (${timeoutSec}\uCD08 \uCD08\uACFC)`));
|
|
6258
|
+
}, timeoutMs);
|
|
6236
6259
|
})
|
|
6237
6260
|
]);
|
|
6238
6261
|
} finally {
|
|
@@ -6253,11 +6276,15 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6253
6276
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
6254
6277
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
6255
6278
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
6256
|
-
const
|
|
6279
|
+
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
6257
6280
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
6258
|
-
|
|
6281
|
+
const targetPageNums = [];
|
|
6259
6282
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
6260
6283
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
6284
|
+
targetPageNums.push(i);
|
|
6285
|
+
}
|
|
6286
|
+
let parsedPages = 0;
|
|
6287
|
+
const parseSinglePage = async (i) => {
|
|
6261
6288
|
try {
|
|
6262
6289
|
const page = await doc.getPage(i);
|
|
6263
6290
|
const tc = await page.getTextContent();
|
|
@@ -6270,7 +6297,10 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6270
6297
|
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
6271
6298
|
}
|
|
6272
6299
|
for (const item of visible) {
|
|
6273
|
-
if (item.fontSize > 0)
|
|
6300
|
+
if (item.fontSize > 0) {
|
|
6301
|
+
const rounded = Math.round(item.fontSize * 10) / 10;
|
|
6302
|
+
fontSizeFreq.set(rounded, (fontSizeFreq.get(rounded) || 0) + 1);
|
|
6303
|
+
}
|
|
6274
6304
|
}
|
|
6275
6305
|
const opList = await page.getOperatorList();
|
|
6276
6306
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
@@ -6287,12 +6317,34 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6287
6317
|
if (pageErr instanceof KordocError) throw pageErr;
|
|
6288
6318
|
warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
6289
6319
|
}
|
|
6320
|
+
};
|
|
6321
|
+
const SAMPLE_SIZE = Math.min(10, targetPageNums.length);
|
|
6322
|
+
const sampledIndices = /* @__PURE__ */ new Set();
|
|
6323
|
+
if (targetPageNums.length <= SAMPLE_SIZE) {
|
|
6324
|
+
for (let i = 0; i < targetPageNums.length; i++) sampledIndices.add(i);
|
|
6325
|
+
} else {
|
|
6326
|
+
for (let i = 0; i < SAMPLE_SIZE; i++) {
|
|
6327
|
+
const idx = Math.round(i * (targetPageNums.length - 1) / (SAMPLE_SIZE - 1));
|
|
6328
|
+
sampledIndices.add(idx);
|
|
6329
|
+
}
|
|
6330
|
+
}
|
|
6331
|
+
for (const si of sampledIndices) {
|
|
6332
|
+
await parseSinglePage(targetPageNums[si]);
|
|
6333
|
+
}
|
|
6334
|
+
const sampleParsed = parsedPages || sampledIndices.size;
|
|
6335
|
+
const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
|
|
6336
|
+
if (!isImageBased) {
|
|
6337
|
+
for (let si = 0; si < targetPageNums.length; si++) {
|
|
6338
|
+
if (!sampledIndices.has(si)) {
|
|
6339
|
+
await parseSinglePage(targetPageNums[si]);
|
|
6340
|
+
}
|
|
6341
|
+
}
|
|
6290
6342
|
}
|
|
6291
6343
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
6292
|
-
if (
|
|
6344
|
+
if (isImageBased) {
|
|
6293
6345
|
let ocrProvider = options?.ocr ?? null;
|
|
6294
|
-
const ocrMode = options?.ocrMode;
|
|
6295
|
-
if (!ocrProvider && ocrMode
|
|
6346
|
+
const ocrMode = options?.ocrMode ?? "auto";
|
|
6347
|
+
if (!ocrProvider && ocrMode !== "off") {
|
|
6296
6348
|
try {
|
|
6297
6349
|
const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
|
|
6298
6350
|
const concurrency = options?.ocrConcurrency ?? 1;
|
|
@@ -6344,7 +6396,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6344
6396
|
blocks.splice(removed[ri], 1);
|
|
6345
6397
|
}
|
|
6346
6398
|
}
|
|
6347
|
-
const medianFontSize =
|
|
6399
|
+
const medianFontSize = computeMedianFromFreq(fontSizeFreq);
|
|
6348
6400
|
if (medianFontSize > 0) {
|
|
6349
6401
|
detectHeadings(blocks, medianFontSize);
|
|
6350
6402
|
}
|
|
@@ -6397,11 +6449,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
|
|
|
6397
6449
|
}
|
|
6398
6450
|
return { visible, hiddenCount };
|
|
6399
6451
|
}
|
|
6400
|
-
function
|
|
6401
|
-
if (
|
|
6402
|
-
const
|
|
6403
|
-
|
|
6404
|
-
|
|
6452
|
+
function computeMedianFromFreq(freq) {
|
|
6453
|
+
if (freq.size === 0) return 0;
|
|
6454
|
+
const entries = [...freq.entries()].sort((a, b) => a[0] - b[0]);
|
|
6455
|
+
let total = 0;
|
|
6456
|
+
for (const [, count] of entries) total += count;
|
|
6457
|
+
const mid = total / 2;
|
|
6458
|
+
let cumulative = 0;
|
|
6459
|
+
for (const [size, count] of entries) {
|
|
6460
|
+
cumulative += count;
|
|
6461
|
+
if (cumulative >= mid) return size;
|
|
6462
|
+
}
|
|
6463
|
+
return 0;
|
|
6405
6464
|
}
|
|
6406
6465
|
function detectHeadings(blocks, medianFontSize) {
|
|
6407
6466
|
for (const block of blocks) {
|
|
@@ -7204,6 +7263,7 @@ var MAX_SHEETS = 100;
|
|
|
7204
7263
|
var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
|
|
7205
7264
|
var MAX_ROWS2 = 1e4;
|
|
7206
7265
|
var MAX_COLS2 = 200;
|
|
7266
|
+
var MAX_TOTAL_CELLS = 2e6;
|
|
7207
7267
|
function cleanNumericValue(raw) {
|
|
7208
7268
|
if (!/^-?\d+\.\d+$/.test(raw)) return raw;
|
|
7209
7269
|
const num = parseFloat(raw);
|
|
@@ -7387,9 +7447,9 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
7387
7447
|
}
|
|
7388
7448
|
return blocks;
|
|
7389
7449
|
}
|
|
7390
|
-
async function parseXlsxDocument(buffer, options) {
|
|
7450
|
+
async function parseXlsxDocument(buffer, options, existingZip) {
|
|
7391
7451
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
|
|
7392
|
-
const zip = await JSZip3.loadAsync(buffer);
|
|
7452
|
+
const zip = existingZip ?? await JSZip3.loadAsync(buffer);
|
|
7393
7453
|
const warnings = [];
|
|
7394
7454
|
const workbookFile = zip.file("xl/workbook.xml");
|
|
7395
7455
|
if (!workbookFile) {
|
|
@@ -7416,6 +7476,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
7416
7476
|
}
|
|
7417
7477
|
const blocks = [];
|
|
7418
7478
|
const processedSheets = Math.min(sheets.length, MAX_SHEETS);
|
|
7479
|
+
let totalCells = 0;
|
|
7419
7480
|
for (let i = 0; i < processedSheets; i++) {
|
|
7420
7481
|
if (pageFilter && !pageFilter.has(i + 1)) continue;
|
|
7421
7482
|
const sheet = sheets[i];
|
|
@@ -7442,6 +7503,11 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
7442
7503
|
try {
|
|
7443
7504
|
const sheetXml = await sheetFile.async("text");
|
|
7444
7505
|
const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
|
|
7506
|
+
totalCells += maxRow * maxCol;
|
|
7507
|
+
if (totalCells > MAX_TOTAL_CELLS) {
|
|
7508
|
+
warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
|
|
7509
|
+
break;
|
|
7510
|
+
}
|
|
7445
7511
|
const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
|
|
7446
7512
|
blocks.push(...sheetBlocks);
|
|
7447
7513
|
} catch (err) {
|
|
@@ -7525,10 +7591,35 @@ function getAttr(el, localName) {
|
|
|
7525
7591
|
function parseXml2(text) {
|
|
7526
7592
|
return new DOMParser3().parseFromString(text, "text/xml");
|
|
7527
7593
|
}
|
|
7594
|
+
function buildElementIndex(root) {
|
|
7595
|
+
const index = /* @__PURE__ */ new Map();
|
|
7596
|
+
const walk = (node) => {
|
|
7597
|
+
const children = node.childNodes;
|
|
7598
|
+
for (let i = 0; i < children.length; i++) {
|
|
7599
|
+
const child = children[i];
|
|
7600
|
+
if (child.nodeType === 1) {
|
|
7601
|
+
const el = child;
|
|
7602
|
+
const name = el.localName ?? "";
|
|
7603
|
+
if (name) {
|
|
7604
|
+
let list = index.get(name);
|
|
7605
|
+
if (!list) {
|
|
7606
|
+
list = [];
|
|
7607
|
+
index.set(name, list);
|
|
7608
|
+
}
|
|
7609
|
+
list.push(el);
|
|
7610
|
+
}
|
|
7611
|
+
walk(el);
|
|
7612
|
+
}
|
|
7613
|
+
}
|
|
7614
|
+
};
|
|
7615
|
+
walk(root);
|
|
7616
|
+
return index;
|
|
7617
|
+
}
|
|
7528
7618
|
function parseStyles(xml) {
|
|
7529
7619
|
const doc = parseXml2(xml);
|
|
7530
7620
|
const styles = /* @__PURE__ */ new Map();
|
|
7531
|
-
const
|
|
7621
|
+
const idx = buildElementIndex(doc);
|
|
7622
|
+
const styleElements = idx.get("style") ?? [];
|
|
7532
7623
|
for (const el of styleElements) {
|
|
7533
7624
|
const styleId = getAttr(el, "styleId");
|
|
7534
7625
|
if (!styleId) continue;
|
|
@@ -7556,7 +7647,8 @@ function parseStyles(xml) {
|
|
|
7556
7647
|
function parseNumbering(xml) {
|
|
7557
7648
|
const doc = parseXml2(xml);
|
|
7558
7649
|
const abstractNums = /* @__PURE__ */ new Map();
|
|
7559
|
-
const
|
|
7650
|
+
const idx = buildElementIndex(doc);
|
|
7651
|
+
const abstractElements = idx.get("abstractNum") ?? [];
|
|
7560
7652
|
for (const el of abstractElements) {
|
|
7561
7653
|
const abstractNumId = getAttr(el, "abstractNumId");
|
|
7562
7654
|
if (!abstractNumId) continue;
|
|
@@ -7571,7 +7663,7 @@ function parseNumbering(xml) {
|
|
|
7571
7663
|
abstractNums.set(abstractNumId, levels);
|
|
7572
7664
|
}
|
|
7573
7665
|
const nums = /* @__PURE__ */ new Map();
|
|
7574
|
-
const numElements =
|
|
7666
|
+
const numElements = idx.get("num") ?? [];
|
|
7575
7667
|
for (const el of numElements) {
|
|
7576
7668
|
const numId = getAttr(el, "numId");
|
|
7577
7669
|
if (!numId) continue;
|
|
@@ -7815,9 +7907,9 @@ async function extractImages(zip, rels, doc) {
|
|
|
7815
7907
|
}
|
|
7816
7908
|
return { blocks, images };
|
|
7817
7909
|
}
|
|
7818
|
-
async function parseDocxDocument(buffer, options) {
|
|
7910
|
+
async function parseDocxDocument(buffer, options, existingZip) {
|
|
7819
7911
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
|
|
7820
|
-
const zip = await JSZip4.loadAsync(buffer);
|
|
7912
|
+
const zip = existingZip ?? await JSZip4.loadAsync(buffer);
|
|
7821
7913
|
const warnings = [];
|
|
7822
7914
|
const docFile = zip.file("word/document.xml");
|
|
7823
7915
|
if (!docFile) {
|
|
@@ -7907,6 +7999,11 @@ async function parseDocxDocument(buffer, options) {
|
|
|
7907
7999
|
};
|
|
7908
8000
|
}
|
|
7909
8001
|
|
|
8002
|
+
// src/index.ts
|
|
8003
|
+
init_cli_provider();
|
|
8004
|
+
init_tesseract_provider();
|
|
8005
|
+
init_markdown_to_blocks();
|
|
8006
|
+
|
|
7910
8007
|
// src/diff/text-diff.ts
|
|
7911
8008
|
function similarity(a, b) {
|
|
7912
8009
|
if (a === b) return 1;
|
|
@@ -10423,25 +10520,86 @@ async function parse2(input, options) {
|
|
|
10423
10520
|
if (!buffer || buffer.byteLength === 0) {
|
|
10424
10521
|
return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
|
|
10425
10522
|
}
|
|
10523
|
+
const MAX_FILE_SIZE = 500 * 1024 * 1024;
|
|
10524
|
+
if (buffer.byteLength > MAX_FILE_SIZE) {
|
|
10525
|
+
return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
|
|
10526
|
+
}
|
|
10426
10527
|
const format = detectFormat(buffer);
|
|
10427
10528
|
switch (format) {
|
|
10428
10529
|
case "hwpx": {
|
|
10429
|
-
const zipFormat = await detectZipFormat(buffer);
|
|
10430
|
-
if (zipFormat === "xlsx") return parseXlsx(buffer, options);
|
|
10431
|
-
if (zipFormat === "docx") return parseDocx(buffer, options);
|
|
10432
|
-
return parseHwpx(buffer, options);
|
|
10530
|
+
const { format: zipFormat, zip } = await detectZipFormat(buffer);
|
|
10531
|
+
if (zipFormat === "xlsx") return parseXlsx(buffer, options, zip ?? void 0);
|
|
10532
|
+
if (zipFormat === "docx") return parseDocx(buffer, options, zip ?? void 0);
|
|
10533
|
+
return parseHwpx(buffer, options, zip ?? void 0);
|
|
10433
10534
|
}
|
|
10434
10535
|
case "hwp":
|
|
10435
10536
|
return parseHwp(buffer, options);
|
|
10436
10537
|
case "pdf":
|
|
10437
10538
|
return parsePdf(buffer, options);
|
|
10539
|
+
case "image":
|
|
10540
|
+
return parseImage(buffer, options);
|
|
10438
10541
|
default:
|
|
10439
10542
|
return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
|
|
10440
10543
|
}
|
|
10441
10544
|
}
|
|
10442
|
-
async function
|
|
10545
|
+
async function parseImage(buffer, options) {
|
|
10546
|
+
const ocrMode = options?.ocrMode || "auto";
|
|
10547
|
+
if (ocrMode === "off") {
|
|
10548
|
+
return { success: false, fileType: "image", error: "OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC774\uBBF8\uC9C0 \uD30C\uC77C\uC744 \uCC98\uB9AC\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "IMAGE_BASED_PDF" };
|
|
10549
|
+
}
|
|
10550
|
+
let ocrProvider;
|
|
10551
|
+
let actualOcrMode = "auto";
|
|
10552
|
+
try {
|
|
10553
|
+
if (ocrMode === "gemini" || ocrMode === "claude" || ocrMode === "codex" || ocrMode === "ollama") {
|
|
10554
|
+
ocrProvider = createCliOcrProvider(ocrMode);
|
|
10555
|
+
actualOcrMode = ocrMode;
|
|
10556
|
+
} else if (ocrMode === "tesseract") {
|
|
10557
|
+
ocrProvider = await createTesseractProvider();
|
|
10558
|
+
actualOcrMode = ocrMode;
|
|
10559
|
+
} else if (ocrMode === "auto") {
|
|
10560
|
+
const modesToTry = ["gemini", "claude", "codex", "ollama"];
|
|
10561
|
+
for (const mode of modesToTry) {
|
|
10562
|
+
try {
|
|
10563
|
+
ocrProvider = createCliOcrProvider(mode);
|
|
10564
|
+
actualOcrMode = mode;
|
|
10565
|
+
break;
|
|
10566
|
+
} catch (e) {
|
|
10567
|
+
console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
|
|
10568
|
+
}
|
|
10569
|
+
}
|
|
10570
|
+
if (!ocrProvider) {
|
|
10571
|
+
ocrProvider = await createTesseractProvider();
|
|
10572
|
+
actualOcrMode = "tesseract";
|
|
10573
|
+
}
|
|
10574
|
+
}
|
|
10575
|
+
if (!ocrProvider) {
|
|
10576
|
+
return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
|
|
10577
|
+
}
|
|
10578
|
+
const imageUint8Array = new Uint8Array(buffer);
|
|
10579
|
+
const ocrResult = await ocrProvider(imageUint8Array, 1, "image/png");
|
|
10580
|
+
if (ocrProvider.terminate) {
|
|
10581
|
+
await ocrProvider.terminate();
|
|
10582
|
+
}
|
|
10583
|
+
const markdown = typeof ocrResult === "string" ? ocrResult : ocrResult.markdown;
|
|
10584
|
+
const blocks = markdownToBlocks(markdown, 1);
|
|
10585
|
+
return {
|
|
10586
|
+
success: true,
|
|
10587
|
+
fileType: "image",
|
|
10588
|
+
markdown,
|
|
10589
|
+
blocks,
|
|
10590
|
+
isImageBased: true,
|
|
10591
|
+
warnings: [{ message: `OCR \uCC98\uB9AC\uB428 (${actualOcrMode})`, code: "OCR_FALLBACK" }]
|
|
10592
|
+
};
|
|
10593
|
+
} catch (err) {
|
|
10594
|
+
if (ocrProvider && ocrProvider.terminate) {
|
|
10595
|
+
await ocrProvider.terminate();
|
|
10596
|
+
}
|
|
10597
|
+
return { success: false, fileType: "image", error: err instanceof Error ? err.message : "\uC774\uBBF8\uC9C0 OCR \uC2E4\uD328", code: classifyError(err) };
|
|
10598
|
+
}
|
|
10599
|
+
}
|
|
10600
|
+
async function parseHwpx(buffer, options, zip) {
|
|
10443
10601
|
try {
|
|
10444
|
-
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
|
|
10602
|
+
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
|
|
10445
10603
|
return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
10446
10604
|
} catch (err) {
|
|
10447
10605
|
return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
@@ -10464,17 +10622,17 @@ async function parsePdf(buffer, options) {
|
|
|
10464
10622
|
return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
|
|
10465
10623
|
}
|
|
10466
10624
|
}
|
|
10467
|
-
async function parseXlsx(buffer, options) {
|
|
10625
|
+
async function parseXlsx(buffer, options, zip) {
|
|
10468
10626
|
try {
|
|
10469
|
-
const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options);
|
|
10627
|
+
const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
|
|
10470
10628
|
return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
|
|
10471
10629
|
} catch (err) {
|
|
10472
10630
|
return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
10473
10631
|
}
|
|
10474
10632
|
}
|
|
10475
|
-
async function parseDocx(buffer, options) {
|
|
10633
|
+
async function parseDocx(buffer, options, zip) {
|
|
10476
10634
|
try {
|
|
10477
|
-
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
|
|
10635
|
+
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
|
|
10478
10636
|
return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
10479
10637
|
} catch (err) {
|
|
10480
10638
|
return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|