@clazic/kordoc 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{batch-provider-FUCIIS4M.js → batch-provider-PNDCSGQW.js} +59 -30
- package/dist/batch-provider-PNDCSGQW.js.map +1 -0
- package/dist/{chunk-2ZGLFZCN.js → chunk-2GFJFTKS.js} +193 -49
- package/dist/chunk-2GFJFTKS.js.map +1 -0
- package/dist/chunk-4PP34NVQ.js +121 -0
- package/dist/chunk-4PP34NVQ.js.map +1 -0
- package/dist/{tesseract-provider-WCVJWBUT.js → chunk-7FMKAV4P.js} +4 -4
- package/dist/{tesseract-provider-WCVJWBUT.js.map → chunk-7FMKAV4P.js.map} +1 -1
- package/dist/chunk-JOGAFNIL.js +153 -0
- package/dist/chunk-JOGAFNIL.js.map +1 -0
- package/dist/{chunk-WWILSVMJ.js → chunk-STIKJGEA.js} +2 -2
- package/dist/cli.js +10 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +291 -103
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +11 -6
- package/dist/index.d.ts +11 -6
- package/dist/index.js +292 -104
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -2
- package/dist/mcp.js.map +1 -1
- package/dist/{provider-OBY3XFSZ.js → provider-HE727F7Z.js} +38 -139
- package/dist/provider-HE727F7Z.js.map +1 -0
- package/dist/resolve-QA3VACUP.js +111 -0
- package/dist/resolve-QA3VACUP.js.map +1 -0
- package/dist/tesseract-provider-MNMZPSGF.js +11 -0
- package/dist/{utils-QAK24RJS.js → utils-FFUQJTTI.js} +2 -2
- package/dist/utils-FFUQJTTI.js.map +1 -0
- package/dist/{watch-MPHX3QIH.js → watch-2O32L6IF.js} +6 -3
- package/dist/{watch-MPHX3QIH.js.map → watch-2O32L6IF.js.map} +1 -1
- package/package.json +1 -1
- package/dist/batch-provider-FUCIIS4M.js.map +0 -1
- package/dist/chunk-2ZGLFZCN.js.map +0 -1
- package/dist/provider-OBY3XFSZ.js.map +0 -1
- package/dist/resolve-LBFYRHJI.js +0 -247
- package/dist/resolve-LBFYRHJI.js.map +0 -1
- /package/dist/{chunk-WWILSVMJ.js.map → chunk-STIKJGEA.js.map} +0 -0
- /package/dist/{utils-QAK24RJS.js.map → tesseract-provider-MNMZPSGF.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -1993,8 +1993,8 @@ function getTesseractFallbackMessage() {
|
|
|
1993
1993
|
"\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 \uB0B4\uC7A5 tesseract.js\uB85C OCR\uC744 \uC218\uD589\uD569\uB2C8\uB2E4.",
|
|
1994
1994
|
"\uB354 \uB098\uC740 \uD488\uC9C8(\uD14C\uC774\uBE14/\uD5E4\uB529 \uAD6C\uC870 \uBCF4\uC874)\uC744 \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
|
|
1995
1995
|
"",
|
|
1996
|
-
" [\uAD8C\uC7A5]
|
|
1997
|
-
"
|
|
1996
|
+
" [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
|
|
1997
|
+
" Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
|
|
1998
1998
|
" Claude CLI: npm install -g @anthropic-ai/claude-code",
|
|
1999
1999
|
" Ollama: brew install ollama (+ ollama pull gemma4:27b)"
|
|
2000
2000
|
].join("\n");
|
|
@@ -2004,7 +2004,7 @@ var init_auto_detect = __esm({
|
|
|
2004
2004
|
"src/ocr/auto-detect.ts"() {
|
|
2005
2005
|
"use strict";
|
|
2006
2006
|
import_child_process = require("child_process");
|
|
2007
|
-
CLI_PRIORITY = ["
|
|
2007
|
+
CLI_PRIORITY = ["codex", "gemini", "claude", "ollama"];
|
|
2008
2008
|
}
|
|
2009
2009
|
});
|
|
2010
2010
|
|
|
@@ -2043,7 +2043,7 @@ function callCli(mode, imagePath) {
|
|
|
2043
2043
|
const args = buildCliArgs(mode, imagePath);
|
|
2044
2044
|
const result = (0, import_child_process2.spawnSync)(mode, args, {
|
|
2045
2045
|
encoding: "utf-8",
|
|
2046
|
-
timeout:
|
|
2046
|
+
timeout: 6e5,
|
|
2047
2047
|
maxBuffer: 10 * 1024 * 1024,
|
|
2048
2048
|
// claude: /tmp에서 실행하여 프로젝트 CLAUDE.md의 규칙 간섭 방지
|
|
2049
2049
|
...mode === "claude" ? { cwd: (0, import_os.tmpdir)() } : {}
|
|
@@ -2137,7 +2137,7 @@ async function callOllamaApi(imagePath) {
|
|
|
2137
2137
|
return data.message?.content || "";
|
|
2138
2138
|
}
|
|
2139
2139
|
function stripCodeFence(text) {
|
|
2140
|
-
const match = text.match(/^```(?:markdown|md)?\s
|
|
2140
|
+
const match = text.match(/^```(?:markdown|md)?\s*([\s\S]*?)```\s*$/m);
|
|
2141
2141
|
return match ? match[1].trim() : text;
|
|
2142
2142
|
}
|
|
2143
2143
|
var import_child_process2, import_fs, import_path, import_os, OCR_PROMPT, _tempDir;
|
|
@@ -2148,7 +2148,15 @@ var init_cli_provider = __esm({
|
|
|
2148
2148
|
import_fs = require("fs");
|
|
2149
2149
|
import_path = require("path");
|
|
2150
2150
|
import_os = require("os");
|
|
2151
|
-
OCR_PROMPT =
|
|
2151
|
+
OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.
|
|
2152
|
+
\uADDC\uCE59:
|
|
2153
|
+
- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)
|
|
2154
|
+
- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC
|
|
2155
|
+
- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9
|
|
2156
|
+
- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9
|
|
2157
|
+
- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC
|
|
2158
|
+
- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0
|
|
2159
|
+
- \`\`\`\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825`;
|
|
2152
2160
|
_tempDir = null;
|
|
2153
2161
|
}
|
|
2154
2162
|
});
|
|
@@ -2242,9 +2250,9 @@ function createBatchCliProvider(mode, batchSize) {
|
|
|
2242
2250
|
}
|
|
2243
2251
|
let output;
|
|
2244
2252
|
if (mode === "codex") {
|
|
2245
|
-
output = callBatchCodexCli(tempFiles);
|
|
2253
|
+
output = await callBatchCodexCli(tempFiles);
|
|
2246
2254
|
} else {
|
|
2247
|
-
output = callBatchCli(mode, tempFiles);
|
|
2255
|
+
output = await callBatchCli(mode, tempFiles);
|
|
2248
2256
|
}
|
|
2249
2257
|
const cleaned = stripCodeFence2(output.trim());
|
|
2250
2258
|
const parts = cleaned.split(/<!--\s*PAGE_BREAK\s*-->/).map((p) => p.trim()).filter((p) => p.length > 0);
|
|
@@ -2266,40 +2274,74 @@ function createBatchCliProvider(mode, batchSize) {
|
|
|
2266
2274
|
}
|
|
2267
2275
|
};
|
|
2268
2276
|
}
|
|
2269
|
-
function
|
|
2277
|
+
function spawnAsync(cmd, args, opts) {
|
|
2278
|
+
return new Promise((resolve, reject) => {
|
|
2279
|
+
const child = (0, import_child_process3.spawn)(cmd, args, {
|
|
2280
|
+
cwd: opts.cwd,
|
|
2281
|
+
env: process.env,
|
|
2282
|
+
stdio: ["pipe", "pipe", "pipe"]
|
|
2283
|
+
});
|
|
2284
|
+
let stdout = "";
|
|
2285
|
+
let stderr = "";
|
|
2286
|
+
let killed = false;
|
|
2287
|
+
child.stdout.setEncoding("utf-8");
|
|
2288
|
+
child.stderr.setEncoding("utf-8");
|
|
2289
|
+
child.stdout.on("data", (d) => {
|
|
2290
|
+
stdout += d;
|
|
2291
|
+
});
|
|
2292
|
+
child.stderr.on("data", (d) => {
|
|
2293
|
+
stderr += d;
|
|
2294
|
+
});
|
|
2295
|
+
const timer = setTimeout(() => {
|
|
2296
|
+
killed = true;
|
|
2297
|
+
child.kill("SIGTERM");
|
|
2298
|
+
}, opts.timeoutMs);
|
|
2299
|
+
if (opts.stdin !== void 0) {
|
|
2300
|
+
child.stdin.end(opts.stdin);
|
|
2301
|
+
} else {
|
|
2302
|
+
child.stdin.end();
|
|
2303
|
+
}
|
|
2304
|
+
child.on("close", (code) => {
|
|
2305
|
+
clearTimeout(timer);
|
|
2306
|
+
if (killed) {
|
|
2307
|
+
reject(new Error(`\uD0C0\uC784\uC544\uC6C3 (${Math.round(opts.timeoutMs / 1e3)}\uCD08)`));
|
|
2308
|
+
} else {
|
|
2309
|
+
resolve({ stdout, stderr, exitCode: code ?? 1 });
|
|
2310
|
+
}
|
|
2311
|
+
});
|
|
2312
|
+
child.on("error", (err) => {
|
|
2313
|
+
clearTimeout(timer);
|
|
2314
|
+
reject(err);
|
|
2315
|
+
});
|
|
2316
|
+
});
|
|
2317
|
+
}
|
|
2318
|
+
async function callBatchCli(mode, imagePaths) {
|
|
2270
2319
|
const fileRefs = imagePaths.map((p) => `@${p}`).join("\n");
|
|
2271
2320
|
const prompt = `${BATCH_OCR_PROMPT}
|
|
2272
2321
|
|
|
2273
2322
|
${fileRefs}`;
|
|
2274
2323
|
let args;
|
|
2275
2324
|
if (mode === "gemini") {
|
|
2276
|
-
|
|
2277
|
-
|
|
2278
|
-
if (model) args.push("--model", model);
|
|
2325
|
+
const model = process.env.KORDOC_GEMINI_MODEL ?? "gemini-2.5-flash";
|
|
2326
|
+
args = ["--prompt", prompt, "--yolo", "--model", model];
|
|
2279
2327
|
} else {
|
|
2280
2328
|
args = ["--print", prompt];
|
|
2281
2329
|
const model = process.env.KORDOC_CLAUDE_MODEL;
|
|
2282
2330
|
if (model) args.push("--model", model);
|
|
2283
2331
|
}
|
|
2284
2332
|
const timeoutMs = 6e4 + imagePaths.length * 2e4;
|
|
2285
|
-
const result =
|
|
2286
|
-
|
|
2287
|
-
timeout: timeoutMs,
|
|
2288
|
-
maxBuffer: 50 * 1024 * 1024,
|
|
2289
|
-
// 50MB (large batch output)
|
|
2333
|
+
const result = await spawnAsync(mode, args, {
|
|
2334
|
+
timeoutMs,
|
|
2290
2335
|
...mode === "claude" ? { cwd: (0, import_os2.tmpdir)() } : {}
|
|
2291
2336
|
});
|
|
2292
|
-
if (result.
|
|
2293
|
-
|
|
2294
|
-
}
|
|
2295
|
-
if (result.status !== 0) {
|
|
2296
|
-
const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
|
|
2337
|
+
if (result.exitCode !== 0) {
|
|
2338
|
+
const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
|
|
2297
2339
|
throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
|
|
2298
2340
|
}
|
|
2299
2341
|
return result.stdout || "";
|
|
2300
2342
|
}
|
|
2301
|
-
function callBatchCodexCli(imagePaths) {
|
|
2302
|
-
const outPath = (0, import_path2.join)((0, import_os2.tmpdir)(), `kordoc-codex-batch-${Date.now()}.txt`);
|
|
2343
|
+
async function callBatchCodexCli(imagePaths) {
|
|
2344
|
+
const outPath = (0, import_path2.join)((0, import_os2.tmpdir)(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`);
|
|
2303
2345
|
try {
|
|
2304
2346
|
const args = ["exec", BATCH_OCR_PROMPT];
|
|
2305
2347
|
for (const p of imagePaths) {
|
|
@@ -2309,17 +2351,12 @@ function callBatchCodexCli(imagePaths) {
|
|
|
2309
2351
|
const model = process.env.KORDOC_CODEX_MODEL;
|
|
2310
2352
|
if (model) args.push("--model", model);
|
|
2311
2353
|
const timeoutMs = 6e4 + imagePaths.length * 2e4;
|
|
2312
|
-
const result =
|
|
2313
|
-
|
|
2314
|
-
|
|
2315
|
-
maxBuffer: 50 * 1024 * 1024,
|
|
2316
|
-
input: ""
|
|
2354
|
+
const result = await spawnAsync("codex", args, {
|
|
2355
|
+
timeoutMs,
|
|
2356
|
+
stdin: ""
|
|
2317
2357
|
});
|
|
2318
|
-
if (result.
|
|
2319
|
-
|
|
2320
|
-
}
|
|
2321
|
-
if (result.status !== 0) {
|
|
2322
|
-
const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
|
|
2358
|
+
if (result.exitCode !== 0) {
|
|
2359
|
+
const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
|
|
2323
2360
|
throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
|
|
2324
2361
|
}
|
|
2325
2362
|
try {
|
|
@@ -2581,7 +2618,7 @@ function isBatchProvider(p) {
|
|
|
2581
2618
|
async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
|
|
2582
2619
|
const blocks = [];
|
|
2583
2620
|
if (isBatchProvider(provider)) {
|
|
2584
|
-
return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, onProgress);
|
|
2621
|
+
return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, onProgress);
|
|
2585
2622
|
}
|
|
2586
2623
|
if (concurrency <= 1) {
|
|
2587
2624
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
@@ -2628,43 +2665,54 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
|
|
|
2628
2665
|
}
|
|
2629
2666
|
return blocks;
|
|
2630
2667
|
}
|
|
2631
|
-
async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, onProgress) {
|
|
2632
|
-
const blocks = [];
|
|
2668
|
+
async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
|
|
2633
2669
|
const pageNumbers = [];
|
|
2634
2670
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
2635
2671
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
2636
2672
|
pageNumbers.push(i);
|
|
2637
2673
|
}
|
|
2638
|
-
const
|
|
2639
|
-
for (
|
|
2640
|
-
|
|
2641
|
-
const image = await renderPageToPng(page);
|
|
2642
|
-
pageImages.push({ image, pageNum });
|
|
2643
|
-
}
|
|
2644
|
-
const batches = [];
|
|
2645
|
-
for (let i = 0; i < pageImages.length; i += provider.batchSize) {
|
|
2646
|
-
batches.push(pageImages.slice(i, i + provider.batchSize));
|
|
2674
|
+
const pageBatches = [];
|
|
2675
|
+
for (let i = 0; i < pageNumbers.length; i += provider.batchSize) {
|
|
2676
|
+
pageBatches.push(pageNumbers.slice(i, i + provider.batchSize));
|
|
2647
2677
|
}
|
|
2648
2678
|
let processed = 0;
|
|
2649
|
-
|
|
2679
|
+
const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
|
|
2680
|
+
const pageBlocks = [];
|
|
2650
2681
|
try {
|
|
2651
|
-
const
|
|
2652
|
-
for (const
|
|
2682
|
+
const batchImages = [];
|
|
2683
|
+
for (const pageNum of batchPageNums) {
|
|
2684
|
+
const page = await doc.getPage(pageNum);
|
|
2685
|
+
const image = await renderPageToPng(page);
|
|
2686
|
+
batchImages.push({ image, pageNum });
|
|
2687
|
+
}
|
|
2688
|
+
const results = await provider.processBatch(batchImages);
|
|
2689
|
+
for (const { pageNum } of batchImages) {
|
|
2653
2690
|
const result = results.get(pageNum);
|
|
2654
|
-
|
|
2655
|
-
|
|
2656
|
-
|
|
2657
|
-
|
|
2658
|
-
onProgress?.(processed, pageNumbers.length);
|
|
2691
|
+
pageBlocks.push({
|
|
2692
|
+
pageNum,
|
|
2693
|
+
blocks: result ? ocrResultToBlocks(result, pageNum) : []
|
|
2694
|
+
});
|
|
2659
2695
|
}
|
|
2660
2696
|
} catch (err) {
|
|
2661
|
-
const range = `${
|
|
2697
|
+
const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
|
|
2662
2698
|
warnings?.push({
|
|
2663
2699
|
message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
|
|
2664
2700
|
code: "OCR_PAGE_FAILED"
|
|
2665
2701
|
});
|
|
2666
|
-
|
|
2667
|
-
|
|
2702
|
+
for (const pageNum of batchPageNums) {
|
|
2703
|
+
pageBlocks.push({ pageNum, blocks: [] });
|
|
2704
|
+
}
|
|
2705
|
+
}
|
|
2706
|
+
processed += batchPageNums.length;
|
|
2707
|
+
onProgress?.(processed, pageNumbers.length);
|
|
2708
|
+
return { batchIdx, pageBlocks };
|
|
2709
|
+
});
|
|
2710
|
+
const effectiveConcurrency = Math.max(1, concurrency);
|
|
2711
|
+
const batchResults = await runWithConcurrency(batchTasks, effectiveConcurrency);
|
|
2712
|
+
const blocks = [];
|
|
2713
|
+
for (const result of batchResults) {
|
|
2714
|
+
for (const { blocks: pageBlks } of result.pageBlocks) {
|
|
2715
|
+
for (const b of pageBlks) blocks.push(b);
|
|
2668
2716
|
}
|
|
2669
2717
|
}
|
|
2670
2718
|
return blocks;
|
|
@@ -2731,24 +2779,29 @@ function isPdfFile(buffer) {
|
|
|
2731
2779
|
const b = magicBytes(buffer);
|
|
2732
2780
|
return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
|
|
2733
2781
|
}
|
|
2782
|
+
function isPngFile(buffer) {
|
|
2783
|
+
const b = magicBytes(buffer);
|
|
2784
|
+
return b[0] === 137 && b[1] === 80 && b[2] === 78 && b[3] === 71;
|
|
2785
|
+
}
|
|
2734
2786
|
function detectFormat(buffer) {
|
|
2735
2787
|
if (buffer.byteLength < 4) return "unknown";
|
|
2736
2788
|
if (isZipFile(buffer)) return "hwpx";
|
|
2737
2789
|
if (isOldHwpFile(buffer)) return "hwp";
|
|
2738
2790
|
if (isPdfFile(buffer)) return "pdf";
|
|
2791
|
+
if (isPngFile(buffer)) return "image";
|
|
2739
2792
|
return "unknown";
|
|
2740
2793
|
}
|
|
2741
2794
|
async function detectZipFormat(buffer) {
|
|
2742
2795
|
try {
|
|
2743
2796
|
const zip = await import_jszip.default.loadAsync(buffer);
|
|
2744
|
-
if (zip.file("xl/workbook.xml")) return "xlsx";
|
|
2745
|
-
if (zip.file("word/document.xml")) return "docx";
|
|
2746
|
-
if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
|
|
2797
|
+
if (zip.file("xl/workbook.xml")) return { format: "xlsx", zip };
|
|
2798
|
+
if (zip.file("word/document.xml")) return { format: "docx", zip };
|
|
2799
|
+
if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return { format: "hwpx", zip };
|
|
2747
2800
|
const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
|
|
2748
|
-
if (hasSection) return "hwpx";
|
|
2749
|
-
return "unknown";
|
|
2801
|
+
if (hasSection) return { format: "hwpx", zip };
|
|
2802
|
+
return { format: "unknown", zip: null };
|
|
2750
2803
|
} catch {
|
|
2751
|
-
return "unknown";
|
|
2804
|
+
return { format: "unknown", zip: null };
|
|
2752
2805
|
}
|
|
2753
2806
|
}
|
|
2754
2807
|
|
|
@@ -2757,7 +2810,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
|
|
|
2757
2810
|
var import_xmldom = require("@xmldom/xmldom");
|
|
2758
2811
|
|
|
2759
2812
|
// src/utils.ts
|
|
2760
|
-
var VERSION = true ? "2.3.
|
|
2813
|
+
var VERSION = true ? "2.3.2" : "0.0.0-dev";
|
|
2761
2814
|
function toArrayBuffer(buf) {
|
|
2762
2815
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2763
2816
|
return buf.buffer;
|
|
@@ -2917,12 +2970,16 @@ function buildTableDirect(rows, numRows) {
|
|
|
2917
2970
|
return trimAndReturn(grid, numRows, maxCols);
|
|
2918
2971
|
}
|
|
2919
2972
|
function trimAndReturn(grid, numRows, maxCols) {
|
|
2920
|
-
let effectiveCols =
|
|
2921
|
-
|
|
2922
|
-
|
|
2923
|
-
|
|
2924
|
-
|
|
2973
|
+
let effectiveCols = 0;
|
|
2974
|
+
for (const row of grid) {
|
|
2975
|
+
for (let c = row.length - 1; c >= effectiveCols; c--) {
|
|
2976
|
+
if (row[c]?.text?.trim()) {
|
|
2977
|
+
effectiveCols = c + 1;
|
|
2978
|
+
break;
|
|
2979
|
+
}
|
|
2980
|
+
}
|
|
2925
2981
|
}
|
|
2982
|
+
if (effectiveCols === 0) effectiveCols = maxCols;
|
|
2926
2983
|
if (effectiveCols < maxCols && effectiveCols > 0) {
|
|
2927
2984
|
const trimmed = grid.map((row) => row.slice(0, effectiveCols));
|
|
2928
2985
|
return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
|
|
@@ -3179,11 +3236,11 @@ function parseStyleElements(doc, map) {
|
|
|
3179
3236
|
function stripDtd(xml) {
|
|
3180
3237
|
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
3181
3238
|
}
|
|
3182
|
-
async function parseHwpxDocument(buffer, options) {
|
|
3239
|
+
async function parseHwpxDocument(buffer, options, existingZip) {
|
|
3183
3240
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
3184
3241
|
let zip;
|
|
3185
3242
|
try {
|
|
3186
|
-
zip = await import_jszip2.default.loadAsync(buffer);
|
|
3243
|
+
zip = existingZip ?? await import_jszip2.default.loadAsync(buffer);
|
|
3187
3244
|
} catch {
|
|
3188
3245
|
return await extractFromBrokenZip(buffer);
|
|
3189
3246
|
}
|
|
@@ -6195,8 +6252,15 @@ var import_pdf2 = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
|
6195
6252
|
import_pdf2.GlobalWorkerOptions.workerSrc = "";
|
|
6196
6253
|
var MAX_PAGES = 5e3;
|
|
6197
6254
|
var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
|
|
6198
|
-
|
|
6255
|
+
function calcPdfTimeout(bufferSize) {
|
|
6256
|
+
const base = 3e4;
|
|
6257
|
+
const perMb = 500;
|
|
6258
|
+
const mb = bufferSize / (1024 * 1024);
|
|
6259
|
+
return Math.min(base + Math.ceil(mb * perMb), 3e5);
|
|
6260
|
+
}
|
|
6199
6261
|
async function loadPdfWithTimeout(buffer) {
|
|
6262
|
+
const timeoutMs = calcPdfTimeout(buffer.byteLength);
|
|
6263
|
+
const timeoutSec = Math.round(timeoutMs / 1e3);
|
|
6200
6264
|
const loadingTask = (0, import_pdf2.getDocument)({
|
|
6201
6265
|
data: new Uint8Array(buffer),
|
|
6202
6266
|
useSystemFonts: true,
|
|
@@ -6210,8 +6274,8 @@ async function loadPdfWithTimeout(buffer) {
|
|
|
6210
6274
|
new Promise((_, reject) => {
|
|
6211
6275
|
timer = setTimeout(() => {
|
|
6212
6276
|
loadingTask.destroy();
|
|
6213
|
-
reject(new KordocError(
|
|
6214
|
-
},
|
|
6277
|
+
reject(new KordocError(`PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (${timeoutSec}\uCD08 \uCD08\uACFC)`));
|
|
6278
|
+
}, timeoutMs);
|
|
6215
6279
|
})
|
|
6216
6280
|
]);
|
|
6217
6281
|
} finally {
|
|
@@ -6232,11 +6296,15 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6232
6296
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
6233
6297
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
6234
6298
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
6235
|
-
const
|
|
6299
|
+
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
6236
6300
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
6237
|
-
|
|
6301
|
+
const targetPageNums = [];
|
|
6238
6302
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
6239
6303
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
6304
|
+
targetPageNums.push(i);
|
|
6305
|
+
}
|
|
6306
|
+
let parsedPages = 0;
|
|
6307
|
+
const parseSinglePage = async (i) => {
|
|
6240
6308
|
try {
|
|
6241
6309
|
const page = await doc.getPage(i);
|
|
6242
6310
|
const tc = await page.getTextContent();
|
|
@@ -6249,7 +6317,10 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6249
6317
|
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
6250
6318
|
}
|
|
6251
6319
|
for (const item of visible) {
|
|
6252
|
-
if (item.fontSize > 0)
|
|
6320
|
+
if (item.fontSize > 0) {
|
|
6321
|
+
const rounded = Math.round(item.fontSize * 10) / 10;
|
|
6322
|
+
fontSizeFreq.set(rounded, (fontSizeFreq.get(rounded) || 0) + 1);
|
|
6323
|
+
}
|
|
6253
6324
|
}
|
|
6254
6325
|
const opList = await page.getOperatorList();
|
|
6255
6326
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
@@ -6266,12 +6337,23 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6266
6337
|
if (pageErr instanceof KordocError) throw pageErr;
|
|
6267
6338
|
warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
6268
6339
|
}
|
|
6340
|
+
};
|
|
6341
|
+
const sampleCount = Math.min(5, targetPageNums.length);
|
|
6342
|
+
for (let si = 0; si < sampleCount; si++) {
|
|
6343
|
+
await parseSinglePage(targetPageNums[si]);
|
|
6344
|
+
}
|
|
6345
|
+
const sampleParsed = parsedPages || sampleCount;
|
|
6346
|
+
const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
|
|
6347
|
+
if (!isImageBased) {
|
|
6348
|
+
for (let si = sampleCount; si < targetPageNums.length; si++) {
|
|
6349
|
+
await parseSinglePage(targetPageNums[si]);
|
|
6350
|
+
}
|
|
6269
6351
|
}
|
|
6270
6352
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
6271
|
-
if (
|
|
6353
|
+
if (isImageBased) {
|
|
6272
6354
|
let ocrProvider = options?.ocr ?? null;
|
|
6273
|
-
const ocrMode = options?.ocrMode;
|
|
6274
|
-
if (!ocrProvider && ocrMode
|
|
6355
|
+
const ocrMode = options?.ocrMode ?? "auto";
|
|
6356
|
+
if (!ocrProvider && ocrMode !== "off") {
|
|
6275
6357
|
try {
|
|
6276
6358
|
const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
|
|
6277
6359
|
const concurrency = options?.ocrConcurrency ?? 1;
|
|
@@ -6323,7 +6405,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6323
6405
|
blocks.splice(removed[ri], 1);
|
|
6324
6406
|
}
|
|
6325
6407
|
}
|
|
6326
|
-
const medianFontSize =
|
|
6408
|
+
const medianFontSize = computeMedianFromFreq(fontSizeFreq);
|
|
6327
6409
|
if (medianFontSize > 0) {
|
|
6328
6410
|
detectHeadings(blocks, medianFontSize);
|
|
6329
6411
|
}
|
|
@@ -6376,11 +6458,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
|
|
|
6376
6458
|
}
|
|
6377
6459
|
return { visible, hiddenCount };
|
|
6378
6460
|
}
|
|
6379
|
-
function
|
|
6380
|
-
if (
|
|
6381
|
-
const
|
|
6382
|
-
|
|
6383
|
-
|
|
6461
|
+
function computeMedianFromFreq(freq) {
|
|
6462
|
+
if (freq.size === 0) return 0;
|
|
6463
|
+
const entries = [...freq.entries()].sort((a, b) => a[0] - b[0]);
|
|
6464
|
+
let total = 0;
|
|
6465
|
+
for (const [, count] of entries) total += count;
|
|
6466
|
+
const mid = total / 2;
|
|
6467
|
+
let cumulative = 0;
|
|
6468
|
+
for (const [size, count] of entries) {
|
|
6469
|
+
cumulative += count;
|
|
6470
|
+
if (cumulative >= mid) return size;
|
|
6471
|
+
}
|
|
6472
|
+
return 0;
|
|
6384
6473
|
}
|
|
6385
6474
|
function detectHeadings(blocks, medianFontSize) {
|
|
6386
6475
|
for (const block of blocks) {
|
|
@@ -7183,6 +7272,7 @@ var MAX_SHEETS = 100;
|
|
|
7183
7272
|
var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
|
|
7184
7273
|
var MAX_ROWS2 = 1e4;
|
|
7185
7274
|
var MAX_COLS2 = 200;
|
|
7275
|
+
var MAX_TOTAL_CELLS = 2e6;
|
|
7186
7276
|
function cleanNumericValue(raw) {
|
|
7187
7277
|
if (!/^-?\d+\.\d+$/.test(raw)) return raw;
|
|
7188
7278
|
const num = parseFloat(raw);
|
|
@@ -7366,9 +7456,9 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
7366
7456
|
}
|
|
7367
7457
|
return blocks;
|
|
7368
7458
|
}
|
|
7369
|
-
async function parseXlsxDocument(buffer, options) {
|
|
7459
|
+
async function parseXlsxDocument(buffer, options, existingZip) {
|
|
7370
7460
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
|
|
7371
|
-
const zip = await import_jszip3.default.loadAsync(buffer);
|
|
7461
|
+
const zip = existingZip ?? await import_jszip3.default.loadAsync(buffer);
|
|
7372
7462
|
const warnings = [];
|
|
7373
7463
|
const workbookFile = zip.file("xl/workbook.xml");
|
|
7374
7464
|
if (!workbookFile) {
|
|
@@ -7395,6 +7485,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
7395
7485
|
}
|
|
7396
7486
|
const blocks = [];
|
|
7397
7487
|
const processedSheets = Math.min(sheets.length, MAX_SHEETS);
|
|
7488
|
+
let totalCells = 0;
|
|
7398
7489
|
for (let i = 0; i < processedSheets; i++) {
|
|
7399
7490
|
if (pageFilter && !pageFilter.has(i + 1)) continue;
|
|
7400
7491
|
const sheet = sheets[i];
|
|
@@ -7421,6 +7512,11 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
7421
7512
|
try {
|
|
7422
7513
|
const sheetXml = await sheetFile.async("text");
|
|
7423
7514
|
const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
|
|
7515
|
+
totalCells += maxRow * maxCol;
|
|
7516
|
+
if (totalCells > MAX_TOTAL_CELLS) {
|
|
7517
|
+
warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
|
|
7518
|
+
break;
|
|
7519
|
+
}
|
|
7424
7520
|
const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
|
|
7425
7521
|
blocks.push(...sheetBlocks);
|
|
7426
7522
|
} catch (err) {
|
|
@@ -7504,10 +7600,35 @@ function getAttr(el, localName) {
|
|
|
7504
7600
|
function parseXml2(text) {
|
|
7505
7601
|
return new import_xmldom3.DOMParser().parseFromString(text, "text/xml");
|
|
7506
7602
|
}
|
|
7603
|
+
function buildElementIndex(root) {
|
|
7604
|
+
const index = /* @__PURE__ */ new Map();
|
|
7605
|
+
const walk = (node) => {
|
|
7606
|
+
const children = node.childNodes;
|
|
7607
|
+
for (let i = 0; i < children.length; i++) {
|
|
7608
|
+
const child = children[i];
|
|
7609
|
+
if (child.nodeType === 1) {
|
|
7610
|
+
const el = child;
|
|
7611
|
+
const name = el.localName ?? "";
|
|
7612
|
+
if (name) {
|
|
7613
|
+
let list = index.get(name);
|
|
7614
|
+
if (!list) {
|
|
7615
|
+
list = [];
|
|
7616
|
+
index.set(name, list);
|
|
7617
|
+
}
|
|
7618
|
+
list.push(el);
|
|
7619
|
+
}
|
|
7620
|
+
walk(el);
|
|
7621
|
+
}
|
|
7622
|
+
}
|
|
7623
|
+
};
|
|
7624
|
+
walk(root);
|
|
7625
|
+
return index;
|
|
7626
|
+
}
|
|
7507
7627
|
function parseStyles(xml) {
|
|
7508
7628
|
const doc = parseXml2(xml);
|
|
7509
7629
|
const styles = /* @__PURE__ */ new Map();
|
|
7510
|
-
const
|
|
7630
|
+
const idx = buildElementIndex(doc);
|
|
7631
|
+
const styleElements = idx.get("style") ?? [];
|
|
7511
7632
|
for (const el of styleElements) {
|
|
7512
7633
|
const styleId = getAttr(el, "styleId");
|
|
7513
7634
|
if (!styleId) continue;
|
|
@@ -7535,7 +7656,8 @@ function parseStyles(xml) {
|
|
|
7535
7656
|
function parseNumbering(xml) {
|
|
7536
7657
|
const doc = parseXml2(xml);
|
|
7537
7658
|
const abstractNums = /* @__PURE__ */ new Map();
|
|
7538
|
-
const
|
|
7659
|
+
const idx = buildElementIndex(doc);
|
|
7660
|
+
const abstractElements = idx.get("abstractNum") ?? [];
|
|
7539
7661
|
for (const el of abstractElements) {
|
|
7540
7662
|
const abstractNumId = getAttr(el, "abstractNumId");
|
|
7541
7663
|
if (!abstractNumId) continue;
|
|
@@ -7550,7 +7672,7 @@ function parseNumbering(xml) {
|
|
|
7550
7672
|
abstractNums.set(abstractNumId, levels);
|
|
7551
7673
|
}
|
|
7552
7674
|
const nums = /* @__PURE__ */ new Map();
|
|
7553
|
-
const numElements =
|
|
7675
|
+
const numElements = idx.get("num") ?? [];
|
|
7554
7676
|
for (const el of numElements) {
|
|
7555
7677
|
const numId = getAttr(el, "numId");
|
|
7556
7678
|
if (!numId) continue;
|
|
@@ -7794,9 +7916,9 @@ async function extractImages(zip, rels, doc) {
|
|
|
7794
7916
|
}
|
|
7795
7917
|
return { blocks, images };
|
|
7796
7918
|
}
|
|
7797
|
-
async function parseDocxDocument(buffer, options) {
|
|
7919
|
+
async function parseDocxDocument(buffer, options, existingZip) {
|
|
7798
7920
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
|
|
7799
|
-
const zip = await import_jszip4.default.loadAsync(buffer);
|
|
7921
|
+
const zip = existingZip ?? await import_jszip4.default.loadAsync(buffer);
|
|
7800
7922
|
const warnings = [];
|
|
7801
7923
|
const docFile = zip.file("word/document.xml");
|
|
7802
7924
|
if (!docFile) {
|
|
@@ -7886,6 +8008,11 @@ async function parseDocxDocument(buffer, options) {
|
|
|
7886
8008
|
};
|
|
7887
8009
|
}
|
|
7888
8010
|
|
|
8011
|
+
// src/index.ts
|
|
8012
|
+
init_cli_provider();
|
|
8013
|
+
init_tesseract_provider();
|
|
8014
|
+
init_markdown_to_blocks();
|
|
8015
|
+
|
|
7889
8016
|
// src/diff/text-diff.ts
|
|
7890
8017
|
function similarity(a, b) {
|
|
7891
8018
|
if (a === b) return 1;
|
|
@@ -10402,25 +10529,86 @@ async function parse2(input, options) {
|
|
|
10402
10529
|
if (!buffer || buffer.byteLength === 0) {
|
|
10403
10530
|
return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
|
|
10404
10531
|
}
|
|
10532
|
+
const MAX_FILE_SIZE = 500 * 1024 * 1024;
|
|
10533
|
+
if (buffer.byteLength > MAX_FILE_SIZE) {
|
|
10534
|
+
return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
|
|
10535
|
+
}
|
|
10405
10536
|
const format = detectFormat(buffer);
|
|
10406
10537
|
switch (format) {
|
|
10407
10538
|
case "hwpx": {
|
|
10408
|
-
const zipFormat = await detectZipFormat(buffer);
|
|
10409
|
-
if (zipFormat === "xlsx") return parseXlsx(buffer, options);
|
|
10410
|
-
if (zipFormat === "docx") return parseDocx(buffer, options);
|
|
10411
|
-
return parseHwpx(buffer, options);
|
|
10539
|
+
const { format: zipFormat, zip } = await detectZipFormat(buffer);
|
|
10540
|
+
if (zipFormat === "xlsx") return parseXlsx(buffer, options, zip ?? void 0);
|
|
10541
|
+
if (zipFormat === "docx") return parseDocx(buffer, options, zip ?? void 0);
|
|
10542
|
+
return parseHwpx(buffer, options, zip ?? void 0);
|
|
10412
10543
|
}
|
|
10413
10544
|
case "hwp":
|
|
10414
10545
|
return parseHwp(buffer, options);
|
|
10415
10546
|
case "pdf":
|
|
10416
10547
|
return parsePdf(buffer, options);
|
|
10548
|
+
case "image":
|
|
10549
|
+
return parseImage(buffer, options);
|
|
10417
10550
|
default:
|
|
10418
10551
|
return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
|
|
10419
10552
|
}
|
|
10420
10553
|
}
|
|
10421
|
-
async function
|
|
10554
|
+
/**
 * Runs OCR over a standalone image and wraps the recognised text in a parse result.
 *
 * Provider selection is driven by `options.ocrMode` (falls back to "auto" when unset):
 *   - "off"                                   -> fails immediately, no provider is created
 *   - "gemini" | "claude" | "codex" | "ollama" -> the matching CLI provider
 *   - "tesseract"                             -> the Tesseract provider
 *   - "auto"                                  -> each CLI provider in order, then Tesseract
 *
 * @param {*} buffer - Raw image bytes; passed to `new Uint8Array(...)` before OCR.
 * @param {{ ocrMode?: string }} [options] - Parse options; only `ocrMode` is read here.
 * @returns {Promise<object>} On success `{ success: true, fileType: "image", markdown,
 *   blocks, isImageBased: true, warnings }`; otherwise `{ success: false, fileType: "image",
 *   error, code }`.
 */
async function parseImage(buffer, options) {
  const ocrMode = options?.ocrMode || "auto";
  if (ocrMode === "off") {
    return { success: false, fileType: "image", error: "OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC774\uBBF8\uC9C0 \uD30C\uC77C\uC744 \uCC98\uB9AC\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "IMAGE_BASED_PDF" };
  }
  const cliModes = ["gemini", "claude", "codex", "ollama"];
  let ocrProvider;
  let actualOcrMode = "auto";
  try {
    if (cliModes.includes(ocrMode)) {
      ocrProvider = createCliOcrProvider(ocrMode);
      actualOcrMode = ocrMode;
    } else if (ocrMode === "tesseract") {
      ocrProvider = await createTesseractProvider();
      actualOcrMode = ocrMode;
    } else if (ocrMode === "auto") {
      // Take the first CLI provider that can be constructed; a throw just means "try the next one".
      for (const mode of cliModes) {
        try {
          ocrProvider = createCliOcrProvider(mode);
          actualOcrMode = mode;
          break;
        } catch (e) {
          console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
        }
      }
      if (!ocrProvider) {
        ocrProvider = await createTesseractProvider();
        actualOcrMode = "tesseract";
      }
    }
    // Reached with no provider only when ocrMode is a value none of the branches recognise.
    if (!ocrProvider) {
      return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
    }
    const imageUint8Array = new Uint8Array(buffer);
    // NOTE(review): the MIME type is always reported as "image/png" regardless of the actual
    // image format -- confirm the providers tolerate this for JPEG/other inputs.
    const ocrResult = await ocrProvider(imageUint8Array, 1, "image/png");
    // Providers may return either a bare markdown string or an object carrying `.markdown`.
    const markdown = typeof ocrResult === "string" ? ocrResult : ocrResult.markdown;
    const blocks = markdownToBlocks(markdown, 1);
    return {
      success: true,
      fileType: "image",
      markdown,
      blocks,
      isImageBased: true,
      warnings: [{ message: `OCR \uCC98\uB9AC\uB428 (${actualOcrMode})`, code: "OCR_FALLBACK" }]
    };
  } catch (err) {
    return { success: false, fileType: "image", error: err instanceof Error ? err.message : "\uC774\uBBF8\uC9C0 OCR \uC2E4\uD328", code: classifyError(err) };
  } finally {
    // Release the provider exactly once on every path. A cleanup failure is logged rather than
    // thrown so it can neither discard a finished OCR result nor turn a failure result into a
    // rejected promise.
    if (ocrProvider && ocrProvider.terminate) {
      try {
        await ocrProvider.terminate();
      } catch (terminateErr) {
        console.warn("[kordoc] OCR provider cleanup failed.", terminateErr);
      }
    }
  }
}
|
|
10609
|
+
async function parseHwpx(buffer, options, zip) {
|
|
10422
10610
|
try {
|
|
10423
|
-
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
|
|
10611
|
+
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
|
|
10424
10612
|
return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
10425
10613
|
} catch (err) {
|
|
10426
10614
|
return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
@@ -10443,17 +10631,17 @@ async function parsePdf(buffer, options) {
|
|
|
10443
10631
|
return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
|
|
10444
10632
|
}
|
|
10445
10633
|
}
|
|
10446
|
-
async function parseXlsx(buffer, options) {
|
|
10634
|
+
/**
 * Parses an XLSX workbook via `parseXlsxDocument` and converts the outcome into a result object.
 *
 * @param {*} buffer - Workbook bytes, forwarded unchanged.
 * @param {*} options - Parse options, forwarded unchanged.
 * @param {*} zip - Optional third argument forwarded unchanged to `parseXlsxDocument`.
 * @returns {Promise<object>} `{ success: true, fileType: "xlsx", markdown, blocks, metadata,
 *   warnings }`, or `{ success: false, fileType: "xlsx", error, code }` when parsing throws.
 */
async function parseXlsx(buffer, options, zip) {
  try {
    const doc = await parseXlsxDocument(buffer, options, zip);
    const { markdown, blocks, metadata, warnings } = doc;
    return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
  } catch (failure) {
    // Non-Error throwables carry no usable message, so fall back to the generic text.
    let error = "XLSX \uD30C\uC2F1 \uC2E4\uD328";
    if (failure instanceof Error) {
      error = failure.message;
    }
    return { success: false, fileType: "xlsx", error, code: classifyError(failure) };
  }
}
|
|
10454
|
-
async function parseDocx(buffer, options) {
|
|
10642
|
+
async function parseDocx(buffer, options, zip) {
|
|
10455
10643
|
try {
|
|
10456
|
-
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
|
|
10644
|
+
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
|
|
10457
10645
|
return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
10458
10646
|
} catch (err) {
|
|
10459
10647
|
return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|