@clazic/kordoc 2.3.0 → 2.3.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{batch-provider-FUCIIS4M.js → batch-provider-PNDCSGQW.js} +59 -30
- package/dist/batch-provider-PNDCSGQW.js.map +1 -0
- package/dist/{chunk-2ZGLFZCN.js → chunk-2GFJFTKS.js} +193 -49
- package/dist/chunk-2GFJFTKS.js.map +1 -0
- package/dist/chunk-4PP34NVQ.js +121 -0
- package/dist/chunk-4PP34NVQ.js.map +1 -0
- package/dist/{tesseract-provider-WCVJWBUT.js → chunk-7FMKAV4P.js} +4 -4
- package/dist/{tesseract-provider-WCVJWBUT.js.map → chunk-7FMKAV4P.js.map} +1 -1
- package/dist/chunk-JOGAFNIL.js +153 -0
- package/dist/chunk-JOGAFNIL.js.map +1 -0
- package/dist/{chunk-WWILSVMJ.js → chunk-STIKJGEA.js} +2 -2
- package/dist/cli.js +10 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +291 -103
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +11 -6
- package/dist/index.d.ts +11 -6
- package/dist/index.js +292 -104
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -2
- package/dist/mcp.js.map +1 -1
- package/dist/{provider-OBY3XFSZ.js → provider-HE727F7Z.js} +38 -139
- package/dist/provider-HE727F7Z.js.map +1 -0
- package/dist/resolve-QA3VACUP.js +111 -0
- package/dist/resolve-QA3VACUP.js.map +1 -0
- package/dist/tesseract-provider-MNMZPSGF.js +11 -0
- package/dist/{utils-QAK24RJS.js → utils-FFUQJTTI.js} +2 -2
- package/dist/utils-FFUQJTTI.js.map +1 -0
- package/dist/{watch-MPHX3QIH.js → watch-2O32L6IF.js} +6 -3
- package/dist/{watch-MPHX3QIH.js.map → watch-2O32L6IF.js.map} +1 -1
- package/package.json +1 -1
- package/dist/batch-provider-FUCIIS4M.js.map +0 -1
- package/dist/chunk-2ZGLFZCN.js.map +0 -1
- package/dist/provider-OBY3XFSZ.js.map +0 -1
- package/dist/resolve-LBFYRHJI.js +0 -247
- package/dist/resolve-LBFYRHJI.js.map +0 -1
- /package/dist/{chunk-WWILSVMJ.js.map → chunk-STIKJGEA.js.map} +0 -0
- /package/dist/{utils-QAK24RJS.js.map → tesseract-provider-MNMZPSGF.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -1998,8 +1998,8 @@ function getTesseractFallbackMessage() {
|
|
|
1998
1998
|
"\uC124\uCE58\uB41C AI CLI\uAC00 \uC5C6\uC5B4 \uB0B4\uC7A5 tesseract.js\uB85C OCR\uC744 \uC218\uD589\uD569\uB2C8\uB2E4.",
|
|
1999
1999
|
"\uB354 \uB098\uC740 \uD488\uC9C8(\uD14C\uC774\uBE14/\uD5E4\uB529 \uAD6C\uC870 \uBCF4\uC874)\uC744 \uC704\uD574 AI CLI \uC124\uCE58\uB97C \uAD8C\uC7A5\uD569\uB2C8\uB2E4:",
|
|
2000
2000
|
"",
|
|
2001
|
-
" [\uAD8C\uC7A5]
|
|
2002
|
-
"
|
|
2001
|
+
" [\uAD8C\uC7A5] Codex CLI: npm install -g @openai/codex",
|
|
2002
|
+
" Gemini CLI: https://ai.google.dev/gemini-api/docs/cli",
|
|
2003
2003
|
" Claude CLI: npm install -g @anthropic-ai/claude-code",
|
|
2004
2004
|
" Ollama: brew install ollama (+ ollama pull gemma4:27b)"
|
|
2005
2005
|
].join("\n");
|
|
@@ -2008,7 +2008,7 @@ var CLI_PRIORITY;
|
|
|
2008
2008
|
var init_auto_detect = __esm({
|
|
2009
2009
|
"src/ocr/auto-detect.ts"() {
|
|
2010
2010
|
"use strict";
|
|
2011
|
-
CLI_PRIORITY = ["
|
|
2011
|
+
CLI_PRIORITY = ["codex", "gemini", "claude", "ollama"];
|
|
2012
2012
|
}
|
|
2013
2013
|
});
|
|
2014
2014
|
|
|
@@ -2051,7 +2051,7 @@ function callCli(mode, imagePath) {
|
|
|
2051
2051
|
const args = buildCliArgs(mode, imagePath);
|
|
2052
2052
|
const result = spawnSync(mode, args, {
|
|
2053
2053
|
encoding: "utf-8",
|
|
2054
|
-
timeout:
|
|
2054
|
+
timeout: 6e5,
|
|
2055
2055
|
maxBuffer: 10 * 1024 * 1024,
|
|
2056
2056
|
// claude: /tmp에서 실행하여 프로젝트 CLAUDE.md의 규칙 간섭 방지
|
|
2057
2057
|
...mode === "claude" ? { cwd: tmpdir() } : {}
|
|
@@ -2145,14 +2145,22 @@ async function callOllamaApi(imagePath) {
|
|
|
2145
2145
|
return data.message?.content || "";
|
|
2146
2146
|
}
|
|
2147
2147
|
function stripCodeFence(text) {
|
|
2148
|
-
const match = text.match(/^```(?:markdown|md)?\s
|
|
2148
|
+
const match = text.match(/^```(?:markdown|md)?\s*([\s\S]*?)```\s*$/m);
|
|
2149
2149
|
return match ? match[1].trim() : text;
|
|
2150
2150
|
}
|
|
2151
2151
|
var OCR_PROMPT, _tempDir;
|
|
2152
2152
|
var init_cli_provider = __esm({
|
|
2153
2153
|
"src/ocr/cli-provider.ts"() {
|
|
2154
2154
|
"use strict";
|
|
2155
|
-
OCR_PROMPT =
|
|
2155
|
+
OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.
|
|
2156
|
+
\uADDC\uCE59:
|
|
2157
|
+
- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)
|
|
2158
|
+
- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC
|
|
2159
|
+
- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9
|
|
2160
|
+
- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9
|
|
2161
|
+
- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC
|
|
2162
|
+
- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0
|
|
2163
|
+
- \`\`\`\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825`;
|
|
2156
2164
|
_tempDir = null;
|
|
2157
2165
|
}
|
|
2158
2166
|
});
|
|
@@ -2222,7 +2230,7 @@ __export(batch_provider_exports, {
|
|
|
2222
2230
|
DEFAULT_BATCH_SIZES: () => DEFAULT_BATCH_SIZES,
|
|
2223
2231
|
createBatchCliProvider: () => createBatchCliProvider
|
|
2224
2232
|
});
|
|
2225
|
-
import {
|
|
2233
|
+
import { spawn } from "child_process";
|
|
2226
2234
|
import { writeFileSync as writeFileSync2, readFileSync as readFileSync2, unlinkSync as unlinkSync2, mkdirSync as mkdirSync2 } from "fs";
|
|
2227
2235
|
import { join as join2 } from "path";
|
|
2228
2236
|
import { tmpdir as tmpdir2 } from "os";
|
|
@@ -2249,9 +2257,9 @@ function createBatchCliProvider(mode, batchSize) {
|
|
|
2249
2257
|
}
|
|
2250
2258
|
let output;
|
|
2251
2259
|
if (mode === "codex") {
|
|
2252
|
-
output = callBatchCodexCli(tempFiles);
|
|
2260
|
+
output = await callBatchCodexCli(tempFiles);
|
|
2253
2261
|
} else {
|
|
2254
|
-
output = callBatchCli(mode, tempFiles);
|
|
2262
|
+
output = await callBatchCli(mode, tempFiles);
|
|
2255
2263
|
}
|
|
2256
2264
|
const cleaned = stripCodeFence2(output.trim());
|
|
2257
2265
|
const parts = cleaned.split(/<!--\s*PAGE_BREAK\s*-->/).map((p) => p.trim()).filter((p) => p.length > 0);
|
|
@@ -2273,40 +2281,74 @@ function createBatchCliProvider(mode, batchSize) {
|
|
|
2273
2281
|
}
|
|
2274
2282
|
};
|
|
2275
2283
|
}
|
|
2276
|
-
function
|
|
2284
|
+
function spawnAsync(cmd, args, opts) {
|
|
2285
|
+
return new Promise((resolve, reject) => {
|
|
2286
|
+
const child = spawn(cmd, args, {
|
|
2287
|
+
cwd: opts.cwd,
|
|
2288
|
+
env: process.env,
|
|
2289
|
+
stdio: ["pipe", "pipe", "pipe"]
|
|
2290
|
+
});
|
|
2291
|
+
let stdout = "";
|
|
2292
|
+
let stderr = "";
|
|
2293
|
+
let killed = false;
|
|
2294
|
+
child.stdout.setEncoding("utf-8");
|
|
2295
|
+
child.stderr.setEncoding("utf-8");
|
|
2296
|
+
child.stdout.on("data", (d) => {
|
|
2297
|
+
stdout += d;
|
|
2298
|
+
});
|
|
2299
|
+
child.stderr.on("data", (d) => {
|
|
2300
|
+
stderr += d;
|
|
2301
|
+
});
|
|
2302
|
+
const timer = setTimeout(() => {
|
|
2303
|
+
killed = true;
|
|
2304
|
+
child.kill("SIGTERM");
|
|
2305
|
+
}, opts.timeoutMs);
|
|
2306
|
+
if (opts.stdin !== void 0) {
|
|
2307
|
+
child.stdin.end(opts.stdin);
|
|
2308
|
+
} else {
|
|
2309
|
+
child.stdin.end();
|
|
2310
|
+
}
|
|
2311
|
+
child.on("close", (code) => {
|
|
2312
|
+
clearTimeout(timer);
|
|
2313
|
+
if (killed) {
|
|
2314
|
+
reject(new Error(`\uD0C0\uC784\uC544\uC6C3 (${Math.round(opts.timeoutMs / 1e3)}\uCD08)`));
|
|
2315
|
+
} else {
|
|
2316
|
+
resolve({ stdout, stderr, exitCode: code ?? 1 });
|
|
2317
|
+
}
|
|
2318
|
+
});
|
|
2319
|
+
child.on("error", (err) => {
|
|
2320
|
+
clearTimeout(timer);
|
|
2321
|
+
reject(err);
|
|
2322
|
+
});
|
|
2323
|
+
});
|
|
2324
|
+
}
|
|
2325
|
+
async function callBatchCli(mode, imagePaths) {
|
|
2277
2326
|
const fileRefs = imagePaths.map((p) => `@${p}`).join("\n");
|
|
2278
2327
|
const prompt = `${BATCH_OCR_PROMPT}
|
|
2279
2328
|
|
|
2280
2329
|
${fileRefs}`;
|
|
2281
2330
|
let args;
|
|
2282
2331
|
if (mode === "gemini") {
|
|
2283
|
-
|
|
2284
|
-
|
|
2285
|
-
if (model) args.push("--model", model);
|
|
2332
|
+
const model = process.env.KORDOC_GEMINI_MODEL ?? "gemini-2.5-flash";
|
|
2333
|
+
args = ["--prompt", prompt, "--yolo", "--model", model];
|
|
2286
2334
|
} else {
|
|
2287
2335
|
args = ["--print", prompt];
|
|
2288
2336
|
const model = process.env.KORDOC_CLAUDE_MODEL;
|
|
2289
2337
|
if (model) args.push("--model", model);
|
|
2290
2338
|
}
|
|
2291
2339
|
const timeoutMs = 6e4 + imagePaths.length * 2e4;
|
|
2292
|
-
const result =
|
|
2293
|
-
|
|
2294
|
-
timeout: timeoutMs,
|
|
2295
|
-
maxBuffer: 50 * 1024 * 1024,
|
|
2296
|
-
// 50MB (large batch output)
|
|
2340
|
+
const result = await spawnAsync(mode, args, {
|
|
2341
|
+
timeoutMs,
|
|
2297
2342
|
...mode === "claude" ? { cwd: tmpdir2() } : {}
|
|
2298
2343
|
});
|
|
2299
|
-
if (result.
|
|
2300
|
-
|
|
2301
|
-
}
|
|
2302
|
-
if (result.status !== 0) {
|
|
2303
|
-
const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
|
|
2344
|
+
if (result.exitCode !== 0) {
|
|
2345
|
+
const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
|
|
2304
2346
|
throw new Error(`${mode} \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
|
|
2305
2347
|
}
|
|
2306
2348
|
return result.stdout || "";
|
|
2307
2349
|
}
|
|
2308
|
-
function callBatchCodexCli(imagePaths) {
|
|
2309
|
-
const outPath = join2(tmpdir2(), `kordoc-codex-batch-${Date.now()}.txt`);
|
|
2350
|
+
async function callBatchCodexCli(imagePaths) {
|
|
2351
|
+
const outPath = join2(tmpdir2(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`);
|
|
2310
2352
|
try {
|
|
2311
2353
|
const args = ["exec", BATCH_OCR_PROMPT];
|
|
2312
2354
|
for (const p of imagePaths) {
|
|
@@ -2316,17 +2358,12 @@ function callBatchCodexCli(imagePaths) {
|
|
|
2316
2358
|
const model = process.env.KORDOC_CODEX_MODEL;
|
|
2317
2359
|
if (model) args.push("--model", model);
|
|
2318
2360
|
const timeoutMs = 6e4 + imagePaths.length * 2e4;
|
|
2319
|
-
const result =
|
|
2320
|
-
|
|
2321
|
-
|
|
2322
|
-
maxBuffer: 50 * 1024 * 1024,
|
|
2323
|
-
input: ""
|
|
2361
|
+
const result = await spawnAsync("codex", args, {
|
|
2362
|
+
timeoutMs,
|
|
2363
|
+
stdin: ""
|
|
2324
2364
|
});
|
|
2325
|
-
if (result.
|
|
2326
|
-
|
|
2327
|
-
}
|
|
2328
|
-
if (result.status !== 0) {
|
|
2329
|
-
const errMsg = result.stderr?.trim() || `exit code ${result.status}`;
|
|
2365
|
+
if (result.exitCode !== 0) {
|
|
2366
|
+
const errMsg = result.stderr?.trim() || `exit code ${result.exitCode}`;
|
|
2330
2367
|
throw new Error(`codex \uBC30\uCE58 OCR \uC2E4\uD328: ${errMsg}`);
|
|
2331
2368
|
}
|
|
2332
2369
|
try {
|
|
@@ -2584,7 +2621,7 @@ function isBatchProvider(p) {
|
|
|
2584
2621
|
async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
|
|
2585
2622
|
const blocks = [];
|
|
2586
2623
|
if (isBatchProvider(provider)) {
|
|
2587
|
-
return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, onProgress);
|
|
2624
|
+
return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, onProgress);
|
|
2588
2625
|
}
|
|
2589
2626
|
if (concurrency <= 1) {
|
|
2590
2627
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
@@ -2631,43 +2668,54 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
|
|
|
2631
2668
|
}
|
|
2632
2669
|
return blocks;
|
|
2633
2670
|
}
|
|
2634
|
-
async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, onProgress) {
|
|
2635
|
-
const blocks = [];
|
|
2671
|
+
async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
|
|
2636
2672
|
const pageNumbers = [];
|
|
2637
2673
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
2638
2674
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
2639
2675
|
pageNumbers.push(i);
|
|
2640
2676
|
}
|
|
2641
|
-
const
|
|
2642
|
-
for (
|
|
2643
|
-
|
|
2644
|
-
const image = await renderPageToPng(page);
|
|
2645
|
-
pageImages.push({ image, pageNum });
|
|
2646
|
-
}
|
|
2647
|
-
const batches = [];
|
|
2648
|
-
for (let i = 0; i < pageImages.length; i += provider.batchSize) {
|
|
2649
|
-
batches.push(pageImages.slice(i, i + provider.batchSize));
|
|
2677
|
+
const pageBatches = [];
|
|
2678
|
+
for (let i = 0; i < pageNumbers.length; i += provider.batchSize) {
|
|
2679
|
+
pageBatches.push(pageNumbers.slice(i, i + provider.batchSize));
|
|
2650
2680
|
}
|
|
2651
2681
|
let processed = 0;
|
|
2652
|
-
|
|
2682
|
+
const batchTasks = pageBatches.map((batchPageNums, batchIdx) => async () => {
|
|
2683
|
+
const pageBlocks = [];
|
|
2653
2684
|
try {
|
|
2654
|
-
const
|
|
2655
|
-
for (const
|
|
2685
|
+
const batchImages = [];
|
|
2686
|
+
for (const pageNum of batchPageNums) {
|
|
2687
|
+
const page = await doc.getPage(pageNum);
|
|
2688
|
+
const image = await renderPageToPng(page);
|
|
2689
|
+
batchImages.push({ image, pageNum });
|
|
2690
|
+
}
|
|
2691
|
+
const results = await provider.processBatch(batchImages);
|
|
2692
|
+
for (const { pageNum } of batchImages) {
|
|
2656
2693
|
const result = results.get(pageNum);
|
|
2657
|
-
|
|
2658
|
-
|
|
2659
|
-
|
|
2660
|
-
|
|
2661
|
-
onProgress?.(processed, pageNumbers.length);
|
|
2694
|
+
pageBlocks.push({
|
|
2695
|
+
pageNum,
|
|
2696
|
+
blocks: result ? ocrResultToBlocks(result, pageNum) : []
|
|
2697
|
+
});
|
|
2662
2698
|
}
|
|
2663
2699
|
} catch (err) {
|
|
2664
|
-
const range = `${
|
|
2700
|
+
const range = `${batchPageNums[0]}-${batchPageNums[batchPageNums.length - 1]}`;
|
|
2665
2701
|
warnings?.push({
|
|
2666
2702
|
message: `\uBC30\uCE58 OCR \uC2E4\uD328 (\uD398\uC774\uC9C0 ${range}): ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
|
|
2667
2703
|
code: "OCR_PAGE_FAILED"
|
|
2668
2704
|
});
|
|
2669
|
-
|
|
2670
|
-
|
|
2705
|
+
for (const pageNum of batchPageNums) {
|
|
2706
|
+
pageBlocks.push({ pageNum, blocks: [] });
|
|
2707
|
+
}
|
|
2708
|
+
}
|
|
2709
|
+
processed += batchPageNums.length;
|
|
2710
|
+
onProgress?.(processed, pageNumbers.length);
|
|
2711
|
+
return { batchIdx, pageBlocks };
|
|
2712
|
+
});
|
|
2713
|
+
const effectiveConcurrency = Math.max(1, concurrency);
|
|
2714
|
+
const batchResults = await runWithConcurrency(batchTasks, effectiveConcurrency);
|
|
2715
|
+
const blocks = [];
|
|
2716
|
+
for (const result of batchResults) {
|
|
2717
|
+
for (const { blocks: pageBlks } of result.pageBlocks) {
|
|
2718
|
+
for (const b of pageBlks) blocks.push(b);
|
|
2671
2719
|
}
|
|
2672
2720
|
}
|
|
2673
2721
|
return blocks;
|
|
@@ -2711,24 +2759,29 @@ function isPdfFile(buffer) {
|
|
|
2711
2759
|
const b = magicBytes(buffer);
|
|
2712
2760
|
return b[0] === 37 && b[1] === 80 && b[2] === 68 && b[3] === 70;
|
|
2713
2761
|
}
|
|
2762
|
+
function isPngFile(buffer) {
|
|
2763
|
+
const b = magicBytes(buffer);
|
|
2764
|
+
return b[0] === 137 && b[1] === 80 && b[2] === 78 && b[3] === 71;
|
|
2765
|
+
}
|
|
2714
2766
|
function detectFormat(buffer) {
|
|
2715
2767
|
if (buffer.byteLength < 4) return "unknown";
|
|
2716
2768
|
if (isZipFile(buffer)) return "hwpx";
|
|
2717
2769
|
if (isOldHwpFile(buffer)) return "hwp";
|
|
2718
2770
|
if (isPdfFile(buffer)) return "pdf";
|
|
2771
|
+
if (isPngFile(buffer)) return "image";
|
|
2719
2772
|
return "unknown";
|
|
2720
2773
|
}
|
|
2721
2774
|
async function detectZipFormat(buffer) {
|
|
2722
2775
|
try {
|
|
2723
2776
|
const zip = await JSZip.loadAsync(buffer);
|
|
2724
|
-
if (zip.file("xl/workbook.xml")) return "xlsx";
|
|
2725
|
-
if (zip.file("word/document.xml")) return "docx";
|
|
2726
|
-
if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return "hwpx";
|
|
2777
|
+
if (zip.file("xl/workbook.xml")) return { format: "xlsx", zip };
|
|
2778
|
+
if (zip.file("word/document.xml")) return { format: "docx", zip };
|
|
2779
|
+
if (zip.file("Contents/content.hpf") || zip.file("mimetype")) return { format: "hwpx", zip };
|
|
2727
2780
|
const hasSection = Object.keys(zip.files).some((f) => f.startsWith("Contents/"));
|
|
2728
|
-
if (hasSection) return "hwpx";
|
|
2729
|
-
return "unknown";
|
|
2781
|
+
if (hasSection) return { format: "hwpx", zip };
|
|
2782
|
+
return { format: "unknown", zip: null };
|
|
2730
2783
|
} catch {
|
|
2731
|
-
return "unknown";
|
|
2784
|
+
return { format: "unknown", zip: null };
|
|
2732
2785
|
}
|
|
2733
2786
|
}
|
|
2734
2787
|
|
|
@@ -2737,7 +2790,7 @@ import JSZip2 from "jszip";
|
|
|
2737
2790
|
import { DOMParser } from "@xmldom/xmldom";
|
|
2738
2791
|
|
|
2739
2792
|
// src/utils.ts
|
|
2740
|
-
var VERSION = true ? "2.3.
|
|
2793
|
+
var VERSION = true ? "2.3.2" : "0.0.0-dev";
|
|
2741
2794
|
function toArrayBuffer(buf) {
|
|
2742
2795
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2743
2796
|
return buf.buffer;
|
|
@@ -2897,12 +2950,16 @@ function buildTableDirect(rows, numRows) {
|
|
|
2897
2950
|
return trimAndReturn(grid, numRows, maxCols);
|
|
2898
2951
|
}
|
|
2899
2952
|
function trimAndReturn(grid, numRows, maxCols) {
|
|
2900
|
-
let effectiveCols =
|
|
2901
|
-
|
|
2902
|
-
|
|
2903
|
-
|
|
2904
|
-
|
|
2953
|
+
let effectiveCols = 0;
|
|
2954
|
+
for (const row of grid) {
|
|
2955
|
+
for (let c = row.length - 1; c >= effectiveCols; c--) {
|
|
2956
|
+
if (row[c]?.text?.trim()) {
|
|
2957
|
+
effectiveCols = c + 1;
|
|
2958
|
+
break;
|
|
2959
|
+
}
|
|
2960
|
+
}
|
|
2905
2961
|
}
|
|
2962
|
+
if (effectiveCols === 0) effectiveCols = maxCols;
|
|
2906
2963
|
if (effectiveCols < maxCols && effectiveCols > 0) {
|
|
2907
2964
|
const trimmed = grid.map((row) => row.slice(0, effectiveCols));
|
|
2908
2965
|
return { rows: numRows, cols: effectiveCols, cells: trimmed, hasHeader: numRows > 1 };
|
|
@@ -3159,11 +3216,11 @@ function parseStyleElements(doc, map) {
|
|
|
3159
3216
|
function stripDtd(xml) {
|
|
3160
3217
|
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
3161
3218
|
}
|
|
3162
|
-
async function parseHwpxDocument(buffer, options) {
|
|
3219
|
+
async function parseHwpxDocument(buffer, options, existingZip) {
|
|
3163
3220
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
3164
3221
|
let zip;
|
|
3165
3222
|
try {
|
|
3166
|
-
zip = await JSZip2.loadAsync(buffer);
|
|
3223
|
+
zip = existingZip ?? await JSZip2.loadAsync(buffer);
|
|
3167
3224
|
} catch {
|
|
3168
3225
|
return await extractFromBrokenZip(buffer);
|
|
3169
3226
|
}
|
|
@@ -6175,8 +6232,15 @@ import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mj
|
|
|
6175
6232
|
GlobalWorkerOptions.workerSrc = "";
|
|
6176
6233
|
var MAX_PAGES = 5e3;
|
|
6177
6234
|
var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
|
|
6178
|
-
|
|
6235
|
+
function calcPdfTimeout(bufferSize) {
|
|
6236
|
+
const base = 3e4;
|
|
6237
|
+
const perMb = 500;
|
|
6238
|
+
const mb = bufferSize / (1024 * 1024);
|
|
6239
|
+
return Math.min(base + Math.ceil(mb * perMb), 3e5);
|
|
6240
|
+
}
|
|
6179
6241
|
async function loadPdfWithTimeout(buffer) {
|
|
6242
|
+
const timeoutMs = calcPdfTimeout(buffer.byteLength);
|
|
6243
|
+
const timeoutSec = Math.round(timeoutMs / 1e3);
|
|
6180
6244
|
const loadingTask = getDocument({
|
|
6181
6245
|
data: new Uint8Array(buffer),
|
|
6182
6246
|
useSystemFonts: true,
|
|
@@ -6190,8 +6254,8 @@ async function loadPdfWithTimeout(buffer) {
|
|
|
6190
6254
|
new Promise((_, reject) => {
|
|
6191
6255
|
timer = setTimeout(() => {
|
|
6192
6256
|
loadingTask.destroy();
|
|
6193
|
-
reject(new KordocError(
|
|
6194
|
-
},
|
|
6257
|
+
reject(new KordocError(`PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (${timeoutSec}\uCD08 \uCD08\uACFC)`));
|
|
6258
|
+
}, timeoutMs);
|
|
6195
6259
|
})
|
|
6196
6260
|
]);
|
|
6197
6261
|
} finally {
|
|
@@ -6212,11 +6276,15 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6212
6276
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
6213
6277
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
6214
6278
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
6215
|
-
const
|
|
6279
|
+
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
6216
6280
|
const pageHeights = /* @__PURE__ */ new Map();
|
|
6217
|
-
|
|
6281
|
+
const targetPageNums = [];
|
|
6218
6282
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
6219
6283
|
if (pageFilter && !pageFilter.has(i)) continue;
|
|
6284
|
+
targetPageNums.push(i);
|
|
6285
|
+
}
|
|
6286
|
+
let parsedPages = 0;
|
|
6287
|
+
const parseSinglePage = async (i) => {
|
|
6220
6288
|
try {
|
|
6221
6289
|
const page = await doc.getPage(i);
|
|
6222
6290
|
const tc = await page.getTextContent();
|
|
@@ -6229,7 +6297,10 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6229
6297
|
warnings.push({ page: i, message: `${hiddenCount}\uAC1C \uC228\uACA8\uC9C4 \uD14D\uC2A4\uD2B8 \uC694\uC18C \uD544\uD130\uB9C1\uB428`, code: "HIDDEN_TEXT_FILTERED" });
|
|
6230
6298
|
}
|
|
6231
6299
|
for (const item of visible) {
|
|
6232
|
-
if (item.fontSize > 0)
|
|
6300
|
+
if (item.fontSize > 0) {
|
|
6301
|
+
const rounded = Math.round(item.fontSize * 10) / 10;
|
|
6302
|
+
fontSizeFreq.set(rounded, (fontSizeFreq.get(rounded) || 0) + 1);
|
|
6303
|
+
}
|
|
6233
6304
|
}
|
|
6234
6305
|
const opList = await page.getOperatorList();
|
|
6235
6306
|
const pageBlocks = extractPageBlocksWithLines(visible, i, opList, viewport.width, viewport.height);
|
|
@@ -6246,12 +6317,23 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6246
6317
|
if (pageErr instanceof KordocError) throw pageErr;
|
|
6247
6318
|
warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
6248
6319
|
}
|
|
6320
|
+
};
|
|
6321
|
+
const sampleCount = Math.min(5, targetPageNums.length);
|
|
6322
|
+
for (let si = 0; si < sampleCount; si++) {
|
|
6323
|
+
await parseSinglePage(targetPageNums[si]);
|
|
6324
|
+
}
|
|
6325
|
+
const sampleParsed = parsedPages || sampleCount;
|
|
6326
|
+
const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
|
|
6327
|
+
if (!isImageBased) {
|
|
6328
|
+
for (let si = sampleCount; si < targetPageNums.length; si++) {
|
|
6329
|
+
await parseSinglePage(targetPageNums[si]);
|
|
6330
|
+
}
|
|
6249
6331
|
}
|
|
6250
6332
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
6251
|
-
if (
|
|
6333
|
+
if (isImageBased) {
|
|
6252
6334
|
let ocrProvider = options?.ocr ?? null;
|
|
6253
|
-
const ocrMode = options?.ocrMode;
|
|
6254
|
-
if (!ocrProvider && ocrMode
|
|
6335
|
+
const ocrMode = options?.ocrMode ?? "auto";
|
|
6336
|
+
if (!ocrProvider && ocrMode !== "off") {
|
|
6255
6337
|
try {
|
|
6256
6338
|
const { resolveOcrProvider: resolveOcrProvider2 } = await Promise.resolve().then(() => (init_resolve(), resolve_exports));
|
|
6257
6339
|
const concurrency = options?.ocrConcurrency ?? 1;
|
|
@@ -6303,7 +6385,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6303
6385
|
blocks.splice(removed[ri], 1);
|
|
6304
6386
|
}
|
|
6305
6387
|
}
|
|
6306
|
-
const medianFontSize =
|
|
6388
|
+
const medianFontSize = computeMedianFromFreq(fontSizeFreq);
|
|
6307
6389
|
if (medianFontSize > 0) {
|
|
6308
6390
|
detectHeadings(blocks, medianFontSize);
|
|
6309
6391
|
}
|
|
@@ -6356,11 +6438,18 @@ function filterHiddenText(items, pageWidth, pageHeight) {
|
|
|
6356
6438
|
}
|
|
6357
6439
|
return { visible, hiddenCount };
|
|
6358
6440
|
}
|
|
6359
|
-
function
|
|
6360
|
-
if (
|
|
6361
|
-
const
|
|
6362
|
-
|
|
6363
|
-
|
|
6441
|
+
function computeMedianFromFreq(freq) {
|
|
6442
|
+
if (freq.size === 0) return 0;
|
|
6443
|
+
const entries = [...freq.entries()].sort((a, b) => a[0] - b[0]);
|
|
6444
|
+
let total = 0;
|
|
6445
|
+
for (const [, count] of entries) total += count;
|
|
6446
|
+
const mid = total / 2;
|
|
6447
|
+
let cumulative = 0;
|
|
6448
|
+
for (const [size, count] of entries) {
|
|
6449
|
+
cumulative += count;
|
|
6450
|
+
if (cumulative >= mid) return size;
|
|
6451
|
+
}
|
|
6452
|
+
return 0;
|
|
6364
6453
|
}
|
|
6365
6454
|
function detectHeadings(blocks, medianFontSize) {
|
|
6366
6455
|
for (const block of blocks) {
|
|
@@ -7163,6 +7252,7 @@ var MAX_SHEETS = 100;
|
|
|
7163
7252
|
var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
|
|
7164
7253
|
var MAX_ROWS2 = 1e4;
|
|
7165
7254
|
var MAX_COLS2 = 200;
|
|
7255
|
+
var MAX_TOTAL_CELLS = 2e6;
|
|
7166
7256
|
function cleanNumericValue(raw) {
|
|
7167
7257
|
if (!/^-?\d+\.\d+$/.test(raw)) return raw;
|
|
7168
7258
|
const num = parseFloat(raw);
|
|
@@ -7346,9 +7436,9 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
7346
7436
|
}
|
|
7347
7437
|
return blocks;
|
|
7348
7438
|
}
|
|
7349
|
-
async function parseXlsxDocument(buffer, options) {
|
|
7439
|
+
async function parseXlsxDocument(buffer, options, existingZip) {
|
|
7350
7440
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
|
|
7351
|
-
const zip = await JSZip3.loadAsync(buffer);
|
|
7441
|
+
const zip = existingZip ?? await JSZip3.loadAsync(buffer);
|
|
7352
7442
|
const warnings = [];
|
|
7353
7443
|
const workbookFile = zip.file("xl/workbook.xml");
|
|
7354
7444
|
if (!workbookFile) {
|
|
@@ -7375,6 +7465,7 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
7375
7465
|
}
|
|
7376
7466
|
const blocks = [];
|
|
7377
7467
|
const processedSheets = Math.min(sheets.length, MAX_SHEETS);
|
|
7468
|
+
let totalCells = 0;
|
|
7378
7469
|
for (let i = 0; i < processedSheets; i++) {
|
|
7379
7470
|
if (pageFilter && !pageFilter.has(i + 1)) continue;
|
|
7380
7471
|
const sheet = sheets[i];
|
|
@@ -7401,6 +7492,11 @@ async function parseXlsxDocument(buffer, options) {
|
|
|
7401
7492
|
try {
|
|
7402
7493
|
const sheetXml = await sheetFile.async("text");
|
|
7403
7494
|
const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
|
|
7495
|
+
totalCells += maxRow * maxCol;
|
|
7496
|
+
if (totalCells > MAX_TOTAL_CELLS) {
|
|
7497
|
+
warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
|
|
7498
|
+
break;
|
|
7499
|
+
}
|
|
7404
7500
|
const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
|
|
7405
7501
|
blocks.push(...sheetBlocks);
|
|
7406
7502
|
} catch (err) {
|
|
@@ -7484,10 +7580,35 @@ function getAttr(el, localName) {
|
|
|
7484
7580
|
function parseXml2(text) {
|
|
7485
7581
|
return new DOMParser3().parseFromString(text, "text/xml");
|
|
7486
7582
|
}
|
|
7583
|
+
function buildElementIndex(root) {
|
|
7584
|
+
const index = /* @__PURE__ */ new Map();
|
|
7585
|
+
const walk = (node) => {
|
|
7586
|
+
const children = node.childNodes;
|
|
7587
|
+
for (let i = 0; i < children.length; i++) {
|
|
7588
|
+
const child = children[i];
|
|
7589
|
+
if (child.nodeType === 1) {
|
|
7590
|
+
const el = child;
|
|
7591
|
+
const name = el.localName ?? "";
|
|
7592
|
+
if (name) {
|
|
7593
|
+
let list = index.get(name);
|
|
7594
|
+
if (!list) {
|
|
7595
|
+
list = [];
|
|
7596
|
+
index.set(name, list);
|
|
7597
|
+
}
|
|
7598
|
+
list.push(el);
|
|
7599
|
+
}
|
|
7600
|
+
walk(el);
|
|
7601
|
+
}
|
|
7602
|
+
}
|
|
7603
|
+
};
|
|
7604
|
+
walk(root);
|
|
7605
|
+
return index;
|
|
7606
|
+
}
|
|
7487
7607
|
function parseStyles(xml) {
|
|
7488
7608
|
const doc = parseXml2(xml);
|
|
7489
7609
|
const styles = /* @__PURE__ */ new Map();
|
|
7490
|
-
const
|
|
7610
|
+
const idx = buildElementIndex(doc);
|
|
7611
|
+
const styleElements = idx.get("style") ?? [];
|
|
7491
7612
|
for (const el of styleElements) {
|
|
7492
7613
|
const styleId = getAttr(el, "styleId");
|
|
7493
7614
|
if (!styleId) continue;
|
|
@@ -7515,7 +7636,8 @@ function parseStyles(xml) {
|
|
|
7515
7636
|
function parseNumbering(xml) {
|
|
7516
7637
|
const doc = parseXml2(xml);
|
|
7517
7638
|
const abstractNums = /* @__PURE__ */ new Map();
|
|
7518
|
-
const
|
|
7639
|
+
const idx = buildElementIndex(doc);
|
|
7640
|
+
const abstractElements = idx.get("abstractNum") ?? [];
|
|
7519
7641
|
for (const el of abstractElements) {
|
|
7520
7642
|
const abstractNumId = getAttr(el, "abstractNumId");
|
|
7521
7643
|
if (!abstractNumId) continue;
|
|
@@ -7530,7 +7652,7 @@ function parseNumbering(xml) {
|
|
|
7530
7652
|
abstractNums.set(abstractNumId, levels);
|
|
7531
7653
|
}
|
|
7532
7654
|
const nums = /* @__PURE__ */ new Map();
|
|
7533
|
-
const numElements =
|
|
7655
|
+
const numElements = idx.get("num") ?? [];
|
|
7534
7656
|
for (const el of numElements) {
|
|
7535
7657
|
const numId = getAttr(el, "numId");
|
|
7536
7658
|
if (!numId) continue;
|
|
@@ -7774,9 +7896,9 @@ async function extractImages(zip, rels, doc) {
|
|
|
7774
7896
|
}
|
|
7775
7897
|
return { blocks, images };
|
|
7776
7898
|
}
|
|
7777
|
-
async function parseDocxDocument(buffer, options) {
|
|
7899
|
+
async function parseDocxDocument(buffer, options, existingZip) {
|
|
7778
7900
|
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
|
|
7779
|
-
const zip = await JSZip4.loadAsync(buffer);
|
|
7901
|
+
const zip = existingZip ?? await JSZip4.loadAsync(buffer);
|
|
7780
7902
|
const warnings = [];
|
|
7781
7903
|
const docFile = zip.file("word/document.xml");
|
|
7782
7904
|
if (!docFile) {
|
|
@@ -7866,6 +7988,11 @@ async function parseDocxDocument(buffer, options) {
|
|
|
7866
7988
|
};
|
|
7867
7989
|
}
|
|
7868
7990
|
|
|
7991
|
+
// src/index.ts
|
|
7992
|
+
init_cli_provider();
|
|
7993
|
+
init_tesseract_provider();
|
|
7994
|
+
init_markdown_to_blocks();
|
|
7995
|
+
|
|
7869
7996
|
// src/diff/text-diff.ts
|
|
7870
7997
|
function similarity(a, b) {
|
|
7871
7998
|
if (a === b) return 1;
|
|
@@ -10382,25 +10509,86 @@ async function parse2(input, options) {
|
|
|
10382
10509
|
if (!buffer || buffer.byteLength === 0) {
|
|
10383
10510
|
return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
|
|
10384
10511
|
}
|
|
10512
|
+
const MAX_FILE_SIZE = 500 * 1024 * 1024;
|
|
10513
|
+
if (buffer.byteLength > MAX_FILE_SIZE) {
|
|
10514
|
+
return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
|
|
10515
|
+
}
|
|
10385
10516
|
const format = detectFormat(buffer);
|
|
10386
10517
|
switch (format) {
|
|
10387
10518
|
case "hwpx": {
|
|
10388
|
-
const zipFormat = await detectZipFormat(buffer);
|
|
10389
|
-
if (zipFormat === "xlsx") return parseXlsx(buffer, options);
|
|
10390
|
-
if (zipFormat === "docx") return parseDocx(buffer, options);
|
|
10391
|
-
return parseHwpx(buffer, options);
|
|
10519
|
+
const { format: zipFormat, zip } = await detectZipFormat(buffer);
|
|
10520
|
+
if (zipFormat === "xlsx") return parseXlsx(buffer, options, zip ?? void 0);
|
|
10521
|
+
if (zipFormat === "docx") return parseDocx(buffer, options, zip ?? void 0);
|
|
10522
|
+
return parseHwpx(buffer, options, zip ?? void 0);
|
|
10392
10523
|
}
|
|
10393
10524
|
case "hwp":
|
|
10394
10525
|
return parseHwp(buffer, options);
|
|
10395
10526
|
case "pdf":
|
|
10396
10527
|
return parsePdf(buffer, options);
|
|
10528
|
+
case "image":
|
|
10529
|
+
return parseImage(buffer, options);
|
|
10397
10530
|
default:
|
|
10398
10531
|
return { success: false, fileType: "unknown", error: "\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD30C\uC77C \uD615\uC2DD\uC785\uB2C8\uB2E4.", code: "UNSUPPORTED_FORMAT" };
|
|
10399
10532
|
}
|
|
10400
10533
|
}
|
|
10401
|
-
async function
|
|
10534
|
+
/**
 * Runs OCR over a standalone image buffer and wraps the recognized text in a
 * parse-result object.
 *
 * Provider selection follows `options.ocrMode` (falls back to "auto" when unset/falsy):
 *  - "off": returns a failure result without attempting OCR.
 *  - "gemini" | "claude" | "codex" | "ollama": createCliOcrProvider(mode).
 *  - "tesseract": createTesseractProvider().
 *  - "auto": tries the CLI providers in the order above and uses
 *    createTesseractProvider() when none of them could be created.
 *  - any other value: no provider is selected and a failure result is returned.
 *
 * @param {*} buffer - Image bytes; passed to `new Uint8Array(buffer)` before OCR.
 * @param {object} [options] - Parse options; only `ocrMode` is read here.
 * @returns {Promise<object>} `{ success: true, fileType: "image", markdown, blocks, isImageBased, warnings }`
 *   on success, otherwise `{ success: false, fileType: "image", error, code }`.
 */
async function parseImage(buffer, options) {
  const ocrMode = options?.ocrMode || "auto";
  if (ocrMode === "off") {
    // NOTE(review): the IMAGE_BASED_PDF code is reused for plain image input; kept as-is
    // because callers may already match on it.
    return { success: false, fileType: "image", error: "OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC774\uBBF8\uC9C0 \uD30C\uC77C\uC744 \uCC98\uB9AC\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "IMAGE_BASED_PDF" };
  }
  const cliModes = ["gemini", "claude", "codex", "ollama"];
  let ocrProvider;
  let actualOcrMode = "auto";
  try {
    if (cliModes.includes(ocrMode)) {
      ocrProvider = createCliOcrProvider(ocrMode);
      actualOcrMode = ocrMode;
    } else if (ocrMode === "tesseract") {
      ocrProvider = await createTesseractProvider();
      actualOcrMode = ocrMode;
    } else if (ocrMode === "auto") {
      for (const mode of cliModes) {
        try {
          ocrProvider = createCliOcrProvider(mode);
          actualOcrMode = mode;
          break;
        } catch (e) {
          console.warn(`[kordoc] OCR auto-detection: ${mode} CLI not available or failed. Trying next.`, e);
        }
      }
      if (!ocrProvider) {
        ocrProvider = await createTesseractProvider();
        actualOcrMode = "tesseract";
      }
    }
    if (!ocrProvider) {
      return { success: false, fileType: "image", error: "\uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uD504\uB85C\uBC14\uC774\uB354\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.", code: "PARSE_ERROR" };
    }
    // NOTE(review): the MIME type is always reported as "image/png", whatever the
    // actual image encoding is — confirm the providers tolerate that for other formats.
    const ocrResult = await ocrProvider(new Uint8Array(buffer), 1, "image/png");
    // Providers may return either a plain string or an object carrying `markdown`.
    const markdown = typeof ocrResult === "string" ? ocrResult : ocrResult.markdown;
    const blocks = markdownToBlocks(markdown, 1);
    return {
      success: true,
      fileType: "image",
      markdown,
      blocks,
      isImageBased: true,
      warnings: [{ message: `OCR \uCC98\uB9AC\uB428 (${actualOcrMode})`, code: "OCR_FALLBACK" }]
    };
  } catch (err) {
    return { success: false, fileType: "image", error: err instanceof Error ? err.message : "\uC774\uBBF8\uC9C0 OCR \uC2E4\uD328", code: classifyError(err) };
  } finally {
    // Release the provider exactly once on every path. Cleanup is best-effort: a
    // failing terminate() must neither discard an OCR result that was already
    // produced nor turn the returned result object into a rejected promise.
    if (typeof ocrProvider?.terminate === "function") {
      try {
        await ocrProvider.terminate();
      } catch (terminateErr) {
        console.warn("[kordoc] OCR provider terminate() failed.", terminateErr);
      }
    }
  }
}
|
|
10589
|
+
async function parseHwpx(buffer, options, zip) {
|
|
10402
10590
|
try {
|
|
10403
|
-
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options);
|
|
10591
|
+
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
|
|
10404
10592
|
return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
10405
10593
|
} catch (err) {
|
|
10406
10594
|
return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|
|
@@ -10423,17 +10611,17 @@ async function parsePdf(buffer, options) {
|
|
|
10423
10611
|
return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
|
|
10424
10612
|
}
|
|
10425
10613
|
}
|
|
10426
|
-
async function parseXlsx(buffer, options) {
|
|
10614
|
+
/**
 * Parses an XLSX input via parseXlsxDocument and reports the outcome as a
 * result object instead of throwing.
 *
 * @param {*} buffer - Forwarded unchanged to parseXlsxDocument.
 * @param {object} [options] - Forwarded unchanged to parseXlsxDocument.
 * @param {*} [zip] - Forwarded unchanged to parseXlsxDocument.
 * @returns {Promise<object>} `{ success: true, fileType: "xlsx", markdown, blocks, metadata, warnings }`
 *   or, when anything inside throws, `{ success: false, fileType: "xlsx", error, code }`.
 */
async function parseXlsx(buffer, options, zip) {
  try {
    const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
    const result = {
      success: true,
      fileType: "xlsx",
      markdown,
      blocks,
      metadata,
      warnings
    };
    return result;
  } catch (err) {
    // Non-Error throwables carry no usable message, so fall back to the generic text.
    let message;
    if (err instanceof Error) {
      message = err.message;
    } else {
      message = "XLSX \uD30C\uC2F1 \uC2E4\uD328";
    }
    return {
      success: false,
      fileType: "xlsx",
      error: message,
      code: classifyError(err)
    };
  }
}
|
|
10434
|
-
async function parseDocx(buffer, options) {
|
|
10622
|
+
async function parseDocx(buffer, options, zip) {
|
|
10435
10623
|
try {
|
|
10436
|
-
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options);
|
|
10624
|
+
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
|
|
10437
10625
|
return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
10438
10626
|
} catch (err) {
|
|
10439
10627
|
return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
|