@clazic/kordoc 2.7.5 → 2.7.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -29
- package/dist/{chunk-6DUCYZRR.js → chunk-URSQEMVJ.js} +345 -523
- package/dist/chunk-URSQEMVJ.js.map +1 -0
- package/dist/{chunk-5CIZV5C3.js → chunk-X7UUXEMM.js} +2 -2
- package/dist/cli.js +5 -87
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +447 -634
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -135
- package/dist/index.d.ts +4 -135
- package/dist/index.js +440 -624
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +2 -43
- package/dist/mcp.js.map +1 -1
- package/dist/{utils-NR7YWMWB.js → utils-QQVZGOGU.js} +2 -2
- package/dist/{watch-LDX5GPEE.js → watch-RQYUNSSH.js} +3 -3
- package/package.json +1 -2
- package/dist/chunk-6DUCYZRR.js.map +0 -1
- /package/dist/{chunk-5CIZV5C3.js.map → chunk-X7UUXEMM.js.map} +0 -0
- /package/dist/{utils-NR7YWMWB.js.map → utils-QQVZGOGU.js.map} +0 -0
- /package/dist/{watch-LDX5GPEE.js.map → watch-RQYUNSSH.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -3062,9 +3062,6 @@ __export(index_exports, {
|
|
|
3062
3062
|
VERSION: () => VERSION,
|
|
3063
3063
|
blocksToMarkdown: () => blocksToMarkdown,
|
|
3064
3064
|
compare: () => compare,
|
|
3065
|
-
convertHwpToPdf: () => convertHwpToPdf,
|
|
3066
|
-
convertHwpxToPdf: () => convertHwpxToPdf,
|
|
3067
|
-
convertToPdf: () => convertToPdf,
|
|
3068
3065
|
detectFormat: () => detectFormat,
|
|
3069
3066
|
detectZipFormat: () => detectZipFormat,
|
|
3070
3067
|
diffBlocks: () => diffBlocks,
|
|
@@ -3084,7 +3081,7 @@ __export(index_exports, {
|
|
|
3084
3081
|
runUnifiedOcrPipeline: () => runUnifiedOcrPipeline
|
|
3085
3082
|
});
|
|
3086
3083
|
module.exports = __toCommonJS(index_exports);
|
|
3087
|
-
var
|
|
3084
|
+
var import_promises3 = require("fs/promises");
|
|
3088
3085
|
|
|
3089
3086
|
// src/detect.ts
|
|
3090
3087
|
var import_jszip = __toESM(require("jszip"), 1);
|
|
@@ -3137,7 +3134,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
|
|
|
3137
3134
|
var import_xmldom = require("@xmldom/xmldom");
|
|
3138
3135
|
|
|
3139
3136
|
// src/utils.ts
|
|
3140
|
-
var VERSION = true ? "2.7.
|
|
3137
|
+
var VERSION = true ? "2.7.6" : "0.0.0-dev";
|
|
3141
3138
|
function toArrayBuffer(buf) {
|
|
3142
3139
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
3143
3140
|
return buf.buffer;
|
|
@@ -3344,13 +3341,21 @@ function sanitizeText(text) {
|
|
|
3344
3341
|
}
|
|
3345
3342
|
return result;
|
|
3346
3343
|
}
|
|
3344
|
+
/**
 * Backslash-escape characters that GitHub Flavored Markdown would otherwise
 * interpret: `~` always, and `|` additionally when the text goes into a
 * table cell.
 *
 * A character counts as already escaped only when it is preceded by an ODD
 * number of backslashes. An even run (e.g. `\\~`) is a sequence of escaped
 * backslashes followed by a live character, so it still receives an escape.
 *
 * @param {string} text - Text to escape; falsy values are returned unchanged.
 * @param {boolean} [inTableCell=false] - Also escape `|` when true.
 * @returns {string} The escaped text (or the original falsy value).
 */
function escapeGfm(text, inTableCell = false) {
  if (!text) return text;
  // Capture the whole backslash run in front of the target so its parity can be checked.
  const pattern = inTableCell ? /(\\*)([~|])/g : /(\\*)(~)/g;
  return text.replace(pattern, (match, slashes) =>
    slashes.length % 2 === 1 ? match : `\\${match}`
  );
}
|
|
3347
3352
|
function blocksToMarkdown(blocks) {
|
|
3348
3353
|
const lines = [];
|
|
3349
3354
|
for (let i = 0; i < blocks.length; i++) {
|
|
3350
3355
|
const block = blocks[i];
|
|
3351
3356
|
if (block.type === "heading" && block.text) {
|
|
3352
3357
|
const prefix = "#".repeat(Math.min(block.level || 2, 6));
|
|
3353
|
-
const headingText = sanitizeText(block.text);
|
|
3358
|
+
const headingText = escapeGfm(sanitizeText(block.text), false);
|
|
3354
3359
|
if (headingText) lines.push("", `${prefix} ${headingText}`, "");
|
|
3355
3360
|
continue;
|
|
3356
3361
|
}
|
|
@@ -3363,42 +3368,47 @@ function blocksToMarkdown(blocks) {
|
|
|
3363
3368
|
continue;
|
|
3364
3369
|
}
|
|
3365
3370
|
if (block.type === "list" && block.text) {
|
|
3366
|
-
const
|
|
3367
|
-
if (!
|
|
3368
|
-
const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(
|
|
3371
|
+
const sanitized = sanitizeText(block.text);
|
|
3372
|
+
if (!sanitized) continue;
|
|
3373
|
+
const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(sanitized);
|
|
3369
3374
|
const prefix = alreadyNumbered ? "" : block.listType === "ordered" ? "1. " : "- ";
|
|
3375
|
+
const listText = escapeGfm(sanitized, false);
|
|
3370
3376
|
lines.push(`${prefix}${listText}`);
|
|
3371
3377
|
if (block.children) {
|
|
3372
3378
|
for (const child of block.children) {
|
|
3373
3379
|
const childPrefix = child.listType === "ordered" ? "1." : "-";
|
|
3374
|
-
|
|
3380
|
+
const childText = child.text ? escapeGfm(sanitizeText(child.text), false) : "";
|
|
3381
|
+
lines.push(` ${childPrefix} ${childText}`);
|
|
3375
3382
|
}
|
|
3376
3383
|
}
|
|
3377
3384
|
continue;
|
|
3378
3385
|
}
|
|
3379
3386
|
if (block.type === "paragraph" && block.text) {
|
|
3380
|
-
|
|
3381
|
-
if (!
|
|
3382
|
-
if (/^\[별표\s*\d+/.test(
|
|
3387
|
+
const sanitized = sanitizeText(block.text);
|
|
3388
|
+
if (!sanitized) continue;
|
|
3389
|
+
if (/^\[별표\s*\d+/.test(sanitized)) {
|
|
3383
3390
|
const nextBlock = blocks[i + 1];
|
|
3391
|
+
const escapedSelf = escapeGfm(sanitized, false);
|
|
3384
3392
|
if (nextBlock?.type === "paragraph" && nextBlock.text && /관련\)?$/.test(nextBlock.text)) {
|
|
3385
|
-
|
|
3393
|
+
const nextEscaped = escapeGfm(sanitizeText(nextBlock.text), false);
|
|
3394
|
+
lines.push("", `## ${escapedSelf} ${nextEscaped}`, "");
|
|
3386
3395
|
i++;
|
|
3387
3396
|
} else {
|
|
3388
|
-
lines.push("", `## ${
|
|
3397
|
+
lines.push("", `## ${escapedSelf}`, "");
|
|
3389
3398
|
}
|
|
3390
3399
|
continue;
|
|
3391
3400
|
}
|
|
3392
|
-
if (/^\([^)]*조[^)]*관련\)$/.test(
|
|
3393
|
-
lines.push(`*${
|
|
3401
|
+
if (/^\([^)]*조[^)]*관련\)$/.test(sanitized)) {
|
|
3402
|
+
lines.push(`*${escapeGfm(sanitized, false)}*`, "");
|
|
3394
3403
|
continue;
|
|
3395
3404
|
}
|
|
3405
|
+
let text = escapeGfm(sanitized, false);
|
|
3396
3406
|
if (block.href) {
|
|
3397
3407
|
const href = sanitizeHref(block.href);
|
|
3398
3408
|
if (href) text = `[${text}](${href})`;
|
|
3399
3409
|
}
|
|
3400
3410
|
if (block.footnoteText) {
|
|
3401
|
-
text += ` (\uC8FC: ${block.footnoteText})`;
|
|
3411
|
+
text += ` (\uC8FC: ${escapeGfm(block.footnoteText, false)})`;
|
|
3402
3412
|
}
|
|
3403
3413
|
lines.push(text);
|
|
3404
3414
|
} else if (block.type === "table" && block.table) {
|
|
@@ -3423,13 +3433,13 @@ function tableToMarkdown(table) {
|
|
|
3423
3433
|
return content.split(/\n/).map((line) => {
|
|
3424
3434
|
const trimmed = line.trim();
|
|
3425
3435
|
if (!trimmed) return "";
|
|
3426
|
-
if (/^\d+\.\s/.test(trimmed)) return `**${trimmed}**`;
|
|
3427
|
-
if (/^[가-힣]\.\s/.test(trimmed)) return ` ${trimmed}`;
|
|
3428
|
-
return trimmed;
|
|
3436
|
+
if (/^\d+\.\s/.test(trimmed)) return `**${escapeGfm(trimmed, false)}**`;
|
|
3437
|
+
if (/^[가-힣]\.\s/.test(trimmed)) return ` ${escapeGfm(trimmed, false)}`;
|
|
3438
|
+
return escapeGfm(trimmed, false);
|
|
3429
3439
|
}).filter(Boolean).join("\n");
|
|
3430
3440
|
}
|
|
3431
3441
|
if (numCols === 1 && numRows >= 2) {
|
|
3432
|
-
return cells.map((row) => sanitizeText(row[0].text).replace(/\n/g, " ")).filter(Boolean).join("\n");
|
|
3442
|
+
return cells.map((row) => escapeGfm(sanitizeText(row[0].text).replace(/\n/g, " "), false)).filter(Boolean).join("\n");
|
|
3433
3443
|
}
|
|
3434
3444
|
const display = Array.from({ length: numRows }, () => Array(numCols).fill(""));
|
|
3435
3445
|
const skip = /* @__PURE__ */ new Set();
|
|
@@ -3438,7 +3448,7 @@ function tableToMarkdown(table) {
|
|
|
3438
3448
|
if (skip.has(`${r},${c}`)) continue;
|
|
3439
3449
|
const cell = cells[r]?.[c];
|
|
3440
3450
|
if (!cell) continue;
|
|
3441
|
-
display[r][c] = sanitizeText(cell.text).replace(/\n/g, "<br>");
|
|
3451
|
+
display[r][c] = escapeGfm(sanitizeText(cell.text).replace(/\n/g, "<br>"), true);
|
|
3442
3452
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
3443
3453
|
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
3444
3454
|
if (dr === 0 && dc === 0) continue;
|
|
@@ -3485,6 +3495,223 @@ var HEADING_RATIO_H1 = 1.5;
|
|
|
3485
3495
|
var HEADING_RATIO_H2 = 1.3;
|
|
3486
3496
|
var HEADING_RATIO_H3 = 1.15;
|
|
3487
3497
|
|
|
3498
|
+
// src/hwp5/equation.ts
|
|
3499
|
+
// Lookup table from a bare equation-script word to the LaTeX command that
// replaces it. A plain string entry maps to the same name with a leading
// backslash; a pair entry spells out a LaTeX name that differs from the word.
var WORD_COMMANDS = /* @__PURE__ */ new Map(
  [
    // Greek letters
    "alpha", "beta", "gamma", "delta", "epsilon", "theta", "lambda",
    "mu", "pi", "sigma", "tau", "phi", "omega",
    // Named functions
    "sin", "cos", "tan", "sec", "csc", "cot", "log", "ln", "lim",
    // Big operators and symbols
    ["inf", "\\infty"],
    "sum",
    ["smallsum", "\\sum"],
    "prod", "int", "oint",
    "rightarrow", "leftarrow", "partial", "nabla", "angle", "triangle",
    // Accents and delimiters
    "vec",
    ["bar", "\\overline"],
    "dot", "hat", "left", "right"
  ].map((entry) => (typeof entry === "string" ? [entry, `\\${entry}`] : entry))
);
|
|
3541
|
+
// Lookup table from an equation-script symbol word to its LaTeX replacement.
// Values are inserted verbatim, so an entry may expand to more than a single
// command (e.g. "deg" becomes a superscripted circle).
var SYMBOL_WORDS = /* @__PURE__ */ new Map([
  ["times", "\\times"],
  ["divide", "\\div"],
  ["div", "\\div"],
  ["le", "\\leq"],
  // "leq" added so the less-or-equal spellings mirror the "ge"/"geq" pair below.
  ["leq", "\\leq"],
  ["ge", "\\geq"],
  ["geq", "\\geq"],
  ["deg", "^\\circ"],
  ["rarrow", "\\rightarrow"],
  ["larrow", "\\leftarrow"],
  ["lrarrow", "\\leftrightarrow"],
  ["in", "\\in"],
  ["notin", "\\notin"],
  ["emptyset", "\\emptyset"],
  ["subset", "\\subset"],
  ["nsubset", "\\nsubseteq"],
  ["cup", "\\cup"],
  ["cap", "\\cap"],
  ["smallinter", "\\cap"],
  ["sim", "\\sim"],
  ["circ", "\\circ"],
  ["bot", "\\perp"],
  ["dyad", "\\overleftrightarrow"],
  ["arch", "\\overset{\\frown}"]
]);
|
|
3566
|
+
/**
 * Entry point of the equation converter: strips NUL characters and
 * surrounding whitespace from the script, then runs the conversion
 * pipeline at nesting depth 0.
 *
 * @param {string} equation - Raw equation script.
 * @returns {string} The converted LaTeX text.
 */
function hwpEquationToLatex(equation) {
  const withoutNuls = equation.replace(/\0/g, "");
  const script = withoutNuls.trim();
  return convertEquation(script, 0);
}
|
|
3569
|
+
/**
 * Run the full conversion pipeline over one equation fragment.
 *
 * The fragment is first normalized (whitespace runs collapsed, runs of
 * backticks and tildes turned into `\,`), then passed through each
 * conversion pass in a fixed order. Passes that recurse into sub-expressions
 * receive `depth` so the recursion can be capped.
 *
 * @param {string} equation - Equation fragment to convert.
 * @param {number} depth - Current recursion depth; above 12 the input is returned untouched.
 * @returns {string} The converted fragment.
 */
function convertEquation(equation, depth) {
  if (!equation || depth > 12) return equation;
  const normalized = equation
    .replace(/\s+/g, " ")
    .replace(/`+/g, "\\,")
    .replace(/~+/g, "\\,")
    .trim();
  // Order matters: each pass consumes the output of the previous one.
  const passes = [
    (text) => convertMatrixLike(text),
    (text) => convertRoots(text, depth),
    (text) => convertOver(text, depth),
    (text) => convertSqrt(text, depth),
    (text) => convertScripts(text),
    (text) => convertOperators(text),
    (text) => removeFontDirectives(text),
    (text) => convertWords(text),
    (text) => cleanupLatexSpacing(text)
  ];
  return passes.reduce((text, pass) => pass(text), normalized);
}
|
|
3583
|
+
/**
 * Convert brace-delimited `matrix{...}` and `cases{...}` constructs (with no
 * nested braces in the body) into LaTeX `matrix` / `cases` environments.
 *
 * In the equation script `#` separates rows, while `&` (left untouched in the
 * body) separates columns. Both environments therefore turn `#` into the
 * LaTeX row break `\\`; emitting `&` for it would collapse a matrix into a
 * single row.
 *
 * @param {string} input - Equation text.
 * @returns {string} Text with matrix/cases constructs replaced.
 */
function convertMatrixLike(input) {
  const joinRows = (body) =>
    body
      .split("#")
      .map((row) => row.trim())
      .join(" \\\\ ");
  return input
    .replace(
      /\bmatrix\s*\{([^{}]*)\}/gi,
      (_match, body) => `\\begin{matrix} ${joinRows(body)} \\end{matrix}`
    )
    .replace(
      /\bcases\s*\{([^{}]*)\}/gi,
      (_match, body) => `\\begin{cases} ${joinRows(body)} \\end{cases}`
    );
}
|
|
3592
|
+
/**
 * Rewrite `root <degree> of <radicand>` into `\sqrt[degree]{radicand}`.
 * Each operand is either a brace group without nested braces or a run of
 * non-whitespace characters, and is converted recursively one level deeper.
 *
 * @param {string} input - Equation text.
 * @param {number} depth - Current recursion depth.
 * @returns {string} Text with n-th roots converted.
 */
function convertRoots(input, depth) {
  const ROOT_OF = /(?<!\\)\broot\s+({[^{}]*}|\S+)\s+of\s+({[^{}]*}|\S+)/gi;
  const convertOperand = (operand) => convertEquation(unwrapGroup(operand), depth + 1);
  return input.replace(ROOT_OF, (_whole, degree, radicand) => {
    const index = convertOperand(degree);
    const body = convertOperand(radicand);
    return `\\sqrt[${index}]{${body}}`;
  });
}
|
|
3597
|
+
/**
 * Rewrite `sqrt <operand>` into `\sqrt{operand}`. Occurrences already
 * preceded by a backslash are left alone. The operand is a brace group
 * without nested braces or a run of non-whitespace characters, converted
 * recursively one level deeper.
 *
 * @param {string} input - Equation text.
 * @param {number} depth - Current recursion depth.
 * @returns {string} Text with square roots converted.
 */
function convertSqrt(input, depth) {
  const SQRT = /(?<!\\)\bsqrt\s*({[^{}]*}|\S+)/gi;
  return input.replace(SQRT, (_whole, radicand) => {
    const body = convertEquation(unwrapGroup(radicand), depth + 1);
    return `\\sqrt{${body}}`;
  });
}
|
|
3602
|
+
/**
 * Rewrite every top-level `<left> over <right>` into `\frac{left}{right}`.
 *
 * Works by repeatedly locating the first `over` outside braces/parentheses,
 * reading one atom on each side, and splicing the fraction in. Stops when no
 * `over` remains, when an operand cannot be read, or after 50 rewrites.
 *
 * @param {string} input - Equation text.
 * @param {number} depth - Current recursion depth (operands are converted at depth + 1).
 * @returns {string} Text with fractions converted.
 */
function convertOver(input, depth) {
  const KEYWORD = "over";
  let text = input;
  let remaining = 50; // hard cap on rewrites
  while (remaining-- > 0) {
    const at = findTopLevelWord(text, KEYWORD);
    if (at < 0) return text;
    const lhs = readLeftAtom(text, at);
    const rhs = readRightAtom(text, at + KEYWORD.length);
    if (!lhs || !rhs) return text;
    const numerator = convertEquation(unwrapGroup(lhs.atom), depth + 1);
    const denominator = convertEquation(unwrapGroup(rhs.atom), depth + 1);
    const head = text.slice(0, lhs.start);
    const tail = text.slice(rhs.end);
    text = `${head}\\frac{${numerator}}{${denominator}}${tail}`;
  }
  return text;
}
|
|
3616
|
+
/**
 * Normalize superscripts and subscripts: remove whitespace around `^` and
 * `_`, then wrap any script argument that is not already braced in `{...}`.
 * An unbraced argument extends up to the next whitespace, brace, `_` or `^`.
 *
 * @param {string} input - Equation text.
 * @returns {string} Text with braced script arguments.
 */
function convertScripts(input) {
  const tightened = input.replace(/\s*\^\s*/g, "^").replace(/\s*_\s*/g, "_");
  const withSuperscripts = tightened.replace(
    /\^(?!\{)([^\s{}_^]+)/g,
    (_whole, arg) => `^{${arg}}`
  );
  return withSuperscripts.replace(
    /_(?!\{)([^\s{}_^]+)/g,
    (_whole, arg) => `_{${arg}}`
  );
}
|
|
3619
|
+
/**
 * Replace ASCII operator digraphs and a few Unicode symbols with LaTeX
 * commands.
 *
 * Every replacement ends with a space so the control word is terminated even
 * when the next character is a letter. Without it `a<=b` would become
 * `a\leqb` (an undefined command) and `a+-b` would become `a\pmb`.
 *
 * Rules are applied in order, one full pass per rule.
 *
 * @param {string} input - Equation text.
 * @returns {string} Text with operators converted.
 */
function convertOperators(input) {
  const rules = [
    [/\+-/g, "\\pm "],
    [/-\+/g, "\\mp "],
    [/\/\//g, "\\parallel "],
    [/△/g, "\\triangle "],
    [/□/g, "\\square "],
    [/‧/g, "\\cdot "],
    [/!=/g, "\\neq "],
    [/<=/g, "\\leq "],
    [/>=/g, "\\geq "],
    [/==/g, "\\equiv "]
  ];
  return rules.reduce((text, [pattern, latex]) => text.replace(pattern, latex), input);
}
|
|
3622
|
+
/**
 * Drop standalone `rm` / `it` words (any letter case) together with the
 * whitespace that follows them. Occurrences directly preceded by a backslash
 * are kept.
 *
 * @param {string} input - Equation text.
 * @returns {string} Text without the font directive words.
 */
function removeFontDirectives(input) {
  const FONT_DIRECTIVE = /(?<!\\)\b(?:rm|it)\b\s*/gi;
  return input.replace(FONT_DIRECTIVE, () => "");
}
|
|
3625
|
+
/**
 * Replace known alphanumeric words with their LaTeX equivalents.
 *
 * A word starts with a letter and is not preceded by a backslash, letter or
 * digit. Lookup order: exact-case match in SYMBOL_WORDS, then lower-cased
 * match in SYMBOL_WORDS, then lower-cased match in WORD_COMMANDS. Unknown
 * words are left as they are.
 *
 * @param {string} input - Equation text.
 * @returns {string} Text with known words converted.
 */
function convertWords(input) {
  const WORD = /(?<![\\A-Za-z0-9])([A-Za-z][A-Za-z0-9]*)(?![A-Za-z0-9])/g;
  return input.replace(WORD, (token) => {
    const caseSensitiveHit = SYMBOL_WORDS.get(token);
    if (caseSensitiveHit) return caseSensitiveHit;
    const key = token.toLowerCase();
    const symbol = SYMBOL_WORDS.get(key);
    if (symbol != null) return symbol;
    const command = WORD_COMMANDS.get(key);
    return command != null ? command : token;
  });
}
|
|
3633
|
+
/**
 * Final whitespace/delimiter tidy-up of the generated LaTeX:
 * escapes braces that directly follow `\left` / `\right`, glues bracket
 * delimiters onto `\left` / `\right`, removes spaces around `\,`, collapses
 * whitespace runs, strips spaces just inside braces, and trims the result.
 *
 * @param {string} input - LaTeX text.
 * @returns {string} Tidied LaTeX text.
 */
function cleanupLatexSpacing(input) {
  // Applied strictly in this order; later rules rely on earlier ones.
  const rules = [
    [/\\left\s*\{/g, "\\left\\{"],
    [/\\right\s*\}/g, "\\right\\}"],
    [/\\left\s*([\[\]\(\)\|])/g, "\\left$1"],
    [/\\right\s*([\[\]\(\)\|])/g, "\\right$1"],
    [/\s*\\,\s*/g, "\\,"],
    [/\s+/g, " "],
    [/\{\s+/g, "{"],
    [/\s+\}/g, "}"]
  ];
  let text = input;
  for (const [pattern, replacement] of rules) {
    text = text.replace(pattern, replacement);
  }
  return text.trim();
}
|
|
3636
|
+
/**
 * Find the first case-insensitive occurrence of `word` that sits outside all
 * braces and parentheses and is not glued to an adjacent letter, digit or
 * underscore.
 *
 * @param {string} input - Text to scan.
 * @param {string} word - Lower-case word to look for.
 * @returns {number} Index of the match, or -1 when there is none.
 */
function findTopLevelWord(input, word) {
  const isIdentChar = (ch) => Boolean(ch) && /[A-Za-z0-9_]/.test(ch);
  const lastStart = input.length - word.length;
  let braceDepth = 0;
  let parenDepth = 0;
  for (let at = 0; at <= lastStart; at++) {
    // Track nesting first so a position on a bracket is judged with the updated depth.
    switch (input[at]) {
      case "{":
        braceDepth++;
        break;
      case "}":
        braceDepth = Math.max(0, braceDepth - 1);
        break;
      case "(":
        parenDepth++;
        break;
      case ")":
        parenDepth = Math.max(0, parenDepth - 1);
        break;
      default:
        break;
    }
    const nested = braceDepth !== 0 || parenDepth !== 0;
    if (nested) continue;
    const candidate = input.slice(at, at + word.length).toLowerCase();
    if (candidate !== word) continue;
    const gluedBefore = isIdentChar(input[at - 1]);
    const gluedAfter = isIdentChar(input[at + word.length]);
    if (!gluedBefore && !gluedAfter) return at;
  }
  return -1;
}
|
|
3652
|
+
/**
 * Read the operand that ends just before index `end`, skipping whitespace.
 *
 * If the operand ends with `}` or `)` and a matching opener is found, the
 * whole bracketed group is the atom. Otherwise the atom extends leftwards
 * until whitespace or one of `+ - = < >`.
 *
 * @param {string} input - Text to read from.
 * @param {number} end - Index just past the operand region (exclusive).
 * @returns {{start: number, atom: string} | null} Atom and its start index,
 *   or null when only whitespace precedes `end`.
 */
function readLeftAtom(input, end) {
  let pos = end - 1;
  while (pos >= 0 && /\s/.test(input[pos])) pos--;
  if (pos < 0) return null;

  const closer = input[pos];
  if (closer === "}" || closer === ")") {
    const opener = closer === "}" ? "{" : "(";
    const groupStart = findMatchingLeft(input, pos, opener, closer);
    if (groupStart >= 0) {
      return { start: groupStart, atom: input.slice(groupStart, pos + 1) };
    }
    // Unmatched bracket: fall through and treat it as part of a plain token.
  }

  let cursor = pos;
  while (cursor >= 0 && !/[\s+\-=<>]/.test(input[cursor])) cursor--;
  const tokenStart = cursor + 1;
  return { start: tokenStart, atom: input.slice(tokenStart, pos + 1) };
}
|
|
3668
|
+
/**
 * Read the operand that begins at or after index `start`, skipping
 * whitespace.
 *
 * If the operand starts with `{` or `(` and a matching closer is found, the
 * whole bracketed group is the atom. Otherwise the atom extends rightwards
 * until whitespace or one of `+ - = < >`.
 *
 * @param {string} input - Text to read from.
 * @param {number} start - Index where the operand region begins.
 * @returns {{end: number, atom: string} | null} Atom and the index just past
 *   it, or null when only whitespace follows `start`.
 */
function readRightAtom(input, start) {
  let pos = start;
  while (pos < input.length && /\s/.test(input[pos])) pos++;
  if (pos >= input.length) return null;

  const opener = input[pos];
  if (opener === "{" || opener === "(") {
    const closer = opener === "{" ? "}" : ")";
    const groupEnd = findMatchingRight(input, pos, opener, closer);
    if (groupEnd >= 0) {
      return { end: groupEnd + 1, atom: input.slice(pos, groupEnd + 1) };
    }
    // Unmatched bracket: fall through and treat it as part of a plain token.
  }

  let cursor = pos;
  while (cursor < input.length && !/[\s+\-=<>]/.test(input[cursor])) cursor++;
  return { end: cursor, atom: input.slice(pos, cursor) };
}
|
|
3684
|
+
/**
 * Scan leftwards from `closeIndex` for the opener that balances the closer
 * found there, honouring nesting of the same bracket pair.
 *
 * @param {string} input - Text to scan.
 * @param {number} closeIndex - Index to start from (expected to hold `close`).
 * @param {string} open - Opening bracket character.
 * @param {string} close - Closing bracket character.
 * @returns {number} Index of the balancing opener, or -1 if none is found.
 */
function findMatchingLeft(input, closeIndex, open, close) {
  let unmatched = 0;
  let cursor = closeIndex;
  while (cursor >= 0) {
    const ch = input[cursor];
    if (ch === close) {
      unmatched += 1;
    } else if (ch === open) {
      unmatched -= 1;
      if (unmatched === 0) return cursor;
    }
    cursor -= 1;
  }
  return -1;
}
|
|
3695
|
+
/**
 * Scan rightwards from `openIndex` for the closer that balances the opener
 * found there, honouring nesting of the same bracket pair.
 *
 * @param {string} input - Text to scan.
 * @param {number} openIndex - Index to start from (expected to hold `open`).
 * @param {string} open - Opening bracket character.
 * @param {string} close - Closing bracket character.
 * @returns {number} Index of the balancing closer, or -1 if none is found.
 */
function findMatchingRight(input, openIndex, open, close) {
  let unmatched = 0;
  let cursor = openIndex;
  while (cursor < input.length) {
    const ch = input[cursor];
    if (ch === open) {
      unmatched += 1;
    } else if (ch === close) {
      unmatched -= 1;
      if (unmatched === 0) return cursor;
    }
    cursor += 1;
  }
  return -1;
}
|
|
3706
|
+
/**
 * Trim the input and, if it is a single brace group, return its contents.
 *
 * The outer braces are removed only when the leading `{` is balanced by the
 * trailing `}`. Text such as `{a}+{b}` merely starts and ends with braces;
 * stripping those would leave the unbalanced `a}+{b`, so it is returned
 * (trimmed) as is. Unbalanced input is likewise returned unchanged.
 *
 * @param {string} input - Operand text, possibly wrapped in `{...}`.
 * @returns {string} The unwrapped contents, or the trimmed input.
 */
function unwrapGroup(input) {
  const trimmed = input.trim();
  if (!trimmed.startsWith("{") || !trimmed.endsWith("}")) return trimmed;
  let depth = 0;
  for (let i = 0; i < trimmed.length; i++) {
    const ch = trimmed[i];
    if (ch === "{") {
      depth++;
    } else if (ch === "}") {
      depth--;
      if (depth === 0) {
        // The first brace closes here; unwrap only if that is the final character.
        return i === trimmed.length - 1 ? trimmed.slice(1, -1) : trimmed;
      }
    }
  }
  return trimmed;
}
|
|
3711
|
+
/**
 * Tell whether `ch` is an ASCII letter, digit or underscore.
 *
 * @param {string | undefined} ch - Character to test; missing/empty yields false.
 * @returns {boolean} True for `[A-Za-z0-9_]`.
 */
function isWordChar(ch) {
  if (!ch) return false;
  return /[A-Za-z0-9_]/.test(ch);
}
|
|
3714
|
+
|
|
3488
3715
|
// src/hwpx/parser.ts
|
|
3489
3716
|
init_page_range();
|
|
3490
3717
|
init_logger();
|
|
@@ -4166,6 +4393,17 @@ function findDescendant(node, targetTag, depth = 0) {
|
|
|
4166
4393
|
}
|
|
4167
4394
|
return null;
|
|
4168
4395
|
}
|
|
4396
|
+
/**
 * Return the first direct element child of `node` whose tag name, with any
 * `prefix:` removed, equals `targetTag`. Only immediate children are
 * inspected; non-element nodes (nodeType other than 1) are skipped.
 *
 * @param {{childNodes?: ArrayLike<object>}} node - Parent node.
 * @param {string} targetTag - Unprefixed tag name to look for.
 * @returns {object | null} The matching child, or null.
 */
function findChildByLocalName(node, targetTag) {
  const kids = node.childNodes;
  if (!kids) return null;
  let index = 0;
  while (index < kids.length) {
    const candidate = kids[index++];
    if (candidate.nodeType === 1) {
      const qualifiedName = candidate.tagName || candidate.localName || "";
      const unprefixed = qualifiedName.replace(/^[^:]+:/, "");
      if (unprefixed === targetTag) return candidate;
    }
  }
  return null;
}
|
|
4169
4407
|
function extractDrawTextBlocks(drawTextNode, blocks, styleMap, sectionNum) {
|
|
4170
4408
|
const children = drawTextNode.childNodes;
|
|
4171
4409
|
if (!children) return;
|
|
@@ -4268,6 +4506,22 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
4268
4506
|
case "shapeComment":
|
|
4269
4507
|
case "drawText":
|
|
4270
4508
|
break;
|
|
4509
|
+
// 수식: <hp:equation> 내부의 <hp:script>에 HML/HULK-style 수식 본문이
|
|
4510
|
+
// 들어있음. hwpEquationToLatex로 LaTeX 변환 후 `$...$`로 래핑하여
|
|
4511
|
+
// 본문 텍스트에 인라인 삽입. 변환 실패/빈 결과는 조용히 드롭
|
|
4512
|
+
// (대체 텍스트 "수식입니다." 누출 방지는 기존 정규식이 처리).
|
|
4513
|
+
case "equation": {
|
|
4514
|
+
const script = findChildByLocalName(child, "script");
|
|
4515
|
+
const raw = script ? extractTextFromNode(script) : "";
|
|
4516
|
+
if (raw.trim()) {
|
|
4517
|
+
try {
|
|
4518
|
+
const latex = hwpEquationToLatex(raw).trim();
|
|
4519
|
+
if (latex) text += " $" + latex.replace(/\$/g, "\\$") + "$ ";
|
|
4520
|
+
} catch {
|
|
4521
|
+
}
|
|
4522
|
+
}
|
|
4523
|
+
break;
|
|
4524
|
+
}
|
|
4271
4525
|
// run 요소에서 charPrIDRef 추출
|
|
4272
4526
|
case "r": {
|
|
4273
4527
|
const runCharPr = child.getAttribute("charPrIDRef");
|
|
@@ -4334,8 +4588,13 @@ var TAG_CHAR_SHAPE = 68;
|
|
|
4334
4588
|
var TAG_CTRL_HEADER = 71;
|
|
4335
4589
|
var TAG_LIST_HEADER = 72;
|
|
4336
4590
|
var TAG_TABLE = 77;
|
|
4337
|
-
var
|
|
4338
|
-
var
|
|
4591
|
+
var TAG_EQEDIT = 88;
|
|
4592
|
+
var HWPTAG_BEGIN = 16;
|
|
4593
|
+
var TAG_ID_MAPPINGS = HWPTAG_BEGIN + 1;
|
|
4594
|
+
var TAG_FACE_NAME = HWPTAG_BEGIN + 3;
|
|
4595
|
+
var TAG_DOC_CHAR_SHAPE = HWPTAG_BEGIN + 5;
|
|
4596
|
+
var TAG_DOC_PARA_SHAPE = HWPTAG_BEGIN + 9;
|
|
4597
|
+
var TAG_DOC_STYLE = HWPTAG_BEGIN + 10;
|
|
4339
4598
|
var CHAR_LINE = 0;
|
|
4340
4599
|
var CHAR_SECTION_BREAK = 10;
|
|
4341
4600
|
var CHAR_PARA = 13;
|
|
@@ -4493,6 +4752,15 @@ function extractText(data) {
|
|
|
4493
4752
|
}
|
|
4494
4753
|
return result;
|
|
4495
4754
|
}
|
|
4755
|
+
/**
 * Decode the equation script stored in a binary record payload.
 *
 * Reads a little-endian 16-bit count at byte offset 4, then decodes that
 * many UTF-16LE code units starting at byte offset 6. NUL characters are
 * removed and the text is trimmed.
 * NOTE(review): the meaning of the first four bytes is not visible here.
 *
 * @param {Buffer} data - Record payload.
 * @returns {string | null} The script text, or null when the payload is too
 *   short, the count is zero, the count overruns the payload, or the decoded
 *   text is empty.
 */
function extractEquationText(data) {
  const HEADER_BYTES = 6;
  if (data.length < HEADER_BYTES) return null;
  const charCount = data.readUInt16LE(4);
  if (charCount <= 0) return null;
  const endOffset = HEADER_BYTES + charCount * 2;
  if (endOffset > data.length) return null;
  const decoded = data.subarray(HEADER_BYTES, endOffset).toString("utf16le");
  const cleaned = decoded.replace(/\0+/g, "").trim();
  return cleaned === "" ? null : cleaned;
}
|
|
4496
4764
|
|
|
4497
4765
|
// src/hwp5/aes.ts
|
|
4498
4766
|
var S_BOX = new Uint8Array([
|
|
@@ -5652,6 +5920,26 @@ function findViewTextSectionsLenient(lcfb, compressed) {
|
|
|
5652
5920
|
return sections.sort((a, b) => a.idx - b.idx).map((s) => s.content);
|
|
5653
5921
|
}
|
|
5654
5922
|
var TAG_SHAPE_COMPONENT = 74;
|
|
5923
|
+
// Control id string that marks an equation control.
var CTRL_ID_EQEDIT = "deqe";
/**
 * Tell whether a control id denotes an equation control. Both "deqe" and
 * its reversed spelling "eqed" are accepted.
 *
 * @param {string} ctrlId - Four-character control id.
 * @returns {boolean} True for an equation control id.
 */
function isEquationControlId(ctrlId) {
  switch (ctrlId) {
    case CTRL_ID_EQEDIT:
    case "eqed":
      return true;
    default:
      return false;
  }
}
|
|
5927
|
+
/**
 * Convert an equation script to LaTeX and wrap it as inline Markdown math
 * (`$...$`), backslash-escaping any `$` inside the LaTeX.
 *
 * @param {string} equation - Raw equation script.
 * @returns {string} `$latex$`, or an empty string when conversion yields nothing.
 */
function formatEquationForMarkdown(equation) {
  const latex = hwpEquationToLatex(equation);
  if (latex) {
    const escaped = latex.replace(/\$/g, "\\$");
    return ["$", escaped, "$"].join("");
  }
  return "";
}
|
|
5932
|
+
/**
 * Look for the equation record belonging to the control at `ctrlIdx` and
 * return it formatted for Markdown.
 *
 * Scans at most the next 9 records, stopping as soon as a record at the
 * control's own level or shallower is reached. The first record tagged
 * TAG_EQEDIT decides the result.
 *
 * @param {Array<{level: number, tagId: number, data: Buffer}>} records - Record list.
 * @param {number} ctrlIdx - Index of the control header record.
 * @returns {string | null} Formatted equation, or null when none is found or
 *   its text cannot be extracted.
 */
function extractEquationFromControl(records, ctrlIdx) {
  const parentLevel = records[ctrlIdx].level;
  const stop = Math.min(records.length, ctrlIdx + 10);
  for (let idx = ctrlIdx + 1; idx < stop; idx++) {
    const record = records[idx];
    if (record.level <= parentLevel) return null; // left the control's subtree
    if (record.tagId === TAG_EQEDIT) {
      const script = extractEquationText(record.data);
      if (!script) return null;
      return formatEquationForMarkdown(script);
    }
  }
  return null;
}
|
|
5655
5943
|
function extractBinDataId(records, ctrlIdx) {
|
|
5656
5944
|
const ctrlLevel = records[ctrlIdx].level;
|
|
5657
5945
|
for (let j = ctrlIdx + 1; j < records.length && j < ctrlIdx + 50; j++) {
|
|
@@ -5811,6 +6099,16 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
5811
6099
|
}
|
|
5812
6100
|
} else if (ctrlId === " elo" || ctrlId === "ole ") {
|
|
5813
6101
|
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
|
|
6102
|
+
} else if (isEquationControlId(ctrlId)) {
|
|
6103
|
+
const equation = extractEquationFromControl(records, i);
|
|
6104
|
+
if (equation) {
|
|
6105
|
+
const lastBlock = blocks[blocks.length - 1];
|
|
6106
|
+
if (lastBlock && lastBlock.type === "paragraph" && lastBlock.text) {
|
|
6107
|
+
lastBlock.text = lastBlock.text + " " + equation;
|
|
6108
|
+
} else {
|
|
6109
|
+
blocks.push({ type: "paragraph", text: equation, pageNumber: sectionNum });
|
|
6110
|
+
}
|
|
6111
|
+
}
|
|
5814
6112
|
} else if (ctrlId === "fn " || ctrlId === " nf " || ctrlId === "en " || ctrlId === " ne ") {
|
|
5815
6113
|
const noteText = extractNoteText(records, i);
|
|
5816
6114
|
if (noteText && blocks.length > 0) {
|
|
@@ -5843,6 +6141,13 @@ function extractNoteText(records, ctrlIdx) {
|
|
|
5843
6141
|
const t = extractText(r.data).trim();
|
|
5844
6142
|
if (t) texts.push(t);
|
|
5845
6143
|
}
|
|
6144
|
+
if (r.tagId === TAG_CTRL_HEADER && r.data.length >= 4) {
|
|
6145
|
+
const innerCtrlId = r.data.subarray(0, 4).toString("ascii");
|
|
6146
|
+
if (isEquationControlId(innerCtrlId)) {
|
|
6147
|
+
const equation = extractEquationFromControl(records, j);
|
|
6148
|
+
if (equation) texts.push(equation);
|
|
6149
|
+
}
|
|
6150
|
+
}
|
|
5846
6151
|
}
|
|
5847
6152
|
return texts.length > 0 ? texts.join(" ") : null;
|
|
5848
6153
|
}
|
|
@@ -5856,6 +6161,13 @@ function extractTextBoxText(records, ctrlIdx) {
|
|
|
5856
6161
|
const t = extractText(r.data).trim();
|
|
5857
6162
|
if (t) texts.push(t);
|
|
5858
6163
|
}
|
|
6164
|
+
if (r.tagId === TAG_CTRL_HEADER && r.data.length >= 4) {
|
|
6165
|
+
const innerCtrlId = r.data.subarray(0, 4).toString("ascii");
|
|
6166
|
+
if (isEquationControlId(innerCtrlId)) {
|
|
6167
|
+
const equation = extractEquationFromControl(records, j);
|
|
6168
|
+
if (equation) texts.push(equation);
|
|
6169
|
+
}
|
|
6170
|
+
}
|
|
5859
6171
|
}
|
|
5860
6172
|
return texts.length > 0 ? texts.join("\n") : null;
|
|
5861
6173
|
}
|
|
@@ -5924,6 +6236,12 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
5924
6236
|
i = nextIdx;
|
|
5925
6237
|
continue;
|
|
5926
6238
|
}
|
|
6239
|
+
if (isEquationControlId(ctrlId)) {
|
|
6240
|
+
const equation = extractEquationFromControl(records, i);
|
|
6241
|
+
if (equation) {
|
|
6242
|
+
text = text ? text + " " + equation : equation;
|
|
6243
|
+
}
|
|
6244
|
+
}
|
|
5927
6245
|
}
|
|
5928
6246
|
i++;
|
|
5929
6247
|
}
|
|
@@ -11233,528 +11551,6 @@ async function markdownToXlsx(markdown, options) {
|
|
|
11233
11551
|
return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
|
|
11234
11552
|
}
|
|
11235
11553
|
|
|
11236
|
-
// src/convert/index.ts
|
|
11237
|
-
var import_promises3 = require("fs/promises");
|
|
11238
|
-
|
|
11239
|
-
// src/convert/libreoffice.ts
|
|
11240
|
-
var import_libreoffice_convert = __toESM(require("libreoffice-convert"), 1);
|
|
11241
|
-
|
|
11242
|
-
// src/convert/error.ts
|
|
11243
|
-
var ConvertError = class extends Error {
|
|
11244
|
-
constructor(code, message) {
|
|
11245
|
-
super(message);
|
|
11246
|
-
this.code = code;
|
|
11247
|
-
this.name = "ConvertError";
|
|
11248
|
-
}
|
|
11249
|
-
};
|
|
11250
|
-
|
|
11251
|
-
// src/convert/installer.ts
|
|
11252
|
-
var import_os3 = require("os");
|
|
11253
|
-
var import_path5 = require("path");
|
|
11254
|
-
var import_promises2 = require("fs/promises");
|
|
11255
|
-
var import_fs4 = require("fs");
|
|
11256
|
-
var import_child_process4 = require("child_process");
|
|
11257
|
-
var installInFlight = null;
|
|
11258
|
-
var CACHE_DIR = (0, import_path5.join)((0, import_os3.homedir)(), ".cache", "kordoc", "libreoffice");
|
|
11259
|
-
var VERSION_FILE = (0, import_path5.join)(CACHE_DIR, "version");
|
|
11260
|
-
var PACKAGES = {
|
|
11261
|
-
darwin: {
|
|
11262
|
-
url: "https://ftp.osuosl.org/pub/tdf/libreoffice/stable/26.2.3/mac/x86_64/LibreOffice_26.2.3_MacOS_x86-64.dmg",
|
|
11263
|
-
binPath: "LibreOffice.app/Contents/MacOS/soffice",
|
|
11264
|
-
sizeMb: 300
|
|
11265
|
-
},
|
|
11266
|
-
linux: {
|
|
11267
|
-
url: "https://ftp.osuosl.org/pub/tdf/libreoffice/stable/26.2.3/deb/x86_64/LibreOffice_26.2.3_Linux_x86-64_deb.tar.gz",
|
|
11268
|
-
binPath: "opt/libreoffice26.2/program/soffice",
|
|
11269
|
-
sizeMb: 210
|
|
11270
|
-
},
|
|
11271
|
-
win32: {
|
|
11272
|
-
url: "https://ftp.osuosl.org/pub/tdf/libreoffice/stable/26.2.3/win/x86_64/LibreOffice_26.2.3_Win_x86-64.msi",
|
|
11273
|
-
binPath: "LibreOffice/program/soffice.exe",
|
|
11274
|
-
sizeMb: 360
|
|
11275
|
-
}
|
|
11276
|
-
};
|
|
11277
|
-
async function findInPath() {
|
|
11278
|
-
return new Promise((resolve4) => {
|
|
11279
|
-
const child = (0, import_child_process4.spawn)("soffice", ["--version"], { stdio: "ignore" });
|
|
11280
|
-
child.on("close", (code) => resolve4(code === 0 ? "soffice" : null));
|
|
11281
|
-
child.on("error", () => resolve4(null));
|
|
11282
|
-
});
|
|
11283
|
-
}
|
|
11284
|
-
async function findInCache() {
|
|
11285
|
-
const cachedBin = (0, import_path5.join)(CACHE_DIR, "bin", "soffice");
|
|
11286
|
-
try {
|
|
11287
|
-
await (0, import_promises2.access)(cachedBin);
|
|
11288
|
-
return cachedBin;
|
|
11289
|
-
} catch {
|
|
11290
|
-
return null;
|
|
11291
|
-
}
|
|
11292
|
-
}
|
|
11293
|
-
async function findInDefaultPaths() {
|
|
11294
|
-
const platform = process.platform;
|
|
11295
|
-
const paths = [];
|
|
11296
|
-
if (platform === "darwin") {
|
|
11297
|
-
paths.push(
|
|
11298
|
-
"/Applications/LibreOffice.app/Contents/MacOS/soffice",
|
|
11299
|
-
"/opt/homebrew/bin/soffice",
|
|
11300
|
-
"/usr/local/bin/soffice"
|
|
11301
|
-
);
|
|
11302
|
-
} else if (platform === "linux") {
|
|
11303
|
-
paths.push(
|
|
11304
|
-
"/usr/bin/soffice",
|
|
11305
|
-
"/usr/lib/libreoffice/program/soffice"
|
|
11306
|
-
);
|
|
11307
|
-
} else if (platform === "win32") {
|
|
11308
|
-
const pf = process.env["ProgramFiles"] ?? "C:\\Program Files";
|
|
11309
|
-
const pf86 = process.env["ProgramFiles(x86)"] ?? "C:\\Program Files (x86)";
|
|
11310
|
-
paths.push(
|
|
11311
|
-
(0, import_path5.join)(pf, "LibreOffice", "program", "soffice.exe"),
|
|
11312
|
-
(0, import_path5.join)(pf86, "LibreOffice", "program", "soffice.exe")
|
|
11313
|
-
);
|
|
11314
|
-
}
|
|
11315
|
-
for (const p of paths) {
|
|
11316
|
-
try {
|
|
11317
|
-
await (0, import_promises2.access)(p);
|
|
11318
|
-
return p;
|
|
11319
|
-
} catch {
|
|
11320
|
-
continue;
|
|
11321
|
-
}
|
|
11322
|
-
}
|
|
11323
|
-
return null;
|
|
11324
|
-
}
|
|
11325
|
-
async function downloadWithProgress(url, dest, totalBytes, onProgress) {
|
|
11326
|
-
const response = await fetch(url);
|
|
11327
|
-
if (!response.ok) throw new Error(`\uB2E4\uC6B4\uB85C\uB4DC \uC2E4\uD328: HTTP ${response.status} (${url})`);
|
|
11328
|
-
if (!response.body) throw new Error("\uB2E4\uC6B4\uB85C\uB4DC \uC2E4\uD328: response body \uC5C6\uC74C");
|
|
11329
|
-
const file = (0, import_fs4.createWriteStream)(dest);
|
|
11330
|
-
const reader = response.body.getReader();
|
|
11331
|
-
let downloaded = 0;
|
|
11332
|
-
try {
|
|
11333
|
-
while (true) {
|
|
11334
|
-
const { done, value } = await reader.read();
|
|
11335
|
-
if (done) break;
|
|
11336
|
-
if (!file.write(value)) {
|
|
11337
|
-
await new Promise((resolve4) => file.once("drain", resolve4));
|
|
11338
|
-
}
|
|
11339
|
-
downloaded += value.length;
|
|
11340
|
-
onProgress?.(downloaded, totalBytes);
|
|
11341
|
-
}
|
|
11342
|
-
} finally {
|
|
11343
|
-
reader.releaseLock();
|
|
11344
|
-
await new Promise((resolve4, reject) => {
|
|
11345
|
-
file.end((err) => err ? reject(err) : resolve4());
|
|
11346
|
-
});
|
|
11347
|
-
}
|
|
11348
|
-
}
|
|
11349
|
-
async function installForPlatform(pkg, onProgress) {
|
|
11350
|
-
const platform = process.platform;
|
|
11351
|
-
await (0, import_promises2.mkdir)(CACHE_DIR, { recursive: true });
|
|
11352
|
-
const downloadPath = (0, import_path5.join)(CACHE_DIR, `download-${Date.now()}`);
|
|
11353
|
-
await downloadWithProgress(pkg.url, downloadPath, pkg.sizeMb * 1024 * 1024, onProgress);
|
|
11354
|
-
try {
|
|
11355
|
-
if (platform === "darwin") {
|
|
11356
|
-
return await installMacOS(pkg, downloadPath);
|
|
11357
|
-
} else if (platform === "linux") {
|
|
11358
|
-
return await installLinux(pkg, downloadPath);
|
|
11359
|
-
} else if (platform === "win32") {
|
|
11360
|
-
return await installWindows(pkg, downloadPath);
|
|
11361
|
-
}
|
|
11362
|
-
} catch (err) {
|
|
11363
|
-
await (0, import_promises2.rm)(downloadPath, { force: true });
|
|
11364
|
-
throw err;
|
|
11365
|
-
}
|
|
11366
|
-
throw new ConvertError("UNSUPPORTED_PLATFORM", `${platform}\uC740 \uC790\uB3D9 \uC124\uCE58\uB97C \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4`);
|
|
11367
|
-
}
|
|
11368
|
-
/**
 * Installs LibreOffice on macOS from a downloaded disk image: mounts the
 * image with `hdiutil`, copies `LibreOffice.app` into the cache directory,
 * detaches the image, deletes the download and links the binary.
 *
 * @param {{binPath: string}} pkg - Package descriptor; `binPath` is joined
 *   onto the cache directory to locate the installed binary.
 * @param {string} downloadPath - Path of the downloaded disk image.
 * @returns {Promise<string>} The path returned by `createSymlink`.
 * @throws {Error} When mounting or copying fails, or a tool cannot be spawned.
 */
async function installMacOS(pkg, downloadPath) {
  // Runs a command to completion. Spawn failures (e.g. the tool is missing)
  // arrive as an 'error' event; without a listener they would surface as an
  // uncaught exception and leave the promise pending forever.
  const run = (command, args, describeFailure) => new Promise((resolve4, reject) => {
    const stderr = [];
    const child = import_child_process4.spawn(command, args);
    child.stderr?.on("data", (d) => stderr.push(d.toString()));
    child.on("error", reject);
    child.on(
      "close",
      (code) => code === 0 ? resolve4() : reject(new Error(describeFailure(code, stderr.join("").trim())))
    );
  });
  const mountPoint = `/Volumes/LibreOffice_${Date.now()}`;
  await run(
    "hdiutil",
    ["attach", "-nobrowse", "-noverify", "-mountpoint", mountPoint, downloadPath],
    (code, detail) => `dmg \uB9C8\uC6B4\uD2B8 \uC2E4\uD328 (code=${code}): ${detail}`
  );
  try {
    const appSource = import_path5.join(mountPoint, "LibreOffice.app");
    const appDest = import_path5.join(CACHE_DIR, "LibreOffice.app");
    // `cp -R src dest` nests src inside dest when dest already exists, so a
    // leftover bundle from an earlier partial install is removed first.
    await import_promises2.rm(appDest, { recursive: true, force: true });
    await run("cp", ["-R", appSource, appDest], () => ".app \uBCF5\uC0AC \uC2E4\uD328");
  } finally {
    // Detaching is best effort: neither a non-zero exit nor a spawn failure
    // should replace the outcome of the copy above.
    await new Promise((resolve4) => {
      const child = import_child_process4.spawn("hdiutil", ["detach", mountPoint]);
      child.on("error", () => resolve4());
      child.on("close", () => resolve4());
    });
  }
  await import_promises2.rm(downloadPath, { force: true });
  return await createSymlink(import_path5.join(CACHE_DIR, pkg.binPath));
}
|
|
11395
|
-
/**
 * Installs LibreOffice on Linux from a downloaded tarball: unpacks it into a
 * temporary directory, extracts every `.deb` found in its `DEBS` directory
 * into the cache directory with `dpkg-deb -x`, then links the binary.
 *
 * A missing `DEBS` directory is tolerated (nothing is extracted), but a
 * failing `tar` or `dpkg-deb` now aborts the installation instead of being
 * silently ignored. The temporary extract directory is always removed.
 *
 * @param {{binPath: string}} pkg - Package descriptor; `binPath` is joined
 *   onto the cache directory to locate the installed binary.
 * @param {string} downloadPath - Path of the downloaded tarball.
 * @returns {Promise<string>} The path returned by `createSymlink`.
 * @throws {Error} When unpacking or extracting fails, or a tool cannot be spawned.
 */
async function installLinux(pkg, downloadPath) {
  // Output of these tools is never read, so stdio is ignored rather than
  // piped; the 'error' listener covers spawn failures such as a missing tool.
  const run = (command, args, failureMessage) => new Promise((resolve4, reject) => {
    const child = import_child_process4.spawn(command, args, { stdio: "ignore" });
    child.on("error", reject);
    child.on("close", (code) => code === 0 ? resolve4() : reject(new Error(failureMessage)));
  });
  const extractDir = import_path5.join(CACHE_DIR, `extract-${Date.now()}`);
  await import_promises2.mkdir(extractDir, { recursive: true });
  try {
    await run("tar", ["xzf", downloadPath, "-C", extractDir], "\uC555\uCD95 \uD574\uC81C \uC2E4\uD328");
    const debsDir = import_path5.join(extractDir, "DEBS");
    let entries = [];
    try {
      entries = await import_promises2.readdir(debsDir);
    } catch (err) {
      // Only "there is no DEBS directory" is an accepted condition.
      if (err?.code !== "ENOENT" && err?.code !== "ENOTDIR") throw err;
    }
    for (const entry of entries) {
      if (!entry.endsWith(".deb")) continue;
      await run(
        "dpkg-deb",
        ["-x", import_path5.join(debsDir, entry), CACHE_DIR],
        `${entry} \uCD94\uCD9C \uC2E4\uD328`
      );
    }
  } finally {
    await import_promises2.rm(extractDir, { recursive: true, force: true });
  }
  await import_promises2.rm(downloadPath, { force: true });
  return await createSymlink(import_path5.join(CACHE_DIR, pkg.binPath));
}
|
|
11420
|
-
/**
 * Installs LibreOffice on Windows by running `msiexec /a` against the
 * downloaded MSI with the cache directory as `TARGETDIR`, then deletes the
 * download.
 *
 * @param {{binPath: string}} pkg - Package descriptor; `binPath` is joined
 *   onto the cache directory to build the returned path.
 * @param {string} downloadPath - Path of the downloaded MSI.
 * @returns {Promise<string>} Cache directory joined with `pkg.binPath`.
 * @throws {Error} When `msiexec` exits non-zero or cannot be spawned.
 */
async function installWindows(pkg, downloadPath) {
  await new Promise((resolve4, reject) => {
    // msiexec output is never read, so stdio is ignored instead of piped.
    const child = import_child_process4.spawn(
      "msiexec",
      ["/a", downloadPath, "/qn", `TARGETDIR=${CACHE_DIR}`],
      { stdio: "ignore" }
    );
    // A spawn failure is reported through 'error'; without this listener it
    // would be thrown as an uncaught exception and the promise would hang.
    child.on("error", reject);
    child.on("close", (code) => code === 0 ? resolve4() : reject(new Error("MSI \uC124\uCE58 \uC2E4\uD328")));
  });
  await import_promises2.rm(downloadPath, { force: true });
  return import_path5.join(CACHE_DIR, pkg.binPath);
}
|
|
11428
|
-
/**
 * Exposes the installed binary as `<cache>/bin/soffice` and puts that `bin`
 * directory at the front of `process.env.PATH`.
 *
 * @param {string} actualBin - Path of the real soffice binary.
 * @returns {Promise<string>} The link path, or `actualBin` itself when the
 *   link could not be created for a reason other than "already exists".
 */
async function createSymlink(actualBin) {
  const binDir = import_path5.join(CACHE_DIR, "bin");
  await import_promises2.mkdir(binDir, { recursive: true });
  const linkBin = import_path5.join(binDir, "soffice");
  try {
    await import_promises2.symlink(actualBin, linkBin);
  } catch (err) {
    // An existing link from an earlier install is fine. For any other
    // failure the link does not exist, so hand back the real binary instead
    // of a path that cannot be executed.
    if (err?.code !== "EEXIST") return actualBin;
  }
  // Prepend only once, and never produce "<binDir>:undefined" when PATH is unset.
  const currentPath = process.env.PATH ?? "";
  if (!currentPath.split(import_path5.delimiter).includes(binDir)) {
    process.env.PATH = currentPath === "" ? binDir : `${binDir}${import_path5.delimiter}${currentPath}`;
  }
  return linkBin;
}
|
|
11439
|
-
/**
 * Looks up the package entry for the current platform in `PACKAGES` and
 * installs it through `installForPlatform`.
 *
 * @param {(downloaded: number, total: number) => void} [onProgress] -
 *   Forwarded to `installForPlatform`.
 * @returns {Promise<string>} The value returned by `installForPlatform`.
 * @throws {ConvertError} `UNSUPPORTED_PLATFORM` when `PACKAGES` has no entry
 *   for `process.platform`.
 */
async function installLibreOffice(onProgress) {
  const { platform } = process;
  const platformPackage = PACKAGES[platform];
  if (platformPackage) {
    return await installForPlatform(platformPackage, onProgress);
  }
  const reason = `${platform}\uC740 \uC790\uB3D9 \uC124\uCE58\uB97C \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4. \uC218\uB3D9\uC73C\uB85C LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.`;
  throw new ConvertError("UNSUPPORTED_PLATFORM", reason);
}
|
|
11450
|
-
/**
 * Resolves a usable soffice path. Lookups run in order (`findInPath`,
 * `findInCache`, `findInDefaultPaths`) and the first hit wins. When nothing
 * is found and `autoInstall` is enabled, LibreOffice is installed; concurrent
 * callers share the single in-flight installation promise.
 *
 * @param {object} emitter - Receives `validate`, `install` and `error` calls.
 * @param {boolean} [autoInstall=true] - Install automatically when not found.
 * @returns {Promise<string>} Path of the soffice binary.
 * @throws {ConvertError} `SOFFICE_NOT_FOUND` when nothing is found and
 *   `autoInstall` is false; installation errors are rethrown unchanged.
 */
async function resolveSoffice(emitter, autoInstall = true) {
  emitter.validate("soffice_check", "LibreOffice \uAC00\uC6A9\uC131 \uD655\uC778 \uC911...");
  const lookups = [
    [findInPath, "\uC2DC\uC2A4\uD15C PATH\uC5D0\uC11C LibreOffice \uBC1C\uACAC"],
    [findInCache, "\uCE90\uC2DC\uB41C LibreOffice \uBC1C\uACAC"],
    [findInDefaultPaths, "\uAE30\uBCF8 \uACBD\uB85C\uC5D0\uC11C LibreOffice \uBC1C\uACAC"]
  ];
  for (const [find, foundMessage] of lookups) {
    const sofficePath = await find();
    if (sofficePath) {
      emitter.validate("soffice_found", foundMessage, { sofficePath });
      return sofficePath;
    }
  }
  if (!autoInstall) {
    emitter.error(
      "validate",
      "SOFFICE_NOT_FOUND",
      "LibreOffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4",
      "\uC218\uB3D9\uC73C\uB85C \uC124\uCE58\uD558\uAC70\uB098 autoInstallLibreOffice: true \uC635\uC158\uC744 \uC0AC\uC6A9\uD558\uC138\uC694."
    );
    throw new ConvertError("SOFFICE_NOT_FOUND", "LibreOffice\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4");
  }
  if (installInFlight) {
    // Another caller is already installing; its emitter receives the events.
    return installInFlight;
  }
  emitter.install("install_start", "LibreOffice \uC790\uB3D9 \uC124\uCE58\uB97C \uC2DC\uC791\uD569\uB2C8\uB2E4...");
  installInFlight = (async () => {
    try {
      const installed = await installLibreOffice((downloaded, total) => {
        // The total is only an estimate, so the raw ratio may exceed 100%,
        // and a zero total would yield NaN/Infinity. Report a value in 0..100.
        const ratio = total > 0 ? downloaded / total : 0;
        const rounded = Math.round(ratio * 100);
        const percent = Number.isFinite(rounded) ? Math.min(100, Math.max(0, rounded)) : 0;
        emitter.install("download_progress", `\uB2E4\uC6B4\uB85C\uB4DC \uC911... ${percent}%`, {
          percent,
          downloadedBytes: downloaded,
          totalBytes: total
        });
      });
      emitter.install("install_complete", "\uC124\uCE58 \uC644\uB8CC", { installedPath: installed });
      return installed;
    } catch (err) {
      const errorMsg = err instanceof Error ? err.message : String(err);
      emitter.install("install_failed", "\uC124\uCE58 \uC2E4\uD328", { error: errorMsg });
      throw err;
    } finally {
      // Allow a later call to start a fresh installation attempt.
      installInFlight = null;
    }
  })();
  return installInFlight;
}
|
|
11502
|
-
|
|
11503
|
-
// src/convert/libreoffice.ts
|
|
11504
|
-
// Local aliases for the two conversion entry points exposed on the default
// export of the module bound to `import_libreoffice_convert`.
var {
  convert: libreConvert,
  convertWithOptions: libreConvertWithOptions
} = import_libreoffice_convert.default;
|
|
11506
|
-
/**
 * Promise wrapper around the callback-based LibreOffice converters.
 *
 * When `sofficePath` is given, `libreConvertWithOptions` is called with that
 * binary and a file name of `source` plus `sourceExt`; otherwise plain
 * `libreConvert` is used. A timer rejects the promise after `timeoutMs`.
 *
 * @param {Buffer} buffer - Document bytes to convert.
 * @param {string} targetExt - Target extension passed to the converter.
 * @param {number} [timeoutMs=60000] - Time allowed before rejecting.
 * @param {string} [sofficePath] - Explicit soffice binary to use.
 * @param {string} [sourceExt] - Extension appended to the temporary file name.
 * @returns {Promise<Buffer>} The converted bytes handed to the callback.
 * @throws {ConvertError} `TIMEOUT` or `CONVERT_FAILED`.
 */
async function convertBuffer(buffer, targetExt, timeoutMs = 6e4, sofficePath, sourceExt) {
  return new Promise((resolve4, reject) => {
    const onTimeout = () => {
      reject(new ConvertError("TIMEOUT", `\uBCC0\uD658 \uD0C0\uC784\uC544\uC6C3 (${timeoutMs}ms \uCD08\uACFC)`));
    };
    const timer = setTimeout(onTimeout, timeoutMs);
    const onConverted = (err, done) => {
      clearTimeout(timer);
      if (!err && done) {
        resolve4(done);
        return;
      }
      const reason = err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328";
      reject(new ConvertError("CONVERT_FAILED", reason));
    };
    if (!sofficePath) {
      libreConvert(buffer, targetExt, void 0, onConverted);
      return;
    }
    const fileName = sourceExt ? `source${sourceExt}` : "source";
    const convertOptions = { sofficeBinaryPaths: [sofficePath], fileName };
    libreConvertWithOptions(buffer, targetExt, void 0, convertOptions, onConverted);
  });
}
|
|
11534
|
-
|
|
11535
|
-
// src/convert/events.ts
|
|
11536
|
-
/**
 * Single-listener event dispatcher for the conversion pipeline.
 *
 * Each helper builds an event object and passes it to `emit`. For helpers
 * that accept `meta`, the metadata is spread first so that it can add fields
 * but can never overwrite the `type`, `stage` or `message` of the event.
 */
var ConvertEventEmitter = class {
  listener = null;
  /** Registers the event listener, replacing any previous one. */
  setListener(listener) {
    this.listener = listener;
  }
  /** Delivers an event to the listener, if one is registered. */
  emit(event) {
    try {
      this.listener?.(event);
    } catch {
      // Deliberately ignored: a faulty listener must not break the conversion.
    }
  }
  /** Emits a `detect` event. */
  detect(stage, message, meta) {
    this.emit({ ...meta, type: "detect", stage, message });
  }
  /** Emits a `validate` event. */
  validate(stage, message, meta) {
    this.emit({ ...meta, type: "validate", stage, message });
  }
  /** Emits an `install` event. */
  install(stage, message, meta) {
    this.emit({ ...meta, type: "install", stage, message });
  }
  /** Emits a `convert` progress event carrying `percent`. */
  progress(percent, message) {
    this.emit({ type: "convert", stage: "convert_progress", message, percent });
  }
  /** Emits the `convert` start event (percent 0). */
  convertStart(message) {
    this.emit({ type: "convert", stage: "convert_start", message, percent: 0 });
  }
  /** Emits the `convert` done event (percent 100). */
  convertDone(message) {
    this.emit({ type: "convert", stage: "convert_done", message, percent: 100 });
  }
  /** Emits the final `complete` event carrying `result`. */
  complete(result) {
    this.emit({ type: "complete", stage: "success", message: "\uBCC0\uD658 \uC644\uB8CC", result });
  }
  /** Emits an `error` event; it is always flagged `recoverable: true`. */
  error(stage, code, message, suggestion) {
    this.emit({ type: "error", stage, code, message, recoverable: true, suggestion });
  }
};
|
|
11582
|
-
|
|
11583
|
-
// src/convert/index.ts
|
|
11584
|
-
// True while a conversion holds the lock.
var isConverting = false;
// Callbacks of callers waiting for the lock, in arrival order.
var queue = [];
/**
 * Acquires the module-wide conversion lock so that conversions run one at a
 * time. If the lock is free it is taken immediately; otherwise the caller is
 * queued and resumed, first in first out, when the holder releases.
 *
 * @returns {Promise<() => void>} A release function. It is idempotent:
 *   calling it more than once has no further effect, so a double release
 *   cannot let two queued callers run at the same time.
 */
async function acquireConvertLock() {
  const createRelease = () => {
    let released = false;
    return () => {
      if (released) return;
      released = true;
      isConverting = false;
      // Hand the lock straight to the next waiter, if any.
      const next = queue.shift();
      next?.();
    };
  };
  if (!isConverting) {
    isConverting = true;
    return createRelease();
  }
  return new Promise((resolve4) => {
    queue.push(() => {
      isConverting = true;
      resolve4(createRelease());
    });
  });
}
|
|
11606
|
-
/**
 * Converts an HWP or HWPX document to PDF through LibreOffice.
 *
 * Steps: read the input, reject files above 500 MB, detect the format,
 * resolve a soffice binary (optionally installing it), then convert while
 * holding the module-wide conversion lock. Failures are reported as a result
 * object rather than thrown.
 *
 * @param {string|Buffer|ArrayBuffer|Uint8Array} input - A file path, or the
 *   document bytes.
 * @param {object} [options]
 * @param {(event: object) => void} [options.onEvent] - Receives every event.
 * @param {(percent: number, message: string) => void} [options.onProgress] -
 *   Legacy callback; receives only `convert_progress` events. May be combined
 *   with `onEvent`; both are then invoked.
 * @param {boolean} [options.autoInstallLibreOffice=true]
 * @param {number} [options.timeoutMs] - Passed to `convertBuffer`.
 * @returns {Promise<object>} `{ success: true, pdf, sourceFormat }` or
 *   `{ success: false, code, error, stage }`.
 */
async function convertToPdf(input, options) {
  const emitter = new ConvertEventEmitter();
  const onEvent = options?.onEvent;
  const legacyProgress = options?.onProgress;
  if (onEvent || legacyProgress) {
    // One combined listener: registering the two callbacks separately would
    // make the second registration replace the first.
    emitter.setListener((event) => {
      try {
        onEvent?.(event);
      } finally {
        if (legacyProgress && event.type === "convert" && event.stage === "convert_progress") {
          legacyProgress(event.percent, event.message);
        }
      }
    });
  }
  // Emits an error event and builds the matching failure result.
  const fail = (stage, code, message) => {
    emitter.error(stage, code, message);
    return { success: false, code, error: message, stage };
  };
  try {
    emitter.detect("reading", "\uC785\uB825 \uD30C\uC77C \uC77D\uB294 \uC911...");
    let buffer;
    try {
      if (typeof input === "string") {
        buffer = await import_promises3.readFile(input);
      } else if (Buffer.isBuffer(input)) {
        buffer = input;
      } else {
        buffer = Buffer.from(input);
      }
    } catch (err) {
      return fail(
        "detect",
        "PARSE_ERROR",
        `\uC785\uB825 \uC77D\uAE30 \uC2E4\uD328: ${err instanceof Error ? err.message : String(err)}`
      );
    }
    const MAX_FILE_SIZE = 500 * 1024 * 1024;
    if (buffer.length > MAX_FILE_SIZE) {
      return fail(
        "detect",
        "FILE_TOO_LARGE",
        `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.length / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`
      );
    }
    const format = detectFormat(toArrayBuffer(buffer));
    emitter.detect("format_detected", `\uD3EC\uB9F7 \uAC10\uC9C0 \uC644\uB8CC: ${format}`, { format });
    if (format !== "hwp" && format !== "hwpx") {
      return fail("detect", "UNSUPPORTED_FORMAT", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD3EC\uB9F7\uC785\uB2C8\uB2E4: ${format}`);
    }
    // No "soffice_check" event here: resolveSoffice emits it itself, and
    // emitting it in both places delivered the event twice.
    let sofficePath;
    try {
      sofficePath = await resolveSoffice(emitter, options?.autoInstallLibreOffice ?? true);
    } catch (err) {
      if (err instanceof ConvertError) {
        return {
          success: false,
          code: err.code,
          error: err.message,
          stage: "validate"
        };
      }
      throw err;
    }
    const releaseLock = await acquireConvertLock();
    try {
      emitter.convertStart("\uBCC0\uD658 \uC2DC\uC791...");
      emitter.progress(10, "\uBCC0\uD658 \uC911...");
      const sourceExt = format === "hwpx" ? ".hwpx" : ".hwp";
      const pdf = await convertBuffer(buffer, ".pdf", options?.timeoutMs, sofficePath, sourceExt);
      emitter.progress(100, "\uBCC0\uD658 \uC644\uB8CC");
      emitter.convertDone("\uBCC0\uD658 \uC644\uB8CC");
      const result = {
        success: true,
        pdf: new Uint8Array(pdf),
        sourceFormat: format
      };
      emitter.complete({
        sourceFormat: format,
        pdfSize: pdf.length
      });
      return result;
    } catch (err) {
      if (err instanceof ConvertError) {
        return fail("convert", err.code, err.message);
      }
      const errorMsg = err instanceof Error ? err.message : "\uBCC0\uD658 \uC2E4\uD328";
      return fail("convert", classifyError(err), errorMsg);
    } finally {
      releaseLock();
    }
  } catch (unexpectedErr) {
    const errorMsg = unexpectedErr instanceof Error ? unexpectedErr.message : "\uC608\uC0C1\uCE58 \uBABB\uD55C \uC624\uB958";
    return fail("convert", "PARSE_ERROR", errorMsg);
  }
}
|
|
11733
|
-
/**
 * Converts a document with `convertToPdf` and accepts the result only when
 * the detected source format is `hwp`. The format check runs after the
 * conversion has finished; failed conversions are returned unchanged.
 *
 * @param {*} input - Forwarded to `convertToPdf`.
 * @param {object} [options] - Forwarded to `convertToPdf`.
 * @returns {Promise<object>} The `convertToPdf` result, or an
 *   `UNSUPPORTED_FORMAT` failure when a different format was converted.
 */
async function convertHwpToPdf(input, options) {
  const outcome = await convertToPdf(input, options);
  const isOtherFormat = outcome.success && outcome.sourceFormat !== "hwp";
  if (!isOtherFormat) {
    return outcome;
  }
  return {
    success: false,
    code: "UNSUPPORTED_FORMAT",
    error: `HWP 5.x \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${outcome.sourceFormat}`,
    stage: "detect"
  };
}
|
|
11745
|
-
/**
 * Converts a document with `convertToPdf` and accepts the result only when
 * the detected source format is `hwpx`. The format check runs after the
 * conversion has finished; failed conversions are returned unchanged.
 *
 * @param {*} input - Forwarded to `convertToPdf`.
 * @param {object} [options] - Forwarded to `convertToPdf`.
 * @returns {Promise<object>} The `convertToPdf` result, or an
 *   `UNSUPPORTED_FORMAT` failure when a different format was converted.
 */
async function convertHwpxToPdf(input, options) {
  const outcome = await convertToPdf(input, options);
  const isOtherFormat = outcome.success && outcome.sourceFormat !== "hwpx";
  if (!isOtherFormat) {
    return outcome;
  }
  return {
    success: false,
    code: "UNSUPPORTED_FORMAT",
    error: `HWPX \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${outcome.sourceFormat}`,
    stage: "detect"
  };
}
|
|
11757
|
-
|
|
11758
11554
|
// src/ocr/api-key-rotation.ts
|
|
11759
11555
|
var AllKeysCoolingDownError = class extends Error {
|
|
11760
11556
|
waitMs;
|
|
@@ -11849,9 +11645,9 @@ var ApiKeyRotationPool = class _ApiKeyRotationPool {
|
|
|
11849
11645
|
};
|
|
11850
11646
|
|
|
11851
11647
|
// src/pipeline/unified-ocr.ts
|
|
11852
|
-
var
|
|
11853
|
-
var
|
|
11854
|
-
var
|
|
11648
|
+
var import_promises2 = require("fs/promises");
|
|
11649
|
+
var import_path5 = require("path");
|
|
11650
|
+
var import_child_process4 = require("child_process");
|
|
11855
11651
|
var import_node_perf_hooks = require("perf_hooks");
|
|
11856
11652
|
init_logger();
|
|
11857
11653
|
|
|
@@ -11985,15 +11781,15 @@ function elapsedMs(startAt) {
|
|
|
11985
11781
|
return Math.round(import_node_perf_hooks.performance.now() - startAt);
|
|
11986
11782
|
}
|
|
11987
11783
|
async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
11988
|
-
const absInput = (0,
|
|
11989
|
-
const stem = (0,
|
|
11990
|
-
const workspaceDir = (0,
|
|
11991
|
-
const imagesDir = (0,
|
|
11992
|
-
const rawDir = (0,
|
|
11993
|
-
const diffDir = (0,
|
|
11994
|
-
const outputPath = (0,
|
|
11995
|
-
const reportPath = (0,
|
|
11996
|
-
const modelCachePath = (0,
|
|
11784
|
+
const absInput = (0, import_path5.resolve)(inputPath);
|
|
11785
|
+
const stem = (0, import_path5.basename)(absInput, (0, import_path5.extname)(absInput));
|
|
11786
|
+
const workspaceDir = (0, import_path5.resolve)(options.workspaceDir ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}_ocr_workspace`));
|
|
11787
|
+
const imagesDir = (0, import_path5.join)(workspaceDir, "images");
|
|
11788
|
+
const rawDir = (0, import_path5.join)(workspaceDir, "ocr", "raw");
|
|
11789
|
+
const diffDir = (0, import_path5.join)(workspaceDir, "ocr", "diff");
|
|
11790
|
+
const outputPath = (0, import_path5.resolve)(options.outputPath ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}.md`));
|
|
11791
|
+
const reportPath = (0, import_path5.join)(workspaceDir, "run-report.json");
|
|
11792
|
+
const modelCachePath = (0, import_path5.join)((0, import_path5.dirname)(absInput), ".kordoc-model-cache.json");
|
|
11997
11793
|
const baseUrl = options.baseUrl ?? "https://integrate.api.nvidia.com/v1/chat/completions";
|
|
11998
11794
|
const timeoutMs = options.timeoutMs ?? 6e4;
|
|
11999
11795
|
const maxRetriesPerPage = options.maxRetriesPerPage ?? 5;
|
|
@@ -12004,12 +11800,11 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
12004
11800
|
const models = sortModelsByCache(modelsInput, modelCache);
|
|
12005
11801
|
const modelMaxTokens = { ...DEFAULT_MODEL_MAX_TOKENS, ...options.modelMaxTokens ?? {} };
|
|
12006
11802
|
const stageWeights = normalizeWeights({ ...DEFAULT_STAGE_WEIGHTS, ...options.stageWeights ?? {} });
|
|
12007
|
-
const keyPool = ApiKeyRotationPool.fromEnv();
|
|
12008
11803
|
const runId = options.runId ?? generateRunId("ocr");
|
|
12009
11804
|
const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
|
|
12010
|
-
await (0,
|
|
12011
|
-
await (0,
|
|
12012
|
-
await (0,
|
|
11805
|
+
await (0, import_promises2.mkdir)(imagesDir, { recursive: true });
|
|
11806
|
+
await (0, import_promises2.mkdir)(rawDir, { recursive: true });
|
|
11807
|
+
await (0, import_promises2.mkdir)(diffDir, { recursive: true });
|
|
12013
11808
|
const timingsMs = {};
|
|
12014
11809
|
const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
|
|
12015
11810
|
const markStageProgress = (stage, stagePercent, current, total, message, model) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message, model });
|
|
@@ -12020,52 +11815,57 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
12020
11815
|
};
|
|
12021
11816
|
try {
|
|
12022
11817
|
ensureSupportedInput(absInput);
|
|
12023
|
-
let workingPdfPath = absInput;
|
|
12024
11818
|
const convertStart = import_node_perf_hooks.performance.now();
|
|
12025
11819
|
currentStage = "convert";
|
|
12026
|
-
|
|
12027
|
-
|
|
12028
|
-
|
|
12029
|
-
const
|
|
12030
|
-
|
|
12031
|
-
|
|
12032
|
-
|
|
12033
|
-
|
|
12034
|
-
|
|
12035
|
-
|
|
12036
|
-
|
|
12037
|
-
|
|
12038
|
-
|
|
12039
|
-
|
|
12040
|
-
}
|
|
12041
|
-
|
|
12042
|
-
|
|
12043
|
-
|
|
12044
|
-
|
|
12045
|
-
|
|
12046
|
-
|
|
12047
|
-
|
|
12048
|
-
|
|
12049
|
-
|
|
12050
|
-
|
|
12051
|
-
|
|
12052
|
-
|
|
12053
|
-
|
|
12054
|
-
await (0,
|
|
11820
|
+
if ((0, import_path5.extname)(absInput).toLowerCase() !== ".pdf") {
|
|
11821
|
+
markStageStart("convert", "\uC790\uCCB4 \uD30C\uC11C\uB85C Markdown \uBCC0\uD658 \uC911");
|
|
11822
|
+
logStage("info", "convert", "start", "\uC790\uCCB4 \uD30C\uC11C \uBCC0\uD658 \uC2DC\uC791", { input: absInput });
|
|
11823
|
+
const inputBuffer = await (0, import_promises2.readFile)(absInput);
|
|
11824
|
+
const parsed = await parseNativeDocument(inputBuffer);
|
|
11825
|
+
timingsMs.convert = elapsedMs(convertStart);
|
|
11826
|
+
markStageDone("convert", "\uC790\uCCB4 \uD30C\uC11C \uBCC0\uD658 \uC644\uB8CC");
|
|
11827
|
+
logStage("info", "convert", "done", "\uC790\uCCB4 \uD30C\uC11C \uBCC0\uD658 \uC644\uB8CC", { format: parsed.fileType, elapsedMs: timingsMs.convert });
|
|
11828
|
+
const mergeStart2 = import_node_perf_hooks.performance.now();
|
|
11829
|
+
currentStage = "merge";
|
|
11830
|
+
markStageStart("merge", "Markdown \uC800\uC7A5 \uC911");
|
|
11831
|
+
await (0, import_promises2.writeFile)(outputPath, parsed.markdown, "utf-8");
|
|
11832
|
+
timingsMs.merge = elapsedMs(mergeStart2);
|
|
11833
|
+
markStageDone("merge", "Markdown \uC800\uC7A5 \uC644\uB8CC");
|
|
11834
|
+
logStage("info", "merge", "done", "Markdown \uC800\uC7A5 \uC644\uB8CC", { outputPath, elapsedMs: timingsMs.merge });
|
|
11835
|
+
const report2 = {
|
|
11836
|
+
inputPath: absInput,
|
|
11837
|
+
outputPath,
|
|
11838
|
+
workspaceDir,
|
|
11839
|
+
selectedModel: "native-parser",
|
|
11840
|
+
probeImage: "",
|
|
11841
|
+
probeResults: [],
|
|
11842
|
+
pageCount: parsed.pageCount,
|
|
11843
|
+
sourceFormat: parsed.fileType,
|
|
11844
|
+
keyHealth: [],
|
|
11845
|
+
timingsMs,
|
|
11846
|
+
modelCachePath
|
|
11847
|
+
};
|
|
11848
|
+
await (0, import_promises2.writeFile)(reportPath, JSON.stringify(report2, null, 2), "utf-8");
|
|
11849
|
+
logStage("info", "finalize", "done", "native parse run-report \uC800\uC7A5 \uC644\uB8CC", { reportPath });
|
|
11850
|
+
return { outputPath, reportPath, selectedModel: "native-parser" };
|
|
12055
11851
|
}
|
|
11852
|
+
const workingPdfPath = absInput;
|
|
11853
|
+
markStageStart("convert", "PDF \uC785\uB825 \uD655\uC778 \uC911");
|
|
11854
|
+
logStage("info", "convert", "start", "PDF \uC785\uB825 \uD655\uC778", { input: absInput });
|
|
12056
11855
|
timingsMs.convert = elapsedMs(convertStart);
|
|
12057
|
-
markStageDone("convert", "PDF \
|
|
12058
|
-
logStage("info", "convert", "done", "PDF \
|
|
11856
|
+
markStageDone("convert", "PDF \uC785\uB825 \uD655\uC778 \uC644\uB8CC");
|
|
11857
|
+
logStage("info", "convert", "done", "PDF \uC785\uB825 \uD655\uC778 \uC644\uB8CC", { elapsedMs: timingsMs.convert });
|
|
11858
|
+
const keyPool = ApiKeyRotationPool.fromEnv();
|
|
12059
11859
|
const renderStart = import_node_perf_hooks.performance.now();
|
|
12060
11860
|
currentStage = "render";
|
|
12061
11861
|
const totalPages = await getPdfPageCount(workingPdfPath).catch(() => 0);
|
|
12062
11862
|
if (totalPages === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC218\uB97C \uD655\uC778\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.");
|
|
12063
11863
|
markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
|
|
12064
11864
|
logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi, totalPages });
|
|
12065
|
-
await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, (0,
|
|
12066
|
-
const firstFiles = (await (0,
|
|
11865
|
+
await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, (0, import_path5.join)(imagesDir, "page")]);
|
|
11866
|
+
const firstFiles = (await (0, import_promises2.readdir)(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
|
|
12067
11867
|
if (firstFiles.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uCCAB \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328");
|
|
12068
|
-
const probeImage = (0,
|
|
11868
|
+
const probeImage = (0, import_path5.join)(imagesDir, firstFiles[0]);
|
|
12069
11869
|
markStageProgress("render", Math.round(1 / totalPages * 100), 1, totalPages, `\uD398\uC774\uC9C0 1/${totalPages} \uB80C\uB354\uB9C1`);
|
|
12070
11870
|
const probeStart = import_node_perf_hooks.performance.now();
|
|
12071
11871
|
currentStage = "probe";
|
|
@@ -12101,7 +11901,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
12101
11901
|
const keyCount = keyPool.snapshot().length;
|
|
12102
11902
|
const workerCount = Math.max(1, keyCount * concurrencyPerKey);
|
|
12103
11903
|
const queueCapacity = workerCount * 2;
|
|
12104
|
-
const
|
|
11904
|
+
const queue = new BoundedQueue(queueCapacity);
|
|
12105
11905
|
const ocrStart = import_node_perf_hooks.performance.now();
|
|
12106
11906
|
currentStage = "ocr";
|
|
12107
11907
|
markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (\uC6CC\uCEE4 ${workerCount}\uAC1C)`);
|
|
@@ -12109,17 +11909,17 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
12109
11909
|
let renderDone = 1;
|
|
12110
11910
|
const renderProducer = (async () => {
|
|
12111
11911
|
try {
|
|
12112
|
-
await
|
|
11912
|
+
await queue.enqueue({ pageNumber: 1, imagePath: probeImage });
|
|
12113
11913
|
if (totalPages > 1) {
|
|
12114
|
-
for await (const item of renderPdfToPngStream(workingPdfPath, (0,
|
|
12115
|
-
await
|
|
11914
|
+
for await (const item of renderPdfToPngStream(workingPdfPath, (0, import_path5.join)(imagesDir, "page"), dpi, totalPages, 2)) {
|
|
11915
|
+
await queue.enqueue(item);
|
|
12116
11916
|
renderDone++;
|
|
12117
11917
|
markStageProgress("render", Math.round(renderDone / totalPages * 100), renderDone, totalPages, `\uD398\uC774\uC9C0 ${renderDone}/${totalPages} \uB80C\uB354\uB9C1`);
|
|
12118
11918
|
logStage("debug", "render", "progress", "\uD398\uC774\uC9C0 \uB80C\uB354 \uC644\uB8CC", { page: item.pageNumber });
|
|
12119
11919
|
}
|
|
12120
11920
|
}
|
|
12121
11921
|
} finally {
|
|
12122
|
-
|
|
11922
|
+
queue.close();
|
|
12123
11923
|
timingsMs.render = elapsedMs(renderStart);
|
|
12124
11924
|
markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
|
|
12125
11925
|
logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: renderDone, elapsedMs: timingsMs.render });
|
|
@@ -12128,7 +11928,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
12128
11928
|
const [, pageResultsMap] = await Promise.all([
|
|
12129
11929
|
renderProducer,
|
|
12130
11930
|
ocrWorkerPool({
|
|
12131
|
-
queue
|
|
11931
|
+
queue,
|
|
12132
11932
|
workerCount,
|
|
12133
11933
|
totalPages,
|
|
12134
11934
|
ocrInput: {
|
|
@@ -12161,8 +11961,8 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
12161
11961
|
const sortedEntries = Array.from(pageResultsMap.entries()).sort((a, b) => a[0] - b[0]);
|
|
12162
11962
|
const rawPagePaths = [];
|
|
12163
11963
|
for (const [pageNum, markdown] of sortedEntries) {
|
|
12164
|
-
const pagePath = (0,
|
|
12165
|
-
await (0,
|
|
11964
|
+
const pagePath = (0, import_path5.join)(rawDir, `page_${String(pageNum).padStart(4, "0")}.md`);
|
|
11965
|
+
await (0, import_promises2.writeFile)(pagePath, markdown, "utf-8");
|
|
12166
11966
|
rawPagePaths.push(pagePath);
|
|
12167
11967
|
}
|
|
12168
11968
|
const mergeStart = import_node_perf_hooks.performance.now();
|
|
@@ -12170,7 +11970,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
12170
11970
|
markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
|
|
12171
11971
|
logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
|
|
12172
11972
|
const merged = await mergeMarkdownPages(rawPagePaths);
|
|
12173
|
-
await (0,
|
|
11973
|
+
await (0, import_promises2.writeFile)(outputPath, merged, "utf-8");
|
|
12174
11974
|
timingsMs.merge = elapsedMs(mergeStart);
|
|
12175
11975
|
markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
|
|
12176
11976
|
logStage("info", "merge", "done", "\uCD5C\uC885 \uBCD1\uD569 \uC644\uB8CC", { outputPath, elapsedMs: timingsMs.merge });
|
|
@@ -12186,7 +11986,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
12186
11986
|
timingsMs,
|
|
12187
11987
|
modelCachePath
|
|
12188
11988
|
};
|
|
12189
|
-
await (0,
|
|
11989
|
+
await (0, import_promises2.writeFile)(reportPath, JSON.stringify(report, null, 2), "utf-8");
|
|
12190
11990
|
logStage("info", "finalize", "done", "run-report \uC800\uC7A5 \uC644\uB8CC", { reportPath });
|
|
12191
11991
|
return { outputPath, reportPath, selectedModel };
|
|
12192
11992
|
} catch (err) {
|
|
@@ -12261,7 +12061,7 @@ async function getPdfPageCount(pdfPath) {
|
|
|
12261
12061
|
return n;
|
|
12262
12062
|
}
|
|
12263
12063
|
async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, startPage = 1) {
|
|
12264
|
-
const imagesDir = (0,
|
|
12064
|
+
const imagesDir = (0, import_path5.dirname)(prefixPath);
|
|
12265
12065
|
for (let page = startPage; page <= totalPages; page++) {
|
|
12266
12066
|
try {
|
|
12267
12067
|
await runCommand("pdftoppm", [
|
|
@@ -12275,9 +12075,9 @@ async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, start
|
|
|
12275
12075
|
pdfPath,
|
|
12276
12076
|
prefixPath
|
|
12277
12077
|
]);
|
|
12278
|
-
const files = await (0,
|
|
12078
|
+
const files = await (0, import_promises2.readdir)(imagesDir);
|
|
12279
12079
|
const pageFiles = files.filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
|
|
12280
|
-
const imagePath = (0,
|
|
12080
|
+
const imagePath = (0, import_path5.join)(imagesDir, pageFiles[pageFiles.length - 1]);
|
|
12281
12081
|
yield { pageNumber: page, imagePath };
|
|
12282
12082
|
} catch (err) {
|
|
12283
12083
|
yield {
|
|
@@ -12290,7 +12090,7 @@ async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, start
|
|
|
12290
12090
|
}
|
|
12291
12091
|
async function runCommand(cmd, args) {
|
|
12292
12092
|
await new Promise((resolvePromise, reject) => {
|
|
12293
|
-
const child = (0,
|
|
12093
|
+
const child = (0, import_child_process4.spawn)(cmd, args, { stdio: "pipe" });
|
|
12294
12094
|
let stderr = "";
|
|
12295
12095
|
child.stderr.on("data", (d) => {
|
|
12296
12096
|
stderr += String(d);
|
|
@@ -12304,7 +12104,7 @@ async function runCommand(cmd, args) {
|
|
|
12304
12104
|
}
|
|
12305
12105
|
async function runCommandWithStdout(cmd, args) {
|
|
12306
12106
|
return await new Promise((resolvePromise, reject) => {
|
|
12307
|
-
const child = (0,
|
|
12107
|
+
const child = (0, import_child_process4.spawn)(cmd, args, { stdio: "pipe" });
|
|
12308
12108
|
let stdout = "";
|
|
12309
12109
|
let stderr = "";
|
|
12310
12110
|
child.stdout.on("data", (d) => {
|
|
@@ -12320,6 +12120,32 @@ async function runCommandWithStdout(cmd, args) {
|
|
|
12320
12120
|
});
|
|
12321
12121
|
});
|
|
12322
12122
|
}
|
|
12123
|
+
async function parseNativeDocument(buffer) {
|
|
12124
|
+
const arrayBuffer = toArrayBuffer(buffer);
|
|
12125
|
+
const format = detectFormat(arrayBuffer);
|
|
12126
|
+
let result;
|
|
12127
|
+
let fileType;
|
|
12128
|
+
if (format === "hwp") {
|
|
12129
|
+
result = parseHwp5Document(buffer);
|
|
12130
|
+
fileType = "hwp";
|
|
12131
|
+
} else if (format === "hwpx") {
|
|
12132
|
+
const { format: zipFormat, zip } = await detectZipFormat(arrayBuffer);
|
|
12133
|
+
if (zipFormat === "xlsx") {
|
|
12134
|
+
result = await parseXlsxDocument(arrayBuffer, void 0, zip ?? void 0);
|
|
12135
|
+
fileType = "xlsx";
|
|
12136
|
+
} else if (zipFormat === "docx") {
|
|
12137
|
+
result = await parseDocxDocument(arrayBuffer, void 0, zip ?? void 0);
|
|
12138
|
+
fileType = "docx";
|
|
12139
|
+
} else {
|
|
12140
|
+
result = await parseHwpxDocument(arrayBuffer, void 0, zip ?? void 0);
|
|
12141
|
+
fileType = "hwpx";
|
|
12142
|
+
}
|
|
12143
|
+
} else {
|
|
12144
|
+
throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC790\uCCB4 \uD30C\uC11C\uB85C \uCC98\uB9AC\uD560 \uC218 \uC5C6\uB294 \uC785\uB825 \uD3EC\uB9F7: ${format}`);
|
|
12145
|
+
}
|
|
12146
|
+
const pageCount = result.metadata?.pageCount ?? Math.max(1, ...result.blocks.map((block) => block.pageNumber ?? 1));
|
|
12147
|
+
return { markdown: result.markdown, fileType, pageCount };
|
|
12148
|
+
}
|
|
12323
12149
|
function naturalPageSort(a, b) {
|
|
12324
12150
|
const na = Number((a.match(/\d+/g) || ["0"]).at(-1) || 0);
|
|
12325
12151
|
const nb = Number((b.match(/\d+/g) || ["0"]).at(-1) || 0);
|
|
@@ -12393,7 +12219,7 @@ function startParallelProbeRuns(input) {
|
|
|
12393
12219
|
}
|
|
12394
12220
|
async function loadModelCache(path) {
|
|
12395
12221
|
try {
|
|
12396
|
-
const raw = await (0,
|
|
12222
|
+
const raw = await (0, import_promises2.readFile)(path, "utf-8");
|
|
12397
12223
|
return JSON.parse(raw);
|
|
12398
12224
|
} catch {
|
|
12399
12225
|
return null;
|
|
@@ -12424,15 +12250,15 @@ async function updateModelCache(path, probes) {
|
|
|
12424
12250
|
}
|
|
12425
12251
|
}
|
|
12426
12252
|
current.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
12427
|
-
await (0,
|
|
12253
|
+
await (0, import_promises2.writeFile)(path, JSON.stringify(current, null, 2), "utf-8");
|
|
12428
12254
|
}
|
|
12429
12255
|
async function ocrWorkerPool(input) {
|
|
12430
|
-
const { queue
|
|
12256
|
+
const { queue, workerCount, ocrInput, onPageDone } = input;
|
|
12431
12257
|
const results = /* @__PURE__ */ new Map();
|
|
12432
12258
|
let completedCount = 0;
|
|
12433
12259
|
async function worker() {
|
|
12434
12260
|
while (true) {
|
|
12435
|
-
const item = await
|
|
12261
|
+
const item = await queue.dequeue();
|
|
12436
12262
|
if (item === QUEUE_DONE) break;
|
|
12437
12263
|
const { pageNumber, imagePath, error } = item;
|
|
12438
12264
|
if (imagePath === null) {
|
|
@@ -12484,7 +12310,7 @@ async function ocrImageWithFallback(input) {
|
|
|
12484
12310
|
async function mergeMarkdownPages(paths) {
|
|
12485
12311
|
const out = [];
|
|
12486
12312
|
for (let i = 0; i < paths.length; i++) {
|
|
12487
|
-
const txt = (await (0,
|
|
12313
|
+
const txt = (await (0, import_promises2.readFile)(paths[i], "utf-8")).trim();
|
|
12488
12314
|
if (!txt) continue;
|
|
12489
12315
|
out.push(txt);
|
|
12490
12316
|
}
|
|
@@ -12600,7 +12426,7 @@ async function ocrImageViaNim(input) {
|
|
|
12600
12426
|
throw new UnifiedOcrError("OCR_FAILED", "ocr", `OCR \uC7AC\uC2DC\uB3C4 \uCD08\uACFC: ${lastErr}`);
|
|
12601
12427
|
}
|
|
12602
12428
|
async function encodeBase64(path) {
|
|
12603
|
-
const b = await (0,
|
|
12429
|
+
const b = await (0, import_promises2.readFile)(path);
|
|
12604
12430
|
return b.toString("base64");
|
|
12605
12431
|
}
|
|
12606
12432
|
function stripCodeFence3(text) {
|
|
@@ -12612,7 +12438,7 @@ async function delay(ms) {
|
|
|
12612
12438
|
await new Promise((resolvePromise) => setTimeout(resolvePromise, ms));
|
|
12613
12439
|
}
|
|
12614
12440
|
function ensureSupportedInput(path) {
|
|
12615
|
-
const ext = (0,
|
|
12441
|
+
const ext = (0, import_path5.extname)(path).toLowerCase();
|
|
12616
12442
|
const allowed = /* @__PURE__ */ new Set([".pdf", ".hwp", ".hwpx", ".docx", ".xlsx"]);
|
|
12617
12443
|
if (!allowed.has(ext)) {
|
|
12618
12444
|
throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uC785\uB825 \uD3EC\uB9F7: ${ext}`);
|
|
@@ -12620,16 +12446,6 @@ function ensureSupportedInput(path) {
|
|
|
12620
12446
|
}
|
|
12621
12447
|
function normalizePipelineError(err, stage) {
|
|
12622
12448
|
if (err instanceof UnifiedOcrError) return err;
|
|
12623
|
-
if (err instanceof ConvertError) {
|
|
12624
|
-
const codeMap = {
|
|
12625
|
-
SOFFICE_NOT_FOUND: "SOFFICE_NOT_FOUND",
|
|
12626
|
-
CONVERT_FAILED: "CONVERT_FAILED",
|
|
12627
|
-
TIMEOUT: "CONVERT_FAILED",
|
|
12628
|
-
UNSUPPORTED_PLATFORM: "CONVERT_FAILED",
|
|
12629
|
-
UNSUPPORTED_FORMAT: "UNSUPPORTED_INPUT"
|
|
12630
|
-
};
|
|
12631
|
-
return new UnifiedOcrError(codeMap[err.code] ?? "CONVERT_FAILED", stage, err.message);
|
|
12632
|
-
}
|
|
12633
12449
|
const message = err instanceof Error ? err.message : String(err);
|
|
12634
12450
|
const codeByStage = {
|
|
12635
12451
|
convert: "CONVERT_FAILED",
|
|
@@ -12649,7 +12465,7 @@ async function parse2(input, options) {
|
|
|
12649
12465
|
let buffer;
|
|
12650
12466
|
if (typeof input === "string") {
|
|
12651
12467
|
try {
|
|
12652
|
-
const buf = await (0,
|
|
12468
|
+
const buf = await (0, import_promises3.readFile)(input);
|
|
12653
12469
|
buffer = toArrayBuffer(buf);
|
|
12654
12470
|
} catch (err) {
|
|
12655
12471
|
const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
|
|
@@ -12809,9 +12625,6 @@ async function parseDocx(buffer, options, zip) {
|
|
|
12809
12625
|
VERSION,
|
|
12810
12626
|
blocksToMarkdown,
|
|
12811
12627
|
compare,
|
|
12812
|
-
convertHwpToPdf,
|
|
12813
|
-
convertHwpxToPdf,
|
|
12814
|
-
convertToPdf,
|
|
12815
12628
|
detectFormat,
|
|
12816
12629
|
detectZipFormat,
|
|
12817
12630
|
diffBlocks,
|