@clazic/kordoc 2.7.5 → 2.7.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +1 -29
- package/dist/{chunk-6DUCYZRR.js → chunk-URSQEMVJ.js} +345 -523
- package/dist/chunk-URSQEMVJ.js.map +1 -0
- package/dist/{chunk-5CIZV5C3.js → chunk-X7UUXEMM.js} +2 -2
- package/dist/cli.js +5 -87
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +447 -634
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +4 -135
- package/dist/index.d.ts +4 -135
- package/dist/index.js +440 -624
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +2 -43
- package/dist/mcp.js.map +1 -1
- package/dist/{utils-NR7YWMWB.js → utils-QQVZGOGU.js} +2 -2
- package/dist/{watch-LDX5GPEE.js → watch-RQYUNSSH.js} +3 -3
- package/package.json +1 -2
- package/dist/chunk-6DUCYZRR.js.map +0 -1
- /package/dist/{chunk-5CIZV5C3.js.map → chunk-X7UUXEMM.js.map} +0 -0
- /package/dist/{utils-NR7YWMWB.js.map → utils-QQVZGOGU.js.map} +0 -0
- /package/dist/{watch-LDX5GPEE.js.map → watch-RQYUNSSH.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -3059,7 +3059,7 @@ var init_provider = __esm({
|
|
|
3059
3059
|
});
|
|
3060
3060
|
|
|
3061
3061
|
// src/index.ts
|
|
3062
|
-
import { readFile as
|
|
3062
|
+
import { readFile as readFile2 } from "fs/promises";
|
|
3063
3063
|
|
|
3064
3064
|
// src/detect.ts
|
|
3065
3065
|
import JSZip from "jszip";
|
|
@@ -3112,7 +3112,7 @@ import JSZip2 from "jszip";
|
|
|
3112
3112
|
import { DOMParser } from "@xmldom/xmldom";
|
|
3113
3113
|
|
|
3114
3114
|
// src/utils.ts
|
|
3115
|
-
var VERSION = true ? "2.7.
|
|
3115
|
+
var VERSION = true ? "2.7.6" : "0.0.0-dev";
|
|
3116
3116
|
function toArrayBuffer(buf) {
|
|
3117
3117
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
3118
3118
|
return buf.buffer;
|
|
@@ -3319,13 +3319,21 @@ function sanitizeText(text) {
|
|
|
3319
3319
|
}
|
|
3320
3320
|
return result;
|
|
3321
3321
|
}
|
|
3322
|
+
/**
 * Escape GFM-significant characters so literal document text is not
 * re-interpreted as Markdown syntax.
 *
 * `~` is always escaped; `|` is escaped only when `inTableCell` is true.
 *
 * A delimiter counts as already escaped only when it is preceded by an ODD
 * number of backslashes. An even run (e.g. `\\~`) is a sequence of escaped
 * backslashes followed by a bare delimiter, so the delimiter still receives
 * its own backslash.
 *
 * @param {string} text - Text to escape; falsy values are returned unchanged.
 * @param {boolean} [inTableCell=false] - Also escape `|` when true.
 * @returns {string} The escaped text.
 */
function escapeGfm(text, inTableCell = false) {
  if (!text) return text;
  const source = String(text);
  let escaped = "";
  // Length of the backslash run that ends immediately before the current character.
  let backslashRun = 0;
  for (let i = 0; i < source.length; i++) {
    const ch = source[i];
    const isDelimiter = ch === "~" || (inTableCell && ch === "|");
    if (isDelimiter && backslashRun % 2 === 0) escaped += "\\";
    escaped += ch;
    backslashRun = ch === "\\" ? backslashRun + 1 : 0;
  }
  return escaped;
}
|
|
3322
3330
|
function blocksToMarkdown(blocks) {
|
|
3323
3331
|
const lines = [];
|
|
3324
3332
|
for (let i = 0; i < blocks.length; i++) {
|
|
3325
3333
|
const block = blocks[i];
|
|
3326
3334
|
if (block.type === "heading" && block.text) {
|
|
3327
3335
|
const prefix = "#".repeat(Math.min(block.level || 2, 6));
|
|
3328
|
-
const headingText = sanitizeText(block.text);
|
|
3336
|
+
const headingText = escapeGfm(sanitizeText(block.text), false);
|
|
3329
3337
|
if (headingText) lines.push("", `${prefix} ${headingText}`, "");
|
|
3330
3338
|
continue;
|
|
3331
3339
|
}
|
|
@@ -3338,42 +3346,47 @@ function blocksToMarkdown(blocks) {
|
|
|
3338
3346
|
continue;
|
|
3339
3347
|
}
|
|
3340
3348
|
if (block.type === "list" && block.text) {
|
|
3341
|
-
const
|
|
3342
|
-
if (!
|
|
3343
|
-
const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(
|
|
3349
|
+
const sanitized = sanitizeText(block.text);
|
|
3350
|
+
if (!sanitized) continue;
|
|
3351
|
+
const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(sanitized);
|
|
3344
3352
|
const prefix = alreadyNumbered ? "" : block.listType === "ordered" ? "1. " : "- ";
|
|
3353
|
+
const listText = escapeGfm(sanitized, false);
|
|
3345
3354
|
lines.push(`${prefix}${listText}`);
|
|
3346
3355
|
if (block.children) {
|
|
3347
3356
|
for (const child of block.children) {
|
|
3348
3357
|
const childPrefix = child.listType === "ordered" ? "1." : "-";
|
|
3349
|
-
|
|
3358
|
+
const childText = child.text ? escapeGfm(sanitizeText(child.text), false) : "";
|
|
3359
|
+
lines.push(` ${childPrefix} ${childText}`);
|
|
3350
3360
|
}
|
|
3351
3361
|
}
|
|
3352
3362
|
continue;
|
|
3353
3363
|
}
|
|
3354
3364
|
if (block.type === "paragraph" && block.text) {
|
|
3355
|
-
|
|
3356
|
-
if (!
|
|
3357
|
-
if (/^\[별표\s*\d+/.test(
|
|
3365
|
+
const sanitized = sanitizeText(block.text);
|
|
3366
|
+
if (!sanitized) continue;
|
|
3367
|
+
if (/^\[별표\s*\d+/.test(sanitized)) {
|
|
3358
3368
|
const nextBlock = blocks[i + 1];
|
|
3369
|
+
const escapedSelf = escapeGfm(sanitized, false);
|
|
3359
3370
|
if (nextBlock?.type === "paragraph" && nextBlock.text && /관련\)?$/.test(nextBlock.text)) {
|
|
3360
|
-
|
|
3371
|
+
const nextEscaped = escapeGfm(sanitizeText(nextBlock.text), false);
|
|
3372
|
+
lines.push("", `## ${escapedSelf} ${nextEscaped}`, "");
|
|
3361
3373
|
i++;
|
|
3362
3374
|
} else {
|
|
3363
|
-
lines.push("", `## ${
|
|
3375
|
+
lines.push("", `## ${escapedSelf}`, "");
|
|
3364
3376
|
}
|
|
3365
3377
|
continue;
|
|
3366
3378
|
}
|
|
3367
|
-
if (/^\([^)]*조[^)]*관련\)$/.test(
|
|
3368
|
-
lines.push(`*${
|
|
3379
|
+
if (/^\([^)]*조[^)]*관련\)$/.test(sanitized)) {
|
|
3380
|
+
lines.push(`*${escapeGfm(sanitized, false)}*`, "");
|
|
3369
3381
|
continue;
|
|
3370
3382
|
}
|
|
3383
|
+
let text = escapeGfm(sanitized, false);
|
|
3371
3384
|
if (block.href) {
|
|
3372
3385
|
const href = sanitizeHref(block.href);
|
|
3373
3386
|
if (href) text = `[${text}](${href})`;
|
|
3374
3387
|
}
|
|
3375
3388
|
if (block.footnoteText) {
|
|
3376
|
-
text += ` (\uC8FC: ${block.footnoteText})`;
|
|
3389
|
+
text += ` (\uC8FC: ${escapeGfm(block.footnoteText, false)})`;
|
|
3377
3390
|
}
|
|
3378
3391
|
lines.push(text);
|
|
3379
3392
|
} else if (block.type === "table" && block.table) {
|
|
@@ -3398,13 +3411,13 @@ function tableToMarkdown(table) {
|
|
|
3398
3411
|
return content.split(/\n/).map((line) => {
|
|
3399
3412
|
const trimmed = line.trim();
|
|
3400
3413
|
if (!trimmed) return "";
|
|
3401
|
-
if (/^\d+\.\s/.test(trimmed)) return `**${trimmed}**`;
|
|
3402
|
-
if (/^[가-힣]\.\s/.test(trimmed)) return ` ${trimmed}`;
|
|
3403
|
-
return trimmed;
|
|
3414
|
+
if (/^\d+\.\s/.test(trimmed)) return `**${escapeGfm(trimmed, false)}**`;
|
|
3415
|
+
if (/^[가-힣]\.\s/.test(trimmed)) return ` ${escapeGfm(trimmed, false)}`;
|
|
3416
|
+
return escapeGfm(trimmed, false);
|
|
3404
3417
|
}).filter(Boolean).join("\n");
|
|
3405
3418
|
}
|
|
3406
3419
|
if (numCols === 1 && numRows >= 2) {
|
|
3407
|
-
return cells.map((row) => sanitizeText(row[0].text).replace(/\n/g, " ")).filter(Boolean).join("\n");
|
|
3420
|
+
return cells.map((row) => escapeGfm(sanitizeText(row[0].text).replace(/\n/g, " "), false)).filter(Boolean).join("\n");
|
|
3408
3421
|
}
|
|
3409
3422
|
const display = Array.from({ length: numRows }, () => Array(numCols).fill(""));
|
|
3410
3423
|
const skip = /* @__PURE__ */ new Set();
|
|
@@ -3413,7 +3426,7 @@ function tableToMarkdown(table) {
|
|
|
3413
3426
|
if (skip.has(`${r},${c}`)) continue;
|
|
3414
3427
|
const cell = cells[r]?.[c];
|
|
3415
3428
|
if (!cell) continue;
|
|
3416
|
-
display[r][c] = sanitizeText(cell.text).replace(/\n/g, "<br>");
|
|
3429
|
+
display[r][c] = escapeGfm(sanitizeText(cell.text).replace(/\n/g, "<br>"), true);
|
|
3417
3430
|
for (let dr = 0; dr < cell.rowSpan; dr++) {
|
|
3418
3431
|
for (let dc = 0; dc < cell.colSpan; dc++) {
|
|
3419
3432
|
if (dr === 0 && dc === 0) continue;
|
|
@@ -3460,6 +3473,223 @@ var HEADING_RATIO_H1 = 1.5;
|
|
|
3460
3473
|
var HEADING_RATIO_H2 = 1.3;
|
|
3461
3474
|
var HEADING_RATIO_H3 = 1.15;
|
|
3462
3475
|
|
|
3476
|
+
// src/hwp5/equation.ts
|
|
3477
|
+
// Lower-cased HWP equation keywords mapped to their LaTeX commands.
// convertWords() consults this table after SYMBOL_WORDS, using the lower-cased word.
// The Greek section covers every lower-case letter that has its own LaTeX command.
var WORD_COMMANDS = /* @__PURE__ */ new Map([
  // Greek letters
  ["alpha", "\\alpha"],
  ["beta", "\\beta"],
  ["gamma", "\\gamma"],
  ["delta", "\\delta"],
  ["epsilon", "\\epsilon"],
  ["zeta", "\\zeta"],
  ["eta", "\\eta"],
  ["theta", "\\theta"],
  ["iota", "\\iota"],
  ["kappa", "\\kappa"],
  ["lambda", "\\lambda"],
  ["mu", "\\mu"],
  ["nu", "\\nu"],
  ["xi", "\\xi"],
  ["pi", "\\pi"],
  ["rho", "\\rho"],
  ["sigma", "\\sigma"],
  ["tau", "\\tau"],
  ["upsilon", "\\upsilon"],
  ["phi", "\\phi"],
  ["chi", "\\chi"],
  ["psi", "\\psi"],
  ["omega", "\\omega"],
  // Named functions
  ["sin", "\\sin"],
  ["cos", "\\cos"],
  ["tan", "\\tan"],
  ["sec", "\\sec"],
  ["csc", "\\csc"],
  ["cot", "\\cot"],
  ["log", "\\log"],
  ["ln", "\\ln"],
  ["lim", "\\lim"],
  // Big operators and constants
  ["inf", "\\infty"],
  ["sum", "\\sum"],
  ["smallsum", "\\sum"],
  ["prod", "\\prod"],
  ["int", "\\int"],
  ["oint", "\\oint"],
  // Arrows and miscellaneous symbols
  ["rightarrow", "\\rightarrow"],
  ["leftarrow", "\\leftarrow"],
  ["partial", "\\partial"],
  ["nabla", "\\nabla"],
  ["angle", "\\angle"],
  ["triangle", "\\triangle"],
  // Accents and delimiters
  ["vec", "\\vec"],
  ["bar", "\\overline"],
  ["dot", "\\dot"],
  ["hat", "\\hat"],
  ["left", "\\left"],
  ["right", "\\right"]
]);
|
|
3519
|
+
// HWP equation symbol keywords mapped to LaTeX.
// convertWords() tries this table first with the exact word, then with its lower-cased form.
var SYMBOL_WORDS = /* @__PURE__ */ new Map([
  ["times", "\\times"],
  ["divide", "\\div"],
  ["div", "\\div"],
  ["le", "\\leq"],
  // Counterpart of "geq": both long spellings must resolve, not just one of them.
  ["leq", "\\leq"],
  ["ge", "\\geq"],
  ["geq", "\\geq"],
  ["deg", "^\\circ"],
  ["rarrow", "\\rightarrow"],
  ["larrow", "\\leftarrow"],
  ["lrarrow", "\\leftrightarrow"],
  ["in", "\\in"],
  ["notin", "\\notin"],
  ["emptyset", "\\emptyset"],
  ["subset", "\\subset"],
  ["nsubset", "\\nsubseteq"],
  ["cup", "\\cup"],
  ["cap", "\\cap"],
  ["smallinter", "\\cap"],
  ["sim", "\\sim"],
  ["circ", "\\circ"],
  ["bot", "\\perp"],
  ["dyad", "\\overleftrightarrow"],
  ["arch", "\\overset{\\frown}"]
]);
|
|
3544
|
+
/**
 * Convert an HWP equation script to LaTeX.
 *
 * NUL characters are removed and the script is trimmed before the
 * conversion passes start at recursion depth 0.
 *
 * @param {string} equation - Raw equation script.
 * @returns {string} The converted LaTeX text.
 */
function hwpEquationToLatex(equation) {
  const withoutNuls = equation.replace(/\0/g, "");
  const script = withoutNuls.trim();
  return convertEquation(script, 0);
}
|
|
3547
|
+
/**
 * Run every conversion pass over an equation fragment.
 *
 * The fragment is returned untouched when it is empty or when `depth`
 * exceeds 12, which bounds the recursion started by the root, fraction and
 * square-root passes.
 *
 * @param {string} equation - Equation fragment to convert.
 * @param {number} depth - Current recursion depth.
 * @returns {string} The converted fragment.
 */
function convertEquation(equation, depth) {
  if (!equation || depth > 12) return equation;
  // Collapse whitespace and turn runs of the ` and ~ spacing marks into a thin space.
  const normalized = equation.replace(/\s+/g, " ").replace(/`+/g, "\\,").replace(/~+/g, "\\,").trim();
  // Pass order matters: structural rewrites run before symbol and word substitution.
  const passes = [
    convertMatrixLike,
    (text) => convertRoots(text, depth),
    (text) => convertOver(text, depth),
    (text) => convertSqrt(text, depth),
    convertScripts,
    convertOperators,
    removeFontDirectives,
    convertWords,
    cleanupLatexSpacing
  ];
  return passes.reduce((text, pass) => pass(text), normalized);
}
|
|
3561
|
+
/**
 * Rewrite `matrix{...}` and `cases{...}` groups as LaTeX environments.
 *
 * Inside both constructs `#` separates rows, so it becomes the LaTeX row
 * break `\\`. Any `&` column separators in the body pass through unchanged.
 * Only groups without nested braces are matched.
 *
 * @param {string} input - Equation text.
 * @returns {string} Text with matrix/cases groups replaced.
 */
function convertMatrixLike(input) {
  const joinRows = (body) => body.split("#").map((row) => row.trim()).join(" \\\\ ");
  return input.replace(
    /\bmatrix\s*\{([^{}]*)\}/gi,
    (_match, body) => `\\begin{matrix} ${joinRows(body)} \\end{matrix}`
  ).replace(
    /\bcases\s*\{([^{}]*)\}/gi,
    (_match, body) => `\\begin{cases} ${joinRows(body)} \\end{cases}`
  );
}
|
|
3570
|
+
/**
 * Rewrite `root N of X` as `\sqrt[N]{X}`.
 *
 * Each operand is either a brace group without nested braces or a run of
 * non-space characters; both are converted recursively one level deeper.
 *
 * @param {string} input - Equation text.
 * @param {number} depth - Current recursion depth.
 * @returns {string} Text with root expressions replaced.
 */
function convertRoots(input, depth) {
  const rootPattern = /(?<!\\)\broot\s+({[^{}]*}|\S+)\s+of\s+({[^{}]*}|\S+)/gi;
  const nestedDepth = depth + 1;
  return input.replace(rootPattern, (_whole, degree, radicand) => {
    const index = convertEquation(unwrapGroup(degree), nestedDepth);
    const body = convertEquation(unwrapGroup(radicand), nestedDepth);
    return `\\sqrt[${index}]{${body}}`;
  });
}
|
|
3575
|
+
/**
 * Rewrite `sqrt X` as `\sqrt{X}`.
 *
 * The lookbehind skips occurrences that already carry a backslash. The
 * operand is converted recursively one level deeper.
 *
 * @param {string} input - Equation text.
 * @param {number} depth - Current recursion depth.
 * @returns {string} Text with square roots replaced.
 */
function convertSqrt(input, depth) {
  const sqrtPattern = /(?<!\\)\bsqrt\s*({[^{}]*}|\S+)/gi;
  return input.replace(sqrtPattern, (_whole, radicand) => {
    const body = convertEquation(unwrapGroup(radicand), depth + 1);
    return `\\sqrt{${body}}`;
  });
}
|
|
3580
|
+
/**
 * Rewrite top-level `A over B` fractions as `\frac{A}{B}`.
 *
 * At most 50 fractions are rewritten per call. Processing stops early when
 * no top-level `over` remains or when either operand cannot be read.
 *
 * @param {string} input - Equation text.
 * @param {number} depth - Current recursion depth.
 * @returns {string} Text with fractions replaced.
 */
function convertOver(input, depth) {
  const KEYWORD = "over";
  let output = input;
  let remaining = 50;
  while (remaining-- > 0) {
    const keywordAt = findTopLevelWord(output, KEYWORD);
    if (keywordAt < 0) return output;
    const lhs = readLeftAtom(output, keywordAt);
    const rhs = readRightAtom(output, keywordAt + KEYWORD.length);
    if (!lhs || !rhs) return output;
    const numerator = convertEquation(unwrapGroup(lhs.atom), depth + 1);
    const denominator = convertEquation(unwrapGroup(rhs.atom), depth + 1);
    const head = output.slice(0, lhs.start);
    const tail = output.slice(rhs.end);
    output = `${head}\\frac{${numerator}}{${denominator}}${tail}`;
  }
  return output;
}
|
|
3594
|
+
/**
 * Normalize superscripts and subscripts.
 *
 * Whitespace around `^` and `_` is removed, then any script operand that is
 * not already a brace group is wrapped in braces.
 *
 * @param {string} input - Equation text.
 * @returns {string} Text with braced scripts.
 */
function convertScripts(input) {
  const tightened = input.replace(/\s*\^\s*/g, "^").replace(/\s*_\s*/g, "_");
  const withSuperscripts = tightened.replace(/\^(?!\{)([^\s{}_^]+)/g, "^{$1}");
  return withSuperscripts.replace(/_(?!\{)([^\s{}_^]+)/g, "_{$1}");
}
|
|
3597
|
+
/**
 * Replace ASCII operator digraphs and a few Unicode symbols with LaTeX commands.
 *
 * Every replacement ends with a space so the command name cannot fuse with a
 * following letter: without it `a<=b` would become `a\leqb`, and `a+-b`
 * would become `a\pmb`, which is a different LaTeX command. Surplus spaces
 * are collapsed later by cleanupLatexSpacing().
 *
 * @param {string} input - Equation text.
 * @returns {string} Text with operators replaced.
 */
function convertOperators(input) {
  // Order is significant and matches the original chain (e.g. "!=" and "<=" before "==").
  const replacements = [
    [/\+-/g, "\\pm "],
    [/-\+/g, "\\mp "],
    [/\/\//g, "\\parallel "],
    [/△/g, "\\triangle "],
    [/□/g, "\\square "],
    [/‧/g, "\\cdot "],
    [/!=/g, "\\neq "],
    [/<=/g, "\\leq "],
    [/>=/g, "\\geq "],
    [/==/g, "\\equiv "]
  ];
  let result = input;
  for (const [pattern, latex] of replacements) {
    result = result.replace(pattern, latex);
  }
  return result;
}
|
|
3600
|
+
/**
 * Strip the stand-alone `rm` / `it` font directives (any letter case) along
 * with the whitespace that follows them. Occurrences preceded by a
 * backslash are left in place.
 *
 * @param {string} input - Equation text.
 * @returns {string} Text without font directives.
 */
function removeFontDirectives(input) {
  const directivePattern = /(?<!\\)\b(?:rm|it)\b\s*/gi;
  return input.replace(directivePattern, () => "");
}
|
|
3603
|
+
/**
 * Replace known keywords with their LaTeX commands.
 *
 * A word is a letter followed by letters/digits that is not preceded by a
 * backslash or another alphanumeric character. Lookup order: exact match in
 * SYMBOL_WORDS, then the lower-cased word in SYMBOL_WORDS, then in
 * WORD_COMMANDS. Unknown words are kept as written.
 *
 * @param {string} input - Equation text.
 * @returns {string} Text with keywords replaced.
 */
function convertWords(input) {
  const wordPattern = /(?<![\\A-Za-z0-9])([A-Za-z][A-Za-z0-9]*)(?![A-Za-z0-9])/g;
  return input.replace(wordPattern, (token) => {
    const direct = SYMBOL_WORDS.get(token);
    if (direct) return direct;
    const folded = token.toLowerCase();
    for (const table of [SYMBOL_WORDS, WORD_COMMANDS]) {
      const hit = table.get(folded);
      if (hit != null) return hit;
    }
    return token;
  });
}
|
|
3611
|
+
/**
 * Tidy delimiters and whitespace in generated LaTeX.
 *
 * Rules are applied in order: escape braces after `\left` / `\right`, glue
 * other delimiters to `\left` / `\right`, remove spaces around `\,`,
 * collapse whitespace runs, and trim the inside of brace groups.
 *
 * @param {string} input - LaTeX text.
 * @returns {string} Cleaned, trimmed LaTeX text.
 */
function cleanupLatexSpacing(input) {
  const rules = [
    [/\\left\s*\{/g, "\\left\\{"],
    [/\\right\s*\}/g, "\\right\\}"],
    [/\\left\s*([\[\]\(\)\|])/g, "\\left$1"],
    [/\\right\s*([\[\]\(\)\|])/g, "\\right$1"],
    [/\s*\\,\s*/g, "\\,"],
    [/\s+/g, " "],
    [/\{\s+/g, "{"],
    [/\s+\}/g, "}"]
  ];
  let text = input;
  for (const [pattern, replacement] of rules) {
    text = text.replace(pattern, replacement);
  }
  return text.trim();
}
|
|
3614
|
+
/**
 * Find the first stand-alone occurrence of `word` that sits outside every
 * `{}` and `()` group.
 *
 * The comparison lower-cases the input slice, so `word` is expected in
 * lower case. A hit is rejected when the neighbouring character on either
 * side is a letter, digit or underscore.
 *
 * @param {string} input - Text to scan.
 * @param {string} word - Lower-case keyword to look for.
 * @returns {number} Index of the match, or -1 when there is none.
 */
function findTopLevelWord(input, word) {
  const isWordLike = (c) => c !== undefined && /[A-Za-z0-9_]/.test(c);
  const lastStart = input.length - word.length;
  let braceDepth = 0;
  let parenDepth = 0;
  for (let at = 0; at <= lastStart; at += 1) {
    switch (input[at]) {
      case "{":
        braceDepth += 1;
        break;
      case "}":
        // Unbalanced closers never push the depth below zero.
        braceDepth = Math.max(0, braceDepth - 1);
        break;
      case "(":
        parenDepth += 1;
        break;
      case ")":
        parenDepth = Math.max(0, parenDepth - 1);
        break;
      default:
        break;
    }
    const atTopLevel = braceDepth === 0 && parenDepth === 0;
    if (!atTopLevel) continue;
    const candidate = input.slice(at, at + word.length).toLowerCase();
    if (candidate !== word) continue;
    const boundedLeft = !isWordLike(input[at - 1]);
    const boundedRight = !isWordLike(input[at + word.length]);
    if (boundedLeft && boundedRight) return at;
  }
  return -1;
}
|
|
3630
|
+
/**
 * Read the operand that ends just before index `end`.
 *
 * Trailing whitespace is skipped first. If the operand ends with `}` or `)`
 * and the matching opener is found, the whole group is the atom. Otherwise
 * the atom extends leftwards until whitespace or one of `+ - = < >`.
 *
 * @param {string} input - Text being scanned.
 * @param {number} end - Index just past the operand region.
 * @returns {{start: number, atom: string} | null} Atom and its start index,
 *   or null when only whitespace precedes `end`.
 */
function readLeftAtom(input, end) {
  let last = end - 1;
  while (last >= 0 && /\s/.test(input[last])) last -= 1;
  if (last < 0) return null;
  const openers = /* @__PURE__ */ new Map([["}", "{"], [")", "("]]);
  const closer = input[last];
  const opener = openers.get(closer);
  if (opener !== undefined) {
    const groupStart = findMatchingLeft(input, last, opener, closer);
    if (groupStart >= 0) {
      return { start: groupStart, atom: input.slice(groupStart, last + 1) };
    }
  }
  // No balanced group: fall back to a plain token scan.
  const isBoundary = (c) => /\s/.test(c) || /[+\-=<>]/.test(c);
  let cursor = last;
  while (cursor >= 0 && !isBoundary(input[cursor])) cursor -= 1;
  return { start: cursor + 1, atom: input.slice(cursor + 1, last + 1) };
}
|
|
3646
|
+
/**
 * Read the operand that begins at or after index `start`.
 *
 * Leading whitespace is skipped first. If the operand opens with `{` or `(`
 * and the matching closer is found, the whole group is the atom. Otherwise
 * the atom extends rightwards until whitespace or one of `+ - = < >`.
 *
 * @param {string} input - Text being scanned.
 * @param {number} start - Index where the operand region begins.
 * @returns {{end: number, atom: string} | null} Atom and the index just past
 *   it, or null when only whitespace follows `start`.
 */
function readRightAtom(input, start) {
  let first = start;
  while (first < input.length && /\s/.test(input[first])) first += 1;
  if (first >= input.length) return null;
  const closers = /* @__PURE__ */ new Map([["{", "}"], ["(", ")"]]);
  const opener = input[first];
  const closer = closers.get(opener);
  if (closer !== undefined) {
    const groupEnd = findMatchingRight(input, first, opener, closer);
    if (groupEnd >= 0) {
      return { end: groupEnd + 1, atom: input.slice(first, groupEnd + 1) };
    }
  }
  // No balanced group: fall back to a plain token scan.
  const isBoundary = (c) => /\s/.test(c) || /[+\-=<>]/.test(c);
  let cursor = first;
  while (cursor < input.length && !isBoundary(input[cursor])) cursor += 1;
  return { end: cursor, atom: input.slice(first, cursor) };
}
|
|
3662
|
+
/**
 * Scan leftwards from `closeIndex` for the opener that balances the closer
 * found there.
 *
 * @param {string} input - Text being scanned.
 * @param {number} closeIndex - Index of the closing delimiter.
 * @param {string} open - Opening delimiter character.
 * @param {string} close - Closing delimiter character.
 * @returns {number} Index of the matching opener, or -1 when unbalanced.
 */
function findMatchingLeft(input, closeIndex, open, close) {
  let pending = 0;
  let cursor = closeIndex;
  while (cursor >= 0) {
    const ch = input[cursor];
    if (ch === close) {
      pending += 1;
    } else if (ch === open) {
      pending -= 1;
      if (pending === 0) return cursor;
    }
    cursor -= 1;
  }
  return -1;
}
|
|
3673
|
+
/**
 * Scan rightwards from `openIndex` for the closer that balances the opener
 * found there.
 *
 * @param {string} input - Text being scanned.
 * @param {number} openIndex - Index of the opening delimiter.
 * @param {string} open - Opening delimiter character.
 * @param {string} close - Closing delimiter character.
 * @returns {number} Index of the matching closer, or -1 when unbalanced.
 */
function findMatchingRight(input, openIndex, open, close) {
  let pending = 0;
  let cursor = openIndex;
  while (cursor < input.length) {
    const ch = input[cursor];
    if (ch === open) {
      pending += 1;
    } else if (ch === close) {
      pending -= 1;
      if (pending === 0) return cursor;
    }
    cursor += 1;
  }
  return -1;
}
|
|
3684
|
+
/**
 * Remove one pair of enclosing braces from a group.
 *
 * The braces are stripped only when the leading `{` is closed by the final
 * `}`. Text such as `{a}+{b}` starts and ends with braces that belong to
 * different groups, so it is returned unchanged instead of being cut into
 * the unbalanced `a}+{b`.
 *
 * @param {string} input - Candidate group, possibly padded with whitespace.
 * @returns {string} The trimmed input, without its outer braces when they
 *   form a single pair.
 */
function unwrapGroup(input) {
  const trimmed = input.trim();
  if (!trimmed.startsWith("{") || !trimmed.endsWith("}")) return trimmed;
  const lastIndex = trimmed.length - 1;
  let depth = 0;
  for (let i = 0; i < trimmed.length; i++) {
    const ch = trimmed[i];
    if (ch === "{") {
      depth++;
    } else if (ch === "}") {
      depth--;
      // The leading brace closed before the end: the outer braces are not one pair.
      if (depth === 0 && i < lastIndex) return trimmed;
    }
  }
  return trimmed.slice(1, -1);
}
|
|
3689
|
+
/**
 * Tell whether `ch` is an ASCII letter, digit or underscore.
 *
 * @param {string | undefined} ch - Character to test; may be undefined when
 *   the caller indexed past either end of a string.
 * @returns {boolean} True for word characters, false otherwise.
 */
function isWordChar(ch) {
  // Without the `i`/`u` flags, \w is exactly [A-Za-z0-9_].
  return Boolean(ch) && /\w/.test(ch);
}
|
|
3692
|
+
|
|
3463
3693
|
// src/hwpx/parser.ts
|
|
3464
3694
|
init_page_range();
|
|
3465
3695
|
init_logger();
|
|
@@ -4141,6 +4371,17 @@ function findDescendant(node, targetTag, depth = 0) {
|
|
|
4141
4371
|
}
|
|
4142
4372
|
return null;
|
|
4143
4373
|
}
|
|
4374
|
+
/**
 * Return the first direct element child whose tag name, with any namespace
 * prefix removed, equals `targetTag`.
 *
 * @param {{childNodes?: ArrayLike<object>}} node - Parent node.
 * @param {string} targetTag - Tag name without a prefix.
 * @returns {object | null} The matching child, or null when there is none.
 */
function findChildByLocalName(node, targetTag) {
  const kids = node.childNodes;
  if (!kids) return null;
  const ELEMENT_NODE = 1;
  for (let index = 0; index < kids.length; index += 1) {
    const candidate = kids[index];
    if (candidate.nodeType === ELEMENT_NODE) {
      const qualifiedName = candidate.tagName || candidate.localName || "";
      // Drop a leading "prefix:" so "hp:script" compares as "script".
      const localName = qualifiedName.replace(/^[^:]+:/, "");
      if (localName === targetTag) return candidate;
    }
  }
  return null;
}
|
|
4144
4385
|
function extractDrawTextBlocks(drawTextNode, blocks, styleMap, sectionNum) {
|
|
4145
4386
|
const children = drawTextNode.childNodes;
|
|
4146
4387
|
if (!children) return;
|
|
@@ -4243,6 +4484,22 @@ function extractParagraphInfo(para, styleMap) {
|
|
|
4243
4484
|
case "shapeComment":
|
|
4244
4485
|
case "drawText":
|
|
4245
4486
|
break;
|
|
4487
|
+
// 수식: <hp:equation> 내부의 <hp:script>에 HML/HULK-style 수식 본문이
|
|
4488
|
+
// 들어있음. hwpEquationToLatex로 LaTeX 변환 후 `$...$`로 래핑하여
|
|
4489
|
+
// 본문 텍스트에 인라인 삽입. 변환 실패/빈 결과는 조용히 드롭
|
|
4490
|
+
// (대체 텍스트 "수식입니다." 누출 방지는 기존 정규식이 처리).
|
|
4491
|
+
case "equation": {
|
|
4492
|
+
const script = findChildByLocalName(child, "script");
|
|
4493
|
+
const raw = script ? extractTextFromNode(script) : "";
|
|
4494
|
+
if (raw.trim()) {
|
|
4495
|
+
try {
|
|
4496
|
+
const latex = hwpEquationToLatex(raw).trim();
|
|
4497
|
+
if (latex) text += " $" + latex.replace(/\$/g, "\\$") + "$ ";
|
|
4498
|
+
} catch {
|
|
4499
|
+
}
|
|
4500
|
+
}
|
|
4501
|
+
break;
|
|
4502
|
+
}
|
|
4246
4503
|
// run 요소에서 charPrIDRef 추출
|
|
4247
4504
|
case "r": {
|
|
4248
4505
|
const runCharPr = child.getAttribute("charPrIDRef");
|
|
@@ -4309,8 +4566,13 @@ var TAG_CHAR_SHAPE = 68;
|
|
|
4309
4566
|
var TAG_CTRL_HEADER = 71;
|
|
4310
4567
|
var TAG_LIST_HEADER = 72;
|
|
4311
4568
|
var TAG_TABLE = 77;
|
|
4312
|
-
var
|
|
4313
|
-
var
|
|
4569
|
+
// Base value that the document-level tag IDs below are offset from.
var HWPTAG_BEGIN = 16;
// Document-level record tags, expressed as offsets from HWPTAG_BEGIN.
var TAG_ID_MAPPINGS = HWPTAG_BEGIN + 1;
var TAG_FACE_NAME = HWPTAG_BEGIN + 3;
var TAG_DOC_CHAR_SHAPE = HWPTAG_BEGIN + 5;
var TAG_DOC_PARA_SHAPE = HWPTAG_BEGIN + 9;
var TAG_DOC_STYLE = HWPTAG_BEGIN + 10;
// Record tag that carries an equation script (read by extractEquationFromControl).
var TAG_EQEDIT = 88;
|
|
4314
4576
|
var CHAR_LINE = 0;
|
|
4315
4577
|
var CHAR_SECTION_BREAK = 10;
|
|
4316
4578
|
var CHAR_PARA = 13;
|
|
@@ -4468,6 +4730,15 @@ function extractText(data) {
|
|
|
4468
4730
|
}
|
|
4469
4731
|
return result;
|
|
4470
4732
|
}
|
|
4733
|
+
/**
 * Decode the equation script stored in an equation record payload.
 *
 * The 16-bit little-endian value at byte offset 4 is read as the script
 * length in 2-byte units; the script itself starts at offset 6 and is
 * decoded as UTF-16LE. NUL characters are removed and the text is trimmed.
 *
 * @param {Buffer} data - Record payload.
 * @returns {string | null} The script text, or null when the payload is too
 *   short, the declared length is zero or overruns the payload, or the
 *   decoded text is empty.
 */
function extractEquationText(data) {
  const HEADER_BYTES = 6;
  if (data.length < HEADER_BYTES) return null;
  const unitCount = data.readUInt16LE(4);
  if (unitCount <= 0) return null;
  const endOffset = HEADER_BYTES + unitCount * 2;
  if (endOffset > data.length) return null;
  const decoded = data.subarray(HEADER_BYTES, endOffset).toString("utf16le");
  const cleaned = decoded.replace(/\0+/g, "").trim();
  return cleaned || null;
}
|
|
4471
4742
|
|
|
4472
4743
|
// src/hwp5/aes.ts
|
|
4473
4744
|
var S_BOX = new Uint8Array([
|
|
@@ -5627,6 +5898,26 @@ function findViewTextSectionsLenient(lcfb, compressed) {
|
|
|
5627
5898
|
return sections.sort((a, b) => a.idx - b.idx).map((s) => s.content);
|
|
5628
5899
|
}
|
|
5629
5900
|
var TAG_SHAPE_COMPONENT = 74;
|
|
5901
|
+
// Control ID of an equation control as it is compared against 4-character control headers.
var CTRL_ID_EQEDIT = "deqe";
/**
 * Tell whether a control ID denotes an equation control.
 * Both character orders ("deqe" and "eqed") are accepted.
 *
 * @param {string} ctrlId - Four-character control ID.
 * @returns {boolean} True for an equation control.
 */
function isEquationControlId(ctrlId) {
  switch (ctrlId) {
    case CTRL_ID_EQEDIT:
    case "eqed":
      return true;
    default:
      return false;
  }
}
|
|
5905
|
+
/**
 * Convert an equation script to LaTeX and wrap it as inline Markdown math.
 *
 * Any `$` inside the LaTeX is backslash-escaped so it cannot end the math
 * span early.
 *
 * @param {string} equation - Raw equation script.
 * @returns {string} `$...$` math text, or "" when the conversion is empty.
 */
function formatEquationForMarkdown(equation) {
  const latex = hwpEquationToLatex(equation);
  if (latex) {
    const escaped = latex.replace(/\$/g, "\\$");
    return ["$", escaped, "$"].join("");
  }
  return "";
}
|
|
5910
|
+
/**
 * Find the equation record that belongs to the control at `ctrlIdx` and
 * return it as Markdown math.
 *
 * Only the next nine records are inspected, and the scan ends at the first
 * record whose level is not deeper than the control's own level.
 *
 * @param {Array<{level: number, tagId: number, data: Buffer}>} records - Record list.
 * @param {number} ctrlIdx - Index of the control header record.
 * @returns {string | null} Formatted equation, or null when no usable
 *   equation record is found.
 */
function extractEquationFromControl(records, ctrlIdx) {
  const parentLevel = records[ctrlIdx].level;
  const stop = Math.min(records.length, ctrlIdx + 10);
  for (let cursor = ctrlIdx + 1; cursor < stop; cursor += 1) {
    const record = records[cursor];
    // Left the control's subtree.
    if (record.level <= parentLevel) return null;
    if (record.tagId === TAG_EQEDIT) {
      const script = extractEquationText(record.data);
      if (!script) return null;
      return formatEquationForMarkdown(script);
    }
  }
  return null;
}
|
|
5630
5921
|
function extractBinDataId(records, ctrlIdx) {
|
|
5631
5922
|
const ctrlLevel = records[ctrlIdx].level;
|
|
5632
5923
|
for (let j = ctrlIdx + 1; j < records.length && j < ctrlIdx + 50; j++) {
|
|
@@ -5786,6 +6077,16 @@ function parseSection(records, docInfo, warnings, sectionNum) {
|
|
|
5786
6077
|
}
|
|
5787
6078
|
} else if (ctrlId === " elo" || ctrlId === "ole ") {
|
|
5788
6079
|
warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
|
|
6080
|
+
} else if (isEquationControlId(ctrlId)) {
|
|
6081
|
+
const equation = extractEquationFromControl(records, i);
|
|
6082
|
+
if (equation) {
|
|
6083
|
+
const lastBlock = blocks[blocks.length - 1];
|
|
6084
|
+
if (lastBlock && lastBlock.type === "paragraph" && lastBlock.text) {
|
|
6085
|
+
lastBlock.text = lastBlock.text + " " + equation;
|
|
6086
|
+
} else {
|
|
6087
|
+
blocks.push({ type: "paragraph", text: equation, pageNumber: sectionNum });
|
|
6088
|
+
}
|
|
6089
|
+
}
|
|
5789
6090
|
} else if (ctrlId === "fn " || ctrlId === " nf " || ctrlId === "en " || ctrlId === " ne ") {
|
|
5790
6091
|
const noteText = extractNoteText(records, i);
|
|
5791
6092
|
if (noteText && blocks.length > 0) {
|
|
@@ -5818,6 +6119,13 @@ function extractNoteText(records, ctrlIdx) {
|
|
|
5818
6119
|
const t = extractText(r.data).trim();
|
|
5819
6120
|
if (t) texts.push(t);
|
|
5820
6121
|
}
|
|
6122
|
+
if (r.tagId === TAG_CTRL_HEADER && r.data.length >= 4) {
|
|
6123
|
+
const innerCtrlId = r.data.subarray(0, 4).toString("ascii");
|
|
6124
|
+
if (isEquationControlId(innerCtrlId)) {
|
|
6125
|
+
const equation = extractEquationFromControl(records, j);
|
|
6126
|
+
if (equation) texts.push(equation);
|
|
6127
|
+
}
|
|
6128
|
+
}
|
|
5821
6129
|
}
|
|
5822
6130
|
return texts.length > 0 ? texts.join(" ") : null;
|
|
5823
6131
|
}
|
|
@@ -5831,6 +6139,13 @@ function extractTextBoxText(records, ctrlIdx) {
|
|
|
5831
6139
|
const t = extractText(r.data).trim();
|
|
5832
6140
|
if (t) texts.push(t);
|
|
5833
6141
|
}
|
|
6142
|
+
if (r.tagId === TAG_CTRL_HEADER && r.data.length >= 4) {
|
|
6143
|
+
const innerCtrlId = r.data.subarray(0, 4).toString("ascii");
|
|
6144
|
+
if (isEquationControlId(innerCtrlId)) {
|
|
6145
|
+
const equation = extractEquationFromControl(records, j);
|
|
6146
|
+
if (equation) texts.push(equation);
|
|
6147
|
+
}
|
|
6148
|
+
}
|
|
5834
6149
|
}
|
|
5835
6150
|
return texts.length > 0 ? texts.join("\n") : null;
|
|
5836
6151
|
}
|
|
@@ -5899,6 +6214,12 @@ function parseParagraphWithTables(records, startIdx) {
|
|
|
5899
6214
|
i = nextIdx;
|
|
5900
6215
|
continue;
|
|
5901
6216
|
}
|
|
6217
|
+
if (isEquationControlId(ctrlId)) {
|
|
6218
|
+
const equation = extractEquationFromControl(records, i);
|
|
6219
|
+
if (equation) {
|
|
6220
|
+
text = text ? text + " " + equation : equation;
|
|
6221
|
+
}
|
|
6222
|
+
}
|
|
5902
6223
|
}
|
|
5903
6224
|
i++;
|
|
5904
6225
|
}
|
|
@@ -11208,528 +11529,6 @@ async function markdownToXlsx(markdown, options) {
|
|
|
11208
11529
|
return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
|
|
11209
11530
|
}
|
|
11210
11531
|
|
|
11211
|
-
// src/convert/index.ts
|
|
11212
|
-
import { readFile } from "fs/promises";
|
|
11213
|
-
|
|
11214
|
-
// src/convert/libreoffice.ts
|
|
11215
|
-
import libre from "libreoffice-convert";
|
|
11216
|
-
|
|
11217
|
-
// src/convert/error.ts
|
|
11218
|
-
/**
 * Error raised by the conversion pipeline.
 * Carries a machine-readable `code` next to the human-readable message.
 */
var ConvertError = class extends Error {
  /**
   * @param {string} code - Machine-readable error code.
   * @param {string} message - Human-readable description.
   */
  constructor(code, message) {
    super(message);
    Object.assign(this, { code, name: "ConvertError" });
  }
};
|
|
11225
|
-
|
|
11226
|
-
// src/convert/installer.ts
|
|
11227
|
-
import { homedir } from "os";
|
|
11228
|
-
import { join as join4, delimiter } from "path";
|
|
11229
|
-
import { mkdir, access, symlink, rm } from "fs/promises";
|
|
11230
|
-
import { createWriteStream } from "fs";
|
|
11231
|
-
import { spawn as spawn2 } from "child_process";
|
|
11232
|
-
// Module-level installer state; starts out empty.
var installInFlight = null;
// Per-user cache directory that holds the downloaded LibreOffice build.
var CACHE_DIR = join4(homedir(), ".cache", "kordoc", "libreoffice");
// File named "version" inside the cache directory.
var VERSION_FILE = join4(CACHE_DIR, "version");
// Download descriptors keyed by process.platform:
//   url     - archive to download
//   binPath - soffice executable path, joined onto CACHE_DIR by the installers
//   sizeMb  - approximate size, used as the progress total by installForPlatform
var PACKAGES = {
  darwin: {
    url: "https://ftp.osuosl.org/pub/tdf/libreoffice/stable/26.2.3/mac/x86_64/LibreOffice_26.2.3_MacOS_x86-64.dmg",
    binPath: "LibreOffice.app/Contents/MacOS/soffice",
    sizeMb: 300
  },
  linux: {
    url: "https://ftp.osuosl.org/pub/tdf/libreoffice/stable/26.2.3/deb/x86_64/LibreOffice_26.2.3_Linux_x86-64_deb.tar.gz",
    binPath: "opt/libreoffice26.2/program/soffice",
    sizeMb: 210
  },
  win32: {
    url: "https://ftp.osuosl.org/pub/tdf/libreoffice/stable/26.2.3/win/x86_64/LibreOffice_26.2.3_Win_x86-64.msi",
    binPath: "LibreOffice/program/soffice.exe",
    sizeMb: 360
  }
};
|
|
11252
|
-
/**
 * Probe for `soffice` on the PATH by running `soffice --version`.
 *
 * @returns {Promise<string | null>} "soffice" when the probe exits with
 *   code 0, otherwise null (including when the process cannot be spawned).
 */
async function findInPath() {
  const COMMAND = "soffice";
  const probe = new Promise((settle) => {
    const proc = spawn2(COMMAND, ["--version"], { stdio: "ignore" });
    proc.on("close", (exitCode) => {
      settle(exitCode === 0 ? COMMAND : null);
    });
    proc.on("error", () => {
      settle(null);
    });
  });
  return probe;
}
|
|
11259
|
-
/**
 * Look for a previously installed `soffice` in the cache directory.
 *
 * @returns {Promise<string | null>} Path to `<CACHE_DIR>/bin/soffice` when
 *   it is accessible, otherwise null.
 */
async function findInCache() {
  const candidate = join4(CACHE_DIR, "bin", "soffice");
  return access(candidate).then(
    () => candidate,
    () => null
  );
}
|
|
11268
|
-
/**
 * Check the conventional per-platform install locations for `soffice`.
 *
 * @returns {Promise<string | null>} The first accessible candidate path, or
 *   null when none exists or the platform has no known locations.
 */
async function findInDefaultPaths() {
  let candidates;
  switch (process.platform) {
    case "darwin":
      candidates = [
        "/Applications/LibreOffice.app/Contents/MacOS/soffice",
        "/opt/homebrew/bin/soffice",
        "/usr/local/bin/soffice"
      ];
      break;
    case "linux":
      candidates = [
        "/usr/bin/soffice",
        "/usr/lib/libreoffice/program/soffice"
      ];
      break;
    case "win32": {
      const programFiles = process.env["ProgramFiles"] ?? "C:\\Program Files";
      const programFilesX86 = process.env["ProgramFiles(x86)"] ?? "C:\\Program Files (x86)";
      candidates = [programFiles, programFilesX86].map(
        (root) => join4(root, "LibreOffice", "program", "soffice.exe")
      );
      break;
    }
    default:
      candidates = [];
      break;
  }
  for (const candidate of candidates) {
    try {
      await access(candidate);
      return candidate;
    } catch {
      // Not accessible: fall through to the next candidate.
    }
  }
  return null;
}
|
|
11300
|
-
/**
 * Stream `url` into the file at `dest`, reporting progress after each chunk.
 *
 * @param {string} url - Resource to download.
 * @param {string} dest - Destination file path.
 * @param {number} totalBytes - Expected size, passed through to `onProgress`.
 * @param {(downloaded: number, total: number) => void} [onProgress] - Optional progress callback.
 * @returns {Promise<void>}
 * @throws {Error} When the response is not OK or has no body.
 */
async function downloadWithProgress(url, dest, totalBytes, onProgress) {
  const response = await fetch(url);
  if (!response.ok) throw new Error(`\uB2E4\uC6B4\uB85C\uB4DC \uC2E4\uD328: HTTP ${response.status} (${url})`);
  if (!response.body) throw new Error("\uB2E4\uC6B4\uB85C\uB4DC \uC2E4\uD328: response body \uC5C6\uC74C");
  const sink = createWriteStream(dest);
  const source = response.body.getReader();
  // Resolves once the write stream can accept more data.
  const waitForDrain = () => new Promise((resume) => sink.once("drain", resume));
  // Ends the write stream and surfaces any error reported by end().
  const closeSink = () => new Promise((resolve4, reject) => {
    sink.end((err) => err ? reject(err) : resolve4());
  });
  let received = 0;
  try {
    for (let chunk = await source.read(); !chunk.done; chunk = await source.read()) {
      if (!sink.write(chunk.value)) {
        await waitForDrain();
      }
      received += chunk.value.length;
      onProgress?.(received, totalBytes);
    }
  } finally {
    source.releaseLock();
    await closeSink();
  }
}
|
|
11324
|
-
/**
 * Download the LibreOffice package and hand it to the installer for the
 * current platform.
 *
 * The archive is downloaded before the platform is checked. If the
 * platform installer fails, the downloaded archive is removed and the
 * error is rethrown.
 *
 * @param {{url: string, binPath: string, sizeMb: number}} pkg - Package descriptor.
 * @param {(downloaded: number, total: number) => void} [onProgress] - Optional progress callback.
 * @returns {Promise<*>} Whatever the platform installer resolves with.
 * @throws {ConvertError} With code "UNSUPPORTED_PLATFORM" on other platforms.
 */
async function installForPlatform(pkg, onProgress) {
  const { platform } = process;
  await mkdir(CACHE_DIR, { recursive: true });
  const downloadPath = join4(CACHE_DIR, `download-${Date.now()}`);
  await downloadWithProgress(pkg.url, downloadPath, pkg.sizeMb * 1024 * 1024, onProgress);
  const installers = /* @__PURE__ */ new Map([
    ["darwin", installMacOS],
    ["linux", installLinux],
    ["win32", installWindows]
  ]);
  const installer = installers.get(platform);
  if (installer === undefined) {
    throw new ConvertError("UNSUPPORTED_PLATFORM", `${platform}\uC740 \uC790\uB3D9 \uC124\uCE58\uB97C \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4`);
  }
  try {
    return await installer(pkg, downloadPath);
  } catch (err) {
    await rm(downloadPath, { force: true });
    throw err;
  }
}
|
|
11343
|
-
/**
 * Install LibreOffice from a downloaded .dmg on macOS.
 *
 * Steps: attach the image at a unique mount point, copy LibreOffice.app
 * into the cache directory, always detach the image, delete the download,
 * then hand the binary path to createSymlink().
 *
 * @param {{binPath: string}} pkg - Package descriptor.
 * @param {string} downloadPath - Path of the downloaded .dmg.
 * @returns {Promise<*>} Whatever createSymlink() resolves with.
 * @throws {Error} When mounting or copying fails.
 */
async function installMacOS(pkg, downloadPath) {
  const mountPoint = `/Volumes/LibreOffice_${Date.now()}`;
  const attachImage = () => new Promise((resolve4, reject) => {
    const captured = [];
    const proc = spawn2("hdiutil", ["attach", "-nobrowse", "-noverify", "-mountpoint", mountPoint, downloadPath]);
    proc.stderr?.on("data", (chunk) => captured.push(chunk.toString()));
    proc.on("close", (code) => {
      if (code === 0) {
        resolve4();
        return;
      }
      reject(new Error(`dmg \uB9C8\uC6B4\uD2B8 \uC2E4\uD328 (code=${code}): ${captured.join("").trim()}`));
    });
  });
  const copyApp = () => {
    const appSource = join4(mountPoint, "LibreOffice.app");
    const appDest = join4(CACHE_DIR, "LibreOffice.app");
    return new Promise((resolve4, reject) => {
      const proc = spawn2("cp", ["-R", appSource, appDest]);
      proc.on("close", (code) => {
        if (code === 0) {
          resolve4();
          return;
        }
        reject(new Error(".app \uBCF5\uC0AC \uC2E4\uD328"));
      });
    });
  };
  // Detaching never rejects: the promise resolves on close regardless of exit code.
  const detachImage = () => new Promise((resolve4) => {
    const proc = spawn2("hdiutil", ["detach", mountPoint]);
    proc.on("close", () => resolve4());
  });
  await attachImage();
  try {
    await copyApp();
  } finally {
    await detachImage();
  }
  await rm(downloadPath, { force: true });
  return await createSymlink(join4(CACHE_DIR, pkg.binPath));
}
|
|
11370
|
-
/**
 * Installs LibreOffice on Linux from a downloaded tarball of .deb packages.
 *
 * Steps: unpack the tarball into a temporary directory under CACHE_DIR,
 * extract every `.deb` found in its `DEBS` directory into CACHE_DIR with
 * `dpkg-deb -x`, remove the temporary directory and the download, then expose
 * the binary via createSymlink.
 *
 * Spawned children have 'error' listeners so a command that cannot be started
 * rejects the step instead of raising an unhandled 'error' event. The
 * temporary extract directory is removed even when unpacking fails.
 *
 * @param {object} pkg - Package descriptor; `binPath` is joined onto CACHE_DIR.
 * @param {string} downloadPath - Path of the downloaded tarball.
 * @returns {Promise<*>} The value createSymlink resolves with.
 * @throws {Error} When `tar` fails or cannot be started.
 */
async function installLinux(pkg, downloadPath) {
  const extractDir = join4(CACHE_DIR, `extract-${Date.now()}`);
  await mkdir(extractDir, { recursive: true });
  try {
    await new Promise((resolve4, reject) => {
      const child = spawn2("tar", ["xzf", downloadPath, "-C", extractDir]);
      child.on("error", reject);
      child.on("close", (code) => code === 0 ? resolve4() : reject(new Error("\uC555\uCD95 \uD574\uC81C \uC2E4\uD328")));
    });
    // NOTE(review): assumes the archive unpacks a DEBS directory at its top
    // level - confirm against the actual package layout.
    const debsDir = join4(extractDir, "DEBS");
    try {
      await access(debsDir);
      const entries = await (await import("fs/promises")).readdir(debsDir);
      for (const entry of entries) {
        if (entry.endsWith(".deb")) {
          await new Promise((resolve4, reject) => {
            const child = spawn2("dpkg-deb", ["-x", join4(debsDir, entry), CACHE_DIR]);
            child.on("error", reject);
            child.on("close", (code) => code === 0 ? resolve4() : reject(new Error(`${entry} \uCD94\uCD9C \uC2E4\uD328`)));
          });
        }
      }
    } catch {
      // Best-effort, as before: a missing DEBS directory or a failed .deb
      // extraction does not abort the install.
    }
  } finally {
    await rm(extractDir, { recursive: true, force: true });
  }
  await rm(downloadPath, { force: true });
  return await createSymlink(join4(CACHE_DIR, pkg.binPath));
}
|
|
11395
|
-
/**
 * Installs LibreOffice on Windows by running an administrative MSI extraction
 * (`msiexec /a ... /qn TARGETDIR=<CACHE_DIR>`), then deletes the download.
 *
 * The child has an 'error' listener so that a failure to start `msiexec`
 * rejects instead of raising an unhandled 'error' event.
 *
 * @param {object} pkg - Package descriptor; `binPath` is joined onto CACHE_DIR.
 * @param {string} downloadPath - Path of the downloaded MSI file.
 * @returns {Promise<string>} CACHE_DIR joined with `pkg.binPath`.
 * @throws {Error} When `msiexec` cannot be started or exits non-zero.
 */
async function installWindows(pkg, downloadPath) {
  await new Promise((resolve4, reject) => {
    const child = spawn2("msiexec", ["/a", downloadPath, "/qn", `TARGETDIR=${CACHE_DIR}`]);
    child.on("error", reject);
    child.on("close", (code) => code === 0 ? resolve4() : reject(new Error("MSI \uC124\uCE58 \uC2E4\uD328")));
  });
  await rm(downloadPath, { force: true });
  return join4(CACHE_DIR, pkg.binPath);
}
|
|
11403
|
-
/**
 * Creates `<CACHE_DIR>/bin/soffice` as a symlink to `actualBin` and makes
 * sure that bin directory is on `process.env.PATH`.
 *
 * Symlink creation stays best-effort (for example, the link may already
 * exist). The PATH update handles an unset PATH and does not prepend the
 * directory again when it is already listed.
 *
 * @param {string} actualBin - Path of the real soffice binary.
 * @returns {Promise<string>} Path of the symlink inside the bin directory.
 */
async function createSymlink(actualBin) {
  const binDir = join4(CACHE_DIR, "bin");
  await mkdir(binDir, { recursive: true });
  const linkBin = join4(binDir, "soffice");
  try {
    await symlink(actualBin, linkBin);
  } catch {
    // Deliberately ignored: failure to create the link is not fatal here.
  }
  const currentPath = process.env.PATH;
  if (!currentPath) {
    // Avoids producing the literal string "<binDir><delimiter>undefined".
    process.env.PATH = binDir;
  } else if (!currentPath.split(delimiter).includes(binDir)) {
    process.env.PATH = `${binDir}${delimiter}${currentPath}`;
  }
  return linkBin;
}
|
|
11414
|
-
/**
 * Looks up the package descriptor for the current platform in PACKAGES and
 * delegates the installation to installForPlatform.
 *
 * @param {Function} [onProgress] - Forwarded unchanged to installForPlatform.
 * @returns {Promise<*>} Whatever installForPlatform resolves with.
 * @throws {ConvertError} With code UNSUPPORTED_PLATFORM when PACKAGES has no
 *   entry for `process.platform`.
 */
async function installLibreOffice(onProgress) {
  const { platform: currentPlatform } = process;
  const descriptor = PACKAGES[currentPlatform];
  if (descriptor) {
    return await installForPlatform(descriptor, onProgress);
  }
  const reason = `${currentPlatform}\uC740 \uC790\uB3D9 \uC124\uCE58\uB97C \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4. \uC218\uB3D9\uC73C\uB85C LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.`;
  throw new ConvertError("UNSUPPORTED_PLATFORM", reason);
}
|
|
11425
|
-
/**
 * Resolves a usable soffice binary path, installing LibreOffice if allowed.
 *
 * Lookup order: findInPath, findInCache, findInDefaultPaths. The first hit is
 * reported through `emitter.validate("soffice_found", ...)` and returned.
 * When nothing is found and `autoInstall` is false, an error event is emitted
 * and a ConvertError is thrown. Otherwise an installation is started; a call
 * that arrives while one is running returns the same in-flight promise
 * (tracked in the module-level `installInFlight`).
 *
 * The download percentage reported to the emitter is clamped to 0-100 and is
 * 0 when the total size is unknown, so listeners never see NaN or Infinity.
 *
 * @param {object} emitter - Object providing `validate`, `install` and
 *   `error` methods, which are called with stage/message arguments.
 * @param {boolean} [autoInstall=true] - Whether to install when not found.
 * @returns {Promise<*>} The discovered or newly installed soffice path.
 * @throws {ConvertError} SOFFICE_NOT_FOUND when nothing is found and
 *   `autoInstall` is false; installation errors are rethrown unchanged.
 */
async function resolveSoffice(emitter, autoInstall = true) {
  emitter.validate("soffice_check", "LibreOffice \uAC00\uC6A9\uC131 \uD655\uC778 \uC911...");
  const inPath = await findInPath();
  if (inPath) {
    emitter.validate("soffice_found", "\uC2DC\uC2A4\uD15C PATH\uC5D0\uC11C LibreOffice \uBC1C\uACAC", { sofficePath: inPath });
    return inPath;
  }
  const inCache = await findInCache();
  if (inCache) {
    emitter.validate("soffice_found", "\uCE90\uC2DC\uB41C LibreOffice \uBC1C\uACAC", { sofficePath: inCache });
    return inCache;
  }
  const inDefault = await findInDefaultPaths();
  if (inDefault) {
    emitter.validate("soffice_found", "\uAE30\uBCF8 \uACBD\uB85C\uC5D0\uC11C LibreOffice \uBC1C\uACAC", { sofficePath: inDefault });
    return inDefault;
  }
  if (!autoInstall) {
    emitter.error(
      "validate",
      "SOFFICE_NOT_FOUND",
      "LibreOffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4",
      "\uC218\uB3D9\uC73C\uB85C \uC124\uCE58\uD558\uAC70\uB098 autoInstallLibreOffice: true \uC635\uC158\uC744 \uC0AC\uC6A9\uD558\uC138\uC694."
    );
    throw new ConvertError("SOFFICE_NOT_FOUND", "LibreOffice\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4");
  }
  if (installInFlight) {
    // Share the running installation instead of starting a second one.
    return installInFlight;
  }
  emitter.install("install_start", "LibreOffice \uC790\uB3D9 \uC124\uCE58\uB97C \uC2DC\uC791\uD569\uB2C8\uB2E4...");
  installInFlight = (async () => {
    try {
      const installed = await installLibreOffice((downloaded, total) => {
        // Guard against a zero/unknown total and against overshoot when the
        // real download is larger than the expected size.
        const ratio = total > 0 ? downloaded / total : 0;
        const percent = Number.isFinite(ratio) ? Math.min(100, Math.max(0, Math.round(ratio * 100))) : 0;
        emitter.install("download_progress", `\uB2E4\uC6B4\uB85C\uB4DC \uC911... ${percent}%`, {
          percent,
          downloadedBytes: downloaded,
          totalBytes: total
        });
      });
      emitter.install("install_complete", "\uC124\uCE58 \uC644\uB8CC", { installedPath: installed });
      return installed;
    } catch (err) {
      const errorMsg = err instanceof Error ? err.message : String(err);
      emitter.install("install_failed", "\uC124\uCE58 \uC2E4\uD328", { error: errorMsg });
      throw err;
    } finally {
      // Allow a later call to retry or reinstall.
      installInFlight = null;
    }
  })();
  return installInFlight;
}
|
|
11477
|
-
|
|
11478
|
-
// src/convert/libreoffice.ts
|
|
11479
|
-
// Alias of libre.convert; convertBuffer calls it when no explicit soffice path is supplied.
var libreConvert = libre.convert;
|
|
11480
|
-
// Alias of libre.convertWithOptions; convertBuffer calls it when a soffice path is supplied.
var libreConvertWithOptions = libre.convertWithOptions;
|
|
11481
|
-
/**
 * Converts an in-memory document through LibreOffice, with a timeout.
 *
 * With `sofficePath`, the conversion goes through libreConvertWithOptions
 * using that binary and a file name of "source" plus `sourceExt` (when
 * given); otherwise libreConvert is used.
 *
 * @param {*} buffer - Document bytes handed to the converter.
 * @param {string} targetExt - Target extension handed to the converter.
 * @param {number} [timeoutMs=60000] - Milliseconds before rejecting with TIMEOUT.
 * @param {string} [sofficePath] - Explicit soffice binary to use.
 * @param {string} [sourceExt] - Extension appended to the temporary file name.
 * @returns {Promise<*>} The converted output passed to the converter callback.
 * @throws {ConvertError} TIMEOUT on expiry; CONVERT_FAILED when the converter
 *   reports an error or yields no output.
 */
async function convertBuffer(buffer, targetExt, timeoutMs = 6e4, sofficePath, sourceExt) {
  return new Promise((settle, fail) => {
    const onTimeout = () => {
      fail(new ConvertError("TIMEOUT", `\uBCC0\uD658 \uD0C0\uC784\uC544\uC6C3 (${timeoutMs}ms \uCD08\uACFC)`));
    };
    const watchdog = setTimeout(onTimeout, timeoutMs);
    function onConverted(err, done) {
      clearTimeout(watchdog);
      if (!err && done) {
        settle(done);
        return;
      }
      const reason = err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328";
      fail(new ConvertError("CONVERT_FAILED", reason));
    }
    if (!sofficePath) {
      libreConvert(buffer, targetExt, void 0, onConverted);
      return;
    }
    const fileName = sourceExt ? `source${sourceExt}` : "source";
    const converterOptions = { sofficeBinaryPaths: [sofficePath], fileName };
    libreConvertWithOptions(buffer, targetExt, void 0, converterOptions, onConverted);
  });
}
|
|
11509
|
-
|
|
11510
|
-
// src/convert/events.ts
|
|
11511
|
-
/**
 * Minimal single-listener event dispatcher used by the conversion flow.
 * Each helper builds one event object and forwards it to `emit`.
 */
var ConvertEventEmitter = class {
  listener = null;
  /** Registers the event listener, replacing any previous one. */
  setListener(listener) {
    this.listener = listener;
  }
  /** Dispatches an event; anything thrown by the listener is ignored. */
  emit(event) {
    if (this.listener == null) {
      return;
    }
    try {
      this.listener(event);
    } catch {
      // A failing listener must not interrupt the conversion.
    }
  }
  /** Helper: emits a "detect" event; `meta` fields are merged into it. */
  detect(stage, message, meta) {
    const event = Object.assign({ type: "detect", stage, message }, meta);
    this.emit(event);
  }
  /** Helper: emits a "validate" event; `meta` fields are merged into it. */
  validate(stage, message, meta) {
    const event = Object.assign({ type: "validate", stage, message }, meta);
    this.emit(event);
  }
  /** Helper: emits an "install" event; `meta` fields are merged into it. */
  install(stage, message, meta) {
    const event = Object.assign({ type: "install", stage, message }, meta);
    this.emit(event);
  }
  /** Helper: emits a conversion progress event with the given percent. */
  progress(percent, message) {
    const stage = "convert_progress";
    this.emit({ type: "convert", stage, message, percent });
  }
  /** Helper: emits the conversion start event (percent 0). */
  convertStart(message) {
    const stage = "convert_start";
    this.emit({ type: "convert", stage, message, percent: 0 });
  }
  /** Helper: emits the conversion done event (percent 100). */
  convertDone(message) {
    const stage = "convert_done";
    this.emit({ type: "convert", stage, message, percent: 100 });
  }
  /** Helper: emits the completion event carrying `result`. */
  complete(result) {
    const message = "\uBCC0\uD658 \uC644\uB8CC";
    this.emit({ type: "complete", stage: "success", message, result });
  }
  /** Helper: emits an error event, always flagged as recoverable. */
  error(stage, code, message, suggestion) {
    const recoverable = true;
    this.emit({ type: "error", stage, code, message, recoverable, suggestion });
  }
};
|
|
11557
|
-
|
|
11558
|
-
// src/convert/index.ts
|
|
11559
|
-
// True while a caller holds the conversion lock handed out by acquireConvertLock.
var isConverting = false;
|
|
11560
|
-
// FIFO of wake-up callbacks for callers waiting on the conversion lock.
var queue = [];
|
|
11561
|
-
/**
 * Acquires the module-wide conversion lock.
 *
 * If the lock is free it is taken immediately; otherwise the caller is put
 * in `queue` and resumed in FIFO order when the current holder releases.
 *
 * @returns {Promise<Function>} A release function. Calling it frees the lock
 *   and synchronously hands it to the next queued waiter, if any.
 */
async function acquireConvertLock() {
  const release = () => {
    isConverting = false;
    const wakeNext = queue.shift();
    wakeNext?.();
  };
  if (isConverting) {
    return new Promise((grant) => {
      queue.push(() => {
        // Re-take the lock synchronously so nobody can slip in between.
        isConverting = true;
        grant(release);
      });
    });
  }
  isConverting = true;
  return release;
}
|
|
11581
|
-
/**
 * Converts an HWP/HWPX document to PDF through LibreOffice.
 *
 * Stages: read the input, enforce the 500 MB size limit, detect the format
 * (only "hwp" and "hwpx" are accepted), resolve a soffice binary (optionally
 * auto-installing it), then convert while holding the conversion lock.
 * Failures are reported as `{ success: false, code, error, stage }` results
 * rather than thrown.
 *
 * Both `options.onEvent` and the legacy `options.onProgress` are honored when
 * given together: `onEvent` receives every event, and `onProgress` receives
 * `(percent, message)` for conversion progress events.
 *
 * @param {string|Buffer|*} input - File path, Buffer, or data accepted by Buffer.from.
 * @param {object} [options] - Reads `onEvent`, `onProgress`,
 *   `autoInstallLibreOffice` (default true) and `timeoutMs`.
 * @returns {Promise<object>} `{ success: true, pdf, sourceFormat }` on
 *   success, or a failure result as described above.
 */
async function convertToPdf(input, options) {
  const emitter = new ConvertEventEmitter();
  const onEvent = options?.onEvent;
  const legacyProgress = options?.onProgress;
  if (onEvent || legacyProgress) {
    // One combined listener: registering the two callbacks separately would
    // make the second registration replace the first.
    emitter.setListener((event) => {
      if (onEvent) {
        try {
          onEvent(event);
        } catch {
          // Keep a failing onEvent from suppressing the progress callback.
        }
      }
      if (legacyProgress && event.type === "convert" && event.stage === "convert_progress") {
        legacyProgress(event.percent, event.message);
      }
    });
  }
  try {
    emitter.detect("reading", "\uC785\uB825 \uD30C\uC77C \uC77D\uB294 \uC911...");
    let buffer;
    try {
      if (typeof input === "string") {
        buffer = await readFile(input);
      } else if (Buffer.isBuffer(input)) {
        buffer = input;
      } else {
        buffer = Buffer.from(input);
      }
    } catch (err) {
      const readError = `\uC785\uB825 \uC77D\uAE30 \uC2E4\uD328: ${err instanceof Error ? err.message : String(err)}`;
      emitter.error("detect", "PARSE_ERROR", readError);
      return {
        success: false,
        code: "PARSE_ERROR",
        error: readError,
        stage: "detect"
      };
    }
    const MAX_FILE_SIZE = 500 * 1024 * 1024;
    if (buffer.length > MAX_FILE_SIZE) {
      const sizeError = `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.length / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`;
      emitter.error("detect", "FILE_TOO_LARGE", sizeError);
      return {
        success: false,
        code: "FILE_TOO_LARGE",
        error: sizeError,
        stage: "detect"
      };
    }
    const format = detectFormat(toArrayBuffer(buffer));
    emitter.detect("format_detected", `\uD3EC\uB9F7 \uAC10\uC9C0 \uC644\uB8CC: ${format}`, { format });
    if (format !== "hwp" && format !== "hwpx") {
      const formatError = `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD3EC\uB9F7\uC785\uB2C8\uB2E4: ${format}`;
      emitter.error("detect", "UNSUPPORTED_FORMAT", formatError);
      return {
        success: false,
        code: "UNSUPPORTED_FORMAT",
        error: formatError,
        stage: "detect"
      };
    }
    emitter.validate("soffice_check", "LibreOffice \uAC00\uC6A9\uC131 \uD655\uC778 \uC911...");
    let sofficePath;
    try {
      sofficePath = await resolveSoffice(emitter, options?.autoInstallLibreOffice ?? true);
    } catch (err) {
      if (err instanceof ConvertError) {
        return {
          success: false,
          code: err.code,
          error: err.message,
          stage: "validate"
        };
      }
      // Anything else is handled by the outer catch below.
      throw err;
    }
    const releaseLock = await acquireConvertLock();
    try {
      emitter.convertStart("\uBCC0\uD658 \uC2DC\uC791...");
      emitter.progress(10, "\uBCC0\uD658 \uC911...");
      const sourceExt = format === "hwpx" ? ".hwpx" : ".hwp";
      const pdf = await convertBuffer(buffer, ".pdf", options?.timeoutMs, sofficePath, sourceExt);
      emitter.progress(100, "\uBCC0\uD658 \uC644\uB8CC");
      emitter.convertDone("\uBCC0\uD658 \uC644\uB8CC");
      const result = {
        success: true,
        pdf: new Uint8Array(pdf),
        sourceFormat: format
      };
      emitter.complete({
        sourceFormat: format,
        pdfSize: pdf.length
      });
      return result;
    } catch (err) {
      if (err instanceof ConvertError) {
        emitter.error("convert", err.code, err.message);
        return {
          success: false,
          code: err.code,
          error: err.message,
          stage: "convert"
        };
      }
      const errorMsg = err instanceof Error ? err.message : "\uBCC0\uD658 \uC2E4\uD328";
      const errorCode = classifyError(err);
      emitter.error("convert", errorCode, errorMsg);
      return {
        success: false,
        code: errorCode,
        error: errorMsg,
        stage: "convert"
      };
    } finally {
      // Always hand the lock on, whether the conversion succeeded or not.
      releaseLock();
    }
  } catch (unexpectedErr) {
    const errorMsg = unexpectedErr instanceof Error ? unexpectedErr.message : "\uC608\uC0C1\uCE58 \uBABB\uD55C \uC624\uB958";
    emitter.error("convert", "PARSE_ERROR", errorMsg);
    return {
      success: false,
      code: "PARSE_ERROR",
      error: errorMsg,
      stage: "convert"
    };
  }
}
|
|
11708
|
-
/**
 * Runs convertToPdf and accepts the result only when the source was HWP.
 *
 * @param {*} input - Forwarded to convertToPdf.
 * @param {object} [options] - Forwarded to convertToPdf.
 * @returns {Promise<object>} The convertToPdf result, or an
 *   UNSUPPORTED_FORMAT failure when a successful conversion reports a
 *   `sourceFormat` other than "hwp".
 */
async function convertHwpToPdf(input, options) {
  const outcome = await convertToPdf(input, options);
  const convertedOtherFormat = outcome.success && outcome.sourceFormat !== "hwp";
  if (!convertedOtherFormat) {
    return outcome;
  }
  const error = `HWP 5.x \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${outcome.sourceFormat}`;
  return { success: false, code: "UNSUPPORTED_FORMAT", error, stage: "detect" };
}
|
|
11720
|
-
/**
 * Runs convertToPdf and accepts the result only when the source was HWPX.
 *
 * @param {*} input - Forwarded to convertToPdf.
 * @param {object} [options] - Forwarded to convertToPdf.
 * @returns {Promise<object>} The convertToPdf result, or an
 *   UNSUPPORTED_FORMAT failure when a successful conversion reports a
 *   `sourceFormat` other than "hwpx".
 */
async function convertHwpxToPdf(input, options) {
  const outcome = await convertToPdf(input, options);
  const convertedOtherFormat = outcome.success && outcome.sourceFormat !== "hwpx";
  if (!convertedOtherFormat) {
    return outcome;
  }
  const error = `HWPX \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${outcome.sourceFormat}`;
  return { success: false, code: "UNSUPPORTED_FORMAT", error, stage: "detect" };
}
|
|
11732
|
-
|
|
11733
11532
|
// src/ocr/api-key-rotation.ts
|
|
11734
11533
|
var AllKeysCoolingDownError = class extends Error {
|
|
11735
11534
|
waitMs;
|
|
@@ -11824,9 +11623,9 @@ var ApiKeyRotationPool = class _ApiKeyRotationPool {
|
|
|
11824
11623
|
};
|
|
11825
11624
|
|
|
11826
11625
|
// src/pipeline/unified-ocr.ts
|
|
11827
|
-
import { mkdir
|
|
11828
|
-
import { basename as basename2,
|
|
11829
|
-
import { spawn as
|
|
11626
|
+
import { mkdir, readdir, readFile, stat, writeFile } from "fs/promises";
|
|
11627
|
+
import { basename as basename2, dirname as dirname3, extname, join as join4, resolve as resolve3 } from "path";
|
|
11628
|
+
import { spawn as spawn2 } from "child_process";
|
|
11830
11629
|
import { performance } from "perf_hooks";
|
|
11831
11630
|
init_logger();
|
|
11832
11631
|
|
|
@@ -11962,13 +11761,13 @@ function elapsedMs(startAt) {
|
|
|
11962
11761
|
async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
11963
11762
|
const absInput = resolve3(inputPath);
|
|
11964
11763
|
const stem = basename2(absInput, extname(absInput));
|
|
11965
|
-
const workspaceDir = resolve3(options.workspaceDir ??
|
|
11966
|
-
const imagesDir =
|
|
11967
|
-
const rawDir =
|
|
11968
|
-
const diffDir =
|
|
11969
|
-
const outputPath = resolve3(options.outputPath ??
|
|
11970
|
-
const reportPath =
|
|
11971
|
-
const modelCachePath =
|
|
11764
|
+
const workspaceDir = resolve3(options.workspaceDir ?? join4(dirname3(absInput), `${stem}_ocr_workspace`));
|
|
11765
|
+
const imagesDir = join4(workspaceDir, "images");
|
|
11766
|
+
const rawDir = join4(workspaceDir, "ocr", "raw");
|
|
11767
|
+
const diffDir = join4(workspaceDir, "ocr", "diff");
|
|
11768
|
+
const outputPath = resolve3(options.outputPath ?? join4(dirname3(absInput), `${stem}.md`));
|
|
11769
|
+
const reportPath = join4(workspaceDir, "run-report.json");
|
|
11770
|
+
const modelCachePath = join4(dirname3(absInput), ".kordoc-model-cache.json");
|
|
11972
11771
|
const baseUrl = options.baseUrl ?? "https://integrate.api.nvidia.com/v1/chat/completions";
|
|
11973
11772
|
const timeoutMs = options.timeoutMs ?? 6e4;
|
|
11974
11773
|
const maxRetriesPerPage = options.maxRetriesPerPage ?? 5;
|
|
@@ -11979,12 +11778,11 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11979
11778
|
const models = sortModelsByCache(modelsInput, modelCache);
|
|
11980
11779
|
const modelMaxTokens = { ...DEFAULT_MODEL_MAX_TOKENS, ...options.modelMaxTokens ?? {} };
|
|
11981
11780
|
const stageWeights = normalizeWeights({ ...DEFAULT_STAGE_WEIGHTS, ...options.stageWeights ?? {} });
|
|
11982
|
-
const keyPool = ApiKeyRotationPool.fromEnv();
|
|
11983
11781
|
const runId = options.runId ?? generateRunId("ocr");
|
|
11984
11782
|
const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
|
|
11985
|
-
await
|
|
11986
|
-
await
|
|
11987
|
-
await
|
|
11783
|
+
await mkdir(imagesDir, { recursive: true });
|
|
11784
|
+
await mkdir(rawDir, { recursive: true });
|
|
11785
|
+
await mkdir(diffDir, { recursive: true });
|
|
11988
11786
|
const timingsMs = {};
|
|
11989
11787
|
const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
|
|
11990
11788
|
const markStageProgress = (stage, stagePercent, current, total, message, model) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message, model });
|
|
@@ -11995,52 +11793,57 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11995
11793
|
};
|
|
11996
11794
|
try {
|
|
11997
11795
|
ensureSupportedInput(absInput);
|
|
11998
|
-
let workingPdfPath = absInput;
|
|
11999
11796
|
const convertStart = performance.now();
|
|
12000
11797
|
currentStage = "convert";
|
|
12001
|
-
markStageStart("convert", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC911");
|
|
12002
|
-
logStage("info", "convert", "start", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC2DC\uC791", { input: absInput });
|
|
12003
11798
|
if (extname(absInput).toLowerCase() !== ".pdf") {
|
|
12004
|
-
|
|
12005
|
-
|
|
12006
|
-
|
|
12007
|
-
|
|
12008
|
-
|
|
12009
|
-
|
|
12010
|
-
|
|
12011
|
-
|
|
12012
|
-
|
|
12013
|
-
|
|
12014
|
-
|
|
12015
|
-
|
|
12016
|
-
|
|
12017
|
-
|
|
12018
|
-
|
|
12019
|
-
|
|
12020
|
-
|
|
12021
|
-
|
|
12022
|
-
|
|
12023
|
-
|
|
12024
|
-
|
|
12025
|
-
|
|
12026
|
-
|
|
12027
|
-
|
|
12028
|
-
|
|
12029
|
-
|
|
11799
|
+
markStageStart("convert", "\uC790\uCCB4 \uD30C\uC11C\uB85C Markdown \uBCC0\uD658 \uC911");
|
|
11800
|
+
logStage("info", "convert", "start", "\uC790\uCCB4 \uD30C\uC11C \uBCC0\uD658 \uC2DC\uC791", { input: absInput });
|
|
11801
|
+
const inputBuffer = await readFile(absInput);
|
|
11802
|
+
const parsed = await parseNativeDocument(inputBuffer);
|
|
11803
|
+
timingsMs.convert = elapsedMs(convertStart);
|
|
11804
|
+
markStageDone("convert", "\uC790\uCCB4 \uD30C\uC11C \uBCC0\uD658 \uC644\uB8CC");
|
|
11805
|
+
logStage("info", "convert", "done", "\uC790\uCCB4 \uD30C\uC11C \uBCC0\uD658 \uC644\uB8CC", { format: parsed.fileType, elapsedMs: timingsMs.convert });
|
|
11806
|
+
const mergeStart2 = performance.now();
|
|
11807
|
+
currentStage = "merge";
|
|
11808
|
+
markStageStart("merge", "Markdown \uC800\uC7A5 \uC911");
|
|
11809
|
+
await writeFile(outputPath, parsed.markdown, "utf-8");
|
|
11810
|
+
timingsMs.merge = elapsedMs(mergeStart2);
|
|
11811
|
+
markStageDone("merge", "Markdown \uC800\uC7A5 \uC644\uB8CC");
|
|
11812
|
+
logStage("info", "merge", "done", "Markdown \uC800\uC7A5 \uC644\uB8CC", { outputPath, elapsedMs: timingsMs.merge });
|
|
11813
|
+
const report2 = {
|
|
11814
|
+
inputPath: absInput,
|
|
11815
|
+
outputPath,
|
|
11816
|
+
workspaceDir,
|
|
11817
|
+
selectedModel: "native-parser",
|
|
11818
|
+
probeImage: "",
|
|
11819
|
+
probeResults: [],
|
|
11820
|
+
pageCount: parsed.pageCount,
|
|
11821
|
+
sourceFormat: parsed.fileType,
|
|
11822
|
+
keyHealth: [],
|
|
11823
|
+
timingsMs,
|
|
11824
|
+
modelCachePath
|
|
11825
|
+
};
|
|
11826
|
+
await writeFile(reportPath, JSON.stringify(report2, null, 2), "utf-8");
|
|
11827
|
+
logStage("info", "finalize", "done", "native parse run-report \uC800\uC7A5 \uC644\uB8CC", { reportPath });
|
|
11828
|
+
return { outputPath, reportPath, selectedModel: "native-parser" };
|
|
12030
11829
|
}
|
|
11830
|
+
const workingPdfPath = absInput;
|
|
11831
|
+
markStageStart("convert", "PDF \uC785\uB825 \uD655\uC778 \uC911");
|
|
11832
|
+
logStage("info", "convert", "start", "PDF \uC785\uB825 \uD655\uC778", { input: absInput });
|
|
12031
11833
|
timingsMs.convert = elapsedMs(convertStart);
|
|
12032
|
-
markStageDone("convert", "PDF \
|
|
12033
|
-
logStage("info", "convert", "done", "PDF \
|
|
11834
|
+
markStageDone("convert", "PDF \uC785\uB825 \uD655\uC778 \uC644\uB8CC");
|
|
11835
|
+
logStage("info", "convert", "done", "PDF \uC785\uB825 \uD655\uC778 \uC644\uB8CC", { elapsedMs: timingsMs.convert });
|
|
11836
|
+
const keyPool = ApiKeyRotationPool.fromEnv();
|
|
12034
11837
|
const renderStart = performance.now();
|
|
12035
11838
|
currentStage = "render";
|
|
12036
11839
|
const totalPages = await getPdfPageCount(workingPdfPath).catch(() => 0);
|
|
12037
11840
|
if (totalPages === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC218\uB97C \uD655\uC778\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.");
|
|
12038
11841
|
markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
|
|
12039
11842
|
logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi, totalPages });
|
|
12040
|
-
await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath,
|
|
11843
|
+
await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, join4(imagesDir, "page")]);
|
|
12041
11844
|
const firstFiles = (await readdir(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
|
|
12042
11845
|
if (firstFiles.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uCCAB \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328");
|
|
12043
|
-
const probeImage =
|
|
11846
|
+
const probeImage = join4(imagesDir, firstFiles[0]);
|
|
12044
11847
|
markStageProgress("render", Math.round(1 / totalPages * 100), 1, totalPages, `\uD398\uC774\uC9C0 1/${totalPages} \uB80C\uB354\uB9C1`);
|
|
12045
11848
|
const probeStart = performance.now();
|
|
12046
11849
|
currentStage = "probe";
|
|
@@ -12076,7 +11879,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
12076
11879
|
const keyCount = keyPool.snapshot().length;
|
|
12077
11880
|
const workerCount = Math.max(1, keyCount * concurrencyPerKey);
|
|
12078
11881
|
const queueCapacity = workerCount * 2;
|
|
12079
|
-
const
|
|
11882
|
+
const queue = new BoundedQueue(queueCapacity);
|
|
12080
11883
|
const ocrStart = performance.now();
|
|
12081
11884
|
currentStage = "ocr";
|
|
12082
11885
|
markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (\uC6CC\uCEE4 ${workerCount}\uAC1C)`);
|
|
@@ -12084,17 +11887,17 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
12084
11887
|
let renderDone = 1;
|
|
12085
11888
|
const renderProducer = (async () => {
|
|
12086
11889
|
try {
|
|
12087
|
-
await
|
|
11890
|
+
await queue.enqueue({ pageNumber: 1, imagePath: probeImage });
|
|
12088
11891
|
if (totalPages > 1) {
|
|
12089
|
-
for await (const item of renderPdfToPngStream(workingPdfPath,
|
|
12090
|
-
await
|
|
11892
|
+
for await (const item of renderPdfToPngStream(workingPdfPath, join4(imagesDir, "page"), dpi, totalPages, 2)) {
|
|
11893
|
+
await queue.enqueue(item);
|
|
12091
11894
|
renderDone++;
|
|
12092
11895
|
markStageProgress("render", Math.round(renderDone / totalPages * 100), renderDone, totalPages, `\uD398\uC774\uC9C0 ${renderDone}/${totalPages} \uB80C\uB354\uB9C1`);
|
|
12093
11896
|
logStage("debug", "render", "progress", "\uD398\uC774\uC9C0 \uB80C\uB354 \uC644\uB8CC", { page: item.pageNumber });
|
|
12094
11897
|
}
|
|
12095
11898
|
}
|
|
12096
11899
|
} finally {
|
|
12097
|
-
|
|
11900
|
+
queue.close();
|
|
12098
11901
|
timingsMs.render = elapsedMs(renderStart);
|
|
12099
11902
|
markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
|
|
12100
11903
|
logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: renderDone, elapsedMs: timingsMs.render });
|
|
@@ -12103,7 +11906,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
12103
11906
|
const [, pageResultsMap] = await Promise.all([
|
|
12104
11907
|
renderProducer,
|
|
12105
11908
|
ocrWorkerPool({
|
|
12106
|
-
queue
|
|
11909
|
+
queue,
|
|
12107
11910
|
workerCount,
|
|
12108
11911
|
totalPages,
|
|
12109
11912
|
ocrInput: {
|
|
@@ -12136,8 +11939,8 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
12136
11939
|
const sortedEntries = Array.from(pageResultsMap.entries()).sort((a, b) => a[0] - b[0]);
|
|
12137
11940
|
const rawPagePaths = [];
|
|
12138
11941
|
for (const [pageNum, markdown] of sortedEntries) {
|
|
12139
|
-
const pagePath =
|
|
12140
|
-
await
|
|
11942
|
+
const pagePath = join4(rawDir, `page_${String(pageNum).padStart(4, "0")}.md`);
|
|
11943
|
+
await writeFile(pagePath, markdown, "utf-8");
|
|
12141
11944
|
rawPagePaths.push(pagePath);
|
|
12142
11945
|
}
|
|
12143
11946
|
const mergeStart = performance.now();
|
|
@@ -12145,7 +11948,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
12145
11948
|
markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
|
|
12146
11949
|
logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
|
|
12147
11950
|
const merged = await mergeMarkdownPages(rawPagePaths);
|
|
12148
|
-
await
|
|
11951
|
+
await writeFile(outputPath, merged, "utf-8");
|
|
12149
11952
|
timingsMs.merge = elapsedMs(mergeStart);
|
|
12150
11953
|
markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
|
|
12151
11954
|
logStage("info", "merge", "done", "\uCD5C\uC885 \uBCD1\uD569 \uC644\uB8CC", { outputPath, elapsedMs: timingsMs.merge });
|
|
@@ -12161,7 +11964,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
12161
11964
|
timingsMs,
|
|
12162
11965
|
modelCachePath
|
|
12163
11966
|
};
|
|
12164
|
-
await
|
|
11967
|
+
await writeFile(reportPath, JSON.stringify(report, null, 2), "utf-8");
|
|
12165
11968
|
logStage("info", "finalize", "done", "run-report \uC800\uC7A5 \uC644\uB8CC", { reportPath });
|
|
12166
11969
|
return { outputPath, reportPath, selectedModel };
|
|
12167
11970
|
} catch (err) {
|
|
@@ -12252,7 +12055,7 @@ async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, start
|
|
|
12252
12055
|
]);
|
|
12253
12056
|
const files = await readdir(imagesDir);
|
|
12254
12057
|
const pageFiles = files.filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
|
|
12255
|
-
const imagePath =
|
|
12058
|
+
const imagePath = join4(imagesDir, pageFiles[pageFiles.length - 1]);
|
|
12256
12059
|
yield { pageNumber: page, imagePath };
|
|
12257
12060
|
} catch (err) {
|
|
12258
12061
|
yield {
|
|
@@ -12265,7 +12068,7 @@ async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, start
|
|
|
12265
12068
|
}
|
|
12266
12069
|
async function runCommand(cmd, args) {
|
|
12267
12070
|
await new Promise((resolvePromise, reject) => {
|
|
12268
|
-
const child =
|
|
12071
|
+
const child = spawn2(cmd, args, { stdio: "pipe" });
|
|
12269
12072
|
let stderr = "";
|
|
12270
12073
|
child.stderr.on("data", (d) => {
|
|
12271
12074
|
stderr += String(d);
|
|
@@ -12279,7 +12082,7 @@ async function runCommand(cmd, args) {
|
|
|
12279
12082
|
}
|
|
12280
12083
|
async function runCommandWithStdout(cmd, args) {
|
|
12281
12084
|
return await new Promise((resolvePromise, reject) => {
|
|
12282
|
-
const child =
|
|
12085
|
+
const child = spawn2(cmd, args, { stdio: "pipe" });
|
|
12283
12086
|
let stdout = "";
|
|
12284
12087
|
let stderr = "";
|
|
12285
12088
|
child.stdout.on("data", (d) => {
|
|
@@ -12295,6 +12098,32 @@ async function runCommandWithStdout(cmd, args) {
|
|
|
12295
12098
|
});
|
|
12296
12099
|
});
|
|
12297
12100
|
}
|
|
12101
|
+
/**
 * Parses a document buffer with the built-in parsers and returns Markdown.
 *
 * Dispatch: a detected "hwp" goes to parseHwp5Document. A detected "hwpx" is
 * inspected with detectZipFormat and routed to the XLSX, DOCX or HWPX parser
 * accordingly. Anything else is rejected.
 *
 * The page count comes from `result.metadata.pageCount` when present,
 * otherwise from the highest `pageNumber` among the blocks (minimum 1). A
 * result without a `blocks` array is treated as having no blocks, and the
 * maximum is folded rather than spread so large block lists cannot exceed
 * the engine's argument limit.
 *
 * @param {Buffer} buffer - Raw document bytes.
 * @returns {Promise<{markdown: *, fileType: string, pageCount: number}>}
 * @throws {UnifiedOcrError} UNSUPPORTED_INPUT (stage "convert") when the
 *   detected format is neither "hwp" nor "hwpx".
 */
async function parseNativeDocument(buffer) {
  const arrayBuffer = toArrayBuffer(buffer);
  const format = detectFormat(arrayBuffer);
  let result;
  let fileType;
  if (format === "hwp") {
    result = parseHwp5Document(buffer);
    fileType = "hwp";
  } else if (format === "hwpx") {
    // The "hwpx" detection covers ZIP containers in general here; look
    // inside to pick the matching parser.
    const { format: zipFormat, zip } = await detectZipFormat(arrayBuffer);
    if (zipFormat === "xlsx") {
      result = await parseXlsxDocument(arrayBuffer, void 0, zip ?? void 0);
      fileType = "xlsx";
    } else if (zipFormat === "docx") {
      result = await parseDocxDocument(arrayBuffer, void 0, zip ?? void 0);
      fileType = "docx";
    } else {
      result = await parseHwpxDocument(arrayBuffer, void 0, zip ?? void 0);
      fileType = "hwpx";
    }
  } else {
    throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC790\uCCB4 \uD30C\uC11C\uB85C \uCC98\uB9AC\uD560 \uC218 \uC5C6\uB294 \uC785\uB825 \uD3EC\uB9F7: ${format}`);
  }
  const blocks = Array.isArray(result.blocks) ? result.blocks : [];
  const pageCount = result.metadata?.pageCount ?? blocks.reduce((highest, block) => Math.max(highest, block.pageNumber ?? 1), 1);
  return { markdown: result.markdown, fileType, pageCount };
}
|
|
12298
12127
|
function naturalPageSort(a, b) {
|
|
12299
12128
|
const na = Number((a.match(/\d+/g) || ["0"]).at(-1) || 0);
|
|
12300
12129
|
const nb = Number((b.match(/\d+/g) || ["0"]).at(-1) || 0);
|
|
@@ -12368,7 +12197,7 @@ function startParallelProbeRuns(input) {
|
|
|
12368
12197
|
}
|
|
12369
12198
|
async function loadModelCache(path) {
|
|
12370
12199
|
try {
|
|
12371
|
-
const raw = await
|
|
12200
|
+
const raw = await readFile(path, "utf-8");
|
|
12372
12201
|
return JSON.parse(raw);
|
|
12373
12202
|
} catch {
|
|
12374
12203
|
return null;
|
|
@@ -12399,15 +12228,15 @@ async function updateModelCache(path, probes) {
|
|
|
12399
12228
|
}
|
|
12400
12229
|
}
|
|
12401
12230
|
current.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
12402
|
-
await
|
|
12231
|
+
await writeFile(path, JSON.stringify(current, null, 2), "utf-8");
|
|
12403
12232
|
}
|
|
12404
12233
|
async function ocrWorkerPool(input) {
|
|
12405
|
-
const { queue
|
|
12234
|
+
const { queue, workerCount, ocrInput, onPageDone } = input;
|
|
12406
12235
|
const results = /* @__PURE__ */ new Map();
|
|
12407
12236
|
let completedCount = 0;
|
|
12408
12237
|
async function worker() {
|
|
12409
12238
|
while (true) {
|
|
12410
|
-
const item = await
|
|
12239
|
+
const item = await queue.dequeue();
|
|
12411
12240
|
if (item === QUEUE_DONE) break;
|
|
12412
12241
|
const { pageNumber, imagePath, error } = item;
|
|
12413
12242
|
if (imagePath === null) {
|
|
@@ -12459,7 +12288,7 @@ async function ocrImageWithFallback(input) {
|
|
|
12459
12288
|
async function mergeMarkdownPages(paths) {
|
|
12460
12289
|
const out = [];
|
|
12461
12290
|
for (let i = 0; i < paths.length; i++) {
|
|
12462
|
-
const txt = (await
|
|
12291
|
+
const txt = (await readFile(paths[i], "utf-8")).trim();
|
|
12463
12292
|
if (!txt) continue;
|
|
12464
12293
|
out.push(txt);
|
|
12465
12294
|
}
|
|
@@ -12575,7 +12404,7 @@ async function ocrImageViaNim(input) {
|
|
|
12575
12404
|
throw new UnifiedOcrError("OCR_FAILED", "ocr", `OCR \uC7AC\uC2DC\uB3C4 \uCD08\uACFC: ${lastErr}`);
|
|
12576
12405
|
}
|
|
12577
12406
|
/**
 * Read the file at `path` and return its full contents as a base64 string.
 *
 * @param path Location of the file to read.
 * @returns A promise for the base64 text of the file's bytes.
 */
async function encodeBase64(path) {
  // Let readFile perform the base64 encoding instead of converting a Buffer afterwards.
  return await readFile(path, "base64");
}
|
|
12581
12410
|
function stripCodeFence3(text) {
|
|
@@ -12595,16 +12424,6 @@ function ensureSupportedInput(path) {
|
|
|
12595
12424
|
}
|
|
12596
12425
|
function normalizePipelineError(err, stage) {
|
|
12597
12426
|
if (err instanceof UnifiedOcrError) return err;
|
|
12598
|
-
if (err instanceof ConvertError) {
|
|
12599
|
-
const codeMap = {
|
|
12600
|
-
SOFFICE_NOT_FOUND: "SOFFICE_NOT_FOUND",
|
|
12601
|
-
CONVERT_FAILED: "CONVERT_FAILED",
|
|
12602
|
-
TIMEOUT: "CONVERT_FAILED",
|
|
12603
|
-
UNSUPPORTED_PLATFORM: "CONVERT_FAILED",
|
|
12604
|
-
UNSUPPORTED_FORMAT: "UNSUPPORTED_INPUT"
|
|
12605
|
-
};
|
|
12606
|
-
return new UnifiedOcrError(codeMap[err.code] ?? "CONVERT_FAILED", stage, err.message);
|
|
12607
|
-
}
|
|
12608
12427
|
const message = err instanceof Error ? err.message : String(err);
|
|
12609
12428
|
const codeByStage = {
|
|
12610
12429
|
convert: "CONVERT_FAILED",
|
|
@@ -12624,7 +12443,7 @@ async function parse2(input, options) {
|
|
|
12624
12443
|
let buffer;
|
|
12625
12444
|
if (typeof input === "string") {
|
|
12626
12445
|
try {
|
|
12627
|
-
const buf = await
|
|
12446
|
+
const buf = await readFile2(input);
|
|
12628
12447
|
buffer = toArrayBuffer(buf);
|
|
12629
12448
|
} catch (err) {
|
|
12630
12449
|
const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
|
|
@@ -12783,9 +12602,6 @@ export {
|
|
|
12783
12602
|
VERSION,
|
|
12784
12603
|
blocksToMarkdown,
|
|
12785
12604
|
compare,
|
|
12786
|
-
convertHwpToPdf,
|
|
12787
|
-
convertHwpxToPdf,
|
|
12788
|
-
convertToPdf,
|
|
12789
12605
|
detectFormat,
|
|
12790
12606
|
detectZipFormat,
|
|
12791
12607
|
diffBlocks,
|