@clazic/kordoc 2.7.4 → 2.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -3062,9 +3062,6 @@ __export(index_exports, {
3062
3062
  VERSION: () => VERSION,
3063
3063
  blocksToMarkdown: () => blocksToMarkdown,
3064
3064
  compare: () => compare,
3065
- convertHwpToPdf: () => convertHwpToPdf,
3066
- convertHwpxToPdf: () => convertHwpxToPdf,
3067
- convertToPdf: () => convertToPdf,
3068
3065
  detectFormat: () => detectFormat,
3069
3066
  detectZipFormat: () => detectZipFormat,
3070
3067
  diffBlocks: () => diffBlocks,
@@ -3084,7 +3081,7 @@ __export(index_exports, {
3084
3081
  runUnifiedOcrPipeline: () => runUnifiedOcrPipeline
3085
3082
  });
3086
3083
  module.exports = __toCommonJS(index_exports);
3087
- var import_promises5 = require("fs/promises");
3084
+ var import_promises3 = require("fs/promises");
3088
3085
 
3089
3086
  // src/detect.ts
3090
3087
  var import_jszip = __toESM(require("jszip"), 1);
@@ -3137,7 +3134,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
3137
3134
  var import_xmldom = require("@xmldom/xmldom");
3138
3135
 
3139
3136
  // src/utils.ts
3140
- var VERSION = true ? "2.7.3" : "0.0.0-dev";
3137
+ var VERSION = true ? "2.7.6" : "0.0.0-dev";
3141
3138
  function toArrayBuffer(buf) {
3142
3139
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
3143
3140
  return buf.buffer;
@@ -3344,13 +3341,21 @@ function sanitizeText(text) {
3344
3341
  }
3345
3342
  return result;
3346
3343
  }
3344
/**
 * Backslash-escapes characters that GitHub-flavoured Markdown would otherwise
 * interpret: `~` always, and `|` as well when the text goes into a table cell.
 * Characters already preceded by a backslash are left alone.
 *
 * @param {string} text - Text to escape; falsy values are returned unchanged.
 * @param {boolean} [inTableCell=false] - Also escape `|` when true.
 * @returns {string} The escaped text.
 */
function escapeGfm(text, inTableCell = false) {
  if (!text) {
    return text;
  }
  const tildeEscaped = text.replace(/(?<!\\)~/g, "\\~");
  return inTableCell ? tildeEscaped.replace(/(?<!\\)\|/g, "\\|") : tildeEscaped;
}
3347
3352
  function blocksToMarkdown(blocks) {
3348
3353
  const lines = [];
3349
3354
  for (let i = 0; i < blocks.length; i++) {
3350
3355
  const block = blocks[i];
3351
3356
  if (block.type === "heading" && block.text) {
3352
3357
  const prefix = "#".repeat(Math.min(block.level || 2, 6));
3353
- const headingText = sanitizeText(block.text);
3358
+ const headingText = escapeGfm(sanitizeText(block.text), false);
3354
3359
  if (headingText) lines.push("", `${prefix} ${headingText}`, "");
3355
3360
  continue;
3356
3361
  }
@@ -3363,42 +3368,47 @@ function blocksToMarkdown(blocks) {
3363
3368
  continue;
3364
3369
  }
3365
3370
  if (block.type === "list" && block.text) {
3366
- const listText = sanitizeText(block.text);
3367
- if (!listText) continue;
3368
- const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(listText);
3371
+ const sanitized = sanitizeText(block.text);
3372
+ if (!sanitized) continue;
3373
+ const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(sanitized);
3369
3374
  const prefix = alreadyNumbered ? "" : block.listType === "ordered" ? "1. " : "- ";
3375
+ const listText = escapeGfm(sanitized, false);
3370
3376
  lines.push(`${prefix}${listText}`);
3371
3377
  if (block.children) {
3372
3378
  for (const child of block.children) {
3373
3379
  const childPrefix = child.listType === "ordered" ? "1." : "-";
3374
- lines.push(` ${childPrefix} ${child.text || ""}`);
3380
+ const childText = child.text ? escapeGfm(sanitizeText(child.text), false) : "";
3381
+ lines.push(` ${childPrefix} ${childText}`);
3375
3382
  }
3376
3383
  }
3377
3384
  continue;
3378
3385
  }
3379
3386
  if (block.type === "paragraph" && block.text) {
3380
- let text = sanitizeText(block.text);
3381
- if (!text) continue;
3382
- if (/^\[별표\s*\d+/.test(text)) {
3387
+ const sanitized = sanitizeText(block.text);
3388
+ if (!sanitized) continue;
3389
+ if (/^\[별표\s*\d+/.test(sanitized)) {
3383
3390
  const nextBlock = blocks[i + 1];
3391
+ const escapedSelf = escapeGfm(sanitized, false);
3384
3392
  if (nextBlock?.type === "paragraph" && nextBlock.text && /관련\)?$/.test(nextBlock.text)) {
3385
- lines.push("", `## ${text} ${nextBlock.text}`, "");
3393
+ const nextEscaped = escapeGfm(sanitizeText(nextBlock.text), false);
3394
+ lines.push("", `## ${escapedSelf} ${nextEscaped}`, "");
3386
3395
  i++;
3387
3396
  } else {
3388
- lines.push("", `## ${text}`, "");
3397
+ lines.push("", `## ${escapedSelf}`, "");
3389
3398
  }
3390
3399
  continue;
3391
3400
  }
3392
- if (/^\([^)]*조[^)]*관련\)$/.test(text)) {
3393
- lines.push(`*${text}*`, "");
3401
+ if (/^\([^)]*조[^)]*관련\)$/.test(sanitized)) {
3402
+ lines.push(`*${escapeGfm(sanitized, false)}*`, "");
3394
3403
  continue;
3395
3404
  }
3405
+ let text = escapeGfm(sanitized, false);
3396
3406
  if (block.href) {
3397
3407
  const href = sanitizeHref(block.href);
3398
3408
  if (href) text = `[${text}](${href})`;
3399
3409
  }
3400
3410
  if (block.footnoteText) {
3401
- text += ` (\uC8FC: ${block.footnoteText})`;
3411
+ text += ` (\uC8FC: ${escapeGfm(block.footnoteText, false)})`;
3402
3412
  }
3403
3413
  lines.push(text);
3404
3414
  } else if (block.type === "table" && block.table) {
@@ -3423,13 +3433,13 @@ function tableToMarkdown(table) {
3423
3433
  return content.split(/\n/).map((line) => {
3424
3434
  const trimmed = line.trim();
3425
3435
  if (!trimmed) return "";
3426
- if (/^\d+\.\s/.test(trimmed)) return `**${trimmed}**`;
3427
- if (/^[가-힣]\.\s/.test(trimmed)) return ` ${trimmed}`;
3428
- return trimmed;
3436
+ if (/^\d+\.\s/.test(trimmed)) return `**${escapeGfm(trimmed, false)}**`;
3437
+ if (/^[가-힣]\.\s/.test(trimmed)) return ` ${escapeGfm(trimmed, false)}`;
3438
+ return escapeGfm(trimmed, false);
3429
3439
  }).filter(Boolean).join("\n");
3430
3440
  }
3431
3441
  if (numCols === 1 && numRows >= 2) {
3432
- return cells.map((row) => sanitizeText(row[0].text).replace(/\n/g, " ")).filter(Boolean).join("\n");
3442
+ return cells.map((row) => escapeGfm(sanitizeText(row[0].text).replace(/\n/g, " "), false)).filter(Boolean).join("\n");
3433
3443
  }
3434
3444
  const display = Array.from({ length: numRows }, () => Array(numCols).fill(""));
3435
3445
  const skip = /* @__PURE__ */ new Set();
@@ -3438,7 +3448,7 @@ function tableToMarkdown(table) {
3438
3448
  if (skip.has(`${r},${c}`)) continue;
3439
3449
  const cell = cells[r]?.[c];
3440
3450
  if (!cell) continue;
3441
- display[r][c] = sanitizeText(cell.text).replace(/\n/g, "<br>");
3451
+ display[r][c] = escapeGfm(sanitizeText(cell.text).replace(/\n/g, "<br>"), true);
3442
3452
  for (let dr = 0; dr < cell.rowSpan; dr++) {
3443
3453
  for (let dc = 0; dc < cell.colSpan; dc++) {
3444
3454
  if (dr === 0 && dc === 0) continue;
@@ -3485,6 +3495,223 @@ var HEADING_RATIO_H1 = 1.5;
3485
3495
  var HEADING_RATIO_H2 = 1.3;
3486
3496
  var HEADING_RATIO_H3 = 1.15;
3487
3497
 
3498
+ // src/hwp5/equation.ts
3499
// Lower-case equation keywords and the LaTeX command each one is rewritten to.
// Consulted by convertWords after SYMBOL_WORDS.
var WORD_COMMANDS = /* @__PURE__ */ new Map(Object.entries({
  // Greek letters
  alpha: "\\alpha",
  beta: "\\beta",
  gamma: "\\gamma",
  delta: "\\delta",
  epsilon: "\\epsilon",
  theta: "\\theta",
  lambda: "\\lambda",
  mu: "\\mu",
  pi: "\\pi",
  sigma: "\\sigma",
  tau: "\\tau",
  phi: "\\phi",
  omega: "\\omega",
  // Named functions
  sin: "\\sin",
  cos: "\\cos",
  tan: "\\tan",
  sec: "\\sec",
  csc: "\\csc",
  cot: "\\cot",
  log: "\\log",
  ln: "\\ln",
  lim: "\\lim",
  inf: "\\infty",
  // Big operators
  sum: "\\sum",
  smallsum: "\\sum",
  prod: "\\prod",
  int: "\\int",
  oint: "\\oint",
  // Arrows and miscellaneous symbols
  rightarrow: "\\rightarrow",
  leftarrow: "\\leftarrow",
  partial: "\\partial",
  nabla: "\\nabla",
  angle: "\\angle",
  triangle: "\\triangle",
  // Accents and delimiters
  vec: "\\vec",
  bar: "\\overline",
  dot: "\\dot",
  hat: "\\hat",
  left: "\\left",
  right: "\\right"
}));
3541
// Equation keywords that stand for a symbol, with the LaTeX each is rewritten to.
// convertWords checks this table first (exact case, then lower-cased) before
// falling back to WORD_COMMANDS.
var SYMBOL_WORDS = /* @__PURE__ */ new Map([
  ["times", "\\times"],
  ["divide", "\\div"],
  ["div", "\\div"],
  ["le", "\\leq"],
  // `leq` mirrors the existing `ge`/`geq` pair; without it the word `leq`
  // passed through unconverted.
  ["leq", "\\leq"],
  ["ge", "\\geq"],
  ["geq", "\\geq"],
  ["deg", "^\\circ"],
  ["rarrow", "\\rightarrow"],
  ["larrow", "\\leftarrow"],
  ["lrarrow", "\\leftrightarrow"],
  ["in", "\\in"],
  ["notin", "\\notin"],
  ["emptyset", "\\emptyset"],
  ["subset", "\\subset"],
  ["nsubset", "\\nsubseteq"],
  ["cup", "\\cup"],
  ["cap", "\\cap"],
  ["smallinter", "\\cap"],
  ["sim", "\\sim"],
  ["circ", "\\circ"],
  ["bot", "\\perp"],
  ["dyad", "\\overleftrightarrow"],
  ["arch", "\\overset{\\frown}"]
]);
3566
/**
 * Entry point of the equation converter: strips NUL characters and surrounding
 * whitespace from the script, then runs the conversion pipeline at depth 0.
 *
 * @param {string} equation - Raw equation script.
 * @returns {string} The converted LaTeX text.
 */
function hwpEquationToLatex(equation) {
  const withoutNuls = equation.replace(/\0/g, "");
  return convertEquation(withoutNuls.trim(), 0);
}
3569
/**
 * Runs the full rewrite pipeline over one equation fragment.
 *
 * Whitespace runs are collapsed and runs of backticks or tildes become `\,`
 * before the stages execute in a fixed order. The root, fraction and sqrt
 * stages call back into this function for their operands, so `depth` bounds
 * that recursion.
 *
 * @param {string} equation - Equation fragment to convert.
 * @param {number} depth - Current recursion depth.
 * @returns {string} Converted text; the input is returned untouched when it
 *   is empty or `depth` exceeds 12.
 */
function convertEquation(equation, depth) {
  if (!equation || depth > 12) {
    return equation;
  }
  const normalized = equation
    .replace(/\s+/g, " ")
    .replace(/`+/g, "\\,")
    .replace(/~+/g, "\\,")
    .trim();
  // Stage order matters: structural rewrites run before word/symbol mapping.
  const stages = [
    (text) => convertMatrixLike(text),
    (text) => convertRoots(text, depth),
    (text) => convertOver(text, depth),
    (text) => convertSqrt(text, depth),
    (text) => convertScripts(text),
    (text) => convertOperators(text),
    (text) => removeFontDirectives(text),
    (text) => convertWords(text),
    (text) => cleanupLatexSpacing(text)
  ];
  return stages.reduce((text, stage) => stage(text), normalized);
}
3583
/**
 * Rewrites brace-delimited `matrix{...}` and `cases{...}` groups (without
 * nested braces) into LaTeX `matrix` / `cases` environments.
 *
 * `#` in the body separates rows and becomes `\\`. Column separators (`&`)
 * already in the body are passed through unchanged. The original code joined
 * matrix rows with ` & `, which flattened every row into extra columns of one
 * row; `cases` already used `\\`.
 *
 * @param {string} input - Equation text.
 * @returns {string} Text with matrix/cases groups replaced.
 */
function convertMatrixLike(input) {
  const toEnvironment = (name) => (_match, body) => {
    const rows = body.split("#").map((row) => row.trim());
    return `\\begin{${name}} ${rows.join(" \\\\ ")} \\end{${name}}`;
  };
  return input
    .replace(/\bmatrix\s*\{([^{}]*)\}/gi, toEnvironment("matrix"))
    .replace(/\bcases\s*\{([^{}]*)\}/gi, toEnvironment("cases"));
}
3592
/**
 * Rewrites `root <degree> of <radicand>` into `\sqrt[degree]{radicand}`.
 * Each operand is either a brace group without nested braces or a run of
 * non-whitespace characters, and is converted recursively one level deeper.
 *
 * @param {string} input - Equation text.
 * @param {number} depth - Current recursion depth.
 * @returns {string} Text with root expressions replaced.
 */
function convertRoots(input, depth) {
  const rootPattern = /(?<!\\)\broot\s+({[^{}]*}|\S+)\s+of\s+({[^{}]*}|\S+)/gi;
  const nestedDepth = depth + 1;
  return input.replace(rootPattern, (_whole, degree, radicand) => {
    const index = convertEquation(unwrapGroup(degree), nestedDepth);
    const body = convertEquation(unwrapGroup(radicand), nestedDepth);
    return `\\sqrt[${index}]{${body}}`;
  });
}
3597
/**
 * Rewrites `sqrt <operand>` (not already preceded by a backslash) into
 * `\sqrt{operand}`. The operand is a brace group without nested braces or a
 * run of non-whitespace characters, and is converted recursively.
 *
 * @param {string} input - Equation text.
 * @param {number} depth - Current recursion depth.
 * @returns {string} Text with sqrt expressions replaced.
 */
function convertSqrt(input, depth) {
  const sqrtPattern = /(?<!\\)\bsqrt\s*({[^{}]*}|\S+)/gi;
  return input.replace(sqrtPattern, (_whole, radicand) => {
    const body = convertEquation(unwrapGroup(radicand), depth + 1);
    return `\\sqrt{${body}}`;
  });
}
3602
/**
 * Rewrites infix `a over b` into `\frac{a}{b}`.
 *
 * Repeatedly finds the first top-level `over`, takes one atom on each side
 * and replaces the three pieces with a `\frac`. Both atoms are converted
 * recursively. At most 50 rewrites are attempted, and the loop stops early
 * when no `over` remains or an operand is missing.
 *
 * @param {string} input - Equation text.
 * @param {number} depth - Current recursion depth.
 * @returns {string} Text with fractions replaced.
 */
function convertOver(input, depth) {
  const keyword = "over";
  const maxRewrites = 50;
  let text = input;
  let rewrites = 0;
  while (rewrites < maxRewrites) {
    const keywordAt = findTopLevelWord(text, keyword);
    if (keywordAt < 0) {
      break;
    }
    const lhs = readLeftAtom(text, keywordAt);
    const rhs = readRightAtom(text, keywordAt + keyword.length);
    if (!lhs || !rhs) {
      break;
    }
    const numerator = convertEquation(unwrapGroup(lhs.atom), depth + 1);
    const denominator = convertEquation(unwrapGroup(rhs.atom), depth + 1);
    const head = text.slice(0, lhs.start);
    const tail = text.slice(rhs.end);
    text = head + `\\frac{${numerator}}{${denominator}}` + tail;
    rewrites += 1;
  }
  return text;
}
3616
/**
 * Normalises superscripts and subscripts: whitespace around `^` and `_` is
 * removed, and any script operand not already in braces is wrapped in `{}`.
 *
 * @param {string} input - Equation text.
 * @returns {string} Text with braced script operands.
 */
function convertScripts(input) {
  const tightened = input.replace(/\s*\^\s*/g, "^").replace(/\s*_\s*/g, "_");
  const superscriptsBraced = tightened.replace(/\^(?!\{)([^\s{}_^]+)/g, "^{$1}");
  return superscriptsBraced.replace(/_(?!\{)([^\s{}_^]+)/g, "_{$1}");
}
3619
/**
 * Replaces ASCII operator digraphs and a few Unicode glyphs with LaTeX
 * commands.
 *
 * Every replacement ends with a space so the command name cannot fuse with a
 * following letter. Previously `a<=b` became `a\leqb` and `a+-b` became
 * `a\pmb`; only the glyph replacements added the space. Surplus spaces are
 * collapsed later by cleanupLatexSpacing.
 *
 * @param {string} input - Equation text.
 * @returns {string} Text with operators replaced.
 */
function convertOperators(input) {
  // Order is significant: `!=`, `<=` and `>=` must be consumed before `==`.
  const replacements = [
    [/\+-/g, "\\pm "],
    [/-\+/g, "\\mp "],
    [/\/\//g, "\\parallel "],
    [/△/g, "\\triangle "],
    [/□/g, "\\square "],
    [/‧/g, "\\cdot "],
    [/!=/g, "\\neq "],
    [/<=/g, "\\leq "],
    [/>=/g, "\\geq "],
    [/==/g, "\\equiv "]
  ];
  return replacements.reduce(
    (text, [pattern, latex]) => text.replace(pattern, latex),
    input
  );
}
3622
/**
 * Drops the standalone font directives `rm` and `it` (case-insensitive) along
 * with any whitespace that follows them. Occurrences preceded by a backslash
 * are kept.
 *
 * @param {string} input - Equation text.
 * @returns {string} Text without font directives.
 */
function removeFontDirectives(input) {
  const fontDirective = /(?<!\\)\b(?:rm|it)\b\s*/gi;
  return input.replace(fontDirective, "");
}
3625
/**
 * Maps alphanumeric words to LaTeX using the keyword tables.
 *
 * A word is looked up in SYMBOL_WORDS with its exact spelling first, then
 * lower-cased in SYMBOL_WORDS and WORD_COMMANDS. Words with no entry are left
 * as they are. Words directly preceded by a backslash, letter or digit are
 * never matched.
 *
 * @param {string} input - Equation text.
 * @returns {string} Text with known words replaced.
 */
function convertWords(input) {
  const wordPattern = /(?<![\\A-Za-z0-9])([A-Za-z][A-Za-z0-9]*)(?![A-Za-z0-9])/g;
  return input.replace(wordPattern, (word) => {
    const caseSensitiveHit = SYMBOL_WORDS.get(word);
    if (caseSensitiveHit) {
      return caseSensitiveHit;
    }
    const folded = word.toLowerCase();
    for (const table of [SYMBOL_WORDS, WORD_COMMANDS]) {
      const hit = table.get(folded);
      if (hit != null) {
        return hit;
      }
    }
    return word;
  });
}
3633
/**
 * Final tidy-up of the generated LaTeX: attaches delimiters to `\left` and
 * `\right` (escaping curly braces), removes whitespace around `\,`, collapses
 * whitespace runs, strips padding just inside braces and trims the result.
 *
 * @param {string} input - LaTeX text.
 * @returns {string} The tidied text.
 */
function cleanupLatexSpacing(input) {
  // Applied strictly in this order; later rules rely on earlier ones.
  const rules = [
    [/\\left\s*\{/g, "\\left\\{"],
    [/\\right\s*\}/g, "\\right\\}"],
    [/\\left\s*([\[\]\(\)\|])/g, "\\left$1"],
    [/\\right\s*([\[\]\(\)\|])/g, "\\right$1"],
    [/\s*\\,\s*/g, "\\,"],
    [/\s+/g, " "],
    [/\{\s+/g, "{"],
    [/\s+\}/g, "}"]
  ];
  let text = input;
  for (const [pattern, replacement] of rules) {
    text = text.replace(pattern, replacement);
  }
  return text.trim();
}
3636
/**
 * Finds the first occurrence of `word` that sits outside every `{}` and `()`
 * group and is not glued to a neighbouring word character. The comparison
 * lower-cases the candidate text, so `word` is expected in lower case.
 *
 * @param {string} input - Text to search.
 * @param {string} word - Word to look for.
 * @returns {number} Index of the match, or -1 when there is none.
 */
function findTopLevelWord(input, word) {
  const lastStart = input.length - word.length;
  let curlyDepth = 0;
  let parenDepth = 0;
  for (let at = 0; at <= lastStart; at += 1) {
    // Unbalanced closers are clamped at zero rather than going negative.
    switch (input[at]) {
      case "{":
        curlyDepth += 1;
        break;
      case "}":
        curlyDepth = Math.max(0, curlyDepth - 1);
        break;
      case "(":
        parenDepth += 1;
        break;
      case ")":
        parenDepth = Math.max(0, parenDepth - 1);
        break;
      default:
        break;
    }
    const nested = curlyDepth !== 0 || parenDepth !== 0;
    if (nested) {
      continue;
    }
    const candidate = input.slice(at, at + word.length);
    if (candidate.toLowerCase() !== word) {
      continue;
    }
    const gluedToWord = isWordChar(input[at - 1]) || isWordChar(input[at + word.length]);
    if (!gluedToWord) {
      return at;
    }
  }
  return -1;
}
3652
/**
 * Reads the operand that ends just before index `end`, skipping whitespace.
 *
 * A trailing `}` or `)` selects the whole balanced group when its opener is
 * found. Otherwise the atom is the run of characters back to the nearest
 * whitespace or one of `+ - = < >`.
 *
 * @param {string} input - Text to read from.
 * @param {number} end - Index just past the region to inspect.
 * @returns {{start: number, atom: string} | null} The atom and its start
 *   index, or null when only whitespace precedes `end`.
 */
function readLeftAtom(input, end) {
  let last = end - 1;
  while (last >= 0 && /\s/.test(input[last])) {
    last -= 1;
  }
  if (last < 0) {
    return null;
  }
  const openerFor = new Map([["}", "{"], [")", "("]]);
  const closer = input[last];
  if (openerFor.has(closer)) {
    const groupStart = findMatchingLeft(input, last, openerFor.get(closer), closer);
    if (groupStart >= 0) {
      return { start: groupStart, atom: input.slice(groupStart, last + 1) };
    }
  }
  // Bare token: walk left until whitespace or an operator character.
  let cursor = last;
  while (cursor >= 0 && !/[\s+\-=<>]/.test(input[cursor])) {
    cursor -= 1;
  }
  const tokenStart = cursor + 1;
  return { start: tokenStart, atom: input.slice(tokenStart, last + 1) };
}
3668
/**
 * Reads the operand that begins at or after index `start`, skipping
 * whitespace.
 *
 * A leading `{` or `(` selects the whole balanced group when its closer is
 * found. Otherwise the atom is the run of characters up to the next
 * whitespace or one of `+ - = < >`.
 *
 * @param {string} input - Text to read from.
 * @param {number} start - Index at which to begin looking.
 * @returns {{end: number, atom: string} | null} The atom and the index just
 *   past it, or null when only whitespace follows `start`.
 */
function readRightAtom(input, start) {
  let first = start;
  while (first < input.length && /\s/.test(input[first])) {
    first += 1;
  }
  if (first >= input.length) {
    return null;
  }
  const closerFor = new Map([["{", "}"], ["(", ")"]]);
  const opener = input[first];
  if (closerFor.has(opener)) {
    const groupEnd = findMatchingRight(input, first, opener, closerFor.get(opener));
    if (groupEnd >= 0) {
      return { end: groupEnd + 1, atom: input.slice(first, groupEnd + 1) };
    }
  }
  // Bare token: walk right until whitespace or an operator character.
  let cursor = first;
  while (cursor < input.length && !/[\s+\-=<>]/.test(input[cursor])) {
    cursor += 1;
  }
  return { end: cursor, atom: input.slice(first, cursor) };
}
3684
/**
 * Scans leftwards from `closeIndex` for the opener that balances the closer
 * found there.
 *
 * @param {string} input - Text to scan.
 * @param {number} closeIndex - Index to start scanning from (inclusive).
 * @param {string} open - Opening delimiter character.
 * @param {string} close - Closing delimiter character.
 * @returns {number} Index of the balancing opener, or -1 if none is found.
 */
function findMatchingLeft(input, closeIndex, open, close) {
  let unmatched = 0;
  let at = closeIndex;
  while (at >= 0) {
    const ch = input[at];
    if (ch === close) {
      unmatched += 1;
    } else if (ch === open) {
      unmatched -= 1;
      if (unmatched === 0) {
        return at;
      }
    }
    at -= 1;
  }
  return -1;
}
3695
/**
 * Scans rightwards from `openIndex` for the closer that balances the opener
 * found there.
 *
 * @param {string} input - Text to scan.
 * @param {number} openIndex - Index to start scanning from (inclusive).
 * @param {string} open - Opening delimiter character.
 * @param {string} close - Closing delimiter character.
 * @returns {number} Index of the balancing closer, or -1 if none is found.
 */
function findMatchingRight(input, openIndex, open, close) {
  let unmatched = 0;
  let at = openIndex;
  while (at < input.length) {
    const ch = input[at];
    if (ch === open) {
      unmatched += 1;
    } else if (ch === close) {
      unmatched -= 1;
      if (unmatched === 0) {
        return at;
      }
    }
    at += 1;
  }
  return -1;
}
3706
/**
 * Trims the text and, when it starts with `{` and ends with `}`, removes that
 * outer pair.
 *
 * @param {string} input - Operand text, possibly wrapped in braces.
 * @returns {string} The trimmed text without its outer braces.
 */
function unwrapGroup(input) {
  const core = input.trim();
  const braced = core.startsWith("{") && core.endsWith("}");
  return braced ? core.slice(1, -1) : core;
}
3711
/**
 * Tells whether `ch` is a non-empty string containing an ASCII letter, digit
 * or underscore. Callers pass single characters (or undefined at a string
 * boundary).
 *
 * @param {string | undefined} ch - Character to test.
 * @returns {boolean} True for a word character, false otherwise.
 */
function isWordChar(ch) {
  if (!ch) {
    return false;
  }
  return /[A-Za-z0-9_]/.test(ch);
}
3714
+
3488
3715
  // src/hwpx/parser.ts
3489
3716
  init_page_range();
3490
3717
  init_logger();
@@ -4166,6 +4393,17 @@ function findDescendant(node, targetTag, depth = 0) {
4166
4393
  }
4167
4394
  return null;
4168
4395
  }
4396
/**
 * Returns the first direct element child whose tag name, with any namespace
 * prefix removed, equals `targetTag`.
 *
 * @param {object} node - Node whose `childNodes` are inspected.
 * @param {string} targetTag - Unprefixed tag name to match.
 * @returns {object | null} The matching child, or null when there is none.
 */
function findChildByLocalName(node, targetTag) {
  const kids = node.childNodes;
  if (!kids) {
    return null;
  }
  const ELEMENT_NODE = 1;
  let index = 0;
  while (index < kids.length) {
    const candidate = kids[index];
    index += 1;
    if (candidate.nodeType !== ELEMENT_NODE) {
      continue;
    }
    const qualifiedName = candidate.tagName || candidate.localName || "";
    // Strip a leading "prefix:" so "hp:script" matches "script".
    if (qualifiedName.replace(/^[^:]+:/, "") === targetTag) {
      return candidate;
    }
  }
  return null;
}
4169
4407
  function extractDrawTextBlocks(drawTextNode, blocks, styleMap, sectionNum) {
4170
4408
  const children = drawTextNode.childNodes;
4171
4409
  if (!children) return;
@@ -4268,6 +4506,22 @@ function extractParagraphInfo(para, styleMap) {
4268
4506
  case "shapeComment":
4269
4507
  case "drawText":
4270
4508
  break;
4509
+ // 수식: <hp:equation> 내부의 <hp:script>에 HML/HULK-style 수식 본문이
4510
+ // 들어있음. hwpEquationToLatex로 LaTeX 변환 후 `$...$`로 래핑하여
4511
+ // 본문 텍스트에 인라인 삽입. 변환 실패/빈 결과는 조용히 드롭
4512
+ // (대체 텍스트 "수식입니다." 누출 방지는 기존 정규식이 처리).
4513
+ case "equation": {
4514
+ const script = findChildByLocalName(child, "script");
4515
+ const raw = script ? extractTextFromNode(script) : "";
4516
+ if (raw.trim()) {
4517
+ try {
4518
+ const latex = hwpEquationToLatex(raw).trim();
4519
+ if (latex) text += " $" + latex.replace(/\$/g, "\\$") + "$ ";
4520
+ } catch {
4521
+ }
4522
+ }
4523
+ break;
4524
+ }
4271
4525
  // run 요소에서 charPrIDRef 추출
4272
4526
  case "r": {
4273
4527
  const runCharPr = child.getAttribute("charPrIDRef");
@@ -4334,8 +4588,13 @@ var TAG_CHAR_SHAPE = 68;
4334
4588
  var TAG_CTRL_HEADER = 71;
4335
4589
  var TAG_LIST_HEADER = 72;
4336
4590
  var TAG_TABLE = 77;
4337
- var TAG_DOC_CHAR_SHAPE = 55;
4338
- var TAG_DOC_STYLE = 58;
4591
+ var TAG_EQEDIT = 88;
4592
+ var HWPTAG_BEGIN = 16;
4593
+ var TAG_ID_MAPPINGS = HWPTAG_BEGIN + 1;
4594
+ var TAG_FACE_NAME = HWPTAG_BEGIN + 3;
4595
+ var TAG_DOC_CHAR_SHAPE = HWPTAG_BEGIN + 5;
4596
+ var TAG_DOC_PARA_SHAPE = HWPTAG_BEGIN + 9;
4597
+ var TAG_DOC_STYLE = HWPTAG_BEGIN + 10;
4339
4598
  var CHAR_LINE = 0;
4340
4599
  var CHAR_SECTION_BREAK = 10;
4341
4600
  var CHAR_PARA = 13;
@@ -4493,6 +4752,15 @@ function extractText(data) {
4493
4752
  }
4494
4753
  return result;
4495
4754
  }
4755
/**
 * Pulls the equation script out of a record payload.
 *
 * Reads a little-endian 16-bit length at byte offset 4, then decodes that
 * many UTF-16LE code units starting at byte offset 6. NUL characters are
 * removed and the result is trimmed.
 *
 * @param {Buffer} data - Record payload.
 * @returns {string | null} The script text, or null when the payload is too
 *   short, the declared length is zero or overruns the payload, or the
 *   decoded text is empty.
 */
function extractEquationText(data) {
  const LENGTH_OFFSET = 4;
  const SCRIPT_OFFSET = 6;
  if (data.length < SCRIPT_OFFSET) {
    return null;
  }
  const codeUnits = data.readUInt16LE(LENGTH_OFFSET);
  const scriptEnd = SCRIPT_OFFSET + codeUnits * 2;
  if (codeUnits <= 0 || scriptEnd > data.length) {
    return null;
  }
  const decoded = data.subarray(SCRIPT_OFFSET, scriptEnd).toString("utf16le");
  const cleaned = decoded.replace(/\0+/g, "").trim();
  return cleaned || null;
}
4496
4764
 
4497
4765
  // src/hwp5/aes.ts
4498
4766
  var S_BOX = new Uint8Array([
@@ -5652,6 +5920,26 @@ function findViewTextSectionsLenient(lcfb, compressed) {
5652
5920
  return sections.sort((a, b) => a.idx - b.idx).map((s) => s.content);
5653
5921
  }
5654
5922
  var TAG_SHAPE_COMPONENT = 74;
5923
// Control id of an equation control as this parser reads it from control headers.
var CTRL_ID_EQEDIT = "deqe";
/**
 * Tells whether a control id denotes an equation control. Both the "deqe"
 * spelling and its reverse, "eqed", are accepted.
 *
 * @param {string} ctrlId - Four-character control id.
 * @returns {boolean} True for an equation control id.
 */
function isEquationControlId(ctrlId) {
  const equationIds = [CTRL_ID_EQEDIT, "eqed"];
  return equationIds.includes(ctrlId);
}
5927
/**
 * Converts an equation script to LaTeX and wraps it in `$...$` for inline
 * Markdown math. Dollar signs inside the LaTeX are backslash-escaped.
 *
 * @param {string} equation - Raw equation script.
 * @returns {string} The `$`-delimited LaTeX, or "" when conversion yields
 *   nothing.
 */
function formatEquationForMarkdown(equation) {
  const latex = hwpEquationToLatex(equation);
  if (!latex) {
    return "";
  }
  const escaped = latex.replace(/\$/g, "\\$");
  return "$" + escaped + "$";
}
5932
/**
 * Looks for the equation record belonging to the control at `ctrlIdx` and
 * returns its script formatted for Markdown.
 *
 * Only the records following the control are examined (indices below
 * `ctrlIdx + 10`), and the scan stops at the first record whose level is not
 * deeper than the control's. The first equation record found decides the
 * result.
 *
 * @param {Array<object>} records - Parsed records with `level`, `tagId`, `data`.
 * @param {number} ctrlIdx - Index of the control header record.
 * @returns {string | null} Markdown math text, or null when no usable
 *   equation record is found.
 */
function extractEquationFromControl(records, ctrlIdx) {
  const parentLevel = records[ctrlIdx].level;
  const scanEnd = Math.min(records.length, ctrlIdx + 10);
  for (let idx = ctrlIdx + 1; idx < scanEnd; idx += 1) {
    const record = records[idx];
    if (record.level <= parentLevel) {
      break;
    }
    if (record.tagId === TAG_EQEDIT) {
      const script = extractEquationText(record.data);
      if (!script) {
        return null;
      }
      return formatEquationForMarkdown(script);
    }
  }
  return null;
}
5655
5943
  function extractBinDataId(records, ctrlIdx) {
5656
5944
  const ctrlLevel = records[ctrlIdx].level;
5657
5945
  for (let j = ctrlIdx + 1; j < records.length && j < ctrlIdx + 50; j++) {
@@ -5811,6 +6099,16 @@ function parseSection(records, docInfo, warnings, sectionNum) {
5811
6099
  }
5812
6100
  } else if (ctrlId === " elo" || ctrlId === "ole ") {
5813
6101
  warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
6102
+ } else if (isEquationControlId(ctrlId)) {
6103
+ const equation = extractEquationFromControl(records, i);
6104
+ if (equation) {
6105
+ const lastBlock = blocks[blocks.length - 1];
6106
+ if (lastBlock && lastBlock.type === "paragraph" && lastBlock.text) {
6107
+ lastBlock.text = lastBlock.text + " " + equation;
6108
+ } else {
6109
+ blocks.push({ type: "paragraph", text: equation, pageNumber: sectionNum });
6110
+ }
6111
+ }
5814
6112
  } else if (ctrlId === "fn " || ctrlId === " nf " || ctrlId === "en " || ctrlId === " ne ") {
5815
6113
  const noteText = extractNoteText(records, i);
5816
6114
  if (noteText && blocks.length > 0) {
@@ -5843,6 +6141,13 @@ function extractNoteText(records, ctrlIdx) {
5843
6141
  const t = extractText(r.data).trim();
5844
6142
  if (t) texts.push(t);
5845
6143
  }
6144
+ if (r.tagId === TAG_CTRL_HEADER && r.data.length >= 4) {
6145
+ const innerCtrlId = r.data.subarray(0, 4).toString("ascii");
6146
+ if (isEquationControlId(innerCtrlId)) {
6147
+ const equation = extractEquationFromControl(records, j);
6148
+ if (equation) texts.push(equation);
6149
+ }
6150
+ }
5846
6151
  }
5847
6152
  return texts.length > 0 ? texts.join(" ") : null;
5848
6153
  }
@@ -5856,6 +6161,13 @@ function extractTextBoxText(records, ctrlIdx) {
5856
6161
  const t = extractText(r.data).trim();
5857
6162
  if (t) texts.push(t);
5858
6163
  }
6164
+ if (r.tagId === TAG_CTRL_HEADER && r.data.length >= 4) {
6165
+ const innerCtrlId = r.data.subarray(0, 4).toString("ascii");
6166
+ if (isEquationControlId(innerCtrlId)) {
6167
+ const equation = extractEquationFromControl(records, j);
6168
+ if (equation) texts.push(equation);
6169
+ }
6170
+ }
5859
6171
  }
5860
6172
  return texts.length > 0 ? texts.join("\n") : null;
5861
6173
  }
@@ -5924,6 +6236,12 @@ function parseParagraphWithTables(records, startIdx) {
5924
6236
  i = nextIdx;
5925
6237
  continue;
5926
6238
  }
6239
+ if (isEquationControlId(ctrlId)) {
6240
+ const equation = extractEquationFromControl(records, i);
6241
+ if (equation) {
6242
+ text = text ? text + " " + equation : equation;
6243
+ }
6244
+ }
5927
6245
  }
5928
6246
  i++;
5929
6247
  }
@@ -11233,526 +11551,6 @@ async function markdownToXlsx(markdown, options) {
11233
11551
  return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
11234
11552
  }
11235
11553
 
11236
- // src/convert/index.ts
11237
- var import_promises3 = require("fs/promises");
11238
-
11239
- // src/convert/libreoffice.ts
11240
- var import_libreoffice_convert = __toESM(require("libreoffice-convert"), 1);
11241
-
11242
- // src/convert/error.ts
11243
- var ConvertError = class extends Error {
11244
- constructor(code, message) {
11245
- super(message);
11246
- this.code = code;
11247
- this.name = "ConvertError";
11248
- }
11249
- };
11250
-
11251
- // src/convert/installer.ts
11252
- var import_os3 = require("os");
11253
- var import_path5 = require("path");
11254
- var import_promises2 = require("fs/promises");
11255
- var import_fs4 = require("fs");
11256
- var import_child_process4 = require("child_process");
11257
- var installInFlight = null;
11258
- var CACHE_DIR = (0, import_path5.join)((0, import_os3.homedir)(), ".cache", "kordoc", "libreoffice");
11259
- var VERSION_FILE = (0, import_path5.join)(CACHE_DIR, "version");
11260
- var PACKAGES = {
11261
- darwin: {
11262
- url: "https://ftp.osuosl.org/pub/tdf/libreoffice/stable/26.2.3/mac/x86_64/LibreOffice_26.2.3_MacOS_x86-64.dmg",
11263
- binPath: "LibreOffice.app/Contents/MacOS/soffice",
11264
- sizeMb: 300
11265
- },
11266
- linux: {
11267
- url: "https://ftp.osuosl.org/pub/tdf/libreoffice/stable/26.2.3/deb/x86_64/LibreOffice_26.2.3_Linux_x86-64_deb.tar.gz",
11268
- binPath: "opt/libreoffice26.2/program/soffice",
11269
- sizeMb: 210
11270
- },
11271
- win32: {
11272
- url: "https://ftp.osuosl.org/pub/tdf/libreoffice/stable/26.2.3/win/x86_64/LibreOffice_26.2.3_Win_x86-64.msi",
11273
- binPath: "LibreOffice/program/soffice.exe",
11274
- sizeMb: 360
11275
- }
11276
- };
11277
/**
 * Probes for a `soffice` executable on PATH by running `soffice --version`.
 *
 * @returns {Promise<string | null>} "soffice" when the probe exits with code
 *   0, otherwise null (including when the process cannot be spawned).
 */
async function findInPath() {
  return new Promise((settle) => {
    const probe = (0, import_child_process4.spawn)("soffice", ["--version"], { stdio: "ignore" });
    probe.on("close", (exitCode) => {
      settle(exitCode === 0 ? "soffice" : null);
    });
    probe.on("error", () => {
      settle(null);
    });
  });
}
11284
/**
 * Checks whether a `soffice` entry exists under the cache directory's `bin`
 * folder.
 *
 * @returns {Promise<string | null>} The cached path when it is accessible,
 *   otherwise null.
 */
async function findInCache() {
  const candidate = (0, import_path5.join)(CACHE_DIR, "bin", "soffice");
  return (0, import_promises2.access)(candidate).then(
    () => candidate,
    () => null
  );
}
11293
/**
 * Looks for LibreOffice in the conventional install locations for the current
 * platform and returns the first one that is accessible.
 *
 * @returns {Promise<string | null>} Path to `soffice`, or null when none of
 *   the candidates exists (or the platform has no candidates).
 */
async function findInDefaultPaths() {
  let candidates;
  switch (process.platform) {
    case "darwin":
      candidates = [
        "/Applications/LibreOffice.app/Contents/MacOS/soffice",
        "/opt/homebrew/bin/soffice",
        "/usr/local/bin/soffice"
      ];
      break;
    case "linux":
      candidates = [
        "/usr/bin/soffice",
        "/usr/lib/libreoffice/program/soffice"
      ];
      break;
    case "win32": {
      const programFiles = process.env["ProgramFiles"] ?? "C:\\Program Files";
      const programFilesX86 = process.env["ProgramFiles(x86)"] ?? "C:\\Program Files (x86)";
      candidates = [programFiles, programFilesX86].map(
        (root) => (0, import_path5.join)(root, "LibreOffice", "program", "soffice.exe")
      );
      break;
    }
    default:
      candidates = [];
  }
  for (const candidate of candidates) {
    try {
      await (0, import_promises2.access)(candidate);
      return candidate;
    } catch {
      // Not present here; try the next location.
    }
  }
  return null;
}
11325
/**
 * Streams `url` into the file at `dest`, reporting progress after each chunk.
 *
 * Write-stream failures are now observed. Previously the stream had no
 * `error` listener, so a disk error surfaced as an unhandled `error` event
 * and a pending `drain` wait could hang forever.
 *
 * @param {string} url - Resource to download.
 * @param {string} dest - Destination file path.
 * @param {number} totalBytes - Expected size, passed through to `onProgress`.
 * @param {(downloaded: number, total: number) => void} [onProgress] - Called
 *   after every chunk with the running byte count.
 * @returns {Promise<void>}
 * @throws {Error} When the HTTP response is not OK, has no body, or reading
 *   or writing fails.
 */
async function downloadWithProgress(url, dest, totalBytes, onProgress) {
  const response = await fetch(url);
  if (!response.ok) throw new Error(`\uB2E4\uC6B4\uB85C\uB4DC \uC2E4\uD328: HTTP ${response.status} (${url})`);
  if (!response.body) throw new Error("\uB2E4\uC6B4\uB85C\uB4DC \uC2E4\uD328: response body \uC5C6\uC74C");
  const file = (0, import_fs4.createWriteStream)(dest);
  // Remember the first write failure so the loop and cleanup can react to it.
  let writeError = null;
  file.on("error", (err) => {
    if (!writeError) writeError = err;
  });
  const reader = response.body.getReader();
  let downloaded = 0;
  try {
    while (true) {
      if (writeError) throw writeError;
      const { done, value } = await reader.read();
      if (done) break;
      if (!file.write(value)) {
        // Back-pressure: wait for drain, but give up if the stream errors.
        await new Promise((resolve4, reject) => {
          const onDrain = () => {
            file.off("error", onError);
            resolve4();
          };
          const onError = (err) => {
            file.off("drain", onDrain);
            reject(err);
          };
          file.once("drain", onDrain);
          file.once("error", onError);
        });
      }
      downloaded += value.length;
      onProgress?.(downloaded, totalBytes);
    }
  } catch (err) {
    // Stop the transfer; a failure to cancel must not mask the real error.
    await reader.cancel().catch(() => {
    });
    throw err;
  } finally {
    reader.releaseLock();
    // Skip end() after a write error so the original error is the one that surfaces.
    if (!writeError) {
      await new Promise((resolve4, reject) => {
        file.end((err) => err ? reject(err) : resolve4());
      });
    }
  }
}
11349
/**
 * Downloads the package into the cache directory and hands it to the
 * installer for the current platform.
 *
 * If the installer fails, the downloaded file is removed and the error is
 * rethrown.
 *
 * @param {{url: string, binPath: string, sizeMb: number}} pkg - Package descriptor.
 * @param {(downloaded: number, total: number) => void} [onProgress] - Download
 *   progress callback.
 * @returns {Promise<string>} Path to the installed `soffice` binary.
 * @throws {ConvertError} With code "UNSUPPORTED_PLATFORM" when no installer
 *   exists for the platform.
 */
async function installForPlatform(pkg, onProgress) {
  const { platform } = process;
  await (0, import_promises2.mkdir)(CACHE_DIR, { recursive: true });
  const downloadPath = (0, import_path5.join)(CACHE_DIR, `download-${Date.now()}`);
  const expectedBytes = pkg.sizeMb * 1024 * 1024;
  await downloadWithProgress(pkg.url, downloadPath, expectedBytes, onProgress);
  const installers = new Map([
    ["darwin", installMacOS],
    ["linux", installLinux],
    ["win32", installWindows]
  ]);
  const install = installers.get(platform);
  if (install) {
    try {
      return await install(pkg, downloadPath);
    } catch (err) {
      await (0, import_promises2.rm)(downloadPath, { force: true });
      throw err;
    }
  }
  throw new ConvertError("UNSUPPORTED_PLATFORM", `${platform}\uC740 \uC790\uB3D9 \uC124\uCE58\uB97C \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4`);
}
11368
/**
 * Installs LibreOffice on macOS: mounts the downloaded disk image, copies
 * `LibreOffice.app` into the cache directory, detaches the image, deletes the
 * download and links the binary.
 *
 * Every spawned child now has an `error` listener. Without one, a command
 * that cannot be started emitted an unhandled `error` event and left the
 * promise pending forever.
 *
 * @param {{binPath: string}} pkg - Package descriptor.
 * @param {string} downloadPath - Path of the downloaded disk image.
 * @returns {Promise<string>} Path to the linked `soffice` binary.
 * @throws {Error} When mounting or copying fails.
 */
async function installMacOS(pkg, downloadPath) {
  const mountPoint = `/Volumes/LibreOffice_${Date.now()}`;
  await new Promise((resolve4, reject) => {
    const stderr = [];
    const child = (0, import_child_process4.spawn)("hdiutil", ["attach", "-nobrowse", "-noverify", "-mountpoint", mountPoint, downloadPath]);
    child.stderr?.on("data", (d) => stderr.push(d.toString()));
    child.on("error", reject);
    child.on(
      "close",
      (code) => code === 0 ? resolve4() : reject(new Error(`dmg \uB9C8\uC6B4\uD2B8 \uC2E4\uD328 (code=${code}): ${stderr.join("").trim()}`))
    );
  });
  try {
    const appSource = (0, import_path5.join)(mountPoint, "LibreOffice.app");
    const appDest = (0, import_path5.join)(CACHE_DIR, "LibreOffice.app");
    await new Promise((resolve4, reject) => {
      const child = (0, import_child_process4.spawn)("cp", ["-R", appSource, appDest]);
      child.on("error", reject);
      child.on("close", (code) => code === 0 ? resolve4() : reject(new Error(".app \uBCF5\uC0AC \uC2E4\uD328")));
    });
  } finally {
    // Detach is best-effort: its outcome never changes the install result.
    await new Promise((resolve4) => {
      const child = (0, import_child_process4.spawn)("hdiutil", ["detach", mountPoint]);
      child.on("error", () => resolve4());
      child.on("close", () => resolve4());
    });
  }
  await (0, import_promises2.rm)(downloadPath, { force: true });
  return await createSymlink((0, import_path5.join)(CACHE_DIR, pkg.binPath));
}
11395
/**
 * Installs LibreOffice on Linux: unpacks the downloaded tarball, extracts
 * every `.deb` found in its `DEBS` directory into the cache directory, cleans
 * up and links the binary.
 *
 * Every spawned child now has an `error` listener, so a missing `tar` or
 * `dpkg-deb` rejects instead of raising an unhandled `error` event and
 * hanging. The directory listing uses the already-imported fs/promises
 * module instead of a dynamic import.
 *
 * @param {{binPath: string}} pkg - Package descriptor.
 * @param {string} downloadPath - Path of the downloaded tarball.
 * @returns {Promise<string>} Path to the linked `soffice` binary.
 * @throws {Error} When the tarball cannot be unpacked.
 */
async function installLinux(pkg, downloadPath) {
  const extractDir = (0, import_path5.join)(CACHE_DIR, `extract-${Date.now()}`);
  await (0, import_promises2.mkdir)(extractDir, { recursive: true });
  await new Promise((resolve4, reject) => {
    const child = (0, import_child_process4.spawn)("tar", ["xzf", downloadPath, "-C", extractDir]);
    child.on("error", reject);
    child.on("close", (code) => code === 0 ? resolve4() : reject(new Error("\uC555\uCD95 \uD574\uC81C \uC2E4\uD328")));
  });
  const debsDir = (0, import_path5.join)(extractDir, "DEBS");
  try {
    await (0, import_promises2.access)(debsDir);
    const entries = await (0, import_promises2.readdir)(debsDir);
    for (const entry of entries) {
      if (entry.endsWith(".deb")) {
        await new Promise((resolve4, reject) => {
          const child = (0, import_child_process4.spawn)("dpkg-deb", ["-x", (0, import_path5.join)(debsDir, entry), CACHE_DIR]);
          child.on("error", reject);
          child.on("close", (code) => code === 0 ? resolve4() : reject(new Error(`${entry} \uCD94\uCD9C \uC2E4\uD328`)));
        });
      }
    }
  } catch {
    // NOTE(review): extraction failures are swallowed here as in the original,
    // so a failed extraction still proceeds to link a possibly missing binary.
    // Confirm this best-effort behaviour is intended.
  }
  await (0, import_promises2.rm)(downloadPath, { force: true });
  await (0, import_promises2.rm)(extractDir, { recursive: true, force: true });
  return await createSymlink((0, import_path5.join)(CACHE_DIR, pkg.binPath));
}
11420
/**
 * Installs LibreOffice on Windows by running `msiexec /a` on the downloaded
 * MSI with the cache directory as TARGETDIR, then deletes the download.
 *
 * The spawned child now has an `error` listener, so a failure to start
 * `msiexec` rejects instead of raising an unhandled `error` event and
 * hanging.
 *
 * @param {{binPath: string}} pkg - Package descriptor.
 * @param {string} downloadPath - Path of the downloaded MSI.
 * @returns {Promise<string>} Path to `soffice.exe` inside the cache directory.
 * @throws {Error} When `msiexec` cannot be started or exits non-zero.
 */
async function installWindows(pkg, downloadPath) {
  await new Promise((resolve4, reject) => {
    const child = (0, import_child_process4.spawn)("msiexec", ["/a", downloadPath, "/qn", `TARGETDIR=${CACHE_DIR}`]);
    child.on("error", reject);
    child.on("close", (code) => code === 0 ? resolve4() : reject(new Error("MSI \uC124\uCE58 \uC2E4\uD328")));
  });
  await (0, import_promises2.rm)(downloadPath, { force: true });
  return (0, import_path5.join)(CACHE_DIR, pkg.binPath);
}
11428
/**
 * Creates `<cache>/bin/soffice` as a symlink to `actualBin` and prepends that
 * `bin` directory to `process.env.PATH`. A failure to create the link is
 * ignored.
 *
 * @param {string} actualBin - Path of the real `soffice` binary.
 * @returns {Promise<string>} Path of the symlink.
 */
async function createSymlink(actualBin) {
  const binDir = (0, import_path5.join)(CACHE_DIR, "bin");
  await (0, import_promises2.mkdir)(binDir, { recursive: true });
  const linkBin = (0, import_path5.join)(binDir, "soffice");
  await (0, import_promises2.symlink)(actualBin, linkBin).catch(() => {
  });
  const previousPath = process.env.PATH;
  process.env.PATH = `${binDir}${import_path5.delimiter}${previousPath}`;
  return linkBin;
}
11439
/**
 * Installs LibreOffice for the current platform using the matching entry in
 * PACKAGES.
 *
 * @param {(downloaded: number, total: number) => void} [onProgress] - Download
 *   progress callback.
 * @returns {Promise<string>} Path to the installed `soffice` binary.
 * @throws {ConvertError} With code "UNSUPPORTED_PLATFORM" when PACKAGES has
 *   no entry for the platform.
 */
async function installLibreOffice(onProgress) {
  const { platform } = process;
  const pkg = PACKAGES[platform];
  if (pkg) {
    return await installForPlatform(pkg, onProgress);
  }
  throw new ConvertError(
    "UNSUPPORTED_PLATFORM",
    `${platform}\uC740 \uC790\uB3D9 \uC124\uCE58\uB97C \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4. \uC218\uB3D9\uC73C\uB85C LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.`
  );
}
11450
- async function resolveSoffice(emitter, autoInstall = true) {
11451
- emitter.validate("soffice_check", "LibreOffice \uAC00\uC6A9\uC131 \uD655\uC778 \uC911...");
11452
- const inPath = await findInPath();
11453
- if (inPath) {
11454
- emitter.validate("soffice_found", "\uC2DC\uC2A4\uD15C PATH\uC5D0\uC11C LibreOffice \uBC1C\uACAC", { sofficePath: inPath });
11455
- return inPath;
11456
- }
11457
- const inCache = await findInCache();
11458
- if (inCache) {
11459
- emitter.validate("soffice_found", "\uCE90\uC2DC\uB41C LibreOffice \uBC1C\uACAC", { sofficePath: inCache });
11460
- return inCache;
11461
- }
11462
- const inDefault = await findInDefaultPaths();
11463
- if (inDefault) {
11464
- emitter.validate("soffice_found", "\uAE30\uBCF8 \uACBD\uB85C\uC5D0\uC11C LibreOffice \uBC1C\uACAC", { sofficePath: inDefault });
11465
- return inDefault;
11466
- }
11467
- if (!autoInstall) {
11468
- emitter.error(
11469
- "validate",
11470
- "SOFFICE_NOT_FOUND",
11471
- "LibreOffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4",
11472
- "\uC218\uB3D9\uC73C\uB85C \uC124\uCE58\uD558\uAC70\uB098 autoInstallLibreOffice: true \uC635\uC158\uC744 \uC0AC\uC6A9\uD558\uC138\uC694."
11473
- );
11474
- throw new ConvertError("SOFFICE_NOT_FOUND", "LibreOffice\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4");
11475
- }
11476
- if (installInFlight) {
11477
- return installInFlight;
11478
- }
11479
- emitter.install("install_start", "LibreOffice \uC790\uB3D9 \uC124\uCE58\uB97C \uC2DC\uC791\uD569\uB2C8\uB2E4...");
11480
- installInFlight = (async () => {
11481
- try {
11482
- const installed = await installLibreOffice((downloaded, total) => {
11483
- const percent = Math.round(downloaded / total * 100);
11484
- emitter.install("download_progress", `\uB2E4\uC6B4\uB85C\uB4DC \uC911... ${percent}%`, {
11485
- percent,
11486
- downloadedBytes: downloaded,
11487
- totalBytes: total
11488
- });
11489
- });
11490
- emitter.install("install_complete", "\uC124\uCE58 \uC644\uB8CC", { installedPath: installed });
11491
- return installed;
11492
- } catch (err) {
11493
- const errorMsg = err instanceof Error ? err.message : String(err);
11494
- emitter.install("install_failed", "\uC124\uCE58 \uC2E4\uD328", { error: errorMsg });
11495
- throw err;
11496
- } finally {
11497
- installInFlight = null;
11498
- }
11499
- })();
11500
- return installInFlight;
11501
- }
11502
-
11503
- // src/convert/libreoffice.ts
11504
- var libreConvert = import_libreoffice_convert.default.convert;
11505
- var libreConvertWithOptions = import_libreoffice_convert.default.convertWithOptions;
11506
- async function convertBuffer(buffer, targetExt, timeoutMs = 6e4, sofficePath) {
11507
- return new Promise((resolve4, reject) => {
11508
- const timer = setTimeout(() => {
11509
- reject(
11510
- new ConvertError("TIMEOUT", `\uBCC0\uD658 \uD0C0\uC784\uC544\uC6C3 (${timeoutMs}ms \uCD08\uACFC)`)
11511
- );
11512
- }, timeoutMs);
11513
- const cb = (err, done) => {
11514
- clearTimeout(timer);
11515
- if (err || !done) {
11516
- reject(
11517
- new ConvertError(
11518
- "CONVERT_FAILED",
11519
- err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"
11520
- )
11521
- );
11522
- return;
11523
- }
11524
- resolve4(done);
11525
- };
11526
- if (sofficePath) {
11527
- libreConvertWithOptions(buffer, targetExt, void 0, { sofficeBinaryPaths: [sofficePath] }, cb);
11528
- } else {
11529
- libreConvert(buffer, targetExt, void 0, cb);
11530
- }
11531
- });
11532
- }
11533
-
11534
- // src/convert/events.ts
11535
- var ConvertEventEmitter = class {
11536
- listener = null;
11537
- /** 이벤트 리스너 등록 */
11538
- setListener(listener) {
11539
- this.listener = listener;
11540
- }
11541
- /** 이벤트 발송 */
11542
- emit(event) {
11543
- try {
11544
- this.listener?.(event);
11545
- } catch {
11546
- }
11547
- }
11548
- /** 타입 안전한 헬퍼: detect 이벤트 */
11549
- detect(stage, message, meta) {
11550
- this.emit({ type: "detect", stage, message, ...meta });
11551
- }
11552
- /** 타입 안전한 헬퍼: validate 이벤트 */
11553
- validate(stage, message, meta) {
11554
- this.emit({ type: "validate", stage, message, ...meta });
11555
- }
11556
- /** 타입 안전한 헬퍼: install 이벤트 */
11557
- install(stage, message, meta) {
11558
- this.emit({ type: "install", stage, message, ...meta });
11559
- }
11560
- /** 타입 안전한 헬퍼: convert 진행 이벤트 */
11561
- progress(percent, message) {
11562
- this.emit({ type: "convert", stage: "convert_progress", message, percent });
11563
- }
11564
- /** 타입 안전한 헬퍼: convert 시작 */
11565
- convertStart(message) {
11566
- this.emit({ type: "convert", stage: "convert_start", message, percent: 0 });
11567
- }
11568
- /** 타입 안전한 헬퍼: convert 완료 */
11569
- convertDone(message) {
11570
- this.emit({ type: "convert", stage: "convert_done", message, percent: 100 });
11571
- }
11572
- /** 타입 안전한 헬퍼: 완료 이벤트 */
11573
- complete(result) {
11574
- this.emit({ type: "complete", stage: "success", message: "\uBCC0\uD658 \uC644\uB8CC", result });
11575
- }
11576
- /** 타입 안전한 헬퍼: 에러 이벤트 */
11577
- error(stage, code, message, suggestion) {
11578
- this.emit({ type: "error", stage, code, message, recoverable: true, suggestion });
11579
- }
11580
- };
11581
-
11582
- // src/convert/index.ts
11583
- var isConverting = false;
11584
- var queue = [];
11585
- async function acquireConvertLock() {
11586
- if (!isConverting) {
11587
- isConverting = true;
11588
- return () => {
11589
- isConverting = false;
11590
- const next = queue.shift();
11591
- next?.();
11592
- };
11593
- }
11594
- return new Promise((resolve4) => {
11595
- queue.push(() => {
11596
- isConverting = true;
11597
- resolve4(() => {
11598
- isConverting = false;
11599
- const next = queue.shift();
11600
- next?.();
11601
- });
11602
- });
11603
- });
11604
- }
11605
- async function convertToPdf(input, options) {
11606
- const emitter = new ConvertEventEmitter();
11607
- if (options?.onEvent) {
11608
- emitter.setListener(options.onEvent);
11609
- }
11610
- if (options?.onProgress) {
11611
- const legacyProgress = options.onProgress;
11612
- emitter.setListener((event) => {
11613
- if (event.type === "convert" && event.stage === "convert_progress") {
11614
- legacyProgress(event.percent, event.message);
11615
- }
11616
- });
11617
- }
11618
- try {
11619
- emitter.detect("reading", "\uC785\uB825 \uD30C\uC77C \uC77D\uB294 \uC911...");
11620
- let buffer;
11621
- try {
11622
- if (typeof input === "string") {
11623
- buffer = await (0, import_promises3.readFile)(input);
11624
- } else if (Buffer.isBuffer(input)) {
11625
- buffer = input;
11626
- } else {
11627
- buffer = Buffer.from(input);
11628
- }
11629
- } catch (err) {
11630
- emitter.error(
11631
- "detect",
11632
- "PARSE_ERROR",
11633
- `\uC785\uB825 \uC77D\uAE30 \uC2E4\uD328: ${err instanceof Error ? err.message : String(err)}`
11634
- );
11635
- return {
11636
- success: false,
11637
- code: "PARSE_ERROR",
11638
- error: `\uC785\uB825 \uC77D\uAE30 \uC2E4\uD328: ${err instanceof Error ? err.message : String(err)}`,
11639
- stage: "detect"
11640
- };
11641
- }
11642
- const MAX_FILE_SIZE = 500 * 1024 * 1024;
11643
- if (buffer.length > MAX_FILE_SIZE) {
11644
- emitter.error(
11645
- "detect",
11646
- "FILE_TOO_LARGE",
11647
- `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.length / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`
11648
- );
11649
- return {
11650
- success: false,
11651
- code: "FILE_TOO_LARGE",
11652
- error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.length / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`,
11653
- stage: "detect"
11654
- };
11655
- }
11656
- const format = detectFormat(toArrayBuffer(buffer));
11657
- emitter.detect("format_detected", `\uD3EC\uB9F7 \uAC10\uC9C0 \uC644\uB8CC: ${format}`, { format });
11658
- if (format !== "hwp" && format !== "hwpx") {
11659
- emitter.error("detect", "UNSUPPORTED_FORMAT", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD3EC\uB9F7\uC785\uB2C8\uB2E4: ${format}`);
11660
- return {
11661
- success: false,
11662
- code: "UNSUPPORTED_FORMAT",
11663
- error: `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD3EC\uB9F7\uC785\uB2C8\uB2E4: ${format}`,
11664
- stage: "detect"
11665
- };
11666
- }
11667
- emitter.validate("soffice_check", "LibreOffice \uAC00\uC6A9\uC131 \uD655\uC778 \uC911...");
11668
- let sofficePath;
11669
- try {
11670
- sofficePath = await resolveSoffice(emitter, options?.autoInstallLibreOffice ?? true);
11671
- } catch (err) {
11672
- if (err instanceof ConvertError) {
11673
- return {
11674
- success: false,
11675
- code: err.code,
11676
- error: err.message,
11677
- stage: "validate"
11678
- };
11679
- }
11680
- throw err;
11681
- }
11682
- const releaseLock = await acquireConvertLock();
11683
- try {
11684
- emitter.convertStart("\uBCC0\uD658 \uC2DC\uC791...");
11685
- emitter.progress(10, "\uBCC0\uD658 \uC911...");
11686
- const pdf = await convertBuffer(buffer, ".pdf", options?.timeoutMs, sofficePath);
11687
- emitter.progress(100, "\uBCC0\uD658 \uC644\uB8CC");
11688
- emitter.convertDone("\uBCC0\uD658 \uC644\uB8CC");
11689
- const result = {
11690
- success: true,
11691
- pdf: new Uint8Array(pdf),
11692
- sourceFormat: format
11693
- };
11694
- emitter.complete({
11695
- sourceFormat: format,
11696
- pdfSize: pdf.length
11697
- });
11698
- return result;
11699
- } catch (err) {
11700
- if (err instanceof ConvertError) {
11701
- emitter.error("convert", err.code, err.message);
11702
- return {
11703
- success: false,
11704
- code: err.code,
11705
- error: err.message,
11706
- stage: "convert"
11707
- };
11708
- }
11709
- const errorMsg = err instanceof Error ? err.message : "\uBCC0\uD658 \uC2E4\uD328";
11710
- emitter.error("convert", classifyError(err), errorMsg);
11711
- return {
11712
- success: false,
11713
- code: classifyError(err),
11714
- error: errorMsg,
11715
- stage: "convert"
11716
- };
11717
- } finally {
11718
- releaseLock();
11719
- }
11720
- } catch (unexpectedErr) {
11721
- const errorMsg = unexpectedErr instanceof Error ? unexpectedErr.message : "\uC608\uC0C1\uCE58 \uBABB\uD55C \uC624\uB958";
11722
- emitter.error("convert", "PARSE_ERROR", errorMsg);
11723
- return {
11724
- success: false,
11725
- code: "PARSE_ERROR",
11726
- error: errorMsg,
11727
- stage: "convert"
11728
- };
11729
- }
11730
- }
11731
- async function convertHwpToPdf(input, options) {
11732
- const result = await convertToPdf(input, options);
11733
- if (result.success && result.sourceFormat !== "hwp") {
11734
- return {
11735
- success: false,
11736
- code: "UNSUPPORTED_FORMAT",
11737
- error: `HWP 5.x \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${result.sourceFormat}`,
11738
- stage: "detect"
11739
- };
11740
- }
11741
- return result;
11742
- }
11743
- async function convertHwpxToPdf(input, options) {
11744
- const result = await convertToPdf(input, options);
11745
- if (result.success && result.sourceFormat !== "hwpx") {
11746
- return {
11747
- success: false,
11748
- code: "UNSUPPORTED_FORMAT",
11749
- error: `HWPX \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${result.sourceFormat}`,
11750
- stage: "detect"
11751
- };
11752
- }
11753
- return result;
11754
- }
11755
-
11756
11554
  // src/ocr/api-key-rotation.ts
11757
11555
  var AllKeysCoolingDownError = class extends Error {
11758
11556
  waitMs;
@@ -11847,9 +11645,9 @@ var ApiKeyRotationPool = class _ApiKeyRotationPool {
11847
11645
  };
11848
11646
 
11849
11647
  // src/pipeline/unified-ocr.ts
11850
- var import_promises4 = require("fs/promises");
11851
- var import_path6 = require("path");
11852
- var import_child_process5 = require("child_process");
11648
+ var import_promises2 = require("fs/promises");
11649
+ var import_path5 = require("path");
11650
+ var import_child_process4 = require("child_process");
11853
11651
  var import_node_perf_hooks = require("perf_hooks");
11854
11652
  init_logger();
11855
11653
 
@@ -11983,15 +11781,15 @@ function elapsedMs(startAt) {
11983
11781
  return Math.round(import_node_perf_hooks.performance.now() - startAt);
11984
11782
  }
11985
11783
  async function runUnifiedOcrPipeline(inputPath, options = {}) {
11986
- const absInput = (0, import_path6.resolve)(inputPath);
11987
- const stem = (0, import_path6.basename)(absInput, (0, import_path6.extname)(absInput));
11988
- const workspaceDir = (0, import_path6.resolve)(options.workspaceDir ?? (0, import_path6.join)((0, import_path6.dirname)(absInput), `${stem}_ocr_workspace`));
11989
- const imagesDir = (0, import_path6.join)(workspaceDir, "images");
11990
- const rawDir = (0, import_path6.join)(workspaceDir, "ocr", "raw");
11991
- const diffDir = (0, import_path6.join)(workspaceDir, "ocr", "diff");
11992
- const outputPath = (0, import_path6.resolve)(options.outputPath ?? (0, import_path6.join)((0, import_path6.dirname)(absInput), `${stem}.md`));
11993
- const reportPath = (0, import_path6.join)(workspaceDir, "run-report.json");
11994
- const modelCachePath = (0, import_path6.join)((0, import_path6.dirname)(absInput), ".kordoc-model-cache.json");
11784
+ const absInput = (0, import_path5.resolve)(inputPath);
11785
+ const stem = (0, import_path5.basename)(absInput, (0, import_path5.extname)(absInput));
11786
+ const workspaceDir = (0, import_path5.resolve)(options.workspaceDir ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}_ocr_workspace`));
11787
+ const imagesDir = (0, import_path5.join)(workspaceDir, "images");
11788
+ const rawDir = (0, import_path5.join)(workspaceDir, "ocr", "raw");
11789
+ const diffDir = (0, import_path5.join)(workspaceDir, "ocr", "diff");
11790
+ const outputPath = (0, import_path5.resolve)(options.outputPath ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}.md`));
11791
+ const reportPath = (0, import_path5.join)(workspaceDir, "run-report.json");
11792
+ const modelCachePath = (0, import_path5.join)((0, import_path5.dirname)(absInput), ".kordoc-model-cache.json");
11995
11793
  const baseUrl = options.baseUrl ?? "https://integrate.api.nvidia.com/v1/chat/completions";
11996
11794
  const timeoutMs = options.timeoutMs ?? 6e4;
11997
11795
  const maxRetriesPerPage = options.maxRetriesPerPage ?? 5;
@@ -12002,12 +11800,11 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
12002
11800
  const models = sortModelsByCache(modelsInput, modelCache);
12003
11801
  const modelMaxTokens = { ...DEFAULT_MODEL_MAX_TOKENS, ...options.modelMaxTokens ?? {} };
12004
11802
  const stageWeights = normalizeWeights({ ...DEFAULT_STAGE_WEIGHTS, ...options.stageWeights ?? {} });
12005
- const keyPool = ApiKeyRotationPool.fromEnv();
12006
11803
  const runId = options.runId ?? generateRunId("ocr");
12007
11804
  const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
12008
- await (0, import_promises4.mkdir)(imagesDir, { recursive: true });
12009
- await (0, import_promises4.mkdir)(rawDir, { recursive: true });
12010
- await (0, import_promises4.mkdir)(diffDir, { recursive: true });
11805
+ await (0, import_promises2.mkdir)(imagesDir, { recursive: true });
11806
+ await (0, import_promises2.mkdir)(rawDir, { recursive: true });
11807
+ await (0, import_promises2.mkdir)(diffDir, { recursive: true });
12011
11808
  const timingsMs = {};
12012
11809
  const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
12013
11810
  const markStageProgress = (stage, stagePercent, current, total, message, model) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message, model });
@@ -12018,51 +11815,57 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
12018
11815
  };
12019
11816
  try {
12020
11817
  ensureSupportedInput(absInput);
12021
- let workingPdfPath = absInput;
12022
11818
  const convertStart = import_node_perf_hooks.performance.now();
12023
11819
  currentStage = "convert";
12024
- markStageStart("convert", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC911");
12025
- logStage("info", "convert", "start", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC2DC\uC791", { input: absInput });
12026
- if ((0, import_path6.extname)(absInput).toLowerCase() !== ".pdf") {
12027
- const convertEmitter = new ConvertEventEmitter();
12028
- if (options.onEvent) {
12029
- convertEmitter.setListener((evt) => {
12030
- if (evt.type === "install" || evt.type === "validate" || evt.type === "error") {
12031
- try {
12032
- ;
12033
- options.onEvent(evt);
12034
- } catch {
12035
- }
12036
- }
12037
- });
12038
- }
12039
- let resolvedSofficePath;
12040
- if (options.sofficePath) {
12041
- const sofficeDir = (0, import_path6.dirname)(options.sofficePath);
12042
- process.env.PATH = `${sofficeDir}${import_path6.delimiter}${process.env.PATH ?? ""}`;
12043
- convertEmitter.validate("soffice_found", "\uC9C1\uC811 \uC9C0\uC815\uB41C LibreOffice \uACBD\uB85C \uC0AC\uC6A9", { sofficePath: options.sofficePath });
12044
- resolvedSofficePath = options.sofficePath;
12045
- } else {
12046
- resolvedSofficePath = await resolveSoffice(convertEmitter, options.autoInstallLibreOffice ?? false);
12047
- }
12048
- workingPdfPath = (0, import_path6.join)(workspaceDir, `${stem}.pdf`);
12049
- const inputBuffer = await (0, import_promises4.readFile)(absInput);
12050
- const out = await convertBuffer(inputBuffer, ".pdf", 5 * 6e4, resolvedSofficePath);
12051
- await (0, import_promises4.writeFile)(workingPdfPath, out);
11820
+ if ((0, import_path5.extname)(absInput).toLowerCase() !== ".pdf") {
11821
+ markStageStart("convert", "\uC790\uCCB4 \uD30C\uC11C\uB85C Markdown \uBCC0\uD658 \uC911");
11822
+ logStage("info", "convert", "start", "\uC790\uCCB4 \uD30C\uC11C \uBCC0\uD658 \uC2DC\uC791", { input: absInput });
11823
+ const inputBuffer = await (0, import_promises2.readFile)(absInput);
11824
+ const parsed = await parseNativeDocument(inputBuffer);
11825
+ timingsMs.convert = elapsedMs(convertStart);
11826
+ markStageDone("convert", "\uC790\uCCB4 \uD30C\uC11C \uBCC0\uD658 \uC644\uB8CC");
11827
+ logStage("info", "convert", "done", "\uC790\uCCB4 \uD30C\uC11C \uBCC0\uD658 \uC644\uB8CC", { format: parsed.fileType, elapsedMs: timingsMs.convert });
11828
+ const mergeStart2 = import_node_perf_hooks.performance.now();
11829
+ currentStage = "merge";
11830
+ markStageStart("merge", "Markdown \uC800\uC7A5 \uC911");
11831
+ await (0, import_promises2.writeFile)(outputPath, parsed.markdown, "utf-8");
11832
+ timingsMs.merge = elapsedMs(mergeStart2);
11833
+ markStageDone("merge", "Markdown \uC800\uC7A5 \uC644\uB8CC");
11834
+ logStage("info", "merge", "done", "Markdown \uC800\uC7A5 \uC644\uB8CC", { outputPath, elapsedMs: timingsMs.merge });
11835
+ const report2 = {
11836
+ inputPath: absInput,
11837
+ outputPath,
11838
+ workspaceDir,
11839
+ selectedModel: "native-parser",
11840
+ probeImage: "",
11841
+ probeResults: [],
11842
+ pageCount: parsed.pageCount,
11843
+ sourceFormat: parsed.fileType,
11844
+ keyHealth: [],
11845
+ timingsMs,
11846
+ modelCachePath
11847
+ };
11848
+ await (0, import_promises2.writeFile)(reportPath, JSON.stringify(report2, null, 2), "utf-8");
11849
+ logStage("info", "finalize", "done", "native parse run-report \uC800\uC7A5 \uC644\uB8CC", { reportPath });
11850
+ return { outputPath, reportPath, selectedModel: "native-parser" };
12052
11851
  }
11852
+ const workingPdfPath = absInput;
11853
+ markStageStart("convert", "PDF \uC785\uB825 \uD655\uC778 \uC911");
11854
+ logStage("info", "convert", "start", "PDF \uC785\uB825 \uD655\uC778", { input: absInput });
12053
11855
  timingsMs.convert = elapsedMs(convertStart);
12054
- markStageDone("convert", "PDF \uBCC0\uD658 \uC644\uB8CC");
12055
- logStage("info", "convert", "done", "PDF \uBCC0\uD658 \uC644\uB8CC", { elapsedMs: timingsMs.convert });
11856
+ markStageDone("convert", "PDF \uC785\uB825 \uD655\uC778 \uC644\uB8CC");
11857
+ logStage("info", "convert", "done", "PDF \uC785\uB825 \uD655\uC778 \uC644\uB8CC", { elapsedMs: timingsMs.convert });
11858
+ const keyPool = ApiKeyRotationPool.fromEnv();
12056
11859
  const renderStart = import_node_perf_hooks.performance.now();
12057
11860
  currentStage = "render";
12058
11861
  const totalPages = await getPdfPageCount(workingPdfPath).catch(() => 0);
12059
11862
  if (totalPages === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC218\uB97C \uD655\uC778\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.");
12060
11863
  markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
12061
11864
  logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi, totalPages });
12062
- await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, (0, import_path6.join)(imagesDir, "page")]);
12063
- const firstFiles = (await (0, import_promises4.readdir)(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
11865
+ await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, (0, import_path5.join)(imagesDir, "page")]);
11866
+ const firstFiles = (await (0, import_promises2.readdir)(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
12064
11867
  if (firstFiles.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uCCAB \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328");
12065
- const probeImage = (0, import_path6.join)(imagesDir, firstFiles[0]);
11868
+ const probeImage = (0, import_path5.join)(imagesDir, firstFiles[0]);
12066
11869
  markStageProgress("render", Math.round(1 / totalPages * 100), 1, totalPages, `\uD398\uC774\uC9C0 1/${totalPages} \uB80C\uB354\uB9C1`);
12067
11870
  const probeStart = import_node_perf_hooks.performance.now();
12068
11871
  currentStage = "probe";
@@ -12098,7 +11901,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
12098
11901
  const keyCount = keyPool.snapshot().length;
12099
11902
  const workerCount = Math.max(1, keyCount * concurrencyPerKey);
12100
11903
  const queueCapacity = workerCount * 2;
12101
- const queue2 = new BoundedQueue(queueCapacity);
11904
+ const queue = new BoundedQueue(queueCapacity);
12102
11905
  const ocrStart = import_node_perf_hooks.performance.now();
12103
11906
  currentStage = "ocr";
12104
11907
  markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (\uC6CC\uCEE4 ${workerCount}\uAC1C)`);
@@ -12106,17 +11909,17 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
12106
11909
  let renderDone = 1;
12107
11910
  const renderProducer = (async () => {
12108
11911
  try {
12109
- await queue2.enqueue({ pageNumber: 1, imagePath: probeImage });
11912
+ await queue.enqueue({ pageNumber: 1, imagePath: probeImage });
12110
11913
  if (totalPages > 1) {
12111
- for await (const item of renderPdfToPngStream(workingPdfPath, (0, import_path6.join)(imagesDir, "page"), dpi, totalPages, 2)) {
12112
- await queue2.enqueue(item);
11914
+ for await (const item of renderPdfToPngStream(workingPdfPath, (0, import_path5.join)(imagesDir, "page"), dpi, totalPages, 2)) {
11915
+ await queue.enqueue(item);
12113
11916
  renderDone++;
12114
11917
  markStageProgress("render", Math.round(renderDone / totalPages * 100), renderDone, totalPages, `\uD398\uC774\uC9C0 ${renderDone}/${totalPages} \uB80C\uB354\uB9C1`);
12115
11918
  logStage("debug", "render", "progress", "\uD398\uC774\uC9C0 \uB80C\uB354 \uC644\uB8CC", { page: item.pageNumber });
12116
11919
  }
12117
11920
  }
12118
11921
  } finally {
12119
- queue2.close();
11922
+ queue.close();
12120
11923
  timingsMs.render = elapsedMs(renderStart);
12121
11924
  markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
12122
11925
  logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: renderDone, elapsedMs: timingsMs.render });
@@ -12125,7 +11928,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
12125
11928
  const [, pageResultsMap] = await Promise.all([
12126
11929
  renderProducer,
12127
11930
  ocrWorkerPool({
12128
- queue: queue2,
11931
+ queue,
12129
11932
  workerCount,
12130
11933
  totalPages,
12131
11934
  ocrInput: {
@@ -12158,8 +11961,8 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
12158
11961
  const sortedEntries = Array.from(pageResultsMap.entries()).sort((a, b) => a[0] - b[0]);
12159
11962
  const rawPagePaths = [];
12160
11963
  for (const [pageNum, markdown] of sortedEntries) {
12161
- const pagePath = (0, import_path6.join)(rawDir, `page_${String(pageNum).padStart(4, "0")}.md`);
12162
- await (0, import_promises4.writeFile)(pagePath, markdown, "utf-8");
11964
+ const pagePath = (0, import_path5.join)(rawDir, `page_${String(pageNum).padStart(4, "0")}.md`);
11965
+ await (0, import_promises2.writeFile)(pagePath, markdown, "utf-8");
12163
11966
  rawPagePaths.push(pagePath);
12164
11967
  }
12165
11968
  const mergeStart = import_node_perf_hooks.performance.now();
@@ -12167,7 +11970,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
12167
11970
  markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
12168
11971
  logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
12169
11972
  const merged = await mergeMarkdownPages(rawPagePaths);
12170
- await (0, import_promises4.writeFile)(outputPath, merged, "utf-8");
11973
+ await (0, import_promises2.writeFile)(outputPath, merged, "utf-8");
12171
11974
  timingsMs.merge = elapsedMs(mergeStart);
12172
11975
  markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
12173
11976
  logStage("info", "merge", "done", "\uCD5C\uC885 \uBCD1\uD569 \uC644\uB8CC", { outputPath, elapsedMs: timingsMs.merge });
@@ -12183,7 +11986,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
12183
11986
  timingsMs,
12184
11987
  modelCachePath
12185
11988
  };
12186
- await (0, import_promises4.writeFile)(reportPath, JSON.stringify(report, null, 2), "utf-8");
11989
+ await (0, import_promises2.writeFile)(reportPath, JSON.stringify(report, null, 2), "utf-8");
12187
11990
  logStage("info", "finalize", "done", "run-report \uC800\uC7A5 \uC644\uB8CC", { reportPath });
12188
11991
  return { outputPath, reportPath, selectedModel };
12189
11992
  } catch (err) {
@@ -12258,7 +12061,7 @@ async function getPdfPageCount(pdfPath) {
12258
12061
  return n;
12259
12062
  }
12260
12063
  async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, startPage = 1) {
12261
- const imagesDir = (0, import_path6.dirname)(prefixPath);
12064
+ const imagesDir = (0, import_path5.dirname)(prefixPath);
12262
12065
  for (let page = startPage; page <= totalPages; page++) {
12263
12066
  try {
12264
12067
  await runCommand("pdftoppm", [
@@ -12272,9 +12075,9 @@ async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, start
12272
12075
  pdfPath,
12273
12076
  prefixPath
12274
12077
  ]);
12275
- const files = await (0, import_promises4.readdir)(imagesDir);
12078
+ const files = await (0, import_promises2.readdir)(imagesDir);
12276
12079
  const pageFiles = files.filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
12277
- const imagePath = (0, import_path6.join)(imagesDir, pageFiles[pageFiles.length - 1]);
12080
+ const imagePath = (0, import_path5.join)(imagesDir, pageFiles[pageFiles.length - 1]);
12278
12081
  yield { pageNumber: page, imagePath };
12279
12082
  } catch (err) {
12280
12083
  yield {
@@ -12287,7 +12090,7 @@ async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, start
12287
12090
  }
12288
12091
  async function runCommand(cmd, args) {
12289
12092
  await new Promise((resolvePromise, reject) => {
12290
- const child = (0, import_child_process5.spawn)(cmd, args, { stdio: "pipe" });
12093
+ const child = (0, import_child_process4.spawn)(cmd, args, { stdio: "pipe" });
12291
12094
  let stderr = "";
12292
12095
  child.stderr.on("data", (d) => {
12293
12096
  stderr += String(d);
@@ -12301,7 +12104,7 @@ async function runCommand(cmd, args) {
12301
12104
  }
12302
12105
  async function runCommandWithStdout(cmd, args) {
12303
12106
  return await new Promise((resolvePromise, reject) => {
12304
- const child = (0, import_child_process5.spawn)(cmd, args, { stdio: "pipe" });
12107
+ const child = (0, import_child_process4.spawn)(cmd, args, { stdio: "pipe" });
12305
12108
  let stdout = "";
12306
12109
  let stderr = "";
12307
12110
  child.stdout.on("data", (d) => {
@@ -12317,6 +12120,32 @@ async function runCommandWithStdout(cmd, args) {
12317
12120
  });
12318
12121
  });
12319
12122
  }
12123
+ async function parseNativeDocument(buffer) {
12124
+ const arrayBuffer = toArrayBuffer(buffer);
12125
+ const format = detectFormat(arrayBuffer);
12126
+ let result;
12127
+ let fileType;
12128
+ if (format === "hwp") {
12129
+ result = parseHwp5Document(buffer);
12130
+ fileType = "hwp";
12131
+ } else if (format === "hwpx") {
12132
+ const { format: zipFormat, zip } = await detectZipFormat(arrayBuffer);
12133
+ if (zipFormat === "xlsx") {
12134
+ result = await parseXlsxDocument(arrayBuffer, void 0, zip ?? void 0);
12135
+ fileType = "xlsx";
12136
+ } else if (zipFormat === "docx") {
12137
+ result = await parseDocxDocument(arrayBuffer, void 0, zip ?? void 0);
12138
+ fileType = "docx";
12139
+ } else {
12140
+ result = await parseHwpxDocument(arrayBuffer, void 0, zip ?? void 0);
12141
+ fileType = "hwpx";
12142
+ }
12143
+ } else {
12144
+ throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC790\uCCB4 \uD30C\uC11C\uB85C \uCC98\uB9AC\uD560 \uC218 \uC5C6\uB294 \uC785\uB825 \uD3EC\uB9F7: ${format}`);
12145
+ }
12146
+ const pageCount = result.metadata?.pageCount ?? Math.max(1, ...result.blocks.map((block) => block.pageNumber ?? 1));
12147
+ return { markdown: result.markdown, fileType, pageCount };
12148
+ }
12320
12149
  function naturalPageSort(a, b) {
12321
12150
  const na = Number((a.match(/\d+/g) || ["0"]).at(-1) || 0);
12322
12151
  const nb = Number((b.match(/\d+/g) || ["0"]).at(-1) || 0);
@@ -12390,7 +12219,7 @@ function startParallelProbeRuns(input) {
12390
12219
  }
12391
12220
  async function loadModelCache(path) {
12392
12221
  try {
12393
- const raw = await (0, import_promises4.readFile)(path, "utf-8");
12222
+ const raw = await (0, import_promises2.readFile)(path, "utf-8");
12394
12223
  return JSON.parse(raw);
12395
12224
  } catch {
12396
12225
  return null;
@@ -12421,15 +12250,15 @@ async function updateModelCache(path, probes) {
12421
12250
  }
12422
12251
  }
12423
12252
  current.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
12424
- await (0, import_promises4.writeFile)(path, JSON.stringify(current, null, 2), "utf-8");
12253
+ await (0, import_promises2.writeFile)(path, JSON.stringify(current, null, 2), "utf-8");
12425
12254
  }
12426
12255
  async function ocrWorkerPool(input) {
12427
- const { queue: queue2, workerCount, ocrInput, onPageDone } = input;
12256
+ const { queue, workerCount, ocrInput, onPageDone } = input;
12428
12257
  const results = /* @__PURE__ */ new Map();
12429
12258
  let completedCount = 0;
12430
12259
  async function worker() {
12431
12260
  while (true) {
12432
- const item = await queue2.dequeue();
12261
+ const item = await queue.dequeue();
12433
12262
  if (item === QUEUE_DONE) break;
12434
12263
  const { pageNumber, imagePath, error } = item;
12435
12264
  if (imagePath === null) {
@@ -12481,7 +12310,7 @@ async function ocrImageWithFallback(input) {
12481
12310
  async function mergeMarkdownPages(paths) {
12482
12311
  const out = [];
12483
12312
  for (let i = 0; i < paths.length; i++) {
12484
- const txt = (await (0, import_promises4.readFile)(paths[i], "utf-8")).trim();
12313
+ const txt = (await (0, import_promises2.readFile)(paths[i], "utf-8")).trim();
12485
12314
  if (!txt) continue;
12486
12315
  out.push(txt);
12487
12316
  }
@@ -12597,7 +12426,7 @@ async function ocrImageViaNim(input) {
12597
12426
  throw new UnifiedOcrError("OCR_FAILED", "ocr", `OCR \uC7AC\uC2DC\uB3C4 \uCD08\uACFC: ${lastErr}`);
12598
12427
  }
12599
12428
  async function encodeBase64(path) {
12600
- const b = await (0, import_promises4.readFile)(path);
12429
+ const b = await (0, import_promises2.readFile)(path);
12601
12430
  return b.toString("base64");
12602
12431
  }
12603
12432
  function stripCodeFence3(text) {
@@ -12609,7 +12438,7 @@ async function delay(ms) {
12609
12438
  await new Promise((resolvePromise) => setTimeout(resolvePromise, ms));
12610
12439
  }
12611
12440
  function ensureSupportedInput(path) {
12612
- const ext = (0, import_path6.extname)(path).toLowerCase();
12441
+ const ext = (0, import_path5.extname)(path).toLowerCase();
12613
12442
  const allowed = /* @__PURE__ */ new Set([".pdf", ".hwp", ".hwpx", ".docx", ".xlsx"]);
12614
12443
  if (!allowed.has(ext)) {
12615
12444
  throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uC785\uB825 \uD3EC\uB9F7: ${ext}`);
@@ -12617,16 +12446,6 @@ function ensureSupportedInput(path) {
12617
12446
  }
12618
12447
  function normalizePipelineError(err, stage) {
12619
12448
  if (err instanceof UnifiedOcrError) return err;
12620
- if (err instanceof ConvertError) {
12621
- const codeMap = {
12622
- SOFFICE_NOT_FOUND: "SOFFICE_NOT_FOUND",
12623
- CONVERT_FAILED: "CONVERT_FAILED",
12624
- TIMEOUT: "CONVERT_FAILED",
12625
- UNSUPPORTED_PLATFORM: "CONVERT_FAILED",
12626
- UNSUPPORTED_FORMAT: "UNSUPPORTED_INPUT"
12627
- };
12628
- return new UnifiedOcrError(codeMap[err.code] ?? "CONVERT_FAILED", stage, err.message);
12629
- }
12630
12449
  const message = err instanceof Error ? err.message : String(err);
12631
12450
  const codeByStage = {
12632
12451
  convert: "CONVERT_FAILED",
@@ -12646,7 +12465,7 @@ async function parse2(input, options) {
12646
12465
  let buffer;
12647
12466
  if (typeof input === "string") {
12648
12467
  try {
12649
- const buf = await (0, import_promises5.readFile)(input);
12468
+ const buf = await (0, import_promises3.readFile)(input);
12650
12469
  buffer = toArrayBuffer(buf);
12651
12470
  } catch (err) {
12652
12471
  const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
@@ -12806,9 +12625,6 @@ async function parseDocx(buffer, options, zip) {
12806
12625
  VERSION,
12807
12626
  blocksToMarkdown,
12808
12627
  compare,
12809
- convertHwpToPdf,
12810
- convertHwpxToPdf,
12811
- convertToPdf,
12812
12628
  detectFormat,
12813
12629
  detectZipFormat,
12814
12630
  diffBlocks,