@clazic/kordoc 2.7.5 → 2.7.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -3062,9 +3062,6 @@ __export(index_exports, {
3062
3062
  VERSION: () => VERSION,
3063
3063
  blocksToMarkdown: () => blocksToMarkdown,
3064
3064
  compare: () => compare,
3065
- convertHwpToPdf: () => convertHwpToPdf,
3066
- convertHwpxToPdf: () => convertHwpxToPdf,
3067
- convertToPdf: () => convertToPdf,
3068
3065
  detectFormat: () => detectFormat,
3069
3066
  detectZipFormat: () => detectZipFormat,
3070
3067
  diffBlocks: () => diffBlocks,
@@ -3084,7 +3081,7 @@ __export(index_exports, {
3084
3081
  runUnifiedOcrPipeline: () => runUnifiedOcrPipeline
3085
3082
  });
3086
3083
  module.exports = __toCommonJS(index_exports);
3087
- var import_promises5 = require("fs/promises");
3084
+ var import_promises3 = require("fs/promises");
3088
3085
 
3089
3086
  // src/detect.ts
3090
3087
  var import_jszip = __toESM(require("jszip"), 1);
@@ -3137,7 +3134,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
3137
3134
  var import_xmldom = require("@xmldom/xmldom");
3138
3135
 
3139
3136
  // src/utils.ts
3140
- var VERSION = true ? "2.7.4" : "0.0.0-dev";
3137
+ var VERSION = true ? "2.7.6" : "0.0.0-dev";
3141
3138
  function toArrayBuffer(buf) {
3142
3139
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
3143
3140
  return buf.buffer;
@@ -3344,13 +3341,21 @@ function sanitizeText(text) {
3344
3341
  }
3345
3342
  return result;
3346
3343
  }
3344
+ function escapeGfm(text, inTableCell = false) {
3345
+ if (!text) return text;
3346
+ let result = text.replace(/(?<!\\)~/g, "\\~");
3347
+ if (inTableCell) {
3348
+ result = result.replace(/(?<!\\)\|/g, "\\|");
3349
+ }
3350
+ return result;
3351
+ }
3347
3352
  function blocksToMarkdown(blocks) {
3348
3353
  const lines = [];
3349
3354
  for (let i = 0; i < blocks.length; i++) {
3350
3355
  const block = blocks[i];
3351
3356
  if (block.type === "heading" && block.text) {
3352
3357
  const prefix = "#".repeat(Math.min(block.level || 2, 6));
3353
- const headingText = sanitizeText(block.text);
3358
+ const headingText = escapeGfm(sanitizeText(block.text), false);
3354
3359
  if (headingText) lines.push("", `${prefix} ${headingText}`, "");
3355
3360
  continue;
3356
3361
  }
@@ -3363,42 +3368,47 @@ function blocksToMarkdown(blocks) {
3363
3368
  continue;
3364
3369
  }
3365
3370
  if (block.type === "list" && block.text) {
3366
- const listText = sanitizeText(block.text);
3367
- if (!listText) continue;
3368
- const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(listText);
3371
+ const sanitized = sanitizeText(block.text);
3372
+ if (!sanitized) continue;
3373
+ const alreadyNumbered = block.listType === "ordered" && /^\d+\.\s/.test(sanitized);
3369
3374
  const prefix = alreadyNumbered ? "" : block.listType === "ordered" ? "1. " : "- ";
3375
+ const listText = escapeGfm(sanitized, false);
3370
3376
  lines.push(`${prefix}${listText}`);
3371
3377
  if (block.children) {
3372
3378
  for (const child of block.children) {
3373
3379
  const childPrefix = child.listType === "ordered" ? "1." : "-";
3374
- lines.push(` ${childPrefix} ${child.text || ""}`);
3380
+ const childText = child.text ? escapeGfm(sanitizeText(child.text), false) : "";
3381
+ lines.push(` ${childPrefix} ${childText}`);
3375
3382
  }
3376
3383
  }
3377
3384
  continue;
3378
3385
  }
3379
3386
  if (block.type === "paragraph" && block.text) {
3380
- let text = sanitizeText(block.text);
3381
- if (!text) continue;
3382
- if (/^\[별표\s*\d+/.test(text)) {
3387
+ const sanitized = sanitizeText(block.text);
3388
+ if (!sanitized) continue;
3389
+ if (/^\[별표\s*\d+/.test(sanitized)) {
3383
3390
  const nextBlock = blocks[i + 1];
3391
+ const escapedSelf = escapeGfm(sanitized, false);
3384
3392
  if (nextBlock?.type === "paragraph" && nextBlock.text && /관련\)?$/.test(nextBlock.text)) {
3385
- lines.push("", `## ${text} ${nextBlock.text}`, "");
3393
+ const nextEscaped = escapeGfm(sanitizeText(nextBlock.text), false);
3394
+ lines.push("", `## ${escapedSelf} ${nextEscaped}`, "");
3386
3395
  i++;
3387
3396
  } else {
3388
- lines.push("", `## ${text}`, "");
3397
+ lines.push("", `## ${escapedSelf}`, "");
3389
3398
  }
3390
3399
  continue;
3391
3400
  }
3392
- if (/^\([^)]*조[^)]*관련\)$/.test(text)) {
3393
- lines.push(`*${text}*`, "");
3401
+ if (/^\([^)]*조[^)]*관련\)$/.test(sanitized)) {
3402
+ lines.push(`*${escapeGfm(sanitized, false)}*`, "");
3394
3403
  continue;
3395
3404
  }
3405
+ let text = escapeGfm(sanitized, false);
3396
3406
  if (block.href) {
3397
3407
  const href = sanitizeHref(block.href);
3398
3408
  if (href) text = `[${text}](${href})`;
3399
3409
  }
3400
3410
  if (block.footnoteText) {
3401
- text += ` (\uC8FC: ${block.footnoteText})`;
3411
+ text += ` (\uC8FC: ${escapeGfm(block.footnoteText, false)})`;
3402
3412
  }
3403
3413
  lines.push(text);
3404
3414
  } else if (block.type === "table" && block.table) {
@@ -3423,13 +3433,13 @@ function tableToMarkdown(table) {
3423
3433
  return content.split(/\n/).map((line) => {
3424
3434
  const trimmed = line.trim();
3425
3435
  if (!trimmed) return "";
3426
- if (/^\d+\.\s/.test(trimmed)) return `**${trimmed}**`;
3427
- if (/^[가-힣]\.\s/.test(trimmed)) return ` ${trimmed}`;
3428
- return trimmed;
3436
+ if (/^\d+\.\s/.test(trimmed)) return `**${escapeGfm(trimmed, false)}**`;
3437
+ if (/^[가-힣]\.\s/.test(trimmed)) return ` ${escapeGfm(trimmed, false)}`;
3438
+ return escapeGfm(trimmed, false);
3429
3439
  }).filter(Boolean).join("\n");
3430
3440
  }
3431
3441
  if (numCols === 1 && numRows >= 2) {
3432
- return cells.map((row) => sanitizeText(row[0].text).replace(/\n/g, " ")).filter(Boolean).join("\n");
3442
+ return cells.map((row) => escapeGfm(sanitizeText(row[0].text).replace(/\n/g, " "), false)).filter(Boolean).join("\n");
3433
3443
  }
3434
3444
  const display = Array.from({ length: numRows }, () => Array(numCols).fill(""));
3435
3445
  const skip = /* @__PURE__ */ new Set();
@@ -3438,7 +3448,7 @@ function tableToMarkdown(table) {
3438
3448
  if (skip.has(`${r},${c}`)) continue;
3439
3449
  const cell = cells[r]?.[c];
3440
3450
  if (!cell) continue;
3441
- display[r][c] = sanitizeText(cell.text).replace(/\n/g, "<br>");
3451
+ display[r][c] = escapeGfm(sanitizeText(cell.text).replace(/\n/g, "<br>"), true);
3442
3452
  for (let dr = 0; dr < cell.rowSpan; dr++) {
3443
3453
  for (let dc = 0; dc < cell.colSpan; dc++) {
3444
3454
  if (dr === 0 && dc === 0) continue;
@@ -3485,6 +3495,223 @@ var HEADING_RATIO_H1 = 1.5;
3485
3495
  var HEADING_RATIO_H2 = 1.3;
3486
3496
  var HEADING_RATIO_H3 = 1.15;
3487
3497
 
3498
+ // src/hwp5/equation.ts
3499
+ var WORD_COMMANDS = /* @__PURE__ */ new Map([
3500
+ ["alpha", "\\alpha"],
3501
+ ["beta", "\\beta"],
3502
+ ["gamma", "\\gamma"],
3503
+ ["delta", "\\delta"],
3504
+ ["epsilon", "\\epsilon"],
3505
+ ["theta", "\\theta"],
3506
+ ["lambda", "\\lambda"],
3507
+ ["mu", "\\mu"],
3508
+ ["pi", "\\pi"],
3509
+ ["sigma", "\\sigma"],
3510
+ ["tau", "\\tau"],
3511
+ ["phi", "\\phi"],
3512
+ ["omega", "\\omega"],
3513
+ ["sin", "\\sin"],
3514
+ ["cos", "\\cos"],
3515
+ ["tan", "\\tan"],
3516
+ ["sec", "\\sec"],
3517
+ ["csc", "\\csc"],
3518
+ ["cot", "\\cot"],
3519
+ ["log", "\\log"],
3520
+ ["ln", "\\ln"],
3521
+ ["lim", "\\lim"],
3522
+ ["inf", "\\infty"],
3523
+ ["sum", "\\sum"],
3524
+ ["smallsum", "\\sum"],
3525
+ ["prod", "\\prod"],
3526
+ ["int", "\\int"],
3527
+ ["oint", "\\oint"],
3528
+ ["rightarrow", "\\rightarrow"],
3529
+ ["leftarrow", "\\leftarrow"],
3530
+ ["partial", "\\partial"],
3531
+ ["nabla", "\\nabla"],
3532
+ ["angle", "\\angle"],
3533
+ ["triangle", "\\triangle"],
3534
+ ["vec", "\\vec"],
3535
+ ["bar", "\\overline"],
3536
+ ["dot", "\\dot"],
3537
+ ["hat", "\\hat"],
3538
+ ["left", "\\left"],
3539
+ ["right", "\\right"]
3540
+ ]);
3541
+ var SYMBOL_WORDS = /* @__PURE__ */ new Map([
3542
+ ["times", "\\times"],
3543
+ ["divide", "\\div"],
3544
+ ["div", "\\div"],
3545
+ ["le", "\\leq"],
3546
+ ["ge", "\\geq"],
3547
+ ["geq", "\\geq"],
3548
+ ["deg", "^\\circ"],
3549
+ ["rarrow", "\\rightarrow"],
3550
+ ["larrow", "\\leftarrow"],
3551
+ ["lrarrow", "\\leftrightarrow"],
3552
+ ["in", "\\in"],
3553
+ ["notin", "\\notin"],
3554
+ ["emptyset", "\\emptyset"],
3555
+ ["subset", "\\subset"],
3556
+ ["nsubset", "\\nsubseteq"],
3557
+ ["cup", "\\cup"],
3558
+ ["cap", "\\cap"],
3559
+ ["smallinter", "\\cap"],
3560
+ ["sim", "\\sim"],
3561
+ ["circ", "\\circ"],
3562
+ ["bot", "\\perp"],
3563
+ ["dyad", "\\overleftrightarrow"],
3564
+ ["arch", "\\overset{\\frown}"]
3565
+ ]);
3566
+ function hwpEquationToLatex(equation) {
3567
+ return convertEquation(equation.replace(/\0/g, "").trim(), 0);
3568
+ }
3569
+ function convertEquation(equation, depth) {
3570
+ if (!equation || depth > 12) return equation;
3571
+ let result = equation.replace(/\s+/g, " ").replace(/`+/g, "\\,").replace(/~+/g, "\\,").trim();
3572
+ result = convertMatrixLike(result);
3573
+ result = convertRoots(result, depth);
3574
+ result = convertOver(result, depth);
3575
+ result = convertSqrt(result, depth);
3576
+ result = convertScripts(result);
3577
+ result = convertOperators(result);
3578
+ result = removeFontDirectives(result);
3579
+ result = convertWords(result);
3580
+ result = cleanupLatexSpacing(result);
3581
+ return result;
3582
+ }
3583
+ function convertMatrixLike(input) {
3584
+ return input.replace(
3585
+ /\bmatrix\s*\{([^{}]*)\}/gi,
3586
+ (_match, body) => `\\begin{matrix} ${body.split("#").map((part) => part.trim()).join(" & ")} \\end{matrix}`
3587
+ ).replace(
3588
+ /\bcases\s*\{([^{}]*)\}/gi,
3589
+ (_match, body) => `\\begin{cases} ${body.split("#").map((part) => part.trim()).join(" \\\\ ")} \\end{cases}`
3590
+ );
3591
+ }
3592
+ function convertRoots(input, depth) {
3593
+ return input.replace(/(?<!\\)\broot\s+({[^{}]*}|\S+)\s+of\s+({[^{}]*}|\S+)/gi, (_match, degree, radicand) => {
3594
+ return `\\sqrt[${convertEquation(unwrapGroup(degree), depth + 1)}]{${convertEquation(unwrapGroup(radicand), depth + 1)}}`;
3595
+ });
3596
+ }
3597
+ function convertSqrt(input, depth) {
3598
+ return input.replace(/(?<!\\)\bsqrt\s*({[^{}]*}|\S+)/gi, (_match, radicand) => {
3599
+ return `\\sqrt{${convertEquation(unwrapGroup(radicand), depth + 1)}}`;
3600
+ });
3601
+ }
3602
+ function convertOver(input, depth) {
3603
+ let result = input;
3604
+ for (let guard = 0; guard < 50; guard++) {
3605
+ const over = findTopLevelWord(result, "over");
3606
+ if (over < 0) break;
3607
+ const left = readLeftAtom(result, over);
3608
+ const right = readRightAtom(result, over + "over".length);
3609
+ if (!left || !right) break;
3610
+ const numerator = convertEquation(unwrapGroup(left.atom), depth + 1);
3611
+ const denominator = convertEquation(unwrapGroup(right.atom), depth + 1);
3612
+ result = result.slice(0, left.start) + `\\frac{${numerator}}{${denominator}}` + result.slice(right.end);
3613
+ }
3614
+ return result;
3615
+ }
3616
+ function convertScripts(input) {
3617
+ return input.replace(/\s*\^\s*/g, "^").replace(/\s*_\s*/g, "_").replace(/\^(?!\{)([^\s{}_^]+)/g, "^{$1}").replace(/_(?!\{)([^\s{}_^]+)/g, "_{$1}");
3618
+ }
3619
+ function convertOperators(input) {
3620
+ return input.replace(/\+-/g, "\\pm").replace(/-\+/g, "\\mp").replace(/\/\//g, "\\parallel").replace(/△/g, "\\triangle ").replace(/□/g, "\\square ").replace(/‧/g, "\\cdot ").replace(/!=/g, "\\neq").replace(/<=/g, "\\leq").replace(/>=/g, "\\geq").replace(/==/g, "\\equiv");
3621
+ }
3622
+ function removeFontDirectives(input) {
3623
+ return input.replace(/(?<!\\)\b(?:rm|it)\b\s*/gi, "");
3624
+ }
3625
+ function convertWords(input) {
3626
+ return input.replace(/(?<![\\A-Za-z0-9])([A-Za-z][A-Za-z0-9]*)(?![A-Za-z0-9])/g, (word) => {
3627
+ const exact = SYMBOL_WORDS.get(word);
3628
+ if (exact) return exact;
3629
+ const lower = word.toLowerCase();
3630
+ return SYMBOL_WORDS.get(lower) ?? WORD_COMMANDS.get(lower) ?? word;
3631
+ });
3632
+ }
3633
+ function cleanupLatexSpacing(input) {
3634
+ return input.replace(/\\left\s*\{/g, "\\left\\{").replace(/\\right\s*\}/g, "\\right\\}").replace(/\\left\s*([\[\]\(\)\|])/g, "\\left$1").replace(/\\right\s*([\[\]\(\)\|])/g, "\\right$1").replace(/\s*\\,\s*/g, "\\,").replace(/\s+/g, " ").replace(/\{\s+/g, "{").replace(/\s+\}/g, "}").trim();
3635
+ }
3636
+ function findTopLevelWord(input, word) {
3637
+ let curly = 0;
3638
+ let paren = 0;
3639
+ for (let i = 0; i <= input.length - word.length; i++) {
3640
+ const ch = input[i];
3641
+ if (ch === "{") curly++;
3642
+ else if (ch === "}") curly = Math.max(0, curly - 1);
3643
+ else if (ch === "(") paren++;
3644
+ else if (ch === ")") paren = Math.max(0, paren - 1);
3645
+ if (curly !== 0 || paren !== 0) continue;
3646
+ if (input.slice(i, i + word.length).toLowerCase() !== word) continue;
3647
+ if (isWordChar(input[i - 1]) || isWordChar(input[i + word.length])) continue;
3648
+ return i;
3649
+ }
3650
+ return -1;
3651
+ }
3652
+ function readLeftAtom(input, end) {
3653
+ let pos = end - 1;
3654
+ while (pos >= 0 && /\s/.test(input[pos])) pos--;
3655
+ if (pos < 0) return null;
3656
+ if (input[pos] === "}") {
3657
+ const start2 = findMatchingLeft(input, pos, "{", "}");
3658
+ if (start2 >= 0) return { start: start2, atom: input.slice(start2, pos + 1) };
3659
+ }
3660
+ if (input[pos] === ")") {
3661
+ const start2 = findMatchingLeft(input, pos, "(", ")");
3662
+ if (start2 >= 0) return { start: start2, atom: input.slice(start2, pos + 1) };
3663
+ }
3664
+ let start = pos;
3665
+ while (start >= 0 && !/\s/.test(input[start]) && !/[+\-=<>]/.test(input[start])) start--;
3666
+ return { start: start + 1, atom: input.slice(start + 1, pos + 1) };
3667
+ }
3668
+ function readRightAtom(input, start) {
3669
+ let pos = start;
3670
+ while (pos < input.length && /\s/.test(input[pos])) pos++;
3671
+ if (pos >= input.length) return null;
3672
+ if (input[pos] === "{") {
3673
+ const end2 = findMatchingRight(input, pos, "{", "}");
3674
+ if (end2 >= 0) return { end: end2 + 1, atom: input.slice(pos, end2 + 1) };
3675
+ }
3676
+ if (input[pos] === "(") {
3677
+ const end2 = findMatchingRight(input, pos, "(", ")");
3678
+ if (end2 >= 0) return { end: end2 + 1, atom: input.slice(pos, end2 + 1) };
3679
+ }
3680
+ let end = pos;
3681
+ while (end < input.length && !/\s/.test(input[end]) && !/[+\-=<>]/.test(input[end])) end++;
3682
+ return { end, atom: input.slice(pos, end) };
3683
+ }
3684
+ function findMatchingLeft(input, closeIndex, open, close) {
3685
+ let depth = 0;
3686
+ for (let i = closeIndex; i >= 0; i--) {
3687
+ if (input[i] === close) depth++;
3688
+ else if (input[i] === open) {
3689
+ depth--;
3690
+ if (depth === 0) return i;
3691
+ }
3692
+ }
3693
+ return -1;
3694
+ }
3695
+ function findMatchingRight(input, openIndex, open, close) {
3696
+ let depth = 0;
3697
+ for (let i = openIndex; i < input.length; i++) {
3698
+ if (input[i] === open) depth++;
3699
+ else if (input[i] === close) {
3700
+ depth--;
3701
+ if (depth === 0) return i;
3702
+ }
3703
+ }
3704
+ return -1;
3705
+ }
3706
+ function unwrapGroup(input) {
3707
+ const trimmed = input.trim();
3708
+ if (trimmed.startsWith("{") && trimmed.endsWith("}")) return trimmed.slice(1, -1);
3709
+ return trimmed;
3710
+ }
3711
+ function isWordChar(ch) {
3712
+ return !!ch && /[A-Za-z0-9_]/.test(ch);
3713
+ }
3714
+
3488
3715
  // src/hwpx/parser.ts
3489
3716
  init_page_range();
3490
3717
  init_logger();
@@ -4166,6 +4393,17 @@ function findDescendant(node, targetTag, depth = 0) {
4166
4393
  }
4167
4394
  return null;
4168
4395
  }
4396
+ function findChildByLocalName(node, targetTag) {
4397
+ const children = node.childNodes;
4398
+ if (!children) return null;
4399
+ for (let i = 0; i < children.length; i++) {
4400
+ const child = children[i];
4401
+ if (child.nodeType !== 1) continue;
4402
+ const tag = (child.tagName || child.localName || "").replace(/^[^:]+:/, "");
4403
+ if (tag === targetTag) return child;
4404
+ }
4405
+ return null;
4406
+ }
4169
4407
  function extractDrawTextBlocks(drawTextNode, blocks, styleMap, sectionNum) {
4170
4408
  const children = drawTextNode.childNodes;
4171
4409
  if (!children) return;
@@ -4268,6 +4506,22 @@ function extractParagraphInfo(para, styleMap) {
4268
4506
  case "shapeComment":
4269
4507
  case "drawText":
4270
4508
  break;
4509
+ // 수식: <hp:equation> 내부의 <hp:script>에 HML/HULK-style 수식 본문이
4510
+ // 들어있음. hwpEquationToLatex로 LaTeX 변환 후 `$...$`로 래핑하여
4511
+ // 본문 텍스트에 인라인 삽입. 변환 실패/빈 결과는 조용히 드롭
4512
+ // (대체 텍스트 "수식입니다." 누출 방지는 기존 정규식이 처리).
4513
+ case "equation": {
4514
+ const script = findChildByLocalName(child, "script");
4515
+ const raw = script ? extractTextFromNode(script) : "";
4516
+ if (raw.trim()) {
4517
+ try {
4518
+ const latex = hwpEquationToLatex(raw).trim();
4519
+ if (latex) text += " $" + latex.replace(/\$/g, "\\$") + "$ ";
4520
+ } catch {
4521
+ }
4522
+ }
4523
+ break;
4524
+ }
4271
4525
  // run 요소에서 charPrIDRef 추출
4272
4526
  case "r": {
4273
4527
  const runCharPr = child.getAttribute("charPrIDRef");
@@ -4334,8 +4588,13 @@ var TAG_CHAR_SHAPE = 68;
4334
4588
  var TAG_CTRL_HEADER = 71;
4335
4589
  var TAG_LIST_HEADER = 72;
4336
4590
  var TAG_TABLE = 77;
4337
- var TAG_DOC_CHAR_SHAPE = 55;
4338
- var TAG_DOC_STYLE = 58;
4591
+ var TAG_EQEDIT = 88;
4592
+ var HWPTAG_BEGIN = 16;
4593
+ var TAG_ID_MAPPINGS = HWPTAG_BEGIN + 1;
4594
+ var TAG_FACE_NAME = HWPTAG_BEGIN + 3;
4595
+ var TAG_DOC_CHAR_SHAPE = HWPTAG_BEGIN + 5;
4596
+ var TAG_DOC_PARA_SHAPE = HWPTAG_BEGIN + 9;
4597
+ var TAG_DOC_STYLE = HWPTAG_BEGIN + 10;
4339
4598
  var CHAR_LINE = 0;
4340
4599
  var CHAR_SECTION_BREAK = 10;
4341
4600
  var CHAR_PARA = 13;
@@ -4493,6 +4752,15 @@ function extractText(data) {
4493
4752
  }
4494
4753
  return result;
4495
4754
  }
4755
+ function extractEquationText(data) {
4756
+ if (data.length < 6) return null;
4757
+ const scriptLength = data.readUInt16LE(4);
4758
+ const scriptStart = 6;
4759
+ const scriptEnd = scriptStart + scriptLength * 2;
4760
+ if (scriptLength <= 0 || scriptEnd > data.length) return null;
4761
+ const equation = data.subarray(scriptStart, scriptEnd).toString("utf16le").replace(/\0+/g, "").trim();
4762
+ return equation || null;
4763
+ }
4496
4764
 
4497
4765
  // src/hwp5/aes.ts
4498
4766
  var S_BOX = new Uint8Array([
@@ -5652,6 +5920,26 @@ function findViewTextSectionsLenient(lcfb, compressed) {
5652
5920
  return sections.sort((a, b) => a.idx - b.idx).map((s) => s.content);
5653
5921
  }
5654
5922
  var TAG_SHAPE_COMPONENT = 74;
5923
+ var CTRL_ID_EQEDIT = "deqe";
5924
+ function isEquationControlId(ctrlId) {
5925
+ return ctrlId === CTRL_ID_EQEDIT || ctrlId === "eqed";
5926
+ }
5927
+ function formatEquationForMarkdown(equation) {
5928
+ const normalized = hwpEquationToLatex(equation);
5929
+ if (!normalized) return "";
5930
+ return `$${normalized.replace(/\$/g, "\\$")}$`;
5931
+ }
5932
+ function extractEquationFromControl(records, ctrlIdx) {
5933
+ const ctrlLevel = records[ctrlIdx].level;
5934
+ for (let j = ctrlIdx + 1; j < records.length && j < ctrlIdx + 10; j++) {
5935
+ const r = records[j];
5936
+ if (r.level <= ctrlLevel) break;
5937
+ if (r.tagId !== TAG_EQEDIT) continue;
5938
+ const equation = extractEquationText(r.data);
5939
+ return equation ? formatEquationForMarkdown(equation) : null;
5940
+ }
5941
+ return null;
5942
+ }
5655
5943
  function extractBinDataId(records, ctrlIdx) {
5656
5944
  const ctrlLevel = records[ctrlIdx].level;
5657
5945
  for (let j = ctrlIdx + 1; j < records.length && j < ctrlIdx + 50; j++) {
@@ -5811,6 +6099,16 @@ function parseSection(records, docInfo, warnings, sectionNum) {
5811
6099
  }
5812
6100
  } else if (ctrlId === " elo" || ctrlId === "ole ") {
5813
6101
  warnings.push({ page: sectionNum, message: `\uC2A4\uD0B5\uB41C \uC81C\uC5B4 \uC694\uC18C: ${ctrlId.trim()}`, code: "SKIPPED_IMAGE" });
6102
+ } else if (isEquationControlId(ctrlId)) {
6103
+ const equation = extractEquationFromControl(records, i);
6104
+ if (equation) {
6105
+ const lastBlock = blocks[blocks.length - 1];
6106
+ if (lastBlock && lastBlock.type === "paragraph" && lastBlock.text) {
6107
+ lastBlock.text = lastBlock.text + " " + equation;
6108
+ } else {
6109
+ blocks.push({ type: "paragraph", text: equation, pageNumber: sectionNum });
6110
+ }
6111
+ }
5814
6112
  } else if (ctrlId === "fn " || ctrlId === " nf " || ctrlId === "en " || ctrlId === " ne ") {
5815
6113
  const noteText = extractNoteText(records, i);
5816
6114
  if (noteText && blocks.length > 0) {
@@ -5843,6 +6141,13 @@ function extractNoteText(records, ctrlIdx) {
5843
6141
  const t = extractText(r.data).trim();
5844
6142
  if (t) texts.push(t);
5845
6143
  }
6144
+ if (r.tagId === TAG_CTRL_HEADER && r.data.length >= 4) {
6145
+ const innerCtrlId = r.data.subarray(0, 4).toString("ascii");
6146
+ if (isEquationControlId(innerCtrlId)) {
6147
+ const equation = extractEquationFromControl(records, j);
6148
+ if (equation) texts.push(equation);
6149
+ }
6150
+ }
5846
6151
  }
5847
6152
  return texts.length > 0 ? texts.join(" ") : null;
5848
6153
  }
@@ -5856,6 +6161,13 @@ function extractTextBoxText(records, ctrlIdx) {
5856
6161
  const t = extractText(r.data).trim();
5857
6162
  if (t) texts.push(t);
5858
6163
  }
6164
+ if (r.tagId === TAG_CTRL_HEADER && r.data.length >= 4) {
6165
+ const innerCtrlId = r.data.subarray(0, 4).toString("ascii");
6166
+ if (isEquationControlId(innerCtrlId)) {
6167
+ const equation = extractEquationFromControl(records, j);
6168
+ if (equation) texts.push(equation);
6169
+ }
6170
+ }
5859
6171
  }
5860
6172
  return texts.length > 0 ? texts.join("\n") : null;
5861
6173
  }
@@ -5924,6 +6236,12 @@ function parseParagraphWithTables(records, startIdx) {
5924
6236
  i = nextIdx;
5925
6237
  continue;
5926
6238
  }
6239
+ if (isEquationControlId(ctrlId)) {
6240
+ const equation = extractEquationFromControl(records, i);
6241
+ if (equation) {
6242
+ text = text ? text + " " + equation : equation;
6243
+ }
6244
+ }
5927
6245
  }
5928
6246
  i++;
5929
6247
  }
@@ -11233,528 +11551,6 @@ async function markdownToXlsx(markdown, options) {
11233
11551
  return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
11234
11552
  }
11235
11553
 
11236
- // src/convert/index.ts
11237
- var import_promises3 = require("fs/promises");
11238
-
11239
- // src/convert/libreoffice.ts
11240
- var import_libreoffice_convert = __toESM(require("libreoffice-convert"), 1);
11241
-
11242
- // src/convert/error.ts
11243
- var ConvertError = class extends Error {
11244
- constructor(code, message) {
11245
- super(message);
11246
- this.code = code;
11247
- this.name = "ConvertError";
11248
- }
11249
- };
11250
-
11251
- // src/convert/installer.ts
11252
- var import_os3 = require("os");
11253
- var import_path5 = require("path");
11254
- var import_promises2 = require("fs/promises");
11255
- var import_fs4 = require("fs");
11256
- var import_child_process4 = require("child_process");
11257
- var installInFlight = null;
11258
- var CACHE_DIR = (0, import_path5.join)((0, import_os3.homedir)(), ".cache", "kordoc", "libreoffice");
11259
- var VERSION_FILE = (0, import_path5.join)(CACHE_DIR, "version");
11260
- var PACKAGES = {
11261
- darwin: {
11262
- url: "https://ftp.osuosl.org/pub/tdf/libreoffice/stable/26.2.3/mac/x86_64/LibreOffice_26.2.3_MacOS_x86-64.dmg",
11263
- binPath: "LibreOffice.app/Contents/MacOS/soffice",
11264
- sizeMb: 300
11265
- },
11266
- linux: {
11267
- url: "https://ftp.osuosl.org/pub/tdf/libreoffice/stable/26.2.3/deb/x86_64/LibreOffice_26.2.3_Linux_x86-64_deb.tar.gz",
11268
- binPath: "opt/libreoffice26.2/program/soffice",
11269
- sizeMb: 210
11270
- },
11271
- win32: {
11272
- url: "https://ftp.osuosl.org/pub/tdf/libreoffice/stable/26.2.3/win/x86_64/LibreOffice_26.2.3_Win_x86-64.msi",
11273
- binPath: "LibreOffice/program/soffice.exe",
11274
- sizeMb: 360
11275
- }
11276
- };
11277
- async function findInPath() {
11278
- return new Promise((resolve4) => {
11279
- const child = (0, import_child_process4.spawn)("soffice", ["--version"], { stdio: "ignore" });
11280
- child.on("close", (code) => resolve4(code === 0 ? "soffice" : null));
11281
- child.on("error", () => resolve4(null));
11282
- });
11283
- }
11284
- async function findInCache() {
11285
- const cachedBin = (0, import_path5.join)(CACHE_DIR, "bin", "soffice");
11286
- try {
11287
- await (0, import_promises2.access)(cachedBin);
11288
- return cachedBin;
11289
- } catch {
11290
- return null;
11291
- }
11292
- }
11293
- async function findInDefaultPaths() {
11294
- const platform = process.platform;
11295
- const paths = [];
11296
- if (platform === "darwin") {
11297
- paths.push(
11298
- "/Applications/LibreOffice.app/Contents/MacOS/soffice",
11299
- "/opt/homebrew/bin/soffice",
11300
- "/usr/local/bin/soffice"
11301
- );
11302
- } else if (platform === "linux") {
11303
- paths.push(
11304
- "/usr/bin/soffice",
11305
- "/usr/lib/libreoffice/program/soffice"
11306
- );
11307
- } else if (platform === "win32") {
11308
- const pf = process.env["ProgramFiles"] ?? "C:\\Program Files";
11309
- const pf86 = process.env["ProgramFiles(x86)"] ?? "C:\\Program Files (x86)";
11310
- paths.push(
11311
- (0, import_path5.join)(pf, "LibreOffice", "program", "soffice.exe"),
11312
- (0, import_path5.join)(pf86, "LibreOffice", "program", "soffice.exe")
11313
- );
11314
- }
11315
- for (const p of paths) {
11316
- try {
11317
- await (0, import_promises2.access)(p);
11318
- return p;
11319
- } catch {
11320
- continue;
11321
- }
11322
- }
11323
- return null;
11324
- }
11325
- async function downloadWithProgress(url, dest, totalBytes, onProgress) {
11326
- const response = await fetch(url);
11327
- if (!response.ok) throw new Error(`\uB2E4\uC6B4\uB85C\uB4DC \uC2E4\uD328: HTTP ${response.status} (${url})`);
11328
- if (!response.body) throw new Error("\uB2E4\uC6B4\uB85C\uB4DC \uC2E4\uD328: response body \uC5C6\uC74C");
11329
- const file = (0, import_fs4.createWriteStream)(dest);
11330
- const reader = response.body.getReader();
11331
- let downloaded = 0;
11332
- try {
11333
- while (true) {
11334
- const { done, value } = await reader.read();
11335
- if (done) break;
11336
- if (!file.write(value)) {
11337
- await new Promise((resolve4) => file.once("drain", resolve4));
11338
- }
11339
- downloaded += value.length;
11340
- onProgress?.(downloaded, totalBytes);
11341
- }
11342
- } finally {
11343
- reader.releaseLock();
11344
- await new Promise((resolve4, reject) => {
11345
- file.end((err) => err ? reject(err) : resolve4());
11346
- });
11347
- }
11348
- }
11349
- async function installForPlatform(pkg, onProgress) {
11350
- const platform = process.platform;
11351
- await (0, import_promises2.mkdir)(CACHE_DIR, { recursive: true });
11352
- const downloadPath = (0, import_path5.join)(CACHE_DIR, `download-${Date.now()}`);
11353
- await downloadWithProgress(pkg.url, downloadPath, pkg.sizeMb * 1024 * 1024, onProgress);
11354
- try {
11355
- if (platform === "darwin") {
11356
- return await installMacOS(pkg, downloadPath);
11357
- } else if (platform === "linux") {
11358
- return await installLinux(pkg, downloadPath);
11359
- } else if (platform === "win32") {
11360
- return await installWindows(pkg, downloadPath);
11361
- }
11362
- } catch (err) {
11363
- await (0, import_promises2.rm)(downloadPath, { force: true });
11364
- throw err;
11365
- }
11366
- throw new ConvertError("UNSUPPORTED_PLATFORM", `${platform}\uC740 \uC790\uB3D9 \uC124\uCE58\uB97C \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4`);
11367
- }
11368
- async function installMacOS(pkg, downloadPath) {
11369
- const mountPoint = `/Volumes/LibreOffice_${Date.now()}`;
11370
- await new Promise((resolve4, reject) => {
11371
- const stderr = [];
11372
- const child = (0, import_child_process4.spawn)("hdiutil", ["attach", "-nobrowse", "-noverify", "-mountpoint", mountPoint, downloadPath]);
11373
- child.stderr?.on("data", (d) => stderr.push(d.toString()));
11374
- child.on(
11375
- "close",
11376
- (code) => code === 0 ? resolve4() : reject(new Error(`dmg \uB9C8\uC6B4\uD2B8 \uC2E4\uD328 (code=${code}): ${stderr.join("").trim()}`))
11377
- );
11378
- });
11379
- try {
11380
- const appSource = (0, import_path5.join)(mountPoint, "LibreOffice.app");
11381
- const appDest = (0, import_path5.join)(CACHE_DIR, "LibreOffice.app");
11382
- await new Promise((resolve4, reject) => {
11383
- const child = (0, import_child_process4.spawn)("cp", ["-R", appSource, appDest]);
11384
- child.on("close", (code) => code === 0 ? resolve4() : reject(new Error(".app \uBCF5\uC0AC \uC2E4\uD328")));
11385
- });
11386
- } finally {
11387
- await new Promise((resolve4) => {
11388
- const child = (0, import_child_process4.spawn)("hdiutil", ["detach", mountPoint]);
11389
- child.on("close", () => resolve4());
11390
- });
11391
- }
11392
- await (0, import_promises2.rm)(downloadPath, { force: true });
11393
- return await createSymlink((0, import_path5.join)(CACHE_DIR, pkg.binPath));
11394
- }
11395
- async function installLinux(pkg, downloadPath) {
11396
- const extractDir = (0, import_path5.join)(CACHE_DIR, `extract-${Date.now()}`);
11397
- await (0, import_promises2.mkdir)(extractDir, { recursive: true });
11398
- await new Promise((resolve4, reject) => {
11399
- const child = (0, import_child_process4.spawn)("tar", ["xzf", downloadPath, "-C", extractDir]);
11400
- child.on("close", (code) => code === 0 ? resolve4() : reject(new Error("\uC555\uCD95 \uD574\uC81C \uC2E4\uD328")));
11401
- });
11402
- const debsDir = (0, import_path5.join)(extractDir, "DEBS");
11403
- try {
11404
- await (0, import_promises2.access)(debsDir);
11405
- const entries = await (await import("fs/promises")).readdir(debsDir);
11406
- for (const entry of entries) {
11407
- if (entry.endsWith(".deb")) {
11408
- await new Promise((resolve4, reject) => {
11409
- const child = (0, import_child_process4.spawn)("dpkg-deb", ["-x", (0, import_path5.join)(debsDir, entry), CACHE_DIR]);
11410
- child.on("close", (code) => code === 0 ? resolve4() : reject(new Error(`${entry} \uCD94\uCD9C \uC2E4\uD328`)));
11411
- });
11412
- }
11413
- }
11414
- } catch {
11415
- }
11416
- await (0, import_promises2.rm)(downloadPath, { force: true });
11417
- await (0, import_promises2.rm)(extractDir, { recursive: true, force: true });
11418
- return await createSymlink((0, import_path5.join)(CACHE_DIR, pkg.binPath));
11419
- }
11420
- async function installWindows(pkg, downloadPath) {
11421
- await new Promise((resolve4, reject) => {
11422
- const child = (0, import_child_process4.spawn)("msiexec", ["/a", downloadPath, "/qn", `TARGETDIR=${CACHE_DIR}`]);
11423
- child.on("close", (code) => code === 0 ? resolve4() : reject(new Error("MSI \uC124\uCE58 \uC2E4\uD328")));
11424
- });
11425
- await (0, import_promises2.rm)(downloadPath, { force: true });
11426
- return (0, import_path5.join)(CACHE_DIR, pkg.binPath);
11427
- }
11428
- async function createSymlink(actualBin) {
11429
- const binDir = (0, import_path5.join)(CACHE_DIR, "bin");
11430
- await (0, import_promises2.mkdir)(binDir, { recursive: true });
11431
- const linkBin = (0, import_path5.join)(binDir, "soffice");
11432
- try {
11433
- await (0, import_promises2.symlink)(actualBin, linkBin);
11434
- } catch {
11435
- }
11436
- process.env.PATH = `${binDir}${import_path5.delimiter}${process.env.PATH}`;
11437
- return linkBin;
11438
- }
11439
- async function installLibreOffice(onProgress) {
11440
- const platform = process.platform;
11441
- const pkg = PACKAGES[platform];
11442
- if (!pkg) {
11443
- throw new ConvertError(
11444
- "UNSUPPORTED_PLATFORM",
11445
- `${platform}\uC740 \uC790\uB3D9 \uC124\uCE58\uB97C \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4. \uC218\uB3D9\uC73C\uB85C LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.`
11446
- );
11447
- }
11448
- return await installForPlatform(pkg, onProgress);
11449
- }
11450
- async function resolveSoffice(emitter, autoInstall = true) {
11451
- emitter.validate("soffice_check", "LibreOffice \uAC00\uC6A9\uC131 \uD655\uC778 \uC911...");
11452
- const inPath = await findInPath();
11453
- if (inPath) {
11454
- emitter.validate("soffice_found", "\uC2DC\uC2A4\uD15C PATH\uC5D0\uC11C LibreOffice \uBC1C\uACAC", { sofficePath: inPath });
11455
- return inPath;
11456
- }
11457
- const inCache = await findInCache();
11458
- if (inCache) {
11459
- emitter.validate("soffice_found", "\uCE90\uC2DC\uB41C LibreOffice \uBC1C\uACAC", { sofficePath: inCache });
11460
- return inCache;
11461
- }
11462
- const inDefault = await findInDefaultPaths();
11463
- if (inDefault) {
11464
- emitter.validate("soffice_found", "\uAE30\uBCF8 \uACBD\uB85C\uC5D0\uC11C LibreOffice \uBC1C\uACAC", { sofficePath: inDefault });
11465
- return inDefault;
11466
- }
11467
- if (!autoInstall) {
11468
- emitter.error(
11469
- "validate",
11470
- "SOFFICE_NOT_FOUND",
11471
- "LibreOffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4",
11472
- "\uC218\uB3D9\uC73C\uB85C \uC124\uCE58\uD558\uAC70\uB098 autoInstallLibreOffice: true \uC635\uC158\uC744 \uC0AC\uC6A9\uD558\uC138\uC694."
11473
- );
11474
- throw new ConvertError("SOFFICE_NOT_FOUND", "LibreOffice\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4");
11475
- }
11476
- if (installInFlight) {
11477
- return installInFlight;
11478
- }
11479
- emitter.install("install_start", "LibreOffice \uC790\uB3D9 \uC124\uCE58\uB97C \uC2DC\uC791\uD569\uB2C8\uB2E4...");
11480
- installInFlight = (async () => {
11481
- try {
11482
- const installed = await installLibreOffice((downloaded, total) => {
11483
- const percent = Math.round(downloaded / total * 100);
11484
- emitter.install("download_progress", `\uB2E4\uC6B4\uB85C\uB4DC \uC911... ${percent}%`, {
11485
- percent,
11486
- downloadedBytes: downloaded,
11487
- totalBytes: total
11488
- });
11489
- });
11490
- emitter.install("install_complete", "\uC124\uCE58 \uC644\uB8CC", { installedPath: installed });
11491
- return installed;
11492
- } catch (err) {
11493
- const errorMsg = err instanceof Error ? err.message : String(err);
11494
- emitter.install("install_failed", "\uC124\uCE58 \uC2E4\uD328", { error: errorMsg });
11495
- throw err;
11496
- } finally {
11497
- installInFlight = null;
11498
- }
11499
- })();
11500
- return installInFlight;
11501
- }
11502
-
11503
- // src/convert/libreoffice.ts
11504
- var libreConvert = import_libreoffice_convert.default.convert;
11505
- var libreConvertWithOptions = import_libreoffice_convert.default.convertWithOptions;
11506
- async function convertBuffer(buffer, targetExt, timeoutMs = 6e4, sofficePath, sourceExt) {
11507
- return new Promise((resolve4, reject) => {
11508
- const timer = setTimeout(() => {
11509
- reject(
11510
- new ConvertError("TIMEOUT", `\uBCC0\uD658 \uD0C0\uC784\uC544\uC6C3 (${timeoutMs}ms \uCD08\uACFC)`)
11511
- );
11512
- }, timeoutMs);
11513
- const cb = (err, done) => {
11514
- clearTimeout(timer);
11515
- if (err || !done) {
11516
- reject(
11517
- new ConvertError(
11518
- "CONVERT_FAILED",
11519
- err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"
11520
- )
11521
- );
11522
- return;
11523
- }
11524
- resolve4(done);
11525
- };
11526
- if (sofficePath) {
11527
- const fileName = sourceExt ? `source${sourceExt}` : "source";
11528
- libreConvertWithOptions(buffer, targetExt, void 0, { sofficeBinaryPaths: [sofficePath], fileName }, cb);
11529
- } else {
11530
- libreConvert(buffer, targetExt, void 0, cb);
11531
- }
11532
- });
11533
- }
11534
-
11535
- // src/convert/events.ts
11536
- var ConvertEventEmitter = class {
11537
- listener = null;
11538
- /** 이벤트 리스너 등록 */
11539
- setListener(listener) {
11540
- this.listener = listener;
11541
- }
11542
- /** 이벤트 발송 */
11543
- emit(event) {
11544
- try {
11545
- this.listener?.(event);
11546
- } catch {
11547
- }
11548
- }
11549
- /** 타입 안전한 헬퍼: detect 이벤트 */
11550
- detect(stage, message, meta) {
11551
- this.emit({ type: "detect", stage, message, ...meta });
11552
- }
11553
- /** 타입 안전한 헬퍼: validate 이벤트 */
11554
- validate(stage, message, meta) {
11555
- this.emit({ type: "validate", stage, message, ...meta });
11556
- }
11557
- /** 타입 안전한 헬퍼: install 이벤트 */
11558
- install(stage, message, meta) {
11559
- this.emit({ type: "install", stage, message, ...meta });
11560
- }
11561
- /** 타입 안전한 헬퍼: convert 진행 이벤트 */
11562
- progress(percent, message) {
11563
- this.emit({ type: "convert", stage: "convert_progress", message, percent });
11564
- }
11565
- /** 타입 안전한 헬퍼: convert 시작 */
11566
- convertStart(message) {
11567
- this.emit({ type: "convert", stage: "convert_start", message, percent: 0 });
11568
- }
11569
- /** 타입 안전한 헬퍼: convert 완료 */
11570
- convertDone(message) {
11571
- this.emit({ type: "convert", stage: "convert_done", message, percent: 100 });
11572
- }
11573
- /** 타입 안전한 헬퍼: 완료 이벤트 */
11574
- complete(result) {
11575
- this.emit({ type: "complete", stage: "success", message: "\uBCC0\uD658 \uC644\uB8CC", result });
11576
- }
11577
- /** 타입 안전한 헬퍼: 에러 이벤트 */
11578
- error(stage, code, message, suggestion) {
11579
- this.emit({ type: "error", stage, code, message, recoverable: true, suggestion });
11580
- }
11581
- };
11582
-
11583
- // src/convert/index.ts
11584
- var isConverting = false;
11585
- var queue = [];
11586
- async function acquireConvertLock() {
11587
- if (!isConverting) {
11588
- isConverting = true;
11589
- return () => {
11590
- isConverting = false;
11591
- const next = queue.shift();
11592
- next?.();
11593
- };
11594
- }
11595
- return new Promise((resolve4) => {
11596
- queue.push(() => {
11597
- isConverting = true;
11598
- resolve4(() => {
11599
- isConverting = false;
11600
- const next = queue.shift();
11601
- next?.();
11602
- });
11603
- });
11604
- });
11605
- }
11606
- async function convertToPdf(input, options) {
11607
- const emitter = new ConvertEventEmitter();
11608
- if (options?.onEvent) {
11609
- emitter.setListener(options.onEvent);
11610
- }
11611
- if (options?.onProgress) {
11612
- const legacyProgress = options.onProgress;
11613
- emitter.setListener((event) => {
11614
- if (event.type === "convert" && event.stage === "convert_progress") {
11615
- legacyProgress(event.percent, event.message);
11616
- }
11617
- });
11618
- }
11619
- try {
11620
- emitter.detect("reading", "\uC785\uB825 \uD30C\uC77C \uC77D\uB294 \uC911...");
11621
- let buffer;
11622
- try {
11623
- if (typeof input === "string") {
11624
- buffer = await (0, import_promises3.readFile)(input);
11625
- } else if (Buffer.isBuffer(input)) {
11626
- buffer = input;
11627
- } else {
11628
- buffer = Buffer.from(input);
11629
- }
11630
- } catch (err) {
11631
- emitter.error(
11632
- "detect",
11633
- "PARSE_ERROR",
11634
- `\uC785\uB825 \uC77D\uAE30 \uC2E4\uD328: ${err instanceof Error ? err.message : String(err)}`
11635
- );
11636
- return {
11637
- success: false,
11638
- code: "PARSE_ERROR",
11639
- error: `\uC785\uB825 \uC77D\uAE30 \uC2E4\uD328: ${err instanceof Error ? err.message : String(err)}`,
11640
- stage: "detect"
11641
- };
11642
- }
11643
- const MAX_FILE_SIZE = 500 * 1024 * 1024;
11644
- if (buffer.length > MAX_FILE_SIZE) {
11645
- emitter.error(
11646
- "detect",
11647
- "FILE_TOO_LARGE",
11648
- `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.length / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`
11649
- );
11650
- return {
11651
- success: false,
11652
- code: "FILE_TOO_LARGE",
11653
- error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.length / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`,
11654
- stage: "detect"
11655
- };
11656
- }
11657
- const format = detectFormat(toArrayBuffer(buffer));
11658
- emitter.detect("format_detected", `\uD3EC\uB9F7 \uAC10\uC9C0 \uC644\uB8CC: ${format}`, { format });
11659
- if (format !== "hwp" && format !== "hwpx") {
11660
- emitter.error("detect", "UNSUPPORTED_FORMAT", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD3EC\uB9F7\uC785\uB2C8\uB2E4: ${format}`);
11661
- return {
11662
- success: false,
11663
- code: "UNSUPPORTED_FORMAT",
11664
- error: `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD3EC\uB9F7\uC785\uB2C8\uB2E4: ${format}`,
11665
- stage: "detect"
11666
- };
11667
- }
11668
- emitter.validate("soffice_check", "LibreOffice \uAC00\uC6A9\uC131 \uD655\uC778 \uC911...");
11669
- let sofficePath;
11670
- try {
11671
- sofficePath = await resolveSoffice(emitter, options?.autoInstallLibreOffice ?? true);
11672
- } catch (err) {
11673
- if (err instanceof ConvertError) {
11674
- return {
11675
- success: false,
11676
- code: err.code,
11677
- error: err.message,
11678
- stage: "validate"
11679
- };
11680
- }
11681
- throw err;
11682
- }
11683
- const releaseLock = await acquireConvertLock();
11684
- try {
11685
- emitter.convertStart("\uBCC0\uD658 \uC2DC\uC791...");
11686
- emitter.progress(10, "\uBCC0\uD658 \uC911...");
11687
- const sourceExt = format === "hwpx" ? ".hwpx" : ".hwp";
11688
- const pdf = await convertBuffer(buffer, ".pdf", options?.timeoutMs, sofficePath, sourceExt);
11689
- emitter.progress(100, "\uBCC0\uD658 \uC644\uB8CC");
11690
- emitter.convertDone("\uBCC0\uD658 \uC644\uB8CC");
11691
- const result = {
11692
- success: true,
11693
- pdf: new Uint8Array(pdf),
11694
- sourceFormat: format
11695
- };
11696
- emitter.complete({
11697
- sourceFormat: format,
11698
- pdfSize: pdf.length
11699
- });
11700
- return result;
11701
- } catch (err) {
11702
- if (err instanceof ConvertError) {
11703
- emitter.error("convert", err.code, err.message);
11704
- return {
11705
- success: false,
11706
- code: err.code,
11707
- error: err.message,
11708
- stage: "convert"
11709
- };
11710
- }
11711
- const errorMsg = err instanceof Error ? err.message : "\uBCC0\uD658 \uC2E4\uD328";
11712
- emitter.error("convert", classifyError(err), errorMsg);
11713
- return {
11714
- success: false,
11715
- code: classifyError(err),
11716
- error: errorMsg,
11717
- stage: "convert"
11718
- };
11719
- } finally {
11720
- releaseLock();
11721
- }
11722
- } catch (unexpectedErr) {
11723
- const errorMsg = unexpectedErr instanceof Error ? unexpectedErr.message : "\uC608\uC0C1\uCE58 \uBABB\uD55C \uC624\uB958";
11724
- emitter.error("convert", "PARSE_ERROR", errorMsg);
11725
- return {
11726
- success: false,
11727
- code: "PARSE_ERROR",
11728
- error: errorMsg,
11729
- stage: "convert"
11730
- };
11731
- }
11732
- }
11733
- async function convertHwpToPdf(input, options) {
11734
- const result = await convertToPdf(input, options);
11735
- if (result.success && result.sourceFormat !== "hwp") {
11736
- return {
11737
- success: false,
11738
- code: "UNSUPPORTED_FORMAT",
11739
- error: `HWP 5.x \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${result.sourceFormat}`,
11740
- stage: "detect"
11741
- };
11742
- }
11743
- return result;
11744
- }
11745
- async function convertHwpxToPdf(input, options) {
11746
- const result = await convertToPdf(input, options);
11747
- if (result.success && result.sourceFormat !== "hwpx") {
11748
- return {
11749
- success: false,
11750
- code: "UNSUPPORTED_FORMAT",
11751
- error: `HWPX \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${result.sourceFormat}`,
11752
- stage: "detect"
11753
- };
11754
- }
11755
- return result;
11756
- }
11757
-
11758
11554
  // src/ocr/api-key-rotation.ts
11759
11555
  var AllKeysCoolingDownError = class extends Error {
11760
11556
  waitMs;
@@ -11849,9 +11645,9 @@ var ApiKeyRotationPool = class _ApiKeyRotationPool {
11849
11645
  };
11850
11646
 
11851
11647
  // src/pipeline/unified-ocr.ts
11852
- var import_promises4 = require("fs/promises");
11853
- var import_path6 = require("path");
11854
- var import_child_process5 = require("child_process");
11648
+ var import_promises2 = require("fs/promises");
11649
+ var import_path5 = require("path");
11650
+ var import_child_process4 = require("child_process");
11855
11651
  var import_node_perf_hooks = require("perf_hooks");
11856
11652
  init_logger();
11857
11653
 
@@ -11985,15 +11781,15 @@ function elapsedMs(startAt) {
11985
11781
  return Math.round(import_node_perf_hooks.performance.now() - startAt);
11986
11782
  }
11987
11783
  async function runUnifiedOcrPipeline(inputPath, options = {}) {
11988
- const absInput = (0, import_path6.resolve)(inputPath);
11989
- const stem = (0, import_path6.basename)(absInput, (0, import_path6.extname)(absInput));
11990
- const workspaceDir = (0, import_path6.resolve)(options.workspaceDir ?? (0, import_path6.join)((0, import_path6.dirname)(absInput), `${stem}_ocr_workspace`));
11991
- const imagesDir = (0, import_path6.join)(workspaceDir, "images");
11992
- const rawDir = (0, import_path6.join)(workspaceDir, "ocr", "raw");
11993
- const diffDir = (0, import_path6.join)(workspaceDir, "ocr", "diff");
11994
- const outputPath = (0, import_path6.resolve)(options.outputPath ?? (0, import_path6.join)((0, import_path6.dirname)(absInput), `${stem}.md`));
11995
- const reportPath = (0, import_path6.join)(workspaceDir, "run-report.json");
11996
- const modelCachePath = (0, import_path6.join)((0, import_path6.dirname)(absInput), ".kordoc-model-cache.json");
11784
+ const absInput = (0, import_path5.resolve)(inputPath);
11785
+ const stem = (0, import_path5.basename)(absInput, (0, import_path5.extname)(absInput));
11786
+ const workspaceDir = (0, import_path5.resolve)(options.workspaceDir ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}_ocr_workspace`));
11787
+ const imagesDir = (0, import_path5.join)(workspaceDir, "images");
11788
+ const rawDir = (0, import_path5.join)(workspaceDir, "ocr", "raw");
11789
+ const diffDir = (0, import_path5.join)(workspaceDir, "ocr", "diff");
11790
+ const outputPath = (0, import_path5.resolve)(options.outputPath ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}.md`));
11791
+ const reportPath = (0, import_path5.join)(workspaceDir, "run-report.json");
11792
+ const modelCachePath = (0, import_path5.join)((0, import_path5.dirname)(absInput), ".kordoc-model-cache.json");
11997
11793
  const baseUrl = options.baseUrl ?? "https://integrate.api.nvidia.com/v1/chat/completions";
11998
11794
  const timeoutMs = options.timeoutMs ?? 6e4;
11999
11795
  const maxRetriesPerPage = options.maxRetriesPerPage ?? 5;
@@ -12004,12 +11800,11 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
12004
11800
  const models = sortModelsByCache(modelsInput, modelCache);
12005
11801
  const modelMaxTokens = { ...DEFAULT_MODEL_MAX_TOKENS, ...options.modelMaxTokens ?? {} };
12006
11802
  const stageWeights = normalizeWeights({ ...DEFAULT_STAGE_WEIGHTS, ...options.stageWeights ?? {} });
12007
- const keyPool = ApiKeyRotationPool.fromEnv();
12008
11803
  const runId = options.runId ?? generateRunId("ocr");
12009
11804
  const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
12010
- await (0, import_promises4.mkdir)(imagesDir, { recursive: true });
12011
- await (0, import_promises4.mkdir)(rawDir, { recursive: true });
12012
- await (0, import_promises4.mkdir)(diffDir, { recursive: true });
11805
+ await (0, import_promises2.mkdir)(imagesDir, { recursive: true });
11806
+ await (0, import_promises2.mkdir)(rawDir, { recursive: true });
11807
+ await (0, import_promises2.mkdir)(diffDir, { recursive: true });
12013
11808
  const timingsMs = {};
12014
11809
  const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
12015
11810
  const markStageProgress = (stage, stagePercent, current, total, message, model) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message, model });
@@ -12020,52 +11815,57 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
12020
11815
  };
12021
11816
  try {
12022
11817
  ensureSupportedInput(absInput);
12023
- let workingPdfPath = absInput;
12024
11818
  const convertStart = import_node_perf_hooks.performance.now();
12025
11819
  currentStage = "convert";
12026
- markStageStart("convert", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC911");
12027
- logStage("info", "convert", "start", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC2DC\uC791", { input: absInput });
12028
- if ((0, import_path6.extname)(absInput).toLowerCase() !== ".pdf") {
12029
- const convertEmitter = new ConvertEventEmitter();
12030
- if (options.onEvent) {
12031
- convertEmitter.setListener((evt) => {
12032
- if (evt.type === "install" || evt.type === "validate" || evt.type === "error") {
12033
- try {
12034
- ;
12035
- options.onEvent(evt);
12036
- } catch {
12037
- }
12038
- }
12039
- });
12040
- }
12041
- let resolvedSofficePath;
12042
- if (options.sofficePath) {
12043
- const sofficeDir = (0, import_path6.dirname)(options.sofficePath);
12044
- process.env.PATH = `${sofficeDir}${import_path6.delimiter}${process.env.PATH ?? ""}`;
12045
- convertEmitter.validate("soffice_found", "\uC9C1\uC811 \uC9C0\uC815\uB41C LibreOffice \uACBD\uB85C \uC0AC\uC6A9", { sofficePath: options.sofficePath });
12046
- resolvedSofficePath = options.sofficePath;
12047
- } else {
12048
- resolvedSofficePath = await resolveSoffice(convertEmitter, options.autoInstallLibreOffice ?? false);
12049
- }
12050
- workingPdfPath = (0, import_path6.join)(workspaceDir, `${stem}.pdf`);
12051
- const inputBuffer = await (0, import_promises4.readFile)(absInput);
12052
- const sourceExt = (0, import_path6.extname)(absInput).toLowerCase();
12053
- const out = await convertBuffer(inputBuffer, ".pdf", 5 * 6e4, resolvedSofficePath, sourceExt);
12054
- await (0, import_promises4.writeFile)(workingPdfPath, out);
11820
+ if ((0, import_path5.extname)(absInput).toLowerCase() !== ".pdf") {
11821
+ markStageStart("convert", "\uC790\uCCB4 \uD30C\uC11C\uB85C Markdown \uBCC0\uD658 \uC911");
11822
+ logStage("info", "convert", "start", "\uC790\uCCB4 \uD30C\uC11C \uBCC0\uD658 \uC2DC\uC791", { input: absInput });
11823
+ const inputBuffer = await (0, import_promises2.readFile)(absInput);
11824
+ const parsed = await parseNativeDocument(inputBuffer);
11825
+ timingsMs.convert = elapsedMs(convertStart);
11826
+ markStageDone("convert", "\uC790\uCCB4 \uD30C\uC11C \uBCC0\uD658 \uC644\uB8CC");
11827
+ logStage("info", "convert", "done", "\uC790\uCCB4 \uD30C\uC11C \uBCC0\uD658 \uC644\uB8CC", { format: parsed.fileType, elapsedMs: timingsMs.convert });
11828
+ const mergeStart2 = import_node_perf_hooks.performance.now();
11829
+ currentStage = "merge";
11830
+ markStageStart("merge", "Markdown \uC800\uC7A5 \uC911");
11831
+ await (0, import_promises2.writeFile)(outputPath, parsed.markdown, "utf-8");
11832
+ timingsMs.merge = elapsedMs(mergeStart2);
11833
+ markStageDone("merge", "Markdown \uC800\uC7A5 \uC644\uB8CC");
11834
+ logStage("info", "merge", "done", "Markdown \uC800\uC7A5 \uC644\uB8CC", { outputPath, elapsedMs: timingsMs.merge });
11835
+ const report2 = {
11836
+ inputPath: absInput,
11837
+ outputPath,
11838
+ workspaceDir,
11839
+ selectedModel: "native-parser",
11840
+ probeImage: "",
11841
+ probeResults: [],
11842
+ pageCount: parsed.pageCount,
11843
+ sourceFormat: parsed.fileType,
11844
+ keyHealth: [],
11845
+ timingsMs,
11846
+ modelCachePath
11847
+ };
11848
+ await (0, import_promises2.writeFile)(reportPath, JSON.stringify(report2, null, 2), "utf-8");
11849
+ logStage("info", "finalize", "done", "native parse run-report \uC800\uC7A5 \uC644\uB8CC", { reportPath });
11850
+ return { outputPath, reportPath, selectedModel: "native-parser" };
12055
11851
  }
11852
+ const workingPdfPath = absInput;
11853
+ markStageStart("convert", "PDF \uC785\uB825 \uD655\uC778 \uC911");
11854
+ logStage("info", "convert", "start", "PDF \uC785\uB825 \uD655\uC778", { input: absInput });
12056
11855
  timingsMs.convert = elapsedMs(convertStart);
12057
- markStageDone("convert", "PDF \uBCC0\uD658 \uC644\uB8CC");
12058
- logStage("info", "convert", "done", "PDF \uBCC0\uD658 \uC644\uB8CC", { elapsedMs: timingsMs.convert });
11856
+ markStageDone("convert", "PDF \uC785\uB825 \uD655\uC778 \uC644\uB8CC");
11857
+ logStage("info", "convert", "done", "PDF \uC785\uB825 \uD655\uC778 \uC644\uB8CC", { elapsedMs: timingsMs.convert });
11858
+ const keyPool = ApiKeyRotationPool.fromEnv();
12059
11859
  const renderStart = import_node_perf_hooks.performance.now();
12060
11860
  currentStage = "render";
12061
11861
  const totalPages = await getPdfPageCount(workingPdfPath).catch(() => 0);
12062
11862
  if (totalPages === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC218\uB97C \uD655\uC778\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.");
12063
11863
  markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
12064
11864
  logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi, totalPages });
12065
- await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, (0, import_path6.join)(imagesDir, "page")]);
12066
- const firstFiles = (await (0, import_promises4.readdir)(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
11865
+ await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, (0, import_path5.join)(imagesDir, "page")]);
11866
+ const firstFiles = (await (0, import_promises2.readdir)(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
12067
11867
  if (firstFiles.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uCCAB \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328");
12068
- const probeImage = (0, import_path6.join)(imagesDir, firstFiles[0]);
11868
+ const probeImage = (0, import_path5.join)(imagesDir, firstFiles[0]);
12069
11869
  markStageProgress("render", Math.round(1 / totalPages * 100), 1, totalPages, `\uD398\uC774\uC9C0 1/${totalPages} \uB80C\uB354\uB9C1`);
12070
11870
  const probeStart = import_node_perf_hooks.performance.now();
12071
11871
  currentStage = "probe";
@@ -12101,7 +11901,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
12101
11901
  const keyCount = keyPool.snapshot().length;
12102
11902
  const workerCount = Math.max(1, keyCount * concurrencyPerKey);
12103
11903
  const queueCapacity = workerCount * 2;
12104
- const queue2 = new BoundedQueue(queueCapacity);
11904
+ const queue = new BoundedQueue(queueCapacity);
12105
11905
  const ocrStart = import_node_perf_hooks.performance.now();
12106
11906
  currentStage = "ocr";
12107
11907
  markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (\uC6CC\uCEE4 ${workerCount}\uAC1C)`);
@@ -12109,17 +11909,17 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
12109
11909
  let renderDone = 1;
12110
11910
  const renderProducer = (async () => {
12111
11911
  try {
12112
- await queue2.enqueue({ pageNumber: 1, imagePath: probeImage });
11912
+ await queue.enqueue({ pageNumber: 1, imagePath: probeImage });
12113
11913
  if (totalPages > 1) {
12114
- for await (const item of renderPdfToPngStream(workingPdfPath, (0, import_path6.join)(imagesDir, "page"), dpi, totalPages, 2)) {
12115
- await queue2.enqueue(item);
11914
+ for await (const item of renderPdfToPngStream(workingPdfPath, (0, import_path5.join)(imagesDir, "page"), dpi, totalPages, 2)) {
11915
+ await queue.enqueue(item);
12116
11916
  renderDone++;
12117
11917
  markStageProgress("render", Math.round(renderDone / totalPages * 100), renderDone, totalPages, `\uD398\uC774\uC9C0 ${renderDone}/${totalPages} \uB80C\uB354\uB9C1`);
12118
11918
  logStage("debug", "render", "progress", "\uD398\uC774\uC9C0 \uB80C\uB354 \uC644\uB8CC", { page: item.pageNumber });
12119
11919
  }
12120
11920
  }
12121
11921
  } finally {
12122
- queue2.close();
11922
+ queue.close();
12123
11923
  timingsMs.render = elapsedMs(renderStart);
12124
11924
  markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
12125
11925
  logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: renderDone, elapsedMs: timingsMs.render });
@@ -12128,7 +11928,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
12128
11928
  const [, pageResultsMap] = await Promise.all([
12129
11929
  renderProducer,
12130
11930
  ocrWorkerPool({
12131
- queue: queue2,
11931
+ queue,
12132
11932
  workerCount,
12133
11933
  totalPages,
12134
11934
  ocrInput: {
@@ -12161,8 +11961,8 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
12161
11961
  const sortedEntries = Array.from(pageResultsMap.entries()).sort((a, b) => a[0] - b[0]);
12162
11962
  const rawPagePaths = [];
12163
11963
  for (const [pageNum, markdown] of sortedEntries) {
12164
- const pagePath = (0, import_path6.join)(rawDir, `page_${String(pageNum).padStart(4, "0")}.md`);
12165
- await (0, import_promises4.writeFile)(pagePath, markdown, "utf-8");
11964
+ const pagePath = (0, import_path5.join)(rawDir, `page_${String(pageNum).padStart(4, "0")}.md`);
11965
+ await (0, import_promises2.writeFile)(pagePath, markdown, "utf-8");
12166
11966
  rawPagePaths.push(pagePath);
12167
11967
  }
12168
11968
  const mergeStart = import_node_perf_hooks.performance.now();
@@ -12170,7 +11970,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
12170
11970
  markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
12171
11971
  logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
12172
11972
  const merged = await mergeMarkdownPages(rawPagePaths);
12173
- await (0, import_promises4.writeFile)(outputPath, merged, "utf-8");
11973
+ await (0, import_promises2.writeFile)(outputPath, merged, "utf-8");
12174
11974
  timingsMs.merge = elapsedMs(mergeStart);
12175
11975
  markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
12176
11976
  logStage("info", "merge", "done", "\uCD5C\uC885 \uBCD1\uD569 \uC644\uB8CC", { outputPath, elapsedMs: timingsMs.merge });
@@ -12186,7 +11986,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
12186
11986
  timingsMs,
12187
11987
  modelCachePath
12188
11988
  };
12189
- await (0, import_promises4.writeFile)(reportPath, JSON.stringify(report, null, 2), "utf-8");
11989
+ await (0, import_promises2.writeFile)(reportPath, JSON.stringify(report, null, 2), "utf-8");
12190
11990
  logStage("info", "finalize", "done", "run-report \uC800\uC7A5 \uC644\uB8CC", { reportPath });
12191
11991
  return { outputPath, reportPath, selectedModel };
12192
11992
  } catch (err) {
@@ -12261,7 +12061,7 @@ async function getPdfPageCount(pdfPath) {
12261
12061
  return n;
12262
12062
  }
12263
12063
  async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, startPage = 1) {
12264
- const imagesDir = (0, import_path6.dirname)(prefixPath);
12064
+ const imagesDir = (0, import_path5.dirname)(prefixPath);
12265
12065
  for (let page = startPage; page <= totalPages; page++) {
12266
12066
  try {
12267
12067
  await runCommand("pdftoppm", [
@@ -12275,9 +12075,9 @@ async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, start
12275
12075
  pdfPath,
12276
12076
  prefixPath
12277
12077
  ]);
12278
- const files = await (0, import_promises4.readdir)(imagesDir);
12078
+ const files = await (0, import_promises2.readdir)(imagesDir);
12279
12079
  const pageFiles = files.filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
12280
- const imagePath = (0, import_path6.join)(imagesDir, pageFiles[pageFiles.length - 1]);
12080
+ const imagePath = (0, import_path5.join)(imagesDir, pageFiles[pageFiles.length - 1]);
12281
12081
  yield { pageNumber: page, imagePath };
12282
12082
  } catch (err) {
12283
12083
  yield {
@@ -12290,7 +12090,7 @@ async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, start
12290
12090
  }
12291
12091
  async function runCommand(cmd, args) {
12292
12092
  await new Promise((resolvePromise, reject) => {
12293
- const child = (0, import_child_process5.spawn)(cmd, args, { stdio: "pipe" });
12093
+ const child = (0, import_child_process4.spawn)(cmd, args, { stdio: "pipe" });
12294
12094
  let stderr = "";
12295
12095
  child.stderr.on("data", (d) => {
12296
12096
  stderr += String(d);
@@ -12304,7 +12104,7 @@ async function runCommand(cmd, args) {
12304
12104
  }
12305
12105
  async function runCommandWithStdout(cmd, args) {
12306
12106
  return await new Promise((resolvePromise, reject) => {
12307
- const child = (0, import_child_process5.spawn)(cmd, args, { stdio: "pipe" });
12107
+ const child = (0, import_child_process4.spawn)(cmd, args, { stdio: "pipe" });
12308
12108
  let stdout = "";
12309
12109
  let stderr = "";
12310
12110
  child.stdout.on("data", (d) => {
@@ -12320,6 +12120,32 @@ async function runCommandWithStdout(cmd, args) {
12320
12120
  });
12321
12121
  });
12322
12122
  }
12123
+ async function parseNativeDocument(buffer) {
12124
+ const arrayBuffer = toArrayBuffer(buffer);
12125
+ const format = detectFormat(arrayBuffer);
12126
+ let result;
12127
+ let fileType;
12128
+ if (format === "hwp") {
12129
+ result = parseHwp5Document(buffer);
12130
+ fileType = "hwp";
12131
+ } else if (format === "hwpx") {
12132
+ const { format: zipFormat, zip } = await detectZipFormat(arrayBuffer);
12133
+ if (zipFormat === "xlsx") {
12134
+ result = await parseXlsxDocument(arrayBuffer, void 0, zip ?? void 0);
12135
+ fileType = "xlsx";
12136
+ } else if (zipFormat === "docx") {
12137
+ result = await parseDocxDocument(arrayBuffer, void 0, zip ?? void 0);
12138
+ fileType = "docx";
12139
+ } else {
12140
+ result = await parseHwpxDocument(arrayBuffer, void 0, zip ?? void 0);
12141
+ fileType = "hwpx";
12142
+ }
12143
+ } else {
12144
+ throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC790\uCCB4 \uD30C\uC11C\uB85C \uCC98\uB9AC\uD560 \uC218 \uC5C6\uB294 \uC785\uB825 \uD3EC\uB9F7: ${format}`);
12145
+ }
12146
+ const pageCount = result.metadata?.pageCount ?? Math.max(1, ...result.blocks.map((block) => block.pageNumber ?? 1));
12147
+ return { markdown: result.markdown, fileType, pageCount };
12148
+ }
12323
12149
  function naturalPageSort(a, b) {
12324
12150
  const na = Number((a.match(/\d+/g) || ["0"]).at(-1) || 0);
12325
12151
  const nb = Number((b.match(/\d+/g) || ["0"]).at(-1) || 0);
@@ -12393,7 +12219,7 @@ function startParallelProbeRuns(input) {
12393
12219
  }
12394
12220
  async function loadModelCache(path) {
12395
12221
  try {
12396
- const raw = await (0, import_promises4.readFile)(path, "utf-8");
12222
+ const raw = await (0, import_promises2.readFile)(path, "utf-8");
12397
12223
  return JSON.parse(raw);
12398
12224
  } catch {
12399
12225
  return null;
@@ -12424,15 +12250,15 @@ async function updateModelCache(path, probes) {
12424
12250
  }
12425
12251
  }
12426
12252
  current.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
12427
- await (0, import_promises4.writeFile)(path, JSON.stringify(current, null, 2), "utf-8");
12253
+ await (0, import_promises2.writeFile)(path, JSON.stringify(current, null, 2), "utf-8");
12428
12254
  }
12429
12255
  async function ocrWorkerPool(input) {
12430
- const { queue: queue2, workerCount, ocrInput, onPageDone } = input;
12256
+ const { queue, workerCount, ocrInput, onPageDone } = input;
12431
12257
  const results = /* @__PURE__ */ new Map();
12432
12258
  let completedCount = 0;
12433
12259
  async function worker() {
12434
12260
  while (true) {
12435
- const item = await queue2.dequeue();
12261
+ const item = await queue.dequeue();
12436
12262
  if (item === QUEUE_DONE) break;
12437
12263
  const { pageNumber, imagePath, error } = item;
12438
12264
  if (imagePath === null) {
@@ -12484,7 +12310,7 @@ async function ocrImageWithFallback(input) {
12484
12310
  async function mergeMarkdownPages(paths) {
12485
12311
  const out = [];
12486
12312
  for (let i = 0; i < paths.length; i++) {
12487
- const txt = (await (0, import_promises4.readFile)(paths[i], "utf-8")).trim();
12313
+ const txt = (await (0, import_promises2.readFile)(paths[i], "utf-8")).trim();
12488
12314
  if (!txt) continue;
12489
12315
  out.push(txt);
12490
12316
  }
@@ -12600,7 +12426,7 @@ async function ocrImageViaNim(input) {
12600
12426
  throw new UnifiedOcrError("OCR_FAILED", "ocr", `OCR \uC7AC\uC2DC\uB3C4 \uCD08\uACFC: ${lastErr}`);
12601
12427
  }
12602
12428
  async function encodeBase64(path) {
12603
- const b = await (0, import_promises4.readFile)(path);
12429
+ const b = await (0, import_promises2.readFile)(path);
12604
12430
  return b.toString("base64");
12605
12431
  }
12606
12432
  function stripCodeFence3(text) {
@@ -12612,7 +12438,7 @@ async function delay(ms) {
12612
12438
  await new Promise((resolvePromise) => setTimeout(resolvePromise, ms));
12613
12439
  }
12614
12440
  function ensureSupportedInput(path) {
12615
- const ext = (0, import_path6.extname)(path).toLowerCase();
12441
+ const ext = (0, import_path5.extname)(path).toLowerCase();
12616
12442
  const allowed = /* @__PURE__ */ new Set([".pdf", ".hwp", ".hwpx", ".docx", ".xlsx"]);
12617
12443
  if (!allowed.has(ext)) {
12618
12444
  throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uC785\uB825 \uD3EC\uB9F7: ${ext}`);
@@ -12620,16 +12446,6 @@ function ensureSupportedInput(path) {
12620
12446
  }
12621
12447
  function normalizePipelineError(err, stage) {
12622
12448
  if (err instanceof UnifiedOcrError) return err;
12623
- if (err instanceof ConvertError) {
12624
- const codeMap = {
12625
- SOFFICE_NOT_FOUND: "SOFFICE_NOT_FOUND",
12626
- CONVERT_FAILED: "CONVERT_FAILED",
12627
- TIMEOUT: "CONVERT_FAILED",
12628
- UNSUPPORTED_PLATFORM: "CONVERT_FAILED",
12629
- UNSUPPORTED_FORMAT: "UNSUPPORTED_INPUT"
12630
- };
12631
- return new UnifiedOcrError(codeMap[err.code] ?? "CONVERT_FAILED", stage, err.message);
12632
- }
12633
12449
  const message = err instanceof Error ? err.message : String(err);
12634
12450
  const codeByStage = {
12635
12451
  convert: "CONVERT_FAILED",
@@ -12649,7 +12465,7 @@ async function parse2(input, options) {
12649
12465
  let buffer;
12650
12466
  if (typeof input === "string") {
12651
12467
  try {
12652
- const buf = await (0, import_promises5.readFile)(input);
12468
+ const buf = await (0, import_promises3.readFile)(input);
12653
12469
  buffer = toArrayBuffer(buf);
12654
12470
  } catch (err) {
12655
12471
  const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
@@ -12809,9 +12625,6 @@ async function parseDocx(buffer, options, zip) {
12809
12625
  VERSION,
12810
12626
  blocksToMarkdown,
12811
12627
  compare,
12812
- convertHwpToPdf,
12813
- convertHwpxToPdf,
12814
- convertToPdf,
12815
12628
  detectFormat,
12816
12629
  detectZipFormat,
12817
12630
  diffBlocks,