@clazic/kordoc 2.5.1 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -37,6 +37,118 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
37
37
  mod
38
38
  ));
39
39
 
40
+ // src/utils.ts
41
+ var utils_exports = {};
42
+ __export(utils_exports, {
43
+ KordocError: () => KordocError,
44
+ VERSION: () => VERSION,
45
+ classifyError: () => classifyError,
46
+ isPathTraversal: () => isPathTraversal,
47
+ normalizeKordocError: () => normalizeKordocError,
48
+ precheckZipSize: () => precheckZipSize,
49
+ sanitizeError: () => sanitizeError,
50
+ sanitizeHref: () => sanitizeHref,
51
+ toArrayBuffer: () => toArrayBuffer
52
+ });
53
/**
 * Returns an ArrayBuffer holding exactly the bytes visible through `buf`.
 *
 * When the view starts at offset 0 and spans its whole backing buffer, the
 * backing buffer itself is returned (no copy). Otherwise the viewed range is
 * copied out with `slice`.
 *
 * @param {{buffer: ArrayBuffer, byteOffset: number, byteLength: number}} buf
 *   A Buffer / typed-array style view.
 * @returns {ArrayBuffer} The backing buffer, or a copy of the viewed range.
 */
function toArrayBuffer(buf) {
  const { buffer: backing, byteOffset: start, byteLength: size } = buf;
  const spansWholeBacking = start === 0 && size === backing.byteLength;
  return spansWholeBacking ? backing : backing.slice(start, start + size);
}
59
/**
 * Produces a message for an arbitrary thrown value.
 *
 * Messages of `KordocError` instances are passed through unchanged; any other
 * value is replaced by a fixed generic Korean message ("An error occurred
 * while processing the document").
 *
 * @param {unknown} err The thrown value.
 * @returns {string} The message to show.
 */
function sanitizeError(err) {
  const isOwnError = err instanceof KordocError;
  return isOwnError
    ? err.message
    : "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
}
63
/**
 * Conservative check for archive entry names that could escape the extraction
 * root.
 *
 * A name is rejected when it contains a NUL character, contains `..` anywhere
 * (after backslashes are normalised to `/`), starts with `/`, or starts with a
 * drive-letter prefix such as `C:`.
 *
 * @param {string} name Entry name.
 * @returns {boolean} `true` when the name must be treated as unsafe.
 */
function isPathTraversal(name) {
  if (name.includes("\0")) {
    return true;
  }
  // Treat Windows separators like POSIX ones before inspecting the path.
  const unified = name.split("\\").join("/");
  if (unified.includes("..")) {
    return true;
  }
  if (unified.startsWith("/")) {
    return true;
  }
  return /^[A-Za-z]:/.test(unified);
}
68
/**
 * Best-effort ZIP pre-check that reads only the end-of-central-directory
 * (EOCD) record and the central directory, without decompressing anything.
 *
 * @param {ArrayBuffer} buffer Raw archive bytes.
 * @param {number} [maxUncompressedSize=104857600] Upper bound for the sum of
 *   the uncompressed sizes declared in the central directory.
 * @param {number} [maxEntries=500] Upper bound for the entry count declared in
 *   the EOCD record.
 * @returns {{totalUncompressed: number, entryCount: number}} Declared totals.
 *   Zeros are returned when no EOCD record is found or when parsing fails for
 *   any reason other than a limit violation.
 * @throws {KordocError} When the declared entry count or the declared total
 *   uncompressed size exceeds its limit.
 */
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
  const EOCD_SIGNATURE = 101010256; // 0x06054b50
  const CENTRAL_HEADER_SIGNATURE = 33639248; // 0x02014b50
  try {
    const view = new DataView(buffer);
    const byteCount = buffer.byteLength;
    // The EOCD record is at least 22 bytes and may be followed by a comment of
    // up to 65535 bytes, so only the last 65557 bytes are scanned (backwards).
    const lowestProbe = Math.max(0, byteCount - 65557);
    let eocdOffset = -1;
    let probe = byteCount - 22;
    while (probe >= lowestProbe) {
      if (view.getUint32(probe, true) === EOCD_SIGNATURE) {
        eocdOffset = probe;
        break;
      }
      probe -= 1;
    }
    if (eocdOffset < 0) {
      return { totalUncompressed: 0, entryCount: 0 };
    }

    const entryCount = view.getUint16(eocdOffset + 10, true);
    if (entryCount > maxEntries) {
      throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
    }

    const directorySize = view.getUint32(eocdOffset + 12, true);
    const directoryStart = view.getUint32(eocdOffset + 16, true);
    const directoryEnd = directoryStart + directorySize;
    if (directoryEnd > byteCount) {
      // Declared central directory lies outside the buffer: nothing to sum.
      return { totalUncompressed: 0, entryCount };
    }

    let totalUncompressed = 0;
    let cursor = directoryStart;
    let visited = 0;
    while (visited < entryCount && cursor + 46 <= directoryEnd) {
      if (view.getUint32(cursor, true) !== CENTRAL_HEADER_SIGNATURE) {
        break;
      }
      totalUncompressed += view.getUint32(cursor + 24, true);
      const nameBytes = view.getUint16(cursor + 28, true);
      const extraBytes = view.getUint16(cursor + 30, true);
      const commentBytes = view.getUint16(cursor + 32, true);
      // Fixed 46-byte header followed by the three variable-length fields.
      cursor += 46 + nameBytes + extraBytes + commentBytes;
      visited += 1;
    }

    if (totalUncompressed > maxUncompressedSize) {
      throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
    }
    return { totalUncompressed, entryCount };
  } catch (err) {
    // Limit violations propagate; every other failure (bad buffer, read past
    // the end, ...) degrades to "nothing detected".
    if (err instanceof KordocError) {
      throw err;
    }
    return { totalUncompressed: 0, entryCount: 0 };
  }
}
106
/**
 * Accepts a link target only when, after trimming, it is non-empty and
 * matches the `SAFE_HREF_RE` allow-list.
 *
 * @param {string} href Raw link target.
 * @returns {string|null} The trimmed target, or `null` when it is empty or
 *   not allowed.
 */
function sanitizeHref(href) {
  const candidate = href.trim();
  if (candidate === "") {
    return null;
  }
  return SAFE_HREF_RE.test(candidate) ? candidate : null;
}
111
/**
 * Maps an error to an error-code string by looking for known fragments in its
 * message. Rules are evaluated in order and the first match wins; anything
 * that is not an `Error`, or matches no rule, is classified as "PARSE_ERROR".
 *
 * @param {unknown} err The thrown value.
 * @returns {string} One of "ENCRYPTED", "DRM_PROTECTED", "ZIP_BOMB",
 *   "DECOMPRESSION_BOMB", "IMAGE_BASED_PDF", "NO_SECTIONS", "CORRUPTED" or
 *   "PARSE_ERROR".
 */
function classifyError(err) {
  if (!(err instanceof Error)) {
    return "PARSE_ERROR";
  }
  const text = err.message;
  const mentions = (...fragments) => fragments.some((fragment) => text.includes(fragment));

  // Order matters: the ZIP-specific rule must run before the generic
  // "bomb" / size-exceeded rule, whose fragments it also contains.
  const rules = [
    ["ENCRYPTED", () => mentions("\uC554\uD638\uD654")],
    ["DRM_PROTECTED", () => mentions("DRM")],
    ["ZIP_BOMB", () => mentions("ZIP bomb", "ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC", "ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")],
    ["DECOMPRESSION_BOMB", () => mentions("bomb", "\uD06C\uAE30 \uCD08\uACFC", "\uC555\uCD95 \uD574\uC81C")],
    ["IMAGE_BASED_PDF", () => mentions("\uC774\uBBF8\uC9C0 \uAE30\uBC18")],
    ["NO_SECTIONS", () => mentions("\uC139\uC158") && mentions("\uCC3E\uC744 \uC218 \uC5C6", "\uC5C6\uC74C")],
    ["CORRUPTED", () => mentions("\uC2DC\uADF8\uB2C8\uCC98", "\uBCF5\uAD6C\uD560 \uC218 \uC5C6")],
  ];
  for (const [code, applies] of rules) {
    if (applies()) {
      return code;
    }
  }
  return "PARSE_ERROR";
}
123
/**
 * Coerces any thrown value into a `KordocError` carrying `code` and `stage`.
 *
 * An existing `KordocError` is returned as-is (same instance); only a missing
 * `stage` / `code` is filled in on it. Other `Error`s are wrapped using their
 * own message and a code derived by `classifyError`; non-errors use the
 * fallback message and fallback code.
 *
 * @param {unknown} err The thrown value.
 * @param {string} fallbackMessage Message used when `err` supplies none.
 * @param {string} [stage="unknown"] Stage recorded on the result.
 * @param {string} [fallbackCode="PARSE_ERROR"] Code used when none can be derived.
 * @returns {KordocError} The normalized error.
 */
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
  if (!(err instanceof KordocError)) {
    const isRealError = err instanceof Error;
    const text = isRealError ? err.message : fallbackMessage;
    const code = isRealError ? classifyError(err) : fallbackCode;
    // An empty message on a real Error also falls back.
    return new KordocError(text || fallbackMessage, { code, stage });
  }
  // Intentionally mutates the caller's error: only absent fields are filled.
  err.stage ||= stage;
  err.code ||= fallbackCode;
  return err;
}
133
// Bindings of the bundled `src/utils.ts` module. They stay `undefined` until
// `init_utils()` has run.
var VERSION, KordocError, SAFE_HREF_RE;

/**
 * Initializer for the bundled `src/utils.ts` module: assigns `VERSION`,
 * `KordocError` and `SAFE_HREF_RE`.
 *
 * NOTE(review): `__esm` is defined outside this excerpt; it is presumably the
 * bundler's run-once lazy-init wrapper — confirm before relying on that.
 */
var init_utils = __esm({
  "src/utils.ts"() {
    "use strict";
    // Must match the published package version. The previous literal
    // ("2.5.2", behind a dead `true ? … : "0.0.0-dev"` ternary) lagged behind
    // the 2.6.0 release this bundle ships in.
    VERSION = "2.6.0";

    /**
     * Library error type carrying an optional machine-readable `code` and the
     * pipeline `stage` in which the failure happened.
     */
    KordocError = class extends Error {
      code;
      stage;
      /**
       * @param {string} message Error message.
       * @param {{code?: string, stage?: string}} [opts] Optional metadata.
       */
      constructor(message, opts = {}) {
        super(message);
        this.name = "KordocError";
        this.code = opts.code;
        this.stage = opts.stage;
      }
    };

    // Allow-list of link targets: http(s), mailto, tel and in-page anchors.
    SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
  }
});
151
+
40
152
  // src/page-range.ts
41
153
  var page_range_exports = {};
42
154
  __export(page_range_exports, {
@@ -2398,15 +2510,48 @@ var OCR_PROMPT, _tempDir;
2398
2510
  var init_cli_provider = __esm({
2399
2511
  "src/ocr/cli-provider.ts"() {
2400
2512
  "use strict";
2401
- OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.
2402
- \uADDC\uCE59:
2403
- - \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)
2404
- - \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC
2405
- - \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9
2406
- - \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9
2407
- - \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC
2408
- - \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0
2409
- - \`\`\`\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825`;
2513
+ OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, \uBA85\uBC31\uD55C OCR \uC624\uC778\uC2DD\uACFC \uD45C \uAD6C\uC870 \uAE68\uC9D0\uB9CC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uB97C \uCD9C\uB825\uD558\uC138\uC694.
2514
+
2515
+ [\uAE30\uBCF8 \uCD94\uCD9C \uADDC\uCE59]
2516
+ - \uBCF8\uBB38, \uD45C, \uC81C\uBAA9, \uB9AC\uC2A4\uD2B8, \uCEA1\uC158\uC744 \uC6D0\uBB38 \uAD6C\uC870 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uBCC0\uD658
2517
+ - \uD5E4\uB529\uC740 \uC2DC\uAC01\uC801 \uD06C\uAE30\xB7\uAD75\uAE30\uC5D0 \uB530\uB77C # ~ ###### \uC0AC\uC6A9
2518
+ - \uB9AC\uC2A4\uD2B8\uB294 -, 1. \uC0AC\uC6A9 (\uC6D0\uBB38 \uBC88\uD638 \uCCB4\uACC4\uAC00 \u2460, \uAC00., 1) \uB4F1\uC774\uBA74 \uADF8 \uD45C\uAE30 \uC720\uC9C0)
2519
+ - \uC774\uBBF8\uC9C0\xB7\uB3C4\uD615\xB7\uB85C\uACE0\xB7\uD398\uC774\uC9C0\uBC88\uD638\xB7\uBA38\uB9AC\uAE00/\uBC14\uB2E5\uAE00\uC740 \uBB34\uC2DC
2520
+ - \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C(\uC88C\u2192\uC6B0, \uC704\u2192\uC544\uB798, \uB2E4\uB2E8\uC774\uBA74 \uB2E8\uBCC4\uB85C)\uB97C \uC720\uC9C0
2521
+
2522
+ [\uD45C \uCD94\uCD9C \uADDC\uCE59 \u2014 \uAC00\uC7A5 \uC911\uC694]
2523
+ - \uD45C\uB294 \uBC18\uB4DC\uC2DC Markdown \uD30C\uC774\uD504 \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9: \uD5E4\uB354 \uD589 + |---| \uAD6C\uBD84\uC120 + \uB370\uC774\uD130 \uD589
2524
+ - \uC2DC\uAC01\uC801\uC73C\uB85C \uAD75\uAC70\uB098 \uC74C\uC601(\uD68C\uC0C9/\uC0C9\uC0C1) \uBC30\uACBD\uC774 \uC788\uB294 \uD589\uC744 \uD5E4\uB354\uB85C \uC2DD\uBCC4. \uD5E4\uB354\uAC00 \uC5C6\uB294 \uD45C\uB77C\uB3C4 \uCCAB \uD589\uC744 \uD5E4\uB354\uB85C \uB450\uACE0 |---| \uAD6C\uBD84\uC120 \uCD94\uAC00
2525
+ - \uBAA8\uB4E0 \uD589\uC758 \uD30C\uC774\uD504(|) \uAC1C\uC218\uB97C \uD5E4\uB354\uC640 \uB3D9\uC77C\uD558\uAC8C \uB9DE\uCD9C \uAC83 \u2014 \uBE48 \uC140\uC740 \uACF5\uBC31\uC73C\uB85C \uCC44\uC6B0\uACE0 \uC808\uB300 \uC140\uC744 \uC0DD\uB7B5\uD558\uC9C0 \uB9D0 \uAC83
2526
+ - \uC140 \uC548\uC758 \uC904\uBC14\uAFC8\uC740 <br>\uB85C \uD45C\uAE30 (\uC2E4\uC81C \uAC1C\uD589 \uBB38\uC790 \uC0AC\uC6A9 \uAE08\uC9C0 \u2014 \uD45C\uAC00 \uAE68\uC9D0)
2527
+ - \uBCD1\uD569 \uC140 \uCC98\uB9AC:
2528
+ - \uAC00\uB85C \uBCD1\uD569(colspan): \uBCD1\uD569\uB41C \uC140 \uB0B4\uC6A9\uC744 \uCCAB \uCE78\uC5D0 \uC4F0\uACE0 \uB098\uBA38\uC9C0 \uCE78\uC740 \uBE48 \uCE78\uC73C\uB85C \uB458 \uAC83
2529
+ - \uC138\uB85C \uBCD1\uD569(rowspan): \uBCD1\uD569\uB41C \uBAA8\uB4E0 \uD589\uC758 \uD574\uB2F9 \uCE78\uC5D0 \uB3D9\uC77C\uD55C \uB0B4\uC6A9\uC744 \uBC18\uBCF5 \uAE30\uC7AC\uD560 \uAC83
2530
+ - 2\uB2E8 \uD5E4\uB354(\uD5E4\uB354\uAC00 \uB450 \uC904\uC778 \uACBD\uC6B0): \uC0C1\uC704/\uD558\uC704 \uD5E4\uB354\uB97C " / "\uB85C \uD569\uCCD0 \uB2E8\uC77C \uD5E4\uB354 \uD589\uC73C\uB85C \uB9CC\uB4E4 \uAC83 (\uC608: "2024 / 1\uBD84\uAE30")
2531
+ - \uD45C \uC704\xB7\uC544\uB798\uC758 \uCEA1\uC158(\uC608: "<\uD45C 1> ...", "[\uD45C 2-1]")\uC740 \uD45C \uBC14\uB85C \uC704\uC5D0 \uBCC4\uB3C4 \uC904\uB85C \uBCF4\uC874
2532
+ - \uD55C \uD398\uC774\uC9C0\uC5D0\uC11C \uD45C\uAC00 \uC911\uAC04\uC5D0 \uB04A\uC5B4\uC838 \uBCF4\uC774\uB354\uB77C\uB3C4, \uD5E4\uB354\uAC00 \uB3D9\uC77C\uD558\uBA74 \uD558\uB098\uC758 \uD45C\uB85C \uC774\uC5B4\uBD99\uC77C \uAC83
2533
+ - \uC88C\uCE21 \uCCAB \uCEEC\uB7FC\uC774 \uC138\uB85C\uC4F0\uAE30 \uB77C\uBCA8(\uAD6C\uBD84/\uBD84\uB958/\uC5F0\uB3C4 \uB4F1)\uC774\uB77C\uB3C4 \uC77C\uBC18 \uC140\uB85C \uBCC0\uD658\uD558\uC5EC \uB204\uB77D \uAE08\uC9C0
2534
+
2535
+ [OCR \uC624\uC778\uC2DD\xB7\uAE68\uC9D0 \uAD50\uC815 \u2014 \uD5C8\uC6A9 \uBC94\uC704\uB9CC]
2536
+ - \uC22B\uC790 \uCE78\uC5D0\uC11C 'O'\u2192'0', 'l/I'\u2192'1' \uB4F1 \uB9E5\uB77D\uC0C1 \uBA85\uBC31\uD55C \uC624\uC778\uC2DD\uB9CC \uAD50\uC815
2537
+ - \uB2E8\uC5B4 \uC911\uAC04\uC5D0 \uC798\uBABB \uC0BD\uC785\uB41C \uACF5\uBC31 \uC81C\uAC70 (\uC608: "\uC8FC\uAC70 \uC885 \uD569 \uACC4\uD68D" \u2192 "\uC8FC\uAC70\uC885\uD569\uACC4\uD68D")
2538
+ - \uC904\uBC14\uAFC8 \uC624\uB958\uB85C \uB04A\uAE34 \uBB38\uC7A5\uC740 \uC758\uBBF8 \uB2E8\uC704\uB85C \uBCD1\uD569 (\uB2E8, \uD45C \uC140 \uB0B4\uBD80\uB294 <br> \uC720\uC9C0)
2539
+ - \uD45C\uC5D0\uC11C \uD589 \uC5B4\uAE0B\uB0A8\uC774 \uC758\uC2EC\uB418\uBA74(\uCEEC\uB7FC \uC218 \uBD88\uC77C\uCE58) \uBE48 \uC140\uB85C \uD328\uB529\uD558\uC5EC \uCEEC\uB7FC \uC815\uD569\uC131\uC744 \uC6B0\uC120 \uD655\uBCF4
2540
+
2541
+ [\uC808\uB300 \uAE08\uC9C0 \uC0AC\uD56D]
2542
+ - \uBB38\uC7A5\xB7\uB2E8\uB77D\xB7\uD56D\uBAA9\xB7\uD45C \uD589/\uC5F4\uC744 \uCD94\uAC00\uD558\uAC70\uB098 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83
2543
+ - \uC22B\uC790, \uD37C\uC13C\uD2B8, \uB0A0\uC9DC, \uB2E8\uC704, \uAE08\uC561\uC744 \uC808\uB300 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83
2544
+ - \uACE0\uC720\uBA85\uC0AC, \uAE30\uAD00\uBA85, \uBC95\uB839\uBA85, \uC9C0\uBA85, \uC778\uBA85\uC744 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83
2545
+ - \uD45C\uC758 \uD5E4\uB354 \uD14D\uC2A4\uD2B8, \uCEA1\uC158, \uD589/\uC5F4 \uAC1C\uC218\uB97C \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83
2546
+ - \uCCB4\uD06C\uBC15\uC2A4(\u2611/\u2610/\u25A0/\u25A1), \uD2B9\uC218\uBB38\uC790(\u203B, \u2460\u2461\u2462, \u3260, \uAC00., \uB098.)\uB97C \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83
2547
+ - \uC6D0\uBB38\uC5D0 \uC5C6\uB294 \uB0B4\uC6A9\uC744 \uC694\uC57D\xB7\uBCF4\uC644\xB7\uCD94\uB860\uD558\uC9C0 \uB9D0 \uAC83
2548
+ - \`\`\`, \`\`\`markdown \uAC19\uC740 \uCF54\uB4DC\uD39C\uC2A4\uB85C \uAC10\uC2F8\uC9C0 \uB9D0 \uAC83
2549
+ - \uC124\uBA85\xB7\uC8FC\uC11D\xB7\uBA54\uD0C0 \uBB38\uC7A5 \uCD94\uAC00 \uAE08\uC9C0
2550
+
2551
+ [\uBD88\uD655\uC2E4\uD560 \uB54C]
2552
+ - \uAE00\uC790\uAC00 \uD750\uB9BF\uD558\uAC70\uB098 \uD310\uB3C5 \uBD88\uAC00\uD558\uBA74 \uBCF4\uC774\uB294 \uADF8\uB300\uB85C \uB450\uAC70\uB098, \uBD88\uAC00\uD53C\uD558\uBA74 \uD55C \uAE00\uC790\uB9CC ?\uB85C \uD45C\uC2DC
2553
+ - \uD45C \uAD6C\uC870\uAC00 \uBAA8\uD638\uD558\uBA74 \uCEEC\uB7FC \uC218 \uC77C\uCE58 + \uBE48 \uC140 \uD328\uB529 \uD615\uD0DC\uB85C \uCD9C\uB825
2554
+ - \uCD94\uCE21\uBCF4\uB2E4 \uC6D0\uBB38 \uBCF4\uC874\uC744 \uD56D\uC0C1 \uC6B0\uC120`;
2410
2555
  _tempDir = null;
2411
2556
  }
2412
2557
  });
@@ -2595,7 +2740,51 @@ var BATCH_OCR_PROMPT, DEFAULT_BATCH_SIZES, _batchTempDir;
2595
2740
  var init_batch_provider = __esm({
2596
2741
  "src/ocr/batch-provider.ts"() {
2597
2742
  "use strict";
2598
- BATCH_OCR_PROMPT = "\uB2E4\uC74C \uBB38\uC11C \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uB4E4\uC744 OCR\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.\n\n\uADDC\uCE59:\n- \uAC01 \uD398\uC774\uC9C0 \uACB0\uACFC \uC0AC\uC774\uC5D0 \uBC18\uB4DC\uC2DC \uC774 \uAD6C\uBD84\uC790\uB97C \uC0BD\uC785: <!-- PAGE_BREAK -->\n- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)\n- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC\n- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9\n- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9\n- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC\n- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0\n- ```\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825";
2743
+ BATCH_OCR_PROMPT = `\uB2E4\uC74C \uBB38\uC11C \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uB4E4\uC744 OCR\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, \uBA85\uBC31\uD55C OCR \uC624\uC778\uC2DD\uACFC \uD45C \uAD6C\uC870 \uAE68\uC9D0\uB9CC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uB97C \uCD9C\uB825\uD558\uC138\uC694.
2744
+
2745
+ [\uD398\uC774\uC9C0 \uAD6C\uBD84 \u2014 \uD544\uC218]
2746
+ - \uAC01 \uD398\uC774\uC9C0 \uACB0\uACFC \uC0AC\uC774\uC5D0 \uBC18\uB4DC\uC2DC \uC774 \uAD6C\uBD84\uC790\uB97C \uC0BD\uC785: <!-- PAGE_BREAK -->
2747
+
2748
+ [\uAE30\uBCF8 \uCD94\uCD9C \uADDC\uCE59]
2749
+ - \uBCF8\uBB38, \uD45C, \uC81C\uBAA9, \uB9AC\uC2A4\uD2B8, \uCEA1\uC158\uC744 \uC6D0\uBB38 \uAD6C\uC870 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uBCC0\uD658
2750
+ - \uD5E4\uB529\uC740 \uC2DC\uAC01\uC801 \uD06C\uAE30\xB7\uAD75\uAE30\uC5D0 \uB530\uB77C # ~ ###### \uC0AC\uC6A9
2751
+ - \uB9AC\uC2A4\uD2B8\uB294 -, 1. \uC0AC\uC6A9 (\uC6D0\uBB38 \uBC88\uD638 \uCCB4\uACC4\uAC00 \u2460, \uAC00., 1) \uB4F1\uC774\uBA74 \uADF8 \uD45C\uAE30 \uC720\uC9C0)
2752
+ - \uC774\uBBF8\uC9C0\xB7\uB3C4\uD615\xB7\uB85C\uACE0\xB7\uD398\uC774\uC9C0\uBC88\uD638\xB7\uBA38\uB9AC\uAE00/\uBC14\uB2E5\uAE00\uC740 \uBB34\uC2DC
2753
+ - \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C(\uC88C\u2192\uC6B0, \uC704\u2192\uC544\uB798, \uB2E4\uB2E8\uC774\uBA74 \uB2E8\uBCC4\uB85C)\uB97C \uC720\uC9C0
2754
+
2755
+ [\uD45C \uCD94\uCD9C \uADDC\uCE59 \u2014 \uAC00\uC7A5 \uC911\uC694]
2756
+ - \uD45C\uB294 \uBC18\uB4DC\uC2DC Markdown \uD30C\uC774\uD504 \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9: \uD5E4\uB354 \uD589 + |---| \uAD6C\uBD84\uC120 + \uB370\uC774\uD130 \uD589
2757
+ - \uC2DC\uAC01\uC801\uC73C\uB85C \uAD75\uAC70\uB098 \uC74C\uC601(\uD68C\uC0C9/\uC0C9\uC0C1) \uBC30\uACBD\uC774 \uC788\uB294 \uD589\uC744 \uD5E4\uB354\uB85C \uC2DD\uBCC4. \uD5E4\uB354\uAC00 \uC5C6\uB294 \uD45C\uB77C\uB3C4 \uCCAB \uD589\uC744 \uD5E4\uB354\uB85C \uB450\uACE0 |---| \uAD6C\uBD84\uC120 \uCD94\uAC00
2758
+ - \uBAA8\uB4E0 \uD589\uC758 \uD30C\uC774\uD504(|) \uAC1C\uC218\uB97C \uD5E4\uB354\uC640 \uB3D9\uC77C\uD558\uAC8C \uB9DE\uCD9C \uAC83 \u2014 \uBE48 \uC140\uC740 \uACF5\uBC31\uC73C\uB85C \uCC44\uC6B0\uACE0 \uC808\uB300 \uC140\uC744 \uC0DD\uB7B5\uD558\uC9C0 \uB9D0 \uAC83
2759
+ - \uC140 \uC548\uC758 \uC904\uBC14\uAFC8\uC740 <br>\uB85C \uD45C\uAE30 (\uC2E4\uC81C \uAC1C\uD589 \uBB38\uC790 \uC0AC\uC6A9 \uAE08\uC9C0 \u2014 \uD45C\uAC00 \uAE68\uC9D0)
2760
+ - \uBCD1\uD569 \uC140 \uCC98\uB9AC:
2761
+ - \uAC00\uB85C \uBCD1\uD569(colspan): \uBCD1\uD569\uB41C \uC140 \uB0B4\uC6A9\uC744 \uCCAB \uCE78\uC5D0 \uC4F0\uACE0 \uB098\uBA38\uC9C0 \uCE78\uC740 \uBE48 \uCE78\uC73C\uB85C \uB458 \uAC83
2762
+ - \uC138\uB85C \uBCD1\uD569(rowspan): \uBCD1\uD569\uB41C \uBAA8\uB4E0 \uD589\uC758 \uD574\uB2F9 \uCE78\uC5D0 \uB3D9\uC77C\uD55C \uB0B4\uC6A9\uC744 \uBC18\uBCF5 \uAE30\uC7AC\uD560 \uAC83
2763
+ - 2\uB2E8 \uD5E4\uB354(\uD5E4\uB354\uAC00 \uB450 \uC904\uC778 \uACBD\uC6B0): \uC0C1\uC704/\uD558\uC704 \uD5E4\uB354\uB97C " / "\uB85C \uD569\uCCD0 \uB2E8\uC77C \uD5E4\uB354 \uD589\uC73C\uB85C \uB9CC\uB4E4 \uAC83 (\uC608: "2024 / 1\uBD84\uAE30")
2764
+ - \uD45C \uC704\xB7\uC544\uB798\uC758 \uCEA1\uC158(\uC608: "<\uD45C 1> ...", "[\uD45C 2-1]")\uC740 \uD45C \uBC14\uB85C \uC704\uC5D0 \uBCC4\uB3C4 \uC904\uB85C \uBCF4\uC874
2765
+ - \uD55C \uD398\uC774\uC9C0\uC5D0\uC11C \uD45C\uAC00 \uC911\uAC04\uC5D0 \uB04A\uC5B4\uC838 \uBCF4\uC774\uB354\uB77C\uB3C4, \uD5E4\uB354\uAC00 \uB3D9\uC77C\uD558\uBA74 \uD558\uB098\uC758 \uD45C\uB85C \uC774\uC5B4\uBD99\uC77C \uAC83
2766
+ - \uC88C\uCE21 \uCCAB \uCEEC\uB7FC\uC774 \uC138\uB85C\uC4F0\uAE30 \uB77C\uBCA8(\uAD6C\uBD84/\uBD84\uB958/\uC5F0\uB3C4 \uB4F1)\uC774\uB77C\uB3C4 \uC77C\uBC18 \uC140\uB85C \uBCC0\uD658\uD558\uC5EC \uB204\uB77D \uAE08\uC9C0
2767
+
2768
+ [OCR \uC624\uC778\uC2DD\xB7\uAE68\uC9D0 \uAD50\uC815 \u2014 \uD5C8\uC6A9 \uBC94\uC704\uB9CC]
2769
+ - \uC22B\uC790 \uCE78\uC5D0\uC11C 'O'\u2192'0', 'l/I'\u2192'1' \uB4F1 \uB9E5\uB77D\uC0C1 \uBA85\uBC31\uD55C \uC624\uC778\uC2DD\uB9CC \uAD50\uC815
2770
+ - \uB2E8\uC5B4 \uC911\uAC04\uC5D0 \uC798\uBABB \uC0BD\uC785\uB41C \uACF5\uBC31 \uC81C\uAC70 (\uC608: "\uC8FC\uAC70 \uC885 \uD569 \uACC4\uD68D" \u2192 "\uC8FC\uAC70\uC885\uD569\uACC4\uD68D")
2771
+ - \uC904\uBC14\uAFC8 \uC624\uB958\uB85C \uB04A\uAE34 \uBB38\uC7A5\uC740 \uC758\uBBF8 \uB2E8\uC704\uB85C \uBCD1\uD569 (\uB2E8, \uD45C \uC140 \uB0B4\uBD80\uB294 <br> \uC720\uC9C0)
2772
+ - \uD45C\uC5D0\uC11C \uD589 \uC5B4\uAE0B\uB0A8\uC774 \uC758\uC2EC\uB418\uBA74(\uCEEC\uB7FC \uC218 \uBD88\uC77C\uCE58) \uBE48 \uC140\uB85C \uD328\uB529\uD558\uC5EC \uCEEC\uB7FC \uC815\uD569\uC131\uC744 \uC6B0\uC120 \uD655\uBCF4
2773
+
2774
+ [\uC808\uB300 \uAE08\uC9C0 \uC0AC\uD56D]
2775
+ - \uBB38\uC7A5\xB7\uB2E8\uB77D\xB7\uD56D\uBAA9\xB7\uD45C \uD589/\uC5F4\uC744 \uCD94\uAC00\uD558\uAC70\uB098 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83
2776
+ - \uC22B\uC790, \uD37C\uC13C\uD2B8, \uB0A0\uC9DC, \uB2E8\uC704, \uAE08\uC561\uC744 \uC808\uB300 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83
2777
+ - \uACE0\uC720\uBA85\uC0AC, \uAE30\uAD00\uBA85, \uBC95\uB839\uBA85, \uC9C0\uBA85, \uC778\uBA85\uC744 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83
2778
+ - \uD45C\uC758 \uD5E4\uB354 \uD14D\uC2A4\uD2B8, \uCEA1\uC158, \uD589/\uC5F4 \uAC1C\uC218\uB97C \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83
2779
+ - \uCCB4\uD06C\uBC15\uC2A4(\u2611/\u2610/\u25A0/\u25A1), \uD2B9\uC218\uBB38\uC790(\u203B, \u2460\u2461\u2462, \u3260, \uAC00., \uB098.)\uB97C \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83
2780
+ - \uC6D0\uBB38\uC5D0 \uC5C6\uB294 \uB0B4\uC6A9\uC744 \uC694\uC57D\xB7\uBCF4\uC644\xB7\uCD94\uB860\uD558\uC9C0 \uB9D0 \uAC83
2781
+ - \`\`\`, \`\`\`markdown \uAC19\uC740 \uCF54\uB4DC\uD39C\uC2A4\uB85C \uAC10\uC2F8\uC9C0 \uB9D0 \uAC83
2782
+ - \uC124\uBA85\xB7\uC8FC\uC11D\xB7\uBA54\uD0C0 \uBB38\uC7A5 \uCD94\uAC00 \uAE08\uC9C0
2783
+
2784
+ [\uBD88\uD655\uC2E4\uD560 \uB54C]
2785
+ - \uAE00\uC790\uAC00 \uD750\uB9BF\uD558\uAC70\uB098 \uD310\uB3C5 \uBD88\uAC00\uD558\uBA74 \uBCF4\uC774\uB294 \uADF8\uB300\uB85C \uB450\uAC70\uB098, \uBD88\uAC00\uD53C\uD558\uBA74 \uD55C \uAE00\uC790\uB9CC ?\uB85C \uD45C\uC2DC
2786
+ - \uD45C \uAD6C\uC870\uAC00 \uBAA8\uD638\uD558\uBA74 \uCEEC\uB7FC \uC218 \uC77C\uCE58 + \uBE48 \uC140 \uD328\uB529 \uD615\uD0DC\uB85C \uCD9C\uB825
2787
+ - \uCD94\uCE21\uBCF4\uB2E4 \uC6D0\uBB38 \uBCF4\uC874\uC744 \uD56D\uC0C1 \uC6B0\uC120`;
2599
2788
  DEFAULT_BATCH_SIZES = {
2600
2789
  gemini: 5,
2601
2790
  claude: 5,
@@ -2982,7 +3171,7 @@ var init_provider = __esm({
2982
3171
  });
2983
3172
 
2984
3173
  // src/index.ts
2985
- import { readFile as readFile2 } from "fs/promises";
3174
+ import { readFile as readFile3 } from "fs/promises";
2986
3175
 
2987
3176
  // src/detect.ts
2988
3177
  import JSZip from "jszip";
@@ -3034,97 +3223,8 @@ async function detectZipFormat(buffer) {
3034
3223
  import JSZip2 from "jszip";
3035
3224
  import { DOMParser } from "@xmldom/xmldom";
3036
3225
 
3037
- // src/utils.ts
3038
- var VERSION = true ? "2.5.0" : "0.0.0-dev";
3039
- function toArrayBuffer(buf) {
3040
- if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
3041
- return buf.buffer;
3042
- }
3043
- return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
3044
- }
3045
- var KordocError = class extends Error {
3046
- code;
3047
- stage;
3048
- constructor(message, opts = {}) {
3049
- super(message);
3050
- this.name = "KordocError";
3051
- this.code = opts.code;
3052
- this.stage = opts.stage;
3053
- }
3054
- };
3055
- function isPathTraversal(name) {
3056
- if (name.includes("\0")) return true;
3057
- const normalized = name.replace(/\\/g, "/");
3058
- return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
3059
- }
3060
- function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
3061
- try {
3062
- const data = new DataView(buffer);
3063
- const len = buffer.byteLength;
3064
- let eocdOffset = -1;
3065
- for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
3066
- if (data.getUint32(i, true) === 101010256) {
3067
- eocdOffset = i;
3068
- break;
3069
- }
3070
- }
3071
- if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
3072
- const entryCount = data.getUint16(eocdOffset + 10, true);
3073
- if (entryCount > maxEntries) {
3074
- throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
3075
- }
3076
- const cdSize = data.getUint32(eocdOffset + 12, true);
3077
- const cdOffset = data.getUint32(eocdOffset + 16, true);
3078
- if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
3079
- let totalUncompressed = 0;
3080
- let pos = cdOffset;
3081
- for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
3082
- if (data.getUint32(pos, true) !== 33639248) break;
3083
- totalUncompressed += data.getUint32(pos + 24, true);
3084
- const nameLen = data.getUint16(pos + 28, true);
3085
- const extraLen = data.getUint16(pos + 30, true);
3086
- const commentLen = data.getUint16(pos + 32, true);
3087
- pos += 46 + nameLen + extraLen + commentLen;
3088
- }
3089
- if (totalUncompressed > maxUncompressedSize) {
3090
- throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
3091
- }
3092
- return { totalUncompressed, entryCount };
3093
- } catch (err) {
3094
- if (err instanceof KordocError) throw err;
3095
- return { totalUncompressed: 0, entryCount: 0 };
3096
- }
3097
- }
3098
- var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
3099
- function sanitizeHref(href) {
3100
- const trimmed = href.trim();
3101
- if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
3102
- return trimmed;
3103
- }
3104
- function classifyError(err) {
3105
- if (!(err instanceof Error)) return "PARSE_ERROR";
3106
- const msg = err.message;
3107
- if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
3108
- if (msg.includes("DRM")) return "DRM_PROTECTED";
3109
- if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
3110
- if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
3111
- if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
3112
- if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
3113
- if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
3114
- return "PARSE_ERROR";
3115
- }
3116
- function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
3117
- if (err instanceof KordocError) {
3118
- if (!err.stage) err.stage = stage;
3119
- if (!err.code) err.code = fallbackCode;
3120
- return err;
3121
- }
3122
- const message = err instanceof Error ? err.message : fallbackMessage;
3123
- const code = err instanceof Error ? classifyError(err) : fallbackCode;
3124
- return new KordocError(message || fallbackMessage, { code, stage });
3125
- }
3126
-
3127
3226
  // src/table/builder.ts
3227
+ init_utils();
3128
3228
  var MAX_COLS = 200;
3129
3229
  var MAX_ROWS = 1e4;
3130
3230
  function buildTable(rows) {
@@ -3384,6 +3484,8 @@ var HEADING_RATIO_H2 = 1.3;
3384
3484
  var HEADING_RATIO_H3 = 1.15;
3385
3485
 
3386
3486
  // src/hwpx/parser.ts
3487
+ init_utils();
3488
+ init_utils();
3387
3489
  init_page_range();
3388
3490
  init_logger();
3389
3491
  var MAX_DECOMPRESS_SIZE = 500 * 1024 * 1024;
@@ -4225,6 +4327,7 @@ function extractTextFromNode(node) {
4225
4327
  }
4226
4328
 
4227
4329
  // src/hwp5/record.ts
4330
+ init_utils();
4228
4331
  import { inflateRawSync, inflateSync } from "zlib";
4229
4332
  var TAG_PARA_HEADER = 66;
4230
4333
  var TAG_PARA_TEXT = 67;
@@ -5275,6 +5378,7 @@ function parseLenientCfb(data) {
5275
5378
  }
5276
5379
 
5277
5380
  // src/hwp5/parser.ts
5381
+ init_utils();
5278
5382
  init_page_range();
5279
5383
  init_logger();
5280
5384
  var CFB = __toESM(require_cfb(), 1);
@@ -5930,6 +6034,7 @@ function arrangeCells(rows, cols, cells) {
5930
6034
  }
5931
6035
 
5932
6036
  // src/pdf/parser.ts
6037
+ init_utils();
5933
6038
  init_page_range();
5934
6039
  import { createRequire } from "module";
5935
6040
  import { dirname as dirname2, join as join3, resolve as resolve2 } from "path";
@@ -7821,6 +7926,7 @@ function mergeKoreanLines(text) {
7821
7926
  }
7822
7927
 
7823
7928
  // src/xlsx/parser.ts
7929
+ init_utils();
7824
7930
  import JSZip3 from "jszip";
7825
7931
  import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
7826
7932
  init_logger();
@@ -8149,6 +8255,7 @@ async function parseXlsxDocument(buffer, options, existingZip) {
8149
8255
  }
8150
8256
 
8151
8257
  // src/docx/parser.ts
8258
+ init_utils();
8152
8259
  import JSZip4 from "jszip";
8153
8260
  import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
8154
8261
  init_logger();
@@ -8630,6 +8737,7 @@ async function parseDocxDocument(buffer, options, existingZip) {
8630
8737
  }
8631
8738
 
8632
8739
  // src/index.ts
8740
+ init_utils();
8633
8741
  init_cli_provider();
8634
8742
  init_markdown_to_blocks();
8635
8743
  init_logger();
@@ -11131,6 +11239,187 @@ async function markdownToXlsx(markdown, options) {
11131
11239
  return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
11132
11240
  }
11133
11241
 
11242
+ // src/convert/index.ts
11243
+ import { readFile } from "fs/promises";
11244
+ init_utils();
11245
+
11246
+ // src/convert/libreoffice.ts
11247
+ import libre from "libreoffice-convert";
11248
+
11249
+ // src/convert/error.ts
11250
/**
 * Error raised by the LibreOffice conversion helpers. `code` is a short
 * machine-readable reason (this file uses "SOFFICE_NOT_FOUND", "TIMEOUT" and
 * "CONVERT_FAILED").
 */
var ConvertError = class extends Error {
  code;
  name = "ConvertError";
  /**
   * @param {string} code Machine-readable reason.
   * @param {string} message Human-readable description.
   */
  constructor(code, message) {
    super(message);
    this.code = code;
  }
};
11257
+
11258
+ // src/convert/libreoffice.ts
11259
+ var libreConvert = libre.convert;
11260
/**
 * Verifies that the LibreOffice `soffice` executable can be launched.
 *
 * The previous implementation pulled `runCommand` out of the utils module,
 * which does not export it. The resulting TypeError was swallowed by the
 * `catch`, so this check reported SOFFICE_NOT_FOUND even when LibreOffice was
 * installed. The probe now runs `soffice --version` directly.
 *
 * @returns {Promise<void>} Resolves when `soffice --version` exits successfully.
 * @throws {ConvertError} With code "SOFFICE_NOT_FOUND" when the executable
 *   cannot be started or exits with an error; the underlying failure is
 *   attached as `cause`.
 */
async function assertSofficeAvailable() {
  // Imported locally so this block does not depend on a top-level binding.
  const { execFile } = await import("child_process");
  try {
    await new Promise((resolveProbe, rejectProbe) => {
      execFile("soffice", ["--version"], { windowsHide: true }, (err) => {
        if (err) {
          rejectProbe(err);
          return;
        }
        resolveProbe();
      });
    });
  } catch (cause) {
    const notFound = new ConvertError(
      "SOFFICE_NOT_FOUND",
      "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694."
    );
    notFound.cause = cause;
    throw notFound;
  }
}
11271
/**
 * Converts a document buffer with `libreConvert`, bounded by a timeout.
 *
 * @param {Buffer} buffer Source document bytes.
 * @param {string} targetExt Target extension passed to the converter (e.g. ".pdf").
 * @param {number} [timeoutMs=60000] Milliseconds to wait before rejecting.
 * @returns {Promise<Buffer>} The converted document as delivered by the
 *   converter callback.
 * @throws {ConvertError} "TIMEOUT" when the timer fires first;
 *   "CONVERT_FAILED" when the converter reports an error or yields no output.
 */
async function convertBuffer(buffer, targetExt, timeoutMs = 6e4) {
  return new Promise((resolve4, reject) => {
    const onTimeout = () => {
      reject(new ConvertError("TIMEOUT", `\uBCC0\uD658 \uD0C0\uC784\uC544\uC6C3 (${timeoutMs}ms \uCD08\uACFC)`));
    };
    const timer = setTimeout(onTimeout, timeoutMs);

    const onConverted = (err, done) => {
      clearTimeout(timer);
      if (!err && done) {
        resolve4(done);
        return;
      }
      const reason = err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328";
      reject(new ConvertError("CONVERT_FAILED", reason));
    };

    // The third argument (filter) is deliberately left undefined.
    libreConvert(buffer, targetExt, void 0, onConverted);
  });
}
11293
+
11294
+ // src/convert/index.ts
11295
// Module-wide lock state: at most one holder at a time; waiters are resumed
// in the order they asked (first in, first out).
var isConverting = false;
var queue = [];

/**
 * Acquires the conversion lock.
 *
 * Resolves immediately when the lock is free; otherwise resolves once every
 * earlier waiter has acquired and released it.
 *
 * @returns {Promise<() => void>} A release function. Calling it frees the lock
 *   and hands it to the next waiter, if any.
 */
async function acquireConvertLock() {
  const release = () => {
    isConverting = false;
    const wakeNext = queue.shift();
    wakeNext?.();
  };

  if (isConverting) {
    return new Promise((grant) => {
      queue.push(() => {
        isConverting = true;
        grant(release);
      });
    });
  }

  isConverting = true;
  return release;
}
11317
/**
 * Converts an HWP or HWPX document to PDF through LibreOffice.
 *
 * Steps: load the input into a Buffer, enforce the 500 MB size cap, detect the
 * format (only "hwp" and "hwpx" are accepted), check that `soffice` is
 * available, then convert while holding the module-wide conversion lock.
 *
 * Expected failures are reported through the returned object, not thrown.
 * The one exception: an error from the `soffice` availability check that is
 * not a `ConvertError` is re-thrown.
 *
 * @param {string|Buffer|ArrayBuffer|Uint8Array} input File path or document bytes.
 * @param {{timeoutMs?: number, onProgress?: (percent: number, stage: string) => void}} [options]
 *   Optional conversion timeout and progress callback (called with
 *   `(10, "convert")` before and `(100, "done")` after the conversion).
 * @returns {Promise<
 *   {success: true, pdf: Uint8Array, sourceFormat: string} |
 *   {success: false, code: string, error: string, stage: string}
 * >} Conversion outcome.
 */
async function convertToPdf(input, options) {
  // Every failure result has the same shape and key order.
  const failure = (code, error, stage) => ({ success: false, code, error, stage });

  let buffer;
  try {
    if (typeof input === "string") {
      buffer = await readFile(input);
    } else {
      buffer = Buffer.isBuffer(input) ? input : Buffer.from(input);
    }
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return failure("PARSE_ERROR", `\uC785\uB825 \uC77D\uAE30 \uC2E4\uD328: ${reason}`, "detect");
  }

  const MAX_FILE_SIZE = 500 * 1024 * 1024;
  if (buffer.length > MAX_FILE_SIZE) {
    return failure(
      "FILE_TOO_LARGE",
      `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.length / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`,
      "detect"
    );
  }

  const format = detectFormat(toArrayBuffer(buffer));
  const isHangulDocument = format === "hwp" || format === "hwpx";
  if (!isHangulDocument) {
    return failure(
      "UNSUPPORTED_FORMAT",
      `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD3EC\uB9F7\uC785\uB2C8\uB2E4: ${format}`,
      "detect"
    );
  }

  try {
    await assertSofficeAvailable();
  } catch (err) {
    if (!(err instanceof ConvertError)) {
      throw err;
    }
    return failure(err.code, err.message, "validate");
  }

  // The lock is taken only after all cheap checks have passed.
  const releaseLock = await acquireConvertLock();
  try {
    options?.onProgress?.(10, "convert");
    const pdf = await convertBuffer(buffer, ".pdf", options?.timeoutMs);
    options?.onProgress?.(100, "done");
    return { success: true, pdf: new Uint8Array(pdf), sourceFormat: format };
  } catch (err) {
    if (err instanceof ConvertError) {
      return failure(err.code, err.message, "convert");
    }
    const code = classifyError(err);
    const message = err instanceof Error ? err.message : "\uBCC0\uD658 \uC2E4\uD328";
    return failure(code, message, "convert");
  } finally {
    releaseLock();
  }
}
11395
/**
 * Converts a document to PDF and accepts the result only when the detected
 * source format is "hwp".
 *
 * NOTE(review): the format is checked after `convertToPdf` has finished, so a
 * non-HWP input that `convertToPdf` accepts still goes through a complete
 * conversion before being rejected here.
 *
 * @param {string|Buffer|ArrayBuffer|Uint8Array} input File path or document bytes.
 * @param {object} [options] Forwarded unchanged to `convertToPdf`.
 * @returns {Promise<object>} The `convertToPdf` result, or an
 *   "UNSUPPORTED_FORMAT" failure when a successful conversion came from a
 *   different source format.
 */
async function convertHwpToPdf(input, options) {
  const outcome = await convertToPdf(input, options);
  const convertedOtherFormat = outcome.success && outcome.sourceFormat !== "hwp";
  if (!convertedOtherFormat) {
    return outcome;
  }
  return {
    success: false,
    code: "UNSUPPORTED_FORMAT",
    error: `HWP 5.x \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${outcome.sourceFormat}`,
    stage: "detect"
  };
}
11407
/**
 * Converts a document to PDF and accepts the result only when the detected
 * source format is "hwpx".
 *
 * NOTE(review): the format is checked after `convertToPdf` has finished, so a
 * non-HWPX input that `convertToPdf` accepts still goes through a complete
 * conversion before being rejected here.
 *
 * @param {string|Buffer|ArrayBuffer|Uint8Array} input File path or document bytes.
 * @param {object} [options] Forwarded unchanged to `convertToPdf`.
 * @returns {Promise<object>} The `convertToPdf` result, or an
 *   "UNSUPPORTED_FORMAT" failure when a successful conversion came from a
 *   different source format.
 */
async function convertHwpxToPdf(input, options) {
  const outcome = await convertToPdf(input, options);
  const convertedOtherFormat = outcome.success && outcome.sourceFormat !== "hwpx";
  if (!convertedOtherFormat) {
    return outcome;
  }
  return {
    success: false,
    code: "UNSUPPORTED_FORMAT",
    error: `HWPX \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${outcome.sourceFormat}`,
    stage: "detect"
  };
}
11419
+
11420
+ // src/index.ts
11421
+ init_utils();
11422
+
11134
11423
  // src/ocr/api-key-rotation.ts
11135
11424
  var AllKeysCoolingDownError = class extends Error {
11136
11425
  waitMs;
@@ -11225,11 +11514,10 @@ var ApiKeyRotationPool = class _ApiKeyRotationPool {
11225
11514
  };
11226
11515
 
11227
11516
  // src/pipeline/unified-ocr.ts
11228
- import { mkdir, readdir, readFile, stat, writeFile } from "fs/promises";
11517
+ import { mkdir, readdir, readFile as readFile2, stat, writeFile } from "fs/promises";
11229
11518
  import { basename as basename2, dirname as dirname3, extname, join as join4, resolve as resolve3 } from "path";
11230
11519
  import { spawn as spawn2 } from "child_process";
11231
11520
  import { performance } from "perf_hooks";
11232
- import libre from "libreoffice-convert";
11233
11521
  init_logger();
11234
11522
 
11235
11523
  // src/pipeline/bounded-queue.ts
@@ -11291,7 +11579,6 @@ var BoundedQueue = class {
11291
11579
  };
11292
11580
 
11293
11581
  // src/pipeline/unified-ocr.ts
11294
- var libreConvert = libre.convert;
11295
11582
  var UnifiedOcrError = class extends Error {
11296
11583
  code;
11297
11584
  stage;
@@ -11406,8 +11693,8 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11406
11693
  if (extname(absInput).toLowerCase() !== ".pdf") {
11407
11694
  await assertSofficeAvailable();
11408
11695
  workingPdfPath = join4(workspaceDir, `${stem}.pdf`);
11409
- const inputBuffer = await readFile(absInput);
11410
- const out = await convertWithLibreOffice(inputBuffer, ".pdf");
11696
+ const inputBuffer = await readFile2(absInput);
11697
+ const out = await convertBuffer(inputBuffer, ".pdf");
11411
11698
  await writeFile(workingPdfPath, out);
11412
11699
  }
11413
11700
  timingsMs.convert = elapsedMs(convertStart);
@@ -11458,7 +11745,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11458
11745
  const keyCount = keyPool.snapshot().length;
11459
11746
  const workerCount = Math.max(1, keyCount * concurrencyPerKey);
11460
11747
  const queueCapacity = workerCount * 2;
11461
- const queue = new BoundedQueue(queueCapacity);
11748
+ const queue2 = new BoundedQueue(queueCapacity);
11462
11749
  const ocrStart = performance.now();
11463
11750
  currentStage = "ocr";
11464
11751
  markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (\uC6CC\uCEE4 ${workerCount}\uAC1C)`);
@@ -11466,17 +11753,17 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11466
11753
  let renderDone = 1;
11467
11754
  const renderProducer = (async () => {
11468
11755
  try {
11469
- await queue.enqueue({ pageNumber: 1, imagePath: probeImage });
11756
+ await queue2.enqueue({ pageNumber: 1, imagePath: probeImage });
11470
11757
  if (totalPages > 1) {
11471
11758
  for await (const item of renderPdfToPngStream(workingPdfPath, join4(imagesDir, "page"), dpi, totalPages, 2)) {
11472
- await queue.enqueue(item);
11759
+ await queue2.enqueue(item);
11473
11760
  renderDone++;
11474
11761
  markStageProgress("render", Math.round(renderDone / totalPages * 100), renderDone, totalPages, `\uD398\uC774\uC9C0 ${renderDone}/${totalPages} \uB80C\uB354\uB9C1`);
11475
11762
  logStage("debug", "render", "progress", "\uD398\uC774\uC9C0 \uB80C\uB354 \uC644\uB8CC", { page: item.pageNumber });
11476
11763
  }
11477
11764
  }
11478
11765
  } finally {
11479
- queue.close();
11766
+ queue2.close();
11480
11767
  timingsMs.render = elapsedMs(renderStart);
11481
11768
  markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
11482
11769
  logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: renderDone, elapsedMs: timingsMs.render });
@@ -11485,7 +11772,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11485
11772
  const [, pageResultsMap] = await Promise.all([
11486
11773
  renderProducer,
11487
11774
  ocrWorkerPool({
11488
- queue,
11775
+ queue: queue2,
11489
11776
  workerCount,
11490
11777
  totalPages,
11491
11778
  ocrInput: {
@@ -11605,17 +11892,6 @@ function emitProgress(cb, stage, stagePercent, weights, extra) {
11605
11892
  model: extra.model
11606
11893
  });
11607
11894
  }
11608
- async function convertWithLibreOffice(buffer, ext) {
11609
- return await new Promise((resolvePromise, reject) => {
11610
- libreConvert(buffer, ext, void 0, (err, done) => {
11611
- if (err || !done) {
11612
- reject(new UnifiedOcrError("CONVERT_FAILED", "convert", err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"));
11613
- return;
11614
- }
11615
- resolvePromise(done);
11616
- });
11617
- });
11618
- }
11619
11895
  async function getPdfPageCount(pdfPath) {
11620
11896
  const stdout = await runCommandWithStdout("pdfinfo", [pdfPath]);
11621
11897
  const m = stdout.match(/^\s*Pages:\s*(\d+)\s*$/mi);
@@ -11688,13 +11964,6 @@ async function runCommandWithStdout(cmd, args) {
11688
11964
  });
11689
11965
  });
11690
11966
  }
11691
- async function assertSofficeAvailable() {
11692
- try {
11693
- await runCommand("soffice", ["--version"]);
11694
- } catch {
11695
- throw new UnifiedOcrError("SOFFICE_NOT_FOUND", "convert", "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.");
11696
- }
11697
- }
11698
11967
  function naturalPageSort(a, b) {
11699
11968
  const na = Number((a.match(/\d+/g) || ["0"]).at(-1) || 0);
11700
11969
  const nb = Number((b.match(/\d+/g) || ["0"]).at(-1) || 0);
@@ -11768,7 +12037,7 @@ function startParallelProbeRuns(input) {
11768
12037
  }
11769
12038
  async function loadModelCache(path) {
11770
12039
  try {
11771
- const raw = await readFile(path, "utf-8");
12040
+ const raw = await readFile2(path, "utf-8");
11772
12041
  return JSON.parse(raw);
11773
12042
  } catch {
11774
12043
  return null;
@@ -11802,12 +12071,12 @@ async function updateModelCache(path, probes) {
11802
12071
  await writeFile(path, JSON.stringify(current, null, 2), "utf-8");
11803
12072
  }
11804
12073
  async function ocrWorkerPool(input) {
11805
- const { queue, workerCount, ocrInput, onPageDone } = input;
12074
+ const { queue: queue2, workerCount, ocrInput, onPageDone } = input;
11806
12075
  const results = /* @__PURE__ */ new Map();
11807
12076
  let completedCount = 0;
11808
12077
  async function worker() {
11809
12078
  while (true) {
11810
- const item = await queue.dequeue();
12079
+ const item = await queue2.dequeue();
11811
12080
  if (item === QUEUE_DONE) break;
11812
12081
  const { pageNumber, imagePath, error } = item;
11813
12082
  if (imagePath === null) {
@@ -11859,7 +12128,7 @@ async function ocrImageWithFallback(input) {
11859
12128
  async function mergeMarkdownPages(paths) {
11860
12129
  const out = [];
11861
12130
  for (let i = 0; i < paths.length; i++) {
11862
- const txt = (await readFile(paths[i], "utf-8")).trim();
12131
+ const txt = (await readFile2(paths[i], "utf-8")).trim();
11863
12132
  if (!txt) continue;
11864
12133
  out.push(txt);
11865
12134
  }
@@ -11975,7 +12244,7 @@ async function ocrImageViaNim(input) {
11975
12244
  throw new UnifiedOcrError("OCR_FAILED", "ocr", `OCR \uC7AC\uC2DC\uB3C4 \uCD08\uACFC: ${lastErr}`);
11976
12245
  }
11977
12246
  async function encodeBase64(path) {
11978
- const b = await readFile(path);
12247
+ const b = await readFile2(path);
11979
12248
  return b.toString("base64");
11980
12249
  }
11981
12250
  function stripCodeFence3(text) {
@@ -12014,7 +12283,7 @@ async function parse2(input, options) {
12014
12283
  let buffer;
12015
12284
  if (typeof input === "string") {
12016
12285
  try {
12017
- const buf = await readFile2(input);
12286
+ const buf = await readFile3(input);
12018
12287
  buffer = toArrayBuffer(buf);
12019
12288
  } catch (err) {
12020
12289
  const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
@@ -12173,6 +12442,9 @@ export {
12173
12442
  VERSION,
12174
12443
  blocksToMarkdown,
12175
12444
  compare,
12445
+ convertHwpToPdf,
12446
+ convertHwpxToPdf,
12447
+ convertToPdf,
12176
12448
  detectFormat,
12177
12449
  detectZipFormat,
12178
12450
  diffBlocks,