@clazic/kordoc 2.5.1 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -33,6 +33,118 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
33
33
  ));
34
34
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
35
35
 
36
+ // src/utils.ts
37
+ var utils_exports = {};
38
+ __export(utils_exports, {
39
+ KordocError: () => KordocError,
40
+ VERSION: () => VERSION,
41
+ classifyError: () => classifyError,
42
+ isPathTraversal: () => isPathTraversal,
43
+ normalizeKordocError: () => normalizeKordocError,
44
+ precheckZipSize: () => precheckZipSize,
45
+ sanitizeError: () => sanitizeError,
46
+ sanitizeHref: () => sanitizeHref,
47
+ toArrayBuffer: () => toArrayBuffer
48
+ });
49
/**
 * Returns an ArrayBuffer containing exactly the bytes visible through `buf`.
 *
 * If the view starts at offset 0 and spans its entire backing buffer, that
 * backing buffer is handed back directly (no copy). Otherwise the covered
 * byte range is copied out with `slice`.
 *
 * @param buf - A view exposing `buffer`, `byteOffset` and `byteLength`.
 * @returns The backing buffer itself, or a copy of the viewed range.
 */
function toArrayBuffer(buf) {
  const { buffer: backing, byteOffset: start, byteLength: size } = buf;
  const coversWholeBuffer = start === 0 && size === backing.byteLength;
  if (coversWholeBuffer) {
    return backing;
  }
  return backing.slice(start, start + size);
}
55
/**
 * Produces a message that is safe to surface for an arbitrary thrown value.
 *
 * Only a `KordocError` has its own message passed through; every other value
 * is replaced by a fixed, generic Korean message.
 *
 * @param err - Any thrown value.
 * @returns The KordocError message, or the generic fallback string.
 */
function sanitizeError(err) {
  const isKnownError = err instanceof KordocError;
  return isKnownError
    ? err.message
    : "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
}
59
/**
 * Conservative check for archive entry names that could escape the
 * extraction root.
 *
 * A name is rejected when it contains a NUL character, contains ".." anywhere
 * (after backslashes are turned into forward slashes), starts with "/", or
 * starts with a drive-letter prefix such as "C:".
 *
 * @param {string} name - Entry name to inspect.
 * @returns {boolean} true when the name is considered unsafe.
 */
function isPathTraversal(name) {
  const hasNul = name.includes("\0");
  if (hasNul) return true;

  // Treat Windows separators like POSIX ones before inspecting the path.
  const unixStyle = name.split("\\").join("/");

  if (unixStyle.includes("..")) return true;
  if (unixStyle.startsWith("/")) return true;
  return /^[A-Za-z]:/.test(unixStyle);
}
64
/**
 * Inspects a ZIP archive's central directory, without decompressing anything,
 * to reject archives that declare too many entries or too much uncompressed
 * data.
 *
 * The End Of Central Directory (EOCD) record is searched backwards from the
 * end of the buffer. When no EOCD is found, when the declared central
 * directory runs past the buffer, or when any non-Kordoc error occurs while
 * reading, the function reports zero uncompressed bytes instead of failing.
 *
 * @param {ArrayBuffer} buffer - Raw archive bytes.
 * @param {number} [maxUncompressedSize] - Limit for the summed uncompressed sizes (bytes).
 * @param {number} [maxEntries] - Limit for the entry count declared in the EOCD.
 * @returns {{totalUncompressed: number, entryCount: number}}
 * @throws {KordocError} When the entry count or the summed size exceeds its limit.
 */
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
  const EOCD_SIGNATURE = 0x06054b50;
  const CENTRAL_ENTRY_SIGNATURE = 0x02014b50;
  const EOCD_MIN_SIZE = 22;
  const CENTRAL_ENTRY_FIXED_SIZE = 46;

  try {
    const view = new DataView(buffer);
    const total = buffer.byteLength;

    // The EOCD sits within the last 22 + 65535 bytes (fixed part + max comment).
    const lowestCandidate = Math.max(0, total - 65557);
    let eocd = -1;
    let cursor = total - EOCD_MIN_SIZE;
    while (cursor >= lowestCandidate) {
      if (view.getUint32(cursor, true) === EOCD_SIGNATURE) {
        eocd = cursor;
        break;
      }
      cursor--;
    }
    if (eocd < 0) return { totalUncompressed: 0, entryCount: 0 };

    const entryCount = view.getUint16(eocd + 10, true);
    if (entryCount > maxEntries) {
      throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
    }

    const dirSize = view.getUint32(eocd + 12, true);
    const dirStart = view.getUint32(eocd + 16, true);
    const dirEnd = dirStart + dirSize;
    if (dirEnd > total) return { totalUncompressed: 0, entryCount };

    // Walk the central directory, summing each entry's declared uncompressed size.
    let totalUncompressed = 0;
    let visited = 0;
    let at = dirStart;
    while (visited < entryCount && at + CENTRAL_ENTRY_FIXED_SIZE <= dirEnd) {
      if (view.getUint32(at, true) !== CENTRAL_ENTRY_SIGNATURE) break;
      totalUncompressed += view.getUint32(at + 24, true);
      const nameLen = view.getUint16(at + 28, true);
      const extraLen = view.getUint16(at + 30, true);
      const commentLen = view.getUint16(at + 32, true);
      at += CENTRAL_ENTRY_FIXED_SIZE + nameLen + extraLen + commentLen;
      visited++;
    }

    if (totalUncompressed > maxUncompressedSize) {
      throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
    }
    return { totalUncompressed, entryCount };
  } catch (err) {
    // Limit violations propagate; any other read failure is reported as "nothing measured".
    if (err instanceof KordocError) throw err;
    return { totalUncompressed: 0, entryCount: 0 };
  }
}
102
/**
 * Validates a hyperlink target against the module's scheme allow-list.
 *
 * The value is trimmed first; an empty result, or one that does not match
 * `SAFE_HREF_RE`, yields `null`.
 *
 * @param {string} href - Raw link target.
 * @returns {string|null} The trimmed href when allowed, otherwise null.
 */
function sanitizeHref(href) {
  const candidate = href.trim();
  if (candidate === "") return null;
  return SAFE_HREF_RE.test(candidate) ? candidate : null;
}
107
/**
 * Maps an error to a coarse error code by looking for marker substrings in
 * its message.
 *
 * Rules are evaluated top to bottom and the first match wins, so the order of
 * the table is significant (e.g. the ZIP-specific markers are checked before
 * the generic "bomb"/size markers).
 *
 * @param err - Any thrown value; anything that is not an Error is a PARSE_ERROR.
 * @returns {string} One of ENCRYPTED, DRM_PROTECTED, ZIP_BOMB, DECOMPRESSION_BOMB,
 *   IMAGE_BASED_PDF, NO_SECTIONS, CORRUPTED or PARSE_ERROR.
 */
function classifyError(err) {
  if (!(err instanceof Error)) return "PARSE_ERROR";

  const text = err.message;
  const has = (needle) => text.includes(needle);

  // Each rule is evaluated lazily so the checks run in exactly this order.
  const rules = [
    ["ENCRYPTED", () => has("\uC554\uD638\uD654")],
    ["DRM_PROTECTED", () => has("DRM")],
    ["ZIP_BOMB", () => has("ZIP bomb") || has("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || has("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")],
    ["DECOMPRESSION_BOMB", () => has("bomb") || has("\uD06C\uAE30 \uCD08\uACFC") || has("\uC555\uCD95 \uD574\uC81C")],
    ["IMAGE_BASED_PDF", () => has("\uC774\uBBF8\uC9C0 \uAE30\uBC18")],
    ["NO_SECTIONS", () => has("\uC139\uC158") && (has("\uCC3E\uC744 \uC218 \uC5C6") || has("\uC5C6\uC74C"))],
    ["CORRUPTED", () => has("\uC2DC\uADF8\uB2C8\uCC98") || has("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")],
  ];

  for (const [code, matches] of rules) {
    if (matches()) return code;
  }
  return "PARSE_ERROR";
}
119
/**
 * Coerces any thrown value into a `KordocError` carrying a code and a stage.
 *
 * - An existing `KordocError` is returned as the same instance; only a
 *   missing (falsy) `stage` / `code` is filled in.
 * - Any other `Error` is wrapped: its message is reused (falling back to
 *   `fallbackMessage` when empty) and its code comes from `classifyError`.
 * - Non-Error values become a `KordocError` built from the fallbacks.
 *
 * @param err - The thrown value.
 * @param {string} fallbackMessage - Message used when `err` has none.
 * @param {string} [stage="unknown"] - Stage label to attach.
 * @param {string} [fallbackCode="PARSE_ERROR"] - Code used when none can be derived.
 * @returns A KordocError (possibly `err` itself, mutated).
 */
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
  if (err instanceof KordocError) {
    err.stage ||= stage;
    err.code ||= fallbackCode;
    return err;
  }

  if (err instanceof Error) {
    const text = err.message || fallbackMessage;
    return new KordocError(text, { code: classifyError(err), stage });
  }

  return new KordocError(fallbackMessage, { code: fallbackCode, stage });
}
129
// Module-level bindings for src/utils.ts; they stay undefined until init_utils() has run.
var VERSION, KordocError, SAFE_HREF_RE;

// Initialiser for the src/utils.ts module, wrapped by the bundler helper `__esm`.
var init_utils = __esm({
  "src/utils.ts"() {
    "use strict";

    // NOTE(review): this literal reads 2.5.2 although the surrounding diff is
    // titled 2.5.1 -> 2.6.0 — confirm the build inlined the intended version.
    VERSION = "2.5.2";

    // Error type used across the library; carries an optional error `code`
    // and the pipeline `stage` in which it was raised.
    KordocError = class extends Error {
      code;
      stage;
      constructor(message, { code, stage } = {}) {
        super(message);
        this.name = "KordocError";
        this.code = code;
        this.stage = stage;
      }
    };

    // Link targets accepted by sanitizeHref: http(s), mailto, tel and in-document anchors.
    SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
  }
});
147
+
36
148
  // src/page-range.ts
37
149
  var page_range_exports = {};
38
150
  __export(page_range_exports, {
@@ -2394,15 +2506,48 @@ var init_cli_provider = __esm({
2394
2506
  import_fs2 = require("fs");
2395
2507
  import_path2 = require("path");
2396
2508
  import_os = require("os");
2397
- OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.
2398
- \uADDC\uCE59:
2399
- - \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)
2400
- - \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC
2401
- - \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9
2402
- - \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9
2403
- - \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC
2404
- - \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0
2405
- - \`\`\`\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825`;
2509
+ OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, \uBA85\uBC31\uD55C OCR \uC624\uC778\uC2DD\uACFC \uD45C \uAD6C\uC870 \uAE68\uC9D0\uB9CC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uB97C \uCD9C\uB825\uD558\uC138\uC694.
2510
+
2511
+ [\uAE30\uBCF8 \uCD94\uCD9C \uADDC\uCE59]
2512
+ - \uBCF8\uBB38, \uD45C, \uC81C\uBAA9, \uB9AC\uC2A4\uD2B8, \uCEA1\uC158\uC744 \uC6D0\uBB38 \uAD6C\uC870 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uBCC0\uD658
2513
+ - \uD5E4\uB529\uC740 \uC2DC\uAC01\uC801 \uD06C\uAE30\xB7\uAD75\uAE30\uC5D0 \uB530\uB77C # ~ ###### \uC0AC\uC6A9
2514
+ - \uB9AC\uC2A4\uD2B8\uB294 -, 1. \uC0AC\uC6A9 (\uC6D0\uBB38 \uBC88\uD638 \uCCB4\uACC4\uAC00 \u2460, \uAC00., 1) \uB4F1\uC774\uBA74 \uADF8 \uD45C\uAE30 \uC720\uC9C0)
2515
+ - \uC774\uBBF8\uC9C0\xB7\uB3C4\uD615\xB7\uB85C\uACE0\xB7\uD398\uC774\uC9C0\uBC88\uD638\xB7\uBA38\uB9AC\uAE00/\uBC14\uB2E5\uAE00\uC740 \uBB34\uC2DC
2516
+ - \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C(\uC88C\u2192\uC6B0, \uC704\u2192\uC544\uB798, \uB2E4\uB2E8\uC774\uBA74 \uB2E8\uBCC4\uB85C)\uB97C \uC720\uC9C0
2517
+
2518
+ [\uD45C \uCD94\uCD9C \uADDC\uCE59 \u2014 \uAC00\uC7A5 \uC911\uC694]
2519
+ - \uD45C\uB294 \uBC18\uB4DC\uC2DC Markdown \uD30C\uC774\uD504 \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9: \uD5E4\uB354 \uD589 + |---| \uAD6C\uBD84\uC120 + \uB370\uC774\uD130 \uD589
2520
+ - \uC2DC\uAC01\uC801\uC73C\uB85C \uAD75\uAC70\uB098 \uC74C\uC601(\uD68C\uC0C9/\uC0C9\uC0C1) \uBC30\uACBD\uC774 \uC788\uB294 \uD589\uC744 \uD5E4\uB354\uB85C \uC2DD\uBCC4. \uD5E4\uB354\uAC00 \uC5C6\uB294 \uD45C\uB77C\uB3C4 \uCCAB \uD589\uC744 \uD5E4\uB354\uB85C \uB450\uACE0 |---| \uAD6C\uBD84\uC120 \uCD94\uAC00
2521
+ - \uBAA8\uB4E0 \uD589\uC758 \uD30C\uC774\uD504(|) \uAC1C\uC218\uB97C \uD5E4\uB354\uC640 \uB3D9\uC77C\uD558\uAC8C \uB9DE\uCD9C \uAC83 \u2014 \uBE48 \uC140\uC740 \uACF5\uBC31\uC73C\uB85C \uCC44\uC6B0\uACE0 \uC808\uB300 \uC140\uC744 \uC0DD\uB7B5\uD558\uC9C0 \uB9D0 \uAC83
2522
+ - \uC140 \uC548\uC758 \uC904\uBC14\uAFC8\uC740 <br>\uB85C \uD45C\uAE30 (\uC2E4\uC81C \uAC1C\uD589 \uBB38\uC790 \uC0AC\uC6A9 \uAE08\uC9C0 \u2014 \uD45C\uAC00 \uAE68\uC9D0)
2523
+ - \uBCD1\uD569 \uC140 \uCC98\uB9AC:
2524
+ - \uAC00\uB85C \uBCD1\uD569(colspan): \uBCD1\uD569\uB41C \uC140 \uB0B4\uC6A9\uC744 \uCCAB \uCE78\uC5D0 \uC4F0\uACE0 \uB098\uBA38\uC9C0 \uCE78\uC740 \uBE48 \uCE78\uC73C\uB85C \uB458 \uAC83
2525
+ - \uC138\uB85C \uBCD1\uD569(rowspan): \uBCD1\uD569\uB41C \uBAA8\uB4E0 \uD589\uC758 \uD574\uB2F9 \uCE78\uC5D0 \uB3D9\uC77C\uD55C \uB0B4\uC6A9\uC744 \uBC18\uBCF5 \uAE30\uC7AC\uD560 \uAC83
2526
+ - 2\uB2E8 \uD5E4\uB354(\uD5E4\uB354\uAC00 \uB450 \uC904\uC778 \uACBD\uC6B0): \uC0C1\uC704/\uD558\uC704 \uD5E4\uB354\uB97C " / "\uB85C \uD569\uCCD0 \uB2E8\uC77C \uD5E4\uB354 \uD589\uC73C\uB85C \uB9CC\uB4E4 \uAC83 (\uC608: "2024 / 1\uBD84\uAE30")
2527
+ - \uD45C \uC704\xB7\uC544\uB798\uC758 \uCEA1\uC158(\uC608: "<\uD45C 1> ...", "[\uD45C 2-1]")\uC740 \uD45C \uBC14\uB85C \uC704\uC5D0 \uBCC4\uB3C4 \uC904\uB85C \uBCF4\uC874
2528
+ - \uD55C \uD398\uC774\uC9C0\uC5D0\uC11C \uD45C\uAC00 \uC911\uAC04\uC5D0 \uB04A\uC5B4\uC838 \uBCF4\uC774\uB354\uB77C\uB3C4, \uD5E4\uB354\uAC00 \uB3D9\uC77C\uD558\uBA74 \uD558\uB098\uC758 \uD45C\uB85C \uC774\uC5B4\uBD99\uC77C \uAC83
2529
+ - \uC88C\uCE21 \uCCAB \uCEEC\uB7FC\uC774 \uC138\uB85C\uC4F0\uAE30 \uB77C\uBCA8(\uAD6C\uBD84/\uBD84\uB958/\uC5F0\uB3C4 \uB4F1)\uC774\uB77C\uB3C4 \uC77C\uBC18 \uC140\uB85C \uBCC0\uD658\uD558\uC5EC \uB204\uB77D \uAE08\uC9C0
2530
+
2531
+ [OCR \uC624\uC778\uC2DD\xB7\uAE68\uC9D0 \uAD50\uC815 \u2014 \uD5C8\uC6A9 \uBC94\uC704\uB9CC]
2532
+ - \uC22B\uC790 \uCE78\uC5D0\uC11C 'O'\u2192'0', 'l/I'\u2192'1' \uB4F1 \uB9E5\uB77D\uC0C1 \uBA85\uBC31\uD55C \uC624\uC778\uC2DD\uB9CC \uAD50\uC815
2533
+ - \uB2E8\uC5B4 \uC911\uAC04\uC5D0 \uC798\uBABB \uC0BD\uC785\uB41C \uACF5\uBC31 \uC81C\uAC70 (\uC608: "\uC8FC\uAC70 \uC885 \uD569 \uACC4\uD68D" \u2192 "\uC8FC\uAC70\uC885\uD569\uACC4\uD68D")
2534
+ - \uC904\uBC14\uAFC8 \uC624\uB958\uB85C \uB04A\uAE34 \uBB38\uC7A5\uC740 \uC758\uBBF8 \uB2E8\uC704\uB85C \uBCD1\uD569 (\uB2E8, \uD45C \uC140 \uB0B4\uBD80\uB294 <br> \uC720\uC9C0)
2535
+ - \uD45C\uC5D0\uC11C \uD589 \uC5B4\uAE0B\uB0A8\uC774 \uC758\uC2EC\uB418\uBA74(\uCEEC\uB7FC \uC218 \uBD88\uC77C\uCE58) \uBE48 \uC140\uB85C \uD328\uB529\uD558\uC5EC \uCEEC\uB7FC \uC815\uD569\uC131\uC744 \uC6B0\uC120 \uD655\uBCF4
2536
+
2537
+ [\uC808\uB300 \uAE08\uC9C0 \uC0AC\uD56D]
2538
+ - \uBB38\uC7A5\xB7\uB2E8\uB77D\xB7\uD56D\uBAA9\xB7\uD45C \uD589/\uC5F4\uC744 \uCD94\uAC00\uD558\uAC70\uB098 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83
2539
+ - \uC22B\uC790, \uD37C\uC13C\uD2B8, \uB0A0\uC9DC, \uB2E8\uC704, \uAE08\uC561\uC744 \uC808\uB300 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83
2540
+ - \uACE0\uC720\uBA85\uC0AC, \uAE30\uAD00\uBA85, \uBC95\uB839\uBA85, \uC9C0\uBA85, \uC778\uBA85\uC744 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83
2541
+ - \uD45C\uC758 \uD5E4\uB354 \uD14D\uC2A4\uD2B8, \uCEA1\uC158, \uD589/\uC5F4 \uAC1C\uC218\uB97C \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83
2542
+ - \uCCB4\uD06C\uBC15\uC2A4(\u2611/\u2610/\u25A0/\u25A1), \uD2B9\uC218\uBB38\uC790(\u203B, \u2460\u2461\u2462, \u3260, \uAC00., \uB098.)\uB97C \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83
2543
+ - \uC6D0\uBB38\uC5D0 \uC5C6\uB294 \uB0B4\uC6A9\uC744 \uC694\uC57D\xB7\uBCF4\uC644\xB7\uCD94\uB860\uD558\uC9C0 \uB9D0 \uAC83
2544
+ - \`\`\`, \`\`\`markdown \uAC19\uC740 \uCF54\uB4DC\uD39C\uC2A4\uB85C \uAC10\uC2F8\uC9C0 \uB9D0 \uAC83
2545
+ - \uC124\uBA85\xB7\uC8FC\uC11D\xB7\uBA54\uD0C0 \uBB38\uC7A5 \uCD94\uAC00 \uAE08\uC9C0
2546
+
2547
+ [\uBD88\uD655\uC2E4\uD560 \uB54C]
2548
+ - \uAE00\uC790\uAC00 \uD750\uB9BF\uD558\uAC70\uB098 \uD310\uB3C5 \uBD88\uAC00\uD558\uBA74 \uBCF4\uC774\uB294 \uADF8\uB300\uB85C \uB450\uAC70\uB098, \uBD88\uAC00\uD53C\uD558\uBA74 \uD55C \uAE00\uC790\uB9CC ?\uB85C \uD45C\uC2DC
2549
+ - \uD45C \uAD6C\uC870\uAC00 \uBAA8\uD638\uD558\uBA74 \uCEEC\uB7FC \uC218 \uC77C\uCE58 + \uBE48 \uC140 \uD328\uB529 \uD615\uD0DC\uB85C \uCD9C\uB825
2550
+ - \uCD94\uCE21\uBCF4\uB2E4 \uC6D0\uBB38 \uBCF4\uC874\uC744 \uD56D\uC0C1 \uC6B0\uC120`;
2406
2551
  _tempDir = null;
2407
2552
  }
2408
2553
  });
@@ -2591,7 +2736,51 @@ var init_batch_provider = __esm({
2591
2736
  import_fs3 = require("fs");
2592
2737
  import_path3 = require("path");
2593
2738
  import_os2 = require("os");
2594
- BATCH_OCR_PROMPT = "\uB2E4\uC74C \uBB38\uC11C \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uB4E4\uC744 OCR\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.\n\n\uADDC\uCE59:\n- \uAC01 \uD398\uC774\uC9C0 \uACB0\uACFC \uC0AC\uC774\uC5D0 \uBC18\uB4DC\uC2DC \uC774 \uAD6C\uBD84\uC790\uB97C \uC0BD\uC785: <!-- PAGE_BREAK -->\n- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)\n- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC\n- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9\n- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9\n- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC\n- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0\n- ```\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825";
2739
+ BATCH_OCR_PROMPT = `\uB2E4\uC74C \uBB38\uC11C \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uB4E4\uC744 OCR\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, \uBA85\uBC31\uD55C OCR \uC624\uC778\uC2DD\uACFC \uD45C \uAD6C\uC870 \uAE68\uC9D0\uB9CC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uB97C \uCD9C\uB825\uD558\uC138\uC694.
2740
+
2741
+ [\uD398\uC774\uC9C0 \uAD6C\uBD84 \u2014 \uD544\uC218]
2742
+ - \uAC01 \uD398\uC774\uC9C0 \uACB0\uACFC \uC0AC\uC774\uC5D0 \uBC18\uB4DC\uC2DC \uC774 \uAD6C\uBD84\uC790\uB97C \uC0BD\uC785: <!-- PAGE_BREAK -->
2743
+
2744
+ [\uAE30\uBCF8 \uCD94\uCD9C \uADDC\uCE59]
2745
+ - \uBCF8\uBB38, \uD45C, \uC81C\uBAA9, \uB9AC\uC2A4\uD2B8, \uCEA1\uC158\uC744 \uC6D0\uBB38 \uAD6C\uC870 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uBCC0\uD658
2746
+ - \uD5E4\uB529\uC740 \uC2DC\uAC01\uC801 \uD06C\uAE30\xB7\uAD75\uAE30\uC5D0 \uB530\uB77C # ~ ###### \uC0AC\uC6A9
2747
+ - \uB9AC\uC2A4\uD2B8\uB294 -, 1. \uC0AC\uC6A9 (\uC6D0\uBB38 \uBC88\uD638 \uCCB4\uACC4\uAC00 \u2460, \uAC00., 1) \uB4F1\uC774\uBA74 \uADF8 \uD45C\uAE30 \uC720\uC9C0)
2748
+ - \uC774\uBBF8\uC9C0\xB7\uB3C4\uD615\xB7\uB85C\uACE0\xB7\uD398\uC774\uC9C0\uBC88\uD638\xB7\uBA38\uB9AC\uAE00/\uBC14\uB2E5\uAE00\uC740 \uBB34\uC2DC
2749
+ - \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C(\uC88C\u2192\uC6B0, \uC704\u2192\uC544\uB798, \uB2E4\uB2E8\uC774\uBA74 \uB2E8\uBCC4\uB85C)\uB97C \uC720\uC9C0
2750
+
2751
+ [\uD45C \uCD94\uCD9C \uADDC\uCE59 \u2014 \uAC00\uC7A5 \uC911\uC694]
2752
+ - \uD45C\uB294 \uBC18\uB4DC\uC2DC Markdown \uD30C\uC774\uD504 \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9: \uD5E4\uB354 \uD589 + |---| \uAD6C\uBD84\uC120 + \uB370\uC774\uD130 \uD589
2753
+ - \uC2DC\uAC01\uC801\uC73C\uB85C \uAD75\uAC70\uB098 \uC74C\uC601(\uD68C\uC0C9/\uC0C9\uC0C1) \uBC30\uACBD\uC774 \uC788\uB294 \uD589\uC744 \uD5E4\uB354\uB85C \uC2DD\uBCC4. \uD5E4\uB354\uAC00 \uC5C6\uB294 \uD45C\uB77C\uB3C4 \uCCAB \uD589\uC744 \uD5E4\uB354\uB85C \uB450\uACE0 |---| \uAD6C\uBD84\uC120 \uCD94\uAC00
2754
+ - \uBAA8\uB4E0 \uD589\uC758 \uD30C\uC774\uD504(|) \uAC1C\uC218\uB97C \uD5E4\uB354\uC640 \uB3D9\uC77C\uD558\uAC8C \uB9DE\uCD9C \uAC83 \u2014 \uBE48 \uC140\uC740 \uACF5\uBC31\uC73C\uB85C \uCC44\uC6B0\uACE0 \uC808\uB300 \uC140\uC744 \uC0DD\uB7B5\uD558\uC9C0 \uB9D0 \uAC83
2755
+ - \uC140 \uC548\uC758 \uC904\uBC14\uAFC8\uC740 <br>\uB85C \uD45C\uAE30 (\uC2E4\uC81C \uAC1C\uD589 \uBB38\uC790 \uC0AC\uC6A9 \uAE08\uC9C0 \u2014 \uD45C\uAC00 \uAE68\uC9D0)
2756
+ - \uBCD1\uD569 \uC140 \uCC98\uB9AC:
2757
+ - \uAC00\uB85C \uBCD1\uD569(colspan): \uBCD1\uD569\uB41C \uC140 \uB0B4\uC6A9\uC744 \uCCAB \uCE78\uC5D0 \uC4F0\uACE0 \uB098\uBA38\uC9C0 \uCE78\uC740 \uBE48 \uCE78\uC73C\uB85C \uB458 \uAC83
2758
+ - \uC138\uB85C \uBCD1\uD569(rowspan): \uBCD1\uD569\uB41C \uBAA8\uB4E0 \uD589\uC758 \uD574\uB2F9 \uCE78\uC5D0 \uB3D9\uC77C\uD55C \uB0B4\uC6A9\uC744 \uBC18\uBCF5 \uAE30\uC7AC\uD560 \uAC83
2759
+ - 2\uB2E8 \uD5E4\uB354(\uD5E4\uB354\uAC00 \uB450 \uC904\uC778 \uACBD\uC6B0): \uC0C1\uC704/\uD558\uC704 \uD5E4\uB354\uB97C " / "\uB85C \uD569\uCCD0 \uB2E8\uC77C \uD5E4\uB354 \uD589\uC73C\uB85C \uB9CC\uB4E4 \uAC83 (\uC608: "2024 / 1\uBD84\uAE30")
2760
+ - \uD45C \uC704\xB7\uC544\uB798\uC758 \uCEA1\uC158(\uC608: "<\uD45C 1> ...", "[\uD45C 2-1]")\uC740 \uD45C \uBC14\uB85C \uC704\uC5D0 \uBCC4\uB3C4 \uC904\uB85C \uBCF4\uC874
2761
+ - \uD55C \uD398\uC774\uC9C0\uC5D0\uC11C \uD45C\uAC00 \uC911\uAC04\uC5D0 \uB04A\uC5B4\uC838 \uBCF4\uC774\uB354\uB77C\uB3C4, \uD5E4\uB354\uAC00 \uB3D9\uC77C\uD558\uBA74 \uD558\uB098\uC758 \uD45C\uB85C \uC774\uC5B4\uBD99\uC77C \uAC83
2762
+ - \uC88C\uCE21 \uCCAB \uCEEC\uB7FC\uC774 \uC138\uB85C\uC4F0\uAE30 \uB77C\uBCA8(\uAD6C\uBD84/\uBD84\uB958/\uC5F0\uB3C4 \uB4F1)\uC774\uB77C\uB3C4 \uC77C\uBC18 \uC140\uB85C \uBCC0\uD658\uD558\uC5EC \uB204\uB77D \uAE08\uC9C0
2763
+
2764
+ [OCR \uC624\uC778\uC2DD\xB7\uAE68\uC9D0 \uAD50\uC815 \u2014 \uD5C8\uC6A9 \uBC94\uC704\uB9CC]
2765
+ - \uC22B\uC790 \uCE78\uC5D0\uC11C 'O'\u2192'0', 'l/I'\u2192'1' \uB4F1 \uB9E5\uB77D\uC0C1 \uBA85\uBC31\uD55C \uC624\uC778\uC2DD\uB9CC \uAD50\uC815
2766
+ - \uB2E8\uC5B4 \uC911\uAC04\uC5D0 \uC798\uBABB \uC0BD\uC785\uB41C \uACF5\uBC31 \uC81C\uAC70 (\uC608: "\uC8FC\uAC70 \uC885 \uD569 \uACC4\uD68D" \u2192 "\uC8FC\uAC70\uC885\uD569\uACC4\uD68D")
2767
+ - \uC904\uBC14\uAFC8 \uC624\uB958\uB85C \uB04A\uAE34 \uBB38\uC7A5\uC740 \uC758\uBBF8 \uB2E8\uC704\uB85C \uBCD1\uD569 (\uB2E8, \uD45C \uC140 \uB0B4\uBD80\uB294 <br> \uC720\uC9C0)
2768
+ - \uD45C\uC5D0\uC11C \uD589 \uC5B4\uAE0B\uB0A8\uC774 \uC758\uC2EC\uB418\uBA74(\uCEEC\uB7FC \uC218 \uBD88\uC77C\uCE58) \uBE48 \uC140\uB85C \uD328\uB529\uD558\uC5EC \uCEEC\uB7FC \uC815\uD569\uC131\uC744 \uC6B0\uC120 \uD655\uBCF4
2769
+
2770
+ [\uC808\uB300 \uAE08\uC9C0 \uC0AC\uD56D]
2771
+ - \uBB38\uC7A5\xB7\uB2E8\uB77D\xB7\uD56D\uBAA9\xB7\uD45C \uD589/\uC5F4\uC744 \uCD94\uAC00\uD558\uAC70\uB098 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83
2772
+ - \uC22B\uC790, \uD37C\uC13C\uD2B8, \uB0A0\uC9DC, \uB2E8\uC704, \uAE08\uC561\uC744 \uC808\uB300 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83
2773
+ - \uACE0\uC720\uBA85\uC0AC, \uAE30\uAD00\uBA85, \uBC95\uB839\uBA85, \uC9C0\uBA85, \uC778\uBA85\uC744 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83
2774
+ - \uD45C\uC758 \uD5E4\uB354 \uD14D\uC2A4\uD2B8, \uCEA1\uC158, \uD589/\uC5F4 \uAC1C\uC218\uB97C \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83
2775
+ - \uCCB4\uD06C\uBC15\uC2A4(\u2611/\u2610/\u25A0/\u25A1), \uD2B9\uC218\uBB38\uC790(\u203B, \u2460\u2461\u2462, \u3260, \uAC00., \uB098.)\uB97C \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83
2776
+ - \uC6D0\uBB38\uC5D0 \uC5C6\uB294 \uB0B4\uC6A9\uC744 \uC694\uC57D\xB7\uBCF4\uC644\xB7\uCD94\uB860\uD558\uC9C0 \uB9D0 \uAC83
2777
+ - \`\`\`, \`\`\`markdown \uAC19\uC740 \uCF54\uB4DC\uD39C\uC2A4\uB85C \uAC10\uC2F8\uC9C0 \uB9D0 \uAC83
2778
+ - \uC124\uBA85\xB7\uC8FC\uC11D\xB7\uBA54\uD0C0 \uBB38\uC7A5 \uCD94\uAC00 \uAE08\uC9C0
2779
+
2780
+ [\uBD88\uD655\uC2E4\uD560 \uB54C]
2781
+ - \uAE00\uC790\uAC00 \uD750\uB9BF\uD558\uAC70\uB098 \uD310\uB3C5 \uBD88\uAC00\uD558\uBA74 \uBCF4\uC774\uB294 \uADF8\uB300\uB85C \uB450\uAC70\uB098, \uBD88\uAC00\uD53C\uD558\uBA74 \uD55C \uAE00\uC790\uB9CC ?\uB85C \uD45C\uC2DC
2782
+ - \uD45C \uAD6C\uC870\uAC00 \uBAA8\uD638\uD558\uBA74 \uCEEC\uB7FC \uC218 \uC77C\uCE58 + \uBE48 \uC140 \uD328\uB529 \uD615\uD0DC\uB85C \uCD9C\uB825
2783
+ - \uCD94\uCE21\uBCF4\uB2E4 \uC6D0\uBB38 \uBCF4\uC874\uC744 \uD56D\uC0C1 \uC6B0\uC120`;
2595
2784
  DEFAULT_BATCH_SIZES = {
2596
2785
  gemini: 5,
2597
2786
  claude: 5,
@@ -2985,6 +3174,9 @@ __export(index_exports, {
2985
3174
  VERSION: () => VERSION,
2986
3175
  blocksToMarkdown: () => blocksToMarkdown,
2987
3176
  compare: () => compare,
3177
+ convertHwpToPdf: () => convertHwpToPdf,
3178
+ convertHwpxToPdf: () => convertHwpxToPdf,
3179
+ convertToPdf: () => convertToPdf,
2988
3180
  detectFormat: () => detectFormat,
2989
3181
  detectZipFormat: () => detectZipFormat,
2990
3182
  diffBlocks: () => diffBlocks,
@@ -3004,7 +3196,7 @@ __export(index_exports, {
3004
3196
  runUnifiedOcrPipeline: () => runUnifiedOcrPipeline
3005
3197
  });
3006
3198
  module.exports = __toCommonJS(index_exports);
3007
- var import_promises3 = require("fs/promises");
3199
+ var import_promises4 = require("fs/promises");
3008
3200
 
3009
3201
  // src/detect.ts
3010
3202
  var import_jszip = __toESM(require("jszip"), 1);
@@ -3056,97 +3248,8 @@ async function detectZipFormat(buffer) {
3056
3248
  var import_jszip2 = __toESM(require("jszip"), 1);
3057
3249
  var import_xmldom = require("@xmldom/xmldom");
3058
3250
 
3059
- // src/utils.ts
3060
- var VERSION = true ? "2.5.0" : "0.0.0-dev";
3061
- function toArrayBuffer(buf) {
3062
- if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
3063
- return buf.buffer;
3064
- }
3065
- return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
3066
- }
3067
- var KordocError = class extends Error {
3068
- code;
3069
- stage;
3070
- constructor(message, opts = {}) {
3071
- super(message);
3072
- this.name = "KordocError";
3073
- this.code = opts.code;
3074
- this.stage = opts.stage;
3075
- }
3076
- };
3077
- function isPathTraversal(name) {
3078
- if (name.includes("\0")) return true;
3079
- const normalized = name.replace(/\\/g, "/");
3080
- return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
3081
- }
3082
- function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
3083
- try {
3084
- const data = new DataView(buffer);
3085
- const len = buffer.byteLength;
3086
- let eocdOffset = -1;
3087
- for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
3088
- if (data.getUint32(i, true) === 101010256) {
3089
- eocdOffset = i;
3090
- break;
3091
- }
3092
- }
3093
- if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
3094
- const entryCount = data.getUint16(eocdOffset + 10, true);
3095
- if (entryCount > maxEntries) {
3096
- throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
3097
- }
3098
- const cdSize = data.getUint32(eocdOffset + 12, true);
3099
- const cdOffset = data.getUint32(eocdOffset + 16, true);
3100
- if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
3101
- let totalUncompressed = 0;
3102
- let pos = cdOffset;
3103
- for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
3104
- if (data.getUint32(pos, true) !== 33639248) break;
3105
- totalUncompressed += data.getUint32(pos + 24, true);
3106
- const nameLen = data.getUint16(pos + 28, true);
3107
- const extraLen = data.getUint16(pos + 30, true);
3108
- const commentLen = data.getUint16(pos + 32, true);
3109
- pos += 46 + nameLen + extraLen + commentLen;
3110
- }
3111
- if (totalUncompressed > maxUncompressedSize) {
3112
- throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
3113
- }
3114
- return { totalUncompressed, entryCount };
3115
- } catch (err) {
3116
- if (err instanceof KordocError) throw err;
3117
- return { totalUncompressed: 0, entryCount: 0 };
3118
- }
3119
- }
3120
- var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
3121
- function sanitizeHref(href) {
3122
- const trimmed = href.trim();
3123
- if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
3124
- return trimmed;
3125
- }
3126
- function classifyError(err) {
3127
- if (!(err instanceof Error)) return "PARSE_ERROR";
3128
- const msg = err.message;
3129
- if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
3130
- if (msg.includes("DRM")) return "DRM_PROTECTED";
3131
- if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
3132
- if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
3133
- if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
3134
- if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
3135
- if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
3136
- return "PARSE_ERROR";
3137
- }
3138
- function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
3139
- if (err instanceof KordocError) {
3140
- if (!err.stage) err.stage = stage;
3141
- if (!err.code) err.code = fallbackCode;
3142
- return err;
3143
- }
3144
- const message = err instanceof Error ? err.message : fallbackMessage;
3145
- const code = err instanceof Error ? classifyError(err) : fallbackCode;
3146
- return new KordocError(message || fallbackMessage, { code, stage });
3147
- }
3148
-
3149
3251
  // src/table/builder.ts
3252
+ init_utils();
3150
3253
  var MAX_COLS = 200;
3151
3254
  var MAX_ROWS = 1e4;
3152
3255
  function buildTable(rows) {
@@ -3406,6 +3509,8 @@ var HEADING_RATIO_H2 = 1.3;
3406
3509
  var HEADING_RATIO_H3 = 1.15;
3407
3510
 
3408
3511
  // src/hwpx/parser.ts
3512
+ init_utils();
3513
+ init_utils();
3409
3514
  init_page_range();
3410
3515
  init_logger();
3411
3516
  var MAX_DECOMPRESS_SIZE = 500 * 1024 * 1024;
@@ -4248,6 +4353,7 @@ function extractTextFromNode(node) {
4248
4353
 
4249
4354
  // src/hwp5/record.ts
4250
4355
  var import_zlib = require("zlib");
4356
+ init_utils();
4251
4357
  var TAG_PARA_HEADER = 66;
4252
4358
  var TAG_PARA_TEXT = 67;
4253
4359
  var TAG_CHAR_SHAPE = 68;
@@ -5297,6 +5403,7 @@ function parseLenientCfb(data) {
5297
5403
  }
5298
5404
 
5299
5405
  // src/hwp5/parser.ts
5406
+ init_utils();
5300
5407
  init_page_range();
5301
5408
  init_logger();
5302
5409
  var CFB = __toESM(require_cfb(), 1);
@@ -5952,6 +6059,7 @@ function arrangeCells(rows, cols, cells) {
5952
6059
  }
5953
6060
 
5954
6061
  // src/pdf/parser.ts
6062
+ init_utils();
5955
6063
  init_page_range();
5956
6064
  var import_module = require("module");
5957
6065
  var import_path4 = require("path");
@@ -7845,6 +7953,7 @@ function mergeKoreanLines(text) {
7845
7953
  // src/xlsx/parser.ts
7846
7954
  var import_jszip3 = __toESM(require("jszip"), 1);
7847
7955
  var import_xmldom2 = require("@xmldom/xmldom");
7956
+ init_utils();
7848
7957
  init_logger();
7849
7958
  var MAX_SHEETS = 100;
7850
7959
  var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
@@ -8173,6 +8282,7 @@ async function parseXlsxDocument(buffer, options, existingZip) {
8173
8282
  // src/docx/parser.ts
8174
8283
  var import_jszip4 = __toESM(require("jszip"), 1);
8175
8284
  var import_xmldom3 = require("@xmldom/xmldom");
8285
+ init_utils();
8176
8286
  init_logger();
8177
8287
  var MAX_DECOMPRESS_SIZE4 = 500 * 1024 * 1024;
8178
8288
  function getChildElements(parent, localName) {
@@ -8652,6 +8762,7 @@ async function parseDocxDocument(buffer, options, existingZip) {
8652
8762
  }
8653
8763
 
8654
8764
  // src/index.ts
8765
+ init_utils();
8655
8766
  init_cli_provider();
8656
8767
  init_markdown_to_blocks();
8657
8768
  init_logger();
@@ -11153,6 +11264,187 @@ async function markdownToXlsx(markdown, options) {
11153
11264
  return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
11154
11265
  }
11155
11266
 
11267
+ // src/convert/index.ts
11268
+ var import_promises2 = require("fs/promises");
11269
+ init_utils();
11270
+
11271
+ // src/convert/libreoffice.ts
11272
+ var import_libreoffice_convert = __toESM(require("libreoffice-convert"), 1);
11273
+
11274
+ // src/convert/error.ts
11275
/**
 * Error raised by the document-to-PDF conversion helpers.
 *
 * `code` is a short machine-readable identifier (the visible call sites use
 * "SOFFICE_NOT_FOUND", "TIMEOUT" and "CONVERT_FAILED"); `message` is the
 * human-readable description.
 */
var ConvertError = class extends Error {
  constructor(code, message) {
    super(message);
    // Same assignment order as before: code first, then name.
    Object.assign(this, { code, name: "ConvertError" });
  }
};
11282
+
11283
+ // src/convert/libreoffice.ts
11284
+ var libreConvert = import_libreoffice_convert.default.convert;
11285
/**
 * Verifies that the LibreOffice CLI (`soffice`) can be launched.
 *
 * Runs `soffice --version` and resolves when the process exits successfully.
 *
 * Fix: the previous implementation pulled `runCommand` out of the utils
 * module, which does not export such a function. The call therefore always
 * threw a TypeError, which the catch below turned into SOFFICE_NOT_FOUND —
 * conversion could never proceed. The probe now uses Node's own
 * `child_process.execFile`.
 *
 * @returns {Promise<void>}
 * @throws {ConvertError} code "SOFFICE_NOT_FOUND" when the probe cannot be
 *   spawned, exits with a non-zero status, or does not finish in time.
 */
async function assertSofficeAvailable() {
  // Upper bound for the probe so a wedged soffice cannot block callers forever.
  const PROBE_TIMEOUT_MS = 15e3;
  try {
    // Loaded lazily (as the original did with its helper) so the dependency is
    // only touched when a conversion is actually requested.
    const { execFile } = await import("child_process");
    await new Promise((resolve, reject) => {
      execFile(
        "soffice",
        ["--version"],
        { timeout: PROBE_TIMEOUT_MS, windowsHide: true },
        (err) => (err ? reject(err) : resolve())
      );
    });
  } catch {
    throw new ConvertError(
      "SOFFICE_NOT_FOUND",
      "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694."
    );
  }
}
11296
/**
 * Converts a document buffer to `targetExt` through libreoffice-convert,
 * rejecting if no result arrives within `timeoutMs`.
 *
 * The timeout only rejects the returned promise; the underlying conversion
 * is not cancelled by this function.
 *
 * @param {Buffer} buffer - Source document bytes.
 * @param {string} targetExt - Target extension passed to the converter (e.g. ".pdf").
 * @param {number} [timeoutMs=60000] - Time budget in milliseconds.
 * @returns {Promise<Buffer>} Converted bytes as delivered by the converter callback.
 * @throws {ConvertError} "TIMEOUT" when the budget elapses, "CONVERT_FAILED"
 *   when the converter reports an error or yields no output.
 */
async function convertBuffer(buffer, targetExt, timeoutMs = 6e4) {
  return new Promise((fulfil, fail) => {
    const onTimeout = () => {
      fail(
        new ConvertError("TIMEOUT", `\uBCC0\uD658 \uD0C0\uC784\uC544\uC6C3 (${timeoutMs}ms \uCD08\uACFC)`)
      );
    };
    const watchdog = setTimeout(onTimeout, timeoutMs);

    const onConverted = (err, done) => {
      clearTimeout(watchdog);
      if (!err && done) {
        fulfil(done);
        return;
      }
      fail(
        new ConvertError(
          "CONVERT_FAILED",
          err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"
        )
      );
    };

    libreConvert(buffer, targetExt, void 0, onConverted);
  });
}
11318
+
11319
+ // src/convert/index.ts
11320
// True while some caller holds the conversion lock.
var isConverting = false;
// Waiters, in arrival order; each entry grants the lock to one pending caller.
var queue = [];

/**
 * Acquires the module-wide conversion lock so only one conversion runs at a time.
 *
 * Resolves immediately when the lock is free; otherwise the caller is queued
 * and resolved in FIFO order as earlier holders release.
 *
 * @returns {Promise<() => void>} A release function. Calling it frees the
 *   lock and hands it to the next queued waiter, if any.
 */
async function acquireConvertLock() {
  // Shared release routine: free the lock, then wake the oldest waiter.
  const release = () => {
    isConverting = false;
    const wakeNext = queue.shift();
    wakeNext?.();
  };

  if (isConverting) {
    return new Promise((grant) => {
      queue.push(() => {
        isConverting = true;
        grant(release);
      });
    });
  }

  isConverting = true;
  return release;
}
11342
/**
 * Converts an HWP or HWPX document to PDF via LibreOffice.
 *
 * Steps: load the input into a Buffer, enforce the 500MB size cap, detect
 * the format (only "hwp" and "hwpx" are accepted), verify `soffice` is
 * available, then run the conversion while holding the module-wide
 * conversion lock.
 *
 * Expected failures are reported through the returned object rather than
 * thrown; only a non-ConvertError raised by the soffice check is rethrown.
 *
 * @param {string|Buffer|ArrayBuffer|Uint8Array} input - A file path (string),
 *   a Buffer, or anything `Buffer.from` accepts.
 * @param {{timeoutMs?: number, onProgress?: (percent: number, stage: string) => void}} [options]
 * @returns {Promise<object>} `{ success: true, pdf, sourceFormat }` on success,
 *   otherwise `{ success: false, code, error, stage }`.
 */
async function convertToPdf(input, options) {
  // Builds the failure result shape shared by every error branch below.
  const failure = (code, error, stage) => ({ success: false, code, error, stage });

  let buffer;
  try {
    if (typeof input === "string") {
      buffer = await (0, import_promises2.readFile)(input);
    } else {
      buffer = Buffer.isBuffer(input) ? input : Buffer.from(input);
    }
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return failure("PARSE_ERROR", `\uC785\uB825 \uC77D\uAE30 \uC2E4\uD328: ${reason}`, "detect");
  }

  const MAX_FILE_SIZE = 500 * 1024 * 1024;
  if (buffer.length > MAX_FILE_SIZE) {
    const sizeMb = (buffer.length / 1024 / 1024).toFixed(1);
    return failure("FILE_TOO_LARGE", `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${sizeMb}MB (\uCD5C\uB300 500MB)`, "detect");
  }

  const format = detectFormat(toArrayBuffer(buffer));
  const isSupported = format === "hwp" || format === "hwpx";
  if (!isSupported) {
    return failure("UNSUPPORTED_FORMAT", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD3EC\uB9F7\uC785\uB2C8\uB2E4: ${format}`, "detect");
  }

  try {
    await assertSofficeAvailable();
  } catch (err) {
    if (!(err instanceof ConvertError)) throw err;
    return failure(err.code, err.message, "validate");
  }

  // Conversions are serialised; the lock is always released in `finally`.
  const releaseLock = await acquireConvertLock();
  try {
    options?.onProgress?.(10, "convert");
    const pdf = await convertBuffer(buffer, ".pdf", options?.timeoutMs);
    options?.onProgress?.(100, "done");
    return { success: true, pdf: new Uint8Array(pdf), sourceFormat: format };
  } catch (err) {
    if (err instanceof ConvertError) {
      return failure(err.code, err.message, "convert");
    }
    return failure(
      classifyError(err),
      err instanceof Error ? err.message : "\uBCC0\uD658 \uC2E4\uD328",
      "convert"
    );
  } finally {
    releaseLock();
  }
}
11420
/**
 * Converts a document to PDF and additionally requires the detected source
 * format to be "hwp".
 *
 * Delegates to `convertToPdf`; failures from it are passed through unchanged.
 * NOTE(review): the format check happens after the conversion has already
 * run, so an HWPX input is fully converted before being rejected here.
 *
 * @param input - Same input as `convertToPdf`.
 * @param [options] - Same options as `convertToPdf`.
 * @returns {Promise<object>} The `convertToPdf` result, or an
 *   UNSUPPORTED_FORMAT failure when the source was not HWP.
 */
async function convertHwpToPdf(input, options) {
  const outcome = await convertToPdf(input, options);
  const wrongFormat = outcome.success && outcome.sourceFormat !== "hwp";
  if (!wrongFormat) {
    return outcome;
  }
  return {
    success: false,
    code: "UNSUPPORTED_FORMAT",
    error: `HWP 5.x \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${outcome.sourceFormat}`,
    stage: "detect"
  };
}
11432
/**
 * Converts a document to PDF and additionally requires the detected source
 * format to be "hwpx".
 *
 * Delegates to `convertToPdf`; failures from it are passed through unchanged.
 * NOTE(review): the format check happens after the conversion has already
 * run, so an HWP input is fully converted before being rejected here.
 *
 * @param input - Same input as `convertToPdf`.
 * @param [options] - Same options as `convertToPdf`.
 * @returns {Promise<object>} The `convertToPdf` result, or an
 *   UNSUPPORTED_FORMAT failure when the source was not HWPX.
 */
async function convertHwpxToPdf(input, options) {
  const outcome = await convertToPdf(input, options);
  const wrongFormat = outcome.success && outcome.sourceFormat !== "hwpx";
  if (!wrongFormat) {
    return outcome;
  }
  return {
    success: false,
    code: "UNSUPPORTED_FORMAT",
    error: `HWPX \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${outcome.sourceFormat}`,
    stage: "detect"
  };
}
11444
+
11445
+ // src/index.ts
11446
+ init_utils();
11447
+
11156
11448
  // src/ocr/api-key-rotation.ts
11157
11449
  var AllKeysCoolingDownError = class extends Error {
11158
11450
  waitMs;
@@ -11247,11 +11539,10 @@ var ApiKeyRotationPool = class _ApiKeyRotationPool {
11247
11539
  };
11248
11540
 
11249
11541
  // src/pipeline/unified-ocr.ts
11250
- var import_promises2 = require("fs/promises");
11542
+ var import_promises3 = require("fs/promises");
11251
11543
  var import_path5 = require("path");
11252
11544
  var import_child_process4 = require("child_process");
11253
11545
  var import_node_perf_hooks = require("perf_hooks");
11254
- var import_libreoffice_convert = __toESM(require("libreoffice-convert"), 1);
11255
11546
  init_logger();
11256
11547
 
11257
11548
  // src/pipeline/bounded-queue.ts
@@ -11313,7 +11604,6 @@ var BoundedQueue = class {
11313
11604
  };
11314
11605
 
11315
11606
  // src/pipeline/unified-ocr.ts
11316
- var libreConvert = import_libreoffice_convert.default.convert;
11317
11607
  var UnifiedOcrError = class extends Error {
11318
11608
  code;
11319
11609
  stage;
@@ -11407,9 +11697,9 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11407
11697
  const keyPool = ApiKeyRotationPool.fromEnv();
11408
11698
  const runId = options.runId ?? generateRunId("ocr");
11409
11699
  const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
11410
- await (0, import_promises2.mkdir)(imagesDir, { recursive: true });
11411
- await (0, import_promises2.mkdir)(rawDir, { recursive: true });
11412
- await (0, import_promises2.mkdir)(diffDir, { recursive: true });
11700
+ await (0, import_promises3.mkdir)(imagesDir, { recursive: true });
11701
+ await (0, import_promises3.mkdir)(rawDir, { recursive: true });
11702
+ await (0, import_promises3.mkdir)(diffDir, { recursive: true });
11413
11703
  const timingsMs = {};
11414
11704
  const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
11415
11705
  const markStageProgress = (stage, stagePercent, current, total, message, model) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message, model });
@@ -11428,9 +11718,9 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11428
11718
  if ((0, import_path5.extname)(absInput).toLowerCase() !== ".pdf") {
11429
11719
  await assertSofficeAvailable();
11430
11720
  workingPdfPath = (0, import_path5.join)(workspaceDir, `${stem}.pdf`);
11431
- const inputBuffer = await (0, import_promises2.readFile)(absInput);
11432
- const out = await convertWithLibreOffice(inputBuffer, ".pdf");
11433
- await (0, import_promises2.writeFile)(workingPdfPath, out);
11721
+ const inputBuffer = await (0, import_promises3.readFile)(absInput);
11722
+ const out = await convertBuffer(inputBuffer, ".pdf");
11723
+ await (0, import_promises3.writeFile)(workingPdfPath, out);
11434
11724
  }
11435
11725
  timingsMs.convert = elapsedMs(convertStart);
11436
11726
  markStageDone("convert", "PDF \uBCC0\uD658 \uC644\uB8CC");
@@ -11442,7 +11732,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11442
11732
  markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
11443
11733
  logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi, totalPages });
11444
11734
  await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, (0, import_path5.join)(imagesDir, "page")]);
11445
- const firstFiles = (await (0, import_promises2.readdir)(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
11735
+ const firstFiles = (await (0, import_promises3.readdir)(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
11446
11736
  if (firstFiles.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uCCAB \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328");
11447
11737
  const probeImage = (0, import_path5.join)(imagesDir, firstFiles[0]);
11448
11738
  markStageProgress("render", Math.round(1 / totalPages * 100), 1, totalPages, `\uD398\uC774\uC9C0 1/${totalPages} \uB80C\uB354\uB9C1`);
@@ -11480,7 +11770,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11480
11770
  const keyCount = keyPool.snapshot().length;
11481
11771
  const workerCount = Math.max(1, keyCount * concurrencyPerKey);
11482
11772
  const queueCapacity = workerCount * 2;
11483
- const queue = new BoundedQueue(queueCapacity);
11773
+ const queue2 = new BoundedQueue(queueCapacity);
11484
11774
  const ocrStart = import_node_perf_hooks.performance.now();
11485
11775
  currentStage = "ocr";
11486
11776
  markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (\uC6CC\uCEE4 ${workerCount}\uAC1C)`);
@@ -11488,17 +11778,17 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11488
11778
  let renderDone = 1;
11489
11779
  const renderProducer = (async () => {
11490
11780
  try {
11491
- await queue.enqueue({ pageNumber: 1, imagePath: probeImage });
11781
+ await queue2.enqueue({ pageNumber: 1, imagePath: probeImage });
11492
11782
  if (totalPages > 1) {
11493
11783
  for await (const item of renderPdfToPngStream(workingPdfPath, (0, import_path5.join)(imagesDir, "page"), dpi, totalPages, 2)) {
11494
- await queue.enqueue(item);
11784
+ await queue2.enqueue(item);
11495
11785
  renderDone++;
11496
11786
  markStageProgress("render", Math.round(renderDone / totalPages * 100), renderDone, totalPages, `\uD398\uC774\uC9C0 ${renderDone}/${totalPages} \uB80C\uB354\uB9C1`);
11497
11787
  logStage("debug", "render", "progress", "\uD398\uC774\uC9C0 \uB80C\uB354 \uC644\uB8CC", { page: item.pageNumber });
11498
11788
  }
11499
11789
  }
11500
11790
  } finally {
11501
- queue.close();
11791
+ queue2.close();
11502
11792
  timingsMs.render = elapsedMs(renderStart);
11503
11793
  markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
11504
11794
  logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: renderDone, elapsedMs: timingsMs.render });
@@ -11507,7 +11797,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11507
11797
  const [, pageResultsMap] = await Promise.all([
11508
11798
  renderProducer,
11509
11799
  ocrWorkerPool({
11510
- queue,
11800
+ queue: queue2,
11511
11801
  workerCount,
11512
11802
  totalPages,
11513
11803
  ocrInput: {
@@ -11541,7 +11831,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11541
11831
  const rawPagePaths = [];
11542
11832
  for (const [pageNum, markdown] of sortedEntries) {
11543
11833
  const pagePath = (0, import_path5.join)(rawDir, `page_${String(pageNum).padStart(4, "0")}.md`);
11544
- await (0, import_promises2.writeFile)(pagePath, markdown, "utf-8");
11834
+ await (0, import_promises3.writeFile)(pagePath, markdown, "utf-8");
11545
11835
  rawPagePaths.push(pagePath);
11546
11836
  }
11547
11837
  const mergeStart = import_node_perf_hooks.performance.now();
@@ -11549,7 +11839,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11549
11839
  markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
11550
11840
  logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
11551
11841
  const merged = await mergeMarkdownPages(rawPagePaths);
11552
- await (0, import_promises2.writeFile)(outputPath, merged, "utf-8");
11842
+ await (0, import_promises3.writeFile)(outputPath, merged, "utf-8");
11553
11843
  timingsMs.merge = elapsedMs(mergeStart);
11554
11844
  markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
11555
11845
  logStage("info", "merge", "done", "\uCD5C\uC885 \uBCD1\uD569 \uC644\uB8CC", { outputPath, elapsedMs: timingsMs.merge });
@@ -11565,7 +11855,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11565
11855
  timingsMs,
11566
11856
  modelCachePath
11567
11857
  };
11568
- await (0, import_promises2.writeFile)(reportPath, JSON.stringify(report, null, 2), "utf-8");
11858
+ await (0, import_promises3.writeFile)(reportPath, JSON.stringify(report, null, 2), "utf-8");
11569
11859
  logStage("info", "finalize", "done", "run-report \uC800\uC7A5 \uC644\uB8CC", { reportPath });
11570
11860
  return { outputPath, reportPath, selectedModel };
11571
11861
  } catch (err) {
@@ -11627,17 +11917,6 @@ function emitProgress(cb, stage, stagePercent, weights, extra) {
11627
11917
  model: extra.model
11628
11918
  });
11629
11919
  }
11630
- async function convertWithLibreOffice(buffer, ext) {
11631
- return await new Promise((resolvePromise, reject) => {
11632
- libreConvert(buffer, ext, void 0, (err, done) => {
11633
- if (err || !done) {
11634
- reject(new UnifiedOcrError("CONVERT_FAILED", "convert", err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"));
11635
- return;
11636
- }
11637
- resolvePromise(done);
11638
- });
11639
- });
11640
- }
11641
11920
  async function getPdfPageCount(pdfPath) {
11642
11921
  const stdout = await runCommandWithStdout("pdfinfo", [pdfPath]);
11643
11922
  const m = stdout.match(/^\s*Pages:\s*(\d+)\s*$/mi);
@@ -11665,7 +11944,7 @@ async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, start
11665
11944
  pdfPath,
11666
11945
  prefixPath
11667
11946
  ]);
11668
- const files = await (0, import_promises2.readdir)(imagesDir);
11947
+ const files = await (0, import_promises3.readdir)(imagesDir);
11669
11948
  const pageFiles = files.filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
11670
11949
  const imagePath = (0, import_path5.join)(imagesDir, pageFiles[pageFiles.length - 1]);
11671
11950
  yield { pageNumber: page, imagePath };
@@ -11710,13 +11989,6 @@ async function runCommandWithStdout(cmd, args) {
11710
11989
  });
11711
11990
  });
11712
11991
  }
11713
- async function assertSofficeAvailable() {
11714
- try {
11715
- await runCommand("soffice", ["--version"]);
11716
- } catch {
11717
- throw new UnifiedOcrError("SOFFICE_NOT_FOUND", "convert", "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.");
11718
- }
11719
- }
11720
11992
  function naturalPageSort(a, b) {
11721
11993
  const na = Number((a.match(/\d+/g) || ["0"]).at(-1) || 0);
11722
11994
  const nb = Number((b.match(/\d+/g) || ["0"]).at(-1) || 0);
@@ -11790,7 +12062,7 @@ function startParallelProbeRuns(input) {
11790
12062
  }
11791
12063
  async function loadModelCache(path) {
11792
12064
  try {
11793
- const raw = await (0, import_promises2.readFile)(path, "utf-8");
12065
+ const raw = await (0, import_promises3.readFile)(path, "utf-8");
11794
12066
  return JSON.parse(raw);
11795
12067
  } catch {
11796
12068
  return null;
@@ -11821,15 +12093,15 @@ async function updateModelCache(path, probes) {
11821
12093
  }
11822
12094
  }
11823
12095
  current.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
11824
- await (0, import_promises2.writeFile)(path, JSON.stringify(current, null, 2), "utf-8");
12096
+ await (0, import_promises3.writeFile)(path, JSON.stringify(current, null, 2), "utf-8");
11825
12097
  }
11826
12098
  async function ocrWorkerPool(input) {
11827
- const { queue, workerCount, ocrInput, onPageDone } = input;
12099
+ const { queue: queue2, workerCount, ocrInput, onPageDone } = input;
11828
12100
  const results = /* @__PURE__ */ new Map();
11829
12101
  let completedCount = 0;
11830
12102
  async function worker() {
11831
12103
  while (true) {
11832
- const item = await queue.dequeue();
12104
+ const item = await queue2.dequeue();
11833
12105
  if (item === QUEUE_DONE) break;
11834
12106
  const { pageNumber, imagePath, error } = item;
11835
12107
  if (imagePath === null) {
@@ -11881,7 +12153,7 @@ async function ocrImageWithFallback(input) {
11881
12153
  async function mergeMarkdownPages(paths) {
11882
12154
  const out = [];
11883
12155
  for (let i = 0; i < paths.length; i++) {
11884
- const txt = (await (0, import_promises2.readFile)(paths[i], "utf-8")).trim();
12156
+ const txt = (await (0, import_promises3.readFile)(paths[i], "utf-8")).trim();
11885
12157
  if (!txt) continue;
11886
12158
  out.push(txt);
11887
12159
  }
@@ -11997,7 +12269,7 @@ async function ocrImageViaNim(input) {
11997
12269
  throw new UnifiedOcrError("OCR_FAILED", "ocr", `OCR \uC7AC\uC2DC\uB3C4 \uCD08\uACFC: ${lastErr}`);
11998
12270
  }
11999
12271
  async function encodeBase64(path) {
12000
- const b = await (0, import_promises2.readFile)(path);
12272
+ const b = await (0, import_promises3.readFile)(path);
12001
12273
  return b.toString("base64");
12002
12274
  }
12003
12275
  function stripCodeFence3(text) {
@@ -12036,7 +12308,7 @@ async function parse2(input, options) {
12036
12308
  let buffer;
12037
12309
  if (typeof input === "string") {
12038
12310
  try {
12039
- const buf = await (0, import_promises3.readFile)(input);
12311
+ const buf = await (0, import_promises4.readFile)(input);
12040
12312
  buffer = toArrayBuffer(buf);
12041
12313
  } catch (err) {
12042
12314
  const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
@@ -12196,6 +12468,9 @@ async function parseDocx(buffer, options, zip) {
12196
12468
  VERSION,
12197
12469
  blocksToMarkdown,
12198
12470
  compare,
12471
+ convertHwpToPdf,
12472
+ convertHwpxToPdf,
12473
+ convertToPdf,
12199
12474
  detectFormat,
12200
12475
  detectZipFormat,
12201
12476
  diffBlocks,