@clazic/kordoc 2.5.2 → 2.6.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -33,6 +33,118 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
33
33
  ));
34
34
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
35
35
 
36
// Export table for the shared utilities module (src/utils.ts). Each entry is a
// thunk, so the bindings are looked up at access time; they are only assigned
// once init_utils() has run.
// NOTE(review): __export is defined outside this view — presumably it installs
// the thunks as getters on utils_exports; confirm against the bundler prelude.
// NOTE(review): `runCommand` is NOT listed here, yet findInPath() and
// assertSofficeAvailable() destructure `runCommand` from this object.
+ // src/utils.ts
37
+ var utils_exports = {};
38
+ __export(utils_exports, {
39
+ KordocError: () => KordocError,
40
+ VERSION: () => VERSION,
41
+ classifyError: () => classifyError,
42
+ isPathTraversal: () => isPathTraversal,
43
+ normalizeKordocError: () => normalizeKordocError,
44
+ precheckZipSize: () => precheckZipSize,
45
+ sanitizeError: () => sanitizeError,
46
+ sanitizeHref: () => sanitizeHref,
47
+ toArrayBuffer: () => toArrayBuffer
48
+ });
49
/**
 * Returns an ArrayBuffer that holds exactly the bytes visible through `buf`.
 *
 * When the view spans its whole backing buffer the backing buffer itself is
 * returned (no copy); otherwise the viewed window is copied with `slice`.
 *
 * @param {{buffer: ArrayBuffer, byteOffset: number, byteLength: number}} buf
 *   A Buffer / typed-array style view.
 * @returns {ArrayBuffer} The backing buffer or a copy of the viewed window.
 */
function toArrayBuffer(buf) {
  const { buffer: backing, byteOffset: start, byteLength: size } = buf;
  const spansWholeBacking = start === 0 && size === backing.byteLength;
  return spansWholeBacking ? backing : backing.slice(start, start + size);
}
55
/**
 * Produces a message that is safe to show to callers.
 *
 * Only KordocError messages are passed through; anything else is replaced by
 * a fixed generic Korean message so unexpected error text is not exposed.
 *
 * @param {unknown} err - The caught value.
 * @returns {string} The KordocError message or the generic fallback.
 */
function sanitizeError(err) {
  const isKnownError = err instanceof KordocError;
  return isKnownError
    ? err.message
    : "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
}
59
/**
 * Reports whether an archive entry name looks like a path-traversal attempt.
 *
 * A name is rejected when it contains a NUL character, contains ".." anywhere,
 * starts with "/", or starts with a drive-letter prefix such as "C:".
 * Backslashes are treated as forward slashes before the checks.
 *
 * @param {string} name - Entry name to inspect.
 * @returns {boolean} True when the name must be treated as unsafe.
 */
function isPathTraversal(name) {
  if (name.indexOf("\0") !== -1) return true;
  const slashed = name.split("\\").join("/");
  if (slashed.indexOf("..") !== -1) return true;
  if (slashed.charAt(0) === "/") return true;
  return /^[A-Za-z]:/.test(slashed);
}
64
/**
 * Inspects a ZIP central directory, without decompressing anything, to reject
 * archives whose declared contents are too large or too numerous.
 *
 * The walk covers every central-directory header found inside the declared
 * directory span, not just the first N headers announced by the 16-bit entry
 * count in the end-of-central-directory (EOCD) record. That count comes from
 * the archive itself, so an archive that under-declares it would otherwise
 * keep additional entries out of both limits.
 *
 * @param {ArrayBuffer} buffer - Raw archive bytes.
 * @param {number} [maxUncompressedSize=104857600] - Limit on the summed
 *   declared uncompressed sizes, in bytes.
 * @param {number} [maxEntries=500] - Limit on the number of entries.
 * @returns {{totalUncompressed: number, entryCount: number}} Summed declared
 *   size and the larger of the declared and walked entry counts. Zeros are
 *   returned when no EOCD record is found or the structure cannot be read.
 * @throws {KordocError} When either limit is exceeded.
 */
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
  const EOCD_SIGNATURE = 101010256; // 0x06054b50
  const CD_HEADER_SIGNATURE = 33639248; // 0x02014b50
  const EOCD_MIN_SIZE = 22;
  const CD_HEADER_FIXED_SIZE = 46;
  try {
    const data = new DataView(buffer);
    const len = buffer.byteLength;

    // The EOCD record sits at the end, followed by a comment of at most 65535 bytes.
    let eocdOffset = -1;
    const lowestCandidate = Math.max(0, len - 65557);
    for (let i = len - EOCD_MIN_SIZE; i >= lowestCandidate; i--) {
      if (data.getUint32(i, true) === EOCD_SIGNATURE) {
        eocdOffset = i;
        break;
      }
    }
    if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };

    const declaredCount = data.getUint16(eocdOffset + 10, true);
    if (declaredCount > maxEntries) {
      throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${declaredCount} (\uCD5C\uB300 ${maxEntries})`);
    }
    const cdSize = data.getUint32(eocdOffset + 12, true);
    const cdOffset = data.getUint32(eocdOffset + 16, true);
    const cdEnd = cdOffset + cdSize;
    if (cdEnd > len) return { totalUncompressed: 0, entryCount: declaredCount };

    let totalUncompressed = 0;
    let walkedCount = 0;
    let pos = cdOffset;
    // Each iteration advances by at least 46 bytes, so the walk is bounded by cdSize.
    while (pos + CD_HEADER_FIXED_SIZE <= cdEnd && data.getUint32(pos, true) === CD_HEADER_SIGNATURE) {
      walkedCount++;
      if (walkedCount > maxEntries) {
        throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${walkedCount} (\uCD5C\uB300 ${maxEntries})`);
      }
      totalUncompressed += data.getUint32(pos + 24, true);
      const nameLen = data.getUint16(pos + 28, true);
      const extraLen = data.getUint16(pos + 30, true);
      const commentLen = data.getUint16(pos + 32, true);
      pos += CD_HEADER_FIXED_SIZE + nameLen + extraLen + commentLen;
    }
    if (totalUncompressed > maxUncompressedSize) {
      throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
    }
    return { totalUncompressed, entryCount: Math.max(declaredCount, walkedCount) };
  } catch (err) {
    // Limit violations propagate; structural read failures are treated as "nothing to report".
    if (err instanceof KordocError) throw err;
    return { totalUncompressed: 0, entryCount: 0 };
  }
}
102
/**
 * Validates a hyperlink target against the allow-list in SAFE_HREF_RE.
 *
 * @param {string} href - Raw link target.
 * @returns {string|null} The trimmed href when it is non-empty and matches
 *   SAFE_HREF_RE, otherwise null.
 */
function sanitizeHref(href) {
  const candidate = href.trim();
  if (candidate.length === 0) return null;
  return SAFE_HREF_RE.test(candidate) ? candidate : null;
}
107
/**
 * Maps an error to a coarse error code by looking for known substrings in its
 * message. Rules are evaluated in order and the first match wins.
 *
 * @param {unknown} err - The caught value.
 * @returns {string} One of "ENCRYPTED", "DRM_PROTECTED", "ZIP_BOMB",
 *   "DECOMPRESSION_BOMB", "IMAGE_BASED_PDF", "NO_SECTIONS", "CORRUPTED" or
 *   "PARSE_ERROR" (the default, also used for non-Error values).
 */
function classifyError(err) {
  if (!(err instanceof Error)) return "PARSE_ERROR";
  const text = err.message;
  const mentions = (...needles) => needles.some((needle) => text.includes(needle));
  // Order matters: the ZIP-specific rule must run before the generic bomb/size rule.
  const rules = [
    ["ENCRYPTED", () => mentions("\uC554\uD638\uD654")],
    ["DRM_PROTECTED", () => mentions("DRM")],
    ["ZIP_BOMB", () => mentions("ZIP bomb", "ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC", "ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")],
    ["DECOMPRESSION_BOMB", () => mentions("bomb", "\uD06C\uAE30 \uCD08\uACFC", "\uC555\uCD95 \uD574\uC81C")],
    ["IMAGE_BASED_PDF", () => mentions("\uC774\uBBF8\uC9C0 \uAE30\uBC18")],
    ["NO_SECTIONS", () => mentions("\uC139\uC158") && mentions("\uCC3E\uC744 \uC218 \uC5C6", "\uC5C6\uC74C")],
    ["CORRUPTED", () => mentions("\uC2DC\uADF8\uB2C8\uCC98", "\uBCF5\uAD6C\uD560 \uC218 \uC5C6")],
  ];
  for (const [code, applies] of rules) {
    if (applies()) return code;
  }
  return "PARSE_ERROR";
}
119
/**
 * Coerces any thrown value into a KordocError carrying a code and a stage.
 *
 * An existing KordocError is returned as-is after its missing `stage` / `code`
 * are filled in (the instance is mutated). Other Error instances keep their
 * message and are classified with classifyError; non-Error values get the
 * fallback message and fallback code.
 *
 * @param {unknown} err - The caught value.
 * @param {string} fallbackMessage - Message used when `err` has none.
 * @param {string} [stage="unknown"] - Stage to record.
 * @param {string} [fallbackCode="PARSE_ERROR"] - Code used when none can be derived.
 * @returns {KordocError} The normalized error.
 */
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
  if (!(err instanceof KordocError)) {
    const isNativeError = err instanceof Error;
    const message = isNativeError ? err.message : fallbackMessage;
    const code = isNativeError ? classifyError(err) : fallbackCode;
    return new KordocError(message || fallbackMessage, { code, stage });
  }
  if (!err.stage) err.stage = stage;
  if (!err.code) err.code = fallbackCode;
  return err;
}
129
// Module-level bindings of src/utils.ts. They are declared here and only
// assigned inside the initializer below, so they stay undefined until
// init_utils() has been called.
// NOTE(review): __esm is defined outside this view — presumably it wraps the
// initializer so it runs once on the first init_utils() call; confirm.
+ var VERSION, KordocError, SAFE_HREF_RE;
130
+ var init_utils = __esm({
131
+ "src/utils.ts"() {
132
+ "use strict";
// NOTE(review): the diff header names this release 2.6.1 while the literal
// reads "2.6.0" — confirm the build-time version define.
133
+ VERSION = true ? "2.6.0" : "0.0.0-dev";
// Library error type: an Error named "KordocError" with optional `code` and
// `stage` fields taken from the options object.
134
+ KordocError = class extends Error {
135
+ code;
136
+ stage;
137
+ constructor(message, opts = {}) {
138
+ super(message);
139
+ this.name = "KordocError";
140
+ this.code = opts.code;
141
+ this.stage = opts.stage;
142
+ }
143
+ };
// Allow-list used by sanitizeHref: http(s):, mailto:, tel: and in-document
// "#" targets, matched case-insensitively at the start of the string.
144
+ SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
145
+ }
146
+ });
147
+
36
148
  // src/page-range.ts
37
149
  var page_range_exports = {};
38
150
  __export(page_range_exports, {
@@ -3062,6 +3174,9 @@ __export(index_exports, {
3062
3174
  VERSION: () => VERSION,
3063
3175
  blocksToMarkdown: () => blocksToMarkdown,
3064
3176
  compare: () => compare,
3177
+ convertHwpToPdf: () => convertHwpToPdf,
3178
+ convertHwpxToPdf: () => convertHwpxToPdf,
3179
+ convertToPdf: () => convertToPdf,
3065
3180
  detectFormat: () => detectFormat,
3066
3181
  detectZipFormat: () => detectZipFormat,
3067
3182
  diffBlocks: () => diffBlocks,
@@ -3081,7 +3196,7 @@ __export(index_exports, {
3081
3196
  runUnifiedOcrPipeline: () => runUnifiedOcrPipeline
3082
3197
  });
3083
3198
  module.exports = __toCommonJS(index_exports);
3084
- var import_promises3 = require("fs/promises");
3199
+ var import_promises5 = require("fs/promises");
3085
3200
 
3086
3201
  // src/detect.ts
3087
3202
  var import_jszip = __toESM(require("jszip"), 1);
@@ -3133,97 +3248,8 @@ async function detectZipFormat(buffer) {
3133
3248
  var import_jszip2 = __toESM(require("jszip"), 1);
3134
3249
  var import_xmldom = require("@xmldom/xmldom");
3135
3250
 
3136
- // src/utils.ts
3137
- var VERSION = true ? "2.5.1" : "0.0.0-dev";
3138
- function toArrayBuffer(buf) {
3139
- if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
3140
- return buf.buffer;
3141
- }
3142
- return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
3143
- }
3144
- var KordocError = class extends Error {
3145
- code;
3146
- stage;
3147
- constructor(message, opts = {}) {
3148
- super(message);
3149
- this.name = "KordocError";
3150
- this.code = opts.code;
3151
- this.stage = opts.stage;
3152
- }
3153
- };
3154
- function isPathTraversal(name) {
3155
- if (name.includes("\0")) return true;
3156
- const normalized = name.replace(/\\/g, "/");
3157
- return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
3158
- }
3159
- function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
3160
- try {
3161
- const data = new DataView(buffer);
3162
- const len = buffer.byteLength;
3163
- let eocdOffset = -1;
3164
- for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
3165
- if (data.getUint32(i, true) === 101010256) {
3166
- eocdOffset = i;
3167
- break;
3168
- }
3169
- }
3170
- if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
3171
- const entryCount = data.getUint16(eocdOffset + 10, true);
3172
- if (entryCount > maxEntries) {
3173
- throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
3174
- }
3175
- const cdSize = data.getUint32(eocdOffset + 12, true);
3176
- const cdOffset = data.getUint32(eocdOffset + 16, true);
3177
- if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
3178
- let totalUncompressed = 0;
3179
- let pos = cdOffset;
3180
- for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
3181
- if (data.getUint32(pos, true) !== 33639248) break;
3182
- totalUncompressed += data.getUint32(pos + 24, true);
3183
- const nameLen = data.getUint16(pos + 28, true);
3184
- const extraLen = data.getUint16(pos + 30, true);
3185
- const commentLen = data.getUint16(pos + 32, true);
3186
- pos += 46 + nameLen + extraLen + commentLen;
3187
- }
3188
- if (totalUncompressed > maxUncompressedSize) {
3189
- throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
3190
- }
3191
- return { totalUncompressed, entryCount };
3192
- } catch (err) {
3193
- if (err instanceof KordocError) throw err;
3194
- return { totalUncompressed: 0, entryCount: 0 };
3195
- }
3196
- }
3197
- var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
3198
- function sanitizeHref(href) {
3199
- const trimmed = href.trim();
3200
- if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
3201
- return trimmed;
3202
- }
3203
- function classifyError(err) {
3204
- if (!(err instanceof Error)) return "PARSE_ERROR";
3205
- const msg = err.message;
3206
- if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
3207
- if (msg.includes("DRM")) return "DRM_PROTECTED";
3208
- if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
3209
- if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
3210
- if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
3211
- if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
3212
- if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
3213
- return "PARSE_ERROR";
3214
- }
3215
- function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
3216
- if (err instanceof KordocError) {
3217
- if (!err.stage) err.stage = stage;
3218
- if (!err.code) err.code = fallbackCode;
3219
- return err;
3220
- }
3221
- const message = err instanceof Error ? err.message : fallbackMessage;
3222
- const code = err instanceof Error ? classifyError(err) : fallbackCode;
3223
- return new KordocError(message || fallbackMessage, { code, stage });
3224
- }
3225
-
3226
3251
  // src/table/builder.ts
3252
+ init_utils();
3227
3253
  var MAX_COLS = 200;
3228
3254
  var MAX_ROWS = 1e4;
3229
3255
  function buildTable(rows) {
@@ -3483,6 +3509,8 @@ var HEADING_RATIO_H2 = 1.3;
3483
3509
  var HEADING_RATIO_H3 = 1.15;
3484
3510
 
3485
3511
  // src/hwpx/parser.ts
3512
+ init_utils();
3513
+ init_utils();
3486
3514
  init_page_range();
3487
3515
  init_logger();
3488
3516
  var MAX_DECOMPRESS_SIZE = 500 * 1024 * 1024;
@@ -4325,6 +4353,7 @@ function extractTextFromNode(node) {
4325
4353
 
4326
4354
  // src/hwp5/record.ts
4327
4355
  var import_zlib = require("zlib");
4356
+ init_utils();
4328
4357
  var TAG_PARA_HEADER = 66;
4329
4358
  var TAG_PARA_TEXT = 67;
4330
4359
  var TAG_CHAR_SHAPE = 68;
@@ -5374,6 +5403,7 @@ function parseLenientCfb(data) {
5374
5403
  }
5375
5404
 
5376
5405
  // src/hwp5/parser.ts
5406
+ init_utils();
5377
5407
  init_page_range();
5378
5408
  init_logger();
5379
5409
  var CFB = __toESM(require_cfb(), 1);
@@ -6029,6 +6059,7 @@ function arrangeCells(rows, cols, cells) {
6029
6059
  }
6030
6060
 
6031
6061
  // src/pdf/parser.ts
6062
+ init_utils();
6032
6063
  init_page_range();
6033
6064
  var import_module = require("module");
6034
6065
  var import_path4 = require("path");
@@ -7922,6 +7953,7 @@ function mergeKoreanLines(text) {
7922
7953
  // src/xlsx/parser.ts
7923
7954
  var import_jszip3 = __toESM(require("jszip"), 1);
7924
7955
  var import_xmldom2 = require("@xmldom/xmldom");
7956
+ init_utils();
7925
7957
  init_logger();
7926
7958
  var MAX_SHEETS = 100;
7927
7959
  var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
@@ -8250,6 +8282,7 @@ async function parseXlsxDocument(buffer, options, existingZip) {
8250
8282
  // src/docx/parser.ts
8251
8283
  var import_jszip4 = __toESM(require("jszip"), 1);
8252
8284
  var import_xmldom3 = require("@xmldom/xmldom");
8285
+ init_utils();
8253
8286
  init_logger();
8254
8287
  var MAX_DECOMPRESS_SIZE4 = 500 * 1024 * 1024;
8255
8288
  function getChildElements(parent, localName) {
@@ -8729,6 +8762,7 @@ async function parseDocxDocument(buffer, options, existingZip) {
8729
8762
  }
8730
8763
 
8731
8764
  // src/index.ts
8765
+ init_utils();
8732
8766
  init_cli_provider();
8733
8767
  init_markdown_to_blocks();
8734
8768
  init_logger();
@@ -11230,6 +11264,481 @@ async function markdownToXlsx(markdown, options) {
11230
11264
  return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
11231
11265
  }
11232
11266
 
11267
+ // src/convert/index.ts
11268
+ var import_promises3 = require("fs/promises");
11269
+ init_utils();
11270
+
11271
+ // src/convert/libreoffice.ts
11272
+ var import_libreoffice_convert = __toESM(require("libreoffice-convert"), 1);
11273
+
11274
+ // src/convert/error.ts
11275
/**
 * Error raised by the PDF conversion helpers.
 *
 * Carries a machine-readable `code` next to the human-readable message; the
 * `name` is always "ConvertError".
 */
var ConvertError = class extends Error {
  /**
   * @param {string} code - Machine-readable failure code.
   * @param {string} message - Human-readable description.
   */
  constructor(code, message) {
    super(message);
    this.name = "ConvertError";
    this.code = code;
  }
};
11282
+
11283
+ // src/convert/installer.ts
11284
+ var import_os3 = require("os");
11285
+ var import_path5 = require("path");
11286
+ var import_promises2 = require("fs/promises");
11287
+ var import_fs4 = require("fs");
11288
+ var import_child_process4 = require("child_process");
11289
// Per-user cache directory that holds the auto-installed LibreOffice:
// <home>/.cache/kordoc/libreoffice.
+ var CACHE_DIR = (0, import_path5.join)((0, import_os3.homedir)(), ".cache", "kordoc", "libreoffice");
11290
// NOTE(review): VERSION_FILE is not referenced anywhere in the visible code —
// confirm whether it is still needed.
+ var VERSION_FILE = (0, import_path5.join)(CACHE_DIR, "version");
11291
// Download descriptors keyed by process.platform. `url` is the installer
// archive, `binPath` is the soffice binary relative to CACHE_DIR after
// installation, and `sizeMb` is the size that installForPlatform passes to the
// progress callback as the total (converted to bytes).
// NOTE(review): every URL targets x86_64 builds and a pinned 24.8.4 path —
// confirm behaviour on other architectures and that the URLs still resolve.
+ var PACKAGES = {
11292
+ darwin: {
11293
+ url: "https://download.documentfoundation.org/libreoffice/stable/24.8.4/mac/x86_64/LibreOffice_24.8.4_MacOS_x86-64.dmg",
11294
+ binPath: "LibreOffice.app/Contents/MacOS/soffice",
11295
+ sizeMb: 300
11296
+ },
11297
+ linux: {
11298
+ url: "https://download.documentfoundation.org/libreoffice/stable/24.8.4/deb/x86_64/LibreOffice_24.8.4_Linux_x86-64_deb.tar.gz",
11299
+ binPath: "opt/libreoffice24.8/program/soffice",
11300
+ sizeMb: 200
11301
+ },
11302
+ win32: {
11303
+ url: "https://download.documentfoundation.org/libreoffice/stable/24.8.4/win/x86_64/LibreOffice_24.8.4_Win_x86-64.msi",
11304
+ binPath: "LibreOffice/program/soffice.exe",
11305
+ sizeMb: 350
11306
+ }
11307
+ };
11308
/**
 * Checks whether a `soffice` executable is reachable through the system PATH.
 *
 * The probe runs `soffice --version` directly via child_process. The previous
 * implementation destructured `runCommand` from the utils export table, which
 * does not export it, so the probe always threw inside its `try` and a
 * system-wide LibreOffice was never detected.
 *
 * @returns {Promise<string|null>} "soffice" when the probe exits successfully,
 *   otherwise null (not installed, non-zero exit, or probe timeout).
 */
async function findInPath() {
  return new Promise((resolve) => {
    import_child_process4.execFile(
      "soffice",
      ["--version"],
      // Bounded so a wedged soffice cannot stall detection forever.
      { timeout: 15e3, windowsHide: true },
      (err) => resolve(err ? null : "soffice")
    );
  });
}
11317
/**
 * Looks for a previously auto-installed soffice binary inside CACHE_DIR.
 *
 * Two locations are checked, in order: the `bin/soffice` link created by
 * createSymlink, and the platform's own `binPath` from PACKAGES. The second
 * one matters because installWindows returns `CACHE_DIR/<binPath>` without
 * creating the `bin/soffice` link, so a Windows install would otherwise never
 * be found again.
 *
 * @returns {Promise<string|null>} The first accessible path, or null.
 */
async function findInCache() {
  const candidates = [import_path5.join(CACHE_DIR, "bin", "soffice")];
  const pkg = PACKAGES[process.platform];
  if (pkg) {
    candidates.push(import_path5.join(CACHE_DIR, pkg.binPath));
  }
  for (const candidate of candidates) {
    try {
      await import_promises2.access(candidate);
      return candidate;
    } catch {
      // Not present (or a dangling link) — try the next location.
    }
  }
  return null;
}
11326
/**
 * Streams `url` into the file `dest`, reporting progress after every chunk.
 *
 * Compared with a bare fetch-and-write loop this version
 *  - rejects non-2xx responses instead of saving an error page as the installer,
 *  - honours write-stream backpressure,
 *  - resolves only after the file has been flushed and closed, so callers can
 *    hand `dest` to another process right away,
 *  - reports the server's Content-Length as the total when one is sent.
 *
 * @param {string} url - Resource to download.
 * @param {string} dest - Destination file path.
 * @param {number} totalBytes - Expected size, used as the progress total when
 *   the response carries no usable Content-Length.
 * @param {(downloaded: number, total: number) => void} [onProgress]
 * @returns {Promise<void>}
 * @throws {Error} On HTTP errors, a missing body, network or file-system failures.
 */
async function downloadWithProgress(url, dest, totalBytes, onProgress) {
  const response = await fetch(url);
  if (!response.ok) {
    throw new Error(`\uB2E4\uC6B4\uB85C\uB4DC \uC2E4\uD328: HTTP ${response.status}`);
  }
  if (!response.body) throw new Error("\uB2E4\uC6B4\uB85C\uB4DC \uC2E4\uD328: response body \uC5C6\uC74C");

  const announced = Number(response.headers.get("content-length"));
  const expectedBytes = Number.isFinite(announced) && announced > 0 ? announced : totalBytes;

  const file = import_fs4.createWriteStream(dest);
  // Settles when the stream is closed (flushed) or fails.
  const closed = new Promise((resolve, reject) => {
    file.once("error", reject);
    file.once("close", resolve);
  });
  // Mark as handled: the rejection is observed through the awaits below when relevant.
  closed.catch(() => {});

  const reader = response.body.getReader();
  let downloaded = 0;
  try {
    while (true) {
      const { done, value } = await reader.read();
      if (done) break;
      if (!file.write(value)) {
        // Wait for the buffer to drain; a stream failure rejects through `closed`.
        await Promise.race([new Promise((resume) => file.once("drain", resume)), closed]);
      }
      downloaded += value.length;
      onProgress?.(downloaded, expectedBytes);
    }
    file.end();
    await closed;
  } catch (err) {
    file.destroy();
    await reader.cancel().catch(() => {});
    throw err;
  } finally {
    reader.releaseLock();
  }
}
11345
/**
 * Downloads the given package and runs the installer for the current platform.
 *
 * The platform is validated before anything is downloaded, and the download
 * itself runs inside the cleanup guard so a partially written file is removed
 * when either the download or the installation fails.
 *
 * @param {{url: string, binPath: string, sizeMb: number}} pkg - Entry of PACKAGES.
 * @param {(downloaded: number, total: number) => void} [onProgress]
 * @returns {Promise<string>} Path of the installed soffice binary.
 * @throws {ConvertError} "UNSUPPORTED_PLATFORM" when no installer exists.
 * @throws {Error} When the download or the installation fails.
 */
async function installForPlatform(pkg, onProgress) {
  const platform = process.platform;
  const installers = { darwin: installMacOS, linux: installLinux, win32: installWindows };
  const install = Object.hasOwn(installers, platform) ? installers[platform] : null;
  if (!install) {
    throw new ConvertError("UNSUPPORTED_PLATFORM", `${platform}\uC740 \uC790\uB3D9 \uC124\uCE58\uB97C \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4`);
  }
  await import_promises2.mkdir(CACHE_DIR, { recursive: true });
  const downloadPath = import_path5.join(CACHE_DIR, `download-${Date.now()}`);
  try {
    await downloadWithProgress(pkg.url, downloadPath, pkg.sizeMb * 1024 * 1024, onProgress);
    return await install(pkg, downloadPath);
  } catch (err) {
    // Best-effort cleanup; never let it mask the original failure.
    await import_promises2.rm(downloadPath, { force: true }).catch(() => {});
    throw err;
  }
}
11364
/**
 * Installs LibreOffice on macOS: mounts the downloaded .dmg, copies
 * LibreOffice.app into CACHE_DIR, detaches the image and links the binary.
 *
 * Child processes get an "error" handler so a missing tool rejects the
 * promise instead of raising an unhandled "error" event. A stale copy of the
 * app is removed first because `cp -R` into an existing directory would nest
 * the bundle inside it.
 *
 * @param {{binPath: string}} pkg - Entry of PACKAGES for darwin.
 * @param {string} downloadPath - Path of the downloaded .dmg.
 * @returns {Promise<string>} Path returned by createSymlink.
 * @throws {Error} When mounting or copying fails.
 */
async function installMacOS(pkg, downloadPath) {
  const run = (command, args, failureMessage) => new Promise((resolve, reject) => {
    const child = import_child_process4.spawn(command, args);
    child.on("error", (cause) => reject(new Error(failureMessage, { cause })));
    child.on("close", (code) => code === 0 ? resolve() : reject(new Error(failureMessage)));
  });

  const mountPoint = `/Volumes/LibreOffice_${Date.now()}`;
  await run("hdiutil", ["attach", "-nobrowse", "-mountpoint", mountPoint, downloadPath], "dmg \uB9C8\uC6B4\uD2B8 \uC2E4\uD328");
  try {
    const appSource = import_path5.join(mountPoint, "LibreOffice.app");
    const appDest = import_path5.join(CACHE_DIR, "LibreOffice.app");
    await import_promises2.rm(appDest, { recursive: true, force: true });
    await run("cp", ["-R", appSource, appDest], ".app \uBCF5\uC0AC \uC2E4\uD328");
  } finally {
    // Detach is best-effort: its outcome must not hide a copy failure.
    await run("hdiutil", ["detach", mountPoint], "dmg detach failed").catch(() => {});
  }
  await import_promises2.rm(downloadPath, { force: true });
  return await createSymlink(import_path5.join(CACHE_DIR, pkg.binPath));
}
11386
/**
 * Installs LibreOffice on Linux: unpacks the downloaded tarball, extracts
 * every .deb it contains into CACHE_DIR with `dpkg-deb -x`, and links the
 * binary.
 *
 * Extraction failures are no longer swallowed, the temporary directory is
 * always removed, and the function fails when the expected binary is missing
 * instead of returning a link to a file that does not exist.
 *
 * @param {{binPath: string}} pkg - Entry of PACKAGES for linux.
 * @param {string} downloadPath - Path of the downloaded tar.gz.
 * @returns {Promise<string>} Path returned by createSymlink.
 * @throws {Error} When unpacking or extraction fails or no binary results.
 */
async function installLinux(pkg, downloadPath) {
  const run = (command, args, failureMessage) => new Promise((resolve, reject) => {
    const child = import_child_process4.spawn(command, args);
    child.on("error", (cause) => reject(new Error(failureMessage, { cause })));
    child.on("close", (code) => code === 0 ? resolve() : reject(new Error(failureMessage)));
  });

  const extractDir = import_path5.join(CACHE_DIR, `extract-${Date.now()}`);

  // NOTE(review): the upstream tarball is believed to wrap DEBS/ in a versioned
  // top-level folder, so one level of nesting is searched as well — confirm.
  const findDebsDir = async () => {
    const candidates = [import_path5.join(extractDir, "DEBS")];
    const children = await import_promises2.readdir(extractDir, { withFileTypes: true });
    for (const child of children) {
      if (child.isDirectory()) {
        candidates.push(import_path5.join(extractDir, child.name, "DEBS"));
      }
    }
    for (const candidate of candidates) {
      try {
        await import_promises2.access(candidate);
        return candidate;
      } catch {
        // Try the next candidate.
      }
    }
    return null;
  };

  try {
    await import_promises2.mkdir(extractDir, { recursive: true });
    await run("tar", ["xzf", downloadPath, "-C", extractDir], "\uC555\uCD95 \uD574\uC81C \uC2E4\uD328");
    const debsDir = await findDebsDir();
    if (debsDir) {
      const entries = await import_promises2.readdir(debsDir);
      for (const entry of entries) {
        if (!entry.endsWith(".deb")) continue;
        await run("dpkg-deb", ["-x", import_path5.join(debsDir, entry), CACHE_DIR], `${entry} \uCD94\uCD9C \uC2E4\uD328`);
      }
    }
  } finally {
    await import_promises2.rm(extractDir, { recursive: true, force: true }).catch(() => {});
  }

  const actualBin = import_path5.join(CACHE_DIR, pkg.binPath);
  try {
    await import_promises2.access(actualBin);
  } catch (cause) {
    throw new Error("LibreOffice \uC124\uCE58 \uC2E4\uD328: soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4", { cause });
  }
  await import_promises2.rm(downloadPath, { force: true });
  return await createSymlink(actualBin);
}
11411
/**
 * Installs LibreOffice on Windows by running an administrative install
 * (`msiexec /a`) of the downloaded MSI into CACHE_DIR.
 *
 * The child process gets an "error" handler so a missing `msiexec` rejects
 * the promise instead of raising an unhandled "error" event.
 *
 * @param {{binPath: string}} pkg - Entry of PACKAGES for win32.
 * @param {string} downloadPath - Path of the downloaded MSI.
 * @returns {Promise<string>} `CACHE_DIR/<binPath>`.
 * @throws {Error} When msiexec cannot be started or exits with a non-zero code.
 */
async function installWindows(pkg, downloadPath) {
  const failureMessage = "MSI \uC124\uCE58 \uC2E4\uD328";
  await new Promise((resolve, reject) => {
    const child = import_child_process4.spawn("msiexec", ["/a", downloadPath, "/qn", `TARGETDIR=${CACHE_DIR}`]);
    child.on("error", (cause) => reject(new Error(failureMessage, { cause })));
    child.on("close", (code) => code === 0 ? resolve() : reject(new Error(failureMessage)));
  });
  await import_promises2.rm(downloadPath, { force: true });
  return import_path5.join(CACHE_DIR, pkg.binPath);
}
11419
/**
 * Exposes the installed binary as `CACHE_DIR/bin/soffice` and puts that
 * directory at the front of this process's PATH.
 *
 * A stale link is replaced so it always points at `actualBin`. When the link
 * cannot be created the real binary path is returned instead of a path that
 * does not exist. PATH is only extended once and an unset PATH no longer
 * produces a literal "undefined" entry.
 *
 * @param {string} actualBin - Absolute path of the real soffice binary.
 * @returns {Promise<string>} The link path, or `actualBin` when linking failed.
 */
async function createSymlink(actualBin) {
  const binDir = import_path5.join(CACHE_DIR, "bin");
  await import_promises2.mkdir(binDir, { recursive: true });
  const linkBin = import_path5.join(binDir, "soffice");

  let usablePath = linkBin;
  try {
    await import_promises2.rm(linkBin, { force: true });
    await import_promises2.symlink(actualBin, linkBin);
  } catch {
    // Linking is a convenience only (it can fail without privileges).
    usablePath = actualBin;
  }

  const currentPath = process.env.PATH ?? "";
  if (!currentPath.split(import_path5.delimiter).includes(binDir)) {
    process.env.PATH = currentPath ? `${binDir}${import_path5.delimiter}${currentPath}` : binDir;
  }
  return usablePath;
}
11430
/**
 * Installs LibreOffice for the current platform using the matching PACKAGES
 * entry.
 *
 * @param {(downloaded: number, total: number) => void} [onProgress]
 * @returns {Promise<string>} Result of installForPlatform.
 * @throws {ConvertError} "UNSUPPORTED_PLATFORM" when PACKAGES has no entry for
 *   process.platform.
 */
async function installLibreOffice(onProgress) {
  const { platform } = process;
  const pkg = PACKAGES[platform];
  if (pkg) {
    return await installForPlatform(pkg, onProgress);
  }
  throw new ConvertError(
    "UNSUPPORTED_PLATFORM",
    `${platform}\uC740 \uC790\uB3D9 \uC124\uCE58\uB97C \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4. \uC218\uB3D9\uC73C\uB85C LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.`
  );
}
11441
/**
 * Finds a usable soffice binary, installing LibreOffice when allowed.
 *
 * Lookup order: system PATH, then the local cache, then (when `autoInstall`
 * is true) a fresh installation. Progress is reported through `emitter`.
 *
 * The download percentage is clamped to 0–100 and guarded against a
 * non-positive total, since the total handed to the callback may be an
 * estimate that the real download exceeds.
 *
 * @param {object} emitter - Event sink exposing validate / install / error helpers.
 * @param {boolean} [autoInstall=true] - Whether installing is permitted.
 * @returns {Promise<string>} Path (or command name) of soffice.
 * @throws {ConvertError} "SOFFICE_NOT_FOUND" when nothing is found and
 *   `autoInstall` is false.
 * @throws {Error} Whatever the installation throws (after an "install_failed" event).
 */
async function resolveSoffice(emitter, autoInstall = true) {
  emitter.validate("soffice_check", "LibreOffice \uAC00\uC6A9\uC131 \uD655\uC778 \uC911...");
  const inPath = await findInPath();
  if (inPath) {
    emitter.validate("soffice_found", "\uC2DC\uC2A4\uD15C PATH\uC5D0\uC11C LibreOffice \uBC1C\uACAC", { sofficePath: inPath });
    return inPath;
  }
  const inCache = await findInCache();
  if (inCache) {
    emitter.validate("soffice_found", "\uCE90\uC2DC\uB41C LibreOffice \uBC1C\uACAC", { sofficePath: inCache });
    return inCache;
  }
  if (!autoInstall) {
    emitter.error(
      "validate",
      "SOFFICE_NOT_FOUND",
      "LibreOffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4",
      "\uC218\uB3D9\uC73C\uB85C \uC124\uCE58\uD558\uAC70\uB098 autoInstallLibreOffice: true \uC635\uC158\uC744 \uC0AC\uC6A9\uD558\uC138\uC694."
    );
    throw new ConvertError("SOFFICE_NOT_FOUND", "LibreOffice\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4");
  }
  emitter.install("install_start", "LibreOffice \uC790\uB3D9 \uC124\uCE58\uB97C \uC2DC\uC791\uD569\uB2C8\uB2E4...");
  try {
    const installed = await installLibreOffice((downloaded, total) => {
      const ratio = total > 0 ? downloaded / total : 0;
      const percent = Math.min(100, Math.max(0, Math.round(ratio * 100)));
      emitter.install("download_progress", `\uB2E4\uC6B4\uB85C\uB4DC \uC911... ${percent}%`, {
        percent,
        downloadedBytes: downloaded,
        totalBytes: total
      });
    });
    emitter.install("install_complete", "\uC124\uCE58 \uC644\uB8CC", { installedPath: installed });
    return installed;
  } catch (err) {
    const errorMsg = err instanceof Error ? err.message : String(err);
    emitter.install("install_failed", "\uC124\uCE58 \uC2E4\uD328", { error: errorMsg });
    throw err;
  }
}
11480
+
11481
+ // src/convert/libreoffice.ts
11482
+ var libreConvert = import_libreoffice_convert.default.convert;
11483
/**
 * Verifies that `soffice --version` can be executed.
 *
 * The probe runs through child_process directly. The previous implementation
 * destructured `runCommand` from the utils export table, which does not
 * export it, so the call always failed and this function always threw.
 *
 * @returns {Promise<void>}
 * @throws {ConvertError} "SOFFICE_NOT_FOUND" when the probe fails or times out.
 */
async function assertSofficeAvailable() {
  const available = await new Promise((resolve) => {
    import_child_process4.execFile(
      "soffice",
      ["--version"],
      { timeout: 15e3, windowsHide: true },
      (err) => resolve(!err)
    );
  });
  if (!available) {
    throw new ConvertError(
      "SOFFICE_NOT_FOUND",
      "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694."
    );
  }
}
11494
/**
 * Converts a document buffer with LibreOffice, failing after a time limit.
 *
 * `timeoutMs` is sanitized: a null, non-finite, zero or negative value falls
 * back to 60 s, and the value is capped at the largest delay setTimeout
 * supports. Without this, such values make the timer fire almost immediately.
 * A synchronous failure of the converter also clears the timer.
 *
 * NOTE(review): the timeout only rejects the promise; nothing visible here
 * stops the underlying conversion — confirm whether that is acceptable.
 *
 * @param {Buffer} buffer - Source document bytes.
 * @param {string} targetExt - Target extension, e.g. ".pdf".
 * @param {number} [timeoutMs=60000] - Time limit in milliseconds.
 * @returns {Promise<Buffer>} Converted document bytes.
 * @throws {ConvertError} "TIMEOUT" or "CONVERT_FAILED".
 */
async function convertBuffer(buffer, targetExt, timeoutMs = 6e4) {
  const MAX_TIMER_DELAY = 2147483647;
  const limitMs = Number.isFinite(timeoutMs) && timeoutMs > 0 ? Math.min(timeoutMs, MAX_TIMER_DELAY) : 6e4;
  return new Promise((resolve, reject) => {
    const timer = setTimeout(() => {
      reject(
        new ConvertError("TIMEOUT", `\uBCC0\uD658 \uD0C0\uC784\uC544\uC6C3 (${limitMs}ms \uCD08\uACFC)`)
      );
    }, limitMs);
    try {
      libreConvert(buffer, targetExt, void 0, (err, done) => {
        clearTimeout(timer);
        if (err || !done) {
          reject(
            new ConvertError(
              "CONVERT_FAILED",
              err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"
            )
          );
          return;
        }
        resolve(done);
      });
    } catch (err) {
      clearTimeout(timer);
      reject(
        new ConvertError(
          "CONVERT_FAILED",
          err instanceof Error ? err.message : "LibreOffice \uBCC0\uD658 \uC2E4\uD328"
        )
      );
    }
  });
}
11516
+
11517
+ // src/convert/events.ts
11518
/**
 * Minimal single-listener event hub used by the conversion pipeline.
 *
 * Every helper builds a plain event object and passes it to `emit`, which
 * forwards it to the registered listener and ignores anything it throws.
 */
var ConvertEventEmitter = class {
  listener = null;
  /** Registers the listener, replacing any previous one. */
  setListener(listener) {
    this.listener = listener;
  }
  /** Delivers an event to the listener, if any; listener errors are ignored. */
  emit(event) {
    if (this.listener == null) return;
    try {
      this.listener(event);
    } catch {
    }
  }
  /** Helper: emits a "detect" event; `meta` fields are merged into the event. */
  detect(stage, message, meta) {
    this.emit(Object.assign({ type: "detect", stage, message }, meta));
  }
  /** Helper: emits a "validate" event; `meta` fields are merged into the event. */
  validate(stage, message, meta) {
    this.emit(Object.assign({ type: "validate", stage, message }, meta));
  }
  /** Helper: emits an "install" event; `meta` fields are merged into the event. */
  install(stage, message, meta) {
    this.emit(Object.assign({ type: "install", stage, message }, meta));
  }
  /** Helper: emits a conversion progress event with the given percent. */
  progress(percent, message) {
    const event = { type: "convert", stage: "convert_progress", message, percent };
    this.emit(event);
  }
  /** Helper: emits the conversion start event (percent 0). */
  convertStart(message) {
    const event = { type: "convert", stage: "convert_start", message, percent: 0 };
    this.emit(event);
  }
  /** Helper: emits the conversion done event (percent 100). */
  convertDone(message) {
    const event = { type: "convert", stage: "convert_done", message, percent: 100 };
    this.emit(event);
  }
  /** Helper: emits the final "complete" event carrying `result`. */
  complete(result) {
    const event = { type: "complete", stage: "success", message: "\uBCC0\uD658 \uC644\uB8CC", result };
    this.emit(event);
  }
  /** Helper: emits an "error" event; it is always flagged `recoverable: true`. */
  error(stage, code, message, suggestion) {
    const event = { type: "error", stage, code, message, recoverable: true, suggestion };
    this.emit(event);
  }
};
11564
+
11565
+ // src/convert/index.ts
11566
// Module-wide lock state: whether a conversion currently holds the lock, and
// the wake-up callbacks of callers waiting for it (first in, first out).
var isConverting = false;
var queue = [];
/**
 * Acquires the conversion lock so only one conversion runs at a time.
 *
 * Resolves immediately when the lock is free; otherwise resolves once every
 * earlier waiter has released it. The lock is handed over synchronously
 * inside the release call, so no other caller can slip in between.
 *
 * @returns {Promise<() => void>} Release function; calling it frees the lock
 *   and wakes the next waiter, if any.
 */
async function acquireConvertLock() {
  const release = () => {
    isConverting = false;
    const wakeNext = queue.shift();
    if (wakeNext) wakeNext();
  };
  if (isConverting) {
    return new Promise((grant) => {
      queue.push(() => {
        isConverting = true;
        grant(release);
      });
    });
  }
  isConverting = true;
  return release;
}
11588
/**
 * Converts an HWP or HWPX document to PDF through LibreOffice.
 *
 * Steps: read the input, enforce the 500 MB limit, detect the format, make
 * sure LibreOffice is available (optionally installing it), then convert
 * while holding the global conversion lock. Failures are reported as a result
 * object rather than thrown.
 *
 * Fixes over the previous version:
 *  - `onEvent` and `onProgress` can be used together; previously the second
 *    setListener call replaced the `onEvent` listener.
 *  - the "soffice_check" validate event is emitted once (by resolveSoffice)
 *    instead of twice.
 *
 * @param {string|Buffer|ArrayBuffer|Uint8Array} input - File path or bytes.
 * @param {object} [options]
 * @param {(event: object) => void} [options.onEvent] - Receives every event.
 * @param {(percent: number, message: string) => void} [options.onProgress] -
 *   Legacy callback, invoked for conversion progress events only.
 * @param {boolean} [options.autoInstallLibreOffice=true]
 * @param {number} [options.timeoutMs] - Conversion time limit.
 * @returns {Promise<object>} `{ success: true, pdf, sourceFormat }` or
 *   `{ success: false, code, error, stage }`.
 */
async function convertToPdf(input, options) {
  const emitter = new ConvertEventEmitter();
  const listeners = [];
  if (options?.onEvent) {
    listeners.push(options.onEvent);
  }
  if (options?.onProgress) {
    const legacyProgress = options.onProgress;
    listeners.push((event) => {
      if (event.type === "convert" && event.stage === "convert_progress") {
        legacyProgress(event.percent, event.message);
      }
    });
  }
  if (listeners.length > 0) {
    emitter.setListener((event) => {
      for (const listener of listeners) {
        // A failing listener must not starve the others; ConvertEventEmitter.emit
        // ignores listener errors as well.
        try {
          listener(event);
        } catch {
        }
      }
    });
  }
  try {
    emitter.detect("reading", "\uC785\uB825 \uD30C\uC77C \uC77D\uB294 \uC911...");
    let buffer;
    try {
      if (typeof input === "string") {
        buffer = await (0, import_promises3.readFile)(input);
      } else if (Buffer.isBuffer(input)) {
        buffer = input;
      } else {
        buffer = Buffer.from(input);
      }
    } catch (err) {
      const readError = `\uC785\uB825 \uC77D\uAE30 \uC2E4\uD328: ${err instanceof Error ? err.message : String(err)}`;
      emitter.error("detect", "PARSE_ERROR", readError);
      return {
        success: false,
        code: "PARSE_ERROR",
        error: readError,
        stage: "detect"
      };
    }
    const MAX_FILE_SIZE = 500 * 1024 * 1024;
    if (buffer.length > MAX_FILE_SIZE) {
      const sizeError = `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.length / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`;
      emitter.error("detect", "FILE_TOO_LARGE", sizeError);
      return {
        success: false,
        code: "FILE_TOO_LARGE",
        error: sizeError,
        stage: "detect"
      };
    }
    const format = detectFormat(toArrayBuffer(buffer));
    emitter.detect("format_detected", `\uD3EC\uB9F7 \uAC10\uC9C0 \uC644\uB8CC: ${format}`, { format });
    if (format !== "hwp" && format !== "hwpx") {
      const formatError = `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD3EC\uB9F7\uC785\uB2C8\uB2E4: ${format}`;
      emitter.error("detect", "UNSUPPORTED_FORMAT", formatError);
      return {
        success: false,
        code: "UNSUPPORTED_FORMAT",
        error: formatError,
        stage: "detect"
      };
    }
    try {
      // resolveSoffice emits the "soffice_check" validate event itself.
      // NOTE(review): the path it resolves is not forwarded to convertBuffer,
      // which has no binary-path parameter — confirm the converter can locate
      // a cached or freshly installed binary on its own.
      await resolveSoffice(emitter, options?.autoInstallLibreOffice ?? true);
    } catch (err) {
      if (err instanceof ConvertError) {
        return {
          success: false,
          code: err.code,
          error: err.message,
          stage: "validate"
        };
      }
      throw err;
    }
    const releaseLock = await acquireConvertLock();
    try {
      emitter.convertStart("\uBCC0\uD658 \uC2DC\uC791...");
      emitter.progress(10, "\uBCC0\uD658 \uC911...");
      const pdf = await convertBuffer(buffer, ".pdf", options?.timeoutMs);
      emitter.progress(100, "\uBCC0\uD658 \uC644\uB8CC");
      emitter.convertDone("\uBCC0\uD658 \uC644\uB8CC");
      const result = {
        success: true,
        pdf: new Uint8Array(pdf),
        sourceFormat: format
      };
      emitter.complete({
        sourceFormat: format,
        pdfSize: pdf.length
      });
      return result;
    } catch (err) {
      if (err instanceof ConvertError) {
        emitter.error("convert", err.code, err.message);
        return {
          success: false,
          code: err.code,
          error: err.message,
          stage: "convert"
        };
      }
      const errorMsg = err instanceof Error ? err.message : "\uBCC0\uD658 \uC2E4\uD328";
      const code = classifyError(err);
      emitter.error("convert", code, errorMsg);
      return {
        success: false,
        code,
        error: errorMsg,
        stage: "convert"
      };
    } finally {
      releaseLock();
    }
  } catch (unexpectedErr) {
    // Last-resort guard: anything not handled above becomes a failed result.
    const errorMsg = unexpectedErr instanceof Error ? unexpectedErr.message : "\uC608\uC0C1\uCE58 \uBABB\uD55C \uC624\uB958";
    emitter.error("convert", "PARSE_ERROR", errorMsg);
    return {
      success: false,
      code: "PARSE_ERROR",
      error: errorMsg,
      stage: "convert"
    };
  }
}
11714
/**
 * Converts a document to PDF and accepts the result only for HWP 5.x input.
 *
 * The conversion runs first; a successful result whose detected source format
 * is not "hwp" is replaced by an UNSUPPORTED_FORMAT failure. Failed results
 * are passed through unchanged.
 *
 * @param {string|Buffer|ArrayBuffer|Uint8Array} input - File path or bytes.
 * @param {object} [options] - Forwarded to convertToPdf.
 * @returns {Promise<object>} Result object of convertToPdf or the failure above.
 */
async function convertHwpToPdf(input, options) {
  const outcome = await convertToPdf(input, options);
  const isWrongFormat = outcome.success && outcome.sourceFormat !== "hwp";
  if (!isWrongFormat) {
    return outcome;
  }
  return {
    success: false,
    code: "UNSUPPORTED_FORMAT",
    error: `HWP 5.x \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${outcome.sourceFormat}`,
    stage: "detect"
  };
}
11726
/**
 * Converts a document to PDF and accepts the result only for HWPX input.
 *
 * The conversion runs first; a successful result whose detected source format
 * is not "hwpx" is replaced by an UNSUPPORTED_FORMAT failure. Failed results
 * are passed through unchanged.
 *
 * @param {string|Buffer|ArrayBuffer|Uint8Array} input - File path or bytes.
 * @param {object} [options] - Forwarded to convertToPdf.
 * @returns {Promise<object>} Result object of convertToPdf or the failure above.
 */
async function convertHwpxToPdf(input, options) {
  const outcome = await convertToPdf(input, options);
  const isWrongFormat = outcome.success && outcome.sourceFormat !== "hwpx";
  if (!isWrongFormat) {
    return outcome;
  }
  return {
    success: false,
    code: "UNSUPPORTED_FORMAT",
    error: `HWPX \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${outcome.sourceFormat}`,
    stage: "detect"
  };
}
11738
+
11739
+ // src/index.ts
11740
+ init_utils();
11741
+
11233
11742
  // src/ocr/api-key-rotation.ts
11234
11743
  var AllKeysCoolingDownError = class extends Error {
11235
11744
  waitMs;
@@ -11324,11 +11833,10 @@ var ApiKeyRotationPool = class _ApiKeyRotationPool {
11324
11833
  };
11325
11834
 
11326
11835
  // src/pipeline/unified-ocr.ts
11327
- var import_promises2 = require("fs/promises");
11328
- var import_path5 = require("path");
11329
- var import_child_process4 = require("child_process");
11836
+ var import_promises4 = require("fs/promises");
11837
+ var import_path6 = require("path");
11838
+ var import_child_process5 = require("child_process");
11330
11839
  var import_node_perf_hooks = require("perf_hooks");
11331
- var import_libreoffice_convert = __toESM(require("libreoffice-convert"), 1);
11332
11840
  init_logger();
11333
11841
 
11334
11842
  // src/pipeline/bounded-queue.ts
@@ -11390,7 +11898,6 @@ var BoundedQueue = class {
11390
11898
  };
11391
11899
 
11392
11900
  // src/pipeline/unified-ocr.ts
11393
- var libreConvert = import_libreoffice_convert.default.convert;
11394
11901
  var UnifiedOcrError = class extends Error {
11395
11902
  code;
11396
11903
  stage;
@@ -11462,15 +11969,15 @@ function elapsedMs(startAt) {
11462
11969
  return Math.round(import_node_perf_hooks.performance.now() - startAt);
11463
11970
  }
11464
11971
  async function runUnifiedOcrPipeline(inputPath, options = {}) {
11465
- const absInput = (0, import_path5.resolve)(inputPath);
11466
- const stem = (0, import_path5.basename)(absInput, (0, import_path5.extname)(absInput));
11467
- const workspaceDir = (0, import_path5.resolve)(options.workspaceDir ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}_ocr_workspace`));
11468
- const imagesDir = (0, import_path5.join)(workspaceDir, "images");
11469
- const rawDir = (0, import_path5.join)(workspaceDir, "ocr", "raw");
11470
- const diffDir = (0, import_path5.join)(workspaceDir, "ocr", "diff");
11471
- const outputPath = (0, import_path5.resolve)(options.outputPath ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}.md`));
11472
- const reportPath = (0, import_path5.join)(workspaceDir, "run-report.json");
11473
- const modelCachePath = (0, import_path5.join)((0, import_path5.dirname)(absInput), ".kordoc-model-cache.json");
11972
+ const absInput = (0, import_path6.resolve)(inputPath);
11973
+ const stem = (0, import_path6.basename)(absInput, (0, import_path6.extname)(absInput));
11974
+ const workspaceDir = (0, import_path6.resolve)(options.workspaceDir ?? (0, import_path6.join)((0, import_path6.dirname)(absInput), `${stem}_ocr_workspace`));
11975
+ const imagesDir = (0, import_path6.join)(workspaceDir, "images");
11976
+ const rawDir = (0, import_path6.join)(workspaceDir, "ocr", "raw");
11977
+ const diffDir = (0, import_path6.join)(workspaceDir, "ocr", "diff");
11978
+ const outputPath = (0, import_path6.resolve)(options.outputPath ?? (0, import_path6.join)((0, import_path6.dirname)(absInput), `${stem}.md`));
11979
+ const reportPath = (0, import_path6.join)(workspaceDir, "run-report.json");
11980
+ const modelCachePath = (0, import_path6.join)((0, import_path6.dirname)(absInput), ".kordoc-model-cache.json");
11474
11981
  const baseUrl = options.baseUrl ?? "https://integrate.api.nvidia.com/v1/chat/completions";
11475
11982
  const timeoutMs = options.timeoutMs ?? 6e4;
11476
11983
  const maxRetriesPerPage = options.maxRetriesPerPage ?? 5;
@@ -11484,9 +11991,9 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11484
11991
  const keyPool = ApiKeyRotationPool.fromEnv();
11485
11992
  const runId = options.runId ?? generateRunId("ocr");
11486
11993
  const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
11487
- await (0, import_promises2.mkdir)(imagesDir, { recursive: true });
11488
- await (0, import_promises2.mkdir)(rawDir, { recursive: true });
11489
- await (0, import_promises2.mkdir)(diffDir, { recursive: true });
11994
+ await (0, import_promises4.mkdir)(imagesDir, { recursive: true });
11995
+ await (0, import_promises4.mkdir)(rawDir, { recursive: true });
11996
+ await (0, import_promises4.mkdir)(diffDir, { recursive: true });
11490
11997
  const timingsMs = {};
11491
11998
  const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
11492
11999
  const markStageProgress = (stage, stagePercent, current, total, message, model) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message, model });
@@ -11502,12 +12009,12 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11502
12009
  currentStage = "convert";
11503
12010
  markStageStart("convert", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC911");
11504
12011
  logStage("info", "convert", "start", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC2DC\uC791", { input: absInput });
11505
- if ((0, import_path5.extname)(absInput).toLowerCase() !== ".pdf") {
12012
+ if ((0, import_path6.extname)(absInput).toLowerCase() !== ".pdf") {
11506
12013
  await assertSofficeAvailable();
11507
- workingPdfPath = (0, import_path5.join)(workspaceDir, `${stem}.pdf`);
11508
- const inputBuffer = await (0, import_promises2.readFile)(absInput);
11509
- const out = await convertWithLibreOffice(inputBuffer, ".pdf");
11510
- await (0, import_promises2.writeFile)(workingPdfPath, out);
12014
+ workingPdfPath = (0, import_path6.join)(workspaceDir, `${stem}.pdf`);
12015
+ const inputBuffer = await (0, import_promises4.readFile)(absInput);
12016
+ const out = await convertBuffer(inputBuffer, ".pdf");
12017
+ await (0, import_promises4.writeFile)(workingPdfPath, out);
11511
12018
  }
11512
12019
  timingsMs.convert = elapsedMs(convertStart);
11513
12020
  markStageDone("convert", "PDF \uBCC0\uD658 \uC644\uB8CC");
@@ -11518,10 +12025,10 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11518
12025
  if (totalPages === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC218\uB97C \uD655\uC778\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.");
11519
12026
  markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
11520
12027
  logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi, totalPages });
11521
- await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, (0, import_path5.join)(imagesDir, "page")]);
11522
- const firstFiles = (await (0, import_promises2.readdir)(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
12028
+ await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, (0, import_path6.join)(imagesDir, "page")]);
12029
+ const firstFiles = (await (0, import_promises4.readdir)(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
11523
12030
  if (firstFiles.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uCCAB \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328");
11524
- const probeImage = (0, import_path5.join)(imagesDir, firstFiles[0]);
12031
+ const probeImage = (0, import_path6.join)(imagesDir, firstFiles[0]);
11525
12032
  markStageProgress("render", Math.round(1 / totalPages * 100), 1, totalPages, `\uD398\uC774\uC9C0 1/${totalPages} \uB80C\uB354\uB9C1`);
11526
12033
  const probeStart = import_node_perf_hooks.performance.now();
11527
12034
  currentStage = "probe";
@@ -11557,7 +12064,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11557
12064
  const keyCount = keyPool.snapshot().length;
11558
12065
  const workerCount = Math.max(1, keyCount * concurrencyPerKey);
11559
12066
  const queueCapacity = workerCount * 2;
11560
- const queue = new BoundedQueue(queueCapacity);
12067
+ const queue2 = new BoundedQueue(queueCapacity);
11561
12068
  const ocrStart = import_node_perf_hooks.performance.now();
11562
12069
  currentStage = "ocr";
11563
12070
  markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (\uC6CC\uCEE4 ${workerCount}\uAC1C)`);
@@ -11565,17 +12072,17 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11565
12072
  let renderDone = 1;
11566
12073
  const renderProducer = (async () => {
11567
12074
  try {
11568
- await queue.enqueue({ pageNumber: 1, imagePath: probeImage });
12075
+ await queue2.enqueue({ pageNumber: 1, imagePath: probeImage });
11569
12076
  if (totalPages > 1) {
11570
- for await (const item of renderPdfToPngStream(workingPdfPath, (0, import_path5.join)(imagesDir, "page"), dpi, totalPages, 2)) {
11571
- await queue.enqueue(item);
12077
+ for await (const item of renderPdfToPngStream(workingPdfPath, (0, import_path6.join)(imagesDir, "page"), dpi, totalPages, 2)) {
12078
+ await queue2.enqueue(item);
11572
12079
  renderDone++;
11573
12080
  markStageProgress("render", Math.round(renderDone / totalPages * 100), renderDone, totalPages, `\uD398\uC774\uC9C0 ${renderDone}/${totalPages} \uB80C\uB354\uB9C1`);
11574
12081
  logStage("debug", "render", "progress", "\uD398\uC774\uC9C0 \uB80C\uB354 \uC644\uB8CC", { page: item.pageNumber });
11575
12082
  }
11576
12083
  }
11577
12084
  } finally {
11578
- queue.close();
12085
+ queue2.close();
11579
12086
  timingsMs.render = elapsedMs(renderStart);
11580
12087
  markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
11581
12088
  logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: renderDone, elapsedMs: timingsMs.render });
@@ -11584,7 +12091,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11584
12091
  const [, pageResultsMap] = await Promise.all([
11585
12092
  renderProducer,
11586
12093
  ocrWorkerPool({
11587
- queue,
12094
+ queue: queue2,
11588
12095
  workerCount,
11589
12096
  totalPages,
11590
12097
  ocrInput: {
@@ -11617,8 +12124,8 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11617
12124
  const sortedEntries = Array.from(pageResultsMap.entries()).sort((a, b) => a[0] - b[0]);
11618
12125
  const rawPagePaths = [];
11619
12126
  for (const [pageNum, markdown] of sortedEntries) {
11620
- const pagePath = (0, import_path5.join)(rawDir, `page_${String(pageNum).padStart(4, "0")}.md`);
11621
- await (0, import_promises2.writeFile)(pagePath, markdown, "utf-8");
12127
+ const pagePath = (0, import_path6.join)(rawDir, `page_${String(pageNum).padStart(4, "0")}.md`);
12128
+ await (0, import_promises4.writeFile)(pagePath, markdown, "utf-8");
11622
12129
  rawPagePaths.push(pagePath);
11623
12130
  }
11624
12131
  const mergeStart = import_node_perf_hooks.performance.now();
@@ -11626,7 +12133,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11626
12133
  markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
11627
12134
  logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
11628
12135
  const merged = await mergeMarkdownPages(rawPagePaths);
11629
- await (0, import_promises2.writeFile)(outputPath, merged, "utf-8");
12136
+ await (0, import_promises4.writeFile)(outputPath, merged, "utf-8");
11630
12137
  timingsMs.merge = elapsedMs(mergeStart);
11631
12138
  markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
11632
12139
  logStage("info", "merge", "done", "\uCD5C\uC885 \uBCD1\uD569 \uC644\uB8CC", { outputPath, elapsedMs: timingsMs.merge });
@@ -11642,7 +12149,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11642
12149
  timingsMs,
11643
12150
  modelCachePath
11644
12151
  };
11645
- await (0, import_promises2.writeFile)(reportPath, JSON.stringify(report, null, 2), "utf-8");
12152
+ await (0, import_promises4.writeFile)(reportPath, JSON.stringify(report, null, 2), "utf-8");
11646
12153
  logStage("info", "finalize", "done", "run-report \uC800\uC7A5 \uC644\uB8CC", { reportPath });
11647
12154
  return { outputPath, reportPath, selectedModel };
11648
12155
  } catch (err) {
@@ -11704,17 +12211,6 @@ function emitProgress(cb, stage, stagePercent, weights, extra) {
11704
12211
  model: extra.model
11705
12212
  });
11706
12213
  }
11707
- async function convertWithLibreOffice(buffer, ext) {
11708
- return await new Promise((resolvePromise, reject) => {
11709
- libreConvert(buffer, ext, void 0, (err, done) => {
11710
- if (err || !done) {
11711
- reject(new UnifiedOcrError("CONVERT_FAILED", "convert", err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"));
11712
- return;
11713
- }
11714
- resolvePromise(done);
11715
- });
11716
- });
11717
- }
11718
12214
  async function getPdfPageCount(pdfPath) {
11719
12215
  const stdout = await runCommandWithStdout("pdfinfo", [pdfPath]);
11720
12216
  const m = stdout.match(/^\s*Pages:\s*(\d+)\s*$/mi);
@@ -11728,7 +12224,7 @@ async function getPdfPageCount(pdfPath) {
11728
12224
  return n;
11729
12225
  }
11730
12226
  async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, startPage = 1) {
11731
- const imagesDir = (0, import_path5.dirname)(prefixPath);
12227
+ const imagesDir = (0, import_path6.dirname)(prefixPath);
11732
12228
  for (let page = startPage; page <= totalPages; page++) {
11733
12229
  try {
11734
12230
  await runCommand("pdftoppm", [
@@ -11742,9 +12238,9 @@ async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, start
11742
12238
  pdfPath,
11743
12239
  prefixPath
11744
12240
  ]);
11745
- const files = await (0, import_promises2.readdir)(imagesDir);
12241
+ const files = await (0, import_promises4.readdir)(imagesDir);
11746
12242
  const pageFiles = files.filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
11747
- const imagePath = (0, import_path5.join)(imagesDir, pageFiles[pageFiles.length - 1]);
12243
+ const imagePath = (0, import_path6.join)(imagesDir, pageFiles[pageFiles.length - 1]);
11748
12244
  yield { pageNumber: page, imagePath };
11749
12245
  } catch (err) {
11750
12246
  yield {
@@ -11757,7 +12253,7 @@ async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, start
11757
12253
  }
11758
12254
  async function runCommand(cmd, args) {
11759
12255
  await new Promise((resolvePromise, reject) => {
11760
- const child = (0, import_child_process4.spawn)(cmd, args, { stdio: "pipe" });
12256
+ const child = (0, import_child_process5.spawn)(cmd, args, { stdio: "pipe" });
11761
12257
  let stderr = "";
11762
12258
  child.stderr.on("data", (d) => {
11763
12259
  stderr += String(d);
@@ -11771,7 +12267,7 @@ async function runCommand(cmd, args) {
11771
12267
  }
11772
12268
  async function runCommandWithStdout(cmd, args) {
11773
12269
  return await new Promise((resolvePromise, reject) => {
11774
- const child = (0, import_child_process4.spawn)(cmd, args, { stdio: "pipe" });
12270
+ const child = (0, import_child_process5.spawn)(cmd, args, { stdio: "pipe" });
11775
12271
  let stdout = "";
11776
12272
  let stderr = "";
11777
12273
  child.stdout.on("data", (d) => {
@@ -11787,13 +12283,6 @@ async function runCommandWithStdout(cmd, args) {
11787
12283
  });
11788
12284
  });
11789
12285
  }
11790
- async function assertSofficeAvailable() {
11791
- try {
11792
- await runCommand("soffice", ["--version"]);
11793
- } catch {
11794
- throw new UnifiedOcrError("SOFFICE_NOT_FOUND", "convert", "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.");
11795
- }
11796
- }
11797
12286
  function naturalPageSort(a, b) {
11798
12287
  const na = Number((a.match(/\d+/g) || ["0"]).at(-1) || 0);
11799
12288
  const nb = Number((b.match(/\d+/g) || ["0"]).at(-1) || 0);
@@ -11867,7 +12356,7 @@ function startParallelProbeRuns(input) {
11867
12356
  }
11868
12357
  async function loadModelCache(path) {
11869
12358
  try {
11870
- const raw = await (0, import_promises2.readFile)(path, "utf-8");
12359
+ const raw = await (0, import_promises4.readFile)(path, "utf-8");
11871
12360
  return JSON.parse(raw);
11872
12361
  } catch {
11873
12362
  return null;
@@ -11898,15 +12387,15 @@ async function updateModelCache(path, probes) {
11898
12387
  }
11899
12388
  }
11900
12389
  current.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
11901
- await (0, import_promises2.writeFile)(path, JSON.stringify(current, null, 2), "utf-8");
12390
+ await (0, import_promises4.writeFile)(path, JSON.stringify(current, null, 2), "utf-8");
11902
12391
  }
11903
12392
  async function ocrWorkerPool(input) {
11904
- const { queue, workerCount, ocrInput, onPageDone } = input;
12393
+ const { queue: queue2, workerCount, ocrInput, onPageDone } = input;
11905
12394
  const results = /* @__PURE__ */ new Map();
11906
12395
  let completedCount = 0;
11907
12396
  async function worker() {
11908
12397
  while (true) {
11909
- const item = await queue.dequeue();
12398
+ const item = await queue2.dequeue();
11910
12399
  if (item === QUEUE_DONE) break;
11911
12400
  const { pageNumber, imagePath, error } = item;
11912
12401
  if (imagePath === null) {
@@ -11958,7 +12447,7 @@ async function ocrImageWithFallback(input) {
11958
12447
  async function mergeMarkdownPages(paths) {
11959
12448
  const out = [];
11960
12449
  for (let i = 0; i < paths.length; i++) {
11961
- const txt = (await (0, import_promises2.readFile)(paths[i], "utf-8")).trim();
12450
+ const txt = (await (0, import_promises4.readFile)(paths[i], "utf-8")).trim();
11962
12451
  if (!txt) continue;
11963
12452
  out.push(txt);
11964
12453
  }
@@ -12074,7 +12563,7 @@ async function ocrImageViaNim(input) {
12074
12563
  throw new UnifiedOcrError("OCR_FAILED", "ocr", `OCR \uC7AC\uC2DC\uB3C4 \uCD08\uACFC: ${lastErr}`);
12075
12564
  }
12076
12565
  async function encodeBase64(path) {
12077
- const b = await (0, import_promises2.readFile)(path);
12566
+ const b = await (0, import_promises4.readFile)(path);
12078
12567
  return b.toString("base64");
12079
12568
  }
12080
12569
  function stripCodeFence3(text) {
@@ -12086,7 +12575,7 @@ async function delay(ms) {
12086
12575
  await new Promise((resolvePromise) => setTimeout(resolvePromise, ms));
12087
12576
  }
12088
12577
  function ensureSupportedInput(path) {
12089
- const ext = (0, import_path5.extname)(path).toLowerCase();
12578
+ const ext = (0, import_path6.extname)(path).toLowerCase();
12090
12579
  const allowed = /* @__PURE__ */ new Set([".pdf", ".hwp", ".hwpx", ".docx", ".xlsx"]);
12091
12580
  if (!allowed.has(ext)) {
12092
12581
  throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uC785\uB825 \uD3EC\uB9F7: ${ext}`);
@@ -12113,7 +12602,7 @@ async function parse2(input, options) {
12113
12602
  let buffer;
12114
12603
  if (typeof input === "string") {
12115
12604
  try {
12116
- const buf = await (0, import_promises3.readFile)(input);
12605
+ const buf = await (0, import_promises5.readFile)(input);
12117
12606
  buffer = toArrayBuffer(buf);
12118
12607
  } catch (err) {
12119
12608
  const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
@@ -12273,6 +12762,9 @@ async function parseDocx(buffer, options, zip) {
12273
12762
  VERSION,
12274
12763
  blocksToMarkdown,
12275
12764
  compare,
12765
+ convertHwpToPdf,
12766
+ convertHwpxToPdf,
12767
+ convertToPdf,
12276
12768
  detectFormat,
12277
12769
  detectZipFormat,
12278
12770
  diffBlocks,