@clazic/kordoc 2.6.0 → 2.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -33,118 +33,6 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
33
33
  ));
34
34
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
35
35
 
36
- // src/utils.ts
37
- var utils_exports = {};
38
- __export(utils_exports, {
39
- KordocError: () => KordocError,
40
- VERSION: () => VERSION,
41
- classifyError: () => classifyError,
42
- isPathTraversal: () => isPathTraversal,
43
- normalizeKordocError: () => normalizeKordocError,
44
- precheckZipSize: () => precheckZipSize,
45
- sanitizeError: () => sanitizeError,
46
- sanitizeHref: () => sanitizeHref,
47
- toArrayBuffer: () => toArrayBuffer
48
- });
49
- function toArrayBuffer(buf) {
50
- if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
51
- return buf.buffer;
52
- }
53
- return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
54
- }
55
- function sanitizeError(err) {
56
- if (err instanceof KordocError) return err.message;
57
- return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
58
- }
59
- function isPathTraversal(name) {
60
- if (name.includes("\0")) return true;
61
- const normalized = name.replace(/\\/g, "/");
62
- return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
63
- }
64
- function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
65
- try {
66
- const data = new DataView(buffer);
67
- const len = buffer.byteLength;
68
- let eocdOffset = -1;
69
- for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
70
- if (data.getUint32(i, true) === 101010256) {
71
- eocdOffset = i;
72
- break;
73
- }
74
- }
75
- if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
76
- const entryCount = data.getUint16(eocdOffset + 10, true);
77
- if (entryCount > maxEntries) {
78
- throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
79
- }
80
- const cdSize = data.getUint32(eocdOffset + 12, true);
81
- const cdOffset = data.getUint32(eocdOffset + 16, true);
82
- if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
83
- let totalUncompressed = 0;
84
- let pos = cdOffset;
85
- for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
86
- if (data.getUint32(pos, true) !== 33639248) break;
87
- totalUncompressed += data.getUint32(pos + 24, true);
88
- const nameLen = data.getUint16(pos + 28, true);
89
- const extraLen = data.getUint16(pos + 30, true);
90
- const commentLen = data.getUint16(pos + 32, true);
91
- pos += 46 + nameLen + extraLen + commentLen;
92
- }
93
- if (totalUncompressed > maxUncompressedSize) {
94
- throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
95
- }
96
- return { totalUncompressed, entryCount };
97
- } catch (err) {
98
- if (err instanceof KordocError) throw err;
99
- return { totalUncompressed: 0, entryCount: 0 };
100
- }
101
- }
102
- function sanitizeHref(href) {
103
- const trimmed = href.trim();
104
- if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
105
- return trimmed;
106
- }
107
- function classifyError(err) {
108
- if (!(err instanceof Error)) return "PARSE_ERROR";
109
- const msg = err.message;
110
- if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
111
- if (msg.includes("DRM")) return "DRM_PROTECTED";
112
- if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
113
- if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
114
- if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
115
- if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
116
- if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
117
- return "PARSE_ERROR";
118
- }
119
- function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
120
- if (err instanceof KordocError) {
121
- if (!err.stage) err.stage = stage;
122
- if (!err.code) err.code = fallbackCode;
123
- return err;
124
- }
125
- const message = err instanceof Error ? err.message : fallbackMessage;
126
- const code = err instanceof Error ? classifyError(err) : fallbackCode;
127
- return new KordocError(message || fallbackMessage, { code, stage });
128
- }
129
- var VERSION, KordocError, SAFE_HREF_RE;
130
- var init_utils = __esm({
131
- "src/utils.ts"() {
132
- "use strict";
133
- VERSION = true ? "2.5.2" : "0.0.0-dev";
134
- KordocError = class extends Error {
135
- code;
136
- stage;
137
- constructor(message, opts = {}) {
138
- super(message);
139
- this.name = "KordocError";
140
- this.code = opts.code;
141
- this.stage = opts.stage;
142
- }
143
- };
144
- SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
145
- }
146
- });
147
-
148
36
  // src/page-range.ts
149
37
  var page_range_exports = {};
150
38
  __export(page_range_exports, {
@@ -3196,7 +3084,7 @@ __export(index_exports, {
3196
3084
  runUnifiedOcrPipeline: () => runUnifiedOcrPipeline
3197
3085
  });
3198
3086
  module.exports = __toCommonJS(index_exports);
3199
- var import_promises4 = require("fs/promises");
3087
+ var import_promises5 = require("fs/promises");
3200
3088
 
3201
3089
  // src/detect.ts
3202
3090
  var import_jszip = __toESM(require("jszip"), 1);
@@ -3248,8 +3136,97 @@ async function detectZipFormat(buffer) {
3248
3136
  var import_jszip2 = __toESM(require("jszip"), 1);
3249
3137
  var import_xmldom = require("@xmldom/xmldom");
3250
3138
 
3139
+ // src/utils.ts
3140
+ var VERSION = true ? "2.6.1" : "0.0.0-dev";
3141
+ function toArrayBuffer(buf) {
3142
+ if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
3143
+ return buf.buffer;
3144
+ }
3145
+ return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
3146
+ }
3147
+ var KordocError = class extends Error {
3148
+ code;
3149
+ stage;
3150
+ constructor(message, opts = {}) {
3151
+ super(message);
3152
+ this.name = "KordocError";
3153
+ this.code = opts.code;
3154
+ this.stage = opts.stage;
3155
+ }
3156
+ };
3157
+ function isPathTraversal(name) {
3158
+ if (name.includes("\0")) return true;
3159
+ const normalized = name.replace(/\\/g, "/");
3160
+ return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
3161
+ }
3162
+ function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
3163
+ try {
3164
+ const data = new DataView(buffer);
3165
+ const len = buffer.byteLength;
3166
+ let eocdOffset = -1;
3167
+ for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
3168
+ if (data.getUint32(i, true) === 101010256) {
3169
+ eocdOffset = i;
3170
+ break;
3171
+ }
3172
+ }
3173
+ if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
3174
+ const entryCount = data.getUint16(eocdOffset + 10, true);
3175
+ if (entryCount > maxEntries) {
3176
+ throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
3177
+ }
3178
+ const cdSize = data.getUint32(eocdOffset + 12, true);
3179
+ const cdOffset = data.getUint32(eocdOffset + 16, true);
3180
+ if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
3181
+ let totalUncompressed = 0;
3182
+ let pos = cdOffset;
3183
+ for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
3184
+ if (data.getUint32(pos, true) !== 33639248) break;
3185
+ totalUncompressed += data.getUint32(pos + 24, true);
3186
+ const nameLen = data.getUint16(pos + 28, true);
3187
+ const extraLen = data.getUint16(pos + 30, true);
3188
+ const commentLen = data.getUint16(pos + 32, true);
3189
+ pos += 46 + nameLen + extraLen + commentLen;
3190
+ }
3191
+ if (totalUncompressed > maxUncompressedSize) {
3192
+ throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
3193
+ }
3194
+ return { totalUncompressed, entryCount };
3195
+ } catch (err) {
3196
+ if (err instanceof KordocError) throw err;
3197
+ return { totalUncompressed: 0, entryCount: 0 };
3198
+ }
3199
+ }
3200
+ var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
3201
+ function sanitizeHref(href) {
3202
+ const trimmed = href.trim();
3203
+ if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
3204
+ return trimmed;
3205
+ }
3206
+ function classifyError(err) {
3207
+ if (!(err instanceof Error)) return "PARSE_ERROR";
3208
+ const msg = err.message;
3209
+ if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
3210
+ if (msg.includes("DRM")) return "DRM_PROTECTED";
3211
+ if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
3212
+ if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
3213
+ if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
3214
+ if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
3215
+ if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
3216
+ return "PARSE_ERROR";
3217
+ }
3218
+ function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
3219
+ if (err instanceof KordocError) {
3220
+ if (!err.stage) err.stage = stage;
3221
+ if (!err.code) err.code = fallbackCode;
3222
+ return err;
3223
+ }
3224
+ const message = err instanceof Error ? err.message : fallbackMessage;
3225
+ const code = err instanceof Error ? classifyError(err) : fallbackCode;
3226
+ return new KordocError(message || fallbackMessage, { code, stage });
3227
+ }
3228
+
3251
3229
  // src/table/builder.ts
3252
- init_utils();
3253
3230
  var MAX_COLS = 200;
3254
3231
  var MAX_ROWS = 1e4;
3255
3232
  function buildTable(rows) {
@@ -3509,8 +3486,6 @@ var HEADING_RATIO_H2 = 1.3;
3509
3486
  var HEADING_RATIO_H3 = 1.15;
3510
3487
 
3511
3488
  // src/hwpx/parser.ts
3512
- init_utils();
3513
- init_utils();
3514
3489
  init_page_range();
3515
3490
  init_logger();
3516
3491
  var MAX_DECOMPRESS_SIZE = 500 * 1024 * 1024;
@@ -4353,7 +4328,6 @@ function extractTextFromNode(node) {
4353
4328
 
4354
4329
  // src/hwp5/record.ts
4355
4330
  var import_zlib = require("zlib");
4356
- init_utils();
4357
4331
  var TAG_PARA_HEADER = 66;
4358
4332
  var TAG_PARA_TEXT = 67;
4359
4333
  var TAG_CHAR_SHAPE = 68;
@@ -5403,7 +5377,6 @@ function parseLenientCfb(data) {
5403
5377
  }
5404
5378
 
5405
5379
  // src/hwp5/parser.ts
5406
- init_utils();
5407
5380
  init_page_range();
5408
5381
  init_logger();
5409
5382
  var CFB = __toESM(require_cfb(), 1);
@@ -6059,7 +6032,6 @@ function arrangeCells(rows, cols, cells) {
6059
6032
  }
6060
6033
 
6061
6034
  // src/pdf/parser.ts
6062
- init_utils();
6063
6035
  init_page_range();
6064
6036
  var import_module = require("module");
6065
6037
  var import_path4 = require("path");
@@ -7953,7 +7925,6 @@ function mergeKoreanLines(text) {
7953
7925
  // src/xlsx/parser.ts
7954
7926
  var import_jszip3 = __toESM(require("jszip"), 1);
7955
7927
  var import_xmldom2 = require("@xmldom/xmldom");
7956
- init_utils();
7957
7928
  init_logger();
7958
7929
  var MAX_SHEETS = 100;
7959
7930
  var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
@@ -8282,7 +8253,6 @@ async function parseXlsxDocument(buffer, options, existingZip) {
8282
8253
  // src/docx/parser.ts
8283
8254
  var import_jszip4 = __toESM(require("jszip"), 1);
8284
8255
  var import_xmldom3 = require("@xmldom/xmldom");
8285
- init_utils();
8286
8256
  init_logger();
8287
8257
  var MAX_DECOMPRESS_SIZE4 = 500 * 1024 * 1024;
8288
8258
  function getChildElements(parent, localName) {
@@ -8762,7 +8732,6 @@ async function parseDocxDocument(buffer, options, existingZip) {
8762
8732
  }
8763
8733
 
8764
8734
  // src/index.ts
8765
- init_utils();
8766
8735
  init_cli_provider();
8767
8736
  init_markdown_to_blocks();
8768
8737
  init_logger();
@@ -11265,8 +11234,7 @@ async function markdownToXlsx(markdown, options) {
11265
11234
  }
11266
11235
 
11267
11236
  // src/convert/index.ts
11268
- var import_promises2 = require("fs/promises");
11269
- init_utils();
11237
+ var import_promises3 = require("fs/promises");
11270
11238
 
11271
11239
  // src/convert/libreoffice.ts
11272
11240
  var import_libreoffice_convert = __toESM(require("libreoffice-convert"), 1);
@@ -11280,19 +11248,250 @@ var ConvertError = class extends Error {
11280
11248
  }
11281
11249
  };
11282
11250
 
11283
- // src/convert/libreoffice.ts
11284
- var libreConvert = import_libreoffice_convert.default.convert;
11285
- async function assertSofficeAvailable() {
11286
- const { runCommand: runCommand2 } = await Promise.resolve().then(() => (init_utils(), utils_exports));
11251
+ // src/convert/installer.ts
11252
+ var import_os3 = require("os");
11253
+ var import_path5 = require("path");
11254
+ var import_promises2 = require("fs/promises");
11255
+ var import_fs4 = require("fs");
11256
+ var import_child_process4 = require("child_process");
11257
+ var installInFlight = null;
11258
+ var CACHE_DIR = (0, import_path5.join)((0, import_os3.homedir)(), ".cache", "kordoc", "libreoffice");
11259
+ var VERSION_FILE = (0, import_path5.join)(CACHE_DIR, "version");
11260
+ var PACKAGES = {
11261
+ darwin: {
11262
+ url: "https://download.documentfoundation.org/libreoffice/stable/24.8.4/mac/x86_64/LibreOffice_24.8.4_MacOS_x86-64.dmg",
11263
+ binPath: "LibreOffice.app/Contents/MacOS/soffice",
11264
+ sizeMb: 300
11265
+ },
11266
+ linux: {
11267
+ url: "https://download.documentfoundation.org/libreoffice/stable/24.8.4/deb/x86_64/LibreOffice_24.8.4_Linux_x86-64_deb.tar.gz",
11268
+ binPath: "opt/libreoffice24.8/program/soffice",
11269
+ sizeMb: 200
11270
+ },
11271
+ win32: {
11272
+ url: "https://download.documentfoundation.org/libreoffice/stable/24.8.4/win/x86_64/LibreOffice_24.8.4_Win_x86-64.msi",
11273
+ binPath: "LibreOffice/program/soffice.exe",
11274
+ sizeMb: 350
11275
+ }
11276
+ };
11277
+ async function findInPath() {
11278
+ return new Promise((resolve4) => {
11279
+ const child = (0, import_child_process4.spawn)("soffice", ["--version"], { stdio: "ignore" });
11280
+ child.on("close", (code) => resolve4(code === 0 ? "soffice" : null));
11281
+ child.on("error", () => resolve4(null));
11282
+ });
11283
+ }
11284
+ async function findInCache() {
11285
+ const cachedBin = (0, import_path5.join)(CACHE_DIR, "bin", "soffice");
11287
11286
  try {
11288
- await runCommand2("soffice", ["--version"]);
11287
+ await (0, import_promises2.access)(cachedBin);
11288
+ return cachedBin;
11289
11289
  } catch {
11290
+ return null;
11291
+ }
11292
+ }
11293
+ async function findInDefaultPaths() {
11294
+ const platform = process.platform;
11295
+ const paths = [];
11296
+ if (platform === "darwin") {
11297
+ paths.push(
11298
+ "/Applications/LibreOffice.app/Contents/MacOS/soffice",
11299
+ "/opt/homebrew/bin/soffice",
11300
+ "/usr/local/bin/soffice"
11301
+ );
11302
+ } else if (platform === "linux") {
11303
+ paths.push(
11304
+ "/usr/bin/soffice",
11305
+ "/usr/lib/libreoffice/program/soffice"
11306
+ );
11307
+ } else if (platform === "win32") {
11308
+ const pf = process.env["ProgramFiles"] ?? "C:\\Program Files";
11309
+ const pf86 = process.env["ProgramFiles(x86)"] ?? "C:\\Program Files (x86)";
11310
+ paths.push(
11311
+ (0, import_path5.join)(pf, "LibreOffice", "program", "soffice.exe"),
11312
+ (0, import_path5.join)(pf86, "LibreOffice", "program", "soffice.exe")
11313
+ );
11314
+ }
11315
+ for (const p of paths) {
11316
+ try {
11317
+ await (0, import_promises2.access)(p);
11318
+ return p;
11319
+ } catch {
11320
+ continue;
11321
+ }
11322
+ }
11323
+ return null;
11324
+ }
11325
+ async function downloadWithProgress(url, dest, totalBytes, onProgress) {
11326
+ const response = await fetch(url);
11327
+ if (!response.body) throw new Error("\uB2E4\uC6B4\uB85C\uB4DC \uC2E4\uD328: response body \uC5C6\uC74C");
11328
+ const file = (0, import_fs4.createWriteStream)(dest);
11329
+ const reader = response.body.getReader();
11330
+ let downloaded = 0;
11331
+ try {
11332
+ while (true) {
11333
+ const { done, value } = await reader.read();
11334
+ if (done) break;
11335
+ file.write(value);
11336
+ downloaded += value.length;
11337
+ onProgress?.(downloaded, totalBytes);
11338
+ }
11339
+ } finally {
11340
+ file.end();
11341
+ reader.releaseLock();
11342
+ }
11343
+ }
11344
+ async function installForPlatform(pkg, onProgress) {
11345
+ const platform = process.platform;
11346
+ await (0, import_promises2.mkdir)(CACHE_DIR, { recursive: true });
11347
+ const downloadPath = (0, import_path5.join)(CACHE_DIR, `download-${Date.now()}`);
11348
+ await downloadWithProgress(pkg.url, downloadPath, pkg.sizeMb * 1024 * 1024, onProgress);
11349
+ try {
11350
+ if (platform === "darwin") {
11351
+ return await installMacOS(pkg, downloadPath);
11352
+ } else if (platform === "linux") {
11353
+ return await installLinux(pkg, downloadPath);
11354
+ } else if (platform === "win32") {
11355
+ return await installWindows(pkg, downloadPath);
11356
+ }
11357
+ } catch (err) {
11358
+ await (0, import_promises2.rm)(downloadPath, { force: true });
11359
+ throw err;
11360
+ }
11361
+ throw new ConvertError("UNSUPPORTED_PLATFORM", `${platform}\uC740 \uC790\uB3D9 \uC124\uCE58\uB97C \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4`);
11362
+ }
11363
+ async function installMacOS(pkg, downloadPath) {
11364
+ const mountPoint = `/Volumes/LibreOffice_${Date.now()}`;
11365
+ await new Promise((resolve4, reject) => {
11366
+ const child = (0, import_child_process4.spawn)("hdiutil", ["attach", "-nobrowse", "-mountpoint", mountPoint, downloadPath]);
11367
+ child.on("close", (code) => code === 0 ? resolve4() : reject(new Error("dmg \uB9C8\uC6B4\uD2B8 \uC2E4\uD328")));
11368
+ });
11369
+ try {
11370
+ const appSource = (0, import_path5.join)(mountPoint, "LibreOffice.app");
11371
+ const appDest = (0, import_path5.join)(CACHE_DIR, "LibreOffice.app");
11372
+ await new Promise((resolve4, reject) => {
11373
+ const child = (0, import_child_process4.spawn)("cp", ["-R", appSource, appDest]);
11374
+ child.on("close", (code) => code === 0 ? resolve4() : reject(new Error(".app \uBCF5\uC0AC \uC2E4\uD328")));
11375
+ });
11376
+ } finally {
11377
+ await new Promise((resolve4) => {
11378
+ const child = (0, import_child_process4.spawn)("hdiutil", ["detach", mountPoint]);
11379
+ child.on("close", () => resolve4());
11380
+ });
11381
+ }
11382
+ await (0, import_promises2.rm)(downloadPath, { force: true });
11383
+ return await createSymlink((0, import_path5.join)(CACHE_DIR, pkg.binPath));
11384
+ }
11385
+ async function installLinux(pkg, downloadPath) {
11386
+ const extractDir = (0, import_path5.join)(CACHE_DIR, `extract-${Date.now()}`);
11387
+ await (0, import_promises2.mkdir)(extractDir, { recursive: true });
11388
+ await new Promise((resolve4, reject) => {
11389
+ const child = (0, import_child_process4.spawn)("tar", ["xzf", downloadPath, "-C", extractDir]);
11390
+ child.on("close", (code) => code === 0 ? resolve4() : reject(new Error("\uC555\uCD95 \uD574\uC81C \uC2E4\uD328")));
11391
+ });
11392
+ const debsDir = (0, import_path5.join)(extractDir, "DEBS");
11393
+ try {
11394
+ await (0, import_promises2.access)(debsDir);
11395
+ const entries = await (await import("fs/promises")).readdir(debsDir);
11396
+ for (const entry of entries) {
11397
+ if (entry.endsWith(".deb")) {
11398
+ await new Promise((resolve4, reject) => {
11399
+ const child = (0, import_child_process4.spawn)("dpkg-deb", ["-x", (0, import_path5.join)(debsDir, entry), CACHE_DIR]);
11400
+ child.on("close", (code) => code === 0 ? resolve4() : reject(new Error(`${entry} \uCD94\uCD9C \uC2E4\uD328`)));
11401
+ });
11402
+ }
11403
+ }
11404
+ } catch {
11405
+ }
11406
+ await (0, import_promises2.rm)(downloadPath, { force: true });
11407
+ await (0, import_promises2.rm)(extractDir, { recursive: true, force: true });
11408
+ return await createSymlink((0, import_path5.join)(CACHE_DIR, pkg.binPath));
11409
+ }
11410
+ async function installWindows(pkg, downloadPath) {
11411
+ await new Promise((resolve4, reject) => {
11412
+ const child = (0, import_child_process4.spawn)("msiexec", ["/a", downloadPath, "/qn", `TARGETDIR=${CACHE_DIR}`]);
11413
+ child.on("close", (code) => code === 0 ? resolve4() : reject(new Error("MSI \uC124\uCE58 \uC2E4\uD328")));
11414
+ });
11415
+ await (0, import_promises2.rm)(downloadPath, { force: true });
11416
+ return (0, import_path5.join)(CACHE_DIR, pkg.binPath);
11417
+ }
11418
+ async function createSymlink(actualBin) {
11419
+ const binDir = (0, import_path5.join)(CACHE_DIR, "bin");
11420
+ await (0, import_promises2.mkdir)(binDir, { recursive: true });
11421
+ const linkBin = (0, import_path5.join)(binDir, "soffice");
11422
+ try {
11423
+ await (0, import_promises2.symlink)(actualBin, linkBin);
11424
+ } catch {
11425
+ }
11426
+ process.env.PATH = `${binDir}${import_path5.delimiter}${process.env.PATH}`;
11427
+ return linkBin;
11428
+ }
11429
+ async function installLibreOffice(onProgress) {
11430
+ const platform = process.platform;
11431
+ const pkg = PACKAGES[platform];
11432
+ if (!pkg) {
11290
11433
  throw new ConvertError(
11434
+ "UNSUPPORTED_PLATFORM",
11435
+ `${platform}\uC740 \uC790\uB3D9 \uC124\uCE58\uB97C \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4. \uC218\uB3D9\uC73C\uB85C LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.`
11436
+ );
11437
+ }
11438
+ return await installForPlatform(pkg, onProgress);
11439
+ }
11440
+ async function resolveSoffice(emitter, autoInstall = true) {
11441
+ emitter.validate("soffice_check", "LibreOffice \uAC00\uC6A9\uC131 \uD655\uC778 \uC911...");
11442
+ const inPath = await findInPath();
11443
+ if (inPath) {
11444
+ emitter.validate("soffice_found", "\uC2DC\uC2A4\uD15C PATH\uC5D0\uC11C LibreOffice \uBC1C\uACAC", { sofficePath: inPath });
11445
+ return inPath;
11446
+ }
11447
+ const inCache = await findInCache();
11448
+ if (inCache) {
11449
+ emitter.validate("soffice_found", "\uCE90\uC2DC\uB41C LibreOffice \uBC1C\uACAC", { sofficePath: inCache });
11450
+ return inCache;
11451
+ }
11452
+ const inDefault = await findInDefaultPaths();
11453
+ if (inDefault) {
11454
+ emitter.validate("soffice_found", "\uAE30\uBCF8 \uACBD\uB85C\uC5D0\uC11C LibreOffice \uBC1C\uACAC", { sofficePath: inDefault });
11455
+ return inDefault;
11456
+ }
11457
+ if (!autoInstall) {
11458
+ emitter.error(
11459
+ "validate",
11291
11460
  "SOFFICE_NOT_FOUND",
11292
- "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694."
11461
+ "LibreOffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4",
11462
+ "\uC218\uB3D9\uC73C\uB85C \uC124\uCE58\uD558\uAC70\uB098 autoInstallLibreOffice: true \uC635\uC158\uC744 \uC0AC\uC6A9\uD558\uC138\uC694."
11293
11463
  );
11464
+ throw new ConvertError("SOFFICE_NOT_FOUND", "LibreOffice\uAC00 \uC124\uCE58\uB418\uC9C0 \uC54A\uC558\uC2B5\uB2C8\uB2E4");
11465
+ }
11466
+ if (installInFlight) {
11467
+ return installInFlight;
11294
11468
  }
11469
+ emitter.install("install_start", "LibreOffice \uC790\uB3D9 \uC124\uCE58\uB97C \uC2DC\uC791\uD569\uB2C8\uB2E4...");
11470
+ installInFlight = (async () => {
11471
+ try {
11472
+ const installed = await installLibreOffice((downloaded, total) => {
11473
+ const percent = Math.round(downloaded / total * 100);
11474
+ emitter.install("download_progress", `\uB2E4\uC6B4\uB85C\uB4DC \uC911... ${percent}%`, {
11475
+ percent,
11476
+ downloadedBytes: downloaded,
11477
+ totalBytes: total
11478
+ });
11479
+ });
11480
+ emitter.install("install_complete", "\uC124\uCE58 \uC644\uB8CC", { installedPath: installed });
11481
+ return installed;
11482
+ } catch (err) {
11483
+ const errorMsg = err instanceof Error ? err.message : String(err);
11484
+ emitter.install("install_failed", "\uC124\uCE58 \uC2E4\uD328", { error: errorMsg });
11485
+ throw err;
11486
+ } finally {
11487
+ installInFlight = null;
11488
+ }
11489
+ })();
11490
+ return installInFlight;
11295
11491
  }
11492
+
11493
+ // src/convert/libreoffice.ts
11494
+ var libreConvert = import_libreoffice_convert.default.convert;
11296
11495
  async function convertBuffer(buffer, targetExt, timeoutMs = 6e4) {
11297
11496
  return new Promise((resolve4, reject) => {
11298
11497
  const timer = setTimeout(() => {
@@ -11316,6 +11515,54 @@ async function convertBuffer(buffer, targetExt, timeoutMs = 6e4) {
11316
11515
  });
11317
11516
  }
11318
11517
 
11518
+ // src/convert/events.ts
11519
+ var ConvertEventEmitter = class {
11520
+ listener = null;
11521
+ /** 이벤트 리스너 등록 */
11522
+ setListener(listener) {
11523
+ this.listener = listener;
11524
+ }
11525
+ /** 이벤트 발송 */
11526
+ emit(event) {
11527
+ try {
11528
+ this.listener?.(event);
11529
+ } catch {
11530
+ }
11531
+ }
11532
+ /** 타입 안전한 헬퍼: detect 이벤트 */
11533
+ detect(stage, message, meta) {
11534
+ this.emit({ type: "detect", stage, message, ...meta });
11535
+ }
11536
+ /** 타입 안전한 헬퍼: validate 이벤트 */
11537
+ validate(stage, message, meta) {
11538
+ this.emit({ type: "validate", stage, message, ...meta });
11539
+ }
11540
+ /** 타입 안전한 헬퍼: install 이벤트 */
11541
+ install(stage, message, meta) {
11542
+ this.emit({ type: "install", stage, message, ...meta });
11543
+ }
11544
+ /** 타입 안전한 헬퍼: convert 진행 이벤트 */
11545
+ progress(percent, message) {
11546
+ this.emit({ type: "convert", stage: "convert_progress", message, percent });
11547
+ }
11548
+ /** 타입 안전한 헬퍼: convert 시작 */
11549
+ convertStart(message) {
11550
+ this.emit({ type: "convert", stage: "convert_start", message, percent: 0 });
11551
+ }
11552
+ /** 타입 안전한 헬퍼: convert 완료 */
11553
+ convertDone(message) {
11554
+ this.emit({ type: "convert", stage: "convert_done", message, percent: 100 });
11555
+ }
11556
+ /** 타입 안전한 헬퍼: 완료 이벤트 */
11557
+ complete(result) {
11558
+ this.emit({ type: "complete", stage: "success", message: "\uBCC0\uD658 \uC644\uB8CC", result });
11559
+ }
11560
+ /** 타입 안전한 헬퍼: 에러 이벤트 */
11561
+ error(stage, code, message, suggestion) {
11562
+ this.emit({ type: "error", stage, code, message, recoverable: true, suggestion });
11563
+ }
11564
+ };
11565
+
11319
11566
  // src/convert/index.ts
11320
11567
  var isConverting = false;
11321
11568
  var queue = [];
@@ -11340,81 +11587,129 @@ async function acquireConvertLock() {
11340
11587
  });
11341
11588
  }
11342
11589
  async function convertToPdf(input, options) {
11343
- let buffer;
11344
- try {
11345
- if (typeof input === "string") {
11346
- buffer = await (0, import_promises2.readFile)(input);
11347
- } else if (Buffer.isBuffer(input)) {
11348
- buffer = input;
11349
- } else {
11350
- buffer = Buffer.from(input);
11351
- }
11352
- } catch (err) {
11353
- return {
11354
- success: false,
11355
- code: "PARSE_ERROR",
11356
- error: `\uC785\uB825 \uC77D\uAE30 \uC2E4\uD328: ${err instanceof Error ? err.message : String(err)}`,
11357
- stage: "detect"
11358
- };
11359
- }
11360
- const MAX_FILE_SIZE = 500 * 1024 * 1024;
11361
- if (buffer.length > MAX_FILE_SIZE) {
11362
- return {
11363
- success: false,
11364
- code: "FILE_TOO_LARGE",
11365
- error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.length / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`,
11366
- stage: "detect"
11367
- };
11590
+ const emitter = new ConvertEventEmitter();
11591
+ if (options?.onEvent) {
11592
+ emitter.setListener(options.onEvent);
11368
11593
  }
11369
- const format = detectFormat(toArrayBuffer(buffer));
11370
- if (format !== "hwp" && format !== "hwpx") {
11371
- return {
11372
- success: false,
11373
- code: "UNSUPPORTED_FORMAT",
11374
- error: `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD3EC\uB9F7\uC785\uB2C8\uB2E4: ${format}`,
11375
- stage: "detect"
11376
- };
11594
+ if (options?.onProgress) {
11595
+ const legacyProgress = options.onProgress;
11596
+ emitter.setListener((event) => {
11597
+ if (event.type === "convert" && event.stage === "convert_progress") {
11598
+ legacyProgress(event.percent, event.message);
11599
+ }
11600
+ });
11377
11601
  }
11378
11602
  try {
11379
- await assertSofficeAvailable();
11380
- } catch (err) {
11381
- if (err instanceof ConvertError) {
11603
+ emitter.detect("reading", "\uC785\uB825 \uD30C\uC77C \uC77D\uB294 \uC911...");
11604
+ let buffer;
11605
+ try {
11606
+ if (typeof input === "string") {
11607
+ buffer = await (0, import_promises3.readFile)(input);
11608
+ } else if (Buffer.isBuffer(input)) {
11609
+ buffer = input;
11610
+ } else {
11611
+ buffer = Buffer.from(input);
11612
+ }
11613
+ } catch (err) {
11614
+ emitter.error(
11615
+ "detect",
11616
+ "PARSE_ERROR",
11617
+ `\uC785\uB825 \uC77D\uAE30 \uC2E4\uD328: ${err instanceof Error ? err.message : String(err)}`
11618
+ );
11382
11619
  return {
11383
11620
  success: false,
11384
- code: err.code,
11385
- error: err.message,
11386
- stage: "validate"
11621
+ code: "PARSE_ERROR",
11622
+ error: `\uC785\uB825 \uC77D\uAE30 \uC2E4\uD328: ${err instanceof Error ? err.message : String(err)}`,
11623
+ stage: "detect"
11387
11624
  };
11388
11625
  }
11389
- throw err;
11390
- }
11391
- const releaseLock = await acquireConvertLock();
11392
- try {
11393
- options?.onProgress?.(10, "convert");
11394
- const pdf = await convertBuffer(buffer, ".pdf", options?.timeoutMs);
11395
- options?.onProgress?.(100, "done");
11396
- return {
11397
- success: true,
11398
- pdf: new Uint8Array(pdf),
11399
- sourceFormat: format
11400
- };
11401
- } catch (err) {
11402
- if (err instanceof ConvertError) {
11626
+ const MAX_FILE_SIZE = 500 * 1024 * 1024;
11627
+ if (buffer.length > MAX_FILE_SIZE) {
11628
+ emitter.error(
11629
+ "detect",
11630
+ "FILE_TOO_LARGE",
11631
+ `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.length / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`
11632
+ );
11633
+ return {
11634
+ success: false,
11635
+ code: "FILE_TOO_LARGE",
11636
+ error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.length / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`,
11637
+ stage: "detect"
11638
+ };
11639
+ }
11640
+ const format = detectFormat(toArrayBuffer(buffer));
11641
+ emitter.detect("format_detected", `\uD3EC\uB9F7 \uAC10\uC9C0 \uC644\uB8CC: ${format}`, { format });
11642
+ if (format !== "hwp" && format !== "hwpx") {
11643
+ emitter.error("detect", "UNSUPPORTED_FORMAT", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD3EC\uB9F7\uC785\uB2C8\uB2E4: ${format}`);
11644
+ return {
11645
+ success: false,
11646
+ code: "UNSUPPORTED_FORMAT",
11647
+ error: `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD3EC\uB9F7\uC785\uB2C8\uB2E4: ${format}`,
11648
+ stage: "detect"
11649
+ };
11650
+ }
11651
+ emitter.validate("soffice_check", "LibreOffice \uAC00\uC6A9\uC131 \uD655\uC778 \uC911...");
11652
+ let sofficePath;
11653
+ try {
11654
+ sofficePath = await resolveSoffice(emitter, options?.autoInstallLibreOffice ?? true);
11655
+ } catch (err) {
11656
+ if (err instanceof ConvertError) {
11657
+ return {
11658
+ success: false,
11659
+ code: err.code,
11660
+ error: err.message,
11661
+ stage: "validate"
11662
+ };
11663
+ }
11664
+ throw err;
11665
+ }
11666
+ const releaseLock = await acquireConvertLock();
11667
+ try {
11668
+ emitter.convertStart("\uBCC0\uD658 \uC2DC\uC791...");
11669
+ emitter.progress(10, "\uBCC0\uD658 \uC911...");
11670
+ const pdf = await convertBuffer(buffer, ".pdf", options?.timeoutMs);
11671
+ emitter.progress(100, "\uBCC0\uD658 \uC644\uB8CC");
11672
+ emitter.convertDone("\uBCC0\uD658 \uC644\uB8CC");
11673
+ const result = {
11674
+ success: true,
11675
+ pdf: new Uint8Array(pdf),
11676
+ sourceFormat: format
11677
+ };
11678
+ emitter.complete({
11679
+ sourceFormat: format,
11680
+ pdfSize: pdf.length
11681
+ });
11682
+ return result;
11683
+ } catch (err) {
11684
+ if (err instanceof ConvertError) {
11685
+ emitter.error("convert", err.code, err.message);
11686
+ return {
11687
+ success: false,
11688
+ code: err.code,
11689
+ error: err.message,
11690
+ stage: "convert"
11691
+ };
11692
+ }
11693
+ const errorMsg = err instanceof Error ? err.message : "\uBCC0\uD658 \uC2E4\uD328";
11694
+ emitter.error("convert", classifyError(err), errorMsg);
11403
11695
  return {
11404
11696
  success: false,
11405
- code: err.code,
11406
- error: err.message,
11697
+ code: classifyError(err),
11698
+ error: errorMsg,
11407
11699
  stage: "convert"
11408
11700
  };
11701
+ } finally {
11702
+ releaseLock();
11409
11703
  }
11704
+ } catch (unexpectedErr) {
11705
+ const errorMsg = unexpectedErr instanceof Error ? unexpectedErr.message : "\uC608\uC0C1\uCE58 \uBABB\uD55C \uC624\uB958";
11706
+ emitter.error("convert", "PARSE_ERROR", errorMsg);
11410
11707
  return {
11411
11708
  success: false,
11412
- code: classifyError(err),
11413
- error: err instanceof Error ? err.message : "\uBCC0\uD658 \uC2E4\uD328",
11709
+ code: "PARSE_ERROR",
11710
+ error: errorMsg,
11414
11711
  stage: "convert"
11415
11712
  };
11416
- } finally {
11417
- releaseLock();
11418
11713
  }
11419
11714
  }
11420
11715
  async function convertHwpToPdf(input, options) {
@@ -11442,9 +11737,6 @@ async function convertHwpxToPdf(input, options) {
11442
11737
  return result;
11443
11738
  }
11444
11739
 
11445
- // src/index.ts
11446
- init_utils();
11447
-
11448
11740
  // src/ocr/api-key-rotation.ts
11449
11741
  var AllKeysCoolingDownError = class extends Error {
11450
11742
  waitMs;
@@ -11539,9 +11831,9 @@ var ApiKeyRotationPool = class _ApiKeyRotationPool {
11539
11831
  };
11540
11832
 
11541
11833
  // src/pipeline/unified-ocr.ts
11542
- var import_promises3 = require("fs/promises");
11543
- var import_path5 = require("path");
11544
- var import_child_process4 = require("child_process");
11834
+ var import_promises4 = require("fs/promises");
11835
+ var import_path6 = require("path");
11836
+ var import_child_process5 = require("child_process");
11545
11837
  var import_node_perf_hooks = require("perf_hooks");
11546
11838
  init_logger();
11547
11839
 
@@ -11675,15 +11967,15 @@ function elapsedMs(startAt) {
11675
11967
  return Math.round(import_node_perf_hooks.performance.now() - startAt);
11676
11968
  }
11677
11969
  async function runUnifiedOcrPipeline(inputPath, options = {}) {
11678
- const absInput = (0, import_path5.resolve)(inputPath);
11679
- const stem = (0, import_path5.basename)(absInput, (0, import_path5.extname)(absInput));
11680
- const workspaceDir = (0, import_path5.resolve)(options.workspaceDir ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}_ocr_workspace`));
11681
- const imagesDir = (0, import_path5.join)(workspaceDir, "images");
11682
- const rawDir = (0, import_path5.join)(workspaceDir, "ocr", "raw");
11683
- const diffDir = (0, import_path5.join)(workspaceDir, "ocr", "diff");
11684
- const outputPath = (0, import_path5.resolve)(options.outputPath ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}.md`));
11685
- const reportPath = (0, import_path5.join)(workspaceDir, "run-report.json");
11686
- const modelCachePath = (0, import_path5.join)((0, import_path5.dirname)(absInput), ".kordoc-model-cache.json");
11970
+ const absInput = (0, import_path6.resolve)(inputPath);
11971
+ const stem = (0, import_path6.basename)(absInput, (0, import_path6.extname)(absInput));
11972
+ const workspaceDir = (0, import_path6.resolve)(options.workspaceDir ?? (0, import_path6.join)((0, import_path6.dirname)(absInput), `${stem}_ocr_workspace`));
11973
+ const imagesDir = (0, import_path6.join)(workspaceDir, "images");
11974
+ const rawDir = (0, import_path6.join)(workspaceDir, "ocr", "raw");
11975
+ const diffDir = (0, import_path6.join)(workspaceDir, "ocr", "diff");
11976
+ const outputPath = (0, import_path6.resolve)(options.outputPath ?? (0, import_path6.join)((0, import_path6.dirname)(absInput), `${stem}.md`));
11977
+ const reportPath = (0, import_path6.join)(workspaceDir, "run-report.json");
11978
+ const modelCachePath = (0, import_path6.join)((0, import_path6.dirname)(absInput), ".kordoc-model-cache.json");
11687
11979
  const baseUrl = options.baseUrl ?? "https://integrate.api.nvidia.com/v1/chat/completions";
11688
11980
  const timeoutMs = options.timeoutMs ?? 6e4;
11689
11981
  const maxRetriesPerPage = options.maxRetriesPerPage ?? 5;
@@ -11697,9 +11989,9 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11697
11989
  const keyPool = ApiKeyRotationPool.fromEnv();
11698
11990
  const runId = options.runId ?? generateRunId("ocr");
11699
11991
  const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
11700
- await (0, import_promises3.mkdir)(imagesDir, { recursive: true });
11701
- await (0, import_promises3.mkdir)(rawDir, { recursive: true });
11702
- await (0, import_promises3.mkdir)(diffDir, { recursive: true });
11992
+ await (0, import_promises4.mkdir)(imagesDir, { recursive: true });
11993
+ await (0, import_promises4.mkdir)(rawDir, { recursive: true });
11994
+ await (0, import_promises4.mkdir)(diffDir, { recursive: true });
11703
11995
  const timingsMs = {};
11704
11996
  const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
11705
11997
  const markStageProgress = (stage, stagePercent, current, total, message, model) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message, model });
@@ -11715,12 +12007,30 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11715
12007
  currentStage = "convert";
11716
12008
  markStageStart("convert", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC911");
11717
12009
  logStage("info", "convert", "start", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC2DC\uC791", { input: absInput });
11718
- if ((0, import_path5.extname)(absInput).toLowerCase() !== ".pdf") {
11719
- await assertSofficeAvailable();
11720
- workingPdfPath = (0, import_path5.join)(workspaceDir, `${stem}.pdf`);
11721
- const inputBuffer = await (0, import_promises3.readFile)(absInput);
12010
+ if ((0, import_path6.extname)(absInput).toLowerCase() !== ".pdf") {
12011
+ const convertEmitter = new ConvertEventEmitter();
12012
+ if (options.onEvent) {
12013
+ convertEmitter.setListener((evt) => {
12014
+ if (evt.type === "install" || evt.type === "validate" || evt.type === "error") {
12015
+ try {
12016
+ ;
12017
+ options.onEvent(evt);
12018
+ } catch {
12019
+ }
12020
+ }
12021
+ });
12022
+ }
12023
+ if (options.sofficePath) {
12024
+ const sofficeDir = (0, import_path6.dirname)(options.sofficePath);
12025
+ process.env.PATH = `${sofficeDir}${import_path6.delimiter}${process.env.PATH ?? ""}`;
12026
+ convertEmitter.validate("soffice_found", "\uC9C1\uC811 \uC9C0\uC815\uB41C LibreOffice \uACBD\uB85C \uC0AC\uC6A9", { sofficePath: options.sofficePath });
12027
+ } else {
12028
+ await resolveSoffice(convertEmitter, options.autoInstallLibreOffice ?? false);
12029
+ }
12030
+ workingPdfPath = (0, import_path6.join)(workspaceDir, `${stem}.pdf`);
12031
+ const inputBuffer = await (0, import_promises4.readFile)(absInput);
11722
12032
  const out = await convertBuffer(inputBuffer, ".pdf");
11723
- await (0, import_promises3.writeFile)(workingPdfPath, out);
12033
+ await (0, import_promises4.writeFile)(workingPdfPath, out);
11724
12034
  }
11725
12035
  timingsMs.convert = elapsedMs(convertStart);
11726
12036
  markStageDone("convert", "PDF \uBCC0\uD658 \uC644\uB8CC");
@@ -11731,10 +12041,10 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11731
12041
  if (totalPages === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC218\uB97C \uD655\uC778\uD560 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4.");
11732
12042
  markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
11733
12043
  logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi, totalPages });
11734
- await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, (0, import_path5.join)(imagesDir, "page")]);
11735
- const firstFiles = (await (0, import_promises3.readdir)(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
12044
+ await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, (0, import_path6.join)(imagesDir, "page")]);
12045
+ const firstFiles = (await (0, import_promises4.readdir)(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
11736
12046
  if (firstFiles.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uCCAB \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328");
11737
- const probeImage = (0, import_path5.join)(imagesDir, firstFiles[0]);
12047
+ const probeImage = (0, import_path6.join)(imagesDir, firstFiles[0]);
11738
12048
  markStageProgress("render", Math.round(1 / totalPages * 100), 1, totalPages, `\uD398\uC774\uC9C0 1/${totalPages} \uB80C\uB354\uB9C1`);
11739
12049
  const probeStart = import_node_perf_hooks.performance.now();
11740
12050
  currentStage = "probe";
@@ -11780,7 +12090,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11780
12090
  try {
11781
12091
  await queue2.enqueue({ pageNumber: 1, imagePath: probeImage });
11782
12092
  if (totalPages > 1) {
11783
- for await (const item of renderPdfToPngStream(workingPdfPath, (0, import_path5.join)(imagesDir, "page"), dpi, totalPages, 2)) {
12093
+ for await (const item of renderPdfToPngStream(workingPdfPath, (0, import_path6.join)(imagesDir, "page"), dpi, totalPages, 2)) {
11784
12094
  await queue2.enqueue(item);
11785
12095
  renderDone++;
11786
12096
  markStageProgress("render", Math.round(renderDone / totalPages * 100), renderDone, totalPages, `\uD398\uC774\uC9C0 ${renderDone}/${totalPages} \uB80C\uB354\uB9C1`);
@@ -11830,8 +12140,8 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11830
12140
  const sortedEntries = Array.from(pageResultsMap.entries()).sort((a, b) => a[0] - b[0]);
11831
12141
  const rawPagePaths = [];
11832
12142
  for (const [pageNum, markdown] of sortedEntries) {
11833
- const pagePath = (0, import_path5.join)(rawDir, `page_${String(pageNum).padStart(4, "0")}.md`);
11834
- await (0, import_promises3.writeFile)(pagePath, markdown, "utf-8");
12143
+ const pagePath = (0, import_path6.join)(rawDir, `page_${String(pageNum).padStart(4, "0")}.md`);
12144
+ await (0, import_promises4.writeFile)(pagePath, markdown, "utf-8");
11835
12145
  rawPagePaths.push(pagePath);
11836
12146
  }
11837
12147
  const mergeStart = import_node_perf_hooks.performance.now();
@@ -11839,7 +12149,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11839
12149
  markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
11840
12150
  logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
11841
12151
  const merged = await mergeMarkdownPages(rawPagePaths);
11842
- await (0, import_promises3.writeFile)(outputPath, merged, "utf-8");
12152
+ await (0, import_promises4.writeFile)(outputPath, merged, "utf-8");
11843
12153
  timingsMs.merge = elapsedMs(mergeStart);
11844
12154
  markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
11845
12155
  logStage("info", "merge", "done", "\uCD5C\uC885 \uBCD1\uD569 \uC644\uB8CC", { outputPath, elapsedMs: timingsMs.merge });
@@ -11855,7 +12165,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11855
12165
  timingsMs,
11856
12166
  modelCachePath
11857
12167
  };
11858
- await (0, import_promises3.writeFile)(reportPath, JSON.stringify(report, null, 2), "utf-8");
12168
+ await (0, import_promises4.writeFile)(reportPath, JSON.stringify(report, null, 2), "utf-8");
11859
12169
  logStage("info", "finalize", "done", "run-report \uC800\uC7A5 \uC644\uB8CC", { reportPath });
11860
12170
  return { outputPath, reportPath, selectedModel };
11861
12171
  } catch (err) {
@@ -11930,7 +12240,7 @@ async function getPdfPageCount(pdfPath) {
11930
12240
  return n;
11931
12241
  }
11932
12242
  async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, startPage = 1) {
11933
- const imagesDir = (0, import_path5.dirname)(prefixPath);
12243
+ const imagesDir = (0, import_path6.dirname)(prefixPath);
11934
12244
  for (let page = startPage; page <= totalPages; page++) {
11935
12245
  try {
11936
12246
  await runCommand("pdftoppm", [
@@ -11944,9 +12254,9 @@ async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, start
11944
12254
  pdfPath,
11945
12255
  prefixPath
11946
12256
  ]);
11947
- const files = await (0, import_promises3.readdir)(imagesDir);
12257
+ const files = await (0, import_promises4.readdir)(imagesDir);
11948
12258
  const pageFiles = files.filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
11949
- const imagePath = (0, import_path5.join)(imagesDir, pageFiles[pageFiles.length - 1]);
12259
+ const imagePath = (0, import_path6.join)(imagesDir, pageFiles[pageFiles.length - 1]);
11950
12260
  yield { pageNumber: page, imagePath };
11951
12261
  } catch (err) {
11952
12262
  yield {
@@ -11959,7 +12269,7 @@ async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, start
11959
12269
  }
11960
12270
  async function runCommand(cmd, args) {
11961
12271
  await new Promise((resolvePromise, reject) => {
11962
- const child = (0, import_child_process4.spawn)(cmd, args, { stdio: "pipe" });
12272
+ const child = (0, import_child_process5.spawn)(cmd, args, { stdio: "pipe" });
11963
12273
  let stderr = "";
11964
12274
  child.stderr.on("data", (d) => {
11965
12275
  stderr += String(d);
@@ -11973,7 +12283,7 @@ async function runCommand(cmd, args) {
11973
12283
  }
11974
12284
  async function runCommandWithStdout(cmd, args) {
11975
12285
  return await new Promise((resolvePromise, reject) => {
11976
- const child = (0, import_child_process4.spawn)(cmd, args, { stdio: "pipe" });
12286
+ const child = (0, import_child_process5.spawn)(cmd, args, { stdio: "pipe" });
11977
12287
  let stdout = "";
11978
12288
  let stderr = "";
11979
12289
  child.stdout.on("data", (d) => {
@@ -12062,7 +12372,7 @@ function startParallelProbeRuns(input) {
12062
12372
  }
12063
12373
  async function loadModelCache(path) {
12064
12374
  try {
12065
- const raw = await (0, import_promises3.readFile)(path, "utf-8");
12375
+ const raw = await (0, import_promises4.readFile)(path, "utf-8");
12066
12376
  return JSON.parse(raw);
12067
12377
  } catch {
12068
12378
  return null;
@@ -12093,7 +12403,7 @@ async function updateModelCache(path, probes) {
12093
12403
  }
12094
12404
  }
12095
12405
  current.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
12096
- await (0, import_promises3.writeFile)(path, JSON.stringify(current, null, 2), "utf-8");
12406
+ await (0, import_promises4.writeFile)(path, JSON.stringify(current, null, 2), "utf-8");
12097
12407
  }
12098
12408
  async function ocrWorkerPool(input) {
12099
12409
  const { queue: queue2, workerCount, ocrInput, onPageDone } = input;
@@ -12153,7 +12463,7 @@ async function ocrImageWithFallback(input) {
12153
12463
  async function mergeMarkdownPages(paths) {
12154
12464
  const out = [];
12155
12465
  for (let i = 0; i < paths.length; i++) {
12156
- const txt = (await (0, import_promises3.readFile)(paths[i], "utf-8")).trim();
12466
+ const txt = (await (0, import_promises4.readFile)(paths[i], "utf-8")).trim();
12157
12467
  if (!txt) continue;
12158
12468
  out.push(txt);
12159
12469
  }
@@ -12269,7 +12579,7 @@ async function ocrImageViaNim(input) {
12269
12579
  throw new UnifiedOcrError("OCR_FAILED", "ocr", `OCR \uC7AC\uC2DC\uB3C4 \uCD08\uACFC: ${lastErr}`);
12270
12580
  }
12271
12581
  async function encodeBase64(path) {
12272
- const b = await (0, import_promises3.readFile)(path);
12582
+ const b = await (0, import_promises4.readFile)(path);
12273
12583
  return b.toString("base64");
12274
12584
  }
12275
12585
  function stripCodeFence3(text) {
@@ -12281,7 +12591,7 @@ async function delay(ms) {
12281
12591
  await new Promise((resolvePromise) => setTimeout(resolvePromise, ms));
12282
12592
  }
12283
12593
  function ensureSupportedInput(path) {
12284
- const ext = (0, import_path5.extname)(path).toLowerCase();
12594
+ const ext = (0, import_path6.extname)(path).toLowerCase();
12285
12595
  const allowed = /* @__PURE__ */ new Set([".pdf", ".hwp", ".hwpx", ".docx", ".xlsx"]);
12286
12596
  if (!allowed.has(ext)) {
12287
12597
  throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uC785\uB825 \uD3EC\uB9F7: ${ext}`);
@@ -12289,6 +12599,16 @@ function ensureSupportedInput(path) {
12289
12599
  }
12290
12600
  function normalizePipelineError(err, stage) {
12291
12601
  if (err instanceof UnifiedOcrError) return err;
12602
+ if (err instanceof ConvertError) {
12603
+ const codeMap = {
12604
+ SOFFICE_NOT_FOUND: "SOFFICE_NOT_FOUND",
12605
+ CONVERT_FAILED: "CONVERT_FAILED",
12606
+ TIMEOUT: "CONVERT_FAILED",
12607
+ UNSUPPORTED_PLATFORM: "CONVERT_FAILED",
12608
+ UNSUPPORTED_FORMAT: "UNSUPPORTED_INPUT"
12609
+ };
12610
+ return new UnifiedOcrError(codeMap[err.code] ?? "CONVERT_FAILED", stage, err.message);
12611
+ }
12292
12612
  const message = err instanceof Error ? err.message : String(err);
12293
12613
  const codeByStage = {
12294
12614
  convert: "CONVERT_FAILED",
@@ -12308,7 +12628,7 @@ async function parse2(input, options) {
12308
12628
  let buffer;
12309
12629
  if (typeof input === "string") {
12310
12630
  try {
12311
- const buf = await (0, import_promises4.readFile)(input);
12631
+ const buf = await (0, import_promises5.readFile)(input);
12312
12632
  buffer = toArrayBuffer(buf);
12313
12633
  } catch (err) {
12314
12634
  const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;