@clazic/kordoc 2.5.2 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -37,6 +37,118 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
37
37
  mod
38
38
  ));
39
39
 
40
+ // src/utils.ts
41
// Namespace object for src/utils.ts. Each entry is registered through a lazy
// getter so the binding is read at access time, i.e. after init_utils() has
// assigned the late-initialised values (VERSION, KordocError).
var utils_exports = {};
__export(
  utils_exports,
  Object.fromEntries([
    ["KordocError", () => KordocError],
    ["VERSION", () => VERSION],
    ["classifyError", () => classifyError],
    ["isPathTraversal", () => isPathTraversal],
    ["normalizeKordocError", () => normalizeKordocError],
    ["precheckZipSize", () => precheckZipSize],
    ["sanitizeError", () => sanitizeError],
    ["sanitizeHref", () => sanitizeHref],
    ["toArrayBuffer", () => toArrayBuffer]
  ])
);
53
/**
 * Returns an ArrayBuffer holding exactly the bytes visible through `buf`.
 *
 * When the view spans its whole backing buffer, that backing buffer is
 * returned as-is (no copy); otherwise the viewed range is copied with `slice`.
 *
 * @param {{buffer: ArrayBuffer, byteOffset: number, byteLength: number}} buf
 *   A Buffer / typed-array style view.
 * @returns {ArrayBuffer} The backing buffer itself or a copy of the viewed range.
 */
function toArrayBuffer(buf) {
  const { buffer: backing, byteOffset: start, byteLength: size } = buf;
  const coversWholeBacking = start === 0 && size === backing.byteLength;
  return coversWholeBacking ? backing : backing.slice(start, start + size);
}
59
/**
 * Produces a message that is safe to surface to callers.
 *
 * Only KordocError messages are passed through; any other value is replaced
 * by a fixed generic (Korean) message so its details are not exposed.
 *
 * @param {unknown} err - The caught value.
 * @returns {string} The KordocError message or the generic fallback.
 */
function sanitizeError(err) {
  const isKnownError = err instanceof KordocError;
  return isKnownError
    ? err.message
    : "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
}
63
/**
 * Reports whether an archive entry name looks unsafe to use as a path.
 *
 * A name is rejected when it contains a NUL character, contains ".." anywhere
 * (after converting backslashes to forward slashes), starts with "/", or
 * starts with a drive-letter prefix such as "C:".
 *
 * @param {string} name - Entry name to inspect.
 * @returns {boolean} True when the name should be treated as path traversal.
 */
function isPathTraversal(name) {
  if (name.includes("\0")) return true;
  const unixStyle = name.replace(/\\/g, "/");
  if (unixStyle.includes("..")) return true;
  if (unixStyle.startsWith("/")) return true;
  return /^[A-Za-z]:/.test(unixStyle);
}
68
/**
 * Inspects a ZIP archive's central directory, without decompressing anything,
 * to reject archives that declare too many entries or too much uncompressed data.
 *
 * The check is best-effort: if no end-of-central-directory record is found, or
 * reading the buffer fails for any reason, zeros are returned instead of an error.
 *
 * @param {ArrayBuffer} buffer - Raw archive bytes.
 * @param {number} [maxUncompressedSize=104857600] - Limit on the summed declared uncompressed size, in bytes.
 * @param {number} [maxEntries=500] - Limit on the entry count declared in the end record.
 * @returns {{totalUncompressed: number, entryCount: number}} Declared totals.
 * @throws {KordocError} When either limit is exceeded.
 */
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
  const EOCD_SIGNATURE = 0x06054b50; // "PK\x05\x06", read little-endian
  const CENTRAL_HEADER_SIGNATURE = 0x02014b50; // "PK\x01\x02", read little-endian
  const EOCD_FIXED_SIZE = 22;
  const CENTRAL_HEADER_FIXED_SIZE = 46;
  try {
    const view = new DataView(buffer);
    const total = buffer.byteLength;

    // Scan backwards from the last position where a fixed-size end record
    // fits, covering at most 65557 trailing bytes.
    const lowestCandidate = Math.max(0, total - 65557);
    let eocdOffset = -1;
    let probe = total - EOCD_FIXED_SIZE;
    while (probe >= lowestCandidate) {
      if (view.getUint32(probe, true) === EOCD_SIGNATURE) {
        eocdOffset = probe;
        break;
      }
      probe--;
    }
    if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };

    const entryCount = view.getUint16(eocdOffset + 10, true);
    if (entryCount > maxEntries) {
      throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
    }

    const cdSize = view.getUint32(eocdOffset + 12, true);
    const cdOffset = view.getUint32(eocdOffset + 16, true);
    const cdEnd = cdOffset + cdSize;
    // A directory that claims to extend past the buffer is not walked.
    if (cdEnd > total) return { totalUncompressed: 0, entryCount };

    let totalUncompressed = 0;
    let cursor = cdOffset;
    let visited = 0;
    while (visited < entryCount && cursor + CENTRAL_HEADER_FIXED_SIZE <= cdEnd) {
      if (view.getUint32(cursor, true) !== CENTRAL_HEADER_SIGNATURE) break;
      totalUncompressed += view.getUint32(cursor + 24, true);
      const nameLen = view.getUint16(cursor + 28, true);
      const extraLen = view.getUint16(cursor + 30, true);
      const commentLen = view.getUint16(cursor + 32, true);
      cursor += CENTRAL_HEADER_FIXED_SIZE + nameLen + extraLen + commentLen;
      visited++;
    }

    if (totalUncompressed > maxUncompressedSize) {
      const declaredMb = (totalUncompressed / 1024 / 1024).toFixed(1);
      throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${declaredMb}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
    }
    return { totalUncompressed, entryCount };
  } catch (err) {
    // Limit violations propagate; every other failure degrades to "unknown".
    if (err instanceof KordocError) throw err;
    return { totalUncompressed: 0, entryCount: 0 };
  }
}
106
/**
 * Validates a link target against the allow-list in SAFE_HREF_RE.
 *
 * @param {string} href - Raw link target.
 * @returns {string|null} The trimmed href when it is non-empty and matches
 *   SAFE_HREF_RE, otherwise null.
 */
function sanitizeHref(href) {
  const candidate = href.trim();
  if (!candidate) return null;
  return SAFE_HREF_RE.test(candidate) ? candidate : null;
}
111
/**
 * Maps an error to a coarse error code by looking for known substrings in its
 * message.
 *
 * Rules are evaluated top to bottom and the first match wins. The order is
 * significant: the ZIP-specific phrases must be tested before the broader
 * "bomb" / size-exceeded phrases that they also contain.
 *
 * @param {unknown} err - The caught value.
 * @returns {string} One of "ENCRYPTED", "DRM_PROTECTED", "ZIP_BOMB",
 *   "DECOMPRESSION_BOMB", "IMAGE_BASED_PDF", "NO_SECTIONS", "CORRUPTED" or
 *   "PARSE_ERROR" (also returned for non-Error values).
 */
function classifyError(err) {
  if (!(err instanceof Error)) return "PARSE_ERROR";
  const msg = err.message;
  const has = (needle) => msg.includes(needle);
  const rules = [
    ["ENCRYPTED", () => has("\uC554\uD638\uD654")],
    ["DRM_PROTECTED", () => has("DRM")],
    [
      "ZIP_BOMB",
      () => has("ZIP bomb") || has("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || has("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")
    ],
    [
      "DECOMPRESSION_BOMB",
      () => has("bomb") || has("\uD06C\uAE30 \uCD08\uACFC") || has("\uC555\uCD95 \uD574\uC81C")
    ],
    ["IMAGE_BASED_PDF", () => has("\uC774\uBBF8\uC9C0 \uAE30\uBC18")],
    [
      "NO_SECTIONS",
      () => has("\uC139\uC158") && (has("\uCC3E\uC744 \uC218 \uC5C6") || has("\uC5C6\uC74C"))
    ],
    ["CORRUPTED", () => has("\uC2DC\uADF8\uB2C8\uCC98") || has("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")]
  ];
  for (const [code, matches] of rules) {
    if (matches()) return code;
  }
  return "PARSE_ERROR";
}
123
/**
 * Coerces any thrown value into a KordocError carrying a code and a stage.
 *
 * - An existing KordocError is returned as the same instance; `stage` and
 *   `code` are filled in only where they are currently falsy.
 * - Any other Error is wrapped: its message is reused (or `fallbackMessage`
 *   when empty) and the code comes from classifyError.
 * - Non-Error values become a KordocError built from the fallbacks.
 *
 * @param {unknown} err - The caught value.
 * @param {string} fallbackMessage - Message used when `err` supplies none.
 * @param {string} [stage="unknown"] - Stage recorded on the resulting error.
 * @param {string} [fallbackCode="PARSE_ERROR"] - Code used when none can be derived.
 * @returns {KordocError} The normalized error.
 */
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
  if (err instanceof KordocError) {
    err.stage ||= stage;
    err.code ||= fallbackCode;
    return err;
  }
  if (err instanceof Error) {
    return new KordocError(err.message || fallbackMessage, { code: classifyError(err), stage });
  }
  return new KordocError(fallbackMessage, { code: fallbackCode, stage });
}
133
// Late-initialised bindings of src/utils.ts; they stay undefined until
// init_utils() has run.
var VERSION, KordocError, SAFE_HREF_RE;
var init_utils = __esm({
  "src/utils.ts"() {
    "use strict";
    // This bundle ships as package version 2.6.0; the build had embedded the
    // previous release string ("2.5.2"), so the exported VERSION was stale.
    VERSION = "2.6.0";
    /**
     * Error type used for failures that are safe to report to callers.
     * `code` and `stage` are optional and are copied from the `opts` argument.
     */
    KordocError = class extends Error {
      code;
      stage;
      constructor(message, opts = {}) {
        super(message);
        this.name = "KordocError";
        this.code = opts.code;
        this.stage = opts.stage;
      }
    };
    // Allow-list of link prefixes accepted by sanitizeHref (case-insensitive):
    // http:, https:, mailto:, tel: and in-document "#" anchors.
    SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
  }
});
151
+
40
152
  // src/page-range.ts
41
153
  var page_range_exports = {};
42
154
  __export(page_range_exports, {
@@ -3059,7 +3171,7 @@ var init_provider = __esm({
3059
3171
  });
3060
3172
 
3061
3173
  // src/index.ts
3062
- import { readFile as readFile2 } from "fs/promises";
3174
+ import { readFile as readFile3 } from "fs/promises";
3063
3175
 
3064
3176
  // src/detect.ts
3065
3177
  import JSZip from "jszip";
@@ -3111,97 +3223,8 @@ async function detectZipFormat(buffer) {
3111
3223
  import JSZip2 from "jszip";
3112
3224
  import { DOMParser } from "@xmldom/xmldom";
3113
3225
 
3114
- // src/utils.ts
3115
- var VERSION = true ? "2.5.1" : "0.0.0-dev";
3116
- function toArrayBuffer(buf) {
3117
- if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
3118
- return buf.buffer;
3119
- }
3120
- return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
3121
- }
3122
- var KordocError = class extends Error {
3123
- code;
3124
- stage;
3125
- constructor(message, opts = {}) {
3126
- super(message);
3127
- this.name = "KordocError";
3128
- this.code = opts.code;
3129
- this.stage = opts.stage;
3130
- }
3131
- };
3132
- function isPathTraversal(name) {
3133
- if (name.includes("\0")) return true;
3134
- const normalized = name.replace(/\\/g, "/");
3135
- return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
3136
- }
3137
- function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
3138
- try {
3139
- const data = new DataView(buffer);
3140
- const len = buffer.byteLength;
3141
- let eocdOffset = -1;
3142
- for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
3143
- if (data.getUint32(i, true) === 101010256) {
3144
- eocdOffset = i;
3145
- break;
3146
- }
3147
- }
3148
- if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
3149
- const entryCount = data.getUint16(eocdOffset + 10, true);
3150
- if (entryCount > maxEntries) {
3151
- throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
3152
- }
3153
- const cdSize = data.getUint32(eocdOffset + 12, true);
3154
- const cdOffset = data.getUint32(eocdOffset + 16, true);
3155
- if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
3156
- let totalUncompressed = 0;
3157
- let pos = cdOffset;
3158
- for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
3159
- if (data.getUint32(pos, true) !== 33639248) break;
3160
- totalUncompressed += data.getUint32(pos + 24, true);
3161
- const nameLen = data.getUint16(pos + 28, true);
3162
- const extraLen = data.getUint16(pos + 30, true);
3163
- const commentLen = data.getUint16(pos + 32, true);
3164
- pos += 46 + nameLen + extraLen + commentLen;
3165
- }
3166
- if (totalUncompressed > maxUncompressedSize) {
3167
- throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
3168
- }
3169
- return { totalUncompressed, entryCount };
3170
- } catch (err) {
3171
- if (err instanceof KordocError) throw err;
3172
- return { totalUncompressed: 0, entryCount: 0 };
3173
- }
3174
- }
3175
- var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
3176
- function sanitizeHref(href) {
3177
- const trimmed = href.trim();
3178
- if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
3179
- return trimmed;
3180
- }
3181
- function classifyError(err) {
3182
- if (!(err instanceof Error)) return "PARSE_ERROR";
3183
- const msg = err.message;
3184
- if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
3185
- if (msg.includes("DRM")) return "DRM_PROTECTED";
3186
- if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
3187
- if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
3188
- if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
3189
- if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
3190
- if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
3191
- return "PARSE_ERROR";
3192
- }
3193
- function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
3194
- if (err instanceof KordocError) {
3195
- if (!err.stage) err.stage = stage;
3196
- if (!err.code) err.code = fallbackCode;
3197
- return err;
3198
- }
3199
- const message = err instanceof Error ? err.message : fallbackMessage;
3200
- const code = err instanceof Error ? classifyError(err) : fallbackCode;
3201
- return new KordocError(message || fallbackMessage, { code, stage });
3202
- }
3203
-
3204
3226
  // src/table/builder.ts
3227
+ init_utils();
3205
3228
  var MAX_COLS = 200;
3206
3229
  var MAX_ROWS = 1e4;
3207
3230
  function buildTable(rows) {
@@ -3461,6 +3484,8 @@ var HEADING_RATIO_H2 = 1.3;
3461
3484
  var HEADING_RATIO_H3 = 1.15;
3462
3485
 
3463
3486
  // src/hwpx/parser.ts
3487
+ init_utils();
3488
+ init_utils();
3464
3489
  init_page_range();
3465
3490
  init_logger();
3466
3491
  var MAX_DECOMPRESS_SIZE = 500 * 1024 * 1024;
@@ -4302,6 +4327,7 @@ function extractTextFromNode(node) {
4302
4327
  }
4303
4328
 
4304
4329
  // src/hwp5/record.ts
4330
+ init_utils();
4305
4331
  import { inflateRawSync, inflateSync } from "zlib";
4306
4332
  var TAG_PARA_HEADER = 66;
4307
4333
  var TAG_PARA_TEXT = 67;
@@ -5352,6 +5378,7 @@ function parseLenientCfb(data) {
5352
5378
  }
5353
5379
 
5354
5380
  // src/hwp5/parser.ts
5381
+ init_utils();
5355
5382
  init_page_range();
5356
5383
  init_logger();
5357
5384
  var CFB = __toESM(require_cfb(), 1);
@@ -6007,6 +6034,7 @@ function arrangeCells(rows, cols, cells) {
6007
6034
  }
6008
6035
 
6009
6036
  // src/pdf/parser.ts
6037
+ init_utils();
6010
6038
  init_page_range();
6011
6039
  import { createRequire } from "module";
6012
6040
  import { dirname as dirname2, join as join3, resolve as resolve2 } from "path";
@@ -7898,6 +7926,7 @@ function mergeKoreanLines(text) {
7898
7926
  }
7899
7927
 
7900
7928
  // src/xlsx/parser.ts
7929
+ init_utils();
7901
7930
  import JSZip3 from "jszip";
7902
7931
  import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
7903
7932
  init_logger();
@@ -8226,6 +8255,7 @@ async function parseXlsxDocument(buffer, options, existingZip) {
8226
8255
  }
8227
8256
 
8228
8257
  // src/docx/parser.ts
8258
+ init_utils();
8229
8259
  import JSZip4 from "jszip";
8230
8260
  import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
8231
8261
  init_logger();
@@ -8707,6 +8737,7 @@ async function parseDocxDocument(buffer, options, existingZip) {
8707
8737
  }
8708
8738
 
8709
8739
  // src/index.ts
8740
+ init_utils();
8710
8741
  init_cli_provider();
8711
8742
  init_markdown_to_blocks();
8712
8743
  init_logger();
@@ -11208,6 +11239,187 @@ async function markdownToXlsx(markdown, options) {
11208
11239
  return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
11209
11240
  }
11210
11241
 
11242
+ // src/convert/index.ts
11243
+ import { readFile } from "fs/promises";
11244
+ init_utils();
11245
+
11246
+ // src/convert/libreoffice.ts
11247
+ import libre from "libreoffice-convert";
11248
+
11249
+ // src/convert/error.ts
11250
/**
 * Error raised by the LibreOffice conversion helpers.
 * Carries a machine-readable `code` (the first constructor argument) next to
 * the human-readable message.
 */
var ConvertError = class extends Error {
  /**
   * @param {string} code - Machine-readable failure code.
   * @param {string} message - Human-readable description.
   */
  constructor(code, message) {
    super(message);
    Object.assign(this, { code, name: "ConvertError" });
  }
};
11257
+
11258
+ // src/convert/libreoffice.ts
11259
// Callback-style convert function taken from the libreoffice-convert default export.
var { convert: libreConvert } = libre;
11260
/**
 * Verifies that the `soffice` executable can be launched by running
 * `soffice --version`.
 *
 * The previous body destructured `runCommand` from the src/utils.ts namespace
 * object, but that namespace does not export `runCommand`. The destructured
 * value was therefore undefined, calling it threw a TypeError, and the bare
 * `catch` reported SOFFICE_NOT_FOUND on every call, even with LibreOffice
 * installed. The helper is now called directly.
 *
 * NOTE(review): this relies on `runCommand` being a module-scope function of
 * this bundle (the pre-refactor version of this function called it that way);
 * confirm it is still defined before shipping.
 *
 * @returns {Promise<void>} Resolves when the command succeeds.
 * @throws {ConvertError} With code "SOFFICE_NOT_FOUND" when the command fails.
 */
async function assertSofficeAvailable() {
  try {
    await runCommand("soffice", ["--version"]);
  } catch {
    throw new ConvertError(
      "SOFFICE_NOT_FOUND",
      "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694."
    );
  }
}
11271
/**
 * Converts a document buffer with LibreOffice, giving up after `timeoutMs`.
 *
 * The timeout only rejects the returned promise; this function does not stop
 * the underlying `libreConvert` call.
 *
 * @param {Buffer} buffer - Source document bytes.
 * @param {string} targetExt - Target extension passed to libreConvert (e.g. ".pdf").
 * @param {number} [timeoutMs=60000] - Milliseconds to wait before rejecting.
 * @returns {Promise<Buffer>} The converted document.
 * @throws {ConvertError} "TIMEOUT" when the deadline passes first;
 *   "CONVERT_FAILED" when libreConvert reports an error or yields no output.
 *   An exception thrown synchronously by libreConvert is propagated unchanged.
 */
async function convertBuffer(buffer, targetExt, timeoutMs = 6e4) {
  return new Promise((resolve4, reject) => {
    const timer = setTimeout(() => {
      reject(
        new ConvertError("TIMEOUT", `\uBCC0\uD658 \uD0C0\uC784\uC544\uC6C3 (${timeoutMs}ms \uCD08\uACFC)`)
      );
    }, timeoutMs);
    try {
      libreConvert(buffer, targetExt, void 0, (err, done) => {
        clearTimeout(timer);
        if (err || !done) {
          reject(
            new ConvertError(
              "CONVERT_FAILED",
              err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"
            )
          );
          return;
        }
        resolve4(done);
      });
    } catch (err) {
      // A synchronous throw used to leave the timer armed for the full
      // timeout, keeping the event loop alive after the promise had already
      // rejected. Clear it and surface the same error as before.
      clearTimeout(timer);
      reject(err);
    }
  });
}
11293
+
11294
+ // src/convert/index.ts
11295
// True while a caller holds the conversion lock.
var isConverting = false;
// Wake-up callbacks of callers waiting for the lock, in arrival order.
var queue = [];
/**
 * Acquires the module-wide conversion lock so callers take turns.
 *
 * If the lock is free it is taken immediately; otherwise the caller is queued
 * and the returned promise settles once an earlier holder releases.
 *
 * @returns {Promise<() => void>} A release function. Calling it frees the lock
 *   and hands it to the longest-waiting caller, if any.
 */
async function acquireConvertLock() {
  const release = () => {
    isConverting = false;
    queue.shift()?.();
  };
  if (isConverting) {
    return new Promise((grant) => {
      queue.push(() => {
        isConverting = true;
        grant(release);
      });
    });
  }
  isConverting = true;
  return release;
}
11317
/**
 * Converts an HWP or HWPX document to PDF through LibreOffice.
 *
 * Expected failures are reported as a result object instead of being thrown.
 * The only rethrown case is a non-ConvertError raised by the soffice
 * availability check.
 *
 * Steps: load the input into a Buffer, enforce the 500MB size cap, detect the
 * format, check that soffice is available, then convert while holding the
 * conversion lock (released in `finally`).
 *
 * @param {string|Buffer|ArrayBuffer|Uint8Array} input - A file path (string),
 *   a Buffer, or anything else accepted by `Buffer.from`.
 * @param {{onProgress?: Function, timeoutMs?: number}} [options]
 *   `onProgress` is called with (10, "convert") before and (100, "done") after
 *   the conversion; `timeoutMs` is forwarded to convertBuffer.
 * @returns {Promise<object>} `{ success: true, pdf, sourceFormat }` on success,
 *   otherwise `{ success: false, code, error, stage }`.
 */
async function convertToPdf(input, options) {
  const failure = (code, error, stage) => ({ success: false, code, error, stage });

  let buffer;
  try {
    if (typeof input === "string") {
      buffer = await readFile(input);
    } else {
      buffer = Buffer.isBuffer(input) ? input : Buffer.from(input);
    }
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return failure("PARSE_ERROR", `\uC785\uB825 \uC77D\uAE30 \uC2E4\uD328: ${reason}`, "detect");
  }

  const MAX_FILE_SIZE = 500 * 1024 * 1024;
  if (buffer.length > MAX_FILE_SIZE) {
    const sizeMb = (buffer.length / 1024 / 1024).toFixed(1);
    return failure("FILE_TOO_LARGE", `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${sizeMb}MB (\uCD5C\uB300 500MB)`, "detect");
  }

  const format = detectFormat(toArrayBuffer(buffer));
  const isSupported = format === "hwp" || format === "hwpx";
  if (!isSupported) {
    return failure("UNSUPPORTED_FORMAT", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD3EC\uB9F7\uC785\uB2C8\uB2E4: ${format}`, "detect");
  }

  try {
    await assertSofficeAvailable();
  } catch (err) {
    if (!(err instanceof ConvertError)) throw err;
    return failure(err.code, err.message, "validate");
  }

  const releaseLock = await acquireConvertLock();
  try {
    options?.onProgress?.(10, "convert");
    const pdf = await convertBuffer(buffer, ".pdf", options?.timeoutMs);
    options?.onProgress?.(100, "done");
    return { success: true, pdf: new Uint8Array(pdf), sourceFormat: format };
  } catch (err) {
    if (err instanceof ConvertError) {
      return failure(err.code, err.message, "convert");
    }
    return failure(
      classifyError(err),
      err instanceof Error ? err.message : "\uBCC0\uD658 \uC2E4\uD328",
      "convert"
    );
  } finally {
    releaseLock();
  }
}
11395
/**
 * Converts a document to PDF and accepts the result only when the detected
 * source format is "hwp".
 *
 * NOTE(review): the format check runs after convertToPdf has completed, so a
 * document of the other supported format is fully converted before being
 * rejected.
 *
 * @param {string|Buffer|ArrayBuffer|Uint8Array} input - Forwarded to convertToPdf.
 * @param {object} [options] - Forwarded to convertToPdf.
 * @returns {Promise<object>} The convertToPdf result, or an
 *   UNSUPPORTED_FORMAT failure when a successful result is not "hwp".
 */
async function convertHwpToPdf(input, options) {
  const outcome = await convertToPdf(input, options);
  const isWrongFormat = outcome.success && outcome.sourceFormat !== "hwp";
  if (!isWrongFormat) return outcome;
  return {
    success: false,
    code: "UNSUPPORTED_FORMAT",
    error: `HWP 5.x \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${outcome.sourceFormat}`,
    stage: "detect"
  };
}
11407
/**
 * Converts a document to PDF and accepts the result only when the detected
 * source format is "hwpx".
 *
 * NOTE(review): the format check runs after convertToPdf has completed, so a
 * document of the other supported format is fully converted before being
 * rejected.
 *
 * @param {string|Buffer|ArrayBuffer|Uint8Array} input - Forwarded to convertToPdf.
 * @param {object} [options] - Forwarded to convertToPdf.
 * @returns {Promise<object>} The convertToPdf result, or an
 *   UNSUPPORTED_FORMAT failure when a successful result is not "hwpx".
 */
async function convertHwpxToPdf(input, options) {
  const outcome = await convertToPdf(input, options);
  const isWrongFormat = outcome.success && outcome.sourceFormat !== "hwpx";
  if (!isWrongFormat) return outcome;
  return {
    success: false,
    code: "UNSUPPORTED_FORMAT",
    error: `HWPX \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${outcome.sourceFormat}`,
    stage: "detect"
  };
}
11419
+
11420
+ // src/index.ts
11421
+ init_utils();
11422
+
11211
11423
  // src/ocr/api-key-rotation.ts
11212
11424
  var AllKeysCoolingDownError = class extends Error {
11213
11425
  waitMs;
@@ -11302,11 +11514,10 @@ var ApiKeyRotationPool = class _ApiKeyRotationPool {
11302
11514
  };
11303
11515
 
11304
11516
  // src/pipeline/unified-ocr.ts
11305
- import { mkdir, readdir, readFile, stat, writeFile } from "fs/promises";
11517
+ import { mkdir, readdir, readFile as readFile2, stat, writeFile } from "fs/promises";
11306
11518
  import { basename as basename2, dirname as dirname3, extname, join as join4, resolve as resolve3 } from "path";
11307
11519
  import { spawn as spawn2 } from "child_process";
11308
11520
  import { performance } from "perf_hooks";
11309
- import libre from "libreoffice-convert";
11310
11521
  init_logger();
11311
11522
 
11312
11523
  // src/pipeline/bounded-queue.ts
@@ -11368,7 +11579,6 @@ var BoundedQueue = class {
11368
11579
  };
11369
11580
 
11370
11581
  // src/pipeline/unified-ocr.ts
11371
- var libreConvert = libre.convert;
11372
11582
  var UnifiedOcrError = class extends Error {
11373
11583
  code;
11374
11584
  stage;
@@ -11483,8 +11693,8 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11483
11693
  if (extname(absInput).toLowerCase() !== ".pdf") {
11484
11694
  await assertSofficeAvailable();
11485
11695
  workingPdfPath = join4(workspaceDir, `${stem}.pdf`);
11486
- const inputBuffer = await readFile(absInput);
11487
- const out = await convertWithLibreOffice(inputBuffer, ".pdf");
11696
+ const inputBuffer = await readFile2(absInput);
11697
+ const out = await convertBuffer(inputBuffer, ".pdf");
11488
11698
  await writeFile(workingPdfPath, out);
11489
11699
  }
11490
11700
  timingsMs.convert = elapsedMs(convertStart);
@@ -11535,7 +11745,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11535
11745
  const keyCount = keyPool.snapshot().length;
11536
11746
  const workerCount = Math.max(1, keyCount * concurrencyPerKey);
11537
11747
  const queueCapacity = workerCount * 2;
11538
- const queue = new BoundedQueue(queueCapacity);
11748
+ const queue2 = new BoundedQueue(queueCapacity);
11539
11749
  const ocrStart = performance.now();
11540
11750
  currentStage = "ocr";
11541
11751
  markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (\uC6CC\uCEE4 ${workerCount}\uAC1C)`);
@@ -11543,17 +11753,17 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11543
11753
  let renderDone = 1;
11544
11754
  const renderProducer = (async () => {
11545
11755
  try {
11546
- await queue.enqueue({ pageNumber: 1, imagePath: probeImage });
11756
+ await queue2.enqueue({ pageNumber: 1, imagePath: probeImage });
11547
11757
  if (totalPages > 1) {
11548
11758
  for await (const item of renderPdfToPngStream(workingPdfPath, join4(imagesDir, "page"), dpi, totalPages, 2)) {
11549
- await queue.enqueue(item);
11759
+ await queue2.enqueue(item);
11550
11760
  renderDone++;
11551
11761
  markStageProgress("render", Math.round(renderDone / totalPages * 100), renderDone, totalPages, `\uD398\uC774\uC9C0 ${renderDone}/${totalPages} \uB80C\uB354\uB9C1`);
11552
11762
  logStage("debug", "render", "progress", "\uD398\uC774\uC9C0 \uB80C\uB354 \uC644\uB8CC", { page: item.pageNumber });
11553
11763
  }
11554
11764
  }
11555
11765
  } finally {
11556
- queue.close();
11766
+ queue2.close();
11557
11767
  timingsMs.render = elapsedMs(renderStart);
11558
11768
  markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
11559
11769
  logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: renderDone, elapsedMs: timingsMs.render });
@@ -11562,7 +11772,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11562
11772
  const [, pageResultsMap] = await Promise.all([
11563
11773
  renderProducer,
11564
11774
  ocrWorkerPool({
11565
- queue,
11775
+ queue: queue2,
11566
11776
  workerCount,
11567
11777
  totalPages,
11568
11778
  ocrInput: {
@@ -11682,17 +11892,6 @@ function emitProgress(cb, stage, stagePercent, weights, extra) {
11682
11892
  model: extra.model
11683
11893
  });
11684
11894
  }
11685
- async function convertWithLibreOffice(buffer, ext) {
11686
- return await new Promise((resolvePromise, reject) => {
11687
- libreConvert(buffer, ext, void 0, (err, done) => {
11688
- if (err || !done) {
11689
- reject(new UnifiedOcrError("CONVERT_FAILED", "convert", err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"));
11690
- return;
11691
- }
11692
- resolvePromise(done);
11693
- });
11694
- });
11695
- }
11696
11895
  async function getPdfPageCount(pdfPath) {
11697
11896
  const stdout = await runCommandWithStdout("pdfinfo", [pdfPath]);
11698
11897
  const m = stdout.match(/^\s*Pages:\s*(\d+)\s*$/mi);
@@ -11765,13 +11964,6 @@ async function runCommandWithStdout(cmd, args) {
11765
11964
  });
11766
11965
  });
11767
11966
  }
11768
- async function assertSofficeAvailable() {
11769
- try {
11770
- await runCommand("soffice", ["--version"]);
11771
- } catch {
11772
- throw new UnifiedOcrError("SOFFICE_NOT_FOUND", "convert", "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.");
11773
- }
11774
- }
11775
11967
  function naturalPageSort(a, b) {
11776
11968
  const na = Number((a.match(/\d+/g) || ["0"]).at(-1) || 0);
11777
11969
  const nb = Number((b.match(/\d+/g) || ["0"]).at(-1) || 0);
@@ -11845,7 +12037,7 @@ function startParallelProbeRuns(input) {
11845
12037
  }
11846
12038
  async function loadModelCache(path) {
11847
12039
  try {
11848
- const raw = await readFile(path, "utf-8");
12040
+ const raw = await readFile2(path, "utf-8");
11849
12041
  return JSON.parse(raw);
11850
12042
  } catch {
11851
12043
  return null;
@@ -11879,12 +12071,12 @@ async function updateModelCache(path, probes) {
11879
12071
  await writeFile(path, JSON.stringify(current, null, 2), "utf-8");
11880
12072
  }
11881
12073
  async function ocrWorkerPool(input) {
11882
- const { queue, workerCount, ocrInput, onPageDone } = input;
12074
+ const { queue: queue2, workerCount, ocrInput, onPageDone } = input;
11883
12075
  const results = /* @__PURE__ */ new Map();
11884
12076
  let completedCount = 0;
11885
12077
  async function worker() {
11886
12078
  while (true) {
11887
- const item = await queue.dequeue();
12079
+ const item = await queue2.dequeue();
11888
12080
  if (item === QUEUE_DONE) break;
11889
12081
  const { pageNumber, imagePath, error } = item;
11890
12082
  if (imagePath === null) {
@@ -11936,7 +12128,7 @@ async function ocrImageWithFallback(input) {
11936
12128
  async function mergeMarkdownPages(paths) {
11937
12129
  const out = [];
11938
12130
  for (let i = 0; i < paths.length; i++) {
11939
- const txt = (await readFile(paths[i], "utf-8")).trim();
12131
+ const txt = (await readFile2(paths[i], "utf-8")).trim();
11940
12132
  if (!txt) continue;
11941
12133
  out.push(txt);
11942
12134
  }
@@ -12052,7 +12244,7 @@ async function ocrImageViaNim(input) {
12052
12244
  throw new UnifiedOcrError("OCR_FAILED", "ocr", `OCR \uC7AC\uC2DC\uB3C4 \uCD08\uACFC: ${lastErr}`);
12053
12245
  }
12054
12246
  async function encodeBase64(path) {
12055
- const b = await readFile(path);
12247
+ const b = await readFile2(path);
12056
12248
  return b.toString("base64");
12057
12249
  }
12058
12250
  function stripCodeFence3(text) {
@@ -12091,7 +12283,7 @@ async function parse2(input, options) {
12091
12283
  let buffer;
12092
12284
  if (typeof input === "string") {
12093
12285
  try {
12094
- const buf = await readFile2(input);
12286
+ const buf = await readFile3(input);
12095
12287
  buffer = toArrayBuffer(buf);
12096
12288
  } catch (err) {
12097
12289
  const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
@@ -12250,6 +12442,9 @@ export {
12250
12442
  VERSION,
12251
12443
  blocksToMarkdown,
12252
12444
  compare,
12445
+ convertHwpToPdf,
12446
+ convertHwpxToPdf,
12447
+ convertToPdf,
12253
12448
  detectFormat,
12254
12449
  detectZipFormat,
12255
12450
  diffBlocks,