@clazic/kordoc 2.5.2 → 2.6.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -33,6 +33,118 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
33
33
  ));
34
34
  var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
35
35
 
36
+ // src/utils.ts
37
+ var utils_exports = {};
38
+ __export(utils_exports, {
39
+ KordocError: () => KordocError,
40
+ VERSION: () => VERSION,
41
+ classifyError: () => classifyError,
42
+ isPathTraversal: () => isPathTraversal,
43
+ normalizeKordocError: () => normalizeKordocError,
44
+ precheckZipSize: () => precheckZipSize,
45
+ sanitizeError: () => sanitizeError,
46
+ sanitizeHref: () => sanitizeHref,
47
+ toArrayBuffer: () => toArrayBuffer
48
+ });
49
+ function toArrayBuffer(buf) {
50
+ if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
51
+ return buf.buffer;
52
+ }
53
+ return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
54
+ }
55
+ function sanitizeError(err) {
56
+ if (err instanceof KordocError) return err.message;
57
+ return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
58
+ }
59
+ function isPathTraversal(name) {
60
+ if (name.includes("\0")) return true;
61
+ const normalized = name.replace(/\\/g, "/");
62
+ return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
63
+ }
64
+ function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
65
+ try {
66
+ const data = new DataView(buffer);
67
+ const len = buffer.byteLength;
68
+ let eocdOffset = -1;
69
+ for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
70
+ if (data.getUint32(i, true) === 101010256) {
71
+ eocdOffset = i;
72
+ break;
73
+ }
74
+ }
75
+ if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
76
+ const entryCount = data.getUint16(eocdOffset + 10, true);
77
+ if (entryCount > maxEntries) {
78
+ throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
79
+ }
80
+ const cdSize = data.getUint32(eocdOffset + 12, true);
81
+ const cdOffset = data.getUint32(eocdOffset + 16, true);
82
+ if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
83
+ let totalUncompressed = 0;
84
+ let pos = cdOffset;
85
+ for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
86
+ if (data.getUint32(pos, true) !== 33639248) break;
87
+ totalUncompressed += data.getUint32(pos + 24, true);
88
+ const nameLen = data.getUint16(pos + 28, true);
89
+ const extraLen = data.getUint16(pos + 30, true);
90
+ const commentLen = data.getUint16(pos + 32, true);
91
+ pos += 46 + nameLen + extraLen + commentLen;
92
+ }
93
+ if (totalUncompressed > maxUncompressedSize) {
94
+ throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
95
+ }
96
+ return { totalUncompressed, entryCount };
97
+ } catch (err) {
98
+ if (err instanceof KordocError) throw err;
99
+ return { totalUncompressed: 0, entryCount: 0 };
100
+ }
101
+ }
102
+ function sanitizeHref(href) {
103
+ const trimmed = href.trim();
104
+ if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
105
+ return trimmed;
106
+ }
107
+ function classifyError(err) {
108
+ if (!(err instanceof Error)) return "PARSE_ERROR";
109
+ const msg = err.message;
110
+ if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
111
+ if (msg.includes("DRM")) return "DRM_PROTECTED";
112
+ if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
113
+ if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
114
+ if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
115
+ if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
116
+ if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
117
+ return "PARSE_ERROR";
118
+ }
119
+ function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
120
+ if (err instanceof KordocError) {
121
+ if (!err.stage) err.stage = stage;
122
+ if (!err.code) err.code = fallbackCode;
123
+ return err;
124
+ }
125
+ const message = err instanceof Error ? err.message : fallbackMessage;
126
+ const code = err instanceof Error ? classifyError(err) : fallbackCode;
127
+ return new KordocError(message || fallbackMessage, { code, stage });
128
+ }
129
+ var VERSION, KordocError, SAFE_HREF_RE;
130
+ var init_utils = __esm({
131
+ "src/utils.ts"() {
132
+ "use strict";
133
+ VERSION = true ? "2.5.2" : "0.0.0-dev";
134
+ KordocError = class extends Error {
135
+ code;
136
+ stage;
137
+ constructor(message, opts = {}) {
138
+ super(message);
139
+ this.name = "KordocError";
140
+ this.code = opts.code;
141
+ this.stage = opts.stage;
142
+ }
143
+ };
144
+ SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
145
+ }
146
+ });
147
+
36
148
  // src/page-range.ts
37
149
  var page_range_exports = {};
38
150
  __export(page_range_exports, {
@@ -3062,6 +3174,9 @@ __export(index_exports, {
3062
3174
  VERSION: () => VERSION,
3063
3175
  blocksToMarkdown: () => blocksToMarkdown,
3064
3176
  compare: () => compare,
3177
+ convertHwpToPdf: () => convertHwpToPdf,
3178
+ convertHwpxToPdf: () => convertHwpxToPdf,
3179
+ convertToPdf: () => convertToPdf,
3065
3180
  detectFormat: () => detectFormat,
3066
3181
  detectZipFormat: () => detectZipFormat,
3067
3182
  diffBlocks: () => diffBlocks,
@@ -3081,7 +3196,7 @@ __export(index_exports, {
3081
3196
  runUnifiedOcrPipeline: () => runUnifiedOcrPipeline
3082
3197
  });
3083
3198
  module.exports = __toCommonJS(index_exports);
3084
- var import_promises3 = require("fs/promises");
3199
+ var import_promises4 = require("fs/promises");
3085
3200
 
3086
3201
  // src/detect.ts
3087
3202
  var import_jszip = __toESM(require("jszip"), 1);
@@ -3133,97 +3248,8 @@ async function detectZipFormat(buffer) {
3133
3248
  var import_jszip2 = __toESM(require("jszip"), 1);
3134
3249
  var import_xmldom = require("@xmldom/xmldom");
3135
3250
 
3136
- // src/utils.ts
3137
- var VERSION = true ? "2.5.1" : "0.0.0-dev";
3138
- function toArrayBuffer(buf) {
3139
- if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
3140
- return buf.buffer;
3141
- }
3142
- return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
3143
- }
3144
- var KordocError = class extends Error {
3145
- code;
3146
- stage;
3147
- constructor(message, opts = {}) {
3148
- super(message);
3149
- this.name = "KordocError";
3150
- this.code = opts.code;
3151
- this.stage = opts.stage;
3152
- }
3153
- };
3154
- function isPathTraversal(name) {
3155
- if (name.includes("\0")) return true;
3156
- const normalized = name.replace(/\\/g, "/");
3157
- return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
3158
- }
3159
- function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
3160
- try {
3161
- const data = new DataView(buffer);
3162
- const len = buffer.byteLength;
3163
- let eocdOffset = -1;
3164
- for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
3165
- if (data.getUint32(i, true) === 101010256) {
3166
- eocdOffset = i;
3167
- break;
3168
- }
3169
- }
3170
- if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
3171
- const entryCount = data.getUint16(eocdOffset + 10, true);
3172
- if (entryCount > maxEntries) {
3173
- throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
3174
- }
3175
- const cdSize = data.getUint32(eocdOffset + 12, true);
3176
- const cdOffset = data.getUint32(eocdOffset + 16, true);
3177
- if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
3178
- let totalUncompressed = 0;
3179
- let pos = cdOffset;
3180
- for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
3181
- if (data.getUint32(pos, true) !== 33639248) break;
3182
- totalUncompressed += data.getUint32(pos + 24, true);
3183
- const nameLen = data.getUint16(pos + 28, true);
3184
- const extraLen = data.getUint16(pos + 30, true);
3185
- const commentLen = data.getUint16(pos + 32, true);
3186
- pos += 46 + nameLen + extraLen + commentLen;
3187
- }
3188
- if (totalUncompressed > maxUncompressedSize) {
3189
- throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
3190
- }
3191
- return { totalUncompressed, entryCount };
3192
- } catch (err) {
3193
- if (err instanceof KordocError) throw err;
3194
- return { totalUncompressed: 0, entryCount: 0 };
3195
- }
3196
- }
3197
- var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
3198
- function sanitizeHref(href) {
3199
- const trimmed = href.trim();
3200
- if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
3201
- return trimmed;
3202
- }
3203
- function classifyError(err) {
3204
- if (!(err instanceof Error)) return "PARSE_ERROR";
3205
- const msg = err.message;
3206
- if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
3207
- if (msg.includes("DRM")) return "DRM_PROTECTED";
3208
- if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
3209
- if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
3210
- if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
3211
- if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
3212
- if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
3213
- return "PARSE_ERROR";
3214
- }
3215
- function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
3216
- if (err instanceof KordocError) {
3217
- if (!err.stage) err.stage = stage;
3218
- if (!err.code) err.code = fallbackCode;
3219
- return err;
3220
- }
3221
- const message = err instanceof Error ? err.message : fallbackMessage;
3222
- const code = err instanceof Error ? classifyError(err) : fallbackCode;
3223
- return new KordocError(message || fallbackMessage, { code, stage });
3224
- }
3225
-
3226
3251
  // src/table/builder.ts
3252
+ init_utils();
3227
3253
  var MAX_COLS = 200;
3228
3254
  var MAX_ROWS = 1e4;
3229
3255
  function buildTable(rows) {
@@ -3483,6 +3509,8 @@ var HEADING_RATIO_H2 = 1.3;
3483
3509
  var HEADING_RATIO_H3 = 1.15;
3484
3510
 
3485
3511
  // src/hwpx/parser.ts
3512
+ init_utils();
3513
+ init_utils();
3486
3514
  init_page_range();
3487
3515
  init_logger();
3488
3516
  var MAX_DECOMPRESS_SIZE = 500 * 1024 * 1024;
@@ -4325,6 +4353,7 @@ function extractTextFromNode(node) {
4325
4353
 
4326
4354
  // src/hwp5/record.ts
4327
4355
  var import_zlib = require("zlib");
4356
+ init_utils();
4328
4357
  var TAG_PARA_HEADER = 66;
4329
4358
  var TAG_PARA_TEXT = 67;
4330
4359
  var TAG_CHAR_SHAPE = 68;
@@ -5374,6 +5403,7 @@ function parseLenientCfb(data) {
5374
5403
  }
5375
5404
 
5376
5405
  // src/hwp5/parser.ts
5406
+ init_utils();
5377
5407
  init_page_range();
5378
5408
  init_logger();
5379
5409
  var CFB = __toESM(require_cfb(), 1);
@@ -6029,6 +6059,7 @@ function arrangeCells(rows, cols, cells) {
6029
6059
  }
6030
6060
 
6031
6061
  // src/pdf/parser.ts
6062
+ init_utils();
6032
6063
  init_page_range();
6033
6064
  var import_module = require("module");
6034
6065
  var import_path4 = require("path");
@@ -7922,6 +7953,7 @@ function mergeKoreanLines(text) {
7922
7953
  // src/xlsx/parser.ts
7923
7954
  var import_jszip3 = __toESM(require("jszip"), 1);
7924
7955
  var import_xmldom2 = require("@xmldom/xmldom");
7956
+ init_utils();
7925
7957
  init_logger();
7926
7958
  var MAX_SHEETS = 100;
7927
7959
  var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
@@ -8250,6 +8282,7 @@ async function parseXlsxDocument(buffer, options, existingZip) {
8250
8282
  // src/docx/parser.ts
8251
8283
  var import_jszip4 = __toESM(require("jszip"), 1);
8252
8284
  var import_xmldom3 = require("@xmldom/xmldom");
8285
+ init_utils();
8253
8286
  init_logger();
8254
8287
  var MAX_DECOMPRESS_SIZE4 = 500 * 1024 * 1024;
8255
8288
  function getChildElements(parent, localName) {
@@ -8729,6 +8762,7 @@ async function parseDocxDocument(buffer, options, existingZip) {
8729
8762
  }
8730
8763
 
8731
8764
  // src/index.ts
8765
+ init_utils();
8732
8766
  init_cli_provider();
8733
8767
  init_markdown_to_blocks();
8734
8768
  init_logger();
@@ -11230,6 +11264,187 @@ async function markdownToXlsx(markdown, options) {
11230
11264
  return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
11231
11265
  }
11232
11266
 
11267
+ // src/convert/index.ts
11268
+ var import_promises2 = require("fs/promises");
11269
+ init_utils();
11270
+
11271
+ // src/convert/libreoffice.ts
11272
+ var import_libreoffice_convert = __toESM(require("libreoffice-convert"), 1);
11273
+
11274
+ // src/convert/error.ts
11275
+ var ConvertError = class extends Error {
11276
+ constructor(code, message) {
11277
+ super(message);
11278
+ this.code = code;
11279
+ this.name = "ConvertError";
11280
+ }
11281
+ };
11282
+
11283
+ // src/convert/libreoffice.ts
11284
+ var libreConvert = import_libreoffice_convert.default.convert;
11285
+ async function assertSofficeAvailable() {
11286
+ const { runCommand: runCommand2 } = await Promise.resolve().then(() => (init_utils(), utils_exports));
11287
+ try {
11288
+ await runCommand2("soffice", ["--version"]);
11289
+ } catch {
11290
+ throw new ConvertError(
11291
+ "SOFFICE_NOT_FOUND",
11292
+ "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694."
11293
+ );
11294
+ }
11295
+ }
11296
+ async function convertBuffer(buffer, targetExt, timeoutMs = 6e4) {
11297
+ return new Promise((resolve4, reject) => {
11298
+ const timer = setTimeout(() => {
11299
+ reject(
11300
+ new ConvertError("TIMEOUT", `\uBCC0\uD658 \uD0C0\uC784\uC544\uC6C3 (${timeoutMs}ms \uCD08\uACFC)`)
11301
+ );
11302
+ }, timeoutMs);
11303
+ libreConvert(buffer, targetExt, void 0, (err, done) => {
11304
+ clearTimeout(timer);
11305
+ if (err || !done) {
11306
+ reject(
11307
+ new ConvertError(
11308
+ "CONVERT_FAILED",
11309
+ err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"
11310
+ )
11311
+ );
11312
+ return;
11313
+ }
11314
+ resolve4(done);
11315
+ });
11316
+ });
11317
+ }
11318
+
11319
+ // src/convert/index.ts
11320
+ var isConverting = false;
11321
+ var queue = [];
11322
+ async function acquireConvertLock() {
11323
+ if (!isConverting) {
11324
+ isConverting = true;
11325
+ return () => {
11326
+ isConverting = false;
11327
+ const next = queue.shift();
11328
+ next?.();
11329
+ };
11330
+ }
11331
+ return new Promise((resolve4) => {
11332
+ queue.push(() => {
11333
+ isConverting = true;
11334
+ resolve4(() => {
11335
+ isConverting = false;
11336
+ const next = queue.shift();
11337
+ next?.();
11338
+ });
11339
+ });
11340
+ });
11341
+ }
11342
+ async function convertToPdf(input, options) {
11343
+ let buffer;
11344
+ try {
11345
+ if (typeof input === "string") {
11346
+ buffer = await (0, import_promises2.readFile)(input);
11347
+ } else if (Buffer.isBuffer(input)) {
11348
+ buffer = input;
11349
+ } else {
11350
+ buffer = Buffer.from(input);
11351
+ }
11352
+ } catch (err) {
11353
+ return {
11354
+ success: false,
11355
+ code: "PARSE_ERROR",
11356
+ error: `\uC785\uB825 \uC77D\uAE30 \uC2E4\uD328: ${err instanceof Error ? err.message : String(err)}`,
11357
+ stage: "detect"
11358
+ };
11359
+ }
11360
+ const MAX_FILE_SIZE = 500 * 1024 * 1024;
11361
+ if (buffer.length > MAX_FILE_SIZE) {
11362
+ return {
11363
+ success: false,
11364
+ code: "FILE_TOO_LARGE",
11365
+ error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.length / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`,
11366
+ stage: "detect"
11367
+ };
11368
+ }
11369
+ const format = detectFormat(toArrayBuffer(buffer));
11370
+ if (format !== "hwp" && format !== "hwpx") {
11371
+ return {
11372
+ success: false,
11373
+ code: "UNSUPPORTED_FORMAT",
11374
+ error: `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD3EC\uB9F7\uC785\uB2C8\uB2E4: ${format}`,
11375
+ stage: "detect"
11376
+ };
11377
+ }
11378
+ try {
11379
+ await assertSofficeAvailable();
11380
+ } catch (err) {
11381
+ if (err instanceof ConvertError) {
11382
+ return {
11383
+ success: false,
11384
+ code: err.code,
11385
+ error: err.message,
11386
+ stage: "validate"
11387
+ };
11388
+ }
11389
+ throw err;
11390
+ }
11391
+ const releaseLock = await acquireConvertLock();
11392
+ try {
11393
+ options?.onProgress?.(10, "convert");
11394
+ const pdf = await convertBuffer(buffer, ".pdf", options?.timeoutMs);
11395
+ options?.onProgress?.(100, "done");
11396
+ return {
11397
+ success: true,
11398
+ pdf: new Uint8Array(pdf),
11399
+ sourceFormat: format
11400
+ };
11401
+ } catch (err) {
11402
+ if (err instanceof ConvertError) {
11403
+ return {
11404
+ success: false,
11405
+ code: err.code,
11406
+ error: err.message,
11407
+ stage: "convert"
11408
+ };
11409
+ }
11410
+ return {
11411
+ success: false,
11412
+ code: classifyError(err),
11413
+ error: err instanceof Error ? err.message : "\uBCC0\uD658 \uC2E4\uD328",
11414
+ stage: "convert"
11415
+ };
11416
+ } finally {
11417
+ releaseLock();
11418
+ }
11419
+ }
11420
+ async function convertHwpToPdf(input, options) {
11421
+ const result = await convertToPdf(input, options);
11422
+ if (result.success && result.sourceFormat !== "hwp") {
11423
+ return {
11424
+ success: false,
11425
+ code: "UNSUPPORTED_FORMAT",
11426
+ error: `HWP 5.x \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${result.sourceFormat}`,
11427
+ stage: "detect"
11428
+ };
11429
+ }
11430
+ return result;
11431
+ }
11432
+ async function convertHwpxToPdf(input, options) {
11433
+ const result = await convertToPdf(input, options);
11434
+ if (result.success && result.sourceFormat !== "hwpx") {
11435
+ return {
11436
+ success: false,
11437
+ code: "UNSUPPORTED_FORMAT",
11438
+ error: `HWPX \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${result.sourceFormat}`,
11439
+ stage: "detect"
11440
+ };
11441
+ }
11442
+ return result;
11443
+ }
11444
+
11445
+ // src/index.ts
11446
+ init_utils();
11447
+
11233
11448
  // src/ocr/api-key-rotation.ts
11234
11449
  var AllKeysCoolingDownError = class extends Error {
11235
11450
  waitMs;
@@ -11324,11 +11539,10 @@ var ApiKeyRotationPool = class _ApiKeyRotationPool {
11324
11539
  };
11325
11540
 
11326
11541
  // src/pipeline/unified-ocr.ts
11327
- var import_promises2 = require("fs/promises");
11542
+ var import_promises3 = require("fs/promises");
11328
11543
  var import_path5 = require("path");
11329
11544
  var import_child_process4 = require("child_process");
11330
11545
  var import_node_perf_hooks = require("perf_hooks");
11331
- var import_libreoffice_convert = __toESM(require("libreoffice-convert"), 1);
11332
11546
  init_logger();
11333
11547
 
11334
11548
  // src/pipeline/bounded-queue.ts
@@ -11390,7 +11604,6 @@ var BoundedQueue = class {
11390
11604
  };
11391
11605
 
11392
11606
  // src/pipeline/unified-ocr.ts
11393
- var libreConvert = import_libreoffice_convert.default.convert;
11394
11607
  var UnifiedOcrError = class extends Error {
11395
11608
  code;
11396
11609
  stage;
@@ -11484,9 +11697,9 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11484
11697
  const keyPool = ApiKeyRotationPool.fromEnv();
11485
11698
  const runId = options.runId ?? generateRunId("ocr");
11486
11699
  const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
11487
- await (0, import_promises2.mkdir)(imagesDir, { recursive: true });
11488
- await (0, import_promises2.mkdir)(rawDir, { recursive: true });
11489
- await (0, import_promises2.mkdir)(diffDir, { recursive: true });
11700
+ await (0, import_promises3.mkdir)(imagesDir, { recursive: true });
11701
+ await (0, import_promises3.mkdir)(rawDir, { recursive: true });
11702
+ await (0, import_promises3.mkdir)(diffDir, { recursive: true });
11490
11703
  const timingsMs = {};
11491
11704
  const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
11492
11705
  const markStageProgress = (stage, stagePercent, current, total, message, model) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message, model });
@@ -11505,9 +11718,9 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11505
11718
  if ((0, import_path5.extname)(absInput).toLowerCase() !== ".pdf") {
11506
11719
  await assertSofficeAvailable();
11507
11720
  workingPdfPath = (0, import_path5.join)(workspaceDir, `${stem}.pdf`);
11508
- const inputBuffer = await (0, import_promises2.readFile)(absInput);
11509
- const out = await convertWithLibreOffice(inputBuffer, ".pdf");
11510
- await (0, import_promises2.writeFile)(workingPdfPath, out);
11721
+ const inputBuffer = await (0, import_promises3.readFile)(absInput);
11722
+ const out = await convertBuffer(inputBuffer, ".pdf");
11723
+ await (0, import_promises3.writeFile)(workingPdfPath, out);
11511
11724
  }
11512
11725
  timingsMs.convert = elapsedMs(convertStart);
11513
11726
  markStageDone("convert", "PDF \uBCC0\uD658 \uC644\uB8CC");
@@ -11519,7 +11732,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11519
11732
  markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
11520
11733
  logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi, totalPages });
11521
11734
  await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, (0, import_path5.join)(imagesDir, "page")]);
11522
- const firstFiles = (await (0, import_promises2.readdir)(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
11735
+ const firstFiles = (await (0, import_promises3.readdir)(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
11523
11736
  if (firstFiles.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uCCAB \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328");
11524
11737
  const probeImage = (0, import_path5.join)(imagesDir, firstFiles[0]);
11525
11738
  markStageProgress("render", Math.round(1 / totalPages * 100), 1, totalPages, `\uD398\uC774\uC9C0 1/${totalPages} \uB80C\uB354\uB9C1`);
@@ -11557,7 +11770,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11557
11770
  const keyCount = keyPool.snapshot().length;
11558
11771
  const workerCount = Math.max(1, keyCount * concurrencyPerKey);
11559
11772
  const queueCapacity = workerCount * 2;
11560
- const queue = new BoundedQueue(queueCapacity);
11773
+ const queue2 = new BoundedQueue(queueCapacity);
11561
11774
  const ocrStart = import_node_perf_hooks.performance.now();
11562
11775
  currentStage = "ocr";
11563
11776
  markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (\uC6CC\uCEE4 ${workerCount}\uAC1C)`);
@@ -11565,17 +11778,17 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11565
11778
  let renderDone = 1;
11566
11779
  const renderProducer = (async () => {
11567
11780
  try {
11568
- await queue.enqueue({ pageNumber: 1, imagePath: probeImage });
11781
+ await queue2.enqueue({ pageNumber: 1, imagePath: probeImage });
11569
11782
  if (totalPages > 1) {
11570
11783
  for await (const item of renderPdfToPngStream(workingPdfPath, (0, import_path5.join)(imagesDir, "page"), dpi, totalPages, 2)) {
11571
- await queue.enqueue(item);
11784
+ await queue2.enqueue(item);
11572
11785
  renderDone++;
11573
11786
  markStageProgress("render", Math.round(renderDone / totalPages * 100), renderDone, totalPages, `\uD398\uC774\uC9C0 ${renderDone}/${totalPages} \uB80C\uB354\uB9C1`);
11574
11787
  logStage("debug", "render", "progress", "\uD398\uC774\uC9C0 \uB80C\uB354 \uC644\uB8CC", { page: item.pageNumber });
11575
11788
  }
11576
11789
  }
11577
11790
  } finally {
11578
- queue.close();
11791
+ queue2.close();
11579
11792
  timingsMs.render = elapsedMs(renderStart);
11580
11793
  markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
11581
11794
  logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: renderDone, elapsedMs: timingsMs.render });
@@ -11584,7 +11797,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11584
11797
  const [, pageResultsMap] = await Promise.all([
11585
11798
  renderProducer,
11586
11799
  ocrWorkerPool({
11587
- queue,
11800
+ queue: queue2,
11588
11801
  workerCount,
11589
11802
  totalPages,
11590
11803
  ocrInput: {
@@ -11618,7 +11831,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11618
11831
  const rawPagePaths = [];
11619
11832
  for (const [pageNum, markdown] of sortedEntries) {
11620
11833
  const pagePath = (0, import_path5.join)(rawDir, `page_${String(pageNum).padStart(4, "0")}.md`);
11621
- await (0, import_promises2.writeFile)(pagePath, markdown, "utf-8");
11834
+ await (0, import_promises3.writeFile)(pagePath, markdown, "utf-8");
11622
11835
  rawPagePaths.push(pagePath);
11623
11836
  }
11624
11837
  const mergeStart = import_node_perf_hooks.performance.now();
@@ -11626,7 +11839,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11626
11839
  markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
11627
11840
  logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
11628
11841
  const merged = await mergeMarkdownPages(rawPagePaths);
11629
- await (0, import_promises2.writeFile)(outputPath, merged, "utf-8");
11842
+ await (0, import_promises3.writeFile)(outputPath, merged, "utf-8");
11630
11843
  timingsMs.merge = elapsedMs(mergeStart);
11631
11844
  markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
11632
11845
  logStage("info", "merge", "done", "\uCD5C\uC885 \uBCD1\uD569 \uC644\uB8CC", { outputPath, elapsedMs: timingsMs.merge });
@@ -11642,7 +11855,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11642
11855
  timingsMs,
11643
11856
  modelCachePath
11644
11857
  };
11645
- await (0, import_promises2.writeFile)(reportPath, JSON.stringify(report, null, 2), "utf-8");
11858
+ await (0, import_promises3.writeFile)(reportPath, JSON.stringify(report, null, 2), "utf-8");
11646
11859
  logStage("info", "finalize", "done", "run-report \uC800\uC7A5 \uC644\uB8CC", { reportPath });
11647
11860
  return { outputPath, reportPath, selectedModel };
11648
11861
  } catch (err) {
@@ -11704,17 +11917,6 @@ function emitProgress(cb, stage, stagePercent, weights, extra) {
11704
11917
  model: extra.model
11705
11918
  });
11706
11919
  }
11707
- async function convertWithLibreOffice(buffer, ext) {
11708
- return await new Promise((resolvePromise, reject) => {
11709
- libreConvert(buffer, ext, void 0, (err, done) => {
11710
- if (err || !done) {
11711
- reject(new UnifiedOcrError("CONVERT_FAILED", "convert", err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"));
11712
- return;
11713
- }
11714
- resolvePromise(done);
11715
- });
11716
- });
11717
- }
11718
11920
  async function getPdfPageCount(pdfPath) {
11719
11921
  const stdout = await runCommandWithStdout("pdfinfo", [pdfPath]);
11720
11922
  const m = stdout.match(/^\s*Pages:\s*(\d+)\s*$/mi);
@@ -11742,7 +11944,7 @@ async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, start
11742
11944
  pdfPath,
11743
11945
  prefixPath
11744
11946
  ]);
11745
- const files = await (0, import_promises2.readdir)(imagesDir);
11947
+ const files = await (0, import_promises3.readdir)(imagesDir);
11746
11948
  const pageFiles = files.filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
11747
11949
  const imagePath = (0, import_path5.join)(imagesDir, pageFiles[pageFiles.length - 1]);
11748
11950
  yield { pageNumber: page, imagePath };
@@ -11787,13 +11989,6 @@ async function runCommandWithStdout(cmd, args) {
11787
11989
  });
11788
11990
  });
11789
11991
  }
11790
- async function assertSofficeAvailable() {
11791
- try {
11792
- await runCommand("soffice", ["--version"]);
11793
- } catch {
11794
- throw new UnifiedOcrError("SOFFICE_NOT_FOUND", "convert", "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.");
11795
- }
11796
- }
11797
11992
  function naturalPageSort(a, b) {
11798
11993
  const na = Number((a.match(/\d+/g) || ["0"]).at(-1) || 0);
11799
11994
  const nb = Number((b.match(/\d+/g) || ["0"]).at(-1) || 0);
@@ -11867,7 +12062,7 @@ function startParallelProbeRuns(input) {
11867
12062
  }
11868
12063
  async function loadModelCache(path) {
11869
12064
  try {
11870
- const raw = await (0, import_promises2.readFile)(path, "utf-8");
12065
+ const raw = await (0, import_promises3.readFile)(path, "utf-8");
11871
12066
  return JSON.parse(raw);
11872
12067
  } catch {
11873
12068
  return null;
@@ -11898,15 +12093,15 @@ async function updateModelCache(path, probes) {
11898
12093
  }
11899
12094
  }
11900
12095
  current.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
11901
- await (0, import_promises2.writeFile)(path, JSON.stringify(current, null, 2), "utf-8");
12096
+ await (0, import_promises3.writeFile)(path, JSON.stringify(current, null, 2), "utf-8");
11902
12097
  }
11903
12098
  async function ocrWorkerPool(input) {
11904
- const { queue, workerCount, ocrInput, onPageDone } = input;
12099
+ const { queue: queue2, workerCount, ocrInput, onPageDone } = input;
11905
12100
  const results = /* @__PURE__ */ new Map();
11906
12101
  let completedCount = 0;
11907
12102
  async function worker() {
11908
12103
  while (true) {
11909
- const item = await queue.dequeue();
12104
+ const item = await queue2.dequeue();
11910
12105
  if (item === QUEUE_DONE) break;
11911
12106
  const { pageNumber, imagePath, error } = item;
11912
12107
  if (imagePath === null) {
@@ -11958,7 +12153,7 @@ async function ocrImageWithFallback(input) {
11958
12153
  async function mergeMarkdownPages(paths) {
11959
12154
  const out = [];
11960
12155
  for (let i = 0; i < paths.length; i++) {
11961
- const txt = (await (0, import_promises2.readFile)(paths[i], "utf-8")).trim();
12156
+ const txt = (await (0, import_promises3.readFile)(paths[i], "utf-8")).trim();
11962
12157
  if (!txt) continue;
11963
12158
  out.push(txt);
11964
12159
  }
@@ -12074,7 +12269,7 @@ async function ocrImageViaNim(input) {
12074
12269
  throw new UnifiedOcrError("OCR_FAILED", "ocr", `OCR \uC7AC\uC2DC\uB3C4 \uCD08\uACFC: ${lastErr}`);
12075
12270
  }
12076
12271
  async function encodeBase64(path) {
12077
- const b = await (0, import_promises2.readFile)(path);
12272
+ const b = await (0, import_promises3.readFile)(path);
12078
12273
  return b.toString("base64");
12079
12274
  }
12080
12275
  function stripCodeFence3(text) {
@@ -12113,7 +12308,7 @@ async function parse2(input, options) {
12113
12308
  let buffer;
12114
12309
  if (typeof input === "string") {
12115
12310
  try {
12116
- const buf = await (0, import_promises3.readFile)(input);
12311
+ const buf = await (0, import_promises4.readFile)(input);
12117
12312
  buffer = toArrayBuffer(buf);
12118
12313
  } catch (err) {
12119
12314
  const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
@@ -12273,6 +12468,9 @@ async function parseDocx(buffer, options, zip) {
12273
12468
  VERSION,
12274
12469
  blocksToMarkdown,
12275
12470
  compare,
12471
+ convertHwpToPdf,
12472
+ convertHwpxToPdf,
12473
+ convertToPdf,
12276
12474
  detectFormat,
12277
12475
  detectZipFormat,
12278
12476
  diffBlocks,