@clazic/kordoc 2.5.2 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -1
- package/dist/{chunk-5CILZHRW.js → chunk-TND4YFBV.js} +2 -2
- package/dist/{chunk-25ZYYLVP.js → chunk-TS3F57LY.js} +158 -6
- package/dist/chunk-TS3F57LY.js.map +1 -0
- package/dist/cli.js +52 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +333 -135
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +71 -2
- package/dist/index.d.ts +71 -2
- package/dist/index.js +320 -125
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +43 -2
- package/dist/mcp.js.map +1 -1
- package/dist/{utils-H2BL5GNR.js → utils-F66K7PXH.js} +2 -2
- package/dist/{watch-D6ODQLPJ.js → watch-2S5ULHAM.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-25ZYYLVP.js.map +0 -1
- /package/dist/{chunk-5CILZHRW.js.map → chunk-TND4YFBV.js.map} +0 -0
- /package/dist/{utils-H2BL5GNR.js.map → utils-F66K7PXH.js.map} +0 -0
- /package/dist/{watch-D6ODQLPJ.js.map → watch-2S5ULHAM.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -33,6 +33,118 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
33
33
|
));
|
|
34
34
|
var __toCommonJS = (mod) => __copyProps(__defProp({}, "__esModule", { value: true }), mod);
|
|
35
35
|
|
|
36
|
+
// src/utils.ts
|
|
37
|
+
var utils_exports = {};
|
|
38
|
+
__export(utils_exports, {
|
|
39
|
+
KordocError: () => KordocError,
|
|
40
|
+
VERSION: () => VERSION,
|
|
41
|
+
classifyError: () => classifyError,
|
|
42
|
+
isPathTraversal: () => isPathTraversal,
|
|
43
|
+
normalizeKordocError: () => normalizeKordocError,
|
|
44
|
+
precheckZipSize: () => precheckZipSize,
|
|
45
|
+
sanitizeError: () => sanitizeError,
|
|
46
|
+
sanitizeHref: () => sanitizeHref,
|
|
47
|
+
toArrayBuffer: () => toArrayBuffer
|
|
48
|
+
});
|
|
49
|
+
/**
 * Return an ArrayBuffer that holds exactly the bytes addressed by `buf`.
 *
 * @param {ArrayBuffer|ArrayBufferView} buf - A Node.js Buffer, any typed
 *   array / DataView, or an ArrayBuffer.
 * @returns {ArrayBuffer} `buf` itself when it already is an ArrayBuffer; the
 *   view's backing buffer (shared, not copied) when the view spans all of it;
 *   otherwise a copy of just the addressed byte range.
 */
function toArrayBuffer(buf) {
  // A plain ArrayBuffer has no `.buffer` property; without this guard the
  // slice below would fail with a TypeError.
  if (buf instanceof ArrayBuffer) {
    return buf;
  }
  const { buffer, byteOffset, byteLength } = buf;
  // The view covers its whole backing store: hand it out without copying.
  if (byteOffset === 0 && byteLength === buffer.byteLength) {
    return buffer;
  }
  // The view is a window into a larger buffer: copy only that window.
  return buffer.slice(byteOffset, byteOffset + byteLength);
}
|
|
55
|
+
/**
 * Turn a thrown value into a message string.
 *
 * @param {unknown} err - The caught value.
 * @returns {string} The error's own message when it is a KordocError; for
 *   every other value a fixed generic Korean message, so the message of a
 *   non-KordocError is never echoed.
 */
function sanitizeError(err) {
  const isLibraryError = err instanceof KordocError;
  return isLibraryError
    ? err.message
    : "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
}
|
|
59
|
+
/**
 * Decide whether an entry name looks like a path-traversal attempt.
 *
 * The name is reported as unsafe when it contains a NUL character, or when,
 * after backslashes are converted to forward slashes, it contains "..",
 * starts with "/", or starts with a letter followed by ":".
 * The ".." test is a plain substring match, so a name such as "a..b" is
 * reported as well.
 *
 * @param {string} name - Entry name to inspect.
 * @returns {boolean} `true` when the name matches any of the rules above.
 */
function isPathTraversal(name) {
  if (name.includes("\0")) {
    return true;
  }
  const unixStyle = name.replace(/\\/g, "/");
  if (unixStyle.includes("..")) {
    return true;
  }
  if (unixStyle.startsWith("/")) {
    return true;
  }
  return /^[A-Za-z]:/.test(unixStyle);
}
|
|
64
|
+
/**
 * Best-effort inspection of a ZIP central directory before decompression.
 *
 * Locates the end-of-central-directory (EOCD) record, reads the declared
 * entry count, and sums the declared uncompressed size of every central
 * directory header.
 *
 * @param {ArrayBuffer|ArrayBufferView} buffer - Raw archive bytes. Views
 *   (Buffer, Uint8Array, ...) are accepted; without that, a view would make
 *   the DataView constructor throw and the limits below would be skipped.
 * @param {number} [maxUncompressedSize=104857600] - Limit for the summed
 *   declared uncompressed size, in bytes.
 * @param {number} [maxEntries=500] - Limit for the declared entry count.
 * @returns {{ totalUncompressed: number, entryCount: number }} The declared
 *   totals. Both are 0 when no EOCD record is found or when reading fails;
 *   `totalUncompressed` is 0 when the central directory lies outside the data.
 * @throws {KordocError} When the entry count or the summed size exceeds its
 *   limit.
 */
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
  const EOCD_SIGNATURE = 101010256; // 0x06054b50
  const CENTRAL_HEADER_SIGNATURE = 33639248; // 0x02014b50
  const EOCD_MIN_SIZE = 22;
  const CENTRAL_HEADER_SIZE = 46;
  try {
    // Wrap views over their own byte window rather than the whole backing
    // buffer, which may be larger than the view.
    const data = ArrayBuffer.isView(buffer)
      ? new DataView(buffer.buffer, buffer.byteOffset, buffer.byteLength)
      : new DataView(buffer);
    const len = data.byteLength;

    // Scan backwards from the last position an EOCD record can start at;
    // the window is bounded to the final 65557 bytes.
    let eocdOffset = -1;
    for (let i = len - EOCD_MIN_SIZE; i >= Math.max(0, len - 65557); i--) {
      if (data.getUint32(i, true) === EOCD_SIGNATURE) {
        eocdOffset = i;
        break;
      }
    }
    if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };

    // NOTE(review): the 16-/32-bit fields are used as read; ZIP64 extended
    // records are not consulted here.
    const entryCount = data.getUint16(eocdOffset + 10, true);
    if (entryCount > maxEntries) {
      throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
    }

    const cdSize = data.getUint32(eocdOffset + 12, true);
    const cdOffset = data.getUint32(eocdOffset + 16, true);
    // The declared central directory does not fit in the data: nothing to sum.
    if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };

    let totalUncompressed = 0;
    let pos = cdOffset;
    for (let i = 0; i < entryCount && pos + CENTRAL_HEADER_SIZE <= cdOffset + cdSize; i++) {
      if (data.getUint32(pos, true) !== CENTRAL_HEADER_SIGNATURE) break;
      totalUncompressed += data.getUint32(pos + 24, true);
      const nameLen = data.getUint16(pos + 28, true);
      const extraLen = data.getUint16(pos + 30, true);
      const commentLen = data.getUint16(pos + 32, true);
      // Each header is followed by its variable-length name/extra/comment.
      pos += CENTRAL_HEADER_SIZE + nameLen + extraLen + commentLen;
    }

    if (totalUncompressed > maxUncompressedSize) {
      throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
    }
    return { totalUncompressed, entryCount };
  } catch (err) {
    // Limit violations must reach the caller; any other failure (malformed
    // data, out-of-range read, unusable input) is reported as "nothing found".
    if (err instanceof KordocError) throw err;
    return { totalUncompressed: 0, entryCount: 0 };
  }
}
|
|
102
|
+
/**
 * Validate a link target against the allow-list in SAFE_HREF_RE.
 *
 * @param {unknown} href - Candidate link target.
 * @returns {string|null} The trimmed href when it is a non-empty string that
 *   matches SAFE_HREF_RE; otherwise `null`. Non-string input (for example a
 *   missing attribute) yields `null` instead of raising a TypeError on
 *   `.trim()`.
 */
function sanitizeHref(href) {
  if (typeof href !== "string") return null;
  const trimmed = href.trim();
  if (trimmed === "" || !SAFE_HREF_RE.test(trimmed)) return null;
  return trimmed;
}
|
|
107
|
+
/**
 * Map an error to a code by looking for known fragments in its message.
 *
 * Rules are tried top to bottom and the first match wins, so the order below
 * is significant (for example a message containing "ZIP bomb" is classified
 * as ZIP_BOMB before the broader "bomb" rule can apply).
 *
 * @param {unknown} err - The caught value.
 * @returns {string} One of "ENCRYPTED", "DRM_PROTECTED", "ZIP_BOMB",
 *   "DECOMPRESSION_BOMB", "IMAGE_BASED_PDF", "NO_SECTIONS", "CORRUPTED", or
 *   "PARSE_ERROR" (also returned for values that are not Error instances).
 */
function classifyError(err) {
  if (!(err instanceof Error)) return "PARSE_ERROR";
  const text = err.message;
  const has = (fragment) => text.includes(fragment);
  const rules = [
    ["ENCRYPTED", () => has("\uC554\uD638\uD654")],
    ["DRM_PROTECTED", () => has("DRM")],
    [
      "ZIP_BOMB",
      () =>
        has("ZIP bomb") ||
        has("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") ||
        has("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC"),
    ],
    [
      "DECOMPRESSION_BOMB",
      () => has("bomb") || has("\uD06C\uAE30 \uCD08\uACFC") || has("\uC555\uCD95 \uD574\uC81C"),
    ],
    ["IMAGE_BASED_PDF", () => has("\uC774\uBBF8\uC9C0 \uAE30\uBC18")],
    [
      "NO_SECTIONS",
      () => has("\uC139\uC158") && (has("\uCC3E\uC744 \uC218 \uC5C6") || has("\uC5C6\uC74C")),
    ],
    ["CORRUPTED", () => has("\uC2DC\uADF8\uB2C8\uCC98") || has("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")],
  ];
  for (const [code, matches] of rules) {
    if (matches()) return code;
  }
  return "PARSE_ERROR";
}
|
|
119
|
+
/**
 * Coerce any thrown value into a KordocError carrying a code and a stage.
 *
 * @param {unknown} err - The caught value.
 * @param {string} fallbackMessage - Message used when `err` is not an Error
 *   or has an empty message.
 * @param {string} [stage="unknown"] - Stage recorded on the result.
 * @param {string} [fallbackCode="PARSE_ERROR"] - Code used when none can be
 *   derived.
 * @returns {KordocError} The same instance when `err` already is a
 *   KordocError (its falsy `stage` / `code` are filled in place); otherwise a
 *   new KordocError. For other Error instances the code comes from
 *   classifyError; for non-Error values it is `fallbackCode`.
 */
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
  if (err instanceof KordocError) {
    // Fill only what is missing; values already set on the error win.
    err.stage ||= stage;
    err.code ||= fallbackCode;
    return err;
  }
  if (err instanceof Error) {
    const text = err.message || fallbackMessage;
    return new KordocError(text, { code: classifyError(err), stage });
  }
  return new KordocError(fallbackMessage, { code: fallbackCode, stage });
}
|
|
129
|
+
// Bindings owned by src/utils.ts; they stay undefined until init_utils() runs.
var VERSION, KordocError, SAFE_HREF_RE;
// Lazy initializer for src/utils.ts, wrapped by the bundler's __esm helper.
var init_utils = __esm({
  "src/utils.ts"() {
    "use strict";
    // Must match the published package version. This build is 2.6.0; the
    // embedded constant previously still carried the prior release's number.
    VERSION = "2.6.0";
    /**
     * Library error type.
     * `code` and `stage` are optional and come from the `opts` argument; both
     * are `undefined` when not supplied.
     */
    KordocError = class extends Error {
      code;
      stage;
      constructor(message, opts = {}) {
        super(message);
        this.name = "KordocError";
        this.code = opts.code;
        this.stage = opts.stage;
      }
    };
    // Allow-list of link prefixes (case-insensitive): http(s):, mailto:, tel:
    // and in-document "#" anchors.
    SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
  }
});
|
|
147
|
+
|
|
36
148
|
// src/page-range.ts
|
|
37
149
|
var page_range_exports = {};
|
|
38
150
|
__export(page_range_exports, {
|
|
@@ -3062,6 +3174,9 @@ __export(index_exports, {
|
|
|
3062
3174
|
VERSION: () => VERSION,
|
|
3063
3175
|
blocksToMarkdown: () => blocksToMarkdown,
|
|
3064
3176
|
compare: () => compare,
|
|
3177
|
+
convertHwpToPdf: () => convertHwpToPdf,
|
|
3178
|
+
convertHwpxToPdf: () => convertHwpxToPdf,
|
|
3179
|
+
convertToPdf: () => convertToPdf,
|
|
3065
3180
|
detectFormat: () => detectFormat,
|
|
3066
3181
|
detectZipFormat: () => detectZipFormat,
|
|
3067
3182
|
diffBlocks: () => diffBlocks,
|
|
@@ -3081,7 +3196,7 @@ __export(index_exports, {
|
|
|
3081
3196
|
runUnifiedOcrPipeline: () => runUnifiedOcrPipeline
|
|
3082
3197
|
});
|
|
3083
3198
|
module.exports = __toCommonJS(index_exports);
|
|
3084
|
-
var
|
|
3199
|
+
var import_promises4 = require("fs/promises");
|
|
3085
3200
|
|
|
3086
3201
|
// src/detect.ts
|
|
3087
3202
|
var import_jszip = __toESM(require("jszip"), 1);
|
|
@@ -3133,97 +3248,8 @@ async function detectZipFormat(buffer) {
|
|
|
3133
3248
|
var import_jszip2 = __toESM(require("jszip"), 1);
|
|
3134
3249
|
var import_xmldom = require("@xmldom/xmldom");
|
|
3135
3250
|
|
|
3136
|
-
// src/utils.ts
|
|
3137
|
-
var VERSION = true ? "2.5.1" : "0.0.0-dev";
|
|
3138
|
-
function toArrayBuffer(buf) {
|
|
3139
|
-
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
3140
|
-
return buf.buffer;
|
|
3141
|
-
}
|
|
3142
|
-
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
3143
|
-
}
|
|
3144
|
-
var KordocError = class extends Error {
|
|
3145
|
-
code;
|
|
3146
|
-
stage;
|
|
3147
|
-
constructor(message, opts = {}) {
|
|
3148
|
-
super(message);
|
|
3149
|
-
this.name = "KordocError";
|
|
3150
|
-
this.code = opts.code;
|
|
3151
|
-
this.stage = opts.stage;
|
|
3152
|
-
}
|
|
3153
|
-
};
|
|
3154
|
-
function isPathTraversal(name) {
|
|
3155
|
-
if (name.includes("\0")) return true;
|
|
3156
|
-
const normalized = name.replace(/\\/g, "/");
|
|
3157
|
-
return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
3158
|
-
}
|
|
3159
|
-
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
3160
|
-
try {
|
|
3161
|
-
const data = new DataView(buffer);
|
|
3162
|
-
const len = buffer.byteLength;
|
|
3163
|
-
let eocdOffset = -1;
|
|
3164
|
-
for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
|
|
3165
|
-
if (data.getUint32(i, true) === 101010256) {
|
|
3166
|
-
eocdOffset = i;
|
|
3167
|
-
break;
|
|
3168
|
-
}
|
|
3169
|
-
}
|
|
3170
|
-
if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
|
|
3171
|
-
const entryCount = data.getUint16(eocdOffset + 10, true);
|
|
3172
|
-
if (entryCount > maxEntries) {
|
|
3173
|
-
throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
|
|
3174
|
-
}
|
|
3175
|
-
const cdSize = data.getUint32(eocdOffset + 12, true);
|
|
3176
|
-
const cdOffset = data.getUint32(eocdOffset + 16, true);
|
|
3177
|
-
if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
|
|
3178
|
-
let totalUncompressed = 0;
|
|
3179
|
-
let pos = cdOffset;
|
|
3180
|
-
for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
|
|
3181
|
-
if (data.getUint32(pos, true) !== 33639248) break;
|
|
3182
|
-
totalUncompressed += data.getUint32(pos + 24, true);
|
|
3183
|
-
const nameLen = data.getUint16(pos + 28, true);
|
|
3184
|
-
const extraLen = data.getUint16(pos + 30, true);
|
|
3185
|
-
const commentLen = data.getUint16(pos + 32, true);
|
|
3186
|
-
pos += 46 + nameLen + extraLen + commentLen;
|
|
3187
|
-
}
|
|
3188
|
-
if (totalUncompressed > maxUncompressedSize) {
|
|
3189
|
-
throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
|
|
3190
|
-
}
|
|
3191
|
-
return { totalUncompressed, entryCount };
|
|
3192
|
-
} catch (err) {
|
|
3193
|
-
if (err instanceof KordocError) throw err;
|
|
3194
|
-
return { totalUncompressed: 0, entryCount: 0 };
|
|
3195
|
-
}
|
|
3196
|
-
}
|
|
3197
|
-
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
3198
|
-
function sanitizeHref(href) {
|
|
3199
|
-
const trimmed = href.trim();
|
|
3200
|
-
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
3201
|
-
return trimmed;
|
|
3202
|
-
}
|
|
3203
|
-
function classifyError(err) {
|
|
3204
|
-
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
3205
|
-
const msg = err.message;
|
|
3206
|
-
if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
|
|
3207
|
-
if (msg.includes("DRM")) return "DRM_PROTECTED";
|
|
3208
|
-
if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
|
|
3209
|
-
if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
|
|
3210
|
-
if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
|
|
3211
|
-
if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
|
|
3212
|
-
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
3213
|
-
return "PARSE_ERROR";
|
|
3214
|
-
}
|
|
3215
|
-
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
|
|
3216
|
-
if (err instanceof KordocError) {
|
|
3217
|
-
if (!err.stage) err.stage = stage;
|
|
3218
|
-
if (!err.code) err.code = fallbackCode;
|
|
3219
|
-
return err;
|
|
3220
|
-
}
|
|
3221
|
-
const message = err instanceof Error ? err.message : fallbackMessage;
|
|
3222
|
-
const code = err instanceof Error ? classifyError(err) : fallbackCode;
|
|
3223
|
-
return new KordocError(message || fallbackMessage, { code, stage });
|
|
3224
|
-
}
|
|
3225
|
-
|
|
3226
3251
|
// src/table/builder.ts
|
|
3252
|
+
init_utils();
|
|
3227
3253
|
var MAX_COLS = 200;
|
|
3228
3254
|
var MAX_ROWS = 1e4;
|
|
3229
3255
|
function buildTable(rows) {
|
|
@@ -3483,6 +3509,8 @@ var HEADING_RATIO_H2 = 1.3;
|
|
|
3483
3509
|
var HEADING_RATIO_H3 = 1.15;
|
|
3484
3510
|
|
|
3485
3511
|
// src/hwpx/parser.ts
|
|
3512
|
+
init_utils();
|
|
3513
|
+
init_utils();
|
|
3486
3514
|
init_page_range();
|
|
3487
3515
|
init_logger();
|
|
3488
3516
|
var MAX_DECOMPRESS_SIZE = 500 * 1024 * 1024;
|
|
@@ -4325,6 +4353,7 @@ function extractTextFromNode(node) {
|
|
|
4325
4353
|
|
|
4326
4354
|
// src/hwp5/record.ts
|
|
4327
4355
|
var import_zlib = require("zlib");
|
|
4356
|
+
init_utils();
|
|
4328
4357
|
var TAG_PARA_HEADER = 66;
|
|
4329
4358
|
var TAG_PARA_TEXT = 67;
|
|
4330
4359
|
var TAG_CHAR_SHAPE = 68;
|
|
@@ -5374,6 +5403,7 @@ function parseLenientCfb(data) {
|
|
|
5374
5403
|
}
|
|
5375
5404
|
|
|
5376
5405
|
// src/hwp5/parser.ts
|
|
5406
|
+
init_utils();
|
|
5377
5407
|
init_page_range();
|
|
5378
5408
|
init_logger();
|
|
5379
5409
|
var CFB = __toESM(require_cfb(), 1);
|
|
@@ -6029,6 +6059,7 @@ function arrangeCells(rows, cols, cells) {
|
|
|
6029
6059
|
}
|
|
6030
6060
|
|
|
6031
6061
|
// src/pdf/parser.ts
|
|
6062
|
+
init_utils();
|
|
6032
6063
|
init_page_range();
|
|
6033
6064
|
var import_module = require("module");
|
|
6034
6065
|
var import_path4 = require("path");
|
|
@@ -7922,6 +7953,7 @@ function mergeKoreanLines(text) {
|
|
|
7922
7953
|
// src/xlsx/parser.ts
|
|
7923
7954
|
var import_jszip3 = __toESM(require("jszip"), 1);
|
|
7924
7955
|
var import_xmldom2 = require("@xmldom/xmldom");
|
|
7956
|
+
init_utils();
|
|
7925
7957
|
init_logger();
|
|
7926
7958
|
var MAX_SHEETS = 100;
|
|
7927
7959
|
var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
|
|
@@ -8250,6 +8282,7 @@ async function parseXlsxDocument(buffer, options, existingZip) {
|
|
|
8250
8282
|
// src/docx/parser.ts
|
|
8251
8283
|
var import_jszip4 = __toESM(require("jszip"), 1);
|
|
8252
8284
|
var import_xmldom3 = require("@xmldom/xmldom");
|
|
8285
|
+
init_utils();
|
|
8253
8286
|
init_logger();
|
|
8254
8287
|
var MAX_DECOMPRESS_SIZE4 = 500 * 1024 * 1024;
|
|
8255
8288
|
function getChildElements(parent, localName) {
|
|
@@ -8729,6 +8762,7 @@ async function parseDocxDocument(buffer, options, existingZip) {
|
|
|
8729
8762
|
}
|
|
8730
8763
|
|
|
8731
8764
|
// src/index.ts
|
|
8765
|
+
init_utils();
|
|
8732
8766
|
init_cli_provider();
|
|
8733
8767
|
init_markdown_to_blocks();
|
|
8734
8768
|
init_logger();
|
|
@@ -11230,6 +11264,187 @@ async function markdownToXlsx(markdown, options) {
|
|
|
11230
11264
|
return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
|
|
11231
11265
|
}
|
|
11232
11266
|
|
|
11267
|
+
// src/convert/index.ts
|
|
11268
|
+
var import_promises2 = require("fs/promises");
|
|
11269
|
+
init_utils();
|
|
11270
|
+
|
|
11271
|
+
// src/convert/libreoffice.ts
|
|
11272
|
+
var import_libreoffice_convert = __toESM(require("libreoffice-convert"), 1);
|
|
11273
|
+
|
|
11274
|
+
// src/convert/error.ts
|
|
11275
|
+
/**
 * Error raised by the conversion helpers.
 * Carries a short machine-readable `code` (the first constructor argument)
 * next to the human-readable message.
 */
var ConvertError = class extends Error {
  code;
  constructor(code, message) {
    super(message);
    this.code = code;
    this.name = "ConvertError";
  }
};
|
|
11282
|
+
|
|
11283
|
+
// src/convert/libreoffice.ts
|
|
11284
|
+
var libreConvert = import_libreoffice_convert.default.convert;
|
|
11285
|
+
/**
 * Verify that the `soffice` executable can be launched.
 *
 * Runs `soffice --version` through the file-level `runCommand` helper.
 * The utils module does not export a `runCommand`, so looking the helper up
 * there yields `undefined` and makes every call fail regardless of whether
 * LibreOffice is installed.
 *
 * @returns {Promise<void>} Resolves when the command succeeds.
 * @throws {ConvertError} With code "SOFFICE_NOT_FOUND" when the command
 *   fails for any reason.
 */
async function assertSofficeAvailable() {
  try {
    await runCommand("soffice", ["--version"]);
  } catch {
    throw new ConvertError(
      "SOFFICE_NOT_FOUND",
      "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694."
    );
  }
}
|
|
11296
|
+
/**
 * Convert a document buffer with LibreOffice, bounded by a timeout.
 *
 * @param {Buffer} buffer - Source document bytes, passed to `libreConvert`.
 * @param {string} targetExt - Target extension passed to `libreConvert`
 *   (callers in this file pass ".pdf").
 * @param {number} [timeoutMs=60000] - Time after which the returned promise
 *   is rejected. The conversion itself is not cancelled by this function.
 * @returns {Promise<Buffer>} The converted bytes delivered by `libreConvert`.
 * @throws {ConvertError} "TIMEOUT" when the timer fires first;
 *   "CONVERT_FAILED" when the callback reports an error or no output.
 *   An exception thrown synchronously by `libreConvert` is passed through
 *   unchanged.
 */
async function convertBuffer(buffer, targetExt, timeoutMs = 6e4) {
  return new Promise((resolve4, reject) => {
    const timer = setTimeout(() => {
      reject(
        new ConvertError("TIMEOUT", `\uBCC0\uD658 \uD0C0\uC784\uC544\uC6C3 (${timeoutMs}ms \uCD08\uACFC)`)
      );
    }, timeoutMs);
    try {
      libreConvert(buffer, targetExt, void 0, (err, done) => {
        clearTimeout(timer);
        if (err || !done) {
          reject(
            new ConvertError(
              "CONVERT_FAILED",
              err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"
            )
          );
          return;
        }
        resolve4(done);
      });
    } catch (err) {
      // A synchronous throw means the callback will never run; without this
      // the pending timer would keep the event loop alive until it fires.
      clearTimeout(timer);
      reject(err);
    }
  });
}
|
|
11318
|
+
|
|
11319
|
+
// src/convert/index.ts
|
|
11320
|
+
// True while one conversion holds the lock.
var isConverting = false;
// Callbacks of callers waiting for the lock, in arrival order.
var queue = [];
/**
 * Acquire the single conversion lock.
 *
 * @returns {Promise<() => void>} Resolves with a release function. When the
 *   lock is free it is taken immediately; otherwise the caller is queued and
 *   the promise resolves once an earlier holder releases. Calling the release
 *   function frees the lock and hands it to the next queued caller, if any.
 */
async function acquireConvertLock() {
  const release = () => {
    isConverting = false;
    queue.shift()?.();
  };
  if (isConverting) {
    return new Promise((grant) => {
      queue.push(() => {
        isConverting = true;
        grant(release);
      });
    });
  }
  isConverting = true;
  return release;
}
|
|
11342
|
+
/**
 * Convert an HWP or HWPX document to PDF through LibreOffice.
 *
 * Steps: load the input into a Buffer, reject inputs above 500 MB, detect the
 * format (only "hwp" and "hwpx" are accepted), check that soffice is
 * available, then convert while holding the conversion lock.
 *
 * @param {string|Buffer|ArrayBuffer|Uint8Array} input - A string is read as a
 *   file path; a Buffer is used as is; anything else goes through
 *   `Buffer.from`.
 * @param {{ onProgress?: Function, timeoutMs?: number }} [options] -
 *   `onProgress` is called with (10, "convert") before and (100, "done")
 *   after the conversion; `timeoutMs` is forwarded to `convertBuffer`.
 * @returns {Promise<object>} `{ success: true, pdf, sourceFormat }` on
 *   success, otherwise `{ success: false, code, error, stage }` with `stage`
 *   one of "detect", "validate" or "convert".
 * @throws Re-throws anything other than a ConvertError raised by the soffice
 *   availability check.
 */
async function convertToPdf(input, options) {
  // All failure results share one shape; key order matches the original.
  const failure = (code, error, stage) => ({ success: false, code, error, stage });

  let buffer;
  try {
    if (typeof input === "string") {
      buffer = await (0, import_promises2.readFile)(input);
    } else {
      buffer = Buffer.isBuffer(input) ? input : Buffer.from(input);
    }
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    return failure("PARSE_ERROR", `\uC785\uB825 \uC77D\uAE30 \uC2E4\uD328: ${reason}`, "detect");
  }

  const MAX_FILE_SIZE = 500 * 1024 * 1024;
  if (buffer.length > MAX_FILE_SIZE) {
    const sizeMb = (buffer.length / 1024 / 1024).toFixed(1);
    return failure(
      "FILE_TOO_LARGE",
      `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${sizeMb}MB (\uCD5C\uB300 500MB)`,
      "detect"
    );
  }

  const format = detectFormat(toArrayBuffer(buffer));
  const isConvertible = format === "hwp" || format === "hwpx";
  if (!isConvertible) {
    return failure(
      "UNSUPPORTED_FORMAT",
      `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD3EC\uB9F7\uC785\uB2C8\uB2E4: ${format}`,
      "detect"
    );
  }

  try {
    await assertSofficeAvailable();
  } catch (err) {
    if (!(err instanceof ConvertError)) throw err;
    return failure(err.code, err.message, "validate");
  }

  // Only one conversion runs at a time; the lock is always released below.
  const releaseLock = await acquireConvertLock();
  try {
    options?.onProgress?.(10, "convert");
    const pdf = await convertBuffer(buffer, ".pdf", options?.timeoutMs);
    options?.onProgress?.(100, "done");
    return { success: true, pdf: new Uint8Array(pdf), sourceFormat: format };
  } catch (err) {
    if (err instanceof ConvertError) {
      return failure(err.code, err.message, "convert");
    }
    return failure(
      classifyError(err),
      err instanceof Error ? err.message : "\uBCC0\uD658 \uC2E4\uD328",
      "convert"
    );
  } finally {
    releaseLock();
  }
}
|
|
11420
|
+
/**
 * Convert a document to PDF and require that its detected format is "hwp".
 *
 * Delegates to `convertToPdf` and inspects the result afterwards, so the
 * conversion has already run by the time the format is checked.
 *
 * @param {*} input - Forwarded to `convertToPdf`.
 * @param {*} [options] - Forwarded to `convertToPdf`.
 * @returns {Promise<object>} The `convertToPdf` result, unless it succeeded
 *   with a source format other than "hwp"; then an "UNSUPPORTED_FORMAT"
 *   failure with stage "detect".
 */
async function convertHwpToPdf(input, options) {
  const result = await convertToPdf(input, options);
  // Failures and genuine HWP conversions pass through untouched.
  if (!result.success || result.sourceFormat === "hwp") {
    return result;
  }
  return {
    success: false,
    code: "UNSUPPORTED_FORMAT",
    error: `HWP 5.x \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${result.sourceFormat}`,
    stage: "detect"
  };
}
|
|
11432
|
+
/**
 * Convert a document to PDF and require that its detected format is "hwpx".
 *
 * Delegates to `convertToPdf` and inspects the result afterwards, so the
 * conversion has already run by the time the format is checked.
 *
 * @param {*} input - Forwarded to `convertToPdf`.
 * @param {*} [options] - Forwarded to `convertToPdf`.
 * @returns {Promise<object>} The `convertToPdf` result, unless it succeeded
 *   with a source format other than "hwpx"; then an "UNSUPPORTED_FORMAT"
 *   failure with stage "detect".
 */
async function convertHwpxToPdf(input, options) {
  const result = await convertToPdf(input, options);
  // Failures and genuine HWPX conversions pass through untouched.
  if (!result.success || result.sourceFormat === "hwpx") {
    return result;
  }
  return {
    success: false,
    code: "UNSUPPORTED_FORMAT",
    error: `HWPX \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${result.sourceFormat}`,
    stage: "detect"
  };
}
|
|
11444
|
+
|
|
11445
|
+
// src/index.ts
|
|
11446
|
+
init_utils();
|
|
11447
|
+
|
|
11233
11448
|
// src/ocr/api-key-rotation.ts
|
|
11234
11449
|
var AllKeysCoolingDownError = class extends Error {
|
|
11235
11450
|
waitMs;
|
|
@@ -11324,11 +11539,10 @@ var ApiKeyRotationPool = class _ApiKeyRotationPool {
|
|
|
11324
11539
|
};
|
|
11325
11540
|
|
|
11326
11541
|
// src/pipeline/unified-ocr.ts
|
|
11327
|
-
var
|
|
11542
|
+
var import_promises3 = require("fs/promises");
|
|
11328
11543
|
var import_path5 = require("path");
|
|
11329
11544
|
var import_child_process4 = require("child_process");
|
|
11330
11545
|
var import_node_perf_hooks = require("perf_hooks");
|
|
11331
|
-
var import_libreoffice_convert = __toESM(require("libreoffice-convert"), 1);
|
|
11332
11546
|
init_logger();
|
|
11333
11547
|
|
|
11334
11548
|
// src/pipeline/bounded-queue.ts
|
|
@@ -11390,7 +11604,6 @@ var BoundedQueue = class {
|
|
|
11390
11604
|
};
|
|
11391
11605
|
|
|
11392
11606
|
// src/pipeline/unified-ocr.ts
|
|
11393
|
-
var libreConvert = import_libreoffice_convert.default.convert;
|
|
11394
11607
|
var UnifiedOcrError = class extends Error {
|
|
11395
11608
|
code;
|
|
11396
11609
|
stage;
|
|
@@ -11484,9 +11697,9 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11484
11697
|
const keyPool = ApiKeyRotationPool.fromEnv();
|
|
11485
11698
|
const runId = options.runId ?? generateRunId("ocr");
|
|
11486
11699
|
const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
|
|
11487
|
-
await (0,
|
|
11488
|
-
await (0,
|
|
11489
|
-
await (0,
|
|
11700
|
+
await (0, import_promises3.mkdir)(imagesDir, { recursive: true });
|
|
11701
|
+
await (0, import_promises3.mkdir)(rawDir, { recursive: true });
|
|
11702
|
+
await (0, import_promises3.mkdir)(diffDir, { recursive: true });
|
|
11490
11703
|
const timingsMs = {};
|
|
11491
11704
|
const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
|
|
11492
11705
|
const markStageProgress = (stage, stagePercent, current, total, message, model) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message, model });
|
|
@@ -11505,9 +11718,9 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11505
11718
|
if ((0, import_path5.extname)(absInput).toLowerCase() !== ".pdf") {
|
|
11506
11719
|
await assertSofficeAvailable();
|
|
11507
11720
|
workingPdfPath = (0, import_path5.join)(workspaceDir, `${stem}.pdf`);
|
|
11508
|
-
const inputBuffer = await (0,
|
|
11509
|
-
const out = await
|
|
11510
|
-
await (0,
|
|
11721
|
+
const inputBuffer = await (0, import_promises3.readFile)(absInput);
|
|
11722
|
+
const out = await convertBuffer(inputBuffer, ".pdf");
|
|
11723
|
+
await (0, import_promises3.writeFile)(workingPdfPath, out);
|
|
11511
11724
|
}
|
|
11512
11725
|
timingsMs.convert = elapsedMs(convertStart);
|
|
11513
11726
|
markStageDone("convert", "PDF \uBCC0\uD658 \uC644\uB8CC");
|
|
@@ -11519,7 +11732,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11519
11732
|
markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
|
|
11520
11733
|
logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi, totalPages });
|
|
11521
11734
|
await runCommand("pdftoppm", ["-png", "-r", String(dpi), "-f", "1", "-l", "1", workingPdfPath, (0, import_path5.join)(imagesDir, "page")]);
|
|
11522
|
-
const firstFiles = (await (0,
|
|
11735
|
+
const firstFiles = (await (0, import_promises3.readdir)(imagesDir)).filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
|
|
11523
11736
|
if (firstFiles.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uCCAB \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328");
|
|
11524
11737
|
const probeImage = (0, import_path5.join)(imagesDir, firstFiles[0]);
|
|
11525
11738
|
markStageProgress("render", Math.round(1 / totalPages * 100), 1, totalPages, `\uD398\uC774\uC9C0 1/${totalPages} \uB80C\uB354\uB9C1`);
|
|
@@ -11557,7 +11770,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11557
11770
|
const keyCount = keyPool.snapshot().length;
|
|
11558
11771
|
const workerCount = Math.max(1, keyCount * concurrencyPerKey);
|
|
11559
11772
|
const queueCapacity = workerCount * 2;
|
|
11560
|
-
const
|
|
11773
|
+
const queue2 = new BoundedQueue(queueCapacity);
|
|
11561
11774
|
const ocrStart = import_node_perf_hooks.performance.now();
|
|
11562
11775
|
currentStage = "ocr";
|
|
11563
11776
|
markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (\uC6CC\uCEE4 ${workerCount}\uAC1C)`);
|
|
@@ -11565,17 +11778,17 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11565
11778
|
let renderDone = 1;
|
|
11566
11779
|
const renderProducer = (async () => {
|
|
11567
11780
|
try {
|
|
11568
|
-
await
|
|
11781
|
+
await queue2.enqueue({ pageNumber: 1, imagePath: probeImage });
|
|
11569
11782
|
if (totalPages > 1) {
|
|
11570
11783
|
for await (const item of renderPdfToPngStream(workingPdfPath, (0, import_path5.join)(imagesDir, "page"), dpi, totalPages, 2)) {
|
|
11571
|
-
await
|
|
11784
|
+
await queue2.enqueue(item);
|
|
11572
11785
|
renderDone++;
|
|
11573
11786
|
markStageProgress("render", Math.round(renderDone / totalPages * 100), renderDone, totalPages, `\uD398\uC774\uC9C0 ${renderDone}/${totalPages} \uB80C\uB354\uB9C1`);
|
|
11574
11787
|
logStage("debug", "render", "progress", "\uD398\uC774\uC9C0 \uB80C\uB354 \uC644\uB8CC", { page: item.pageNumber });
|
|
11575
11788
|
}
|
|
11576
11789
|
}
|
|
11577
11790
|
} finally {
|
|
11578
|
-
|
|
11791
|
+
queue2.close();
|
|
11579
11792
|
timingsMs.render = elapsedMs(renderStart);
|
|
11580
11793
|
markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
|
|
11581
11794
|
logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: renderDone, elapsedMs: timingsMs.render });
|
|
@@ -11584,7 +11797,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11584
11797
|
const [, pageResultsMap] = await Promise.all([
|
|
11585
11798
|
renderProducer,
|
|
11586
11799
|
ocrWorkerPool({
|
|
11587
|
-
queue,
|
|
11800
|
+
queue: queue2,
|
|
11588
11801
|
workerCount,
|
|
11589
11802
|
totalPages,
|
|
11590
11803
|
ocrInput: {
|
|
@@ -11618,7 +11831,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11618
11831
|
const rawPagePaths = [];
|
|
11619
11832
|
for (const [pageNum, markdown] of sortedEntries) {
|
|
11620
11833
|
const pagePath = (0, import_path5.join)(rawDir, `page_${String(pageNum).padStart(4, "0")}.md`);
|
|
11621
|
-
await (0,
|
|
11834
|
+
await (0, import_promises3.writeFile)(pagePath, markdown, "utf-8");
|
|
11622
11835
|
rawPagePaths.push(pagePath);
|
|
11623
11836
|
}
|
|
11624
11837
|
const mergeStart = import_node_perf_hooks.performance.now();
|
|
@@ -11626,7 +11839,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11626
11839
|
markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
|
|
11627
11840
|
logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
|
|
11628
11841
|
const merged = await mergeMarkdownPages(rawPagePaths);
|
|
11629
|
-
await (0,
|
|
11842
|
+
await (0, import_promises3.writeFile)(outputPath, merged, "utf-8");
|
|
11630
11843
|
timingsMs.merge = elapsedMs(mergeStart);
|
|
11631
11844
|
markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
|
|
11632
11845
|
logStage("info", "merge", "done", "\uCD5C\uC885 \uBCD1\uD569 \uC644\uB8CC", { outputPath, elapsedMs: timingsMs.merge });
|
|
@@ -11642,7 +11855,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11642
11855
|
timingsMs,
|
|
11643
11856
|
modelCachePath
|
|
11644
11857
|
};
|
|
11645
|
-
await (0,
|
|
11858
|
+
await (0, import_promises3.writeFile)(reportPath, JSON.stringify(report, null, 2), "utf-8");
|
|
11646
11859
|
logStage("info", "finalize", "done", "run-report \uC800\uC7A5 \uC644\uB8CC", { reportPath });
|
|
11647
11860
|
return { outputPath, reportPath, selectedModel };
|
|
11648
11861
|
} catch (err) {
|
|
@@ -11704,17 +11917,6 @@ function emitProgress(cb, stage, stagePercent, weights, extra) {
|
|
|
11704
11917
|
model: extra.model
|
|
11705
11918
|
});
|
|
11706
11919
|
}
|
|
11707
|
-
async function convertWithLibreOffice(buffer, ext) {
|
|
11708
|
-
return await new Promise((resolvePromise, reject) => {
|
|
11709
|
-
libreConvert(buffer, ext, void 0, (err, done) => {
|
|
11710
|
-
if (err || !done) {
|
|
11711
|
-
reject(new UnifiedOcrError("CONVERT_FAILED", "convert", err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"));
|
|
11712
|
-
return;
|
|
11713
|
-
}
|
|
11714
|
-
resolvePromise(done);
|
|
11715
|
-
});
|
|
11716
|
-
});
|
|
11717
|
-
}
|
|
11718
11920
|
async function getPdfPageCount(pdfPath) {
|
|
11719
11921
|
const stdout = await runCommandWithStdout("pdfinfo", [pdfPath]);
|
|
11720
11922
|
const m = stdout.match(/^\s*Pages:\s*(\d+)\s*$/mi);
|
|
@@ -11742,7 +11944,7 @@ async function* renderPdfToPngStream(pdfPath, prefixPath, dpi, totalPages, start
|
|
|
11742
11944
|
pdfPath,
|
|
11743
11945
|
prefixPath
|
|
11744
11946
|
]);
|
|
11745
|
-
const files = await (0,
|
|
11947
|
+
const files = await (0, import_promises3.readdir)(imagesDir);
|
|
11746
11948
|
const pageFiles = files.filter((f) => f.endsWith(".png")).sort((a, b) => naturalPageSort(a, b));
|
|
11747
11949
|
const imagePath = (0, import_path5.join)(imagesDir, pageFiles[pageFiles.length - 1]);
|
|
11748
11950
|
yield { pageNumber: page, imagePath };
|
|
@@ -11787,13 +11989,6 @@ async function runCommandWithStdout(cmd, args) {
|
|
|
11787
11989
|
});
|
|
11788
11990
|
});
|
|
11789
11991
|
}
|
|
11790
|
-
async function assertSofficeAvailable() {
|
|
11791
|
-
try {
|
|
11792
|
-
await runCommand("soffice", ["--version"]);
|
|
11793
|
-
} catch {
|
|
11794
|
-
throw new UnifiedOcrError("SOFFICE_NOT_FOUND", "convert", "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.");
|
|
11795
|
-
}
|
|
11796
|
-
}
|
|
11797
11992
|
function naturalPageSort(a, b) {
|
|
11798
11993
|
const na = Number((a.match(/\d+/g) || ["0"]).at(-1) || 0);
|
|
11799
11994
|
const nb = Number((b.match(/\d+/g) || ["0"]).at(-1) || 0);
|
|
@@ -11867,7 +12062,7 @@ function startParallelProbeRuns(input) {
|
|
|
11867
12062
|
}
|
|
11868
12063
|
async function loadModelCache(path) {
|
|
11869
12064
|
try {
|
|
11870
|
-
const raw = await (0,
|
|
12065
|
+
const raw = await (0, import_promises3.readFile)(path, "utf-8");
|
|
11871
12066
|
return JSON.parse(raw);
|
|
11872
12067
|
} catch {
|
|
11873
12068
|
return null;
|
|
@@ -11898,15 +12093,15 @@ async function updateModelCache(path, probes) {
|
|
|
11898
12093
|
}
|
|
11899
12094
|
}
|
|
11900
12095
|
current.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
|
|
11901
|
-
await (0,
|
|
12096
|
+
await (0, import_promises3.writeFile)(path, JSON.stringify(current, null, 2), "utf-8");
|
|
11902
12097
|
}
|
|
11903
12098
|
async function ocrWorkerPool(input) {
|
|
11904
|
-
const { queue, workerCount, ocrInput, onPageDone } = input;
|
|
12099
|
+
const { queue: queue2, workerCount, ocrInput, onPageDone } = input;
|
|
11905
12100
|
const results = /* @__PURE__ */ new Map();
|
|
11906
12101
|
let completedCount = 0;
|
|
11907
12102
|
async function worker() {
|
|
11908
12103
|
while (true) {
|
|
11909
|
-
const item = await
|
|
12104
|
+
const item = await queue2.dequeue();
|
|
11910
12105
|
if (item === QUEUE_DONE) break;
|
|
11911
12106
|
const { pageNumber, imagePath, error } = item;
|
|
11912
12107
|
if (imagePath === null) {
|
|
@@ -11958,7 +12153,7 @@ async function ocrImageWithFallback(input) {
|
|
|
11958
12153
|
async function mergeMarkdownPages(paths) {
|
|
11959
12154
|
const out = [];
|
|
11960
12155
|
for (let i = 0; i < paths.length; i++) {
|
|
11961
|
-
const txt = (await (0,
|
|
12156
|
+
const txt = (await (0, import_promises3.readFile)(paths[i], "utf-8")).trim();
|
|
11962
12157
|
if (!txt) continue;
|
|
11963
12158
|
out.push(txt);
|
|
11964
12159
|
}
|
|
@@ -12074,7 +12269,7 @@ async function ocrImageViaNim(input) {
|
|
|
12074
12269
|
throw new UnifiedOcrError("OCR_FAILED", "ocr", `OCR \uC7AC\uC2DC\uB3C4 \uCD08\uACFC: ${lastErr}`);
|
|
12075
12270
|
}
|
|
12076
12271
|
async function encodeBase64(path) {
|
|
12077
|
-
const b = await (0,
|
|
12272
|
+
const b = await (0, import_promises3.readFile)(path);
|
|
12078
12273
|
return b.toString("base64");
|
|
12079
12274
|
}
|
|
12080
12275
|
function stripCodeFence3(text) {
|
|
@@ -12113,7 +12308,7 @@ async function parse2(input, options) {
|
|
|
12113
12308
|
let buffer;
|
|
12114
12309
|
if (typeof input === "string") {
|
|
12115
12310
|
try {
|
|
12116
|
-
const buf = await (0,
|
|
12311
|
+
const buf = await (0, import_promises4.readFile)(input);
|
|
12117
12312
|
buffer = toArrayBuffer(buf);
|
|
12118
12313
|
} catch (err) {
|
|
12119
12314
|
const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
|
|
@@ -12273,6 +12468,9 @@ async function parseDocx(buffer, options, zip) {
|
|
|
12273
12468
|
VERSION,
|
|
12274
12469
|
blocksToMarkdown,
|
|
12275
12470
|
compare,
|
|
12471
|
+
convertHwpToPdf,
|
|
12472
|
+
convertHwpxToPdf,
|
|
12473
|
+
convertToPdf,
|
|
12276
12474
|
detectFormat,
|
|
12277
12475
|
detectZipFormat,
|
|
12278
12476
|
diffBlocks,
|