@clazic/kordoc 2.5.2 → 2.6.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +29 -1
- package/dist/{chunk-5CILZHRW.js → chunk-TND4YFBV.js} +2 -2
- package/dist/{chunk-25ZYYLVP.js → chunk-TS3F57LY.js} +158 -6
- package/dist/chunk-TS3F57LY.js.map +1 -0
- package/dist/cli.js +52 -5
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +333 -135
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +71 -2
- package/dist/index.d.ts +71 -2
- package/dist/index.js +320 -125
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +43 -2
- package/dist/mcp.js.map +1 -1
- package/dist/{utils-H2BL5GNR.js → utils-F66K7PXH.js} +2 -2
- package/dist/{watch-D6ODQLPJ.js → watch-2S5ULHAM.js} +3 -3
- package/package.json +1 -1
- package/dist/chunk-25ZYYLVP.js.map +0 -1
- /package/dist/{chunk-5CILZHRW.js.map → chunk-TND4YFBV.js.map} +0 -0
- /package/dist/{utils-H2BL5GNR.js.map → utils-F66K7PXH.js.map} +0 -0
- /package/dist/{watch-D6ODQLPJ.js.map → watch-2S5ULHAM.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -37,6 +37,118 @@ var __toESM = (mod, isNodeMode, target) => (target = mod != null ? __create(__ge
|
|
|
37
37
|
mod
|
|
38
38
|
));
|
|
39
39
|
|
|
40
|
+
// src/utils.ts
|
|
41
|
+
var utils_exports = {};
|
|
42
|
+
__export(utils_exports, {
|
|
43
|
+
KordocError: () => KordocError,
|
|
44
|
+
VERSION: () => VERSION,
|
|
45
|
+
classifyError: () => classifyError,
|
|
46
|
+
isPathTraversal: () => isPathTraversal,
|
|
47
|
+
normalizeKordocError: () => normalizeKordocError,
|
|
48
|
+
precheckZipSize: () => precheckZipSize,
|
|
49
|
+
sanitizeError: () => sanitizeError,
|
|
50
|
+
sanitizeHref: () => sanitizeHref,
|
|
51
|
+
toArrayBuffer: () => toArrayBuffer
|
|
52
|
+
});
|
|
53
|
+
function toArrayBuffer(buf) {
|
|
54
|
+
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
55
|
+
return buf.buffer;
|
|
56
|
+
}
|
|
57
|
+
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
58
|
+
}
|
|
59
|
+
function sanitizeError(err) {
|
|
60
|
+
if (err instanceof KordocError) return err.message;
|
|
61
|
+
return "\uBB38\uC11C \uCC98\uB9AC \uC911 \uC624\uB958\uAC00 \uBC1C\uC0DD\uD588\uC2B5\uB2C8\uB2E4";
|
|
62
|
+
}
|
|
63
|
+
function isPathTraversal(name) {
|
|
64
|
+
if (name.includes("\0")) return true;
|
|
65
|
+
const normalized = name.replace(/\\/g, "/");
|
|
66
|
+
return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
67
|
+
}
|
|
68
|
+
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
69
|
+
try {
|
|
70
|
+
const data = new DataView(buffer);
|
|
71
|
+
const len = buffer.byteLength;
|
|
72
|
+
let eocdOffset = -1;
|
|
73
|
+
for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
|
|
74
|
+
if (data.getUint32(i, true) === 101010256) {
|
|
75
|
+
eocdOffset = i;
|
|
76
|
+
break;
|
|
77
|
+
}
|
|
78
|
+
}
|
|
79
|
+
if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
|
|
80
|
+
const entryCount = data.getUint16(eocdOffset + 10, true);
|
|
81
|
+
if (entryCount > maxEntries) {
|
|
82
|
+
throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
|
|
83
|
+
}
|
|
84
|
+
const cdSize = data.getUint32(eocdOffset + 12, true);
|
|
85
|
+
const cdOffset = data.getUint32(eocdOffset + 16, true);
|
|
86
|
+
if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
|
|
87
|
+
let totalUncompressed = 0;
|
|
88
|
+
let pos = cdOffset;
|
|
89
|
+
for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
|
|
90
|
+
if (data.getUint32(pos, true) !== 33639248) break;
|
|
91
|
+
totalUncompressed += data.getUint32(pos + 24, true);
|
|
92
|
+
const nameLen = data.getUint16(pos + 28, true);
|
|
93
|
+
const extraLen = data.getUint16(pos + 30, true);
|
|
94
|
+
const commentLen = data.getUint16(pos + 32, true);
|
|
95
|
+
pos += 46 + nameLen + extraLen + commentLen;
|
|
96
|
+
}
|
|
97
|
+
if (totalUncompressed > maxUncompressedSize) {
|
|
98
|
+
throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
|
|
99
|
+
}
|
|
100
|
+
return { totalUncompressed, entryCount };
|
|
101
|
+
} catch (err) {
|
|
102
|
+
if (err instanceof KordocError) throw err;
|
|
103
|
+
return { totalUncompressed: 0, entryCount: 0 };
|
|
104
|
+
}
|
|
105
|
+
}
|
|
106
|
+
function sanitizeHref(href) {
|
|
107
|
+
const trimmed = href.trim();
|
|
108
|
+
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
109
|
+
return trimmed;
|
|
110
|
+
}
|
|
111
|
+
function classifyError(err) {
|
|
112
|
+
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
113
|
+
const msg = err.message;
|
|
114
|
+
if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
|
|
115
|
+
if (msg.includes("DRM")) return "DRM_PROTECTED";
|
|
116
|
+
if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
|
|
117
|
+
if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
|
|
118
|
+
if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
|
|
119
|
+
if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
|
|
120
|
+
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
121
|
+
return "PARSE_ERROR";
|
|
122
|
+
}
|
|
123
|
+
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
|
|
124
|
+
if (err instanceof KordocError) {
|
|
125
|
+
if (!err.stage) err.stage = stage;
|
|
126
|
+
if (!err.code) err.code = fallbackCode;
|
|
127
|
+
return err;
|
|
128
|
+
}
|
|
129
|
+
const message = err instanceof Error ? err.message : fallbackMessage;
|
|
130
|
+
const code = err instanceof Error ? classifyError(err) : fallbackCode;
|
|
131
|
+
return new KordocError(message || fallbackMessage, { code, stage });
|
|
132
|
+
}
|
|
133
|
+
var VERSION, KordocError, SAFE_HREF_RE;
|
|
134
|
+
var init_utils = __esm({
|
|
135
|
+
"src/utils.ts"() {
|
|
136
|
+
"use strict";
|
|
137
|
+
VERSION = true ? "2.5.2" : "0.0.0-dev";
|
|
138
|
+
KordocError = class extends Error {
|
|
139
|
+
code;
|
|
140
|
+
stage;
|
|
141
|
+
constructor(message, opts = {}) {
|
|
142
|
+
super(message);
|
|
143
|
+
this.name = "KordocError";
|
|
144
|
+
this.code = opts.code;
|
|
145
|
+
this.stage = opts.stage;
|
|
146
|
+
}
|
|
147
|
+
};
|
|
148
|
+
SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
149
|
+
}
|
|
150
|
+
});
|
|
151
|
+
|
|
40
152
|
// src/page-range.ts
|
|
41
153
|
var page_range_exports = {};
|
|
42
154
|
__export(page_range_exports, {
|
|
@@ -3059,7 +3171,7 @@ var init_provider = __esm({
|
|
|
3059
3171
|
});
|
|
3060
3172
|
|
|
3061
3173
|
// src/index.ts
|
|
3062
|
-
import { readFile as
|
|
3174
|
+
import { readFile as readFile3 } from "fs/promises";
|
|
3063
3175
|
|
|
3064
3176
|
// src/detect.ts
|
|
3065
3177
|
import JSZip from "jszip";
|
|
@@ -3111,97 +3223,8 @@ async function detectZipFormat(buffer) {
|
|
|
3111
3223
|
import JSZip2 from "jszip";
|
|
3112
3224
|
import { DOMParser } from "@xmldom/xmldom";
|
|
3113
3225
|
|
|
3114
|
-
// src/utils.ts
|
|
3115
|
-
var VERSION = true ? "2.5.1" : "0.0.0-dev";
|
|
3116
|
-
function toArrayBuffer(buf) {
|
|
3117
|
-
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
3118
|
-
return buf.buffer;
|
|
3119
|
-
}
|
|
3120
|
-
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
3121
|
-
}
|
|
3122
|
-
var KordocError = class extends Error {
|
|
3123
|
-
code;
|
|
3124
|
-
stage;
|
|
3125
|
-
constructor(message, opts = {}) {
|
|
3126
|
-
super(message);
|
|
3127
|
-
this.name = "KordocError";
|
|
3128
|
-
this.code = opts.code;
|
|
3129
|
-
this.stage = opts.stage;
|
|
3130
|
-
}
|
|
3131
|
-
};
|
|
3132
|
-
function isPathTraversal(name) {
|
|
3133
|
-
if (name.includes("\0")) return true;
|
|
3134
|
-
const normalized = name.replace(/\\/g, "/");
|
|
3135
|
-
return normalized.includes("..") || normalized.startsWith("/") || /^[A-Za-z]:/.test(normalized);
|
|
3136
|
-
}
|
|
3137
|
-
function precheckZipSize(buffer, maxUncompressedSize = 100 * 1024 * 1024, maxEntries = 500) {
|
|
3138
|
-
try {
|
|
3139
|
-
const data = new DataView(buffer);
|
|
3140
|
-
const len = buffer.byteLength;
|
|
3141
|
-
let eocdOffset = -1;
|
|
3142
|
-
for (let i = len - 22; i >= Math.max(0, len - 65557); i--) {
|
|
3143
|
-
if (data.getUint32(i, true) === 101010256) {
|
|
3144
|
-
eocdOffset = i;
|
|
3145
|
-
break;
|
|
3146
|
-
}
|
|
3147
|
-
}
|
|
3148
|
-
if (eocdOffset < 0) return { totalUncompressed: 0, entryCount: 0 };
|
|
3149
|
-
const entryCount = data.getUint16(eocdOffset + 10, true);
|
|
3150
|
-
if (entryCount > maxEntries) {
|
|
3151
|
-
throw new KordocError(`ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC: ${entryCount} (\uCD5C\uB300 ${maxEntries})`);
|
|
3152
|
-
}
|
|
3153
|
-
const cdSize = data.getUint32(eocdOffset + 12, true);
|
|
3154
|
-
const cdOffset = data.getUint32(eocdOffset + 16, true);
|
|
3155
|
-
if (cdOffset + cdSize > len) return { totalUncompressed: 0, entryCount };
|
|
3156
|
-
let totalUncompressed = 0;
|
|
3157
|
-
let pos = cdOffset;
|
|
3158
|
-
for (let i = 0; i < entryCount && pos + 46 <= cdOffset + cdSize; i++) {
|
|
3159
|
-
if (data.getUint32(pos, true) !== 33639248) break;
|
|
3160
|
-
totalUncompressed += data.getUint32(pos + 24, true);
|
|
3161
|
-
const nameLen = data.getUint16(pos + 28, true);
|
|
3162
|
-
const extraLen = data.getUint16(pos + 30, true);
|
|
3163
|
-
const commentLen = data.getUint16(pos + 32, true);
|
|
3164
|
-
pos += 46 + nameLen + extraLen + commentLen;
|
|
3165
|
-
}
|
|
3166
|
-
if (totalUncompressed > maxUncompressedSize) {
|
|
3167
|
-
throw new KordocError(`ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC: ${(totalUncompressed / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 ${maxUncompressedSize / 1024 / 1024}MB)`);
|
|
3168
|
-
}
|
|
3169
|
-
return { totalUncompressed, entryCount };
|
|
3170
|
-
} catch (err) {
|
|
3171
|
-
if (err instanceof KordocError) throw err;
|
|
3172
|
-
return { totalUncompressed: 0, entryCount: 0 };
|
|
3173
|
-
}
|
|
3174
|
-
}
|
|
3175
|
-
var SAFE_HREF_RE = /^(?:https?:|mailto:|tel:|#)/i;
|
|
3176
|
-
function sanitizeHref(href) {
|
|
3177
|
-
const trimmed = href.trim();
|
|
3178
|
-
if (!trimmed || !SAFE_HREF_RE.test(trimmed)) return null;
|
|
3179
|
-
return trimmed;
|
|
3180
|
-
}
|
|
3181
|
-
function classifyError(err) {
|
|
3182
|
-
if (!(err instanceof Error)) return "PARSE_ERROR";
|
|
3183
|
-
const msg = err.message;
|
|
3184
|
-
if (msg.includes("\uC554\uD638\uD654")) return "ENCRYPTED";
|
|
3185
|
-
if (msg.includes("DRM")) return "DRM_PROTECTED";
|
|
3186
|
-
if (msg.includes("ZIP bomb") || msg.includes("ZIP \uBE44\uC555\uCD95 \uD06C\uAE30 \uCD08\uACFC") || msg.includes("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC")) return "ZIP_BOMB";
|
|
3187
|
-
if (msg.includes("bomb") || msg.includes("\uD06C\uAE30 \uCD08\uACFC") || msg.includes("\uC555\uCD95 \uD574\uC81C")) return "DECOMPRESSION_BOMB";
|
|
3188
|
-
if (msg.includes("\uC774\uBBF8\uC9C0 \uAE30\uBC18")) return "IMAGE_BASED_PDF";
|
|
3189
|
-
if (msg.includes("\uC139\uC158") && (msg.includes("\uCC3E\uC744 \uC218 \uC5C6") || msg.includes("\uC5C6\uC74C"))) return "NO_SECTIONS";
|
|
3190
|
-
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
3191
|
-
return "PARSE_ERROR";
|
|
3192
|
-
}
|
|
3193
|
-
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
|
|
3194
|
-
if (err instanceof KordocError) {
|
|
3195
|
-
if (!err.stage) err.stage = stage;
|
|
3196
|
-
if (!err.code) err.code = fallbackCode;
|
|
3197
|
-
return err;
|
|
3198
|
-
}
|
|
3199
|
-
const message = err instanceof Error ? err.message : fallbackMessage;
|
|
3200
|
-
const code = err instanceof Error ? classifyError(err) : fallbackCode;
|
|
3201
|
-
return new KordocError(message || fallbackMessage, { code, stage });
|
|
3202
|
-
}
|
|
3203
|
-
|
|
3204
3226
|
// src/table/builder.ts
|
|
3227
|
+
init_utils();
|
|
3205
3228
|
var MAX_COLS = 200;
|
|
3206
3229
|
var MAX_ROWS = 1e4;
|
|
3207
3230
|
function buildTable(rows) {
|
|
@@ -3461,6 +3484,8 @@ var HEADING_RATIO_H2 = 1.3;
|
|
|
3461
3484
|
var HEADING_RATIO_H3 = 1.15;
|
|
3462
3485
|
|
|
3463
3486
|
// src/hwpx/parser.ts
|
|
3487
|
+
init_utils();
|
|
3488
|
+
init_utils();
|
|
3464
3489
|
init_page_range();
|
|
3465
3490
|
init_logger();
|
|
3466
3491
|
var MAX_DECOMPRESS_SIZE = 500 * 1024 * 1024;
|
|
@@ -4302,6 +4327,7 @@ function extractTextFromNode(node) {
|
|
|
4302
4327
|
}
|
|
4303
4328
|
|
|
4304
4329
|
// src/hwp5/record.ts
|
|
4330
|
+
init_utils();
|
|
4305
4331
|
import { inflateRawSync, inflateSync } from "zlib";
|
|
4306
4332
|
var TAG_PARA_HEADER = 66;
|
|
4307
4333
|
var TAG_PARA_TEXT = 67;
|
|
@@ -5352,6 +5378,7 @@ function parseLenientCfb(data) {
|
|
|
5352
5378
|
}
|
|
5353
5379
|
|
|
5354
5380
|
// src/hwp5/parser.ts
|
|
5381
|
+
init_utils();
|
|
5355
5382
|
init_page_range();
|
|
5356
5383
|
init_logger();
|
|
5357
5384
|
var CFB = __toESM(require_cfb(), 1);
|
|
@@ -6007,6 +6034,7 @@ function arrangeCells(rows, cols, cells) {
|
|
|
6007
6034
|
}
|
|
6008
6035
|
|
|
6009
6036
|
// src/pdf/parser.ts
|
|
6037
|
+
init_utils();
|
|
6010
6038
|
init_page_range();
|
|
6011
6039
|
import { createRequire } from "module";
|
|
6012
6040
|
import { dirname as dirname2, join as join3, resolve as resolve2 } from "path";
|
|
@@ -7898,6 +7926,7 @@ function mergeKoreanLines(text) {
|
|
|
7898
7926
|
}
|
|
7899
7927
|
|
|
7900
7928
|
// src/xlsx/parser.ts
|
|
7929
|
+
init_utils();
|
|
7901
7930
|
import JSZip3 from "jszip";
|
|
7902
7931
|
import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
|
|
7903
7932
|
init_logger();
|
|
@@ -8226,6 +8255,7 @@ async function parseXlsxDocument(buffer, options, existingZip) {
|
|
|
8226
8255
|
}
|
|
8227
8256
|
|
|
8228
8257
|
// src/docx/parser.ts
|
|
8258
|
+
init_utils();
|
|
8229
8259
|
import JSZip4 from "jszip";
|
|
8230
8260
|
import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
|
|
8231
8261
|
init_logger();
|
|
@@ -8707,6 +8737,7 @@ async function parseDocxDocument(buffer, options, existingZip) {
|
|
|
8707
8737
|
}
|
|
8708
8738
|
|
|
8709
8739
|
// src/index.ts
|
|
8740
|
+
init_utils();
|
|
8710
8741
|
init_cli_provider();
|
|
8711
8742
|
init_markdown_to_blocks();
|
|
8712
8743
|
init_logger();
|
|
@@ -11208,6 +11239,187 @@ async function markdownToXlsx(markdown, options) {
|
|
|
11208
11239
|
return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
|
|
11209
11240
|
}
|
|
11210
11241
|
|
|
11242
|
+
// src/convert/index.ts
|
|
11243
|
+
import { readFile } from "fs/promises";
|
|
11244
|
+
init_utils();
|
|
11245
|
+
|
|
11246
|
+
// src/convert/libreoffice.ts
|
|
11247
|
+
import libre from "libreoffice-convert";
|
|
11248
|
+
|
|
11249
|
+
// src/convert/error.ts
|
|
11250
|
+
var ConvertError = class extends Error {
|
|
11251
|
+
constructor(code, message) {
|
|
11252
|
+
super(message);
|
|
11253
|
+
this.code = code;
|
|
11254
|
+
this.name = "ConvertError";
|
|
11255
|
+
}
|
|
11256
|
+
};
|
|
11257
|
+
|
|
11258
|
+
// src/convert/libreoffice.ts
|
|
11259
|
+
var libreConvert = libre.convert;
|
|
11260
|
+
async function assertSofficeAvailable() {
|
|
11261
|
+
const { runCommand: runCommand2 } = await Promise.resolve().then(() => (init_utils(), utils_exports));
|
|
11262
|
+
try {
|
|
11263
|
+
await runCommand2("soffice", ["--version"]);
|
|
11264
|
+
} catch {
|
|
11265
|
+
throw new ConvertError(
|
|
11266
|
+
"SOFFICE_NOT_FOUND",
|
|
11267
|
+
"soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694."
|
|
11268
|
+
);
|
|
11269
|
+
}
|
|
11270
|
+
}
|
|
11271
|
+
async function convertBuffer(buffer, targetExt, timeoutMs = 6e4) {
|
|
11272
|
+
return new Promise((resolve4, reject) => {
|
|
11273
|
+
const timer = setTimeout(() => {
|
|
11274
|
+
reject(
|
|
11275
|
+
new ConvertError("TIMEOUT", `\uBCC0\uD658 \uD0C0\uC784\uC544\uC6C3 (${timeoutMs}ms \uCD08\uACFC)`)
|
|
11276
|
+
);
|
|
11277
|
+
}, timeoutMs);
|
|
11278
|
+
libreConvert(buffer, targetExt, void 0, (err, done) => {
|
|
11279
|
+
clearTimeout(timer);
|
|
11280
|
+
if (err || !done) {
|
|
11281
|
+
reject(
|
|
11282
|
+
new ConvertError(
|
|
11283
|
+
"CONVERT_FAILED",
|
|
11284
|
+
err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"
|
|
11285
|
+
)
|
|
11286
|
+
);
|
|
11287
|
+
return;
|
|
11288
|
+
}
|
|
11289
|
+
resolve4(done);
|
|
11290
|
+
});
|
|
11291
|
+
});
|
|
11292
|
+
}
|
|
11293
|
+
|
|
11294
|
+
// src/convert/index.ts
|
|
11295
|
+
var isConverting = false;
|
|
11296
|
+
var queue = [];
|
|
11297
|
+
async function acquireConvertLock() {
|
|
11298
|
+
if (!isConverting) {
|
|
11299
|
+
isConverting = true;
|
|
11300
|
+
return () => {
|
|
11301
|
+
isConverting = false;
|
|
11302
|
+
const next = queue.shift();
|
|
11303
|
+
next?.();
|
|
11304
|
+
};
|
|
11305
|
+
}
|
|
11306
|
+
return new Promise((resolve4) => {
|
|
11307
|
+
queue.push(() => {
|
|
11308
|
+
isConverting = true;
|
|
11309
|
+
resolve4(() => {
|
|
11310
|
+
isConverting = false;
|
|
11311
|
+
const next = queue.shift();
|
|
11312
|
+
next?.();
|
|
11313
|
+
});
|
|
11314
|
+
});
|
|
11315
|
+
});
|
|
11316
|
+
}
|
|
11317
|
+
async function convertToPdf(input, options) {
|
|
11318
|
+
let buffer;
|
|
11319
|
+
try {
|
|
11320
|
+
if (typeof input === "string") {
|
|
11321
|
+
buffer = await readFile(input);
|
|
11322
|
+
} else if (Buffer.isBuffer(input)) {
|
|
11323
|
+
buffer = input;
|
|
11324
|
+
} else {
|
|
11325
|
+
buffer = Buffer.from(input);
|
|
11326
|
+
}
|
|
11327
|
+
} catch (err) {
|
|
11328
|
+
return {
|
|
11329
|
+
success: false,
|
|
11330
|
+
code: "PARSE_ERROR",
|
|
11331
|
+
error: `\uC785\uB825 \uC77D\uAE30 \uC2E4\uD328: ${err instanceof Error ? err.message : String(err)}`,
|
|
11332
|
+
stage: "detect"
|
|
11333
|
+
};
|
|
11334
|
+
}
|
|
11335
|
+
const MAX_FILE_SIZE = 500 * 1024 * 1024;
|
|
11336
|
+
if (buffer.length > MAX_FILE_SIZE) {
|
|
11337
|
+
return {
|
|
11338
|
+
success: false,
|
|
11339
|
+
code: "FILE_TOO_LARGE",
|
|
11340
|
+
error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.length / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`,
|
|
11341
|
+
stage: "detect"
|
|
11342
|
+
};
|
|
11343
|
+
}
|
|
11344
|
+
const format = detectFormat(toArrayBuffer(buffer));
|
|
11345
|
+
if (format !== "hwp" && format !== "hwpx") {
|
|
11346
|
+
return {
|
|
11347
|
+
success: false,
|
|
11348
|
+
code: "UNSUPPORTED_FORMAT",
|
|
11349
|
+
error: `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uD3EC\uB9F7\uC785\uB2C8\uB2E4: ${format}`,
|
|
11350
|
+
stage: "detect"
|
|
11351
|
+
};
|
|
11352
|
+
}
|
|
11353
|
+
try {
|
|
11354
|
+
await assertSofficeAvailable();
|
|
11355
|
+
} catch (err) {
|
|
11356
|
+
if (err instanceof ConvertError) {
|
|
11357
|
+
return {
|
|
11358
|
+
success: false,
|
|
11359
|
+
code: err.code,
|
|
11360
|
+
error: err.message,
|
|
11361
|
+
stage: "validate"
|
|
11362
|
+
};
|
|
11363
|
+
}
|
|
11364
|
+
throw err;
|
|
11365
|
+
}
|
|
11366
|
+
const releaseLock = await acquireConvertLock();
|
|
11367
|
+
try {
|
|
11368
|
+
options?.onProgress?.(10, "convert");
|
|
11369
|
+
const pdf = await convertBuffer(buffer, ".pdf", options?.timeoutMs);
|
|
11370
|
+
options?.onProgress?.(100, "done");
|
|
11371
|
+
return {
|
|
11372
|
+
success: true,
|
|
11373
|
+
pdf: new Uint8Array(pdf),
|
|
11374
|
+
sourceFormat: format
|
|
11375
|
+
};
|
|
11376
|
+
} catch (err) {
|
|
11377
|
+
if (err instanceof ConvertError) {
|
|
11378
|
+
return {
|
|
11379
|
+
success: false,
|
|
11380
|
+
code: err.code,
|
|
11381
|
+
error: err.message,
|
|
11382
|
+
stage: "convert"
|
|
11383
|
+
};
|
|
11384
|
+
}
|
|
11385
|
+
return {
|
|
11386
|
+
success: false,
|
|
11387
|
+
code: classifyError(err),
|
|
11388
|
+
error: err instanceof Error ? err.message : "\uBCC0\uD658 \uC2E4\uD328",
|
|
11389
|
+
stage: "convert"
|
|
11390
|
+
};
|
|
11391
|
+
} finally {
|
|
11392
|
+
releaseLock();
|
|
11393
|
+
}
|
|
11394
|
+
}
|
|
11395
|
+
async function convertHwpToPdf(input, options) {
|
|
11396
|
+
const result = await convertToPdf(input, options);
|
|
11397
|
+
if (result.success && result.sourceFormat !== "hwp") {
|
|
11398
|
+
return {
|
|
11399
|
+
success: false,
|
|
11400
|
+
code: "UNSUPPORTED_FORMAT",
|
|
11401
|
+
error: `HWP 5.x \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${result.sourceFormat}`,
|
|
11402
|
+
stage: "detect"
|
|
11403
|
+
};
|
|
11404
|
+
}
|
|
11405
|
+
return result;
|
|
11406
|
+
}
|
|
11407
|
+
async function convertHwpxToPdf(input, options) {
|
|
11408
|
+
const result = await convertToPdf(input, options);
|
|
11409
|
+
if (result.success && result.sourceFormat !== "hwpx") {
|
|
11410
|
+
return {
|
|
11411
|
+
success: false,
|
|
11412
|
+
code: "UNSUPPORTED_FORMAT",
|
|
11413
|
+
error: `HWPX \uD3EC\uB9F7\uC774 \uC544\uB2D9\uB2C8\uB2E4: ${result.sourceFormat}`,
|
|
11414
|
+
stage: "detect"
|
|
11415
|
+
};
|
|
11416
|
+
}
|
|
11417
|
+
return result;
|
|
11418
|
+
}
|
|
11419
|
+
|
|
11420
|
+
// src/index.ts
|
|
11421
|
+
init_utils();
|
|
11422
|
+
|
|
11211
11423
|
// src/ocr/api-key-rotation.ts
|
|
11212
11424
|
var AllKeysCoolingDownError = class extends Error {
|
|
11213
11425
|
waitMs;
|
|
@@ -11302,11 +11514,10 @@ var ApiKeyRotationPool = class _ApiKeyRotationPool {
|
|
|
11302
11514
|
};
|
|
11303
11515
|
|
|
11304
11516
|
// src/pipeline/unified-ocr.ts
|
|
11305
|
-
import { mkdir, readdir, readFile, stat, writeFile } from "fs/promises";
|
|
11517
|
+
import { mkdir, readdir, readFile as readFile2, stat, writeFile } from "fs/promises";
|
|
11306
11518
|
import { basename as basename2, dirname as dirname3, extname, join as join4, resolve as resolve3 } from "path";
|
|
11307
11519
|
import { spawn as spawn2 } from "child_process";
|
|
11308
11520
|
import { performance } from "perf_hooks";
|
|
11309
|
-
import libre from "libreoffice-convert";
|
|
11310
11521
|
init_logger();
|
|
11311
11522
|
|
|
11312
11523
|
// src/pipeline/bounded-queue.ts
|
|
@@ -11368,7 +11579,6 @@ var BoundedQueue = class {
|
|
|
11368
11579
|
};
|
|
11369
11580
|
|
|
11370
11581
|
// src/pipeline/unified-ocr.ts
|
|
11371
|
-
var libreConvert = libre.convert;
|
|
11372
11582
|
var UnifiedOcrError = class extends Error {
|
|
11373
11583
|
code;
|
|
11374
11584
|
stage;
|
|
@@ -11483,8 +11693,8 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11483
11693
|
if (extname(absInput).toLowerCase() !== ".pdf") {
|
|
11484
11694
|
await assertSofficeAvailable();
|
|
11485
11695
|
workingPdfPath = join4(workspaceDir, `${stem}.pdf`);
|
|
11486
|
-
const inputBuffer = await
|
|
11487
|
-
const out = await
|
|
11696
|
+
const inputBuffer = await readFile2(absInput);
|
|
11697
|
+
const out = await convertBuffer(inputBuffer, ".pdf");
|
|
11488
11698
|
await writeFile(workingPdfPath, out);
|
|
11489
11699
|
}
|
|
11490
11700
|
timingsMs.convert = elapsedMs(convertStart);
|
|
@@ -11535,7 +11745,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11535
11745
|
const keyCount = keyPool.snapshot().length;
|
|
11536
11746
|
const workerCount = Math.max(1, keyCount * concurrencyPerKey);
|
|
11537
11747
|
const queueCapacity = workerCount * 2;
|
|
11538
|
-
const
|
|
11748
|
+
const queue2 = new BoundedQueue(queueCapacity);
|
|
11539
11749
|
const ocrStart = performance.now();
|
|
11540
11750
|
currentStage = "ocr";
|
|
11541
11751
|
markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (\uC6CC\uCEE4 ${workerCount}\uAC1C)`);
|
|
@@ -11543,17 +11753,17 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11543
11753
|
let renderDone = 1;
|
|
11544
11754
|
const renderProducer = (async () => {
|
|
11545
11755
|
try {
|
|
11546
|
-
await
|
|
11756
|
+
await queue2.enqueue({ pageNumber: 1, imagePath: probeImage });
|
|
11547
11757
|
if (totalPages > 1) {
|
|
11548
11758
|
for await (const item of renderPdfToPngStream(workingPdfPath, join4(imagesDir, "page"), dpi, totalPages, 2)) {
|
|
11549
|
-
await
|
|
11759
|
+
await queue2.enqueue(item);
|
|
11550
11760
|
renderDone++;
|
|
11551
11761
|
markStageProgress("render", Math.round(renderDone / totalPages * 100), renderDone, totalPages, `\uD398\uC774\uC9C0 ${renderDone}/${totalPages} \uB80C\uB354\uB9C1`);
|
|
11552
11762
|
logStage("debug", "render", "progress", "\uD398\uC774\uC9C0 \uB80C\uB354 \uC644\uB8CC", { page: item.pageNumber });
|
|
11553
11763
|
}
|
|
11554
11764
|
}
|
|
11555
11765
|
} finally {
|
|
11556
|
-
|
|
11766
|
+
queue2.close();
|
|
11557
11767
|
timingsMs.render = elapsedMs(renderStart);
|
|
11558
11768
|
markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
|
|
11559
11769
|
logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: renderDone, elapsedMs: timingsMs.render });
|
|
@@ -11562,7 +11772,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11562
11772
|
const [, pageResultsMap] = await Promise.all([
|
|
11563
11773
|
renderProducer,
|
|
11564
11774
|
ocrWorkerPool({
|
|
11565
|
-
queue,
|
|
11775
|
+
queue: queue2,
|
|
11566
11776
|
workerCount,
|
|
11567
11777
|
totalPages,
|
|
11568
11778
|
ocrInput: {
|
|
@@ -11682,17 +11892,6 @@ function emitProgress(cb, stage, stagePercent, weights, extra) {
|
|
|
11682
11892
|
model: extra.model
|
|
11683
11893
|
});
|
|
11684
11894
|
}
|
|
11685
|
-
async function convertWithLibreOffice(buffer, ext) {
|
|
11686
|
-
return await new Promise((resolvePromise, reject) => {
|
|
11687
|
-
libreConvert(buffer, ext, void 0, (err, done) => {
|
|
11688
|
-
if (err || !done) {
|
|
11689
|
-
reject(new UnifiedOcrError("CONVERT_FAILED", "convert", err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"));
|
|
11690
|
-
return;
|
|
11691
|
-
}
|
|
11692
|
-
resolvePromise(done);
|
|
11693
|
-
});
|
|
11694
|
-
});
|
|
11695
|
-
}
|
|
11696
11895
|
async function getPdfPageCount(pdfPath) {
|
|
11697
11896
|
const stdout = await runCommandWithStdout("pdfinfo", [pdfPath]);
|
|
11698
11897
|
const m = stdout.match(/^\s*Pages:\s*(\d+)\s*$/mi);
|
|
@@ -11765,13 +11964,6 @@ async function runCommandWithStdout(cmd, args) {
|
|
|
11765
11964
|
});
|
|
11766
11965
|
});
|
|
11767
11966
|
}
|
|
11768
|
-
async function assertSofficeAvailable() {
|
|
11769
|
-
try {
|
|
11770
|
-
await runCommand("soffice", ["--version"]);
|
|
11771
|
-
} catch {
|
|
11772
|
-
throw new UnifiedOcrError("SOFFICE_NOT_FOUND", "convert", "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.");
|
|
11773
|
-
}
|
|
11774
|
-
}
|
|
11775
11967
|
function naturalPageSort(a, b) {
|
|
11776
11968
|
const na = Number((a.match(/\d+/g) || ["0"]).at(-1) || 0);
|
|
11777
11969
|
const nb = Number((b.match(/\d+/g) || ["0"]).at(-1) || 0);
|
|
@@ -11845,7 +12037,7 @@ function startParallelProbeRuns(input) {
|
|
|
11845
12037
|
}
|
|
11846
12038
|
async function loadModelCache(path) {
|
|
11847
12039
|
try {
|
|
11848
|
-
const raw = await
|
|
12040
|
+
const raw = await readFile2(path, "utf-8");
|
|
11849
12041
|
return JSON.parse(raw);
|
|
11850
12042
|
} catch {
|
|
11851
12043
|
return null;
|
|
@@ -11879,12 +12071,12 @@ async function updateModelCache(path, probes) {
|
|
|
11879
12071
|
await writeFile(path, JSON.stringify(current, null, 2), "utf-8");
|
|
11880
12072
|
}
|
|
11881
12073
|
async function ocrWorkerPool(input) {
|
|
11882
|
-
const { queue, workerCount, ocrInput, onPageDone } = input;
|
|
12074
|
+
const { queue: queue2, workerCount, ocrInput, onPageDone } = input;
|
|
11883
12075
|
const results = /* @__PURE__ */ new Map();
|
|
11884
12076
|
let completedCount = 0;
|
|
11885
12077
|
async function worker() {
|
|
11886
12078
|
while (true) {
|
|
11887
|
-
const item = await
|
|
12079
|
+
const item = await queue2.dequeue();
|
|
11888
12080
|
if (item === QUEUE_DONE) break;
|
|
11889
12081
|
const { pageNumber, imagePath, error } = item;
|
|
11890
12082
|
if (imagePath === null) {
|
|
@@ -11936,7 +12128,7 @@ async function ocrImageWithFallback(input) {
|
|
|
11936
12128
|
async function mergeMarkdownPages(paths) {
|
|
11937
12129
|
const out = [];
|
|
11938
12130
|
for (let i = 0; i < paths.length; i++) {
|
|
11939
|
-
const txt = (await
|
|
12131
|
+
const txt = (await readFile2(paths[i], "utf-8")).trim();
|
|
11940
12132
|
if (!txt) continue;
|
|
11941
12133
|
out.push(txt);
|
|
11942
12134
|
}
|
|
@@ -12052,7 +12244,7 @@ async function ocrImageViaNim(input) {
|
|
|
12052
12244
|
throw new UnifiedOcrError("OCR_FAILED", "ocr", `OCR \uC7AC\uC2DC\uB3C4 \uCD08\uACFC: ${lastErr}`);
|
|
12053
12245
|
}
|
|
12054
12246
|
async function encodeBase64(path) {
|
|
12055
|
-
const b = await
|
|
12247
|
+
const b = await readFile2(path);
|
|
12056
12248
|
return b.toString("base64");
|
|
12057
12249
|
}
|
|
12058
12250
|
function stripCodeFence3(text) {
|
|
@@ -12091,7 +12283,7 @@ async function parse2(input, options) {
|
|
|
12091
12283
|
let buffer;
|
|
12092
12284
|
if (typeof input === "string") {
|
|
12093
12285
|
try {
|
|
12094
|
-
const buf = await
|
|
12286
|
+
const buf = await readFile3(input);
|
|
12095
12287
|
buffer = toArrayBuffer(buf);
|
|
12096
12288
|
} catch (err) {
|
|
12097
12289
|
const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
|
|
@@ -12250,6 +12442,9 @@ export {
|
|
|
12250
12442
|
VERSION,
|
|
12251
12443
|
blocksToMarkdown,
|
|
12252
12444
|
compare,
|
|
12445
|
+
convertHwpToPdf,
|
|
12446
|
+
convertHwpxToPdf,
|
|
12447
|
+
convertToPdf,
|
|
12253
12448
|
detectFormat,
|
|
12254
12449
|
detectZipFormat,
|
|
12255
12450
|
diffBlocks,
|