@clazic/kordoc 2.4.11 → 2.4.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -0
- package/dist/{chunk-PJSXZBZB.js → chunk-5R37N6KE.js} +19 -4
- package/dist/chunk-5R37N6KE.js.map +1 -0
- package/dist/chunk-I6YC6ZGK.js +219 -0
- package/dist/chunk-I6YC6ZGK.js.map +1 -0
- package/dist/{chunk-JGMLDBW5.js → chunk-KJEZPVEK.js} +680 -301
- package/dist/chunk-KJEZPVEK.js.map +1 -0
- package/dist/cli.js +68 -8
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1678 -329
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +121 -1
- package/dist/index.d.ts +121 -1
- package/dist/index.js +1656 -310
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +11 -2
- package/dist/mcp.js.map +1 -1
- package/dist/{provider-PYZL2VNN.js → provider-T2D5XRTI.js} +30 -2
- package/dist/provider-T2D5XRTI.js.map +1 -0
- package/dist/{resolve-4I65IGMM.js → resolve-673XFZQ6.js} +18 -1
- package/dist/resolve-673XFZQ6.js.map +1 -0
- package/dist/{utils-HKVOS2O3.js → utils-XLLXVB7V.js} +4 -2
- package/dist/{watch-EYOGF3HY.js → watch-SOMS2KR7.js} +4 -3
- package/dist/{watch-EYOGF3HY.js.map → watch-SOMS2KR7.js.map} +1 -1
- package/package.json +2 -1
- package/dist/chunk-JGMLDBW5.js.map +0 -1
- package/dist/chunk-PJSXZBZB.js.map +0 -1
- package/dist/provider-PYZL2VNN.js.map +0 -1
- package/dist/resolve-4I65IGMM.js.map +0 -1
- /package/dist/{utils-HKVOS2O3.js.map → utils-XLLXVB7V.js.map} +0 -0
package/dist/index.cjs
CHANGED
|
@@ -71,6 +71,224 @@ var init_page_range = __esm({
|
|
|
71
71
|
}
|
|
72
72
|
});
|
|
73
73
|
|
|
74
|
+
// src/logging/logger.ts
|
|
75
|
+
/**
 * Builds the logger from environment variables.
 *
 * Variables read:
 * - KORDOC_LOG_LEVEL: verbosity, parsed by parseLevel.
 * - KORDOC_LOG_STACK: "1" keeps error stacks in emitted records.
 * - KORDOC_LOG_FILE: when set, a JSONL file sink is added at the resolved path.
 * - KORDOC_LOG_PROGRESS_SAMPLE_MS: progress sampling window (default 1000).
 * - KORDOC_LOG_BASENAME_PATHS: "1" reduces path-like meta values to basenames.
 * - KORDOC_LOG_TEXT_LIMIT: maximum text length before truncation (default 400).
 *
 * @param {Record<string, string | undefined>} [env=process.env] Environment to read.
 * @returns {CompositeLogger} Logger fanning out to the console sink and, if
 *   configured, the JSONL file sink.
 */
function createLoggerFromEnv(env = process.env) {
  const logFile = env.KORDOC_LOG_FILE;
  const config = {
    level: parseLevel(env.KORDOC_LOG_LEVEL),
    includeStack: env.KORDOC_LOG_STACK === "1",
    progressSampleMs: parsePositiveInt(env.KORDOC_LOG_PROGRESS_SAMPLE_MS, 1e3),
    basenamePaths: env.KORDOC_LOG_BASENAME_PATHS === "1",
    textLimit: parsePositiveInt(env.KORDOC_LOG_TEXT_LIMIT, 400)
  };
  // The console sink is always present; the file sink is opt-in.
  const sinks = [new ConsoleLogger(config)];
  if (logFile) {
    sinks.push(new JsonlLogger(config, (0, import_path.resolve)(logFile)));
  }
  return new CompositeLogger(config, sinks);
}
|
|
91
|
+
/**
 * Creates a short run identifier of the form `<prefix>_<8 chars>`, where the
 * suffix is the first eight characters of a random UUID.
 *
 * @param {string} [prefix="run"] Label placed before the underscore.
 * @returns {string} The generated identifier.
 */
function generateRunId(prefix = "run") {
  const uuid = (0, import_crypto.randomUUID)();
  const shortId = uuid.slice(0, 8);
  return `${prefix}_${shortId}`;
}
|
|
94
|
+
/**
 * Normalizes a log level name, case-insensitively.
 *
 * @param {string | undefined} input Raw level.
 * @returns {"error" | "warn" | "info" | "debug" | "trace"} The lowercased level,
 *   or "error" when the input is missing or not one of the known names.
 */
function parseLevel(input) {
  const normalized = (input || "").toLowerCase();
  switch (normalized) {
    case "error":
    case "warn":
    case "info":
    case "debug":
    case "trace":
      return normalized;
    default:
      return "error";
  }
}
|
|
99
|
+
/**
 * Replaces credentials embedded in free text with a fixed placeholder.
 *
 * Two shapes are masked:
 * - `nvapi-...` keys become `nvapi-***`.
 * - `Bearer <token>` (any letter case) becomes `Bearer ***`.
 *
 * The Bearer pattern accepts the full RFC 6750 token alphabet
 * (letters, digits, `-`, `.`, `_`, `~`, `+`, `/`, plus trailing `=` padding),
 * so base64-encoded tokens are masked in full rather than up to the first
 * `+`, `/` or `=`.
 *
 * @param {string} input Text that may contain secrets.
 * @returns {string} The text with secrets masked.
 */
function maskSecrets(input) {
  return input
    .replace(/nvapi-[A-Za-z0-9_\-]+/g, "nvapi-***")
    .replace(/Bearer\s+[A-Za-z0-9\-._~+\/]+=*/gi, "Bearer ***");
}
|
|
102
|
+
/**
 * Produces a log-safe shallow copy of an event's `meta` object.
 *
 * Per top-level entry:
 * - keys that look like credentials (authorization / api key / token) have
 *   their value replaced with "***";
 * - string values are passed through maskSecrets, optionally reduced to a
 *   basename when `cfg.basenamePaths` is on and the key looks path-like, and
 *   then truncated to `cfg.textLimit`;
 * - every other value is copied as-is (nested objects are not inspected).
 *
 * @param {Record<string, unknown>} meta Raw metadata.
 * @param {{ basenamePaths: boolean, textLimit: number }} cfg Logger config.
 * @returns {Record<string, unknown>} New object; `meta` itself is not modified.
 */
function sanitizeMeta(meta, cfg) {
  const secretKeyPattern = /authorization|api[_-]?key|token/i;
  const pathKeyPattern = /path|file|dir/i;
  const sanitized = {};
  Object.entries(meta).forEach(([key, value]) => {
    if (secretKeyPattern.test(key)) {
      sanitized[key] = "***";
      return;
    }
    if (typeof value !== "string") {
      sanitized[key] = value;
      return;
    }
    const masked = maskSecrets(value);
    const shortenPath = cfg.basenamePaths && pathKeyPattern.test(key);
    const text = shortenPath ? (0, import_path.basename)(masked) : masked;
    sanitized[key] = limitText(text, cfg.textLimit);
  });
  return sanitized;
}
|
|
121
|
+
/**
 * Parses a non-negative integer setting, typically from an environment variable.
 *
 * Zero is accepted on purpose: the logger treats 0 as "disabled" for both the
 * progress sampling window and the text limit. Missing, blank, non-numeric,
 * non-finite and negative inputs yield `fallback`. Blank strings and `null`
 * are checked explicitly because `Number("")` and `Number(null)` are both 0,
 * which would otherwise turn an empty variable into "disabled".
 *
 * @param {string | number | null | undefined} input Raw value.
 * @param {number} fallback Value returned when `input` is unusable.
 * @returns {number} The parsed value rounded down, or `fallback`.
 */
function parsePositiveInt(input, fallback) {
  if (input === void 0 || input === null) return fallback;
  if (typeof input === "string" && input.trim() === "") return fallback;
  const parsed = Number(input);
  if (!Number.isFinite(parsed) || parsed < 0) return fallback;
  return Math.floor(parsed);
}
|
|
126
|
+
/**
 * Truncates text to at most `maxLen` characters, appending a marker that
 * states how many characters were cut, e.g. `hello...(+6)`.
 *
 * @param {string} input Text to shorten.
 * @param {number} maxLen Maximum kept length; zero or less disables truncation.
 * @returns {string} `input` unchanged when it fits (or truncation is disabled),
 *   otherwise the kept prefix followed by `...(+N)`.
 */
function limitText(input, maxLen) {
  const unlimited = maxLen <= 0;
  if (unlimited || input.length <= maxLen) return input;
  const kept = input.slice(0, maxLen);
  const dropped = input.length - maxLen;
  return `${kept}...(+${dropped})`;
}
|
|
131
|
+
var import_fs, import_promises, import_path, import_crypto, LEVEL_ORDER, BaseLogger, ConsoleLogger, JsonlLogger, CompositeLogger;
// Lazy initializer for src/logging/logger.ts: loads the Node built-ins the
// logger needs and defines the logger classes on first use.
var init_logger = __esm({
  "src/logging/logger.ts"() {
    "use strict";
    import_fs = require("fs");
    import_promises = require("fs/promises");
    import_path = require("path");
    import_crypto = require("crypto");
    // Lower rank = more severe. An event is emitted when its rank is at or
    // below the rank of the configured level.
    LEVEL_ORDER = {
      error: 0,
      warn: 1,
      info: 2,
      debug: 3,
      trace: 4
    };
    /**
     * Shared behaviour for all loggers: level filtering, progress sampling and
     * event normalization. `log` is a no-op here; sinks override it.
     */
    BaseLogger = class _BaseLogger {
      /**
       * @param {object} config Logger configuration (level, includeStack,
       *   progressSampleMs, basenamePaths, textLimit).
       * @param {object} [context={}] Fields merged into every emitted event.
       */
      constructor(config, context = {}) {
        this.config = config;
        this.context = context;
      }
      // Last emission time per sampling key, shared by every logger instance.
      static progressSeenAt = /* @__PURE__ */ new Map();
      /**
       * Namespace for this logger's sampling keys. Each sink kind overrides it
       * so that one sink recording an event does not make another sink treat
       * the very same event as a repeat and drop it.
       */
      progressScope() {
        return "base";
      }
      /** True when `level` is at least as severe as the configured level. */
      shouldLog(level) {
        return LEVEL_ORDER[level] <= LEVEL_ORDER[this.config.level];
      }
      /**
       * Rate-limits "progress" events to one per `progressSampleMs` per
       * scope/run/component/stage/message. Non-progress events, warn/error
       * events, and a non-positive sample window always pass.
       */
      shouldEmitProgress(ev) {
        if (this.config.progressSampleMs <= 0) return true;
        if ((ev.event ?? "message") !== "progress") return true;
        if (ev.level === "error" || ev.level === "warn") return true;
        const key = [
          this.progressScope(),
          this.context.runId ?? ev.runId ?? "no-run",
          this.context.component ?? ev.component ?? "no-component",
          this.context.stage ?? ev.stage ?? "unknown",
          ev.message
        ].join("|");
        const now = Date.now();
        const prev = _BaseLogger.progressSeenAt.get(key) ?? 0;
        if (now - prev < this.config.progressSampleMs) return false;
        _BaseLogger.progressSeenAt.set(key, now);
        return true;
      }
      /**
       * Combines the logger context with the event into the record that is
       * actually written: adds a timestamp, fills default stage/event, masks
       * secrets, sanitizes meta and truncates the message.
       */
      merge(ev) {
        const out = {
          ...this.context,
          ...ev,
          ts: (/* @__PURE__ */ new Date()).toISOString(),
          level: ev.level,
          stage: ev.stage ?? this.context.stage ?? "unknown",
          event: ev.event ?? "message",
          message: ev.message
        };
        if (out.error) {
          // Work on a copy so the caller's error object is never modified.
          // name/message/stack are read explicitly because they are not
          // copied by spread when the caller passes a real Error instance.
          const source = out.error;
          const error = { ...source };
          if (source.name !== void 0) error.name = source.name;
          if (typeof source.message === "string") error.message = maskSecrets(source.message);
          // The stack repeats the message, so it is masked as well.
          error.stack = this.config.includeStack && typeof source.stack === "string" ? maskSecrets(source.stack) : void 0;
          out.error = error;
        }
        if (out.meta) out.meta = sanitizeMeta(out.meta, this.config);
        if (out.message) out.message = limitText(maskSecrets(out.message), this.config.textLimit);
        return out;
      }
      /** Returns a logger whose context is this context extended by `context`. */
      child(context) {
        return new _BaseLogger(this.config, { ...this.context, ...context });
      }
      /** Shorthand for `child({ runId })`. */
      withRun(runId) {
        return this.child({ runId });
      }
      // eslint-disable-next-line @typescript-eslint/no-unused-vars
      log(event) {
      }
    };
    /**
     * Human-readable sink: error events go to stderr (followed by the stack
     * when present), everything else goes to stdout.
     */
    ConsoleLogger = class _ConsoleLogger extends BaseLogger {
      progressScope() {
        return "console";
      }
      log(event) {
        if (!this.shouldLog(event.level)) return;
        if (!this.shouldEmitProgress(event)) return;
        const e = this.merge(event);
        const prefix = `[${e.ts}] [${e.level.toUpperCase()}]${e.runId ? ` [${e.runId}]` : ""}${e.stage ? ` [${e.stage}]` : ""}`;
        const line = `${prefix} ${e.message}${e.component ? ` (${e.component})` : ""}`;
        if (e.level === "error") {
          process.stderr.write(line + "\n");
          if (e.error?.stack) process.stderr.write(e.error.stack + "\n");
        } else {
          process.stdout.write(line + "\n");
        }
      }
      // Must return a ConsoleLogger: the inherited child() builds a plain
      // BaseLogger, whose log() discards everything.
      child(context) {
        return new _ConsoleLogger(this.config, { ...this.context, ...context });
      }
    };
    /**
     * File sink writing one JSON object per line. Lines are queued per file
     * path and appended in batches roughly every 200 ms; whatever is still
     * queued when the process ends is appended synchronously.
     */
    JsonlLogger = class _JsonlLogger extends BaseLogger {
      /**
       * @param {object} config Logger configuration.
       * @param {string} filePath Destination file; its directory is created.
       * @param {object} [context={}] Fields merged into every emitted event.
       */
      constructor(config, filePath, context = {}) {
        super(config, context);
        this.filePath = filePath;
        (0, import_fs.mkdirSync)((0, import_path.dirname)(filePath), { recursive: true });
        _JsonlLogger.ensureState(filePath);
      }
      // One queue per file path, shared by all loggers (and children) that
      // target the same file.
      static states = /* @__PURE__ */ new Map();
      /**
       * Returns the queue state for `path`, creating it and registering the
       * process-exit flush hooks the first time the path is seen.
       */
      static ensureState(path) {
        let state = _JsonlLogger.states.get(path);
        if (!state) {
          state = { queue: [], flushing: false };
          _JsonlLogger.states.set(path, state);
          const flushSync = () => {
            const s = _JsonlLogger.states.get(path);
            if (!s || s.queue.length === 0) return;
            const payload = s.queue.join("");
            s.queue = [];
            if (!payload) return;
            try {
              (0, import_fs.appendFileSync)(path, payload, "utf-8");
            } catch (err) {
              // Never throw out of an exit hook because the log file is unwritable.
              _JsonlLogger.reportWriteFailure(s, path, err);
            }
          };
          process.on("beforeExit", flushSync);
          process.on("exit", flushSync);
        }
        return state;
      }
      /**
       * Reports a failed log write on stderr, once per file, so that a broken
       * log destination neither crashes the process nor floods the terminal.
       */
      static reportWriteFailure(state, path, err) {
        if (state.writeFailureReported) return;
        state.writeFailureReported = true;
        const reason = err instanceof Error ? err.message : String(err);
        try {
          process.stderr.write(`[kordoc] log file write failed (${path}): ${reason}\n`);
        } catch {
          // stderr itself is unusable; nothing further can be reported.
        }
      }
      progressScope() {
        return `jsonl:${this.filePath}`;
      }
      /** Arms the 200 ms batch timer unless one is pending or a flush is running. */
      scheduleFlush(path) {
        const state = _JsonlLogger.ensureState(path);
        if (state.timer || state.flushing) return;
        state.timer = setTimeout(() => {
          state.timer = void 0;
          void this.flush(path);
        }, 200);
      }
      /**
       * Appends everything currently queued for `path`. Write errors are
       * reported through reportWriteFailure instead of rejecting, because the
       * timer calls this without a rejection handler. A batch that fails to
       * write is dropped.
       */
      async flush(path) {
        const state = _JsonlLogger.ensureState(path);
        if (state.flushing) return;
        if (state.queue.length === 0) return;
        state.flushing = true;
        const payload = state.queue.join("");
        state.queue = [];
        try {
          await (0, import_promises.appendFile)(path, payload, "utf-8");
        } catch (err) {
          _JsonlLogger.reportWriteFailure(state, path, err);
        } finally {
          state.flushing = false;
          // Lines queued while the write was in flight need another pass.
          if (state.queue.length > 0) this.scheduleFlush(path);
        }
      }
      log(event) {
        if (!this.shouldLog(event.level)) return;
        if (!this.shouldEmitProgress(event)) return;
        const e = this.merge(event);
        let line;
        try {
          line = JSON.stringify(e);
        } catch {
          // meta/context could not be serialized (e.g. circular reference or
          // BigInt); keep the core fields rather than throwing into the caller.
          line = JSON.stringify({
            ts: e.ts,
            level: e.level,
            stage: e.stage,
            event: e.event,
            message: e.message,
            serializationFailed: true
          });
        }
        const state = _JsonlLogger.ensureState(this.filePath);
        state.queue.push(line + "\n");
        this.scheduleFlush(this.filePath);
      }
      child(context) {
        return new _JsonlLogger(this.config, this.filePath, { ...this.context, ...context });
      }
    };
    /**
     * Fans each event out to a list of sinks. Progress sampling is left to the
     * sinks: sampling here as well would record the event in the shared map
     * first and cause the sinks to reject it as a repeat.
     */
    CompositeLogger = class _CompositeLogger extends BaseLogger {
      /**
       * @param {object} config Logger configuration.
       * @param {BaseLogger[]} sinks Loggers that receive every accepted event.
       * @param {object} [context={}] Context recorded on the composite itself.
       */
      constructor(config, sinks, context = {}) {
        super(config, context);
        this.sinks = sinks;
      }
      log(event) {
        if (!this.shouldLog(event.level)) return;
        for (const sink of this.sinks) sink.log(event);
      }
      // Children are built sink by sink so each sink carries the new context.
      child(context) {
        const nextSinks = this.sinks.map((s) => s.child(context));
        return new _CompositeLogger(this.config, nextSinks, { ...this.context, ...context });
      }
    };
  }
});
|
|
291
|
+
|
|
74
292
|
// node_modules/cfb/cfb.js
|
|
75
293
|
var require_cfb = __commonJS({
|
|
76
294
|
"node_modules/cfb/cfb.js"(exports2, module2) {
|
|
@@ -390,8 +608,8 @@ var require_cfb = __commonJS({
|
|
|
390
608
|
}
|
|
391
609
|
return L.length - R.length;
|
|
392
610
|
}
|
|
393
|
-
function
|
|
394
|
-
if (p.charAt(p.length - 1) == "/") return p.slice(0, -1).indexOf("/") === -1 ? p :
|
|
611
|
+
function dirname4(p) {
|
|
612
|
+
if (p.charAt(p.length - 1) == "/") return p.slice(0, -1).indexOf("/") === -1 ? p : dirname4(p.slice(0, -1));
|
|
395
613
|
var c = p.lastIndexOf("/");
|
|
396
614
|
return c === -1 ? p : p.slice(0, c + 1);
|
|
397
615
|
}
|
|
@@ -812,10 +1030,10 @@ var require_cfb = __commonJS({
|
|
|
812
1030
|
data.push([cfb.FullPaths[i2], cfb.FileIndex[i2]]);
|
|
813
1031
|
}
|
|
814
1032
|
for (i2 = 0; i2 < data.length; ++i2) {
|
|
815
|
-
var dad =
|
|
1033
|
+
var dad = dirname4(data[i2][0]);
|
|
816
1034
|
s = fullPaths[dad];
|
|
817
1035
|
while (!s) {
|
|
818
|
-
while (
|
|
1036
|
+
while (dirname4(dad) && !fullPaths[dirname4(dad)]) dad = dirname4(dad);
|
|
819
1037
|
data.push([dad, {
|
|
820
1038
|
name: filename(dad).replace("/", ""),
|
|
821
1039
|
type: 1,
|
|
@@ -825,7 +1043,7 @@ var require_cfb = __commonJS({
|
|
|
825
1043
|
content: null
|
|
826
1044
|
}]);
|
|
827
1045
|
fullPaths[dad] = true;
|
|
828
|
-
dad =
|
|
1046
|
+
dad = dirname4(data[i2][0]);
|
|
829
1047
|
s = fullPaths[dad];
|
|
830
1048
|
}
|
|
831
1049
|
}
|
|
@@ -851,13 +1069,13 @@ var require_cfb = __commonJS({
|
|
|
851
1069
|
elt.size = 0;
|
|
852
1070
|
elt.type = 5;
|
|
853
1071
|
} else if (nm.slice(-1) == "/") {
|
|
854
|
-
for (j = i2 + 1; j < data.length; ++j) if (
|
|
1072
|
+
for (j = i2 + 1; j < data.length; ++j) if (dirname4(cfb.FullPaths[j]) == nm) break;
|
|
855
1073
|
elt.C = j >= data.length ? -1 : j;
|
|
856
|
-
for (j = i2 + 1; j < data.length; ++j) if (
|
|
1074
|
+
for (j = i2 + 1; j < data.length; ++j) if (dirname4(cfb.FullPaths[j]) == dirname4(nm)) break;
|
|
857
1075
|
elt.R = j >= data.length ? -1 : j;
|
|
858
1076
|
elt.type = 1;
|
|
859
1077
|
} else {
|
|
860
|
-
if (
|
|
1078
|
+
if (dirname4(cfb.FullPaths[i2 + 1] || "") == dirname4(nm)) elt.R = i2 + 1;
|
|
861
1079
|
elt.type = 2;
|
|
862
1080
|
}
|
|
863
1081
|
}
|
|
@@ -2026,16 +2244,16 @@ var init_auto_detect = __esm({
|
|
|
2026
2244
|
// src/ocr/cli-provider.ts
|
|
2027
2245
|
function getTempDir() {
|
|
2028
2246
|
if (!_tempDir) {
|
|
2029
|
-
_tempDir = (0,
|
|
2030
|
-
(0,
|
|
2247
|
+
_tempDir = (0, import_path2.join)(process.cwd(), ".kordoc_ocr_tmp");
|
|
2248
|
+
(0, import_fs2.mkdirSync)(_tempDir, { recursive: true });
|
|
2031
2249
|
}
|
|
2032
2250
|
return _tempDir;
|
|
2033
2251
|
}
|
|
2034
2252
|
function createCliOcrProvider(mode) {
|
|
2035
2253
|
return async (pageImage, pageNumber) => {
|
|
2036
|
-
const tempPath = (0,
|
|
2254
|
+
const tempPath = (0, import_path2.join)(getTempDir(), `page-${pageNumber}.png`);
|
|
2037
2255
|
try {
|
|
2038
|
-
(0,
|
|
2256
|
+
(0, import_fs2.writeFileSync)(tempPath, pageImage);
|
|
2039
2257
|
let output;
|
|
2040
2258
|
if (mode === "ollama") {
|
|
2041
2259
|
output = await callOllamaApi(tempPath);
|
|
@@ -2045,7 +2263,7 @@ function createCliOcrProvider(mode) {
|
|
|
2045
2263
|
return { markdown: stripCodeFence(output.trim()) };
|
|
2046
2264
|
} finally {
|
|
2047
2265
|
try {
|
|
2048
|
-
(0,
|
|
2266
|
+
(0, import_fs2.unlinkSync)(tempPath);
|
|
2049
2267
|
} catch {
|
|
2050
2268
|
}
|
|
2051
2269
|
}
|
|
@@ -2082,7 +2300,7 @@ function callCli(mode, imagePath) {
|
|
|
2082
2300
|
return output;
|
|
2083
2301
|
}
|
|
2084
2302
|
function callCodexCli(imagePath) {
|
|
2085
|
-
const outPath = (0,
|
|
2303
|
+
const outPath = (0, import_path2.join)((0, import_os.tmpdir)(), `kordoc-codex-out-${Date.now()}.txt`);
|
|
2086
2304
|
try {
|
|
2087
2305
|
const args = ["exec", OCR_PROMPT, "--image", imagePath, "--output-last-message", outPath];
|
|
2088
2306
|
const model = process.env.KORDOC_CODEX_MODEL;
|
|
@@ -2104,7 +2322,7 @@ function callCodexCli(imagePath) {
|
|
|
2104
2322
|
}
|
|
2105
2323
|
let text;
|
|
2106
2324
|
try {
|
|
2107
|
-
text = (0,
|
|
2325
|
+
text = (0, import_fs2.readFileSync)(outPath, "utf-8");
|
|
2108
2326
|
} catch {
|
|
2109
2327
|
text = result.stdout || "";
|
|
2110
2328
|
}
|
|
@@ -2112,7 +2330,7 @@ function callCodexCli(imagePath) {
|
|
|
2112
2330
|
return text;
|
|
2113
2331
|
} finally {
|
|
2114
2332
|
try {
|
|
2115
|
-
(0,
|
|
2333
|
+
(0, import_fs2.unlinkSync)(outPath);
|
|
2116
2334
|
} catch {
|
|
2117
2335
|
}
|
|
2118
2336
|
}
|
|
@@ -2169,13 +2387,13 @@ function stripCodeFence(text) {
|
|
|
2169
2387
|
const match = text.match(/^```(?:markdown|md)?\s*([\s\S]*?)```\s*$/m);
|
|
2170
2388
|
return match ? match[1].trim() : text;
|
|
2171
2389
|
}
|
|
2172
|
-
var import_child_process2,
|
|
2390
|
+
var import_child_process2, import_fs2, import_path2, import_os, OCR_PROMPT, _tempDir;
|
|
2173
2391
|
var init_cli_provider = __esm({
|
|
2174
2392
|
"src/ocr/cli-provider.ts"() {
|
|
2175
2393
|
"use strict";
|
|
2176
2394
|
import_child_process2 = require("child_process");
|
|
2177
|
-
|
|
2178
|
-
|
|
2395
|
+
import_fs2 = require("fs");
|
|
2396
|
+
import_path2 = require("path");
|
|
2179
2397
|
import_os = require("os");
|
|
2180
2398
|
OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.
|
|
2181
2399
|
\uADDC\uCE59:
|
|
@@ -2219,7 +2437,7 @@ async function createTesseractPoolProvider(concurrency) {
|
|
|
2219
2437
|
const waitQueue = [];
|
|
2220
2438
|
function acquire() {
|
|
2221
2439
|
if (idle.length > 0) return Promise.resolve(idle.pop());
|
|
2222
|
-
return new Promise((
|
|
2440
|
+
return new Promise((resolve4) => waitQueue.push(resolve4));
|
|
2223
2441
|
}
|
|
2224
2442
|
function release(w) {
|
|
2225
2443
|
if (waitQueue.length > 0) {
|
|
@@ -2258,8 +2476,8 @@ __export(batch_provider_exports, {
|
|
|
2258
2476
|
});
|
|
2259
2477
|
function getBatchTempDir() {
|
|
2260
2478
|
if (!_batchTempDir) {
|
|
2261
|
-
_batchTempDir = (0,
|
|
2262
|
-
(0,
|
|
2479
|
+
_batchTempDir = (0, import_path3.join)(process.cwd(), ".kordoc_ocr_tmp");
|
|
2480
|
+
(0, import_fs3.mkdirSync)(_batchTempDir, { recursive: true });
|
|
2263
2481
|
if (process.platform === "win32") {
|
|
2264
2482
|
try {
|
|
2265
2483
|
(0, import_child_process3.execSync)(`attrib +h "${_batchTempDir}"`, { stdio: "ignore" });
|
|
@@ -2279,8 +2497,8 @@ function createBatchCliProvider(mode, batchSize) {
|
|
|
2279
2497
|
const tempFiles = [];
|
|
2280
2498
|
try {
|
|
2281
2499
|
for (const { image, pageNum } of pages) {
|
|
2282
|
-
const path = (0,
|
|
2283
|
-
(0,
|
|
2500
|
+
const path = (0, import_path3.join)(tempDir, `batch-p${pageNum}.png`);
|
|
2501
|
+
(0, import_fs3.writeFileSync)(path, image);
|
|
2284
2502
|
tempFiles.push(path);
|
|
2285
2503
|
}
|
|
2286
2504
|
let output;
|
|
@@ -2300,7 +2518,7 @@ function createBatchCliProvider(mode, batchSize) {
|
|
|
2300
2518
|
} finally {
|
|
2301
2519
|
for (const f of tempFiles) {
|
|
2302
2520
|
try {
|
|
2303
|
-
(0,
|
|
2521
|
+
(0, import_fs3.unlinkSync)(f);
|
|
2304
2522
|
} catch {
|
|
2305
2523
|
}
|
|
2306
2524
|
}
|
|
@@ -2310,7 +2528,7 @@ function createBatchCliProvider(mode, batchSize) {
|
|
|
2310
2528
|
};
|
|
2311
2529
|
}
|
|
2312
2530
|
function spawnAsync(cmd, args, opts) {
|
|
2313
|
-
return new Promise((
|
|
2531
|
+
return new Promise((resolve4, reject) => {
|
|
2314
2532
|
const child = (0, import_child_process3.spawn)(cmd, args, {
|
|
2315
2533
|
cwd: opts.cwd,
|
|
2316
2534
|
env: process.env,
|
|
@@ -2346,7 +2564,7 @@ function spawnAsync(cmd, args, opts) {
|
|
|
2346
2564
|
if (killed) {
|
|
2347
2565
|
reject(new Error(`\uD0C0\uC784\uC544\uC6C3 (${Math.round(opts.timeoutMs / 1e3)}\uCD08)`));
|
|
2348
2566
|
} else {
|
|
2349
|
-
|
|
2567
|
+
resolve4({ stdout, stderr, exitCode: code ?? 1 });
|
|
2350
2568
|
}
|
|
2351
2569
|
});
|
|
2352
2570
|
child.on("error", (err) => {
|
|
@@ -2383,7 +2601,7 @@ ${fileRefs}`;
|
|
|
2383
2601
|
return output;
|
|
2384
2602
|
}
|
|
2385
2603
|
async function callBatchCodexCli(imagePaths) {
|
|
2386
|
-
const outPath = (0,
|
|
2604
|
+
const outPath = (0, import_path3.join)((0, import_os2.tmpdir)(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`);
|
|
2387
2605
|
try {
|
|
2388
2606
|
const args = ["exec", BATCH_OCR_PROMPT];
|
|
2389
2607
|
for (const p of imagePaths) {
|
|
@@ -2403,7 +2621,7 @@ async function callBatchCodexCli(imagePaths) {
|
|
|
2403
2621
|
}
|
|
2404
2622
|
let text;
|
|
2405
2623
|
try {
|
|
2406
|
-
text = (0,
|
|
2624
|
+
text = (0, import_fs3.readFileSync)(outPath, "utf-8");
|
|
2407
2625
|
} catch {
|
|
2408
2626
|
text = result.stdout || "";
|
|
2409
2627
|
}
|
|
@@ -2411,7 +2629,7 @@ async function callBatchCodexCli(imagePaths) {
|
|
|
2411
2629
|
return text;
|
|
2412
2630
|
} finally {
|
|
2413
2631
|
try {
|
|
2414
|
-
(0,
|
|
2632
|
+
(0, import_fs3.unlinkSync)(outPath);
|
|
2415
2633
|
} catch {
|
|
2416
2634
|
}
|
|
2417
2635
|
}
|
|
@@ -2426,13 +2644,13 @@ function stripCodeFence2(text) {
|
|
|
2426
2644
|
const match = text.match(/^```(?:markdown|md)?\s*\n([\s\S]*?)\n```\s*$/m);
|
|
2427
2645
|
return match ? match[1].trim() : text;
|
|
2428
2646
|
}
|
|
2429
|
-
var import_child_process3,
|
|
2647
|
+
var import_child_process3, import_fs3, import_path3, import_os2, BATCH_OCR_PROMPT, DEFAULT_BATCH_SIZES, _batchTempDir;
|
|
2430
2648
|
var init_batch_provider = __esm({
|
|
2431
2649
|
"src/ocr/batch-provider.ts"() {
|
|
2432
2650
|
"use strict";
|
|
2433
2651
|
import_child_process3 = require("child_process");
|
|
2434
|
-
|
|
2435
|
-
|
|
2652
|
+
import_fs3 = require("fs");
|
|
2653
|
+
import_path3 = require("path");
|
|
2436
2654
|
import_os2 = require("os");
|
|
2437
2655
|
BATCH_OCR_PROMPT = "\uB2E4\uC74C \uBB38\uC11C \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uB4E4\uC744 OCR\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.\n\n\uADDC\uCE59:\n- \uAC01 \uD398\uC774\uC9C0 \uACB0\uACFC \uC0AC\uC774\uC5D0 \uBC18\uB4DC\uC2DC \uC774 \uAD6C\uBD84\uC790\uB97C \uC0BD\uC785: <!-- PAGE_BREAK -->\n- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)\n- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC\n- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9\n- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9\n- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC\n- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0\n- ```\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825";
|
|
2438
2656
|
DEFAULT_BATCH_SIZES = {
|
|
@@ -2450,7 +2668,10 @@ __export(resolve_exports, {
|
|
|
2450
2668
|
resolveOcrProvider: () => resolveOcrProvider
|
|
2451
2669
|
});
|
|
2452
2670
|
async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
2671
|
+
const logger = createLoggerFromEnv().child({ component: "ocr/resolve.ts", stage: "ocr" });
|
|
2672
|
+
logger.log({ level: "debug", event: "start", message: "OCR provider resolve \uC2DC\uC791", meta: { mode, concurrency, batchSize } });
|
|
2453
2673
|
if (mode === "off") {
|
|
2674
|
+
logger.log({ level: "warn", event: "error", message: "OCR \uBE44\uD65C\uC131\uD654 \uBAA8\uB4DC \uC694\uCCAD" });
|
|
2454
2675
|
throw new Error("OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC788\uC2B5\uB2C8\uB2E4 (--ocr off).");
|
|
2455
2676
|
}
|
|
2456
2677
|
if (mode !== "auto") {
|
|
@@ -2458,21 +2679,27 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
|
2458
2679
|
if (mode === "tesseract") {
|
|
2459
2680
|
const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
|
|
2460
2681
|
if (concurrency && concurrency > 1) {
|
|
2682
|
+
logger.log({ level: "info", event: "done", message: "Tesseract pool provider \uC120\uD0DD", meta: { concurrency } });
|
|
2461
2683
|
return createTesseractPoolProvider2(concurrency);
|
|
2462
2684
|
}
|
|
2685
|
+
logger.log({ level: "info", event: "done", message: "Tesseract single provider \uC120\uD0DD" });
|
|
2463
2686
|
return createTesseractProvider2();
|
|
2464
2687
|
}
|
|
2465
2688
|
if (mode === "gemini" || mode === "claude" || mode === "codex") {
|
|
2466
2689
|
const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
|
|
2467
2690
|
const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[mode];
|
|
2468
2691
|
if (effectiveBatch > 1) {
|
|
2692
|
+
logger.log({ level: "info", event: "done", message: "Batch CLI provider \uC120\uD0DD", meta: { mode, batchSize: effectiveBatch } });
|
|
2469
2693
|
return createBatchCliProvider2(mode, effectiveBatch);
|
|
2470
2694
|
}
|
|
2695
|
+
logger.log({ level: "info", event: "done", message: "CLI provider \uC120\uD0DD", meta: { mode } });
|
|
2471
2696
|
return createCliOcrProvider(mode);
|
|
2472
2697
|
}
|
|
2698
|
+
logger.log({ level: "info", event: "done", message: "CLI provider \uC120\uD0DD", meta: { mode } });
|
|
2473
2699
|
return createCliOcrProvider(mode);
|
|
2474
2700
|
}
|
|
2475
2701
|
const detected = detectAvailableOcr();
|
|
2702
|
+
logger.log({ level: "info", event: "progress", message: "OCR auto \uAC10\uC9C0 \uACB0\uACFC", meta: { detected } });
|
|
2476
2703
|
if (detected !== "codex") {
|
|
2477
2704
|
if (detected === "tesseract") {
|
|
2478
2705
|
warnings?.push({
|
|
@@ -2489,18 +2716,23 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
|
2489
2716
|
if (detected === "tesseract") {
|
|
2490
2717
|
const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
|
|
2491
2718
|
if (concurrency && concurrency > 1) {
|
|
2719
|
+
logger.log({ level: "info", event: "done", message: "AUTO: Tesseract pool provider \uC120\uD0DD", meta: { concurrency } });
|
|
2492
2720
|
return createTesseractPoolProvider2(concurrency);
|
|
2493
2721
|
}
|
|
2722
|
+
logger.log({ level: "info", event: "done", message: "AUTO: Tesseract single provider \uC120\uD0DD" });
|
|
2494
2723
|
return createTesseractProvider2();
|
|
2495
2724
|
}
|
|
2496
2725
|
if (detected === "gemini" || detected === "codex" || detected === "claude") {
|
|
2497
2726
|
const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
|
|
2498
2727
|
const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[detected];
|
|
2499
2728
|
if (effectiveBatch > 1) {
|
|
2729
|
+
logger.log({ level: "info", event: "done", message: "AUTO: Batch CLI provider \uC120\uD0DD", meta: { mode: detected, batchSize: effectiveBatch } });
|
|
2500
2730
|
return createBatchCliProvider2(detected, effectiveBatch);
|
|
2501
2731
|
}
|
|
2732
|
+
logger.log({ level: "info", event: "done", message: "AUTO: CLI provider \uC120\uD0DD", meta: { mode: detected } });
|
|
2502
2733
|
return createCliOcrProvider(detected);
|
|
2503
2734
|
}
|
|
2735
|
+
logger.log({ level: "info", event: "done", message: "AUTO: CLI provider \uC120\uD0DD", meta: { mode: detected } });
|
|
2504
2736
|
return createCliOcrProvider(detected);
|
|
2505
2737
|
}
|
|
2506
2738
|
var init_resolve = __esm({
|
|
@@ -2508,6 +2740,7 @@ var init_resolve = __esm({
|
|
|
2508
2740
|
"use strict";
|
|
2509
2741
|
init_auto_detect();
|
|
2510
2742
|
init_cli_provider();
|
|
2743
|
+
init_logger();
|
|
2511
2744
|
}
|
|
2512
2745
|
});
|
|
2513
2746
|
|
|
@@ -2667,9 +2900,18 @@ function isBatchProvider(p) {
|
|
|
2667
2900
|
return !!p && typeof p === "object" && "__batch" in p && p.__batch === true;
|
|
2668
2901
|
}
|
|
2669
2902
|
async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
|
|
2903
|
+
const logger = createLoggerFromEnv().child({ component: "ocr/provider.ts", stage: "ocr" });
|
|
2904
|
+
logger.log({
|
|
2905
|
+
level: "info",
|
|
2906
|
+
event: "start",
|
|
2907
|
+
message: "OCR \uD398\uC774\uC9C0 \uCC98\uB9AC \uC2DC\uC791",
|
|
2908
|
+
meta: { effectivePageCount, concurrency, filteredPages: pageFilter?.size, batchProvider: isBatchProvider(provider) }
|
|
2909
|
+
});
|
|
2670
2910
|
const blocks = [];
|
|
2671
2911
|
if (isBatchProvider(provider)) {
|
|
2672
|
-
|
|
2912
|
+
const result = await ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, onProgress);
|
|
2913
|
+
logger.log({ level: "info", event: "done", message: "OCR \uBC30\uCE58 \uCC98\uB9AC \uC644\uB8CC", meta: { blocks: result.length } });
|
|
2914
|
+
return result;
|
|
2673
2915
|
}
|
|
2674
2916
|
if (concurrency <= 1) {
|
|
2675
2917
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
@@ -2685,8 +2927,16 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
|
|
|
2685
2927
|
message: `\uD398\uC774\uC9C0 ${i} OCR \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
|
|
2686
2928
|
code: "OCR_PAGE_FAILED"
|
|
2687
2929
|
});
|
|
2930
|
+
logger.log({
|
|
2931
|
+
level: "warn",
|
|
2932
|
+
event: "progress",
|
|
2933
|
+
message: "\uD398\uC774\uC9C0 OCR \uC2E4\uD328",
|
|
2934
|
+
meta: { page: i },
|
|
2935
|
+
error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error" }
|
|
2936
|
+
});
|
|
2688
2937
|
}
|
|
2689
2938
|
}
|
|
2939
|
+
logger.log({ level: "info", event: "done", message: "OCR \uC21C\uCC28 \uCC98\uB9AC \uC644\uB8CC", meta: { blocks: blocks.length } });
|
|
2690
2940
|
return blocks;
|
|
2691
2941
|
}
|
|
2692
2942
|
const pageNumbers = [];
|
|
@@ -2706,6 +2956,13 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
|
|
|
2706
2956
|
message: `\uD398\uC774\uC9C0 ${pageNum} OCR \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
|
|
2707
2957
|
code: "OCR_PAGE_FAILED"
|
|
2708
2958
|
});
|
|
2959
|
+
logger.log({
|
|
2960
|
+
level: "warn",
|
|
2961
|
+
event: "progress",
|
|
2962
|
+
message: "\uD398\uC774\uC9C0 OCR \uC2E4\uD328(\uBCD1\uB82C)",
|
|
2963
|
+
meta: { page: pageNum },
|
|
2964
|
+
error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error" }
|
|
2965
|
+
});
|
|
2709
2966
|
return null;
|
|
2710
2967
|
}
|
|
2711
2968
|
});
|
|
@@ -2714,6 +2971,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
|
|
|
2714
2971
|
if (!item) continue;
|
|
2715
2972
|
for (const b of item.pageBlocks) blocks.push(b);
|
|
2716
2973
|
}
|
|
2974
|
+
logger.log({ level: "info", event: "done", message: "OCR \uBCD1\uB82C \uCC98\uB9AC \uC644\uB8CC", meta: { blocks: blocks.length, pages: pageNumbers.length } });
|
|
2717
2975
|
return blocks;
|
|
2718
2976
|
}
|
|
2719
2977
|
async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
|
|
@@ -2796,12 +3054,15 @@ var init_provider = __esm({
|
|
|
2796
3054
|
"src/ocr/provider.ts"() {
|
|
2797
3055
|
"use strict";
|
|
2798
3056
|
init_markdown_to_blocks();
|
|
3057
|
+
init_logger();
|
|
2799
3058
|
}
|
|
2800
3059
|
});
|
|
2801
3060
|
|
|
2802
3061
|
// src/index.ts
|
|
2803
3062
|
var index_exports = {};
|
|
2804
3063
|
__export(index_exports, {
|
|
3064
|
+
AllKeysCoolingDownError: () => AllKeysCoolingDownError,
|
|
3065
|
+
ApiKeyRotationPool: () => ApiKeyRotationPool,
|
|
2805
3066
|
VERSION: () => VERSION,
|
|
2806
3067
|
blocksToMarkdown: () => blocksToMarkdown,
|
|
2807
3068
|
compare: () => compare,
|
|
@@ -2820,10 +3081,11 @@ __export(index_exports, {
|
|
|
2820
3081
|
parseHwp: () => parseHwp,
|
|
2821
3082
|
parseHwpx: () => parseHwpx,
|
|
2822
3083
|
parsePdf: () => parsePdf,
|
|
2823
|
-
parseXlsx: () => parseXlsx
|
|
3084
|
+
parseXlsx: () => parseXlsx,
|
|
3085
|
+
runUnifiedOcrPipeline: () => runUnifiedOcrPipeline
|
|
2824
3086
|
});
|
|
2825
3087
|
module.exports = __toCommonJS(index_exports);
|
|
2826
|
-
var
|
|
3088
|
+
var import_promises3 = require("fs/promises");
|
|
2827
3089
|
|
|
2828
3090
|
// src/detect.ts
|
|
2829
3091
|
var import_jszip = __toESM(require("jszip"), 1);
|
|
@@ -2876,7 +3138,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
|
|
|
2876
3138
|
var import_xmldom = require("@xmldom/xmldom");
|
|
2877
3139
|
|
|
2878
3140
|
// src/utils.ts
|
|
2879
|
-
var VERSION = true ? "2.4.
|
|
3141
|
+
var VERSION = true ? "2.4.12" : "0.0.0-dev";
|
|
2880
3142
|
function toArrayBuffer(buf) {
|
|
2881
3143
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2882
3144
|
return buf.buffer;
|
|
@@ -2884,9 +3146,13 @@ function toArrayBuffer(buf) {
|
|
|
2884
3146
|
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
2885
3147
|
}
|
|
2886
3148
|
/**
 * Error type thrown by the kordoc parsers in this file.
 *
 * Besides the message it carries two optional classification fields and,
 * optionally, the underlying error that triggered it.
 */
var KordocError = class extends Error {
  /** Error code supplied by the thrower; `undefined` when none was given. */
  code;
  /** Pipeline stage supplied by the thrower; `undefined` when none was given. */
  stage;
  /**
   * @param {string} message Human-readable description of the failure.
   * @param {{ code?: string, stage?: string, cause?: unknown }} [opts]
   *   Optional classification (`code`, `stage`) and the underlying error (`cause`).
   */
  constructor(message, opts = {}) {
    // Forward `cause` only when one was supplied so that plain errors do not
    // grow an own `cause: undefined` property.
    super(message, opts.cause !== void 0 ? { cause: opts.cause } : void 0);
    this.name = "KordocError";
    this.code = opts.code;
    this.stage = opts.stage;
  }
};
|
|
2892
3158
|
function isPathTraversal(name) {
|
|
@@ -2950,6 +3216,16 @@ function classifyError(err) {
|
|
|
2950
3216
|
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
2951
3217
|
return "PARSE_ERROR";
|
|
2952
3218
|
}
|
|
3219
|
+
/**
 * Converts any thrown value into a `KordocError` that has both `code` and `stage`.
 *
 * - An existing `KordocError` is returned as-is (same object); only its missing
 *   `stage` / `code` fields are filled in.
 * - Any other `Error` is wrapped: its message is reused, its code comes from
 *   `classifyError`, and the original error is kept as `cause` so its stack
 *   trace is not lost.
 * - Non-Error values use `fallbackMessage` / `fallbackCode`.
 *
 * @param {unknown} err The caught value.
 * @param {string} fallbackMessage Message used when `err` has no usable message.
 * @param {string} [stage="unknown"] Stage recorded when the error has none.
 * @param {string} [fallbackCode="PARSE_ERROR"] Code recorded when none can be derived.
 * @returns {KordocError}
 */
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
  if (err instanceof KordocError) {
    if (!err.stage) err.stage = stage;
    if (!err.code) err.code = fallbackCode;
    return err;
  }
  const isError = err instanceof Error;
  const message = isError ? err.message : fallbackMessage;
  const code = isError ? classifyError(err) : fallbackCode;
  const normalized = new KordocError(message || fallbackMessage, { code, stage });
  // Keep the original thrown value reachable; defined non-enumerable to mirror
  // how the built-in Error `cause` option behaves.
  if (err !== null && err !== void 0 && normalized.cause === void 0) {
    Object.defineProperty(normalized, "cause", {
      value: err,
      writable: true,
      configurable: true,
      enumerable: false
    });
  }
  return normalized;
}
|
|
2953
3229
|
|
|
2954
3230
|
// src/table/builder.ts
|
|
2955
3231
|
var MAX_COLS = 200;
|
|
@@ -3212,6 +3488,7 @@ var HEADING_RATIO_H3 = 1.15;
|
|
|
3212
3488
|
|
|
3213
3489
|
// src/hwpx/parser.ts
|
|
3214
3490
|
init_page_range();
|
|
3491
|
+
init_logger();
|
|
3215
3492
|
var MAX_DECOMPRESS_SIZE = 500 * 1024 * 1024;
|
|
3216
3493
|
var MAX_ZIP_ENTRIES = 2e3;
|
|
3217
3494
|
function clampSpan(val, max) {
|
|
@@ -3303,50 +3580,89 @@ function stripDtd(xml) {
|
|
|
3303
3580
|
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
3304
3581
|
}
|
|
3305
3582
|
/**
 * Parses an HWPX (ZIP-based) document into markdown plus structured blocks.
 *
 * Flow: size pre-check -> open the ZIP (or reuse `existingZip`) -> metadata and
 * styles -> resolve section paths -> parse each selected section -> images,
 * heading detection, outline, markdown.
 *
 * @param {ArrayBuffer} buffer Raw document bytes (only `byteLength` is read here directly).
 * @param {{ pages?: unknown, onProgress?: (done: number, total: number) => void }} [options]
 *   `pages` restricts parsing to a page range (sections are counted from 1);
 *   `onProgress` is called after each successfully parsed section.
 * @param {object} [existingZip] Already-loaded JSZip instance to reuse instead of loading `buffer`.
 * @returns {Promise<object>} `{ markdown, blocks, metadata, outline, warnings, images }`;
 *   `outline`, `warnings` and `images` are `undefined` when empty. If the ZIP cannot be
 *   opened, the result of `extractFromBrokenZip(buffer)` is returned instead.
 * @throws {KordocError} When the entry count or decompressed size exceeds the limits,
 *   or when no section file is found. Any other error is logged and rethrown unchanged.
 */
async function parseHwpxDocument(buffer, options, existingZip) {
  const logger = createLoggerFromEnv().child({ component: "hwpx/parser.ts", stage: "detect" });
  logger.log({ level: "info", event: "start", message: "HWPX \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
  let lastParsedSection = 0;
  try {
    precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
    let zip;
    try {
      zip = existingZip ?? await import_jszip2.default.loadAsync(buffer);
    } catch (zipErr) {
      // Do not fall back silently: record why the recovery path was taken.
      logger.log({
        level: "warn",
        event: "progress",
        message: "ZIP \uB85C\uB4DC \uC2E4\uD328 \u2014 lenient \uBAA8\uB4DC\uB85C \uBCF5\uAD6C",
        error: { message: zipErr instanceof Error ? zipErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: zipErr instanceof Error ? zipErr.name : "Error" }
      });
      return await extractFromBrokenZip(buffer);
    }
    const actualEntryCount = Object.keys(zip.files).length;
    if (actualEntryCount > MAX_ZIP_ENTRIES) {
      throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
    }
    // Running total shared with the helpers so the decompression limit covers all entries.
    const decompressed = { total: 0 };
    const metadata = {};
    await extractHwpxMetadata(zip, metadata, decompressed);
    const styleMap = await extractHwpxStyles(zip, decompressed);
    const warnings = [];
    const sectionPaths = await resolveSectionPaths(zip);
    if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
    metadata.pageCount = sectionPaths.length;
    logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uC139\uC158 \uACBD\uB85C \uD574\uC11D \uC644\uB8CC", meta: { sections: sectionPaths.length } });
    const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
    const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
    const blocks = [];
    let parsedSections = 0;
    for (let si = 0; si < sectionPaths.length; si++) {
      if (pageFilter && !pageFilter.has(si + 1)) continue;
      const file = zip.file(sectionPaths[si]);
      if (!file) continue;
      try {
        const xml = await file.async("text");
        // String length * 2 approximates the in-memory (UTF-16) size of the section text.
        decompressed.total += xml.length * 2;
        if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
        // Append one by one: spreading a very large array into push() can exceed the
        // engine's argument limit and would be misreported as a section parse failure.
        const sectionBlocks = parseSectionXml(xml, styleMap, warnings, si + 1);
        for (const block of sectionBlocks) blocks.push(block);
        parsedSections++;
        options?.onProgress?.(parsedSections, totalTarget);
        logger.log({
          level: "debug",
          stage: "convert",
          event: "progress",
          message: "\uC139\uC158 \uD30C\uC2F1 \uC644\uB8CC",
          meta: { section: si + 1, parsedSections, totalTarget }
        });
        lastParsedSection = si + 1;
      } catch (secErr) {
        // Limit violations abort the whole parse; anything else only fails this section.
        if (secErr instanceof KordocError) throw secErr;
        warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
        logger.log({
          level: "warn",
          stage: "convert",
          event: "progress",
          message: "\uC139\uC158 \uD30C\uC2F1 \uC2E4\uD328",
          meta: { section: si + 1 },
          error: { message: secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: secErr instanceof Error ? secErr.name : "Error" }
        });
      }
    }
    const images = await extractImagesFromZip(zip, blocks, decompressed, warnings);
    detectHwpxHeadings(blocks, styleMap);
    const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
    const markdown = blocksToMarkdown(blocks);
    logger.log({
      level: "info",
      stage: "finalize",
      event: "done",
      message: "HWPX \uD30C\uC2F1 \uC644\uB8CC",
      meta: { blocks: blocks.length, warnings: warnings.length, images: images.length, outline: outline.length }
    });
    return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
  } catch (err) {
    // Log with the last successfully parsed section for diagnosis, then rethrow unchanged.
    logger.log({
      level: "error",
      stage: "finalize",
      event: "error",
      message: "HWPX \uD30C\uC2F1 \uC2E4\uD328",
      meta: { lastParsedSection },
      error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
    });
    throw err;
  }
}
|
|
3351
3667
|
function imageExtToMime(ext) {
|
|
3352
3668
|
switch (ext.toLowerCase()) {
|
|
@@ -5063,75 +5379,115 @@ function parseLenientCfb(data) {
|
|
|
5063
5379
|
|
|
5064
5380
|
// src/hwp5/parser.ts
|
|
5065
5381
|
init_page_range();
|
|
5382
|
+
init_logger();
|
|
5066
5383
|
var CFB = __toESM(require_cfb(), 1);
|
|
5067
5384
|
var MAX_SECTIONS = 100;
|
|
5068
5385
|
var MAX_TOTAL_DECOMPRESS = 500 * 1024 * 1024;
|
|
5069
5386
|
/**
 * Parses an HWP 5.x (CFB container) document into markdown plus structured blocks.
 *
 * The container is first parsed strictly with `CFB.parse`; if that fails, the
 * lenient parser is tried and a `LENIENT_CFB_RECOVERY` warning is recorded.
 *
 * @param {Buffer} buffer Raw document bytes.
 * @param {{ pages?: unknown, onProgress?: (done: number, total: number) => void }} [options]
 *   `pages` restricts parsing to a page range (sections are counted from 1);
 *   `onProgress` is called after each successfully parsed section.
 * @returns {object} `{ markdown, blocks, metadata, outline, warnings, images }`;
 *   `outline`, `warnings` and `images` are `undefined` when empty.
 * @throws {KordocError} When the container cannot be parsed at all, the FileHeader is
 *   missing or has the wrong signature, the file is encrypted or DRM-protected, no section
 *   stream is found, or the decompression limit is exceeded. Any other error is logged
 *   and rethrown unchanged.
 */
function parseHwp5Document(buffer, options) {
  const logger = createLoggerFromEnv().child({ component: "hwp5/parser.ts", stage: "detect" });
  logger.log({ level: "info", event: "start", message: "HWP5 \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.length } });
  let lastParsedSection = 0;
  try {
    // Exactly one of `cfb` / `lenientCfb` is set once the container has been opened.
    let cfb = null;
    let lenientCfb = null;
    const warnings = [];
    try {
      cfb = CFB.parse(buffer);
    } catch {
      try {
        lenientCfb = parseLenientCfb(buffer);
        warnings.push({ message: "\uC190\uC0C1\uB41C CFB \uCEE8\uD14C\uC774\uB108 \u2014 lenient \uBAA8\uB4DC\uB85C \uBCF5\uAD6C", code: "LENIENT_CFB_RECOVERY" });
      } catch (lenientErr) {
        // Keep the lenient failure as `cause` so the underlying reason is not lost.
        throw Object.assign(new KordocError("CFB \uCEE8\uD14C\uC774\uB108 \uD30C\uC2F1 \uC2E4\uD328 (strict \uBC0F lenient \uBAA8\uB450)"), { cause: lenientErr });
      }
    }
    // Stream lookup that works for whichever container representation is available.
    const findStream = (path) => {
      if (cfb) {
        const entry = CFB.find(cfb, path);
        return entry?.content ? Buffer.from(entry.content) : null;
      }
      return lenientCfb.findStream(path);
    };
    const headerData = findStream("/FileHeader");
    if (!headerData) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
    const header = parseFileHeader(headerData);
    if (header.signature !== "HWP Document File") throw new KordocError("HWP \uC2DC\uADF8\uB2C8\uCC98 \uBD88\uC77C\uCE58");
    if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
    if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
    const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
    const distribution = (header.flags & FLAG_DISTRIBUTION) !== 0;
    const metadata = {
      version: `${header.versionMajor}.x`
    };
    if (cfb) extractHwp5Metadata(cfb, metadata);
    const docInfo = cfb ? parseDocInfoStream(cfb, compressed) : parseDocInfoFromStream(findStream("/DocInfo"), compressed);
    // Distribution documents read their sections through the ViewText helpers.
    const sections = distribution ? cfb ? findViewTextSections(cfb, compressed) : findViewTextSectionsLenient(lenientCfb, compressed) : cfb ? findSections(cfb) : findSectionsLenient(lenientCfb, compressed);
    if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
    logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uC139\uC158 \uBAA9\uB85D \uD574\uC11D \uC644\uB8CC", meta: { sections: sections.length, distribution } });
    metadata.pageCount = sections.length;
    const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
    const totalTarget = pageFilter ? pageFilter.size : sections.length;
    const blocks = [];
    let totalDecompressed = 0;
    let parsedSections = 0;
    for (let si = 0; si < sections.length; si++) {
      if (pageFilter && !pageFilter.has(si + 1)) continue;
      try {
        const sectionData = sections[si];
        const data = !distribution && compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
        totalDecompressed += data.length;
        if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
        const records = readRecords(data);
        const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
        // Append one by one: spreading a very large array into push() can exceed the
        // engine's argument limit and would be misreported as a section parse failure.
        for (const block of sectionBlocks) blocks.push(block);
        parsedSections++;
        options?.onProgress?.(parsedSections, totalTarget);
        logger.log({
          level: "debug",
          stage: "convert",
          event: "progress",
          message: "\uC139\uC158 \uD30C\uC2F1 \uC644\uB8CC",
          meta: { section: si + 1, parsedSections, totalTarget }
        });
        lastParsedSection = si + 1;
      } catch (secErr) {
        // Limit violations abort the whole parse; anything else only fails this section.
        if (secErr instanceof KordocError) throw secErr;
        warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
        logger.log({
          level: "warn",
          stage: "convert",
          event: "progress",
          message: "\uC139\uC158 \uD30C\uC2F1 \uC2E4\uD328",
          meta: { section: si + 1 },
          error: { message: secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: secErr instanceof Error ? secErr.name : "Error" }
        });
      }
    }
    const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
    if (docInfo) {
      detectHwp5Headings(blocks, docInfo);
    }
    const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
    const markdown = blocksToMarkdown(blocks);
    logger.log({
      level: "info",
      stage: "finalize",
      event: "done",
      message: "HWP5 \uD30C\uC2F1 \uC644\uB8CC",
      meta: { blocks: blocks.length, warnings: warnings.length, images: images.length, outline: outline.length }
    });
    return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
  } catch (err) {
    // Log with the last successfully parsed section for diagnosis, then rethrow unchanged.
    logger.log({
      level: "error",
      stage: "finalize",
      event: "error",
      message: "HWP5 \uD30C\uC2F1 \uC2E4\uD328",
      meta: { lastParsedSection },
      error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
    });
    throw err;
  }
}
|
|
5136
5492
|
function parseDocInfoStream(cfb, compressed) {
|
|
5137
5493
|
try {
|
|
@@ -5678,6 +6034,8 @@ function arrangeCells(rows, cols, cells) {
|
|
|
5678
6034
|
|
|
5679
6035
|
// src/pdf/parser.ts
|
|
5680
6036
|
init_page_range();
|
|
6037
|
+
var import_module = require("module");
|
|
6038
|
+
var import_path4 = require("path");
|
|
5681
6039
|
|
|
5682
6040
|
// src/pdf/line-detector.ts
|
|
5683
6041
|
var import_pdf = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
@@ -5865,12 +6223,17 @@ function buildTableGrids(horizontals, verticals) {
|
|
|
5865
6223
|
const rawXs = vLines.map((l) => l.x1);
|
|
5866
6224
|
const colXs = clusterCoordinates(rawXs).sort((a, b) => a - b);
|
|
5867
6225
|
if (rowYs.length < 2 || colXs.length < 2) continue;
|
|
6226
|
+
const rowCount = rowYs.length - 1;
|
|
6227
|
+
const colCount = colXs.length - 1;
|
|
6228
|
+
if (rowCount <= 0 || colCount <= 0) continue;
|
|
6229
|
+
if (rowCount * colCount < 2) continue;
|
|
5868
6230
|
const bbox = {
|
|
5869
6231
|
x1: colXs[0],
|
|
5870
6232
|
y1: rowYs[rowYs.length - 1],
|
|
5871
6233
|
x2: colXs[colXs.length - 1],
|
|
5872
6234
|
y2: rowYs[0]
|
|
5873
6235
|
};
|
|
6236
|
+
if (!hasReliableGridStructure(rowYs, colXs, hLines, vLines, bbox)) continue;
|
|
5874
6237
|
grids.push({ rowYs, colXs, bbox });
|
|
5875
6238
|
}
|
|
5876
6239
|
return mergeAdjacentGrids(grids);
|
|
@@ -5920,6 +6283,35 @@ function clusterCoordinates(values) {
|
|
|
5920
6283
|
}
|
|
5921
6284
|
return clusters.map((c) => c.sum / c.count);
|
|
5922
6285
|
}
|
|
6286
|
+
/**
 * Decides whether a candidate table grid is backed by enough real ruling lines.
 *
 * A grid is accepted only when:
 *  - at least two horizontal and two vertical lines each span >= 70% of the
 *    bounding box in their direction, and
 *  - at least half of the interior row / column coordinates coincide (within
 *    `COORD_MERGE_TOL`) with a line covering >= 55% of the bounding box.
 *
 * @param {number[]} rowYs Row boundary coordinates; first and last are the outer edges.
 * @param {number[]} colXs Column boundary coordinates; first and last are the outer edges.
 * @param {Array<{x1: number, x2: number, y1: number}>} hLines Horizontal line segments.
 * @param {Array<{y1: number, y2: number, x1: number}>} vLines Vertical line segments.
 * @param {{x1: number, y1: number, x2: number, y2: number}} bbox Bounding box of the grid.
 * @returns {boolean} `true` when the grid structure is considered reliable.
 */
function hasReliableGridStructure(rowYs, colXs, hLines, vLines, bbox) {
  const innerRows = rowYs.slice(1, -1);
  const innerCols = colXs.slice(1, -1);
  const boxWidth = Math.max(1, bbox.x2 - bbox.x1);
  const boxHeight = Math.max(1, bbox.y2 - bbox.y1);
  const MIN_LINE_COVERAGE = 0.55;

  // Requirement 1: long lines along both axes.
  const longHorizontalCount = hLines.filter((line) => Math.abs(line.x2 - line.x1) >= boxWidth * 0.7).length;
  const longVerticalCount = vLines.filter((line) => Math.abs(line.y2 - line.y1) >= boxHeight * 0.7).length;
  if (longHorizontalCount < 2 || longVerticalCount < 2) return false;

  // Requirement 2: interior boundaries must mostly sit on actual lines.
  const rowIsBacked = (y) => hLines.some((line) => {
    if (Math.abs(line.y1 - y) > COORD_MERGE_TOL) return false;
    return lineOverlapRatio(line.x1, line.x2, bbox.x1, bbox.x2) >= MIN_LINE_COVERAGE;
  });
  const colIsBacked = (x) => vLines.some((line) => {
    if (Math.abs(line.x1 - x) > COORD_MERGE_TOL) return false;
    return lineOverlapRatio(line.y1, line.y2, bbox.y1, bbox.y2) >= MIN_LINE_COVERAGE;
  });
  // A grid with no interior boundaries counts as fully covered.
  const coverageOf = (coords, isBacked) => coords.length > 0 ? coords.filter(isBacked).length / coords.length : 1;

  if (coverageOf(innerRows, rowIsBacked) < 0.5) return false;
  if (coverageOf(innerCols, colIsBacked) < 0.5) return false;
  return true;
}
|
|
6308
|
+
/**
 * Fraction of the reference interval `[b1, b2]` that is covered by the
 * segment `[a1, a2]`. Endpoints may be given in either order.
 *
 * The reference length is floored at 1 so a degenerate reference interval
 * never causes a division by zero.
 *
 * @param {number} a1 One end of the segment.
 * @param {number} a2 Other end of the segment.
 * @param {number} b1 One end of the reference interval.
 * @param {number} b2 Other end of the reference interval.
 * @returns {number} Overlap length divided by the (floored) reference length.
 */
function lineOverlapRatio(a1, a2, b1, b2) {
  const segmentLow = Math.min(a1, a2);
  const segmentHigh = Math.max(a1, a2);
  const referenceLow = Math.min(b1, b2);
  const referenceHigh = Math.max(b1, b2);

  const overlapStart = Math.max(segmentLow, referenceLow);
  const overlapEnd = Math.min(segmentHigh, referenceHigh);
  const sharedLength = Math.max(0, overlapEnd - overlapStart);

  const referenceLength = Math.max(1, Math.abs(b2 - b1));
  return sharedLength / referenceLength;
}
|
|
5923
6315
|
function groupConnectedLines(lines) {
|
|
5924
6316
|
const parent = lines.map((_, i) => i);
|
|
5925
6317
|
function find2(x) {
|
|
@@ -6296,6 +6688,9 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
6296
6688
|
};
|
|
6297
6689
|
}
|
|
6298
6690
|
|
|
6691
|
+
// src/pdf/parser.ts
|
|
6692
|
+
init_logger();
|
|
6693
|
+
|
|
6299
6694
|
// src/pdf/polyfill.ts
|
|
6300
6695
|
var pdfjsWorker = __toESM(require("pdfjs-dist/legacy/build/pdf.worker.mjs"), 1);
|
|
6301
6696
|
var g = globalThis;
|
|
@@ -6316,6 +6711,17 @@ g.pdfjsWorker = pdfjsWorker;
|
|
|
6316
6711
|
// src/pdf/parser.ts
|
|
6317
6712
|
var import_pdf2 = require("pdfjs-dist/legacy/build/pdf.mjs");
|
|
6318
6713
|
import_pdf2.GlobalWorkerOptions.workerSrc = "";
|
|
6714
|
+
// `require` bound to this bundle's location when `__filename` exists; otherwise
// bound to a placeholder path under the current working directory so that
// resolution starts from there.
var require2 = (0, import_module.createRequire)(
  typeof __filename !== "undefined" ? __filename : (0, import_path4.resolve)(process.cwd(), "kordoc.require.cjs")
);
/**
 * Locates the `wasm/` directory shipped with `pdfjs-dist`, for use as the
 * pdf.js `wasmUrl` option.
 *
 * The returned path always ends with a path separator: pdf.js requires the
 * trailing slash on `wasmUrl`. `path.join` keeps a trailing separator,
 * whereas `path.resolve` strips it, so both branches use `join`.
 *
 * @returns {string} Path to the pdf.js wasm directory, ending with a separator.
 */
function resolvePdfjsWasmUrl() {
  try {
    const pdfjsPkg = require2.resolve("pdfjs-dist/package.json");
    return (0, import_path4.join)((0, import_path4.dirname)(pdfjsPkg), "wasm/");
  } catch {
    // Package not resolvable from here: fall back to the conventional location
    // under the current working directory.
    return (0, import_path4.join)(process.cwd(), "node_modules/pdfjs-dist/wasm/");
  }
}
|
|
6319
6725
|
var MAX_PAGES = 5e3;
|
|
6320
6726
|
var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
|
|
6321
6727
|
function calcPdfTimeout(bufferSize) {
|
|
@@ -6331,7 +6737,8 @@ async function loadPdfWithTimeout(buffer) {
|
|
|
6331
6737
|
data: new Uint8Array(buffer),
|
|
6332
6738
|
useSystemFonts: true,
|
|
6333
6739
|
disableFontFace: true,
|
|
6334
|
-
isEvalSupported: false
|
|
6740
|
+
isEvalSupported: false,
|
|
6741
|
+
wasmUrl: resolvePdfjsWasmUrl()
|
|
6335
6742
|
});
|
|
6336
6743
|
let timer;
|
|
6337
6744
|
try {
|
|
@@ -6348,7 +6755,47 @@ async function loadPdfWithTimeout(buffer) {
|
|
|
6348
6755
|
if (timer !== void 0) clearTimeout(timer);
|
|
6349
6756
|
}
|
|
6350
6757
|
}
|
|
6758
|
+
/**
 * Heuristically decides whether a PDF is image-based (scanned) from per-page
 * text statistics of the sampled pages.
 *
 * Low average text raises the score, high average text lowers it; the score is
 * clamped to [0, 1] and the PDF is treated as image-based at 0.5 or above.
 * With no sample statistics at all the PDF is assumed to be image-based.
 *
 * @param {Array<{nonWhitespaceChars: number, visibleItems: number}>} metrics
 *   One entry per sampled page.
 * @returns {{isImageBased: boolean, score: number, reason: string}}
 *   Decision, clamped score, and a summary string of the inputs behind it.
 */
function estimateImageBasedPdf(metrics) {
  const pageTotal = metrics.length;
  if (pageTotal === 0) {
    return { isImageBased: true, score: 1, reason: "\uC0D8\uD50C \uD1B5\uACC4 \uC5C6\uC74C" };
  }

  let charSum = 0;
  let itemSum = 0;
  let textPages = 0;
  for (const page of metrics) {
    charSum += page.nonWhitespaceChars;
    itemSum += page.visibleItems;
    // A page counts as "having text" with enough characters or enough items.
    if (page.nonWhitespaceChars >= 20 || page.visibleItems >= 15) textPages += 1;
  }
  const avgChars = charSum / pageTotal;
  const avgItems = itemSum / pageTotal;
  const textPresenceRatio = textPages / pageTotal;

  // [condition, score delta] pairs, applied in order.
  const adjustments = [
    [avgChars < 10, 0.45],
    [avgItems < 8, 0.35],
    [textPresenceRatio < 0.35, 0.25],
    [avgChars > 40, -0.35],
    [avgItems > 25, -0.35],
    [textPresenceRatio > 0.7, -0.25]
  ];
  let rawScore = 0;
  for (const [applies, delta] of adjustments) {
    if (applies) rawScore += delta;
  }
  const score = Math.max(0, Math.min(1, rawScore));

  const reason = `avgChars=${avgChars.toFixed(1)}, avgItems=${avgItems.toFixed(1)}, textPresence=${(textPresenceRatio * 100).toFixed(0)}%, score=${score.toFixed(2)}`;
  return { isImageBased: score >= 0.5, score, reason };
}
|
|
6781
|
+
/**
 * Builds a one-line summary of the pages that failed to parse.
 *
 * Page numbers are de-duplicated before counting so that a page reported more
 * than once neither inflates the failure count nor repeats in the preview.
 * At most the first ten page numbers (ascending) are listed; the rest are
 * summarised as a count.
 *
 * @param {number[]} failedPages Page numbers that failed (not mutated).
 * @param {number} totalTarget Number of pages that were meant to be parsed.
 * @returns {string|null} The summary message, or `null` when nothing failed.
 */
function summarizePartialFailures(failedPages, totalTarget) {
  if (failedPages.length === 0) return null;
  const sorted = [...new Set(failedPages)].sort((a, b) => a - b);
  const preview = sorted.slice(0, 10).join(", ");
  const suffix = sorted.length > 10 ? ` \uC678 ${sorted.length - 10}\uD398\uC774\uC9C0` : "";
  return `\uBD80\uBD84 \uD30C\uC2F1 \uC2E4\uD328 \uC694\uC57D: ${sorted.length}/${totalTarget}\uD398\uC774\uC9C0 \uC2E4\uD328 (p${preview}${suffix})`;
}
|
|
6788
|
+
/**
 * Applies the optional "maximum partial failure ratio" policy.
 *
 * The policy is disabled (never aborts) when `maxPartialFailureRatio` is not a
 * number, or is `NaN`; `NaN` is rejected explicitly because it passes a
 * `typeof` check and would otherwise surface as a `NaN` threshold.
 * Otherwise the threshold is clamped to [0, 1] and parsing should abort when
 * the share of failed pages is strictly greater than it.
 *
 * @param {number[]} failedPages Page numbers that failed to parse.
 * @param {number} totalTarget Number of pages that were meant to be parsed.
 * @param {number} [maxPartialFailureRatio] Allowed failure ratio.
 * @returns {{abort: boolean, ratio: number, threshold: number}}
 */
function shouldAbortForPartialFailures(failedPages, totalTarget, maxPartialFailureRatio) {
  if (typeof maxPartialFailureRatio !== "number" || Number.isNaN(maxPartialFailureRatio)) {
    return { abort: false, ratio: 0, threshold: 0 };
  }
  const threshold = Math.max(0, Math.min(1, maxPartialFailureRatio));
  // No target pages means nothing could have failed.
  const ratio = totalTarget > 0 ? failedPages.length / totalTarget : 0;
  return { abort: ratio > threshold, ratio, threshold };
}
|
|
6351
6796
|
async function parsePdfDocument(buffer, options) {
|
|
6797
|
+
const logger = createLoggerFromEnv().child({ component: "pdf/parser.ts", stage: "detect" });
|
|
6798
|
+
logger.log({ level: "info", event: "start", message: "PDF \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
|
|
6352
6799
|
const doc = await loadPdfWithTimeout(buffer);
|
|
6353
6800
|
try {
|
|
6354
6801
|
const pageCount = doc.numPages;
|
|
@@ -6357,9 +6804,13 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6357
6804
|
await extractPdfMetadata(doc, metadata);
|
|
6358
6805
|
const blocks = [];
|
|
6359
6806
|
const warnings = [];
|
|
6807
|
+
const failedPages = [];
|
|
6808
|
+
let lastParsedPage2 = 0;
|
|
6809
|
+
const sampleMetricsByPage = /* @__PURE__ */ new Map();
|
|
6360
6810
|
let totalChars = 0;
|
|
6361
6811
|
let totalTextBytes = 0;
|
|
6362
6812
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
6813
|
+
logger.log({ level: "debug", event: "progress", message: "PDF \uB85C\uB529 \uC644\uB8CC", meta: { pageCount, effectivePageCount } });
|
|
6363
6814
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
6364
6815
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
6365
6816
|
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
@@ -6396,11 +6847,17 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6396
6847
|
totalChars += t.replace(/\s/g, "").length;
|
|
6397
6848
|
totalTextBytes += t.length * 2;
|
|
6398
6849
|
}
|
|
6850
|
+
sampleMetricsByPage.set(i, {
|
|
6851
|
+
nonWhitespaceChars: visible.reduce((sum, it) => sum + it.text.replace(/\s/g, "").length, 0),
|
|
6852
|
+
visibleItems: visible.length
|
|
6853
|
+
});
|
|
6854
|
+
lastParsedPage2 = i;
|
|
6399
6855
|
if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
|
|
6400
6856
|
parsedPages++;
|
|
6401
6857
|
options?.onProgress?.(parsedPages, totalTarget);
|
|
6402
6858
|
} catch (pageErr) {
|
|
6403
6859
|
if (pageErr instanceof KordocError) throw pageErr;
|
|
6860
|
+
if (!failedPages.includes(i)) failedPages.push(i);
|
|
6404
6861
|
warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
6405
6862
|
}
|
|
6406
6863
|
};
|
|
@@ -6417,8 +6874,21 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6417
6874
|
for (const si of sampledIndices) {
|
|
6418
6875
|
await parseSinglePage(targetPageNums[si]);
|
|
6419
6876
|
}
|
|
6420
|
-
const
|
|
6421
|
-
const
|
|
6877
|
+
const sampledMetrics = [];
|
|
6878
|
+
for (const si of sampledIndices) {
|
|
6879
|
+
const pageNum = targetPageNums[si];
|
|
6880
|
+
const m = sampleMetricsByPage.get(pageNum);
|
|
6881
|
+
if (m) sampledMetrics.push(m);
|
|
6882
|
+
}
|
|
6883
|
+
const imageBasedDecision = estimateImageBasedPdf(sampledMetrics);
|
|
6884
|
+
const isImageBased = imageBasedDecision.isImageBased;
|
|
6885
|
+
logger.log({
|
|
6886
|
+
level: "info",
|
|
6887
|
+
stage: "probe",
|
|
6888
|
+
event: "done",
|
|
6889
|
+
message: "\uC774\uBBF8\uC9C0 \uAE30\uBC18 \uD310\uC815",
|
|
6890
|
+
meta: { isImageBased, reason: imageBasedDecision.reason, sampledPages: sampledMetrics.length }
|
|
6891
|
+
});
|
|
6422
6892
|
if (!isImageBased) {
|
|
6423
6893
|
for (let si = 0; si < targetPageNums.length; si++) {
|
|
6424
6894
|
if (!sampledIndices.has(si)) {
|
|
@@ -6426,11 +6896,41 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6426
6896
|
}
|
|
6427
6897
|
}
|
|
6428
6898
|
}
|
|
6899
|
+
const partialSummary = summarizePartialFailures(failedPages, totalTarget);
|
|
6900
|
+
if (partialSummary) {
|
|
6901
|
+
warnings.push({
|
|
6902
|
+
message: partialSummary,
|
|
6903
|
+
code: "PARTIAL_PARSE"
|
|
6904
|
+
});
|
|
6905
|
+
}
|
|
6906
|
+
if (isImageBased) {
|
|
6907
|
+
warnings.push({
|
|
6908
|
+
message: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 \uD310\uC815: ${imageBasedDecision.reason}`,
|
|
6909
|
+
code: "OCR_FALLBACK"
|
|
6910
|
+
});
|
|
6911
|
+
}
|
|
6912
|
+
const partialPolicy = shouldAbortForPartialFailures(
|
|
6913
|
+
failedPages,
|
|
6914
|
+
totalTarget,
|
|
6915
|
+
options?.maxPartialFailureRatio
|
|
6916
|
+
);
|
|
6917
|
+
if (partialPolicy.abort) {
|
|
6918
|
+
throw new KordocError(
|
|
6919
|
+
`\uBD80\uBD84 \uD30C\uC2F1 \uC2E4\uD328 \uBE44\uC728 \uCD08\uACFC: ${(partialPolicy.ratio * 100).toFixed(1)}% (\uD5C8\uC6A9 ${(partialPolicy.threshold * 100).toFixed(1)}%)`
|
|
6920
|
+
);
|
|
6921
|
+
}
|
|
6429
6922
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
6430
6923
|
if (isImageBased) {
|
|
6431
6924
|
const ocrMode = options?.ocrMode ?? "auto";
|
|
6432
6925
|
const concurrency = options?.ocrConcurrency ?? 1;
|
|
6433
6926
|
const batchSize = options?.ocrBatchSize;
|
|
6927
|
+
logger.log({
|
|
6928
|
+
level: "info",
|
|
6929
|
+
stage: "ocr",
|
|
6930
|
+
event: "start",
|
|
6931
|
+
message: "\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF OCR \uC2DC\uC791",
|
|
6932
|
+
meta: { ocrMode, concurrency, batchSize, totalTarget }
|
|
6933
|
+
});
|
|
6434
6934
|
if (ocrMode === "off") {
|
|
6435
6935
|
throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
|
|
6436
6936
|
}
|
|
@@ -6438,8 +6938,10 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6438
6938
|
const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
|
|
6439
6939
|
const tryProvider = async (provider, filter) => {
|
|
6440
6940
|
try {
|
|
6941
|
+
logger.log({ level: "debug", stage: "ocr", event: "progress", message: "OCR provider \uC2E4\uD589", meta: { filteredPages: filter?.size } });
|
|
6441
6942
|
return await ocrPages2(doc, provider, filter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
6442
6943
|
} catch {
|
|
6944
|
+
logger.log({ level: "warn", stage: "ocr", event: "progress", message: "OCR provider \uC2E4\uD589 \uC2E4\uD328(\uBE48 \uACB0\uACFC\uB85C \uCC98\uB9AC)" });
|
|
6443
6945
|
return [];
|
|
6444
6946
|
} finally {
|
|
6445
6947
|
const terminable = provider;
|
|
@@ -6462,6 +6964,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6462
6964
|
for (const mode of getAutoFallbackChain2()) {
|
|
6463
6965
|
if (pendingPages.size === 0) break;
|
|
6464
6966
|
try {
|
|
6967
|
+
logger.log({ level: "info", stage: "ocr", event: "progress", message: "OCR \uC5D4\uC9C4 \uC2DC\uB3C4", meta: { mode, pendingPages: pendingPages.size } });
|
|
6465
6968
|
const modeFilter = pendingPages.size < effectivePageCount ? new Set(pendingPages) : pageFilter;
|
|
6466
6969
|
const provider = await resolveOcrProvider2(mode, warnings, concurrency, batchSize);
|
|
6467
6970
|
const blocks2 = await tryProvider(provider, modeFilter);
|
|
@@ -6476,10 +6979,20 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6476
6979
|
code: "OCR_CLI_FALLBACK"
|
|
6477
6980
|
});
|
|
6478
6981
|
}
|
|
6982
|
+
logger.log({ level: "info", stage: "ocr", event: "progress", message: "OCR \uC5D4\uC9C4 \uCC98\uB9AC \uC644\uB8CC", meta: { mode, blocks: blocks2.length, pendingPages: pendingPages.size } });
|
|
6479
6983
|
} else {
|
|
6480
6984
|
warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
|
|
6985
|
+
logger.log({ level: "warn", stage: "ocr", event: "progress", message: "OCR \uC5D4\uC9C4 \uACB0\uACFC \uC5C6\uC74C", meta: { mode } });
|
|
6481
6986
|
}
|
|
6482
|
-
} catch {
|
|
6987
|
+
} catch (engineErr) {
|
|
6988
|
+
logger.log({
|
|
6989
|
+
level: "warn",
|
|
6990
|
+
stage: "ocr",
|
|
6991
|
+
event: "progress",
|
|
6992
|
+
message: "OCR \uC5D4\uC9C4 \uCD08\uAE30\uD654/\uC2E4\uD589 \uC2E4\uD328",
|
|
6993
|
+
meta: { mode },
|
|
6994
|
+
error: { message: engineErr instanceof Error ? engineErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: engineErr instanceof Error ? engineErr.name : "Error" }
|
|
6995
|
+
});
|
|
6483
6996
|
}
|
|
6484
6997
|
}
|
|
6485
6998
|
allOcrBlocks.sort((a, b) => (a.pageNumber ?? 0) - (b.pageNumber ?? 0));
|
|
@@ -6497,6 +7010,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6497
7010
|
}
|
|
6498
7011
|
if (ocrBlocks.length > 0) {
|
|
6499
7012
|
const ocrMarkdown = blocksToMarkdown(ocrBlocks);
|
|
7013
|
+
logger.log({ level: "info", stage: "ocr", event: "done", message: "\uC774\uBBF8\uC9C0 \uAE30\uBC18 OCR \uC644\uB8CC", meta: { blocks: ocrBlocks.length } });
|
|
6500
7014
|
return {
|
|
6501
7015
|
markdown: ocrMarkdown,
|
|
6502
7016
|
blocks: ocrBlocks,
|
|
@@ -6522,8 +7036,25 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6522
7036
|
}
|
|
6523
7037
|
detectMarkerHeadings(blocks);
|
|
6524
7038
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
6525
|
-
let markdown = cleanPdfText(blocksToMarkdown(blocks));
|
|
7039
|
+
let markdown = cleanPdfText(blocksToMarkdown(blocks), options?.pdfTextNormalization ?? "default");
|
|
7040
|
+
logger.log({
|
|
7041
|
+
level: "info",
|
|
7042
|
+
stage: "finalize",
|
|
7043
|
+
event: "done",
|
|
7044
|
+
message: "PDF \uD30C\uC2F1 \uC644\uB8CC",
|
|
7045
|
+
meta: { blocks: blocks.length, warnings: warnings.length, outline: outline.length, isImageBased: false }
|
|
7046
|
+
});
|
|
6526
7047
|
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
7048
|
+
} catch (err) {
|
|
7049
|
+
logger.log({
|
|
7050
|
+
level: "error",
|
|
7051
|
+
stage: "finalize",
|
|
7052
|
+
event: "error",
|
|
7053
|
+
message: "PDF \uD30C\uC2F1 \uC2E4\uD328",
|
|
7054
|
+
meta: { lastParsedPage },
|
|
7055
|
+
error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
|
|
7056
|
+
});
|
|
7057
|
+
throw err;
|
|
6527
7058
|
} finally {
|
|
6528
7059
|
await doc.destroy().catch(() => {
|
|
6529
7060
|
});
|
|
@@ -6617,6 +7148,17 @@ function shouldDemoteTable(table) {
|
|
|
6617
7148
|
const emptyCells = totalCells - allCells.length;
|
|
6618
7149
|
if (table.rows <= 2 && emptyCells > totalCells * 0.5) return true;
|
|
6619
7150
|
if (table.rows === 1 && !/\d{2,}/.test(allText)) return true;
|
|
7151
|
+
if (table.cols >= 3 && table.rows <= 4) {
|
|
7152
|
+
const markerCells = allCells.filter((t) => /^[□■◆○●▶▷◇◆]/.test(t)).length;
|
|
7153
|
+
const numericCells = allCells.filter((t) => /\d/.test(t)).length;
|
|
7154
|
+
if (markerCells >= Math.max(1, Math.floor(allCells.length * 0.35)) && numericCells <= Math.floor(allCells.length * 0.15)) {
|
|
7155
|
+
return true;
|
|
7156
|
+
}
|
|
7157
|
+
}
|
|
7158
|
+
if (table.cols >= 3 && table.rows >= 2) {
|
|
7159
|
+
const sparseRows = table.cells.filter((row) => row.filter((c) => c.text.trim()).length <= 1).length;
|
|
7160
|
+
if (sparseRows >= Math.ceil(table.rows * 0.7)) return true;
|
|
7161
|
+
}
|
|
6620
7162
|
return false;
|
|
6621
7163
|
}
|
|
6622
7164
|
function demoteTableToText(table) {
|
|
@@ -7172,10 +7714,15 @@ function mergeLineSimple(items) {
|
|
|
7172
7714
|
}
|
|
7173
7715
|
return result;
|
|
7174
7716
|
}
|
|
7175
|
-
function
|
|
7176
|
-
return
|
|
7177
|
-
|
|
7178
|
-
|
|
7717
|
+
/**
 * Remove page-number artifacts that PDF text extraction leaves behind.
 *
 * Four patterns are stripped, in order:
 *   1. dash-decorated numbers on their own line, e.g. "- 3 -" or "—12—";
 *   2. "current / total" lines, e.g. "3 / 10";
 *   3. bare 1-4 digit lines that sit between two newlines;
 *   4. a bare 1-4 digit line at the very end of the text.
 *
 * Patterns 1 and 2 blank the line but keep its surrounding newlines;
 * patterns 3 and 4 remove the line together with its leading newline.
 *
 * @param {string} text - Raw text/markdown produced from PDF blocks.
 * @returns {string} The text with page-number lines removed.
 */
function stripPdfPageNumberArtifacts(text) {
  return text
    .replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "")
    .replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "")
    // The closing newline is asserted with a lookahead instead of being
    // consumed, so it can also serve as the opening newline of the next
    // match. Consuming it left every second line of a run of consecutive
    // bare-number lines in place.
    .replace(/\n\d{1,4}(?=\n)/g, "")
    .replace(/\n\d{1,4}$/, "");
}
|
|
7720
|
+
/**
 * Normalize markdown text produced from a PDF.
 *
 * Page-number artifacts are always stripped first. After that:
 *   - "strict-preserve" only caps runs of 4+ newlines at three and trims;
 *   - any other mode merges Korean line breaks, collapses evenly spaced
 *     short lines (3-30 chars, not starting with "|"), caps runs of 3+
 *     newlines at two, and trims.
 *
 * @param {string} text - Markdown text to clean.
 * @param {string} [mode="default"] - Normalization mode.
 * @returns {string} The cleaned text.
 */
function cleanPdfText(text, mode = "default") {
  const withoutPageNumbers = stripPdfPageNumberArtifacts(text);
  if (mode !== "strict-preserve") {
    const merged = mergeKoreanLines(withoutPageNumbers);
    // Lines starting with "|" are skipped by the negative lookahead.
    const respaced = merged.replace(/^(?!\|).{3,30}$/gm, (line) => collapseEvenSpacing(line));
    return respaced.replace(/\n{3,}/g, "\n\n").trim();
  }
  return withoutPageNumbers.replace(/\n{4,}/g, "\n\n\n").trim();
}
|
|
7180
7727
|
function startsWithMarker(line) {
|
|
7181
7728
|
const t = line.trimStart();
|
|
@@ -7379,6 +7926,7 @@ function mergeKoreanLines(text) {
|
|
|
7379
7926
|
// src/xlsx/parser.ts
|
|
7380
7927
|
var import_jszip3 = __toESM(require("jszip"), 1);
|
|
7381
7928
|
var import_xmldom2 = require("@xmldom/xmldom");
|
|
7929
|
+
init_logger();
|
|
7382
7930
|
var MAX_SHEETS = 100;
|
|
7383
7931
|
var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
|
|
7384
7932
|
var MAX_ROWS2 = 1e4;
|
|
@@ -7568,105 +8116,145 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
7568
8116
|
return blocks;
|
|
7569
8117
|
}
|
|
7570
8118
|
/**
 * Parse an XLSX workbook into markdown plus structured blocks.
 *
 * Steps: size pre-check, open the zip (or reuse `existingZip`), read shared
 * strings, the workbook sheet list and the workbook relationships, convert
 * each selected sheet to blocks, then read optional core metadata.
 *
 * A sheet that is missing from the archive or fails to parse does not abort
 * the run; it is recorded as a "PARTIAL_PARSE" warning. At most MAX_SHEETS
 * sheets are visited, and processing stops once the accumulated
 * maxRow * maxCol exceeds MAX_TOTAL_CELLS.
 *
 * @param {ArrayBuffer} buffer - Raw XLSX bytes (only `byteLength` is read here directly).
 * @param {object} [options] - `pages` (sheet range passed to parsePageRange) and
 *   `onProgress(sheetNumber, processedSheets)` are the options read here.
 * @param {object} [existingZip] - Already loaded JSZip instance to reuse.
 * @returns {Promise<{markdown: string, blocks: Array, metadata: object, warnings: (Array|undefined)}>}
 * @throws {KordocError} When xl/workbook.xml is missing or the workbook lists no sheets.
 */
async function parseXlsxDocument(buffer, options, existingZip) {
  const logger = createLoggerFromEnv().child({ component: "xlsx/parser.ts", stage: "detect" });
  logger.log({ level: "info", event: "start", message: "XLSX \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
  // 1-based index of the last sheet converted successfully; reported if the run fails.
  let lastProcessedSheet = 0;
  try {
    precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
    const zip = existingZip ?? await import_jszip3.default.loadAsync(buffer);
    const warnings = [];
    const workbookFile = zip.file("xl/workbook.xml");
    if (!workbookFile) {
      throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
    }
    let sharedStrings = [];
    const ssFile = zip.file("xl/sharedStrings.xml");
    if (ssFile) {
      sharedStrings = parseSharedStrings(await ssFile.async("text"));
    }
    const sheets = parseWorkbook(await workbookFile.async("text"));
    if (sheets.length === 0) {
      throw new KordocError("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
    }
    logger.log({ level: "debug", event: "progress", message: "\uC2DC\uD2B8 \uBAA9\uB85D \uB85C\uB4DC", meta: { sheets: sheets.length } });
    let relsMap = /* @__PURE__ */ new Map();
    const relsFile = zip.file("xl/_rels/workbook.xml.rels");
    if (relsFile) {
      relsMap = parseRels(await relsFile.async("text"));
    }
    let pageFilter = null;
    if (options?.pages) {
      const { parsePageRange: parsePageRange2 } = await Promise.resolve().then(() => (init_page_range(), page_range_exports));
      pageFilter = parsePageRange2(options.pages, sheets.length);
    }
    const blocks = [];
    const processedSheets = Math.min(sheets.length, MAX_SHEETS);
    let totalCells = 0;
    for (let i = 0; i < processedSheets; i++) {
      if (pageFilter && !pageFilter.has(i + 1)) continue;
      const sheet = sheets[i];
      options?.onProgress?.(i + 1, processedSheets);
      // Resolve the sheet part path: relationship targets are made relative to
      // the archive root; without a relationship fall back to the conventional name.
      let sheetPath = relsMap.get(sheet.rId);
      if (sheetPath) {
        if (!sheetPath.startsWith("xl/") && !sheetPath.startsWith("/")) {
          sheetPath = `xl/${sheetPath}`;
        } else if (sheetPath.startsWith("/")) {
          sheetPath = sheetPath.slice(1);
        }
      } else {
        sheetPath = `xl/worksheets/sheet${i + 1}.xml`;
      }
      const sheetFile = zip.file(sheetPath);
      if (!sheetFile) {
        warnings.push({
          page: i + 1,
          message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${sheetPath}`,
          code: "PARTIAL_PARSE"
        });
        continue;
      }
      try {
        const sheetXml = await sheetFile.async("text");
        const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
        totalCells += maxRow * maxCol;
        if (totalCells > MAX_TOTAL_CELLS) {
          // The sheet that crosses the limit is dropped along with all later ones.
          warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
          break;
        }
        const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
        blocks.push(...sheetBlocks);
        logger.log({
          level: "debug",
          stage: "convert",
          event: "progress",
          message: "\uC2DC\uD2B8 \uD30C\uC2F1 \uC644\uB8CC",
          meta: { sheet: sheet.name, index: i + 1, processedSheets }
        });
        lastProcessedSheet = i + 1;
      } catch (err) {
        warnings.push({
          page: i + 1,
          message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC2F1 \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
          code: "PARTIAL_PARSE"
        });
        logger.log({
          level: "warn",
          stage: "convert",
          event: "progress",
          message: "\uC2DC\uD2B8 \uD30C\uC2F1 \uC2E4\uD328",
          meta: { sheet: sheet.name, index: i + 1 },
          error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error" }
        });
      }
    }
    const metadata = {
      pageCount: processedSheets
    };
    const coreFile = zip.file("docProps/core.xml");
    if (coreFile) {
      try {
        const coreXml = await coreFile.async("text");
        const doc = parseXml(coreXml);
        const getFirst = (tag) => {
          const els = doc.getElementsByTagName(tag);
          return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
        };
        metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
        metadata.author = getFirst("dc:creator");
        metadata.description = getFirst("dc:description");
        const created = getFirst("dcterms:created");
        if (created) metadata.createdAt = created;
        const modified = getFirst("dcterms:modified");
        if (modified) metadata.modifiedAt = modified;
      } catch (metaErr) {
        // Core metadata stays best-effort, but the failure is now traceable
        // instead of being swallowed without a trace.
        logger.log({
          level: "debug",
          stage: "finalize",
          event: "progress",
          message: "docProps/core.xml \uD30C\uC2F1 \uC2E4\uD328",
          error: { message: metaErr instanceof Error ? metaErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: metaErr instanceof Error ? metaErr.name : "Error" }
        });
      }
    }
    const markdown = blocksToMarkdown(blocks);
    logger.log({
      level: "info",
      stage: "finalize",
      event: "done",
      message: "XLSX \uD30C\uC2F1 \uC644\uB8CC",
      meta: { blocks: blocks.length, warnings: warnings.length, pageCount: processedSheets }
    });
    return { markdown, blocks, metadata, warnings: warnings.length > 0 ? warnings : void 0 };
  } catch (err) {
    // Log with the last successfully converted sheet, then rethrow the original error.
    logger.log({
      level: "error",
      stage: "finalize",
      event: "error",
      message: "XLSX \uD30C\uC2F1 \uC2E4\uD328",
      meta: { lastProcessedSheet },
      error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
    });
    throw err;
  }
}
|
|
7666
8253
|
|
|
7667
8254
|
// src/docx/parser.ts
|
|
7668
8255
|
var import_jszip4 = __toESM(require("jszip"), 1);
|
|
7669
8256
|
var import_xmldom3 = require("@xmldom/xmldom");
|
|
8257
|
+
init_logger();
|
|
7670
8258
|
var MAX_DECOMPRESS_SIZE4 = 500 * 1024 * 1024;
|
|
7671
8259
|
function getChildElements(parent, localName) {
|
|
7672
8260
|
const result = [];
|
|
@@ -8028,101 +8616,127 @@ async function extractImages(zip, rels, doc) {
|
|
|
8028
8616
|
return { blocks, images };
|
|
8029
8617
|
}
|
|
8030
8618
|
/**
 * Parse a DOCX document into markdown plus structured blocks.
 *
 * Steps: size pre-check, open the zip (or reuse `existingZip`), load the
 * optional relationship/style/numbering/footnote parts, walk the direct
 * children of the body converting paragraphs and tables to blocks, extract
 * images, then read optional core metadata.
 *
 * Styles, numbering, footnotes and core metadata are best-effort: a parse
 * failure in one of them leaves the corresponding value empty and is logged
 * at debug level rather than aborting the run.
 *
 * NOTE(review): `options` is not read in this body, and the image blocks
 * returned by extractImages are only counted in a log entry, not appended to
 * `blocks` — confirm both are intended.
 *
 * @param {ArrayBuffer} buffer - Raw DOCX bytes (only `byteLength` is read here directly).
 * @param {object} [options] - Parse options.
 * @param {object} [existingZip] - Already loaded JSZip instance to reuse.
 * @returns {Promise<{markdown: string, blocks: Array, metadata: object, outline: (Array|undefined), warnings: (Array|undefined), images: (Array|undefined)}>}
 * @throws {KordocError} When word/document.xml or the document body is missing.
 */
async function parseDocxDocument(buffer, options, existingZip) {
  const logger = createLoggerFromEnv().child({ component: "docx/parser.ts", stage: "detect" });
  logger.log({ level: "info", event: "start", message: "DOCX \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
  // Records a failure in an optional part so that degraded output is traceable.
  const logOptionalPartFailure = (part, partErr) => {
    logger.log({
      level: "debug",
      event: "progress",
      message: `${part} \uD30C\uC2F1 \uC2E4\uD328`,
      error: { message: partErr instanceof Error ? partErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: partErr instanceof Error ? partErr.name : "Error" }
    });
  };
  // 1-based index of the last body child node visited; reported if the run fails.
  let lastProcessedNode = 0;
  try {
    precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
    const zip = existingZip ?? await import_jszip4.default.loadAsync(buffer);
    const warnings = [];
    const docFile = zip.file("word/document.xml");
    if (!docFile) {
      throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
    }
    let rels = /* @__PURE__ */ new Map();
    const relsFile = zip.file("word/_rels/document.xml.rels");
    if (relsFile) {
      rels = parseRels2(await relsFile.async("text"));
    }
    let styles = /* @__PURE__ */ new Map();
    const stylesFile = zip.file("word/styles.xml");
    if (stylesFile) {
      try {
        styles = parseStyles(await stylesFile.async("text"));
      } catch (stylesErr) {
        logOptionalPartFailure("word/styles.xml", stylesErr);
      }
    }
    let numbering = /* @__PURE__ */ new Map();
    const numFile = zip.file("word/numbering.xml");
    if (numFile) {
      try {
        numbering = parseNumbering(await numFile.async("text"));
      } catch (numErr) {
        logOptionalPartFailure("word/numbering.xml", numErr);
      }
    }
    let footnotes = /* @__PURE__ */ new Map();
    const fnFile = zip.file("word/footnotes.xml");
    if (fnFile) {
      try {
        footnotes = parseFootnotes(await fnFile.async("text"));
      } catch (fnErr) {
        logOptionalPartFailure("word/footnotes.xml", fnErr);
      }
    }
    const docXml = await docFile.async("text");
    const doc = parseXml2(docXml);
    const body = findElements(doc, "body");
    if (body.length === 0) {
      throw new KordocError("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
    }
    const blocks = [];
    const bodyEl = body[0];
    const children = bodyEl.childNodes;
    for (let i = 0; i < children.length; i++) {
      const node = children[i];
      // nodeType 1 is an element node; text and comment nodes are skipped.
      if (node.nodeType !== 1) continue;
      const el = node;
      const localName = el.localName ?? el.tagName?.split(":").pop();
      if (localName === "p") {
        const block = parseParagraph(el, styles, numbering, footnotes, rels);
        if (block) blocks.push(block);
      } else if (localName === "tbl") {
        const block = parseTable(el, styles, numbering, footnotes, rels);
        if (block) blocks.push(block);
      }
      lastProcessedNode = i + 1;
    }
    logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uBCF8\uBB38 \uBE14\uB85D \uD30C\uC2F1 \uC644\uB8CC", meta: { blocks: blocks.length } });
    const { blocks: imgBlocks, images } = await extractImages(zip, rels, doc);
    logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uC774\uBBF8\uC9C0 \uCD94\uCD9C \uC644\uB8CC", meta: { imageBlocks: imgBlocks.length, images: images.length } });
    const metadata = {};
    const coreFile = zip.file("docProps/core.xml");
    if (coreFile) {
      try {
        const coreXml = await coreFile.async("text");
        const coreDoc = parseXml2(coreXml);
        const getFirst = (tag) => {
          const els = coreDoc.getElementsByTagName(tag);
          return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
        };
        metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
        metadata.author = getFirst("dc:creator");
        metadata.description = getFirst("dc:description");
        const created = getFirst("dcterms:created");
        if (created) metadata.createdAt = created;
        const modified = getFirst("dcterms:modified");
        if (modified) metadata.modifiedAt = modified;
      } catch (metaErr) {
        logOptionalPartFailure("docProps/core.xml", metaErr);
      }
    }
    // Headings without an explicit level default to 2; a missing text becomes "".
    const outline = blocks.filter((b) => b.type === "heading").map((b) => ({ level: b.level ?? 2, text: b.text ?? "" }));
    const markdown = blocksToMarkdown(blocks);
    logger.log({
      level: "info",
      stage: "finalize",
      event: "done",
      message: "DOCX \uD30C\uC2F1 \uC644\uB8CC",
      meta: { blocks: blocks.length, warnings: warnings.length, outline: outline.length, images: images.length }
    });
    return {
      markdown,
      blocks,
      metadata,
      outline: outline.length > 0 ? outline : void 0,
      warnings: warnings.length > 0 ? warnings : void 0,
      images: images.length > 0 ? images : void 0
    };
  } catch (err) {
    // Log with the last visited body node, then rethrow the original error.
    logger.log({
      level: "error",
      stage: "finalize",
      event: "error",
      message: "DOCX \uD30C\uC2F1 \uC2E4\uD328",
      meta: { lastProcessedNode },
      error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
    });
    throw err;
  }
}
|
|
8121
8734
|
|
|
8122
8735
|
// src/index.ts
|
|
8123
8736
|
init_cli_provider();
|
|
8124
8737
|
init_tesseract_provider();
|
|
8125
8738
|
init_markdown_to_blocks();
|
|
8739
|
+
init_logger();
|
|
8126
8740
|
|
|
8127
8741
|
// src/diff/text-diff.ts
|
|
8128
8742
|
function similarity(a, b) {
|
|
@@ -10621,15 +11235,726 @@ async function markdownToXlsx(markdown, options) {
|
|
|
10621
11235
|
return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
|
|
10622
11236
|
}
|
|
10623
11237
|
|
|
11238
|
+
// src/ocr/api-key-rotation.ts
|
|
11239
|
+
/**
 * Error thrown when every API key in the pool is cooling down.
 * `waitMs` carries the number of milliseconds given to the constructor,
 * which is also embedded in the message.
 */
var AllKeysCoolingDownError = class extends Error {
  waitMs;
  /**
   * @param {number} waitMs - Milliseconds the caller should wait before retrying.
   */
  constructor(waitMs) {
    const notice = `\uBAA8\uB4E0 API \uD0A4\uAC00 cooldown \uC0C1\uD0DC\uC785\uB2C8\uB2E4. ${waitMs}ms \uD6C4 \uC7AC\uC2DC\uB3C4\uD558\uC138\uC694.`;
    super(notice);
    this.waitMs = waitMs;
    this.name = "AllKeysCoolingDownError";
  }
};
|
|
11247
|
+
/**
 * Round-robin pool of API keys with a per-key cooldown.
 *
 * `acquire` hands out the next key that is not cooling down, starting after
 * the key handed out last. `markFailure` puts a key on an exponentially
 * growing cooldown when the failure is retryable (timeout, HTTP 429 or 5xx);
 * `markSuccess` clears the cooldown and the consecutive-failure streak.
 */
var ApiKeyRotationPool = class _ApiKeyRotationPool {
  states;
  baseCooldownMs;
  maxCooldownMs;
  // Index of the key handed out last; -1 so that the first acquire starts at index 0.
  cursor = -1;
  /**
   * @param {string[]} keys - API keys; entries are trimmed and blank ones dropped.
   * @param {{baseCooldownMs?: number, maxCooldownMs?: number}} [options]
   *   Cooldown bounds in ms (defaults: 5000 and 120000).
   * @throws {Error} When no non-blank key remains.
   */
  constructor(keys, options = {}) {
    const normalized = keys.map((k) => k.trim()).filter(Boolean);
    if (normalized.length === 0) {
      throw new Error("API \uD0A4\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
    }
    this.states = normalized.map((key, idx) => ({
      key,
      keyId: `key_${idx + 1}`,
      totalRequests: 0,
      successCount: 0,
      failureCount: 0,
      consecutiveFailures: 0
    }));
    this.baseCooldownMs = options.baseCooldownMs ?? 5e3;
    this.maxCooldownMs = options.maxCooldownMs ?? 12e4;
  }
  /**
   * Build a pool from NVIDIA_API_KEYS (comma separated), falling back to
   * the single NVIDIA_API_KEY.
   *
   * @param {object} [env=process.env] - Environment map to read from.
   * @returns {ApiKeyRotationPool}
   * @throws {Error} When neither variable yields a key.
   */
  static fromEnv(env = process.env) {
    const multi = (env.NVIDIA_API_KEYS || "").split(",").map((v) => v.trim()).filter(Boolean);
    if (multi.length > 0) return new _ApiKeyRotationPool(multi);
    const single = (env.NVIDIA_API_KEY || "").trim();
    if (single) return new _ApiKeyRotationPool([single]);
    throw new Error("NVIDIA_API_KEYS \uB610\uB294 NVIDIA_API_KEY \uD658\uACBD\uBCC0\uC218\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4.");
  }
  /**
   * Hand out the next available key in round-robin order.
   *
   * @param {number} [now=Date.now()] - Current time in epoch ms.
   * @returns {{key: string, keyId: string}}
   * @throws {AllKeysCoolingDownError} When every key is cooling down; `waitMs`
   *   is the time until the earliest cooldown ends.
   */
  acquire(now = Date.now()) {
    const n = this.states.length;
    for (let step = 1; step <= n; step++) {
      const idx = (this.cursor + step) % n;
      const s = this.states[idx];
      if (!s.cooldownUntil || s.cooldownUntil <= now) {
        this.cursor = idx;
        s.totalRequests++;
        s.lastUsedAt = now;
        return { key: s.key, keyId: s.keyId };
      }
    }
    const minCooldownUntil = this.states.map((s) => s.cooldownUntil ?? now).reduce((min, v) => Math.min(min, v), Number.POSITIVE_INFINITY);
    throw new AllKeysCoolingDownError(Math.max(0, minCooldownUntil - now));
  }
  /**
   * Record a successful request: clears the failure streak and any cooldown.
   *
   * @param {string} keyId - Identifier returned by `acquire`.
   * @throws {Error} When `keyId` is unknown.
   */
  markSuccess(keyId) {
    const s = this.find(keyId);
    s.successCount++;
    s.consecutiveFailures = 0;
    s.cooldownUntil = void 0;
  }
  /**
   * Record a failed request. Non-retryable failures only update the counters;
   * retryable ones start a cooldown of
   * max(min(base * 2^(streak - 1), max), retryAfterMs).
   *
   * @param {string} keyId - Identifier returned by `acquire`.
   * @param {{status?: number, timeout?: boolean, retryAfterMs?: number}} [opts]
   * @param {number} [now=Date.now()] - Current time in epoch ms.
   * @throws {Error} When `keyId` is unknown.
   */
  markFailure(keyId, opts = {}, now = Date.now()) {
    const s = this.find(keyId);
    s.failureCount++;
    s.consecutiveFailures++;
    const retryable = this.isRetryableFailure(opts.status, opts.timeout);
    if (!retryable) return;
    const exp = Math.max(0, s.consecutiveFailures - 1);
    const backoff = Math.min(this.baseCooldownMs * 2 ** exp, this.maxCooldownMs);
    // A non-finite hint (e.g. NaN from an unparsable Retry-After value) must
    // not reach Math.max: NaN would make cooldownUntil NaN, which `acquire`
    // treats as "no cooldown", and Infinity would lock the key out forever.
    const hintedMs = Number(opts.retryAfterMs ?? 0);
    const cooldown = Math.max(backoff, Number.isFinite(hintedMs) ? hintedMs : 0);
    s.cooldownUntil = now + cooldown;
  }
  /**
   * @returns {Array<object>} Per-key statistics without the key material itself.
   */
  snapshot() {
    return this.states.map((s) => ({
      keyId: s.keyId,
      totalRequests: s.totalRequests,
      successCount: s.successCount,
      failureCount: s.failureCount,
      consecutiveFailures: s.consecutiveFailures,
      lastUsedAt: s.lastUsedAt,
      cooldownUntil: s.cooldownUntil
    }));
  }
  /**
   * @param {number} [status] - HTTP status of the failed request.
   * @param {boolean} [timeout] - Whether the request timed out.
   * @returns {boolean} True for timeouts, HTTP 429 and any status >= 500.
   */
  isRetryableFailure(status, timeout) {
    if (timeout) return true;
    if (status === 429) return true;
    if (typeof status === "number" && status >= 500) return true;
    return false;
  }
  /**
   * @param {string} keyId
   * @returns {object} The mutable state record for `keyId`.
   * @throws {Error} When `keyId` is unknown.
   */
  find(keyId) {
    const s = this.states.find((v) => v.keyId === keyId);
    if (!s) throw new Error(`\uC54C \uC218 \uC5C6\uB294 keyId: ${keyId}`);
    return s;
  }
};
|
|
11330
|
+
|
|
11331
|
+
// src/pipeline/unified-ocr.ts
|
|
11332
|
+
var import_promises2 = require("fs/promises");
|
|
11333
|
+
var import_path5 = require("path");
|
|
11334
|
+
var import_child_process4 = require("child_process");
|
|
11335
|
+
var import_libreoffice_convert = __toESM(require("libreoffice-convert"), 1);
|
|
11336
|
+
init_logger();
|
|
11337
|
+
var libreConvert = import_libreoffice_convert.default.convert;
|
|
11338
|
+
/**
 * Error raised by the unified OCR pipeline. It carries the `code` and the
 * `stage` values given to the constructor alongside the message.
 */
var UnifiedOcrError = class extends Error {
  code;
  stage;
  /**
   * @param {string} code - Error code stored on `code`.
   * @param {string} stage - Pipeline stage stored on `stage`.
   * @param {string} message - Human-readable description.
   */
  constructor(code, stage, message) {
    super(message);
    Object.assign(this, { name: "UnifiedOcrError", code, stage });
  }
};
|
|
11348
|
+
// Model identifiers used by default, in list order.
var DEFAULT_MODELS = [
  "mistralai/mistral-medium-3-instruct",
  "moonshotai/kimi-k2.5",
  "moonshotai/kimi-k2-thinking",
  "moonshotai/kimi-k2-instruct",
  "moonshotai/kimi-k2-instruct-0905",
  "qwen/qwen3.5-122b-a10b",
  "qwen/qwen3.5-397b-a17b"
];

// Default max-token value for each default model identifier.
var DEFAULT_MODEL_MAX_TOKENS = {
  "mistralai/mistral-medium-3-instruct": 8192,
  "moonshotai/kimi-k2.5": 64000,
  "moonshotai/kimi-k2-thinking": 64000,
  "moonshotai/kimi-k2-instruct": 64000,
  "moonshotai/kimi-k2-instruct-0905": 64000,
  "qwen/qwen3.5-122b-a10b": 64000,
  "qwen/qwen3.5-397b-a17b": 64000
};

// Default weight of each pipeline stage; the six values add up to 100.
var DEFAULT_STAGE_WEIGHTS = {
  convert: 15,
  render: 20,
  probe: 5,
  ocr: 45,
  proofread: 10,
  merge: 5
};

// Prompt text for the OCR request (Korean, kept as escaped literals).
var OCR_PROMPT2 = "\uC774 \uC774\uBBF8\uC9C0 1\uC7A5\uC758 \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uC694\uC57D \uC5C6\uC774 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uCD94\uCD9C\uD558\uC138\uC694. \uC808\uB300\uB85C \uB0B4\uC6A9\uC744 \uCD94\uCE21\uD558\uAC70\uB098 \uBC14\uAFB8\uC9C0 \uB9C8\uC138\uC694.";

// Prompt text for the proofreading request: a heading line, a "rules" line
// and four bullet rules, joined with newlines.
var PROOFREAD_PROMPT = [
  "\uC544\uB798 Markdown\uC744 \uBE44\uD30C\uAD34 \uAD50\uC815\uB9CC \uC218\uD589\uD558\uC138\uC694.",
  "\uADDC\uCE59:",
  "- \uC0AC\uC2E4 \uCD94\uAC00/\uC0AD\uC81C/\uCD94\uCE21 \uAE08\uC9C0",
  "- \uC22B\uC790, \uB2E8\uC704, \uACE0\uC720\uBA85\uC0AC \uBCC0\uACBD \uAE08\uC9C0",
  "- \uC624\uD0C8\uC790, \uB744\uC5B4\uC4F0\uAE30, \uC904\uBC14\uAFC8, Markdown \uAD6C\uC870\uB9CC \uAD50\uC815",
  "- \uACB0\uACFC\uB294 Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825"
].join("\n");
|
|
11383
|
+
/**
 * Run the staged OCR pipeline for one document: convert to PDF (when the
 * input is not already a PDF), render pages to PNG, probe the candidate
 * models for speed, OCR every page, proofread every page, and merge the
 * proofread pages into one Markdown file. A JSON run report is written to
 * the workspace at the end.
 *
 * Options read by this function (all optional):
 * - `workspaceDir`: defaults to `<input dir>/<stem>_ocr_workspace`.
 * - `outputPath`: defaults to `<input dir>/<stem>.md`.
 * - `baseUrl`: chat-completions endpoint.
 * - `timeoutMs` (60000), `maxRetriesPerPage` (5), `dpi` (300).
 * - `modelCandidates`, `modelMaxTokens`, `stageWeights`: override the defaults.
 * - `runId`, `logger`: logging context; `onEvent`: progress callback.
 *
 * @param {string} inputPath - Path of the document to process.
 * @param {object} [options]
 * @returns {Promise<{outputPath: string, reportPath: string, selectedModel: string}>}
 * @throws {UnifiedOcrError} Failures inside the staged section are normalized,
 *   reported through `onEvent` and the logger, then rethrown. Failures during
 *   setup (cache load, key pool creation, directory creation) propagate as-is.
 */
async function runUnifiedOcrPipeline(inputPath, options = {}) {
  const absInput = import_path5.resolve(inputPath);
  const inputDir = import_path5.dirname(absInput);
  const stem = import_path5.basename(absInput, import_path5.extname(absInput));
  const workspaceDir = import_path5.resolve(options.workspaceDir ?? import_path5.join(inputDir, `${stem}_ocr_workspace`));
  const imagesDir = import_path5.join(workspaceDir, "images");
  const rawDir = import_path5.join(workspaceDir, "ocr", "raw");
  const proofDir = import_path5.join(workspaceDir, "ocr", "proofread");
  const diffDir = import_path5.join(workspaceDir, "ocr", "diff");
  const outputPath = import_path5.resolve(options.outputPath ?? import_path5.join(inputDir, `${stem}.md`));
  const reportPath = import_path5.join(workspaceDir, "run-report.json");
  // The probe-timing cache lives next to the input so later runs in the same
  // directory can try previously fast models first.
  const modelCachePath = import_path5.join(inputDir, ".kordoc-model-cache.json");
  const baseUrl = options.baseUrl ?? "https://integrate.api.nvidia.com/v1/chat/completions";
  const timeoutMs = options.timeoutMs ?? 6e4;
  const maxRetriesPerPage = options.maxRetriesPerPage ?? 5;
  const dpi = options.dpi ?? 300;
  const modelsInput = options.modelCandidates?.length ? options.modelCandidates : DEFAULT_MODELS;
  const modelCache = await loadModelCache(modelCachePath);
  const models = sortModelsByCache(modelsInput, modelCache);
  const modelMaxTokens = { ...DEFAULT_MODEL_MAX_TOKENS, ...(options.modelMaxTokens ?? {}) };
  const stageWeights = normalizeWeights({ ...DEFAULT_STAGE_WEIGHTS, ...(options.stageWeights ?? {}) });
  const keyPool = ApiKeyRotationPool.fromEnv();
  const runId = options.runId ?? generateRunId("ocr");
  const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
  await import_promises2.mkdir(imagesDir, { recursive: true });
  await import_promises2.mkdir(rawDir, { recursive: true });
  await import_promises2.mkdir(proofDir, { recursive: true });
  await import_promises2.mkdir(diffDir, { recursive: true });

  const timingsMs = {};
  const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
  const markStageProgress = (stage, stagePercent, current, total, message) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message });
  const markStageDone = (stage, message) => emitProgress(options.onEvent, stage, 100, stageWeights, { message, type: "stage_done" });
  const logStage = (level, stage, event, message, meta) => {
    logger.log({ level, stage, event, message, meta });
  };
  // Per-page artifact name, e.g. page_0001.md for the first page.
  const pageFileName = (index, ext) => `page_${String(index + 1).padStart(4, "0")}.${ext}`;
  // Tracks the running stage so the catch block can attribute failures.
  let currentStage = "convert";

  try {
    ensureSupportedInput(absInput);

    // Stage: convert. Non-PDF inputs go through LibreOffice into the workspace.
    let workingPdfPath = absInput;
    const convertStart = Date.now();
    currentStage = "convert";
    markStageStart("convert", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC911");
    logStage("info", "convert", "start", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC2DC\uC791", { input: absInput });
    if (import_path5.extname(absInput).toLowerCase() !== ".pdf") {
      await assertSofficeAvailable();
      workingPdfPath = import_path5.join(workspaceDir, `${stem}.pdf`);
      const inputBuffer = await import_promises2.readFile(absInput);
      const out = await convertWithLibreOffice(inputBuffer, ".pdf");
      await import_promises2.writeFile(workingPdfPath, out);
    }
    timingsMs.convert = Date.now() - convertStart;
    markStageDone("convert", "PDF \uBCC0\uD658 \uC644\uB8CC");
    logStage("info", "convert", "done", "PDF \uBCC0\uD658 \uC644\uB8CC", { elapsedMs: timingsMs.convert });

    // Stage: render. One PNG per page.
    const renderStart = Date.now();
    currentStage = "render";
    markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
    logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi });
    // A reused workspace may still hold page images from an earlier run (for
    // example of a longer version of the document). Remove them first so that
    // leftovers are not picked up and OCR'd as pages of this document.
    for (const stalePng of await listPageImages(imagesDir)) {
      await import_promises2.rm(stalePng, { force: true });
    }
    await renderPdfToPng(workingPdfPath, import_path5.join(imagesDir, "page"), dpi);
    const images = await listPageImages(imagesDir);
    if (images.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328: \uACB0\uACFC \uC774\uBBF8\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
    markStageProgress("render", 100, images.length, images.length, `\uD398\uC774\uC9C0 ${images.length}\uC7A5 \uC0DD\uC131`);
    timingsMs.render = Date.now() - renderStart;
    markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
    logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: images.length, elapsedMs: timingsMs.render });

    // Stage: probe. Time every candidate model on one representative page.
    const probeStart = Date.now();
    currentStage = "probe";
    markStageStart("probe", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC218\uD589 \uC911");
    logStage("info", "probe", "start", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC2DC\uC791", { models });
    const probeImage = await pickRepresentativeImage(images);
    const probeResults = [];
    for (let i = 0; i < models.length; i++) {
      const model = models[i];
      const t0 = Date.now();
      try {
        await ocrImageViaNim({
          imagePath: probeImage,
          prompt: OCR_PROMPT2,
          model,
          maxTokens: modelMaxTokens[model] ?? 8192,
          baseUrl,
          keyPool,
          timeoutMs,
          maxRetries: 2,
          logger,
          stage: "probe"
        });
        probeResults.push({ model, durationMs: Date.now() - t0, success: true });
      } catch (err) {
        // A failing candidate is recorded, not fatal; only "no model works" aborts.
        probeResults.push({
          model,
          durationMs: Date.now() - t0,
          success: false,
          error: err instanceof Error ? err.message : String(err)
        });
      }
      markStageProgress("probe", Math.round((i + 1) / models.length * 100), i + 1, models.length, `\uBAA8\uB378 \uD504\uB85C\uBE0C ${i + 1}/${models.length}`);
      logStage("debug", "probe", "progress", "\uBAA8\uB378 \uD504\uB85C\uBE0C \uC9C4\uD589", { index: i + 1, total: models.length, model, result: probeResults.at(-1) });
    }
    const selectedModel = chooseFastestModel(probeResults);
    if (!selectedModel) throw new UnifiedOcrError("PROBE_FAILED", "probe", "\uC18D\uB3C4 \uD504\uB85C\uBE0C \uC2E4\uD328: \uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uBAA8\uB378\uC774 \uC5C6\uC2B5\uB2C8\uB2E4.");
    // Models that answered the probe, fastest first; used as per-page fallback order.
    const fallbackModelOrder = probeResults.filter((r) => r.success).sort((a, b) => a.durationMs - b.durationMs).map((r) => r.model);
    timingsMs.probe = Date.now() - probeStart;
    await updateModelCache(modelCachePath, probeResults);
    markStageDone("probe", `\uD504\uB85C\uBE0C \uC644\uB8CC: ${selectedModel}`);
    logStage("info", "probe", "done", "\uBAA8\uB378 \uD504\uB85C\uBE0C \uC644\uB8CC", { selectedModel, probeResults, elapsedMs: timingsMs.probe, modelCachePath });

    // Stage: ocr. Pages are processed sequentially; raw Markdown is stored per page.
    const ocrStart = Date.now();
    currentStage = "ocr";
    markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (${selectedModel})`);
    logStage("info", "ocr", "start", "\uD398\uC774\uC9C0 OCR \uC2DC\uC791", { selectedModel, pageCount: images.length });
    const rawPagePaths = [];
    for (let i = 0; i < images.length; i++) {
      const imagePath = images[i];
      const markdown = await ocrImageWithFallback({
        imagePath,
        prompt: OCR_PROMPT2,
        models: fallbackModelOrder,
        modelMaxTokens,
        baseUrl,
        keyPool,
        timeoutMs,
        maxRetriesPerPage,
        logger
      });
      const pagePath = import_path5.join(rawDir, pageFileName(i, "md"));
      await import_promises2.writeFile(pagePath, markdown, "utf-8");
      rawPagePaths.push(pagePath);
      markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}`);
      logStage("debug", "ocr", "progress", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { page: i + 1, total: images.length });
    }
    timingsMs.ocr = Date.now() - ocrStart;
    markStageDone("ocr", "OCR \uC644\uB8CC");
    logStage("info", "ocr", "done", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { elapsedMs: timingsMs.ocr });

    // Stage: proofread. Text-only request per page with the selected model.
    const proofStart = Date.now();
    currentStage = "proofread";
    markStageStart("proofread", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC9C4\uD589 \uC911");
    logStage("info", "proofread", "start", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC2DC\uC791", { pages: rawPagePaths.length });
    const proofedPaths = [];
    for (let i = 0; i < rawPagePaths.length; i++) {
      const rawMd = await import_promises2.readFile(rawPagePaths[i], "utf-8");
      const prompt = `${PROOFREAD_PROMPT}\n\n---\n${rawMd}\n---`;
      const corrected = await ocrImageViaNim({
        textOnlyPrompt: prompt,
        model: selectedModel,
        maxTokens: modelMaxTokens[selectedModel] ?? 8192,
        baseUrl,
        keyPool,
        timeoutMs,
        maxRetries: maxRetriesPerPage,
        logger,
        stage: "proofread"
      });
      // Reject corrections that touched numbers, then flag suspiciously shortened output.
      const safeCorrected = preserveNumericIntegrity(rawMd, corrected);
      const taggedCorrected = addUncertainTag(rawMd, safeCorrected);
      const pagePath = import_path5.join(proofDir, pageFileName(i, "md"));
      await import_promises2.writeFile(pagePath, taggedCorrected, "utf-8");
      await import_promises2.writeFile(
        import_path5.join(diffDir, pageFileName(i, "json")),
        JSON.stringify(buildDiffSummary(rawMd, taggedCorrected), null, 2),
        "utf-8"
      );
      proofedPaths.push(pagePath);
      markStageProgress("proofread", Math.round((i + 1) / rawPagePaths.length * 100), i + 1, rawPagePaths.length, `\uAD50\uC815 ${i + 1}/${rawPagePaths.length}`);
      logStage("debug", "proofread", "progress", "\uD398\uC774\uC9C0 \uAD50\uC815 \uC644\uB8CC", { page: i + 1, total: rawPagePaths.length });
    }
    timingsMs.proofread = Date.now() - proofStart;
    markStageDone("proofread", "\uAD50\uC815 \uC644\uB8CC");
    logStage("info", "proofread", "done", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC644\uB8CC", { elapsedMs: timingsMs.proofread });

    // Stage: merge. Concatenate proofread pages into the final Markdown file.
    const mergeStart = Date.now();
    currentStage = "merge";
    markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
    logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: proofedPaths.length });
    const merged = await mergeMarkdownPages(proofedPaths);
    await import_promises2.writeFile(outputPath, merged, "utf-8");
    timingsMs.merge = Date.now() - mergeStart;
    markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
    logStage("info", "merge", "done", "\uCD5C\uC885 \uBCD1\uD569 \uC644\uB8CC", { outputPath, elapsedMs: timingsMs.merge });

    const report = {
      inputPath: absInput,
      outputPath,
      workspaceDir,
      selectedModel,
      probeImage,
      probeResults,
      pageCount: images.length,
      keyHealth: keyPool.snapshot(),
      timingsMs,
      modelCachePath
    };
    await import_promises2.writeFile(reportPath, JSON.stringify(report, null, 2), "utf-8");
    logStage("info", "finalize", "done", "run-report \uC800\uC7A5 \uC644\uB8CC", { reportPath });
    return { outputPath, reportPath, selectedModel };
  } catch (err) {
    const normalized = normalizePipelineError(err, currentStage);
    emitProgress(options.onEvent, currentStage, 0, stageWeights, {
      type: "error",
      code: normalized.code,
      message: normalized.message
    });
    logger.log({
      level: "error",
      stage: currentStage,
      event: "error",
      message: normalized.message,
      error: {
        code: normalized.code,
        name: normalized.name,
        message: normalized.message,
        stack: normalized.stack
      }
    });
    throw normalized;
  }
}
|
|
11599
|
+
/**
 * Scale the six stage weights so that they add up to 100.
 *
 * Only the known stage keys contribute to the total, so unrelated keys in
 * `weights` cannot dilute the result. A weight that is missing, non-finite
 * or negative counts as 0 instead of turning every share into NaN.
 *
 * @param {object} weights - Raw weights keyed by stage name.
 * @returns {{convert: number, render: number, probe: number, ocr: number, proofread: number, merge: number}}
 *   Shares per stage; all zeros when no stage has a positive weight.
 */
function normalizeWeights(weights) {
  const stages = ["convert", "render", "probe", "ocr", "proofread", "merge"];
  const usable = (value) => (typeof value === "number" && Number.isFinite(value) && value > 0 ? value : 0);
  // `|| 1` avoids dividing by zero when every weight is zero.
  const sum = stages.reduce((acc, stage) => acc + usable(weights[stage]), 0) || 1;
  const shares = {};
  for (const stage of stages) {
    shares[stage] = usable(weights[stage]) / sum * 100;
  }
  return shares;
}
|
|
11610
|
+
/**
 * Convert progress inside one stage into overall pipeline progress.
 *
 * Stages that come before `stage` in the fixed pipeline order count in full;
 * `stage` itself counts proportionally to `stagePercent` (clamped to 0..100).
 * A stage name outside the fixed order results in the sum of all weights.
 *
 * @param {string} stage - Stage currently running.
 * @param {number} stagePercent - Progress within that stage.
 * @param {object} weights - Weight per stage name.
 * @returns {number} Overall progress, rounded to an integer.
 */
function computeOverallPercent(stage, stagePercent, weights) {
  const pipelineOrder = ["convert", "render", "probe", "ocr", "proofread", "merge"];
  const position = pipelineOrder.indexOf(stage);
  const finished = position === -1 ? pipelineOrder : pipelineOrder.slice(0, position);
  let total = finished.reduce((acc, name) => acc + weights[name], 0);
  if (position !== -1) {
    const clamped = Math.max(0, Math.min(100, stagePercent));
    total += weights[stage] * clamped / 100;
  }
  return Math.round(total);
}
|
|
11622
|
+
/**
 * Build a progress event and hand it to the callback, if there is one.
 *
 * @param {Function|undefined} cb - Receiver of the event; nothing happens when falsy.
 * @param {string} stage - Stage the event belongs to.
 * @param {number} stagePercent - Progress within the stage; rounded and clamped to 0..100 in the event.
 * @param {object} weights - Stage weights used for the overall percentage.
 * @param {object} extra - Optional `type`, `current`, `total`, `code`, `message`.
 * @returns {void}
 */
function emitProgress(cb, stage, stagePercent, weights, extra) {
  if (!cb) return;
  const { current, total, code, message } = extra;
  const roundedStagePercent = Math.round(stagePercent);
  const event = {
    type: extra.type ?? "stage_progress",
    stage,
    stagePercent: Math.max(0, Math.min(100, roundedStagePercent)),
    overallPercent: computeOverallPercent(stage, stagePercent, weights),
    current,
    total,
    code,
    message
  };
  cb(event);
}
|
|
11635
|
+
/**
 * Promise wrapper around the callback-based `libreConvert`.
 *
 * @param {Buffer} buffer - Contents of the source document.
 * @param {string} ext - Target extension passed to the converter, e.g. ".pdf".
 * @returns {Promise<Buffer>} The converted document.
 * @throws {UnifiedOcrError} CONVERT_FAILED when the converter reports an error
 *   or produces no output.
 */
async function convertWithLibreOffice(buffer, ext) {
  return await new Promise((resolvePromise, reject) => {
    const onConverted = (err, done) => {
      if (!err && done) {
        resolvePromise(done);
        return;
      }
      reject(new UnifiedOcrError("CONVERT_FAILED", "convert", err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"));
    };
    libreConvert(buffer, ext, void 0, onConverted);
  });
}
|
|
11646
|
+
/**
 * Render the pages of a PDF to PNG files by running `pdftoppm`.
 *
 * @param {string} pdfPath - PDF to render.
 * @param {string} prefixPath - Output prefix handed to `pdftoppm`.
 * @param {number} dpi - Resolution passed through `-r`.
 * @returns {Promise<void>}
 * @throws {UnifiedOcrError} RENDER_FAILED carrying the underlying message.
 */
async function renderPdfToPng(pdfPath, prefixPath, dpi) {
  try {
    const args = ["-png", "-r", String(dpi), pdfPath, prefixPath];
    await runCommand("pdftoppm", args);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new UnifiedOcrError("RENDER_FAILED", "render", reason);
  }
}
|
|
11653
|
+
/**
 * Run an external command and wait for it to exit.
 *
 * Only stderr is captured (for the error message). stdin and stdout are not
 * connected: an unread stdout pipe would stall any child that prints more
 * than the pipe buffer holds, and an open stdin pipe could leave a child
 * waiting for input that never arrives.
 *
 * @param {string} cmd - Executable to launch.
 * @param {string[]} args - Arguments, passed without a shell.
 * @returns {Promise<void>} Resolves when the process exits with code 0.
 * @throws {Error} The spawn error (e.g. ENOENT), or an Error containing the
 *   exit code and trimmed stderr when the exit code is not 0.
 */
async function runCommand(cmd, args) {
  await new Promise((resolvePromise, reject) => {
    const child = import_child_process4.spawn(cmd, args, { stdio: ["ignore", "ignore", "pipe"] });
    let stderr = "";
    child.stderr.on("data", (chunk) => {
      stderr += String(chunk);
    });
    child.on("error", reject);
    child.on("close", (code) => {
      if (code === 0) resolvePromise();
      else reject(new Error(`${cmd} \uC2E4\uD328 (code=${code}): ${stderr.trim()}`));
    });
  });
}
|
|
11667
|
+
/**
 * Check that `soffice --version` can be run successfully.
 *
 * @returns {Promise<void>}
 * @throws {UnifiedOcrError} SOFFICE_NOT_FOUND when the command fails for any reason.
 */
async function assertSofficeAvailable() {
  const available = await runCommand("soffice", ["--version"]).then(() => true, () => false);
  if (available) return;
  throw new UnifiedOcrError("SOFFICE_NOT_FOUND", "convert", "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.");
}
|
|
11674
|
+
/**
 * List the PNG files directly inside a directory, ordered by page number.
 *
 * @param {string} imagesDir - Directory to scan.
 * @returns {Promise<string[]>} Paths (`imagesDir` joined with each file name),
 *   ordered with `naturalPageSort`.
 */
async function listPageImages(imagesDir) {
  const entries = await import_promises2.readdir(imagesDir);
  const pngNames = entries.filter((name) => name.endsWith(".png"));
  pngNames.sort(naturalPageSort);
  return pngNames.map((name) => import_path5.join(imagesDir, name));
}
|
|
11678
|
+
/**
 * Comparator that orders file names by the last run of digits they contain.
 * Names without digits are treated as 0.
 *
 * @param {string} a
 * @param {string} b
 * @returns {number} Negative, zero or positive, as expected by `Array.prototype.sort`.
 */
function naturalPageSort(a, b) {
  const trailingNumber = (name) => {
    const digitRuns = name.match(/\d+/g) || ["0"];
    return Number(digitRuns.at(-1) || 0);
  };
  return trailingNumber(a) - trailingNumber(b);
}
|
|
11683
|
+
/**
 * Choose one page image for the model probe.
 *
 * Looks at the first 8 images at most, prefers files larger than 8 KiB,
 * and returns the one with the median file size among the preferred set
 * (or among all sampled files when none exceeds 8 KiB).
 *
 * Each file is stat'ed exactly once.
 *
 * @param {string[]} images - Page image paths, in page order.
 * @returns {Promise<string>} Path of the chosen image.
 * @throws {Error} When `images` is empty, or when a file cannot be stat'ed.
 */
async function pickRepresentativeImage(images) {
  if (images.length === 0) {
    throw new Error("pickRepresentativeImage: no page images to choose from");
  }
  const sample = images.slice(0, 8);
  const sized = await Promise.all(
    sample.map(async (path) => ({ path, size: (await import_promises2.stat(path)).size }))
  );
  // Files of 8 KiB or less are skipped when larger files are available.
  const substantial = sized.filter((entry) => entry.size > 8 * 1024);
  const pool = substantial.length > 0 ? substantial : sized;
  pool.sort((a, b) => a.size - b.size);
  return pool[Math.floor(pool.length / 2)].path;
}
|
|
11694
|
+
/**
 * Pick the model with the shortest successful probe.
 *
 * @param {Array<{model: string, durationMs: number, success: boolean}>} results
 * @returns {string|null} Model name, or null when no probe succeeded. On a
 *   tie the earlier entry wins.
 */
function chooseFastestModel(results) {
  let fastest = null;
  for (const result of results) {
    if (!result.success) continue;
    if (fastest === null || result.durationMs < fastest.durationMs) {
      fastest = result;
    }
  }
  return fastest?.model ?? null;
}
|
|
11698
|
+
/**
 * Read the probe-timing cache from disk.
 *
 * The cache is best-effort: a missing file, unreadable file, invalid JSON,
 * or JSON that lacks a plain-object `models` field all yield null, so callers
 * only ever see null or an object whose `models` can be indexed safely.
 *
 * @param {string} path - Location of the cache file.
 * @returns {Promise<object|null>} Parsed cache, or null when unusable.
 */
async function loadModelCache(path) {
  try {
    const raw = await import_promises2.readFile(path, "utf-8");
    const parsed = JSON.parse(raw);
    const models = parsed?.models;
    if (models === null || typeof models !== "object" || Array.isArray(models)) {
      return null;
    }
    return parsed;
  } catch {
    // Deliberate: the cache is an optimization, never a reason to fail a run.
    return null;
  }
}
|
|
11706
|
+
/**
 * Order candidate models by their cached average probe duration.
 *
 * Models without a usable cached duration keep their relative order and
 * come after all models that have one. The input array is not modified.
 * A cache that is missing or has no `models` object leaves the order as is.
 *
 * @param {string[]} models - Candidate model names.
 * @param {object|null} cache - Cache object with a `models` map, or null.
 * @returns {string[]} New array in the order the models should be tried.
 */
function sortModelsByCache(models, cache) {
  const known = cache?.models;
  if (known === null || typeof known !== "object") return [...models];
  const cachedDuration = (model) => {
    const value = known[model]?.avgDurationMs;
    return typeof value === "number" && Number.isFinite(value) ? value : Number.POSITIVE_INFINITY;
  };
  return [...models].sort((a, b) => {
    const av = cachedDuration(a);
    const bv = cachedDuration(b);
    // Infinity - Infinity is NaN, so equal keys are handled explicitly.
    return av === bv ? 0 : av - bv;
  });
}
|
|
11714
|
+
/**
 * Fold the successful probe results into the on-disk timing cache.
 *
 * For every successful probe the per-model running average is updated
 * (rounded to an integer) and the sample count incremented. Failed probes
 * are ignored. A previous cache or cache entry that does not have the
 * expected shape is replaced rather than crashing the run.
 *
 * @param {string} path - Location of the cache file.
 * @param {Array<{model: string, durationMs: number, success: boolean}>} probes
 * @returns {Promise<void>}
 */
async function updateModelCache(path, probes) {
  const prev = await loadModelCache(path);
  const prevUsable = prev !== null && typeof prev === "object" && prev.models !== null && typeof prev.models === "object" && !Array.isArray(prev.models);
  const current = prevUsable ? prev : { updatedAt: (/* @__PURE__ */ new Date()).toISOString(), models: {} };
  for (const probe of probes) {
    if (!probe.success) continue;
    const existing = current.models[probe.model];
    const existingUsable = existing !== null && typeof existing === "object" && Number.isFinite(existing.count) && existing.count > 0 && Number.isFinite(existing.avgDurationMs);
    if (!existingUsable) {
      current.models[probe.model] = { count: 1, avgDurationMs: probe.durationMs };
      continue;
    }
    const nextCount = existing.count + 1;
    current.models[probe.model] = {
      count: nextCount,
      avgDurationMs: Math.round((existing.avgDurationMs * existing.count + probe.durationMs) / nextCount)
    };
  }
  current.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
  await import_promises2.writeFile(path, JSON.stringify(current, null, 2), "utf-8");
}
|
|
11733
|
+
/**
 * OCR one image, trying each model in `input.models` in order until one
 * succeeds.
 *
 * @param {object} input - `imagePath`, `prompt`, `models`, `modelMaxTokens`,
 *   `baseUrl`, `keyPool`, `timeoutMs`, `maxRetriesPerPage`, `logger`.
 * @returns {Promise<string>} Result of the first model that succeeds.
 * @throws {UnifiedOcrError} OCR_FAILED with the last failure message when
 *   every model fails (or when `models` is empty).
 */
async function ocrImageWithFallback(input) {
  const failureMessages = [];
  for (const candidate of input.models) {
    try {
      const request = {
        imagePath: input.imagePath,
        prompt: input.prompt,
        model: candidate,
        // Models without a configured limit use 8192 tokens.
        maxTokens: input.modelMaxTokens[candidate] ?? 8192,
        baseUrl: input.baseUrl,
        keyPool: input.keyPool,
        timeoutMs: input.timeoutMs,
        maxRetries: input.maxRetriesPerPage,
        logger: input.logger,
        stage: "ocr"
      };
      return await ocrImageViaNim(request);
    } catch (err) {
      failureMessages.push(err instanceof Error ? err.message : String(err));
    }
  }
  const lastErr = failureMessages.length > 0 ? failureMessages[failureMessages.length - 1] : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958";
  throw new UnifiedOcrError("OCR_FAILED", "ocr", `\uBAA8\uB4E0 OCR \uBAA8\uB378 \uC2E4\uD328: ${lastErr}`);
}
|
|
11755
|
+
/**
 * Concatenate per-page Markdown files into one document.
 *
 * Each file is read as UTF-8 and trimmed; pages that are empty after
 * trimming are dropped, and the rest are joined with a blank line.
 *
 * @param {string[]} paths - Page files in output order.
 * @returns {Promise<string>} The merged Markdown.
 */
async function mergeMarkdownPages(paths) {
  const sections = [];
  for (const pagePath of paths) {
    const body = (await import_promises2.readFile(pagePath, "utf-8")).trim();
    if (body) sections.push(body);
  }
  return sections.join("\n\n");
}
|
|
11764
|
+
/**
 * Send one chat-completions request (image OCR or text-only) and return the
 * model's answer, retrying with rotating API keys.
 *
 * When `input.textOnlyPrompt` is set, only that text is sent; otherwise the
 * request carries `input.prompt` (default: the OCR prompt) plus the PNG at
 * `input.imagePath` as a base64 data URL.
 *
 * Every loop iteration counts as an attempt, including iterations spent
 * waiting because all keys are cooling down. The request is aborted after
 * `timeoutMs`.
 *
 * Errors thrown by this function carry the caller-supplied `stage`
 * ("ocr" by default), so failures during probing or proofreading are
 * attributed to the stage that actually issued the request.
 *
 * @param {object} input - `model`, `maxTokens`, `baseUrl`, `keyPool`,
 *   `timeoutMs`, `maxRetries`, optional `logger`, optional `stage`, and
 *   either `textOnlyPrompt` or `imagePath` (+ optional `prompt`).
 * @returns {Promise<string>} Response text with a surrounding code fence removed.
 * @throws {UnifiedOcrError} OCR_FAILED once all attempts are used up.
 * @throws Any non-cooldown error raised by `keyPool.acquire()`.
 */
async function ocrImageViaNim(input) {
  const { model, maxTokens, baseUrl, keyPool, timeoutMs, maxRetries, logger, stage = "ocr" } = input;
  let attempt = 0;
  let lastErr = "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958";
  while (attempt < maxRetries) {
    attempt++;
    let acquired = null;
    try {
      acquired = keyPool.acquire();
    } catch (err) {
      if (err instanceof AllKeysCoolingDownError) {
        logger?.log({
          level: "warn",
          stage,
          event: "progress",
          message: "\uBAA8\uB4E0 API \uD0A4 cooldown \uC0C1\uD0DC\uB85C \uB300\uAE30",
          meta: { waitMs: err.waitMs, attempt, maxRetries, model }
        });
        await delay(err.waitMs);
        continue;
      }
      throw err;
    }
    try {
      const content = input.textOnlyPrompt ? [{ type: "text", text: input.textOnlyPrompt }] : [
        { type: "text", text: input.prompt ?? OCR_PROMPT2 },
        {
          type: "image_url",
          image_url: { url: `data:image/png;base64,${await encodeBase64(input.imagePath)}` }
        }
      ];
      const body = {
        model,
        messages: [{ role: "user", content }],
        max_tokens: maxTokens,
        temperature: 0
      };
      logger?.log({
        level: "debug",
        stage,
        event: "progress",
        message: "NIM \uC694\uCCAD \uC2DC\uB3C4",
        meta: { attempt, maxRetries, model, keyId: acquired.keyId, hasImage: Boolean(input.imagePath) }
      });
      const controller = new AbortController();
      const timer = setTimeout(() => controller.abort(), timeoutMs);
      try {
        const resp = await fetch(baseUrl, {
          method: "POST",
          headers: {
            Authorization: `Bearer ${acquired.key}`,
            "Content-Type": "application/json"
          },
          body: JSON.stringify(body),
          signal: controller.signal
        });
        if (resp.ok) {
          const json = await resp.json();
          const text = json.choices?.[0]?.message?.content?.trim() ?? "";
          keyPool.markSuccess(acquired.keyId);
          logger?.log({
            level: "debug",
            stage,
            event: "done",
            message: "NIM \uC751\uB2F5 \uC131\uACF5",
            meta: { attempt, model, keyId: acquired.keyId }
          });
          // An empty answer is thrown into the outer catch and retried.
          if (!text) throw new UnifiedOcrError("OCR_FAILED", stage, "OCR \uC751\uB2F5\uC774 \uBE44\uC5B4 \uC788\uC2B5\uB2C8\uB2E4.");
          return stripCodeFence3(text);
        }
        // Only a numeric Retry-After (seconds) is honoured; other forms are ignored.
        const retryAfter = Number(resp.headers.get("retry-after") || "0");
        const retryAfterMs = Number.isFinite(retryAfter) && retryAfter > 0 ? retryAfter * 1e3 : void 0;
        keyPool.markFailure(acquired.keyId, { status: resp.status, retryAfterMs });
        lastErr = `NIM \uC751\uB2F5 \uC624\uB958: ${resp.status}`;
        logger?.log({
          level: "warn",
          stage,
          event: "progress",
          message: "NIM \uC751\uB2F5 \uC2E4\uD328",
          meta: { attempt, model, status: resp.status, retryAfterMs, keyId: acquired.keyId }
        });
      } finally {
        clearTimeout(timer);
      }
    } catch (err) {
      const isTimeout = err instanceof Error && err.name === "AbortError";
      if (acquired) keyPool.markFailure(acquired.keyId, { timeout: isTimeout });
      lastErr = err instanceof Error ? err.message : String(err);
      logger?.log({
        level: "warn",
        stage,
        event: "progress",
        message: "NIM \uC694\uCCAD \uC608\uC678",
        meta: { attempt, model, timeout: isTimeout, keyId: acquired?.keyId },
        error: { message: lastErr, name: err instanceof Error ? err.name : "Error" }
      });
      await delay(500);
    }
  }
  logger?.log({
    level: "error",
    stage,
    event: "error",
    message: "NIM \uCD5C\uB300 \uC7AC\uC2DC\uB3C4 \uCD08\uACFC",
    meta: { model, maxRetries },
    error: { code: "OCR_FAILED", message: lastErr }
  });
  throw new UnifiedOcrError("OCR_FAILED", stage, `OCR \uC7AC\uC2DC\uB3C4 \uCD08\uACFC: ${lastErr}`);
}
|
|
11873
|
+
/**
 * Read a file and return its contents encoded as base64.
 *
 * @param {string} path - File to read.
 * @returns {Promise<string>} Base64 text of the file's bytes.
 */
async function encodeBase64(path) {
  return (await import_promises2.readFile(path)).toString("base64");
}
|
|
11877
|
+
/**
 * Remove a Markdown code fence that wraps the entire text.
 *
 * The text must start with three backticks (optionally followed by
 * `markdown` or `md`, case-insensitive) and end with three backticks;
 * otherwise it is returned unchanged.
 *
 * @param {string} text
 * @returns {string} Trimmed fence contents, or the original text.
 */
function stripCodeFence3(text) {
  const fenced = /^```(?:markdown|md)?\s*([\s\S]*?)```\s*$/i.exec(text);
  if (fenced === null) return text;
  return fenced[1].trim();
}
|
|
11881
|
+
/**
 * Wait for the given number of milliseconds.
 *
 * @param {number} ms - Time to wait; zero or negative values return without waiting.
 * @returns {Promise<void>}
 */
async function delay(ms) {
  if (!(ms <= 0)) {
    await new Promise((wake) => setTimeout(wake, ms));
  }
}
|
|
11885
|
+
/**
 * Reject input files whose extension the pipeline does not accept.
 *
 * Accepted extensions (case-insensitive): .pdf, .hwp, .hwpx, .docx, .xlsx.
 *
 * @param {string} path - Input file path.
 * @returns {void}
 * @throws {UnifiedOcrError} UNSUPPORTED_INPUT for any other extension.
 */
function ensureSupportedInput(path) {
  const supported = [".pdf", ".hwp", ".hwpx", ".docx", ".xlsx"];
  const ext = import_path5.extname(path).toLowerCase();
  if (supported.includes(ext)) return;
  throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uC785\uB825 \uD3EC\uB9F7: ${ext}`);
}
|
|
11892
|
+
/**
 * Collect the numeric tokens of a text in order of appearance.
 *
 * A token starts at a digit and extends over any following digits,
 * commas, dots, slashes and hyphens.
 *
 * @param {string} text
 * @returns {string[]} The tokens; empty when the text contains no digit.
 */
function extractNumericTokens(text) {
  return Array.from(text.matchAll(/\d[\d,./-]*/g), (hit) => hit[0]);
}
|
|
11895
|
+
/**
 * Accept a proofread text only if its numeric tokens are untouched.
 *
 * The tokens of both texts (see `extractNumericTokens`) must be equal in
 * count, order and spelling; otherwise the correction is discarded.
 *
 * @param {string} rawText - Text before proofreading.
 * @param {string} correctedText - Text after proofreading.
 * @returns {string} `correctedText` when the tokens match, else `rawText`.
 */
function preserveNumericIntegrity(rawText, correctedText) {
  const before = extractNumericTokens(rawText);
  const after = extractNumericTokens(correctedText);
  const unchanged = before.length === after.length && before.every((token, index) => token === after[index]);
  return unchanged ? correctedText : rawText;
}
|
|
11904
|
+
/**
 * Append a "needs review" note when the proofread text looks noticeably
 * shorter than the raw text.
 *
 * The note is added when any of these holds:
 * - the trimmed length dropped below 75% of the raw length;
 * - the number of non-empty lines dropped below 80% of the raw count (at least 1);
 * - the raw text had 2+ lines containing "|" and fewer than 70% of them remain.
 *
 * Nothing is added when the text already carries the note marker or when
 * either text is empty after trimming.
 *
 * @param {string} rawText - Text before proofreading.
 * @param {string} correctedText - Text after proofreading.
 * @returns {string} `correctedText`, possibly followed by a blank line and the note.
 */
function addUncertainTag(rawText, correctedText) {
  if (correctedText.includes("[\uD655\uC778\uD544\uC694:")) return correctedText;

  const rawLen = rawText.trim().length;
  const corrLen = correctedText.trim().length;
  if (rawLen === 0 || corrLen === 0) return correctedText;

  const rawLineList = rawText.split("\n");
  const corrLineList = correctedText.split("\n");
  const countWhere = (lines, keep) => lines.filter(keep).length;
  const hasPipe = (line) => line.includes("|");

  const rawLines = countWhere(rawLineList, Boolean);
  const corrLines = countWhere(corrLineList, Boolean);
  const rawTableLines = countWhere(rawLineList, hasPipe);
  const corrTableLines = countWhere(corrLineList, hasPipe);

  const shorterText = corrLen < rawLen * 0.75;
  const fewerLines = corrLines < Math.max(1, Math.floor(rawLines * 0.8));
  const lostTableRows = rawTableLines >= 2 && corrTableLines < Math.floor(rawTableLines * 0.7);
  if (!(shorterText || fewerLines || lostTableRows)) return correctedText;

  return `${correctedText}\n\n[\uD655\uC778\uD544\uC694: \uAD50\uC815 \uACB0\uACFC\uAC00 \uCD95\uC57D\uB418\uC5C8\uC744 \uC218 \uC788\uC5B4 \uC6D0\uBB38\uACFC \uB300\uC870\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4.]`;
}
|
|
11919
|
+
/**
 * Summarize how proofreading changed a page.
 *
 * @param {string} before - Text before proofreading.
 * @param {string} after - Text after proofreading.
 * @returns {{changed: boolean, beforeLength: number, afterLength: number}}
 */
function buildDiffSummary(before, after) {
  const changed = before !== after;
  const { length: beforeLength } = before;
  const { length: afterLength } = after;
  return { changed, beforeLength, afterLength };
}
|
|
11926
|
+
/**
 * Turn any thrown value into a `UnifiedOcrError`.
 *
 * An existing `UnifiedOcrError` is returned unchanged. Anything else is
 * wrapped with a code derived from the stage ("UNKNOWN" for stages without
 * a dedicated code). The original value is kept on the wrapper as a
 * non-enumerable `cause`, so its stack and type remain available.
 *
 * @param {unknown} err - The thrown value.
 * @param {string} stage - Stage that was running when it was thrown.
 * @returns {UnifiedOcrError}
 */
function normalizePipelineError(err, stage) {
  if (err instanceof UnifiedOcrError) return err;
  const message = err instanceof Error ? err.message : String(err);
  const codeByStage = {
    convert: "CONVERT_FAILED",
    render: "RENDER_FAILED",
    probe: "PROBE_FAILED",
    ocr: "OCR_FAILED",
    proofread: "PROOFREAD_FAILED",
    merge: "MERGE_FAILED"
  };
  const wrapped = new UnifiedOcrError(codeByStage[stage] ?? "UNKNOWN", stage, message);
  // Same attributes as the built-in `cause` option of the Error constructor.
  Object.defineProperty(wrapped, "cause", {
    value: err,
    writable: true,
    configurable: true,
    enumerable: false
  });
  return wrapped;
}
|
|
11939
|
+
|
|
10624
11940
|
// src/index.ts
|
|
10625
11941
|
async function parse2(input, options) {
|
|
11942
|
+
const logger = createLoggerFromEnv().withRun(generateRunId("parse")).child({ component: "index.ts", stage: "detect" });
|
|
11943
|
+
logger.log({ level: "info", event: "start", message: "parse \uD638\uCD9C \uC2DC\uC791" });
|
|
10626
11944
|
let buffer;
|
|
10627
11945
|
if (typeof input === "string") {
|
|
10628
11946
|
try {
|
|
10629
|
-
const buf = await (0,
|
|
11947
|
+
const buf = await (0, import_promises3.readFile)(input);
|
|
10630
11948
|
buffer = toArrayBuffer(buf);
|
|
10631
11949
|
} catch (err) {
|
|
10632
11950
|
const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
|
|
11951
|
+
logger.log({
|
|
11952
|
+
level: "error",
|
|
11953
|
+
stage: "detect",
|
|
11954
|
+
event: "error",
|
|
11955
|
+
message: msg,
|
|
11956
|
+
error: { code: "PARSE_ERROR", message: msg, name: err instanceof Error ? err.name : "Error" }
|
|
11957
|
+
});
|
|
10633
11958
|
return { success: false, fileType: "unknown", error: msg, code: "PARSE_ERROR" };
|
|
10634
11959
|
}
|
|
10635
11960
|
} else if (Buffer.isBuffer(input)) {
|
|
@@ -10638,13 +11963,23 @@ async function parse2(input, options) {
|
|
|
10638
11963
|
buffer = input;
|
|
10639
11964
|
}
|
|
10640
11965
|
if (!buffer || buffer.byteLength === 0) {
|
|
11966
|
+
logger.log({ level: "error", stage: "detect", event: "error", message: "\uBE48 \uC785\uB825 \uBC84\uD37C", error: { code: "EMPTY_INPUT", message: "\uBE48 \uC785\uB825 \uBC84\uD37C", name: "KordocError" } });
|
|
10641
11967
|
return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
|
|
10642
11968
|
}
|
|
10643
11969
|
const MAX_FILE_SIZE = 500 * 1024 * 1024;
|
|
10644
11970
|
if (buffer.byteLength > MAX_FILE_SIZE) {
|
|
11971
|
+
logger.log({
|
|
11972
|
+
level: "error",
|
|
11973
|
+
stage: "detect",
|
|
11974
|
+
event: "error",
|
|
11975
|
+
message: "\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC",
|
|
11976
|
+
meta: { size: buffer.byteLength },
|
|
11977
|
+
error: { code: "FILE_TOO_LARGE", message: "\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC", name: "KordocError" }
|
|
11978
|
+
});
|
|
10645
11979
|
return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
|
|
10646
11980
|
}
|
|
10647
11981
|
const format = detectFormat(buffer);
|
|
11982
|
+
logger.log({ level: "info", event: "done", message: "\uD3EC\uB9F7 \uAC10\uC9C0 \uC644\uB8CC", meta: { format } });
|
|
10648
11983
|
switch (format) {
|
|
10649
11984
|
case "hwpx": {
|
|
10650
11985
|
const { format: zipFormat, zip } = await detectZipFormat(buffer);
|
|
@@ -10722,7 +12057,8 @@ async function parseHwpx(buffer, options, zip) {
|
|
|
10722
12057
|
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
|
|
10723
12058
|
return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
10724
12059
|
} catch (err) {
|
|
10725
|
-
|
|
12060
|
+
const normalized = normalizeKordocError(err, "HWPX \uD30C\uC2F1 \uC2E4\uD328", "finalize");
|
|
12061
|
+
return { success: false, fileType: "hwpx", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
|
|
10726
12062
|
}
|
|
10727
12063
|
}
|
|
10728
12064
|
async function parseHwp(buffer, options) {
|
|
@@ -10730,7 +12066,8 @@ async function parseHwp(buffer, options) {
|
|
|
10730
12066
|
const { markdown, blocks, metadata, outline, warnings, images } = parseHwp5Document(Buffer.from(buffer), options);
|
|
10731
12067
|
return { success: true, fileType: "hwp", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
10732
12068
|
} catch (err) {
|
|
10733
|
-
|
|
12069
|
+
const normalized = normalizeKordocError(err, "HWP \uD30C\uC2F1 \uC2E4\uD328", "finalize");
|
|
12070
|
+
return { success: false, fileType: "hwp", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
|
|
10734
12071
|
}
|
|
10735
12072
|
}
|
|
10736
12073
|
async function parsePdf(buffer, options) {
|
|
@@ -10738,8 +12075,15 @@ async function parsePdf(buffer, options) {
|
|
|
10738
12075
|
const { markdown, blocks, metadata, outline, warnings, isImageBased } = await parsePdfDocument(buffer, options);
|
|
10739
12076
|
return { success: true, fileType: "pdf", markdown, blocks, metadata, outline, warnings, isImageBased };
|
|
10740
12077
|
} catch (err) {
|
|
12078
|
+
const normalized = normalizeKordocError(err, "PDF \uD30C\uC2F1 \uC2E4\uD328", "finalize");
|
|
10741
12079
|
const isImageBased = err instanceof Error && "isImageBased" in err ? true : void 0;
|
|
10742
|
-
return {
|
|
12080
|
+
return {
|
|
12081
|
+
success: false,
|
|
12082
|
+
fileType: "pdf",
|
|
12083
|
+
error: normalized.message,
|
|
12084
|
+
code: normalized.code ?? classifyError(normalized),
|
|
12085
|
+
isImageBased
|
|
12086
|
+
};
|
|
10743
12087
|
}
|
|
10744
12088
|
}
|
|
10745
12089
|
async function parseXlsx(buffer, options, zip) {
|
|
@@ -10747,7 +12091,8 @@ async function parseXlsx(buffer, options, zip) {
|
|
|
10747
12091
|
const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
|
|
10748
12092
|
return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
|
|
10749
12093
|
} catch (err) {
|
|
10750
|
-
|
|
12094
|
+
const normalized = normalizeKordocError(err, "XLSX \uD30C\uC2F1 \uC2E4\uD328", "finalize");
|
|
12095
|
+
return { success: false, fileType: "xlsx", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
|
|
10751
12096
|
}
|
|
10752
12097
|
}
|
|
10753
12098
|
async function parseDocx(buffer, options, zip) {
|
|
@@ -10755,11 +12100,14 @@ async function parseDocx(buffer, options, zip) {
|
|
|
10755
12100
|
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
|
|
10756
12101
|
return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
10757
12102
|
} catch (err) {
|
|
10758
|
-
|
|
12103
|
+
const normalized = normalizeKordocError(err, "DOCX \uD30C\uC2F1 \uC2E4\uD328", "finalize");
|
|
12104
|
+
return { success: false, fileType: "docx", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
|
|
10759
12105
|
}
|
|
10760
12106
|
}
|
|
10761
12107
|
// Annotate the CommonJS export names for ESM import in node:
|
|
10762
12108
|
0 && (module.exports = {
|
|
12109
|
+
AllKeysCoolingDownError,
|
|
12110
|
+
ApiKeyRotationPool,
|
|
10763
12111
|
VERSION,
|
|
10764
12112
|
blocksToMarkdown,
|
|
10765
12113
|
compare,
|
|
@@ -10778,7 +12126,8 @@ async function parseDocx(buffer, options, zip) {
|
|
|
10778
12126
|
parseHwp,
|
|
10779
12127
|
parseHwpx,
|
|
10780
12128
|
parsePdf,
|
|
10781
|
-
parseXlsx
|
|
12129
|
+
parseXlsx,
|
|
12130
|
+
runUnifiedOcrPipeline
|
|
10782
12131
|
});
|
|
10783
12132
|
/*! Bundled license information:
|
|
10784
12133
|
|