@clazic/kordoc 2.4.11 → 2.4.12
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -0
- package/dist/{chunk-PJSXZBZB.js → chunk-5R37N6KE.js} +19 -4
- package/dist/chunk-5R37N6KE.js.map +1 -0
- package/dist/chunk-I6YC6ZGK.js +219 -0
- package/dist/chunk-I6YC6ZGK.js.map +1 -0
- package/dist/{chunk-JGMLDBW5.js → chunk-KJEZPVEK.js} +680 -301
- package/dist/chunk-KJEZPVEK.js.map +1 -0
- package/dist/cli.js +68 -8
- package/dist/cli.js.map +1 -1
- package/dist/index.cjs +1678 -329
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +121 -1
- package/dist/index.d.ts +121 -1
- package/dist/index.js +1656 -310
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +11 -2
- package/dist/mcp.js.map +1 -1
- package/dist/{provider-PYZL2VNN.js → provider-T2D5XRTI.js} +30 -2
- package/dist/provider-T2D5XRTI.js.map +1 -0
- package/dist/{resolve-4I65IGMM.js → resolve-673XFZQ6.js} +18 -1
- package/dist/resolve-673XFZQ6.js.map +1 -0
- package/dist/{utils-HKVOS2O3.js → utils-XLLXVB7V.js} +4 -2
- package/dist/{watch-EYOGF3HY.js → watch-SOMS2KR7.js} +4 -3
- package/dist/{watch-EYOGF3HY.js.map → watch-SOMS2KR7.js.map} +1 -1
- package/package.json +2 -1
- package/dist/chunk-JGMLDBW5.js.map +0 -1
- package/dist/chunk-PJSXZBZB.js.map +0 -1
- package/dist/provider-PYZL2VNN.js.map +0 -1
- package/dist/resolve-4I65IGMM.js.map +0 -1
- /package/dist/{utils-HKVOS2O3.js.map → utils-XLLXVB7V.js.map} +0 -0
package/dist/index.js
CHANGED
|
@@ -75,6 +75,224 @@ var init_page_range = __esm({
|
|
|
75
75
|
}
|
|
76
76
|
});
|
|
77
77
|
|
|
78
|
+
// src/logging/logger.ts
|
|
79
|
+
import { mkdirSync, appendFileSync } from "fs";
|
|
80
|
+
import { appendFile } from "fs/promises";
|
|
81
|
+
import { basename, dirname, resolve } from "path";
|
|
82
|
+
import { randomUUID } from "crypto";
|
|
83
|
+
/**
 * Build the default logger from KORDOC_LOG_* environment variables.
 *
 * Variables read: KORDOC_LOG_LEVEL, KORDOC_LOG_STACK, KORDOC_LOG_FILE,
 * KORDOC_LOG_PROGRESS_SAMPLE_MS, KORDOC_LOG_BASENAME_PATHS, KORDOC_LOG_TEXT_LIMIT.
 *
 * @param {Record<string, string | undefined>} [env=process.env] - Environment to read.
 * @returns {CompositeLogger} A composite logger with a console sink and, when
 *   KORDOC_LOG_FILE is set, an additional JSONL file sink.
 */
function createLoggerFromEnv(env = process.env) {
  const config = {
    level: parseLevel(env.KORDOC_LOG_LEVEL),
    includeStack: env.KORDOC_LOG_STACK === "1",
    progressSampleMs: parsePositiveInt(env.KORDOC_LOG_PROGRESS_SAMPLE_MS, 1e3),
    basenamePaths: env.KORDOC_LOG_BASENAME_PATHS === "1",
    textLimit: parsePositiveInt(env.KORDOC_LOG_TEXT_LIMIT, 400)
  };
  // The console sink is always present; the file sink is opt-in.
  const sinks = [new ConsoleLogger(config)];
  const logFile = env.KORDOC_LOG_FILE;
  if (logFile) {
    sinks.push(new JsonlLogger(config, resolve(logFile)));
  }
  return new CompositeLogger(config, sinks);
}
|
|
99
|
+
/**
 * Create a short run identifier of the form `<prefix>_<8 chars>`.
 *
 * @param {string} [prefix="run"] - Text placed before the underscore.
 * @returns {string} The prefix, an underscore, and the first 8 characters of a random UUID.
 */
function generateRunId(prefix = "run") {
  const uuidHead = randomUUID().slice(0, 8);
  return `${prefix}_${uuidHead}`;
}
|
|
102
|
+
/**
 * Normalize a log-level string.
 *
 * @param {string | undefined} input - Raw level text (case-insensitive).
 * @returns {"error" | "warn" | "info" | "debug" | "trace"} The lower-cased level,
 *   or "error" when the input is empty or not a known level.
 */
function parseLevel(input) {
  const normalized = (input || "").toLowerCase();
  switch (normalized) {
    case "error":
    case "warn":
    case "info":
    case "debug":
    case "trace":
      return normalized;
    default:
      return "error";
  }
}
|
|
107
|
+
/**
 * Redact credentials embedded in free text.
 *
 * Replaces `nvapi-...` keys with "nvapi-***" and `Bearer <token>` (any letter
 * case) with "Bearer ***".
 *
 * @param {string} input - Text that may contain secrets.
 * @returns {string} The text with matching secrets replaced.
 */
function maskSecrets(input) {
  const withoutApiKeys = input.replace(/nvapi-[A-Za-z0-9_\-]+/g, "nvapi-***");
  return withoutApiKeys.replace(/Bearer\s+[A-Za-z0-9_\-\.]+/gi, "Bearer ***");
}
|
|
110
|
+
/**
 * Produce a log-safe shallow copy of a metadata object.
 *
 * - Values under keys that look like credentials are replaced by "***".
 * - String values are passed through maskSecrets, optionally reduced to their
 *   basename when the key looks path-like and cfg.basenamePaths is on, then
 *   truncated with limitText.
 * - Non-string values are copied as-is (nested objects are not inspected).
 *
 * @param {Record<string, unknown>} meta - Metadata attached to a log event.
 * @param {{ basenamePaths: boolean, textLimit: number }} cfg - Logger configuration.
 * @returns {Record<string, unknown>} The sanitized copy.
 */
function sanitizeMeta(meta, cfg) {
  const SENSITIVE_KEY = /authorization|api[_-]?key|token/i;
  const PATH_LIKE_KEY = /path|file|dir/i;
  const sanitized = {};
  Object.entries(meta).forEach(([key, value]) => {
    if (SENSITIVE_KEY.test(key)) {
      sanitized[key] = "***";
    } else if (typeof value !== "string") {
      sanitized[key] = value;
    } else {
      const masked = maskSecrets(value);
      const shown = cfg.basenamePaths && PATH_LIKE_KEY.test(key) ? basename(masked) : masked;
      sanitized[key] = limitText(shown, cfg.textLimit);
    }
  });
  return sanitized;
}
|
|
129
|
+
/**
 * Parse a non-negative integer setting, falling back when the input is unusable.
 *
 * `Number("")`, `Number("   ")` and `Number(null)` all evaluate to 0, so a
 * variable that is set but empty used to yield 0 (which callers treat as
 * "feature disabled") instead of the fallback. Blank input is now treated as
 * missing. An explicit "0" is still accepted and returned as 0.
 *
 * @param {string | number | null | undefined} input - Raw value, typically from process.env.
 * @param {number} fallback - Value returned when input is missing, blank, non-finite or negative.
 * @returns {number} The input floored to an integer, or the fallback.
 */
function parsePositiveInt(input, fallback) {
  if (input == null || String(input).trim() === "") return fallback;
  const n = Number(input);
  if (!Number.isFinite(n) || n < 0) return fallback;
  return Math.floor(n);
}
|
|
134
|
+
/**
 * Truncate text to a maximum length, appending a marker with the number of
 * characters that were cut off.
 *
 * @param {string} input - Text to shorten.
 * @param {number} maxLen - Maximum kept length; zero or negative disables truncation.
 * @returns {string} The input unchanged when it fits (or truncation is disabled),
 *   otherwise `<first maxLen chars>...(+<dropped count>)`.
 */
function limitText(input, maxLen) {
  const unlimited = maxLen <= 0;
  if (unlimited || input.length <= maxLen) return input;
  const head = input.slice(0, maxLen);
  const dropped = input.length - maxLen;
  return `${head}...(+${dropped})`;
}
|
|
139
|
+
var LEVEL_ORDER, BaseLogger, ConsoleLogger, JsonlLogger, CompositeLogger;
var init_logger = __esm({
  "src/logging/logger.ts"() {
    "use strict";
    // Marker placed on events that a CompositeLogger has already passed through
    // progress sampling. It is a symbol so JSON.stringify never serializes it.
    const PRE_SAMPLED = Symbol.for("kordoc.logger.preSampled");
    // Severity ranking: a message is emitted when its rank is <= the configured rank.
    LEVEL_ORDER = {
      error: 0,
      warn: 1,
      info: 2,
      debug: 3,
      trace: 4
    };
    /**
     * Shared filtering, sampling and sanitizing logic for all loggers.
     * `log` is a no-op here; concrete sinks override it.
     */
    BaseLogger = class _BaseLogger {
      /**
       * @param {object} config - level, includeStack, progressSampleMs, basenamePaths, textLimit.
       * @param {object} [context] - Fields merged into every event (runId, component, stage, ...).
       */
      constructor(config, context = {}) {
        this.config = config;
        this.context = context;
      }
      // Last emission time per sampling key, shared by every logger instance.
      static progressSeenAt = /* @__PURE__ */ new Map();
      /** @returns {boolean} Whether `level` is at or above the configured severity. */
      shouldLog(level) {
        return LEVEL_ORDER[level] <= LEVEL_ORDER[this.config.level];
      }
      /**
       * Rate-limit "progress" events to one per progressSampleMs per
       * (runId, component, stage, message) key. Errors and warnings, and all
       * non-progress events, always pass.
       *
       * @returns {boolean} true when the event should be emitted.
       */
      shouldEmitProgress(ev) {
        // A parent composite already sampled this event. Re-checking here would
        // hit the timestamp the parent just stored and drop every such event.
        if (ev[PRE_SAMPLED]) return true;
        if (this.config.progressSampleMs <= 0) return true;
        if ((ev.event ?? "message") !== "progress") return true;
        if (ev.level === "error" || ev.level === "warn") return true;
        const key = [
          this.context.runId ?? ev.runId ?? "no-run",
          this.context.component ?? ev.component ?? "no-component",
          this.context.stage ?? ev.stage ?? "unknown",
          ev.message
        ].join("|");
        const now = Date.now();
        const prev = _BaseLogger.progressSeenAt.get(key) ?? 0;
        if (now - prev < this.config.progressSampleMs) return false;
        _BaseLogger.progressSeenAt.set(key, now);
        return true;
      }
      /**
       * Combine the logger context with an event and sanitize the result:
       * timestamp added, defaults applied, secrets masked, message truncated.
       * The caller's event and its `error` object are never modified.
       */
      merge(ev) {
        const out = {
          ...this.context,
          ...ev,
          ts: (/* @__PURE__ */ new Date()).toISOString(),
          level: ev.level,
          stage: ev.stage ?? this.context.stage ?? "unknown",
          event: ev.event ?? "message",
          message: ev.message
        };
        if (out.error && typeof out.error === "object") {
          // Read the fields explicitly: on real Error instances message/stack are
          // non-enumerable and name is inherited, so a spread alone would lose them.
          const { name, message, stack } = out.error;
          const error = { ...out.error };
          if (name !== void 0) error.name = name;
          if (message !== void 0) error.message = message ? maskSecrets(message) : message;
          if (stack) {
            // A stack trace normally embeds the message, so it needs masking too.
            error.stack = this.config.includeStack ? maskSecrets(stack) : void 0;
          }
          out.error = error;
        }
        if (out.meta) out.meta = sanitizeMeta(out.meta, this.config);
        if (out.message) out.message = limitText(maskSecrets(out.message), this.config.textLimit);
        return out;
      }
      /**
       * Derive a logger of the same class with extra context fields.
       * Uses `this.constructor` so subclasses that keep the (config, context)
       * constructor shape, such as ConsoleLogger, stay functional sinks instead
       * of degrading to the no-op base class.
       */
      child(context) {
        return new this.constructor(this.config, { ...this.context, ...context });
      }
      /** Shorthand for `child({ runId })`. */
      withRun(runId) {
        return this.child({ runId });
      }
      // eslint-disable-next-line @typescript-eslint/no-unused-vars
      log(event) {
      }
    };
    /** Writes one human-readable line per event: errors to stderr, everything else to stdout. */
    ConsoleLogger = class extends BaseLogger {
      log(event) {
        if (!this.shouldLog(event.level)) return;
        if (!this.shouldEmitProgress(event)) return;
        const e = this.merge(event);
        const prefix = `[${e.ts}] [${e.level.toUpperCase()}]${e.runId ? ` [${e.runId}]` : ""}${e.stage ? ` [${e.stage}]` : ""}`;
        const line = `${prefix} ${e.message}${e.component ? ` (${e.component})` : ""}`;
        if (e.level === "error") {
          process.stderr.write(line + "\n");
          if (e.error?.stack) process.stderr.write(e.error.stack + "\n");
        } else {
          process.stdout.write(line + "\n");
        }
      }
    };
    /**
     * Appends events as JSON lines to a file. Lines are queued per file path and
     * written in batches 200 ms after the first pending line; anything still
     * queued is written synchronously on process beforeExit/exit.
     */
    JsonlLogger = class _JsonlLogger extends BaseLogger {
      constructor(config, filePath, context = {}) {
        super(config, context);
        this.filePath = filePath;
        mkdirSync(dirname(filePath), { recursive: true });
        _JsonlLogger.ensureState(filePath);
      }
      // Queue/flush state per file path, shared by all loggers writing to that path.
      static states = /* @__PURE__ */ new Map();
      /** Get (or create and register exit hooks for) the shared state of `path`. */
      static ensureState(path) {
        let state = _JsonlLogger.states.get(path);
        if (!state) {
          state = { queue: [], flushing: false };
          _JsonlLogger.states.set(path, state);
          const flushSync = () => {
            const s = _JsonlLogger.states.get(path);
            if (!s || s.queue.length === 0) return;
            const payload = s.queue.join("");
            s.queue = [];
            if (!payload) return;
            appendFileSync(path, payload, "utf-8");
          };
          process.on("beforeExit", flushSync);
          process.on("exit", flushSync);
        }
        return state;
      }
      /** Arm a single 200 ms timer for `path` unless one is pending or a flush is running. */
      scheduleFlush(path) {
        const state = _JsonlLogger.ensureState(path);
        if (state.timer || state.flushing) return;
        state.timer = setTimeout(() => {
          state.timer = void 0;
          // Fire-and-forget: without a handler a failed write would surface as an
          // unhandled promise rejection. Report the first failure and carry on.
          void this.flush(path).catch((err) => {
            if (state.failureReported) return;
            state.failureReported = true;
            const reason = err instanceof Error ? err.message : String(err);
            process.stderr.write(`[kordoc] failed to write log file ${path}: ${reason}
`);
          });
        }, 200);
      }
      /**
       * Write everything queued for `path` in one append. Rejects if the write
       * fails; the lines taken from the queue for that attempt are not retried.
       */
      async flush(path) {
        const state = _JsonlLogger.ensureState(path);
        if (state.flushing) return;
        if (state.queue.length === 0) return;
        state.flushing = true;
        const payload = state.queue.join("");
        state.queue = [];
        try {
          await appendFile(path, payload, "utf-8");
        } finally {
          state.flushing = false;
          // Lines logged while the write was in flight need their own flush.
          if (state.queue.length > 0) this.scheduleFlush(path);
        }
      }
      log(event) {
        if (!this.shouldLog(event.level)) return;
        if (!this.shouldEmitProgress(event)) return;
        const e = this.merge(event);
        const state = _JsonlLogger.ensureState(this.filePath);
        state.queue.push(JSON.stringify(e) + "\n");
        this.scheduleFlush(this.filePath);
      }
      child(context) {
        return new _JsonlLogger(this.config, this.filePath, { ...this.context, ...context });
      }
    };
    /** Fans each event out to a list of sink loggers after filtering it once. */
    CompositeLogger = class _CompositeLogger extends BaseLogger {
      constructor(config, sinks, context = {}) {
        super(config, context);
        this.sinks = sinks;
      }
      log(event) {
        if (!this.shouldLog(event.level)) return;
        if (!this.shouldEmitProgress(event)) return;
        // Sampling was decided (and recorded) above; tell the sinks not to repeat it.
        const forwarded = { ...event, [PRE_SAMPLED]: true };
        for (const sink of this.sinks) sink.log(forwarded);
      }
      child(context) {
        const nextSinks = this.sinks.map((s) => s.child(context));
        return new _CompositeLogger(this.config, nextSinks, { ...this.context, ...context });
      }
    };
  }
});
|
|
295
|
+
|
|
78
296
|
// node_modules/cfb/cfb.js
|
|
79
297
|
var require_cfb = __commonJS({
|
|
80
298
|
"node_modules/cfb/cfb.js"(exports, module) {
|
|
@@ -394,8 +612,8 @@ var require_cfb = __commonJS({
|
|
|
394
612
|
}
|
|
395
613
|
return L.length - R.length;
|
|
396
614
|
}
|
|
397
|
-
function
|
|
398
|
-
if (p.charAt(p.length - 1) == "/") return p.slice(0, -1).indexOf("/") === -1 ? p :
|
|
615
|
+
/**
 * Return the parent directory portion of a "/"-separated path, keeping the
 * trailing slash.
 *
 * A path ending in "/" is treated as a directory: its parent is returned, or
 * the path itself when it has no other "/". A path without any "/" is
 * returned unchanged.
 *
 * @param {string} p - Path to inspect.
 * @returns {string} The directory prefix of `p`.
 */
function dirname4(p) {
  if (p.endsWith("/")) {
    const trimmed = p.slice(0, -1);
    return trimmed.includes("/") ? dirname4(trimmed) : p;
  }
  const lastSlash = p.lastIndexOf("/");
  return lastSlash === -1 ? p : p.slice(0, lastSlash + 1);
}
|
|
@@ -816,10 +1034,10 @@ var require_cfb = __commonJS({
|
|
|
816
1034
|
data.push([cfb.FullPaths[i2], cfb.FileIndex[i2]]);
|
|
817
1035
|
}
|
|
818
1036
|
for (i2 = 0; i2 < data.length; ++i2) {
|
|
819
|
-
var dad =
|
|
1037
|
+
var dad = dirname4(data[i2][0]);
|
|
820
1038
|
s = fullPaths[dad];
|
|
821
1039
|
while (!s) {
|
|
822
|
-
while (
|
|
1040
|
+
while (dirname4(dad) && !fullPaths[dirname4(dad)]) dad = dirname4(dad);
|
|
823
1041
|
data.push([dad, {
|
|
824
1042
|
name: filename(dad).replace("/", ""),
|
|
825
1043
|
type: 1,
|
|
@@ -829,7 +1047,7 @@ var require_cfb = __commonJS({
|
|
|
829
1047
|
content: null
|
|
830
1048
|
}]);
|
|
831
1049
|
fullPaths[dad] = true;
|
|
832
|
-
dad =
|
|
1050
|
+
dad = dirname4(data[i2][0]);
|
|
833
1051
|
s = fullPaths[dad];
|
|
834
1052
|
}
|
|
835
1053
|
}
|
|
@@ -855,13 +1073,13 @@ var require_cfb = __commonJS({
|
|
|
855
1073
|
elt.size = 0;
|
|
856
1074
|
elt.type = 5;
|
|
857
1075
|
} else if (nm.slice(-1) == "/") {
|
|
858
|
-
for (j = i2 + 1; j < data.length; ++j) if (
|
|
1076
|
+
for (j = i2 + 1; j < data.length; ++j) if (dirname4(cfb.FullPaths[j]) == nm) break;
|
|
859
1077
|
elt.C = j >= data.length ? -1 : j;
|
|
860
|
-
for (j = i2 + 1; j < data.length; ++j) if (
|
|
1078
|
+
for (j = i2 + 1; j < data.length; ++j) if (dirname4(cfb.FullPaths[j]) == dirname4(nm)) break;
|
|
861
1079
|
elt.R = j >= data.length ? -1 : j;
|
|
862
1080
|
elt.type = 1;
|
|
863
1081
|
} else {
|
|
864
|
-
if (
|
|
1082
|
+
if (dirname4(cfb.FullPaths[i2 + 1] || "") == dirname4(nm)) elt.R = i2 + 1;
|
|
865
1083
|
elt.type = 2;
|
|
866
1084
|
}
|
|
867
1085
|
}
|
|
@@ -2029,13 +2247,13 @@ var init_auto_detect = __esm({
|
|
|
2029
2247
|
|
|
2030
2248
|
// src/ocr/cli-provider.ts
|
|
2031
2249
|
import { spawnSync } from "child_process";
|
|
2032
|
-
import { writeFileSync, readFileSync, unlinkSync, mkdirSync } from "fs";
|
|
2250
|
+
import { writeFileSync, readFileSync, unlinkSync, mkdirSync as mkdirSync2 } from "fs";
|
|
2033
2251
|
import { join } from "path";
|
|
2034
2252
|
import { tmpdir } from "os";
|
|
2035
2253
|
/**
 * Return the OCR scratch directory, creating it on first use.
 *
 * The location is `<current working directory>/.kordoc_ocr_tmp`; once computed
 * it is cached in `_tempDir` and reused by later calls.
 *
 * @returns {string} Path of the scratch directory.
 */
function getTempDir() {
  if (_tempDir) return _tempDir;
  _tempDir = join(process.cwd(), ".kordoc_ocr_tmp");
  mkdirSync2(_tempDir, { recursive: true });
  return _tempDir;
}
|
|
@@ -2224,7 +2442,7 @@ async function createTesseractPoolProvider(concurrency) {
|
|
|
2224
2442
|
const waitQueue = [];
|
|
2225
2443
|
function acquire() {
|
|
2226
2444
|
if (idle.length > 0) return Promise.resolve(idle.pop());
|
|
2227
|
-
return new Promise((
|
|
2445
|
+
return new Promise((resolve4) => waitQueue.push(resolve4));
|
|
2228
2446
|
}
|
|
2229
2447
|
function release(w) {
|
|
2230
2448
|
if (waitQueue.length > 0) {
|
|
@@ -2260,13 +2478,13 @@ __export(batch_provider_exports, {
|
|
|
2260
2478
|
createBatchCliProvider: () => createBatchCliProvider
|
|
2261
2479
|
});
|
|
2262
2480
|
import { spawn, execSync as execSync2 } from "child_process";
|
|
2263
|
-
import { writeFileSync as writeFileSync2, readFileSync as readFileSync2, unlinkSync as unlinkSync2, mkdirSync as
|
|
2481
|
+
import { writeFileSync as writeFileSync2, readFileSync as readFileSync2, unlinkSync as unlinkSync2, mkdirSync as mkdirSync3 } from "fs";
|
|
2264
2482
|
import { join as join2 } from "path";
|
|
2265
2483
|
import { tmpdir as tmpdir2 } from "os";
|
|
2266
2484
|
function getBatchTempDir() {
|
|
2267
2485
|
if (!_batchTempDir) {
|
|
2268
2486
|
_batchTempDir = join2(process.cwd(), ".kordoc_ocr_tmp");
|
|
2269
|
-
|
|
2487
|
+
mkdirSync3(_batchTempDir, { recursive: true });
|
|
2270
2488
|
if (process.platform === "win32") {
|
|
2271
2489
|
try {
|
|
2272
2490
|
execSync2(`attrib +h "${_batchTempDir}"`, { stdio: "ignore" });
|
|
@@ -2317,7 +2535,7 @@ function createBatchCliProvider(mode, batchSize) {
|
|
|
2317
2535
|
};
|
|
2318
2536
|
}
|
|
2319
2537
|
function spawnAsync(cmd, args, opts) {
|
|
2320
|
-
return new Promise((
|
|
2538
|
+
return new Promise((resolve4, reject) => {
|
|
2321
2539
|
const child = spawn(cmd, args, {
|
|
2322
2540
|
cwd: opts.cwd,
|
|
2323
2541
|
env: process.env,
|
|
@@ -2353,7 +2571,7 @@ function spawnAsync(cmd, args, opts) {
|
|
|
2353
2571
|
if (killed) {
|
|
2354
2572
|
reject(new Error(`\uD0C0\uC784\uC544\uC6C3 (${Math.round(opts.timeoutMs / 1e3)}\uCD08)`));
|
|
2355
2573
|
} else {
|
|
2356
|
-
|
|
2574
|
+
resolve4({ stdout, stderr, exitCode: code ?? 1 });
|
|
2357
2575
|
}
|
|
2358
2576
|
});
|
|
2359
2577
|
child.on("error", (err) => {
|
|
@@ -2453,7 +2671,10 @@ __export(resolve_exports, {
|
|
|
2453
2671
|
resolveOcrProvider: () => resolveOcrProvider
|
|
2454
2672
|
});
|
|
2455
2673
|
async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
2674
|
+
const logger = createLoggerFromEnv().child({ component: "ocr/resolve.ts", stage: "ocr" });
|
|
2675
|
+
logger.log({ level: "debug", event: "start", message: "OCR provider resolve \uC2DC\uC791", meta: { mode, concurrency, batchSize } });
|
|
2456
2676
|
if (mode === "off") {
|
|
2677
|
+
logger.log({ level: "warn", event: "error", message: "OCR \uBE44\uD65C\uC131\uD654 \uBAA8\uB4DC \uC694\uCCAD" });
|
|
2457
2678
|
throw new Error("OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC788\uC2B5\uB2C8\uB2E4 (--ocr off).");
|
|
2458
2679
|
}
|
|
2459
2680
|
if (mode !== "auto") {
|
|
@@ -2461,21 +2682,27 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
|
2461
2682
|
if (mode === "tesseract") {
|
|
2462
2683
|
const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
|
|
2463
2684
|
if (concurrency && concurrency > 1) {
|
|
2685
|
+
logger.log({ level: "info", event: "done", message: "Tesseract pool provider \uC120\uD0DD", meta: { concurrency } });
|
|
2464
2686
|
return createTesseractPoolProvider2(concurrency);
|
|
2465
2687
|
}
|
|
2688
|
+
logger.log({ level: "info", event: "done", message: "Tesseract single provider \uC120\uD0DD" });
|
|
2466
2689
|
return createTesseractProvider2();
|
|
2467
2690
|
}
|
|
2468
2691
|
if (mode === "gemini" || mode === "claude" || mode === "codex") {
|
|
2469
2692
|
const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
|
|
2470
2693
|
const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[mode];
|
|
2471
2694
|
if (effectiveBatch > 1) {
|
|
2695
|
+
logger.log({ level: "info", event: "done", message: "Batch CLI provider \uC120\uD0DD", meta: { mode, batchSize: effectiveBatch } });
|
|
2472
2696
|
return createBatchCliProvider2(mode, effectiveBatch);
|
|
2473
2697
|
}
|
|
2698
|
+
logger.log({ level: "info", event: "done", message: "CLI provider \uC120\uD0DD", meta: { mode } });
|
|
2474
2699
|
return createCliOcrProvider(mode);
|
|
2475
2700
|
}
|
|
2701
|
+
logger.log({ level: "info", event: "done", message: "CLI provider \uC120\uD0DD", meta: { mode } });
|
|
2476
2702
|
return createCliOcrProvider(mode);
|
|
2477
2703
|
}
|
|
2478
2704
|
const detected = detectAvailableOcr();
|
|
2705
|
+
logger.log({ level: "info", event: "progress", message: "OCR auto \uAC10\uC9C0 \uACB0\uACFC", meta: { detected } });
|
|
2479
2706
|
if (detected !== "codex") {
|
|
2480
2707
|
if (detected === "tesseract") {
|
|
2481
2708
|
warnings?.push({
|
|
@@ -2492,18 +2719,23 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
|
|
|
2492
2719
|
if (detected === "tesseract") {
|
|
2493
2720
|
const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
|
|
2494
2721
|
if (concurrency && concurrency > 1) {
|
|
2722
|
+
logger.log({ level: "info", event: "done", message: "AUTO: Tesseract pool provider \uC120\uD0DD", meta: { concurrency } });
|
|
2495
2723
|
return createTesseractPoolProvider2(concurrency);
|
|
2496
2724
|
}
|
|
2725
|
+
logger.log({ level: "info", event: "done", message: "AUTO: Tesseract single provider \uC120\uD0DD" });
|
|
2497
2726
|
return createTesseractProvider2();
|
|
2498
2727
|
}
|
|
2499
2728
|
if (detected === "gemini" || detected === "codex" || detected === "claude") {
|
|
2500
2729
|
const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
|
|
2501
2730
|
const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[detected];
|
|
2502
2731
|
if (effectiveBatch > 1) {
|
|
2732
|
+
logger.log({ level: "info", event: "done", message: "AUTO: Batch CLI provider \uC120\uD0DD", meta: { mode: detected, batchSize: effectiveBatch } });
|
|
2503
2733
|
return createBatchCliProvider2(detected, effectiveBatch);
|
|
2504
2734
|
}
|
|
2735
|
+
logger.log({ level: "info", event: "done", message: "AUTO: CLI provider \uC120\uD0DD", meta: { mode: detected } });
|
|
2505
2736
|
return createCliOcrProvider(detected);
|
|
2506
2737
|
}
|
|
2738
|
+
logger.log({ level: "info", event: "done", message: "AUTO: CLI provider \uC120\uD0DD", meta: { mode: detected } });
|
|
2507
2739
|
return createCliOcrProvider(detected);
|
|
2508
2740
|
}
|
|
2509
2741
|
var init_resolve = __esm({
|
|
@@ -2511,6 +2743,7 @@ var init_resolve = __esm({
|
|
|
2511
2743
|
"use strict";
|
|
2512
2744
|
init_auto_detect();
|
|
2513
2745
|
init_cli_provider();
|
|
2746
|
+
init_logger();
|
|
2514
2747
|
}
|
|
2515
2748
|
});
|
|
2516
2749
|
|
|
@@ -2670,9 +2903,18 @@ function isBatchProvider(p) {
|
|
|
2670
2903
|
return !!p && typeof p === "object" && "__batch" in p && p.__batch === true;
|
|
2671
2904
|
}
|
|
2672
2905
|
async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
|
|
2906
|
+
const logger = createLoggerFromEnv().child({ component: "ocr/provider.ts", stage: "ocr" });
|
|
2907
|
+
logger.log({
|
|
2908
|
+
level: "info",
|
|
2909
|
+
event: "start",
|
|
2910
|
+
message: "OCR \uD398\uC774\uC9C0 \uCC98\uB9AC \uC2DC\uC791",
|
|
2911
|
+
meta: { effectivePageCount, concurrency, filteredPages: pageFilter?.size, batchProvider: isBatchProvider(provider) }
|
|
2912
|
+
});
|
|
2673
2913
|
const blocks = [];
|
|
2674
2914
|
if (isBatchProvider(provider)) {
|
|
2675
|
-
|
|
2915
|
+
const result = await ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, onProgress);
|
|
2916
|
+
logger.log({ level: "info", event: "done", message: "OCR \uBC30\uCE58 \uCC98\uB9AC \uC644\uB8CC", meta: { blocks: result.length } });
|
|
2917
|
+
return result;
|
|
2676
2918
|
}
|
|
2677
2919
|
if (concurrency <= 1) {
|
|
2678
2920
|
for (let i = 1; i <= effectivePageCount; i++) {
|
|
@@ -2688,8 +2930,16 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
|
|
|
2688
2930
|
message: `\uD398\uC774\uC9C0 ${i} OCR \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
|
|
2689
2931
|
code: "OCR_PAGE_FAILED"
|
|
2690
2932
|
});
|
|
2933
|
+
logger.log({
|
|
2934
|
+
level: "warn",
|
|
2935
|
+
event: "progress",
|
|
2936
|
+
message: "\uD398\uC774\uC9C0 OCR \uC2E4\uD328",
|
|
2937
|
+
meta: { page: i },
|
|
2938
|
+
error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error" }
|
|
2939
|
+
});
|
|
2691
2940
|
}
|
|
2692
2941
|
}
|
|
2942
|
+
logger.log({ level: "info", event: "done", message: "OCR \uC21C\uCC28 \uCC98\uB9AC \uC644\uB8CC", meta: { blocks: blocks.length } });
|
|
2693
2943
|
return blocks;
|
|
2694
2944
|
}
|
|
2695
2945
|
const pageNumbers = [];
|
|
@@ -2709,6 +2959,13 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
|
|
|
2709
2959
|
message: `\uD398\uC774\uC9C0 ${pageNum} OCR \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
|
|
2710
2960
|
code: "OCR_PAGE_FAILED"
|
|
2711
2961
|
});
|
|
2962
|
+
logger.log({
|
|
2963
|
+
level: "warn",
|
|
2964
|
+
event: "progress",
|
|
2965
|
+
message: "\uD398\uC774\uC9C0 OCR \uC2E4\uD328(\uBCD1\uB82C)",
|
|
2966
|
+
meta: { page: pageNum },
|
|
2967
|
+
error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error" }
|
|
2968
|
+
});
|
|
2712
2969
|
return null;
|
|
2713
2970
|
}
|
|
2714
2971
|
});
|
|
@@ -2717,6 +2974,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
|
|
|
2717
2974
|
if (!item) continue;
|
|
2718
2975
|
for (const b of item.pageBlocks) blocks.push(b);
|
|
2719
2976
|
}
|
|
2977
|
+
logger.log({ level: "info", event: "done", message: "OCR \uBCD1\uB82C \uCC98\uB9AC \uC644\uB8CC", meta: { blocks: blocks.length, pages: pageNumbers.length } });
|
|
2720
2978
|
return blocks;
|
|
2721
2979
|
}
|
|
2722
2980
|
async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
|
|
@@ -2799,11 +3057,12 @@ var init_provider = __esm({
|
|
|
2799
3057
|
"src/ocr/provider.ts"() {
|
|
2800
3058
|
"use strict";
|
|
2801
3059
|
init_markdown_to_blocks();
|
|
3060
|
+
init_logger();
|
|
2802
3061
|
}
|
|
2803
3062
|
});
|
|
2804
3063
|
|
|
2805
3064
|
// src/index.ts
|
|
2806
|
-
import { readFile } from "fs/promises";
|
|
3065
|
+
import { readFile as readFile2 } from "fs/promises";
|
|
2807
3066
|
|
|
2808
3067
|
// src/detect.ts
|
|
2809
3068
|
import JSZip from "jszip";
|
|
@@ -2856,7 +3115,7 @@ import JSZip2 from "jszip";
|
|
|
2856
3115
|
import { DOMParser } from "@xmldom/xmldom";
|
|
2857
3116
|
|
|
2858
3117
|
// src/utils.ts
|
|
2859
|
-
var VERSION = true ? "2.4.
|
|
3118
|
+
var VERSION = true ? "2.4.12" : "0.0.0-dev";
|
|
2860
3119
|
function toArrayBuffer(buf) {
|
|
2861
3120
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
2862
3121
|
return buf.buffer;
|
|
@@ -2864,9 +3123,13 @@ function toArrayBuffer(buf) {
|
|
|
2864
3123
|
return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
|
|
2865
3124
|
}
|
|
2866
3125
|
/**
 * Error type used for kordoc failures. Carries an optional machine-readable
 * `code` and the processing `stage` in which the failure happened.
 */
var KordocError = class extends Error {
  code;
  stage;
  /**
   * @param {string} message - Human-readable description.
   * @param {{ code?: string, stage?: string }} [opts] - Optional classification fields.
   */
  constructor(message, opts = {}) {
    super(message);
    const { code, stage } = opts;
    Object.assign(this, { name: "KordocError", code, stage });
  }
};
|
|
2872
3135
|
function isPathTraversal(name) {
|
|
@@ -2930,6 +3193,16 @@ function classifyError(err) {
|
|
|
2930
3193
|
if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
|
|
2931
3194
|
return "PARSE_ERROR";
|
|
2932
3195
|
}
|
|
3196
|
+
/**
 * Convert any thrown value into a KordocError with `code` and `stage` set.
 *
 * An existing KordocError is returned as the same object, with only its
 * missing `stage`/`code` filled in. Other Error instances keep their message
 * and are classified with classifyError; non-Error values use the fallbacks.
 *
 * @param {unknown} err - The caught value.
 * @param {string} fallbackMessage - Message used when `err` supplies none.
 * @param {string} [stage="unknown"] - Stage recorded when none is present.
 * @param {string} [fallbackCode="PARSE_ERROR"] - Code used when none can be derived.
 * @returns {KordocError} The normalized error.
 */
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
  if (!(err instanceof KordocError)) {
    const isError = err instanceof Error;
    const message = isError ? err.message : fallbackMessage;
    const code = isError ? classifyError(err) : fallbackCode;
    return new KordocError(message || fallbackMessage, { code, stage });
  }
  err.stage ||= stage;
  err.code ||= fallbackCode;
  return err;
}
|
|
2933
3206
|
|
|
2934
3207
|
// src/table/builder.ts
|
|
2935
3208
|
var MAX_COLS = 200;
|
|
@@ -3192,6 +3465,7 @@ var HEADING_RATIO_H3 = 1.15;
|
|
|
3192
3465
|
|
|
3193
3466
|
// src/hwpx/parser.ts
|
|
3194
3467
|
init_page_range();
|
|
3468
|
+
init_logger();
|
|
3195
3469
|
var MAX_DECOMPRESS_SIZE = 500 * 1024 * 1024;
|
|
3196
3470
|
var MAX_ZIP_ENTRIES = 2e3;
|
|
3197
3471
|
function clampSpan(val, max) {
|
|
@@ -3283,50 +3557,89 @@ function stripDtd(xml) {
|
|
|
3283
3557
|
return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
|
|
3284
3558
|
}
|
|
3285
3559
|
async function parseHwpxDocument(buffer, options, existingZip) {
|
|
3286
|
-
|
|
3287
|
-
|
|
3560
|
+
const logger = createLoggerFromEnv().child({ component: "hwpx/parser.ts", stage: "detect" });
|
|
3561
|
+
logger.log({ level: "info", event: "start", message: "HWPX \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
|
|
3562
|
+
let lastParsedSection = 0;
|
|
3288
3563
|
try {
|
|
3289
|
-
|
|
3290
|
-
|
|
3291
|
-
return await extractFromBrokenZip(buffer);
|
|
3292
|
-
}
|
|
3293
|
-
const actualEntryCount = Object.keys(zip.files).length;
|
|
3294
|
-
if (actualEntryCount > MAX_ZIP_ENTRIES) {
|
|
3295
|
-
throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
3296
|
-
}
|
|
3297
|
-
const decompressed = { total: 0 };
|
|
3298
|
-
const metadata = {};
|
|
3299
|
-
await extractHwpxMetadata(zip, metadata, decompressed);
|
|
3300
|
-
const styleMap = await extractHwpxStyles(zip, decompressed);
|
|
3301
|
-
const warnings = [];
|
|
3302
|
-
const sectionPaths = await resolveSectionPaths(zip);
|
|
3303
|
-
if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
3304
|
-
metadata.pageCount = sectionPaths.length;
|
|
3305
|
-
const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
|
|
3306
|
-
const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
|
|
3307
|
-
const blocks = [];
|
|
3308
|
-
let parsedSections = 0;
|
|
3309
|
-
for (let si = 0; si < sectionPaths.length; si++) {
|
|
3310
|
-
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
3311
|
-
const file = zip.file(sectionPaths[si]);
|
|
3312
|
-
if (!file) continue;
|
|
3564
|
+
precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
|
|
3565
|
+
let zip;
|
|
3313
3566
|
try {
|
|
3314
|
-
|
|
3315
|
-
|
|
3316
|
-
|
|
3317
|
-
|
|
3318
|
-
|
|
3319
|
-
|
|
3320
|
-
|
|
3321
|
-
|
|
3322
|
-
|
|
3323
|
-
}
|
|
3324
|
-
|
|
3325
|
-
|
|
3326
|
-
|
|
3327
|
-
|
|
3328
|
-
|
|
3329
|
-
|
|
3567
|
+
zip = existingZip ?? await JSZip2.loadAsync(buffer);
|
|
3568
|
+
} catch {
|
|
3569
|
+
return await extractFromBrokenZip(buffer);
|
|
3570
|
+
}
|
|
3571
|
+
const actualEntryCount = Object.keys(zip.files).length;
|
|
3572
|
+
if (actualEntryCount > MAX_ZIP_ENTRIES) {
|
|
3573
|
+
throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
3574
|
+
}
|
|
3575
|
+
const decompressed = { total: 0 };
|
|
3576
|
+
const metadata = {};
|
|
3577
|
+
await extractHwpxMetadata(zip, metadata, decompressed);
|
|
3578
|
+
const styleMap = await extractHwpxStyles(zip, decompressed);
|
|
3579
|
+
const warnings = [];
|
|
3580
|
+
const sectionPaths = await resolveSectionPaths(zip);
|
|
3581
|
+
if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
|
|
3582
|
+
metadata.pageCount = sectionPaths.length;
|
|
3583
|
+
logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uC139\uC158 \uACBD\uB85C \uD574\uC11D \uC644\uB8CC", meta: { sections: sectionPaths.length } });
|
|
3584
|
+
const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
|
|
3585
|
+
const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
|
|
3586
|
+
const blocks = [];
|
|
3587
|
+
let parsedSections = 0;
|
|
3588
|
+
for (let si = 0; si < sectionPaths.length; si++) {
|
|
3589
|
+
if (pageFilter && !pageFilter.has(si + 1)) continue;
|
|
3590
|
+
const file = zip.file(sectionPaths[si]);
|
|
3591
|
+
if (!file) continue;
|
|
3592
|
+
try {
|
|
3593
|
+
const xml = await file.async("text");
|
|
3594
|
+
decompressed.total += xml.length * 2;
|
|
3595
|
+
if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
|
|
3596
|
+
blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
|
|
3597
|
+
parsedSections++;
|
|
3598
|
+
options?.onProgress?.(parsedSections, totalTarget);
|
|
3599
|
+
logger.log({
|
|
3600
|
+
level: "debug",
|
|
3601
|
+
stage: "convert",
|
|
3602
|
+
event: "progress",
|
|
3603
|
+
message: "\uC139\uC158 \uD30C\uC2F1 \uC644\uB8CC",
|
|
3604
|
+
meta: { section: si + 1, parsedSections, totalTarget }
|
|
3605
|
+
});
|
|
3606
|
+
lastParsedSection = si + 1;
|
|
3607
|
+
} catch (secErr) {
|
|
3608
|
+
if (secErr instanceof KordocError) throw secErr;
|
|
3609
|
+
warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
3610
|
+
logger.log({
|
|
3611
|
+
level: "warn",
|
|
3612
|
+
stage: "convert",
|
|
3613
|
+
event: "progress",
|
|
3614
|
+
message: "\uC139\uC158 \uD30C\uC2F1 \uC2E4\uD328",
|
|
3615
|
+
meta: { section: si + 1 },
|
|
3616
|
+
error: { message: secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: secErr instanceof Error ? secErr.name : "Error" }
|
|
3617
|
+
});
|
|
3618
|
+
}
|
|
3619
|
+
}
|
|
3620
|
+
const images = await extractImagesFromZip(zip, blocks, decompressed, warnings);
|
|
3621
|
+
detectHwpxHeadings(blocks, styleMap);
|
|
3622
|
+
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
3623
|
+
const markdown = blocksToMarkdown(blocks);
|
|
3624
|
+
logger.log({
|
|
3625
|
+
level: "info",
|
|
3626
|
+
stage: "finalize",
|
|
3627
|
+
event: "done",
|
|
3628
|
+
message: "HWPX \uD30C\uC2F1 \uC644\uB8CC",
|
|
3629
|
+
meta: { blocks: blocks.length, warnings: warnings.length, images: images.length, outline: outline.length }
|
|
3630
|
+
});
|
|
3631
|
+
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
|
|
3632
|
+
} catch (err) {
|
|
3633
|
+
logger.log({
|
|
3634
|
+
level: "error",
|
|
3635
|
+
stage: "finalize",
|
|
3636
|
+
event: "error",
|
|
3637
|
+
message: "HWPX \uD30C\uC2F1 \uC2E4\uD328",
|
|
3638
|
+
meta: { lastParsedSection },
|
|
3639
|
+
error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
|
|
3640
|
+
});
|
|
3641
|
+
throw err;
|
|
3642
|
+
}
|
|
3330
3643
|
}
|
|
3331
3644
|
function imageExtToMime(ext) {
|
|
3332
3645
|
switch (ext.toLowerCase()) {
|
|
@@ -5043,75 +5356,115 @@ function parseLenientCfb(data) {
|
|
|
5043
5356
|
|
|
5044
5357
|
// src/hwp5/parser.ts
|
|
5045
5358
|
init_page_range();
|
|
5359
|
+
init_logger();
|
|
5046
5360
|
var CFB = __toESM(require_cfb(), 1);
|
|
5047
5361
|
var MAX_SECTIONS = 100;
|
|
5048
5362
|
var MAX_TOTAL_DECOMPRESS = 500 * 1024 * 1024;
|
|
5049
5363
|
/**
 * Parse an HWP 5.x document held in `buffer` and convert it to markdown.
 *
 * The container is first opened with the strict CFB parser; if that throws,
 * the lenient parser is tried and a LENIENT_CFB_RECOVERY warning is recorded.
 * Sections are parsed one by one: a failure inside a single section becomes a
 * PARTIAL_PARSE warning, whereas a KordocError aborts the whole parse.
 *
 * @param {Buffer} buffer - Raw file contents.
 * @param {object} [options] - Optional `pages` range filter and `onProgress(done, total)` callback.
 * @returns {{markdown: string, blocks: Array, metadata: object, outline?: Array, warnings?: Array, images?: Array}}
 *   `outline`, `warnings` and `images` are `undefined` when they would be empty.
 * @throws {KordocError} When the container, header or sections are unusable, the file is
 *   encrypted or DRM-protected, or the decompressed size limit is exceeded.
 */
function parseHwp5Document(buffer, options) {
  const logger = createLoggerFromEnv().child({ component: "hwp5/parser.ts", stage: "detect" });
  logger.log({ level: "info", event: "start", message: "HWP5 \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.length } });
  // Declared outside the try block so the outer catch can report it.
  let lastParsedSection = 0;
  try {
    let cfb = null;
    let lenientCfb = null;
    const warnings = [];
    try {
      cfb = CFB.parse(buffer);
    } catch {
      // Strict parsing failed: fall back to the lenient reader before giving up.
      try {
        lenientCfb = parseLenientCfb(buffer);
        warnings.push({ message: "\uC190\uC0C1\uB41C CFB \uCEE8\uD14C\uC774\uB108 \u2014 lenient \uBAA8\uB4DC\uB85C \uBCF5\uAD6C", code: "LENIENT_CFB_RECOVERY" });
      } catch {
        throw new KordocError("CFB \uCEE8\uD14C\uC774\uB108 \uD30C\uC2F1 \uC2E4\uD328 (strict \uBC0F lenient \uBAA8\uB450)");
      }
    }

    // Exactly one of `cfb` / `lenientCfb` is set from here on.
    const findStream = (path) => {
      if (!cfb) {
        return lenientCfb.findStream(path);
      }
      const entry = CFB.find(cfb, path);
      return entry?.content ? Buffer.from(entry.content) : null;
    };

    const headerData = findStream("/FileHeader");
    if (!headerData) {
      throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
    }
    const header = parseFileHeader(headerData);
    if (header.signature !== "HWP Document File") {
      throw new KordocError("HWP \uC2DC\uADF8\uB2C8\uCC98 \uBD88\uC77C\uCE58");
    }
    if (header.flags & FLAG_ENCRYPTED) {
      throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
    }
    if (header.flags & FLAG_DRM) {
      throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
    }
    const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
    const distribution = (header.flags & FLAG_DISTRIBUTION) !== 0;

    const metadata = {
      version: `${header.versionMajor}.x`
    };
    if (cfb) {
      extractHwp5Metadata(cfb, metadata);
    }

    let docInfo;
    if (cfb) {
      docInfo = parseDocInfoStream(cfb, compressed);
    } else {
      docInfo = parseDocInfoFromStream(findStream("/DocInfo"), compressed);
    }

    let sections;
    if (distribution) {
      sections = cfb ? findViewTextSections(cfb, compressed) : findViewTextSectionsLenient(lenientCfb, compressed);
    } else {
      sections = cfb ? findSections(cfb) : findSectionsLenient(lenientCfb, compressed);
    }
    if (sections.length === 0) {
      throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
    }
    logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uC139\uC158 \uBAA9\uB85D \uD574\uC11D \uC644\uB8CC", meta: { sections: sections.length, distribution } });
    metadata.pageCount = sections.length;

    const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
    const totalTarget = pageFilter ? pageFilter.size : sections.length;
    const blocks = [];
    let totalDecompressed = 0;
    let parsedSections = 0;
    // Only non-distribution, compressed documents are inflated here.
    const needsInflate = !distribution && compressed;

    for (let si = 0; si < sections.length; si++) {
      const sectionNo = si + 1;
      if (pageFilter && !pageFilter.has(sectionNo)) continue;
      try {
        const raw = Buffer.from(sections[si]);
        const data = needsInflate ? decompressStream(raw) : raw;
        totalDecompressed += data.length;
        if (totalDecompressed > MAX_TOTAL_DECOMPRESS) {
          throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
        }
        const records = readRecords(data);
        const sectionBlocks = parseSection(records, docInfo, warnings, sectionNo);
        blocks.push(...sectionBlocks);
        parsedSections++;
        options?.onProgress?.(parsedSections, totalTarget);
        logger.log({
          level: "debug",
          stage: "convert",
          event: "progress",
          message: "\uC139\uC158 \uD30C\uC2F1 \uC644\uB8CC",
          meta: { section: sectionNo, parsedSections, totalTarget }
        });
        lastParsedSection = sectionNo;
      } catch (secErr) {
        // Domain errors (size limit, etc.) abort the whole parse; anything else is per-section.
        if (secErr instanceof KordocError) throw secErr;
        const isError = secErr instanceof Error;
        const reason = isError ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958";
        warnings.push({ page: sectionNo, message: `\uC139\uC158 ${sectionNo} \uD30C\uC2F1 \uC2E4\uD328: ${reason}`, code: "PARTIAL_PARSE" });
        logger.log({
          level: "warn",
          stage: "convert",
          event: "progress",
          message: "\uC139\uC158 \uD30C\uC2F1 \uC2E4\uD328",
          meta: { section: sectionNo },
          error: { message: reason, name: isError ? secErr.name : "Error" }
        });
      }
    }

    const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
    if (docInfo) {
      detectHwp5Headings(blocks, docInfo);
    }
    const outline = blocks
      .filter((b) => b.type === "heading" && b.level && b.text)
      .map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
    const markdown = blocksToMarkdown(blocks);
    logger.log({
      level: "info",
      stage: "finalize",
      event: "done",
      message: "HWP5 \uD30C\uC2F1 \uC644\uB8CC",
      meta: { blocks: blocks.length, warnings: warnings.length, images: images.length, outline: outline.length }
    });
    return {
      markdown,
      blocks,
      metadata,
      outline: outline.length > 0 ? outline : void 0,
      warnings: warnings.length > 0 ? warnings : void 0,
      images: images.length > 0 ? images : void 0
    };
  } catch (err) {
    const isError = err instanceof Error;
    logger.log({
      level: "error",
      stage: "finalize",
      event: "error",
      message: "HWP5 \uD30C\uC2F1 \uC2E4\uD328",
      meta: { lastParsedSection },
      error: { message: isError ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: isError ? err.name : "Error", stack: isError ? err.stack : void 0 }
    });
    throw err;
  }
}
|
|
5116
5469
|
function parseDocInfoStream(cfb, compressed) {
|
|
5117
5470
|
try {
|
|
@@ -5658,6 +6011,8 @@ function arrangeCells(rows, cols, cells) {
|
|
|
5658
6011
|
|
|
5659
6012
|
// src/pdf/parser.ts
|
|
5660
6013
|
init_page_range();
|
|
6014
|
+
import { createRequire } from "module";
|
|
6015
|
+
import { dirname as dirname2, join as join3, resolve as resolve2 } from "path";
|
|
5661
6016
|
|
|
5662
6017
|
// src/pdf/line-detector.ts
|
|
5663
6018
|
import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
@@ -5845,12 +6200,17 @@ function buildTableGrids(horizontals, verticals) {
|
|
|
5845
6200
|
const rawXs = vLines.map((l) => l.x1);
|
|
5846
6201
|
const colXs = clusterCoordinates(rawXs).sort((a, b) => a - b);
|
|
5847
6202
|
if (rowYs.length < 2 || colXs.length < 2) continue;
|
|
6203
|
+
const rowCount = rowYs.length - 1;
|
|
6204
|
+
const colCount = colXs.length - 1;
|
|
6205
|
+
if (rowCount <= 0 || colCount <= 0) continue;
|
|
6206
|
+
if (rowCount * colCount < 2) continue;
|
|
5848
6207
|
const bbox = {
|
|
5849
6208
|
x1: colXs[0],
|
|
5850
6209
|
y1: rowYs[rowYs.length - 1],
|
|
5851
6210
|
x2: colXs[colXs.length - 1],
|
|
5852
6211
|
y2: rowYs[0]
|
|
5853
6212
|
};
|
|
6213
|
+
if (!hasReliableGridStructure(rowYs, colXs, hLines, vLines, bbox)) continue;
|
|
5854
6214
|
grids.push({ rowYs, colXs, bbox });
|
|
5855
6215
|
}
|
|
5856
6216
|
return mergeAdjacentGrids(grids);
|
|
@@ -5900,6 +6260,35 @@ function clusterCoordinates(values) {
|
|
|
5900
6260
|
}
|
|
5901
6261
|
return clusters.map((c) => c.sum / c.count);
|
|
5902
6262
|
}
|
|
6263
|
+
/**
 * Decide whether a candidate grid is backed by enough real ruling lines.
 *
 * A grid is accepted only when:
 *  - at least two horizontal and two vertical lines each span >= 70% of the
 *    bounding box in their own direction, and
 *  - at least half of the interior row/column coordinates coincide (within
 *    COORD_MERGE_TOL) with a line overlapping >= 55% of the bounding box.
 * A grid with no interior rows (or columns) passes the respective coverage test.
 *
 * @param {number[]} rowYs - Row boundary coordinates, including the outer edges.
 * @param {number[]} colXs - Column boundary coordinates, including the outer edges.
 * @param {Array<{x1:number,y1:number,x2:number,y2:number}>} hLines - Horizontal lines.
 * @param {Array<{x1:number,y1:number,x2:number,y2:number}>} vLines - Vertical lines.
 * @param {{x1:number,y1:number,x2:number,y2:number}} bbox - Grid bounding box.
 * @returns {boolean} True when the grid looks like a genuine table.
 */
function hasReliableGridStructure(rowYs, colXs, hLines, vLines, bbox) {
  // Drop the outer border coordinates; only interior separators are checked.
  const innerRowYs = rowYs.slice(1, -1);
  const innerColXs = colXs.slice(1, -1);
  const boxWidth = Math.max(1, bbox.x2 - bbox.x1);
  const boxHeight = Math.max(1, bbox.y2 - bbox.y1);

  // Axis support: enough long lines in both directions.
  let longHorizontalCount = 0;
  for (const h of hLines) {
    if (Math.abs(h.x2 - h.x1) >= boxWidth * 0.7) longHorizontalCount++;
  }
  let longVerticalCount = 0;
  for (const v of vLines) {
    if (Math.abs(v.y2 - v.y1) >= boxHeight * 0.7) longVerticalCount++;
  }
  if (longHorizontalCount < 2 || longVerticalCount < 2) {
    return false;
  }

  const minLineOverlap = 0.55;
  const rowIsBacked = (y) =>
    hLines.some((h) => Math.abs(h.y1 - y) <= COORD_MERGE_TOL && lineOverlapRatio(h.x1, h.x2, bbox.x1, bbox.x2) >= minLineOverlap);
  const colIsBacked = (x) =>
    vLines.some((v) => Math.abs(v.x1 - x) <= COORD_MERGE_TOL && lineOverlapRatio(v.y1, v.y2, bbox.y1, bbox.y2) >= minLineOverlap);

  if (innerRowYs.length > 0) {
    const backedRows = innerRowYs.filter(rowIsBacked).length;
    if (backedRows / innerRowYs.length < 0.5) return false;
  }
  if (innerColXs.length > 0) {
    const backedCols = innerColXs.filter(colIsBacked).length;
    if (backedCols / innerColXs.length < 0.5) return false;
  }
  return true;
}
|
|
6285
|
+
/**
 * Fraction of the range [b1, b2] that is covered by the segment [a1, a2].
 *
 * Endpoints may be given in either order. The denominator is clamped to at
 * least 1 so that very short ranges do not produce inflated ratios.
 *
 * @param {number} a1 - One end of the segment.
 * @param {number} a2 - Other end of the segment.
 * @param {number} b1 - One end of the reference range.
 * @param {number} b2 - Other end of the reference range.
 * @returns {number} Overlap length divided by max(1, |b2 - b1|).
 */
function lineOverlapRatio(a1, a2, b1, b2) {
  const segmentStart = Math.min(a1, a2);
  const segmentEnd = Math.max(a1, a2);
  const rangeStart = Math.min(b1, b2);
  const rangeEnd = Math.max(b1, b2);
  const sharedLength = Math.max(0, Math.min(segmentEnd, rangeEnd) - Math.max(segmentStart, rangeStart));
  return sharedLength / Math.max(1, Math.abs(b2 - b1));
}
|
|
5903
6292
|
function groupConnectedLines(lines) {
|
|
5904
6293
|
const parent = lines.map((_, i) => i);
|
|
5905
6294
|
function find2(x) {
|
|
@@ -6276,6 +6665,9 @@ function buildClusterTable(rows, columns, pageNum) {
|
|
|
6276
6665
|
};
|
|
6277
6666
|
}
|
|
6278
6667
|
|
|
6668
|
+
// src/pdf/parser.ts
|
|
6669
|
+
init_logger();
|
|
6670
|
+
|
|
6279
6671
|
// src/pdf/polyfill.ts
|
|
6280
6672
|
import * as pdfjsWorker from "pdfjs-dist/legacy/build/pdf.worker.mjs";
|
|
6281
6673
|
var g = globalThis;
|
|
@@ -6296,6 +6688,17 @@ g.pdfjsWorker = pdfjsWorker;
|
|
|
6296
6688
|
// src/pdf/parser.ts
|
|
6297
6689
|
import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mjs";
|
|
6298
6690
|
GlobalWorkerOptions.workerSrc = "";
|
|
6691
|
+
// CommonJS builds expose __filename; this ES-module build does not, so anchor
// module resolution at this file's own URL. Resolving relative to process.cwd()
// would miss pdfjs-dist whenever the tool is run from an unrelated directory.
var require2 = createRequire(
  typeof __filename !== "undefined" ? __filename : import.meta.url
);
/**
 * Locate the directory that holds pdfjs-dist's WebAssembly assets, for use as
 * the `wasmUrl` option of `getDocument`.
 *
 * Resolution is attempted through the module system first; if pdfjs-dist
 * cannot be resolved, a best-effort path under the current working directory
 * is returned instead.
 *
 * @returns {string} Directory path that always ends with a path separator.
 */
function resolvePdfjsWasmUrl() {
  try {
    const pdfjsPkg = require2.resolve("pdfjs-dist/package.json");
    return join3(dirname2(pdfjsPkg), "wasm/");
  } catch {
    // path.resolve() strips trailing separators, but the wasm directory must
    // keep one; path.join() preserves it.
    return join3(process.cwd(), "node_modules", "pdfjs-dist", "wasm/");
  }
}
|
|
6299
6702
|
var MAX_PAGES = 5e3;
|
|
6300
6703
|
var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
|
|
6301
6704
|
function calcPdfTimeout(bufferSize) {
|
|
@@ -6311,7 +6714,8 @@ async function loadPdfWithTimeout(buffer) {
|
|
|
6311
6714
|
data: new Uint8Array(buffer),
|
|
6312
6715
|
useSystemFonts: true,
|
|
6313
6716
|
disableFontFace: true,
|
|
6314
|
-
isEvalSupported: false
|
|
6717
|
+
isEvalSupported: false,
|
|
6718
|
+
wasmUrl: resolvePdfjsWasmUrl()
|
|
6315
6719
|
});
|
|
6316
6720
|
let timer;
|
|
6317
6721
|
try {
|
|
@@ -6328,7 +6732,47 @@ async function loadPdfWithTimeout(buffer) {
|
|
|
6328
6732
|
if (timer !== void 0) clearTimeout(timer);
|
|
6329
6733
|
}
|
|
6330
6734
|
}
|
|
6735
|
+
/**
 * Heuristically decide whether a PDF is image-based (scanned) from per-page
 * text statistics gathered on sampled pages.
 *
 * Low average character/item counts and a low share of pages with text raise
 * the score; high values lower it. The score is clamped to [0, 1] and the PDF
 * is treated as image-based when the score is >= 0.5. With no samples at all
 * the PDF is assumed to be image-based.
 *
 * @param {Array<{nonWhitespaceChars: number, visibleItems: number}>} metrics - One entry per sampled page.
 * @returns {{isImageBased: boolean, score: number, reason: string}}
 */
function estimateImageBasedPdf(metrics) {
  const pageTotal = metrics.length;
  if (pageTotal === 0) {
    return { isImageBased: true, score: 1, reason: "\uC0D8\uD50C \uD1B5\uACC4 \uC5C6\uC74C" };
  }

  let charSum = 0;
  let itemSum = 0;
  let textPages = 0;
  for (const page of metrics) {
    charSum += page.nonWhitespaceChars;
    itemSum += page.visibleItems;
    if (page.nonWhitespaceChars >= 20 || page.visibleItems >= 15) textPages++;
  }
  const avgChars = charSum / pageTotal;
  const avgItems = itemSum / pageTotal;
  const textPresenceRatio = textPages / pageTotal;

  // Applied strictly in this order so the floating-point sum is reproducible.
  const scoringRules = [
    [avgChars < 10, 0.45],
    [avgItems < 8, 0.35],
    [textPresenceRatio < 0.35, 0.25],
    [avgChars > 40, -0.35],
    [avgItems > 25, -0.35],
    [textPresenceRatio > 0.7, -0.25]
  ];
  let score = 0;
  for (const [applies, delta] of scoringRules) {
    if (applies) score += delta;
  }
  score = Math.max(0, Math.min(1, score));

  const reason = `avgChars=${avgChars.toFixed(1)}, avgItems=${avgItems.toFixed(1)}, textPresence=${(textPresenceRatio * 100).toFixed(0)}%, score=${score.toFixed(2)}`;
  return { isImageBased: score >= 0.5, score, reason };
}
|
|
6758
|
+
/**
 * Build a one-line summary of the pages that failed to parse.
 *
 * Page numbers are listed in ascending order; at most the first ten are shown
 * and the remainder is reported as a count.
 *
 * @param {number[]} failedPages - Page numbers that failed (not modified).
 * @param {number} totalTarget - Number of pages that were supposed to be parsed.
 * @returns {string|null} Summary text, or null when nothing failed.
 */
function summarizePartialFailures(failedPages, totalTarget) {
  if (failedPages.length === 0) {
    return null;
  }
  const previewLimit = 10;
  const ascending = Array.from(failedPages).sort((x, y) => x - y);
  const listed = ascending.slice(0, previewLimit).join(", ");
  let remainder = "";
  if (ascending.length > previewLimit) {
    remainder = ` \uC678 ${ascending.length - previewLimit}\uD398\uC774\uC9C0`;
  }
  return `\uBD80\uBD84 \uD30C\uC2F1 \uC2E4\uD328 \uC694\uC57D: ${ascending.length}/${totalTarget}\uD398\uC774\uC9C0 \uC2E4\uD328 (p${listed}${remainder})`;
}
|
|
6765
|
+
/**
 * Apply the optional "maximum partial failure ratio" policy.
 *
 * The policy is disabled (never aborts) when `maxPartialFailureRatio` is not a
 * number. NaN is treated the same way, so that an unparsable setting does not
 * leak `threshold: NaN` into the result. Other numeric values are clamped to
 * [0, 1].
 *
 * @param {number[]} failedPages - Page numbers that failed to parse.
 * @param {number} totalTarget - Number of pages that were supposed to be parsed.
 * @param {number} [maxPartialFailureRatio] - Allowed failure ratio (0..1).
 * @returns {{abort: boolean, ratio: number, threshold: number}} `abort` is true
 *   only when the observed ratio strictly exceeds the threshold.
 */
function shouldAbortForPartialFailures(failedPages, totalTarget, maxPartialFailureRatio) {
  const policyConfigured = typeof maxPartialFailureRatio === "number" && !Number.isNaN(maxPartialFailureRatio);
  if (!policyConfigured) {
    return { abort: false, ratio: 0, threshold: 0 };
  }
  const threshold = Math.max(0, Math.min(1, maxPartialFailureRatio));
  // Guard against division by zero when no pages were targeted.
  const ratio = totalTarget > 0 ? failedPages.length / totalTarget : 0;
  return { abort: ratio > threshold, ratio, threshold };
}
|
|
6331
6773
|
async function parsePdfDocument(buffer, options) {
|
|
6774
|
+
const logger = createLoggerFromEnv().child({ component: "pdf/parser.ts", stage: "detect" });
|
|
6775
|
+
logger.log({ level: "info", event: "start", message: "PDF \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
|
|
6332
6776
|
const doc = await loadPdfWithTimeout(buffer);
|
|
6333
6777
|
try {
|
|
6334
6778
|
const pageCount = doc.numPages;
|
|
@@ -6337,9 +6781,13 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6337
6781
|
await extractPdfMetadata(doc, metadata);
|
|
6338
6782
|
const blocks = [];
|
|
6339
6783
|
const warnings = [];
|
|
6784
|
+
const failedPages = [];
|
|
6785
|
+
let lastParsedPage2 = 0;
|
|
6786
|
+
const sampleMetricsByPage = /* @__PURE__ */ new Map();
|
|
6340
6787
|
let totalChars = 0;
|
|
6341
6788
|
let totalTextBytes = 0;
|
|
6342
6789
|
const effectivePageCount = Math.min(pageCount, MAX_PAGES);
|
|
6790
|
+
logger.log({ level: "debug", event: "progress", message: "PDF \uB85C\uB529 \uC644\uB8CC", meta: { pageCount, effectivePageCount } });
|
|
6343
6791
|
const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
|
|
6344
6792
|
const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
|
|
6345
6793
|
const fontSizeFreq = /* @__PURE__ */ new Map();
|
|
@@ -6376,11 +6824,17 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6376
6824
|
totalChars += t.replace(/\s/g, "").length;
|
|
6377
6825
|
totalTextBytes += t.length * 2;
|
|
6378
6826
|
}
|
|
6827
|
+
sampleMetricsByPage.set(i, {
|
|
6828
|
+
nonWhitespaceChars: visible.reduce((sum, it) => sum + it.text.replace(/\s/g, "").length, 0),
|
|
6829
|
+
visibleItems: visible.length
|
|
6830
|
+
});
|
|
6831
|
+
lastParsedPage2 = i;
|
|
6379
6832
|
if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
|
|
6380
6833
|
parsedPages++;
|
|
6381
6834
|
options?.onProgress?.(parsedPages, totalTarget);
|
|
6382
6835
|
} catch (pageErr) {
|
|
6383
6836
|
if (pageErr instanceof KordocError) throw pageErr;
|
|
6837
|
+
if (!failedPages.includes(i)) failedPages.push(i);
|
|
6384
6838
|
warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
|
|
6385
6839
|
}
|
|
6386
6840
|
};
|
|
@@ -6397,8 +6851,21 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6397
6851
|
for (const si of sampledIndices) {
|
|
6398
6852
|
await parseSinglePage(targetPageNums[si]);
|
|
6399
6853
|
}
|
|
6400
|
-
const
|
|
6401
|
-
const
|
|
6854
|
+
const sampledMetrics = [];
|
|
6855
|
+
for (const si of sampledIndices) {
|
|
6856
|
+
const pageNum = targetPageNums[si];
|
|
6857
|
+
const m = sampleMetricsByPage.get(pageNum);
|
|
6858
|
+
if (m) sampledMetrics.push(m);
|
|
6859
|
+
}
|
|
6860
|
+
const imageBasedDecision = estimateImageBasedPdf(sampledMetrics);
|
|
6861
|
+
const isImageBased = imageBasedDecision.isImageBased;
|
|
6862
|
+
logger.log({
|
|
6863
|
+
level: "info",
|
|
6864
|
+
stage: "probe",
|
|
6865
|
+
event: "done",
|
|
6866
|
+
message: "\uC774\uBBF8\uC9C0 \uAE30\uBC18 \uD310\uC815",
|
|
6867
|
+
meta: { isImageBased, reason: imageBasedDecision.reason, sampledPages: sampledMetrics.length }
|
|
6868
|
+
});
|
|
6402
6869
|
if (!isImageBased) {
|
|
6403
6870
|
for (let si = 0; si < targetPageNums.length; si++) {
|
|
6404
6871
|
if (!sampledIndices.has(si)) {
|
|
@@ -6406,11 +6873,41 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6406
6873
|
}
|
|
6407
6874
|
}
|
|
6408
6875
|
}
|
|
6876
|
+
const partialSummary = summarizePartialFailures(failedPages, totalTarget);
|
|
6877
|
+
if (partialSummary) {
|
|
6878
|
+
warnings.push({
|
|
6879
|
+
message: partialSummary,
|
|
6880
|
+
code: "PARTIAL_PARSE"
|
|
6881
|
+
});
|
|
6882
|
+
}
|
|
6883
|
+
if (isImageBased) {
|
|
6884
|
+
warnings.push({
|
|
6885
|
+
message: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 \uD310\uC815: ${imageBasedDecision.reason}`,
|
|
6886
|
+
code: "OCR_FALLBACK"
|
|
6887
|
+
});
|
|
6888
|
+
}
|
|
6889
|
+
const partialPolicy = shouldAbortForPartialFailures(
|
|
6890
|
+
failedPages,
|
|
6891
|
+
totalTarget,
|
|
6892
|
+
options?.maxPartialFailureRatio
|
|
6893
|
+
);
|
|
6894
|
+
if (partialPolicy.abort) {
|
|
6895
|
+
throw new KordocError(
|
|
6896
|
+
`\uBD80\uBD84 \uD30C\uC2F1 \uC2E4\uD328 \uBE44\uC728 \uCD08\uACFC: ${(partialPolicy.ratio * 100).toFixed(1)}% (\uD5C8\uC6A9 ${(partialPolicy.threshold * 100).toFixed(1)}%)`
|
|
6897
|
+
);
|
|
6898
|
+
}
|
|
6409
6899
|
const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
|
|
6410
6900
|
if (isImageBased) {
|
|
6411
6901
|
const ocrMode = options?.ocrMode ?? "auto";
|
|
6412
6902
|
const concurrency = options?.ocrConcurrency ?? 1;
|
|
6413
6903
|
const batchSize = options?.ocrBatchSize;
|
|
6904
|
+
logger.log({
|
|
6905
|
+
level: "info",
|
|
6906
|
+
stage: "ocr",
|
|
6907
|
+
event: "start",
|
|
6908
|
+
message: "\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF OCR \uC2DC\uC791",
|
|
6909
|
+
meta: { ocrMode, concurrency, batchSize, totalTarget }
|
|
6910
|
+
});
|
|
6414
6911
|
if (ocrMode === "off") {
|
|
6415
6912
|
throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
|
|
6416
6913
|
}
|
|
@@ -6418,8 +6915,10 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6418
6915
|
const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
|
|
6419
6916
|
const tryProvider = async (provider, filter) => {
|
|
6420
6917
|
try {
|
|
6918
|
+
logger.log({ level: "debug", stage: "ocr", event: "progress", message: "OCR provider \uC2E4\uD589", meta: { filteredPages: filter?.size } });
|
|
6421
6919
|
return await ocrPages2(doc, provider, filter, effectivePageCount, warnings, concurrency, options?.onProgress);
|
|
6422
6920
|
} catch {
|
|
6921
|
+
logger.log({ level: "warn", stage: "ocr", event: "progress", message: "OCR provider \uC2E4\uD589 \uC2E4\uD328(\uBE48 \uACB0\uACFC\uB85C \uCC98\uB9AC)" });
|
|
6423
6922
|
return [];
|
|
6424
6923
|
} finally {
|
|
6425
6924
|
const terminable = provider;
|
|
@@ -6442,6 +6941,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6442
6941
|
for (const mode of getAutoFallbackChain2()) {
|
|
6443
6942
|
if (pendingPages.size === 0) break;
|
|
6444
6943
|
try {
|
|
6944
|
+
logger.log({ level: "info", stage: "ocr", event: "progress", message: "OCR \uC5D4\uC9C4 \uC2DC\uB3C4", meta: { mode, pendingPages: pendingPages.size } });
|
|
6445
6945
|
const modeFilter = pendingPages.size < effectivePageCount ? new Set(pendingPages) : pageFilter;
|
|
6446
6946
|
const provider = await resolveOcrProvider2(mode, warnings, concurrency, batchSize);
|
|
6447
6947
|
const blocks2 = await tryProvider(provider, modeFilter);
|
|
@@ -6456,10 +6956,20 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6456
6956
|
code: "OCR_CLI_FALLBACK"
|
|
6457
6957
|
});
|
|
6458
6958
|
}
|
|
6959
|
+
logger.log({ level: "info", stage: "ocr", event: "progress", message: "OCR \uC5D4\uC9C4 \uCC98\uB9AC \uC644\uB8CC", meta: { mode, blocks: blocks2.length, pendingPages: pendingPages.size } });
|
|
6459
6960
|
} else {
|
|
6460
6961
|
warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
|
|
6962
|
+
logger.log({ level: "warn", stage: "ocr", event: "progress", message: "OCR \uC5D4\uC9C4 \uACB0\uACFC \uC5C6\uC74C", meta: { mode } });
|
|
6461
6963
|
}
|
|
6462
|
-
} catch {
|
|
6964
|
+
} catch (engineErr) {
|
|
6965
|
+
logger.log({
|
|
6966
|
+
level: "warn",
|
|
6967
|
+
stage: "ocr",
|
|
6968
|
+
event: "progress",
|
|
6969
|
+
message: "OCR \uC5D4\uC9C4 \uCD08\uAE30\uD654/\uC2E4\uD589 \uC2E4\uD328",
|
|
6970
|
+
meta: { mode },
|
|
6971
|
+
error: { message: engineErr instanceof Error ? engineErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: engineErr instanceof Error ? engineErr.name : "Error" }
|
|
6972
|
+
});
|
|
6463
6973
|
}
|
|
6464
6974
|
}
|
|
6465
6975
|
allOcrBlocks.sort((a, b) => (a.pageNumber ?? 0) - (b.pageNumber ?? 0));
|
|
@@ -6477,6 +6987,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6477
6987
|
}
|
|
6478
6988
|
if (ocrBlocks.length > 0) {
|
|
6479
6989
|
const ocrMarkdown = blocksToMarkdown(ocrBlocks);
|
|
6990
|
+
logger.log({ level: "info", stage: "ocr", event: "done", message: "\uC774\uBBF8\uC9C0 \uAE30\uBC18 OCR \uC644\uB8CC", meta: { blocks: ocrBlocks.length } });
|
|
6480
6991
|
return {
|
|
6481
6992
|
markdown: ocrMarkdown,
|
|
6482
6993
|
blocks: ocrBlocks,
|
|
@@ -6502,8 +7013,25 @@ async function parsePdfDocument(buffer, options) {
|
|
|
6502
7013
|
}
|
|
6503
7014
|
detectMarkerHeadings(blocks);
|
|
6504
7015
|
const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
|
|
6505
|
-
let markdown = cleanPdfText(blocksToMarkdown(blocks));
|
|
7016
|
+
let markdown = cleanPdfText(blocksToMarkdown(blocks), options?.pdfTextNormalization ?? "default");
|
|
7017
|
+
logger.log({
|
|
7018
|
+
level: "info",
|
|
7019
|
+
stage: "finalize",
|
|
7020
|
+
event: "done",
|
|
7021
|
+
message: "PDF \uD30C\uC2F1 \uC644\uB8CC",
|
|
7022
|
+
meta: { blocks: blocks.length, warnings: warnings.length, outline: outline.length, isImageBased: false }
|
|
7023
|
+
});
|
|
6506
7024
|
return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
|
|
7025
|
+
} catch (err) {
|
|
7026
|
+
logger.log({
|
|
7027
|
+
level: "error",
|
|
7028
|
+
stage: "finalize",
|
|
7029
|
+
event: "error",
|
|
7030
|
+
message: "PDF \uD30C\uC2F1 \uC2E4\uD328",
|
|
7031
|
+
meta: { lastParsedPage },
|
|
7032
|
+
error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
|
|
7033
|
+
});
|
|
7034
|
+
throw err;
|
|
6507
7035
|
} finally {
|
|
6508
7036
|
await doc.destroy().catch(() => {
|
|
6509
7037
|
});
|
|
@@ -6597,6 +7125,17 @@ function shouldDemoteTable(table) {
|
|
|
6597
7125
|
const emptyCells = totalCells - allCells.length;
|
|
6598
7126
|
if (table.rows <= 2 && emptyCells > totalCells * 0.5) return true;
|
|
6599
7127
|
if (table.rows === 1 && !/\d{2,}/.test(allText)) return true;
|
|
7128
|
+
if (table.cols >= 3 && table.rows <= 4) {
|
|
7129
|
+
const markerCells = allCells.filter((t) => /^[□■◆○●▶▷◇◆]/.test(t)).length;
|
|
7130
|
+
const numericCells = allCells.filter((t) => /\d/.test(t)).length;
|
|
7131
|
+
if (markerCells >= Math.max(1, Math.floor(allCells.length * 0.35)) && numericCells <= Math.floor(allCells.length * 0.15)) {
|
|
7132
|
+
return true;
|
|
7133
|
+
}
|
|
7134
|
+
}
|
|
7135
|
+
if (table.cols >= 3 && table.rows >= 2) {
|
|
7136
|
+
const sparseRows = table.cells.filter((row) => row.filter((c) => c.text.trim()).length <= 1).length;
|
|
7137
|
+
if (sparseRows >= Math.ceil(table.rows * 0.7)) return true;
|
|
7138
|
+
}
|
|
6600
7139
|
return false;
|
|
6601
7140
|
}
|
|
6602
7141
|
function demoteTableToText(table) {
|
|
@@ -7152,10 +7691,15 @@ function mergeLineSimple(items) {
|
|
|
7152
7691
|
}
|
|
7153
7692
|
return result;
|
|
7154
7693
|
}
|
|
7155
|
-
function
|
|
7156
|
-
return
|
|
7157
|
-
|
|
7158
|
-
|
|
7694
|
+
/**
 * Strip page-number artifacts from text extracted from a PDF.
 *
 * Three shapes are removed:
 *  - decorated numbers alone on a line, e.g. "- 3 -" or "— 12 —";
 *  - "current / total" counters alone on a line, e.g. "3 / 10";
 *  - bare 1-4 digit numbers alone on a line (including a trailing one at the
 *    very end of the text).
 *
 * @param {string} text - Raw extracted text.
 * @returns {string} The text with the artifacts removed. Decorated numbers and
 *   counters leave an empty line behind; bare-number lines are removed entirely.
 */
function stripPdfPageNumberArtifacts(text) {
  return text
    // Decorated page numbers such as "- 3 -".
    .replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "")
    // "n / m" page counters.
    .replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "")
    // Bare numeric lines. The closing newline is only asserted (lookahead), not
    // consumed, so it can also open the next match; consuming it made every
    // second line of a run of consecutive numeric lines survive.
    .replace(/\n\d{1,4}(?=\n)/g, "")
    // A bare number that is the last line of the text.
    .replace(/\n\d{1,4}$/, "");
}
|
|
7697
|
+
/**
 * Normalize text extracted from a PDF.
 *
 * Page-number artifacts are always stripped first. After that:
 *  - "strict-preserve" only caps runs of four or more newlines at three;
 *  - any other mode merges lines via mergeKoreanLines, passes every 3-30
 *    character line that does not start with "|" through collapseEvenSpacing,
 *    and caps runs of three or more newlines at two.
 *
 * @param {string} text - Raw extracted text.
 * @param {string} [mode="default"] - "strict-preserve" selects the lighter cleanup.
 * @returns {string} The cleaned, trimmed text.
 */
function cleanPdfText(text, mode = "default") {
  const withoutPageNumbers = stripPdfPageNumberArtifacts(text);
  const preserveLayout = mode === "strict-preserve";
  if (!preserveLayout) {
    const merged = mergeKoreanLines(withoutPageNumbers);
    const respaced = merged.replace(/^(?!\|).{3,30}$/gm, (shortLine) => collapseEvenSpacing(shortLine));
    const compacted = respaced.replace(/\n{3,}/g, "\n\n");
    return compacted.trim();
  }
  const capped = withoutPageNumbers.replace(/\n{4,}/g, "\n\n\n");
  return capped.trim();
}
|
|
7160
7704
|
function startsWithMarker(line) {
|
|
7161
7705
|
const t = line.trimStart();
|
|
@@ -7359,6 +7903,7 @@ function mergeKoreanLines(text) {
|
|
|
7359
7903
|
// src/xlsx/parser.ts
|
|
7360
7904
|
import JSZip3 from "jszip";
|
|
7361
7905
|
import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
|
|
7906
|
+
init_logger();
|
|
7362
7907
|
var MAX_SHEETS = 100;
|
|
7363
7908
|
var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
|
|
7364
7909
|
var MAX_ROWS2 = 1e4;
|
|
@@ -7548,105 +8093,145 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
|
|
|
7548
8093
|
return blocks;
|
|
7549
8094
|
}
|
|
7550
8095
|
/**
 * Parse an XLSX workbook into markdown, content blocks, and metadata.
 *
 * At most MAX_SHEETS sheets are visited, and parsing stops early once the
 * running cell count exceeds MAX_TOTAL_CELLS. A sheet that is missing from the
 * archive or fails to parse is reported as a PARTIAL_PARSE warning and skipped;
 * it does not abort the whole document.
 *
 * @param {ArrayBuffer} buffer - Workbook bytes (only `byteLength` is read here
 *   before the buffer is handed to the zip helpers).
 * @param {object} [options] - Optional settings; `pages` (sheet selection,
 *   1-based) and `onProgress(current, total)` are the fields read here.
 * @param {object} [existingZip] - Already-loaded JSZip instance to reuse
 *   instead of loading `buffer` again.
 * @returns {Promise<{markdown: string, blocks: Array, metadata: object, warnings: (Array|undefined)}>}
 *   `warnings` is undefined when nothing was reported.
 * @throws {KordocError} When `xl/workbook.xml` is missing or lists no sheets.
 *   Any other failure is logged and rethrown unchanged.
 */
async function parseXlsxDocument(buffer, options, existingZip) {
  const logger = createLoggerFromEnv().child({ component: "xlsx/parser.ts", stage: "detect" });
  logger.log({ level: "info", event: "start", message: "XLSX \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
  // 1-based index of the last sheet converted successfully; reported if parsing fails.
  let lastProcessedSheet = 0;
  try {
    precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
    const zip = existingZip ?? await JSZip3.loadAsync(buffer);
    const warnings = [];
    const workbookFile = zip.file("xl/workbook.xml");
    if (!workbookFile) {
      throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
    }
    let sharedStrings = [];
    const ssFile = zip.file("xl/sharedStrings.xml");
    if (ssFile) {
      sharedStrings = parseSharedStrings(await ssFile.async("text"));
    }
    const sheets = parseWorkbook(await workbookFile.async("text"));
    if (sheets.length === 0) {
      throw new KordocError("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
    }
    logger.log({ level: "debug", event: "progress", message: "\uC2DC\uD2B8 \uBAA9\uB85D \uB85C\uB4DC", meta: { sheets: sheets.length } });
    let relsMap = new Map();
    const relsFile = zip.file("xl/_rels/workbook.xml.rels");
    if (relsFile) {
      relsMap = parseRels(await relsFile.async("text"));
    }
    // Optional sheet selection; sheets are addressed by 1-based position.
    let pageFilter = null;
    if (options?.pages) {
      const { parsePageRange: parsePageRange2 } = await Promise.resolve().then(() => (init_page_range(), page_range_exports));
      pageFilter = parsePageRange2(options.pages, sheets.length);
    }
    const blocks = [];
    const processedSheets = Math.min(sheets.length, MAX_SHEETS);
    let totalCells = 0;
    for (let i = 0; i < processedSheets; i++) {
      if (pageFilter && !pageFilter.has(i + 1)) continue;
      const sheet = sheets[i];
      options?.onProgress?.(i + 1, processedSheets);
      // Resolve the relationship target to an archive path: bare targets are
      // placed under "xl/", a leading "/" is dropped, and a sheet without a
      // relationship entry falls back to the conventional positional file name.
      let sheetPath = relsMap.get(sheet.rId);
      if (sheetPath) {
        if (!sheetPath.startsWith("xl/") && !sheetPath.startsWith("/")) {
          sheetPath = `xl/${sheetPath}`;
        } else if (sheetPath.startsWith("/")) {
          sheetPath = sheetPath.slice(1);
        }
      } else {
        sheetPath = `xl/worksheets/sheet${i + 1}.xml`;
      }
      const sheetFile = zip.file(sheetPath);
      if (!sheetFile) {
        warnings.push({
          page: i + 1,
          message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${sheetPath}`,
          code: "PARTIAL_PARSE"
        });
        continue;
      }
      try {
        const sheetXml = await sheetFile.async("text");
        const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
        totalCells += maxRow * maxCol;
        if (totalCells > MAX_TOTAL_CELLS) {
          // The sheet that crosses the limit is dropped along with all later ones.
          warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
          break;
        }
        const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
        blocks.push(...sheetBlocks);
        logger.log({
          level: "debug",
          stage: "convert",
          event: "progress",
          message: "\uC2DC\uD2B8 \uD30C\uC2F1 \uC644\uB8CC",
          meta: { sheet: sheet.name, index: i + 1, processedSheets }
        });
        lastProcessedSheet = i + 1;
      } catch (err) {
        // One broken sheet must not discard the sheets that did parse.
        warnings.push({
          page: i + 1,
          message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC2F1 \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
          code: "PARTIAL_PARSE"
        });
        logger.log({
          level: "warn",
          stage: "convert",
          event: "progress",
          message: "\uC2DC\uD2B8 \uD30C\uC2F1 \uC2E4\uD328",
          meta: { sheet: sheet.name, index: i + 1 },
          error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error" }
        });
      }
    }
    const metadata = {
      pageCount: processedSheets
    };
    const coreFile = zip.file("docProps/core.xml");
    if (coreFile) {
      try {
        const coreXml = await coreFile.async("text");
        const doc = parseXml(coreXml);
        const getFirst = (tag) => {
          const els = doc.getElementsByTagName(tag);
          return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
        };
        metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
        metadata.author = getFirst("dc:creator");
        metadata.description = getFirst("dc:description");
        const created = getFirst("dcterms:created");
        if (created) metadata.createdAt = created;
        const modified = getFirst("dcterms:modified");
        if (modified) metadata.modifiedAt = modified;
      } catch (err) {
        // Document properties are best-effort: keep the parsed sheets, but leave
        // a trace of why the metadata is incomplete instead of hiding the error.
        logger.log({
          level: "warn",
          stage: "finalize",
          event: "progress",
          message: "\uD30C\uC2F1 \uC2E4\uD328",
          meta: { part: "docProps/core.xml" },
          error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error" }
        });
      }
    }
    const markdown = blocksToMarkdown(blocks);
    logger.log({
      level: "info",
      stage: "finalize",
      event: "done",
      message: "XLSX \uD30C\uC2F1 \uC644\uB8CC",
      meta: { blocks: blocks.length, warnings: warnings.length, pageCount: processedSheets }
    });
    return { markdown, blocks, metadata, warnings: warnings.length > 0 ? warnings : void 0 };
  } catch (err) {
    // Record how far parsing got, then let the caller see the original error.
    logger.log({
      level: "error",
      stage: "finalize",
      event: "error",
      message: "XLSX \uD30C\uC2F1 \uC2E4\uD328",
      meta: { lastProcessedSheet },
      error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
    });
    throw err;
  }
}
|
|
7646
8230
|
|
|
7647
8231
|
// src/docx/parser.ts
|
|
7648
8232
|
import JSZip4 from "jszip";
|
|
7649
8233
|
import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
|
|
8234
|
+
init_logger();
|
|
7650
8235
|
var MAX_DECOMPRESS_SIZE4 = 500 * 1024 * 1024;
|
|
7651
8236
|
function getChildElements(parent, localName) {
|
|
7652
8237
|
const result = [];
|
|
@@ -8008,101 +8593,127 @@ async function extractImages(zip, rels, doc) {
|
|
|
8008
8593
|
return { blocks, images };
|
|
8009
8594
|
}
|
|
8010
8595
|
/**
 * Parse a DOCX document into markdown, content blocks, metadata, an outline,
 * and extracted images.
 *
 * `word/document.xml` and its `w:body` are required. Styles, numbering,
 * footnotes, and `docProps/core.xml` are optional: when one of them is missing
 * or cannot be parsed, parsing continues without it and the failure is logged.
 *
 * @param {ArrayBuffer} buffer - Document bytes (only `byteLength` is read here
 *   before the buffer is handed to the zip helpers).
 * @param {object} [options] - Accepted for signature parity with the other
 *   parsers; not read by this function.
 * @param {object} [existingZip] - Already-loaded JSZip instance to reuse
 *   instead of loading `buffer` again.
 * @returns {Promise<{markdown: string, blocks: Array, metadata: object, outline: (Array|undefined), warnings: (Array|undefined), images: (Array|undefined)}>}
 *   `outline`, `warnings`, and `images` are undefined when empty.
 * @throws {KordocError} When `word/document.xml` or its body element is missing.
 *   Any other failure is logged and rethrown unchanged.
 */
async function parseDocxDocument(buffer, options, existingZip) {
  const logger = createLoggerFromEnv().child({ component: "docx/parser.ts", stage: "detect" });
  logger.log({ level: "info", event: "start", message: "DOCX \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
  // Count of body child nodes visited so far; reported if parsing fails.
  let lastProcessedNode = 0;
  // Optional parts never abort parsing, but a failure is logged rather than hidden.
  const logOptionalPartFailure = (part, stage, err) => {
    logger.log({
      level: "warn",
      stage,
      event: "progress",
      message: "\uD30C\uC2F1 \uC2E4\uD328",
      meta: { part },
      error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error" }
    });
  };
  try {
    precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
    const zip = existingZip ?? await JSZip4.loadAsync(buffer);
    const warnings = [];
    const docFile = zip.file("word/document.xml");
    if (!docFile) {
      throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
    }
    // Relationships are parsed without a guard: a parse error here propagates.
    let rels = new Map();
    const relsFile = zip.file("word/_rels/document.xml.rels");
    if (relsFile) {
      rels = parseRels2(await relsFile.async("text"));
    }
    // Read an optional part; a missing or unparsable part yields an empty Map.
    const readOptionalPart = async (path, parse) => {
      const partFile = zip.file(path);
      if (!partFile) return new Map();
      try {
        return parse(await partFile.async("text"));
      } catch (err) {
        logOptionalPartFailure(path, "detect", err);
        return new Map();
      }
    };
    const styles = await readOptionalPart("word/styles.xml", (xml) => parseStyles(xml));
    const numbering = await readOptionalPart("word/numbering.xml", (xml) => parseNumbering(xml));
    const footnotes = await readOptionalPart("word/footnotes.xml", (xml) => parseFootnotes(xml));
    const docXml = await docFile.async("text");
    const doc = parseXml2(docXml);
    const body = findElements(doc, "body");
    if (body.length === 0) {
      throw new KordocError("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
    }
    const blocks = [];
    const bodyEl = body[0];
    const children = bodyEl.childNodes;
    for (let i = 0; i < children.length; i++) {
      const node = children[i];
      // nodeType 1 is an element node; text and comment nodes are skipped.
      if (node.nodeType !== 1) continue;
      const el = node;
      const localName = el.localName ?? el.tagName?.split(":").pop();
      if (localName === "p") {
        const block = parseParagraph(el, styles, numbering, footnotes, rels);
        if (block) blocks.push(block);
      } else if (localName === "tbl") {
        const block = parseTable(el, styles, numbering, footnotes, rels);
        if (block) blocks.push(block);
      }
      lastProcessedNode = i + 1;
    }
    logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uBCF8\uBB38 \uBE14\uB85D \uD30C\uC2F1 \uC644\uB8CC", meta: { blocks: blocks.length } });
    // NOTE(review): imgBlocks is only counted in the log below and is never
    // appended to `blocks` — confirm that is intentional.
    const { blocks: imgBlocks, images } = await extractImages(zip, rels, doc);
    logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uC774\uBBF8\uC9C0 \uCD94\uCD9C \uC644\uB8CC", meta: { imageBlocks: imgBlocks.length, images: images.length } });
    const metadata = {};
    const coreFile = zip.file("docProps/core.xml");
    if (coreFile) {
      try {
        const coreXml = await coreFile.async("text");
        const coreDoc = parseXml2(coreXml);
        const getFirst = (tag) => {
          const els = coreDoc.getElementsByTagName(tag);
          return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
        };
        metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
        metadata.author = getFirst("dc:creator");
        metadata.description = getFirst("dc:description");
        const created = getFirst("dcterms:created");
        if (created) metadata.createdAt = created;
        const modified = getFirst("dcterms:modified");
        if (modified) metadata.modifiedAt = modified;
      } catch (err) {
        logOptionalPartFailure("docProps/core.xml", "finalize", err);
      }
    }
    // Headings without an explicit level are listed at level 2.
    const outline = blocks.filter((b) => b.type === "heading").map((b) => ({ level: b.level ?? 2, text: b.text ?? "" }));
    const markdown = blocksToMarkdown(blocks);
    logger.log({
      level: "info",
      stage: "finalize",
      event: "done",
      message: "DOCX \uD30C\uC2F1 \uC644\uB8CC",
      meta: { blocks: blocks.length, warnings: warnings.length, outline: outline.length, images: images.length }
    });
    return {
      markdown,
      blocks,
      metadata,
      outline: outline.length > 0 ? outline : void 0,
      warnings: warnings.length > 0 ? warnings : void 0,
      images: images.length > 0 ? images : void 0
    };
  } catch (err) {
    // Record how far parsing got, then let the caller see the original error.
    logger.log({
      level: "error",
      stage: "finalize",
      event: "error",
      message: "DOCX \uD30C\uC2F1 \uC2E4\uD328",
      meta: { lastProcessedNode },
      error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
    });
    throw err;
  }
}
|
|
8101
8711
|
|
|
8102
8712
|
// src/index.ts
|
|
8103
8713
|
init_cli_provider();
|
|
8104
8714
|
init_tesseract_provider();
|
|
8105
8715
|
init_markdown_to_blocks();
|
|
8716
|
+
init_logger();
|
|
8106
8717
|
|
|
8107
8718
|
// src/diff/text-diff.ts
|
|
8108
8719
|
function similarity(a, b) {
|
|
@@ -10601,15 +11212,726 @@ async function markdownToXlsx(markdown, options) {
|
|
|
10601
11212
|
return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
|
|
10602
11213
|
}
|
|
10603
11214
|
|
|
11215
|
+
// src/ocr/api-key-rotation.ts
|
|
11216
|
+
/**
 * Thrown when a key is requested while every key is still cooling down.
 * `waitMs` carries the number of milliseconds given to the constructor.
 */
var AllKeysCoolingDownError = class extends Error {
  /** @param {number} waitMs - Milliseconds to wait before retrying. */
  constructor(waitMs) {
    const retryHint = `\uBAA8\uB4E0 API \uD0A4\uAC00 cooldown \uC0C1\uD0DC\uC785\uB2C8\uB2E4. ${waitMs}ms \uD6C4 \uC7AC\uC2DC\uB3C4\uD558\uC138\uC694.`;
    super(retryHint);
    this.waitMs = waitMs;
    this.name = "AllKeysCoolingDownError";
  }
};
|
|
11224
|
+
/**
 * Round-robin pool of API keys with per-key exponential cooldown.
 *
 * Keys are handed out in rotation, skipping any key whose cooldown has not
 * expired. A retryable failure (timeout, HTTP 429, HTTP 5xx) puts the key on
 * cooldown; a success clears it. Keys are identified to callers as `key_<n>`.
 */
var ApiKeyRotationPool = class _ApiKeyRotationPool {
  states;
  baseCooldownMs;
  maxCooldownMs;
  // Index of the key handed out most recently; -1 until the first acquire().
  cursor = -1;

  /**
   * @param {string[]} keys - Raw keys; each is trimmed and blank entries are dropped.
   * @param {{baseCooldownMs?: number, maxCooldownMs?: number}} [options] -
   *   Cooldown bounds; default to 5000 ms and 120000 ms.
   * @throws {Error} When no non-blank key remains.
   */
  constructor(keys, options = {}) {
    const usableKeys = keys.map((raw) => raw.trim()).filter((trimmed) => Boolean(trimmed));
    if (usableKeys.length === 0) {
      throw new Error("API \uD0A4\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
    }
    this.states = usableKeys.map((key, position) => {
      const keyId = `key_${position + 1}`;
      return {
        key,
        keyId,
        totalRequests: 0,
        successCount: 0,
        failureCount: 0,
        consecutiveFailures: 0
      };
    });
    const { baseCooldownMs, maxCooldownMs } = options;
    this.baseCooldownMs = baseCooldownMs ?? 5e3;
    this.maxCooldownMs = maxCooldownMs ?? 12e4;
  }

  /**
   * Build a pool from NVIDIA_API_KEYS (comma-separated) or, when that yields
   * no key, from NVIDIA_API_KEY.
   * @param {object} [env=process.env] - Environment to read.
   * @throws {Error} When neither variable provides a key.
   */
  static fromEnv(env = process.env) {
    const listed = (env.NVIDIA_API_KEYS || "")
      .split(",")
      .map((entry) => entry.trim())
      .filter(Boolean);
    if (listed.length > 0) {
      return new _ApiKeyRotationPool(listed);
    }
    const lone = (env.NVIDIA_API_KEY || "").trim();
    if (!lone) {
      throw new Error("NVIDIA_API_KEYS \uB610\uB294 NVIDIA_API_KEY \uD658\uACBD\uBCC0\uC218\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4.");
    }
    return new _ApiKeyRotationPool([lone]);
  }

  /**
   * Hand out the next key that is not cooling down, starting after the key
   * returned last time.
   * @param {number} [now=Date.now()] - Current time in milliseconds.
   * @returns {{key: string, keyId: string}}
   * @throws {AllKeysCoolingDownError} When every key is cooling down; carries
   *   the time until the earliest cooldown ends.
   */
  acquire(now = Date.now()) {
    const total = this.states.length;
    let offset = 0;
    while (offset < total) {
      offset += 1;
      const slot = (this.cursor + offset) % total;
      const state = this.states[slot];
      const available = !state.cooldownUntil || state.cooldownUntil <= now;
      if (!available) continue;
      this.cursor = slot;
      state.totalRequests += 1;
      state.lastUsedAt = now;
      const { key, keyId } = state;
      return { key, keyId };
    }
    let earliest = Number.POSITIVE_INFINITY;
    for (const state of this.states) {
      earliest = Math.min(earliest, state.cooldownUntil ?? now);
    }
    throw new AllKeysCoolingDownError(Math.max(0, earliest - now));
  }

  /** Record a success for `keyId`: resets the failure streak and the cooldown. */
  markSuccess(keyId) {
    const state = this.find(keyId);
    state.successCount += 1;
    state.consecutiveFailures = 0;
    state.cooldownUntil = void 0;
  }

  /**
   * Record a failure for `keyId`. Only retryable failures start a cooldown:
   * the base cooldown doubles per consecutive failure, is capped at the
   * maximum, and is never shorter than `opts.retryAfterMs`.
   * @param {string} keyId
   * @param {{status?: number, timeout?: boolean, retryAfterMs?: number}} [opts]
   * @param {number} [now=Date.now()] - Current time in milliseconds.
   */
  markFailure(keyId, opts = {}, now = Date.now()) {
    const state = this.find(keyId);
    state.failureCount += 1;
    state.consecutiveFailures += 1;
    if (this.isRetryableFailure(opts.status, opts.timeout)) {
      const doublings = Math.max(0, state.consecutiveFailures - 1);
      const exponential = this.baseCooldownMs * 2 ** doublings;
      const capped = Math.min(exponential, this.maxCooldownMs);
      state.cooldownUntil = now + Math.max(capped, opts.retryAfterMs ?? 0);
    }
  }

  /** Per-key counters and timestamps; the key value itself is not included. */
  snapshot() {
    return this.states.map(
      ({ keyId, totalRequests, successCount, failureCount, consecutiveFailures, lastUsedAt, cooldownUntil }) => ({
        keyId,
        totalRequests,
        successCount,
        failureCount,
        consecutiveFailures,
        lastUsedAt,
        cooldownUntil
      })
    );
  }

  /** True for a timeout, HTTP 429, or any numeric status of 500 and above. */
  isRetryableFailure(status, timeout) {
    return timeout ? true : status === 429 || (typeof status === "number" && status >= 500);
  }

  /** Look up the state for `keyId`; throws for an id this pool did not issue. */
  find(keyId) {
    for (const state of this.states) {
      if (state.keyId === keyId) return state;
    }
    throw new Error(`\uC54C \uC218 \uC5C6\uB294 keyId: ${keyId}`);
  }
};
|
|
11307
|
+
|
|
11308
|
+
// src/pipeline/unified-ocr.ts
|
|
11309
|
+
import { mkdir, readdir, readFile, stat, writeFile } from "fs/promises";
|
|
11310
|
+
import { basename as basename2, dirname as dirname3, extname, join as join4, resolve as resolve3 } from "path";
|
|
11311
|
+
import { spawn as spawn2 } from "child_process";
|
|
11312
|
+
import libre from "libreoffice-convert";
|
|
11313
|
+
init_logger();
|
|
11314
|
+
var libreConvert = libre.convert;
|
|
11315
|
+
/**
 * Error raised by the unified OCR pipeline. `code` is a machine-readable
 * failure code and `stage` names the pipeline stage that failed.
 */
var UnifiedOcrError = class extends Error {
  /**
   * @param {string} code - Failure code, e.g. "RENDER_FAILED".
   * @param {string} stage - Pipeline stage, e.g. "render".
   * @param {string} message - Human-readable description.
   */
  constructor(code, stage, message) {
    super(message);
    this.code = code;
    this.stage = stage;
    this.name = "UnifiedOcrError";
  }
};
|
|
11325
|
+
// Model candidates used when the caller supplies no `modelCandidates`.
var DEFAULT_MODELS = [
  "mistralai/mistral-medium-3-instruct",
  "moonshotai/kimi-k2.5",
  "moonshotai/kimi-k2-thinking",
  "moonshotai/kimi-k2-instruct",
  "moonshotai/kimi-k2-instruct-0905",
  "qwen/qwen3.5-122b-a10b",
  "qwen/qwen3.5-397b-a17b"
];

// Per-model max_tokens: 8192 for the Mistral model, 64000 for every other default model.
var DEFAULT_MODEL_MAX_TOKENS = Object.fromEntries(
  DEFAULT_MODELS.map((model) => [model, model === "mistralai/mistral-medium-3-instruct" ? 8192 : 64e3])
);

// Relative weight of each pipeline stage in overall progress; callers may
// override individual entries through `options.stageWeights`.
var DEFAULT_STAGE_WEIGHTS = { convert: 15, render: 20, probe: 5, ocr: 45, proofread: 10, merge: 5 };

// Prompt sent with a page image for OCR.
var OCR_PROMPT2 = "\uC774 \uC774\uBBF8\uC9C0 1\uC7A5\uC758 \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uC694\uC57D \uC5C6\uC774 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uCD94\uCD9C\uD558\uC138\uC694. \uC808\uB300\uB85C \uB0B4\uC6A9\uC744 \uCD94\uCE21\uD558\uAC70\uB098 \uBC14\uAFB8\uC9C0 \uB9C8\uC138\uC694.";

// Proofreading prompt: one instruction line, a "rules" header, and four rules,
// joined with newlines.
var PROOFREAD_PROMPT = [
  "\uC544\uB798 Markdown\uC744 \uBE44\uD30C\uAD34 \uAD50\uC815\uB9CC \uC218\uD589\uD558\uC138\uC694.",
  "\uADDC\uCE59:",
  "- \uC0AC\uC2E4 \uCD94\uAC00/\uC0AD\uC81C/\uCD94\uCE21 \uAE08\uC9C0",
  "- \uC22B\uC790, \uB2E8\uC704, \uACE0\uC720\uBA85\uC0AC \uBCC0\uACBD \uAE08\uC9C0",
  "- \uC624\uD0C8\uC790, \uB744\uC5B4\uC4F0\uAE30, \uC904\uBC14\uAFC8, Markdown \uAD6C\uC870\uB9CC \uAD50\uC815",
  "- \uACB0\uACFC\uB294 Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825"
].join("\n");
|
|
11360
|
+
async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
11361
|
+
const absInput = resolve3(inputPath);
|
|
11362
|
+
const stem = basename2(absInput, extname(absInput));
|
|
11363
|
+
const workspaceDir = resolve3(options.workspaceDir ?? join4(dirname3(absInput), `${stem}_ocr_workspace`));
|
|
11364
|
+
const imagesDir = join4(workspaceDir, "images");
|
|
11365
|
+
const rawDir = join4(workspaceDir, "ocr", "raw");
|
|
11366
|
+
const proofDir = join4(workspaceDir, "ocr", "proofread");
|
|
11367
|
+
const diffDir = join4(workspaceDir, "ocr", "diff");
|
|
11368
|
+
const outputPath = resolve3(options.outputPath ?? join4(dirname3(absInput), `${stem}.md`));
|
|
11369
|
+
const reportPath = join4(workspaceDir, "run-report.json");
|
|
11370
|
+
const modelCachePath = join4(dirname3(absInput), ".kordoc-model-cache.json");
|
|
11371
|
+
const baseUrl = options.baseUrl ?? "https://integrate.api.nvidia.com/v1/chat/completions";
|
|
11372
|
+
const timeoutMs = options.timeoutMs ?? 6e4;
|
|
11373
|
+
const maxRetriesPerPage = options.maxRetriesPerPage ?? 5;
|
|
11374
|
+
const dpi = options.dpi ?? 300;
|
|
11375
|
+
const modelsInput = options.modelCandidates?.length ? options.modelCandidates : DEFAULT_MODELS;
|
|
11376
|
+
const modelCache = await loadModelCache(modelCachePath);
|
|
11377
|
+
const models = sortModelsByCache(modelsInput, modelCache);
|
|
11378
|
+
const modelMaxTokens = { ...DEFAULT_MODEL_MAX_TOKENS, ...options.modelMaxTokens ?? {} };
|
|
11379
|
+
const stageWeights = normalizeWeights({ ...DEFAULT_STAGE_WEIGHTS, ...options.stageWeights ?? {} });
|
|
11380
|
+
const keyPool = ApiKeyRotationPool.fromEnv();
|
|
11381
|
+
const runId = options.runId ?? generateRunId("ocr");
|
|
11382
|
+
const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
|
|
11383
|
+
await mkdir(imagesDir, { recursive: true });
|
|
11384
|
+
await mkdir(rawDir, { recursive: true });
|
|
11385
|
+
await mkdir(proofDir, { recursive: true });
|
|
11386
|
+
await mkdir(diffDir, { recursive: true });
|
|
11387
|
+
const timingsMs = {};
|
|
11388
|
+
const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
|
|
11389
|
+
const markStageProgress = (stage, stagePercent, current, total, message) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message });
|
|
11390
|
+
const markStageDone = (stage, message) => emitProgress(options.onEvent, stage, 100, stageWeights, { message, type: "stage_done" });
|
|
11391
|
+
let currentStage = "convert";
|
|
11392
|
+
const logStage = (level, stage, event, message, meta) => {
|
|
11393
|
+
logger.log({ level, stage, event, message, meta });
|
|
11394
|
+
};
|
|
11395
|
+
try {
|
|
11396
|
+
ensureSupportedInput(absInput);
|
|
11397
|
+
let workingPdfPath = absInput;
|
|
11398
|
+
const convertStart = Date.now();
|
|
11399
|
+
currentStage = "convert";
|
|
11400
|
+
markStageStart("convert", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC911");
|
|
11401
|
+
logStage("info", "convert", "start", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC2DC\uC791", { input: absInput });
|
|
11402
|
+
if (extname(absInput).toLowerCase() !== ".pdf") {
|
|
11403
|
+
await assertSofficeAvailable();
|
|
11404
|
+
workingPdfPath = join4(workspaceDir, `${stem}.pdf`);
|
|
11405
|
+
const inputBuffer = await readFile(absInput);
|
|
11406
|
+
const out = await convertWithLibreOffice(inputBuffer, ".pdf");
|
|
11407
|
+
await writeFile(workingPdfPath, out);
|
|
11408
|
+
}
|
|
11409
|
+
timingsMs.convert = Date.now() - convertStart;
|
|
11410
|
+
markStageDone("convert", "PDF \uBCC0\uD658 \uC644\uB8CC");
|
|
11411
|
+
logStage("info", "convert", "done", "PDF \uBCC0\uD658 \uC644\uB8CC", { elapsedMs: timingsMs.convert });
|
|
11412
|
+
const renderStart = Date.now();
|
|
11413
|
+
currentStage = "render";
|
|
11414
|
+
markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
|
|
11415
|
+
logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi });
|
|
11416
|
+
await renderPdfToPng(workingPdfPath, join4(imagesDir, "page"), dpi);
|
|
11417
|
+
const images = await listPageImages(imagesDir);
|
|
11418
|
+
if (images.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328: \uACB0\uACFC \uC774\uBBF8\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
|
|
11419
|
+
markStageProgress("render", 100, images.length, images.length, `\uD398\uC774\uC9C0 ${images.length}\uC7A5 \uC0DD\uC131`);
|
|
11420
|
+
timingsMs.render = Date.now() - renderStart;
|
|
11421
|
+
markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
|
|
11422
|
+
logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: images.length, elapsedMs: timingsMs.render });
|
|
11423
|
+
const probeStart = Date.now();
|
|
11424
|
+
currentStage = "probe";
|
|
11425
|
+
markStageStart("probe", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC218\uD589 \uC911");
|
|
11426
|
+
logStage("info", "probe", "start", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC2DC\uC791", { models });
|
|
11427
|
+
const probeImage = await pickRepresentativeImage(images);
|
|
11428
|
+
const probeResults = [];
|
|
11429
|
+
for (let i = 0; i < models.length; i++) {
|
|
11430
|
+
const model = models[i];
|
|
11431
|
+
const t0 = Date.now();
|
|
11432
|
+
try {
|
|
11433
|
+
await ocrImageViaNim({
|
|
11434
|
+
imagePath: probeImage,
|
|
11435
|
+
prompt: OCR_PROMPT2,
|
|
11436
|
+
model,
|
|
11437
|
+
maxTokens: modelMaxTokens[model] ?? 8192,
|
|
11438
|
+
baseUrl,
|
|
11439
|
+
keyPool,
|
|
11440
|
+
timeoutMs,
|
|
11441
|
+
maxRetries: 2,
|
|
11442
|
+
logger,
|
|
11443
|
+
stage: "probe"
|
|
11444
|
+
});
|
|
11445
|
+
probeResults.push({ model, durationMs: Date.now() - t0, success: true });
|
|
11446
|
+
} catch (err) {
|
|
11447
|
+
probeResults.push({
|
|
11448
|
+
model,
|
|
11449
|
+
durationMs: Date.now() - t0,
|
|
11450
|
+
success: false,
|
|
11451
|
+
error: err instanceof Error ? err.message : String(err)
|
|
11452
|
+
});
|
|
11453
|
+
}
|
|
11454
|
+
markStageProgress("probe", Math.round((i + 1) / models.length * 100), i + 1, models.length, `\uBAA8\uB378 \uD504\uB85C\uBE0C ${i + 1}/${models.length}`);
|
|
11455
|
+
logStage("debug", "probe", "progress", "\uBAA8\uB378 \uD504\uB85C\uBE0C \uC9C4\uD589", { index: i + 1, total: models.length, model, result: probeResults.at(-1) });
|
|
11456
|
+
}
|
|
11457
|
+
const selectedModel = chooseFastestModel(probeResults);
|
|
11458
|
+
if (!selectedModel) throw new UnifiedOcrError("PROBE_FAILED", "probe", "\uC18D\uB3C4 \uD504\uB85C\uBE0C \uC2E4\uD328: \uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uBAA8\uB378\uC774 \uC5C6\uC2B5\uB2C8\uB2E4.");
|
|
11459
|
+
const fallbackModelOrder = probeResults.filter((r) => r.success).sort((a, b) => a.durationMs - b.durationMs).map((r) => r.model);
|
|
11460
|
+
timingsMs.probe = Date.now() - probeStart;
|
|
11461
|
+
await updateModelCache(modelCachePath, probeResults);
|
|
11462
|
+
markStageDone("probe", `\uD504\uB85C\uBE0C \uC644\uB8CC: ${selectedModel}`);
|
|
11463
|
+
logStage("info", "probe", "done", "\uBAA8\uB378 \uD504\uB85C\uBE0C \uC644\uB8CC", { selectedModel, probeResults, elapsedMs: timingsMs.probe, modelCachePath });
|
|
11464
|
+
const ocrStart = Date.now();
|
|
11465
|
+
currentStage = "ocr";
|
|
11466
|
+
markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (${selectedModel})`);
|
|
11467
|
+
logStage("info", "ocr", "start", "\uD398\uC774\uC9C0 OCR \uC2DC\uC791", { selectedModel, pageCount: images.length });
|
|
11468
|
+
const rawPagePaths = [];
|
|
11469
|
+
for (let i = 0; i < images.length; i++) {
|
|
11470
|
+
const imagePath = images[i];
|
|
11471
|
+
const markdown = await ocrImageWithFallback({
|
|
11472
|
+
imagePath,
|
|
11473
|
+
prompt: OCR_PROMPT2,
|
|
11474
|
+
models: fallbackModelOrder,
|
|
11475
|
+
modelMaxTokens,
|
|
11476
|
+
baseUrl,
|
|
11477
|
+
keyPool,
|
|
11478
|
+
timeoutMs,
|
|
11479
|
+
maxRetriesPerPage,
|
|
11480
|
+
logger
|
|
11481
|
+
});
|
|
11482
|
+
const pagePath = join4(rawDir, `page_${String(i + 1).padStart(4, "0")}.md`);
|
|
11483
|
+
await writeFile(pagePath, markdown, "utf-8");
|
|
11484
|
+
rawPagePaths.push(pagePath);
|
|
11485
|
+
markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}`);
|
|
11486
|
+
logStage("debug", "ocr", "progress", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { page: i + 1, total: images.length });
|
|
11487
|
+
}
|
|
11488
|
+
timingsMs.ocr = Date.now() - ocrStart;
|
|
11489
|
+
markStageDone("ocr", "OCR \uC644\uB8CC");
|
|
11490
|
+
logStage("info", "ocr", "done", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { elapsedMs: timingsMs.ocr });
|
|
11491
|
+
const proofStart = Date.now();
|
|
11492
|
+
currentStage = "proofread";
|
|
11493
|
+
markStageStart("proofread", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC9C4\uD589 \uC911");
|
|
11494
|
+
logStage("info", "proofread", "start", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC2DC\uC791", { pages: rawPagePaths.length });
|
|
11495
|
+
const proofedPaths = [];
|
|
11496
|
+
for (let i = 0; i < rawPagePaths.length; i++) {
|
|
11497
|
+
const rawMd = await readFile(rawPagePaths[i], "utf-8");
|
|
11498
|
+
const prompt = `${PROOFREAD_PROMPT}
|
|
11499
|
+
|
|
11500
|
+
---
|
|
11501
|
+
${rawMd}
|
|
11502
|
+
---`;
|
|
11503
|
+
const corrected = await ocrImageViaNim({
|
|
11504
|
+
textOnlyPrompt: prompt,
|
|
11505
|
+
model: selectedModel,
|
|
11506
|
+
maxTokens: modelMaxTokens[selectedModel] ?? 8192,
|
|
11507
|
+
baseUrl,
|
|
11508
|
+
keyPool,
|
|
11509
|
+
timeoutMs,
|
|
11510
|
+
maxRetries: maxRetriesPerPage,
|
|
11511
|
+
logger,
|
|
11512
|
+
stage: "proofread"
|
|
11513
|
+
});
|
|
11514
|
+
const safeCorrected = preserveNumericIntegrity(rawMd, corrected);
|
|
11515
|
+
const taggedCorrected = addUncertainTag(rawMd, safeCorrected);
|
|
11516
|
+
const pagePath = join4(proofDir, `page_${String(i + 1).padStart(4, "0")}.md`);
|
|
11517
|
+
await writeFile(pagePath, taggedCorrected, "utf-8");
|
|
11518
|
+
await writeFile(
|
|
11519
|
+
join4(diffDir, `page_${String(i + 1).padStart(4, "0")}.json`),
|
|
11520
|
+
JSON.stringify(buildDiffSummary(rawMd, taggedCorrected), null, 2),
|
|
11521
|
+
"utf-8"
|
|
11522
|
+
);
|
|
11523
|
+
proofedPaths.push(pagePath);
|
|
11524
|
+
markStageProgress("proofread", Math.round((i + 1) / rawPagePaths.length * 100), i + 1, rawPagePaths.length, `\uAD50\uC815 ${i + 1}/${rawPagePaths.length}`);
|
|
11525
|
+
logStage("debug", "proofread", "progress", "\uD398\uC774\uC9C0 \uAD50\uC815 \uC644\uB8CC", { page: i + 1, total: rawPagePaths.length });
|
|
11526
|
+
}
|
|
11527
|
+
timingsMs.proofread = Date.now() - proofStart;
|
|
11528
|
+
markStageDone("proofread", "\uAD50\uC815 \uC644\uB8CC");
|
|
11529
|
+
logStage("info", "proofread", "done", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC644\uB8CC", { elapsedMs: timingsMs.proofread });
|
|
11530
|
+
const mergeStart = Date.now();
|
|
11531
|
+
currentStage = "merge";
|
|
11532
|
+
markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
|
|
11533
|
+
logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: proofedPaths.length });
|
|
11534
|
+
const merged = await mergeMarkdownPages(proofedPaths);
|
|
11535
|
+
await writeFile(outputPath, merged, "utf-8");
|
|
11536
|
+
timingsMs.merge = Date.now() - mergeStart;
|
|
11537
|
+
markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
|
|
11538
|
+
logStage("info", "merge", "done", "\uCD5C\uC885 \uBCD1\uD569 \uC644\uB8CC", { outputPath, elapsedMs: timingsMs.merge });
|
|
11539
|
+
const report = {
|
|
11540
|
+
inputPath: absInput,
|
|
11541
|
+
outputPath,
|
|
11542
|
+
workspaceDir,
|
|
11543
|
+
selectedModel,
|
|
11544
|
+
probeImage,
|
|
11545
|
+
probeResults,
|
|
11546
|
+
pageCount: images.length,
|
|
11547
|
+
keyHealth: keyPool.snapshot(),
|
|
11548
|
+
timingsMs,
|
|
11549
|
+
modelCachePath
|
|
11550
|
+
};
|
|
11551
|
+
await writeFile(reportPath, JSON.stringify(report, null, 2), "utf-8");
|
|
11552
|
+
logStage("info", "finalize", "done", "run-report \uC800\uC7A5 \uC644\uB8CC", { reportPath });
|
|
11553
|
+
return { outputPath, reportPath, selectedModel };
|
|
11554
|
+
} catch (err) {
|
|
11555
|
+
const normalized = normalizePipelineError(err, currentStage);
|
|
11556
|
+
emitProgress(options.onEvent, currentStage, 0, stageWeights, {
|
|
11557
|
+
type: "error",
|
|
11558
|
+
code: normalized.code,
|
|
11559
|
+
message: normalized.message
|
|
11560
|
+
});
|
|
11561
|
+
logger.log({
|
|
11562
|
+
level: "error",
|
|
11563
|
+
stage: currentStage,
|
|
11564
|
+
event: "error",
|
|
11565
|
+
message: normalized.message,
|
|
11566
|
+
error: {
|
|
11567
|
+
code: normalized.code,
|
|
11568
|
+
name: normalized.name,
|
|
11569
|
+
message: normalized.message,
|
|
11570
|
+
stack: normalized.stack
|
|
11571
|
+
}
|
|
11572
|
+
});
|
|
11573
|
+
throw normalized;
|
|
11574
|
+
}
|
|
11575
|
+
}
|
|
11576
|
+
/**
 * Rescales the six pipeline stage weights so that together they add up to 100.
 *
 * @param {{convert: number, render: number, probe: number, ocr: number, proofread: number, merge: number}} weights
 *   Relative weight of each stage. Every value of the object contributes to the total.
 * @returns {{convert: number, render: number, probe: number, ocr: number, proofread: number, merge: number}}
 *   The share of each stage expressed as a percentage of the total.
 */
function normalizeWeights(weights) {
  let total = 0;
  for (const value of Object.values(weights)) {
    total += value;
  }
  // A falsy total (0 or NaN) is replaced by 1 so the division below is never by zero.
  const divisor = total || 1;
  const toShare = (value) => value / divisor * 100;
  return {
    convert: toShare(weights.convert),
    render: toShare(weights.render),
    probe: toShare(weights.probe),
    ocr: toShare(weights.ocr),
    proofread: toShare(weights.proofread),
    merge: toShare(weights.merge)
  };
}
|
|
11587
|
+
/**
 * Converts progress inside one stage into progress across the whole pipeline.
 *
 * Stages that come before `stage` in the fixed order count in full; `stage`
 * itself counts proportionally to `stagePercent` (clamped to 0..100). A stage
 * name that is not in the list makes every stage count in full.
 *
 * @param {string} stage - Name of the stage currently running.
 * @param {number} stagePercent - Progress within that stage.
 * @param {Record<string, number>} weights - Weight of each stage, keyed by stage name.
 * @returns {number} The accumulated weight, rounded to the nearest integer.
 */
function computeOverallPercent(stage, stagePercent, weights) {
  const stageOrder = ["convert", "render", "probe", "ocr", "proofread", "merge"];
  const position = stageOrder.indexOf(stage);
  const finished = position === -1 ? stageOrder : stageOrder.slice(0, position);
  let total = finished.reduce((acc, name) => acc + weights[name], 0);
  if (position !== -1) {
    const clamped = Math.max(0, Math.min(100, stagePercent));
    total += weights[stage] * clamped / 100;
  }
  return Math.round(total);
}
|
|
11599
|
+
/**
 * Builds a progress event and hands it to the caller-supplied callback.
 *
 * @param {((event: object) => void) | undefined | null} cb - Event sink; nothing happens when it is falsy.
 * @param {string} stage - Stage the event belongs to.
 * @param {number} stagePercent - Progress within the stage; rounded and clamped to 0..100 in the event.
 * @param {Record<string, number>} weights - Stage weights forwarded to computeOverallPercent.
 * @param {{type?: string, current?: number, total?: number, code?: string, message?: string}} extra
 *   Additional event fields; `type` defaults to "stage_progress".
 * @returns {void}
 */
function emitProgress(cb, stage, stagePercent, weights, extra) {
  if (!cb) {
    return;
  }
  const { type, current, total, code, message } = extra;
  const roundedStagePercent = Math.round(stagePercent);
  const event = {
    type: type ?? "stage_progress",
    stage,
    stagePercent: Math.max(0, Math.min(100, roundedStagePercent)),
    overallPercent: computeOverallPercent(stage, stagePercent, weights),
    current,
    total,
    code,
    message
  };
  cb(event);
}
|
|
11612
|
+
/**
 * Promise wrapper around the callback-style `libreConvert` call.
 *
 * @param {Buffer} buffer - Contents of the document to convert.
 * @param {string} ext - Target extension passed to `libreConvert` (for example ".pdf").
 * @returns {Promise<Buffer>} The converted output handed to the callback.
 * @throws {UnifiedOcrError} Code "CONVERT_FAILED" when the callback reports an
 *   error or yields no output.
 */
async function convertWithLibreOffice(buffer, ext) {
  const converted = await new Promise((onConverted, onFailed) => {
    const handleResult = (conversionError, output) => {
      const succeeded = !conversionError && Boolean(output);
      if (succeeded) {
        onConverted(output);
        return;
      }
      const reason = conversionError?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328";
      onFailed(new UnifiedOcrError("CONVERT_FAILED", "convert", reason));
    };
    // The third argument is deliberately left undefined, as in the original call.
    libreConvert(buffer, ext, void 0, handleResult);
  });
  return converted;
}
|
|
11623
|
+
/**
 * Rasterises a PDF into PNG files by running the `pdftoppm` command.
 *
 * @param {string} pdfPath - PDF file to render.
 * @param {string} prefixPath - Output path prefix passed to `pdftoppm`.
 * @param {number} dpi - Resolution passed through the `-r` option.
 * @returns {Promise<void>}
 * @throws {UnifiedOcrError} Code "RENDER_FAILED" when the command cannot be run or fails.
 */
async function renderPdfToPng(pdfPath, prefixPath, dpi) {
  const commandArgs = ["-png", "-r", String(dpi), pdfPath, prefixPath];
  try {
    await runCommand("pdftoppm", commandArgs);
  } catch (failure) {
    const detail = failure instanceof Error ? failure.message : String(failure);
    throw new UnifiedOcrError("RENDER_FAILED", "render", detail);
  }
}
|
|
11630
|
+
/**
 * Runs an external command and resolves once it exits with status 0.
 *
 * Only stderr is captured. stdin and stdout are detached ("ignore"): nothing
 * in this function ever reads the child's stdout or writes to its stdin, and a
 * piped-but-unread stdout makes a chatty child block forever once the pipe
 * buffer fills up, which would leave this promise pending.
 *
 * @param {string} cmd - Executable to spawn.
 * @param {string[]} args - Arguments passed to the executable.
 * @returns {Promise<void>}
 * @throws {Error} When the process cannot be spawned, or when it closes with a
 *   non-zero code; the message then contains the code and the trimmed stderr.
 */
async function runCommand(cmd, args) {
  await new Promise((resolvePromise, reject) => {
    const child = spawn2(cmd, args, { stdio: ["ignore", "ignore", "pipe"] });
    let stderr = "";
    // Decode as a stream so multi-byte characters split across chunks stay intact.
    child.stderr.setEncoding("utf8");
    child.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    child.on("error", reject);
    child.on("close", (code) => {
      if (code === 0) {
        resolvePromise();
        return;
      }
      reject(new Error(`${cmd} \uC2E4\uD328 (code=${code}): ${stderr.trim()}`));
    });
  });
}
|
|
11644
|
+
/**
 * Verifies that `soffice --version` can be executed.
 *
 * Any failure of that command is reported the same way; the underlying error
 * detail is intentionally not forwarded.
 *
 * @returns {Promise<void>}
 * @throws {UnifiedOcrError} Code "SOFFICE_NOT_FOUND" when the command fails.
 */
async function assertSofficeAvailable() {
  let available = true;
  try {
    await runCommand("soffice", ["--version"]);
  } catch {
    available = false;
  }
  if (available) {
    return;
  }
  throw new UnifiedOcrError("SOFFICE_NOT_FOUND", "convert", "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.");
}
|
|
11651
|
+
/**
 * Lists the PNG files of a directory in page order.
 *
 * @param {string} imagesDir - Directory to scan (not recursive).
 * @returns {Promise<string[]>} Paths (directory joined with file name) of every
 *   entry ending in ".png", ordered by naturalPageSort.
 */
async function listPageImages(imagesDir) {
  const entries = await readdir(imagesDir);
  const pngNames = [];
  for (const entry of entries) {
    if (entry.endsWith(".png")) {
      pngNames.push(entry);
    }
  }
  pngNames.sort((left, right) => naturalPageSort(left, right));
  return pngNames.map((name) => join4(imagesDir, name));
}
|
|
11655
|
+
/**
 * Comparator that orders file names by the last run of digits they contain.
 *
 * Names without any digit are treated as number 0.
 *
 * @param {string} a - First file name.
 * @param {string} b - Second file name.
 * @returns {number} Negative, zero or positive, as expected by Array.prototype.sort.
 */
function naturalPageSort(a, b) {
  const trailingNumber = (name) => {
    const digitRuns = name.match(/\d+/g);
    const lastRun = digitRuns === null ? "0" : digitRuns[digitRuns.length - 1];
    return Number(lastRun || 0);
  };
  return trailingNumber(a) - trailingNumber(b);
}
|
|
11660
|
+
/**
 * Picks one page image, by file size, to stand in for the whole document.
 *
 * Only the first eight images are considered. Files larger than 8 KiB are
 * preferred; when none qualifies, every sampled file is a candidate. The
 * candidates are ordered by size and the middle one (upper median) is returned.
 *
 * Each file is stat-ed exactly once, and the stats run in parallel.
 *
 * @param {string[]} images - Image paths in page order.
 * @returns {Promise<string>} Path of the selected image.
 * @throws {Error} When `images` is empty, or when a file cannot be stat-ed.
 */
async function pickRepresentativeImage(images) {
  if (images.length === 0) {
    // Fail with an explicit message instead of a TypeError on `undefined.path`.
    throw new Error("프로브 이미지를 선택할 수 없습니다: 이미지 목록이 비어 있습니다.");
  }
  const minUsefulBytes = 8 * 1024;
  const sample = images.slice(0, 8);
  const sized = await Promise.all(
    sample.map(async (path) => ({ path, size: (await stat(path)).size }))
  );
  const substantial = sized.filter((entry) => entry.size > minUsefulBytes);
  const candidates = substantial.length > 0 ? substantial : sized;
  candidates.sort((a, b) => a.size - b.size);
  return candidates[Math.floor(candidates.length / 2)].path;
}
|
|
11671
|
+
/**
 * Returns the model whose successful probe took the least time.
 *
 * When several successful probes share the smallest duration, the one that
 * appears first in `results` wins.
 *
 * @param {Array<{model: string, durationMs: number, success: boolean}>} results - Probe outcomes.
 * @returns {string | null} The winning model name, or null when no probe succeeded.
 */
function chooseFastestModel(results) {
  let fastest = null;
  for (const probe of results) {
    if (!probe.success) {
      continue;
    }
    if (fastest === null || probe.durationMs < fastest.durationMs) {
      fastest = probe;
    }
  }
  return fastest?.model ?? null;
}
|
|
11675
|
+
/**
 * Reads the model cache file and parses it as JSON.
 *
 * This is best-effort: a missing file, an unreadable file and invalid JSON
 * all yield null rather than an error.
 *
 * @param {string} path - Location of the cache file.
 * @returns {Promise<any | null>} The parsed JSON value, or null on any failure.
 */
async function loadModelCache(path) {
  let parsed;
  try {
    const text = await readFile(path, "utf-8");
    parsed = JSON.parse(text);
  } catch {
    parsed = null;
  }
  return parsed;
}
|
|
11683
|
+
/**
 * Orders model names by their cached average duration, fastest first.
 *
 * Models with no usable cached duration sort after all cached ones and keep
 * their relative input order. The input array is never mutated.
 *
 * The cache comes from a JSON file on disk, so its shape is not trusted: a
 * cache without a `models` object, or entries whose `avgDurationMs` is not a
 * finite number, are treated as "no data" instead of throwing.
 *
 * @param {string[]} models - Candidate model names.
 * @param {{models?: Record<string, {avgDurationMs?: number}>} | null | undefined} cache - Parsed cache, if any.
 * @returns {string[]} A new, sorted array of model names.
 */
function sortModelsByCache(models, cache) {
  const known = cache?.models;
  if (known === null || typeof known !== "object") {
    return [...models];
  }
  const durationOf = (model) => {
    const avg = known[model]?.avgDurationMs;
    return Number.isFinite(avg) ? avg : Number.POSITIVE_INFINITY;
  };
  return [...models].sort((a, b) => {
    const av = durationOf(a);
    const bv = durationOf(b);
    // Compare explicitly: Infinity - Infinity would produce NaN.
    if (av === bv) {
      return 0;
    }
    return av < bv ? -1 : 1;
  });
}
|
|
11691
|
+
/**
 * Folds the successful probe timings into the on-disk model cache.
 *
 * For every successful probe the model's running average duration and sample
 * count are updated; failed probes are ignored. The cache is then rewritten
 * with a fresh `updatedAt` timestamp.
 *
 * The previous cache content is not trusted: if it has no `models` object the
 * cache is rebuilt from scratch, and an entry without a usable count/average
 * is restarted from the current probe instead of producing NaN.
 *
 * @param {string} path - Location of the cache file.
 * @param {Array<{model: string, durationMs: number, success: boolean}>} probes - Probe outcomes of this run.
 * @returns {Promise<void>}
 * @throws {Error} When the cache file cannot be written.
 */
async function updateModelCache(path, probes) {
  const prev = await loadModelCache(path);
  const prevIsUsable = prev !== null
    && typeof prev === "object"
    && prev.models !== null
    && typeof prev.models === "object"
    && !Array.isArray(prev.models);
  const current = prevIsUsable ? prev : { updatedAt: (/* @__PURE__ */ new Date()).toISOString(), models: {} };
  for (const p of probes) {
    if (!p.success) continue;
    const existing = current.models[p.model];
    const hasHistory = Number.isFinite(existing?.count)
      && existing.count > 0
      && Number.isFinite(existing?.avgDurationMs);
    if (!hasHistory) {
      current.models[p.model] = { count: 1, avgDurationMs: p.durationMs };
      continue;
    }
    const nextCount = existing.count + 1;
    current.models[p.model] = {
      count: nextCount,
      // Incremental mean: previous sum plus the new sample, over the new count.
      avgDurationMs: Math.round((existing.avgDurationMs * existing.count + p.durationMs) / nextCount)
    };
  }
  current.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
  await writeFile(path, JSON.stringify(current, null, 2), "utf-8");
}
|
|
11710
|
+
/**
 * Runs OCR on one image, trying each model in order until one succeeds.
 *
 * @param {object} input
 * @param {string} input.imagePath - Image to recognise.
 * @param {string} input.prompt - Prompt forwarded to ocrImageViaNim.
 * @param {string[]} input.models - Models to try, in priority order.
 * @param {Record<string, number>} input.modelMaxTokens - Per-model token limit; 8192 is used for models not listed.
 * @param {string} input.baseUrl - Forwarded to ocrImageViaNim.
 * @param {object} input.keyPool - Forwarded to ocrImageViaNim.
 * @param {number} input.timeoutMs - Forwarded to ocrImageViaNim.
 * @param {number} input.maxRetriesPerPage - Forwarded to ocrImageViaNim as `maxRetries`.
 * @param {object} [input.logger] - Forwarded to ocrImageViaNim.
 * @returns {Promise<string>} The result of the first model that succeeds.
 * @throws {UnifiedOcrError} Code "OCR_FAILED" carrying the message of the last
 *   failure when every model fails (or when `models` is empty).
 */
async function ocrImageWithFallback(input) {
  const candidates = input.models;
  let lastFailure = "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958";
  for (let index = 0; index < candidates.length; index++) {
    const candidate = candidates[index];
    try {
      const request = {
        imagePath: input.imagePath,
        prompt: input.prompt,
        model: candidate,
        maxTokens: input.modelMaxTokens[candidate] ?? 8192,
        baseUrl: input.baseUrl,
        keyPool: input.keyPool,
        timeoutMs: input.timeoutMs,
        maxRetries: input.maxRetriesPerPage,
        logger: input.logger,
        stage: "ocr"
      };
      const markdown = await ocrImageViaNim(request);
      return markdown;
    } catch (failure) {
      // Remember why this model failed and move on to the next one.
      lastFailure = failure instanceof Error ? failure.message : String(failure);
    }
  }
  throw new UnifiedOcrError("OCR_FAILED", "ocr", `\uBAA8\uB4E0 OCR \uBAA8\uB378 \uC2E4\uD328: ${lastFailure}`);
}
|
|
11732
|
+
/**
 * Concatenates per-page Markdown files into a single document.
 *
 * Files are read one after another in the given order. Each page is trimmed,
 * pages that are empty after trimming are dropped, and the remaining pages are
 * separated by one blank line.
 *
 * @param {string[]} paths - Page files in document order.
 * @returns {Promise<string>} The merged Markdown.
 */
async function mergeMarkdownPages(paths) {
  const pages = [];
  for (const pagePath of paths) {
    const raw = await readFile(pagePath, "utf-8");
    const trimmed = raw.trim();
    if (trimmed) {
      pages.push(trimmed);
    }
  }
  return pages.join("\n\n");
}
|
|
11741
|
+
/**
 * Sends one chat-completion style request (text only, or prompt plus PNG
 * image) to the configured endpoint and returns the text of the first choice,
 * with a wrapping Markdown code fence removed.
 *
 * API keys come from `keyPool`; every attempt acquires a key and reports the
 * outcome back to the pool. Up to `maxRetries` attempts are made. Note that
 * waiting for a cooled-down key pool also consumes an attempt.
 *
 * The request payload, including the base64-encoded image, is built once up
 * front. A failure to read the image is a local problem, so it is reported
 * immediately and is never counted against an API key. An HTTP success whose
 * content is empty is retried without marking the key as failed, because the
 * key itself worked.
 *
 * @param {object} input
 * @param {string} input.model - Model name sent in the request body.
 * @param {number} input.maxTokens - Sent as `max_tokens`.
 * @param {string} input.baseUrl - URL the request is POSTed to.
 * @param {{acquire: Function, markSuccess: Function, markFailure: Function}} input.keyPool - API key pool.
 * @param {number} input.timeoutMs - Per-attempt timeout; the request is aborted when it elapses.
 * @param {number} input.maxRetries - Maximum number of attempts.
 * @param {{log: Function}} [input.logger] - Optional structured logger.
 * @param {string} [input.stage="ocr"] - Stage name used in log entries.
 * @param {string} [input.textOnlyPrompt] - When set, a text-only request is sent and the image is ignored.
 * @param {string} [input.prompt] - Prompt accompanying the image; defaults to OCR_PROMPT2.
 * @param {string} [input.imagePath] - PNG file to send when `textOnlyPrompt` is not set.
 * @returns {Promise<string>} The recognised text.
 * @throws {UnifiedOcrError} Code "OCR_FAILED" when the image cannot be read or
 *   when all attempts are used up.
 * @throws Any non-cooldown error raised by `keyPool.acquire()` is rethrown as is.
 */
async function ocrImageViaNim(input) {
  const { model, maxTokens, baseUrl, keyPool, timeoutMs, maxRetries, logger, stage = "ocr" } = input;
  let content;
  if (input.textOnlyPrompt) {
    content = [{ type: "text", text: input.textOnlyPrompt }];
  } else {
    let encodedImage;
    try {
      encodedImage = await encodeBase64(input.imagePath);
    } catch (err) {
      const reason = err instanceof Error ? err.message : String(err);
      logger?.log({
        level: "error",
        stage,
        event: "error",
        message: "\uC774\uBBF8\uC9C0 \uC77D\uAE30 \uC2E4\uD328",
        meta: { model },
        error: { code: "OCR_FAILED", message: reason, name: err instanceof Error ? err.name : "Error" }
      });
      throw new UnifiedOcrError("OCR_FAILED", "ocr", `\uC774\uBBF8\uC9C0 \uC77D\uAE30 \uC2E4\uD328: ${reason}`);
    }
    content = [
      { type: "text", text: input.prompt ?? OCR_PROMPT2 },
      {
        type: "image_url",
        image_url: { url: `data:image/png;base64,${encodedImage}` }
      }
    ];
  }
  // The payload is identical for every attempt, so serialise it only once.
  const requestBody = JSON.stringify({
    model,
    messages: [{ role: "user", content }],
    max_tokens: maxTokens,
    temperature: 0
  });
  let attempt = 0;
  let lastErr = "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958";
  while (attempt < maxRetries) {
    attempt++;
    let acquired = null;
    try {
      acquired = keyPool.acquire();
    } catch (err) {
      if (err instanceof AllKeysCoolingDownError) {
        logger?.log({
          level: "warn",
          stage,
          event: "progress",
          message: "\uBAA8\uB4E0 API \uD0A4 cooldown \uC0C1\uD0DC\uB85C \uB300\uAE30",
          meta: { waitMs: err.waitMs, attempt, maxRetries, model }
        });
        await delay(err.waitMs);
        continue;
      }
      throw err;
    }
    try {
      logger?.log({
        level: "debug",
        stage,
        event: "progress",
        message: "NIM \uC694\uCCAD \uC2DC\uB3C4",
        meta: { attempt, maxRetries, model, keyId: acquired.keyId, hasImage: Boolean(input.imagePath) }
      });
      const controller = new AbortController();
      const timer = setTimeout(() => controller.abort(), timeoutMs);
      let resp;
      let payload = null;
      try {
        resp = await fetch(baseUrl, {
          method: "POST",
          headers: {
            Authorization: `Bearer ${acquired.key}`,
            "Content-Type": "application/json"
          },
          body: requestBody,
          signal: controller.signal
        });
        // Read the body while the timeout is still armed.
        if (resp.ok) payload = await resp.json();
      } finally {
        clearTimeout(timer);
      }
      if (resp.ok) {
        const text = payload.choices?.[0]?.message?.content?.trim() ?? "";
        keyPool.markSuccess(acquired.keyId);
        if (text) {
          logger?.log({
            level: "debug",
            stage,
            event: "done",
            message: "NIM \uC751\uB2F5 \uC131\uACF5",
            meta: { attempt, model, keyId: acquired.keyId }
          });
          return stripCodeFence3(text);
        }
        // HTTP success but no content: retry, without penalising the key.
        lastErr = "OCR \uC751\uB2F5\uC774 \uBE44\uC5B4 \uC788\uC2B5\uB2C8\uB2E4.";
        logger?.log({
          level: "warn",
          stage,
          event: "progress",
          message: "NIM \uC751\uB2F5 \uC2E4\uD328",
          meta: { attempt, model, status: resp.status, keyId: acquired.keyId, emptyContent: true }
        });
        await delay(500);
        continue;
      }
      // A non-numeric Retry-After (e.g. an HTTP date) becomes NaN and is ignored.
      const retryAfter = Number(resp.headers.get("retry-after") || "0");
      const retryAfterMs = Number.isFinite(retryAfter) && retryAfter > 0 ? retryAfter * 1e3 : void 0;
      keyPool.markFailure(acquired.keyId, { status: resp.status, retryAfterMs });
      lastErr = `NIM \uC751\uB2F5 \uC624\uB958: ${resp.status}`;
      logger?.log({
        level: "warn",
        stage,
        event: "progress",
        message: "NIM \uC751\uB2F5 \uC2E4\uD328",
        meta: { attempt, model, status: resp.status, retryAfterMs, keyId: acquired.keyId }
      });
    } catch (err) {
      const isTimeout = err instanceof Error && err.name === "AbortError";
      if (acquired) keyPool.markFailure(acquired.keyId, { timeout: isTimeout });
      lastErr = err instanceof Error ? err.message : String(err);
      logger?.log({
        level: "warn",
        stage,
        event: "progress",
        message: "NIM \uC694\uCCAD \uC608\uC678",
        meta: { attempt, model, timeout: isTimeout, keyId: acquired?.keyId },
        error: { message: lastErr, name: err instanceof Error ? err.name : "Error" }
      });
      await delay(500);
    }
  }
  logger?.log({
    level: "error",
    stage,
    event: "error",
    message: "NIM \uCD5C\uB300 \uC7AC\uC2DC\uB3C4 \uCD08\uACFC",
    meta: { model, maxRetries },
    error: { code: "OCR_FAILED", message: lastErr }
  });
  throw new UnifiedOcrError("OCR_FAILED", "ocr", `OCR \uC7AC\uC2DC\uB3C4 \uCD08\uACFC: ${lastErr}`);
}
|
|
11850
|
+
/**
 * Reads a file and returns its contents encoded as base64.
 *
 * @param {string} path - File to read.
 * @returns {Promise<string>} Base64 text of the file's bytes.
 */
async function encodeBase64(path) {
  // Let readFile perform the encoding instead of converting a Buffer afterwards.
  const encoded = await readFile(path, "base64");
  return encoded;
}
|
|
11854
|
+
/**
 * Removes a Markdown code fence that wraps the entire text.
 *
 * The fence must open at the very start of the text (optionally tagged
 * `markdown` or `md`, case-insensitive) and close at the very end, where only
 * trailing whitespace may follow. Anything else is returned untouched.
 *
 * @param {string} text - Text that may be fenced.
 * @returns {string} The trimmed fence contents, or `text` itself when no wrapping fence is found.
 */
function stripCodeFence3(text) {
  const fenced = /^```(?:markdown|md)?\s*([\s\S]*?)```\s*$/i.exec(text);
  if (fenced === null) {
    return text;
  }
  return fenced[1].trim();
}
|
|
11858
|
+
/**
 * Waits for the given number of milliseconds.
 *
 * @param {number} ms - Time to wait; zero or a negative value returns without waiting.
 * @returns {Promise<void>}
 */
async function delay(ms) {
  const skipWaiting = ms <= 0;
  if (!skipWaiting) {
    await new Promise((wake) => {
      setTimeout(wake, ms);
    });
  }
}
|
|
11862
|
+
/**
 * Rejects input files whose extension the pipeline does not handle.
 *
 * Accepted extensions, compared case-insensitively: .pdf, .hwp, .hwpx, .docx, .xlsx.
 *
 * @param {string} path - Path of the input file.
 * @returns {void}
 * @throws {UnifiedOcrError} Code "UNSUPPORTED_INPUT" for any other extension.
 */
function ensureSupportedInput(path) {
  const supported = [".pdf", ".hwp", ".hwpx", ".docx", ".xlsx"];
  const ext = extname(path).toLowerCase();
  if (supported.includes(ext)) {
    return;
  }
  throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uC785\uB825 \uD3EC\uB9F7: ${ext}`);
}
|
|
11869
|
+
/**
 * Collects every number-like token of a text, in order of appearance.
 *
 * A token starts with a digit and continues over digits and the characters
 * `,` `.` `/` `-`, so amounts, dates and fractions each come out as one token.
 *
 * @param {string} text - Text to scan.
 * @returns {string[]} The matched tokens; empty when there is none.
 */
function extractNumericTokens(text) {
  const tokens = [];
  for (const found of text.matchAll(/\d[\d,./-]*/g)) {
    tokens.push(found[0]);
  }
  return tokens;
}
|
|
11872
|
+
/**
 * Accepts a proofread text only if it leaves every numeric token untouched.
 *
 * The numeric tokens of both texts are compared position by position. Any
 * difference in count or content makes the correction untrustworthy, in which
 * case the raw text is kept.
 *
 * @param {string} rawText - Text before proofreading.
 * @param {string} correctedText - Text after proofreading.
 * @returns {string} `correctedText` when all numeric tokens match, otherwise `rawText`.
 */
function preserveNumericIntegrity(rawText, correctedText) {
  const before = extractNumericTokens(rawText);
  const after = extractNumericTokens(correctedText);
  const numbersIntact = before.length === after.length
    && before.every((token, index) => token === after[index]);
  return numbersIntact ? correctedText : rawText;
}
|
|
11881
|
+
/**
 * Appends a review notice when proofreading appears to have dropped content.
 *
 * The corrected text is considered suspicious when any of these holds:
 * its trimmed length is below 75% of the raw one; its count of non-empty lines
 * is below 80% of the raw count (rounded down, but at least 1); or the raw text
 * has two or more lines containing "|" and fewer than 70% of that number
 * (rounded down) remain.
 *
 * Nothing is appended when the text already carries the notice marker or when
 * either text is empty after trimming.
 *
 * @param {string} rawText - Text before proofreading.
 * @param {string} correctedText - Text after proofreading.
 * @returns {string} `correctedText`, followed by the notice when suspicious.
 */
function addUncertainTag(rawText, correctedText) {
  const alreadyTagged = correctedText.includes("[\uD655\uC778\uD544\uC694:");
  if (alreadyTagged) {
    return correctedText;
  }
  const rawLen = rawText.trim().length;
  const corrLen = correctedText.trim().length;
  if (rawLen === 0 || corrLen === 0) {
    return correctedText;
  }
  const countLines = (text, keep) => text.split("\n").filter(keep).length;
  const isNonEmpty = (line) => Boolean(line);
  const isTableRow = (line) => line.includes("|");
  const rawLines = countLines(rawText, isNonEmpty);
  const corrLines = countLines(correctedText, isNonEmpty);
  const rawTableLines = countLines(rawText, isTableRow);
  const corrTableLines = countLines(correctedText, isTableRow);
  const lostLength = corrLen < rawLen * 0.75;
  const lostLines = corrLines < Math.max(1, Math.floor(rawLines * 0.8));
  const lostTableRows = rawTableLines >= 2 && corrTableLines < Math.floor(rawTableLines * 0.7);
  if (!(lostLength || lostLines || lostTableRows)) {
    return correctedText;
  }
  return `${correctedText}

[\uD655\uC778\uD544\uC694: \uAD50\uC815 \uACB0\uACFC\uAC00 \uCD95\uC57D\uB418\uC5C8\uC744 \uC218 \uC788\uC5B4 \uC6D0\uBB38\uACFC \uB300\uC870\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4.]`;
}
|
|
11896
|
+
/**
 * Produces a minimal before/after comparison of two texts.
 *
 * @param {string} before - Original text.
 * @param {string} after - Revised text.
 * @returns {{changed: boolean, beforeLength: number, afterLength: number}}
 *   Whether the texts differ, and the `length` of each.
 */
function buildDiffSummary(before, after) {
  const identical = before === after;
  const summary = {
    changed: !identical,
    beforeLength: before.length,
    afterLength: after.length
  };
  return summary;
}
|
|
11903
|
+
/**
 * Turns any thrown value into a UnifiedOcrError for the given stage.
 *
 * A UnifiedOcrError is returned unchanged. Anything else is wrapped in a new
 * UnifiedOcrError whose code is derived from the stage ("UNKNOWN" for stages
 * without a dedicated code). The original value is kept on the wrapper as
 * `cause`, so the root failure and its stack stay reachable after wrapping.
 *
 * @param {unknown} err - The value that was thrown.
 * @param {string} stage - Stage that was running when it was thrown.
 * @returns {UnifiedOcrError} `err` itself, or a wrapper around it.
 */
function normalizePipelineError(err, stage) {
  if (err instanceof UnifiedOcrError) return err;
  const message = err instanceof Error ? err.message : String(err);
  const codeByStage = {
    convert: "CONVERT_FAILED",
    render: "RENDER_FAILED",
    probe: "PROBE_FAILED",
    ocr: "OCR_FAILED",
    proofread: "PROOFREAD_FAILED",
    merge: "MERGE_FAILED"
  };
  const normalized = new UnifiedOcrError(codeByStage[stage] ?? "UNKNOWN", stage, message);
  if (normalized.cause === undefined) {
    // Same attributes as the native Error `cause`: present but not enumerable,
    // so serialising the error's own enumerable fields is unaffected.
    Object.defineProperty(normalized, "cause", {
      value: err,
      writable: true,
      configurable: true,
      enumerable: false
    });
  }
  return normalized;
}
|
|
11916
|
+
|
|
10604
11917
|
// src/index.ts
|
|
10605
11918
|
async function parse2(input, options) {
|
|
11919
|
+
const logger = createLoggerFromEnv().withRun(generateRunId("parse")).child({ component: "index.ts", stage: "detect" });
|
|
11920
|
+
logger.log({ level: "info", event: "start", message: "parse \uD638\uCD9C \uC2DC\uC791" });
|
|
10606
11921
|
let buffer;
|
|
10607
11922
|
if (typeof input === "string") {
|
|
10608
11923
|
try {
|
|
10609
|
-
const buf = await
|
|
11924
|
+
const buf = await readFile2(input);
|
|
10610
11925
|
buffer = toArrayBuffer(buf);
|
|
10611
11926
|
} catch (err) {
|
|
10612
11927
|
const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
|
|
11928
|
+
logger.log({
|
|
11929
|
+
level: "error",
|
|
11930
|
+
stage: "detect",
|
|
11931
|
+
event: "error",
|
|
11932
|
+
message: msg,
|
|
11933
|
+
error: { code: "PARSE_ERROR", message: msg, name: err instanceof Error ? err.name : "Error" }
|
|
11934
|
+
});
|
|
10613
11935
|
return { success: false, fileType: "unknown", error: msg, code: "PARSE_ERROR" };
|
|
10614
11936
|
}
|
|
10615
11937
|
} else if (Buffer.isBuffer(input)) {
|
|
@@ -10618,13 +11940,23 @@ async function parse2(input, options) {
|
|
|
10618
11940
|
buffer = input;
|
|
10619
11941
|
}
|
|
10620
11942
|
if (!buffer || buffer.byteLength === 0) {
|
|
11943
|
+
logger.log({ level: "error", stage: "detect", event: "error", message: "\uBE48 \uC785\uB825 \uBC84\uD37C", error: { code: "EMPTY_INPUT", message: "\uBE48 \uC785\uB825 \uBC84\uD37C", name: "KordocError" } });
|
|
10621
11944
|
return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
|
|
10622
11945
|
}
|
|
10623
11946
|
const MAX_FILE_SIZE = 500 * 1024 * 1024;
|
|
10624
11947
|
if (buffer.byteLength > MAX_FILE_SIZE) {
|
|
11948
|
+
logger.log({
|
|
11949
|
+
level: "error",
|
|
11950
|
+
stage: "detect",
|
|
11951
|
+
event: "error",
|
|
11952
|
+
message: "\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC",
|
|
11953
|
+
meta: { size: buffer.byteLength },
|
|
11954
|
+
error: { code: "FILE_TOO_LARGE", message: "\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC", name: "KordocError" }
|
|
11955
|
+
});
|
|
10625
11956
|
return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
|
|
10626
11957
|
}
|
|
10627
11958
|
const format = detectFormat(buffer);
|
|
11959
|
+
logger.log({ level: "info", event: "done", message: "\uD3EC\uB9F7 \uAC10\uC9C0 \uC644\uB8CC", meta: { format } });
|
|
10628
11960
|
switch (format) {
|
|
10629
11961
|
case "hwpx": {
|
|
10630
11962
|
const { format: zipFormat, zip } = await detectZipFormat(buffer);
|
|
@@ -10702,7 +12034,8 @@ async function parseHwpx(buffer, options, zip) {
|
|
|
10702
12034
|
const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
|
|
10703
12035
|
return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
10704
12036
|
} catch (err) {
|
|
10705
|
-
|
|
12037
|
+
const normalized = normalizeKordocError(err, "HWPX \uD30C\uC2F1 \uC2E4\uD328", "finalize");
|
|
12038
|
+
return { success: false, fileType: "hwpx", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
|
|
10706
12039
|
}
|
|
10707
12040
|
}
|
|
10708
12041
|
async function parseHwp(buffer, options) {
|
|
@@ -10710,7 +12043,8 @@ async function parseHwp(buffer, options) {
|
|
|
10710
12043
|
const { markdown, blocks, metadata, outline, warnings, images } = parseHwp5Document(Buffer.from(buffer), options);
|
|
10711
12044
|
return { success: true, fileType: "hwp", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
10712
12045
|
} catch (err) {
|
|
10713
|
-
|
|
12046
|
+
const normalized = normalizeKordocError(err, "HWP \uD30C\uC2F1 \uC2E4\uD328", "finalize");
|
|
12047
|
+
return { success: false, fileType: "hwp", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
|
|
10714
12048
|
}
|
|
10715
12049
|
}
|
|
10716
12050
|
async function parsePdf(buffer, options) {
|
|
@@ -10718,8 +12052,15 @@ async function parsePdf(buffer, options) {
|
|
|
10718
12052
|
const { markdown, blocks, metadata, outline, warnings, isImageBased } = await parsePdfDocument(buffer, options);
|
|
10719
12053
|
return { success: true, fileType: "pdf", markdown, blocks, metadata, outline, warnings, isImageBased };
|
|
10720
12054
|
} catch (err) {
|
|
12055
|
+
const normalized = normalizeKordocError(err, "PDF \uD30C\uC2F1 \uC2E4\uD328", "finalize");
|
|
10721
12056
|
const isImageBased = err instanceof Error && "isImageBased" in err ? true : void 0;
|
|
10722
|
-
return {
|
|
12057
|
+
return {
|
|
12058
|
+
success: false,
|
|
12059
|
+
fileType: "pdf",
|
|
12060
|
+
error: normalized.message,
|
|
12061
|
+
code: normalized.code ?? classifyError(normalized),
|
|
12062
|
+
isImageBased
|
|
12063
|
+
};
|
|
10723
12064
|
}
|
|
10724
12065
|
}
|
|
10725
12066
|
async function parseXlsx(buffer, options, zip) {
|
|
@@ -10727,7 +12068,8 @@ async function parseXlsx(buffer, options, zip) {
|
|
|
10727
12068
|
const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
|
|
10728
12069
|
return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
|
|
10729
12070
|
} catch (err) {
|
|
10730
|
-
|
|
12071
|
+
const normalized = normalizeKordocError(err, "XLSX \uD30C\uC2F1 \uC2E4\uD328", "finalize");
|
|
12072
|
+
return { success: false, fileType: "xlsx", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
|
|
10731
12073
|
}
|
|
10732
12074
|
}
|
|
10733
12075
|
async function parseDocx(buffer, options, zip) {
|
|
@@ -10735,10 +12077,13 @@ async function parseDocx(buffer, options, zip) {
|
|
|
10735
12077
|
const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
|
|
10736
12078
|
return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
|
|
10737
12079
|
} catch (err) {
|
|
10738
|
-
|
|
12080
|
+
const normalized = normalizeKordocError(err, "DOCX \uD30C\uC2F1 \uC2E4\uD328", "finalize");
|
|
12081
|
+
return { success: false, fileType: "docx", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
|
|
10739
12082
|
}
|
|
10740
12083
|
}
|
|
10741
12084
|
export {
|
|
12085
|
+
AllKeysCoolingDownError,
|
|
12086
|
+
ApiKeyRotationPool,
|
|
10742
12087
|
VERSION,
|
|
10743
12088
|
blocksToMarkdown,
|
|
10744
12089
|
compare,
|
|
@@ -10757,7 +12102,8 @@ export {
|
|
|
10757
12102
|
parseHwp,
|
|
10758
12103
|
parseHwpx,
|
|
10759
12104
|
parsePdf,
|
|
10760
|
-
parseXlsx
|
|
12105
|
+
parseXlsx,
|
|
12106
|
+
runUnifiedOcrPipeline
|
|
10761
12107
|
};
|
|
10762
12108
|
/*! Bundled license information:
|
|
10763
12109
|
|