@clazic/kordoc 2.4.11 → 2.4.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.cjs CHANGED
@@ -71,6 +71,224 @@ var init_page_range = __esm({
71
71
  }
72
72
  });
73
73
 
74
+ // src/logging/logger.ts
75
+ function createLoggerFromEnv(env = process.env) {
76
+ const level = parseLevel(env.KORDOC_LOG_LEVEL);
77
+ const includeStack = env.KORDOC_LOG_STACK === "1";
78
+ const filePath = env.KORDOC_LOG_FILE ? (0, import_path.resolve)(env.KORDOC_LOG_FILE) : "";
79
+ const config = {
80
+ level,
81
+ includeStack,
82
+ progressSampleMs: parsePositiveInt(env.KORDOC_LOG_PROGRESS_SAMPLE_MS, 1e3),
83
+ basenamePaths: env.KORDOC_LOG_BASENAME_PATHS === "1",
84
+ textLimit: parsePositiveInt(env.KORDOC_LOG_TEXT_LIMIT, 400)
85
+ };
86
+ const consoleSink = new ConsoleLogger(config);
87
+ const sinks = [consoleSink];
88
+ if (filePath) sinks.push(new JsonlLogger(config, filePath));
89
+ return new CompositeLogger(config, sinks);
90
+ }
91
+ function generateRunId(prefix = "run") {
92
+ return `${prefix}_${(0, import_crypto.randomUUID)().slice(0, 8)}`;
93
+ }
94
+ function parseLevel(input) {
95
+ const v = (input || "").toLowerCase();
96
+ if (v === "error" || v === "warn" || v === "info" || v === "debug" || v === "trace") return v;
97
+ return "error";
98
+ }
99
+ function maskSecrets(input) {
100
+ return input.replace(/nvapi-[A-Za-z0-9_\-]+/g, "nvapi-***").replace(/Bearer\s+[A-Za-z0-9_\-\.]+/gi, "Bearer ***");
101
+ }
102
+ function sanitizeMeta(meta, cfg) {
103
+ const out = {};
104
+ for (const [k, v] of Object.entries(meta)) {
105
+ if (/authorization|api[_-]?key|token/i.test(k)) {
106
+ out[k] = "***";
107
+ continue;
108
+ }
109
+ if (typeof v === "string") {
110
+ let next = maskSecrets(v);
111
+ if (cfg.basenamePaths && /path|file|dir/i.test(k)) {
112
+ next = (0, import_path.basename)(next);
113
+ }
114
+ out[k] = limitText(next, cfg.textLimit);
115
+ } else {
116
+ out[k] = v;
117
+ }
118
+ }
119
+ return out;
120
+ }
121
+ function parsePositiveInt(input, fallback) {
122
+ const n = Number(input);
123
+ if (!Number.isFinite(n) || n < 0) return fallback;
124
+ return Math.floor(n);
125
+ }
126
+ function limitText(input, maxLen) {
127
+ if (maxLen <= 0) return input;
128
+ if (input.length <= maxLen) return input;
129
+ return `${input.slice(0, maxLen)}...(+${input.length - maxLen})`;
130
+ }
131
+ var import_fs, import_promises, import_path, import_crypto, LEVEL_ORDER, BaseLogger, ConsoleLogger, JsonlLogger, CompositeLogger;
132
+ var init_logger = __esm({
133
+ "src/logging/logger.ts"() {
134
+ "use strict";
135
+ import_fs = require("fs");
136
+ import_promises = require("fs/promises");
137
+ import_path = require("path");
138
+ import_crypto = require("crypto");
139
+ LEVEL_ORDER = {
140
+ error: 0,
141
+ warn: 1,
142
+ info: 2,
143
+ debug: 3,
144
+ trace: 4
145
+ };
146
+ BaseLogger = class _BaseLogger {
147
+ constructor(config, context = {}) {
148
+ this.config = config;
149
+ this.context = context;
150
+ }
151
+ static progressSeenAt = /* @__PURE__ */ new Map();
152
+ shouldLog(level) {
153
+ return LEVEL_ORDER[level] <= LEVEL_ORDER[this.config.level];
154
+ }
155
+ shouldEmitProgress(ev) {
156
+ if (this.config.progressSampleMs <= 0) return true;
157
+ if ((ev.event ?? "message") !== "progress") return true;
158
+ if (ev.level === "error" || ev.level === "warn") return true;
159
+ const key = [
160
+ this.context.runId ?? ev.runId ?? "no-run",
161
+ this.context.component ?? ev.component ?? "no-component",
162
+ this.context.stage ?? ev.stage ?? "unknown",
163
+ ev.message
164
+ ].join("|");
165
+ const now = Date.now();
166
+ const prev = _BaseLogger.progressSeenAt.get(key) ?? 0;
167
+ if (now - prev < this.config.progressSampleMs) return false;
168
+ _BaseLogger.progressSeenAt.set(key, now);
169
+ return true;
170
+ }
171
+ merge(ev) {
172
+ const out = {
173
+ ...this.context,
174
+ ...ev,
175
+ ts: (/* @__PURE__ */ new Date()).toISOString(),
176
+ level: ev.level,
177
+ stage: ev.stage ?? this.context.stage ?? "unknown",
178
+ event: ev.event ?? "message",
179
+ message: ev.message
180
+ };
181
+ if (!this.config.includeStack && out.error?.stack) {
182
+ out.error = { ...out.error, stack: void 0 };
183
+ }
184
+ if (out.meta) out.meta = sanitizeMeta(out.meta, this.config);
185
+ if (out.error?.message) out.error.message = maskSecrets(out.error.message);
186
+ if (out.message) out.message = limitText(maskSecrets(out.message), this.config.textLimit);
187
+ return out;
188
+ }
189
+ child(context) {
190
+ return new _BaseLogger(this.config, { ...this.context, ...context });
191
+ }
192
+ withRun(runId) {
193
+ return this.child({ runId });
194
+ }
195
+ // eslint-disable-next-line @typescript-eslint/no-unused-vars
196
+ log(event) {
197
+ }
198
+ };
199
+ ConsoleLogger = class extends BaseLogger {
200
+ log(event) {
201
+ if (!this.shouldLog(event.level)) return;
202
+ if (!this.shouldEmitProgress(event)) return;
203
+ const e = this.merge(event);
204
+ const prefix = `[${e.ts}] [${e.level.toUpperCase()}]${e.runId ? ` [${e.runId}]` : ""}${e.stage ? ` [${e.stage}]` : ""}`;
205
+ const line = `${prefix} ${e.message}${e.component ? ` (${e.component})` : ""}`;
206
+ if (e.level === "error") {
207
+ process.stderr.write(line + "\n");
208
+ if (e.error?.stack) process.stderr.write(e.error.stack + "\n");
209
+ } else {
210
+ process.stdout.write(line + "\n");
211
+ }
212
+ }
213
+ };
214
+ JsonlLogger = class _JsonlLogger extends BaseLogger {
215
+ constructor(config, filePath, context = {}) {
216
+ super(config, context);
217
+ this.filePath = filePath;
218
+ (0, import_fs.mkdirSync)((0, import_path.dirname)(filePath), { recursive: true });
219
+ _JsonlLogger.ensureState(filePath);
220
+ }
221
+ static states = /* @__PURE__ */ new Map();
222
+ static ensureState(path) {
223
+ let state = _JsonlLogger.states.get(path);
224
+ if (!state) {
225
+ state = { queue: [], flushing: false };
226
+ _JsonlLogger.states.set(path, state);
227
+ const flushSync = () => {
228
+ const s = _JsonlLogger.states.get(path);
229
+ if (!s || s.queue.length === 0) return;
230
+ const payload = s.queue.join("");
231
+ s.queue = [];
232
+ if (!payload) return;
233
+ (0, import_fs.appendFileSync)(path, payload, "utf-8");
234
+ };
235
+ process.on("beforeExit", flushSync);
236
+ process.on("exit", flushSync);
237
+ }
238
+ return state;
239
+ }
240
+ scheduleFlush(path) {
241
+ const state = _JsonlLogger.ensureState(path);
242
+ if (state.timer || state.flushing) return;
243
+ state.timer = setTimeout(() => {
244
+ state.timer = void 0;
245
+ void this.flush(path);
246
+ }, 200);
247
+ }
248
+ async flush(path) {
249
+ const state = _JsonlLogger.ensureState(path);
250
+ if (state.flushing) return;
251
+ if (state.queue.length === 0) return;
252
+ state.flushing = true;
253
+ const payload = state.queue.join("");
254
+ state.queue = [];
255
+ try {
256
+ await (0, import_promises.appendFile)(path, payload, "utf-8");
257
+ } finally {
258
+ state.flushing = false;
259
+ if (state.queue.length > 0) this.scheduleFlush(path);
260
+ }
261
+ }
262
+ log(event) {
263
+ if (!this.shouldLog(event.level)) return;
264
+ if (!this.shouldEmitProgress(event)) return;
265
+ const e = this.merge(event);
266
+ const state = _JsonlLogger.ensureState(this.filePath);
267
+ state.queue.push(JSON.stringify(e) + "\n");
268
+ this.scheduleFlush(this.filePath);
269
+ }
270
+ child(context) {
271
+ return new _JsonlLogger(this.config, this.filePath, { ...this.context, ...context });
272
+ }
273
+ };
274
+ CompositeLogger = class _CompositeLogger extends BaseLogger {
275
+ constructor(config, sinks, context = {}) {
276
+ super(config, context);
277
+ this.sinks = sinks;
278
+ }
279
+ log(event) {
280
+ if (!this.shouldLog(event.level)) return;
281
+ if (!this.shouldEmitProgress(event)) return;
282
+ for (const sink of this.sinks) sink.log(event);
283
+ }
284
+ child(context) {
285
+ const nextSinks = this.sinks.map((s) => s.child(context));
286
+ return new _CompositeLogger(this.config, nextSinks, { ...this.context, ...context });
287
+ }
288
+ };
289
+ }
290
+ });
291
+
74
292
  // node_modules/cfb/cfb.js
75
293
  var require_cfb = __commonJS({
76
294
  "node_modules/cfb/cfb.js"(exports2, module2) {
@@ -390,8 +608,8 @@ var require_cfb = __commonJS({
390
608
  }
391
609
  return L.length - R.length;
392
610
  }
393
- function dirname(p) {
394
- if (p.charAt(p.length - 1) == "/") return p.slice(0, -1).indexOf("/") === -1 ? p : dirname(p.slice(0, -1));
611
+ function dirname4(p) {
612
+ if (p.charAt(p.length - 1) == "/") return p.slice(0, -1).indexOf("/") === -1 ? p : dirname4(p.slice(0, -1));
395
613
  var c = p.lastIndexOf("/");
396
614
  return c === -1 ? p : p.slice(0, c + 1);
397
615
  }
@@ -812,10 +1030,10 @@ var require_cfb = __commonJS({
812
1030
  data.push([cfb.FullPaths[i2], cfb.FileIndex[i2]]);
813
1031
  }
814
1032
  for (i2 = 0; i2 < data.length; ++i2) {
815
- var dad = dirname(data[i2][0]);
1033
+ var dad = dirname4(data[i2][0]);
816
1034
  s = fullPaths[dad];
817
1035
  while (!s) {
818
- while (dirname(dad) && !fullPaths[dirname(dad)]) dad = dirname(dad);
1036
+ while (dirname4(dad) && !fullPaths[dirname4(dad)]) dad = dirname4(dad);
819
1037
  data.push([dad, {
820
1038
  name: filename(dad).replace("/", ""),
821
1039
  type: 1,
@@ -825,7 +1043,7 @@ var require_cfb = __commonJS({
825
1043
  content: null
826
1044
  }]);
827
1045
  fullPaths[dad] = true;
828
- dad = dirname(data[i2][0]);
1046
+ dad = dirname4(data[i2][0]);
829
1047
  s = fullPaths[dad];
830
1048
  }
831
1049
  }
@@ -851,13 +1069,13 @@ var require_cfb = __commonJS({
851
1069
  elt.size = 0;
852
1070
  elt.type = 5;
853
1071
  } else if (nm.slice(-1) == "/") {
854
- for (j = i2 + 1; j < data.length; ++j) if (dirname(cfb.FullPaths[j]) == nm) break;
1072
+ for (j = i2 + 1; j < data.length; ++j) if (dirname4(cfb.FullPaths[j]) == nm) break;
855
1073
  elt.C = j >= data.length ? -1 : j;
856
- for (j = i2 + 1; j < data.length; ++j) if (dirname(cfb.FullPaths[j]) == dirname(nm)) break;
1074
+ for (j = i2 + 1; j < data.length; ++j) if (dirname4(cfb.FullPaths[j]) == dirname4(nm)) break;
857
1075
  elt.R = j >= data.length ? -1 : j;
858
1076
  elt.type = 1;
859
1077
  } else {
860
- if (dirname(cfb.FullPaths[i2 + 1] || "") == dirname(nm)) elt.R = i2 + 1;
1078
+ if (dirname4(cfb.FullPaths[i2 + 1] || "") == dirname4(nm)) elt.R = i2 + 1;
861
1079
  elt.type = 2;
862
1080
  }
863
1081
  }
@@ -2026,16 +2244,16 @@ var init_auto_detect = __esm({
2026
2244
  // src/ocr/cli-provider.ts
2027
2245
  function getTempDir() {
2028
2246
  if (!_tempDir) {
2029
- _tempDir = (0, import_path.join)(process.cwd(), ".kordoc_ocr_tmp");
2030
- (0, import_fs.mkdirSync)(_tempDir, { recursive: true });
2247
+ _tempDir = (0, import_path2.join)(process.cwd(), ".kordoc_ocr_tmp");
2248
+ (0, import_fs2.mkdirSync)(_tempDir, { recursive: true });
2031
2249
  }
2032
2250
  return _tempDir;
2033
2251
  }
2034
2252
  function createCliOcrProvider(mode) {
2035
2253
  return async (pageImage, pageNumber) => {
2036
- const tempPath = (0, import_path.join)(getTempDir(), `page-${pageNumber}.png`);
2254
+ const tempPath = (0, import_path2.join)(getTempDir(), `page-${pageNumber}.png`);
2037
2255
  try {
2038
- (0, import_fs.writeFileSync)(tempPath, pageImage);
2256
+ (0, import_fs2.writeFileSync)(tempPath, pageImage);
2039
2257
  let output;
2040
2258
  if (mode === "ollama") {
2041
2259
  output = await callOllamaApi(tempPath);
@@ -2045,7 +2263,7 @@ function createCliOcrProvider(mode) {
2045
2263
  return { markdown: stripCodeFence(output.trim()) };
2046
2264
  } finally {
2047
2265
  try {
2048
- (0, import_fs.unlinkSync)(tempPath);
2266
+ (0, import_fs2.unlinkSync)(tempPath);
2049
2267
  } catch {
2050
2268
  }
2051
2269
  }
@@ -2082,7 +2300,7 @@ function callCli(mode, imagePath) {
2082
2300
  return output;
2083
2301
  }
2084
2302
  function callCodexCli(imagePath) {
2085
- const outPath = (0, import_path.join)((0, import_os.tmpdir)(), `kordoc-codex-out-${Date.now()}.txt`);
2303
+ const outPath = (0, import_path2.join)((0, import_os.tmpdir)(), `kordoc-codex-out-${Date.now()}.txt`);
2086
2304
  try {
2087
2305
  const args = ["exec", OCR_PROMPT, "--image", imagePath, "--output-last-message", outPath];
2088
2306
  const model = process.env.KORDOC_CODEX_MODEL;
@@ -2104,7 +2322,7 @@ function callCodexCli(imagePath) {
2104
2322
  }
2105
2323
  let text;
2106
2324
  try {
2107
- text = (0, import_fs.readFileSync)(outPath, "utf-8");
2325
+ text = (0, import_fs2.readFileSync)(outPath, "utf-8");
2108
2326
  } catch {
2109
2327
  text = result.stdout || "";
2110
2328
  }
@@ -2112,7 +2330,7 @@ function callCodexCli(imagePath) {
2112
2330
  return text;
2113
2331
  } finally {
2114
2332
  try {
2115
- (0, import_fs.unlinkSync)(outPath);
2333
+ (0, import_fs2.unlinkSync)(outPath);
2116
2334
  } catch {
2117
2335
  }
2118
2336
  }
@@ -2169,13 +2387,13 @@ function stripCodeFence(text) {
2169
2387
  const match = text.match(/^```(?:markdown|md)?\s*([\s\S]*?)```\s*$/m);
2170
2388
  return match ? match[1].trim() : text;
2171
2389
  }
2172
- var import_child_process2, import_fs, import_path, import_os, OCR_PROMPT, _tempDir;
2390
+ var import_child_process2, import_fs2, import_path2, import_os, OCR_PROMPT, _tempDir;
2173
2391
  var init_cli_provider = __esm({
2174
2392
  "src/ocr/cli-provider.ts"() {
2175
2393
  "use strict";
2176
2394
  import_child_process2 = require("child_process");
2177
- import_fs = require("fs");
2178
- import_path = require("path");
2395
+ import_fs2 = require("fs");
2396
+ import_path2 = require("path");
2179
2397
  import_os = require("os");
2180
2398
  OCR_PROMPT = `\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD14C\uC774\uBE14\uC744 \uCD94\uCD9C\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.
2181
2399
  \uADDC\uCE59:
@@ -2219,7 +2437,7 @@ async function createTesseractPoolProvider(concurrency) {
2219
2437
  const waitQueue = [];
2220
2438
  function acquire() {
2221
2439
  if (idle.length > 0) return Promise.resolve(idle.pop());
2222
- return new Promise((resolve) => waitQueue.push(resolve));
2440
+ return new Promise((resolve4) => waitQueue.push(resolve4));
2223
2441
  }
2224
2442
  function release(w) {
2225
2443
  if (waitQueue.length > 0) {
@@ -2258,8 +2476,8 @@ __export(batch_provider_exports, {
2258
2476
  });
2259
2477
  function getBatchTempDir() {
2260
2478
  if (!_batchTempDir) {
2261
- _batchTempDir = (0, import_path2.join)(process.cwd(), ".kordoc_ocr_tmp");
2262
- (0, import_fs2.mkdirSync)(_batchTempDir, { recursive: true });
2479
+ _batchTempDir = (0, import_path3.join)(process.cwd(), ".kordoc_ocr_tmp");
2480
+ (0, import_fs3.mkdirSync)(_batchTempDir, { recursive: true });
2263
2481
  if (process.platform === "win32") {
2264
2482
  try {
2265
2483
  (0, import_child_process3.execSync)(`attrib +h "${_batchTempDir}"`, { stdio: "ignore" });
@@ -2279,8 +2497,8 @@ function createBatchCliProvider(mode, batchSize) {
2279
2497
  const tempFiles = [];
2280
2498
  try {
2281
2499
  for (const { image, pageNum } of pages) {
2282
- const path = (0, import_path2.join)(tempDir, `batch-p${pageNum}.png`);
2283
- (0, import_fs2.writeFileSync)(path, image);
2500
+ const path = (0, import_path3.join)(tempDir, `batch-p${pageNum}.png`);
2501
+ (0, import_fs3.writeFileSync)(path, image);
2284
2502
  tempFiles.push(path);
2285
2503
  }
2286
2504
  let output;
@@ -2300,7 +2518,7 @@ function createBatchCliProvider(mode, batchSize) {
2300
2518
  } finally {
2301
2519
  for (const f of tempFiles) {
2302
2520
  try {
2303
- (0, import_fs2.unlinkSync)(f);
2521
+ (0, import_fs3.unlinkSync)(f);
2304
2522
  } catch {
2305
2523
  }
2306
2524
  }
@@ -2310,7 +2528,7 @@ function createBatchCliProvider(mode, batchSize) {
2310
2528
  };
2311
2529
  }
2312
2530
  function spawnAsync(cmd, args, opts) {
2313
- return new Promise((resolve, reject) => {
2531
+ return new Promise((resolve4, reject) => {
2314
2532
  const child = (0, import_child_process3.spawn)(cmd, args, {
2315
2533
  cwd: opts.cwd,
2316
2534
  env: process.env,
@@ -2346,7 +2564,7 @@ function spawnAsync(cmd, args, opts) {
2346
2564
  if (killed) {
2347
2565
  reject(new Error(`\uD0C0\uC784\uC544\uC6C3 (${Math.round(opts.timeoutMs / 1e3)}\uCD08)`));
2348
2566
  } else {
2349
- resolve({ stdout, stderr, exitCode: code ?? 1 });
2567
+ resolve4({ stdout, stderr, exitCode: code ?? 1 });
2350
2568
  }
2351
2569
  });
2352
2570
  child.on("error", (err) => {
@@ -2383,7 +2601,7 @@ ${fileRefs}`;
2383
2601
  return output;
2384
2602
  }
2385
2603
  async function callBatchCodexCli(imagePaths) {
2386
- const outPath = (0, import_path2.join)((0, import_os2.tmpdir)(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`);
2604
+ const outPath = (0, import_path3.join)((0, import_os2.tmpdir)(), `kordoc-codex-batch-${Date.now()}-${Math.random().toString(36).slice(2)}.txt`);
2387
2605
  try {
2388
2606
  const args = ["exec", BATCH_OCR_PROMPT];
2389
2607
  for (const p of imagePaths) {
@@ -2403,7 +2621,7 @@ async function callBatchCodexCli(imagePaths) {
2403
2621
  }
2404
2622
  let text;
2405
2623
  try {
2406
- text = (0, import_fs2.readFileSync)(outPath, "utf-8");
2624
+ text = (0, import_fs3.readFileSync)(outPath, "utf-8");
2407
2625
  } catch {
2408
2626
  text = result.stdout || "";
2409
2627
  }
@@ -2411,7 +2629,7 @@ async function callBatchCodexCli(imagePaths) {
2411
2629
  return text;
2412
2630
  } finally {
2413
2631
  try {
2414
- (0, import_fs2.unlinkSync)(outPath);
2632
+ (0, import_fs3.unlinkSync)(outPath);
2415
2633
  } catch {
2416
2634
  }
2417
2635
  }
@@ -2426,13 +2644,13 @@ function stripCodeFence2(text) {
2426
2644
  const match = text.match(/^```(?:markdown|md)?\s*\n([\s\S]*?)\n```\s*$/m);
2427
2645
  return match ? match[1].trim() : text;
2428
2646
  }
2429
- var import_child_process3, import_fs2, import_path2, import_os2, BATCH_OCR_PROMPT, DEFAULT_BATCH_SIZES, _batchTempDir;
2647
+ var import_child_process3, import_fs3, import_path3, import_os2, BATCH_OCR_PROMPT, DEFAULT_BATCH_SIZES, _batchTempDir;
2430
2648
  var init_batch_provider = __esm({
2431
2649
  "src/ocr/batch-provider.ts"() {
2432
2650
  "use strict";
2433
2651
  import_child_process3 = require("child_process");
2434
- import_fs2 = require("fs");
2435
- import_path2 = require("path");
2652
+ import_fs3 = require("fs");
2653
+ import_path3 = require("path");
2436
2654
  import_os2 = require("os");
2437
2655
  BATCH_OCR_PROMPT = "\uB2E4\uC74C \uBB38\uC11C \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uB4E4\uC744 OCR\uD558\uC5EC \uC21C\uC218 Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uC138\uC694.\n\n\uADDC\uCE59:\n- \uAC01 \uD398\uC774\uC9C0 \uACB0\uACFC \uC0AC\uC774\uC5D0 \uBC18\uB4DC\uC2DC \uC774 \uAD6C\uBD84\uC790\uB97C \uC0BD\uC785: <!-- PAGE_BREAK -->\n- \uD14C\uC774\uBE14\uC740 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)\n- \uBCD1\uD569\uB41C \uC140\uC740 \uD574\uB2F9 \uC704\uCE58\uC5D0 \uB0B4\uC6A9 \uAE30\uC7AC\n- \uD5E4\uB529\uC740 \uAE00\uC790 \uD06C\uAE30\uC5D0 \uB530\uB77C ## ~ ###### \uC0AC\uC6A9\n- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9\n- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC\n- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0\n- ```\uB85C \uAC10\uC2F8\uC9C0 \uB9D0\uACE0 \uC21C\uC218 Markdown\uB9CC \uCD9C\uB825";
2438
2656
  DEFAULT_BATCH_SIZES = {
@@ -2450,7 +2668,10 @@ __export(resolve_exports, {
2450
2668
  resolveOcrProvider: () => resolveOcrProvider
2451
2669
  });
2452
2670
  async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2671
+ const logger = createLoggerFromEnv().child({ component: "ocr/resolve.ts", stage: "ocr" });
2672
+ logger.log({ level: "debug", event: "start", message: "OCR provider resolve \uC2DC\uC791", meta: { mode, concurrency, batchSize } });
2453
2673
  if (mode === "off") {
2674
+ logger.log({ level: "warn", event: "error", message: "OCR \uBE44\uD65C\uC131\uD654 \uBAA8\uB4DC \uC694\uCCAD" });
2454
2675
  throw new Error("OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC788\uC2B5\uB2C8\uB2E4 (--ocr off).");
2455
2676
  }
2456
2677
  if (mode !== "auto") {
@@ -2458,21 +2679,27 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2458
2679
  if (mode === "tesseract") {
2459
2680
  const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
2460
2681
  if (concurrency && concurrency > 1) {
2682
+ logger.log({ level: "info", event: "done", message: "Tesseract pool provider \uC120\uD0DD", meta: { concurrency } });
2461
2683
  return createTesseractPoolProvider2(concurrency);
2462
2684
  }
2685
+ logger.log({ level: "info", event: "done", message: "Tesseract single provider \uC120\uD0DD" });
2463
2686
  return createTesseractProvider2();
2464
2687
  }
2465
2688
  if (mode === "gemini" || mode === "claude" || mode === "codex") {
2466
2689
  const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
2467
2690
  const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[mode];
2468
2691
  if (effectiveBatch > 1) {
2692
+ logger.log({ level: "info", event: "done", message: "Batch CLI provider \uC120\uD0DD", meta: { mode, batchSize: effectiveBatch } });
2469
2693
  return createBatchCliProvider2(mode, effectiveBatch);
2470
2694
  }
2695
+ logger.log({ level: "info", event: "done", message: "CLI provider \uC120\uD0DD", meta: { mode } });
2471
2696
  return createCliOcrProvider(mode);
2472
2697
  }
2698
+ logger.log({ level: "info", event: "done", message: "CLI provider \uC120\uD0DD", meta: { mode } });
2473
2699
  return createCliOcrProvider(mode);
2474
2700
  }
2475
2701
  const detected = detectAvailableOcr();
2702
+ logger.log({ level: "info", event: "progress", message: "OCR auto \uAC10\uC9C0 \uACB0\uACFC", meta: { detected } });
2476
2703
  if (detected !== "codex") {
2477
2704
  if (detected === "tesseract") {
2478
2705
  warnings?.push({
@@ -2489,18 +2716,23 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2489
2716
  if (detected === "tesseract") {
2490
2717
  const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
2491
2718
  if (concurrency && concurrency > 1) {
2719
+ logger.log({ level: "info", event: "done", message: "AUTO: Tesseract pool provider \uC120\uD0DD", meta: { concurrency } });
2492
2720
  return createTesseractPoolProvider2(concurrency);
2493
2721
  }
2722
+ logger.log({ level: "info", event: "done", message: "AUTO: Tesseract single provider \uC120\uD0DD" });
2494
2723
  return createTesseractProvider2();
2495
2724
  }
2496
2725
  if (detected === "gemini" || detected === "codex" || detected === "claude") {
2497
2726
  const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
2498
2727
  const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[detected];
2499
2728
  if (effectiveBatch > 1) {
2729
+ logger.log({ level: "info", event: "done", message: "AUTO: Batch CLI provider \uC120\uD0DD", meta: { mode: detected, batchSize: effectiveBatch } });
2500
2730
  return createBatchCliProvider2(detected, effectiveBatch);
2501
2731
  }
2732
+ logger.log({ level: "info", event: "done", message: "AUTO: CLI provider \uC120\uD0DD", meta: { mode: detected } });
2502
2733
  return createCliOcrProvider(detected);
2503
2734
  }
2735
+ logger.log({ level: "info", event: "done", message: "AUTO: CLI provider \uC120\uD0DD", meta: { mode: detected } });
2504
2736
  return createCliOcrProvider(detected);
2505
2737
  }
2506
2738
  var init_resolve = __esm({
@@ -2508,6 +2740,7 @@ var init_resolve = __esm({
2508
2740
  "use strict";
2509
2741
  init_auto_detect();
2510
2742
  init_cli_provider();
2743
+ init_logger();
2511
2744
  }
2512
2745
  });
2513
2746
 
@@ -2667,9 +2900,18 @@ function isBatchProvider(p) {
2667
2900
  return !!p && typeof p === "object" && "__batch" in p && p.__batch === true;
2668
2901
  }
2669
2902
  async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
2903
+ const logger = createLoggerFromEnv().child({ component: "ocr/provider.ts", stage: "ocr" });
2904
+ logger.log({
2905
+ level: "info",
2906
+ event: "start",
2907
+ message: "OCR \uD398\uC774\uC9C0 \uCC98\uB9AC \uC2DC\uC791",
2908
+ meta: { effectivePageCount, concurrency, filteredPages: pageFilter?.size, batchProvider: isBatchProvider(provider) }
2909
+ });
2670
2910
  const blocks = [];
2671
2911
  if (isBatchProvider(provider)) {
2672
- return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, onProgress);
2912
+ const result = await ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, onProgress);
2913
+ logger.log({ level: "info", event: "done", message: "OCR \uBC30\uCE58 \uCC98\uB9AC \uC644\uB8CC", meta: { blocks: result.length } });
2914
+ return result;
2673
2915
  }
2674
2916
  if (concurrency <= 1) {
2675
2917
  for (let i = 1; i <= effectivePageCount; i++) {
@@ -2685,8 +2927,16 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
2685
2927
  message: `\uD398\uC774\uC9C0 ${i} OCR \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2686
2928
  code: "OCR_PAGE_FAILED"
2687
2929
  });
2930
+ logger.log({
2931
+ level: "warn",
2932
+ event: "progress",
2933
+ message: "\uD398\uC774\uC9C0 OCR \uC2E4\uD328",
2934
+ meta: { page: i },
2935
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error" }
2936
+ });
2688
2937
  }
2689
2938
  }
2939
+ logger.log({ level: "info", event: "done", message: "OCR \uC21C\uCC28 \uCC98\uB9AC \uC644\uB8CC", meta: { blocks: blocks.length } });
2690
2940
  return blocks;
2691
2941
  }
2692
2942
  const pageNumbers = [];
@@ -2706,6 +2956,13 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
2706
2956
  message: `\uD398\uC774\uC9C0 ${pageNum} OCR \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2707
2957
  code: "OCR_PAGE_FAILED"
2708
2958
  });
2959
+ logger.log({
2960
+ level: "warn",
2961
+ event: "progress",
2962
+ message: "\uD398\uC774\uC9C0 OCR \uC2E4\uD328(\uBCD1\uB82C)",
2963
+ meta: { page: pageNum },
2964
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error" }
2965
+ });
2709
2966
  return null;
2710
2967
  }
2711
2968
  });
@@ -2714,6 +2971,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
2714
2971
  if (!item) continue;
2715
2972
  for (const b of item.pageBlocks) blocks.push(b);
2716
2973
  }
2974
+ logger.log({ level: "info", event: "done", message: "OCR \uBCD1\uB82C \uCC98\uB9AC \uC644\uB8CC", meta: { blocks: blocks.length, pages: pageNumbers.length } });
2717
2975
  return blocks;
2718
2976
  }
2719
2977
  async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
@@ -2796,12 +3054,15 @@ var init_provider = __esm({
2796
3054
  "src/ocr/provider.ts"() {
2797
3055
  "use strict";
2798
3056
  init_markdown_to_blocks();
3057
+ init_logger();
2799
3058
  }
2800
3059
  });
2801
3060
 
2802
3061
  // src/index.ts
2803
3062
  var index_exports = {};
2804
3063
  __export(index_exports, {
3064
+ AllKeysCoolingDownError: () => AllKeysCoolingDownError,
3065
+ ApiKeyRotationPool: () => ApiKeyRotationPool,
2805
3066
  VERSION: () => VERSION,
2806
3067
  blocksToMarkdown: () => blocksToMarkdown,
2807
3068
  compare: () => compare,
@@ -2820,10 +3081,11 @@ __export(index_exports, {
2820
3081
  parseHwp: () => parseHwp,
2821
3082
  parseHwpx: () => parseHwpx,
2822
3083
  parsePdf: () => parsePdf,
2823
- parseXlsx: () => parseXlsx
3084
+ parseXlsx: () => parseXlsx,
3085
+ runUnifiedOcrPipeline: () => runUnifiedOcrPipeline
2824
3086
  });
2825
3087
  module.exports = __toCommonJS(index_exports);
2826
- var import_promises = require("fs/promises");
3088
+ var import_promises3 = require("fs/promises");
2827
3089
 
2828
3090
  // src/detect.ts
2829
3091
  var import_jszip = __toESM(require("jszip"), 1);
@@ -2876,7 +3138,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
2876
3138
  var import_xmldom = require("@xmldom/xmldom");
2877
3139
 
2878
3140
  // src/utils.ts
2879
- var VERSION = true ? "2.4.11" : "0.0.0-dev";
3141
+ var VERSION = true ? "2.4.12" : "0.0.0-dev";
2880
3142
  function toArrayBuffer(buf) {
2881
3143
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2882
3144
  return buf.buffer;
@@ -2884,9 +3146,13 @@ function toArrayBuffer(buf) {
2884
3146
  return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
2885
3147
  }
2886
3148
  var KordocError = class extends Error {
2887
- constructor(message) {
3149
+ code;
3150
+ stage;
3151
+ constructor(message, opts = {}) {
2888
3152
  super(message);
2889
3153
  this.name = "KordocError";
3154
+ this.code = opts.code;
3155
+ this.stage = opts.stage;
2890
3156
  }
2891
3157
  };
2892
3158
  function isPathTraversal(name) {
@@ -2950,6 +3216,16 @@ function classifyError(err) {
2950
3216
  if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
2951
3217
  return "PARSE_ERROR";
2952
3218
  }
3219
+ function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
3220
+ if (err instanceof KordocError) {
3221
+ if (!err.stage) err.stage = stage;
3222
+ if (!err.code) err.code = fallbackCode;
3223
+ return err;
3224
+ }
3225
+ const message = err instanceof Error ? err.message : fallbackMessage;
3226
+ const code = err instanceof Error ? classifyError(err) : fallbackCode;
3227
+ return new KordocError(message || fallbackMessage, { code, stage });
3228
+ }
2953
3229
 
2954
3230
  // src/table/builder.ts
2955
3231
  var MAX_COLS = 200;
@@ -3212,6 +3488,7 @@ var HEADING_RATIO_H3 = 1.15;
3212
3488
 
3213
3489
  // src/hwpx/parser.ts
3214
3490
  init_page_range();
3491
+ init_logger();
3215
3492
  var MAX_DECOMPRESS_SIZE = 500 * 1024 * 1024;
3216
3493
  var MAX_ZIP_ENTRIES = 2e3;
3217
3494
  function clampSpan(val, max) {
@@ -3303,50 +3580,89 @@ function stripDtd(xml) {
3303
3580
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
3304
3581
  }
3305
3582
  async function parseHwpxDocument(buffer, options, existingZip) {
3306
- precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
3307
- let zip;
3583
+ const logger = createLoggerFromEnv().child({ component: "hwpx/parser.ts", stage: "detect" });
3584
+ logger.log({ level: "info", event: "start", message: "HWPX \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
3585
+ let lastParsedSection = 0;
3308
3586
  try {
3309
- zip = existingZip ?? await import_jszip2.default.loadAsync(buffer);
3310
- } catch {
3311
- return await extractFromBrokenZip(buffer);
3312
- }
3313
- const actualEntryCount = Object.keys(zip.files).length;
3314
- if (actualEntryCount > MAX_ZIP_ENTRIES) {
3315
- throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
3316
- }
3317
- const decompressed = { total: 0 };
3318
- const metadata = {};
3319
- await extractHwpxMetadata(zip, metadata, decompressed);
3320
- const styleMap = await extractHwpxStyles(zip, decompressed);
3321
- const warnings = [];
3322
- const sectionPaths = await resolveSectionPaths(zip);
3323
- if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
3324
- metadata.pageCount = sectionPaths.length;
3325
- const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
3326
- const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
3327
- const blocks = [];
3328
- let parsedSections = 0;
3329
- for (let si = 0; si < sectionPaths.length; si++) {
3330
- if (pageFilter && !pageFilter.has(si + 1)) continue;
3331
- const file = zip.file(sectionPaths[si]);
3332
- if (!file) continue;
3587
+ precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
3588
+ let zip;
3333
3589
  try {
3334
- const xml = await file.async("text");
3335
- decompressed.total += xml.length * 2;
3336
- if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
3337
- blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
3338
- parsedSections++;
3339
- options?.onProgress?.(parsedSections, totalTarget);
3340
- } catch (secErr) {
3341
- if (secErr instanceof KordocError) throw secErr;
3342
- warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
3343
- }
3344
- }
3345
- const images = await extractImagesFromZip(zip, blocks, decompressed, warnings);
3346
- detectHwpxHeadings(blocks, styleMap);
3347
- const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
3348
- const markdown = blocksToMarkdown(blocks);
3349
- return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
3590
+ zip = existingZip ?? await import_jszip2.default.loadAsync(buffer);
3591
+ } catch {
3592
+ return await extractFromBrokenZip(buffer);
3593
+ }
3594
+ const actualEntryCount = Object.keys(zip.files).length;
3595
+ if (actualEntryCount > MAX_ZIP_ENTRIES) {
3596
+ throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
3597
+ }
3598
+ const decompressed = { total: 0 };
3599
+ const metadata = {};
3600
+ await extractHwpxMetadata(zip, metadata, decompressed);
3601
+ const styleMap = await extractHwpxStyles(zip, decompressed);
3602
+ const warnings = [];
3603
+ const sectionPaths = await resolveSectionPaths(zip);
3604
+ if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
3605
+ metadata.pageCount = sectionPaths.length;
3606
+ logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uC139\uC158 \uACBD\uB85C \uD574\uC11D \uC644\uB8CC", meta: { sections: sectionPaths.length } });
3607
+ const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
3608
+ const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
3609
+ const blocks = [];
3610
+ let parsedSections = 0;
3611
+ for (let si = 0; si < sectionPaths.length; si++) {
3612
+ if (pageFilter && !pageFilter.has(si + 1)) continue;
3613
+ const file = zip.file(sectionPaths[si]);
3614
+ if (!file) continue;
3615
+ try {
3616
+ const xml = await file.async("text");
3617
+ decompressed.total += xml.length * 2;
3618
+ if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
3619
+ blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
3620
+ parsedSections++;
3621
+ options?.onProgress?.(parsedSections, totalTarget);
3622
+ logger.log({
3623
+ level: "debug",
3624
+ stage: "convert",
3625
+ event: "progress",
3626
+ message: "\uC139\uC158 \uD30C\uC2F1 \uC644\uB8CC",
3627
+ meta: { section: si + 1, parsedSections, totalTarget }
3628
+ });
3629
+ lastParsedSection = si + 1;
3630
+ } catch (secErr) {
3631
+ if (secErr instanceof KordocError) throw secErr;
3632
+ warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
3633
+ logger.log({
3634
+ level: "warn",
3635
+ stage: "convert",
3636
+ event: "progress",
3637
+ message: "\uC139\uC158 \uD30C\uC2F1 \uC2E4\uD328",
3638
+ meta: { section: si + 1 },
3639
+ error: { message: secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: secErr instanceof Error ? secErr.name : "Error" }
3640
+ });
3641
+ }
3642
+ }
3643
+ const images = await extractImagesFromZip(zip, blocks, decompressed, warnings);
3644
+ detectHwpxHeadings(blocks, styleMap);
3645
+ const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
3646
+ const markdown = blocksToMarkdown(blocks);
3647
+ logger.log({
3648
+ level: "info",
3649
+ stage: "finalize",
3650
+ event: "done",
3651
+ message: "HWPX \uD30C\uC2F1 \uC644\uB8CC",
3652
+ meta: { blocks: blocks.length, warnings: warnings.length, images: images.length, outline: outline.length }
3653
+ });
3654
+ return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
3655
+ } catch (err) {
3656
+ logger.log({
3657
+ level: "error",
3658
+ stage: "finalize",
3659
+ event: "error",
3660
+ message: "HWPX \uD30C\uC2F1 \uC2E4\uD328",
3661
+ meta: { lastParsedSection },
3662
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
3663
+ });
3664
+ throw err;
3665
+ }
3350
3666
  }
3351
3667
  function imageExtToMime(ext) {
3352
3668
  switch (ext.toLowerCase()) {
@@ -5063,75 +5379,115 @@ function parseLenientCfb(data) {
5063
5379
 
5064
5380
  // src/hwp5/parser.ts
5065
5381
  init_page_range();
5382
+ init_logger();
5066
5383
  var CFB = __toESM(require_cfb(), 1);
5067
5384
  var MAX_SECTIONS = 100;
5068
5385
  var MAX_TOTAL_DECOMPRESS = 500 * 1024 * 1024;
5069
5386
  function parseHwp5Document(buffer, options) {
5070
- let cfb = null;
5071
- let lenientCfb = null;
5072
- const warnings = [];
5387
+ const logger = createLoggerFromEnv().child({ component: "hwp5/parser.ts", stage: "detect" });
5388
+ logger.log({ level: "info", event: "start", message: "HWP5 \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.length } });
5389
+ let lastParsedSection = 0;
5073
5390
  try {
5074
- cfb = CFB.parse(buffer);
5075
- } catch {
5391
+ let cfb = null;
5392
+ let lenientCfb = null;
5393
+ const warnings = [];
5076
5394
  try {
5077
- lenientCfb = parseLenientCfb(buffer);
5078
- warnings.push({ message: "\uC190\uC0C1\uB41C CFB \uCEE8\uD14C\uC774\uB108 \u2014 lenient \uBAA8\uB4DC\uB85C \uBCF5\uAD6C", code: "LENIENT_CFB_RECOVERY" });
5395
+ cfb = CFB.parse(buffer);
5079
5396
  } catch {
5080
- throw new KordocError("CFB \uCEE8\uD14C\uC774\uB108 \uD30C\uC2F1 \uC2E4\uD328 (strict \uBC0F lenient \uBAA8\uB450)");
5397
+ try {
5398
+ lenientCfb = parseLenientCfb(buffer);
5399
+ warnings.push({ message: "\uC190\uC0C1\uB41C CFB \uCEE8\uD14C\uC774\uB108 \u2014 lenient \uBAA8\uB4DC\uB85C \uBCF5\uAD6C", code: "LENIENT_CFB_RECOVERY" });
5400
+ } catch {
5401
+ throw new KordocError("CFB \uCEE8\uD14C\uC774\uB108 \uD30C\uC2F1 \uC2E4\uD328 (strict \uBC0F lenient \uBAA8\uB450)");
5402
+ }
5081
5403
  }
5082
- }
5083
- const findStream = (path) => {
5084
- if (cfb) {
5085
- const entry = CFB.find(cfb, path);
5086
- return entry?.content ? Buffer.from(entry.content) : null;
5404
+ const findStream = (path) => {
5405
+ if (cfb) {
5406
+ const entry = CFB.find(cfb, path);
5407
+ return entry?.content ? Buffer.from(entry.content) : null;
5408
+ }
5409
+ return lenientCfb.findStream(path);
5410
+ };
5411
+ const headerData = findStream("/FileHeader");
5412
+ if (!headerData) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
5413
+ const header = parseFileHeader(headerData);
5414
+ if (header.signature !== "HWP Document File") throw new KordocError("HWP \uC2DC\uADF8\uB2C8\uCC98 \uBD88\uC77C\uCE58");
5415
+ if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
5416
+ if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
5417
+ const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
5418
+ const distribution = (header.flags & FLAG_DISTRIBUTION) !== 0;
5419
+ const metadata = {
5420
+ version: `${header.versionMajor}.x`
5421
+ };
5422
+ if (cfb) extractHwp5Metadata(cfb, metadata);
5423
+ const docInfo = cfb ? parseDocInfoStream(cfb, compressed) : parseDocInfoFromStream(findStream("/DocInfo"), compressed);
5424
+ const sections = distribution ? cfb ? findViewTextSections(cfb, compressed) : findViewTextSectionsLenient(lenientCfb, compressed) : cfb ? findSections(cfb) : findSectionsLenient(lenientCfb, compressed);
5425
+ if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
5426
+ logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uC139\uC158 \uBAA9\uB85D \uD574\uC11D \uC644\uB8CC", meta: { sections: sections.length, distribution } });
5427
+ metadata.pageCount = sections.length;
5428
+ const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
5429
+ const totalTarget = pageFilter ? pageFilter.size : sections.length;
5430
+ const blocks = [];
5431
+ let totalDecompressed = 0;
5432
+ let parsedSections = 0;
5433
+ for (let si = 0; si < sections.length; si++) {
5434
+ if (pageFilter && !pageFilter.has(si + 1)) continue;
5435
+ try {
5436
+ const sectionData = sections[si];
5437
+ const data = !distribution && compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
5438
+ totalDecompressed += data.length;
5439
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
5440
+ const records = readRecords(data);
5441
+ const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
5442
+ blocks.push(...sectionBlocks);
5443
+ parsedSections++;
5444
+ options?.onProgress?.(parsedSections, totalTarget);
5445
+ logger.log({
5446
+ level: "debug",
5447
+ stage: "convert",
5448
+ event: "progress",
5449
+ message: "\uC139\uC158 \uD30C\uC2F1 \uC644\uB8CC",
5450
+ meta: { section: si + 1, parsedSections, totalTarget }
5451
+ });
5452
+ lastParsedSection = si + 1;
5453
+ } catch (secErr) {
5454
+ if (secErr instanceof KordocError) throw secErr;
5455
+ warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
5456
+ logger.log({
5457
+ level: "warn",
5458
+ stage: "convert",
5459
+ event: "progress",
5460
+ message: "\uC139\uC158 \uD30C\uC2F1 \uC2E4\uD328",
5461
+ meta: { section: si + 1 },
5462
+ error: { message: secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: secErr instanceof Error ? secErr.name : "Error" }
5463
+ });
5464
+ }
5087
5465
  }
5088
- return lenientCfb.findStream(path);
5089
- };
5090
- const headerData = findStream("/FileHeader");
5091
- if (!headerData) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
5092
- const header = parseFileHeader(headerData);
5093
- if (header.signature !== "HWP Document File") throw new KordocError("HWP \uC2DC\uADF8\uB2C8\uCC98 \uBD88\uC77C\uCE58");
5094
- if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
5095
- if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
5096
- const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
5097
- const distribution = (header.flags & FLAG_DISTRIBUTION) !== 0;
5098
- const metadata = {
5099
- version: `${header.versionMajor}.x`
5100
- };
5101
- if (cfb) extractHwp5Metadata(cfb, metadata);
5102
- const docInfo = cfb ? parseDocInfoStream(cfb, compressed) : parseDocInfoFromStream(findStream("/DocInfo"), compressed);
5103
- const sections = distribution ? cfb ? findViewTextSections(cfb, compressed) : findViewTextSectionsLenient(lenientCfb, compressed) : cfb ? findSections(cfb) : findSectionsLenient(lenientCfb, compressed);
5104
- if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
5105
- metadata.pageCount = sections.length;
5106
- const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
5107
- const totalTarget = pageFilter ? pageFilter.size : sections.length;
5108
- const blocks = [];
5109
- let totalDecompressed = 0;
5110
- let parsedSections = 0;
5111
- for (let si = 0; si < sections.length; si++) {
5112
- if (pageFilter && !pageFilter.has(si + 1)) continue;
5113
- try {
5114
- const sectionData = sections[si];
5115
- const data = !distribution && compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
5116
- totalDecompressed += data.length;
5117
- if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
5118
- const records = readRecords(data);
5119
- const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
5120
- blocks.push(...sectionBlocks);
5121
- parsedSections++;
5122
- options?.onProgress?.(parsedSections, totalTarget);
5123
- } catch (secErr) {
5124
- if (secErr instanceof KordocError) throw secErr;
5125
- warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
5126
- }
5127
- }
5128
- const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
5129
- if (docInfo) {
5130
- detectHwp5Headings(blocks, docInfo);
5131
- }
5132
- const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
5133
- const markdown = blocksToMarkdown(blocks);
5134
- return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
5466
+ const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
5467
+ if (docInfo) {
5468
+ detectHwp5Headings(blocks, docInfo);
5469
+ }
5470
+ const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
5471
+ const markdown = blocksToMarkdown(blocks);
5472
+ logger.log({
5473
+ level: "info",
5474
+ stage: "finalize",
5475
+ event: "done",
5476
+ message: "HWP5 \uD30C\uC2F1 \uC644\uB8CC",
5477
+ meta: { blocks: blocks.length, warnings: warnings.length, images: images.length, outline: outline.length }
5478
+ });
5479
+ return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
5480
+ } catch (err) {
5481
+ logger.log({
5482
+ level: "error",
5483
+ stage: "finalize",
5484
+ event: "error",
5485
+ message: "HWP5 \uD30C\uC2F1 \uC2E4\uD328",
5486
+ meta: { lastParsedSection },
5487
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
5488
+ });
5489
+ throw err;
5490
+ }
5135
5491
  }
5136
5492
  function parseDocInfoStream(cfb, compressed) {
5137
5493
  try {
@@ -5678,6 +6034,8 @@ function arrangeCells(rows, cols, cells) {
5678
6034
 
5679
6035
  // src/pdf/parser.ts
5680
6036
  init_page_range();
6037
+ var import_module = require("module");
6038
+ var import_path4 = require("path");
5681
6039
 
5682
6040
  // src/pdf/line-detector.ts
5683
6041
  var import_pdf = require("pdfjs-dist/legacy/build/pdf.mjs");
@@ -5865,12 +6223,17 @@ function buildTableGrids(horizontals, verticals) {
5865
6223
  const rawXs = vLines.map((l) => l.x1);
5866
6224
  const colXs = clusterCoordinates(rawXs).sort((a, b) => a - b);
5867
6225
  if (rowYs.length < 2 || colXs.length < 2) continue;
6226
+ const rowCount = rowYs.length - 1;
6227
+ const colCount = colXs.length - 1;
6228
+ if (rowCount <= 0 || colCount <= 0) continue;
6229
+ if (rowCount * colCount < 2) continue;
5868
6230
  const bbox = {
5869
6231
  x1: colXs[0],
5870
6232
  y1: rowYs[rowYs.length - 1],
5871
6233
  x2: colXs[colXs.length - 1],
5872
6234
  y2: rowYs[0]
5873
6235
  };
6236
+ if (!hasReliableGridStructure(rowYs, colXs, hLines, vLines, bbox)) continue;
5874
6237
  grids.push({ rowYs, colXs, bbox });
5875
6238
  }
5876
6239
  return mergeAdjacentGrids(grids);
@@ -5920,6 +6283,35 @@ function clusterCoordinates(values) {
5920
6283
  }
5921
6284
  return clusters.map((c) => c.sum / c.count);
5922
6285
  }
6286
+ function hasReliableGridStructure(rowYs, colXs, hLines, vLines, bbox) {
6287
+ const internalRows = rowYs.slice(1, -1);
6288
+ const internalCols = colXs.slice(1, -1);
6289
+ const width = Math.max(1, bbox.x2 - bbox.x1);
6290
+ const height = Math.max(1, bbox.y2 - bbox.y1);
6291
+ const coverageThreshold = 0.55;
6292
+ const coveredRows = internalRows.filter(
6293
+ (y) => hLines.some((h) => Math.abs(h.y1 - y) <= COORD_MERGE_TOL && lineOverlapRatio(h.x1, h.x2, bbox.x1, bbox.x2) >= coverageThreshold)
6294
+ ).length;
6295
+ const coveredCols = internalCols.filter(
6296
+ (x) => vLines.some((v) => Math.abs(v.x1 - x) <= COORD_MERGE_TOL && lineOverlapRatio(v.y1, v.y2, bbox.y1, bbox.y2) >= coverageThreshold)
6297
+ ).length;
6298
+ const rowCoverage = internalRows.length > 0 ? coveredRows / internalRows.length : 1;
6299
+ const colCoverage = internalCols.length > 0 ? coveredCols / internalCols.length : 1;
6300
+ const longHorizontal = hLines.filter((h) => Math.abs(h.x2 - h.x1) >= width * 0.7).length;
6301
+ const longVertical = vLines.filter((v) => Math.abs(v.y2 - v.y1) >= height * 0.7).length;
6302
+ const hasAxisSupport = longHorizontal >= 2 && longVertical >= 2;
6303
+ if (!hasAxisSupport) return false;
6304
+ if (internalRows.length > 0 && rowCoverage < 0.5) return false;
6305
+ if (internalCols.length > 0 && colCoverage < 0.5) return false;
6306
+ return true;
6307
+ }
6308
+ function lineOverlapRatio(a1, a2, b1, b2) {
6309
+ const left = Math.max(Math.min(a1, a2), Math.min(b1, b2));
6310
+ const right = Math.min(Math.max(a1, a2), Math.max(b1, b2));
6311
+ const overlap = Math.max(0, right - left);
6312
+ const target = Math.max(1, Math.abs(b2 - b1));
6313
+ return overlap / target;
6314
+ }
5923
6315
  function groupConnectedLines(lines) {
5924
6316
  const parent = lines.map((_, i) => i);
5925
6317
  function find2(x) {
@@ -6296,6 +6688,9 @@ function buildClusterTable(rows, columns, pageNum) {
6296
6688
  };
6297
6689
  }
6298
6690
 
6691
+ // src/pdf/parser.ts
6692
+ init_logger();
6693
+
6299
6694
  // src/pdf/polyfill.ts
6300
6695
  var pdfjsWorker = __toESM(require("pdfjs-dist/legacy/build/pdf.worker.mjs"), 1);
6301
6696
  var g = globalThis;
@@ -6316,6 +6711,17 @@ g.pdfjsWorker = pdfjsWorker;
6316
6711
  // src/pdf/parser.ts
6317
6712
  var import_pdf2 = require("pdfjs-dist/legacy/build/pdf.mjs");
6318
6713
  import_pdf2.GlobalWorkerOptions.workerSrc = "";
6714
+ var require2 = (0, import_module.createRequire)(
6715
+ typeof __filename !== "undefined" ? __filename : (0, import_path4.resolve)(process.cwd(), "kordoc.require.cjs")
6716
+ );
6717
+ function resolvePdfjsWasmUrl() {
6718
+ try {
6719
+ const pdfjsPkg = require2.resolve("pdfjs-dist/package.json");
6720
+ return (0, import_path4.join)((0, import_path4.dirname)(pdfjsPkg), "wasm/");
6721
+ } catch {
6722
+ return (0, import_path4.resolve)(process.cwd(), "node_modules/pdfjs-dist/wasm/");
6723
+ }
6724
+ }
6319
6725
  var MAX_PAGES = 5e3;
6320
6726
  var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
6321
6727
  function calcPdfTimeout(bufferSize) {
@@ -6331,7 +6737,8 @@ async function loadPdfWithTimeout(buffer) {
6331
6737
  data: new Uint8Array(buffer),
6332
6738
  useSystemFonts: true,
6333
6739
  disableFontFace: true,
6334
- isEvalSupported: false
6740
+ isEvalSupported: false,
6741
+ wasmUrl: resolvePdfjsWasmUrl()
6335
6742
  });
6336
6743
  let timer;
6337
6744
  try {
@@ -6348,7 +6755,47 @@ async function loadPdfWithTimeout(buffer) {
6348
6755
  if (timer !== void 0) clearTimeout(timer);
6349
6756
  }
6350
6757
  }
6758
+ function estimateImageBasedPdf(metrics) {
6759
+ if (metrics.length === 0) {
6760
+ return { isImageBased: true, score: 1, reason: "\uC0D8\uD50C \uD1B5\uACC4 \uC5C6\uC74C" };
6761
+ }
6762
+ const totalPages = metrics.length;
6763
+ const totalChars = metrics.reduce((s, m) => s + m.nonWhitespaceChars, 0);
6764
+ const totalItems = metrics.reduce((s, m) => s + m.visibleItems, 0);
6765
+ const pagesWithText = metrics.filter((m) => m.nonWhitespaceChars >= 20 || m.visibleItems >= 15).length;
6766
+ const avgChars = totalChars / totalPages;
6767
+ const avgItems = totalItems / totalPages;
6768
+ const textPresenceRatio = pagesWithText / totalPages;
6769
+ let score = 0;
6770
+ if (avgChars < 10) score += 0.45;
6771
+ if (avgItems < 8) score += 0.35;
6772
+ if (textPresenceRatio < 0.35) score += 0.25;
6773
+ if (avgChars > 40) score -= 0.35;
6774
+ if (avgItems > 25) score -= 0.35;
6775
+ if (textPresenceRatio > 0.7) score -= 0.25;
6776
+ score = Math.max(0, Math.min(1, score));
6777
+ const isImageBased = score >= 0.5;
6778
+ const reason = `avgChars=${avgChars.toFixed(1)}, avgItems=${avgItems.toFixed(1)}, textPresence=${(textPresenceRatio * 100).toFixed(0)}%, score=${score.toFixed(2)}`;
6779
+ return { isImageBased, score, reason };
6780
+ }
6781
+ function summarizePartialFailures(failedPages, totalTarget) {
6782
+ if (failedPages.length === 0) return null;
6783
+ const sorted = [...failedPages].sort((a, b) => a - b);
6784
+ const preview = sorted.slice(0, 10).join(", ");
6785
+ const suffix = sorted.length > 10 ? ` \uC678 ${sorted.length - 10}\uD398\uC774\uC9C0` : "";
6786
+ return `\uBD80\uBD84 \uD30C\uC2F1 \uC2E4\uD328 \uC694\uC57D: ${sorted.length}/${totalTarget}\uD398\uC774\uC9C0 \uC2E4\uD328 (p${preview}${suffix})`;
6787
+ }
6788
+ function shouldAbortForPartialFailures(failedPages, totalTarget, maxPartialFailureRatio) {
6789
+ if (typeof maxPartialFailureRatio !== "number") {
6790
+ return { abort: false, ratio: 0, threshold: 0 };
6791
+ }
6792
+ const threshold = Math.max(0, Math.min(1, maxPartialFailureRatio));
6793
+ const ratio = totalTarget > 0 ? failedPages.length / totalTarget : 0;
6794
+ return { abort: ratio > threshold, ratio, threshold };
6795
+ }
6351
6796
  async function parsePdfDocument(buffer, options) {
6797
+ const logger = createLoggerFromEnv().child({ component: "pdf/parser.ts", stage: "detect" });
6798
+ logger.log({ level: "info", event: "start", message: "PDF \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
6352
6799
  const doc = await loadPdfWithTimeout(buffer);
6353
6800
  try {
6354
6801
  const pageCount = doc.numPages;
@@ -6357,9 +6804,13 @@ async function parsePdfDocument(buffer, options) {
6357
6804
  await extractPdfMetadata(doc, metadata);
6358
6805
  const blocks = [];
6359
6806
  const warnings = [];
6807
+ const failedPages = [];
6808
+ let lastParsedPage2 = 0;
6809
+ const sampleMetricsByPage = /* @__PURE__ */ new Map();
6360
6810
  let totalChars = 0;
6361
6811
  let totalTextBytes = 0;
6362
6812
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
6813
+ logger.log({ level: "debug", event: "progress", message: "PDF \uB85C\uB529 \uC644\uB8CC", meta: { pageCount, effectivePageCount } });
6363
6814
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
6364
6815
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
6365
6816
  const fontSizeFreq = /* @__PURE__ */ new Map();
@@ -6396,11 +6847,17 @@ async function parsePdfDocument(buffer, options) {
6396
6847
  totalChars += t.replace(/\s/g, "").length;
6397
6848
  totalTextBytes += t.length * 2;
6398
6849
  }
6850
+ sampleMetricsByPage.set(i, {
6851
+ nonWhitespaceChars: visible.reduce((sum, it) => sum + it.text.replace(/\s/g, "").length, 0),
6852
+ visibleItems: visible.length
6853
+ });
6854
+ lastParsedPage2 = i;
6399
6855
  if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
6400
6856
  parsedPages++;
6401
6857
  options?.onProgress?.(parsedPages, totalTarget);
6402
6858
  } catch (pageErr) {
6403
6859
  if (pageErr instanceof KordocError) throw pageErr;
6860
+ if (!failedPages.includes(i)) failedPages.push(i);
6404
6861
  warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
6405
6862
  }
6406
6863
  };
@@ -6417,8 +6874,21 @@ async function parsePdfDocument(buffer, options) {
6417
6874
  for (const si of sampledIndices) {
6418
6875
  await parseSinglePage(targetPageNums[si]);
6419
6876
  }
6420
- const sampleParsed = parsedPages || sampledIndices.size;
6421
- const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
6877
+ const sampledMetrics = [];
6878
+ for (const si of sampledIndices) {
6879
+ const pageNum = targetPageNums[si];
6880
+ const m = sampleMetricsByPage.get(pageNum);
6881
+ if (m) sampledMetrics.push(m);
6882
+ }
6883
+ const imageBasedDecision = estimateImageBasedPdf(sampledMetrics);
6884
+ const isImageBased = imageBasedDecision.isImageBased;
6885
+ logger.log({
6886
+ level: "info",
6887
+ stage: "probe",
6888
+ event: "done",
6889
+ message: "\uC774\uBBF8\uC9C0 \uAE30\uBC18 \uD310\uC815",
6890
+ meta: { isImageBased, reason: imageBasedDecision.reason, sampledPages: sampledMetrics.length }
6891
+ });
6422
6892
  if (!isImageBased) {
6423
6893
  for (let si = 0; si < targetPageNums.length; si++) {
6424
6894
  if (!sampledIndices.has(si)) {
@@ -6426,11 +6896,41 @@ async function parsePdfDocument(buffer, options) {
6426
6896
  }
6427
6897
  }
6428
6898
  }
6899
+ const partialSummary = summarizePartialFailures(failedPages, totalTarget);
6900
+ if (partialSummary) {
6901
+ warnings.push({
6902
+ message: partialSummary,
6903
+ code: "PARTIAL_PARSE"
6904
+ });
6905
+ }
6906
+ if (isImageBased) {
6907
+ warnings.push({
6908
+ message: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 \uD310\uC815: ${imageBasedDecision.reason}`,
6909
+ code: "OCR_FALLBACK"
6910
+ });
6911
+ }
6912
+ const partialPolicy = shouldAbortForPartialFailures(
6913
+ failedPages,
6914
+ totalTarget,
6915
+ options?.maxPartialFailureRatio
6916
+ );
6917
+ if (partialPolicy.abort) {
6918
+ throw new KordocError(
6919
+ `\uBD80\uBD84 \uD30C\uC2F1 \uC2E4\uD328 \uBE44\uC728 \uCD08\uACFC: ${(partialPolicy.ratio * 100).toFixed(1)}% (\uD5C8\uC6A9 ${(partialPolicy.threshold * 100).toFixed(1)}%)`
6920
+ );
6921
+ }
6429
6922
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
6430
6923
  if (isImageBased) {
6431
6924
  const ocrMode = options?.ocrMode ?? "auto";
6432
6925
  const concurrency = options?.ocrConcurrency ?? 1;
6433
6926
  const batchSize = options?.ocrBatchSize;
6927
+ logger.log({
6928
+ level: "info",
6929
+ stage: "ocr",
6930
+ event: "start",
6931
+ message: "\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF OCR \uC2DC\uC791",
6932
+ meta: { ocrMode, concurrency, batchSize, totalTarget }
6933
+ });
6434
6934
  if (ocrMode === "off") {
6435
6935
  throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
6436
6936
  }
@@ -6438,8 +6938,10 @@ async function parsePdfDocument(buffer, options) {
6438
6938
  const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
6439
6939
  const tryProvider = async (provider, filter) => {
6440
6940
  try {
6941
+ logger.log({ level: "debug", stage: "ocr", event: "progress", message: "OCR provider \uC2E4\uD589", meta: { filteredPages: filter?.size } });
6441
6942
  return await ocrPages2(doc, provider, filter, effectivePageCount, warnings, concurrency, options?.onProgress);
6442
6943
  } catch {
6944
+ logger.log({ level: "warn", stage: "ocr", event: "progress", message: "OCR provider \uC2E4\uD589 \uC2E4\uD328(\uBE48 \uACB0\uACFC\uB85C \uCC98\uB9AC)" });
6443
6945
  return [];
6444
6946
  } finally {
6445
6947
  const terminable = provider;
@@ -6462,6 +6964,7 @@ async function parsePdfDocument(buffer, options) {
6462
6964
  for (const mode of getAutoFallbackChain2()) {
6463
6965
  if (pendingPages.size === 0) break;
6464
6966
  try {
6967
+ logger.log({ level: "info", stage: "ocr", event: "progress", message: "OCR \uC5D4\uC9C4 \uC2DC\uB3C4", meta: { mode, pendingPages: pendingPages.size } });
6465
6968
  const modeFilter = pendingPages.size < effectivePageCount ? new Set(pendingPages) : pageFilter;
6466
6969
  const provider = await resolveOcrProvider2(mode, warnings, concurrency, batchSize);
6467
6970
  const blocks2 = await tryProvider(provider, modeFilter);
@@ -6476,10 +6979,20 @@ async function parsePdfDocument(buffer, options) {
6476
6979
  code: "OCR_CLI_FALLBACK"
6477
6980
  });
6478
6981
  }
6982
+ logger.log({ level: "info", stage: "ocr", event: "progress", message: "OCR \uC5D4\uC9C4 \uCC98\uB9AC \uC644\uB8CC", meta: { mode, blocks: blocks2.length, pendingPages: pendingPages.size } });
6479
6983
  } else {
6480
6984
  warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
6985
+ logger.log({ level: "warn", stage: "ocr", event: "progress", message: "OCR \uC5D4\uC9C4 \uACB0\uACFC \uC5C6\uC74C", meta: { mode } });
6481
6986
  }
6482
- } catch {
6987
+ } catch (engineErr) {
6988
+ logger.log({
6989
+ level: "warn",
6990
+ stage: "ocr",
6991
+ event: "progress",
6992
+ message: "OCR \uC5D4\uC9C4 \uCD08\uAE30\uD654/\uC2E4\uD589 \uC2E4\uD328",
6993
+ meta: { mode },
6994
+ error: { message: engineErr instanceof Error ? engineErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: engineErr instanceof Error ? engineErr.name : "Error" }
6995
+ });
6483
6996
  }
6484
6997
  }
6485
6998
  allOcrBlocks.sort((a, b) => (a.pageNumber ?? 0) - (b.pageNumber ?? 0));
@@ -6497,6 +7010,7 @@ async function parsePdfDocument(buffer, options) {
6497
7010
  }
6498
7011
  if (ocrBlocks.length > 0) {
6499
7012
  const ocrMarkdown = blocksToMarkdown(ocrBlocks);
7013
+ logger.log({ level: "info", stage: "ocr", event: "done", message: "\uC774\uBBF8\uC9C0 \uAE30\uBC18 OCR \uC644\uB8CC", meta: { blocks: ocrBlocks.length } });
6500
7014
  return {
6501
7015
  markdown: ocrMarkdown,
6502
7016
  blocks: ocrBlocks,
@@ -6522,8 +7036,25 @@ async function parsePdfDocument(buffer, options) {
6522
7036
  }
6523
7037
  detectMarkerHeadings(blocks);
6524
7038
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
6525
- let markdown = cleanPdfText(blocksToMarkdown(blocks));
7039
+ let markdown = cleanPdfText(blocksToMarkdown(blocks), options?.pdfTextNormalization ?? "default");
7040
+ logger.log({
7041
+ level: "info",
7042
+ stage: "finalize",
7043
+ event: "done",
7044
+ message: "PDF \uD30C\uC2F1 \uC644\uB8CC",
7045
+ meta: { blocks: blocks.length, warnings: warnings.length, outline: outline.length, isImageBased: false }
7046
+ });
6526
7047
  return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
7048
+ } catch (err) {
7049
+ logger.log({
7050
+ level: "error",
7051
+ stage: "finalize",
7052
+ event: "error",
7053
+ message: "PDF \uD30C\uC2F1 \uC2E4\uD328",
7054
+ meta: { lastParsedPage },
7055
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
7056
+ });
7057
+ throw err;
6527
7058
  } finally {
6528
7059
  await doc.destroy().catch(() => {
6529
7060
  });
@@ -6617,6 +7148,17 @@ function shouldDemoteTable(table) {
6617
7148
  const emptyCells = totalCells - allCells.length;
6618
7149
  if (table.rows <= 2 && emptyCells > totalCells * 0.5) return true;
6619
7150
  if (table.rows === 1 && !/\d{2,}/.test(allText)) return true;
7151
+ if (table.cols >= 3 && table.rows <= 4) {
7152
+ const markerCells = allCells.filter((t) => /^[□■◆○●▶▷◇◆]/.test(t)).length;
7153
+ const numericCells = allCells.filter((t) => /\d/.test(t)).length;
7154
+ if (markerCells >= Math.max(1, Math.floor(allCells.length * 0.35)) && numericCells <= Math.floor(allCells.length * 0.15)) {
7155
+ return true;
7156
+ }
7157
+ }
7158
+ if (table.cols >= 3 && table.rows >= 2) {
7159
+ const sparseRows = table.cells.filter((row) => row.filter((c) => c.text.trim()).length <= 1).length;
7160
+ if (sparseRows >= Math.ceil(table.rows * 0.7)) return true;
7161
+ }
6620
7162
  return false;
6621
7163
  }
6622
7164
  function demoteTableToText(table) {
@@ -7172,10 +7714,15 @@ function mergeLineSimple(items) {
7172
7714
  }
7173
7715
  return result;
7174
7716
  }
7175
- function cleanPdfText(text) {
7176
- return mergeKoreanLines(
7177
- text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "")
7178
- ).replace(/^(?!\|).{3,30}$/gm, (line) => collapseEvenSpacing(line)).replace(/\n{3,}/g, "\n\n").trim();
7717
+ function stripPdfPageNumberArtifacts(text) {
7718
+ return text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "");
7719
+ }
7720
+ function cleanPdfText(text, mode = "default") {
7721
+ const stripped = stripPdfPageNumberArtifacts(text);
7722
+ if (mode === "strict-preserve") {
7723
+ return stripped.replace(/\n{4,}/g, "\n\n\n").trim();
7724
+ }
7725
+ return mergeKoreanLines(stripped).replace(/^(?!\|).{3,30}$/gm, (line) => collapseEvenSpacing(line)).replace(/\n{3,}/g, "\n\n").trim();
7179
7726
  }
7180
7727
  function startsWithMarker(line) {
7181
7728
  const t = line.trimStart();
@@ -7379,6 +7926,7 @@ function mergeKoreanLines(text) {
7379
7926
  // src/xlsx/parser.ts
7380
7927
  var import_jszip3 = __toESM(require("jszip"), 1);
7381
7928
  var import_xmldom2 = require("@xmldom/xmldom");
7929
+ init_logger();
7382
7930
  var MAX_SHEETS = 100;
7383
7931
  var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
7384
7932
  var MAX_ROWS2 = 1e4;
@@ -7568,105 +8116,145 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
7568
8116
  return blocks;
7569
8117
  }
7570
8118
  async function parseXlsxDocument(buffer, options, existingZip) {
7571
- precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
7572
- const zip = existingZip ?? await import_jszip3.default.loadAsync(buffer);
7573
- const warnings = [];
7574
- const workbookFile = zip.file("xl/workbook.xml");
7575
- if (!workbookFile) {
7576
- throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
7577
- }
7578
- let sharedStrings = [];
7579
- const ssFile = zip.file("xl/sharedStrings.xml");
7580
- if (ssFile) {
7581
- sharedStrings = parseSharedStrings(await ssFile.async("text"));
7582
- }
7583
- const sheets = parseWorkbook(await workbookFile.async("text"));
7584
- if (sheets.length === 0) {
7585
- throw new KordocError("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
7586
- }
7587
- let relsMap = /* @__PURE__ */ new Map();
7588
- const relsFile = zip.file("xl/_rels/workbook.xml.rels");
7589
- if (relsFile) {
7590
- relsMap = parseRels(await relsFile.async("text"));
7591
- }
7592
- let pageFilter = null;
7593
- if (options?.pages) {
7594
- const { parsePageRange: parsePageRange2 } = await Promise.resolve().then(() => (init_page_range(), page_range_exports));
7595
- pageFilter = parsePageRange2(options.pages, sheets.length);
7596
- }
7597
- const blocks = [];
7598
- const processedSheets = Math.min(sheets.length, MAX_SHEETS);
7599
- let totalCells = 0;
7600
- for (let i = 0; i < processedSheets; i++) {
7601
- if (pageFilter && !pageFilter.has(i + 1)) continue;
7602
- const sheet = sheets[i];
7603
- options?.onProgress?.(i + 1, processedSheets);
7604
- let sheetPath = relsMap.get(sheet.rId);
7605
- if (sheetPath) {
7606
- if (!sheetPath.startsWith("xl/") && !sheetPath.startsWith("/")) {
7607
- sheetPath = `xl/${sheetPath}`;
7608
- } else if (sheetPath.startsWith("/")) {
7609
- sheetPath = sheetPath.slice(1);
7610
- }
7611
- } else {
7612
- sheetPath = `xl/worksheets/sheet${i + 1}.xml`;
7613
- }
7614
- const sheetFile = zip.file(sheetPath);
7615
- if (!sheetFile) {
7616
- warnings.push({
7617
- page: i + 1,
7618
- message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${sheetPath}`,
7619
- code: "PARTIAL_PARSE"
7620
- });
7621
- continue;
8119
+ const logger = createLoggerFromEnv().child({ component: "xlsx/parser.ts", stage: "detect" });
8120
+ logger.log({ level: "info", event: "start", message: "XLSX \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
8121
+ let lastProcessedSheet = 0;
8122
+ try {
8123
+ precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
8124
+ const zip = existingZip ?? await import_jszip3.default.loadAsync(buffer);
8125
+ const warnings = [];
8126
+ const workbookFile = zip.file("xl/workbook.xml");
8127
+ if (!workbookFile) {
8128
+ throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
8129
+ }
8130
+ let sharedStrings = [];
8131
+ const ssFile = zip.file("xl/sharedStrings.xml");
8132
+ if (ssFile) {
8133
+ sharedStrings = parseSharedStrings(await ssFile.async("text"));
8134
+ }
8135
+ const sheets = parseWorkbook(await workbookFile.async("text"));
8136
+ if (sheets.length === 0) {
8137
+ throw new KordocError("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
8138
+ }
8139
+ logger.log({ level: "debug", event: "progress", message: "\uC2DC\uD2B8 \uBAA9\uB85D \uB85C\uB4DC", meta: { sheets: sheets.length } });
8140
+ let relsMap = /* @__PURE__ */ new Map();
8141
+ const relsFile = zip.file("xl/_rels/workbook.xml.rels");
8142
+ if (relsFile) {
8143
+ relsMap = parseRels(await relsFile.async("text"));
8144
+ }
8145
+ let pageFilter = null;
8146
+ if (options?.pages) {
8147
+ const { parsePageRange: parsePageRange2 } = await Promise.resolve().then(() => (init_page_range(), page_range_exports));
8148
+ pageFilter = parsePageRange2(options.pages, sheets.length);
7622
8149
  }
7623
- try {
7624
- const sheetXml = await sheetFile.async("text");
7625
- const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
7626
- totalCells += maxRow * maxCol;
7627
- if (totalCells > MAX_TOTAL_CELLS) {
7628
- warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
7629
- break;
8150
+ const blocks = [];
8151
+ const processedSheets = Math.min(sheets.length, MAX_SHEETS);
8152
+ let totalCells = 0;
8153
+ for (let i = 0; i < processedSheets; i++) {
8154
+ if (pageFilter && !pageFilter.has(i + 1)) continue;
8155
+ const sheet = sheets[i];
8156
+ options?.onProgress?.(i + 1, processedSheets);
8157
+ let sheetPath = relsMap.get(sheet.rId);
8158
+ if (sheetPath) {
8159
+ if (!sheetPath.startsWith("xl/") && !sheetPath.startsWith("/")) {
8160
+ sheetPath = `xl/${sheetPath}`;
8161
+ } else if (sheetPath.startsWith("/")) {
8162
+ sheetPath = sheetPath.slice(1);
8163
+ }
8164
+ } else {
8165
+ sheetPath = `xl/worksheets/sheet${i + 1}.xml`;
8166
+ }
8167
+ const sheetFile = zip.file(sheetPath);
8168
+ if (!sheetFile) {
8169
+ warnings.push({
8170
+ page: i + 1,
8171
+ message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${sheetPath}`,
8172
+ code: "PARTIAL_PARSE"
8173
+ });
8174
+ continue;
8175
+ }
8176
+ try {
8177
+ const sheetXml = await sheetFile.async("text");
8178
+ const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
8179
+ totalCells += maxRow * maxCol;
8180
+ if (totalCells > MAX_TOTAL_CELLS) {
8181
+ warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
8182
+ break;
8183
+ }
8184
+ const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
8185
+ blocks.push(...sheetBlocks);
8186
+ logger.log({
8187
+ level: "debug",
8188
+ stage: "convert",
8189
+ event: "progress",
8190
+ message: "\uC2DC\uD2B8 \uD30C\uC2F1 \uC644\uB8CC",
8191
+ meta: { sheet: sheet.name, index: i + 1, processedSheets }
8192
+ });
8193
+ lastProcessedSheet = i + 1;
8194
+ } catch (err) {
8195
+ warnings.push({
8196
+ page: i + 1,
8197
+ message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC2F1 \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
8198
+ code: "PARTIAL_PARSE"
8199
+ });
8200
+ logger.log({
8201
+ level: "warn",
8202
+ stage: "convert",
8203
+ event: "progress",
8204
+ message: "\uC2DC\uD2B8 \uD30C\uC2F1 \uC2E4\uD328",
8205
+ meta: { sheet: sheet.name, index: i + 1 },
8206
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error" }
8207
+ });
7630
8208
  }
7631
- const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
7632
- blocks.push(...sheetBlocks);
7633
- } catch (err) {
7634
- warnings.push({
7635
- page: i + 1,
7636
- message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC2F1 \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
7637
- code: "PARTIAL_PARSE"
7638
- });
7639
8209
  }
7640
- }
7641
- const metadata = {
7642
- pageCount: processedSheets
7643
- };
7644
- const coreFile = zip.file("docProps/core.xml");
7645
- if (coreFile) {
7646
- try {
7647
- const coreXml = await coreFile.async("text");
7648
- const doc = parseXml(coreXml);
7649
- const getFirst = (tag) => {
7650
- const els = doc.getElementsByTagName(tag);
7651
- return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
7652
- };
7653
- metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
7654
- metadata.author = getFirst("dc:creator");
7655
- metadata.description = getFirst("dc:description");
7656
- const created = getFirst("dcterms:created");
7657
- if (created) metadata.createdAt = created;
7658
- const modified = getFirst("dcterms:modified");
7659
- if (modified) metadata.modifiedAt = modified;
7660
- } catch {
8210
+ const metadata = {
8211
+ pageCount: processedSheets
8212
+ };
8213
+ const coreFile = zip.file("docProps/core.xml");
8214
+ if (coreFile) {
8215
+ try {
8216
+ const coreXml = await coreFile.async("text");
8217
+ const doc = parseXml(coreXml);
8218
+ const getFirst = (tag) => {
8219
+ const els = doc.getElementsByTagName(tag);
8220
+ return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
8221
+ };
8222
+ metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
8223
+ metadata.author = getFirst("dc:creator");
8224
+ metadata.description = getFirst("dc:description");
8225
+ const created = getFirst("dcterms:created");
8226
+ if (created) metadata.createdAt = created;
8227
+ const modified = getFirst("dcterms:modified");
8228
+ if (modified) metadata.modifiedAt = modified;
8229
+ } catch {
8230
+ }
7661
8231
  }
8232
+ const markdown = blocksToMarkdown(blocks);
8233
+ logger.log({
8234
+ level: "info",
8235
+ stage: "finalize",
8236
+ event: "done",
8237
+ message: "XLSX \uD30C\uC2F1 \uC644\uB8CC",
8238
+ meta: { blocks: blocks.length, warnings: warnings.length, pageCount: processedSheets }
8239
+ });
8240
+ return { markdown, blocks, metadata, warnings: warnings.length > 0 ? warnings : void 0 };
8241
+ } catch (err) {
8242
+ logger.log({
8243
+ level: "error",
8244
+ stage: "finalize",
8245
+ event: "error",
8246
+ message: "XLSX \uD30C\uC2F1 \uC2E4\uD328",
8247
+ meta: { lastProcessedSheet },
8248
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
8249
+ });
8250
+ throw err;
7662
8251
  }
7663
- const markdown = blocksToMarkdown(blocks);
7664
- return { markdown, blocks, metadata, warnings: warnings.length > 0 ? warnings : void 0 };
7665
8252
  }
7666
8253
 
7667
8254
  // src/docx/parser.ts
7668
8255
  var import_jszip4 = __toESM(require("jszip"), 1);
7669
8256
  var import_xmldom3 = require("@xmldom/xmldom");
8257
+ init_logger();
7670
8258
  var MAX_DECOMPRESS_SIZE4 = 500 * 1024 * 1024;
7671
8259
  function getChildElements(parent, localName) {
7672
8260
  const result = [];
@@ -8028,101 +8616,127 @@ async function extractImages(zip, rels, doc) {
8028
8616
  return { blocks, images };
8029
8617
  }
8030
8618
  async function parseDocxDocument(buffer, options, existingZip) {
8031
- precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
8032
- const zip = existingZip ?? await import_jszip4.default.loadAsync(buffer);
8033
- const warnings = [];
8034
- const docFile = zip.file("word/document.xml");
8035
- if (!docFile) {
8036
- throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
8037
- }
8038
- let rels = /* @__PURE__ */ new Map();
8039
- const relsFile = zip.file("word/_rels/document.xml.rels");
8040
- if (relsFile) {
8041
- rels = parseRels2(await relsFile.async("text"));
8042
- }
8043
- let styles = /* @__PURE__ */ new Map();
8044
- const stylesFile = zip.file("word/styles.xml");
8045
- if (stylesFile) {
8046
- try {
8047
- styles = parseStyles(await stylesFile.async("text"));
8048
- } catch {
8619
+ const logger = createLoggerFromEnv().child({ component: "docx/parser.ts", stage: "detect" });
8620
+ logger.log({ level: "info", event: "start", message: "DOCX \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
8621
+ let lastProcessedNode = 0;
8622
+ try {
8623
+ precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
8624
+ const zip = existingZip ?? await import_jszip4.default.loadAsync(buffer);
8625
+ const warnings = [];
8626
+ const docFile = zip.file("word/document.xml");
8627
+ if (!docFile) {
8628
+ throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
8629
+ }
8630
+ let rels = /* @__PURE__ */ new Map();
8631
+ const relsFile = zip.file("word/_rels/document.xml.rels");
8632
+ if (relsFile) {
8633
+ rels = parseRels2(await relsFile.async("text"));
8634
+ }
8635
+ let styles = /* @__PURE__ */ new Map();
8636
+ const stylesFile = zip.file("word/styles.xml");
8637
+ if (stylesFile) {
8638
+ try {
8639
+ styles = parseStyles(await stylesFile.async("text"));
8640
+ } catch {
8641
+ }
8049
8642
  }
8050
- }
8051
- let numbering = /* @__PURE__ */ new Map();
8052
- const numFile = zip.file("word/numbering.xml");
8053
- if (numFile) {
8054
- try {
8055
- numbering = parseNumbering(await numFile.async("text"));
8056
- } catch {
8643
+ let numbering = /* @__PURE__ */ new Map();
8644
+ const numFile = zip.file("word/numbering.xml");
8645
+ if (numFile) {
8646
+ try {
8647
+ numbering = parseNumbering(await numFile.async("text"));
8648
+ } catch {
8649
+ }
8057
8650
  }
8058
- }
8059
- let footnotes = /* @__PURE__ */ new Map();
8060
- const fnFile = zip.file("word/footnotes.xml");
8061
- if (fnFile) {
8062
- try {
8063
- footnotes = parseFootnotes(await fnFile.async("text"));
8064
- } catch {
8651
+ let footnotes = /* @__PURE__ */ new Map();
8652
+ const fnFile = zip.file("word/footnotes.xml");
8653
+ if (fnFile) {
8654
+ try {
8655
+ footnotes = parseFootnotes(await fnFile.async("text"));
8656
+ } catch {
8657
+ }
8065
8658
  }
8066
- }
8067
- const docXml = await docFile.async("text");
8068
- const doc = parseXml2(docXml);
8069
- const body = findElements(doc, "body");
8070
- if (body.length === 0) {
8071
- throw new KordocError("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
8072
- }
8073
- const blocks = [];
8074
- const bodyEl = body[0];
8075
- const children = bodyEl.childNodes;
8076
- for (let i = 0; i < children.length; i++) {
8077
- const node = children[i];
8078
- if (node.nodeType !== 1) continue;
8079
- const el = node;
8080
- const localName = el.localName ?? el.tagName?.split(":").pop();
8081
- if (localName === "p") {
8082
- const block = parseParagraph(el, styles, numbering, footnotes, rels);
8083
- if (block) blocks.push(block);
8084
- } else if (localName === "tbl") {
8085
- const block = parseTable(el, styles, numbering, footnotes, rels);
8086
- if (block) blocks.push(block);
8087
- }
8088
- }
8089
- const { blocks: imgBlocks, images } = await extractImages(zip, rels, doc);
8090
- const metadata = {};
8091
- const coreFile = zip.file("docProps/core.xml");
8092
- if (coreFile) {
8093
- try {
8094
- const coreXml = await coreFile.async("text");
8095
- const coreDoc = parseXml2(coreXml);
8096
- const getFirst = (tag) => {
8097
- const els = coreDoc.getElementsByTagName(tag);
8098
- return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
8099
- };
8100
- metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
8101
- metadata.author = getFirst("dc:creator");
8102
- metadata.description = getFirst("dc:description");
8103
- const created = getFirst("dcterms:created");
8104
- if (created) metadata.createdAt = created;
8105
- const modified = getFirst("dcterms:modified");
8106
- if (modified) metadata.modifiedAt = modified;
8107
- } catch {
8659
+ const docXml = await docFile.async("text");
8660
+ const doc = parseXml2(docXml);
8661
+ const body = findElements(doc, "body");
8662
+ if (body.length === 0) {
8663
+ throw new KordocError("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
8108
8664
  }
8665
+ const blocks = [];
8666
+ const bodyEl = body[0];
8667
+ const children = bodyEl.childNodes;
8668
+ for (let i = 0; i < children.length; i++) {
8669
+ const node = children[i];
8670
+ if (node.nodeType !== 1) continue;
8671
+ const el = node;
8672
+ const localName = el.localName ?? el.tagName?.split(":").pop();
8673
+ if (localName === "p") {
8674
+ const block = parseParagraph(el, styles, numbering, footnotes, rels);
8675
+ if (block) blocks.push(block);
8676
+ } else if (localName === "tbl") {
8677
+ const block = parseTable(el, styles, numbering, footnotes, rels);
8678
+ if (block) blocks.push(block);
8679
+ }
8680
+ lastProcessedNode = i + 1;
8681
+ }
8682
+ logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uBCF8\uBB38 \uBE14\uB85D \uD30C\uC2F1 \uC644\uB8CC", meta: { blocks: blocks.length } });
8683
+ const { blocks: imgBlocks, images } = await extractImages(zip, rels, doc);
8684
+ logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uC774\uBBF8\uC9C0 \uCD94\uCD9C \uC644\uB8CC", meta: { imageBlocks: imgBlocks.length, images: images.length } });
8685
+ const metadata = {};
8686
+ const coreFile = zip.file("docProps/core.xml");
8687
+ if (coreFile) {
8688
+ try {
8689
+ const coreXml = await coreFile.async("text");
8690
+ const coreDoc = parseXml2(coreXml);
8691
+ const getFirst = (tag) => {
8692
+ const els = coreDoc.getElementsByTagName(tag);
8693
+ return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
8694
+ };
8695
+ metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
8696
+ metadata.author = getFirst("dc:creator");
8697
+ metadata.description = getFirst("dc:description");
8698
+ const created = getFirst("dcterms:created");
8699
+ if (created) metadata.createdAt = created;
8700
+ const modified = getFirst("dcterms:modified");
8701
+ if (modified) metadata.modifiedAt = modified;
8702
+ } catch {
8703
+ }
8704
+ }
8705
+ const outline = blocks.filter((b) => b.type === "heading").map((b) => ({ level: b.level ?? 2, text: b.text ?? "" }));
8706
+ const markdown = blocksToMarkdown(blocks);
8707
+ logger.log({
8708
+ level: "info",
8709
+ stage: "finalize",
8710
+ event: "done",
8711
+ message: "DOCX \uD30C\uC2F1 \uC644\uB8CC",
8712
+ meta: { blocks: blocks.length, warnings: warnings.length, outline: outline.length, images: images.length }
8713
+ });
8714
+ return {
8715
+ markdown,
8716
+ blocks,
8717
+ metadata,
8718
+ outline: outline.length > 0 ? outline : void 0,
8719
+ warnings: warnings.length > 0 ? warnings : void 0,
8720
+ images: images.length > 0 ? images : void 0
8721
+ };
8722
+ } catch (err) {
8723
+ logger.log({
8724
+ level: "error",
8725
+ stage: "finalize",
8726
+ event: "error",
8727
+ message: "DOCX \uD30C\uC2F1 \uC2E4\uD328",
8728
+ meta: { lastProcessedNode },
8729
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
8730
+ });
8731
+ throw err;
8109
8732
  }
8110
- const outline = blocks.filter((b) => b.type === "heading").map((b) => ({ level: b.level ?? 2, text: b.text ?? "" }));
8111
- const markdown = blocksToMarkdown(blocks);
8112
- return {
8113
- markdown,
8114
- blocks,
8115
- metadata,
8116
- outline: outline.length > 0 ? outline : void 0,
8117
- warnings: warnings.length > 0 ? warnings : void 0,
8118
- images: images.length > 0 ? images : void 0
8119
- };
8120
8733
  }
8121
8734
 
8122
8735
  // src/index.ts
8123
8736
  init_cli_provider();
8124
8737
  init_tesseract_provider();
8125
8738
  init_markdown_to_blocks();
8739
+ init_logger();
8126
8740
 
8127
8741
  // src/diff/text-diff.ts
8128
8742
  function similarity(a, b) {
@@ -10621,15 +11235,726 @@ async function markdownToXlsx(markdown, options) {
10621
11235
  return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
10622
11236
  }
10623
11237
 
11238
+ // src/ocr/api-key-rotation.ts
11239
+ var AllKeysCoolingDownError = class extends Error {
11240
+ waitMs;
11241
+ constructor(waitMs) {
11242
+ super(`\uBAA8\uB4E0 API \uD0A4\uAC00 cooldown \uC0C1\uD0DC\uC785\uB2C8\uB2E4. ${waitMs}ms \uD6C4 \uC7AC\uC2DC\uB3C4\uD558\uC138\uC694.`);
11243
+ this.name = "AllKeysCoolingDownError";
11244
+ this.waitMs = waitMs;
11245
+ }
11246
+ };
11247
+ var ApiKeyRotationPool = class _ApiKeyRotationPool {
11248
+ states;
11249
+ baseCooldownMs;
11250
+ maxCooldownMs;
11251
+ cursor = -1;
11252
+ constructor(keys, options = {}) {
11253
+ const normalized = keys.map((k) => k.trim()).filter(Boolean);
11254
+ if (normalized.length === 0) {
11255
+ throw new Error("API \uD0A4\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
11256
+ }
11257
+ this.states = normalized.map((key, idx) => ({
11258
+ key,
11259
+ keyId: `key_${idx + 1}`,
11260
+ totalRequests: 0,
11261
+ successCount: 0,
11262
+ failureCount: 0,
11263
+ consecutiveFailures: 0
11264
+ }));
11265
+ this.baseCooldownMs = options.baseCooldownMs ?? 5e3;
11266
+ this.maxCooldownMs = options.maxCooldownMs ?? 12e4;
11267
+ }
11268
+ static fromEnv(env = process.env) {
11269
+ const multi = (env.NVIDIA_API_KEYS || "").split(",").map((v) => v.trim()).filter(Boolean);
11270
+ if (multi.length > 0) return new _ApiKeyRotationPool(multi);
11271
+ const single = (env.NVIDIA_API_KEY || "").trim();
11272
+ if (single) return new _ApiKeyRotationPool([single]);
11273
+ throw new Error("NVIDIA_API_KEYS \uB610\uB294 NVIDIA_API_KEY \uD658\uACBD\uBCC0\uC218\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4.");
11274
+ }
11275
+ acquire(now = Date.now()) {
11276
+ const n = this.states.length;
11277
+ for (let step = 1; step <= n; step++) {
11278
+ const idx = (this.cursor + step) % n;
11279
+ const s = this.states[idx];
11280
+ if (!s.cooldownUntil || s.cooldownUntil <= now) {
11281
+ this.cursor = idx;
11282
+ s.totalRequests++;
11283
+ s.lastUsedAt = now;
11284
+ return { key: s.key, keyId: s.keyId };
11285
+ }
11286
+ }
11287
+ const minCooldownUntil = this.states.map((s) => s.cooldownUntil ?? now).reduce((min, v) => Math.min(min, v), Number.POSITIVE_INFINITY);
11288
+ throw new AllKeysCoolingDownError(Math.max(0, minCooldownUntil - now));
11289
+ }
11290
+ markSuccess(keyId) {
11291
+ const s = this.find(keyId);
11292
+ s.successCount++;
11293
+ s.consecutiveFailures = 0;
11294
+ s.cooldownUntil = void 0;
11295
+ }
11296
+ markFailure(keyId, opts = {}, now = Date.now()) {
11297
+ const s = this.find(keyId);
11298
+ s.failureCount++;
11299
+ s.consecutiveFailures++;
11300
+ const retryable = this.isRetryableFailure(opts.status, opts.timeout);
11301
+ if (!retryable) return;
11302
+ const exp = Math.max(0, s.consecutiveFailures - 1);
11303
+ const backoff = Math.min(this.baseCooldownMs * 2 ** exp, this.maxCooldownMs);
11304
+ const cooldown = Math.max(backoff, opts.retryAfterMs ?? 0);
11305
+ s.cooldownUntil = now + cooldown;
11306
+ }
11307
+ snapshot() {
11308
+ return this.states.map((s) => ({
11309
+ keyId: s.keyId,
11310
+ totalRequests: s.totalRequests,
11311
+ successCount: s.successCount,
11312
+ failureCount: s.failureCount,
11313
+ consecutiveFailures: s.consecutiveFailures,
11314
+ lastUsedAt: s.lastUsedAt,
11315
+ cooldownUntil: s.cooldownUntil
11316
+ }));
11317
+ }
11318
+ isRetryableFailure(status, timeout) {
11319
+ if (timeout) return true;
11320
+ if (status === 429) return true;
11321
+ if (typeof status === "number" && status >= 500) return true;
11322
+ return false;
11323
+ }
11324
+ find(keyId) {
11325
+ const s = this.states.find((v) => v.keyId === keyId);
11326
+ if (!s) throw new Error(`\uC54C \uC218 \uC5C6\uB294 keyId: ${keyId}`);
11327
+ return s;
11328
+ }
11329
+ };
11330
+
11331
+ // src/pipeline/unified-ocr.ts
11332
+ var import_promises2 = require("fs/promises");
11333
+ var import_path5 = require("path");
11334
+ var import_child_process4 = require("child_process");
11335
+ var import_libreoffice_convert = __toESM(require("libreoffice-convert"), 1);
11336
+ init_logger();
11337
+ var libreConvert = import_libreoffice_convert.default.convert;
11338
+ var UnifiedOcrError = class extends Error {
11339
+ code;
11340
+ stage;
11341
+ constructor(code, stage, message) {
11342
+ super(message);
11343
+ this.name = "UnifiedOcrError";
11344
+ this.code = code;
11345
+ this.stage = stage;
11346
+ }
11347
+ };
11348
+ var DEFAULT_MODELS = [
11349
+ "mistralai/mistral-medium-3-instruct",
11350
+ "moonshotai/kimi-k2.5",
11351
+ "moonshotai/kimi-k2-thinking",
11352
+ "moonshotai/kimi-k2-instruct",
11353
+ "moonshotai/kimi-k2-instruct-0905",
11354
+ "qwen/qwen3.5-122b-a10b",
11355
+ "qwen/qwen3.5-397b-a17b"
11356
+ ];
11357
+ var DEFAULT_MODEL_MAX_TOKENS = {
11358
+ "mistralai/mistral-medium-3-instruct": 8192,
11359
+ "moonshotai/kimi-k2.5": 64e3,
11360
+ "moonshotai/kimi-k2-thinking": 64e3,
11361
+ "moonshotai/kimi-k2-instruct": 64e3,
11362
+ "moonshotai/kimi-k2-instruct-0905": 64e3,
11363
+ "qwen/qwen3.5-122b-a10b": 64e3,
11364
+ "qwen/qwen3.5-397b-a17b": 64e3
11365
+ };
11366
+ var DEFAULT_STAGE_WEIGHTS = {
11367
+ convert: 15,
11368
+ render: 20,
11369
+ probe: 5,
11370
+ ocr: 45,
11371
+ proofread: 10,
11372
+ merge: 5
11373
+ };
11374
+ var OCR_PROMPT2 = "\uC774 \uC774\uBBF8\uC9C0 1\uC7A5\uC758 \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uC694\uC57D \uC5C6\uC774 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uCD94\uCD9C\uD558\uC138\uC694. \uC808\uB300\uB85C \uB0B4\uC6A9\uC744 \uCD94\uCE21\uD558\uAC70\uB098 \uBC14\uAFB8\uC9C0 \uB9C8\uC138\uC694.";
11375
+ var PROOFREAD_PROMPT = [
11376
+ "\uC544\uB798 Markdown\uC744 \uBE44\uD30C\uAD34 \uAD50\uC815\uB9CC \uC218\uD589\uD558\uC138\uC694.",
11377
+ "\uADDC\uCE59:",
11378
+ "- \uC0AC\uC2E4 \uCD94\uAC00/\uC0AD\uC81C/\uCD94\uCE21 \uAE08\uC9C0",
11379
+ "- \uC22B\uC790, \uB2E8\uC704, \uACE0\uC720\uBA85\uC0AC \uBCC0\uACBD \uAE08\uC9C0",
11380
+ "- \uC624\uD0C8\uC790, \uB744\uC5B4\uC4F0\uAE30, \uC904\uBC14\uAFC8, Markdown \uAD6C\uC870\uB9CC \uAD50\uC815",
11381
+ "- \uACB0\uACFC\uB294 Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825"
11382
+ ].join("\n");
11383
// End-to-end OCR pipeline: convert -> render -> probe -> ocr -> proofread -> merge.
// Takes a document path (.pdf/.hwp/.hwpx/.docx/.xlsx), produces a merged Markdown
// file plus a JSON run report, and emits progress events via options.onEvent.
// Returns { outputPath, reportPath, selectedModel }; throws a UnifiedOcrError
// (normalized per stage) on failure.
async function runUnifiedOcrPipeline(inputPath, options = {}) {
  // --- Path layout: workspace lives next to the input unless overridden. ---
  const absInput = (0, import_path5.resolve)(inputPath);
  const stem = (0, import_path5.basename)(absInput, (0, import_path5.extname)(absInput));
  const workspaceDir = (0, import_path5.resolve)(options.workspaceDir ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}_ocr_workspace`));
  const imagesDir = (0, import_path5.join)(workspaceDir, "images");
  const rawDir = (0, import_path5.join)(workspaceDir, "ocr", "raw");
  const proofDir = (0, import_path5.join)(workspaceDir, "ocr", "proofread");
  const diffDir = (0, import_path5.join)(workspaceDir, "ocr", "diff");
  const outputPath = (0, import_path5.resolve)(options.outputPath ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}.md`));
  const reportPath = (0, import_path5.join)(workspaceDir, "run-report.json");
  // Model latency cache is shared across runs in the same input directory.
  const modelCachePath = (0, import_path5.join)((0, import_path5.dirname)(absInput), ".kordoc-model-cache.json");
  // --- Tunables with defaults. ---
  const baseUrl = options.baseUrl ?? "https://integrate.api.nvidia.com/v1/chat/completions";
  const timeoutMs = options.timeoutMs ?? 6e4;
  const maxRetriesPerPage = options.maxRetriesPerPage ?? 5;
  const dpi = options.dpi ?? 300;
  const modelsInput = options.modelCandidates?.length ? options.modelCandidates : DEFAULT_MODELS;
  // Reorder candidates by previously measured latency so the probe tries the
  // historically fastest models first.
  const modelCache = await loadModelCache(modelCachePath);
  const models = sortModelsByCache(modelsInput, modelCache);
  const modelMaxTokens = { ...DEFAULT_MODEL_MAX_TOKENS, ...options.modelMaxTokens ?? {} };
  const stageWeights = normalizeWeights({ ...DEFAULT_STAGE_WEIGHTS, ...options.stageWeights ?? {} });
  const keyPool = ApiKeyRotationPool.fromEnv();
  const runId = options.runId ?? generateRunId("ocr");
  const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
  await (0, import_promises2.mkdir)(imagesDir, { recursive: true });
  await (0, import_promises2.mkdir)(rawDir, { recursive: true });
  await (0, import_promises2.mkdir)(proofDir, { recursive: true });
  await (0, import_promises2.mkdir)(diffDir, { recursive: true });
  const timingsMs = {};
  // Progress helpers wrapping emitProgress with the run's weights.
  const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
  const markStageProgress = (stage, stagePercent, current, total, message) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message });
  const markStageDone = (stage, message) => emitProgress(options.onEvent, stage, 100, stageWeights, { message, type: "stage_done" });
  // Tracked so the catch block can attribute failures to the right stage.
  let currentStage = "convert";
  const logStage = (level, stage, event, message, meta) => {
    logger.log({ level, stage, event, message, meta });
  };
  try {
    ensureSupportedInput(absInput);
    let workingPdfPath = absInput;
    // --- Stage 1: convert non-PDF inputs to PDF via LibreOffice. ---
    const convertStart = Date.now();
    currentStage = "convert";
    markStageStart("convert", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC911");
    logStage("info", "convert", "start", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC2DC\uC791", { input: absInput });
    if ((0, import_path5.extname)(absInput).toLowerCase() !== ".pdf") {
      await assertSofficeAvailable();
      workingPdfPath = (0, import_path5.join)(workspaceDir, `${stem}.pdf`);
      const inputBuffer = await (0, import_promises2.readFile)(absInput);
      const out = await convertWithLibreOffice(inputBuffer, ".pdf");
      await (0, import_promises2.writeFile)(workingPdfPath, out);
    }
    timingsMs.convert = Date.now() - convertStart;
    markStageDone("convert", "PDF \uBCC0\uD658 \uC644\uB8CC");
    logStage("info", "convert", "done", "PDF \uBCC0\uD658 \uC644\uB8CC", { elapsedMs: timingsMs.convert });
    // --- Stage 2: rasterize PDF pages to PNG with pdftoppm. ---
    const renderStart = Date.now();
    currentStage = "render";
    markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
    logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi });
    await renderPdfToPng(workingPdfPath, (0, import_path5.join)(imagesDir, "page"), dpi);
    const images = await listPageImages(imagesDir);
    if (images.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328: \uACB0\uACFC \uC774\uBBF8\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
    markStageProgress("render", 100, images.length, images.length, `\uD398\uC774\uC9C0 ${images.length}\uC7A5 \uC0DD\uC131`);
    timingsMs.render = Date.now() - renderStart;
    markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
    logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: images.length, elapsedMs: timingsMs.render });
    // --- Stage 3: probe each candidate model on one representative page to
    // time it; the fastest successful model wins. ---
    const probeStart = Date.now();
    currentStage = "probe";
    markStageStart("probe", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC218\uD589 \uC911");
    logStage("info", "probe", "start", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC2DC\uC791", { models });
    const probeImage = await pickRepresentativeImage(images);
    const probeResults = [];
    for (let i = 0; i < models.length; i++) {
      const model = models[i];
      const t0 = Date.now();
      try {
        await ocrImageViaNim({
          imagePath: probeImage,
          prompt: OCR_PROMPT2,
          model,
          maxTokens: modelMaxTokens[model] ?? 8192,
          baseUrl,
          keyPool,
          timeoutMs,
          maxRetries: 2,
          logger,
          stage: "probe"
        });
        probeResults.push({ model, durationMs: Date.now() - t0, success: true });
      } catch (err) {
        // A failed probe does not abort the run; the model is just excluded.
        probeResults.push({
          model,
          durationMs: Date.now() - t0,
          success: false,
          error: err instanceof Error ? err.message : String(err)
        });
      }
      markStageProgress("probe", Math.round((i + 1) / models.length * 100), i + 1, models.length, `\uBAA8\uB378 \uD504\uB85C\uBE0C ${i + 1}/${models.length}`);
      logStage("debug", "probe", "progress", "\uBAA8\uB378 \uD504\uB85C\uBE0C \uC9C4\uD589", { index: i + 1, total: models.length, model, result: probeResults.at(-1) });
    }
    const selectedModel = chooseFastestModel(probeResults);
    if (!selectedModel) throw new UnifiedOcrError("PROBE_FAILED", "probe", "\uC18D\uB3C4 \uD504\uB85C\uBE0C \uC2E4\uD328: \uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uBAA8\uB378\uC774 \uC5C6\uC2B5\uB2C8\uB2E4.");
    // Remaining successful models, fastest first, used as per-page fallbacks.
    const fallbackModelOrder = probeResults.filter((r) => r.success).sort((a, b) => a.durationMs - b.durationMs).map((r) => r.model);
    timingsMs.probe = Date.now() - probeStart;
    await updateModelCache(modelCachePath, probeResults);
    markStageDone("probe", `\uD504\uB85C\uBE0C \uC644\uB8CC: ${selectedModel}`);
    logStage("info", "probe", "done", "\uBAA8\uB378 \uD504\uB85C\uBE0C \uC644\uB8CC", { selectedModel, probeResults, elapsedMs: timingsMs.probe, modelCachePath });
    // --- Stage 4: OCR every page, persisting raw Markdown per page. ---
    const ocrStart = Date.now();
    currentStage = "ocr";
    markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (${selectedModel})`);
    logStage("info", "ocr", "start", "\uD398\uC774\uC9C0 OCR \uC2DC\uC791", { selectedModel, pageCount: images.length });
    const rawPagePaths = [];
    for (let i = 0; i < images.length; i++) {
      const imagePath = images[i];
      const markdown = await ocrImageWithFallback({
        imagePath,
        prompt: OCR_PROMPT2,
        models: fallbackModelOrder,
        modelMaxTokens,
        baseUrl,
        keyPool,
        timeoutMs,
        maxRetriesPerPage,
        logger
      });
      const pagePath = (0, import_path5.join)(rawDir, `page_${String(i + 1).padStart(4, "0")}.md`);
      await (0, import_promises2.writeFile)(pagePath, markdown, "utf-8");
      rawPagePaths.push(pagePath);
      markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}`);
      logStage("debug", "ocr", "progress", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { page: i + 1, total: images.length });
    }
    timingsMs.ocr = Date.now() - ocrStart;
    markStageDone("ocr", "OCR \uC644\uB8CC");
    logStage("info", "ocr", "done", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { elapsedMs: timingsMs.ocr });
    // --- Stage 5: text-only proofread pass with numeric-integrity guard and
    // truncation tagging; a diff summary JSON is written per page. ---
    const proofStart = Date.now();
    currentStage = "proofread";
    markStageStart("proofread", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC9C4\uD589 \uC911");
    logStage("info", "proofread", "start", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC2DC\uC791", { pages: rawPagePaths.length });
    const proofedPaths = [];
    for (let i = 0; i < rawPagePaths.length; i++) {
      const rawMd = await (0, import_promises2.readFile)(rawPagePaths[i], "utf-8");
      const prompt = `${PROOFREAD_PROMPT}

---
${rawMd}
---`;
      const corrected = await ocrImageViaNim({
        textOnlyPrompt: prompt,
        model: selectedModel,
        maxTokens: modelMaxTokens[selectedModel] ?? 8192,
        baseUrl,
        keyPool,
        timeoutMs,
        maxRetries: maxRetriesPerPage,
        logger,
        stage: "proofread"
      });
      // Reject corrections that alter any numeric token, then tag results
      // that look truncated relative to the raw OCR output.
      const safeCorrected = preserveNumericIntegrity(rawMd, corrected);
      const taggedCorrected = addUncertainTag(rawMd, safeCorrected);
      const pagePath = (0, import_path5.join)(proofDir, `page_${String(i + 1).padStart(4, "0")}.md`);
      await (0, import_promises2.writeFile)(pagePath, taggedCorrected, "utf-8");
      await (0, import_promises2.writeFile)(
        (0, import_path5.join)(diffDir, `page_${String(i + 1).padStart(4, "0")}.json`),
        JSON.stringify(buildDiffSummary(rawMd, taggedCorrected), null, 2),
        "utf-8"
      );
      proofedPaths.push(pagePath);
      markStageProgress("proofread", Math.round((i + 1) / rawPagePaths.length * 100), i + 1, rawPagePaths.length, `\uAD50\uC815 ${i + 1}/${rawPagePaths.length}`);
      logStage("debug", "proofread", "progress", "\uD398\uC774\uC9C0 \uAD50\uC815 \uC644\uB8CC", { page: i + 1, total: rawPagePaths.length });
    }
    timingsMs.proofread = Date.now() - proofStart;
    markStageDone("proofread", "\uAD50\uC815 \uC644\uB8CC");
    logStage("info", "proofread", "done", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC644\uB8CC", { elapsedMs: timingsMs.proofread });
    // --- Stage 6: merge proofread pages into the final Markdown output. ---
    const mergeStart = Date.now();
    currentStage = "merge";
    markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
    logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: proofedPaths.length });
    const merged = await mergeMarkdownPages(proofedPaths);
    await (0, import_promises2.writeFile)(outputPath, merged, "utf-8");
    timingsMs.merge = Date.now() - mergeStart;
    markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
    logStage("info", "merge", "done", "\uCD5C\uC885 \uBCD1\uD569 \uC644\uB8CC", { outputPath, elapsedMs: timingsMs.merge });
    // Persist a machine-readable run report for diagnostics.
    const report = {
      inputPath: absInput,
      outputPath,
      workspaceDir,
      selectedModel,
      probeImage,
      probeResults,
      pageCount: images.length,
      keyHealth: keyPool.snapshot(),
      timingsMs,
      modelCachePath
    };
    await (0, import_promises2.writeFile)(reportPath, JSON.stringify(report, null, 2), "utf-8");
    logStage("info", "finalize", "done", "run-report \uC800\uC7A5 \uC644\uB8CC", { reportPath });
    return { outputPath, reportPath, selectedModel };
  } catch (err) {
    // Attribute the failure to the stage that was active, emit a terminal
    // error event, log it, and rethrow the normalized UnifiedOcrError.
    const normalized = normalizePipelineError(err, currentStage);
    emitProgress(options.onEvent, currentStage, 0, stageWeights, {
      type: "error",
      code: normalized.code,
      message: normalized.message
    });
    logger.log({
      level: "error",
      stage: currentStage,
      event: "error",
      message: normalized.message,
      error: {
        code: normalized.code,
        name: normalized.name,
        message: normalized.message,
        stack: normalized.stack
      }
    });
    throw normalized;
  }
}
11599
// Rescale the six stage weights so they sum to 100; an all-zero input maps
// to all-zero output (the sum falls back to 1 to avoid division by zero).
function normalizeWeights(weights) {
  const total = Object.values(weights).reduce((acc, w) => acc + w, 0) || 1;
  const scale = (value) => value / total * 100;
  return {
    convert: scale(weights.convert),
    render: scale(weights.render),
    probe: scale(weights.probe),
    ocr: scale(weights.ocr),
    proofread: scale(weights.proofread),
    merge: scale(weights.merge)
  };
}
11610
// Weighted cumulative progress across the fixed stage order: completed
// stages contribute their full weight, the active stage contributes a
// clamped fraction, later stages contribute nothing. Rounded to an integer.
function computeOverallPercent(stage, stagePercent, weights) {
  const pipeline = ["convert", "render", "probe", "ocr", "proofread", "merge"];
  const clamped = Math.min(100, Math.max(0, stagePercent));
  let total = 0;
  for (const name of pipeline) {
    if (name === stage) {
      total += weights[name] * clamped / 100;
      break;
    }
    total += weights[name];
  }
  return Math.round(total);
}
11622
// Deliver one progress event to the optional callback. The stage-local
// percent is clamped to [0, 100]; the overall percent is derived from the
// normalized stage weights. No-op when the caller did not subscribe.
function emitProgress(cb, stage, stagePercent, weights, extra) {
  if (!cb) return;
  const boundedStagePercent = Math.max(0, Math.min(100, Math.round(stagePercent)));
  const event = {
    type: extra.type ?? "stage_progress",
    stage,
    stagePercent: boundedStagePercent,
    overallPercent: computeOverallPercent(stage, stagePercent, weights),
    current: extra.current,
    total: extra.total,
    code: extra.code,
    message: extra.message
  };
  cb(event);
}
11635
// Promise adapter around the callback-style libreConvert API. Resolves with
// the converted buffer; rejects with a CONVERT_FAILED UnifiedOcrError when
// the converter errors or yields no output.
async function convertWithLibreOffice(buffer, ext) {
  return await new Promise((fulfill, reject) => {
    libreConvert(buffer, ext, void 0, (err, done) => {
      if (err || !done) {
        reject(new UnifiedOcrError("CONVERT_FAILED", "convert", err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"));
      } else {
        fulfill(done);
      }
    });
  });
}
11646
// Rasterize every page of a PDF to PNG files via poppler's pdftoppm,
// wrapping any failure as a RENDER_FAILED UnifiedOcrError.
async function renderPdfToPng(pdfPath, prefixPath, dpi) {
  const args = ["-png", "-r", String(dpi), pdfPath, prefixPath];
  try {
    await runCommand("pdftoppm", args);
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new UnifiedOcrError("RENDER_FAILED", "render", reason);
  }
}
11653
// Run an external command, resolving on exit code 0 and rejecting with the
// captured stderr otherwise. Rejects with the spawn error if the process
// cannot start at all.
//
// Fix: the previous version used stdio: "pipe" but never consumed stdout,
// so a command producing enough stdout could fill the OS pipe buffer and
// stall the child forever. stdin and stdout are now ignored; only stderr
// (needed for the error message) is piped and drained.
async function runCommand(cmd, args) {
  await new Promise((resolvePromise, reject) => {
    const child = (0, import_child_process4.spawn)(cmd, args, { stdio: ["ignore", "ignore", "pipe"] });
    let stderr = "";
    child.stderr.on("data", (chunk) => {
      stderr += String(chunk);
    });
    child.on("error", reject);
    child.on("close", (code) => {
      if (code === 0) resolvePromise();
      else reject(new Error(`${cmd} \uC2E4\uD328 (code=${code}): ${stderr.trim()}`));
    });
  });
}
11667
// Verify LibreOffice is installed by running `soffice --version`; throws a
// SOFFICE_NOT_FOUND UnifiedOcrError when the probe fails for any reason.
async function assertSofficeAvailable() {
  let available = true;
  try {
    await runCommand("soffice", ["--version"]);
  } catch {
    available = false;
  }
  if (!available) {
    throw new UnifiedOcrError("SOFFICE_NOT_FOUND", "convert", "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.");
  }
}
11674
// List rendered PNG pages in the images directory, sorted in natural page
// order, returned as full paths.
async function listPageImages(imagesDir) {
  const entries = await (0, import_promises2.readdir)(imagesDir);
  const pngs = entries.filter((name) => name.endsWith(".png"));
  pngs.sort(naturalPageSort);
  return pngs.map((name) => (0, import_path5.join)(imagesDir, name));
}
11678
// Comparator ordering file names by the last run of digits they contain
// (so page-2 sorts before page-10); names without digits compare as 0.
function naturalPageSort(a, b) {
  const lastNumber = (name) => {
    const runs = name.match(/\d+/g) || ["0"];
    return Number(runs.at(-1) || 0);
  };
  return lastNumber(a) - lastNumber(b);
}
11683
// Choose a representative page for the speed probe: among the first up-to-8
// pages, prefer files larger than 8 KiB (tiny PNGs are likely near-blank),
// then take the median-sized one. Falls back to all sampled pages when none
// exceed the size floor.
async function pickRepresentativeImage(images) {
  const candidates = images.slice(0, Math.min(images.length, 8));
  const sized = [];
  for (const path of candidates) {
    const info = await (0, import_promises2.stat)(path);
    if (info.size > 8 * 1024) sized.push({ path, size: info.size });
  }
  let pool = sized;
  if (pool.length === 0) {
    pool = await Promise.all(candidates.map(async (path) => ({ path, size: (await (0, import_promises2.stat)(path)).size })));
  }
  pool.sort((x, y) => x.size - y.size);
  return pool[Math.floor(pool.length / 2)].path;
}
11694
// Return the model of the successful probe with the lowest latency, or null
// when every probe failed. Ties keep the earliest probe, matching a stable
// ascending sort.
function chooseFastestModel(results) {
  let best = null;
  for (const result of results) {
    if (!result.success) continue;
    if (best === null || result.durationMs < best.durationMs) best = result;
  }
  return best === null ? null : best.model;
}
11698
// Best-effort read of the on-disk model latency cache. Any failure —
// missing file, unreadable path, malformed JSON — degrades to null
// ("no cache") rather than aborting the pipeline.
async function loadModelCache(path) {
  try {
    return JSON.parse(await (0, import_promises2.readFile)(path, "utf-8"));
  } catch {
    return null;
  }
}
11706
// Return a copy of the candidate list ordered by cached average latency
// (ascending); models absent from the cache sort last. Without a cache the
// original order is preserved. The input array is never mutated.
function sortModelsByCache(models, cache) {
  const ordered = [...models];
  if (!cache) return ordered;
  const latency = (model) => cache.models[model]?.avgDurationMs ?? Number.POSITIVE_INFINITY;
  ordered.sort((a, b) => latency(a) - latency(b));
  return ordered;
}
11714
// Fold the latest successful probe timings into the persisted per-model
// running averages (rounded), refresh the timestamp, and write the cache
// back to disk. Failed probes are skipped.
async function updateModelCache(path, probes) {
  const previous = await loadModelCache(path);
  const cache = previous ?? { updatedAt: (/* @__PURE__ */ new Date()).toISOString(), models: {} };
  for (const probe of probes) {
    if (!probe.success) continue;
    const entry = cache.models[probe.model];
    if (entry) {
      const nextCount = entry.count + 1;
      cache.models[probe.model] = {
        count: nextCount,
        avgDurationMs: Math.round((entry.avgDurationMs * entry.count + probe.durationMs) / nextCount)
      };
    } else {
      cache.models[probe.model] = { count: 1, avgDurationMs: probe.durationMs };
    }
  }
  cache.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
  await (0, import_promises2.writeFile)(path, JSON.stringify(cache, null, 2), "utf-8");
}
11733
// OCR one page image, trying each model in the given order until one
// succeeds. The last failure message is carried into the terminal
// OCR_FAILED error when every model fails.
async function ocrImageWithFallback(input) {
  let lastFailure = "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958";
  for (const model of input.models) {
    try {
      const markdown = await ocrImageViaNim({
        imagePath: input.imagePath,
        prompt: input.prompt,
        model,
        maxTokens: input.modelMaxTokens[model] ?? 8192,
        baseUrl: input.baseUrl,
        keyPool: input.keyPool,
        timeoutMs: input.timeoutMs,
        maxRetries: input.maxRetriesPerPage,
        logger: input.logger,
        stage: "ocr"
      });
      return markdown;
    } catch (err) {
      lastFailure = err instanceof Error ? err.message : String(err);
    }
  }
  throw new UnifiedOcrError("OCR_FAILED", "ocr", `\uBAA8\uB4E0 OCR \uBAA8\uB378 \uC2E4\uD328: ${lastFailure}`);
}
11755
// Read each page file in order and join the non-empty, trimmed bodies with
// a blank line between pages.
async function mergeMarkdownPages(paths) {
  const pages = [];
  for (const pagePath of paths) {
    const text = (await (0, import_promises2.readFile)(pagePath, "utf-8")).trim();
    if (text) pages.push(text);
  }
  return pages.join("\n\n");
}
11764
// Single chat-completion call against the NVIDIA NIM endpoint with API-key
// rotation, per-attempt timeout (AbortController), and up to maxRetries
// attempts. Supports two request shapes: text-only (input.textOnlyPrompt,
// used by proofreading) and text+image (base64 PNG data URL). Returns the
// model's text with any surrounding code fence stripped; throws
// UnifiedOcrError("OCR_FAILED") after retries are exhausted.
async function ocrImageViaNim(input) {
  const { model, maxTokens, baseUrl, keyPool, timeoutMs, maxRetries, logger, stage = "ocr" } = input;
  let attempt = 0;
  let lastErr = "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958";
  while (attempt < maxRetries) {
    attempt++;
    let acquired = null;
    // Acquire a key from the rotation pool; if every key is cooling down,
    // wait out the pool's suggested delay WITHOUT consuming an attempt
    // (note: `attempt` was already incremented, so the wait does count
    // against maxRetries — intentional or not, preserved as-is).
    try {
      acquired = keyPool.acquire();
    } catch (err) {
      if (err instanceof AllKeysCoolingDownError) {
        logger?.log({
          level: "warn",
          stage,
          event: "progress",
          message: "\uBAA8\uB4E0 API \uD0A4 cooldown \uC0C1\uD0DC\uB85C \uB300\uAE30",
          meta: { waitMs: err.waitMs, attempt, maxRetries, model }
        });
        await delay(err.waitMs);
        continue;
      }
      throw err;
    }
    try {
      // Build the OpenAI-compatible message content: text-only, or prompt
      // plus the page image inlined as a base64 PNG data URL.
      const content = input.textOnlyPrompt ? [{ type: "text", text: input.textOnlyPrompt }] : [
        { type: "text", text: input.prompt ?? OCR_PROMPT2 },
        {
          type: "image_url",
          image_url: { url: `data:image/png;base64,${await encodeBase64(input.imagePath)}` }
        }
      ];
      // temperature 0 for deterministic extraction.
      const body = {
        model,
        messages: [{ role: "user", content }],
        max_tokens: maxTokens,
        temperature: 0
      };
      logger?.log({
        level: "debug",
        stage,
        event: "progress",
        message: "NIM \uC694\uCCAD \uC2DC\uB3C4",
        meta: { attempt, maxRetries, model, keyId: acquired.keyId, hasImage: Boolean(input.imagePath) }
      });
      // Per-attempt timeout: abort the fetch after timeoutMs.
      const controller = new AbortController();
      const timer = setTimeout(() => controller.abort(), timeoutMs);
      try {
        const resp = await fetch(baseUrl, {
          method: "POST",
          headers: {
            Authorization: `Bearer ${acquired.key}`,
            "Content-Type": "application/json"
          },
          body: JSON.stringify(body),
          signal: controller.signal
        });
        if (resp.ok) {
          const json = await resp.json();
          const text = json.choices?.[0]?.message?.content?.trim() ?? "";
          keyPool.markSuccess(acquired.keyId);
          logger?.log({
            level: "debug",
            stage,
            event: "done",
            message: "NIM \uC751\uB2F5 \uC131\uACF5",
            meta: { attempt, model, keyId: acquired.keyId }
          });
          // An empty response throws here and is caught by the outer catch,
          // which marks the key failed and retries — so an empty body is
          // treated like any other transient failure.
          if (!text) throw new UnifiedOcrError("OCR_FAILED", "ocr", "OCR \uC751\uB2F5\uC774 \uBE44\uC5B4 \uC788\uC2B5\uB2C8\uB2E4.");
          return stripCodeFence3(text);
        }
        // Non-2xx: honor Retry-After (seconds) when present, record the
        // failure on this key, and fall through to the next attempt.
        const retryAfter = Number(resp.headers.get("retry-after") || "0");
        const retryAfterMs = Number.isFinite(retryAfter) && retryAfter > 0 ? retryAfter * 1e3 : void 0;
        keyPool.markFailure(acquired.keyId, { status: resp.status, retryAfterMs });
        lastErr = `NIM \uC751\uB2F5 \uC624\uB958: ${resp.status}`;
        logger?.log({
          level: "warn",
          stage,
          event: "progress",
          message: "NIM \uC751\uB2F5 \uC2E4\uD328",
          meta: { attempt, model, status: resp.status, retryAfterMs, keyId: acquired.keyId }
        });
      } finally {
        // Always disarm the timeout so a fast response does not abort later work.
        clearTimeout(timer);
      }
    } catch (err) {
      // Network errors, aborts (timeouts), and the empty-response throw all
      // land here: mark the key failed, remember the reason, back off 500ms.
      const isTimeout = err instanceof Error && err.name === "AbortError";
      if (acquired) keyPool.markFailure(acquired.keyId, { timeout: isTimeout });
      lastErr = err instanceof Error ? err.message : String(err);
      logger?.log({
        level: "warn",
        stage,
        event: "progress",
        message: "NIM \uC694\uCCAD \uC608\uC678",
        meta: { attempt, model, timeout: isTimeout, keyId: acquired?.keyId },
        error: { message: lastErr, name: err instanceof Error ? err.name : "Error" }
      });
      await delay(500);
    }
  }
  // Retries exhausted: surface the last recorded failure reason.
  logger?.log({
    level: "error",
    stage,
    event: "error",
    message: "NIM \uCD5C\uB300 \uC7AC\uC2DC\uB3C4 \uCD08\uACFC",
    meta: { model, maxRetries },
    error: { code: "OCR_FAILED", message: lastErr }
  });
  throw new UnifiedOcrError("OCR_FAILED", "ocr", `OCR \uC7AC\uC2DC\uB3C4 \uCD08\uACFC: ${lastErr}`);
}
11873
// Read a file and return its contents encoded as a base64 string.
async function encodeBase64(path) {
  const bytes = await (0, import_promises2.readFile)(path);
  return bytes.toString("base64");
}
11877
// Unwrap a response that is ENTIRELY wrapped in a ```markdown / ```md / ```
// code fence; text with anything outside the fence passes through unchanged.
function stripCodeFence3(text) {
  const fencePattern = /^```(?:markdown|md)?\s*([\s\S]*?)```\s*$/i;
  const match = fencePattern.exec(text);
  if (!match) return text;
  return match[1].trim();
}
11881
// Resolve after ms milliseconds; non-positive waits return immediately.
async function delay(ms) {
  if (ms <= 0) return;
  await new Promise((wake) => setTimeout(wake, ms));
}
11885
// Reject input files whose extension the pipeline cannot handle; throws an
// UNSUPPORTED_INPUT UnifiedOcrError for anything outside the allow-list.
function ensureSupportedInput(path) {
  const supported = /* @__PURE__ */ new Set([".pdf", ".hwp", ".hwpx", ".docx", ".xlsx"]);
  const ext = (0, import_path5.extname)(path).toLowerCase();
  if (supported.has(ext)) return;
  throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uC785\uB825 \uD3EC\uB9F7: ${ext}`);
}
11892
// All digit-led runs in the text, including common separators, so values
// like "1,234", "3.5", and "2024-01-02" each come back as one token.
function extractNumericTokens(text) {
  const matches = text.match(/\d[\d,./-]*/g);
  return matches ?? [];
}
11895
// Safety gate for proofreading: corrections must never alter numbers. If the
// corrected text's numeric tokens differ from the raw OCR output in count or
// in any position, the whole correction is discarded in favor of the raw text.
function preserveNumericIntegrity(rawText, correctedText) {
  const before = extractNumericTokens(rawText);
  const after = extractNumericTokens(correctedText);
  if (before.length !== after.length) return rawText;
  const intact = before.every((token, i) => token === after[i]);
  return intact ? correctedText : rawText;
}
11904
// Append a Korean "needs review" marker when the proofread text looks
// truncated relative to the raw OCR output: body shrank below 75%, non-empty
// line count fell below 80%, or a table (>=2 pipe lines) lost over 30% of
// its rows. No-ops when a marker is already present or either side is empty.
function addUncertainTag(rawText, correctedText) {
  if (correctedText.includes("[\uD655\uC778\uD544\uC694:")) return correctedText;
  const rawLen = rawText.trim().length;
  const corrLen = correctedText.trim().length;
  if (rawLen === 0 || corrLen === 0) return correctedText;
  const countLines = (text) => text.split("\n").filter(Boolean).length;
  const countTableLines = (text) => text.split("\n").filter((line) => line.includes("|")).length;
  const rawLines = countLines(rawText);
  const corrLines = countLines(correctedText);
  const rawTableLines = countTableLines(rawText);
  const corrTableLines = countTableLines(correctedText);
  const shrunkBody = corrLen < rawLen * 0.75;
  const shrunkLineCount = corrLines < Math.max(1, Math.floor(rawLines * 0.8));
  const shrunkTable = rawTableLines >= 2 && corrTableLines < Math.floor(rawTableLines * 0.7);
  if (!(shrunkBody || shrunkLineCount || shrunkTable)) return correctedText;
  return `${correctedText}

[\uD655\uC778\uD544\uC694: \uAD50\uC815 \uACB0\uACFC\uAC00 \uCD95\uC57D\uB418\uC5C8\uC744 \uC218 \uC788\uC5B4 \uC6D0\uBB38\uACFC \uB300\uC870\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4.]`;
}
11919
// Minimal per-page diff record persisted alongside the proofread output:
// whether anything changed, plus before/after character counts.
function buildDiffSummary(before, after) {
  const changed = before !== after;
  return { changed, beforeLength: before.length, afterLength: after.length };
}
11926
// Coerce any thrown value into a UnifiedOcrError coded for the pipeline
// stage that was active; errors already carrying a code pass through as-is.
function normalizePipelineError(err, stage) {
  if (err instanceof UnifiedOcrError) return err;
  const stageCodes = {
    convert: "CONVERT_FAILED",
    render: "RENDER_FAILED",
    probe: "PROBE_FAILED",
    ocr: "OCR_FAILED",
    proofread: "PROOFREAD_FAILED",
    merge: "MERGE_FAILED"
  };
  const message = err instanceof Error ? err.message : String(err);
  return new UnifiedOcrError(stageCodes[stage] ?? "UNKNOWN", stage, message);
}
11939
+
10624
11940
  // src/index.ts
10625
11941
  async function parse2(input, options) {
11942
+ const logger = createLoggerFromEnv().withRun(generateRunId("parse")).child({ component: "index.ts", stage: "detect" });
11943
+ logger.log({ level: "info", event: "start", message: "parse \uD638\uCD9C \uC2DC\uC791" });
10626
11944
  let buffer;
10627
11945
  if (typeof input === "string") {
10628
11946
  try {
10629
- const buf = await (0, import_promises.readFile)(input);
11947
+ const buf = await (0, import_promises3.readFile)(input);
10630
11948
  buffer = toArrayBuffer(buf);
10631
11949
  } catch (err) {
10632
11950
  const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
11951
+ logger.log({
11952
+ level: "error",
11953
+ stage: "detect",
11954
+ event: "error",
11955
+ message: msg,
11956
+ error: { code: "PARSE_ERROR", message: msg, name: err instanceof Error ? err.name : "Error" }
11957
+ });
10633
11958
  return { success: false, fileType: "unknown", error: msg, code: "PARSE_ERROR" };
10634
11959
  }
10635
11960
  } else if (Buffer.isBuffer(input)) {
@@ -10638,13 +11963,23 @@ async function parse2(input, options) {
10638
11963
  buffer = input;
10639
11964
  }
10640
11965
  if (!buffer || buffer.byteLength === 0) {
11966
+ logger.log({ level: "error", stage: "detect", event: "error", message: "\uBE48 \uC785\uB825 \uBC84\uD37C", error: { code: "EMPTY_INPUT", message: "\uBE48 \uC785\uB825 \uBC84\uD37C", name: "KordocError" } });
10641
11967
  return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
10642
11968
  }
10643
11969
  const MAX_FILE_SIZE = 500 * 1024 * 1024;
10644
11970
  if (buffer.byteLength > MAX_FILE_SIZE) {
11971
+ logger.log({
11972
+ level: "error",
11973
+ stage: "detect",
11974
+ event: "error",
11975
+ message: "\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC",
11976
+ meta: { size: buffer.byteLength },
11977
+ error: { code: "FILE_TOO_LARGE", message: "\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC", name: "KordocError" }
11978
+ });
10645
11979
  return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
10646
11980
  }
10647
11981
  const format = detectFormat(buffer);
11982
+ logger.log({ level: "info", event: "done", message: "\uD3EC\uB9F7 \uAC10\uC9C0 \uC644\uB8CC", meta: { format } });
10648
11983
  switch (format) {
10649
11984
  case "hwpx": {
10650
11985
  const { format: zipFormat, zip } = await detectZipFormat(buffer);
@@ -10722,7 +12057,8 @@ async function parseHwpx(buffer, options, zip) {
10722
12057
  const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
10723
12058
  return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
10724
12059
  } catch (err) {
10725
- return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
12060
+ const normalized = normalizeKordocError(err, "HWPX \uD30C\uC2F1 \uC2E4\uD328", "finalize");
12061
+ return { success: false, fileType: "hwpx", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
10726
12062
  }
10727
12063
  }
10728
12064
  async function parseHwp(buffer, options) {
@@ -10730,7 +12066,8 @@ async function parseHwp(buffer, options) {
10730
12066
  const { markdown, blocks, metadata, outline, warnings, images } = parseHwp5Document(Buffer.from(buffer), options);
10731
12067
  return { success: true, fileType: "hwp", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
10732
12068
  } catch (err) {
10733
- return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
12069
+ const normalized = normalizeKordocError(err, "HWP \uD30C\uC2F1 \uC2E4\uD328", "finalize");
12070
+ return { success: false, fileType: "hwp", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
10734
12071
  }
10735
12072
  }
10736
12073
  async function parsePdf(buffer, options) {
@@ -10738,8 +12075,15 @@ async function parsePdf(buffer, options) {
10738
12075
  const { markdown, blocks, metadata, outline, warnings, isImageBased } = await parsePdfDocument(buffer, options);
10739
12076
  return { success: true, fileType: "pdf", markdown, blocks, metadata, outline, warnings, isImageBased };
10740
12077
  } catch (err) {
12078
+ const normalized = normalizeKordocError(err, "PDF \uD30C\uC2F1 \uC2E4\uD328", "finalize");
10741
12079
  const isImageBased = err instanceof Error && "isImageBased" in err ? true : void 0;
10742
- return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
12080
+ return {
12081
+ success: false,
12082
+ fileType: "pdf",
12083
+ error: normalized.message,
12084
+ code: normalized.code ?? classifyError(normalized),
12085
+ isImageBased
12086
+ };
10743
12087
  }
10744
12088
  }
10745
12089
  async function parseXlsx(buffer, options, zip) {
@@ -10747,7 +12091,8 @@ async function parseXlsx(buffer, options, zip) {
10747
12091
  const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
10748
12092
  return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
10749
12093
  } catch (err) {
10750
- return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
12094
+ const normalized = normalizeKordocError(err, "XLSX \uD30C\uC2F1 \uC2E4\uD328", "finalize");
12095
+ return { success: false, fileType: "xlsx", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
10751
12096
  }
10752
12097
  }
10753
12098
  async function parseDocx(buffer, options, zip) {
@@ -10755,11 +12100,14 @@ async function parseDocx(buffer, options, zip) {
10755
12100
  const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
10756
12101
  return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
10757
12102
  } catch (err) {
10758
- return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
12103
+ const normalized = normalizeKordocError(err, "DOCX \uD30C\uC2F1 \uC2E4\uD328", "finalize");
12104
+ return { success: false, fileType: "docx", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
10759
12105
  }
10760
12106
  }
10761
12107
  // Annotate the CommonJS export names for ESM import in node:
10762
12108
  0 && (module.exports = {
12109
+ AllKeysCoolingDownError,
12110
+ ApiKeyRotationPool,
10763
12111
  VERSION,
10764
12112
  blocksToMarkdown,
10765
12113
  compare,
@@ -10778,7 +12126,8 @@ async function parseDocx(buffer, options, zip) {
10778
12126
  parseHwp,
10779
12127
  parseHwpx,
10780
12128
  parsePdf,
10781
- parseXlsx
12129
+ parseXlsx,
12130
+ runUnifiedOcrPipeline
10782
12131
  });
10783
12132
  /*! Bundled license information:
10784
12133