@clazic/kordoc 2.4.10 → 2.4.12

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -75,6 +75,224 @@ var init_page_range = __esm({
75
75
  }
76
76
  });
77
77
 
78
+ // src/logging/logger.ts
79
+ import { mkdirSync, appendFileSync } from "fs";
80
+ import { appendFile } from "fs/promises";
81
+ import { basename, dirname, resolve } from "path";
82
+ import { randomUUID } from "crypto";
83
/**
 * Build the process logger from `KORDOC_LOG_*` environment variables.
 *
 * Recognised variables:
 *  - KORDOC_LOG_LEVEL              error | warn | info | debug | trace (default: error)
 *  - KORDOC_LOG_STACK              "1" keeps error stacks in the output
 *  - KORDOC_LOG_FILE               path of a JSONL file to append to (optional)
 *  - KORDOC_LOG_PROGRESS_SAMPLE_MS minimum gap between identical progress events (default 1000)
 *  - KORDOC_LOG_BASENAME_PATHS     "1" reduces path-like meta values to their basename
 *  - KORDOC_LOG_TEXT_LIMIT         maximum length of logged strings (default 400)
 *
 * @param {Record<string, string | undefined>} [env=process.env] Environment to read from.
 * @returns {CompositeLogger} Logger that fans out to the console and, when configured, a JSONL file.
 */
function createLoggerFromEnv(env = process.env) {
  const config = {
    level: parseLevel(env.KORDOC_LOG_LEVEL),
    includeStack: env.KORDOC_LOG_STACK === "1",
    progressSampleMs: parsePositiveInt(env.KORDOC_LOG_PROGRESS_SAMPLE_MS, 1e3),
    basenamePaths: env.KORDOC_LOG_BASENAME_PATHS === "1",
    textLimit: parsePositiveInt(env.KORDOC_LOG_TEXT_LIMIT, 400)
  };
  const sinks = [new ConsoleLogger(config)];
  if (env.KORDOC_LOG_FILE) {
    const filePath = resolve(env.KORDOC_LOG_FILE);
    try {
      sinks.push(new JsonlLogger(config, filePath));
    } catch (err) {
      // JsonlLogger's constructor creates the target directory synchronously and
      // throws when that is impossible. Logging is diagnostic only, so a bad log
      // path must not abort the document parse that asked for a logger.
      const reason = err instanceof Error ? err.message : String(err);
      process.stderr.write(`[kordoc] log file disabled (${filePath}): ${reason}\n`);
    }
  }
  return new CompositeLogger(config, sinks);
}
99
/**
 * Create a short identifier for one processing run.
 *
 * @param {string} [prefix="run"] Text placed before the underscore.
 * @returns {string} `<prefix>_<first 8 characters of a random UUID>`.
 */
function generateRunId(prefix = "run") {
  const shortId = randomUUID().substring(0, 8);
  return "".concat(prefix, "_", shortId);
}
102
/**
 * Normalise a log-level string.
 *
 * @param {string | undefined} input Raw value (case-insensitive).
 * @returns {"error" | "warn" | "info" | "debug" | "trace"} The lower-cased level,
 *   or "error" when the input is missing or not one of the known levels.
 */
function parseLevel(input) {
  const knownLevels = ["error", "warn", "info", "debug", "trace"];
  const candidate = (input || "").toLowerCase();
  return knownLevels.includes(candidate) ? candidate : "error";
}
107
/**
 * Replace credential-looking substrings with a fixed placeholder.
 *
 * Two shapes are masked:
 *  - `nvapi-…` keys become `nvapi-***`
 *  - `Bearer <token>` (any letter case) becomes `Bearer ***`
 *
 * The bearer pattern covers letters, digits, `- . _ ~ + /` and trailing `=`
 * padding. Without `+`, `/` and `=` a base64-encoded token was only masked up
 * to its first such character and the remainder was written to the log.
 *
 * @param {string} input Text that may contain secrets.
 * @returns {string} The text with secrets replaced.
 */
function maskSecrets(input) {
  return input
    .replace(/nvapi-[A-Za-z0-9_\-]+/g, "nvapi-***")
    .replace(/Bearer\s+[A-Za-z0-9\-._~+\/]+=*/gi, "Bearer ***");
}
110
/**
 * Produce a log-safe shallow copy of an event's `meta` object.
 *
 * - Values whose key looks like a credential (authorization, api key, token)
 *   are replaced by "***" regardless of their type.
 * - String values are passed through `maskSecrets`, optionally reduced to a
 *   basename when the key looks path-like and `cfg.basenamePaths` is set, and
 *   finally truncated with `limitText`.
 * - All other values are copied as-is (nested objects are not inspected).
 *
 * @param {Record<string, unknown>} meta Raw metadata.
 * @param {{ basenamePaths: boolean, textLimit: number }} cfg Logger configuration.
 * @returns {Record<string, unknown>} Sanitised copy.
 */
function sanitizeMeta(meta, cfg) {
  const secretKey = /authorization|api[_-]?key|token/i;
  const pathKey = /path|file|dir/i;
  const cleaned = {};
  Object.entries(meta).forEach(([key, value]) => {
    if (secretKey.test(key)) {
      cleaned[key] = "***";
      return;
    }
    if (typeof value !== "string") {
      cleaned[key] = value;
      return;
    }
    const masked = maskSecrets(value);
    const shown = cfg.basenamePaths && pathKey.test(key) ? basename(masked) : masked;
    cleaned[key] = limitText(shown, cfg.textLimit);
  });
  return cleaned;
}
129
/**
 * Parse a non-negative integer setting.
 *
 * Zero is accepted on purpose: callers in this module treat 0 as "feature
 * disabled" (no progress sampling, no text limit). Missing, blank,
 * non-numeric, infinite and negative inputs yield `fallback`.
 *
 * `Number("")` and `Number(null)` are both 0, so blank/unset values are
 * rejected explicitly; otherwise an empty environment variable would silently
 * switch the limit off instead of applying the default.
 *
 * @param {string | number | null | undefined} input Raw value.
 * @param {number} fallback Value returned when `input` is unusable.
 * @returns {number} The parsed value rounded down, or `fallback`.
 */
function parsePositiveInt(input, fallback) {
  if (input == null || String(input).trim() === "") return fallback;
  const n = Number(input);
  if (!Number.isFinite(n) || n < 0) return fallback;
  return Math.floor(n);
}
134
/**
 * Truncate text for logging.
 *
 * @param {string} input Text to shorten.
 * @param {number} maxLen Maximum number of characters kept; 0 or less means no limit.
 * @returns {string} `input` unchanged when it fits, otherwise its first
 *   `maxLen` characters followed by `...(+N)` where N is the number of
 *   characters dropped.
 */
function limitText(input, maxLen) {
  if (maxLen <= 0) return input;
  const overflow = input.length - maxLen;
  if (overflow <= 0) return input;
  return input.slice(0, maxLen) + "...(+" + overflow + ")";
}
139
+ var LEVEL_ORDER, BaseLogger, ConsoleLogger, JsonlLogger, CompositeLogger;
140
var init_logger = __esm({
  "src/logging/logger.ts"() {
    "use strict";
    // Lower number = more severe. An event is logged when its level is at
    // least as severe as the configured level.
    LEVEL_ORDER = {
      error: 0,
      warn: 1,
      info: 2,
      debug: 3,
      trace: 4
    };
    /**
     * Shared filtering, sampling and sanitising logic.
     *
     * Subclasses implement `emit(event)` to write one event. `log(event)`
     * applies the level filter and progress sampling exactly once and then
     * calls `emit`. A bare BaseLogger has no `emit` and discards everything.
     */
    BaseLogger = class _BaseLogger {
      constructor(config, context = {}) {
        this.config = config;
        this.context = context;
      }
      // Last emission time per "run|component|stage|message" key. Static so
      // that child loggers of the same run share one sampling window.
      static progressSeenAt = /* @__PURE__ */ new Map();
      /** @returns {boolean} Whether `level` passes the configured threshold. */
      shouldLog(level) {
        return LEVEL_ORDER[level] <= LEVEL_ORDER[this.config.level];
      }
      /**
       * Rate-limit repeated `progress` events. Non-progress events and
       * warn/error events always pass. Calling this RECORDS the emission time,
       * so it must run at most once per event.
       */
      shouldEmitProgress(ev) {
        if (this.config.progressSampleMs <= 0) return true;
        if ((ev.event ?? "message") !== "progress") return true;
        if (ev.level === "error" || ev.level === "warn") return true;
        const key = [
          this.context.runId ?? ev.runId ?? "no-run",
          this.context.component ?? ev.component ?? "no-component",
          this.context.stage ?? ev.stage ?? "unknown",
          ev.message
        ].join("|");
        const now = Date.now();
        const prev = _BaseLogger.progressSeenAt.get(key) ?? 0;
        if (now - prev < this.config.progressSampleMs) return false;
        _BaseLogger.progressSeenAt.set(key, now);
        return true;
      }
      /**
       * Combine logger context and event into the record that gets written:
       * adds a timestamp, fills stage/event defaults, strips the stack unless
       * enabled, and masks secrets in message, error and meta.
       */
      merge(ev) {
        const out = {
          ...this.context,
          ...ev,
          ts: (/* @__PURE__ */ new Date()).toISOString(),
          level: ev.level,
          stage: ev.stage ?? this.context.stage ?? "unknown",
          event: ev.event ?? "message",
          message: ev.message
        };
        if (out.error) {
          // Work on a copy so the caller's error object is never modified.
          // name/message/stack are read explicitly because they are not
          // enumerable on real Error instances and a plain spread drops them.
          const error = {
            ...out.error,
            name: out.error.name,
            message: out.error.message,
            stack: out.error.stack
          };
          if (!this.config.includeStack) error.stack = void 0;
          if (typeof error.message === "string" && error.message) error.message = maskSecrets(error.message);
          // The stack usually repeats the message, so it needs masking too.
          if (typeof error.stack === "string" && error.stack) error.stack = maskSecrets(error.stack);
          out.error = error;
        }
        if (out.meta) out.meta = sanitizeMeta(out.meta, this.config);
        if (out.message) out.message = limitText(maskSecrets(out.message), this.config.textLimit);
        return out;
      }
      /**
       * Create a logger of the SAME class with extra context merged in.
       * Using `this.constructor` matters: instantiating BaseLogger here turned
       * every child of a ConsoleLogger into a logger that writes nothing.
       */
      child(context) {
        return new this.constructor(this.config, { ...this.context, ...context });
      }
      withRun(runId) {
        return this.child({ runId });
      }
      /** Filter, sample, then hand the event to the subclass's `emit`. */
      log(event) {
        if (typeof this.emit !== "function") return;
        if (!this.shouldLog(event.level)) return;
        if (!this.shouldEmitProgress(event)) return;
        this.emit(event);
      }
    };
    /** Writes human-readable lines: errors to stderr, everything else to stdout. */
    ConsoleLogger = class extends BaseLogger {
      emit(event) {
        const e = this.merge(event);
        const prefix = `[${e.ts}] [${e.level.toUpperCase()}]${e.runId ? ` [${e.runId}]` : ""}${e.stage ? ` [${e.stage}]` : ""}`;
        const line = `${prefix} ${e.message}${e.component ? ` (${e.component})` : ""}`;
        if (e.level === "error") {
          process.stderr.write(line + "\n");
          if (e.error?.stack) process.stderr.write(e.error.stack + "\n");
        } else {
          process.stdout.write(line + "\n");
        }
      }
    };
    /**
     * Appends one JSON object per line to a file. Lines are queued in memory
     * and written in batches ~200 ms after the first queued line; whatever is
     * still queued is written synchronously when the process exits.
     */
    JsonlLogger = class _JsonlLogger extends BaseLogger {
      constructor(config, filePath, context = {}) {
        super(config, context);
        this.filePath = filePath;
        mkdirSync(dirname(filePath), { recursive: true });
        _JsonlLogger.ensureState(filePath);
      }
      // One queue per file path, shared by every logger writing to that path.
      static states = /* @__PURE__ */ new Map();
      static ensureState(path) {
        let state = _JsonlLogger.states.get(path);
        if (!state) {
          state = { queue: [], flushing: false, writeFailed: false };
          _JsonlLogger.states.set(path, state);
          const flushSync = () => {
            const s = _JsonlLogger.states.get(path);
            if (!s || s.queue.length === 0) return;
            const payload = s.queue.join("");
            s.queue = [];
            if (!payload) return;
            try {
              appendFileSync(path, payload, "utf-8");
            } catch {
              // An exception thrown from an exit handler would replace the
              // process's real exit outcome with a logging failure.
            }
          };
          process.on("beforeExit", flushSync);
          process.on("exit", flushSync);
        }
        return state;
      }
      scheduleFlush(path) {
        const state = _JsonlLogger.ensureState(path);
        if (state.timer || state.flushing) return;
        state.timer = setTimeout(() => {
          state.timer = void 0;
          // flush() rejects when the append fails; nothing awaits it, so the
          // rejection is handled here instead of becoming an unhandled one.
          this.flush(path).catch((err) => {
            if (state.writeFailed) return;
            state.writeFailed = true; // report once per file, not per batch
            const reason = err instanceof Error ? err.message : String(err);
            process.stderr.write(`[kordoc] log write failed (${path}): ${reason}\n`);
          });
        }, 200);
      }
      /** Write all queued lines. Rejects if the append fails (that batch is dropped). */
      async flush(path) {
        const state = _JsonlLogger.ensureState(path);
        if (state.flushing) return;
        if (state.queue.length === 0) return;
        state.flushing = true;
        const payload = state.queue.join("");
        state.queue = [];
        try {
          await appendFile(path, payload, "utf-8");
        } finally {
          state.flushing = false;
          // Lines queued while the write was in flight need their own flush.
          if (state.queue.length > 0) this.scheduleFlush(path);
        }
      }
      emit(event) {
        const e = this.merge(event);
        let line;
        try {
          line = JSON.stringify(e);
        } catch {
          // Circular references or BigInt values in meta/context make
          // JSON.stringify throw; keep the essentials instead of failing the caller.
          line = JSON.stringify({
            ts: e.ts,
            level: e.level,
            stage: e.stage,
            event: e.event,
            message: e.message,
            serializationError: true
          });
        }
        const state = _JsonlLogger.ensureState(this.filePath);
        state.queue.push(line + "\n");
        this.scheduleFlush(this.filePath);
      }
      child(context) {
        return new _JsonlLogger(this.config, this.filePath, { ...this.context, ...context });
      }
    };
    /**
     * Fans one event out to several sinks. Filtering and sampling happen once,
     * in the inherited `log`; sinks are then driven through `emit`. Calling
     * each sink's `log` made every sink re-run the progress sampler with the
     * key this logger had just recorded, so sampled progress events were
     * always discarded.
     */
    CompositeLogger = class _CompositeLogger extends BaseLogger {
      constructor(config, sinks, context = {}) {
        super(config, context);
        this.sinks = sinks;
      }
      emit(event) {
        for (const sink of this.sinks) {
          // Sinks that only implement log() keep working through that method.
          if (typeof sink.emit === "function") sink.emit(event);
          else sink.log(event);
        }
      }
      child(context) {
        const nextSinks = this.sinks.map((s) => s.child(context));
        return new _CompositeLogger(this.config, nextSinks, { ...this.context, ...context });
      }
    };
  }
});
295
+
78
296
  // node_modules/cfb/cfb.js
79
297
  var require_cfb = __commonJS({
80
298
  "node_modules/cfb/cfb.js"(exports, module) {
@@ -394,8 +612,8 @@ var require_cfb = __commonJS({
394
612
  }
395
613
  return L.length - R.length;
396
614
  }
397
- function dirname(p) {
398
- if (p.charAt(p.length - 1) == "/") return p.slice(0, -1).indexOf("/") === -1 ? p : dirname(p.slice(0, -1));
615
// Return the directory part of a "/"-separated path, including its trailing "/".
// A path that ends in "/" is treated as a directory and its parent is returned;
// a path with no parent (no "/" besides a trailing one) is returned unchanged.
function dirname4(p) {
  var path = p;
  while (path.charAt(path.length - 1) == "/") {
    var trimmed = path.slice(0, -1);
    if (trimmed.indexOf("/") === -1) return path;
    path = trimmed;
  }
  var cut = path.lastIndexOf("/");
  return cut === -1 ? path : path.slice(0, cut + 1);
}
@@ -816,10 +1034,10 @@ var require_cfb = __commonJS({
816
1034
  data.push([cfb.FullPaths[i2], cfb.FileIndex[i2]]);
817
1035
  }
818
1036
  for (i2 = 0; i2 < data.length; ++i2) {
819
- var dad = dirname(data[i2][0]);
1037
+ var dad = dirname4(data[i2][0]);
820
1038
  s = fullPaths[dad];
821
1039
  while (!s) {
822
- while (dirname(dad) && !fullPaths[dirname(dad)]) dad = dirname(dad);
1040
+ while (dirname4(dad) && !fullPaths[dirname4(dad)]) dad = dirname4(dad);
823
1041
  data.push([dad, {
824
1042
  name: filename(dad).replace("/", ""),
825
1043
  type: 1,
@@ -829,7 +1047,7 @@ var require_cfb = __commonJS({
829
1047
  content: null
830
1048
  }]);
831
1049
  fullPaths[dad] = true;
832
- dad = dirname(data[i2][0]);
1050
+ dad = dirname4(data[i2][0]);
833
1051
  s = fullPaths[dad];
834
1052
  }
835
1053
  }
@@ -855,13 +1073,13 @@ var require_cfb = __commonJS({
855
1073
  elt.size = 0;
856
1074
  elt.type = 5;
857
1075
  } else if (nm.slice(-1) == "/") {
858
- for (j = i2 + 1; j < data.length; ++j) if (dirname(cfb.FullPaths[j]) == nm) break;
1076
+ for (j = i2 + 1; j < data.length; ++j) if (dirname4(cfb.FullPaths[j]) == nm) break;
859
1077
  elt.C = j >= data.length ? -1 : j;
860
- for (j = i2 + 1; j < data.length; ++j) if (dirname(cfb.FullPaths[j]) == dirname(nm)) break;
1078
+ for (j = i2 + 1; j < data.length; ++j) if (dirname4(cfb.FullPaths[j]) == dirname4(nm)) break;
861
1079
  elt.R = j >= data.length ? -1 : j;
862
1080
  elt.type = 1;
863
1081
  } else {
864
- if (dirname(cfb.FullPaths[i2 + 1] || "") == dirname(nm)) elt.R = i2 + 1;
1082
+ if (dirname4(cfb.FullPaths[i2 + 1] || "") == dirname4(nm)) elt.R = i2 + 1;
865
1083
  elt.type = 2;
866
1084
  }
867
1085
  }
@@ -2029,13 +2247,13 @@ var init_auto_detect = __esm({
2029
2247
 
2030
2248
  // src/ocr/cli-provider.ts
2031
2249
  import { spawnSync } from "child_process";
2032
- import { writeFileSync, readFileSync, unlinkSync, mkdirSync } from "fs";
2250
+ import { writeFileSync, readFileSync, unlinkSync, mkdirSync as mkdirSync2 } from "fs";
2033
2251
  import { join } from "path";
2034
2252
  import { tmpdir } from "os";
2035
2253
  function getTempDir() {
2036
2254
  if (!_tempDir) {
2037
- _tempDir = join(process.cwd(), "_kordoc_ocr_tmp");
2038
- mkdirSync(_tempDir, { recursive: true });
2255
+ _tempDir = join(process.cwd(), ".kordoc_ocr_tmp");
2256
+ mkdirSync2(_tempDir, { recursive: true });
2039
2257
  }
2040
2258
  return _tempDir;
2041
2259
  }
@@ -2224,7 +2442,7 @@ async function createTesseractPoolProvider(concurrency) {
2224
2442
  const waitQueue = [];
2225
2443
  function acquire() {
2226
2444
  if (idle.length > 0) return Promise.resolve(idle.pop());
2227
- return new Promise((resolve) => waitQueue.push(resolve));
2445
+ return new Promise((resolve4) => waitQueue.push(resolve4));
2228
2446
  }
2229
2447
  function release(w) {
2230
2448
  if (waitQueue.length > 0) {
@@ -2260,13 +2478,13 @@ __export(batch_provider_exports, {
2260
2478
  createBatchCliProvider: () => createBatchCliProvider
2261
2479
  });
2262
2480
  import { spawn, execSync as execSync2 } from "child_process";
2263
- import { writeFileSync as writeFileSync2, readFileSync as readFileSync2, unlinkSync as unlinkSync2, mkdirSync as mkdirSync2 } from "fs";
2481
+ import { writeFileSync as writeFileSync2, readFileSync as readFileSync2, unlinkSync as unlinkSync2, mkdirSync as mkdirSync3 } from "fs";
2264
2482
  import { join as join2 } from "path";
2265
2483
  import { tmpdir as tmpdir2 } from "os";
2266
2484
  function getBatchTempDir() {
2267
2485
  if (!_batchTempDir) {
2268
2486
  _batchTempDir = join2(process.cwd(), ".kordoc_ocr_tmp");
2269
- mkdirSync2(_batchTempDir, { recursive: true });
2487
+ mkdirSync3(_batchTempDir, { recursive: true });
2270
2488
  if (process.platform === "win32") {
2271
2489
  try {
2272
2490
  execSync2(`attrib +h "${_batchTempDir}"`, { stdio: "ignore" });
@@ -2317,7 +2535,7 @@ function createBatchCliProvider(mode, batchSize) {
2317
2535
  };
2318
2536
  }
2319
2537
  function spawnAsync(cmd, args, opts) {
2320
- return new Promise((resolve, reject) => {
2538
+ return new Promise((resolve4, reject) => {
2321
2539
  const child = spawn(cmd, args, {
2322
2540
  cwd: opts.cwd,
2323
2541
  env: process.env,
@@ -2353,7 +2571,7 @@ function spawnAsync(cmd, args, opts) {
2353
2571
  if (killed) {
2354
2572
  reject(new Error(`\uD0C0\uC784\uC544\uC6C3 (${Math.round(opts.timeoutMs / 1e3)}\uCD08)`));
2355
2573
  } else {
2356
- resolve({ stdout, stderr, exitCode: code ?? 1 });
2574
+ resolve4({ stdout, stderr, exitCode: code ?? 1 });
2357
2575
  }
2358
2576
  });
2359
2577
  child.on("error", (err) => {
@@ -2453,7 +2671,10 @@ __export(resolve_exports, {
2453
2671
  resolveOcrProvider: () => resolveOcrProvider
2454
2672
  });
2455
2673
  async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2674
+ const logger = createLoggerFromEnv().child({ component: "ocr/resolve.ts", stage: "ocr" });
2675
+ logger.log({ level: "debug", event: "start", message: "OCR provider resolve \uC2DC\uC791", meta: { mode, concurrency, batchSize } });
2456
2676
  if (mode === "off") {
2677
+ logger.log({ level: "warn", event: "error", message: "OCR \uBE44\uD65C\uC131\uD654 \uBAA8\uB4DC \uC694\uCCAD" });
2457
2678
  throw new Error("OCR\uC774 \uBE44\uD65C\uC131\uD654\uB418\uC5B4 \uC788\uC2B5\uB2C8\uB2E4 (--ocr off).");
2458
2679
  }
2459
2680
  if (mode !== "auto") {
@@ -2461,21 +2682,27 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2461
2682
  if (mode === "tesseract") {
2462
2683
  const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
2463
2684
  if (concurrency && concurrency > 1) {
2685
+ logger.log({ level: "info", event: "done", message: "Tesseract pool provider \uC120\uD0DD", meta: { concurrency } });
2464
2686
  return createTesseractPoolProvider2(concurrency);
2465
2687
  }
2688
+ logger.log({ level: "info", event: "done", message: "Tesseract single provider \uC120\uD0DD" });
2466
2689
  return createTesseractProvider2();
2467
2690
  }
2468
2691
  if (mode === "gemini" || mode === "claude" || mode === "codex") {
2469
2692
  const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
2470
2693
  const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[mode];
2471
2694
  if (effectiveBatch > 1) {
2695
+ logger.log({ level: "info", event: "done", message: "Batch CLI provider \uC120\uD0DD", meta: { mode, batchSize: effectiveBatch } });
2472
2696
  return createBatchCliProvider2(mode, effectiveBatch);
2473
2697
  }
2698
+ logger.log({ level: "info", event: "done", message: "CLI provider \uC120\uD0DD", meta: { mode } });
2474
2699
  return createCliOcrProvider(mode);
2475
2700
  }
2701
+ logger.log({ level: "info", event: "done", message: "CLI provider \uC120\uD0DD", meta: { mode } });
2476
2702
  return createCliOcrProvider(mode);
2477
2703
  }
2478
2704
  const detected = detectAvailableOcr();
2705
+ logger.log({ level: "info", event: "progress", message: "OCR auto \uAC10\uC9C0 \uACB0\uACFC", meta: { detected } });
2479
2706
  if (detected !== "codex") {
2480
2707
  if (detected === "tesseract") {
2481
2708
  warnings?.push({
@@ -2492,18 +2719,23 @@ async function resolveOcrProvider(mode, warnings, concurrency, batchSize) {
2492
2719
  if (detected === "tesseract") {
2493
2720
  const { createTesseractProvider: createTesseractProvider2, createTesseractPoolProvider: createTesseractPoolProvider2 } = await Promise.resolve().then(() => (init_tesseract_provider(), tesseract_provider_exports));
2494
2721
  if (concurrency && concurrency > 1) {
2722
+ logger.log({ level: "info", event: "done", message: "AUTO: Tesseract pool provider \uC120\uD0DD", meta: { concurrency } });
2495
2723
  return createTesseractPoolProvider2(concurrency);
2496
2724
  }
2725
+ logger.log({ level: "info", event: "done", message: "AUTO: Tesseract single provider \uC120\uD0DD" });
2497
2726
  return createTesseractProvider2();
2498
2727
  }
2499
2728
  if (detected === "gemini" || detected === "codex" || detected === "claude") {
2500
2729
  const { createBatchCliProvider: createBatchCliProvider2, DEFAULT_BATCH_SIZES: DEFAULT_BATCH_SIZES2 } = await Promise.resolve().then(() => (init_batch_provider(), batch_provider_exports));
2501
2730
  const effectiveBatch = batchSize ?? DEFAULT_BATCH_SIZES2[detected];
2502
2731
  if (effectiveBatch > 1) {
2732
+ logger.log({ level: "info", event: "done", message: "AUTO: Batch CLI provider \uC120\uD0DD", meta: { mode: detected, batchSize: effectiveBatch } });
2503
2733
  return createBatchCliProvider2(detected, effectiveBatch);
2504
2734
  }
2735
+ logger.log({ level: "info", event: "done", message: "AUTO: CLI provider \uC120\uD0DD", meta: { mode: detected } });
2505
2736
  return createCliOcrProvider(detected);
2506
2737
  }
2738
+ logger.log({ level: "info", event: "done", message: "AUTO: CLI provider \uC120\uD0DD", meta: { mode: detected } });
2507
2739
  return createCliOcrProvider(detected);
2508
2740
  }
2509
2741
  var init_resolve = __esm({
@@ -2511,6 +2743,7 @@ var init_resolve = __esm({
2511
2743
  "use strict";
2512
2744
  init_auto_detect();
2513
2745
  init_cli_provider();
2746
+ init_logger();
2514
2747
  }
2515
2748
  });
2516
2749
 
@@ -2670,9 +2903,18 @@ function isBatchProvider(p) {
2670
2903
  return !!p && typeof p === "object" && "__batch" in p && p.__batch === true;
2671
2904
  }
2672
2905
  async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
2906
+ const logger = createLoggerFromEnv().child({ component: "ocr/provider.ts", stage: "ocr" });
2907
+ logger.log({
2908
+ level: "info",
2909
+ event: "start",
2910
+ message: "OCR \uD398\uC774\uC9C0 \uCC98\uB9AC \uC2DC\uC791",
2911
+ meta: { effectivePageCount, concurrency, filteredPages: pageFilter?.size, batchProvider: isBatchProvider(provider) }
2912
+ });
2673
2913
  const blocks = [];
2674
2914
  if (isBatchProvider(provider)) {
2675
- return ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, onProgress);
2915
+ const result = await ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency, onProgress);
2916
+ logger.log({ level: "info", event: "done", message: "OCR \uBC30\uCE58 \uCC98\uB9AC \uC644\uB8CC", meta: { blocks: result.length } });
2917
+ return result;
2676
2918
  }
2677
2919
  if (concurrency <= 1) {
2678
2920
  for (let i = 1; i <= effectivePageCount; i++) {
@@ -2688,8 +2930,16 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
2688
2930
  message: `\uD398\uC774\uC9C0 ${i} OCR \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2689
2931
  code: "OCR_PAGE_FAILED"
2690
2932
  });
2933
+ logger.log({
2934
+ level: "warn",
2935
+ event: "progress",
2936
+ message: "\uD398\uC774\uC9C0 OCR \uC2E4\uD328",
2937
+ meta: { page: i },
2938
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error" }
2939
+ });
2691
2940
  }
2692
2941
  }
2942
+ logger.log({ level: "info", event: "done", message: "OCR \uC21C\uCC28 \uCC98\uB9AC \uC644\uB8CC", meta: { blocks: blocks.length } });
2693
2943
  return blocks;
2694
2944
  }
2695
2945
  const pageNumbers = [];
@@ -2709,6 +2959,13 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
2709
2959
  message: `\uD398\uC774\uC9C0 ${pageNum} OCR \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
2710
2960
  code: "OCR_PAGE_FAILED"
2711
2961
  });
2962
+ logger.log({
2963
+ level: "warn",
2964
+ event: "progress",
2965
+ message: "\uD398\uC774\uC9C0 OCR \uC2E4\uD328(\uBCD1\uB82C)",
2966
+ meta: { page: pageNum },
2967
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error" }
2968
+ });
2712
2969
  return null;
2713
2970
  }
2714
2971
  });
@@ -2717,6 +2974,7 @@ async function ocrPages(doc, provider, pageFilter, effectivePageCount, warnings,
2717
2974
  if (!item) continue;
2718
2975
  for (const b of item.pageBlocks) blocks.push(b);
2719
2976
  }
2977
+ logger.log({ level: "info", event: "done", message: "OCR \uBCD1\uB82C \uCC98\uB9AC \uC644\uB8CC", meta: { blocks: blocks.length, pages: pageNumbers.length } });
2720
2978
  return blocks;
2721
2979
  }
2722
2980
  async function ocrPagesBatch(doc, provider, pageFilter, effectivePageCount, warnings, concurrency = 1, onProgress) {
@@ -2799,11 +3057,12 @@ var init_provider = __esm({
2799
3057
  "src/ocr/provider.ts"() {
2800
3058
  "use strict";
2801
3059
  init_markdown_to_blocks();
3060
+ init_logger();
2802
3061
  }
2803
3062
  });
2804
3063
 
2805
3064
  // src/index.ts
2806
- import { readFile } from "fs/promises";
3065
+ import { readFile as readFile2 } from "fs/promises";
2807
3066
 
2808
3067
  // src/detect.ts
2809
3068
  import JSZip from "jszip";
@@ -2856,7 +3115,7 @@ import JSZip2 from "jszip";
2856
3115
  import { DOMParser } from "@xmldom/xmldom";
2857
3116
 
2858
3117
  // src/utils.ts
2859
- var VERSION = true ? "2.4.10" : "0.0.0-dev";
3118
+ var VERSION = true ? "2.4.12" : "0.0.0-dev";
2860
3119
  function toArrayBuffer(buf) {
2861
3120
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
2862
3121
  return buf.buffer;
@@ -2864,9 +3123,13 @@ function toArrayBuffer(buf) {
2864
3123
  return buf.buffer.slice(buf.byteOffset, buf.byteOffset + buf.byteLength);
2865
3124
  }
2866
3125
  var KordocError = class extends Error {
2867
- constructor(message) {
3126
+ code;
3127
+ stage;
3128
+ constructor(message, opts = {}) {
2868
3129
  super(message);
2869
3130
  this.name = "KordocError";
3131
+ this.code = opts.code;
3132
+ this.stage = opts.stage;
2870
3133
  }
2871
3134
  };
2872
3135
  function isPathTraversal(name) {
@@ -2930,6 +3193,16 @@ function classifyError(err) {
2930
3193
  if (msg.includes("\uC2DC\uADF8\uB2C8\uCC98") || msg.includes("\uBCF5\uAD6C\uD560 \uC218 \uC5C6")) return "CORRUPTED";
2931
3194
  return "PARSE_ERROR";
2932
3195
  }
3196
/**
 * Coerce any thrown value into a KordocError carrying a code and a stage.
 *
 * An existing KordocError is returned as the same object; only its missing
 * `stage` / `code` fields are filled in. Any other Error is wrapped with its
 * own message and a code from `classifyError`. Non-Error values get
 * `fallbackMessage` and `fallbackCode`.
 *
 * @param {unknown} err Value that was thrown.
 * @param {string} fallbackMessage Message used when `err` provides none.
 * @param {string} [stage="unknown"] Pipeline stage to record.
 * @param {string} [fallbackCode="PARSE_ERROR"] Code used when none can be derived.
 * @returns {KordocError}
 */
function normalizeKordocError(err, fallbackMessage, stage = "unknown", fallbackCode = "PARSE_ERROR") {
  if (err instanceof KordocError) {
    err.stage ||= stage;
    err.code ||= fallbackCode;
    return err;
  }
  const isError = err instanceof Error;
  const message = (isError ? err.message : fallbackMessage) || fallbackMessage;
  const code = isError ? classifyError(err) : fallbackCode;
  return new KordocError(message, { code, stage });
}
2933
3206
 
2934
3207
  // src/table/builder.ts
2935
3208
  var MAX_COLS = 200;
@@ -3192,6 +3465,7 @@ var HEADING_RATIO_H3 = 1.15;
3192
3465
 
3193
3466
  // src/hwpx/parser.ts
3194
3467
  init_page_range();
3468
+ init_logger();
3195
3469
  var MAX_DECOMPRESS_SIZE = 500 * 1024 * 1024;
3196
3470
  var MAX_ZIP_ENTRIES = 2e3;
3197
3471
  function clampSpan(val, max) {
@@ -3283,50 +3557,89 @@ function stripDtd(xml) {
3283
3557
  return xml.replace(/<!DOCTYPE\s[^[>]*(\[[\s\S]*?\])?\s*>/gi, "");
3284
3558
  }
3285
3559
  async function parseHwpxDocument(buffer, options, existingZip) {
3286
- precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
3287
- let zip;
3560
+ const logger = createLoggerFromEnv().child({ component: "hwpx/parser.ts", stage: "detect" });
3561
+ logger.log({ level: "info", event: "start", message: "HWPX \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
3562
+ let lastParsedSection = 0;
3288
3563
  try {
3289
- zip = existingZip ?? await JSZip2.loadAsync(buffer);
3290
- } catch {
3291
- return await extractFromBrokenZip(buffer);
3292
- }
3293
- const actualEntryCount = Object.keys(zip.files).length;
3294
- if (actualEntryCount > MAX_ZIP_ENTRIES) {
3295
- throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
3296
- }
3297
- const decompressed = { total: 0 };
3298
- const metadata = {};
3299
- await extractHwpxMetadata(zip, metadata, decompressed);
3300
- const styleMap = await extractHwpxStyles(zip, decompressed);
3301
- const warnings = [];
3302
- const sectionPaths = await resolveSectionPaths(zip);
3303
- if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
3304
- metadata.pageCount = sectionPaths.length;
3305
- const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
3306
- const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
3307
- const blocks = [];
3308
- let parsedSections = 0;
3309
- for (let si = 0; si < sectionPaths.length; si++) {
3310
- if (pageFilter && !pageFilter.has(si + 1)) continue;
3311
- const file = zip.file(sectionPaths[si]);
3312
- if (!file) continue;
3564
+ precheckZipSize(buffer, MAX_DECOMPRESS_SIZE, MAX_ZIP_ENTRIES);
3565
+ let zip;
3313
3566
  try {
3314
- const xml = await file.async("text");
3315
- decompressed.total += xml.length * 2;
3316
- if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
3317
- blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
3318
- parsedSections++;
3319
- options?.onProgress?.(parsedSections, totalTarget);
3320
- } catch (secErr) {
3321
- if (secErr instanceof KordocError) throw secErr;
3322
- warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
3323
- }
3324
- }
3325
- const images = await extractImagesFromZip(zip, blocks, decompressed, warnings);
3326
- detectHwpxHeadings(blocks, styleMap);
3327
- const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
3328
- const markdown = blocksToMarkdown(blocks);
3329
- return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
3567
+ zip = existingZip ?? await JSZip2.loadAsync(buffer);
3568
+ } catch {
3569
+ return await extractFromBrokenZip(buffer);
3570
+ }
3571
+ const actualEntryCount = Object.keys(zip.files).length;
3572
+ if (actualEntryCount > MAX_ZIP_ENTRIES) {
3573
+ throw new KordocError("ZIP \uC5D4\uD2B8\uB9AC \uC218 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
3574
+ }
3575
+ const decompressed = { total: 0 };
3576
+ const metadata = {};
3577
+ await extractHwpxMetadata(zip, metadata, decompressed);
3578
+ const styleMap = await extractHwpxStyles(zip, decompressed);
3579
+ const warnings = [];
3580
+ const sectionPaths = await resolveSectionPaths(zip);
3581
+ if (sectionPaths.length === 0) throw new KordocError("HWPX\uC5D0\uC11C \uC139\uC158 \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
3582
+ metadata.pageCount = sectionPaths.length;
3583
+ logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uC139\uC158 \uACBD\uB85C \uD574\uC11D \uC644\uB8CC", meta: { sections: sectionPaths.length } });
3584
+ const pageFilter = options?.pages ? parsePageRange(options.pages, sectionPaths.length) : null;
3585
+ const totalTarget = pageFilter ? pageFilter.size : sectionPaths.length;
3586
+ const blocks = [];
3587
+ let parsedSections = 0;
3588
+ for (let si = 0; si < sectionPaths.length; si++) {
3589
+ if (pageFilter && !pageFilter.has(si + 1)) continue;
3590
+ const file = zip.file(sectionPaths[si]);
3591
+ if (!file) continue;
3592
+ try {
3593
+ const xml = await file.async("text");
3594
+ decompressed.total += xml.length * 2;
3595
+ if (decompressed.total > MAX_DECOMPRESS_SIZE) throw new KordocError("ZIP \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (ZIP bomb \uC758\uC2EC)");
3596
+ blocks.push(...parseSectionXml(xml, styleMap, warnings, si + 1));
3597
+ parsedSections++;
3598
+ options?.onProgress?.(parsedSections, totalTarget);
3599
+ logger.log({
3600
+ level: "debug",
3601
+ stage: "convert",
3602
+ event: "progress",
3603
+ message: "\uC139\uC158 \uD30C\uC2F1 \uC644\uB8CC",
3604
+ meta: { section: si + 1, parsedSections, totalTarget }
3605
+ });
3606
+ lastParsedSection = si + 1;
3607
+ } catch (secErr) {
3608
+ if (secErr instanceof KordocError) throw secErr;
3609
+ warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
3610
+ logger.log({
3611
+ level: "warn",
3612
+ stage: "convert",
3613
+ event: "progress",
3614
+ message: "\uC139\uC158 \uD30C\uC2F1 \uC2E4\uD328",
3615
+ meta: { section: si + 1 },
3616
+ error: { message: secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: secErr instanceof Error ? secErr.name : "Error" }
3617
+ });
3618
+ }
3619
+ }
3620
+ const images = await extractImagesFromZip(zip, blocks, decompressed, warnings);
3621
+ detectHwpxHeadings(blocks, styleMap);
3622
+ const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
3623
+ const markdown = blocksToMarkdown(blocks);
3624
+ logger.log({
3625
+ level: "info",
3626
+ stage: "finalize",
3627
+ event: "done",
3628
+ message: "HWPX \uD30C\uC2F1 \uC644\uB8CC",
3629
+ meta: { blocks: blocks.length, warnings: warnings.length, images: images.length, outline: outline.length }
3630
+ });
3631
+ return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
3632
+ } catch (err) {
3633
+ logger.log({
3634
+ level: "error",
3635
+ stage: "finalize",
3636
+ event: "error",
3637
+ message: "HWPX \uD30C\uC2F1 \uC2E4\uD328",
3638
+ meta: { lastParsedSection },
3639
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
3640
+ });
3641
+ throw err;
3642
+ }
3330
3643
  }
3331
3644
  function imageExtToMime(ext) {
3332
3645
  switch (ext.toLowerCase()) {
@@ -5043,75 +5356,115 @@ function parseLenientCfb(data) {
5043
5356
 
5044
5357
  // src/hwp5/parser.ts
5045
5358
  init_page_range();
5359
+ init_logger();
5046
5360
  var CFB = __toESM(require_cfb(), 1);
5047
5361
  var MAX_SECTIONS = 100;
5048
5362
  var MAX_TOTAL_DECOMPRESS = 500 * 1024 * 1024;
5049
5363
  function parseHwp5Document(buffer, options) {
5050
- let cfb = null;
5051
- let lenientCfb = null;
5052
- const warnings = [];
5364
+ const logger = createLoggerFromEnv().child({ component: "hwp5/parser.ts", stage: "detect" });
5365
+ logger.log({ level: "info", event: "start", message: "HWP5 \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.length } });
5366
+ let lastParsedSection = 0;
5053
5367
  try {
5054
- cfb = CFB.parse(buffer);
5055
- } catch {
5368
+ let cfb = null;
5369
+ let lenientCfb = null;
5370
+ const warnings = [];
5056
5371
  try {
5057
- lenientCfb = parseLenientCfb(buffer);
5058
- warnings.push({ message: "\uC190\uC0C1\uB41C CFB \uCEE8\uD14C\uC774\uB108 \u2014 lenient \uBAA8\uB4DC\uB85C \uBCF5\uAD6C", code: "LENIENT_CFB_RECOVERY" });
5372
+ cfb = CFB.parse(buffer);
5059
5373
  } catch {
5060
- throw new KordocError("CFB \uCEE8\uD14C\uC774\uB108 \uD30C\uC2F1 \uC2E4\uD328 (strict \uBC0F lenient \uBAA8\uB450)");
5374
+ try {
5375
+ lenientCfb = parseLenientCfb(buffer);
5376
+ warnings.push({ message: "\uC190\uC0C1\uB41C CFB \uCEE8\uD14C\uC774\uB108 \u2014 lenient \uBAA8\uB4DC\uB85C \uBCF5\uAD6C", code: "LENIENT_CFB_RECOVERY" });
5377
+ } catch {
5378
+ throw new KordocError("CFB \uCEE8\uD14C\uC774\uB108 \uD30C\uC2F1 \uC2E4\uD328 (strict \uBC0F lenient \uBAA8\uB450)");
5379
+ }
5061
5380
  }
5062
- }
5063
- const findStream = (path) => {
5064
- if (cfb) {
5065
- const entry = CFB.find(cfb, path);
5066
- return entry?.content ? Buffer.from(entry.content) : null;
5381
+ const findStream = (path) => {
5382
+ if (cfb) {
5383
+ const entry = CFB.find(cfb, path);
5384
+ return entry?.content ? Buffer.from(entry.content) : null;
5385
+ }
5386
+ return lenientCfb.findStream(path);
5387
+ };
5388
+ const headerData = findStream("/FileHeader");
5389
+ if (!headerData) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
5390
+ const header = parseFileHeader(headerData);
5391
+ if (header.signature !== "HWP Document File") throw new KordocError("HWP \uC2DC\uADF8\uB2C8\uCC98 \uBD88\uC77C\uCE58");
5392
+ if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
5393
+ if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
5394
+ const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
5395
+ const distribution = (header.flags & FLAG_DISTRIBUTION) !== 0;
5396
+ const metadata = {
5397
+ version: `${header.versionMajor}.x`
5398
+ };
5399
+ if (cfb) extractHwp5Metadata(cfb, metadata);
5400
+ const docInfo = cfb ? parseDocInfoStream(cfb, compressed) : parseDocInfoFromStream(findStream("/DocInfo"), compressed);
5401
+ const sections = distribution ? cfb ? findViewTextSections(cfb, compressed) : findViewTextSectionsLenient(lenientCfb, compressed) : cfb ? findSections(cfb) : findSectionsLenient(lenientCfb, compressed);
5402
+ if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
5403
+ logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uC139\uC158 \uBAA9\uB85D \uD574\uC11D \uC644\uB8CC", meta: { sections: sections.length, distribution } });
5404
+ metadata.pageCount = sections.length;
5405
+ const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
5406
+ const totalTarget = pageFilter ? pageFilter.size : sections.length;
5407
+ const blocks = [];
5408
+ let totalDecompressed = 0;
5409
+ let parsedSections = 0;
5410
+ for (let si = 0; si < sections.length; si++) {
5411
+ if (pageFilter && !pageFilter.has(si + 1)) continue;
5412
+ try {
5413
+ const sectionData = sections[si];
5414
+ const data = !distribution && compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
5415
+ totalDecompressed += data.length;
5416
+ if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
5417
+ const records = readRecords(data);
5418
+ const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
5419
+ blocks.push(...sectionBlocks);
5420
+ parsedSections++;
5421
+ options?.onProgress?.(parsedSections, totalTarget);
5422
+ logger.log({
5423
+ level: "debug",
5424
+ stage: "convert",
5425
+ event: "progress",
5426
+ message: "\uC139\uC158 \uD30C\uC2F1 \uC644\uB8CC",
5427
+ meta: { section: si + 1, parsedSections, totalTarget }
5428
+ });
5429
+ lastParsedSection = si + 1;
5430
+ } catch (secErr) {
5431
+ if (secErr instanceof KordocError) throw secErr;
5432
+ warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
5433
+ logger.log({
5434
+ level: "warn",
5435
+ stage: "convert",
5436
+ event: "progress",
5437
+ message: "\uC139\uC158 \uD30C\uC2F1 \uC2E4\uD328",
5438
+ meta: { section: si + 1 },
5439
+ error: { message: secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: secErr instanceof Error ? secErr.name : "Error" }
5440
+ });
5441
+ }
5067
5442
  }
5068
- return lenientCfb.findStream(path);
5069
- };
5070
- const headerData = findStream("/FileHeader");
5071
- if (!headerData) throw new KordocError("FileHeader \uC2A4\uD2B8\uB9BC \uC5C6\uC74C");
5072
- const header = parseFileHeader(headerData);
5073
- if (header.signature !== "HWP Document File") throw new KordocError("HWP \uC2DC\uADF8\uB2C8\uCC98 \uBD88\uC77C\uCE58");
5074
- if (header.flags & FLAG_ENCRYPTED) throw new KordocError("\uC554\uD638\uD654\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
5075
- if (header.flags & FLAG_DRM) throw new KordocError("DRM \uBCF4\uD638\uB41C HWP\uB294 \uC9C0\uC6D0\uD558\uC9C0 \uC54A\uC2B5\uB2C8\uB2E4");
5076
- const compressed = (header.flags & FLAG_COMPRESSED) !== 0;
5077
- const distribution = (header.flags & FLAG_DISTRIBUTION) !== 0;
5078
- const metadata = {
5079
- version: `${header.versionMajor}.x`
5080
- };
5081
- if (cfb) extractHwp5Metadata(cfb, metadata);
5082
- const docInfo = cfb ? parseDocInfoStream(cfb, compressed) : parseDocInfoFromStream(findStream("/DocInfo"), compressed);
5083
- const sections = distribution ? cfb ? findViewTextSections(cfb, compressed) : findViewTextSectionsLenient(lenientCfb, compressed) : cfb ? findSections(cfb) : findSectionsLenient(lenientCfb, compressed);
5084
- if (sections.length === 0) throw new KordocError("\uC139\uC158 \uC2A4\uD2B8\uB9BC\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
5085
- metadata.pageCount = sections.length;
5086
- const pageFilter = options?.pages ? parsePageRange(options.pages, sections.length) : null;
5087
- const totalTarget = pageFilter ? pageFilter.size : sections.length;
5088
- const blocks = [];
5089
- let totalDecompressed = 0;
5090
- let parsedSections = 0;
5091
- for (let si = 0; si < sections.length; si++) {
5092
- if (pageFilter && !pageFilter.has(si + 1)) continue;
5093
- try {
5094
- const sectionData = sections[si];
5095
- const data = !distribution && compressed ? decompressStream(Buffer.from(sectionData)) : Buffer.from(sectionData);
5096
- totalDecompressed += data.length;
5097
- if (totalDecompressed > MAX_TOTAL_DECOMPRESS) throw new KordocError("\uCD1D \uC555\uCD95 \uD574\uC81C \uD06C\uAE30 \uCD08\uACFC (decompression bomb \uC758\uC2EC)");
5098
- const records = readRecords(data);
5099
- const sectionBlocks = parseSection(records, docInfo, warnings, si + 1);
5100
- blocks.push(...sectionBlocks);
5101
- parsedSections++;
5102
- options?.onProgress?.(parsedSections, totalTarget);
5103
- } catch (secErr) {
5104
- if (secErr instanceof KordocError) throw secErr;
5105
- warnings.push({ page: si + 1, message: `\uC139\uC158 ${si + 1} \uD30C\uC2F1 \uC2E4\uD328: ${secErr instanceof Error ? secErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
5106
- }
5107
- }
5108
- const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
5109
- if (docInfo) {
5110
- detectHwp5Headings(blocks, docInfo);
5111
- }
5112
- const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
5113
- const markdown = blocksToMarkdown(blocks);
5114
- return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
5443
+ const images = cfb ? extractHwp5Images(cfb, blocks, compressed, warnings) : extractHwp5ImagesLenient(lenientCfb, blocks, compressed, warnings);
5444
+ if (docInfo) {
5445
+ detectHwp5Headings(blocks, docInfo);
5446
+ }
5447
+ const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
5448
+ const markdown = blocksToMarkdown(blocks);
5449
+ logger.log({
5450
+ level: "info",
5451
+ stage: "finalize",
5452
+ event: "done",
5453
+ message: "HWP5 \uD30C\uC2F1 \uC644\uB8CC",
5454
+ meta: { blocks: blocks.length, warnings: warnings.length, images: images.length, outline: outline.length }
5455
+ });
5456
+ return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0, images: images.length > 0 ? images : void 0 };
5457
+ } catch (err) {
5458
+ logger.log({
5459
+ level: "error",
5460
+ stage: "finalize",
5461
+ event: "error",
5462
+ message: "HWP5 \uD30C\uC2F1 \uC2E4\uD328",
5463
+ meta: { lastParsedSection },
5464
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
5465
+ });
5466
+ throw err;
5467
+ }
5115
5468
  }
5116
5469
  function parseDocInfoStream(cfb, compressed) {
5117
5470
  try {
@@ -5658,6 +6011,8 @@ function arrangeCells(rows, cols, cells) {
5658
6011
 
5659
6012
  // src/pdf/parser.ts
5660
6013
  init_page_range();
6014
+ import { createRequire } from "module";
6015
+ import { dirname as dirname2, join as join3, resolve as resolve2 } from "path";
5661
6016
 
5662
6017
  // src/pdf/line-detector.ts
5663
6018
  import { OPS } from "pdfjs-dist/legacy/build/pdf.mjs";
@@ -5845,12 +6200,17 @@ function buildTableGrids(horizontals, verticals) {
5845
6200
  const rawXs = vLines.map((l) => l.x1);
5846
6201
  const colXs = clusterCoordinates(rawXs).sort((a, b) => a - b);
5847
6202
  if (rowYs.length < 2 || colXs.length < 2) continue;
6203
+ const rowCount = rowYs.length - 1;
6204
+ const colCount = colXs.length - 1;
6205
+ if (rowCount <= 0 || colCount <= 0) continue;
6206
+ if (rowCount * colCount < 2) continue;
5848
6207
  const bbox = {
5849
6208
  x1: colXs[0],
5850
6209
  y1: rowYs[rowYs.length - 1],
5851
6210
  x2: colXs[colXs.length - 1],
5852
6211
  y2: rowYs[0]
5853
6212
  };
6213
+ if (!hasReliableGridStructure(rowYs, colXs, hLines, vLines, bbox)) continue;
5854
6214
  grids.push({ rowYs, colXs, bbox });
5855
6215
  }
5856
6216
  return mergeAdjacentGrids(grids);
@@ -5900,6 +6260,35 @@ function clusterCoordinates(values) {
5900
6260
  }
5901
6261
  return clusters.map((c) => c.sum / c.count);
5902
6262
  }
6263
+ function hasReliableGridStructure(rowYs, colXs, hLines, vLines, bbox) {
6264
+ const internalRows = rowYs.slice(1, -1);
6265
+ const internalCols = colXs.slice(1, -1);
6266
+ const width = Math.max(1, bbox.x2 - bbox.x1);
6267
+ const height = Math.max(1, bbox.y2 - bbox.y1);
6268
+ const coverageThreshold = 0.55;
6269
+ const coveredRows = internalRows.filter(
6270
+ (y) => hLines.some((h) => Math.abs(h.y1 - y) <= COORD_MERGE_TOL && lineOverlapRatio(h.x1, h.x2, bbox.x1, bbox.x2) >= coverageThreshold)
6271
+ ).length;
6272
+ const coveredCols = internalCols.filter(
6273
+ (x) => vLines.some((v) => Math.abs(v.x1 - x) <= COORD_MERGE_TOL && lineOverlapRatio(v.y1, v.y2, bbox.y1, bbox.y2) >= coverageThreshold)
6274
+ ).length;
6275
+ const rowCoverage = internalRows.length > 0 ? coveredRows / internalRows.length : 1;
6276
+ const colCoverage = internalCols.length > 0 ? coveredCols / internalCols.length : 1;
6277
+ const longHorizontal = hLines.filter((h) => Math.abs(h.x2 - h.x1) >= width * 0.7).length;
6278
+ const longVertical = vLines.filter((v) => Math.abs(v.y2 - v.y1) >= height * 0.7).length;
6279
+ const hasAxisSupport = longHorizontal >= 2 && longVertical >= 2;
6280
+ if (!hasAxisSupport) return false;
6281
+ if (internalRows.length > 0 && rowCoverage < 0.5) return false;
6282
+ if (internalCols.length > 0 && colCoverage < 0.5) return false;
6283
+ return true;
6284
+ }
6285
+ function lineOverlapRatio(a1, a2, b1, b2) {
6286
+ const left = Math.max(Math.min(a1, a2), Math.min(b1, b2));
6287
+ const right = Math.min(Math.max(a1, a2), Math.max(b1, b2));
6288
+ const overlap = Math.max(0, right - left);
6289
+ const target = Math.max(1, Math.abs(b2 - b1));
6290
+ return overlap / target;
6291
+ }
5903
6292
  function groupConnectedLines(lines) {
5904
6293
  const parent = lines.map((_, i) => i);
5905
6294
  function find2(x) {
@@ -6276,6 +6665,9 @@ function buildClusterTable(rows, columns, pageNum) {
6276
6665
  };
6277
6666
  }
6278
6667
 
6668
+ // src/pdf/parser.ts
6669
+ init_logger();
6670
+
6279
6671
  // src/pdf/polyfill.ts
6280
6672
  import * as pdfjsWorker from "pdfjs-dist/legacy/build/pdf.worker.mjs";
6281
6673
  var g = globalThis;
@@ -6296,6 +6688,17 @@ g.pdfjsWorker = pdfjsWorker;
6296
6688
  // src/pdf/parser.ts
6297
6689
  import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mjs";
6298
6690
  GlobalWorkerOptions.workerSrc = "";
6691
+ var require2 = createRequire(
6692
+ typeof __filename !== "undefined" ? __filename : resolve2(process.cwd(), "kordoc.require.cjs")
6693
+ );
6694
+ function resolvePdfjsWasmUrl() {
6695
+ try {
6696
+ const pdfjsPkg = require2.resolve("pdfjs-dist/package.json");
6697
+ return join3(dirname2(pdfjsPkg), "wasm/");
6698
+ } catch {
6699
+ return resolve2(process.cwd(), "node_modules/pdfjs-dist/wasm/");
6700
+ }
6701
+ }
6299
6702
  var MAX_PAGES = 5e3;
6300
6703
  var MAX_TOTAL_TEXT = 500 * 1024 * 1024;
6301
6704
  function calcPdfTimeout(bufferSize) {
@@ -6311,7 +6714,8 @@ async function loadPdfWithTimeout(buffer) {
6311
6714
  data: new Uint8Array(buffer),
6312
6715
  useSystemFonts: true,
6313
6716
  disableFontFace: true,
6314
- isEvalSupported: false
6717
+ isEvalSupported: false,
6718
+ wasmUrl: resolvePdfjsWasmUrl()
6315
6719
  });
6316
6720
  let timer;
6317
6721
  try {
@@ -6328,7 +6732,47 @@ async function loadPdfWithTimeout(buffer) {
6328
6732
  if (timer !== void 0) clearTimeout(timer);
6329
6733
  }
6330
6734
  }
6735
+ function estimateImageBasedPdf(metrics) {
6736
+ if (metrics.length === 0) {
6737
+ return { isImageBased: true, score: 1, reason: "\uC0D8\uD50C \uD1B5\uACC4 \uC5C6\uC74C" };
6738
+ }
6739
+ const totalPages = metrics.length;
6740
+ const totalChars = metrics.reduce((s, m) => s + m.nonWhitespaceChars, 0);
6741
+ const totalItems = metrics.reduce((s, m) => s + m.visibleItems, 0);
6742
+ const pagesWithText = metrics.filter((m) => m.nonWhitespaceChars >= 20 || m.visibleItems >= 15).length;
6743
+ const avgChars = totalChars / totalPages;
6744
+ const avgItems = totalItems / totalPages;
6745
+ const textPresenceRatio = pagesWithText / totalPages;
6746
+ let score = 0;
6747
+ if (avgChars < 10) score += 0.45;
6748
+ if (avgItems < 8) score += 0.35;
6749
+ if (textPresenceRatio < 0.35) score += 0.25;
6750
+ if (avgChars > 40) score -= 0.35;
6751
+ if (avgItems > 25) score -= 0.35;
6752
+ if (textPresenceRatio > 0.7) score -= 0.25;
6753
+ score = Math.max(0, Math.min(1, score));
6754
+ const isImageBased = score >= 0.5;
6755
+ const reason = `avgChars=${avgChars.toFixed(1)}, avgItems=${avgItems.toFixed(1)}, textPresence=${(textPresenceRatio * 100).toFixed(0)}%, score=${score.toFixed(2)}`;
6756
+ return { isImageBased, score, reason };
6757
+ }
6758
+ function summarizePartialFailures(failedPages, totalTarget) {
6759
+ if (failedPages.length === 0) return null;
6760
+ const sorted = [...failedPages].sort((a, b) => a - b);
6761
+ const preview = sorted.slice(0, 10).join(", ");
6762
+ const suffix = sorted.length > 10 ? ` \uC678 ${sorted.length - 10}\uD398\uC774\uC9C0` : "";
6763
+ return `\uBD80\uBD84 \uD30C\uC2F1 \uC2E4\uD328 \uC694\uC57D: ${sorted.length}/${totalTarget}\uD398\uC774\uC9C0 \uC2E4\uD328 (p${preview}${suffix})`;
6764
+ }
6765
+ function shouldAbortForPartialFailures(failedPages, totalTarget, maxPartialFailureRatio) {
6766
+ if (typeof maxPartialFailureRatio !== "number") {
6767
+ return { abort: false, ratio: 0, threshold: 0 };
6768
+ }
6769
+ const threshold = Math.max(0, Math.min(1, maxPartialFailureRatio));
6770
+ const ratio = totalTarget > 0 ? failedPages.length / totalTarget : 0;
6771
+ return { abort: ratio > threshold, ratio, threshold };
6772
+ }
6331
6773
  async function parsePdfDocument(buffer, options) {
6774
+ const logger = createLoggerFromEnv().child({ component: "pdf/parser.ts", stage: "detect" });
6775
+ logger.log({ level: "info", event: "start", message: "PDF \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
6332
6776
  const doc = await loadPdfWithTimeout(buffer);
6333
6777
  try {
6334
6778
  const pageCount = doc.numPages;
@@ -6337,9 +6781,13 @@ async function parsePdfDocument(buffer, options) {
6337
6781
  await extractPdfMetadata(doc, metadata);
6338
6782
  const blocks = [];
6339
6783
  const warnings = [];
6784
+ const failedPages = [];
6785
+ let lastParsedPage2 = 0;
6786
+ const sampleMetricsByPage = /* @__PURE__ */ new Map();
6340
6787
  let totalChars = 0;
6341
6788
  let totalTextBytes = 0;
6342
6789
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
6790
+ logger.log({ level: "debug", event: "progress", message: "PDF \uB85C\uB529 \uC644\uB8CC", meta: { pageCount, effectivePageCount } });
6343
6791
  const pageFilter = options?.pages ? parsePageRange(options.pages, effectivePageCount) : null;
6344
6792
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
6345
6793
  const fontSizeFreq = /* @__PURE__ */ new Map();
@@ -6376,11 +6824,17 @@ async function parsePdfDocument(buffer, options) {
6376
6824
  totalChars += t.replace(/\s/g, "").length;
6377
6825
  totalTextBytes += t.length * 2;
6378
6826
  }
6827
+ sampleMetricsByPage.set(i, {
6828
+ nonWhitespaceChars: visible.reduce((sum, it) => sum + it.text.replace(/\s/g, "").length, 0),
6829
+ visibleItems: visible.length
6830
+ });
6831
+ lastParsedPage2 = i;
6379
6832
  if (totalTextBytes > MAX_TOTAL_TEXT) throw new KordocError("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
6380
6833
  parsedPages++;
6381
6834
  options?.onProgress?.(parsedPages, totalTarget);
6382
6835
  } catch (pageErr) {
6383
6836
  if (pageErr instanceof KordocError) throw pageErr;
6837
+ if (!failedPages.includes(i)) failedPages.push(i);
6384
6838
  warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
6385
6839
  }
6386
6840
  };
@@ -6397,8 +6851,21 @@ async function parsePdfDocument(buffer, options) {
6397
6851
  for (const si of sampledIndices) {
6398
6852
  await parseSinglePage(targetPageNums[si]);
6399
6853
  }
6400
- const sampleParsed = parsedPages || sampledIndices.size;
6401
- const isImageBased = totalChars / Math.max(sampleParsed, 1) < 10;
6854
+ const sampledMetrics = [];
6855
+ for (const si of sampledIndices) {
6856
+ const pageNum = targetPageNums[si];
6857
+ const m = sampleMetricsByPage.get(pageNum);
6858
+ if (m) sampledMetrics.push(m);
6859
+ }
6860
+ const imageBasedDecision = estimateImageBasedPdf(sampledMetrics);
6861
+ const isImageBased = imageBasedDecision.isImageBased;
6862
+ logger.log({
6863
+ level: "info",
6864
+ stage: "probe",
6865
+ event: "done",
6866
+ message: "\uC774\uBBF8\uC9C0 \uAE30\uBC18 \uD310\uC815",
6867
+ meta: { isImageBased, reason: imageBasedDecision.reason, sampledPages: sampledMetrics.length }
6868
+ });
6402
6869
  if (!isImageBased) {
6403
6870
  for (let si = 0; si < targetPageNums.length; si++) {
6404
6871
  if (!sampledIndices.has(si)) {
@@ -6406,11 +6873,41 @@ async function parsePdfDocument(buffer, options) {
6406
6873
  }
6407
6874
  }
6408
6875
  }
6876
+ const partialSummary = summarizePartialFailures(failedPages, totalTarget);
6877
+ if (partialSummary) {
6878
+ warnings.push({
6879
+ message: partialSummary,
6880
+ code: "PARTIAL_PARSE"
6881
+ });
6882
+ }
6883
+ if (isImageBased) {
6884
+ warnings.push({
6885
+ message: `\uC774\uBBF8\uC9C0 \uAE30\uBC18 \uD310\uC815: ${imageBasedDecision.reason}`,
6886
+ code: "OCR_FALLBACK"
6887
+ });
6888
+ }
6889
+ const partialPolicy = shouldAbortForPartialFailures(
6890
+ failedPages,
6891
+ totalTarget,
6892
+ options?.maxPartialFailureRatio
6893
+ );
6894
+ if (partialPolicy.abort) {
6895
+ throw new KordocError(
6896
+ `\uBD80\uBD84 \uD30C\uC2F1 \uC2E4\uD328 \uBE44\uC728 \uCD08\uACFC: ${(partialPolicy.ratio * 100).toFixed(1)}% (\uD5C8\uC6A9 ${(partialPolicy.threshold * 100).toFixed(1)}%)`
6897
+ );
6898
+ }
6409
6899
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
6410
6900
  if (isImageBased) {
6411
6901
  const ocrMode = options?.ocrMode ?? "auto";
6412
6902
  const concurrency = options?.ocrConcurrency ?? 1;
6413
6903
  const batchSize = options?.ocrBatchSize;
6904
+ logger.log({
6905
+ level: "info",
6906
+ stage: "ocr",
6907
+ event: "start",
6908
+ message: "\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF OCR \uC2DC\uC791",
6909
+ meta: { ocrMode, concurrency, batchSize, totalTarget }
6910
+ });
6414
6911
  if (ocrMode === "off") {
6415
6912
  throw Object.assign(new KordocError(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
6416
6913
  }
@@ -6418,8 +6915,10 @@ async function parsePdfDocument(buffer, options) {
6418
6915
  const { ocrPages: ocrPages2 } = await Promise.resolve().then(() => (init_provider(), provider_exports));
6419
6916
  const tryProvider = async (provider, filter) => {
6420
6917
  try {
6918
+ logger.log({ level: "debug", stage: "ocr", event: "progress", message: "OCR provider \uC2E4\uD589", meta: { filteredPages: filter?.size } });
6421
6919
  return await ocrPages2(doc, provider, filter, effectivePageCount, warnings, concurrency, options?.onProgress);
6422
6920
  } catch {
6921
+ logger.log({ level: "warn", stage: "ocr", event: "progress", message: "OCR provider \uC2E4\uD589 \uC2E4\uD328(\uBE48 \uACB0\uACFC\uB85C \uCC98\uB9AC)" });
6423
6922
  return [];
6424
6923
  } finally {
6425
6924
  const terminable = provider;
@@ -6442,6 +6941,7 @@ async function parsePdfDocument(buffer, options) {
6442
6941
  for (const mode of getAutoFallbackChain2()) {
6443
6942
  if (pendingPages.size === 0) break;
6444
6943
  try {
6944
+ logger.log({ level: "info", stage: "ocr", event: "progress", message: "OCR \uC5D4\uC9C4 \uC2DC\uB3C4", meta: { mode, pendingPages: pendingPages.size } });
6445
6945
  const modeFilter = pendingPages.size < effectivePageCount ? new Set(pendingPages) : pageFilter;
6446
6946
  const provider = await resolveOcrProvider2(mode, warnings, concurrency, batchSize);
6447
6947
  const blocks2 = await tryProvider(provider, modeFilter);
@@ -6456,10 +6956,20 @@ async function parsePdfDocument(buffer, options) {
6456
6956
  code: "OCR_CLI_FALLBACK"
6457
6957
  });
6458
6958
  }
6959
+ logger.log({ level: "info", stage: "ocr", event: "progress", message: "OCR \uC5D4\uC9C4 \uCC98\uB9AC \uC644\uB8CC", meta: { mode, blocks: blocks2.length, pendingPages: pendingPages.size } });
6459
6960
  } else {
6460
6961
  warnings.push({ message: `OCR: '${mode}' \uACB0\uACFC \uC5C6\uC74C, \uB2E4\uC74C \uC5D4\uC9C4\uC73C\uB85C \uC2DC\uB3C4`, code: "OCR_CLI_FALLBACK" });
6962
+ logger.log({ level: "warn", stage: "ocr", event: "progress", message: "OCR \uC5D4\uC9C4 \uACB0\uACFC \uC5C6\uC74C", meta: { mode } });
6461
6963
  }
6462
- } catch {
6964
+ } catch (engineErr) {
6965
+ logger.log({
6966
+ level: "warn",
6967
+ stage: "ocr",
6968
+ event: "progress",
6969
+ message: "OCR \uC5D4\uC9C4 \uCD08\uAE30\uD654/\uC2E4\uD589 \uC2E4\uD328",
6970
+ meta: { mode },
6971
+ error: { message: engineErr instanceof Error ? engineErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: engineErr instanceof Error ? engineErr.name : "Error" }
6972
+ });
6463
6973
  }
6464
6974
  }
6465
6975
  allOcrBlocks.sort((a, b) => (a.pageNumber ?? 0) - (b.pageNumber ?? 0));
@@ -6477,6 +6987,7 @@ async function parsePdfDocument(buffer, options) {
6477
6987
  }
6478
6988
  if (ocrBlocks.length > 0) {
6479
6989
  const ocrMarkdown = blocksToMarkdown(ocrBlocks);
6990
+ logger.log({ level: "info", stage: "ocr", event: "done", message: "\uC774\uBBF8\uC9C0 \uAE30\uBC18 OCR \uC644\uB8CC", meta: { blocks: ocrBlocks.length } });
6480
6991
  return {
6481
6992
  markdown: ocrMarkdown,
6482
6993
  blocks: ocrBlocks,
@@ -6502,8 +7013,25 @@ async function parsePdfDocument(buffer, options) {
6502
7013
  }
6503
7014
  detectMarkerHeadings(blocks);
6504
7015
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
6505
- let markdown = cleanPdfText(blocksToMarkdown(blocks));
7016
+ let markdown = cleanPdfText(blocksToMarkdown(blocks), options?.pdfTextNormalization ?? "default");
7017
+ logger.log({
7018
+ level: "info",
7019
+ stage: "finalize",
7020
+ event: "done",
7021
+ message: "PDF \uD30C\uC2F1 \uC644\uB8CC",
7022
+ meta: { blocks: blocks.length, warnings: warnings.length, outline: outline.length, isImageBased: false }
7023
+ });
6506
7024
  return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
7025
+ } catch (err) {
7026
+ logger.log({
7027
+ level: "error",
7028
+ stage: "finalize",
7029
+ event: "error",
7030
+ message: "PDF \uD30C\uC2F1 \uC2E4\uD328",
7031
+ meta: { lastParsedPage },
7032
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
7033
+ });
7034
+ throw err;
6507
7035
  } finally {
6508
7036
  await doc.destroy().catch(() => {
6509
7037
  });
@@ -6597,6 +7125,17 @@ function shouldDemoteTable(table) {
6597
7125
  const emptyCells = totalCells - allCells.length;
6598
7126
  if (table.rows <= 2 && emptyCells > totalCells * 0.5) return true;
6599
7127
  if (table.rows === 1 && !/\d{2,}/.test(allText)) return true;
7128
+ if (table.cols >= 3 && table.rows <= 4) {
7129
+ const markerCells = allCells.filter((t) => /^[□■◆○●▶▷◇◆]/.test(t)).length;
7130
+ const numericCells = allCells.filter((t) => /\d/.test(t)).length;
7131
+ if (markerCells >= Math.max(1, Math.floor(allCells.length * 0.35)) && numericCells <= Math.floor(allCells.length * 0.15)) {
7132
+ return true;
7133
+ }
7134
+ }
7135
+ if (table.cols >= 3 && table.rows >= 2) {
7136
+ const sparseRows = table.cells.filter((row) => row.filter((c) => c.text.trim()).length <= 1).length;
7137
+ if (sparseRows >= Math.ceil(table.rows * 0.7)) return true;
7138
+ }
6600
7139
  return false;
6601
7140
  }
6602
7141
  function demoteTableToText(table) {
@@ -7152,10 +7691,15 @@ function mergeLineSimple(items) {
7152
7691
  }
7153
7692
  return result;
7154
7693
  }
7155
- function cleanPdfText(text) {
7156
- return mergeKoreanLines(
7157
- text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "")
7158
- ).replace(/^(?!\|).{3,30}$/gm, (line) => collapseEvenSpacing(line)).replace(/\n{3,}/g, "\n\n").trim();
7694
+ function stripPdfPageNumberArtifacts(text) {
7695
+ return text.replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "");
7696
+ }
7697
+ function cleanPdfText(text, mode = "default") {
7698
+ const stripped = stripPdfPageNumberArtifacts(text);
7699
+ if (mode === "strict-preserve") {
7700
+ return stripped.replace(/\n{4,}/g, "\n\n\n").trim();
7701
+ }
7702
+ return mergeKoreanLines(stripped).replace(/^(?!\|).{3,30}$/gm, (line) => collapseEvenSpacing(line)).replace(/\n{3,}/g, "\n\n").trim();
7159
7703
  }
7160
7704
  function startsWithMarker(line) {
7161
7705
  const t = line.trimStart();
@@ -7359,6 +7903,7 @@ function mergeKoreanLines(text) {
7359
7903
  // src/xlsx/parser.ts
7360
7904
  import JSZip3 from "jszip";
7361
7905
  import { DOMParser as DOMParser2 } from "@xmldom/xmldom";
7906
+ init_logger();
7362
7907
  var MAX_SHEETS = 100;
7363
7908
  var MAX_DECOMPRESS_SIZE3 = 500 * 1024 * 1024;
7364
7909
  var MAX_ROWS2 = 1e4;
@@ -7548,105 +8093,145 @@ function sheetToBlocks(sheetName, grid, merges, maxRow, maxCol, sheetIndex) {
7548
8093
  return blocks;
7549
8094
  }
7550
8095
  async function parseXlsxDocument(buffer, options, existingZip) {
7551
- precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
7552
- const zip = existingZip ?? await JSZip3.loadAsync(buffer);
7553
- const warnings = [];
7554
- const workbookFile = zip.file("xl/workbook.xml");
7555
- if (!workbookFile) {
7556
- throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
7557
- }
7558
- let sharedStrings = [];
7559
- const ssFile = zip.file("xl/sharedStrings.xml");
7560
- if (ssFile) {
7561
- sharedStrings = parseSharedStrings(await ssFile.async("text"));
7562
- }
7563
- const sheets = parseWorkbook(await workbookFile.async("text"));
7564
- if (sheets.length === 0) {
7565
- throw new KordocError("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
7566
- }
7567
- let relsMap = /* @__PURE__ */ new Map();
7568
- const relsFile = zip.file("xl/_rels/workbook.xml.rels");
7569
- if (relsFile) {
7570
- relsMap = parseRels(await relsFile.async("text"));
7571
- }
7572
- let pageFilter = null;
7573
- if (options?.pages) {
7574
- const { parsePageRange: parsePageRange2 } = await Promise.resolve().then(() => (init_page_range(), page_range_exports));
7575
- pageFilter = parsePageRange2(options.pages, sheets.length);
7576
- }
7577
- const blocks = [];
7578
- const processedSheets = Math.min(sheets.length, MAX_SHEETS);
7579
- let totalCells = 0;
7580
- for (let i = 0; i < processedSheets; i++) {
7581
- if (pageFilter && !pageFilter.has(i + 1)) continue;
7582
- const sheet = sheets[i];
7583
- options?.onProgress?.(i + 1, processedSheets);
7584
- let sheetPath = relsMap.get(sheet.rId);
7585
- if (sheetPath) {
7586
- if (!sheetPath.startsWith("xl/") && !sheetPath.startsWith("/")) {
7587
- sheetPath = `xl/${sheetPath}`;
7588
- } else if (sheetPath.startsWith("/")) {
7589
- sheetPath = sheetPath.slice(1);
7590
- }
7591
- } else {
7592
- sheetPath = `xl/worksheets/sheet${i + 1}.xml`;
7593
- }
7594
- const sheetFile = zip.file(sheetPath);
7595
- if (!sheetFile) {
7596
- warnings.push({
7597
- page: i + 1,
7598
- message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${sheetPath}`,
7599
- code: "PARTIAL_PARSE"
7600
- });
7601
- continue;
8096
+ const logger = createLoggerFromEnv().child({ component: "xlsx/parser.ts", stage: "detect" });
8097
+ logger.log({ level: "info", event: "start", message: "XLSX \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
8098
+ let lastProcessedSheet = 0;
8099
+ try {
8100
+ precheckZipSize(buffer, MAX_DECOMPRESS_SIZE3);
8101
+ const zip = existingZip ?? await JSZip3.loadAsync(buffer);
8102
+ const warnings = [];
8103
+ const workbookFile = zip.file("xl/workbook.xml");
8104
+ if (!workbookFile) {
8105
+ throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 XLSX \uD30C\uC77C: xl/workbook.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
8106
+ }
8107
+ let sharedStrings = [];
8108
+ const ssFile = zip.file("xl/sharedStrings.xml");
8109
+ if (ssFile) {
8110
+ sharedStrings = parseSharedStrings(await ssFile.async("text"));
8111
+ }
8112
+ const sheets = parseWorkbook(await workbookFile.async("text"));
8113
+ if (sheets.length === 0) {
8114
+ throw new KordocError("XLSX \uD30C\uC77C\uC5D0 \uC2DC\uD2B8\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4");
8115
+ }
8116
+ logger.log({ level: "debug", event: "progress", message: "\uC2DC\uD2B8 \uBAA9\uB85D \uB85C\uB4DC", meta: { sheets: sheets.length } });
8117
+ let relsMap = /* @__PURE__ */ new Map();
8118
+ const relsFile = zip.file("xl/_rels/workbook.xml.rels");
8119
+ if (relsFile) {
8120
+ relsMap = parseRels(await relsFile.async("text"));
8121
+ }
8122
+ let pageFilter = null;
8123
+ if (options?.pages) {
8124
+ const { parsePageRange: parsePageRange2 } = await Promise.resolve().then(() => (init_page_range(), page_range_exports));
8125
+ pageFilter = parsePageRange2(options.pages, sheets.length);
7602
8126
  }
7603
- try {
7604
- const sheetXml = await sheetFile.async("text");
7605
- const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
7606
- totalCells += maxRow * maxCol;
7607
- if (totalCells > MAX_TOTAL_CELLS) {
7608
- warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
7609
- break;
8127
+ const blocks = [];
8128
+ const processedSheets = Math.min(sheets.length, MAX_SHEETS);
8129
+ let totalCells = 0;
8130
+ for (let i = 0; i < processedSheets; i++) {
8131
+ if (pageFilter && !pageFilter.has(i + 1)) continue;
8132
+ const sheet = sheets[i];
8133
+ options?.onProgress?.(i + 1, processedSheets);
8134
+ let sheetPath = relsMap.get(sheet.rId);
8135
+ if (sheetPath) {
8136
+ if (!sheetPath.startsWith("xl/") && !sheetPath.startsWith("/")) {
8137
+ sheetPath = `xl/${sheetPath}`;
8138
+ } else if (sheetPath.startsWith("/")) {
8139
+ sheetPath = sheetPath.slice(1);
8140
+ }
8141
+ } else {
8142
+ sheetPath = `xl/worksheets/sheet${i + 1}.xml`;
8143
+ }
8144
+ const sheetFile = zip.file(sheetPath);
8145
+ if (!sheetFile) {
8146
+ warnings.push({
8147
+ page: i + 1,
8148
+ message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${sheetPath}`,
8149
+ code: "PARTIAL_PARSE"
8150
+ });
8151
+ continue;
8152
+ }
8153
+ try {
8154
+ const sheetXml = await sheetFile.async("text");
8155
+ const { grid, merges, maxRow, maxCol } = parseWorksheet(sheetXml, sharedStrings);
8156
+ totalCells += maxRow * maxCol;
8157
+ if (totalCells > MAX_TOTAL_CELLS) {
8158
+ warnings.push({ message: `\uCD1D \uC140 \uC218 \uC81C\uD55C \uCD08\uACFC (${totalCells.toLocaleString()}\uC140), \uC774\uD6C4 \uC2DC\uD2B8 \uC0DD\uB7B5`, code: "PARTIAL_PARSE" });
8159
+ break;
8160
+ }
8161
+ const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
8162
+ blocks.push(...sheetBlocks);
8163
+ logger.log({
8164
+ level: "debug",
8165
+ stage: "convert",
8166
+ event: "progress",
8167
+ message: "\uC2DC\uD2B8 \uD30C\uC2F1 \uC644\uB8CC",
8168
+ meta: { sheet: sheet.name, index: i + 1, processedSheets }
8169
+ });
8170
+ lastProcessedSheet = i + 1;
8171
+ } catch (err) {
8172
+ warnings.push({
8173
+ page: i + 1,
8174
+ message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC2F1 \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
8175
+ code: "PARTIAL_PARSE"
8176
+ });
8177
+ logger.log({
8178
+ level: "warn",
8179
+ stage: "convert",
8180
+ event: "progress",
8181
+ message: "\uC2DC\uD2B8 \uD30C\uC2F1 \uC2E4\uD328",
8182
+ meta: { sheet: sheet.name, index: i + 1 },
8183
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error" }
8184
+ });
7610
8185
  }
7611
- const sheetBlocks = sheetToBlocks(sheet.name, grid, merges, maxRow, maxCol, i);
7612
- blocks.push(...sheetBlocks);
7613
- } catch (err) {
7614
- warnings.push({
7615
- page: i + 1,
7616
- message: `\uC2DC\uD2B8 "${sheet.name}" \uD30C\uC2F1 \uC2E4\uD328: ${err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`,
7617
- code: "PARTIAL_PARSE"
7618
- });
7619
8186
  }
7620
- }
7621
- const metadata = {
7622
- pageCount: processedSheets
7623
- };
7624
- const coreFile = zip.file("docProps/core.xml");
7625
- if (coreFile) {
7626
- try {
7627
- const coreXml = await coreFile.async("text");
7628
- const doc = parseXml(coreXml);
7629
- const getFirst = (tag) => {
7630
- const els = doc.getElementsByTagName(tag);
7631
- return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
7632
- };
7633
- metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
7634
- metadata.author = getFirst("dc:creator");
7635
- metadata.description = getFirst("dc:description");
7636
- const created = getFirst("dcterms:created");
7637
- if (created) metadata.createdAt = created;
7638
- const modified = getFirst("dcterms:modified");
7639
- if (modified) metadata.modifiedAt = modified;
7640
- } catch {
8187
+ const metadata = {
8188
+ pageCount: processedSheets
8189
+ };
8190
+ const coreFile = zip.file("docProps/core.xml");
8191
+ if (coreFile) {
8192
+ try {
8193
+ const coreXml = await coreFile.async("text");
8194
+ const doc = parseXml(coreXml);
8195
+ const getFirst = (tag) => {
8196
+ const els = doc.getElementsByTagName(tag);
8197
+ return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
8198
+ };
8199
+ metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
8200
+ metadata.author = getFirst("dc:creator");
8201
+ metadata.description = getFirst("dc:description");
8202
+ const created = getFirst("dcterms:created");
8203
+ if (created) metadata.createdAt = created;
8204
+ const modified = getFirst("dcterms:modified");
8205
+ if (modified) metadata.modifiedAt = modified;
8206
+ } catch {
8207
+ }
7641
8208
  }
8209
+ const markdown = blocksToMarkdown(blocks);
8210
+ logger.log({
8211
+ level: "info",
8212
+ stage: "finalize",
8213
+ event: "done",
8214
+ message: "XLSX \uD30C\uC2F1 \uC644\uB8CC",
8215
+ meta: { blocks: blocks.length, warnings: warnings.length, pageCount: processedSheets }
8216
+ });
8217
+ return { markdown, blocks, metadata, warnings: warnings.length > 0 ? warnings : void 0 };
8218
+ } catch (err) {
8219
+ logger.log({
8220
+ level: "error",
8221
+ stage: "finalize",
8222
+ event: "error",
8223
+ message: "XLSX \uD30C\uC2F1 \uC2E4\uD328",
8224
+ meta: { lastProcessedSheet },
8225
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
8226
+ });
8227
+ throw err;
7642
8228
  }
7643
- const markdown = blocksToMarkdown(blocks);
7644
- return { markdown, blocks, metadata, warnings: warnings.length > 0 ? warnings : void 0 };
7645
8229
  }
7646
8230
 
7647
8231
  // src/docx/parser.ts
7648
8232
  import JSZip4 from "jszip";
7649
8233
  import { DOMParser as DOMParser3 } from "@xmldom/xmldom";
8234
+ init_logger();
7650
8235
  var MAX_DECOMPRESS_SIZE4 = 500 * 1024 * 1024;
7651
8236
  function getChildElements(parent, localName) {
7652
8237
  const result = [];
@@ -8008,101 +8593,127 @@ async function extractImages(zip, rels, doc) {
8008
8593
  return { blocks, images };
8009
8594
  }
8010
8595
  async function parseDocxDocument(buffer, options, existingZip) {
8011
- precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
8012
- const zip = existingZip ?? await JSZip4.loadAsync(buffer);
8013
- const warnings = [];
8014
- const docFile = zip.file("word/document.xml");
8015
- if (!docFile) {
8016
- throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
8017
- }
8018
- let rels = /* @__PURE__ */ new Map();
8019
- const relsFile = zip.file("word/_rels/document.xml.rels");
8020
- if (relsFile) {
8021
- rels = parseRels2(await relsFile.async("text"));
8022
- }
8023
- let styles = /* @__PURE__ */ new Map();
8024
- const stylesFile = zip.file("word/styles.xml");
8025
- if (stylesFile) {
8026
- try {
8027
- styles = parseStyles(await stylesFile.async("text"));
8028
- } catch {
8596
+ const logger = createLoggerFromEnv().child({ component: "docx/parser.ts", stage: "detect" });
8597
+ logger.log({ level: "info", event: "start", message: "DOCX \uD30C\uC2F1 \uC2DC\uC791", meta: { size: buffer.byteLength } });
8598
+ let lastProcessedNode = 0;
8599
+ try {
8600
+ precheckZipSize(buffer, MAX_DECOMPRESS_SIZE4);
8601
+ const zip = existingZip ?? await JSZip4.loadAsync(buffer);
8602
+ const warnings = [];
8603
+ const docFile = zip.file("word/document.xml");
8604
+ if (!docFile) {
8605
+ throw new KordocError("\uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 DOCX \uD30C\uC77C: word/document.xml\uC774 \uC5C6\uC2B5\uB2C8\uB2E4");
8606
+ }
8607
+ let rels = /* @__PURE__ */ new Map();
8608
+ const relsFile = zip.file("word/_rels/document.xml.rels");
8609
+ if (relsFile) {
8610
+ rels = parseRels2(await relsFile.async("text"));
8611
+ }
8612
+ let styles = /* @__PURE__ */ new Map();
8613
+ const stylesFile = zip.file("word/styles.xml");
8614
+ if (stylesFile) {
8615
+ try {
8616
+ styles = parseStyles(await stylesFile.async("text"));
8617
+ } catch {
8618
+ }
8029
8619
  }
8030
- }
8031
- let numbering = /* @__PURE__ */ new Map();
8032
- const numFile = zip.file("word/numbering.xml");
8033
- if (numFile) {
8034
- try {
8035
- numbering = parseNumbering(await numFile.async("text"));
8036
- } catch {
8620
+ let numbering = /* @__PURE__ */ new Map();
8621
+ const numFile = zip.file("word/numbering.xml");
8622
+ if (numFile) {
8623
+ try {
8624
+ numbering = parseNumbering(await numFile.async("text"));
8625
+ } catch {
8626
+ }
8037
8627
  }
8038
- }
8039
- let footnotes = /* @__PURE__ */ new Map();
8040
- const fnFile = zip.file("word/footnotes.xml");
8041
- if (fnFile) {
8042
- try {
8043
- footnotes = parseFootnotes(await fnFile.async("text"));
8044
- } catch {
8628
+ let footnotes = /* @__PURE__ */ new Map();
8629
+ const fnFile = zip.file("word/footnotes.xml");
8630
+ if (fnFile) {
8631
+ try {
8632
+ footnotes = parseFootnotes(await fnFile.async("text"));
8633
+ } catch {
8634
+ }
8045
8635
  }
8046
- }
8047
- const docXml = await docFile.async("text");
8048
- const doc = parseXml2(docXml);
8049
- const body = findElements(doc, "body");
8050
- if (body.length === 0) {
8051
- throw new KordocError("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
8052
- }
8053
- const blocks = [];
8054
- const bodyEl = body[0];
8055
- const children = bodyEl.childNodes;
8056
- for (let i = 0; i < children.length; i++) {
8057
- const node = children[i];
8058
- if (node.nodeType !== 1) continue;
8059
- const el = node;
8060
- const localName = el.localName ?? el.tagName?.split(":").pop();
8061
- if (localName === "p") {
8062
- const block = parseParagraph(el, styles, numbering, footnotes, rels);
8063
- if (block) blocks.push(block);
8064
- } else if (localName === "tbl") {
8065
- const block = parseTable(el, styles, numbering, footnotes, rels);
8066
- if (block) blocks.push(block);
8067
- }
8068
- }
8069
- const { blocks: imgBlocks, images } = await extractImages(zip, rels, doc);
8070
- const metadata = {};
8071
- const coreFile = zip.file("docProps/core.xml");
8072
- if (coreFile) {
8073
- try {
8074
- const coreXml = await coreFile.async("text");
8075
- const coreDoc = parseXml2(coreXml);
8076
- const getFirst = (tag) => {
8077
- const els = coreDoc.getElementsByTagName(tag);
8078
- return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
8079
- };
8080
- metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
8081
- metadata.author = getFirst("dc:creator");
8082
- metadata.description = getFirst("dc:description");
8083
- const created = getFirst("dcterms:created");
8084
- if (created) metadata.createdAt = created;
8085
- const modified = getFirst("dcterms:modified");
8086
- if (modified) metadata.modifiedAt = modified;
8087
- } catch {
8636
+ const docXml = await docFile.async("text");
8637
+ const doc = parseXml2(docXml);
8638
+ const body = findElements(doc, "body");
8639
+ if (body.length === 0) {
8640
+ throw new KordocError("DOCX \uBCF8\uBB38(w:body)\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4");
8088
8641
  }
8642
+ const blocks = [];
8643
+ const bodyEl = body[0];
8644
+ const children = bodyEl.childNodes;
8645
+ for (let i = 0; i < children.length; i++) {
8646
+ const node = children[i];
8647
+ if (node.nodeType !== 1) continue;
8648
+ const el = node;
8649
+ const localName = el.localName ?? el.tagName?.split(":").pop();
8650
+ if (localName === "p") {
8651
+ const block = parseParagraph(el, styles, numbering, footnotes, rels);
8652
+ if (block) blocks.push(block);
8653
+ } else if (localName === "tbl") {
8654
+ const block = parseTable(el, styles, numbering, footnotes, rels);
8655
+ if (block) blocks.push(block);
8656
+ }
8657
+ lastProcessedNode = i + 1;
8658
+ }
8659
+ logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uBCF8\uBB38 \uBE14\uB85D \uD30C\uC2F1 \uC644\uB8CC", meta: { blocks: blocks.length } });
8660
+ const { blocks: imgBlocks, images } = await extractImages(zip, rels, doc);
8661
+ logger.log({ level: "debug", stage: "convert", event: "progress", message: "\uC774\uBBF8\uC9C0 \uCD94\uCD9C \uC644\uB8CC", meta: { imageBlocks: imgBlocks.length, images: images.length } });
8662
+ const metadata = {};
8663
+ const coreFile = zip.file("docProps/core.xml");
8664
+ if (coreFile) {
8665
+ try {
8666
+ const coreXml = await coreFile.async("text");
8667
+ const coreDoc = parseXml2(coreXml);
8668
+ const getFirst = (tag) => {
8669
+ const els = coreDoc.getElementsByTagName(tag);
8670
+ return els.length > 0 ? (els[0].textContent ?? "").trim() : void 0;
8671
+ };
8672
+ metadata.title = getFirst("dc:title") || getFirst("dcterms:title");
8673
+ metadata.author = getFirst("dc:creator");
8674
+ metadata.description = getFirst("dc:description");
8675
+ const created = getFirst("dcterms:created");
8676
+ if (created) metadata.createdAt = created;
8677
+ const modified = getFirst("dcterms:modified");
8678
+ if (modified) metadata.modifiedAt = modified;
8679
+ } catch {
8680
+ }
8681
+ }
8682
+ const outline = blocks.filter((b) => b.type === "heading").map((b) => ({ level: b.level ?? 2, text: b.text ?? "" }));
8683
+ const markdown = blocksToMarkdown(blocks);
8684
+ logger.log({
8685
+ level: "info",
8686
+ stage: "finalize",
8687
+ event: "done",
8688
+ message: "DOCX \uD30C\uC2F1 \uC644\uB8CC",
8689
+ meta: { blocks: blocks.length, warnings: warnings.length, outline: outline.length, images: images.length }
8690
+ });
8691
+ return {
8692
+ markdown,
8693
+ blocks,
8694
+ metadata,
8695
+ outline: outline.length > 0 ? outline : void 0,
8696
+ warnings: warnings.length > 0 ? warnings : void 0,
8697
+ images: images.length > 0 ? images : void 0
8698
+ };
8699
+ } catch (err) {
8700
+ logger.log({
8701
+ level: "error",
8702
+ stage: "finalize",
8703
+ event: "error",
8704
+ message: "DOCX \uD30C\uC2F1 \uC2E4\uD328",
8705
+ meta: { lastProcessedNode },
8706
+ error: { message: err instanceof Error ? err.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958", name: err instanceof Error ? err.name : "Error", stack: err instanceof Error ? err.stack : void 0 }
8707
+ });
8708
+ throw err;
8089
8709
  }
8090
- const outline = blocks.filter((b) => b.type === "heading").map((b) => ({ level: b.level ?? 2, text: b.text ?? "" }));
8091
- const markdown = blocksToMarkdown(blocks);
8092
- return {
8093
- markdown,
8094
- blocks,
8095
- metadata,
8096
- outline: outline.length > 0 ? outline : void 0,
8097
- warnings: warnings.length > 0 ? warnings : void 0,
8098
- images: images.length > 0 ? images : void 0
8099
- };
8100
8710
  }
8101
8711
 
8102
8712
  // src/index.ts
8103
8713
  init_cli_provider();
8104
8714
  init_tesseract_provider();
8105
8715
  init_markdown_to_blocks();
8716
+ init_logger();
8106
8717
 
8107
8718
  // src/diff/text-diff.ts
8108
8719
  function similarity(a, b) {
@@ -10601,15 +11212,726 @@ async function markdownToXlsx(markdown, options) {
10601
11212
  return buffer.buffer.slice(buffer.byteOffset, buffer.byteOffset + buffer.byteLength);
10602
11213
  }
10603
11214
 
11215
+ // src/ocr/api-key-rotation.ts
11216
/**
 * Error signalling that every API key is currently cooling down.
 * Carries the number of milliseconds the caller was told to wait.
 */
var AllKeysCoolingDownError = class extends Error {
  /** Wait time in milliseconds, exactly as passed to the constructor. */
  waitMs;

  /**
   * @param {number} waitMs Milliseconds to wait; also embedded in the message.
   */
  constructor(waitMs) {
    const message = `\uBAA8\uB4E0 API \uD0A4\uAC00 cooldown \uC0C1\uD0DC\uC785\uB2C8\uB2E4. ${waitMs}ms \uD6C4 \uC7AC\uC2DC\uB3C4\uD558\uC138\uC694.`;
    super(message);
    Object.assign(this, { name: "AllKeysCoolingDownError", waitMs });
  }
};
11224
/**
 * Round-robin pool of API keys with per-key failure tracking and cooldown.
 *
 * Keys are handed out in rotation by `acquire`. A retryable failure (timeout,
 * HTTP 429 or HTTP >= 500) puts the key into an exponentially growing
 * cooldown; `markSuccess` clears it again. Raw keys never leave the pool
 * through `snapshot`, only through `acquire`.
 */
var ApiKeyRotationPool = class _ApiKeyRotationPool {
  /** Per-key bookkeeping records, in the order the keys were supplied. */
  states;
  /** Cooldown applied after the first retryable failure, in milliseconds. */
  baseCooldownMs;
  /** Upper bound for the exponential backoff, in milliseconds. */
  maxCooldownMs;
  /** Index of the key handed out last; -1 until the first `acquire`. */
  cursor = -1;

  /**
   * @param {string[]} keys API keys; each is trimmed and blank entries are dropped.
   * @param {{baseCooldownMs?: number, maxCooldownMs?: number}} [options]
   *   Backoff tuning; defaults are 5000 ms and 120000 ms.
   * @throws {Error} When no non-blank key remains.
   */
  constructor(keys, options = {}) {
    const normalized = keys.map((k) => k.trim()).filter(Boolean);
    if (normalized.length === 0) {
      throw new Error("API \uD0A4\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
    }
    this.states = normalized.map((key, idx) => ({
      key,
      keyId: `key_${idx + 1}`,
      totalRequests: 0,
      successCount: 0,
      failureCount: 0,
      consecutiveFailures: 0
    }));
    this.baseCooldownMs = options.baseCooldownMs ?? 5e3;
    this.maxCooldownMs = options.maxCooldownMs ?? 12e4;
  }

  /**
   * Builds a pool from environment variables: the comma-separated
   * NVIDIA_API_KEYS takes precedence, NVIDIA_API_KEY is the fallback.
   *
   * @param {Record<string, string|undefined>} [env=process.env]
   * @returns {ApiKeyRotationPool}
   * @throws {Error} When neither variable yields a key.
   */
  static fromEnv(env = process.env) {
    const multi = (env.NVIDIA_API_KEYS || "").split(",").map((v) => v.trim()).filter(Boolean);
    if (multi.length > 0) return new _ApiKeyRotationPool(multi);
    const single = (env.NVIDIA_API_KEY || "").trim();
    if (single) return new _ApiKeyRotationPool([single]);
    throw new Error("NVIDIA_API_KEYS \uB610\uB294 NVIDIA_API_KEY \uD658\uACBD\uBCC0\uC218\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4.");
  }

  /**
   * Returns the next key that is not cooling down, starting after the key
   * handed out last, and counts the request against it.
   *
   * @param {number} [now=Date.now()] Current time in epoch milliseconds.
   * @returns {{key: string, keyId: string}}
   * @throws {AllKeysCoolingDownError} When every key is cooling down; `waitMs`
   *   is the time until the earliest cooldown ends.
   */
  acquire(now = Date.now()) {
    const n = this.states.length;
    for (let step = 1; step <= n; step++) {
      const idx = (this.cursor + step) % n;
      const s = this.states[idx];
      if (!s.cooldownUntil || s.cooldownUntil <= now) {
        this.cursor = idx;
        s.totalRequests++;
        s.lastUsedAt = now;
        return { key: s.key, keyId: s.keyId };
      }
    }
    const minCooldownUntil = this.states
      .map((s) => s.cooldownUntil ?? now)
      .reduce((min, v) => Math.min(min, v), Number.POSITIVE_INFINITY);
    throw new AllKeysCoolingDownError(Math.max(0, minCooldownUntil - now));
  }

  /**
   * Records a successful request: resets the failure streak and lifts any cooldown.
   *
   * @param {string} keyId Identifier returned by `acquire`.
   * @throws {Error} When `keyId` is unknown.
   */
  markSuccess(keyId) {
    const s = this.find(keyId);
    s.successCount++;
    s.consecutiveFailures = 0;
    s.cooldownUntil = void 0;
  }

  /**
   * Records a failed request. Non-retryable failures only update the
   * counters. Retryable ones start a cooldown of
   * `min(baseCooldownMs * 2^(streak - 1), maxCooldownMs)`, extended to
   * `retryAfterMs` when that hint is larger.
   *
   * @param {string} keyId Identifier returned by `acquire`.
   * @param {{status?: number, timeout?: boolean, retryAfterMs?: number}} [opts]
   * @param {number} [now=Date.now()] Current time in epoch milliseconds.
   * @throws {Error} When `keyId` is unknown.
   */
  markFailure(keyId, opts = {}, now = Date.now()) {
    const s = this.find(keyId);
    s.failureCount++;
    s.consecutiveFailures++;
    const { status, timeout, retryAfterMs } = opts ?? {};
    if (!this.isRetryableFailure(status, timeout)) return;
    const exp = Math.max(0, s.consecutiveFailures - 1);
    const backoff = Math.min(this.baseCooldownMs * 2 ** exp, this.maxCooldownMs);
    // A non-finite hint (NaN from an unparsable Retry-After header, or
    // Infinity) must not reach Math.max: NaN would make cooldownUntil NaN,
    // which acquire() treats as "no cooldown", and Infinity would park the
    // key forever. Such hints are ignored in favour of the computed backoff.
    const hinted = Number(retryAfterMs ?? 0);
    const serverHintMs = Number.isFinite(hinted) ? hinted : 0;
    s.cooldownUntil = now + Math.max(backoff, serverHintMs);
  }

  /**
   * @returns {Array<object>} A copy of each key's counters and timestamps,
   *   without the key value itself.
   */
  snapshot() {
    return this.states.map((s) => ({
      keyId: s.keyId,
      totalRequests: s.totalRequests,
      successCount: s.successCount,
      failureCount: s.failureCount,
      consecutiveFailures: s.consecutiveFailures,
      lastUsedAt: s.lastUsedAt,
      cooldownUntil: s.cooldownUntil
    }));
  }

  /**
   * @param {number} [status] HTTP status of the failed response, if any.
   * @param {boolean} [timeout] Whether the request timed out.
   * @returns {boolean} True for timeouts, HTTP 429 and HTTP >= 500.
   */
  isRetryableFailure(status, timeout) {
    if (timeout) return true;
    if (status === 429) return true;
    if (typeof status === "number" && status >= 500) return true;
    return false;
  }

  /**
   * @param {string} keyId
   * @returns {object} The bookkeeping record for `keyId`.
   * @throws {Error} When `keyId` is unknown.
   */
  find(keyId) {
    const s = this.states.find((v) => v.keyId === keyId);
    if (!s) throw new Error(`\uC54C \uC218 \uC5C6\uB294 keyId: ${keyId}`);
    return s;
  }
};
11307
+
11308
+ // src/pipeline/unified-ocr.ts
11309
+ import { mkdir, readdir, readFile, stat, writeFile } from "fs/promises";
11310
+ import { basename as basename2, dirname as dirname3, extname, join as join4, resolve as resolve3 } from "path";
11311
+ import { spawn as spawn2 } from "child_process";
11312
+ import libre from "libreoffice-convert";
11313
+ init_logger();
11314
+ var libreConvert = libre.convert;
11315
/**
 * Error raised by the unified OCR pipeline, tagged with a machine-readable
 * code and the pipeline stage it was raised for.
 */
var UnifiedOcrError = class extends Error {
  /** Machine-readable error code, exactly as passed to the constructor. */
  code;
  /** Pipeline stage name, exactly as passed to the constructor. */
  stage;

  /**
   * @param {string} code Error code (e.g. "RENDER_FAILED").
   * @param {string} stage Pipeline stage (e.g. "render").
   * @param {string} message Human-readable description.
   */
  constructor(code, stage, message) {
    super(message);
    Object.assign(this, { name: "UnifiedOcrError", code, stage });
  }
};
11325
// Model candidates used when the caller does not supply `modelCandidates`.
var DEFAULT_MODELS = [
  "mistralai/mistral-medium-3-instruct",
  "moonshotai/kimi-k2.5",
  "moonshotai/kimi-k2-thinking",
  "moonshotai/kimi-k2-instruct",
  "moonshotai/kimi-k2-instruct-0905",
  "qwen/qwen3.5-122b-a10b",
  "qwen/qwen3.5-397b-a17b"
];

// Default max-token value per model; caller-supplied `modelMaxTokens` entries
// are merged on top of these.
var DEFAULT_MODEL_MAX_TOKENS = {
  "mistralai/mistral-medium-3-instruct": 8192,
  "moonshotai/kimi-k2.5": 64000,
  "moonshotai/kimi-k2-thinking": 64000,
  "moonshotai/kimi-k2-instruct": 64000,
  "moonshotai/kimi-k2-instruct-0905": 64000,
  "qwen/qwen3.5-122b-a10b": 64000,
  "qwen/qwen3.5-397b-a17b": 64000
};

// Relative weight of each pipeline stage in overall progress; the defaults
// add up to 100. Caller-supplied `stageWeights` are merged on top.
var DEFAULT_STAGE_WEIGHTS = {
  convert: 15,
  render: 20,
  probe: 5,
  ocr: 45,
  proofread: 10,
  merge: 5
};

// Korean OCR instruction sent with a page image: extract the text and tables
// of the single image verbatim as Markdown, without summarising or guessing.
var OCR_PROMPT2 = "\uC774 \uC774\uBBF8\uC9C0 1\uC7A5\uC758 \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uC694\uC57D \uC5C6\uC774 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uCD94\uCD9C\uD558\uC138\uC694. \uC808\uB300\uB85C \uB0B4\uC6A9\uC744 \uCD94\uCE21\uD558\uAC70\uB098 \uBC14\uAFB8\uC9C0 \uB9C8\uC138\uC694.";

// Korean proofreading instruction, one rule per line: non-destructive
// correction only (no added/removed facts, numbers and names untouched,
// fix typos/spacing/line breaks/Markdown structure, output Markdown only).
var PROOFREAD_PROMPT = [
  "\uC544\uB798 Markdown\uC744 \uBE44\uD30C\uAD34 \uAD50\uC815\uB9CC \uC218\uD589\uD558\uC138\uC694.",
  "\uADDC\uCE59:",
  "- \uC0AC\uC2E4 \uCD94\uAC00/\uC0AD\uC81C/\uCD94\uCE21 \uAE08\uC9C0",
  "- \uC22B\uC790, \uB2E8\uC704, \uACE0\uC720\uBA85\uC0AC \uBCC0\uACBD \uAE08\uC9C0",
  "- \uC624\uD0C8\uC790, \uB744\uC5B4\uC4F0\uAE30, \uC904\uBC14\uAFC8, Markdown \uAD6C\uC870\uB9CC \uAD50\uC815",
  "- \uACB0\uACFC\uB294 Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825"
].join("\n");
11360
+ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11361
+ const absInput = resolve3(inputPath);
11362
+ const stem = basename2(absInput, extname(absInput));
11363
+ const workspaceDir = resolve3(options.workspaceDir ?? join4(dirname3(absInput), `${stem}_ocr_workspace`));
11364
+ const imagesDir = join4(workspaceDir, "images");
11365
+ const rawDir = join4(workspaceDir, "ocr", "raw");
11366
+ const proofDir = join4(workspaceDir, "ocr", "proofread");
11367
+ const diffDir = join4(workspaceDir, "ocr", "diff");
11368
+ const outputPath = resolve3(options.outputPath ?? join4(dirname3(absInput), `${stem}.md`));
11369
+ const reportPath = join4(workspaceDir, "run-report.json");
11370
+ const modelCachePath = join4(dirname3(absInput), ".kordoc-model-cache.json");
11371
+ const baseUrl = options.baseUrl ?? "https://integrate.api.nvidia.com/v1/chat/completions";
11372
+ const timeoutMs = options.timeoutMs ?? 6e4;
11373
+ const maxRetriesPerPage = options.maxRetriesPerPage ?? 5;
11374
+ const dpi = options.dpi ?? 300;
11375
+ const modelsInput = options.modelCandidates?.length ? options.modelCandidates : DEFAULT_MODELS;
11376
+ const modelCache = await loadModelCache(modelCachePath);
11377
+ const models = sortModelsByCache(modelsInput, modelCache);
11378
+ const modelMaxTokens = { ...DEFAULT_MODEL_MAX_TOKENS, ...options.modelMaxTokens ?? {} };
11379
+ const stageWeights = normalizeWeights({ ...DEFAULT_STAGE_WEIGHTS, ...options.stageWeights ?? {} });
11380
+ const keyPool = ApiKeyRotationPool.fromEnv();
11381
+ const runId = options.runId ?? generateRunId("ocr");
11382
+ const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
11383
+ await mkdir(imagesDir, { recursive: true });
11384
+ await mkdir(rawDir, { recursive: true });
11385
+ await mkdir(proofDir, { recursive: true });
11386
+ await mkdir(diffDir, { recursive: true });
11387
+ const timingsMs = {};
11388
+ const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
11389
+ const markStageProgress = (stage, stagePercent, current, total, message) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message });
11390
+ const markStageDone = (stage, message) => emitProgress(options.onEvent, stage, 100, stageWeights, { message, type: "stage_done" });
11391
+ let currentStage = "convert";
11392
+ const logStage = (level, stage, event, message, meta) => {
11393
+ logger.log({ level, stage, event, message, meta });
11394
+ };
11395
+ try {
11396
+ ensureSupportedInput(absInput);
11397
+ let workingPdfPath = absInput;
11398
+ const convertStart = Date.now();
11399
+ currentStage = "convert";
11400
+ markStageStart("convert", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC911");
11401
+ logStage("info", "convert", "start", "\uBB38\uC11C\uB97C PDF\uB85C \uBCC0\uD658 \uC2DC\uC791", { input: absInput });
11402
+ if (extname(absInput).toLowerCase() !== ".pdf") {
11403
+ await assertSofficeAvailable();
11404
+ workingPdfPath = join4(workspaceDir, `${stem}.pdf`);
11405
+ const inputBuffer = await readFile(absInput);
11406
+ const out = await convertWithLibreOffice(inputBuffer, ".pdf");
11407
+ await writeFile(workingPdfPath, out);
11408
+ }
11409
+ timingsMs.convert = Date.now() - convertStart;
11410
+ markStageDone("convert", "PDF \uBCC0\uD658 \uC644\uB8CC");
11411
+ logStage("info", "convert", "done", "PDF \uBCC0\uD658 \uC644\uB8CC", { elapsedMs: timingsMs.convert });
11412
+ const renderStart = Date.now();
11413
+ currentStage = "render";
11414
+ markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
11415
+ logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi });
11416
+ await renderPdfToPng(workingPdfPath, join4(imagesDir, "page"), dpi);
11417
+ const images = await listPageImages(imagesDir);
11418
+ if (images.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328: \uACB0\uACFC \uC774\uBBF8\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
11419
+ markStageProgress("render", 100, images.length, images.length, `\uD398\uC774\uC9C0 ${images.length}\uC7A5 \uC0DD\uC131`);
11420
+ timingsMs.render = Date.now() - renderStart;
11421
+ markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
11422
+ logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: images.length, elapsedMs: timingsMs.render });
11423
+ const probeStart = Date.now();
11424
+ currentStage = "probe";
11425
+ markStageStart("probe", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC218\uD589 \uC911");
11426
+ logStage("info", "probe", "start", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC2DC\uC791", { models });
11427
+ const probeImage = await pickRepresentativeImage(images);
11428
+ const probeResults = [];
11429
+ for (let i = 0; i < models.length; i++) {
11430
+ const model = models[i];
11431
+ const t0 = Date.now();
11432
+ try {
11433
+ await ocrImageViaNim({
11434
+ imagePath: probeImage,
11435
+ prompt: OCR_PROMPT2,
11436
+ model,
11437
+ maxTokens: modelMaxTokens[model] ?? 8192,
11438
+ baseUrl,
11439
+ keyPool,
11440
+ timeoutMs,
11441
+ maxRetries: 2,
11442
+ logger,
11443
+ stage: "probe"
11444
+ });
11445
+ probeResults.push({ model, durationMs: Date.now() - t0, success: true });
11446
+ } catch (err) {
11447
+ probeResults.push({
11448
+ model,
11449
+ durationMs: Date.now() - t0,
11450
+ success: false,
11451
+ error: err instanceof Error ? err.message : String(err)
11452
+ });
11453
+ }
11454
+ markStageProgress("probe", Math.round((i + 1) / models.length * 100), i + 1, models.length, `\uBAA8\uB378 \uD504\uB85C\uBE0C ${i + 1}/${models.length}`);
11455
+ logStage("debug", "probe", "progress", "\uBAA8\uB378 \uD504\uB85C\uBE0C \uC9C4\uD589", { index: i + 1, total: models.length, model, result: probeResults.at(-1) });
11456
+ }
11457
+ const selectedModel = chooseFastestModel(probeResults);
11458
+ if (!selectedModel) throw new UnifiedOcrError("PROBE_FAILED", "probe", "\uC18D\uB3C4 \uD504\uB85C\uBE0C \uC2E4\uD328: \uC0AC\uC6A9 \uAC00\uB2A5\uD55C OCR \uBAA8\uB378\uC774 \uC5C6\uC2B5\uB2C8\uB2E4.");
11459
+ const fallbackModelOrder = probeResults.filter((r) => r.success).sort((a, b) => a.durationMs - b.durationMs).map((r) => r.model);
11460
+ timingsMs.probe = Date.now() - probeStart;
11461
+ await updateModelCache(modelCachePath, probeResults);
11462
+ markStageDone("probe", `\uD504\uB85C\uBE0C \uC644\uB8CC: ${selectedModel}`);
11463
+ logStage("info", "probe", "done", "\uBAA8\uB378 \uD504\uB85C\uBE0C \uC644\uB8CC", { selectedModel, probeResults, elapsedMs: timingsMs.probe, modelCachePath });
11464
+ const ocrStart = Date.now();
11465
+ currentStage = "ocr";
11466
+ markStageStart("ocr", `OCR \uC9C4\uD589 \uC911 (${selectedModel})`);
11467
+ logStage("info", "ocr", "start", "\uD398\uC774\uC9C0 OCR \uC2DC\uC791", { selectedModel, pageCount: images.length });
11468
+ const rawPagePaths = [];
11469
+ for (let i = 0; i < images.length; i++) {
11470
+ const imagePath = images[i];
11471
+ const markdown = await ocrImageWithFallback({
11472
+ imagePath,
11473
+ prompt: OCR_PROMPT2,
11474
+ models: fallbackModelOrder,
11475
+ modelMaxTokens,
11476
+ baseUrl,
11477
+ keyPool,
11478
+ timeoutMs,
11479
+ maxRetriesPerPage,
11480
+ logger
11481
+ });
11482
+ const pagePath = join4(rawDir, `page_${String(i + 1).padStart(4, "0")}.md`);
11483
+ await writeFile(pagePath, markdown, "utf-8");
11484
+ rawPagePaths.push(pagePath);
11485
+ markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}`);
11486
+ logStage("debug", "ocr", "progress", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { page: i + 1, total: images.length });
11487
+ }
11488
+ timingsMs.ocr = Date.now() - ocrStart;
11489
+ markStageDone("ocr", "OCR \uC644\uB8CC");
11490
+ logStage("info", "ocr", "done", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { elapsedMs: timingsMs.ocr });
11491
+ const proofStart = Date.now();
11492
+ currentStage = "proofread";
11493
+ markStageStart("proofread", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC9C4\uD589 \uC911");
11494
+ logStage("info", "proofread", "start", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC2DC\uC791", { pages: rawPagePaths.length });
11495
+ const proofedPaths = [];
11496
+ for (let i = 0; i < rawPagePaths.length; i++) {
11497
+ const rawMd = await readFile(rawPagePaths[i], "utf-8");
11498
+ const prompt = `${PROOFREAD_PROMPT}
11499
+
11500
+ ---
11501
+ ${rawMd}
11502
+ ---`;
11503
+ const corrected = await ocrImageViaNim({
11504
+ textOnlyPrompt: prompt,
11505
+ model: selectedModel,
11506
+ maxTokens: modelMaxTokens[selectedModel] ?? 8192,
11507
+ baseUrl,
11508
+ keyPool,
11509
+ timeoutMs,
11510
+ maxRetries: maxRetriesPerPage,
11511
+ logger,
11512
+ stage: "proofread"
11513
+ });
11514
+ const safeCorrected = preserveNumericIntegrity(rawMd, corrected);
11515
+ const taggedCorrected = addUncertainTag(rawMd, safeCorrected);
11516
+ const pagePath = join4(proofDir, `page_${String(i + 1).padStart(4, "0")}.md`);
11517
+ await writeFile(pagePath, taggedCorrected, "utf-8");
11518
+ await writeFile(
11519
+ join4(diffDir, `page_${String(i + 1).padStart(4, "0")}.json`),
11520
+ JSON.stringify(buildDiffSummary(rawMd, taggedCorrected), null, 2),
11521
+ "utf-8"
11522
+ );
11523
+ proofedPaths.push(pagePath);
11524
+ markStageProgress("proofread", Math.round((i + 1) / rawPagePaths.length * 100), i + 1, rawPagePaths.length, `\uAD50\uC815 ${i + 1}/${rawPagePaths.length}`);
11525
+ logStage("debug", "proofread", "progress", "\uD398\uC774\uC9C0 \uAD50\uC815 \uC644\uB8CC", { page: i + 1, total: rawPagePaths.length });
11526
+ }
11527
+ timingsMs.proofread = Date.now() - proofStart;
11528
+ markStageDone("proofread", "\uAD50\uC815 \uC644\uB8CC");
11529
+ logStage("info", "proofread", "done", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC644\uB8CC", { elapsedMs: timingsMs.proofread });
11530
+ const mergeStart = Date.now();
11531
+ currentStage = "merge";
11532
+ markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
11533
+ logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: proofedPaths.length });
11534
+ const merged = await mergeMarkdownPages(proofedPaths);
11535
+ await writeFile(outputPath, merged, "utf-8");
11536
+ timingsMs.merge = Date.now() - mergeStart;
11537
+ markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
11538
+ logStage("info", "merge", "done", "\uCD5C\uC885 \uBCD1\uD569 \uC644\uB8CC", { outputPath, elapsedMs: timingsMs.merge });
11539
+ const report = {
11540
+ inputPath: absInput,
11541
+ outputPath,
11542
+ workspaceDir,
11543
+ selectedModel,
11544
+ probeImage,
11545
+ probeResults,
11546
+ pageCount: images.length,
11547
+ keyHealth: keyPool.snapshot(),
11548
+ timingsMs,
11549
+ modelCachePath
11550
+ };
11551
+ await writeFile(reportPath, JSON.stringify(report, null, 2), "utf-8");
11552
+ logStage("info", "finalize", "done", "run-report \uC800\uC7A5 \uC644\uB8CC", { reportPath });
11553
+ return { outputPath, reportPath, selectedModel };
11554
+ } catch (err) {
11555
+ const normalized = normalizePipelineError(err, currentStage);
11556
+ emitProgress(options.onEvent, currentStage, 0, stageWeights, {
11557
+ type: "error",
11558
+ code: normalized.code,
11559
+ message: normalized.message
11560
+ });
11561
+ logger.log({
11562
+ level: "error",
11563
+ stage: currentStage,
11564
+ event: "error",
11565
+ message: normalized.message,
11566
+ error: {
11567
+ code: normalized.code,
11568
+ name: normalized.name,
11569
+ message: normalized.message,
11570
+ stack: normalized.stack
11571
+ }
11572
+ });
11573
+ throw normalized;
11574
+ }
11575
+ }
11576
/**
 * Rescales the per-stage weights so that the six pipeline stages add up to 100.
 *
 * Only the known stages contribute to the total: unrelated keys are ignored and
 * a missing, non-numeric or negative entry counts as 0, so the result never
 * contains NaN. When every weight is 0 the divisor falls back to 1 and all
 * stages come out as 0.
 *
 * @param {Record<string, number>} weights - Relative weight per stage.
 * @returns {{convert: number, render: number, probe: number, ocr: number, proofread: number, merge: number}}
 *   Percentage share of each stage.
 */
function normalizeWeights(weights) {
  const stages = ["convert", "render", "probe", "ocr", "proofread", "merge"];
  const weightOf = (stage) => {
    const value = Number(weights?.[stage]);
    return Number.isFinite(value) && value > 0 ? value : 0;
  };
  const sum = stages.reduce((total, stage) => total + weightOf(stage), 0) || 1;
  return {
    convert: weightOf("convert") / sum * 100,
    render: weightOf("render") / sum * 100,
    probe: weightOf("probe") / sum * 100,
    ocr: weightOf("ocr") / sum * 100,
    proofread: weightOf("proofread") / sum * 100,
    merge: weightOf("merge") / sum * 100
  };
}
11587
/**
 * Converts progress inside one stage into overall pipeline progress.
 *
 * Every stage that precedes `stage` contributes its full weight; `stage`
 * itself contributes its weight scaled by `stagePercent` (clamped to 0..100).
 * A stage name that is not one of the six known stages adds up all weights.
 *
 * @param {string} stage - Stage currently running.
 * @param {number} stagePercent - Progress within that stage.
 * @param {Record<string, number>} weights - Weight of each stage.
 * @returns {number} Overall progress rounded to an integer.
 */
function computeOverallPercent(stage, stagePercent, weights) {
  const pipeline = ["convert", "render", "probe", "ocr", "proofread", "merge"];
  const position = pipeline.indexOf(stage);
  const finished = position === -1 ? pipeline : pipeline.slice(0, position);
  let total = finished.reduce((acc, name) => acc + weights[name], 0);
  if (position !== -1) {
    const bounded = Math.max(0, Math.min(100, stagePercent));
    total += weights[stage] * bounded / 100;
  }
  return Math.round(total);
}
11599
/**
 * Delivers one progress event to the optional listener.
 *
 * Does nothing when no listener is supplied. Otherwise the event carries the
 * rounded and clamped stage percentage, the overall percentage from
 * `computeOverallPercent`, and the `current`/`total`/`code`/`message` fields
 * copied from `extra`.
 *
 * @param {Function|undefined|null} cb - Event listener.
 * @param {string} stage - Stage the event belongs to.
 * @param {number} stagePercent - Progress within the stage.
 * @param {Record<string, number>} weights - Stage weights.
 * @param {object} extra - Extra event fields; `type` defaults to "stage_progress".
 * @returns {void}
 */
function emitProgress(cb, stage, stagePercent, weights, extra) {
  if (cb) {
    const event = {
      type: extra.type ?? "stage_progress",
      stage,
      stagePercent: Math.min(100, Math.max(0, Math.round(stagePercent))),
      overallPercent: computeOverallPercent(stage, stagePercent, weights),
      current: extra.current,
      total: extra.total,
      code: extra.code,
      message: extra.message
    };
    cb(event);
  }
}
11612
/**
 * Promise adapter around the callback-style `libreConvert`.
 *
 * @param {*} buffer - Document contents handed to `libreConvert`.
 * @param {string} ext - Target extension handed to `libreConvert`.
 * @returns {Promise<*>} The converted output produced by the callback.
 * @throws {UnifiedOcrError} CONVERT_FAILED when the callback reports an error
 *   or yields no output.
 */
async function convertWithLibreOffice(buffer, ext) {
  const conversion = new Promise((fulfil, fail) => {
    const onConverted = (err, done) => {
      if (!err && done) {
        fulfil(done);
        return;
      }
      fail(new UnifiedOcrError("CONVERT_FAILED", "convert", err?.message ?? "LibreOffice \uBCC0\uD658 \uC2E4\uD328"));
    };
    libreConvert(buffer, ext, void 0, onConverted);
  });
  return await conversion;
}
11623
/**
 * Rasterises a PDF into PNG files by running `pdftoppm -png -r <dpi>`.
 *
 * @param {string} pdfPath - PDF to render.
 * @param {string} prefixPath - Output prefix passed to pdftoppm.
 * @param {number} dpi - Resolution passed to `-r`.
 * @returns {Promise<void>}
 * @throws {UnifiedOcrError} RENDER_FAILED carrying the command failure message.
 */
async function renderPdfToPng(pdfPath, prefixPath, dpi) {
  const pdftoppmArgs = ["-png", "-r", String(dpi), pdfPath, prefixPath];
  try {
    await runCommand("pdftoppm", pdftoppmArgs);
  } catch (failure) {
    const detail = failure instanceof Error ? failure.message : String(failure);
    throw new UnifiedOcrError("RENDER_FAILED", "render", detail);
  }
}
11630
/**
 * Runs an external command and resolves once it exits with code 0.
 *
 * Only stderr is captured. stdin and stdout are set to "ignore": a piped
 * stdout that nobody reads stalls the child as soon as the pipe buffer fills,
 * and an open stdin can keep a child waiting for input.
 *
 * @param {string} cmd - Executable to spawn.
 * @param {string[]} args - Arguments passed to the executable.
 * @returns {Promise<void>}
 * @throws {Error} The spawn error, or an error holding the exit code and the
 *   trimmed stderr output when the command exits with a non-zero code.
 */
async function runCommand(cmd, args) {
  await new Promise((resolvePromise, reject) => {
    const child = spawn2(cmd, args, { stdio: ["ignore", "ignore", "pipe"] });
    // Keep raw chunks and decode once, so a multi-byte character split across
    // two chunks is not corrupted.
    const stderrChunks = [];
    child.stderr.on("data", (chunk) => {
      stderrChunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(String(chunk)));
    });
    child.on("error", reject);
    child.on("close", (code) => {
      if (code === 0) {
        resolvePromise();
        return;
      }
      const stderr = Buffer.concat(stderrChunks).toString("utf-8");
      reject(new Error(`${cmd} \uC2E4\uD328 (code=${code}): ${stderr.trim()}`));
    });
  });
}
11644
/**
 * Verifies that LibreOffice can be launched by running `soffice --version`.
 *
 * @returns {Promise<void>}
 * @throws {UnifiedOcrError} SOFFICE_NOT_FOUND when the command fails for any reason.
 */
async function assertSofficeAvailable() {
  let launched = true;
  try {
    await runCommand("soffice", ["--version"]);
  } catch {
    launched = false;
  }
  if (launched) return;
  throw new UnifiedOcrError("SOFFICE_NOT_FOUND", "convert", "soffice\uB97C \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4. LibreOffice\uB97C \uC124\uCE58\uD574 \uC8FC\uC138\uC694.");
}
11651
/**
 * Lists the `.png` files of a directory in page order.
 *
 * @param {string} imagesDir - Directory to scan.
 * @returns {Promise<string[]>} Paths (directory joined with file name) ordered
 *   by `naturalPageSort`.
 */
async function listPageImages(imagesDir) {
  const entries = await readdir(imagesDir);
  const pngNames = entries.filter((name) => name.endsWith(".png"));
  pngNames.sort((left, right) => naturalPageSort(left, right));
  return pngNames.map((name) => join4(imagesDir, name));
}
11655
/**
 * Comparator that orders file names by the last run of digits they contain.
 * A name without digits is treated as 0.
 *
 * @param {string} a - First file name.
 * @param {string} b - Second file name.
 * @returns {number} Negative, zero or positive, as expected by `Array.prototype.sort`.
 */
function naturalPageSort(a, b) {
  const trailingNumber = (name) => {
    const digitRuns = name.match(/\d+/g) || ["0"];
    return Number(digitRuns.at(-1) || 0);
  };
  return trailingNumber(a) - trailingNumber(b);
}
11660
/**
 * Picks a typical page image to use for the model speed probe.
 *
 * Looks at the first eight images at most, prefers files larger than 8 KiB
 * and returns the one with the median file size. If no sampled file exceeds
 * 8 KiB, the median of the whole sample is used instead. Each file is stat-ed
 * exactly once.
 *
 * @param {string[]} images - Page image paths.
 * @returns {Promise<string>} Path of the chosen image.
 * @throws {UnifiedOcrError} PROBE_FAILED when `images` is empty.
 */
async function pickRepresentativeImage(images) {
  const sample = images.slice(0, Math.min(images.length, 8));
  if (sample.length === 0) {
    // Fail with a pipeline error instead of a TypeError on `undefined.path`.
    throw new UnifiedOcrError("PROBE_FAILED", "probe", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
  }
  const sized = await Promise.all(
    sample.map(async (path) => ({ path, size: (await stat(path)).size }))
  );
  const substantial = sized.filter((entry) => entry.size > 8 * 1024);
  const candidates = substantial.length > 0 ? substantial : sized;
  candidates.sort((a, b) => a.size - b.size);
  return candidates[Math.floor(candidates.length / 2)].path;
}
11671
/**
 * Returns the model of the quickest successful probe.
 *
 * @param {Array<{model: string, durationMs: number, success: boolean}>} results - Probe outcomes.
 * @returns {string|null} Model name, or null when no probe succeeded.
 */
function chooseFastestModel(results) {
  const successes = results.filter(({ success }) => success);
  successes.sort((left, right) => left.durationMs - right.durationMs);
  const [fastest] = successes;
  return fastest?.model ?? null;
}
11675
/**
 * Loads the model timing cache from disk on a best-effort basis.
 *
 * Returns null when the file cannot be read, is not valid JSON, or does not
 * have the expected shape (an object whose `models` property is a non-array
 * object). The shape check keeps a damaged cache file from crashing code that
 * dereferences `cache.models[...]`.
 *
 * @param {string} path - Cache file location.
 * @returns {Promise<object|null>} Parsed cache, or null when unusable.
 */
async function loadModelCache(path) {
  try {
    const parsed = JSON.parse(await readFile(path, "utf-8"));
    const isRecord = (value) => value !== null && typeof value === "object" && !Array.isArray(value);
    return isRecord(parsed) && isRecord(parsed.models) ? parsed : null;
  } catch {
    // A missing or unreadable cache is expected, e.g. on the first run.
    return null;
  }
}
11683
/**
 * Returns a copy of `models` ordered by cached average duration, fastest first.
 *
 * Models without a cache entry rank as +Infinity and therefore go last. Two
 * such models compare as Infinity - Infinity = NaN, which `sort` treats as a
 * tie, so they keep their relative input order.
 *
 * @param {string[]} models - Model names; not mutated.
 * @param {{models: Record<string, {avgDurationMs: number}>}|null} cache - Timing cache.
 * @returns {string[]} New, sorted array.
 */
function sortModelsByCache(models, cache) {
  const ordered = [...models];
  if (!cache) return ordered;
  const cachedAverage = (model) => cache.models[model]?.avgDurationMs ?? Number.POSITIVE_INFINITY;
  ordered.sort((left, right) => cachedAverage(left) - cachedAverage(right));
  return ordered;
}
11691
/**
 * Folds the successful probe timings into the on-disk model cache.
 *
 * Starts from the existing cache, or a fresh one when none can be loaded.
 * Each successful probe either creates an entry with count 1 or updates the
 * running average, rounded to an integer. The refreshed cache is written back
 * as pretty-printed JSON with a new `updatedAt` timestamp.
 *
 * @param {string} path - Cache file location.
 * @param {Array<{model: string, durationMs: number, success: boolean}>} probes - Probe outcomes.
 * @returns {Promise<void>}
 */
async function updateModelCache(path, probes) {
  const loaded = await loadModelCache(path);
  const cache = loaded ?? { updatedAt: (/* @__PURE__ */ new Date()).toISOString(), models: {} };
  for (const probe of probes) {
    if (!probe.success) continue;
    const known = cache.models[probe.model];
    if (known) {
      const count = known.count + 1;
      const weightedTotal = known.avgDurationMs * known.count + probe.durationMs;
      cache.models[probe.model] = { count, avgDurationMs: Math.round(weightedTotal / count) };
    } else {
      cache.models[probe.model] = { count: 1, avgDurationMs: probe.durationMs };
    }
  }
  cache.updatedAt = (/* @__PURE__ */ new Date()).toISOString();
  const serialized = JSON.stringify(cache, null, 2);
  await writeFile(path, serialized, "utf-8");
}
11710
/**
 * Runs OCR on one image, trying each model in order until one succeeds.
 *
 * @param {object} input - Request settings: `imagePath`, `prompt`, `models`,
 *   `modelMaxTokens`, `baseUrl`, `keyPool`, `timeoutMs`, `maxRetriesPerPage`
 *   and an optional `logger`. A model without a `modelMaxTokens` entry uses 8192.
 * @returns {Promise<string>} Result of the first model that succeeds.
 * @throws {UnifiedOcrError} OCR_FAILED with the last failure message when
 *   every model fails (or `models` is empty).
 */
async function ocrImageWithFallback(input) {
  let failureReason = "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958";
  for (const candidate of input.models) {
    try {
      const request = {
        imagePath: input.imagePath,
        prompt: input.prompt,
        model: candidate,
        maxTokens: input.modelMaxTokens[candidate] ?? 8192,
        baseUrl: input.baseUrl,
        keyPool: input.keyPool,
        timeoutMs: input.timeoutMs,
        maxRetries: input.maxRetriesPerPage,
        logger: input.logger,
        stage: "ocr"
      };
      return await ocrImageViaNim(request);
    } catch (failure) {
      // Remember why this model failed and move on to the next one.
      failureReason = failure instanceof Error ? failure.message : String(failure);
    }
  }
  throw new UnifiedOcrError("OCR_FAILED", "ocr", `\uBAA8\uB4E0 OCR \uBAA8\uB378 \uC2E4\uD328: ${failureReason}`);
}
11732
/**
 * Concatenates per-page Markdown files into a single document.
 *
 * Files are read one at a time in the given order. Each page is trimmed,
 * pages that end up empty are dropped, and the rest are separated by one
 * blank line.
 *
 * @param {string[]} paths - Page files in document order.
 * @returns {Promise<string>} The merged Markdown.
 */
async function mergeMarkdownPages(paths) {
  const sections = [];
  for (const pagePath of paths) {
    const body = (await readFile(pagePath, "utf-8")).trim();
    if (body) sections.push(body);
  }
  return sections.join("\n\n");
}
11741
/**
 * Sends one chat-completion request to the NIM endpoint, rotating API keys
 * and retrying on failure.
 *
 * The request is text-only when `input.textOnlyPrompt` is set; otherwise it
 * carries `input.prompt` (default `OCR_PROMPT2`) plus the image at
 * `input.imagePath` as a base64 data URL. The payload is built once, before
 * the retry loop: the image is read a single time, and a local read failure
 * is reported immediately instead of being retried and charged to an API key.
 *
 * Each loop iteration counts as one attempt, including iterations spent
 * waiting for keys to leave cooldown. An empty model answer is thrown inside
 * the attempt and therefore retried like any other request failure.
 *
 * @param {object} input - `model`, `maxTokens`, `baseUrl`, `keyPool`,
 *   `timeoutMs`, `maxRetries`, optional `logger`, optional `stage`
 *   (default "ocr"), and either `textOnlyPrompt` or `imagePath` (+ `prompt`).
 * @returns {Promise<string>} Model answer passed through `stripCodeFence3`.
 * @throws {UnifiedOcrError} OCR_FAILED, tagged with the caller's `stage`, when
 *   the image cannot be read or every attempt fails. The code stays
 *   "OCR_FAILED" for all stages to remain compatible with existing callers.
 * @throws Any non-cooldown error raised by `keyPool.acquire()`.
 */
async function ocrImageViaNim(input) {
  const { model, maxTokens, baseUrl, keyPool, timeoutMs, maxRetries, logger, stage = "ocr" } = input;

  let content;
  if (input.textOnlyPrompt) {
    content = [{ type: "text", text: input.textOnlyPrompt }];
  } else {
    let encodedImage;
    try {
      encodedImage = await encodeBase64(input.imagePath);
    } catch (err) {
      // A local file problem will not heal on retry and is not the key's fault.
      const reason = err instanceof Error ? err.message : String(err);
      logger?.log({
        level: "error",
        stage,
        event: "error",
        message: "\uC774\uBBF8\uC9C0 \uC77D\uAE30 \uC2E4\uD328",
        meta: { model },
        error: { code: "OCR_FAILED", message: reason }
      });
      throw new UnifiedOcrError("OCR_FAILED", stage, `\uC774\uBBF8\uC9C0 \uC77D\uAE30 \uC2E4\uD328: ${reason}`);
    }
    content = [
      { type: "text", text: input.prompt ?? OCR_PROMPT2 },
      {
        type: "image_url",
        image_url: { url: `data:image/png;base64,${encodedImage}` }
      }
    ];
  }
  // Serialize once; the body is identical for every attempt.
  const payload = JSON.stringify({
    model,
    messages: [{ role: "user", content }],
    max_tokens: maxTokens,
    temperature: 0
  });

  let attempt = 0;
  let lastErr = "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958";
  while (attempt < maxRetries) {
    attempt++;
    let acquired = null;
    try {
      acquired = keyPool.acquire();
    } catch (err) {
      if (err instanceof AllKeysCoolingDownError) {
        logger?.log({
          level: "warn",
          stage,
          event: "progress",
          message: "\uBAA8\uB4E0 API \uD0A4 cooldown \uC0C1\uD0DC\uB85C \uB300\uAE30",
          meta: { waitMs: err.waitMs, attempt, maxRetries, model }
        });
        await delay(err.waitMs);
        continue;
      }
      throw err;
    }
    try {
      logger?.log({
        level: "debug",
        stage,
        event: "progress",
        message: "NIM \uC694\uCCAD \uC2DC\uB3C4",
        meta: { attempt, maxRetries, model, keyId: acquired.keyId, hasImage: Boolean(input.imagePath) }
      });
      // The abort timer covers both the request and the reading of the body.
      const controller = new AbortController();
      const timer = setTimeout(() => controller.abort(), timeoutMs);
      try {
        const resp = await fetch(baseUrl, {
          method: "POST",
          headers: {
            Authorization: `Bearer ${acquired.key}`,
            "Content-Type": "application/json"
          },
          body: payload,
          signal: controller.signal
        });
        if (resp.ok) {
          const json = await resp.json();
          const text = json.choices?.[0]?.message?.content?.trim() ?? "";
          keyPool.markSuccess(acquired.keyId);
          logger?.log({
            level: "debug",
            stage,
            event: "done",
            message: "NIM \uC751\uB2F5 \uC131\uACF5",
            meta: { attempt, model, keyId: acquired.keyId }
          });
          if (!text) throw new UnifiedOcrError("OCR_FAILED", stage, "OCR \uC751\uB2F5\uC774 \uBE44\uC5B4 \uC788\uC2B5\uB2C8\uB2E4.");
          return stripCodeFence3(text);
        }
        // Only a numeric Retry-After (seconds) is honoured; anything else is ignored.
        const retryAfter = Number(resp.headers.get("retry-after") || "0");
        const retryAfterMs = Number.isFinite(retryAfter) && retryAfter > 0 ? retryAfter * 1e3 : void 0;
        keyPool.markFailure(acquired.keyId, { status: resp.status, retryAfterMs });
        lastErr = `NIM \uC751\uB2F5 \uC624\uB958: ${resp.status}`;
        logger?.log({
          level: "warn",
          stage,
          event: "progress",
          message: "NIM \uC751\uB2F5 \uC2E4\uD328",
          meta: { attempt, model, status: resp.status, retryAfterMs, keyId: acquired.keyId }
        });
      } finally {
        clearTimeout(timer);
      }
    } catch (err) {
      const isTimeout = err instanceof Error && err.name === "AbortError";
      if (acquired) keyPool.markFailure(acquired.keyId, { timeout: isTimeout });
      lastErr = err instanceof Error ? err.message : String(err);
      logger?.log({
        level: "warn",
        stage,
        event: "progress",
        message: "NIM \uC694\uCCAD \uC608\uC678",
        meta: { attempt, model, timeout: isTimeout, keyId: acquired?.keyId },
        error: { message: lastErr, name: err instanceof Error ? err.name : "Error" }
      });
      await delay(500);
    }
  }
  logger?.log({
    level: "error",
    stage,
    event: "error",
    message: "NIM \uCD5C\uB300 \uC7AC\uC2DC\uB3C4 \uCD08\uACFC",
    meta: { model, maxRetries },
    error: { code: "OCR_FAILED", message: lastErr }
  });
  throw new UnifiedOcrError("OCR_FAILED", stage, `OCR \uC7AC\uC2DC\uB3C4 \uCD08\uACFC: ${lastErr}`);
}
11850
/**
 * Reads a file and returns its contents encoded as base64.
 *
 * @param {string} path - File to read.
 * @returns {Promise<string>} Base64 text of the file's bytes.
 */
async function encodeBase64(path) {
  // Asking readFile for the "base64" encoding yields the same text as
  // reading a Buffer and calling toString("base64") on it.
  return await readFile(path, "base64");
}
11854
/**
 * Unwraps text that is entirely enclosed in one ``` fence (optionally tagged
 * `markdown` or `md`, case-insensitive) and trims the inner content.
 * Any other text is returned unchanged.
 *
 * @param {string} text - Model output.
 * @returns {string} Unwrapped content, or the original text.
 */
function stripCodeFence3(text) {
  const fenced = /^```(?:markdown|md)?\s*([\s\S]*?)```\s*$/i.exec(text);
  if (fenced === null) return text;
  const [, inner] = fenced;
  return inner.trim();
}
11858
/**
 * Resolves after `ms` milliseconds; returns without waiting when `ms` is
 * zero or negative.
 *
 * @param {number} ms - Time to wait.
 * @returns {Promise<void>}
 */
async function delay(ms) {
  if (!(ms <= 0)) {
    await new Promise((wake) => {
      setTimeout(wake, ms);
    });
  }
}
11862
/**
 * Guards the pipeline against unsupported input files.
 *
 * The extension is compared case-insensitively against
 * .pdf, .hwp, .hwpx, .docx and .xlsx.
 *
 * @param {string} path - Input file path.
 * @returns {void}
 * @throws {UnifiedOcrError} UNSUPPORTED_INPUT for any other extension.
 */
function ensureSupportedInput(path) {
  const supported = [".pdf", ".hwp", ".hwpx", ".docx", ".xlsx"];
  const ext = extname(path).toLowerCase();
  if (supported.includes(ext)) return;
  throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uC785\uB825 \uD3EC\uB9F7: ${ext}`);
}
11869
/**
 * Collects number-like tokens: a digit followed by any run of digits,
 * commas, dots, slashes and hyphens.
 *
 * @param {string} text - Text to scan.
 * @returns {string[]} Tokens in order of appearance; empty when none are found.
 */
function extractNumericTokens(text) {
  const numberLike = /\d[\d,./-]*/g;
  return Array.from(text.matchAll(numberLike), ([token]) => token);
}
11872
/**
 * Accepts a corrected text only when its numeric tokens are unchanged.
 *
 * The token sequences of both texts (from `extractNumericTokens`) must have
 * the same length and match position by position; otherwise the raw text
 * is kept.
 *
 * @param {string} rawText - Text before correction.
 * @param {string} correctedText - Text after correction.
 * @returns {string} `correctedText` when the numbers are intact, else `rawText`.
 */
function preserveNumericIntegrity(rawText, correctedText) {
  const before = extractNumericTokens(rawText);
  const after = extractNumericTokens(correctedText);
  const intact = before.length === after.length && before.every((token, index) => token === after[index]);
  return intact ? correctedText : rawText;
}
11881
/**
 * Appends a review notice when the corrected text looks truncated compared
 * with the raw text.
 *
 * The text is returned untouched when it already contains the notice marker
 * or when either text is blank after trimming. It counts as suspicious when
 * any of the following holds:
 *  - its trimmed length is below 75% of the raw length;
 *  - it has fewer non-empty lines than max(1, floor(80% of the raw count));
 *  - the raw text has at least two lines containing "|" and the corrected
 *    text has fewer than floor(70%) of that number.
 *
 * @param {string} rawText - Text before correction.
 * @param {string} correctedText - Text after correction.
 * @returns {string} `correctedText`, with the notice appended when suspicious.
 */
function addUncertainTag(rawText, correctedText) {
  const marker = "[\uD655\uC778\uD544\uC694:";
  if (correctedText.includes(marker)) return correctedText;

  const measure = (text) => {
    const rows = text.split("\n");
    return {
      chars: text.trim().length,
      lines: rows.filter(Boolean).length,
      tableRows: rows.filter((row) => row.includes("|")).length
    };
  };
  const raw = measure(rawText);
  const fixed = measure(correctedText);
  if (raw.chars === 0 || fixed.chars === 0) return correctedText;

  const shrunk = fixed.chars < raw.chars * 0.75;
  const lostLines = fixed.lines < Math.max(1, Math.floor(raw.lines * 0.8));
  const lostTable = raw.tableRows >= 2 && fixed.tableRows < Math.floor(raw.tableRows * 0.7);
  if (!(shrunk || lostLines || lostTable)) return correctedText;

  return `${correctedText}\n\n[\uD655\uC778\uD544\uC694: \uAD50\uC815 \uACB0\uACFC\uAC00 \uCD95\uC57D\uB418\uC5C8\uC744 \uC218 \uC788\uC5B4 \uC6D0\uBB38\uACFC \uB300\uC870\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4.]`;
}
11896
/**
 * Produces a compact before/after summary for one page.
 *
 * @param {string} before - Text before correction.
 * @param {string} after - Text after correction.
 * @returns {{changed: boolean, beforeLength: number, afterLength: number}}
 *   Whether the texts differ, plus both lengths.
 */
function buildDiffSummary(before, after) {
  const changed = before !== after;
  const { length: beforeLength } = before;
  const { length: afterLength } = after;
  return { changed, beforeLength, afterLength };
}
11903
/**
 * Turns any thrown value into a `UnifiedOcrError`.
 *
 * An existing `UnifiedOcrError` is returned as is. Anything else is wrapped
 * with the failure code of the stage it occurred in ("UNKNOWN" for a stage
 * without a dedicated code). The original value is attached as a
 * non-enumerable `cause`, so the real origin of the failure stays reachable
 * even though the wrapper's own stack starts here.
 *
 * @param {*} err - The thrown value.
 * @param {string} stage - Stage that was running when it was thrown.
 * @returns {UnifiedOcrError} The normalized error.
 */
function normalizePipelineError(err, stage) {
  if (err instanceof UnifiedOcrError) return err;
  const message = err instanceof Error ? err.message : String(err);
  // A Map avoids resolving inherited Object.prototype keys such as "constructor".
  const codeByStage = /* @__PURE__ */ new Map([
    ["convert", "CONVERT_FAILED"],
    ["render", "RENDER_FAILED"],
    ["probe", "PROBE_FAILED"],
    ["ocr", "OCR_FAILED"],
    ["proofread", "PROOFREAD_FAILED"],
    ["merge", "MERGE_FAILED"]
  ]);
  const wrapped = new UnifiedOcrError(codeByStage.get(stage) ?? "UNKNOWN", stage, message);
  if (wrapped.cause === undefined) {
    // Same attributes the Error constructor uses for its `cause` option.
    Object.defineProperty(wrapped, "cause", {
      value: err,
      writable: true,
      configurable: true,
      enumerable: false
    });
  }
  return wrapped;
}
11916
+
10604
11917
  // src/index.ts
10605
11918
  async function parse2(input, options) {
11919
+ const logger = createLoggerFromEnv().withRun(generateRunId("parse")).child({ component: "index.ts", stage: "detect" });
11920
+ logger.log({ level: "info", event: "start", message: "parse \uD638\uCD9C \uC2DC\uC791" });
10606
11921
  let buffer;
10607
11922
  if (typeof input === "string") {
10608
11923
  try {
10609
- const buf = await readFile(input);
11924
+ const buf = await readFile2(input);
10610
11925
  buffer = toArrayBuffer(buf);
10611
11926
  } catch (err) {
10612
11927
  const msg = err instanceof Error && "code" in err && err.code === "ENOENT" ? `\uD30C\uC77C\uC744 \uCC3E\uC744 \uC218 \uC5C6\uC2B5\uB2C8\uB2E4: ${input}` : `\uD30C\uC77C \uC77D\uAE30 \uC2E4\uD328: ${input}`;
11928
+ logger.log({
11929
+ level: "error",
11930
+ stage: "detect",
11931
+ event: "error",
11932
+ message: msg,
11933
+ error: { code: "PARSE_ERROR", message: msg, name: err instanceof Error ? err.name : "Error" }
11934
+ });
10613
11935
  return { success: false, fileType: "unknown", error: msg, code: "PARSE_ERROR" };
10614
11936
  }
10615
11937
  } else if (Buffer.isBuffer(input)) {
@@ -10618,13 +11940,23 @@ async function parse2(input, options) {
10618
11940
  buffer = input;
10619
11941
  }
10620
11942
  if (!buffer || buffer.byteLength === 0) {
11943
+ logger.log({ level: "error", stage: "detect", event: "error", message: "\uBE48 \uC785\uB825 \uBC84\uD37C", error: { code: "EMPTY_INPUT", message: "\uBE48 \uC785\uB825 \uBC84\uD37C", name: "KordocError" } });
10621
11944
  return { success: false, fileType: "unknown", error: "\uBE48 \uBC84\uD37C\uC774\uAC70\uB098 \uC720\uD6A8\uD558\uC9C0 \uC54A\uC740 \uC785\uB825\uC785\uB2C8\uB2E4.", code: "EMPTY_INPUT" };
10622
11945
  }
10623
11946
  const MAX_FILE_SIZE = 500 * 1024 * 1024;
10624
11947
  if (buffer.byteLength > MAX_FILE_SIZE) {
11948
+ logger.log({
11949
+ level: "error",
11950
+ stage: "detect",
11951
+ event: "error",
11952
+ message: "\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC",
11953
+ meta: { size: buffer.byteLength },
11954
+ error: { code: "FILE_TOO_LARGE", message: "\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC", name: "KordocError" }
11955
+ });
10625
11956
  return { success: false, fileType: "unknown", error: `\uD30C\uC77C \uD06C\uAE30 \uCD08\uACFC: ${(buffer.byteLength / 1024 / 1024).toFixed(1)}MB (\uCD5C\uB300 500MB)`, code: "FILE_TOO_LARGE" };
10626
11957
  }
10627
11958
  const format = detectFormat(buffer);
11959
+ logger.log({ level: "info", event: "done", message: "\uD3EC\uB9F7 \uAC10\uC9C0 \uC644\uB8CC", meta: { format } });
10628
11960
  switch (format) {
10629
11961
  case "hwpx": {
10630
11962
  const { format: zipFormat, zip } = await detectZipFormat(buffer);
@@ -10702,7 +12034,8 @@ async function parseHwpx(buffer, options, zip) {
10702
12034
  const { markdown, blocks, metadata, outline, warnings, images } = await parseHwpxDocument(buffer, options, zip);
10703
12035
  return { success: true, fileType: "hwpx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
10704
12036
  } catch (err) {
10705
- return { success: false, fileType: "hwpx", error: err instanceof Error ? err.message : "HWPX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
12037
+ const normalized = normalizeKordocError(err, "HWPX \uD30C\uC2F1 \uC2E4\uD328", "finalize");
12038
+ return { success: false, fileType: "hwpx", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
10706
12039
  }
10707
12040
  }
10708
12041
  async function parseHwp(buffer, options) {
@@ -10710,7 +12043,8 @@ async function parseHwp(buffer, options) {
10710
12043
  const { markdown, blocks, metadata, outline, warnings, images } = parseHwp5Document(Buffer.from(buffer), options);
10711
12044
  return { success: true, fileType: "hwp", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
10712
12045
  } catch (err) {
10713
- return { success: false, fileType: "hwp", error: err instanceof Error ? err.message : "HWP \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
12046
+ const normalized = normalizeKordocError(err, "HWP \uD30C\uC2F1 \uC2E4\uD328", "finalize");
12047
+ return { success: false, fileType: "hwp", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
10714
12048
  }
10715
12049
  }
10716
12050
  async function parsePdf(buffer, options) {
@@ -10718,8 +12052,15 @@ async function parsePdf(buffer, options) {
10718
12052
  const { markdown, blocks, metadata, outline, warnings, isImageBased } = await parsePdfDocument(buffer, options);
10719
12053
  return { success: true, fileType: "pdf", markdown, blocks, metadata, outline, warnings, isImageBased };
10720
12054
  } catch (err) {
12055
+ const normalized = normalizeKordocError(err, "PDF \uD30C\uC2F1 \uC2E4\uD328", "finalize");
10721
12056
  const isImageBased = err instanceof Error && "isImageBased" in err ? true : void 0;
10722
- return { success: false, fileType: "pdf", error: err instanceof Error ? err.message : "PDF \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err), isImageBased };
12057
+ return {
12058
+ success: false,
12059
+ fileType: "pdf",
12060
+ error: normalized.message,
12061
+ code: normalized.code ?? classifyError(normalized),
12062
+ isImageBased
12063
+ };
10723
12064
  }
10724
12065
  }
10725
12066
  async function parseXlsx(buffer, options, zip) {
@@ -10727,7 +12068,8 @@ async function parseXlsx(buffer, options, zip) {
10727
12068
  const { markdown, blocks, metadata, warnings } = await parseXlsxDocument(buffer, options, zip);
10728
12069
  return { success: true, fileType: "xlsx", markdown, blocks, metadata, warnings };
10729
12070
  } catch (err) {
10730
- return { success: false, fileType: "xlsx", error: err instanceof Error ? err.message : "XLSX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
12071
+ const normalized = normalizeKordocError(err, "XLSX \uD30C\uC2F1 \uC2E4\uD328", "finalize");
12072
+ return { success: false, fileType: "xlsx", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
10731
12073
  }
10732
12074
  }
10733
12075
  async function parseDocx(buffer, options, zip) {
@@ -10735,10 +12077,13 @@ async function parseDocx(buffer, options, zip) {
10735
12077
  const { markdown, blocks, metadata, outline, warnings, images } = await parseDocxDocument(buffer, options, zip);
10736
12078
  return { success: true, fileType: "docx", markdown, blocks, metadata, outline, warnings, images: images?.length ? images : void 0 };
10737
12079
  } catch (err) {
10738
- return { success: false, fileType: "docx", error: err instanceof Error ? err.message : "DOCX \uD30C\uC2F1 \uC2E4\uD328", code: classifyError(err) };
12080
+ const normalized = normalizeKordocError(err, "DOCX \uD30C\uC2F1 \uC2E4\uD328", "finalize");
12081
+ return { success: false, fileType: "docx", error: normalized.message, code: normalized.code ?? classifyError(normalized) };
10739
12082
  }
10740
12083
  }
10741
12084
  export {
12085
+ AllKeysCoolingDownError,
12086
+ ApiKeyRotationPool,
10742
12087
  VERSION,
10743
12088
  blocksToMarkdown,
10744
12089
  compare,
@@ -10757,7 +12102,8 @@ export {
10757
12102
  parseHwp,
10758
12103
  parseHwpx,
10759
12104
  parsePdf,
10760
- parseXlsx
12105
+ parseXlsx,
12106
+ runUnifiedOcrPipeline
10761
12107
  };
10762
12108
  /*! Bundled license information:
10763
12109