@clazic/kordoc 2.4.16 → 2.4.18

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
@@ -1,7 +1,7 @@
1
1
  #!/usr/bin/env node
2
2
 
3
3
  // src/utils.ts
4
- var VERSION = true ? "2.4.15" : "0.0.0-dev";
4
+ var VERSION = true ? "2.4.17" : "0.0.0-dev";
5
5
  function toArrayBuffer(buf) {
6
6
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
7
7
  return buf.buffer;
@@ -105,4 +105,4 @@ export {
105
105
  classifyError,
106
106
  normalizeKordocError
107
107
  };
108
- //# sourceMappingURL=chunk-QR27D67R.js.map
108
+ //# sourceMappingURL=chunk-W2KDIKDF.js.map
package/dist/cli.js CHANGED
@@ -4,12 +4,12 @@ import {
4
4
  markdownToHwpx,
5
5
  markdownToXlsx,
6
6
  parse
7
- } from "./chunk-RH6IBTHH.js";
7
+ } from "./chunk-T7EBS5XP.js";
8
8
  import "./chunk-YW5G6BCJ.js";
9
9
  import {
10
10
  VERSION,
11
11
  toArrayBuffer
12
- } from "./chunk-QR27D67R.js";
12
+ } from "./chunk-W2KDIKDF.js";
13
13
  import "./chunk-MOL7MDBG.js";
14
14
  import "./chunk-7FMKAV4P.js";
15
15
  import "./chunk-34WIGIQC.js";
@@ -177,7 +177,7 @@ async function runParse(files, opts) {
177
177
  saveImages(absPath);
178
178
  }
179
179
  } catch (err) {
180
- const { sanitizeError } = await import("./utils-HHJDSSR6.js");
180
+ const { sanitizeError } = await import("./utils-DHOODYKU.js");
181
181
  process.stderr.write(`
182
182
  [kordoc] ERROR: ${fileName} \u2014 ${sanitizeError(err)}
183
183
  `);
@@ -259,7 +259,7 @@ program.command("convert <input>").description("\uB9C8\uD06C\uB2E4\uC6B4 \uD30C\
259
259
  `));
260
260
  }
261
261
  } catch (err) {
262
- const { sanitizeError } = await import("./utils-HHJDSSR6.js");
262
+ const { sanitizeError } = await import("./utils-DHOODYKU.js");
263
263
  process.stderr.write(` FAIL
264
264
  `);
265
265
  process.stderr.write(` \u2192 ${sanitizeError(err)}
@@ -291,7 +291,7 @@ program.command("init-env").description("kordoc\uC6A9 .env \uD15C\uD50C\uB9BF \u
291
291
  }
292
292
  });
293
293
  program.command("watch <dir>").description("\uB514\uB809\uD1A0\uB9AC \uAC10\uC2DC \u2014 \uC0C8 \uBB38\uC11C \uC790\uB3D9 \uBCC0\uD658").option("--webhook <url>", "\uACB0\uACFC \uC804\uC1A1 \uC6F9\uD6C5 URL").option("-d, --out-dir <dir>", "\uBCC0\uD658 \uACB0\uACFC \uCD9C\uB825 \uB514\uB809\uD1A0\uB9AC").option("-p, --pages <range>", "\uD398\uC774\uC9C0/\uC139\uC158 \uBC94\uC704").option("--format <type>", "\uCD9C\uB825 \uD615\uC2DD: markdown \uB610\uB294 json", "markdown").option("--silent", "\uC9C4\uD589 \uBA54\uC2DC\uC9C0 \uC228\uAE30\uAE30").action(async (dir, opts) => {
294
- const { watchDirectory } = await import("./watch-YAILKKKP.js");
294
+ const { watchDirectory } = await import("./watch-YGIU7RN7.js");
295
295
  await watchDirectory({
296
296
  dir,
297
297
  outDir: opts.outDir,
package/dist/index.cjs CHANGED
@@ -3138,7 +3138,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
3138
3138
  var import_xmldom = require("@xmldom/xmldom");
3139
3139
 
3140
3140
  // src/utils.ts
3141
- var VERSION = true ? "2.4.15" : "0.0.0-dev";
3141
+ var VERSION = true ? "2.4.17" : "0.0.0-dev";
3142
3142
  function toArrayBuffer(buf) {
3143
3143
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
3144
3144
  return buf.buffer;
@@ -11369,17 +11369,39 @@ var DEFAULT_STAGE_WEIGHTS = {
11369
11369
  render: 20,
11370
11370
  probe: 5,
11371
11371
  ocr: 45,
11372
- proofread: 10,
11372
+ proofread: 0,
11373
11373
  merge: 5
11374
11374
  };
11375
- var OCR_PROMPT2 = "\uC774 \uC774\uBBF8\uC9C0 1\uC7A5\uC758 \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uC694\uC57D \uC5C6\uC774 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uCD94\uCD9C\uD558\uC138\uC694. \uC808\uB300\uB85C \uB0B4\uC6A9\uC744 \uCD94\uCE21\uD558\uAC70\uB098 \uBC14\uAFB8\uC9C0 \uB9C8\uC138\uC694.";
11376
- var PROOFREAD_PROMPT = [
11377
- "\uC544\uB798 Markdown\uC744 \uBE44\uD30C\uAD34 \uAD50\uC815\uB9CC \uC218\uD589\uD558\uC138\uC694.",
11378
- "\uADDC\uCE59:",
11379
- "- \uC0AC\uC2E4 \uCD94\uAC00/\uC0AD\uC81C/\uCD94\uCE21 \uAE08\uC9C0",
11380
- "- \uC22B\uC790, \uB2E8\uC704, \uACE0\uC720\uBA85\uC0AC \uBCC0\uACBD \uAE08\uC9C0",
11381
- "- \uC624\uD0C8\uC790, \uB744\uC5B4\uC4F0\uAE30, \uC904\uBC14\uAFC8, Markdown \uAD6C\uC870\uB9CC \uAD50\uC815",
11382
- "- \uACB0\uACFC\uB294 Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825"
11375
+ var OCR_PROMPT2 = [
11376
+ "\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uCD94\uCD9C\uD558\uC5EC Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, OCR \uC624\uC778\uC2DD \uC624\uB958\uB97C \uC989\uC2DC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uBB3C\uC744 \uCD9C\uB825\uD558\uC138\uC694.",
11377
+ "",
11378
+ "\uCD94\uCD9C \uADDC\uCE59:",
11379
+ "- \uD14D\uC2A4\uD2B8, \uD45C, \uC81C\uBAA9, \uB9AC\uC2A4\uD2B8\uB97C \uC6D0\uBB38 \uAD6C\uC870 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uBCC0\uD658",
11380
+ "- \uD45C\uB294 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)",
11381
+ "- \uD5E4\uB529\uC740 \uC2DC\uAC01\uC801 \uD06C\uAE30\uC5D0 \uB530\uB77C # ~ ###### \uC0AC\uC6A9",
11382
+ "- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9",
11383
+ "- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC",
11384
+ "- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0",
11385
+ "",
11386
+ "\uC808\uB300 \uAE08\uC9C0 \uC0AC\uD56D:",
11387
+ "- \uBB38\uC7A5\xB7\uB2E8\uB77D\xB7\uD56D\uBAA9\uC744 \uCD94\uAC00\uD558\uAC70\uB098 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83",
11388
+ "- \uC22B\uC790, \uD37C\uC13C\uD2B8, \uB0A0\uC9DC, \uB2E8\uC704, \uAE08\uC561\uC744 \uC808\uB300 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
11389
+ "- \uACE0\uC720\uBA85\uC0AC, \uAE30\uAD00\uBA85, \uBC95\uB839\uBA85, \uC9C0\uBA85\uC744 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
11390
+ "- \uD45C\uC758 \uC81C\uBAA9\uC744 \uBCC0\uACBD \uB610\uB294 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83",
11391
+ "- \uD45C\uC758 \uD589\xB7\uC5F4 \uC218, \uC140 \uB0B4\uC6A9, \uD5E4\uB354\uB97C \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
11392
+ "- \uC81C\uBAA9 \uC218\uC900(#, ##, ### \uB4F1)\uC744 \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83",
11393
+ "- \uC6D0\uBB38\uC5D0 \uC5C6\uB294 \uB0B4\uC6A9\uC744 \uC694\uC57D\xB7\uBCF4\uC644\xB7\uCD94\uB860\uD558\uC9C0 \uB9D0 \uAC83",
11394
+ "- ` ``` `\uB85C \uAC10\uC2F8\uAC70\uB098 \uC124\uBA85 \uD14D\uC2A4\uD2B8\uB97C \uCD94\uAC00\uD558\uC9C0 \uB9D0 \uAC83",
11395
+ "",
11396
+ "\uD5C8\uC6A9\uB418\uB294 \uAD50\uC815 \uBC94\uC704 (OCR \uC624\uC778\uC2DD \uC218\uC815):",
11397
+ "- \uBA85\uBC31\uD55C \uAE00\uC790 \uC624\uC778\uC2DD \uC218\uC815 (\uC608: '0' \u2192 'O', 'l' \u2192 '1' \uB4F1 \uB9E5\uB77D\uC0C1 \uBA85\uD655\uD55C \uACBD\uC6B0\uB9CC)",
11398
+ "- \uB2E8\uC5B4 \uC911\uAC04\uC5D0 \uC798\uBABB \uC0BD\uC785\uB41C \uACF5\uBC31 \uC81C\uAC70",
11399
+ "- \uC904\uBC14\uAFC8 \uC624\uB958\uB85C \uBD84\uB9AC\uB41C \uBB38\uC7A5 \uBCD1\uD569 (\uC758\uBBF8 \uB2E8\uC704 \uAE30\uC900)",
11400
+ "- Markdown \uBB38\uBC95 \uC624\uB958 \uC218\uC815 (\uD45C \uAD6C\uBD84\uC120 \uB204\uB77D, \uB9AC\uC2A4\uD2B8 \uB4E4\uC5EC\uC4F0\uAE30 \uB4F1)",
11401
+ "",
11402
+ "\uCD9C\uB825 \uADDC\uCE59:",
11403
+ "- \uBCC0\uD658\uB41C Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825\uD560 \uAC83 (\uC124\uBA85, \uC8FC\uC11D, \uBA54\uD0C0 \uD14D\uC2A4\uD2B8 \uC5C6\uC774)",
11404
+ "- \uD655\uC2E4\uD558\uC9C0 \uC54A\uC73C\uBA74 \uC6D0\uBB38\uC744 \uADF8\uB300\uB85C \uC720\uC9C0\uD560 \uAC83"
11383
11405
  ].join("\n");
11384
11406
  function elapsedMs(startAt) {
11385
11407
  return Math.round(import_node_perf_hooks.performance.now() - startAt);
@@ -11390,7 +11412,6 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11390
11412
  const workspaceDir = (0, import_path5.resolve)(options.workspaceDir ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}_ocr_workspace`));
11391
11413
  const imagesDir = (0, import_path5.join)(workspaceDir, "images");
11392
11414
  const rawDir = (0, import_path5.join)(workspaceDir, "ocr", "raw");
11393
- const proofDir = (0, import_path5.join)(workspaceDir, "ocr", "proofread");
11394
11415
  const diffDir = (0, import_path5.join)(workspaceDir, "ocr", "diff");
11395
11416
  const outputPath = (0, import_path5.resolve)(options.outputPath ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}.md`));
11396
11417
  const reportPath = (0, import_path5.join)(workspaceDir, "run-report.json");
@@ -11410,11 +11431,10 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11410
11431
  const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
11411
11432
  await (0, import_promises2.mkdir)(imagesDir, { recursive: true });
11412
11433
  await (0, import_promises2.mkdir)(rawDir, { recursive: true });
11413
- await (0, import_promises2.mkdir)(proofDir, { recursive: true });
11414
11434
  await (0, import_promises2.mkdir)(diffDir, { recursive: true });
11415
11435
  const timingsMs = {};
11416
11436
  const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
11417
- const markStageProgress = (stage, stagePercent, current, total, message) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message });
11437
+ const markStageProgress = (stage, stagePercent, current, total, message, model) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message, model });
11418
11438
  const markStageDone = (stage, message) => emitProgress(options.onEvent, stage, 100, stageWeights, { message, type: "stage_done" });
11419
11439
  let currentStage = "convert";
11420
11440
  const logStage = (level, stage, event, message, meta) => {
@@ -11519,56 +11539,17 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11519
11539
  const pagePath = (0, import_path5.join)(rawDir, `page_${String(i + 1).padStart(4, "0")}.md`);
11520
11540
  await (0, import_promises2.writeFile)(pagePath, markdown, "utf-8");
11521
11541
  rawPagePaths.push(pagePath);
11522
- markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}`);
11542
+ markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}`, selectedModel);
11523
11543
  logStage("debug", "ocr", "progress", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { page: i + 1, total: images.length });
11524
11544
  }
11525
11545
  timingsMs.ocr = elapsedMs(ocrStart);
11526
11546
  markStageDone("ocr", "OCR \uC644\uB8CC");
11527
11547
  logStage("info", "ocr", "done", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { elapsedMs: timingsMs.ocr });
11528
- const proofStart = import_node_perf_hooks.performance.now();
11529
- currentStage = "proofread";
11530
- markStageStart("proofread", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC9C4\uD589 \uC911");
11531
- logStage("info", "proofread", "start", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC2DC\uC791", { pages: rawPagePaths.length });
11532
- const proofedPaths = [];
11533
- for (let i = 0; i < rawPagePaths.length; i++) {
11534
- const rawMd = await (0, import_promises2.readFile)(rawPagePaths[i], "utf-8");
11535
- const prompt = `${PROOFREAD_PROMPT}
11536
-
11537
- ---
11538
- ${rawMd}
11539
- ---`;
11540
- const corrected = await ocrImageViaNim({
11541
- textOnlyPrompt: prompt,
11542
- model: selectedModel,
11543
- maxTokens: modelMaxTokens[selectedModel] ?? 8192,
11544
- baseUrl,
11545
- keyPool,
11546
- timeoutMs,
11547
- maxRetries: maxRetriesPerPage,
11548
- logger,
11549
- stage: "proofread"
11550
- });
11551
- const safeCorrected = preserveNumericIntegrity(rawMd, corrected);
11552
- const taggedCorrected = addUncertainTag(rawMd, safeCorrected);
11553
- const pagePath = (0, import_path5.join)(proofDir, `page_${String(i + 1).padStart(4, "0")}.md`);
11554
- await (0, import_promises2.writeFile)(pagePath, taggedCorrected, "utf-8");
11555
- await (0, import_promises2.writeFile)(
11556
- (0, import_path5.join)(diffDir, `page_${String(i + 1).padStart(4, "0")}.json`),
11557
- JSON.stringify(buildDiffSummary(rawMd, taggedCorrected), null, 2),
11558
- "utf-8"
11559
- );
11560
- proofedPaths.push(pagePath);
11561
- markStageProgress("proofread", Math.round((i + 1) / rawPagePaths.length * 100), i + 1, rawPagePaths.length, `\uAD50\uC815 ${i + 1}/${rawPagePaths.length}`);
11562
- logStage("debug", "proofread", "progress", "\uD398\uC774\uC9C0 \uAD50\uC815 \uC644\uB8CC", { page: i + 1, total: rawPagePaths.length });
11563
- }
11564
- timingsMs.proofread = elapsedMs(proofStart);
11565
- markStageDone("proofread", "\uAD50\uC815 \uC644\uB8CC");
11566
- logStage("info", "proofread", "done", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC644\uB8CC", { elapsedMs: timingsMs.proofread });
11567
11548
  const mergeStart = import_node_perf_hooks.performance.now();
11568
11549
  currentStage = "merge";
11569
11550
  markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
11570
- logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: proofedPaths.length });
11571
- const merged = await mergeMarkdownPages(proofedPaths);
11551
+ logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
11552
+ const merged = await mergeMarkdownPages(rawPagePaths);
11572
11553
  await (0, import_promises2.writeFile)(outputPath, merged, "utf-8");
11573
11554
  timingsMs.merge = elapsedMs(mergeStart);
11574
11555
  markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
@@ -11643,7 +11624,8 @@ function emitProgress(cb, stage, stagePercent, weights, extra) {
11643
11624
  current: extra.current,
11644
11625
  total: extra.total,
11645
11626
  code: extra.code,
11646
- message: extra.message
11627
+ message: extra.message,
11628
+ model: extra.model
11647
11629
  });
11648
11630
  }
11649
11631
  async function convertWithLibreOffice(buffer, ext) {
@@ -12026,40 +12008,6 @@ function ensureSupportedInput(path) {
12026
12008
  throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uC785\uB825 \uD3EC\uB9F7: ${ext}`);
12027
12009
  }
12028
12010
  }
12029
- function extractNumericTokens(text) {
12030
- return text.match(/\d[\d,./-]*/g) ?? [];
12031
- }
12032
- function preserveNumericIntegrity(rawText, correctedText) {
12033
- const rawTokens = extractNumericTokens(rawText);
12034
- const correctedTokens = extractNumericTokens(correctedText);
12035
- if (rawTokens.length !== correctedTokens.length) return rawText;
12036
- for (let i = 0; i < rawTokens.length; i++) {
12037
- if (rawTokens[i] !== correctedTokens[i]) return rawText;
12038
- }
12039
- return correctedText;
12040
- }
12041
- function addUncertainTag(rawText, correctedText) {
12042
- if (correctedText.includes("[\uD655\uC778\uD544\uC694:")) return correctedText;
12043
- const rawLen = rawText.trim().length;
12044
- const corrLen = correctedText.trim().length;
12045
- if (rawLen === 0 || corrLen === 0) return correctedText;
12046
- const rawLines = rawText.split("\n").filter(Boolean).length;
12047
- const corrLines = correctedText.split("\n").filter(Boolean).length;
12048
- const rawTableLines = rawText.split("\n").filter((l) => l.includes("|")).length;
12049
- const corrTableLines = correctedText.split("\n").filter((l) => l.includes("|")).length;
12050
- const suspicious = corrLen < rawLen * 0.75 || corrLines < Math.max(1, Math.floor(rawLines * 0.8)) || rawTableLines >= 2 && corrTableLines < Math.floor(rawTableLines * 0.7);
12051
- if (!suspicious) return correctedText;
12052
- return `${correctedText}
12053
-
12054
- [\uD655\uC778\uD544\uC694: \uAD50\uC815 \uACB0\uACFC\uAC00 \uCD95\uC57D\uB418\uC5C8\uC744 \uC218 \uC788\uC5B4 \uC6D0\uBB38\uACFC \uB300\uC870\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4.]`;
12055
- }
12056
- function buildDiffSummary(before, after) {
12057
- return {
12058
- changed: before !== after,
12059
- beforeLength: before.length,
12060
- afterLength: after.length
12061
- };
12062
- }
12063
12011
  function normalizePipelineError(err, stage) {
12064
12012
  if (err instanceof UnifiedOcrError) return err;
12065
12013
  const message = err instanceof Error ? err.message : String(err);