@clazic/kordoc 2.4.15 → 2.4.17

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/cli.js CHANGED
@@ -4,12 +4,12 @@ import {
4
4
  markdownToHwpx,
5
5
  markdownToXlsx,
6
6
  parse
7
- } from "./chunk-ZER7GYXK.js";
7
+ } from "./chunk-WM3XI23V.js";
8
8
  import "./chunk-YW5G6BCJ.js";
9
9
  import {
10
10
  VERSION,
11
11
  toArrayBuffer
12
- } from "./chunk-YHPNDX7A.js";
12
+ } from "./chunk-W2KDIKDF.js";
13
13
  import "./chunk-MOL7MDBG.js";
14
14
  import "./chunk-7FMKAV4P.js";
15
15
  import "./chunk-34WIGIQC.js";
@@ -177,7 +177,7 @@ async function runParse(files, opts) {
177
177
  saveImages(absPath);
178
178
  }
179
179
  } catch (err) {
180
- const { sanitizeError } = await import("./utils-ZQA6RCXN.js");
180
+ const { sanitizeError } = await import("./utils-DHOODYKU.js");
181
181
  process.stderr.write(`
182
182
  [kordoc] ERROR: ${fileName} \u2014 ${sanitizeError(err)}
183
183
  `);
@@ -259,7 +259,7 @@ program.command("convert <input>").description("\uB9C8\uD06C\uB2E4\uC6B4 \uD30C\
259
259
  `));
260
260
  }
261
261
  } catch (err) {
262
- const { sanitizeError } = await import("./utils-ZQA6RCXN.js");
262
+ const { sanitizeError } = await import("./utils-DHOODYKU.js");
263
263
  process.stderr.write(` FAIL
264
264
  `);
265
265
  process.stderr.write(` \u2192 ${sanitizeError(err)}
@@ -291,7 +291,7 @@ program.command("init-env").description("kordoc\uC6A9 .env \uD15C\uD50C\uB9BF \u
291
291
  }
292
292
  });
293
293
  program.command("watch <dir>").description("\uB514\uB809\uD1A0\uB9AC \uAC10\uC2DC \u2014 \uC0C8 \uBB38\uC11C \uC790\uB3D9 \uBCC0\uD658").option("--webhook <url>", "\uACB0\uACFC \uC804\uC1A1 \uC6F9\uD6C5 URL").option("-d, --out-dir <dir>", "\uBCC0\uD658 \uACB0\uACFC \uCD9C\uB825 \uB514\uB809\uD1A0\uB9AC").option("-p, --pages <range>", "\uD398\uC774\uC9C0/\uC139\uC158 \uBC94\uC704").option("--format <type>", "\uCD9C\uB825 \uD615\uC2DD: markdown \uB610\uB294 json", "markdown").option("--silent", "\uC9C4\uD589 \uBA54\uC2DC\uC9C0 \uC228\uAE30\uAE30").action(async (dir, opts) => {
294
- const { watchDirectory } = await import("./watch-ULSOWHFE.js");
294
+ const { watchDirectory } = await import("./watch-RM4VNOL4.js");
295
295
  await watchDirectory({
296
296
  dir,
297
297
  outDir: opts.outDir,
package/dist/index.cjs CHANGED
@@ -3138,7 +3138,7 @@ var import_jszip2 = __toESM(require("jszip"), 1);
3138
3138
  var import_xmldom = require("@xmldom/xmldom");
3139
3139
 
3140
3140
  // src/utils.ts
3141
- var VERSION = true ? "2.4.14" : "0.0.0-dev";
3141
+ var VERSION = true ? "2.4.17" : "0.0.0-dev";
3142
3142
  function toArrayBuffer(buf) {
3143
3143
  if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
3144
3144
  return buf.buffer;
@@ -11372,14 +11372,14 @@ var DEFAULT_STAGE_WEIGHTS = {
11372
11372
  proofread: 10,
11373
11373
  merge: 5
11374
11374
  };
11375
- var OCR_PROMPT2 = "\uC774 \uC774\uBBF8\uC9C0 1\uC7A5\uC758 \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uC694\uC57D \uC5C6\uC774 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uCD94\uCD9C\uD558\uC138\uC694. \uC808\uB300\uB85C \uB0B4\uC6A9\uC744 \uCD94\uCE21\uD558\uAC70\uB098 \uBC14\uAFB8\uC9C0 \uB9C8\uC138\uC694.";
11375
+ var OCR_PROMPT2 = "Extract all text and tables from this image exactly as-is into Markdown. Do not summarize, infer, or alter the content in any way.";
11376
11376
  var PROOFREAD_PROMPT = [
11377
- "\uC544\uB798 Markdown\uC744 \uBE44\uD30C\uAD34 \uAD50\uC815\uB9CC \uC218\uD589\uD558\uC138\uC694.",
11378
- "\uADDC\uCE59:",
11379
- "- \uC0AC\uC2E4 \uCD94\uAC00/\uC0AD\uC81C/\uCD94\uCE21 \uAE08\uC9C0",
11380
- "- \uC22B\uC790, \uB2E8\uC704, \uACE0\uC720\uBA85\uC0AC \uBCC0\uACBD \uAE08\uC9C0",
11381
- "- \uC624\uD0C8\uC790, \uB744\uC5B4\uC4F0\uAE30, \uC904\uBC14\uAFC8, Markdown \uAD6C\uC870\uB9CC \uAD50\uC815",
11382
- "- \uACB0\uACFC\uB294 Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825"
11377
+ "Perform non-destructive proofreading only on the Markdown below.",
11378
+ "Rules:",
11379
+ "- Do not add, remove, or infer any facts",
11380
+ "- Do not change numbers, units, or proper nouns",
11381
+ "- Correct only typos, spacing, line breaks, and Markdown structure",
11382
+ "- Output the corrected Markdown body only"
11383
11383
  ].join("\n");
11384
11384
  function elapsedMs(startAt) {
11385
11385
  return Math.round(import_node_perf_hooks.performance.now() - startAt);
@@ -11414,7 +11414,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11414
11414
  await (0, import_promises2.mkdir)(diffDir, { recursive: true });
11415
11415
  const timingsMs = {};
11416
11416
  const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
11417
- const markStageProgress = (stage, stagePercent, current, total, message) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message });
11417
+ const markStageProgress = (stage, stagePercent, current, total, message, model) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message, model });
11418
11418
  const markStageDone = (stage, message) => emitProgress(options.onEvent, stage, 100, stageWeights, { message, type: "stage_done" });
11419
11419
  let currentStage = "convert";
11420
11420
  const logStage = (level, stage, event, message, meta) => {
@@ -11441,13 +11441,32 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11441
11441
  currentStage = "render";
11442
11442
  markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
11443
11443
  logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi });
11444
- await renderPdfToPng(workingPdfPath, (0, import_path5.join)(imagesDir, "page"), dpi);
11444
+ const renderWithProgress = await renderPdfToPngWithProgress(
11445
+ workingPdfPath,
11446
+ (0, import_path5.join)(imagesDir, "page"),
11447
+ dpi,
11448
+ (current, total) => {
11449
+ markStageProgress(
11450
+ "render",
11451
+ Math.round(current / total * 100),
11452
+ current,
11453
+ total,
11454
+ `\uD398\uC774\uC9C0 ${current}/${total} \uB80C\uB354\uB9C1`
11455
+ );
11456
+ }
11457
+ );
11445
11458
  const images = await listPageImages(imagesDir);
11446
11459
  if (images.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328: \uACB0\uACFC \uC774\uBBF8\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
11447
- markStageProgress("render", 100, images.length, images.length, `\uD398\uC774\uC9C0 ${images.length}\uC7A5 \uC0DD\uC131`);
11460
+ if (!renderWithProgress.emittedPerPageProgress) {
11461
+ markStageProgress("render", 100, images.length, images.length, `\uD398\uC774\uC9C0 ${images.length}\uC7A5 \uC0DD\uC131`);
11462
+ }
11448
11463
  timingsMs.render = elapsedMs(renderStart);
11449
11464
  markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
11450
- logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", { pages: images.length, elapsedMs: timingsMs.render });
11465
+ logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", {
11466
+ pages: images.length,
11467
+ elapsedMs: timingsMs.render,
11468
+ pageCountSource: renderWithProgress.pageCountSource
11469
+ });
11451
11470
  const probeStart = import_node_perf_hooks.performance.now();
11452
11471
  currentStage = "probe";
11453
11472
  markStageStart("probe", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC218\uD589 \uC911");
@@ -11500,7 +11519,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
11500
11519
  const pagePath = (0, import_path5.join)(rawDir, `page_${String(i + 1).padStart(4, "0")}.md`);
11501
11520
  await (0, import_promises2.writeFile)(pagePath, markdown, "utf-8");
11502
11521
  rawPagePaths.push(pagePath);
11503
- markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}`);
11522
+ markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}`, selectedModel);
11504
11523
  logStage("debug", "ocr", "progress", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { page: i + 1, total: images.length });
11505
11524
  }
11506
11525
  timingsMs.ocr = elapsedMs(ocrStart);
@@ -11624,7 +11643,8 @@ function emitProgress(cb, stage, stagePercent, weights, extra) {
11624
11643
  current: extra.current,
11625
11644
  total: extra.total,
11626
11645
  code: extra.code,
11627
- message: extra.message
11646
+ message: extra.message,
11647
+ model: extra.model
11628
11648
  });
11629
11649
  }
11630
11650
  async function convertWithLibreOffice(buffer, ext) {
@@ -11645,6 +11665,49 @@ async function renderPdfToPng(pdfPath, prefixPath, dpi) {
11645
11665
  throw new UnifiedOcrError("RENDER_FAILED", "render", err instanceof Error ? err.message : String(err));
11646
11666
  }
11647
11667
  }
11668
+ async function getPdfPageCount(pdfPath) {
11669
+ const stdout = await runCommandWithStdout("pdfinfo", [pdfPath]);
11670
+ const m = stdout.match(/^\s*Pages:\s*(\d+)\s*$/mi);
11671
+ if (!m) {
11672
+ throw new Error("pdfinfo \uCD9C\uB825\uC5D0\uC11C \uD398\uC774\uC9C0 \uC218\uB97C \uCC3E\uC9C0 \uBABB\uD588\uC2B5\uB2C8\uB2E4.");
11673
+ }
11674
+ const n = Number(m[1]);
11675
+ if (!Number.isFinite(n) || n <= 0) {
11676
+ throw new Error(`\uC798\uBABB\uB41C \uD398\uC774\uC9C0 \uC218: ${m[1]}`);
11677
+ }
11678
+ return n;
11679
+ }
11680
+ async function renderPdfToPngWithProgress(pdfPath, prefixPath, dpi, onPageDone) {
11681
+ let totalPages = 0;
11682
+ try {
11683
+ totalPages = await getPdfPageCount(pdfPath);
11684
+ } catch {
11685
+ totalPages = 0;
11686
+ }
11687
+ if (totalPages > 0) {
11688
+ try {
11689
+ for (let page = 1; page <= totalPages; page++) {
11690
+ await runCommand("pdftoppm", [
11691
+ "-png",
11692
+ "-r",
11693
+ String(dpi),
11694
+ "-f",
11695
+ String(page),
11696
+ "-l",
11697
+ String(page),
11698
+ pdfPath,
11699
+ prefixPath
11700
+ ]);
11701
+ onPageDone(page, totalPages);
11702
+ }
11703
+ return { emittedPerPageProgress: true, pageCountSource: "pdfinfo" };
11704
+ } catch (err) {
11705
+ throw new UnifiedOcrError("RENDER_FAILED", "render", err instanceof Error ? err.message : String(err));
11706
+ }
11707
+ }
11708
+ await renderPdfToPng(pdfPath, prefixPath, dpi);
11709
+ return { emittedPerPageProgress: false, pageCountSource: "fallback" };
11710
+ }
11648
11711
  async function runCommand(cmd, args) {
11649
11712
  await new Promise((resolvePromise, reject) => {
11650
11713
  const child = (0, import_child_process4.spawn)(cmd, args, { stdio: "pipe" });
@@ -11659,6 +11722,24 @@ async function runCommand(cmd, args) {
11659
11722
  });
11660
11723
  });
11661
11724
  }
11725
+ async function runCommandWithStdout(cmd, args) {
11726
+ return await new Promise((resolvePromise, reject) => {
11727
+ const child = (0, import_child_process4.spawn)(cmd, args, { stdio: "pipe" });
11728
+ let stdout = "";
11729
+ let stderr = "";
11730
+ child.stdout.on("data", (d) => {
11731
+ stdout += String(d);
11732
+ });
11733
+ child.stderr.on("data", (d) => {
11734
+ stderr += String(d);
11735
+ });
11736
+ child.on("error", reject);
11737
+ child.on("close", (code) => {
11738
+ if (code === 0) resolvePromise(stdout);
11739
+ else reject(new Error(`${cmd} \uC2E4\uD328 (code=${code}): ${stderr.trim()}`));
11740
+ });
11741
+ });
11742
+ }
11662
11743
  async function assertSofficeAvailable() {
11663
11744
  try {
11664
11745
  await runCommand("soffice", ["--version"]);