@clazic/kordoc 2.4.15 → 2.4.17
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-YHPNDX7A.js → chunk-W2KDIKDF.js} +2 -2
- package/dist/{chunk-ZER7GYXK.js → chunk-WM3XI23V.js} +8 -8
- package/dist/{chunk-ZER7GYXK.js.map → chunk-WM3XI23V.js.map} +1 -1
- package/dist/cli.js +5 -5
- package/dist/index.cjs +95 -14
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +95 -14
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +2 -2
- package/dist/{utils-ZQA6RCXN.js → utils-DHOODYKU.js} +2 -2
- package/dist/{watch-ULSOWHFE.js → watch-RM4VNOL4.js} +3 -3
- package/package.json +1 -1
- /package/dist/{chunk-YHPNDX7A.js.map → chunk-W2KDIKDF.js.map} +0 -0
- /package/dist/{utils-ZQA6RCXN.js.map → utils-DHOODYKU.js.map} +0 -0
- /package/dist/{watch-ULSOWHFE.js.map → watch-RM4VNOL4.js.map} +0 -0
package/dist/index.d.cts
CHANGED
package/dist/index.d.ts
CHANGED
package/dist/index.js
CHANGED
|
@@ -3115,7 +3115,7 @@ import JSZip2 from "jszip";
|
|
|
3115
3115
|
import { DOMParser } from "@xmldom/xmldom";
|
|
3116
3116
|
|
|
3117
3117
|
// src/utils.ts
|
|
3118
|
-
var VERSION = true ? "2.4.
|
|
3118
|
+
var VERSION = true ? "2.4.17" : "0.0.0-dev";
|
|
3119
3119
|
function toArrayBuffer(buf) {
|
|
3120
3120
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
3121
3121
|
return buf.buffer;
|
|
@@ -11349,14 +11349,14 @@ var DEFAULT_STAGE_WEIGHTS = {
|
|
|
11349
11349
|
proofread: 10,
|
|
11350
11350
|
merge: 5
|
|
11351
11351
|
};
|
|
11352
|
-
var OCR_PROMPT2 = "
|
|
11352
|
+
var OCR_PROMPT2 = "Extract all text and tables from this image exactly as-is into Markdown. Do not summarize, infer, or alter the content in any way.";
|
|
11353
11353
|
var PROOFREAD_PROMPT = [
|
|
11354
|
-
"
|
|
11355
|
-
"
|
|
11356
|
-
"-
|
|
11357
|
-
"-
|
|
11358
|
-
"-
|
|
11359
|
-
"-
|
|
11354
|
+
"Perform non-destructive proofreading only on the Markdown below.",
|
|
11355
|
+
"Rules:",
|
|
11356
|
+
"- Do not add, remove, or infer any facts",
|
|
11357
|
+
"- Do not change numbers, units, or proper nouns",
|
|
11358
|
+
"- Correct only typos, spacing, line breaks, and Markdown structure",
|
|
11359
|
+
"- Output the corrected Markdown body only"
|
|
11360
11360
|
].join("\n");
|
|
11361
11361
|
function elapsedMs(startAt) {
|
|
11362
11362
|
return Math.round(performance.now() - startAt);
|
|
@@ -11391,7 +11391,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11391
11391
|
await mkdir(diffDir, { recursive: true });
|
|
11392
11392
|
const timingsMs = {};
|
|
11393
11393
|
const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
|
|
11394
|
-
const markStageProgress = (stage, stagePercent, current, total, message) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message });
|
|
11394
|
+
const markStageProgress = (stage, stagePercent, current, total, message, model) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message, model });
|
|
11395
11395
|
const markStageDone = (stage, message) => emitProgress(options.onEvent, stage, 100, stageWeights, { message, type: "stage_done" });
|
|
11396
11396
|
let currentStage = "convert";
|
|
11397
11397
|
const logStage = (level, stage, event, message, meta) => {
|
|
@@ -11418,13 +11418,32 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11418
11418
|
currentStage = "render";
|
|
11419
11419
|
markStageStart("render", "PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC911");
|
|
11420
11420
|
logStage("info", "render", "start", "PDF \uD398\uC774\uC9C0 \uB80C\uB354\uB9C1 \uC2DC\uC791", { pdf: workingPdfPath, dpi });
|
|
11421
|
-
|
|
11421
|
+
const renderWithProgress = await renderPdfToPngWithProgress(
|
|
11422
|
+
workingPdfPath,
|
|
11423
|
+
join4(imagesDir, "page"),
|
|
11424
|
+
dpi,
|
|
11425
|
+
(current, total) => {
|
|
11426
|
+
markStageProgress(
|
|
11427
|
+
"render",
|
|
11428
|
+
Math.round(current / total * 100),
|
|
11429
|
+
current,
|
|
11430
|
+
total,
|
|
11431
|
+
`\uD398\uC774\uC9C0 ${current}/${total} \uB80C\uB354\uB9C1`
|
|
11432
|
+
);
|
|
11433
|
+
}
|
|
11434
|
+
);
|
|
11422
11435
|
const images = await listPageImages(imagesDir);
|
|
11423
11436
|
if (images.length === 0) throw new UnifiedOcrError("RENDER_FAILED", "render", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC2E4\uD328: \uACB0\uACFC \uC774\uBBF8\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
|
|
11424
|
-
|
|
11437
|
+
if (!renderWithProgress.emittedPerPageProgress) {
|
|
11438
|
+
markStageProgress("render", 100, images.length, images.length, `\uD398\uC774\uC9C0 ${images.length}\uC7A5 \uC0DD\uC131`);
|
|
11439
|
+
}
|
|
11425
11440
|
timingsMs.render = elapsedMs(renderStart);
|
|
11426
11441
|
markStageDone("render", "\uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC");
|
|
11427
|
-
logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", {
|
|
11442
|
+
logStage("info", "render", "done", "\uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0 \uC0DD\uC131 \uC644\uB8CC", {
|
|
11443
|
+
pages: images.length,
|
|
11444
|
+
elapsedMs: timingsMs.render,
|
|
11445
|
+
pageCountSource: renderWithProgress.pageCountSource
|
|
11446
|
+
});
|
|
11428
11447
|
const probeStart = performance.now();
|
|
11429
11448
|
currentStage = "probe";
|
|
11430
11449
|
markStageStart("probe", "\uBAA8\uB378 \uC18D\uB3C4 \uD504\uB85C\uBE0C \uC218\uD589 \uC911");
|
|
@@ -11477,7 +11496,7 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11477
11496
|
const pagePath = join4(rawDir, `page_${String(i + 1).padStart(4, "0")}.md`);
|
|
11478
11497
|
await writeFile(pagePath, markdown, "utf-8");
|
|
11479
11498
|
rawPagePaths.push(pagePath);
|
|
11480
|
-
markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}
|
|
11499
|
+
markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}`, selectedModel);
|
|
11481
11500
|
logStage("debug", "ocr", "progress", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { page: i + 1, total: images.length });
|
|
11482
11501
|
}
|
|
11483
11502
|
timingsMs.ocr = elapsedMs(ocrStart);
|
|
@@ -11601,7 +11620,8 @@ function emitProgress(cb, stage, stagePercent, weights, extra) {
|
|
|
11601
11620
|
current: extra.current,
|
|
11602
11621
|
total: extra.total,
|
|
11603
11622
|
code: extra.code,
|
|
11604
|
-
message: extra.message
|
|
11623
|
+
message: extra.message,
|
|
11624
|
+
model: extra.model
|
|
11605
11625
|
});
|
|
11606
11626
|
}
|
|
11607
11627
|
async function convertWithLibreOffice(buffer, ext) {
|
|
@@ -11622,6 +11642,49 @@ async function renderPdfToPng(pdfPath, prefixPath, dpi) {
|
|
|
11622
11642
|
throw new UnifiedOcrError("RENDER_FAILED", "render", err instanceof Error ? err.message : String(err));
|
|
11623
11643
|
}
|
|
11624
11644
|
}
|
|
11645
|
+
/**
 * Determines how many pages a PDF has by running `pdfinfo` on it and reading
 * the "Pages:" line of the command's standard output.
 *
 * @param {string} pdfPath - Path of the PDF, passed to `pdfinfo` as its only argument.
 * @returns {Promise<number>} The positive page count reported by `pdfinfo`.
 * @throws {Error} If the output has no "Pages:" line, or the parsed value is
 *   not a positive finite number. Rejections from running the command itself
 *   propagate unchanged.
 */
async function getPdfPageCount(pdfPath) {
  const report = await runCommandWithStdout("pdfinfo", [pdfPath]);

  // Multiline + case-insensitive: the "Pages:" entry is matched as a whole line of the report.
  const pagesLine = report.match(/^\s*Pages:\s*(\d+)\s*$/mi);
  if (pagesLine === null) {
    throw new Error("pdfinfo \uCD9C\uB825\uC5D0\uC11C \uD398\uC774\uC9C0 \uC218\uB97C \uCC3E\uC9C0 \uBABB\uD588\uC2B5\uB2C8\uB2E4.");
  }

  const [, rawCount] = pagesLine;
  const pageCount = Number(rawCount);
  const isUsableCount = Number.isFinite(pageCount) && pageCount > 0;
  if (!isUsableCount) {
    throw new Error(`\uC798\uBABB\uB41C \uD398\uC774\uC9C0 \uC218: ${rawCount}`);
  }

  return pageCount;
}
|
|
11657
|
+
/**
 * Renders a PDF to PNG files, reporting progress page by page when the page
 * count can be determined up front.
 *
 * When `getPdfPageCount` succeeds, `pdftoppm` is invoked once per page
 * (`-f N -l N`) and `onPageDone(page, totalPages)` is called after each
 * successful invocation. When the page count cannot be obtained, the whole
 * document is rendered through `renderPdfToPng` and `onPageDone` is never called.
 *
 * @param {string} pdfPath - PDF to render.
 * @param {string} prefixPath - Output prefix handed to `pdftoppm` as its last argument.
 * @param {number} dpi - Resolution passed to `pdftoppm -r` (stringified).
 * @param {(page: number, totalPages: number) => void} onPageDone - Per-page progress callback.
 * @returns {Promise<{emittedPerPageProgress: boolean, pageCountSource: string}>}
 *   `{ true, "pdfinfo" }` for the per-page path, `{ false, "fallback" }` otherwise.
 * @throws {UnifiedOcrError} `RENDER_FAILED` when anything inside the per-page
 *   loop throws (the command or the callback). Errors raised by
 *   `renderPdfToPng` on the fallback path propagate as that function throws them.
 */
async function renderPdfToPngWithProgress(pdfPath, prefixPath, dpi, onPageDone) {
  // Best effort: any failure to read the page count just selects the fallback path below.
  let pageTotal;
  try {
    pageTotal = await getPdfPageCount(pdfPath);
  } catch {
    pageTotal = 0;
  }

  if (!(pageTotal > 0)) {
    await renderPdfToPng(pdfPath, prefixPath, dpi);
    return { emittedPerPageProgress: false, pageCountSource: "fallback" };
  }

  try {
    let pageNo = 0;
    while (pageNo < pageTotal) {
      pageNo += 1;
      // First page and last page are the same, so exactly one page is rendered per call.
      const pdftoppmArgs = [
        "-png",
        "-r",
        String(dpi),
        "-f",
        String(pageNo),
        "-l",
        String(pageNo),
        pdfPath,
        prefixPath
      ];
      await runCommand("pdftoppm", pdftoppmArgs);
      onPageDone(pageNo, pageTotal);
    }
  } catch (err) {
    const reason = err instanceof Error ? err.message : String(err);
    throw new UnifiedOcrError("RENDER_FAILED", "render", reason);
  }

  return { emittedPerPageProgress: true, pageCountSource: "pdfinfo" };
}
|
|
11625
11688
|
async function runCommand(cmd, args) {
|
|
11626
11689
|
await new Promise((resolvePromise, reject) => {
|
|
11627
11690
|
const child = spawn2(cmd, args, { stdio: "pipe" });
|
|
@@ -11636,6 +11699,24 @@ async function runCommand(cmd, args) {
|
|
|
11636
11699
|
});
|
|
11637
11700
|
});
|
|
11638
11701
|
}
|
|
11702
|
+
/**
 * Runs an external command and resolves with everything it wrote to stdout.
 *
 * @param {string} cmd - Executable to spawn.
 * @param {string[]} args - Arguments passed to the executable.
 * @returns {Promise<string>} The complete stdout text once the process closes with exit code 0.
 * @throws {Error} Rejects with the spawn error if the process cannot be started,
 *   or with an Error containing the exit code and trimmed stderr when the exit
 *   code is not 0.
 */
async function runCommandWithStdout(cmd, args) {
  return await new Promise((resolvePromise, reject) => {
    const child = spawn2(cmd, args, { stdio: "pipe" });
    let stdout = "";
    let stderr = "";
    // Decode at the stream level so a multi-byte UTF-8 character that is split
    // across two chunks is reassembled instead of turning into U+FFFD, which is
    // what happens when every Buffer chunk is stringified on its own.
    child.stdout.setEncoding("utf8");
    child.stderr.setEncoding("utf8");
    child.stdout.on("data", (d) => {
      stdout += d;
    });
    child.stderr.on("data", (d) => {
      stderr += d;
    });
    child.on("error", reject);
    child.on("close", (code) => {
      if (code === 0) resolvePromise(stdout);
      else reject(new Error(`${cmd} \uC2E4\uD328 (code=${code}): ${stderr.trim()}`));
    });
  });
}
|
|
11639
11720
|
async function assertSofficeAvailable() {
|
|
11640
11721
|
try {
|
|
11641
11722
|
await runCommand("soffice", ["--version"]);
|