@clazic/kordoc 2.4.17 → 2.4.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-WM3XI23V.js → chunk-T7EBS5XP.js} +31 -8
- package/dist/{chunk-WM3XI23V.js.map → chunk-T7EBS5XP.js.map} +1 -1
- package/dist/cli.js +2 -2
- package/dist/index.cjs +33 -86
- package/dist/index.cjs.map +1 -1
- package/dist/index.js +33 -86
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +1 -1
- package/dist/{watch-RM4VNOL4.js → watch-YGIU7RN7.js} +2 -2
- package/package.json +1 -1
- package/dist/{watch-RM4VNOL4.js.map → watch-YGIU7RN7.js.map} +0 -0
package/dist/cli.js
CHANGED
|
@@ -4,7 +4,7 @@ import {
|
|
|
4
4
|
markdownToHwpx,
|
|
5
5
|
markdownToXlsx,
|
|
6
6
|
parse
|
|
7
|
-
} from "./chunk-WM3XI23V.js";
|
|
7
|
+
} from "./chunk-T7EBS5XP.js";
|
|
8
8
|
import "./chunk-YW5G6BCJ.js";
|
|
9
9
|
import {
|
|
10
10
|
VERSION,
|
|
@@ -291,7 +291,7 @@ program.command("init-env").description("kordoc\uC6A9 .env \uD15C\uD50C\uB9BF \u
|
|
|
291
291
|
}
|
|
292
292
|
});
|
|
293
293
|
program.command("watch <dir>").description("\uB514\uB809\uD1A0\uB9AC \uAC10\uC2DC \u2014 \uC0C8 \uBB38\uC11C \uC790\uB3D9 \uBCC0\uD658").option("--webhook <url>", "\uACB0\uACFC \uC804\uC1A1 \uC6F9\uD6C5 URL").option("-d, --out-dir <dir>", "\uBCC0\uD658 \uACB0\uACFC \uCD9C\uB825 \uB514\uB809\uD1A0\uB9AC").option("-p, --pages <range>", "\uD398\uC774\uC9C0/\uC139\uC158 \uBC94\uC704").option("--format <type>", "\uCD9C\uB825 \uD615\uC2DD: markdown \uB610\uB294 json", "markdown").option("--silent", "\uC9C4\uD589 \uBA54\uC2DC\uC9C0 \uC228\uAE30\uAE30").action(async (dir, opts) => {
|
|
294
|
-
const { watchDirectory } = await import("./watch-RM4VNOL4.js");
|
|
294
|
+
const { watchDirectory } = await import("./watch-YGIU7RN7.js");
|
|
295
295
|
await watchDirectory({
|
|
296
296
|
dir,
|
|
297
297
|
outDir: opts.outDir,
|
package/dist/index.cjs
CHANGED
|
@@ -11369,17 +11369,39 @@ var DEFAULT_STAGE_WEIGHTS = {
|
|
|
11369
11369
|
render: 20,
|
|
11370
11370
|
probe: 5,
|
|
11371
11371
|
ocr: 45,
|
|
11372
|
-
proofread:
|
|
11372
|
+
proofread: 0,
|
|
11373
11373
|
merge: 5
|
|
11374
11374
|
};
|
|
11375
|
-
var OCR_PROMPT2 =
|
|
11376
|
-
|
|
11377
|
-
"
|
|
11378
|
-
"
|
|
11379
|
-
"-
|
|
11380
|
-
"-
|
|
11381
|
-
"-
|
|
11382
|
-
"-
|
|
11375
|
+
var OCR_PROMPT2 = [
|
|
11376
|
+
"\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uCD94\uCD9C\uD558\uC5EC Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, OCR \uC624\uC778\uC2DD \uC624\uB958\uB97C \uC989\uC2DC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uBB3C\uC744 \uCD9C\uB825\uD558\uC138\uC694.",
|
|
11377
|
+
"",
|
|
11378
|
+
"\uCD94\uCD9C \uADDC\uCE59:",
|
|
11379
|
+
"- \uD14D\uC2A4\uD2B8, \uD45C, \uC81C\uBAA9, \uB9AC\uC2A4\uD2B8\uB97C \uC6D0\uBB38 \uAD6C\uC870 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uBCC0\uD658",
|
|
11380
|
+
"- \uD45C\uB294 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)",
|
|
11381
|
+
"- \uD5E4\uB529\uC740 \uC2DC\uAC01\uC801 \uD06C\uAE30\uC5D0 \uB530\uB77C # ~ ###### \uC0AC\uC6A9",
|
|
11382
|
+
"- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9",
|
|
11383
|
+
"- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC",
|
|
11384
|
+
"- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0",
|
|
11385
|
+
"",
|
|
11386
|
+
"\uC808\uB300 \uAE08\uC9C0 \uC0AC\uD56D:",
|
|
11387
|
+
"- \uBB38\uC7A5\xB7\uB2E8\uB77D\xB7\uD56D\uBAA9\uC744 \uCD94\uAC00\uD558\uAC70\uB098 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11388
|
+
"- \uC22B\uC790, \uD37C\uC13C\uD2B8, \uB0A0\uC9DC, \uB2E8\uC704, \uAE08\uC561\uC744 \uC808\uB300 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11389
|
+
"- \uACE0\uC720\uBA85\uC0AC, \uAE30\uAD00\uBA85, \uBC95\uB839\uBA85, \uC9C0\uBA85\uC744 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11390
|
+
"- \uD45C\uC758 \uC81C\uBAA9\uC744 \uBCC0\uACBD \uB610\uB294 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11391
|
+
"- \uD45C\uC758 \uD589\xB7\uC5F4 \uC218, \uC140 \uB0B4\uC6A9, \uD5E4\uB354\uB97C \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11392
|
+
"- \uC81C\uBAA9 \uC218\uC900(#, ##, ### \uB4F1)\uC744 \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83",
|
|
11393
|
+
"- \uC6D0\uBB38\uC5D0 \uC5C6\uB294 \uB0B4\uC6A9\uC744 \uC694\uC57D\xB7\uBCF4\uC644\xB7\uCD94\uB860\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11394
|
+
"- ` ``` `\uB85C \uAC10\uC2F8\uAC70\uB098 \uC124\uBA85 \uD14D\uC2A4\uD2B8\uB97C \uCD94\uAC00\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11395
|
+
"",
|
|
11396
|
+
"\uD5C8\uC6A9\uB418\uB294 \uAD50\uC815 \uBC94\uC704 (OCR \uC624\uC778\uC2DD \uC218\uC815):",
|
|
11397
|
+
"- \uBA85\uBC31\uD55C \uAE00\uC790 \uC624\uC778\uC2DD \uC218\uC815 (\uC608: '0' \u2192 'O', 'l' \u2192 '1' \uB4F1 \uB9E5\uB77D\uC0C1 \uBA85\uD655\uD55C \uACBD\uC6B0\uB9CC)",
|
|
11398
|
+
"- \uB2E8\uC5B4 \uC911\uAC04\uC5D0 \uC798\uBABB \uC0BD\uC785\uB41C \uACF5\uBC31 \uC81C\uAC70",
|
|
11399
|
+
"- \uC904\uBC14\uAFC8 \uC624\uB958\uB85C \uBD84\uB9AC\uB41C \uBB38\uC7A5 \uBCD1\uD569 (\uC758\uBBF8 \uB2E8\uC704 \uAE30\uC900)",
|
|
11400
|
+
"- Markdown \uBB38\uBC95 \uC624\uB958 \uC218\uC815 (\uD45C \uAD6C\uBD84\uC120 \uB204\uB77D, \uB9AC\uC2A4\uD2B8 \uB4E4\uC5EC\uC4F0\uAE30 \uB4F1)",
|
|
11401
|
+
"",
|
|
11402
|
+
"\uCD9C\uB825 \uADDC\uCE59:",
|
|
11403
|
+
"- \uBCC0\uD658\uB41C Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825\uD560 \uAC83 (\uC124\uBA85, \uC8FC\uC11D, \uBA54\uD0C0 \uD14D\uC2A4\uD2B8 \uC5C6\uC774)",
|
|
11404
|
+
"- \uD655\uC2E4\uD558\uC9C0 \uC54A\uC73C\uBA74 \uC6D0\uBB38\uC744 \uADF8\uB300\uB85C \uC720\uC9C0\uD560 \uAC83"
|
|
11383
11405
|
].join("\n");
|
|
11384
11406
|
function elapsedMs(startAt) {
|
|
11385
11407
|
return Math.round(import_node_perf_hooks.performance.now() - startAt);
|
|
@@ -11390,7 +11412,6 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11390
11412
|
const workspaceDir = (0, import_path5.resolve)(options.workspaceDir ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}_ocr_workspace`));
|
|
11391
11413
|
const imagesDir = (0, import_path5.join)(workspaceDir, "images");
|
|
11392
11414
|
const rawDir = (0, import_path5.join)(workspaceDir, "ocr", "raw");
|
|
11393
|
-
const proofDir = (0, import_path5.join)(workspaceDir, "ocr", "proofread");
|
|
11394
11415
|
const diffDir = (0, import_path5.join)(workspaceDir, "ocr", "diff");
|
|
11395
11416
|
const outputPath = (0, import_path5.resolve)(options.outputPath ?? (0, import_path5.join)((0, import_path5.dirname)(absInput), `${stem}.md`));
|
|
11396
11417
|
const reportPath = (0, import_path5.join)(workspaceDir, "run-report.json");
|
|
@@ -11410,7 +11431,6 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11410
11431
|
const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
|
|
11411
11432
|
await (0, import_promises2.mkdir)(imagesDir, { recursive: true });
|
|
11412
11433
|
await (0, import_promises2.mkdir)(rawDir, { recursive: true });
|
|
11413
|
-
await (0, import_promises2.mkdir)(proofDir, { recursive: true });
|
|
11414
11434
|
await (0, import_promises2.mkdir)(diffDir, { recursive: true });
|
|
11415
11435
|
const timingsMs = {};
|
|
11416
11436
|
const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
|
|
@@ -11525,50 +11545,11 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11525
11545
|
timingsMs.ocr = elapsedMs(ocrStart);
|
|
11526
11546
|
markStageDone("ocr", "OCR \uC644\uB8CC");
|
|
11527
11547
|
logStage("info", "ocr", "done", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { elapsedMs: timingsMs.ocr });
|
|
11528
|
-
const proofStart = import_node_perf_hooks.performance.now();
|
|
11529
|
-
currentStage = "proofread";
|
|
11530
|
-
markStageStart("proofread", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC9C4\uD589 \uC911");
|
|
11531
|
-
logStage("info", "proofread", "start", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC2DC\uC791", { pages: rawPagePaths.length });
|
|
11532
|
-
const proofedPaths = [];
|
|
11533
|
-
for (let i = 0; i < rawPagePaths.length; i++) {
|
|
11534
|
-
const rawMd = await (0, import_promises2.readFile)(rawPagePaths[i], "utf-8");
|
|
11535
|
-
const prompt = `${PROOFREAD_PROMPT}
|
|
11536
|
-
|
|
11537
|
-
---
|
|
11538
|
-
${rawMd}
|
|
11539
|
-
---`;
|
|
11540
|
-
const corrected = await ocrImageViaNim({
|
|
11541
|
-
textOnlyPrompt: prompt,
|
|
11542
|
-
model: selectedModel,
|
|
11543
|
-
maxTokens: modelMaxTokens[selectedModel] ?? 8192,
|
|
11544
|
-
baseUrl,
|
|
11545
|
-
keyPool,
|
|
11546
|
-
timeoutMs,
|
|
11547
|
-
maxRetries: maxRetriesPerPage,
|
|
11548
|
-
logger,
|
|
11549
|
-
stage: "proofread"
|
|
11550
|
-
});
|
|
11551
|
-
const safeCorrected = preserveNumericIntegrity(rawMd, corrected);
|
|
11552
|
-
const taggedCorrected = addUncertainTag(rawMd, safeCorrected);
|
|
11553
|
-
const pagePath = (0, import_path5.join)(proofDir, `page_${String(i + 1).padStart(4, "0")}.md`);
|
|
11554
|
-
await (0, import_promises2.writeFile)(pagePath, taggedCorrected, "utf-8");
|
|
11555
|
-
await (0, import_promises2.writeFile)(
|
|
11556
|
-
(0, import_path5.join)(diffDir, `page_${String(i + 1).padStart(4, "0")}.json`),
|
|
11557
|
-
JSON.stringify(buildDiffSummary(rawMd, taggedCorrected), null, 2),
|
|
11558
|
-
"utf-8"
|
|
11559
|
-
);
|
|
11560
|
-
proofedPaths.push(pagePath);
|
|
11561
|
-
markStageProgress("proofread", Math.round((i + 1) / rawPagePaths.length * 100), i + 1, rawPagePaths.length, `\uAD50\uC815 ${i + 1}/${rawPagePaths.length}`);
|
|
11562
|
-
logStage("debug", "proofread", "progress", "\uD398\uC774\uC9C0 \uAD50\uC815 \uC644\uB8CC", { page: i + 1, total: rawPagePaths.length });
|
|
11563
|
-
}
|
|
11564
|
-
timingsMs.proofread = elapsedMs(proofStart);
|
|
11565
|
-
markStageDone("proofread", "\uAD50\uC815 \uC644\uB8CC");
|
|
11566
|
-
logStage("info", "proofread", "done", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC644\uB8CC", { elapsedMs: timingsMs.proofread });
|
|
11567
11548
|
const mergeStart = import_node_perf_hooks.performance.now();
|
|
11568
11549
|
currentStage = "merge";
|
|
11569
11550
|
markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
|
|
11570
|
-
logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: proofedPaths.length });
|
|
11571
|
-
const merged = await mergeMarkdownPages(proofedPaths);
|
|
11551
|
+
logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
|
|
11552
|
+
const merged = await mergeMarkdownPages(rawPagePaths);
|
|
11572
11553
|
await (0, import_promises2.writeFile)(outputPath, merged, "utf-8");
|
|
11573
11554
|
timingsMs.merge = elapsedMs(mergeStart);
|
|
11574
11555
|
markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
|
|
@@ -12027,40 +12008,6 @@ function ensureSupportedInput(path) {
|
|
|
12027
12008
|
throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uC785\uB825 \uD3EC\uB9F7: ${ext}`);
|
|
12028
12009
|
}
|
|
12029
12010
|
}
|
|
12030
|
-
function extractNumericTokens(text) {
|
|
12031
|
-
return text.match(/\d[\d,./-]*/g) ?? [];
|
|
12032
|
-
}
|
|
12033
|
-
function preserveNumericIntegrity(rawText, correctedText) {
|
|
12034
|
-
const rawTokens = extractNumericTokens(rawText);
|
|
12035
|
-
const correctedTokens = extractNumericTokens(correctedText);
|
|
12036
|
-
if (rawTokens.length !== correctedTokens.length) return rawText;
|
|
12037
|
-
for (let i = 0; i < rawTokens.length; i++) {
|
|
12038
|
-
if (rawTokens[i] !== correctedTokens[i]) return rawText;
|
|
12039
|
-
}
|
|
12040
|
-
return correctedText;
|
|
12041
|
-
}
|
|
12042
|
-
function addUncertainTag(rawText, correctedText) {
|
|
12043
|
-
if (correctedText.includes("[\uD655\uC778\uD544\uC694:")) return correctedText;
|
|
12044
|
-
const rawLen = rawText.trim().length;
|
|
12045
|
-
const corrLen = correctedText.trim().length;
|
|
12046
|
-
if (rawLen === 0 || corrLen === 0) return correctedText;
|
|
12047
|
-
const rawLines = rawText.split("\n").filter(Boolean).length;
|
|
12048
|
-
const corrLines = correctedText.split("\n").filter(Boolean).length;
|
|
12049
|
-
const rawTableLines = rawText.split("\n").filter((l) => l.includes("|")).length;
|
|
12050
|
-
const corrTableLines = correctedText.split("\n").filter((l) => l.includes("|")).length;
|
|
12051
|
-
const suspicious = corrLen < rawLen * 0.75 || corrLines < Math.max(1, Math.floor(rawLines * 0.8)) || rawTableLines >= 2 && corrTableLines < Math.floor(rawTableLines * 0.7);
|
|
12052
|
-
if (!suspicious) return correctedText;
|
|
12053
|
-
return `${correctedText}
|
|
12054
|
-
|
|
12055
|
-
[\uD655\uC778\uD544\uC694: \uAD50\uC815 \uACB0\uACFC\uAC00 \uCD95\uC57D\uB418\uC5C8\uC744 \uC218 \uC788\uC5B4 \uC6D0\uBB38\uACFC \uB300\uC870\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4.]`;
|
|
12056
|
-
}
|
|
12057
|
-
function buildDiffSummary(before, after) {
|
|
12058
|
-
return {
|
|
12059
|
-
changed: before !== after,
|
|
12060
|
-
beforeLength: before.length,
|
|
12061
|
-
afterLength: after.length
|
|
12062
|
-
};
|
|
12063
|
-
}
|
|
12064
12011
|
function normalizePipelineError(err, stage) {
|
|
12065
12012
|
if (err instanceof UnifiedOcrError) return err;
|
|
12066
12013
|
const message = err instanceof Error ? err.message : String(err);
|