@clazic/kordoc 2.4.16 → 2.4.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/{chunk-RH6IBTHH.js → chunk-T7EBS5XP.js} +32 -9
- package/dist/{chunk-RH6IBTHH.js.map → chunk-T7EBS5XP.js.map} +1 -1
- package/dist/{chunk-QR27D67R.js → chunk-W2KDIKDF.js} +2 -2
- package/dist/cli.js +5 -5
- package/dist/index.cjs +38 -90
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +1 -0
- package/dist/index.d.ts +1 -0
- package/dist/index.js +38 -90
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +2 -2
- package/dist/{utils-HHJDSSR6.js → utils-DHOODYKU.js} +2 -2
- package/dist/{watch-YAILKKKP.js → watch-YGIU7RN7.js} +3 -3
- package/package.json +1 -1
- /package/dist/{chunk-QR27D67R.js.map → chunk-W2KDIKDF.js.map} +0 -0
- /package/dist/{utils-HHJDSSR6.js.map → utils-DHOODYKU.js.map} +0 -0
- /package/dist/{watch-YAILKKKP.js.map → watch-YGIU7RN7.js.map} +0 -0
package/dist/index.d.cts
CHANGED
package/dist/index.d.ts
CHANGED
package/dist/index.js
CHANGED
|
@@ -3115,7 +3115,7 @@ import JSZip2 from "jszip";
|
|
|
3115
3115
|
import { DOMParser } from "@xmldom/xmldom";
|
|
3116
3116
|
|
|
3117
3117
|
// src/utils.ts
|
|
3118
|
-
var VERSION = true ? "2.4.
|
|
3118
|
+
var VERSION = true ? "2.4.17" : "0.0.0-dev";
|
|
3119
3119
|
function toArrayBuffer(buf) {
|
|
3120
3120
|
if (buf.byteOffset === 0 && buf.byteLength === buf.buffer.byteLength) {
|
|
3121
3121
|
return buf.buffer;
|
|
@@ -11346,17 +11346,39 @@ var DEFAULT_STAGE_WEIGHTS = {
|
|
|
11346
11346
|
render: 20,
|
|
11347
11347
|
probe: 5,
|
|
11348
11348
|
ocr: 45,
|
|
11349
|
-
proofread:
|
|
11349
|
+
proofread: 0,
|
|
11350
11350
|
merge: 5
|
|
11351
11351
|
};
|
|
11352
|
-
var OCR_PROMPT2 =
|
|
11353
|
-
|
|
11354
|
-
"
|
|
11355
|
-
"\uADDC\uCE59:",
|
|
11356
|
-
"- \
|
|
11357
|
-
"- \
|
|
11358
|
-
"- \
|
|
11359
|
-
"- \
|
|
11352
|
+
var OCR_PROMPT2 = [
|
|
11353
|
+
"\uC774 PDF \uD398\uC774\uC9C0 \uC774\uBBF8\uC9C0\uC5D0\uC11C \uD14D\uC2A4\uD2B8\uC640 \uD45C\uB97C \uCD94\uCD9C\uD558\uC5EC Markdown\uC73C\uB85C \uBCC0\uD658\uD558\uACE0, OCR \uC624\uC778\uC2DD \uC624\uB958\uB97C \uC989\uC2DC \uAD50\uC815\uD558\uC5EC \uCD5C\uC885 \uACB0\uACFC\uBB3C\uC744 \uCD9C\uB825\uD558\uC138\uC694.",
|
|
11354
|
+
"",
|
|
11355
|
+
"\uCD94\uCD9C \uADDC\uCE59:",
|
|
11356
|
+
"- \uD14D\uC2A4\uD2B8, \uD45C, \uC81C\uBAA9, \uB9AC\uC2A4\uD2B8\uB97C \uC6D0\uBB38 \uAD6C\uC870 \uADF8\uB300\uB85C Markdown\uC73C\uB85C \uBCC0\uD658",
|
|
11357
|
+
"- \uD45C\uB294 Markdown \uD14C\uC774\uBE14 \uBB38\uBC95 \uC0AC\uC6A9 (| \uAD6C\uBD84, |---|---| \uD5E4\uB354 \uAD6C\uBD84\uC120 \uD3EC\uD568)",
|
|
11358
|
+
"- \uD5E4\uB529\uC740 \uC2DC\uAC01\uC801 \uD06C\uAE30\uC5D0 \uB530\uB77C # ~ ###### \uC0AC\uC6A9",
|
|
11359
|
+
"- \uB9AC\uC2A4\uD2B8\uB294 - \uB610\uB294 1. \uC0AC\uC6A9",
|
|
11360
|
+
"- \uC774\uBBF8\uC9C0, \uB3C4\uD615 \uB4F1 \uBE44\uD14D\uC2A4\uD2B8 \uC694\uC18C\uB294 \uBB34\uC2DC",
|
|
11361
|
+
"- \uC6D0\uBB38\uC758 \uC77D\uAE30 \uC21C\uC11C\uC640 \uAD6C\uC870\uB97C \uC720\uC9C0",
|
|
11362
|
+
"",
|
|
11363
|
+
"\uC808\uB300 \uAE08\uC9C0 \uC0AC\uD56D:",
|
|
11364
|
+
"- \uBB38\uC7A5\xB7\uB2E8\uB77D\xB7\uD56D\uBAA9\uC744 \uCD94\uAC00\uD558\uAC70\uB098 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11365
|
+
"- \uC22B\uC790, \uD37C\uC13C\uD2B8, \uB0A0\uC9DC, \uB2E8\uC704, \uAE08\uC561\uC744 \uC808\uB300 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11366
|
+
"- \uACE0\uC720\uBA85\uC0AC, \uAE30\uAD00\uBA85, \uBC95\uB839\uBA85, \uC9C0\uBA85\uC744 \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11367
|
+
"- \uD45C\uC758 \uC81C\uBAA9\uC744 \uBCC0\uACBD \uB610\uB294 \uC0AD\uC81C\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11368
|
+
"- \uD45C\uC758 \uD589\xB7\uC5F4 \uC218, \uC140 \uB0B4\uC6A9, \uD5E4\uB354\uB97C \uBCC0\uACBD\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11369
|
+
"- \uC81C\uBAA9 \uC218\uC900(#, ##, ### \uB4F1)\uC744 \uC784\uC758\uB85C \uBC14\uAFB8\uC9C0 \uB9D0 \uAC83",
|
|
11370
|
+
"- \uC6D0\uBB38\uC5D0 \uC5C6\uB294 \uB0B4\uC6A9\uC744 \uC694\uC57D\xB7\uBCF4\uC644\xB7\uCD94\uB860\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11371
|
+
"- ` ``` `\uB85C \uAC10\uC2F8\uAC70\uB098 \uC124\uBA85 \uD14D\uC2A4\uD2B8\uB97C \uCD94\uAC00\uD558\uC9C0 \uB9D0 \uAC83",
|
|
11372
|
+
"",
|
|
11373
|
+
"\uD5C8\uC6A9\uB418\uB294 \uAD50\uC815 \uBC94\uC704 (OCR \uC624\uC778\uC2DD \uC218\uC815):",
|
|
11374
|
+
"- \uBA85\uBC31\uD55C \uAE00\uC790 \uC624\uC778\uC2DD \uC218\uC815 (\uC608: '0' \u2192 'O', 'l' \u2192 '1' \uB4F1 \uB9E5\uB77D\uC0C1 \uBA85\uD655\uD55C \uACBD\uC6B0\uB9CC)",
|
|
11375
|
+
"- \uB2E8\uC5B4 \uC911\uAC04\uC5D0 \uC798\uBABB \uC0BD\uC785\uB41C \uACF5\uBC31 \uC81C\uAC70",
|
|
11376
|
+
"- \uC904\uBC14\uAFC8 \uC624\uB958\uB85C \uBD84\uB9AC\uB41C \uBB38\uC7A5 \uBCD1\uD569 (\uC758\uBBF8 \uB2E8\uC704 \uAE30\uC900)",
|
|
11377
|
+
"- Markdown \uBB38\uBC95 \uC624\uB958 \uC218\uC815 (\uD45C \uAD6C\uBD84\uC120 \uB204\uB77D, \uB9AC\uC2A4\uD2B8 \uB4E4\uC5EC\uC4F0\uAE30 \uB4F1)",
|
|
11378
|
+
"",
|
|
11379
|
+
"\uCD9C\uB825 \uADDC\uCE59:",
|
|
11380
|
+
"- \uBCC0\uD658\uB41C Markdown \uBCF8\uBB38\uB9CC \uCD9C\uB825\uD560 \uAC83 (\uC124\uBA85, \uC8FC\uC11D, \uBA54\uD0C0 \uD14D\uC2A4\uD2B8 \uC5C6\uC774)",
|
|
11381
|
+
"- \uD655\uC2E4\uD558\uC9C0 \uC54A\uC73C\uBA74 \uC6D0\uBB38\uC744 \uADF8\uB300\uB85C \uC720\uC9C0\uD560 \uAC83"
|
|
11360
11382
|
].join("\n");
|
|
11361
11383
|
function elapsedMs(startAt) {
|
|
11362
11384
|
return Math.round(performance.now() - startAt);
|
|
@@ -11367,7 +11389,6 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11367
11389
|
const workspaceDir = resolve3(options.workspaceDir ?? join4(dirname3(absInput), `${stem}_ocr_workspace`));
|
|
11368
11390
|
const imagesDir = join4(workspaceDir, "images");
|
|
11369
11391
|
const rawDir = join4(workspaceDir, "ocr", "raw");
|
|
11370
|
-
const proofDir = join4(workspaceDir, "ocr", "proofread");
|
|
11371
11392
|
const diffDir = join4(workspaceDir, "ocr", "diff");
|
|
11372
11393
|
const outputPath = resolve3(options.outputPath ?? join4(dirname3(absInput), `${stem}.md`));
|
|
11373
11394
|
const reportPath = join4(workspaceDir, "run-report.json");
|
|
@@ -11387,11 +11408,10 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11387
11408
|
const logger = (options.logger ?? createLoggerFromEnv()).withRun(runId).child({ component: "pipeline/unified-ocr.ts" });
|
|
11388
11409
|
await mkdir(imagesDir, { recursive: true });
|
|
11389
11410
|
await mkdir(rawDir, { recursive: true });
|
|
11390
|
-
await mkdir(proofDir, { recursive: true });
|
|
11391
11411
|
await mkdir(diffDir, { recursive: true });
|
|
11392
11412
|
const timingsMs = {};
|
|
11393
11413
|
const markStageStart = (stage, message) => emitProgress(options.onEvent, stage, 0, stageWeights, { message, type: "stage_start" });
|
|
11394
|
-
const markStageProgress = (stage, stagePercent, current, total, message) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message });
|
|
11414
|
+
const markStageProgress = (stage, stagePercent, current, total, message, model) => emitProgress(options.onEvent, stage, stagePercent, stageWeights, { type: "stage_progress", current, total, message, model });
|
|
11395
11415
|
const markStageDone = (stage, message) => emitProgress(options.onEvent, stage, 100, stageWeights, { message, type: "stage_done" });
|
|
11396
11416
|
let currentStage = "convert";
|
|
11397
11417
|
const logStage = (level, stage, event, message, meta) => {
|
|
@@ -11496,56 +11516,17 @@ async function runUnifiedOcrPipeline(inputPath, options = {}) {
|
|
|
11496
11516
|
const pagePath = join4(rawDir, `page_${String(i + 1).padStart(4, "0")}.md`);
|
|
11497
11517
|
await writeFile(pagePath, markdown, "utf-8");
|
|
11498
11518
|
rawPagePaths.push(pagePath);
|
|
11499
|
-
markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}
|
|
11519
|
+
markStageProgress("ocr", Math.round((i + 1) / images.length * 100), i + 1, images.length, `OCR ${i + 1}/${images.length}`, selectedModel);
|
|
11500
11520
|
logStage("debug", "ocr", "progress", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { page: i + 1, total: images.length });
|
|
11501
11521
|
}
|
|
11502
11522
|
timingsMs.ocr = elapsedMs(ocrStart);
|
|
11503
11523
|
markStageDone("ocr", "OCR \uC644\uB8CC");
|
|
11504
11524
|
logStage("info", "ocr", "done", "\uD398\uC774\uC9C0 OCR \uC644\uB8CC", { elapsedMs: timingsMs.ocr });
|
|
11505
|
-
const proofStart = performance.now();
|
|
11506
|
-
currentStage = "proofread";
|
|
11507
|
-
markStageStart("proofread", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC9C4\uD589 \uC911");
|
|
11508
|
-
logStage("info", "proofread", "start", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC2DC\uC791", { pages: rawPagePaths.length });
|
|
11509
|
-
const proofedPaths = [];
|
|
11510
|
-
for (let i = 0; i < rawPagePaths.length; i++) {
|
|
11511
|
-
const rawMd = await readFile(rawPagePaths[i], "utf-8");
|
|
11512
|
-
const prompt = `${PROOFREAD_PROMPT}
|
|
11513
|
-
|
|
11514
|
-
---
|
|
11515
|
-
${rawMd}
|
|
11516
|
-
---`;
|
|
11517
|
-
const corrected = await ocrImageViaNim({
|
|
11518
|
-
textOnlyPrompt: prompt,
|
|
11519
|
-
model: selectedModel,
|
|
11520
|
-
maxTokens: modelMaxTokens[selectedModel] ?? 8192,
|
|
11521
|
-
baseUrl,
|
|
11522
|
-
keyPool,
|
|
11523
|
-
timeoutMs,
|
|
11524
|
-
maxRetries: maxRetriesPerPage,
|
|
11525
|
-
logger,
|
|
11526
|
-
stage: "proofread"
|
|
11527
|
-
});
|
|
11528
|
-
const safeCorrected = preserveNumericIntegrity(rawMd, corrected);
|
|
11529
|
-
const taggedCorrected = addUncertainTag(rawMd, safeCorrected);
|
|
11530
|
-
const pagePath = join4(proofDir, `page_${String(i + 1).padStart(4, "0")}.md`);
|
|
11531
|
-
await writeFile(pagePath, taggedCorrected, "utf-8");
|
|
11532
|
-
await writeFile(
|
|
11533
|
-
join4(diffDir, `page_${String(i + 1).padStart(4, "0")}.json`),
|
|
11534
|
-
JSON.stringify(buildDiffSummary(rawMd, taggedCorrected), null, 2),
|
|
11535
|
-
"utf-8"
|
|
11536
|
-
);
|
|
11537
|
-
proofedPaths.push(pagePath);
|
|
11538
|
-
markStageProgress("proofread", Math.round((i + 1) / rawPagePaths.length * 100), i + 1, rawPagePaths.length, `\uAD50\uC815 ${i + 1}/${rawPagePaths.length}`);
|
|
11539
|
-
logStage("debug", "proofread", "progress", "\uD398\uC774\uC9C0 \uAD50\uC815 \uC644\uB8CC", { page: i + 1, total: rawPagePaths.length });
|
|
11540
|
-
}
|
|
11541
|
-
timingsMs.proofread = elapsedMs(proofStart);
|
|
11542
|
-
markStageDone("proofread", "\uAD50\uC815 \uC644\uB8CC");
|
|
11543
|
-
logStage("info", "proofread", "done", "\uBE44\uD30C\uAD34 \uAD50\uC815 \uC644\uB8CC", { elapsedMs: timingsMs.proofread });
|
|
11544
11525
|
const mergeStart = performance.now();
|
|
11545
11526
|
currentStage = "merge";
|
|
11546
11527
|
markStageStart("merge", "\uCD5C\uC885 Markdown \uBCD1\uD569 \uC911");
|
|
11547
|
-
logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: proofedPaths.length });
|
|
11548
|
-
const merged = await mergeMarkdownPages(proofedPaths);
|
|
11528
|
+
logStage("info", "merge", "start", "\uCD5C\uC885 \uBCD1\uD569 \uC2DC\uC791", { pages: rawPagePaths.length });
|
|
11529
|
+
const merged = await mergeMarkdownPages(rawPagePaths);
|
|
11549
11530
|
await writeFile(outputPath, merged, "utf-8");
|
|
11550
11531
|
timingsMs.merge = elapsedMs(mergeStart);
|
|
11551
11532
|
markStageDone("merge", "\uBCD1\uD569 \uC644\uB8CC");
|
|
@@ -11620,7 +11601,8 @@ function emitProgress(cb, stage, stagePercent, weights, extra) {
|
|
|
11620
11601
|
current: extra.current,
|
|
11621
11602
|
total: extra.total,
|
|
11622
11603
|
code: extra.code,
|
|
11623
|
-
message: extra.message
|
|
11604
|
+
message: extra.message,
|
|
11605
|
+
model: extra.model
|
|
11624
11606
|
});
|
|
11625
11607
|
}
|
|
11626
11608
|
async function convertWithLibreOffice(buffer, ext) {
|
|
@@ -12003,40 +11985,6 @@ function ensureSupportedInput(path) {
|
|
|
12003
11985
|
throw new UnifiedOcrError("UNSUPPORTED_INPUT", "convert", `\uC9C0\uC6D0\uD558\uC9C0 \uC54A\uB294 \uC785\uB825 \uD3EC\uB9F7: ${ext}`);
|
|
12004
11986
|
}
|
|
12005
11987
|
}
|
|
12006
|
-
function extractNumericTokens(text) {
|
|
12007
|
-
return text.match(/\d[\d,./-]*/g) ?? [];
|
|
12008
|
-
}
|
|
12009
|
-
function preserveNumericIntegrity(rawText, correctedText) {
|
|
12010
|
-
const rawTokens = extractNumericTokens(rawText);
|
|
12011
|
-
const correctedTokens = extractNumericTokens(correctedText);
|
|
12012
|
-
if (rawTokens.length !== correctedTokens.length) return rawText;
|
|
12013
|
-
for (let i = 0; i < rawTokens.length; i++) {
|
|
12014
|
-
if (rawTokens[i] !== correctedTokens[i]) return rawText;
|
|
12015
|
-
}
|
|
12016
|
-
return correctedText;
|
|
12017
|
-
}
|
|
12018
|
-
function addUncertainTag(rawText, correctedText) {
|
|
12019
|
-
if (correctedText.includes("[\uD655\uC778\uD544\uC694:")) return correctedText;
|
|
12020
|
-
const rawLen = rawText.trim().length;
|
|
12021
|
-
const corrLen = correctedText.trim().length;
|
|
12022
|
-
if (rawLen === 0 || corrLen === 0) return correctedText;
|
|
12023
|
-
const rawLines = rawText.split("\n").filter(Boolean).length;
|
|
12024
|
-
const corrLines = correctedText.split("\n").filter(Boolean).length;
|
|
12025
|
-
const rawTableLines = rawText.split("\n").filter((l) => l.includes("|")).length;
|
|
12026
|
-
const corrTableLines = correctedText.split("\n").filter((l) => l.includes("|")).length;
|
|
12027
|
-
const suspicious = corrLen < rawLen * 0.75 || corrLines < Math.max(1, Math.floor(rawLines * 0.8)) || rawTableLines >= 2 && corrTableLines < Math.floor(rawTableLines * 0.7);
|
|
12028
|
-
if (!suspicious) return correctedText;
|
|
12029
|
-
return `${correctedText}
|
|
12030
|
-
|
|
12031
|
-
[\uD655\uC778\uD544\uC694: \uAD50\uC815 \uACB0\uACFC\uAC00 \uCD95\uC57D\uB418\uC5C8\uC744 \uC218 \uC788\uC5B4 \uC6D0\uBB38\uACFC \uB300\uC870\uAC00 \uD544\uC694\uD569\uB2C8\uB2E4.]`;
|
|
12032
|
-
}
|
|
12033
|
-
function buildDiffSummary(before, after) {
|
|
12034
|
-
return {
|
|
12035
|
-
changed: before !== after,
|
|
12036
|
-
beforeLength: before.length,
|
|
12037
|
-
afterLength: after.length
|
|
12038
|
-
};
|
|
12039
|
-
}
|
|
12040
11988
|
function normalizePipelineError(err, stage) {
|
|
12041
11989
|
if (err instanceof UnifiedOcrError) return err;
|
|
12042
11990
|
const message = err instanceof Error ? err.message : String(err);
|