kordoc 2.5.2 → 2.7.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +450 -431
- package/dist/chunk-4NWDJGAU.js +18955 -0
- package/dist/chunk-4NWDJGAU.js.map +1 -0
- package/dist/{chunk-NKKLA43G.js → chunk-4SK2PDMQ.js} +14 -3
- package/dist/chunk-4SK2PDMQ.js.map +1 -0
- package/dist/{chunk-24NKFRB4.js → chunk-LB7E2KDF.js} +14 -3
- package/dist/chunk-LB7E2KDF.js.map +1 -0
- package/dist/chunk-MEPHGCPQ.js +266 -0
- package/dist/chunk-MEPHGCPQ.js.map +1 -0
- package/dist/chunk-MOL7MDBG.js +0 -0
- package/dist/chunk-MUOQXDZ4.cjs.map +1 -1
- package/dist/{chunk-Z65OQP3H.cjs → chunk-Y476BOHI.cjs} +14 -3
- package/dist/chunk-Y476BOHI.cjs.map +1 -0
- package/dist/cli.js +60 -5
- package/dist/cli.js.map +1 -1
- package/dist/{detect-I7YIS4Q6.js → detect-RI2MQ33K.js} +6 -2
- package/dist/formula-3AQUUIRF.js +1151 -0
- package/dist/formula-3AQUUIRF.js.map +1 -0
- package/dist/formula-JCNF43NE.js +1153 -0
- package/dist/formula-JCNF43NE.js.map +1 -0
- package/dist/formula-XGG6ZP42.cjs +1151 -0
- package/dist/formula-XGG6ZP42.cjs.map +1 -0
- package/dist/index.cjs +14706 -450
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +73 -2
- package/dist/index.d.ts +73 -2
- package/dist/index.js +14583 -327
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -5
- package/dist/mcp.js.map +1 -1
- package/dist/page-range-3C7UGGEK.cjs.map +1 -1
- package/dist/page-range-737B4EZW.js +0 -0
- package/dist/{parser-AZYPOKAR.cjs → parser-7OFQ67QL.cjs} +160 -28
- package/dist/parser-7OFQ67QL.cjs.map +1 -0
- package/dist/{parser-BQKQOIJU.js → parser-DJCMY3OO.js} +136 -4
- package/dist/parser-DJCMY3OO.js.map +1 -0
- package/dist/{parser-FRROKAB7.js → parser-QMMQ7Y7R.js} +136 -4
- package/dist/parser-QMMQ7Y7R.js.map +1 -0
- package/dist/{provider-WPIYEALY.js → provider-2SEHU2FM.js} +1 -1
- package/dist/provider-2SEHU2FM.js.map +1 -0
- package/dist/{provider-7H4CPZYS.js → provider-AKROB7WQ.js} +1 -1
- package/dist/provider-AKROB7WQ.js.map +1 -0
- package/dist/{provider-YN2SSK4X.cjs → provider-SNONEZNW.cjs} +1 -1
- package/dist/provider-SNONEZNW.cjs.map +1 -0
- package/dist/setup-57FB3LSP.js +0 -0
- package/dist/{watch-ZJAUWUAE.js → watch-FVMVIZ5Q.js} +4 -4
- package/dist/watch-FVMVIZ5Q.js.map +1 -0
- package/package.json +98 -77
- package/dist/chunk-24NKFRB4.js.map +0 -1
- package/dist/chunk-2CAJSQK5.js +0 -5052
- package/dist/chunk-2CAJSQK5.js.map +0 -1
- package/dist/chunk-M3E3C5GS.js +0 -59
- package/dist/chunk-M3E3C5GS.js.map +0 -1
- package/dist/chunk-NKKLA43G.js.map +0 -1
- package/dist/chunk-Z65OQP3H.cjs.map +0 -1
- package/dist/parser-AZYPOKAR.cjs.map +0 -1
- package/dist/parser-BQKQOIJU.js.map +0 -1
- package/dist/parser-FRROKAB7.js.map +0 -1
- package/dist/provider-7H4CPZYS.js.map +0 -1
- package/dist/provider-WPIYEALY.js.map +0 -1
- package/dist/provider-YN2SSK4X.cjs.map +0 -1
- package/dist/watch-ZJAUWUAE.js.map +0 -1
- /package/dist/{detect-I7YIS4Q6.js.map → detect-RI2MQ33K.js.map} +0 -0
|
@@ -7,7 +7,7 @@ import {
|
|
|
7
7
|
blocksToMarkdown,
|
|
8
8
|
safeMax,
|
|
9
9
|
safeMin
|
|
10
|
-
} from "./chunk-
|
|
10
|
+
} from "./chunk-4SK2PDMQ.js";
|
|
11
11
|
import {
|
|
12
12
|
parsePageRange
|
|
13
13
|
} from "./chunk-MOL7MDBG.js";
|
|
@@ -1189,6 +1189,7 @@ async function loadPdfWithTimeout(buffer) {
|
|
|
1189
1189
|
}
|
|
1190
1190
|
}
|
|
1191
1191
|
async function parsePdfDocument(buffer, options) {
|
|
1192
|
+
const formulaBuffer = options?.formulaOcr ? buffer.slice(0) : null;
|
|
1192
1193
|
const doc = await loadPdfWithTimeout(buffer);
|
|
1193
1194
|
try {
|
|
1194
1195
|
const pageCount = doc.numPages;
|
|
@@ -1241,7 +1242,7 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1241
1242
|
if (totalChars / Math.max(parsedPageCount, 1) < 10) {
|
|
1242
1243
|
if (options?.ocr) {
|
|
1243
1244
|
try {
|
|
1244
|
-
const { ocrPages } = await import("./provider-
|
|
1245
|
+
const { ocrPages } = await import("./provider-AKROB7WQ.js");
|
|
1245
1246
|
const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
|
|
1246
1247
|
if (ocrBlocks.length > 0) {
|
|
1247
1248
|
const ocrMarkdown = ocrBlocks.map((b) => b.text || "").filter(Boolean).join("\n\n");
|
|
@@ -1258,6 +1259,16 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1258
1259
|
blocks.splice(removed[ri], 1);
|
|
1259
1260
|
}
|
|
1260
1261
|
}
|
|
1262
|
+
if (options?.formulaOcr && formulaBuffer) {
|
|
1263
|
+
try {
|
|
1264
|
+
await applyFormulaOcr(formulaBuffer, blocks, pageFilter, effectivePageCount, warnings, options.onProgress);
|
|
1265
|
+
} catch (e) {
|
|
1266
|
+
warnings.push({
|
|
1267
|
+
message: `\uC218\uC2DD OCR \uC2E4\uD328: ${e instanceof Error ? e.message : String(e)}`,
|
|
1268
|
+
code: "PARTIAL_PARSE"
|
|
1269
|
+
});
|
|
1270
|
+
}
|
|
1271
|
+
}
|
|
1261
1272
|
const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
|
|
1262
1273
|
if (medianFontSize > 0) {
|
|
1263
1274
|
detectHeadings(blocks, medianFontSize);
|
|
@@ -2072,7 +2083,10 @@ function mergeLineSimple(items) {
|
|
|
2072
2083
|
/**
 * Cleans up text extracted from a PDF.
 *
 * Steps, in order:
 *  1. Strip digit-only lines and similar decorations (presumably page
 *     numbers / "n / m" counters — the code only checks their shape).
 *  2. Pass the result through `mergeKoreanLines`.
 *  3. Run `collapseEvenSpacing` on every line except table separator rows
 *     (lines starting with "| ---") and lines that consist solely of a
 *     `$…$` / `$$…$$` wrapped formula, which are kept verbatim.
 *  4. Join two Hangul syllables that follow a bullet marker and are separated
 *     by whitespace, squeeze runs of 3+ newlines to a blank line, and trim.
 *
 * @param {string} text - Raw extracted text.
 * @returns {string} The cleaned text.
 */
function cleanPdfText(text) {
  const stripped = text
    // 1–4 digits on the very first line of the text (no `m` flag: start of input only).
    .replace(/^\d{1,4}\n/, "")
    // Lines made of a number wrapped in dashes, e.g. "- 3 -".
    .replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "")
    // Lines of the form "3 / 10".
    .replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "")
    // A 1–4 digit line in the middle of the text.
    .replace(/\n\d{1,4}\n/g, "\n")
    // A 1–4 digit line at the very end of the text.
    .replace(/\n\d{1,4}$/, "")
    // Markdown heading lines that contain nothing but a 1–4 digit number.
    .replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "");

  return mergeKoreanLines(stripped)
    .replace(/^(?!\| ---).*$/gm, (line) => {
      // Formula-only lines are left untouched so their spacing is preserved.
      if (/^\s*\${1,2}.+\${1,2}\s*$/.test(line)) return line;
      return collapseEvenSpacing(line);
    })
    // Bullet marker + two whitespace-separated Hangul syllables -> syllables joined.
    .replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3")
    .replace(/\n{3,}/g, "\n\n")
    .trim();
}
|
|
2077
2091
|
function startsWithMarker(line) {
|
|
2078
2092
|
const t = line.trimStart();
|
|
@@ -2275,9 +2289,127 @@ function mergeKoreanLines(text) {
|
|
|
2275
2289
|
}
|
|
2276
2290
|
return result.join("\n");
|
|
2277
2291
|
}
|
|
2292
|
+
/**
 * Runs the formula pipeline over a PDF buffer and merges the recognized
 * formulas into `blocks` (mutated in place).
 *
 * For every page returned by the pipeline:
 *  - each region with non-blank LaTeX becomes a "paragraph" block whose text is
 *    the LaTeX wrapped in `$$…$$` (kind "display") or `$…$` (any other kind);
 *  - existing non-table blocks on that page whose bbox area is covered at least
 *    60% by a single formula region are removed;
 *  - the formula blocks are inserted into the page's blocks by vertical position.
 *
 * Progress and a final summary are written to `process.stderr`.
 *
 * @param {*} buffer - PDF data handed to `pipeline.runOnBuffer`.
 * @param {Array<object>} blocks - Parsed blocks; modified in place.
 * @param {*} pageFilter - Passed through to `pipeline.runOnBuffer`.
 * @param {*} effectivePageCount - Not read by this function.
 * @param {*} warnings - Not read by this function.
 * @param {*} _onProgress - Not read by this function.
 * @returns {Promise<void>}
 */
async function applyFormulaOcr(buffer, blocks, pageFilter, effectivePageCount, warnings, _onProgress) {
  // Loaded on demand via dynamic import, i.e. only when this function runs.
  const formulaMod = await import("./formula-JCNF43NE.js");
  const { FormulaPipeline, ensureFormulaModels } = formulaMod;

  // Presumably makes the model files available before use (the helper is
  // opaque here); the callback only renders its progress events to stderr.
  await ensureFormulaModels((p) => {
    if (p.phase === "download" && p.total) {
      const pct = Math.floor(p.downloaded / p.total * 100);
      // "\r" rewrites the same terminal line on every progress event.
      process.stderr.write(`\r[kordoc-formula] ${p.spec.name} ${pct}% (${formatMb(p.downloaded)}/${formatMb(p.total)})`);
      if (p.downloaded >= p.total) process.stderr.write("\n");
    } else if (p.phase === "verify") {
      // Message text: "verifying SHA-256..."
      process.stderr.write(`[kordoc-formula] ${p.spec.name} SHA-256 \uAC80\uC99D \uC911...\n`);
    } else if (p.phase === "done") {
      // Message text: "ready"
      process.stderr.write(`[kordoc-formula] ${p.spec.name} \uC900\uBE44 \uC644\uB8CC\n`);
    } else if (p.phase === "skip") {
      // Intentionally silent.
    }
  });

  const pipeline = await FormulaPipeline.create();
  try {
    const pagesResult = await pipeline.runOnBuffer(buffer, pageFilter);
    if (pagesResult.length === 0) return;

    let insertedCount = 0;
    let removedDupCount = 0;

    for (const page of pagesResult) {
      const pageNumber = page.pageNumber;
      const pdfHeight = page.pdfHeight;
      // Rendered-size -> PDF-size ratio; falls back to 0.5 when the rendered
      // size is not positive.
      const scaleX = page.renderedWidth > 0 ? page.pdfWidth / page.renderedWidth : 0.5;
      const scaleY = page.renderedHeight > 0 ? page.pdfHeight / page.renderedHeight : 0.5;

      // 1) Turn recognized regions into candidate blocks.
      const candidates = [];
      for (const r of page.regions) {
        if (!r.latex || !r.latex.trim()) continue;
        const wrapped = r.kind === "display" ? `$$${r.latex}$$` : `$${r.latex}$`;
        const x1 = r.bbox.x1 * scaleX;
        const x2 = r.bbox.x2 * scaleX;
        // The y axis is flipped against the page height.
        // NOTE(review): this assumes region boxes use a top-left origin in
        // rendered-image units — confirm against the formula module.
        const yTop = pdfHeight - r.bbox.y1 * scaleY;
        const yBottom = pdfHeight - r.bbox.y2 * scaleY;
        const centerY = (yTop + yBottom) / 2;
        const width = x2 - x1;
        const height = yTop - yBottom;
        candidates.push({
          block: {
            type: "paragraph",
            text: wrapped,
            pageNumber,
            // `y` is the lower edge; the overlap test below reads existing
            // block bboxes the same way (y .. y + height).
            bbox: { page: pageNumber, x: x1, y: yBottom, width, height }
          },
          pdfBbox: { x1, x2, yTop, yBottom },
          centerY
        });
      }
      if (candidates.length === 0) continue;

      // 2) Drop existing blocks that are mostly covered by a formula region.
      const OVERLAP_THRESHOLD = 0.6;
      const indicesToRemove = new Set();
      for (let i = 0; i < blocks.length; i++) {
        const b = blocks[i];
        if (b.pageNumber !== pageNumber) continue;
        if (b.type === "table") continue;
        if (!b.bbox || b.bbox.width <= 0 || b.bbox.height <= 0) continue;
        const blockArea = b.bbox.width * b.bbox.height;
        if (blockArea <= 0) continue;
        for (const c of candidates) {
          const ox1 = Math.max(b.bbox.x, c.pdfBbox.x1);
          const ox2 = Math.min(b.bbox.x + b.bbox.width, c.pdfBbox.x2);
          const oy1 = Math.max(b.bbox.y, c.pdfBbox.yBottom);
          const oy2 = Math.min(b.bbox.y + b.bbox.height, c.pdfBbox.yTop);
          const interArea = Math.max(0, ox2 - ox1) * Math.max(0, oy2 - oy1);
          // Ratio is relative to the existing block's own area.
          if (interArea / blockArea >= OVERLAP_THRESHOLD) {
            indicesToRemove.add(i);
            break;
          }
        }
      }
      if (indicesToRemove.size > 0) {
        // Highest index first so earlier splices do not shift later targets.
        const sorted = [...indicesToRemove].sort((a, b) => b - a);
        for (const idx of sorted) blocks.splice(idx, 1);
        removedDupCount += indicesToRemove.size;
      }

      // 3) Insert candidates, largest centerY first.
      candidates.sort((a, b) => b.centerY - a.centerY);
      for (const c of candidates) {
        let insertIdx = -1;
        let pageFirstIdx = -1;
        let pageLastIdx = -1;
        for (let i = 0; i < blocks.length; i++) {
          const b = blocks[i];
          if (b.pageNumber !== pageNumber) continue;
          if (pageFirstIdx === -1) pageFirstIdx = i;
          pageLastIdx = i;
          if (!b.bbox) continue;
          const blockCenter = b.bbox.y + b.bbox.height / 2;
          // First same-page block whose vertical center is below the formula's.
          if (blockCenter < c.centerY) {
            insertIdx = i;
            break;
          }
        }
        if (insertIdx !== -1) {
          blocks.splice(insertIdx, 0, c.block);
        } else if (pageLastIdx !== -1) {
          // No lower block found: place after the last block scanned on this page.
          blocks.splice(pageLastIdx + 1, 0, c.block);
        } else {
          // The page has no blocks at all: append to the end of the list.
          blocks.push(c.block);
        }
        insertedCount++;
      }
    }

    if (insertedCount > 0 || removedDupCount > 0) {
      // Message text: "<n> formulas inserted, <m> duplicate blocks removed (<k> pages)"
      process.stderr.write(
        `[kordoc-formula] ${insertedCount}\uAC1C \uC218\uC2DD \uC0BD\uC785, ${removedDupCount}\uAC1C \uC911\uBCF5 block \uC81C\uAC70 (${pagesResult.length}\uAC1C \uD398\uC774\uC9C0)\n`
      );
    }
  } finally {
    // Best-effort cleanup: a rejected destroy() is deliberately ignored so it
    // cannot mask the result (or error) of the work above.
    await pipeline.destroy().catch(() => {
    });
  }
}
|
|
2407
|
+
/**
 * Formats a byte count as mebibytes with one decimal place and an "MB" suffix.
 *
 * @param {number} bytes - Size in bytes.
 * @returns {string} For example `"1.5MB"` for 1572864 bytes.
 */
function formatMb(bytes) {
  return `${(bytes / 1024 / 1024).toFixed(1)}MB`;
}
|
|
2278
2410
|
export {
|
|
2279
2411
|
cleanPdfText,
|
|
2280
2412
|
extractPdfMetadataOnly,
|
|
2281
2413
|
parsePdfDocument
|
|
2282
2414
|
};
|
|
2283
|
-
//# sourceMappingURL=parser-FRROKAB7.js.map
|
|
2415
|
+
//# sourceMappingURL=parser-QMMQ7Y7R.js.map
|