kordoc 2.5.1 → 2.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +21 -2
- package/dist/chunk-5CJGKKMZ.js +266 -0
- package/dist/chunk-5CJGKKMZ.js.map +1 -0
- package/dist/{chunk-OCVWJSG7.js → chunk-GNN6MHH4.js} +14 -3
- package/dist/chunk-GNN6MHH4.js.map +1 -0
- package/dist/{chunk-KO7DKAXW.js → chunk-LA66FVBN.js} +14 -3
- package/dist/chunk-LA66FVBN.js.map +1 -0
- package/dist/chunk-OBSPVJ6A.js +18947 -0
- package/dist/chunk-OBSPVJ6A.js.map +1 -0
- package/dist/{chunk-TTSFPEDM.cjs → chunk-RFGEEHI4.cjs} +14 -3
- package/dist/{chunk-TTSFPEDM.cjs.map → chunk-RFGEEHI4.cjs.map} +1 -1
- package/dist/cli.js +60 -5
- package/dist/cli.js.map +1 -1
- package/dist/{detect-I7YIS4Q6.js → detect-PJZMUL2Z.js} +6 -2
- package/dist/formula-3AQUUIRF.js +1151 -0
- package/dist/formula-3AQUUIRF.js.map +1 -0
- package/dist/formula-JCNF43NE.js +1153 -0
- package/dist/formula-JCNF43NE.js.map +1 -0
- package/dist/formula-XGG6ZP42.cjs +1151 -0
- package/dist/formula-XGG6ZP42.cjs.map +1 -0
- package/dist/index.cjs +14743 -465
- package/dist/index.cjs.map +1 -1
- package/dist/index.d.cts +73 -2
- package/dist/index.d.ts +73 -2
- package/dist/index.js +14615 -337
- package/dist/index.js.map +1 -1
- package/dist/mcp.js +5 -5
- package/dist/{parser-DA3CGOZF.js → parser-5CJGXQCJ.js} +135 -3
- package/dist/parser-5CJGXQCJ.js.map +1 -0
- package/dist/{parser-NZFDRZLS.js → parser-6L6DZCOB.js} +135 -3
- package/dist/parser-6L6DZCOB.js.map +1 -0
- package/dist/{parser-BOIVVDYI.cjs → parser-SRI2TIZX.cjs} +159 -27
- package/dist/{parser-BOIVVDYI.cjs.map → parser-SRI2TIZX.cjs.map} +1 -1
- package/dist/{watch-HWN6Y6Q2.js → watch-7CTGUDQB.js} +4 -4
- package/package.json +25 -4
- package/dist/chunk-KO7DKAXW.js.map +0 -1
- package/dist/chunk-M3E3C5GS.js +0 -59
- package/dist/chunk-M3E3C5GS.js.map +0 -1
- package/dist/chunk-OCVWJSG7.js.map +0 -1
- package/dist/chunk-QEZ4CUF7.js +0 -5022
- package/dist/chunk-QEZ4CUF7.js.map +0 -1
- package/dist/parser-DA3CGOZF.js.map +0 -1
- package/dist/parser-NZFDRZLS.js.map +0 -1
- /package/dist/{detect-I7YIS4Q6.js.map → detect-PJZMUL2Z.js.map} +0 -0
- /package/dist/{watch-HWN6Y6Q2.js.map → watch-7CTGUDQB.js.map} +0 -0
|
@@ -6,7 +6,7 @@ import {
|
|
|
6
6
|
blocksToMarkdown,
|
|
7
7
|
safeMax,
|
|
8
8
|
safeMin
|
|
9
|
-
} from "./chunk-OCVWJSG7.js";
|
|
9
|
+
} from "./chunk-GNN6MHH4.js";
|
|
10
10
|
import {
|
|
11
11
|
parsePageRange
|
|
12
12
|
} from "./chunk-SBVRCJFH.js";
|
|
@@ -1188,6 +1188,7 @@ async function loadPdfWithTimeout(buffer) {
|
|
|
1188
1188
|
}
|
|
1189
1189
|
}
|
|
1190
1190
|
async function parsePdfDocument(buffer, options) {
|
|
1191
|
+
const formulaBuffer = options?.formulaOcr ? buffer.slice(0) : null;
|
|
1191
1192
|
const doc = await loadPdfWithTimeout(buffer);
|
|
1192
1193
|
try {
|
|
1193
1194
|
const pageCount = doc.numPages;
|
|
@@ -1257,6 +1258,16 @@ async function parsePdfDocument(buffer, options) {
|
|
|
1257
1258
|
blocks.splice(removed[ri], 1);
|
|
1258
1259
|
}
|
|
1259
1260
|
}
|
|
1261
|
+
if (options?.formulaOcr && formulaBuffer) {
|
|
1262
|
+
try {
|
|
1263
|
+
await applyFormulaOcr(formulaBuffer, blocks, pageFilter, effectivePageCount, warnings, options.onProgress);
|
|
1264
|
+
} catch (e) {
|
|
1265
|
+
warnings.push({
|
|
1266
|
+
message: `\uC218\uC2DD OCR \uC2E4\uD328: ${e instanceof Error ? e.message : String(e)}`,
|
|
1267
|
+
code: "PARTIAL_PARSE"
|
|
1268
|
+
});
|
|
1269
|
+
}
|
|
1270
|
+
}
|
|
1260
1271
|
const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
|
|
1261
1272
|
if (medianFontSize > 0) {
|
|
1262
1273
|
detectHeadings(blocks, medianFontSize);
|
|
@@ -2071,7 +2082,10 @@ function mergeLineSimple(items) {
|
|
|
2071
2082
|
/**
 * Normalizes text extracted from a PDF.
 *
 * Stages, in order:
 *  1. Strip lines consisting only of short digit runs or dash/slash-decorated
 *     numbers (these look like page-number artifacts) and headings that hold
 *     nothing but a 1-4 digit number.
 *  2. Pass the result through `mergeKoreanLines`.
 *  3. Run `collapseEvenSpacing` on every line that does not start with
 *     `| ---`, except lines that are wrapped in `$...$` / `$$...$$`, which
 *     are returned untouched so formula markup is not re-spaced.
 *  4. Join a bullet-like marker followed by two space-separated Hangul
 *     syllables, collapse runs of 3+ newlines to exactly two, and trim.
 *
 * @param {string} text - Raw text to clean.
 * @returns {string} The cleaned text.
 */
function cleanPdfText(text) {
  const withoutNumberLines = text
    .replace(/^\d{1,4}\n/, "")
    .replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "")
    .replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "")
    .replace(/\n\d{1,4}\n/g, "\n")
    .replace(/\n\d{1,4}$/, "")
    .replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "");

  const mergedLines = mergeKoreanLines(withoutNumberLines);

  // A line that begins and ends with one or two `$` is kept verbatim.
  const isDollarWrapped = (line) => /^\s*\${1,2}.+\${1,2}\s*$/.test(line);
  const respaced = mergedLines.replace(
    /^(?!\| ---).*$/gm,
    (line) => (isDollarWrapped(line) ? line : collapseEvenSpacing(line))
  );

  const markersJoined = respaced.replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3");
  const blankRunsCollapsed = markersJoined.replace(/\n{3,}/g, "\n\n");
  return blankRunsCollapsed.trim();
}
|
|
2076
2090
|
function startsWithMarker(line) {
|
|
2077
2091
|
const t = line.trimStart();
|
|
@@ -2274,9 +2288,127 @@ function mergeKoreanLines(text) {
|
|
|
2274
2288
|
}
|
|
2275
2289
|
return result.join("\n");
|
|
2276
2290
|
}
|
|
2291
|
+
/**
 * Writes formula-model preparation progress to stderr.
 *
 * Handles the "download" (only when `p.total` is truthy), "verify" and
 * "done" phases; every other phase (including "skip") is intentionally silent.
 *
 * @param {object} p - Progress event passed by `ensureFormulaModels`.
 */
function reportFormulaModelProgress(p) {
  if (p.phase === "download" && p.total) {
    const pct = Math.floor(p.downloaded / p.total * 100);
    // Leading "\r" rewrites the same terminal line on each update.
    process.stderr.write(`\r[kordoc-formula] ${p.spec.name} ${pct}% (${formatMb(p.downloaded)}/${formatMb(p.total)})`);
    if (p.downloaded >= p.total) process.stderr.write("\n");
  } else if (p.phase === "verify") {
    process.stderr.write(`[kordoc-formula] ${p.spec.name} SHA-256 \uAC80\uC99D \uC911...\n`);
  } else if (p.phase === "done") {
    process.stderr.write(`[kordoc-formula] ${p.spec.name} \uC900\uBE44 \uC644\uB8CC\n`);
  }
}

/**
 * Converts the recognized regions of one page into insertable paragraph blocks.
 *
 * Region coordinates are scaled by pdf/rendered size (falling back to 0.5 when
 * the rendered size is not positive) and the y axis is flipped against
 * `pdfHeight`. NOTE(review): this presumably maps top-left-origin image pixels
 * to bottom-left-origin PDF units - confirm against the formula pipeline.
 *
 * @param {object} page - One entry of the pipeline result.
 * @returns {Array<{block: object, pdfBbox: object, centerY: number}>}
 */
function buildFormulaCandidates(page) {
  const { pageNumber, pdfHeight } = page;
  const scaleX = page.renderedWidth > 0 ? page.pdfWidth / page.renderedWidth : 0.5;
  const scaleY = page.renderedHeight > 0 ? page.pdfHeight / page.renderedHeight : 0.5;
  const candidates = [];
  for (const r of page.regions) {
    // Regions without any non-blank LaTeX produce nothing.
    if (!r.latex || !r.latex.trim()) continue;
    const wrapped = r.kind === "display" ? `$$${r.latex}$$` : `$${r.latex}$`;
    const x1 = r.bbox.x1 * scaleX;
    const x2 = r.bbox.x2 * scaleX;
    const yTop = pdfHeight - r.bbox.y1 * scaleY;
    const yBottom = pdfHeight - r.bbox.y2 * scaleY;
    candidates.push({
      block: {
        type: "paragraph",
        text: wrapped,
        pageNumber,
        bbox: { page: pageNumber, x: x1, y: yBottom, width: x2 - x1, height: yTop - yBottom }
      },
      pdfBbox: { x1, x2, yTop, yBottom },
      centerY: (yTop + yBottom) / 2
    });
  }
  return candidates;
}

/**
 * Removes, in place, the non-table blocks of `pageNumber` whose bbox area is
 * covered by at least 60% by any single formula candidate.
 *
 * @param {object[]} blocks - Document blocks (mutated).
 * @param {object[]} candidates - Output of `buildFormulaCandidates`.
 * @param {number} pageNumber - Page the candidates belong to.
 * @returns {number} How many blocks were removed.
 */
function removeBlocksCoveredByFormulas(blocks, candidates, pageNumber) {
  const OVERLAP_THRESHOLD = 0.6;
  const indicesToRemove = [];
  for (let i = 0; i < blocks.length; i++) {
    const b = blocks[i];
    if (b.pageNumber !== pageNumber) continue;
    if (b.type === "table") continue;
    if (!b.bbox || b.bbox.width <= 0 || b.bbox.height <= 0) continue;
    const blockArea = b.bbox.width * b.bbox.height;
    if (blockArea <= 0) continue;
    const isCovered = candidates.some((c) => {
      const ox1 = Math.max(b.bbox.x, c.pdfBbox.x1);
      const ox2 = Math.min(b.bbox.x + b.bbox.width, c.pdfBbox.x2);
      const oy1 = Math.max(b.bbox.y, c.pdfBbox.yBottom);
      const oy2 = Math.min(b.bbox.y + b.bbox.height, c.pdfBbox.yTop);
      const interArea = Math.max(0, ox2 - ox1) * Math.max(0, oy2 - oy1);
      return interArea / blockArea >= OVERLAP_THRESHOLD;
    });
    if (isCovered) indicesToRemove.push(i);
  }
  // Indices were collected in ascending order; splice from the end so the
  // remaining indices stay valid.
  for (let k = indicesToRemove.length - 1; k >= 0; k--) {
    blocks.splice(indicesToRemove[k], 1);
  }
  return indicesToRemove.length;
}

/**
 * Inserts one formula block into `blocks` in place.
 *
 * The block goes before the first same-page block that has a bbox and whose
 * vertical center is below the candidate's center. If there is none, it goes
 * right after the last block of that page, or at the very end when the page
 * has no blocks at all.
 *
 * @param {object[]} blocks - Document blocks (mutated).
 * @param {object} candidate - One entry from `buildFormulaCandidates`.
 * @param {number} pageNumber - Page the candidate belongs to.
 */
function insertFormulaBlock(blocks, candidate, pageNumber) {
  let insertIdx = -1;
  let pageLastIdx = -1;
  for (let i = 0; i < blocks.length; i++) {
    const b = blocks[i];
    if (b.pageNumber !== pageNumber) continue;
    pageLastIdx = i;
    if (!b.bbox) continue;
    const blockCenter = b.bbox.y + b.bbox.height / 2;
    if (blockCenter < candidate.centerY) {
      insertIdx = i;
      break;
    }
  }
  if (insertIdx !== -1) {
    blocks.splice(insertIdx, 0, candidate.block);
  } else if (pageLastIdx !== -1) {
    blocks.splice(pageLastIdx + 1, 0, candidate.block);
  } else {
    blocks.push(candidate.block);
  }
}

/**
 * Runs the formula-OCR pipeline on a PDF buffer and merges the recognized
 * formulas into `blocks` in place.
 *
 * The formula module is loaded lazily with a dynamic import, its models are
 * ensured (progress goes to stderr), and a pipeline is created and run. For
 * each returned page, blocks largely covered by a formula region are dropped
 * and the formulas are inserted as `$...$` / `$$...$$` paragraph blocks,
 * processed in descending `centerY` order. A one-line summary is written to
 * stderr when anything was inserted or removed.
 *
 * @param {*} buffer - PDF data handed to `pipeline.runOnBuffer`.
 * @param {object[]} blocks - Parsed document blocks; mutated in place.
 * @param {*} pageFilter - Forwarded unchanged to `pipeline.runOnBuffer`.
 * @param {*} effectivePageCount - Accepted for the caller's signature; not read here.
 * @param {*} warnings - Accepted for the caller's signature; not read here.
 * @param {*} _onProgress - Accepted for the caller's signature; not read here.
 * @returns {Promise<void>}
 * @throws Whatever the import, `ensureFormulaModels`, `FormulaPipeline.create`
 *   or `runOnBuffer` reject with; only `pipeline.destroy()` failures are ignored.
 */
async function applyFormulaOcr(buffer, blocks, pageFilter, effectivePageCount, warnings, _onProgress) {
  const formulaMod = await import("./formula-3AQUUIRF.js");
  const { FormulaPipeline, ensureFormulaModels } = formulaMod;
  await ensureFormulaModels((p) => reportFormulaModelProgress(p));
  const pipeline = await FormulaPipeline.create();
  try {
    const pagesResult = await pipeline.runOnBuffer(buffer, pageFilter);
    if (pagesResult.length === 0) return;
    let insertedCount = 0;
    let removedDupCount = 0;
    for (const page of pagesResult) {
      const candidates = buildFormulaCandidates(page);
      if (candidates.length === 0) continue;
      removedDupCount += removeBlocksCoveredByFormulas(blocks, candidates, page.pageNumber);
      // Highest centerY first; each insertion rescans the updated block list.
      candidates.sort((a, b) => b.centerY - a.centerY);
      for (const candidate of candidates) {
        insertFormulaBlock(blocks, candidate, page.pageNumber);
        insertedCount++;
      }
    }
    if (insertedCount > 0 || removedDupCount > 0) {
      process.stderr.write(
        `[kordoc-formula] ${insertedCount}\uAC1C \uC218\uC2DD \uC0BD\uC785, ${removedDupCount}\uAC1C \uC911\uBCF5 block \uC81C\uAC70 (${pagesResult.length}\uAC1C \uD398\uC774\uC9C0)\n`
      );
    }
  } finally {
    // Best-effort cleanup: a failing destroy must not mask the real outcome.
    await pipeline.destroy().catch(() => {
    });
  }
}
|
|
2406
|
+
/**
 * Formats a byte count as megabytes (1 MB = 1024 * 1024 bytes here) with one
 * decimal place and an "MB" suffix.
 *
 * @param {number} bytes - Size in bytes.
 * @returns {string} For example "1.5MB".
 */
function formatMb(bytes) {
  const kibibytes = bytes / 1024;
  const mebibytes = kibibytes / 1024;
  const rounded = mebibytes.toFixed(1);
  return `${rounded}MB`;
}
|
|
2277
2409
|
export {
|
|
2278
2410
|
cleanPdfText,
|
|
2279
2411
|
extractPdfMetadataOnly,
|
|
2280
2412
|
parsePdfDocument
|
|
2281
2413
|
};
|
|
2282
|
-
//# sourceMappingURL=parser-NZFDRZLS.js.map
|
|
2414
|
+
//# sourceMappingURL=parser-6L6DZCOB.js.map
|