kordoc 2.5.2 → 2.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (45)
  1. package/README.md +21 -2
  2. package/dist/chunk-5CJGKKMZ.js +266 -0
  3. package/dist/chunk-5CJGKKMZ.js.map +1 -0
  4. package/dist/{chunk-24NKFRB4.js → chunk-GNN6MHH4.js} +14 -3
  5. package/dist/chunk-GNN6MHH4.js.map +1 -0
  6. package/dist/{chunk-NKKLA43G.js → chunk-LA66FVBN.js} +14 -3
  7. package/dist/chunk-LA66FVBN.js.map +1 -0
  8. package/dist/chunk-OBSPVJ6A.js +18947 -0
  9. package/dist/chunk-OBSPVJ6A.js.map +1 -0
  10. package/dist/{chunk-Z65OQP3H.cjs → chunk-RFGEEHI4.cjs} +14 -3
  11. package/dist/{chunk-Z65OQP3H.cjs.map → chunk-RFGEEHI4.cjs.map} +1 -1
  12. package/dist/cli.js +60 -5
  13. package/dist/cli.js.map +1 -1
  14. package/dist/{detect-I7YIS4Q6.js → detect-PJZMUL2Z.js} +6 -2
  15. package/dist/formula-3AQUUIRF.js +1151 -0
  16. package/dist/formula-3AQUUIRF.js.map +1 -0
  17. package/dist/formula-JCNF43NE.js +1153 -0
  18. package/dist/formula-JCNF43NE.js.map +1 -0
  19. package/dist/formula-XGG6ZP42.cjs +1151 -0
  20. package/dist/formula-XGG6ZP42.cjs.map +1 -0
  21. package/dist/index.cjs +14703 -455
  22. package/dist/index.cjs.map +1 -1
  23. package/dist/index.d.cts +73 -2
  24. package/dist/index.d.ts +73 -2
  25. package/dist/index.js +14575 -327
  26. package/dist/index.js.map +1 -1
  27. package/dist/mcp.js +5 -5
  28. package/dist/{parser-FRROKAB7.js → parser-5CJGXQCJ.js} +135 -3
  29. package/dist/parser-5CJGXQCJ.js.map +1 -0
  30. package/dist/{parser-BQKQOIJU.js → parser-6L6DZCOB.js} +135 -3
  31. package/dist/parser-6L6DZCOB.js.map +1 -0
  32. package/dist/{parser-AZYPOKAR.cjs → parser-SRI2TIZX.cjs} +159 -27
  33. package/dist/{parser-AZYPOKAR.cjs.map → parser-SRI2TIZX.cjs.map} +1 -1
  34. package/dist/{watch-ZJAUWUAE.js → watch-7CTGUDQB.js} +4 -4
  35. package/package.json +25 -4
  36. package/dist/chunk-24NKFRB4.js.map +0 -1
  37. package/dist/chunk-2CAJSQK5.js +0 -5052
  38. package/dist/chunk-2CAJSQK5.js.map +0 -1
  39. package/dist/chunk-M3E3C5GS.js +0 -59
  40. package/dist/chunk-M3E3C5GS.js.map +0 -1
  41. package/dist/chunk-NKKLA43G.js.map +0 -1
  42. package/dist/parser-BQKQOIJU.js.map +0 -1
  43. package/dist/parser-FRROKAB7.js.map +0 -1
  44. package/dist/{detect-I7YIS4Q6.js.map → detect-PJZMUL2Z.js.map} +0 -0
  45. package/dist/{watch-ZJAUWUAE.js.map → watch-7CTGUDQB.js.map} +0 -0
@@ -6,7 +6,7 @@
6
6
 
7
7
 
8
8
 
9
- var _chunkZ65OQP3Hcjs = require('./chunk-Z65OQP3H.cjs');
9
+ var _chunkRFGEEHI4cjs = require('./chunk-RFGEEHI4.cjs');
10
10
 
11
11
 
12
12
  var _chunkMUOQXDZ4cjs = require('./chunk-MUOQXDZ4.cjs');
@@ -1179,7 +1179,7 @@ async function loadPdfWithTimeout(buffer) {
1179
1179
  new Promise((_, reject) => {
1180
1180
  timer = setTimeout(() => {
1181
1181
  loadingTask.destroy();
1182
- reject(new (0, _chunkZ65OQP3Hcjs.KordocError)("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
1182
+ reject(new (0, _chunkRFGEEHI4cjs.KordocError)("PDF \uB85C\uB529 \uD0C0\uC784\uC544\uC6C3 (30\uCD08 \uCD08\uACFC)"));
1183
1183
  }, PDF_LOAD_TIMEOUT_MS);
1184
1184
  })
1185
1185
  ]);
@@ -1188,10 +1188,11 @@ async function loadPdfWithTimeout(buffer) {
1188
1188
  }
1189
1189
  }
1190
1190
  async function parsePdfDocument(buffer, options) {
1191
+ const formulaBuffer = _optionalChain([options, 'optionalAccess', _10 => _10.formulaOcr]) ? buffer.slice(0) : null;
1191
1192
  const doc = await loadPdfWithTimeout(buffer);
1192
1193
  try {
1193
1194
  const pageCount = doc.numPages;
1194
- if (pageCount === 0) throw new (0, _chunkZ65OQP3Hcjs.KordocError)("PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
1195
+ if (pageCount === 0) throw new (0, _chunkRFGEEHI4cjs.KordocError)("PDF\uC5D0 \uD398\uC774\uC9C0\uAC00 \uC5C6\uC2B5\uB2C8\uB2E4.");
1195
1196
  const metadata = { pageCount };
1196
1197
  await extractPdfMetadata(doc, metadata);
1197
1198
  const blocks = [];
@@ -1199,7 +1200,7 @@ async function parsePdfDocument(buffer, options) {
1199
1200
  let totalChars = 0;
1200
1201
  let totalTextBytes = 0;
1201
1202
  const effectivePageCount = Math.min(pageCount, MAX_PAGES);
1202
- const pageFilter = _optionalChain([options, 'optionalAccess', _10 => _10.pages]) ? _chunkMUOQXDZ4cjs.parsePageRange.call(void 0, options.pages, effectivePageCount) : null;
1203
+ const pageFilter = _optionalChain([options, 'optionalAccess', _11 => _11.pages]) ? _chunkMUOQXDZ4cjs.parsePageRange.call(void 0, options.pages, effectivePageCount) : null;
1203
1204
  const totalTarget = pageFilter ? pageFilter.size : effectivePageCount;
1204
1205
  const fontSizeFreq = /* @__PURE__ */ new Map();
1205
1206
  const pageHeights = /* @__PURE__ */ new Map();
@@ -1228,17 +1229,17 @@ async function parsePdfDocument(buffer, options) {
1228
1229
  totalChars += t.replace(/\s/g, "").length;
1229
1230
  totalTextBytes += t.length * 2;
1230
1231
  }
1231
- if (totalTextBytes > MAX_TOTAL_TEXT) throw new (0, _chunkZ65OQP3Hcjs.KordocError)("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
1232
+ if (totalTextBytes > MAX_TOTAL_TEXT) throw new (0, _chunkRFGEEHI4cjs.KordocError)("\uD14D\uC2A4\uD2B8 \uCD94\uCD9C \uD06C\uAE30 \uCD08\uACFC");
1232
1233
  parsedPages++;
1233
- _optionalChain([options, 'optionalAccess', _11 => _11.onProgress, 'optionalCall', _12 => _12(parsedPages, totalTarget)]);
1234
+ _optionalChain([options, 'optionalAccess', _12 => _12.onProgress, 'optionalCall', _13 => _13(parsedPages, totalTarget)]);
1234
1235
  } catch (pageErr) {
1235
- if (pageErr instanceof _chunkZ65OQP3Hcjs.KordocError) throw pageErr;
1236
+ if (pageErr instanceof _chunkRFGEEHI4cjs.KordocError) throw pageErr;
1236
1237
  warnings.push({ page: i, message: `\uD398\uC774\uC9C0 ${i} \uD30C\uC2F1 \uC2E4\uD328: ${pageErr instanceof Error ? pageErr.message : "\uC54C \uC218 \uC5C6\uB294 \uC624\uB958"}`, code: "PARTIAL_PARSE" });
1237
1238
  }
1238
1239
  }
1239
1240
  const parsedPageCount = parsedPages || (pageFilter ? pageFilter.size : effectivePageCount);
1240
1241
  if (totalChars / Math.max(parsedPageCount, 1) < 10) {
1241
- if (_optionalChain([options, 'optionalAccess', _13 => _13.ocr])) {
1242
+ if (_optionalChain([options, 'optionalAccess', _14 => _14.ocr])) {
1242
1243
  try {
1243
1244
  const { ocrPages } = await Promise.resolve().then(() => _interopRequireWildcard(require("./provider-YN2SSK4X.cjs")));
1244
1245
  const ocrBlocks = await ocrPages(doc, options.ocr, pageFilter, effectivePageCount);
@@ -1249,21 +1250,31 @@ async function parsePdfDocument(buffer, options) {
1249
1250
  } catch (e2) {
1250
1251
  }
1251
1252
  }
1252
- throw Object.assign(new (0, _chunkZ65OQP3Hcjs.KordocError)(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
1253
+ throw Object.assign(new (0, _chunkRFGEEHI4cjs.KordocError)(`\uC774\uBBF8\uC9C0 \uAE30\uBC18 PDF (${pageCount}\uD398\uC774\uC9C0, ${totalChars}\uC790)`), { isImageBased: true });
1253
1254
  }
1254
- if (_optionalChain([options, 'optionalAccess', _14 => _14.removeHeaderFooter]) !== false && parsedPageCount >= 3) {
1255
+ if (_optionalChain([options, 'optionalAccess', _15 => _15.removeHeaderFooter]) !== false && parsedPageCount >= 3) {
1255
1256
  const removed = removeHeaderFooterBlocks(blocks, pageHeights, warnings);
1256
1257
  for (let ri = removed.length - 1; ri >= 0; ri--) {
1257
1258
  blocks.splice(removed[ri], 1);
1258
1259
  }
1259
1260
  }
1261
+ if (_optionalChain([options, 'optionalAccess', _16 => _16.formulaOcr]) && formulaBuffer) {
1262
+ try {
1263
+ await applyFormulaOcr(formulaBuffer, blocks, pageFilter, effectivePageCount, warnings, options.onProgress);
1264
+ } catch (e) {
1265
+ warnings.push({
1266
+ message: `\uC218\uC2DD OCR \uC2E4\uD328: ${e instanceof Error ? e.message : String(e)}`,
1267
+ code: "PARTIAL_PARSE"
1268
+ });
1269
+ }
1270
+ }
1260
1271
  const medianFontSize = computeMedianFontSizeFromFreq(fontSizeFreq);
1261
1272
  if (medianFontSize > 0) {
1262
1273
  detectHeadings(blocks, medianFontSize);
1263
1274
  }
1264
1275
  detectMarkerHeadings(blocks);
1265
1276
  const outline = blocks.filter((b) => b.type === "heading" && b.level && b.text).map((b) => ({ level: b.level, text: b.text, pageNumber: b.pageNumber }));
1266
- let markdown = cleanPdfText(_chunkZ65OQP3Hcjs.blocksToMarkdown.call(void 0, blocks));
1277
+ let markdown = cleanPdfText(_chunkRFGEEHI4cjs.blocksToMarkdown.call(void 0, blocks));
1267
1278
  return { markdown, blocks, metadata, outline: outline.length > 0 ? outline : void 0, warnings: warnings.length > 0 ? warnings : void 0 };
1268
1279
  } finally {
1269
1280
  await doc.destroy().catch(() => {
@@ -1273,7 +1284,7 @@ async function parsePdfDocument(buffer, options) {
1273
1284
  async function extractPdfMetadata(doc, metadata) {
1274
1285
  try {
1275
1286
  const result = await doc.getMetadata();
1276
- if (!_optionalChain([result, 'optionalAccess', _15 => _15.info])) return;
1287
+ if (!_optionalChain([result, 'optionalAccess', _17 => _17.info])) return;
1277
1288
  const info = result.info;
1278
1289
  if (typeof info.Title === "string" && info.Title.trim()) metadata.title = info.Title.trim();
1279
1290
  if (typeof info.Author === "string" && info.Author.trim()) metadata.author = info.Author.trim();
@@ -1336,15 +1347,15 @@ function computeMedianFontSizeFromFreq(freq) {
1336
1347
  }
1337
1348
  function detectHeadings(blocks, medianFontSize) {
1338
1349
  for (const block of blocks) {
1339
- if (block.type !== "paragraph" || !block.text || !_optionalChain([block, 'access', _16 => _16.style, 'optionalAccess', _17 => _17.fontSize])) continue;
1350
+ if (block.type !== "paragraph" || !block.text || !_optionalChain([block, 'access', _18 => _18.style, 'optionalAccess', _19 => _19.fontSize])) continue;
1340
1351
  const text = block.text.trim();
1341
1352
  if (text.length === 0 || text.length > 200) continue;
1342
1353
  if (/^\d+$/.test(text)) continue;
1343
1354
  const ratio = block.style.fontSize / medianFontSize;
1344
1355
  let level = 0;
1345
- if (ratio >= _chunkZ65OQP3Hcjs.HEADING_RATIO_H1) level = 1;
1346
- else if (ratio >= _chunkZ65OQP3Hcjs.HEADING_RATIO_H2) level = 2;
1347
- else if (ratio >= _chunkZ65OQP3Hcjs.HEADING_RATIO_H3) level = 3;
1356
+ if (ratio >= _chunkRFGEEHI4cjs.HEADING_RATIO_H1) level = 1;
1357
+ else if (ratio >= _chunkRFGEEHI4cjs.HEADING_RATIO_H2) level = 2;
1358
+ else if (ratio >= _chunkRFGEEHI4cjs.HEADING_RATIO_H3) level = 3;
1348
1359
  if (level > 0) {
1349
1360
  block.type = "heading";
1350
1361
  block.level = level;
@@ -1404,7 +1415,7 @@ function detectMarkerHeadings(blocks) {
1404
1415
  block.level = 4;
1405
1416
  continue;
1406
1417
  }
1407
- if (/^[가-힣]{2,6}$/.test(text) && _optionalChain([block, 'access', _18 => _18.style, 'optionalAccess', _19 => _19.fontSize])) {
1418
+ if (/^[가-힣]{2,6}$/.test(text) && _optionalChain([block, 'access', _20 => _20.style, 'optionalAccess', _21 => _21.fontSize])) {
1408
1419
  const prev = blocks[i - 1];
1409
1420
  const next = blocks[i + 1];
1410
1421
  const prevIsStructural = !prev || prev.type === "table" || prev.type === "heading" || prev.type === "separator";
@@ -1592,7 +1603,7 @@ function extractBlocksWithGrids(items, pageNum, grids, horizontals, verticals) {
1592
1603
  }
1593
1604
  if (remaining.length > 0) {
1594
1605
  const allY = remaining.map((i) => i.y);
1595
- const pageH = _chunkZ65OQP3Hcjs.safeMax.call(void 0, allY) - _chunkZ65OQP3Hcjs.safeMin.call(void 0, allY);
1606
+ const pageH = _chunkRFGEEHI4cjs.safeMax.call(void 0, allY) - _chunkRFGEEHI4cjs.safeMin.call(void 0, allY);
1596
1607
  const groups = xyCutOrder(remaining, Math.max(15, pageH * 0.03));
1597
1608
  const textBlocks = [];
1598
1609
  for (const group of groups) {
@@ -1680,7 +1691,7 @@ function extractPageBlocksFallback(items, pageNum) {
1680
1691
  blocks.push({ type: "paragraph", text: tableText, pageNumber: pageNum, bbox, style: dominantStyle(items) });
1681
1692
  } else {
1682
1693
  const allY = items.map((i) => i.y);
1683
- const pageHeight = _chunkZ65OQP3Hcjs.safeMax.call(void 0, allY) - _chunkZ65OQP3Hcjs.safeMin.call(void 0, allY);
1694
+ const pageHeight = _chunkRFGEEHI4cjs.safeMax.call(void 0, allY) - _chunkRFGEEHI4cjs.safeMin.call(void 0, allY);
1684
1695
  const gapThreshold = Math.max(15, pageHeight * 0.03);
1685
1696
  const orderedGroups = xyCutOrder(items, gapThreshold);
1686
1697
  for (const group of orderedGroups) {
@@ -1729,7 +1740,7 @@ function dominantStyle(items) {
1729
1740
  }
1730
1741
  }
1731
1742
  if (dominantSize === 0) return void 0;
1732
- const fontName = _optionalChain([items, 'access', _20 => _20.find, 'call', _21 => _21((i) => i.fontSize === dominantSize), 'optionalAccess', _22 => _22.fontName]) || void 0;
1743
+ const fontName = _optionalChain([items, 'access', _22 => _22.find, 'call', _23 => _23((i) => i.fontSize === dominantSize), 'optionalAccess', _24 => _24.fontName]) || void 0;
1733
1744
  return { fontSize: dominantSize, fontName };
1734
1745
  }
1735
1746
  function normalizeItems(rawItems) {
@@ -1827,14 +1838,14 @@ function isProseSpread(items) {
1827
1838
  for (let i = 1; i < sorted.length; i++) {
1828
1839
  gaps.push(sorted[i].x - (sorted[i - 1].x + sorted[i - 1].w));
1829
1840
  }
1830
- const maxGap = _chunkZ65OQP3Hcjs.safeMax.call(void 0, gaps);
1841
+ const maxGap = _chunkRFGEEHI4cjs.safeMax.call(void 0, gaps);
1831
1842
  const avgLen = items.reduce((s, i) => s + i.text.length, 0) / items.length;
1832
1843
  return maxGap < 40 && avgLen < 5;
1833
1844
  }
1834
1845
  function detectColumns(yLines) {
1835
1846
  const allItems = yLines.flat();
1836
1847
  if (allItems.length === 0) return null;
1837
- const pageWidth = _chunkZ65OQP3Hcjs.safeMax.call(void 0, allItems.map((i) => i.x + i.w)) - _chunkZ65OQP3Hcjs.safeMin.call(void 0, allItems.map((i) => i.x));
1848
+ const pageWidth = _chunkRFGEEHI4cjs.safeMax.call(void 0, allItems.map((i) => i.x + i.w)) - _chunkRFGEEHI4cjs.safeMin.call(void 0, allItems.map((i) => i.x));
1838
1849
  if (pageWidth < 100) return null;
1839
1850
  let bigoLineIdx = -1;
1840
1851
  for (let i = 0; i < yLines.length; i++) {
@@ -2034,7 +2045,7 @@ function buildGridTable(lines, columns) {
2034
2045
  return md.join("\n");
2035
2046
  }
2036
2047
  function mergeLineSimple(items) {
2037
- if (items.length <= 1) return _optionalChain([items, 'access', _23 => _23[0], 'optionalAccess', _24 => _24.text]) || "";
2048
+ if (items.length <= 1) return _optionalChain([items, 'access', _25 => _25[0], 'optionalAccess', _26 => _26.text]) || "";
2038
2049
  const sorted = [...items].sort((a, b) => a.x - b.x);
2039
2050
  const isEvenSpaced = detectEvenSpacedItems(sorted);
2040
2051
  let result = sorted[0].text;
@@ -2071,7 +2082,10 @@ function mergeLineSimple(items) {
2071
2082
  function cleanPdfText(text) {
2072
2083
  return mergeKoreanLines(
2073
2084
  text.replace(/^\d{1,4}\n/, "").replace(/^[\s]*[-–—]\s*[-–—]?\d+[-–—]?[\s]*[-–—]?[\s]*$/gm, "").replace(/^\s*\d+\s*\/\s*\d+\s*$/gm, "").replace(/\n\d{1,4}\n/g, "\n").replace(/\n\d{1,4}$/, "").replace(/^#{1,6}\s*\d{1,4}\s*$/gm, "")
2074
- ).replace(/^(?!\| ---).*$/gm, (line) => collapseEvenSpacing(line)).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
2085
+ ).replace(/^(?!\| ---).*$/gm, (line) => {
2086
+ if (/^\s*\${1,2}.+\${1,2}\s*$/.test(line)) return line;
2087
+ return collapseEvenSpacing(line);
2088
+ }).replace(/([□■◆○●▶ㅇ])\s+([가-힣])\s+([가-힣])/g, "$1 $2$3").replace(/\n{3,}/g, "\n\n").trim();
2075
2089
  }
2076
2090
  function startsWithMarker(line) {
2077
2091
  const t = line.trimStart();
@@ -2194,7 +2208,7 @@ function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
2194
2208
  const bottomEntries = [];
2195
2209
  for (let bi = 0; bi < blocks.length; bi++) {
2196
2210
  const b = blocks[bi];
2197
- if (!b.bbox || !b.pageNumber || !_optionalChain([b, 'access', _25 => _25.text, 'optionalAccess', _26 => _26.trim, 'call', _27 => _27()])) continue;
2211
+ if (!b.bbox || !b.pageNumber || !_optionalChain([b, 'access', _27 => _27.text, 'optionalAccess', _28 => _28.trim, 'call', _29 => _29()])) continue;
2198
2212
  const ph = pageHeights.get(b.bbox.page) || pageHeights.get(b.pageNumber);
2199
2213
  if (!ph) continue;
2200
2214
  const blockTop = ph - (b.bbox.y + b.bbox.height);
@@ -2217,7 +2231,7 @@ function removeHeaderFooterBlocks(blocks, pageHeights, warnings) {
2217
2231
  }
2218
2232
  const repeatedPatterns = /* @__PURE__ */ new Set();
2219
2233
  for (const [p, count] of patternCount) {
2220
- if (count >= MIN_REPEAT && (_nullishCoalesce(_optionalChain([patternPages, 'access', _28 => _28.get, 'call', _29 => _29(p), 'optionalAccess', _30 => _30.size]), () => ( 0))) >= MIN_REPEAT) {
2234
+ if (count >= MIN_REPEAT && (_nullishCoalesce(_optionalChain([patternPages, 'access', _30 => _30.get, 'call', _31 => _31(p), 'optionalAccess', _32 => _32.size]), () => ( 0))) >= MIN_REPEAT) {
2221
2235
  repeatedPatterns.add(p);
2222
2236
  }
2223
2237
  }
@@ -2274,9 +2288,127 @@ function mergeKoreanLines(text) {
2274
2288
  }
2275
2289
  return result.join("\n");
2276
2290
  }
2291
+ async function applyFormulaOcr(buffer, blocks, pageFilter, effectivePageCount, warnings, _onProgress) {
2292
+ const formulaMod = await Promise.resolve().then(() => _interopRequireWildcard(require("./formula-XGG6ZP42.cjs")));
2293
+ const { FormulaPipeline, ensureFormulaModels } = formulaMod;
2294
+ await ensureFormulaModels((p) => {
2295
+ if (p.phase === "download" && p.total) {
2296
+ const pct = Math.floor(p.downloaded / p.total * 100);
2297
+ process.stderr.write(`\r[kordoc-formula] ${p.spec.name} ${pct}% (${formatMb(p.downloaded)}/${formatMb(p.total)})`);
2298
+ if (p.downloaded >= p.total) process.stderr.write("\n");
2299
+ } else if (p.phase === "verify") {
2300
+ process.stderr.write(`[kordoc-formula] ${p.spec.name} SHA-256 \uAC80\uC99D \uC911...
2301
+ `);
2302
+ } else if (p.phase === "done") {
2303
+ process.stderr.write(`[kordoc-formula] ${p.spec.name} \uC900\uBE44 \uC644\uB8CC
2304
+ `);
2305
+ } else if (p.phase === "skip") {
2306
+ }
2307
+ });
2308
+ const pipeline = await FormulaPipeline.create();
2309
+ try {
2310
+ const pagesResult = await pipeline.runOnBuffer(buffer, pageFilter);
2311
+ if (pagesResult.length === 0) return;
2312
+ let insertedCount = 0;
2313
+ let removedDupCount = 0;
2314
+ for (const page of pagesResult) {
2315
+ const pageNumber = page.pageNumber;
2316
+ const pdfHeight = page.pdfHeight;
2317
+ const scaleX = page.renderedWidth > 0 ? page.pdfWidth / page.renderedWidth : 0.5;
2318
+ const scaleY = page.renderedHeight > 0 ? page.pdfHeight / page.renderedHeight : 0.5;
2319
+ const candidates = [];
2320
+ for (const r of page.regions) {
2321
+ if (!r.latex || !r.latex.trim()) continue;
2322
+ const wrapped = r.kind === "display" ? `$$${r.latex}$$` : `$${r.latex}$`;
2323
+ const x1 = r.bbox.x1 * scaleX;
2324
+ const x2 = r.bbox.x2 * scaleX;
2325
+ const yTop = pdfHeight - r.bbox.y1 * scaleY;
2326
+ const yBottom = pdfHeight - r.bbox.y2 * scaleY;
2327
+ const centerY = (yTop + yBottom) / 2;
2328
+ const width = x2 - x1;
2329
+ const height = yTop - yBottom;
2330
+ candidates.push({
2331
+ block: {
2332
+ type: "paragraph",
2333
+ text: wrapped,
2334
+ pageNumber,
2335
+ bbox: { page: pageNumber, x: x1, y: yBottom, width, height }
2336
+ },
2337
+ pdfBbox: { x1, x2, yTop, yBottom },
2338
+ centerY
2339
+ });
2340
+ }
2341
+ if (candidates.length === 0) continue;
2342
+ const OVERLAP_THRESHOLD = 0.6;
2343
+ const indicesToRemove = /* @__PURE__ */ new Set();
2344
+ for (let i = 0; i < blocks.length; i++) {
2345
+ const b = blocks[i];
2346
+ if (b.pageNumber !== pageNumber) continue;
2347
+ if (b.type === "table") continue;
2348
+ if (!b.bbox || b.bbox.width <= 0 || b.bbox.height <= 0) continue;
2349
+ const blockArea = b.bbox.width * b.bbox.height;
2350
+ if (blockArea <= 0) continue;
2351
+ for (const c of candidates) {
2352
+ const ox1 = Math.max(b.bbox.x, c.pdfBbox.x1);
2353
+ const ox2 = Math.min(b.bbox.x + b.bbox.width, c.pdfBbox.x2);
2354
+ const oy1 = Math.max(b.bbox.y, c.pdfBbox.yBottom);
2355
+ const oy2 = Math.min(b.bbox.y + b.bbox.height, c.pdfBbox.yTop);
2356
+ const interArea = Math.max(0, ox2 - ox1) * Math.max(0, oy2 - oy1);
2357
+ if (interArea / blockArea >= OVERLAP_THRESHOLD) {
2358
+ indicesToRemove.add(i);
2359
+ break;
2360
+ }
2361
+ }
2362
+ }
2363
+ if (indicesToRemove.size > 0) {
2364
+ const sorted = [...indicesToRemove].sort((a, b) => b - a);
2365
+ for (const idx of sorted) blocks.splice(idx, 1);
2366
+ removedDupCount += indicesToRemove.size;
2367
+ }
2368
+ candidates.sort((a, b) => b.centerY - a.centerY);
2369
+ for (const c of candidates) {
2370
+ let insertIdx = -1;
2371
+ let pageFirstIdx = -1;
2372
+ let pageLastIdx = -1;
2373
+ for (let i = 0; i < blocks.length; i++) {
2374
+ const b = blocks[i];
2375
+ if (b.pageNumber !== pageNumber) continue;
2376
+ if (pageFirstIdx === -1) pageFirstIdx = i;
2377
+ pageLastIdx = i;
2378
+ if (!b.bbox) continue;
2379
+ const blockCenter = b.bbox.y + b.bbox.height / 2;
2380
+ if (blockCenter < c.centerY) {
2381
+ insertIdx = i;
2382
+ break;
2383
+ }
2384
+ }
2385
+ if (insertIdx !== -1) {
2386
+ blocks.splice(insertIdx, 0, c.block);
2387
+ } else if (pageLastIdx !== -1) {
2388
+ blocks.splice(pageLastIdx + 1, 0, c.block);
2389
+ } else {
2390
+ blocks.push(c.block);
2391
+ }
2392
+ insertedCount++;
2393
+ }
2394
+ }
2395
+ if (insertedCount > 0 || removedDupCount > 0) {
2396
+ process.stderr.write(
2397
+ `[kordoc-formula] ${insertedCount}\uAC1C \uC218\uC2DD \uC0BD\uC785, ${removedDupCount}\uAC1C \uC911\uBCF5 block \uC81C\uAC70 (${pagesResult.length}\uAC1C \uD398\uC774\uC9C0)
2398
+ `
2399
+ );
2400
+ }
2401
+ } finally {
2402
+ await pipeline.destroy().catch(() => {
2403
+ });
2404
+ }
2405
+ }
2406
+ function formatMb(bytes) {
2407
+ return `${(bytes / 1024 / 1024).toFixed(1)}MB`;
2408
+ }
2277
2409
 
2278
2410
 
2279
2411
 
2280
2412
 
2281
2413
  exports.cleanPdfText = cleanPdfText; exports.extractPdfMetadataOnly = extractPdfMetadataOnly; exports.parsePdfDocument = parsePdfDocument;
2282
- //# sourceMappingURL=parser-AZYPOKAR.cjs.map
2414
+ //# sourceMappingURL=parser-SRI2TIZX.cjs.map