pretext-pdfjs 0.2.0 → 0.2.1

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/src/reflow.js +37 -9
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pretext-pdfjs",
3
- "version": "0.2.0",
3
+ "version": "0.2.1",
4
4
  "description": "Fork of PDF.js with @chenglou/pretext-native text layer — zero DOM reflows for text measurement",
5
5
  "type": "module",
6
6
  "main": "./src/index.js",
package/src/reflow.js CHANGED
@@ -61,12 +61,20 @@ function groupTextBlocks(textItems, pageHeight, styles) {
61
61
  const verticalGap = Math.abs(y - lastY);
62
62
 
63
63
  // Split block on significant font size change (headings vs body)
64
+ // But don't split for superscripts/markers that are horizontally adjacent
64
65
  const sizeRatio = fontHeight > 0 && lastFH > 0
65
66
  ? Math.max(fontHeight, lastFH) / Math.min(fontHeight, lastFH)
66
67
  : 1;
68
+ const lastX = lastItem.transform[4];
69
+ const lastW = lastItem.width || lastFH;
70
+ const hGap = x - (lastX + lastW);
71
+ const isHorizAdjacent = hGap < lastFH * 0.5 && hGap > -lastFH;
72
+ const isShortItem = (item.str || "").trim().length <= 2;
73
+ const isSuperscript = isShortItem && isHorizAdjacent && sizeRatio > 1.3;
74
+ const sizeOk = sizeRatio < 1.3 || isSuperscript;
67
75
 
68
76
  if (
69
- sizeRatio < 1.3 &&
77
+ sizeOk &&
70
78
  verticalGap < lastFH * 2.5 &&
71
79
  x < current.bbox.x + current.bbox.w + lastFH * 2
72
80
  ) {
@@ -248,11 +256,33 @@ function buildRegionMap(textBlocks, graphicRegions, pageHeight) {
248
256
  }
249
257
  }
250
258
 
251
- // Sort by reading order: top to bottom, then left to right
252
- regions.sort((a, b) => {
253
- if (Math.abs(a.bbox.y - b.bbox.y) > 10) return a.bbox.y - b.bbox.y;
254
- return a.bbox.x - b.bbox.x;
255
- });
259
+ // Detect columns: find the midpoint X where blocks cluster on left vs right
260
+ const pageWidth = Math.max(...regions.map(r => r.bbox.x + r.bbox.w), 1);
261
+ const midX = pageWidth / 2;
262
+ const leftBlocks = regions.filter(r => r.bbox.x + r.bbox.w / 2 < midX);
263
+ const rightBlocks = regions.filter(r => r.bbox.x + r.bbox.w / 2 >= midX);
264
+ const hasColumns = leftBlocks.length > 2 && rightBlocks.length > 2 &&
265
+ rightBlocks.some(r => leftBlocks.some(l => Math.abs(l.bbox.y - r.bbox.y) < 20));
266
+
267
+ if (hasColumns) {
268
+ // Two-column: sort each column top-to-bottom, then concatenate
269
+ // Full-width blocks (spanning > 60% of page) go first, sorted by Y
270
+ const fullWidth = regions.filter(r => r.bbox.w > pageWidth * 0.6);
271
+ const leftCol = regions.filter(r => r.bbox.w <= pageWidth * 0.6 && r.bbox.x + r.bbox.w / 2 < midX);
272
+ const rightCol = regions.filter(r => r.bbox.w <= pageWidth * 0.6 && r.bbox.x + r.bbox.w / 2 >= midX);
273
+ const byY = (a, b) => a.bbox.y - b.bbox.y;
274
+ fullWidth.sort(byY);
275
+ leftCol.sort(byY);
276
+ rightCol.sort(byY);
277
+ regions.length = 0;
278
+ regions.push(...fullWidth, ...leftCol, ...rightCol);
279
+ } else {
280
+ // Single column: sort by Y then X
281
+ regions.sort((a, b) => {
282
+ if (Math.abs(a.bbox.y - b.bbox.y) > 10) return a.bbox.y - b.bbox.y;
283
+ return a.bbox.x - b.bbox.x;
284
+ });
285
+ }
256
286
 
257
287
  return regions;
258
288
  }
@@ -363,9 +393,7 @@ function reflowAndComposite(analysis, opts) {
363
393
  const blockFontSize = Math.round(fontSize * (block.fontScale || 1));
364
394
  const blockLH = blockFontSize * lineHeight;
365
395
  const style = block.isItalic ? "italic" : "normal";
366
- // Headings get lighter weight to match typical PDF display fonts
367
- const scale = block.fontScale || 1;
368
- const weight = block.isBold ? 700 : scale > 1.8 ? 300 : scale > 1.3 ? 400 : 400;
396
+ const weight = block.isBold ? 700 : 400;
369
397
  // Use PDF's detected font family if available, otherwise fall back to configured
370
398
  const blockFamily = block.pdfFontFamily
371
399
  ? `${block.pdfFontFamily}, ${fontFamily}`