pretext-pdfjs 0.3.4 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/package.json +1 -1
  2. package/src/reflow.js +248 -7
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pretext-pdfjs",
3
- "version": "0.3.4",
3
+ "version": "0.3.5",
4
4
  "description": "Pretext-native text layer for PDF.js — zero DOM reflows, per-block reflow with image preservation, pinch-to-zoom text",
5
5
  "type": "module",
6
6
  "main": "./src/index.js",
package/src/reflow.js CHANGED
@@ -439,7 +439,7 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, textRuns) {
439
439
  if (current) blocks.push(current);
440
440
 
441
441
  // Post-process: merge orphan tiny blocks (superscripts, markers like *, +, #)
442
- // into the nearest larger block if vertically close
442
+ // into the nearest larger block if vertically close AND horizontally aligned
443
443
  for (let i = blocks.length - 1; i >= 0; i--) {
444
444
  const block = blocks[i];
445
445
  if (block.items.length > 2) continue;
@@ -456,6 +456,15 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, textRuns) {
456
456
  // Check vertical proximity: orphan center within 30pt of target block
457
457
  const bcy = block.bbox.y + block.bbox.h / 2;
458
458
  if (bcy < o.bbox.y - 30 || bcy > o.bbox.y + o.bbox.h + 30) continue;
459
+ // Horizontal center alignment check - must be roughly in same column
460
+ const bcx = block.bbox.x + block.bbox.w / 2;
461
+ const ocx = o.bbox.x + o.bbox.w / 2;
462
+ const hCenterDist = Math.abs(bcx - ocx);
463
+ // Must have significant horizontal overlap or be in same column
464
+ const xOverlap = Math.max(0, Math.min(block.bbox.x + block.bbox.w, o.bbox.x + o.bbox.w) -
465
+ Math.max(block.bbox.x, o.bbox.x));
466
+ const inSameColumn = hCenterDist < Math.max(block.bbox.w, o.bbox.w) * 0.8 || xOverlap > 0;
467
+ if (!inSameColumn) continue;
459
468
  // Horizontal edge-to-edge distance (0 if overlapping)
460
469
  const hDist = Math.max(0,
461
470
  block.bbox.x > o.bbox.x + o.bbox.w ? block.bbox.x - (o.bbox.x + o.bbox.w) :
@@ -479,6 +488,83 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, textRuns) {
479
488
  }
480
489
  }
481
490
 
491
+ // Post-process: detect multi-column grids (like author sections)
492
+ // Group blocks that form aligned columns into a single composite block
493
+ const multiColumnBlocks = [];
494
+ const processed = new Set();
495
+
496
+ for (let i = 0; i < blocks.length; i++) {
497
+ if (processed.has(i)) continue;
498
+ const block = blocks[i];
499
+ const blockText = block.items.map(it => (it.str || "").trim()).join(" ");
500
+ const blockCenterX = block.bbox.x + block.bbox.w / 2;
501
+
502
+ // Find all blocks in same horizontal band (similar Y position)
503
+ const sameRowBlocks = [block];
504
+ const rowY = block.bbox.y;
505
+ const rowH = block.bbox.h;
506
+
507
+ for (let j = i + 1; j < blocks.length; j++) {
508
+ if (processed.has(j)) continue;
509
+ const other = blocks[j];
510
+ // Check if in same row (vertical overlap)
511
+ const yOverlap = Math.max(0, Math.min(rowY + rowH, other.bbox.y + other.bbox.h) - Math.max(rowY, other.bbox.y));
512
+ const minH = Math.min(rowH, other.bbox.h);
513
+ if (yOverlap > minH * 0.5) {
514
+ sameRowBlocks.push(other);
515
+ }
516
+ }
517
+
518
+ // If we have multiple blocks in same row, this might be a multi-column layout
519
+ if (sameRowBlocks.length >= 2) {
520
+ // Sort by X position
521
+ sameRowBlocks.sort((a, b) => a.bbox.x - b.bbox.x);
522
+ // Check if they're roughly aligned (similar height, spaced evenly)
523
+ const avgH = sameRowBlocks.reduce((s, b) => s + b.bbox.h, 0) / sameRowBlocks.length;
524
+ const heightsOk = sameRowBlocks.every(b => Math.abs(b.bbox.h - avgH) < avgH * 0.5);
525
+
526
+ if (heightsOk) {
527
+ // Merge into a single composite block that preserves multi-column info
528
+ const allItems = [];
529
+ for (const b of sameRowBlocks) {
530
+ allItems.push(...b.items);
531
+ processed.add(blocks.indexOf(b));
532
+ }
533
+ // Sort items by Y then X to maintain reading order within the grid
534
+ allItems.sort((a, b) => {
535
+ const ay = pageHeight - a.transform[5];
536
+ const by = pageHeight - b.transform[5];
537
+ if (Math.abs(ay - by) > 2) return ay - by;
538
+ return a.transform[4] - b.transform[4];
539
+ });
540
+
541
+ const bbox = {
542
+ x: Math.min(...sameRowBlocks.map(b => b.bbox.x)),
543
+ y: Math.min(...sameRowBlocks.map(b => b.bbox.y)),
544
+ w: Math.max(...sameRowBlocks.map(b => b.bbox.x + b.bbox.w)) - Math.min(...sameRowBlocks.map(b => b.bbox.x)),
545
+ h: Math.max(...sameRowBlocks.map(b => b.bbox.y + b.bbox.h)) - Math.min(...sameRowBlocks.map(b => b.bbox.y))
546
+ };
547
+
548
+ multiColumnBlocks.push({
549
+ items: allItems,
550
+ bbox,
551
+ isMultiColumn: true,
552
+ columnCount: sameRowBlocks.length
553
+ });
554
+ continue;
555
+ }
556
+ }
557
+
558
+ if (!processed.has(i)) {
559
+ multiColumnBlocks.push(block);
560
+ processed.add(i);
561
+ }
562
+ }
563
+
564
+ // Replace blocks with multi-column merged version
565
+ blocks.length = 0;
566
+ blocks.push(...multiColumnBlocks);
567
+
482
568
  // Compute font metadata per block using real font objects from commonObjs
483
569
  for (const block of blocks) {
484
570
  const sizes = [];
@@ -551,8 +637,7 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, textRuns) {
551
637
 
552
638
  /**
553
639
  * Extract graphic regions from the page operator list.
554
- * Only captures image operators (paintImageXObject etc).
555
- * Skips path/fill/stroke to avoid false positives from text decorations.
640
+ * Captures images and horizontal divider lines (thin rectangles).
556
641
  */
557
642
  function extractGraphicRegions(opList, OPS) {
558
643
  const regions = [];
@@ -609,6 +694,23 @@ function extractGraphicRegions(opList, OPS) {
609
694
  bbox: { x: minX, y: minY, w: maxX - minX, h: maxY - minY },
610
695
  });
611
696
  }
697
+ } else if (fn === OPS.rectangle) {
698
+ // Check for thin horizontal lines (dividers)
699
+ const [x, y, w, h] = args;
700
+ if (w > 100 && h > 0.5 && h < 5) {
701
+ const corners = [
702
+ transformPoint(x, y),
703
+ transformPoint(x + w, y),
704
+ transformPoint(x, y + h),
705
+ transformPoint(x + w, y + h),
706
+ ];
707
+ const xs = corners.map(c => c[0]);
708
+ const ys = corners.map(c => c[1]);
709
+ regions.push({
710
+ type: "divider",
711
+ bbox: { x: Math.min(...xs), y: Math.min(...ys), w: Math.max(...xs) - Math.min(...xs), h: Math.max(...ys) - Math.min(...ys) },
712
+ });
713
+ }
612
714
  }
613
715
  }
614
716
 
@@ -744,7 +846,110 @@ function findParagraphThreshold(gaps, fontSize) {
744
846
  /**
745
847
  * Build text content for a block, preserving paragraph breaks.
746
848
  */
849
/**
 * Build the text of a multi-column block (e.g. an author grid), emitting one
 * output line per visual row with a single space between columns.
 *
 * Steps:
 *  1. Bucket items into rows by their top-down Y (`pageHeight - transform[5]`),
 *     rounded to the nearest multiple of 2.
 *  2. A row with at least two items whose trimmed strings are all <= 3 chars is
 *     treated as a "marker row" (footnote symbols). Its markers are attached to
 *     the horizontally closest item of the next non-marker row.
 *  3. Each resulting row is joined left-to-right into a line.
 *
 * No text is discarded: a marker row followed by another marker row is emitted
 * as its own line, and a marker with no item within reach stays in the row as
 * a standalone item. Each marker is attached to at most one item.
 *
 * @param {{items: Array<{str: string, transform: number[], width?: number}>, avgFontSize?: number}} block
 *   Block whose `items` carry `str`, a `transform` array (index 4 = x,
 *   index 5 = y) and an optional `width`.
 * @param {number} pageHeight Page height used to flip Y to a top-down axis.
 * @returns {string} Row lines joined with "\n" ("" when there are no items).
 */
function blockToTextMultiColumn(block, pageHeight) {
  const fontHeight = block.avgFontSize || 12;
  // Trimmed length at or below which an item counts as a footnote marker.
  const MARKER_MAX_LEN = 3;
  // Maximum center-to-center X distance for attaching a marker to an item.
  const MARKER_ATTACH_DIST = 50;

  const byX = (a, b) => a.transform[4] - b.transform[4];
  const centerX = (item) => item.transform[4] + (item.width || 0) / 2;
  const trimmedLen = (item) => (item.str || "").trim().length;

  // Group items by row (2-unit granularity so slightly offset baselines share a row).
  const rows = new Map();
  for (const item of block.items) {
    if (!item.str) continue;
    const y = pageHeight - item.transform[5];
    const rowKey = Math.round(y / 2) * 2;
    if (!rows.has(rowKey)) rows.set(rowKey, []);
    rows.get(rowKey).push(item);
  }
  const sortedRowKeys = [...rows.keys()].sort((a, b) => a - b);

  // Merge marker rows into the row that follows them.
  const mergedRows = [];
  let pendingMarkers = null;

  for (const rowKey of sortedRowKeys) {
    const rowItems = rows.get(rowKey).sort(byX);
    const isMarkerRow =
      rowItems.length >= 2 &&
      rowItems.every((it) => trimmedLen(it) <= MARKER_MAX_LEN);

    if (isMarkerRow) {
      // A previous marker row that never met a text row must not be lost:
      // emit it as its own line before tracking the new one.
      if (pendingMarkers) mergedRows.push(pendingMarkers);
      pendingMarkers = rowItems;
      continue;
    }

    if (!pendingMarkers) {
      mergedRows.push(rowItems);
      continue;
    }

    // Assign every marker to its single closest item, so a marker is never
    // duplicated across items and never silently dropped.
    const suffixes = rowItems.map(() => "");
    const unattached = [];
    for (const marker of pendingMarkers) {
      const markerCenterX = centerX(marker);
      let closestIdx = -1;
      let minDist = Infinity;
      rowItems.forEach((item, idx) => {
        const dist = Math.abs(markerCenterX - centerX(item));
        if (dist < minDist) {
          minDist = dist;
          closestIdx = idx;
        }
      });
      if (closestIdx !== -1 && minDist < MARKER_ATTACH_DIST) {
        suffixes[closestIdx] += marker.str;
      } else {
        unattached.push(marker);
      }
    }

    const mergedItems = rowItems.map((item, idx) =>
      suffixes[idx] ? { ...item, str: item.str + suffixes[idx] } : item
    );
    // Out-of-reach markers keep their own position within the row.
    mergedItems.push(...unattached);
    mergedItems.sort(byX);
    mergedRows.push(mergedItems);
    pendingMarkers = null;
  }

  // A trailing marker row has nothing to attach to; keep it as its own line.
  if (pendingMarkers) mergedRows.push(pendingMarkers);

  // Build output lines.
  const lines = mergedRows.map((rowItems) => {
    let lineText = "";
    let lastX = null;
    let lastW = 0;
    let lastItemLen = 0;

    for (const item of rowItems) {
      const currentX = item.transform[4];
      const currentItemLen = trimmedLen(item);
      const isShortItem = currentItemLen <= MARKER_MAX_LEN;

      if (lastX !== null) {
        const hGap = currentX - (lastX + lastW);
        const prevWasLong = lastItemLen > 2;
        // Add a column separator, but not between long text and a trailing
        // short item, which is treated as a footnote marker glued to it.
        if (hGap > fontHeight * 0.3 && (!prevWasLong || !isShortItem)) {
          lineText += " ";
        }
      }

      lineText += item.str;
      lastX = currentX;
      // Fall back to half the font height when the item reports no width.
      lastW = item.width || fontHeight * 0.5;
      lastItemLen = currentItemLen;
    }
    return lineText.trim();
  });

  return lines.join("\n");
}
946
+
747
947
  function blockToText(block, pageHeight) {
948
+ // Special handling for multi-column blocks (like author grids)
949
+ if (block.isMultiColumn && block.columnCount >= 2) {
950
+ return blockToTextMultiColumn(block, pageHeight);
951
+ }
952
+
748
953
  // First pass: collect all gaps and font sizes to compute adaptive threshold
749
954
  const gaps = [];
750
955
  let lastY = null;
@@ -772,28 +977,39 @@ function blockToText(block, pageHeight) {
772
977
  lastY = null;
773
978
  let lastX = null;
774
979
  let lastW = 0;
980
+ let lastItemLen = 0; // Track length of previous item for marker detection
775
981
 
776
982
  for (const item of block.items) {
777
983
  if (!item.str) continue;
778
984
  const currentX = item.transform[4];
779
985
  const currentY = pageHeight - item.transform[5];
986
+ const currentItemLen = (item.str || "").trim().length;
987
+ // Short items are typically footnote markers (*, †, ‡, #, etc.)
988
+ // Allow up to 3 chars to handle combined markers like "* †"
989
+ const isShortItem = currentItemLen <= 3;
780
990
 
781
991
  if (lastY !== null) {
782
992
  const vGap = Math.abs(currentY - lastY);
783
- const isShortItem = (item.str || "").trim().length <= 2;
784
993
 
785
994
  // Use adaptive threshold for paragraph detection
786
995
  if (vGap > lastFontSize * paraThreshold && !isShortItem) {
787
996
  result += "\n\n";
788
997
  } else if (vGap > lineThreshold) {
789
998
  // Different line — insert space
790
- if (!result.endsWith(" ") && !result.endsWith("\n")) {
791
- result += " ";
999
+ // But skip space if previous item was long and current is short (footnote marker)
1000
+ // This handles superscript markers like *, +, #, †, ‡
1001
+ const prevWasLong = lastItemLen > 2;
1002
+ if (!prevWasLong || !isShortItem) {
1003
+ if (!result.endsWith(" ") && !result.endsWith("\n")) {
1004
+ result += " ";
1005
+ }
792
1006
  }
793
1007
  } else if (lastX !== null) {
794
1008
  // Same line — check horizontal gap between items
795
1009
  const hGap = currentX - (lastX + lastW);
796
- if (hGap > lastFontSize * 0.15) {
1010
+ // Skip adding space before short items (superscript markers like *, +, #, $)
1011
+ // These are usually footnote markers that should attach directly to preceding text
1012
+ if (hGap > lastFontSize * 0.15 && !isShortItem) {
797
1013
  if (!result.endsWith(" ") && !result.endsWith("\n")) {
798
1014
  result += " ";
799
1015
  }
@@ -803,6 +1019,7 @@ function blockToText(block, pageHeight) {
803
1019
  lastY = currentY;
804
1020
  lastX = currentX;
805
1021
  lastW = item.width || 0;
1022
+ lastItemLen = currentItemLen;
806
1023
  result += item.str;
807
1024
  }
808
1025
  return result.trim();
@@ -1212,6 +1429,13 @@ function reflowAndComposite(analysis, opts) {
1212
1429
  colorSpans: block.colorSpans || [],
1213
1430
  region,
1214
1431
  });
1432
+ } else if (region.type === "divider") {
1433
+ // Horizontal divider line
1434
+ reflowedRegions.push({
1435
+ type: "divider",
1436
+ height: 4, // Small height for the divider line area
1437
+ region,
1438
+ });
1215
1439
  } else {
1216
1440
  // Graphic
1217
1441
  const bitmap = bitmaps.get(region);
@@ -1469,6 +1693,23 @@ export function createReflowRenderer(container, options = {}) {
1469
1693
  lineCharOffset += line.text.length;
1470
1694
  cursorY += lh;
1471
1695
  }
1696
+ } else if (r.type === "divider") {
1697
+ // Draw horizontal divider line
1698
+ const screenY = cursorY - scrollY + 1; // Slight offset to center in area
1699
+ if (screenY > -10 && screenY < H + 10) {
1700
+ const lineWidth = Math.min(400, W - padding * 2); // Max 400px or fit with padding
1701
+ const startX = (W - lineWidth) / 2; // Center the line
1702
+ ctx.save();
1703
+ ctx.strokeStyle = textColor;
1704
+ ctx.globalAlpha = 0.3;
1705
+ ctx.lineWidth = 1 * d;
1706
+ ctx.beginPath();
1707
+ ctx.moveTo(startX * d, screenY * d);
1708
+ ctx.lineTo((startX + lineWidth) * d, screenY * d);
1709
+ ctx.stroke();
1710
+ ctx.restore();
1711
+ }
1712
+ cursorY += r.height;
1472
1713
  } else if (r.type === "graphic" && r.bitmap) {
1473
1714
  const screenY = cursorY - scrollY;
1474
1715
  if (screenY > -r.drawH && screenY < H + r.drawH) {