pretext-pdfjs 0.3.4 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/reflow.js +248 -7
package/package.json
CHANGED
package/src/reflow.js
CHANGED
|
@@ -439,7 +439,7 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, textRuns) {
|
|
|
439
439
|
if (current) blocks.push(current);
|
|
440
440
|
|
|
441
441
|
// Post-process: merge orphan tiny blocks (superscripts, markers like *, +, #)
|
|
442
|
-
// into the nearest larger block if vertically close
|
|
442
|
+
// into the nearest larger block if vertically close AND horizontally aligned
|
|
443
443
|
for (let i = blocks.length - 1; i >= 0; i--) {
|
|
444
444
|
const block = blocks[i];
|
|
445
445
|
if (block.items.length > 2) continue;
|
|
@@ -456,6 +456,15 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, textRuns) {
|
|
|
456
456
|
// Check vertical proximity: orphan center within 30pt of target block
|
|
457
457
|
const bcy = block.bbox.y + block.bbox.h / 2;
|
|
458
458
|
if (bcy < o.bbox.y - 30 || bcy > o.bbox.y + o.bbox.h + 30) continue;
|
|
459
|
+
// Horizontal center alignment check - must be roughly in same column
|
|
460
|
+
const bcx = block.bbox.x + block.bbox.w / 2;
|
|
461
|
+
const ocx = o.bbox.x + o.bbox.w / 2;
|
|
462
|
+
const hCenterDist = Math.abs(bcx - ocx);
|
|
463
|
+
// Must have significant horizontal overlap or be in same column
|
|
464
|
+
const xOverlap = Math.max(0, Math.min(block.bbox.x + block.bbox.w, o.bbox.x + o.bbox.w) -
|
|
465
|
+
Math.max(block.bbox.x, o.bbox.x));
|
|
466
|
+
const inSameColumn = hCenterDist < Math.max(block.bbox.w, o.bbox.w) * 0.8 || xOverlap > 0;
|
|
467
|
+
if (!inSameColumn) continue;
|
|
459
468
|
// Horizontal edge-to-edge distance (0 if overlapping)
|
|
460
469
|
const hDist = Math.max(0,
|
|
461
470
|
block.bbox.x > o.bbox.x + o.bbox.w ? block.bbox.x - (o.bbox.x + o.bbox.w) :
|
|
@@ -479,6 +488,83 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, textRuns) {
|
|
|
479
488
|
}
|
|
480
489
|
}
|
|
481
490
|
|
|
491
|
+
// Post-process: detect multi-column grids (like author sections)
|
|
492
|
+
// Group blocks that form aligned columns into a single composite block
|
|
493
|
+
const multiColumnBlocks = [];
|
|
494
|
+
const processed = new Set();
|
|
495
|
+
|
|
496
|
+
for (let i = 0; i < blocks.length; i++) {
|
|
497
|
+
if (processed.has(i)) continue;
|
|
498
|
+
const block = blocks[i];
|
|
499
|
+
const blockText = block.items.map(it => (it.str || "").trim()).join(" ");
|
|
500
|
+
const blockCenterX = block.bbox.x + block.bbox.w / 2;
|
|
501
|
+
|
|
502
|
+
// Find all blocks in same horizontal band (similar Y position)
|
|
503
|
+
const sameRowBlocks = [block];
|
|
504
|
+
const rowY = block.bbox.y;
|
|
505
|
+
const rowH = block.bbox.h;
|
|
506
|
+
|
|
507
|
+
for (let j = i + 1; j < blocks.length; j++) {
|
|
508
|
+
if (processed.has(j)) continue;
|
|
509
|
+
const other = blocks[j];
|
|
510
|
+
// Check if in same row (vertical overlap)
|
|
511
|
+
const yOverlap = Math.max(0, Math.min(rowY + rowH, other.bbox.y + other.bbox.h) - Math.max(rowY, other.bbox.y));
|
|
512
|
+
const minH = Math.min(rowH, other.bbox.h);
|
|
513
|
+
if (yOverlap > minH * 0.5) {
|
|
514
|
+
sameRowBlocks.push(other);
|
|
515
|
+
}
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
// If we have multiple blocks in same row, this might be a multi-column layout
|
|
519
|
+
if (sameRowBlocks.length >= 2) {
|
|
520
|
+
// Sort by X position
|
|
521
|
+
sameRowBlocks.sort((a, b) => a.bbox.x - b.bbox.x);
|
|
522
|
+
// Check if they're roughly aligned (similar height, spaced evenly)
|
|
523
|
+
const avgH = sameRowBlocks.reduce((s, b) => s + b.bbox.h, 0) / sameRowBlocks.length;
|
|
524
|
+
const heightsOk = sameRowBlocks.every(b => Math.abs(b.bbox.h - avgH) < avgH * 0.5);
|
|
525
|
+
|
|
526
|
+
if (heightsOk) {
|
|
527
|
+
// Merge into a single composite block that preserves multi-column info
|
|
528
|
+
const allItems = [];
|
|
529
|
+
for (const b of sameRowBlocks) {
|
|
530
|
+
allItems.push(...b.items);
|
|
531
|
+
processed.add(blocks.indexOf(b));
|
|
532
|
+
}
|
|
533
|
+
// Sort items by Y then X to maintain reading order within the grid
|
|
534
|
+
allItems.sort((a, b) => {
|
|
535
|
+
const ay = pageHeight - a.transform[5];
|
|
536
|
+
const by = pageHeight - b.transform[5];
|
|
537
|
+
if (Math.abs(ay - by) > 2) return ay - by;
|
|
538
|
+
return a.transform[4] - b.transform[4];
|
|
539
|
+
});
|
|
540
|
+
|
|
541
|
+
const bbox = {
|
|
542
|
+
x: Math.min(...sameRowBlocks.map(b => b.bbox.x)),
|
|
543
|
+
y: Math.min(...sameRowBlocks.map(b => b.bbox.y)),
|
|
544
|
+
w: Math.max(...sameRowBlocks.map(b => b.bbox.x + b.bbox.w)) - Math.min(...sameRowBlocks.map(b => b.bbox.x)),
|
|
545
|
+
h: Math.max(...sameRowBlocks.map(b => b.bbox.y + b.bbox.h)) - Math.min(...sameRowBlocks.map(b => b.bbox.y))
|
|
546
|
+
};
|
|
547
|
+
|
|
548
|
+
multiColumnBlocks.push({
|
|
549
|
+
items: allItems,
|
|
550
|
+
bbox,
|
|
551
|
+
isMultiColumn: true,
|
|
552
|
+
columnCount: sameRowBlocks.length
|
|
553
|
+
});
|
|
554
|
+
continue;
|
|
555
|
+
}
|
|
556
|
+
}
|
|
557
|
+
|
|
558
|
+
if (!processed.has(i)) {
|
|
559
|
+
multiColumnBlocks.push(block);
|
|
560
|
+
processed.add(i);
|
|
561
|
+
}
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
// Replace blocks with multi-column merged version
|
|
565
|
+
blocks.length = 0;
|
|
566
|
+
blocks.push(...multiColumnBlocks);
|
|
567
|
+
|
|
482
568
|
// Compute font metadata per block using real font objects from commonObjs
|
|
483
569
|
for (const block of blocks) {
|
|
484
570
|
const sizes = [];
|
|
@@ -551,8 +637,7 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, textRuns) {
|
|
|
551
637
|
|
|
552
638
|
/**
|
|
553
639
|
* Extract graphic regions from the page operator list.
|
|
554
|
-
*
|
|
555
|
-
* Skips path/fill/stroke to avoid false positives from text decorations.
|
|
640
|
+
* Captures images and horizontal divider lines (thin rectangles).
|
|
556
641
|
*/
|
|
557
642
|
function extractGraphicRegions(opList, OPS) {
|
|
558
643
|
const regions = [];
|
|
@@ -609,6 +694,23 @@ function extractGraphicRegions(opList, OPS) {
|
|
|
609
694
|
bbox: { x: minX, y: minY, w: maxX - minX, h: maxY - minY },
|
|
610
695
|
});
|
|
611
696
|
}
|
|
697
|
+
} else if (fn === OPS.rectangle) {
|
|
698
|
+
// Check for thin horizontal lines (dividers)
|
|
699
|
+
const [x, y, w, h] = args;
|
|
700
|
+
if (w > 100 && h > 0.5 && h < 5) {
|
|
701
|
+
const corners = [
|
|
702
|
+
transformPoint(x, y),
|
|
703
|
+
transformPoint(x + w, y),
|
|
704
|
+
transformPoint(x, y + h),
|
|
705
|
+
transformPoint(x + w, y + h),
|
|
706
|
+
];
|
|
707
|
+
const xs = corners.map(c => c[0]);
|
|
708
|
+
const ys = corners.map(c => c[1]);
|
|
709
|
+
regions.push({
|
|
710
|
+
type: "divider",
|
|
711
|
+
bbox: { x: Math.min(...xs), y: Math.min(...ys), w: Math.max(...xs) - Math.min(...xs), h: Math.max(...ys) - Math.min(...ys) },
|
|
712
|
+
});
|
|
713
|
+
}
|
|
612
714
|
}
|
|
613
715
|
}
|
|
614
716
|
|
|
@@ -744,7 +846,110 @@ function findParagraphThreshold(gaps, fontSize) {
|
|
|
744
846
|
/**
|
|
745
847
|
* Build text content for a block, preserving paragraph breaks.
|
|
746
848
|
*/
|
|
849
|
+
function blockToTextMultiColumn(block, pageHeight) {
|
|
850
|
+
const rows = new Map();
|
|
851
|
+
const fontHeight = block.avgFontSize || 12;
|
|
852
|
+
|
|
853
|
+
// Group items by row (finer granularity)
|
|
854
|
+
for (const item of block.items) {
|
|
855
|
+
if (!item.str) continue;
|
|
856
|
+
const y = pageHeight - item.transform[5];
|
|
857
|
+
const rowKey = Math.round(y / 2) * 2; // 2px granularity
|
|
858
|
+
if (!rows.has(rowKey)) rows.set(rowKey, []);
|
|
859
|
+
rows.get(rowKey).push(item);
|
|
860
|
+
}
|
|
861
|
+
|
|
862
|
+
const sortedRows = Array.from(rows.keys()).sort((a, b) => a - b);
|
|
863
|
+
|
|
864
|
+
// Merge rows: if a row has only short items (markers), merge with next row
|
|
865
|
+
const mergedRows = [];
|
|
866
|
+
let pendingRow = null;
|
|
867
|
+
|
|
868
|
+
for (const rowKey of sortedRows) {
|
|
869
|
+
const rowItems = rows.get(rowKey).sort((a, b) => a.transform[4] - b.transform[4]);
|
|
870
|
+
const allShort = rowItems.every(it => (it.str || "").trim().length <= 3);
|
|
871
|
+
|
|
872
|
+
if (allShort && rowItems.length >= 2) {
|
|
873
|
+
// This is a marker row - merge with next row
|
|
874
|
+
pendingRow = { key: rowKey, items: rowItems };
|
|
875
|
+
} else {
|
|
876
|
+
if (pendingRow) {
|
|
877
|
+
// Merge pending marker row with this row
|
|
878
|
+
// For each item in this row, find and attach the closest marker
|
|
879
|
+
const mergedItems = [];
|
|
880
|
+
for (const item of rowItems) {
|
|
881
|
+
const itemCenterX = item.transform[4] + (item.width || 0) / 2;
|
|
882
|
+
// Find closest marker
|
|
883
|
+
let closestMarker = null;
|
|
884
|
+
let minDist = Infinity;
|
|
885
|
+
for (const marker of pendingRow.items) {
|
|
886
|
+
const markerCenterX = marker.transform[4] + (marker.width || 0) / 2;
|
|
887
|
+
const dist = Math.abs(markerCenterX - itemCenterX);
|
|
888
|
+
if (dist < minDist) {
|
|
889
|
+
minDist = dist;
|
|
890
|
+
closestMarker = marker;
|
|
891
|
+
}
|
|
892
|
+
}
|
|
893
|
+
// Attach marker to item
|
|
894
|
+
if (closestMarker && minDist < 50) { // Within 50px
|
|
895
|
+
mergedItems.push({...item, str: item.str + closestMarker.str});
|
|
896
|
+
} else {
|
|
897
|
+
mergedItems.push(item);
|
|
898
|
+
}
|
|
899
|
+
}
|
|
900
|
+
mergedItems.sort((a, b) => a.transform[4] - b.transform[4]);
|
|
901
|
+
mergedRows.push({ items: mergedItems, hasMarkers: true });
|
|
902
|
+
pendingRow = null;
|
|
903
|
+
} else {
|
|
904
|
+
mergedRows.push({ items: rowItems, hasMarkers: false });
|
|
905
|
+
}
|
|
906
|
+
}
|
|
907
|
+
}
|
|
908
|
+
|
|
909
|
+
// Don't forget last pending row
|
|
910
|
+
if (pendingRow) {
|
|
911
|
+
mergedRows.push({ items: pendingRow.items, hasMarkers: true });
|
|
912
|
+
}
|
|
913
|
+
|
|
914
|
+
// Build output lines
|
|
915
|
+
const lines = [];
|
|
916
|
+
for (const row of mergedRows) {
|
|
917
|
+
let lineText = "";
|
|
918
|
+
let lastX = null;
|
|
919
|
+
let lastW = 0;
|
|
920
|
+
let lastItemLen = 0;
|
|
921
|
+
|
|
922
|
+
for (const item of row.items) {
|
|
923
|
+
const currentX = item.transform[4];
|
|
924
|
+
const currentItemLen = (item.str || "").trim().length;
|
|
925
|
+
const isShortItem = currentItemLen <= 3;
|
|
926
|
+
|
|
927
|
+
if (lastX !== null) {
|
|
928
|
+
const hGap = currentX - (lastX + lastW);
|
|
929
|
+
const prevWasLong = lastItemLen > 2;
|
|
930
|
+
// Add column separator, but not before footnote markers
|
|
931
|
+
if (hGap > fontHeight * 0.3 && (!prevWasLong || !isShortItem)) {
|
|
932
|
+
lineText += " ";
|
|
933
|
+
}
|
|
934
|
+
}
|
|
935
|
+
|
|
936
|
+
lineText += item.str;
|
|
937
|
+
lastX = currentX;
|
|
938
|
+
lastW = item.width || fontHeight * 0.5;
|
|
939
|
+
lastItemLen = currentItemLen;
|
|
940
|
+
}
|
|
941
|
+
lines.push(lineText.trim());
|
|
942
|
+
}
|
|
943
|
+
|
|
944
|
+
return lines.join("\n");
|
|
945
|
+
}
|
|
946
|
+
|
|
747
947
|
function blockToText(block, pageHeight) {
|
|
948
|
+
// Special handling for multi-column blocks (like author grids)
|
|
949
|
+
if (block.isMultiColumn && block.columnCount >= 2) {
|
|
950
|
+
return blockToTextMultiColumn(block, pageHeight);
|
|
951
|
+
}
|
|
952
|
+
|
|
748
953
|
// First pass: collect all gaps and font sizes to compute adaptive threshold
|
|
749
954
|
const gaps = [];
|
|
750
955
|
let lastY = null;
|
|
@@ -772,28 +977,39 @@ function blockToText(block, pageHeight) {
|
|
|
772
977
|
lastY = null;
|
|
773
978
|
let lastX = null;
|
|
774
979
|
let lastW = 0;
|
|
980
|
+
let lastItemLen = 0; // Track length of previous item for marker detection
|
|
775
981
|
|
|
776
982
|
for (const item of block.items) {
|
|
777
983
|
if (!item.str) continue;
|
|
778
984
|
const currentX = item.transform[4];
|
|
779
985
|
const currentY = pageHeight - item.transform[5];
|
|
986
|
+
const currentItemLen = (item.str || "").trim().length;
|
|
987
|
+
// Short items are typically footnote markers (*, †, ‡, #, etc.)
|
|
988
|
+
// Allow up to 3 chars to handle combined markers like "* †"
|
|
989
|
+
const isShortItem = currentItemLen <= 3;
|
|
780
990
|
|
|
781
991
|
if (lastY !== null) {
|
|
782
992
|
const vGap = Math.abs(currentY - lastY);
|
|
783
|
-
const isShortItem = (item.str || "").trim().length <= 2;
|
|
784
993
|
|
|
785
994
|
// Use adaptive threshold for paragraph detection
|
|
786
995
|
if (vGap > lastFontSize * paraThreshold && !isShortItem) {
|
|
787
996
|
result += "\n\n";
|
|
788
997
|
} else if (vGap > lineThreshold) {
|
|
789
998
|
// Different line — insert space
|
|
790
|
-
if (!result.endsWith(" ") && !result.endsWith("\n")) {
|
|
791
|
-
result += " ";
|
999
|
+
// But skip space if previous item was long and current is short (footnote marker)
|
|
1000
|
+
// This handles superscript markers like *, +, #, †, ‡
|
|
1001
|
+
const prevWasLong = lastItemLen > 2;
|
|
1002
|
+
if (!prevWasLong || !isShortItem) {
|
|
1003
|
+
if (!result.endsWith(" ") && !result.endsWith("\n")) {
|
|
1004
|
+
result += " ";
|
|
1005
|
+
}
|
|
792
1006
|
}
|
|
793
1007
|
} else if (lastX !== null) {
|
|
794
1008
|
// Same line — check horizontal gap between items
|
|
795
1009
|
const hGap = currentX - (lastX + lastW);
|
|
796
|
-
if (hGap > lastFontSize * 0.15) {
|
1010
|
+
// Skip adding space before short items (superscript markers like *, +, #, $)
|
|
1011
|
+
// These are usually footnote markers that should attach directly to preceding text
|
|
1012
|
+
if (hGap > lastFontSize * 0.15 && !isShortItem) {
|
|
797
1013
|
if (!result.endsWith(" ") && !result.endsWith("\n")) {
|
|
798
1014
|
result += " ";
|
|
799
1015
|
}
|
|
@@ -803,6 +1019,7 @@ function blockToText(block, pageHeight) {
|
|
|
803
1019
|
lastY = currentY;
|
|
804
1020
|
lastX = currentX;
|
|
805
1021
|
lastW = item.width || 0;
|
|
1022
|
+
lastItemLen = currentItemLen;
|
|
806
1023
|
result += item.str;
|
|
807
1024
|
}
|
|
808
1025
|
return result.trim();
|
|
@@ -1212,6 +1429,13 @@ function reflowAndComposite(analysis, opts) {
|
|
|
1212
1429
|
colorSpans: block.colorSpans || [],
|
|
1213
1430
|
region,
|
|
1214
1431
|
});
|
|
1432
|
+
} else if (region.type === "divider") {
|
|
1433
|
+
// Horizontal divider line
|
|
1434
|
+
reflowedRegions.push({
|
|
1435
|
+
type: "divider",
|
|
1436
|
+
height: 4, // Small height for the divider line area
|
|
1437
|
+
region,
|
|
1438
|
+
});
|
|
1215
1439
|
} else {
|
|
1216
1440
|
// Graphic
|
|
1217
1441
|
const bitmap = bitmaps.get(region);
|
|
@@ -1469,6 +1693,23 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
1469
1693
|
lineCharOffset += line.text.length;
|
|
1470
1694
|
cursorY += lh;
|
|
1471
1695
|
}
|
|
1696
|
+
} else if (r.type === "divider") {
|
|
1697
|
+
// Draw horizontal divider line
|
|
1698
|
+
const screenY = cursorY - scrollY + 1; // Slight offset to center in area
|
|
1699
|
+
if (screenY > -10 && screenY < H + 10) {
|
|
1700
|
+
const lineWidth = Math.min(400, W - padding * 2); // Max 400px or fit with padding
|
|
1701
|
+
const startX = (W - lineWidth) / 2; // Center the line
|
|
1702
|
+
ctx.save();
|
|
1703
|
+
ctx.strokeStyle = textColor;
|
|
1704
|
+
ctx.globalAlpha = 0.3;
|
|
1705
|
+
ctx.lineWidth = 1 * d;
|
|
1706
|
+
ctx.beginPath();
|
|
1707
|
+
ctx.moveTo(startX * d, screenY * d);
|
|
1708
|
+
ctx.lineTo((startX + lineWidth) * d, screenY * d);
|
|
1709
|
+
ctx.stroke();
|
|
1710
|
+
ctx.restore();
|
|
1711
|
+
}
|
|
1712
|
+
cursorY += r.height;
|
|
1472
1713
|
} else if (r.type === "graphic" && r.bitmap) {
|
|
1473
1714
|
const screenY = cursorY - scrollY;
|
|
1474
1715
|
if (screenY > -r.drawH && screenY < H + r.drawH) {
|