pretext-pdfjs 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/reflow.js +235 -30
package/package.json
CHANGED
package/src/reflow.js
CHANGED
|
@@ -25,7 +25,16 @@ function drawJustifiedLine(ctx, text, x, y, availWidth) {
|
|
|
25
25
|
}
|
|
26
26
|
let totalWordWidth = 0;
|
|
27
27
|
for (const w of words) totalWordWidth += ctx.measureText(w).width;
|
|
28
|
+
|
|
29
|
+
const normalSpaceWidth = ctx.measureText(" ").width;
|
|
28
30
|
const extraSpace = (availWidth - totalWordWidth) / (words.length - 1);
|
|
31
|
+
|
|
32
|
+
// Fall back to left-aligned if gaps would be too large
|
|
33
|
+
if (extraSpace > normalSpaceWidth * 3 || totalWordWidth < availWidth * 0.7) {
|
|
34
|
+
ctx.fillText(text, x, y);
|
|
35
|
+
return;
|
|
36
|
+
}
|
|
37
|
+
|
|
29
38
|
let xPos = x;
|
|
30
39
|
for (const w of words) {
|
|
31
40
|
ctx.fillText(w, xPos, y);
|
|
@@ -33,6 +42,70 @@ function drawJustifiedLine(ctx, text, x, y, availWidth) {
|
|
|
33
42
|
}
|
|
34
43
|
}
|
|
35
44
|
|
|
45
|
+
/**
 * Draw one line of text with per-span coloring (for inline colored text such
 * as links).
 *
 * Span offsets are expressed relative to the same text that `charOffset`
 * indexes into; only the part of each span that overlaps
 * [charOffset, charOffset + text.length) is drawn in that span's color. Any
 * characters not covered by a span are drawn in `defaultColor`.
 *
 * Segments are drawn left to right, each one advanced by the measured width
 * of the previous segment. `ctx.fillStyle` is left set to the color of the
 * last segment drawn.
 *
 * @param {object} ctx - 2D canvas-like context; uses `fillStyle`, `fillText`
 *   and `measureText`.
 * @param {string} text - The line to draw.
 * @param {number} charOffset - Index of the line's first character in span
 *   coordinates.
 * @param {Array<{charStart: number, charEnd: number, color: string}>} spans -
 *   Color runs (`charEnd` exclusive). A nullish value is treated as empty.
 * @param {string} defaultColor - Fill style for uncovered text and for spans
 *   without a color.
 * @param {number} x - X position of the start of the line.
 * @param {number} y - Y position passed to `fillText`.
 */
function drawColoredLine(ctx, text, charOffset, spans, defaultColor, x, y) {
  const lineStart = charOffset;
  const lineEnd = charOffset + text.length;
  let xPos = x;
  let pos = 0; // index in `text` of the first character not yet drawn

  const drawSegment = (segment, color) => {
    ctx.fillStyle = color;
    ctx.fillText(segment, xPos, y);
    xPos += ctx.measureText(segment).width;
  };

  for (const span of spans ?? []) {
    if (span.charEnd <= lineStart || span.charStart >= lineEnd) continue;

    // Clamp to what is still undrawn so overlapping or out-of-order spans
    // never paint the same characters twice (which would also shift xPos).
    const start = Math.max(span.charStart - lineStart, pos);
    const end = Math.min(span.charEnd - lineStart, text.length);
    if (start >= end) continue;

    if (start > pos) {
      drawSegment(text.slice(pos, start), defaultColor);
    }

    // A span without a color must not inherit the previous span's fillStyle.
    drawSegment(text.slice(start, end), span.color || defaultColor);
    pos = end;
  }

  if (pos < text.length) {
    drawSegment(text.slice(pos), defaultColor);
  }
}
|
|
78
|
+
|
|
79
|
+
/**
 * Draw a line of justified text with per-span coloring.
 *
 * The line is split on single spaces and each word is drawn through
 * `drawColoredLine`, with the free horizontal space distributed evenly
 * between words. The line is drawn left-aligned instead when:
 *   - it has at most one word,
 *   - the gap between words would exceed three normal space widths, or the
 *     words fill less than 70% of `availWidth` (gaps would look too large),
 *   - the words alone are at least as wide as `availWidth`, so the computed
 *     gap would be zero, negative or NaN and words would touch or overlap.
 *
 * @param {object} ctx - 2D canvas-like context; `measureText` is used here,
 *   drawing is delegated to `drawColoredLine`.
 * @param {string} text - The line to draw.
 * @param {number} charOffset - Index of the line's first character in span
 *   coordinates.
 * @param {Array<{charStart: number, charEnd: number, color: string}>} spans -
 *   Color runs, forwarded to `drawColoredLine`.
 * @param {string} defaultColor - Fill style for text not covered by a span.
 * @param {number} x - X position of the start of the line.
 * @param {number} y - Y position of the line.
 * @param {number} availWidth - Width the justified line should span.
 */
function drawColoredJustifiedLine(ctx, text, charOffset, spans, defaultColor, x, y, availWidth) {
  const words = text.split(" ");
  if (words.length <= 1) {
    drawColoredLine(ctx, text, charOffset, spans, defaultColor, x, y);
    return;
  }

  // Measure every word once; the widths are reused when positioning below.
  const wordWidths = words.map((w) => ctx.measureText(w).width);
  const totalWordWidth = wordWidths.reduce((sum, w) => sum + w, 0);
  const normalSpaceWidth = ctx.measureText(" ").width;
  const extraSpace = (availWidth - totalWordWidth) / (words.length - 1);

  const gapsTooLarge =
    extraSpace > normalSpaceWidth * 3 || totalWordWidth < availWidth * 0.7;
  // `!(extraSpace > 0)` also catches NaN from a non-numeric availWidth.
  const wordsWouldCollide = !(extraSpace > 0);

  if (gapsTooLarge || wordsWouldCollide) {
    drawColoredLine(ctx, text, charOffset, spans, defaultColor, x, y);
    return;
  }

  // Draw word by word with per-span coloring and justified spacing
  let xPos = x;
  let charPos = 0;
  for (let wi = 0; wi < words.length; wi++) {
    const word = words[wi];
    drawColoredLine(ctx, word, charOffset + charPos, spans, defaultColor, xPos, y);
    xPos += wordWidths[wi] + extraSpace;
    charPos += word.length + 1; // +1 for the space removed by split(" ")
  }
}
|
|
108
|
+
|
|
36
109
|
function bboxOverlap(a, b) {
|
|
37
110
|
const x1 = Math.max(a.x, b.x);
|
|
38
111
|
const y1 = Math.max(a.y, b.y);
|
|
@@ -83,25 +156,26 @@ async function extractFontMetadata(page, opList, OPS) {
|
|
|
83
156
|
// ─── Text color extraction ───────────────────────────────────────────────
|
|
84
157
|
|
|
85
158
|
/**
|
|
86
|
-
* Extract fill colors
|
|
87
|
-
*
|
|
88
|
-
*
|
|
89
|
-
*
|
|
159
|
+
* Extract fill colors per beginText/endText block pair.
|
|
160
|
+
* Returns an array of color strings, one per text block.
|
|
161
|
+
*
|
|
162
|
+
* The previous approach pushed one color per text-drawing operator (showText,
|
|
163
|
+
* showSpacedText, etc.) and tried to index into text items 1:1. That mapping
|
|
164
|
+
* is broken because a single showSpacedText operator can produce multiple text
|
|
165
|
+
* items via buildTextContentItem(). Instead, we track color at text-block
|
|
166
|
+
* boundaries — all text items within the same beginText/endText pair share
|
|
167
|
+
* the same color context.
|
|
90
168
|
*/
|
|
91
|
-
function
|
|
92
|
-
const
|
|
169
|
+
function extractTextBlockColors(opList, OPS) {
|
|
170
|
+
const blockColors = []; // one entry per beginText/endText pair
|
|
93
171
|
let currentColor = "#000000";
|
|
94
|
-
|
|
95
|
-
|
|
96
|
-
OPS.showText,
|
|
97
|
-
OPS.showSpacedText,
|
|
98
|
-
OPS.nextLineShowText,
|
|
99
|
-
OPS.nextLineSetSpacingShowText,
|
|
100
|
-
]);
|
|
172
|
+
let blockColor = "#000000";
|
|
173
|
+
let inTextBlock = false;
|
|
101
174
|
|
|
102
175
|
for (let i = 0; i < opList.fnArray.length; i++) {
|
|
103
176
|
const fn = opList.fnArray[i];
|
|
104
177
|
|
|
178
|
+
// Track color changes
|
|
105
179
|
if (fn === OPS.setFillRGBColor) {
|
|
106
180
|
currentColor = opList.argsArray[i][0];
|
|
107
181
|
} else if (fn === OPS.setFillTransparent) {
|
|
@@ -118,12 +192,30 @@ function extractTextColors(opList, OPS) {
|
|
|
118
192
|
}
|
|
119
193
|
}
|
|
120
194
|
|
|
121
|
-
if (
|
|
122
|
-
|
|
195
|
+
if (fn === OPS.beginText) {
|
|
196
|
+
inTextBlock = true;
|
|
197
|
+
blockColor = currentColor; // color at start of block
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
// If color changes within a text block, update (last color wins)
|
|
201
|
+
if (inTextBlock && (
|
|
202
|
+
fn === OPS.setFillRGBColor ||
|
|
203
|
+
fn === OPS.setFillTransparent ||
|
|
204
|
+
fn === OPS.setFillGray ||
|
|
205
|
+
fn === OPS.setFillColor ||
|
|
206
|
+
fn === OPS.setFillCMYKColor ||
|
|
207
|
+
fn === OPS.setFillColorN
|
|
208
|
+
)) {
|
|
209
|
+
blockColor = currentColor;
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
if (fn === OPS.endText) {
|
|
213
|
+
blockColors.push(blockColor);
|
|
214
|
+
inTextBlock = false;
|
|
123
215
|
}
|
|
124
216
|
}
|
|
125
217
|
|
|
126
|
-
return
|
|
218
|
+
return blockColors;
|
|
127
219
|
}
|
|
128
220
|
|
|
129
221
|
// ─── Page analysis ────────────────────────────────────────────────────────
|
|
@@ -132,15 +224,43 @@ function extractTextColors(opList, OPS) {
|
|
|
132
224
|
* Group adjacent text items into text blocks by proximity.
|
|
133
225
|
* Also extracts font metadata: average size, italic, bold.
|
|
134
226
|
*/
|
|
135
|
-
function groupTextBlocks(textItems, pageHeight, styles, fontMap,
|
|
136
|
-
//
|
|
137
|
-
//
|
|
138
|
-
|
|
139
|
-
|
|
227
|
+
function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
|
|
228
|
+
// Map text items to beginText/endText blocks by detecting position
|
|
229
|
+
// discontinuities. Items within the same text block are contiguous and
|
|
230
|
+
// share the same color. When there's a large Y-position jump or font
|
|
231
|
+
// change, we advance to the next block's color.
|
|
232
|
+
if (blockColors && blockColors.length > 0) {
|
|
233
|
+
let blockIdx = 0;
|
|
234
|
+
let prevY = null;
|
|
235
|
+
let prevFontName = null;
|
|
236
|
+
let itemsInCurrentBlock = 0;
|
|
237
|
+
|
|
140
238
|
for (const item of textItems) {
|
|
141
|
-
if (item.str
|
|
142
|
-
|
|
239
|
+
if (item.str === undefined) continue; // skip marked content
|
|
240
|
+
|
|
241
|
+
const y = item.transform ? item.transform[5] : null;
|
|
242
|
+
const fontHeight = item.transform
|
|
243
|
+
? Math.hypot(item.transform[2], item.transform[3])
|
|
244
|
+
: 12;
|
|
245
|
+
|
|
246
|
+
// Detect text block boundary by position discontinuity
|
|
247
|
+
if (prevY !== null && y !== null && itemsInCurrentBlock > 0) {
|
|
248
|
+
const yDiff = Math.abs(y - prevY);
|
|
249
|
+
const fontChanged = item.fontName !== prevFontName;
|
|
250
|
+
|
|
251
|
+
if (
|
|
252
|
+
(yDiff > fontHeight * 3) ||
|
|
253
|
+
(fontChanged && yDiff > fontHeight * 0.5)
|
|
254
|
+
) {
|
|
255
|
+
blockIdx = Math.min(blockIdx + 1, blockColors.length - 1);
|
|
256
|
+
itemsInCurrentBlock = 0;
|
|
257
|
+
}
|
|
143
258
|
}
|
|
259
|
+
|
|
260
|
+
item._color = blockColors[blockIdx] || "#000000";
|
|
261
|
+
itemsInCurrentBlock++;
|
|
262
|
+
prevY = y;
|
|
263
|
+
prevFontName = item.fontName;
|
|
144
264
|
}
|
|
145
265
|
}
|
|
146
266
|
|
|
@@ -301,6 +421,29 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, textColors) {
|
|
|
301
421
|
}
|
|
302
422
|
}
|
|
303
423
|
block.color = dominantColor;
|
|
424
|
+
|
|
425
|
+
// Build color spans — contiguous runs of items sharing the same color
|
|
426
|
+
// Character indices map to the concatenated text produced by blockToText
|
|
427
|
+
block.colorSpans = [];
|
|
428
|
+
if (block.items.length > 0) {
|
|
429
|
+
let spanColor = block.items[0]._color || "#000000";
|
|
430
|
+
let spanCharStart = 0;
|
|
431
|
+
let charCount = 0;
|
|
432
|
+
|
|
433
|
+
for (let i = 0; i < block.items.length; i++) {
|
|
434
|
+
const c = block.items[i]._color || "#000000";
|
|
435
|
+
const itemLen = (block.items[i].str || "").length;
|
|
436
|
+
if (c !== spanColor) {
|
|
437
|
+
block.colorSpans.push({ charStart: spanCharStart, charEnd: charCount, color: spanColor });
|
|
438
|
+
spanCharStart = charCount;
|
|
439
|
+
spanColor = c;
|
|
440
|
+
}
|
|
441
|
+
charCount += itemLen;
|
|
442
|
+
// Account for spaces inserted between items by blockToText
|
|
443
|
+
if (i < block.items.length - 1) charCount++;
|
|
444
|
+
}
|
|
445
|
+
block.colorSpans.push({ charStart: spanCharStart, charEnd: charCount, color: spanColor });
|
|
446
|
+
}
|
|
304
447
|
}
|
|
305
448
|
|
|
306
449
|
return blocks;
|
|
@@ -649,6 +792,32 @@ function buildRegionMap(textBlocks, graphicRegions, pageHeight) {
|
|
|
649
792
|
});
|
|
650
793
|
}
|
|
651
794
|
|
|
795
|
+
// ── Compute inter-block vertical gaps from original PDF layout ──
|
|
796
|
+
for (let i = 1; i < regions.length; i++) {
|
|
797
|
+
const prev = regions[i - 1];
|
|
798
|
+
const curr = regions[i];
|
|
799
|
+
const prevBottom = prev.bbox.y + prev.bbox.h;
|
|
800
|
+
const currTop = curr.bbox.y;
|
|
801
|
+
curr.gapBefore = Math.max(0, currTop - prevBottom);
|
|
802
|
+
}
|
|
803
|
+
if (regions.length > 0) {
|
|
804
|
+
regions[0].gapBefore = 0; // padding handles top margin
|
|
805
|
+
}
|
|
806
|
+
|
|
807
|
+
// Store absolute pixel gaps and compute body font size for scaling
|
|
808
|
+
const bodyBlocks = regions.filter(r =>
|
|
809
|
+
r.type === "text" && r.block?.fontScale && Math.abs(r.block.fontScale - 1) < 0.15);
|
|
810
|
+
const avgBodyFontSize = bodyBlocks.length > 0
|
|
811
|
+
? bodyBlocks.reduce((s, r) => s + r.block.avgFontSize, 0) / bodyBlocks.length
|
|
812
|
+
: 12;
|
|
813
|
+
for (const region of regions) {
|
|
814
|
+
region.gapAbsolute = region.gapBefore || 0;
|
|
815
|
+
region._avgBodyFontSize = avgBodyFontSize;
|
|
816
|
+
// Keep gapRatio as fallback for any code that reads it
|
|
817
|
+
const avgBodyLH = avgBodyFontSize * 1.6;
|
|
818
|
+
region.gapRatio = (region.gapBefore || 0) / avgBodyLH;
|
|
819
|
+
}
|
|
820
|
+
|
|
652
821
|
return regions;
|
|
653
822
|
}
|
|
654
823
|
|
|
@@ -747,11 +916,11 @@ async function analyzePage(page, OPS) {
|
|
|
747
916
|
// Extract real font metadata from commonObjs (bold, italic, weight, loadedName)
|
|
748
917
|
const fontMap = await extractFontMetadata(page, opList, OPS);
|
|
749
918
|
|
|
750
|
-
// Extract text colors
|
|
751
|
-
const
|
|
919
|
+
// Extract text colors per beginText/endText block (not per operator)
|
|
920
|
+
const blockColors = extractTextBlockColors(opList, OPS);
|
|
752
921
|
|
|
753
|
-
// Now group text blocks with real font data and colors
|
|
754
|
-
const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap,
|
|
922
|
+
// Now group text blocks with real font data and block colors
|
|
923
|
+
const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap, blockColors);
|
|
755
924
|
|
|
756
925
|
// Compute body font size (most common size = body text)
|
|
757
926
|
const allSizes = textBlocks.map(b => Math.round(b.avgFontSize * 10) / 10);
|
|
@@ -809,6 +978,7 @@ async function analyzePage(page, OPS) {
|
|
|
809
978
|
graphicRegions,
|
|
810
979
|
offCanvas,
|
|
811
980
|
fontMap,
|
|
981
|
+
bodyFontSize,
|
|
812
982
|
};
|
|
813
983
|
}
|
|
814
984
|
|
|
@@ -877,6 +1047,7 @@ function reflowAndComposite(analysis, opts) {
|
|
|
877
1047
|
fontFamily: blockFamily,
|
|
878
1048
|
align: block.align || "left",
|
|
879
1049
|
color: block.color,
|
|
1050
|
+
colorSpans: block.colorSpans || [],
|
|
880
1051
|
region,
|
|
881
1052
|
});
|
|
882
1053
|
} else {
|
|
@@ -908,12 +1079,15 @@ function reflowAndComposite(analysis, opts) {
|
|
|
908
1079
|
}
|
|
909
1080
|
}
|
|
910
1081
|
|
|
911
|
-
// Total height
|
|
1082
|
+
// Total height — use absolute pixel gaps scaled by font ratio
|
|
912
1083
|
const baseLH = fontSize * lineHeight;
|
|
913
1084
|
let totalHeight = padding;
|
|
914
1085
|
for (const r of reflowedRegions) {
|
|
915
1086
|
totalHeight += r.height;
|
|
916
|
-
|
|
1087
|
+
const gapAbs = r.region?.gapAbsolute ?? 0;
|
|
1088
|
+
const bodyFS = r.region?._avgBodyFontSize || 12;
|
|
1089
|
+
const scaledGap = gapAbs * (fontSize / bodyFS);
|
|
1090
|
+
totalHeight += Math.max(4, Math.min(scaledGap, baseLH * 2.0));
|
|
917
1091
|
}
|
|
918
1092
|
totalHeight += padding;
|
|
919
1093
|
|
|
@@ -946,6 +1120,7 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
946
1120
|
let pdfjs = null;
|
|
947
1121
|
let pdfDoc = null;
|
|
948
1122
|
let currentPage = 0;
|
|
1123
|
+
const userSetFontSize = options.fontSize != null;
|
|
949
1124
|
let fontSize = options.fontSize ?? 16;
|
|
950
1125
|
let destroyed = false;
|
|
951
1126
|
|
|
@@ -1050,11 +1225,15 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
1050
1225
|
const justified = r.align === "justify";
|
|
1051
1226
|
const availW = W - padding * 2;
|
|
1052
1227
|
|
|
1228
|
+
const hasMultipleColors = r.colorSpans && r.colorSpans.length > 1 &&
|
|
1229
|
+
!r.colorSpans.every(s => s.color === r.colorSpans[0].color);
|
|
1230
|
+
|
|
1053
1231
|
if (!enableMorph) {
|
|
1054
1232
|
ctx.fillStyle = r.color || textColor;
|
|
1055
1233
|
ctx.font = `${style} ${weight} ${fs * d}px ${rFamily}`;
|
|
1056
1234
|
}
|
|
1057
1235
|
|
|
1236
|
+
let lineCharOffset = 0;
|
|
1058
1237
|
for (let lineIdx = 0; lineIdx < r.lines.length; lineIdx++) {
|
|
1059
1238
|
const line = r.lines[lineIdx];
|
|
1060
1239
|
const screenY = cursorY - scrollY;
|
|
@@ -1098,6 +1277,21 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
1098
1277
|
ctx.fillText(line.text, padding * d, screenY * d);
|
|
1099
1278
|
}
|
|
1100
1279
|
ctx.restore();
|
|
1280
|
+
} else if (hasMultipleColors) {
|
|
1281
|
+
// Per-span coloring for inline colored text (links, emphasis)
|
|
1282
|
+
if (shouldJustify) {
|
|
1283
|
+
drawColoredJustifiedLine(ctx, line.text, lineCharOffset, r.colorSpans,
|
|
1284
|
+
r.color || textColor, padding * d, screenY * d, availW * d);
|
|
1285
|
+
} else if (centered) {
|
|
1286
|
+
// Measure full line to center it, then draw colored from offset
|
|
1287
|
+
const lineW = ctx.measureText(line.text).width;
|
|
1288
|
+
const startX = (W * d - lineW) / 2;
|
|
1289
|
+
drawColoredLine(ctx, line.text, lineCharOffset, r.colorSpans,
|
|
1290
|
+
r.color || textColor, startX, screenY * d);
|
|
1291
|
+
} else {
|
|
1292
|
+
drawColoredLine(ctx, line.text, lineCharOffset, r.colorSpans,
|
|
1293
|
+
r.color || textColor, padding * d, screenY * d);
|
|
1294
|
+
}
|
|
1101
1295
|
} else {
|
|
1102
1296
|
if (centered) {
|
|
1103
1297
|
ctx.textAlign = "center";
|
|
@@ -1110,6 +1304,7 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
1110
1304
|
}
|
|
1111
1305
|
}
|
|
1112
1306
|
}
|
|
1307
|
+
lineCharOffset += line.text.length;
|
|
1113
1308
|
cursorY += lh;
|
|
1114
1309
|
}
|
|
1115
1310
|
} else if (r.type === "graphic" && r.bitmap) {
|
|
@@ -1132,7 +1327,10 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
1132
1327
|
}
|
|
1133
1328
|
cursorY += r.drawH;
|
|
1134
1329
|
}
|
|
1135
|
-
|
|
1330
|
+
const gapAbs = r.region?.gapAbsolute ?? 0;
|
|
1331
|
+
const bodyFS = r.region?._avgBodyFontSize || 12;
|
|
1332
|
+
const scaledGap = gapAbs * (fontSize / bodyFS);
|
|
1333
|
+
cursorY += Math.max(4, Math.min(scaledGap, baseLH * 2.0));
|
|
1136
1334
|
}
|
|
1137
1335
|
}
|
|
1138
1336
|
|
|
@@ -1268,6 +1466,12 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
1268
1466
|
|
|
1269
1467
|
currentAnalysis = analysisCache.get(pageNum);
|
|
1270
1468
|
currentPage = pageNum;
|
|
1469
|
+
|
|
1470
|
+
// Auto-match PDF body font size when user hasn't set an explicit fontSize
|
|
1471
|
+
if (!userSetFontSize && currentAnalysis.bodyFontSize) {
|
|
1472
|
+
fontSize = clamp(Math.round(currentAnalysis.bodyFontSize), minFont, maxFont);
|
|
1473
|
+
}
|
|
1474
|
+
|
|
1271
1475
|
scrollY = 0;
|
|
1272
1476
|
scrollVelocity = 0;
|
|
1273
1477
|
reflow();
|
|
@@ -1279,6 +1483,7 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
1279
1483
|
graphicRegions: currentAnalysis.graphicRegions,
|
|
1280
1484
|
pageWidth: currentAnalysis.pageWidth,
|
|
1281
1485
|
pageHeight: currentAnalysis.pageHeight,
|
|
1486
|
+
bodyFontSize: currentAnalysis.bodyFontSize,
|
|
1282
1487
|
});
|
|
1283
1488
|
},
|
|
1284
1489
|
|