pretext-pdfjs 0.3.1 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2):
  1. package/package.json +1 -1
  2. package/src/reflow.js +235 -30
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pretext-pdfjs",
3
- "version": "0.3.1",
3
+ "version": "0.3.3",
4
4
  "description": "Pretext-native text layer for PDF.js — zero DOM reflows, per-block reflow with image preservation, pinch-to-zoom text",
5
5
  "type": "module",
6
6
  "main": "./src/index.js",
package/src/reflow.js CHANGED
@@ -25,7 +25,16 @@ function drawJustifiedLine(ctx, text, x, y, availWidth) {
25
25
  }
26
26
  let totalWordWidth = 0;
27
27
  for (const w of words) totalWordWidth += ctx.measureText(w).width;
28
+
29
+ const normalSpaceWidth = ctx.measureText(" ").width;
28
30
  const extraSpace = (availWidth - totalWordWidth) / (words.length - 1);
31
+
32
+ // Fall back to left-aligned if gaps would be too large
33
+ if (extraSpace > normalSpaceWidth * 3 || totalWordWidth < availWidth * 0.7) {
34
+ ctx.fillText(text, x, y);
35
+ return;
36
+ }
37
+
29
38
  let xPos = x;
30
39
  for (const w of words) {
31
40
  ctx.fillText(w, xPos, y);
@@ -33,6 +42,70 @@ function drawJustifiedLine(ctx, text, x, y, availWidth) {
33
42
  }
34
43
  }
35
44
 
45
+ /**
46
+ * Draw a line of text with per-span coloring (for inline colored text like links).
47
+ */
48
+ function drawColoredLine(ctx, text, charOffset, spans, defaultColor, x, y) {
49
+ const lineStart = charOffset;
50
+ const lineEnd = charOffset + text.length;
51
+ let xPos = x;
52
+ let pos = 0;
53
+
54
+ for (const span of spans) {
55
+ if (span.charEnd <= lineStart || span.charStart >= lineEnd) continue;
56
+ const overlapStart = Math.max(span.charStart - lineStart, 0);
57
+ const overlapEnd = Math.min(span.charEnd - lineStart, text.length);
58
+
59
+ if (overlapStart > pos) {
60
+ const gapText = text.slice(pos, overlapStart);
61
+ ctx.fillStyle = defaultColor;
62
+ ctx.fillText(gapText, xPos, y);
63
+ xPos += ctx.measureText(gapText).width;
64
+ }
65
+
66
+ const spanText = text.slice(overlapStart, overlapEnd);
67
+ ctx.fillStyle = span.color;
68
+ ctx.fillText(spanText, xPos, y);
69
+ xPos += ctx.measureText(spanText).width;
70
+ pos = overlapEnd;
71
+ }
72
+
73
+ if (pos < text.length) {
74
+ ctx.fillStyle = defaultColor;
75
+ ctx.fillText(text.slice(pos), xPos, y);
76
+ }
77
+ }
78
+
79
+ /**
80
+ * Draw a line of justified text with per-span coloring.
81
+ */
82
+ function drawColoredJustifiedLine(ctx, text, charOffset, spans, defaultColor, x, y, availWidth) {
83
+ const words = text.split(" ");
84
+ if (words.length <= 1) {
85
+ drawColoredLine(ctx, text, charOffset, spans, defaultColor, x, y);
86
+ return;
87
+ }
88
+ let totalWordWidth = 0;
89
+ for (const w of words) totalWordWidth += ctx.measureText(w).width;
90
+ const normalSpaceWidth = ctx.measureText(" ").width;
91
+ const extraSpace = (availWidth - totalWordWidth) / (words.length - 1);
92
+
93
+ if (extraSpace > normalSpaceWidth * 3 || totalWordWidth < availWidth * 0.7) {
94
+ drawColoredLine(ctx, text, charOffset, spans, defaultColor, x, y);
95
+ return;
96
+ }
97
+
98
+ // Draw word by word with per-span coloring and justified spacing
99
+ let xPos = x;
100
+ let charPos = 0;
101
+ for (let wi = 0; wi < words.length; wi++) {
102
+ const word = words[wi];
103
+ drawColoredLine(ctx, word, charOffset + charPos, spans, defaultColor, xPos, y);
104
+ xPos += ctx.measureText(word).width + extraSpace;
105
+ charPos += word.length + 1; // +1 for space
106
+ }
107
+ }
108
+
36
109
  function bboxOverlap(a, b) {
37
110
  const x1 = Math.max(a.x, b.x);
38
111
  const y1 = Math.max(a.y, b.y);
@@ -83,25 +156,26 @@ async function extractFontMetadata(page, opList, OPS) {
83
156
  // ─── Text color extraction ───────────────────────────────────────────────
84
157
 
85
158
  /**
86
- * Extract fill colors from the operator list, indexed by text-drawing op.
87
- * The evaluator normalizes all fill-color commands to setFillRGBColor with
88
- * a hex string, so that's the primary path. Returns an array parallel to
89
- * the text items from getTextContent().
159
+ * Extract fill colors per beginText/endText block pair.
160
+ * Returns an array of color strings, one per text block.
161
+ *
162
+ * The previous approach pushed one color per text-drawing operator (showText,
163
+ * showSpacedText, etc.) and tried to index into text items 1:1. That mapping
164
+ * is broken because a single showSpacedText operator can produce multiple text
165
+ * items via buildTextContentItem(). Instead, we track color at text-block
166
+ * boundaries — all text items within the same beginText/endText pair share
167
+ * the same color context.
90
168
  */
91
- function extractTextColors(opList, OPS) {
92
- const textColors = [];
169
+ function extractTextBlockColors(opList, OPS) {
170
+ const blockColors = []; // one entry per beginText/endText pair
93
171
  let currentColor = "#000000";
94
-
95
- const textDrawOps = new Set([
96
- OPS.showText,
97
- OPS.showSpacedText,
98
- OPS.nextLineShowText,
99
- OPS.nextLineSetSpacingShowText,
100
- ]);
172
+ let blockColor = "#000000";
173
+ let inTextBlock = false;
101
174
 
102
175
  for (let i = 0; i < opList.fnArray.length; i++) {
103
176
  const fn = opList.fnArray[i];
104
177
 
178
+ // Track color changes
105
179
  if (fn === OPS.setFillRGBColor) {
106
180
  currentColor = opList.argsArray[i][0];
107
181
  } else if (fn === OPS.setFillTransparent) {
@@ -118,12 +192,30 @@ function extractTextColors(opList, OPS) {
118
192
  }
119
193
  }
120
194
 
121
- if (textDrawOps.has(fn)) {
122
- textColors.push(currentColor);
195
+ if (fn === OPS.beginText) {
196
+ inTextBlock = true;
197
+ blockColor = currentColor; // color at start of block
198
+ }
199
+
200
+ // If color changes within a text block, update (last color wins)
201
+ if (inTextBlock && (
202
+ fn === OPS.setFillRGBColor ||
203
+ fn === OPS.setFillTransparent ||
204
+ fn === OPS.setFillGray ||
205
+ fn === OPS.setFillColor ||
206
+ fn === OPS.setFillCMYKColor ||
207
+ fn === OPS.setFillColorN
208
+ )) {
209
+ blockColor = currentColor;
210
+ }
211
+
212
+ if (fn === OPS.endText) {
213
+ blockColors.push(blockColor);
214
+ inTextBlock = false;
123
215
  }
124
216
  }
125
217
 
126
- return textColors;
218
+ return blockColors;
127
219
  }
128
220
 
129
221
  // ─── Page analysis ────────────────────────────────────────────────────────
@@ -132,15 +224,43 @@ function extractTextColors(opList, OPS) {
132
224
  * Group adjacent text items into text blocks by proximity.
133
225
  * Also extracts font metadata: average size, italic, bold.
134
226
  */
135
- function groupTextBlocks(textItems, pageHeight, styles, fontMap, textColors) {
136
- // Attach colors to text items before filtering (textColors is parallel to
137
- // the full items array from getTextContent, including empty items)
138
- if (textColors) {
139
- let colorIdx = 0;
227
+ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
228
+ // Map text items to beginText/endText blocks by detecting position
229
+ // discontinuities. Items within the same text block are contiguous and
230
+ // share the same color. When there's a large Y-position jump or font
231
+ // change, we advance to the next block's color.
232
+ if (blockColors && blockColors.length > 0) {
233
+ let blockIdx = 0;
234
+ let prevY = null;
235
+ let prevFontName = null;
236
+ let itemsInCurrentBlock = 0;
237
+
140
238
  for (const item of textItems) {
141
- if (item.str !== undefined) {
142
- item._color = textColors[colorIdx++] || "#000000";
239
+ if (item.str === undefined) continue; // skip marked content
240
+
241
+ const y = item.transform ? item.transform[5] : null;
242
+ const fontHeight = item.transform
243
+ ? Math.hypot(item.transform[2], item.transform[3])
244
+ : 12;
245
+
246
+ // Detect text block boundary by position discontinuity
247
+ if (prevY !== null && y !== null && itemsInCurrentBlock > 0) {
248
+ const yDiff = Math.abs(y - prevY);
249
+ const fontChanged = item.fontName !== prevFontName;
250
+
251
+ if (
252
+ (yDiff > fontHeight * 3) ||
253
+ (fontChanged && yDiff > fontHeight * 0.5)
254
+ ) {
255
+ blockIdx = Math.min(blockIdx + 1, blockColors.length - 1);
256
+ itemsInCurrentBlock = 0;
257
+ }
143
258
  }
259
+
260
+ item._color = blockColors[blockIdx] || "#000000";
261
+ itemsInCurrentBlock++;
262
+ prevY = y;
263
+ prevFontName = item.fontName;
144
264
  }
145
265
  }
146
266
 
@@ -301,6 +421,29 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, textColors) {
301
421
  }
302
422
  }
303
423
  block.color = dominantColor;
424
+
425
+ // Build color spans — contiguous runs of items sharing the same color
426
+ // Character indices map to the concatenated text produced by blockToText
427
+ block.colorSpans = [];
428
+ if (block.items.length > 0) {
429
+ let spanColor = block.items[0]._color || "#000000";
430
+ let spanCharStart = 0;
431
+ let charCount = 0;
432
+
433
+ for (let i = 0; i < block.items.length; i++) {
434
+ const c = block.items[i]._color || "#000000";
435
+ const itemLen = (block.items[i].str || "").length;
436
+ if (c !== spanColor) {
437
+ block.colorSpans.push({ charStart: spanCharStart, charEnd: charCount, color: spanColor });
438
+ spanCharStart = charCount;
439
+ spanColor = c;
440
+ }
441
+ charCount += itemLen;
442
+ // Account for spaces inserted between items by blockToText
443
+ if (i < block.items.length - 1) charCount++;
444
+ }
445
+ block.colorSpans.push({ charStart: spanCharStart, charEnd: charCount, color: spanColor });
446
+ }
304
447
  }
305
448
 
306
449
  return blocks;
@@ -649,6 +792,32 @@ function buildRegionMap(textBlocks, graphicRegions, pageHeight) {
649
792
  });
650
793
  }
651
794
 
795
+ // ── Compute inter-block vertical gaps from original PDF layout ──
796
+ for (let i = 1; i < regions.length; i++) {
797
+ const prev = regions[i - 1];
798
+ const curr = regions[i];
799
+ const prevBottom = prev.bbox.y + prev.bbox.h;
800
+ const currTop = curr.bbox.y;
801
+ curr.gapBefore = Math.max(0, currTop - prevBottom);
802
+ }
803
+ if (regions.length > 0) {
804
+ regions[0].gapBefore = 0; // padding handles top margin
805
+ }
806
+
807
+ // Store absolute pixel gaps and compute body font size for scaling
808
+ const bodyBlocks = regions.filter(r =>
809
+ r.type === "text" && r.block?.fontScale && Math.abs(r.block.fontScale - 1) < 0.15);
810
+ const avgBodyFontSize = bodyBlocks.length > 0
811
+ ? bodyBlocks.reduce((s, r) => s + r.block.avgFontSize, 0) / bodyBlocks.length
812
+ : 12;
813
+ for (const region of regions) {
814
+ region.gapAbsolute = region.gapBefore || 0;
815
+ region._avgBodyFontSize = avgBodyFontSize;
816
+ // Keep gapRatio as fallback for any code that reads it
817
+ const avgBodyLH = avgBodyFontSize * 1.6;
818
+ region.gapRatio = (region.gapBefore || 0) / avgBodyLH;
819
+ }
820
+
652
821
  return regions;
653
822
  }
654
823
 
@@ -747,11 +916,11 @@ async function analyzePage(page, OPS) {
747
916
  // Extract real font metadata from commonObjs (bold, italic, weight, loadedName)
748
917
  const fontMap = await extractFontMetadata(page, opList, OPS);
749
918
 
750
- // Extract text colors from operator list (parallel to text items)
751
- const textColors = extractTextColors(opList, OPS);
919
+ // Extract text colors per beginText/endText block (not per operator)
920
+ const blockColors = extractTextBlockColors(opList, OPS);
752
921
 
753
- // Now group text blocks with real font data and colors
754
- const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap, textColors);
922
+ // Now group text blocks with real font data and block colors
923
+ const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap, blockColors);
755
924
 
756
925
  // Compute body font size (most common size = body text)
757
926
  const allSizes = textBlocks.map(b => Math.round(b.avgFontSize * 10) / 10);
@@ -809,6 +978,7 @@ async function analyzePage(page, OPS) {
809
978
  graphicRegions,
810
979
  offCanvas,
811
980
  fontMap,
981
+ bodyFontSize,
812
982
  };
813
983
  }
814
984
 
@@ -877,6 +1047,7 @@ function reflowAndComposite(analysis, opts) {
877
1047
  fontFamily: blockFamily,
878
1048
  align: block.align || "left",
879
1049
  color: block.color,
1050
+ colorSpans: block.colorSpans || [],
880
1051
  region,
881
1052
  });
882
1053
  } else {
@@ -908,12 +1079,15 @@ function reflowAndComposite(analysis, opts) {
908
1079
  }
909
1080
  }
910
1081
 
911
- // Total height
1082
+ // Total height — use absolute pixel gaps scaled by font ratio
912
1083
  const baseLH = fontSize * lineHeight;
913
1084
  let totalHeight = padding;
914
1085
  for (const r of reflowedRegions) {
915
1086
  totalHeight += r.height;
916
- totalHeight += baseLH * 0.4;
1087
+ const gapAbs = r.region?.gapAbsolute ?? 0;
1088
+ const bodyFS = r.region?._avgBodyFontSize || 12;
1089
+ const scaledGap = gapAbs * (fontSize / bodyFS);
1090
+ totalHeight += Math.max(4, Math.min(scaledGap, baseLH * 2.0));
917
1091
  }
918
1092
  totalHeight += padding;
919
1093
 
@@ -946,6 +1120,7 @@ export function createReflowRenderer(container, options = {}) {
946
1120
  let pdfjs = null;
947
1121
  let pdfDoc = null;
948
1122
  let currentPage = 0;
1123
+ const userSetFontSize = options.fontSize != null;
949
1124
  let fontSize = options.fontSize ?? 16;
950
1125
  let destroyed = false;
951
1126
 
@@ -1050,11 +1225,15 @@ export function createReflowRenderer(container, options = {}) {
1050
1225
  const justified = r.align === "justify";
1051
1226
  const availW = W - padding * 2;
1052
1227
 
1228
+ const hasMultipleColors = r.colorSpans && r.colorSpans.length > 1 &&
1229
+ !r.colorSpans.every(s => s.color === r.colorSpans[0].color);
1230
+
1053
1231
  if (!enableMorph) {
1054
1232
  ctx.fillStyle = r.color || textColor;
1055
1233
  ctx.font = `${style} ${weight} ${fs * d}px ${rFamily}`;
1056
1234
  }
1057
1235
 
1236
+ let lineCharOffset = 0;
1058
1237
  for (let lineIdx = 0; lineIdx < r.lines.length; lineIdx++) {
1059
1238
  const line = r.lines[lineIdx];
1060
1239
  const screenY = cursorY - scrollY;
@@ -1098,6 +1277,21 @@ export function createReflowRenderer(container, options = {}) {
1098
1277
  ctx.fillText(line.text, padding * d, screenY * d);
1099
1278
  }
1100
1279
  ctx.restore();
1280
+ } else if (hasMultipleColors) {
1281
+ // Per-span coloring for inline colored text (links, emphasis)
1282
+ if (shouldJustify) {
1283
+ drawColoredJustifiedLine(ctx, line.text, lineCharOffset, r.colorSpans,
1284
+ r.color || textColor, padding * d, screenY * d, availW * d);
1285
+ } else if (centered) {
1286
+ // Measure full line to center it, then draw colored from offset
1287
+ const lineW = ctx.measureText(line.text).width;
1288
+ const startX = (W * d - lineW) / 2;
1289
+ drawColoredLine(ctx, line.text, lineCharOffset, r.colorSpans,
1290
+ r.color || textColor, startX, screenY * d);
1291
+ } else {
1292
+ drawColoredLine(ctx, line.text, lineCharOffset, r.colorSpans,
1293
+ r.color || textColor, padding * d, screenY * d);
1294
+ }
1101
1295
  } else {
1102
1296
  if (centered) {
1103
1297
  ctx.textAlign = "center";
@@ -1110,6 +1304,7 @@ export function createReflowRenderer(container, options = {}) {
1110
1304
  }
1111
1305
  }
1112
1306
  }
1307
+ lineCharOffset += line.text.length;
1113
1308
  cursorY += lh;
1114
1309
  }
1115
1310
  } else if (r.type === "graphic" && r.bitmap) {
@@ -1132,7 +1327,10 @@ export function createReflowRenderer(container, options = {}) {
1132
1327
  }
1133
1328
  cursorY += r.drawH;
1134
1329
  }
1135
- cursorY += baseLH * 0.4;
1330
+ const gapAbs = r.region?.gapAbsolute ?? 0;
1331
+ const bodyFS = r.region?._avgBodyFontSize || 12;
1332
+ const scaledGap = gapAbs * (fontSize / bodyFS);
1333
+ cursorY += Math.max(4, Math.min(scaledGap, baseLH * 2.0));
1136
1334
  }
1137
1335
  }
1138
1336
 
@@ -1268,6 +1466,12 @@ export function createReflowRenderer(container, options = {}) {
1268
1466
 
1269
1467
  currentAnalysis = analysisCache.get(pageNum);
1270
1468
  currentPage = pageNum;
1469
+
1470
+ // Auto-match PDF body font size when user hasn't set an explicit fontSize
1471
+ if (!userSetFontSize && currentAnalysis.bodyFontSize) {
1472
+ fontSize = clamp(Math.round(currentAnalysis.bodyFontSize), minFont, maxFont);
1473
+ }
1474
+
1271
1475
  scrollY = 0;
1272
1476
  scrollVelocity = 0;
1273
1477
  reflow();
@@ -1279,6 +1483,7 @@ export function createReflowRenderer(container, options = {}) {
1279
1483
  graphicRegions: currentAnalysis.graphicRegions,
1280
1484
  pageWidth: currentAnalysis.pageWidth,
1281
1485
  pageHeight: currentAnalysis.pageHeight,
1486
+ bodyFontSize: currentAnalysis.bodyFontSize,
1282
1487
  });
1283
1488
  },
1284
1489