pretext-pdfjs 0.3.0 → 0.3.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/package.json +1 -1
  2. package/src/reflow.js +269 -12
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pretext-pdfjs",
3
- "version": "0.3.0",
3
+ "version": "0.3.2",
4
4
  "description": "Pretext-native text layer for PDF.js — zero DOM reflows, per-block reflow with image preservation, pinch-to-zoom text",
5
5
  "type": "module",
6
6
  "main": "./src/index.js",
package/src/reflow.js CHANGED
@@ -25,7 +25,16 @@ function drawJustifiedLine(ctx, text, x, y, availWidth) {
25
25
  }
26
26
  let totalWordWidth = 0;
27
27
  for (const w of words) totalWordWidth += ctx.measureText(w).width;
28
+
29
+ const normalSpaceWidth = ctx.measureText(" ").width;
28
30
  const extraSpace = (availWidth - totalWordWidth) / (words.length - 1);
31
+
32
+ // Fall back to left-aligned if gaps would be too large
33
+ if (extraSpace > normalSpaceWidth * 3 || totalWordWidth < availWidth * 0.7) {
34
+ ctx.fillText(text, x, y);
35
+ return;
36
+ }
37
+
29
38
  let xPos = x;
30
39
  for (const w of words) {
31
40
  ctx.fillText(w, xPos, y);
@@ -33,6 +42,70 @@ function drawJustifiedLine(ctx, text, x, y, availWidth) {
33
42
  }
34
43
  }
35
44
 
/**
 * Draw one line of text left to right, coloring each character range
 * according to the block-level color spans that overlap it (e.g. inline links).
 *
 * Span offsets are expressed in block coordinates. `charOffset` is the block
 * offset of the first character of `text`, so a span covers the line-local
 * range [charStart - charOffset, charEnd - charOffset), clipped to the line.
 *
 * @param {object} ctx - Canvas 2D-style context. Only `fillStyle`, `fillText`
 *   and `measureText` are used; `fillStyle` is left at the last color drawn.
 * @param {string} text - The line to draw.
 * @param {number} charOffset - Offset of `text[0]` within the block's text.
 * @param {Array<{charStart: number, charEnd: number, color: string}>} spans -
 *   Color runs. They are consumed in array order and are expected to be in
 *   ascending `charStart` order.
 * @param {string} defaultColor - Color for characters that no span covers, and
 *   for spans whose own color is missing or "transparent".
 * @param {number} x - X position passed to `fillText` for the first segment.
 * @param {number} y - Y position passed to `fillText` for every segment.
 */
function drawColoredLine(ctx, text, charOffset, spans, defaultColor, x, y) {
  const lineStart = charOffset;
  const lineEnd = charOffset + text.length;
  let xPos = x;
  let pos = 0; // line-local index of the first character not drawn yet

  // Paint one run and advance the pen by its measured width.
  const drawSegment = (segment, color) => {
    ctx.fillStyle = color;
    ctx.fillText(segment, xPos, y);
    xPos += ctx.measureText(segment).width;
  };

  for (const span of spans) {
    if (span.charEnd <= lineStart || span.charStart >= lineEnd) continue;

    // Clamp the start to `pos` so that overlapping spans never repaint
    // characters that were already drawn (which would also shift the pen).
    const start = Math.max(span.charStart - lineStart, pos);
    const end = Math.min(span.charEnd - lineStart, text.length);
    if (end <= start) continue;

    if (start > pos) {
      drawSegment(text.slice(pos, start), defaultColor);
    }

    // A missing or "transparent" color would render the run invisible; use the
    // default color instead so reflowed text always stays readable.
    const hasUsableColor = Boolean(span.color) && span.color !== "transparent";
    drawSegment(text.slice(start, end), hasUsableColor ? span.color : defaultColor);
    pos = end;
  }

  // Tail of the line that no span covers.
  if (pos < text.length) {
    ctx.fillStyle = defaultColor;
    ctx.fillText(text.slice(pos), xPos, y);
  }
}
78
+
/**
 * Draw a justified line of text with per-span coloring.
 *
 * The line is split on single spaces and the words are spread across
 * `availWidth` with equal gaps. Each word is painted via `drawColoredLine` so
 * that the color spans still apply. The line is drawn unjustified (one
 * `drawColoredLine` call for the whole text) when:
 *   - it has at most one word,
 *   - the computed gap is not positive (the words alone are at least as wide
 *     as `availWidth`, so justifying would make them touch or overlap),
 *   - the gap would exceed three normal space widths, or
 *   - the words fill less than 70% of `availWidth`.
 *
 * @param {object} ctx - Canvas 2D-style context (uses `measureText` here).
 * @param {string} text - The line to draw.
 * @param {number} charOffset - Offset of `text[0]` within the block's text.
 * @param {Array<{charStart: number, charEnd: number, color: string}>} spans -
 *   Color runs in block coordinates, forwarded to `drawColoredLine`.
 * @param {string} defaultColor - Color for characters that no span covers.
 * @param {number} x - Left edge of the line.
 * @param {number} y - Y position forwarded to `drawColoredLine`.
 * @param {number} availWidth - Width the line should be stretched to.
 */
function drawColoredJustifiedLine(ctx, text, charOffset, spans, defaultColor, x, y, availWidth) {
  const words = text.split(" ");
  if (words.length <= 1) {
    drawColoredLine(ctx, text, charOffset, spans, defaultColor, x, y);
    return;
  }

  let totalWordWidth = 0;
  for (const w of words) totalWordWidth += ctx.measureText(w).width;
  const normalSpaceWidth = ctx.measureText(" ").width;
  const extraSpace = (availWidth - totalWordWidth) / (words.length - 1);

  // Fall back to natural spacing when justification would look wrong: gaps
  // that collapse to nothing or go negative, or gaps that are far too wide.
  const gapsCollapse = extraSpace <= 0;
  const gapsTooWide = extraSpace > normalSpaceWidth * 3 || totalWordWidth < availWidth * 0.7;
  if (gapsCollapse || gapsTooWide) {
    drawColoredLine(ctx, text, charOffset, spans, defaultColor, x, y);
    return;
  }

  // Draw word by word with per-span coloring and justified spacing.
  let xPos = x;
  let charPos = 0;
  for (const word of words) {
    drawColoredLine(ctx, word, charOffset + charPos, spans, defaultColor, xPos, y);
    xPos += ctx.measureText(word).width + extraSpace;
    charPos += word.length + 1; // +1 for the space consumed by split(" ")
  }
}
108
+
36
109
  function bboxOverlap(a, b) {
37
110
  const x1 = Math.max(a.x, b.x);
38
111
  const y1 = Math.max(a.y, b.y);
@@ -80,13 +153,70 @@ async function extractFontMetadata(page, opList, OPS) {
80
153
  return fontMap;
81
154
  }
82
155
 
156
+ // ─── Text color extraction ───────────────────────────────────────────────
157
+
/**
 * Walk the operator list and record the fill color that is in effect at each
 * text-drawing operator.
 *
 * Tracking rules:
 *   - The color starts as "#000000".
 *   - `setFillRGBColor` sets it from a hex string argument. Three numeric
 *     components are also accepted and formatted as "#rrggbb".
 *   - `setFillTransparent` sets it to the literal "transparent".
 *   - `setFillGray` / `setFillColor` / `setFillCMYKColor` / `setFillColorN`
 *     set it only when their first argument is a "#"-prefixed string; other
 *     argument shapes are ignored.
 *   - `save` / `restore` push and pop the color, because the fill color is
 *     part of the graphics state: a color set between them must not leak into
 *     text drawn after the matching `restore`.
 *
 * @param {{fnArray: number[], argsArray: Array}} opList - Operator list with
 *   parallel `fnArray` / `argsArray` arrays.
 * @param {object} OPS - Operator-code table (the PDF.js `OPS` constants).
 * @returns {string[]} One color per text-drawing operator, in operator order.
 *   NOTE(review): the caller treats this as parallel to the items returned by
 *   getTextContent(); that one-to-one correspondence is assumed, not checked.
 */
function extractTextColors(opList, OPS) {
  const textColors = [];
  const savedColors = []; // colors pushed by save, popped by restore
  let currentColor = "#000000";

  const textDrawOps = new Set([
    OPS.showText,
    OPS.showSpacedText,
    OPS.nextLineShowText,
    OPS.nextLineSetSpacingShowText,
  ]);
  const genericFillOps = new Set([
    OPS.setFillGray,
    OPS.setFillColor,
    OPS.setFillCMYKColor,
    OPS.setFillColorN,
  ]);

  // Format three numeric components as "#rrggbb".
  // Assumes byte components in 0-255 — TODO confirm against the PDF.js
  // version in use (some builds are believed to pass r, g, b numbers instead
  // of a hex string).
  const componentsToHex = (args) => {
    const parts = [];
    for (let k = 0; k < 3; k++) {
      const byte = Math.max(0, Math.min(255, Math.round(args[k])));
      parts.push(byte.toString(16).padStart(2, "0"));
    }
    return `#${parts.join("")}`;
  };

  for (let i = 0; i < opList.fnArray.length; i++) {
    const fn = opList.fnArray[i];
    const args = opList.argsArray[i];

    if (fn === OPS.save) {
      savedColors.push(currentColor);
    } else if (fn === OPS.restore) {
      // An unbalanced restore keeps the current color rather than failing.
      if (savedColors.length > 0) currentColor = savedColors.pop();
    } else if (fn === OPS.setFillRGBColor) {
      if (typeof args?.[0] === "string") {
        currentColor = args[0];
      } else if (args?.length >= 3 && [0, 1, 2].every((k) => Number.isFinite(args[k]))) {
        currentColor = componentsToHex(args);
      }
      // Anything else (missing args, unknown shape) leaves the color unchanged
      // so that a non-string value never reaches the renderer.
    } else if (fn === OPS.setFillTransparent) {
      currentColor = "transparent";
    } else if (genericFillOps.has(fn)) {
      if (typeof args?.[0] === "string" && args[0].startsWith("#")) {
        currentColor = args[0];
      }
    }

    if (textDrawOps.has(fn)) {
      textColors.push(currentColor);
    }
  }

  return textColors;
}
201
+
83
202
  // ─── Page analysis ────────────────────────────────────────────────────────
84
203
 
85
204
  /**
86
205
  * Group adjacent text items into text blocks by proximity.
87
206
  * Also extracts font metadata: average size, italic, bold.
88
207
  */
89
- function groupTextBlocks(textItems, pageHeight, styles, fontMap) {
208
+ function groupTextBlocks(textItems, pageHeight, styles, fontMap, textColors) {
209
+ // Attach colors to text items before filtering (textColors is parallel to
210
+ // the full items array from getTextContent, including empty items)
211
+ if (textColors) {
212
+ let colorIdx = 0;
213
+ for (const item of textItems) {
214
+ if (item.str !== undefined) {
215
+ item._color = textColors[colorIdx++] || "#000000";
216
+ }
217
+ }
218
+ }
219
+
90
220
  const sorted = [...textItems].filter(i => i.str?.trim()).sort((a, b) => {
91
221
  const ay = pageHeight - a.transform[5];
92
222
  const by = pageHeight - b.transform[5];
@@ -226,6 +356,47 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap) {
226
356
 
227
357
  // Store the font metadata for the dominant font in this block
228
358
  block.fontMeta = fontMap?.get(block.items[0]?.fontName) || null;
359
+
360
+ // Compute dominant fill color for the block
361
+ const colorFreq = {};
362
+ for (const item of block.items) {
363
+ const c = item._color || "#000000";
364
+ if (c !== "transparent") {
365
+ colorFreq[c] = (colorFreq[c] || 0) + 1;
366
+ }
367
+ }
368
+ let dominantColor = "#000000";
369
+ let maxColorFreq = 0;
370
+ for (const [c, freq] of Object.entries(colorFreq)) {
371
+ if (freq > maxColorFreq) {
372
+ maxColorFreq = freq;
373
+ dominantColor = c;
374
+ }
375
+ }
376
+ block.color = dominantColor;
377
+
378
+ // Build color spans — contiguous runs of items sharing the same color
379
+ // Character indices map to the concatenated text produced by blockToText
380
+ block.colorSpans = [];
381
+ if (block.items.length > 0) {
382
+ let spanColor = block.items[0]._color || "#000000";
383
+ let spanCharStart = 0;
384
+ let charCount = 0;
385
+
386
+ for (let i = 0; i < block.items.length; i++) {
387
+ const c = block.items[i]._color || "#000000";
388
+ const itemLen = (block.items[i].str || "").length;
389
+ if (c !== spanColor) {
390
+ block.colorSpans.push({ charStart: spanCharStart, charEnd: charCount, color: spanColor });
391
+ spanCharStart = charCount;
392
+ spanColor = c;
393
+ }
394
+ charCount += itemLen;
395
+ // Account for spaces inserted between items by blockToText
396
+ if (i < block.items.length - 1) charCount++;
397
+ }
398
+ block.colorSpans.push({ charStart: spanCharStart, charEnd: charCount, color: spanColor });
399
+ }
229
400
  }
230
401
 
231
402
  return blocks;
@@ -389,26 +560,40 @@ function detectGraphicRegionsFromRender(offCanvas, textBlocks, renderScale) {
389
560
  function blockToText(block, pageHeight) {
390
561
  let result = "";
391
562
  let lastY = null;
563
+ let lastX = null;
564
+ let lastW = 0;
392
565
  let lastFontSize = 12;
393
566
 
394
567
  for (const item of block.items) {
395
568
  if (!item.str) continue;
569
+ const currentX = item.transform[4];
396
570
  const currentY = pageHeight - item.transform[5];
397
571
  const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
398
572
  if (fontHeight > 0) lastFontSize = fontHeight;
399
573
 
400
574
  if (lastY !== null) {
401
- const gap = Math.abs(currentY - lastY);
575
+ const vGap = Math.abs(currentY - lastY);
402
576
  const isShortItem = (item.str || "").trim().length <= 2;
403
- if (gap > lastFontSize * 1.8 && !isShortItem) {
577
+ if (vGap > lastFontSize * 1.8 && !isShortItem) {
404
578
  result += "\n\n";
405
- } else if (gap > lastFontSize * 0.3) {
579
+ } else if (vGap > lastFontSize * 0.3) {
580
+ // Different line — insert space
406
581
  if (!result.endsWith(" ") && !result.endsWith("\n")) {
407
582
  result += " ";
408
583
  }
584
+ } else if (lastX !== null) {
585
+ // Same line — check horizontal gap between items
586
+ const hGap = currentX - (lastX + lastW);
587
+ if (hGap > lastFontSize * 0.15) {
588
+ if (!result.endsWith(" ") && !result.endsWith("\n")) {
589
+ result += " ";
590
+ }
591
+ }
409
592
  }
410
593
  }
411
594
  lastY = currentY;
595
+ lastX = currentX;
596
+ lastW = item.width || 0;
412
597
  result += item.str;
413
598
  }
414
599
  return result.trim();
@@ -560,6 +745,28 @@ function buildRegionMap(textBlocks, graphicRegions, pageHeight) {
560
745
  });
561
746
  }
562
747
 
748
+ // ── Compute inter-block vertical gaps from original PDF layout ──
749
+ for (let i = 1; i < regions.length; i++) {
750
+ const prev = regions[i - 1];
751
+ const curr = regions[i];
752
+ const prevBottom = prev.bbox.y + prev.bbox.h;
753
+ const currTop = curr.bbox.y;
754
+ curr.gapBefore = Math.max(0, currTop - prevBottom);
755
+ }
756
+ if (regions.length > 0) {
757
+ regions[0].gapBefore = regions[0].bbox.y;
758
+ }
759
+
760
+ // Normalize gaps relative to average body line height
761
+ const bodyBlocks = regions.filter(r =>
762
+ r.type === "text" && r.block?.fontScale && Math.abs(r.block.fontScale - 1) < 0.15);
763
+ const avgBodyLH = bodyBlocks.length > 0
764
+ ? bodyBlocks.reduce((s, r) => s + r.block.avgFontSize * 1.6, 0) / bodyBlocks.length
765
+ : 12 * 1.6;
766
+ for (const region of regions) {
767
+ region.gapRatio = (region.gapBefore || 0) / avgBodyLH;
768
+ }
769
+
563
770
  return regions;
564
771
  }
565
772
 
@@ -658,8 +865,11 @@ async function analyzePage(page, OPS) {
658
865
  // Extract real font metadata from commonObjs (bold, italic, weight, loadedName)
659
866
  const fontMap = await extractFontMetadata(page, opList, OPS);
660
867
 
661
- // Now group text blocks with real font data
662
- const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap);
868
+ // Extract text colors from operator list (parallel to text items)
869
+ const textColors = extractTextColors(opList, OPS);
870
+
871
+ // Now group text blocks with real font data and colors
872
+ const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap, textColors);
663
873
 
664
874
  // Compute body font size (most common size = body text)
665
875
  const allSizes = textBlocks.map(b => Math.round(b.avgFontSize * 10) / 10);
@@ -717,6 +927,7 @@ async function analyzePage(page, OPS) {
717
927
  graphicRegions,
718
928
  offCanvas,
719
929
  fontMap,
930
+ bodyFontSize,
720
931
  };
721
932
  }
722
933
 
@@ -784,6 +995,8 @@ function reflowAndComposite(analysis, opts) {
784
995
  fontWeight: weight,
785
996
  fontFamily: blockFamily,
786
997
  align: block.align || "left",
998
+ color: block.color,
999
+ colorSpans: block.colorSpans || [],
787
1000
  region,
788
1001
  });
789
1002
  } else {
@@ -815,12 +1028,13 @@ function reflowAndComposite(analysis, opts) {
815
1028
  }
816
1029
  }
817
1030
 
818
- // Total height
1031
+ // Total height — use original PDF gap ratios between regions
819
1032
  const baseLH = fontSize * lineHeight;
820
1033
  let totalHeight = padding;
821
1034
  for (const r of reflowedRegions) {
822
1035
  totalHeight += r.height;
823
- totalHeight += baseLH * 0.4;
1036
+ const gapRatio = r.region?.gapRatio ?? 0.4;
1037
+ totalHeight += baseLH * Math.max(0.2, Math.min(gapRatio, 2.0));
824
1038
  }
825
1039
  totalHeight += padding;
826
1040
 
@@ -853,6 +1067,7 @@ export function createReflowRenderer(container, options = {}) {
853
1067
  let pdfjs = null;
854
1068
  let pdfDoc = null;
855
1069
  let currentPage = 0;
1070
+ const userSetFontSize = options.fontSize != null;
856
1071
  let fontSize = options.fontSize ?? 16;
857
1072
  let destroyed = false;
858
1073
 
@@ -957,11 +1172,15 @@ export function createReflowRenderer(container, options = {}) {
957
1172
  const justified = r.align === "justify";
958
1173
  const availW = W - padding * 2;
959
1174
 
1175
+ const hasMultipleColors = r.colorSpans && r.colorSpans.length > 1 &&
1176
+ !r.colorSpans.every(s => s.color === r.colorSpans[0].color);
1177
+
960
1178
  if (!enableMorph) {
961
- ctx.fillStyle = textColor;
1179
+ ctx.fillStyle = r.color || textColor;
962
1180
  ctx.font = `${style} ${weight} ${fs * d}px ${rFamily}`;
963
1181
  }
964
1182
 
1183
+ let lineCharOffset = 0;
965
1184
  for (let lineIdx = 0; lineIdx < r.lines.length; lineIdx++) {
966
1185
  const line = r.lines[lineIdx];
967
1186
  const screenY = cursorY - scrollY;
@@ -976,10 +1195,24 @@ export function createReflowRenderer(container, options = {}) {
976
1195
  const ease = 1 - (1 - t) ** 3;
977
1196
  const morphedFS = fs * (1 - ease * (1 - edgeFontRatio));
978
1197
  const opacity = 1.0 + (0.2 - 1.0) * ease;
979
- const c = Math.round(37 - (37 - 160) * ease);
1198
+ // Blend the block's actual color toward gray at edges
1199
+ const blockColor = r.color || textColor;
1200
+ let morphColor;
1201
+ if (blockColor.startsWith("#") && blockColor.length === 7) {
1202
+ const br = parseInt(blockColor.slice(1, 3), 16);
1203
+ const bg_ = parseInt(blockColor.slice(3, 5), 16);
1204
+ const bb = parseInt(blockColor.slice(5, 7), 16);
1205
+ const dimR = Math.round(br + (160 - br) * ease);
1206
+ const dimG = Math.round(bg_ + (160 - bg_) * ease);
1207
+ const dimB = Math.round(bb + (160 - bb) * ease);
1208
+ morphColor = `rgb(${dimR},${dimG},${dimB})`;
1209
+ } else {
1210
+ const c = Math.round(37 - (37 - 160) * ease);
1211
+ morphColor = `rgb(${c},${c - 2},${c - 3})`;
1212
+ }
980
1213
  ctx.save();
981
1214
  ctx.globalAlpha = opacity;
982
- ctx.fillStyle = `rgb(${c},${c - 2},${c - 3})`;
1215
+ ctx.fillStyle = morphColor;
983
1216
  ctx.font = `${style} ${weight} ${morphedFS * d}px ${rFamily}`;
984
1217
  if (centered) {
985
1218
  ctx.textAlign = "center";
@@ -991,6 +1224,21 @@ export function createReflowRenderer(container, options = {}) {
991
1224
  ctx.fillText(line.text, padding * d, screenY * d);
992
1225
  }
993
1226
  ctx.restore();
1227
+ } else if (hasMultipleColors) {
1228
+ // Per-span coloring for inline colored text (links, emphasis)
1229
+ if (shouldJustify) {
1230
+ drawColoredJustifiedLine(ctx, line.text, lineCharOffset, r.colorSpans,
1231
+ r.color || textColor, padding * d, screenY * d, availW * d);
1232
+ } else if (centered) {
1233
+ // Measure full line to center it, then draw colored from offset
1234
+ const lineW = ctx.measureText(line.text).width;
1235
+ const startX = (W * d - lineW) / 2;
1236
+ drawColoredLine(ctx, line.text, lineCharOffset, r.colorSpans,
1237
+ r.color || textColor, startX, screenY * d);
1238
+ } else {
1239
+ drawColoredLine(ctx, line.text, lineCharOffset, r.colorSpans,
1240
+ r.color || textColor, padding * d, screenY * d);
1241
+ }
994
1242
  } else {
995
1243
  if (centered) {
996
1244
  ctx.textAlign = "center";
@@ -1003,6 +1251,7 @@ export function createReflowRenderer(container, options = {}) {
1003
1251
  }
1004
1252
  }
1005
1253
  }
1254
+ lineCharOffset += line.text.length;
1006
1255
  cursorY += lh;
1007
1256
  }
1008
1257
  } else if (r.type === "graphic" && r.bitmap) {
@@ -1025,7 +1274,8 @@ export function createReflowRenderer(container, options = {}) {
1025
1274
  }
1026
1275
  cursorY += r.drawH;
1027
1276
  }
1028
- cursorY += baseLH * 0.4;
1277
+ const gapRatio = r.region?.gapRatio ?? 0.4;
1278
+ cursorY += baseLH * Math.max(0.2, Math.min(gapRatio, 2.0));
1029
1279
  }
1030
1280
  }
1031
1281
 
@@ -1161,6 +1411,12 @@ export function createReflowRenderer(container, options = {}) {
1161
1411
 
1162
1412
  currentAnalysis = analysisCache.get(pageNum);
1163
1413
  currentPage = pageNum;
1414
+
1415
+ // Auto-match PDF body font size when user hasn't set an explicit fontSize
1416
+ if (!userSetFontSize && currentAnalysis.bodyFontSize) {
1417
+ fontSize = clamp(Math.round(currentAnalysis.bodyFontSize), minFont, maxFont);
1418
+ }
1419
+
1164
1420
  scrollY = 0;
1165
1421
  scrollVelocity = 0;
1166
1422
  reflow();
@@ -1172,6 +1428,7 @@ export function createReflowRenderer(container, options = {}) {
1172
1428
  graphicRegions: currentAnalysis.graphicRegions,
1173
1429
  pageWidth: currentAnalysis.pageWidth,
1174
1430
  pageHeight: currentAnalysis.pageHeight,
1431
+ bodyFontSize: currentAnalysis.bodyFontSize,
1175
1432
  });
1176
1433
  },
1177
1434