pretext-pdfjs 0.3.0 → 0.3.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2) hide show
  1. package/package.json +1 -1
  2. package/src/reflow.js +116 -9
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pretext-pdfjs",
3
- "version": "0.3.0",
3
+ "version": "0.3.1",
4
4
  "description": "Pretext-native text layer for PDF.js — zero DOM reflows, per-block reflow with image preservation, pinch-to-zoom text",
5
5
  "type": "module",
6
6
  "main": "./src/index.js",
package/src/reflow.js CHANGED
@@ -80,13 +80,70 @@ async function extractFontMetadata(page, opList, OPS) {
80
80
  return fontMap;
81
81
  }
82
82
 
83
+ // ─── Text color extraction ───────────────────────────────────────────────
84
+
85
/**
 * Collect the fill color in effect at each text-drawing operator.
 *
 * Walks `opList.fnArray` in order while tracking the current fill color and
 * appends that color to the result each time a text-showing operator
 * (showText, showSpacedText, nextLineShowText, nextLineSetSpacingShowText)
 * is encountered. The result has one entry per text-drawing op, in operator
 * order; callers pair it positionally with the items from getTextContent().
 *
 * Fill color is part of the graphics state, so `save` pushes the current
 * color and `restore` brings the saved one back. Without this, a color set
 * inside a save/restore pair would leak onto text drawn after the restore.
 *
 * @param {{fnArray: number[], argsArray: Array<Array|null|undefined>}} opList
 *   Operator list; `argsArray[i]` holds the arguments of `fnArray[i]`.
 * @param {Object<string, number>} OPS
 *   Operator-name → operator-code table.
 * @returns {string[]} One color per text-drawing op: a string taken from the
 *   fill-color operator (normally "#rrggbb"), or "transparent".
 */
function extractTextColors(opList, OPS) {
  const DEFAULT_COLOR = "#000000";
  const textColors = [];
  // Colors captured by `save`, restored in LIFO order by `restore`.
  const savedColors = [];
  let currentColor = DEFAULT_COLOR;

  const textDrawOps = new Set([
    OPS.showText,
    OPS.showSpacedText,
    OPS.nextLineShowText,
    OPS.nextLineSetSpacingShowText,
  ]);

  const genericFillOps = new Set([
    OPS.setFillGray,
    OPS.setFillColor,
    OPS.setFillCMYKColor,
    OPS.setFillColorN,
  ]);

  // Clamp a numeric component to 0–255 and format it as two hex digits.
  const toHexByte = (value) =>
    Math.min(255, Math.max(0, Math.round(value)))
      .toString(16)
      .padStart(2, "0");

  for (let i = 0; i < opList.fnArray.length; i++) {
    const fn = opList.fnArray[i];
    const args = opList.argsArray[i];

    if (fn === OPS.save) {
      savedColors.push(currentColor);
    } else if (fn === OPS.restore) {
      // Ignore an unbalanced restore instead of resetting the color.
      if (savedColors.length > 0) {
        currentColor = savedColors.pop();
      }
    } else if (fn === OPS.setFillRGBColor) {
      const first = args?.[0];
      if (typeof first === "string") {
        currentColor = first;
      } else if (
        Array.isArray(args) &&
        args.length >= 3 &&
        args.slice(0, 3).every(Number.isFinite)
      ) {
        // NOTE(review): assumes numeric args are r, g, b in the 0–255 range,
        // as emitted by older operator-list formats — confirm against the
        // PDF.js version in use. Storing the raw number would break callers
        // that treat the color as a string.
        const [r, g, b] = args;
        currentColor = `#${toHexByte(r)}${toHexByte(g)}${toHexByte(b)}`;
      }
      // Missing or unrecognized args: keep the previous color.
    } else if (fn === OPS.setFillTransparent) {
      currentColor = "transparent";
    } else if (genericFillOps.has(fn)) {
      // Only accept these ops when the argument is already a hex string.
      const first = args?.[0];
      if (typeof first === "string" && first.startsWith("#")) {
        currentColor = first;
      }
    }

    if (textDrawOps.has(fn)) {
      textColors.push(currentColor);
    }
  }

  return textColors;
}
128
+
83
129
  // ─── Page analysis ────────────────────────────────────────────────────────
84
130
 
85
131
  /**
86
132
  * Group adjacent text items into text blocks by proximity.
87
133
  * Also extracts font metadata: average size, italic, bold.
88
134
  */
89
- function groupTextBlocks(textItems, pageHeight, styles, fontMap) {
135
+ function groupTextBlocks(textItems, pageHeight, styles, fontMap, textColors) {
136
+ // Attach colors to text items before filtering (textColors is parallel to
137
+ // the full items array from getTextContent, including empty items)
138
+ if (textColors) {
139
+ let colorIdx = 0;
140
+ for (const item of textItems) {
141
+ if (item.str !== undefined) {
142
+ item._color = textColors[colorIdx++] || "#000000";
143
+ }
144
+ }
145
+ }
146
+
90
147
  const sorted = [...textItems].filter(i => i.str?.trim()).sort((a, b) => {
91
148
  const ay = pageHeight - a.transform[5];
92
149
  const by = pageHeight - b.transform[5];
@@ -226,6 +283,24 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap) {
226
283
 
227
284
  // Store the font metadata for the dominant font in this block
228
285
  block.fontMeta = fontMap?.get(block.items[0]?.fontName) || null;
286
+
287
+ // Compute dominant fill color for the block
288
+ const colorFreq = {};
289
+ for (const item of block.items) {
290
+ const c = item._color || "#000000";
291
+ if (c !== "transparent") {
292
+ colorFreq[c] = (colorFreq[c] || 0) + 1;
293
+ }
294
+ }
295
+ let dominantColor = "#000000";
296
+ let maxColorFreq = 0;
297
+ for (const [c, freq] of Object.entries(colorFreq)) {
298
+ if (freq > maxColorFreq) {
299
+ maxColorFreq = freq;
300
+ dominantColor = c;
301
+ }
302
+ }
303
+ block.color = dominantColor;
229
304
  }
230
305
 
231
306
  return blocks;
@@ -389,26 +464,40 @@ function detectGraphicRegionsFromRender(offCanvas, textBlocks, renderScale) {
389
464
  function blockToText(block, pageHeight) {
390
465
  let result = "";
391
466
  let lastY = null;
467
+ let lastX = null;
468
+ let lastW = 0;
392
469
  let lastFontSize = 12;
393
470
 
394
471
  for (const item of block.items) {
395
472
  if (!item.str) continue;
473
+ const currentX = item.transform[4];
396
474
  const currentY = pageHeight - item.transform[5];
397
475
  const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
398
476
  if (fontHeight > 0) lastFontSize = fontHeight;
399
477
 
400
478
  if (lastY !== null) {
401
- const gap = Math.abs(currentY - lastY);
479
+ const vGap = Math.abs(currentY - lastY);
402
480
  const isShortItem = (item.str || "").trim().length <= 2;
403
- if (gap > lastFontSize * 1.8 && !isShortItem) {
481
+ if (vGap > lastFontSize * 1.8 && !isShortItem) {
404
482
  result += "\n\n";
405
- } else if (gap > lastFontSize * 0.3) {
483
+ } else if (vGap > lastFontSize * 0.3) {
484
+ // Different line — insert space
406
485
  if (!result.endsWith(" ") && !result.endsWith("\n")) {
407
486
  result += " ";
408
487
  }
488
+ } else if (lastX !== null) {
489
+ // Same line — check horizontal gap between items
490
+ const hGap = currentX - (lastX + lastW);
491
+ if (hGap > lastFontSize * 0.15) {
492
+ if (!result.endsWith(" ") && !result.endsWith("\n")) {
493
+ result += " ";
494
+ }
495
+ }
409
496
  }
410
497
  }
411
498
  lastY = currentY;
499
+ lastX = currentX;
500
+ lastW = item.width || 0;
412
501
  result += item.str;
413
502
  }
414
503
  return result.trim();
@@ -658,8 +747,11 @@ async function analyzePage(page, OPS) {
658
747
  // Extract real font metadata from commonObjs (bold, italic, weight, loadedName)
659
748
  const fontMap = await extractFontMetadata(page, opList, OPS);
660
749
 
661
- // Now group text blocks with real font data
662
- const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap);
750
+ // Extract text colors from operator list (parallel to text items)
751
+ const textColors = extractTextColors(opList, OPS);
752
+
753
+ // Now group text blocks with real font data and colors
754
+ const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap, textColors);
663
755
 
664
756
  // Compute body font size (most common size = body text)
665
757
  const allSizes = textBlocks.map(b => Math.round(b.avgFontSize * 10) / 10);
@@ -784,6 +876,7 @@ function reflowAndComposite(analysis, opts) {
784
876
  fontWeight: weight,
785
877
  fontFamily: blockFamily,
786
878
  align: block.align || "left",
879
+ color: block.color,
787
880
  region,
788
881
  });
789
882
  } else {
@@ -958,7 +1051,7 @@ export function createReflowRenderer(container, options = {}) {
958
1051
  const availW = W - padding * 2;
959
1052
 
960
1053
  if (!enableMorph) {
961
- ctx.fillStyle = textColor;
1054
+ ctx.fillStyle = r.color || textColor;
962
1055
  ctx.font = `${style} ${weight} ${fs * d}px ${rFamily}`;
963
1056
  }
964
1057
 
@@ -976,10 +1069,24 @@ export function createReflowRenderer(container, options = {}) {
976
1069
  const ease = 1 - (1 - t) ** 3;
977
1070
  const morphedFS = fs * (1 - ease * (1 - edgeFontRatio));
978
1071
  const opacity = 1.0 + (0.2 - 1.0) * ease;
979
- const c = Math.round(37 - (37 - 160) * ease);
1072
+ // Blend the block's actual color toward gray at edges
1073
+ const blockColor = r.color || textColor;
1074
+ let morphColor;
1075
+ if (blockColor.startsWith("#") && blockColor.length === 7) {
1076
+ const br = parseInt(blockColor.slice(1, 3), 16);
1077
+ const bg_ = parseInt(blockColor.slice(3, 5), 16);
1078
+ const bb = parseInt(blockColor.slice(5, 7), 16);
1079
+ const dimR = Math.round(br + (160 - br) * ease);
1080
+ const dimG = Math.round(bg_ + (160 - bg_) * ease);
1081
+ const dimB = Math.round(bb + (160 - bb) * ease);
1082
+ morphColor = `rgb(${dimR},${dimG},${dimB})`;
1083
+ } else {
1084
+ const c = Math.round(37 - (37 - 160) * ease);
1085
+ morphColor = `rgb(${c},${c - 2},${c - 3})`;
1086
+ }
980
1087
  ctx.save();
981
1088
  ctx.globalAlpha = opacity;
982
- ctx.fillStyle = `rgb(${c},${c - 2},${c - 3})`;
1089
+ ctx.fillStyle = morphColor;
983
1090
  ctx.font = `${style} ${weight} ${morphedFS * d}px ${rFamily}`;
984
1091
  if (centered) {
985
1092
  ctx.textAlign = "center";