pretext-pdfjs 0.3.2 → 0.3.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (2)
  1. package/package.json +1 -1
  2. package/src/reflow.js +92 -37
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pretext-pdfjs",
3
- "version": "0.3.2",
3
+ "version": "0.3.3",
4
4
  "description": "Pretext-native text layer for PDF.js — zero DOM reflows, per-block reflow with image preservation, pinch-to-zoom text",
5
5
  "type": "module",
6
6
  "main": "./src/index.js",
package/src/reflow.js CHANGED
@@ -156,25 +156,26 @@ async function extractFontMetadata(page, opList, OPS) {
156
156
  // ─── Text color extraction ───────────────────────────────────────────────
157
157
 
158
158
  /**
159
- * Extract fill colors from the operator list, indexed by text-drawing op.
160
- * The evaluator normalizes all fill-color commands to setFillRGBColor with
161
- * a hex string, so that's the primary path. Returns an array parallel to
162
- * the text items from getTextContent().
159
+ * Extract fill colors per beginText/endText block pair.
160
+ * Returns an array of color strings, one per text block.
161
+ *
162
+ * The previous approach pushed one color per text-drawing operator (showText,
163
+ * showSpacedText, etc.) and tried to index into text items 1:1. That mapping
164
+ * is broken because a single showSpacedText operator can produce multiple text
165
+ * items via buildTextContentItem(). Instead, we track color at text-block
166
+ * boundaries — all text items within the same beginText/endText pair share
167
+ * the same color context.
163
168
  */
164
- function extractTextColors(opList, OPS) {
165
- const textColors = [];
169
+ function extractTextBlockColors(opList, OPS) {
170
+ const blockColors = []; // one entry per beginText/endText pair
166
171
  let currentColor = "#000000";
167
-
168
- const textDrawOps = new Set([
169
- OPS.showText,
170
- OPS.showSpacedText,
171
- OPS.nextLineShowText,
172
- OPS.nextLineSetSpacingShowText,
173
- ]);
172
+ let blockColor = "#000000";
173
+ let inTextBlock = false;
174
174
 
175
175
  for (let i = 0; i < opList.fnArray.length; i++) {
176
176
  const fn = opList.fnArray[i];
177
177
 
178
+ // Track color changes
178
179
  if (fn === OPS.setFillRGBColor) {
179
180
  currentColor = opList.argsArray[i][0];
180
181
  } else if (fn === OPS.setFillTransparent) {
@@ -191,12 +192,30 @@ function extractTextColors(opList, OPS) {
191
192
  }
192
193
  }
193
194
 
194
- if (textDrawOps.has(fn)) {
195
- textColors.push(currentColor);
195
+ if (fn === OPS.beginText) {
196
+ inTextBlock = true;
197
+ blockColor = currentColor; // color at start of block
198
+ }
199
+
200
+ // If color changes within a text block, update (last color wins)
201
+ if (inTextBlock && (
202
+ fn === OPS.setFillRGBColor ||
203
+ fn === OPS.setFillTransparent ||
204
+ fn === OPS.setFillGray ||
205
+ fn === OPS.setFillColor ||
206
+ fn === OPS.setFillCMYKColor ||
207
+ fn === OPS.setFillColorN
208
+ )) {
209
+ blockColor = currentColor;
210
+ }
211
+
212
+ if (fn === OPS.endText) {
213
+ blockColors.push(blockColor);
214
+ inTextBlock = false;
196
215
  }
197
216
  }
198
217
 
199
- return textColors;
218
+ return blockColors;
200
219
  }
201
220
 
202
221
  // ─── Page analysis ────────────────────────────────────────────────────────
@@ -205,15 +224,43 @@ function extractTextColors(opList, OPS) {
205
224
  * Group adjacent text items into text blocks by proximity.
206
225
  * Also extracts font metadata: average size, italic, bold.
207
226
  */
208
- function groupTextBlocks(textItems, pageHeight, styles, fontMap, textColors) {
209
- // Attach colors to text items before filtering (textColors is parallel to
210
- // the full items array from getTextContent, including empty items)
211
- if (textColors) {
212
- let colorIdx = 0;
227
+ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
228
+ // Map text items to beginText/endText blocks by detecting position
229
+ // discontinuities. Items within the same text block are contiguous and
230
+ // share the same color. When there's a large Y-position jump or font
231
+ // change, we advance to the next block's color.
232
+ if (blockColors && blockColors.length > 0) {
233
+ let blockIdx = 0;
234
+ let prevY = null;
235
+ let prevFontName = null;
236
+ let itemsInCurrentBlock = 0;
237
+
213
238
  for (const item of textItems) {
214
- if (item.str !== undefined) {
215
- item._color = textColors[colorIdx++] || "#000000";
239
+ if (item.str === undefined) continue; // skip marked content
240
+
241
+ const y = item.transform ? item.transform[5] : null;
242
+ const fontHeight = item.transform
243
+ ? Math.hypot(item.transform[2], item.transform[3])
244
+ : 12;
245
+
246
+ // Detect text block boundary by position discontinuity
247
+ if (prevY !== null && y !== null && itemsInCurrentBlock > 0) {
248
+ const yDiff = Math.abs(y - prevY);
249
+ const fontChanged = item.fontName !== prevFontName;
250
+
251
+ if (
252
+ (yDiff > fontHeight * 3) ||
253
+ (fontChanged && yDiff > fontHeight * 0.5)
254
+ ) {
255
+ blockIdx = Math.min(blockIdx + 1, blockColors.length - 1);
256
+ itemsInCurrentBlock = 0;
257
+ }
216
258
  }
259
+
260
+ item._color = blockColors[blockIdx] || "#000000";
261
+ itemsInCurrentBlock++;
262
+ prevY = y;
263
+ prevFontName = item.fontName;
217
264
  }
218
265
  }
219
266
 
@@ -754,16 +801,20 @@ function buildRegionMap(textBlocks, graphicRegions, pageHeight) {
754
801
  curr.gapBefore = Math.max(0, currTop - prevBottom);
755
802
  }
756
803
  if (regions.length > 0) {
757
- regions[0].gapBefore = regions[0].bbox.y;
804
+ regions[0].gapBefore = 0; // padding handles top margin
758
805
  }
759
806
 
760
- // Normalize gaps relative to average body line height
807
+ // Store absolute pixel gaps and compute body font size for scaling
761
808
  const bodyBlocks = regions.filter(r =>
762
809
  r.type === "text" && r.block?.fontScale && Math.abs(r.block.fontScale - 1) < 0.15);
763
- const avgBodyLH = bodyBlocks.length > 0
764
- ? bodyBlocks.reduce((s, r) => s + r.block.avgFontSize * 1.6, 0) / bodyBlocks.length
765
- : 12 * 1.6;
810
+ const avgBodyFontSize = bodyBlocks.length > 0
811
+ ? bodyBlocks.reduce((s, r) => s + r.block.avgFontSize, 0) / bodyBlocks.length
812
+ : 12;
766
813
  for (const region of regions) {
814
+ region.gapAbsolute = region.gapBefore || 0;
815
+ region._avgBodyFontSize = avgBodyFontSize;
816
+ // Keep gapRatio as fallback for any code that reads it
817
+ const avgBodyLH = avgBodyFontSize * 1.6;
767
818
  region.gapRatio = (region.gapBefore || 0) / avgBodyLH;
768
819
  }
769
820
 
@@ -865,11 +916,11 @@ async function analyzePage(page, OPS) {
865
916
  // Extract real font metadata from commonObjs (bold, italic, weight, loadedName)
866
917
  const fontMap = await extractFontMetadata(page, opList, OPS);
867
918
 
868
- // Extract text colors from operator list (parallel to text items)
869
- const textColors = extractTextColors(opList, OPS);
919
+ // Extract text colors per beginText/endText block (not per operator)
920
+ const blockColors = extractTextBlockColors(opList, OPS);
870
921
 
871
- // Now group text blocks with real font data and colors
872
- const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap, textColors);
922
+ // Now group text blocks with real font data and block colors
923
+ const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap, blockColors);
873
924
 
874
925
  // Compute body font size (most common size = body text)
875
926
  const allSizes = textBlocks.map(b => Math.round(b.avgFontSize * 10) / 10);
@@ -1028,13 +1079,15 @@ function reflowAndComposite(analysis, opts) {
1028
1079
  }
1029
1080
  }
1030
1081
 
1031
- // Total height — use original PDF gap ratios between regions
1082
+ // Total height — use absolute pixel gaps scaled by font ratio
1032
1083
  const baseLH = fontSize * lineHeight;
1033
1084
  let totalHeight = padding;
1034
1085
  for (const r of reflowedRegions) {
1035
1086
  totalHeight += r.height;
1036
- const gapRatio = r.region?.gapRatio ?? 0.4;
1037
- totalHeight += baseLH * Math.max(0.2, Math.min(gapRatio, 2.0));
1087
+ const gapAbs = r.region?.gapAbsolute ?? 0;
1088
+ const bodyFS = r.region?._avgBodyFontSize || 12;
1089
+ const scaledGap = gapAbs * (fontSize / bodyFS);
1090
+ totalHeight += Math.max(4, Math.min(scaledGap, baseLH * 2.0));
1038
1091
  }
1039
1092
  totalHeight += padding;
1040
1093
 
@@ -1274,8 +1327,10 @@ export function createReflowRenderer(container, options = {}) {
1274
1327
  }
1275
1328
  cursorY += r.drawH;
1276
1329
  }
1277
- const gapRatio = r.region?.gapRatio ?? 0.4;
1278
- cursorY += baseLH * Math.max(0.2, Math.min(gapRatio, 2.0));
1330
+ const gapAbs = r.region?.gapAbsolute ?? 0;
1331
+ const bodyFS = r.region?._avgBodyFontSize || 12;
1332
+ const scaledGap = gapAbs * (fontSize / bodyFS);
1333
+ cursorY += Math.max(4, Math.min(scaledGap, baseLH * 2.0));
1279
1334
  }
1280
1335
  }
1281
1336