pretext-pdfjs 0.3.3 → 0.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/package.json +1 -1
  2. package/src/pinch.js +56 -4
  3. package/src/reflow.js +258 -96
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pretext-pdfjs",
3
- "version": "0.3.3",
3
+ "version": "0.3.4",
4
4
  "description": "Pretext-native text layer for PDF.js — zero DOM reflows, per-block reflow with image preservation, pinch-to-zoom text",
5
5
  "type": "module",
6
6
  "main": "./src/index.js",
package/src/pinch.js CHANGED
@@ -392,6 +392,37 @@ export function createPDFPinchReader(container, options = {}) {
392
392
  }
393
393
  }
394
394
 
395
+ /**
396
+ * Find adaptive paragraph threshold by analyzing gap distribution.
397
+ */
398
+ function findParagraphThreshold(gaps, fontSize) {
399
+ if (gaps.length < 3) return 1.8;
400
+
401
+ // Filter out extreme outliers (>5x font size - likely headers, titles, etc.)
402
+ const filtered = gaps.filter(g => g / fontSize < 5);
403
+ if (filtered.length < 3) return 1.8;
404
+
405
+ const ratios = filtered.map(g => g / fontSize).sort((a, b) => a - b);
406
+
407
+ let maxGap = 0;
408
+ let threshold = 1.8;
409
+
410
+ for (let i = 0; i < ratios.length - 1; i++) {
411
+ const gap = ratios[i + 1] - ratios[i];
412
+ if (gap > maxGap && gap > 0.25 && ratios[i] > 0.8) {
413
+ maxGap = gap;
414
+ threshold = (ratios[i] + ratios[i + 1]) / 2;
415
+ }
416
+ }
417
+
418
+ if (maxGap < 0.2) {
419
+ const idx = Math.floor(ratios.length * 0.75);
420
+ threshold = ratios[Math.min(idx, ratios.length - 1)];
421
+ }
422
+
423
+ return Math.max(1.25, Math.min(threshold, 2.2));
424
+ }
425
+
395
426
  /**
396
427
  * Extract plain text from a PDF page.
397
428
  * Joins text items with spaces, preserves paragraph breaks.
@@ -400,11 +431,32 @@ export function createPDFPinchReader(container, options = {}) {
400
431
  const page = await pdfDoc.getPage(pageNum);
401
432
  const content = await page.getTextContent();
402
433
 
403
- // Build text with paragraph detection
404
- let result = "";
434
+ // First pass: collect all gaps to compute adaptive threshold
435
+ const gaps = [];
405
436
  let lastY = null;
406
437
  let lastFontSize = 12;
407
438
 
439
+ for (const item of content.items) {
440
+ if (!item.str || !item.transform) continue;
441
+
442
+ const currentY = item.transform[5];
443
+ const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
444
+ if (fontHeight > 0) lastFontSize = fontHeight;
445
+
446
+ if (lastY !== null) {
447
+ gaps.push(Math.abs(currentY - lastY));
448
+ }
449
+ lastY = currentY;
450
+ }
451
+
452
+ // Compute adaptive threshold
453
+ const paraThreshold = findParagraphThreshold(gaps, lastFontSize);
454
+ const lineThreshold = lastFontSize * 0.3;
455
+
456
+ // Second pass: build text with adaptive threshold
457
+ let result = "";
458
+ lastY = null;
459
+
408
460
  for (const item of content.items) {
409
461
  if (!item.str) continue;
410
462
 
@@ -415,10 +467,10 @@ export function createPDFPinchReader(container, options = {}) {
415
467
 
416
468
  if (lastY !== null) {
417
469
  const gap = Math.abs(currentY - lastY);
418
- if (gap > lastFontSize * 1.8) {
470
+ if (gap > lastFontSize * paraThreshold) {
419
471
  // Paragraph break
420
472
  result += "\n\n";
421
- } else if (gap > lastFontSize * 0.3) {
473
+ } else if (gap > lineThreshold) {
422
474
  // Line break within paragraph — add space
423
475
  if (!result.endsWith(" ") && !result.endsWith("\n")) {
424
476
  result += " ";
package/src/reflow.js CHANGED
@@ -64,7 +64,7 @@ function drawColoredLine(ctx, text, charOffset, spans, defaultColor, x, y) {
64
64
  }
65
65
 
66
66
  const spanText = text.slice(overlapStart, overlapEnd);
67
- ctx.fillStyle = span.color;
67
+ ctx.fillStyle = span.color === "transparent" ? defaultColor : span.color;
68
68
  ctx.fillText(spanText, xPos, y);
69
69
  xPos += ctx.measureText(spanText).width;
70
70
  pos = overlapEnd;
@@ -156,111 +156,195 @@ async function extractFontMetadata(page, opList, OPS) {
156
156
  // ─── Text color extraction ───────────────────────────────────────────────
157
157
 
158
158
  /**
159
- * Extract fill colors per beginText/endText block pair.
160
- * Returns an array of color strings, one per text block.
161
- *
162
- * The previous approach pushed one color per text-drawing operator (showText,
163
- * showSpacedText, etc.) and tried to index into text items 1:1. That mapping
164
- * is broken because a single showSpacedText operator can produce multiple text
165
- * items via buildTextContentItem(). Instead, we track color at text-block
166
- * boundaries — all text items within the same beginText/endText pair share
167
- * the same color context.
159
+ * Extract text with colors from the operator list.
160
+ * Returns an array of {text, color} objects that can be matched to getTextContent() items.
168
161
  */
169
- function extractTextBlockColors(opList, OPS) {
170
- const blockColors = []; // one entry per beginText/endText pair
171
- let currentColor = "#000000";
172
- let blockColor = "#000000";
173
- let inTextBlock = false;
162
+ function extractTextWithColors(opList, OPS) {
163
+ const textRuns = []; // {text, color}
164
+ let fillColor = "#000000";
165
+ let strokeColor = "#000000";
166
+ let textRenderingMode = 0;
167
+
168
+ // Helper to extract text from glyph array
169
+ function glyphsToText(glyphs) {
170
+ if (!Array.isArray(glyphs)) return "";
171
+ return glyphs
172
+ .filter(g => g && typeof g === "object" && g.unicode)
173
+ .map(g => g.unicode)
174
+ .join("");
175
+ }
174
176
 
175
177
  for (let i = 0; i < opList.fnArray.length; i++) {
176
178
  const fn = opList.fnArray[i];
179
+ const args = opList.argsArray[i];
177
180
 
178
- // Track color changes
179
181
  if (fn === OPS.setFillRGBColor) {
180
- currentColor = opList.argsArray[i][0];
181
- } else if (fn === OPS.setFillTransparent) {
182
- currentColor = "transparent";
183
- } else if (
184
- fn === OPS.setFillGray ||
185
- fn === OPS.setFillColor ||
186
- fn === OPS.setFillCMYKColor ||
187
- fn === OPS.setFillColorN
188
- ) {
189
- const args = opList.argsArray[i];
190
- if (args?.[0] && typeof args[0] === "string" && args[0].startsWith("#")) {
191
- currentColor = args[0];
182
+ fillColor = argsToHex(args);
183
+ } else if (fn === OPS.setStrokeRGBColor) {
184
+ strokeColor = argsToHex(args);
185
+ } else if (fn === OPS.setTextRenderingMode) {
186
+ textRenderingMode = args[0];
187
+ } else if (fn === OPS.showText || fn === OPS.nextLineShowText || fn === OPS.nextLineSetSpacingShowText) {
188
+ const text = glyphsToText(args[0]);
189
+ if (text) {
190
+ textRuns.push({ text, color: visibleColor(fillColor, strokeColor, textRenderingMode) });
191
+ }
192
+ } else if (fn === OPS.showSpacedText) {
193
+ // showSpacedText has an array of [glyphs, spacing, glyphs, spacing, ...]
194
+ const arr = args[0];
195
+ if (Array.isArray(arr)) {
196
+ let combinedText = "";
197
+ for (let j = 0; j < arr.length; j += 2) {
198
+ const glyphs = arr[j];
199
+ if (glyphs) {
200
+ combinedText += glyphsToText(glyphs);
201
+ }
202
+ }
203
+ if (combinedText) {
204
+ textRuns.push({ text: combinedText, color: visibleColor(fillColor, strokeColor, textRenderingMode) });
205
+ }
192
206
  }
193
207
  }
208
+ }
194
209
 
195
- if (fn === OPS.beginText) {
196
- inTextBlock = true;
197
- blockColor = currentColor; // color at start of block
210
+ return textRuns;
211
+ }
212
+
213
+ /**
214
+ * Match text items to colors by content.
215
+ * Returns an array of colors aligned with textItems.
216
+ */
217
+ function matchColorsToTextItems(textItems, textRuns) {
218
+ const colors = [];
219
+ let runIdx = 0;
220
+
221
+ for (const item of textItems) {
222
+ if (item.str === undefined || !item.str.trim()) {
223
+ colors.push(null); // Skip non-text items
224
+ continue;
198
225
  }
199
226
 
200
- // If color changes within a text block, update (last color wins)
201
- if (inTextBlock && (
202
- fn === OPS.setFillRGBColor ||
203
- fn === OPS.setFillTransparent ||
204
- fn === OPS.setFillGray ||
205
- fn === OPS.setFillColor ||
206
- fn === OPS.setFillCMYKColor ||
207
- fn === OPS.setFillColorN
208
- )) {
209
- blockColor = currentColor;
227
+ const itemText = item.str.trim();
228
+ let matchedColor = "#000000"; // default
229
+
230
+ // Find a text run that matches this item
231
+ // Reset runIdx if we've gone too far (item may be earlier in the list)
232
+ if (runIdx >= textRuns.length) {
233
+ runIdx = 0;
210
234
  }
211
235
 
212
- if (fn === OPS.endText) {
213
- blockColors.push(blockColor);
214
- inTextBlock = false;
236
+ // Search for matching run starting from current position
237
+ for (let i = runIdx; i < textRuns.length; i++) {
238
+ const run = textRuns[i];
239
+ const runText = run.text.trim();
240
+
241
+ // Skip empty runs
242
+ if (!runText) continue;
243
+
244
+ // Check for exact match or substring match
245
+ if (runText === itemText ||
246
+ itemText.startsWith(runText) ||
247
+ runText.startsWith(itemText)) {
248
+ matchedColor = run.color;
249
+ runIdx = i + 1; // Start from next run for next item
250
+ break;
251
+ }
252
+ }
253
+
254
+ colors.push(matchedColor);
255
+ }
256
+
257
+ return colors;
258
+ }
259
+
260
+ /**
261
+ * Extract one visible color per text-drawing operator in the operator list.
262
+ * Returns an array that maps ~1:1 to the text items from getTextContent().
263
+ * DEPRECATED: Use extractTextWithColors + matchColorsToTextItems instead.
264
+ */
265
+ function extractTextItemColors(opList, OPS) {
266
+ const itemColors = []; // one entry per text-drawing operator
267
+ let fillColor = "#000000";
268
+ let strokeColor = "#000000";
269
+ let textRenderingMode = 0;
270
+
271
+ for (let i = 0; i < opList.fnArray.length; i++) {
272
+ const fn = opList.fnArray[i];
273
+
274
+ if (fn === OPS.setFillRGBColor) {
275
+ fillColor = argsToHex(opList.argsArray[i]);
276
+ } else if (fn === OPS.setStrokeRGBColor) {
277
+ strokeColor = argsToHex(opList.argsArray[i]);
278
+ } else if (fn === OPS.setTextRenderingMode) {
279
+ textRenderingMode = opList.argsArray[i][0];
280
+ } else if (
281
+ fn === OPS.showText ||
282
+ fn === OPS.nextLineShowText ||
283
+ fn === OPS.nextLineSetSpacingShowText
284
+ ) {
285
+ itemColors.push(visibleColor(fillColor, strokeColor, textRenderingMode));
286
+ } else if (fn === OPS.showSpacedText) {
287
+ itemColors.push(visibleColor(fillColor, strokeColor, textRenderingMode));
215
288
  }
216
289
  }
217
290
 
218
- return blockColors;
291
+ return itemColors;
292
+ }
293
+
294
+ /** Convert color operator args to a hex string. Args may be a hex string or RGB byte array. */
295
+ function argsToHex(args) {
296
+ if (typeof args[0] === "string" && args[0].startsWith("#")) return args[0];
297
+ const r = args[0] | 0, g = args[1] | 0, b = args[2] | 0;
298
+ return `#${r.toString(16).padStart(2, "0")}${g.toString(16).padStart(2, "0")}${b.toString(16).padStart(2, "0")}`;
299
+ }
300
+
301
+ /** Pick the color that will actually be visible based on text rendering mode. */
302
+ function visibleColor(fill, stroke, mode) {
303
+ const m = mode & 3; // lower 2 bits: 0=fill, 1=stroke, 2=fill+stroke, 3=invisible
304
+ if (m === 1) return stroke;
305
+ if (m === 0 || m === 2) return fill;
306
+ return "#000000"; // mode 3 (invisible) — show as black in reflow
219
307
  }
220
308
 
221
309
  // ─── Page analysis ────────────────────────────────────────────────────────
222
310
 
311
+ /**
312
+ * Find adaptive threshold for grouping items into blocks.
313
+ * Similar to paragraph detection but tuned for block-level grouping.
314
+ */
315
+ function findBlockThreshold(gaps, fontSize) {
316
+ if (gaps.length < 3) return 2.0; // Default block threshold
317
+
318
+ // Filter extreme outliers
319
+ const filtered = gaps.filter(g => g / fontSize < 5);
320
+ if (filtered.length < 3) return 2.0;
321
+
322
+ const ratios = filtered.map(g => g / fontSize).sort((a, b) => a - b);
323
+
324
+ // For block grouping, we want to be more conservative than paragraph detection
325
+ // Use the 60th percentile as the threshold - this separates:
326
+ // - Line spacing (~1.0-1.3x) from paragraph gaps (~1.5x+)
327
+ const idx = Math.floor(ratios.length * 0.6);
328
+ const threshold = ratios[Math.min(idx, ratios.length - 1)];
329
+
330
+ // Clamp: block threshold should be between 1.5x and 2.2x
331
+ // Lower than paragraph threshold to ensure paragraphs split into separate blocks
332
+ return Math.max(1.5, Math.min(threshold, 2.2));
333
+ }
334
+
223
335
  /**
224
336
  * Group adjacent text items into text blocks by proximity.
225
337
  * Also extracts font metadata: average size, italic, bold.
226
338
  */
227
- function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
228
- // Map text items to beginText/endText blocks by detecting position
229
- // discontinuities. Items within the same text block are contiguous and
230
- // share the same color. When there's a large Y-position jump or font
231
- // change, we advance to the next block's color.
232
- if (blockColors && blockColors.length > 0) {
233
- let blockIdx = 0;
234
- let prevY = null;
235
- let prevFontName = null;
236
- let itemsInCurrentBlock = 0;
237
-
238
- for (const item of textItems) {
239
- if (item.str === undefined) continue; // skip marked content
240
-
241
- const y = item.transform ? item.transform[5] : null;
242
- const fontHeight = item.transform
243
- ? Math.hypot(item.transform[2], item.transform[3])
244
- : 12;
245
-
246
- // Detect text block boundary by position discontinuity
247
- if (prevY !== null && y !== null && itemsInCurrentBlock > 0) {
248
- const yDiff = Math.abs(y - prevY);
249
- const fontChanged = item.fontName !== prevFontName;
250
-
251
- if (
252
- (yDiff > fontHeight * 3) ||
253
- (fontChanged && yDiff > fontHeight * 0.5)
254
- ) {
255
- blockIdx = Math.min(blockIdx + 1, blockColors.length - 1);
256
- itemsInCurrentBlock = 0;
257
- }
258
- }
259
-
260
- item._color = blockColors[blockIdx] || "#000000";
261
- itemsInCurrentBlock++;
262
- prevY = y;
263
- prevFontName = item.fontName;
339
+ function groupTextBlocks(textItems, pageHeight, styles, fontMap, textRuns) {
340
+ // Assign colors to text items by matching content from textRuns.
341
+ // textRuns is an array of {text, color} extracted from the operator list.
342
+ if (textRuns && textRuns.length > 0) {
343
+ const colors = matchColorsToTextItems(textItems, textRuns);
344
+ for (let i = 0; i < textItems.length; i++) {
345
+ const item = textItems[i];
346
+ if (item.str === undefined || !item.str.trim()) continue;
347
+ item._color = colors[i] || "#000000";
264
348
  }
265
349
  }
266
350
 
@@ -271,6 +355,23 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
271
355
  return a.transform[4] - b.transform[4];
272
356
  });
273
357
 
358
+ // First pass: collect all vertical gaps to compute adaptive block threshold
359
+ const gaps = [];
360
+ let lastY = null;
361
+ let lastFontSize = 12;
362
+ for (const item of sorted) {
363
+ const y = pageHeight - item.transform[5];
364
+ const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
365
+ if (fontHeight > 0) lastFontSize = fontHeight;
366
+ if (lastY !== null) {
367
+ gaps.push(Math.abs(y - lastY));
368
+ }
369
+ lastY = y;
370
+ }
371
+
372
+ // Compute adaptive block grouping threshold
373
+ const blockThreshold = findBlockThreshold(gaps, lastFontSize);
374
+
274
375
  const blocks = [];
275
376
  let current = null;
276
377
 
@@ -315,9 +416,10 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
315
416
  continue;
316
417
  }
317
418
 
419
+ // Use adaptive block threshold instead of fixed 2.5x
318
420
  if (
319
421
  sizeOk &&
320
- verticalGap < lastFH * 2.5 &&
422
+ verticalGap < lastFH * blockThreshold &&
321
423
  x < current.bbox.x + current.bbox.w + lastFH * 1.5
322
424
  ) {
323
425
  current.items.push(item);
@@ -408,9 +510,7 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
408
510
  const colorFreq = {};
409
511
  for (const item of block.items) {
410
512
  const c = item._color || "#000000";
411
- if (c !== "transparent") {
412
- colorFreq[c] = (colorFreq[c] || 0) + 1;
413
- }
513
+ colorFreq[c] = (colorFreq[c] || 0) + 1;
414
514
  }
415
515
  let dominantColor = "#000000";
416
516
  let maxColorFreq = 0;
@@ -601,29 +701,91 @@ function detectGraphicRegionsFromRender(offCanvas, textBlocks, renderScale) {
601
701
  return regions;
602
702
  }
603
703
 
704
+ /**
705
+ * Find adaptive paragraph threshold by analyzing gap distribution.
706
+ * Uses histogram approach to find natural breakpoint between line gaps and paragraph gaps.
707
+ */
708
+ function findParagraphThreshold(gaps, fontSize) {
709
+ if (gaps.length < 3) return 1.8; // Fallback for small blocks
710
+
711
+ // Filter out extreme outliers (>5x font size - likely headers, titles, etc.)
712
+ const filtered = gaps.filter(g => g / fontSize < 5);
713
+ if (filtered.length < 3) return 1.8;
714
+
715
+ // Convert to font size ratios and sort
716
+ const ratios = filtered.map(g => g / fontSize).sort((a, b) => a - b);
717
+
718
+ // Find the largest gap between consecutive ratios (the "elbow")
719
+ // Look for a significant jump (>0.3) between line spacing and paragraph spacing
720
+ let maxGap = 0;
721
+ let threshold = 1.8; // Default fallback
722
+
723
+ for (let i = 0; i < ratios.length - 1; i++) {
724
+ const gap = ratios[i + 1] - ratios[i];
725
+ // Look for significant gaps above typical line spacing (0.8x+)
726
+ if (gap > maxGap && gap > 0.25 && ratios[i] > 0.8) {
727
+ maxGap = gap;
728
+ threshold = (ratios[i] + ratios[i + 1]) / 2;
729
+ }
730
+ }
731
+
732
+ // If no clear cluster boundary found, use percentile-based approach
733
+ // 75th percentile usually separates lines from paragraphs
734
+ if (maxGap < 0.2) {
735
+ const idx = Math.floor(ratios.length * 0.75);
736
+ threshold = ratios[Math.min(idx, ratios.length - 1)];
737
+ }
738
+
739
+ // Clamp to reasonable range for paragraph detection
740
+ // Line spacing is typically 1.0-1.3x, paragraphs 1.3-1.8x+
741
+ return Math.max(1.25, Math.min(threshold, 2.2));
742
+ }
743
+
604
744
  /**
605
745
  * Build text content for a block, preserving paragraph breaks.
606
746
  */
607
747
  function blockToText(block, pageHeight) {
608
- let result = "";
748
+ // First pass: collect all gaps and font sizes to compute adaptive threshold
749
+ const gaps = [];
609
750
  let lastY = null;
610
- let lastX = null;
611
- let lastW = 0;
612
751
  let lastFontSize = 12;
613
752
 
614
753
  for (const item of block.items) {
615
754
  if (!item.str) continue;
616
- const currentX = item.transform[4];
617
755
  const currentY = pageHeight - item.transform[5];
618
756
  const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
619
757
  if (fontHeight > 0) lastFontSize = fontHeight;
620
758
 
759
+ if (lastY !== null) {
760
+ const vGap = Math.abs(currentY - lastY);
761
+ gaps.push(vGap);
762
+ }
763
+ lastY = currentY;
764
+ }
765
+
766
+ // Compute adaptive paragraph threshold
767
+ const paraThreshold = findParagraphThreshold(gaps, lastFontSize);
768
+ const lineThreshold = lastFontSize * 0.3; // Keep fixed line threshold
769
+
770
+ // Second pass: build text with adaptive threshold
771
+ let result = "";
772
+ lastY = null;
773
+ let lastX = null;
774
+ let lastW = 0;
775
+
776
+ for (const item of block.items) {
777
+ if (!item.str) continue;
778
+ const currentX = item.transform[4];
779
+ const currentY = pageHeight - item.transform[5];
780
+
621
781
  if (lastY !== null) {
622
782
  const vGap = Math.abs(currentY - lastY);
623
783
  const isShortItem = (item.str || "").trim().length <= 2;
624
- if (vGap > lastFontSize * 1.8 && !isShortItem) {
784
+
785
+ // Use adaptive threshold for paragraph detection
786
+ if (vGap > lastFontSize * paraThreshold && !isShortItem) {
625
787
  result += "\n\n";
626
- } else if (vGap > lastFontSize * 0.3) {
788
+ } else if (vGap > lineThreshold) {
627
789
  // Different line — insert space
628
790
  if (!result.endsWith(" ") && !result.endsWith("\n")) {
629
791
  result += " ";
@@ -916,11 +1078,11 @@ async function analyzePage(page, OPS) {
916
1078
  // Extract real font metadata from commonObjs (bold, italic, weight, loadedName)
917
1079
  const fontMap = await extractFontMetadata(page, opList, OPS);
918
1080
 
919
- // Extract text colors per beginText/endText block (not per operator)
920
- const blockColors = extractTextBlockColors(opList, OPS);
1081
+ // Extract text with colors from operator list
1082
+ const textRuns = extractTextWithColors(opList, OPS);
921
1083
 
922
- // Now group text blocks with real font data and block colors
923
- const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap, blockColors);
1084
+ // Now group text blocks with real font data and matched colors
1085
+ const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap, textRuns);
924
1086
 
925
1087
  // Compute body font size (most common size = body text)
926
1088
  const allSizes = textBlocks.map(b => Math.round(b.avgFontSize * 10) / 10);