pretext-pdfjs 0.3.3 → 0.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (3) hide show
  1. package/package.json +1 -1
  2. package/src/pinch.js +56 -4
  3. package/src/reflow.js +506 -103
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "pretext-pdfjs",
3
- "version": "0.3.3",
3
+ "version": "0.3.5",
4
4
  "description": "Pretext-native text layer for PDF.js — zero DOM reflows, per-block reflow with image preservation, pinch-to-zoom text",
5
5
  "type": "module",
6
6
  "main": "./src/index.js",
package/src/pinch.js CHANGED
@@ -392,6 +392,37 @@ export function createPDFPinchReader(container, options = {}) {
392
392
  }
393
393
  }
394
394
 
395
/**
 * Derive the paragraph-break multiplier (in units of font size) from the
 * distribution of vertical gaps between consecutive text items.
 *
 * Strategy:
 *  1. Express every gap as a ratio of `fontSize`, dropping ratios of 5 or more.
 *  2. Look for the widest jump (> 0.25) between neighbouring sorted ratios whose
 *     lower side is above 0.8; the midpoint of that jump becomes the threshold.
 *  3. If no such jump exists, fall back to the 75th-percentile ratio.
 *  4. Clamp the result to the range [1.25, 2.2].
 *
 * @param {number[]} gaps - Vertical distances between consecutive items.
 * @param {number} fontSize - Font size used to normalise the gaps.
 * @returns {number} Multiplier of the font size; 1.8 when fewer than three
 *   usable gaps are available.
 */
function findParagraphThreshold(gaps, fontSize) {
  const FALLBACK = 1.8;
  if (gaps.length < 3) return FALLBACK;

  // Normalise, discard extreme outliers, and order ascending.
  const ratios = gaps
    .map((g) => g / fontSize)
    .filter((ratio) => ratio < 5)
    .sort((a, b) => a - b);
  if (ratios.length < 3) return FALLBACK;

  let widestJump = 0;
  let threshold = FALLBACK;

  // Compare each ratio with its predecessor; the first widest jump wins ties.
  ratios.slice(1).forEach((upper, i) => {
    const lower = ratios[i];
    const jump = upper - lower;
    if (jump > widestJump && jump > 0.25 && lower > 0.8) {
      widestJump = jump;
      threshold = (lower + upper) / 2;
    }
  });

  // No clear cluster boundary: use the 75th percentile instead.
  if (widestJump < 0.2) {
    const percentileIdx = Math.floor(ratios.length * 0.75);
    threshold = ratios[Math.min(percentileIdx, ratios.length - 1)];
  }

  return Math.max(1.25, Math.min(threshold, 2.2));
}
425
+
395
426
  /**
396
427
  * Extract plain text from a PDF page.
397
428
  * Joins text items with spaces, preserves paragraph breaks.
@@ -400,11 +431,32 @@ export function createPDFPinchReader(container, options = {}) {
400
431
  const page = await pdfDoc.getPage(pageNum);
401
432
  const content = await page.getTextContent();
402
433
 
403
- // Build text with paragraph detection
404
- let result = "";
434
+ // First pass: collect all gaps to compute adaptive threshold
435
+ const gaps = [];
405
436
  let lastY = null;
406
437
  let lastFontSize = 12;
407
438
 
439
+ for (const item of content.items) {
440
+ if (!item.str || !item.transform) continue;
441
+
442
+ const currentY = item.transform[5];
443
+ const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
444
+ if (fontHeight > 0) lastFontSize = fontHeight;
445
+
446
+ if (lastY !== null) {
447
+ gaps.push(Math.abs(currentY - lastY));
448
+ }
449
+ lastY = currentY;
450
+ }
451
+
452
+ // Compute adaptive threshold
453
+ const paraThreshold = findParagraphThreshold(gaps, lastFontSize);
454
+ const lineThreshold = lastFontSize * 0.3;
455
+
456
+ // Second pass: build text with adaptive threshold
457
+ let result = "";
458
+ lastY = null;
459
+
408
460
  for (const item of content.items) {
409
461
  if (!item.str) continue;
410
462
 
@@ -415,10 +467,10 @@ export function createPDFPinchReader(container, options = {}) {
415
467
 
416
468
  if (lastY !== null) {
417
469
  const gap = Math.abs(currentY - lastY);
418
- if (gap > lastFontSize * 1.8) {
470
+ if (gap > lastFontSize * paraThreshold) {
419
471
  // Paragraph break
420
472
  result += "\n\n";
421
- } else if (gap > lastFontSize * 0.3) {
473
+ } else if (gap > lineThreshold) {
422
474
  // Line break within paragraph — add space
423
475
  if (!result.endsWith(" ") && !result.endsWith("\n")) {
424
476
  result += " ";
package/src/reflow.js CHANGED
@@ -64,7 +64,7 @@ function drawColoredLine(ctx, text, charOffset, spans, defaultColor, x, y) {
64
64
  }
65
65
 
66
66
  const spanText = text.slice(overlapStart, overlapEnd);
67
- ctx.fillStyle = span.color;
67
+ ctx.fillStyle = span.color === "transparent" ? defaultColor : span.color;
68
68
  ctx.fillText(spanText, xPos, y);
69
69
  xPos += ctx.measureText(spanText).width;
70
70
  pos = overlapEnd;
@@ -156,111 +156,195 @@ async function extractFontMetadata(page, opList, OPS) {
156
156
  // ─── Text color extraction ───────────────────────────────────────────────
157
157
 
158
158
  /**
159
- * Extract fill colors per beginText/endText block pair.
160
- * Returns an array of color strings, one per text block.
161
- *
162
- * The previous approach pushed one color per text-drawing operator (showText,
163
- * showSpacedText, etc.) and tried to index into text items 1:1. That mapping
164
- * is broken because a single showSpacedText operator can produce multiple text
165
- * items via buildTextContentItem(). Instead, we track color at text-block
166
- * boundaries — all text items within the same beginText/endText pair share
167
- * the same color context.
159
+ * Extract text with colors from the operator list.
160
+ * Returns an array of {text, color} objects that can be matched to getTextContent() items.
168
161
  */
169
- function extractTextBlockColors(opList, OPS) {
170
- const blockColors = []; // one entry per beginText/endText pair
171
- let currentColor = "#000000";
172
- let blockColor = "#000000";
173
- let inTextBlock = false;
162
+ function extractTextWithColors(opList, OPS) {
163
+ const textRuns = []; // {text, color}
164
+ let fillColor = "#000000";
165
+ let strokeColor = "#000000";
166
+ let textRenderingMode = 0;
167
+
168
+ // Helper to extract text from glyph array
169
+ function glyphsToText(glyphs) {
170
+ if (!Array.isArray(glyphs)) return "";
171
+ return glyphs
172
+ .filter(g => g && typeof g === "object" && g.unicode)
173
+ .map(g => g.unicode)
174
+ .join("");
175
+ }
174
176
 
175
177
  for (let i = 0; i < opList.fnArray.length; i++) {
176
178
  const fn = opList.fnArray[i];
179
+ const args = opList.argsArray[i];
177
180
 
178
- // Track color changes
179
181
  if (fn === OPS.setFillRGBColor) {
180
- currentColor = opList.argsArray[i][0];
181
- } else if (fn === OPS.setFillTransparent) {
182
- currentColor = "transparent";
183
- } else if (
184
- fn === OPS.setFillGray ||
185
- fn === OPS.setFillColor ||
186
- fn === OPS.setFillCMYKColor ||
187
- fn === OPS.setFillColorN
188
- ) {
189
- const args = opList.argsArray[i];
190
- if (args?.[0] && typeof args[0] === "string" && args[0].startsWith("#")) {
191
- currentColor = args[0];
182
+ fillColor = argsToHex(args);
183
+ } else if (fn === OPS.setStrokeRGBColor) {
184
+ strokeColor = argsToHex(args);
185
+ } else if (fn === OPS.setTextRenderingMode) {
186
+ textRenderingMode = args[0];
187
+ } else if (fn === OPS.showText || fn === OPS.nextLineShowText || fn === OPS.nextLineSetSpacingShowText) {
188
+ const text = glyphsToText(args[0]);
189
+ if (text) {
190
+ textRuns.push({ text, color: visibleColor(fillColor, strokeColor, textRenderingMode) });
191
+ }
192
+ } else if (fn === OPS.showSpacedText) {
193
+ // showSpacedText has an array of [glyphs, spacing, glyphs, spacing, ...]
194
+ const arr = args[0];
195
+ if (Array.isArray(arr)) {
196
+ let combinedText = "";
197
+ for (let j = 0; j < arr.length; j += 2) {
198
+ const glyphs = arr[j];
199
+ if (glyphs) {
200
+ combinedText += glyphsToText(glyphs);
201
+ }
202
+ }
203
+ if (combinedText) {
204
+ textRuns.push({ text: combinedText, color: visibleColor(fillColor, strokeColor, textRenderingMode) });
205
+ }
192
206
  }
193
207
  }
208
+ }
194
209
 
195
- if (fn === OPS.beginText) {
196
- inTextBlock = true;
197
- blockColor = currentColor; // color at start of block
210
+ return textRuns;
211
+ }
212
+
213
+ /**
214
+ * Match text items to colors by content.
215
+ * Returns an array of colors aligned with textItems.
216
+ */
217
+ function matchColorsToTextItems(textItems, textRuns) {
218
+ const colors = [];
219
+ let runIdx = 0;
220
+
221
+ for (const item of textItems) {
222
+ if (item.str === undefined || !item.str.trim()) {
223
+ colors.push(null); // Skip non-text items
224
+ continue;
198
225
  }
199
226
 
200
- // If color changes within a text block, update (last color wins)
201
- if (inTextBlock && (
202
- fn === OPS.setFillRGBColor ||
203
- fn === OPS.setFillTransparent ||
204
- fn === OPS.setFillGray ||
205
- fn === OPS.setFillColor ||
206
- fn === OPS.setFillCMYKColor ||
207
- fn === OPS.setFillColorN
208
- )) {
209
- blockColor = currentColor;
227
+ const itemText = item.str.trim();
228
+ let matchedColor = "#000000"; // default
229
+
230
+ // Find a text run that matches this item
231
+ // Reset runIdx if we've gone too far (item may be earlier in the list)
232
+ if (runIdx >= textRuns.length) {
233
+ runIdx = 0;
210
234
  }
211
235
 
212
- if (fn === OPS.endText) {
213
- blockColors.push(blockColor);
214
- inTextBlock = false;
236
+ // Search for matching run starting from current position
237
+ for (let i = runIdx; i < textRuns.length; i++) {
238
+ const run = textRuns[i];
239
+ const runText = run.text.trim();
240
+
241
+ // Skip empty runs
242
+ if (!runText) continue;
243
+
244
+ // Check for exact match or substring match
245
+ if (runText === itemText ||
246
+ itemText.startsWith(runText) ||
247
+ runText.startsWith(itemText)) {
248
+ matchedColor = run.color;
249
+ runIdx = i + 1; // Start from next run for next item
250
+ break;
251
+ }
215
252
  }
253
+
254
+ colors.push(matchedColor);
216
255
  }
217
256
 
218
- return blockColors;
257
+ return colors;
258
+ }
259
+
260
/**
 * Record the visible color in effect at every text-drawing operator
 * (showText, nextLineShowText, nextLineSetSpacingShowText, showSpacedText)
 * of an operator list, in operator order.
 *
 * DEPRECATED: Use extractTextWithColors + matchColorsToTextItems instead.
 *
 * @param {{fnArray: Array, argsArray: Array}} opList - Operator list to scan.
 * @param {Object} OPS - Operator-code table the `fnArray` entries are compared against.
 * @returns {string[]} One color per text-drawing operator encountered.
 */
function extractTextItemColors(opList, OPS) {
  const collected = [];
  let fill = "#000000";
  let stroke = "#000000";
  let renderMode = 0;

  for (let idx = 0; idx < opList.fnArray.length; idx++) {
    switch (opList.fnArray[idx]) {
      case OPS.setFillRGBColor:
        fill = argsToHex(opList.argsArray[idx]);
        break;
      case OPS.setStrokeRGBColor:
        stroke = argsToHex(opList.argsArray[idx]);
        break;
      case OPS.setTextRenderingMode:
        renderMode = opList.argsArray[idx][0];
        break;
      // Every text-drawing operator contributes exactly one entry.
      case OPS.showText:
      case OPS.nextLineShowText:
      case OPS.nextLineSetSpacingShowText:
      case OPS.showSpacedText:
        collected.push(visibleColor(fill, stroke, renderMode));
        break;
      default:
        break;
    }
  }

  return collected;
}
293
+
294
/**
 * Convert color operator args to a `#rrggbb` hex string.
 *
 * Args may already carry a hex string in their first slot (returned as-is),
 * or be an array-like of RGB byte values. Each channel is truncated to an
 * integer and clamped to 0..255 so the result is always exactly six hex
 * digits; missing or non-numeric channels count as 0.
 *
 * @param {ArrayLike<number|string>|null|undefined} args - Operator arguments.
 * @returns {string} Hex color string; "#000000" when `args` is null/undefined.
 */
function argsToHex(args) {
  if (args == null) return "#000000";

  const first = args[0];
  if (typeof first === "string" && first.startsWith("#")) return first;

  // Without clamping, out-of-range values yield malformed strings
  // (e.g. 300 -> "12c", -5 -> "-5").
  const toHexByte = (value) => {
    const n = Math.trunc(Number(value));
    const byte = Number.isFinite(n) ? Math.min(255, Math.max(0, n)) : 0;
    return byte.toString(16).padStart(2, "0");
  };

  return `#${toHexByte(args[0])}${toHexByte(args[1])}${toHexByte(args[2])}`;
}
300
+
301
/**
 * Choose which color is actually seen for a given text rendering mode.
 * Only the lower two bits of the mode are considered:
 * 0 = fill, 1 = stroke, 2 = fill + stroke, 3 = invisible.
 *
 * @param {string} fill - Current fill color.
 * @param {string} stroke - Current stroke color.
 * @param {number} mode - Text rendering mode.
 * @returns {string} `stroke` for mode 1, black for mode 3, otherwise `fill`.
 */
function visibleColor(fill, stroke, mode) {
  switch (mode & 3) {
    case 1:
      return stroke;
    case 3:
      // Invisible text is shown as black in reflow.
      return "#000000";
    default:
      // Modes 0 (fill) and 2 (fill + stroke).
      return fill;
  }
}
220
308
 
221
309
  // ─── Page analysis ────────────────────────────────────────────────────────
222
310
 
311
/**
 * Compute the vertical-gap multiplier (in units of font size) used when
 * grouping text items into blocks.
 *
 * Gaps are normalised by `fontSize`; ratios of 5 or more are ignored. The
 * 60th-percentile ratio is taken as the threshold and clamped to [1.5, 2.2].
 *
 * @param {number[]} gaps - Vertical distances between consecutive items.
 * @param {number} fontSize - Font size used to normalise the gaps.
 * @returns {number} Multiplier of the font size; 2.0 when fewer than three
 *   usable gaps are available.
 */
function findBlockThreshold(gaps, fontSize) {
  const DEFAULT_THRESHOLD = 2.0;
  const LOWER_BOUND = 1.5;
  const UPPER_BOUND = 2.2;

  if (gaps.length < 3) return DEFAULT_THRESHOLD;

  // Normalise and drop extreme outliers in one pass.
  const ratios = [];
  for (const gap of gaps) {
    const ratio = gap / fontSize;
    if (ratio < 5) ratios.push(ratio);
  }
  if (ratios.length < 3) return DEFAULT_THRESHOLD;

  ratios.sort((a, b) => a - b);

  // 60th percentile, guarded against running off the end of the array.
  const lastIdx = ratios.length - 1;
  const percentileIdx = Math.min(Math.floor(ratios.length * 0.6), lastIdx);
  const picked = ratios[percentileIdx];

  return Math.max(LOWER_BOUND, Math.min(picked, UPPER_BOUND));
}
334
+
223
335
  /**
224
336
  * Group adjacent text items into text blocks by proximity.
225
337
  * Also extracts font metadata: average size, italic, bold.
226
338
  */
227
- function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
228
- // Map text items to beginText/endText blocks by detecting position
229
- // discontinuities. Items within the same text block are contiguous and
230
- // share the same color. When there's a large Y-position jump or font
231
- // change, we advance to the next block's color.
232
- if (blockColors && blockColors.length > 0) {
233
- let blockIdx = 0;
234
- let prevY = null;
235
- let prevFontName = null;
236
- let itemsInCurrentBlock = 0;
237
-
238
- for (const item of textItems) {
239
- if (item.str === undefined) continue; // skip marked content
240
-
241
- const y = item.transform ? item.transform[5] : null;
242
- const fontHeight = item.transform
243
- ? Math.hypot(item.transform[2], item.transform[3])
244
- : 12;
245
-
246
- // Detect text block boundary by position discontinuity
247
- if (prevY !== null && y !== null && itemsInCurrentBlock > 0) {
248
- const yDiff = Math.abs(y - prevY);
249
- const fontChanged = item.fontName !== prevFontName;
250
-
251
- if (
252
- (yDiff > fontHeight * 3) ||
253
- (fontChanged && yDiff > fontHeight * 0.5)
254
- ) {
255
- blockIdx = Math.min(blockIdx + 1, blockColors.length - 1);
256
- itemsInCurrentBlock = 0;
257
- }
258
- }
259
-
260
- item._color = blockColors[blockIdx] || "#000000";
261
- itemsInCurrentBlock++;
262
- prevY = y;
263
- prevFontName = item.fontName;
339
+ function groupTextBlocks(textItems, pageHeight, styles, fontMap, textRuns) {
340
+ // Assign colors to text items by matching content from textRuns.
341
+ // textRuns is an array of {text, color} extracted from the operator list.
342
+ if (textRuns && textRuns.length > 0) {
343
+ const colors = matchColorsToTextItems(textItems, textRuns);
344
+ for (let i = 0; i < textItems.length; i++) {
345
+ const item = textItems[i];
346
+ if (item.str === undefined || !item.str.trim()) continue;
347
+ item._color = colors[i] || "#000000";
264
348
  }
265
349
  }
266
350
 
@@ -271,6 +355,23 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
271
355
  return a.transform[4] - b.transform[4];
272
356
  });
273
357
 
358
+ // First pass: collect all vertical gaps to compute adaptive block threshold
359
+ const gaps = [];
360
+ let lastY = null;
361
+ let lastFontSize = 12;
362
+ for (const item of sorted) {
363
+ const y = pageHeight - item.transform[5];
364
+ const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
365
+ if (fontHeight > 0) lastFontSize = fontHeight;
366
+ if (lastY !== null) {
367
+ gaps.push(Math.abs(y - lastY));
368
+ }
369
+ lastY = y;
370
+ }
371
+
372
+ // Compute adaptive block grouping threshold
373
+ const blockThreshold = findBlockThreshold(gaps, lastFontSize);
374
+
274
375
  const blocks = [];
275
376
  let current = null;
276
377
 
@@ -315,9 +416,10 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
315
416
  continue;
316
417
  }
317
418
 
419
+ // Use adaptive block threshold instead of fixed 2.5x
318
420
  if (
319
421
  sizeOk &&
320
- verticalGap < lastFH * 2.5 &&
422
+ verticalGap < lastFH * blockThreshold &&
321
423
  x < current.bbox.x + current.bbox.w + lastFH * 1.5
322
424
  ) {
323
425
  current.items.push(item);
@@ -337,7 +439,7 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
337
439
  if (current) blocks.push(current);
338
440
 
339
441
  // Post-process: merge orphan tiny blocks (superscripts, markers like *, +, #)
340
- // into the nearest larger block if vertically close
442
+ // into the nearest larger block if vertically close AND horizontally aligned
341
443
  for (let i = blocks.length - 1; i >= 0; i--) {
342
444
  const block = blocks[i];
343
445
  if (block.items.length > 2) continue;
@@ -354,6 +456,15 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
354
456
  // Check vertical proximity: orphan center within 30pt of target block
355
457
  const bcy = block.bbox.y + block.bbox.h / 2;
356
458
  if (bcy < o.bbox.y - 30 || bcy > o.bbox.y + o.bbox.h + 30) continue;
459
+ // Horizontal center alignment check - must be roughly in same column
460
+ const bcx = block.bbox.x + block.bbox.w / 2;
461
+ const ocx = o.bbox.x + o.bbox.w / 2;
462
+ const hCenterDist = Math.abs(bcx - ocx);
463
+ // Must have significant horizontal overlap or be in same column
464
+ const xOverlap = Math.max(0, Math.min(block.bbox.x + block.bbox.w, o.bbox.x + o.bbox.w) -
465
+ Math.max(block.bbox.x, o.bbox.x));
466
+ const inSameColumn = hCenterDist < Math.max(block.bbox.w, o.bbox.w) * 0.8 || xOverlap > 0;
467
+ if (!inSameColumn) continue;
357
468
  // Horizontal edge-to-edge distance (0 if overlapping)
358
469
  const hDist = Math.max(0,
359
470
  block.bbox.x > o.bbox.x + o.bbox.w ? block.bbox.x - (o.bbox.x + o.bbox.w) :
@@ -377,6 +488,83 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
377
488
  }
378
489
  }
379
490
 
491
+ // Post-process: detect multi-column grids (like author sections)
492
+ // Group blocks that form aligned columns into a single composite block
493
+ const multiColumnBlocks = [];
494
+ const processed = new Set();
495
+
496
+ for (let i = 0; i < blocks.length; i++) {
497
+ if (processed.has(i)) continue;
498
+ const block = blocks[i];
499
+ const blockText = block.items.map(it => (it.str || "").trim()).join(" ");
500
+ const blockCenterX = block.bbox.x + block.bbox.w / 2;
501
+
502
+ // Find all blocks in same horizontal band (similar Y position)
503
+ const sameRowBlocks = [block];
504
+ const rowY = block.bbox.y;
505
+ const rowH = block.bbox.h;
506
+
507
+ for (let j = i + 1; j < blocks.length; j++) {
508
+ if (processed.has(j)) continue;
509
+ const other = blocks[j];
510
+ // Check if in same row (vertical overlap)
511
+ const yOverlap = Math.max(0, Math.min(rowY + rowH, other.bbox.y + other.bbox.h) - Math.max(rowY, other.bbox.y));
512
+ const minH = Math.min(rowH, other.bbox.h);
513
+ if (yOverlap > minH * 0.5) {
514
+ sameRowBlocks.push(other);
515
+ }
516
+ }
517
+
518
+ // If we have multiple blocks in same row, this might be a multi-column layout
519
+ if (sameRowBlocks.length >= 2) {
520
+ // Sort by X position
521
+ sameRowBlocks.sort((a, b) => a.bbox.x - b.bbox.x);
522
+ // Check if they're roughly aligned (similar height, spaced evenly)
523
+ const avgH = sameRowBlocks.reduce((s, b) => s + b.bbox.h, 0) / sameRowBlocks.length;
524
+ const heightsOk = sameRowBlocks.every(b => Math.abs(b.bbox.h - avgH) < avgH * 0.5);
525
+
526
+ if (heightsOk) {
527
+ // Merge into a single composite block that preserves multi-column info
528
+ const allItems = [];
529
+ for (const b of sameRowBlocks) {
530
+ allItems.push(...b.items);
531
+ processed.add(blocks.indexOf(b));
532
+ }
533
+ // Sort items by Y then X to maintain reading order within the grid
534
+ allItems.sort((a, b) => {
535
+ const ay = pageHeight - a.transform[5];
536
+ const by = pageHeight - b.transform[5];
537
+ if (Math.abs(ay - by) > 2) return ay - by;
538
+ return a.transform[4] - b.transform[4];
539
+ });
540
+
541
+ const bbox = {
542
+ x: Math.min(...sameRowBlocks.map(b => b.bbox.x)),
543
+ y: Math.min(...sameRowBlocks.map(b => b.bbox.y)),
544
+ w: Math.max(...sameRowBlocks.map(b => b.bbox.x + b.bbox.w)) - Math.min(...sameRowBlocks.map(b => b.bbox.x)),
545
+ h: Math.max(...sameRowBlocks.map(b => b.bbox.y + b.bbox.h)) - Math.min(...sameRowBlocks.map(b => b.bbox.y))
546
+ };
547
+
548
+ multiColumnBlocks.push({
549
+ items: allItems,
550
+ bbox,
551
+ isMultiColumn: true,
552
+ columnCount: sameRowBlocks.length
553
+ });
554
+ continue;
555
+ }
556
+ }
557
+
558
+ if (!processed.has(i)) {
559
+ multiColumnBlocks.push(block);
560
+ processed.add(i);
561
+ }
562
+ }
563
+
564
+ // Replace blocks with multi-column merged version
565
+ blocks.length = 0;
566
+ blocks.push(...multiColumnBlocks);
567
+
380
568
  // Compute font metadata per block using real font objects from commonObjs
381
569
  for (const block of blocks) {
382
570
  const sizes = [];
@@ -408,9 +596,7 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
408
596
  const colorFreq = {};
409
597
  for (const item of block.items) {
410
598
  const c = item._color || "#000000";
411
- if (c !== "transparent") {
412
- colorFreq[c] = (colorFreq[c] || 0) + 1;
413
- }
599
+ colorFreq[c] = (colorFreq[c] || 0) + 1;
414
600
  }
415
601
  let dominantColor = "#000000";
416
602
  let maxColorFreq = 0;
@@ -451,8 +637,7 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
451
637
 
452
638
  /**
453
639
  * Extract graphic regions from the page operator list.
454
- * Only captures image operators (paintImageXObject etc).
455
- * Skips path/fill/stroke to avoid false positives from text decorations.
640
+ * Captures images and horizontal divider lines (thin rectangles).
456
641
  */
457
642
  function extractGraphicRegions(opList, OPS) {
458
643
  const regions = [];
@@ -509,6 +694,23 @@ function extractGraphicRegions(opList, OPS) {
509
694
  bbox: { x: minX, y: minY, w: maxX - minX, h: maxY - minY },
510
695
  });
511
696
  }
697
+ } else if (fn === OPS.rectangle) {
698
+ // Check for thin horizontal lines (dividers)
699
+ const [x, y, w, h] = args;
700
+ if (w > 100 && h > 0.5 && h < 5) {
701
+ const corners = [
702
+ transformPoint(x, y),
703
+ transformPoint(x + w, y),
704
+ transformPoint(x, y + h),
705
+ transformPoint(x + w, y + h),
706
+ ];
707
+ const xs = corners.map(c => c[0]);
708
+ const ys = corners.map(c => c[1]);
709
+ regions.push({
710
+ type: "divider",
711
+ bbox: { x: Math.min(...xs), y: Math.min(...ys), w: Math.max(...xs) - Math.min(...xs), h: Math.max(...ys) - Math.min(...ys) },
712
+ });
713
+ }
512
714
  }
513
715
  }
514
716
 
@@ -601,37 +803,213 @@ function detectGraphicRegionsFromRender(offCanvas, textBlocks, renderScale) {
601
803
  return regions;
602
804
  }
603
805
 
806
/**
 * Pick an adaptive paragraph threshold (as a multiple of font size) from the
 * vertical gaps observed inside a block.
 *
 * The gaps are converted to font-size ratios (ratios of 5 or more are treated
 * as outliers and ignored) and sorted. The function then searches for the
 * "elbow": the largest step between two neighbouring ratios that exceeds 0.25
 * and starts above 0.8. The midpoint of that step separates line gaps from
 * paragraph gaps. When no such step exists, the 75th-percentile ratio is used.
 * The result is always clamped to [1.25, 2.2].
 *
 * @param {number[]} gaps - Vertical distances between consecutive items.
 * @param {number} fontSize - Font size used to normalise the gaps.
 * @returns {number} Threshold multiplier; 1.8 when there are fewer than three
 *   usable gaps.
 */
function findParagraphThreshold(gaps, fontSize) {
  const DEFAULT_RATIO = 1.8;
  if (gaps.length < 3) return DEFAULT_RATIO;

  // Normalise by font size and skip extreme outliers.
  const ratios = [];
  for (const gap of gaps) {
    const ratio = gap / fontSize;
    if (ratio < 5) ratios.push(ratio);
  }
  if (ratios.length < 3) return DEFAULT_RATIO;
  ratios.sort((a, b) => a - b);

  // Scan neighbouring pairs for the biggest qualifying step (first one wins ties).
  let bestStep = 0;
  let candidate = DEFAULT_RATIO;
  for (let hi = 1; hi < ratios.length; hi++) {
    const below = ratios[hi - 1];
    const above = ratios[hi];
    const step = above - below;
    const qualifies = step > bestStep && step > 0.25 && below > 0.8;
    if (qualifies) {
      bestStep = step;
      candidate = (below + above) / 2;
    }
  }

  // Without a clear elbow, fall back to the 75th percentile.
  if (bestStep < 0.2) {
    const percentileIdx = Math.floor(ratios.length * 0.75);
    candidate = ratios[Math.min(percentileIdx, ratios.length - 1)];
  }

  return Math.max(1.25, Math.min(candidate, 2.2));
}
845
+
604
846
/**
 * Build text for a multi-column block (e.g. an author grid), producing one
 * output line per visual row.
 *
 * Steps:
 *  1. Bucket items into rows by top-down Y position (2-unit granularity).
 *  2. A row of two or more items that are all short (<= 3 trimmed characters)
 *     is treated as a row of markers and folded into the next row: each item
 *     of that next row gets the closest marker (by horizontal centre, within
 *     50 units) appended to its text.
 *  3. Items of each row are joined left to right; a space is inserted when the
 *     horizontal gap exceeds 0.3x the font height, except between a long item
 *     and a short item that follows it.
 *
 * No text is dropped: a marker row that is followed by another marker row is
 * emitted as its own line, and markers that no item claimed are kept in the
 * merged row at their own X position.
 *
 * @param {{items: Array<Object>, avgFontSize?: number}} block - Block whose
 *   items carry `str`, `transform` and optionally `width`.
 * @param {number} pageHeight - Page height used to flip Y to top-down order.
 * @returns {string} Row texts joined with "\n".
 */
function blockToTextMultiColumn(block, pageHeight) {
  const fontHeight = block.avgFontSize || 12;
  const byX = (a, b) => a.transform[4] - b.transform[4];
  const centerX = (it) => it.transform[4] + (it.width || 0) / 2;

  // Group items by row (2-unit granularity on the flipped Y coordinate).
  const rows = new Map();
  for (const item of block.items) {
    if (!item.str) continue;
    const y = pageHeight - item.transform[5];
    const rowKey = Math.round(y / 2) * 2;
    if (!rows.has(rowKey)) rows.set(rowKey, []);
    rows.get(rowKey).push(item);
  }

  const sortedKeys = Array.from(rows.keys()).sort((a, b) => a - b);

  // Each entry is the list of items (left to right) forming one output line.
  const mergedRows = [];
  let pendingMarkers = null;

  for (const rowKey of sortedKeys) {
    const rowItems = rows.get(rowKey).sort(byX);
    const isMarkerRow =
      rowItems.length >= 2 &&
      rowItems.every((it) => (it.str || "").trim().length <= 3);

    if (isMarkerRow) {
      // A second marker row in a row must not overwrite the first one;
      // flush the earlier row as its own line so its text is not lost.
      if (pendingMarkers) mergedRows.push(pendingMarkers);
      pendingMarkers = rowItems;
      continue;
    }

    if (!pendingMarkers) {
      mergedRows.push(rowItems);
      continue;
    }

    // Attach to every item the closest pending marker (within 50 units).
    const claimed = new Set();
    const mergedItems = rowItems.map((item) => {
      const itemCenter = centerX(item);
      let closestMarker = null;
      let minDist = Infinity;
      for (const marker of pendingMarkers) {
        const dist = Math.abs(centerX(marker) - itemCenter);
        if (dist < minDist) {
          minDist = dist;
          closestMarker = marker;
        }
      }
      if (closestMarker && minDist < 50) {
        claimed.add(closestMarker);
        return { ...item, str: item.str + closestMarker.str };
      }
      return item;
    });

    // Markers nobody claimed stay in the row instead of being discarded.
    for (const marker of pendingMarkers) {
      if (!claimed.has(marker)) mergedItems.push(marker);
    }

    mergedItems.sort(byX);
    mergedRows.push(mergedItems);
    pendingMarkers = null;
  }

  // A trailing marker row has nothing to merge into; emit it as-is.
  if (pendingMarkers) mergedRows.push(pendingMarkers);

  // Build output lines.
  const lines = mergedRows.map((rowItems) => {
    let lineText = "";
    let lastX = null;
    let lastW = 0;
    let lastItemLen = 0;

    for (const item of rowItems) {
      const currentX = item.transform[4];
      const currentItemLen = (item.str || "").trim().length;
      const isShortItem = currentItemLen <= 3;

      if (lastX !== null) {
        const hGap = currentX - (lastX + lastW);
        const prevWasLong = lastItemLen > 2;
        // Add a column separator, but not between a long item and a short
        // item that follows it (the short item is treated as a marker).
        if (hGap > fontHeight * 0.3 && (!prevWasLong || !isShortItem)) {
          lineText += " ";
        }
      }

      lineText += item.str;
      lastX = currentX;
      lastW = item.width || fontHeight * 0.5;
      lastItemLen = currentItemLen;
    }
    return lineText.trim();
  });

  return lines.join("\n");
}
946
+
607
947
  function blockToText(block, pageHeight) {
608
- let result = "";
948
+ // Special handling for multi-column blocks (like author grids)
949
+ if (block.isMultiColumn && block.columnCount >= 2) {
950
+ return blockToTextMultiColumn(block, pageHeight);
951
+ }
952
+
953
+ // First pass: collect all gaps and font sizes to compute adaptive threshold
954
+ const gaps = [];
609
955
  let lastY = null;
610
- let lastX = null;
611
- let lastW = 0;
612
956
  let lastFontSize = 12;
613
957
 
614
958
  for (const item of block.items) {
615
959
  if (!item.str) continue;
616
- const currentX = item.transform[4];
617
960
  const currentY = pageHeight - item.transform[5];
618
961
  const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
619
962
  if (fontHeight > 0) lastFontSize = fontHeight;
620
963
 
621
964
  if (lastY !== null) {
622
965
  const vGap = Math.abs(currentY - lastY);
623
- const isShortItem = (item.str || "").trim().length <= 2;
624
- if (vGap > lastFontSize * 1.8 && !isShortItem) {
966
+ gaps.push(vGap);
967
+ }
968
+ lastY = currentY;
969
+ }
970
+
971
+ // Compute adaptive paragraph threshold
972
+ const paraThreshold = findParagraphThreshold(gaps, lastFontSize);
973
+ const lineThreshold = lastFontSize * 0.3; // Keep fixed line threshold
974
+
975
+ // Second pass: build text with adaptive threshold
976
+ let result = "";
977
+ lastY = null;
978
+ let lastX = null;
979
+ let lastW = 0;
980
+ let lastItemLen = 0; // Track length of previous item for marker detection
981
+
982
+ for (const item of block.items) {
983
+ if (!item.str) continue;
984
+ const currentX = item.transform[4];
985
+ const currentY = pageHeight - item.transform[5];
986
+ const currentItemLen = (item.str || "").trim().length;
987
+ // Short items are typically footnote markers (*, †, ‡, #, etc.)
988
+ // Allow up to 3 chars to handle combined markers like "* †"
989
+ const isShortItem = currentItemLen <= 3;
990
+
991
+ if (lastY !== null) {
992
+ const vGap = Math.abs(currentY - lastY);
993
+
994
+ // Use adaptive threshold for paragraph detection
995
+ if (vGap > lastFontSize * paraThreshold && !isShortItem) {
625
996
  result += "\n\n";
626
- } else if (vGap > lastFontSize * 0.3) {
997
+ } else if (vGap > lineThreshold) {
627
998
  // Different line — insert space
628
- if (!result.endsWith(" ") && !result.endsWith("\n")) {
629
- result += " ";
999
+ // But skip space if previous item was long and current is short (footnote marker)
1000
+ // This handles superscript markers like *, +, #, †, ‡
1001
+ const prevWasLong = lastItemLen > 2;
1002
+ if (!prevWasLong || !isShortItem) {
1003
+ if (!result.endsWith(" ") && !result.endsWith("\n")) {
1004
+ result += " ";
1005
+ }
630
1006
  }
631
1007
  } else if (lastX !== null) {
632
1008
  // Same line — check horizontal gap between items
633
1009
  const hGap = currentX - (lastX + lastW);
634
- if (hGap > lastFontSize * 0.15) {
1010
+ // Skip adding space before short items (superscript markers like *, +, #, $)
1011
+ // These are usually footnote markers that should attach directly to preceding text
1012
+ if (hGap > lastFontSize * 0.15 && !isShortItem) {
635
1013
  if (!result.endsWith(" ") && !result.endsWith("\n")) {
636
1014
  result += " ";
637
1015
  }
@@ -641,6 +1019,7 @@ function blockToText(block, pageHeight) {
641
1019
  lastY = currentY;
642
1020
  lastX = currentX;
643
1021
  lastW = item.width || 0;
1022
+ lastItemLen = currentItemLen;
644
1023
  result += item.str;
645
1024
  }
646
1025
  return result.trim();
@@ -916,11 +1295,11 @@ async function analyzePage(page, OPS) {
916
1295
  // Extract real font metadata from commonObjs (bold, italic, weight, loadedName)
917
1296
  const fontMap = await extractFontMetadata(page, opList, OPS);
918
1297
 
919
- // Extract text colors per beginText/endText block (not per operator)
920
- const blockColors = extractTextBlockColors(opList, OPS);
1298
+ // Extract text with colors from operator list
1299
+ const textRuns = extractTextWithColors(opList, OPS);
921
1300
 
922
- // Now group text blocks with real font data and block colors
923
- const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap, blockColors);
1301
+ // Now group text blocks with real font data and matched colors
1302
+ const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap, textRuns);
924
1303
 
925
1304
  // Compute body font size (most common size = body text)
926
1305
  const allSizes = textBlocks.map(b => Math.round(b.avgFontSize * 10) / 10);
@@ -1050,6 +1429,13 @@ function reflowAndComposite(analysis, opts) {
1050
1429
  colorSpans: block.colorSpans || [],
1051
1430
  region,
1052
1431
  });
1432
+ } else if (region.type === "divider") {
1433
+ // Horizontal divider line
1434
+ reflowedRegions.push({
1435
+ type: "divider",
1436
+ height: 4, // Small height for the divider line area
1437
+ region,
1438
+ });
1053
1439
  } else {
1054
1440
  // Graphic
1055
1441
  const bitmap = bitmaps.get(region);
@@ -1307,6 +1693,23 @@ export function createReflowRenderer(container, options = {}) {
1307
1693
  lineCharOffset += line.text.length;
1308
1694
  cursorY += lh;
1309
1695
  }
1696
+ } else if (r.type === "divider") {
1697
+ // Draw horizontal divider line
1698
+ const screenY = cursorY - scrollY + 1; // Slight offset to center in area
1699
+ if (screenY > -10 && screenY < H + 10) {
1700
+ const lineWidth = Math.min(400, W - padding * 2); // Max 400px or fit with padding
1701
+ const startX = (W - lineWidth) / 2; // Center the line
1702
+ ctx.save();
1703
+ ctx.strokeStyle = textColor;
1704
+ ctx.globalAlpha = 0.3;
1705
+ ctx.lineWidth = 1 * d;
1706
+ ctx.beginPath();
1707
+ ctx.moveTo(startX * d, screenY * d);
1708
+ ctx.lineTo((startX + lineWidth) * d, screenY * d);
1709
+ ctx.stroke();
1710
+ ctx.restore();
1711
+ }
1712
+ cursorY += r.height;
1310
1713
  } else if (r.type === "graphic" && r.bitmap) {
1311
1714
  const screenY = cursorY - scrollY;
1312
1715
  if (screenY > -r.drawH && screenY < H + r.drawH) {