pretext-pdfjs 0.3.3 → 0.3.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/pinch.js +56 -4
- package/src/reflow.js +506 -103
package/package.json
CHANGED
package/src/pinch.js
CHANGED
|
@@ -392,6 +392,37 @@ export function createPDFPinchReader(container, options = {}) {
|
|
|
392
392
|
}
|
|
393
393
|
}
|
|
394
394
|
|
|
395
|
+
/**
 * Derive an adaptive paragraph-break multiplier from the observed gaps.
 *
 * Each gap is expressed as a multiple of `fontSize`. Multiples of 5 or more
 * are discarded as outliers. The widest jump between neighbouring sorted
 * multiples (larger than 0.25 and starting above 0.8) marks the split point;
 * when no such jump exists the 75th-percentile multiple is used instead.
 *
 * @param {number[]} gaps - Vertical distances between consecutive text items.
 * @param {number} fontSize - Font height used to normalise the gaps.
 * @returns {number} Font-size multiplier clamped to [1.25, 2.2]; 1.8 when
 *   fewer than three usable gaps are available.
 */
function findParagraphThreshold(gaps, fontSize) {
  const FALLBACK = 1.8;
  if (gaps.length < 3) return FALLBACK;

  // Normalise and drop outliers in a single pass.
  const ratios = [];
  for (const g of gaps) {
    const ratio = g / fontSize;
    if (ratio < 5) ratios.push(ratio);
  }
  if (ratios.length < 3) return FALLBACK;
  ratios.sort((a, b) => a - b);

  // Scan neighbouring pairs for the widest qualifying jump.
  let widestJump = 0;
  let cut = FALLBACK;
  let k = 0;
  while (k < ratios.length - 1) {
    const lower = ratios[k];
    const upper = ratios[k + 1];
    const jump = upper - lower;
    if (jump > widestJump && jump > 0.25 && lower > 0.8) {
      widestJump = jump;
      cut = (lower + upper) / 2;
    }
    k += 1;
  }

  // No clear split: fall back to the 75th percentile.
  if (widestJump < 0.2) {
    const pos = Math.min(Math.floor(ratios.length * 0.75), ratios.length - 1);
    cut = ratios[pos];
  }

  return Math.min(Math.max(cut, 1.25), 2.2);
}
|
|
425
|
+
|
|
395
426
|
/**
|
|
396
427
|
* Extract plain text from a PDF page.
|
|
397
428
|
* Joins text items with spaces, preserves paragraph breaks.
|
|
@@ -400,11 +431,32 @@ export function createPDFPinchReader(container, options = {}) {
|
|
|
400
431
|
const page = await pdfDoc.getPage(pageNum);
|
|
401
432
|
const content = await page.getTextContent();
|
|
402
433
|
|
|
403
|
-
//
|
|
404
|
-
|
|
434
|
+
// First pass: collect all gaps to compute adaptive threshold
|
|
435
|
+
const gaps = [];
|
|
405
436
|
let lastY = null;
|
|
406
437
|
let lastFontSize = 12;
|
|
407
438
|
|
|
439
|
+
for (const item of content.items) {
|
|
440
|
+
if (!item.str || !item.transform) continue;
|
|
441
|
+
|
|
442
|
+
const currentY = item.transform[5];
|
|
443
|
+
const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
|
|
444
|
+
if (fontHeight > 0) lastFontSize = fontHeight;
|
|
445
|
+
|
|
446
|
+
if (lastY !== null) {
|
|
447
|
+
gaps.push(Math.abs(currentY - lastY));
|
|
448
|
+
}
|
|
449
|
+
lastY = currentY;
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
// Compute adaptive threshold
|
|
453
|
+
const paraThreshold = findParagraphThreshold(gaps, lastFontSize);
|
|
454
|
+
const lineThreshold = lastFontSize * 0.3;
|
|
455
|
+
|
|
456
|
+
// Second pass: build text with adaptive threshold
|
|
457
|
+
let result = "";
|
|
458
|
+
lastY = null;
|
|
459
|
+
|
|
408
460
|
for (const item of content.items) {
|
|
409
461
|
if (!item.str) continue;
|
|
410
462
|
|
|
@@ -415,10 +467,10 @@ export function createPDFPinchReader(container, options = {}) {
|
|
|
415
467
|
|
|
416
468
|
if (lastY !== null) {
|
|
417
469
|
const gap = Math.abs(currentY - lastY);
|
|
418
|
-
if (gap > lastFontSize *
|
|
470
|
+
if (gap > lastFontSize * paraThreshold) {
|
|
419
471
|
// Paragraph break
|
|
420
472
|
result += "\n\n";
|
|
421
|
-
} else if (gap >
|
|
473
|
+
} else if (gap > lineThreshold) {
|
|
422
474
|
// Line break within paragraph — add space
|
|
423
475
|
if (!result.endsWith(" ") && !result.endsWith("\n")) {
|
|
424
476
|
result += " ";
|
package/src/reflow.js
CHANGED
|
@@ -64,7 +64,7 @@ function drawColoredLine(ctx, text, charOffset, spans, defaultColor, x, y) {
|
|
|
64
64
|
}
|
|
65
65
|
|
|
66
66
|
const spanText = text.slice(overlapStart, overlapEnd);
|
|
67
|
-
ctx.fillStyle = span.color;
|
|
67
|
+
ctx.fillStyle = span.color === "transparent" ? defaultColor : span.color;
|
|
68
68
|
ctx.fillText(spanText, xPos, y);
|
|
69
69
|
xPos += ctx.measureText(spanText).width;
|
|
70
70
|
pos = overlapEnd;
|
|
@@ -156,111 +156,195 @@ async function extractFontMetadata(page, opList, OPS) {
|
|
|
156
156
|
// ─── Text color extraction ───────────────────────────────────────────────
|
|
157
157
|
|
|
158
158
|
/**
 * Extract text runs with their visible color from the operator list.
 *
 * Walks the operators in order, tracking the current fill color, stroke
 * color and text rendering mode. Every text-showing operator whose argument
 * yields at least one unicode glyph produces one `{text, color}` entry, where
 * `color` is picked by `visibleColor()` from the tracked state.
 *
 * The glyph argument is flattened recursively, so both a flat list of glyph
 * objects mixed with spacing numbers and a list of nested glyph arrays are
 * accepted, in any interleaving. Entries that are not glyph objects with a
 * truthy `unicode` (numbers, null, etc.) are ignored.
 *
 * @param {{fnArray: Array, argsArray: Array}} opList - Page operator list.
 * @param {Object} OPS - Operator code table (reads setFillRGBColor,
 *   setStrokeRGBColor, setTextRenderingMode, showText, nextLineShowText,
 *   nextLineSetSpacingShowText, showSpacedText).
 * @returns {Array<{text: string, color: string}>} Text runs in operator order,
 *   intended to be matched against getTextContent() items.
 */
function extractTextWithColors(opList, OPS) {
  const textRuns = []; // {text, color}
  let fillColor = "#000000";
  let strokeColor = "#000000";
  let textRenderingMode = 0;

  // Concatenate the unicode of every glyph object, descending into nested
  // arrays. Spacing numbers and other non-glyph entries contribute nothing.
  function glyphsToText(glyphs) {
    if (!Array.isArray(glyphs)) return "";
    let text = "";
    for (const g of glyphs) {
      if (Array.isArray(g)) {
        text += glyphsToText(g);
      } else if (g && typeof g === "object" && g.unicode) {
        text += g.unicode;
      }
    }
    return text;
  }

  for (let i = 0; i < opList.fnArray.length; i++) {
    const fn = opList.fnArray[i];
    const args = opList.argsArray[i];

    if (fn === OPS.setFillRGBColor) {
      fillColor = argsToHex(args);
    } else if (fn === OPS.setStrokeRGBColor) {
      strokeColor = argsToHex(args);
    } else if (fn === OPS.setTextRenderingMode) {
      textRenderingMode = args[0];
    } else if (
      fn === OPS.showText ||
      fn === OPS.nextLineShowText ||
      fn === OPS.nextLineSetSpacingShowText ||
      fn === OPS.showSpacedText
    ) {
      // All text-showing operators share one path: the flattener copes with
      // whichever glyph/spacing layout the operator carries.
      const text = glyphsToText(args[0]);
      if (text) {
        textRuns.push({ text, color: visibleColor(fillColor, strokeColor, textRenderingMode) });
      }
    }
  }

  return textRuns;
}
|
|
212
|
+
|
|
213
|
+
/**
 * Assign a color to every text item by matching its content against runs.
 *
 * Items are visited in order while a cursor moves forward through
 * `textRuns`. An item takes the color of the first run at or after the cursor
 * whose trimmed text equals the item's trimmed text, or where one is a prefix
 * of the other; the cursor then moves just past that run. Once the cursor
 * has run off the end it restarts from the first run.
 *
 * @param {Array<{str?: string}>} textItems - Items from getTextContent().
 * @param {Array<{text: string, color: string}>} textRuns - Colored runs.
 * @returns {Array<string|null>} One entry per item: the matched color,
 *   "#000000" when nothing matched, or null for items whose `str` is
 *   undefined or whitespace-only.
 */
function matchColorsToTextItems(textItems, textRuns) {
  let cursor = 0;

  return Array.from(textItems, (item) => {
    // Marked-content and blank items carry no color.
    if (item.str === undefined || !item.str.trim()) return null;

    const wanted = item.str.trim();

    // Wrap around once every run has been consumed.
    if (cursor >= textRuns.length) cursor = 0;

    let k = cursor;
    while (k < textRuns.length) {
      const candidate = textRuns[k].text.trim();
      const isMatch =
        candidate !== "" &&
        (candidate === wanted ||
          wanted.startsWith(candidate) ||
          candidate.startsWith(wanted));
      if (isMatch) {
        cursor = k + 1; // the next item resumes after this run
        return textRuns[k].color;
      }
      k += 1;
    }

    return "#000000"; // default when no run matched
  });
}
|
|
259
|
+
|
|
260
|
+
/**
 * Collect the visible color in effect at each text-drawing operator.
 *
 * DEPRECATED: Use extractTextWithColors + matchColorsToTextItems instead.
 * The result has one entry per text-drawing operator, which only roughly
 * lines up with the items returned by getTextContent().
 *
 * @param {{fnArray: Array, argsArray: Array}} opList - Page operator list.
 * @param {Object} OPS - Operator code table.
 * @returns {string[]} Visible colors, in operator order.
 */
function extractTextItemColors(opList, OPS) {
  const paint = { fill: "#000000", stroke: "#000000", mode: 0 };
  const itemColors = []; // one entry per text-drawing operator

  for (const [idx, fn] of opList.fnArray.entries()) {
    switch (fn) {
      case OPS.setFillRGBColor:
        paint.fill = argsToHex(opList.argsArray[idx]);
        break;
      case OPS.setStrokeRGBColor:
        paint.stroke = argsToHex(opList.argsArray[idx]);
        break;
      case OPS.setTextRenderingMode:
        paint.mode = opList.argsArray[idx][0];
        break;
      case OPS.showText:
      case OPS.nextLineShowText:
      case OPS.nextLineSetSpacingShowText:
      case OPS.showSpacedText:
        itemColors.push(visibleColor(paint.fill, paint.stroke, paint.mode));
        break;
      default:
        break;
    }
  }

  return itemColors;
}
|
|
293
|
+
|
|
294
|
+
/**
 * Convert color operator args to a `#rrggbb` hex string.
 *
 * Args may already hold a hex string in the first slot (returned unchanged)
 * or three numeric RGB channels. Channels are truncated to integers and
 * clamped to 0–255 so the result is always exactly six hex digits; missing
 * or non-numeric channels count as 0.
 *
 * @param {ArrayLike<number|string>|null|undefined} args - Operator arguments.
 * @returns {string} Hex color; "#000000" when `args` is null or undefined.
 */
function argsToHex(args) {
  if (args == null) return "#000000";
  if (typeof args[0] === "string" && args[0].startsWith("#")) return args[0];

  // `| 0` truncates and maps undefined/NaN to 0; the clamp keeps each
  // channel within one byte so it always formats as two hex digits.
  const toHexByte = (value) =>
    Math.min(255, Math.max(0, value | 0)).toString(16).padStart(2, "0");

  return `#${toHexByte(args[0])}${toHexByte(args[1])}${toHexByte(args[2])}`;
}
|
|
300
|
+
|
|
301
|
+
/**
 * Choose the color that will actually be seen for a text rendering mode.
 *
 * Only the lower two bits of `mode` are inspected:
 * 0 = fill, 1 = stroke, 2 = fill + stroke, 3 = invisible.
 *
 * @param {string} fill - Current fill color.
 * @param {string} stroke - Current stroke color.
 * @param {number} mode - Text rendering mode.
 * @returns {string} `stroke` for 1, `fill` for 0 and 2, and black for 3 so
 *   that invisible text still shows up in the reflowed output.
 */
function visibleColor(fill, stroke, mode) {
  switch (mode & 3) {
    case 1:
      return stroke;
    case 3:
      return "#000000"; // mode 3 (invisible) — show as black in reflow
    default:
      return fill; // 0 (fill) and 2 (fill + stroke)
  }
}
|
|
220
308
|
|
|
221
309
|
// ─── Page analysis ────────────────────────────────────────────────────────
|
|
222
310
|
|
|
311
|
+
/**
 * Compute the adaptive vertical-gap multiplier used to group items into
 * blocks.
 *
 * Gaps are converted to multiples of `fontSize`, multiples of 5 or more are
 * dropped as outliers, and the 60th-percentile multiple is taken as the
 * threshold.
 *
 * @param {number[]} gaps - Vertical distances between consecutive items.
 * @param {number} fontSize - Font height used to normalise the gaps.
 * @returns {number} Multiplier clamped to [1.5, 2.2]; 2.0 when fewer than
 *   three usable gaps are available.
 */
function findBlockThreshold(gaps, fontSize) {
  const DEFAULT_THRESHOLD = 2.0;

  if (gaps.length >= 3) {
    const ratios = gaps
      .map((g) => g / fontSize)
      .filter((ratio) => ratio < 5) // drop extreme outliers
      .sort((a, b) => a - b);

    if (ratios.length >= 3) {
      // 60th percentile, guarded against running past the last element.
      const pick = Math.min(Math.floor(ratios.length * 0.6), ratios.length - 1);
      return Math.min(Math.max(ratios[pick], 1.5), 2.2);
    }
  }

  return DEFAULT_THRESHOLD;
}
|
|
334
|
+
|
|
223
335
|
/**
|
|
224
336
|
* Group adjacent text items into text blocks by proximity.
|
|
225
337
|
* Also extracts font metadata: average size, italic, bold.
|
|
226
338
|
*/
|
|
227
|
-
function groupTextBlocks(textItems, pageHeight, styles, fontMap,
|
|
228
|
-
//
|
|
229
|
-
//
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
let itemsInCurrentBlock = 0;
|
|
237
|
-
|
|
238
|
-
for (const item of textItems) {
|
|
239
|
-
if (item.str === undefined) continue; // skip marked content
|
|
240
|
-
|
|
241
|
-
const y = item.transform ? item.transform[5] : null;
|
|
242
|
-
const fontHeight = item.transform
|
|
243
|
-
? Math.hypot(item.transform[2], item.transform[3])
|
|
244
|
-
: 12;
|
|
245
|
-
|
|
246
|
-
// Detect text block boundary by position discontinuity
|
|
247
|
-
if (prevY !== null && y !== null && itemsInCurrentBlock > 0) {
|
|
248
|
-
const yDiff = Math.abs(y - prevY);
|
|
249
|
-
const fontChanged = item.fontName !== prevFontName;
|
|
250
|
-
|
|
251
|
-
if (
|
|
252
|
-
(yDiff > fontHeight * 3) ||
|
|
253
|
-
(fontChanged && yDiff > fontHeight * 0.5)
|
|
254
|
-
) {
|
|
255
|
-
blockIdx = Math.min(blockIdx + 1, blockColors.length - 1);
|
|
256
|
-
itemsInCurrentBlock = 0;
|
|
257
|
-
}
|
|
258
|
-
}
|
|
259
|
-
|
|
260
|
-
item._color = blockColors[blockIdx] || "#000000";
|
|
261
|
-
itemsInCurrentBlock++;
|
|
262
|
-
prevY = y;
|
|
263
|
-
prevFontName = item.fontName;
|
|
339
|
+
function groupTextBlocks(textItems, pageHeight, styles, fontMap, textRuns) {
|
|
340
|
+
// Assign colors to text items by matching content from textRuns.
|
|
341
|
+
// textRuns is an array of {text, color} extracted from the operator list.
|
|
342
|
+
if (textRuns && textRuns.length > 0) {
|
|
343
|
+
const colors = matchColorsToTextItems(textItems, textRuns);
|
|
344
|
+
for (let i = 0; i < textItems.length; i++) {
|
|
345
|
+
const item = textItems[i];
|
|
346
|
+
if (item.str === undefined || !item.str.trim()) continue;
|
|
347
|
+
item._color = colors[i] || "#000000";
|
|
264
348
|
}
|
|
265
349
|
}
|
|
266
350
|
|
|
@@ -271,6 +355,23 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
|
|
|
271
355
|
return a.transform[4] - b.transform[4];
|
|
272
356
|
});
|
|
273
357
|
|
|
358
|
+
// First pass: collect all vertical gaps to compute adaptive block threshold
|
|
359
|
+
const gaps = [];
|
|
360
|
+
let lastY = null;
|
|
361
|
+
let lastFontSize = 12;
|
|
362
|
+
for (const item of sorted) {
|
|
363
|
+
const y = pageHeight - item.transform[5];
|
|
364
|
+
const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
|
|
365
|
+
if (fontHeight > 0) lastFontSize = fontHeight;
|
|
366
|
+
if (lastY !== null) {
|
|
367
|
+
gaps.push(Math.abs(y - lastY));
|
|
368
|
+
}
|
|
369
|
+
lastY = y;
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
// Compute adaptive block grouping threshold
|
|
373
|
+
const blockThreshold = findBlockThreshold(gaps, lastFontSize);
|
|
374
|
+
|
|
274
375
|
const blocks = [];
|
|
275
376
|
let current = null;
|
|
276
377
|
|
|
@@ -315,9 +416,10 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
|
|
|
315
416
|
continue;
|
|
316
417
|
}
|
|
317
418
|
|
|
419
|
+
// Use adaptive block threshold instead of fixed 2.5x
|
|
318
420
|
if (
|
|
319
421
|
sizeOk &&
|
|
320
|
-
verticalGap < lastFH *
|
|
422
|
+
verticalGap < lastFH * blockThreshold &&
|
|
321
423
|
x < current.bbox.x + current.bbox.w + lastFH * 1.5
|
|
322
424
|
) {
|
|
323
425
|
current.items.push(item);
|
|
@@ -337,7 +439,7 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
|
|
|
337
439
|
if (current) blocks.push(current);
|
|
338
440
|
|
|
339
441
|
// Post-process: merge orphan tiny blocks (superscripts, markers like *, +, #)
|
|
340
|
-
// into the nearest larger block if vertically close
|
|
442
|
+
// into the nearest larger block if vertically close AND horizontally aligned
|
|
341
443
|
for (let i = blocks.length - 1; i >= 0; i--) {
|
|
342
444
|
const block = blocks[i];
|
|
343
445
|
if (block.items.length > 2) continue;
|
|
@@ -354,6 +456,15 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
|
|
|
354
456
|
// Check vertical proximity: orphan center within 30pt of target block
|
|
355
457
|
const bcy = block.bbox.y + block.bbox.h / 2;
|
|
356
458
|
if (bcy < o.bbox.y - 30 || bcy > o.bbox.y + o.bbox.h + 30) continue;
|
|
459
|
+
// Horizontal center alignment check - must be roughly in same column
|
|
460
|
+
const bcx = block.bbox.x + block.bbox.w / 2;
|
|
461
|
+
const ocx = o.bbox.x + o.bbox.w / 2;
|
|
462
|
+
const hCenterDist = Math.abs(bcx - ocx);
|
|
463
|
+
// Must have significant horizontal overlap or be in same column
|
|
464
|
+
const xOverlap = Math.max(0, Math.min(block.bbox.x + block.bbox.w, o.bbox.x + o.bbox.w) -
|
|
465
|
+
Math.max(block.bbox.x, o.bbox.x));
|
|
466
|
+
const inSameColumn = hCenterDist < Math.max(block.bbox.w, o.bbox.w) * 0.8 || xOverlap > 0;
|
|
467
|
+
if (!inSameColumn) continue;
|
|
357
468
|
// Horizontal edge-to-edge distance (0 if overlapping)
|
|
358
469
|
const hDist = Math.max(0,
|
|
359
470
|
block.bbox.x > o.bbox.x + o.bbox.w ? block.bbox.x - (o.bbox.x + o.bbox.w) :
|
|
@@ -377,6 +488,83 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
|
|
|
377
488
|
}
|
|
378
489
|
}
|
|
379
490
|
|
|
491
|
+
// Post-process: detect multi-column grids (like author sections)
|
|
492
|
+
// Group blocks that form aligned columns into a single composite block
|
|
493
|
+
const multiColumnBlocks = [];
|
|
494
|
+
const processed = new Set();
|
|
495
|
+
|
|
496
|
+
for (let i = 0; i < blocks.length; i++) {
|
|
497
|
+
if (processed.has(i)) continue;
|
|
498
|
+
const block = blocks[i];
|
|
499
|
+
const blockText = block.items.map(it => (it.str || "").trim()).join(" ");
|
|
500
|
+
const blockCenterX = block.bbox.x + block.bbox.w / 2;
|
|
501
|
+
|
|
502
|
+
// Find all blocks in same horizontal band (similar Y position)
|
|
503
|
+
const sameRowBlocks = [block];
|
|
504
|
+
const rowY = block.bbox.y;
|
|
505
|
+
const rowH = block.bbox.h;
|
|
506
|
+
|
|
507
|
+
for (let j = i + 1; j < blocks.length; j++) {
|
|
508
|
+
if (processed.has(j)) continue;
|
|
509
|
+
const other = blocks[j];
|
|
510
|
+
// Check if in same row (vertical overlap)
|
|
511
|
+
const yOverlap = Math.max(0, Math.min(rowY + rowH, other.bbox.y + other.bbox.h) - Math.max(rowY, other.bbox.y));
|
|
512
|
+
const minH = Math.min(rowH, other.bbox.h);
|
|
513
|
+
if (yOverlap > minH * 0.5) {
|
|
514
|
+
sameRowBlocks.push(other);
|
|
515
|
+
}
|
|
516
|
+
}
|
|
517
|
+
|
|
518
|
+
// If we have multiple blocks in same row, this might be a multi-column layout
|
|
519
|
+
if (sameRowBlocks.length >= 2) {
|
|
520
|
+
// Sort by X position
|
|
521
|
+
sameRowBlocks.sort((a, b) => a.bbox.x - b.bbox.x);
|
|
522
|
+
// Check if they're roughly aligned (similar height, spaced evenly)
|
|
523
|
+
const avgH = sameRowBlocks.reduce((s, b) => s + b.bbox.h, 0) / sameRowBlocks.length;
|
|
524
|
+
const heightsOk = sameRowBlocks.every(b => Math.abs(b.bbox.h - avgH) < avgH * 0.5);
|
|
525
|
+
|
|
526
|
+
if (heightsOk) {
|
|
527
|
+
// Merge into a single composite block that preserves multi-column info
|
|
528
|
+
const allItems = [];
|
|
529
|
+
for (const b of sameRowBlocks) {
|
|
530
|
+
allItems.push(...b.items);
|
|
531
|
+
processed.add(blocks.indexOf(b));
|
|
532
|
+
}
|
|
533
|
+
// Sort items by Y then X to maintain reading order within the grid
|
|
534
|
+
allItems.sort((a, b) => {
|
|
535
|
+
const ay = pageHeight - a.transform[5];
|
|
536
|
+
const by = pageHeight - b.transform[5];
|
|
537
|
+
if (Math.abs(ay - by) > 2) return ay - by;
|
|
538
|
+
return a.transform[4] - b.transform[4];
|
|
539
|
+
});
|
|
540
|
+
|
|
541
|
+
const bbox = {
|
|
542
|
+
x: Math.min(...sameRowBlocks.map(b => b.bbox.x)),
|
|
543
|
+
y: Math.min(...sameRowBlocks.map(b => b.bbox.y)),
|
|
544
|
+
w: Math.max(...sameRowBlocks.map(b => b.bbox.x + b.bbox.w)) - Math.min(...sameRowBlocks.map(b => b.bbox.x)),
|
|
545
|
+
h: Math.max(...sameRowBlocks.map(b => b.bbox.y + b.bbox.h)) - Math.min(...sameRowBlocks.map(b => b.bbox.y))
|
|
546
|
+
};
|
|
547
|
+
|
|
548
|
+
multiColumnBlocks.push({
|
|
549
|
+
items: allItems,
|
|
550
|
+
bbox,
|
|
551
|
+
isMultiColumn: true,
|
|
552
|
+
columnCount: sameRowBlocks.length
|
|
553
|
+
});
|
|
554
|
+
continue;
|
|
555
|
+
}
|
|
556
|
+
}
|
|
557
|
+
|
|
558
|
+
if (!processed.has(i)) {
|
|
559
|
+
multiColumnBlocks.push(block);
|
|
560
|
+
processed.add(i);
|
|
561
|
+
}
|
|
562
|
+
}
|
|
563
|
+
|
|
564
|
+
// Replace blocks with multi-column merged version
|
|
565
|
+
blocks.length = 0;
|
|
566
|
+
blocks.push(...multiColumnBlocks);
|
|
567
|
+
|
|
380
568
|
// Compute font metadata per block using real font objects from commonObjs
|
|
381
569
|
for (const block of blocks) {
|
|
382
570
|
const sizes = [];
|
|
@@ -408,9 +596,7 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
|
|
|
408
596
|
const colorFreq = {};
|
|
409
597
|
for (const item of block.items) {
|
|
410
598
|
const c = item._color || "#000000";
|
|
411
|
-
|
|
412
|
-
colorFreq[c] = (colorFreq[c] || 0) + 1;
|
|
413
|
-
}
|
|
599
|
+
colorFreq[c] = (colorFreq[c] || 0) + 1;
|
|
414
600
|
}
|
|
415
601
|
let dominantColor = "#000000";
|
|
416
602
|
let maxColorFreq = 0;
|
|
@@ -451,8 +637,7 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
|
|
|
451
637
|
|
|
452
638
|
/**
|
|
453
639
|
* Extract graphic regions from the page operator list.
|
|
454
|
-
*
|
|
455
|
-
* Skips path/fill/stroke to avoid false positives from text decorations.
|
|
640
|
+
* Captures images and horizontal divider lines (thin rectangles).
|
|
456
641
|
*/
|
|
457
642
|
function extractGraphicRegions(opList, OPS) {
|
|
458
643
|
const regions = [];
|
|
@@ -509,6 +694,23 @@ function extractGraphicRegions(opList, OPS) {
|
|
|
509
694
|
bbox: { x: minX, y: minY, w: maxX - minX, h: maxY - minY },
|
|
510
695
|
});
|
|
511
696
|
}
|
|
697
|
+
} else if (fn === OPS.rectangle) {
|
|
698
|
+
// Check for thin horizontal lines (dividers)
|
|
699
|
+
const [x, y, w, h] = args;
|
|
700
|
+
if (w > 100 && h > 0.5 && h < 5) {
|
|
701
|
+
const corners = [
|
|
702
|
+
transformPoint(x, y),
|
|
703
|
+
transformPoint(x + w, y),
|
|
704
|
+
transformPoint(x, y + h),
|
|
705
|
+
transformPoint(x + w, y + h),
|
|
706
|
+
];
|
|
707
|
+
const xs = corners.map(c => c[0]);
|
|
708
|
+
const ys = corners.map(c => c[1]);
|
|
709
|
+
regions.push({
|
|
710
|
+
type: "divider",
|
|
711
|
+
bbox: { x: Math.min(...xs), y: Math.min(...ys), w: Math.max(...xs) - Math.min(...xs), h: Math.max(...ys) - Math.min(...ys) },
|
|
712
|
+
});
|
|
713
|
+
}
|
|
512
714
|
}
|
|
513
715
|
}
|
|
514
716
|
|
|
@@ -601,37 +803,213 @@ function detectGraphicRegionsFromRender(offCanvas, textBlocks, renderScale) {
|
|
|
601
803
|
return regions;
|
|
602
804
|
}
|
|
603
805
|
|
|
806
|
+
/**
 * Pick an adaptive paragraph threshold from the distribution of gaps.
 *
 * Gaps are turned into font-size multiples and sorted; multiples of 5 or more
 * are ignored as outliers. The threshold is the midpoint of the widest jump
 * between neighbouring multiples, provided the jump exceeds 0.25 and starts
 * above 0.8. Without such a jump the 75th-percentile multiple is used.
 *
 * @param {number[]} gaps - Vertical distances between consecutive items.
 * @param {number} fontSize - Font height used to normalise the gaps.
 * @returns {number} Multiplier clamped to [1.25, 2.2]; 1.8 when fewer than
 *   three usable gaps are available.
 */
function findParagraphThreshold(gaps, fontSize) {
  const DEFAULT_RATIO = 1.8;
  if (gaps.length < 3) return DEFAULT_RATIO; // too little data to analyse

  const ratios = gaps
    .map((g) => g / fontSize)
    .filter((ratio) => ratio < 5)
    .sort((a, b) => a - b);
  if (ratios.length < 3) return DEFAULT_RATIO;

  // Track the widest qualifying jump and the midpoint where it occurs.
  const elbow = { jump: 0, at: DEFAULT_RATIO };
  for (let hi = 1; hi < ratios.length; hi++) {
    const below = ratios[hi - 1];
    const above = ratios[hi];
    const jump = above - below;
    const qualifies = jump > elbow.jump && jump > 0.25 && below > 0.8;
    if (qualifies) {
      elbow.jump = jump;
      elbow.at = (below + above) / 2;
    }
  }

  // No clear cluster boundary: use the 75th percentile instead.
  const threshold =
    elbow.jump < 0.2
      ? ratios[Math.min(Math.floor(ratios.length * 0.75), ratios.length - 1)]
      : elbow.at;

  // Keep the result within the range accepted for paragraph detection.
  return Math.min(Math.max(threshold, 1.25), 2.2);
}
|
|
845
|
+
|
|
604
846
|
/**
 * Build text for a multi-column composite block (e.g. an author grid).
 *
 * Items are bucketed into rows by their top-down y position (2-unit
 * granularity) and each row is ordered left to right. A row made up of two or
 * more short items (3 characters or fewer after trimming) is treated as a row
 * of markers: each item of the following regular row gets the horizontally
 * closest marker appended when their centers are within 50 units. A marker
 * row with no regular row directly after it (another marker row, or the end
 * of the block) is emitted as a line of its own, so its text is never lost.
 *
 * @param {{items: Array, avgFontSize?: number}} block - Composite block;
 *   items provide `str`, `transform` (x at index 4, y at index 5) and
 *   optionally `width`.
 * @param {number} pageHeight - Page height, used to flip y to top-down.
 * @returns {string} One line per row, joined with "\n".
 */
function blockToTextMultiColumn(block, pageHeight) {
  const rows = new Map();
  const fontHeight = block.avgFontSize || 12;
  const byX = (a, b) => a.transform[4] - b.transform[4];
  const centerX = (it) => it.transform[4] + (it.width || 0) / 2;

  // Group items by row (finer granularity)
  for (const item of block.items) {
    if (!item.str) continue;
    const y = pageHeight - item.transform[5];
    const rowKey = Math.round(y / 2) * 2; // 2px granularity
    if (!rows.has(rowKey)) rows.set(rowKey, []);
    rows.get(rowKey).push(item);
  }

  const sortedRows = Array.from(rows.keys()).sort((a, b) => a - b);

  // Merge rows: if a row has only short items (markers), merge with next row
  const mergedRows = [];
  let pendingRow = null;

  for (const rowKey of sortedRows) {
    const rowItems = rows.get(rowKey).sort(byX);
    const allShort = rowItems.every((it) => (it.str || "").trim().length <= 3);

    if (allShort && rowItems.length >= 2) {
      // A marker row is already waiting and this is another one: emit the
      // waiting row as its own line rather than overwriting (and losing) it.
      if (pendingRow) {
        mergedRows.push({ items: pendingRow.items, hasMarkers: true });
      }
      pendingRow = { key: rowKey, items: rowItems };
      continue;
    }

    if (!pendingRow) {
      mergedRows.push({ items: rowItems, hasMarkers: false });
      continue;
    }

    // Attach to each item of this row the closest marker of the pending row.
    const mergedItems = rowItems.map((item) => {
      const itemCenterX = centerX(item);
      let closestMarker = null;
      let minDist = Infinity;
      for (const marker of pendingRow.items) {
        const dist = Math.abs(centerX(marker) - itemCenterX);
        if (dist < minDist) {
          minDist = dist;
          closestMarker = marker;
        }
      }
      // Copy the item so the caller's text item is not mutated.
      return closestMarker && minDist < 50 // Within 50px
        ? { ...item, str: item.str + closestMarker.str }
        : item;
    });
    mergedItems.sort(byX);
    mergedRows.push({ items: mergedItems, hasMarkers: true });
    pendingRow = null;
  }

  // Don't forget last pending row
  if (pendingRow) {
    mergedRows.push({ items: pendingRow.items, hasMarkers: true });
  }

  // Build output lines
  const lines = [];
  for (const row of mergedRows) {
    let lineText = "";
    let lastX = null;
    let lastW = 0;
    let lastItemLen = 0;

    for (const item of row.items) {
      const currentX = item.transform[4];
      const currentItemLen = (item.str || "").trim().length;
      const isShortItem = currentItemLen <= 3;

      if (lastX !== null) {
        const hGap = currentX - (lastX + lastW);
        const prevWasLong = lastItemLen > 2;
        // Add column separator, but not before footnote markers
        if (hGap > fontHeight * 0.3 && (!prevWasLong || !isShortItem)) {
          lineText += " ";
        }
      }

      lineText += item.str;
      lastX = currentX;
      // Items without a width are assumed to be half a font height wide.
      lastW = item.width || fontHeight * 0.5;
      lastItemLen = currentItemLen;
    }
    lines.push(lineText.trim());
  }

  return lines.join("\n");
}

/**
 * Build text content for a block, preserving paragraph breaks.
 */
|
|
946
|
+
|
|
607
947
|
function blockToText(block, pageHeight) {
|
|
608
|
-
|
|
948
|
+
// Special handling for multi-column blocks (like author grids)
|
|
949
|
+
if (block.isMultiColumn && block.columnCount >= 2) {
|
|
950
|
+
return blockToTextMultiColumn(block, pageHeight);
|
|
951
|
+
}
|
|
952
|
+
|
|
953
|
+
// First pass: collect all gaps and font sizes to compute adaptive threshold
|
|
954
|
+
const gaps = [];
|
|
609
955
|
let lastY = null;
|
|
610
|
-
let lastX = null;
|
|
611
|
-
let lastW = 0;
|
|
612
956
|
let lastFontSize = 12;
|
|
613
957
|
|
|
614
958
|
for (const item of block.items) {
|
|
615
959
|
if (!item.str) continue;
|
|
616
|
-
const currentX = item.transform[4];
|
|
617
960
|
const currentY = pageHeight - item.transform[5];
|
|
618
961
|
const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
|
|
619
962
|
if (fontHeight > 0) lastFontSize = fontHeight;
|
|
620
963
|
|
|
621
964
|
if (lastY !== null) {
|
|
622
965
|
const vGap = Math.abs(currentY - lastY);
|
|
623
|
-
|
|
624
|
-
|
|
966
|
+
gaps.push(vGap);
|
|
967
|
+
}
|
|
968
|
+
lastY = currentY;
|
|
969
|
+
}
|
|
970
|
+
|
|
971
|
+
// Compute adaptive paragraph threshold
|
|
972
|
+
const paraThreshold = findParagraphThreshold(gaps, lastFontSize);
|
|
973
|
+
const lineThreshold = lastFontSize * 0.3; // Keep fixed line threshold
|
|
974
|
+
|
|
975
|
+
// Second pass: build text with adaptive threshold
|
|
976
|
+
let result = "";
|
|
977
|
+
lastY = null;
|
|
978
|
+
let lastX = null;
|
|
979
|
+
let lastW = 0;
|
|
980
|
+
let lastItemLen = 0; // Track length of previous item for marker detection
|
|
981
|
+
|
|
982
|
+
for (const item of block.items) {
|
|
983
|
+
if (!item.str) continue;
|
|
984
|
+
const currentX = item.transform[4];
|
|
985
|
+
const currentY = pageHeight - item.transform[5];
|
|
986
|
+
const currentItemLen = (item.str || "").trim().length;
|
|
987
|
+
// Short items are typically footnote markers (*, †, ‡, #, etc.)
|
|
988
|
+
// Allow up to 3 chars to handle combined markers like "* †"
|
|
989
|
+
const isShortItem = currentItemLen <= 3;
|
|
990
|
+
|
|
991
|
+
if (lastY !== null) {
|
|
992
|
+
const vGap = Math.abs(currentY - lastY);
|
|
993
|
+
|
|
994
|
+
// Use adaptive threshold for paragraph detection
|
|
995
|
+
if (vGap > lastFontSize * paraThreshold && !isShortItem) {
|
|
625
996
|
result += "\n\n";
|
|
626
|
-
} else if (vGap >
|
|
997
|
+
} else if (vGap > lineThreshold) {
|
|
627
998
|
// Different line — insert space
|
|
628
|
-
if
|
|
629
|
-
|
|
999
|
+
// But skip space if previous item was long and current is short (footnote marker)
|
|
1000
|
+
// This handles superscript markers like *, +, #, †, ‡
|
|
1001
|
+
const prevWasLong = lastItemLen > 2;
|
|
1002
|
+
if (!prevWasLong || !isShortItem) {
|
|
1003
|
+
if (!result.endsWith(" ") && !result.endsWith("\n")) {
|
|
1004
|
+
result += " ";
|
|
1005
|
+
}
|
|
630
1006
|
}
|
|
631
1007
|
} else if (lastX !== null) {
|
|
632
1008
|
// Same line — check horizontal gap between items
|
|
633
1009
|
const hGap = currentX - (lastX + lastW);
|
|
634
|
-
|
|
1010
|
+
// Skip adding space before short items (superscript markers like *, +, #, $)
|
|
1011
|
+
// These are usually footnote markers that should attach directly to preceding text
|
|
1012
|
+
if (hGap > lastFontSize * 0.15 && !isShortItem) {
|
|
635
1013
|
if (!result.endsWith(" ") && !result.endsWith("\n")) {
|
|
636
1014
|
result += " ";
|
|
637
1015
|
}
|
|
@@ -641,6 +1019,7 @@ function blockToText(block, pageHeight) {
|
|
|
641
1019
|
lastY = currentY;
|
|
642
1020
|
lastX = currentX;
|
|
643
1021
|
lastW = item.width || 0;
|
|
1022
|
+
lastItemLen = currentItemLen;
|
|
644
1023
|
result += item.str;
|
|
645
1024
|
}
|
|
646
1025
|
return result.trim();
|
|
@@ -916,11 +1295,11 @@ async function analyzePage(page, OPS) {
|
|
|
916
1295
|
// Extract real font metadata from commonObjs (bold, italic, weight, loadedName)
|
|
917
1296
|
const fontMap = await extractFontMetadata(page, opList, OPS);
|
|
918
1297
|
|
|
919
|
-
// Extract text colors
|
|
920
|
-
const
|
|
1298
|
+
// Extract text with colors from operator list
|
|
1299
|
+
const textRuns = extractTextWithColors(opList, OPS);
|
|
921
1300
|
|
|
922
|
-
// Now group text blocks with real font data and
|
|
923
|
-
const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap,
|
|
1301
|
+
// Now group text blocks with real font data and matched colors
|
|
1302
|
+
const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap, textRuns);
|
|
924
1303
|
|
|
925
1304
|
// Compute body font size (most common size = body text)
|
|
926
1305
|
const allSizes = textBlocks.map(b => Math.round(b.avgFontSize * 10) / 10);
|
|
@@ -1050,6 +1429,13 @@ function reflowAndComposite(analysis, opts) {
|
|
|
1050
1429
|
colorSpans: block.colorSpans || [],
|
|
1051
1430
|
region,
|
|
1052
1431
|
});
|
|
1432
|
+
} else if (region.type === "divider") {
|
|
1433
|
+
// Horizontal divider line
|
|
1434
|
+
reflowedRegions.push({
|
|
1435
|
+
type: "divider",
|
|
1436
|
+
height: 4, // Small height for the divider line area
|
|
1437
|
+
region,
|
|
1438
|
+
});
|
|
1053
1439
|
} else {
|
|
1054
1440
|
// Graphic
|
|
1055
1441
|
const bitmap = bitmaps.get(region);
|
|
@@ -1307,6 +1693,23 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
1307
1693
|
lineCharOffset += line.text.length;
|
|
1308
1694
|
cursorY += lh;
|
|
1309
1695
|
}
|
|
1696
|
+
} else if (r.type === "divider") {
|
|
1697
|
+
// Draw horizontal divider line
|
|
1698
|
+
const screenY = cursorY - scrollY + 1; // Slight offset to center in area
|
|
1699
|
+
if (screenY > -10 && screenY < H + 10) {
|
|
1700
|
+
const lineWidth = Math.min(400, W - padding * 2); // Max 400px or fit with padding
|
|
1701
|
+
const startX = (W - lineWidth) / 2; // Center the line
|
|
1702
|
+
ctx.save();
|
|
1703
|
+
ctx.strokeStyle = textColor;
|
|
1704
|
+
ctx.globalAlpha = 0.3;
|
|
1705
|
+
ctx.lineWidth = 1 * d;
|
|
1706
|
+
ctx.beginPath();
|
|
1707
|
+
ctx.moveTo(startX * d, screenY * d);
|
|
1708
|
+
ctx.lineTo((startX + lineWidth) * d, screenY * d);
|
|
1709
|
+
ctx.stroke();
|
|
1710
|
+
ctx.restore();
|
|
1711
|
+
}
|
|
1712
|
+
cursorY += r.height;
|
|
1310
1713
|
} else if (r.type === "graphic" && r.bitmap) {
|
|
1311
1714
|
const screenY = cursorY - scrollY;
|
|
1312
1715
|
if (screenY > -r.drawH && screenY < H + r.drawH) {
|