pretext-pdfjs 0.3.2 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/pinch.js +56 -4
- package/src/reflow.js +280 -63
package/package.json
CHANGED
package/src/pinch.js
CHANGED
|
@@ -392,6 +392,37 @@ export function createPDFPinchReader(container, options = {}) {
|
|
|
392
392
|
}
|
|
393
393
|
}
|
|
394
394
|
|
|
395
|
+
/**
 * Find an adaptive paragraph threshold by analyzing the vertical-gap distribution.
 *
 * Gaps are normalized by `fontSize`. The function first looks for the largest
 * jump between consecutive sorted ratios (the "elbow" between line spacing and
 * paragraph spacing). If no such jump exists, it falls back to the 75th
 * percentile of the line-break gaps.
 *
 * Gaps at or below 0.3x the font size are ignored in the percentile fallback:
 * callers treat those as "same line" (their line threshold is fontSize * 0.3),
 * and counting them would drag the percentile to zero whenever a line is made
 * of several text items.
 *
 * @param {number[]} gaps - Vertical distances between consecutive text items.
 * @param {number} fontSize - Font height used to normalize the gaps.
 * @returns {number} Threshold as a multiple of the font size, clamped to [1.25, 2.2].
 */
function findParagraphThreshold(gaps, fontSize) {
  const DEFAULT_THRESHOLD = 1.8;
  const MIN_THRESHOLD = 1.25;
  const MAX_THRESHOLD = 2.2;
  const OUTLIER_RATIO = 5; // >5x font size: likely headers, titles, etc.
  const SAME_LINE_RATIO = 0.3; // at or below this the items share a line
  const MIN_ELBOW_JUMP = 0.25;
  const MIN_ELBOW_BASE = 0.8;

  if (gaps.length < 3) return DEFAULT_THRESHOLD;

  // Normalize once; NaN/Infinity (e.g. fontSize of 0) fail the `<` test and drop out.
  const ratios = gaps
    .map((g) => g / fontSize)
    .filter((r) => r < OUTLIER_RATIO)
    .sort((a, b) => a - b);
  if (ratios.length < 3) return DEFAULT_THRESHOLD;

  // Look for the largest significant jump that starts above typical line spacing.
  let maxJump = 0;
  let threshold = DEFAULT_THRESHOLD;
  for (let i = 0; i < ratios.length - 1; i++) {
    const jump = ratios[i + 1] - ratios[i];
    if (jump > maxJump && jump > MIN_ELBOW_JUMP && ratios[i] > MIN_ELBOW_BASE) {
      maxJump = jump;
      threshold = (ratios[i] + ratios[i + 1]) / 2;
    }
  }

  // No clear cluster boundary: use the 75th percentile of real line-break gaps.
  if (maxJump === 0) {
    const lineRatios = ratios.filter((r) => r > SAME_LINE_RATIO);
    if (lineRatios.length > 0) {
      const idx = Math.min(Math.floor(lineRatios.length * 0.75), lineRatios.length - 1);
      threshold = lineRatios[idx];
    }
  }

  return Math.max(MIN_THRESHOLD, Math.min(threshold, MAX_THRESHOLD));
}
|
|
425
|
+
|
|
395
426
|
/**
|
|
396
427
|
* Extract plain text from a PDF page.
|
|
397
428
|
* Joins text items with spaces, preserves paragraph breaks.
|
|
@@ -400,11 +431,32 @@ export function createPDFPinchReader(container, options = {}) {
|
|
|
400
431
|
const page = await pdfDoc.getPage(pageNum);
|
|
401
432
|
const content = await page.getTextContent();
|
|
402
433
|
|
|
403
|
-
//
|
|
404
|
-
|
|
434
|
+
// First pass: collect all gaps to compute adaptive threshold
|
|
435
|
+
const gaps = [];
|
|
405
436
|
let lastY = null;
|
|
406
437
|
let lastFontSize = 12;
|
|
407
438
|
|
|
439
|
+
for (const item of content.items) {
|
|
440
|
+
if (!item.str || !item.transform) continue;
|
|
441
|
+
|
|
442
|
+
const currentY = item.transform[5];
|
|
443
|
+
const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
|
|
444
|
+
if (fontHeight > 0) lastFontSize = fontHeight;
|
|
445
|
+
|
|
446
|
+
if (lastY !== null) {
|
|
447
|
+
gaps.push(Math.abs(currentY - lastY));
|
|
448
|
+
}
|
|
449
|
+
lastY = currentY;
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
// Compute adaptive threshold
|
|
453
|
+
const paraThreshold = findParagraphThreshold(gaps, lastFontSize);
|
|
454
|
+
const lineThreshold = lastFontSize * 0.3;
|
|
455
|
+
|
|
456
|
+
// Second pass: build text with adaptive threshold
|
|
457
|
+
let result = "";
|
|
458
|
+
lastY = null;
|
|
459
|
+
|
|
408
460
|
for (const item of content.items) {
|
|
409
461
|
if (!item.str) continue;
|
|
410
462
|
|
|
@@ -415,10 +467,10 @@ export function createPDFPinchReader(container, options = {}) {
|
|
|
415
467
|
|
|
416
468
|
if (lastY !== null) {
|
|
417
469
|
const gap = Math.abs(currentY - lastY);
|
|
418
|
-
if (gap > lastFontSize *
|
|
470
|
+
if (gap > lastFontSize * paraThreshold) {
|
|
419
471
|
// Paragraph break
|
|
420
472
|
result += "\n\n";
|
|
421
|
-
} else if (gap >
|
|
473
|
+
} else if (gap > lineThreshold) {
|
|
422
474
|
// Line break within paragraph — add space
|
|
423
475
|
if (!result.endsWith(" ") && !result.endsWith("\n")) {
|
|
424
476
|
result += " ";
|
package/src/reflow.js
CHANGED
|
@@ -64,7 +64,7 @@ function drawColoredLine(ctx, text, charOffset, spans, defaultColor, x, y) {
|
|
|
64
64
|
}
|
|
65
65
|
|
|
66
66
|
const spanText = text.slice(overlapStart, overlapEnd);
|
|
67
|
-
ctx.fillStyle = span.color;
|
|
67
|
+
ctx.fillStyle = span.color === "transparent" ? defaultColor : span.color;
|
|
68
68
|
ctx.fillText(spanText, xPos, y);
|
|
69
69
|
xPos += ctx.measureText(spanText).width;
|
|
70
70
|
pos = overlapEnd;
|
|
@@ -156,64 +156,195 @@ async function extractFontMetadata(page, opList, OPS) {
|
|
|
156
156
|
// ─── Text color extraction ───────────────────────────────────────────────
|
|
157
157
|
|
|
158
158
|
/**
 * Extract text runs with their visible colors from an operator list.
 *
 * Walks `opList.fnArray` / `opList.argsArray` in order, tracking the current
 * fill color, stroke color and text rendering mode, and emits one
 * `{text, color}` entry per text-showing operator that yields non-empty text.
 * The result can be matched to getTextContent() items by content.
 *
 * The tracked state is pushed on `OPS.save` and popped on `OPS.restore`, so
 * colors set inside a save/restore pair do not leak onto text drawn after it.
 *
 * @param {{fnArray: number[], argsArray: Array}} opList - Operator list.
 * @param {Object} OPS - Operator-code table (setFillRGBColor, showText, ...).
 * @returns {Array<{text: string, color: string}>} Text runs in operator order.
 */
function extractTextWithColors(opList, OPS) {
  const textRuns = []; // {text, color}
  const stateStack = [];
  let state = {
    fillColor: "#000000",
    strokeColor: "#000000",
    textRenderingMode: 0,
  };

  // Glyph arrays mix glyph objects with numeric spacing; keep only the glyphs.
  const glyphsToText = (glyphs) => {
    if (!Array.isArray(glyphs)) return "";
    return glyphs
      .filter((g) => g && typeof g === "object" && g.unicode)
      .map((g) => g.unicode)
      .join("");
  };

  const pushRun = (text) => {
    if (!text) return;
    textRuns.push({
      text,
      color: visibleColor(state.fillColor, state.strokeColor, state.textRenderingMode),
    });
  };

  for (let i = 0; i < opList.fnArray.length; i++) {
    const fn = opList.fnArray[i];
    const args = opList.argsArray[i];

    if (fn === OPS.save) {
      stateStack.push({ ...state });
    } else if (fn === OPS.restore) {
      // Ignore an unbalanced restore rather than losing the current state.
      if (stateStack.length > 0) state = stateStack.pop();
    } else if (fn === OPS.setFillRGBColor) {
      state.fillColor = argsToHex(args);
    } else if (fn === OPS.setStrokeRGBColor) {
      state.strokeColor = argsToHex(args);
    } else if (fn === OPS.setTextRenderingMode) {
      state.textRenderingMode = args[0];
    } else if (
      fn === OPS.showText ||
      fn === OPS.nextLineShowText ||
      fn === OPS.nextLineSetSpacingShowText
    ) {
      pushRun(glyphsToText(args[0]));
    } else if (fn === OPS.showSpacedText) {
      // The array interleaves glyph runs and spacing numbers, but not
      // necessarily in strict alternation, so visit every element; non-array
      // elements contribute no text.
      const arr = args[0];
      if (Array.isArray(arr)) {
        pushRun(arr.map((part) => glyphsToText(part)).join(""));
      }
    }
  }

  return textRuns;
}
|
|
212
|
+
|
|
213
|
+
/**
 * Align colors from operator-list text runs with text items, by content.
 *
 * Items whose `str` is undefined or blank get `null`. Every other item gets
 * the color of the first run, at or after the search cursor, whose trimmed
 * text equals the item's trimmed text or shares a prefix relation with it.
 * A match moves the cursor just past the matched run; an item with no match
 * gets "#000000" and leaves the cursor where it was. The cursor only wraps
 * back to the start once it has run off the end of `textRuns`.
 *
 * @param {Array<{str?: string}>} textItems - Items from getTextContent().
 * @param {Array<{text: string, color: string}>} textRuns - Runs to match against.
 * @returns {Array<string|null>} One entry per item, in item order.
 */
function matchColorsToTextItems(textItems, textRuns) {
  const sharesPrefix = (a, b) => a === b || a.startsWith(b) || b.startsWith(a);
  let cursor = 0;

  return Array.from(textItems, (item) => {
    // Non-text / whitespace-only items carry no color.
    if (item.str === undefined) return null;
    const wanted = item.str.trim();
    if (!wanted) return null;

    // Start over once every run has been consumed.
    if (cursor >= textRuns.length) cursor = 0;

    let hit = -1;
    for (let k = cursor; k < textRuns.length && hit === -1; k++) {
      const candidate = textRuns[k].text.trim();
      if (candidate && sharesPrefix(wanted, candidate)) hit = k;
    }

    if (hit === -1) return "#000000";

    cursor = hit + 1;
    return textRuns[hit].color;
  });
}
|
|
259
|
+
|
|
260
|
+
/**
 * Extract one visible color per text-drawing operator in the operator list.
 * Returns an array that maps ~1:1 to the text items from getTextContent().
 * DEPRECATED: Use extractTextWithColors + matchColorsToTextItems instead.
 *
 * Fill color, stroke color and text rendering mode are tracked while walking
 * the list. The tracked state is pushed on `OPS.save` and popped on
 * `OPS.restore`, so colors set inside a save/restore pair do not leak onto
 * text drawn after it.
 *
 * @param {{fnArray: number[], argsArray: Array}} opList - Operator list.
 * @param {Object} OPS - Operator-code table.
 * @returns {string[]} One color per text-showing operator, in operator order.
 */
function extractTextItemColors(opList, OPS) {
  const itemColors = []; // one entry per text-drawing operator
  const stateStack = [];
  let state = {
    fillColor: "#000000",
    strokeColor: "#000000",
    textRenderingMode: 0,
  };

  for (let i = 0; i < opList.fnArray.length; i++) {
    const fn = opList.fnArray[i];

    if (fn === OPS.save) {
      stateStack.push({ ...state });
    } else if (fn === OPS.restore) {
      // Ignore an unbalanced restore rather than losing the current state.
      if (stateStack.length > 0) state = stateStack.pop();
    } else if (fn === OPS.setFillRGBColor) {
      state.fillColor = argsToHex(opList.argsArray[i]);
    } else if (fn === OPS.setStrokeRGBColor) {
      state.strokeColor = argsToHex(opList.argsArray[i]);
    } else if (fn === OPS.setTextRenderingMode) {
      state.textRenderingMode = opList.argsArray[i][0];
    } else if (
      fn === OPS.showText ||
      fn === OPS.nextLineShowText ||
      fn === OPS.nextLineSetSpacingShowText ||
      fn === OPS.showSpacedText
    ) {
      itemColors.push(
        visibleColor(state.fillColor, state.strokeColor, state.textRenderingMode),
      );
    }
  }

  return itemColors;
}
|
|
293
|
+
|
|
294
|
+
/**
 * Convert color operator args to a `#rrggbb` hex string.
 *
 * Args may carry a ready-made hex string in the first slot, which is returned
 * as-is, or three RGB components. Components are truncated to integers and
 * clamped to 0..255 so the result always has exactly six hex digits; missing
 * or non-numeric components count as 0.
 *
 * @param {ArrayLike<number|string>|null|undefined} args - Operator arguments.
 * @returns {string} Hex color; "#000000" when `args` is missing.
 */
function argsToHex(args) {
  if (args == null) return "#000000";
  if (typeof args[0] === "string" && args[0].startsWith("#")) return args[0];

  const toHexByte = (value) => {
    const n = Number(value);
    const byte = Number.isFinite(n) ? Math.min(255, Math.max(0, Math.trunc(n))) : 0;
    return byte.toString(16).padStart(2, "0");
  };

  return `#${toHexByte(args[0])}${toHexByte(args[1])}${toHexByte(args[2])}`;
}
|
|
300
|
+
|
|
301
|
+
/**
 * Choose which color is actually seen for a given text rendering mode.
 *
 * Only the low two bits of `mode` are considered:
 * 0 = fill, 1 = stroke, 2 = fill + stroke, 3 = invisible.
 *
 * @param {string} fill - Current fill color.
 * @param {string} stroke - Current stroke color.
 * @param {number} mode - Text rendering mode.
 * @returns {string} `stroke` for 1, "#000000" for 3, otherwise `fill`.
 */
function visibleColor(fill, stroke, mode) {
  switch (mode & 3) {
    case 1:
      return stroke;
    case 3:
      // Invisible text is shown as black in reflow.
      return "#000000";
    default:
      // 0 (fill) and 2 (fill + stroke).
      return fill;
  }
}
|
|
201
308
|
|
|
202
309
|
// ─── Page analysis ────────────────────────────────────────────────────────
|
|
203
310
|
|
|
311
|
+
/**
 * Derive the vertical-gap threshold used to group text items into blocks.
 *
 * Each gap is divided by `fontSize`; ratios of 5 or more are discarded as
 * outliers. The 60th-percentile ratio of what remains is returned, clamped
 * to the range 1.5..2.2. With fewer than three usable gaps the default of
 * 2.0 is returned. All supplied gaps count toward the percentile, including
 * zero-sized ones.
 *
 * @param {number[]} gaps - Vertical distances between consecutive items.
 * @param {number} fontSize - Font height used to normalize the gaps.
 * @returns {number} Threshold as a multiple of the font height.
 */
function findBlockThreshold(gaps, fontSize) {
  const FALLBACK = 2.0;
  const LOWER_BOUND = 1.5;
  const UPPER_BOUND = 2.2;

  if (gaps.length < 3) return FALLBACK;

  // Normalize and drop extreme outliers in a single pass.
  const normalized = [];
  for (const gap of gaps) {
    const ratio = gap / fontSize;
    if (ratio < 5) normalized.push(ratio);
  }
  if (normalized.length < 3) return FALLBACK;

  normalized.sort((lhs, rhs) => lhs - rhs);

  // 60th percentile, guarded against running past the last element.
  const pick = Math.min(Math.floor(normalized.length * 0.6), normalized.length - 1);
  const candidate = normalized[pick];

  return Math.min(UPPER_BOUND, Math.max(LOWER_BOUND, candidate));
}
|
|
334
|
+
|
|
204
335
|
/**
|
|
205
336
|
* Group adjacent text items into text blocks by proximity.
|
|
206
337
|
* Also extracts font metadata: average size, italic, bold.
|
|
207
338
|
*/
|
|
208
|
-
function groupTextBlocks(textItems, pageHeight, styles, fontMap,
|
|
209
|
-
//
|
|
210
|
-
//
|
|
211
|
-
if (
|
|
212
|
-
|
|
213
|
-
for (
|
|
214
|
-
|
|
215
|
-
|
|
216
|
-
|
|
339
|
+
function groupTextBlocks(textItems, pageHeight, styles, fontMap, textRuns) {
|
|
340
|
+
// Assign colors to text items by matching content from textRuns.
|
|
341
|
+
// textRuns is an array of {text, color} extracted from the operator list.
|
|
342
|
+
if (textRuns && textRuns.length > 0) {
|
|
343
|
+
const colors = matchColorsToTextItems(textItems, textRuns);
|
|
344
|
+
for (let i = 0; i < textItems.length; i++) {
|
|
345
|
+
const item = textItems[i];
|
|
346
|
+
if (item.str === undefined || !item.str.trim()) continue;
|
|
347
|
+
item._color = colors[i] || "#000000";
|
|
217
348
|
}
|
|
218
349
|
}
|
|
219
350
|
|
|
@@ -224,6 +355,23 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, textColors) {
|
|
|
224
355
|
return a.transform[4] - b.transform[4];
|
|
225
356
|
});
|
|
226
357
|
|
|
358
|
+
// First pass: collect all vertical gaps to compute adaptive block threshold
|
|
359
|
+
const gaps = [];
|
|
360
|
+
let lastY = null;
|
|
361
|
+
let lastFontSize = 12;
|
|
362
|
+
for (const item of sorted) {
|
|
363
|
+
const y = pageHeight - item.transform[5];
|
|
364
|
+
const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
|
|
365
|
+
if (fontHeight > 0) lastFontSize = fontHeight;
|
|
366
|
+
if (lastY !== null) {
|
|
367
|
+
gaps.push(Math.abs(y - lastY));
|
|
368
|
+
}
|
|
369
|
+
lastY = y;
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
// Compute adaptive block grouping threshold
|
|
373
|
+
const blockThreshold = findBlockThreshold(gaps, lastFontSize);
|
|
374
|
+
|
|
227
375
|
const blocks = [];
|
|
228
376
|
let current = null;
|
|
229
377
|
|
|
@@ -268,9 +416,10 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, textColors) {
|
|
|
268
416
|
continue;
|
|
269
417
|
}
|
|
270
418
|
|
|
419
|
+
// Use adaptive block threshold instead of fixed 2.5x
|
|
271
420
|
if (
|
|
272
421
|
sizeOk &&
|
|
273
|
-
verticalGap < lastFH *
|
|
422
|
+
verticalGap < lastFH * blockThreshold &&
|
|
274
423
|
x < current.bbox.x + current.bbox.w + lastFH * 1.5
|
|
275
424
|
) {
|
|
276
425
|
current.items.push(item);
|
|
@@ -361,9 +510,7 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, textColors) {
|
|
|
361
510
|
const colorFreq = {};
|
|
362
511
|
for (const item of block.items) {
|
|
363
512
|
const c = item._color || "#000000";
|
|
364
|
-
|
|
365
|
-
colorFreq[c] = (colorFreq[c] || 0) + 1;
|
|
366
|
-
}
|
|
513
|
+
colorFreq[c] = (colorFreq[c] || 0) + 1;
|
|
367
514
|
}
|
|
368
515
|
let dominantColor = "#000000";
|
|
369
516
|
let maxColorFreq = 0;
|
|
@@ -554,29 +701,91 @@ function detectGraphicRegionsFromRender(offCanvas, textBlocks, renderScale) {
|
|
|
554
701
|
return regions;
|
|
555
702
|
}
|
|
556
703
|
|
|
704
|
+
/**
 * Find an adaptive paragraph threshold by analyzing the vertical-gap distribution.
 *
 * Gaps are normalized by `fontSize`. The function first looks for the largest
 * jump between consecutive sorted ratios (the "elbow" between line spacing and
 * paragraph spacing). If no such jump exists, it falls back to the 75th
 * percentile of the line-break gaps.
 *
 * Gaps at or below 0.3x the font size are ignored in the percentile fallback:
 * callers treat those as "same line" (their line threshold is fontSize * 0.3),
 * and counting them would drag the percentile to zero whenever a line is made
 * of several text items.
 *
 * @param {number[]} gaps - Vertical distances between consecutive text items.
 * @param {number} fontSize - Font height used to normalize the gaps.
 * @returns {number} Threshold as a multiple of the font size, clamped to [1.25, 2.2].
 */
function findParagraphThreshold(gaps, fontSize) {
  const DEFAULT_THRESHOLD = 1.8; // Fallback for small blocks
  const MIN_THRESHOLD = 1.25;
  const MAX_THRESHOLD = 2.2;
  const OUTLIER_RATIO = 5; // >5x font size: likely headers, titles, etc.
  const SAME_LINE_RATIO = 0.3; // at or below this the items share a line
  const MIN_ELBOW_JUMP = 0.25;
  const MIN_ELBOW_BASE = 0.8;

  if (gaps.length < 3) return DEFAULT_THRESHOLD;

  // Convert to font-size ratios once, drop outliers, and sort.
  // NaN/Infinity (e.g. fontSize of 0) fail the `<` test and drop out.
  const ratios = gaps
    .map((g) => g / fontSize)
    .filter((r) => r < OUTLIER_RATIO)
    .sort((a, b) => a - b);
  if (ratios.length < 3) return DEFAULT_THRESHOLD;

  // Find the largest significant jump that starts above typical line spacing.
  let maxJump = 0;
  let threshold = DEFAULT_THRESHOLD;
  for (let i = 0; i < ratios.length - 1; i++) {
    const jump = ratios[i + 1] - ratios[i];
    if (jump > maxJump && jump > MIN_ELBOW_JUMP && ratios[i] > MIN_ELBOW_BASE) {
      maxJump = jump;
      threshold = (ratios[i] + ratios[i + 1]) / 2;
    }
  }

  // No clear cluster boundary: use the 75th percentile of real line-break gaps.
  if (maxJump === 0) {
    const lineRatios = ratios.filter((r) => r > SAME_LINE_RATIO);
    if (lineRatios.length > 0) {
      const idx = Math.min(Math.floor(lineRatios.length * 0.75), lineRatios.length - 1);
      threshold = lineRatios[idx];
    }
  }

  // Clamp to a reasonable range for paragraph detection.
  return Math.max(MIN_THRESHOLD, Math.min(threshold, MAX_THRESHOLD));
}
|
|
743
|
+
|
|
557
744
|
/**
|
|
558
745
|
* Build text content for a block, preserving paragraph breaks.
|
|
559
746
|
*/
|
|
560
747
|
function blockToText(block, pageHeight) {
|
|
561
|
-
|
|
748
|
+
// First pass: collect all gaps and font sizes to compute adaptive threshold
|
|
749
|
+
const gaps = [];
|
|
562
750
|
let lastY = null;
|
|
563
|
-
let lastX = null;
|
|
564
|
-
let lastW = 0;
|
|
565
751
|
let lastFontSize = 12;
|
|
566
752
|
|
|
567
753
|
for (const item of block.items) {
|
|
568
754
|
if (!item.str) continue;
|
|
569
|
-
const currentX = item.transform[4];
|
|
570
755
|
const currentY = pageHeight - item.transform[5];
|
|
571
756
|
const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
|
|
572
757
|
if (fontHeight > 0) lastFontSize = fontHeight;
|
|
573
758
|
|
|
759
|
+
if (lastY !== null) {
|
|
760
|
+
const vGap = Math.abs(currentY - lastY);
|
|
761
|
+
gaps.push(vGap);
|
|
762
|
+
}
|
|
763
|
+
lastY = currentY;
|
|
764
|
+
}
|
|
765
|
+
|
|
766
|
+
// Compute adaptive paragraph threshold
|
|
767
|
+
const paraThreshold = findParagraphThreshold(gaps, lastFontSize);
|
|
768
|
+
const lineThreshold = lastFontSize * 0.3; // Keep fixed line threshold
|
|
769
|
+
|
|
770
|
+
// Second pass: build text with adaptive threshold
|
|
771
|
+
let result = "";
|
|
772
|
+
lastY = null;
|
|
773
|
+
let lastX = null;
|
|
774
|
+
let lastW = 0;
|
|
775
|
+
|
|
776
|
+
for (const item of block.items) {
|
|
777
|
+
if (!item.str) continue;
|
|
778
|
+
const currentX = item.transform[4];
|
|
779
|
+
const currentY = pageHeight - item.transform[5];
|
|
780
|
+
|
|
574
781
|
if (lastY !== null) {
|
|
575
782
|
const vGap = Math.abs(currentY - lastY);
|
|
576
783
|
const isShortItem = (item.str || "").trim().length <= 2;
|
|
577
|
-
|
|
784
|
+
|
|
785
|
+
// Use adaptive threshold for paragraph detection
|
|
786
|
+
if (vGap > lastFontSize * paraThreshold && !isShortItem) {
|
|
578
787
|
result += "\n\n";
|
|
579
|
-
} else if (vGap >
|
|
788
|
+
} else if (vGap > lineThreshold) {
|
|
580
789
|
// Different line — insert space
|
|
581
790
|
if (!result.endsWith(" ") && !result.endsWith("\n")) {
|
|
582
791
|
result += " ";
|
|
@@ -754,16 +963,20 @@ function buildRegionMap(textBlocks, graphicRegions, pageHeight) {
|
|
|
754
963
|
curr.gapBefore = Math.max(0, currTop - prevBottom);
|
|
755
964
|
}
|
|
756
965
|
if (regions.length > 0) {
|
|
757
|
-
regions[0].gapBefore =
|
|
966
|
+
regions[0].gapBefore = 0; // padding handles top margin
|
|
758
967
|
}
|
|
759
968
|
|
|
760
|
-
//
|
|
969
|
+
// Store absolute pixel gaps and compute body font size for scaling
|
|
761
970
|
const bodyBlocks = regions.filter(r =>
|
|
762
971
|
r.type === "text" && r.block?.fontScale && Math.abs(r.block.fontScale - 1) < 0.15);
|
|
763
|
-
const
|
|
764
|
-
? bodyBlocks.reduce((s, r) => s + r.block.avgFontSize
|
|
765
|
-
: 12
|
|
972
|
+
const avgBodyFontSize = bodyBlocks.length > 0
|
|
973
|
+
? bodyBlocks.reduce((s, r) => s + r.block.avgFontSize, 0) / bodyBlocks.length
|
|
974
|
+
: 12;
|
|
766
975
|
for (const region of regions) {
|
|
976
|
+
region.gapAbsolute = region.gapBefore || 0;
|
|
977
|
+
region._avgBodyFontSize = avgBodyFontSize;
|
|
978
|
+
// Keep gapRatio as fallback for any code that reads it
|
|
979
|
+
const avgBodyLH = avgBodyFontSize * 1.6;
|
|
767
980
|
region.gapRatio = (region.gapBefore || 0) / avgBodyLH;
|
|
768
981
|
}
|
|
769
982
|
|
|
@@ -865,11 +1078,11 @@ async function analyzePage(page, OPS) {
|
|
|
865
1078
|
// Extract real font metadata from commonObjs (bold, italic, weight, loadedName)
|
|
866
1079
|
const fontMap = await extractFontMetadata(page, opList, OPS);
|
|
867
1080
|
|
|
868
|
-
// Extract text colors from operator list
|
|
869
|
-
const
|
|
1081
|
+
// Extract text with colors from operator list
|
|
1082
|
+
const textRuns = extractTextWithColors(opList, OPS);
|
|
870
1083
|
|
|
871
|
-
// Now group text blocks with real font data and colors
|
|
872
|
-
const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap,
|
|
1084
|
+
// Now group text blocks with real font data and matched colors
|
|
1085
|
+
const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap, textRuns);
|
|
873
1086
|
|
|
874
1087
|
// Compute body font size (most common size = body text)
|
|
875
1088
|
const allSizes = textBlocks.map(b => Math.round(b.avgFontSize * 10) / 10);
|
|
@@ -1028,13 +1241,15 @@ function reflowAndComposite(analysis, opts) {
|
|
|
1028
1241
|
}
|
|
1029
1242
|
}
|
|
1030
1243
|
|
|
1031
|
-
// Total height — use
|
|
1244
|
+
// Total height — use absolute pixel gaps scaled by font ratio
|
|
1032
1245
|
const baseLH = fontSize * lineHeight;
|
|
1033
1246
|
let totalHeight = padding;
|
|
1034
1247
|
for (const r of reflowedRegions) {
|
|
1035
1248
|
totalHeight += r.height;
|
|
1036
|
-
const
|
|
1037
|
-
|
|
1249
|
+
const gapAbs = r.region?.gapAbsolute ?? 0;
|
|
1250
|
+
const bodyFS = r.region?._avgBodyFontSize || 12;
|
|
1251
|
+
const scaledGap = gapAbs * (fontSize / bodyFS);
|
|
1252
|
+
totalHeight += Math.max(4, Math.min(scaledGap, baseLH * 2.0));
|
|
1038
1253
|
}
|
|
1039
1254
|
totalHeight += padding;
|
|
1040
1255
|
|
|
@@ -1274,8 +1489,10 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
1274
1489
|
}
|
|
1275
1490
|
cursorY += r.drawH;
|
|
1276
1491
|
}
|
|
1277
|
-
const
|
|
1278
|
-
|
|
1492
|
+
const gapAbs = r.region?.gapAbsolute ?? 0;
|
|
1493
|
+
const bodyFS = r.region?._avgBodyFontSize || 12;
|
|
1494
|
+
const scaledGap = gapAbs * (fontSize / bodyFS);
|
|
1495
|
+
cursorY += Math.max(4, Math.min(scaledGap, baseLH * 2.0));
|
|
1279
1496
|
}
|
|
1280
1497
|
}
|
|
1281
1498
|
|