pretext-pdfjs 0.3.3 → 0.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/pinch.js +56 -4
- package/src/reflow.js +258 -96
package/package.json
CHANGED
package/src/pinch.js
CHANGED
|
@@ -392,6 +392,37 @@ export function createPDFPinchReader(container, options = {}) {
|
|
|
392
392
|
}
|
|
393
393
|
}
|
|
394
394
|
|
|
395
|
+
/**
 * Find an adaptive paragraph threshold by analyzing the gap distribution.
 *
 * Gaps are normalised to multiples of `fontSize`, extreme outliers are
 * discarded, and the widest jump between consecutive sorted ratios is taken
 * as the boundary between line gaps and paragraph gaps. When no qualifying
 * jump exists, a percentile of the ratios is used instead.
 *
 * @param {number[]} gaps - Absolute vertical distances between consecutive text items.
 * @param {number} fontSize - Font size used to normalise the gaps.
 * @param {object} [options] - Optional tuning overrides; the defaults reproduce
 *   the constants this function has always used.
 * @param {number} [options.fallback=1.8] - Returned when there is too little usable data.
 * @param {number} [options.outlierRatio=5] - Ratios at or above this are ignored.
 * @param {number} [options.minJump=0.25] - Minimum jump between ratios to count as a boundary.
 * @param {number} [options.minLineRatio=0.8] - A jump only counts when it starts above this ratio.
 * @param {number} [options.percentile=0.75] - Percentile used when no jump qualifies.
 * @param {number} [options.min=1.25] - Lower clamp of the result.
 * @param {number} [options.max=2.2] - Upper clamp of the result.
 * @returns {number} Threshold expressed as a multiple of `fontSize`.
 */
function findParagraphThreshold(gaps, fontSize, options = {}) {
  const {
    fallback = 1.8,
    outlierRatio = 5,
    minJump = 0.25,
    minLineRatio = 0.8,
    percentile = 0.75,
    min = 1.25,
    max = 2.2,
  } = options;

  if (!Array.isArray(gaps) || gaps.length < 3) return fallback;
  // A zero, negative or non-finite font size cannot produce meaningful ratios.
  if (!Number.isFinite(fontSize) || fontSize <= 0) return fallback;

  // Normalise once, drop extreme outliers, then sort ascending.
  const ratios = gaps
    .map((g) => g / fontSize)
    .filter((r) => r < outlierRatio)
    .sort((a, b) => a - b);
  if (ratios.length < 3) return fallback;

  // Look for the widest jump between consecutive ratios (the "elbow").
  let widestJump = 0;
  let threshold = fallback;
  for (let i = 0; i < ratios.length - 1; i++) {
    const jump = ratios[i + 1] - ratios[i];
    if (jump > widestJump && jump > minJump && ratios[i] > minLineRatio) {
      widestJump = jump;
      threshold = (ratios[i] + ratios[i + 1]) / 2;
    }
  }

  // No qualifying jump was found: fall back to a percentile of the ratios.
  if (widestJump === 0) {
    const idx = Math.floor(ratios.length * percentile);
    threshold = ratios[Math.min(idx, ratios.length - 1)];
  }

  return Math.max(min, Math.min(threshold, max));
}
|
|
425
|
+
|
|
395
426
|
/**
|
|
396
427
|
* Extract plain text from a PDF page.
|
|
397
428
|
* Joins text items with spaces, preserves paragraph breaks.
|
|
@@ -400,11 +431,32 @@ export function createPDFPinchReader(container, options = {}) {
|
|
|
400
431
|
const page = await pdfDoc.getPage(pageNum);
|
|
401
432
|
const content = await page.getTextContent();
|
|
402
433
|
|
|
403
|
-
//
|
|
404
|
-
|
|
434
|
+
// First pass: collect all gaps to compute adaptive threshold
|
|
435
|
+
const gaps = [];
|
|
405
436
|
let lastY = null;
|
|
406
437
|
let lastFontSize = 12;
|
|
407
438
|
|
|
439
|
+
for (const item of content.items) {
|
|
440
|
+
if (!item.str || !item.transform) continue;
|
|
441
|
+
|
|
442
|
+
const currentY = item.transform[5];
|
|
443
|
+
const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
|
|
444
|
+
if (fontHeight > 0) lastFontSize = fontHeight;
|
|
445
|
+
|
|
446
|
+
if (lastY !== null) {
|
|
447
|
+
gaps.push(Math.abs(currentY - lastY));
|
|
448
|
+
}
|
|
449
|
+
lastY = currentY;
|
|
450
|
+
}
|
|
451
|
+
|
|
452
|
+
// Compute adaptive threshold
|
|
453
|
+
const paraThreshold = findParagraphThreshold(gaps, lastFontSize);
|
|
454
|
+
const lineThreshold = lastFontSize * 0.3;
|
|
455
|
+
|
|
456
|
+
// Second pass: build text with adaptive threshold
|
|
457
|
+
let result = "";
|
|
458
|
+
lastY = null;
|
|
459
|
+
|
|
408
460
|
for (const item of content.items) {
|
|
409
461
|
if (!item.str) continue;
|
|
410
462
|
|
|
@@ -415,10 +467,10 @@ export function createPDFPinchReader(container, options = {}) {
|
|
|
415
467
|
|
|
416
468
|
if (lastY !== null) {
|
|
417
469
|
const gap = Math.abs(currentY - lastY);
|
|
418
|
-
if (gap > lastFontSize *
|
|
470
|
+
if (gap > lastFontSize * paraThreshold) {
|
|
419
471
|
// Paragraph break
|
|
420
472
|
result += "\n\n";
|
|
421
|
-
} else if (gap >
|
|
473
|
+
} else if (gap > lineThreshold) {
|
|
422
474
|
// Line break within paragraph — add space
|
|
423
475
|
if (!result.endsWith(" ") && !result.endsWith("\n")) {
|
|
424
476
|
result += " ";
|
package/src/reflow.js
CHANGED
|
@@ -64,7 +64,7 @@ function drawColoredLine(ctx, text, charOffset, spans, defaultColor, x, y) {
|
|
|
64
64
|
}
|
|
65
65
|
|
|
66
66
|
const spanText = text.slice(overlapStart, overlapEnd);
|
|
67
|
-
ctx.fillStyle = span.color;
|
|
67
|
+
ctx.fillStyle = span.color === "transparent" ? defaultColor : span.color;
|
|
68
68
|
ctx.fillText(spanText, xPos, y);
|
|
69
69
|
xPos += ctx.measureText(spanText).width;
|
|
70
70
|
pos = overlapEnd;
|
|
@@ -156,111 +156,195 @@ async function extractFontMetadata(page, opList, OPS) {
|
|
|
156
156
|
// ─── Text color extraction ───────────────────────────────────────────────
|
|
157
157
|
|
|
158
158
|
/**
 * Extract text with colors from the operator list.
 *
 * Walks the operator list once, tracking the current fill color, stroke
 * color and text rendering mode, and records one run per text-drawing
 * operator that yields non-empty text.
 *
 * @param {{fnArray: Array, argsArray: Array}} opList - Operator list with
 *   parallel `fnArray` / `argsArray` arrays.
 * @param {object} OPS - Operator code table (setFillRGBColor, showText, ...).
 * @returns {{text: string, color: string}[]} Text runs in operator order,
 *   intended to be matched against getTextContent() items.
 */
function extractTextWithColors(opList, OPS) {
  const textRuns = []; // {text, color}
  let fillColor = "#000000";
  let strokeColor = "#000000";
  let textRenderingMode = 0;

  // A glyph entry is an object carrying a non-empty `unicode` string;
  // numeric spacing entries and anything else are ignored.
  const isGlyph = (g) => Boolean(g && typeof g === "object" && g.unicode);

  // Helper to extract text from a glyph array.
  const glyphsToText = (glyphs) => {
    if (!Array.isArray(glyphs)) return "";
    return glyphs
      .filter(isGlyph)
      .map((g) => g.unicode)
      .join("");
  };

  // Record a run with whichever color is visible for the current state.
  const pushRun = (text) => {
    if (text) {
      textRuns.push({ text, color: visibleColor(fillColor, strokeColor, textRenderingMode) });
    }
  };

  for (let i = 0; i < opList.fnArray.length; i++) {
    const fn = opList.fnArray[i];
    const args = opList.argsArray[i];

    if (fn === OPS.setFillRGBColor) {
      fillColor = argsToHex(args);
    } else if (fn === OPS.setStrokeRGBColor) {
      strokeColor = argsToHex(args);
    } else if (fn === OPS.setTextRenderingMode) {
      textRenderingMode = args[0];
    } else if (
      fn === OPS.showText ||
      fn === OPS.nextLineShowText ||
      fn === OPS.nextLineSetSpacingShowText
    ) {
      pushRun(glyphsToText(args[0]));
    } else if (fn === OPS.showSpacedText) {
      // Entries may be glyph arrays, individual glyph objects, or numeric
      // spacing values. Visit every entry rather than every second one so
      // that text is not lost when strings and spacings do not strictly
      // alternate.
      // NOTE(review): pdf.js appears to normalise this operator into a
      // showText with a flat glyph array before it reaches the operator
      // list; confirm whether this branch is still reachable.
      const entries = args[0];
      if (Array.isArray(entries)) {
        let combinedText = "";
        for (const entry of entries) {
          if (Array.isArray(entry)) {
            combinedText += glyphsToText(entry);
          } else if (isGlyph(entry)) {
            combinedText += entry.unicode;
          }
        }
        pushRun(combinedText);
      }
    }
  }

  return textRuns;
}
|
|
212
|
+
|
|
213
|
+
/**
 * Match text items to colors by content.
 *
 * Each item with visible text is compared (after trimming) against the text
 * runs, scanning forward from just past the previously matched run. A run
 * matches when either trimmed string is a prefix of the other. Items with no
 * match get "#000000"; items without text get `null`.
 *
 * @param {Array<{str?: string}>} textItems - Items to colour.
 * @param {Array<{text: string, color: string}>} textRuns - Runs taken from the operator list.
 * @returns {Array<string|null>} Colors aligned index-for-index with `textItems`.
 */
function matchColorsToTextItems(textItems, textRuns) {
  const FALLBACK_COLOR = "#000000";
  const runCount = textRuns.length;
  const aligned = [];
  let cursor = 0;

  for (const entry of textItems) {
    // Marked-content and whitespace-only entries carry no color.
    if (entry.str === undefined || !entry.str.trim()) {
      aligned.push(null);
      continue;
    }

    const wanted = entry.str.trim();
    let picked = FALLBACK_COLOR;

    // Once every run has been consumed, start scanning from the beginning again.
    let probe = cursor >= runCount ? 0 : cursor;

    while (probe < runCount) {
      const candidate = textRuns[probe].text.trim();
      const matches =
        candidate !== "" &&
        (candidate === wanted ||
          wanted.startsWith(candidate) ||
          candidate.startsWith(wanted));

      if (matches) {
        picked = textRuns[probe].color;
        cursor = probe + 1; // the next item resumes after this run
        break;
      }
      probe += 1;
    }

    aligned.push(picked);
  }

  return aligned;
}
|
|
259
|
+
|
|
260
|
+
/**
 * Collect one visible color per text-drawing operator in the operator list.
 *
 * DEPRECATED: Use extractTextWithColors + matchColorsToTextItems instead.
 *
 * @param {{fnArray: Array, argsArray: Array}} opList - Operator list with
 *   parallel `fnArray` / `argsArray` arrays.
 * @param {object} OPS - Operator code table.
 * @returns {string[]} One color per showText / nextLineShowText /
 *   nextLineSetSpacingShowText / showSpacedText operator, in operator order.
 */
function extractTextItemColors(opList, OPS) {
  const collected = []; // one entry per text-drawing operator
  const state = { fill: "#000000", stroke: "#000000", mode: 0 };

  for (let opIndex = 0; opIndex < opList.fnArray.length; opIndex++) {
    switch (opList.fnArray[opIndex]) {
      case OPS.setFillRGBColor:
        state.fill = argsToHex(opList.argsArray[opIndex]);
        break;
      case OPS.setStrokeRGBColor:
        state.stroke = argsToHex(opList.argsArray[opIndex]);
        break;
      case OPS.setTextRenderingMode:
        state.mode = opList.argsArray[opIndex][0];
        break;
      case OPS.showText:
      case OPS.nextLineShowText:
      case OPS.nextLineSetSpacingShowText:
      case OPS.showSpacedText:
        // Every text-drawing operator contributes exactly one color.
        collected.push(visibleColor(state.fill, state.stroke, state.mode));
        break;
      default:
        break;
    }
  }

  return collected;
}
|
|
293
|
+
|
|
294
|
+
/**
 * Convert color operator args to a hex string.
 *
 * Args may be a hex string (returned unchanged when the first element starts
 * with "#") or an array-like of RGB byte values. Each channel is truncated
 * to an integer and clamped to 0..255 so the result is always a well-formed
 * "#rrggbb" string. Missing args or channels yield "#000000" / "00".
 *
 * @param {ArrayLike<number|string>|null|undefined} args - Color operator arguments.
 * @returns {string} Hex color string.
 */
function argsToHex(args) {
  if (args == null) return "#000000";

  const first = args[0];
  if (typeof first === "string" && first.startsWith("#")) return first;

  // `| 0` truncates and maps NaN/undefined to 0; the clamp keeps every
  // channel to exactly two hex digits.
  const channelToHex = (value) =>
    Math.min(255, Math.max(0, value | 0)).toString(16).padStart(2, "0");

  return `#${channelToHex(first)}${channelToHex(args[1])}${channelToHex(args[2])}`;
}
|
|
300
|
+
|
|
301
|
+
/**
 * Pick the color that will actually be visible for a text rendering mode.
 *
 * Only the lower two bits of `mode` are considered:
 * 0 = fill, 1 = stroke, 2 = fill + stroke, 3 = invisible.
 *
 * @param {string} fill - Current fill color.
 * @param {string} stroke - Current stroke color.
 * @param {number} mode - Text rendering mode.
 * @returns {string} `stroke` for mode 1, "#000000" for mode 3, otherwise `fill`.
 */
function visibleColor(fill, stroke, mode) {
  switch (mode & 3) {
    case 1:
      return stroke;
    case 3:
      // Invisible text is shown as black in reflow.
      return "#000000";
    default:
      // 0 (fill) and 2 (fill + stroke) both show the fill color.
      return fill;
  }
}
|
|
220
308
|
|
|
221
309
|
// ─── Page analysis ────────────────────────────────────────────────────────
|
|
222
310
|
|
|
311
|
+
/**
 * Find an adaptive threshold for grouping text items into blocks.
 *
 * Gaps are normalised to multiples of `fontSize`, extreme outliers are
 * discarded, and a percentile of the sorted ratios is taken and clamped.
 * NOTE(review): the original comment described the clamp as "lower than the
 * paragraph threshold"; that relationship is not visible in this function —
 * confirm against the paragraph-detection code.
 *
 * @param {number[]} gaps - Absolute vertical distances between consecutive text items.
 * @param {number} fontSize - Font size used to normalise the gaps.
 * @param {object} [options] - Optional tuning overrides; the defaults reproduce
 *   the constants this function has always used.
 * @param {number} [options.fallback=2.0] - Returned when there is too little usable data.
 * @param {number} [options.outlierRatio=5] - Ratios at or above this are ignored.
 * @param {number} [options.percentile=0.6] - Percentile of the ratios used as the threshold.
 * @param {number} [options.min=1.5] - Lower clamp of the result.
 * @param {number} [options.max=2.2] - Upper clamp of the result.
 * @returns {number} Threshold expressed as a multiple of `fontSize`.
 */
function findBlockThreshold(gaps, fontSize, options = {}) {
  const {
    fallback = 2.0,
    outlierRatio = 5,
    percentile = 0.6,
    min = 1.5,
    max = 2.2,
  } = options;

  if (!Array.isArray(gaps) || gaps.length < 3) return fallback;
  // A zero, negative or non-finite font size cannot produce meaningful ratios.
  if (!Number.isFinite(fontSize) || fontSize <= 0) return fallback;

  // Normalise once, drop extreme outliers, then sort ascending.
  const ratios = gaps
    .map((g) => g / fontSize)
    .filter((r) => r < outlierRatio)
    .sort((a, b) => a - b);
  if (ratios.length < 3) return fallback;

  // Take the configured percentile (60th by default) as the raw threshold.
  const idx = Math.floor(ratios.length * percentile);
  const threshold = ratios[Math.min(idx, ratios.length - 1)];

  return Math.max(min, Math.min(threshold, max));
}
|
|
334
|
+
|
|
223
335
|
/**
|
|
224
336
|
* Group adjacent text items into text blocks by proximity.
|
|
225
337
|
* Also extracts font metadata: average size, italic, bold.
|
|
226
338
|
*/
|
|
227
|
-
function groupTextBlocks(textItems, pageHeight, styles, fontMap,
|
|
228
|
-
//
|
|
229
|
-
//
|
|
230
|
-
|
|
231
|
-
|
|
232
|
-
|
|
233
|
-
|
|
234
|
-
|
|
235
|
-
|
|
236
|
-
let itemsInCurrentBlock = 0;
|
|
237
|
-
|
|
238
|
-
for (const item of textItems) {
|
|
239
|
-
if (item.str === undefined) continue; // skip marked content
|
|
240
|
-
|
|
241
|
-
const y = item.transform ? item.transform[5] : null;
|
|
242
|
-
const fontHeight = item.transform
|
|
243
|
-
? Math.hypot(item.transform[2], item.transform[3])
|
|
244
|
-
: 12;
|
|
245
|
-
|
|
246
|
-
// Detect text block boundary by position discontinuity
|
|
247
|
-
if (prevY !== null && y !== null && itemsInCurrentBlock > 0) {
|
|
248
|
-
const yDiff = Math.abs(y - prevY);
|
|
249
|
-
const fontChanged = item.fontName !== prevFontName;
|
|
250
|
-
|
|
251
|
-
if (
|
|
252
|
-
(yDiff > fontHeight * 3) ||
|
|
253
|
-
(fontChanged && yDiff > fontHeight * 0.5)
|
|
254
|
-
) {
|
|
255
|
-
blockIdx = Math.min(blockIdx + 1, blockColors.length - 1);
|
|
256
|
-
itemsInCurrentBlock = 0;
|
|
257
|
-
}
|
|
258
|
-
}
|
|
259
|
-
|
|
260
|
-
item._color = blockColors[blockIdx] || "#000000";
|
|
261
|
-
itemsInCurrentBlock++;
|
|
262
|
-
prevY = y;
|
|
263
|
-
prevFontName = item.fontName;
|
|
339
|
+
function groupTextBlocks(textItems, pageHeight, styles, fontMap, textRuns) {
|
|
340
|
+
// Assign colors to text items by matching content from textRuns.
|
|
341
|
+
// textRuns is an array of {text, color} extracted from the operator list.
|
|
342
|
+
if (textRuns && textRuns.length > 0) {
|
|
343
|
+
const colors = matchColorsToTextItems(textItems, textRuns);
|
|
344
|
+
for (let i = 0; i < textItems.length; i++) {
|
|
345
|
+
const item = textItems[i];
|
|
346
|
+
if (item.str === undefined || !item.str.trim()) continue;
|
|
347
|
+
item._color = colors[i] || "#000000";
|
|
264
348
|
}
|
|
265
349
|
}
|
|
266
350
|
|
|
@@ -271,6 +355,23 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
|
|
|
271
355
|
return a.transform[4] - b.transform[4];
|
|
272
356
|
});
|
|
273
357
|
|
|
358
|
+
// First pass: collect all vertical gaps to compute adaptive block threshold
|
|
359
|
+
const gaps = [];
|
|
360
|
+
let lastY = null;
|
|
361
|
+
let lastFontSize = 12;
|
|
362
|
+
for (const item of sorted) {
|
|
363
|
+
const y = pageHeight - item.transform[5];
|
|
364
|
+
const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
|
|
365
|
+
if (fontHeight > 0) lastFontSize = fontHeight;
|
|
366
|
+
if (lastY !== null) {
|
|
367
|
+
gaps.push(Math.abs(y - lastY));
|
|
368
|
+
}
|
|
369
|
+
lastY = y;
|
|
370
|
+
}
|
|
371
|
+
|
|
372
|
+
// Compute adaptive block grouping threshold
|
|
373
|
+
const blockThreshold = findBlockThreshold(gaps, lastFontSize);
|
|
374
|
+
|
|
274
375
|
const blocks = [];
|
|
275
376
|
let current = null;
|
|
276
377
|
|
|
@@ -315,9 +416,10 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
|
|
|
315
416
|
continue;
|
|
316
417
|
}
|
|
317
418
|
|
|
419
|
+
// Use adaptive block threshold instead of fixed 2.5x
|
|
318
420
|
if (
|
|
319
421
|
sizeOk &&
|
|
320
|
-
verticalGap < lastFH *
|
|
422
|
+
verticalGap < lastFH * blockThreshold &&
|
|
321
423
|
x < current.bbox.x + current.bbox.w + lastFH * 1.5
|
|
322
424
|
) {
|
|
323
425
|
current.items.push(item);
|
|
@@ -408,9 +510,7 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
|
|
|
408
510
|
const colorFreq = {};
|
|
409
511
|
for (const item of block.items) {
|
|
410
512
|
const c = item._color || "#000000";
|
|
411
|
-
|
|
412
|
-
colorFreq[c] = (colorFreq[c] || 0) + 1;
|
|
413
|
-
}
|
|
513
|
+
colorFreq[c] = (colorFreq[c] || 0) + 1;
|
|
414
514
|
}
|
|
415
515
|
let dominantColor = "#000000";
|
|
416
516
|
let maxColorFreq = 0;
|
|
@@ -601,29 +701,91 @@ function detectGraphicRegionsFromRender(offCanvas, textBlocks, renderScale) {
|
|
|
601
701
|
return regions;
|
|
602
702
|
}
|
|
603
703
|
|
|
704
|
+
/**
 * Find an adaptive paragraph threshold by analyzing the gap distribution.
 *
 * Gaps are normalised to multiples of `fontSize` and sorted; the widest jump
 * between consecutive ratios is treated as the natural breakpoint between
 * line gaps and paragraph gaps. When no qualifying jump exists, a percentile
 * of the ratios is used instead.
 *
 * @param {number[]} gaps - Absolute vertical distances between consecutive text items.
 * @param {number} fontSize - Font size used to normalise the gaps.
 * @param {object} [options] - Optional tuning overrides; the defaults reproduce
 *   the constants this function has always used.
 * @param {number} [options.fallback=1.8] - Returned for small blocks or unusable data.
 * @param {number} [options.outlierRatio=5] - Ratios at or above this are ignored.
 * @param {number} [options.minJump=0.25] - Minimum jump between ratios to count as a boundary.
 * @param {number} [options.minLineRatio=0.8] - A jump only counts when it starts above this ratio.
 * @param {number} [options.percentile=0.75] - Percentile used when no jump qualifies.
 * @param {number} [options.min=1.25] - Lower clamp of the result.
 * @param {number} [options.max=2.2] - Upper clamp of the result.
 * @returns {number} Threshold expressed as a multiple of `fontSize`.
 */
function findParagraphThreshold(gaps, fontSize, options = {}) {
  const {
    fallback = 1.8,
    outlierRatio = 5,
    minJump = 0.25,
    minLineRatio = 0.8,
    percentile = 0.75,
    min = 1.25,
    max = 2.2,
  } = options;

  if (!Array.isArray(gaps) || gaps.length < 3) return fallback; // Fallback for small blocks
  // A zero, negative or non-finite font size cannot produce meaningful ratios.
  if (!Number.isFinite(fontSize) || fontSize <= 0) return fallback;

  // Convert to font size ratios, drop extreme outliers, and sort.
  const ratios = gaps
    .map((g) => g / fontSize)
    .filter((r) => r < outlierRatio)
    .sort((a, b) => a - b);
  if (ratios.length < 3) return fallback;

  // Find the largest jump between consecutive ratios (the "elbow"). A jump
  // qualifies only when it exceeds `minJump` (0.25 by default) and starts
  // above `minLineRatio` (0.8 by default).
  let widestJump = 0;
  let threshold = fallback;
  for (let i = 0; i < ratios.length - 1; i++) {
    const jump = ratios[i + 1] - ratios[i];
    if (jump > widestJump && jump > minJump && ratios[i] > minLineRatio) {
      widestJump = jump;
      threshold = (ratios[i] + ratios[i + 1]) / 2;
    }
  }

  // If no clear cluster boundary was found, use the percentile-based approach.
  if (widestJump === 0) {
    const idx = Math.floor(ratios.length * percentile);
    threshold = ratios[Math.min(idx, ratios.length - 1)];
  }

  // Clamp to the configured range for paragraph detection.
  return Math.max(min, Math.min(threshold, max));
}
|
|
743
|
+
|
|
604
744
|
/**
|
|
605
745
|
* Build text content for a block, preserving paragraph breaks.
|
|
606
746
|
*/
|
|
607
747
|
function blockToText(block, pageHeight) {
|
|
608
|
-
|
|
748
|
+
// First pass: collect all gaps and font sizes to compute adaptive threshold
|
|
749
|
+
const gaps = [];
|
|
609
750
|
let lastY = null;
|
|
610
|
-
let lastX = null;
|
|
611
|
-
let lastW = 0;
|
|
612
751
|
let lastFontSize = 12;
|
|
613
752
|
|
|
614
753
|
for (const item of block.items) {
|
|
615
754
|
if (!item.str) continue;
|
|
616
|
-
const currentX = item.transform[4];
|
|
617
755
|
const currentY = pageHeight - item.transform[5];
|
|
618
756
|
const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
|
|
619
757
|
if (fontHeight > 0) lastFontSize = fontHeight;
|
|
620
758
|
|
|
759
|
+
if (lastY !== null) {
|
|
760
|
+
const vGap = Math.abs(currentY - lastY);
|
|
761
|
+
gaps.push(vGap);
|
|
762
|
+
}
|
|
763
|
+
lastY = currentY;
|
|
764
|
+
}
|
|
765
|
+
|
|
766
|
+
// Compute adaptive paragraph threshold
|
|
767
|
+
const paraThreshold = findParagraphThreshold(gaps, lastFontSize);
|
|
768
|
+
const lineThreshold = lastFontSize * 0.3; // Keep fixed line threshold
|
|
769
|
+
|
|
770
|
+
// Second pass: build text with adaptive threshold
|
|
771
|
+
let result = "";
|
|
772
|
+
lastY = null;
|
|
773
|
+
let lastX = null;
|
|
774
|
+
let lastW = 0;
|
|
775
|
+
|
|
776
|
+
for (const item of block.items) {
|
|
777
|
+
if (!item.str) continue;
|
|
778
|
+
const currentX = item.transform[4];
|
|
779
|
+
const currentY = pageHeight - item.transform[5];
|
|
780
|
+
|
|
621
781
|
if (lastY !== null) {
|
|
622
782
|
const vGap = Math.abs(currentY - lastY);
|
|
623
783
|
const isShortItem = (item.str || "").trim().length <= 2;
|
|
624
|
-
|
|
784
|
+
|
|
785
|
+
// Use adaptive threshold for paragraph detection
|
|
786
|
+
if (vGap > lastFontSize * paraThreshold && !isShortItem) {
|
|
625
787
|
result += "\n\n";
|
|
626
|
-
} else if (vGap >
|
|
788
|
+
} else if (vGap > lineThreshold) {
|
|
627
789
|
// Different line — insert space
|
|
628
790
|
if (!result.endsWith(" ") && !result.endsWith("\n")) {
|
|
629
791
|
result += " ";
|
|
@@ -916,11 +1078,11 @@ async function analyzePage(page, OPS) {
|
|
|
916
1078
|
// Extract real font metadata from commonObjs (bold, italic, weight, loadedName)
|
|
917
1079
|
const fontMap = await extractFontMetadata(page, opList, OPS);
|
|
918
1080
|
|
|
919
|
-
// Extract text colors
|
|
920
|
-
const
|
|
1081
|
+
// Extract text with colors from operator list
|
|
1082
|
+
const textRuns = extractTextWithColors(opList, OPS);
|
|
921
1083
|
|
|
922
|
-
// Now group text blocks with real font data and
|
|
923
|
-
const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap,
|
|
1084
|
+
// Now group text blocks with real font data and matched colors
|
|
1085
|
+
const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap, textRuns);
|
|
924
1086
|
|
|
925
1087
|
// Compute body font size (most common size = body text)
|
|
926
1088
|
const allSizes = textBlocks.map(b => Math.round(b.avgFontSize * 10) / 10);
|