pretext-pdfjs 0.2.0 → 0.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/reflow.js +37 -9
package/package.json
CHANGED
package/src/reflow.js
CHANGED
|
@@ -61,12 +61,20 @@ function groupTextBlocks(textItems, pageHeight, styles) {
|
|
|
61
61
|
const verticalGap = Math.abs(y - lastY);
|
|
62
62
|
|
|
63
63
|
// Split block on significant font size change (headings vs body)
|
|
64
|
+
// But don't split for superscripts/markers that are horizontally adjacent
|
|
64
65
|
const sizeRatio = fontHeight > 0 && lastFH > 0
|
|
65
66
|
? Math.max(fontHeight, lastFH) / Math.min(fontHeight, lastFH)
|
|
66
67
|
: 1;
|
|
68
|
+
const lastX = lastItem.transform[4];
|
|
69
|
+
const lastW = lastItem.width || lastFH;
|
|
70
|
+
const hGap = x - (lastX + lastW);
|
|
71
|
+
const isHorizAdjacent = hGap < lastFH * 0.5 && hGap > -lastFH;
|
|
72
|
+
const isShortItem = (item.str || "").trim().length <= 2;
|
|
73
|
+
const isSuperscript = isShortItem && isHorizAdjacent && sizeRatio > 1.3;
|
|
74
|
+
const sizeOk = sizeRatio < 1.3 || isSuperscript;
|
|
67
75
|
|
|
68
76
|
if (
|
|
69
|
-
|
|
77
|
+
sizeOk &&
|
|
70
78
|
verticalGap < lastFH * 2.5 &&
|
|
71
79
|
x < current.bbox.x + current.bbox.w + lastFH * 2
|
|
72
80
|
) {
|
|
@@ -248,11 +256,33 @@ function buildRegionMap(textBlocks, graphicRegions, pageHeight) {
|
|
|
248
256
|
}
|
|
249
257
|
}
|
|
250
258
|
|
|
251
|
-
//
|
|
252
|
-
regions.
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
259
|
+
// Detect columns: find the midpoint X where blocks cluster on left vs right
|
|
260
|
+
const pageWidth = Math.max(...regions.map(r => r.bbox.x + r.bbox.w), 1);
|
|
261
|
+
const midX = pageWidth / 2;
|
|
262
|
+
const leftBlocks = regions.filter(r => r.bbox.x + r.bbox.w / 2 < midX);
|
|
263
|
+
const rightBlocks = regions.filter(r => r.bbox.x + r.bbox.w / 2 >= midX);
|
|
264
|
+
const hasColumns = leftBlocks.length > 2 && rightBlocks.length > 2 &&
|
|
265
|
+
rightBlocks.some(r => leftBlocks.some(l => Math.abs(l.bbox.y - r.bbox.y) < 20));
|
|
266
|
+
|
|
267
|
+
if (hasColumns) {
|
|
268
|
+
// Two-column: sort each column top-to-bottom, then concatenate
|
|
269
|
+
// Full-width blocks (spanning > 60% of page) go first, sorted by Y
|
|
270
|
+
const fullWidth = regions.filter(r => r.bbox.w > pageWidth * 0.6);
|
|
271
|
+
const leftCol = regions.filter(r => r.bbox.w <= pageWidth * 0.6 && r.bbox.x + r.bbox.w / 2 < midX);
|
|
272
|
+
const rightCol = regions.filter(r => r.bbox.w <= pageWidth * 0.6 && r.bbox.x + r.bbox.w / 2 >= midX);
|
|
273
|
+
const byY = (a, b) => a.bbox.y - b.bbox.y;
|
|
274
|
+
fullWidth.sort(byY);
|
|
275
|
+
leftCol.sort(byY);
|
|
276
|
+
rightCol.sort(byY);
|
|
277
|
+
regions.length = 0;
|
|
278
|
+
regions.push(...fullWidth, ...leftCol, ...rightCol);
|
|
279
|
+
} else {
|
|
280
|
+
// Single column: sort by Y then X
|
|
281
|
+
regions.sort((a, b) => {
|
|
282
|
+
if (Math.abs(a.bbox.y - b.bbox.y) > 10) return a.bbox.y - b.bbox.y;
|
|
283
|
+
return a.bbox.x - b.bbox.x;
|
|
284
|
+
});
|
|
285
|
+
}
|
|
256
286
|
|
|
257
287
|
return regions;
|
|
258
288
|
}
|
|
@@ -363,9 +393,7 @@ function reflowAndComposite(analysis, opts) {
|
|
|
363
393
|
const blockFontSize = Math.round(fontSize * (block.fontScale || 1));
|
|
364
394
|
const blockLH = blockFontSize * lineHeight;
|
|
365
395
|
const style = block.isItalic ? "italic" : "normal";
|
|
366
|
-
|
|
367
|
-
const scale = block.fontScale || 1;
|
|
368
|
-
const weight = block.isBold ? 700 : scale > 1.8 ? 300 : scale > 1.3 ? 400 : 400;
|
|
396
|
+
const weight = block.isBold ? 700 : 400;
|
|
369
397
|
// Use PDF's detected font family if available, otherwise fall back to configured
|
|
370
398
|
const blockFamily = block.pdfFontFamily
|
|
371
399
|
? `${block.pdfFontFamily}, ${fontFamily}`
|