pretext-pdfjs 0.3.0 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/reflow.js +116 -9
package/package.json
CHANGED
package/src/reflow.js
CHANGED
|
@@ -80,13 +80,70 @@ async function extractFontMetadata(page, opList, OPS) {
|
|
|
80
80
|
return fontMap;
|
|
81
81
|
}
|
|
82
82
|
|
|
83
|
+
// ─── Text color extraction ───────────────────────────────────────────────
|
|
84
|
+
|
|
85
|
+
/**
 * Collect the fill color in effect at every text-drawing operator.
 *
 * Walks the operator list once, tracking the current fill color, and records
 * that color each time a text-showing operator is encountered. The fill color
 * is treated as part of the graphics state: it is pushed on `OPS.save` and
 * popped on `OPS.restore`, so a color set inside a save/restore pair does not
 * leak onto text drawn after the restore.
 *
 * NOTE(review): callers treat the result as parallel to the items returned by
 * getTextContent(); this function only guarantees one entry per text-drawing
 * operator — confirm the two sequences actually line up.
 *
 * @param {{fnArray: number[], argsArray: Array<ArrayLike<*>|null>}} opList
 *   Operator list (function ids and their arguments, index-aligned).
 * @param {Object<string, number>} OPS - Operator id table.
 * @returns {string[]} One color per text-drawing operator, in order: whatever
 *   string the fill-color operator carried (normally "#rrggbb"), or
 *   "transparent".
 */
function extractTextColors(opList, OPS) {
  const DEFAULT_COLOR = "#000000";
  const textColors = [];
  // Saved fill colors for nested save/restore pairs.
  const colorStack = [];
  let currentColor = DEFAULT_COLOR;

  const textDrawOps = new Set([
    OPS.showText,
    OPS.showSpacedText,
    OPS.nextLineShowText,
    OPS.nextLineSetSpacingShowText,
  ]);

  // Fill operators that are only honoured when they carry a "#…" hex string.
  const hexOnlyFillOps = new Set([
    OPS.setFillGray,
    OPS.setFillColor,
    OPS.setFillCMYKColor,
    OPS.setFillColorN,
  ]);

  // Clamp a numeric component to 0–255 and render it as two hex digits.
  const toHexByte = (value) =>
    Math.min(255, Math.max(0, Math.round(value)))
      .toString(16)
      .padStart(2, "0");

  // Turn setFillRGBColor arguments into a color string, or null if unusable.
  const rgbArgsToColor = (args) => {
    if (args == null || args.length === 0) return null;
    if (typeof args[0] === "string") return args[0];
    // Presumably some pdf.js builds pass three numeric components instead of
    // a hex string; assumes they are 0–255 bytes — TODO confirm.
    if (args.length >= 3) {
      const rgb = Array.from(args).slice(0, 3);
      if (rgb.every((v) => Number.isFinite(v))) {
        return `#${rgb.map(toHexByte).join("")}`;
      }
    }
    return null;
  };

  const fnArray = opList?.fnArray ?? [];
  const argsArray = opList?.argsArray ?? [];

  for (let i = 0; i < fnArray.length; i++) {
    const fn = fnArray[i];
    const args = argsArray[i];

    if (fn === OPS.save) {
      colorStack.push(currentColor);
    } else if (fn === OPS.restore) {
      // An unbalanced restore leaves the current color untouched.
      if (colorStack.length > 0) {
        currentColor = colorStack.pop();
      }
    } else if (fn === OPS.setFillRGBColor) {
      // Keep the previous color when the arguments are missing or malformed,
      // so downstream code always receives a string.
      currentColor = rgbArgsToColor(args) ?? currentColor;
    } else if (fn === OPS.setFillTransparent) {
      currentColor = "transparent";
    } else if (hexOnlyFillOps.has(fn)) {
      const first = args?.[0];
      if (typeof first === "string" && first.startsWith("#")) {
        currentColor = first;
      }
    }

    if (textDrawOps.has(fn)) {
      textColors.push(currentColor);
    }
  }

  return textColors;
}
|
|
128
|
+
|
|
83
129
|
// ─── Page analysis ────────────────────────────────────────────────────────
|
|
84
130
|
|
|
85
131
|
/**
|
|
86
132
|
* Group adjacent text items into text blocks by proximity.
|
|
87
133
|
* Also extracts font metadata: average size, italic, bold.
|
|
88
134
|
*/
|
|
89
|
-
function groupTextBlocks(textItems, pageHeight, styles, fontMap) {
|
|
135
|
+
function groupTextBlocks(textItems, pageHeight, styles, fontMap, textColors) {
|
|
136
|
+
// Attach colors to text items before filtering (textColors is parallel to
|
|
137
|
+
// the full items array from getTextContent, including empty items)
|
|
138
|
+
if (textColors) {
|
|
139
|
+
let colorIdx = 0;
|
|
140
|
+
for (const item of textItems) {
|
|
141
|
+
if (item.str !== undefined) {
|
|
142
|
+
item._color = textColors[colorIdx++] || "#000000";
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
|
|
90
147
|
const sorted = [...textItems].filter(i => i.str?.trim()).sort((a, b) => {
|
|
91
148
|
const ay = pageHeight - a.transform[5];
|
|
92
149
|
const by = pageHeight - b.transform[5];
|
|
@@ -226,6 +283,24 @@ function groupTextBlocks(textItems, pageHeight, styles, fontMap) {
|
|
|
226
283
|
|
|
227
284
|
// Store the font metadata for the dominant font in this block
|
|
228
285
|
block.fontMeta = fontMap?.get(block.items[0]?.fontName) || null;
|
|
286
|
+
|
|
287
|
+
// Compute dominant fill color for the block
|
|
288
|
+
const colorFreq = {};
|
|
289
|
+
for (const item of block.items) {
|
|
290
|
+
const c = item._color || "#000000";
|
|
291
|
+
if (c !== "transparent") {
|
|
292
|
+
colorFreq[c] = (colorFreq[c] || 0) + 1;
|
|
293
|
+
}
|
|
294
|
+
}
|
|
295
|
+
let dominantColor = "#000000";
|
|
296
|
+
let maxColorFreq = 0;
|
|
297
|
+
for (const [c, freq] of Object.entries(colorFreq)) {
|
|
298
|
+
if (freq > maxColorFreq) {
|
|
299
|
+
maxColorFreq = freq;
|
|
300
|
+
dominantColor = c;
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
block.color = dominantColor;
|
|
229
304
|
}
|
|
230
305
|
|
|
231
306
|
return blocks;
|
|
@@ -389,26 +464,40 @@ function detectGraphicRegionsFromRender(offCanvas, textBlocks, renderScale) {
|
|
|
389
464
|
function blockToText(block, pageHeight) {
|
|
390
465
|
let result = "";
|
|
391
466
|
let lastY = null;
|
|
467
|
+
let lastX = null;
|
|
468
|
+
let lastW = 0;
|
|
392
469
|
let lastFontSize = 12;
|
|
393
470
|
|
|
394
471
|
for (const item of block.items) {
|
|
395
472
|
if (!item.str) continue;
|
|
473
|
+
const currentX = item.transform[4];
|
|
396
474
|
const currentY = pageHeight - item.transform[5];
|
|
397
475
|
const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
|
|
398
476
|
if (fontHeight > 0) lastFontSize = fontHeight;
|
|
399
477
|
|
|
400
478
|
if (lastY !== null) {
|
|
401
|
-
const
|
|
479
|
+
const vGap = Math.abs(currentY - lastY);
|
|
402
480
|
const isShortItem = (item.str || "").trim().length <= 2;
|
|
403
|
-
if (
|
|
481
|
+
if (vGap > lastFontSize * 1.8 && !isShortItem) {
|
|
404
482
|
result += "\n\n";
|
|
405
|
-
} else if (
|
|
483
|
+
} else if (vGap > lastFontSize * 0.3) {
|
|
484
|
+
// Different line — insert space
|
|
406
485
|
if (!result.endsWith(" ") && !result.endsWith("\n")) {
|
|
407
486
|
result += " ";
|
|
408
487
|
}
|
|
488
|
+
} else if (lastX !== null) {
|
|
489
|
+
// Same line — check horizontal gap between items
|
|
490
|
+
const hGap = currentX - (lastX + lastW);
|
|
491
|
+
if (hGap > lastFontSize * 0.15) {
|
|
492
|
+
if (!result.endsWith(" ") && !result.endsWith("\n")) {
|
|
493
|
+
result += " ";
|
|
494
|
+
}
|
|
495
|
+
}
|
|
409
496
|
}
|
|
410
497
|
}
|
|
411
498
|
lastY = currentY;
|
|
499
|
+
lastX = currentX;
|
|
500
|
+
lastW = item.width || 0;
|
|
412
501
|
result += item.str;
|
|
413
502
|
}
|
|
414
503
|
return result.trim();
|
|
@@ -658,8 +747,11 @@ async function analyzePage(page, OPS) {
|
|
|
658
747
|
// Extract real font metadata from commonObjs (bold, italic, weight, loadedName)
|
|
659
748
|
const fontMap = await extractFontMetadata(page, opList, OPS);
|
|
660
749
|
|
|
661
|
-
//
|
|
662
|
-
const
|
|
750
|
+
// Extract text colors from operator list (parallel to text items)
|
|
751
|
+
const textColors = extractTextColors(opList, OPS);
|
|
752
|
+
|
|
753
|
+
// Now group text blocks with real font data and colors
|
|
754
|
+
const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap, textColors);
|
|
663
755
|
|
|
664
756
|
// Compute body font size (most common size = body text)
|
|
665
757
|
const allSizes = textBlocks.map(b => Math.round(b.avgFontSize * 10) / 10);
|
|
@@ -784,6 +876,7 @@ function reflowAndComposite(analysis, opts) {
|
|
|
784
876
|
fontWeight: weight,
|
|
785
877
|
fontFamily: blockFamily,
|
|
786
878
|
align: block.align || "left",
|
|
879
|
+
color: block.color,
|
|
787
880
|
region,
|
|
788
881
|
});
|
|
789
882
|
} else {
|
|
@@ -958,7 +1051,7 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
958
1051
|
const availW = W - padding * 2;
|
|
959
1052
|
|
|
960
1053
|
if (!enableMorph) {
|
|
961
|
-
ctx.fillStyle = textColor;
|
|
1054
|
+
ctx.fillStyle = r.color || textColor;
|
|
962
1055
|
ctx.font = `${style} ${weight} ${fs * d}px ${rFamily}`;
|
|
963
1056
|
}
|
|
964
1057
|
|
|
@@ -976,10 +1069,24 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
976
1069
|
const ease = 1 - (1 - t) ** 3;
|
|
977
1070
|
const morphedFS = fs * (1 - ease * (1 - edgeFontRatio));
|
|
978
1071
|
const opacity = 1.0 + (0.2 - 1.0) * ease;
|
|
979
|
-
|
|
1072
|
+
// Blend the block's actual color toward gray at edges
|
|
1073
|
+
const blockColor = r.color || textColor;
|
|
1074
|
+
let morphColor;
|
|
1075
|
+
if (blockColor.startsWith("#") && blockColor.length === 7) {
|
|
1076
|
+
const br = parseInt(blockColor.slice(1, 3), 16);
|
|
1077
|
+
const bg_ = parseInt(blockColor.slice(3, 5), 16);
|
|
1078
|
+
const bb = parseInt(blockColor.slice(5, 7), 16);
|
|
1079
|
+
const dimR = Math.round(br + (160 - br) * ease);
|
|
1080
|
+
const dimG = Math.round(bg_ + (160 - bg_) * ease);
|
|
1081
|
+
const dimB = Math.round(bb + (160 - bb) * ease);
|
|
1082
|
+
morphColor = `rgb(${dimR},${dimG},${dimB})`;
|
|
1083
|
+
} else {
|
|
1084
|
+
const c = Math.round(37 - (37 - 160) * ease);
|
|
1085
|
+
morphColor = `rgb(${c},${c - 2},${c - 3})`;
|
|
1086
|
+
}
|
|
980
1087
|
ctx.save();
|
|
981
1088
|
ctx.globalAlpha = opacity;
|
|
982
|
-
ctx.fillStyle =
|
|
1089
|
+
ctx.fillStyle = morphColor;
|
|
983
1090
|
ctx.font = `${style} ${weight} ${morphedFS * d}px ${rFamily}`;
|
|
984
1091
|
if (centered) {
|
|
985
1092
|
ctx.textAlign = "center";
|