pretext-pdfjs 0.3.2 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/reflow.js +92 -37
package/package.json
CHANGED
package/src/reflow.js
CHANGED
|
@@ -156,25 +156,26 @@ async function extractFontMetadata(page, opList, OPS) {
|
|
|
156
156
|
// ─── Text color extraction ───────────────────────────────────────────────
|
|
157
157
|
|
|
158
158
|
/**
|
|
159
|
-
* Extract fill colors
|
|
160
|
-
*
|
|
161
|
-
*
|
|
162
|
-
*
|
|
159
|
+
* Extract fill colors per beginText/endText block pair.
|
|
160
|
+
* Returns an array of color strings, one per text block.
|
|
161
|
+
*
|
|
162
|
+
* The previous approach pushed one color per text-drawing operator (showText,
|
|
163
|
+
* showSpacedText, etc.) and tried to index into text items 1:1. That mapping
|
|
164
|
+
* is broken because a single showSpacedText operator can produce multiple text
|
|
165
|
+
* items via buildTextContentItem(). Instead, we track color at text-block
|
|
166
|
+
* boundaries — all text items within the same beginText/endText pair share
|
|
167
|
+
* the same color context.
|
|
163
168
|
*/
|
|
164
|
-
function
|
|
165
|
-
const
|
|
169
|
+
function extractTextBlockColors(opList, OPS) {
|
|
170
|
+
const blockColors = []; // one entry per beginText/endText pair
|
|
166
171
|
let currentColor = "#000000";
|
|
167
|
-
|
|
168
|
-
|
|
169
|
-
OPS.showText,
|
|
170
|
-
OPS.showSpacedText,
|
|
171
|
-
OPS.nextLineShowText,
|
|
172
|
-
OPS.nextLineSetSpacingShowText,
|
|
173
|
-
]);
|
|
172
|
+
let blockColor = "#000000";
|
|
173
|
+
let inTextBlock = false;
|
|
174
174
|
|
|
175
175
|
for (let i = 0; i < opList.fnArray.length; i++) {
|
|
176
176
|
const fn = opList.fnArray[i];
|
|
177
177
|
|
|
178
|
+
// Track color changes
|
|
178
179
|
if (fn === OPS.setFillRGBColor) {
|
|
179
180
|
currentColor = opList.argsArray[i][0];
|
|
180
181
|
} else if (fn === OPS.setFillTransparent) {
|
|
@@ -191,12 +192,30 @@ function extractTextColors(opList, OPS) {
|
|
|
191
192
|
}
|
|
192
193
|
}
|
|
193
194
|
|
|
194
|
-
if (
|
|
195
|
-
|
|
195
|
+
if (fn === OPS.beginText) {
|
|
196
|
+
inTextBlock = true;
|
|
197
|
+
blockColor = currentColor; // color at start of block
|
|
198
|
+
}
|
|
199
|
+
|
|
200
|
+
// If color changes within a text block, update (last color wins)
|
|
201
|
+
if (inTextBlock && (
|
|
202
|
+
fn === OPS.setFillRGBColor ||
|
|
203
|
+
fn === OPS.setFillTransparent ||
|
|
204
|
+
fn === OPS.setFillGray ||
|
|
205
|
+
fn === OPS.setFillColor ||
|
|
206
|
+
fn === OPS.setFillCMYKColor ||
|
|
207
|
+
fn === OPS.setFillColorN
|
|
208
|
+
)) {
|
|
209
|
+
blockColor = currentColor;
|
|
210
|
+
}
|
|
211
|
+
|
|
212
|
+
if (fn === OPS.endText) {
|
|
213
|
+
blockColors.push(blockColor);
|
|
214
|
+
inTextBlock = false;
|
|
196
215
|
}
|
|
197
216
|
}
|
|
198
217
|
|
|
199
|
-
return
|
|
218
|
+
return blockColors;
|
|
200
219
|
}
|
|
201
220
|
|
|
202
221
|
// ─── Page analysis ────────────────────────────────────────────────────────
|
|
@@ -205,15 +224,43 @@ function extractTextColors(opList, OPS) {
|
|
|
205
224
|
* Group adjacent text items into text blocks by proximity.
|
|
206
225
|
* Also extracts font metadata: average size, italic, bold.
|
|
207
226
|
*/
|
|
208
|
-
function groupTextBlocks(textItems, pageHeight, styles, fontMap,
|
|
209
|
-
//
|
|
210
|
-
//
|
|
211
|
-
|
|
212
|
-
|
|
227
|
+
function groupTextBlocks(textItems, pageHeight, styles, fontMap, blockColors) {
|
|
228
|
+
// Map text items to beginText/endText blocks by detecting position
|
|
229
|
+
// discontinuities. Items within the same text block are contiguous and
|
|
230
|
+
// share the same color. When there's a large Y-position jump or font
|
|
231
|
+
// change with a Y shift over half the font height, we advance to the next block's color.
|
|
232
|
+
if (blockColors && blockColors.length > 0) {
|
|
233
|
+
let blockIdx = 0;
|
|
234
|
+
let prevY = null;
|
|
235
|
+
let prevFontName = null;
|
|
236
|
+
let itemsInCurrentBlock = 0;
|
|
237
|
+
|
|
213
238
|
for (const item of textItems) {
|
|
214
|
-
if (item.str
|
|
215
|
-
|
|
239
|
+
if (item.str === undefined) continue; // skip marked content
|
|
240
|
+
|
|
241
|
+
const y = item.transform ? item.transform[5] : null;
|
|
242
|
+
const fontHeight = item.transform
|
|
243
|
+
? Math.hypot(item.transform[2], item.transform[3])
|
|
244
|
+
: 12;
|
|
245
|
+
|
|
246
|
+
// Detect text block boundary by position discontinuity
|
|
247
|
+
if (prevY !== null && y !== null && itemsInCurrentBlock > 0) {
|
|
248
|
+
const yDiff = Math.abs(y - prevY);
|
|
249
|
+
const fontChanged = item.fontName !== prevFontName;
|
|
250
|
+
|
|
251
|
+
if (
|
|
252
|
+
(yDiff > fontHeight * 3) ||
|
|
253
|
+
(fontChanged && yDiff > fontHeight * 0.5)
|
|
254
|
+
) {
|
|
255
|
+
blockIdx = Math.min(blockIdx + 1, blockColors.length - 1);
|
|
256
|
+
itemsInCurrentBlock = 0;
|
|
257
|
+
}
|
|
216
258
|
}
|
|
259
|
+
|
|
260
|
+
item._color = blockColors[blockIdx] || "#000000";
|
|
261
|
+
itemsInCurrentBlock++;
|
|
262
|
+
prevY = y;
|
|
263
|
+
prevFontName = item.fontName;
|
|
217
264
|
}
|
|
218
265
|
}
|
|
219
266
|
|
|
@@ -754,16 +801,20 @@ function buildRegionMap(textBlocks, graphicRegions, pageHeight) {
|
|
|
754
801
|
curr.gapBefore = Math.max(0, currTop - prevBottom);
|
|
755
802
|
}
|
|
756
803
|
if (regions.length > 0) {
|
|
757
|
-
regions[0].gapBefore =
|
|
804
|
+
regions[0].gapBefore = 0; // padding handles top margin
|
|
758
805
|
}
|
|
759
806
|
|
|
760
|
-
//
|
|
807
|
+
// Store absolute pixel gaps and compute body font size for scaling
|
|
761
808
|
const bodyBlocks = regions.filter(r =>
|
|
762
809
|
r.type === "text" && r.block?.fontScale && Math.abs(r.block.fontScale - 1) < 0.15);
|
|
763
|
-
const
|
|
764
|
-
? bodyBlocks.reduce((s, r) => s + r.block.avgFontSize
|
|
765
|
-
: 12
|
|
810
|
+
const avgBodyFontSize = bodyBlocks.length > 0
|
|
811
|
+
? bodyBlocks.reduce((s, r) => s + r.block.avgFontSize, 0) / bodyBlocks.length
|
|
812
|
+
: 12;
|
|
766
813
|
for (const region of regions) {
|
|
814
|
+
region.gapAbsolute = region.gapBefore || 0;
|
|
815
|
+
region._avgBodyFontSize = avgBodyFontSize;
|
|
816
|
+
// Keep gapRatio as fallback for any code that reads it
|
|
817
|
+
const avgBodyLH = avgBodyFontSize * 1.6;
|
|
767
818
|
region.gapRatio = (region.gapBefore || 0) / avgBodyLH;
|
|
768
819
|
}
|
|
769
820
|
|
|
@@ -865,11 +916,11 @@ async function analyzePage(page, OPS) {
|
|
|
865
916
|
// Extract real font metadata from commonObjs (bold, italic, weight, loadedName)
|
|
866
917
|
const fontMap = await extractFontMetadata(page, opList, OPS);
|
|
867
918
|
|
|
868
|
-
// Extract text colors
|
|
869
|
-
const
|
|
919
|
+
// Extract text colors per beginText/endText block (not per operator)
|
|
920
|
+
const blockColors = extractTextBlockColors(opList, OPS);
|
|
870
921
|
|
|
871
|
-
// Now group text blocks with real font data and colors
|
|
872
|
-
const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap,
|
|
922
|
+
// Now group text blocks with real font data and block colors
|
|
923
|
+
const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap, blockColors);
|
|
873
924
|
|
|
874
925
|
// Compute body font size (most common size = body text)
|
|
875
926
|
const allSizes = textBlocks.map(b => Math.round(b.avgFontSize * 10) / 10);
|
|
@@ -1028,13 +1079,15 @@ function reflowAndComposite(analysis, opts) {
|
|
|
1028
1079
|
}
|
|
1029
1080
|
}
|
|
1030
1081
|
|
|
1031
|
-
// Total height — use
|
|
1082
|
+
// Total height — use absolute pixel gaps scaled by font ratio
|
|
1032
1083
|
const baseLH = fontSize * lineHeight;
|
|
1033
1084
|
let totalHeight = padding;
|
|
1034
1085
|
for (const r of reflowedRegions) {
|
|
1035
1086
|
totalHeight += r.height;
|
|
1036
|
-
const
|
|
1037
|
-
|
|
1087
|
+
const gapAbs = r.region?.gapAbsolute ?? 0;
|
|
1088
|
+
const bodyFS = r.region?._avgBodyFontSize || 12;
|
|
1089
|
+
const scaledGap = gapAbs * (fontSize / bodyFS);
|
|
1090
|
+
totalHeight += Math.max(4, Math.min(scaledGap, baseLH * 2.0));
|
|
1038
1091
|
}
|
|
1039
1092
|
totalHeight += padding;
|
|
1040
1093
|
|
|
@@ -1274,8 +1327,10 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
1274
1327
|
}
|
|
1275
1328
|
cursorY += r.drawH;
|
|
1276
1329
|
}
|
|
1277
|
-
const
|
|
1278
|
-
|
|
1330
|
+
const gapAbs = r.region?.gapAbsolute ?? 0;
|
|
1331
|
+
const bodyFS = r.region?._avgBodyFontSize || 12;
|
|
1332
|
+
const scaledGap = gapAbs * (fontSize / bodyFS);
|
|
1333
|
+
cursorY += Math.max(4, Math.min(scaledGap, baseLH * 2.0));
|
|
1279
1334
|
}
|
|
1280
1335
|
}
|
|
1281
1336
|
|