pretext-pdfjs 0.2.1 → 0.3.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +47 -9
- package/package.json +2 -2
- package/src/pinch.js +40 -2
- package/src/reflow.js +633 -140
package/README.md
CHANGED
|
@@ -86,7 +86,40 @@ await TextLayer.enableReflow(container, fullText, {
|
|
|
86
86
|
});
|
|
87
87
|
```
|
|
88
88
|
|
|
89
|
-
###
|
|
89
|
+
### Reflow Mode (images preserved)
|
|
90
|
+
|
|
91
|
+
```js
|
|
92
|
+
import { createReflowRenderer } from "pretext-pdfjs/reflow";
|
|
93
|
+
|
|
94
|
+
const renderer = createReflowRenderer(container, {
|
|
95
|
+
fontSize: 16,
|
|
96
|
+
enablePinchZoom: true,
|
|
97
|
+
enableMorph: false, // set true for fisheye scroll
|
|
98
|
+
fontFamily: '"Literata", Georgia, serif',
|
|
99
|
+
});
|
|
100
|
+
await renderer.open("document.pdf");
|
|
101
|
+
await renderer.showPage(1);
|
|
102
|
+
// Pinch to zoom — text reflows, images stay in place
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Unlike the text-only reader modes, reflow mode preserves images, vector graphics,
|
|
106
|
+
and document structure. It uses PDF.js's `operationsFilter` to render non-text
|
|
107
|
+
elements separately, then composites Pretext-reflowed text on top.
|
|
108
|
+
|
|
109
|
+
### Pinch reader with preserved layout
|
|
110
|
+
|
|
111
|
+
```js
|
|
112
|
+
import { createPDFPinchReader } from "pretext-pdfjs/pinch";
|
|
113
|
+
|
|
114
|
+
const reader = createPDFPinchReader(container, {
|
|
115
|
+
mode: "pinchType",
|
|
116
|
+
preserveLayout: true, // images stay in place
|
|
117
|
+
});
|
|
118
|
+
await reader.open("document.pdf");
|
|
119
|
+
await reader.showPage(1);
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Per-block reflow (full options)
|
|
90
123
|
|
|
91
124
|
The reflow module bridges PDF mode (images preserved, no reflow) and reader modes (text reflows, images stripped). Text blocks reflow with Pretext at the target font size while images and vector graphics render as scaled bitmaps in their original positions.
|
|
92
125
|
|
|
@@ -101,8 +134,12 @@ const renderer = createReflowRenderer(container, {
|
|
|
101
134
|
background: "#f4f1eb",
|
|
102
135
|
textColor: "#252320",
|
|
103
136
|
imageFit: "proportional", // "proportional" | "original" | "full-width"
|
|
137
|
+
maxWidth: Infinity, // max canvas width (default: full container)
|
|
104
138
|
enablePinchZoom: true,
|
|
105
139
|
enableMomentumScroll: true,
|
|
140
|
+
enableMorph: false, // fisheye scroll effect on text + images
|
|
141
|
+
morphRadius: 300, // morph effect radius in px
|
|
142
|
+
edgeFontRatio: 0.5, // edge font = 50% of center font
|
|
106
143
|
onZoom: (fontSize) => console.log("Font size:", fontSize),
|
|
107
144
|
onPageReady: ({ pageNum, textBlocks, graphicRegions }) => {
|
|
108
145
|
console.log(`Page ${pageNum}: ${textBlocks.length} text blocks, ${graphicRegions.length} graphics`);
|
|
@@ -127,9 +164,9 @@ renderer.destroy();
|
|
|
127
164
|
|
|
128
165
|
**How it works:**
|
|
129
166
|
|
|
130
|
-
1. **Analyze** — extracts text blocks (grouped by proximity) and graphic regions (images, vector paths) from the PDF page via `getTextContent()` and `getOperatorList()`.
|
|
167
|
+
1. **Analyze** — extracts text blocks (grouped by proximity) and graphic regions (images, vector paths) from the PDF page via `getTextContent()` and `getOperatorList()`. Uses `operationsFilter` to render only non-text content to an offscreen canvas, and `recordImages` for precise image coordinates.
|
|
131
168
|
2. **Reflow** — each text block is reflowed with Pretext's `prepareWithSegments()` + `layoutWithLines()` at the current font size. Graphic bitmaps are scaled proportionally.
|
|
132
|
-
3. **Composite** — walks the region map in reading order, drawing reflowed text lines and graphic bitmaps onto a single output canvas.
|
|
169
|
+
3. **Composite** — walks the region map in reading order, drawing reflowed text lines and graphic bitmaps onto a single output canvas. With `enableMorph`, applies fisheye interpolation to both text and images.
|
|
133
170
|
|
|
134
171
|
Step 1 runs once per page (cached). Steps 2-3 re-run on font size change, which is what makes pinch-to-zoom fast.
|
|
135
172
|
|
|
@@ -142,18 +179,19 @@ pretext-pdfjs/
|
|
|
142
179
|
│ ├── pretext-text-layer.js # PretextTextLayer (drop-in replacement)
|
|
143
180
|
│ ├── measurement-cache.js # Pretext-style Canvas measurement cache
|
|
144
181
|
│ ├── viewer.js # PretextPDFViewer helper
|
|
145
|
-
│ ├── pinch.js # Pinch-type
|
|
146
|
-
│ └── reflow.js # Per-block reflow
|
|
147
|
-
├── demo.html #
|
|
182
|
+
│ ├── pinch.js # Pinch-type reading modes
|
|
183
|
+
│ └── reflow.js # Per-block reflow with image preservation
|
|
184
|
+
├── demo.html # Library landing page
|
|
185
|
+
├── reader.html # Full PDF reader demo
|
|
148
186
|
├── package.json
|
|
149
187
|
└── README.md
|
|
150
188
|
```
|
|
151
189
|
|
|
152
|
-
**Kept from PDF.js
|
|
190
|
+
**Kept from PDF.js**: core parser, canvas renderer, annotation layer, worker, font loading.
|
|
153
191
|
|
|
154
|
-
**Replaced**:
|
|
192
|
+
**Replaced**: TextLayer — measurement cache, ascent detection, width scaling.
|
|
155
193
|
|
|
156
|
-
**Added**:
|
|
194
|
+
**Added**: pretextMetrics, enableReflow(), pinch/morph reading modes, per-block reflow with image preservation.
|
|
157
195
|
|
|
158
196
|
## Built on
|
|
159
197
|
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pretext-pdfjs",
|
|
3
|
-
"version": "0.2.1",
|
|
4
|
-
"description": "
|
|
3
|
+
"version": "0.3.1",
|
|
4
|
+
"description": "Pretext-native text layer for PDF.js — zero DOM reflows, per-block reflow with image preservation, pinch-to-zoom text",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./src/index.js",
|
|
7
7
|
"exports": {
|
package/src/pinch.js
CHANGED
|
@@ -339,13 +339,51 @@ function createTextCanvas(container, opts = {}) {
|
|
|
339
339
|
* @param {Function} [options.onPageLoad] - called with { pageNum, text, numPages }
|
|
340
340
|
*/
|
|
341
341
|
export function createPDFPinchReader(container, options = {}) {
|
|
342
|
+
const preserveLayout = options.preserveLayout ?? false;
|
|
343
|
+
const mode = options.mode || "pinchType";
|
|
344
|
+
|
|
345
|
+
// When preserveLayout is true, delegate to the reflow engine
|
|
346
|
+
if (preserveLayout) {
|
|
347
|
+
let reflowRenderer = null;
|
|
348
|
+
|
|
349
|
+
return {
|
|
350
|
+
async open(source) {
|
|
351
|
+
const { createReflowRenderer } = await import("./reflow.js");
|
|
352
|
+
reflowRenderer = createReflowRenderer(container, {
|
|
353
|
+
fontSize: options.fontSize ?? 18,
|
|
354
|
+
minFontSize: options.minFontSize ?? 8,
|
|
355
|
+
maxFontSize: options.maxFontSize ?? 60,
|
|
356
|
+
fontFamily: options.fontFamily,
|
|
357
|
+
lineHeight: options.lineHeight ?? 1.6,
|
|
358
|
+
padding: options.padding ?? 28,
|
|
359
|
+
background: options.background ?? "#0a0a0a",
|
|
360
|
+
textColor: options.textColor ?? "#e5e5e5",
|
|
361
|
+
enablePinchZoom: true,
|
|
362
|
+
enableMorph: mode === "pinchMorph" || mode === "scrollMorph",
|
|
363
|
+
friction: options.friction ?? 0.95,
|
|
364
|
+
workerSrc: options.workerSrc,
|
|
365
|
+
onZoom: options.onZoom,
|
|
366
|
+
});
|
|
367
|
+
return reflowRenderer.open(source);
|
|
368
|
+
},
|
|
369
|
+
async showPage(pageNum) { return reflowRenderer.showPage(pageNum); },
|
|
370
|
+
async showAll() { return reflowRenderer.showAll(); },
|
|
371
|
+
async nextPage() { return reflowRenderer.nextPage(); },
|
|
372
|
+
async prevPage() { return reflowRenderer.prevPage(); },
|
|
373
|
+
resize() { /* handled by ResizeObserver in reflow */ },
|
|
374
|
+
destroy() { reflowRenderer?.destroy(); },
|
|
375
|
+
get currentPage() { return reflowRenderer?.currentPage ?? 0; },
|
|
376
|
+
get numPages() { return reflowRenderer?.numPages ?? 0; },
|
|
377
|
+
get canvas() { return reflowRenderer?.canvas ?? null; },
|
|
378
|
+
get mode() { return mode; },
|
|
379
|
+
};
|
|
380
|
+
}
|
|
381
|
+
|
|
342
382
|
let pdfjs = null;
|
|
343
383
|
let pdfDoc = null;
|
|
344
384
|
let textInstance = null;
|
|
345
385
|
let currentPage = 0;
|
|
346
386
|
|
|
347
|
-
const mode = options.mode || "pinchType";
|
|
348
|
-
|
|
349
387
|
async function ensurePdfjs() {
|
|
350
388
|
if (pdfjs) return;
|
|
351
389
|
pdfjs = await import("pdfjs-dist");
|
package/src/reflow.js
CHANGED
|
@@ -14,6 +14,25 @@ function clamp(v, min, max) {
|
|
|
14
14
|
return Math.max(min, Math.min(max, v));
|
|
15
15
|
}
|
|
16
16
|
|
|
17
|
+
/**
 * Draw one line of text with justified spacing: the first word starts at `x`,
 * the last word ends at `x + availWidth`, and the leftover width is shared
 * equally between the gaps separating the words.
 *
 * Falls back to a single plain `fillText(text, x, y)` call when the line has
 * fewer than two words, or when the words alone are already wider than
 * `availWidth` (a negative gap would make the words overlap).
 *
 * @param {CanvasRenderingContext2D} ctx - Context to draw on; its current font
 *   is used for both measuring and drawing.
 * @param {string} text - Line content; words are separated by spaces.
 * @param {number} x - Left edge of the line.
 * @param {number} y - Y coordinate passed through to `fillText`.
 * @param {number} availWidth - Width the justified line should span.
 * @returns {void}
 */
function drawJustifiedLine(ctx, text, x, y, availWidth) {
  // Doubled, leading or trailing spaces would otherwise yield empty "words"
  // that steal gap slots and keep the last real word off the right edge.
  const words = text.split(" ").filter((word) => word !== "");
  if (words.length <= 1) {
    ctx.fillText(text, x, y);
    return;
  }

  // Measure each word once and reuse the width for both the total and the
  // pen advance.
  const widths = words.map((word) => ctx.measureText(word).width);
  const totalWordWidth = widths.reduce((sum, width) => sum + width, 0);
  const extraSpace = (availWidth - totalWordWidth) / (words.length - 1);

  // Negative (or NaN) gap: the words do not fit, so draw the line unjustified
  // instead of stacking words on top of each other.
  if (!(extraSpace >= 0)) {
    ctx.fillText(text, x, y);
    return;
  }

  let xPos = x;
  words.forEach((word, index) => {
    ctx.fillText(word, xPos, y);
    xPos += widths[index] + extraSpace;
  });
}
|
|
35
|
+
|
|
17
36
|
function bboxOverlap(a, b) {
|
|
18
37
|
const x1 = Math.max(a.x, b.x);
|
|
19
38
|
const y1 = Math.max(a.y, b.y);
|
|
@@ -25,13 +44,106 @@ function bboxOverlap(a, b) {
|
|
|
25
44
|
return smaller > 0 ? intersection / smaller : 0;
|
|
26
45
|
}
|
|
27
46
|
|
|
47
|
+
// ─── Font metadata extraction ────────────────────────────────────────────
|
|
48
|
+
|
|
49
|
+
/**
 * Collect font metadata (bold, black, italic, loaded/fallback names, CSS
 * family, monospace/serif flags) for every font referenced by a `setFont`
 * operator in the page's operator list.
 *
 * Must be called AFTER page.render() so the fonts are loaded; a font whose
 * lookup in `page.commonObjs` throws or returns nothing is skipped, so the
 * result may be incomplete when called too early.
 *
 * @param {object} page - PDF.js page proxy; only `page.commonObjs.get()` is used.
 * @param {{fnArray: number[], argsArray: Array}} opList - Operator list of the page.
 * @param {object} OPS - PDF.js operator code table; only `OPS.setFont` is read.
 * @returns {Promise<Map<string, object>>} Map from font reference name (first
 *   argument of `setFont`) to its metadata record.
 */
async function extractFontMetadata(page, opList, OPS) {
  const fontMap = new Map();
  const { fnArray, argsArray } = opList;

  for (let i = 0; i < fnArray.length; i++) {
    if (fnArray[i] !== OPS.setFont) continue;

    // A setFont entry without arguments carries no font reference; skip it
    // rather than throwing on the index access.
    const fontRefName = argsArray[i]?.[0];
    if (fontRefName == null || fontMap.has(fontRefName)) continue;

    let fontObj;
    try {
      fontObj = page.commonObjs.get(fontRefName);
    } catch {
      // Font not yet loaded — best effort, leave it out of the map.
      continue;
    }
    if (!fontObj) continue;

    fontMap.set(fontRefName, {
      bold: fontObj.bold || false,
      black: fontObj.black || false,
      italic: fontObj.italic || false,
      loadedName: fontObj.loadedName || null,
      fallbackName: fontObj.fallbackName || "sans-serif",
      css: fontObj.systemFontInfo?.css || null,
      isMonospace: fontObj.isMonospace || false,
      isSerifFont: fontObj.isSerifFont || false,
    });
  }

  return fontMap;
}
|
|
82
|
+
|
|
83
|
+
// ─── Text color extraction ───────────────────────────────────────────────
|
|
84
|
+
|
|
85
|
+
/**
 * Extract the fill color in effect at each text-drawing operator of the
 * operator list, in operator order.
 *
 * The evaluator normalizes fill-color commands to setFillRGBColor with a hex
 * string, so that is the primary path; the other fill operators are honoured
 * only when their first argument is a "#…" string. The fill color is tracked
 * across `save` / `restore`, so a color set inside a save/restore pair does
 * not leak onto text drawn after the matching restore.
 *
 * The result is intended to be consumed in parallel with the text items from
 * getTextContent().
 *
 * @param {{fnArray: number[], argsArray: Array}} opList - Operator list of the page.
 * @param {object} OPS - PDF.js operator code table.
 * @returns {string[]} One entry per text-drawing op: a color string, or
 *   "transparent". Defaults to "#000000" until a fill color is set.
 */
function extractTextColors(opList, OPS) {
  const textColors = [];
  const colorStack = [];
  let currentColor = "#000000";

  const textDrawOps = new Set([
    OPS.showText,
    OPS.showSpacedText,
    OPS.nextLineShowText,
    OPS.nextLineSetSpacingShowText,
  ]);
  const otherFillOps = new Set([
    OPS.setFillGray,
    OPS.setFillColor,
    OPS.setFillCMYKColor,
    OPS.setFillColorN,
  ]);

  const { fnArray, argsArray } = opList;
  for (let i = 0; i < fnArray.length; i++) {
    const fn = fnArray[i];
    const args = argsArray[i];

    if (fn === OPS.save) {
      colorStack.push(currentColor);
    } else if (fn === OPS.restore) {
      // An unbalanced restore leaves the current color untouched.
      if (colorStack.length > 0) currentColor = colorStack.pop();
    } else if (fn === OPS.setFillRGBColor) {
      if (typeof args?.[0] === "string") {
        currentColor = args[0];
      } else {
        // NOTE(review): handles a numeric r,g,b argument triple, assumed to
        // be 0–255 components — confirm against the PDF.js version in use.
        currentColor = rgbTripleToHex(args) ?? currentColor;
      }
    } else if (fn === OPS.setFillTransparent) {
      currentColor = "transparent";
    } else if (otherFillOps.has(fn)) {
      if (typeof args?.[0] === "string" && args[0].startsWith("#")) {
        currentColor = args[0];
      }
    }

    if (textDrawOps.has(fn)) {
      textColors.push(currentColor);
    }
  }

  return textColors;
}

/**
 * Convert a three-number [r, g, b] argument list into a "#rrggbb" string.
 * Returns null when `args` is not three finite numbers.
 */
function rgbTripleToHex(args) {
  if (!Array.isArray(args) || args.length < 3) return null;
  const channels = args.slice(0, 3);
  if (!channels.every((c) => typeof c === "number" && Number.isFinite(c))) return null;
  const hex = channels
    .map((c) => Math.max(0, Math.min(255, Math.round(c))).toString(16).padStart(2, "0"))
    .join("");
  return `#${hex}`;
}
|
|
128
|
+
|
|
28
129
|
// ─── Page analysis ────────────────────────────────────────────────────────
|
|
29
130
|
|
|
30
131
|
/**
|
|
31
132
|
* Group adjacent text items into text blocks by proximity.
|
|
32
133
|
* Also extracts font metadata: average size, italic, bold.
|
|
33
134
|
*/
|
|
34
|
-
function groupTextBlocks(textItems, pageHeight, styles) {
|
|
135
|
+
function groupTextBlocks(textItems, pageHeight, styles, fontMap, textColors) {
|
|
136
|
+
// Attach colors to text items before filtering (textColors is parallel to
|
|
137
|
+
// the full items array from getTextContent, including empty items)
|
|
138
|
+
if (textColors) {
|
|
139
|
+
let colorIdx = 0;
|
|
140
|
+
for (const item of textItems) {
|
|
141
|
+
if (item.str !== undefined) {
|
|
142
|
+
item._color = textColors[colorIdx++] || "#000000";
|
|
143
|
+
}
|
|
144
|
+
}
|
|
145
|
+
}
|
|
146
|
+
|
|
35
147
|
const sorted = [...textItems].filter(i => i.str?.trim()).sort((a, b) => {
|
|
36
148
|
const ay = pageHeight - a.transform[5];
|
|
37
149
|
const by = pageHeight - b.transform[5];
|
|
@@ -73,10 +185,20 @@ function groupTextBlocks(textItems, pageHeight, styles) {
|
|
|
73
185
|
const isSuperscript = isShortItem && isHorizAdjacent && sizeRatio > 1.3;
|
|
74
186
|
const sizeOk = sizeRatio < 1.3 || isSuperscript;
|
|
75
187
|
|
|
188
|
+
// Large horizontal gap between consecutive items → likely column break
|
|
189
|
+
// Only for substantive text (skip short items like superscript markers)
|
|
190
|
+
const isLongItem = (item.str || "").trim().length > 3;
|
|
191
|
+
if (isLongItem && (hGap > lastFH * 1.5 ||
|
|
192
|
+
(current.bbox.w > lastFH * 10 && x < current.bbox.x - lastFH * 3))) {
|
|
193
|
+
blocks.push(current);
|
|
194
|
+
current = { items: [item], bbox: { x, y, w: item.width || 0, h: fontHeight } };
|
|
195
|
+
continue;
|
|
196
|
+
}
|
|
197
|
+
|
|
76
198
|
if (
|
|
77
199
|
sizeOk &&
|
|
78
200
|
verticalGap < lastFH * 2.5 &&
|
|
79
|
-
x < current.bbox.x + current.bbox.w + lastFH *
|
|
201
|
+
x < current.bbox.x + current.bbox.w + lastFH * 1.5
|
|
80
202
|
) {
|
|
81
203
|
current.items.push(item);
|
|
82
204
|
current.bbox.x = Math.min(current.bbox.x, x);
|
|
@@ -94,7 +216,48 @@ function groupTextBlocks(textItems, pageHeight, styles) {
|
|
|
94
216
|
}
|
|
95
217
|
if (current) blocks.push(current);
|
|
96
218
|
|
|
97
|
-
//
|
|
219
|
+
// Post-process: merge orphan tiny blocks (superscripts, markers like *, +, #)
|
|
220
|
+
// into the nearest larger block if vertically close
|
|
221
|
+
for (let i = blocks.length - 1; i >= 0; i--) {
|
|
222
|
+
const block = blocks[i];
|
|
223
|
+
if (block.items.length > 2) continue;
|
|
224
|
+
const text = block.items.map(it => (it.str || "").trim()).join("");
|
|
225
|
+
if (text.length > 3 || text.length === 0) continue;
|
|
226
|
+
|
|
227
|
+
let bestIdx = -1, bestDist = Infinity;
|
|
228
|
+
for (let j = 0; j < blocks.length; j++) {
|
|
229
|
+
if (j === i) continue;
|
|
230
|
+
const o = blocks[j];
|
|
231
|
+
// Skip other orphans (short text blocks)
|
|
232
|
+
const oText = o.items.map(it => (it.str || "").trim()).join("");
|
|
233
|
+
if (oText.length <= 3) continue;
|
|
234
|
+
// Check vertical proximity: orphan center within 30pt of target block
|
|
235
|
+
const bcy = block.bbox.y + block.bbox.h / 2;
|
|
236
|
+
if (bcy < o.bbox.y - 30 || bcy > o.bbox.y + o.bbox.h + 30) continue;
|
|
237
|
+
// Horizontal edge-to-edge distance (0 if overlapping)
|
|
238
|
+
const hDist = Math.max(0,
|
|
239
|
+
block.bbox.x > o.bbox.x + o.bbox.w ? block.bbox.x - (o.bbox.x + o.bbox.w) :
|
|
240
|
+
o.bbox.x > block.bbox.x + block.bbox.w ? o.bbox.x - (block.bbox.x + block.bbox.w) : 0);
|
|
241
|
+
if (hDist < bestDist) {
|
|
242
|
+
bestDist = hDist;
|
|
243
|
+
bestIdx = j;
|
|
244
|
+
}
|
|
245
|
+
}
|
|
246
|
+
|
|
247
|
+
if (bestIdx >= 0 && bestDist < Math.max(blocks[bestIdx].bbox.h, 20)) {
|
|
248
|
+
const target = blocks[bestIdx];
|
|
249
|
+
target.items.push(...block.items);
|
|
250
|
+
const newX = Math.min(target.bbox.x, block.bbox.x);
|
|
251
|
+
const newRight = Math.max(target.bbox.x + target.bbox.w, block.bbox.x + block.bbox.w);
|
|
252
|
+
const newBottom = Math.max(target.bbox.y + target.bbox.h, block.bbox.y + block.bbox.h);
|
|
253
|
+
target.bbox.x = newX;
|
|
254
|
+
target.bbox.w = newRight - newX;
|
|
255
|
+
target.bbox.h = newBottom - target.bbox.y;
|
|
256
|
+
blocks.splice(i, 1);
|
|
257
|
+
}
|
|
258
|
+
}
|
|
259
|
+
|
|
260
|
+
// Compute font metadata per block using real font objects from commonObjs
|
|
98
261
|
for (const block of blocks) {
|
|
99
262
|
const sizes = [];
|
|
100
263
|
let italicCount = 0;
|
|
@@ -104,18 +267,10 @@ function groupTextBlocks(textItems, pageHeight, styles) {
|
|
|
104
267
|
const fh = Math.hypot(item.transform[2], item.transform[3]);
|
|
105
268
|
if (fh > 0) sizes.push(fh);
|
|
106
269
|
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
110
|
-
|
|
111
|
-
const combined = name + " " + family;
|
|
112
|
-
|
|
113
|
-
if (combined.includes("italic") || combined.includes("oblique")) italicCount++;
|
|
114
|
-
if (combined.includes("bold") || combined.includes("black") || combined.includes("heavy")) boldCount++;
|
|
115
|
-
|
|
116
|
-
// Also detect italic from transform skew
|
|
117
|
-
if (Math.abs(item.transform[2]) > 0.1 && Math.abs(item.transform[1]) < 0.1) {
|
|
118
|
-
italicCount++;
|
|
270
|
+
const fontMeta = fontMap?.get(item.fontName);
|
|
271
|
+
if (fontMeta) {
|
|
272
|
+
if (fontMeta.italic) italicCount++;
|
|
273
|
+
if (fontMeta.bold || fontMeta.black) boldCount++;
|
|
119
274
|
}
|
|
120
275
|
}
|
|
121
276
|
|
|
@@ -124,10 +279,28 @@ function groupTextBlocks(textItems, pageHeight, styles) {
|
|
|
124
279
|
: 12;
|
|
125
280
|
block.isItalic = italicCount > block.items.length * 0.4;
|
|
126
281
|
block.isBold = boldCount > block.items.length * 0.4;
|
|
282
|
+
block.isBlack = block.items.some(it => fontMap?.get(it.fontName)?.black);
|
|
283
|
+
|
|
284
|
+
// Store the font metadata for the dominant font in this block
|
|
285
|
+
block.fontMeta = fontMap?.get(block.items[0]?.fontName) || null;
|
|
127
286
|
|
|
128
|
-
//
|
|
129
|
-
const
|
|
130
|
-
|
|
287
|
+
// Compute dominant fill color for the block
|
|
288
|
+
const colorFreq = {};
|
|
289
|
+
for (const item of block.items) {
|
|
290
|
+
const c = item._color || "#000000";
|
|
291
|
+
if (c !== "transparent") {
|
|
292
|
+
colorFreq[c] = (colorFreq[c] || 0) + 1;
|
|
293
|
+
}
|
|
294
|
+
}
|
|
295
|
+
let dominantColor = "#000000";
|
|
296
|
+
let maxColorFreq = 0;
|
|
297
|
+
for (const [c, freq] of Object.entries(colorFreq)) {
|
|
298
|
+
if (freq > maxColorFreq) {
|
|
299
|
+
maxColorFreq = freq;
|
|
300
|
+
dominantColor = c;
|
|
301
|
+
}
|
|
302
|
+
}
|
|
303
|
+
block.color = dominantColor;
|
|
131
304
|
}
|
|
132
305
|
|
|
133
306
|
return blocks;
|
|
@@ -138,8 +311,7 @@ function groupTextBlocks(textItems, pageHeight, styles) {
|
|
|
138
311
|
* Only captures image operators (paintImageXObject etc).
|
|
139
312
|
* Skips path/fill/stroke to avoid false positives from text decorations.
|
|
140
313
|
*/
|
|
141
|
-
|
|
142
|
-
const ops = await page.getOperatorList();
|
|
314
|
+
function extractGraphicRegions(opList, OPS) {
|
|
143
315
|
const regions = [];
|
|
144
316
|
const ctmStack = [];
|
|
145
317
|
let ctm = [1, 0, 0, 1, 0, 0];
|
|
@@ -165,9 +337,9 @@ async function extractGraphicRegions(page, OPS) {
|
|
|
165
337
|
return [ctm[0] * x + ctm[2] * y + ctm[4], ctm[1] * x + ctm[3] * y + ctm[5]];
|
|
166
338
|
}
|
|
167
339
|
|
|
168
|
-
for (let i = 0; i <
|
|
169
|
-
const fn =
|
|
170
|
-
const args =
|
|
340
|
+
for (let i = 0; i < opList.fnArray.length; i++) {
|
|
341
|
+
const fn = opList.fnArray[i];
|
|
342
|
+
const args = opList.argsArray[i];
|
|
171
343
|
|
|
172
344
|
if (fn === OPS.save) {
|
|
173
345
|
ctmStack.push(ctm.slice());
|
|
@@ -200,31 +372,132 @@ async function extractGraphicRegions(page, OPS) {
|
|
|
200
372
|
return regions;
|
|
201
373
|
}
|
|
202
374
|
|
|
375
|
+
/**
 * Detect graphic regions by scanning the rendered canvas for visible content
 * outside the text blocks. Complements op-based detection by also finding
 * vector graphics (charts, diagrams).
 *
 * The canvas is divided into 16px cells. Cells covered by a text block (plus
 * a small margin) are ignored; a remaining cell counts as content when more
 * than 5% of its sampled pixels are non-transparent and darker than near-white.
 * 4-connected groups of content cells become candidate regions, and only
 * groups larger than 30 units on both axes with more than 4 cells are kept.
 *
 * @param {HTMLCanvasElement|OffscreenCanvas} offCanvas - Rendered page canvas.
 * @param {Array<{bbox: {x: number, y: number, w: number, h: number}}>} textBlocks -
 *   Text blocks whose bboxes, multiplied by `renderScale`, are in canvas pixels.
 * @param {number} renderScale - Canvas pixels per bbox unit.
 * @returns {Array<{type: "graphic", bbox: {x: number, y: number, w: number, h: number}, screenCoords: true}>}
 *   Regions in the same unscaled coordinate space as the text block bboxes,
 *   in scan order (top-to-bottom, left-to-right by first cell found).
 */
function detectGraphicRegionsFromRender(offCanvas, textBlocks, renderScale) {
  const w = offCanvas.width;
  const h = offCanvas.height;
  // Nothing to scan, and getImageData() rejects zero-sized rectangles.
  if (!(w > 0 && h > 0)) return [];

  const ctx = offCanvas.getContext("2d");

  const cellPx = 16;
  const cols = Math.ceil(w / cellPx);
  const rows = Math.ceil(h / cellPx);
  const cellCount = cols * rows;

  // Mark cells covered by text blocks
  const occupied = new Uint8Array(cellCount);
  const margin = 4 * renderScale;
  for (const { bbox } of textBlocks) {
    const x0 = Math.floor(Math.max(0, bbox.x * renderScale - margin) / cellPx);
    const y0 = Math.floor(Math.max(0, bbox.y * renderScale - margin) / cellPx);
    const x1 = Math.min(cols, Math.ceil(Math.min(w, (bbox.x + bbox.w) * renderScale + margin) / cellPx));
    const y1 = Math.min(rows, Math.ceil(Math.min(h, (bbox.y + bbox.h) * renderScale + margin) / cellPx));
    for (let cy = y0; cy < y1; cy++) {
      for (let cx = x0; cx < x1; cx++) {
        occupied[cy * cols + cx] = 1;
      }
    }
  }

  // Scan non-text cells for visible content
  const pixels = ctx.getImageData(0, 0, w, h).data;
  const hasContent = new Uint8Array(cellCount);
  for (let cy = 0; cy < rows; cy++) {
    for (let cx = 0; cx < cols; cx++) {
      const cell = cy * cols + cx;
      if (occupied[cell]) continue;

      const px0 = cx * cellPx;
      const py0 = cy * cellPx;
      const px1 = Math.min(px0 + cellPx, w);
      const py1 = Math.min(py0 + cellPx, h);

      // Sample every other pixel in both directions to keep the scan cheap.
      let dark = 0;
      let total = 0;
      for (let py = py0; py < py1; py += 2) {
        for (let px = px0; px < px1; px += 2) {
          const idx = (py * w + px) * 4;
          if (pixels[idx + 3] > 20) {
            const lum = 0.299 * pixels[idx] + 0.587 * pixels[idx + 1] + 0.114 * pixels[idx + 2];
            if (lum < 240) dark++;
          }
          total++;
        }
      }
      if (total > 0 && dark / total > 0.05) hasContent[cell] = 1;
    }
  }

  // Connected-component labeling to find graphic regions.
  // Each cell is enqueued at most once, so a fixed-size index queue with
  // head/tail cursors replaces Array.shift() (linear per dequeue).
  const visited = new Uint8Array(cellCount);
  const queue = new Int32Array(cellCount);
  const neighbourOffsets = [[-1, 0], [1, 0], [0, -1], [0, 1]];
  const regions = [];

  for (let cy = 0; cy < rows; cy++) {
    for (let cx = 0; cx < cols; cx++) {
      const seed = cy * cols + cx;
      if (!hasContent[seed] || visited[seed]) continue;

      let head = 0;
      let tail = 0;
      queue[tail++] = seed;
      visited[seed] = 1;

      let minX = cx;
      let maxX = cx;
      let minY = cy;
      let maxY = cy;
      let count = 0;

      while (head < tail) {
        const cell = queue[head++];
        const qx = cell % cols;
        const qy = (cell - qx) / cols;
        if (qx < minX) minX = qx;
        if (qx > maxX) maxX = qx;
        if (qy < minY) minY = qy;
        if (qy > maxY) maxY = qy;
        count++;

        for (const [dx, dy] of neighbourOffsets) {
          const nx = qx + dx;
          const ny = qy + dy;
          if (nx < 0 || nx >= cols || ny < 0 || ny >= rows) continue;
          const next = ny * cols + nx;
          if (hasContent[next] && !visited[next]) {
            visited[next] = 1;
            queue[tail++] = next;
          }
        }
      }

      // Convert the cell-space bounding box back to unscaled units.
      const rx = minX * cellPx / renderScale;
      const ry = minY * cellPx / renderScale;
      const rw = (maxX - minX + 1) * cellPx / renderScale;
      const rh = (maxY - minY + 1) * cellPx / renderScale;
      if (rw > 30 && rh > 30 && count > 4) {
        regions.push({ type: "graphic", bbox: { x: rx, y: ry, w: rw, h: rh }, screenCoords: true });
      }
    }
  }

  return regions;
}
|
|
460
|
+
|
|
203
461
|
/**
|
|
204
462
|
* Build text content for a block, preserving paragraph breaks.
|
|
205
463
|
*/
|
|
206
464
|
function blockToText(block, pageHeight) {
|
|
207
465
|
let result = "";
|
|
208
466
|
let lastY = null;
|
|
467
|
+
let lastX = null;
|
|
468
|
+
let lastW = 0;
|
|
209
469
|
let lastFontSize = 12;
|
|
210
470
|
|
|
211
471
|
for (const item of block.items) {
|
|
212
472
|
if (!item.str) continue;
|
|
473
|
+
const currentX = item.transform[4];
|
|
213
474
|
const currentY = pageHeight - item.transform[5];
|
|
214
475
|
const fontHeight = Math.hypot(item.transform[2], item.transform[3]);
|
|
215
476
|
if (fontHeight > 0) lastFontSize = fontHeight;
|
|
216
477
|
|
|
217
478
|
if (lastY !== null) {
|
|
218
|
-
const
|
|
219
|
-
|
|
479
|
+
const vGap = Math.abs(currentY - lastY);
|
|
480
|
+
const isShortItem = (item.str || "").trim().length <= 2;
|
|
481
|
+
if (vGap > lastFontSize * 1.8 && !isShortItem) {
|
|
220
482
|
result += "\n\n";
|
|
221
|
-
} else if (
|
|
483
|
+
} else if (vGap > lastFontSize * 0.3) {
|
|
484
|
+
// Different line — insert space
|
|
222
485
|
if (!result.endsWith(" ") && !result.endsWith("\n")) {
|
|
223
486
|
result += " ";
|
|
224
487
|
}
|
|
488
|
+
} else if (lastX !== null) {
|
|
489
|
+
// Same line — check horizontal gap between items
|
|
490
|
+
const hGap = currentX - (lastX + lastW);
|
|
491
|
+
if (hGap > lastFontSize * 0.15) {
|
|
492
|
+
if (!result.endsWith(" ") && !result.endsWith("\n")) {
|
|
493
|
+
result += " ";
|
|
494
|
+
}
|
|
495
|
+
}
|
|
225
496
|
}
|
|
226
497
|
}
|
|
227
498
|
lastY = currentY;
|
|
499
|
+
lastX = currentX;
|
|
500
|
+
lastW = item.width || 0;
|
|
228
501
|
result += item.str;
|
|
229
502
|
}
|
|
230
503
|
return result.trim();
|
|
@@ -245,9 +518,10 @@ function buildRegionMap(textBlocks, graphicRegions, pageHeight) {
|
|
|
245
518
|
}
|
|
246
519
|
|
|
247
520
|
for (const gr of graphicRegions) {
|
|
248
|
-
//
|
|
249
|
-
const
|
|
250
|
-
|
|
521
|
+
// Render-based regions are already in screen coords; op-based need conversion
|
|
522
|
+
const bbox = gr.screenCoords
|
|
523
|
+
? { ...gr.bbox }
|
|
524
|
+
: { x: gr.bbox.x, y: pageHeight - gr.bbox.y - gr.bbox.h, w: gr.bbox.w, h: gr.bbox.h };
|
|
251
525
|
|
|
252
526
|
// Skip if this graphic region overlaps significantly with any text block
|
|
253
527
|
const overlapsText = textBboxes.some(tb => bboxOverlap(bbox, tb) > 0.3);
|
|
@@ -256,28 +530,119 @@ function buildRegionMap(textBlocks, graphicRegions, pageHeight) {
|
|
|
256
530
|
}
|
|
257
531
|
}
|
|
258
532
|
|
|
259
|
-
//
|
|
533
|
+
// ── Column detection via histogram gap-finding ──
|
|
260
534
|
const pageWidth = Math.max(...regions.map(r => r.bbox.x + r.bbox.w), 1);
|
|
261
|
-
const
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
|
|
265
|
-
|
|
535
|
+
const narrowBlocks = regions.filter(r => r.bbox.w <= pageWidth * 0.6);
|
|
536
|
+
let gapX = pageWidth / 2;
|
|
537
|
+
let hasColumns = false;
|
|
538
|
+
|
|
539
|
+
if (narrowBlocks.length >= 4) {
|
|
540
|
+
// Build horizontal coverage histogram
|
|
541
|
+
const binCount = 100;
|
|
542
|
+
const binWidth = pageWidth / binCount;
|
|
543
|
+
const coverage = new Uint8Array(binCount);
|
|
544
|
+
for (const r of narrowBlocks) {
|
|
545
|
+
const b0 = Math.max(0, Math.floor(r.bbox.x / binWidth));
|
|
546
|
+
const b1 = Math.min(binCount, Math.ceil((r.bbox.x + r.bbox.w) / binWidth));
|
|
547
|
+
for (let b = b0; b < b1; b++) coverage[b]++;
|
|
548
|
+
}
|
|
549
|
+
|
|
550
|
+
// Find widest empty gap in middle 60% of page
|
|
551
|
+
const searchStart = Math.floor(binCount * 0.2);
|
|
552
|
+
const searchEnd = Math.ceil(binCount * 0.8);
|
|
553
|
+
let gapStart = -1, gapLen = 0, bestStart = -1, bestLen = 0;
|
|
554
|
+
for (let b = searchStart; b < searchEnd; b++) {
|
|
555
|
+
if (coverage[b] === 0) {
|
|
556
|
+
if (gapStart < 0) gapStart = b;
|
|
557
|
+
gapLen = b - gapStart + 1;
|
|
558
|
+
} else {
|
|
559
|
+
if (gapLen > bestLen) { bestLen = gapLen; bestStart = gapStart; }
|
|
560
|
+
gapStart = -1; gapLen = 0;
|
|
561
|
+
}
|
|
562
|
+
}
|
|
563
|
+
if (gapLen > bestLen) { bestLen = gapLen; bestStart = gapStart; }
|
|
564
|
+
|
|
565
|
+
if (bestLen >= 2) {
|
|
566
|
+
gapX = (bestStart + bestLen / 2) * binWidth;
|
|
567
|
+
const leftCount = narrowBlocks.filter(r => r.bbox.x + r.bbox.w / 2 < gapX).length;
|
|
568
|
+
const rightCount = narrowBlocks.filter(r => r.bbox.x + r.bbox.w / 2 >= gapX).length;
|
|
569
|
+
hasColumns = leftCount > 2 && rightCount > 2;
|
|
570
|
+
}
|
|
571
|
+
}
|
|
266
572
|
|
|
573
|
+
// ── Detect text alignment per block (including justified) ──
|
|
574
|
+
for (const region of regions) {
|
|
575
|
+
if (region.type !== "text") continue;
|
|
576
|
+
const block = region.block;
|
|
577
|
+
const leftMargin = block.bbox.x;
|
|
578
|
+
const rightMargin = pageWidth - (block.bbox.x + block.bbox.w);
|
|
579
|
+
const marginDiff = Math.abs(leftMargin - rightMargin);
|
|
580
|
+
|
|
581
|
+
// Detect justified text: multiple lines with consistent right edges
|
|
582
|
+
let isJustified = false;
|
|
583
|
+
if (block.items.length >= 3) {
|
|
584
|
+
const lines = [];
|
|
585
|
+
let lineItems = [];
|
|
586
|
+
let lastLineY = null;
|
|
587
|
+
for (const item of block.items) {
|
|
588
|
+
const y = pageHeight - item.transform[5];
|
|
589
|
+
if (lastLineY !== null && Math.abs(y - lastLineY) > 2) {
|
|
590
|
+
if (lineItems.length > 0) lines.push(lineItems);
|
|
591
|
+
lineItems = [];
|
|
592
|
+
}
|
|
593
|
+
lineItems.push(item);
|
|
594
|
+
lastLineY = y;
|
|
595
|
+
}
|
|
596
|
+
if (lineItems.length > 0) lines.push(lineItems);
|
|
597
|
+
|
|
598
|
+
if (lines.length >= 3) {
|
|
599
|
+
// Compute right edge of each line (except last — last line is usually ragged)
|
|
600
|
+
const rightEdges = [];
|
|
601
|
+
for (let li = 0; li < lines.length - 1; li++) {
|
|
602
|
+
const lastItem = lines[li][lines[li].length - 1];
|
|
603
|
+
const rightX = lastItem.transform[4] + (lastItem.width || 0);
|
|
604
|
+
rightEdges.push(rightX);
|
|
605
|
+
}
|
|
606
|
+
if (rightEdges.length >= 2) {
|
|
607
|
+
const maxRight = Math.max(...rightEdges);
|
|
608
|
+
const consistent = rightEdges.filter(r => Math.abs(r - maxRight) < pageWidth * 0.02);
|
|
609
|
+
isJustified = consistent.length > rightEdges.length * 0.7;
|
|
610
|
+
}
|
|
611
|
+
}
|
|
612
|
+
}
|
|
613
|
+
|
|
614
|
+
if (hasColumns && block.bbox.w <= pageWidth * 0.6) {
|
|
615
|
+
block.align = isJustified ? "justify" : "left";
|
|
616
|
+
} else if (isJustified) {
|
|
617
|
+
block.align = "justify";
|
|
618
|
+
} else if (leftMargin > pageWidth * 0.05 && marginDiff < pageWidth * 0.1) {
|
|
619
|
+
block.align = "center";
|
|
620
|
+
} else {
|
|
621
|
+
block.align = "left";
|
|
622
|
+
}
|
|
623
|
+
}
|
|
624
|
+
|
|
625
|
+
// ── Sort in reading order ──
|
|
267
626
|
if (hasColumns) {
|
|
268
|
-
// Two-column: sort each column top-to-bottom, then concatenate
|
|
269
|
-
// Full-width blocks (spanning > 60% of page) go first, sorted by Y
|
|
270
627
|
const fullWidth = regions.filter(r => r.bbox.w > pageWidth * 0.6);
|
|
271
|
-
const leftCol = regions.filter(r => r.bbox.w <= pageWidth * 0.6 && r.bbox.x + r.bbox.w / 2 <
|
|
272
|
-
const rightCol = regions.filter(r => r.bbox.w <= pageWidth * 0.6 && r.bbox.x + r.bbox.w / 2 >=
|
|
628
|
+
const leftCol = regions.filter(r => r.bbox.w <= pageWidth * 0.6 && r.bbox.x + r.bbox.w / 2 < gapX);
|
|
629
|
+
const rightCol = regions.filter(r => r.bbox.w <= pageWidth * 0.6 && r.bbox.x + r.bbox.w / 2 >= gapX);
|
|
273
630
|
const byY = (a, b) => a.bbox.y - b.bbox.y;
|
|
274
631
|
fullWidth.sort(byY);
|
|
275
632
|
leftCol.sort(byY);
|
|
276
633
|
rightCol.sort(byY);
|
|
634
|
+
|
|
635
|
+
// Interleave: full-width blocks mark section boundaries
|
|
277
636
|
regions.length = 0;
|
|
278
|
-
|
|
637
|
+
let li = 0, ri = 0;
|
|
638
|
+
for (const fw of fullWidth) {
|
|
639
|
+
while (li < leftCol.length && leftCol[li].bbox.y < fw.bbox.y) regions.push(leftCol[li++]);
|
|
640
|
+
while (ri < rightCol.length && rightCol[ri].bbox.y < fw.bbox.y) regions.push(rightCol[ri++]);
|
|
641
|
+
regions.push(fw);
|
|
642
|
+
}
|
|
643
|
+
while (li < leftCol.length) regions.push(leftCol[li++]);
|
|
644
|
+
while (ri < rightCol.length) regions.push(rightCol[ri++]);
|
|
279
645
|
} else {
|
|
280
|
-
// Single column: sort by Y then X
|
|
281
646
|
regions.sort((a, b) => {
|
|
282
647
|
if (Math.abs(a.bbox.y - b.bbox.y) > 10) return a.bbox.y - b.bbox.y;
|
|
283
648
|
return a.bbox.x - b.bbox.x;
|
|
@@ -296,26 +661,24 @@ async function analyzePage(page, OPS) {
|
|
|
296
661
|
|
|
297
662
|
// Get text content with styles
|
|
298
663
|
const textContent = await page.getTextContent();
|
|
299
|
-
const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles);
|
|
300
664
|
|
|
301
|
-
//
|
|
302
|
-
const
|
|
303
|
-
|
|
304
|
-
|
|
305
|
-
|
|
306
|
-
let
|
|
307
|
-
for (
|
|
308
|
-
|
|
309
|
-
|
|
310
|
-
|
|
311
|
-
|
|
312
|
-
block.fontScale = block.avgFontSize / bodyFontSize;
|
|
665
|
+
// Get operator list once (reused for text/non-text classification + image extraction + font metadata)
|
|
666
|
+
const opList = await page.getOperatorList();
|
|
667
|
+
|
|
668
|
+
// Identify text operation indices for operationsFilter
|
|
669
|
+
const textOpIndices = new Set();
|
|
670
|
+
let inTextBlock = false;
|
|
671
|
+
for (let i = 0; i < opList.fnArray.length; i++) {
|
|
672
|
+
const fn = opList.fnArray[i];
|
|
673
|
+
if (fn === OPS.beginText) inTextBlock = true;
|
|
674
|
+
if (inTextBlock) textOpIndices.add(i);
|
|
675
|
+
if (fn === OPS.endText) inTextBlock = false;
|
|
313
676
|
}
|
|
314
677
|
|
|
315
|
-
//
|
|
316
|
-
const
|
|
678
|
+
// Extract graphic regions from operator list CTM tracking
|
|
679
|
+
const opGraphicRegions = extractGraphicRegions(opList, OPS);
|
|
317
680
|
|
|
318
|
-
// Render
|
|
681
|
+
// Render non-text only (images, paths, fills, backgrounds)
|
|
319
682
|
const renderScale = 2;
|
|
320
683
|
const offCanvas = document.createElement("canvas");
|
|
321
684
|
offCanvas.width = Math.floor(pageWidth * renderScale);
|
|
@@ -326,9 +689,100 @@ async function analyzePage(page, OPS) {
|
|
|
326
689
|
await page.render({
|
|
327
690
|
canvasContext: offCtx,
|
|
328
691
|
viewport: renderViewport,
|
|
692
|
+
operationsFilter: (index) => !textOpIndices.has(index),
|
|
329
693
|
}).promise;
|
|
330
694
|
|
|
331
|
-
//
|
|
695
|
+
// Get precise image coordinates via recordImages (supplements CTM detection).
|
|
696
|
+
// This full render also loads fonts into commonObjs as a side effect.
|
|
697
|
+
let imageCoordRegions = [];
|
|
698
|
+
let fullRenderDone = false;
|
|
699
|
+
try {
|
|
700
|
+
const imgTrackCanvas = document.createElement("canvas");
|
|
701
|
+
imgTrackCanvas.width = offCanvas.width;
|
|
702
|
+
imgTrackCanvas.height = offCanvas.height;
|
|
703
|
+
const imgRenderTask = page.render({
|
|
704
|
+
canvasContext: imgTrackCanvas.getContext("2d"),
|
|
705
|
+
viewport: renderViewport,
|
|
706
|
+
recordImages: true,
|
|
707
|
+
});
|
|
708
|
+
await imgRenderTask.promise;
|
|
709
|
+
fullRenderDone = true;
|
|
710
|
+
const imageCoords = imgRenderTask.imageCoordinates;
|
|
711
|
+
if (imageCoords && imageCoords.length > 0) {
|
|
712
|
+
for (let j = 0; j < imageCoords.length; j += 6) {
|
|
713
|
+
const x1 = imageCoords[j], y1 = imageCoords[j + 1];
|
|
714
|
+
const x2 = imageCoords[j + 2], y2 = imageCoords[j + 3];
|
|
715
|
+
const x3 = imageCoords[j + 4], y3 = imageCoords[j + 5];
|
|
716
|
+
const xs = [x1, x2, x3];
|
|
717
|
+
const ys = [y1, y2, y3];
|
|
718
|
+
const minX = Math.min(...xs) / renderScale;
|
|
719
|
+
const maxX = Math.max(...xs) / renderScale;
|
|
720
|
+
const minY = Math.min(...ys) / renderScale;
|
|
721
|
+
const maxY = Math.max(...ys) / renderScale;
|
|
722
|
+
if (maxX - minX > 10 && maxY - minY > 10) {
|
|
723
|
+
imageCoordRegions.push({
|
|
724
|
+
type: "graphic",
|
|
725
|
+
bbox: { x: minX, y: minY, w: maxX - minX, h: maxY - minY },
|
|
726
|
+
screenCoords: true,
|
|
727
|
+
});
|
|
728
|
+
}
|
|
729
|
+
}
|
|
730
|
+
}
|
|
731
|
+
} catch (_) {
|
|
732
|
+
// recordImages not supported — CTM fallback is used
|
|
733
|
+
}
|
|
734
|
+
|
|
735
|
+
// Ensure fonts are loaded for commonObjs access. If the recordImages render
|
|
736
|
+
// above didn't run, do a minimal full render to trigger font loading.
|
|
737
|
+
if (!fullRenderDone) {
|
|
738
|
+
const fontCanvas = document.createElement("canvas");
|
|
739
|
+
fontCanvas.width = 1;
|
|
740
|
+
fontCanvas.height = 1;
|
|
741
|
+
const fontViewport = page.getViewport({ scale: 0.1 });
|
|
742
|
+
try {
|
|
743
|
+
await page.render({ canvasContext: fontCanvas.getContext("2d"), viewport: fontViewport }).promise;
|
|
744
|
+
} catch (_) {}
|
|
745
|
+
}
|
|
746
|
+
|
|
747
|
+
// Extract real font metadata from commonObjs (bold, italic, weight, loadedName)
|
|
748
|
+
const fontMap = await extractFontMetadata(page, opList, OPS);
|
|
749
|
+
|
|
750
|
+
// Extract text colors from operator list (parallel to text items)
|
|
751
|
+
const textColors = extractTextColors(opList, OPS);
|
|
752
|
+
|
|
753
|
+
// Now group text blocks with real font data and colors
|
|
754
|
+
const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap, textColors);
|
|
755
|
+
|
|
756
|
+
// Compute body font size (most common size = body text)
|
|
757
|
+
const allSizes = textBlocks.map(b => Math.round(b.avgFontSize * 10) / 10);
|
|
758
|
+
const freq = {};
|
|
759
|
+
for (const s of allSizes) freq[s] = (freq[s] || 0) + 1;
|
|
760
|
+
let bodyFontSize = 12;
|
|
761
|
+
let maxFreq = 0;
|
|
762
|
+
for (const [s, f] of Object.entries(freq)) {
|
|
763
|
+
if (f > maxFreq) { maxFreq = f; bodyFontSize = parseFloat(s); }
|
|
764
|
+
}
|
|
765
|
+
// Compute fontScale per block
|
|
766
|
+
for (const block of textBlocks) {
|
|
767
|
+
block.fontScale = block.avgFontSize / bodyFontSize;
|
|
768
|
+
}
|
|
769
|
+
|
|
770
|
+
// Detect graphics from rendered non-text canvas (catches vector graphics)
|
|
771
|
+
const renderGraphicRegions = detectGraphicRegionsFromRender(offCanvas, textBlocks, renderScale);
|
|
772
|
+
|
|
773
|
+
// Merge all sources, deduplicating by overlap
|
|
774
|
+
const graphicRegions = [...opGraphicRegions];
|
|
775
|
+
for (const rg of [...imageCoordRegions, ...renderGraphicRegions]) {
|
|
776
|
+
const overlapsExisting = graphicRegions.some(og => {
|
|
777
|
+
const ogBbox = og.screenCoords
|
|
778
|
+
? og.bbox
|
|
779
|
+
: { x: og.bbox.x, y: pageHeight - og.bbox.y - og.bbox.h, w: og.bbox.w, h: og.bbox.h };
|
|
780
|
+
return bboxOverlap(rg.bbox, ogBbox) > 0.3;
|
|
781
|
+
});
|
|
782
|
+
if (!overlapsExisting) graphicRegions.push(rg);
|
|
783
|
+
}
|
|
784
|
+
|
|
785
|
+
// Build region map (filters overlapping graphics, detects columns + alignment)
|
|
332
786
|
const regionMap = buildRegionMap(textBlocks, graphicRegions, pageHeight);
|
|
333
787
|
|
|
334
788
|
// Extract bitmap snippets for graphic regions only
|
|
@@ -354,13 +808,14 @@ async function analyzePage(page, OPS) {
|
|
|
354
808
|
textBlocks,
|
|
355
809
|
graphicRegions,
|
|
356
810
|
offCanvas,
|
|
811
|
+
fontMap,
|
|
357
812
|
};
|
|
358
813
|
}
|
|
359
814
|
|
|
360
815
|
// ─── Reflow + composite engine ────────────────────────────────────────────
|
|
361
816
|
|
|
362
817
|
function reflowAndComposite(analysis, opts) {
|
|
363
|
-
const { regionMap, bitmaps, pageWidth, pageHeight } = analysis;
|
|
818
|
+
const { regionMap, bitmaps, pageWidth, pageHeight, fontMap } = analysis;
|
|
364
819
|
const {
|
|
365
820
|
fontSize, fontFamily, lineHeight, padding, background,
|
|
366
821
|
textColor, imageFit, canvasW,
|
|
@@ -389,15 +844,22 @@ function reflowAndComposite(analysis, opts) {
|
|
|
389
844
|
continue;
|
|
390
845
|
}
|
|
391
846
|
|
|
392
|
-
// Per-block font properties
|
|
847
|
+
// Per-block font properties using real font metadata from commonObjs
|
|
393
848
|
const blockFontSize = Math.round(fontSize * (block.fontScale || 1));
|
|
394
849
|
const blockLH = blockFontSize * lineHeight;
|
|
850
|
+
const fm = block.fontMeta;
|
|
395
851
|
const style = block.isItalic ? "italic" : "normal";
|
|
396
|
-
const weight = block.isBold ? 700 : 400;
|
|
397
|
-
|
|
398
|
-
|
|
399
|
-
|
|
400
|
-
|
|
852
|
+
const weight = block.isBlack ? 900 : block.isBold ? 700 : 400;
|
|
853
|
+
|
|
854
|
+
// Use the actual embedded PDF font if available (PDF.js loaded it via @font-face)
|
|
855
|
+
let blockFamily;
|
|
856
|
+
if (fm?.loadedName) {
|
|
857
|
+
blockFamily = `"${fm.loadedName}", ${fm.fallbackName || "sans-serif"}`;
|
|
858
|
+
} else if (fm?.css) {
|
|
859
|
+
blockFamily = fm.css;
|
|
860
|
+
} else {
|
|
861
|
+
blockFamily = fontFamily;
|
|
862
|
+
}
|
|
401
863
|
const font = `${style} ${weight} ${blockFontSize}px ${blockFamily}`;
|
|
402
864
|
|
|
403
865
|
const prepared = prepareWithSegments(text, font);
|
|
@@ -413,6 +875,8 @@ function reflowAndComposite(analysis, opts) {
|
|
|
413
875
|
fontStyle: style,
|
|
414
876
|
fontWeight: weight,
|
|
415
877
|
fontFamily: blockFamily,
|
|
878
|
+
align: block.align || "left",
|
|
879
|
+
color: block.color,
|
|
416
880
|
region,
|
|
417
881
|
});
|
|
418
882
|
} else {
|
|
@@ -456,73 +920,6 @@ function reflowAndComposite(analysis, opts) {
|
|
|
456
920
|
return { totalHeight, reflowedRegions, fullPageFallback: false };
|
|
457
921
|
}
|
|
458
922
|
|
|
459
|
-
/**
|
|
460
|
-
* Draw the reflowed content to canvas.
|
|
461
|
-
*/
|
|
462
|
-
function drawComposite(ctx, reflowedRegions, analysis, opts, scrollY) {
|
|
463
|
-
const {
|
|
464
|
-
fontSize, fontFamily, lineHeight, padding,
|
|
465
|
-
background, textColor, canvasW, canvasH, dpr,
|
|
466
|
-
} = opts;
|
|
467
|
-
|
|
468
|
-
const d = dpr;
|
|
469
|
-
const baseLH = fontSize * lineHeight;
|
|
470
|
-
|
|
471
|
-
ctx.fillStyle = background;
|
|
472
|
-
ctx.fillRect(0, 0, canvasW * d, canvasH * d);
|
|
473
|
-
|
|
474
|
-
// Full page fallback
|
|
475
|
-
if (reflowedRegions.length === 0 && analysis.offCanvas) {
|
|
476
|
-
const availableWidth = canvasW - padding * 2;
|
|
477
|
-
const scale = Math.min(availableWidth / analysis.pageWidth, 1);
|
|
478
|
-
ctx.drawImage(
|
|
479
|
-
analysis.offCanvas,
|
|
480
|
-
padding * d, padding * d,
|
|
481
|
-
analysis.pageWidth * scale * d,
|
|
482
|
-
analysis.pageHeight * scale * d
|
|
483
|
-
);
|
|
484
|
-
return;
|
|
485
|
-
}
|
|
486
|
-
|
|
487
|
-
let cursorY = padding;
|
|
488
|
-
ctx.textBaseline = "top";
|
|
489
|
-
|
|
490
|
-
for (const r of reflowedRegions) {
|
|
491
|
-
if (r.type === "text" && r.lines) {
|
|
492
|
-
const fs = r.fontSize || fontSize;
|
|
493
|
-
const lh = r.lineHeight || baseLH;
|
|
494
|
-
const style = r.fontStyle || "normal";
|
|
495
|
-
const weight = r.fontWeight || 400;
|
|
496
|
-
|
|
497
|
-
ctx.fillStyle = textColor;
|
|
498
|
-
ctx.font = `${style} ${weight} ${fs * d}px ${fontFamily}`;
|
|
499
|
-
|
|
500
|
-
for (const line of r.lines) {
|
|
501
|
-
const screenY = cursorY - scrollY;
|
|
502
|
-
if (screenY > -lh && screenY < canvasH + lh) {
|
|
503
|
-
ctx.fillText(line.text, padding * d, screenY * d);
|
|
504
|
-
}
|
|
505
|
-
cursorY += lh;
|
|
506
|
-
}
|
|
507
|
-
} else if (r.type === "graphic" && r.bitmap) {
|
|
508
|
-
const screenY = cursorY - scrollY;
|
|
509
|
-
if (screenY > -r.drawH && screenY < canvasH + r.drawH) {
|
|
510
|
-
const tmpCanvas = document.createElement("canvas");
|
|
511
|
-
tmpCanvas.width = r.bitmap.data.width;
|
|
512
|
-
tmpCanvas.height = r.bitmap.data.height;
|
|
513
|
-
tmpCanvas.getContext("2d").putImageData(r.bitmap.data, 0, 0);
|
|
514
|
-
ctx.drawImage(
|
|
515
|
-
tmpCanvas,
|
|
516
|
-
padding * d, screenY * d,
|
|
517
|
-
r.drawW * d, r.drawH * d
|
|
518
|
-
);
|
|
519
|
-
}
|
|
520
|
-
cursorY += r.drawH;
|
|
521
|
-
}
|
|
522
|
-
cursorY += baseLH * 0.4;
|
|
523
|
-
}
|
|
524
|
-
}
|
|
525
|
-
|
|
526
923
|
// ─── Main API ─────────────────────────────────────────────────────────────
|
|
527
924
|
|
|
528
925
|
export function createReflowRenderer(container, options = {}) {
|
|
@@ -530,7 +927,7 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
530
927
|
const maxFont = options.maxFontSize ?? 48;
|
|
531
928
|
const fontFamily = options.fontFamily ?? '"Literata", Georgia, serif';
|
|
532
929
|
const lhRatio = options.lineHeight ?? 1.6;
|
|
533
|
-
|
|
930
|
+
let padding = options.padding ?? 24;
|
|
534
931
|
const bg = options.background ?? "#f4f1eb";
|
|
535
932
|
const textColor = options.textColor ?? "#252320";
|
|
536
933
|
const imageFit = options.imageFit ?? "proportional";
|
|
@@ -539,6 +936,12 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
539
936
|
const friction = options.friction ?? 0.95;
|
|
540
937
|
const onZoom = options.onZoom;
|
|
541
938
|
const onPageReady = options.onPageReady;
|
|
939
|
+
const enableMorph = options.enableMorph ?? false;
|
|
940
|
+
const morphRadius = options.morphRadius ?? 300;
|
|
941
|
+
const edgeFontRatio = options.edgeFontRatio ?? 0.5;
|
|
942
|
+
const maxWidth = options.maxWidth ?? Infinity;
|
|
943
|
+
const autoDetectPadding = options.autoDetectPadding ?? true;
|
|
944
|
+
const minPadding = options.minPadding ?? 20;
|
|
542
945
|
|
|
543
946
|
let pdfjs = null;
|
|
544
947
|
let pdfDoc = null;
|
|
@@ -589,6 +992,15 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
589
992
|
|
|
590
993
|
function reflow() {
|
|
591
994
|
if (!currentAnalysis || W === 0) return;
|
|
995
|
+
// Auto-detect padding from PDF page margins
|
|
996
|
+
if (autoDetectPadding && currentAnalysis.textBlocks.length > 0 && currentAnalysis.pageWidth > 0) {
|
|
997
|
+
const minX = Math.min(...currentAnalysis.textBlocks.map(b => b.bbox.x));
|
|
998
|
+
const maxX = Math.max(...currentAnalysis.textBlocks.map(b => b.bbox.x + b.bbox.w));
|
|
999
|
+
const rightMargin = currentAnalysis.pageWidth - maxX;
|
|
1000
|
+
const pdfMargin = Math.min(minX, rightMargin);
|
|
1001
|
+
const marginRatio = pdfMargin / currentAnalysis.pageWidth;
|
|
1002
|
+
padding = Math.round(Math.max(minPadding, W * marginRatio));
|
|
1003
|
+
}
|
|
592
1004
|
const result = reflowAndComposite(currentAnalysis, {
|
|
593
1005
|
fontSize, fontFamily, lineHeight: lhRatio, padding,
|
|
594
1006
|
background: bg, textColor, imageFit, canvasW: W, canvasH: H, dpr,
|
|
@@ -605,7 +1017,6 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
605
1017
|
ctx.fillRect(0, 0, W * dpr, H * dpr);
|
|
606
1018
|
return;
|
|
607
1019
|
}
|
|
608
|
-
// Inline draw for performance (avoid function call overhead in rAF)
|
|
609
1020
|
const d = dpr;
|
|
610
1021
|
const baseLH = fontSize * lhRatio;
|
|
611
1022
|
|
|
@@ -626,19 +1037,78 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
626
1037
|
|
|
627
1038
|
let cursorY = padding;
|
|
628
1039
|
ctx.textBaseline = "top";
|
|
1040
|
+
const viewCenter = H / 2;
|
|
629
1041
|
|
|
630
1042
|
for (const r of reflowedRegions) {
|
|
631
1043
|
if (r.type === "text" && r.lines) {
|
|
632
1044
|
const fs = r.fontSize || fontSize;
|
|
633
1045
|
const lh = r.lineHeight || baseLH;
|
|
634
1046
|
const rFamily = r.fontFamily || fontFamily;
|
|
635
|
-
|
|
636
|
-
|
|
1047
|
+
const style = r.fontStyle || "normal";
|
|
1048
|
+
const weight = r.fontWeight || 400;
|
|
1049
|
+
const centered = r.align === "center";
|
|
1050
|
+
const justified = r.align === "justify";
|
|
1051
|
+
const availW = W - padding * 2;
|
|
1052
|
+
|
|
1053
|
+
if (!enableMorph) {
|
|
1054
|
+
ctx.fillStyle = r.color || textColor;
|
|
1055
|
+
ctx.font = `${style} ${weight} ${fs * d}px ${rFamily}`;
|
|
1056
|
+
}
|
|
637
1057
|
|
|
638
|
-
for (
|
|
1058
|
+
for (let lineIdx = 0; lineIdx < r.lines.length; lineIdx++) {
|
|
1059
|
+
const line = r.lines[lineIdx];
|
|
639
1060
|
const screenY = cursorY - scrollY;
|
|
640
1061
|
if (screenY > -lh && screenY < H + lh) {
|
|
641
|
-
|
|
1062
|
+
// Justified: distribute extra space between words (not on last line)
|
|
1063
|
+
const isLastLine = lineIdx === r.lines.length - 1;
|
|
1064
|
+
const shouldJustify = justified && !isLastLine && line.text.includes(" ");
|
|
1065
|
+
|
|
1066
|
+
if (enableMorph) {
|
|
1067
|
+
const dist = Math.abs(screenY - viewCenter);
|
|
1068
|
+
const t = Math.min(dist / morphRadius, 1);
|
|
1069
|
+
const ease = 1 - (1 - t) ** 3;
|
|
1070
|
+
const morphedFS = fs * (1 - ease * (1 - edgeFontRatio));
|
|
1071
|
+
const opacity = 1.0 + (0.2 - 1.0) * ease;
|
|
1072
|
+
// Blend the block's actual color toward gray at edges
|
|
1073
|
+
const blockColor = r.color || textColor;
|
|
1074
|
+
let morphColor;
|
|
1075
|
+
if (blockColor.startsWith("#") && blockColor.length === 7) {
|
|
1076
|
+
const br = parseInt(blockColor.slice(1, 3), 16);
|
|
1077
|
+
const bg_ = parseInt(blockColor.slice(3, 5), 16);
|
|
1078
|
+
const bb = parseInt(blockColor.slice(5, 7), 16);
|
|
1079
|
+
const dimR = Math.round(br + (160 - br) * ease);
|
|
1080
|
+
const dimG = Math.round(bg_ + (160 - bg_) * ease);
|
|
1081
|
+
const dimB = Math.round(bb + (160 - bb) * ease);
|
|
1082
|
+
morphColor = `rgb(${dimR},${dimG},${dimB})`;
|
|
1083
|
+
} else {
|
|
1084
|
+
const c = Math.round(37 - (37 - 160) * ease);
|
|
1085
|
+
morphColor = `rgb(${c},${c - 2},${c - 3})`;
|
|
1086
|
+
}
|
|
1087
|
+
ctx.save();
|
|
1088
|
+
ctx.globalAlpha = opacity;
|
|
1089
|
+
ctx.fillStyle = morphColor;
|
|
1090
|
+
ctx.font = `${style} ${weight} ${morphedFS * d}px ${rFamily}`;
|
|
1091
|
+
if (centered) {
|
|
1092
|
+
ctx.textAlign = "center";
|
|
1093
|
+
ctx.fillText(line.text, (W / 2) * d, screenY * d);
|
|
1094
|
+
ctx.textAlign = "left";
|
|
1095
|
+
} else if (shouldJustify) {
|
|
1096
|
+
drawJustifiedLine(ctx, line.text, padding * d, screenY * d, availW * d);
|
|
1097
|
+
} else {
|
|
1098
|
+
ctx.fillText(line.text, padding * d, screenY * d);
|
|
1099
|
+
}
|
|
1100
|
+
ctx.restore();
|
|
1101
|
+
} else {
|
|
1102
|
+
if (centered) {
|
|
1103
|
+
ctx.textAlign = "center";
|
|
1104
|
+
ctx.fillText(line.text, (W / 2) * d, screenY * d);
|
|
1105
|
+
ctx.textAlign = "left";
|
|
1106
|
+
} else if (shouldJustify) {
|
|
1107
|
+
drawJustifiedLine(ctx, line.text, padding * d, screenY * d, availW * d);
|
|
1108
|
+
} else {
|
|
1109
|
+
ctx.fillText(line.text, padding * d, screenY * d);
|
|
1110
|
+
}
|
|
1111
|
+
}
|
|
642
1112
|
}
|
|
643
1113
|
cursorY += lh;
|
|
644
1114
|
}
|
|
@@ -646,7 +1116,19 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
646
1116
|
const screenY = cursorY - scrollY;
|
|
647
1117
|
if (screenY > -r.drawH && screenY < H + r.drawH) {
|
|
648
1118
|
const tmp = getTmpCanvas(r.bitmap);
|
|
649
|
-
|
|
1119
|
+
if (enableMorph) {
|
|
1120
|
+
const dist = Math.abs(screenY + r.drawH / 2 - viewCenter);
|
|
1121
|
+
const t = Math.min(dist / morphRadius, 1);
|
|
1122
|
+
const ease = 1 - (1 - t) ** 3;
|
|
1123
|
+
const imgScale = 1 - ease * (1 - edgeFontRatio);
|
|
1124
|
+
const opacity = 1.0 + (0.2 - 1.0) * ease;
|
|
1125
|
+
ctx.save();
|
|
1126
|
+
ctx.globalAlpha = opacity;
|
|
1127
|
+
ctx.drawImage(tmp, padding * d, screenY * d, r.drawW * imgScale * d, r.drawH * imgScale * d);
|
|
1128
|
+
ctx.restore();
|
|
1129
|
+
} else {
|
|
1130
|
+
ctx.drawImage(tmp, padding * d, screenY * d, r.drawW * d, r.drawH * d);
|
|
1131
|
+
}
|
|
650
1132
|
}
|
|
651
1133
|
cursorY += r.drawH;
|
|
652
1134
|
}
|
|
@@ -742,7 +1224,7 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
742
1224
|
|
|
743
1225
|
function handleResize() {
|
|
744
1226
|
dpr = Math.min(devicePixelRatio || 1, 3);
|
|
745
|
-
W = Math.min(container.clientWidth,
|
|
1227
|
+
W = Math.min(container.clientWidth, maxWidth);
|
|
746
1228
|
H = container.clientHeight;
|
|
747
1229
|
canvas.width = W * dpr;
|
|
748
1230
|
canvas.height = H * dpr;
|
|
@@ -789,11 +1271,14 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
789
1271
|
scrollY = 0;
|
|
790
1272
|
scrollVelocity = 0;
|
|
791
1273
|
reflow();
|
|
1274
|
+
onZoom?.(fontSize);
|
|
792
1275
|
|
|
793
1276
|
onPageReady?.({
|
|
794
1277
|
pageNum,
|
|
795
1278
|
textBlocks: currentAnalysis.textBlocks,
|
|
796
1279
|
graphicRegions: currentAnalysis.graphicRegions,
|
|
1280
|
+
pageWidth: currentAnalysis.pageWidth,
|
|
1281
|
+
pageHeight: currentAnalysis.pageHeight,
|
|
797
1282
|
});
|
|
798
1283
|
},
|
|
799
1284
|
|
|
@@ -838,6 +1323,7 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
838
1323
|
scrollY = 0;
|
|
839
1324
|
scrollVelocity = 0;
|
|
840
1325
|
reflow();
|
|
1326
|
+
onZoom?.(fontSize);
|
|
841
1327
|
},
|
|
842
1328
|
|
|
843
1329
|
async nextPage() {
|
|
@@ -866,6 +1352,13 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
866
1352
|
pdfDoc = null;
|
|
867
1353
|
},
|
|
868
1354
|
|
|
1355
|
+
setPadding(newPadding) {
|
|
1356
|
+
if (newPadding !== padding) {
|
|
1357
|
+
padding = newPadding;
|
|
1358
|
+
reflow();
|
|
1359
|
+
}
|
|
1360
|
+
},
|
|
1361
|
+
|
|
869
1362
|
setFontSize(newSize) {
|
|
870
1363
|
const clamped = clamp(newSize, minFont, maxFont);
|
|
871
1364
|
if (clamped !== fontSize) {
|