pretext-pdfjs 0.2.0 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +47 -9
- package/package.json +2 -2
- package/src/pinch.js +40 -2
- package/src/reflow.js +548 -134
package/README.md
CHANGED
|
@@ -86,7 +86,40 @@ await TextLayer.enableReflow(container, fullText, {
|
|
|
86
86
|
});
|
|
87
87
|
```
|
|
88
88
|
|
|
89
|
-
###
|
|
89
|
+
### Reflow Mode (images preserved)
|
|
90
|
+
|
|
91
|
+
```js
|
|
92
|
+
import { createReflowRenderer } from "pretext-pdfjs/reflow";
|
|
93
|
+
|
|
94
|
+
const renderer = createReflowRenderer(container, {
|
|
95
|
+
fontSize: 16,
|
|
96
|
+
enablePinchZoom: true,
|
|
97
|
+
enableMorph: false, // set true for fisheye scroll
|
|
98
|
+
fontFamily: '"Literata", Georgia, serif',
|
|
99
|
+
});
|
|
100
|
+
await renderer.open("document.pdf");
|
|
101
|
+
await renderer.showPage(1);
|
|
102
|
+
// Pinch to zoom — text reflows, images stay in place
|
|
103
|
+
```
|
|
104
|
+
|
|
105
|
+
Unlike the text-only reader modes, reflow mode preserves images, vector graphics,
|
|
106
|
+
and document structure. It uses PDF.js's `operationsFilter` to render non-text
|
|
107
|
+
elements separately, then composites Pretext-reflowed text on top.
|
|
108
|
+
|
|
109
|
+
### Pinch reader with preserved layout
|
|
110
|
+
|
|
111
|
+
```js
|
|
112
|
+
import { createPDFPinchReader } from "pretext-pdfjs/pinch";
|
|
113
|
+
|
|
114
|
+
const reader = createPDFPinchReader(container, {
|
|
115
|
+
mode: "pinchType",
|
|
116
|
+
preserveLayout: true, // images stay in place
|
|
117
|
+
});
|
|
118
|
+
await reader.open("document.pdf");
|
|
119
|
+
await reader.showPage(1);
|
|
120
|
+
```
|
|
121
|
+
|
|
122
|
+
### Per-block reflow (full options)
|
|
90
123
|
|
|
91
124
|
The reflow module bridges PDF mode (images preserved, no reflow) and reader modes (text reflows, images stripped). Text blocks reflow with Pretext at the target font size while images and vector graphics render as scaled bitmaps in their original positions.
|
|
92
125
|
|
|
@@ -101,8 +134,12 @@ const renderer = createReflowRenderer(container, {
|
|
|
101
134
|
background: "#f4f1eb",
|
|
102
135
|
textColor: "#252320",
|
|
103
136
|
imageFit: "proportional", // "proportional" | "original" | "full-width"
|
|
137
|
+
maxWidth: Infinity, // max canvas width (default: full container)
|
|
104
138
|
enablePinchZoom: true,
|
|
105
139
|
enableMomentumScroll: true,
|
|
140
|
+
enableMorph: false, // fisheye scroll effect on text + images
|
|
141
|
+
morphRadius: 300, // morph effect radius in px
|
|
142
|
+
edgeFontRatio: 0.5, // edge font = 50% of center font
|
|
106
143
|
onZoom: (fontSize) => console.log("Font size:", fontSize),
|
|
107
144
|
onPageReady: ({ pageNum, textBlocks, graphicRegions }) => {
|
|
108
145
|
console.log(`Page ${pageNum}: ${textBlocks.length} text blocks, ${graphicRegions.length} graphics`);
|
|
@@ -127,9 +164,9 @@ renderer.destroy();
|
|
|
127
164
|
|
|
128
165
|
**How it works:**
|
|
129
166
|
|
|
130
|
-
1. **Analyze** — extracts text blocks (grouped by proximity) and graphic regions (images, vector paths) from the PDF page via `getTextContent()` and `getOperatorList()`.
|
|
167
|
+
1. **Analyze** — extracts text blocks (grouped by proximity) and graphic regions (images, vector paths) from the PDF page via `getTextContent()` and `getOperatorList()`. Uses `operationsFilter` to render only non-text content to an offscreen canvas, and `recordImages` for precise image coordinates.
|
|
131
168
|
2. **Reflow** — each text block is reflowed with Pretext's `prepareWithSegments()` + `layoutWithLines()` at the current font size. Graphic bitmaps are scaled proportionally.
|
|
132
|
-
3. **Composite** — walks the region map in reading order, drawing reflowed text lines and graphic bitmaps onto a single output canvas.
|
|
169
|
+
3. **Composite** — walks the region map in reading order, drawing reflowed text lines and graphic bitmaps onto a single output canvas. With `enableMorph`, applies fisheye interpolation to both text and images.
|
|
133
170
|
|
|
134
171
|
Step 1 runs once per page (cached). Steps 2-3 re-run on font size change, which is what makes pinch-to-zoom fast.
|
|
135
172
|
|
|
@@ -142,18 +179,19 @@ pretext-pdfjs/
|
|
|
142
179
|
│ ├── pretext-text-layer.js # PretextTextLayer (drop-in replacement)
|
|
143
180
|
│ ├── measurement-cache.js # Pretext-style Canvas measurement cache
|
|
144
181
|
│ ├── viewer.js # PretextPDFViewer helper
|
|
145
|
-
│ ├── pinch.js # Pinch-type
|
|
146
|
-
│ └── reflow.js # Per-block reflow
|
|
147
|
-
├── demo.html #
|
|
182
|
+
│ ├── pinch.js # Pinch-type reading modes
|
|
183
|
+
│ └── reflow.js # Per-block reflow with image preservation
|
|
184
|
+
├── demo.html # Library landing page
|
|
185
|
+
├── reader.html # Full PDF reader demo
|
|
148
186
|
├── package.json
|
|
149
187
|
└── README.md
|
|
150
188
|
```
|
|
151
189
|
|
|
152
|
-
**Kept from PDF.js
|
|
190
|
+
**Kept from PDF.js**: core parser, canvas renderer, annotation layer, worker, font loading.
|
|
153
191
|
|
|
154
|
-
**Replaced**:
|
|
192
|
+
**Replaced**: TextLayer — measurement cache, ascent detection, width scaling.
|
|
155
193
|
|
|
156
|
-
**Added**:
|
|
194
|
+
**Added**: pretextMetrics, enableReflow(), pinch/morph reading modes, per-block reflow with image preservation.
|
|
157
195
|
|
|
158
196
|
## Built on
|
|
159
197
|
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "pretext-pdfjs",
|
|
3
|
-
"version": "0.2.0",
|
|
4
|
-
"description": "
|
|
3
|
+
"version": "0.3.0",
|
|
4
|
+
"description": "Pretext-native text layer for PDF.js — zero DOM reflows, per-block reflow with image preservation, pinch-to-zoom text",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./src/index.js",
|
|
7
7
|
"exports": {
|
package/src/pinch.js
CHANGED
|
@@ -339,13 +339,51 @@ function createTextCanvas(container, opts = {}) {
|
|
|
339
339
|
* @param {Function} [options.onPageLoad] - called with { pageNum, text, numPages }
|
|
340
340
|
*/
|
|
341
341
|
export function createPDFPinchReader(container, options = {}) {
|
|
342
|
+
const preserveLayout = options.preserveLayout ?? false;
|
|
343
|
+
const mode = options.mode || "pinchType";
|
|
344
|
+
|
|
345
|
+
// When preserveLayout is true, delegate to the reflow engine
|
|
346
|
+
if (preserveLayout) {
|
|
347
|
+
let reflowRenderer = null;
|
|
348
|
+
|
|
349
|
+
return {
|
|
350
|
+
async open(source) {
|
|
351
|
+
const { createReflowRenderer } = await import("./reflow.js");
|
|
352
|
+
reflowRenderer = createReflowRenderer(container, {
|
|
353
|
+
fontSize: options.fontSize ?? 18,
|
|
354
|
+
minFontSize: options.minFontSize ?? 8,
|
|
355
|
+
maxFontSize: options.maxFontSize ?? 60,
|
|
356
|
+
fontFamily: options.fontFamily,
|
|
357
|
+
lineHeight: options.lineHeight ?? 1.6,
|
|
358
|
+
padding: options.padding ?? 28,
|
|
359
|
+
background: options.background ?? "#0a0a0a",
|
|
360
|
+
textColor: options.textColor ?? "#e5e5e5",
|
|
361
|
+
enablePinchZoom: true,
|
|
362
|
+
enableMorph: mode === "pinchMorph" || mode === "scrollMorph",
|
|
363
|
+
friction: options.friction ?? 0.95,
|
|
364
|
+
workerSrc: options.workerSrc,
|
|
365
|
+
onZoom: options.onZoom,
|
|
366
|
+
});
|
|
367
|
+
return reflowRenderer.open(source);
|
|
368
|
+
},
|
|
369
|
+
async showPage(pageNum) { return reflowRenderer.showPage(pageNum); },
|
|
370
|
+
async showAll() { return reflowRenderer.showAll(); },
|
|
371
|
+
async nextPage() { return reflowRenderer.nextPage(); },
|
|
372
|
+
async prevPage() { return reflowRenderer.prevPage(); },
|
|
373
|
+
resize() { /* handled by ResizeObserver in reflow */ },
|
|
374
|
+
destroy() { reflowRenderer?.destroy(); },
|
|
375
|
+
get currentPage() { return reflowRenderer?.currentPage ?? 0; },
|
|
376
|
+
get numPages() { return reflowRenderer?.numPages ?? 0; },
|
|
377
|
+
get canvas() { return reflowRenderer?.canvas ?? null; },
|
|
378
|
+
get mode() { return mode; },
|
|
379
|
+
};
|
|
380
|
+
}
|
|
381
|
+
|
|
342
382
|
let pdfjs = null;
|
|
343
383
|
let pdfDoc = null;
|
|
344
384
|
let textInstance = null;
|
|
345
385
|
let currentPage = 0;
|
|
346
386
|
|
|
347
|
-
const mode = options.mode || "pinchType";
|
|
348
|
-
|
|
349
387
|
async function ensurePdfjs() {
|
|
350
388
|
if (pdfjs) return;
|
|
351
389
|
pdfjs = await import("pdfjs-dist");
|
package/src/reflow.js
CHANGED
|
@@ -14,6 +14,25 @@ function clamp(v, min, max) {
|
|
|
14
14
|
return Math.max(min, Math.min(max, v));
|
|
15
15
|
}
|
|
16
16
|
|
|
17
|
+
/**
 * Draw one line of text fully justified: the first word starts at `x`, the
 * last word ends at `x + availWidth`, and all gaps between words are equal.
 *
 * The line is drawn with a single plain `fillText(text, x, y)` instead when
 * there is nothing to distribute (zero or one word) or when the words alone
 * are wider than `availWidth` — a negative gap would make words overlap.
 *
 * @param {CanvasRenderingContext2D} ctx - 2D context; only `measureText` and
 *   `fillText` are used, with whatever font/fill state is currently set.
 * @param {string} text - The line to draw; words are separated by spaces.
 * @param {number} x - Left edge of the line.
 * @param {number} y - Vertical position, passed through to `fillText`.
 * @param {number} availWidth - Width the justified line should span.
 * @returns {void}
 */
function drawJustifiedLine(ctx, text, x, y, availWidth) {
  // Leading, trailing or repeated spaces make split(" ") emit empty strings.
  // Counting those as words would shift the real words off both margins.
  const words = text.split(" ").filter((word) => word.length > 0);
  if (words.length <= 1) {
    ctx.fillText(text, x, y);
    return;
  }

  // Measure every word exactly once; the width is needed both for the total
  // and for advancing the pen position.
  const widths = words.map((word) => ctx.measureText(word).width);
  const totalWordWidth = widths.reduce((sum, width) => sum + width, 0);
  const gap = (availWidth - totalWordWidth) / (words.length - 1);

  // Written as a negated comparison so a NaN gap also takes the fallback.
  if (!(gap >= 0)) {
    ctx.fillText(text, x, y);
    return;
  }

  let xPos = x;
  words.forEach((word, index) => {
    ctx.fillText(word, xPos, y);
    xPos += widths[index] + gap;
  });
}
|
|
35
|
+
|
|
17
36
|
function bboxOverlap(a, b) {
|
|
18
37
|
const x1 = Math.max(a.x, b.x);
|
|
19
38
|
const y1 = Math.max(a.y, b.y);
|
|
@@ -25,13 +44,49 @@ function bboxOverlap(a, b) {
|
|
|
25
44
|
return smaller > 0 ? intersection / smaller : 0;
|
|
26
45
|
}
|
|
27
46
|
|
|
47
|
+
// ─── Font metadata extraction ────────────────────────────────────────────
|
|
48
|
+
|
|
49
|
+
/**
 * Collect per-font style metadata for every font referenced by a `setFont`
 * operator in the operator list.
 *
 * Each distinct font reference is looked up once in `page.commonObjs`. A
 * lookup that throws or returns a falsy value is skipped, so the result may
 * cover only some of the fonts; per the original note, call this after
 * `page.render()` so the fonts have been loaded.
 *
 * @param {object} page - Page object exposing `commonObjs.get(name)`.
 * @param {{fnArray: ArrayLike<number>, argsArray: Array}} opList - Operator list.
 * @param {object} OPS - Operator code table; only `OPS.setFont` is read.
 * @returns {Promise<Map<string, object>>} Map from font reference name to
 *   `{ bold, black, italic, loadedName, fallbackName, css, isMonospace, isSerifFont }`.
 */
async function extractFontMetadata(page, opList, OPS) {
  const metadataByFont = new Map();

  for (const [opIndex, opCode] of opList.fnArray.entries()) {
    if (opCode !== OPS.setFont) continue;

    const refName = opList.argsArray[opIndex][0];
    if (metadataByFont.has(refName)) continue;

    try {
      const font = page.commonObjs.get(refName);
      if (!font) continue;

      const bold = font.bold || false;
      const black = font.black || false;
      const italic = font.italic || false;
      const loadedName = font.loadedName || null;
      const fallbackName = font.fallbackName || "sans-serif";
      const css = font.systemFontInfo?.css || null;
      const isMonospace = font.isMonospace || false;
      const isSerifFont = font.isSerifFont || false;

      metadataByFont.set(refName, {
        bold,
        black,
        italic,
        loadedName,
        fallbackName,
        css,
        isMonospace,
        isSerifFont,
      });
    } catch {
      // Font not yet loaded — leave it out of the map.
    }
  }

  return metadataByFont;
}
|
|
82
|
+
|
|
28
83
|
// ─── Page analysis ────────────────────────────────────────────────────────
|
|
29
84
|
|
|
30
85
|
/**
|
|
31
86
|
* Group adjacent text items into text blocks by proximity.
|
|
32
87
|
* Also extracts font metadata: average size, italic, bold.
|
|
33
88
|
*/
|
|
34
|
-
function groupTextBlocks(textItems, pageHeight, styles) {
|
|
89
|
+
function groupTextBlocks(textItems, pageHeight, styles, fontMap) {
|
|
35
90
|
const sorted = [...textItems].filter(i => i.str?.trim()).sort((a, b) => {
|
|
36
91
|
const ay = pageHeight - a.transform[5];
|
|
37
92
|
const by = pageHeight - b.transform[5];
|
|
@@ -61,14 +116,32 @@ function groupTextBlocks(textItems, pageHeight, styles) {
|
|
|
61
116
|
const verticalGap = Math.abs(y - lastY);
|
|
62
117
|
|
|
63
118
|
// Split block on significant font size change (headings vs body)
|
|
119
|
+
// But don't split for superscripts/markers that are horizontally adjacent
|
|
64
120
|
const sizeRatio = fontHeight > 0 && lastFH > 0
|
|
65
121
|
? Math.max(fontHeight, lastFH) / Math.min(fontHeight, lastFH)
|
|
66
122
|
: 1;
|
|
123
|
+
const lastX = lastItem.transform[4];
|
|
124
|
+
const lastW = lastItem.width || lastFH;
|
|
125
|
+
const hGap = x - (lastX + lastW);
|
|
126
|
+
const isHorizAdjacent = hGap < lastFH * 0.5 && hGap > -lastFH;
|
|
127
|
+
const isShortItem = (item.str || "").trim().length <= 2;
|
|
128
|
+
const isSuperscript = isShortItem && isHorizAdjacent && sizeRatio > 1.3;
|
|
129
|
+
const sizeOk = sizeRatio < 1.3 || isSuperscript;
|
|
130
|
+
|
|
131
|
+
// Large horizontal gap between consecutive items → likely column break
|
|
132
|
+
// Only for substantive text (skip short items like superscript markers)
|
|
133
|
+
const isLongItem = (item.str || "").trim().length > 3;
|
|
134
|
+
if (isLongItem && (hGap > lastFH * 1.5 ||
|
|
135
|
+
(current.bbox.w > lastFH * 10 && x < current.bbox.x - lastFH * 3))) {
|
|
136
|
+
blocks.push(current);
|
|
137
|
+
current = { items: [item], bbox: { x, y, w: item.width || 0, h: fontHeight } };
|
|
138
|
+
continue;
|
|
139
|
+
}
|
|
67
140
|
|
|
68
141
|
if (
|
|
69
|
-
|
|
142
|
+
sizeOk &&
|
|
70
143
|
verticalGap < lastFH * 2.5 &&
|
|
71
|
-
x < current.bbox.x + current.bbox.w + lastFH *
|
|
144
|
+
x < current.bbox.x + current.bbox.w + lastFH * 1.5
|
|
72
145
|
) {
|
|
73
146
|
current.items.push(item);
|
|
74
147
|
current.bbox.x = Math.min(current.bbox.x, x);
|
|
@@ -86,7 +159,48 @@ function groupTextBlocks(textItems, pageHeight, styles) {
|
|
|
86
159
|
}
|
|
87
160
|
if (current) blocks.push(current);
|
|
88
161
|
|
|
89
|
-
//
|
|
162
|
+
// Post-process: merge orphan tiny blocks (superscripts, markers like *, +, #)
|
|
163
|
+
// into the nearest larger block if vertically close
|
|
164
|
+
for (let i = blocks.length - 1; i >= 0; i--) {
|
|
165
|
+
const block = blocks[i];
|
|
166
|
+
if (block.items.length > 2) continue;
|
|
167
|
+
const text = block.items.map(it => (it.str || "").trim()).join("");
|
|
168
|
+
if (text.length > 3 || text.length === 0) continue;
|
|
169
|
+
|
|
170
|
+
let bestIdx = -1, bestDist = Infinity;
|
|
171
|
+
for (let j = 0; j < blocks.length; j++) {
|
|
172
|
+
if (j === i) continue;
|
|
173
|
+
const o = blocks[j];
|
|
174
|
+
// Skip other orphans (short text blocks)
|
|
175
|
+
const oText = o.items.map(it => (it.str || "").trim()).join("");
|
|
176
|
+
if (oText.length <= 3) continue;
|
|
177
|
+
// Check vertical proximity: orphan center within 30pt of target block
|
|
178
|
+
const bcy = block.bbox.y + block.bbox.h / 2;
|
|
179
|
+
if (bcy < o.bbox.y - 30 || bcy > o.bbox.y + o.bbox.h + 30) continue;
|
|
180
|
+
// Horizontal edge-to-edge distance (0 if overlapping)
|
|
181
|
+
const hDist = Math.max(0,
|
|
182
|
+
block.bbox.x > o.bbox.x + o.bbox.w ? block.bbox.x - (o.bbox.x + o.bbox.w) :
|
|
183
|
+
o.bbox.x > block.bbox.x + block.bbox.w ? o.bbox.x - (block.bbox.x + block.bbox.w) : 0);
|
|
184
|
+
if (hDist < bestDist) {
|
|
185
|
+
bestDist = hDist;
|
|
186
|
+
bestIdx = j;
|
|
187
|
+
}
|
|
188
|
+
}
|
|
189
|
+
|
|
190
|
+
if (bestIdx >= 0 && bestDist < Math.max(blocks[bestIdx].bbox.h, 20)) {
|
|
191
|
+
const target = blocks[bestIdx];
|
|
192
|
+
target.items.push(...block.items);
|
|
193
|
+
const newX = Math.min(target.bbox.x, block.bbox.x);
|
|
194
|
+
const newRight = Math.max(target.bbox.x + target.bbox.w, block.bbox.x + block.bbox.w);
|
|
195
|
+
const newBottom = Math.max(target.bbox.y + target.bbox.h, block.bbox.y + block.bbox.h);
|
|
196
|
+
target.bbox.x = newX;
|
|
197
|
+
target.bbox.w = newRight - newX;
|
|
198
|
+
target.bbox.h = newBottom - target.bbox.y;
|
|
199
|
+
blocks.splice(i, 1);
|
|
200
|
+
}
|
|
201
|
+
}
|
|
202
|
+
|
|
203
|
+
// Compute font metadata per block using real font objects from commonObjs
|
|
90
204
|
for (const block of blocks) {
|
|
91
205
|
const sizes = [];
|
|
92
206
|
let italicCount = 0;
|
|
@@ -96,18 +210,10 @@ function groupTextBlocks(textItems, pageHeight, styles) {
|
|
|
96
210
|
const fh = Math.hypot(item.transform[2], item.transform[3]);
|
|
97
211
|
if (fh > 0) sizes.push(fh);
|
|
98
212
|
|
|
99
|
-
|
|
100
|
-
|
|
101
|
-
|
|
102
|
-
|
|
103
|
-
const combined = name + " " + family;
|
|
104
|
-
|
|
105
|
-
if (combined.includes("italic") || combined.includes("oblique")) italicCount++;
|
|
106
|
-
if (combined.includes("bold") || combined.includes("black") || combined.includes("heavy")) boldCount++;
|
|
107
|
-
|
|
108
|
-
// Also detect italic from transform skew
|
|
109
|
-
if (Math.abs(item.transform[2]) > 0.1 && Math.abs(item.transform[1]) < 0.1) {
|
|
110
|
-
italicCount++;
|
|
213
|
+
const fontMeta = fontMap?.get(item.fontName);
|
|
214
|
+
if (fontMeta) {
|
|
215
|
+
if (fontMeta.italic) italicCount++;
|
|
216
|
+
if (fontMeta.bold || fontMeta.black) boldCount++;
|
|
111
217
|
}
|
|
112
218
|
}
|
|
113
219
|
|
|
@@ -116,10 +222,10 @@ function groupTextBlocks(textItems, pageHeight, styles) {
|
|
|
116
222
|
: 12;
|
|
117
223
|
block.isItalic = italicCount > block.items.length * 0.4;
|
|
118
224
|
block.isBold = boldCount > block.items.length * 0.4;
|
|
225
|
+
block.isBlack = block.items.some(it => fontMap?.get(it.fontName)?.black);
|
|
119
226
|
|
|
120
|
-
//
|
|
121
|
-
|
|
122
|
-
block.pdfFontFamily = sampleStyle?.fontFamily || null;
|
|
227
|
+
// Store the font metadata for the dominant font in this block
|
|
228
|
+
block.fontMeta = fontMap?.get(block.items[0]?.fontName) || null;
|
|
123
229
|
}
|
|
124
230
|
|
|
125
231
|
return blocks;
|
|
@@ -130,8 +236,7 @@ function groupTextBlocks(textItems, pageHeight, styles) {
|
|
|
130
236
|
* Only captures image operators (paintImageXObject etc).
|
|
131
237
|
* Skips path/fill/stroke to avoid false positives from text decorations.
|
|
132
238
|
*/
|
|
133
|
-
|
|
134
|
-
const ops = await page.getOperatorList();
|
|
239
|
+
function extractGraphicRegions(opList, OPS) {
|
|
135
240
|
const regions = [];
|
|
136
241
|
const ctmStack = [];
|
|
137
242
|
let ctm = [1, 0, 0, 1, 0, 0];
|
|
@@ -157,9 +262,9 @@ async function extractGraphicRegions(page, OPS) {
|
|
|
157
262
|
return [ctm[0] * x + ctm[2] * y + ctm[4], ctm[1] * x + ctm[3] * y + ctm[5]];
|
|
158
263
|
}
|
|
159
264
|
|
|
160
|
-
for (let i = 0; i <
|
|
161
|
-
const fn =
|
|
162
|
-
const args =
|
|
265
|
+
for (let i = 0; i < opList.fnArray.length; i++) {
|
|
266
|
+
const fn = opList.fnArray[i];
|
|
267
|
+
const args = opList.argsArray[i];
|
|
163
268
|
|
|
164
269
|
if (fn === OPS.save) {
|
|
165
270
|
ctmStack.push(ctm.slice());
|
|
@@ -192,6 +297,92 @@ async function extractGraphicRegions(page, OPS) {
|
|
|
192
297
|
return regions;
|
|
193
298
|
}
|
|
194
299
|
|
|
300
|
+
/**
 * Detect graphic regions by scanning a rendered canvas for visible content
 * that lies outside the known text blocks.
 *
 * The canvas is divided into 16px cells. Cells covered by a text block (plus
 * a `4 * renderScale` pixel margin) are ignored. Every other cell is sampled
 * at every second pixel in both directions; a sample counts as "dark" when
 * its alpha is above 20 and its luminance is below 240, and a cell has
 * content when more than 5% of its samples are dark. 4-connected groups of
 * content cells become candidate regions. A region is kept only when its
 * bounding box is more than 30 units wide and tall and it has more than
 * 4 cells.
 *
 * @param {HTMLCanvasElement} offCanvas - Canvas to scan; only `width`,
 *   `height` and `getContext("2d").getImageData()` are used.
 * @param {Array<{bbox: {x: number, y: number, w: number, h: number}}>} textBlocks
 *   Text blocks whose bboxes are multiplied by `renderScale` to get canvas pixels.
 * @param {number} renderScale - Canvas pixels per bbox unit.
 * @returns {Array<{type: "graphic", bbox: {x: number, y: number, w: number, h: number}, screenCoords: true}>}
 *   Regions in canvas pixels divided by `renderScale`, in row-major order of
 *   each region's first cell. Empty when the canvas has no area.
 */
function detectGraphicRegionsFromRender(offCanvas, textBlocks, renderScale) {
  const w = offCanvas.width;
  const h = offCanvas.height;
  // getImageData() rejects zero-sized rectangles, and an empty canvas has
  // nothing to scan anyway.
  if (!(w > 0 && h > 0)) return [];

  const ctx = offCanvas.getContext("2d");
  const cellPx = 16;
  const cols = Math.ceil(w / cellPx);
  const rows = Math.ceil(h / cellPx);
  const cellCount = cols * rows;

  // Mark cells covered by text blocks so text is not mistaken for a graphic.
  const occupied = new Uint8Array(cellCount);
  const margin = 4 * renderScale;
  for (const block of textBlocks) {
    const x0 = Math.floor(Math.max(0, block.bbox.x * renderScale - margin) / cellPx);
    const y0 = Math.floor(Math.max(0, block.bbox.y * renderScale - margin) / cellPx);
    const x1 = Math.ceil(Math.min(w, (block.bbox.x + block.bbox.w) * renderScale + margin) / cellPx);
    const y1 = Math.ceil(Math.min(h, (block.bbox.y + block.bbox.h) * renderScale + margin) / cellPx);
    for (let cy = y0; cy < y1 && cy < rows; cy++) {
      for (let cx = x0; cx < x1 && cx < cols; cx++) {
        occupied[cy * cols + cx] = 1;
      }
    }
  }

  // Scan the remaining cells for visible content.
  const pixels = ctx.getImageData(0, 0, w, h).data;
  const hasContent = new Uint8Array(cellCount);
  for (let cy = 0; cy < rows; cy++) {
    for (let cx = 0; cx < cols; cx++) {
      const cell = cy * cols + cx;
      if (occupied[cell]) continue;

      const px0 = cx * cellPx;
      const py0 = cy * cellPx;
      const px1 = Math.min(px0 + cellPx, w);
      const py1 = Math.min(py0 + cellPx, h);
      let dark = 0;
      let total = 0;
      // Sampling every second pixel is enough to decide whether a cell is inked.
      for (let py = py0; py < py1; py += 2) {
        for (let px = px0; px < px1; px += 2) {
          const idx = (py * w + px) * 4;
          if (pixels[idx + 3] > 20) {
            const lum = 0.299 * pixels[idx] + 0.587 * pixels[idx + 1] + 0.114 * pixels[idx + 2];
            if (lum < 240) dark++;
          }
          total++;
        }
      }
      if (total > 0 && dark / total > 0.05) hasContent[cell] = 1;
    }
  }

  // Connected-component labelling (4-connectivity). Each cell is enqueued at
  // most once, so one preallocated buffer with head/tail cursors serves every
  // component and avoids the O(n) cost of Array.prototype.shift().
  const visited = new Uint8Array(cellCount);
  const queue = new Int32Array(cellCount);
  const regions = [];

  for (let start = 0; start < cellCount; start++) {
    if (!hasContent[start] || visited[start]) continue;

    let head = 0;
    let tail = 0;
    queue[tail++] = start;
    visited[start] = 1;

    let minX = start % cols;
    let maxX = minX;
    let minY = Math.floor(start / cols);
    let maxY = minY;
    let count = 0;

    while (head < tail) {
      const cell = queue[head++];
      const qx = cell % cols;
      const qy = (cell - qx) / cols;
      if (qx < minX) minX = qx;
      if (qx > maxX) maxX = qx;
      if (qy < minY) minY = qy;
      if (qy > maxY) maxY = qy;
      count++;

      // Left, right, up, down — guarded so the walk never wraps across rows.
      if (qx > 0 && hasContent[cell - 1] && !visited[cell - 1]) {
        visited[cell - 1] = 1;
        queue[tail++] = cell - 1;
      }
      if (qx < cols - 1 && hasContent[cell + 1] && !visited[cell + 1]) {
        visited[cell + 1] = 1;
        queue[tail++] = cell + 1;
      }
      if (qy > 0 && hasContent[cell - cols] && !visited[cell - cols]) {
        visited[cell - cols] = 1;
        queue[tail++] = cell - cols;
      }
      if (qy < rows - 1 && hasContent[cell + cols] && !visited[cell + cols]) {
        visited[cell + cols] = 1;
        queue[tail++] = cell + cols;
      }
    }

    const rx = minX * cellPx / renderScale;
    const ry = minY * cellPx / renderScale;
    const rw = (maxX - minX + 1) * cellPx / renderScale;
    const rh = (maxY - minY + 1) * cellPx / renderScale;
    // Drop specks: tiny boxes and components made of only a few cells.
    if (rw > 30 && rh > 30 && count > 4) {
      regions.push({ type: "graphic", bbox: { x: rx, y: ry, w: rw, h: rh }, screenCoords: true });
    }
  }
  return regions;
}
|
|
385
|
+
|
|
195
386
|
/**
|
|
196
387
|
* Build text content for a block, preserving paragraph breaks.
|
|
197
388
|
*/
|
|
@@ -208,7 +399,8 @@ function blockToText(block, pageHeight) {
|
|
|
208
399
|
|
|
209
400
|
if (lastY !== null) {
|
|
210
401
|
const gap = Math.abs(currentY - lastY);
|
|
211
|
-
|
|
402
|
+
const isShortItem = (item.str || "").trim().length <= 2;
|
|
403
|
+
if (gap > lastFontSize * 1.8 && !isShortItem) {
|
|
212
404
|
result += "\n\n";
|
|
213
405
|
} else if (gap > lastFontSize * 0.3) {
|
|
214
406
|
if (!result.endsWith(" ") && !result.endsWith("\n")) {
|
|
@@ -237,9 +429,10 @@ function buildRegionMap(textBlocks, graphicRegions, pageHeight) {
|
|
|
237
429
|
}
|
|
238
430
|
|
|
239
431
|
for (const gr of graphicRegions) {
|
|
240
|
-
//
|
|
241
|
-
const
|
|
242
|
-
|
|
432
|
+
// Render-based regions are already in screen coords; op-based need conversion
|
|
433
|
+
const bbox = gr.screenCoords
|
|
434
|
+
? { ...gr.bbox }
|
|
435
|
+
: { x: gr.bbox.x, y: pageHeight - gr.bbox.y - gr.bbox.h, w: gr.bbox.w, h: gr.bbox.h };
|
|
243
436
|
|
|
244
437
|
// Skip if this graphic region overlaps significantly with any text block
|
|
245
438
|
const overlapsText = textBboxes.some(tb => bboxOverlap(bbox, tb) > 0.3);
|
|
@@ -248,11 +441,124 @@ function buildRegionMap(textBlocks, graphicRegions, pageHeight) {
|
|
|
248
441
|
}
|
|
249
442
|
}
|
|
250
443
|
|
|
251
|
-
//
|
|
252
|
-
regions.
|
|
253
|
-
|
|
254
|
-
|
|
255
|
-
|
|
444
|
+
// ── Column detection via histogram gap-finding ──
|
|
445
|
+
const pageWidth = Math.max(...regions.map(r => r.bbox.x + r.bbox.w), 1);
|
|
446
|
+
const narrowBlocks = regions.filter(r => r.bbox.w <= pageWidth * 0.6);
|
|
447
|
+
let gapX = pageWidth / 2;
|
|
448
|
+
let hasColumns = false;
|
|
449
|
+
|
|
450
|
+
if (narrowBlocks.length >= 4) {
|
|
451
|
+
// Build horizontal coverage histogram
|
|
452
|
+
const binCount = 100;
|
|
453
|
+
const binWidth = pageWidth / binCount;
|
|
454
|
+
const coverage = new Uint8Array(binCount);
|
|
455
|
+
for (const r of narrowBlocks) {
|
|
456
|
+
const b0 = Math.max(0, Math.floor(r.bbox.x / binWidth));
|
|
457
|
+
const b1 = Math.min(binCount, Math.ceil((r.bbox.x + r.bbox.w) / binWidth));
|
|
458
|
+
for (let b = b0; b < b1; b++) coverage[b]++;
|
|
459
|
+
}
|
|
460
|
+
|
|
461
|
+
// Find widest empty gap in middle 60% of page
|
|
462
|
+
const searchStart = Math.floor(binCount * 0.2);
|
|
463
|
+
const searchEnd = Math.ceil(binCount * 0.8);
|
|
464
|
+
let gapStart = -1, gapLen = 0, bestStart = -1, bestLen = 0;
|
|
465
|
+
for (let b = searchStart; b < searchEnd; b++) {
|
|
466
|
+
if (coverage[b] === 0) {
|
|
467
|
+
if (gapStart < 0) gapStart = b;
|
|
468
|
+
gapLen = b - gapStart + 1;
|
|
469
|
+
} else {
|
|
470
|
+
if (gapLen > bestLen) { bestLen = gapLen; bestStart = gapStart; }
|
|
471
|
+
gapStart = -1; gapLen = 0;
|
|
472
|
+
}
|
|
473
|
+
}
|
|
474
|
+
if (gapLen > bestLen) { bestLen = gapLen; bestStart = gapStart; }
|
|
475
|
+
|
|
476
|
+
if (bestLen >= 2) {
|
|
477
|
+
gapX = (bestStart + bestLen / 2) * binWidth;
|
|
478
|
+
const leftCount = narrowBlocks.filter(r => r.bbox.x + r.bbox.w / 2 < gapX).length;
|
|
479
|
+
const rightCount = narrowBlocks.filter(r => r.bbox.x + r.bbox.w / 2 >= gapX).length;
|
|
480
|
+
hasColumns = leftCount > 2 && rightCount > 2;
|
|
481
|
+
}
|
|
482
|
+
}
|
|
483
|
+
|
|
484
|
+
// ── Detect text alignment per block (including justified) ──
|
|
485
|
+
for (const region of regions) {
|
|
486
|
+
if (region.type !== "text") continue;
|
|
487
|
+
const block = region.block;
|
|
488
|
+
const leftMargin = block.bbox.x;
|
|
489
|
+
const rightMargin = pageWidth - (block.bbox.x + block.bbox.w);
|
|
490
|
+
const marginDiff = Math.abs(leftMargin - rightMargin);
|
|
491
|
+
|
|
492
|
+
// Detect justified text: multiple lines with consistent right edges
|
|
493
|
+
let isJustified = false;
|
|
494
|
+
if (block.items.length >= 3) {
|
|
495
|
+
const lines = [];
|
|
496
|
+
let lineItems = [];
|
|
497
|
+
let lastLineY = null;
|
|
498
|
+
for (const item of block.items) {
|
|
499
|
+
const y = pageHeight - item.transform[5];
|
|
500
|
+
if (lastLineY !== null && Math.abs(y - lastLineY) > 2) {
|
|
501
|
+
if (lineItems.length > 0) lines.push(lineItems);
|
|
502
|
+
lineItems = [];
|
|
503
|
+
}
|
|
504
|
+
lineItems.push(item);
|
|
505
|
+
lastLineY = y;
|
|
506
|
+
}
|
|
507
|
+
if (lineItems.length > 0) lines.push(lineItems);
|
|
508
|
+
|
|
509
|
+
if (lines.length >= 3) {
|
|
510
|
+
// Compute right edge of each line (except last — last line is usually ragged)
|
|
511
|
+
const rightEdges = [];
|
|
512
|
+
for (let li = 0; li < lines.length - 1; li++) {
|
|
513
|
+
const lastItem = lines[li][lines[li].length - 1];
|
|
514
|
+
const rightX = lastItem.transform[4] + (lastItem.width || 0);
|
|
515
|
+
rightEdges.push(rightX);
|
|
516
|
+
}
|
|
517
|
+
if (rightEdges.length >= 2) {
|
|
518
|
+
const maxRight = Math.max(...rightEdges);
|
|
519
|
+
const consistent = rightEdges.filter(r => Math.abs(r - maxRight) < pageWidth * 0.02);
|
|
520
|
+
isJustified = consistent.length > rightEdges.length * 0.7;
|
|
521
|
+
}
|
|
522
|
+
}
|
|
523
|
+
}
|
|
524
|
+
|
|
525
|
+
if (hasColumns && block.bbox.w <= pageWidth * 0.6) {
|
|
526
|
+
block.align = isJustified ? "justify" : "left";
|
|
527
|
+
} else if (isJustified) {
|
|
528
|
+
block.align = "justify";
|
|
529
|
+
} else if (leftMargin > pageWidth * 0.05 && marginDiff < pageWidth * 0.1) {
|
|
530
|
+
block.align = "center";
|
|
531
|
+
} else {
|
|
532
|
+
block.align = "left";
|
|
533
|
+
}
|
|
534
|
+
}
|
|
535
|
+
|
|
536
|
+
// ── Sort in reading order ──
|
|
537
|
+
if (hasColumns) {
|
|
538
|
+
const fullWidth = regions.filter(r => r.bbox.w > pageWidth * 0.6);
|
|
539
|
+
const leftCol = regions.filter(r => r.bbox.w <= pageWidth * 0.6 && r.bbox.x + r.bbox.w / 2 < gapX);
|
|
540
|
+
const rightCol = regions.filter(r => r.bbox.w <= pageWidth * 0.6 && r.bbox.x + r.bbox.w / 2 >= gapX);
|
|
541
|
+
const byY = (a, b) => a.bbox.y - b.bbox.y;
|
|
542
|
+
fullWidth.sort(byY);
|
|
543
|
+
leftCol.sort(byY);
|
|
544
|
+
rightCol.sort(byY);
|
|
545
|
+
|
|
546
|
+
// Interleave: full-width blocks mark section boundaries
|
|
547
|
+
regions.length = 0;
|
|
548
|
+
let li = 0, ri = 0;
|
|
549
|
+
for (const fw of fullWidth) {
|
|
550
|
+
while (li < leftCol.length && leftCol[li].bbox.y < fw.bbox.y) regions.push(leftCol[li++]);
|
|
551
|
+
while (ri < rightCol.length && rightCol[ri].bbox.y < fw.bbox.y) regions.push(rightCol[ri++]);
|
|
552
|
+
regions.push(fw);
|
|
553
|
+
}
|
|
554
|
+
while (li < leftCol.length) regions.push(leftCol[li++]);
|
|
555
|
+
while (ri < rightCol.length) regions.push(rightCol[ri++]);
|
|
556
|
+
} else {
|
|
557
|
+
regions.sort((a, b) => {
|
|
558
|
+
if (Math.abs(a.bbox.y - b.bbox.y) > 10) return a.bbox.y - b.bbox.y;
|
|
559
|
+
return a.bbox.x - b.bbox.x;
|
|
560
|
+
});
|
|
561
|
+
}
|
|
256
562
|
|
|
257
563
|
return regions;
|
|
258
564
|
}
|
|
@@ -266,26 +572,24 @@ async function analyzePage(page, OPS) {
|
|
|
266
572
|
|
|
267
573
|
// Get text content with styles
|
|
268
574
|
const textContent = await page.getTextContent();
|
|
269
|
-
const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles);
|
|
270
575
|
|
|
271
|
-
//
|
|
272
|
-
const
|
|
273
|
-
|
|
274
|
-
|
|
275
|
-
|
|
276
|
-
let
|
|
277
|
-
for (
|
|
278
|
-
|
|
279
|
-
|
|
280
|
-
|
|
281
|
-
|
|
282
|
-
block.fontScale = block.avgFontSize / bodyFontSize;
|
|
576
|
+
// Get operator list once (reused for text/non-text classification + image extraction + font metadata)
|
|
577
|
+
const opList = await page.getOperatorList();
|
|
578
|
+
|
|
579
|
+
// Identify text operation indices for operationsFilter
|
|
580
|
+
const textOpIndices = new Set();
|
|
581
|
+
let inTextBlock = false;
|
|
582
|
+
for (let i = 0; i < opList.fnArray.length; i++) {
|
|
583
|
+
const fn = opList.fnArray[i];
|
|
584
|
+
if (fn === OPS.beginText) inTextBlock = true;
|
|
585
|
+
if (inTextBlock) textOpIndices.add(i);
|
|
586
|
+
if (fn === OPS.endText) inTextBlock = false;
|
|
283
587
|
}
|
|
284
588
|
|
|
285
|
-
//
|
|
286
|
-
const
|
|
589
|
+
// Extract graphic regions from operator list CTM tracking
|
|
590
|
+
const opGraphicRegions = extractGraphicRegions(opList, OPS);
|
|
287
591
|
|
|
288
|
-
// Render
|
|
592
|
+
// Render non-text only (images, paths, fills, backgrounds)
|
|
289
593
|
const renderScale = 2;
|
|
290
594
|
const offCanvas = document.createElement("canvas");
|
|
291
595
|
offCanvas.width = Math.floor(pageWidth * renderScale);
|
|
@@ -296,9 +600,97 @@ async function analyzePage(page, OPS) {
|
|
|
296
600
|
await page.render({
|
|
297
601
|
canvasContext: offCtx,
|
|
298
602
|
viewport: renderViewport,
|
|
603
|
+
operationsFilter: (index) => !textOpIndices.has(index),
|
|
299
604
|
}).promise;
|
|
300
605
|
|
|
301
|
-
//
|
|
606
|
+
// Get precise image coordinates via recordImages (supplements CTM detection).
|
|
607
|
+
// This full render also loads fonts into commonObjs as a side effect.
|
|
608
|
+
let imageCoordRegions = [];
|
|
609
|
+
let fullRenderDone = false;
|
|
610
|
+
try {
|
|
611
|
+
const imgTrackCanvas = document.createElement("canvas");
|
|
612
|
+
imgTrackCanvas.width = offCanvas.width;
|
|
613
|
+
imgTrackCanvas.height = offCanvas.height;
|
|
614
|
+
const imgRenderTask = page.render({
|
|
615
|
+
canvasContext: imgTrackCanvas.getContext("2d"),
|
|
616
|
+
viewport: renderViewport,
|
|
617
|
+
recordImages: true,
|
|
618
|
+
});
|
|
619
|
+
await imgRenderTask.promise;
|
|
620
|
+
fullRenderDone = true;
|
|
621
|
+
const imageCoords = imgRenderTask.imageCoordinates;
|
|
622
|
+
if (imageCoords && imageCoords.length > 0) {
|
|
623
|
+
for (let j = 0; j < imageCoords.length; j += 6) {
|
|
624
|
+
const x1 = imageCoords[j], y1 = imageCoords[j + 1];
|
|
625
|
+
const x2 = imageCoords[j + 2], y2 = imageCoords[j + 3];
|
|
626
|
+
const x3 = imageCoords[j + 4], y3 = imageCoords[j + 5];
|
|
627
|
+
const xs = [x1, x2, x3];
|
|
628
|
+
const ys = [y1, y2, y3];
|
|
629
|
+
const minX = Math.min(...xs) / renderScale;
|
|
630
|
+
const maxX = Math.max(...xs) / renderScale;
|
|
631
|
+
const minY = Math.min(...ys) / renderScale;
|
|
632
|
+
const maxY = Math.max(...ys) / renderScale;
|
|
633
|
+
if (maxX - minX > 10 && maxY - minY > 10) {
|
|
634
|
+
imageCoordRegions.push({
|
|
635
|
+
type: "graphic",
|
|
636
|
+
bbox: { x: minX, y: minY, w: maxX - minX, h: maxY - minY },
|
|
637
|
+
screenCoords: true,
|
|
638
|
+
});
|
|
639
|
+
}
|
|
640
|
+
}
|
|
641
|
+
}
|
|
642
|
+
} catch (_) {
|
|
643
|
+
// recordImages not supported — CTM fallback is used
|
|
644
|
+
}
|
|
645
|
+
|
|
646
|
+
// Ensure fonts are loaded for commonObjs access. If the recordImages render
|
|
647
|
+
// above didn't run, do a minimal full render to trigger font loading.
|
|
648
|
+
if (!fullRenderDone) {
|
|
649
|
+
const fontCanvas = document.createElement("canvas");
|
|
650
|
+
fontCanvas.width = 1;
|
|
651
|
+
fontCanvas.height = 1;
|
|
652
|
+
const fontViewport = page.getViewport({ scale: 0.1 });
|
|
653
|
+
try {
|
|
654
|
+
await page.render({ canvasContext: fontCanvas.getContext("2d"), viewport: fontViewport }).promise;
|
|
655
|
+
} catch (_) {}
|
|
656
|
+
}
|
|
657
|
+
|
|
658
|
+
// Extract real font metadata from commonObjs (bold, italic, weight, loadedName)
|
|
659
|
+
const fontMap = await extractFontMetadata(page, opList, OPS);
|
|
660
|
+
|
|
661
|
+
// Now group text blocks with real font data
|
|
662
|
+
const textBlocks = groupTextBlocks(textContent.items, pageHeight, textContent.styles, fontMap);
|
|
663
|
+
|
|
664
|
+
// Compute body font size (most common size = body text)
|
|
665
|
+
const allSizes = textBlocks.map(b => Math.round(b.avgFontSize * 10) / 10);
|
|
666
|
+
const freq = {};
|
|
667
|
+
for (const s of allSizes) freq[s] = (freq[s] || 0) + 1;
|
|
668
|
+
let bodyFontSize = 12;
|
|
669
|
+
let maxFreq = 0;
|
|
670
|
+
for (const [s, f] of Object.entries(freq)) {
|
|
671
|
+
if (f > maxFreq) { maxFreq = f; bodyFontSize = parseFloat(s); }
|
|
672
|
+
}
|
|
673
|
+
// Compute fontScale per block
|
|
674
|
+
for (const block of textBlocks) {
|
|
675
|
+
block.fontScale = block.avgFontSize / bodyFontSize;
|
|
676
|
+
}
|
|
677
|
+
|
|
678
|
+
// Detect graphics from rendered non-text canvas (catches vector graphics)
|
|
679
|
+
const renderGraphicRegions = detectGraphicRegionsFromRender(offCanvas, textBlocks, renderScale);
|
|
680
|
+
|
|
681
|
+
// Merge all sources, deduplicating by overlap
|
|
682
|
+
const graphicRegions = [...opGraphicRegions];
|
|
683
|
+
for (const rg of [...imageCoordRegions, ...renderGraphicRegions]) {
|
|
684
|
+
const overlapsExisting = graphicRegions.some(og => {
|
|
685
|
+
const ogBbox = og.screenCoords
|
|
686
|
+
? og.bbox
|
|
687
|
+
: { x: og.bbox.x, y: pageHeight - og.bbox.y - og.bbox.h, w: og.bbox.w, h: og.bbox.h };
|
|
688
|
+
return bboxOverlap(rg.bbox, ogBbox) > 0.3;
|
|
689
|
+
});
|
|
690
|
+
if (!overlapsExisting) graphicRegions.push(rg);
|
|
691
|
+
}
|
|
692
|
+
|
|
693
|
+
// Build region map (filters overlapping graphics, detects columns + alignment)
|
|
302
694
|
const regionMap = buildRegionMap(textBlocks, graphicRegions, pageHeight);
|
|
303
695
|
|
|
304
696
|
// Extract bitmap snippets for graphic regions only
|
|
@@ -324,13 +716,14 @@ async function analyzePage(page, OPS) {
|
|
|
324
716
|
textBlocks,
|
|
325
717
|
graphicRegions,
|
|
326
718
|
offCanvas,
|
|
719
|
+
fontMap,
|
|
327
720
|
};
|
|
328
721
|
}
|
|
329
722
|
|
|
330
723
|
// ─── Reflow + composite engine ────────────────────────────────────────────
|
|
331
724
|
|
|
332
725
|
function reflowAndComposite(analysis, opts) {
|
|
333
|
-
const { regionMap, bitmaps, pageWidth, pageHeight } = analysis;
|
|
726
|
+
const { regionMap, bitmaps, pageWidth, pageHeight, fontMap } = analysis;
|
|
334
727
|
const {
|
|
335
728
|
fontSize, fontFamily, lineHeight, padding, background,
|
|
336
729
|
textColor, imageFit, canvasW,
|
|
@@ -359,17 +752,22 @@ function reflowAndComposite(analysis, opts) {
|
|
|
359
752
|
continue;
|
|
360
753
|
}
|
|
361
754
|
|
|
362
|
-
// Per-block font properties
|
|
755
|
+
// Per-block font properties using real font metadata from commonObjs
|
|
363
756
|
const blockFontSize = Math.round(fontSize * (block.fontScale || 1));
|
|
364
757
|
const blockLH = blockFontSize * lineHeight;
|
|
758
|
+
const fm = block.fontMeta;
|
|
365
759
|
const style = block.isItalic ? "italic" : "normal";
|
|
366
|
-
|
|
367
|
-
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
371
|
-
|
|
372
|
-
|
|
760
|
+
const weight = block.isBlack ? 900 : block.isBold ? 700 : 400;
|
|
761
|
+
|
|
762
|
+
// Use the actual embedded PDF font if available (PDF.js loaded it via @font-face)
|
|
763
|
+
let blockFamily;
|
|
764
|
+
if (fm?.loadedName) {
|
|
765
|
+
blockFamily = `"${fm.loadedName}", ${fm.fallbackName || "sans-serif"}`;
|
|
766
|
+
} else if (fm?.css) {
|
|
767
|
+
blockFamily = fm.css;
|
|
768
|
+
} else {
|
|
769
|
+
blockFamily = fontFamily;
|
|
770
|
+
}
|
|
373
771
|
const font = `${style} ${weight} ${blockFontSize}px ${blockFamily}`;
|
|
374
772
|
|
|
375
773
|
const prepared = prepareWithSegments(text, font);
|
|
@@ -385,6 +783,7 @@ function reflowAndComposite(analysis, opts) {
|
|
|
385
783
|
fontStyle: style,
|
|
386
784
|
fontWeight: weight,
|
|
387
785
|
fontFamily: blockFamily,
|
|
786
|
+
align: block.align || "left",
|
|
388
787
|
region,
|
|
389
788
|
});
|
|
390
789
|
} else {
|
|
@@ -428,73 +827,6 @@ function reflowAndComposite(analysis, opts) {
|
|
|
428
827
|
return { totalHeight, reflowedRegions, fullPageFallback: false };
|
|
429
828
|
}
|
|
430
829
|
|
|
431
|
-
/**
|
|
432
|
-
* Draw the reflowed content to canvas.
|
|
433
|
-
*/
|
|
434
|
-
function drawComposite(ctx, reflowedRegions, analysis, opts, scrollY) {
|
|
435
|
-
const {
|
|
436
|
-
fontSize, fontFamily, lineHeight, padding,
|
|
437
|
-
background, textColor, canvasW, canvasH, dpr,
|
|
438
|
-
} = opts;
|
|
439
|
-
|
|
440
|
-
const d = dpr;
|
|
441
|
-
const baseLH = fontSize * lineHeight;
|
|
442
|
-
|
|
443
|
-
ctx.fillStyle = background;
|
|
444
|
-
ctx.fillRect(0, 0, canvasW * d, canvasH * d);
|
|
445
|
-
|
|
446
|
-
// Full page fallback
|
|
447
|
-
if (reflowedRegions.length === 0 && analysis.offCanvas) {
|
|
448
|
-
const availableWidth = canvasW - padding * 2;
|
|
449
|
-
const scale = Math.min(availableWidth / analysis.pageWidth, 1);
|
|
450
|
-
ctx.drawImage(
|
|
451
|
-
analysis.offCanvas,
|
|
452
|
-
padding * d, padding * d,
|
|
453
|
-
analysis.pageWidth * scale * d,
|
|
454
|
-
analysis.pageHeight * scale * d
|
|
455
|
-
);
|
|
456
|
-
return;
|
|
457
|
-
}
|
|
458
|
-
|
|
459
|
-
let cursorY = padding;
|
|
460
|
-
ctx.textBaseline = "top";
|
|
461
|
-
|
|
462
|
-
for (const r of reflowedRegions) {
|
|
463
|
-
if (r.type === "text" && r.lines) {
|
|
464
|
-
const fs = r.fontSize || fontSize;
|
|
465
|
-
const lh = r.lineHeight || baseLH;
|
|
466
|
-
const style = r.fontStyle || "normal";
|
|
467
|
-
const weight = r.fontWeight || 400;
|
|
468
|
-
|
|
469
|
-
ctx.fillStyle = textColor;
|
|
470
|
-
ctx.font = `${style} ${weight} ${fs * d}px ${fontFamily}`;
|
|
471
|
-
|
|
472
|
-
for (const line of r.lines) {
|
|
473
|
-
const screenY = cursorY - scrollY;
|
|
474
|
-
if (screenY > -lh && screenY < canvasH + lh) {
|
|
475
|
-
ctx.fillText(line.text, padding * d, screenY * d);
|
|
476
|
-
}
|
|
477
|
-
cursorY += lh;
|
|
478
|
-
}
|
|
479
|
-
} else if (r.type === "graphic" && r.bitmap) {
|
|
480
|
-
const screenY = cursorY - scrollY;
|
|
481
|
-
if (screenY > -r.drawH && screenY < canvasH + r.drawH) {
|
|
482
|
-
const tmpCanvas = document.createElement("canvas");
|
|
483
|
-
tmpCanvas.width = r.bitmap.data.width;
|
|
484
|
-
tmpCanvas.height = r.bitmap.data.height;
|
|
485
|
-
tmpCanvas.getContext("2d").putImageData(r.bitmap.data, 0, 0);
|
|
486
|
-
ctx.drawImage(
|
|
487
|
-
tmpCanvas,
|
|
488
|
-
padding * d, screenY * d,
|
|
489
|
-
r.drawW * d, r.drawH * d
|
|
490
|
-
);
|
|
491
|
-
}
|
|
492
|
-
cursorY += r.drawH;
|
|
493
|
-
}
|
|
494
|
-
cursorY += baseLH * 0.4;
|
|
495
|
-
}
|
|
496
|
-
}
|
|
497
|
-
|
|
498
830
|
// ─── Main API ─────────────────────────────────────────────────────────────
|
|
499
831
|
|
|
500
832
|
export function createReflowRenderer(container, options = {}) {
|
|
@@ -502,7 +834,7 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
502
834
|
const maxFont = options.maxFontSize ?? 48;
|
|
503
835
|
const fontFamily = options.fontFamily ?? '"Literata", Georgia, serif';
|
|
504
836
|
const lhRatio = options.lineHeight ?? 1.6;
|
|
505
|
-
|
|
837
|
+
let padding = options.padding ?? 24;
|
|
506
838
|
const bg = options.background ?? "#f4f1eb";
|
|
507
839
|
const textColor = options.textColor ?? "#252320";
|
|
508
840
|
const imageFit = options.imageFit ?? "proportional";
|
|
@@ -511,6 +843,12 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
511
843
|
const friction = options.friction ?? 0.95;
|
|
512
844
|
const onZoom = options.onZoom;
|
|
513
845
|
const onPageReady = options.onPageReady;
|
|
846
|
+
const enableMorph = options.enableMorph ?? false;
|
|
847
|
+
const morphRadius = options.morphRadius ?? 300;
|
|
848
|
+
const edgeFontRatio = options.edgeFontRatio ?? 0.5;
|
|
849
|
+
const maxWidth = options.maxWidth ?? Infinity;
|
|
850
|
+
const autoDetectPadding = options.autoDetectPadding ?? true;
|
|
851
|
+
const minPadding = options.minPadding ?? 20;
|
|
514
852
|
|
|
515
853
|
let pdfjs = null;
|
|
516
854
|
let pdfDoc = null;
|
|
@@ -561,6 +899,15 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
561
899
|
|
|
562
900
|
function reflow() {
|
|
563
901
|
if (!currentAnalysis || W === 0) return;
|
|
902
|
+
// Auto-detect padding from PDF page margins
|
|
903
|
+
if (autoDetectPadding && currentAnalysis.textBlocks.length > 0 && currentAnalysis.pageWidth > 0) {
|
|
904
|
+
const minX = Math.min(...currentAnalysis.textBlocks.map(b => b.bbox.x));
|
|
905
|
+
const maxX = Math.max(...currentAnalysis.textBlocks.map(b => b.bbox.x + b.bbox.w));
|
|
906
|
+
const rightMargin = currentAnalysis.pageWidth - maxX;
|
|
907
|
+
const pdfMargin = Math.min(minX, rightMargin);
|
|
908
|
+
const marginRatio = pdfMargin / currentAnalysis.pageWidth;
|
|
909
|
+
padding = Math.round(Math.max(minPadding, W * marginRatio));
|
|
910
|
+
}
|
|
564
911
|
const result = reflowAndComposite(currentAnalysis, {
|
|
565
912
|
fontSize, fontFamily, lineHeight: lhRatio, padding,
|
|
566
913
|
background: bg, textColor, imageFit, canvasW: W, canvasH: H, dpr,
|
|
@@ -577,7 +924,6 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
577
924
|
ctx.fillRect(0, 0, W * dpr, H * dpr);
|
|
578
925
|
return;
|
|
579
926
|
}
|
|
580
|
-
// Inline draw for performance (avoid function call overhead in rAF)
|
|
581
927
|
const d = dpr;
|
|
582
928
|
const baseLH = fontSize * lhRatio;
|
|
583
929
|
|
|
@@ -598,19 +944,64 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
598
944
|
|
|
599
945
|
let cursorY = padding;
|
|
600
946
|
ctx.textBaseline = "top";
|
|
947
|
+
const viewCenter = H / 2;
|
|
601
948
|
|
|
602
949
|
for (const r of reflowedRegions) {
|
|
603
950
|
if (r.type === "text" && r.lines) {
|
|
604
951
|
const fs = r.fontSize || fontSize;
|
|
605
952
|
const lh = r.lineHeight || baseLH;
|
|
606
953
|
const rFamily = r.fontFamily || fontFamily;
|
|
607
|
-
|
|
608
|
-
|
|
954
|
+
const style = r.fontStyle || "normal";
|
|
955
|
+
const weight = r.fontWeight || 400;
|
|
956
|
+
const centered = r.align === "center";
|
|
957
|
+
const justified = r.align === "justify";
|
|
958
|
+
const availW = W - padding * 2;
|
|
959
|
+
|
|
960
|
+
if (!enableMorph) {
|
|
961
|
+
ctx.fillStyle = textColor;
|
|
962
|
+
ctx.font = `${style} ${weight} ${fs * d}px ${rFamily}`;
|
|
963
|
+
}
|
|
609
964
|
|
|
610
|
-
for (
|
|
965
|
+
for (let lineIdx = 0; lineIdx < r.lines.length; lineIdx++) {
|
|
966
|
+
const line = r.lines[lineIdx];
|
|
611
967
|
const screenY = cursorY - scrollY;
|
|
612
968
|
if (screenY > -lh && screenY < H + lh) {
|
|
613
|
-
|
|
969
|
+
// Justified: distribute extra space between words (not on last line)
|
|
970
|
+
const isLastLine = lineIdx === r.lines.length - 1;
|
|
971
|
+
const shouldJustify = justified && !isLastLine && line.text.includes(" ");
|
|
972
|
+
|
|
973
|
+
if (enableMorph) {
|
|
974
|
+
const dist = Math.abs(screenY - viewCenter);
|
|
975
|
+
const t = Math.min(dist / morphRadius, 1);
|
|
976
|
+
const ease = 1 - (1 - t) ** 3;
|
|
977
|
+
const morphedFS = fs * (1 - ease * (1 - edgeFontRatio));
|
|
978
|
+
const opacity = 1.0 + (0.2 - 1.0) * ease;
|
|
979
|
+
const c = Math.round(37 - (37 - 160) * ease);
|
|
980
|
+
ctx.save();
|
|
981
|
+
ctx.globalAlpha = opacity;
|
|
982
|
+
ctx.fillStyle = `rgb(${c},${c - 2},${c - 3})`;
|
|
983
|
+
ctx.font = `${style} ${weight} ${morphedFS * d}px ${rFamily}`;
|
|
984
|
+
if (centered) {
|
|
985
|
+
ctx.textAlign = "center";
|
|
986
|
+
ctx.fillText(line.text, (W / 2) * d, screenY * d);
|
|
987
|
+
ctx.textAlign = "left";
|
|
988
|
+
} else if (shouldJustify) {
|
|
989
|
+
drawJustifiedLine(ctx, line.text, padding * d, screenY * d, availW * d);
|
|
990
|
+
} else {
|
|
991
|
+
ctx.fillText(line.text, padding * d, screenY * d);
|
|
992
|
+
}
|
|
993
|
+
ctx.restore();
|
|
994
|
+
} else {
|
|
995
|
+
if (centered) {
|
|
996
|
+
ctx.textAlign = "center";
|
|
997
|
+
ctx.fillText(line.text, (W / 2) * d, screenY * d);
|
|
998
|
+
ctx.textAlign = "left";
|
|
999
|
+
} else if (shouldJustify) {
|
|
1000
|
+
drawJustifiedLine(ctx, line.text, padding * d, screenY * d, availW * d);
|
|
1001
|
+
} else {
|
|
1002
|
+
ctx.fillText(line.text, padding * d, screenY * d);
|
|
1003
|
+
}
|
|
1004
|
+
}
|
|
614
1005
|
}
|
|
615
1006
|
cursorY += lh;
|
|
616
1007
|
}
|
|
@@ -618,7 +1009,19 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
618
1009
|
const screenY = cursorY - scrollY;
|
|
619
1010
|
if (screenY > -r.drawH && screenY < H + r.drawH) {
|
|
620
1011
|
const tmp = getTmpCanvas(r.bitmap);
|
|
621
|
-
|
|
1012
|
+
if (enableMorph) {
|
|
1013
|
+
const dist = Math.abs(screenY + r.drawH / 2 - viewCenter);
|
|
1014
|
+
const t = Math.min(dist / morphRadius, 1);
|
|
1015
|
+
const ease = 1 - (1 - t) ** 3;
|
|
1016
|
+
const imgScale = 1 - ease * (1 - edgeFontRatio);
|
|
1017
|
+
const opacity = 1.0 + (0.2 - 1.0) * ease;
|
|
1018
|
+
ctx.save();
|
|
1019
|
+
ctx.globalAlpha = opacity;
|
|
1020
|
+
ctx.drawImage(tmp, padding * d, screenY * d, r.drawW * imgScale * d, r.drawH * imgScale * d);
|
|
1021
|
+
ctx.restore();
|
|
1022
|
+
} else {
|
|
1023
|
+
ctx.drawImage(tmp, padding * d, screenY * d, r.drawW * d, r.drawH * d);
|
|
1024
|
+
}
|
|
622
1025
|
}
|
|
623
1026
|
cursorY += r.drawH;
|
|
624
1027
|
}
|
|
@@ -714,7 +1117,7 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
714
1117
|
|
|
715
1118
|
function handleResize() {
|
|
716
1119
|
dpr = Math.min(devicePixelRatio || 1, 3);
|
|
717
|
-
W = Math.min(container.clientWidth,
|
|
1120
|
+
W = Math.min(container.clientWidth, maxWidth);
|
|
718
1121
|
H = container.clientHeight;
|
|
719
1122
|
canvas.width = W * dpr;
|
|
720
1123
|
canvas.height = H * dpr;
|
|
@@ -761,11 +1164,14 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
761
1164
|
scrollY = 0;
|
|
762
1165
|
scrollVelocity = 0;
|
|
763
1166
|
reflow();
|
|
1167
|
+
onZoom?.(fontSize);
|
|
764
1168
|
|
|
765
1169
|
onPageReady?.({
|
|
766
1170
|
pageNum,
|
|
767
1171
|
textBlocks: currentAnalysis.textBlocks,
|
|
768
1172
|
graphicRegions: currentAnalysis.graphicRegions,
|
|
1173
|
+
pageWidth: currentAnalysis.pageWidth,
|
|
1174
|
+
pageHeight: currentAnalysis.pageHeight,
|
|
769
1175
|
});
|
|
770
1176
|
},
|
|
771
1177
|
|
|
@@ -810,6 +1216,7 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
810
1216
|
scrollY = 0;
|
|
811
1217
|
scrollVelocity = 0;
|
|
812
1218
|
reflow();
|
|
1219
|
+
onZoom?.(fontSize);
|
|
813
1220
|
},
|
|
814
1221
|
|
|
815
1222
|
async nextPage() {
|
|
@@ -838,6 +1245,13 @@ export function createReflowRenderer(container, options = {}) {
|
|
|
838
1245
|
pdfDoc = null;
|
|
839
1246
|
},
|
|
840
1247
|
|
|
1248
|
+
setPadding(newPadding) {
|
|
1249
|
+
if (newPadding !== padding) {
|
|
1250
|
+
padding = newPadding;
|
|
1251
|
+
reflow();
|
|
1252
|
+
}
|
|
1253
|
+
},
|
|
1254
|
+
|
|
841
1255
|
setFontSize(newSize) {
|
|
842
1256
|
const clamped = clamp(newSize, minFont, maxFont);
|
|
843
1257
|
if (clamped !== fontSize) {
|