@polotno/pdf-import 0.0.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +39 -0
- package/lib/color-utils.d.ts +2 -0
- package/lib/color-utils.js +10 -0
- package/lib/constants.d.ts +13 -0
- package/lib/constants.js +111 -0
- package/lib/font-mapper.d.ts +7 -0
- package/lib/font-mapper.js +111 -0
- package/lib/font-matcher.d.ts +10 -0
- package/lib/font-matcher.js +89 -0
- package/lib/font-merger.d.ts +7 -0
- package/lib/font-merger.js +114 -0
- package/lib/font-registry.d.ts +15 -0
- package/lib/font-registry.js +110 -0
- package/lib/image-encoder.d.ts +3 -0
- package/lib/image-encoder.js +181 -0
- package/lib/index.d.ts +97 -0
- package/lib/index.js +1 -0
- package/lib/operator-list-helpers.d.ts +6 -0
- package/lib/operator-list-helpers.js +26 -0
- package/lib/operator-list.d.ts +99 -0
- package/lib/operator-list.js +528 -0
- package/lib/page-parser.d.ts +18 -0
- package/lib/page-parser.js +674 -0
- package/lib/pdf-image-extractor.d.ts +14 -0
- package/lib/pdf-image-extractor.js +91 -0
- package/lib/svg-builder.d.ts +23 -0
- package/lib/svg-builder.js +213 -0
- package/lib/text-blocks.d.ts +6 -0
- package/lib/text-blocks.js +294 -0
- package/lib/text-grouper.d.ts +11 -0
- package/lib/text-grouper.js +11 -0
- package/lib/text-layout.d.ts +3 -0
- package/lib/text-layout.js +318 -0
- package/lib/text-span-extractor.d.ts +5 -0
- package/lib/text-span-extractor.js +271 -0
- package/lib/text-types.d.ts +25 -0
- package/lib/text-types.js +2 -0
- package/package.json +46 -0
|
@@ -0,0 +1,674 @@
|
|
|
1
|
+
import { extractTextPositionColors, extractDrawingsAndImages, } from './operator-list.js';
|
|
2
|
+
import { groupTextItems, groupSpansByBlock, detectAlignment, estimatePageMargins, computeLineHeight, } from './text-grouper.js';
|
|
3
|
+
import { mapPdfFont, isKnownWebFont, extractWeightFromName, extractStyleFromName, } from './font-mapper.js';
|
|
4
|
+
import { drawingToSvg, svgToDataUri, clippedDrawingsToSvg, clipPathToSvg, } from './svg-builder.js';
|
|
5
|
+
import { imageDataToDataUri } from './image-encoder.js';
|
|
6
|
+
import { rgbTupleToHex } from './color-utils.js';
|
|
7
|
+
import { MIN_TEXT_WIDTH, MIN_TEXT_HEIGHT, MIN_IMAGE_WIDTH, MIN_IMAGE_HEIGHT, } from './constants.js';
|
|
8
|
+
import { parseRef } from './pdf-image-extractor.js';
|
|
9
|
+
import { imageBytesToDataUri } from './image-encoder.js';
|
|
10
|
+
/**
 * Parse a single pdf.js page into a page description whose children are
 * SVG, image and text elements ordered by PDF paint order.
 *
 * @param {object} options
 * @param {object} options.page - pdf.js page proxy (getViewport, getOperatorList, objs, ...).
 * @param {number} options.pageIdx - Page index, forwarded to image element naming.
 * @param {object} options.fontRegistry - Registry forwarded to collectPageFonts.
 * @param {Function} options.generateId - Called once per created element and once for the page.
 * @param {object} options.jpegIndex - Lookup forwarded to buildImageElements.
 * @returns {Promise<{parsedPage: object, pageWidth: number, pageHeight: number}>}
 */
export async function parsePage({ page, pageIdx, fontRegistry, generateId, jpegIndex, }) {
    const { width: pageWidth, height: pageHeight, transform } = page.getViewport({ scale: 1 });
    // The viewport transform's y-offset is used for coordinate flipping instead of
    // pageHeight so that a CropBox offset inside the MediaBox does not shift elements.
    const yFlipOffset = transform[5];
    // One operator list feeds text colours, drawings and image references.
    const operatorList = await page.getOperatorList();
    const positionColors = extractTextPositionColors(operatorList, yFlipOffset);
    const fontRefs = positionColors.fontRefs || new Set();
    const extracted = extractDrawingsAndImages(operatorList, yFlipOffset);
    const drawings = extracted.drawings;
    const imageRefs = extracted.imageRefs;
    // Start the three independent async steps together, then wait for all of them.
    const gradientsTask = resolveDrawingGradients(page, drawings);
    const imagesTask = buildImageElements(page, imageRefs, pageIdx, generateId, jpegIndex);
    const fontsTask = collectPageFonts(page, fontRefs, fontRegistry);
    const settled = await Promise.all([gradientsTask, imagesTask, fontsTask]);
    const imageElements = settled[1];
    const { fontNameMap, fontAscentMap, fontOtMap } = settled[2];
    // Gradients are resolved at this point, so drawings can be inspected and rendered.
    const background = detectPageBackground(drawings, pageWidth, pageHeight);
    const svgElements = buildSvgElements(drawings, pageWidth, pageHeight, generateId);
    const textElements = await buildTextElements({
        page,
        pageWidth,
        yFlipOffset,
        positionColors,
        fontNameMap,
        fontAscentMap,
        fontOtMap,
        generateId,
    });
    // Interleave all element kinds by paint order, then drop the internal sort key.
    const ordered = [...svgElements, ...imageElements, ...textElements];
    ordered.sort((left, right) => left._order - right._order);
    const children = [];
    for (const element of ordered) {
        const { _order, ...publicFields } = element;
        children.push(publicFields);
    }
    const parsedPage = {
        id: generateId(),
        children,
        background,
    };
    return { parsedPage, pageWidth, pageHeight };
}
|
|
55
|
+
/**
 * Attach a `gradient` description to every drawing that references a shading.
 *
 * Only the first (primary) shading name of a drawing is resolved. The shading
 * is looked up through `page.objs.get` with a 3 second timeout. Drawings whose
 * shading cannot be resolved, or is not an axial/radial shading, are left
 * unchanged (best effort, never throws).
 *
 * Expected shading layout: [0] = "RadialAxial", [1] = "axial" | "radial",
 * [3] = colour stops as [offset, color] pairs, [4] = start point,
 * [5] = end point, and for radial shadings [6] = start radius, [7] = end radius.
 *
 * @param {object} page - pdf.js page proxy exposing `objs.get(name, callback)`.
 * @param {Array<object>} drawings - Drawings to mutate in place.
 * @returns {Promise<void>}
 */
async function resolveDrawingGradients(page, drawings) {
    for (const drawing of drawings) {
        if (!drawing._shadingNames || !(drawing._shadingNames.length > 0)) {
            continue;
        }
        const shadingName = drawing._shadingNames[0];
        try {
            const shading = await new Promise((resolve, reject) => {
                const timer = setTimeout(() => reject(new Error('timeout')), 3000);
                try {
                    page.objs.get(shadingName, (data) => {
                        clearTimeout(timer);
                        if (data)
                            resolve(data);
                        else
                            reject(new Error('no data'));
                    });
                }
                catch (err) {
                    // A synchronous failure must not leave the timer pending.
                    clearTimeout(timer);
                    reject(err);
                }
            });
            // Other shading kinds (e.g. mesh shadings, which carry a numeric type in
            // slot 1) use a different layout and must not be read as gradients.
            if (!shading || shading[0] !== 'RadialAxial' || !shading[1]) {
                continue;
            }
            const stops = [];
            if (Array.isArray(shading[3])) {
                for (const [offset, color] of shading[3]) {
                    stops.push({ offset, color });
                }
            }
            if (shading[1] === 'radial') {
                drawing.gradient = {
                    type: 'radial',
                    stops,
                    cx: shading[4]?.[0] ?? 0,
                    cy: shading[4]?.[1] ?? 0,
                    r0: shading[6] ?? 0,
                    r1: shading[7] ?? 0,
                };
            }
            else {
                drawing.gradient = {
                    type: 'linear',
                    stops,
                    x1: shading[4]?.[0] ?? 0,
                    y1: shading[4]?.[1] ?? 0,
                    x2: shading[5]?.[0] ?? 0,
                    y2: shading[5]?.[1] ?? 0,
                };
            }
            // A gradient-only drawing has no solid fill; give it a placeholder so
            // later stages do not skip it.
            if (!drawing.fill) {
                drawing.fill = [0, 0, 0];
            }
        }
        catch {
            // Couldn't resolve shading — leave the drawing as-is.
        }
    }
}
|
|
113
|
+
/**
 * Pick the page background colour from the drawings.
 *
 * A filled drawing whose bounding rect spans at least 90% of both the page
 * width and the page height is treated as a background. When several qualify,
 * the last one in the list wins. Without any such drawing the background is white.
 *
 * @param {Array<object>} drawings - Drawings with `fill` ([r, g, b] or null) and `rect` ([x0, y0, x1, y1]).
 * @param {number} pageWidth
 * @param {number} pageHeight
 * @returns {string} Result of rgbTupleToHex for the winning fill, or '#FFFFFF'.
 */
function detectPageBackground(drawings, pageWidth, pageHeight) {
    const minWidth = pageWidth * 0.9;
    const minHeight = pageHeight * 0.9;
    let pageBackground = '#FFFFFF';
    for (const drawing of drawings) {
        // Skip unfilled drawings; `== null` also covers an undefined fill, which
        // would otherwise throw when destructured below.
        if (drawing.fill == null) {
            continue;
        }
        const spanX = drawing.rect[2] - drawing.rect[0];
        const spanY = drawing.rect[3] - drawing.rect[1];
        if (spanX >= minWidth && spanY >= minHeight) {
            const [r, g, b] = drawing.fill;
            pageBackground = rgbTupleToHex(r, g, b);
        }
    }
    return pageBackground;
}
|
|
127
|
+
/**
 * Turn drawings into SVG elements.
 *
 * Fully transparent drawings are skipped. Consecutive drawings that form a
 * clip run (see isMergeableClipRunDrawing / isSameClipRun) are collected, and
 * a run of four or more is rendered as one merged SVG via clippedDrawingsToSvg.
 * Shorter runs, and runs for which merging yields nothing, are rendered one
 * drawing at a time so that no member of the run is lost.
 *
 * @param {Array<object>} drawings - Drawings in paint order.
 * @param {number} pageWidth
 * @param {number} pageHeight
 * @param {Function} generateId - Called once per emitted element.
 * @returns {Array<object>} SVG elements carrying an internal `_order` sort key.
 */
function buildSvgElements(drawings, pageWidth, pageHeight, generateId) {
    const svgElements = [];
    // `shape` supplies geometry and markup; `source` supplies opacity and paint order.
    const pushSvg = (shape, source) => {
        svgElements.push({
            type: 'svg',
            id: generateId(),
            x: shape.x,
            y: shape.y,
            width: shape.width,
            height: shape.height,
            rotation: 0,
            opacity: source.opacity,
            src: svgToDataUri(shape.svg),
            name: '',
            _order: source.orderIndex,
        });
    };
    for (let idx = 0; idx < drawings.length; idx++) {
        const drawing = drawings[idx];
        // Skip fully transparent drawings (e.g. accessibility marker rectangles)
        if (drawing.opacity <= 0)
            continue;
        const run = [drawing];
        if (isMergeableClipRunDrawing(drawing)) {
            // Consume every following drawing that belongs to the same clip run.
            while (idx + 1 < drawings.length &&
                isMergeableClipRunDrawing(drawings[idx + 1]) &&
                isSameClipRun(drawing, drawings[idx + 1])) {
                run.push(drawings[idx + 1]);
                idx++;
            }
            if (run.length >= 4) {
                const merged = clippedDrawingsToSvg(run);
                if (merged) {
                    pushSvg(merged, drawing);
                    continue;
                }
            }
        }
        // Not merged: render each consumed drawing on its own. The loop index has
        // already moved past the run, so rendering only the first would drop the rest.
        for (const member of run) {
            const result = drawingToSvg(member, pageWidth, pageHeight);
            if (result) {
                pushSvg(result, member);
            }
        }
    }
    return svgElements;
}
|
|
181
|
+
/**
 * Tell whether a drawing's clip rectangle cuts noticeably into its bounds.
 *
 * @param {object} drawing - Has `rect` ([x0, y0, x1, y1]) and optionally `clipRect` in the same layout.
 * @returns {boolean} True when a clip rectangle exists and at least one edge of
 *   it lies more than 20 units inside the corresponding edge of `rect`.
 */
function hasMeaningfulClip(drawing) {
    const clip = drawing.clipRect;
    if (!clip) {
        return false;
    }
    const bounds = drawing.rect;
    // How far each clip edge sits inside the drawing's own edge.
    const largestInset = Math.max(
        clip[0] - bounds[0],
        clip[1] - bounds[1],
        bounds[2] - clip[2],
        bounds[3] - clip[3],
    );
    return largestInset > 20;
}
|
|
185
|
+
/**
 * Tell whether a drawing may take part in a merged clip run.
 *
 * A candidate is visible, has a clip path that meaningfully clips it, consists
 * of exactly four path items, and has neither a stroke nor a gradient.
 *
 * @param {object} drawing
 * @returns {boolean}
 */
function isMergeableClipRunDrawing(drawing) {
    if (!(drawing.opacity > 0)) {
        return false;
    }
    if (!drawing.clipPath) {
        return false;
    }
    if (!hasMeaningfulClip(drawing)) {
        return false;
    }
    if (drawing.items.length !== 4) {
        return false;
    }
    return !drawing.stroke && !drawing.gradient;
}
|
|
193
|
+
/**
 * Build a comparison key for a list of path items.
 *
 * Every own enumerable property is written as `key:value`; numbers are fixed
 * to three decimals so tiny float noise does not change the key. Properties
 * are joined with ',' and items with '|'.
 *
 * @param {Array<object>} items
 * @returns {string}
 */
function serializeDrawingItems(items) {
    const encodeField = ([key, value]) => {
        const text = typeof value === 'number' ? value.toFixed(3) : String(value);
        return `${key}:${text}`;
    };
    const encodeItem = (item) => Object.entries(item).map(encodeField).join(',');
    return items.map(encodeItem).join('|');
}
|
|
202
|
+
/**
 * Tell whether two drawings belong to the same clip run: identical fill
 * colour, opacity and fill rule, identical clip rectangle, and identical clip
 * path (compared through serializeDrawingItems).
 *
 * Drawings lacking a fill or a clip rectangle never match; this keeps the
 * comparison from throwing on unfilled drawings.
 *
 * @param {object} a
 * @param {object} b
 * @returns {boolean}
 */
function isSameClipRun(a, b) {
    const af = a.fill, bf = b.fill;
    const ac = a.clipRect, bc = b.clipRect;
    if (!af || !bf || !ac || !bc) {
        return false;
    }
    // Element-wise comparison of the small numeric tuples (cheaper than serialising).
    const sameFill = af[0] === bf[0] && af[1] === bf[1] && af[2] === bf[2];
    if (!sameFill || a.opacity !== b.opacity || a.evenOdd !== b.evenOdd) {
        return false;
    }
    const sameClipRect = ac[0] === bc[0] && ac[1] === bc[1] && ac[2] === bc[2] && ac[3] === bc[3];
    if (!sameClipRect) {
        return false;
    }
    return serializeDrawingItems(a.clipPath) === serializeDrawingItems(b.clipPath);
}
|
|
215
|
+
/**
 * Wait for a page-level object from `page.objs`, rejecting after 10 seconds or
 * when the callback delivers no data.
 */
function loadPageImageObject(page, name) {
    return new Promise((resolve, reject) => {
        const timer = setTimeout(() => reject(new Error('Timeout waiting for image data')), 10000);
        page.objs.get(name, (data) => {
            clearTimeout(timer);
            if (data)
                resolve(data);
            else
                reject(new Error('No image data'));
        });
    });
}
/**
 * Choose the `src` for an image. Preference order: original stream bytes from
 * `jpegIndex`, raw pixel data re-encoded through imageDataToDataUri, an
 * existing `src`, then a bitmap drawn onto a canvas. Returns `{ src }`, or
 * null when the image data offers none of these.
 */
function pickImageSource(imgData, jpegIndex) {
    const objNum = imgData.ref ? parseRef(imgData.ref) : null;
    // The index is optional: without it the other sources are still usable.
    const rawStream = objNum != null && jpegIndex ? jpegIndex.get(objNum) : undefined;
    if (rawStream) {
        return { src: imageBytesToDataUri(rawStream.data, rawStream.mimeType) };
    }
    if (imgData.data) {
        // Raw pixel data — re-encode
        return { src: imageDataToDataUri(imgData.data, imgData.width, imgData.height, imgData.kind || 3) };
    }
    if (imgData.src) {
        // Already a URL or data URI
        return { src: imgData.src };
    }
    if (imgData.bitmap) {
        const canvas = document.createElement('canvas');
        canvas.width = imgData.bitmap.width;
        canvas.height = imgData.bitmap.height;
        const ctx = canvas.getContext('2d');
        if (!ctx) {
            throw new Error('2D canvas context unavailable');
        }
        ctx.drawImage(imgData.bitmap, 0, 0);
        return { src: canvas.toDataURL('image/png') };
    }
    return null;
}
/**
 * Compute element position/size and relative crop for an image reference by
 * constraining it to its clip rectangle. Without a usable clip rectangle, or
 * when the visible area is not larger than the minimum image size, the
 * reference's own box and a full crop (0, 0, 1, 1) are returned.
 */
function computeImagePlacement(ref) {
    const placement = {
        x: ref.x,
        y: ref.y,
        width: ref.width,
        height: ref.height,
        cropX: 0,
        cropY: 0,
        cropWidth: 1,
        cropHeight: 1,
    };
    if (!(ref.clipRect && ref.width > 0 && ref.height > 0)) {
        return placement;
    }
    const [cx0, cy0, cx1, cy1] = ref.clipRect;
    const rotDeg = ref.rotation || 0;
    if (Math.abs(rotDeg) > 0.5) {
        // Rotated image: move the clip corners into image-local space
        // (origin at image top-left, x along top edge, y along left edge).
        const rotRad = (rotDeg * Math.PI) / 180;
        const cosNeg = Math.cos(-rotRad);
        const sinNeg = Math.sin(-rotRad);
        const localCorners = [
            [cx0, cy0],
            [cx1, cy0],
            [cx1, cy1],
            [cx0, cy1],
        ].map(([sx, sy]) => {
            const dx = sx - ref.x;
            const dy = sy - ref.y;
            return [dx * cosNeg - dy * sinNeg, dx * sinNeg + dy * cosNeg];
        });
        const xs = localCorners.map((c) => c[0]);
        const ys = localCorners.map((c) => c[1]);
        // Bounding box of the clip in local space, clamped to the image box.
        const visMinX = Math.max(Math.min(...xs), 0);
        const visMinY = Math.max(Math.min(...ys), 0);
        const visMaxX = Math.min(Math.max(...xs), ref.width);
        const visMaxY = Math.min(Math.max(...ys), ref.height);
        const visW = visMaxX - visMinX;
        const visH = visMaxY - visMinY;
        if (visW > MIN_IMAGE_WIDTH && visH > MIN_IMAGE_HEIGHT) {
            placement.cropX = visMinX / ref.width;
            placement.cropY = visMinY / ref.height;
            placement.cropWidth = visW / ref.width;
            placement.cropHeight = visH / ref.height;
            placement.width = visW;
            placement.height = visH;
            // Map the crop's top-left corner back to page space; the element is
            // rotated around its top-left corner.
            const cosPos = Math.cos(rotRad);
            const sinPos = Math.sin(rotRad);
            placement.x = ref.x + visMinX * cosPos - visMinY * sinPos;
            placement.y = ref.y + visMinX * sinPos + visMinY * cosPos;
        }
        return placement;
    }
    // Non-rotated: simple axis-aligned intersection
    const visX0 = Math.max(cx0, ref.x);
    const visY0 = Math.max(cy0, ref.y);
    const visX1 = Math.min(cx1, ref.x + ref.width);
    const visY1 = Math.min(cy1, ref.y + ref.height);
    const visW = visX1 - visX0;
    const visH = visY1 - visY0;
    if (visW > MIN_IMAGE_WIDTH && visH > MIN_IMAGE_HEIGHT) {
        placement.cropX = (visX0 - ref.x) / ref.width;
        placement.cropY = (visY0 - ref.y) / ref.height;
        placement.cropWidth = visW / ref.width;
        placement.cropHeight = visH / ref.height;
        placement.x = visX0;
        placement.y = visY0;
        placement.width = visW;
        placement.height = visH;
    }
    return placement;
}
/**
 * Build image elements for the image references of a page.
 *
 * References smaller than the minimum image size are ignored. Each remaining
 * image is loaded from `page.objs`, given a source (see pickImageSource),
 * cropped to its clip rectangle, and, for non-rectangular clip paths, given a
 * `clipSrc` SVG. A failure on one image is logged with console.warn and does
 * not affect the others.
 *
 * @param {object} page - pdf.js page proxy exposing `objs.get(name, callback)`.
 * @param {Array<object>} imageRefs - Image references in paint order.
 * @param {number} pageIdx - Used in the element name `image_<pageIdx>_<index>`.
 * @param {Function} generateId - Called once per emitted element.
 * @param {Map<number, {data: Uint8Array, mimeType: string}>} [jpegIndex] - Optional
 *   lookup from PDF object number to original image stream bytes.
 * @returns {Promise<Array<object>>} Image elements carrying an internal `_order` sort key.
 */
async function buildImageElements(page, imageRefs, pageIdx, generateId, jpegIndex) {
    const imageElements = [];
    for (let imgIdx = 0; imgIdx < imageRefs.length; imgIdx++) {
        const ref = imageRefs[imgIdx];
        if (ref.width < MIN_IMAGE_WIDTH || ref.height < MIN_IMAGE_HEIGHT)
            continue;
        try {
            const imgData = await loadPageImageObject(page, ref.name);
            const source = pickImageSource(imgData, jpegIndex);
            if (!source)
                continue;
            const placement = computeImagePlacement(ref);
            // Images are rendered with uniform scaling, while a PDF may scale the
            // two axes differently. Keep the horizontal crop and derive the vertical
            // one so that cropH/cropW = (elemH/elemW) × (naturalW/naturalH).
            const naturalW = imgData.width || 0;
            const naturalH = imgData.height || 0;
            if (naturalW > 0 &&
                naturalH > 0 &&
                placement.width > 0 &&
                placement.height > 0 &&
                placement.cropWidth > 0) {
                const requiredRatio = (placement.height / placement.width) * (naturalW / naturalH);
                placement.cropHeight = placement.cropWidth * requiredRatio;
            }
            let clipSrc = '';
            if (ref.clipPath) {
                // A single rectangle is already expressed by the crop values; only
                // other shapes need a clip SVG.
                const isSimpleRect = ref.clipPath.length === 1 && ref.clipPath[0].kind === 're';
                if (!isSimpleRect) {
                    const clipSvg = clipPathToSvg(ref.clipPath, placement.x, placement.y, placement.width, placement.height);
                    if (clipSvg) {
                        clipSrc = svgToDataUri(clipSvg);
                    }
                }
            }
            imageElements.push({
                type: 'image',
                id: generateId(),
                x: placement.x,
                y: placement.y,
                width: placement.width,
                height: placement.height,
                rotation: ref.rotation || 0,
                opacity: 1,
                visible: true,
                selectable: true,
                removable: true,
                src: source.src,
                cropX: placement.cropX,
                cropY: placement.cropY,
                cropWidth: placement.cropWidth,
                cropHeight: placement.cropHeight,
                clipSrc,
                name: `image_${pageIdx}_${imgIdx}`,
                _order: ref.orderIndex,
            });
        }
        catch (e) {
            console.warn(`Failed to extract image ${ref.name}:`, e);
        }
    }
    return imageElements;
}
|
|
386
|
+
/**
 * Resolve the fonts referenced on a page and build three lookups keyed by the
 * font reference (the loaded name, e.g. g_d0_f1):
 *   - fontNameMap:   reference → real PDF font name (e.g. CZZZZZ+Roboto-Regular)
 *   - fontAscentMap: reference → ascent, when the font object provides one
 *   - fontOtMap:     reference → font parsed by `fontRegistry.parseOpentype`
 *
 * Every named font is also reported through `fontRegistry.recordFont`.
 * Resolution is best effort: a font that delivers no data, fails, or does not
 * arrive within `timeoutMs` is skipped, so one unresolved font cannot stall
 * the page.
 *
 * @param {object} page - pdf.js page proxy exposing `commonObjs.get(ref, callback)`.
 * @param {Iterable<string>} fontRefs - Font references to resolve.
 * @param {object} fontRegistry - Provides `recordFont(fontObj)` and `parseOpentype(ref, data)`.
 * @param {number} [timeoutMs=10000] - Maximum wait per font, in milliseconds.
 * @returns {Promise<{fontNameMap: Map, fontAscentMap: Map, fontOtMap: Map}>}
 */
async function collectPageFonts(page, fontRefs, fontRegistry, timeoutMs = 10000) {
    const fontNameMap = new Map();
    const fontAscentMap = new Map();
    const fontOtMap = new Map();
    for (const ref of fontRefs) {
        try {
            const fontObj = await new Promise((resolve, reject) => {
                const timer = setTimeout(() => reject(new Error(`Timeout waiting for font ${ref}`)), timeoutMs);
                try {
                    page.commonObjs.get(ref, (data) => {
                        clearTimeout(timer);
                        if (data)
                            resolve(data);
                        else
                            reject(new Error(`No font data for ${ref}`));
                    });
                }
                catch (err) {
                    // A synchronous failure must not leave the timer pending.
                    clearTimeout(timer);
                    reject(err);
                }
            });
            if (!fontObj.name) {
                continue;
            }
            fontNameMap.set(ref, fontObj.name);
            if (fontObj.ascent != null) {
                fontAscentMap.set(ref, fontObj.ascent);
            }
            fontRegistry.recordFont(fontObj);
            // Parse the font binary for per-character width computation; the
            // registry is handed the reference so it can reuse earlier results.
            if (fontObj.data && fontObj.data.length > 0) {
                const otFont = fontRegistry.parseOpentype(ref, fontObj.data);
                if (otFont) {
                    fontOtMap.set(ref, otFont);
                }
            }
        }
        catch {
            // Deliberately best effort: skip fonts that cannot be resolved.
        }
    }
    return { fontNameMap, fontAscentMap, fontOtMap };
}
|
|
423
|
+
/**
|
|
424
|
+
* Compute letter spacing for a text block by comparing the PDF's actual span
|
|
425
|
+
* widths against the expected widths from the embedded font's character metrics
|
|
426
|
+
* (via opentype.js). Returns a ratio of fontSize (Polotno's letterSpacing unit).
|
|
427
|
+
*
|
|
428
|
+
* For each span we compute:
|
|
429
|
+
* pdfWidth = span.width (exact advance width from PDF)
|
|
430
|
+
* fontWidth = sum of opentype advanceWidth for each character, scaled to fontSize
|
|
431
|
+
* perCharDelta = (pdfWidth - fontWidth) / span.text.length
|
|
432
|
+
*
|
|
433
|
+
* We average across all spans in the block and express as ratio of fontSize.
|
|
434
|
+
*/
|
|
435
|
+
function computeBlockLetterSpacing(fullText, blockWidth, spans, fontSize, dominantFontName, fontOtByName) {
|
|
436
|
+
if (fontSize < 1)
|
|
437
|
+
return 0;
|
|
438
|
+
const isSingleLine = !fullText.includes('\n');
|
|
439
|
+
// For single-line text, compare the font's full rendering width against the
|
|
440
|
+
// PDF block width. For embedded (non-web) fonts the browser uses the same
|
|
441
|
+
// subset we parsed, so .notdef space width is correct. For known web fonts
|
|
442
|
+
// the browser uses the real font (not the subset), so .notdef space width
|
|
443
|
+
// is wrong — bail out and use the per-span path instead.
|
|
444
|
+
if (isSingleLine) {
|
|
445
|
+
const otFont = fontOtByName.get(dominantFontName);
|
|
446
|
+
if (otFont) {
|
|
447
|
+
const fontIsEmbedded = !isKnownWebFont(dominantFontName);
|
|
448
|
+
const scale = fontSize / otFont.unitsPerEm;
|
|
449
|
+
const notdefGlyph = otFont.glyphs.get(0);
|
|
450
|
+
let fontWidth = 0;
|
|
451
|
+
let charCount = 0;
|
|
452
|
+
let valid = true;
|
|
453
|
+
for (const ch of fullText) {
|
|
454
|
+
const glyph = otFont.charToGlyph(ch);
|
|
455
|
+
if (glyph === notdefGlyph || !glyph.advanceWidth) {
|
|
456
|
+
// For embedded fonts, .notdef space is what the browser will render.
|
|
457
|
+
// For web fonts, .notdef space width is wrong — bail out.
|
|
458
|
+
if (ch === ' ' && fontIsEmbedded) {
|
|
459
|
+
fontWidth += (glyph.advanceWidth || 0) * scale;
|
|
460
|
+
charCount++;
|
|
461
|
+
continue;
|
|
462
|
+
}
|
|
463
|
+
valid = false;
|
|
464
|
+
break;
|
|
465
|
+
}
|
|
466
|
+
fontWidth += glyph.advanceWidth * scale;
|
|
467
|
+
charCount++;
|
|
468
|
+
}
|
|
469
|
+
if (valid && fontWidth > 1 && charCount >= 2) {
|
|
470
|
+
const perCharRatio = (blockWidth - fontWidth) / charCount / fontSize;
|
|
471
|
+
const rounded = Math.round(perCharRatio * 1000) / 1000;
|
|
472
|
+
// If the value is extreme (>10% of fontSize), this likely indicates
|
|
473
|
+
// merged labels with a large spatial gap, not real letter spacing.
|
|
474
|
+
// Fall through to the safer per-span computation.
|
|
475
|
+
if (Math.abs(rounded) < 0.1) {
|
|
476
|
+
const totalPxImpact = Math.abs(rounded) * fontSize * charCount;
|
|
477
|
+
if (totalPxImpact >= 1)
|
|
478
|
+
return rounded;
|
|
479
|
+
return 0;
|
|
480
|
+
}
|
|
481
|
+
}
|
|
482
|
+
}
|
|
483
|
+
}
|
|
484
|
+
// For multi-line text (or when single-line full-text fails), use per-span
|
|
485
|
+
// comparison. This is safer because it avoids the unpredictable .notdef
|
|
486
|
+
// space width that varies wildly between subset fonts.
|
|
487
|
+
let totalDelta = 0;
|
|
488
|
+
let totalChars = 0;
|
|
489
|
+
for (const span of spans) {
|
|
490
|
+
if (span.text.length < 2)
|
|
491
|
+
continue;
|
|
492
|
+
const otFont = fontOtByName.get(span.fontName);
|
|
493
|
+
if (!otFont)
|
|
494
|
+
continue;
|
|
495
|
+
const scale = span.fontSize / otFont.unitsPerEm;
|
|
496
|
+
const notdefGlyph = otFont.glyphs.get(0);
|
|
497
|
+
let fontWidth = 0;
|
|
498
|
+
let validChars = 0;
|
|
499
|
+
let hasInvalidGlyph = false;
|
|
500
|
+
for (const ch of span.text) {
|
|
501
|
+
const glyph = otFont.charToGlyph(ch);
|
|
502
|
+
if (glyph === notdefGlyph || !glyph.advanceWidth) {
|
|
503
|
+
hasInvalidGlyph = true;
|
|
504
|
+
break;
|
|
505
|
+
}
|
|
506
|
+
fontWidth += glyph.advanceWidth * scale;
|
|
507
|
+
validChars++;
|
|
508
|
+
}
|
|
509
|
+
if (hasInvalidGlyph || fontWidth < 1)
|
|
510
|
+
continue;
|
|
511
|
+
const delta = span.width - fontWidth;
|
|
512
|
+
totalDelta += delta;
|
|
513
|
+
totalChars += validChars;
|
|
514
|
+
}
|
|
515
|
+
if (totalChars < 2)
|
|
516
|
+
return 0;
|
|
517
|
+
const perCharRatio = totalDelta / totalChars / fontSize;
|
|
518
|
+
const rounded = Math.round(perCharRatio * 1000) / 1000;
|
|
519
|
+
// Threshold: ignore if the total pixel impact is < 1px.
|
|
520
|
+
// At small fonts (12pt, 10 chars) 0.005 ratio = 0.6px total → skip.
|
|
521
|
+
// At large fonts (615pt, 9 chars) 0.003 ratio = 16px total → keep.
|
|
522
|
+
const totalPxImpact = Math.abs(rounded) * fontSize * totalChars;
|
|
523
|
+
return totalPxImpact < 1 ? 0 : rounded;
|
|
524
|
+
}
|
|
525
|
+
/**
 * Build text elements for a page.
 *
 * Text items are turned into spans (groupTextItems), spans into blocks
 * (groupSpansByBlock), and every sufficiently large, non-empty block into one
 * text element. Font, size, colour, weight, style and rotation are taken from
 * the block's dominant span (the one with the longest text; on a tie the later
 * span wins).
 *
 * @param {object} options
 * @param {object} options.page - pdf.js page proxy exposing `getTextContent()`.
 * @param {number} options.pageWidth
 * @param {number} options.yFlipOffset - Forwarded to groupTextItems.
 * @param {object} options.positionColors - Forwarded to groupTextItems.
 * @param {Map} options.fontNameMap - Loaded font name → real PDF font name.
 * @param {Map} options.fontAscentMap - Forwarded to groupTextItems.
 * @param {Map} options.fontOtMap - Loaded font name → parsed opentype font.
 * @param {Function} options.generateId - Called once per emitted element.
 * @returns {Promise<Array<object>>} Text elements carrying an internal `_order` sort key.
 */
async function buildTextElements({ page, pageWidth, yFlipOffset, positionColors, fontNameMap, fontAscentMap, fontOtMap, generateId, }) {
    const rawText = await page.getTextContent();
    const pageSpans = groupTextItems(rawText.items, rawText.styles || {}, yFlipOffset, positionColors, fontNameMap, fontAscentMap);
    // Spans carry real PDF font names, so re-key the parsed fonts accordingly.
    const fontOtByName = new Map();
    fontOtMap.forEach((font, loadedName) => {
        fontOtByName.set(fontNameMap.get(loadedName) || loadedName, font);
    });
    const blocks = groupSpansByBlock(pageSpans);
    // Page margins feed alignment detection.
    const [leftMargin, rightMargin] = estimatePageMargins(pageSpans);
    const textElements = [];
    for (const block of blocks) {
        const blockSpans = block.spans;
        if (blockSpans.length === 0 ||
            block.width < MIN_TEXT_WIDTH ||
            block.height < MIN_TEXT_HEIGHT) {
            continue;
        }
        // Dominant span: longest text, later span preferred on equal length.
        let dominant = blockSpans[0];
        for (let k = 1; k < blockSpans.length; k++) {
            if (!(dominant.text.length > blockSpans[k].text.length)) {
                dominant = blockSpans[k];
            }
        }
        const fontFamily = mapPdfFont(dominant.fontName);
        const align = detectAlignment(blockSpans, pageWidth, leftMargin, rightMargin);
        const lineHeight = computeLineHeight(blockSpans);
        // Bucket spans by line number, keeping first-seen line order in the map.
        const spansByLine = new Map();
        for (const span of blockSpans) {
            const bucket = spansByLine.get(span.lineNo);
            if (bucket) {
                bucket.push(span);
            }
            else {
                spansByLine.set(span.lineNo, [span]);
            }
        }
        const lineNumbers = [...spansByLine.keys()].sort((a, b) => a - b);
        // Per line: text (with a space wherever neighbouring spans are visibly
        // apart) and the width from the first span's start to the last span's end.
        const lines = lineNumbers.map((lineNo) => {
            const row = spansByLine.get(lineNo);
            row.sort((a, b) => a.x - b.x);
            const fragments = [];
            row.forEach((span, j) => {
                if (j > 0) {
                    const previous = row[j - 1];
                    const gap = span.x - (previous.x + previous.width);
                    if (gap > Math.min(span.fontSize * 0.15, 5)) {
                        fragments.push(' ');
                    }
                }
                fragments.push(span.text);
            });
            const last = row[row.length - 1];
            return { text: fragments.join(''), width: last.x + last.width - row[0].x };
        });
        // Justified text: join wrapped lines with a space so they can re-flow, and
        // break only after a line clearly shorter than the block (a paragraph end).
        // Any other alignment keeps every line break.
        const pieces = [];
        lines.forEach((line, i) => {
            if (i > 0) {
                let separator = '\n';
                if (align === 'justify' && !(lines[i - 1].width < block.width * 0.85)) {
                    separator = ' ';
                }
                pieces.push(separator);
            }
            pieces.push(line.text);
        });
        const blockText = pieces.join('').trim();
        if (!blockText) {
            continue;
        }
        // Extra width: none for justified text (it fills the element width);
        // otherwise less for multi-line text than for single-line text.
        let extraWidth;
        if (align === 'justify') {
            extraWidth = 0;
        }
        else if (blockText.includes('\n')) {
            extraWidth = block.width * 0.05 + 10;
        }
        else {
            extraWidth = block.width * 0.15 + 10;
        }
        const elemWidth = block.width + extraWidth;
        // Half-leading: line-height space beyond the font's content area is split
        // above and below it. The content area is (ascender - descender) / unitsPerEm
        // of the font size when the font is known, else the font size itself.
        const dominantFont = fontOtByName.get(dominant.fontName);
        let contentAreaRatio = 1.0;
        if (dominantFont) {
            contentAreaRatio = (dominantFont.ascender - dominantFont.descender) / dominantFont.unitsPerEm;
        }
        // Negative when the content area exceeds the line height; subtracting it
        // then moves the element down.
        const halfLeading = (lineHeight - contentAreaRatio) * dominant.fontSize / 2;
        const elemHeight = block.height + Math.abs(halfLeading) * 2 + 5;
        // Shift x so the added width does not move the visible text.
        let elemX = block.x;
        if (align === 'center') {
            elemX -= extraWidth / 2;
        }
        else if (align === 'right') {
            elemX -= extraWidth;
        }
        let elemY = block.y - halfLeading;
        // Elements rotate around their top-left corner, so vertical text is
        // anchored on the baseline to keep the rotated box in place.
        const rotation = dominant.rotation;
        if (rotation <= -45 && rotation >= -135) {
            elemX -= elemHeight;
            elemY = dominant.baselineY;
        }
        else if (rotation >= 45 && rotation <= 135) {
            elemY = dominant.baselineY - elemWidth;
        }
        // The earliest-painted span decides the block's z-order.
        const blockOrder = Math.min(...blockSpans.map((s) => s.orderIndex));
        const letterSpacing = computeBlockLetterSpacing(blockText, block.width, blockSpans, dominant.fontSize, dominant.fontName, fontOtByName);
        textElements.push({
            type: 'text',
            id: generateId(),
            x: elemX,
            y: elemY,
            width: elemWidth,
            height: elemHeight,
            rotation: dominant.rotation || 0,
            opacity: 1,
            visible: true,
            selectable: true,
            removable: true,
            text: blockText,
            fontSize: dominant.fontSize,
            fontFamily,
            fontWeight: dominant.fontWeight || extractWeightFromName(dominant.fontName),
            fontStyle: dominant.fontStyle || extractStyleFromName(dominant.fontName),
            fill: dominant.color || '#000000',
            align,
            lineHeight,
            letterSpacing,
            name: '',
            placeholder: '',
            _order: blockOrder,
        });
    }
    return textElements;
}
|
|
674
|
+
//# sourceMappingURL=page-parser.js.map
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
/**
 * An image stream taken from a PDF file together with its MIME type.
 */
export interface RawImageStream {
    /** Bytes of the image stream. */
    data: Uint8Array;
    /** MIME type describing `data` (presumably "image/jpeg" for indexed JPEG streams — confirm against the implementation). */
    mimeType: string;
}
|
|
5
|
+
/**
 * Index the JPEG image streams of a PDF by object number.
 *
 * Only image XObjects using the DCTDecode (JPEG) filter are indexed.
 *
 * @param pdfBytes - Bytes of the PDF file.
 * @returns Map from PDF object number to that object's raw image stream.
 */
export declare function buildJpegIndex(pdfBytes: Uint8Array): Map<number, RawImageStream>;
|
|
10
|
+
/**
 * Extract the object number from a pdfjs reference string such as "44R".
 *
 * @param ref - pdfjs reference string.
 * @returns The object number, or `null` (presumably when `ref` is not a
 *   reference string — confirm against the implementation).
 */
export declare function parseRef(ref: string): number | null;
|
|
14
|
+
//# sourceMappingURL=pdf-image-extractor.d.ts.map
|