@polotno/pdf-import 0.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,674 @@
1
+ import { extractTextPositionColors, extractDrawingsAndImages, } from './operator-list.js';
2
+ import { groupTextItems, groupSpansByBlock, detectAlignment, estimatePageMargins, computeLineHeight, } from './text-grouper.js';
3
+ import { mapPdfFont, isKnownWebFont, extractWeightFromName, extractStyleFromName, } from './font-mapper.js';
4
+ import { drawingToSvg, svgToDataUri, clippedDrawingsToSvg, clipPathToSvg, } from './svg-builder.js';
5
+ import { imageDataToDataUri } from './image-encoder.js';
6
+ import { rgbTupleToHex } from './color-utils.js';
7
+ import { MIN_TEXT_WIDTH, MIN_TEXT_HEIGHT, MIN_IMAGE_WIDTH, MIN_IMAGE_HEIGHT, } from './constants.js';
8
+ import { parseRef } from './pdf-image-extractor.js';
9
+ import { imageBytesToDataUri } from './image-encoder.js';
10
/**
 * Turn one pdf.js page into a page description made of SVG, image and text
 * elements listed in PDF paint order.
 *
 * @param options.page - pdf.js page proxy; `getViewport` and `getOperatorList`
 *   are called here and the object is handed on to the element builders.
 * @param options.pageIdx - Page index, forwarded to `buildImageElements`.
 * @param options.fontRegistry - Forwarded to `collectPageFonts`.
 * @param options.generateId - Id factory; called here once for the page id and
 *   passed to every element builder.
 * @param options.jpegIndex - Forwarded to `buildImageElements`.
 * @returns `{ parsedPage: { id, children, background }, pageWidth, pageHeight }`;
 *   the children no longer carry the internal `_order` field.
 */
export async function parsePage({ page, pageIdx, fontRegistry, generateId, jpegIndex, }) {
    const { width: pageWidth, height: pageHeight, transform } = page.getViewport({ scale: 1 });
    // The transform's y-offset accounts for the CropBox position within the
    // MediaBox, so it (rather than pageHeight) is the reference for y-flipping.
    const yFlipOffset = transform[5];
    // One operator list feeds text colors, font refs, drawings and image refs.
    const operatorList = await page.getOperatorList();
    const positionColors = extractTextPositionColors(operatorList, yFlipOffset);
    const fontRefs = positionColors.fontRefs || new Set();
    const extracted = extractDrawingsAndImages(operatorList, yFlipOffset);
    const drawings = extracted.drawings;
    const imageRefs = extracted.imageRefs;
    // These three tasks do not depend on each other, so they run concurrently.
    const gradientsTask = resolveDrawingGradients(page, drawings);
    const imagesTask = buildImageElements(page, imageRefs, pageIdx, generateId, jpegIndex);
    const fontsTask = collectPageFonts(page, fontRefs, fontRegistry);
    const settled = await Promise.all([gradientsTask, imagesTask, fontsTask]);
    const imageElements = settled[1];
    const { fontNameMap, fontAscentMap, fontOtMap } = settled[2];
    const background = detectPageBackground(drawings, pageWidth, pageHeight);
    const svgElements = buildSvgElements(drawings, pageWidth, pageHeight, generateId);
    const textElements = await buildTextElements({
        page,
        pageWidth,
        yFlipOffset,
        positionColors,
        fontNameMap,
        fontAscentMap,
        fontOtMap,
        generateId,
    });
    // Merge every element kind and order the result by PDF paint order.
    const ordered = svgElements.concat(imageElements, textElements);
    ordered.sort((first, second) => first._order - second._order);
    // `_order` is internal bookkeeping and is stripped from the output.
    const children = ordered.map((element) => {
        const { _order, ...publicFields } = element;
        return publicFields;
    });
    return {
        parsedPage: {
            id: generateId(),
            children,
            background,
        },
        pageWidth,
        pageHeight,
    };
}
55
+ async function resolveDrawingGradients(page, drawings) {
56
+ for (const drawing of drawings) {
57
+ if (drawing._shadingNames && drawing._shadingNames.length > 0) {
58
+ // Resolve the first (primary) shading — this is the color gradient
59
+ const shadingName = drawing._shadingNames[0];
60
+ try {
61
+ const shading = await new Promise((resolve, reject) => {
62
+ const timeout = setTimeout(() => reject(new Error('timeout')), 3000);
63
+ page.objs.get(shadingName, (data) => {
64
+ clearTimeout(timeout);
65
+ if (data)
66
+ resolve(data);
67
+ else
68
+ reject(new Error('no data'));
69
+ });
70
+ });
71
+ // shading[0] = "RadialAxial", [1] = type, [3] = stops, [4] = start, [5] = end
72
+ // For radial: [6] = startRadius, [7] = endRadius
73
+ if (shading && shading[1]) {
74
+ const type = shading[1] === 'radial' ? 'radial' : 'linear';
75
+ const stops = [];
76
+ if (Array.isArray(shading[3])) {
77
+ for (const [offset, color] of shading[3]) {
78
+ stops.push({ offset, color });
79
+ }
80
+ }
81
+ if (type === 'linear') {
82
+ drawing.gradient = {
83
+ type: 'linear',
84
+ stops,
85
+ x1: shading[4]?.[0] ?? 0,
86
+ y1: shading[4]?.[1] ?? 0,
87
+ x2: shading[5]?.[0] ?? 0,
88
+ y2: shading[5]?.[1] ?? 0,
89
+ };
90
+ }
91
+ else {
92
+ drawing.gradient = {
93
+ type: 'radial',
94
+ stops,
95
+ cx: shading[4]?.[0] ?? 0,
96
+ cy: shading[4]?.[1] ?? 0,
97
+ r0: shading[6] ?? 0,
98
+ r1: shading[7] ?? 0,
99
+ };
100
+ }
101
+ // Mark drawing as having a gradient fill
102
+ if (!drawing.fill) {
103
+ drawing.fill = [0, 0, 0]; // placeholder so it's not skipped
104
+ }
105
+ }
106
+ }
107
+ catch {
108
+ // Couldn't resolve shading — leave as-is
109
+ }
110
+ }
111
+ }
112
+ }
113
+ function detectPageBackground(drawings, pageWidth, pageHeight) {
114
+ let pageBackground = '#FFFFFF';
115
+ for (const drawing of drawings) {
116
+ if (drawing.fill !== null) {
117
+ const dw = drawing.rect[2] - drawing.rect[0];
118
+ const dh = drawing.rect[3] - drawing.rect[1];
119
+ if (dw >= pageWidth * 0.9 && dh >= pageHeight * 0.9) {
120
+ const [r, g, b] = drawing.fill;
121
+ pageBackground = rgbTupleToHex(r, g, b);
122
+ }
123
+ }
124
+ }
125
+ return pageBackground;
126
+ }
127
+ function buildSvgElements(drawings, pageWidth, pageHeight, generateId) {
128
+ const svgElements = [];
129
+ for (let idx = 0; idx < drawings.length; idx++) {
130
+ const drawing = drawings[idx];
131
+ // Skip fully transparent drawings (e.g. accessibility marker rectangles)
132
+ if (drawing.opacity <= 0)
133
+ continue;
134
+ if (isMergeableClipRunDrawing(drawing)) {
135
+ const run = [drawing];
136
+ while (idx + 1 < drawings.length &&
137
+ isMergeableClipRunDrawing(drawings[idx + 1]) &&
138
+ isSameClipRun(run[0], drawings[idx + 1])) {
139
+ run.push(drawings[idx + 1]);
140
+ idx++;
141
+ }
142
+ if (run.length >= 4) {
143
+ const merged = clippedDrawingsToSvg(run);
144
+ if (merged) {
145
+ svgElements.push({
146
+ type: 'svg',
147
+ id: generateId(),
148
+ x: merged.x,
149
+ y: merged.y,
150
+ width: merged.width,
151
+ height: merged.height,
152
+ rotation: 0,
153
+ opacity: run[0].opacity,
154
+ src: svgToDataUri(merged.svg),
155
+ name: '',
156
+ _order: run[0].orderIndex,
157
+ });
158
+ continue;
159
+ }
160
+ }
161
+ }
162
+ const result = drawingToSvg(drawing, pageWidth, pageHeight);
163
+ if (result) {
164
+ svgElements.push({
165
+ type: 'svg',
166
+ id: generateId(),
167
+ x: result.x,
168
+ y: result.y,
169
+ width: result.width,
170
+ height: result.height,
171
+ rotation: 0,
172
+ opacity: drawing.opacity,
173
+ src: svgToDataUri(result.svg),
174
+ name: '',
175
+ _order: drawing.orderIndex,
176
+ });
177
+ }
178
+ }
179
+ return svgElements;
180
+ }
181
/**
 * Tell whether a drawing's clip rectangle cuts noticeably into its own rect:
 * true when `clipRect` exists and, on at least one side, the drawing's rect
 * extends more than 20 units beyond the clip rect.
 */
function hasMeaningfulClip(drawing) {
    const clip = drawing.clipRect;
    if (!clip) {
        return false;
    }
    const rect = drawing.rect;
    // How far the rect sticks out past the clip on each of the four sides.
    const overhangs = [
        clip[0] - rect[0],
        clip[1] - rect[1],
        rect[2] - clip[2],
        rect[3] - clip[3],
    ];
    return Math.max(...overhangs) > 20;
}
185
/**
 * Decide whether a drawing may take part in a merged clip run. It must be
 * visible (opacity above 0), carry a clip path with a meaningful clip, consist
 * of exactly four path items, and have neither a stroke nor a gradient.
 */
function isMergeableClipRunDrawing(drawing) {
    if (!(drawing.opacity > 0)) {
        return false;
    }
    if (!drawing.clipPath) {
        return false;
    }
    if (!hasMeaningfulClip(drawing)) {
        return false;
    }
    if (drawing.items.length !== 4) {
        return false;
    }
    return !drawing.stroke && !drawing.gradient;
}
193
/**
 * Build a comparison string for a list of path items. Each item becomes
 * `key:value` pairs joined by ','; items are joined by '|'. Numbers are
 * written with three decimals, every other value goes through `String()`.
 */
function serializeDrawingItems(items) {
    const formatField = ([key, value]) => {
        const text = typeof value === 'number' ? value.toFixed(3) : String(value);
        return `${key}:${text}`;
    };
    const formatItem = (item) => Object.entries(item).map(formatField).join(',');
    return items.map(formatItem).join('|');
}
202
+ function isSameClipRun(a, b) {
203
+ // Direct element comparison instead of JSON.stringify for numeric arrays
204
+ const af = a.fill, bf = b.fill;
205
+ const sameFill = af[0] === bf[0] && af[1] === bf[1] && af[2] === bf[2] &&
206
+ a.opacity === b.opacity &&
207
+ a.evenOdd === b.evenOdd;
208
+ if (!sameFill)
209
+ return false;
210
+ const ac = a.clipRect, bc = b.clipRect;
211
+ const sameClipRect = ac[0] === bc[0] && ac[1] === bc[1] && ac[2] === bc[2] && ac[3] === bc[3];
212
+ return (sameClipRect &&
213
+ serializeDrawingItems(a.clipPath) === serializeDrawingItems(b.clipPath));
214
+ }
215
/**
 * Wait for a page-level object delivered through the callback-style
 * `page.objs.get`, rejecting after `timeoutMs` or when no data arrives.
 */
function loadPageImageObject(page, name, timeoutMs) {
    return new Promise((resolve, reject) => {
        const timer = setTimeout(() => reject(new Error('Timeout waiting for image data')), timeoutMs);
        try {
            page.objs.get(name, (data) => {
                clearTimeout(timer);
                if (data)
                    resolve(data);
                else
                    reject(new Error('No image data'));
            });
        }
        catch (err) {
            // A synchronous failure must not leave the timer pending.
            clearTimeout(timer);
            reject(err);
        }
    });
}
/**
 * Produce the `src` for an image object, trying in order: the original stream
 * bytes found in `jpegIndex`, raw pixel data, an existing `src`, a bitmap
 * drawn onto a canvas. Returns `{ src }`, or null when none is available.
 */
function resolveImageSource(imgData, jpegIndex) {
    // Try to use original JPEG bytes from the PDF stream
    const objNum = imgData.ref ? parseRef(imgData.ref) : null;
    // `jpegIndex` may be absent; in that case fall through to the other sources.
    const rawStream = objNum != null ? jpegIndex?.get(objNum) : undefined;
    if (rawStream) {
        return { src: imageBytesToDataUri(rawStream.data, rawStream.mimeType) };
    }
    if (imgData.data) {
        // Raw pixel data — re-encode as PNG
        return { src: imageDataToDataUri(imgData.data, imgData.width, imgData.height, imgData.kind || 3) };
    }
    if (imgData.src) {
        // Already a URL or data URI
        return { src: imgData.src };
    }
    if (imgData.bitmap) {
        const canvas = document.createElement('canvas');
        canvas.width = imgData.bitmap.width;
        canvas.height = imgData.bitmap.height;
        const ctx = canvas.getContext('2d');
        if (!ctx) {
            throw new Error('2D canvas context unavailable');
        }
        ctx.drawImage(imgData.bitmap, 0, 0);
        return { src: canvas.toDataURL('image/png') };
    }
    return null;
}
/**
 * Compute element position/size and the relative crop window for an image
 * reference, constraining the element to the visible part of its clip rect.
 * Returns `{ x, y, width, height, cropX, cropY, cropWidth, cropHeight }`.
 */
function computeImagePlacement(ref, imgData) {
    const placement = {
        x: ref.x,
        y: ref.y,
        width: ref.width,
        height: ref.height,
        cropX: 0,
        cropY: 0,
        cropWidth: 1,
        cropHeight: 1,
    };
    const rotDeg = ref.rotation || 0;
    const hasRotation = Math.abs(rotDeg) > 0.5;
    if (ref.clipRect && ref.width > 0 && ref.height > 0) {
        const [cx0, cy0, cx1, cy1] = ref.clipRect;
        if (hasRotation) {
            // Rotated image: express the clip rect in image-local space
            // (origin at image top-left, x along top edge, y along left edge).
            const rotRad = (rotDeg * Math.PI) / 180;
            const cosNeg = Math.cos(-rotRad);
            const sinNeg = Math.sin(-rotRad);
            const localCorners = [
                [cx0, cy0],
                [cx1, cy0],
                [cx1, cy1],
                [cx0, cy1],
            ].map(([sx, sy]) => {
                const dx = sx - ref.x;
                const dy = sy - ref.y;
                return [dx * cosNeg - dy * sinNeg, dx * sinNeg + dy * cosNeg];
            });
            const localXs = localCorners.map((c) => c[0]);
            const localYs = localCorners.map((c) => c[1]);
            // AABB of the clip in local space, clamped to [0, width] × [0, height]
            const visMinX = Math.max(Math.min(...localXs), 0);
            const visMinY = Math.max(Math.min(...localYs), 0);
            const visMaxX = Math.min(Math.max(...localXs), ref.width);
            const visMaxY = Math.min(Math.max(...localYs), ref.height);
            const visW = visMaxX - visMinX;
            const visH = visMaxY - visMinY;
            if (visW > MIN_IMAGE_WIDTH && visH > MIN_IMAGE_HEIGHT) {
                placement.cropX = visMinX / ref.width;
                placement.cropY = visMinY / ref.height;
                placement.cropWidth = visW / ref.width;
                placement.cropHeight = visH / ref.height;
                placement.width = visW;
                placement.height = visH;
                // Transform the crop's top-left from local space back to screen
                // space (polotno rotates around the top-left corner).
                const cosPos = Math.cos(rotRad);
                const sinPos = Math.sin(rotRad);
                placement.x = ref.x + visMinX * cosPos - visMinY * sinPos;
                placement.y = ref.y + visMinX * sinPos + visMinY * cosPos;
            }
        }
        else {
            // Non-rotated: simple axis-aligned intersection
            const visX0 = Math.max(cx0, ref.x);
            const visY0 = Math.max(cy0, ref.y);
            const visX1 = Math.min(cx1, ref.x + ref.width);
            const visY1 = Math.min(cy1, ref.y + ref.height);
            const visW = visX1 - visX0;
            const visH = visY1 - visY0;
            if (visW > MIN_IMAGE_WIDTH && visH > MIN_IMAGE_HEIGHT) {
                placement.cropX = (visX0 - ref.x) / ref.width;
                placement.cropY = (visY0 - ref.y) / ref.height;
                placement.cropWidth = visW / ref.width;
                placement.cropHeight = visH / ref.height;
                placement.x = visX0;
                placement.y = visY0;
                placement.width = visW;
                placement.height = visH;
            }
        }
    }
    // Polotno renders images with uniform scaling (natural aspect ratio), while
    // PDFs may scale x and y differently. Adjust cropHeight so that
    // cropH/cropW = (elemH/elemW) × (naturalW/naturalH): the horizontal extent
    // stays correct and the vertical one follows the uniform-scale model.
    const naturalW = imgData.width || 0;
    const naturalH = imgData.height || 0;
    if (naturalW > 0 &&
        naturalH > 0 &&
        placement.width > 0 &&
        placement.height > 0 &&
        placement.cropWidth > 0) {
        const requiredRatio = (placement.height / placement.width) * (naturalW / naturalH);
        placement.cropHeight = placement.cropWidth * requiredRatio;
    }
    return placement;
}
/**
 * Build `image` elements for the image references found on a page.
 *
 * References smaller than the minimum image size are ignored. All remaining
 * image objects are requested from the page at once (each with a 10 s
 * timeout), then converted in reference order so ids and element order stay
 * deterministic. A failing image is logged with `console.warn` and skipped.
 *
 * @param page - Object exposing the callback-style `objs.get(name, cb)` lookup.
 * @param imageRefs - Image references (position, size, rotation, clip info).
 * @param pageIdx - Page index used in the element name.
 * @param generateId - Id factory, called once per emitted element.
 * @param jpegIndex - Optional lookup (object number → raw stream) used to
 *   reuse original image bytes; when missing, images are re-encoded instead.
 * @returns The emitted elements, each tagged with `_order` for later sorting.
 */
async function buildImageElements(page, imageRefs, pageIdx, generateId, jpegIndex) {
    // Start every lookup up front so slow or missing objects wait concurrently.
    const lookups = imageRefs.map((ref) => {
        if (ref.width < MIN_IMAGE_WIDTH || ref.height < MIN_IMAGE_HEIGHT) {
            return null;
        }
        return loadPageImageObject(page, ref.name, 10000).then((data) => ({ ok: true, data }), (error) => ({ ok: false, error }));
    });
    const imageElements = [];
    for (let imgIdx = 0; imgIdx < imageRefs.length; imgIdx++) {
        const lookup = lookups[imgIdx];
        if (!lookup)
            continue;
        const ref = imageRefs[imgIdx];
        try {
            const outcome = await lookup;
            if (!outcome.ok) {
                throw outcome.error;
            }
            const imgData = outcome.data;
            const source = resolveImageSource(imgData, jpegIndex);
            if (!source)
                continue;
            const placement = computeImagePlacement(ref, imgData);
            let clipSrc = '';
            if (ref.clipPath) {
                // A single rectangle is already expressed by the crop values; only
                // non-rectangular shapes (circles, curves, etc.) need a clip SVG.
                const isSimpleRect = ref.clipPath.length === 1 && ref.clipPath[0].kind === 're';
                if (!isSimpleRect) {
                    const clipSvg = clipPathToSvg(ref.clipPath, placement.x, placement.y, placement.width, placement.height);
                    if (clipSvg) {
                        clipSrc = svgToDataUri(clipSvg);
                    }
                }
            }
            imageElements.push({
                type: 'image',
                id: generateId(),
                x: placement.x,
                y: placement.y,
                width: placement.width,
                height: placement.height,
                rotation: ref.rotation || 0,
                opacity: 1,
                visible: true,
                selectable: true,
                removable: true,
                src: source.src,
                cropX: placement.cropX,
                cropY: placement.cropY,
                cropWidth: placement.cropWidth,
                cropHeight: placement.cropHeight,
                clipSrc,
                name: `image_${pageIdx}_${imgIdx}`,
                _order: ref.orderIndex,
            });
        }
        catch (e) {
            console.warn(`Failed to extract image ${ref.name}:`, e);
        }
    }
    return imageElements;
}
386
+ async function collectPageFonts(page, fontRefs, fontRegistry) {
387
+ // Build font name map: loadedName (g_d0_f1) → real PDF name (CZZZZZ+Roboto-Regular)
388
+ // Also build font ascent map for accurate Y positioning (glyph top vs baseline)
389
+ const fontNameMap = new Map();
390
+ const fontAscentMap = new Map();
391
+ // Map loadedName → parsed opentype.js Font for letter spacing computation
392
+ const fontOtMap = new Map();
393
+ for (const ref of fontRefs) {
394
+ try {
395
+ const fontObj = await new Promise((resolve, reject) => {
396
+ page.commonObjs.get(ref, (data) => {
397
+ if (data)
398
+ resolve(data);
399
+ else
400
+ reject();
401
+ });
402
+ });
403
+ if (fontObj.name) {
404
+ fontNameMap.set(ref, fontObj.name);
405
+ if (fontObj.ascent != null) {
406
+ fontAscentMap.set(ref, fontObj.ascent);
407
+ }
408
+ fontRegistry.recordFont(fontObj);
409
+ // Parse font binary with opentype.js for per-character width computation
410
+ // Uses FontRegistry cache to avoid re-parsing the same font across pages
411
+ if (fontObj.data && fontObj.data.length > 0) {
412
+ const otFont = fontRegistry.parseOpentype(ref, fontObj.data);
413
+ if (otFont) {
414
+ fontOtMap.set(ref, otFont);
415
+ }
416
+ }
417
+ }
418
+ }
419
+ catch { }
420
+ }
421
+ return { fontNameMap, fontAscentMap, fontOtMap };
422
+ }
423
+ /**
424
+ * Compute letter spacing for a text block by comparing the PDF's actual span
425
+ * widths against the expected widths from the embedded font's character metrics
426
+ * (via opentype.js). Returns a ratio of fontSize (Polotno's letterSpacing unit).
427
+ *
428
+ * For each span we compute:
429
+ * pdfWidth = span.width (exact advance width from PDF)
430
+ * fontWidth = sum of opentype advanceWidth for each character, scaled to fontSize
431
+ * perCharDelta = (pdfWidth - fontWidth) / span.text.length
432
+ *
433
+ * We average across all spans in the block and express as ratio of fontSize.
434
+ */
435
+ function computeBlockLetterSpacing(fullText, blockWidth, spans, fontSize, dominantFontName, fontOtByName) {
436
+ if (fontSize < 1)
437
+ return 0;
438
+ const isSingleLine = !fullText.includes('\n');
439
+ // For single-line text, compare the font's full rendering width against the
440
+ // PDF block width. For embedded (non-web) fonts the browser uses the same
441
+ // subset we parsed, so .notdef space width is correct. For known web fonts
442
+ // the browser uses the real font (not the subset), so .notdef space width
443
+ // is wrong — bail out and use the per-span path instead.
444
+ if (isSingleLine) {
445
+ const otFont = fontOtByName.get(dominantFontName);
446
+ if (otFont) {
447
+ const fontIsEmbedded = !isKnownWebFont(dominantFontName);
448
+ const scale = fontSize / otFont.unitsPerEm;
449
+ const notdefGlyph = otFont.glyphs.get(0);
450
+ let fontWidth = 0;
451
+ let charCount = 0;
452
+ let valid = true;
453
+ for (const ch of fullText) {
454
+ const glyph = otFont.charToGlyph(ch);
455
+ if (glyph === notdefGlyph || !glyph.advanceWidth) {
456
+ // For embedded fonts, .notdef space is what the browser will render.
457
+ // For web fonts, .notdef space width is wrong — bail out.
458
+ if (ch === ' ' && fontIsEmbedded) {
459
+ fontWidth += (glyph.advanceWidth || 0) * scale;
460
+ charCount++;
461
+ continue;
462
+ }
463
+ valid = false;
464
+ break;
465
+ }
466
+ fontWidth += glyph.advanceWidth * scale;
467
+ charCount++;
468
+ }
469
+ if (valid && fontWidth > 1 && charCount >= 2) {
470
+ const perCharRatio = (blockWidth - fontWidth) / charCount / fontSize;
471
+ const rounded = Math.round(perCharRatio * 1000) / 1000;
472
+ // If the value is extreme (>10% of fontSize), this likely indicates
473
+ // merged labels with a large spatial gap, not real letter spacing.
474
+ // Fall through to the safer per-span computation.
475
+ if (Math.abs(rounded) < 0.1) {
476
+ const totalPxImpact = Math.abs(rounded) * fontSize * charCount;
477
+ if (totalPxImpact >= 1)
478
+ return rounded;
479
+ return 0;
480
+ }
481
+ }
482
+ }
483
+ }
484
+ // For multi-line text (or when single-line full-text fails), use per-span
485
+ // comparison. This is safer because it avoids the unpredictable .notdef
486
+ // space width that varies wildly between subset fonts.
487
+ let totalDelta = 0;
488
+ let totalChars = 0;
489
+ for (const span of spans) {
490
+ if (span.text.length < 2)
491
+ continue;
492
+ const otFont = fontOtByName.get(span.fontName);
493
+ if (!otFont)
494
+ continue;
495
+ const scale = span.fontSize / otFont.unitsPerEm;
496
+ const notdefGlyph = otFont.glyphs.get(0);
497
+ let fontWidth = 0;
498
+ let validChars = 0;
499
+ let hasInvalidGlyph = false;
500
+ for (const ch of span.text) {
501
+ const glyph = otFont.charToGlyph(ch);
502
+ if (glyph === notdefGlyph || !glyph.advanceWidth) {
503
+ hasInvalidGlyph = true;
504
+ break;
505
+ }
506
+ fontWidth += glyph.advanceWidth * scale;
507
+ validChars++;
508
+ }
509
+ if (hasInvalidGlyph || fontWidth < 1)
510
+ continue;
511
+ const delta = span.width - fontWidth;
512
+ totalDelta += delta;
513
+ totalChars += validChars;
514
+ }
515
+ if (totalChars < 2)
516
+ return 0;
517
+ const perCharRatio = totalDelta / totalChars / fontSize;
518
+ const rounded = Math.round(perCharRatio * 1000) / 1000;
519
+ // Threshold: ignore if the total pixel impact is < 1px.
520
+ // At small fonts (12pt, 10 chars) 0.005 ratio = 0.6px total → skip.
521
+ // At large fonts (615pt, 9 chars) 0.003 ratio = 16px total → keep.
522
+ const totalPxImpact = Math.abs(rounded) * fontSize * totalChars;
523
+ return totalPxImpact < 1 ? 0 : rounded;
524
+ }
525
/**
 * Build `text` elements from a page's text content.
 *
 * Text items are grouped into spans and blocks by the text-grouper helpers.
 * For every block that is large enough this assembles the block's text line
 * by line, derives font, alignment, line height and letter spacing from the
 * dominant (longest) span, and computes the element box.
 *
 * @param options.page - Object providing `getTextContent()`.
 * @param options.pageWidth - Forwarded to `detectAlignment`.
 * @param options.yFlipOffset - Forwarded to `groupTextItems`.
 * @param options.positionColors - Forwarded to `groupTextItems`.
 * @param options.fontNameMap - loadedName → real PDF font name.
 * @param options.fontAscentMap - Forwarded to `groupTextItems`.
 * @param options.fontOtMap - loadedName → parsed opentype font.
 * @param options.generateId - Id factory, called once per emitted element.
 * @returns The emitted elements, each tagged with `_order` for later sorting.
 */
async function buildTextElements({ page, pageWidth, yFlipOffset, positionColors, fontNameMap, fontAscentMap, fontOtMap, generateId, }) {
    const pdfText = await page.getTextContent();
    const allSpans = groupTextItems(pdfText.items, pdfText.styles || {}, yFlipOffset, positionColors, fontNameMap, fontAscentMap);
    // Spans store real PDF font names, while fontOtMap is keyed by loadedName,
    // so the parsed fonts are re-keyed by real name.
    const fontOtByName = new Map();
    for (const entry of fontOtMap) {
        const loadedName = entry[0];
        fontOtByName.set(fontNameMap.get(loadedName) || loadedName, entry[1]);
    }
    const textElements = [];
    const blocks = groupSpansByBlock(allSpans);
    // Page margins feed the alignment detection below.
    const [leftMargin, rightMargin] = estimatePageMargins(allSpans);
    for (const block of blocks) {
        const blockSpans = block.spans;
        if (blockSpans.length === 0) {
            continue;
        }
        if (block.width < MIN_TEXT_WIDTH || block.height < MIN_TEXT_HEIGHT) {
            continue;
        }
        // Dominant span = longest text; on equal length the later span wins.
        let dominant = blockSpans[0];
        for (let i = 1; i < blockSpans.length; i++) {
            if (!(dominant.text.length > blockSpans[i].text.length)) {
                dominant = blockSpans[i];
            }
        }
        const fontFamily = mapPdfFont(dominant.fontName);
        const align = detectAlignment(blockSpans, pageWidth, leftMargin, rightMargin);
        const lineHeight = computeLineHeight(blockSpans);
        // Bucket the spans by line number.
        const spansByLine = new Map();
        for (const span of blockSpans) {
            const bucket = spansByLine.get(span.lineNo);
            if (bucket) {
                bucket.push(span);
            }
            else {
                spansByLine.set(span.lineNo, [span]);
            }
        }
        // For each line (top to bottom): its text and its width.
        const lines = [...spansByLine.keys()]
            .sort((a, b) => a - b)
            .map((lineNo) => {
            const row = spansByLine.get(lineNo);
            row.sort((a, b) => a.x - b.x);
            const pieces = [];
            row.forEach((span, j) => {
                if (j > 0) {
                    // A visible gap between neighbouring spans becomes a space.
                    const previous = row[j - 1];
                    const gap = span.x - (previous.x + previous.width);
                    if (gap > Math.min(span.fontSize * 0.15, 5)) {
                        pieces.push(' ');
                    }
                }
                pieces.push(span.text);
            });
            const firstSpan = row[0];
            const lastSpan = row[row.length - 1];
            return {
                text: pieces.join(''),
                width: lastSpan.x + lastSpan.width - firstSpan.x,
            };
        });
        // Justified text: word-wrapped lines are joined with spaces so Polotno's
        // justify can re-flow them; '\n' is kept only where the preceding line is
        // clearly shorter than the block (a real paragraph break). Every other
        // alignment keeps each line break.
        const fragments = [];
        lines.forEach((line, i) => {
            if (i > 0) {
                const keepFlowing = align === 'justify' && !(lines[i - 1].width < block.width * 0.85);
                fragments.push(keepFlowing ? ' ' : '\n');
            }
            fragments.push(line.text);
        });
        const text = fragments.join('').trim();
        if (!text) {
            continue;
        }
        // Justified text fills the full element width, so it gets no extra
        // width; otherwise multi-line text gets less padding than single-line.
        let extraWidth;
        if (align === 'justify') {
            extraWidth = 0;
        }
        else if (text.includes('\n')) {
            extraWidth = block.width * 0.05 + 10;
        }
        else {
            extraWidth = block.width * 0.15 + 10;
        }
        const elemWidth = block.width + extraWidth;
        // CSS half-leading: the browser splits the extra line space equally above
        // and below the font's content area, whose height is
        // (ascent + |descent|) * fontSize / unitsPerEm rather than fontSize.
        const otFont = fontOtByName.get(dominant.fontName);
        const contentAreaRatio = otFont
            ? (otFont.ascender - otFont.descender) / otFont.unitsPerEm
            : 1.0;
        // May be negative when the content area exceeds the line height; then
        // subtracting it below pushes the element down.
        const halfLeading = (lineHeight - contentAreaRatio) * dominant.fontSize / 2;
        const elemHeight = block.height + Math.abs(halfLeading) * 2 + 5;
        // Shift x so the added padding does not move the visible text.
        let elemX = block.x;
        let elemY = block.y - halfLeading;
        switch (align) {
            case 'center':
                elemX -= extraWidth / 2;
                break;
            case 'right':
                elemX -= extraWidth;
                break;
        }
        // Polotno rotates text around the top-left corner; for vertical text the
        // box is anchored on the baseline so it stays in place after rotation.
        const rotation = dominant.rotation;
        if (rotation <= -45 && rotation >= -135) {
            elemX -= elemHeight;
            elemY = dominant.baselineY;
        }
        else if (rotation >= 45 && rotation <= 135) {
            elemY = dominant.baselineY - elemWidth;
        }
        // The earliest-painted span decides the block's z-order.
        const blockOrder = Math.min(...blockSpans.map((s) => s.orderIndex));
        const letterSpacing = computeBlockLetterSpacing(text, block.width, blockSpans, dominant.fontSize, dominant.fontName, fontOtByName);
        textElements.push({
            type: 'text',
            id: generateId(),
            x: elemX,
            y: elemY,
            width: elemWidth,
            height: elemHeight,
            rotation: dominant.rotation || 0,
            opacity: 1,
            visible: true,
            selectable: true,
            removable: true,
            text,
            fontSize: dominant.fontSize,
            fontFamily,
            fontWeight: dominant.fontWeight || extractWeightFromName(dominant.fontName),
            fontStyle: dominant.fontStyle || extractStyleFromName(dominant.fontName),
            fill: dominant.color || '#000000',
            align,
            lineHeight,
            letterSpacing,
            name: '',
            placeholder: '',
            _order: blockOrder,
        });
    }
    return textElements;
}
674
+ //# sourceMappingURL=page-parser.js.map
@@ -0,0 +1,14 @@
1
+ export interface RawImageStream { // Value type of the map returned by buildJpegIndex: one raw image stream.
2
+ data: Uint8Array; // Raw bytes of the stream.
3
+ mimeType: string; // MIME type string that accompanies `data`.
4
+ }
5
+ /**
6
+ * Build a lookup from PDF object number to its raw JPEG stream data.
7
+ * Only indexes image XObjects with DCTDecode (JPEG) filter.
+ *
+ * @param pdfBytes - Bytes to scan (presumably the complete PDF file — confirm against the implementation).
+ * @returns Map keyed by PDF object number whose values are `RawImageStream` entries.
8
+ */
9
+ export declare function buildJpegIndex(pdfBytes: Uint8Array): Map<number, RawImageStream>;
10
+ /**
11
+ * Parse a pdfjs ref string like "44R" to extract the object number.
+ *
+ * @param ref - pdfjs reference string.
+ * @returns The object number, or `null` (presumably when `ref` does not have the expected form — confirm against the implementation).
12
+ */
13
+ export declare function parseRef(ref: string): number | null;
14
+ //# sourceMappingURL=pdf-image-extractor.d.ts.map