@polotno/pdf-import 0.0.1 → 0.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/lib/font-registry.d.ts +4 -1
- package/lib/font-registry.js +22 -8
- package/lib/generated/pdf-worker-source.d.ts +2 -0
- package/lib/generated/pdf-worker-source.js +23 -0
- package/lib/index.js +1 -1
- package/lib/page-parser.d.ts +2 -1
- package/lib/page-parser.js +39 -20
- package/lib/text-blocks.js +6 -1
- package/package.json +2 -2
package/lib/index.js
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
import{getDocument as x,GlobalWorkerOptions as m}from"pdfjs-dist/legacy/build/pdf.mjs";import{parsePage as
|
|
1
|
+
import { getDocument, GlobalWorkerOptions } from "pdfjs-dist/legacy/build/pdf.mjs";
import { parsePage } from "./page-parser.js";
import { FontRegistry } from "./font-registry.js";
import { buildJpegIndex } from "./pdf-image-extractor.js";
import { workerSource } from "./generated/pdf-worker-source.js";

// Number of pages whose parsing is started together before awaiting the batch.
const PAGE_BATCH_SIZE = 3;

// Monotonic counter; keeps ids distinct when several are generated within the
// same millisecond.
let elementCounter = 0;

/** Returns a fresh element id of the form `el_<epoch-ms>_<counter>`. */
function generateElementId() {
  return `el_${Date.now()}_${++elementCounter}`;
}

/**
 * Converts a PDF into Polotno JSON.
 *
 * @param {object} options
 * @param {ArrayBuffer|ArrayBufferView} options.pdf Raw PDF bytes. A typed-array
 *   view is copied so that only its own byte range is handed to the parser.
 * @param {'embed'|'googleFontsMatch'} [options.fontStrategy="embed"] Forwarded
 *   to every `parsePage` call and to `FontRegistry#finalize`.
 * @returns {Promise<{width: number, height: number, fonts: *, pages: Array, unit: "px", dpi: 72}>}
 *   `width`/`height` are taken from the first page's viewport at scale 1;
 *   `pages` holds one parsed page per PDF page, in document order.
 */
async function pdfToJson({ pdf, fontStrategy = "embed" }) {
  // In a browser, point pdf.js at the bundled worker source through a Blob URL,
  // unless the host application has already configured a worker.
  if (typeof window < "u" && !GlobalWorkerOptions.workerSrc) {
    const workerBlob = new Blob([workerSource], { type: "application/javascript" });
    GlobalWorkerOptions.workerSrc = URL.createObjectURL(workerBlob);
  }

  const bytes = new Uint8Array(
    pdf instanceof ArrayBuffer
      ? pdf
      : pdf.buffer.slice(pdf.byteOffset, pdf.byteOffset + pdf.byteLength)
  );

  // The JPEG index is built from the raw bytes before they are given to pdf.js.
  const jpegIndex = buildJpegIndex(bytes);
  const doc = await getDocument({
    data: bytes,
    useSystemFonts: true,
    disableFontFace: true,
    fontExtraProperties: true,
  }).promise;

  const fontRegistry = new FontRegistry();
  const pages = new Array(doc.numPages);
  let width;
  let height;

  try {
    const firstPage = await doc.getPage(1);
    ({ width, height } = firstPage.getViewport({ scale: 1 }));

    for (let start = 0; start < doc.numPages; start += PAGE_BATCH_SIZE) {
      const end = Math.min(start + PAGE_BATCH_SIZE, doc.numPages);
      const batch = [];
      for (let pageIdx = start; pageIdx < end; pageIdx++) {
        batch.push(
          (async () => {
            // Page 1 was already fetched for the dimensions; reuse it.
            const page = pageIdx === 0 ? firstPage : await doc.getPage(pageIdx + 1);
            const { parsedPage } = await parsePage({
              page,
              pageIdx,
              fontRegistry,
              generateId: generateElementId,
              jpegIndex,
              fontStrategy,
            });
            return { parsedPage, pageIdx };
          })()
        );
      }
      // Store by index so the output order does not depend on completion order.
      for (const { parsedPage, pageIdx } of await Promise.all(batch)) {
        pages[pageIdx] = parsedPage;
      }
    }
  } finally {
    // Release the document even when a page fails to load or parse; previously
    // a rejection skipped destroy() and left the document alive.
    await doc.destroy();
  }

  const fonts = fontRegistry.finalize(fontStrategy, pages);
  return { width, height, fonts, pages, unit: "px", dpi: 72 };
}

export { pdfToJson };
|
package/lib/page-parser.d.ts
CHANGED
|
@@ -7,12 +7,13 @@ interface ParsePageOptions {
|
|
|
7
7
|
fontRegistry: FontRegistry;
|
|
8
8
|
generateId: () => string;
|
|
9
9
|
jpegIndex: Map<number, RawImageStream>;
|
|
10
|
+
fontStrategy: 'embed' | 'googleFontsMatch';
|
|
10
11
|
}
|
|
11
12
|
interface ParsePageResult {
|
|
12
13
|
parsedPage: PolotnoPage;
|
|
13
14
|
pageWidth: number;
|
|
14
15
|
pageHeight: number;
|
|
15
16
|
}
|
|
16
|
-
export declare function parsePage({ page, pageIdx, fontRegistry, generateId, jpegIndex, }: ParsePageOptions): Promise<ParsePageResult>;
|
|
17
|
+
export declare function parsePage({ page, pageIdx, fontRegistry, generateId, jpegIndex, fontStrategy, }: ParsePageOptions): Promise<ParsePageResult>;
|
|
17
18
|
export {};
|
|
18
19
|
//# sourceMappingURL=page-parser.d.ts.map
|
package/lib/page-parser.js
CHANGED
|
@@ -1,13 +1,13 @@
|
|
|
1
1
|
import { extractTextPositionColors, extractDrawingsAndImages, } from './operator-list.js';
|
|
2
2
|
import { groupTextItems, groupSpansByBlock, detectAlignment, estimatePageMargins, computeLineHeight, } from './text-grouper.js';
|
|
3
|
-
import {
|
|
3
|
+
import { isKnownWebFont, extractWeightFromName, extractStyleFromName, } from './font-mapper.js';
|
|
4
4
|
import { drawingToSvg, svgToDataUri, clippedDrawingsToSvg, clipPathToSvg, } from './svg-builder.js';
|
|
5
5
|
import { imageDataToDataUri } from './image-encoder.js';
|
|
6
6
|
import { rgbTupleToHex } from './color-utils.js';
|
|
7
7
|
import { MIN_TEXT_WIDTH, MIN_TEXT_HEIGHT, MIN_IMAGE_WIDTH, MIN_IMAGE_HEIGHT, } from './constants.js';
|
|
8
8
|
import { parseRef } from './pdf-image-extractor.js';
|
|
9
9
|
import { imageBytesToDataUri } from './image-encoder.js';
|
|
10
|
-
export async function parsePage({ page, pageIdx, fontRegistry, generateId, jpegIndex, }) {
|
|
10
|
+
export async function parsePage({ page, pageIdx, fontRegistry, generateId, jpegIndex, fontStrategy, }) {
|
|
11
11
|
const viewport = page.getViewport({ scale: 1 });
|
|
12
12
|
const pageWidth = viewport.width;
|
|
13
13
|
const pageHeight = viewport.height;
|
|
@@ -25,7 +25,7 @@ export async function parsePage({ page, pageIdx, fontRegistry, generateId, jpegI
|
|
|
25
25
|
const [, imageElements, { fontNameMap, fontAscentMap, fontOtMap }] = await Promise.all([
|
|
26
26
|
resolveDrawingGradients(page, drawings),
|
|
27
27
|
buildImageElements(page, imageRefs, pageIdx, generateId, jpegIndex),
|
|
28
|
-
collectPageFonts(page, fontRefs, fontRegistry),
|
|
28
|
+
collectPageFonts(page, fontRefs, fontRegistry, fontStrategy === 'embed'),
|
|
29
29
|
]);
|
|
30
30
|
const pageBackground = detectPageBackground(drawings, pageWidth, pageHeight);
|
|
31
31
|
const svgElements = buildSvgElements(drawings, pageWidth, pageHeight, generateId);
|
|
@@ -38,6 +38,8 @@ export async function parsePage({ page, pageIdx, fontRegistry, generateId, jpegI
|
|
|
38
38
|
fontAscentMap,
|
|
39
39
|
fontOtMap,
|
|
40
40
|
generateId,
|
|
41
|
+
fontStrategy,
|
|
42
|
+
fontRegistry,
|
|
41
43
|
});
|
|
42
44
|
// Assemble page: all elements sorted by PDF paint order
|
|
43
45
|
const allElements = [...svgElements, ...imageElements, ...textElements].sort((a, b) => a._order - b._order);
|
|
@@ -383,7 +385,7 @@ async function buildImageElements(page, imageRefs, pageIdx, generateId, jpegInde
|
|
|
383
385
|
}
|
|
384
386
|
return imageElements;
|
|
385
387
|
}
|
|
386
|
-
async function collectPageFonts(page, fontRefs, fontRegistry) {
|
|
388
|
+
async function collectPageFonts(page, fontRefs, fontRegistry, embedAll = false) {
|
|
387
389
|
// Build font name map: loadedName (g_d0_f1) → real PDF name (CZZZZZ+Roboto-Regular)
|
|
388
390
|
// Also build font ascent map for accurate Y positioning (glyph top vs baseline)
|
|
389
391
|
const fontNameMap = new Map();
|
|
@@ -405,7 +407,7 @@ async function collectPageFonts(page, fontRefs, fontRegistry) {
|
|
|
405
407
|
if (fontObj.ascent != null) {
|
|
406
408
|
fontAscentMap.set(ref, fontObj.ascent);
|
|
407
409
|
}
|
|
408
|
-
fontRegistry.recordFont(fontObj);
|
|
410
|
+
fontRegistry.recordFont(fontObj, embedAll);
|
|
409
411
|
// Parse font binary with opentype.js for per-character width computation
|
|
410
412
|
// Uses FontRegistry cache to avoid re-parsing the same font across pages
|
|
411
413
|
if (fontObj.data && fontObj.data.length > 0) {
|
|
@@ -432,19 +434,18 @@ async function collectPageFonts(page, fontRefs, fontRegistry) {
|
|
|
432
434
|
*
|
|
433
435
|
* We average across all spans in the block and express as ratio of fontSize.
|
|
434
436
|
*/
|
|
435
|
-
function computeBlockLetterSpacing(fullText, blockWidth, spans, fontSize, dominantFontName, fontOtByName) {
|
|
437
|
+
function computeBlockLetterSpacing(fullText, blockWidth, spans, fontSize, dominantFontName, fontOtByName, allFontsEmbedded = false) {
|
|
436
438
|
if (fontSize < 1)
|
|
437
439
|
return 0;
|
|
438
440
|
const isSingleLine = !fullText.includes('\n');
|
|
439
441
|
// For single-line text, compare the font's full rendering width against the
|
|
440
|
-
// PDF block width.
|
|
441
|
-
//
|
|
442
|
-
//
|
|
443
|
-
//
|
|
442
|
+
// PDF block width. When space maps to .notdef, use a fixed 25% em-width
|
|
443
|
+
// estimate instead of the .notdef advance width. PDF subset fonts' .notdef
|
|
444
|
+
// widths vary wildly (26–60% of em) and don't reflect what the browser
|
|
445
|
+
// actually renders for a missing space glyph (system font fallback, ~25%).
|
|
444
446
|
if (isSingleLine) {
|
|
445
447
|
const otFont = fontOtByName.get(dominantFontName);
|
|
446
448
|
if (otFont) {
|
|
447
|
-
const fontIsEmbedded = !isKnownWebFont(dominantFontName);
|
|
448
449
|
const scale = fontSize / otFont.unitsPerEm;
|
|
449
450
|
const notdefGlyph = otFont.glyphs.get(0);
|
|
450
451
|
let fontWidth = 0;
|
|
@@ -453,10 +454,12 @@ function computeBlockLetterSpacing(fullText, blockWidth, spans, fontSize, domina
|
|
|
453
454
|
for (const ch of fullText) {
|
|
454
455
|
const glyph = otFont.charToGlyph(ch);
|
|
455
456
|
if (glyph === notdefGlyph || !glyph.advanceWidth) {
|
|
456
|
-
|
|
457
|
-
|
|
458
|
-
|
|
459
|
-
|
|
457
|
+
if (ch === ' ') {
|
|
458
|
+
// Use typical space width (~25% of em) as estimate for what the
|
|
459
|
+
// browser renders when the embedded font lacks a space glyph.
|
|
460
|
+
// See DESIGN_NOTES.md "Letter Spacing & Missing Space Glyphs"
|
|
461
|
+
// for why .notdef width and font injection don't work here.
|
|
462
|
+
fontWidth += fontSize * 0.25;
|
|
460
463
|
charCount++;
|
|
461
464
|
continue;
|
|
462
465
|
}
|
|
@@ -516,13 +519,23 @@ function computeBlockLetterSpacing(fullText, blockWidth, spans, fontSize, domina
|
|
|
516
519
|
return 0;
|
|
517
520
|
const perCharRatio = totalDelta / totalChars / fontSize;
|
|
518
521
|
const rounded = Math.round(perCharRatio * 1000) / 1000;
|
|
522
|
+
// For known web fonts, the embedded subset glyph widths can differ
|
|
523
|
+
// significantly from the real web font the browser will load. Large
|
|
524
|
+
// letterSpacing values (>0.02) are likely subset metric mismatches,
|
|
525
|
+
// not real tracking. Discard them (return 0) to avoid visual distortion.
|
|
526
|
+
// When fonts are embedded, the subset metrics match what Polotno renders,
|
|
527
|
+
// so trust the computed value. For web fonts, large values are likely
|
|
528
|
+
// subset metric mismatches — discard them (return 0) to avoid distortion.
|
|
529
|
+
const dominantIsWebFont = !allFontsEmbedded && isKnownWebFont(spans[0]?.fontName ?? '');
|
|
530
|
+
if (dominantIsWebFont && Math.abs(rounded) > 0.02)
|
|
531
|
+
return 0;
|
|
519
532
|
// Threshold: ignore if the total pixel impact is < 1px.
|
|
520
533
|
// At small fonts (12pt, 10 chars) 0.005 ratio = 0.6px total → skip.
|
|
521
534
|
// At large fonts (615pt, 9 chars) 0.003 ratio = 16px total → keep.
|
|
522
535
|
const totalPxImpact = Math.abs(rounded) * fontSize * totalChars;
|
|
523
536
|
return totalPxImpact < 1 ? 0 : rounded;
|
|
524
537
|
}
|
|
525
|
-
async function buildTextElements({ page, pageWidth, yFlipOffset, positionColors, fontNameMap, fontAscentMap, fontOtMap, generateId, }) {
|
|
538
|
+
async function buildTextElements({ page, pageWidth, yFlipOffset, positionColors, fontNameMap, fontAscentMap, fontOtMap, generateId, fontStrategy, fontRegistry, }) {
|
|
526
539
|
// Extract text
|
|
527
540
|
const textContent = await page.getTextContent();
|
|
528
541
|
const textSpans = groupTextItems(textContent.items, textContent.styles || {}, yFlipOffset, positionColors, fontNameMap, fontAscentMap);
|
|
@@ -544,7 +557,7 @@ async function buildTextElements({ page, pageWidth, yFlipOffset, positionColors,
|
|
|
544
557
|
continue;
|
|
545
558
|
// Find dominant span (longest text)
|
|
546
559
|
const dominant = block.spans.reduce((a, b) => a.text.length > b.text.length ? a : b);
|
|
547
|
-
const fontFamily =
|
|
560
|
+
const fontFamily = fontRegistry.getFontFamily(dominant.fontName);
|
|
548
561
|
const align = detectAlignment(block.spans, pageWidth, leftMargin, rightMargin);
|
|
549
562
|
const lineHeight = computeLineHeight(block.spans);
|
|
550
563
|
// Build text content with line breaks
|
|
@@ -642,7 +655,7 @@ async function buildTextElements({ page, pageWidth, yFlipOffset, positionColors,
|
|
|
642
655
|
// Compute letter spacing by comparing PDF advance widths with opentype.js
|
|
643
656
|
// character widths. The difference (per char, as ratio of fontSize) tells us
|
|
644
657
|
// how much extra spacing the PDF applies vs browser default rendering.
|
|
645
|
-
const letterSpacing = computeBlockLetterSpacing(textContent2, block.width, block.spans, dominant.fontSize, dominant.fontName, fontOtByName);
|
|
658
|
+
const letterSpacing = computeBlockLetterSpacing(textContent2, block.width, block.spans, dominant.fontSize, dominant.fontName, fontOtByName, fontStrategy === 'embed');
|
|
646
659
|
textElements.push({
|
|
647
660
|
type: 'text',
|
|
648
661
|
id: generateId(),
|
|
@@ -658,8 +671,14 @@ async function buildTextElements({ page, pageWidth, yFlipOffset, positionColors,
|
|
|
658
671
|
text: textContent2,
|
|
659
672
|
fontSize: dominant.fontSize,
|
|
660
673
|
fontFamily,
|
|
661
|
-
|
|
662
|
-
|
|
674
|
+
// When embedding fonts, the subset file IS the specific variant (bold,
|
|
675
|
+
// italic, etc.), so use "normal" to avoid Polotno synthesizing on top.
|
|
676
|
+
fontWeight: fontStrategy === 'embed'
|
|
677
|
+
? 'normal'
|
|
678
|
+
: dominant.fontWeight || extractWeightFromName(dominant.fontName),
|
|
679
|
+
fontStyle: fontStrategy === 'embed'
|
|
680
|
+
? 'normal'
|
|
681
|
+
: dominant.fontStyle || extractStyleFromName(dominant.fontName),
|
|
663
682
|
fill: dominant.color || '#000000',
|
|
664
683
|
align,
|
|
665
684
|
lineHeight,
|
package/lib/text-blocks.js
CHANGED
|
@@ -175,7 +175,12 @@ export function detectAlignment(blockSpans, pageWidth, leftMargin, rightMargin)
|
|
|
175
175
|
const pageCenter = (leftMargin + rightMargin) / 2;
|
|
176
176
|
const centerTol = textWidth * 0.05;
|
|
177
177
|
const rightTol = textWidth * 0.05;
|
|
178
|
-
|
|
178
|
+
// Also check against actual page center to avoid false positives when all
|
|
179
|
+
// text is clustered on one side (margins don't reflect the full page).
|
|
180
|
+
const actualPageCenter = pageWidth / 2;
|
|
181
|
+
const pageCenterTol = pageWidth * 0.05;
|
|
182
|
+
if (Math.abs(blockCenter - pageCenter) < centerTol &&
|
|
183
|
+
Math.abs(blockCenter - actualPageCenter) < pageCenterTol)
|
|
179
184
|
return 'center';
|
|
180
185
|
// Only classify as right-aligned if the line is short relative to the text
|
|
181
186
|
// area — a near-full-width line that happens to align with the right margin
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@polotno/pdf-import",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.2",
|
|
4
4
|
"description": "Convert PDF files into Polotno JSON format",
|
|
5
5
|
"type": "module",
|
|
6
6
|
"main": "./lib/index.js",
|
|
@@ -12,7 +12,7 @@
|
|
|
12
12
|
}
|
|
13
13
|
},
|
|
14
14
|
"scripts": {
|
|
15
|
-
"build": "tsc && node build.js",
|
|
15
|
+
"build": "node generate-worker.js && tsc && node build.js",
|
|
16
16
|
"test": "vitest run tests/index.test.ts",
|
|
17
17
|
"test:update": "vitest run tests/index.test.ts --update",
|
|
18
18
|
"test:visual": "node --max-old-space-size=4096 ./node_modules/.bin/vitest run --config vitest.visual.config.ts tests/visual-regression.test.ts",
|