@polotno/pdf-import 0.0.1 → 0.0.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/lib/index.js CHANGED
@@ -1 +1 @@
1
- import{getDocument as x,GlobalWorkerOptions as m}from"pdfjs-dist/legacy/build/pdf.mjs";import{parsePage as I}from"./page-parser.js";import{FontRegistry as R}from"./font-registry.js";import{buildJpegIndex as S}from"./pdf-image-extractor.js";let k=0;function A(){return`el_${Date.now()}_${++k}`}async function j({pdf:e,fontStrategy:w="embed"}){typeof window<"u"&&!m.workerSrc&&(m.workerSrc=new URL("pdfjs-dist/legacy/build/pdf.worker.mjs",import.meta.url).toString());const s=new Uint8Array(e instanceof ArrayBuffer?e:e.buffer.slice(e.byteOffset,e.byteOffset+e.byteLength)),l=S(s),r=await x({data:s,useSystemFonts:!0,disableFontFace:!0,fontExtraProperties:!0}).promise,i=new R;let c=612,g=792;const f=await r.getPage(1),u=f.getViewport({scale:1});c=u.width,g=u.height;const d=3,n=new Array(r.numPages);for(let o=0;o<r.numPages;o+=d){const y=Math.min(o+d,r.numPages),p=[];for(let t=o;t<y;t++)p.push((async()=>{const a=t===0?f:await r.getPage(t+1),{parsedPage:P}=await I({page:a,pageIdx:t,fontRegistry:i,generateId:A,jpegIndex:l});return{parsedPage:P,pageIdx:t}})());const b=await Promise.all(p);for(const{parsedPage:t,pageIdx:a}of b)n[a]=t}await r.destroy();const h=i.finalize(w,n);return{width:c,height:g,fonts:h,pages:n,unit:"px",dpi:72}}export{j as pdfToJson};
1
+ import{getDocument as x,GlobalWorkerOptions as m}from"pdfjs-dist/legacy/build/pdf.mjs";import{parsePage as R}from"./page-parser.js";import{FontRegistry as I}from"./font-registry.js";import{buildJpegIndex as O}from"./pdf-image-extractor.js";import{workerSource as k}from"./generated/pdf-worker-source.js";let A=0;function C(){return`el_${Date.now()}_${++A}`}async function L({pdf:e,fontStrategy:s="embed"}){if(typeof window<"u"&&!m.workerSrc){const r=new Blob([k],{type:"application/javascript"});m.workerSrc=URL.createObjectURL(r)}const i=new Uint8Array(e instanceof ArrayBuffer?e:e.buffer.slice(e.byteOffset,e.byteOffset+e.byteLength)),d=O(i),o=await x({data:i,useSystemFonts:!0,disableFontFace:!0,fontExtraProperties:!0}).promise,c=new I;let g=612,p=792;const f=await o.getPage(1),u=f.getViewport({scale:1});g=u.width,p=u.height;const w=3,n=new Array(o.numPages);for(let r=0;r<o.numPages;r+=w){const h=Math.min(r+w,o.numPages),l=[];for(let t=r;t<h;t++)l.push((async()=>{const a=t===0?f:await o.getPage(t+1),{parsedPage:P}=await R({page:a,pageIdx:t,fontRegistry:c,generateId:C,jpegIndex:d,fontStrategy:s});return{parsedPage:P,pageIdx:t}})());const y=await Promise.all(l);for(const{parsedPage:t,pageIdx:a}of y)n[a]=t}await o.destroy();const b=c.finalize(s,n);return{width:g,height:p,fonts:b,pages:n,unit:"px",dpi:72}}export{L as pdfToJson};
@@ -7,12 +7,13 @@ interface ParsePageOptions {
7
7
  fontRegistry: FontRegistry;
8
8
  generateId: () => string;
9
9
  jpegIndex: Map<number, RawImageStream>;
10
+ fontStrategy: 'embed' | 'googleFontsMatch';
10
11
  }
11
12
  interface ParsePageResult {
12
13
  parsedPage: PolotnoPage;
13
14
  pageWidth: number;
14
15
  pageHeight: number;
15
16
  }
16
- export declare function parsePage({ page, pageIdx, fontRegistry, generateId, jpegIndex, }: ParsePageOptions): Promise<ParsePageResult>;
17
+ export declare function parsePage({ page, pageIdx, fontRegistry, generateId, jpegIndex, fontStrategy, }: ParsePageOptions): Promise<ParsePageResult>;
17
18
  export {};
18
19
  //# sourceMappingURL=page-parser.d.ts.map
@@ -1,13 +1,13 @@
1
1
  import { extractTextPositionColors, extractDrawingsAndImages, } from './operator-list.js';
2
2
  import { groupTextItems, groupSpansByBlock, detectAlignment, estimatePageMargins, computeLineHeight, } from './text-grouper.js';
3
- import { mapPdfFont, isKnownWebFont, extractWeightFromName, extractStyleFromName, } from './font-mapper.js';
3
+ import { isKnownWebFont, extractWeightFromName, extractStyleFromName, } from './font-mapper.js';
4
4
  import { drawingToSvg, svgToDataUri, clippedDrawingsToSvg, clipPathToSvg, } from './svg-builder.js';
5
5
  import { imageDataToDataUri } from './image-encoder.js';
6
6
  import { rgbTupleToHex } from './color-utils.js';
7
7
  import { MIN_TEXT_WIDTH, MIN_TEXT_HEIGHT, MIN_IMAGE_WIDTH, MIN_IMAGE_HEIGHT, } from './constants.js';
8
8
  import { parseRef } from './pdf-image-extractor.js';
9
9
  import { imageBytesToDataUri } from './image-encoder.js';
10
- export async function parsePage({ page, pageIdx, fontRegistry, generateId, jpegIndex, }) {
10
+ export async function parsePage({ page, pageIdx, fontRegistry, generateId, jpegIndex, fontStrategy, }) {
11
11
  const viewport = page.getViewport({ scale: 1 });
12
12
  const pageWidth = viewport.width;
13
13
  const pageHeight = viewport.height;
@@ -25,7 +25,7 @@ export async function parsePage({ page, pageIdx, fontRegistry, generateId, jpegI
25
25
  const [, imageElements, { fontNameMap, fontAscentMap, fontOtMap }] = await Promise.all([
26
26
  resolveDrawingGradients(page, drawings),
27
27
  buildImageElements(page, imageRefs, pageIdx, generateId, jpegIndex),
28
- collectPageFonts(page, fontRefs, fontRegistry),
28
+ collectPageFonts(page, fontRefs, fontRegistry, fontStrategy === 'embed'),
29
29
  ]);
30
30
  const pageBackground = detectPageBackground(drawings, pageWidth, pageHeight);
31
31
  const svgElements = buildSvgElements(drawings, pageWidth, pageHeight, generateId);
@@ -38,6 +38,8 @@ export async function parsePage({ page, pageIdx, fontRegistry, generateId, jpegI
38
38
  fontAscentMap,
39
39
  fontOtMap,
40
40
  generateId,
41
+ fontStrategy,
42
+ fontRegistry,
41
43
  });
42
44
  // Assemble page: all elements sorted by PDF paint order
43
45
  const allElements = [...svgElements, ...imageElements, ...textElements].sort((a, b) => a._order - b._order);
@@ -383,7 +385,7 @@ async function buildImageElements(page, imageRefs, pageIdx, generateId, jpegInde
383
385
  }
384
386
  return imageElements;
385
387
  }
386
- async function collectPageFonts(page, fontRefs, fontRegistry) {
388
+ async function collectPageFonts(page, fontRefs, fontRegistry, embedAll = false) {
387
389
  // Build font name map: loadedName (g_d0_f1) → real PDF name (CZZZZZ+Roboto-Regular)
388
390
  // Also build font ascent map for accurate Y positioning (glyph top vs baseline)
389
391
  const fontNameMap = new Map();
@@ -405,7 +407,7 @@ async function collectPageFonts(page, fontRefs, fontRegistry) {
405
407
  if (fontObj.ascent != null) {
406
408
  fontAscentMap.set(ref, fontObj.ascent);
407
409
  }
408
- fontRegistry.recordFont(fontObj);
410
+ fontRegistry.recordFont(fontObj, embedAll);
409
411
  // Parse font binary with opentype.js for per-character width computation
410
412
  // Uses FontRegistry cache to avoid re-parsing the same font across pages
411
413
  if (fontObj.data && fontObj.data.length > 0) {
@@ -432,19 +434,18 @@ async function collectPageFonts(page, fontRefs, fontRegistry) {
432
434
  *
433
435
  * We average across all spans in the block and express as ratio of fontSize.
434
436
  */
435
- function computeBlockLetterSpacing(fullText, blockWidth, spans, fontSize, dominantFontName, fontOtByName) {
437
+ function computeBlockLetterSpacing(fullText, blockWidth, spans, fontSize, dominantFontName, fontOtByName, allFontsEmbedded = false) {
436
438
  if (fontSize < 1)
437
439
  return 0;
438
440
  const isSingleLine = !fullText.includes('\n');
439
441
  // For single-line text, compare the font's full rendering width against the
440
- // PDF block width. For embedded (non-web) fonts the browser uses the same
441
- // subset we parsed, so .notdef space width is correct. For known web fonts
442
- // the browser uses the real font (not the subset), so .notdef space width
443
- // is wrong — bail out and use the per-span path instead.
442
+ // PDF block width. When space maps to .notdef, use a fixed 25% em-width
443
+ // estimate instead of the .notdef advance width. PDF subset fonts' .notdef
444
+ // widths vary wildly (26–60% of em) and don't reflect what the browser
445
+ // actually renders for a missing space glyph (system font fallback, ~25%).
444
446
  if (isSingleLine) {
445
447
  const otFont = fontOtByName.get(dominantFontName);
446
448
  if (otFont) {
447
- const fontIsEmbedded = !isKnownWebFont(dominantFontName);
448
449
  const scale = fontSize / otFont.unitsPerEm;
449
450
  const notdefGlyph = otFont.glyphs.get(0);
450
451
  let fontWidth = 0;
@@ -453,10 +454,12 @@ function computeBlockLetterSpacing(fullText, blockWidth, spans, fontSize, domina
453
454
  for (const ch of fullText) {
454
455
  const glyph = otFont.charToGlyph(ch);
455
456
  if (glyph === notdefGlyph || !glyph.advanceWidth) {
456
- // For embedded fonts, .notdef space is what the browser will render.
457
- // For web fonts, .notdef space width is wrong — bail out.
458
- if (ch === ' ' && fontIsEmbedded) {
459
- fontWidth += (glyph.advanceWidth || 0) * scale;
457
+ if (ch === ' ') {
458
+ // Use typical space width (~25% of em) as estimate for what the
459
+ // browser renders when the embedded font lacks a space glyph.
460
+ // See DESIGN_NOTES.md "Letter Spacing & Missing Space Glyphs"
461
+ // for why .notdef width and font injection don't work here.
462
+ fontWidth += fontSize * 0.25;
460
463
  charCount++;
461
464
  continue;
462
465
  }
@@ -516,13 +519,23 @@ function computeBlockLetterSpacing(fullText, blockWidth, spans, fontSize, domina
516
519
  return 0;
517
520
  const perCharRatio = totalDelta / totalChars / fontSize;
518
521
  const rounded = Math.round(perCharRatio * 1000) / 1000;
522
+ // For known web fonts, the embedded subset glyph widths can differ
523
+ // significantly from the real web font the browser will load. Large
524
+ // letterSpacing values (>0.02) are likely subset metric mismatches,
525
+ // not real tracking. Cap to avoid visual distortion.
526
+ // When fonts are embedded, the subset metrics match what Polotno renders,
527
+ // so trust the computed value. For web fonts, large values are likely
528
+ // subset metric mismatches — cap to avoid distortion.
529
+ const dominantIsWebFont = !allFontsEmbedded && isKnownWebFont(spans[0]?.fontName ?? '');
530
+ if (dominantIsWebFont && Math.abs(rounded) > 0.02)
531
+ return 0;
519
532
  // Threshold: ignore if the total pixel impact is < 1px.
520
533
  // At small fonts (12pt, 10 chars) 0.005 ratio = 0.6px total → skip.
521
534
  // At large fonts (615pt, 9 chars) 0.003 ratio = 16px total → keep.
522
535
  const totalPxImpact = Math.abs(rounded) * fontSize * totalChars;
523
536
  return totalPxImpact < 1 ? 0 : rounded;
524
537
  }
525
- async function buildTextElements({ page, pageWidth, yFlipOffset, positionColors, fontNameMap, fontAscentMap, fontOtMap, generateId, }) {
538
+ async function buildTextElements({ page, pageWidth, yFlipOffset, positionColors, fontNameMap, fontAscentMap, fontOtMap, generateId, fontStrategy, fontRegistry, }) {
526
539
  // Extract text
527
540
  const textContent = await page.getTextContent();
528
541
  const textSpans = groupTextItems(textContent.items, textContent.styles || {}, yFlipOffset, positionColors, fontNameMap, fontAscentMap);
@@ -544,7 +557,7 @@ async function buildTextElements({ page, pageWidth, yFlipOffset, positionColors,
544
557
  continue;
545
558
  // Find dominant span (longest text)
546
559
  const dominant = block.spans.reduce((a, b) => a.text.length > b.text.length ? a : b);
547
- const fontFamily = mapPdfFont(dominant.fontName);
560
+ const fontFamily = fontRegistry.getFontFamily(dominant.fontName);
548
561
  const align = detectAlignment(block.spans, pageWidth, leftMargin, rightMargin);
549
562
  const lineHeight = computeLineHeight(block.spans);
550
563
  // Build text content with line breaks
@@ -642,7 +655,7 @@ async function buildTextElements({ page, pageWidth, yFlipOffset, positionColors,
642
655
  // Compute letter spacing by comparing PDF advance widths with opentype.js
643
656
  // character widths. The difference (per char, as ratio of fontSize) tells us
644
657
  // how much extra spacing the PDF applies vs browser default rendering.
645
- const letterSpacing = computeBlockLetterSpacing(textContent2, block.width, block.spans, dominant.fontSize, dominant.fontName, fontOtByName);
658
+ const letterSpacing = computeBlockLetterSpacing(textContent2, block.width, block.spans, dominant.fontSize, dominant.fontName, fontOtByName, fontStrategy === 'embed');
646
659
  textElements.push({
647
660
  type: 'text',
648
661
  id: generateId(),
@@ -658,8 +671,14 @@ async function buildTextElements({ page, pageWidth, yFlipOffset, positionColors,
658
671
  text: textContent2,
659
672
  fontSize: dominant.fontSize,
660
673
  fontFamily,
661
- fontWeight: dominant.fontWeight || extractWeightFromName(dominant.fontName),
662
- fontStyle: dominant.fontStyle || extractStyleFromName(dominant.fontName),
674
+ // When embedding fonts, the subset file IS the specific variant (bold,
675
+ // italic, etc.), so use "normal" to avoid Polotno synthesizing on top.
676
+ fontWeight: fontStrategy === 'embed'
677
+ ? 'normal'
678
+ : dominant.fontWeight || extractWeightFromName(dominant.fontName),
679
+ fontStyle: fontStrategy === 'embed'
680
+ ? 'normal'
681
+ : dominant.fontStyle || extractStyleFromName(dominant.fontName),
663
682
  fill: dominant.color || '#000000',
664
683
  align,
665
684
  lineHeight,
@@ -175,7 +175,12 @@ export function detectAlignment(blockSpans, pageWidth, leftMargin, rightMargin)
175
175
  const pageCenter = (leftMargin + rightMargin) / 2;
176
176
  const centerTol = textWidth * 0.05;
177
177
  const rightTol = textWidth * 0.05;
178
- if (Math.abs(blockCenter - pageCenter) < centerTol)
178
+ // Also check against actual page center to avoid false positives when all
179
+ // text is clustered on one side (margins don't reflect the full page).
180
+ const actualPageCenter = pageWidth / 2;
181
+ const pageCenterTol = pageWidth * 0.05;
182
+ if (Math.abs(blockCenter - pageCenter) < centerTol &&
183
+ Math.abs(blockCenter - actualPageCenter) < pageCenterTol)
179
184
  return 'center';
180
185
  // Only classify as right-aligned if the line is short relative to the text
181
186
  // area — a near-full-width line that happens to align with the right margin
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@polotno/pdf-import",
3
- "version": "0.0.1",
3
+ "version": "0.0.2",
4
4
  "description": "Convert PDF files into Polotno JSON format",
5
5
  "type": "module",
6
6
  "main": "./lib/index.js",
@@ -12,7 +12,7 @@
12
12
  }
13
13
  },
14
14
  "scripts": {
15
- "build": "tsc && node build.js",
15
+ "build": "node generate-worker.js && tsc && node build.js",
16
16
  "test": "vitest run tests/index.test.ts",
17
17
  "test:update": "vitest run tests/index.test.ts --update",
18
18
  "test:visual": "node --max-old-space-size=4096 ./node_modules/.bin/vitest run --config vitest.visual.config.ts tests/visual-regression.test.ts",