@namahapdf/core 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.d.cts CHANGED
@@ -189,7 +189,11 @@ declare class PDFParser {
189
189
  */
190
190
  private extractVersion;
191
191
  /**
192
- * Parse all xref tables (including previous versions for incremental updates)
192
+ * Parse all xref tables (including previous versions for incremental updates).
193
+ * Falls back to a full-file brute-force scan when the real xref is missing or
194
+ * corrupt (missing startxref, malformed entries, unreadable xref stream) or
195
+ * when it parses but yields no usable /Root — so otherwise-readable PDFs with
196
+ * a broken cross-reference table still open.
193
197
  */
194
198
  private parseXRefTables;
195
199
  /**
@@ -470,6 +474,30 @@ type RenderTextRun = {
470
474
  * embedded family + substitute fallback, or just the substitute family).
471
475
  * Lets the inline editor render identical text — no size/width jump. */
472
476
  fontFamily?: string;
477
+ /** PDF text rendering mode (Tr) this run was drawn with. Modes 3 and 7 are
478
+ * invisible — the run is captured for editing/search but not painted. A page
479
+ * whose text is predominantly invisible is the hallmark of a scanned/OCR
480
+ * layer (see editor/scannedTextRecovery). */
481
+ textRenderMode: number;
482
+ /** Convenience: true for invisible render modes (3 = invisible, 7 = clip). */
483
+ invisible: boolean;
484
+ /** Cap-height (1000-em units) measured from the actual embedded FontFace, when
485
+ * one was registered. The editor uses it to size substitute-font redraws to
486
+ * match the original's caps. Measured (not the descriptor's often-wrong
487
+ * /CapHeight) — see measureRegisteredCapHeight. */
488
+ fontCapHeight?: number;
489
+ /** Numeric font weight (100–900) inferred from the base font name + descriptor
490
+ * (`inferFontWeight`). Lets exporters decide bold from the real weight rather
491
+ * than re-guessing from the name. ≥600 is effectively bold. */
492
+ fontWeight?: number;
493
+ /** Font slant inferred from the base font name + descriptor (`inferFontStyle`).
494
+ * Lets exporters decide italics from the real style, not a name heuristic. */
495
+ fontStyle?: 'normal' | 'italic';
496
+ /** The embedded-font family (npf-<hash>) this run was drawn with, when the PDF
497
+ * embedded a Unicode-cmap sfnt for it. Keys into
498
+ * `PageRenderer.getEmbeddedFontPrograms()` so an exporter can embed the real
499
+ * font bytes (e.g. into a .docx) instead of substituting. */
500
+ registeredFamily?: string;
473
501
  };
474
502
  /** A drawn image (XObject or inline), captured for editor selection. */
475
503
  type RenderImageBox = {
@@ -518,6 +546,10 @@ declare class PageRenderer {
518
546
  /** fontName -> registered FontFace family, populated by preloadEmbeddedFonts
519
547
  * before each async render pass. Read by loadFontInfo. */
520
548
  private embeddedFamilies;
549
+ /** family (npf-<hash>) → the embeddable sfnt bytes + PDF base font name.
550
+ * Captured independent of FontFace registration so it is populated even in
551
+ * headless/Node renders, and read by exporters (DOCX) to embed real fonts. */
552
+ private embeddedFontBytes;
521
553
  /** Memoized canvas glyph widths keyed by `${ctx.font}|${char}`, cleared per
522
554
  * render pass. Avoids a measureText (GPU readback) per glyph for fonts that
523
555
  * lack PDF width metrics. */
@@ -535,6 +567,12 @@ declare class PageRenderer {
535
567
  private currentPathY;
536
568
  private pathStartX;
537
569
  private pathStartY;
570
+ /** True while a path is being constructed (between a paint/clear op and the
571
+ * next painting op). A PDF path accumulates multiple subpaths (e.g. a glyph
572
+ * outline + its counter holes) and is only reset by a painting operator — so
573
+ * `beginPath()` must run once per path, NOT on every `m`/`re`, or all but the
574
+ * last subpath are discarded (counters/holes vanish, the `i` loses its stem). */
575
+ private pathOpen;
538
576
  private readonly initialTransform;
539
577
  private readonly forcedColor;
540
578
  private readonly patternDepth;
@@ -608,6 +646,15 @@ declare class PageRenderer {
608
646
  private decodeText;
609
647
  /** Canvas glyph width, memoized per (current font, text) for the render pass. */
610
648
  private measureGlyphWidthCached;
649
+ /**
650
+ * Cap-height (in 1000-unit em space) of a registered embedded font, measured
651
+ * from the actual FontFace via the canvas. This is the reliable source the
652
+ * editor uses to size substitute-font redraws: a PDF's FontDescriptor
653
+ * /CapHeight is often wrong (e.g. a Calibri subset declaring 750 when the real
654
+ * value is ~644), but measuring 'H' at 1000px gives the true cap height of the
655
+ * glyphs we actually drew. Cached per family; undefined if measurement fails.
656
+ */
657
+ private measureRegisteredCapHeight;
611
658
  /**
612
659
  * Resolve the page's Font dictionary and register each embedded font program
613
660
  * (TrueType/OpenType with a Unicode cmap) as a browser FontFace, recording
@@ -615,6 +662,13 @@ declare class PageRenderer {
615
662
  * font absent from the map, so loadFontInfo simply falls back to substitution.
616
663
  */
617
664
  private preloadEmbeddedFonts;
665
+ /** family (npf-<hash>) → embeddable sfnt bytes + PDF base font name, for every
666
+ * Unicode-cmap font the page embedded. Exporters embed these to reproduce the
667
+ * original type. Keyed to match `RenderTextRun.registeredFamily`. */
668
+ getEmbeddedFontPrograms(): Map<string, {
669
+ bytes: Uint8Array;
670
+ baseName: string;
671
+ }>;
618
672
  private getCurrentFontInfo;
619
673
  private loadFontInfo;
620
674
  private logFontDiagnostic;
@@ -641,9 +695,10 @@ declare class PageRenderer {
641
695
  private decodeJbig2;
642
696
  /**
643
697
  * Decode a CCITTFax (Group 3/4 fax) image stream to packed 1-bpp filter bytes
644
- * via pdf.js's `CCITTFaxDecoder`. Polarity (BlackIs1) and the other
645
- * parameters come from /DecodeParms; the decoder yields one packed byte per
646
- * `readNextChar()`. Returns null on failure.
698
+ * via the vendored `CCITTFaxDecoder` (see `./ccitt` — pdf.js's
699
+ * image_decoders bundle doesn't export this class). Polarity (BlackIs1) and
700
+ * the other parameters come from /DecodeParms; the decoder yields one packed
701
+ * byte per `readNextChar()`. Returns null on failure.
647
702
  */
648
703
  private decodeCcitt;
649
704
  /** Composite an RGBA ImageData through a scratch canvas onto the page (the
@@ -704,6 +759,10 @@ declare class PageRenderer {
704
759
  private applyPatternColor;
705
760
  /** Render one tiling-pattern cell with a child renderer in pattern space. */
706
761
  private renderTilingCell;
762
+ /** Begin a fresh canvas path the first time geometry is added after a paint/
763
+ * clear op. PDF accumulates all subpaths into one path until a painting op
764
+ * resets it; calling `beginPath()` per `m`/`re` would drop earlier subpaths. */
765
+ private beginSubpathIfNeeded;
707
766
  /** Fill the current path: plain color, or pattern painted in pattern space. */
708
767
  private paintFill;
709
768
  /**
@@ -890,6 +949,9 @@ type PDFFontSpec = {
890
949
  color?: PDFColor;
891
950
  lineHeight?: number;
892
951
  charSpacing?: number;
952
+ /** Cap-height (1000-em units) measured from the original embedded font, when
953
+ * known. Used to size the substitute redraw so its caps match the original's. */
954
+ capHeight?: number;
893
955
  };
894
956
  type PdfMatrix = [number, number, number, number, number, number];
895
957
 
@@ -1158,6 +1220,26 @@ declare class PDFEditSession {
1158
1220
  * Underline / strikethrough are thin rectangles spanning the drawn width.
1159
1221
  */
1160
1222
  private drawStyledSpans;
1223
+ /**
1224
+ * Resolve the font to redraw an edit with, reporting whether it is a
1225
+ * substitute (`substituted: true`) or the document's own embedded font.
1226
+ *
1227
+ * Option A (keep-original-when-it-fits): prefer the document's embedded font
1228
+ * so an edit keeps the original typeface whenever that font can encode the
1229
+ * typed text — normal-letter edits no longer drift to Helvetica/Times. Fall
1230
+ * back to a standard look-alike (mapped from the PDF font name so weight/
1231
+ * style are kept) only when the embedded subset can't encode the text (e.g.
1232
+ * a brand-new glyph it never contained), which also avoids dropped/garbled
1233
+ * characters. Standard fonts encode with simple WinAnsi and round-trip
1234
+ * reliably through our own renderer.
1235
+ *
1236
+ * The embedded-font check (`fontSupportsText`) layout-probes the text — it
1237
+ * runs fontkit's full OpenType layout, not just cmap/encode — so a subset
1238
+ * font with pruned glyphs (or a GSUB ligature referencing a pruned glyph)
1239
+ * is rejected here and we fall through to the standard look-alike instead of
1240
+ * crashing later inside `drawText`/`widthOfTextAtSize` ("...reading
1241
+ * 'advanceWidth'"). The draw/measure call sites are additionally guarded.
1242
+ */
1161
1243
  private getRenderableFont;
1162
1244
  private supportedCodePoints;
1163
1245
  private unsupportedCount;
@@ -1169,7 +1251,31 @@ declare class PDFEditSession {
1169
1251
  */
1170
1252
  private fitTextToFont;
1171
1253
  private fontSupportsText;
1254
+ /**
1255
+ * True if fontkit can actually lay `text` out with `font`. This runs the
1256
+ * same code path that `drawText`/`widthOfTextAtSize` use, so it catches
1257
+ * subset fonts with GSUB holes that pass the cheaper cmap/encode checks.
1258
+ */
1259
+ private fontCanLayout;
1260
+ /** Measure text width, returning 0 if fontkit layout fails. */
1261
+ private safeWidth;
1262
+ /**
1263
+ * Draw text, falling back to `fallbackFont` if the chosen font throws inside
1264
+ * fontkit. The `fontSupportsText` probe should already prevent this, but a
1265
+ * font layout failure must never escape and fail the whole edit pipeline.
1266
+ * Returns the font actually used so callers can measure with it.
1267
+ */
1268
+ private safeDrawText;
1172
1269
  private getEmbeddedFont;
1270
+ /**
1271
+ * True if `fontBytes` can be embedded and a document containing it saved
1272
+ * without throwing. Run on a throwaway document so a font that crashes
1273
+ * pdf-lib's embedder (e.g. missing name-table records, or a glyph the width
1274
+ * table can't resolve) is rejected here — before it can poison `this.pdfDoc`
1275
+ * and crash `toBlob()`. Cheap and done once per font (the result is cached
1276
+ * via the embeddedFontCache promise).
1277
+ */
1278
+ private embedSurvivesSave;
1173
1279
  private getEmbeddedFontPrograms;
1174
1280
  private ensureFontkitRegistered;
1175
1281
  }
@@ -1255,11 +1361,19 @@ declare class PDFEditProcessor implements IAvniProcessor {
1255
1361
  process(doc: AvniDocument, params: Record<string, unknown>): Promise<ProcessorResult>;
1256
1362
  }
1257
1363
 
1364
+ /**
1365
+ * Pipeline wrapper around the client-side compressor (`src/engine/compress`).
1366
+ *
1367
+ * The real work — decode-any-image → downsample → JPEG re-encode, plus metadata
1368
+ * strip / content-stream Flate / object streams — lives in `compressPdf`. This
1369
+ * processor just adapts it to the `IAvniProcessor` contract so the orchestrator
1370
+ * (and `test-engine`) can drive it. `params.level` accepts a preset id or the
1371
+ * legacy numeric 0–100 value.
1372
+ */
1258
1373
  declare class PDFCompressor implements IAvniProcessor {
1259
1374
  name: string;
1260
1375
  supportedTypes: "pdf"[];
1261
1376
  process(doc: AvniDocument, params: Record<string, unknown>): Promise<ProcessorResult>;
1262
- private compressImageInBrowser;
1263
1377
  }
1264
1378
 
1265
1379
  /** Shared license/activation contracts used by both the engine gate and the server. */
package/dist/index.d.ts CHANGED
@@ -189,7 +189,11 @@ declare class PDFParser {
189
189
  */
190
190
  private extractVersion;
191
191
  /**
192
- * Parse all xref tables (including previous versions for incremental updates)
192
+ * Parse all xref tables (including previous versions for incremental updates).
193
+ * Falls back to a full-file brute-force scan when the real xref is missing or
194
+ * corrupt (missing startxref, malformed entries, unreadable xref stream) or
195
+ * when it parses but yields no usable /Root — so otherwise-readable PDFs with
196
+ * a broken cross-reference table still open.
193
197
  */
194
198
  private parseXRefTables;
195
199
  /**
@@ -470,6 +474,30 @@ type RenderTextRun = {
470
474
  * embedded family + substitute fallback, or just the substitute family).
471
475
  * Lets the inline editor render identical text — no size/width jump. */
472
476
  fontFamily?: string;
477
+ /** PDF text rendering mode (Tr) this run was drawn with. Modes 3 and 7 are
478
+ * invisible — the run is captured for editing/search but not painted. A page
479
+ * whose text is predominantly invisible is the hallmark of a scanned/OCR
480
+ * layer (see editor/scannedTextRecovery). */
481
+ textRenderMode: number;
482
+ /** Convenience: true for invisible render modes (3 = invisible, 7 = clip). */
483
+ invisible: boolean;
484
+ /** Cap-height (1000-em units) measured from the actual embedded FontFace, when
485
+ * one was registered. The editor uses it to size substitute-font redraws to
486
+ * match the original's caps. Measured (not the descriptor's often-wrong
487
+ * /CapHeight) — see measureRegisteredCapHeight. */
488
+ fontCapHeight?: number;
489
+ /** Numeric font weight (100–900) inferred from the base font name + descriptor
490
+ * (`inferFontWeight`). Lets exporters decide bold from the real weight rather
491
+ * than re-guessing from the name. ≥600 is effectively bold. */
492
+ fontWeight?: number;
493
+ /** Font slant inferred from the base font name + descriptor (`inferFontStyle`).
494
+ * Lets exporters decide italics from the real style, not a name heuristic. */
495
+ fontStyle?: 'normal' | 'italic';
496
+ /** The embedded-font family (npf-<hash>) this run was drawn with, when the PDF
497
+ * embedded a Unicode-cmap sfnt for it. Keys into
498
+ * `PageRenderer.getEmbeddedFontPrograms()` so an exporter can embed the real
499
+ * font bytes (e.g. into a .docx) instead of substituting. */
500
+ registeredFamily?: string;
473
501
  };
474
502
  /** A drawn image (XObject or inline), captured for editor selection. */
475
503
  type RenderImageBox = {
@@ -518,6 +546,10 @@ declare class PageRenderer {
518
546
  /** fontName -> registered FontFace family, populated by preloadEmbeddedFonts
519
547
  * before each async render pass. Read by loadFontInfo. */
520
548
  private embeddedFamilies;
549
+ /** family (npf-<hash>) → the embeddable sfnt bytes + PDF base font name.
550
+ * Captured independent of FontFace registration so it is populated even in
551
+ * headless/Node renders, and read by exporters (DOCX) to embed real fonts. */
552
+ private embeddedFontBytes;
521
553
  /** Memoized canvas glyph widths keyed by `${ctx.font}|${char}`, cleared per
522
554
  * render pass. Avoids a measureText (GPU readback) per glyph for fonts that
523
555
  * lack PDF width metrics. */
@@ -535,6 +567,12 @@ declare class PageRenderer {
535
567
  private currentPathY;
536
568
  private pathStartX;
537
569
  private pathStartY;
570
+ /** True while a path is being constructed (between a paint/clear op and the
571
+ * next painting op). A PDF path accumulates multiple subpaths (e.g. a glyph
572
+ * outline + its counter holes) and is only reset by a painting operator — so
573
+ * `beginPath()` must run once per path, NOT on every `m`/`re`, or all but the
574
+ * last subpath are discarded (counters/holes vanish, the `i` loses its stem). */
575
+ private pathOpen;
538
576
  private readonly initialTransform;
539
577
  private readonly forcedColor;
540
578
  private readonly patternDepth;
@@ -608,6 +646,15 @@ declare class PageRenderer {
608
646
  private decodeText;
609
647
  /** Canvas glyph width, memoized per (current font, text) for the render pass. */
610
648
  private measureGlyphWidthCached;
649
+ /**
650
+ * Cap-height (in 1000-unit em space) of a registered embedded font, measured
651
+ * from the actual FontFace via the canvas. This is the reliable source the
652
+ * editor uses to size substitute-font redraws: a PDF's FontDescriptor
653
+ * /CapHeight is often wrong (e.g. a Calibri subset declaring 750 when the real
654
+ * value is ~644), but measuring 'H' at 1000px gives the true cap height of the
655
+ * glyphs we actually drew. Cached per family; undefined if measurement fails.
656
+ */
657
+ private measureRegisteredCapHeight;
611
658
  /**
612
659
  * Resolve the page's Font dictionary and register each embedded font program
613
660
  * (TrueType/OpenType with a Unicode cmap) as a browser FontFace, recording
@@ -615,6 +662,13 @@ declare class PageRenderer {
615
662
  * font absent from the map, so loadFontInfo simply falls back to substitution.
616
663
  */
617
664
  private preloadEmbeddedFonts;
665
+ /** family (npf-<hash>) → embeddable sfnt bytes + PDF base font name, for every
666
+ * Unicode-cmap font the page embedded. Exporters embed these to reproduce the
667
+ * original type. Keyed to match `RenderTextRun.registeredFamily`. */
668
+ getEmbeddedFontPrograms(): Map<string, {
669
+ bytes: Uint8Array;
670
+ baseName: string;
671
+ }>;
618
672
  private getCurrentFontInfo;
619
673
  private loadFontInfo;
620
674
  private logFontDiagnostic;
@@ -641,9 +695,10 @@ declare class PageRenderer {
641
695
  private decodeJbig2;
642
696
  /**
643
697
  * Decode a CCITTFax (Group 3/4 fax) image stream to packed 1-bpp filter bytes
644
- * via pdf.js's `CCITTFaxDecoder`. Polarity (BlackIs1) and the other
645
- * parameters come from /DecodeParms; the decoder yields one packed byte per
646
- * `readNextChar()`. Returns null on failure.
698
+ * via the vendored `CCITTFaxDecoder` (see `./ccitt` — pdf.js's
699
+ * image_decoders bundle doesn't export this class). Polarity (BlackIs1) and
700
+ * the other parameters come from /DecodeParms; the decoder yields one packed
701
+ * byte per `readNextChar()`. Returns null on failure.
647
702
  */
648
703
  private decodeCcitt;
649
704
  /** Composite an RGBA ImageData through a scratch canvas onto the page (the
@@ -704,6 +759,10 @@ declare class PageRenderer {
704
759
  private applyPatternColor;
705
760
  /** Render one tiling-pattern cell with a child renderer in pattern space. */
706
761
  private renderTilingCell;
762
+ /** Begin a fresh canvas path the first time geometry is added after a paint/
763
+ * clear op. PDF accumulates all subpaths into one path until a painting op
764
+ * resets it; calling `beginPath()` per `m`/`re` would drop earlier subpaths. */
765
+ private beginSubpathIfNeeded;
707
766
  /** Fill the current path: plain color, or pattern painted in pattern space. */
708
767
  private paintFill;
709
768
  /**
@@ -890,6 +949,9 @@ type PDFFontSpec = {
890
949
  color?: PDFColor;
891
950
  lineHeight?: number;
892
951
  charSpacing?: number;
952
+ /** Cap-height (1000-em units) measured from the original embedded font, when
953
+ * known. Used to size the substitute redraw so its caps match the original's. */
954
+ capHeight?: number;
893
955
  };
894
956
  type PdfMatrix = [number, number, number, number, number, number];
895
957
 
@@ -1158,6 +1220,26 @@ declare class PDFEditSession {
1158
1220
  * Underline / strikethrough are thin rectangles spanning the drawn width.
1159
1221
  */
1160
1222
  private drawStyledSpans;
1223
+ /**
1224
+ * Resolve the font to redraw an edit with, reporting whether it is a
1225
+ * substitute (`substituted: true`) or the document's own embedded font.
1226
+ *
1227
+ * Option A (keep-original-when-it-fits): prefer the document's embedded font
1228
+ * so an edit keeps the original typeface whenever that font can encode the
1229
+ * typed text — normal-letter edits no longer drift to Helvetica/Times. Fall
1230
+ * back to a standard look-alike (mapped from the PDF font name so weight/
1231
+ * style are kept) only when the embedded subset can't encode the text (e.g.
1232
+ * a brand-new glyph it never contained), which also avoids dropped/garbled
1233
+ * characters. Standard fonts encode with simple WinAnsi and round-trip
1234
+ * reliably through our own renderer.
1235
+ *
1236
+ * The embedded-font check (`fontSupportsText`) layout-probes the text — it
1237
+ * runs fontkit's full OpenType layout, not just cmap/encode — so a subset
1238
+ * font with pruned glyphs (or a GSUB ligature referencing a pruned glyph)
1239
+ * is rejected here and we fall through to the standard look-alike instead of
1240
+ * crashing later inside `drawText`/`widthOfTextAtSize` ("...reading
1241
+ * 'advanceWidth'"). The draw/measure call sites are additionally guarded.
1242
+ */
1161
1243
  private getRenderableFont;
1162
1244
  private supportedCodePoints;
1163
1245
  private unsupportedCount;
@@ -1169,7 +1251,31 @@ declare class PDFEditSession {
1169
1251
  */
1170
1252
  private fitTextToFont;
1171
1253
  private fontSupportsText;
1254
+ /**
1255
+ * True if fontkit can actually lay `text` out with `font`. This runs the
1256
+ * same code path that `drawText`/`widthOfTextAtSize` use, so it catches
1257
+ * subset fonts with GSUB holes that pass the cheaper cmap/encode checks.
1258
+ */
1259
+ private fontCanLayout;
1260
+ /** Measure text width, returning 0 if fontkit layout fails. */
1261
+ private safeWidth;
1262
+ /**
1263
+ * Draw text, falling back to `fallbackFont` if the chosen font throws inside
1264
+ * fontkit. The `fontSupportsText` probe should already prevent this, but a
1265
+ * font layout failure must never escape and fail the whole edit pipeline.
1266
+ * Returns the font actually used so callers can measure with it.
1267
+ */
1268
+ private safeDrawText;
1172
1269
  private getEmbeddedFont;
1270
+ /**
1271
+ * True if `fontBytes` can be embedded and a document containing it saved
1272
+ * without throwing. Run on a throwaway document so a font that crashes
1273
+ * pdf-lib's embedder (e.g. missing name-table records, or a glyph the width
1274
+ * table can't resolve) is rejected here — before it can poison `this.pdfDoc`
1275
+ * and crash `toBlob()`. Cheap and done once per font (the result is cached
1276
+ * via the embeddedFontCache promise).
1277
+ */
1278
+ private embedSurvivesSave;
1173
1279
  private getEmbeddedFontPrograms;
1174
1280
  private ensureFontkitRegistered;
1175
1281
  }
@@ -1255,11 +1361,19 @@ declare class PDFEditProcessor implements IAvniProcessor {
1255
1361
  process(doc: AvniDocument, params: Record<string, unknown>): Promise<ProcessorResult>;
1256
1362
  }
1257
1363
 
1364
+ /**
1365
+ * Pipeline wrapper around the client-side compressor (`src/engine/compress`).
1366
+ *
1367
+ * The real work — decode-any-image → downsample → JPEG re-encode, plus metadata
1368
+ * strip / content-stream Flate / object streams — lives in `compressPdf`. This
1369
+ * processor just adapts it to the `IAvniProcessor` contract so the orchestrator
1370
+ * (and `test-engine`) can drive it. `params.level` accepts a preset id or the
1371
+ * legacy numeric 0–100 value.
1372
+ */
1258
1373
  declare class PDFCompressor implements IAvniProcessor {
1259
1374
  name: string;
1260
1375
  supportedTypes: "pdf"[];
1261
1376
  process(doc: AvniDocument, params: Record<string, unknown>): Promise<ProcessorResult>;
1262
- private compressImageInBrowser;
1263
1377
  }
1264
1378
 
1265
1379
  /** Shared license/activation contracts used by both the engine gate and the server. */