npm - @strav/pdf - Versions diffs - 0.4.17 → 0.4.18 - Mend

@strav/pdf 0.4.17 → 0.4.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

package/README.md +25 -7
package/package.json +5 -3
package/src/index.ts +10 -0
package/src/reader/cmap_parser.ts +173 -0
package/src/reader/decrypt.ts +226 -0
package/src/reader/document.ts +246 -0
package/src/reader/encodings.ts +73 -0
package/src/reader/extract.ts +152 -0
package/src/reader/fonts.ts +259 -0
package/src/reader/index.ts +27 -0
package/src/reader/layout.ts +106 -0
package/src/reader/lexer.ts +270 -0
package/src/reader/object_parser.ts +203 -0
package/src/reader/objstm.ts +44 -0
package/src/reader/text_interpreter.ts +327 -0
package/src/reader/xref.ts +229 -0
package/src/streams/decode.ts +98 -0
package/src/streams/flate.ts +94 -4
package/src/streams/index.ts +6 -1
package/src/streams/lzw.ts +74 -0
package/src/streams/runlength.ts +25 -0
package/src/util/errors.ts +20 -0

package/src/reader/document.ts ADDED Viewed

@@ -0,0 +1,246 @@
+/**
+ * Read-side document model. Owns the byte buffer, the merged xref table and
+ * the (optional) decryptor; resolves indirect objects with a cache and a
+ * cycle guard; materialises compressed object streams lazily; and walks the
+ * page tree with attribute inheritance (spec §7.7).
+ */
+import {
+  type PdfObject,
+  type PdfDictionary,
+  type PdfStream,
+  isRef,
+  isDict,
+  isStream,
+  isArr,
+  isName,
+  isNum,
+  isStr,
+} from '../objects/types.ts'
+import { PdfParseError, EncryptedPdfError } from '../util/errors.ts'
+import { decodeStream } from '../streams/decode.ts'
+import { Lexer } from './lexer.ts'
+import { ObjectParser } from './object_parser.ts'
+import { parseXref, bruteForceXref, type XrefTable } from './xref.ts'
+import { parseObjStm } from './objstm.ts'
+import { buildDecryptor, type Decryptor } from './decrypt.ts'
/** Page-tree attributes a leaf page takes from its ancestors when it lacks its own. */
const INHERITED = ['Resources', 'MediaBox', 'CropBox', 'Rotate'] as const

/**
 * Read-side document model: holds the file bytes, the cross-reference table
 * and (for encrypted files) the decryptor, and exposes object resolution,
 * stream decoding and page-tree traversal on top of them.
 */
export class PdfReaderDocument {
  /** Cross-reference table from `parseXref`, or from `bruteForceXref` as a fallback. */
  readonly xref: XrefTable
  /** Resolved indirect objects, keyed by object number. */
  private readonly cache = new Map<number, PdfObject>()
  /** Parsed object-stream contents, keyed by the object number of the stream. */
  private readonly objStmCache = new Map<number, Map<number, PdfObject>>()
  /** Object numbers whose resolution is currently in progress (cycle guard). */
  private readonly resolving = new Set<number>()
  /** Present only when the trailer carries a usable `/Encrypt` dictionary. */
  private readonly decryptor?: Decryptor

  /**
   * @param buf  Complete PDF file contents.
   * @param opts `password` must be absent or empty.
   * @throws EncryptedPdfError when a non-empty password is supplied.
   */
  constructor(
    readonly buf: Uint8Array,
    opts: { password?: string } = {},
  ) {
    if (opts.password) {
      // M13 only validates the empty user password.
      throw new EncryptedPdfError(
        'Password-protected PDFs are not supported (empty password only)',
      )
    }
    let xref: XrefTable
    try {
      xref = parseXref(buf)
      // A table without /Root cannot lead to the catalog; retry with the fallback.
      if (!xref.trailer.entries.has('Root')) xref = bruteForceXref(buf)
    } catch {
      xref = bruteForceXref(buf)
    }
    this.xref = xref
    const encEntry = xref.trailer.entries.get('Encrypt')
    if (encEntry) {
      // -1 marks an inline /Encrypt dictionary (no object number to exempt).
      const encNum = isRef(encEntry) ? encEntry.num : -1
      const encDict = this.resolve(encEntry)
      const idArr = xref.trailer.entries.get('ID')
      const idFirst =
        idArr && isArr(idArr) && idArr.items[0] && isStr(idArr.items[0]!)
          ? idArr.items[0]!.value
          : new Uint8Array(0)
      if (encDict && isDict(encDict)) {
        this.decryptor = buildDecryptor(encDict, idFirst, encNum)
      }
    }
  }

  // ── Object resolution ────────────────────────────────────────────────────

  /**
   * Fetch an indirect object by number or by reference.
   *
   * A non-reference object is returned unchanged. Unknown object numbers,
   * objects that fail to parse with `PdfParseError`, and objects that are
   * reached again while their own resolution is still running all yield a
   * `null` object.
   *
   * @param numOrRef Object number, a reference, or any other object.
   * @param gen      Generation used when the xref entry does not carry one;
   *                 ignored when `numOrRef` is a reference.
   */
  getObject(numOrRef: number | PdfObject, gen = 0): PdfObject {
    let num: number
    let g = gen
    if (typeof numOrRef === 'number') num = numOrRef
    else if (isRef(numOrRef)) {
      num = numOrRef.num
      g = numOrRef.gen
    } else return numOrRef
    const cached = this.cache.get(num)
    if (cached) return cached
    const entry = this.xref.entries.get(num)
    if (!entry) return { kind: 'null' }
    // Cycle guard: a stream whose /Length points back at itself, or an object
    // stream that claims to be stored inside itself, would otherwise recurse
    // until the stack overflows.
    if (this.resolving.has(num)) return { kind: 'null' }
    this.resolving.add(num)
    let value: PdfObject
    try {
      if (entry.type === 'n') {
        try {
          const parser = new ObjectParser(new Lexer(this.buf, entry.offset), (o) =>
            this.toNumber(o),
          )
          const parsed = parser.parseIndirectAt(entry.offset)
          value = parsed.value
          // The encryption dictionary itself is never encrypted.
          if (this.decryptor && num !== this.decryptor.encryptObjNum) {
            value = this.decryptDeep(value, num, entry.gen ?? g)
          }
        } catch (e) {
          // Parse failures are not cached, so a later call retries.
          if (e instanceof PdfParseError) return { kind: 'null' }
          throw e
        }
      } else {
        value = this.fromObjStm(entry.streamObj, entry.index, num)
      }
    } finally {
      this.resolving.delete(num)
    }
    this.cache.set(num, value)
    return value
  }

  /** Dereference one level (ref → object); pass-through otherwise. */
  resolve(o: PdfObject | undefined): PdfObject | undefined {
    if (!o) return undefined
    return isRef(o) ? this.getObject(o) : o
  }

  /** Numeric value of `o` after one dereference, or `undefined` if it is not a number. */
  private toNumber(o: PdfObject): number | undefined {
    const r = this.resolve(o)
    return r && isNum(r) ? r.value : undefined
  }

  /**
   * Look up object `want` inside object stream `streamObj`, parsing and
   * caching the stream's contents on first use. Yields a `null` object when
   * the container is not a stream or does not hold `want`.
   */
  private fromObjStm(streamObj: number, index: number, want: number): PdfObject {
    let contents = this.objStmCache.get(streamObj)
    if (!contents) {
      const stm = this.getObject(streamObj)
      if (!isStream(stm)) return { kind: 'null' }
      const data = this.getStreamData(stm, streamObj)
      contents = parseObjStm(stm.dict, data).objects
      this.objStmCache.set(streamObj, contents)
    }
    return contents.get(want) ?? { kind: 'null' }
  }

  // ── Streams ──────────────────────────────────────────────────────────────

  /**
   * Decrypt (if needed) then run the filter chain.
   *
   * @param stream Stream whose payload is wanted.
   * @param objNum Object number of the stream, used for decryption.
   * @param gen    Generation number of the stream, used for decryption.
   */
  getStreamData(stream: PdfStream, objNum: number, gen = 0): Uint8Array {
    let raw = stream.data
    const type = stream.dict.entries.get('Type')
    const isXref = type && isName(type) && type.value === 'XRef'
    // Cross-reference streams and the encryption dictionary are skipped.
    if (this.decryptor && !isXref && objNum !== this.decryptor.encryptObjNum) {
      raw = this.decryptor.decrypt(objNum, gen, raw, false)
    }
    return decodeStream(stream.dict, raw, (o) => this.resolve(o))
  }

  /**
   * Return a copy of `o` in which every string, at any depth of arrays,
   * dictionaries and stream dictionaries, has been decrypted for object
   * `num`/`gen`. Stream payloads are left as they are.
   */
  private decryptDeep(o: PdfObject, num: number, gen: number): PdfObject {
    const d = this.decryptor!
    const walk = (x: PdfObject): PdfObject => {
      if (x.kind === 'str') {
        return { ...x, value: d.decrypt(num, gen, x.value, true) }
      }
      if (x.kind === 'arr') return { kind: 'arr', items: x.items.map(walk) }
      if (x.kind === 'dict') {
        const e = new Map<string, PdfObject>()
        for (const [k, v] of x.entries) e.set(k, walk(v))
        return { kind: 'dict', entries: e }
      }
      if (x.kind === 'stream') {
        const e = new Map<string, PdfObject>()
        for (const [k, v] of x.dict.entries) e.set(k, walk(v))
        return { kind: 'stream', dict: { kind: 'dict', entries: e }, data: x.data }
      }
      return x
    }
    return walk(o)
  }

  // ── Catalog / pages ──────────────────────────────────────────────────────

  /** The trailer dictionary of the cross-reference table. */
  get trailer(): PdfDictionary {
    return this.xref.trailer
  }

  /**
   * The document catalog (`/Root`).
   * @throws PdfParseError when `/Root` is missing or not a dictionary.
   */
  catalog(): PdfDictionary {
    const root = this.resolve(this.trailer.entries.get('Root'))
    if (!root || !isDict(root)) throw new PdfParseError('Missing document catalog')
    return root
  }

  /**
   * Leaf page dictionaries in document order, with inherited attributes.
   * @throws PdfParseError when the catalog or the page-tree root is missing.
   */
  pages(): PdfDictionary[] {
    const out: PdfDictionary[] = []
    // Nodes already visited, so a page tree that loops back terminates.
    const seen = new Set<PdfObject>()
    const root = this.resolve(this.catalog().entries.get('Pages'))
    if (!root || !isDict(root)) throw new PdfParseError('Missing page tree root')
    const visit = (nodeRef: PdfObject | undefined, inherited: Map<string, PdfObject>) => {
      const node = this.resolve(nodeRef)
      if (!node || !isDict(node) || seen.has(node)) return
      seen.add(node)
      const merged = new Map(inherited)
      for (const key of INHERITED) {
        const v = node.entries.get(key)
        if (v) merged.set(key, v)
      }
      const type = node.entries.get('Type')
      const kids = node.entries.get('Kids')
      if (kids && isArr(kids)) {
        for (const kid of kids.items) visit(kid, merged)
      } else if (!type || (isName(type) && type.value === 'Page') || node.entries.has('Contents')) {
        // The node's own entries win over anything inherited.
        const leaf = new Map(merged)
        for (const [k, v] of node.entries) leaf.set(k, v)
        out.push({ kind: 'dict', entries: leaf })
      }
    }
    visit(root, new Map())
    return out
  }

  /**
   * Concatenated, decoded content-stream bytes for a page. A line feed is
   * appended after every stream so tokens of adjacent streams cannot merge.
   */
  pageContent(page: PdfDictionary): Uint8Array {
    const declared = page.entries.get('Contents')
    const c = this.resolve(declared)
    const sources: { stream: PdfStream; num: number; gen: number }[] = []
    const collect = (obj: PdfObject | undefined, ref?: PdfObject) => {
      const r = this.resolve(obj)
      if (r && isStream(r)) {
        // Keep the generation as well as the number: both feed the
        // per-object decryption key.
        let num = -1
        let gen = 0
        if (ref && isRef(ref)) {
          num = ref.num
          gen = ref.gen
        }
        sources.push({ stream: r, num, gen })
      }
    }
    if (c && isArr(c)) {
      for (const it of c.items) collect(it, it)
    } else {
      collect(c, declared)
    }
    const parts: Uint8Array[] = []
    for (const src of sources) {
      parts.push(this.getStreamData(src.stream, src.num, src.gen))
      parts.push(Uint8Array.of(0x0a))
    }
    const total = parts.reduce((a, p) => a + p.length, 0)
    const out = new Uint8Array(total)
    let o = 0
    for (const p of parts) {
      out.set(p, o)
      o += p.length
    }
    return out
  }

  /** True when a decryptor was built or the trailer has an `/Encrypt` entry. */
  get encrypted(): boolean {
    return this.decryptor !== undefined || this.trailer.entries.has('Encrypt')
  }
}

package/src/reader/encodings.ts ADDED Viewed

@@ -0,0 +1,73 @@
+/**
+ * Single-byte text encodings (spec §D) for the read side, plus a glyph-name →
+ * Unicode resolver (Adobe Glyph List subset + the algorithmic `uniXXXX` /
+ * `uXXXXXX` forms). Used when a simple font has no `/ToUnicode`: the base
+ * encoding maps code → glyph name → Unicode.
+ *
+ * WinAnsi is implemented exactly (it is what the writer emits for Standard-14
+ * and the common case for simple fonts). The other base encodings are decoded
+ * as WinAnsi, which is only an approximation: PDFDoc differs from it in
+ * 0x18–0x1F and 0x80–0xA0, while Standard and MacRoman assign different
+ * glyphs to most of 0x80–0xFF (accented letters included). Those differences
+ * are approximated and documented as a v1 limitation.
+ */
// Windows-1252 assigns printable characters to part of 0x80–0x9F, the one
// range where it departs from Latin-1. Codes missing from this table are
// unassigned and fall through unchanged.
const WIN_HIGH = new Map<number, number>([
  [0x80, 0x20ac], [0x82, 0x201a], [0x83, 0x0192], [0x84, 0x201e], [0x85, 0x2026],
  [0x86, 0x2020], [0x87, 0x2021], [0x88, 0x02c6], [0x89, 0x2030], [0x8a, 0x0160],
  [0x8b, 0x2039], [0x8c, 0x0152], [0x8e, 0x017d], [0x91, 0x2018], [0x92, 0x2019],
  [0x93, 0x201c], [0x94, 0x201d], [0x95, 0x2022], [0x96, 0x2013], [0x97, 0x2014],
  [0x98, 0x02dc], [0x99, 0x2122], [0x9a, 0x0161], [0x9b, 0x203a], [0x9c, 0x0153],
  [0x9e, 0x017e], [0x9f, 0x0178],
])

/**
 * Translate a WinAnsi (Windows-1252) character code to a Unicode code point.
 *
 * @param code Character code.
 * @returns The table entry for codes in 0x80–0x9F that have one; otherwise
 *          `code` itself.
 */
export function winAnsiToUnicode(code: number): number {
  const inRemappedRange = code >= 0x80 && code <= 0x9f
  if (!inRemappedRange) return code // ASCII + Latin-1 are identity
  const mapped = WIN_HIGH.get(code)
  return mapped ?? code
}
/** Names a font's `/Encoding` or `/BaseEncoding` entry may carry. */
export type BaseEncodingName =
  | 'WinAnsiEncoding'
  | 'MacRomanEncoding'
  | 'StandardEncoding'
  | 'PDFDocEncoding'
  | 'MacExpertEncoding'

/**
 * Resolve a character code under a named base encoding.
 *
 * Every encoding is currently decoded through the WinAnsi table, so `name`
 * does not influence the result; the answer is exact for WinAnsi and an
 * approximation for the rest.
 *
 * @param name Base encoding declared by the font, if any.
 * @param code Character code.
 * @returns Unicode code point for `code`.
 */
export function baseEncode(name: BaseEncodingName | undefined, code: number): number {
  // Glyph-name /Differences are applied by the caller and take precedence
  // over whatever this returns.
  const unicode = winAnsiToUnicode(code)
  return unicode
}
// A pragmatic Adobe Glyph List subset: ASCII punctuation and digits plus the
// common typographic names producers emit via /Differences. Single-letter
// names (A–Z, a–z) are handled in glyphNameToUnicode. Extend as needed.
const AGL: Record<string, number> = {
  space: 0x20, exclam: 0x21, quotedbl: 0x22, numbersign: 0x23, dollar: 0x24,
  percent: 0x25, ampersand: 0x26, quotesingle: 0x27, parenleft: 0x28,
  parenright: 0x29, asterisk: 0x2a, plus: 0x2b, comma: 0x2c, hyphen: 0x2d,
  period: 0x2e, slash: 0x2f, zero: 0x30, one: 0x31, two: 0x32, three: 0x33,
  four: 0x34, five: 0x35, six: 0x36, seven: 0x37, eight: 0x38, nine: 0x39,
  colon: 0x3a, semicolon: 0x3b, less: 0x3c, equal: 0x3d, greater: 0x3e,
  question: 0x3f, at: 0x40, bracketleft: 0x5b, backslash: 0x5c,
  bracketright: 0x5d, asciicircum: 0x5e, underscore: 0x5f, grave: 0x60,
  braceleft: 0x7b, bar: 0x7c, braceright: 0x7d, asciitilde: 0x7e,
  bullet: 0x2022, endash: 0x2013, emdash: 0x2014, quoteleft: 0x2018,
  quoteright: 0x2019, quotedblleft: 0x201c, quotedblright: 0x201d,
  quotesinglbase: 0x201a, quotedblbase: 0x201e, ellipsis: 0x2026,
  dagger: 0x2020, daggerdbl: 0x2021, perthousand: 0x2030, trademark: 0x2122,
  fi: 0xfb01, fl: 0xfb02, florin: 0x192, Euro: 0x20ac, nbspace: 0xa0,
}

/**
 * Map a glyph name to a Unicode code point.
 *
 * Resolution order: the AGL subset above, the single-letter names `A`–`Z` /
 * `a`–`z` (which the Adobe Glyph List maps to the letters themselves), then
 * the algorithmic `uniXXXX` and `uXXXX`–`uXXXXXX` forms.
 *
 * @param g Glyph name without the leading slash.
 * @returns The code point, or -1 if the name is unknown or encodes a value
 *          that is not a Unicode scalar (a surrogate, or above U+10FFFF).
 */
export function glyphNameToUnicode(g: string): number {
  // Own properties only: `in` also matches inherited members such as
  // "constructor" or "toString" and would return a function as if it were
  // a code point.
  if (Object.prototype.hasOwnProperty.call(AGL, g)) return AGL[g]!
  if (/^[A-Za-z]$/.test(g)) return g.charCodeAt(0)
  // Reject surrogates and out-of-range values so callers can pass the
  // result to String.fromCodePoint safely.
  const scalar = (hex: string): number => {
    const cp = parseInt(hex, 16)
    return cp > 0x10ffff || (cp >= 0xd800 && cp <= 0xdfff) ? -1 : cp
  }
  let m = /^uni([0-9A-Fa-f]{4})$/.exec(g)
  if (m) return scalar(m[1]!)
  m = /^u([0-9A-Fa-f]{4,6})$/.exec(g)
  if (m) return scalar(m[1]!)
  // "gNN" / "cidNN" / "indexNN": no Unicode information available.
  return -1
}

package/src/reader/extract.ts ADDED Viewed

@@ -0,0 +1,152 @@
+/**
+ * Public read-side API (M13): layout-aware plain-text extraction from an
+ * existing PDF. `extractText` is the headline ergonomic entry point;
+ * `PdfReader` is a reusable handle for lazy/repeated page access.
+ *
+ * Scope: text content only. No OCR (scanned/image-only pages yield no text),
+ * no column/table reconstruction, no annotation/form-field values, empty
+ * user password only.
+ */
+import { type PdfDictionary, isDict, isStr } from '../objects/types.ts'
+import { PdfReaderDocument } from './document.ts'
+import { interpretText } from './text_interpreter.ts'
+import { runsToText } from './layout.ts'
/** Options accepted by `extractText` and `PdfReader.extractText`. */
export interface ExtractOptions {
  /**
   * 1-based pages; default all. May be a single page, a list of pages, or an
   * inclusive range whose missing ends default to the first / last page.
   */
  pages?: number | number[] | { from?: number; to?: number }
  /** Collapse whitespace and trim. Default true. */
  normalizeWhitespace?: boolean
  /** Only the empty password is supported; non-empty throws. */
  password?: string
}
/** Text extracted from one page. */
export interface ExtractedPage {
  /** 1-based page number. */
  number: number
  /** Extracted text of the page. */
  text: string
}
/**
 * Document-level metadata. The optional string fields are read from the
 * trailer's `/Info` dictionary and are absent or undefined when the
 * corresponding entry is missing or not a string.
 */
export interface PdfInfo {
  title?: string
  author?: string
  subject?: string
  keywords?: string
  creator?: string
  producer?: string
  /** Decoded `/CreationDate` string, not parsed into a date. */
  creationDate?: string
  /** Decoded `/ModDate` string, not parsed into a date. */
  modDate?: string
  /** Number of leaf pages found in the page tree. */
  pageCount: number
  /** Whether the document declares encryption. */
  encrypted: boolean
}
/** Result of a text extraction run. */
export interface ExtractResult {
  /** One entry per selected page, in the order the pages were selected. */
  pages: ExtractedPage[]
  /** Page texts joined by the form-feed page separator. */
  text: string
  /** Document metadata. */
  info: PdfInfo
}
/**
 * View the input as a `Uint8Array`: an existing `Uint8Array` is returned
 * as-is, an `ArrayBuffer` is wrapped without copying.
 */
function toU8(b: Uint8Array | ArrayBuffer): Uint8Array {
  if (b instanceof Uint8Array) {
    return b
  }
  return new Uint8Array(b)
}
/**
 * Decode a PDF text string. Bytes that start with the UTF-16BE byte-order
 * mark (FE FF) are read as big-endian 16-bit code units, with a trailing odd
 * byte dropped; anything else is read one byte per character (Latin-1, as an
 * approximation of PDFDocEncoding).
 */
function decodeTextString(bytes: Uint8Array): string {
  const hasUtf16Bom = bytes.length >= 2 && bytes[0] === 0xfe && bytes[1] === 0xff
  const units: number[] = []
  if (hasUtf16Bom) {
    // Skip the two BOM bytes, then pair up high/low bytes.
    for (let hi = 2; hi + 1 < bytes.length; hi += 2) {
      units.push((bytes[hi]! << 8) | bytes[hi + 1]!)
    }
  } else {
    for (let i = 0; i < bytes.length; i++) units.push(bytes[i]!)
  }
  return units.map((unit) => String.fromCharCode(unit)).join('')
}
/**
 * Reusable handle on an opened PDF. The page list is built once at
 * construction; page text is computed on every request.
 */
export class PdfReader {
  private readonly doc: PdfReaderDocument
  private readonly pageList: PdfDictionary[]

  private constructor(bytes: Uint8Array, opts: { password?: string }) {
    const document = new PdfReaderDocument(bytes, opts)
    this.doc = document
    this.pageList = document.pages()
  }

  /**
   * Open a PDF from memory.
   *
   * @param bytes Complete file contents.
   * @param opts  `password` is forwarded to the document model.
   */
  static async open(
    bytes: Uint8Array | ArrayBuffer,
    opts: { password?: string } = {},
  ): Promise<PdfReader> {
    const data = toU8(bytes)
    return new PdfReader(data, opts)
  }

  /** Number of pages in the document. */
  get pageCount(): number {
    return this.pageList.length
  }

  /** Whether the underlying document reports itself as encrypted. */
  get encrypted(): boolean {
    return this.doc.encrypted
  }

  /** Metadata from the trailer's `/Info` dictionary plus page count and encryption flag. */
  get info(): PdfInfo {
    const summary: PdfInfo = { pageCount: this.pageCount, encrypted: this.encrypted }
    const dict = this.doc.resolve(this.doc.trailer.entries.get('Info'))
    if (!dict || !isDict(dict)) return summary

    // Decoded value of a string entry; undefined for anything else.
    const textOf = (key: string): string | undefined => {
      const entry = this.doc.resolve(dict.entries.get(key))
      if (!entry || !isStr(entry)) return undefined
      return decodeTextString(entry.value)
    }

    summary.title = textOf('Title')
    summary.author = textOf('Author')
    summary.subject = textOf('Subject')
    summary.keywords = textOf('Keywords')
    summary.creator = textOf('Creator')
    summary.producer = textOf('Producer')
    summary.creationDate = textOf('CreationDate')
    summary.modDate = textOf('ModDate')
    return summary
  }

  /**
   * Text of a single page.
   *
   * @param pageNumber 1-based page number.
   * @param opts       `normalizeWhitespace` defaults to true.
   * @returns The page text, or `''` when no such page exists.
   */
  pageText(pageNumber: number, opts: { normalizeWhitespace?: boolean } = {}): string {
    const page = this.pageList[pageNumber - 1]
    if (!page) return ''

    const resources = this.doc.resolve(page.entries.get('Resources'))
    const content = this.doc.pageContent(page)
    const resourceDict = resources && isDict(resources) ? resources : undefined
    const runs = interpretText(content, resourceDict, this.doc)
    const normalize = opts.normalizeWhitespace ?? true
    return runsToText(runs, normalize)
  }

  /** Extract the selected pages and join their text with form feeds. */
  extractText(opts: ExtractOptions = {}): ExtractResult {
    const selected = selectPages(opts.pages, this.pageCount)
    const normalizeWhitespace = opts.normalizeWhitespace ?? true
    const pages: ExtractedPage[] = []
    for (const number of selected) {
      pages.push({ number, text: this.pageText(number, { normalizeWhitespace }) })
    }
    const text = pages.map((entry) => entry.text).join('\f')
    return { pages, text, info: this.info }
  }
}
/**
 * Turn a page selection into a list of valid 1-based page numbers.
 *
 * @param spec  Nothing (all pages), one page, a list of pages, or an
 *              inclusive `{ from, to }` range.
 * @param count Number of pages in the document.
 * @returns Page numbers in selection order. Entries that are not whole
 *          numbers within `1..count` are dropped; fractional range bounds
 *          are tightened to the whole pages they enclose.
 */
function selectPages(
  spec: ExtractOptions['pages'],
  count: number,
): number[] {
  // A fractional "page" would index nothing and surface as a bogus empty page.
  const isValidPage = (n: number): boolean => Number.isInteger(n) && n >= 1 && n <= count
  // `== null` also covers an explicit null from untyped callers.
  if (spec == null) return Array.from({ length: count }, (_, i) => i + 1)
  if (typeof spec === 'number') return isValidPage(spec) ? [spec] : []
  if (Array.isArray(spec)) return spec.filter((n) => isValidPage(n))
  const from = Math.max(1, Math.ceil(spec.from ?? 1))
  const to = Math.min(count, Math.floor(spec.to ?? count))
  const out: number[] = []
  for (let n = from; n <= to; n++) out.push(n)
  return out
}
/**
 * One-shot extraction: open the PDF, extract the requested pages, return the
 * result.
 *
 * @param bytes Complete file contents.
 * @param opts  Page selection, whitespace handling and password.
 */
export async function extractText(
  bytes: Uint8Array | ArrayBuffer,
  opts: ExtractOptions = {},
): Promise<ExtractResult> {
  const { password } = opts
  const reader = await PdfReader.open(bytes, { password })
  const result = reader.extractText(opts)
  return result
}