npm - @strav/pdf - Versions diffs - 0.4.17 → 0.4.18 - Mend

@strav/pdf 0.4.17 → 0.4.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

package/README.md +25 -7
package/package.json +5 -3
package/src/index.ts +10 -0
package/src/reader/cmap_parser.ts +173 -0
package/src/reader/decrypt.ts +226 -0
package/src/reader/document.ts +246 -0
package/src/reader/encodings.ts +73 -0
package/src/reader/extract.ts +152 -0
package/src/reader/fonts.ts +259 -0
package/src/reader/index.ts +27 -0
package/src/reader/layout.ts +106 -0
package/src/reader/lexer.ts +270 -0
package/src/reader/object_parser.ts +203 -0
package/src/reader/objstm.ts +44 -0
package/src/reader/text_interpreter.ts +327 -0
package/src/reader/xref.ts +229 -0
package/src/streams/decode.ts +98 -0
package/src/streams/flate.ts +94 -4
package/src/streams/index.ts +6 -1
package/src/streams/lzw.ts +74 -0
package/src/streams/runlength.ts +25 -0
package/src/util/errors.ts +20 -0

package/src/reader/object_parser.ts ADDED Viewed

@@ -0,0 +1,203 @@
+/**
+ * Recursive-descent parser: token stream → {@link PdfObject} (spec §7.3).
+ *
+ * Handles the two-number lookahead for indirect references (`n g R`) and
+ * indirect object bodies (`n g obj … endobj`), including `stream`/`endstream`
+ * whose raw bytes are sliced by the resolved `/Length` (with a scan-to-
+ * `endstream` fallback for the wrong/indirect lengths real files contain).
+ */
+import {
+  type PdfObject,
+  type PdfDictionary,
+  bool,
+  num,
+  name,
+  arr,
+  ref,
+  dict as makeDict,
+  NULL,
+  isNum,
+  isDict,
+} from '../objects/types.ts'
+import { PdfParseError } from '../util/errors.ts'
+import { Lexer, latin1, type Token } from './lexer.ts'
+/**
+ * Resolve an object to a plain number (for indirect `/Length`).
+ *
+ * The parser consults this only when a stream's `/Length` entry exists but is
+ * not a direct number. Returning `undefined` makes the parser locate the end
+ * of the stream data by scanning for the `endstream` keyword instead.
+ */
+export type LengthResolver = (o: PdfObject) => number | undefined
+/**
+ * Recursive-descent parser that turns the lexer's token stream into
+ * {@link PdfObject} values, including indirect references and stream bodies.
+ */
+export class ObjectParser {
+  /**
+   * @param lex Token source. Stream bodies are read straight from `lex.buf`,
+   *   so the parser also moves `lex.pos` directly, not only through `next()`.
+   * @param resolveLength Optional hook consulted when a stream's `/Length` is
+   *   present but is not a direct number.
+   */
+  constructor(
+    readonly lex: Lexer,
+    private readonly resolveLength?: LengthResolver,
+  ) {}
+  /**
+   * Parse the indirect object whose `N G obj` header starts at `offset`.
+   *
+   * @returns The object number, generation number and the parsed body value.
+   * @throws PdfParseError when the first three tokens are not `num num obj`.
+   */
+  parseIndirectAt(offset: number): { num: number; gen: number; value: PdfObject } {
+    this.lex.seek(offset)
+    const n = this.lex.next()
+    const g = this.lex.next()
+    const obj = this.lex.next()
+    if (n.type !== 'num' || g.type !== 'num' || obj.type !== 'kw' || obj.value !== 'obj') {
+      throw new PdfParseError(`Expected "N G obj" at offset ${offset}`)
+    }
+    const value = this.parseObject()
+    return { num: n.value, gen: g.value, value }
+  }
+  /**
+   * Parse a single object value at the lexer's current position, resolving
+   * `n g R` references and dictionary-plus-stream bodies.
+   *
+   * @throws PdfParseError on end of input or an unexpected delimiter.
+   */
+  parseObject(): PdfObject {
+    const t = this.lex.next()
+    return this.parseFromToken(t)
+  }
+  /** Build the object that starts with the already-consumed token `t`. */
+  private parseFromToken(t: Token): PdfObject {
+    switch (t.type) {
+      case 'eof':
+        throw new PdfParseError('Unexpected end of input')
+      case 'num':
+        return this.parseNumberOrRef(t.value)
+      case 'str':
+        return { kind: 'str', value: t.value, encoding: t.encoding }
+      case 'name':
+        return name(t.value)
+      case 'kw':
+        if (t.value === 'true') return bool(true)
+        if (t.value === 'false') return bool(false)
+        if (t.value === 'null') return NULL
+        // Unknown bare keyword (e.g. "endobj", "R" out of place) — treat as null
+        return NULL
+      case 'delim':
+        if (t.value === '[') return this.parseArray()
+        if (t.value === '<<') return this.parseDictOrStream()
+        throw new PdfParseError(`Unexpected token "${t.value}"`)
+    }
+  }
+  /**
+   * Decide whether the number just read starts an indirect reference
+   * (`int int R`). If it does not, the lexer is rewound to just after the
+   * first number and a plain number object is returned.
+   */
+  private parseNumberOrRef(first: number): PdfObject {
+    const save = this.lex.pos
+    const t2 = this.lex.next()
+    if (t2.type === 'num' && Number.isInteger(first) && Number.isInteger(t2.value)) {
+      const t3 = this.lex.next()
+      if (t3.type === 'kw' && t3.value === 'R') {
+        return ref(first, t2.value)
+      }
+    }
+    this.lex.pos = save
+    return num(first)
+  }
+  /**
+   * Parse array items up to the closing `]` (the opening `[` is already consumed).
+   *
+   * @throws PdfParseError when the input ends before `]`.
+   */
+  private parseArray(): PdfObject {
+    const items: PdfObject[] = []
+    for (;;) {
+      const t = this.lex.next()
+      if (t.type === 'eof') throw new PdfParseError('Unterminated array')
+      if (t.type === 'delim' && t.value === ']') break
+      items.push(this.parseFromToken(t))
+    }
+    return arr(items)
+  }
+  /**
+   * Parse dictionary entries up to `>>` (the opening `<<` is already
+   * consumed). When the `stream` keyword follows the dictionary, the result
+   * is a stream object carrying the raw body bytes.
+   *
+   * @throws PdfParseError when the input ends before `>>`.
+   */
+  private parseDictOrStream(): PdfObject {
+    const d = makeDict()
+    for (;;) {
+      const t = this.lex.next()
+      if (t.type === 'eof') throw new PdfParseError('Unterminated dictionary')
+      if (t.type === 'delim' && t.value === '>>') break
+      if (t.type !== 'name') {
+        // Tolerate garbage where a key is expected: drop the stray token and
+        // keep looking for the next name.
+        continue
+      }
+      const value = this.parseObject()
+      d.entries.set(t.value, value)
+    }
+    // A `stream` keyword immediately following the dict makes this a stream.
+    const save = this.lex.pos
+    this.lex.skipWs()
+    if (this.matchKeyword('stream')) {
+      return this.readStreamBody(d)
+    }
+    this.lex.pos = save
+    return d
+  }
+  /**
+   * Compare the raw bytes at the lexer position with `kw`. On a match the
+   * position is advanced past the keyword; otherwise it is left unchanged.
+   */
+  private matchKeyword(kw: string): boolean {
+    const b = this.lex.buf
+    const p = this.lex.pos
+    for (let i = 0; i < kw.length; i++) {
+      if (b[p + i] !== kw.charCodeAt(i)) return false
+    }
+    this.lex.pos = p + kw.length
+    return true
+  }
+  /**
+   * Slice the stream data that follows the `stream` keyword and leave the
+   * lexer just after `endstream`.
+   *
+   * The declared `/Length` is trusted only when it is a non-negative integer
+   * and `endstream` really follows at that offset; in every other case the
+   * data end is found by scanning for the keyword.
+   */
+  private readStreamBody(d: PdfDictionary): PdfObject {
+    const b = this.lex.buf
+    // After "stream": CRLF or LF (spec §7.3.8.1). A lone CR is tolerated.
+    if (b[this.lex.pos] === 0x0d && b[this.lex.pos + 1] === 0x0a) this.lex.pos += 2
+    else if (b[this.lex.pos] === 0x0a || b[this.lex.pos] === 0x0d) this.lex.pos += 1
+    const start = this.lex.pos
+    let len = -1
+    const lenObj = d.entries.get('Length')
+    if (lenObj && isNum(lenObj)) len = lenObj.value
+    else if (lenObj && this.resolveLength) {
+      const r = this.resolveLength(lenObj)
+      if (typeof r === 'number') len = r
+    }
+    let end: number
+    // A fractional, negative or NaN length cannot be a byte offset.
+    if (Number.isInteger(len) && len >= 0 && this.looksLikeEndstream(start + len)) {
+      end = start + len
+    } else {
+      end = this.scanForEndstream(start)
+    }
+    const data = this.lex.slice(start, end)
+    // Skip past endstream for sequential callers.
+    this.lex.pos = end
+    this.skipUntilAfter('endstream')
+    return { kind: 'stream', dict: d, data }
+  }
+  /** True when `endstream` follows `at`, ignoring CR, LF, space and tab. */
+  private looksLikeEndstream(at: number): boolean {
+    const b = this.lex.buf
+    const needle = 'endstream'
+    let p = at
+    while (p < b.length && (b[p] === 0x0a || b[p] === 0x0d || b[p] === 0x20 || b[p] === 0x09)) p++
+    return latin1(b, p, p + needle.length) === needle
+  }
+  /**
+   * Find the end of the stream data by searching for `endstream` from
+   * `start`. Returns the buffer length when the keyword is never found.
+   */
+  private scanForEndstream(start: number): number {
+    const b = this.lex.buf
+    const needle = 'endstream'
+    for (let p = start; p <= b.length - needle.length; p++) {
+      if (b[p] === 0x65 && latin1(b, p, p + needle.length) === needle) {
+        // Trim a single trailing EOL that belongs to the keyword line, but
+        // never step back past `start`: for an empty body the bytes before
+        // the keyword are the EOL of the `stream` line, not stream data.
+        let e = p
+        if (e > start && b[e - 1] === 0x0a) e--
+        if (e > start && b[e - 1] === 0x0d) e--
+        return e
+      }
+    }
+    return b.length
+  }
+  /**
+   * Move the lexer to just after the next occurrence of `kw`, or to the end
+   * of the buffer when it does not occur.
+   */
+  private skipUntilAfter(kw: string): void {
+    const b = this.lex.buf
+    for (let p = this.lex.pos; p <= b.length - kw.length; p++) {
+      if (latin1(b, p, p + kw.length) === kw) {
+        this.lex.pos = p + kw.length
+        return
+      }
+    }
+    this.lex.pos = b.length
+  }
+}
+/** Shortcut: parse one object value straight from `buf`, starting at `offset`. */
+export function parseObjectFrom(buf: Uint8Array, offset = 0): PdfObject {
+  const lexer = new Lexer(buf, offset)
+  const parser = new ObjectParser(lexer)
+  return parser.parseObject()
+}
+export { isDict }

package/src/reader/objstm.ts ADDED Viewed

@@ -0,0 +1,44 @@
+/**
+ * Compressed object streams (`/Type /ObjStm`, spec §7.5.7). Header is `N`
+ * pairs `(objNum, byteOffset)`; the objects themselves start at `/First`.
+ * Objects inside an ObjStm may not be streams and may not be ObjStm
+ * themselves — the document layer enforces no ObjStm-in-ObjStm recursion.
+ */
+import { type PdfObject, isNum } from '../objects/types.ts'
+import { PdfParseError } from '../util/errors.ts'
+import { Lexer } from './lexer.ts'
+import { ObjectParser } from './object_parser.ts'
+export interface ObjStmContents {
+  /**
+   * objNum → parsed value, one entry per `(objNum, byteOffset)` header pair.
+   * If the header repeats an object number, the later pair's value wins.
+   */
+  objects: Map<number, PdfObject>
+}
+/**
+ * Parse the payload of an object stream into the objects it contains.
+ *
+ * The header — `/N` pairs of `objNum byteOffset` — is read from the start of
+ * `data`; each object is then parsed at `/First + byteOffset`.
+ *
+ * @param dict The stream dictionary; only `/N` and `/First` are read.
+ * @param data The stream payload (presumably already decoded by the caller).
+ * @returns Map from object number to parsed value.
+ * @throws PdfParseError when `/N` or `/First` is missing or is not a
+ *   non-negative integer, when a header pair is malformed, or when an object
+ *   body cannot be parsed.
+ */
+export function parseObjStm(dict: { entries: Map<string, PdfObject> }, data: Uint8Array): ObjStmContents {
+  const nObj = dict.entries.get('N')
+  const firstObj = dict.entries.get('First')
+  if (!nObj || !isNum(nObj) || !firstObj || !isNum(firstObj)) {
+    throw new PdfParseError('ObjStm missing /N or /First')
+  }
+  const n = nObj.value
+  const first = firstObj.value
+  // Both values come from the file and are used as a loop bound and a byte
+  // offset, so reject anything that is not a non-negative integer.
+  if (!Number.isInteger(n) || n < 0 || !Number.isInteger(first) || first < 0) {
+    throw new PdfParseError('ObjStm /N and /First must be non-negative integers')
+  }
+  const headerLex = new Lexer(data, 0)
+  const table: { num: number; off: number }[] = []
+  for (let i = 0; i < n; i++) {
+    const a = headerLex.next()
+    const b = headerLex.next()
+    if (a.type !== 'num' || b.type !== 'num') {
+      throw new PdfParseError('Malformed ObjStm header')
+    }
+    // A negative or fractional offset would start the object lexer outside
+    // the object area (or between bytes).
+    if (!Number.isInteger(a.value) || !Number.isInteger(b.value) || b.value < 0) {
+      throw new PdfParseError('Malformed ObjStm header')
+    }
+    table.push({ num: a.value, off: b.value })
+  }
+  const objects = new Map<number, PdfObject>()
+  for (const { num, off } of table) {
+    const parser = new ObjectParser(new Lexer(data, first + off))
+    objects.set(num, parser.parseObject())
+  }
+  return { objects }
+}

package/src/reader/text_interpreter.ts ADDED Viewed

@@ -0,0 +1,327 @@
+/**
+ * Content-stream text interpreter (spec §9.4). Executes the text-showing
+ * subset of operators against a graphics/text state and emits positioned
+ * glyph runs (device-space origin + advance + effective size). Non-text
+ * operators are skipped; `BI…ID…EI` inline images are byte-skipped so their
+ * binary payload never reaches the lexer.
+ */
+import { type PdfDictionary, isDict, isName, isStream } from '../objects/types.ts'
+import { Lexer, latin1, type Token } from './lexer.ts'
+import { buildCharMap, type CharMap } from './fonts.ts'
+import type { Run } from './layout.ts'
+/** 2×3 affine matrix [a b c d e f]; point (x,y) → (a x + c y + e, b x + d y + f). */
+type Mat = [number, number, number, number, number, number]
+const IDENT: Mat = [1, 0, 0, 1, 0, 0] // identity: maps every (x, y) to itself
+/** Matrix product `m × n`: the result applies `m` first, then `n`. */
+function mul(m: Mat, n: Mat): Mat {
+  const [ma, mb, mc, md, me, mf] = m
+  const [na, nb, nc, nd, ne, nf] = n
+  const a = ma * na + mb * nc
+  const b = ma * nb + mb * nd
+  const c = mc * na + md * nc
+  const d = mc * nb + md * nd
+  const e = me * na + mf * nc + ne
+  const f = me * nb + mf * nd + nf
+  return [a, b, c, d, e, f]
+}
+interface Doc { // the part of the document object this module relies on
+  resolve(o: any): any // called by lookupFont on dictionary entries before its isDict checks; presumably dereferences indirect refs — confirm
+  getStreamData(s: any, num: number): Uint8Array // not called directly in the code visible here; presumably needed by buildCharMap — confirm
+}
+interface TextState {
+  fontRes?: string // font resource name from the last Tf; undefined when Tf's operand was not a name
+  fontSize: number // size operand of the last Tf
+  charSpace: number // set by Tc and by the `"` operator
+  wordSpace: number // set by Tw and by the `"` operator
+  hScale: number // as a fraction (Tz / 100)
+  leading: number // set by TL, or to the negated y operand by TD
+  rise: number // set by Ts
+}
+/** Initial text state: every numeric parameter is 0 except horizontal scale (1); no font selected. */
+function freshTextState(): TextState {
+  const initial: TextState = {
+    fontSize: 0,
+    charSpace: 0,
+    wordSpace: 0,
+    hScale: 1,
+    leading: 0,
+    rise: 0,
+  }
+  return initial
+}
+/**
+ * Interpret the text operators of one content stream and collect glyph runs.
+ *
+ * Operands are buffered until a keyword token arrives; the keyword is then
+ * executed against the current transformation matrix, the text matrices and
+ * the text state. Operators outside the handled subset only discard their
+ * operands, and an operator with too few operands is ignored.
+ *
+ * @param content Content-stream bytes to tokenize.
+ * @param resources Resource dictionary whose `/Font` entry maps font resource
+ *   names to font dictionaries; without it no text is emitted.
+ * @param doc Resolver handed to the font lookup and to `buildCharMap`.
+ * @returns One run per shown string, in stream order.
+ */
+export function interpretText(
+  content: Uint8Array,
+  resources: PdfDictionary | undefined,
+  doc: Doc,
+): Run[] {
+  const runs: Run[] = []
+  const lex = new Lexer(content, 0)
+  let ctm: Mat = IDENT
+  let ts = freshTextState()
+  // q/Q save and restore the whole graphics state. The text-state parameters
+  // (Tc, Tw, Tz, TL, Tf, Ts) belong to it, so they are stacked together with
+  // the CTM. The text matrices are not: they only live between BT and ET.
+  const gsStack: { ctm: Mat; ts: TextState }[] = []
+  let tm: Mat = IDENT
+  let tlm: Mat = IDENT
+  const fontCache = new Map<string, CharMap | undefined>()
+  /** Char map for a font resource name, built once; failures are cached as `undefined`. */
+  const charMapFor = (res: string): CharMap | undefined => {
+    if (fontCache.has(res)) return fontCache.get(res)
+    const fd = lookupFont(resources, res, doc)
+    const cm = fd ? safe(() => buildCharMap(fd, doc)) : undefined
+    fontCache.set(res, cm)
+    return cm
+  }
+  const operands: any[] = []
+  /**
+   * Remove the last `k` operands and return them as numbers (non-numeric
+   * operands become 0). Returns `undefined`, leaving the operands untouched,
+   * when fewer than `k` are available, so a truncated operator is ignored
+   * instead of feeding `undefined` into a matrix or the text state.
+   */
+  const popNums = (k: number): number[] | undefined => {
+    if (operands.length < k) return undefined
+    return operands.splice(operands.length - k, k).map((x) => (typeof x === 'number' ? x : 0))
+  }
+  const popNum = (): number | undefined => popNums(1)?.[0]
+  const popMat = (): Mat | undefined => {
+    const v = popNums(6)
+    return v ? [v[0]!, v[1]!, v[2]!, v[3]!, v[4]!, v[5]!] : undefined
+  }
+  /** Text rendering matrix: font size, horizontal scale and rise, then Tm, then the CTM. */
+  const textRenderMatrix = (): Mat =>
+    mul(mul([ts.fontSize * ts.hScale, 0, 0, ts.fontSize, 0, ts.rise], tm), ctm)
+  /** Offset the line matrix by (tx, ty) and restart the text matrix there. */
+  const moveLine = (tx: number, ty: number): void => {
+    tlm = mul([1, 0, 0, 1, tx, ty], tlm)
+    tm = tlm
+  }
+  const showText = (bytes: Uint8Array, cm: CharMap | undefined): void => {
+    if (!cm) return
+    const trm0 = textRenderMatrix()
+    const startX = trm0[4]
+    const y = trm0[5]
+    // Take the effective size and space width from the text rendering matrix
+    // so that scaling applied through Tm (e.g. `/F1 1 Tf 12 0 0 12 x y Tm`)
+    // is included, consistent with x / endX which already go through Tm.
+    const fsDevice = Math.hypot(trm0[2], trm0[3])
+    const spaceDevice = (cm.spaceWidth / 1000) * Math.hypot(trm0[0], trm0[1])
+    let text = ''
+    for (const g of cm.decode(bytes)) {
+      text += g.unicode
+      const w0 = g.width1000 / 1000
+      const isSpaceByte = g.code === 0x20
+      const tx =
+        (w0 * ts.fontSize + ts.charSpace + (isSpaceByte ? ts.wordSpace : 0)) * ts.hScale
+      tm = mul([1, 0, 0, 1, tx, 0], tm)
+    }
+    const endX = textRenderMatrix()[4]
+    runs.push({ text, x: startX, endX, y, fs: fsDevice || ts.fontSize || 1, spaceW: spaceDevice || 1 })
+  }
+  const showArray = (arr: any[], cm: CharMap | undefined): void => {
+    if (!cm) return
+    for (const el of arr) {
+      if (el instanceof Uint8Array) {
+        showText(el, cm)
+      } else if (typeof el === 'number') {
+        // TJ adjustment: positive moves left (spec §9.4.3).
+        const tx = (-el / 1000) * ts.fontSize * ts.hScale
+        tm = mul([1, 0, 0, 1, tx, 0], tm)
+        // Synthesize a space for kerning-only word gaps.
+        if (-el > 200 && runs.length) {
+          const last = runs[runs.length - 1]!
+          if (!last.text.endsWith(' ')) last.text += ' '
+        }
+      }
+    }
+  }
+  for (;;) {
+    const t = lex.next()
+    if (t.type === 'eof') break
+    if (t.type === 'num') {
+      operands.push(t.value)
+      continue
+    }
+    if (t.type === 'str') {
+      operands.push(t.value)
+      continue
+    }
+    if (t.type === 'name') {
+      operands.push({ name: t.value })
+      continue
+    }
+    if (t.type === 'delim') {
+      if (t.value === '[') {
+        operands.push(readArray(lex))
+      } else if (t.value === '<<') {
+        skipDict(lex)
+        operands.push({})
+      }
+      continue
+    }
+    // Operator (keyword)
+    const op = t.value
+    switch (op) {
+      case 'q':
+        // Copy the text state so later Tc/Tw/Tf/... do not alter the saved one.
+        gsStack.push({ ctm, ts: { ...ts } })
+        break
+      case 'Q': {
+        // An unbalanced Q leaves the current state as it is.
+        const saved = gsStack.pop()
+        if (saved) {
+          ctm = saved.ctm
+          ts = saved.ts
+        }
+        break
+      }
+      case 'cm': {
+        const m = popMat()
+        if (m) ctm = mul(m, ctm)
+        break
+      }
+      case 'BT':
+        tm = IDENT
+        tlm = IDENT
+        break
+      case 'ET':
+        break
+      case 'Td': {
+        const v = popNums(2)
+        if (v) moveLine(v[0]!, v[1]!)
+        break
+      }
+      case 'TD': {
+        const v = popNums(2)
+        if (v) {
+          ts.leading = -v[1]!
+          moveLine(v[0]!, v[1]!)
+        }
+        break
+      }
+      case 'Tm': {
+        const m = popMat()
+        if (m) {
+          tlm = m
+          tm = tlm
+        }
+        break
+      }
+      case 'T*':
+        moveLine(0, -ts.leading)
+        break
+      case 'Tc':
+        ts.charSpace = popNum() ?? ts.charSpace
+        break
+      case 'Tw':
+        ts.wordSpace = popNum() ?? ts.wordSpace
+        break
+      case 'Tz': {
+        const pct = popNum()
+        if (pct !== undefined) ts.hScale = pct / 100
+        break
+      }
+      case 'TL':
+        ts.leading = popNum() ?? ts.leading
+        break
+      case 'Ts':
+        ts.rise = popNum() ?? ts.rise
+        break
+      case 'Tf': {
+        const size = popNum()
+        const res = operands.pop()
+        if (size !== undefined) ts.fontSize = size
+        ts.fontRes = res && typeof res === 'object' && 'name' in res ? res.name : undefined
+        break
+      }
+      case 'Tj': {
+        const s = operands.pop()
+        if (s instanceof Uint8Array && ts.fontRes) showText(s, charMapFor(ts.fontRes))
+        break
+      }
+      case 'TJ': {
+        const a = operands.pop()
+        if (Array.isArray(a) && ts.fontRes) showArray(a, charMapFor(ts.fontRes))
+        break
+      }
+      case "'": {
+        const s = operands.pop()
+        moveLine(0, -ts.leading)
+        if (s instanceof Uint8Array && ts.fontRes) showText(s, charMapFor(ts.fontRes))
+        break
+      }
+      case '"': {
+        // Operand order is `aw ac string`.
+        const s = operands.pop()
+        const v = popNums(2)
+        if (v) {
+          ts.wordSpace = v[0]!
+          ts.charSpace = v[1]!
+        }
+        moveLine(0, -ts.leading)
+        if (s instanceof Uint8Array && ts.fontRes) showText(s, charMapFor(ts.fontRes))
+        break
+      }
+      case 'BI':
+        skipInlineImage(lex)
+        break
+      default:
+        // Unknown / non-text operator: nothing to execute.
+        break
+    }
+    // Operands belong to the operator that just ran; never carry them over.
+    operands.length = 0
+  }
+  return runs
+}
+// ── Token helpers ──────────────────────────────────────────────────────────
+/**
+ * Collect the operands of an array up to `]` or end of input. Numbers and
+ * strings are kept as their token value, names become `{ name }`, and every
+ * other token is dropped.
+ */
+function readArray(lex: Lexer): any[] {
+  const items: any[] = []
+  let tok = lex.next()
+  while (tok.type !== 'eof' && !(tok.type === 'delim' && tok.value === ']')) {
+    switch (tok.type) {
+      case 'num':
+      case 'str':
+        items.push(tok.value)
+        break
+      case 'name':
+        items.push({ name: tok.value })
+        break
+    }
+    tok = lex.next()
+  }
+  return items
+}
+/** Consume tokens through the `>>` that closes an already-opened `<<`, honouring nesting. */
+function skipDict(lex: Lexer): void {
+  let open = 1
+  while (open > 0) {
+    const tok = lex.next()
+    if (tok.type === 'eof') return
+    if (tok.type !== 'delim') continue
+    if (tok.value === '<<') open += 1
+    else if (tok.value === '>>') open -= 1
+  }
+}
+/**
+ * Step over an inline image (`… ID <binary> EI`) so the binary payload is
+ * never tokenized. Leaves the lexer just after `EI`, or at the end of the
+ * buffer when no terminator is found.
+ */
+function skipInlineImage(lex: Lexer): void {
+  // Tokenize the image dictionary until the ID keyword shows up.
+  let tok: Token = lex.next()
+  while (!(tok.type === 'kw' && tok.value === 'ID')) {
+    if (tok.type === 'eof') return
+    tok = lex.next()
+  }
+  const bytes = lex.buf
+  const lastIndex = bytes.length - 1
+  // Start one byte past ID: a single whitespace byte separates it from the data.
+  for (let at = lex.pos + 1; at < lastIndex; at++) {
+    if (bytes[at] !== 0x45 || bytes[at + 1] !== 0x49) continue // not "EI"
+    const precededByWs = at === 0 || isWsByte(bytes[at - 1]!)
+    if (!precededByWs) continue
+    const followedByWs = at + 2 >= bytes.length || isWsByte(bytes[at + 2]!)
+    if (!followedByWs) continue
+    lex.pos = at + 2
+    return
+  }
+  lex.pos = bytes.length
+}
+/** True for the bytes NUL, TAB, LF, FF, CR and SPACE. */
+const isWsByte = (x: number) => {
+  switch (x) {
+    case 0x00:
+    case 0x09:
+    case 0x0a:
+    case 0x0c:
+    case 0x0d:
+    case 0x20:
+      return true
+    default:
+      return false
+  }
+}
+/**
+ * Find the font dictionary registered under `res` in the `/Font` entry of
+ * `resources`. Yields `undefined` when there are no resources or when either
+ * lookup does not resolve to a dictionary.
+ */
+function lookupFont(
+  resources: PdfDictionary | undefined,
+  res: string,
+  doc: Doc,
+): PdfDictionary | undefined {
+  if (!resources) return undefined
+  const fonts = doc.resolve(resources.entries.get('Font'))
+  if (!(fonts && isDict(fonts))) return undefined
+  const font = doc.resolve(fonts.entries.get(res))
+  if (font && isDict(font)) return font
+  return undefined
+}
+/** Run `fn` and hand back its result; anything it throws is swallowed and becomes `undefined`. */
+function safe<T>(fn: () => T): T | undefined {
+  let outcome: T | undefined
+  try {
+    outcome = fn()
+  } catch {
+    outcome = undefined
+  }
+  return outcome
+}
+export { latin1, isName, isStream }