npm - @strav/pdf - Versions diffs - 0.4.17 → 0.4.18 - Mend

@strav/pdf 0.4.17 → 0.4.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (22) hide show

package/README.md +25 -7
package/package.json +5 -3
package/src/index.ts +10 -0
package/src/reader/cmap_parser.ts +173 -0
package/src/reader/decrypt.ts +226 -0
package/src/reader/document.ts +246 -0
package/src/reader/encodings.ts +73 -0
package/src/reader/extract.ts +152 -0
package/src/reader/fonts.ts +259 -0
package/src/reader/index.ts +27 -0
package/src/reader/layout.ts +106 -0
package/src/reader/lexer.ts +270 -0
package/src/reader/object_parser.ts +203 -0
package/src/reader/objstm.ts +44 -0
package/src/reader/text_interpreter.ts +327 -0
package/src/reader/xref.ts +229 -0
package/src/streams/decode.ts +98 -0
package/src/streams/flate.ts +94 -4
package/src/streams/index.ts +6 -1
package/src/streams/lzw.ts +74 -0
package/src/streams/runlength.ts +25 -0
package/src/util/errors.ts +20 -0

package/README.md CHANGED Viewed

@@ -1,9 +1,10 @@
 # @strav/pdf
-Low-level, **zero-dependency** PDF generation (the *write* side of PDF) for
-the Strav ecosystem. Produces conformant PDF 1.7 byte streams — it does not
-parse, render or display PDFs. No `@strav/*` dependency and no npm runtime
-dependency; only Node/Bun built-ins.
+Low-level, **zero-dependency** PDF generation (the *write* side) plus
+layout-aware text extraction (the *read* side) for the Strav ecosystem.
+Produces conformant PDF 1.7 byte streams and extracts plain text from existing
+PDFs — it does not render or display PDFs. No `@strav/*` dependency and no npm
+runtime dependency; only Node/Bun built-ins.
 ## Install
@@ -47,6 +48,21 @@ await doc.saveToStream(createWriteStream('out.pdf'))
 `saveToStream` resolves once the stream has flushed; it rejects on a stream
 error or a build/conformance error, exactly like `save()`.
+## Text extraction (read side)
+```typescript
+import { extractText } from '@strav/pdf'
+const { pages, text, info } = await extractText(await Bun.file('doc.pdf').bytes())
+console.log(info.pageCount, pages[0].text)
+```
+Layout-aware plain text per page (heuristic spacing/line breaks), `/ToUnicode`
+and encoding-based glyph decoding, classic + xref-stream + object-stream
+parsing with broken-xref recovery, and empty-password decryption (RC4,
+AES-128, AES-256). See [`docs/pdf/extraction.md`](../../docs/pdf/extraction.md)
+— including the `@strav/rag` ingestion snippet.
 ## What's supported
 Object model & serialization, pages, the full content-stream operator set,
@@ -56,13 +72,15 @@ with ToUnicode, JPEG/PNG images with alpha, transparency (ExtGState) and
 tiling/shading patterns, XMP metadata, and PDF/A-2b / PDF/X-4 conformance
 validation. Output is byte-deterministic with a fixed creation date and id.
-Browser builds, encryption, signatures, forms, and reading/parsing PDFs are
-out of scope.
+On the read side: classic/xref-stream/object-stream parsing, layout-aware text
+extraction, and empty-password decryption. Browser builds, write-side
+encryption, signatures, forms, OCR, and PDF rendering remain out of scope.
 ## Documentation
 Full guides live in [`docs/pdf`](../../docs/pdf/pdf.md): the content builder,
-fonts, images, color, transparency/patterns, and conformance.
+fonts, images, color, transparency/patterns, conformance, and
+[text extraction](../../docs/pdf/extraction.md).
 ## Examples

package/package.json CHANGED Viewed

@@ -1,8 +1,8 @@
 {
   "name": "@strav/pdf",
-  "version": "0.4.17",
+  "version": "0.4.18",
   "type": "module",
-  "description": "Low-level, zero-dependency PDF generation (write side) for the Strav ecosystem",
+  "description": "Low-level, zero-dependency PDF generation (write side) and layout-aware text extraction (read side) for the Strav ecosystem",
   "license": "MIT",
   "keywords": [
     "bun",
@@ -39,7 +39,9 @@
     "./color": "./src/color/index.ts",
     "./color/*": "./src/color/*.ts",
     "./document": "./src/document/index.ts",
-    "./document/*": "./src/document/*.ts"
+    "./document/*": "./src/document/*.ts",
+    "./reader": "./src/reader/index.ts",
+    "./reader/*": "./src/reader/*.ts"
   },
   "scripts": {
     "test": "bun test tests/",

package/src/index.ts CHANGED Viewed

@@ -48,10 +48,20 @@ export type { Shading, ShadingPattern, ColorStop } from './patterns/shading.ts'
 export { mm, cm, inch, pt } from './util/units.ts'
+export { extractText, PdfReader } from './reader/extract.ts'
+export type {
+  ExtractOptions,
+  ExtractResult,
+  ExtractedPage,
+  PdfInfo,
+} from './reader/extract.ts'
 export {
   PdfGenError,
   ConformanceError,
   UnsupportedFontError,
   InvalidImageError,
+  PdfParseError,
+  EncryptedPdfError,
 } from './util/errors.ts'
 export type { PdfGenErrorCode } from './util/errors.ts'

package/src/reader/cmap_parser.ts ADDED Viewed

@@ -0,0 +1,173 @@
+/**
+ * CMap parser (spec §9.7.5 / §9.10.3) — the inverse of
+ * `fonts/to_unicode.ts#buildToUnicode`. Parses `codespacerange`, `bfchar`,
+ * `bfrange` (incl. the `[…]` array form) and `cidchar`/`cidrange`. Used for
+ * `/ToUnicode` (code → Unicode) and embedded Type0 encodings (code → CID).
+ */
+import { Lexer, type Token } from './lexer.ts'
+/** One `begincodespacerange` entry: the byte width of its codes plus its numeric bounds. */
+interface CodespaceRange {
+  nbytes: number
+  low: number
+  high: number
+}
+/**
+ * Lookup tables produced by `parseCMap`: character code → Unicode string
+ * (`bfchar`/`bfrange`) and character code → CID (`cidchar`/`cidrange`).
+ *
+ * Codes are plain numbers built big-endian with 32-bit shifts, so a 4-byte
+ * code with its top bit set is represented as a negative number.
+ */
+export class CMap {
+  private readonly codespaces: CodespaceRange[] = []
+  /** code → Unicode string (bf*) */
+  private readonly toStr = new Map<number, string>()
+  /** code → CID (cid*) */
+  private readonly toCid = new Map<number, number>()
+  private bfRanges: { lo: number; hi: number; base: string }[] = []
+  private cidRanges: { lo: number; hi: number; base: number }[] = []
+  /**
+   * Byte length to read for the next code (uniform-codespace heuristic):
+   * the width of the first declared codespace range, or 1 when none exists.
+   */
+  get codeBytes(): number {
+    if (this.codespaces.length === 0) return 1
+    return this.codespaces[0]!.nbytes
+  }
+  /**
+   * Split a show string into numeric character codes of `codeBytes` bytes
+   * each. Trailing bytes that do not fill a whole code are dropped.
+   */
+  readCodes(bytes: Uint8Array): number[] {
+    const out: number[] = []
+    const n = this.codeBytes
+    for (let i = 0; i + n <= bytes.length; i += n) {
+      let c = 0
+      for (let k = 0; k < n; k++) c = (c << 8) | bytes[i + k]!
+      out.push(c)
+    }
+    return out
+  }
+  /**
+   * Unicode text for `code`: an exact `bfchar` entry first, otherwise the
+   * first `bfrange` containing the code.
+   *
+   * @returns the mapped string, or `undefined` when the code is unmapped or
+   *   the range arithmetic would leave the Unicode code space.
+   */
+  unicodeOf(code: number): string | undefined {
+    const direct = this.toStr.get(code)
+    if (direct !== undefined) return direct
+    for (const r of this.bfRanges) {
+      if (code < r.lo || code > r.hi) continue
+      // Advance the final code point of the base by the code's offset in the range.
+      const cps = [...r.base]
+      const last = cps.pop() ?? ''
+      const target = (last.codePointAt(0) ?? 0) + (code - r.lo)
+      // String.fromCodePoint throws a RangeError above U+10FFFF; a malformed
+      // range must not abort extraction, so report the code as unmapped.
+      if (target > 0x10ffff) return undefined
+      return cps.join('') + String.fromCodePoint(target)
+    }
+    return undefined
+  }
+  /**
+   * CID for `code`: an exact `cidchar` entry first, otherwise the first
+   * `cidrange` containing the code; `undefined` when neither matches.
+   */
+  cidOf(code: number): number | undefined {
+    const direct = this.toCid.get(code)
+    if (direct !== undefined) return direct
+    for (const r of this.cidRanges) {
+      if (code >= r.lo && code <= r.hi) return r.base + (code - r.lo)
+    }
+    return undefined
+  }
+}
+/** Fold a byte string, most significant byte first, into a number using 32-bit shifts (empty → 0). */
+const bytesToInt = (b: Uint8Array): number =>
+  b.reduce((acc, byte) => (acc << 8) | byte, 0)
+/**
+ * Decode UTF-16BE bytes into a JS string, one code unit per byte pair.
+ * Surrogate pairs need no extra handling because JS strings are UTF-16
+ * already; an odd trailing byte is ignored.
+ */
+const utf16beToStr = (b: Uint8Array): string => {
+  let s = ''
+  for (let i = 0; i + 1 < b.length; i += 2) s += String.fromCharCode((b[i]! << 8) | b[i + 1]!)
+  return s
+}
+/**
+ * Parse CMap program text into a {@link CMap}.
+ *
+ * Recognised sections are `codespacerange`, `bfchar`, `bfrange` (string and
+ * `[…]` array destinations), `cidchar` and `cidrange`; every other token is
+ * skipped. Entries whose operands have the wrong token type are ignored.
+ * A section is closed by its `end…` keyword or by end of input wherever that
+ * token appears, so a truncated entry cannot swallow the terminator and the
+ * sections that follow it.
+ *
+ * @param content raw (already decoded) CMap stream bytes
+ * @returns the populated map; never throws for malformed entries itself
+ */
+export function parseCMap(content: Uint8Array): CMap {
+  const cmap = new CMap()
+  const lex = new Lexer(content, 0)
+  // The tables are private on CMap; this cast is the parser's write access to them.
+  const internal = cmap as unknown as {
+    codespaces: CodespaceRange[]
+    toStr: Map<number, string>
+    toCid: Map<number, number>
+    bfRanges: { lo: number; hi: number; base: string }[]
+    cidRanges: { lo: number; hi: number; base: number }[]
+  }
+  const next = (): Token => lex.next()
+  // True when `tok` ends the current section: its closing keyword or end of input.
+  const closes = (tok: Token, endKw: string): boolean =>
+    tok.type === 'eof' || (tok.type === 'kw' && tok.value === endKw)
+  for (;;) {
+    const t = next()
+    if (t.type === 'eof') break
+    if (t.type !== 'kw') continue
+    if (t.value === 'begincodespacerange') {
+      for (;;) {
+        const lo = next()
+        if (closes(lo, 'endcodespacerange')) break
+        const hi = next()
+        if (closes(hi, 'endcodespacerange')) break
+        if (lo.type === 'str' && hi.type === 'str') {
+          internal.codespaces.push({
+            nbytes: lo.value.length || 1,
+            low: bytesToInt(lo.value),
+            high: bytesToInt(hi.value),
+          })
+        }
+      }
+    } else if (t.value === 'beginbfchar') {
+      for (;;) {
+        const src = next()
+        if (closes(src, 'endbfchar')) break
+        const dst = next()
+        if (closes(dst, 'endbfchar')) break
+        if (src.type === 'str' && dst.type === 'str') {
+          internal.toStr.set(bytesToInt(src.value), utf16beToStr(dst.value))
+        }
+      }
+    } else if (t.value === 'beginbfrange') {
+      for (;;) {
+        const lo = next()
+        if (closes(lo, 'endbfrange')) break
+        const hi = next()
+        if (closes(hi, 'endbfrange')) break
+        const dst = next()
+        if (closes(dst, 'endbfrange')) break
+        const bounds =
+          lo.type === 'str' && hi.type === 'str'
+            ? { lo: bytesToInt(lo.value), hi: bytesToInt(hi.value) }
+            : undefined
+        if (dst.type === 'str') {
+          if (bounds) internal.bfRanges.push({ ...bounds, base: utf16beToStr(dst.value) })
+        } else if (dst.type === 'delim' && dst.value === '[') {
+          // Array form: one destination string per consecutive code from `lo`.
+          // The array is always consumed, even when the bounds were unusable,
+          // so its elements are never mistaken for the next lo/hi/dst triple.
+          let code = bounds ? bounds.lo : 0
+          for (;;) {
+            const el = next()
+            if (el.type === 'delim' && el.value === ']') break
+            if (el.type === 'eof') break
+            if (bounds && el.type === 'str') internal.toStr.set(code++, utf16beToStr(el.value))
+          }
+        }
+      }
+    } else if (t.value === 'begincidchar') {
+      for (;;) {
+        const src = next()
+        if (closes(src, 'endcidchar')) break
+        const cid = next()
+        if (closes(cid, 'endcidchar')) break
+        if (src.type === 'str' && cid.type === 'num') {
+          internal.toCid.set(bytesToInt(src.value), cid.value)
+        }
+      }
+    } else if (t.value === 'begincidrange') {
+      for (;;) {
+        const lo = next()
+        if (closes(lo, 'endcidrange')) break
+        const hi = next()
+        if (closes(hi, 'endcidrange')) break
+        const cid = next()
+        if (closes(cid, 'endcidrange')) break
+        if (lo.type === 'str' && hi.type === 'str' && cid.type === 'num') {
+          internal.cidRanges.push({
+            lo: bytesToInt(lo.value),
+            hi: bytesToInt(hi.value),
+            base: cid.value,
+          })
+        }
+      }
+    }
+  }
+  return cmap
+}

package/src/reader/decrypt.ts ADDED Viewed

@@ -0,0 +1,226 @@
+/**
+ * Standard security handler (spec §7.6), empty user password only.
+ *
+ * Supports RC4 (40/128-bit, V1–V2), AES-128 (V4/R4) and AES-256 (V5/R5–R6).
+ * If the empty user password does not validate — i.e. the file needs a real
+ * password — or the handler is non-standard, an {@link EncryptedPdfError} is
+ * thrown. Strings and streams are decrypted after parsing, before filtering.
+ */
+import { createHash, createDecipheriv, createCipheriv } from 'node:crypto'
+import {
+  type PdfObject,
+  type PdfDictionary,
+  isNum,
+  isStr,
+  isName,
+  isDict,
+} from '../objects/types.ts'
+import { EncryptedPdfError } from '../util/errors.ts'
+/** The 32-byte password padding string mixed into key derivation and the /U check. */
+const PAD = new Uint8Array([
+  0x28, 0xbf, 0x4e, 0x5e, 0x4e, 0x75, 0x8a, 0x41, 0x64, 0x00, 0x4e, 0x56, 0xff, 0xfa, 0x01, 0x08,
+  0x2e, 0x2e, 0x00, 0xb6, 0xd0, 0x68, 0x3e, 0x80, 0x2f, 0x0c, 0xa9, 0xfe, 0x64, 0x53, 0x69, 0x7a,
+])
+/** Digest `b` with the named hash algorithm and return the result as a plain Uint8Array. */
+const sha = (algo: string, b: Uint8Array): Uint8Array => {
+  const hasher = createHash(algo)
+  hasher.update(b)
+  return new Uint8Array(hasher.digest())
+}
+/** MD5 digest of `b` as a plain Uint8Array. */
+const md5 = (b: Uint8Array): Uint8Array => sha('md5', b)
+/** Concatenate byte arrays, in argument order, into one freshly allocated Uint8Array. */
+function cat(...parts: Uint8Array[]): Uint8Array {
+  let total = 0
+  for (const part of parts) total += part.length
+  const joined = new Uint8Array(total)
+  let offset = 0
+  for (const part of parts) {
+    joined.set(part, offset)
+    offset += part.length
+  }
+  return joined
+}
+/**
+ * RC4 stream cipher: key-schedule a 256-byte state from `key`, then XOR the
+ * generated keystream over `data`. Returns a new array; inputs are untouched.
+ */
+function rc4(key: Uint8Array, data: Uint8Array): Uint8Array {
+  // Key scheduling: start from the identity permutation and mix in the key.
+  const state = new Uint8Array(256)
+  for (let n = 0; n < 256; n++) state[n] = n
+  for (let i = 0, j = 0; i < 256; i++) {
+    j = (j + state[i]! + key[i % key.length]!) & 0xff
+    const held = state[i]!
+    state[i] = state[j]!
+    state[j] = held
+  }
+  // Keystream generation, XORed byte by byte into the result.
+  const result = new Uint8Array(data.length)
+  let x = 0
+  let y = 0
+  for (let pos = 0; pos < data.length; pos++) {
+    x = (x + 1) & 0xff
+    y = (y + state[x]!) & 0xff
+    const held = state[x]!
+    state[x] = state[y]!
+    state[y] = held
+    result[pos] = data[pos]! ^ state[(state[x]! + state[y]!) & 0xff]!
+  }
+  return result
+}
+/**
+ * AES-CBC decryption (AES-256 for a 32-byte key, AES-128 otherwise).
+ *
+ * Two payload layouts are handled:
+ * - no `iv` argument: `data` is a string/stream payload whose first 16 bytes
+ *   are the IV; the rest is decrypted and trailing PKCS#7 padding is removed
+ *   when the final byte looks like valid padding;
+ * - explicit `iv`: `data` is raw, unpadded ciphertext (used to unwrap the
+ *   file key from /UE). The plaintext is returned in full — stripping
+ *   "padding" here would truncate any key whose last byte is 1..16.
+ *
+ * @returns the plaintext, or an empty array when `data` is shorter than one block
+ * @throws whatever `node:crypto` throws for a bad key or a ciphertext length
+ *   that is not a multiple of 16
+ */
+function aesCbcDecrypt(key: Uint8Array, data: Uint8Array, iv?: Uint8Array): Uint8Array {
+  if (data.length < 16) return new Uint8Array(0)
+  const ivIsEmbedded = iv === undefined
+  const useIv = iv ?? data.subarray(0, 16)
+  const body = ivIsEmbedded ? data.subarray(16) : data
+  const algo = key.length === 32 ? 'aes-256-cbc' : 'aes-128-cbc'
+  const d = createDecipheriv(algo, key, useIv)
+  // Padding is handled manually below so malformed padding never throws.
+  d.setAutoPadding(false)
+  const out = Buffer.concat([d.update(body), d.final()])
+  if (!ivIsEmbedded) return new Uint8Array(out)
+  // Strip PKCS#7 padding when it looks valid.
+  const pad = out.length ? out[out.length - 1]! : 0
+  if (pad >= 1 && pad <= 16 && pad <= out.length) {
+    return new Uint8Array(out.subarray(0, out.length - pad))
+  }
+  return new Uint8Array(out)
+}
+/**
+ * Algorithm 2.B — the R6 hardened hash.
+ *
+ * Starts from SHA-256(pwd ‖ salt ‖ udata) and then iterates: 64 copies of
+ * (pwd ‖ K ‖ udata) are AES-128-CBC encrypted (no padding) with K[0..16) as
+ * key and K[16..32) as IV; the sum of the first 16 ciphertext bytes mod 3
+ * (equal to the 128-bit big-endian value mod 3) selects SHA-256/384/512,
+ * which hashes the ciphertext into the next K.
+ *
+ * At least 64 rounds run; after that the loop stops once the last ciphertext
+ * byte is ≤ (round number − 32), where the round number is that of the
+ * *next* round to run.
+ *
+ * @returns the first 32 bytes of the final K
+ */
+function hash2B(pwd: Uint8Array, salt: Uint8Array, udata: Uint8Array): Uint8Array {
+  let K = sha('sha256', cat(pwd, salt, udata))
+  for (let round = 0; ; round++) {
+    const block = cat(pwd, K, udata)
+    const K1 = new Uint8Array(block.length * 64)
+    for (let i = 0; i < 64; i++) K1.set(block, i * block.length)
+    const c = createCipheriv('aes-128-cbc', K.subarray(0, 16), K.subarray(16, 32))
+    c.setAutoPadding(false)
+    const E = new Uint8Array(Buffer.concat([c.update(K1), c.final()]))
+    let mod = 0
+    for (let i = 0; i < 16; i++) mod += E[i]!
+    mod %= 3
+    K = sha(mod === 0 ? 'sha256' : mod === 1 ? 'sha384' : 'sha512', E)
+    // `round` is the zero-based index of the round just completed, so the
+    // spec's round number for this check is `round + 1`.
+    if (round >= 63 && E[E.length - 1]! <= round + 1 - 32) break
+  }
+  return K.subarray(0, 32)
+}
+export interface Decryptor {
+  /**
+   * Decrypt a string/stream payload for object (num,gen).
+   *
+   * `num` and `gen` feed the per-object key for the RC4 and AES-128 paths.
+   * `isString` is part of the contract, but the decryptor returned by
+   * `buildDecryptor` does not consult it: strings and streams are handled
+   * the same way. Empty `data` is returned unchanged.
+   */
+  decrypt(num: number, gen: number, data: Uint8Array, isString: boolean): Uint8Array
+  /** Object number of the /Encrypt dict (never itself decrypted). */
+  readonly encryptObjNum: number
+}
+/** Bytes of a PDF string object; anything else (including a missing value) yields an empty array. */
+const strBytes = (o: PdfObject | undefined): Uint8Array => {
+  if (!o || !isStr(o)) return new Uint8Array(0)
+  return o.value
+}
+/**
+ * Build a decryptor from the trailer's /Encrypt dict and /ID, validating the
+ * empty user password. `encryptObjNum` is the /Encrypt indirect object number.
+ *
+ * Cipher selection: V ≥ 5 → AES-256 (`AESV3`); V = 4 → the /CFM of the crypt
+ * filter named by /StmF inside /CF; anything else → RC4.
+ *
+ * Key length (RC4 / AES-128 path): R = 2 always uses 5 bytes. Otherwise the
+ * top-level /Length (bits) wins when present. For V = 4 that entry is
+ * commonly omitted, so the crypt filter's own /Length is used instead —
+ * values below 40 are taken to be bytes rather than bits — falling back to
+ * 128 bits. Non-V4 handlers keep the 40-bit default.
+ *
+ * NOTE(review): /StrF and the `Identity` crypt filter are not consulted;
+ * strings and streams both follow the /StmF filter.
+ *
+ * @param enc the /Encrypt dictionary
+ * @param idFirst bytes of the first trailer /ID string
+ * @param encryptObjNum object number exposed on the returned decryptor
+ * @returns a decryptor bound to the derived file key
+ * @throws {EncryptedPdfError} when /Filter is not /Standard, or when the
+ *   empty user password does not reproduce /U. On the RC4 / AES-128 path
+ *   the check is skipped if /U is shorter than the compared prefix.
+ */
+export function buildDecryptor(
+  enc: PdfDictionary,
+  idFirst: Uint8Array,
+  encryptObjNum: number,
+): Decryptor {
+  const filter = enc.entries.get('Filter')
+  if (!filter || !isName(filter) || filter.value !== 'Standard') {
+    throw new EncryptedPdfError('Unsupported security handler (only /Standard)')
+  }
+  const numOf = (k: string, d = 0): number => {
+    const v = enc.entries.get(k)
+    return v && isNum(v) ? v.value : d
+  }
+  const V = numOf('V', 0)
+  const R = numOf('R', 0)
+  const O = strBytes(enc.entries.get('O'))
+  const U = strBytes(enc.entries.get('U'))
+  // /P is a signed 32-bit flag word; `| 0` folds an unsigned spelling into range.
+  const P = numOf('P', 0) | 0
+  const topLength = enc.entries.get('Length')
+  const hasTopLength = Boolean(topLength && isNum(topLength))
+  // V4 keys are described by the crypt filter; 40 bits is only the pre-V4 default.
+  let lengthBits = numOf('Length', V === 4 ? 128 : 40)
+  // Determine the algorithm: V5 → AES-256; V4 → /CF /StdCF /CFM; else RC4.
+  let cfm: 'V2' | 'AESV2' | 'AESV3' = 'V2'
+  if (V >= 5) cfm = 'AESV3'
+  else if (V === 4) {
+    const cf = enc.entries.get('CF')
+    const stmF = enc.entries.get('StmF')
+    if (cf && isDict(cf) && stmF && isName(stmF)) {
+      const std = cf.entries.get(stmF.value)
+      if (std && isDict(std)) {
+        const m = std.entries.get('CFM')
+        if (m && isName(m)) {
+          cfm = m.value === 'AESV3' ? 'AESV3' : m.value === 'AESV2' ? 'AESV2' : 'V2'
+        }
+        if (!hasTopLength) {
+          const cfLen = std.entries.get('Length')
+          if (cfLen && isNum(cfLen) && cfLen.value > 0) {
+            // Some producers write the crypt-filter length in bytes instead of bits.
+            lengthBits = cfLen.value < 40 ? cfLen.value * 8 : cfLen.value
+          }
+        }
+      }
+    }
+  }
+  let fileKey: Uint8Array
+  if (V >= 5) {
+    // AES-256 (R5/R6). Validate empty user password against /U.
+    // /U layout: 32-byte hash, 8-byte validation salt, 8-byte key salt.
+    const valSalt = U.subarray(32, 40)
+    const keySalt = U.subarray(40, 48)
+    const empty = new Uint8Array(0)
+    const check =
+      R === 5 ? sha('sha256', cat(empty, valSalt)) : hash2B(empty, valSalt, empty)
+    if (Buffer.compare(Buffer.from(check), Buffer.from(U.subarray(0, 32))) !== 0) {
+      throw new EncryptedPdfError('PDF requires a user password')
+    }
+    const ikey =
+      R === 5 ? sha('sha256', cat(empty, keySalt)) : hash2B(empty, keySalt, empty)
+    const UE = strBytes(enc.entries.get('UE'))
+    // The file key is /UE decrypted with the intermediate key and a zero IV.
+    fileKey = aesCbcDecrypt(ikey, UE, new Uint8Array(16))
+  } else {
+    // RC4 / AES-128 (R2–R4). Algorithm 2 with the empty password.
+    const keyLen = R === 2 ? 5 : lengthBits / 8
+    const pBytes = Uint8Array.from([P & 0xff, (P >> 8) & 0xff, (P >> 16) & 0xff, (P >> 24) & 0xff])
+    const encMeta = enc.entries.get('EncryptMetadata')
+    const metaFalse = encMeta && encMeta.kind === 'bool' && encMeta.value === false
+    let h = md5(
+      cat(
+        PAD,
+        O,
+        pBytes,
+        idFirst,
+        R >= 4 && metaFalse ? Uint8Array.from([0xff, 0xff, 0xff, 0xff]) : new Uint8Array(0),
+      ),
+    )
+    if (R >= 3) for (let i = 0; i < 50; i++) h = md5(h.subarray(0, keyLen))
+    fileKey = h.subarray(0, keyLen)
+    // Validate the empty password by reproducing /U (Algorithm 4/5).
+    let expectedU: Uint8Array
+    if (R === 2) {
+      expectedU = rc4(fileKey, PAD)
+    } else {
+      const idHash = md5(cat(PAD, idFirst))
+      let x = rc4(fileKey, idHash)
+      // 19 further RC4 passes, each keyed with the file key XORed by the pass number.
+      for (let i = 1; i <= 19; i++) {
+        const k = new Uint8Array(fileKey.length)
+        for (let j = 0; j < k.length; j++) k[j] = fileKey[j]! ^ i
+        x = rc4(k, x)
+      }
+      expectedU = x
+    }
+    // R2 compares all 32 bytes of /U; later revisions only the first 16.
+    const cmpLen = R === 2 ? 32 : 16
+    if (
+      U.length >= cmpLen &&
+      Buffer.compare(
+        Buffer.from(expectedU.subarray(0, cmpLen)),
+        Buffer.from(U.subarray(0, cmpLen)),
+      ) !== 0
+    ) {
+      throw new EncryptedPdfError('PDF requires a user password')
+    }
+  }
+  // Per-object key: MD5(fileKey ‖ low 3 bytes of num ‖ low 2 bytes of gen
+  // [‖ "sAlT" for AES]), truncated to min(fileKey.length + 5, 16) bytes.
+  const objectKey = (num: number, gen: number, aes: boolean): Uint8Array => {
+    const ext = cat(
+      fileKey,
+      Uint8Array.from([num & 0xff, (num >> 8) & 0xff, (num >> 16) & 0xff]),
+      Uint8Array.from([gen & 0xff, (gen >> 8) & 0xff]),
+      aes ? Uint8Array.from([0x73, 0x41, 0x6c, 0x54]) : new Uint8Array(0),
+    )
+    return md5(ext).subarray(0, Math.min(fileKey.length + 5, 16))
+  }
+  return {
+    encryptObjNum,
+    decrypt(num, gen, data, _isString): Uint8Array {
+      if (data.length === 0) return data
+      // AES-256 uses the file key directly; the other modes derive a per-object key.
+      if (cfm === 'AESV3') return aesCbcDecrypt(fileKey, data)
+      if (cfm === 'AESV2') return aesCbcDecrypt(objectKey(num, gen, true), data)
+      return rc4(objectKey(num, gen, false), data)
+    },
+  }
+}