@strav/pdf 0.4.17 → 0.4.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -1,9 +1,10 @@
1
1
  # @strav/pdf
2
2
 
3
- Low-level, **zero-dependency** PDF generation (the *write* side of PDF) for
4
- the Strav ecosystem. Produces conformant PDF 1.7 byte streams — it does not
5
- parse, render or display PDFs. No `@strav/*` dependency and no npm runtime
6
- dependency; only Node/Bun built-ins.
3
+ Low-level, **zero-dependency** PDF generation (the *write* side) plus
4
+ layout-aware text extraction (the *read* side) for the Strav ecosystem.
5
+ Produces conformant PDF 1.7 byte streams and extracts plain text from existing
6
+ PDFs — it does not render or display PDFs. No `@strav/*` dependency and no npm
7
+ runtime dependency; only Node/Bun built-ins.
7
8
 
8
9
  ## Install
9
10
 
@@ -47,6 +48,21 @@ await doc.saveToStream(createWriteStream('out.pdf'))
47
48
  `saveToStream` resolves once the stream has flushed; it rejects on a stream
48
49
  error or a build/conformance error, exactly like `save()`.
49
50
 
51
+ ## Text extraction (read side)
52
+
53
+ ```typescript
54
+ import { extractText } from '@strav/pdf'
55
+
56
+ const { pages, text, info } = await extractText(await Bun.file('doc.pdf').bytes())
57
+ console.log(info.pageCount, pages[0].text)
58
+ ```
59
+
60
+ Layout-aware plain text per page (heuristic spacing/line breaks), `/ToUnicode`
61
+ and encoding-based glyph decoding, classic + xref-stream + object-stream
62
+ parsing with broken-xref recovery, and empty-password decryption (RC4,
63
+ AES-128, AES-256). See [`docs/pdf/extraction.md`](../../docs/pdf/extraction.md)
64
+ — including the `@strav/rag` ingestion snippet.
65
+
50
66
  ## What's supported
51
67
 
52
68
  Object model & serialization, pages, the full content-stream operator set,
@@ -56,13 +72,15 @@ with ToUnicode, JPEG/PNG images with alpha, transparency (ExtGState) and
56
72
  tiling/shading patterns, XMP metadata, and PDF/A-2b / PDF/X-4 conformance
57
73
  validation. Output is byte-deterministic with a fixed creation date and id.
58
74
 
59
- Browser builds, encryption, signatures, forms, and reading/parsing PDFs are
60
- out of scope.
75
+ On the read side: classic/xref-stream/object-stream parsing, layout-aware text
76
+ extraction, and empty-password decryption. Browser builds, write-side
77
+ encryption, signatures, forms, OCR, and PDF rendering remain out of scope.
61
78
 
62
79
  ## Documentation
63
80
 
64
81
  Full guides live in [`docs/pdf`](../../docs/pdf/pdf.md): the content builder,
65
- fonts, images, color, transparency/patterns, and conformance.
82
+ fonts, images, color, transparency/patterns, conformance, and
83
+ [text extraction](../../docs/pdf/extraction.md).
66
84
 
67
85
  ## Examples
68
86
 
package/package.json CHANGED
@@ -1,8 +1,8 @@
1
1
  {
2
2
  "name": "@strav/pdf",
3
- "version": "0.4.17",
3
+ "version": "0.4.18",
4
4
  "type": "module",
5
- "description": "Low-level, zero-dependency PDF generation (write side) for the Strav ecosystem",
5
+ "description": "Low-level, zero-dependency PDF generation (write side) and layout-aware text extraction (read side) for the Strav ecosystem",
6
6
  "license": "MIT",
7
7
  "keywords": [
8
8
  "bun",
@@ -39,7 +39,9 @@
39
39
  "./color": "./src/color/index.ts",
40
40
  "./color/*": "./src/color/*.ts",
41
41
  "./document": "./src/document/index.ts",
42
- "./document/*": "./src/document/*.ts"
42
+ "./document/*": "./src/document/*.ts",
43
+ "./reader": "./src/reader/index.ts",
44
+ "./reader/*": "./src/reader/*.ts"
43
45
  },
44
46
  "scripts": {
45
47
  "test": "bun test tests/",
package/src/index.ts CHANGED
@@ -48,10 +48,20 @@ export type { Shading, ShadingPattern, ColorStop } from './patterns/shading.ts'
48
48
 
49
49
  export { mm, cm, inch, pt } from './util/units.ts'
50
50
 
51
+ export { extractText, PdfReader } from './reader/extract.ts'
52
+ export type {
53
+ ExtractOptions,
54
+ ExtractResult,
55
+ ExtractedPage,
56
+ PdfInfo,
57
+ } from './reader/extract.ts'
58
+
51
59
  export {
52
60
  PdfGenError,
53
61
  ConformanceError,
54
62
  UnsupportedFontError,
55
63
  InvalidImageError,
64
+ PdfParseError,
65
+ EncryptedPdfError,
56
66
  } from './util/errors.ts'
57
67
  export type { PdfGenErrorCode } from './util/errors.ts'
@@ -0,0 +1,173 @@
1
+ /**
2
+ * CMap parser (spec §9.7.5 / §9.10.3) — the inverse of
3
+ * `fonts/to_unicode.ts#buildToUnicode`. Parses `codespacerange`, `bfchar`,
4
+ * `bfrange` (incl. the `[…]` array form) and `cidchar`/`cidrange`. Used for
5
+ * `/ToUnicode` (code → Unicode) and embedded Type0 encodings (code → CID).
6
+ */
7
+
8
+ import { Lexer, type Token } from './lexer.ts'
9
+
10
/** One `begincodespacerange` entry as recorded by the CMap parser. */
interface CodespaceRange {
  /** Byte length of the range's low bound (1 when that bound is empty). */
  nbytes: number
  /** Low bound of the range, folded from its bytes most-significant first. */
  low: number
  /** High bound of the range, folded from its bytes most-significant first. */
  high: number
}
15
+
16
/**
 * Lookup tables produced from a CMap program: character code → Unicode string
 * (`bfchar`/`bfrange`) and character code → CID (`cidchar`/`cidrange`).
 *
 * The tables are private; they start empty and are filled by the parser in
 * this file.
 */
export class CMap {
  private readonly codespaces: CodespaceRange[] = []
  /** code → Unicode string (bf*) */
  private readonly toStr = new Map<number, string>()
  /** code → CID (cid*) */
  private readonly toCid = new Map<number, number>()
  private bfRanges: { lo: number; hi: number; base: string }[] = []
  private cidRanges: { lo: number; hi: number; base: number }[] = []

  /**
   * Byte length to read for the next code (uniform-codespace heuristic):
   * the width of the first declared codespace range, or 1 when none exists.
   */
  get codeBytes(): number {
    if (this.codespaces.length === 0) return 1
    return this.codespaces[0]!.nbytes
  }

  /**
   * Split a show string into numeric character codes of `codeBytes` bytes
   * each, most-significant byte first. Trailing bytes that do not fill a whole
   * code are dropped.
   *
   * Codes are folded with 32-bit bitwise arithmetic, so a 4-byte code with the
   * top bit set comes out negative. `bytesToInt` folds the CMap's own keys the
   * same way, which keeps lookups consistent.
   */
  readCodes(bytes: Uint8Array): number[] {
    const out: number[] = []
    const n = this.codeBytes
    for (let i = 0; i + n <= bytes.length; i += n) {
      let c = 0
      for (let k = 0; k < n; k++) c = (c << 8) | bytes[i + k]!
      out.push(c)
    }
    return out
  }

  /**
   * Unicode text for a character code.
   *
   * Direct (`bfchar` / array-form `bfrange`) entries win. Otherwise the first
   * `bfrange` containing the code is used: its base string with the last code
   * point advanced by the code's offset into the range.
   *
   * @returns the mapped string, or `undefined` when the code is unmapped or
   *   the range arithmetic would leave the Unicode code space
   */
  unicodeOf(code: number): string | undefined {
    const direct = this.toStr.get(code)
    if (direct !== undefined) return direct
    for (const r of this.bfRanges) {
      if (code < r.lo || code > r.hi) continue
      // Spreading iterates by code point, so an astral base stays intact.
      const cps = [...r.base]
      const last = cps.pop() ?? ''
      const shifted = (last.codePointAt(0) ?? 0) + (code - r.lo)
      // String.fromCodePoint throws RangeError above U+10FFFF; a malformed
      // range must not abort extraction, so treat the code as unmapped.
      if (shifted > 0x10ffff) return undefined
      return cps.join('') + String.fromCodePoint(shifted)
    }
    return undefined
  }

  /**
   * CID for a character code: a direct `cidchar` entry, else the first
   * `cidrange` containing the code (range base plus offset).
   *
   * @returns the CID, or `undefined` when the code is unmapped
   */
  cidOf(code: number): number | undefined {
    const direct = this.toCid.get(code)
    if (direct !== undefined) return direct
    for (const r of this.cidRanges) {
      if (code >= r.lo && code <= r.hi) return r.base + (code - r.lo)
    }
    return undefined
  }
}
67
+
68
/**
 * Fold a byte sequence into a number, most-significant byte first.
 * Uses 32-bit bitwise arithmetic, so inputs of four or more bytes wrap
 * (e.g. ff ff ff ff folds to -1). An empty input yields 0.
 */
const bytesToInt = (b: Uint8Array): number =>
  b.reduce((acc, byte) => (acc << 8) | byte, 0)
73
+
74
/**
 * Decode UTF-16BE bytes into a string, one UTF-16 code unit per byte pair.
 *
 * A trailing odd byte is ignored. Surrogate pairs need no extra step: two
 * consecutive surrogate code units in a JS string already form one code
 * point. Unpaired surrogates are kept as-is.
 */
const utf16beToStr = (b: Uint8Array): string => {
  let s = ''
  for (let i = 0; i + 1 < b.length; i += 2) s += String.fromCharCode((b[i]! << 8) | b[i + 1]!)
  return s
}
80
+
81
/**
 * Parse the bytes of a CMap program into a {@link CMap}.
 *
 * Recognises the `begincodespacerange`, `beginbfchar`, `beginbfrange` (both
 * the single-destination and the `[…]` array form), `begincidchar` and
 * `begincidrange` sections; every other token is skipped. Entries whose
 * operands have the wrong token type are ignored.
 *
 * A section ends at its `end…` keyword or at end of input wherever that token
 * appears, including in the middle of an entry. A truncated entry therefore
 * cannot swallow the terminator and push later sections into the wrong table.
 *
 * @param content - raw bytes of the CMap program
 * @returns the populated map
 */
export function parseCMap(content: Uint8Array): CMap {
  const cmap = new CMap()
  const lex = new Lexer(content, 0)
  // CMap keeps its tables private; the parser fills them through this
  // structural view of the same instance.
  const internal = cmap as unknown as {
    codespaces: CodespaceRange[]
    toStr: Map<number, string>
    toCid: Map<number, number>
    bfRanges: { lo: number; hi: number; base: string }[]
    cidRanges: { lo: number; hi: number; base: number }[]
  }

  const next = (): Token => lex.next()
  /** True when `tok` terminates the section closed by `endKw` (or input ended). */
  const closes = (tok: Token, endKw: string): boolean =>
    tok.type === 'eof' || (tok.type === 'kw' && tok.value === endKw)

  for (;;) {
    const t = next()
    if (t.type === 'eof') break
    if (t.type !== 'kw') continue

    if (t.value === 'begincodespacerange') {
      for (;;) {
        const lo = next()
        if (closes(lo, 'endcodespacerange')) break
        const hi = next()
        if (closes(hi, 'endcodespacerange')) break
        if (lo.type === 'str' && hi.type === 'str') {
          internal.codespaces.push({
            nbytes: lo.value.length || 1,
            low: bytesToInt(lo.value),
            high: bytesToInt(hi.value),
          })
        }
      }
    } else if (t.value === 'beginbfchar') {
      for (;;) {
        const src = next()
        if (closes(src, 'endbfchar')) break
        const dst = next()
        if (closes(dst, 'endbfchar')) break
        if (src.type === 'str' && dst.type === 'str') {
          internal.toStr.set(bytesToInt(src.value), utf16beToStr(dst.value))
        }
      }
    } else if (t.value === 'beginbfrange') {
      section: for (;;) {
        const lo = next()
        if (closes(lo, 'endbfrange')) break
        const hi = next()
        if (closes(hi, 'endbfrange')) break
        const dst = next()
        if (closes(dst, 'endbfrange')) break
        const loN = lo.type === 'str' ? bytesToInt(lo.value) : undefined
        const hiN = hi.type === 'str' ? bytesToInt(hi.value) : undefined
        const valid = loN !== undefined && hiN !== undefined
        if (dst.type === 'str') {
          if (loN !== undefined && hiN !== undefined) {
            internal.bfRanges.push({ lo: loN, hi: hiN, base: utf16beToStr(dst.value) })
          }
        } else if (dst.type === 'delim' && dst.value === '[') {
          // Array form: one destination per consecutive code starting at lo.
          // The array is always consumed, even for an invalid entry, so its
          // elements are never mistaken for the next entry's operands.
          let code = loN ?? 0
          for (;;) {
            const el = next()
            if (el.type === 'delim' && el.value === ']') break
            // Unterminated array: stop the whole section at its terminator.
            if (closes(el, 'endbfrange')) break section
            if (valid && el.type === 'str') internal.toStr.set(code++, utf16beToStr(el.value))
          }
        }
      }
    } else if (t.value === 'begincidchar') {
      for (;;) {
        const src = next()
        if (closes(src, 'endcidchar')) break
        const cid = next()
        if (closes(cid, 'endcidchar')) break
        if (src.type === 'str' && cid.type === 'num') {
          internal.toCid.set(bytesToInt(src.value), cid.value)
        }
      }
    } else if (t.value === 'begincidrange') {
      for (;;) {
        const lo = next()
        if (closes(lo, 'endcidrange')) break
        const hi = next()
        if (closes(hi, 'endcidrange')) break
        const cid = next()
        if (closes(cid, 'endcidrange')) break
        if (lo.type === 'str' && hi.type === 'str' && cid.type === 'num') {
          internal.cidRanges.push({
            lo: bytesToInt(lo.value),
            hi: bytesToInt(hi.value),
            base: cid.value,
          })
        }
      }
    }
  }
  return cmap
}
@@ -0,0 +1,226 @@
1
+ /**
2
+ * Standard security handler (spec §7.6), empty user password only.
3
+ *
4
+ * Supports RC4 (40/128-bit, V1–V2), AES-128 (V4/R4) and AES-256 (V5/R5–R6).
5
+ * If the empty user password does not validate — i.e. the file needs a real
6
+ * password — or the handler is non-standard, an {@link EncryptedPdfError} is
7
+ * thrown. Strings and streams are decrypted after parsing, before filtering.
8
+ */
9
+
10
+ import { createHash, createDecipheriv, createCipheriv } from 'node:crypto'
11
+ import {
12
+ type PdfObject,
13
+ type PdfDictionary,
14
+ isNum,
15
+ isStr,
16
+ isName,
17
+ isDict,
18
+ } from '../objects/types.ts'
19
+ import { EncryptedPdfError } from '../util/errors.ts'
20
+
21
/**
 * The 32-byte padding string of the standard security handler (spec §7.6).
 * With an empty password the padded password is exactly these bytes.
 */
const PAD = new Uint8Array([
  0x28, 0xbf, 0x4e, 0x5e, 0x4e, 0x75, 0x8a, 0x41,
  0x64, 0x00, 0x4e, 0x56, 0xff, 0xfa, 0x01, 0x08,
  0x2e, 0x2e, 0x00, 0xb6, 0xd0, 0x68, 0x3e, 0x80,
  0x2f, 0x0c, 0xa9, 0xfe, 0x64, 0x53, 0x69, 0x7a,
])
25
+
26
/** Digest `b` with the named hash algorithm and return the raw digest bytes. */
const sha = (algo: string, b: Uint8Array): Uint8Array => {
  const hasher = createHash(algo)
  hasher.update(b)
  return Uint8Array.from(hasher.digest())
}
/** MD5 digest of `b` as raw bytes. */
const md5 = (b: Uint8Array): Uint8Array => sha('md5', b)
30
+
31
/** Concatenate byte arrays, in argument order, into one freshly allocated Uint8Array. */
function cat(...parts: Uint8Array[]): Uint8Array {
  let total = 0
  for (const part of parts) total += part.length
  const joined = new Uint8Array(total)
  parts.reduce((offset, part) => {
    joined.set(part, offset)
    return offset + part.length
  }, 0)
  return joined
}
41
+
42
/**
 * RC4 stream cipher: returns `data` XORed with the keystream derived from
 * `key`. Running it again with the same key restores the input.
 */
function rc4(key: Uint8Array, data: Uint8Array): Uint8Array {
  // Key scheduling: start from the identity permutation and mix in the key.
  const state = new Uint8Array(256)
  for (let n = 0; n < 256; n++) state[n] = n
  let mix = 0
  for (let n = 0; n < 256; n++) {
    mix = (mix + state[n]! + key[n % key.length]!) & 0xff
    const held = state[n]!
    state[n] = state[mix]!
    state[mix] = held
  }

  // Keystream generation, XORed onto the payload byte by byte.
  const result = new Uint8Array(data.length)
  let x = 0
  let y = 0
  for (let pos = 0; pos < data.length; pos++) {
    x = (x + 1) & 0xff
    y = (y + state[x]!) & 0xff
    const held = state[x]!
    state[x] = state[y]!
    state[y] = held
    result[pos] = data[pos]! ^ state[(state[x]! + state[y]!) & 0xff]!
  }
  return result
}
61
+
62
/**
 * AES-CBC decrypt `data` with a 16-byte (AES-128) or 32-byte (AES-256) key.
 *
 * When `iv` is omitted, the first 16 bytes of `data` are the IV and the rest
 * is ciphertext; when `iv` is given, all of `data` is ciphertext.
 *
 * Only whole 16-byte blocks are decrypted. A trailing partial block is
 * ignored rather than making the cipher throw on damaged input. PKCS#7
 * padding is removed only when every padding byte carries the pad length;
 * otherwise the decrypted blocks are returned untouched.
 *
 * @param key - AES key; 32 bytes selects AES-256, anything else AES-128
 * @param data - IV-prefixed ciphertext, or bare ciphertext when `iv` is given
 * @param iv - explicit initialisation vector
 * @returns the plaintext, or an empty array when there is no whole block
 */
function aesCbcDecrypt(key: Uint8Array, data: Uint8Array, iv?: Uint8Array): Uint8Array {
  if (data.length < 16) return new Uint8Array(0)
  const useIv = iv ?? data.subarray(0, 16)
  const body = iv ? data : data.subarray(16)
  // With auto-padding off, final() throws unless the input is block-aligned.
  const whole = body.subarray(0, body.length - (body.length % 16))
  if (whole.length === 0) return new Uint8Array(0)
  const algo = key.length === 32 ? 'aes-256-cbc' : 'aes-128-cbc'
  const d = createDecipheriv(algo, key, useIv)
  d.setAutoPadding(false)
  const out = Buffer.concat([d.update(whole), d.final()])
  // Strip PKCS#7 padding only when all `pad` trailing bytes equal `pad`.
  const pad = out[out.length - 1]!
  let padded = pad >= 1 && pad <= 16
  for (let i = out.length - pad; padded && i < out.length; i++) padded = out[i] === pad
  return new Uint8Array(padded ? out.subarray(0, out.length - pad) : out)
}
77
+
78
/**
 * Algorithm 2.B — the R6 hardened hash.
 *
 * Starts from SHA-256(pwd ‖ salt ‖ udata). Each round AES-128-CBC-encrypts
 * (no padding) 64 copies of pwd ‖ K ‖ udata, keyed by the first 16 bytes of K
 * with the next 16 as IV, then re-hashes the ciphertext E with SHA-256/384/512
 * chosen by the first 16 bytes of E modulo 3.
 *
 * Rounds are counted from 1. The loop runs at least 64 rounds and stops after
 * the first round whose ciphertext's last byte is ≤ (round count − 32), so the
 * threshold is 32 after round 64. This matches how other readers count.
 *
 * @param pwd - password bytes (empty for the empty user password)
 * @param salt - 8-byte validation or key salt
 * @param udata - extra user-key data mixed into every round (may be empty)
 * @returns the first 32 bytes of the final hash
 */
function hash2B(pwd: Uint8Array, salt: Uint8Array, udata: Uint8Array): Uint8Array {
  const digest = (algo: string, data: Uint8Array): Uint8Array =>
    new Uint8Array(createHash(algo).update(data).digest())

  let K = digest('sha256', Buffer.concat([pwd, salt, udata]))
  for (let rounds = 1; ; rounds++) {
    const block = Buffer.concat([pwd, K, udata])
    const K1 = new Uint8Array(block.length * 64)
    for (let i = 0; i < 64; i++) K1.set(block, i * block.length)
    const c = createCipheriv('aes-128-cbc', K.subarray(0, 16), K.subarray(16, 32))
    c.setAutoPadding(false)
    const E = new Uint8Array(Buffer.concat([c.update(K1), c.final()]))
    // 256 ≡ 1 (mod 3), so the byte sum has the same remainder as the
    // 16-byte big-endian integer.
    let mod = 0
    for (let i = 0; i < 16; i++) mod += E[i]!
    mod %= 3
    K = digest(mod === 0 ? 'sha256' : mod === 1 ? 'sha384' : 'sha512', E)
    if (rounds >= 64 && E[E.length - 1]! <= rounds - 32) break
  }
  return K.subarray(0, 32)
}
96
+
97
/** Decryption service for the strings and streams of an encrypted document. */
export interface Decryptor {
  /** Object number of the /Encrypt dict (never itself decrypted). */
  readonly encryptObjNum: number
  /**
   * Decrypt a string/stream payload for object (num,gen).
   *
   * @param num - object number owning the payload
   * @param gen - generation number of that object
   * @param data - encrypted bytes
   * @param isString - whether the payload is a string rather than a stream
   * @returns the decrypted bytes
   */
  decrypt(num: number, gen: number, data: Uint8Array, isString: boolean): Uint8Array
}
103
+
104
/** Bytes of a PDF string object, or an empty array when `o` is missing or not a string. */
const strBytes = (o: PdfObject | undefined): Uint8Array => {
  if (!o) return new Uint8Array(0)
  return isStr(o) ? o.value : new Uint8Array(0)
}
106
+
107
/**
 * Build a decryptor from the trailer's /Encrypt dict and /ID, validating the
 * empty user password. `encryptObjNum` is the /Encrypt indirect object number.
 *
 * The crypt method is AESV3 for V ≥ 5; for V = 4 it is read from the
 * /CF entry named by /StmF (falling back to RC4); otherwise RC4.
 *
 * @param enc - the /Encrypt dictionary
 * @param idFirst - /ID bytes mixed into the R2–R4 key derivation (presumably
 *   the first string of the trailer /ID array; confirm against the caller)
 * @param encryptObjNum - passed through unchanged on the returned decryptor
 * @returns a decryptor bound to the derived file key
 * @throws EncryptedPdfError when /Filter is not /Standard, when the empty
 *   user password does not validate against /U, or when a V ≥ 5 dictionary
 *   has a /UE entry shorter than 32 bytes
 */
export function buildDecryptor(
  enc: PdfDictionary,
  idFirst: Uint8Array,
  encryptObjNum: number,
): Decryptor {
  const filter = enc.entries.get('Filter')
  if (!filter || !isName(filter) || filter.value !== 'Standard') {
    throw new EncryptedPdfError('Unsupported security handler (only /Standard)')
  }
  /** Numeric entry `k` of the /Encrypt dict, or `d` when absent or not a number. */
  const numOf = (k: string, d = 0): number => {
    const v = enc.entries.get(k)
    return v && isNum(v) ? v.value : d
  }
  const V = numOf('V', 0)
  const R = numOf('R', 0)
  const O = strBytes(enc.entries.get('O'))
  const U = strBytes(enc.entries.get('U'))
  // `| 0` folds /P into a signed 32-bit integer before its bytes are taken.
  const P = numOf('P', 0) | 0
  const lengthBits = numOf('Length', 40)

  // Determine the algorithm: V5 → AES-256; V4 → /CF /StdCF /CFM; else RC4.
  let cfm: 'V2' | 'AESV2' | 'AESV3' = 'V2'
  if (V >= 5) cfm = 'AESV3'
  else if (V === 4) {
    const cf = enc.entries.get('CF')
    const stmF = enc.entries.get('StmF')
    if (cf && isDict(cf) && stmF && isName(stmF)) {
      const std = cf.entries.get(stmF.value)
      if (std && isDict(std)) {
        const m = std.entries.get('CFM')
        if (m && isName(m)) {
          cfm = m.value === 'AESV3' ? 'AESV3' : m.value === 'AESV2' ? 'AESV2' : 'V2'
        }
      }
    }
  }

  let fileKey: Uint8Array
  if (V >= 5) {
    // AES-256 (R5/R6). Validate empty user password against /U.
    // /U layout: 32-byte hash, 8-byte validation salt, 8-byte key salt.
    const valSalt = U.subarray(32, 40)
    const keySalt = U.subarray(40, 48)
    const empty = new Uint8Array(0)
    const check =
      R === 5 ? sha('sha256', cat(empty, valSalt)) : hash2B(empty, valSalt, empty)
    if (Buffer.compare(Buffer.from(check), Buffer.from(U.subarray(0, 32))) !== 0) {
      throw new EncryptedPdfError('PDF requires a user password')
    }
    const ikey =
      R === 5 ? sha('sha256', cat(empty, keySalt)) : hash2B(empty, keySalt, empty)
    // /UE is the 32-byte file key encrypted with AES-256-CBC, a zero IV and
    // no padding. It is decrypted here directly rather than via
    // aesCbcDecrypt: that helper strips trailing bytes that look like PKCS#7
    // padding, which would truncate a key ending in 0x01–0x10.
    const UE = strBytes(enc.entries.get('UE'))
    if (UE.length < 32) {
      throw new EncryptedPdfError('Malformed /UE entry in encryption dictionary')
    }
    const ueDecipher = createDecipheriv('aes-256-cbc', ikey, new Uint8Array(16))
    ueDecipher.setAutoPadding(false)
    fileKey = new Uint8Array(
      Buffer.concat([ueDecipher.update(UE.subarray(0, 32)), ueDecipher.final()]),
    )
  } else {
    // RC4 / AES-128 (R2–R4). Algorithm 2 with the empty password.
    const keyLen = R === 2 ? 5 : lengthBits / 8
    // /P as four little-endian bytes.
    const pBytes = Uint8Array.from([P & 0xff, (P >> 8) & 0xff, (P >> 16) & 0xff, (P >> 24) & 0xff])
    const encMeta = enc.entries.get('EncryptMetadata')
    const metaFalse = encMeta && encMeta.kind === 'bool' && encMeta.value === false
    let h = md5(
      cat(
        PAD,
        O,
        pBytes,
        idFirst,
        R >= 4 && metaFalse ? Uint8Array.from([0xff, 0xff, 0xff, 0xff]) : new Uint8Array(0),
      ),
    )
    if (R >= 3) for (let i = 0; i < 50; i++) h = md5(h.subarray(0, keyLen))
    fileKey = h.subarray(0, keyLen)

    // Validate the empty password by reproducing /U (Algorithm 4/5).
    let expectedU: Uint8Array
    if (R === 2) {
      expectedU = rc4(fileKey, PAD)
    } else {
      const idHash = md5(cat(PAD, idFirst))
      let x = rc4(fileKey, idHash)
      // 19 further RC4 passes, each keyed with the file key XORed by the pass number.
      for (let i = 1; i <= 19; i++) {
        const k = new Uint8Array(fileKey.length)
        for (let j = 0; j < k.length; j++) k[j] = fileKey[j]! ^ i
        x = rc4(k, x)
      }
      expectedU = x
    }
    // R2 compares all 32 bytes of /U; later revisions only the first 16.
    // NOTE(review): a /U shorter than cmpLen skips validation entirely.
    const cmpLen = R === 2 ? 32 : 16
    if (
      U.length >= cmpLen &&
      Buffer.compare(
        Buffer.from(expectedU.subarray(0, cmpLen)),
        Buffer.from(U.subarray(0, cmpLen)),
      ) !== 0
    ) {
      throw new EncryptedPdfError('PDF requires a user password')
    }
  }

  /**
   * Per-object key: MD5 of the file key, the low 3 bytes of the object number
   * and low 2 bytes of the generation (little-endian), plus the "sAlT" marker
   * for AES. Truncated to min(file key length + 5, 16) bytes.
   */
  const objectKey = (num: number, gen: number, aes: boolean): Uint8Array => {
    const ext = cat(
      fileKey,
      Uint8Array.from([num & 0xff, (num >> 8) & 0xff, (num >> 16) & 0xff]),
      Uint8Array.from([gen & 0xff, (gen >> 8) & 0xff]),
      aes ? Uint8Array.from([0x73, 0x41, 0x6c, 0x54]) : new Uint8Array(0),
    )
    return md5(ext).subarray(0, Math.min(fileKey.length + 5, 16))
  }

  return {
    encryptObjNum,
    decrypt(num, gen, data, _isString): Uint8Array {
      if (data.length === 0) return data
      // AESV3 uses the file key directly; AESV2 and RC4 use a per-object key.
      if (cfm === 'AESV3') return aesCbcDecrypt(fileKey, data)
      if (cfm === 'AESV2') return aesCbcDecrypt(objectKey(num, gen, true), data)
      return rc4(objectKey(num, gen, false), data)
    },
  }
}