@strav/pdf 0.4.17 → 0.4.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,246 @@
1
+ /**
2
+ * Read-side document model. Owns the byte buffer, the merged xref table and
3
+ * the (optional) decryptor; resolves indirect objects with a cache and a
4
+ * cycle guard; materialises compressed object streams lazily; and walks the
5
+ * page tree with attribute inheritance (spec §7.7).
6
+ */
7
+
8
+ import {
9
+ type PdfObject,
10
+ type PdfDictionary,
11
+ type PdfStream,
12
+ isRef,
13
+ isDict,
14
+ isStream,
15
+ isArr,
16
+ isName,
17
+ isNum,
18
+ isStr,
19
+ } from '../objects/types.ts'
20
+ import { PdfParseError, EncryptedPdfError } from '../util/errors.ts'
21
+ import { decodeStream } from '../streams/decode.ts'
22
+ import { Lexer } from './lexer.ts'
23
+ import { ObjectParser } from './object_parser.ts'
24
+ import { parseXref, bruteForceXref, type XrefTable } from './xref.ts'
25
+ import { parseObjStm } from './objstm.ts'
26
+ import { buildDecryptor, type Decryptor } from './decrypt.ts'
27
+
28
/** Page attributes a leaf page takes from its ancestor nodes when it does not set them itself. */
const INHERITED = ['Resources', 'MediaBox', 'CropBox', 'Rotate'] as const

/**
 * Read-side document model.
 *
 * Holds the raw file bytes, the cross-reference table and, when the trailer
 * has an /Encrypt entry, a decryptor. Indirect objects are resolved on demand
 * and cached by object number; a set of in-flight object numbers stops
 * self-referential files from recursing without bound.
 */
export class PdfReaderDocument {
  readonly xref: XrefTable
  /** Resolved indirect objects, keyed by object number. */
  private readonly cache = new Map<number, PdfObject>()
  /** Parsed object streams, keyed by the object number of the stream itself. */
  private readonly objStmCache = new Map<number, Map<number, PdfObject>>()
  /** Object numbers whose resolution is currently on the call stack. */
  private readonly resolving = new Set<number>()
  private readonly decryptor?: Decryptor

  /**
   * @param buf  Complete PDF file bytes.
   * @param opts `password` must be empty or absent.
   * @throws EncryptedPdfError when a non-empty password is supplied.
   */
  constructor(
    readonly buf: Uint8Array,
    opts: { password?: string } = {},
  ) {
    if (opts.password) {
      // M13 only validates the empty user password.
      throw new EncryptedPdfError(
        'Password-protected PDFs are not supported (empty password only)',
      )
    }
    // Fall back to a brute-force scan when the regular xref cannot be parsed
    // or its trailer has no /Root.
    let xref: XrefTable
    try {
      xref = parseXref(buf)
      if (!xref.trailer.entries.has('Root')) xref = bruteForceXref(buf)
    } catch {
      xref = bruteForceXref(buf)
    }
    this.xref = xref

    const encEntry = xref.trailer.entries.get('Encrypt')
    if (encEntry) {
      // -1 marks an /Encrypt dictionary stored directly in the trailer.
      const encNum = isRef(encEntry) ? encEntry.num : -1
      const encDict = this.resolve(encEntry)
      const idArr = xref.trailer.entries.get('ID')
      // First element of the trailer /ID array, or empty bytes when absent.
      const idFirst =
        idArr && isArr(idArr) && idArr.items[0] && isStr(idArr.items[0]!)
          ? idArr.items[0]!.value
          : new Uint8Array(0)
      if (encDict && isDict(encDict)) {
        this.decryptor = buildDecryptor(encDict, idFirst, encNum)
      }
    }
  }

  // ── Object resolution ────────────────────────────────────────────────────

  /**
   * Resolve an indirect object.
   *
   * @param numOrRef Object number, a reference, or any other object (which is
   *                 returned unchanged).
   * @param gen      Generation used for decryption when `numOrRef` is a bare
   *                 number and the xref entry carries none.
   * @returns The object, or a `null` object when it is missing from the xref,
   *          fails to parse, or is requested while already being resolved.
   */
  getObject(numOrRef: number | PdfObject, gen = 0): PdfObject {
    let num: number
    let g = gen
    if (typeof numOrRef === 'number') num = numOrRef
    else if (isRef(numOrRef)) {
      num = numOrRef.num
      g = numOrRef.gen
    } else return numOrRef

    const cached = this.cache.get(num)
    if (cached) return cached

    const entry = this.xref.entries.get(num)
    if (!entry) return { kind: 'null' }

    // Cycle guard: a stream whose /Length points back at itself, or an object
    // stream listed as living inside itself, would otherwise recurse until
    // the stack overflows. The re-entrant request sees `null`; the outer call
    // still completes and caches the real value.
    if (this.resolving.has(num)) return { kind: 'null' }
    this.resolving.add(num)
    try {
      let value: PdfObject
      if (entry.type === 'n') {
        try {
          const parser = new ObjectParser(new Lexer(this.buf, entry.offset), (o) =>
            this.toNumber(o),
          )
          const parsed = parser.parseIndirectAt(entry.offset)
          value = parsed.value
          // The /Encrypt dictionary itself is left as parsed.
          if (this.decryptor && num !== this.decryptor.encryptObjNum) {
            value = this.decryptDeep(value, num, entry.gen ?? g)
          }
        } catch (e) {
          // A malformed object reads as null (and is not cached); anything
          // else is unexpected and propagates.
          if (e instanceof PdfParseError) return { kind: 'null' }
          throw e
        }
      } else {
        value = this.fromObjStm(entry.streamObj, entry.index, num)
      }
      this.cache.set(num, value)
      return value
    } finally {
      this.resolving.delete(num)
    }
  }

  /** Dereference one level (ref → object); pass-through otherwise. */
  resolve(o: PdfObject | undefined): PdfObject | undefined {
    if (!o) return undefined
    return isRef(o) ? this.getObject(o) : o
  }

  /** Numeric value of `o` after dereferencing, or `undefined` when it is not a number. */
  private toNumber(o: PdfObject): number | undefined {
    const r = this.resolve(o)
    return r && isNum(r) ? r.value : undefined
  }

  /**
   * Fetch object `want` from object stream `streamObj`, parsing and caching
   * the whole stream on first use. `index` is accepted but the lookup is by
   * object number.
   */
  private fromObjStm(streamObj: number, index: number, want: number): PdfObject {
    let contents = this.objStmCache.get(streamObj)
    if (!contents) {
      const stm = this.getObject(streamObj)
      if (!isStream(stm)) return { kind: 'null' }
      const data = this.getStreamData(stm, streamObj)
      contents = parseObjStm(stm.dict, data).objects
      this.objStmCache.set(streamObj, contents)
    }
    return contents.get(want) ?? { kind: 'null' }
  }

  // ── Streams ──────────────────────────────────────────────────────────────

  /**
   * Decrypt (if needed) then run the filter chain.
   *
   * @param stream Stream object whose payload is wanted.
   * @param objNum Object number of the stream, used for decryption.
   * @param gen    Generation number of the stream, used for decryption.
   */
  getStreamData(stream: PdfStream, objNum: number, gen = 0): Uint8Array {
    let raw = stream.data
    const type = stream.dict.entries.get('Type')
    const isXref = type && isName(type) && type.value === 'XRef'
    // /Type /XRef streams and the /Encrypt object skip decryption.
    if (this.decryptor && !isXref && objNum !== this.decryptor.encryptObjNum) {
      raw = this.decryptor.decrypt(objNum, gen, raw, false)
    }
    return decodeStream(stream.dict, raw, (o) => this.resolve(o))
  }

  /**
   * Return a copy of `o` with every string it contains (inside arrays, dicts
   * and stream dictionaries) decrypted under object `num`/`gen`. Stream
   * payloads are left untouched here; `getStreamData` decrypts them.
   */
  private decryptDeep(o: PdfObject, num: number, gen: number): PdfObject {
    const d = this.decryptor!
    const walk = (x: PdfObject): PdfObject => {
      if (x.kind === 'str') {
        return { ...x, value: d.decrypt(num, gen, x.value, true) }
      }
      if (x.kind === 'arr') return { kind: 'arr', items: x.items.map(walk) }
      if (x.kind === 'dict') {
        const e = new Map<string, PdfObject>()
        for (const [k, v] of x.entries) e.set(k, walk(v))
        return { kind: 'dict', entries: e }
      }
      if (x.kind === 'stream') {
        const e = new Map<string, PdfObject>()
        for (const [k, v] of x.dict.entries) e.set(k, walk(v))
        return { kind: 'stream', dict: { kind: 'dict', entries: e }, data: x.data }
      }
      return x
    }
    return walk(o)
  }

  // ── Catalog / pages ──────────────────────────────────────────────────────

  /** The trailer dictionary of the cross-reference table in use. */
  get trailer(): PdfDictionary {
    return this.xref.trailer
  }

  /**
   * The document catalog (trailer /Root).
   * @throws PdfParseError when /Root is missing or is not a dictionary.
   */
  catalog(): PdfDictionary {
    const root = this.resolve(this.trailer.entries.get('Root'))
    if (!root || !isDict(root)) throw new PdfParseError('Missing document catalog')
    return root
  }

  /**
   * Leaf page dictionaries in document order, with inherited attributes.
   *
   * Each returned dictionary is a fresh copy: inherited entries first, then
   * the page's own entries on top.
   *
   * @throws PdfParseError when the catalog has no usable /Pages dictionary.
   */
  pages(): PdfDictionary[] {
    const out: PdfDictionary[] = []
    // Nodes already visited (by identity), so a looping /Kids graph terminates.
    const seen = new Set<PdfObject>()
    const root = this.resolve(this.catalog().entries.get('Pages'))
    if (!root || !isDict(root)) throw new PdfParseError('Missing page tree root')

    const visit = (nodeRef: PdfObject | undefined, inherited: Map<string, PdfObject>) => {
      const node = this.resolve(nodeRef)
      if (!node || !isDict(node) || seen.has(node)) return
      seen.add(node)
      const merged = new Map(inherited)
      for (const key of INHERITED) {
        const v = node.entries.get(key)
        if (v) merged.set(key, v)
      }
      const type = node.entries.get('Type')
      const kids = node.entries.get('Kids')
      if (kids && isArr(kids)) {
        for (const kid of kids.items) visit(kid, merged)
      } else if (!type || (isName(type) && type.value === 'Page') || node.entries.has('Contents')) {
        // Leaf: no /Kids and either untyped, /Type /Page, or carrying /Contents.
        const leaf = new Map(merged)
        for (const [k, v] of node.entries) leaf.set(k, v)
        out.push({ kind: 'dict', entries: leaf })
      }
    }
    visit(root, new Map())
    return out
  }

  /**
   * Concatenated, decoded content-stream bytes for a page. A newline (0x0A)
   * follows each stream's bytes.
   */
  pageContent(page: PdfDictionary): Uint8Array {
    const c = this.resolve(page.entries.get('Contents'))
    // Object number and generation are kept with each stream because
    // decryption depends on both.
    const found: Array<{ stream: PdfStream; num: number; gen: number }> = []
    const collect = (obj: PdfObject | undefined, ref?: PdfObject) => {
      const r = this.resolve(obj)
      if (r && isStream(r)) {
        if (ref && isRef(ref)) found.push({ stream: r, num: ref.num, gen: ref.gen })
        else found.push({ stream: r, num: -1, gen: 0 })
      }
    }
    if (c && isArr(c)) {
      const raw = page.entries.get('Contents')
      const items = raw && isArr(raw) ? raw.items : c.items
      for (const it of items) collect(it, it)
    } else {
      collect(c, page.entries.get('Contents'))
    }
    const parts: Uint8Array[] = []
    for (const { stream, num, gen } of found) {
      parts.push(this.getStreamData(stream, num, gen))
      parts.push(Uint8Array.of(0x0a))
    }
    const total = parts.reduce((a, p) => a + p.length, 0)
    const out = new Uint8Array(total)
    let o = 0
    for (const p of parts) {
      out.set(p, o)
      o += p.length
    }
    return out
  }

  /** True when a decryptor was built or the trailer carries an /Encrypt entry. */
  get encrypted(): boolean {
    return this.decryptor !== undefined || this.trailer.entries.has('Encrypt')
  }
}
@@ -0,0 +1,73 @@
1
+ /**
2
+ * Single-byte text encodings (spec §D) for the read side, plus a glyph-name →
3
+ * Unicode resolver (Adobe Glyph List subset + the algorithmic `uniXXXX` /
4
+ * `uXXXXXX` forms). Used when a simple font has no `/ToUnicode`: the base
5
+ * encoding maps code → glyph name → Unicode.
6
+ *
7
+ * WinAnsi is implemented exactly (it is what the writer emits for Standard-14
8
+ * and the common case for simple fonts). Standard/MacRoman/PDFDoc share
9
+ * WinAnsi for ASCII and Latin-1 and only differ in the punctuation high range;
10
+ * those differences are approximated and documented as a v1 limitation.
11
+ */
12
+
13
// Codes in 0x80–0x9F where CP1252 departs from Latin-1. Codes missing from
// this table, and every code outside that window, map to themselves.
const WIN_HIGH: Record<number, number> = {
  0x80: 0x20ac, // euro sign
  0x82: 0x201a, // single low-9 quotation mark
  0x83: 0x0192, // f with hook (florin)
  0x84: 0x201e, // double low-9 quotation mark
  0x85: 0x2026, // horizontal ellipsis
  0x86: 0x2020, // dagger
  0x87: 0x2021, // double dagger
  0x88: 0x02c6, // modifier circumflex
  0x89: 0x2030, // per mille
  0x8a: 0x0160, // S caron
  0x8b: 0x2039, // single left angle quotation mark
  0x8c: 0x0152, // OE ligature
  0x8e: 0x017d, // Z caron
  0x91: 0x2018, // left single quotation mark
  0x92: 0x2019, // right single quotation mark
  0x93: 0x201c, // left double quotation mark
  0x94: 0x201d, // right double quotation mark
  0x95: 0x2022, // bullet
  0x96: 0x2013, // en dash
  0x97: 0x2014, // em dash
  0x98: 0x02dc, // small tilde
  0x99: 0x2122, // trade mark
  0x9a: 0x0161, // s caron
  0x9b: 0x203a, // single right angle quotation mark
  0x9c: 0x0153, // oe ligature
  0x9e: 0x017e, // z caron
  0x9f: 0x0178, // Y dieresis
}

/**
 * Map a WinAnsi (CP1252) character code to a Unicode code point.
 *
 * @param code Character code.
 * @returns The table entry for codes in 0x80–0x9F that have one; otherwise
 *          `code` unchanged.
 */
export function winAnsiToUnicode(code: number): number {
  const insideWindow = code >= 0x80 && code <= 0x9f
  if (!insideWindow) return code // ASCII + Latin-1 are identity
  const mapped = WIN_HIGH[code]
  return mapped === undefined ? code : mapped
}
28
+
29
/** Encoding names accepted by `baseEncode`. */
export type BaseEncodingName =
  'WinAnsiEncoding' | 'MacRomanEncoding' | 'StandardEncoding' | 'PDFDocEncoding' | 'MacExpertEncoding'
35
+
36
/**
 * Resolve a character code under a named base encoding.
 *
 * Every encoding name currently goes through the WinAnsi mapping, so the
 * result is exact for WinAnsi and approximate for the others.
 *
 * @param name Base encoding name; it does not influence the result.
 * @param code Character code to map.
 * @returns The Unicode code point produced by `winAnsiToUnicode(code)`.
 */
export function baseEncode(name: BaseEncodingName | undefined, code: number): number {
  // Glyph-name /Differences are applied by the caller and take precedence
  // over whatever this returns.
  const unicode = winAnsiToUnicode(code)
  return unicode
}
42
+
43
// A pragmatic Adobe Glyph List subset: ASCII punctuation/digits plus the
// common typographic names producers emit via /Differences. The single-letter
// names A–Z / a–z are handled in code below rather than listed here.
const AGL: Record<string, number> = {
  space: 0x20, exclam: 0x21, quotedbl: 0x22, numbersign: 0x23, dollar: 0x24,
  percent: 0x25, ampersand: 0x26, quotesingle: 0x27, parenleft: 0x28,
  parenright: 0x29, asterisk: 0x2a, plus: 0x2b, comma: 0x2c, hyphen: 0x2d,
  period: 0x2e, slash: 0x2f, zero: 0x30, one: 0x31, two: 0x32, three: 0x33,
  four: 0x34, five: 0x35, six: 0x36, seven: 0x37, eight: 0x38, nine: 0x39,
  colon: 0x3a, semicolon: 0x3b, less: 0x3c, equal: 0x3d, greater: 0x3e,
  question: 0x3f, at: 0x40, bracketleft: 0x5b, backslash: 0x5c,
  bracketright: 0x5d, asciicircum: 0x5e, underscore: 0x5f, grave: 0x60,
  braceleft: 0x7b, bar: 0x7c, braceright: 0x7d, asciitilde: 0x7e,
  bullet: 0x2022, endash: 0x2013, emdash: 0x2014, quoteleft: 0x2018,
  quoteright: 0x2019, quotedblleft: 0x201c, quotedblright: 0x201d,
  quotesinglbase: 0x201a, quotedblbase: 0x201e, ellipsis: 0x2026,
  dagger: 0x2020, daggerdbl: 0x2021, perthousand: 0x2030, trademark: 0x2122,
  fi: 0xfb01, fl: 0xfb02, florin: 0x192, Euro: 0x20ac, nbspace: 0xa0,
}

// Algorithmic glyph-name forms, hoisted so they are compiled once.
const UNI_FORM = /^uni([0-9A-Fa-f]{4})$/
const U_FORM = /^u([0-9A-Fa-f]{4,6})$/

/** Pass through a valid Unicode scalar value; map surrogates and out-of-range values to -1. */
function scalarOrUnknown(cp: number): number {
  if (cp > 0x10ffff) return -1
  if (cp >= 0xd800 && cp <= 0xdfff) return -1
  return cp
}

/**
 * glyph name → Unicode code point, or -1 if unknown.
 *
 * Resolution order:
 *  1. anything from the first `.` onward is dropped (`a.alt` → `a`);
 *  2. the AGL subset table;
 *  3. a single ASCII letter stands for itself (`A` → U+0041);
 *  4. `uniXXXX`, then `uXXXX`–`uXXXXXX` (hex), rejected when the value is a
 *     surrogate or lies above U+10FFFF.
 *
 * @param g Glyph name without the leading slash.
 * @returns A Unicode scalar value, or -1 when the name carries no Unicode
 *          information.
 */
export function glyphNameToUnicode(g: string): number {
  const dot = g.indexOf('.')
  const name = dot === -1 ? g : g.slice(0, dot)
  if (name === '') return -1 // e.g. ".notdef"

  // Own-property check: `in` would also match inherited keys such as
  // "constructor" or "toString" and hand back a non-number.
  if (Object.prototype.hasOwnProperty.call(AGL, name)) return AGL[name]!

  // Basic Latin letters are named by themselves in the Adobe Glyph List.
  if (/^[A-Za-z]$/.test(name)) return name.charCodeAt(0)

  let m = UNI_FORM.exec(name)
  if (m) return scalarOrUnknown(parseInt(m[1]!, 16))
  m = U_FORM.exec(name)
  if (m) return scalarOrUnknown(parseInt(m[1]!, 16))

  // "gNN" / "cidNN" / "indexNN": no Unicode information available.
  return -1
}
@@ -0,0 +1,152 @@
1
+ /**
2
+ * Public read-side API (M13): layout-aware plain-text extraction from an
3
+ * existing PDF. `extractText` is the headline ergonomic entry point;
4
+ * `PdfReader` is a reusable handle for lazy/repeated page access.
5
+ *
6
+ * Scope: text content only. No OCR (scanned/image-only pages yield no text),
7
+ * no column/table reconstruction, no annotation/form-field values, empty
8
+ * user password only.
9
+ */
10
+
11
+ import { type PdfDictionary, isDict, isStr } from '../objects/types.ts'
12
+ import { PdfReaderDocument } from './document.ts'
13
+ import { interpretText } from './text_interpreter.ts'
14
+ import { runsToText } from './layout.ts'
15
+
16
/** Options accepted by the text-extraction entry points. */
export interface ExtractOptions {
  /**
   * Which pages to extract, 1-based: a single page number, a list of page
   * numbers, or an inclusive `from`/`to` range. All pages when omitted.
   */
  pages?:
    | number
    | number[]
    | { from?: number; to?: number }

  /** Collapse whitespace and trim. Defaults to `true`. */
  normalizeWhitespace?: boolean

  /** Only the empty password is supported; a non-empty value throws. */
  password?: string
}
24
+
25
/** Text extracted from one page. */
export interface ExtractedPage {
  /** Page number as selected by the caller (1-based). */
  number: number
  /** Extracted text of that page. */
  text: string
}
29
+
30
/**
 * Document summary: page count and encryption flag, plus whichever string
 * entries are present in the trailer's Info dictionary.
 */
export interface PdfInfo {
  /** Number of leaf pages found in the page tree. */
  pageCount: number
  /** Whether the document is encrypted. */
  encrypted: boolean

  /** Info dictionary `Title`. */
  title?: string
  /** Info dictionary `Author`. */
  author?: string
  /** Info dictionary `Subject`. */
  subject?: string
  /** Info dictionary `Keywords`. */
  keywords?: string
  /** Info dictionary `Creator`. */
  creator?: string
  /** Info dictionary `Producer`. */
  producer?: string
  /** Info dictionary `CreationDate`, as the decoded string (not parsed). */
  creationDate?: string
  /** Info dictionary `ModDate`, as the decoded string (not parsed). */
  modDate?: string
}
42
+
43
/** Outcome of a text extraction run. */
export interface ExtractResult {
  /** One entry per selected page. */
  pages: ExtractedPage[]

  /** Page texts joined by the form-feed page separator. */
  text: string

  /** Document summary. */
  info: PdfInfo
}
49
+
50
/** Return `b` itself when it is already a `Uint8Array`; otherwise a `Uint8Array` view over the buffer. */
function toU8(b: Uint8Array | ArrayBuffer): Uint8Array {
  if (b instanceof Uint8Array) return b
  return new Uint8Array(b)
}
53
+
54
/**
 * Decode a PDF text string.
 *
 * - `FE FF` prefix: UTF-16BE. An odd trailing byte is ignored.
 * - `EF BB BF` prefix: UTF-8 (allowed for text strings since PDF 2.0).
 *   Malformed sequences become U+FFFD.
 * - Otherwise each byte is taken as the code point of the same value
 *   (Latin-1, used as an approximation of PDFDocEncoding).
 *
 * @param bytes Raw string bytes.
 * @returns The decoded text without the byte-order mark.
 */
function decodeTextString(bytes: Uint8Array): string {
  const len = bytes.length
  if (len >= 2 && bytes[0] === 0xfe && bytes[1] === 0xff) {
    let s = ''
    // Code units are appended as-is, so surrogate pairs survive intact.
    for (let i = 2; i + 1 < len; i += 2) s += String.fromCharCode((bytes[i]! << 8) | bytes[i + 1]!)
    return s
  }
  if (len >= 3 && bytes[0] === 0xef && bytes[1] === 0xbb && bytes[2] === 0xbf) {
    // Without this branch a UTF-8 string would be read byte-per-character
    // and come out as mojibake.
    return new TextDecoder('utf-8').decode(bytes.subarray(3))
  }
  let s = ''
  for (const b of bytes) s += String.fromCharCode(b)
  return s
}
65
+
66
/**
 * Reusable handle on an opened PDF: the page list is built once at
 * construction, and text is extracted per page on request.
 */
export class PdfReader {
  private readonly doc: PdfReaderDocument
  private readonly pageList: PdfDictionary[]

  private constructor(bytes: Uint8Array, opts: { password?: string }) {
    const document = new PdfReaderDocument(bytes, opts)
    this.doc = document
    this.pageList = document.pages()
  }

  /**
   * Open a PDF held in memory.
   *
   * @param bytes File contents.
   * @param opts  `password` is forwarded to the document model.
   */
  static async open(
    bytes: Uint8Array | ArrayBuffer,
    opts: { password?: string } = {},
  ): Promise<PdfReader> {
    const data = toU8(bytes)
    return new PdfReader(data, opts)
  }

  /** Number of pages found in the page tree. */
  get pageCount(): number {
    return this.pageList.length
  }

  /** Whether the underlying document reports itself as encrypted. */
  get encrypted(): boolean {
    return this.doc.encrypted
  }

  /**
   * Document summary. The string fields are filled in only when the trailer
   * /Info entry resolves to a dictionary; an entry that is absent or is not a
   * string comes out as `undefined`.
   */
  get info(): PdfInfo {
    const summary: PdfInfo = { pageCount: this.pageCount, encrypted: this.encrypted }
    const dict = this.doc.resolve(this.doc.trailer.entries.get('Info'))
    if (!dict || !isDict(dict)) return summary

    const readText = (key: string): string | undefined => {
      const value = this.doc.resolve(dict.entries.get(key))
      if (value && isStr(value)) return decodeTextString(value.value)
      return undefined
    }

    // [PdfInfo field, Info dictionary key], in assignment order.
    const fields = [
      ['title', 'Title'],
      ['author', 'Author'],
      ['subject', 'Subject'],
      ['keywords', 'Keywords'],
      ['creator', 'Creator'],
      ['producer', 'Producer'],
      ['creationDate', 'CreationDate'],
      ['modDate', 'ModDate'],
    ] as const
    for (const [field, key] of fields) summary[field] = readText(key)
    return summary
  }

  /**
   * Text of one page.
   *
   * @param pageNumber 1-based page number.
   * @param opts       `normalizeWhitespace` defaults to `true`.
   * @returns The page text, or `''` when there is no such page.
   */
  pageText(pageNumber: number, opts: { normalizeWhitespace?: boolean } = {}): string {
    const page = this.pageList[pageNumber - 1]
    if (!page) return ''

    const resolved = this.doc.resolve(page.entries.get('Resources'))
    const resources = resolved && isDict(resolved) ? resolved : undefined
    const content = this.doc.pageContent(page)
    const runs = interpretText(content, resources, this.doc)
    const normalize = opts.normalizeWhitespace ?? true
    return runsToText(runs, normalize)
  }

  /**
   * Extract text from the selected pages.
   *
   * @param opts Page selection and whitespace handling; see `ExtractOptions`.
   * @returns Per-page texts, the same texts joined with form feeds, and the
   *          document summary.
   */
  extractText(opts: ExtractOptions = {}): ExtractResult {
    const wanted = selectPages(opts.pages, this.pageCount)
    const normalizeWhitespace = opts.normalizeWhitespace ?? true

    const pages: ExtractedPage[] = []
    for (const number of wanted) {
      pages.push({ number, text: this.pageText(number, { normalizeWhitespace }) })
    }

    const text = pages.map((p) => p.text).join('\f')
    return { pages, text, info: this.info }
  }
}
130
+
131
/**
 * Turn a page selection into a list of valid 1-based page numbers.
 *
 * Only whole numbers within `1..count` are ever returned: out-of-range or
 * fractional entries are dropped, and fractional range bounds are tightened
 * inward (`from` rounds up, `to` rounds down).
 *
 * @param spec  `undefined` for all pages, a single page, a list of pages
 *              (order and duplicates preserved), or an inclusive range.
 * @param count Number of pages in the document.
 * @returns The selected page numbers; empty when nothing valid is selected.
 */
function selectPages(
  spec: ExtractOptions['pages'],
  count: number,
): number[] {
  // A fractional page number has no page behind it; letting it through would
  // produce an entry with empty text.
  const isValidPage = (n: number): boolean => Number.isInteger(n) && n >= 1 && n <= count

  if (spec === undefined) return Array.from({ length: count }, (_, i) => i + 1)
  if (typeof spec === 'number') return isValidPage(spec) ? [spec] : []
  if (Array.isArray(spec)) return spec.filter(isValidPage)

  const from = Math.max(1, Math.ceil(spec.from ?? 1))
  const to = Math.min(count, Math.floor(spec.to ?? count))
  const out: number[] = []
  for (let n = from; n <= to; n++) out.push(n)
  return out
}
145
+
146
/**
 * One-shot convenience: open `bytes` as a PDF and extract its text.
 *
 * @param bytes File contents.
 * @param opts  Page selection, whitespace handling and password; the
 *              password is used for opening, and the full options object is
 *              passed on to the extraction step.
 * @returns The extraction result for the selected pages.
 */
export async function extractText(
  bytes: Uint8Array | ArrayBuffer,
  opts: ExtractOptions = {},
): Promise<ExtractResult> {
  const { password } = opts
  const reader = await PdfReader.open(bytes, { password })
  return reader.extractText(opts)
}