@strav/pdf 0.4.17 → 0.4.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,203 @@
1
+ /**
2
+ * Recursive-descent parser: token stream → {@link PdfObject} (spec §7.3).
3
+ *
4
+ * Handles the two-number lookahead for indirect references (`n g R`) and
5
+ * indirect object bodies (`n g obj … endobj`), including `stream`/`endstream`
6
+ * whose raw bytes are sliced by the resolved `/Length` (with a scan-to-
7
+ * `endstream` fallback for the wrong/indirect lengths real files contain).
8
+ */
9
+
10
+ import {
11
+ type PdfObject,
12
+ type PdfDictionary,
13
+ bool,
14
+ num,
15
+ name,
16
+ arr,
17
+ ref,
18
+ dict as makeDict,
19
+ NULL,
20
+ isNum,
21
+ isDict,
22
+ } from '../objects/types.ts'
23
+ import { PdfParseError } from '../util/errors.ts'
24
+ import { Lexer, latin1, type Token } from './lexer.ts'
25
+
26
/**
 * Callback that resolves an object to a plain number. It is consulted for a
 * `/Length` entry that is not a direct number; returning `undefined` means
 * "no usable length" and makes the parser scan for `endstream` instead.
 */
export type LengthResolver = (o: PdfObject) => number | undefined

/**
 * Recursive-descent parser that turns the tokens of a {@link Lexer} into
 * {@link PdfObject} values, including indirect references (`n g R`) and
 * stream bodies.
 */
export class ObjectParser {
  /**
   * @param lex Lexer positioned over the bytes to parse; its `pos` is moved as objects are consumed.
   * @param resolveLength Optional resolver for `/Length` values that are not direct numbers.
   */
  constructor(
    readonly lex: Lexer,
    private readonly resolveLength?: LengthResolver,
  ) {}

  /**
   * Parse the indirect object whose `N G obj` header starts at `offset`.
   *
   * @returns The object number, generation number and parsed value.
   * @throws PdfParseError when the three tokens at `offset` are not `num num obj`.
   */
  parseIndirectAt(offset: number): { num: number; gen: number; value: PdfObject } {
    this.lex.seek(offset)
    const n = this.lex.next()
    const g = this.lex.next()
    const obj = this.lex.next()
    if (n.type !== 'num' || g.type !== 'num' || obj.type !== 'kw' || obj.value !== 'obj') {
      throw new PdfParseError(`Expected "N G obj" at offset ${offset}`)
    }
    const value = this.parseObject()
    return { num: n.value, gen: g.value, value }
  }

  /** Parse a single object value, resolving `n g R` / streams. */
  parseObject(): PdfObject {
    const t = this.lex.next()
    return this.parseFromToken(t)
  }

  /**
   * Build the object that starts with the already-consumed token `t`.
   *
   * @throws PdfParseError on end of input or on a delimiter that cannot start a value.
   */
  private parseFromToken(t: Token): PdfObject {
    switch (t.type) {
      case 'eof':
        throw new PdfParseError('Unexpected end of input')
      case 'num':
        return this.parseNumberOrRef(t.value)
      case 'str':
        return { kind: 'str', value: t.value, encoding: t.encoding }
      case 'name':
        return name(t.value)
      case 'kw':
        if (t.value === 'true') return bool(true)
        if (t.value === 'false') return bool(false)
        if (t.value === 'null') return NULL
        // Unknown bare keyword (e.g. "endobj", "R" out of place) — treat as null
        return NULL
      case 'delim':
        if (t.value === '[') return this.parseArray()
        if (t.value === '<<') return this.parseDictOrStream()
        throw new PdfParseError(`Unexpected token "${t.value}"`)
    }
  }

  /**
   * Decide whether the number `first` starts an indirect reference.
   * Two tokens of lookahead are consumed only for `int int R`; otherwise the
   * lexer is rewound to just after `first` and a plain number is returned.
   */
  private parseNumberOrRef(first: number): PdfObject {
    const save = this.lex.pos
    const t2 = this.lex.next()
    if (t2.type === 'num' && Number.isInteger(first) && Number.isInteger(t2.value)) {
      const t3 = this.lex.next()
      if (t3.type === 'kw' && t3.value === 'R') {
        return ref(first, t2.value)
      }
    }
    this.lex.pos = save
    return num(first)
  }

  /**
   * Parse array items up to the closing `]` (the opening `[` is already consumed).
   *
   * @throws PdfParseError when input ends before `]`.
   */
  private parseArray(): PdfObject {
    const items: PdfObject[] = []
    for (;;) {
      const t = this.lex.next()
      if (t.type === 'eof') throw new PdfParseError('Unterminated array')
      if (t.type === 'delim' && t.value === ']') break
      items.push(this.parseFromToken(t))
    }
    return arr(items)
  }

  /**
   * Parse dictionary entries up to `>>` (the opening `<<` is already consumed)
   * and, when the `stream` keyword follows, read the stream body as well.
   *
   * @throws PdfParseError when input ends before `>>`.
   */
  private parseDictOrStream(): PdfObject {
    const d = makeDict()
    for (;;) {
      const t = this.lex.next()
      if (t.type === 'eof') throw new PdfParseError('Unterminated dictionary')
      if (t.type === 'delim' && t.value === '>>') break
      if (t.type !== 'name') {
        // Tolerate garbage where a key is expected: drop this one token and
        // keep looking for the next name (no value is consumed here).
        continue
      }
      const value = this.parseObject()
      d.entries.set(t.value, value)
    }
    // A `stream` keyword immediately following the dict makes this a stream.
    const save = this.lex.pos
    this.lex.skipWs()
    if (this.matchKeyword('stream')) {
      return this.readStreamBody(d)
    }
    this.lex.pos = save
    return d
  }

  /**
   * Compare the raw bytes at the current position with `kw`.
   * On a match the lexer is advanced past the keyword; otherwise it is left untouched.
   */
  private matchKeyword(kw: string): boolean {
    const b = this.lex.buf
    const p = this.lex.pos
    for (let i = 0; i < kw.length; i++) {
      if (b[p + i] !== kw.charCodeAt(i)) return false
    }
    this.lex.pos = p + kw.length
    return true
  }

  /**
   * Read the bytes of a stream whose dictionary is `d`; the lexer sits just
   * after the `stream` keyword. The declared `/Length` is trusted only when
   * `endstream` is found right after it; otherwise the body is delimited by
   * scanning for `endstream`.
   */
  private readStreamBody(d: PdfDictionary): PdfObject {
    const b = this.lex.buf
    // After "stream": CRLF or LF (spec §7.3.8.1). A lone CR is tolerated.
    if (b[this.lex.pos] === 0x0d && b[this.lex.pos + 1] === 0x0a) this.lex.pos += 2
    else if (b[this.lex.pos] === 0x0a || b[this.lex.pos] === 0x0d) this.lex.pos += 1
    const start = this.lex.pos

    let len = -1
    const lenObj = d.entries.get('Length')
    if (lenObj && isNum(lenObj)) len = lenObj.value
    else if (lenObj && this.resolveLength) {
      const r = this.resolveLength(lenObj)
      if (typeof r === 'number') len = r
    }

    let end: number
    if (len >= 0 && this.looksLikeEndstream(start + len)) {
      end = start + len
    } else {
      end = this.scanForEndstream(start)
    }
    const data = this.lex.slice(start, end)
    // Skip past endstream for sequential callers.
    this.lex.pos = end
    this.skipUntilAfter('endstream')
    return { kind: 'stream', dict: d, data }
  }

  /** True when, after optional CR/LF/space/tab, the bytes at `at` spell `endstream`. */
  private looksLikeEndstream(at: number): boolean {
    const b = this.lex.buf
    let p = at
    while (p < b.length && (b[p] === 0x0a || b[p] === 0x0d || b[p] === 0x20 || b[p] === 0x09)) p++
    return latin1(b, p, p + 9) === 'endstream'
  }

  /**
   * Find the end of the stream data by searching for `endstream` from `start`.
   *
   * @returns The data end offset (never before `start`), or the buffer length
   *   when no `endstream` keyword exists.
   */
  private scanForEndstream(start: number): number {
    const b = this.lex.buf
    const needle = 'endstream'
    for (let p = start; p <= b.length - needle.length; p++) {
      if (b[p] === 0x65 && latin1(b, p, p + needle.length) === needle) {
        // Trim a single trailing EOL that belongs to the keyword line, but
        // never step back past `start`: for an empty stream the byte before
        // `endstream` is the EOL that followed the `stream` keyword.
        let e = p
        if (e > start && b[e - 1] === 0x0a) e--
        if (e > start && b[e - 1] === 0x0d) e--
        return e
      }
    }
    return b.length
  }

  /** Move the lexer just past the next occurrence of `kw`, or to the end of the buffer. */
  private skipUntilAfter(kw: string): void {
    const b = this.lex.buf
    // Compare the first byte before decoding so a string is not built per position.
    const firstByte = kw.charCodeAt(0)
    for (let p = this.lex.pos; p <= b.length - kw.length; p++) {
      if (b[p] === firstByte && latin1(b, p, p + kw.length) === kw) {
        this.lex.pos = p + kw.length
        return
      }
    }
    this.lex.pos = b.length
  }
}
197
+
198
/**
 * Shortcut for one-off parsing: wraps `buf` in a fresh lexer starting at
 * `offset` and returns the first object value found there.
 */
export function parseObjectFrom(buf: Uint8Array, offset = 0): PdfObject {
  const lexer = new Lexer(buf, offset)
  const parser = new ObjectParser(lexer)
  return parser.parseObject()
}
202
+
203
+ export { isDict }
@@ -0,0 +1,44 @@
1
+ /**
2
+ * Compressed object streams (`/Type /ObjStm`, spec §7.5.7). Header is `N`
3
+ * pairs `(objNum, byteOffset)`; the objects themselves start at `/First`.
4
+ * Objects inside an ObjStm may not be streams and may not be ObjStm
5
+ * themselves — the document layer enforces no ObjStm-in-ObjStm recursion.
6
+ */
7
+
8
+ import { type PdfObject, isNum } from '../objects/types.ts'
9
+ import { PdfParseError } from '../util/errors.ts'
10
+ import { Lexer } from './lexer.ts'
11
+ import { ObjectParser } from './object_parser.ts'
12
+
13
export interface ObjStmContents {
  /** objNum → parsed value. */
  objects: Map<number, PdfObject>
}

/**
 * Parse the decoded bytes of an object stream.
 *
 * The header holds `/N` pairs of `(objNum, byteOffset)`; each offset is
 * relative to `/First`. Every listed object is parsed with its own lexer
 * positioned at `first + offset`.
 *
 * @param dict Stream dictionary; `/N` and `/First` are read from it.
 * @param data Decoded stream bytes.
 * @returns A map from object number to parsed value.
 * @throws PdfParseError when `/N` or `/First` is missing or not a number, when
 *   `/First` or a header offset is not a non-negative integer, or when a
 *   header entry is not a pair of numbers.
 */
export function parseObjStm(dict: { entries: Map<string, PdfObject> }, data: Uint8Array): ObjStmContents {
  const nObj = dict.entries.get('N')
  const firstObj = dict.entries.get('First')
  if (!nObj || !isNum(nObj) || !firstObj || !isNum(firstObj)) {
    throw new PdfParseError('ObjStm missing /N or /First')
  }
  const n = nObj.value
  const first = firstObj.value

  // Offsets come from the file and are used as lexer positions, so reject
  // negative or fractional values instead of seeking to a nonsensical place.
  const isByteOffset = (v: number): boolean => Number.isInteger(v) && v >= 0
  if (!isByteOffset(first)) {
    throw new PdfParseError('ObjStm /First is not a valid byte offset')
  }

  const headerLex = new Lexer(data, 0)
  const table: { num: number; off: number }[] = []
  for (let i = 0; i < n; i++) {
    const a = headerLex.next()
    const b = headerLex.next()
    if (a.type !== 'num' || b.type !== 'num' || !isByteOffset(b.value)) {
      throw new PdfParseError('Malformed ObjStm header')
    }
    table.push({ num: a.value, off: b.value })
  }

  const objects = new Map<number, PdfObject>()
  for (const { num, off } of table) {
    const parser = new ObjectParser(new Lexer(data, first + off))
    objects.set(num, parser.parseObject())
  }
  return { objects }
}
@@ -0,0 +1,327 @@
1
+ /**
2
+ * Content-stream text interpreter (spec §9.4). Executes the text-showing
3
+ * subset of operators against a graphics/text state and emits positioned
4
+ * glyph runs (device-space origin + advance + effective size). Non-text
5
+ * operators are skipped; `BI…ID…EI` inline images are byte-skipped so their
6
+ * binary payload never reaches the lexer.
7
+ */
8
+
9
+ import { type PdfDictionary, isDict, isName, isStream } from '../objects/types.ts'
10
+ import { Lexer, latin1, type Token } from './lexer.ts'
11
+ import { buildCharMap, type CharMap } from './fonts.ts'
12
+ import type { Run } from './layout.ts'
13
+
14
/** 2×3 affine matrix [a b c d e f]; point (x,y) → (a x + c y + e, b x + d y + f). */
type Mat = [number, number, number, number, number, number]
const IDENT: Mat = [1, 0, 0, 1, 0, 0]

/**
 * Matrix product `m × n` in the row-vector convention used above:
 * the result applies `m` first and `n` second.
 */
function mul(m: Mat, n: Mat): Mat {
  const [ma, mb, mc, md, me, mf] = m
  const [na, nb, nc, nd, ne, nf] = n

  // Linear part.
  const a = ma * na + mb * nc
  const b = ma * nb + mb * nd
  const c = mc * na + md * nc
  const d = mc * nb + md * nd
  // Translation part: m's translation pushed through n, plus n's own.
  const e = me * na + mf * nc + ne
  const f = me * nb + mf * nd + nf

  return [a, b, c, d, e, f]
}
28
+
29
/** The document operations this module needs, kept structural so no concrete document class is imported. */
interface Doc {
  resolve(o: any): any
  getStreamData(s: any, num: number): Uint8Array
}

/** Text-state parameters tracked while interpreting a content stream. */
interface TextState {
  /** Resource name of the current font, once `Tf` has selected one. */
  fontRes?: string
  fontSize: number
  charSpace: number
  wordSpace: number
  hScale: number // as a fraction (Tz / 100)
  leading: number
  rise: number
}

/** Create a new text state with every parameter at its initial value and no font selected. */
function freshTextState(): TextState {
  const initial: TextState = {
    fontSize: 0,
    charSpace: 0,
    wordSpace: 0,
    hScale: 1,
    leading: 0,
    rise: 0,
  }
  return initial
}
47
+
48
/**
 * Interpret the text-showing operators of a content stream and return the
 * glyph runs they produce.
 *
 * Each run carries its decoded text, the device-space x of its start and end,
 * its device-space y, an effective font size and a space width. Size and
 * space width are taken from the text rendering matrix, so scaling applied
 * through `Tm` counts as well as scaling applied through `cm`.
 *
 * @param content Decoded content-stream bytes.
 * @param resources Resource dictionary used to look up fonts by name; without it no text is emitted.
 * @param doc Document facade used to resolve references while looking up fonts.
 * @returns Runs in the order their show operators appear.
 */
export function interpretText(
  content: Uint8Array,
  resources: PdfDictionary | undefined,
  doc: Doc,
): Run[] {
  const runs: Run[] = []
  const lex = new Lexer(content, 0)

  let ctm: Mat = IDENT
  let ts = freshTextState()
  // q/Q save and restore the graphics state; the text-state parameters are
  // part of it, so they are stacked together with the CTM.
  const gsStack: { ctm: Mat; ts: TextState }[] = []
  let tm: Mat = IDENT
  let tlm: Mat = IDENT

  // Font resource name → char map (undefined when the font is missing or unusable).
  const fontCache = new Map<string, CharMap | undefined>()
  const charMapFor = (res: string): CharMap | undefined => {
    if (fontCache.has(res)) return fontCache.get(res)
    const fd = lookupFont(resources, res, doc)
    const cm = fd ? safe(() => buildCharMap(fd, doc)) : undefined
    fontCache.set(res, cm)
    return cm
  }

  const operands: any[] = []

  /**
   * Remove the last `k` operands and return them as numbers (non-numbers
   * become 0). Returns undefined when fewer than `k` operands are present so
   * the operator can be ignored instead of storing undefined/NaN.
   */
  const popNums = (k: number): number[] | undefined => {
    if (operands.length < k) return undefined
    return operands.splice(operands.length - k, k).map((x) => (typeof x === 'number' ? x : 0))
  }
  const popNum = (): number | undefined => popNums(1)?.[0]

  /** Text rendering matrix for the current state: [Tfs·Th 0 0 Tfs 0 Trise] × Tm × CTM. */
  const textRenderMatrix = (): Mat =>
    mul(mul([ts.fontSize * ts.hScale, 0, 0, ts.fontSize, 0, ts.rise], tm), ctm)

  /** Start a new line: move the line matrix down by the leading and reset Tm to it. */
  const nextLine = (): void => {
    tlm = mul([1, 0, 0, 1, 0, -ts.leading], tlm)
    tm = tlm
  }

  const showText = (bytes: Uint8Array, cm: CharMap | undefined): void => {
    if (!cm) return
    const trm0 = textRenderMatrix()
    const startX = trm0[4]
    const y = trm0[5]
    // Lengths of the matrix rows give the glyph scale in device space.
    const fsDevice = Math.hypot(trm0[2], trm0[3])
    const spaceDevice = (cm.spaceWidth / 1000) * Math.hypot(trm0[0], trm0[1])

    let text = ''
    for (const g of cm.decode(bytes)) {
      text += g.unicode
      const w0 = g.width1000 / 1000
      const isSpaceByte = g.code === 0x20
      const tx =
        (w0 * ts.fontSize + ts.charSpace + (isSpaceByte ? ts.wordSpace : 0)) * ts.hScale
      tm = mul([1, 0, 0, 1, tx, 0], tm)
    }
    const endX = textRenderMatrix()[4]
    runs.push({ text, x: startX, endX, y, fs: fsDevice || ts.fontSize || 1, spaceW: spaceDevice || 1 })
  }

  const showArray = (arr: any[], cm: CharMap | undefined): void => {
    if (!cm) return
    for (const el of arr) {
      if (el instanceof Uint8Array) {
        showText(el, cm)
      } else if (typeof el === 'number') {
        // TJ adjustment: positive moves left (spec §9.4.3).
        const tx = (-el / 1000) * ts.fontSize * ts.hScale
        tm = mul([1, 0, 0, 1, tx, 0], tm)
        // Synthesize a space for kerning-only word gaps.
        if (-el > 200 && runs.length) {
          const last = runs[runs.length - 1]!
          if (!last.text.endsWith(' ')) last.text += ' '
        }
      }
    }
  }

  for (;;) {
    const t = lex.next()
    if (t.type === 'eof') break

    if (t.type === 'num') {
      operands.push(t.value)
      continue
    }
    if (t.type === 'str') {
      operands.push(t.value)
      continue
    }
    if (t.type === 'name') {
      operands.push({ name: t.value })
      continue
    }
    if (t.type === 'delim') {
      if (t.value === '[') {
        operands.push(readArray(lex))
      } else if (t.value === '<<') {
        skipDict(lex)
        operands.push({})
      }
      continue
    }

    // Operator (keyword)
    const op = t.value
    switch (op) {
      case 'q':
        // Snapshot the text state: later operators mutate `ts` in place.
        gsStack.push({ ctm, ts: { ...ts } })
        break
      case 'Q': {
        const saved = gsStack.pop()
        if (saved) {
          ctm = saved.ctm
          ts = saved.ts
        }
        break
      }
      case 'cm': {
        const v = popNums(6)
        if (v) ctm = mul([v[0]!, v[1]!, v[2]!, v[3]!, v[4]!, v[5]!], ctm)
        break
      }
      case 'BT':
        tm = IDENT
        tlm = IDENT
        break
      case 'ET':
        break
      case 'Td': {
        const v = popNums(2)
        if (v) {
          tlm = mul([1, 0, 0, 1, v[0]!, v[1]!], tlm)
          tm = tlm
        }
        break
      }
      case 'TD': {
        const v = popNums(2)
        if (v) {
          ts.leading = -v[1]!
          tlm = mul([1, 0, 0, 1, v[0]!, v[1]!], tlm)
          tm = tlm
        }
        break
      }
      case 'Tm': {
        const v = popNums(6)
        if (v) {
          tlm = [v[0]!, v[1]!, v[2]!, v[3]!, v[4]!, v[5]!]
          tm = tlm
        }
        break
      }
      case 'T*':
        nextLine()
        break
      case 'Tc': {
        const v = popNum()
        if (v !== undefined) ts.charSpace = v
        break
      }
      case 'Tw': {
        const v = popNum()
        if (v !== undefined) ts.wordSpace = v
        break
      }
      case 'Tz': {
        const v = popNum()
        if (v !== undefined) ts.hScale = v / 100
        break
      }
      case 'TL': {
        const v = popNum()
        if (v !== undefined) ts.leading = v
        break
      }
      case 'Ts': {
        const v = popNum()
        if (v !== undefined) ts.rise = v
        break
      }
      case 'Tf': {
        const size = popNum()
        const res = operands.pop()
        if (size !== undefined) {
          ts.fontSize = size
          ts.fontRes = res && typeof res === 'object' && 'name' in res ? res.name : undefined
        }
        break
      }
      case 'Tj': {
        const s = operands.pop()
        if (s instanceof Uint8Array && ts.fontRes) showText(s, charMapFor(ts.fontRes))
        break
      }
      case 'TJ': {
        const a = operands.pop()
        if (Array.isArray(a) && ts.fontRes) showArray(a, charMapFor(ts.fontRes))
        break
      }
      case "'": {
        const s = operands.pop()
        nextLine()
        if (s instanceof Uint8Array && ts.fontRes) showText(s, charMapFor(ts.fontRes))
        break
      }
      case '"': {
        // Operand order is `aw ac string "`.
        const s = operands.pop()
        const v = popNums(2)
        if (v) {
          ts.wordSpace = v[0]!
          ts.charSpace = v[1]!
        }
        nextLine()
        if (s instanceof Uint8Array && ts.fontRes) showText(s, charMapFor(ts.fontRes))
        break
      }
      case 'BI':
        skipInlineImage(lex)
        break
      default:
        // Unknown / non-text operator: nothing to do, operands are dropped below.
        break
    }
    // Every operator consumes its operands; leftovers never carry over.
    operands.length = 0
  }
  return runs
}
253
+
254
+ // ── Token helpers ──────────────────────────────────────────────────────────
255
+
256
/**
 * Collect the elements of an array whose `[` has already been consumed.
 * Numbers and strings are stored as their token value, names as `{ name }`;
 * any other token is dropped. Reading stops at `]` or at end of input.
 */
function readArray(lex: Lexer): any[] {
  const items: any[] = []
  let tok = lex.next()
  while (tok.type !== 'eof' && !(tok.type === 'delim' && tok.value === ']')) {
    switch (tok.type) {
      case 'num':
      case 'str':
        items.push(tok.value)
        break
      case 'name':
        items.push({ name: tok.value })
        break
      default:
        break
    }
    tok = lex.next()
  }
  return items
}
268
+
269
/**
 * Discard tokens up to the `>>` that closes a dictionary whose `<<` has
 * already been consumed, counting nested `<<`/`>>` pairs. Stops at end of input.
 */
function skipDict(lex: Lexer): void {
  let open = 1
  while (open > 0) {
    const tok = lex.next()
    if (tok.type === 'eof') return
    if (tok.type !== 'delim') continue
    if (tok.value === '<<') open += 1
    else if (tok.value === '>>') open -= 1
  }
}
278
+
279
/**
 * Skip an inline image (`… ID <binary> EI`) so the binary payload is never
 * tokenized. Tokens are read up to the `ID` keyword, then the raw bytes are
 * searched for an `EI` that has whitespace before it and whitespace (or the
 * end of the buffer) after it. If none is found the lexer is moved to the end.
 */
function skipInlineImage(lex: Lexer): void {
  // Tokenize the image's key/value pairs until the ID keyword shows up.
  let tok: Token = lex.next()
  while (!(tok.type === 'kw' && tok.value === 'ID')) {
    if (tok.type === 'eof') return
    tok = lex.next()
  }

  const bytes = lex.buf
  const size = bytes.length
  // One whitespace byte follows ID, so the payload search starts after it.
  for (let i = lex.pos + 1; i + 1 < size; i++) {
    if (bytes[i] !== 0x45 /* 'E' */ || bytes[i + 1] !== 0x49 /* 'I' */) continue
    const wsBefore = i === 0 || isWsByte(bytes[i - 1]!)
    const wsAfter = i + 2 >= size || isWsByte(bytes[i + 2]!)
    if (wsBefore && wsAfter) {
      lex.pos = i + 2
      return
    }
  }
  lex.pos = size
}
303
+
304
/** True for the byte values NUL, TAB, LF, FF, CR and SPACE. */
const isWsByte = (x: number): boolean => {
  switch (x) {
    case 0x00:
    case 0x09:
    case 0x0a:
    case 0x0c:
    case 0x0d:
    case 0x20:
      return true
    default:
      return false
  }
}
306
+
307
/**
 * Find the font dictionary registered under `res` in the `/Font` entry of
 * `resources`, resolving both the `/Font` entry and the font entry through
 * `doc`. Returns undefined when any step is missing or is not a dictionary.
 */
function lookupFont(
  resources: PdfDictionary | undefined,
  res: string,
  doc: Doc,
): PdfDictionary | undefined {
  const fontsDict = resources ? doc.resolve(resources.entries.get('Font')) : undefined
  if (!fontsDict || !isDict(fontsDict)) return undefined

  const candidate = doc.resolve(fontsDict.entries.get(res))
  if (candidate && isDict(candidate)) return candidate
  return undefined
}
318
+
319
/** Run `fn` and return its result; anything it throws is swallowed and reported as undefined. */
function safe<T>(fn: () => T): T | undefined {
  let outcome: T | undefined
  try {
    outcome = fn()
  } catch {
    outcome = undefined
  }
  return outcome
}
326
+
327
+ export { latin1, isName, isStream }