@strav/pdf 0.4.17 → 0.4.18
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +25 -7
- package/package.json +5 -3
- package/src/index.ts +10 -0
- package/src/reader/cmap_parser.ts +173 -0
- package/src/reader/decrypt.ts +226 -0
- package/src/reader/document.ts +246 -0
- package/src/reader/encodings.ts +73 -0
- package/src/reader/extract.ts +152 -0
- package/src/reader/fonts.ts +259 -0
- package/src/reader/index.ts +27 -0
- package/src/reader/layout.ts +106 -0
- package/src/reader/lexer.ts +270 -0
- package/src/reader/object_parser.ts +203 -0
- package/src/reader/objstm.ts +44 -0
- package/src/reader/text_interpreter.ts +327 -0
- package/src/reader/xref.ts +229 -0
- package/src/streams/decode.ts +98 -0
- package/src/streams/flate.ts +94 -4
- package/src/streams/index.ts +6 -1
- package/src/streams/lzw.ts +74 -0
- package/src/streams/runlength.ts +25 -0
- package/src/util/errors.ts +20 -0
|
@@ -0,0 +1,259 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Per-font text decoding (spec §9.6–9.10). Builds a {@link CharMap} that turns
|
|
3
|
+
* show-string bytes into Unicode + glyph advance widths. Decoding precedence:
|
|
4
|
+
* 1. /ToUnicode CMap
|
|
5
|
+
* 2. simple-font /Encoding (base + /Differences) → glyph name → Unicode
|
|
6
|
+
* 3. Type0 Identity/embedded-CMap → CID (Unicode only via /ToUnicode)
|
|
7
|
+
* 4. raw byte → WinAnsi/Latin-1 fallback
|
|
8
|
+
*/
|
|
9
|
+
|
|
10
|
+
import {
|
|
11
|
+
type PdfObject,
|
|
12
|
+
type PdfDictionary,
|
|
13
|
+
isDict,
|
|
14
|
+
isName,
|
|
15
|
+
isNum,
|
|
16
|
+
isArr,
|
|
17
|
+
isStream,
|
|
18
|
+
} from '../objects/types.ts'
|
|
19
|
+
import {
|
|
20
|
+
isStandardFontName,
|
|
21
|
+
standardGlyphWidth,
|
|
22
|
+
type StandardFontName,
|
|
23
|
+
} from '../fonts/standard_14.ts'
|
|
24
|
+
import {
|
|
25
|
+
baseEncode,
|
|
26
|
+
winAnsiToUnicode,
|
|
27
|
+
glyphNameToUnicode,
|
|
28
|
+
type BaseEncodingName,
|
|
29
|
+
} from './encodings.ts'
|
|
30
|
+
import { parseCMap, type CMap } from './cmap_parser.ts'
|
|
31
|
+
|
|
32
|
+
/** One glyph decoded from a show-string by a {@link CharMap}. */
export interface DecodedGlyph {
|
|
33
|
+
/** Character code as read from the string bytes, before any Unicode mapping. */
code: number
|
|
34
|
+
/** Unicode text for the glyph; may be empty or a replacement character when no mapping is known. */
unicode: string
|
|
35
|
+
/** Advance width in text space units per em/1000 (i.e. glyph-space/1000). */
|
|
36
|
+
width1000: number
|
|
37
|
+
}
|
|
38
|
+
|
|
39
|
+
/** Per-font decoder: turns show-string bytes into glyphs carrying Unicode text and advance widths. */
export interface CharMap {
|
|
40
|
+
/** Decode the bytes of one show-string into glyphs, in input order. */
decode(bytes: Uint8Array): DecodedGlyph[]
|
|
41
|
+
/** Width of the space-like glyph (code 32 / CID space), in /1000 units. */
|
|
42
|
+
spaceWidth: number
|
|
43
|
+
}
|
|
44
|
+
|
|
45
|
+
/** The slice of the reader document that font decoding relies on. */
interface Doc {
|
|
46
|
+
/** Resolve an object to its value (presumably following indirect references — confirm against the document implementation). */
resolve(o: PdfObject | undefined): PdfObject | undefined
|
|
47
|
+
/** Return the data of a stream object. NOTE(review): callers in this file pass -1 for `num`; its exact meaning is defined by the implementation. */
getStreamData(s: Extract<PdfObject, { kind: 'stream' }>, num: number): Uint8Array
|
|
48
|
+
}
|
|
49
|
+
|
|
50
|
+
/** Placeholder text emitted for Type0 codes that have no usable /ToUnicode mapping. */
const REPLACEMENT = '�'
|
|
51
|
+
|
|
52
|
+
/**
 * Build the {@link CharMap} for a font dictionary.
 *
 * Fonts whose `/Subtype` is `Type0` are decoded as composite fonts; every
 * other subtype is decoded as a single-byte simple font. The `/ToUnicode`
 * CMap (when present and parseable) is handed to whichever decoder is chosen.
 *
 * @param fontDict - The font dictionary to decode with.
 * @param doc - Resolver used to look up objects and read stream data.
 * @returns A decoder turning show-string bytes into glyphs.
 */
export function buildCharMap(fontDict: PdfDictionary, doc: Doc): CharMap {
  const fontKind = nameOf(doc.resolve(fontDict.entries.get('Subtype')))
  const toUnicode = loadToUnicode(fontDict, doc)

  return fontKind === 'Type0'
    ? type0CharMap(fontDict, doc, toUnicode)
    : simpleCharMap(fontDict, doc, toUnicode)
}
|
|
59
|
+
|
|
60
|
+
// ── Simple fonts (Type1 / TrueType / Type3 / MMType1) ──────────────────────
|
|
61
|
+
|
|
62
|
+
/**
 * Build the decoder for a simple (one byte per code) font.
 *
 * Widths come from `/Widths` (offset by `/FirstChar`), then the standard-14
 * metrics when `/BaseFont` names a standard font, then the descriptor's
 * `/MissingWidth`. Unicode comes from `/ToUnicode`, then the font encoding
 * table, then the WinAnsi fallback.
 *
 * @param fontDict - The simple font dictionary.
 * @param doc - Resolver used to look up objects.
 * @param toUni - Parsed `/ToUnicode` CMap, if the font has one.
 */
function simpleCharMap(fontDict: PdfDictionary, doc: Doc, toUni?: CMap): CharMap {
  const baseFont = nameOf(doc.resolve(fontDict.entries.get('BaseFont'))) ?? ''
  const std = isStandardFontName(baseFont) ? (baseFont as StandardFontName) : undefined

  // Explicit widths; entries that do not resolve to a number count as 0.
  const firstChar = numOf(doc.resolve(fontDict.entries.get('FirstChar'))) ?? 0
  const widths: number[] = []
  const rawWidths = doc.resolve(fontDict.entries.get('Widths'))
  if (rawWidths && isArr(rawWidths)) {
    for (const entry of rawWidths.items) {
      const resolved = doc.resolve(entry)
      widths.push(resolved && isNum(resolved) ? resolved.value : 0)
    }
  }

  const descriptor = doc.resolve(fontDict.entries.get('FontDescriptor'))
  const missingWidth =
    descriptor && isDict(descriptor)
      ? (numOf(doc.resolve(descriptor.entries.get('MissingWidth'))) ?? 0)
      : 0

  // Encoding → per-code Unicode (consulted only when /ToUnicode has no answer).
  const encUnicode = buildSimpleEncoding(fontDict, doc)

  // A listed width of 0 (or less) is treated as "not listed".
  function widthOf(code: number): number {
    const slot = code - firstChar
    const listed = slot >= 0 && slot < widths.length ? widths[slot]! : 0
    if (listed > 0) return listed
    return std ? standardGlyphWidth(std, code) : missingWidth
  }

  function uniOf(code: number): string {
    const mapped = toUni?.unicodeOf(code)
    if (mapped !== undefined && mapped !== '') return mapped

    const fromEncoding = encUnicode[code]
    if (fromEncoding !== undefined && fromEncoding >= 0) return String.fromCodePoint(fromEncoding)

    const fallback = winAnsiToUnicode(code)
    return fallback ? String.fromCodePoint(fallback) : ''
  }

  return {
    spaceWidth: widthOf(0x20) || (std ? standardGlyphWidth(std, 0x20) : 250),
    decode: (bytes) =>
      Array.from(bytes, (code) => ({ code, unicode: uniOf(code), width1000: widthOf(code) })),
  }
}
|
|
113
|
+
|
|
114
|
+
/**
 * Build the 256-entry code → Unicode code point table for a simple font.
 *
 * The table starts from the base encoding (named directly by `/Encoding`, or
 * by `/BaseEncoding` inside an encoding dictionary) and is then patched with
 * the dictionary's `/Differences` array.
 *
 * @returns One entry per code 0–255, as produced by `baseEncode` and
 *   `glyphNameToUnicode`; negative entries are treated by callers as unmapped.
 */
function buildSimpleEncoding(fontDict: PdfDictionary, doc: Doc): number[] {
  const enc = doc.resolve(fontDict.entries.get('Encoding'))
  const encDict = enc && isDict(enc) ? enc : undefined

  let baseName: BaseEncodingName | undefined
  if (enc && isName(enc)) {
    baseName = enc.value as BaseEncodingName
  } else if (encDict) {
    const baseEntry = doc.resolve(encDict.entries.get('BaseEncoding'))
    if (baseEntry && isName(baseEntry)) baseName = baseEntry.value as BaseEncodingName
  }

  const table: number[] = Array.from({ length: 256 }, (_, code) => baseEncode(baseName, code))

  const diffs = encDict ? doc.resolve(encDict.entries.get('Differences')) : undefined
  if (!diffs || !isArr(diffs)) return table

  // /Differences: [ code /name /name code /name … ] — a number repositions the
  // cursor, each name fills the current slot and advances it.
  let slot = 0
  for (const item of diffs.items) {
    const entry = doc.resolve(item)
    if (!entry) continue
    if (isNum(entry)) {
      slot = entry.value
    } else if (isName(entry)) {
      const u = glyphNameToUnicode(entry.value)
      // Unknown glyph names keep whatever the base encoding supplied.
      table[slot] = u >= 0 ? u : table[slot]!
      slot++
    }
  }
  return table
}
|
|
142
|
+
|
|
143
|
+
// ── Composite (Type0) fonts ────────────────────────────────────────────────
|
|
144
|
+
|
|
145
|
+
/**
 * Build the decoder for a composite (Type0) font.
 *
 * Codes are read either as fixed 2-byte values (Identity-H/V, a named
 * non-identity CMap, or an unreadable embedded CMap) or through the embedded
 * `/Encoding` CMap stream. Widths come from the first descendant CIDFont's
 * `/DW` and `/W`, keyed by CID. Unicode is available only through
 * `/ToUnicode`; without it every glyph decodes to the replacement text.
 *
 * @param fontDict - The Type0 font dictionary.
 * @param doc - Resolver used to look up objects and read stream data.
 * @param toUni - Parsed `/ToUnicode` CMap, if the font has one.
 */
function type0CharMap(fontDict: PdfDictionary, doc: Doc, toUni?: CMap): CharMap {
  // Encoding: Identity-H/V → 2-byte identity; or an embedded/named CMap.
  const enc = doc.resolve(fontDict.entries.get('Encoding'))
  let encCMap: CMap | undefined
  let identity = true
  if (enc && isName(enc)) {
    identity = enc.value === 'Identity-H' || enc.value === 'Identity-V'
  } else if (enc && isStream(enc)) {
    try {
      encCMap = parseCMap(doc.getStreamData(enc, -1))
      identity = false
    } catch {
      // Best effort, matching loadToUnicode: an unreadable embedded CMap must
      // not abort decoding for the whole font. Fall back to 2-byte identity.
      encCMap = undefined
    }
  }

  // Descendant CIDFont: /DW + /W widths, keyed by CID.
  let dw = 1000
  const widthByCid = new Map<number, number>()
  const desc = doc.resolve(fontDict.entries.get('DescendantFonts'))
  if (desc && isArr(desc) && desc.items[0]) {
    const cidFont = doc.resolve(desc.items[0])
    if (cidFont && isDict(cidFont)) {
      dw = numOf(doc.resolve(cidFont.entries.get('DW'))) ?? 1000
      const W = doc.resolve(cidFont.entries.get('W'))
      if (W && isArr(W)) parseCidWidths(W.items, doc, widthByCid)
    }
  }

  const codeBytes = encCMap ? encCMap.codeBytes : 2
  const cidOf = (code: number): number =>
    identity ? code : (encCMap?.cidOf(code) ?? code)

  const uniOf = (code: number): string => {
    if (toUni) {
      const u = toUni.unicodeOf(code)
      if (u !== undefined && u !== '') return u
    }
    return REPLACEMENT // no ToUnicode for an embedded-cmap-only font (limitation)
  }

  return {
    spaceWidth: widthByCid.get(cidOf(0x20)) ?? dw,
    decode(bytes) {
      const out: DecodedGlyph[] = []
      const codes = encCMap ? encCMap.readCodes(bytes) : readFixed(bytes, codeBytes)
      for (const code of codes) {
        const cid = cidOf(code)
        out.push({
          code,
          unicode: uniOf(code),
          width1000: widthByCid.get(cid) ?? dw,
        })
      }
      return out
    },
  }
}
|
|
199
|
+
|
|
200
|
+
/**
 * Split `bytes` into consecutive big-endian codes of `n` bytes each.
 *
 * Trailing bytes that do not fill a whole code are ignored.
 *
 * @param bytes - The show-string bytes.
 * @param n - Bytes per code; must be a positive integer.
 * @returns The codes as non-negative numbers, or an empty array when `n` is
 *   not a positive integer.
 */
function readFixed(bytes: Uint8Array, n: number): number[] {
  // A zero, negative or fractional stride would never advance cleanly through
  // the buffer (n <= 0 loops forever), so treat it as "no codes".
  if (!Number.isInteger(n) || n <= 0) return []

  const out: number[] = []
  for (let i = 0; i + n <= bytes.length; i += n) {
    let c = 0
    // Multiply instead of shifting: `<<` works on signed 32-bit integers and
    // would turn a 4-byte code with the top bit set into a negative number.
    for (let k = 0; k < n; k++) c = c * 256 + bytes[i + k]!
    out.push(c)
  }
  return out
}
|
|
209
|
+
|
|
210
|
+
/**
 * Parse a CIDFont `/W` array into `out` (CID → width in /1000 units).
 *
 * Two entry forms are accepted:
 *   - `c [ w1 w2 … ]` — widths for CIDs c, c+1, …
 *   - `c1 c2 w`       — width `w` for every CID in c1..c2
 *
 * Parsing stops at the first entry that does not start with a number.
 *
 * @param items - The elements of the `/W` array.
 * @param doc - Resolver used to look up each element.
 * @param out - Map that receives the widths; existing entries are overwritten.
 */
function parseCidWidths(
  items: PdfObject[],
  doc: Doc,
  out: Map<number, number>,
): void {
  // CIDs are limited to 0..65535; clamping keeps a hostile range such as
  // `0 9999999999 500` from exhausting memory.
  const MAX_CID = 0xffff

  let i = 0
  while (i < items.length) {
    const first = doc.resolve(items[i])
    if (!first || !isNum(first)) break

    const second = doc.resolve(items[i + 1])
    if (second && isArr(second)) {
      // c [ w1 w2 … ] : CIDs c, c+1, …
      let cid = first.value
      for (const w of second.items) {
        const width = doc.resolve(w)
        if (width && isNum(width)) out.set(cid, width.value)
        // The position in the array determines the CID, so advance even when
        // an entry is unusable; otherwise every later width would shift.
        cid++
      }
      i += 2
    } else {
      // c1 c2 w : CIDs c1..c2 all width w
      const width = doc.resolve(items[i + 2])
      if (second && isNum(second) && width && isNum(width)) {
        const from = Math.max(first.value, 0)
        const to = Math.min(second.value, MAX_CID)
        for (let cid = from; cid <= to; cid++) out.set(cid, width.value)
      }
      i += 3
    }
  }
}
|
|
239
|
+
|
|
240
|
+
// ── Shared helpers ─────────────────────────────────────────────────────────
|
|
241
|
+
|
|
242
|
+
/**
 * Load and parse the font's `/ToUnicode` CMap.
 *
 * Best effort: a missing entry, an entry that is not a stream, or a stream
 * that fails to read or parse all yield `undefined`.
 */
function loadToUnicode(fontDict: PdfDictionary, doc: Doc): CMap | undefined {
  const stream = doc.resolve(fontDict.entries.get('ToUnicode'))
  if (!stream || !isStream(stream)) return undefined

  try {
    return parseCMap(doc.getStreamData(stream, -1))
  } catch {
    return undefined
  }
}
|
|
253
|
+
|
|
254
|
+
/** Return the value of a name object, or `undefined` for anything else. */
function nameOf(o: PdfObject | undefined): string | undefined {
  if (!o) return undefined
  if (isName(o)) return o.value
  return undefined
}
|
|
257
|
+
/** Return the value of a numeric object, or `undefined` for anything else. */
function numOf(o: PdfObject | undefined): number | undefined {
  if (!o) return undefined
  if (isNum(o)) return o.value
  return undefined
}
|
|
@@ -0,0 +1,27 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Read side (M13) sub-module barrel. The headline API plus lower-level pieces
|
|
3
|
+
* for advanced callers, mirroring how `document/index.ts` exposes internals.
|
|
4
|
+
*/
|
|
5
|
+
|
|
6
|
+
export { extractText, PdfReader } from './extract.ts'
|
|
7
|
+
export type {
|
|
8
|
+
ExtractOptions,
|
|
9
|
+
ExtractResult,
|
|
10
|
+
ExtractedPage,
|
|
11
|
+
PdfInfo,
|
|
12
|
+
} from './extract.ts'
|
|
13
|
+
|
|
14
|
+
export { PdfReaderDocument } from './document.ts'
|
|
15
|
+
export { Lexer } from './lexer.ts'
|
|
16
|
+
export { ObjectParser, parseObjectFrom } from './object_parser.ts'
|
|
17
|
+
export { parseXref, bruteForceXref, findStartXref } from './xref.ts'
|
|
18
|
+
export type { XrefTable, XrefEntry } from './xref.ts'
|
|
19
|
+
export { parseObjStm } from './objstm.ts'
|
|
20
|
+
export { parseCMap, CMap } from './cmap_parser.ts'
|
|
21
|
+
export { buildCharMap } from './fonts.ts'
|
|
22
|
+
export type { CharMap, DecodedGlyph } from './fonts.ts'
|
|
23
|
+
export { interpretText } from './text_interpreter.ts'
|
|
24
|
+
export { runsToText } from './layout.ts'
|
|
25
|
+
export type { Run } from './layout.ts'
|
|
26
|
+
export { buildDecryptor } from './decrypt.ts'
|
|
27
|
+
export type { Decryptor } from './decrypt.ts'
|
|
@@ -0,0 +1,106 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Layout heuristics: positioned glyph runs → readable plain text. Runs are
|
|
3
|
+
* grouped into lines by baseline proximity; intra-line gaps become spaces
|
|
4
|
+
* (proportional to the font's space width); inter-line drops become newlines,
|
|
5
|
+
* with a blank line for paragraph-sized jumps. No column/table reconstruction.
|
|
6
|
+
*/
|
|
7
|
+
|
|
8
|
+
/** A positioned run of text to be laid out by {@link runsToText}. */
export interface Run {
  text: string
  /** Device-space X of the run's first glyph origin. */
  x: number
  /** Device-space X just past the run's last glyph. */
  endX: number
  /** Device-space baseline Y. */
  y: number
  /** Effective device font size. */
  fs: number
  /** Device-space width of the space glyph. */
  spaceW: number
}

// Tunable thresholds (fractions of the space-glyph width / font size).
const SAME_LINE = 0.3 // |Δy| ≤ SAME_LINE·fs ⇒ same line
const GLUE = 0.2 // gap < GLUE·spaceW ⇒ no separator
const WIDE = 2.5 // gap ≥ WIDE·spaceW ⇒ multiple spaces
const MAX_GAP_SPACES = 8 // upper bound on spaces emitted for one wide gap
const PARA = 1.6 // line drop > PARA·fs ⇒ blank line

/**
 * Turn positioned runs into plain text.
 *
 * Runs are ordered top-to-bottom (descending Y) then left-to-right, grouped
 * into lines by baseline proximity, and joined with spaces sized from the
 * horizontal gap between neighbours. Lines are separated by a newline, plus a
 * blank line when the drop between baselines is paragraph-sized.
 *
 * @param runs - Runs to lay out; the array is not modified. Runs with empty
 *   text are ignored.
 * @param normalize - When true (default), collapse space/tab runs, strip
 *   trailing whitespace, and squeeze blank lines.
 * @returns The laid-out text, or `''` when there is nothing to lay out.
 */
export function runsToText(runs: Run[], normalize = true): string {
  const items = runs.filter((r) => r.text.length > 0)
  if (items.length === 0) return ''

  items.sort((a, b) => (b.y - a.y) || (a.x - b.x))

  // Group into lines by baseline proximity. Each line is anchored on the
  // baseline and font size of its first run.
  const lines: Run[][] = []
  let cur: Run[] = []
  let lineY = items[0]!.y
  let lineFs = items[0]!.fs
  for (const r of items) {
    if (cur.length && Math.abs(r.y - lineY) > SAME_LINE * Math.max(lineFs, r.fs)) {
      lines.push(cur)
      cur = []
    }
    if (cur.length === 0) {
      lineY = r.y
      lineFs = r.fs
    }
    cur.push(r)
  }
  if (cur.length) lines.push(cur)

  let out = ''
  let prevY: number | null = null
  let prevFs = lineFs
  for (const line of lines) {
    line.sort((a, b) => a.x - b.x)
    const y = line[0]!.y
    // Fold instead of Math.max(...spread): spreading a very long line can
    // exceed the engine's argument limit and throw a RangeError.
    const fs = line.reduce((max, r) => Math.max(max, r.fs), -Infinity)

    if (prevY !== null) {
      out += '\n'
      if (prevY - y > PARA * Math.max(fs, prevFs)) out += '\n'
    }

    let lineText = ''
    let prev: Run | null = null
    for (const r of line) {
      if (prev) {
        const gap = r.x - prev.endX
        // Use the magnitude: a negative space width (e.g. from a mirrored
        // transform) would otherwise produce a negative repeat count, which
        // makes String.prototype.repeat throw.
        const sw = Math.abs(prev.spaceW || r.spaceW || fs * 0.25)
        if (gap >= WIDE * sw) {
          lineText += ' '.repeat(Math.min(MAX_GAP_SPACES, Math.round(gap / sw)))
        } else if (gap >= GLUE * sw) {
          lineText += ' '
        }
      }
      lineText += r.text
      prev = r
    }
    out += lineText
    prevY = y
    prevFs = fs
  }

  return normalize ? normalizeWhitespace(out) : out
}

/**
 * Tidy laid-out text: collapse each run of spaces/tabs to one space, strip
 * trailing whitespace from every line, reduce any run of blank lines to a
 * single blank line, and drop leading/trailing blank lines.
 */
function normalizeWhitespace(s: string): string {
  const lines = s.split('\n').map((l) => l.replace(/[ \t]+/g, ' ').replace(/[ \t]+$/g, ''))

  const collapsed: string[] = []
  let blanks = 0
  for (const l of lines) {
    if (l.trim() === '') {
      blanks++
      // Keep only the first blank line of a run.
      if (blanks <= 1) collapsed.push('')
    } else {
      blanks = 0
      collapsed.push(l)
    }
  }
  while (collapsed.length && collapsed[0] === '') collapsed.shift()
  while (collapsed.length && collapsed[collapsed.length - 1] === '') collapsed.pop()
  return collapsed.join('\n')
}
|
|
@@ -0,0 +1,270 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* PDF tokenizer (spec §7.2). Scans a byte buffer into the lexical tokens the
|
|
3
|
+
* object parser consumes. Pure and position-addressable: callers may `seek`
|
|
4
|
+
* to a known byte offset (from the xref table) and tokenize from there.
|
|
5
|
+
*
|
|
6
|
+
* Whitespace = NUL TAB LF FF CR SPACE. Delimiters = ( ) < > [ ] { } / %.
|
|
7
|
+
* Comments (`%` … EOL) are skipped except the `%PDF`/`%%EOF` markers, which
|
|
8
|
+
* callers locate by raw byte scanning, not through this lexer.
|
|
9
|
+
*/
|
|
10
|
+
|
|
11
|
+
/** A lexical token produced by {@link Lexer}. */
export type Token =
  | { type: 'num'; value: number }
  | { type: 'name'; value: string }
  | { type: 'str'; value: Uint8Array; encoding: 'literal' | 'hex' }
  | { type: 'delim'; value: '[' | ']' | '<<' | '>>' | '{' | '}' }
  | { type: 'kw'; value: string }
  | { type: 'eof' }

/** Whitespace bytes: NUL, TAB, LF, FF, CR, SPACE. */
const WS = new Set([0x00, 0x09, 0x0a, 0x0c, 0x0d, 0x20])
/** Delimiter bytes: ( ) < > [ ] { } / %. */
const DELIM = new Set([0x28, 0x29, 0x3c, 0x3e, 0x5b, 0x5d, 0x7b, 0x7d, 0x2f, 0x25])

const isWs = (b: number) => WS.has(b)
const isDelim = (b: number) => DELIM.has(b)
const isRegular = (b: number) => !isWs(b) && !isDelim(b)
const isDigit = (b: number) => b >= 0x30 && b <= 0x39

/**
 * Tokenizer over a byte buffer. `pos` is the offset of the next unread byte;
 * callers may move it with {@link Lexer.seek} and tokenize from there.
 */
export class Lexer {
  pos: number

  /**
   * @param buf - The bytes to tokenize.
   * @param start - Offset at which tokenizing begins.
   */
  constructor(
    readonly buf: Uint8Array,
    start = 0,
  ) {
    this.pos = start
  }

  /** Move the read position to byte offset `p`. */
  seek(p: number): void {
    this.pos = p
  }

  /** Skip whitespace and `%` comments. */
  skipWs(): void {
    const b = this.buf
    while (this.pos < b.length) {
      const c = b[this.pos]!
      if (isWs(c)) {
        this.pos++
      } else if (c === 0x25) {
        // % comment → to end of line (the EOL itself is consumed as whitespace)
        this.pos++
        while (this.pos < b.length && b[this.pos] !== 0x0a && b[this.pos] !== 0x0d) this.pos++
      } else {
        break
      }
    }
  }

  /** Peek the next token without consuming (cheap: save/restore pos). */
  peek(): Token {
    const save = this.pos
    const t = this.next()
    this.pos = save
    return t
  }

  /**
   * Read and consume the next token.
   *
   * Stray bytes that cannot start a token (a lone `>`, an unmatched `)`) are
   * skipped. This is done in a loop rather than by recursion so that a long
   * run of such bytes in a damaged or hostile file cannot overflow the stack.
   *
   * @returns The next token, or `{ type: 'eof' }` at end of input.
   */
  next(): Token {
    const b = this.buf
    for (;;) {
      this.skipWs()
      if (this.pos >= b.length) return { type: 'eof' }
      const c = b[this.pos]!

      // Delimiters / structured tokens
      if (c === 0x5b) {
        this.pos++
        return { type: 'delim', value: '[' }
      }
      if (c === 0x5d) {
        this.pos++
        return { type: 'delim', value: ']' }
      }
      if (c === 0x7b) {
        this.pos++
        return { type: 'delim', value: '{' }
      }
      if (c === 0x7d) {
        this.pos++
        return { type: 'delim', value: '}' }
      }
      if (c === 0x3c) {
        if (b[this.pos + 1] === 0x3c) {
          this.pos += 2
          return { type: 'delim', value: '<<' }
        }
        return this.readHexString()
      }
      if (c === 0x3e) {
        if (b[this.pos + 1] === 0x3e) {
          this.pos += 2
          return { type: 'delim', value: '>>' }
        }
        this.pos++ // stray '>' — tolerate
        continue
      }
      if (c === 0x28) return this.readLiteralString()
      if (c === 0x2f) return this.readName()

      // Number: digit, sign, or '.'
      if (isDigit(c) || c === 0x2b || c === 0x2d || c === 0x2e) {
        const numTok = this.tryReadNumber()
        if (numTok) return numTok
        // fall through: treat as keyword (e.g. malformed) below
      }

      // Keyword / bare token (obj, endobj, stream, R, true, false, null, …)
      let end = this.pos
      while (end < b.length && isRegular(b[end]!)) end++
      if (end === this.pos) {
        this.pos++ // unknown single byte — skip and continue
        continue
      }
      const kw = latin1(b, this.pos, end)
      this.pos = end
      return { type: 'kw', value: kw }
    }
  }

  /** Raw byte access for stream payloads; returns a view, not a copy. */
  slice(from: number, to: number): Uint8Array {
    return this.buf.subarray(from, to)
  }

  /**
   * Try to read a number (optional sign, digits, at most one '.') at `pos`.
   * Returns null and leaves `pos` untouched when no digit is present.
   */
  private tryReadNumber(): Token | null {
    const b = this.buf
    let p = this.pos
    let seenDigit = false
    let seenDot = false
    if (b[p] === 0x2b || b[p] === 0x2d) p++
    while (p < b.length) {
      const ch = b[p]!
      if (isDigit(ch)) {
        seenDigit = true
        p++
      } else if (ch === 0x2e && !seenDot) {
        seenDot = true
        p++
      } else {
        break
      }
    }
    if (!seenDigit) return null
    const str = latin1(b, this.pos, p)
    const value = Number(str)
    if (Number.isNaN(value)) return null
    this.pos = p
    return { type: 'num', value }
  }

  /** Read a `/Name`, decoding `#xx` hex escapes; stops at whitespace or a delimiter. */
  private readName(): Token {
    const b = this.buf
    this.pos++ // skip '/'
    let out = ''
    while (this.pos < b.length) {
      const ch = b[this.pos]!
      if (isWs(ch) || isDelim(ch)) break
      if (ch === 0x23 && this.pos + 2 < b.length) {
        const hi = hexVal(b[this.pos + 1]!)
        const lo = hexVal(b[this.pos + 2]!)
        if (hi >= 0 && lo >= 0) {
          out += String.fromCharCode((hi << 4) | lo)
          this.pos += 3
          continue
        }
      }
      // A '#' not followed by two hex digits is kept literally.
      out += String.fromCharCode(ch)
      this.pos++
    }
    return { type: 'name', value: out }
  }

  /**
   * Read a `( … )` literal string: handles balanced nested parentheses,
   * backslash escapes (including 1–3 digit octal and line continuations) and
   * normalizes bare CR / CRLF to LF. An unterminated string ends at end of
   * input.
   */
  private readLiteralString(): Token {
    const b = this.buf
    this.pos++ // skip '('
    const out: number[] = []
    let depth = 1
    while (this.pos < b.length) {
      let ch = b[this.pos++]!
      if (ch === 0x5c) {
        // backslash escape
        // A backslash as the very last byte has nothing to escape: stop here
        // rather than emitting a spurious NUL and running past the buffer.
        if (this.pos >= b.length) break
        const e = b[this.pos++]!
        switch (e) {
          case 0x6e: out.push(0x0a); break // \n
          case 0x72: out.push(0x0d); break // \r
          case 0x74: out.push(0x09); break // \t
          case 0x62: out.push(0x08); break // \b
          case 0x66: out.push(0x0c); break // \f
          case 0x28: out.push(0x28); break // \(
          case 0x29: out.push(0x29); break // \)
          case 0x5c: out.push(0x5c); break // \\
          case 0x0d:
            if (b[this.pos] === 0x0a) this.pos++ // \ + CRLF line continuation
            break
          case 0x0a:
            break // \ + LF line continuation
          default:
            if (e >= 0x30 && e <= 0x37) {
              // octal escape (1–3 digits)
              let v = e - 0x30
              for (let k = 0; k < 2; k++) {
                const d = b[this.pos]!
                if (d >= 0x30 && d <= 0x37) {
                  v = (v << 3) | (d - 0x30)
                  this.pos++
                } else break
              }
              out.push(v & 0xff)
            } else {
              out.push(e) // unknown escape → literal char
            }
        }
        continue
      }
      if (ch === 0x28) {
        depth++
        out.push(ch)
        continue
      }
      if (ch === 0x29) {
        depth--
        if (depth === 0) break
        out.push(ch)
        continue
      }
      if (ch === 0x0d) {
        // CR or CRLF → normalize to LF inside literal strings
        if (b[this.pos] === 0x0a) this.pos++
        ch = 0x0a
      }
      out.push(ch)
    }
    return { type: 'str', value: Uint8Array.from(out), encoding: 'literal' }
  }

  /**
   * Read a `< … >` hex string. Non-hex bytes are ignored and an odd number of
   * digits is padded with a trailing 0.
   */
  private readHexString(): Token {
    const b = this.buf
    this.pos++ // skip '<'
    const nibbles: number[] = []
    while (this.pos < b.length) {
      const ch = b[this.pos++]!
      if (ch === 0x3e) break
      const v = hexVal(ch)
      if (v >= 0) nibbles.push(v)
    }
    if (nibbles.length % 2 === 1) nibbles.push(0)
    const out = new Uint8Array(nibbles.length / 2)
    for (let i = 0; i < out.length; i++) out[i] = (nibbles[2 * i]! << 4) | nibbles[2 * i + 1]!
    return { type: 'str', value: out, encoding: 'hex' }
  }
}

/** Value of an ASCII hex digit (0–15), or -1 when `b` is not a hex digit. */
function hexVal(b: number): number {
  if (b >= 0x30 && b <= 0x39) return b - 0x30
  if (b >= 0x41 && b <= 0x46) return b - 0x41 + 10
  if (b >= 0x61 && b <= 0x66) return b - 0x61 + 10
  return -1
}

/** Decode bytes `from` (inclusive) to `to` (exclusive) as Latin-1: one char per byte. */
export function latin1(b: Uint8Array, from: number, to: number): string {
  let s = ''
  for (let i = from; i < to; i++) s += String.fromCharCode(b[i]!)
  return s
}
|