@strav/pdf 0.4.17 → 0.4.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,259 @@
1
+ /**
2
+ * Per-font text decoding (spec §9.6–9.10). Builds a {@link CharMap} that turns
3
+ * show-string bytes into Unicode + glyph advance widths. Decoding precedence:
4
+ * 1. /ToUnicode CMap
5
+ * 2. simple-font /Encoding (base + /Differences) → glyph name → Unicode
6
+ * 3. Type0 Identity/embedded-CMap → CID (Unicode only via /ToUnicode)
7
+ * 4. raw byte → WinAnsi/Latin-1 fallback
8
+ */
9
+
10
+ import {
11
+ type PdfObject,
12
+ type PdfDictionary,
13
+ isDict,
14
+ isName,
15
+ isNum,
16
+ isArr,
17
+ isStream,
18
+ } from '../objects/types.ts'
19
+ import {
20
+ isStandardFontName,
21
+ standardGlyphWidth,
22
+ type StandardFontName,
23
+ } from '../fonts/standard_14.ts'
24
+ import {
25
+ baseEncode,
26
+ winAnsiToUnicode,
27
+ glyphNameToUnicode,
28
+ type BaseEncodingName,
29
+ } from './encodings.ts'
30
+ import { parseCMap, type CMap } from './cmap_parser.ts'
31
+
32
/** One character decoded from a show string. */
export interface DecodedGlyph {
  /** Character code as read from the string (one byte for simple fonts). */
  code: number
  /** Unicode text for the code; may be '' or U+FFFD when no mapping is known. */
  unicode: string
  /** Advance width in text space units per em/1000 (i.e. glyph-space/1000). */
  width1000: number
}

/** Per-font decoder produced by {@link buildCharMap}. */
export interface CharMap {
  /** Split show-string bytes into character codes and decode each one. */
  decode(bytes: Uint8Array): DecodedGlyph[]
  /** Width of the space-like glyph (code 32 / CID space), in /1000 units. */
  spaceWidth: number
}
44
+
45
/** The slice of the reader document this module needs. */
interface Doc {
  /** Resolve an object (e.g. follow a reference); `undefined` passes through as a possible result. */
  resolve(o: PdfObject | undefined): PdfObject | undefined
  /** Return the data bytes for a stream object. NOTE(review): meaning of `num` is not visible here; callers in this file pass -1. */
  getStreamData(s: Extract<PdfObject, { kind: 'stream' }>, num: number): Uint8Array
}

/** Emitted for Type0 glyphs that have no usable /ToUnicode mapping. */
const REPLACEMENT = '�'
51
+
52
/**
 * Build the {@link CharMap} for one font dictionary.
 *
 * /ToUnicode is loaded once up front and handed to the builder selected by the
 * font's /Subtype: `Type0` takes the composite-font path; anything else
 * (including a missing or non-name /Subtype) takes the simple-font path.
 */
export function buildCharMap(fontDict: PdfDictionary, doc: Doc): CharMap {
  const subtypeObj = doc.resolve(fontDict.entries.get('Subtype'))
  const toUnicode = loadToUnicode(fontDict, doc)
  const isComposite = nameOf(subtypeObj) === 'Type0'
  return isComposite
    ? type0CharMap(fontDict, doc, toUnicode)
    : simpleCharMap(fontDict, doc, toUnicode)
}
59
+
60
+ // ── Simple fonts (Type1 / TrueType / Type3 / MMType1) ──────────────────────
61
+
62
/**
 * Char map for simple (single-byte) fonts: each byte of a show string is one
 * character code.
 *
 * Width lookup: the /Widths entry at `code - /FirstChar` when it is a positive
 * number; otherwise the standard-font metric when /BaseFont names a standard
 * font; otherwise the descriptor's /MissingWidth (0 when absent).
 *
 * Unicode lookup: `toUni` when it yields a non-empty value, then the font's
 * encoding table, then WinAnsi; a code with no mapping decodes to ''.
 */
function simpleCharMap(fontDict: PdfDictionary, doc: Doc, toUni?: CMap): CharMap {
  const baseFont = nameOf(doc.resolve(fontDict.entries.get('BaseFont'))) ?? ''
  const std = isStandardFontName(baseFont) ? (baseFont as StandardFontName) : undefined

  // Declared widths; non-numeric entries are recorded as 0.
  const firstChar = numOf(doc.resolve(fontDict.entries.get('FirstChar'))) ?? 0
  const widthsObj = doc.resolve(fontDict.entries.get('Widths'))
  const widths: number[] = []
  if (widthsObj && isArr(widthsObj)) {
    for (const item of widthsObj.items) {
      const resolved = doc.resolve(item)
      widths.push(resolved && isNum(resolved) ? resolved.value : 0)
    }
  }

  const descriptor = doc.resolve(fontDict.entries.get('FontDescriptor'))
  const missingWidth =
    descriptor && isDict(descriptor)
      ? (numOf(doc.resolve(descriptor.entries.get('MissingWidth'))) ?? 0)
      : 0

  // Per-code Unicode from the encoding; consulted only after /ToUnicode misses.
  const encUnicode = buildSimpleEncoding(fontDict, doc)

  function widthOf(code: number): number {
    // Out-of-range indices read as undefined and fall through to the fallbacks.
    const declared = widths[code - firstChar]
    if (declared !== undefined && declared > 0) return declared
    return std ? standardGlyphWidth(std, code) : missingWidth
  }

  function uniOf(code: number): string {
    const mapped = toUni?.unicodeOf(code)
    if (mapped !== undefined && mapped !== '') return mapped
    const fromEncoding = encUnicode[code]
    if (fromEncoding !== undefined && fromEncoding >= 0) return String.fromCodePoint(fromEncoding)
    const fromWinAnsi = winAnsiToUnicode(code)
    return fromWinAnsi ? String.fromCodePoint(fromWinAnsi) : ''
  }

  return {
    spaceWidth: widthOf(0x20) || (std ? standardGlyphWidth(std, 0x20) : 250),
    decode(bytes) {
      return Array.from(bytes, (code) => ({
        code,
        unicode: uniOf(code),
        width1000: widthOf(code),
      }))
    },
  }
}
113
+
114
/**
 * Build the code → Unicode table for a simple font's /Encoding.
 *
 * The table starts as `baseEncode(baseName, c)` for c in 0..255, where the base
 * name is /Encoding itself when it is a name, or its /BaseEncoding when it is a
 * dictionary (undefined otherwise). A dictionary's /Differences array
 * (`[ code /name /name code /name … ]`) then overrides entries: a number resets
 * the current code, each name is assigned to the current code and advances it.
 * Names with no known Unicode value leave the existing entry in place.
 */
function buildSimpleEncoding(fontDict: PdfDictionary, doc: Doc): number[] {
  const enc = doc.resolve(fontDict.entries.get('Encoding'))
  const encDict = enc && isDict(enc) ? enc : undefined

  let baseName: BaseEncodingName | undefined
  if (enc && isName(enc)) {
    baseName = enc.value as BaseEncodingName
  } else if (encDict) {
    const baseEnc = doc.resolve(encDict.entries.get('BaseEncoding'))
    if (baseEnc && isName(baseEnc)) baseName = baseEnc.value as BaseEncodingName
  }

  const table: number[] = Array.from({ length: 256 }, (_, c) => baseEncode(baseName, c))
  if (!encDict) return table

  const diffs = doc.resolve(encDict.entries.get('Differences'))
  if (!diffs || !isArr(diffs)) return table

  let code = 0
  for (const item of diffs.items) {
    const entry = doc.resolve(item)
    if (!entry) continue
    if (isNum(entry)) {
      code = entry.value
    } else if (isName(entry)) {
      const u = glyphNameToUnicode(entry.value)
      table[code] = u >= 0 ? u : table[code]!
      code++
    }
  }
  return table
}
142
+
143
+ // ── Composite (Type0) fonts ────────────────────────────────────────────────
144
+
145
/**
 * Char map for composite (Type0) fonts.
 *
 * Codes are read with the embedded encoding CMap when /Encoding is a stream
 * that parses; otherwise as fixed 2-byte values. The CID used for width lookup
 * is the code itself for Identity-H/V and whenever no parsed CMap is available,
 * else whatever the CMap maps the code to (falling back to the code).
 * Widths come from the first descendant font's /W array, with /DW (default
 * 1000) for CIDs it does not list. Unicode comes only from `toUni`; without a
 * non-empty mapping a glyph decodes to U+FFFD.
 */
function type0CharMap(fontDict: PdfDictionary, doc: Doc, toUni?: CMap): CharMap {
  // Encoding: Identity-H/V → 2-byte identity; or an embedded/named CMap.
  const enc = doc.resolve(fontDict.entries.get('Encoding'))
  let encCMap: CMap | undefined
  let identity = true
  if (enc && isName(enc)) {
    identity = enc.value === 'Identity-H' || enc.value === 'Identity-V'
  } else if (enc && isStream(enc)) {
    identity = false
    try {
      encCMap = parseCMap(doc.getStreamData(enc, -1))
    } catch {
      // Same best-effort policy as /ToUnicode loading: a broken encoding CMap
      // must not abort extraction. Without it, codes are read as 2 bytes and
      // used directly as CIDs.
      encCMap = undefined
    }
  }

  // Descendant CIDFont: /DW + /W widths, keyed by CID.
  let dw = 1000
  const widthByCid = new Map<number, number>()
  const desc = doc.resolve(fontDict.entries.get('DescendantFonts'))
  if (desc && isArr(desc) && desc.items[0]) {
    const cidFont = doc.resolve(desc.items[0])
    if (cidFont && isDict(cidFont)) {
      dw = numOf(doc.resolve(cidFont.entries.get('DW'))) ?? 1000
      const W = doc.resolve(cidFont.entries.get('W'))
      if (W && isArr(W)) parseCidWidths(W.items, doc, widthByCid)
    }
  }

  const codeBytes = encCMap ? encCMap.codeBytes : 2
  const cidOf = (code: number): number =>
    identity ? code : (encCMap?.cidOf(code) ?? code)

  const uniOf = (code: number): string => {
    if (toUni) {
      const u = toUni.unicodeOf(code)
      if (u !== undefined && u !== '') return u
    }
    return REPLACEMENT // no ToUnicode for an embedded-cmap-only font (limitation)
  }

  return {
    // Heuristic: treats code 0x20 as the space glyph.
    spaceWidth: widthByCid.get(cidOf(0x20)) ?? dw,
    decode(bytes) {
      const out: DecodedGlyph[] = []
      const codes = encCMap ? encCMap.readCodes(bytes) : readFixed(bytes, codeBytes)
      for (const code of codes) {
        const cid = cidOf(code)
        out.push({
          code,
          unicode: uniOf(code), // /ToUnicode is keyed by character code, not CID
          width1000: widthByCid.get(cid) ?? dw,
        })
      }
      return out
    },
  }
}
199
+
200
/**
 * Split `bytes` into consecutive big-endian codes of `n` bytes each.
 *
 * Trailing bytes that do not fill a whole code are dropped. Codes are built
 * with plain arithmetic rather than `<<`/`|`, so a 4-byte code with the high
 * bit set stays positive instead of wrapping to a negative 32-bit integer
 * (results are exact for n ≤ 6).
 *
 * @param bytes Raw show-string bytes.
 * @param n Bytes per code; anything other than a positive integer yields `[]`
 *   (a non-positive step would otherwise never advance).
 * @returns The codes in input order.
 */
function readFixed(bytes: Uint8Array, n: number): number[] {
  const out: number[] = []
  if (!Number.isInteger(n) || n < 1) return out
  for (let i = 0; i + n <= bytes.length; i += n) {
    let c = 0
    for (let k = 0; k < n; k++) c = c * 256 + bytes[i + k]!
    out.push(c)
  }
  return out
}
209
+
210
/**
 * Parse a CIDFont /W array into `out` (CID → width).
 *
 * Two entry forms are recognised:
 *   - `c [ w1 w2 … ]`  — consecutive CIDs starting at `c`
 *   - `c1 c2 w`        — every CID in c1..c2 gets width `w`
 *
 * Parsing stops at the first entry that does not start with a number. The
 * range form is clamped to 0..65535: the bounds come straight from the file,
 * and an absurd `c2` (or very negative `c1`) would otherwise loop and fill the
 * map for an unbounded number of iterations.
 *
 * @param items Raw /W array items (resolved through `doc` as needed).
 * @param doc Resolver for the items.
 * @param out Map that receives the widths; existing keys are overwritten.
 */
function parseCidWidths(
  items: PdfObject[],
  doc: Doc,
  out: Map<number, number>,
): void {
  const MAX_CID = 0xffff
  let i = 0
  while (i < items.length) {
    const first = doc.resolve(items[i])
    if (!first || !isNum(first)) break
    const second = doc.resolve(items[i + 1])
    if (second && isArr(second)) {
      // c [ w1 w2 … ] : CIDs c, c+1, …
      let cid = first.value
      for (const w of second.items) {
        const wr = doc.resolve(w)
        if (wr && isNum(wr)) out.set(cid++, wr.value)
      }
      i += 2
    } else {
      // c1 c2 w : CIDs c1..c2 all width w
      const w = doc.resolve(items[i + 2])
      if (second && isNum(second) && w && isNum(w)) {
        const from = Math.max(first.value, 0)
        const to = Math.min(second.value, MAX_CID)
        for (let cid = from; cid <= to; cid++) out.set(cid, w.value)
      }
      i += 3
    }
  }
}
239
+
240
+ // ── Shared helpers ─────────────────────────────────────────────────────────
241
+
242
/**
 * Load and parse the font's /ToUnicode CMap.
 *
 * Best effort: returns `undefined` when the entry is missing, is not a stream,
 * or reading/parsing it throws.
 */
function loadToUnicode(fontDict: PdfDictionary, doc: Doc): CMap | undefined {
  const stream = doc.resolve(fontDict.entries.get('ToUnicode'))
  if (!stream || !isStream(stream)) return undefined
  try {
    return parseCMap(doc.getStreamData(stream, -1))
  } catch {
    return undefined
  }
}
253
+
254
/** Value of a name object, or `undefined` for anything else (including no object). */
function nameOf(o: PdfObject | undefined): string | undefined {
  if (!o || !isName(o)) return undefined
  return o.value
}
257
/** Value of a numeric object, or `undefined` for anything else (including no object). */
function numOf(o: PdfObject | undefined): number | undefined {
  if (!o || !isNum(o)) return undefined
  return o.value
}
@@ -0,0 +1,27 @@
1
+ /**
2
+ * Read side (M13) sub-module barrel. The headline API plus lower-level pieces
3
+ * for advanced callers, mirroring how `document/index.ts` exposes internals.
4
+ */
5
+
6
+ export { extractText, PdfReader } from './extract.ts'
7
+ export type {
8
+ ExtractOptions,
9
+ ExtractResult,
10
+ ExtractedPage,
11
+ PdfInfo,
12
+ } from './extract.ts'
13
+
14
+ export { PdfReaderDocument } from './document.ts'
15
+ export { Lexer } from './lexer.ts'
16
+ export { ObjectParser, parseObjectFrom } from './object_parser.ts'
17
+ export { parseXref, bruteForceXref, findStartXref } from './xref.ts'
18
+ export type { XrefTable, XrefEntry } from './xref.ts'
19
+ export { parseObjStm } from './objstm.ts'
20
+ export { parseCMap, CMap } from './cmap_parser.ts'
21
+ export { buildCharMap } from './fonts.ts'
22
+ export type { CharMap, DecodedGlyph } from './fonts.ts'
23
+ export { interpretText } from './text_interpreter.ts'
24
+ export { runsToText } from './layout.ts'
25
+ export type { Run } from './layout.ts'
26
+ export { buildDecryptor } from './decrypt.ts'
27
+ export type { Decryptor } from './decrypt.ts'
@@ -0,0 +1,106 @@
1
+ /**
2
+ * Layout heuristics: positioned glyph runs → readable plain text. Runs are
3
+ * grouped into lines by baseline proximity; intra-line gaps become spaces
4
+ * (proportional to the font's space width); inter-line drops become newlines,
5
+ * with a blank line for paragraph-sized jumps. No column/table reconstruction.
6
+ */
7
+
8
/** A positioned piece of text produced by the content-stream interpreter. */
export interface Run {
  text: string
  /** Device-space X of the run's first glyph origin. */
  x: number
  /** Device-space X just past the run's last glyph. */
  endX: number
  /** Device-space baseline Y. */
  y: number
  /** Effective device font size. */
  fs: number
  /** Device-space width of the space glyph. */
  spaceW: number
}

// Tunable thresholds (fractions of the space-glyph width / font size).
const SAME_LINE = 0.3 // |Δy| ≤ SAME_LINE·fs ⇒ same line
const GLUE = 0.2 // gap < GLUE·spaceW ⇒ no separator
const WIDE = 2.5 // gap ≥ WIDE·spaceW ⇒ multiple spaces
const MAX_GAP_SPACES = 8 // cap on spaces emitted for one wide gap
const PARA = 1.6 // line drop > PARA·fs ⇒ blank line

/**
 * Turn positioned runs into plain text.
 *
 * Empty runs are dropped; the rest are ordered top-to-bottom (descending Y),
 * then left-to-right, and grouped into lines by comparing each run's baseline
 * with the first run of the current line. Within a line, the horizontal gap
 * between neighbours decides whether nothing, one space, or several spaces are
 * inserted. Lines are joined with '\n', plus an extra '\n' when the drop from
 * the previous line exceeds the paragraph threshold.
 *
 * The input array is not modified (sorting happens on a filtered copy).
 *
 * @param runs Runs for one page, in any order.
 * @param normalize When true (default), collapse space/tab runs, strip trailing
 *   whitespace per line, and squeeze blank lines via `normalizeWhitespace`.
 * @returns The reconstructed text, or '' when there are no non-empty runs.
 */
export function runsToText(runs: Run[], normalize = true): string {
  const items = runs.filter((r) => r.text.length > 0)
  if (items.length === 0) return ''

  items.sort((a, b) => (b.y - a.y) || (a.x - b.x))

  // Group into lines by baseline proximity.
  const lines: Run[][] = []
  let cur: Run[] = []
  let lineY = items[0]!.y
  let lineFs = items[0]!.fs
  for (const r of items) {
    if (cur.length && Math.abs(r.y - lineY) > SAME_LINE * Math.max(lineFs, r.fs)) {
      lines.push(cur)
      cur = []
    }
    if (cur.length === 0) {
      // The first run of a line fixes the reference baseline and size.
      lineY = r.y
      lineFs = r.fs
    }
    cur.push(r)
  }
  if (cur.length) lines.push(cur)

  let out = ''
  let prevY: number | null = null
  let prevFs = lineFs
  for (const line of lines) {
    // Runs on one line may differ slightly in Y, so re-order purely by X.
    line.sort((a, b) => a.x - b.x)
    const y = line[0]!.y
    // Largest font size on the line. Folded in a loop instead of
    // Math.max(...sizes): spreading one argument per run can exceed the
    // engine's argument limit on very long lines.
    let fs = -Infinity
    for (const r of line) fs = Math.max(fs, r.fs)

    if (prevY !== null) {
      out += '\n'
      if (prevY - y > PARA * Math.max(fs, prevFs)) out += '\n'
    }

    let lineText = ''
    let prev: Run | null = null
    for (const r of line) {
      if (prev) {
        const gap = r.x - prev.endX
        // Fall back to a quarter of the font size when neither run knows its space width.
        const sw = prev.spaceW || r.spaceW || fs * 0.25
        if (gap >= WIDE * sw) {
          lineText += ' '.repeat(Math.min(MAX_GAP_SPACES, Math.round(gap / sw)))
        } else if (gap >= GLUE * sw) {
          lineText += ' '
        }
      }
      lineText += r.text
      prev = r
    }
    out += lineText
    prevY = y
    prevFs = fs
  }

  return normalize ? normalizeWhitespace(out) : out
}

/**
 * Tidy extracted text: collapse each run of spaces/tabs to one space, strip
 * trailing whitespace from every line, reduce any run of blank lines to a
 * single blank line, and drop blank lines at the start and end.
 */
function normalizeWhitespace(s: string): string {
  const lines = s.split('\n').map((l) => l.replace(/[ \t]+/g, ' ').replace(/[ \t]+$/g, ''))
  // Keep only the first blank line of each consecutive run.
  const collapsed: string[] = []
  let blanks = 0
  for (const l of lines) {
    if (l.trim() === '') {
      blanks++
      if (blanks <= 1) collapsed.push('')
    } else {
      blanks = 0
      collapsed.push(l)
    }
  }
  while (collapsed.length && collapsed[0] === '') collapsed.shift()
  while (collapsed.length && collapsed[collapsed.length - 1] === '') collapsed.pop()
  return collapsed.join('\n')
}
@@ -0,0 +1,270 @@
1
+ /**
2
+ * PDF tokenizer (spec §7.2). Scans a byte buffer into the lexical tokens the
3
+ * object parser consumes. Pure and position-addressable: callers may `seek`
4
+ * to a known byte offset (from the xref table) and tokenize from there.
5
+ *
6
+ * Whitespace = NUL TAB LF FF CR SPACE. Delimiters = ( ) < > [ ] { } / %.
7
+ * Comments (`%` … EOL) are skipped except the `%PDF`/`%%EOF` markers, which
8
+ * callers locate by raw byte scanning, not through this lexer.
9
+ */
10
+
11
/** Lexical tokens handed to the object parser. */
export type Token =
  | { type: 'num'; value: number }
  | { type: 'name'; value: string }
  | { type: 'str'; value: Uint8Array; encoding: 'literal' | 'hex' }
  | { type: 'delim'; value: '[' | ']' | '<<' | '>>' | '{' | '}' }
  | { type: 'kw'; value: string }
  | { type: 'eof' }

// NUL TAB LF FF CR SPACE
const WS = new Set([0x00, 0x09, 0x0a, 0x0c, 0x0d, 0x20])
// ( ) < > [ ] { } / %
const DELIM = new Set([0x28, 0x29, 0x3c, 0x3e, 0x5b, 0x5d, 0x7b, 0x7d, 0x2f, 0x25])

const isWs = (b: number) => WS.has(b)
const isDelim = (b: number) => DELIM.has(b)
const isRegular = (b: number) => !isWs(b) && !isDelim(b)
const isDigit = (b: number) => b >= 0x30 && b <= 0x39

/**
 * Tokenizer over a byte buffer. `pos` is the offset of the next unread byte and
 * may be moved freely with {@link Lexer.seek}. The lexer is tolerant: malformed
 * input yields best-effort tokens (or is skipped) rather than throwing.
 */
export class Lexer {
  pos: number

  constructor(
    readonly buf: Uint8Array,
    start = 0,
  ) {
    this.pos = start
  }

  /** Move to absolute byte offset `p` (not validated against the buffer length). */
  seek(p: number): void {
    this.pos = p
  }

  /** Skip whitespace and `%` comments. */
  skipWs(): void {
    const b = this.buf
    while (this.pos < b.length) {
      const c = b[this.pos]!
      if (isWs(c)) {
        this.pos++
      } else if (c === 0x25) {
        // % comment → to end of line
        this.pos++
        while (this.pos < b.length && b[this.pos] !== 0x0a && b[this.pos] !== 0x0d) this.pos++
      } else {
        break
      }
    }
  }

  /** Peek the next token without consuming (cheap: save/restore pos). */
  peek(): Token {
    const save = this.pos
    const t = this.next()
    this.pos = save
    return t
  }

  /**
   * Read and consume the next token; returns `{ type: 'eof' }` at end of buffer.
   *
   * Bytes that cannot start any token (a stray `>` or `)`) are skipped. This is
   * done in a loop, not by recursion, so a long run of such bytes in a damaged
   * or hostile file cannot overflow the call stack.
   */
  next(): Token {
    const b = this.buf
    for (;;) {
      this.skipWs()
      if (this.pos >= b.length) return { type: 'eof' }
      const c = b[this.pos]!

      // Delimiters / structured tokens
      if (c === 0x5b) {
        this.pos++
        return { type: 'delim', value: '[' }
      }
      if (c === 0x5d) {
        this.pos++
        return { type: 'delim', value: ']' }
      }
      if (c === 0x7b) {
        this.pos++
        return { type: 'delim', value: '{' }
      }
      if (c === 0x7d) {
        this.pos++
        return { type: 'delim', value: '}' }
      }
      if (c === 0x3c) {
        if (b[this.pos + 1] === 0x3c) {
          this.pos += 2
          return { type: 'delim', value: '<<' }
        }
        return this.readHexString()
      }
      if (c === 0x3e) {
        if (b[this.pos + 1] === 0x3e) {
          this.pos += 2
          return { type: 'delim', value: '>>' }
        }
        this.pos++ // stray '>' — tolerate
        continue
      }
      if (c === 0x28) return this.readLiteralString()
      if (c === 0x2f) return this.readName()

      // Number: digit, sign, or '.'
      if (isDigit(c) || c === 0x2b || c === 0x2d || c === 0x2e) {
        const numTok = this.tryReadNumber()
        if (numTok) return numTok
        // fall through: treat as keyword (e.g. malformed) below
      }

      // Keyword / bare token (obj, endobj, stream, R, true, false, null, …)
      let s = this.pos
      while (s < b.length && isRegular(b[s]!)) s++
      if (s === this.pos) {
        this.pos++ // unknown single byte — skip and continue
        continue
      }
      const kw = latin1(b, this.pos, s)
      this.pos = s
      return { type: 'kw', value: kw }
    }
  }

  /** Raw byte access for stream payloads (a view on the buffer, not a copy). */
  slice(from: number, to: number): Uint8Array {
    return this.buf.subarray(from, to)
  }

  /**
   * Try to read `[+-]digits[.digits]` at `pos`. Returns null, leaving `pos`
   * untouched, when there is no digit at all (e.g. a lone sign or dot).
   */
  private tryReadNumber(): Token | null {
    const b = this.buf
    let p = this.pos
    let seenDigit = false
    let seenDot = false
    if (b[p] === 0x2b || b[p] === 0x2d) p++
    while (p < b.length) {
      const ch = b[p]!
      if (isDigit(ch)) {
        seenDigit = true
        p++
      } else if (ch === 0x2e && !seenDot) {
        seenDot = true
        p++
      } else {
        break
      }
    }
    if (!seenDigit) return null
    const str = latin1(b, this.pos, p)
    const value = Number(str)
    if (Number.isNaN(value)) return null
    this.pos = p
    return { type: 'num', value }
  }

  /** Read a `/Name`, decoding `#xx` hex escapes; ends at whitespace or a delimiter. */
  private readName(): Token {
    const b = this.buf
    this.pos++ // skip '/'
    let out = ''
    while (this.pos < b.length) {
      const ch = b[this.pos]!
      if (isWs(ch) || isDelim(ch)) break
      if (ch === 0x23 && this.pos + 2 < b.length) {
        const hi = hexVal(b[this.pos + 1]!)
        const lo = hexVal(b[this.pos + 2]!)
        if (hi >= 0 && lo >= 0) {
          out += String.fromCharCode((hi << 4) | lo)
          this.pos += 3
          continue
        }
        // Not a valid escape: keep the '#' as an ordinary character.
      }
      out += String.fromCharCode(ch)
      this.pos++
    }
    return { type: 'name', value: out }
  }

  /**
   * Read a `( … )` literal string: balanced nested parentheses, backslash
   * escapes (including 1–3 digit octal and line continuations), and CR / CRLF
   * normalised to LF. An unterminated string yields the bytes read so far.
   */
  private readLiteralString(): Token {
    const b = this.buf
    this.pos++ // skip '('
    const out: number[] = []
    let depth = 1
    while (this.pos < b.length) {
      let ch = b[this.pos++]!
      if (ch === 0x5c) {
        // backslash escape
        // A backslash as the very last byte has nothing to escape: stop here
        // instead of reading past the buffer and emitting a bogus 0x00.
        if (this.pos >= b.length) break
        const e = b[this.pos++]!
        switch (e) {
          case 0x6e: out.push(0x0a); break // \n
          case 0x72: out.push(0x0d); break // \r
          case 0x74: out.push(0x09); break // \t
          case 0x62: out.push(0x08); break // \b
          case 0x66: out.push(0x0c); break // \f
          case 0x28: out.push(0x28); break // \(
          case 0x29: out.push(0x29); break // \)
          case 0x5c: out.push(0x5c); break // \\
          case 0x0d:
            if (b[this.pos] === 0x0a) this.pos++ // \ + CRLF line continuation
            break
          case 0x0a:
            break // \ + LF line continuation
          default:
            if (e >= 0x30 && e <= 0x37) {
              // octal escape (1–3 digits)
              let v = e - 0x30
              for (let k = 0; k < 2; k++) {
                const d = b[this.pos]!
                if (d >= 0x30 && d <= 0x37) {
                  v = (v << 3) | (d - 0x30)
                  this.pos++
                } else break
              }
              out.push(v & 0xff) // high-order overflow is dropped
            } else {
              out.push(e) // unknown escape → literal char
            }
        }
        continue
      }
      if (ch === 0x28) {
        depth++
        out.push(ch)
        continue
      }
      if (ch === 0x29) {
        depth--
        if (depth === 0) break
        out.push(ch)
        continue
      }
      if (ch === 0x0d) {
        // CR or CRLF → normalize to LF inside literal strings
        if (b[this.pos] === 0x0a) this.pos++
        ch = 0x0a
      }
      out.push(ch)
    }
    return { type: 'str', value: Uint8Array.from(out), encoding: 'literal' }
  }

  /**
   * Read a `< … >` hex string. Non-hex bytes (whitespace included) are ignored,
   * and an odd number of digits is padded with a trailing 0.
   */
  private readHexString(): Token {
    const b = this.buf
    this.pos++ // skip '<'
    const nibbles: number[] = []
    while (this.pos < b.length) {
      const ch = b[this.pos++]!
      if (ch === 0x3e) break
      const v = hexVal(ch)
      if (v >= 0) nibbles.push(v)
    }
    if (nibbles.length % 2 === 1) nibbles.push(0)
    const out = new Uint8Array(nibbles.length / 2)
    for (let i = 0; i < out.length; i++) out[i] = (nibbles[2 * i]! << 4) | nibbles[2 * i + 1]!
    return { type: 'str', value: out, encoding: 'hex' }
  }
}

/** Value of an ASCII hex digit (either case), or -1 when `b` is not one. */
function hexVal(b: number): number {
  if (b >= 0x30 && b <= 0x39) return b - 0x30
  if (b >= 0x41 && b <= 0x46) return b - 0x41 + 10
  if (b >= 0x61 && b <= 0x66) return b - 0x61 + 10
  return -1
}

/** Decode `b[from, to)` as Latin-1: one char per byte, code unit = byte value. */
export function latin1(b: Uint8Array, from: number, to: number): string {
  let s = ''
  for (let i = from; i < to; i++) s += String.fromCharCode(b[i]!)
  return s
}