descent 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +285 -0
- data/README.md +583 -0
- data/SYNTAX.md +334 -0
- data/exe/descent +15 -0
- data/lib/descent/ast.rb +69 -0
- data/lib/descent/generator.rb +489 -0
- data/lib/descent/ir.rb +98 -0
- data/lib/descent/ir_builder.rb +1479 -0
- data/lib/descent/lexer.rb +308 -0
- data/lib/descent/parser.rb +450 -0
- data/lib/descent/railroad.rb +272 -0
- data/lib/descent/templates/rust/_command.liquid +174 -0
- data/lib/descent/templates/rust/parser.liquid +1163 -0
- data/lib/descent/tools/debug.rb +115 -0
- data/lib/descent/tools/diagram.rb +48 -0
- data/lib/descent/tools/generate.rb +47 -0
- data/lib/descent/tools/validate.rb +56 -0
- data/lib/descent/validator.rb +231 -0
- data/lib/descent/version.rb +5 -0
- data/lib/descent.rb +34 -0
- metadata +101 -0
|
@@ -0,0 +1,1163 @@
|
|
|
1
|
+
//! Generated parser - DO NOT EDIT
|
|
2
|
+
//!
|
|
3
|
+
//! Generated by descent from {{ parser }}.desc
|
|
4
|
+
//!
|
|
5
|
+
//! Callback-based recursive descent parser.
|
|
6
|
+
//! Call stack = element stack. True recursion handles nesting naturally.
|
|
7
|
+
|
|
8
|
+
use std::ops::Range;
|
|
9
|
+
{% if keywords.size > 0 %}
|
|
10
|
+
use phf::phf_map;
|
|
11
|
+
{% endif %}
|
|
12
|
+
|
|
13
|
+
/// Events emitted by the parser.
///
/// Each bracket type produces a `...Start` / `...End` pair of variants, each
/// content type produces one variant carrying the matched bytes, and `Error`
/// reports a parse problem. Every variant carries a `span` into the input.
///
/// `Eq` is derived alongside `PartialEq`: all field types (`Range<usize>`,
/// `Cow<[u8]>`, `ParseErrorCode`) are `Eq`, so events can be compared and
/// used wherever total equality is required.
#[derive(Debug, Clone, PartialEq, Eq)]
pub enum Event<'a> {
{% for type in types %}
{% if type.kind == "bracket" %}
    {{ type.name }}Start { span: Range<usize> },
    {{ type.name }}End { span: Range<usize> },
{% elsif type.kind == "content" %}
    {{ type.name }} { content: std::borrow::Cow<'a, [u8]>, span: Range<usize> },
{% endif %}
{% endfor %}
    Error { code: ParseErrorCode, span: Range<usize> },
}
|
|
26
|
+
|
|
27
|
+
impl<'a> Event<'a> {
    /// Render this event as a single line of text for test output.
    ///
    /// The line is the variant name, then (for content events) the content
    /// as a debug-quoted string, then `@ start..end` from the span. Content
    /// that is not valid UTF-8 is shown as `"<invalid utf8>"`.
    pub fn format_line(&self) -> String {
        match self {
{% for type in types %}
{% if type.kind == "bracket" %}
            Event::{{ type.name }}Start { span } => format!("{{ type.name }}Start @ {}..{}", span.start, span.end),
            Event::{{ type.name }}End { span } => format!("{{ type.name }}End @ {}..{}", span.start, span.end),
{% elsif type.kind == "content" %}
            Event::{{ type.name }} { content, span } => {
                let text = std::str::from_utf8(content.as_ref()).unwrap_or("<invalid utf8>");
                format!("{{ type.name }} {:?} @ {}..{}", text, span.start, span.end)
            }
{% endif %}
{% endfor %}
            Event::Error { code, span } => format!("Error {:?} @ {}..{}", code, span.start, span.end),
        }
    }
}
|
|
52
|
+
|
|
53
|
+
{% comment %} Generate phf keyword maps {% endcomment %}
|
|
54
|
+
{% for kw in keywords %}
/// Keyword lookup map for {{ kw.name }}.
/// Generated from |keywords[{{ kw.name }}] - O(1) perfect hash lookup.
///
/// Each value is the zero-based position of the keyword in its mapping list.
static {{ kw.const_name }}: phf::Map<&'static [u8], u8> = phf_map! {
{% for m in kw.mappings %}
    b"{{ m.keyword }}" => {{ forloop.index0 }}u8,
{% endfor %}
};
{% endfor %}
|
|
64
|
+
|
|
65
|
+
/// Error codes for parse errors.
///
/// Besides the two fixed codes, one `Unclosed<Type>` code is generated per
/// distinct return type of functions that expect a closing character, plus
/// one code per custom `/error(code)` name. Duplicates are emitted only once.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ParseErrorCode {
    UnexpectedEof,
    UnexpectedChar,
{% comment %} Track emitted codes in a comma-delimited list; matching on ",Name," avoids partial-name hits {% endcomment %}
{% assign seen_error_codes = ",UnexpectedEof,UnexpectedChar," %}
{% for func in functions %}
{% if func.expects_char %}
{% assign unclosed_code = "Unclosed" | append: func.return_type %}
{% assign code_pattern = unclosed_code | prepend: "," | append: "," %}
{% unless seen_error_codes contains code_pattern %}
    {{ unclosed_code }},
{% assign seen_error_codes = seen_error_codes | append: unclosed_code | append: "," %}
{% endunless %}
{% endif %}
{% endfor %}
{% comment %} Custom error codes from /error(code) calls - skipped when already emitted above {% endcomment %}
{% for code in custom_error_codes %}
{% assign pascal_code = code | pascalcase %}
{% assign code_pattern = pascal_code | prepend: "," | append: "," %}
{% unless seen_error_codes contains code_pattern %}
    {{ pascal_code }},
{% assign seen_error_codes = seen_error_codes | append: pascal_code | append: "," %}
{% endunless %}
{% endfor %}
}
|
|
93
|
+
|
|
94
|
+
/// Callback-based parser.
///
/// Uses true recursive descent - the call stack IS the element stack.
pub struct Parser<'a> {
    /// Input bytes being parsed.
    input: &'a [u8],
    /// Index into `input` of the next byte to read.
    pos: usize,
    /// Start of the content currently being accumulated (set by `mark`).
    mark_pos: usize,
    /// End of the accumulated content; `usize::MAX` means "not set since the last mark".
    term_pos: usize,
    /// Bytes placed in front of the accumulated content when it is taken.
    prepend_buf: Vec<u8>,
    /// Current line number, starting at 1.
    line: u32,
    /// Current column number, starting at 1.
    column: u32,
}
|
|
106
|
+
|
|
107
|
+
#[allow(unused_variables, dead_code)]
|
|
108
|
+
impl<'a> Parser<'a> {
|
|
109
|
+
/// Create a new parser for the given input.
|
|
110
|
+
pub fn new(input: &'a [u8]) -> Self {
|
|
111
|
+
Self {
|
|
112
|
+
input,
|
|
113
|
+
pos: 0,
|
|
114
|
+
mark_pos: 0,
|
|
115
|
+
term_pos: 0,
|
|
116
|
+
prepend_buf: Vec::new(),
|
|
117
|
+
line: 1,
|
|
118
|
+
column: 1,
|
|
119
|
+
}
|
|
120
|
+
}
|
|
121
|
+
|
|
122
|
+
/// Parse the input, calling the callback for each event.
///
/// Consumes the parser and runs the grammar's entry-point function.
/// The callback receives events in document order.
/// For backpressure, have the callback send to a bounded channel.
pub fn parse<F>(mut self, mut on_event: F)
where
    F: FnMut(Event<'a>),
{
    let sink = &mut on_event;
    self.parse_{{ entry_point | remove: "/" }}(sink);
}
|
|
132
|
+
|
|
133
|
+
// ========== Helpers ==========
|
|
134
|
+
|
|
135
|
+
#[inline(always)]
|
|
136
|
+
fn peek(&self) -> Option<u8> {
|
|
137
|
+
self.input.get(self.pos).copied()
|
|
138
|
+
}
|
|
139
|
+
|
|
140
|
+
{% if trace %}
|
|
141
|
+
/// Format a byte for trace display.
///
/// `None` is shown as `EOF`; newline, tab, carriage return and space get
/// quoted escape forms; other printable ASCII is shown quoted; everything
/// else is shown as a two-digit hex value.
fn trace_byte(b: Option<u8>) -> String {
    let byte = match b {
        Some(value) => value,
        None => return "EOF".to_string(),
    };
    let fixed = match byte {
        b'\n' => "'\\n'",
        b'\t' => "'\\t'",
        b'\r' => "'\\r'",
        b' ' => "' '",
        other if other.is_ascii_graphic() => return format!("'{}'", other as char),
        other => return format!("0x{:02x}", other),
    };
    fixed.to_string()
}
|
|
153
|
+
|
|
154
|
+
/// Format accumulated content for trace display (truncated).
|
|
155
|
+
/// Shows slice content only (prepend buffer shown separately if non-empty).
|
|
156
|
+
fn trace_content(&self) -> String {
|
|
157
|
+
let end = if self.term_pos != usize::MAX { self.term_pos } else { self.pos };
|
|
158
|
+
let slice = &self.input[self.mark_pos..end];
|
|
159
|
+
let prepend_info = if self.prepend_buf.is_empty() {
|
|
160
|
+
String::new()
|
|
161
|
+
} else {
|
|
162
|
+
format!("+{}", self.prepend_buf.len())
|
|
163
|
+
};
|
|
164
|
+
if slice.is_empty() && self.prepend_buf.is_empty() {
|
|
165
|
+
return "[]".to_string();
|
|
166
|
+
}
|
|
167
|
+
let s = std::str::from_utf8(slice).unwrap_or("<binary>");
|
|
168
|
+
if s.len() > 32 {
|
|
169
|
+
format!("[{:?}...]{}", &s[..32], prepend_info)
|
|
170
|
+
} else {
|
|
171
|
+
format!("[{:?}]{}", s, prepend_info)
|
|
172
|
+
}
|
|
173
|
+
}
|
|
174
|
+
{% endif %}
|
|
175
|
+
|
|
176
|
+
#[inline(always)]
|
|
177
|
+
fn eof(&self) -> bool {
|
|
178
|
+
self.pos >= self.input.len()
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
#[inline(always)]
|
|
182
|
+
fn advance(&mut self) {
|
|
183
|
+
if self.pos < self.input.len() {
|
|
184
|
+
if self.input[self.pos] == b'\n' {
|
|
185
|
+
self.line += 1;
|
|
186
|
+
self.column = 1;
|
|
187
|
+
} else {
|
|
188
|
+
self.column += 1;
|
|
189
|
+
}
|
|
190
|
+
self.pos += 1;
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
|
|
194
|
+
#[inline(always)]
|
|
195
|
+
fn mark(&mut self) {
|
|
196
|
+
self.mark_pos = self.pos;
|
|
197
|
+
self.term_pos = usize::MAX; // Sentinel: TERM not yet called
|
|
198
|
+
// Note: prepend_buf is NOT cleared here - it persists until term() consumes it.
|
|
199
|
+
// This allows PREPEND to be called before a nested function that does MARK.
|
|
200
|
+
}
|
|
201
|
+
|
|
202
|
+
{% if uses_set_term %}
|
|
203
|
+
/// Set termination point with optional offset (e.g., -1 for one char before current)
|
|
204
|
+
/// Clamps to valid range [0, input.len()] to prevent underflow/overflow.
|
|
205
|
+
#[inline(always)]
|
|
206
|
+
fn set_term(&mut self, offset: i32) {
|
|
207
|
+
let new_pos = self.pos as i64 + offset as i64;
|
|
208
|
+
self.term_pos = new_pos.clamp(0, self.input.len() as i64) as usize;
|
|
209
|
+
}
|
|
210
|
+
{% endif %}
|
|
211
|
+
|
|
212
|
+
/// Prepend bytes to the accumulation buffer.
|
|
213
|
+
/// Empty slice is naturally a no-op.
|
|
214
|
+
#[inline(always)]
|
|
215
|
+
fn prepend_bytes(&mut self, bytes: &[u8]) {
|
|
216
|
+
self.prepend_buf.extend_from_slice(bytes);
|
|
217
|
+
}
|
|
218
|
+
|
|
219
|
+
/// Get accumulated content from MARK to TERM, including any prepended bytes.
|
|
220
|
+
/// Returns Cow::Borrowed when no prepend (zero-copy), Cow::Owned when prepend used.
|
|
221
|
+
#[inline(always)]
|
|
222
|
+
fn term(&mut self) -> std::borrow::Cow<'a, [u8]> {
|
|
223
|
+
// Use term_pos if set after mark, otherwise use current pos
|
|
224
|
+
let end = if self.term_pos != usize::MAX { self.term_pos } else { self.pos };
|
|
225
|
+
let slice = &self.input[self.mark_pos..end];
|
|
226
|
+
|
|
227
|
+
if self.prepend_buf.is_empty() {
|
|
228
|
+
std::borrow::Cow::Borrowed(slice)
|
|
229
|
+
} else {
|
|
230
|
+
let mut combined = std::mem::take(&mut self.prepend_buf);
|
|
231
|
+
combined.extend_from_slice(slice);
|
|
232
|
+
std::borrow::Cow::Owned(combined)
|
|
233
|
+
}
|
|
234
|
+
}
|
|
235
|
+
|
|
236
|
+
#[inline(always)]
|
|
237
|
+
fn span(&self) -> Range<usize> {
|
|
238
|
+
self.pos..self.pos
|
|
239
|
+
}
|
|
240
|
+
|
|
241
|
+
#[inline(always)]
|
|
242
|
+
fn span_from_mark(&self) -> Range<usize> {
|
|
243
|
+
// Use term_pos if set after mark, otherwise use current pos
|
|
244
|
+
let end = if self.term_pos != usize::MAX { self.term_pos } else { self.pos };
|
|
245
|
+
self.mark_pos..end
|
|
246
|
+
}
|
|
247
|
+
|
|
248
|
+
{% if uses_col %}
|
|
249
|
+
#[inline(always)]
|
|
250
|
+
fn col(&self) -> i32 {
|
|
251
|
+
self.column as i32
|
|
252
|
+
}
|
|
253
|
+
{% endif %}
|
|
254
|
+
|
|
255
|
+
{% if uses_prev %}
|
|
256
|
+
/// Previous byte (0 at start of input).
|
|
257
|
+
#[inline(always)]
|
|
258
|
+
fn prev(&self) -> u8 {
|
|
259
|
+
if self.pos > 0 {
|
|
260
|
+
self.input[self.pos - 1]
|
|
261
|
+
} else {
|
|
262
|
+
0
|
|
263
|
+
}
|
|
264
|
+
}
|
|
265
|
+
{% endif %}
|
|
266
|
+
|
|
267
|
+
{% if uses_letter %}
|
|
268
|
+
/// True for ASCII letters `a-z` and `A-Z`.
#[inline(always)]
fn is_letter(b: u8) -> bool {
    matches!(b, b'a'..=b'z' | b'A'..=b'Z')
}
|
|
272
|
+
{% endif %}
|
|
273
|
+
|
|
274
|
+
{% if uses_label_cont %}
|
|
275
|
+
/// True for bytes that may continue a label: ASCII letters and digits,
/// underscore, and hyphen.
#[inline(always)]
fn is_label_cont(b: u8) -> bool {
    matches!(b, b'_' | b'-') || b.is_ascii_alphanumeric()
}
|
|
279
|
+
{% endif %}
|
|
280
|
+
|
|
281
|
+
{% if uses_digit %}
|
|
282
|
+
/// True for ASCII decimal digits `0-9`.
#[inline(always)]
fn is_digit(b: u8) -> bool {
    matches!(b, b'0'..=b'9')
}
|
|
286
|
+
{% endif %}
|
|
287
|
+
|
|
288
|
+
{% if uses_hex_digit %}
|
|
289
|
+
/// True for ASCII hexadecimal digits: `0-9`, `a-f`, `A-F`.
#[inline(always)]
fn is_hex_digit(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')
}
|
|
293
|
+
{% endif %}
|
|
294
|
+
|
|
295
|
+
{% if uses_ws %}
|
|
296
|
+
/// True for horizontal whitespace: space or tab (newlines are not included).
#[inline(always)]
fn is_ws(b: u8) -> bool {
    matches!(b, b' ' | b'\t')
}
|
|
300
|
+
{% endif %}
|
|
301
|
+
|
|
302
|
+
{% if uses_nl %}
|
|
303
|
+
/// True only for the line-feed byte `\n`.
#[inline(always)]
fn is_nl(b: u8) -> bool {
    matches!(b, b'\n')
}
|
|
307
|
+
{% endif %}
|
|
308
|
+
|
|
309
|
+
{% if uses_unicode %}
|
|
310
|
+
// ========== Unicode Identifier Classes ==========
|
|
311
|
+
// Requires `unicode-xid` crate for full Unicode support.
|
|
312
|
+
// These is_* methods work with the byte-at-a-time matching pattern.
|
|
313
|
+
// For ASCII bytes, they use unicode-xid. For non-ASCII, the check
|
|
314
|
+
// is done at the byte level (first byte of UTF-8 sequence).
|
|
315
|
+
|
|
316
|
+
/// XID_Start: Can start a Unicode identifier.
|
|
317
|
+
/// For ASCII, uses unicode-xid. For non-ASCII first bytes, returns true
|
|
318
|
+
/// (conservative - actual validation happens via match_xid_start for multi-byte).
|
|
319
|
+
#[inline(always)]
|
|
320
|
+
fn is_xid_start(b: u8) -> bool {
|
|
321
|
+
use unicode_xid::UnicodeXID;
|
|
322
|
+
if b < 0x80 {
|
|
323
|
+
(b as char).is_xid_start()
|
|
324
|
+
} else {
|
|
325
|
+
// Non-ASCII: could be start of valid UTF-8 XID_Start sequence
|
|
326
|
+
// Return true for lead bytes (0xC2-0xF4), let advance handle it
|
|
327
|
+
b >= 0xC2 && b <= 0xF4
|
|
328
|
+
}
|
|
329
|
+
}
|
|
330
|
+
|
|
331
|
+
/// XID_Continue: Can continue a Unicode identifier.
|
|
332
|
+
#[inline(always)]
|
|
333
|
+
fn is_xid_cont(b: u8) -> bool {
|
|
334
|
+
use unicode_xid::UnicodeXID;
|
|
335
|
+
if b < 0x80 {
|
|
336
|
+
(b as char).is_xid_continue()
|
|
337
|
+
} else {
|
|
338
|
+
// Non-ASCII: could be valid UTF-8 continuation or lead byte
|
|
339
|
+
b >= 0x80
|
|
340
|
+
}
|
|
341
|
+
}
|
|
342
|
+
|
|
343
|
+
/// XLBL_Start: Same as XID_Start (for label syntax).
///
/// Delegates to `is_xid_start` and returns its result for `b` unchanged.
|
|
344
|
+
#[inline(always)]
|
|
345
|
+
fn is_xlbl_start(b: u8) -> bool {
|
|
346
|
+
Self::is_xid_start(b)
|
|
347
|
+
}
|
|
348
|
+
|
|
349
|
+
/// XLBL_Continue: XID_Continue + hyphen (for kebab-case labels).
|
|
350
|
+
#[inline(always)]
|
|
351
|
+
fn is_xlbl_cont(b: u8) -> bool {
|
|
352
|
+
b == b'-' || Self::is_xid_cont(b)
|
|
353
|
+
}
|
|
354
|
+
{% endif %}
|
|
355
|
+
|
|
356
|
+
{% if max_scan_arity > 0 %}
|
|
357
|
+
// ========== SCAN Methods (SIMD-accelerated via memchr) ==========
|
|
358
|
+
// '\n' is included in scan targets by the generator for line tracking.
|
|
359
|
+
// When '\n' is found, caller handles line/column update. No newlines
|
|
360
|
+
// exist between start and found position, so we just add offset to column.
|
|
361
|
+
|
|
362
|
+
{% if max_scan_arity >= 1 %}
|
|
363
|
+
/// Scan forward to find first occurrence of b1.
|
|
364
|
+
#[inline(always)]
|
|
365
|
+
fn scan_to1(&mut self, b1: u8) -> Option<u8> {
|
|
366
|
+
match memchr::memchr(b1, &self.input[self.pos..]) {
|
|
367
|
+
Some(offset) => {
|
|
368
|
+
self.column += offset as u32;
|
|
369
|
+
self.pos += offset;
|
|
370
|
+
Some(self.input[self.pos])
|
|
371
|
+
}
|
|
372
|
+
None => {
|
|
373
|
+
self.column += (self.input.len() - self.pos) as u32;
|
|
374
|
+
self.pos = self.input.len();
|
|
375
|
+
None
|
|
376
|
+
}
|
|
377
|
+
}
|
|
378
|
+
}
|
|
379
|
+
{% endif %}
|
|
380
|
+
|
|
381
|
+
{% if max_scan_arity >= 2 %}
|
|
382
|
+
/// Scan forward to find first occurrence of b1 or b2.
|
|
383
|
+
#[inline(always)]
|
|
384
|
+
fn scan_to2(&mut self, b1: u8, b2: u8) -> Option<u8> {
|
|
385
|
+
match memchr::memchr2(b1, b2, &self.input[self.pos..]) {
|
|
386
|
+
Some(offset) => {
|
|
387
|
+
self.column += offset as u32;
|
|
388
|
+
self.pos += offset;
|
|
389
|
+
Some(self.input[self.pos])
|
|
390
|
+
}
|
|
391
|
+
None => {
|
|
392
|
+
self.column += (self.input.len() - self.pos) as u32;
|
|
393
|
+
self.pos = self.input.len();
|
|
394
|
+
None
|
|
395
|
+
}
|
|
396
|
+
}
|
|
397
|
+
}
|
|
398
|
+
{% endif %}
|
|
399
|
+
|
|
400
|
+
{% if max_scan_arity >= 3 %}
|
|
401
|
+
/// Scan forward to find first occurrence of b1, b2, or b3.
|
|
402
|
+
#[inline(always)]
|
|
403
|
+
fn scan_to3(&mut self, b1: u8, b2: u8, b3: u8) -> Option<u8> {
|
|
404
|
+
match memchr::memchr3(b1, b2, b3, &self.input[self.pos..]) {
|
|
405
|
+
Some(offset) => {
|
|
406
|
+
self.column += offset as u32;
|
|
407
|
+
self.pos += offset;
|
|
408
|
+
Some(self.input[self.pos])
|
|
409
|
+
}
|
|
410
|
+
None => {
|
|
411
|
+
self.column += (self.input.len() - self.pos) as u32;
|
|
412
|
+
self.pos = self.input.len();
|
|
413
|
+
None
|
|
414
|
+
}
|
|
415
|
+
}
|
|
416
|
+
}
|
|
417
|
+
{% endif %}
|
|
418
|
+
|
|
419
|
+
{% if max_scan_arity >= 4 %}
|
|
420
|
+
/// Scan forward to find first occurrence of b1..b4 (chained memchr).
|
|
421
|
+
/// Limits second search to range of first hit to avoid O(n²) behavior.
|
|
422
|
+
#[inline(always)]
|
|
423
|
+
fn scan_to4(&mut self, b1: u8, b2: u8, b3: u8, b4: u8) -> Option<u8> {
|
|
424
|
+
let haystack = &self.input[self.pos..];
|
|
425
|
+
let p1 = memchr::memchr3(b1, b2, b3, haystack);
|
|
426
|
+
let p2 = match p1 {
|
|
427
|
+
Some(limit) => memchr::memchr(b4, &haystack[..limit]),
|
|
428
|
+
None => memchr::memchr(b4, haystack),
|
|
429
|
+
};
|
|
430
|
+
let offset = match (p1, p2) {
|
|
431
|
+
(Some(x), Some(y)) => Some(x.min(y)),
|
|
432
|
+
(Some(x), None) | (None, Some(x)) => Some(x),
|
|
433
|
+
(None, None) => None,
|
|
434
|
+
};
|
|
435
|
+
match offset {
|
|
436
|
+
Some(off) => {
|
|
437
|
+
self.column += off as u32;
|
|
438
|
+
self.pos += off;
|
|
439
|
+
Some(self.input[self.pos])
|
|
440
|
+
}
|
|
441
|
+
None => {
|
|
442
|
+
self.column += (self.input.len() - self.pos) as u32;
|
|
443
|
+
self.pos = self.input.len();
|
|
444
|
+
None
|
|
445
|
+
}
|
|
446
|
+
}
|
|
447
|
+
}
|
|
448
|
+
{% endif %}
|
|
449
|
+
|
|
450
|
+
{% if max_scan_arity >= 5 %}
|
|
451
|
+
/// Scan forward to find first occurrence of b1..b5 (chained memchr).
|
|
452
|
+
/// Limits second search to range of first hit to avoid O(n²) behavior.
|
|
453
|
+
#[inline(always)]
|
|
454
|
+
fn scan_to5(&mut self, b1: u8, b2: u8, b3: u8, b4: u8, b5: u8) -> Option<u8> {
|
|
455
|
+
let haystack = &self.input[self.pos..];
|
|
456
|
+
let p1 = memchr::memchr3(b1, b2, b3, haystack);
|
|
457
|
+
let p2 = match p1 {
|
|
458
|
+
Some(limit) => memchr::memchr2(b4, b5, &haystack[..limit]),
|
|
459
|
+
None => memchr::memchr2(b4, b5, haystack),
|
|
460
|
+
};
|
|
461
|
+
let offset = match (p1, p2) {
|
|
462
|
+
(Some(x), Some(y)) => Some(x.min(y)),
|
|
463
|
+
(Some(x), None) | (None, Some(x)) => Some(x),
|
|
464
|
+
(None, None) => None,
|
|
465
|
+
};
|
|
466
|
+
match offset {
|
|
467
|
+
Some(off) => {
|
|
468
|
+
self.column += off as u32;
|
|
469
|
+
self.pos += off;
|
|
470
|
+
Some(self.input[self.pos])
|
|
471
|
+
}
|
|
472
|
+
None => {
|
|
473
|
+
self.column += (self.input.len() - self.pos) as u32;
|
|
474
|
+
self.pos = self.input.len();
|
|
475
|
+
None
|
|
476
|
+
}
|
|
477
|
+
}
|
|
478
|
+
}
|
|
479
|
+
{% endif %}
|
|
480
|
+
|
|
481
|
+
{% if max_scan_arity >= 6 %}
|
|
482
|
+
/// Scan forward to find first occurrence of b1..b6 (chained memchr).
|
|
483
|
+
/// Limits second search to range of first hit to avoid O(n²) behavior.
|
|
484
|
+
#[inline(always)]
|
|
485
|
+
fn scan_to6(&mut self, b1: u8, b2: u8, b3: u8, b4: u8, b5: u8, b6: u8) -> Option<u8> {
|
|
486
|
+
let haystack = &self.input[self.pos..];
|
|
487
|
+
let p1 = memchr::memchr3(b1, b2, b3, haystack);
|
|
488
|
+
let p2 = match p1 {
|
|
489
|
+
Some(limit) => memchr::memchr3(b4, b5, b6, &haystack[..limit]),
|
|
490
|
+
None => memchr::memchr3(b4, b5, b6, haystack),
|
|
491
|
+
};
|
|
492
|
+
let offset = match (p1, p2) {
|
|
493
|
+
(Some(x), Some(y)) => Some(x.min(y)),
|
|
494
|
+
(Some(x), None) | (None, Some(x)) => Some(x),
|
|
495
|
+
(None, None) => None,
|
|
496
|
+
};
|
|
497
|
+
match offset {
|
|
498
|
+
Some(off) => {
|
|
499
|
+
self.column += off as u32;
|
|
500
|
+
self.pos += off;
|
|
501
|
+
Some(self.input[self.pos])
|
|
502
|
+
}
|
|
503
|
+
None => {
|
|
504
|
+
self.column += (self.input.len() - self.pos) as u32;
|
|
505
|
+
self.pos = self.input.len();
|
|
506
|
+
None
|
|
507
|
+
}
|
|
508
|
+
}
|
|
509
|
+
}
|
|
510
|
+
{% endif %}
|
|
511
|
+
{% endif %}
|
|
512
|
+
|
|
513
|
+
{% comment %} Generate keyword lookup methods {% endcomment %}
|
|
514
|
+
{% for kw in keywords %}
|
|
515
|
+
// ========== Keyword Lookup: {{ kw.name }} ==========
|
|
516
|
+
|
|
517
|
+
/// Look up accumulated content in {{ kw.name }} keywords.
///
/// Takes the content via `term()` and searches the keyword map for it.
/// Returns true if a keyword matched (its event was emitted with the
/// content and the span from MARK), false otherwise.
fn lookup_{{ kw.name }}<F>(&mut self, on_event: &mut F) -> bool
where
    F: FnMut(Event<'a>),
{
    let content = self.term();
    let id = match {{ kw.const_name }}.get(content.as_ref()) {
        Some(&id) => id,
        None => return false,
    };
    let span = self.span_from_mark();
    match id {
{% for m in kw.mappings %}
        {{ forloop.index0 }} => on_event(Event::{{ m.event_type }} { content, span }),
{% endfor %}
        _ => unreachable!("keyword map contains only valid ids"),
    }
    true
}
|
|
538
|
+
|
|
539
|
+
/// Emit the matching {{ kw.name }} keyword event, or run the fallback
/// function (when the grammar defines one) if nothing matched.
fn lookup_{{ kw.name }}_or_fallback<F>(&mut self, on_event: &mut F)
where
    F: FnMut(Event<'a>),
{
    let matched = self.lookup_{{ kw.name }}(on_event);
    if matched {
        return;
    }
{% if kw.fallback_func %}
{% if kw.fallback_args %}
    self.parse_{{ kw.fallback_func }}({{ kw.fallback_args }}, on_event);
{% else %}
    self.parse_{{ kw.fallback_func }}(on_event);
{% endif %}
{% else %}
    // No fallback - keyword not found is a no-op
{% endif %}
}
|
|
556
|
+
{% endfor %}
|
|
557
|
+
|
|
558
|
+
// ========== Generated Parse Functions ==========
|
|
559
|
+
|
|
560
|
+
{% for func in functions %}
|
|
561
|
+
{% comment %} Determine return type info {% endcomment %}
|
|
562
|
+
{% assign return_type_info = nil %}
|
|
563
|
+
{% for t in types %}
|
|
564
|
+
{% if t.name == func.return_type %}
|
|
565
|
+
{% assign return_type_info = t %}
|
|
566
|
+
{% endif %}
|
|
567
|
+
{% endfor %}
|
|
568
|
+
|
|
569
|
+
/// Parse {{ func.name }}{% if func.return_type %} -> {{ func.return_type }}{% endif %}
|
|
570
|
+
{% if func.params.size > 0 %}
|
|
571
|
+
fn parse_{{ func.name }}<F>(&mut self, {% for param in func.params %}{{ param }}: {% if func.param_types[param] == "byte" %}u8{% elsif func.param_types[param] == "bytes" %}&'static [u8]{% else %}i32{% endif %}, {% endfor %}on_event: &mut F){% if return_type_info.kind == "internal" %} -> i32{% endif %}
|
|
572
|
+
{% else %}
|
|
573
|
+
fn parse_{{ func.name }}<F>(&mut self, on_event: &mut F){% if return_type_info.kind == "internal" %} -> i32{% endif %}
|
|
574
|
+
{% endif %}
|
|
575
|
+
where
|
|
576
|
+
F: FnMut(Event<'a>),
|
|
577
|
+
{
|
|
578
|
+
{% if trace %}eprintln!("TRACE: L{{ func.lineno }} ENTER {{ func.name }} | byte={} pos={}", Self::trace_byte(self.peek()), self.pos);{% endif %}
|
|
579
|
+
|
|
580
|
+
{% comment %} BRACKET types: emit Start on entry {% endcomment %}
|
|
581
|
+
{% if return_type_info.kind == "bracket" %}
|
|
582
|
+
let start_span = self.span();
|
|
583
|
+
on_event(Event::{{ func.return_type }}Start { span: start_span.clone() });
|
|
584
|
+
{% endif %}
|
|
585
|
+
|
|
586
|
+
{% comment %} CONTENT types: MARK on entry {% endcomment %}
|
|
587
|
+
{% if return_type_info.kind == "content" %}
|
|
588
|
+
self.mark();
|
|
589
|
+
{% endif %}
|
|
590
|
+
|
|
591
|
+
{% comment %} Local variables - use initial value from entry_actions if available {% endcomment %}
|
|
592
|
+
{% comment %} Only add 'mut' if the variable is reassigned in the function body {% endcomment %}
|
|
593
|
+
{% for local in func.locals %}
|
|
594
|
+
{% assign init_val = func.local_init_values[local[0]] | default: "0" %}
|
|
595
|
+
{% if func.mutable_locals contains local[0] %}let mut {{ local[0] }}: i32 = {{ init_val }};{% else %}let {{ local[0] }}: i32 = {{ init_val }};{% endif %}
|
|
596
|
+
{% endfor %}
|
|
597
|
+
|
|
598
|
+
{% comment %} Entry actions (variable initialization on function entry) {% endcomment %}
|
|
599
|
+
{% comment %} Skip MARK for CONTENT types since auto-MARK already handles it {% endcomment %}
|
|
600
|
+
{% for cmd in func.entry_actions %}
|
|
601
|
+
{% unless cmd.type == "mark" and return_type_info.kind == "content" %}
|
|
602
|
+
{% include 'command' cmd: cmd, func: func, return_type_info: return_type_info, states: func.states %}
|
|
603
|
+
{% endunless %}
|
|
604
|
+
{% endfor %}
|
|
605
|
+
|
|
606
|
+
{% comment %} State machine {% endcomment %}
|
|
607
|
+
{% if func.states.size == 0 %}
|
|
608
|
+
{% comment %} No states - immediate return (stateless function) {% endcomment %}
|
|
609
|
+
{% if return_type_info.kind == "content" %}
|
|
610
|
+
on_event(Event::{{ func.return_type }} { content: self.term(), span: self.span_from_mark() });
|
|
611
|
+
{% elsif return_type_info.kind == "bracket" %}
|
|
612
|
+
on_event(Event::{{ func.return_type }}End { span: self.span() });
|
|
613
|
+
{% elsif return_type_info.kind == "internal" %}
|
|
614
|
+
return 0;
|
|
615
|
+
{% endif %}
|
|
616
|
+
{% elsif func.states.size == 1 %}
|
|
617
|
+
{% comment %} Single state - no enum needed {% endcomment %}
|
|
618
|
+
{% assign state = func.states.first %}
|
|
619
|
+
loop {
|
|
620
|
+
{% if state.scannable %}
|
|
621
|
+
{% comment %} SCAN-first optimization {% endcomment %}
|
|
622
|
+
{% assign scan_count = state.scan_chars.size %}
|
|
623
|
+
{% if scan_count == 1 %}
|
|
624
|
+
match self.scan_to1({{ state.scan_chars[0] | escape_rust_char }}) {
|
|
625
|
+
{% elsif scan_count == 2 %}
|
|
626
|
+
match self.scan_to2({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}) {
|
|
627
|
+
{% elsif scan_count == 3 %}
|
|
628
|
+
match self.scan_to3({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}, {{ state.scan_chars[2] | escape_rust_char }}) {
|
|
629
|
+
{% elsif scan_count == 4 %}
|
|
630
|
+
match self.scan_to4({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}, {{ state.scan_chars[2] | escape_rust_char }}, {{ state.scan_chars[3] | escape_rust_char }}) {
|
|
631
|
+
{% elsif scan_count == 5 %}
|
|
632
|
+
match self.scan_to5({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}, {{ state.scan_chars[2] | escape_rust_char }}, {{ state.scan_chars[3] | escape_rust_char }}, {{ state.scan_chars[4] | escape_rust_char }}) {
|
|
633
|
+
{% elsif scan_count == 6 %}
|
|
634
|
+
match self.scan_to6({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}, {{ state.scan_chars[2] | escape_rust_char }}, {{ state.scan_chars[3] | escape_rust_char }}, {{ state.scan_chars[4] | escape_rust_char }}, {{ state.scan_chars[5] | escape_rust_char }}) {
|
|
635
|
+
{% endif %}
|
|
636
|
+
{% for kase in state.cases %}
|
|
637
|
+
{% unless kase.is_default %}
|
|
638
|
+
{% if kase.is_conditional %}
|
|
639
|
+
_ if {{ kase.condition | rust_expr }} => {
|
|
640
|
+
{% elsif kase.param_ref %}
|
|
641
|
+
{% comment %} Parameter reference: match against param value {% endcomment %}
|
|
642
|
+
Some(b) if b == {{ kase.param_ref }} => {
|
|
643
|
+
{% elsif kase.special_class and kase.chars.size > 0 %}
|
|
644
|
+
{% comment %} Combined: class + literal chars {% endcomment %}
|
|
645
|
+
Some(b) if Self::is_{{ kase.special_class }}(b){% for ch in kase.chars %} || b == {{ ch | escape_rust_char }}{% endfor %} => {
|
|
646
|
+
{% elsif kase.special_class %}
|
|
647
|
+
Some(b) if Self::is_{{ kase.special_class }}(b) => {
|
|
648
|
+
{% elsif kase.chars.size == 1 %}
|
|
649
|
+
Some({{ kase.chars[0] | escape_rust_char }}) => {
|
|
650
|
+
{% elsif kase.chars.size > 1 %}
|
|
651
|
+
Some({% for ch in kase.chars %}{{ ch | escape_rust_char }}{% unless forloop.last %} | {% endunless %}{% endfor %}) => {
|
|
652
|
+
{% else %}
|
|
653
|
+
Some(_) => {
|
|
654
|
+
{% endif %}
|
|
655
|
+
{% if trace %}eprintln!("TRACE: L{{ kase.lineno }} {{ func.name }}:{{ state.name }}{% if kase.substate %}.{{ kase.substate }}{% endif %} | byte={} term={} pos={}", Self::trace_byte(self.peek()), self.trace_content(), self.pos);{% endif %}
|
|
656
|
+
{% for cmd in kase.commands %}
|
|
657
|
+
{% include 'command' cmd: cmd, func: func, return_type_info: return_type_info %}
|
|
658
|
+
{% endfor %}
|
|
659
|
+
}
|
|
660
|
+
{% endunless %}
|
|
661
|
+
{% endfor %}
|
|
662
|
+
{% if state.newline_injected %}
|
|
663
|
+
{% comment %} Injected newline: update line/col and continue scanning {% endcomment %}
|
|
664
|
+
Some(b'\n') => {
|
|
665
|
+
self.advance();
|
|
666
|
+
}
|
|
667
|
+
{% endif %}
|
|
668
|
+
None => {
|
|
669
|
+
{% if trace %}eprintln!("TRACE: L{{ state.lineno }} {{ func.name }}:{{ state.name }} EOF | term={} pos={}", self.trace_content(), self.pos);{% endif %}
|
|
670
|
+
{% comment %} EOF handling - check for explicit |eof handler first {% endcomment %}
|
|
671
|
+
{% if state.eof_handler.size > 0 %}
|
|
672
|
+
{% for cmd in state.eof_handler %}
|
|
673
|
+
{% include 'command' cmd: cmd, func: func, return_type_info: return_type_info %}
|
|
674
|
+
{% endfor %}
|
|
675
|
+
{% elsif func.eof_handler.size > 0 %}
|
|
676
|
+
{% for cmd in func.eof_handler %}
|
|
677
|
+
{% include 'command' cmd: cmd, func: func, return_type_info: return_type_info %}
|
|
678
|
+
{% endfor %}
|
|
679
|
+
{% else %}
|
|
680
|
+
{% if return_type_info.kind == "content" %}
|
|
681
|
+
on_event(Event::{{ func.return_type }} { content: self.term(), span: self.span_from_mark() });
|
|
682
|
+
{% endif %}
|
|
683
|
+
{% if func.expects_char %}
|
|
684
|
+
on_event(Event::Error { code: ParseErrorCode::Unclosed{{ func.return_type }}, span: self.span() });
|
|
685
|
+
{% elsif return_type_info.kind == "bracket" %}
|
|
686
|
+
on_event(Event::{{ func.return_type }}End { span: self.span() });
|
|
687
|
+
{% endif %}
|
|
688
|
+
{% if return_type_info.kind == "internal" %}return 0;{% else %}return;{% endif %}
|
|
689
|
+
{% endif %}
|
|
690
|
+
}
|
|
691
|
+
_ => unreachable!("scan_to only returns target chars"),
|
|
692
|
+
}
|
|
693
|
+
{% else %}
|
|
694
|
+
{% comment %} Non-scannable: check EOF first, then match {% endcomment %}
|
|
695
|
+
if self.eof() {
|
|
696
|
+
{% if trace %}eprintln!("TRACE: L{{ state.lineno }} {{ func.name }}:{{ state.name }} EOF | term={} pos={}", self.trace_content(), self.pos);{% endif %}
|
|
697
|
+
{% if state.eof_handler.size > 0 %}
|
|
698
|
+
{% comment %} Fix #13: Explicit |eof handler - use its commands {% endcomment %}
|
|
699
|
+
{% for cmd in state.eof_handler %}
|
|
700
|
+
{% include 'command' cmd: cmd, func: func, return_type_info: return_type_info %}
|
|
701
|
+
{% endfor %}
|
|
702
|
+
{% elsif func.eof_handler.size > 0 %}
|
|
703
|
+
{% comment %} Function-level |eof handler {% endcomment %}
|
|
704
|
+
{% for cmd in func.eof_handler %}
|
|
705
|
+
{% include 'command' cmd: cmd, func: func, return_type_info: return_type_info %}
|
|
706
|
+
{% endfor %}
|
|
707
|
+
{% else %}
|
|
708
|
+
{% comment %} Default EOF behavior based on return type {% endcomment %}
|
|
709
|
+
{% if return_type_info.kind == "content" %}
|
|
710
|
+
on_event(Event::{{ func.return_type }} { content: self.term(), span: self.span_from_mark() });
|
|
711
|
+
{% endif %}
|
|
712
|
+
{% if func.expects_char %}
|
|
713
|
+
on_event(Event::Error { code: ParseErrorCode::Unclosed{{ func.return_type }}, span: self.span() });
|
|
714
|
+
{% elsif return_type_info.kind == "bracket" %}
|
|
715
|
+
on_event(Event::{{ func.return_type }}End { span: self.span() });
|
|
716
|
+
{% endif %}
|
|
717
|
+
{% if return_type_info.kind == "internal" %}return 0;{% else %}return;{% endif %}
|
|
718
|
+
{% endif %}
|
|
719
|
+
}
|
|
720
|
+
{% comment %} Optimization: if only one case and it's default, skip the match {% endcomment %}
|
|
721
|
+
{% if state.cases.size == 1 and state.cases[0].is_default %}
|
|
722
|
+
{% assign kase = state.cases[0] %}
|
|
723
|
+
{% if trace %}eprintln!("TRACE: L{{ kase.lineno }} {{ func.name }}:{{ state.name }}{% if kase.substate %}.{{ kase.substate }}{% endif %} | byte={} term={} pos={}", Self::trace_byte(self.peek()), self.trace_content(), self.pos);{% endif %}
|
|
724
|
+
{% for cmd in kase.commands %}
|
|
725
|
+
{% include 'command' cmd: cmd, func: func, return_type_info: return_type_info %}
|
|
726
|
+
{% endfor %}
|
|
727
|
+
{% else %}
|
|
728
|
+
match self.peek() {
|
|
729
|
+
{% for kase in state.cases %}
|
|
730
|
+
{% if kase.is_default %}
|
|
731
|
+
_ => {
|
|
732
|
+
{% elsif kase.is_conditional %}
|
|
733
|
+
_ if {{ kase.condition | rust_expr }} => {
|
|
734
|
+
{% elsif kase.param_ref %}
|
|
735
|
+
Some(b) if b == {{ kase.param_ref }} => {
|
|
736
|
+
{% elsif kase.special_class and kase.chars.size > 0 %}
|
|
737
|
+
Some(b) if Self::is_{{ kase.special_class }}(b){% for ch in kase.chars %} || b == {{ ch | escape_rust_char }}{% endfor %} => {
|
|
738
|
+
{% elsif kase.special_class %}
|
|
739
|
+
Some(b) if Self::is_{{ kase.special_class }}(b) => {
|
|
740
|
+
{% elsif kase.chars.size == 1 %}
|
|
741
|
+
Some({{ kase.chars[0] | escape_rust_char }}) => {
|
|
742
|
+
{% elsif kase.chars.size > 1 %}
|
|
743
|
+
Some({% for ch in kase.chars %}{{ ch | escape_rust_char }}{% unless forloop.last %} | {% endunless %}{% endfor %}) => {
|
|
744
|
+
{% else %}
|
|
745
|
+
Some(_) => {
|
|
746
|
+
{% endif %}
|
|
747
|
+
{% if trace %}eprintln!("TRACE: L{{ kase.lineno }} {{ func.name }}:{{ state.name }}{% if kase.substate %}.{{ kase.substate }}{% endif %} | byte={} term={} pos={}", Self::trace_byte(self.peek()), self.trace_content(), self.pos);{% endif %}
|
|
748
|
+
{% for cmd in kase.commands %}
|
|
749
|
+
{% include 'command' cmd: cmd, func: func, return_type_info: return_type_info %}
|
|
750
|
+
{% endfor %}
|
|
751
|
+
}
|
|
752
|
+
{% endfor %}
|
|
753
|
+
{% unless state.has_default %}
|
|
754
|
+
{% comment %} Add default arm if state has no explicit default case {% endcomment %}
|
|
755
|
+
_ => {
|
|
756
|
+
{% if trace %}eprintln!("TRACE: {{ func.name }}:{{ state.name }} UNHANDLED | byte={} pos={}", Self::trace_byte(self.peek()), self.pos);{% endif %}
|
|
757
|
+
{% if return_type_info.kind == "internal" %}return 0;{% else %}return;{% endif %}
|
|
758
|
+
}
|
|
759
|
+
{% endunless %}
|
|
760
|
+
}
|
|
761
|
+
{% endif %}
|
|
762
|
+
{% endif %}
|
|
763
|
+
}
|
|
764
|
+
{% else %}
|
|
765
|
+
{% comment %} Multiple states - use enum {% endcomment %}
|
|
766
|
+
#[derive(Clone, Copy)]
|
|
767
|
+
enum State { {% for state in func.states %}{{ state.name | pascalcase }}, {% endfor %} }
|
|
768
|
+
let mut state = State::{{ func.states.first.name | pascalcase }};
|
|
769
|
+
|
|
770
|
+
loop {
|
|
771
|
+
match state {
|
|
772
|
+
{% for state in func.states %}
|
|
773
|
+
State::{{ state.name | pascalcase }} => {
|
|
774
|
+
{% if state.scannable %}
|
|
775
|
+
{% assign scan_count = state.scan_chars.size %}
|
|
776
|
+
{% if scan_count == 1 %}
|
|
777
|
+
match self.scan_to1({{ state.scan_chars[0] | escape_rust_char }}) {
|
|
778
|
+
{% elsif scan_count == 2 %}
|
|
779
|
+
match self.scan_to2({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}) {
|
|
780
|
+
{% elsif scan_count == 3 %}
|
|
781
|
+
match self.scan_to3({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}, {{ state.scan_chars[2] | escape_rust_char }}) {
|
|
782
|
+
{% elsif scan_count == 4 %}
|
|
783
|
+
match self.scan_to4({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}, {{ state.scan_chars[2] | escape_rust_char }}, {{ state.scan_chars[3] | escape_rust_char }}) {
|
|
784
|
+
{% elsif scan_count == 5 %}
|
|
785
|
+
match self.scan_to5({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}, {{ state.scan_chars[2] | escape_rust_char }}, {{ state.scan_chars[3] | escape_rust_char }}, {{ state.scan_chars[4] | escape_rust_char }}) {
|
|
786
|
+
{% elsif scan_count == 6 %}
|
|
787
|
+
match self.scan_to6({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}, {{ state.scan_chars[2] | escape_rust_char }}, {{ state.scan_chars[3] | escape_rust_char }}, {{ state.scan_chars[4] | escape_rust_char }}, {{ state.scan_chars[5] | escape_rust_char }}) {
|
|
788
|
+
{% endif %}
|
|
789
|
+
{% for kase in state.cases %}
|
|
790
|
+
{% unless kase.is_default %}
|
|
791
|
+
{% if kase.is_conditional %}
|
|
792
|
+
_ if {{ kase.condition | rust_expr }} => {
|
|
793
|
+
{% elsif kase.param_ref %}
|
|
794
|
+
Some(b) if b == {{ kase.param_ref }} => {
|
|
795
|
+
{% elsif kase.special_class and kase.chars.size > 0 %}
|
|
796
|
+
Some(b) if Self::is_{{ kase.special_class }}(b){% for ch in kase.chars %} || b == {{ ch | escape_rust_char }}{% endfor %} => {
|
|
797
|
+
{% elsif kase.special_class %}
|
|
798
|
+
Some(b) if Self::is_{{ kase.special_class }}(b) => {
|
|
799
|
+
{% elsif kase.chars.size == 1 %}
|
|
800
|
+
Some({{ kase.chars[0] | escape_rust_char }}) => {
|
|
801
|
+
{% elsif kase.chars.size > 1 %}
|
|
802
|
+
Some({% for ch in kase.chars %}{{ ch | escape_rust_char }}{% unless forloop.last %} | {% endunless %}{% endfor %}) => {
|
|
803
|
+
{% else %}
|
|
804
|
+
Some(_) => {
|
|
805
|
+
{% endif %}
|
|
806
|
+
{% if trace %}eprintln!("TRACE: L{{ kase.lineno }} {{ func.name }}:{{ state.name }}{% if kase.substate %}.{{ kase.substate }}{% endif %} | byte={} term={} pos={}", Self::trace_byte(self.peek()), self.trace_content(), self.pos);{% endif %}
|
|
807
|
+
{% for cmd in kase.commands %}
|
|
808
|
+
{% include 'command' cmd: cmd, func: func, return_type_info: return_type_info, states: func.states %}
|
|
809
|
+
{% endfor %}
|
|
810
|
+
}
|
|
811
|
+
{% endunless %}
|
|
812
|
+
{% endfor %}
|
|
813
|
+
{% if state.newline_injected %}
|
|
814
|
+
{% comment %} Injected newline: update line/col and continue scanning {% endcomment %}
|
|
815
|
+
Some(b'\n') => {
|
|
816
|
+
self.advance();
|
|
817
|
+
}
|
|
818
|
+
{% endif %}
|
|
819
|
+
None => {
|
|
820
|
+
{% if trace %}eprintln!("TRACE: L{{ state.lineno }} {{ func.name }}:{{ state.name }} EOF | term={} pos={}", self.trace_content(), self.pos);{% endif %}
|
|
821
|
+
{% comment %} EOF handling - check for explicit |eof handler first {% endcomment %}
|
|
822
|
+
{% if state.eof_handler.size > 0 %}
|
|
823
|
+
{% for cmd in state.eof_handler %}
|
|
824
|
+
{% include 'command' cmd: cmd, func: func, return_type_info: return_type_info, states: func.states %}
|
|
825
|
+
{% endfor %}
|
|
826
|
+
{% elsif func.eof_handler.size > 0 %}
|
|
827
|
+
{% for cmd in func.eof_handler %}
|
|
828
|
+
{% include 'command' cmd: cmd, func: func, return_type_info: return_type_info, states: func.states %}
|
|
829
|
+
{% endfor %}
|
|
830
|
+
{% else %}
|
|
831
|
+
{% if return_type_info.kind == "content" %}
|
|
832
|
+
on_event(Event::{{ func.return_type }} { content: self.term(), span: self.span_from_mark() });
|
|
833
|
+
{% endif %}
|
|
834
|
+
{% if func.expects_char %}
|
|
835
|
+
on_event(Event::Error { code: ParseErrorCode::Unclosed{{ func.return_type }}, span: self.span() });
|
|
836
|
+
{% elsif return_type_info.kind == "bracket" %}
|
|
837
|
+
on_event(Event::{{ func.return_type }}End { span: self.span() });
|
|
838
|
+
{% endif %}
|
|
839
|
+
{% if return_type_info.kind == "internal" %}return 0;{% else %}return;{% endif %}
|
|
840
|
+
{% endif %}
|
|
841
|
+
}
|
|
842
|
+
_ => unreachable!("scan_to only returns target chars"),
|
|
843
|
+
}
|
|
844
|
+
{% else %}
|
|
845
|
+
{% if state.is_unconditional %}
|
|
846
|
+
{% comment %} Unconditional state: execute commands immediately without byte match {% endcomment %}
|
|
847
|
+
{% assign kase = state.cases.first %}
|
|
848
|
+
{% if trace %}eprintln!("TRACE: L{{ kase.lineno }} {{ func.name }}:{{ state.name }}{% if kase.substate %}.{{ kase.substate }}{% endif %} UNCONDITIONAL | term={} pos={}", self.trace_content(), self.pos);{% endif %}
|
|
849
|
+
{% for cmd in kase.commands %}
|
|
850
|
+
{% include 'command' cmd: cmd, func: func, return_type_info: return_type_info, states: func.states %}
|
|
851
|
+
{% endfor %}
|
|
852
|
+
{% else %}
|
|
853
|
+
{% comment %} Non-scannable: check EOF first {% endcomment %}
|
|
854
|
+
if self.eof() {
|
|
855
|
+
{% if trace %}eprintln!("TRACE: L{{ state.lineno }} {{ func.name }}:{{ state.name }} EOF | term={} pos={}", self.trace_content(), self.pos);{% endif %}
|
|
856
|
+
{% if state.eof_handler.size > 0 %}
|
|
857
|
+
{% for cmd in state.eof_handler %}
|
|
858
|
+
{% include 'command' cmd: cmd, func: func, return_type_info: return_type_info, states: func.states %}
|
|
859
|
+
{% endfor %}
|
|
860
|
+
{% elsif func.eof_handler.size > 0 %}
|
|
861
|
+
{% for cmd in func.eof_handler %}
|
|
862
|
+
{% include 'command' cmd: cmd, func: func, return_type_info: return_type_info, states: func.states %}
|
|
863
|
+
{% endfor %}
|
|
864
|
+
{% else %}
|
|
865
|
+
{% if return_type_info.kind == "content" %}
|
|
866
|
+
on_event(Event::{{ func.return_type }} { content: self.term(), span: self.span_from_mark() });
|
|
867
|
+
{% endif %}
|
|
868
|
+
{% if func.expects_char %}
|
|
869
|
+
on_event(Event::Error { code: ParseErrorCode::Unclosed{{ func.return_type }}, span: self.span() });
|
|
870
|
+
{% elsif return_type_info.kind == "bracket" %}
|
|
871
|
+
on_event(Event::{{ func.return_type }}End { span: self.span() });
|
|
872
|
+
{% endif %}
|
|
873
|
+
{% if return_type_info.kind == "internal" %}return 0;{% else %}return;{% endif %}
|
|
874
|
+
{% endif %}
|
|
875
|
+
}
|
|
876
|
+
{% comment %} Optimization: if only one case and it's default, skip the match {% endcomment %}
|
|
877
|
+
{% if state.cases.size == 1 and state.cases[0].is_default %}
|
|
878
|
+
{% assign kase = state.cases[0] %}
|
|
879
|
+
{% if trace %}eprintln!("TRACE: L{{ kase.lineno }} {{ func.name }}:{{ state.name }}{% if kase.substate %}.{{ kase.substate }}{% endif %} | byte={} term={} pos={}", Self::trace_byte(self.peek()), self.trace_content(), self.pos);{% endif %}
|
|
880
|
+
{% for cmd in kase.commands %}
|
|
881
|
+
{% include 'command' cmd: cmd, func: func, return_type_info: return_type_info, states: func.states %}
|
|
882
|
+
{% endfor %}
|
|
883
|
+
{% else %}
|
|
884
|
+
match self.peek() {
|
|
885
|
+
{% for kase in state.cases %}
|
|
886
|
+
{% if kase.is_default %}
|
|
887
|
+
_ => {
|
|
888
|
+
{% elsif kase.is_conditional %}
|
|
889
|
+
_ if {{ kase.condition | rust_expr }} => {
|
|
890
|
+
{% elsif kase.param_ref %}
|
|
891
|
+
Some(b) if b == {{ kase.param_ref }} => {
|
|
892
|
+
{% elsif kase.special_class and kase.chars.size > 0 %}
|
|
893
|
+
Some(b) if Self::is_{{ kase.special_class }}(b){% for ch in kase.chars %} || b == {{ ch | escape_rust_char }}{% endfor %} => {
|
|
894
|
+
{% elsif kase.special_class %}
|
|
895
|
+
Some(b) if Self::is_{{ kase.special_class }}(b) => {
|
|
896
|
+
{% elsif kase.chars.size == 1 %}
|
|
897
|
+
Some({{ kase.chars[0] | escape_rust_char }}) => {
|
|
898
|
+
{% elsif kase.chars.size > 1 %}
|
|
899
|
+
Some({% for ch in kase.chars %}{{ ch | escape_rust_char }}{% unless forloop.last %} | {% endunless %}{% endfor %}) => {
|
|
900
|
+
{% else %}
|
|
901
|
+
Some(_) => {
|
|
902
|
+
{% endif %}
|
|
903
|
+
{% if trace %}eprintln!("TRACE: L{{ kase.lineno }} {{ func.name }}:{{ state.name }}{% if kase.substate %}.{{ kase.substate }}{% endif %} | byte={} term={} pos={}", Self::trace_byte(self.peek()), self.trace_content(), self.pos);{% endif %}
|
|
904
|
+
{% for cmd in kase.commands %}
|
|
905
|
+
{% include 'command' cmd: cmd, func: func, return_type_info: return_type_info, states: func.states %}
|
|
906
|
+
{% endfor %}
|
|
907
|
+
}
|
|
908
|
+
{% endfor %}
|
|
909
|
+
{% unless state.has_default %}
|
|
910
|
+
{% comment %} Add default arm if state has no explicit default case {% endcomment %}
|
|
911
|
+
_ => {
|
|
912
|
+
{% if trace %}eprintln!("TRACE: {{ func.name }}:{{ state.name }} UNHANDLED | byte={} pos={}", Self::trace_byte(self.peek()), self.pos);{% endif %}
|
|
913
|
+
{% if return_type_info.kind == "internal" %}return 0;{% else %}return;{% endif %}
|
|
914
|
+
}
|
|
915
|
+
{% endunless %}
|
|
916
|
+
}
|
|
917
|
+
{% endif %}
|
|
918
|
+
{% endif %}
|
|
919
|
+
{% endif %}
|
|
920
|
+
}
|
|
921
|
+
{% endfor %}
|
|
922
|
+
}
|
|
923
|
+
}
|
|
924
|
+
{% endif %}
|
|
925
|
+
|
|
926
|
+
{% comment %} Note: Unreachable code below - returns happen in command handling {% endcomment %}
|
|
927
|
+
{% comment %} This is intentional - the loop above handles all exits {% endcomment %}
|
|
928
|
+
}
|
|
929
|
+
|
|
930
|
+
{% endfor %}
|
|
931
|
+
}
|
|
932
|
+
|
|
933
|
+
{% if streaming %}
|
|
934
|
+
// ============================================================================
|
|
935
|
+
// Streaming Parser (multi-chunk support)
|
|
936
|
+
// ============================================================================
|
|
937
|
+
|
|
938
|
+
/// Outcome of one `StreamingParser::parse` call; tells the caller whether to keep feeding chunks.
|
|
939
|
+
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
|
|
940
|
+
pub enum ParseResult {
|
|
941
|
+
/// Parsing is finished; do not supply further chunks.
/// NOTE(review): the only `parse` path in this template that returns this value is the
/// buffer-limit error path; normal end of input is handled by `finish()` - confirm intent.
|
|
942
|
+
Complete,
|
|
943
|
+
/// More input can be accepted - provide another chunk, or call `finish()` once the input is exhausted.
|
|
944
|
+
NeedMoreData,
|
|
945
|
+
}
|
|
946
|
+
|
|
947
|
+
/// Owned counterpart of `Event`, handed to the `StreamingParser` callbacks.
/// Content is stored as `Vec<u8>` instead of a borrowed `Cow`, so an event does not
/// borrow from the chunk buffer it was parsed from.
|
|
948
|
+
#[derive(Debug, Clone, PartialEq)]
|
|
949
|
+
pub enum StreamEvent {
|
|
950
|
+
{% for type in types %}
|
|
951
|
+
{% if type.kind == "bracket" %}
|
|
952
|
+
/// Start marker generated for a type whose kind is "bracket".
{{ type.name }}Start { span: Range<usize> },
|
|
953
|
+
/// End marker generated for a type whose kind is "bracket".
{{ type.name }}End { span: Range<usize> },
|
|
954
|
+
{% elsif type.kind == "content" %}
|
|
955
|
+
/// Variant generated for a type whose kind is "content"; `content` is an owned copy of the bytes.
{{ type.name }} { content: Vec<u8>, span: Range<usize> },
|
|
956
|
+
{% endif %}
|
|
957
|
+
{% endfor %}
|
|
958
|
+
/// Error report carrying the parser's error code and the span it applies to.
Error { code: ParseErrorCode, span: Range<usize> },
|
|
959
|
+
}
|
|
960
|
+
|
|
961
|
+
impl StreamEvent {
|
|
962
|
+
/// Convert from borrowed Event to owned StreamEvent.
|
|
963
|
+
fn from_event(event: Event<'_>, offset: usize) -> Self {
|
|
964
|
+
match event {
|
|
965
|
+
{% for type in types %}
|
|
966
|
+
{% if type.kind == "bracket" %}
|
|
967
|
+
Event::{{ type.name }}Start { span } => {
|
|
968
|
+
StreamEvent::{{ type.name }}Start { span: (span.start + offset)..(span.end + offset) }
|
|
969
|
+
}
|
|
970
|
+
Event::{{ type.name }}End { span } => {
|
|
971
|
+
StreamEvent::{{ type.name }}End { span: (span.start + offset)..(span.end + offset) }
|
|
972
|
+
}
|
|
973
|
+
{% elsif type.kind == "content" %}
|
|
974
|
+
Event::{{ type.name }} { content, span } => {
|
|
975
|
+
StreamEvent::{{ type.name }} {
|
|
976
|
+
content: content.into_owned(),
|
|
977
|
+
span: (span.start + offset)..(span.end + offset),
|
|
978
|
+
}
|
|
979
|
+
}
|
|
980
|
+
{% endif %}
|
|
981
|
+
{% endfor %}
|
|
982
|
+
Event::Error { code, span } => {
|
|
983
|
+
StreamEvent::Error { code, span: (span.start + offset)..(span.end + offset) }
|
|
984
|
+
}
|
|
985
|
+
}
|
|
986
|
+
}
|
|
987
|
+
}
|
|
988
|
+
|
|
989
|
+
/// Streaming parser for chunk-by-chunk input.
|
|
990
|
+
///
|
|
991
|
+
/// Wraps the single-buffer `Parser` to handle input arriving in chunks.
|
|
992
|
+
/// Buffers incomplete content across chunk boundaries.
|
|
993
|
+
///
|
|
994
|
+
/// # Example
|
|
995
|
+
///
|
|
996
|
+
/// ```ignore
|
|
997
|
+
/// let mut parser = StreamingParser::new();
|
|
998
|
+
/// loop {
|
|
999
|
+
/// match parser.parse(chunk, |event| handle(event)) {
|
|
1000
|
+
/// ParseResult::Complete => break,
|
|
1001
|
+
/// ParseResult::NeedMoreData => {
|
|
1002
|
+
/// chunk = get_next_chunk();
|
|
1003
|
+
/// if chunk.is_empty() {
|
|
1004
|
+
/// parser.finish(|event| handle(event));
|
|
1005
|
+
/// break;
|
|
1006
|
+
/// }
|
|
1007
|
+
/// }
|
|
1008
|
+
/// }
|
|
1009
|
+
/// }
|
|
1010
|
+
/// ```
|
|
1011
|
+
pub struct StreamingParser {
|
|
1012
|
+
/// Bytes received through `parse` that have not been handed to the inner `Parser` yet
/// (the trailing partial line left over at a chunk boundary).
|
|
1013
|
+
buffer: Vec<u8>,
|
|
1014
|
+
/// Upper bound, in bytes, on `buffer`. When it is exceeded `parse` emits an `Error`
/// event and drops the buffered bytes. `new()` uses 4096.
|
|
1015
|
+
max_buffer: usize,
|
|
1016
|
+
/// Number of bytes already parsed by earlier `parse` calls; added to every event span
/// so spans refer to positions in the whole stream rather than in one chunk.
|
|
1017
|
+
global_offset: usize,
|
|
1018
|
+
/// Line number carried across chunks (starts at 1); seeds each inner `Parser`.
|
|
1019
|
+
line: u32,
|
|
1020
|
+
/// Column number carried across chunks (starts at 1); seeds each inner `Parser`.
|
|
1021
|
+
column: u32,
|
|
1022
|
+
}
|
|
1023
|
+
|
|
1024
|
+
impl Default for StreamingParser {
|
|
1025
|
+
fn default() -> Self {
|
|
1026
|
+
Self::new()
|
|
1027
|
+
}
|
|
1028
|
+
}
|
|
1029
|
+
|
|
1030
|
+
impl StreamingParser {
|
|
1031
|
+
/// Create a new streaming parser with default settings.
|
|
1032
|
+
pub fn new() -> Self {
|
|
1033
|
+
Self::with_max_buffer(4096)
|
|
1034
|
+
}
|
|
1035
|
+
|
|
1036
|
+
/// Create a streaming parser with custom buffer limit.
|
|
1037
|
+
pub fn with_max_buffer(max_buffer: usize) -> Self {
|
|
1038
|
+
Self {
|
|
1039
|
+
buffer: Vec::new(),
|
|
1040
|
+
max_buffer,
|
|
1041
|
+
global_offset: 0,
|
|
1042
|
+
line: 1,
|
|
1043
|
+
column: 1,
|
|
1044
|
+
}
|
|
1045
|
+
}
|
|
1046
|
+
|
|
1047
|
+
/// Parse a chunk of input, emitting events for complete lines.
|
|
1048
|
+
///
|
|
1049
|
+
/// Uses line-oriented streaming: only parses complete lines (ending in `\n`).
|
|
1050
|
+
/// Incomplete lines are buffered until more data arrives.
|
|
1051
|
+
///
|
|
1052
|
+
/// Returns `NeedMoreData` if the chunk ends mid-line.
|
|
1053
|
+
/// Call `finish()` after the last chunk to handle any remaining content.
|
|
1054
|
+
pub fn parse<F>(&mut self, chunk: &[u8], mut on_event: F) -> ParseResult
|
|
1055
|
+
where
|
|
1056
|
+
F: FnMut(StreamEvent),
|
|
1057
|
+
{
|
|
1058
|
+
// Append new chunk to buffer
|
|
1059
|
+
self.buffer.extend_from_slice(chunk);
|
|
1060
|
+
|
|
1061
|
+
if self.buffer.is_empty() {
|
|
1062
|
+
return ParseResult::NeedMoreData;
|
|
1063
|
+
}
|
|
1064
|
+
|
|
1065
|
+
// Check buffer size limit
|
|
1066
|
+
if self.buffer.len() > self.max_buffer {
|
|
1067
|
+
on_event(StreamEvent::Error {
|
|
1068
|
+
code: ParseErrorCode::UnexpectedEof, // Buffer overflow
|
|
1069
|
+
span: self.global_offset..self.global_offset,
|
|
1070
|
+
});
|
|
1071
|
+
self.buffer.clear();
|
|
1072
|
+
return ParseResult::Complete;
|
|
1073
|
+
}
|
|
1074
|
+
|
|
1075
|
+
// Find last complete line (ending with \n)
|
|
1076
|
+
let last_newline = self.buffer.iter().rposition(|&b| b == b'\n');
|
|
1077
|
+
|
|
1078
|
+
let parse_end = match last_newline {
|
|
1079
|
+
Some(pos) => pos + 1, // Include the newline
|
|
1080
|
+
None => return ParseResult::NeedMoreData, // No complete line yet
|
|
1081
|
+
};
|
|
1082
|
+
|
|
1083
|
+
// Extract complete lines to parse
|
|
1084
|
+
let to_parse: Vec<u8> = self.buffer.drain(..parse_end).collect();
|
|
1085
|
+
let offset = self.global_offset;
|
|
1086
|
+
|
|
1087
|
+
// Parse complete lines
|
|
1088
|
+
let mut inner = Parser::new(&to_parse);
|
|
1089
|
+
inner.line = self.line;
|
|
1090
|
+
inner.column = self.column;
|
|
1091
|
+
|
|
1092
|
+
inner.parse(|event| {
|
|
1093
|
+
on_event(StreamEvent::from_event(event, offset));
|
|
1094
|
+
});
|
|
1095
|
+
|
|
1096
|
+
// Update state
|
|
1097
|
+
self.global_offset += to_parse.len();
|
|
1098
|
+
for &b in &to_parse {
|
|
1099
|
+
if b == b'\n' {
|
|
1100
|
+
self.line += 1;
|
|
1101
|
+
self.column = 1;
|
|
1102
|
+
} else {
|
|
1103
|
+
self.column += 1;
|
|
1104
|
+
}
|
|
1105
|
+
}
|
|
1106
|
+
|
|
1107
|
+
ParseResult::NeedMoreData
|
|
1108
|
+
}
|
|
1109
|
+
|
|
1110
|
+
/// Signal end of input and handle any remaining buffered content.
|
|
1111
|
+
///
|
|
1112
|
+
/// This triggers EOF handling for any incomplete constructs.
|
|
1113
|
+
pub fn finish<F>(mut self, mut on_event: F)
|
|
1114
|
+
where
|
|
1115
|
+
F: FnMut(StreamEvent),
|
|
1116
|
+
{
|
|
1117
|
+
if self.buffer.is_empty() {
|
|
1118
|
+
return;
|
|
1119
|
+
}
|
|
1120
|
+
|
|
1121
|
+
// Parse remaining buffer - this will hit EOF
|
|
1122
|
+
let input = std::mem::take(&mut self.buffer);
|
|
1123
|
+
let offset = self.global_offset;
|
|
1124
|
+
|
|
1125
|
+
let mut inner = Parser::new(&input);
|
|
1126
|
+
inner.line = self.line;
|
|
1127
|
+
inner.column = self.column;
|
|
1128
|
+
|
|
1129
|
+
inner.parse(|event| {
|
|
1130
|
+
on_event(StreamEvent::from_event(event, offset));
|
|
1131
|
+
});
|
|
1132
|
+
}
|
|
1133
|
+
|
|
1134
|
+
/// Returns the current global byte offset.
|
|
1135
|
+
pub fn offset(&self) -> usize {
|
|
1136
|
+
self.global_offset
|
|
1137
|
+
}
|
|
1138
|
+
|
|
1139
|
+
/// Returns the current line number.
|
|
1140
|
+
pub fn line(&self) -> u32 {
|
|
1141
|
+
self.line
|
|
1142
|
+
}
|
|
1143
|
+
|
|
1144
|
+
/// Returns the current column number.
|
|
1145
|
+
pub fn column(&self) -> u32 {
|
|
1146
|
+
self.column
|
|
1147
|
+
}
|
|
1148
|
+
}
|
|
1149
|
+
{% endif %}
|
|
1150
|
+
|
|
1151
|
+
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: parsing empty input must run to completion without panicking.
    /// Nothing is asserted about the collected events because they depend on
    /// the specific parser being generated.
    #[test]
    fn test_parse_basic() {
        let mut collected = Vec::new();
        let input = b"";
        Parser::new(input).parse(|event| collected.push(event));
    }
}
|