descent 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1163 @@
1
+ //! Generated parser - DO NOT EDIT
2
+ //!
3
+ //! Generated by descent from {{ parser }}.desc
4
+ //!
5
+ //! Callback-based recursive descent parser.
6
+ //! Call stack = element stack. True recursion handles nesting naturally.
7
+
8
+ use std::ops::Range;
9
+ {% if keywords.size > 0 %}
10
+ use phf::phf_map;
11
+ {% endif %}
12
+
13
+ /// Events emitted by the parser: a `Start`/`End` pair (span only) per bracket type, one variant carrying content bytes plus span per content type, and `Error`.
14
+ #[derive(Debug, Clone, PartialEq)]
15
+ pub enum Event<'a> {
16
+ {% for type in types %}
17
+ {% if type.kind == "bracket" %}
18
+ {{ type.name }}Start { span: Range<usize> },
19
+ {{ type.name }}End { span: Range<usize> },
20
+ {% elsif type.kind == "content" %}
21
+ {{ type.name }} { content: std::borrow::Cow<'a, [u8]>, span: Range<usize> },
22
+ {% endif %}
23
+ {% endfor %}
24
+ Error { code: ParseErrorCode, span: Range<usize> },
25
+ }
26
+
27
+ impl<'a> Event<'a> {
28
+ /// Format the event as one line for test output: variant name, debug-quoted content where the variant has any (shown as `<invalid utf8>` if it is not valid UTF-8), then `@ start..end`.
29
+ pub fn format_line(&self) -> String {
30
+ match self {
31
+ {% for type in types %}
32
+ {% if type.kind == "bracket" %}
33
+ Event::{{ type.name }}Start { span } => {
34
+ format!("{{ type.name }}Start @ {}..{}", span.start, span.end)
35
+ }
36
+ Event::{{ type.name }}End { span } => {
37
+ format!("{{ type.name }}End @ {}..{}", span.start, span.end)
38
+ }
39
+ {% elsif type.kind == "content" %}
40
+ Event::{{ type.name }} { content, span } => {
41
+ let s = std::str::from_utf8(content.as_ref()).unwrap_or("<invalid utf8>");
42
+ format!("{{ type.name }} {:?} @ {}..{}", s, span.start, span.end)
43
+ }
44
+ {% endif %}
45
+ {% endfor %}
46
+ Event::Error { code, span } => {
47
+ format!("Error {:?} @ {}..{}", code, span.start, span.end)
48
+ }
49
+ }
50
+ }
51
+ }
52
+
53
+ {% comment %} Generate phf keyword maps {% endcomment %}
54
+ {% for kw in keywords %}
55
+ /// Keyword lookup map for {{ kw.name }}.
56
+ /// Generated from |keywords[{{ kw.name }}] - O(1) perfect hash lookup. Each value is the zero-based index of the mapping, which the matching lookup method dispatches on.
57
+ static {{ kw.const_name }}: phf::Map<&'static [u8], u8> = phf_map! {
58
+ {% for m in kw.mappings %}
59
+ {% assign idx = forloop.index0 %}
60
+ b"{{ m.keyword }}" => {{ idx }}u8,
61
+ {% endfor %}
62
+ };
63
+ {% endfor %}
64
+
65
+ /// Error codes for parse errors: two built-ins, one `Unclosed<return_type>` per function flagged `expects_char`, then the custom codes in PascalCase; duplicates are emitted once.
66
+ #[derive(Debug, Clone, Copy, PartialEq, Eq)]
67
+ pub enum ParseErrorCode {
68
+ UnexpectedEof,
69
+ UnexpectedChar,
70
+ {% comment %} Deduplicate error codes by return_type - multiple functions may return the same type {% endcomment %}
71
+ {% comment %} Use comma-delimited format with leading comma to avoid partial matches {% endcomment %}
72
+ {% assign seen_error_codes = ",UnexpectedEof,UnexpectedChar," %}
73
+ {% for func in functions %}
74
+ {% if func.expects_char %}
75
+ {% assign unclosed_code = "Unclosed" | append: func.return_type %}
76
+ {% assign code_pattern = "," | append: unclosed_code | append: "," %}
77
+ {% unless seen_error_codes contains code_pattern %}
78
+ {{ unclosed_code }},
79
+ {% assign seen_error_codes = seen_error_codes | append: unclosed_code | append: "," %}
80
+ {% endunless %}
81
+ {% endif %}
82
+ {% endfor %}
83
+ {% comment %} Custom error codes from /error(code) calls - skip if already generated {% endcomment %}
84
+ {% for code in custom_error_codes %}
85
+ {% assign pascal_code = code | pascalcase %}
86
+ {% assign code_pattern = "," | append: pascal_code | append: "," %}
87
+ {% unless seen_error_codes contains code_pattern %}
88
+ {{ pascal_code }},
89
+ {% assign seen_error_codes = seen_error_codes | append: pascal_code | append: "," %}
90
+ {% endunless %}
91
+ {% endfor %}
92
+ }
93
+
94
/// Callback-driven recursive descent parser over a byte slice.
///
/// Nesting is handled by real recursion: the call stack doubles as the
/// element stack, so no explicit stack is stored here.
pub struct Parser<'a> {
    /// Bytes being parsed.
    input: &'a [u8],
    /// Index of the next byte to read.
    pos: usize,
    /// 1-based line number of `pos`.
    line: u32,
    /// 1-based column of `pos`, counted in bytes.
    column: u32,
    /// Start of the content currently being accumulated.
    mark_pos: usize,
    /// Explicit end of the accumulated content, or `usize::MAX` when the end
    /// should follow `pos`.
    term_pos: usize,
    /// Bytes placed in front of the accumulated content when it is taken.
    prepend_buf: Vec<u8>,
}
106
+
107
+ #[allow(unused_variables, dead_code)]
108
+ impl<'a> Parser<'a> {
109
+ /// Create a new parser for the given input.
110
+ pub fn new(input: &'a [u8]) -> Self {
111
+ Self {
112
+ input,
113
+ pos: 0,
114
+ mark_pos: 0,
115
+ term_pos: 0,
116
+ prepend_buf: Vec::new(),
117
+ line: 1,
118
+ column: 1,
119
+ }
120
+ }
121
+
122
+ /// Consume the parser and run the grammar's entry-point parse function over the input, calling `on_event` for each event.
123
+ ///
124
+ /// The callback receives events in document order.
125
+ /// For backpressure, have the callback send to a bounded channel.
126
+ pub fn parse<F>(mut self, mut on_event: F)
127
+ where
128
+ F: FnMut(Event<'a>),
129
+ {
130
+ self.parse_{{ entry_point | remove: "/" }}(&mut on_event);
131
+ }
132
+
133
+ // ========== Helpers ==========
134
+
135
+ #[inline(always)]
136
+ fn peek(&self) -> Option<u8> {
137
+ self.input.get(self.pos).copied()
138
+ }
139
+
140
+ {% if trace %}
141
/// Render an optional byte for trace output.
///
/// `None` becomes `EOF`; newline, tab and carriage return are shown as quoted
/// escape sequences; printable ASCII (space included) is shown quoted; every
/// other byte is shown as a two-digit hex value.
fn trace_byte(b: Option<u8>) -> String {
    let byte = match b {
        Some(byte) => byte,
        None => return String::from("EOF"),
    };
    match byte {
        b'\n' => String::from("'\\n'"),
        b'\t' => String::from("'\\t'"),
        b'\r' => String::from("'\\r'"),
        // 0x20..=0x7e: the space plus every ASCII graphic character.
        b' '..=b'~' => format!("'{}'", char::from(byte)),
        _ => format!("0x{:02x}", byte),
    }
}
153
+
154
+ /// Format accumulated content for trace display (truncated).
155
+ /// Shows slice content only (prepend buffer shown separately if non-empty).
156
+ fn trace_content(&self) -> String {
157
+ let end = if self.term_pos != usize::MAX { self.term_pos } else { self.pos };
158
+ let slice = &self.input[self.mark_pos..end];
159
+ let prepend_info = if self.prepend_buf.is_empty() {
160
+ String::new()
161
+ } else {
162
+ format!("+{}", self.prepend_buf.len())
163
+ };
164
+ if slice.is_empty() && self.prepend_buf.is_empty() {
165
+ return "[]".to_string();
166
+ }
167
+ let s = std::str::from_utf8(slice).unwrap_or("<binary>");
168
+ if s.len() > 32 {
169
+ format!("[{:?}...]{}", &s[..32], prepend_info)
170
+ } else {
171
+ format!("[{:?}]{}", s, prepend_info)
172
+ }
173
+ }
174
+ {% endif %}
175
+
176
+ #[inline(always)]
177
+ fn eof(&self) -> bool {
178
+ self.pos >= self.input.len()
179
+ }
180
+
181
+ #[inline(always)]
182
+ fn advance(&mut self) {
183
+ if self.pos < self.input.len() {
184
+ if self.input[self.pos] == b'\n' {
185
+ self.line += 1;
186
+ self.column = 1;
187
+ } else {
188
+ self.column += 1;
189
+ }
190
+ self.pos += 1;
191
+ }
192
+ }
193
+
194
+ #[inline(always)]
195
+ fn mark(&mut self) {
196
+ self.mark_pos = self.pos;
197
+ self.term_pos = usize::MAX; // Sentinel: TERM not yet called
198
+ // Note: prepend_buf is NOT cleared here - it persists until term() consumes it.
199
+ // This allows PREPEND to be called before a nested function that does MARK.
200
+ }
201
+
202
+ {% if uses_set_term %}
203
+ /// Set termination point with optional offset (e.g., -1 for one char before current)
204
+ /// Clamps to valid range [0, input.len()] to prevent underflow/overflow.
205
+ #[inline(always)]
206
+ fn set_term(&mut self, offset: i32) {
207
+ let new_pos = self.pos as i64 + offset as i64;
208
+ self.term_pos = new_pos.clamp(0, self.input.len() as i64) as usize;
209
+ }
210
+ {% endif %}
211
+
212
+ /// Prepend bytes to the accumulation buffer.
213
+ /// Empty slice is naturally a no-op.
214
+ #[inline(always)]
215
+ fn prepend_bytes(&mut self, bytes: &[u8]) {
216
+ self.prepend_buf.extend_from_slice(bytes);
217
+ }
218
+
219
+ /// Get accumulated content from MARK to TERM, including any prepended bytes.
220
+ /// Returns Cow::Borrowed when no prepend (zero-copy), Cow::Owned when prepend used.
221
+ #[inline(always)]
222
+ fn term(&mut self) -> std::borrow::Cow<'a, [u8]> {
223
+ // Use term_pos if set after mark, otherwise use current pos
224
+ let end = if self.term_pos != usize::MAX { self.term_pos } else { self.pos };
225
+ let slice = &self.input[self.mark_pos..end];
226
+
227
+ if self.prepend_buf.is_empty() {
228
+ std::borrow::Cow::Borrowed(slice)
229
+ } else {
230
+ let mut combined = std::mem::take(&mut self.prepend_buf);
231
+ combined.extend_from_slice(slice);
232
+ std::borrow::Cow::Owned(combined)
233
+ }
234
+ }
235
+
236
+ #[inline(always)]
237
+ fn span(&self) -> Range<usize> {
238
+ self.pos..self.pos
239
+ }
240
+
241
+ #[inline(always)]
242
+ fn span_from_mark(&self) -> Range<usize> {
243
+ // Use term_pos if set after mark, otherwise use current pos
244
+ let end = if self.term_pos != usize::MAX { self.term_pos } else { self.pos };
245
+ self.mark_pos..end
246
+ }
247
+
248
+ {% if uses_col %}
249
+ #[inline(always)]
250
+ fn col(&self) -> i32 {
251
+ self.column as i32
252
+ }
253
+ {% endif %}
254
+
255
+ {% if uses_prev %}
256
+ /// Previous byte (0 at start of input).
257
+ #[inline(always)]
258
+ fn prev(&self) -> u8 {
259
+ if self.pos > 0 {
260
+ self.input[self.pos - 1]
261
+ } else {
262
+ 0
263
+ }
264
+ }
265
+ {% endif %}
266
+
267
+ {% if uses_letter %}
268
/// True for ASCII letters `A`-`Z` and `a`-`z`.
#[inline(always)]
fn is_letter(b: u8) -> bool {
    matches!(b, b'A'..=b'Z' | b'a'..=b'z')
}
272
+ {% endif %}
273
+
274
+ {% if uses_label_cont %}
275
/// True for bytes that may continue a label: ASCII letters, ASCII digits,
/// underscore and hyphen.
#[inline(always)]
fn is_label_cont(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'A'..=b'Z' | b'a'..=b'z' | b'_' | b'-')
}
279
+ {% endif %}
280
+
281
+ {% if uses_digit %}
282
/// True for ASCII decimal digits `0`-`9`.
#[inline(always)]
fn is_digit(b: u8) -> bool {
    matches!(b, b'0'..=b'9')
}
286
+ {% endif %}
287
+
288
+ {% if uses_hex_digit %}
289
/// True for ASCII hexadecimal digits: `0`-`9`, `a`-`f` and `A`-`F`.
#[inline(always)]
fn is_hex_digit(b: u8) -> bool {
    matches!(b, b'0'..=b'9' | b'a'..=b'f' | b'A'..=b'F')
}
293
+ {% endif %}
294
+
295
+ {% if uses_ws %}
296
/// True for horizontal whitespace: space or tab. Newlines are not included.
#[inline(always)]
fn is_ws(b: u8) -> bool {
    matches!(b, b' ' | b'\t')
}
300
+ {% endif %}
301
+
302
+ {% if uses_nl %}
303
/// True only for the line-feed byte `\n`.
#[inline(always)]
fn is_nl(b: u8) -> bool {
    matches!(b, b'\n')
}
307
+ {% endif %}
308
+
309
+ {% if uses_unicode %}
310
+ // ========== Unicode Identifier Classes ==========
311
+ // Requires `unicode-xid` crate for full Unicode support.
312
+ // These is_* methods work with the byte-at-a-time matching pattern.
313
+ // For ASCII bytes, they use unicode-xid. For non-ASCII, the check
314
+ // is done at the byte level (first byte of UTF-8 sequence).
315
+
316
+ /// XID_Start: Can start a Unicode identifier.
317
+ /// For ASCII, uses unicode-xid. For non-ASCII first bytes, returns true
318
+ /// (conservative - actual validation happens via match_xid_start for multi-byte).
319
+ #[inline(always)]
320
+ fn is_xid_start(b: u8) -> bool {
321
+ use unicode_xid::UnicodeXID;
322
+ if b < 0x80 {
323
+ (b as char).is_xid_start()
324
+ } else {
325
+ // Non-ASCII: could be start of valid UTF-8 XID_Start sequence
326
+ // Return true for lead bytes (0xC2-0xF4), let advance handle it
327
+ b >= 0xC2 && b <= 0xF4
328
+ }
329
+ }
330
+
331
+ /// XID_Continue: Can continue a Unicode identifier.
332
+ #[inline(always)]
333
+ fn is_xid_cont(b: u8) -> bool {
334
+ use unicode_xid::UnicodeXID;
335
+ if b < 0x80 {
336
+ (b as char).is_xid_continue()
337
+ } else {
338
+ // Non-ASCII: could be valid UTF-8 continuation or lead byte
339
+ b >= 0x80
340
+ }
341
+ }
342
+
343
+ /// XLBL_Start: label-start class (for label syntax); delegates to `is_xid_start`, so it accepts exactly the same bytes as XID_Start.
345
+ #[inline(always)]
346
+ fn is_xlbl_start(b: u8) -> bool {
347
+ Self::is_xid_start(b)
348
+ }
348
+
349
+ /// XLBL_Continue: XID_Continue + hyphen (for kebab-case labels).
350
+ #[inline(always)]
351
+ fn is_xlbl_cont(b: u8) -> bool {
352
+ b == b'-' || Self::is_xid_cont(b)
353
+ }
354
+ {% endif %}
355
+
356
+ {% if max_scan_arity > 0 %}
357
+ // ========== SCAN Methods (SIMD-accelerated via memchr) ==========
358
+ // '\n' is included in scan targets by the generator for line tracking.
359
+ // When '\n' is found, caller handles line/column update. No newlines
360
+ // exist between start and found position, so we just add offset to column.
361
+
362
+ {% if max_scan_arity >= 1 %}
363
+ /// Scan forward to find first occurrence of b1.
364
+ #[inline(always)]
365
+ fn scan_to1(&mut self, b1: u8) -> Option<u8> {
366
+ match memchr::memchr(b1, &self.input[self.pos..]) {
367
+ Some(offset) => {
368
+ self.column += offset as u32;
369
+ self.pos += offset;
370
+ Some(self.input[self.pos])
371
+ }
372
+ None => {
373
+ self.column += (self.input.len() - self.pos) as u32;
374
+ self.pos = self.input.len();
375
+ None
376
+ }
377
+ }
378
+ }
379
+ {% endif %}
380
+
381
+ {% if max_scan_arity >= 2 %}
382
+ /// Scan forward to find first occurrence of b1 or b2.
383
+ #[inline(always)]
384
+ fn scan_to2(&mut self, b1: u8, b2: u8) -> Option<u8> {
385
+ match memchr::memchr2(b1, b2, &self.input[self.pos..]) {
386
+ Some(offset) => {
387
+ self.column += offset as u32;
388
+ self.pos += offset;
389
+ Some(self.input[self.pos])
390
+ }
391
+ None => {
392
+ self.column += (self.input.len() - self.pos) as u32;
393
+ self.pos = self.input.len();
394
+ None
395
+ }
396
+ }
397
+ }
398
+ {% endif %}
399
+
400
+ {% if max_scan_arity >= 3 %}
401
+ /// Scan forward to find first occurrence of b1, b2, or b3.
402
+ #[inline(always)]
403
+ fn scan_to3(&mut self, b1: u8, b2: u8, b3: u8) -> Option<u8> {
404
+ match memchr::memchr3(b1, b2, b3, &self.input[self.pos..]) {
405
+ Some(offset) => {
406
+ self.column += offset as u32;
407
+ self.pos += offset;
408
+ Some(self.input[self.pos])
409
+ }
410
+ None => {
411
+ self.column += (self.input.len() - self.pos) as u32;
412
+ self.pos = self.input.len();
413
+ None
414
+ }
415
+ }
416
+ }
417
+ {% endif %}
418
+
419
+ {% if max_scan_arity >= 4 %}
420
+ /// Scan forward to find first occurrence of b1..b4 (chained memchr).
421
+ /// Limits second search to range of first hit to avoid O(n²) behavior.
422
+ #[inline(always)]
423
+ fn scan_to4(&mut self, b1: u8, b2: u8, b3: u8, b4: u8) -> Option<u8> {
424
+ let haystack = &self.input[self.pos..];
425
+ let p1 = memchr::memchr3(b1, b2, b3, haystack);
426
+ let p2 = match p1 {
427
+ Some(limit) => memchr::memchr(b4, &haystack[..limit]),
428
+ None => memchr::memchr(b4, haystack),
429
+ };
430
+ let offset = match (p1, p2) {
431
+ (Some(x), Some(y)) => Some(x.min(y)),
432
+ (Some(x), None) | (None, Some(x)) => Some(x),
433
+ (None, None) => None,
434
+ };
435
+ match offset {
436
+ Some(off) => {
437
+ self.column += off as u32;
438
+ self.pos += off;
439
+ Some(self.input[self.pos])
440
+ }
441
+ None => {
442
+ self.column += (self.input.len() - self.pos) as u32;
443
+ self.pos = self.input.len();
444
+ None
445
+ }
446
+ }
447
+ }
448
+ {% endif %}
449
+
450
+ {% if max_scan_arity >= 5 %}
451
+ /// Scan forward to find first occurrence of b1..b5 (chained memchr).
452
+ /// Limits second search to range of first hit to avoid O(n²) behavior.
453
+ #[inline(always)]
454
+ fn scan_to5(&mut self, b1: u8, b2: u8, b3: u8, b4: u8, b5: u8) -> Option<u8> {
455
+ let haystack = &self.input[self.pos..];
456
+ let p1 = memchr::memchr3(b1, b2, b3, haystack);
457
+ let p2 = match p1 {
458
+ Some(limit) => memchr::memchr2(b4, b5, &haystack[..limit]),
459
+ None => memchr::memchr2(b4, b5, haystack),
460
+ };
461
+ let offset = match (p1, p2) {
462
+ (Some(x), Some(y)) => Some(x.min(y)),
463
+ (Some(x), None) | (None, Some(x)) => Some(x),
464
+ (None, None) => None,
465
+ };
466
+ match offset {
467
+ Some(off) => {
468
+ self.column += off as u32;
469
+ self.pos += off;
470
+ Some(self.input[self.pos])
471
+ }
472
+ None => {
473
+ self.column += (self.input.len() - self.pos) as u32;
474
+ self.pos = self.input.len();
475
+ None
476
+ }
477
+ }
478
+ }
479
+ {% endif %}
480
+
481
+ {% if max_scan_arity >= 6 %}
482
+ /// Scan forward to find first occurrence of b1..b6 (chained memchr).
483
+ /// Limits second search to range of first hit to avoid O(n²) behavior.
484
+ #[inline(always)]
485
+ fn scan_to6(&mut self, b1: u8, b2: u8, b3: u8, b4: u8, b5: u8, b6: u8) -> Option<u8> {
486
+ let haystack = &self.input[self.pos..];
487
+ let p1 = memchr::memchr3(b1, b2, b3, haystack);
488
+ let p2 = match p1 {
489
+ Some(limit) => memchr::memchr3(b4, b5, b6, &haystack[..limit]),
490
+ None => memchr::memchr3(b4, b5, b6, haystack),
491
+ };
492
+ let offset = match (p1, p2) {
493
+ (Some(x), Some(y)) => Some(x.min(y)),
494
+ (Some(x), None) | (None, Some(x)) => Some(x),
495
+ (None, None) => None,
496
+ };
497
+ match offset {
498
+ Some(off) => {
499
+ self.column += off as u32;
500
+ self.pos += off;
501
+ Some(self.input[self.pos])
502
+ }
503
+ None => {
504
+ self.column += (self.input.len() - self.pos) as u32;
505
+ self.pos = self.input.len();
506
+ None
507
+ }
508
+ }
509
+ }
510
+ {% endif %}
511
+ {% endif %}
512
+
513
+ {% comment %} Generate keyword lookup methods {% endcomment %}
514
+ {% for kw in keywords %}
515
+ // ========== Keyword Lookup: {{ kw.name }} ==========
516
+
517
+ /// Look up accumulated content in {{ kw.name }} keywords.
518
+ /// Returns true if a keyword matched (event emitted), false otherwise. NOTE(review): `term()` drains the prepend buffer even on a miss - confirm the fallback path does not need those bytes.
519
+ fn lookup_{{ kw.name }}<F>(&mut self, on_event: &mut F) -> bool
520
+ where
521
+ F: FnMut(Event<'a>),
522
+ {
523
+ let content = self.term();
524
+ if let Some(&id) = {{ kw.const_name }}.get(content.as_ref()) {
525
+ let span = self.span_from_mark();
526
+ match id {
527
+ {% for m in kw.mappings %}
528
+ {% assign idx = forloop.index0 %}
529
+ {{ idx }} => on_event(Event::{{ m.event_type }} { content, span }),
530
+ {% endfor %}
531
+ _ => unreachable!("keyword map contains only valid ids"),
532
+ }
533
+ true
534
+ } else {
535
+ false
536
+ }
537
+ }
538
+
539
+ /// Look up and emit keyword; on a miss, call the configured fallback parse function (forwarding its arguments when it has any), or do nothing when no fallback is configured.
540
+ fn lookup_{{ kw.name }}_or_fallback<F>(&mut self, on_event: &mut F)
541
+ where
542
+ F: FnMut(Event<'a>),
543
+ {
544
+ if !self.lookup_{{ kw.name }}(on_event) {
545
+ {% if kw.fallback_func %}
546
+ {% if kw.fallback_args %}
547
+ self.parse_{{ kw.fallback_func }}({{ kw.fallback_args }}, on_event);
548
+ {% else %}
549
+ self.parse_{{ kw.fallback_func }}(on_event);
550
+ {% endif %}
551
+ {% else %}
552
+ // No fallback - keyword not found is a no-op
553
+ {% endif %}
554
+ }
555
+ }
556
+ {% endfor %}
557
+
558
+ // ========== Generated Parse Functions ==========
559
+
560
+ {% for func in functions %}
561
+ {% comment %} Determine return type info {% endcomment %}
562
+ {% assign return_type_info = nil %}
563
+ {% for t in types %}
564
+ {% if t.name == func.return_type %}
565
+ {% assign return_type_info = t %}
566
+ {% endif %}
567
+ {% endfor %}
568
+
569
+ /// Parse {{ func.name }}{% if func.return_type %} -> {{ func.return_type }}{% endif %}
570
+ {% if func.params.size > 0 %}
571
+ fn parse_{{ func.name }}<F>(&mut self, {% for param in func.params %}{{ param }}: {% if func.param_types[param] == "byte" %}u8{% elsif func.param_types[param] == "bytes" %}&'static [u8]{% else %}i32{% endif %}, {% endfor %}on_event: &mut F){% if return_type_info.kind == "internal" %} -> i32{% endif %}
572
+ {% else %}
573
+ fn parse_{{ func.name }}<F>(&mut self, on_event: &mut F){% if return_type_info.kind == "internal" %} -> i32{% endif %}
574
+ {% endif %}
575
+ where
576
+ F: FnMut(Event<'a>),
577
+ {
578
+ {% if trace %}eprintln!("TRACE: L{{ func.lineno }} ENTER {{ func.name }} | byte={} pos={}", Self::trace_byte(self.peek()), self.pos);{% endif %}
579
+
580
+ {% comment %} BRACKET types: emit Start on entry {% endcomment %}
581
+ {% if return_type_info.kind == "bracket" %}
582
+ let start_span = self.span();
583
+ on_event(Event::{{ func.return_type }}Start { span: start_span.clone() });
584
+ {% endif %}
585
+
586
+ {% comment %} CONTENT types: MARK on entry {% endcomment %}
587
+ {% if return_type_info.kind == "content" %}
588
+ self.mark();
589
+ {% endif %}
590
+
591
+ {% comment %} Local variables - use initial value from entry_actions if available {% endcomment %}
592
+ {% comment %} Only add 'mut' if the variable is reassigned in the function body {% endcomment %}
593
+ {% for local in func.locals %}
594
+ {% assign init_val = func.local_init_values[local[0]] | default: "0" %}
595
+ {% if func.mutable_locals contains local[0] %}let mut {{ local[0] }}: i32 = {{ init_val }};{% else %}let {{ local[0] }}: i32 = {{ init_val }};{% endif %}
596
+ {% endfor %}
597
+
598
+ {% comment %} Entry actions (variable initialization on function entry) {% endcomment %}
599
+ {% comment %} Skip MARK for CONTENT types since auto-MARK already handles it {% endcomment %}
600
+ {% for cmd in func.entry_actions %}
601
+ {% unless cmd.type == "mark" and return_type_info.kind == "content" %}
602
+ {% include 'command' cmd: cmd, func: func, return_type_info: return_type_info, states: func.states %}
603
+ {% endunless %}
604
+ {% endfor %}
605
+
606
+ {% comment %} State machine {% endcomment %}
607
+ {% if func.states.size == 0 %}
608
+ {% comment %} No states - immediate return (stateless function) {% endcomment %}
609
+ {% if return_type_info.kind == "content" %}
610
+ on_event(Event::{{ func.return_type }} { content: self.term(), span: self.span_from_mark() });
611
+ {% elsif return_type_info.kind == "bracket" %}
612
+ on_event(Event::{{ func.return_type }}End { span: self.span() });
613
+ {% elsif return_type_info.kind == "internal" %}
614
+ return 0;
615
+ {% endif %}
616
+ {% elsif func.states.size == 1 %}
617
+ {% comment %} Single state - no enum needed {% endcomment %}
618
+ {% assign state = func.states.first %}
619
+ loop {
620
+ {% if state.scannable %}
621
+ {% comment %} SCAN-first optimization {% endcomment %}
622
+ {% assign scan_count = state.scan_chars.size %}
623
+ {% if scan_count == 1 %}
624
+ match self.scan_to1({{ state.scan_chars[0] | escape_rust_char }}) {
625
+ {% elsif scan_count == 2 %}
626
+ match self.scan_to2({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}) {
627
+ {% elsif scan_count == 3 %}
628
+ match self.scan_to3({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}, {{ state.scan_chars[2] | escape_rust_char }}) {
629
+ {% elsif scan_count == 4 %}
630
+ match self.scan_to4({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}, {{ state.scan_chars[2] | escape_rust_char }}, {{ state.scan_chars[3] | escape_rust_char }}) {
631
+ {% elsif scan_count == 5 %}
632
+ match self.scan_to5({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}, {{ state.scan_chars[2] | escape_rust_char }}, {{ state.scan_chars[3] | escape_rust_char }}, {{ state.scan_chars[4] | escape_rust_char }}) {
633
+ {% elsif scan_count == 6 %}
634
+ match self.scan_to6({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}, {{ state.scan_chars[2] | escape_rust_char }}, {{ state.scan_chars[3] | escape_rust_char }}, {{ state.scan_chars[4] | escape_rust_char }}, {{ state.scan_chars[5] | escape_rust_char }}) {
635
+ {% endif %}
636
+ {% for kase in state.cases %}
637
+ {% unless kase.is_default %}
638
+ {% if kase.is_conditional %}
639
+ _ if {{ kase.condition | rust_expr }} => {
640
+ {% elsif kase.param_ref %}
641
+ {% comment %} Parameter reference: match against param value {% endcomment %}
642
+ Some(b) if b == {{ kase.param_ref }} => {
643
+ {% elsif kase.special_class and kase.chars.size > 0 %}
644
+ {% comment %} Combined: class + literal chars {% endcomment %}
645
+ Some(b) if Self::is_{{ kase.special_class }}(b){% for ch in kase.chars %} || b == {{ ch | escape_rust_char }}{% endfor %} => {
646
+ {% elsif kase.special_class %}
647
+ Some(b) if Self::is_{{ kase.special_class }}(b) => {
648
+ {% elsif kase.chars.size == 1 %}
649
+ Some({{ kase.chars[0] | escape_rust_char }}) => {
650
+ {% elsif kase.chars.size > 1 %}
651
+ Some({% for ch in kase.chars %}{{ ch | escape_rust_char }}{% unless forloop.last %} | {% endunless %}{% endfor %}) => {
652
+ {% else %}
653
+ Some(_) => {
654
+ {% endif %}
655
+ {% if trace %}eprintln!("TRACE: L{{ kase.lineno }} {{ func.name }}:{{ state.name }}{% if kase.substate %}.{{ kase.substate }}{% endif %} | byte={} term={} pos={}", Self::trace_byte(self.peek()), self.trace_content(), self.pos);{% endif %}
656
+ {% for cmd in kase.commands %}
657
+ {% include 'command' cmd: cmd, func: func, return_type_info: return_type_info %}
658
+ {% endfor %}
659
+ }
660
+ {% endunless %}
661
+ {% endfor %}
662
+ {% if state.newline_injected %}
663
+ {% comment %} Injected newline: update line/col and continue scanning {% endcomment %}
664
+ Some(b'\n') => {
665
+ self.advance();
666
+ }
667
+ {% endif %}
668
+ None => {
669
+ {% if trace %}eprintln!("TRACE: L{{ state.lineno }} {{ func.name }}:{{ state.name }} EOF | term={} pos={}", self.trace_content(), self.pos);{% endif %}
670
+ {% comment %} EOF handling - check for explicit |eof handler first {% endcomment %}
671
+ {% if state.eof_handler.size > 0 %}
672
+ {% for cmd in state.eof_handler %}
673
+ {% include 'command' cmd: cmd, func: func, return_type_info: return_type_info %}
674
+ {% endfor %}
675
+ {% elsif func.eof_handler.size > 0 %}
676
+ {% for cmd in func.eof_handler %}
677
+ {% include 'command' cmd: cmd, func: func, return_type_info: return_type_info %}
678
+ {% endfor %}
679
+ {% else %}
680
+ {% if return_type_info.kind == "content" %}
681
+ on_event(Event::{{ func.return_type }} { content: self.term(), span: self.span_from_mark() });
682
+ {% endif %}
683
+ {% if func.expects_char %}
684
+ on_event(Event::Error { code: ParseErrorCode::Unclosed{{ func.return_type }}, span: self.span() });
685
+ {% elsif return_type_info.kind == "bracket" %}
686
+ on_event(Event::{{ func.return_type }}End { span: self.span() });
687
+ {% endif %}
688
+ {% if return_type_info.kind == "internal" %}return 0;{% else %}return;{% endif %}
689
+ {% endif %}
690
+ }
691
+ _ => unreachable!("scan_to only returns target chars"),
692
+ }
693
+ {% else %}
694
+ {% comment %} Non-scannable: check EOF first, then match {% endcomment %}
695
+ if self.eof() {
696
+ {% if trace %}eprintln!("TRACE: L{{ state.lineno }} {{ func.name }}:{{ state.name }} EOF | term={} pos={}", self.trace_content(), self.pos);{% endif %}
697
+ {% if state.eof_handler.size > 0 %}
698
+ {% comment %} Fix #13: Explicit |eof handler - use its commands {% endcomment %}
699
+ {% for cmd in state.eof_handler %}
700
+ {% include 'command' cmd: cmd, func: func, return_type_info: return_type_info %}
701
+ {% endfor %}
702
+ {% elsif func.eof_handler.size > 0 %}
703
+ {% comment %} Function-level |eof handler {% endcomment %}
704
+ {% for cmd in func.eof_handler %}
705
+ {% include 'command' cmd: cmd, func: func, return_type_info: return_type_info %}
706
+ {% endfor %}
707
+ {% else %}
708
+ {% comment %} Default EOF behavior based on return type {% endcomment %}
709
+ {% if return_type_info.kind == "content" %}
710
+ on_event(Event::{{ func.return_type }} { content: self.term(), span: self.span_from_mark() });
711
+ {% endif %}
712
+ {% if func.expects_char %}
713
+ on_event(Event::Error { code: ParseErrorCode::Unclosed{{ func.return_type }}, span: self.span() });
714
+ {% elsif return_type_info.kind == "bracket" %}
715
+ on_event(Event::{{ func.return_type }}End { span: self.span() });
716
+ {% endif %}
717
+ {% if return_type_info.kind == "internal" %}return 0;{% else %}return;{% endif %}
718
+ {% endif %}
719
+ }
720
+ {% comment %} Optimization: if only one case and it's default, skip the match {% endcomment %}
721
+ {% if state.cases.size == 1 and state.cases[0].is_default %}
722
+ {% assign kase = state.cases[0] %}
723
+ {% if trace %}eprintln!("TRACE: L{{ kase.lineno }} {{ func.name }}:{{ state.name }}{% if kase.substate %}.{{ kase.substate }}{% endif %} | byte={} term={} pos={}", Self::trace_byte(self.peek()), self.trace_content(), self.pos);{% endif %}
724
+ {% for cmd in kase.commands %}
725
+ {% include 'command' cmd: cmd, func: func, return_type_info: return_type_info %}
726
+ {% endfor %}
727
+ {% else %}
728
+ match self.peek() {
729
+ {% for kase in state.cases %}
730
+ {% if kase.is_default %}
731
+ _ => {
732
+ {% elsif kase.is_conditional %}
733
+ _ if {{ kase.condition | rust_expr }} => {
734
+ {% elsif kase.param_ref %}
735
+ Some(b) if b == {{ kase.param_ref }} => {
736
+ {% elsif kase.special_class and kase.chars.size > 0 %}
737
+ Some(b) if Self::is_{{ kase.special_class }}(b){% for ch in kase.chars %} || b == {{ ch | escape_rust_char }}{% endfor %} => {
738
+ {% elsif kase.special_class %}
739
+ Some(b) if Self::is_{{ kase.special_class }}(b) => {
740
+ {% elsif kase.chars.size == 1 %}
741
+ Some({{ kase.chars[0] | escape_rust_char }}) => {
742
+ {% elsif kase.chars.size > 1 %}
743
+ Some({% for ch in kase.chars %}{{ ch | escape_rust_char }}{% unless forloop.last %} | {% endunless %}{% endfor %}) => {
744
+ {% else %}
745
+ Some(_) => {
746
+ {% endif %}
747
+ {% if trace %}eprintln!("TRACE: L{{ kase.lineno }} {{ func.name }}:{{ state.name }}{% if kase.substate %}.{{ kase.substate }}{% endif %} | byte={} term={} pos={}", Self::trace_byte(self.peek()), self.trace_content(), self.pos);{% endif %}
748
+ {% for cmd in kase.commands %}
749
+ {% include 'command' cmd: cmd, func: func, return_type_info: return_type_info %}
750
+ {% endfor %}
751
+ }
752
+ {% endfor %}
753
+ {% unless state.has_default %}
754
+ {% comment %} Add default arm if state has no explicit default case {% endcomment %}
755
+ _ => {
756
+ {% if trace %}eprintln!("TRACE: {{ func.name }}:{{ state.name }} UNHANDLED | byte={} pos={}", Self::trace_byte(self.peek()), self.pos);{% endif %}
757
+ {% if return_type_info.kind == "internal" %}return 0;{% else %}return;{% endif %}
758
+ }
759
+ {% endunless %}
760
+ }
761
+ {% endif %}
762
+ {% endif %}
763
+ }
764
+ {% else %}
765
+ {% comment %} Multiple states - use enum {% endcomment %}
766
+ #[derive(Clone, Copy)]
767
+ enum State { {% for state in func.states %}{{ state.name | pascalcase }}, {% endfor %} }
768
+ let mut state = State::{{ func.states.first.name | pascalcase }};
769
+
770
+ loop {
771
+ match state {
772
+ {% for state in func.states %}
773
+ State::{{ state.name | pascalcase }} => {
774
+ {% if state.scannable %}
775
+ {% assign scan_count = state.scan_chars.size %}
776
+ {% if scan_count == 1 %}
777
+ match self.scan_to1({{ state.scan_chars[0] | escape_rust_char }}) {
778
+ {% elsif scan_count == 2 %}
779
+ match self.scan_to2({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}) {
780
+ {% elsif scan_count == 3 %}
781
+ match self.scan_to3({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}, {{ state.scan_chars[2] | escape_rust_char }}) {
782
+ {% elsif scan_count == 4 %}
783
+ match self.scan_to4({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}, {{ state.scan_chars[2] | escape_rust_char }}, {{ state.scan_chars[3] | escape_rust_char }}) {
784
+ {% elsif scan_count == 5 %}
785
+ match self.scan_to5({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}, {{ state.scan_chars[2] | escape_rust_char }}, {{ state.scan_chars[3] | escape_rust_char }}, {{ state.scan_chars[4] | escape_rust_char }}) {
786
+ {% elsif scan_count == 6 %}
787
+ match self.scan_to6({{ state.scan_chars[0] | escape_rust_char }}, {{ state.scan_chars[1] | escape_rust_char }}, {{ state.scan_chars[2] | escape_rust_char }}, {{ state.scan_chars[3] | escape_rust_char }}, {{ state.scan_chars[4] | escape_rust_char }}, {{ state.scan_chars[5] | escape_rust_char }}) {
788
+ {% endif %}
789
+ {% for kase in state.cases %}
790
+ {% unless kase.is_default %}
791
+ {% if kase.is_conditional %}
792
+ _ if {{ kase.condition | rust_expr }} => {
793
+ {% elsif kase.param_ref %}
794
+ Some(b) if b == {{ kase.param_ref }} => {
795
+ {% elsif kase.special_class and kase.chars.size > 0 %}
796
+ Some(b) if Self::is_{{ kase.special_class }}(b){% for ch in kase.chars %} || b == {{ ch | escape_rust_char }}{% endfor %} => {
797
+ {% elsif kase.special_class %}
798
+ Some(b) if Self::is_{{ kase.special_class }}(b) => {
799
+ {% elsif kase.chars.size == 1 %}
800
+ Some({{ kase.chars[0] | escape_rust_char }}) => {
801
+ {% elsif kase.chars.size > 1 %}
802
+ Some({% for ch in kase.chars %}{{ ch | escape_rust_char }}{% unless forloop.last %} | {% endunless %}{% endfor %}) => {
803
+ {% else %}
804
+ Some(_) => {
805
+ {% endif %}
806
+ {% if trace %}eprintln!("TRACE: L{{ kase.lineno }} {{ func.name }}:{{ state.name }}{% if kase.substate %}.{{ kase.substate }}{% endif %} | byte={} term={} pos={}", Self::trace_byte(self.peek()), self.trace_content(), self.pos);{% endif %}
807
+ {% for cmd in kase.commands %}
808
+ {% include 'command' cmd: cmd, func: func, return_type_info: return_type_info, states: func.states %}
809
+ {% endfor %}
810
+ }
811
+ {% endunless %}
812
+ {% endfor %}
813
+ {% if state.newline_injected %}
814
+ {% comment %} Injected newline: update line/col and continue scanning {% endcomment %}
815
+ Some(b'\n') => {
816
+ self.advance();
817
+ }
818
+ {% endif %}
819
+ None => {
820
+ {% if trace %}eprintln!("TRACE: L{{ state.lineno }} {{ func.name }}:{{ state.name }} EOF | term={} pos={}", self.trace_content(), self.pos);{% endif %}
821
+ {% comment %} EOF handling - check for explicit |eof handler first {% endcomment %}
822
+ {% if state.eof_handler.size > 0 %}
823
+ {% for cmd in state.eof_handler %}
824
+ {% include 'command' cmd: cmd, func: func, return_type_info: return_type_info, states: func.states %}
825
+ {% endfor %}
826
+ {% elsif func.eof_handler.size > 0 %}
827
+ {% for cmd in func.eof_handler %}
828
+ {% include 'command' cmd: cmd, func: func, return_type_info: return_type_info, states: func.states %}
829
+ {% endfor %}
830
+ {% else %}
831
+ {% if return_type_info.kind == "content" %}
832
+ on_event(Event::{{ func.return_type }} { content: self.term(), span: self.span_from_mark() });
833
+ {% endif %}
834
+ {% if func.expects_char %}
835
+ on_event(Event::Error { code: ParseErrorCode::Unclosed{{ func.return_type }}, span: self.span() });
836
+ {% elsif return_type_info.kind == "bracket" %}
837
+ on_event(Event::{{ func.return_type }}End { span: self.span() });
838
+ {% endif %}
839
+ {% if return_type_info.kind == "internal" %}return 0;{% else %}return;{% endif %}
840
+ {% endif %}
841
+ }
842
+ _ => unreachable!("scan_to only returns target chars"),
843
+ }
844
+ {% else %}
845
+ {% if state.is_unconditional %}
846
+ {% comment %} Unconditional state: execute commands immediately without byte match {% endcomment %}
847
+ {% assign kase = state.cases.first %}
848
+ {% if trace %}eprintln!("TRACE: L{{ kase.lineno }} {{ func.name }}:{{ state.name }}{% if kase.substate %}.{{ kase.substate }}{% endif %} UNCONDITIONAL | term={} pos={}", self.trace_content(), self.pos);{% endif %}
849
+ {% for cmd in kase.commands %}
850
+ {% include 'command' cmd: cmd, func: func, return_type_info: return_type_info, states: func.states %}
851
+ {% endfor %}
852
+ {% else %}
853
+ {% comment %} Non-scannable: check EOF first {% endcomment %}
854
+ if self.eof() {
855
+ {% if trace %}eprintln!("TRACE: L{{ state.lineno }} {{ func.name }}:{{ state.name }} EOF | term={} pos={}", self.trace_content(), self.pos);{% endif %}
856
+ {% if state.eof_handler.size > 0 %}
857
+ {% for cmd in state.eof_handler %}
858
+ {% include 'command' cmd: cmd, func: func, return_type_info: return_type_info, states: func.states %}
859
+ {% endfor %}
860
+ {% elsif func.eof_handler.size > 0 %}
861
+ {% for cmd in func.eof_handler %}
862
+ {% include 'command' cmd: cmd, func: func, return_type_info: return_type_info, states: func.states %}
863
+ {% endfor %}
864
+ {% else %}
865
+ {% if return_type_info.kind == "content" %}
866
+ on_event(Event::{{ func.return_type }} { content: self.term(), span: self.span_from_mark() });
867
+ {% endif %}
868
+ {% if func.expects_char %}
869
+ on_event(Event::Error { code: ParseErrorCode::Unclosed{{ func.return_type }}, span: self.span() });
870
+ {% elsif return_type_info.kind == "bracket" %}
871
+ on_event(Event::{{ func.return_type }}End { span: self.span() });
872
+ {% endif %}
873
+ {% if return_type_info.kind == "internal" %}return 0;{% else %}return;{% endif %}
874
+ {% endif %}
875
+ }
876
+ {% comment %} Optimization: if only one case and it's default, skip the match {% endcomment %}
877
+ {% if state.cases.size == 1 and state.cases[0].is_default %}
878
+ {% assign kase = state.cases[0] %}
879
+ {% if trace %}eprintln!("TRACE: L{{ kase.lineno }} {{ func.name }}:{{ state.name }}{% if kase.substate %}.{{ kase.substate }}{% endif %} | byte={} term={} pos={}", Self::trace_byte(self.peek()), self.trace_content(), self.pos);{% endif %}
880
+ {% for cmd in kase.commands %}
881
+ {% include 'command' cmd: cmd, func: func, return_type_info: return_type_info, states: func.states %}
882
+ {% endfor %}
883
+ {% else %}
884
+ match self.peek() {
885
+ {% for kase in state.cases %}
886
+ {% if kase.is_default %}
887
+ _ => {
888
+ {% elsif kase.is_conditional %}
889
+ _ if {{ kase.condition | rust_expr }} => {
890
+ {% elsif kase.param_ref %}
891
+ Some(b) if b == {{ kase.param_ref }} => {
892
+ {% elsif kase.special_class and kase.chars.size > 0 %}
893
+ Some(b) if Self::is_{{ kase.special_class }}(b){% for ch in kase.chars %} || b == {{ ch | escape_rust_char }}{% endfor %} => {
894
+ {% elsif kase.special_class %}
895
+ Some(b) if Self::is_{{ kase.special_class }}(b) => {
896
+ {% elsif kase.chars.size == 1 %}
897
+ Some({{ kase.chars[0] | escape_rust_char }}) => {
898
+ {% elsif kase.chars.size > 1 %}
899
+ Some({% for ch in kase.chars %}{{ ch | escape_rust_char }}{% unless forloop.last %} | {% endunless %}{% endfor %}) => {
900
+ {% else %}
901
+ Some(_) => {
902
+ {% endif %}
903
+ {% if trace %}eprintln!("TRACE: L{{ kase.lineno }} {{ func.name }}:{{ state.name }}{% if kase.substate %}.{{ kase.substate }}{% endif %} | byte={} term={} pos={}", Self::trace_byte(self.peek()), self.trace_content(), self.pos);{% endif %}
904
+ {% for cmd in kase.commands %}
905
+ {% include 'command' cmd: cmd, func: func, return_type_info: return_type_info, states: func.states %}
906
+ {% endfor %}
907
+ }
908
+ {% endfor %}
909
+ {% unless state.has_default %}
910
+ {% comment %} Add default arm if state has no explicit default case {% endcomment %}
911
+ _ => {
912
+ {% if trace %}eprintln!("TRACE: {{ func.name }}:{{ state.name }} UNHANDLED | byte={} pos={}", Self::trace_byte(self.peek()), self.pos);{% endif %}
913
+ {% if return_type_info.kind == "internal" %}return 0;{% else %}return;{% endif %}
914
+ }
915
+ {% endunless %}
916
+ }
917
+ {% endif %}
918
+ {% endif %}
919
+ {% endif %}
920
+ }
921
+ {% endfor %}
922
+ }
923
+ }
924
+ {% endif %}
925
+
926
+ {% comment %} Note: Unreachable code below - returns happen in command handling {% endcomment %}
927
+ {% comment %} This is intentional - the loop above handles all exits {% endcomment %}
928
+ }
929
+
930
+ {% endfor %}
931
+ }
932
+
933
+ {% if streaming %}
934
+ // ============================================================================
935
+ // Streaming Parser (multi-chunk support)
936
+ // ============================================================================
937
+
938
/// Outcome reported by the streaming parser after it has consumed a chunk.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum ParseResult {
    /// Parsing has ended; the caller should stop feeding chunks.
    ///
    /// NOTE(review): within this file `StreamingParser::parse` returns this
    /// variant only after it has reported a buffer-limit error.
    Complete,
    /// The parser is waiting for further input; supply the next chunk.
    NeedMoreData,
}
946
+
947
+ /// Event with owned content for streaming across chunk boundaries.
948
+ #[derive(Debug, Clone, PartialEq)]
949
+ pub enum StreamEvent {
950
+ {% for type in types %}
951
+ {% if type.kind == "bracket" %}
952
+ {{ type.name }}Start { span: Range<usize> },
953
+ {{ type.name }}End { span: Range<usize> },
954
+ {% elsif type.kind == "content" %}
955
+ {{ type.name }} { content: Vec<u8>, span: Range<usize> },
956
+ {% endif %}
957
+ {% endfor %}
958
+ Error { code: ParseErrorCode, span: Range<usize> },
959
+ }
960
+
961
+ impl StreamEvent {
962
+ /// Convert from borrowed Event to owned StreamEvent.
963
+ fn from_event(event: Event<'_>, offset: usize) -> Self {
964
+ match event {
965
+ {% for type in types %}
966
+ {% if type.kind == "bracket" %}
967
+ Event::{{ type.name }}Start { span } => {
968
+ StreamEvent::{{ type.name }}Start { span: (span.start + offset)..(span.end + offset) }
969
+ }
970
+ Event::{{ type.name }}End { span } => {
971
+ StreamEvent::{{ type.name }}End { span: (span.start + offset)..(span.end + offset) }
972
+ }
973
+ {% elsif type.kind == "content" %}
974
+ Event::{{ type.name }} { content, span } => {
975
+ StreamEvent::{{ type.name }} {
976
+ content: content.into_owned(),
977
+ span: (span.start + offset)..(span.end + offset),
978
+ }
979
+ }
980
+ {% endif %}
981
+ {% endfor %}
982
+ Event::Error { code, span } => {
983
+ StreamEvent::Error { code, span: (span.start + offset)..(span.end + offset) }
984
+ }
985
+ }
986
+ }
987
+ }
988
+
989
/// Parser front-end for input that arrives one chunk at a time.
///
/// Each call to `parse` appends the chunk to an internal buffer and hands the
/// buffered bytes to the single-buffer `Parser`; bytes that cannot be parsed
/// yet stay in the buffer until a later call or until `finish`.
///
/// # Example
///
/// ```ignore
/// let mut parser = StreamingParser::new();
/// loop {
///     match parser.parse(chunk, |event| handle(event)) {
///         ParseResult::Complete => break,
///         ParseResult::NeedMoreData => {
///             chunk = get_next_chunk();
///             if chunk.is_empty() {
///                 parser.finish(|event| handle(event));
///                 break;
///             }
///         }
///     }
/// }
/// ```
pub struct StreamingParser {
    /// Bytes received but not yet parsed.
    buffer: Vec<u8>,
    /// Size limit for `buffer`; exceeding it makes `parse` report an error
    /// (`new` sets it to 4096 bytes).
    max_buffer: usize,
    /// Number of bytes already parsed; added to every emitted span.
    global_offset: usize,
    /// Line number carried over from one chunk to the next.
    line: u32,
    /// Column number carried over from one chunk to the next.
    column: u32,
}
1023
+
1024
+ impl Default for StreamingParser {
1025
+ fn default() -> Self {
1026
+ Self::new()
1027
+ }
1028
+ }
1029
+
1030
+ impl StreamingParser {
1031
+ /// Create a new streaming parser with default settings.
1032
+ pub fn new() -> Self {
1033
+ Self::with_max_buffer(4096)
1034
+ }
1035
+
1036
+ /// Create a streaming parser with custom buffer limit.
1037
+ pub fn with_max_buffer(max_buffer: usize) -> Self {
1038
+ Self {
1039
+ buffer: Vec::new(),
1040
+ max_buffer,
1041
+ global_offset: 0,
1042
+ line: 1,
1043
+ column: 1,
1044
+ }
1045
+ }
1046
+
1047
+ /// Parse a chunk of input, emitting events for complete lines.
1048
+ ///
1049
+ /// Uses line-oriented streaming: only parses complete lines (ending in `\n`).
1050
+ /// Incomplete lines are buffered until more data arrives.
1051
+ ///
1052
+ /// Returns `NeedMoreData` if the chunk ends mid-line.
1053
+ /// Call `finish()` after the last chunk to handle any remaining content.
1054
+ pub fn parse<F>(&mut self, chunk: &[u8], mut on_event: F) -> ParseResult
1055
+ where
1056
+ F: FnMut(StreamEvent),
1057
+ {
1058
+ // Append new chunk to buffer
1059
+ self.buffer.extend_from_slice(chunk);
1060
+
1061
+ if self.buffer.is_empty() {
1062
+ return ParseResult::NeedMoreData;
1063
+ }
1064
+
1065
+ // Check buffer size limit
1066
+ if self.buffer.len() > self.max_buffer {
1067
+ on_event(StreamEvent::Error {
1068
+ code: ParseErrorCode::UnexpectedEof, // Buffer overflow
1069
+ span: self.global_offset..self.global_offset,
1070
+ });
1071
+ self.buffer.clear();
1072
+ return ParseResult::Complete;
1073
+ }
1074
+
1075
+ // Find last complete line (ending with \n)
1076
+ let last_newline = self.buffer.iter().rposition(|&b| b == b'\n');
1077
+
1078
+ let parse_end = match last_newline {
1079
+ Some(pos) => pos + 1, // Include the newline
1080
+ None => return ParseResult::NeedMoreData, // No complete line yet
1081
+ };
1082
+
1083
+ // Extract complete lines to parse
1084
+ let to_parse: Vec<u8> = self.buffer.drain(..parse_end).collect();
1085
+ let offset = self.global_offset;
1086
+
1087
+ // Parse complete lines
1088
+ let mut inner = Parser::new(&to_parse);
1089
+ inner.line = self.line;
1090
+ inner.column = self.column;
1091
+
1092
+ inner.parse(|event| {
1093
+ on_event(StreamEvent::from_event(event, offset));
1094
+ });
1095
+
1096
+ // Update state
1097
+ self.global_offset += to_parse.len();
1098
+ for &b in &to_parse {
1099
+ if b == b'\n' {
1100
+ self.line += 1;
1101
+ self.column = 1;
1102
+ } else {
1103
+ self.column += 1;
1104
+ }
1105
+ }
1106
+
1107
+ ParseResult::NeedMoreData
1108
+ }
1109
+
1110
+ /// Signal end of input and handle any remaining buffered content.
1111
+ ///
1112
+ /// This triggers EOF handling for any incomplete constructs.
1113
+ pub fn finish<F>(mut self, mut on_event: F)
1114
+ where
1115
+ F: FnMut(StreamEvent),
1116
+ {
1117
+ if self.buffer.is_empty() {
1118
+ return;
1119
+ }
1120
+
1121
+ // Parse remaining buffer - this will hit EOF
1122
+ let input = std::mem::take(&mut self.buffer);
1123
+ let offset = self.global_offset;
1124
+
1125
+ let mut inner = Parser::new(&input);
1126
+ inner.line = self.line;
1127
+ inner.column = self.column;
1128
+
1129
+ inner.parse(|event| {
1130
+ on_event(StreamEvent::from_event(event, offset));
1131
+ });
1132
+ }
1133
+
1134
+ /// Returns the current global byte offset.
1135
+ pub fn offset(&self) -> usize {
1136
+ self.global_offset
1137
+ }
1138
+
1139
+ /// Returns the current line number.
1140
+ pub fn line(&self) -> u32 {
1141
+ self.line
1142
+ }
1143
+
1144
+ /// Returns the current column number.
1145
+ pub fn column(&self) -> u32 {
1146
+ self.column
1147
+ }
1148
+ }
1149
+ {% endif %}
1150
+
1151
#[cfg(test)]
mod tests {
    use super::*;

    /// Smoke test: runs the parser over empty input and collects its events.
    /// Nothing is asserted about the events, since they depend on the
    /// grammar the parser was generated from.
    #[test]
    fn test_parse_basic() {
        let mut collected = Vec::new();
        Parser::new(b"").parse(|event| collected.push(event));
    }
}