descent 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1479 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Descent
4
+ # Unified character class parser according to characters.md spec.
5
+ #
6
+ # Handles all character literal and class syntax:
7
+ # - Single chars: 'x', '\n', '\x00'
8
+ # - Strings: 'hello' (decomposed to chars for classes)
9
+ # - Classes: <...> with space-separated tokens
10
+ # - Predefined classes: LETTER, DIGIT, SQ, P, 0-9, etc.
11
+ # - Empty class: <> (empty set / empty string)
12
+ # - Param refs: :name
13
+ #
14
+ # The same parsing is used everywhere: c[...], function args, PREPEND
15
module CharacterClass
  # Predefined single-character classes: names for characters the DSL reserves.
  SINGLE_CHAR = {
    'P' => '|',
    'L' => '[',
    'R' => ']',
    'LB' => '{',
    'RB' => '}',
    'LP' => '(',
    'RP' => ')',
    'SQ' => "'",
    'DQ' => '"',
    'BS' => '\\'
  }.freeze

  # Predefined character ranges, looked up by their exact (case-sensitive) spelling.
  RANGES = {
    '0-9' => '0123456789',
    '0-7' => '01234567',
    '0-1' => '01',
    'a-z' => 'abcdefghijklmnopqrstuvwxyz',
    'A-Z' => 'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
    'a-f' => 'abcdef',
    'A-F' => 'ABCDEF'
  }.freeze

  # Predefined multi-character classes (expanded to explicit character sets).
  MULTI_CHAR = {
    'LETTER' => RANGES['a-z'] + RANGES['A-Z'],
    'DIGIT' => RANGES['0-9'],
    'HEX_DIGIT' => RANGES['0-9'] + RANGES['a-f'] + RANGES['A-F'],
    'LABEL_CONT' => "#{RANGES['a-z']}#{RANGES['A-Z']}#{RANGES['0-9']}_-",
    'WS' => " \t",
    'NL' => "\n"
  }.freeze

  # Special classes that require runtime checks (cannot be expanded to a char list).
  SPECIAL_CLASSES = %w[XID_START XID_CONT XLBL_START XLBL_CONT].freeze

  # Backslash escapes inside '...' that stand for exactly one fixed character.
  # Any escape not listed here (and not \xHH / \uXXXX) yields the escaped char itself.
  SIMPLE_ESCAPES = {
    'n' => "\n",
    't' => "\t",
    'r' => "\r",
    '\\' => '\\',
    "'" => "'",
    '"' => '"',
    '0' => "\0"
  }.freeze

  class << self
    # Parse a class specification string and return a structured result.
    #
    # Recognised forms, checked in this order: `<...>` class wrapper, `:name`
    # param reference, `'...'` quoted string (escapes processed), `"..."`
    # double-quoted string (taken verbatim), bare `[A-Za-z0-9_-]+` shorthand,
    # and finally anything else, which is taken as literal characters.
    #
    # NOTE(review): predefined class names are looked up after upcasing, so a
    # bare `p` resolves to SINGLE_CHAR['P'] ('|'), not the letter "p". Confirm
    # this case-insensitivity is intended by the characters.md spec.
    #
    # @param str [String, nil] The class specification (contents of c[...] or <...> or bare)
    # @param context [Symbol] forwarded unchanged to nested token parses; not otherwise used here
    # @return [Hash] { chars: Array<String>, special_class: nil|Symbol,
    #   param_ref: nil|String, bytes: String|nil } (bytes is nil for
    #   special classes and param references)
    def parse(str, context: :match)
      return build_result if str.nil? || str.empty?

      str = str.strip

      # Explicit class wrapper <...>; `<>` is the empty set.
      if str.start_with?('<') && str.end_with?('>')
        inner = str[1...-1].strip
        return build_result if inner.empty?

        return parse_class_content(inner, context)
      end

      # Param reference :name
      return build_result(param_ref: str[1..], bytes: nil) if str.start_with?(':')

      # Single-quoted string: escape sequences are decoded.
      if str.match?(/^'.*'$/)
        content = parse_quoted_string(str[1...-1])
        return build_result(chars: content.chars, bytes: content)
      end

      # Double-quoted string: content is used as-is (no escape processing).
      if str.match?(/^".*"$/)
        content = str[1...-1]
        return build_result(chars: content.chars, bytes: content)
      end

      # Bare shorthand: a predefined name, a predefined range, or plain characters.
      if str.match?(/^[A-Za-z0-9_-]+$/)
        upper = str.upcase
        if SPECIAL_CLASSES.include?(upper)
          return build_result(special_class: upper.downcase.to_sym, bytes: nil)
        end

        expansion = MULTI_CHAR[upper] || SINGLE_CHAR[upper] || RANGES[str] || str
        return build_result(chars: expansion.chars, bytes: expansion)
      end

      # Unquoted special characters: currently accepted as literal bytes.
      build_result(chars: str.chars, bytes: str)
    end

    # Parse the content inside <...> (space-separated tokens) and merge the
    # per-token results. If several tokens name a special class or a param
    # reference, the last one wins.
    #
    # @param content [String, nil] text between the angle brackets
    # @param context [Symbol] forwarded to {parse}
    # @return [Hash] same shape as {parse}; chars are de-duplicated
    def parse_class_content(content, context)
      return build_result if content.nil? || content.empty?

      chars = []
      bytes = +''
      special_class = nil
      param_ref = nil

      tokenize_class_content(content).each do |token|
        parsed = parse(token, context: context)

        if parsed[:special_class]
          special_class = parsed[:special_class]
        elsif parsed[:param_ref]
          param_ref = parsed[:param_ref]
        else
          chars.concat(parsed[:chars]) if parsed[:chars]
          # Tokens may disagree on encoding (raw \xHH bytes vs. UTF-8 text).
          append_compat(bytes, parsed[:bytes]) if parsed[:bytes]
        end
      end

      build_result(chars: chars.uniq, special_class: special_class, param_ref: param_ref, bytes: bytes)
    end

    # Split class content on spaces, keeping single-quoted runs (including
    # backslash escapes inside them) together as one token.
    #
    # @param content [String]
    # @return [Array<String>] non-empty tokens in source order
    def tokenize_class_content(content)
      tokens = []
      current = +''
      in_quote = false
      i = 0

      while i < content.length
        c = content[i]

        if c == "'"
          in_quote = !in_quote
          current << c
        elsif c == '\\' && in_quote && i + 1 < content.length
          # Keep the escaped character attached so an escaped quote does not close the string.
          current << c << content[i + 1]
          i += 1
        elsif c == ' ' && !in_quote
          tokens << current unless current.empty?
          current = +''
        else
          current << c
        end

        i += 1
      end

      tokens << current unless current.empty?
      tokens
    end

    # Decode the escape sequences of a single-quoted string body.
    #
    # Supports \n \t \r \\ \' \" \0, \xHH (one raw byte) and \uXXXX (one
    # UTF-8 encoded code point). A malformed \x / \u, or any other escaped
    # character, yields that character itself; a trailing lone backslash is
    # kept. When raw high bytes and non-ASCII text are mixed, the result is
    # returned as a binary string instead of raising
    # Encoding::CompatibilityError.
    #
    # @param str [String] text between the quotes
    # @return [String] decoded text
    def parse_quoted_string(str)
      result = +''
      i = 0

      while i < str.length
        ch = str[i]
        code = str[i + 1]

        if ch != '\\' || code.nil?
          append_compat(result, ch)
          i += 1
          next
        end

        consumed = 2
        piece = code
        case code
        when 'x'
          hex = str[i + 2, 2]
          if hex&.match?(/\A[0-9A-Fa-f]{2}\z/)
            piece = hex.to_i(16).chr
            consumed = 4
          end
        when 'u'
          hex = str[i + 2, 4]
          if hex&.match?(/\A[0-9A-Fa-f]{4}\z/)
            piece = hex.to_i(16).chr(Encoding::UTF_8)
            consumed = 6
          end
        else
          piece = SIMPLE_ESCAPES.fetch(code, code)
        end

        append_compat(result, piece)
        i += consumed
      end

      result
    end

    # Convert a parsed result to Rust byte literal format for a :byte param (u8).
    #
    # @param result [Hash] a hash shaped like the return value of {parse}
    # @return [String] the param name for a param reference, `b'x'` for the
    #   first byte/char, or the `0u8` "never match" sentinel when the result
    #   carries no literal characters (including special-class results whose
    #   :bytes is nil, which previously raised NoMethodError)
    def to_rust_byte(result)
      return result[:param_ref] if result[:param_ref]

      char = result[:bytes].to_s[0] || Array(result[:chars]).first
      return '0u8' if char.nil? # Empty = never match sentinel

      escape_rust_byte(char)
    end

    # Convert a parsed result to Rust byte string format for a :bytes param (&[u8]).
    #
    # @param result [Hash] a hash shaped like the return value of {parse}
    # @return [String] the param name, or a `b"..."` literal (`b""` when empty)
    def to_rust_bytes(result)
      return result[:param_ref] if result[:param_ref]
      return 'b""' if result[:bytes].nil? || result[:bytes].empty?

      "b\"#{escape_rust_byte_string(result[:bytes])}\""
    end

    # Escape a single character for a Rust byte literal b'x'.
    #
    # NOTE(review): escaping here is by code point; a character above U+00FF
    # cannot fit in a u8 and would produce an out-of-range \x escape. Callers
    # are presumably expected to pass single-byte characters only.
    #
    # @param char [String] one character
    # @return [String] the complete literal, e.g. `b'\n'`
    def escape_rust_byte(char)
      escaped = case char
                when "\n" then '\\n'
                when "\t" then '\\t'
                when "\r" then '\\r'
                when "\0" then '\\0'
                when '\\' then '\\\\'
                when "'" then "\\'"
                else
                  if char.ord < 32 || char.ord > 126
                    format('\\x%02x', char.ord)
                  else
                    char
                  end
                end
      "b'#{escaped}'"
    end

    # Escape a string for the inside of a Rust byte string literal b"...".
    #
    # Works byte by byte, so multi-byte UTF-8 characters become one \xNN
    # escape per encoded byte.
    #
    # @param str [String]
    # @return [String] escaped body (without the surrounding b"...")
    def escape_rust_byte_string(str)
      str.each_byte.map do |byte|
        case byte
        when 0x0A then '\\n'
        when 0x09 then '\\t'
        when 0x0D then '\\r'
        when 0x00 then '\\0'
        when 0x5C then '\\\\'
        when 0x22 then '\\"'
        when 32..126 then byte.chr
        else format('\\x%02x', byte)
        end
      end.join
    end

    private

    # Build the result hash returned by the parse methods (defaults = empty set).
    def build_result(chars: [], special_class: nil, param_ref: nil, bytes: '')
      { chars: chars, special_class: special_class, param_ref: param_ref, bytes: bytes }
    end

    # Append piece to buffer; when their encodings cannot be combined (raw
    # high bytes next to non-ASCII text) both are treated as raw bytes.
    def append_compat(buffer, piece)
      unless Encoding.compatible?(buffer, piece)
        buffer.force_encoding(Encoding::BINARY)
        piece = piece.b
      end
      buffer << piece
    end
  end
end
298
+
299
+ # Transforms AST into IR with semantic analysis.
300
+ #
301
+ # Responsibilities:
302
+ # - Resolve type references
303
+ # - Infer SCAN optimization characters from state structure
304
+ # - Infer EOF handling requirements
305
+ # - Collect local variable declarations
306
+ # - Validate consistency
307
+ class IRBuilder
308
# @param ast [Object] parsed grammar AST; kept for use by #build
def initialize(ast)
  @ast = ast
end
309
+
310
# Run the full AST -> IR transformation.
#
# Types are built first because function building consults them. Custom
# error codes are collected from the freshly built functions, after which
# the functions go through the prepend-value pass and then the call-argument
# pass.
#
# @return [IR::Parser]
def build
  types = build_types(@ast.types)
  built_functions = @ast.functions.map { |fn| build_function(fn, types) }
  keywords = @ast.keywords.map { |decl| build_keywords(decl) }

  # Custom error codes come from /error(code) calls.
  custom_error_codes = collect_custom_error_codes(built_functions)

  # Prepend values are traced through call sites, then call arguments are
  # rewritten according to the target parameter types.
  with_prepends = collect_prepend_values(built_functions)
  functions = transform_call_args_by_type(with_prepends)

  IR::Parser.new(
    name: @ast.name,
    entry_point: @ast.entry_point,
    types: types,
    functions: functions,
    keywords: keywords,
    custom_error_codes: custom_error_codes
  )
end
333
+
334
+ private
335
+
336
# Transform an AST keywords declaration into IR::Keywords.
#
# The fallback is a function call written as "/name" or "/name(args)".
# Any other shape (or no fallback at all) leaves both fallback fields nil.
#
# @param kw [#name, #fallback, #mappings, #lineno]
# @return [IR::Keywords]
def build_keywords(kw)
  fallback_func, fallback_args =
    case kw.fallback
    when %r{^/(\w+)\(([^)]*)\)$} then [::Regexp.last_match(1), ::Regexp.last_match(2)]
    when %r{^/(\w+)$} then [::Regexp.last_match(1), nil]
    end

  IR::Keywords.new(
    name: kw.name,
    fallback_func: fallback_func,
    fallback_args: fallback_args,
    mappings: kw.mappings,
    lineno: kw.lineno
  )
end
359
+
360
# Convert AST type declarations into IR::TypeInfo records.
#
# Only declarations whose kind is :BRACKET are flagged as emitting start and
# end events; the kind itself is stored downcased.
#
# @param type_decls [Array<#name, #kind, #lineno>]
# @return [Array<IR::TypeInfo>]
def build_types(type_decls)
  type_decls.map do |decl|
    bracket = decl.kind == :BRACKET

    IR::TypeInfo.new(
      name: decl.name,
      kind: decl.kind.downcase.to_sym,
      emits_start: bracket,
      emits_end: bracket,
      lineno: decl.lineno
    )
  end
end
374
+
375
# Build the IR for one function declaration.
#
# @param func [Object] AST function (name, return_type, params, states,
#   eof_handler, entry_actions, lineno are read)
# @param types [Array<IR::TypeInfo>] previously built types, searched by name
#   for the function's return type
# @return [IR::Function]
def build_function(func, types)
  ret_info = types.find { |info| info.name == func.return_type }
  emits_events = ret_info&.bracket? || ret_info&.content?

  locals = infer_locals(func)
  states = func.states.map { |state| build_state(state, func.params) }

  # Expected closing delimiter is inferred from the return cases.
  expects_char, emits_content_on_close = infer_expects(states)

  # Parameter types are inferred from how the states use each parameter.
  param_types = infer_param_types(func.params, states)

  # Function-level EOF handler gets the same inline-emit fix as case commands.
  eof_handler = func.eof_handler&.commands&.map { |cmd| build_command(cmd) }
  eof_handler &&= mark_returns_after_inline_emits(eof_handler)

  # Initialization commands executed on function entry.
  entry_actions = func.entry_actions&.map { |cmd| build_command(cmd) } || []

  IR::Function.new(
    name: func.name,
    return_type: func.return_type,
    params: func.params,
    param_types: param_types,
    locals: locals,
    states: states,
    eof_handler: eof_handler,
    entry_actions: entry_actions,
    emits_events: emits_events,
    expects_char: expects_char,
    emits_content_on_close: emits_content_on_close,
    lineno: func.lineno
  )
end
411
+
412
# Build the IR for one state, including the derived scan/loop metadata.
#
# @param state [Object] AST state (name, cases, eof_handler, lineno are read)
# @param params [Array<String>] parameter names of the enclosing function
# @return [IR::State]
def build_state(state, params = [])
  cases = state.cases.map { |kase| build_case(kase, params) }
  scan_chars = infer_scan_chars(state, cases)
  is_self_looping = cases.any? { |kase| kase.default? && has_self_transition?(kase) }
  has_default = cases.any?(&:default?)

  # The state is unconditional when its first case matches nothing at all:
  # no chars, no special class, no param reference (a param ref IS a match)
  # and no condition.
  lead = cases.first
  is_unconditional = lead && [lead.chars, lead.special_class, lead.param_ref, lead.condition].all?(&:nil?)

  # State-level EOF handler gets the same inline-emit fix as case commands.
  eof_commands = state.eof_handler&.commands&.map { |cmd| build_command(cmd) }
  eof_commands &&= mark_returns_after_inline_emits(eof_commands)

  # Add "\n" as a scan target (checked first) when it is not already one and
  # fewer than 6 targets are in use, so scans stop at newlines for line/column
  # tracking.
  newline_injected = scan_chars ? (!scan_chars.include?("\n") && scan_chars.size < 6) : false
  scan_chars = ["\n"] + scan_chars if newline_injected

  IR::State.new(
    name: state.name,
    cases: cases,
    eof_handler: eof_commands,
    scan_chars: scan_chars,
    is_self_looping: is_self_looping,
    has_default: has_default,
    is_unconditional: is_unconditional,
    newline_injected: newline_injected,
    lineno: state.lineno
  )
end
453
+
454
# Validate one AST case and convert it to IR::Case.
#
# Validation runs before any parsing, in this order: character syntax (only
# when the case has chars), PREPEND commands, call arguments.
#
# @param kase [Object] AST case (chars, commands, condition, substate, lineno are read)
# @param params [Array<String>] parameter names of the enclosing function
# @return [IR::Case]
# @raise [ValidationError] when one of the validators rejects the case
def build_case(kase, params = [])
  lineno = kase.lineno
  validate_char_syntax(kase.chars, lineno) if kase.chars
  validate_prepend_commands(kase.commands, params, lineno)
  validate_call_args(kase.commands, params, lineno)

  chars, special_class, param_ref = parse_chars(kase.chars, params: params)

  # Fix #11: a bare return that follows an inline emit must not auto-emit
  # again, otherwise CONTENT functions would emit twice.
  built = kase.commands.map { |cmd| build_command(cmd) }
  commands = mark_returns_after_inline_emits(built)

  IR::Case.new(
    chars: chars,
    special_class: special_class,
    param_ref: param_ref,
    condition: kase.condition,
    substate: kase.substate,
    commands: commands,
    lineno: kase.lineno
  )
end
475
+
476
# Flag bare returns that come after an inline emit so they skip the auto-emit.
#
# Example: in `| Float(USE_MARK) |return` the inline emit has already
# emitted, so the return is rebuilt with `suppress_auto_emit: true`.
# Returns that carry an :emit_type or :return_value, and every other
# command, pass through untouched.
#
# @param commands [Array<IR::Command>]
# @return [Array<IR::Command>] same length and order as the input
def mark_returns_after_inline_emits(commands)
  seen_inline_emit = false

  commands.map do |cmd|
    if %i[inline_emit_bare inline_emit_mark inline_emit_literal].include?(cmd.type)
      seen_inline_emit = true
      next cmd
    end

    bare_return_after_emit = cmd.type == :return && seen_inline_emit &&
                             cmd.args[:emit_type].nil? && cmd.args[:return_value].nil?
    next cmd unless bare_return_after_emit

    IR::Command.new(type: :return, args: cmd.args.merge(suppress_auto_emit: true))
  end
end
499
+
500
# Convert one AST command into an IR::Command.
#
# AST::Conditional nodes become a :conditional command whose clauses are
# hashes with string keys 'condition' and 'commands' (nested commands are
# converted recursively). Every other command keeps its type and gets an
# args hash whose shape depends on that type.
#
# @param cmd [AST::Conditional, #type, #value]
# @return [IR::Command]
# @raise [ValidationError] for a command type not listed below
def build_command(cmd)
  if cmd.is_a?(AST::Conditional)
    clauses = cmd.clauses&.map do |clause|
      {
        'condition' => clause.condition,
        'commands' => clause.commands.map { |nested| build_command(nested) }
      }
    end
    return IR::Command.new(type: :conditional, args: { clauses: clauses })
  end

  args =
    case cmd.type
    when :advance, :mark, :noop
      {}
    when :assign, :add_assign, :sub_assign, :inline_emit_literal
      # These carry a pre-parsed Hash; anything else is dropped.
      cmd.value.is_a?(Hash) ? cmd.value : {}
    when :emit, :call_method, :transition, :error
      { value: cmd.value }
    when :advance_to
      { value: validate_advance_to(cmd.value, cmd.lineno) }
    when :scan
      { value: process_escapes(cmd.value) }
    when :call
      parse_call_value(cmd.value)
    when :return
      parse_return_value(cmd.value)
    when :inline_emit_bare, :inline_emit_mark
      { type: cmd.value }
    when :term
      { offset: cmd.value || 0 }
    when :prepend
      { literal: process_escapes(cmd.value) }
    when :prepend_param
      { param_ref: cmd.value }
    when :keywords_lookup
      { name: cmd.value }
    else
      raise ValidationError, "Unknown command type: #{cmd.type.inspect}"
    end

  IR::Command.new(type: cmd.type, args: args)
end
536
+
537
# Resolve a character class/literal specification to its literal bytes
# using the unified CharacterClass parser.
#
# @param str [String, nil]
# @return [String] the parsed :bytes, or '' when the input is nil/empty or
#   the parser reports no bytes (its :bytes is nil)
def process_escapes(str)
  return '' unless str && !str.empty?

  CharacterClass.parse(str)[:bytes] || ''
end
545
+
546
# Validate the argument of an advance-to command (->[...]) and return its bytes.
#
# Only literal bytes are accepted, between 1 and 6 of them; special classes
# and parameter references are rejected.
#
# @param str [String, nil] raw text between the brackets
# @param lineno [Integer] source line, used in error messages
# @return [String] the literal bytes to scan for
# @raise [ValidationError] when the argument is empty, is a special class or
#   param reference, resolves to no bytes, or has more than 6 characters
def validate_advance_to(str, lineno)
  raise ValidationError, "L#{lineno}: ->[] requires at least one character" if str.nil? || str.empty?

  parsed = CharacterClass.parse(str)
  literal_only = 'Only literal bytes are supported (uses SIMD memchr).'

  if parsed[:special_class]
    raise ValidationError,
          "L#{lineno}: ->[] does not support character classes like #{str.upcase}. #{literal_only}"
  end

  if parsed[:param_ref]
    raise ValidationError,
          "L#{lineno}: ->[] does not support parameter references like :#{parsed[:param_ref]}. #{literal_only}"
  end

  bytes = parsed[:bytes] || ''
  raise ValidationError, "L#{lineno}: ->[] resolved to empty bytes from '#{str}'" if bytes.empty?
  return bytes unless bytes.length > 6

  raise ValidationError,
        "L#{lineno}: ->[#{str}] has #{bytes.length} chars but maximum is 6 " \
        '(chained memchr limit). Split into multiple scans or restructure grammar.'
end
577
+
578
+ # Characters that MUST be quoted or use predefined class names in c[...]
579
+ # These cause lexer/parser issues if used bare
580
+ MUST_QUOTE_CHARS = {
581
+ "'" => '<SQ>', # Single quote - causes unterminated quote issues
582
+ '|' => '<P>', # Pipe - DSL delimiter
583
+ '[' => '<L>', # Open bracket - DSL delimiter
584
+ ']' => '<R>', # Close bracket - DSL delimiter
585
+ ' ' => "' ' or <WS>" # Space - invisible, easy to miss
586
+ }.freeze
587
+
588
+ # Characters that SHOULD be quoted for clarity (warnings, not errors)
589
+ SHOULD_QUOTE_CHARS = {
590
+ '{' => '<LB>',
591
+ '}' => '<RB>',
592
+ '(' => '<LP>',
593
+ ')' => '<RP>',
594
+ '"' => '<DQ>',
595
+ '\\' => '<BS>'
596
+ }.freeze
597
+
598
# Validate character syntax in c[...] before parsing.
#
# Accepted without further checks:
# - c[<...>]   - class syntax (space-separated tokens inside)
# - c['...']   - quoted string/char
# - c[:param]  - parameter reference
# - c[CLASS]   - SCREAMING_CASE name (LETTER, DIGIT, LABEL_CONT, ...)
#
# Otherwise the text may only contain A-Za-z0-9_- outside of single quotes
# (backslashes are let through). Rejected, for example:
# - c["]  - special chars must be quoted: c['"']
# - c[ ]  - spaces must be quoted: c[' ']
# - c[|]  - DSL chars must use escapes: c[<P>] or c['|']
#
# The quote state is tracked in a single left-to-right pass: a character is
# "inside quotes" when an odd number of single quotes precede it.
#
# @param chars_str [String, nil] raw text between c[ and ]
# @param lineno [Integer] source line, used in error messages
# @raise [ValidationError] on the first problem found
def validate_char_syntax(chars_str, lineno)
  return if chars_str.nil? || chars_str.empty?

  # Whole spec wrapped in <...>: proper class syntax.
  return if chars_str.start_with?('<') && chars_str.end_with?('>')

  # Whole spec is one quoted string.
  return if chars_str.start_with?("'") && chars_str.end_with?("'") && chars_str.length >= 2

  # Parameter reference (: followed by an identifier).
  return if chars_str.match?(/^:[a-z_]\w*$/i)

  # Pure SCREAMING_CASE class name.
  return if chars_str.match?(/^[A-Z]+(?:_[A-Z]+)*$/)

  # <TOKEN> escapes are only meaningful inside a <...> wrapper.
  if chars_str.match?(/<[A-Z]+>/)
    raise ValidationError, "Line #{lineno}: Escape sequence like <SQ>, <P> etc. found outside " \
                           "class wrapper in c[#{chars_str}]. " \
                           'Wrap everything in a class: c[<...>] not c[THING <ESC> ...]'
  end

  # Class name immediately followed by a quote (e.g., LETTER'[.?!).
  if chars_str.match?(/^[A-Z]+(?:_[A-Z]+)*'/)
    class_name = chars_str.match(/^([A-Z_]+)/)[1]
    raise ValidationError, "Line #{lineno}: Invalid character syntax in c[#{chars_str}]. " \
                           'Bare quote after class name is ambiguous. ' \
                           "Use class syntax instead: c[<#{class_name} ...>]"
  end

  # Quotes must come in pairs.
  if chars_str.count("'").odd?
    raise ValidationError, "Line #{lineno}: Unterminated quote in c[#{chars_str}]. " \
                           'Single quotes must be paired. ' \
                           "To match a literal quote, use c[<SQ>] or c['\\'']"
  end

  inside_quotes = false
  chars_str.each_char do |ch|
    if ch == "'"
      inside_quotes = !inside_quotes
      next
    end
    next if inside_quotes
    next if ch == '\\' # Backslashes are not rejected here
    next if ch.match?(/[A-Za-z0-9_-]/)

    suggestion = case ch
                 when '|' then "c[<P>] or c['|']"
                 when '[' then "c[<L>] or c['[']"
                 when ']' then "c[<R>] or c[']']"
                 when '{' then "c[<LB>] or c['{']"
                 when '}' then "c[<RB>] or c['}']"
                 when '(' then "c[<LP>] or c['(']"
                 when ')' then "c[<RP>] or c[')']"
                 when '"' then "c[<DQ>] or c['\"']"
                 when ' ' then "c[<WS>] or c[' ']"
                 when "\t" then "c['\\t']"
                 when "\n" then "c['\\n']"
                 else "c['#{ch}']"
                 end

    raise ValidationError, "Line #{lineno}: Unquoted '#{ch.inspect[1...-1]}' in c[#{chars_str}]. " \
                           "Characters outside /A-Za-z0-9_-/ must be quoted. Use #{suggestion}"
  end
end
682
+
683
# Catch PREPEND(param) written where PREPEND(:param) was probably meant.
#
# A :prepend command whose value, once stripped, is a bare identifier equal
# to one of the function's parameter names is rejected. Quoted literals and
# class syntax are not bare identifiers and so pass.
#
# @param commands [Array<#type, #value>]
# @param params [Array<String>] parameter names of the enclosing function
# @param lineno [Integer] source line, used in the error message
# @raise [ValidationError] on the first suspicious PREPEND
def validate_prepend_commands(commands, params, lineno)
  return if params.empty?

  commands.each do |cmd|
    next if cmd.type != :prepend || cmd.value.nil?

    literal = cmd.value.to_s.strip
    looks_like_param = literal.match?(/^[a-z_]\w*$/i) && params.include?(literal)
    next unless looks_like_param

    raise ValidationError, "Line #{lineno}: PREPEND(#{literal}) looks like a parameter reference. " \
                           "Use PREPEND(:#{literal}) to reference the '#{literal}' parameter, " \
                           "or PREPEND('#{literal}') for a literal string."
  end
end
705
+
706
# Catch /func(param) written where /func(:param) was probably meant.
#
# For every :call command with an argument list, each argument that is a
# bare identifier equal to one of the function's parameter names is rejected.
# Arguments that are already unambiguous are skipped: :refs, quoted text,
# <class> syntax, integers, ALL-CAPS names (built-in vars such as COL, LINE,
# PREV), and anything containing a space, a dot or a parenthesis.
#
# @param commands [Array<#type, #value>]
# @param params [Array<String>] parameter names of the enclosing function
# @param lineno [Integer] source line, used in the error message
# @raise [ValidationError] on the first ambiguous argument
def validate_call_args(commands, params, lineno)
  return if params.empty?

  commands.each do |cmd|
    next unless cmd.type == :call && !cmd.value.nil?

    # cmd.value looks like "text(prepend)" or just "func".
    call_str = cmd.value.to_s
    next unless call_str.include?('(')

    args_str = call_str[/\((.+)\)/, 1]
    next if args_str.nil? || args_str.empty?

    tokenize_call_args_for_validation(args_str).each do |raw_arg|
      arg = raw_arg.strip

      next if arg.start_with?(':', "'", '"', '<')                 # reference, literal or class
      next if arg.match?(/^-?\d+$/) || arg.match?(/^[A-Z]+$/)     # number or built-in var
      next if [' ', '.', '('].any? { |marker| arg.include?(marker) } # expression or call

      next unless arg.match?(/^[a-z_]\w*$/i) && params.include?(arg)

      raise ValidationError, "Line #{lineno}: /...(...#{arg}...) - bare identifier '#{arg}' matches a parameter name. " \
                             "Use ':#{arg}' to pass the parameter value, or \"'#{arg}'\" for a literal string."
    end
  end
end
747
+
748
# Split a call's argument text on top-level commas.
#
# Commas inside single quotes or inside <...> do not split. Within quotes a
# backslash escapes the next character (so '\'' is one complete literal) and
# angle brackets are plain text (so '<' does not open a class).
#
# @param args_str [String] text between the call's parentheses
# @return [Array<String>] arguments with surrounding whitespace stripped
def tokenize_call_args_for_validation(args_str)
  args = []
  current = +''
  in_quote = false
  escaped = false
  angle_depth = 0

  args_str.each_char do |c|
    if in_quote
      current << c
      if escaped
        escaped = false
      elsif c == '\\'
        escaped = true
      elsif c == "'"
        in_quote = false
      end
      next
    end

    case c
    when "'"
      in_quote = true
      current << c
    when '<'
      angle_depth += 1
      current << c
    when '>'
      angle_depth -= 1 if angle_depth.positive?
      current << c
    when ','
      if angle_depth.positive?
        current << c
      else
        args << current.strip
        current = +''
      end
    else
      current << c
    end
  end

  args << current.strip unless current.empty?
  args
end
781
+
782
# Parse a c[...] character specification with the unified CharacterClass
# parser and reduce it to the triple the IR stores on a case.
#
# Examples (per the CharacterClass parser in this file):
#   nil            -> [nil, nil, nil]
#   "abc"          -> [["a", "b", "c"], nil, nil]
#   "'x'"          -> [["x"], nil, nil]                 (quoted char)
#   "<0-9>"        -> [["0", ..., "9"], nil, nil]       (predefined range)
#   "XID_START"    -> [nil, :xid_start, nil]            (runtime-checked class)
#   ":close"       -> [nil, nil, "close"]               (when "close" is a param)
#   ":close"       -> [[":", "c", "l", ...], nil, nil]  (when it is not a param)
#
# @param chars_str [String, nil] raw text between c[ and ]
# @param params [Array<String>] parameter names of the enclosing function
# @return [Array(Array<String>|nil, Symbol|nil, String|nil)]
#   [chars, special_class, param_ref]; chars is nil rather than empty
def parse_chars(chars_str, params: [])
  return [nil, nil, nil] if chars_str.nil?

  parsed = CharacterClass.parse(chars_str)
  ref = parsed[:param_ref]

  # A reference to an unknown param is taken as the literal text ":name".
  return [":#{ref}".chars, nil, nil] if ref && !params.include?(ref)

  literal_chars = parsed[:chars]
  [literal_chars.empty? ? nil : literal_chars, parsed[:special_class], ref]
end
819
+
820
# Legacy: decode escape sequences in a quoted string body.
# Kept for backwards compatibility; CharacterClass.parse_quoted_string is preferred.
#
# Handles \n, \t, \r, \xHH (one byte) and \uXXXX (one UTF-8 code point).
# Any other escaped character - including a malformed \x or \u - yields
# that character itself, and a trailing lone backslash is kept.
#
# @param str [String, nil]
# @return [String] decoded text ('' for nil/empty input)
def parse_quoted_string(str)
  return '' if str.nil? || str.empty?

  control = { 'n' => "\n", 't' => "\t", 'r' => "\r" }
  decoded = +''
  pos = 0
  len = str.length

  while pos < len
    ch = str[pos]

    unless ch == '\\' && pos + 1 < len
      decoded << ch
      pos += 1
      next
    end

    code = str[pos + 1]
    width = 2 # backslash + escape letter

    if %w[x u].include?(code)
      digits = code == 'x' ? 2 : 4
      hex = str[pos + 2, digits]
      if hex.length == digits && hex.match?(/\A[0-9A-Fa-f]+\z/)
        value = hex.to_i(16)
        decoded << (code == 'x' ? value.chr : value.chr(Encoding::UTF_8))
        width += digits
      else
        decoded << code
      end
    else
      decoded << control.fetch(code, code)
    end

    pos += width
  end

  decoded
end
862
+
863
# Parse the value of a return command into its IR args.
#
# Examples:
#   nil or ""            -> {}  (default behavior)
#   "TypeName(USE_MARK)" -> { emit_type: "TypeName", emit_mode: :mark }
#   "TypeName(lit)"      -> { emit_type: "TypeName", emit_mode: :literal, literal: <bytes of lit> }
#   "TypeName"           -> { emit_type: "TypeName", emit_mode: :bare }
#   "varname"            -> { return_value: "varname" }  (INTERNAL types returning a value)
#   anything else        -> {}
#
# @param value [String, nil]
# @return [Hash]
def parse_return_value(value)
  return {} if value.nil? || value.empty?

  if (m = /^([A-Z]\w*)\(USE_MARK\)$/.match(value))
    { emit_type: m[1], emit_mode: :mark }
  elsif (m = /^([A-Z]\w*)\(([^)]+)\)$/.match(value))
    { emit_type: m[1], emit_mode: :literal, literal: process_escapes(m[2]) }
  elsif (m = /^([A-Z]\w*)$/.match(value))
    { emit_type: m[1], emit_mode: :bare }
  elsif /^[a-z_]\w*$/.match?(value)
    # Variable name: an INTERNAL type returning a computed value.
    { return_value: value }
  else
    {} # Unknown format, use default
  end
end
886
+
887
# Split a call command value into the callee name and its raw argument text.
#
# The name is everything before the first '('. The arguments are everything
# after it, minus exactly one trailing ')' if present, so a bare paren can
# itself be an argument.
#
# Examples:
#   "func"        -> { name: "func", call_args: nil }
#   "func()"      -> { name: "func", call_args: nil }
#   "func(x, y)"  -> { name: "func", call_args: "x, y" }
#   "func(<R>)"   -> { name: "func", call_args: "<R>" }
#   "func())"     -> { name: "func", call_args: ")" }
#   "error(Code)" -> { name: "error", call_args: "Code", is_error: true }
#
# @param value [String]
# @return [Hash] :name, :call_args, plus :is_error => true when the name is "error"
def parse_call_value(value)
  name, open_paren, rest = value.partition('(')
  return { name: value, call_args: nil } if open_paren.empty?

  arg_text = rest.delete_suffix(')')
  parsed = { name: name, call_args: arg_text.empty? ? nil : arg_text }
  parsed[:is_error] = true if name == 'error'
  parsed
end
914
+
915
# Infer the SCAN target characters for a state.
#
# When the default case is a simple self-loop (see simple_self_loop?), the
# characters matched by the explicit, non-conditional cases are the only
# ones the state has to stop for, so they become the SCAN targets.
#
# Returns the unique list of those characters, or nil when there is no
# default case, the default is not a simple self-loop, there are no explicit
# characters, or there are more than six of them.
def infer_scan_chars(_state, cases)
  fallback = cases.find(&:default?)
  return nil unless fallback && simple_self_loop?(fallback)

  targets = cases
            .reject { |kase| kase.default? || kase.conditional? }
            .flat_map { |kase| kase.chars || [] }
            .uniq

  # Upper bound of six: per the original note, larger sets are handled by
  # chaining memchr calls (memchr3 + memchr3) and stop paying off beyond that.
  targets.size.between?(1, 6) ? targets : nil
end
937
+
938
# Strict self-loop test used by the SCAN optimization.
#
# A case qualifies only when every command is :advance or :transition and at
# least one transition has a nil/empty target (i.e. stays in the same state).
# Any other command type (call, emit, mark, term, ...) disqualifies it.
def simple_self_loop?(kase)
  loops_back = false

  kase.commands.each do |cmd|
    kind = cmd.type
    next if kind == :advance
    return false unless kind == :transition

    # Args may be keyed by symbol or by string.
    target = cmd.args[:value] || cmd.args['value']
    loops_back = true if target.nil? || target.empty?
  end

  loops_back
end
958
+
959
# True when the case contains at least one :transition command whose target
# is nil or empty. Unlike simple_self_loop?, other command types are allowed;
# the original comment says this feeds the is_self_looping metadata.
def has_self_transition?(kase)
  kase.commands.any? do |cmd|
    if cmd.type == :transition
      target = cmd.args[:value] || cmd.args['value']
      target.nil? || target.empty?
    else
      false
    end
  end
end
968
+
969
# Infer the closing delimiter a function expects, from its return cases.
#
# Returns [expects_char, emits_content]:
# - expects_char is the single character matched by EVERY case containing a
#   :return command, or nil when there are no such cases, when any of them is
#   a default / special-class / multi-character match, or when they disagree.
# - emits_content is true when any returning case also contains a :term
#   command; it is always false when expects_char is nil.
def infer_expects(states)
  no_closer = [nil, false]

  returning = states.flat_map do |state|
    state.cases.select { |kase| kase.commands.any? { |cmd| cmd.type == :return } }
  end
  return no_closer if returning.empty?

  # One entry per returning case: its sole matched character, or nil when the
  # case is not a plain single-character match. Conditional cases still
  # match on a character, so they are not filtered out here.
  closers = returning.map do |kase|
    single = !kase.default? && !kase.special_class && !kase.chars.nil? && kase.chars.length == 1
    kase.chars.first if single
  end

  return no_closer unless closers.all?
  return no_closer unless closers.uniq.length == 1

  has_term = returning.any? do |kase|
    kase.commands.any? { |cmd| cmd.type == :term }
  end

  [closers.first, has_term]
end
1011
+
1012
# Gather the custom error codes referenced by every case of every state of
# the given functions. Extraction from a command list is delegated to
# collect_error_codes_from_commands; the result is a sorted array without
# duplicates.
def collect_custom_error_codes(functions)
  all_cases = functions.flat_map do |func|
    func.states.flat_map { |state| state.cases.to_a }
  end

  all_cases
    .each_with_object(Set.new) { |kase, seen| collect_error_codes_from_commands(kase.commands, seen) }
    .sort
end
1026
+
1027
# Add every custom error code found in `commands` to `codes` (any collection
# responding to <<), recursing into conditional clauses.
#
# Codes come from two command shapes:
# - :error commands, whose code is args[:value] (or args['value']);
# - :call commands flagged is_error (how /error(code) is parsed), whose code
#   is args[:call_args].
# Nil and empty codes are ignored.
#
# Commands nested inside a conditional clause may be raw Hashes with string
# 'type' / 'args' keys instead of command objects (other traversals in this
# file convert them before use), so both shapes are accepted here rather than
# raising NoMethodError on a Hash.
def collect_error_codes_from_commands(commands, codes)
  commands.each do |cmd|
    if cmd.is_a?(Hash)
      type = cmd['type']&.to_sym
      args = (cmd['args'] || {}).transform_keys(&:to_sym)
    else
      type = cmd.type
      args = cmd.args
    end

    case type
    when :error
      code = args[:value] || args['value']
      codes << code if code && !code.empty?
    when :call
      next unless args[:is_error]

      code = args[:call_args]
      codes << code if code && !code.empty?
    when :conditional
      args[:clauses]&.each do |clause|
        collect_error_codes_from_commands(clause['commands'] || [], codes)
      end
    end
  end
end
1048
+
1049
# Build the table of local variables a function uses.
#
# Entry actions (if any) are examined first, one command at a time, followed
# by the commands of every case in every state. The actual detection lives in
# collect_locals_from_commands, which fills the returned Hash.
def infer_locals(func)
  found = {}

  (func.entry_actions || []).each { |action| collect_locals_from_commands([action], found) }

  func.states.each do |state|
    state.cases.each { |kase| collect_locals_from_commands(kase.commands, found) }
  end

  found
end
1067
+
1068
# Record into `locals` every variable assigned by the given commands.
#
# - AST::Conditional nodes are descended into, clause by clause.
# - Any other object that responds to #type and is an :assign, :add_assign or
#   :sub_assign whose value is a Hash with a :var entry registers that
#   variable with the default type :i32. An existing entry is never
#   overwritten.
# - Everything else is ignored.
def collect_locals_from_commands(commands, locals)
  commands.each do |cmd|
    if cmd.is_a?(AST::Conditional)
      cmd.clauses&.each { |clause| collect_locals_from_commands(clause.commands, locals) }
      next
    end

    next unless cmd.respond_to?(:type) && %i[assign add_assign sub_assign].include?(cmd.type)

    payload = cmd.value
    next unless payload.is_a?(Hash) && payload[:var]

    locals[payload[:var]] ||= :i32 # Default type
  end
end
1084
+
1085
# Infer a type for each parameter from how the states use it.
#
# Every parameter starts as :i32. While walking the cases in order:
# - a case whose param_ref names a parameter (|c[:x]| match) makes it :byte;
# - a case condition comparing a parameter with a character literal using
#   == or != (param == 'x' or 'x' == param) makes it :byte; comparisons
#   against bare numbers such as `param == 0` are deliberately not matched;
# - a :prepend_param command naming a parameter makes it :bytes.
# Later assignments overwrite earlier ones. Names that are not parameters are
# ignored. Returns {} for an empty parameter list.
def infer_param_types(params, states)
  return {} if params.empty?

  inferred = params.each_with_object({}) { |name, acc| acc[name] = :i32 }

  states.each do |state|
    state.cases.each do |kase|
      matched = kase.param_ref
      inferred[matched] = :byte if matched && inferred.key?(matched)

      if kase.condition
        params.each do |name|
          param_first = kase.condition.match?(/\b#{Regexp.escape(name)}\s*[!=]=\s*'/)
          literal_first = param_first || kase.condition.match?(/'\s*[!=]=\s*#{Regexp.escape(name)}\b/)
          inferred[name] = :byte if literal_first && inferred.key?(name)
        end
      end

      kase.commands.each do |cmd|
        next unless cmd.type == :prepend_param

        prepended = cmd.args[:param_ref]
        inferred[prepended] = :bytes if prepended && inferred.key?(prepended)
      end
    end
  end

  inferred
end
1128
+
1129
# Refine parameter types using call sites, mutating each function's
# param_types in place and returning `functions`.
#
# Pass 1 - literal arguments: when a call passes a value for which
# bytes_like_value? is true, the matching callee parameter becomes :bytes, but
# only if it is still the default :i32 (a :byte inferred from usage is kept).
#
# Pass 2 - forwarding: when a caller forwards one of its own parameters
# (`:x`) to a callee parameter typed :bytes, the caller's parameter becomes
# :bytes; if the callee's is :byte and the caller's is still :i32, it becomes
# :byte. This repeats until a full sweep changes nothing.
#
# Only calls to functions present in `functions` are considered. Arguments
# are split with tokenize_call_args so that a comma inside a quoted literal or
# a <...> class does not shift later arguments onto the wrong parameter, which
# a plain split(',') did.
def propagate_param_types(functions)
  func_by_name = functions.to_h { |f| [f.name, f] }

  each_call_site_argument(functions, func_by_name) do |_caller, target, arg, target_param|
    next unless bytes_like_value?(arg) && target.param_types[target_param] == :i32

    target.param_types[target_param] = :bytes
  end

  loop do
    changed = false

    each_call_site_argument(functions, func_by_name) do |caller_func, target, arg, target_param|
      # Only forwarded parameter references (:name) carry a type upwards.
      next unless arg.match?(/\A:\w+\z/)

      our_param = arg[1..]
      next unless caller_func.param_types.key?(our_param)

      target_type = target.param_types[target_param]
      our_type = caller_func.param_types[our_param]

      if target_type == :bytes && our_type != :bytes
        caller_func.param_types[our_param] = :bytes
        changed = true
      elsif target_type == :byte && our_type == :i32
        caller_func.param_types[our_param] = :byte
        changed = true
      end
    end

    break unless changed
  end

  functions
end

# Yield (calling function, callee, argument text, callee parameter name) for
# every argument of every top-level :call command in the functions' state
# cases whose callee is known and which lines up with a callee parameter.
def each_call_site_argument(functions, func_by_name)
  functions.each do |func|
    func.states.each do |state|
      state.cases.each do |kase|
        kase.commands.each do |cmd|
          next unless cmd.type == :call && cmd.args[:call_args]

          target = func_by_name[cmd.args[:name]]
          next unless target

          tokenize_call_args(cmd.args[:call_args]).zip(target.params).each do |arg, target_param|
            yield func, target, arg, target_param if target_param
          end
        end
      end
    end
  end
end
1206
+
1207
# True when a call-site argument can only be a byte slice.
#
# Only the empty class `<>` qualifies. Single-character values such as `<P>`
# or '|' could be either a single byte or a slice, and numeric values such as
# 0 or -1 are sentinels, so none of those are treated as bytes-like here;
# their parameter types are left to be inferred from usage.
def bytes_like_value?(arg)
  arg == '<>'
end
1218
+
1219
# Work out which byte values reach each PREPEND(:param) through call sites.
#
# 1. Parameter types are first propagated (propagate_param_types).
# 2. Each function containing a :prepend_param command is mapped to the
#    parameter that command references (the last one seen wins).
# 3. Call sites across all functions are scanned for values passed to those
#    functions (collect_call_values_from_states).
# 4. Functions with a PREPEND are rebuilt with
#    prepend_values = { param_name => sorted values }; all others are returned
#    unchanged.
#
# When no function uses PREPEND the (propagated) input list is returned as is.
def collect_prepend_values(functions)
  functions = propagate_param_types(functions)
  lookup = functions.to_h { |f| [f.name, f] }

  # func_name -> name of the parameter used by PREPEND
  param_for = {}
  functions.each do |func|
    func.states.each do |state|
      state.cases.each do |kase|
        kase.commands.each do |cmd|
          next unless cmd.type == :prepend_param

          param_for[func.name] = cmd.args[:param_ref]
        end
      end
    end
  end

  return functions if param_for.empty?

  observed = Hash.new { |hash, key| hash[key] = Set.new }
  functions.each { |func| collect_call_values_from_states(func.states, param_for, lookup, observed) }

  functions.map do |func|
    next func unless param_for.key?(func.name)

    sorted_values = observed[func.name].to_a.sort

    IR::Function.new(
      name: func.name,
      return_type: func.return_type,
      params: func.params,
      param_types: func.param_types,
      locals: func.locals,
      states: func.states,
      eof_handler: func.eof_handler,
      entry_actions: func.entry_actions,
      emits_events: func.emits_events,
      expects_char: func.expects_char,
      emits_content_on_close: func.emits_content_on_close,
      prepend_values: { param_for[func.name] => sorted_values },
      lineno: func.lineno
    )
  end
end
1275
+
1276
# Feed every command list of every state to collect_call_values_from_commands:
# first the commands of each case, then the state's EOF handler (an empty
# list when the state has none). The remaining arguments are passed through
# unchanged.
def collect_call_values_from_states(states, prepend_params, func_by_name, prepend_values)
  states.each do |state|
    command_lists = state.cases.map(&:commands)
    command_lists << (state.eof_handler || [])

    command_lists.each do |cmds|
      collect_call_values_from_commands(cmds, prepend_params, func_by_name, prepend_values)
    end
  end
end
1284
+
1285
# Record, for each :call to a function that uses PREPEND(:param), the byte
# value passed for that parameter. Values go into prepend_values[func_name]
# (expected to default to a Set); conditional clauses are searched
# recursively.
#
# commands       - command objects responding to #type and #args
# prepend_params - func_name => name of the parameter used by PREPEND
# func_by_name   - func_name => function (responds to #params)
# prepend_values - func_name => collection of byte strings, updated in place
#
# When the call has several arguments, the one at the PREPEND parameter's
# position in the callee's parameter list is parsed. Previously the whole
# argument string was handed to parse_byte_literal, so calls with more than
# one argument never contributed a value. Single-argument calls, unknown
# callees and parameters not found in the callee keep the original behaviour
# of parsing the whole string.
def collect_call_values_from_commands(commands, prepend_params, func_by_name, prepend_values)
  commands.each do |cmd|
    case cmd.type
    when :call
      func_name = cmd.args[:name]
      next unless prepend_params.key?(func_name)

      call_args = cmd.args[:call_args]
      arg = call_args

      position = func_by_name[func_name]&.params&.index(prepend_params[func_name])
      if call_args && position
        tokens = tokenize_call_args(call_args)
        arg = tokens[position] if tokens.length > 1
      end

      # parse_byte_literal returns nil for "no prepend" (0) and unknown forms.
      byte_value = parse_byte_literal(arg)
      prepend_values[func_name] << byte_value if byte_value
    when :conditional
      cmd.args[:clauses]&.each do |clause|
        # Nested commands may still be raw Hashes; wrap them before recursing.
        nested_cmds = (clause['commands'] || []).map do |c|
          c.is_a?(Hash) ? IR::Command.new(type: c['type'].to_sym, args: c['args'].transform_keys(&:to_sym)) : c
        end
        collect_call_values_from_commands(nested_cmds, prepend_params, func_by_name, prepend_values)
      end
    end
  end
end
1304
+
1305
# Return a copy of every function in which the call arguments inside state
# cases and state EOF handlers have been rewritten for the callee's parameter
# types (see transform_commands_args / transform_args_for_target).
#
# Cases, states and functions are rebuilt field by field; only the command
# lists differ from the originals. A state whose transformed EOF handler is
# empty gets eof_handler nil. The function-level eof_handler and
# entry_actions are carried over untouched.
def transform_call_args_by_type(functions)
  lookup = functions.to_h { |f| [f.name, f] }

  rebuild_case = lambda do |kase|
    IR::Case.new(
      chars: kase.chars,
      special_class: kase.special_class,
      param_ref: kase.param_ref,
      condition: kase.condition,
      substate: kase.substate,
      commands: transform_commands_args(kase.commands, lookup),
      lineno: kase.lineno
    )
  end

  rebuild_state = lambda do |state|
    rebuilt_cases = state.cases.map(&rebuild_case)
    rebuilt_eof = transform_commands_args(state.eof_handler || [], lookup)

    IR::State.new(
      name: state.name,
      cases: rebuilt_cases,
      eof_handler: rebuilt_eof.empty? ? nil : rebuilt_eof,
      scan_chars: state.scan_chars,
      is_self_looping: state.is_self_looping,
      has_default: state.has_default,
      is_unconditional: state.is_unconditional,
      newline_injected: state.newline_injected,
      lineno: state.lineno
    )
  end

  functions.map do |func|
    IR::Function.new(
      name: func.name,
      return_type: func.return_type,
      params: func.params,
      param_types: func.param_types,
      locals: func.locals,
      states: func.states.map(&rebuild_state),
      eof_handler: func.eof_handler,
      entry_actions: func.entry_actions,
      emits_events: func.emits_events,
      expects_char: func.expects_char,
      emits_content_on_close: func.emits_content_on_close,
      prepend_values: func.prepend_values,
      lineno: func.lineno
    )
  end
end
1357
+
1358
# Map a command list to one whose call arguments are rewritten for the
# callee's parameter types.
#
# - :call with call_args and a known callee -> new command whose :call_args is
#   the result of transform_args_for_target; other args are kept.
# - :conditional -> new command whose clauses keep their 'condition' and have
#   their 'commands' transformed recursively. Nested commands given as raw
#   Hashes are first wrapped in IR::Command. The rebuilt args contain only
#   :clauses.
# - anything else (including calls to unknown functions or without
#   arguments) is returned as the same object.
def transform_commands_args(commands, func_by_name)
  commands.map do |cmd|
    case cmd.type
    when :call
      callee = func_by_name[cmd.args[:name]] if cmd.args[:call_args]
      next cmd unless callee

      rewritten = transform_args_for_target(cmd.args[:call_args], callee)
      IR::Command.new(type: cmd.type, args: cmd.args.merge(call_args: rewritten))
    when :conditional
      rebuilt_clauses = cmd.args[:clauses]&.map do |clause|
        inner = (clause['commands'] || []).map do |raw|
          if raw.is_a?(Hash)
            IR::Command.new(type: raw['type'].to_sym, args: raw['args'].transform_keys(&:to_sym))
          else
            raw
          end
        end

        { 'condition' => clause['condition'], 'commands' => transform_commands_args(inner, func_by_name) }
      end

      IR::Command.new(type: cmd.type, args: { clauses: rebuilt_clauses })
    else
      cmd
    end
  end
end
1381
+
1382
# Rewrite a raw argument string for a specific callee.
#
# Arguments are tokenized (tokenize_call_args), paired positionally with the
# callee's parameters and converted according to each parameter's type:
#
#   type     integer literal (/-?\d+/)   anything else
#   :bytes   b""  (numeric sentinel)     CharacterClass.to_rust_bytes(parse(arg))
#   :byte    "<arg>u8"                   CharacterClass.to_rust_byte(parse(arg))
#   other    unchanged                   unchanged
#
# Arguments beyond the parameter list are passed through unchanged. The
# results are joined with ", ". A nil string, or a callee without parameters,
# returns the input untouched.
def transform_args_for_target(args_str, target_func)
  return args_str if args_str.nil? || target_func.params.empty?

  type_of = target_func.param_types
  pairs = tokenize_call_args(args_str).zip(target_func.params)

  converted = pairs.map do |arg, param|
    next arg unless param

    kind = type_of[param]
    # Integer literals are numbers, not characters, so they bypass parsing.
    numeric = arg.match?(/^-?\d+$/)

    case kind
    when :bytes
      numeric ? 'b""' : CharacterClass.to_rust_bytes(CharacterClass.parse(arg))
    when :byte
      numeric ? "#{arg}u8" : CharacterClass.to_rust_byte(CharacterClass.parse(arg))
    else
      arg
    end
  end

  converted.join(', ')
end
1417
+
1418
# Split a raw argument string on top-level commas.
#
# A comma does not split when it is inside a single-quoted literal ('a,b') or
# inside a <...> class. Each token is stripped of surrounding whitespace. A
# trailing empty remainder is dropped, so "" yields [] and "a," yields ["a"].
#
# '<' and '>' are only treated as class delimiters OUTSIDE quotes. Previously
# a quoted '<' raised the angle depth and was never closed, so every later
# comma was swallowed (e.g. "'<', x" came back as one token).
#
# Backslash escapes are not interpreted: every "'" toggles the quote state.
def tokenize_call_args(args_str)
  args = []
  current = +''
  in_quote = false
  angle_depth = 0

  args_str.each_char do |c|
    if c == "'"
      in_quote = !in_quote
    elsif !in_quote
      case c
      when '<'
        angle_depth += 1
      when '>'
        # A stray '>' never drives the depth negative.
        angle_depth -= 1 if angle_depth.positive?
      when ','
        if angle_depth.zero?
          args << current.strip
          current = +''
          next
        end
      end
    end

    current << c
  end

  args << current.strip unless current.empty?
  args
end
1451
+
1452
# Turn a single call argument into the byte literal text used by the
# template, or nil when the argument carries no prepend value.
#
# Checked in this order:
#   nil / ""       -> nil
#   "0"            -> nil (0 means "no prepend")
#   legacy escapes -> <P> <L> <R> <LB> <RB> <LP> <RP> <BS> map to their char
#                     (<BS> yields two backslashes)
#   'x' or "x"     -> x   (one quoted character)
#   '\x'           -> parse_quoted_string applied to the backslash escape
#   x              -> x   (legacy bare single character)
#   anything else  -> nil
def parse_byte_literal(arg)
  return nil if arg.nil? || arg.empty?
  return nil if arg == '0'

  legacy_escapes = {
    '<P>' => '|',
    '<L>' => '[',
    '<R>' => ']',
    '<LB>' => '{',
    '<RB>' => '}',
    '<LP>' => '(',
    '<RP>' => ')',
    '<BS>' => '\\\\'
  }
  return legacy_escapes[arg] if legacy_escapes.key?(arg)

  if (quoted = arg.match(/^'(.)'$/) || arg.match(/^"(.)"$/))
    quoted[1]
  elsif (escaped = arg.match(/^'\\(.)'$/))
    parse_quoted_string("\\#{escaped[1]}")
  elsif arg.match?(/^.$/)
    arg
  end
end
1478
+ end
1479
+ end