descent 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +285 -0
- data/README.md +583 -0
- data/SYNTAX.md +334 -0
- data/exe/descent +15 -0
- data/lib/descent/ast.rb +69 -0
- data/lib/descent/generator.rb +489 -0
- data/lib/descent/ir.rb +98 -0
- data/lib/descent/ir_builder.rb +1479 -0
- data/lib/descent/lexer.rb +308 -0
- data/lib/descent/parser.rb +450 -0
- data/lib/descent/railroad.rb +272 -0
- data/lib/descent/templates/rust/_command.liquid +174 -0
- data/lib/descent/templates/rust/parser.liquid +1163 -0
- data/lib/descent/tools/debug.rb +115 -0
- data/lib/descent/tools/diagram.rb +48 -0
- data/lib/descent/tools/generate.rb +47 -0
- data/lib/descent/tools/validate.rb +56 -0
- data/lib/descent/validator.rb +231 -0
- data/lib/descent/version.rb +5 -0
- data/lib/descent.rb +34 -0
- metadata +101 -0
|
@@ -0,0 +1,1479 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Descent
|
|
4
|
+
# Unified character class parser according to characters.md spec.
#
# Handles all character literal and class syntax:
# - Single chars: 'x', '\n', '\x00'
# - Strings: 'hello' (decomposed to chars for classes)
# - Classes: <...> with space-separated tokens
# - Predefined classes: LETTER, DIGIT, SQ, P, 0-9, etc.
# - Empty class: <> (empty set / empty string)
# - Param refs: :name
#
# The same parsing is used everywhere: c[...], function args, PREPEND
#
# Every parse result is a Hash shaped like
#   { chars: Array<String>, special_class: Symbol|nil, param_ref: String|nil, bytes: String|nil }
module CharacterClass
  # Predefined single-character classes (DSL-reserved chars)
  SINGLE_CHAR = {
    'P' => '|',
    'L' => '[',
    'R' => ']',
    'LB' => '{',
    'RB' => '}',
    'LP' => '(',
    'RP' => ')',
    'SQ' => "'",
    'DQ' => '"',
    'BS' => '\\'
  }.freeze

  # Predefined character ranges
  RANGES = {
    '0-9' => '0123456789',
    '0-7' => '01234567',
    '0-1' => '01',
    'a-z' => 'abcdefghijklmnopqrstuvwxyz',
    'A-Z' => 'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
    'a-f' => 'abcdef',
    'A-F' => 'ABCDEF'
  }.freeze

  # Predefined multi-character classes (expanded to char sets)
  MULTI_CHAR = {
    'LETTER' => RANGES['a-z'] + RANGES['A-Z'],
    'DIGIT' => RANGES['0-9'],
    'HEX_DIGIT' => RANGES['0-9'] + RANGES['a-f'] + RANGES['A-F'],
    'LABEL_CONT' => "#{RANGES['a-z']}#{RANGES['A-Z']}#{RANGES['0-9']}_-",
    'WS' => " \t",
    'NL' => "\n"
  }.freeze

  # Special classes that require runtime checks (can't be expanded to char list)
  SPECIAL_CLASSES = %w[XID_START XID_CONT XLBL_START XLBL_CONT].freeze

  # Backslash escapes inside quoted strings that map to a fixed character.
  # Any escape code not listed here (and not \x / \u) stands for itself.
  SIMPLE_ESCAPES = { 'n' => "\n", 't' => "\t", 'r' => "\r", '0' => "\0" }.freeze

  class << self
    # Parse a class specification string and return structured result.
    #
    # Resolution order: empty input, <...> class wrapper, :param reference,
    # single-quoted string (escapes decoded), double-quoted string (taken
    # verbatim), bare shorthand, and finally anything else as literal text.
    #
    # Bare shorthand names are looked up after upcasing, so predefined class
    # names are effectively case-insensitive (a bare `p` resolves to '|').
    # NOTE(review): confirm that lowercase names are meant to resolve this way.
    #
    # @param str [String, nil] The class specification (contents of c[...] or <...> or bare)
    # @param context [Symbol] :match (for c[...]), :bytes (for function args/PREPEND), :byte (single byte);
    #   only forwarded to nested tokens here, it does not alter parsing in this module
    # @return [Hash] { chars: [...], special_class: nil|Symbol, param_ref: nil|String, bytes: String|nil }
    def parse(str, context: :match)
      return literal_result('') if str.nil? || str.empty?

      str = str.strip

      # Explicit class wrapper <...>; <> is the empty set
      if str.start_with?('<') && str.end_with?('>')
        inner = str[1...-1].strip
        return literal_result('') if inner.empty?

        return parse_class_content(inner, context)
      end

      # Param reference :name
      return { chars: [], special_class: nil, param_ref: str[1..], bytes: nil } if str.start_with?(':')

      # Single-quoted string: escape sequences are decoded
      return literal_result(parse_quoted_string(str[1...-1])) if str.match?(/^'.*'$/) && str.length >= 2

      # Double-quoted string: content is used as-is, no escape processing
      return literal_result(str[1...-1]) if str.match?(/^".*"$/) && str.length >= 2

      # Bare shorthand (only /[A-Za-z0-9_-]/): predefined name, range, or plain chars
      if str.match?(/^[A-Za-z0-9_-]+$/)
        upper = str.upcase
        if SPECIAL_CLASSES.include?(upper)
          return { chars: [], special_class: upper.downcase.to_sym, param_ref: nil, bytes: nil }
        end

        # Ranges are looked up by the exact token; class names by the upcased token.
        return literal_result(MULTI_CHAR[upper] || SINGLE_CHAR[upper] || RANGES[str] || str)
      end

      # Invalid bare content (special chars without quotes).
      # For now, treat as literal bytes but this should probably error
      literal_result(str)
    end

    # Parse the content inside <...> (space-separated tokens).
    #
    # Each token is parsed with #parse; literal chars are merged (de-duplicated
    # in :chars, concatenated in :bytes). If several special classes or param
    # refs appear, the last one wins.
    #
    # @param content [String, nil] text between the angle brackets
    # @param context [Symbol] forwarded to #parse for each token
    # @return [Hash] combined parse result
    def parse_class_content(content, context)
      return literal_result('') if content.nil? || content.empty?

      all_chars = []
      all_bytes = +''
      special_class = nil
      param_ref = nil

      tokenize_class_content(content).each do |token|
        result = parse(token, context: context)

        if result[:special_class]
          special_class = result[:special_class]
        elsif result[:param_ref]
          param_ref = result[:param_ref]
        else
          all_chars.concat(result[:chars]) if result[:chars]
          all_bytes << result[:bytes] if result[:bytes]
        end
      end

      { chars: all_chars.uniq, special_class: special_class, param_ref: param_ref, bytes: all_bytes }
    end

    # Tokenize class content respecting quotes.
    #
    # Tokens are separated by spaces; spaces inside single quotes are kept,
    # and a backslash inside quotes keeps the following character attached
    # (so an escaped quote does not close the token).
    #
    # @param content [String]
    # @return [Array<String>] non-empty tokens, quotes and escapes left intact
    def tokenize_class_content(content)
      tokens = []
      current = +''
      in_quote = false
      i = 0

      while i < content.length
        c = content[i]

        if c == "'"
          in_quote = !in_quote
          current << c
        elsif c == '\\' && in_quote && i + 1 < content.length
          # Copy the escape pair verbatim and skip the escaped char
          current << c << content[i + 1]
          i += 1
        elsif c == ' ' && !in_quote
          tokens << current unless current.empty?
          current = +''
        else
          current << c
        end

        i += 1
      end

      tokens << current unless current.empty?
      tokens
    end

    # Parse a quoted string with escape sequences.
    #
    # Supported: \n \t \r \0, \xHH (one byte), \uXXXX (one code point).
    # Any other escaped character stands for itself (\\, \', \", ...). A
    # malformed \x or \u yields the letter itself, and a trailing lone
    # backslash is kept.
    #
    # @param str [String] the text between the quotes
    # @return [String] decoded content
    def parse_quoted_string(str)
      result = +''
      i = 0

      while i < str.length
        ch = str[i]

        # Ordinary character, or a backslash with nothing after it
        if ch != '\\' || i + 1 >= str.length
          result << ch
          i += 1
          next
        end

        code = str[i + 1]
        case code
        when 'x'
          hex = str[i + 2, 2]
          if hex.match?(/\A\h{2}\z/)
            result << hex.to_i(16).chr
            i += 4
            next
          end
        when 'u'
          hex = str[i + 2, 4]
          if hex.match?(/\A\h{4}\z/)
            result << hex.to_i(16).chr(Encoding::UTF_8)
            i += 6
            next
          end
        end

        result << SIMPLE_ESCAPES.fetch(code, code)
        i += 2
      end

      result
    end

    # Convert parsed result to Rust byte literal format for :byte param (u8).
    #
    # @param result [Hash] a parse result
    # @return [String] the param name, a b'x' literal for the first character,
    #   or '0u8' (never-match sentinel) when the result carries no literal
    #   character, including results whose :bytes is nil such as special classes
    def to_rust_byte(result)
      return result[:param_ref] if result[:param_ref]

      # :bytes is nil for special-class results, so coerce before indexing
      first = result[:bytes].to_s[0] || Array(result[:chars]).first
      return '0u8' if first.nil? # Empty = never match sentinel

      escape_rust_byte(first)
    end

    # Convert parsed result to Rust byte string format for :bytes param (&[u8]).
    #
    # @param result [Hash] a parse result
    # @return [String] the param name, or a b"..." literal (b"" when there are no bytes)
    def to_rust_bytes(result)
      return result[:param_ref] if result[:param_ref]
      return 'b""' if result[:bytes].nil? || result[:bytes].empty?

      "b\"#{escape_rust_byte_string(result[:bytes])}\""
    end

    # Escape a single character for Rust byte literal b'x'.
    #
    # NOTE(review): the value outside printable ASCII is formatted from the
    # character's code point, so a character above 0xFF does not yield a
    # valid single-byte literal; confirm such input cannot reach this method.
    #
    # @param char [String] one character
    # @return [String] Rust source text such as b'a' or b'\n'
    def escape_rust_byte(char)
      escaped =
        case char
        when "\n" then '\\n'
        when "\t" then '\\t'
        when "\r" then '\\r'
        when "\0" then '\\0'
        when '\\' then '\\\\'
        when "'" then "\\'"
        else
          char.ord.between?(32, 126) ? char : format('\\x%02x', char.ord)
        end

      "b'#{escaped}'"
    end

    # Escape a string for Rust byte string literal b"...".
    #
    # Works byte by byte so that multi-byte (e.g. UTF-8) characters become one
    # \xNN escape per encoded byte instead of a single escape built from the
    # code point.
    #
    # @param str [String]
    # @return [String] the escaped body (without the surrounding b"...")
    def escape_rust_byte_string(str)
      str.each_byte.map do |byte|
        case byte
        when 0x0A then '\\n'
        when 0x09 then '\\t'
        when 0x0D then '\\r'
        when 0x00 then '\\0'
        when 0x5C then '\\\\'
        when 0x22 then '\\"'
        when 0x20..0x7E then byte.chr
        else format('\\x%02x', byte)
        end
      end.join
    end

    private

    # Build a result for plain literal text: chars are the text's characters.
    def literal_result(bytes)
      { chars: bytes.chars, special_class: nil, param_ref: nil, bytes: bytes }
    end
  end
end
|
|
298
|
+
|
|
299
|
+
# Transforms AST into IR with semantic analysis.
|
|
300
|
+
#
|
|
301
|
+
# Responsibilities:
|
|
302
|
+
# - Resolve type references
|
|
303
|
+
# - Infer SCAN optimization characters from state structure
|
|
304
|
+
# - Infer EOF handling requirements
|
|
305
|
+
# - Collect local variable declarations
|
|
306
|
+
# - Validate consistency
|
|
307
|
+
class IRBuilder
|
|
308
|
+
# @param ast [Object] parsed grammar AST; kept in @ast for later use by this builder
def initialize(ast)
  @ast = ast
end
|
|
309
|
+
|
|
310
|
+
# Build the complete IR::Parser from the stored AST.
#
# Steps, in order: build type infos, build each function against those
# types, build keyword tables, collect custom error codes from the built
# functions, then run the prepend-value and call-argument passes over the
# function list.
#
# @return [IR::Parser]
def build
  type_infos = build_types(@ast.types)
  built_functions = @ast.functions.map { |func| build_function(func, type_infos) }
  keyword_tables = @ast.keywords.map { |table| build_keywords(table) }

  # Custom error codes come from /error(code) calls; they are collected
  # before the later passes replace the function list.
  error_codes = collect_custom_error_codes(built_functions)

  # First trace call sites for prepend values, then rewrite call arguments
  # according to the target parameter types.
  finished_functions = transform_call_args_by_type(collect_prepend_values(built_functions))

  IR::Parser.new(
    name: @ast.name,
    entry_point: @ast.entry_point,
    types: type_infos,
    functions: finished_functions,
    keywords: keyword_tables,
    custom_error_codes: error_codes
  )
end
|
|
333
|
+
|
|
334
|
+
private
|
|
335
|
+
|
|
336
|
+
# Transform AST Keywords to IR Keywords.
#
# The fallback is a function call written as "/name" or "/name(args)".
# It is split into the function name and the raw argument text; a fallback
# that matches neither form (or is absent) leaves both as nil.
#
# @param kw [Object] AST keywords node (responds to name, fallback, mappings, lineno)
# @return [IR::Keywords]
def build_keywords(kw)
  fallback_func, fallback_args =
    case kw.fallback
    when %r{^/(\w+)\(([^)]*)\)$} then [::Regexp.last_match(1), ::Regexp.last_match(2)]
    when %r{^/(\w+)$} then [::Regexp.last_match(1), nil]
    end

  IR::Keywords.new(
    name: kw.name,
    fallback_func: fallback_func,
    fallback_args: fallback_args,
    mappings: kw.mappings,
    lineno: kw.lineno
  )
end
|
|
359
|
+
|
|
360
|
+
# Convert AST type declarations into IR::TypeInfo records.
#
# A type whose kind is :BRACKET is flagged as emitting both a start and an
# end; every other kind is flagged as emitting neither. The kind is stored
# downcased as a symbol.
#
# @param type_decls [Array] AST type declarations (name, kind, lineno)
# @return [Array<IR::TypeInfo>]
def build_types(type_decls)
  type_decls.map do |decl|
    bracketed = decl.kind == :BRACKET

    IR::TypeInfo.new(
      name: decl.name,
      kind: decl.kind.downcase.to_sym,
      emits_start: bracketed,
      emits_end: bracketed,
      lineno: decl.lineno
    )
  end
end
|
|
374
|
+
|
|
375
|
+
# Build an IR::Function from an AST function definition.
#
# @param func [Object] AST function (name, return_type, params, states,
#   eof_handler, entry_actions, lineno)
# @param types [Array<IR::TypeInfo>] built type infos, searched by name for the return type
# @return [IR::Function]
def build_function(func, types)
  declared_return = types.find { |type| type.name == func.return_type }
  emits_events = declared_return&.bracket? || declared_return&.content?

  locals = infer_locals(func)
  states = func.states.map { |state| build_state(state, func.params) }

  # Expected closing delimiter is inferred from the return cases
  expects_char, emits_content_on_close = infer_expects(states)

  # Parameter types are inferred from usage (byte if used in |c[:x]|, i32 otherwise)
  param_types = infer_param_types(func.params, states)

  # Function-level EOF handler: same AST->IR conversion and inline-emit
  # return marking as ordinary case commands. Stays nil when absent.
  eof_commands = func.eof_handler&.commands&.map { |command| build_command(command) }
  eof_commands &&= mark_returns_after_inline_emits(eof_commands)

  # Initialization commands run on function entry; default to none
  entry_actions = func.entry_actions&.map { |command| build_command(command) } || []

  IR::Function.new(
    name: func.name,
    return_type: func.return_type,
    params: func.params,
    param_types: param_types,
    locals: locals,
    states: states,
    eof_handler: eof_commands,
    entry_actions: entry_actions,
    emits_events: emits_events,
    expects_char: expects_char,
    emits_content_on_close: emits_content_on_close,
    lineno: func.lineno
  )
end
|
|
411
|
+
|
|
412
|
+
# Build an IR::State from an AST state.
#
# @param state [Object] AST state (name, cases, eof_handler, lineno)
# @param params [Array] parameter names of the enclosing function
# @return [IR::State]
def build_state(state, params = [])
  cases = state.cases.map { |kase| build_case(kase, params) }
  scan_chars = infer_scan_chars(state, cases)
  is_self_looping = cases.any? { |kase| kase.default? && has_self_transition?(kase) }

  # Whether any case is a default case
  has_default = cases.any?(&:default?)

  # The state is unconditional when its first case matches nothing at all:
  # no chars, no special class, no param ref (a param ref IS a match) and
  # no condition. Stays nil when the state has no cases.
  lead = cases.first
  is_unconditional = lead && lead.chars.nil? && lead.special_class.nil? &&
                     lead.param_ref.nil? && lead.condition.nil?

  # State-level EOF handler: same conversion and inline-emit return marking
  # as case commands. Stays nil when absent.
  eof_commands = state.eof_handler&.commands&.map { |command| build_command(command) }
  eof_commands &&= mark_returns_after_inline_emits(eof_commands)

  # Put "\n" at the front of scan_chars when it is not already a target and
  # there are fewer than 6 entries, so scans stop at newlines for line/column
  # tracking. Prepended so the newline is checked first.
  newline_injected = scan_chars ? (!scan_chars.include?("\n") && scan_chars.size < 6) : false
  scan_chars = ["\n"] + scan_chars if newline_injected

  IR::State.new(
    name: state.name,
    cases: cases,
    eof_handler: eof_commands,
    scan_chars: scan_chars,
    is_self_looping: is_self_looping,
    has_default: has_default,
    is_unconditional: is_unconditional,
    newline_injected: newline_injected,
    lineno: state.lineno
  )
end
|
|
453
|
+
|
|
454
|
+
# Build an IR::Case from an AST case.
#
# Validation runs first (character syntax, PREPEND arguments, call
# arguments) and may raise ValidationError before anything is built.
#
# @param kase [Object] AST case (chars, commands, condition, substate, lineno)
# @param params [Array] parameter names of the enclosing function
# @return [IR::Case]
def build_case(kase, params = [])
  validate_char_syntax(kase.chars, kase.lineno) if kase.chars
  validate_prepend_commands(kase.commands, params, kase.lineno)
  validate_call_args(kase.commands, params, kase.lineno)

  literal_chars, class_name, referenced_param = parse_chars(kase.chars, params: params)

  # Fix #11: a bare return that follows an inline emit is marked so it does
  # not auto-emit as well (prevents CONTENT functions from emitting twice).
  ir_commands = mark_returns_after_inline_emits(kase.commands.map { |command| build_command(command) })

  IR::Case.new(
    chars: literal_chars,
    special_class: class_name,
    param_ref: referenced_param,
    condition: kase.condition,
    substate: kase.substate,
    commands: ir_commands,
    lineno: kase.lineno
  )
end
|
|
475
|
+
|
|
476
|
+
# Mark return commands that follow inline emits to suppress auto-emit.
# When a case has: | Float(USE_MARK) |return
# the inline emit already emits, so the return should NOT auto-emit.
#
# Only a bare return (no :emit_type and no :return_value in its args) that
# comes after an inline emit in the same list is replaced; the replacement
# is a new :return command whose args additionally carry
# suppress_auto_emit: true. All other commands are passed through as-is.
#
# @param commands [Array<IR::Command>]
# @return [Array<IR::Command>] new array of the same length
def mark_returns_after_inline_emits(commands)
  inline_emit_types = %i[inline_emit_bare inline_emit_mark inline_emit_literal]
  seen_inline_emit = false

  commands.map do |cmd|
    if inline_emit_types.include?(cmd.type)
      seen_inline_emit = true
      next cmd
    end

    bare_return_after_emit = cmd.type == :return && seen_inline_emit &&
                             cmd.args[:emit_type].nil? && cmd.args[:return_value].nil?
    next cmd unless bare_return_after_emit

    IR::Command.new(type: :return, args: cmd.args.merge(suppress_auto_emit: true))
  end
end
|
|
499
|
+
|
|
500
|
+
# Convert one AST command into an IR::Command.
#
# An AST::Conditional becomes a :conditional command whose :clauses arg is
# a list of { 'condition' => ..., 'commands' => [...] } hashes with the
# nested commands converted recursively. Every other command keeps its
# type and gets an args hash derived from its value.
#
# @param cmd [Object] AST command or AST::Conditional
# @return [IR::Command]
# @raise [ValidationError] for an unknown command type (and from the
#   helpers used for :advance_to)
def build_command(cmd)
  if cmd.is_a?(AST::Conditional)
    clauses = cmd.clauses&.map do |clause|
      {
        'condition' => clause.condition,
        'commands' => clause.commands.map { |nested| build_command(nested) }
      }
    end
    return IR::Command.new(type: :conditional, args: { clauses: clauses })
  end

  kind = cmd.type
  args =
    case kind
    when :assign, :add_assign, :sub_assign, :inline_emit_literal
      # These carry a Hash payload; anything else degrades to no args
      cmd.value.is_a?(Hash) ? cmd.value : {}
    when :advance_to
      { value: validate_advance_to(cmd.value, cmd.lineno) }
    when :scan
      { value: process_escapes(cmd.value) }
    when :emit, :call_method, :transition, :error
      { value: cmd.value }
    when :call
      parse_call_value(cmd.value)
    when :inline_emit_bare, :inline_emit_mark
      { type: cmd.value }
    when :term
      { offset: cmd.value || 0 }
    when :prepend
      { literal: process_escapes(cmd.value) }
    when :prepend_param
      { param_ref: cmd.value }
    when :keywords_lookup
      { name: cmd.value }
    when :return
      parse_return_value(cmd.value)
    when :advance, :mark, :noop
      {}
    else
      raise ValidationError, "Unknown command type: #{cmd.type.inspect}"
    end

  IR::Command.new(type: kind, args: args)
end
|
|
536
|
+
|
|
537
|
+
# Resolve a character class/literal specification to its bytes using the
# unified CharacterClass parser.
#
# @param str [String, nil] specification text
# @return [String] the :bytes of the parse result, or '' when the input is
#   nil/empty or the parse result has no bytes
def process_escapes(str)
  return '' if str.nil? || str.empty?

  CharacterClass.parse(str)[:bytes] || ''
end
|
|
545
|
+
|
|
546
|
+
# Validate and process advance_to (->[...]) arguments.
# Only literal bytes are supported (1-6 chars for SIMD memchr).
# Special classes and param refs are NOT supported.
#
# @param str [String, nil] text inside ->[...]
# @param lineno [Integer] source line used in error messages
# @return [String] the literal bytes to scan for
# @raise [ValidationError] when the argument is missing, is a special
#   class or param ref, resolves to nothing, or has more than 6 characters
def validate_advance_to(str, lineno)
  raise ValidationError, "L#{lineno}: ->[] requires at least one character" if str.nil? || str.empty?

  parsed = CharacterClass.parse(str)

  if parsed[:special_class]
    raise ValidationError,
          "L#{lineno}: ->[] does not support character classes like #{str.upcase}. " \
          'Only literal bytes are supported (uses SIMD memchr).'
  end

  referenced_param = parsed[:param_ref]
  if referenced_param
    raise ValidationError,
          "L#{lineno}: ->[] does not support parameter references like :#{referenced_param}. " \
          'Only literal bytes are supported (uses SIMD memchr).'
  end

  literal = parsed[:bytes] || ''
  raise ValidationError, "L#{lineno}: ->[] resolved to empty bytes from '#{str}'" if literal.empty?

  # Length is counted in characters of the resolved literal
  return literal unless literal.length > 6

  raise ValidationError,
        "L#{lineno}: ->[#{str}] has #{literal.length} chars but maximum is 6 " \
        '(chained memchr limit). Split into multiple scans or restructure grammar.'
end
|
|
577
|
+
|
|
578
|
+
# Characters that MUST be quoted or written as a predefined class name in
# c[...], because they cause lexer/parser issues when used bare. Each maps
# to the suggested replacement text.
MUST_QUOTE_CHARS = {
  # Single quote - causes unterminated quote issues
  "'" => '<SQ>',
  # Pipe and square brackets - DSL delimiters
  '|' => '<P>',
  '[' => '<L>',
  ']' => '<R>',
  # Space - invisible, easy to miss
  ' ' => "' ' or <WS>"
}.freeze

# Characters that SHOULD be quoted for clarity (warnings, not errors).
# Each maps to the suggested predefined class name.
SHOULD_QUOTE_CHARS = {
  '{' => '<LB>',
  '}' => '<RB>',
  '(' => '<LP>',
  ')' => '<RP>',
  '"' => '<DQ>',
  '\\' => '<BS>'
}.freeze
|
|
597
|
+
|
|
598
|
+
# Validate character syntax in c[...] before parsing.
# Raises ValidationError for fatal issues.
#
# Accepted without further checks:
# - c[<...>]   - class syntax (space-separated tokens inside)
# - c['...']   - quoted string/char
# - c[:param]  - parameter reference
# - c[CLASS]   - SCREAMING_CASE predefined class (LETTER, DIGIT, LABEL_CONT, ...)
#
# Otherwise only bare alphanumeric/underscore/hyphen chars and properly
# quoted runs are allowed, e.g. c[abc]. Rejected examples:
# - c["]  - special chars must be quoted: c['"']
# - c[ ]  - spaces must be quoted: c[' ']
# - c[|]  - DSL chars must use escapes: c[<P>] or c['|']
#
# @param chars_str [String, nil] text inside c[...]
# @param lineno [Integer] source line used in error messages
# @raise [ValidationError] on the first problem found
def validate_char_syntax(chars_str, lineno)
  return if chars_str.nil? || chars_str.empty?

  well_formed =
    (chars_str.start_with?('<') && chars_str.end_with?('>')) ||
    (chars_str.start_with?("'") && chars_str.end_with?("'") && chars_str.length >= 2) ||
    chars_str.match?(/^:[a-z_]\w*$/i) ||
    chars_str.match?(/^[A-Z]+(?:_[A-Z]+)*$/)
  return if well_formed

  # <TOKEN> escape sequences used OUTSIDE of a proper <...> class wrapper
  if chars_str.match?(/<[A-Z]+>/)
    raise ValidationError, "Line #{lineno}: Escape sequence like <SQ>, <P> etc. found outside " \
                           "class wrapper in c[#{chars_str}]. " \
                           'Wrap everything in a class: c[<...>] not c[THING <ESC> ...]'
  end

  # Combined class + chars (e.g., LETTER'[.?!)
  if chars_str.match?(/^[A-Z]+(?:_[A-Z]+)*'/)
    class_name = chars_str[/^([A-Z_]+)/, 1]
    raise ValidationError, "Line #{lineno}: Invalid character syntax in c[#{chars_str}]. " \
                           'Bare quote after class name is ambiguous. ' \
                           "Use class syntax instead: c[<#{class_name} ...>]"
  end

  # Unterminated quotes
  if chars_str.count("'").odd?
    raise ValidationError, "Line #{lineno}: Unterminated quote in c[#{chars_str}]. " \
                           'Single quotes must be paired. ' \
                           "To match a literal quote, use c[<SQ>] or c['\\'']"
  end

  # Preferred spellings for characters that may not appear bare
  suggestions = {
    '|' => "c[<P>] or c['|']",
    '[' => "c[<L>] or c['[']",
    ']' => "c[<R>] or c[']']",
    '{' => "c[<LB>] or c['{']",
    '}' => "c[<RB>] or c['}']",
    '(' => "c[<LP>] or c['(']",
    ')' => "c[<RP>] or c[')']",
    '"' => "c[<DQ>] or c['\"']",
    ' ' => "c[<WS>] or c[' ']",
    "\t" => "c['\\t']",
    "\n" => "c['\\n']"
  }

  # Any character outside /A-Za-z0-9_-/ must sit inside single quotes.
  # Quote state is tracked while scanning: a character is quoted when an
  # odd number of quotes precede it. Backslashes are never reported here.
  inside_quotes = false
  chars_str.each_char do |ch|
    if ch == "'"
      inside_quotes = !inside_quotes
      next
    end
    next if inside_quotes || ch == '\\' || ch.match?(/[A-Za-z0-9_-]/)

    suggestion = suggestions.fetch(ch) { "c['#{ch}']" }
    raise ValidationError, "Line #{lineno}: Unquoted '#{ch.inspect[1...-1]}' in c[#{chars_str}]. " \
                           "Characters outside /A-Za-z0-9_-/ must be quoted. Use #{suggestion}"
  end
end
|
|
682
|
+
|
|
683
|
+
# Validate PREPEND commands for common mistakes.
# Catches PREPEND(param) where param is a known parameter name - that
# should be written PREPEND(:param).
#
# Only :prepend commands whose (stripped) value is a bare identifier equal
# to one of the parameter names are rejected; quoted literals, class
# syntax and the like are never flagged.
#
# @param commands [Array] AST commands of one case
# @param params [Array<String>] parameter names of the enclosing function
# @param lineno [Integer] source line used in the error message
# @raise [ValidationError] for the first offending PREPEND
def validate_prepend_commands(commands, params, lineno)
  return if params.empty?

  commands.each do |cmd|
    next if cmd.type != :prepend || cmd.value.nil?

    candidate = cmd.value.to_s.strip
    next unless candidate.match?(/^[a-z_]\w*$/i) && params.include?(candidate)

    raise ValidationError, "Line #{lineno}: PREPEND(#{candidate}) looks like a parameter reference. " \
                           "Use PREPEND(:#{candidate}) to reference the '#{candidate}' parameter, " \
                           "or PREPEND('#{candidate}') for a literal string."
  end
end
|
|
705
|
+
|
|
706
|
+
# Validate function call arguments for bare identifiers matching param names.
# Catches /func(param) where param is a known parameter - that should be
# written /func(:param).
#
# For every :call command with a parenthesised argument list, each argument
# is inspected. Arguments that are param refs, quoted, class syntax,
# numeric, all-uppercase names (COL, LINE, PREV - built-in vars), or that
# contain a space, dot or parenthesis are skipped. A remaining bare
# identifier equal to a parameter name is an error.
#
# @param commands [Array] AST commands of one case
# @param params [Array<String>] parameter names of the enclosing function
# @param lineno [Integer] source line used in the error message
# @raise [ValidationError] for the first offending argument
def validate_call_args(commands, params, lineno)
  return if params.empty?

  commands.each do |cmd|
    next unless cmd.type == :call && !cmd.value.nil?

    # cmd.value is like "text(prepend)" or "func" - extract args if present
    call_text = cmd.value.to_s
    next unless call_text.include?('(')

    arg_list = call_text[/\((.+)\)/, 1]
    next if arg_list.nil? || arg_list.empty?

    # Split on commas, respecting quotes and angle brackets
    tokenize_call_args_for_validation(arg_list).each do |raw|
      arg = raw.strip

      next if arg.start_with?(':', "'", '"', '<') # :param, 'literal', "literal", <CLASS>
      next if arg.match?(/^-?\d+$/) || arg.match?(/^[A-Z]+$/) # numeric, built-in vars
      next if arg.include?(' ') || arg.include?('.') || arg.include?('(') # expression, method call, function call

      # Bare identifier - only a problem when it names a parameter
      next unless arg.match?(/^[a-z_]\w*$/i) && params.include?(arg)

      raise ValidationError, "Line #{lineno}: /...(...#{arg}...) - bare identifier '#{arg}' matches a parameter name. " \
                             "Use ':#{arg}' to pass the parameter value, or \"'#{arg}'\" for a literal string."
    end
  end
end
|
|
747
|
+
|
|
748
|
+
# Tokenize call args for validation, respecting quotes and angle brackets
|
|
749
|
+
# Split a call's argument text on top-level commas (validation variant).
#
# A comma does not split while inside a single-quoted literal or inside a
# <...> class. Inside a quoted literal:
#   - a backslash makes the next character literal, so '\'' does not end
#     the literal early (a lone backslash must therefore be written '\\');
#   - '<' and '>' are ordinary characters and do not change class depth.
#
# @param args_str [String] text between the call's parentheses
# @return [Array<String>] stripped argument tokens in source order
def tokenize_call_args_for_validation(args_str)
  args = []
  current = +''
  in_quote = false
  escaped = false
  angle_depth = 0

  args_str.each_char do |c|
    if escaped
      # Character right after a backslash inside a literal: never special.
      current << c
      escaped = false
    elsif in_quote
      escaped = (c == '\\')
      in_quote = false if c == "'"
      current << c
    elsif c == ',' && angle_depth.zero?
      args << current.strip
      current = +''
    else
      case c
      when "'"
        in_quote = true
      when '<'
        angle_depth += 1
      when '>'
        # Stray closers are tolerated rather than driving the depth negative.
        angle_depth -= 1 if angle_depth.positive?
      end
      current << c
    end
  end

  args << current.strip unless current.empty?
  args
end
|
|
781
|
+
|
|
782
|
+
# Parse character specification into literal chars, special class, and/or param reference.
|
|
783
|
+
# Returns [chars_array, special_class_symbol, param_ref_string]
|
|
784
|
+
#
|
|
785
|
+
# Supports both legacy syntax and new characters.md syntax:
|
|
786
|
+
#
|
|
787
|
+
# Legacy (backwards compatible):
|
|
788
|
+
# "abc" -> [["a", "b", "c"], nil, nil]
|
|
789
|
+
# "LETTER" -> [nil, :letter, nil]
|
|
790
|
+
# "LETTER'[.?!" -> [["'", "[", ".", "?", "!"], :letter, nil]
|
|
791
|
+
# ":close" -> [nil, nil, "close"] (param reference)
|
|
792
|
+
#
|
|
793
|
+
# New syntax (characters.md):
|
|
794
|
+
# "'x'" -> [["x"], nil, nil] (quoted char)
|
|
795
|
+
# "'abc'" -> [["a", "b", "c"], nil, nil] (quoted string, decomposed)
|
|
796
|
+
# "<abc>" -> [["a", "b", "c"], nil, nil] (class with bare lowercase)
|
|
797
|
+
# "<LETTER>" -> [nil, :letter, nil] (class with predefined class)
|
|
798
|
+
# "<0-9>" -> [["0".."9"], nil, nil] (predefined range)
|
|
799
|
+
# "<LETTER 0-9 '_'>" -> [["_", "0".."9"], :letter, nil] (combined)
|
|
800
|
+
# "<:var>" -> [nil, nil, "var"] (variable in class)
|
|
801
|
+
# Parse character specification for c[...] using unified CharacterClass parser.
|
|
802
|
+
# Returns [chars_array, special_class_symbol, param_ref_string]
|
|
803
|
+
# Turn a c[...] character specification into its three components by
# delegating to CharacterClass.parse.
#
# @param chars_str [String, nil] raw specification text
# @param params [Array<String>] parameter names that may be referenced
# @return [Array] [chars_or_nil, special_class, param_ref]; a reference to
#   a name not listed in params is returned as the literal characters of
#   ":name" instead
def parse_chars(chars_str, params: [])
  return [nil, nil, nil] if chars_str.nil?

  parsed = CharacterClass.parse(chars_str)
  ref = parsed[:param_ref]

  # ":name" only counts as a reference when name is a declared parameter
  return [":#{ref}".chars, nil, nil] if ref && !params.include?(ref)

  literal_chars = parsed[:chars]
  [literal_chars.empty? ? nil : literal_chars, parsed[:special_class], ref]
end
|
|
819
|
+
|
|
820
|
+
# Legacy: Parse escape sequences in a quoted string.
|
|
821
|
+
# Kept for backwards compatibility but CharacterClass.parse_quoted_string is preferred.
|
|
822
|
+
# Expand backslash escapes in the body of a quoted string.
#
# Recognised: \n \t \r \\ \' plus \xHH (two hex digits) and \uXXXX (four
# hex digits). Any other escaped character stands for itself, as does an
# \x or \u that is not followed by enough hex digits. A backslash that is
# the last character is copied through unchanged.
#
# @param str [String, nil] text between the quotes
# @return [String] text with escapes expanded ('' for nil or empty input)
def parse_quoted_string(str)
  return '' if str.nil? || str.empty?

  simple = { 'n' => "\n", 't' => "\t", 'r' => "\r", '\\' => '\\', "'" => "'" }
  out = +''
  pos = 0
  limit = str.length

  while pos < limit
    ch = str[pos]

    # Ordinary character, or a backslash with nothing after it
    unless ch == '\\' && pos + 1 < limit
      out << ch
      pos += 1
      next
    end

    code = str[pos + 1]
    consumed = 2 # the backslash and the escape letter

    if simple.key?(code)
      out << simple[code]
    elsif code == 'x' && pos + 3 < limit && str[pos + 2..pos + 3].match?(/^[0-9A-Fa-f]{2}$/)
      out << str[pos + 2..pos + 3].to_i(16).chr
      consumed += 2
    elsif code == 'u' && pos + 5 < limit && str[pos + 2..pos + 5].match?(/^[0-9A-Fa-f]{4}$/)
      out << str[pos + 2..pos + 5].to_i(16).chr(Encoding::UTF_8)
      consumed += 4
    else
      out << code
    end

    pos += consumed
  end

  out
end
|
|
862
|
+
|
|
863
|
+
# Parse return value specification
|
|
864
|
+
# Returns hash with :emit_type, :emit_mode, :literal, :return_value
|
|
865
|
+
# Examples:
|
|
866
|
+
# nil or "" -> {} (default behavior)
|
|
867
|
+
# "TypeName" -> { emit_type: "TypeName", emit_mode: :bare }
|
|
868
|
+
# "TypeName(USE_MARK)" -> { emit_type: "TypeName", emit_mode: :mark }
|
|
869
|
+
# "TypeName(lit)" -> { emit_type: "TypeName", emit_mode: :literal, literal: "lit" }
|
|
870
|
+
# "varname" -> { return_value: "varname" } (for INTERNAL types returning a value)
|
|
871
|
+
# Classify a return specification.
#
# @param value [String, nil] e.g. "Type", "Type(USE_MARK)", "Type(lit)", "var"
# @return [Hash] one of
#   {}                                                  nil, empty or unrecognised
#   { emit_type:, emit_mode: :mark }                    "Type(USE_MARK)"
#   { emit_type:, emit_mode: :literal, literal: }       "Type(lit)" (lit goes through process_escapes)
#   { emit_type:, emit_mode: :bare }                    "Type"
#   { return_value: }                                   lowercase/underscore identifier
def parse_return_value(value)
  return {} if value.nil? || value.empty?

  # USE_MARK must be tried before the generic literal form, which would
  # otherwise swallow it.
  if (m = value.match(/^([A-Z]\w*)\(USE_MARK\)$/))
    { emit_type: m[1], emit_mode: :mark }
  elsif (m = value.match(/^([A-Z]\w*)\(([^)]+)\)$/))
    { emit_type: m[1], emit_mode: :literal, literal: process_escapes(m[2]) }
  elsif (m = value.match(/^([A-Z]\w*)$/))
    { emit_type: m[1], emit_mode: :bare }
  elsif value.match?(/^[a-z_]\w*$/)
    # Variable name: the function returns a computed value
    { return_value: value }
  else
    {}
  end
end
|
|
886
|
+
|
|
887
|
+
# Parse a call command value into name and args.
|
|
888
|
+
# Examples:
|
|
889
|
+
# "func" -> { name: "func", call_args: nil }
|
|
890
|
+
# "func(x, y)" -> { name: "func", call_args: "x, y" }
|
|
891
|
+
# "func(<R>)" -> { name: "func", call_args: "<R>" }
|
|
892
|
+
# "func())" -> { name: "func", call_args: ")" } (bare paren as arg)
|
|
893
|
+
# "error(Code)" -> { name: "error", call_args: "Code", is_error: true }
|
|
894
|
+
# Split a call command's text into the callee name and its raw arguments.
#
# The name is everything before the first '('. The arguments are
# everything after it, minus exactly one trailing ')' when present, so
# "func())" yields the bare ")" as its argument.
#
# @param value [String] e.g. "func", "func(x, y)", "error(Code)"
# @return [Hash] { name:, call_args: } with call_args nil when absent or
#   empty, plus is_error: true when the name is "error"
def parse_call_value(value)
  open_at = value.index('(')
  return { name: value, call_args: nil } if open_at.nil?

  name = value[0...open_at]
  tail = value[(open_at + 1)..]
  # Drop only ONE closing paren; any others belong to the arguments
  tail = tail[0...-1] if tail.end_with?(')')

  parsed = { name: name, call_args: (tail.empty? ? nil : tail) }
  parsed[:is_error] = true if name == 'error'
  parsed
end
|
|
914
|
+
|
|
915
|
+
# Infer SCAN optimization: if a state has a simple self-looping default case
|
|
916
|
+
# (only advance + transition, no side effects), the explicit character cases
|
|
917
|
+
# become SCAN targets.
|
|
918
|
+
# Work out which characters a state can SCAN for.
#
# Applies only when the state's default case is a simple self-loop (see
# simple_self_loop?). The result is the distinct characters of all
# non-default, non-conditional cases.
#
# @param _state unused
# @param cases [Array] cases responding to #default?, #conditional?, #chars
# @return [Array, nil] one to six characters, or nil when the optimisation
#   does not apply
def infer_scan_chars(_state, cases)
  fallback = cases.find(&:default?)
  return nil unless fallback
  return nil unless simple_self_loop?(fallback)

  targets = cases
            .select { |kase| !kase.default? && !kase.conditional? }
            .flat_map { |kase| kase.chars || [] }
            .uniq

  # Six is the ceiling: two chained memchr3 calls. Past that the chaining
  # costs more than it saves.
  return nil if targets.empty? || targets.size > 6

  targets
end
|
|
937
|
+
|
|
938
|
+
# Check if a case is a simple self-loop: only advance and/or transition (no calls, emits, etc.)
|
|
939
|
+
# This is the stricter check for SCAN optimization.
|
|
940
|
+
# Strict self-loop test used by the SCAN optimisation.
#
# True only when every command is :advance or :transition and at least one
# transition has no target (nil or empty value, i.e. stays in the same
# state). Any other command type disqualifies the case immediately.
#
# @param kase [#commands] case whose commands respond to #type and #args
# @return [Boolean]
def simple_self_loop?(kase)
  loops_back = false

  kase.commands.each do |cmd|
    kind = cmd.type
    next if kind == :advance
    # call, emit, mark, term, ... all have side effects
    return false unless kind == :transition

    target = cmd.args[:value] || cmd.args['value']
    loops_back = true if target.nil? || target.empty?
  end

  loops_back
end
|
|
958
|
+
|
|
959
|
+
# Check if a case has any self-transition (used for is_self_looping metadata)
|
|
960
|
+
# Looser companion to simple_self_loop?: reports whether the case contains
# at least one :transition command with no target (nil or empty value),
# whatever else the case does.
#
# @param kase [#commands] case whose commands respond to #type and #args
# @return [Boolean]
def has_self_transition?(kase)
  kase.commands.each do |cmd|
    next if cmd.type != :transition

    target = cmd.args[:value] || cmd.args['value']
    return true if target.nil? || target.empty?
  end

  false
end
|
|
968
|
+
|
|
969
|
+
# Infer expected closing delimiter from return cases.
|
|
970
|
+
# If ALL return cases match the same single character, that's the expected closer.
|
|
971
|
+
# Also check if TERM appears before return (emits_content_on_close).
|
|
972
|
+
# Deduce the closing delimiter a function expects from its return cases.
#
# A closer is reported only when every case containing a :return command
# matches exactly one literal character (not the default case, not a
# special class) and all of those characters are the same.
#
# @param states [Array] states whose #cases respond to #commands,
#   #default?, #special_class and #chars
# @return [Array] [closer_char_or_nil, emits_content]; emits_content is
#   true when some return case also has a :term command, and is always
#   false when no closer is found
def infer_expects(states)
  returning = states.flat_map do |state|
    state.cases.select { |kase| kase.commands.any? { |cmd| cmd.type == :return } }
  end
  return [nil, false] if returning.empty?

  # One entry per return case: its single literal char, or nil if it does
  # not qualify. (Conditional cases still match on a char, so they count.)
  closers = returning.map do |kase|
    single_char = !kase.default? && !kase.special_class && !kase.chars.nil? && kase.chars.length == 1
    single_char ? kase.chars.first : nil
  end

  return [nil, false] unless closers.all?
  return [nil, false] unless closers.uniq.length == 1

  has_term = returning.any? { |kase| kase.commands.any? { |cmd| cmd.type == :term } }
  [closers.first, has_term]
end
|
|
1011
|
+
|
|
1012
|
+
# Collect custom error codes from /error(code) calls across all functions
|
|
1013
|
+
# Gather every custom error code used by any case of any function.
#
# @param functions [Array] functions whose #states have #cases with #commands
# @return [Array<String>] distinct codes in sorted order
def collect_custom_error_codes(functions)
  seen = Set.new

  functions.flat_map(&:states).flat_map(&:cases).each do |kase|
    collect_error_codes_from_commands(kase.commands, seen)
  end

  seen.to_a.sort
end
|
|
1026
|
+
|
|
1027
|
+
# Add every custom error code found in commands to codes, descending into
# conditional clauses.
#
# A code comes from either an explicit :error command (its :value / 'value'
# argument) or a :call command flagged is_error (its :call_args).
#
# Entries nested under a conditional clause are accepted both as command
# objects and as plain Hashes with string keys 'type' and 'args'. The other
# command walkers in this file allow for that Hash form, so it is handled
# here too instead of failing on Hash#type.
#
# @param commands [Array] command objects (#type, #args) and/or Hashes
# @param codes [#<<] collector for the codes found
# @return [Array] commands
def collect_error_codes_from_commands(commands, codes)
  commands.each do |cmd|
    type, args =
      if cmd.is_a?(Hash)
        [cmd['type']&.to_sym, (cmd['args'] || {}).transform_keys(&:to_sym)]
      else
        [cmd.type, cmd.args]
      end

    case type
    when :error
      code = args[:value] || args['value']
      codes << code if code && !code.empty?
    when :call
      # /error(code) is represented as a :call with is_error: true
      next unless args[:is_error]

      code = args[:call_args]
      codes << code if code && !code.empty?
    when :conditional
      args[:clauses]&.each do |clause|
        collect_error_codes_from_commands(clause['commands'] || [], codes)
      end
    end
  end
end
|
|
1048
|
+
|
|
1049
|
+
# Infer local variables from assignments in function
|
|
1050
|
+
# Build the table of local variables a function assigns to.
#
# Entry actions are scanned first (one command at a time), then the
# commands of every case of every state.
#
# @param func [#entry_actions, #states] function to scan; entry_actions
#   may be nil
# @return [Hash] variable name => type, as filled in by
#   collect_locals_from_commands
def infer_locals(func)
  {}.tap do |found|
    func.entry_actions&.each do |action|
      collect_locals_from_commands([action], found)
    end

    func.states.each do |state|
      state.cases.each { |kase| collect_locals_from_commands(kase.commands, found) }
    end
  end
end
|
|
1067
|
+
|
|
1068
|
+
# Record the target of every assignment command in locals, descending into
# AST::Conditional clauses.
#
# Only :assign, :add_assign and :sub_assign commands whose value is a Hash
# with a :var entry count. A variable already present keeps its type; new
# ones get :i32.
#
# @param commands [Array] AST::Conditional nodes, objects responding to
#   #type and #value, or anything else (ignored)
# @param locals [Hash] name => type table, updated in place
# @return [Array] commands
def collect_locals_from_commands(commands, locals)
  commands.each do |cmd|
    if cmd.is_a?(AST::Conditional)
      cmd.clauses&.each { |clause| collect_locals_from_commands(clause.commands, locals) }
      next
    end

    next unless cmd.respond_to?(:type)
    next unless %i[assign add_assign sub_assign].include?(cmd.type)

    target = cmd.value
    next unless target.is_a?(Hash) && target[:var]

    locals[target[:var]] ||= :i32 # Default type
  end
end
|
|
1084
|
+
|
|
1085
|
+
# Infer parameter types from usage in states.
|
|
1086
|
+
# - Params used in |c[:x]| are bytes (u8) for single-byte comparison
|
|
1087
|
+
# - Params used in PREPEND(:x) are byte slices (&'static [u8]) for prepending
|
|
1088
|
+
# - Others default to i32
|
|
1089
|
+
# Assign a type to each parameter from how the states use it.
#
#   :byte  - the param is a case's param_ref, or a case condition compares
#            it with == / != against something starting with a single quote
#   :bytes - the param is the param_ref of a :prepend_param command
#   :i32   - anything else (so "param == 0" stays numeric)
#
# Within one case the prepend check runs last, so it wins over :byte.
#
# @param params [Array<String>] parameter names
# @param states [Array] states whose cases respond to #param_ref,
#   #condition and #commands
# @return [Hash] param name => :i32 / :byte / :bytes ({} without params)
def infer_param_types(params, states)
  return {} if params.empty?

  types = params.each_with_object({}) { |name, acc| acc[name] = :i32 }

  states.each do |state|
    state.cases.each do |kase|
      # Character match against the param
      ref = kase.param_ref
      types[ref] = :byte if ref && types.key?(ref)

      # Condition of the form  param == 'x'  or  'x' == param
      cond = kase.condition
      if cond
        params.each do |name|
          quoted = Regexp.escape(name)
          compared = cond.match?(/\b#{quoted}\s*[!=]=\s*'/) ||
                     cond.match?(/'\s*[!=]=\s*#{quoted}\b/)
          types[name] = :byte if compared && types.key?(name)
        end
      end

      # PREPEND(:param)
      kase.commands.each do |cmd|
        next unless cmd.type == :prepend_param && cmd.args[:param_ref]

        name = cmd.args[:param_ref]
        types[name] = :bytes if types.key?(name)
      end
    end
  end

  types
end
|
|
1128
|
+
|
|
1129
|
+
# Infer param types from call-site values AND propagate from callees.
|
|
1130
|
+
# If a function is called with a value that must be a byte slice (currently only the empty class <>), that param becomes :bytes.
|
|
1131
|
+
# If bar calls foo(:x) and foo's param is :bytes, then bar's :x should be :bytes.
|
|
1132
|
+
# Refine parameter types using call sites, mutating each function's
# param_types in place.
#
# Pass 1: when a call passes a value that bytes_like_value? accepts, the
#         callee's parameter becomes :bytes - but only if it is still the
#         default :i32, so a :byte inferred from usage is never overridden.
# Pass 2: when a call forwards one of the caller's own parameters (":x"),
#         the callee's type flows back to the caller: :bytes always, :byte
#         only onto a parameter that is still :i32. Repeated until nothing
#         changes so types travel through chains of calls.
#
# Arguments are split with tokenize_call_args, so a comma inside a quoted
# literal or a <...> class does not shift later arguments onto the wrong
# parameter.
#
# Only the commands directly inside state cases are examined.
#
# @param functions [Array] functions responding to #name, #params,
#   #param_types and #states
# @return [Array] the same functions array
def propagate_param_types(functions)
  func_by_name = functions.to_h { |f| [f.name, f] }

  # First pass: literal values at call sites
  each_call_site_argument(functions, func_by_name) do |_caller, target, arg, target_param|
    next unless bytes_like_value?(arg) && target.param_types[target_param] == :i32

    target.param_types[target_param] = :bytes
  end

  # Second pass: callee -> caller, to a fixed point
  loop do
    changed = false

    each_call_site_argument(functions, func_by_name) do |caller_func, target, arg, target_param|
      next unless arg.match?(/^:(\w+)$/)

      our_param = arg[1..]
      next unless caller_func.param_types.key?(our_param)

      target_type = target.param_types[target_param]
      our_type = caller_func.param_types[our_param]

      if target_type == :bytes && our_type != :bytes
        caller_func.param_types[our_param] = :bytes
        changed = true
      elsif target_type == :byte && our_type == :i32
        caller_func.param_types[our_param] = :byte
        changed = true
      end
    end

    break unless changed
  end

  functions
end

# Yield (caller, callee, argument, callee_param) for every argument of
# every :call command that targets a known function and has call_args.
# Arguments beyond the callee's parameter list are skipped.
def each_call_site_argument(functions, func_by_name)
  functions.each do |func|
    func.states.each do |state|
      state.cases.each do |kase|
        kase.commands.each do |cmd|
          next unless cmd.type == :call && cmd.args[:call_args]

          target = func_by_name[cmd.args[:name]]
          next unless target

          tokenize_call_args(cmd.args[:call_args]).zip(target.params).each do |arg, target_param|
            yield func, target, arg, target_param if target_param
          end
        end
      end
    end
  end
end
|
|
1206
|
+
|
|
1207
|
+
# Check if a value looks like a bytes literal.
|
|
1208
|
+
# These are DSL escape sequences and quoted strings that are clearly
|
|
1209
|
+
# meant to be byte content, not numeric values.
|
|
1210
|
+
# Note: Numeric values like 0 or -1 are NOT bytes-like - they're sentinels.
|
|
1211
|
+
# PREPEND params get typed as :bytes from infer_param_types (PREPEND usage),
|
|
1212
|
+
# not from call-site inference.
|
|
1213
|
+
# Check if a value MUST be a byte slice (not a single byte).
|
|
1214
|
+
# Only empty class <> definitively requires :bytes type.
|
|
1215
|
+
# Single-char values like '<P>' or '|' could be either :byte or :bytes,
|
|
1216
|
+
# so their type should be inferred from usage, not from call-site values.
|
|
1217
|
+
# True only for the empty class literal "<>", the one call-site value
# treated as requiring a byte-slice parameter.
#
# @param arg [String] a single call argument, already stripped
# @return [Boolean]
def bytes_like_value?(arg)
  arg == '<>'
end
|
|
1218
|
+
|
|
1219
|
+
# Collect prepend values by tracing call sites to functions with PREPEND(:param).
|
|
1220
|
+
# Returns updated functions with prepend_values filled in.
|
|
1221
|
+
# Attach to every function that uses PREPEND(:param) the sorted set of
# byte values its callers pass.
#
# Runs propagate_param_types first, then:
#   1. maps each function name to the param_ref of its :prepend_param
#      command (if a function has several, the last one seen is kept);
#   2. collects the values found at call sites via
#      collect_call_values_from_states;
#   3. rebuilds each such function as a new IR::Function whose
#      prepend_values is { param_name => sorted_values }.
#
# @param functions [Array<IR::Function>]
# @return [Array] functions, with PREPEND users replaced by rebuilt
#   copies; all others are returned as the same objects
def collect_prepend_values(functions)
  functions = propagate_param_types(functions)
  func_by_name = functions.to_h { |f| [f.name, f] }

  # func_name -> param_name
  prepend_params = functions.each_with_object({}) do |func, found|
    func.states.flat_map(&:cases).flat_map(&:commands).each do |cmd|
      found[func.name] = cmd.args[:param_ref] if cmd.type == :prepend_param
    end
  end
  return functions if prepend_params.empty?

  prepend_values = Hash.new { |hash, key| hash[key] = Set.new }
  functions.each do |func|
    collect_call_values_from_states(func.states, prepend_params, func_by_name, prepend_values)
  end

  functions.map do |func|
    next func unless prepend_params.key?(func.name)

    IR::Function.new(
      name: func.name,
      return_type: func.return_type,
      params: func.params,
      param_types: func.param_types,
      locals: func.locals,
      states: func.states,
      eof_handler: func.eof_handler,
      entry_actions: func.entry_actions,
      emits_events: func.emits_events,
      expects_char: func.expects_char,
      emits_content_on_close: func.emits_content_on_close,
      prepend_values: { prepend_params[func.name] => prepend_values[func.name].to_a.sort },
      lineno: func.lineno
    )
  end
end
|
|
1275
|
+
|
|
1276
|
+
# Feed every command list of every state - each case's commands, then the
# state's EOF handler (treated as empty when nil) - to
# collect_call_values_from_commands.
#
# @param states [Array] states responding to #cases and #eof_handler
# @param prepend_params [Hash] func_name => prepend param name
# @param func_by_name [Hash] func_name => function (forwarded)
# @param prepend_values [Hash] func_name => collected values, updated in place
# @return [Array] states
def collect_call_values_from_states(states, prepend_params, func_by_name, prepend_values)
  states.each do |state|
    command_lists = state.cases.map(&:commands) << (state.eof_handler || [])

    command_lists.each do |cmds|
      collect_call_values_from_commands(cmds, prepend_params, func_by_name, prepend_values)
    end
  end
end
|
|
1284
|
+
|
|
1285
|
+
# Record the byte value passed by each call to a PREPEND-using function,
# descending into conditional clauses.
#
# For a :call whose callee is a key of prepend_params, the call's
# call_args is run through parse_byte_literal and any non-nil result is
# added to prepend_values[callee]. Nested clause commands given as Hashes
# ('type'/'args' string keys) are turned into IR::Command first.
#
# NOTE(review): the whole call_args string is parsed as one literal and
# func_by_name is only passed along, so a multi-argument call presumably
# contributes nothing - confirm whether that is intended.
#
# @param commands [Array] commands responding to #type and #args
# @param prepend_params [Hash] func_name => prepend param name
# @param func_by_name [Hash] forwarded to recursive calls
# @param prepend_values [Hash] func_name => collection responding to #<<
# @return [Array] commands
def collect_call_values_from_commands(commands, prepend_params, func_by_name, prepend_values)
  commands.each do |cmd|
    kind = cmd.type

    if kind == :call
      callee = cmd.args[:name]
      next unless prepend_params.key?(callee)

      literal = parse_byte_literal(cmd.args[:call_args])
      prepend_values[callee] << literal if literal
    elsif kind == :conditional
      cmd.args[:clauses]&.each do |clause|
        nested = (clause['commands'] || []).map do |raw|
          next raw unless raw.is_a?(Hash)

          IR::Command.new(type: raw['type'].to_sym, args: raw['args'].transform_keys(&:to_sym))
        end
        collect_call_values_from_commands(nested, prepend_params, func_by_name, prepend_values)
      end
    end
  end
end
|
|
1304
|
+
|
|
1305
|
+
# Transform call arguments based on target function parameter types.
|
|
1306
|
+
# For :bytes params, generates b"..." format; for :byte params, b'.' format.
|
|
1307
|
+
# Return a copy of the function list in which every case's commands and
# every state's EOF handler have been passed through
# transform_commands_args.
#
# All functions, states and cases are rebuilt as new IR objects; every
# field other than the transformed command lists is carried over as is.
# A state's eof_handler becomes nil when its transformed list is empty.
# The function-level eof_handler is copied, not transformed.
#
# @param functions [Array<IR::Function>]
# @return [Array<IR::Function>]
def transform_call_args_by_type(functions)
  func_by_name = functions.to_h { |f| [f.name, f] }

  rebuild_case = lambda do |kase|
    IR::Case.new(
      chars: kase.chars,
      special_class: kase.special_class,
      param_ref: kase.param_ref,
      condition: kase.condition,
      substate: kase.substate,
      commands: transform_commands_args(kase.commands, func_by_name),
      lineno: kase.lineno
    )
  end

  rebuild_state = lambda do |state|
    cases = state.cases.map(&rebuild_case)
    eof_commands = transform_commands_args(state.eof_handler || [], func_by_name)

    IR::State.new(
      name: state.name,
      cases: cases,
      eof_handler: eof_commands.empty? ? nil : eof_commands,
      scan_chars: state.scan_chars,
      is_self_looping: state.is_self_looping,
      has_default: state.has_default,
      is_unconditional: state.is_unconditional,
      newline_injected: state.newline_injected,
      lineno: state.lineno
    )
  end

  functions.map do |func|
    IR::Function.new(
      name: func.name,
      return_type: func.return_type,
      params: func.params,
      param_types: func.param_types,
      locals: func.locals,
      states: func.states.map(&rebuild_state),
      eof_handler: func.eof_handler,
      entry_actions: func.entry_actions,
      emits_events: func.emits_events,
      expects_char: func.expects_char,
      emits_content_on_close: func.emits_content_on_close,
      prepend_values: func.prepend_values,
      lineno: func.lineno
    )
  end
end
|
|
1357
|
+
|
|
1358
|
+
# Map a command list to one whose call arguments are rewritten for their
# callee.
#
#   - :call with call_args and a callee found in func_by_name: replaced by
#     a new IR::Command whose call_args comes from transform_args_for_target
#   - :conditional: replaced by a new IR::Command whose only argument is
#     :clauses, each rebuilt as { 'condition', 'commands' } with its
#     commands transformed recursively (Hash entries become IR::Command)
#   - anything else: returned as the same object
#
# @param commands [Array] commands responding to #type and #args
# @param func_by_name [Hash] func_name => function
# @return [Array]
def transform_commands_args(commands, func_by_name)
  commands.map do |cmd|
    kind = cmd.type

    if kind == :call && cmd.args[:call_args]
      callee = func_by_name[cmd.args[:name]]
      next cmd unless callee

      rewritten = transform_args_for_target(cmd.args[:call_args], callee)
      IR::Command.new(type: cmd.type, args: cmd.args.merge(call_args: rewritten))
    elsif kind == :conditional
      clauses = cmd.args[:clauses]&.map do |clause|
        nested = (clause['commands'] || []).map do |raw|
          next raw unless raw.is_a?(Hash)

          IR::Command.new(type: raw['type'].to_sym, args: raw['args'].transform_keys(&:to_sym))
        end

        { 'condition' => clause['condition'], 'commands' => transform_commands_args(nested, func_by_name) }
      end

      IR::Command.new(type: cmd.type, args: { clauses: clauses })
    else
      cmd
    end
  end
end
|
|
1381
|
+
|
|
1382
|
+
# Transform call arguments based on target function's parameter types.
|
|
1383
|
+
# Uses CharacterClass for unified parsing, then converts to appropriate Rust format.
|
|
1384
|
+
# Rewrite a call's argument list to suit the callee's parameter types.
#
# Arguments are paired with the callee's params by position:
#   :bytes param - integer literal => b"" ; otherwise
#                  CharacterClass.to_rust_bytes(CharacterClass.parse(arg))
#   :byte  param - integer literal => "<n>u8" ; otherwise
#                  CharacterClass.to_rust_byte(CharacterClass.parse(arg))
#   other        - argument unchanged
# Arguments beyond the last param are passed through unchanged.
#
# @param args_str [String, nil] raw argument text
# @param target_func [#params, #param_types] the callee
# @return [String, nil] arguments joined with ", "; args_str itself when
#   it is nil or the callee has no params
def transform_args_for_target(args_str, target_func)
  return args_str if args_str.nil? || target_func.params.empty?

  tokens = tokenize_call_args(args_str)
  types = target_func.param_types

  converted = tokens.zip(target_func.params).map do |arg, param|
    next arg unless param

    kind = types[param]
    # Integer literals are numbers (sentinels), never characters
    numeric = arg.match?(/^-?\d+$/)

    if kind == :bytes
      numeric ? 'b""' : CharacterClass.to_rust_bytes(CharacterClass.parse(arg))
    elsif kind == :byte
      numeric ? "#{arg}u8" : CharacterClass.to_rust_byte(CharacterClass.parse(arg))
    else
      arg # :i32 or unknown
    end
  end

  converted.join(', ')
end
|
|
1417
|
+
|
|
1418
|
+
# Tokenize call arguments respecting quotes (commas inside quotes don't split)
|
|
1419
|
+
# Split a call's argument text on top-level commas.
#
# A comma does not split while inside a single-quoted literal or inside a
# <...> class. Inside a quoted literal:
#   - a backslash makes the next character literal, so '\'' does not end
#     the literal early (a lone backslash must therefore be written '\\');
#   - '<' and '>' are ordinary characters and do not change class depth.
#
# @param args_str [String] text between the call's parentheses
# @return [Array<String>] stripped argument tokens in source order
def tokenize_call_args(args_str)
  args = []
  current = +''
  in_quote = false
  escaped = false
  angle_depth = 0

  args_str.each_char do |c|
    if escaped
      # Character right after a backslash inside a literal: never special.
      current << c
      escaped = false
    elsif in_quote
      escaped = (c == '\\')
      in_quote = false if c == "'"
      current << c
    elsif c == ',' && angle_depth.zero?
      args << current.strip
      current = +''
    else
      case c
      when "'"
        in_quote = true
      when '<'
        angle_depth += 1
      when '>'
        # Stray closers are tolerated rather than driving the depth negative.
        angle_depth -= 1 if angle_depth.positive?
      end
      current << c
    end
  end

  args << current.strip unless current.empty?
  args
end
|
|
1451
|
+
|
|
1452
|
+
# Parse a call argument into a byte literal string for the template.
|
|
1453
|
+
# Supports both legacy syntax and new characters.md syntax.
|
|
1454
|
+
# Convert one call argument into the byte-literal text used for PREPEND.
#
# Accepted forms, tried in this order:
#   "0"                    => nil (no prepend)
#   <P> <L> <R> <LB> <RB> <LP> <RP> <BS>  => the named delimiter
#   'x' or "x"             => x
#   '\x' (quoted escape)   => parse_quoted_string of the escape
#   a single bare char     => itself
# Anything else, including nil and "", gives nil.
#
# @param arg [String, nil]
# @return [String, nil]
def parse_byte_literal(arg)
  return nil if arg.nil? || arg.empty?
  return nil if arg == '0' # 0 means no prepend

  named = {
    '<P>' => '|',
    '<L>' => '[',
    '<R>' => ']',
    '<LB>' => '{',
    '<RB>' => '}',
    '<LP>' => '(',
    '<RP>' => ')',
    '<BS>' => '\\\\'
  }
  return named[arg] if named.key?(arg)

  if (m = arg.match(/^'(.)'$/)) || (m = arg.match(/^"(.)"$/))
    # Quoted single character
    m[1]
  elsif (m = arg.match(/^'\\(.)'$/))
    # Quoted escape such as '\''
    parse_quoted_string("\\#{m[1]}")
  elsif arg.match?(/^.$/)
    # Legacy bare character
    arg
  end
end
|
|
1478
|
+
end
|
|
1479
|
+
end
|