descent 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +285 -0
- data/README.md +583 -0
- data/SYNTAX.md +334 -0
- data/exe/descent +15 -0
- data/lib/descent/ast.rb +69 -0
- data/lib/descent/generator.rb +489 -0
- data/lib/descent/ir.rb +98 -0
- data/lib/descent/ir_builder.rb +1479 -0
- data/lib/descent/lexer.rb +308 -0
- data/lib/descent/parser.rb +450 -0
- data/lib/descent/railroad.rb +272 -0
- data/lib/descent/templates/rust/_command.liquid +174 -0
- data/lib/descent/templates/rust/parser.liquid +1163 -0
- data/lib/descent/tools/debug.rb +115 -0
- data/lib/descent/tools/diagram.rb +48 -0
- data/lib/descent/tools/generate.rb +47 -0
- data/lib/descent/tools/validate.rb +56 -0
- data/lib/descent/validator.rb +231 -0
- data/lib/descent/version.rb +5 -0
- data/lib/descent.rb +34 -0
- metadata +101 -0
|
@@ -0,0 +1,489 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'liquid'
|
|
4
|
+
|
|
5
|
+
module Descent
|
|
6
|
+
# Custom Liquid filters for code generation.
|
|
7
|
+
# Liquid filters that translate descent DSL fragments into Rust source text.
#
# The module is handed to Liquid's register_filter by Generator#generate, so
# its public methods are what templates can call. No additional public
# helpers are introduced here.
module LiquidFilters
  # DSL placeholder -> Rust byte literal. '<>' stands for the empty byte slice.
  ESCAPE_SEQUENCES = {
    '<P>' => "b'|'",
    '<R>' => "b']'",
    '<L>' => "b'['",
    '<RB>' => "b'}'",
    '<LB>' => "b'{'",
    '<RP>' => "b')'",
    '<LP>' => "b'('",
    '<BS>' => "b'\\\\'",
    '<SQ>' => "b'\\''",
    '<DQ>' => "b'\"'",
    '<NL>' => "b'\\n'",
    '<WS>' => "b' '",
    '<>' => 'b""'
  }.freeze

  # Render a single character as a Rust byte literal.
  #
  # Newline, tab, carriage return, backslash and single quote are written as
  # backslash escapes; every other value is interpolated unchanged.
  # nil yields the placeholder literal b'?'.
  #
  # Examples: "\n" -> b'\n', "|" -> b'|', " " -> b' '
  def escape_rust_char(char)
    return "b'?'" if char.nil?

    escaped = case char
              when "\n" then '\\n'
              when "\t" then '\\t'
              when "\r" then '\\r'
              when '\\' then '\\\\'
              when "'" then "\\'"
              else char
              end
    "b'#{escaped}'"
  end

  # Convert snake_case, kebab-case, space separated or camelCase text to
  # PascalCase. nil yields ''.
  #
  # The input is split on '_', '-', whitespace and on every lowercase ->
  # uppercase boundary; each piece is then passed through String#capitalize,
  # which also lowercases the remainder of the piece.
  #
  # Examples: "identity" -> "Identity", "after_name" -> "AfterName",
  #           "UnclosedInterpolation" -> "UnclosedInterpolation"
  def pascalcase(str)
    return '' if str.nil?

    str.to_s.split(/[_\s-]|(?<=[a-z])(?=[A-Z])/).map(&:capitalize).join
  end

  # Transform a DSL expression into Rust. nil yields ''.
  #
  # - /function(args) -> self.parse_function(transformed_args, on_event)
  # - /function() and /function -> self.parse_function(on_event)
  # - COL -> self.col(), LINE -> self.line as i32, PREV -> self.prev()
  # - :param -> param
  # - 'x' / '\t' -> b'x' / b'\t' (skipped when already prefixed with b)
  # - <R>, <RB>, <P>, ... -> byte literals from ESCAPE_SEQUENCES
  def rust_expr(str)
    return '' if str.nil?

    # Calls are rewritten BEFORE COL/LINE/PREV are expanded: the argument
    # pattern [^)]* would otherwise stop at the ')' of an expanded self.col().
    result = str.to_s.gsub(%r{/(\w+)\(([^)]*)\)}) do
      func = ::Regexp.last_match(1)
      raw_args = ::Regexp.last_match(2)
      args = expand_special_vars(transform_call_args(raw_args))
      # An empty argument list must not leave a dangling ", on_event".
      args.empty? ? "self.parse_#{func}(on_event)" : "self.parse_#{func}(#{args}, on_event)"
    end
    result = result.gsub(%r{/(\w+)}) { "self.parse_#{::Regexp.last_match(1)}(on_event)" }

    # Expand specials and standalone arguments in what is left of the expression.
    result = transform_call_args(expand_special_vars(result))

    result
      .gsub(/(?<!b)'(\\.|.)'/, "b'\\1'") # char literal -> byte literal unless already b'...'
      .gsub(/<[A-Z]+>/) { |m| ESCAPE_SEQUENCES[m] || m } # remaining <X> placeholders
  end

  # Expand the special variables COL, LINE, PREV and strip the leading colon
  # from :param references.
  def expand_special_vars(str)
    str
      .gsub(/\bCOL\b/, 'self.col()')
      .gsub(/\bLINE\b/, 'self.line as i32')
      .gsub(/\bPREV\b/, 'self.prev()')
      .gsub(/:([a-z_]\w*)/i) { ::Regexp.last_match(1) } # :param -> param
  end

  # Transform a comma separated argument list; the result is joined with ', '.
  #
  # Each argument is stripped and then, in order:
  # - looked up verbatim in ESCAPE_SEQUENCES (<R>, <RB>, <>, ...)
  # - :param                        -> param
  # - bare " or '                   -> b'"' / b'\''
  # - integer, optionally negative  -> unchanged
  # - 'x' or "x" (one character)    -> byte literal for x
  # - one punctuation character     -> byte literal
  # - anything else                 -> unchanged
  #
  # Byte literals are produced by escape_rust_char so a backslash or single
  # quote is escaped (b'\\', b'\'') rather than emitted as the invalid b'\'
  # or b'''. Patterns are anchored with \A/\z so they test the whole argument
  # and not just one line of it.
  #
  # Note: the list is split on every ',', so a comma can never arrive here as
  # an argument of its own.
  def transform_call_args(args)
    args.split(',').map do |raw|
      arg = raw.strip
      next ESCAPE_SEQUENCES[arg] if ESCAPE_SEQUENCES.key?(arg)

      case arg
      when /\A:(\w+)\z/ then ::Regexp.last_match(1)
      when '"', "'" then escape_rust_char(arg)
      when /\A-?\d+\z/ then arg
      when /\A'(.)'\z/, /\A"(.)"\z/ then escape_rust_char(::Regexp.last_match(1))
      when %r{\A[!;:#*\-_<>/\\@$%^&+=?,.]\z} then escape_rust_char(arg)
      else arg
      end
    end.join(', ')
  end
end
|
|
123
|
+
|
|
124
|
+
# Custom file system for Liquid partials.
|
|
125
|
+
# File system adapter used for Liquid partials: maps a partial name to a file
# inside a single template directory.
class TemplateFileSystem
  # base_path - directory that holds the partial files.
  def initialize(base_path)
    @base_path = base_path
  end

  # Return the contents of the partial named +template_path+.
  #
  # The file looked up is "<base_path>/_<template_path>.liquid" (partials carry
  # an underscore prefix). Raises Liquid::FileSystemError when it does not exist.
  def read_template_file(template_path)
    partial = File.join(@base_path, "_#{template_path}.liquid")
    return File.read(partial) if File.exist?(partial)

    raise Liquid::FileSystemError, "No such template: #{partial}"
  end
end
|
|
136
|
+
|
|
137
|
+
# Renders IR to target language code using Liquid templates.
|
|
138
|
+
#
|
|
139
|
+
# All target-specific logic lives in templates, not here.
|
|
140
|
+
class Generator
|
|
141
|
+
TEMPLATE_DIR = File.expand_path('templates', __dir__)
|
|
142
|
+
|
|
143
|
+
# Store the inputs for one generation run.
#
# ir        - IR object; build_context reads name, entry_point, types,
#             functions, keywords and custom_error_codes from it
# target    - name of the template sub-directory under TEMPLATE_DIR
# trace     - passed to the template context as 'trace'
# streaming - passed to the template context as 'streaming'
# options   - any further keyword arguments, kept as given
def initialize(ir, target:, trace: false, streaming: true, **options)
  @ir, @target = ir, target
  @trace, @streaming = trace, streaming
  @options = options
end
|
|
150
|
+
|
|
151
|
+
# Render the IR with the target's parser.liquid template and return the
# generated source as a String.
#
# Raises Error when "<TEMPLATE_DIR>/<target>/parser.liquid" does not exist.
def generate
  target_dir = File.join(TEMPLATE_DIR, @target.to_s)
  entry_template = File.join(target_dir, 'parser.liquid')

  unless File.exist?(entry_template)
    raise Error, "No template for target: #{@target} (looked in #{entry_template})"
  end

  # Liquid environment carrying the custom filters and the partial loader.
  liquid_env = Liquid::Environment.build do |config|
    config.register_filter(LiquidFilters)
    config.file_system = TemplateFileSystem.new(target_dir)
  end

  parsed = Liquid::Template.parse(File.read(entry_template), environment: liquid_env)
  rendered = parsed.render(
    build_context,
    strict_variables: false, # partials may not receive every variable
    strict_filters: true
  )

  # Whitespace clean-up, applied in this exact order.
  cleanup_steps = [
    [/^[ \t]+$/, ''], # empty out whitespace-only lines
    [/\n{2,}/, "\n"], # collapse every run of blank lines
    [%r{^(//.*)\n(use |pub |impl )}, "\\1\n\n\\2"], # blank line between a // comment and use/pub/impl
    [%r{(\})\n([ \t]*(?://|#\[|pub |fn ))}, "\\1\n\n\\2"] # blank line after } before a new item
  ]
  cleanup_steps.reduce(rendered) { |text, (pattern, replacement)| text.gsub(pattern, replacement) }
end
|
|
178
|
+
|
|
179
|
+
private
|
|
180
|
+
|
|
181
|
+
# Unicode character classes that require the unicode-xid crate
|
|
182
|
+
UNICODE_CLASSES = %w[xid_start xid_cont xlbl_start xlbl_cont].freeze
|
|
183
|
+
|
|
184
|
+
# Assemble the string-keyed Hash handed to the Liquid template.
#
# Besides the serialized IR it carries one 'uses_<helper>' flag per helper
# (taken from analyze_helper_usage), 'uses_unicode', 'max_scan_arity' and the
# 'trace' / 'streaming' settings given to the constructor.
def build_context
  functions = @ir.functions.map { |fn| function_to_hash(fn) }
  usage = analyze_helper_usage(functions)

  context = {
    'parser' => @ir.name,
    'entry_point' => @ir.entry_point,
    'types' => @ir.types.map { |type| type_to_hash(type) },
    'functions' => functions,
    'keywords' => @ir.keywords.map { |kw| keywords_to_hash(kw) },
    'custom_error_codes' => @ir.custom_error_codes,
    'trace' => @trace,
    'uses_unicode' => uses_unicode_classes?(functions)
  }

  # Helper usage flags, so the template can leave out helpers nobody calls.
  %i[col prev set_term span letter label_cont digit hex_digit ws nl].each do |helper|
    context["uses_#{helper}"] = usage[helper]
  end

  context['max_scan_arity'] = usage[:max_scan_arity]
  context['streaming'] = @streaming
  context
end
|
|
211
|
+
|
|
212
|
+
# Analyze which helper methods are actually used by the generated code.
|
|
213
|
+
# Returns a hash of usage flags that the template uses for conditional emission.
|
|
214
|
+
# Work out which helper methods the generated code needs.
#
# functions_data - Array of function hashes as built by function_to_hash.
#
# Returns a Hash of symbol flags (col, prev, set_term, span, letter,
# label_cont, digit, hex_digit, ws, nl) plus :max_scan_arity, the largest
# scan_chars size seen on a scannable state (or advance_to value length
# recorded by check_expressions_in_function).
def analyze_helper_usage(functions_data)
  flags = %i[col prev set_term span letter label_cont digit hex_digit ws nl].to_h { |name| [name, false] }
  flags[:max_scan_arity] = 0

  functions_data.each do |func|
    # COL / PREV / TERM / advance_to usage inside commands and conditions
    check_expressions_in_function(func, flags)

    func['states'].each do |state|
      if state['scannable'] && state['scan_chars']
        width = state['scan_chars'].size
        flags[:max_scan_arity] = width if width > flags[:max_scan_arity]
      end

      # Special character classes referenced by the state's cases
      state['cases'].each { |kase| check_special_class(kase['special_class'], flags) }
    end
  end

  # span is switched on unconditionally.
  flags[:span] = true
  flags
end
|
|
244
|
+
|
|
245
|
+
# Check expressions in conditions/commands for COL/PREV usage
|
|
246
|
+
# Scan one function hash and update +usage+ in place.
#
# For every command returned by collect_all_commands:
# - args['condition'] is checked for COL/PREV
# - for 'conditional' commands the condition of every clause is checked too
#   (clause conditions live under args['clauses'], not args['condition'])
# - 'call' commands: args['call_args'] is checked
# - assign / add_assign / sub_assign: args['expr'] is checked
# - 'term' sets usage[:set_term]
# - 'advance_to' raises usage[:max_scan_arity] to the length of args['value']
#
# Finally the 'condition' of every case in every state is checked.
def check_expressions_in_function(func, usage)
  collect_all_commands(func).each do |cmd|
    args = cmd['args'] || {}
    type = cmd['type']

    check_expression(args['condition'], usage)

    if type == 'conditional'
      (args['clauses'] || []).each { |clause| check_expression(clause['condition'], usage) }
    end

    check_expression(args['call_args'], usage) if type == 'call'
    check_expression(args['expr'], usage) if %w[assign add_assign sub_assign].include?(type)

    # Any TERM command needs set_term.
    usage[:set_term] = true if type == 'term'

    # Explicit ->[chars]: the scan helper must accept that many characters.
    if type == 'advance_to' && args['value']
      usage[:max_scan_arity] = [usage[:max_scan_arity], args['value'].length].max
    end
  end

  func['states'].each do |state|
    state['cases'].each { |kase| check_expression(kase['condition'], usage) }
  end
end
|
|
278
|
+
|
|
279
|
+
# Collect all commands from a function (including nested in conditionals)
|
|
280
|
+
# Return a flat Array of every command hash reachable from a function hash.
#
# Sources, in order: the function's 'entry_actions', its 'eof_handler', and
# for each state its 'eof_handler' followed by the commands of each case.
# Every 'conditional' command is followed by the commands of its clauses,
# expanded recursively, so conditionals nested inside conditionals and
# conditionals in entry actions or EOF handlers are covered as well.
# Missing (nil) command lists are treated as empty.
def collect_all_commands(func)
  flatten = lambda do |cmds|
    (cmds || []).flat_map do |cmd|
      clauses = cmd['type'] == 'conditional' ? cmd.dig('args', 'clauses') : nil
      nested = (clauses || []).flat_map { |clause| flatten.call(clause['commands']) }
      [cmd, *nested]
    end
  end

  commands = flatten.call(func['entry_actions']) + flatten.call(func['eof_handler'])

  func['states'].each do |state|
    commands.concat(flatten.call(state['eof_handler']))
    state['cases'].each { |kase| commands.concat(flatten.call(kase['commands'])) }
  end

  commands
end
|
|
307
|
+
|
|
308
|
+
# Set usage[:col] / usage[:prev] when +expr+ mentions COL / PREV as a whole
# word. Anything that is not a String is ignored.
def check_expression(expr, usage)
  if expr.is_a?(String)
    usage[:col] = true if /\bCOL\b/.match?(expr)
    usage[:prev] = true if /\bPREV\b/.match?(expr)
  end
end
|
|
314
|
+
|
|
315
|
+
# Flag the helper that belongs to a special character class.
#
# Only letter, label_cont, digit, hex_digit, ws and nl have a flag; nil and
# any other class name leave +usage+ untouched.
def check_special_class(special_class, usage)
  return unless special_class

  class_name = special_class.to_s
  usage[class_name.to_sym] = true if %w[letter label_cont digit hex_digit ws nl].include?(class_name)
end
|
|
327
|
+
|
|
328
|
+
# Check if any function uses Unicode character classes
|
|
329
|
+
# True when at least one case of any state of any function matches on a
# special class listed in UNICODE_CLASSES.
def uses_unicode_classes?(functions_data)
  all_cases = functions_data
              .flat_map { |func| func['states'] }
              .flat_map { |state| state['cases'] }

  all_cases.any? { |kase| UNICODE_CLASSES.include?(kase['special_class']) }
end
|
|
339
|
+
|
|
340
|
+
# Serialize a keywords definition into a string-keyed Hash for the template.
#
# 'const_name' is the upper-cased name with a _KEYWORDS suffix; each mapping
# keeps only its :keyword and :event_type entries, re-keyed with strings.
def keywords_to_hash(kw)
  mappings = kw.mappings.map do |entry|
    { 'keyword' => entry[:keyword], 'event_type' => entry[:event_type] }
  end

  {
    'name' => kw.name,
    'const_name' => "#{kw.name.upcase}_KEYWORDS",
    'fallback_func' => kw.fallback_func,
    'fallback_args' => kw.fallback_args,
    'mappings' => mappings
  }
end
|
|
354
|
+
|
|
355
|
+
# Serialize a type into a string-keyed Hash; 'kind' is converted to a String,
# the other attributes are copied as they are.
def type_to_hash(type)
  %w[name kind emits_start emits_end].to_h do |attr|
    value = type.public_send(attr)
    [attr, attr == 'kind' ? value.to_s : value]
  end
end
|
|
363
|
+
|
|
364
|
+
# Serialize an IR function into the string-keyed Hash used by the template.
#
# Plain assignments among the entry actions are not emitted as commands:
# their right-hand sides are exported as 'local_init_values' so the template
# can initialize the locals directly. 'mutable_locals' lists the variables
# reported by find_mutable_locals.
def function_to_hash(func)
  entry_actions = func.entry_actions || []
  local_init_values = extract_local_init_values(entry_actions)
  mutable_locals = find_mutable_locals(func)

  # Keep conditionals and every command that did not become an initializer.
  remaining_entry_actions = entry_actions.select do |cmd|
    cmd.type != :assign || !local_init_values.key?(cmd.args[:var])
  end

  serialize = ->(commands) { (commands || []).map { |cmd| command_to_hash(cmd) } }

  {
    'name' => func.name,
    'return_type' => func.return_type,
    'params' => func.params,
    'param_types' => func.param_types.to_h { |param, type| [param.to_s, type.to_s] },
    'locals' => func.locals.transform_keys(&:to_s),
    'local_init_values' => local_init_values,
    'mutable_locals' => mutable_locals,
    'states' => func.states.map { |state| state_to_hash(state) },
    'eof_handler' => serialize.call(func.eof_handler),
    'entry_actions' => serialize.call(remaining_entry_actions),
    'emits_events' => func.emits_events,
    'expects_char' => func.expects_char,
    'emits_content_on_close' => func.emits_content_on_close,
    'prepend_values' => func.prepend_values.transform_keys(&:to_s),
    'lineno' => func.lineno
  }
end
|
|
396
|
+
|
|
397
|
+
# Extract initial values for locals from entry_actions assignments.
|
|
398
|
+
# We convert any expression to Rust syntax and use it as the initializer.
|
|
399
|
+
# Build a Hash of variable => Rust initializer from the :assign commands in
# +entry_actions+.
#
# Commands of other types, and assignments lacking :var or :expr, are
# skipped. In each expression COL, LINE and PREV are expanded and the colon
# of :param references is dropped. A later assignment to the same variable
# replaces the earlier one.
def extract_local_init_values(entry_actions)
  special_vars = {
    /\bCOL\b/ => 'self.col()',
    /\bLINE\b/ => 'self.line as i32',
    /\bPREV\b/ => 'self.prev()'
  }

  entry_actions.each_with_object({}) do |cmd, initializers|
    next unless cmd.type == :assign

    target = cmd.args[:var]
    source = cmd.args[:expr]
    next unless target && source

    expanded = special_vars.reduce(source) { |text, (pattern, rust)| text.gsub(pattern, rust) }
    initializers[target] = expanded.gsub(/:([a-z_]\w*)/i) { ::Regexp.last_match(1) }
  end
end
|
|
418
|
+
|
|
419
|
+
# Find locals that need to be mutable (reassigned in function body, not just entry)
|
|
420
|
+
# Return the names of variables that are assigned (=, +=, -=) outside the
# entry actions, as an Array without duplicates.
#
# Scanned command lists: the commands of every case of every state, every
# state-level EOF handler and the function-level EOF handler. Conditionals
# are followed by collect_mutable_vars. The function-level handler is
# included because a variable assigned there is reassigned after its
# initialization just like one assigned in a state.
def find_mutable_locals(func)
  mutable = Set.new

  func.states.each do |state|
    state.cases.each { |kase| collect_mutable_vars(kase.commands, mutable) }
    collect_mutable_vars(state.eof_handler || [], mutable)
  end

  collect_mutable_vars(func.eof_handler || [], mutable)

  mutable.to_a
end
|
|
432
|
+
|
|
433
|
+
# Add to +mutable+ the target variable of every assign / add_assign /
# sub_assign command in +commands+, descending into the clauses of
# conditional commands. Assignments without a :var are ignored.
def collect_mutable_vars(commands, mutable)
  commands.each do |cmd|
    if %i[assign add_assign sub_assign].include?(cmd.type)
      target = cmd.args[:var]
      mutable << target if target
    elsif cmd.type == :conditional
      (cmd.args[:clauses] || []).each { |clause| collect_mutable_vars(clause.commands, mutable) }
    end
  end
end
|
|
443
|
+
|
|
444
|
+
# Serialize an IR state into a string-keyed Hash for the template.
#
# Cases and EOF handler commands are converted with case_to_hash and
# command_to_hash; a missing EOF handler becomes an empty Array.
def state_to_hash(state)
  eof_commands = (state.eof_handler || []).map { |cmd| command_to_hash(cmd) }

  hash = {
    'name' => state.name,
    'cases' => state.cases.map { |kase| case_to_hash(kase) },
    'eof_handler' => eof_commands,
    'scan_chars' => state.scan_chars,
    'scannable' => state.scannable?
  }

  # Remaining attributes are copied one-to-one under their own names.
  %w[is_self_looping has_default is_unconditional newline_injected lineno].each do |attr|
    hash[attr] = state.public_send(attr)
  end

  hash
end
|
|
458
|
+
|
|
459
|
+
# Serialize an IR case into a string-keyed Hash for the template.
#
# 'special_class' is exported as a String (nil stays nil), 'is_conditional'
# and 'is_default' come from the case's predicate methods, and the commands
# are converted with command_to_hash.
def case_to_hash(kase)
  class_name = kase.special_class
  class_name = class_name.to_s unless class_name.nil?

  {
    'chars' => kase.chars,
    'special_class' => class_name,
    'param_ref' => kase.param_ref,
    'condition' => kase.condition,
    'is_conditional' => kase.conditional?,
    'substate' => kase.substate,
    'commands' => kase.commands.map { |cmd| command_to_hash(cmd) },
    'is_default' => kase.default?,
    'lineno' => kase.lineno
  }
end
|
|
472
|
+
|
|
473
|
+
# Serialize a command into { 'type' => String, 'args' => Hash } with
# string-keyed args.
#
# For :conditional commands every clause is rewritten to a Hash with
# 'condition' and 'commands', the nested commands being converted
# recursively. A clause may be an object with condition/commands readers
# (Data or Struct, which is how collect_mutable_vars reads it) or a Hash
# keyed by strings or symbols; a clause without commands yields [].
def command_to_hash(cmd)
  args = cmd.args.transform_keys(&:to_s)

  if cmd.type == :conditional && args['clauses']
    args['clauses'] = args['clauses'].map do |clause|
      read = lambda do |field|
        next clause.public_send(field) if clause.respond_to?(field)

        clause.key?(field.to_s) ? clause[field.to_s] : clause[field]
      end

      {
        'condition' => read.call(:condition),
        'commands' => (read.call(:commands) || []).map { |nested| command_to_hash(nested) }
      }
    end
  end

  { 'type' => cmd.type.to_s, 'args' => args }
end
|
|
488
|
+
end
|
|
489
|
+
end
|
data/lib/descent/ir.rb
ADDED
|
@@ -0,0 +1,98 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Descent
|
|
4
|
+
# Intermediate Representation - semantic model after analysis.
|
|
5
|
+
#
|
|
6
|
+
# IR nodes are enriched with inferred information:
|
|
7
|
+
# - Resolved types
|
|
8
|
+
# - SCAN optimization characters (inferred from state structure)
|
|
9
|
+
# - EOF handling requirements
|
|
10
|
+
# - Local variable declarations with types
|
|
11
|
+
module IR
|
|
12
|
+
# Top-level parser definition
|
|
13
|
+
# Top-level parser definition. Only name and entry_point are required; the
# collection attributes default to empty Arrays.
Parser = Data.define(:name, :entry_point, :types, :functions, :keywords, :custom_error_codes) do
  def initialize(name:, entry_point:, types: [], functions: [], keywords: [], custom_error_codes: [])
    super(name:, entry_point:, types:, functions:, keywords:, custom_error_codes:)
  end
end
|
|
16
|
+
|
|
17
|
+
# Keywords for perfect hash (phf) lookup
|
|
18
|
+
# Generates static phf::Map for O(1) keyword matching
|
|
19
|
+
# Keyword table definition.
#
# name          - identifier (e.g. "bare" generates BARE_KEYWORDS)
# fallback_func - function to call when no keyword matches (default nil)
# fallback_args - arguments passed to the fallback (default nil)
# mappings      - Array of {keyword: "string", event_type: "TypeName"} (default [])
# lineno        - defaults to 0
Keywords = Data.define(:name, :fallback_func, :fallback_args, :mappings, :lineno) do
  def initialize(name:, fallback_func: nil, fallback_args: nil, mappings: [], lineno: 0)
    super(name:, fallback_func:, fallback_args:, mappings:, lineno:)
  end
end
|
|
26
|
+
|
|
27
|
+
# Resolved type information
|
|
28
|
+
# Resolved type information.
#
# kind is :bracket (emits Start/End), :content (emits on return) or
# :internal (no emit). emits_start / emits_end default to false, lineno to 0.
TypeInfo = Data.define(:name, :kind, :emits_start, :emits_end, :lineno) do
  def initialize(name:, kind:, emits_start: false, emits_end: false, lineno: 0)
    super(name:, kind:, emits_start:, emits_end:, lineno:)
  end

  def bracket?
    kind == :bracket
  end

  def content?
    kind == :content
  end

  def internal?
    kind == :internal
  end
end
|
|
36
|
+
|
|
37
|
+
# Function with resolved semantics
|
|
38
|
+
# Function with resolved semantics. Only name is required.
#
# params                 - Array of parameter names
# param_types            - Hash mapping param name -> :byte or :i32 (inferred from usage)
# entry_actions          - commands executed on function entry (e.g. variable initialization)
# expects_char           - single char that must be seen to return (inferred from return cases)
# emits_content_on_close - whether TERM appears before return (emit content on unclosed EOF)
# prepend_values         - Hash mapping param name -> Array of byte values that could be
#                          passed (for PREPEND)
Function = Data.define(
  :name, :return_type, :params, :param_types, :locals, :states, :eof_handler, :entry_actions,
  :emits_events, :expects_char, :emits_content_on_close, :prepend_values, :lineno
) do
  def initialize(
    name:, return_type: nil, params: [], param_types: {}, locals: {}, states: [], eof_handler: nil,
    entry_actions: [], emits_events: false, expects_char: nil, emits_content_on_close: false,
    prepend_values: {}, lineno: 0
  )
    super(
      name:, return_type:, params:, param_types:, locals:, states:, eof_handler:,
      entry_actions:, emits_events:, expects_char:, emits_content_on_close:,
      prepend_values:, lineno:
    )
  end

  # True when a closing character has been recorded in expects_char.
  def expects_closer?
    !expects_char.nil?
  end
end
|
|
54
|
+
|
|
55
|
+
# State with inferred optimizations
|
|
56
|
+
# State with inferred optimizations. Only name is required.
#
# scan_chars       - Array of chars for SIMD memchr scan, or nil if not applicable
# is_self_looping  - true if the state has a default case that loops back to itself
# has_default      - true if the state has a default case (no chars, no condition)
# is_unconditional - true if the first case has no char match (bare action case)
# newline_injected - true if '\n' was added to scan_chars by the generator rather than
#                    being a user target. The template must then add a match arm for
#                    '\n' that updates line/column and keeps scanning (no state
#                    transition), which keeps line tracking correct during SIMD scans.
State = Data.define(
  :name, :cases, :eof_handler, :scan_chars, :is_self_looping, :has_default, :is_unconditional,
  :newline_injected, :lineno
) do
  def initialize(
    name:, cases: [], eof_handler: nil, scan_chars: nil, is_self_looping: false,
    has_default: false, is_unconditional: false, newline_injected: false, lineno: 0
  )
    super(
      name:, cases:, eof_handler:, scan_chars:, is_self_looping:,
      has_default:, is_unconditional:, newline_injected:, lineno:
    )
  end

  # True when scan_chars is present and not empty.
  def scannable?
    !(scan_chars.nil? || scan_chars.empty?)
  end
end
|
|
71
|
+
|
|
72
|
+
# Case with resolved actions
|
|
73
|
+
# Case with resolved actions. Every attribute is optional.
#
# chars         - Array of literal chars to match, or nil for default
# special_class - Symbol like :letter, :label_cont for special matchers
# param_ref     - parameter name to match against (for |c[:param]|), or nil
# condition     - String condition for if-cases, or nil
# lineno        - source line number from the .desc file (for trace output)
Case = Data.define(:chars, :special_class, :param_ref, :condition, :substate, :commands, :lineno) do
  def initialize(
    chars: nil, special_class: nil, param_ref: nil, condition: nil, substate: nil, commands: [], lineno: 0
  )
    super(chars:, special_class:, param_ref:, condition:, substate:, commands:, lineno:)
  end

  # True when the case has no matcher of any kind and no condition.
  def default?
    [chars, special_class, param_ref, condition].all?(&:nil?)
  end

  # True when the case carries a condition.
  def conditional?
    !condition.nil?
  end
end
|
|
85
|
+
|
|
86
|
+
# Resolved command
|
|
87
|
+
# Resolved command.
#
# type - :mark, :term, :advance, :emit, :call, :assign, :return, :transition, etc.
# args - Hash of type-specific arguments (defaults to {})
Command = Data.define(:type, :args) do
  def initialize(type:, args: {})
    super(type:, args:)
  end
end
|
|
92
|
+
|
|
93
|
+
# Conditional with resolved conditions
|
|
94
|
+
# Immutable value with a single attribute, clauses.
# NOTE(review): presumably an ordered list of Clause values — confirm against the IR builder.
Conditional = Data.define(:clauses)
|
|
95
|
+
|
|
96
|
+
# Immutable value pairing a condition with its commands.
# Being a Data value it offers reader methods (clause.condition, clause.commands)
# but no [] accessor.
Clause = Data.define(:condition, :commands)
|
|
97
|
+
end
|
|
98
|
+
end
|