descent 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,450 @@
1
+ # frozen_string_literal: true
2
+
3
module Descent
  # Builds an AST from a token stream.
  #
  # Input:  Array of Lexer::Token. Each token is read through four accessors:
  #         +tag+ (leading keyword/command), +id+ (bracketed identifier),
  #         +rest+ (remaining text, may be nil) and +lineno+.
  # Output: AST::Machine
  #
  # Raises ParseError (with the offending line number where one is known) for
  # malformed input.
  class Parser
    # Structural keywords that end a state, function, or keywords block
    STRUCTURAL = %w[function type state keywords].freeze

    # Keywords that start a new case within a state
    CASE_KEYWORDS = %w[c default eof if].freeze

    # Character class names (lowercase words that start cases, not commands)
    # ASCII classes: letter, digit, etc.
    # Unicode classes: xid_start, xid_cont, xlbl_start, xlbl_cont
    # Whitespace: ws, nl
    CHAR_CLASSES = %w[letter label_cont digit hex_digit ws nl xid_start xid_cont xlbl_start xlbl_cont].freeze

    # All tokens that can start a new case (used to know when to stop parsing current case)
    CASE_STARTERS = (STRUCTURAL + CASE_KEYWORDS + CHAR_CLASSES).freeze

    # Tags that end the body of a function (a nested `state` does not).
    FUNCTION_TERMINATORS = %w[function type keywords].freeze

    # Tags that implicitly close a function-level `if` that has no `endif`.
    CONDITIONAL_TERMINATORS = (STRUCTURAL + %w[c default eof]).freeze

    # Lowercase tags that are commands rather than character classes.
    LOWERCASE_COMMANDS = %w[return err mark term].freeze

    private_constant :FUNCTION_TERMINATORS, :CONDITIONAL_TERMINATORS, :LOWERCASE_COMMANDS

    # @param tokens [Array] token stream to parse; consumed left to right
    def initialize(tokens)
      @tokens = tokens
      @pos = 0
    end

    # Detect if a token tag looks like a command (not a case starter).
    # Commands can start bare action cases.
    #
    # Commands include:
    # - Function calls: /word or /word(...)
    # - Arrows: -> or ->[...] or >>
    # - Uppercase commands: WORD or WORD(...) like MARK, TERM, PREPEND, EMIT
    # - Specific lowercase: return, err, mark, term (optionally followed by "(...)")
    #
    # @param tag [String, nil]
    # @return [Boolean]
    def command_like?(tag)
      return false if tag.nil?
      return true if tag.start_with?('/', '->', '>>')
      return true if tag.match?(/^[A-Z]/)

      # Strip any "(...)" argument list before comparing the bare word.
      LOWERCASE_COMMANDS.include?(tag.downcase.split('(').first)
    end

    # Check if a token represents an inline command at function level.
    # This includes assignments like `result = 0` and commands like MARK.
    #
    # @param token [#tag, #rest]
    # @return [Boolean]
    def inline_command_token?(token)
      return true if command_like?(token.tag)

      # Assignment: tag is the variable name, rest starts with =, += or -=
      token.rest.to_s.match?(/^\s*[+-]?=/)
    end

    # Parse the whole token stream.
    #
    # @return [AST::Machine]
    # @raise [ParseError] on an unknown top-level declaration or any nested error
    def parse
      name = nil
      entry_point = nil
      types = []
      functions = []
      keywords = []

      while (token = current)
        case token.tag
        when 'parser'
          name = token.rest.to_s.strip.downcase
          advance
        when 'entry-point'
          entry_point = token.rest.to_s.strip
          advance
        when 'type' then types << parse_type
        when 'function' then functions << parse_function
        when 'keywords' then keywords << parse_keywords
        else
          raise ParseError, "Line #{token.lineno}: Unknown top-level declaration '#{token.tag}'. " \
                            'Expected: parser, entry-point, type, function, or keywords'
        end
      end

      AST::Machine.new(name:, entry_point:, types:, functions:, keywords:)
    end

    private

    def current = @tokens[@pos]
    def advance = @pos += 1
    def peek = @tokens[@pos + 1]

    # Parse a `type` declaration; the kind is the first word of the rest
    # (e.g., "BRACKET" from "BRACKET ; comment"), or :UNKNOWN when absent.
    def parse_type
      token = current
      name = token.id
      kind = token.rest.to_s.split.first&.upcase&.to_sym || :UNKNOWN
      advance

      AST::TypeDecl.new(name:, kind:, lineno: token.lineno)
    end

    # Parse keywords block for phf perfect hash lookup
    # Syntax: |keywords[name] :fallback /function(args)
    #           | keyword => EventType
    #           | keyword => EventType
    def parse_keywords
      token = current
      name = token.id
      lineno = token.lineno
      fallback = keywords_fallback(token.rest.to_s)
      advance

      mappings = []
      # Mappings run until the next structural keyword or a /call token.
      while (t = current) && !STRUCTURAL.include?(t.tag) && !t.tag.start_with?('/')
        mappings << parse_keyword_mapping(t)
        advance
      end

      AST::Keywords.new(name:, fallback:, mappings:, lineno:)
    end

    # Extract the fallback function from a keywords header
    # (e.g., ":fallback /bare_string" or a leading "/bare_string").
    # Returns nil when no fallback is present.
    def keywords_fallback(rest)
      match = rest.match(%r{:fallback\s+(/\w+(?:\([^)]*\))?)}) || rest.match(%r{^(/\w+(?:\([^)]*\))?)})
      match && match[1]
    end

    # Turn one mapping token into { keyword:, event_type: }.
    # Accepts either an empty tag with "keyword => EventType" in rest, or the
    # keyword as tag with "=> EventType" in rest. Anything else - including a
    # mapping whose keyword or event type is blank - raises ParseError.
    def parse_keyword_mapping(token)
      rest = token.rest.to_s

      if token.tag == '' && rest.include?('=>')
        keyword, event_type = rest.split('=>', 2).map(&:strip)
        # split always yields two strings here, so test for blank, not nil.
        return { keyword:, event_type: } unless keyword.empty? || event_type.empty?
      elsif (match = rest.match(/^=>\s*(\w+)/))
        return { keyword: token.tag.strip, event_type: match[1] }
      end

      raise ParseError, "Line #{token.lineno}: Unknown keyword mapping format: '#{token.tag}' rest='#{token.rest}'"
    end

    # Parse a function: header ("name" or "name:ReturnType" in id, ":param"
    # words in rest) followed by states, an eof handler, function-level `if`
    # guards and inline entry commands, up to the next function/type/keywords.
    def parse_function
      token = current
      lineno = token.lineno
      name, rtype = token.id.to_s.split(':')
      raise ParseError, "Line #{lineno}: Function declaration is missing a name" if name.nil? || name.empty?

      params = parse_params(token.rest)
      advance

      states = []
      eof_handler = nil
      entry_actions = [] # Commands to execute on function entry (e.g., | result = 0)

      while (t = current) && !FUNCTION_TERMINATORS.include?(t.tag)
        case t.tag
        when 'state' then states << parse_state
        when 'eof' then eof_handler = parse_eof_handler
        when 'if' then entry_actions << parse_conditional # Function-level guard condition
        else
          unless inline_command_token?(t)
            raise ParseError, "Line #{t.lineno}: Unexpected token '#{t.tag}' inside function. " \
                              "Expected: state, eof, if, or inline command (like 'var = expr' or 'MARK')"
          end

          entry_actions << parse_command(t)
          advance
        end
      end

      AST::Function.new(
        name: name.tr('-', '_'),
        return_type: rtype,
        params:,
        states:,
        eof_handler:,
        entry_actions:,
        lineno:
      )
    end

    # Collect parameter names: every ":word" in the function header's rest.
    def parse_params(rest)
      rest.to_s.scan(/:(\w+)/).flatten
    end

    # Parse a state and its cases, up to the next structural keyword.
    # The state name has '-' replaced by '_' and ':' removed.
    def parse_state
      token = current
      name = token.id.tr('-', '_').delete(':')
      lineno = token.lineno
      advance

      cases = []
      eof_handler = nil

      while (t = current) && !STRUCTURAL.include?(t.tag)
        case t.tag
        when 'c' then cases << parse_case(t.id)
        when 'default' then cases << parse_case(nil)
        when 'eof' then eof_handler = parse_eof_handler
        when 'if' then cases << parse_if_case
        else
          if CHAR_CLASSES.include?(t.tag)
            # Character class (lowercase word like 'letter', 'digit')
            cases << parse_case(t.tag.upcase)
          elsif command_like?(t.tag)
            # Command-like token starts a bare action case
            cases << parse_bare_action_case
          else
            raise ParseError, "Line #{t.lineno}: Unknown token in state: '#{t.tag}' (not a case starter or command)"
          end
        end
      end

      AST::State.new(name:, cases:, eof_handler:, lineno:)
    end

    # Parse a case introduced by a case-starter token (c[...], default, or a
    # character class). +chars_str+ is stored as the case's chars.
    def parse_case(chars_str)
      lineno = current.lineno
      advance # consume the case-starter itself

      substate, commands = parse_case_body
      AST::Case.new(chars: chars_str, substate:, commands:, lineno:)
    end

    # Parse a bare action case - one that starts with a command (like /function)
    # instead of a character match. Used for unconditional action states.
    def parse_bare_action_case
      lineno = current.lineno
      # Don't advance - the current token IS the first command

      substate, commands = parse_case_body
      AST::Case.new(chars: nil, substate:, commands:, lineno:)
    end

    # Consume tokens up to the next case starter. A '.' token sets the
    # substate label (last one wins); every other token becomes a command.
    #
    # @return [Array(String, Array)] substate (or nil) and the commands
    def parse_case_body
      substate = nil
      commands = []

      while (t = current) && !CASE_STARTERS.include?(t.tag)
        if t.tag == '.'
          substate = t.rest.to_s.strip
        else
          commands << parse_command(t)
        end
        advance
      end

      [substate, commands]
    end

    # Parse an `if` case inside a state; the condition comes from the token id.
    # '.' tokens are skipped (no substate is recorded for conditional cases).
    def parse_if_case
      token = current
      lineno = token.lineno
      condition = token.id
      advance

      commands = []
      last_cmd_type = nil

      while (t = current) && !CASE_STARTERS.include?(t.tag)
        # After return, any command-like token starts a new case (bare action case).
        # The return is final - nothing should follow in the same case.
        break if last_cmd_type == :return && command_like?(t.tag)

        unless t.tag == '.'
          cmd = parse_command(t)
          commands << cmd
          last_cmd_type = cmd.type
        end
        advance
      end

      AST::Case.new(condition:, commands:, lineno:)
    end

    # Parse an `eof` handler: all commands up to the next case starter,
    # ignoring '.' tokens.
    def parse_eof_handler
      lineno = current.lineno
      advance

      commands = []
      while (t = current) && !CASE_STARTERS.include?(t.tag)
        commands << parse_command(t) unless t.tag == '.'
        advance
      end

      AST::EOFHandler.new(commands:, lineno:)
    end

    # Build an AST::Command from a single token (does not advance).
    def parse_command(token)
      type, value = classify_command(token)
      AST::Command.new(type:, value:, lineno: token.lineno)
    end

    # Determine [type, value] for a command token from its tag, falling back
    # to parsing "tag rest" as inline command text. A nil rest is treated as "".
    def classify_command(token)
      tag = token.tag
      rest = token.rest.to_s

      case tag
      when '' then parse_inline_command(rest, token.lineno) # Inline command in rest
      when '->' then token.id.to_s.empty? ? [:advance, nil] : [:advance_to, token.id]
      when '>>' then [:transition, rest.strip]
      when 'return' then [:return, rest.strip]
      when 'err' then [:error, rest.strip]
      when 'mark' then [:mark, nil]
      # NOTE: exact lowercase 'term' yields nil while 'TERM' yields 0 below;
      # kept as-is because consumers of the AST are not visible here.
      when 'term' then [:term, nil]
      when /^emit\(/i then [:emit, tag[/emit\(([^)]+)\)/i, 1]]
      when %r{^/\w} then [:call, tag[1..] + (rest.empty? ? '' : "(#{rest})")]
      when /^TERM\((-?\d+)\)$/i then [:term, ::Regexp.last_match(1).to_i]
      when /^TERM$/i then [:term, 0]
      when /^MARK$/i then [:mark, nil]
      when /^KEYWORDS\((\w+)\)$/i then [:keywords_lookup, ::Regexp.last_match(1)]
      when /^PREPEND\(([^)]*)\)$/i then prepend_command(::Regexp.last_match(1))
      when /^([A-Z]\w*)\(USE_MARK\)$/ then [:inline_emit_mark, ::Regexp.last_match(1)]
      when /^([A-Z]\w*)\(([^)]+)\)$/
        [:inline_emit_literal, { type: ::Regexp.last_match(1), literal: ::Regexp.last_match(2) }]
      when /^([A-Z]\w*)$/ then [:inline_emit_bare, ::Regexp.last_match(1)]
      else
        # Check if tag + rest forms an assignment (e.g., tag="depth", rest="= 1")
        parse_inline_command("#{tag} #{rest}".strip, token.lineno)
      end
    end

    # Classify free-form command text.
    #
    # @param cmd [String, nil] command text; blank/nil yields [:noop, nil]
    # @param lineno [Integer, nil] source line, used only to prefix the error
    # @return [Array] [type, value]
    # @raise [ParseError] when the text matches no known command form
    def parse_inline_command(cmd, lineno = nil)
      cmd = cmd.to_s.strip
      return [:noop, nil] if cmd.empty?

      # Order matters: earlier patterns shadow later ones (e.g. MARK before assignment).
      case cmd
      when /^MARK\b/i then [:mark, nil]
      when /^TERM\((-?\d+)\)/i then [:term, ::Regexp.last_match(1).to_i]
      when /^TERM\b/i then [:term, 0]
      when /^KEYWORDS\((\w+)\)/i then [:keywords_lookup, ::Regexp.last_match(1)]
      when /^PREPEND\(([^)]*)\)/i then prepend_command(::Regexp.last_match(1))
      when /^return\b\s*(.*)$/i then [:return, ::Regexp.last_match(1).strip]
      when /^->\s*$/ then [:advance, nil]
      when /^->\s*\[([^\]]+)\]$/ then [:advance_to, ::Regexp.last_match(1)]
      when /^emit\(([^)]+)\)/i then [:emit, ::Regexp.last_match(1)]
      when /^CALL:(\w+)/i then [:call_method, ::Regexp.last_match(1)]
      when /^SCAN\(([^)]+)\)/i then [:scan, ::Regexp.last_match(1)]
      when %r{^/(\w+)} then [:call, ::Regexp.last_match(1)]
      when /^(\w+)\s*\+=\s*(.+)$/ then [:add_assign, { var: ::Regexp.last_match(1), expr: ::Regexp.last_match(2) }]
      when /^(\w+)\s*-=\s*(.+)$/ then [:sub_assign, { var: ::Regexp.last_match(1), expr: ::Regexp.last_match(2) }]
      when /^(\w+)\s*=\s*(.+)$/ then [:assign, { var: ::Regexp.last_match(1), expr: ::Regexp.last_match(2) }]
      when /^([A-Z]\w*)\(USE_MARK\)$/ then [:inline_emit_mark, ::Regexp.last_match(1)]
      when /^([A-Z]\w*)\(([^)]+)\)$/
        [:inline_emit_literal, { type: ::Regexp.last_match(1), literal: ::Regexp.last_match(2) }]
      when /^([A-Z]\w*)$/ then [:inline_emit_bare, ::Regexp.last_match(1)]
      else
        location = lineno ? "Line #{lineno}: " : ''
        raise ParseError, "#{location}Unrecognized command: '#{cmd}'. " \
                          'Expected: MARK, TERM, PREPEND, return, ->, /call, assignment, or TypeName'
      end
    end

    # Classify the argument of PREPEND(...): empty is a no-op, ":name" refers
    # to a parameter (leading colon stripped), anything else is literal content.
    def prepend_command(raw)
      content = raw.strip
      return [:noop, nil] if content.empty?
      return [:prepend_param, content[1..]] if content.start_with?(':')

      [:prepend, content]
    end

    # Parse a function-level `if` / `elsif` / `else` / `endif` chain into an
    # AST::Conditional. The chain is closed by `endif`, or implicitly by the
    # next structural/case token or by end of input; the clause being built is
    # always recorded before leaving.
    def parse_conditional
      lineno = current.lineno
      clauses = []
      condition = current.id
      commands = []
      advance

      loop do
        t = current

        # Implicit endif: leave the terminating token for the caller.
        if t.nil? || CONDITIONAL_TERMINATORS.include?(t.tag)
          clauses << AST::Clause.new(condition:, commands:)
          break
        end

        advance # every remaining branch consumes t

        case t.tag
        when 'elsif', 'else'
          clauses << AST::Clause.new(condition:, commands:)
          condition = t.tag == 'elsif' ? t.id : nil # else has no condition
          commands = []
        when 'endif'
          clauses << AST::Clause.new(condition:, commands:)
          break
        else
          commands << parse_command(t)
        end
      end

      AST::Conditional.new(clauses:, lineno:)
    end
  end
end