descent 0.7.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/CHANGELOG.md +285 -0
- data/README.md +583 -0
- data/SYNTAX.md +334 -0
- data/exe/descent +15 -0
- data/lib/descent/ast.rb +69 -0
- data/lib/descent/generator.rb +489 -0
- data/lib/descent/ir.rb +98 -0
- data/lib/descent/ir_builder.rb +1479 -0
- data/lib/descent/lexer.rb +308 -0
- data/lib/descent/parser.rb +450 -0
- data/lib/descent/railroad.rb +272 -0
- data/lib/descent/templates/rust/_command.liquid +174 -0
- data/lib/descent/templates/rust/parser.liquid +1163 -0
- data/lib/descent/tools/debug.rb +115 -0
- data/lib/descent/tools/diagram.rb +48 -0
- data/lib/descent/tools/generate.rb +47 -0
- data/lib/descent/tools/validate.rb +56 -0
- data/lib/descent/validator.rb +231 -0
- data/lib/descent/version.rb +5 -0
- data/lib/descent.rb +34 -0
- metadata +101 -0
|
@@ -0,0 +1,308 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Descent
|
|
4
|
+
# Raised by the lexer when the input cannot be tokenized.
#
# The exception message is prefixed with "source_file:lineno: " using
# whichever of the two values were supplied; both remain readable through
# the +source_file+ and +lineno+ readers.
class LexerError < StandardError
  attr_reader :lineno, :source_file

  # message     - human-readable description of the problem
  # lineno      - line the problem was detected on, or nil when unknown
  # source_file - name of the file being lexed, or nil when unknown
  def initialize(message, lineno: nil, source_file: nil)
    @source_file = source_file
    @lineno = lineno

    prefix = [source_file, lineno].compact.join(':')
    if prefix.empty?
      super(message)
    else
      super("#{prefix}: #{message}")
    end
  end
end
|
|
15
|
+
|
|
16
|
+
# Tokenizes .desc files (pipe-delimited UDON format).
#
# Input:  raw file content
# Output: Array of Token values (see #tokenize)
class Lexer
  # One pipe-delimited part of the input. +type+ is always :part, +tag+ is the
  # normalised leading word, +id+ is the text found inside [...] (or ''),
  # +rest+ is the remaining text and +lineno+ is the 1-based line on which the
  # part starts.
  Token = Data.define(:type, :tag, :id, :rest, :lineno)

  # Start of a function-call tag such as "/name(".
  FUNCTION_CALL_PREFIX = %r{^/\w+\(}
  # A complete function-call tag, up to and including the closing paren.
  FUNCTION_CALL = %r{^/\w+\([^)]*\)}
  # An ordinary tag: a single "." or everything up to the first space or "[".
  PLAIN_TAG = /^(\.|[^ \[]+)/
  private_constant :FUNCTION_CALL_PREFIX, :FUNCTION_CALL, :PLAIN_TAG

  # content     - the text to tokenize
  # source_file - name used when reporting errors
  def initialize(content, source_file: '(string)')
    @content = content
    @source_file = source_file
  end

  # Turns the content into an Array of Token values.
  #
  # Comments are stripped BEFORE splitting on pipes so that a pipe inside a
  # comment cannot start a new part. strip_comments keeps exactly one output
  # line per input line, so counting newlines in the stripped text yields the
  # original line numbers.
  #
  # Raises LexerError when a quote is left open.
  def tokenize
    stripped = strip_comments(@content)

    search_from = 0 # offset from which the next part is searched for
    counted_to = 0  # offset up to which newlines are already counted
    lineno = 1

    located = split_on_pipes(stripped).filter_map do |part|
      next if part.strip.empty?

      start = stripped.index(part, search_from) || search_from
      # Parts are visited in order, so only the newlines since the previous
      # part need counting (instead of rescanning from the top every time).
      lineno += stripped[counted_to...start].count("\n")
      counted_to = start
      search_from = start + part.length

      [part.rstrip, lineno]
    end

    located.filter_map { |part, line| parse_part(part, line) }
  end

  # Splits content on pipes, except for pipes inside a bracket group or inside
  # single/double quotes:
  #
  #   |c[|]                            - the pipe inside [] is literal
  #   /sameline_text(elem_col, '|')    - the pipe inside quotes is literal
  #
  # Only one bracket level is tracked: the first unquoted "[" opens the group
  # and the next unquoted "]" closes it. Empty pieces are dropped; whitespace
  # is left untouched.
  #
  # Raises LexerError when a quote is still open at the end of the content.
  # An unclosed bracket is NOT reported here.
  def split_on_pipes(content)
    parts = []
    current = +''
    in_bracket = false
    in_quote = nil # nil, or the quote character (' or ") currently open
    prev_char = nil
    lineno = 1
    quote_start_line = nil

    content.each_char do |c|
      lineno += 1 if c == "\n"

      case c
      when "'", '"'
        current << c
        if in_quote == c && prev_char != '\\'
          in_quote = nil # closes the open quote (unless backslash-escaped)
        elsif in_quote.nil?
          in_quote = c
          quote_start_line = lineno
        end
      when '['
        in_bracket = true unless in_quote
        current << c
      when ']'
        current << c
        in_bracket = false unless in_quote
      when '|'
        if in_bracket || in_quote
          current << c
        else
          parts << current unless current.empty?
          current = +''
        end
      else
        current << c
      end
      prev_char = c
    end

    if in_quote
      raise LexerError.new(
        "Unterminated #{in_quote == "'" ? 'single' : 'double'} quote - opened but never closed",
        lineno: quote_start_line,
        source_file: @source_file
      )
    end

    parts << current unless current.empty?
    parts
  end

  # Removes ";" comments line by line. A ";" starts a comment only when it is
  # outside quotes and outside brackets (bracket depth is counted per line).
  #
  # Every input line yields exactly one output line. A line that had a comment
  # is right-stripped and always ends in "\n", even if the original did not.
  def strip_comments(content)
    content.lines.map do |line|
      depth = 0
      in_quote = nil
      prev_char = nil
      comment_start = nil

      line.each_char.with_index do |c, i|
        # Quote state first: a quote character inside the other kind of quote,
        # or preceded by a backslash, does not toggle anything.
        if c == "'" && prev_char != '\\' && in_quote != '"'
          in_quote = in_quote == "'" ? nil : "'"
        elsif c == '"' && prev_char != '\\' && in_quote != "'"
          in_quote = in_quote == '"' ? nil : '"'
        elsif !in_quote
          case c
          when '[' then depth += 1
          when ']' then depth -= 1
          when ';'
            if depth.zero?
              comment_start = i
              break
            end
          end
        end
        prev_char = c
      end

      comment_start ? "#{line[0...comment_start].rstrip}\n" : line
    end.join
  end

  private

  # Extracts the text inside the first [...] of +part+. Single quotes protect
  # brackets (so c[']'] yields "']'"); double quotes are ordinary characters.
  # Unquoted nested brackets are balanced and kept in the result.
  #
  # Returns [content, end_position], where end_position is the index just past
  # the closing "]". Returns ['', nil] when there is no "[", and
  # [text_after_the_bracket, nil] when the bracket is never closed.
  def extract_bracketed_id(part)
    start_pos = part.index('[')
    return ['', nil] unless start_pos

    i = start_pos + 1
    depth = 1
    in_quote = false
    content = +''

    while i < part.length && depth.positive?
      c = part[i]

      case c
      when "'"
        content << c
        in_quote = !in_quote
      when '['
        content << c
        depth += 1 unless in_quote
      when ']'
        if in_quote
          content << c
        else
          depth -= 1
          content << c if depth.positive? # the final ] is not part of the ID
        end
      else
        content << c
      end
      i += 1
    end

    [content, depth.zero? ? i : nil]
  end

  # Parses one part of the form "TAG[ID] REST" into a Token.
  #
  # part   - the part text (may span several lines)
  # lineno - 1-based line on which the part starts
  #
  # Returns nil when tag, id and rest are all empty (an artifact of the split).
  # Raises LexerError when a line of the part leaves a single quote open.
  def parse_part(part, lineno)
    part = strip_part_comments(part, lineno)

    # For function calls the tag is the whole call including its arguments.
    matched = if part.match?(FUNCTION_CALL_PREFIX)
                part[FUNCTION_CALL] || part[/^[^ \[]+/]
              else
                part[PLAIN_TAG]
              end
    raw_tag = matched&.strip || ''
    tag = normalize_tag(raw_tag)

    call_tag = raw_tag.match?(FUNCTION_CALL_PREFIX)
    after_tag = part.sub(call_tag ? FUNCTION_CALL : PLAIN_TAG, '')

    # The arguments of a function call may themselves contain "[" (for example
    # /scan(buf[0])). They belong to the tag, so for calls the ID is searched
    # only in the text that follows the call. Searching the whole part would
    # pick the ID out of the arguments and then fail to locate it in after_tag.
    id, id_end_pos = extract_bracketed_id(call_tag ? after_tag : part)

    rest = if id_end_pos
             # Skip "[", the ID text and "]".
             after_tag[(after_tag.index('[') + id.length + 2)..].to_s.strip
           else
             after_tag.strip
           end

    # For parser name and similar, only the first line is meaningful.
    rest = rest.split("\n").first&.strip || '' if %w[parser entry-point].include?(tag)

    return nil if tag.empty? && id.empty? && rest.empty?

    Token.new(type: :part, tag:, id:, rest:, lineno:)
  end

  # Strips ";" comments from each line of a part. A ";" starts a comment only
  # when it is outside [], () and single quotes. Lines are right-stripped and
  # the joined result is stripped at both ends.
  #
  # Raises LexerError (pointing at lineno + line offset) when a line ends with
  # a single quote still open.
  def strip_part_comments(part, lineno)
    part.lines.map.with_index do |line, line_idx|
      bracket_depth = 0
      paren_depth = 0
      in_quote = false
      quote_start_col = nil
      comment_start = nil

      line.each_char.with_index do |c, i|
        if in_quote
          in_quote = false if c == "'" && (i.zero? || line[i - 1] != '\\')
          next
        end

        case c
        when "'"
          in_quote = true
          quote_start_col = i
        when '[' then bracket_depth += 1
        when ']' then bracket_depth -= 1
        when '(' then paren_depth += 1
        when ')' then paren_depth -= 1
        when ';'
          if bracket_depth.zero? && paren_depth.zero?
            comment_start = i
            break
          end
        end
      end

      if in_quote
        raise LexerError.new(
          "Unterminated single quote at column #{quote_start_col + 1}",
          lineno: lineno + line_idx,
          source_file: @source_file
        )
      end

      comment_start ? line[0...comment_start].rstrip : line.rstrip
    end.join("\n").strip
  end

  # Normalises the case of a raw tag:
  #   emit(...)             - kept as written
  #   /Name(args)           - name lower-cased, arguments kept as written
  #   SCREAMING_SNAKE_CASE  - lower-cased (character class such as LETTER)
  #   PascalCase            - kept as written (inline type emit)
  #   anything else         - lower-cased
  def normalize_tag(raw_tag)
    case raw_tag
    when /^emit\(/i
      raw_tag
    when FUNCTION_CALL_PREFIX
      name = raw_tag[%r{^/(\w+)\(}, 1]
      args = raw_tag[/\(([^)]*)\)/, 1]
      "/#{name.downcase}(#{args})"
    when /^[A-Z]+(_[A-Z]+)*$/
      raw_tag.downcase
    when /^[A-Z]/
      raw_tag
    else
      raw_tag.downcase
    end
  end
end
|
|
308
|
+
end
|