descent 0.7.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,308 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Descent
4
# Raised when the lexer meets syntax it cannot tokenize.
#
# The exception message is prefixed with whatever location information was
# supplied, as "source_file:lineno: message". Location parts that are nil are
# left out, and with no location at all the message is passed through as-is.
class LexerError < StandardError
  attr_reader :lineno, :source_file

  # message     - description of the problem
  # lineno      - optional line number where the problem was detected
  # source_file - optional name of the file being lexed
  def initialize(message, lineno: nil, source_file: nil)
    @source_file = source_file
    @lineno = lineno

    where = [source_file, lineno].reject(&:nil?).join(':')
    if where.empty?
      super(message)
    else
      super("#{where}: #{message}")
    end
  end
end
15
+
16
# Tokenizes .desc files (pipe-delimited UDON format).
#
# Input:  Raw file content
# Output: Array of Token structs
class Lexer
  # One pipe-delimited part, split into TAG[ID] REST, plus the 1-indexed line
  # on which the part starts.
  Token = Data.define(:type, :tag, :id, :rest, :lineno)

  # Tag patterns are anchored with \A (not ^) so that only the very start of
  # a part is considered; ^ would also match at the start of any later line
  # of a multi-line part.
  FUNCTION_CALL_START = %r{\A/\w+\(}
  FUNCTION_CALL = %r{\A/\w+\([^)]*\)}
  PLAIN_TAG = /\A(\.|[^ \[]+)/
  private_constant :FUNCTION_CALL_START, :FUNCTION_CALL, :PLAIN_TAG

  # content     - raw text to tokenize
  # source_file - name used in LexerError messages
  def initialize(content, source_file: '(string)')
    @content = content
    @source_file = source_file
  end

  # Lex the content into an Array of Token.
  #
  # Comments are stripped BEFORE splitting on pipes so that a pipe inside a
  # comment cannot corrupt the split. strip_comments keeps one output line per
  # input line, so counting newlines in the stripped text yields line numbers
  # of the original text.
  #
  # Raises LexerError (from split_on_pipes / parse_part) on unterminated quotes.
  def tokenize
    stripped = strip_comments(@content)
    search_from = 0 # where to look for the next part
    counted_to = 0  # offset up to which newlines are already counted
    lineno = 1

    split_on_pipes(stripped).filter_map do |part|
      next if part.strip.empty?

      start = stripped.index(part, search_from) || search_from

      # Only count the newlines between the previous part and this one,
      # instead of re-scanning from the beginning for every part.
      lineno += stripped[counted_to...start].to_s.count("\n")
      counted_to = start
      search_from = start + part.length

      parse_part(part.rstrip, lineno)
    end
  end

  # Split content on pipes, but not on pipes inside bracket-delimited IDs or quotes.
  # This correctly handles cases like |c[|] where the pipe is a literal.
  # Also handles |c[LETTER'[.?!] where [ inside is literal, not a delimiter.
  # Also handles '/sameline_text(elem_col, '|')' where | is in quotes.
  #
  # Returns the non-empty pieces between pipes, in order, as an Array of String.
  #
  # Raises LexerError if a single or double quote is still open at the end of
  # the content. Unterminated brackets are not checked here.
  def split_on_pipes(content)
    parts = []
    current = +''
    in_bracket = false
    in_quote = nil # nil, or the quote character (' or ")
    prev_char = nil
    lineno = 1
    quote_start_line = nil

    content.each_char do |c|
      lineno += 1 if c == "\n"

      case c
      when "'", '"'
        current << c
        if in_quote == c && prev_char != '\\'
          in_quote = nil # Close the quote (unless escaped)
        elsif in_quote.nil?
          in_quote = c # Open a quote of this kind
          quote_start_line = lineno
        end
      when '['
        # Only first [ opens the bracket context - nested [ are literal
        in_bracket = true unless in_quote
        current << c
      when ']'
        # ] always closes the bracket context (only one level)
        current << c
        in_bracket = false unless in_quote
      when '|'
        if in_bracket || in_quote
          current << c
        else
          parts << current unless current.empty?
          current = +''
        end
      else
        current << c
      end
      prev_char = c
    end

    if in_quote
      raise LexerError.new(
        "Unterminated #{in_quote == "'" ? 'single' : 'double'} quote - opened but never closed",
        lineno: quote_start_line,
        source_file: @source_file
      )
    end

    parts << current unless current.empty?
    parts
  end

  # Strip comments from content, preserving semicolons inside brackets and quotes.
  #
  # Works line by line: the first ; that is outside quotes and at bracket
  # depth zero starts a comment, and the line is cut there (trailing
  # whitespace removed, newline re-added). Every input line yields exactly one
  # output line. Quote state is reset at the start of each line.
  def strip_comments(content)
    content.lines.map do |line|
      depth = 0
      in_quote = nil
      prev_char = nil
      comment_start = nil

      line.each_char.with_index do |c, i|
        # Track quote state (respecting escapes)
        if c == "'" && prev_char != '\\' && in_quote != '"'
          in_quote = in_quote == "'" ? nil : "'"
        elsif c == '"' && prev_char != '\\' && in_quote != "'"
          in_quote = in_quote == '"' ? nil : '"'
        elsif !in_quote
          case c
          when '[' then depth += 1
          when ']' then depth -= 1
          when ';'
            if depth.zero?
              comment_start = i
              break
            end
          end
        end
        prev_char = c
      end

      comment_start ? "#{line[0...comment_start].rstrip}\n" : line
    end.join
  end

  private

  # Extract the content inside the first [...] of +part+, respecting single quotes.
  #
  # Returns [content, end_position] where end_position is the index just past
  # the closing ]. Returns ['', nil] if there is no [ at all, and
  # [partial_content, nil] if the bracket is never closed.
  # This handles cases like c[']'] where ] inside quotes shouldn't close the bracket.
  # Only single quotes are quote delimiters in c[...] - double quotes are literals.
  def extract_bracketed_id(part)
    start_pos = part.index('[')
    return ['', nil] unless start_pos

    i = start_pos + 1
    depth = 1
    in_quote = false
    content = +''

    while i < part.length && depth.positive?
      c = part[i]

      case c
      when "'"
        content << c
        in_quote = !in_quote
      when '['
        content << c
        depth += 1 unless in_quote
      when ']'
        if in_quote
          content << c
        else
          depth -= 1
          content << c if depth.positive? # Don't include final ]
        end
      else
        content << c
      end
      i += 1
    end

    [content, depth.zero? ? i : nil]
  end

  # Remove ;-comments from each line of +part+ and strip the result.
  #
  # A ; starts a comment only when it is outside [], () and single quotes.
  # +lineno+ is the line on which the part starts; it is used to report the
  # line of an unterminated single quote.
  #
  # Raises LexerError if a line ends while a single quote is still open.
  def strip_part_comments(part, lineno)
    cleaned = part.lines.map.with_index do |line, line_idx|
      bracket_depth = 0
      paren_depth = 0
      in_quote = false
      quote_start_col = nil
      comment_start = nil
      prev_char = nil

      line.each_char.with_index do |c, i|
        if in_quote
          in_quote = false if c == "'" && prev_char != '\\'
        else
          case c
          when "'"
            in_quote = true
            quote_start_col = i
          when '[' then bracket_depth += 1
          when ']' then bracket_depth -= 1
          when '(' then paren_depth += 1
          when ')' then paren_depth -= 1
          when ';'
            if bracket_depth.zero? && paren_depth.zero?
              comment_start = i
              break
            end
          end
        end
        prev_char = c
      end

      if in_quote
        raise LexerError.new(
          "Unterminated single quote at column #{quote_start_col + 1}",
          lineno: lineno + line_idx,
          source_file: @source_file
        )
      end

      comment_start ? line[0...comment_start].rstrip : line.rstrip
    end

    cleaned.join("\n").strip
  end

  # Parse one pipe-delimited part of the form TAG[ID] REST into a Token.
  #
  #   TAG  - everything up to the first [ or space; for function calls
  #          (/name(args)) the whole call including its arguments
  #   ID   - the text inside the first [...] that follows the tag
  #   REST - everything after the ID (or after the tag if there is no ID)
  #
  # Returns nil when tag, id and rest are all empty (artifacts of the split).
  # Raises LexerError on an unterminated single quote.
  def parse_part(part, lineno)
    part = strip_part_comments(part, lineno)

    # Extract tag - for function calls capture the full call including arguments.
    function_call = part.match?(FUNCTION_CALL_START)
    raw_tag = if function_call
                part[FUNCTION_CALL] || part[/\A[^ \[]+/]
              else
                part[PLAIN_TAG]
              end&.strip || ''

    tag = case raw_tag
          when /\Aemit\(/i
            raw_tag
          when FUNCTION_CALL_START
            # Function call - lowercase the name, preserve case of the arguments
            name = raw_tag[%r{\A/(\w+)\(}, 1]
            args = raw_tag[/\(([^)]*)\)/, 1]
            "/#{name.downcase}(#{args})"
          when /\A[A-Z]+(_[A-Z]+)*\z/
            # SCREAMING_SNAKE_CASE - character class like LETTER, LABEL_CONT, DIGIT
            # Lowercase it so parser can handle it uniformly
            raw_tag.downcase
          when /\A[A-Z]/
            # PascalCase - inline type emit, preserve case entirely
            raw_tag
          else
            raw_tag.downcase
          end

    # Everything after the tag. If the function-call pattern has no closing
    # paren, sub finds nothing and the part is left unchanged.
    after_tag = part.sub(function_call ? FUNCTION_CALL : PLAIN_TAG, '')

    # Look for the ID only in the text AFTER the tag, so brackets inside
    # function-call arguments (e.g. /emit(buf[0])) are never taken for the ID.
    # id_end is the offset just past the closing ], or nil without a closed ID.
    id, id_end = extract_bracketed_id(after_tag)
    rest = (id_end ? after_tag[id_end..] : after_tag).to_s.strip

    # For parser name and similar, take only first word/line
    rest = rest.split("\n").first&.strip || '' if %w[parser entry-point].include?(tag)

    # Skip empty tags (artifacts of split)
    return nil if tag.empty? && id.empty? && rest.empty?

    Token.new(type: :part, tag:, id:, rest:, lineno:)
  end
end
308
+ end