kumi-parser 0.0.32 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,232 @@
# frozen_string_literal: true

require 'strscan'

module Kumi
  module Parser
    # Turns source text into a flat array of Tokens in a single StringScanner
    # pass. Whitespace (except newlines, which are significant statement
    # separators) is skipped; comments and newlines are emitted so the parser
    # can ignore them uniformly. Every token carries its start offset, so all
    # location and error-frame work is deferred to Source.
    #
    # The lexer is deliberately context-free: it does not track whether it is
    # inside `input do … end`. Disambiguation that used to live in the old
    # tokenizer's context stack is the parser's job now.
    class Lexer
      # Tokens after which a `-` is the binary subtraction operator rather than
      # the sign of a literal.
      VALUE_ENDING_KINDS = %i[integer float string boolean symbol identifier constant rparen rbracket rbrace].freeze

      # Character following a backslash inside a string literal => the text it
      # stands for. Any character not listed here stands for itself.
      ESCAPES = { 'n' => "\n", 't' => "\t", 'r' => "\r", '\\' => '\\', '"' => '"', "'" => "'" }.freeze

      # Ordered longest-first so multi-char operators win over their prefixes.
      OPERATORS = [
        ['...', :dot_dot_dot], ['..', :dot_dot],
        ['**', :power], ['==', :eq], ['!=', :ne], ['>=', :gte], ['<=', :lte], ['=>', :arrow],
        ['+', :add], ['-', :subtract], ['*', :multiply], ['/', :divide], ['%', :modulo],
        ['>', :gt], ['<', :lt], ['&', :and], ['|', :or],
        ['.', :dot], [',', :comma],
        ['(', :lparen], [')', :rparen], ['[', :lbracket], [']', :rbracket], ['{', :lbrace], ['}', :rbrace]
      ].freeze

      # Operator text => token kind, for looking up whatever OPERATOR_PATTERN matched.
      OPERATOR_KINDS = OPERATORS.to_h.freeze

      # One alternation over every operator. Regexp.union keeps the table order
      # and alternation takes the first branch that matches, so the
      # longest-first ordering of OPERATORS still decides ties.
      OPERATOR_PATTERN = Regexp.union(OPERATORS.map(&:first))

      # Digits with optional underscores and an optional single decimal point
      # that must be followed by a digit (so `1..5` and `a.b` aren't swallowed).
      NUMBER = /\d[\d_]*(\.\d[\d_]*)?/

      # NUMBER with a leading minus sign.
      NEGATIVE_NUMBER = /-#{NUMBER}/

      # A bare name: keyword, identifier, label stem or constant-path segment.
      WORD = /[A-Za-z_][A-Za-z0-9_]*/

      private_constant :OPERATOR_KINDS, :OPERATOR_PATTERN, :NUMBER, :NEGATIVE_NUMBER, :WORD

      # @param source [#text] object whose `text` is the string to lex; it is
      #   also handed to ParseError so errors can be located in it
      def initialize(source)
        @source = source
        @ss = StringScanner.new(source.text)
        @tokens = []
      end

      # Lexes the whole input.
      #
      # @return [Array<Token>] every token in source order, always terminated
      #   by an `:eof` token positioned at the end of the text
      # @raise [ParseError] on a character no rule accepts, an unterminated
      #   string, a dangling `::`, or a stray `=`
      def tokenize
        until @ss.eos?
          skip_inline_whitespace
          break if @ss.eos?

          scan_token(@ss.pos)
        end
        push(:eof, nil, @ss.pos)
        @tokens
      end

      private

      # Scans exactly one token starting at `offset`, trying the rules in
      # priority order; raises when none of them accepts the next character.
      def scan_token(offset)
        if @ss.scan(/\n/)
          push(:newline, "\n", offset)
        elsif @ss.scan(/#[^\n]*/)
          # Drop the leading '#'. On CRLF input the match also contains the
          # "\r" of the line ending, which is whitespace rather than comment
          # text, so it is stripped as well.
          push(:comment, @ss.matched[1..].delete_suffix("\r"), offset)
        elsif scan_negative_number(offset) || scan_number(offset) ||
              scan_string(offset) || scan_word(offset) ||
              scan_symbol_or_colon(offset) || scan_operator(offset)
          nil
        else
          raise_lex_error("unexpected character #{@ss.peek(1).inspect}", offset)
        end
      end

      # Skips spaces, tabs and carriage returns; newlines are tokens and stay.
      def skip_inline_whitespace
        @ss.skip(/[ \t\r]+/)
      end

      # ---- numbers -------------------------------------------------------

      # `-5` is a single negative-number literal, but only in operand position:
      # `x - 5` is subtraction, `shift(a, -1)` and `x - -5` carry a negative
      # literal. We decide by the previous token — a `-` right after a value
      # (number, name, `)`, `]`, …) is the binary operator and is left for
      # scan_operator; anywhere else, `-<digit>` is a negative literal.
      #
      # @return [Boolean] true when a negative literal was consumed
      def scan_negative_number(offset)
        return false unless @ss.check(/-\d/)
        return false if value_ending_previous?

        emit_number(@ss.scan(NEGATIVE_NUMBER), offset)
        true
      end

      # True when the most recently emitted token is one that ends a value.
      def value_ending_previous?
        VALUE_ENDING_KINDS.include?(@tokens.last&.kind)
      end

      # Consumes an unsigned number literal if one starts here.
      #
      # @return [Boolean] true when a number was consumed
      def scan_number(offset)
        literal = @ss.scan(NUMBER)
        return false unless literal

        emit_number(literal, offset)
        true
      end

      # Pushes an `:integer` or `:float` token for the matched text; a decimal
      # point makes it a float, and underscores are removed before conversion.
      def emit_number(matched, offset)
        is_float = matched.include?('.')
        clean = matched.delete('_')
        push(is_float ? :float : :integer, is_float ? clean.to_f : clean.to_i, offset)
      end

      # ---- strings -------------------------------------------------------

      # Consumes a single- or double-quoted string literal. Both quote styles
      # process backslash escapes; a raw newline or end of input before the
      # closing quote is an error reported at the opening quote.
      #
      # @return [Boolean] true when a string was consumed, false when the next
      #   character is not a quote
      # @raise [ParseError] when the literal is not terminated
      def scan_string(offset)
        quote = @ss.peek(1)
        return false unless ['"', "'"].include?(quote)

        @ss.pos += 1
        buffer = +''
        until @ss.eos?
          ch = @ss.getch
          case ch
          when quote
            push(:string, buffer, offset)
            return true
          when '\\'
            buffer << read_escape
          when "\n"
            raise_lex_error('unterminated string literal', offset)
          else
            buffer << ch
          end
        end
        raise_lex_error('unterminated string literal', offset)
      end

      # Reads the character after a backslash and returns what it stands for.
      # At end of input it returns '' so scan_string falls through to its
      # unterminated-string error.
      def read_escape
        ch = @ss.getch
        return '' if ch.nil?

        ESCAPES.fetch(ch, ch)
      end

      # ---- words: keywords, types, sugar, identifiers, labels, constants -

      # Consumes a word and classifies it as a constant path, a label, or a
      # plain word (see emit_word).
      #
      # @return [Boolean] true when a word was consumed
      def scan_word(offset)
        word = @ss.scan(WORD)
        return false unless word
        return true if scan_constant_path(word, offset)
        return true if scan_label(word, offset)

        emit_word(word, offset)
        true
      end

      # Constant path: Foo::Bar::Baz (any word followed by one or more `::name`).
      #
      # @raise [ParseError] when a `::` is not followed by a name
      def scan_constant_path(word, offset)
        return false unless @ss.check(/::/)

        path = word.dup
        while @ss.scan(/::/)
          part = @ss.scan(WORD)
          raise_lex_error("expected a name after '::'", @ss.pos) unless part
          path << '::' << part
        end
        push(:constant, path, offset)
        true
      end

      # Label: `name:` (but not `name::`, handled by the constant path above).
      # The colon is consumed; the token value is the bare name.
      def scan_label(word, offset)
        return false unless @ss.check(/:(?!:)/)

        @ss.pos += 1
        push(:label, word, offset)
        true
      end

      # Classifies a plain word by asking Grammar, in order: boolean, keyword,
      # type keyword, function sugar; anything else is an identifier.
      def emit_word(word, offset)
        if Grammar.boolean?(word)
          push(:boolean, Grammar.boolean(word), offset)
        elsif (kw = Grammar.keyword(word))
          push(kw, word, offset)
        elsif (type = Grammar.type_keyword(word))
          push(:type_keyword, type, offset)
        elsif (fn = Grammar.function_sugar(word))
          push(:function_sugar, fn, offset)
        else
          push(:identifier, word, offset)
        end
      end

      # ---- symbols and the bare colon ------------------------------------

      # Consumes `:name` as a `:symbol` token, or a lone `:` as `:colon`.
      #
      # @return [Boolean] true when either form was consumed
      def scan_symbol_or_colon(offset)
        if @ss.scan(/:[A-Za-z_][A-Za-z0-9_]*/)
          push(:symbol, @ss.matched[1..].to_sym, offset)
          true
        elsif @ss.scan(/:/)
          push(:colon, ':', offset)
          true
        else
          false
        end
      end

      # ---- operators and punctuation -------------------------------------

      # Consumes one operator or punctuation token from OPERATORS.
      #
      # @return [Boolean] true when an operator was consumed
      # @raise [ParseError] when the next character is a stray `=`
      def scan_operator(offset)
        text = @ss.scan(OPERATOR_PATTERN)
        if text
          push(OPERATOR_KINDS.fetch(text), text, offset)
          return true
        end
        # A stray '=' is the classic typo for '=='; name it specifically.
        raise_lex_error("unexpected '=' (did you mean '=='?)", offset) if @ss.peek(1) == '='
        false
      end

      # ---- helpers -------------------------------------------------------

      # Appends a token to the output.
      def push(kind, value, offset)
        @tokens << Token.new(kind, value, offset)
      end

      # Raises a ParseError tied to this lexer's source at `offset`.
      def raise_lex_error(message, offset)
        raise ParseError.new(message, source: @source, offset: offset)
      end
    end
  end
end
@@ -0,0 +1,52 @@
# frozen_string_literal: true

module Kumi
  module Parser
    # A syntax error detected during the parse phase — lexing or AST
    # construction — and nothing past it. Parse errors are about *shape*: an
    # unexpected character, a missing `end`, a hash key that isn't a symbol or
    # string. Anything that needs to know what a name *means* (types, axes,
    # whether a referenced declaration exists) is a semantic concern and belongs
    # to kumi-core's analyzer, not here.
    #
    # The error knows the byte offset where parsing got stuck and a plain-English
    # description of what was expected versus what was found. The Source renders
    # the caret frame; we keep a reference so callers that want a fully formatted
    # message (file:line:col + frame) can get one without re-deriving locations.
    class ParseError < StandardError
      # source        — the Source the error was raised against
      # offset        — position in the source where parsing got stuck
      # short_message — the bare "what/why" without location or frame, used
      #                 where a caller wants to compose its own message
      attr_reader :source, :offset, :short_message

      # @param message [String] description of the problem, without location
      # @param source [#file, #line_col, #code_frame, #location] the source
      #   being parsed; used to build the message and answer location queries
      # @param offset [Integer] position in the source where parsing got stuck
      def initialize(message, source:, offset:)
        @source = source
        @offset = offset
        @short_message = message
        super(build_message(message))
      end

      # @return whatever `source.location(offset)` yields; `line` and `column`
      #   below expect it to respond to `line` and `column`
      def location
        source.location(offset)
      end

      # Line of the error, taken from `location`.
      def line
        location.line
      end

      # Column of the error, taken from `location`.
      def column
        location.column
      end

      private

      # Builds the full exception message: a "file:line:col: message" header,
      # followed by the source's code frame on the next line when there is one.
      def build_message(message)
        # Locals are deliberately not called `line`/`col` so they cannot be
        # confused with the public `line` reader.
        line_no, col_no = source.line_col(offset)
        header = "#{source.file}:#{line_no}:#{col_no}: #{message}"
        frame = source.code_frame(offset)
        # A nil frame is treated like an empty one, so that building the error
        # can never fail on the frame and hide the parse error being reported.
        return header if frame.nil? || frame.empty?

        "#{header}\n#{frame}"
      end
    end
  end
end