kumi-parser 0.0.32 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rubocop.yml +41 -0
- data/CHANGELOG.md +64 -0
- data/CLAUDE.md +59 -120
- data/README.md +28 -6
- data/examples/parse_and_inspect.rb +34 -0
- data/kumi-parser.gemspec +3 -4
- data/lib/kumi/parser/grammar.rb +120 -0
- data/lib/kumi/parser/lexer.rb +232 -0
- data/lib/kumi/parser/parse_error.rb +52 -0
- data/lib/kumi/parser/parser.rb +692 -0
- data/lib/kumi/parser/source.rb +76 -0
- data/lib/kumi/parser/text_parser.rb +37 -27
- data/lib/kumi/parser/token.rb +10 -71
- data/lib/kumi/parser/version.rb +1 -1
- data/lib/kumi-parser.rb +9 -10
- metadata +16 -37
- data/examples/debug_text_parser.rb +0 -41
- data/examples/debug_transform_rule.rb +0 -26
- data/examples/text_parser_comprehensive_test.rb +0 -333
- data/examples/text_parser_test_with_comments.rb +0 -146
- data/lib/kumi/parser/base.rb +0 -51
- data/lib/kumi/parser/direct_parser.rb +0 -698
- data/lib/kumi/parser/error_extractor.rb +0 -89
- data/lib/kumi/parser/errors.rb +0 -40
- data/lib/kumi/parser/helpers.rb +0 -154
- data/lib/kumi/parser/smart_tokenizer.rb +0 -373
- data/lib/kumi/parser/syntax_validator.rb +0 -21
- data/lib/kumi/parser/text_parser/api.rb +0 -60
- data/lib/kumi/parser/token_constants.rb +0 -467
- data/lib/kumi/text_parser.rb +0 -40
- data/lib/kumi/text_schema.rb +0 -31
|
@@ -0,0 +1,232 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
require 'strscan'
|
|
4
|
+
|
|
5
|
+
module Kumi
|
|
6
|
+
module Parser
|
|
7
|
+
# Turns source text into a flat array of Tokens in a single StringScanner
# pass. Inline whitespace (spaces, tabs, carriage returns) is skipped;
# newlines (significant statement separators) and comments are emitted as
# tokens so the parser can ignore them uniformly. Every token carries its
# start offset, so all location and error-frame work is deferred to Source.
#
# The lexer is deliberately context-free: it does not track whether it is
# inside `input do … end`. Disambiguation that used to live in the old
# tokenizer's context stack is the parser's job now.
class Lexer
  # @param source [#text] object whose `text` is the string to lex; it is
  #   also handed to ParseError so errors can be located.
  def initialize(source)
    @source = source
    @ss = StringScanner.new(source.text)
    @tokens = []
  end

  # Lexes the whole source and returns the token list, always terminated by
  # a single `:eof` token positioned at the end of the text.
  #
  # Every call starts over from offset 0 with a fresh array, so calling
  # `tokenize` more than once returns equal results instead of appending a
  # second `:eof` to (and mutating) a previously returned array.
  #
  # @return [Array<Token>]
  # @raise [ParseError] on an unexpected character, an unterminated string,
  #   a dangling `::`, or a stray `=`
  def tokenize
    @ss.reset
    @tokens = []

    until @ss.eos?
      skip_inline_whitespace
      break if @ss.eos?

      scan_token(@ss.pos)
    end

    push(:eof, nil, @ss.pos)
    @tokens
  end

  private

  # Scans exactly one token starting at `offset` (the current scanner
  # position). The order of the sub-scanners matters: negative literals are
  # tried before plain numbers and operators, and words before symbols.
  def scan_token(offset)
    if @ss.scan(/\n/)
      push(:newline, "\n", offset)
    elsif @ss.scan(/#[^\n]*/)
      # The comment's value is its text without the leading '#'.
      push(:comment, @ss.matched[1..], offset)
    elsif scan_negative_number(offset) || scan_number(offset) ||
          scan_string(offset) || scan_word(offset) ||
          scan_symbol_or_colon(offset) || scan_operator(offset)
      nil
    else
      raise_lex_error("unexpected character #{current_char.inspect}", offset)
    end
  end

  # The full character at the scanner position. StringScanner#peek counts
  # bytes, so for a multibyte character it would return only a broken
  # leading byte; matching `.` returns the whole character. Falls back to
  # the raw byte if nothing matches.
  def current_char
    @ss.check(/./m) || @ss.peek(1)
  end

  def skip_inline_whitespace
    @ss.skip(/[ \t\r]+/)
  end

  # ---- numbers -------------------------------------------------------

  # Digits with optional underscores and an optional single decimal point
  # that must be followed by a digit (so `1..5` and `a.b` aren't swallowed).
  NUMBER = /\d[\d_]*(\.\d[\d_]*)?/.freeze

  # Tokens after which a `-` is the binary subtraction operator rather than
  # the sign of a literal.
  VALUE_ENDING_KINDS = %i[integer float string boolean symbol identifier constant rparen rbracket rbrace].freeze

  # `-5` is a single negative-number literal, but only in operand position:
  # `x - 5` is subtraction, `shift(a, -1)` and `x - -5` carry a negative
  # literal. We decide by the previously emitted token — a `-` right after
  # a value (number, name, `)`, `]`, …) is the binary operator and is left
  # for scan_operator; anywhere else, `-<digit>` is a negative literal.
  def scan_negative_number(offset)
    return false unless @ss.check(/-\d/)
    return false if value_ending_previous?

    @ss.pos += 1 # step over the sign; the digits are scanned separately
    emit_number("-#{@ss.scan(NUMBER)}", offset)
    true
  end

  # True when the most recently emitted token ends a value. Newline and
  # comment tokens are emitted too, so a `-<digit>` that starts a line is
  # treated as a literal.
  def value_ending_previous?
    VALUE_ENDING_KINDS.include?(@tokens.last&.kind)
  end

  # Scans an unsigned number literal, if one starts here.
  def scan_number(offset)
    text = @ss.scan(NUMBER)
    return false unless text

    emit_number(text, offset)
    true
  end

  # Emits `:float` when the literal contains a decimal point, `:integer`
  # otherwise; underscores are stripped before conversion.
  def emit_number(matched, offset)
    digits = matched.delete('_')
    if matched.include?('.')
      push(:float, digits.to_f, offset)
    else
      push(:integer, digits.to_i, offset)
    end
  end

  # ---- strings -------------------------------------------------------

  # Backslash escapes with a special meaning; any other escaped character
  # stands for itself.
  ESCAPES = { 'n' => "\n", 't' => "\t", 'r' => "\r", '\\' => '\\', '"' => '"', "'" => "'" }.freeze

  # Scans a single- or double-quoted string. Both quote styles process the
  # same escapes. A raw newline or the end of input before the closing
  # quote is an error reported at the opening quote.
  def scan_string(offset)
    quote = @ss.peek(1)
    return false unless ['"', "'"].include?(quote)

    @ss.pos += 1
    buffer = +''
    until @ss.eos?
      ch = @ss.getch
      case ch
      when quote
        push(:string, buffer, offset)
        return true
      when '\\'
        buffer << read_escape
      when "\n"
        raise_lex_error('unterminated string literal', offset)
      else
        buffer << ch
      end
    end
    raise_lex_error('unterminated string literal', offset)
  end

  # Consumes the character after a backslash and returns what it denotes.
  # At end of input it returns an empty string; the caller's loop then ends
  # and reports the unterminated literal.
  def read_escape
    ch = @ss.getch
    return '' if ch.nil?

    ESCAPES.fetch(ch, ch)
  end

  # ---- words: keywords, types, sugar, identifiers, labels, constants -

  # A bare name: a letter or underscore followed by letters, digits or
  # underscores.
  NAME = /[A-Za-z_][A-Za-z0-9_]*/.freeze

  # Scans a name and classifies it: constant path first, then label, then
  # whatever Grammar says the bare word is.
  def scan_word(offset)
    word = @ss.scan(NAME)
    return false unless word

    scan_constant_path(word, offset) || scan_label(word, offset) || emit_word(word, offset)
    true
  end

  # Constant path: Foo::Bar::Baz (any word followed by one or more `::name`).
  # A `::` with no name after it is an error reported just past the `::`.
  def scan_constant_path(word, offset)
    return false unless @ss.check(/::/)

    path = word.dup
    while @ss.scan(/::/)
      part = @ss.scan(NAME)
      raise_lex_error("expected a name after '::'", @ss.pos) unless part
      path << '::' << part
    end
    push(:constant, path, offset)
    true
  end

  # Label: `name:` (but not `name::`, handled by the constant path above).
  # The colon is consumed and is not part of the token's value.
  def scan_label(word, offset)
    return false unless @ss.check(/:(?!:)/)

    @ss.pos += 1
    push(:label, word, offset)
    true
  end

  # Classifies a bare word by asking Grammar, in priority order: boolean,
  # keyword, type keyword, function sugar; anything else is an identifier.
  def emit_word(word, offset)
    if Grammar.boolean?(word)
      push(:boolean, Grammar.boolean(word), offset)
    elsif (kw = Grammar.keyword(word))
      push(kw, word, offset)
    elsif (type = Grammar.type_keyword(word))
      push(:type_keyword, type, offset)
    elsif (fn = Grammar.function_sugar(word))
      push(:function_sugar, fn, offset)
    else
      push(:identifier, word, offset)
    end
  end

  # ---- symbols and the bare colon ------------------------------------

  # `:name` becomes a `:symbol` token whose value is the Ruby symbol; a
  # colon not followed by a name is a bare `:colon`.
  def scan_symbol_or_colon(offset)
    if @ss.scan(/:[A-Za-z_][A-Za-z0-9_]*/)
      push(:symbol, @ss.matched[1..].to_sym, offset)
      true
    elsif @ss.scan(/:/)
      push(:colon, ':', offset)
      true
    else
      false
    end
  end

  # ---- operators and punctuation -------------------------------------

  # Ordered longest-first so multi-char operators win over their prefixes.
  OPERATORS = [
    ['...', :dot_dot_dot], ['..', :dot_dot],
    ['**', :power], ['==', :eq], ['!=', :ne], ['>=', :gte], ['<=', :lte], ['=>', :arrow],
    ['+', :add], ['-', :subtract], ['*', :multiply], ['/', :divide], ['%', :modulo],
    ['>', :gt], ['<', :lt], ['&', :and], ['|', :or],
    ['.', :dot], [',', :comma],
    ['(', :lparen], [')', :rparen], ['[', :lbracket], [']', :rbracket], ['{', :lbrace], ['}', :rbrace]
  ].freeze

  # One alternation over every operator, in OPERATORS order. Alternatives
  # are tried left to right, so the longest-first ordering is preserved.
  OPERATOR_PATTERN = Regexp.union(OPERATORS.map(&:first)).freeze

  # Operator text => token kind.
  OPERATOR_KINDS = OPERATORS.to_h.freeze

  # Scans one operator or punctuation token.
  def scan_operator(offset)
    text = @ss.scan(OPERATOR_PATTERN)
    if text
      push(OPERATOR_KINDS.fetch(text), text, offset)
      return true
    end

    # A stray '=' is the classic typo for '=='; name it specifically.
    raise_lex_error("unexpected '=' (did you mean '=='?)", offset) if @ss.peek(1) == '='
    false
  end

  # ---- helpers -------------------------------------------------------

  # Appends a token to the output list.
  def push(kind, value, offset)
    @tokens << Token.new(kind, value, offset)
  end

  # Raises a ParseError located at `offset` in the source being lexed.
  def raise_lex_error(message, offset)
    raise ParseError.new(message, source: @source, offset: offset)
  end
end
|
|
231
|
+
end
|
|
232
|
+
end
|
|
@@ -0,0 +1,52 @@
|
|
|
1
|
+
# frozen_string_literal: true
|
|
2
|
+
|
|
3
|
+
module Kumi
|
|
4
|
+
module Parser
|
|
5
|
+
# A syntax error detected during the parse phase — lexing or AST
# construction — and nothing past it. Parse errors are about *shape*: an
# unexpected character, a missing `end`, a hash key that isn't a symbol or
# string. Anything that needs to know what a name *means* (types, axes,
# whether a referenced declaration exists) is a semantic concern and belongs
# to kumi-core's analyzer, not here.
#
# The error knows the offset where parsing got stuck and a plain-English
# description of what was expected versus what was found. The Source renders
# the caret frame; we keep a reference so callers that want a fully formatted
# message (file:line:col + frame) can get one without re-deriving locations.
class ParseError < StandardError
  # source        — the object the error was raised against
  # offset        — position in the source where the error was detected
  # short_message — the bare "what/why" without location or frame, for
  #                 callers that want to compose their own message
  attr_reader :source, :offset, :short_message

  # @param message [String] description of the problem, without location
  # @param source [#file, #line_col, #code_frame, #location] the source
  #   the offset refers to
  # @param offset [Integer] where the error was detected
  def initialize(message, source:, offset:)
    @source = source
    @offset = offset
    @short_message = message
    super(build_message(message))
  end

  # The source's location object for this error's offset.
  def location
    source.location(offset)
  end

  # Line of the error, as reported by `location`.
  def line
    location.line
  end

  # Column of the error, as reported by `location`.
  def column
    location.column
  end

  private

  # Builds "file:line:col: message", followed by the source's code frame
  # on the next line(s) when there is one.
  def build_message(message)
    line, col = source.line_col(offset)
    header = "#{source.file}:#{line}:#{col}: #{message}"

    # `to_s` keeps a nil frame from raising inside the constructor, which
    # would replace the real parse error with a NoMethodError.
    frame = source.code_frame(offset).to_s
    frame.empty? ? header : "#{header}\n#{frame}"
  end
end
|
|
51
|
+
end
|
|
52
|
+
end
|