kumi-parser 0.0.3 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CLAUDE.md +120 -0
- data/README.md +38 -41
- data/lib/kumi/parser/base.rb +51 -0
- data/lib/kumi/parser/direct_parser.rb +502 -0
- data/lib/kumi/parser/errors.rb +40 -0
- data/lib/kumi/parser/smart_tokenizer.rb +287 -0
- data/lib/kumi/parser/syntax_validator.rb +3 -25
- data/lib/kumi/parser/text_parser.rb +19 -34
- data/lib/kumi/parser/token.rb +84 -0
- data/lib/kumi/parser/token_metadata.rb +370 -0
- data/lib/kumi/parser/version.rb +1 -1
- data/lib/kumi/text_parser.rb +40 -0
- data/lib/kumi/text_schema.rb +31 -0
- data/lib/kumi-parser.rb +1 -0
- metadata +10 -8
- data/lib/kumi/parser/analyzer_diagnostic_converter.rb +0 -84
- data/lib/kumi/parser/text_parser/editor_diagnostic.rb +0 -102
- data/lib/kumi/parser/text_parser/grammar.rb +0 -214
- data/lib/kumi/parser/text_parser/parser.rb +0 -168
- data/lib/kumi/parser/text_parser/transform.rb +0 -170
- data/lib/kumi/parser.rb +0 -8
- data/test_basic.rb +0 -44
# frozen_string_literal: true

require_relative 'token_metadata'

module Kumi
  module Parser
    # Context-aware tokenizer that produces tokens with embedded semantic
    # metadata (category, precedence, parser hints) taken from TOKEN_METADATA.
    #
    # Usage:
    #   tokens = SmartTokenizer.new(source, source_file: 'schema.kumi').tokenize
    #
    # The returned stream always ends with an :eof token. Malformed input
    # (unterminated string, bare '=' or '!', unknown character) raises a
    # TokenizerError carrying a source location.
    class SmartTokenizer
      # @param source [String] raw schema text to tokenize
      # @param source_file [String] file name recorded in token locations
      def initialize(source, source_file: '<input>')
        @source = source
        @source_file = source_file
        @pos = 0
        @line = 1
        @column = 1
        # Tracks nesting of :schema / :input blocks so identifiers can be
        # tagged with the context they appear in; :global is the base sentinel.
        @context_stack = [:global]
        @tokens = []
      end

      # Scans the whole source and returns the Array of Token objects,
      # terminated by an :eof token.
      def tokenize
        while @pos < @source.length
          skip_whitespace_except_newlines

          case current_char
          when nil then break
          when "\n" then handle_newline
          when '#' then consume_comment
          when '"' then consume_string
          when /\d/ then consume_number
          when /[a-zA-Z_]/ then consume_identifier_or_keyword
          when ':' then consume_symbol_or_colon
          else
            consume_operator_or_punctuation
          end
        end

        add_token(:eof, nil, {})
        @tokens
      end

      private

      # Character at the current scan position, or nil at end of input.
      def current_char
        return nil if @pos >= @source.length

        @source[@pos]
      end

      # Character +offset+ positions ahead, or nil past end of input.
      def peek_char(offset = 1)
        peek_pos = @pos + offset
        return nil if peek_pos >= @source.length

        @source[peek_pos]
      end

      # Moves past the current character, keeping @line/@column in sync.
      def advance
        if current_char == "\n"
          @line += 1
          @column = 1
        else
          @column += 1
        end
        @pos += 1
      end

      # Newlines are significant tokens, so only spaces, tabs and carriage
      # returns are skipped here.
      def skip_whitespace_except_newlines
        advance while current_char && current_char.match?(/[ \t\r]/)
      end

      def handle_newline
        add_token(:newline, "\n", Kumi::Parser::TOKEN_METADATA[:newline])
        advance
      end

      # Consumes "# ..." up to (not including) the trailing newline.
      # The token is located at the '#' character, not at the comment's end.
      def consume_comment
        start_line = @line
        start_column = @column
        advance # skip #

        comment_text = +''
        while current_char && current_char != "\n"
          comment_text << current_char
          advance
        end

        emit(:comment, comment_text, Kumi::Parser::TOKEN_METADATA[:comment],
             line: start_line, column: start_column)
      end

      # Consumes a double-quoted string, handling \n \t \r \\ \" escapes
      # (unknown escapes keep the raw character). Raises TokenizerError if
      # the closing quote is missing.
      def consume_string
        # Capture the opening position so the token points at the start of
        # the literal even when the string spans multiple lines.
        start_line = @line
        start_column = @column
        advance # skip opening quote

        string_content = +''
        while current_char && current_char != '"'
          if current_char == '\\'
            advance
            # Handle escape sequences
            case current_char
            when 'n' then string_content << "\n"
            when 't' then string_content << "\t"
            when 'r' then string_content << "\r"
            when '\\' then string_content << '\\'
            when '"' then string_content << '"'
            else
              string_content << current_char if current_char
            end
          else
            string_content << current_char
          end
          advance
        end

        raise_tokenizer_error('Unterminated string literal') if current_char != '"'

        advance # skip closing quote

        emit(:string, string_content, Kumi::Parser::TOKEN_METADATA[:string],
             line: start_line, column: start_column)
      end

      # Consumes an :integer or :float. Underscore digit separators are
      # accepted; a '.' only continues the number when followed by a digit,
      # so member access such as `1.foo` is not swallowed.
      def consume_number
        start_line = @line
        start_column = @column
        number_str = +''
        has_dot = false

        while current_char && (current_char.match?(/[0-9_]/) || (!has_dot && current_char == '.'))
          if current_char == '.'
            # Make sure next character is a digit to distinguish from member access
            break unless peek_char && peek_char.match?(/\d/)

            has_dot = true
          end
          number_str << current_char
          advance
        end

        token_type = has_dot ? :float : :integer
        emit(token_type, number_str, Kumi::Parser::TOKEN_METADATA[token_type],
             line: start_line, column: start_column)
      end

      # Consumes a word and classifies it as a keyword (per KEYWORDS) or as
      # an identifier tagged with the context it appears in.
      def consume_identifier_or_keyword
        start_line = @line
        start_column = @column
        identifier = consume_while { |c| c.match?(/[a-zA-Z0-9_]/) }

        keyword_type = Kumi::Parser::KEYWORDS[identifier]
        if keyword_type
          metadata = Kumi::Parser::TOKEN_METADATA[keyword_type].dup

          # Block-opening/closing keywords also maintain the context stack.
          case keyword_type
          when :schema, :input
            @context_stack.push(keyword_type)
            metadata[:opens_context] = keyword_type
          when :end
            # Never pop the :global sentinel; a stray `end` leaves
            # closes_context nil for the parser to diagnose.
            closed_context = @context_stack.pop if @context_stack.length > 1
            metadata[:closes_context] = closed_context
          end

          emit(keyword_type, identifier, metadata, line: start_line, column: start_column)
        else
          # It's an identifier - determine its role based on context
          metadata = Kumi::Parser::TOKEN_METADATA[:identifier].dup

          case current_context
          when :input
            metadata[:context] = :input_declaration
          when :schema
            metadata[:context] = :schema_body
          end

          emit(:identifier, identifier, metadata, line: start_line, column: start_column)
        end
      end

      # Consumes a symbol literal (:name) or, failing that, a bare colon.
      def consume_symbol_or_colon
        start_column = @column

        if peek_char && peek_char.match?(/[a-zA-Z_]/)
          # It's a symbol like :name
          advance # skip :
          symbol_name = consume_while { |c| c.match?(/[a-zA-Z0-9_]/) }
          emit(:symbol, symbol_name.to_sym, Kumi::Parser::TOKEN_METADATA[:symbol], column: start_column)
        else
          # It's just a colon
          emit(:colon, ':', Kumi::Parser::TOKEN_METADATA[:colon], column: start_column)
          advance
        end
      end

      # Consumes the comparison operators (==, !=, >, >=, <, <=) and any
      # single character listed in CHAR_TO_TOKEN. Bare '=' and '!' are
      # rejected with a hint, as are characters with no token mapping.
      def consume_operator_or_punctuation
        start_column = @column
        char = current_char

        case char
        when '='
          if peek_char == '='
            emit_two_char(:eq, '==', start_column)
          else
            raise_tokenizer_error("Unexpected '=' (did you mean '=='?)")
          end
        when '!'
          if peek_char == '='
            emit_two_char(:ne, '!=', start_column)
          else
            raise_tokenizer_error("Unexpected '!' (did you mean '!='?)")
          end
        when '>'
          if peek_char == '='
            emit_two_char(:gte, '>=', start_column)
          else
            advance
            emit(:gt, '>', Kumi::Parser::TOKEN_METADATA[:gt], column: start_column)
          end
        when '<'
          if peek_char == '='
            emit_two_char(:lte, '<=', start_column)
          else
            advance
            emit(:lt, '<', Kumi::Parser::TOKEN_METADATA[:lt], column: start_column)
          end
        else
          # Single character operators/punctuation
          token_type = CHAR_TO_TOKEN[char]
          raise_tokenizer_error("Unexpected character: #{char}") unless token_type

          emit(token_type, char, Kumi::Parser::TOKEN_METADATA[token_type].dup, column: start_column)
          advance
        end
      end

      # Emits a two-character operator token and consumes both characters.
      def emit_two_char(type, lexeme, start_column)
        advance
        advance
        emit(type, lexeme, Kumi::Parser::TOKEN_METADATA[type], column: start_column)
      end

      # Accumulates consecutive characters for which the block returns true.
      def consume_while(&block)
        result = +''
        while current_char && block.call(current_char)
          result << current_char
          advance
        end
        result
      end

      def current_context
        @context_stack.last
      end

      # Appends a token located at the CURRENT scan position (for tokens
      # with no multi-character lexeme, e.g. :newline and :eof).
      def add_token(type, value, metadata)
        emit(type, value, metadata)
      end

      # Appends a token at an explicit (line, column); defaults to the
      # current scan position.
      def emit(type, value, metadata, line: @line, column: @column)
        location = Kumi::Syntax::Location.new(file: @source_file, line: line, column: column)
        @tokens << Token.new(type, value, location, metadata)
      end

      # Raises TokenizerError pinned to the current position.
      # NOTE(review): this resolves via constant lookup in Kumi::Parser —
      # confirm errors.rb defines TokenizerError there; TextParser rescues
      # Kumi::Parser::Errors::TokenizerError, which should be the same class.
      def raise_tokenizer_error(message)
        location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: @column)
        raise TokenizerError.new(message, location: location)
      end
    end
  end
end
@@ -1,33 +1,11 @@
|
|
1
1
|
# frozen_string_literal: true
|
2
2
|
|
3
|
-
require_relative 'text_parser/parser'
|
4
|
-
require_relative 'text_parser/editor_diagnostic'
|
5
|
-
require_relative 'error_extractor'
|
6
|
-
|
7
3
|
module Kumi
|
8
4
|
module Parser
|
9
|
-
# Validates Kumi DSL syntax
|
5
|
+
# Validates Kumi DSL syntax using new parser
|
10
6
|
class SyntaxValidator
|
11
|
-
def initialize
|
12
|
-
@parser = TextParser::Parser.new
|
13
|
-
end
|
14
|
-
|
15
7
|
def validate(text, source_file: '<input>')
|
16
|
-
|
17
|
-
TextParser::DiagnosticCollection.new([])
|
18
|
-
rescue StandardError => e
|
19
|
-
# ErrorExtractor.extract returns a hash, convert it to an EditorDiagnostic
|
20
|
-
error_hash = ErrorExtractor.extract(e)
|
21
|
-
return TextParser::DiagnosticCollection.new([]) if error_hash.empty?
|
22
|
-
|
23
|
-
diagnostic = TextParser::EditorDiagnostic.new(
|
24
|
-
line: error_hash[:line],
|
25
|
-
column: error_hash[:column],
|
26
|
-
message: error_hash[:message],
|
27
|
-
severity: error_hash[:severity],
|
28
|
-
type: error_hash[:type]
|
29
|
-
)
|
30
|
-
TextParser::DiagnosticCollection.new([diagnostic])
|
8
|
+
Kumi::Parser::Base.validate(text, source_file: source_file)
|
31
9
|
end
|
32
10
|
|
33
11
|
def valid?(text, source_file: '<input>')
|
@@ -36,7 +14,7 @@ module Kumi
|
|
36
14
|
|
37
15
|
def first_error(text, source_file: '<input>')
|
38
16
|
diagnostics = validate(text, source_file: source_file)
|
39
|
-
diagnostics.empty? ? nil : diagnostics.
|
17
|
+
diagnostics.empty? ? nil : diagnostics.first[:message]
|
40
18
|
end
|
41
19
|
end
|
42
20
|
end
|
# frozen_string_literal: true

require_relative 'smart_tokenizer'
require_relative 'direct_parser'

module Kumi
  module Parser
    # Clean text parser focused on core parsing functionality:
    # tokenize with SmartTokenizer, then build the AST with DirectParser.
    module TextParser
      class << self
        # Parse text to AST.
        #
        # @param text [String] schema source
        # @param source_file [String] file name used in diagnostics
        # @raise [Kumi::Errors::SyntaxError] on any tokenizer/parser error
        def parse(text, source_file: '<input>')
          tokens = Kumi::Parser::SmartTokenizer.new(text, source_file: source_file).tokenize
          Kumi::Parser::DirectParser.new(tokens).parse
        rescue Kumi::Parser::Errors::ParseError, Kumi::Parser::Errors::TokenizerError => e
          # Convert parser errors to the expected SyntaxError for compatibility
          raise Kumi::Errors::SyntaxError, e.message
        end

        # Check if text is syntactically valid.
        def valid?(text, source_file: '<input>')
          parse(text, source_file: source_file)
          true
        rescue StandardError
          # Any failure (syntax or otherwise) means "not valid"; the error
          # itself is intentionally discarded here — use #validate for details.
          false
        end

        # Basic validation - returns diagnostics from SyntaxValidator.
        def validate(text, source_file: '<input>')
          validator.validate(text, source_file: source_file)
        end

        private

        # Lazily built, reused validator instance (the validator is
        # stateless per call, so sharing one avoids per-call allocation).
        def validator
          @validator ||= Kumi::Parser::SyntaxValidator.new
        end
      end
    end
  end
end
# frozen_string_literal: true

module Kumi
  module Parser
    # Token produced by SmartTokenizer. Besides the usual (type, value,
    # location) triple it carries a metadata Hash so the parser can query
    # semantic facts (category, precedence, hints) without a lookup table.
    class Token
      attr_reader :type, :value, :location, :metadata

      # @param type [Symbol] token kind, e.g. :identifier, :eq, :string
      # @param value [Object] lexeme or literal value
      # @param location [Object] source position (file/line/column)
      # @param metadata [Hash] semantic metadata (see TOKEN_METADATA)
      def initialize(type, value, location, metadata = {})
        @type = type
        @value = value
        @location = location
        @metadata = metadata
      end

      # -- Semantic predicates embedded in token --------------------------

      def keyword?
        @metadata[:category] == :keyword
      end

      def operator?
        @metadata[:category] == :operator
      end

      def literal?
        @metadata[:category] == :literal
      end

      def identifier?
        @metadata[:category] == :identifier
      end

      def punctuation?
        @metadata[:category] == :punctuation
      end

      # -- Operator precedence embedded in token --------------------------

      # Binding strength for precedence-climbing parsing; 0 for
      # non-operator tokens.
      def precedence
        @metadata[:precedence] || 0
      end

      def left_associative?
        @metadata[:associativity] == :left
      end

      def right_associative?
        @metadata[:associativity] == :right
      end

      # -- Parser hints embedded in token ---------------------------------

      def expects_block?
        @metadata[:expects_block] == true
      end

      def terminates_expression?
        @metadata[:terminates_expression] == true
      end

      def starts_expression?
        @metadata[:starts_expression] == true
      end

      # Direct AST construction hint (class to instantiate), or nil.
      def ast_class
        @metadata[:ast_class]
      end

      def to_s
        "#{@type}(#{@value.inspect}) at #{@location}"
      end

      def inspect
        to_s
      end

      # Equality deliberately ignores metadata: two tokens with the same
      # type, value and location are the same token regardless of hints.
      def ==(other)
        other.is_a?(Token) &&
          @type == other.type &&
          @value == other.value &&
          @location == other.location
      end

      # Keep eql?/hash consistent with == so tokens behave correctly as
      # Hash keys and in Set/uniq operations (metadata excluded, as in ==).
      alias eql? ==

      def hash
        [self.class, @type, @value, @location].hash
      end
    end
  end
end