kumi-parser 0.0.2 → 0.0.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/.rspec +3 -0
- data/CLAUDE.md +120 -0
- data/LICENSE +21 -0
- data/README.md +73 -0
- data/Rakefile +10 -0
- data/examples/debug_text_parser.rb +41 -0
- data/examples/debug_transform_rule.rb +26 -0
- data/examples/text_parser_comprehensive_test.rb +333 -0
- data/examples/text_parser_test_with_comments.rb +146 -0
- data/kumi-parser.gemspec +45 -0
- data/lib/kumi/parser/base.rb +51 -0
- data/lib/kumi/parser/direct_parser.rb +502 -0
- data/lib/kumi/parser/error_extractor.rb +89 -0
- data/lib/kumi/parser/errors.rb +40 -0
- data/lib/kumi/parser/smart_tokenizer.rb +287 -0
- data/lib/kumi/parser/syntax_validator.rb +21 -0
- data/lib/kumi/parser/text_parser/api.rb +60 -0
- data/lib/kumi/parser/text_parser.rb +38 -0
- data/lib/kumi/parser/token.rb +84 -0
- data/lib/kumi/parser/token_metadata.rb +370 -0
- data/lib/kumi/parser/version.rb +7 -0
- data/lib/kumi/text_parser.rb +40 -0
- data/lib/kumi/text_schema.rb +31 -0
- data/lib/kumi-parser.rb +19 -0
- metadata +26 -2
@@ -0,0 +1,287 @@
# frozen_string_literal: true

require_relative 'token_metadata'

module Kumi
  module Parser
    # Context-aware tokenizer that produces tokens with embedded semantic metadata.
    #
    # Maintains a context stack (:global, :schema, :input) so identifiers can be
    # tagged with the declaration context they appear in, and records a
    # line/column Location for every token emitted.
    class SmartTokenizer
      # source      - program text to tokenize
      # source_file - label stored in Location objects for diagnostics
      def initialize(source, source_file: '<input>')
        @source = source
        @source_file = source_file
        @pos = 0
        @line = 1
        @column = 1
        @context_stack = [:global]
        @tokens = []
      end

      # Scans the entire source and returns the token array, terminated by :eof.
      def tokenize
        while @pos < @source.length
          skip_whitespace_except_newlines

          case current_char
          when nil then break
          when "\n" then handle_newline
          when '#' then consume_comment
          when '"' then consume_string
          when /\d/ then consume_number
          when /[a-zA-Z_]/ then consume_identifier_or_keyword
          when ':' then consume_symbol_or_colon
          else
            consume_operator_or_punctuation
          end
        end

        add_token(:eof, nil, {})
        @tokens
      end

      private

      # Character at the current position, or nil at end of input.
      def current_char
        return nil if @pos >= @source.length

        @source[@pos]
      end

      # Character +offset+ positions ahead, or nil past end of input.
      def peek_char(offset = 1)
        peek_pos = @pos + offset
        return nil if peek_pos >= @source.length

        @source[peek_pos]
      end

      # Advances one character, keeping the line/column counters in sync.
      def advance
        if current_char == "\n"
          @line += 1
          @column = 1
        else
          @column += 1
        end
        @pos += 1
      end

      # Newlines are significant to the grammar, so only spaces/tabs/CRs are skipped.
      def skip_whitespace_except_newlines
        advance while current_char && current_char.match?(/[ \t\r]/)
      end

      def handle_newline
        add_token(:newline, "\n", Kumi::Parser::TOKEN_METADATA[:newline])
        advance
      end

      # Consumes a '#' comment through end of line.
      def consume_comment
        start_column = @column
        advance # skip #

        comment_text = ''
        while current_char && current_char != "\n"
          comment_text += current_char
          advance
        end

        # Fix: anchor the token at the '#' column. Previously this location was
        # computed but unused, and the token was emitted via add_token, which
        # stamps the end-of-comment position instead.
        location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
        @tokens << Token.new(:comment, comment_text, location, Kumi::Parser::TOKEN_METADATA[:comment])
      end

      # Consumes a double-quoted string literal, handling \n \t \r \\ \" escapes.
      # Raises a TokenizerError when the closing quote is missing.
      def consume_string
        start_column = @column
        advance # skip opening quote

        string_content = ''
        while current_char && current_char != '"'
          if current_char == '\\'
            advance
            # Handle escape sequences
            case current_char
            when 'n' then string_content += "\n"
            when 't' then string_content += "\t"
            when 'r' then string_content += "\r"
            when '\\' then string_content += '\\'
            when '"' then string_content += '"'
            else
              # Unknown escape: keep the character as-is (e.g. "\q" -> "q")
              string_content += current_char if current_char
            end
          else
            string_content += current_char
          end
          advance
        end

        raise_tokenizer_error('Unterminated string literal') if current_char != '"'

        advance # skip closing quote

        location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
        @tokens << Token.new(:string, string_content, location, Kumi::Parser::TOKEN_METADATA[:string])
      end

      # Consumes an integer or float literal. Underscore separators are kept in
      # the raw value; a '.' only joins the number when followed by a digit, so
      # member access after a number is not swallowed.
      def consume_number
        start_column = @column
        number_str = ''
        has_dot = false

        # Consume digits and underscores, and optionally a decimal point
        while current_char && (current_char.match?(/[0-9_]/) || (!has_dot && current_char == '.'))
          if current_char == '.'
            # Make sure next character is a digit to distinguish from member access
            break unless peek_char && peek_char.match?(/\d/)

            has_dot = true
            number_str += current_char
          else
            number_str += current_char
          end
          advance
        end

        token_type = has_dot ? :float : :integer
        location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
        @tokens << Token.new(token_type, number_str, location, Kumi::Parser::TOKEN_METADATA[token_type])
      end

      # Consumes a word and classifies it as a keyword (per KEYWORDS) or a
      # context-tagged identifier, updating the context stack for block keywords.
      def consume_identifier_or_keyword
        start_column = @column
        identifier = consume_while { |c| c.match?(/[a-zA-Z0-9_]/) }

        # Check if it's a keyword (explicit assignment avoids the
        # assignment-in-condition warning the original triggered)
        keyword_type = Kumi::Parser::KEYWORDS[identifier]
        if keyword_type
          metadata = Kumi::Parser::TOKEN_METADATA[keyword_type].dup

          # Update context based on keyword
          case keyword_type
          when :schema, :input
            @context_stack.push(keyword_type)
            metadata[:opens_context] = keyword_type
          when :end
            # Never pop :global; closes_context is nil for an unmatched 'end'
            closed_context = @context_stack.pop if @context_stack.length > 1
            metadata[:closes_context] = closed_context
          end

          location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
          @tokens << Token.new(keyword_type, identifier, location, metadata)
        else
          # It's an identifier - determine its role based on context
          metadata = Kumi::Parser::TOKEN_METADATA[:identifier].dup

          # Add context-specific metadata
          case current_context
          when :input
            metadata[:context] = :input_declaration
          when :schema
            metadata[:context] = :schema_body
          end

          location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
          @tokens << Token.new(:identifier, identifier, location, metadata)
        end
      end

      # ':' followed by a word is a symbol literal (:name); otherwise a bare colon.
      def consume_symbol_or_colon
        start_column = @column

        if peek_char && peek_char.match?(/[a-zA-Z_]/)
          # It's a symbol like :name
          advance # skip :
          symbol_name = consume_while { |c| c.match?(/[a-zA-Z0-9_]/) }

          location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
          @tokens << Token.new(:symbol, symbol_name.to_sym, location, Kumi::Parser::TOKEN_METADATA[:symbol])
        else
          # It's just a colon
          location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
          @tokens << Token.new(:colon, ':', location, Kumi::Parser::TOKEN_METADATA[:colon])
          advance
        end
      end

      # Consumes ==, !=, >=, <=, >, < and single-character punctuation
      # (via CHAR_TO_TOKEN). Bare '=' and '!' are rejected with a hint.
      def consume_operator_or_punctuation
        start_column = @column
        char = current_char

        # Handle multi-character operators
        case char
        when '='
          if peek_char == '='
            advance
            advance
            location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
            @tokens << Token.new(:eq, '==', location, Kumi::Parser::TOKEN_METADATA[:eq])
          else
            raise_tokenizer_error("Unexpected '=' (did you mean '=='?)")
          end
        when '!'
          if peek_char == '='
            advance
            advance
            location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
            @tokens << Token.new(:ne, '!=', location, Kumi::Parser::TOKEN_METADATA[:ne])
          else
            raise_tokenizer_error("Unexpected '!' (did you mean '!='?)")
          end
        when '>'
          if peek_char == '='
            advance
            advance
            location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
            @tokens << Token.new(:gte, '>=', location, Kumi::Parser::TOKEN_METADATA[:gte])
          else
            advance
            location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
            @tokens << Token.new(:gt, '>', location, Kumi::Parser::TOKEN_METADATA[:gt])
          end
        when '<'
          if peek_char == '='
            advance
            advance
            location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
            @tokens << Token.new(:lte, '<=', location, Kumi::Parser::TOKEN_METADATA[:lte])
          else
            advance
            location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
            @tokens << Token.new(:lt, '<', location, Kumi::Parser::TOKEN_METADATA[:lt])
          end
        else
          # Single character operators/punctuation
          token_type = CHAR_TO_TOKEN[char]
          if token_type
            metadata = Kumi::Parser::TOKEN_METADATA[token_type].dup
            location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
            @tokens << Token.new(token_type, char, location, metadata)
            advance
          else
            raise_tokenizer_error("Unexpected character: #{char}")
          end
        end
      end

      # Accumulates characters while the block returns truthy; returns the run.
      def consume_while(&block)
        result = ''
        while current_char && block.call(current_char)
          result += current_char
          advance
        end
        result
      end

      def current_context
        @context_stack.last
      end

      # Appends a token stamped at the CURRENT position (suitable for
      # zero/one-character tokens like :newline and :eof).
      def add_token(type, value, metadata)
        location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: @column)
        token = Token.new(type, value, location, metadata)
        @tokens << token
      end

      def raise_tokenizer_error(message)
        location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: @column)
        raise TokenizerError.new(message, location: location)
      end
    end

    # NOTE(review): TokenizerError (raised above) is expected to come from
    # errors.rb — confirm it is loaded before this file is used.
  end
end
@@ -0,0 +1,21 @@
# frozen_string_literal: true

module Kumi
  module Parser
    # Validates Kumi DSL syntax using the new parser.
    # Thin convenience wrapper around Kumi::Parser::Base.validate.
    class SyntaxValidator
      # Runs validation and returns the diagnostics array (empty when valid).
      def validate(text, source_file: '<input>')
        Kumi::Parser::Base.validate(text, source_file: source_file)
      end

      # True when the text produces no diagnostics.
      def valid?(text, source_file: '<input>')
        validate(text, source_file: source_file).empty?
      end

      # Message of the first diagnostic, or nil when the text is valid.
      def first_error(text, source_file: '<input>')
        first = validate(text, source_file: source_file).first
        first.nil? ? nil : first[:message]
      end
    end
  end
end
@@ -0,0 +1,60 @@
# frozen_string_literal: true

require 'ostruct'

module Kumi
  module Parser
    module TextParser
      # Public API for TextParser: parsing, validation, and editor-oriented
      # diagnostics, all as class methods.
      class Api
        class << self
          # Parses +text+ and returns the resulting AST.
          # NOTE(review): the bare +Parser+ constant resolves through lexical
          # scope; confirm TextParser::Parser is defined somewhere, otherwise
          # this resolves to the enclosing Kumi::Parser module itself.
          def parse(text, source_file: '<input>')
            Parser.new.parse(text, source_file: source_file)
          end

          # Returns [] when the text parses cleanly, otherwise a one-element
          # array with a diagnostic describing the failure.
          def validate(text, source_file: '<input>')
            parse(text, source_file: source_file)
            []
          rescue StandardError => e
            [create_diagnostic(e, source_file)]
          end

          # True when +text+ produces no diagnostics.
          def valid?(text, source_file: '<input>')
            validate(text, source_file: source_file).empty?
          end

          # Diagnostics for the Monaco editor (same shape as #validate).
          def diagnostics_for_monaco(text, source_file: '<input>')
            validate(text, source_file: source_file)
          end

          # Diagnostics for CodeMirror (same shape as #validate).
          def diagnostics_for_codemirror(text, source_file: '<input>')
            validate(text, source_file: source_file)
          end

          # Diagnostics serialized to plain hashes for JSON output.
          def diagnostics_as_json(text, source_file: '<input>')
            validate(text, source_file: source_file).map(&:to_h)
          end

          # Parses and reports success, AST, and diagnostics in a single hash.
          def analyze(text, source_file: '<input>')
            { success: true, ast: parse(text, source_file: source_file), diagnostics: [] }
          rescue StandardError => e
            { success: false, ast: nil, diagnostics: [create_diagnostic(e, source_file)] }
          end

          private

          # Minimal diagnostic record; position defaults to 1:1 because the
          # raw exception carries no location information at this layer.
          def create_diagnostic(error, source_file)
            OpenStruct.new(line: 1, column: 1, message: error.message, source_file: source_file)
          end
        end
      end
    end
  end
end
@@ -0,0 +1,38 @@
# frozen_string_literal: true

require_relative 'smart_tokenizer'
require_relative 'direct_parser'

module Kumi
  module Parser
    module TextParser
      # Clean text parser focused on core parsing functionality

      class << self
        # Parse text to AST. Tokenizer/parser failures are re-raised as
        # Kumi::Errors::SyntaxError so callers handle a single error type.
        def parse(text, source_file: '<input>')
          tokens = Kumi::Parser::SmartTokenizer.new(text, source_file: source_file).tokenize
          Kumi::Parser::DirectParser.new(tokens).parse
        rescue Kumi::Parser::Errors::ParseError, Kumi::Parser::Errors::TokenizerError => e
          # Convert parser errors to the expected SyntaxError for compatibility
          raise Kumi::Errors::SyntaxError, e.message
        end

        # Check if text is syntactically valid
        def valid?(text, source_file: '<input>')
          parse(text, source_file: source_file)
          true
        rescue StandardError
          # Fix: the rescued exception was previously captured into an unused
          # variable; any failure simply means "not valid".
          false
        end

        # Basic validation - returns array of error diagnostics (empty when valid)
        def validate(text, source_file: '<input>')
          # Use SyntaxValidator for proper diagnostic extraction
          validator = Kumi::Parser::SyntaxValidator.new
          validator.validate(text, source_file: source_file)
        end
      end
    end
  end
end
@@ -0,0 +1,84 @@
# frozen_string_literal: true

module Kumi
  module Parser
    # Token with embedded metadata for smart parsing.
    #
    # Carries the token type (:identifier, :integer, ...), the literal value,
    # a source Location, and a metadata hash supplying semantic hints
    # (category, precedence, associativity, parser hints, AST class).
    class Token
      attr_reader :type, :value, :location, :metadata

      def initialize(type, value, location, metadata = {})
        @type = type
        @value = value
        @location = location
        @metadata = metadata
      end

      # -- Semantic predicates embedded in token ------------------------------

      def keyword?
        @metadata[:category] == :keyword
      end

      def operator?
        @metadata[:category] == :operator
      end

      def literal?
        @metadata[:category] == :literal
      end

      def identifier?
        @metadata[:category] == :identifier
      end

      def punctuation?
        @metadata[:category] == :punctuation
      end

      # Operator precedence embedded in token (0 for non-operators).
      def precedence
        @metadata[:precedence] || 0
      end

      def left_associative?
        @metadata[:associativity] == :left
      end

      def right_associative?
        @metadata[:associativity] == :right
      end

      # -- Parser hints embedded in token -------------------------------------

      def expects_block?
        @metadata[:expects_block] == true
      end

      def terminates_expression?
        @metadata[:terminates_expression] == true
      end

      def starts_expression?
        @metadata[:starts_expression] == true
      end

      # Direct AST construction hint (class to instantiate, or nil).
      def ast_class
        @metadata[:ast_class]
      end

      def to_s
        "#{@type}(#{@value.inspect}) at #{@location}"
      end

      def inspect
        to_s
      end

      # Equality ignores metadata: tokens match when type, value and location agree.
      def ==(other)
        other.is_a?(Token) &&
          @type == other.type &&
          @value == other.value &&
          @location == other.location
      end

      # Fix: == without eql?/hash breaks Hash/Set membership for equal tokens;
      # define both consistently with ==.
      alias eql? ==

      def hash
        [self.class, @type, @value, @location].hash
      end
    end
  end
end