kumi-parser 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,287 @@
+ # frozen_string_literal: true
+
+ require_relative 'token_metadata'
+
+ module Kumi
+   module Parser
+     # Context-aware tokenizer that produces tokens with embedded semantic metadata
+     class SmartTokenizer
+       def initialize(source, source_file: '<input>')
+         @source = source
+         @source_file = source_file
+         @pos = 0
+         @line = 1
+         @column = 1
+         @context_stack = [:global]
+         @tokens = []
+       end
+
+       def tokenize
+         while @pos < @source.length
+           skip_whitespace_except_newlines
+
+           case current_char
+           when nil then break
+           when "\n" then handle_newline
+           when '#' then consume_comment
+           when '"' then consume_string
+           when /\d/ then consume_number
+           when /[a-zA-Z_]/ then consume_identifier_or_keyword
+           when ':' then consume_symbol_or_colon
+           else
+             consume_operator_or_punctuation
+           end
+         end
+
+         add_token(:eof, nil, {})
+         @tokens
+       end
+
+       private
+
+       def current_char
+         return nil if @pos >= @source.length
+
+         @source[@pos]
+       end
+
+       def peek_char(offset = 1)
+         peek_pos = @pos + offset
+         return nil if peek_pos >= @source.length
+
+         @source[peek_pos]
+       end
+
+       def advance
+         if current_char == "\n"
+           @line += 1
+           @column = 1
+         else
+           @column += 1
+         end
+         @pos += 1
+       end
+
+       def skip_whitespace_except_newlines
+         advance while current_char && current_char.match?(/[ \t\r]/)
+       end
+
+       def handle_newline
+         add_token(:newline, "\n", Kumi::Parser::TOKEN_METADATA[:newline])
+         advance
+       end
+
+       def consume_comment
+         start_column = @column
+         advance # skip '#'
+
+         comment_text = ''
+         while current_char && current_char != "\n"
+           comment_text += current_char
+           advance
+         end
+
+         # Anchor the token at the '#' column rather than at the end of the comment
+         location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
+         @tokens << Token.new(:comment, comment_text, location, Kumi::Parser::TOKEN_METADATA[:comment])
+       end
+
+       def consume_string
+         start_column = @column
+         advance # skip opening quote
+
+         string_content = ''
+         while current_char && current_char != '"'
+           if current_char == '\\'
+             advance
+             # Handle escape sequences
+             case current_char
+             when 'n' then string_content += "\n"
+             when 't' then string_content += "\t"
+             when 'r' then string_content += "\r"
+             when '\\' then string_content += '\\'
+             when '"' then string_content += '"'
+             else
+               string_content += current_char if current_char
+             end
+           else
+             string_content += current_char
+           end
+           advance
+         end
+
+         raise_tokenizer_error('Unterminated string literal') if current_char != '"'
+
+         advance # skip closing quote
+
+         location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
+         @tokens << Token.new(:string, string_content, location, Kumi::Parser::TOKEN_METADATA[:string])
+       end
+
+       def consume_number
+         start_column = @column
+         number_str = ''
+         has_dot = false
+
+         # Consume digits and underscores, and optionally a single decimal point
+         while current_char && (current_char.match?(/[0-9_]/) || (!has_dot && current_char == '.'))
+           if current_char == '.'
+             # The next character must be a digit to distinguish a float from member access
+             break unless peek_char && peek_char.match?(/\d/)
+
+             has_dot = true
+           end
+           number_str += current_char
+           advance
+         end
+
+         token_type = has_dot ? :float : :integer
+         location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
+         @tokens << Token.new(token_type, number_str, location, Kumi::Parser::TOKEN_METADATA[token_type])
+       end
+
+       def consume_identifier_or_keyword
+         start_column = @column
+         identifier = consume_while { |c| c.match?(/[a-zA-Z0-9_]/) }
+
+         # Check whether it's a keyword
+         if (keyword_type = Kumi::Parser::KEYWORDS[identifier])
+           metadata = Kumi::Parser::TOKEN_METADATA[keyword_type].dup
+
+           # Update context based on the keyword
+           case keyword_type
+           when :schema, :input
+             @context_stack.push(keyword_type)
+             metadata[:opens_context] = keyword_type
+           when :end
+             closed_context = @context_stack.pop if @context_stack.length > 1
+             metadata[:closes_context] = closed_context
+           end
+
+           location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
+           @tokens << Token.new(keyword_type, identifier, location, metadata)
+         else
+           # It's an identifier - determine its role from the surrounding context
+           metadata = Kumi::Parser::TOKEN_METADATA[:identifier].dup
+
+           case current_context
+           when :input
+             metadata[:context] = :input_declaration
+           when :schema
+             metadata[:context] = :schema_body
+           end
+
+           location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
+           @tokens << Token.new(:identifier, identifier, location, metadata)
+         end
+       end
+
+       def consume_symbol_or_colon
+         start_column = @column
+
+         if peek_char && peek_char.match?(/[a-zA-Z_]/)
+           # It's a symbol like :name
+           advance # skip ':'
+           symbol_name = consume_while { |c| c.match?(/[a-zA-Z0-9_]/) }
+
+           location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
+           @tokens << Token.new(:symbol, symbol_name.to_sym, location, Kumi::Parser::TOKEN_METADATA[:symbol])
+         else
+           # It's just a colon
+           location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
+           @tokens << Token.new(:colon, ':', location, Kumi::Parser::TOKEN_METADATA[:colon])
+           advance
+         end
+       end
+
+       def consume_operator_or_punctuation
+         start_column = @column
+         char = current_char
+
+         # Handle multi-character operators first
+         case char
+         when '='
+           if peek_char == '='
+             advance
+             advance
+             location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
+             @tokens << Token.new(:eq, '==', location, Kumi::Parser::TOKEN_METADATA[:eq])
+           else
+             raise_tokenizer_error("Unexpected '=' (did you mean '=='?)")
+           end
+         when '!'
+           if peek_char == '='
+             advance
+             advance
+             location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
+             @tokens << Token.new(:ne, '!=', location, Kumi::Parser::TOKEN_METADATA[:ne])
+           else
+             raise_tokenizer_error("Unexpected '!' (did you mean '!='?)")
+           end
+         when '>'
+           if peek_char == '='
+             advance
+             advance
+             location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
+             @tokens << Token.new(:gte, '>=', location, Kumi::Parser::TOKEN_METADATA[:gte])
+           else
+             advance
+             location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
+             @tokens << Token.new(:gt, '>', location, Kumi::Parser::TOKEN_METADATA[:gt])
+           end
+         when '<'
+           if peek_char == '='
+             advance
+             advance
+             location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
+             @tokens << Token.new(:lte, '<=', location, Kumi::Parser::TOKEN_METADATA[:lte])
+           else
+             advance
+             location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
+             @tokens << Token.new(:lt, '<', location, Kumi::Parser::TOKEN_METADATA[:lt])
+           end
+         else
+           # Single-character operators/punctuation
+           token_type = CHAR_TO_TOKEN[char]
+           if token_type
+             metadata = Kumi::Parser::TOKEN_METADATA[token_type].dup
+             location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: start_column)
+             @tokens << Token.new(token_type, char, location, metadata)
+             advance
+           else
+             raise_tokenizer_error("Unexpected character: #{char}")
+           end
+         end
+       end
+
+       def consume_while(&block)
+         result = ''
+         while current_char && block.call(current_char)
+           result += current_char
+           advance
+         end
+         result
+       end
+
+       def current_context
+         @context_stack.last
+       end
+
+       def add_token(type, value, metadata)
+         location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: @column)
+         token = Token.new(type, value, location, metadata)
+         @tokens << token
+       end
+
+       def raise_tokenizer_error(message)
+         location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: @column)
+         raise TokenizerError.new(message, location: location)
+       end
+     end
+
+     # Custom error for tokenization issues
+   end
+ end
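
For reviewers, a minimal usage sketch of the tokenizer above. The require path and the DSL snippet are assumptions (the actual keyword set lives in token_metadata.rb, which this diff does not include); the printed stream is illustrative, not verified output.

    require 'kumi/parser/smart_tokenizer' # assumed load path

    tokens = Kumi::Parser::SmartTokenizer.new('value :total, 1 + 2').tokenize
    tokens.each do |tok|
      # Each token carries its own semantic metadata, e.g. its category
      puts "#{tok.type}: #{tok.value.inspect} (#{tok.metadata[:category]})"
    end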
@@ -0,0 +1,21 @@
+ # frozen_string_literal: true
+
+ module Kumi
+   module Parser
+     # Validates Kumi DSL syntax using the new parser
+     class SyntaxValidator
+       def validate(text, source_file: '<input>')
+         Kumi::Parser::Base.validate(text, source_file: source_file)
+       end
+
+       def valid?(text, source_file: '<input>')
+         validate(text, source_file: source_file).empty?
+       end
+
+       def first_error(text, source_file: '<input>')
+         diagnostics = validate(text, source_file: source_file)
+         diagnostics.empty? ? nil : diagnostics.first[:message]
+       end
+     end
+   end
+ end
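
A sketch of how the validator might be driven, assuming Kumi::Parser::Base.validate returns an array of diagnostic hashes with a :message key (implied by first_error, but not shown in this diff); the source string is made up.

    validator = Kumi::Parser::SyntaxValidator.new
    source = "schema do\n  input do\n  end\nend"

    if validator.valid?(source, source_file: 'example.kumi')
      puts 'syntax OK'
    else
      # first_error returns only the first diagnostic's message, or nil
      warn validator.first_error(source, source_file: 'example.kumi')
    end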
@@ -0,0 +1,60 @@
+ # frozen_string_literal: true
+
+ require 'ostruct'
+
+ module Kumi
+   module Parser
+     module TextParser
+       # Public API for TextParser
+       class Api
+         class << self
+           def parse(text, source_file: '<input>')
+             parser = Parser.new
+             parser.parse(text, source_file: source_file)
+           end
+
+           def validate(text, source_file: '<input>')
+             parse(text, source_file: source_file)
+             []
+           rescue StandardError => e
+             [create_diagnostic(e, source_file)]
+           end
+
+           def valid?(text, source_file: '<input>')
+             validate(text, source_file: source_file).empty?
+           end
+
+           def diagnostics_for_monaco(text, source_file: '<input>')
+             validate(text, source_file: source_file)
+           end
+
+           def diagnostics_for_codemirror(text, source_file: '<input>')
+             validate(text, source_file: source_file)
+           end
+
+           def diagnostics_as_json(text, source_file: '<input>')
+             validate(text, source_file: source_file).map(&:to_h)
+           end
+
+           def analyze(text, source_file: '<input>')
+             ast = parse(text, source_file: source_file)
+             { success: true, ast: ast, diagnostics: [] }
+           rescue StandardError => e
+             { success: false, ast: nil, diagnostics: [create_diagnostic(e, source_file)] }
+           end
+
+           private
+
+           # Rescued errors carry no position info, so diagnostics are pinned to 1:1
+           def create_diagnostic(error, source_file)
+             OpenStruct.new(
+               line: 1,
+               column: 1,
+               message: error.message,
+               source_file: source_file
+             )
+           end
+         end
+       end
+     end
+   end
+ end
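
The editor-facing entry points all funnel through validate, so Monaco and CodeMirror currently receive identical diagnostics. A hedged sketch of consuming analyze from a host application (the broken source text is made up):

    result = Kumi::Parser::TextParser::Api.analyze('value :x, 1 +', source_file: 'broken.kumi')

    unless result[:success]
      result[:diagnostics].each do |diag|
        # Diagnostics are OpenStructs, pinned to line 1, column 1 for now
        puts "#{diag.source_file}:#{diag.line}:#{diag.column}: #{diag.message}"
      end
    end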
@@ -0,0 +1,38 @@
+ # frozen_string_literal: true
+
+ require_relative 'smart_tokenizer'
+ require_relative 'direct_parser'
+
+ module Kumi
+   module Parser
+     module TextParser
+       # Clean text parser focused on core parsing functionality
+       class << self
+         # Parse text to AST
+         def parse(text, source_file: '<input>')
+           tokens = Kumi::Parser::SmartTokenizer.new(text, source_file: source_file).tokenize
+           Kumi::Parser::DirectParser.new(tokens).parse
+         rescue Kumi::Parser::Errors::ParseError, Kumi::Parser::Errors::TokenizerError => e
+           # Convert parser errors to the expected SyntaxError for compatibility
+           raise Kumi::Errors::SyntaxError, e.message
+         end
+
+         # Check if text is syntactically valid
+         def valid?(text, source_file: '<input>')
+           parse(text, source_file: source_file)
+           true
+         rescue StandardError
+           false
+         end
+
+         # Basic validation - returns an array of error hashes
+         def validate(text, source_file: '<input>')
+           # Use SyntaxValidator for proper diagnostic extraction
+           validator = Kumi::Parser::SyntaxValidator.new
+           validator.validate(text, source_file: source_file)
+         end
+       end
+     end
+   end
+ end
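
A small sketch of the compatibility behaviour above: tokenizer and parser failures both surface as Kumi::Errors::SyntaxError, so existing rescue sites keep working. The input string is illustrative only.

    begin
      ast = Kumi::Parser::TextParser.parse('schema do', source_file: 'truncated.kumi')
    rescue Kumi::Errors::SyntaxError => e
      # ParseError and TokenizerError both arrive here with their original message
      warn "syntax error: #{e.message}"
    end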
@@ -0,0 +1,84 @@
+ # frozen_string_literal: true
+
+ module Kumi
+   module Parser
+     # Token with embedded metadata for smart parsing
+     class Token
+       attr_reader :type, :value, :location, :metadata
+
+       def initialize(type, value, location, metadata = {})
+         @type = type
+         @value = value
+         @location = location
+         @metadata = metadata
+       end
+
+       # Semantic predicates embedded in token
+       def keyword?
+         @metadata[:category] == :keyword
+       end
+
+       def operator?
+         @metadata[:category] == :operator
+       end
+
+       def literal?
+         @metadata[:category] == :literal
+       end
+
+       def identifier?
+         @metadata[:category] == :identifier
+       end
+
+       def punctuation?
+         @metadata[:category] == :punctuation
+       end
+
+       # Operator precedence embedded in token
+       def precedence
+         @metadata[:precedence] || 0
+       end
+
+       def left_associative?
+         @metadata[:associativity] == :left
+       end
+
+       def right_associative?
+         @metadata[:associativity] == :right
+       end
+
+       # Parser hints embedded in token
+       def expects_block?
+         @metadata[:expects_block] == true
+       end
+
+       def terminates_expression?
+         @metadata[:terminates_expression] == true
+       end
+
+       def starts_expression?
+         @metadata[:starts_expression] == true
+       end
+
+       # Direct AST construction hint
+       def ast_class
+         @metadata[:ast_class]
+       end
+
+       def to_s
+         "#{@type}(#{@value.inspect}) at #{@location}"
+       end
+
+       def inspect
+         to_s
+       end
+
+       def ==(other)
+         other.is_a?(Token) &&
+           @type == other.type &&
+           @value == other.value &&
+           @location == other.location
+       end
+     end
+   end
+ end
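
The precedence and associativity accessors are what a precedence-climbing (Pratt-style) expression parser would consult. A minimal sketch, with a hypothetical metadata hash standing in for the gem's actual TOKEN_METADATA entries and a plain object standing in for Kumi::Syntax::Location:

    loc = Object.new # stand-in for Kumi::Syntax::Location

    plus = Kumi::Parser::Token.new(:plus, '+', loc,
                                   { category: :operator, precedence: 10, associativity: :left })
    star = Kumi::Parser::Token.new(:star, '*', loc,
                                   { category: :operator, precedence: 20, associativity: :left })

    # A precedence-climbing parser keeps consuming while the next operator
    # binds at least as tightly as the current minimum precedence
    puts star.precedence > plus.precedence # => true
    puts plus.left_associative?            # => true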