kumi-parser 0.0.3 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,287 @@
1
+ # frozen_string_literal: true
2
+
3
+ require_relative 'token_metadata'
4
+
5
+ module Kumi
6
+ module Parser
7
+ # Context-aware tokenizer that produces tokens with embedded semantic metadata
8
# Context-aware tokenizer that produces tokens with embedded semantic metadata.
#
# Scans the source one character at a time, tracking @line/@column for
# locations and a context stack (:global, :schema, :input) so identifiers
# can be tagged with the declaration context they appear in.
class SmartTokenizer
  # Escape sequences recognized inside double-quoted strings.
  STRING_ESCAPES = {
    'n' => "\n",
    't' => "\t",
    'r' => "\r",
    '\\' => '\\',
    '"' => '"'
  }.freeze

  # Two-character comparison operators and their token types.
  TWO_CHAR_OPERATORS = { '==' => :eq, '!=' => :ne, '>=' => :gte, '<=' => :lte }.freeze

  # Comparison characters that are also valid on their own.
  SINGLE_COMPARISONS = { '>' => :gt, '<' => :lt }.freeze

  def initialize(source, source_file: '<input>')
    @source = source
    @source_file = source_file
    @pos = 0
    @line = 1
    @column = 1
    @context_stack = [:global]
    @tokens = []
  end

  # Tokenizes the entire source and returns the token array, always
  # terminated by an :eof token. Raises TokenizerError on malformed input.
  def tokenize
    while @pos < @source.length
      skip_whitespace_except_newlines

      case current_char
      when nil then break
      when "\n" then handle_newline
      when '#' then consume_comment
      when '"' then consume_string
      when /\d/ then consume_number
      when /[a-zA-Z_]/ then consume_identifier_or_keyword
      when ':' then consume_symbol_or_colon
      else
        consume_operator_or_punctuation
      end
    end

    add_token(:eof, nil, {})
    @tokens
  end

  private

  # Character at the current position, or nil at end of input.
  def current_char
    return nil if @pos >= @source.length

    @source[@pos]
  end

  # Character +offset+ positions ahead, or nil past end of input.
  def peek_char(offset = 1)
    peek_pos = @pos + offset
    return nil if peek_pos >= @source.length

    @source[peek_pos]
  end

  # Moves forward one character, keeping @line/@column in sync.
  def advance
    if current_char == "\n"
      @line += 1
      @column = 1
    else
      @column += 1
    end
    @pos += 1
  end

  def skip_whitespace_except_newlines
    advance while current_char&.match?(/[ \t\r]/)
  end

  def handle_newline
    add_token(:newline, "\n", Kumi::Parser::TOKEN_METADATA[:newline])
    advance
  end

  # Consumes a '#' comment up to (not including) the newline.
  def consume_comment
    start_column = @column
    advance # skip '#'

    text = consume_while { |c| c != "\n" }
    # Anchor the token at the opening '#', not at the end of the comment.
    add_token(:comment, text, Kumi::Parser::TOKEN_METADATA[:comment], column: start_column)
  end

  # Consumes a double-quoted string literal, translating escape sequences.
  def consume_string
    start_line = @line
    start_column = @column
    advance # skip opening quote

    content = +''
    while current_char && current_char != '"'
      if current_char == '\\'
        advance
        # Known escapes are translated; unknown escapes keep the character.
        content << (STRING_ESCAPES[current_char] || current_char) if current_char
      else
        content << current_char
      end
      advance
    end

    raise_tokenizer_error('Unterminated string literal') if current_char != '"'

    advance # skip closing quote

    # The literal may span newlines, so anchor at where it started.
    add_token(:string, content, Kumi::Parser::TOKEN_METADATA[:string],
              line: start_line, column: start_column)
  end

  # Consumes an integer or float literal (underscore separators allowed).
  def consume_number
    start_column = @column
    digits = +''
    has_dot = false

    while current_char && (current_char.match?(/[0-9_]/) || (!has_dot && current_char == '.'))
      if current_char == '.'
        # Only treat '.' as a decimal point when a digit follows, to
        # distinguish it from member access.
        break unless peek_char&.match?(/\d/)

        has_dot = true
      end
      digits << current_char
      advance
    end

    type = has_dot ? :float : :integer
    add_token(type, digits, Kumi::Parser::TOKEN_METADATA[type], column: start_column)
  end

  # Consumes a word and emits either a keyword or an identifier token.
  def consume_identifier_or_keyword
    start_column = @column
    word = consume_while { |c| c.match?(/[a-zA-Z0-9_]/) }

    keyword_type = Kumi::Parser::KEYWORDS[word]
    if keyword_type
      metadata = Kumi::Parser::TOKEN_METADATA[keyword_type].dup

      # Keywords may open or close a lexical context.
      case keyword_type
      when :schema, :input
        @context_stack.push(keyword_type)
        metadata[:opens_context] = keyword_type
      when :end
        # Never pop the root :global context.
        closed_context = @context_stack.pop if @context_stack.length > 1
        metadata[:closes_context] = closed_context
      end

      add_token(keyword_type, word, metadata, column: start_column)
    else
      metadata = Kumi::Parser::TOKEN_METADATA[:identifier].dup

      # Tag the identifier with the context it appears in.
      case current_context
      when :input then metadata[:context] = :input_declaration
      when :schema then metadata[:context] = :schema_body
      end

      add_token(:identifier, word, metadata, column: start_column)
    end
  end

  # Consumes either a symbol literal (:name) or a bare colon.
  def consume_symbol_or_colon
    start_column = @column

    if peek_char&.match?(/[a-zA-Z_]/)
      advance # skip ':'
      name = consume_while { |c| c.match?(/[a-zA-Z0-9_]/) }
      add_token(:symbol, name.to_sym, Kumi::Parser::TOKEN_METADATA[:symbol], column: start_column)
    else
      add_token(:colon, ':', Kumi::Parser::TOKEN_METADATA[:colon], column: start_column)
      advance
    end
  end

  # Consumes comparison operators (one or two characters) and
  # single-character punctuation.
  def consume_operator_or_punctuation
    start_column = @column
    char = current_char

    case char
    when '=', '!', '>', '<'
      if peek_char == '='
        pair = "#{char}="
        type = TWO_CHAR_OPERATORS.fetch(pair)
        advance
        advance
        add_token(type, pair, Kumi::Parser::TOKEN_METADATA[type], column: start_column)
      elsif (type = SINGLE_COMPARISONS[char])
        advance
        add_token(type, char, Kumi::Parser::TOKEN_METADATA[type], column: start_column)
      else
        # Bare '=' and '!' are not valid operators in this DSL.
        raise_tokenizer_error("Unexpected '#{char}' (did you mean '#{char}='?)")
      end
    else
      token_type = CHAR_TO_TOKEN[char]
      raise_tokenizer_error("Unexpected character: #{char}") unless token_type

      add_token(token_type, char, Kumi::Parser::TOKEN_METADATA[token_type].dup, column: start_column)
      advance
    end
  end

  # Consumes characters while the block returns truthy; returns the run.
  def consume_while
    run = +''
    while current_char && yield(current_char)
      run << current_char
      advance
    end
    run
  end

  def current_context
    @context_stack.last
  end

  # Appends a token whose location defaults to the current position; pass
  # line:/column: to anchor the token where its lexeme started.
  def add_token(type, value, metadata, line: @line, column: @column)
    location = Kumi::Syntax::Location.new(file: @source_file, line: line, column: column)
    @tokens << Token.new(type, value, location, metadata)
  end

  def raise_tokenizer_error(message)
    location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: @column)
    raise TokenizerError.new(message, location: location)
  end
end
284
+
285
+ # NOTE: TokenizerError (the custom error for tokenization issues) is defined elsewhere
286
+ end
287
+ end
@@ -1,33 +1,11 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative 'text_parser/parser'
4
- require_relative 'text_parser/editor_diagnostic'
5
- require_relative 'error_extractor'
6
-
7
3
  module Kumi
8
4
  module Parser
9
- # Validates Kumi DSL syntax
5
+ # Validates Kumi DSL syntax using new parser
10
6
  class SyntaxValidator
11
- def initialize
12
- @parser = TextParser::Parser.new
13
- end
14
-
15
7
# Validates the given text and returns the diagnostics produced by the
# shared parser entry point (Kumi::Parser::Base).
def validate(text, source_file: '<input>')
  Kumi::Parser::Base.validate(text, source_file: source_file)
end
32
10
 
33
11
  def valid?(text, source_file: '<input>')
@@ -36,7 +14,7 @@ module Kumi
36
14
 
37
15
# Returns the message of the first diagnostic, or nil when the text
# validates cleanly.
def first_error(text, source_file: '<input>')
  diagnostics = validate(text, source_file: source_file)
  return nil if diagnostics.empty?

  diagnostics.first[:message]
end
41
19
  end
42
20
  end
@@ -1,52 +1,37 @@
1
1
  # frozen_string_literal: true
2
2
 
3
- require_relative 'syntax_validator'
3
+ require_relative 'smart_tokenizer'
4
+ require_relative 'direct_parser'
4
5
 
5
6
  module Kumi
6
7
  module Parser
7
8
  module TextParser
8
- # TextParser module - all classes are autoloaded by Zeitwerk
9
+ # Clean text parser focused on core parsing functionality
9
10
 
10
11
  class << self
12
# Parse text to AST.
#
# Tokenizes with SmartTokenizer, then builds the AST with DirectParser.
# Tokenizer/parser failures are re-raised as Kumi::Errors::SyntaxError so
# existing callers keep catching the same error class.
def parse(text, source_file: '<input>')
  tokenizer = Kumi::Parser::SmartTokenizer.new(text, source_file: source_file)
  Kumi::Parser::DirectParser.new(tokenizer.tokenize).parse
rescue Kumi::Parser::Errors::ParseError, Kumi::Parser::Errors::TokenizerError => e
  # Convert parser errors to the expected SyntaxError for compatibility
  raise Kumi::Errors::SyntaxError, e.message
end
+
11
21
# Check if text is syntactically valid.
# Returns true when parse succeeds, false on any StandardError.
def valid?(text, source_file: '<input>')
  parse(text, source_file: source_file)
  true
rescue StandardError
  # Rescue variable removed: the exception itself is not used here.
  false
end
15
28
 
16
- # Validate text and return diagnostic collection
29
# Basic validation - returns array of error hashes.
# Delegates to SyntaxValidator for proper diagnostic extraction.
def validate(text, source_file: '<input>')
  Kumi::Parser::SyntaxValidator.new.validate(text, source_file: source_file)
end
20
-
21
- # Get Monaco Editor format diagnostics
22
- def diagnostics_for_monaco(text, source_file: '<input>')
23
- validate(text, source_file: source_file).to_monaco
24
- end
25
-
26
- # Get CodeMirror format diagnostics
27
- def diagnostics_for_codemirror(text, source_file: '<input>')
28
- validate(text, source_file: source_file).to_codemirror
29
- end
30
-
31
- # Get JSON format diagnostics
32
- def diagnostics_as_json(text, source_file: '<input>')
33
- validate(text, source_file: source_file).to_json
34
- end
35
-
36
- # Parse text (compatibility method)
37
- def parse(text, source_file: '<input>')
38
- parser.parse(text, source_file: source_file)
39
- end
40
-
41
- private
42
-
43
- def validator
44
- @validator ||= SyntaxValidator.new
45
- end
46
-
47
- def parser
48
- @parser ||= TextParser::Parser.new
49
- end
50
35
  end
51
36
  end
52
37
  end
@@ -0,0 +1,84 @@
1
+ # frozen_string_literal: true
2
+
3
+ module Kumi
4
+ module Parser
5
+ # Token with embedded metadata for smart parsing
6
# Token with embedded metadata for smart parsing.
#
# Carries semantic hints (category, operator precedence/associativity,
# parser hints, AST class) so the parser can query the token directly
# instead of consulting external tables.
class Token
  attr_reader :type, :value, :location, :metadata

  # type     - Symbol token kind (e.g. :identifier, :eq, :eof)
  # value    - the lexeme value (String, Symbol, or nil for :eof)
  # location - source location the token was read from
  # metadata - Hash of semantic hints (may be empty)
  def initialize(type, value, location, metadata = {})
    @type = type
    @value = value
    @location = location
    @metadata = metadata
  end

  # Semantic predicates embedded in token
  def keyword?
    @metadata[:category] == :keyword
  end

  def operator?
    @metadata[:category] == :operator
  end

  def literal?
    @metadata[:category] == :literal
  end

  def identifier?
    @metadata[:category] == :identifier
  end

  def punctuation?
    @metadata[:category] == :punctuation
  end

  # Operator precedence embedded in token (0 when not an operator).
  def precedence
    @metadata[:precedence] || 0
  end

  def left_associative?
    @metadata[:associativity] == :left
  end

  def right_associative?
    @metadata[:associativity] == :right
  end

  # Parser hints embedded in token
  def expects_block?
    @metadata[:expects_block] == true
  end

  def terminates_expression?
    @metadata[:terminates_expression] == true
  end

  def starts_expression?
    @metadata[:starts_expression] == true
  end

  # Direct AST construction hint (nil when none).
  def ast_class
    @metadata[:ast_class]
  end

  def to_s
    "#{@type}(#{@value.inspect}) at #{@location}"
  end

  def inspect
    to_s
  end

  # Equality deliberately ignores metadata: two tokens are equal when they
  # share type, value, and location.
  def ==(other)
    other.is_a?(Token) &&
      @type == other.type &&
      @value == other.value &&
      @location == other.location
  end
  alias eql? ==

  # Consistent with #== (and, like it, excludes metadata) so tokens behave
  # correctly as Hash keys and Set members.
  def hash
    [self.class, @type, @value, @location].hash
  end
end
83
+ end
84
+ end