RubyGems - kumi-parser - Versions diffs - 0.0.2 → 0.0.4 - Mend

kumi-parser 0.0.2 → 0.0.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (26) hide show

checksums.yaml +4 -4
data/.rspec +3 -0
data/CLAUDE.md +120 -0
data/LICENSE +21 -0
data/README.md +73 -0
data/Rakefile +10 -0
data/examples/debug_text_parser.rb +41 -0
data/examples/debug_transform_rule.rb +26 -0
data/examples/text_parser_comprehensive_test.rb +333 -0
data/examples/text_parser_test_with_comments.rb +146 -0
data/kumi-parser.gemspec +45 -0
data/lib/kumi/parser/base.rb +51 -0
data/lib/kumi/parser/direct_parser.rb +502 -0
data/lib/kumi/parser/error_extractor.rb +89 -0
data/lib/kumi/parser/errors.rb +40 -0
data/lib/kumi/parser/smart_tokenizer.rb +287 -0
data/lib/kumi/parser/syntax_validator.rb +21 -0
data/lib/kumi/parser/text_parser/api.rb +60 -0
data/lib/kumi/parser/text_parser.rb +38 -0
data/lib/kumi/parser/token.rb +84 -0
data/lib/kumi/parser/token_metadata.rb +370 -0
data/lib/kumi/parser/version.rb +7 -0
data/lib/kumi/text_parser.rb +40 -0
data/lib/kumi/text_schema.rb +31 -0
data/lib/kumi-parser.rb +19 -0
metadata +26 -2

data/lib/kumi/parser/direct_parser.rb ADDED Viewed

@@ -0,0 +1,502 @@
+# frozen_string_literal: true
+module Kumi
+  module Parser
+    # Direct AST construction parser using recursive descent with embedded token metadata
+    class DirectParser
+      def initialize(tokens)
+        @tokens = tokens
+        @pos = 0
+      end
+      def parse
+        schema_node = parse_schema
+        skip_comments_and_newlines
+        expect_token(:eof)
+        schema_node
+      end
+      private
+      def current_token
+        @tokens[@pos] || @tokens.last # Return EOF if past end
+      end
+      def peek_token(offset = 1)
+        peek_pos = @pos + offset
+        return @tokens.last if peek_pos >= @tokens.length # Return EOF
+        @tokens[peek_pos]
+      end
+      def advance
+        @pos += 1 if @pos < @tokens.length - 1
+      end
+      def expect_token(expected_type)
+        raise_parse_error("Expected #{expected_type}, got #{current_token.type}") if current_token.type != expected_type
+        token = current_token
+        advance
+        token
+      end
+      def skip_newlines
+        advance while current_token.type == :newline
+      end
+      def skip_comments_and_newlines
+        advance while %i[newline comment].include?(current_token.type)
+      end
+      # Schema: 'schema' 'do' ... 'end'
+      def parse_schema
+        schema_token = expect_token(:schema)
+        expect_token(:do)
+        skip_comments_and_newlines
+        input_declarations = parse_input_block
+        value_declarations = []
+        trait_declarations = []
+        skip_comments_and_newlines
+        while %i[value trait].include?(current_token.type)
+          case current_token.type
+          when :value
+            value_declarations << parse_value_declaration
+          when :trait
+            trait_declarations << parse_trait_declaration
+          end
+          skip_comments_and_newlines
+        end
+        expect_token(:end)
+        # Construct Root with exact AST.md structure
+        Kumi::Syntax::Root.new(
+          input_declarations,
+          value_declarations, # attributes
+          trait_declarations,
+          loc: schema_token.location
+        )
+      end
+      # Input block: 'input' 'do' ... 'end'
+      def parse_input_block
+        expect_token(:input)
+        expect_token(:do)
+        declarations = []
+        skip_comments_and_newlines
+        until %i[end eof].include?(current_token.type)
+          break unless current_token.metadata[:category] == :type_keyword
+          declarations << parse_input_declaration
+          skip_comments_and_newlines
+        end
+        expect_token(:end)
+        declarations
+      end
+      # Input declaration: 'integer :name' or 'array :items do ... end'
+      def parse_input_declaration
+        type_token = current_token
+        if type_token.metadata[:category] != :type_keyword
+          raise_parse_error("Expected type keyword, got #{type_token.type}")
+        end
+        advance
+        name_token = expect_token(:symbol)
+        # Handle domain specification: ', domain: [...]'
+        domain = nil
+        if current_token.type == :comma
+          advance
+          if current_token.type == :identifier && current_token.value == 'domain'
+            advance
+            expect_token(:colon)
+            domain = parse_domain_specification
+          else
+            # Put comma back for other parsers
+            @pos -= 1
+          end
+        end
+        # Handle nested array declarations
+        children = []
+        if type_token.metadata[:type_name] == :array && current_token.type == :do
+          advance # consume 'do'
+          skip_comments_and_newlines
+          until %i[end eof].include?(current_token.type)
+            break unless current_token.metadata[:category] == :type_keyword
+            children << parse_input_declaration
+            skip_comments_and_newlines
+          end
+          expect_token(:end)
+        end
+        Kumi::Syntax::InputDeclaration.new(
+          name_token.value,
+          domain,
+          type_token.metadata[:type_name],
+          children,
+          loc: type_token.location
+        )
+      end
+      def parse_domain_specification
+        # For now, just skip the domain spec - we can implement this later
+        # This handles cases like: domain: 1..10, domain: %w[a b c], domain: ["x", "y"]
+        if current_token.type == :lbracket
+          parse_array_literal
+        else
+          # Skip until comma or newline
+          advance until %i[comma newline eof end].include?(current_token.type)
+          nil
+        end
+      end
+      # Value declaration: 'value :name, expression' or 'value :name do ... end'
+      def parse_value_declaration
+        value_token = expect_token(:value)
+        name_token = expect_token(:symbol)
+        if current_token.type == :do
+          # Cascade expression: value :name do ... end
+          expression = parse_cascade_expression
+        else
+          # Simple expression: value :name, expression
+          expect_token(:comma)
+          expression = parse_expression
+        end
+        Kumi::Syntax::ValueDeclaration.new(
+          name_token.value,
+          expression,
+          loc: value_token.location
+        )
+      end
+      # Trait declaration: 'trait :name, expression'
+      def parse_trait_declaration
+        trait_token = expect_token(:trait)
+        name_token = expect_token(:symbol)
+        expect_token(:comma)
+        expression = parse_expression
+        Kumi::Syntax::TraitDeclaration.new(
+          name_token.value,
+          expression,
+          loc: trait_token.location
+        )
+      end
+      # Cascade expression: 'do' cases 'end'
+      def parse_cascade_expression
+        start_token = expect_token(:do)
+        cases = []
+        skip_comments_and_newlines
+        while %i[on base].include?(current_token.type)
+          cases << parse_case_expression
+          skip_comments_and_newlines
+        end
+        expect_token(:end)
+        Kumi::Syntax::CascadeExpression.new(cases, loc: start_token.location)
+      end
+      # Case expression: 'on condition, result' or 'base result'
+      def parse_case_expression
+        case current_token.type
+        when :on
+          on_token = advance_and_return_token
+          condition = parse_expression
+          # Wrap simple trait references in all? to match Ruby DSL behavior
+          condition = wrap_condition_in_all(condition) if simple_trait_reference?(condition)
+          expect_token(:comma)
+          result = parse_expression
+          Kumi::Syntax::CaseExpression.new(condition, result, loc: on_token.location)
+        when :base
+          base_token = advance_and_return_token
+          result = parse_expression
+          # Base case has condition = true
+          true_literal = Kumi::Syntax::Literal.new(true, loc: base_token.location)
+          Kumi::Syntax::CaseExpression.new(true_literal, result, loc: base_token.location)
+        else
+          raise_parse_error("Expected 'on' or 'base' in cascade expression")
+        end
+      end
+      def advance_and_return_token
+        token = current_token
+        advance
+        token
+      end
+      # Expression parsing with operator precedence
+      def parse_expression(min_precedence = 0)
+        left = parse_primary_expression
+        # Skip whitespace before checking for operators
+        skip_comments_and_newlines
+        while current_token.operator? && current_token.precedence >= min_precedence
+          operator_token = current_token
+          advance
+          # Skip whitespace after operator
+          skip_comments_and_newlines
+          # Use embedded associativity from token metadata
+          next_min_precedence = if operator_token.left_associative?
+                                  operator_token.precedence + 1
+                                else
+                                  operator_token.precedence
+                                end
+          right = parse_expression(next_min_precedence)
+          left = Kumi::Syntax::CallExpression.new(
+            map_operator_token_to_function_name(operator_token.type),
+            [left, right],
+            loc: operator_token.location
+          )
+          # Skip whitespace before checking for next operator
+          skip_comments_and_newlines
+        end
+        left
+      end
+      def parse_primary_expression
+        token = current_token
+        case token.type
+        when :integer, :float, :string, :boolean
+          # Direct AST construction using token metadata
+          value = convert_literal_value(token)
+          advance
+          Kumi::Syntax::Literal.new(value, loc: token.location)
+        when :identifier
+          if token.value == 'input' && peek_token.type == :dot
+            parse_input_reference
+          elsif peek_token.type == :lbracket
+            parse_array_access_reference
+          elsif token.value == 'fn'
+            parse_function_call
+          else
+            advance
+            Kumi::Syntax::DeclarationReference.new(token.value.to_sym, loc: token.location)
+          end
+        when :input
+          # Handle input references in expressions (input.field)
+          if peek_token.type == :dot
+            parse_input_reference_from_input_token
+          else
+            raise_parse_error("Unexpected 'input' keyword in expression")
+          end
+        when :lparen
+          advance # consume '('
+          expr = parse_expression
+          expect_token(:rparen)
+          expr
+        when :lbracket
+          parse_array_literal
+        when :fn
+          parse_function_call_from_fn_token
+        when :newline, :comment
+          # Skip newlines and comments in expressions
+          skip_comments_and_newlines
+          parse_primary_expression
+        else
+          raise_parse_error("Unexpected token in expression: #{token.type}")
+        end
+      end
+      def parse_input_reference
+        input_token = expect_token(:identifier) # 'input'
+        expect_token(:dot)
+        path = [expect_token(:identifier).value.to_sym]
+        # Handle nested access: input.field.subfield
+        while current_token.type == :dot
+          advance # consume '.'
+          path << expect_token(:identifier).value.to_sym
+        end
+        if path.length == 1
+          Kumi::Syntax::InputReference.new(path.first, loc: input_token.location)
+        else
+          Kumi::Syntax::InputElementReference.new(path, loc: input_token.location)
+        end
+      end
+      def parse_input_reference_from_input_token
+        input_token = expect_token(:input) # 'input' keyword token
+        expect_token(:dot)
+        path = [expect_token(:identifier).value.to_sym]
+        # Handle nested access: input.field.subfield
+        while current_token.type == :dot
+          advance # consume '.'
+          path << expect_token(:identifier).value.to_sym
+        end
+        if path.length == 1
+          Kumi::Syntax::InputReference.new(path.first, loc: input_token.location)
+        else
+          Kumi::Syntax::InputElementReference.new(path, loc: input_token.location)
+        end
+      end
+      def parse_array_access_reference
+        name_token = expect_token(:identifier)
+        expect_token(:lbracket)
+        index_expr = parse_expression
+        expect_token(:rbracket)
+        base_ref = Kumi::Syntax::DeclarationReference.new(name_token.value.to_sym, loc: name_token.location)
+        Kumi::Syntax::CallExpression.new(
+          :at,
+          [base_ref, index_expr],
+          loc: name_token.location
+        )
+      end
+      def parse_function_call
+        fn_token = expect_token(:identifier) # 'fn'
+        if current_token.type == :lparen
+          # Only syntax: fn(:symbol, args...)
+          advance # consume '('
+          fn_name_token = expect_token(:symbol)
+          fn_name = fn_name_token.value
+          args = []
+          while current_token.type == :comma
+            advance # consume comma
+            args << parse_expression
+          end
+          expect_token(:rparen)
+          Kumi::Syntax::CallExpression.new(fn_name, args, loc: fn_name_token.location)
+        else
+          raise_parse_error("Expected '(' after 'fn'")
+        end
+      end
+      def parse_function_call_from_fn_token
+        fn_token = expect_token(:fn) # 'fn' keyword token
+        if current_token.type == :lparen
+          # Only syntax: fn(:symbol, args...)
+          advance # consume '('
+          fn_name_token = expect_token(:symbol)
+          fn_name = fn_name_token.value
+          args = []
+          while current_token.type == :comma
+            advance # consume comma
+            args << parse_expression
+          end
+          expect_token(:rparen)
+          Kumi::Syntax::CallExpression.new(fn_name, args, loc: fn_name_token.location)
+        else
+          raise_parse_error("Expected '(' after 'fn'")
+        end
+      end
+      def parse_argument_list
+        args = []
+        unless current_token.type == :rparen
+          args << parse_expression
+          while current_token.type == :comma
+            advance # consume comma
+            args << parse_expression
+          end
+        end
+        args
+      end
+      def parse_array_literal
+        start_token = expect_token(:lbracket)
+        elements = []
+        unless current_token.type == :rbracket
+          elements << parse_expression
+          while current_token.type == :comma
+            advance # consume comma
+            elements << parse_expression unless current_token.type == :rbracket
+          end
+        end
+        expect_token(:rbracket)
+        Kumi::Syntax::ArrayExpression.new(elements, loc: start_token.location)
+      end
+      def convert_literal_value(token)
+        case token.type
+        when :integer then token.value.gsub('_', '').to_i
+        when :float then token.value.gsub('_', '').to_f
+        when :string then token.value
+        when :boolean then token.value == 'true'
+        end
+      end
+      def raise_parse_error(message)
+        location = current_token.location
+        raise Errors::ParseError.new(message, token: current_token)
+      end
+      # Helper method to check if condition is a simple trait reference
+      def simple_trait_reference?(condition)
+        condition.is_a?(Kumi::Syntax::DeclarationReference)
+      end
+      # Helper method to wrap condition in all? function call
+      def wrap_condition_in_all(condition)
+        array_expr = Kumi::Syntax::ArrayExpression.new([condition], loc: condition.loc)
+        Kumi::Syntax::CallExpression.new(:all?, [array_expr], loc: condition.loc)
+      end
+      # Map operator token types to function names for Ruby DSL compatibility
+      def map_operator_token_to_function_name(token_type)
+        case token_type
+        when :eq then :==
+        when :ne then :!=
+        else token_type
+        end
+      end
+    end
+  end
+end

data/lib/kumi/parser/error_extractor.rb ADDED Viewed

@@ -0,0 +1,89 @@
+# frozen_string_literal: true
+module Kumi
+  module Parser
+    # Extracts errors from parslet parse failures
+    class ErrorExtractor
+      def self.extract(error)
+        # Basic error extraction from parslet parse failures
+        # This would typically parse the parslet error message
+        # and extract location information
+        return {} unless error.respond_to?(:message)
+        message = error.message
+        # Determine error type based on class
+        error_type = case error.class.name
+                     when /Syntax/ then :syntax
+                     else :runtime
+                     end
+        # Simple regex to extract line/column info
+        if match = message.match(/at line (\d+) char (\d+)/)
+          line = match[1].to_i
+          column = match[2].to_i
+        else
+          line = 1
+          column = 1
+        end
+        # Format message based on error type
+        formatted_message = if error_type == :syntax
+                              extract_user_friendly_message(message)
+                            else
+                              "#{error.class.name}: #{message}"
+                            end
+        {
+          message: formatted_message,
+          line: line,
+          column: column,
+          severity: :error,
+          type: error_type
+        }
+      end
+      def self.humanize_error_message(raw_message)
+        extract_user_friendly_message(raw_message)
+      end
+      def self.extract_user_friendly_message(raw_message)
+        # Clean up the message first - remove markers, location info, and extra whitespace
+        cleaned_message = raw_message.gsub(/^\s*`-\s*/, '').gsub(/ at line \d+ char \d+\.?/, '').strip
+        # Convert parslet's technical error messages to user-friendly ones
+        case cleaned_message
+        when /Expected ":", but got "(\w+)"/
+          "Missing ':' before symbol, but got \"#{::Regexp.last_match(1)}\""
+        when /Expected ":"/
+          "Missing ':' before symbol"
+        when /Expected "do", but got "(\w+)"/
+          "Missing 'do' keyword, but got \"#{::Regexp.last_match(1)}\""
+        when /Expected "do"/
+          "Missing 'do' keyword"
+        when /Expected "end", but got (.+)/
+          "Missing 'end' keyword, but got #{::Regexp.last_match(1)}"
+        when /Expected "end"/
+          "Missing 'end' keyword"
+        when /Expected "(\w+)", but got "(\w+)"/
+          "Missing '#{::Regexp.last_match(1)}' keyword, but got \"#{::Regexp.last_match(2)}\""
+        when /Expected '(\w+)'/
+          "Expected '#{::Regexp.last_match(1)}'"
+        when /Expected "([^"]+)", but got "([^"]+)"/
+          "Expected '#{::Regexp.last_match(1)}', but got \"#{::Regexp.last_match(2)}\""
+        when /Expected "(\w+)"/
+          "Missing '#{::Regexp.last_match(1)}' keyword"
+        when /Failed to match.*Premature end of input/m
+          'Failed to match - premature end of input'
+        when /Premature end of input/
+          "Unexpected end of file - missing 'end'?"
+        when /Failed to match/
+          'Failed to match sequence'
+        else
+          'Parse error'
+        end
+      end
+    end
+  end
+end

data/lib/kumi/parser/errors.rb ADDED Viewed

@@ -0,0 +1,40 @@
+module Kumi
+  module Parser
+    # Namespace for parser-related errors
+    module Errors
+      # Custom error for parsing issues
+      class ParseError < StandardError
+        attr_reader :token, :suggestions
+        def initialize(message, token:, suggestions: [])
+          @token = token
+          @suggestions = suggestions
+          super(build_error_message(message))
+        end
+        private
+        def build_error_message(message)
+          lines = ["Parse error at #{@token.location}"]
+          lines << "  #{message}"
+          if @suggestions.any?
+            lines << '  Suggestions:'
+            @suggestions.each { |s| lines << "    - #{s}" }
+          end
+          lines.join("\n")
+        end
+      end
+      class TokenizerError < StandardError
+        attr_reader :location
+        def initialize(message, location:)
+          @location = location
+          super("#{message} at #{location}")
+        end
+      end
+    end
+  end
+end