RubyGems - kumi-parser - Versions diffs - 0.0.33 → 0.1.0 - Mend

kumi-parser 0.0.33 → 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (32) hide show

checksums.yaml +4 -4
data/.rubocop.yml +41 -0
data/CHANGELOG.md +64 -0
data/CLAUDE.md +59 -120
data/README.md +28 -6
data/examples/parse_and_inspect.rb +34 -0
data/kumi-parser.gemspec +3 -4
data/lib/kumi/parser/grammar.rb +120 -0
data/lib/kumi/parser/lexer.rb +232 -0
data/lib/kumi/parser/parse_error.rb +52 -0
data/lib/kumi/parser/parser.rb +692 -0
data/lib/kumi/parser/source.rb +76 -0
data/lib/kumi/parser/text_parser.rb +37 -27
data/lib/kumi/parser/token.rb +10 -71
data/lib/kumi/parser/version.rb +1 -1
data/lib/kumi-parser.rb +9 -10
metadata +16 -37
data/examples/debug_text_parser.rb +0 -41
data/examples/debug_transform_rule.rb +0 -26
data/examples/text_parser_comprehensive_test.rb +0 -333
data/examples/text_parser_test_with_comments.rb +0 -146
data/lib/kumi/parser/base.rb +0 -51
data/lib/kumi/parser/direct_parser.rb +0 -698
data/lib/kumi/parser/error_extractor.rb +0 -89
data/lib/kumi/parser/errors.rb +0 -40
data/lib/kumi/parser/helpers.rb +0 -154
data/lib/kumi/parser/smart_tokenizer.rb +0 -373
data/lib/kumi/parser/syntax_validator.rb +0 -21
data/lib/kumi/parser/text_parser/api.rb +0 -60
data/lib/kumi/parser/token_constants.rb +0 -468
data/lib/kumi/text_parser.rb +0 -40
data/lib/kumi/text_schema.rb +0 -31

data/lib/kumi/parser/error_extractor.rb DELETED Viewed

@@ -1,89 +0,0 @@
-# frozen_string_literal: true
module Kumi
  module Parser
    # Extracts errors from parslet parse failures.
    #
    # Converts an exception-like object into a diagnostic hash with
    # :message, :line, :column, :severity and :type keys.
    class ErrorExtractor
      # Captures "at line N char M" position info from an error message.
      LOCATION_PATTERN = /at line (\d+) char (\d+)/.freeze

      # Leading "`- " marker at the start of a line in the raw message.
      TREE_MARKER_PATTERN = /^\s*`-\s*/.freeze

      # Position suffix removed from messages before they are humanized.
      LOCATION_SUFFIX_PATTERN = / at line \d+ char \d+\.?/.freeze

      # Ordered [pattern, formatter] pairs; the FIRST matching pattern wins,
      # so specific rules must stay ahead of the generic ones below them.
      FRIENDLY_MESSAGE_RULES = [
        [/Expected ":", but got "(\w+)"/,
         ->(m) { "Missing ':' before symbol, but got \"#{m[1]}\"" }],
        [/Expected ":"/,
         ->(_m) { "Missing ':' before symbol" }],
        [/Expected "do", but got "(\w+)"/,
         ->(m) { "Missing 'do' keyword, but got \"#{m[1]}\"" }],
        [/Expected "do"/,
         ->(_m) { "Missing 'do' keyword" }],
        [/Expected "end", but got (.+)/,
         ->(m) { "Missing 'end' keyword, but got #{m[1]}" }],
        [/Expected "end"/,
         ->(_m) { "Missing 'end' keyword" }],
        [/Expected "(\w+)", but got "(\w+)"/,
         ->(m) { "Missing '#{m[1]}' keyword, but got \"#{m[2]}\"" }],
        [/Expected '(\w+)'/,
         ->(m) { "Expected '#{m[1]}'" }],
        [/Expected "([^"]+)", but got "([^"]+)"/,
         ->(m) { "Expected '#{m[1]}', but got \"#{m[2]}\"" }],
        [/Expected "(\w+)"/,
         ->(m) { "Missing '#{m[1]}' keyword" }],
        [/Failed to match.*Premature end of input/m,
         ->(_m) { 'Failed to match - premature end of input' }],
        [/Premature end of input/,
         ->(_m) { "Unexpected end of file - missing 'end'?" }],
        [/Failed to match/,
         ->(_m) { 'Failed to match sequence' }]
      ].freeze

      # Builds a diagnostic hash from +error+.
      #
      # @param error [#message] exception-like object
      # @return [Hash] empty when +error+ does not respond to #message;
      #   otherwise { message:, line:, column:, severity: :error, type: }.
      #   :type is :syntax when the error's class name contains "Syntax",
      #   :runtime otherwise. :line/:column default to 1 when the message
      #   carries no "at line N char M" information.
      def self.extract(error)
        return {} unless error.respond_to?(:message)

        # to_s guards against objects whose #message returns nil.
        message = error.message.to_s
        error_type = error.class.name.to_s.match?(/Syntax/) ? :syntax : :runtime

        position = LOCATION_PATTERN.match(message)
        line, column = position ? [position[1].to_i, position[2].to_i] : [1, 1]

        formatted_message = if error_type == :syntax
                              extract_user_friendly_message(message)
                            else
                              "#{error.class.name}: #{message}"
                            end

        {
          message: formatted_message,
          line: line,
          column: column,
          severity: :error,
          type: error_type
        }
      end

      # Alias-style entry point for {extract_user_friendly_message}.
      #
      # @param raw_message [String, nil]
      # @return [String]
      def self.humanize_error_message(raw_message)
        extract_user_friendly_message(raw_message)
      end

      # Rewrites a raw parser message into a short user-facing sentence.
      #
      # @param raw_message [String, nil] raw message; nil is treated as ''
      # @return [String] the text of the first matching rule, or
      #   'Parse error' when no rule matches
      def self.extract_user_friendly_message(raw_message)
        # Strip tree markers, location info and surrounding whitespace first.
        cleaned_message = raw_message.to_s
                                     .gsub(TREE_MARKER_PATTERN, '')
                                     .gsub(LOCATION_SUFFIX_PATTERN, '')
                                     .strip

        FRIENDLY_MESSAGE_RULES.each do |pattern, formatter|
          match = pattern.match(cleaned_message)
          return formatter.call(match) if match
        end

        'Parse error'
      end
    end
  end
end

data/lib/kumi/parser/errors.rb DELETED Viewed

@@ -1,40 +0,0 @@
module Kumi
  module Parser
    # Namespace for parser-related errors
    module Errors
      # Raised for parsing issues. Carries the offending token and optional
      # fix suggestions, and renders both into the exception message.
      class ParseError < StandardError
        attr_reader :token, :suggestions

        # @param message [String] description of what went wrong
        # @param token [#location, nil] token the error refers to; its
        #   #location is interpolated into the first line of the message
        # @param suggestions [Array<String>, String, nil] optional hints;
        #   nil or a single value is normalized to an array
        def initialize(message, token:, suggestions: [])
          @token = token
          # Array() keeps an Array argument as-is and turns nil into [].
          @suggestions = Array(suggestions)
          super(build_error_message(message))
        end

        private

        # Assembles the multi-line message: header, indented description,
        # then an optional bulleted "Suggestions:" section.
        def build_error_message(message)
          lines = [error_header, "  #{message}"]
          unless @suggestions.empty?
            lines << '  Suggestions:'
            lines.concat(@suggestions.map { |suggestion| "    - #{suggestion}" })
          end
          lines.join("\n")
        end

        # Header line; omits the position when the token cannot report one
        # rather than failing while the error itself is being built.
        def error_header
          return 'Parse error' unless @token.respond_to?(:location)

          "Parse error at #{@token.location}"
        end
      end

      # Raised by the tokenizer; the location is appended to the message.
      class TokenizerError < StandardError
        attr_reader :location

        # @param message [String] description of the problem
        # @param location [#to_s] where the problem occurred
        def initialize(message, location:)
          @location = location
          super("#{message} at #{location}")
        end
      end
    end
  end
end

data/lib/kumi/parser/helpers.rb DELETED Viewed

@@ -1,154 +0,0 @@
module Kumi
  module Parser
    # Token-stream helpers meant to be mixed into a parser.
    #
    # The including object must provide: current_token, peek_token, advance,
    # expect_token, parse_expression, parse_domain_specification,
    # raise_parse_error, and an integer @pos cursor into the token list.
    module Helpers
      # Keyword labels accepted after a declaration's type name.
      DECL_KWARG_KEYS = %w[domain index].freeze

      # Operator token types whose function name differs from the token type.
      OPERATOR_FUNCTION_NAMES = {
        eq: :==,
        ne: :!=,
        gt: :>,
        lt: :<,
        gte: :>=,
        lte: :<=,
        and: :and,
        or: :or,
        exponent: :power
      }.freeze

      # Parses optional ", domain: ..., index: :sym" (order-agnostic, both optional).
      # Cursor is right after the array/hash/type name.
      #
      # @return [Array(Object, Symbol)] [domain, index]; either may be nil
      def parse_optional_decl_kwargs
        domain = nil
        index = nil

        # consume one or more ", key: value" pairs
        while current_token.type == :comma
          advance
          key_tok = current_token
          unless key_tok.type == :label && DECL_KWARG_KEYS.include?(key_tok.value)
            # Not a kw pair: step back so the comma is left for the caller.
            @pos -= 1
            break
          end
          advance

          case key_tok.value
          when 'domain' then domain = parse_domain_specification
          when 'index'  then index = expect_token(:symbol).value.to_sym
          end
        end

        [domain, index]
      end

      # Converts a literal token into its Ruby value.
      #
      # @param token [#type, #value]
      # @return [Integer, Float, String, Boolean, Symbol, nil] nil for token
      #   types that are not handled here
      def convert_literal_value(token)
        case token.type
        when :integer  then token.value.delete('_').to_i
        when :float    then token.value.delete('_').to_f
        when :string   then token.value
        when :boolean  then token.value == 'true'
        when :symbol   then token.value.to_sym
        when :constant
          return Float::INFINITY if token.value == 'Float::INFINITY'

          raise_parse_error("Unknown constant: #{token.value}")
        end
      end

      # Parses the value of a keyword argument at the cursor and advances
      # past it. Accepts numeric/string/symbol/boolean literals, bare labels
      # (returned as symbols, e.g. :wrap, :clamp) and a unary '-' in front of
      # a numeric value.
      def parse_kw_literal_value
        token = current_token
        case token.type
        when :integer, :float, :boolean
          advance
          convert_literal_value(token)
        when :string, :symbol
          advance
          token.value
        when :label
          advance
          token.value.to_sym
        when :subtract # allow negatives like -1
          advance
          operand = parse_kw_literal_value
          raise_parse_error("numeric after unary '-'") unless operand.is_a?(Numeric)
          -operand
        else
          raise_parse_error('keyword value must be literal/label')
        end
      end

      # Parses "arg, arg, key: value, ...)" with the cursor just after the
      # opening paren, and consumes the closing paren.
      #
      # @return [Array(Array, Hash)] [positional args, keyword options]
      def parse_args_and_opts_inside_parens
        args = []
        opts = {}

        unless current_token.type == :rparen
          # --- positional args ---
          unless next_is_kwarg_after_comma?
            args << parse_expression
            while current_token.type == :comma && !next_is_kwarg_after_comma?
              advance
              args << parse_expression
            end
          end

          # --- kwargs: repeated `, label value` pairs ---
          while next_is_kwarg_after_comma?
            advance # comma
            key = current_token.value.to_sym
            advance # label
            opts[key] = parse_kw_literal_value
          end
        end

        expect_token(:rparen)
        [args, opts]
      end

      # Consumes the current token if it is an identifier or keyword and
      # returns its value; otherwise reports a parse error.
      def expect_field_name_token
        token = current_token
        unless token.identifier? || token.keyword?
          raise_parse_error("Expected field name (identifier or keyword), got #{token.type}")
        end

        advance
        token.value
      end

      # True when the cursor is on a comma that is followed by a label.
      def next_is_kwarg_after_comma?
        current_token.type == :comma && peek_token.type == :label
      end

      # Advances past any run of newline and comment tokens.
      def skip_comments_and_newlines
        advance while %i[newline comment].include?(current_token.type)
      end

      # Returns the current token and moves the cursor past it.
      def advance_and_return_token
        token = current_token
        advance
        token
      end

      # Maps an operator token type to the function name used for it;
      # unknown types are returned unchanged.
      def map_operator_token_to_function_name(token_type)
        OPERATOR_FUNCTION_NAMES.fetch(token_type, token_type)
      end
    end
  end
end

data/lib/kumi/parser/smart_tokenizer.rb DELETED Viewed

@@ -1,373 +0,0 @@
-# frozen_string_literal: true
-require_relative 'token_constants'
-require_relative 'token'
-require_relative 'errors'
module Kumi
  module Parser
    # Context-aware tokenizer that produces tokens with embedded semantic metadata.
    #
    # Every token's location is the line/column of the token's FIRST
    # character in the source.
    class SmartTokenizer
      # Characters allowed inside identifiers, labels, symbols and constants.
      IDENTIFIER_CHAR = /[a-zA-Z0-9_]/.freeze

      # Multi-character operators. Order matters: an operator must be listed
      # before any operator that is a prefix of it ('...' before '..').
      MULTI_CHAR_OPERATORS = [
        ['...', :dot_dot_dot],
        ['=>', :arrow],
        ['==', :eq],
        ['!=', :ne],
        ['>=', :gte],
        ['<=', :lte],
        ['**', :exponent],
        ['..', :dot_dot]
      ].freeze

      # Backslash escapes that map to a different character; any other
      # escaped character stands for itself.
      STRING_ESCAPES = { 'n' => "\n", 't' => "\t", 'r' => "\r" }.freeze

      # @param source [String] text to tokenize
      # @param source_file [String] file name recorded in token locations
      def initialize(source, source_file: '<input>')
        @source = source
        @source_file = source_file
        @pos = 0
        @line = 1
        @column = 1
        @context_stack = [:global]
        @tokens = []
      end

      # Scans the whole source.
      #
      # @return [Array<Token>] all tokens, terminated by an :eof token
      # @raise [Errors::TokenizerError] on an unterminated string, a lone
      #   '=' or '!', or an unrecognized character
      def tokenize
        while @pos < @source.length
          skip_whitespace_except_newlines

          case current_char
          when nil then break
          when "\n" then handle_newline
          when '#' then consume_comment
          when '"', "'" then consume_string
          when /\d/ then consume_number
          when '-'
            # A '-' directly followed by a digit is always read as the sign
            # of a numeric literal, regardless of what precedes it.
            if peek_char&.match?(/\d/)
              consume_number
            else
              consume_operator_or_punctuation
            end
          when /[a-zA-Z_]/ then consume_identifier_or_label_or_keyword
          when ':' then consume_symbol_or_colon
          else
            consume_operator_or_punctuation
          end
        end

        add_token(:eof, nil, {})
        @tokens
      end

      private

      # Character under the cursor, or nil at end of input.
      def current_char
        return nil if @pos >= @source.length

        @source[@pos]
      end

      # Character +offset+ positions ahead of the cursor, or nil past the end.
      def peek_char(offset = 1)
        peek_pos = @pos + offset
        return nil if peek_pos >= @source.length

        @source[peek_pos]
      end

      # Moves the cursor one character forward, tracking line and column.
      def advance
        if current_char == "\n"
          @line += 1
          @column = 1
        else
          @column += 1
        end
        @pos += 1
      end

      def skip_whitespace_except_newlines
        advance while current_char&.match?(/[ \t\r]/)
      end

      def handle_newline
        add_token(:newline, "\n", metadata_for(:newline))
        advance
      end

      # Emits a :comment token holding the text after '#' up to end of line.
      def consume_comment
        start_line = @line
        start_column = @column
        advance # skip #
        comment_text = consume_while { |char| char != "\n" }
        add_token(:comment, comment_text, metadata_for(:comment), line: start_line, column: start_column)
      end

      # Emits a :string token for a single- or double-quoted literal,
      # resolving backslash escapes.
      def consume_string
        start_line = @line
        start_column = @column
        quote_char = current_char # closing quote must match the opening one
        advance # skip opening quote

        string_content = +''
        while current_char && current_char != quote_char
          if current_char == '\\'
            advance
            escaped = current_char
            # A trailing backslash at end of input contributes nothing.
            string_content << STRING_ESCAPES.fetch(escaped, escaped) if escaped
          else
            string_content << current_char
          end
          advance
        end

        raise_tokenizer_error('Unterminated string literal') if current_char != quote_char
        advance # skip closing quote

        add_token(:string, string_content, metadata_for(:string), line: start_line, column: start_column)
      end

      # Emits an :integer or :float token. The token value is the raw text,
      # including any leading '-' and '_' separators.
      def consume_number
        start_line = @line
        start_column = @column
        number_str = +''
        has_dot = false

        if current_char == '-'
          number_str << '-'
          advance
        end

        while (char = current_char)
          if char == '.'
            # Only the first '.' counts, and only when a digit follows;
            # otherwise it is member access or a range operator.
            break if has_dot || !peek_char&.match?(/\d/)

            has_dot = true
          elsif !char.match?(/[0-9_]/)
            break
          end
          number_str << char
          advance
        end

        token_type = has_dot ? :float : :integer
        add_token(token_type, number_str, metadata_for(token_type), line: start_line, column: start_column)
      end

      # Emits a :constant (Foo::BAR), a :label (name:), a keyword token, a
      # :function_sugar token or an :identifier, checked in that order.
      def consume_identifier_or_label_or_keyword
        start_line = @line
        start_column = @column
        name = consume_while { |char| char.match?(IDENTIFIER_CHAR) }

        # Constants first (e.g., Float::INFINITY): "::" must not be mistaken
        # for the single ':' that ends a label.
        if current_char == ':' && peek_char == ':'
          while current_char == ':' && peek_char == ':'
            advance # consume first :
            advance # consume second :
            segment = consume_while { |char| char.match?(IDENTIFIER_CHAR) }
            name = "#{name}::#{segment}"
          end
          add_token(:constant, name, metadata_for(:constant), line: start_line, column: start_column)
          return
        end

        # A single trailing colon makes it a hash key or label (e.g., `name:`).
        if current_char == ':'
          advance # consume the colon
          add_token(:label, name, metadata_for(:label), line: start_line, column: start_column)
          return
        end

        keyword_type = Kumi::Parser::KEYWORDS[name]
        if keyword_type
          metadata = metadata_for(keyword_type)
          case keyword_type
          when :schema, :input
            @context_stack.push(keyword_type)
            metadata[:opens_context] = keyword_type
          when :end
            # Never pop the bottom (:global) entry.
            closed_context = @context_stack.pop if @context_stack.length > 1
            metadata[:closes_context] = closed_context
          end
          add_token(keyword_type, name, metadata, line: start_line, column: start_column)
          return
        end

        if Kumi::Parser::FUNCTION_SUGAR[name]
          add_token(:function_sugar, name, metadata_for(:function_sugar), line: start_line, column: start_column)
          return
        end

        metadata = metadata_for(:identifier)
        case current_context
        when :input  then metadata[:context] = :input_declaration
        when :schema then metadata[:context] = :schema_body
        end
        add_token(:identifier, name, metadata, line: start_line, column: start_column)
      end

      # Emits a :symbol token for ":name" (value is a Symbol), otherwise a
      # bare :colon token.
      def consume_symbol_or_colon
        start_line = @line
        start_column = @column

        if peek_char&.match?(/[a-zA-Z_]/)
          advance # skip :
          symbol_name = consume_while { |char| char.match?(IDENTIFIER_CHAR) }
          add_token(:symbol, symbol_name.to_sym, metadata_for(:symbol), line: start_line, column: start_column)
        else
          add_token(:colon, ':', metadata_for(:colon), line: start_line, column: start_column)
          advance
        end
      end

      # Emits an operator or punctuation token, preferring the longest
      # multi-character operator that matches at the cursor.
      def consume_operator_or_punctuation
        start_line = @line
        start_column = @column

        operator, operator_type = MULTI_CHAR_OPERATORS.find { |text, _type| @source[@pos, text.length] == text }
        if operator
          operator.length.times { advance }
          add_token(operator_type, operator, metadata_for(operator_type), line: start_line, column: start_column)
          return
        end

        char = current_char
        token_type = case char
                     when '=' then raise_tokenizer_error("Unexpected '=' (did you mean '=='?)")
                     when '!' then raise_tokenizer_error("Unexpected '!' (did you mean '!='?)")
                     when '>' then :gt
                     when '<' then :lt
                     else CHAR_TO_TOKEN[char]
                     end
        raise_tokenizer_error("Unexpected character: #{char}") unless token_type

        advance
        add_token(token_type, char, metadata_for(token_type), line: start_line, column: start_column)
      end

      # Consumes characters while the block returns truthy for the current
      # one and returns the consumed text.
      def consume_while
        result = +''
        while current_char && yield(current_char)
          result << current_char
          advance
        end
        result
      end

      def current_context
        @context_stack.last
      end

      # Fresh copy of the shared metadata for +type+, so per-token
      # annotations never leak into the TOKEN_METADATA table.
      def metadata_for(type)
        Kumi::Parser::TOKEN_METADATA[type].dup
      end

      # Appends a token. +line+/+column+ default to the cursor position;
      # multi-character tokens pass the position captured at their start.
      def add_token(type, value, metadata, line: @line, column: @column)
        location = Kumi::Syntax::Location.new(file: @source_file, line: line, column: column)
        @tokens << Token.new(type, value, location, metadata)
      end

      # Raises a TokenizerError located at the current cursor position.
      def raise_tokenizer_error(message)
        location = Kumi::Syntax::Location.new(file: @source_file, line: @line, column: @column)
        raise Errors::TokenizerError.new(message, location: location)
      end
    end
  end
end

data/lib/kumi/parser/syntax_validator.rb DELETED Viewed

@@ -1,21 +0,0 @@
-# frozen_string_literal: true
module Kumi
  module Parser
    # Convenience front-end for syntax checking of Kumi DSL text. All real
    # work is delegated to Kumi::Parser::Base.validate.
    class SyntaxValidator
      # Runs validation and returns whatever Base.validate returns.
      # NOTE(review): the other methods call #empty? and #first[:message]
      # on the result, so it is presumably a list of diagnostic hashes.
      #
      # @param text [String] DSL source to check
      # @param source_file [String] name forwarded to Base.validate
      def validate(text, source_file: '<input>')
        Kumi::Parser::Base.validate(text, source_file: source_file)
      end

      # @return [Boolean] true when validation reports no diagnostics
      def valid?(text, source_file: '<input>')
        diagnostics = validate(text, source_file: source_file)
        diagnostics.empty?
      end

      # @return [Object, nil] the :message entry of the first diagnostic,
      #   or nil when there are none
      def first_error(text, source_file: '<input>')
        diagnostics = validate(text, source_file: source_file)
        return nil if diagnostics.empty?

        diagnostics.first[:message]
      end
    end
  end
end