lexer_kit 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +157 -0
  4. data/exe/lexer_kit +7 -0
  5. data/ext/lexer_kit_rust/Cargo.toml +17 -0
  6. data/ext/lexer_kit_rust/extconf.rb +6 -0
  7. data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
  8. data/ext/lexer_kit_rust/src/dfa.rs +217 -0
  9. data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
  10. data/ext/lexer_kit_rust/src/lib.rs +248 -0
  11. data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
  12. data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
  13. data/ext/lexer_kit_rust/src/trie.rs +206 -0
  14. data/ext/lexer_kit_rust/src/types.rs +319 -0
  15. data/ext/lexer_kit_rust/src/vm.rs +258 -0
  16. data/lib/lexer_kit/builder/compiler.rb +596 -0
  17. data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
  18. data/lib/lexer_kit/builder/mode_def.rb +36 -0
  19. data/lib/lexer_kit/builder/token_def.rb +65 -0
  20. data/lib/lexer_kit/builder/validator.rb +84 -0
  21. data/lib/lexer_kit/builder.rb +230 -0
  22. data/lib/lexer_kit/cli/commands.rb +389 -0
  23. data/lib/lexer_kit/cli.rb +88 -0
  24. data/lib/lexer_kit/core/diagnostic.rb +103 -0
  25. data/lib/lexer_kit/core/source.rb +154 -0
  26. data/lib/lexer_kit/core/span.rb +80 -0
  27. data/lib/lexer_kit/core/token.rb +120 -0
  28. data/lib/lexer_kit/core.rb +13 -0
  29. data/lib/lexer_kit/debug/disassembler.rb +143 -0
  30. data/lib/lexer_kit/debug/visualizer.rb +203 -0
  31. data/lib/lexer_kit/debug.rb +11 -0
  32. data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
  33. data/lib/lexer_kit/dfa/case_folding.rb +45 -0
  34. data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
  35. data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
  36. data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
  37. data/lib/lexer_kit/dfa/nfa.rb +304 -0
  38. data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
  39. data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
  40. data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
  41. data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
  42. data/lib/lexer_kit/dfa.rb +37 -0
  43. data/lib/lexer_kit/errors.rb +76 -0
  44. data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
  45. data/lib/lexer_kit/format/lkb1.rb +199 -0
  46. data/lib/lexer_kit/format/lkt1.rb +111 -0
  47. data/lib/lexer_kit/format.rb +19 -0
  48. data/lib/lexer_kit/ir/compiled_program.rb +228 -0
  49. data/lib/lexer_kit/ir/constant_pool.rb +107 -0
  50. data/lib/lexer_kit/ir/dfa_table.rb +125 -0
  51. data/lib/lexer_kit/ir/instruction.rb +50 -0
  52. data/lib/lexer_kit/ir/jump_table.rb +94 -0
  53. data/lib/lexer_kit/ir/keyword_table.rb +168 -0
  54. data/lib/lexer_kit/ir/opcode.rb +96 -0
  55. data/lib/lexer_kit/ir/serializer.rb +249 -0
  56. data/lib/lexer_kit/ir.rb +16 -0
  57. data/lib/lexer_kit/runner.rb +114 -0
  58. data/lib/lexer_kit/trie.rb +170 -0
  59. data/lib/lexer_kit/version.rb +5 -0
  60. data/lib/lexer_kit.rb +155 -0
  61. metadata +119 -0
@@ -0,0 +1,154 @@
1
# frozen_string_literal: true

module LexerKit
  module Core
    # Source holds the input byte sequence and optional filename.
    # It provides line/column conversion for diagnostics.
    class Source
      attr_reader :bytes, :filename

      # @param input [String] input string (kept as-is for char conversion,
      #   and copied as BINARY for byte operations)
      # @param filename [String, nil] optional filename for diagnostics
      def initialize(input, filename: nil)
        # Keep our own frozen reference without freezing the caller's object:
        # calling input.freeze would permanently mutate the argument as a
        # side effect visible outside this class.
        @original_string = input.frozen? ? input : input.dup.freeze
        @bytes = input.dup.force_encoding(Encoding::BINARY).freeze
        @filename = filename&.freeze
        @line_starts = nil
      end

      # Length in bytes
      # @return [Integer]
      def length
        @bytes.bytesize
      end

      alias size length

      # Build line index (explicit, not automatic).
      # Call this before using line_col or line_slice on large inputs.
      # @return [self]
      def line_index!
        return self if @line_starts

        @line_starts = [0]
        # String#index on the BINARY string returns byte offsets and scans in
        # native code, which is much faster than a per-byte Ruby loop.
        pos = 0
        while (nl = @bytes.index("\n", pos))
          pos = nl + 1
          @line_starts << pos
        end
        @line_starts.freeze
        self
      end

      # Convert byte offset to line and column (1-based).
      # Builds line index if not already built.
      # @param byte_offset [Integer]
      # @return [Array(Integer, Integer)] [line, column]
      def line_col(byte_offset)
        line_index! unless @line_starts

        # Find the first line start strictly past the offset; its index is the
        # (1-based) line containing the offset. nil means the last line.
        line = @line_starts.bsearch_index { |start| start > byte_offset }
        line ||= @line_starts.length
        line_start = @line_starts[line - 1]
        col = byte_offset - line_start + 1

        [line, col]
      end

      # Get the content of a specific line (1-based), without its newline.
      # @param line [Integer] line number (1-based)
      # @return [String, nil] nil when line is out of range
      def line_slice(line)
        line_index! unless @line_starts
        return nil if line < 1 || line > @line_starts.length

        start = @line_starts[line - 1]
        if line < @line_starts.length
          # Not the last line: slice up to the next line start, then strip
          # the trailing newline.
          end_pos = @line_starts[line]
          content = @bytes.byteslice(start, end_pos - start)
          content.chomp
        else
          # Last line: runs to end of input (no trailing newline to strip).
          @bytes.byteslice(start, @bytes.bytesize - start)
        end
      end

      # Get the number of lines
      # @return [Integer]
      def line_count
        line_index! unless @line_starts
        @line_starts.length
      end

      # Create a span for the given range
      # @param start [Integer] start byte offset
      # @param len [Integer] length
      # @return [Span]
      def span(start, len)
        Span.new(start, len)
      end

      # Extract text for a span
      # @param span [Span]
      # @return [String]
      def text(span)
        span.slice(@bytes)
      end

      # Get the span covering an entire line (1-based).
      # @param line [Integer] line number (1-based)
      # @return [Span] empty span at offset 0 when line is past the end
      def span_for_line(line)
        line_index! unless @line_starts

        line = [line, 1].max
        return Span.new(0, 0) if line > @line_starts.length

        start = @line_starts[line - 1]
        line_end = if line < @line_starts.length
                     # Not the last line - span up to (but not including) newline
                     @line_starts[line] - 1
                   else
                     # Last line
                     @bytes.bytesize
                   end
        Span.new(start, line_end - start)
      end

      # Convert character index to byte offset.
      # For BINARY input, returns char_index directly (O(1)).
      # For other encodings (e.g. UTF-8), computes byte offset (O(n), error paths only).
      # @param char_index [Integer] character position (0-based)
      # @return [Integer] byte offset
      def byte_offset_for_char_index(char_index)
        if @original_string.encoding == Encoding::BINARY
          char_index
        else
          @original_string[0...char_index].bytesize
        end
      end

      # Get the span for a character index.
      # For BINARY input, O(1). For other encodings, O(n) (error paths only).
      # @param char_index [Integer] character position (0-based)
      # @param len [Integer] character length (default: 1)
      # @return [Span]
      def span_for_char_index(char_index, len: 1)
        byte_start = byte_offset_for_char_index(char_index)
        byte_end = byte_offset_for_char_index(char_index + len)
        Span.new(byte_start, byte_end - byte_start)
      end

      def inspect
        filename_str = @filename ? " #{@filename.inspect}" : ""
        "#<LexerKit::Core::Source#{filename_str} #{length} bytes>"
      end
    end
  end
end
@@ -0,0 +1,80 @@
1
# frozen_string_literal: true

module LexerKit
  module Core
    # Span is a byte-offset range into the input, expressed as the
    # half-open interval [start, start + len).
    class Span
      attr_reader :start, :len

      # @param start [Integer] start byte offset (0-based)
      # @param len [Integer] length in bytes
      def initialize(start, len)
        @start = start
        @len = len
      end

      # Exclusive end offset.
      # @return [Integer]
      def end
        @start + @len
      end

      # Whether the span covers zero bytes.
      # @return [Boolean]
      def empty?
        @len.zero?
      end

      # Whether this span fully contains another span.
      # @param other [Span]
      # @return [Boolean]
      def cover?(other)
        other.start >= @start && other.end <= self.end
      end

      # Whether this span shares at least one byte with another span.
      # @param other [Span]
      # @return [Boolean]
      def overlap?(other)
        other.end > @start && other.start < self.end
      end

      # Smallest span containing both this span and another.
      # @param other [Span]
      # @return [Span]
      def merge(other)
        lo = @start < other.start ? @start : other.start
        hi = self.end > other.end ? self.end : other.end
        Span.new(lo, hi - lo)
      end

      # Extract this span's bytes from a byte string.
      # @param bytes [String] source bytes (BINARY encoding)
      # @return [String]
      def slice(bytes)
        bytes.byteslice(@start, @len)
      end

      def ==(other)
        other.is_a?(Span) && [other.start, other.len] == [@start, @len]
      end

      def eql?(other)
        self == other
      end

      def hash
        [@start, @len].hash
      end

      def to_s
        "#{@start}..#{@start + @len}"
      end

      def inspect
        "#<LexerKit::Core::Span #{self}>"
      end
    end
  end
end
@@ -0,0 +1,120 @@
1
# frozen_string_literal: true

module LexerKit
  module Core
    # Token represents a lexed token with position and optional location info.
    # Line and column are lazily computed to minimize overhead.
    class Token
      attr_reader :id, :name, :start, :len, :meta

      # @param id [Integer] token ID
      # @param name [Symbol] token name
      # @param start [Integer] start byte offset
      # @param len [Integer] length in bytes
      # @param source [Source] source object for location lookup
      # @param meta [Hash, nil] optional metadata for this token
      def initialize(id:, name:, start:, len:, source:, meta: nil)
        @id = id
        @name = name
        @start = start
        @len = len
        @source = source
        @meta = meta || {}
        @location = nil # memoized [line, col] pair
      end

      # Alias for name (compatibility with other tools)
      # @return [Symbol]
      alias type name

      # Get the token text
      # @return [String]
      def text
        @source.bytes.byteslice(@start, @len)
      end

      # Get the line number (1-based, lazily computed)
      # @return [Integer]
      def line
        location[0]
      end

      # Get the column number (1-based, lazily computed)
      # @return [Integer]
      def col
        location[1]
      end

      # Get the span object
      # @return [Span]
      def span
        @span ||= Span.new(@start, @len)
      end

      # Check if this is an error token
      # @return [Boolean]
      def error?
        @id == LexerKit::INVALID_TOKEN_ID
      end

      # Create a Diagnostic from this token
      # @param message [String] error message
      # @param level [Symbol] :error, :warning, or :note
      # @param notes [Array<String>, nil] additional notes
      # @return [Diagnostic]
      def to_diagnostic(message, level: :error, notes: nil)
        Diagnostic.new(
          level: level,
          message: message,
          span: span,
          notes: notes
        )
      end

      # Render diagnostic with source context
      # @param message [String] error message
      # @param level [Symbol] :error, :warning, or :note
      # @param notes [Array<String>, nil] additional notes
      # @param color [Boolean] enable ANSI colors
      # @return [String]
      def render_diagnostic(message, level: :error, notes: nil, color: $stdout.tty?)
        to_diagnostic(message, level: level, notes: notes).render(@source, color: color)
      end

      def ==(other)
        return false unless other.is_a?(Token)

        @name == other.name && @start == other.start && @len == other.len
      end

      def eql?(other)
        self == other
      end

      def hash
        [@name, @start, @len].hash
      end

      def to_h
        {
          id: @id,
          name: @name,
          text: text,
          start: @start,
          len: @len,
          line: line,
          col: col
        }
      end

      def to_s
        "#{@name}(#{text.inspect})"
      end

      def inspect
        "#<LexerKit::Core::Token #{@name} #{@start}:#{@len} #{text.inspect}>"
      end

      private

      # Resolve and memoize [line, col] with a single source lookup.
      # The previous code memoized line and col separately, so reading both
      # ran Source#line_col (a binary search) twice per token.
      def location
        @location ||= @source.line_col(@start)
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
# frozen_string_literal: true

# Load order matters: Span and Source come first, then Diagnostic
# (rendered against a Source), then Token (which builds Diagnostics).
require_relative "core/span"
require_relative "core/source"
require_relative "core/diagnostic"
require_relative "core/token"

module LexerKit
  # Core module provides the foundational types shared by all lexer implementations.
  # These types are designed to be zero-allocation friendly and work with byte offsets.
  module Core
  end
end
@@ -0,0 +1,143 @@
1
# frozen_string_literal: true

require "stringio"

module LexerKit
  module Debug
    # Disassembler renders a CompiledProgram's instruction stream as
    # human-readable text: a header, a label at each mode entry point,
    # and one formatted line per instruction.
    class Disassembler
      # @param program [IR::CompiledProgram] program to disassemble
      def initialize(program)
        @program = program
      end

      # Disassemble all instructions
      # @param io [IO, nil] output stream; when nil, output is buffered
      # @return [String, nil] disassembled output, or nil when io was given
      def disassemble(io: nil)
        output = io || StringIO.new

        # Header
        output.puts "# LexerKit Disassembly"
        output.puts "# Version: #{@program.version}"
        output.puts "# Instructions: #{@program.instructions.size}"
        output.puts "# Tokens: #{@program.token_names.join(', ')}"
        output.puts "# Modes: #{@program.mode_names.join(', ')}"
        output.puts

        # Invert mode_offsets (name => offset) so a label can be printed
        # at the offset where each mode begins.
        mode_labels = {}
        @program.mode_offsets.each do |name, offset|
          mode_labels[offset] = name
        end

        # Disassemble each instruction
        @program.instructions.each_with_index do |instr, offset|
          if mode_labels[offset]
            output.puts
            output.puts "#{mode_labels[offset]}:"
          end

          output.puts format_instruction(offset, instr)
        end

        io ? nil : output.string
      end

      private

      # One line of output: zero-padded address, padded mnemonic, arguments.
      def format_instruction(offset, instr)
        addr = format("%04d", offset)
        name = IR::Opcode.name(instr.opcode).ljust(16)
        arg_str = format_arg(instr)

        "  #{addr}: #{name}#{arg_str}"
      end

      # Render an instruction's argument according to its opcode.
      # Opcodes with identical argument formats share a `when` clause,
      # which removes the duplicate branches the old code suppressed
      # with `rubocop:disable Lint/DuplicateBranch`.
      def format_arg(instr)
        case instr.opcode
        when IR::Opcode::DFA_RUN
          "dfa=#{instr.arg}"
        when IR::Opcode::DFA_RUN_IF_MATCH
          dfa_id, fail_target = unpack_arg(instr.arg)
          "dfa=#{dfa_id} fail->#{format('%04d', fail_target)}"
        when IR::Opcode::SCAN_UNTIL, IR::Opcode::MATCH_LITERAL
          literal = @program.constant_pool.get(instr.arg)
          "const=#{instr.arg} #{literal.inspect}"
        when IR::Opcode::MATCH_LITERAL_OR_JUMP
          const_id, fail_target = unpack_arg(instr.arg)
          literal = @program.constant_pool.get(const_id)
          "const=#{const_id} #{literal.inspect} fail->#{format('%04d', fail_target)}"
        when IR::Opcode::MATCH_RANGE
          min_byte = (instr.arg >> 8) & 0xFF
          max_byte = instr.arg & 0xFF
          "[#{format_byte(min_byte)}-#{format_byte(max_byte)}]"
        when IR::Opcode::SWITCH_BYTE, IR::Opcode::KEYWORD_LOOKUP
          "table=#{instr.arg}"
        when IR::Opcode::JUMP, IR::Opcode::JUMP_IF_EOF,
             IR::Opcode::COMMIT_BEST, IR::Opcode::EMIT_SKIP_AND_JUMP
          "-> #{format('%04d', instr.arg)}"
        when IR::Opcode::PUSH_MODE
          mode_name = @program.mode_names[instr.arg] || instr.arg
          "mode=#{mode_name}"
        when IR::Opcode::EMIT, IR::Opcode::EMIT_ERROR
          token_name = @program.token_name(instr.arg) || instr.arg
          "token=#{token_name}"
        when IR::Opcode::EMIT_AND_JUMP
          token_id, jump_target = unpack_arg(instr.arg)
          token_name = @program.token_name(token_id) || token_id
          "token=#{token_name} -> #{format('%04d', jump_target)}"
        when IR::Opcode::SET_MATCH
          order, action_ip = unpack_arg(instr.arg)
          "order=#{order} -> #{format('%04d', action_ip)}"
        when IR::Opcode::LITERAL_TRIE_RUN
          "const=#{instr.arg}"
        when IR::Opcode::LITERAL_TRIE_COMMIT
          const_id, fail_target = unpack_arg(instr.arg)
          "const=#{const_id} fail->#{format('%04d', fail_target)}"
        when IR::Opcode::POP_MODE, IR::Opcode::EMIT_SKIP, IR::Opcode::MARK,
             IR::Opcode::CLEAR_BEST, IR::Opcode::HALT
          # These opcodes carry no argument.
          ""
        else
          instr.arg.to_s
        end
      end

      # Split a packed argument into [high 10 bits, low 14 bits] — the
      # id/target encoding used by the two-operand opcodes above.
      def unpack_arg(arg)
        [(arg >> 14) & 0x3FF, arg & 0x3FFF]
      end

      # Printable ASCII bytes render as 'c', everything else as 0xNN.
      def format_byte(byte)
        if byte >= 0x20 && byte < 0x7F
          "'#{byte.chr}'"
        else
          format("0x%02X", byte)
        end
      end
    end
  end
end
+ end