lexer_kit 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61)
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +157 -0
  4. data/exe/lexer_kit +7 -0
  5. data/ext/lexer_kit_rust/Cargo.toml +17 -0
  6. data/ext/lexer_kit_rust/extconf.rb +6 -0
  7. data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
  8. data/ext/lexer_kit_rust/src/dfa.rs +217 -0
  9. data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
  10. data/ext/lexer_kit_rust/src/lib.rs +248 -0
  11. data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
  12. data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
  13. data/ext/lexer_kit_rust/src/trie.rs +206 -0
  14. data/ext/lexer_kit_rust/src/types.rs +319 -0
  15. data/ext/lexer_kit_rust/src/vm.rs +258 -0
  16. data/lib/lexer_kit/builder/compiler.rb +596 -0
  17. data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
  18. data/lib/lexer_kit/builder/mode_def.rb +36 -0
  19. data/lib/lexer_kit/builder/token_def.rb +65 -0
  20. data/lib/lexer_kit/builder/validator.rb +84 -0
  21. data/lib/lexer_kit/builder.rb +230 -0
  22. data/lib/lexer_kit/cli/commands.rb +389 -0
  23. data/lib/lexer_kit/cli.rb +88 -0
  24. data/lib/lexer_kit/core/diagnostic.rb +103 -0
  25. data/lib/lexer_kit/core/source.rb +154 -0
  26. data/lib/lexer_kit/core/span.rb +80 -0
  27. data/lib/lexer_kit/core/token.rb +120 -0
  28. data/lib/lexer_kit/core.rb +13 -0
  29. data/lib/lexer_kit/debug/disassembler.rb +143 -0
  30. data/lib/lexer_kit/debug/visualizer.rb +203 -0
  31. data/lib/lexer_kit/debug.rb +11 -0
  32. data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
  33. data/lib/lexer_kit/dfa/case_folding.rb +45 -0
  34. data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
  35. data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
  36. data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
  37. data/lib/lexer_kit/dfa/nfa.rb +304 -0
  38. data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
  39. data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
  40. data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
  41. data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
  42. data/lib/lexer_kit/dfa.rb +37 -0
  43. data/lib/lexer_kit/errors.rb +76 -0
  44. data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
  45. data/lib/lexer_kit/format/lkb1.rb +199 -0
  46. data/lib/lexer_kit/format/lkt1.rb +111 -0
  47. data/lib/lexer_kit/format.rb +19 -0
  48. data/lib/lexer_kit/ir/compiled_program.rb +228 -0
  49. data/lib/lexer_kit/ir/constant_pool.rb +107 -0
  50. data/lib/lexer_kit/ir/dfa_table.rb +125 -0
  51. data/lib/lexer_kit/ir/instruction.rb +50 -0
  52. data/lib/lexer_kit/ir/jump_table.rb +94 -0
  53. data/lib/lexer_kit/ir/keyword_table.rb +168 -0
  54. data/lib/lexer_kit/ir/opcode.rb +96 -0
  55. data/lib/lexer_kit/ir/serializer.rb +249 -0
  56. data/lib/lexer_kit/ir.rb +16 -0
  57. data/lib/lexer_kit/runner.rb +114 -0
  58. data/lib/lexer_kit/trie.rb +170 -0
  59. data/lib/lexer_kit/version.rb +5 -0
  60. data/lib/lexer_kit.rb +155 -0
  61. metadata +119 -0
@@ -0,0 +1,154 @@
1
# frozen_string_literal: true

module LexerKit
  module Core
    # Source holds the input byte sequence and optional filename.
    # It provides line/column conversion for diagnostics.
    class Source
      attr_reader :bytes, :filename

      # @param input [String] input string (kept as-is for char conversion,
      #   and copied as BINARY for byte operations)
      # @param filename [String, nil] optional filename for diagnostics
      def initialize(input, filename: nil)
        # Keep our own frozen reference without freezing the caller's object:
        # calling input.freeze would permanently mutate the argument as a
        # side effect visible outside this class.
        @original_string = input.frozen? ? input : input.dup.freeze
        @bytes = input.dup.force_encoding(Encoding::BINARY).freeze
        @filename = filename&.freeze
        @line_starts = nil
      end

      # Length in bytes
      # @return [Integer]
      def length
        @bytes.bytesize
      end

      alias size length

      # Build line index (explicit, not automatic).
      # Call this before using line_col or line_slice on large inputs.
      # @return [self]
      def line_index!
        return self if @line_starts

        @line_starts = [0]
        # String#index on the BINARY string returns byte offsets and scans in
        # native code, which is much faster than a per-byte Ruby loop.
        pos = 0
        while (nl = @bytes.index("\n", pos))
          pos = nl + 1
          @line_starts << pos
        end
        @line_starts.freeze
        self
      end

      # Convert byte offset to line and column (1-based).
      # Builds line index if not already built.
      # @param byte_offset [Integer]
      # @return [Array(Integer, Integer)] [line, column]
      def line_col(byte_offset)
        line_index! unless @line_starts

        # Find the first line start strictly past the offset; its index is the
        # (1-based) line containing the offset. nil means the last line.
        line = @line_starts.bsearch_index { |start| start > byte_offset }
        line ||= @line_starts.length
        line_start = @line_starts[line - 1]
        col = byte_offset - line_start + 1

        [line, col]
      end

      # Get the content of a specific line (1-based), without its newline.
      # @param line [Integer] line number (1-based)
      # @return [String, nil] nil when line is out of range
      def line_slice(line)
        line_index! unless @line_starts
        return nil if line < 1 || line > @line_starts.length

        start = @line_starts[line - 1]
        if line < @line_starts.length
          # Not the last line: slice up to the next line start, then strip
          # the trailing newline.
          end_pos = @line_starts[line]
          content = @bytes.byteslice(start, end_pos - start)
          content.chomp
        else
          # Last line: runs to end of input (no trailing newline to strip).
          @bytes.byteslice(start, @bytes.bytesize - start)
        end
      end

      # Get the number of lines
      # @return [Integer]
      def line_count
        line_index! unless @line_starts
        @line_starts.length
      end

      # Create a span for the given range
      # @param start [Integer] start byte offset
      # @param len [Integer] length
      # @return [Span]
      def span(start, len)
        Span.new(start, len)
      end

      # Extract text for a span
      # @param span [Span]
      # @return [String]
      def text(span)
        span.slice(@bytes)
      end

      # Get the span covering an entire line (1-based).
      # @param line [Integer] line number (1-based)
      # @return [Span] empty span at offset 0 when line is past the end
      def span_for_line(line)
        line_index! unless @line_starts

        line = [line, 1].max
        return Span.new(0, 0) if line > @line_starts.length

        start = @line_starts[line - 1]
        line_end = if line < @line_starts.length
                     # Not the last line - span up to (but not including) newline
                     @line_starts[line] - 1
                   else
                     # Last line
                     @bytes.bytesize
                   end
        Span.new(start, line_end - start)
      end

      # Convert character index to byte offset.
      # For BINARY input, returns char_index directly (O(1)).
      # For other encodings (e.g. UTF-8), computes byte offset (O(n), error paths only).
      # @param char_index [Integer] character position (0-based)
      # @return [Integer] byte offset
      def byte_offset_for_char_index(char_index)
        if @original_string.encoding == Encoding::BINARY
          char_index
        else
          @original_string[0...char_index].bytesize
        end
      end

      # Get the span for a character index.
      # For BINARY input, O(1). For other encodings, O(n) (error paths only).
      # @param char_index [Integer] character position (0-based)
      # @param len [Integer] character length (default: 1)
      # @return [Span]
      def span_for_char_index(char_index, len: 1)
        byte_start = byte_offset_for_char_index(char_index)
        byte_end = byte_offset_for_char_index(char_index + len)
        Span.new(byte_start, byte_end - byte_start)
      end

      def inspect
        filename_str = @filename ? " #{@filename.inspect}" : ""
        "#<LexerKit::Core::Source#{filename_str} #{length} bytes>"
      end
    end
  end
end
@@ -0,0 +1,80 @@
1
# frozen_string_literal: true

module LexerKit
  module Core
    # Span is a byte-offset range into the input, expressed as the
    # half-open interval [start, start + len).
    class Span
      attr_reader :start, :len

      # @param start [Integer] start byte offset (0-based)
      # @param len [Integer] length in bytes
      def initialize(start, len)
        @start = start
        @len = len
      end

      # Exclusive end offset.
      # @return [Integer]
      def end
        @start + @len
      end

      # Whether the span covers zero bytes.
      # @return [Boolean]
      def empty?
        @len.zero?
      end

      # Whether this span fully contains another span.
      # @param other [Span]
      # @return [Boolean]
      def cover?(other)
        other.start >= @start && other.end <= self.end
      end

      # Whether this span shares at least one byte with another span.
      # @param other [Span]
      # @return [Boolean]
      def overlap?(other)
        other.end > @start && other.start < self.end
      end

      # Smallest span containing both this span and another.
      # @param other [Span]
      # @return [Span]
      def merge(other)
        lo = @start < other.start ? @start : other.start
        hi = self.end > other.end ? self.end : other.end
        Span.new(lo, hi - lo)
      end

      # Extract this span's bytes from a byte string.
      # @param bytes [String] source bytes (BINARY encoding)
      # @return [String]
      def slice(bytes)
        bytes.byteslice(@start, @len)
      end

      def ==(other)
        other.is_a?(Span) && [other.start, other.len] == [@start, @len]
      end

      def eql?(other)
        self == other
      end

      def hash
        [@start, @len].hash
      end

      def to_s
        "#{@start}..#{@start + @len}"
      end

      def inspect
        "#<LexerKit::Core::Span #{self}>"
      end
    end
  end
end
@@ -0,0 +1,120 @@
1
# frozen_string_literal: true

module LexerKit
  module Core
    # Token represents a lexed token with position and optional location info.
    # Line and column are lazily computed to minimize overhead.
    class Token
      attr_reader :id, :name, :start, :len, :meta

      # @param id [Integer] token ID
      # @param name [Symbol] token name
      # @param start [Integer] start byte offset
      # @param len [Integer] length in bytes
      # @param source [Source] source object for location lookup
      # @param meta [Hash, nil] optional metadata for this token
      def initialize(id:, name:, start:, len:, source:, meta: nil)
        @id = id
        @name = name
        @start = start
        @len = len
        @source = source
        @meta = meta || {}
        @location = nil # memoized [line, col] pair
      end

      # Alias for name (compatibility with other tools)
      # @return [Symbol]
      alias type name

      # Get the token text
      # @return [String]
      def text
        @source.bytes.byteslice(@start, @len)
      end

      # Get the line number (1-based, lazily computed)
      # @return [Integer]
      def line
        location[0]
      end

      # Get the column number (1-based, lazily computed)
      # @return [Integer]
      def col
        location[1]
      end

      # Get the span object
      # @return [Span]
      def span
        @span ||= Span.new(@start, @len)
      end

      # Check if this is an error token
      # @return [Boolean]
      def error?
        @id == LexerKit::INVALID_TOKEN_ID
      end

      # Create a Diagnostic from this token
      # @param message [String] error message
      # @param level [Symbol] :error, :warning, or :note
      # @param notes [Array<String>, nil] additional notes
      # @return [Diagnostic]
      def to_diagnostic(message, level: :error, notes: nil)
        Diagnostic.new(
          level: level,
          message: message,
          span: span,
          notes: notes
        )
      end

      # Render diagnostic with source context
      # @param message [String] error message
      # @param level [Symbol] :error, :warning, or :note
      # @param notes [Array<String>, nil] additional notes
      # @param color [Boolean] enable ANSI colors
      # @return [String]
      def render_diagnostic(message, level: :error, notes: nil, color: $stdout.tty?)
        to_diagnostic(message, level: level, notes: notes).render(@source, color: color)
      end

      def ==(other)
        return false unless other.is_a?(Token)

        @name == other.name && @start == other.start && @len == other.len
      end

      def eql?(other)
        self == other
      end

      def hash
        [@name, @start, @len].hash
      end

      def to_h
        {
          id: @id,
          name: @name,
          text: text,
          start: @start,
          len: @len,
          line: line,
          col: col
        }
      end

      def to_s
        "#{@name}(#{text.inspect})"
      end

      def inspect
        "#<LexerKit::Core::Token #{@name} #{@start}:#{@len} #{text.inspect}>"
      end

      private

      # Resolve and memoize [line, col] with a single source lookup.
      # The previous code memoized line and col separately, so reading both
      # ran Source#line_col (a binary search) twice per token.
      def location
        @location ||= @source.line_col(@start)
      end
    end
  end
end
@@ -0,0 +1,13 @@
1
# frozen_string_literal: true

# Load order matters: Span and Source come first, then Diagnostic
# (rendered against a Source), then Token (which builds Diagnostics).
require_relative "core/span"
require_relative "core/source"
require_relative "core/diagnostic"
require_relative "core/token"

module LexerKit
  # Core module provides the foundational types shared by all lexer implementations.
  # These types are designed to be zero-allocation friendly and work with byte offsets.
  module Core
  end
end
@@ -0,0 +1,143 @@
1
# frozen_string_literal: true

require "stringio"

module LexerKit
  module Debug
    # Disassembler renders a CompiledProgram's instruction stream as
    # human-readable text: a header, a label at each mode entry point,
    # and one formatted line per instruction.
    class Disassembler
      # @param program [IR::CompiledProgram] program to disassemble
      def initialize(program)
        @program = program
      end

      # Disassemble all instructions
      # @param io [IO, nil] output stream; when nil, output is buffered
      # @return [String, nil] disassembled output, or nil when io was given
      def disassemble(io: nil)
        output = io || StringIO.new

        # Header
        output.puts "# LexerKit Disassembly"
        output.puts "# Version: #{@program.version}"
        output.puts "# Instructions: #{@program.instructions.size}"
        output.puts "# Tokens: #{@program.token_names.join(', ')}"
        output.puts "# Modes: #{@program.mode_names.join(', ')}"
        output.puts

        # Invert mode_offsets (name => offset) so a label can be printed
        # at the offset where each mode begins.
        mode_labels = {}
        @program.mode_offsets.each do |name, offset|
          mode_labels[offset] = name
        end

        # Disassemble each instruction
        @program.instructions.each_with_index do |instr, offset|
          if mode_labels[offset]
            output.puts
            output.puts "#{mode_labels[offset]}:"
          end

          output.puts format_instruction(offset, instr)
        end

        io ? nil : output.string
      end

      private

      # One line of output: zero-padded address, padded mnemonic, arguments.
      def format_instruction(offset, instr)
        addr = format("%04d", offset)
        name = IR::Opcode.name(instr.opcode).ljust(16)
        arg_str = format_arg(instr)

        "  #{addr}: #{name}#{arg_str}"
      end

      # Render an instruction's argument according to its opcode.
      # Opcodes with identical argument formats share a `when` clause,
      # which removes the duplicate branches the old code suppressed
      # with `rubocop:disable Lint/DuplicateBranch`.
      def format_arg(instr)
        case instr.opcode
        when IR::Opcode::DFA_RUN
          "dfa=#{instr.arg}"
        when IR::Opcode::DFA_RUN_IF_MATCH
          dfa_id, fail_target = unpack_arg(instr.arg)
          "dfa=#{dfa_id} fail->#{format('%04d', fail_target)}"
        when IR::Opcode::SCAN_UNTIL, IR::Opcode::MATCH_LITERAL
          literal = @program.constant_pool.get(instr.arg)
          "const=#{instr.arg} #{literal.inspect}"
        when IR::Opcode::MATCH_LITERAL_OR_JUMP
          const_id, fail_target = unpack_arg(instr.arg)
          literal = @program.constant_pool.get(const_id)
          "const=#{const_id} #{literal.inspect} fail->#{format('%04d', fail_target)}"
        when IR::Opcode::MATCH_RANGE
          min_byte = (instr.arg >> 8) & 0xFF
          max_byte = instr.arg & 0xFF
          "[#{format_byte(min_byte)}-#{format_byte(max_byte)}]"
        when IR::Opcode::SWITCH_BYTE, IR::Opcode::KEYWORD_LOOKUP
          "table=#{instr.arg}"
        when IR::Opcode::JUMP, IR::Opcode::JUMP_IF_EOF,
             IR::Opcode::COMMIT_BEST, IR::Opcode::EMIT_SKIP_AND_JUMP
          "-> #{format('%04d', instr.arg)}"
        when IR::Opcode::PUSH_MODE
          mode_name = @program.mode_names[instr.arg] || instr.arg
          "mode=#{mode_name}"
        when IR::Opcode::EMIT, IR::Opcode::EMIT_ERROR
          token_name = @program.token_name(instr.arg) || instr.arg
          "token=#{token_name}"
        when IR::Opcode::EMIT_AND_JUMP
          token_id, jump_target = unpack_arg(instr.arg)
          token_name = @program.token_name(token_id) || token_id
          "token=#{token_name} -> #{format('%04d', jump_target)}"
        when IR::Opcode::SET_MATCH
          order, action_ip = unpack_arg(instr.arg)
          "order=#{order} -> #{format('%04d', action_ip)}"
        when IR::Opcode::LITERAL_TRIE_RUN
          "const=#{instr.arg}"
        when IR::Opcode::LITERAL_TRIE_COMMIT
          const_id, fail_target = unpack_arg(instr.arg)
          "const=#{const_id} fail->#{format('%04d', fail_target)}"
        when IR::Opcode::POP_MODE, IR::Opcode::EMIT_SKIP, IR::Opcode::MARK,
             IR::Opcode::CLEAR_BEST, IR::Opcode::HALT
          # These opcodes carry no argument.
          ""
        else
          instr.arg.to_s
        end
      end

      # Split a packed argument into [high 10 bits, low 14 bits] — the
      # id/target encoding used by the two-operand opcodes above.
      def unpack_arg(arg)
        [(arg >> 14) & 0x3FF, arg & 0x3FFF]
      end

      # Printable ASCII bytes render as 'c', everything else as 0xNN.
      def format_byte(byte)
        if byte >= 0x20 && byte < 0x7F
          "'#{byte.chr}'"
        else
          format("0x%02X", byte)
        end
      end
    end
  end
end
+ end