lexer_kit 0.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (61) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +157 -0
  4. data/exe/lexer_kit +7 -0
  5. data/ext/lexer_kit_rust/Cargo.toml +17 -0
  6. data/ext/lexer_kit_rust/extconf.rb +6 -0
  7. data/ext/lexer_kit_rust/src/deserializer.rs +213 -0
  8. data/ext/lexer_kit_rust/src/dfa.rs +217 -0
  9. data/ext/lexer_kit_rust/src/fast_stream.rs +468 -0
  10. data/ext/lexer_kit_rust/src/lib.rs +248 -0
  11. data/ext/lexer_kit_rust/src/opcodes.rs +718 -0
  12. data/ext/lexer_kit_rust/src/safety_test.rs +498 -0
  13. data/ext/lexer_kit_rust/src/trie.rs +206 -0
  14. data/ext/lexer_kit_rust/src/types.rs +319 -0
  15. data/ext/lexer_kit_rust/src/vm.rs +258 -0
  16. data/lib/lexer_kit/builder/compiler.rb +596 -0
  17. data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
  18. data/lib/lexer_kit/builder/mode_def.rb +36 -0
  19. data/lib/lexer_kit/builder/token_def.rb +65 -0
  20. data/lib/lexer_kit/builder/validator.rb +84 -0
  21. data/lib/lexer_kit/builder.rb +230 -0
  22. data/lib/lexer_kit/cli/commands.rb +389 -0
  23. data/lib/lexer_kit/cli.rb +88 -0
  24. data/lib/lexer_kit/core/diagnostic.rb +103 -0
  25. data/lib/lexer_kit/core/source.rb +154 -0
  26. data/lib/lexer_kit/core/span.rb +80 -0
  27. data/lib/lexer_kit/core/token.rb +120 -0
  28. data/lib/lexer_kit/core.rb +13 -0
  29. data/lib/lexer_kit/debug/disassembler.rb +143 -0
  30. data/lib/lexer_kit/debug/visualizer.rb +203 -0
  31. data/lib/lexer_kit/debug.rb +11 -0
  32. data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
  33. data/lib/lexer_kit/dfa/case_folding.rb +45 -0
  34. data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
  35. data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
  36. data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
  37. data/lib/lexer_kit/dfa/nfa.rb +304 -0
  38. data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
  39. data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
  40. data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
  41. data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
  42. data/lib/lexer_kit/dfa.rb +37 -0
  43. data/lib/lexer_kit/errors.rb +76 -0
  44. data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
  45. data/lib/lexer_kit/format/lkb1.rb +199 -0
  46. data/lib/lexer_kit/format/lkt1.rb +111 -0
  47. data/lib/lexer_kit/format.rb +19 -0
  48. data/lib/lexer_kit/ir/compiled_program.rb +228 -0
  49. data/lib/lexer_kit/ir/constant_pool.rb +107 -0
  50. data/lib/lexer_kit/ir/dfa_table.rb +125 -0
  51. data/lib/lexer_kit/ir/instruction.rb +50 -0
  52. data/lib/lexer_kit/ir/jump_table.rb +94 -0
  53. data/lib/lexer_kit/ir/keyword_table.rb +168 -0
  54. data/lib/lexer_kit/ir/opcode.rb +96 -0
  55. data/lib/lexer_kit/ir/serializer.rb +249 -0
  56. data/lib/lexer_kit/ir.rb +16 -0
  57. data/lib/lexer_kit/runner.rb +114 -0
  58. data/lib/lexer_kit/trie.rb +170 -0
  59. data/lib/lexer_kit/version.rb +5 -0
  60. data/lib/lexer_kit.rb +155 -0
  61. metadata +119 -0
@@ -0,0 +1,389 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LexerKit
4
+ module CLI
5
+ module Commands
6
# Compile a lexer DSL file to the .lkt1 or .lkb1 format.
module Compile
  # Extension used when no explicit output path is given.
  DEFAULT_EXTENSION = ".lkt1"

  # Runs the `compile` subcommand.
  #
  # The output format is picked from the output path: a path ending in
  # ".lkb1" is written with Format::LKB1, anything else with Format::LKT1.
  #
  # @param argv [Array<String>] arguments following the subcommand name;
  #   recognised switches are removed in place by OptionParser
  # @return [Integer] 0 on success (also for --help and --dry-run), 1 when
  #   no input file was given or the input file does not exist
  def self.run(argv)
    options = { output: nil, verbose: false, dry_run: false }

    parser = OptionParser.new do |opts|
      opts.banner = "Usage: lexer_kit compile [options] <file.rb>"

      opts.on("-o", "--output FILE", "Output file (default: <input>.lkt1)") { |v| options[:output] = v }
      opts.on("--verbose", "Show conflict warnings") { options[:verbose] = true }
      opts.on("-n", "--dry-run", "Check for conflicts without generating output") { options[:dry_run] = true }

      # `return` inside the block leaves `run` itself, because the block is
      # only invoked from `parser.parse!` below.
      opts.on("-h", "--help", "Show this help") do
        puts opts
        return 0
      end
    end

    parser.parse!(argv)

    if argv.empty?
      warn "error: No input file specified"
      warn parser.banner
      return 1
    end

    path = argv.shift
    unless File.exist?(path)
      warn "error: File not found: #{path}"
      return 1
    end

    builder = LexerKit.load_builder(path)

    # Conflicts are only computed when the user asked to see them.
    if options[:verbose] || options[:dry_run]
      conflicts = builder.check_conflicts
      if conflicts.any?
        conflicts.each do |c|
          warn "warning: #{c.token1} vs #{c.token2}: #{c.description}"
        end
        warn ""
        warn "#{conflicts.size} potential conflict(s) found"
      elsif options[:dry_run]
        warn "No conflicts found."
      end
    end

    # A dry run stops before compiling or writing anything.
    return 0 if options[:dry_run]

    compiled = builder.compile

    output_path = options[:output] || default_output_path(path)
    if output_path.end_with?(".lkb1")
      Format::LKB1.save(compiled, path: output_path)
    else
      Format::LKT1.save(compiled, path: output_path)
    end
    warn "Compiled: #{output_path}"

    0
  end

  # Derives the output path used when --output is not given.
  #
  # A trailing ".rb" is replaced by ".lkt1". When the input has no ".rb"
  # suffix the extension is appended instead, so the result can never be the
  # input path itself (which would overwrite the DSL source).
  #
  # @param path [String] input DSL path
  # @return [String] output path ending in ".lkt1"
  def self.default_output_path(path)
    # \z anchors at the very end of the string; `$` would also match before
    # an embedded newline.
    "#{path.sub(/\.rb\z/, "")}#{DEFAULT_EXTENSION}"
  end
end
83
+
84
# Show summary information about a compiled lexer file.
module Info
  # Number of bytes in one KB / number of KB in one MB as used by format_size.
  UNIT = 1024

  # Runs the `info` subcommand: loads the lexer and prints its version,
  # token names, modes and table counts to stdout.
  #
  # @param argv [Array<String>] arguments following the subcommand name
  # @return [Integer] 0 on success (also for --help), 1 when no file is given
  def self.run(argv)
    parser = OptionParser.new do |opts|
      opts.banner = "Usage: lexer_kit info <file>"

      # `return` leaves `run`, since the block only fires inside parse!.
      opts.on("-h", "--help", "Show this help") do
        puts opts
        return 0
      end
    end

    parser.parse!(argv)

    if argv.empty?
      warn "error: No input file specified"
      warn parser.banner
      return 1
    end

    lexer = CLI.load_lexer(argv.shift)

    puts "LexerKit Compiled Program"
    puts " Version: #{lexer.version}"
    puts " Native: #{LexerKit.native? ? 'enabled' : 'disabled'}"
    puts

    puts "Tokens (#{lexer.tokens.size}):"
    lexer.tokens.each_with_index do |name, id|
      puts " #{id.to_s.rjust(3)}: #{name}"
    end
    puts

    puts "Modes (#{lexer.modes.size}):"
    lexer.modes.each do |name|
      puts " - #{name} (offset: #{lexer.mode_offset(name)})"
    end
    puts

    puts "Statistics:"
    puts " Instructions: #{lexer.instructions.size}"
    puts " DFA tables: #{lexer.dfa_tables.size}"
    puts " Jump tables: #{lexer.jump_tables.size}"
    puts " Keywords: #{lexer.keyword_tables.size}"
    puts " Constants: #{lexer.constant_pool.size}"
    puts " Binary size: #{format_size(lexer.to_binary.bytesize)}"

    0
  end

  # Formats a byte count as "N bytes", "N.N KB" or "N.NN MB" (1024-based).
  #
  # The unit is chosen from the rounded value, so sizes just below 1 MB are
  # shown as "1.0 MB" rather than "1024.0 KB".
  #
  # @param bytes [Integer] size in bytes
  # @return [String]
  def self.format_size(bytes)
    return "#{bytes} bytes" if bytes < UNIT

    kilobytes = (bytes / UNIT.to_f).round(1)
    return "#{kilobytes} KB" if kilobytes < UNIT

    "#{(bytes / UNIT.to_f / UNIT.to_f).round(2)} MB"
  end
end
149
+
150
# Tokenize input with a compiled lexer and print the resulting tokens.
module Lex
  # Column widths shared by the table header, rule and rows so that they
  # cannot drift apart.
  POS_WIDTH = 8
  NAME_WIDTH = 15
  TEXT_RULE_WIDTH = 20
  # Maximum width of the inspected token text in table output.
  MAX_TEXT_WIDTH = 40

  # Runs the `lex` subcommand.
  #
  # Input is read from the second positional argument, or from stdin when
  # only the lexer file is given.
  #
  # @param argv [Array<String>] arguments following the subcommand name
  # @return [Integer] 0 on success (also for --help), 1 when no lexer file
  #   is given
  def self.run(argv)
    options = { format: "table", color: $stdout.tty? }

    parser = OptionParser.new do |opts|
      opts.banner = "Usage: lexer_kit lex <lexer> [input]"

      opts.on("-f", "--format FORMAT", %w[table json simple], "Output format (table, json, simple)") do |v|
        options[:format] = v
      end

      opts.on("--no-color", "Disable colored output") { options[:color] = false }

      # `return` leaves `run`, since the block only fires inside parse!.
      opts.on("-h", "--help", "Show this help") do
        puts opts
        return 0
      end
    end

    parser.parse!(argv)

    if argv.empty?
      warn "error: No lexer file specified"
      warn parser.banner
      return 1
    end

    lexer = CLI.load_lexer(argv.shift)
    input = argv.empty? ? $stdin.read : File.read(argv.shift)

    source = Core::Source.new(input)
    source.line_index!

    tokens = []
    lexer.lowlevel_each(input) do |tok_id, start, len|
      line, col = source.line_col(start)
      tokens << {
        id: tok_id,
        name: lexer.token_name(tok_id),
        text: input.byteslice(start, len),
        line: line,
        col: col,
        start: start,
        len: len
      }
    end

    case options[:format]
    when "json" then output_json(tokens)
    when "simple" then output_simple(tokens)
    else output_table(tokens, options[:color])
    end

    0
  end

  # Prints the tokens as a pretty-printed JSON array.
  #
  # @param tokens [Array<Hash>] hashes with :name, :text, :line, :col,
  #   :start and :len keys
  # @return [void]
  def self.output_json(tokens)
    records = tokens.map do |t|
      { token: t[:name].to_s, text: t[:text], line: t[:line], col: t[:col], start: t[:start], len: t[:len] }
    end
    puts JSON.pretty_generate(records)
  end

  # Prints one "NAME "text"" line per token.
  #
  # @param tokens [Array<Hash>] hashes with :name and :text keys
  # @return [void]
  def self.output_simple(tokens)
    tokens.each { |t| puts "#{t[:name]} #{t[:text].inspect}" }
  end

  # Prints the tokens as an aligned table followed by a token count.
  #
  # Header, rule and rows are all built from the same column widths, so the
  # "TOKEN" and "TEXT" headings sit above their columns.
  #
  # @param tokens [Array<Hash>] hashes with :line, :col, :name and :text keys
  # @param color [Boolean] wrap the token name in ANSI cyan when true
  # @return [void]
  def self.output_table(tokens, color)
    puts " #{'LINE:COL'.rjust(POS_WIDTH)} #{'TOKEN'.ljust(NAME_WIDTH)} TEXT"
    puts " #{'-' * POS_WIDTH} #{'-' * NAME_WIDTH} #{'-' * TEXT_RULE_WIDTH}"

    tokens.each do |t|
      pos = "#{t[:line]}:#{t[:col]}".rjust(POS_WIDTH)
      # Pad before colouring so the escape codes do not count towards width.
      name = t[:name].to_s.ljust(NAME_WIDTH)
      name = "\e[36m#{name}\e[0m" if color
      puts " #{pos} #{name} #{truncate(t[:text].inspect, MAX_TEXT_WIDTH)}"
    end

    puts
    puts "#{tokens.size} tokens"
  end

  # Shortens +str+ to at most +max+ characters, ending in "..." when cut.
  #
  # @param str [String]
  # @param max [Integer] maximum length of the result
  # @return [String]
  def self.truncate(str, max)
    return str unless str.length > max

    "#{str[0, max - 3]}..."
  end
end
270
+
271
# Disassemble a compiled lexer file (.lkt1 or .lkb1).
module Disasm
  # Runs the `disasm` subcommand.
  #
  # With --dfa, --jump or --keyword the selected table is printed (checked
  # in that order; the first one given wins). Without any of them the whole
  # program is disassembled.
  #
  # @param argv [Array<String>] arguments following the subcommand name
  # @return [Integer] 0 on success (also for --help), 1 when no file is given
  # @raise [ArgumentError] when the requested table index does not exist
  def self.run(argv)
    options = { dfa: nil, jump: nil, keyword: nil }

    parser = OptionParser.new do |opts|
      opts.banner = "Usage: lexer_kit disasm [options] <file.lkt1|file.lkb1>"

      opts.on("--dfa INDEX", Integer, "Show DFA table at INDEX") { |v| options[:dfa] = v }
      opts.on("--jump INDEX", Integer, "Show jump table at INDEX") { |v| options[:jump] = v }
      opts.on("--keyword INDEX", Integer, "Show keyword table at INDEX") { |v| options[:keyword] = v }

      # `return` leaves `run`, since the block only fires inside parse!.
      opts.on("-h", "--help", "Show this help") do
        puts opts
        return 0
      end
    end

    parser.parse!(argv)

    if argv.empty?
      warn "error: No input file specified"
      warn parser.banner
      return 1
    end

    program = CLI.load_lexer(argv.shift)

    # Debug helpers are loaded only when this command is actually used.
    require "lexer_kit/debug"

    if options[:dfa]
      dfa = fetch_table(program.dfa_tables, options[:dfa], "DFA table")
      puts Debug::Visualizer.format_dfa(dfa, program: program)
    elsif options[:jump]
      table = fetch_table(program.jump_tables, options[:jump], "Jump table")
      puts Debug::Visualizer.format_jump_table(table)
    elsif options[:keyword]
      table = fetch_table(program.keyword_tables, options[:keyword], "Keyword table")
      puts Debug::Visualizer.format_keyword_table(table, program: program)
    else
      puts Debug::Disassembler.new(program).disassemble
    end

    0
  end

  # Looks up tables[index], rejecting indices that do not name a table.
  #
  # Negative indices are refused explicitly: plain `tables[-1]` would
  # silently return the last table instead of reporting an error.
  #
  # @param tables [Array] indexable collection of tables
  # @param index [Integer] index requested on the command line
  # @param label [String] table kind used in the error message
  # @return [Object] the table at +index+
  # @raise [ArgumentError] when +index+ is negative or no table is stored there
  def self.fetch_table(tables, index, label)
    table = tables[index] unless index.negative?
    raise ArgumentError, "#{label} #{index} not found" unless table

    table
  end
end
332
+
333
# Emit one DFA table of a compiled lexer as Graphviz DOT.
module Visualize
  # Runs the `visualize` subcommand.
  #
  # The DOT text goes to the file given with --output, or to stdout.
  #
  # @param argv [Array<String>] arguments following the subcommand name
  # @return [Integer] 0 on success (also for --help), 1 when no file is given
  # @raise [ArgumentError] when the requested DFA table does not exist
  def self.run(argv)
    options = { dfa: 0, output: nil }

    parser = OptionParser.new do |opts|
      opts.banner = "Usage: lexer_kit visualize [options] <file.lkt1|file.lkb1>"

      opts.on("--dfa INDEX", Integer, "DFA table index (default: 0)") { |v| options[:dfa] = v }
      opts.on("-o", "--output FILE", "Output file (default: stdout)") { |v| options[:output] = v }

      # `return` leaves `run`, since the block only fires inside parse!.
      opts.on("-h", "--help", "Show this help") do
        puts opts
        puts
        puts "Examples:"
        puts " lexer_kit visualize lexer.lkt1 | dot -Tpng -o dfa.png"
        puts " lexer_kit visualize --dfa 1 lexer.lkt1 -o dfa.dot"
        return 0
      end
    end

    parser.parse!(argv)

    if argv.empty?
      warn "error: No input file specified"
      warn parser.banner
      return 1
    end

    program = CLI.load_lexer(argv.shift)

    # Debug helpers are loaded only when this command is actually used.
    require "lexer_kit/debug"

    dfa = fetch_dfa(program.dfa_tables, options[:dfa])
    dot = Debug::Visualizer.dfa_to_dot(dfa, program: program)

    if options[:output]
      File.write(options[:output], dot)
      warn "Written: #{options[:output]}"
    else
      puts dot
    end

    0
  end

  # Looks up the DFA table at +index+.
  #
  # Negative indices are refused explicitly: plain `tables[-1]` would
  # silently return the last table instead of reporting an error.
  #
  # @param tables [Array] indexable collection of DFA tables
  # @param index [Integer] index requested on the command line
  # @return [Object] the table at +index+
  # @raise [ArgumentError] when +index+ is negative or no table is stored there
  def self.fetch_dfa(tables, index)
    dfa = tables[index] unless index.negative?
    raise ArgumentError, "DFA table #{index} not found" unless dfa

    dfa
  end
end
387
+ end
388
+ end
389
+ end
@@ -0,0 +1,88 @@
1
+ # frozen_string_literal: true
2
+
3
+ require "optparse"
4
+ require "json"
5
+
6
+ module LexerKit
7
# Entry point and dispatcher for the `lexer_kit` executable.
module CLI
  VERSION = LexerKit::VERSION

  class << self
    # Dispatches to the subcommand named by the first argument.
    #
    # Any StandardError raised while handling the command is reported through
    # render_error and turned into exit status 1.
    #
    # @param argv [Array<String>] raw command-line arguments; the subcommand
    #   name is shifted off before the rest is handed to the command
    # @return [Integer] process exit status
    def run(argv)
      if argv.empty? || argv[0] == "-h" || argv[0] == "--help"
        print_help
        return 0
      end

      if ["-v", "--version"].include?(argv[0])
        puts "lexer_kit #{VERSION}"
        return 0
      end

      command = argv.shift
      case command
      when "compile" then Commands::Compile.run(argv)
      when "lex" then Commands::Lex.run(argv)
      when "info" then Commands::Info.run(argv)
      when "disasm" then Commands::Disasm.run(argv)
      when "visualize" then Commands::Visualize.run(argv)
      else
        warn "Unknown command: #{command}"
        warn "Run 'lexer_kit --help' for usage."
        1
      end
    rescue StandardError => e
      render_error(e)
      1
    end

    # Writes an error to stderr, using the error's own `render` when it
    # provides one and a plain "error: message" line otherwise.
    #
    # @param error [Exception]
    # @param color [Boolean] passed through to `error.render`
    # @return [void]
    def render_error(error, color: $stderr.tty?)
      text = error.respond_to?(:render) ? error.render(color: color) : "error: #{error.message}"
      warn text
    end

    # Prints the top-level usage text to stdout.
    #
    # @return [void]
    def print_help
      usage = <<~HELP
        Usage: lexer_kit <command> [options] [arguments]

        Commands:
        compile Compile DSL file to .lkt1 or .lkb1
        lex Tokenize input with a lexer
        info Show lexer information
        disasm Disassemble a .lkt1 or .lkb1 file
        visualize Output DFA as Graphviz DOT format

        Options:
        -h, --help Show this help
        -v, --version Show version

        Examples:
        lexer_kit compile json_lexer.rb
        lexer_kit compile json_lexer.rb --verbose # show conflict warnings
        lexer_kit compile json_lexer.rb --dry-run # check conflicts only
        lexer_kit lex json_lexer.lkt1 data.json
        lexer_kit info json_lexer.lkt1
        lexer_kit lex json_lexer.lkb1 data.json
        lexer_kit info json_lexer.lkb1
        lexer_kit visualize json_lexer.lkt1 | dot -Tpng -o dfa.png
      HELP
      puts usage
    end

    # Loads a lexer from a .lkt1 or .lkb1 file by delegating to
    # LexerKit.load_lexer.
    #
    # @param path [String]
    # @return [Object] whatever LexerKit.load_lexer returns
    def load_lexer(path)
      LexerKit.load_lexer(path)
    end
  end
end
86
+ end
87
+
88
+ require_relative "cli/commands"
@@ -0,0 +1,103 @@
1
+ # frozen_string_literal: true
2
+
3
+ module LexerKit
4
+ module Core
5
# Diagnostic holds a message with a severity level, a source span and
# optional notes, and can render itself with the offending source line.
class Diagnostic
  attr_reader :level, :message, :span, :notes

  LEVEL_LABELS = {
    error: "error",
    warning: "warning",
    note: "note"
  }.freeze

  LEVEL_COLORS = {
    error: "\e[1;31m", # bold red
    warning: "\e[1;33m", # bold yellow
    note: "\e[1;36m" # bold cyan
  }.freeze

  RESET = "\e[0m"
  BOLD = "\e[1m"
  BLUE = "\e[34m"

  # @param level [Symbol] :error, :warning, or :note
  # @param message [String] main error message
  # @param span [Span] location in source
  # @param notes [Array<String>, nil] additional notes
  def initialize(level:, message:, span:, notes: nil)
    @level = level
    @message = message
    @span = span
    # Freeze a private copy: freezing the argument itself would make the
    # caller's own array immutable as a side effect.
    @notes = notes&.dup&.freeze
  end

  # Render the diagnostic with source context.
  #
  # The result has a "file:line:col: level: message" header, then (when the
  # source can supply the line) the source line with a caret marker beneath
  # it, then one line per note.
  #
  # @param source [Source] must respond to line_col, filename and line_slice
  # @param color [Boolean] enable ANSI colors
  # @return [String] lines joined with "\n", without a trailing newline
  def render(source, color: $stdout.tty?)
    line_num, col = source.line_col(@span.start)

    lines = [header_line(source, line_num, col, color)]
    lines.concat(context_lines(source.line_slice(line_num), line_num, col, color))
    lines.concat(note_lines(color))
    lines.join("\n")
  end

  def to_s
    "#{LEVEL_LABELS[@level]}: #{@message} at #{@span}"
  end

  def inspect
    "#<LexerKit::Core::Diagnostic #{@level} #{@span}: #{@message.inspect}>"
  end

  private

  # Builds the "filename:line:col: level: message" header line.
  def header_line(source, line_num, col, color)
    prefix = source.filename ? "#{source.filename}:" : ""
    loc = "#{prefix}#{line_num}:#{col}"
    return "#{loc}: #{LEVEL_LABELS[@level]}: #{@message}" unless color

    level_str = "#{LEVEL_COLORS[@level]}#{LEVEL_LABELS[@level]}#{RESET}"
    "#{BOLD}#{loc}:#{RESET} #{level_str}: #{BOLD}#{@message}#{RESET}"
  end

  # Builds the source line and the caret line; empty when the source has no
  # content for the line.
  def context_lines(line_content, line_num, col, color)
    return [] unless line_content

    line_num_str = line_num.to_s
    gutter = " " * line_num_str.length

    # Clamp the marker to the rest of the line, but always show one caret.
    highlight_len = [@span.len, line_content.length - col + 1].min
    highlight_len = 1 if highlight_len < 1
    carets = "^#{'~' * (highlight_len - 1)}"
    padding = " " * (col - 1)

    if color
      [
        " #{BLUE}#{line_num_str} |#{RESET} #{line_content}",
        " #{gutter} #{BLUE}|#{RESET} #{padding}#{LEVEL_COLORS[@level]}#{carets}#{RESET}"
      ]
    else
      [
        " #{line_num_str} | #{line_content}",
        " #{gutter} | #{padding}#{carets}"
      ]
    end
  end

  # Builds one " note: ..." line per note; empty when there are no notes.
  def note_lines(color)
    return [] unless @notes

    label = color ? "#{BOLD}note:#{RESET}" : "note:"
    @notes.map { |note| " #{label} #{note}" }
  end
end
102
+ end
103
+ end