lexer_kit 0.5.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (51) hide show
  1. checksums.yaml +7 -0
  2. data/LICENSE.txt +21 -0
  3. data/README.md +157 -0
  4. data/exe/lexer_kit +7 -0
  5. data/lib/lexer_kit/builder/compiler.rb +596 -0
  6. data/lib/lexer_kit/builder/conflict_detector.rb +209 -0
  7. data/lib/lexer_kit/builder/mode_def.rb +36 -0
  8. data/lib/lexer_kit/builder/token_def.rb +65 -0
  9. data/lib/lexer_kit/builder/validator.rb +84 -0
  10. data/lib/lexer_kit/builder.rb +230 -0
  11. data/lib/lexer_kit/cli/commands.rb +389 -0
  12. data/lib/lexer_kit/cli.rb +88 -0
  13. data/lib/lexer_kit/core/diagnostic.rb +103 -0
  14. data/lib/lexer_kit/core/source.rb +154 -0
  15. data/lib/lexer_kit/core/span.rb +80 -0
  16. data/lib/lexer_kit/core/token.rb +120 -0
  17. data/lib/lexer_kit/core.rb +13 -0
  18. data/lib/lexer_kit/debug/disassembler.rb +143 -0
  19. data/lib/lexer_kit/debug/visualizer.rb +203 -0
  20. data/lib/lexer_kit/debug.rb +11 -0
  21. data/lib/lexer_kit/dfa/byte_class_builder.rb +69 -0
  22. data/lib/lexer_kit/dfa/case_folding.rb +45 -0
  23. data/lib/lexer_kit/dfa/char_class_collector.rb +81 -0
  24. data/lib/lexer_kit/dfa/dfa_builder.rb +95 -0
  25. data/lib/lexer_kit/dfa/dfa_minimizer.rb +158 -0
  26. data/lib/lexer_kit/dfa/nfa.rb +304 -0
  27. data/lib/lexer_kit/dfa/regex_ast.rb +64 -0
  28. data/lib/lexer_kit/dfa/regex_parser.rb +385 -0
  29. data/lib/lexer_kit/dfa/utf8_range.rb +175 -0
  30. data/lib/lexer_kit/dfa/utf8_range_pattern.rb +17 -0
  31. data/lib/lexer_kit/dfa.rb +37 -0
  32. data/lib/lexer_kit/errors.rb +76 -0
  33. data/lib/lexer_kit/format/lkb1/decoder.rb +126 -0
  34. data/lib/lexer_kit/format/lkb1.rb +199 -0
  35. data/lib/lexer_kit/format/lkt1.rb +111 -0
  36. data/lib/lexer_kit/format.rb +19 -0
  37. data/lib/lexer_kit/ir/compiled_program.rb +228 -0
  38. data/lib/lexer_kit/ir/constant_pool.rb +107 -0
  39. data/lib/lexer_kit/ir/dfa_table.rb +125 -0
  40. data/lib/lexer_kit/ir/instruction.rb +50 -0
  41. data/lib/lexer_kit/ir/jump_table.rb +94 -0
  42. data/lib/lexer_kit/ir/keyword_table.rb +168 -0
  43. data/lib/lexer_kit/ir/opcode.rb +96 -0
  44. data/lib/lexer_kit/ir/serializer.rb +249 -0
  45. data/lib/lexer_kit/ir.rb +16 -0
  46. data/lib/lexer_kit/runner.rb +114 -0
  47. data/lib/lexer_kit/trie.rb +170 -0
  48. data/lib/lexer_kit/version.rb +5 -0
  49. data/lib/lexer_kit.rb +155 -0
  50. data/lib/lexer_kit_rust/lexer_kit_rust.so +0 -0
  51. metadata +111 -0
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: 9fd860b692d5da23c51450ba74fe6299a46c45d8c2d398e2b788e27358291c33
4
+ data.tar.gz: 35e0bd59a4117e2d38de220864b8a7e741afe41da15986da4751d45badcaa9df
5
+ SHA512:
6
+ metadata.gz: 16adcd34af9e6e1af15d9db2759eadac6e4a2bb9b8f9662c64383414503f13495677beead68dfac06ac4fc78cf3d242287f61bce6f1aa05ddb9992d0ce58f9b6
7
+ data.tar.gz: f96ff94f0211b210a93baacad0ef8691a34384d6b860aaaae68885fa31cde9f4b753f200cf99a8c860ed3d4385f19166e9886f768ced9e70b0cf1869fc461091
data/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2026 Masayoshi Takahashi (takahashim)
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,157 @@
1
+ # LexerKit
2
+
3
+ A high-performance lexer toolkit for Ruby.
4
+ Define tokenizers with a Ruby DSL and run them through a Rust native extension.
5
+
6
+ ## Features
7
+
8
+ - DSL-based lexer definition
9
+ - Fast stream lexing with minimal allocation
10
+ - On-demand token object creation for diagnostics
11
+ - Compiled lexer serialization
12
+ - Regex-based token patterns compiled to DFA
13
+
14
+ ## Installation
15
+
16
+ ```ruby
17
+ # Gemfile
18
+ gem "lexer_kit"
19
+ ```
20
+
21
+ ```bash
22
+ bundle install
23
+ ```
24
+
25
+ ## Quick Start
26
+
27
+ ```ruby
28
+ require "lexer_kit"
29
+
30
+ lexer = LexerKit.build do
31
+ token :NUMBER, /[0-9]+/
32
+ token :PLUS, "+"
33
+ token :MINUS, "-"
34
+ token :SPACE, /[ \t\r\n]+/, skip: true
35
+ end.compile
36
+
37
+ stream = lexer.stream("12 + 34 - 5")
38
+ until stream.eof?
39
+ puts "#{stream.token_name}: #{stream.text.inspect}"
40
+ stream.advance
41
+ end
42
+ ```
43
+
44
+ ## Core DSL
45
+
46
+ ### `token`
47
+
48
+ ```ruby
49
+ token :IDENT, /[a-zA-Z_][a-zA-Z0-9_]*/
50
+ token :ARROW, "->"
51
+ token :SPACE, /[ \t]+/, skip: true
52
+ token :DQUOTE, '"', push: :string
53
+ token :END_Q, '"', pop: true
54
+ ```
55
+
56
+ Options:
57
+
58
+ - `skip: true` matches the token but does not emit it
59
+ - `push: :mode_name` pushes a mode
60
+ - `pop: true` pops the current mode
61
+
62
+ ### `keyword` / `define_keywords`
63
+
64
+ ```ruby
65
+ token :IDENT, /[a-z_]+/
66
+ keyword :IF, "if"
67
+ define_keywords :else, :while, :return
68
+ ```
69
+
70
+ ### `mode`
71
+
72
+ ```ruby
73
+ LexerKit.build do
74
+ token :DQUOTE, '"', push: :string
75
+ token :IDENT, /[a-z]+/
76
+
77
+ mode :string do
78
+ token :CONTENT, /[^"\\]+/
79
+ token :ESCAPE, /\\./
80
+ token :DQUOTE, '"', pop: true
81
+ end
82
+ end
83
+ ```
84
+
85
+ ### `scan_until` / `delimited`
86
+
87
+ ```ruby
88
+ scan_until :BLOCK_COMMENT, open: "/*", close: "*/", skip: true
89
+
90
+ delimited :TEXT, delimiter: "{{" do
91
+ token :IDENT, /[a-zA-Z_]+/
92
+ token :DOT, "."
93
+ token :CLOSE, "}}", pop: true
94
+ end
95
+ ```
96
+
97
+ ### `utf8_range`
98
+
99
+ ```ruby
100
+ token :HIRAGANA, LexerKit.utf8_range("ぁ".."ん")
101
+ token :CJK, LexerKit.utf8_range(0x4E00..0x9FFF)
102
+ ```
103
+
104
+ ## Regex Notes
105
+
106
+ - Most common regex syntax is supported (`[]`, quantifiers, groups, alternation, escapes, `/.../i`)
107
+ - Backtracking-dependent features are not supported (lookaround, backreference, etc.)
108
+ - Anchors and word-boundary assertions are not used in lexer matching
109
+ - Lazy quantifiers (`*?`, `+?`, `??`) are parsed but behave as longest-match (DFA behavior)
110
+
111
+ ## Stream API and Error Handling
112
+
113
+ `stream.start` and `stream.len` are measured in bytes: `start` is a byte offset and `len` is a byte length.
114
+
115
+ ```ruby
116
+ stream = lexer.stream(input)
117
+ until stream.eof?
118
+ if stream.error?
119
+ token = stream.make_token
120
+ puts token.render_diagnostic("unexpected character")
121
+ end
122
+ stream.advance
123
+ end
124
+ ```
125
+
126
+ `LexerKit` always falls back to `:INVALID` for unmatched input.
127
+
128
+ ## Serialization
129
+
130
+ Pre-compile lexers for faster startup:
131
+
132
+ ```ruby
133
+ lexer = builder.compile
134
+ LexerKit::Format::LKT1.save(lexer, path: "lexer.lkt1")
135
+ LexerKit::Format::LKB1.save(lexer, path: "lexer.lkb1")
136
+ ```
137
+
138
+ ```bash
139
+ lexer_kit compile lexer.rb -o lexer.lkt1
140
+ ```
141
+
142
+ Load later:
143
+
144
+ ```ruby
145
+ lexer = LexerKit.load_lexer(File.expand_path("data/lexer.lkt1", __dir__))
146
+ ```
147
+
148
+ ## Performance Snapshot
149
+
150
+ JSON benchmark (600KB input, project benchmark script):
151
+
152
+ - LexerKit: `95.2 i/s`
153
+ - StringScanner: `4.8 i/s` (about `20x` slower)
154
+
155
+ ## License
156
+
157
+ MIT License
data/exe/lexer_kit ADDED
@@ -0,0 +1,7 @@
1
+ #!/usr/bin/env ruby
2
+ # frozen_string_literal: true
3
+
4
+ require "lexer_kit"
5
+ require "lexer_kit/cli"
6
+
7
+ exit(LexerKit::CLI.run(ARGV))