tokenkit 0.1.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.standard.yml +3 -0
- data/.yardopts +12 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE.txt +21 -0
- data/README.md +644 -0
- data/Rakefile +18 -0
- data/benchmarks/cache_test.rb +63 -0
- data/benchmarks/final_comparison.rb +83 -0
- data/benchmarks/tokenizer_benchmark.rb +250 -0
- data/docs/ARCHITECTURE.md +469 -0
- data/docs/PERFORMANCE.md +382 -0
- data/docs/README.md +118 -0
- data/ext/tokenkit/Cargo.toml +21 -0
- data/ext/tokenkit/extconf.rb +4 -0
- data/ext/tokenkit/src/config.rs +37 -0
- data/ext/tokenkit/src/error.rs +67 -0
- data/ext/tokenkit/src/lib.rs +346 -0
- data/ext/tokenkit/src/tokenizer/base.rs +41 -0
- data/ext/tokenkit/src/tokenizer/char_group.rs +62 -0
- data/ext/tokenkit/src/tokenizer/edge_ngram.rs +73 -0
- data/ext/tokenkit/src/tokenizer/grapheme.rs +26 -0
- data/ext/tokenkit/src/tokenizer/keyword.rs +25 -0
- data/ext/tokenkit/src/tokenizer/letter.rs +41 -0
- data/ext/tokenkit/src/tokenizer/lowercase.rs +51 -0
- data/ext/tokenkit/src/tokenizer/mod.rs +254 -0
- data/ext/tokenkit/src/tokenizer/ngram.rs +80 -0
- data/ext/tokenkit/src/tokenizer/path_hierarchy.rs +187 -0
- data/ext/tokenkit/src/tokenizer/pattern.rs +38 -0
- data/ext/tokenkit/src/tokenizer/sentence.rs +89 -0
- data/ext/tokenkit/src/tokenizer/unicode.rs +36 -0
- data/ext/tokenkit/src/tokenizer/url_email.rs +108 -0
- data/ext/tokenkit/src/tokenizer/whitespace.rs +31 -0
- data/lib/tokenkit/config.rb +74 -0
- data/lib/tokenkit/config_builder.rb +209 -0
- data/lib/tokenkit/config_compat.rb +52 -0
- data/lib/tokenkit/configuration.rb +194 -0
- data/lib/tokenkit/regex_converter.rb +58 -0
- data/lib/tokenkit/version.rb +5 -0
- data/lib/tokenkit.rb +336 -0
- data/sig/tokenkit.rbs +4 -0
- metadata +172 -0
data/benchmarks/cache_test.rb
@@ -0,0 +1,63 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require "bundler/setup"
+require "tokenkit"
+require "benchmark/ips"
+
+SAMPLE_TEXT = "The quick brown fox jumps over the lazy dog. Test email@example.com and URL https://example.com"
+
+puts "Testing tokenizer caching optimization"
+puts "=" * 50
+
+# Test with preserve patterns (where caching helps most)
+TokenKit.configure do |config|
+  config.strategy = :unicode
+  config.preserve_patterns = [
+    /\w+@\w+\.\w+/,          # Email pattern
+    /https?:\/\/[^\s]+/,     # URL pattern
+    /\b[A-Z]{2,}\b/,         # Uppercase abbreviations
+    /\b\d{4}-\d{2}-\d{2}\b/  # Date pattern
+  ]
+end
+
+puts "\nWith preserve patterns (4 compiled regexes):"
+Benchmark.ips do |x|
+  x.config(time: 5, warmup: 2)
+
+  x.report("Cached tokenizer") do
+    TokenKit.tokenize(SAMPLE_TEXT)
+  end
+
+  x.compare!
+end
+
+# Test without preserve patterns
+TokenKit.reset
+TokenKit.configure do |config|
+  config.strategy = :unicode
+end
+
+puts "\nWithout preserve patterns:"
+Benchmark.ips do |x|
+  x.config(time: 5, warmup: 2)
+
+  x.report("Cached tokenizer") do
+    TokenKit.tokenize(SAMPLE_TEXT)
+  end
+
+  x.compare!
+end
+
+# Test configuration changes (cache invalidation)
+puts "\nConfiguration changes (worst case):"
+Benchmark.ips do |x|
+  x.config(time: 5, warmup: 2)
+
+  x.report("Reconfigure each time") do
+    TokenKit.configure { |c| c.strategy = :unicode }
+    TokenKit.tokenize(SAMPLE_TEXT)
+  end
+
+  x.compare!
+end
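The contrast this script draws between "Cached tokenizer" and "Reconfigure each time" is whether the compiled tokenizer, including its compiled preserve-pattern regexes, survives across calls. As a rough sketch of that caching idea — hypothetical names only, not TokenKit's actual internals, which live in the Rust extension:

```ruby
# Minimal sketch of config-keyed caching; TokenizerCache and compile are
# hypothetical names, not part of TokenKit's API.
module TokenizerCache
  CACHE = {}
  LOCK = Mutex.new

  # Return the tokenizer built for this exact configuration, compiling it
  # only on the first call (or again after the configuration changes).
  def self.fetch(config)
    LOCK.synchronize { CACHE[config] ||= compile(config) }
  end

  def self.compile(config)
    # The expensive step that caching amortizes: compiling every
    # preserve pattern into a regex object.
    Array(config[:preserve_patterns]).map { |pat| Regexp.new(pat) }
  end
end

TokenizerCache.fetch(preserve_patterns: ['\w+@\w+\.\w+'])  # compiles once
TokenizerCache.fetch(preserve_patterns: ['\w+@\w+\.\w+'])  # cache hit
```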
data/benchmarks/final_comparison.rb
@@ -0,0 +1,83 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require "bundler/setup"
+require "tokenkit"
+require "benchmark/ips"
+
+SMALL_TEXT = "The quick brown fox jumps over the lazy dog."
+MEDIUM_TEXT = SMALL_TEXT * 10
+PATTERN_TEXT = "Contact us at user@example.com or visit https://example.com. Code: ABC-123, Date: 2024-01-15."
+
+puts "TokenKit Final Performance Comparison"
+puts "=" * 50
+puts "After optimizations:"
+puts "1. Cached tokenizer instances (avoids regex recompilation)"
+puts "2. Reduced string allocations in preserve_patterns"
+puts "3. In-place post-processing"
+puts "=" * 50
+
+# Test 1: Basic tokenization (should be similar)
+puts "\n📊 Basic Unicode Tokenization:"
+TokenKit.reset
+TokenKit.configure { |c| c.strategy = :unicode }
+
+Benchmark.ips do |x|
+  x.config(time: 3, warmup: 1)
+  x.report("Optimized") { TokenKit.tokenize(SMALL_TEXT) }
+  x.compare!
+end
+
+# Test 2: With preserve patterns (biggest improvement expected)
+puts "\n🔥 With Preserve Patterns (4 regexes):"
+TokenKit.configure do |config|
+  config.strategy = :unicode
+  config.preserve_patterns = [
+    /\w+@\w+\.\w+/,          # Email
+    /https?:\/\/[^\s]+/,     # URL
+    /\b[A-Z]{2,}\b/,         # Uppercase
+    /\b\d{4}-\d{2}-\d{2}\b/  # Date
+  ]
+end
+
+Benchmark.ips do |x|
+  x.config(time: 3, warmup: 1)
+  x.report("Optimized") { TokenKit.tokenize(PATTERN_TEXT) }
+  x.compare!
+end
+
+# Test 3: Complex configuration
+puts "\n⚙️ Complex Configuration (all options):"
+TokenKit.configure do |config|
+  config.strategy = :unicode
+  config.lowercase = true
+  config.remove_punctuation = true
+  config.preserve_patterns = [/\b[A-Z]{2,}\b/, /\d{4}-\d{2}-\d{2}/]
+end
+
+Benchmark.ips do |x|
+  x.config(time: 3, warmup: 1)
+  x.report("Optimized") { TokenKit.tokenize(MEDIUM_TEXT) }
+  x.compare!
+end
+
+# Test 4: EdgeNgram (allocation-heavy)
+puts "\n🔤 EdgeNgram Tokenization:"
+TokenKit.reset
+TokenKit.configure do |config|
+  config.strategy = :edge_ngram
+  config.min_gram = 2
+  config.max_gram = 5
+end
+
+Benchmark.ips do |x|
+  x.config(time: 3, warmup: 1)
+  x.report("Optimized") { TokenKit.tokenize(SMALL_TEXT) }
+  x.compare!
+end
+
+puts "\n" + "=" * 50
+puts "Summary of improvements:"
+puts "- Preserve patterns: ~100x faster (from regex caching)"
+puts "- Reduced allocations: ~20-30% improvement in throughput"
+puts "- Better memory efficiency with in-place operations"
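The three summary lines describe the optimizations rather than demonstrate them. For the third, "in-place operations" means mutating the token array during post-processing instead of allocating a new array at each step. A Ruby rendering of that technique — a hypothetical helper for illustration, not the gem's code:

```ruby
# Sketch of in-place post-processing: map!/reject! reuse the existing
# array where map/reject would allocate fresh ones on every pass.
def post_process!(tokens, lowercase: true, remove_punctuation: true)
  tokens.map!(&:downcase) if lowercase
  tokens.each { |t| t.gsub!(/[[:punct:]]/, "") } if remove_punctuation
  tokens.reject!(&:empty?)  # drop tokens emptied by punctuation removal
  tokens
end

post_process!(["Hello,", "WORLD!"])  # => ["hello", "world"]
```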
data/benchmarks/tokenizer_benchmark.rb
@@ -0,0 +1,250 @@
+#!/usr/bin/env ruby
+# frozen_string_literal: true
+
+require "bundler/setup"
+require "tokenkit"
+require "benchmark/ips"
+require "benchmark/memory"
+
+# Sample texts of various sizes
+SMALL_TEXT = "The quick brown fox jumps over the lazy dog."
+MEDIUM_TEXT = SMALL_TEXT * 10
+LARGE_TEXT = File.read(File.join(__dir__, "../README.md")) * 5
+UNICODE_TEXT = "Hello 世界 мир العالم! Testing 🔥 emoji and café résumé naïve."
+PATTERN_TEXT = "Contact us at user@example.com or visit https://example.com. Code: ABC-123, Date: 2024-01-15."
+
+puts "TokenKit Performance Benchmarks"
+puts "=" * 50
+
+def run_tokenizer_benchmarks
+  puts "\n📊 Tokenizer Strategy Comparison (small text)"
+  puts "-" * 40
+
+  Benchmark.ips do |x|
+    x.config(time: 5, warmup: 2)
+
+    x.report("Unicode") do
+      TokenKit.reset
+      TokenKit.configure { |c| c.strategy = :unicode }
+      TokenKit.tokenize(SMALL_TEXT)
+    end
+
+    x.report("Whitespace") do
+      TokenKit.reset
+      TokenKit.configure { |c| c.strategy = :whitespace }
+      TokenKit.tokenize(SMALL_TEXT)
+    end
+
+    x.report("Letter") do
+      TokenKit.reset
+      TokenKit.configure { |c| c.strategy = :letter }
+      TokenKit.tokenize(SMALL_TEXT)
+    end
+
+    x.report("Lowercase") do
+      TokenKit.reset
+      TokenKit.configure { |c| c.strategy = :lowercase }
+      TokenKit.tokenize(SMALL_TEXT)
+    end
+
+    x.report("Pattern (email)") do
+      TokenKit.reset
+      TokenKit.configure do |c|
+        c.strategy = :pattern
+        c.regex = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b/
+      end
+      TokenKit.tokenize(PATTERN_TEXT)
+    end
+
+    x.report("EdgeNgram") do
+      TokenKit.reset
+      TokenKit.configure do |c|
+        c.strategy = :edge_ngram
+        c.min_gram = 2
+        c.max_gram = 5
+      end
+      TokenKit.tokenize(SMALL_TEXT)
+    end
+
+    x.compare!
+  end
+end
+
+def run_configuration_benchmarks
+  puts "\n⚙️ Configuration Options Impact"
+  puts "-" * 40
+
+  Benchmark.ips do |x|
+    x.config(time: 5, warmup: 2)
+
+    x.report("Basic Unicode") do
+      TokenKit.reset
+      TokenKit.configure { |c| c.strategy = :unicode }
+      TokenKit.tokenize(MEDIUM_TEXT)
+    end
+
+    x.report("Unicode + lowercase") do
+      TokenKit.reset
+      TokenKit.configure do |c|
+        c.strategy = :unicode
+        c.lowercase = true
+      end
+      TokenKit.tokenize(MEDIUM_TEXT)
+    end
+
+    x.report("Unicode + remove_punctuation") do
+      TokenKit.reset
+      TokenKit.configure do |c|
+        c.strategy = :unicode
+        c.remove_punctuation = true
+      end
+      TokenKit.tokenize(MEDIUM_TEXT)
+    end
+
+    x.report("Unicode + preserve_patterns") do
+      TokenKit.reset
+      TokenKit.configure do |c|
+        c.strategy = :unicode
+        c.preserve_patterns = [/\b[A-Z]{2,}\b/, /\d{4}-\d{2}-\d{2}/]
+      end
+      TokenKit.tokenize(MEDIUM_TEXT)
+    end
+
+    x.report("Unicode + all options") do
+      TokenKit.reset
+      TokenKit.configure do |c|
+        c.strategy = :unicode
+        c.lowercase = true
+        c.remove_punctuation = true
+        c.preserve_patterns = [/\b[A-Z]{2,}\b/, /\d{4}-\d{2}-\d{2}/]
+      end
+      TokenKit.tokenize(MEDIUM_TEXT)
+    end
+
+    x.compare!
+  end
+end
+
+def run_text_size_benchmarks
+  puts "\n📏 Text Size Scaling"
+  puts "-" * 40
+
+  TokenKit.configure { |c| c.strategy = :unicode }
+
+  Benchmark.ips do |x|
+    x.config(time: 5, warmup: 2)
+
+    x.report("Small (#{SMALL_TEXT.length} chars)") do
+      TokenKit.tokenize(SMALL_TEXT)
+    end
+
+    x.report("Medium (#{MEDIUM_TEXT.length} chars)") do
+      TokenKit.tokenize(MEDIUM_TEXT)
+    end
+
+    x.report("Large (#{LARGE_TEXT.length} chars)") do
+      TokenKit.tokenize(LARGE_TEXT)
+    end
+
+    x.compare!
+  end
+end
+
+def run_instance_vs_module_benchmarks
+  puts "\n🔄 Module vs Instance API"
+  puts "-" * 40
+
+  TokenKit.configure do |c|
+    c.strategy = :unicode
+    c.lowercase = true
+  end
+
+  tokenizer = TokenKit::Tokenizer.new(strategy: :unicode, lowercase: true)
+
+  Benchmark.ips do |x|
+    x.config(time: 5, warmup: 2)
+
+    x.report("Module API (creates fresh instance)") do
+      TokenKit.tokenize(MEDIUM_TEXT)
+    end
+
+    x.report("Instance API (reused instance)") do
+      tokenizer.tokenize(MEDIUM_TEXT)
+    end
+
+    x.report("Per-call options") do
+      TokenKit.tokenize(MEDIUM_TEXT, lowercase: false)
+    end
+
+    x.compare!
+  end
+end
+
+def run_thread_safety_benchmarks
+  puts "\n🧵 Thread Safety & Concurrency"
+  puts "-" * 40
+
+  TokenKit.configure { |c| c.strategy = :unicode }
+
+  Benchmark.ips do |x|
+    x.config(time: 5, warmup: 2)
+
+    x.report("Single-threaded (10 tokenizations)") do
+      10.times { TokenKit.tokenize(SMALL_TEXT) }
+    end
+
+    x.report("Multi-threaded (10 threads)") do
+      threads = 10.times.map do
+        Thread.new { TokenKit.tokenize(SMALL_TEXT) }
+      end
+      threads.each(&:join)
+    end
+
+    x.compare!
+  end
+end
+
+def run_memory_benchmarks
+  puts "\n💾 Memory Usage Analysis"
+  puts "-" * 40
+
+  Benchmark.memory do |x|
+    x.report("Unicode tokenizer") do
+      TokenKit.reset
+      TokenKit.configure { |c| c.strategy = :unicode }
+      100.times { TokenKit.tokenize(MEDIUM_TEXT) }
+    end
+
+    x.report("EdgeNgram tokenizer") do
+      TokenKit.reset
+      TokenKit.configure do |c|
+        c.strategy = :edge_ngram
+        c.min_gram = 2
+        c.max_gram = 5
+      end
+      100.times { TokenKit.tokenize(MEDIUM_TEXT) }
+    end
+
+    x.report("With preserve_patterns") do
+      TokenKit.reset
+      TokenKit.configure do |c|
+        c.strategy = :unicode
+        c.preserve_patterns = [/\b[A-Z]{2,}\b/, /\d{4}-\d{2}-\d{2}/, /\w+@\w+\.\w+/]
+      end
+      100.times { TokenKit.tokenize(MEDIUM_TEXT) }
+    end
+
+    x.compare!
+  end
+end
+
+# Run all benchmarks
+run_tokenizer_benchmarks if ARGV.empty? || ARGV.include?("tokenizers")
+run_configuration_benchmarks if ARGV.empty? || ARGV.include?("config")
+run_text_size_benchmarks if ARGV.empty? || ARGV.include?("size")
+run_instance_vs_module_benchmarks if ARGV.empty? || ARGV.include?("instance")
+run_thread_safety_benchmarks if ARGV.empty? || ARGV.include?("threads")
+run_memory_benchmarks if ARGV.empty? || ARGV.include?("memory")
+
+puts "\n" + "=" * 50
+puts "Benchmark complete!"
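Two usage notes on this script. The ARGV guards at the bottom let suites run selectively, e.g. `ruby benchmarks/tokenizer_benchmark.rb config memory` runs only the configuration and memory suites. And the "Module vs Instance API" suite compares the three call styles below, shown here with a literal string in place of the script's constants:

```ruby
require "tokenkit"

# Module API: uses the process-global configuration.
TokenKit.tokenize("The quick brown fox")

# Instance API: configuration baked into a reusable object.
tokenizer = TokenKit::Tokenizer.new(strategy: :unicode, lowercase: true)
tokenizer.tokenize("The quick brown fox")

# Per-call options: override the global config for one call.
TokenKit.tokenize("The quick brown fox", lowercase: false)
```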