tokenkit 0.1.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.standard.yml +3 -0
- data/.yardopts +12 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE.txt +21 -0
- data/README.md +644 -0
- data/Rakefile +18 -0
- data/benchmarks/cache_test.rb +63 -0
- data/benchmarks/final_comparison.rb +83 -0
- data/benchmarks/tokenizer_benchmark.rb +250 -0
- data/docs/ARCHITECTURE.md +469 -0
- data/docs/PERFORMANCE.md +382 -0
- data/docs/README.md +118 -0
- data/ext/tokenkit/Cargo.toml +21 -0
- data/ext/tokenkit/extconf.rb +4 -0
- data/ext/tokenkit/src/config.rs +37 -0
- data/ext/tokenkit/src/error.rs +67 -0
- data/ext/tokenkit/src/lib.rs +346 -0
- data/ext/tokenkit/src/tokenizer/base.rs +41 -0
- data/ext/tokenkit/src/tokenizer/char_group.rs +62 -0
- data/ext/tokenkit/src/tokenizer/edge_ngram.rs +73 -0
- data/ext/tokenkit/src/tokenizer/grapheme.rs +26 -0
- data/ext/tokenkit/src/tokenizer/keyword.rs +25 -0
- data/ext/tokenkit/src/tokenizer/letter.rs +41 -0
- data/ext/tokenkit/src/tokenizer/lowercase.rs +51 -0
- data/ext/tokenkit/src/tokenizer/mod.rs +254 -0
- data/ext/tokenkit/src/tokenizer/ngram.rs +80 -0
- data/ext/tokenkit/src/tokenizer/path_hierarchy.rs +187 -0
- data/ext/tokenkit/src/tokenizer/pattern.rs +38 -0
- data/ext/tokenkit/src/tokenizer/sentence.rs +89 -0
- data/ext/tokenkit/src/tokenizer/unicode.rs +36 -0
- data/ext/tokenkit/src/tokenizer/url_email.rs +108 -0
- data/ext/tokenkit/src/tokenizer/whitespace.rs +31 -0
- data/lib/tokenkit/config.rb +74 -0
- data/lib/tokenkit/config_builder.rb +209 -0
- data/lib/tokenkit/config_compat.rb +52 -0
- data/lib/tokenkit/configuration.rb +194 -0
- data/lib/tokenkit/regex_converter.rb +58 -0
- data/lib/tokenkit/version.rb +5 -0
- data/lib/tokenkit.rb +336 -0
- data/sig/tokenkit.rbs +4 -0
- metadata +172 -0
| @@ -0,0 +1,63 @@ | |
#!/usr/bin/env ruby
# frozen_string_literal: true

# Benchmark script: measures the impact of tokenizer instance caching by
# timing tokenization with and without preserve patterns, plus the worst
# case where the configuration is rebuilt before every call.

require "bundler/setup"
require "tokenkit"
require "benchmark/ips"

SAMPLE_TEXT = "The quick brown fox jumps over the lazy dog. Test email@example.com and URL https://example.com"

puts "Testing tokenizer caching optimization"
puts "=" * 50

# Case 1: preserve patterns configured — caching is expected to help most
# here, since it avoids recompiling these four regexes per tokenize call.
TokenKit.configure do |cfg|
  cfg.strategy = :unicode
  cfg.preserve_patterns = [
    /\w+@\w+\.\w+/,               # Email pattern
    /https?:\/\/[^\s]+/,          # URL pattern
    /\b[A-Z]{2,}\b/,              # Uppercase abbreviations
    /\b\d{4}-\d{2}-\d{2}\b/       # Date pattern
  ]
end

puts "\nWith preserve patterns (4 compiled regexes):"
Benchmark.ips do |bench|
  bench.config(time: 5, warmup: 2)
  bench.report("Cached tokenizer") { TokenKit.tokenize(SAMPLE_TEXT) }
  bench.compare!
end

# Case 2: plain unicode strategy, no preserve patterns.
TokenKit.reset
TokenKit.configure { |cfg| cfg.strategy = :unicode }

puts "\nWithout preserve patterns:"
Benchmark.ips do |bench|
  bench.config(time: 5, warmup: 2)
  bench.report("Cached tokenizer") { TokenKit.tokenize(SAMPLE_TEXT) }
  bench.compare!
end

# Case 3: reconfigure before every tokenize — presumably invalidates the
# cached tokenizer each iteration, making this the worst case for caching.
puts "\nConfiguration changes (worst case):"
Benchmark.ips do |bench|
  bench.config(time: 5, warmup: 2)

  bench.report("Reconfigure each time") do
    TokenKit.configure { |c| c.strategy = :unicode }
    TokenKit.tokenize(SAMPLE_TEXT)
  end

  bench.compare!
end
| @@ -0,0 +1,83 @@ | |
#!/usr/bin/env ruby
# frozen_string_literal: true

# Before/after comparison script for the TokenKit optimization work.
# Each section configures the library, then times TokenKit.tokenize on a
# fixed sample with benchmark-ips.

require "bundler/setup"
require "tokenkit"
require "benchmark/ips"

SMALL_TEXT = "The quick brown fox jumps over the lazy dog."
MEDIUM_TEXT = SMALL_TEXT * 10
PATTERN_TEXT = "Contact us at user@example.com or visit https://example.com. Code: ABC-123, Date: 2024-01-15."

# Times TokenKit.tokenize(text) under the currently active configuration.
def measure_tokenization(text)
  Benchmark.ips do |bench|
    bench.config(time: 3, warmup: 1)
    bench.report("Optimized") { TokenKit.tokenize(text) }
    bench.compare!
  end
end

puts "TokenKit Final Performance Comparison"
puts "=" * 50
puts "After optimizations:"
puts "1. Cached tokenizer instances (avoids regex recompilation)"
puts "2. Reduced string allocations in preserve_patterns"
puts "3. In-place post-processing"
puts "=" * 50

# Test 1: Basic tokenization (should be similar)
puts "\n📊 Basic Unicode Tokenization:"
TokenKit.reset
TokenKit.configure { |cfg| cfg.strategy = :unicode }
measure_tokenization(SMALL_TEXT)

# Test 2: With preserve patterns (biggest improvement expected)
puts "\n🔥 With Preserve Patterns (4 regexes):"
TokenKit.configure do |cfg|
  cfg.strategy = :unicode
  cfg.preserve_patterns = [
    /\w+@\w+\.\w+/,               # Email
    /https?:\/\/[^\s]+/,          # URL
    /\b[A-Z]{2,}\b/,              # Uppercase
    /\b\d{4}-\d{2}-\d{2}\b/       # Date
  ]
end
measure_tokenization(PATTERN_TEXT)

# Test 3: Complex configuration
puts "\n⚙️  Complex Configuration (all options):"
TokenKit.configure do |cfg|
  cfg.strategy = :unicode
  cfg.lowercase = true
  cfg.remove_punctuation = true
  cfg.preserve_patterns = [/\b[A-Z]{2,}\b/, /\d{4}-\d{2}-\d{2}/]
end
measure_tokenization(MEDIUM_TEXT)

# Test 4: EdgeNgram (allocation-heavy)
puts "\n🔤 EdgeNgram Tokenization:"
TokenKit.reset
TokenKit.configure do |cfg|
  cfg.strategy = :edge_ngram
  cfg.min_gram = 2
  cfg.max_gram = 5
end
measure_tokenization(SMALL_TEXT)

puts "\n" + "=" * 50
puts "Summary of improvements:"
puts "- Preserve patterns: ~100x faster (from regex caching)"
puts "- Reduced allocations: ~20-30% improvement in throughput"
puts "- Better memory efficiency with in-place operations"
| @@ -0,0 +1,250 @@ | |
#!/usr/bin/env ruby
# frozen_string_literal: true

# Comprehensive TokenKit benchmark suite. Individual sections can be run
# selectively by passing their names on the command line (see the driver
# at the bottom of this file).

require "bundler/setup"
require "tokenkit"
require "benchmark/ips"
require "benchmark/memory"

# Sample texts of various sizes
SMALL_TEXT = "The quick brown fox jumps over the lazy dog."
MEDIUM_TEXT = SMALL_TEXT * 10
LARGE_TEXT = File.read(File.expand_path("../README.md", __dir__)) * 5
# NOTE(review): UNICODE_TEXT is defined but not referenced by any section below.
UNICODE_TEXT = "Hello 世界 мир العالم! Testing 🔥 emoji and café résumé naïve."
PATTERN_TEXT = "Contact us at user@example.com or visit https://example.com. Code: ABC-123, Date: 2024-01-15."

puts "TokenKit Performance Benchmarks"
puts "=" * 50
# Compares throughput of the individual tokenizer strategies on SMALL_TEXT
# (the pattern strategy uses PATTERN_TEXT so its regex has matches).
# NOTE(review): reset + configure run inside each timed block, so every
# report also measures configuration overhead, not tokenization alone.
def run_tokenizer_benchmarks
  puts "\n📊 Tokenizer Strategy Comparison (small text)"
  puts "-" * 40

  Benchmark.ips do |bench|
    bench.config(time: 5, warmup: 2)

    # The four option-free strategies share the same shape; labels are the
    # capitalized strategy names ("Unicode", "Whitespace", "Letter", "Lowercase").
    %i[unicode whitespace letter lowercase].each do |strategy|
      bench.report(strategy.to_s.capitalize) do
        TokenKit.reset
        TokenKit.configure { |cfg| cfg.strategy = strategy }
        TokenKit.tokenize(SMALL_TEXT)
      end
    end

    bench.report("Pattern (email)") do
      TokenKit.reset
      TokenKit.configure do |cfg|
        cfg.strategy = :pattern
        cfg.regex = /\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Z|a-z]{2,}\b/
      end
      TokenKit.tokenize(PATTERN_TEXT)
    end

    bench.report("EdgeNgram") do
      TokenKit.reset
      TokenKit.configure do |cfg|
        cfg.strategy = :edge_ngram
        cfg.min_gram = 2
        cfg.max_gram = 5
      end
      TokenKit.tokenize(SMALL_TEXT)
    end

    bench.compare!
  end
end
| 72 | 
            +
             | 
# Measures how each post-processing option (lowercase, punctuation removal,
# preserve patterns) affects unicode-strategy throughput on MEDIUM_TEXT.
def run_configuration_benchmarks
  puts "\n⚙️  Configuration Options Impact"
  puts "-" * 40

  date_and_abbrev = [/\b[A-Z]{2,}\b/, /\d{4}-\d{2}-\d{2}/]

  # Label => extra configuration applied on top of the unicode strategy.
  variants = {
    "Basic Unicode" => {},
    "Unicode + lowercase" => { lowercase: true },
    "Unicode + remove_punctuation" => { remove_punctuation: true },
    "Unicode + preserve_patterns" => { preserve_patterns: date_and_abbrev },
    "Unicode + all options" => {
      lowercase: true,
      remove_punctuation: true,
      preserve_patterns: date_and_abbrev
    }
  }

  Benchmark.ips do |bench|
    bench.config(time: 5, warmup: 2)

    variants.each do |label, extras|
      bench.report(label) do
        TokenKit.reset
        TokenKit.configure do |cfg|
          cfg.strategy = :unicode
          extras.each { |option, value| cfg.public_send("#{option}=", value) }
        end
        TokenKit.tokenize(MEDIUM_TEXT)
      end
    end

    bench.compare!
  end
end
| 127 | 
            +
             | 
# Shows how tokenization throughput scales with input length using the
# unicode strategy on the three sample texts.
def run_text_size_benchmarks
  puts "\n📏 Text Size Scaling"
  puts "-" * 40

  TokenKit.configure { |cfg| cfg.strategy = :unicode }

  samples = { "Small" => SMALL_TEXT, "Medium" => MEDIUM_TEXT, "Large" => LARGE_TEXT }

  Benchmark.ips do |bench|
    bench.config(time: 5, warmup: 2)

    samples.each do |label, text|
      bench.report("#{label} (#{text.length} chars)") { TokenKit.tokenize(text) }
    end

    bench.compare!
  end
end
| 152 | 
            +
             | 
# Compares the module-level API, a reused Tokenizer instance, and the
# module API with a per-call option override, all on MEDIUM_TEXT.
def run_instance_vs_module_benchmarks
  puts "\n🔄 Module vs Instance API"
  puts "-" * 40

  TokenKit.configure do |cfg|
    cfg.strategy = :unicode
    cfg.lowercase = true
  end

  reusable = TokenKit::Tokenizer.new(strategy: :unicode, lowercase: true)

  Benchmark.ips do |bench|
    bench.config(time: 5, warmup: 2)

    bench.report("Module API (creates fresh instance)") { TokenKit.tokenize(MEDIUM_TEXT) }
    bench.report("Instance API (reused instance)") { reusable.tokenize(MEDIUM_TEXT) }
    bench.report("Per-call options") { TokenKit.tokenize(MEDIUM_TEXT, lowercase: false) }

    bench.compare!
  end
end
| 182 | 
            +
             | 
# Runs ten tokenizations sequentially versus across ten threads.
# NOTE(review): under MRI's GVL this mostly measures thread overhead
# rather than parallel speedup — confirm intent.
def run_thread_safety_benchmarks
  puts "\n🧵 Thread Safety & Concurrency"
  puts "-" * 40

  TokenKit.configure { |cfg| cfg.strategy = :unicode }

  Benchmark.ips do |bench|
    bench.config(time: 5, warmup: 2)

    bench.report("Single-threaded (10 tokenizations)") do
      10.times { TokenKit.tokenize(SMALL_TEXT) }
    end

    bench.report("Multi-threaded (10 threads)") do
      workers = Array.new(10) { Thread.new { TokenKit.tokenize(SMALL_TEXT) } }
      workers.each(&:join)
    end

    bench.compare!
  end
end
| 206 | 
            +
             | 
# Profiles allocation behavior (via benchmark-memory) for three
# configurations, each tokenizing MEDIUM_TEXT 100 times.
def run_memory_benchmarks
  puts "\n💾 Memory Usage Analysis"
  puts "-" * 40

  Benchmark.memory do |bench|
    bench.report("Unicode tokenizer") do
      TokenKit.reset
      TokenKit.configure { |cfg| cfg.strategy = :unicode }
      100.times { TokenKit.tokenize(MEDIUM_TEXT) }
    end

    bench.report("EdgeNgram tokenizer") do
      TokenKit.reset
      TokenKit.configure do |cfg|
        cfg.strategy = :edge_ngram
        cfg.min_gram = 2
        cfg.max_gram = 5
      end
      100.times { TokenKit.tokenize(MEDIUM_TEXT) }
    end

    bench.report("With preserve_patterns") do
      TokenKit.reset
      TokenKit.configure do |cfg|
        cfg.strategy = :unicode
        cfg.preserve_patterns = [/\b[A-Z]{2,}\b/, /\d{4}-\d{2}-\d{2}/, /\w+@\w+\.\w+/]
      end
      100.times { TokenKit.tokenize(MEDIUM_TEXT) }
    end

    bench.compare!
  end
end
| 240 | 
            +
             | 
# Run all benchmarks — or, when section names are passed on the command
# line, only the named sections (in the fixed order below).
{
  "tokenizers" => method(:run_tokenizer_benchmarks),
  "config"     => method(:run_configuration_benchmarks),
  "size"       => method(:run_text_size_benchmarks),
  "instance"   => method(:run_instance_vs_module_benchmarks),
  "threads"    => method(:run_thread_safety_benchmarks),
  "memory"     => method(:run_memory_benchmarks)
}.each do |section, runner|
  runner.call if ARGV.empty? || ARGV.include?(section)
end

puts "\n" + "=" * 50
puts "Benchmark complete!"