tokenkit 0.1.0.pre.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.standard.yml +3 -0
- data/.yardopts +12 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE.txt +21 -0
- data/README.md +644 -0
- data/Rakefile +18 -0
- data/benchmarks/cache_test.rb +63 -0
- data/benchmarks/final_comparison.rb +83 -0
- data/benchmarks/tokenizer_benchmark.rb +250 -0
- data/docs/ARCHITECTURE.md +469 -0
- data/docs/PERFORMANCE.md +382 -0
- data/docs/README.md +118 -0
- data/ext/tokenkit/Cargo.toml +21 -0
- data/ext/tokenkit/extconf.rb +4 -0
- data/ext/tokenkit/src/config.rs +37 -0
- data/ext/tokenkit/src/error.rs +67 -0
- data/ext/tokenkit/src/lib.rs +346 -0
- data/ext/tokenkit/src/tokenizer/base.rs +41 -0
- data/ext/tokenkit/src/tokenizer/char_group.rs +62 -0
- data/ext/tokenkit/src/tokenizer/edge_ngram.rs +73 -0
- data/ext/tokenkit/src/tokenizer/grapheme.rs +26 -0
- data/ext/tokenkit/src/tokenizer/keyword.rs +25 -0
- data/ext/tokenkit/src/tokenizer/letter.rs +41 -0
- data/ext/tokenkit/src/tokenizer/lowercase.rs +51 -0
- data/ext/tokenkit/src/tokenizer/mod.rs +254 -0
- data/ext/tokenkit/src/tokenizer/ngram.rs +80 -0
- data/ext/tokenkit/src/tokenizer/path_hierarchy.rs +187 -0
- data/ext/tokenkit/src/tokenizer/pattern.rs +38 -0
- data/ext/tokenkit/src/tokenizer/sentence.rs +89 -0
- data/ext/tokenkit/src/tokenizer/unicode.rs +36 -0
- data/ext/tokenkit/src/tokenizer/url_email.rs +108 -0
- data/ext/tokenkit/src/tokenizer/whitespace.rs +31 -0
- data/lib/tokenkit/config.rb +74 -0
- data/lib/tokenkit/config_builder.rb +209 -0
- data/lib/tokenkit/config_compat.rb +52 -0
- data/lib/tokenkit/configuration.rb +194 -0
- data/lib/tokenkit/regex_converter.rb +58 -0
- data/lib/tokenkit/version.rb +5 -0
- data/lib/tokenkit.rb +336 -0
- data/sig/tokenkit.rbs +4 -0
- metadata +172 -0

data/ext/tokenkit/src/lib.rs (@@ -0,0 +1,346 @@)

```rust
mod config;
mod error;
mod tokenizer;

use config::{TokenizerConfig, TokenizerStrategy};
use error::TokenizerError;
use magnus::{define_module, function, Error, RArray, RHash, TryConvert};
use std::sync::Mutex;
use once_cell::sync::Lazy;

// Store the default configuration and a cached tokenizer
struct TokenizerCache {
    config: TokenizerConfig,
    tokenizer: Option<Box<dyn tokenizer::Tokenizer + Send + Sync>>,
}

static DEFAULT_CACHE: Lazy<Mutex<TokenizerCache>> = Lazy::new(|| {
    Mutex::new(TokenizerCache {
        config: TokenizerConfig::default(),
        tokenizer: None,
    })
});

// Use cached tokenizer if config hasn't changed
fn tokenize(text: String) -> std::result::Result<Vec<String>, Error> {
    let mut cache = DEFAULT_CACHE
        .lock()
        .map_err(|e| TokenizerError::MutexError(e.to_string()))?;

    // Check if we need to create a new tokenizer
    if cache.tokenizer.is_none() {
        let tokenizer = tokenizer::from_config(cache.config.clone())?;
        cache.tokenizer = Some(tokenizer);
    }

    // Use the cached tokenizer
    let result = cache
        .tokenizer
        .as_ref()
        .unwrap()
        .tokenize(&text);

    Ok(result)
}

// Configure sets the default configuration and invalidates cache
fn configure(config_hash: RHash) -> std::result::Result<(), Error> {
    let config = parse_config_from_hash(config_hash)?;

    // Update cache with new config and clear tokenizer
    let mut cache = DEFAULT_CACHE
        .lock()
        .map_err(|e| TokenizerError::MutexError(e.to_string()))?;
    cache.config = config;
    cache.tokenizer = None; // Invalidate cached tokenizer

    Ok(())
}

// Reset to factory defaults
fn reset() -> std::result::Result<(), Error> {
    let mut cache = DEFAULT_CACHE
        .lock()
        .map_err(|e| TokenizerError::MutexError(e.to_string()))?;
    cache.config = TokenizerConfig::default();
    cache.tokenizer = None; // Clear cached tokenizer
    Ok(())
}

// Get current default configuration
fn config_hash() -> std::result::Result<RHash, Error> {
    let cache = DEFAULT_CACHE
        .lock()
        .map_err(|e| TokenizerError::MutexError(e.to_string()))?;

    config_to_hash(&cache.config)
}

// Helper function to convert config to RHash
fn config_to_hash(config: &TokenizerConfig) -> std::result::Result<RHash, Error> {
    let hash = RHash::new();

    let strategy_str = match &config.strategy {
        TokenizerStrategy::Whitespace => "whitespace",
        TokenizerStrategy::Unicode => "unicode",
        TokenizerStrategy::Pattern { .. } => "pattern",
        TokenizerStrategy::Sentence => "sentence",
        TokenizerStrategy::Grapheme { .. } => "grapheme",
        TokenizerStrategy::Keyword => "keyword",
        TokenizerStrategy::EdgeNgram { .. } => "edge_ngram",
        TokenizerStrategy::Ngram { .. } => "ngram",
        TokenizerStrategy::PathHierarchy { .. } => "path_hierarchy",
        TokenizerStrategy::UrlEmail => "url_email",
        TokenizerStrategy::CharGroup { .. } => "char_group",
        TokenizerStrategy::Letter => "letter",
        TokenizerStrategy::Lowercase => "lowercase",
    };
    hash.aset("strategy", strategy_str)?;

    if let TokenizerStrategy::Pattern { regex } = &config.strategy {
        hash.aset("regex", regex.as_str())?;
    }

    if let TokenizerStrategy::Grapheme { extended } = &config.strategy {
        hash.aset("extended", *extended)?;
    }

    if let TokenizerStrategy::EdgeNgram { min_gram, max_gram } = &config.strategy {
        hash.aset("min_gram", *min_gram)?;
        hash.aset("max_gram", *max_gram)?;
    }

    if let TokenizerStrategy::PathHierarchy { delimiter } = &config.strategy {
        hash.aset("delimiter", delimiter.as_str())?;
    }

    if let TokenizerStrategy::Ngram { min_gram, max_gram } = &config.strategy {
        hash.aset("min_gram", *min_gram)?;
        hash.aset("max_gram", *max_gram)?;
    }

    if let TokenizerStrategy::CharGroup { split_on_chars } = &config.strategy {
        hash.aset("split_on_chars", split_on_chars.as_str())?;
    }

    hash.aset("lowercase", config.lowercase)?;
    hash.aset("remove_punctuation", config.remove_punctuation)?;

    let patterns = RArray::new();
    for pattern in &config.preserve_patterns {
        patterns.push(pattern.as_str())?;
    }
    hash.aset("preserve_patterns", patterns)?;

    Ok(hash)
}

// Parse config from Ruby hash
fn parse_config_from_hash(config_hash: RHash) -> std::result::Result<TokenizerConfig, Error> {
    let strategy_val = config_hash.get("strategy");
    let strategy = if let Some(val) = strategy_val {
        let strategy_str: String = TryConvert::try_convert(val)?;
        match strategy_str.as_str() {
            "whitespace" => TokenizerStrategy::Whitespace,
            "unicode" => TokenizerStrategy::Unicode,
            "pattern" => {
                let regex_val = config_hash
                    .get("regex")
                    .ok_or_else(|| {
                        TokenizerError::InvalidConfiguration(
                            "pattern strategy requires regex parameter".to_string()
                        )
                    })?;
                let regex: String = TryConvert::try_convert(regex_val)?;
                TokenizerStrategy::Pattern { regex }
            }
            "sentence" => TokenizerStrategy::Sentence,
            "grapheme" => {
                let extended_val = config_hash.get("extended");
                let extended = if let Some(val) = extended_val {
                    TryConvert::try_convert(val)?
                } else {
                    true
                };
                TokenizerStrategy::Grapheme { extended }
            }
            "keyword" => TokenizerStrategy::Keyword,
            "edge_ngram" => {
                let min_gram_val = config_hash.get("min_gram");
                let min_gram = if let Some(val) = min_gram_val {
                    TryConvert::try_convert(val)?
                } else {
                    2
                };
                let max_gram_val = config_hash.get("max_gram");
                let max_gram = if let Some(val) = max_gram_val {
                    TryConvert::try_convert(val)?
                } else {
                    10
                };
                TokenizerStrategy::EdgeNgram { min_gram, max_gram }
            }
            "path_hierarchy" => {
                let delimiter_val = config_hash.get("delimiter");
                let delimiter = if let Some(val) = delimiter_val {
                    TryConvert::try_convert(val)?
                } else {
                    "/".to_string()
                };
                TokenizerStrategy::PathHierarchy { delimiter }
            }
            "url_email" => TokenizerStrategy::UrlEmail,
            "ngram" => {
                let min_gram_val = config_hash.get("min_gram");
                let min_gram = if let Some(val) = min_gram_val {
                    TryConvert::try_convert(val)?
                } else {
                    2
                };
                let max_gram_val = config_hash.get("max_gram");
                let max_gram = if let Some(val) = max_gram_val {
                    TryConvert::try_convert(val)?
                } else {
                    10
                };
                TokenizerStrategy::Ngram { min_gram, max_gram }
            }
            "char_group" => {
                let split_on_chars_val = config_hash.get("split_on_chars");
                let split_on_chars = if let Some(val) = split_on_chars_val {
                    TryConvert::try_convert(val)?
                } else {
                    " \t\n\r".to_string()
                };
                TokenizerStrategy::CharGroup { split_on_chars }
            }
            "letter" => TokenizerStrategy::Letter,
            "lowercase" => TokenizerStrategy::Lowercase,
            _ => {
                return Err(TokenizerError::UnknownStrategy(strategy_str).into())
            }
        }
    } else {
        TokenizerStrategy::Unicode
    };

    let lowercase_val = config_hash.get("lowercase");
    let lowercase = if let Some(val) = lowercase_val {
        TryConvert::try_convert(val)?
    } else {
        true
    };

    let remove_punctuation_val = config_hash.get("remove_punctuation");
    let remove_punctuation = if let Some(val) = remove_punctuation_val {
        TryConvert::try_convert(val)?
    } else {
        false
    };

    let preserve_patterns_val = config_hash.get("preserve_patterns");
    let preserve_patterns = if let Some(val) = preserve_patterns_val {
        let array: RArray = TryConvert::try_convert(val)?;
        let mut patterns = Vec::new();
        for idx in 0..array.len() {
            let item = array.entry(idx as isize)?;
            let pattern_str: String = TryConvert::try_convert(item)?;
            patterns.push(pattern_str);
        }
        patterns
    } else {
        Vec::new()
    };

    let config = TokenizerConfig {
        strategy,
        lowercase,
        remove_punctuation,
        preserve_patterns,
    };

    // Validate the configuration
    validate_config(&config)?;

    Ok(config)
}

// Validate configuration parameters
fn validate_config(config: &TokenizerConfig) -> std::result::Result<(), TokenizerError> {
    use TokenizerStrategy::*;

    match &config.strategy {
        EdgeNgram { min_gram, max_gram } | Ngram { min_gram, max_gram } => {
            if *min_gram == 0 {
                return Err(TokenizerError::InvalidNgramConfig {
                    min: *min_gram,
                    max: *max_gram,
                });
            }
            if min_gram > max_gram {
                return Err(TokenizerError::InvalidNgramConfig {
                    min: *min_gram,
                    max: *max_gram,
                });
            }
        }
        PathHierarchy { delimiter } => {
            if delimiter.is_empty() {
                return Err(TokenizerError::EmptyDelimiter {
                    tokenizer: "PathHierarchy".to_string(),
                });
            }
        }
        Pattern { regex } => {
            // Validate regex pattern
            regex::Regex::new(regex).map_err(|e| TokenizerError::InvalidRegex {
                pattern: regex.clone(),
                error: e.to_string(),
            })?;
        }
        _ => {}
    }

    // Validate preserve patterns
    for pattern in &config.preserve_patterns {
        regex::Regex::new(pattern).map_err(|e| TokenizerError::InvalidRegex {
            pattern: pattern.clone(),
            error: e.to_string(),
        })?;
    }

    Ok(())
}

// Load config is just an alias for configure (for backward compat)
fn load_config(config_hash: RHash) -> std::result::Result<(), Error> {
    configure(config_hash)
}

// Tokenize with a specific config (creates fresh tokenizer)
fn tokenize_with_config(text: String, config_hash: RHash) -> std::result::Result<Vec<String>, Error> {
    let config = parse_config_from_hash(config_hash)?;

    // Create fresh tokenizer from config
    let tokenizer = tokenizer::from_config(config)?;

    // Tokenize and return
    Ok(tokenizer.tokenize(&text))
}

#[magnus::init]
fn init(_ruby: &magnus::Ruby) -> std::result::Result<(), Error> {
    let module = define_module("TokenKit")?;

    // Public API functions
    module.define_module_function("_tokenize", function!(tokenize, 1))?;
    module.define_module_function("_configure", function!(configure, 1))?;
    module.define_module_function("_reset", function!(reset, 0))?;
    module.define_module_function("_config_hash", function!(config_hash, 0))?;
    module.define_module_function("_load_config", function!(load_config, 1))?;

    // New instance-based function
    module.define_module_function("_tokenize_with_config", function!(tokenize_with_config, 2))?;

    Ok(())
}
```
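
The entry point above keeps one process-wide default config and builds its tokenizer lazily, discarding the cached instance whenever `_configure` or `_reset` changes the config; `_tokenize_with_config` bypasses the cache by constructing a fresh tokenizer per call. A minimal, self-contained sketch of that same lazy-build/invalidate pattern (hypothetical names, not part of the gem):

```rust
use once_cell::sync::Lazy;
use std::sync::Mutex;

// Stand-in for the crate's TokenizerCache: a config value plus a lazily
// built artifact derived from it.
struct Cache {
    sep: char,
    splitter: Option<Box<dyn Fn(&str) -> Vec<String> + Send + Sync>>,
}

static CACHE: Lazy<Mutex<Cache>> = Lazy::new(|| Mutex::new(Cache { sep: ' ', splitter: None }));

fn tokenize(text: &str) -> Vec<String> {
    let mut cache = CACHE.lock().unwrap();
    if cache.splitter.is_none() {
        // Build the derived value once; reuse it until the config changes.
        let sep = cache.sep;
        cache.splitter = Some(Box::new(move |t| {
            t.split(sep).filter(|s| !s.is_empty()).map(str::to_string).collect()
        }));
    }
    (cache.splitter.as_ref().unwrap())(text)
}

fn configure(sep: char) {
    let mut cache = CACHE.lock().unwrap();
    cache.sep = sep;
    cache.splitter = None; // invalidate; rebuilt lazily on the next tokenize()
}

fn main() {
    println!("{:?}", tokenize("a b c")); // ["a", "b", "c"]
    configure('-');
    println!("{:?}", tokenize("a-b-c")); // ["a", "b", "c"]
}
```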

data/ext/tokenkit/src/tokenizer/base.rs (@@ -0,0 +1,41 @@)

```rust
use crate::config::TokenizerConfig;
use regex::Regex;

/// Common functionality for tokenizers that support preserve_patterns
/// Note: Since we validate patterns in validate_config(), they're guaranteed to be valid here
pub fn create_preserve_patterns(config: &TokenizerConfig) -> Vec<Regex> {
    config
        .preserve_patterns
        .iter()
        .map(|p| {
            // Safe to unwrap because patterns are validated in validate_config()
            Regex::new(p).expect("Pattern should have been validated")
        })
        .collect()
}

/// Base fields that most tokenizers need
pub struct BaseTokenizerFields {
    pub config: TokenizerConfig,
    pub preserve_patterns: Vec<Regex>,
}

impl BaseTokenizerFields {
    pub fn new(config: TokenizerConfig) -> Self {
        let preserve_patterns = create_preserve_patterns(&config);
        Self {
            config,
            preserve_patterns,
        }
    }

    pub fn has_preserve_patterns(&self) -> bool {
        !self.preserve_patterns.is_empty()
    }

    pub fn preserve_patterns(&self) -> &[Regex] {
        &self.preserve_patterns
    }
}
```

data/ext/tokenkit/src/tokenizer/char_group.rs (@@ -0,0 +1,62 @@)

```rust
use super::{apply_preserve_patterns_with_tokenizer, post_process, BaseTokenizerFields, Tokenizer};
use crate::config::TokenizerConfig;
use std::collections::HashSet;

pub struct CharGroupTokenizer {
    base: BaseTokenizerFields,
    split_chars: HashSet<char>,
}

impl CharGroupTokenizer {
    pub fn new(config: TokenizerConfig, split_on_chars: String) -> Self {
        // Note: Empty split_on_chars is valid - it makes the tokenizer behave like
        // a keyword tokenizer (no splitting, returns whole text as single token)
        let split_chars: HashSet<char> = split_on_chars.chars().collect();

        Self {
            base: BaseTokenizerFields::new(config),
            split_chars,
        }
    }

    fn tokenize_text(&self, text: &str) -> Vec<String> {
        let mut tokens = Vec::new();
        let mut current_token = String::new();

        for ch in text.chars() {
            if self.split_chars.contains(&ch) {
                if !current_token.is_empty() {
                    tokens.push(current_token.clone());
                    current_token.clear();
                }
            } else {
                current_token.push(ch);
            }
        }

        if !current_token.is_empty() {
            tokens.push(current_token);
        }

        tokens
    }
}

impl Tokenizer for CharGroupTokenizer {
    fn tokenize(&self, text: &str) -> Vec<String> {
        let tokens = self.tokenize_text(text);

        if self.base.has_preserve_patterns() {
            apply_preserve_patterns_with_tokenizer(
                tokens,
                self.base.preserve_patterns(),
                text,
                &self.base.config,
                |t| self.tokenize_text(t),
            )
        } else {
            post_process(tokens, &self.base.config)
        }
    }
}
```
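
For illustration, a standalone sketch of the splitting rule `tokenize_text` implements, without the lowercase/punctuation/preserve_patterns post-processing (the `char_group_split` helper is hypothetical, not part of the gem):

```rust
use std::collections::HashSet;

// Any character in the split set ends the current token; everything else accumulates.
fn char_group_split(text: &str, split_on_chars: &str) -> Vec<String> {
    let split_chars: HashSet<char> = split_on_chars.chars().collect();
    let mut tokens = Vec::new();
    let mut current = String::new();
    for ch in text.chars() {
        if split_chars.contains(&ch) {
            if !current.is_empty() {
                tokens.push(std::mem::take(&mut current));
            }
        } else {
            current.push(ch);
        }
    }
    if !current.is_empty() {
        tokens.push(current);
    }
    tokens
}

fn main() {
    // Splitting on space, comma and semicolon:
    assert_eq!(
        char_group_split("alpha,beta; gamma", ", ;"),
        vec!["alpha", "beta", "gamma"]
    );
    // An empty split set never splits, so the whole input is one token.
    assert_eq!(char_group_split("alpha beta", ""), vec!["alpha beta"]);
    println!("ok");
}
```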

data/ext/tokenkit/src/tokenizer/edge_ngram.rs (@@ -0,0 +1,73 @@)

```rust
use super::{Tokenizer};
use crate::config::TokenizerConfig;

pub struct EdgeNgramTokenizer {
    config: TokenizerConfig,
    min_gram: usize,
    max_gram: usize,
}

impl EdgeNgramTokenizer {
    pub fn new(config: TokenizerConfig, min_gram: usize, max_gram: usize) -> Self {
        // Validate and sanitize parameters
        let min_gram = min_gram.max(1); // Minimum 1 character
        let max_gram = max_gram.max(min_gram); // Ensure max >= min

        Self { config, min_gram, max_gram }
    }

    fn generate_edge_ngrams(&self, text: &str) -> Vec<String> {
        let mut ngrams = Vec::new();
        let chars: Vec<char> = text.chars().collect();
        let text_len = chars.len();

        if text_len == 0 {
            return ngrams;
        }

        let max = self.max_gram.min(text_len);

        for gram_size in self.min_gram..=max {
            let ngram: String = chars.iter().take(gram_size).collect();
            ngrams.push(ngram);
        }

        ngrams
    }
}

impl Tokenizer for EdgeNgramTokenizer {
    fn tokenize(&self, text: &str) -> Vec<String> {
        let mut all_ngrams = Vec::new();

        for word in text.split_whitespace() {
            if word.is_empty() {
                continue;
            }

            let processed_word = if self.config.remove_punctuation {
                word.chars()
                    .filter(|c| !c.is_ascii_punctuation())
                    .collect()
            } else {
                word.to_string()
            };

            if processed_word.is_empty() {
                continue;
            }

            let ngrams = self.generate_edge_ngrams(&processed_word);
            all_ngrams.extend(ngrams);
        }

        let mut result = all_ngrams;

        if self.config.lowercase {
            result = result.into_iter().map(|t| t.to_lowercase()).collect();
        }

        result
    }
}
```
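
A standalone sketch of the per-word prefix expansion `generate_edge_ngrams` performs (the `edge_ngrams` helper is hypothetical, not part of the gem):

```rust
// Edge n-grams are prefixes only, from min_gram to max_gram characters,
// with max_gram capped at the word length.
fn edge_ngrams(word: &str, min_gram: usize, max_gram: usize) -> Vec<String> {
    let chars: Vec<char> = word.chars().collect();
    if chars.is_empty() {
        return Vec::new();
    }
    let max = max_gram.min(chars.len());
    (min_gram..=max)
        .map(|n| chars.iter().take(n).collect())
        .collect()
}

fn main() {
    // "search" with min_gram = 2, max_gram = 4 yields its 2- to 4-character prefixes.
    assert_eq!(edge_ngrams("search", 2, 4), vec!["se", "sea", "sear"]);
    // A short word is capped at its own length.
    assert_eq!(edge_ngrams("hi", 2, 10), vec!["hi"]);
    println!("ok");
}
```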

data/ext/tokenkit/src/tokenizer/grapheme.rs (@@ -0,0 +1,26 @@)

```rust
use super::{post_process, Tokenizer};
use crate::config::TokenizerConfig;
use unicode_segmentation::UnicodeSegmentation;

pub struct GraphemeTokenizer {
    config: TokenizerConfig,
    extended: bool,
}

impl GraphemeTokenizer {
    pub fn new(config: TokenizerConfig, extended: bool) -> Self {
        Self { config, extended }
    }
}

impl Tokenizer for GraphemeTokenizer {
    fn tokenize(&self, text: &str) -> Vec<String> {
        let graphemes: Vec<String> = text
            .graphemes(self.extended)
            .map(|s| s.to_string())
            .collect();

        post_process(graphemes, &self.config)
    }
}
```
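
A small standalone check of why this tokenizer iterates grapheme clusters rather than `char`s: a user-perceived character can span several Unicode scalar values. It only assumes the unicode-segmentation crate, which `grapheme.rs` already imports:

```rust
use unicode_segmentation::UnicodeSegmentation;

fn main() {
    let text = "e\u{301}o"; // "é" written as 'e' plus a combining acute accent, then 'o'
    let graphemes: Vec<&str> = text.graphemes(true).collect();
    assert_eq!(graphemes, vec!["e\u{301}", "o"]); // two grapheme clusters
    assert_eq!(text.chars().count(), 3);          // but three scalar values
    println!("{:?}", graphemes);
}
```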

data/ext/tokenkit/src/tokenizer/keyword.rs (@@ -0,0 +1,25 @@)

```rust
use super::{post_process, Tokenizer};
use crate::config::TokenizerConfig;

pub struct KeywordTokenizer {
    config: TokenizerConfig,
}

impl KeywordTokenizer {
    pub fn new(config: TokenizerConfig) -> Self {
        Self { config }
    }
}

impl Tokenizer for KeywordTokenizer {
    fn tokenize(&self, text: &str) -> Vec<String> {
        let trimmed = text.trim();
        if trimmed.is_empty() {
            return vec![];
        }

        let tokens = vec![trimmed.to_string()];
        post_process(tokens, &self.config)
    }
}
```

data/ext/tokenkit/src/tokenizer/letter.rs (@@ -0,0 +1,41 @@)

```rust
use super::{apply_preserve_patterns, post_process, BaseTokenizerFields, Tokenizer};
use crate::config::TokenizerConfig;

pub struct LetterTokenizer {
    base: BaseTokenizerFields,
}

impl LetterTokenizer {
    pub fn new(config: TokenizerConfig) -> Self {
        Self {
            base: BaseTokenizerFields::new(config),
        }
    }
}

impl Tokenizer for LetterTokenizer {
    fn tokenize(&self, text: &str) -> Vec<String> {
        let mut tokens = Vec::new();
        let mut current_token = String::new();

        for ch in text.chars() {
            if ch.is_alphabetic() {
                current_token.push(ch);
            } else if !current_token.is_empty() {
                tokens.push(current_token.clone());
                current_token.clear();
            }
        }

        if !current_token.is_empty() {
            tokens.push(current_token);
        }

        if self.base.has_preserve_patterns() {
            apply_preserve_patterns(tokens, self.base.preserve_patterns(), text, &self.base.config)
        } else {
            post_process(tokens, &self.base.config)
        }
    }
}
```

data/ext/tokenkit/src/tokenizer/lowercase.rs (@@ -0,0 +1,51 @@)

```rust
use super::{apply_preserve_patterns, BaseTokenizerFields, Tokenizer};
use crate::config::TokenizerConfig;

pub struct LowercaseTokenizer {
    base: BaseTokenizerFields,
}

impl LowercaseTokenizer {
    pub fn new(config: TokenizerConfig) -> Self {
        Self {
            base: BaseTokenizerFields::new(config),
        }
    }
}

impl Tokenizer for LowercaseTokenizer {
    fn tokenize(&self, text: &str) -> Vec<String> {
        let mut tokens = Vec::new();
        let mut current_token = String::new();

        for ch in text.chars() {
            if ch.is_alphabetic() {
                for lowercase_ch in ch.to_lowercase() {
                    current_token.push(lowercase_ch);
                }
            } else if !current_token.is_empty() {
                tokens.push(current_token.clone());
                current_token.clear();
            }
        }

        if !current_token.is_empty() {
            tokens.push(current_token);
        }

        // Lowercase tokenizer always lowercases, ignore config.lowercase
        // Note: remove_punctuation has no effect since we already split on non-alphabetic
        // characters, but we keep it for consistency with the Tokenizer interface

        if self.base.has_preserve_patterns() {
            // For preserve_patterns, we need to pass a modified config that doesn't lowercase
            // because apply_preserve_patterns handles lowercasing for non-preserved tokens
            let mut modified_config = self.base.config.clone();
            modified_config.lowercase = true; // Force lowercase for non-preserved tokens
            apply_preserve_patterns(tokens, self.base.preserve_patterns(), text, &modified_config)
        } else {
            tokens
        }
    }
}
```
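
A small standalone check (not part of the gem) of why the inner loop above pushes every character yielded by `ch.to_lowercase()`: Unicode lowercasing is not always one-to-one, so the standard library returns an iterator rather than a single `char`.

```rust
fn main() {
    // U+0130 (Turkish dotted capital I) lowercases to 'i' plus a combining dot above.
    let lowered: String = 'İ'.to_lowercase().collect();
    assert_eq!(lowered, "i\u{307}");
    assert_eq!(lowered.chars().count(), 2); // one input char became two
    println!("{:?}", lowered);
}
```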