tokenkit 0.1.0.pre.1
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.standard.yml +3 -0
- data/.yardopts +12 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE.txt +21 -0
- data/README.md +644 -0
- data/Rakefile +18 -0
- data/benchmarks/cache_test.rb +63 -0
- data/benchmarks/final_comparison.rb +83 -0
- data/benchmarks/tokenizer_benchmark.rb +250 -0
- data/docs/ARCHITECTURE.md +469 -0
- data/docs/PERFORMANCE.md +382 -0
- data/docs/README.md +118 -0
- data/ext/tokenkit/Cargo.toml +21 -0
- data/ext/tokenkit/extconf.rb +4 -0
- data/ext/tokenkit/src/config.rs +37 -0
- data/ext/tokenkit/src/error.rs +67 -0
- data/ext/tokenkit/src/lib.rs +346 -0
- data/ext/tokenkit/src/tokenizer/base.rs +41 -0
- data/ext/tokenkit/src/tokenizer/char_group.rs +62 -0
- data/ext/tokenkit/src/tokenizer/edge_ngram.rs +73 -0
- data/ext/tokenkit/src/tokenizer/grapheme.rs +26 -0
- data/ext/tokenkit/src/tokenizer/keyword.rs +25 -0
- data/ext/tokenkit/src/tokenizer/letter.rs +41 -0
- data/ext/tokenkit/src/tokenizer/lowercase.rs +51 -0
- data/ext/tokenkit/src/tokenizer/mod.rs +254 -0
- data/ext/tokenkit/src/tokenizer/ngram.rs +80 -0
- data/ext/tokenkit/src/tokenizer/path_hierarchy.rs +187 -0
- data/ext/tokenkit/src/tokenizer/pattern.rs +38 -0
- data/ext/tokenkit/src/tokenizer/sentence.rs +89 -0
- data/ext/tokenkit/src/tokenizer/unicode.rs +36 -0
- data/ext/tokenkit/src/tokenizer/url_email.rs +108 -0
- data/ext/tokenkit/src/tokenizer/whitespace.rs +31 -0
- data/lib/tokenkit/config.rb +74 -0
- data/lib/tokenkit/config_builder.rb +209 -0
- data/lib/tokenkit/config_compat.rb +52 -0
- data/lib/tokenkit/configuration.rb +194 -0
- data/lib/tokenkit/regex_converter.rb +58 -0
- data/lib/tokenkit/version.rb +5 -0
- data/lib/tokenkit.rb +336 -0
- data/sig/tokenkit.rbs +4 -0
- metadata +172 -0
data/ext/tokenkit/src/tokenizer/mod.rs: @@ -0,0 +1,254 @@

```rust
mod base;
mod whitespace;
mod unicode;
mod pattern;
mod sentence;
mod grapheme;
mod keyword;
mod edge_ngram;
mod ngram;
mod path_hierarchy;
mod url_email;
mod char_group;
mod letter;
mod lowercase;

pub(crate) use base::BaseTokenizerFields;

pub use whitespace::WhitespaceTokenizer;
pub use unicode::UnicodeTokenizer;
pub use pattern::PatternTokenizer;
pub use sentence::SentenceTokenizer;
pub use grapheme::GraphemeTokenizer;
pub use keyword::KeywordTokenizer;
pub use edge_ngram::EdgeNgramTokenizer;
pub use ngram::NgramTokenizer;
pub use path_hierarchy::PathHierarchyTokenizer;
pub use url_email::UrlEmailTokenizer;
pub use char_group::CharGroupTokenizer;
pub use letter::LetterTokenizer;
pub use lowercase::LowercaseTokenizer;

use crate::config::{TokenizerConfig, TokenizerStrategy};
use crate::error::Result;
use regex::Regex;

pub trait Tokenizer: Send + Sync {
    fn tokenize(&self, text: &str) -> Vec<String>;
}

pub fn from_config(config: TokenizerConfig) -> Result<Box<dyn Tokenizer>> {
    match config.strategy.clone() {
        TokenizerStrategy::Whitespace => Ok(Box::new(WhitespaceTokenizer::new(config))),
        TokenizerStrategy::Unicode => Ok(Box::new(UnicodeTokenizer::new(config))),
        TokenizerStrategy::Pattern { regex } => {
            PatternTokenizer::new(&regex, config).map(|t| Box::new(t) as Box<dyn Tokenizer>)
        }
        TokenizerStrategy::Sentence => Ok(Box::new(SentenceTokenizer::new(config))),
        TokenizerStrategy::Grapheme { extended } => {
            Ok(Box::new(GraphemeTokenizer::new(config, extended)))
        }
        TokenizerStrategy::Keyword => Ok(Box::new(KeywordTokenizer::new(config))),
        TokenizerStrategy::EdgeNgram { min_gram, max_gram } => {
            Ok(Box::new(EdgeNgramTokenizer::new(config, min_gram, max_gram)))
        }
        TokenizerStrategy::PathHierarchy { delimiter } => {
            Ok(Box::new(PathHierarchyTokenizer::new(config, delimiter)))
        }
        TokenizerStrategy::UrlEmail => Ok(Box::new(UrlEmailTokenizer::new(config))),
        TokenizerStrategy::Ngram { min_gram, max_gram } => {
            Ok(Box::new(NgramTokenizer::new(config, min_gram, max_gram)))
        }
        TokenizerStrategy::CharGroup { split_on_chars } => {
            Ok(Box::new(CharGroupTokenizer::new(config, split_on_chars)))
        }
        TokenizerStrategy::Letter => Ok(Box::new(LetterTokenizer::new(config))),
        TokenizerStrategy::Lowercase => Ok(Box::new(LowercaseTokenizer::new(config))),
    }
}

pub(crate) fn merge_overlapping_spans(
    mut spans: Vec<(usize, usize, String)>,
) -> Vec<(usize, usize, String)> {
    if spans.is_empty() {
        return spans;
    }

    // Sort by start ascending; for equal starts, longer span first.
    spans.sort_by(|a, b| a.0.cmp(&b.0).then_with(|| b.1.cmp(&a.1)));

    let mut merged = Vec::new();
    let mut current = spans[0].clone();

    for span in spans.into_iter().skip(1) {
        if span.0 < current.1 {
            // Overlap: keep whichever span reaches further.
            if span.1 > current.1 {
                current = span;
            }
        } else {
            merged.push(current);
            current = span;
        }
    }
    merged.push(current);

    merged
}

// Optimized version that works with indices only
fn merge_overlapping_spans_optimized(mut spans: Vec<(usize, usize)>) -> Vec<(usize, usize)> {
    if spans.is_empty() {
        return spans;
    }

    spans.sort_unstable_by(|a, b| a.0.cmp(&b.0).then_with(|| b.1.cmp(&a.1)));

    let mut merged = Vec::with_capacity(spans.len());
    let mut current = spans[0];

    for span in spans.into_iter().skip(1) {
        if span.0 < current.1 {
            if span.1 > current.1 {
                current.1 = span.1;
            }
        } else {
            merged.push(current);
            current = span;
        }
    }
    merged.push(current);
    merged
}

pub(crate) fn apply_preserve_patterns(
    tokens: Vec<String>,
    preserve_patterns: &[Regex],
    original_text: &str,
    config: &TokenizerConfig,
) -> Vec<String> {
    apply_preserve_patterns_with_tokenizer(
        tokens,
        preserve_patterns,
        original_text,
        config,
        tokenize_simple,
    )
}

pub(crate) fn apply_preserve_patterns_with_tokenizer<F>(
    tokens: Vec<String>,
    preserve_patterns: &[Regex],
    original_text: &str,
    config: &TokenizerConfig,
    tokenizer_fn: F,
) -> Vec<String>
where
    F: Fn(&str) -> Vec<String>,
{
    if preserve_patterns.is_empty() {
        return tokens;
    }

    // Use indices instead of allocating strings upfront
    let mut preserved_spans: Vec<(usize, usize)> = Vec::with_capacity(32);
    for pattern in preserve_patterns {
        for mat in pattern.find_iter(original_text) {
            preserved_spans.push((mat.start(), mat.end()));
        }
    }

    if preserved_spans.is_empty() {
        return tokens;
    }

    let preserved_spans = merge_overlapping_spans_optimized(preserved_spans);

    // Pre-allocate result vector with estimated capacity
    let mut result = Vec::with_capacity(tokens.len() + preserved_spans.len());
    let mut pos = 0;

    for (start, end) in preserved_spans {
        if start > pos {
            let before = &original_text[pos..start];
            let mut before_tokens = tokenizer_fn(before);
            post_process_in_place(&mut before_tokens, config);
            result.extend(before_tokens);
        }
        // Extract preserved text only when needed
        result.push(original_text[start..end].to_string());
        pos = end;
    }

    if pos < original_text.len() {
        let remaining = &original_text[pos..];
        let mut remaining_tokens = tokenizer_fn(remaining);
        post_process_in_place(&mut remaining_tokens, config);
        result.extend(remaining_tokens);
    }

    result
}

fn tokenize_simple(text: &str) -> Vec<String> {
    text.split_whitespace()
        .filter(|s| !s.is_empty())
        .map(|s| s.to_string())
        .collect()
}

pub(crate) fn post_process(tokens: Vec<String>, config: &TokenizerConfig) -> Vec<String> {
    post_process_with_preserved(tokens, config, None)
}

// In-place version to avoid allocation
fn post_process_in_place(tokens: &mut Vec<String>, config: &TokenizerConfig) {
    if config.lowercase {
        for token in tokens.iter_mut() {
            *token = token.to_lowercase();
        }
    }

    if config.remove_punctuation {
        tokens.retain_mut(|token| {
            token.retain(|c| !c.is_ascii_punctuation());
            !token.is_empty()
        });
    }
}

pub(crate) fn post_process_with_preserved(
    mut tokens: Vec<String>,
    config: &TokenizerConfig,
    preserve_chars: Option<&str>,
) -> Vec<String> {
    if config.lowercase {
        tokens = tokens.into_iter().map(|t| t.to_lowercase()).collect();
    }

    if config.remove_punctuation {
        tokens = tokens
            .into_iter()
            .map(|t| {
                t.chars()
                    .filter(|c| {
                        if let Some(preserved) = preserve_chars {
                            if preserved.contains(*c) {
                                return true;
                            }
                        }
                        !c.is_ascii_punctuation()
                    })
                    .collect()
            })
            .filter(|s: &String| !s.is_empty())
            .collect();
    }

    tokens
}
```
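Note: the preserve-pattern pipeline above hinges on the span-merge step. As a quick sanity check, here is a minimal standalone sketch (not part of the gem) that mirrors `merge_overlapping_spans_optimized` on illustrative byte offsets:

```rust
// Standalone mirror of merge_overlapping_spans_optimized; spans are (start, end) byte offsets.
fn merge(mut spans: Vec<(usize, usize)>) -> Vec<(usize, usize)> {
    if spans.is_empty() {
        return spans;
    }
    // Sort by start ascending; for equal starts, longer span first.
    spans.sort_unstable_by(|a, b| a.0.cmp(&b.0).then_with(|| b.1.cmp(&a.1)));
    let mut merged = Vec::with_capacity(spans.len());
    let mut current = spans[0];
    for span in spans.into_iter().skip(1) {
        if span.0 < current.1 {
            // Overlap: extend the current span if the new one reaches further.
            if span.1 > current.1 {
                current.1 = span.1;
            }
        } else {
            merged.push(current);
            current = span;
        }
    }
    merged.push(current);
    merged
}

fn main() {
    // (2, 9) and (5, 12) overlap and collapse into (2, 12); (14, 20) is disjoint.
    assert_eq!(merge(vec![(5, 12), (2, 9), (14, 20)]), vec![(2, 12), (14, 20)]);
}
```

Merging before slicing is what keeps `apply_preserve_patterns` from emitting the same preserved region twice when two patterns match overlapping text.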
data/ext/tokenkit/src/tokenizer/ngram.rs: @@ -0,0 +1,80 @@

```rust
use super::Tokenizer;
use crate::config::TokenizerConfig;

pub struct NgramTokenizer {
    config: TokenizerConfig,
    min_gram: usize,
    max_gram: usize,
}

impl NgramTokenizer {
    pub fn new(config: TokenizerConfig, min_gram: usize, max_gram: usize) -> Self {
        // Validate and sanitize parameters
        let min_gram = min_gram.max(1); // Minimum 1 character
        let max_gram = max_gram.max(min_gram); // Ensure max >= min

        Self {
            config,
            min_gram,
            max_gram,
        }
    }

    fn generate_ngrams(&self, text: &str) -> Vec<String> {
        let mut ngrams = Vec::new();
        let chars: Vec<char> = text.chars().collect();
        let text_len = chars.len();

        if text_len == 0 {
            return ngrams;
        }

        let max = self.max_gram.min(text_len);

        for gram_size in self.min_gram..=max {
            for start in 0..=(text_len - gram_size) {
                let ngram: String = chars.iter().skip(start).take(gram_size).collect();
                ngrams.push(ngram);
            }
        }

        ngrams
    }
}

impl Tokenizer for NgramTokenizer {
    fn tokenize(&self, text: &str) -> Vec<String> {
        let mut all_ngrams = Vec::new();

        for word in text.split_whitespace() {
            if word.is_empty() {
                continue;
            }

            let processed_word = if self.config.remove_punctuation {
                word.chars()
                    .filter(|c| !c.is_ascii_punctuation())
                    .collect()
            } else {
                word.to_string()
            };

            if processed_word.is_empty() {
                continue;
            }

            let ngrams = self.generate_ngrams(&processed_word);
            all_ngrams.extend(ngrams);
        }

        // Apply lowercase if needed. Note: remove_punctuation already handled above.
        let mut result = all_ngrams;

        if self.config.lowercase {
            result = result.into_iter().map(|t| t.to_lowercase()).collect();
        }

        result
    }
}
```
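Note: the per-word expansion in `generate_ngrams` behaves like this minimal standalone sketch (illustrative parameters; the real tokenizer additionally clamps `min_gram` to at least 1):

```rust
// Sliding-window character n-grams, as in NgramTokenizer::generate_ngrams.
fn ngrams(word: &str, min_gram: usize, max_gram: usize) -> Vec<String> {
    let chars: Vec<char> = word.chars().collect();
    let mut out = Vec::new();
    let max = max_gram.min(chars.len());
    // Empty range when the word is shorter than min_gram: no n-grams emitted.
    for size in min_gram..=max {
        for start in 0..=(chars.len() - size) {
            out.push(chars[start..start + size].iter().collect());
        }
    }
    out
}

fn main() {
    // With min_gram = 2 and max_gram = 3, "fox" yields every 2- and 3-char window.
    assert_eq!(ngrams("fox", 2, 3), vec!["fo", "ox", "fox"]);
}
```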
data/ext/tokenkit/src/tokenizer/path_hierarchy.rs: @@ -0,0 +1,187 @@

```rust
use super::{post_process_with_preserved, BaseTokenizerFields, Tokenizer};
use crate::config::TokenizerConfig;

pub struct PathHierarchyTokenizer {
    base: BaseTokenizerFields,
    delimiter: String,
}

impl PathHierarchyTokenizer {
    pub fn new(config: TokenizerConfig, delimiter: String) -> Self {
        Self {
            base: BaseTokenizerFields::new(config),
            delimiter,
        }
    }

    fn generate_hierarchy(&self, path: &str) -> Vec<String> {
        let mut tokens = Vec::new();
        let parts: Vec<&str> = path.split(&self.delimiter).collect();

        let mut current_path = String::new();
        let starts_with_delimiter = path.starts_with(&self.delimiter);

        for part in parts.iter() {
            if part.is_empty() {
                continue;
            }

            if !current_path.is_empty() {
                current_path.push_str(&self.delimiter);
            } else if starts_with_delimiter {
                current_path.push_str(&self.delimiter);
            }

            current_path.push_str(part);
            tokens.push(current_path.clone());
        }

        tokens
    }

    fn apply_patterns_to_hierarchy(&self, text: &str) -> Vec<String> {
        if self.base.preserve_patterns().is_empty() {
            return self.generate_hierarchy(text);
        }

        // Generate all hierarchical tokens first
        let all_tokens = self.generate_hierarchy(text);

        // Find which tokens are completely matched by preserve patterns
        let mut preserved_tokens = Vec::new();
        for token in &all_tokens {
            for pattern in self.base.preserve_patterns() {
                if let Some(mat) = pattern.find(token) {
                    if mat.as_str() == token {
                        preserved_tokens.push(token.clone());
                        break;
                    }
                }
            }
        }

        // Now build the result, applying lowercase where appropriate
        let mut result = Vec::new();
        for token in all_tokens {
            // Check if this token should be included.
            // Include if: it's a preserved token OR it extends beyond a preserved token.
            let should_include;
            let mut apply_lowercase = self.base.config.lowercase;

            if preserved_tokens.contains(&token) {
                should_include = true;
                apply_lowercase = false; // Don't lowercase preserved tokens
            } else {
                // Check if this token extends a preserved token
                let mut extends_preserved = false;
                for preserved in &preserved_tokens {
                    if token.starts_with(preserved) && token.len() > preserved.len() {
                        extends_preserved = true;
                        break;
                    }
                }

                if extends_preserved {
                    should_include = true;
                } else {
                    // Include if no preserved token is a prefix of this one
                    let mut has_preserved_prefix = false;
                    for preserved in &preserved_tokens {
                        if preserved.starts_with(&token) && preserved != &token {
                            has_preserved_prefix = true;
                            break;
                        }
                    }
                    should_include = !has_preserved_prefix;
                }
            }

            if should_include {
                if apply_lowercase && !preserved_tokens.contains(&token) {
                    // Apply lowercase to non-preserved parts
                    let mut lowercased = String::new();
                    let starts_with_delim = token.starts_with(&self.delimiter);
                    let parts: Vec<&str> = token.split(&self.delimiter).collect();

                    for (i, part) in parts.iter().enumerate() {
                        if part.is_empty() {
                            if i == 0 && starts_with_delim {
                                // Path starts with delimiter, add it once
                                lowercased.push_str(&self.delimiter);
                            }
                            continue;
                        }

                        if i > 0 || (i == 0 && starts_with_delim) {
                            if !lowercased.is_empty() && !lowercased.ends_with(&self.delimiter) {
                                lowercased.push_str(&self.delimiter);
                            }
                        }

                        // Check if this specific part should be preserved
                        let mut preserve_part = false;
                        for pattern in self.base.preserve_patterns() {
                            if pattern.is_match(part) {
                                preserve_part = true;
                                break;
                            }
                        }

                        if preserve_part {
                            lowercased.push_str(part);
                        } else {
                            lowercased.push_str(&part.to_lowercase());
                        }
                    }
                    result.push(lowercased);
                } else {
                    result.push(token);
                }
            }
        }

        result
    }
}

impl Tokenizer for PathHierarchyTokenizer {
    fn tokenize(&self, text: &str) -> Vec<String> {
        let trimmed = text.trim();
        if trimmed.is_empty() {
            return vec![];
        }

        if self.base.has_preserve_patterns() {
            let mut tokens = self.apply_patterns_to_hierarchy(trimmed);

            // Apply remove_punctuation if needed (but preserve delimiters)
            if self.base.config.remove_punctuation {
                tokens = tokens
                    .into_iter()
                    .map(|token| {
                        let parts: Vec<&str> = token.split(&self.delimiter).collect();
                        let processed: Vec<String> = parts
                            .iter()
                            .map(|part| {
                                if part.is_empty() {
                                    String::new()
                                } else {
                                    // Check if this part should be preserved
                                    let should_preserve = self
                                        .base
                                        .preserve_patterns()
                                        .iter()
                                        .any(|p| p.is_match(part));
                                    if should_preserve {
                                        part.to_string()
                                    } else {
                                        part.chars()
                                            .filter(|c| {
                                                !c.is_ascii_punctuation()
                                                    || self.delimiter.contains(*c)
                                            })
                                            .collect()
                                    }
                                }
                            })
                            .collect();
                        processed.join(&self.delimiter)
                    })
                    .filter(|s| !s.is_empty() && s != &self.delimiter)
                    .collect();
            }

            tokens
        } else {
            let tokens = self.generate_hierarchy(trimmed);
            post_process_with_preserved(tokens, &self.base.config, Some(&self.delimiter))
        }
    }
}
```
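Note: the hierarchy expansion is easiest to see on a concrete path. A standalone sketch of the `generate_hierarchy` logic (not the gem's API, same prefix-expansion idea):

```rust
// Prefix expansion as in PathHierarchyTokenizer::generate_hierarchy.
fn hierarchy(path: &str, delimiter: &str) -> Vec<String> {
    let leading = path.starts_with(delimiter);
    let mut current = String::new();
    let mut out = Vec::new();
    for part in path.split(delimiter).filter(|p| !p.is_empty()) {
        // Re-attach the delimiter between parts, and once up front for absolute paths.
        if !current.is_empty() || leading {
            current.push_str(delimiter);
        }
        current.push_str(part);
        out.push(current.clone());
    }
    out
}

fn main() {
    assert_eq!(
        hierarchy("/usr/local/bin", "/"),
        vec!["/usr", "/usr/local", "/usr/local/bin"]
    );
}
```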
data/ext/tokenkit/src/tokenizer/pattern.rs: @@ -0,0 +1,38 @@

```rust
use super::{apply_preserve_patterns, post_process, BaseTokenizerFields, Tokenizer};
use crate::config::TokenizerConfig;
use crate::error::Result;
use regex::Regex;

pub struct PatternTokenizer {
    base: BaseTokenizerFields,
    pattern: Regex,
}

impl PatternTokenizer {
    pub fn new(regex: &str, config: TokenizerConfig) -> Result<Self> {
        // Pattern is already validated in validate_config(), safe to unwrap
        let pattern = Regex::new(regex).expect("Pattern should have been validated");

        Ok(Self {
            base: BaseTokenizerFields::new(config),
            pattern,
        })
    }
}

impl Tokenizer for PatternTokenizer {
    fn tokenize(&self, text: &str) -> Vec<String> {
        let tokens: Vec<String> = self
            .pattern
            .find_iter(text)
            .map(|mat| mat.as_str().to_string())
            .collect();

        if self.base.has_preserve_patterns() {
            apply_preserve_patterns(tokens, self.base.preserve_patterns(), text, &self.base.config)
        } else {
            post_process(tokens, &self.base.config)
        }
    }
}
```
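Note: a quick illustration of the match-as-token approach, using the same `regex` crate the extension depends on (the pattern below is illustrative, not the gem's default):

```rust
use regex::Regex;

fn main() {
    // Each regex match becomes a token; everything between matches is dropped.
    let pattern = Regex::new(r"[\w']+").unwrap();
    let tokens: Vec<&str> = pattern
        .find_iter("it's a test")
        .map(|m| m.as_str())
        .collect();
    assert_eq!(tokens, vec!["it's", "a", "test"]);
}
```

This is why the pattern strategy inverts the usual model: the regex describes the tokens themselves rather than the separators.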
data/ext/tokenkit/src/tokenizer/sentence.rs: @@ -0,0 +1,89 @@

```rust
use super::{post_process, BaseTokenizerFields, Tokenizer};
use crate::config::TokenizerConfig;
use unicode_segmentation::UnicodeSegmentation;

pub struct SentenceTokenizer {
    base: BaseTokenizerFields,
}

impl SentenceTokenizer {
    pub fn new(config: TokenizerConfig) -> Self {
        Self {
            base: BaseTokenizerFields::new(config),
        }
    }

    fn apply_patterns_to_sentence(&self, sentence: &str) -> String {
        if self.base.preserve_patterns().is_empty() || !self.base.config.lowercase {
            return sentence.to_string();
        }

        // Find all matches in the sentence
        let mut preserved_spans: Vec<(usize, usize, String)> = Vec::new();
        for pattern in self.base.preserve_patterns() {
            for mat in pattern.find_iter(sentence) {
                preserved_spans.push((mat.start(), mat.end(), mat.as_str().to_string()));
            }
        }

        if preserved_spans.is_empty() {
            return sentence.to_string();
        }

        // Sort by start ascending; overlaps are handled below via pos tracking
        preserved_spans.sort_by(|a, b| a.0.cmp(&b.0).then_with(|| b.1.cmp(&a.1)));

        let mut result = String::new();
        let mut pos = 0;

        for (start, end, preserved) in preserved_spans {
            if start > pos {
                // Lowercase the text before the preserved pattern
                result.push_str(&sentence[pos..start].to_lowercase());
            }
            // Keep the preserved pattern as-is
            result.push_str(&preserved);
            pos = end.max(pos); // Handle overlaps
        }

        if pos < sentence.len() {
            // Lowercase the remaining text
            result.push_str(&sentence[pos..].to_lowercase());
        }

        result
    }
}

impl Tokenizer for SentenceTokenizer {
    fn tokenize(&self, text: &str) -> Vec<String> {
        let mut sentences: Vec<String> = text
            .unicode_sentences()
            .map(|s| s.to_string())
            .collect();

        // Apply preserve patterns to each sentence
        if self.base.has_preserve_patterns() && self.base.config.lowercase {
            sentences = sentences
                .into_iter()
                .map(|sentence| self.apply_patterns_to_sentence(&sentence))
                .collect();

            // Don't call post_process since we already handled lowercasing with patterns.
            // Just handle remove_punctuation if needed.
            if self.base.config.remove_punctuation {
                sentences = sentences
                    .into_iter()
                    .map(|s| s.chars().filter(|c| !c.is_ascii_punctuation()).collect())
                    .filter(|s: &String| !s.is_empty())
                    .collect();
            }
            sentences
        } else {
            post_process(sentences, &self.base.config)
        }
    }
}
```
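Note: the splitting itself is delegated to `unicode_sentences()` from the unicode-segmentation crate. A standalone sketch of what it returns (output noted as expected under UAX #29; worth verifying against your crate version):

```rust
use unicode_segmentation::UnicodeSegmentation;

fn main() {
    // UAX #29 sentence boundaries; trailing whitespace attaches to the
    // preceding sentence, so this should print "Hello world. " then "How are you?".
    for sentence in "Hello world. How are you?".unicode_sentences() {
        println!("{:?}", sentence);
    }
}
```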
data/ext/tokenkit/src/tokenizer/unicode.rs: @@ -0,0 +1,36 @@

```rust
use super::{apply_preserve_patterns, post_process, BaseTokenizerFields, Tokenizer};
use crate::config::TokenizerConfig;
use unicode_segmentation::UnicodeSegmentation;

pub struct UnicodeTokenizer {
    base: BaseTokenizerFields,
}

impl UnicodeTokenizer {
    pub fn new(config: TokenizerConfig) -> Self {
        Self {
            base: BaseTokenizerFields::new(config),
        }
    }
}

impl Tokenizer for UnicodeTokenizer {
    fn tokenize(&self, text: &str) -> Vec<String> {
        // Split on UAX #29 word boundaries.
        let tokens: Vec<String> = text
            .unicode_words()
            .map(|s| s.to_string())
            .collect();

        if self.base.has_preserve_patterns() {
            apply_preserve_patterns(tokens, self.base.preserve_patterns(), text, &self.base.config)
        } else {
            post_process(tokens, &self.base.config)
        }
    }
}
```
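Note: and the word-level counterpart, `unicode_words()`, which drops punctuation and whitespace at UAX #29 word boundaries. A standalone sketch:

```rust
use unicode_segmentation::UnicodeSegmentation;

fn main() {
    // "Don't" survives as one word; the comma and "!" sit on boundaries and are dropped.
    let words: Vec<&str> = "Don't panic, folks!".unicode_words().collect();
    assert_eq!(words, vec!["Don't", "panic", "folks"]);
}
```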