tokenkit 0.1.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.standard.yml +3 -0
  4. data/.yardopts +12 -0
  5. data/CODE_OF_CONDUCT.md +132 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +644 -0
  8. data/Rakefile +18 -0
  9. data/benchmarks/cache_test.rb +63 -0
  10. data/benchmarks/final_comparison.rb +83 -0
  11. data/benchmarks/tokenizer_benchmark.rb +250 -0
  12. data/docs/ARCHITECTURE.md +469 -0
  13. data/docs/PERFORMANCE.md +382 -0
  14. data/docs/README.md +118 -0
  15. data/ext/tokenkit/Cargo.toml +21 -0
  16. data/ext/tokenkit/extconf.rb +4 -0
  17. data/ext/tokenkit/src/config.rs +37 -0
  18. data/ext/tokenkit/src/error.rs +67 -0
  19. data/ext/tokenkit/src/lib.rs +346 -0
  20. data/ext/tokenkit/src/tokenizer/base.rs +41 -0
  21. data/ext/tokenkit/src/tokenizer/char_group.rs +62 -0
  22. data/ext/tokenkit/src/tokenizer/edge_ngram.rs +73 -0
  23. data/ext/tokenkit/src/tokenizer/grapheme.rs +26 -0
  24. data/ext/tokenkit/src/tokenizer/keyword.rs +25 -0
  25. data/ext/tokenkit/src/tokenizer/letter.rs +41 -0
  26. data/ext/tokenkit/src/tokenizer/lowercase.rs +51 -0
  27. data/ext/tokenkit/src/tokenizer/mod.rs +254 -0
  28. data/ext/tokenkit/src/tokenizer/ngram.rs +80 -0
  29. data/ext/tokenkit/src/tokenizer/path_hierarchy.rs +187 -0
  30. data/ext/tokenkit/src/tokenizer/pattern.rs +38 -0
  31. data/ext/tokenkit/src/tokenizer/sentence.rs +89 -0
  32. data/ext/tokenkit/src/tokenizer/unicode.rs +36 -0
  33. data/ext/tokenkit/src/tokenizer/url_email.rs +108 -0
  34. data/ext/tokenkit/src/tokenizer/whitespace.rs +31 -0
  35. data/lib/tokenkit/config.rb +74 -0
  36. data/lib/tokenkit/config_builder.rb +209 -0
  37. data/lib/tokenkit/config_compat.rb +52 -0
  38. data/lib/tokenkit/configuration.rb +194 -0
  39. data/lib/tokenkit/regex_converter.rb +58 -0
  40. data/lib/tokenkit/version.rb +5 -0
  41. data/lib/tokenkit.rb +336 -0
  42. data/sig/tokenkit.rbs +4 -0
  43. metadata +172 -0
data/ext/tokenkit/src/lib.rs
@@ -0,0 +1,346 @@
+ mod config;
+ mod error;
+ mod tokenizer;
+
+ use config::{TokenizerConfig, TokenizerStrategy};
+ use error::TokenizerError;
+ use magnus::{define_module, function, Error, RArray, RHash, TryConvert};
+ use std::sync::Mutex;
+ use once_cell::sync::Lazy;
+
+ // Store the default configuration and a cached tokenizer
+ struct TokenizerCache {
+     config: TokenizerConfig,
+     tokenizer: Option<Box<dyn tokenizer::Tokenizer + Send + Sync>>,
+ }
+
+ static DEFAULT_CACHE: Lazy<Mutex<TokenizerCache>> = Lazy::new(|| {
+     Mutex::new(TokenizerCache {
+         config: TokenizerConfig::default(),
+         tokenizer: None,
+     })
+ });
+
+ // Use cached tokenizer if config hasn't changed
+ fn tokenize(text: String) -> std::result::Result<Vec<String>, Error> {
+     let mut cache = DEFAULT_CACHE
+         .lock()
+         .map_err(|e| TokenizerError::MutexError(e.to_string()))?;
+
+     // Check if we need to create a new tokenizer
+     if cache.tokenizer.is_none() {
+         let tokenizer = tokenizer::from_config(cache.config.clone())?;
+         cache.tokenizer = Some(tokenizer);
+     }
+
+     // Use the cached tokenizer
+     let result = cache
+         .tokenizer
+         .as_ref()
+         .unwrap()
+         .tokenize(&text);
+
+     Ok(result)
+ }
+
+ // Configure sets the default configuration and invalidates cache
+ fn configure(config_hash: RHash) -> std::result::Result<(), Error> {
+     let config = parse_config_from_hash(config_hash)?;
+
+     // Update cache with new config and clear tokenizer
+     let mut cache = DEFAULT_CACHE
+         .lock()
+         .map_err(|e| TokenizerError::MutexError(e.to_string()))?;
+     cache.config = config;
+     cache.tokenizer = None; // Invalidate cached tokenizer
+
+     Ok(())
+ }
+
+ // Reset to factory defaults
+ fn reset() -> std::result::Result<(), Error> {
+     let mut cache = DEFAULT_CACHE
+         .lock()
+         .map_err(|e| TokenizerError::MutexError(e.to_string()))?;
+     cache.config = TokenizerConfig::default();
+     cache.tokenizer = None; // Clear cached tokenizer
+     Ok(())
+ }
+
+ // Get current default configuration
+ fn config_hash() -> std::result::Result<RHash, Error> {
+     let cache = DEFAULT_CACHE
+         .lock()
+         .map_err(|e| TokenizerError::MutexError(e.to_string()))?;
+
+     config_to_hash(&cache.config)
+ }
+
+ // Helper function to convert config to RHash
+ fn config_to_hash(config: &TokenizerConfig) -> std::result::Result<RHash, Error> {
+     let hash = RHash::new();
+
+     let strategy_str = match &config.strategy {
+         TokenizerStrategy::Whitespace => "whitespace",
+         TokenizerStrategy::Unicode => "unicode",
+         TokenizerStrategy::Pattern { .. } => "pattern",
+         TokenizerStrategy::Sentence => "sentence",
+         TokenizerStrategy::Grapheme { .. } => "grapheme",
+         TokenizerStrategy::Keyword => "keyword",
+         TokenizerStrategy::EdgeNgram { .. } => "edge_ngram",
+         TokenizerStrategy::Ngram { .. } => "ngram",
+         TokenizerStrategy::PathHierarchy { .. } => "path_hierarchy",
+         TokenizerStrategy::UrlEmail => "url_email",
+         TokenizerStrategy::CharGroup { .. } => "char_group",
+         TokenizerStrategy::Letter => "letter",
+         TokenizerStrategy::Lowercase => "lowercase",
+     };
+     hash.aset("strategy", strategy_str)?;
+
+     if let TokenizerStrategy::Pattern { regex } = &config.strategy {
+         hash.aset("regex", regex.as_str())?;
+     }
+
+     if let TokenizerStrategy::Grapheme { extended } = &config.strategy {
+         hash.aset("extended", *extended)?;
+     }
+
+     if let TokenizerStrategy::EdgeNgram { min_gram, max_gram } = &config.strategy {
+         hash.aset("min_gram", *min_gram)?;
+         hash.aset("max_gram", *max_gram)?;
+     }
+
+     if let TokenizerStrategy::PathHierarchy { delimiter } = &config.strategy {
+         hash.aset("delimiter", delimiter.as_str())?;
+     }
+
+     if let TokenizerStrategy::Ngram { min_gram, max_gram } = &config.strategy {
+         hash.aset("min_gram", *min_gram)?;
+         hash.aset("max_gram", *max_gram)?;
+     }
+
+     if let TokenizerStrategy::CharGroup { split_on_chars } = &config.strategy {
+         hash.aset("split_on_chars", split_on_chars.as_str())?;
+     }
+
+     hash.aset("lowercase", config.lowercase)?;
+     hash.aset("remove_punctuation", config.remove_punctuation)?;
+
+     let patterns = RArray::new();
+     for pattern in &config.preserve_patterns {
+         patterns.push(pattern.as_str())?;
+     }
+     hash.aset("preserve_patterns", patterns)?;
+
+     Ok(hash)
+ }
+
+ // Parse config from Ruby hash
+ fn parse_config_from_hash(config_hash: RHash) -> std::result::Result<TokenizerConfig, Error> {
+     let strategy_val = config_hash.get("strategy");
+     let strategy = if let Some(val) = strategy_val {
+         let strategy_str: String = TryConvert::try_convert(val)?;
+         match strategy_str.as_str() {
+             "whitespace" => TokenizerStrategy::Whitespace,
+             "unicode" => TokenizerStrategy::Unicode,
+             "pattern" => {
+                 let regex_val = config_hash
+                     .get("regex")
+                     .ok_or_else(|| {
+                         TokenizerError::InvalidConfiguration(
+                             "pattern strategy requires regex parameter".to_string()
+                         )
+                     })?;
+                 let regex: String = TryConvert::try_convert(regex_val)?;
+                 TokenizerStrategy::Pattern { regex }
+             }
+             "sentence" => TokenizerStrategy::Sentence,
+             "grapheme" => {
+                 let extended_val = config_hash.get("extended");
+                 let extended = if let Some(val) = extended_val {
+                     TryConvert::try_convert(val)?
+                 } else {
+                     true
+                 };
+                 TokenizerStrategy::Grapheme { extended }
+             }
+             "keyword" => TokenizerStrategy::Keyword,
+             "edge_ngram" => {
+                 let min_gram_val = config_hash.get("min_gram");
+                 let min_gram = if let Some(val) = min_gram_val {
+                     TryConvert::try_convert(val)?
+                 } else {
+                     2
+                 };
+                 let max_gram_val = config_hash.get("max_gram");
+                 let max_gram = if let Some(val) = max_gram_val {
+                     TryConvert::try_convert(val)?
+                 } else {
+                     10
+                 };
+                 TokenizerStrategy::EdgeNgram { min_gram, max_gram }
+             }
+             "path_hierarchy" => {
+                 let delimiter_val = config_hash.get("delimiter");
+                 let delimiter = if let Some(val) = delimiter_val {
+                     TryConvert::try_convert(val)?
+                 } else {
+                     "/".to_string()
+                 };
+                 TokenizerStrategy::PathHierarchy { delimiter }
+             }
+             "url_email" => TokenizerStrategy::UrlEmail,
+             "ngram" => {
+                 let min_gram_val = config_hash.get("min_gram");
+                 let min_gram = if let Some(val) = min_gram_val {
+                     TryConvert::try_convert(val)?
+                 } else {
+                     2
+                 };
+                 let max_gram_val = config_hash.get("max_gram");
+                 let max_gram = if let Some(val) = max_gram_val {
+                     TryConvert::try_convert(val)?
+                 } else {
+                     10
+                 };
+                 TokenizerStrategy::Ngram { min_gram, max_gram }
+             }
+             "char_group" => {
+                 let split_on_chars_val = config_hash.get("split_on_chars");
+                 let split_on_chars = if let Some(val) = split_on_chars_val {
+                     TryConvert::try_convert(val)?
+                 } else {
+                     " \t\n\r".to_string()
+                 };
+                 TokenizerStrategy::CharGroup { split_on_chars }
+             }
+             "letter" => TokenizerStrategy::Letter,
+             "lowercase" => TokenizerStrategy::Lowercase,
+             _ => {
+                 return Err(TokenizerError::UnknownStrategy(strategy_str).into())
+             }
+         }
+     } else {
+         TokenizerStrategy::Unicode
+     };
+
+     let lowercase_val = config_hash.get("lowercase");
+     let lowercase = if let Some(val) = lowercase_val {
+         TryConvert::try_convert(val)?
+     } else {
+         true
+     };
+
+     let remove_punctuation_val = config_hash.get("remove_punctuation");
+     let remove_punctuation = if let Some(val) = remove_punctuation_val {
+         TryConvert::try_convert(val)?
+     } else {
+         false
+     };
+
+     let preserve_patterns_val = config_hash.get("preserve_patterns");
+     let preserve_patterns = if let Some(val) = preserve_patterns_val {
+         let array: RArray = TryConvert::try_convert(val)?;
+         let mut patterns = Vec::new();
+         for idx in 0..array.len() {
+             let item = array.entry(idx as isize)?;
+             let pattern_str: String = TryConvert::try_convert(item)?;
+             patterns.push(pattern_str);
+         }
+         patterns
+     } else {
+         Vec::new()
+     };
+
+     let config = TokenizerConfig {
+         strategy,
+         lowercase,
+         remove_punctuation,
+         preserve_patterns,
+     };
+
+     // Validate the configuration
+     validate_config(&config)?;
+
+     Ok(config)
+ }
+
+ // Validate configuration parameters
+ fn validate_config(config: &TokenizerConfig) -> std::result::Result<(), TokenizerError> {
+     use TokenizerStrategy::*;
+
+     match &config.strategy {
+         EdgeNgram { min_gram, max_gram } | Ngram { min_gram, max_gram } => {
+             if *min_gram == 0 {
+                 return Err(TokenizerError::InvalidNgramConfig {
+                     min: *min_gram,
+                     max: *max_gram,
+                 });
+             }
+             if min_gram > max_gram {
+                 return Err(TokenizerError::InvalidNgramConfig {
+                     min: *min_gram,
+                     max: *max_gram,
+                 });
+             }
+         }
+         PathHierarchy { delimiter } => {
+             if delimiter.is_empty() {
+                 return Err(TokenizerError::EmptyDelimiter {
+                     tokenizer: "PathHierarchy".to_string(),
+                 });
+             }
+         }
+         Pattern { regex } => {
+             // Validate regex pattern
+             regex::Regex::new(regex).map_err(|e| TokenizerError::InvalidRegex {
+                 pattern: regex.clone(),
+                 error: e.to_string(),
+             })?;
+         }
+         _ => {}
+     }
+
+     // Validate preserve patterns
+     for pattern in &config.preserve_patterns {
+         regex::Regex::new(pattern).map_err(|e| TokenizerError::InvalidRegex {
+             pattern: pattern.clone(),
+             error: e.to_string(),
+         })?;
+     }
+
+     Ok(())
+ }
+
+ // Load config is just an alias for configure (for backward compat)
+ fn load_config(config_hash: RHash) -> std::result::Result<(), Error> {
+     configure(config_hash)
+ }
+
+ // Tokenize with a specific config (creates fresh tokenizer)
+ fn tokenize_with_config(text: String, config_hash: RHash) -> std::result::Result<Vec<String>, Error> {
+     let config = parse_config_from_hash(config_hash)?;
+
+     // Create fresh tokenizer from config
+     let tokenizer = tokenizer::from_config(config)?;
+
+     // Tokenize and return
+     Ok(tokenizer.tokenize(&text))
+ }
+
+ #[magnus::init]
+ fn init(_ruby: &magnus::Ruby) -> std::result::Result<(), Error> {
+     let module = define_module("TokenKit")?;
+
+     // Public API functions
+     module.define_module_function("_tokenize", function!(tokenize, 1))?;
+     module.define_module_function("_configure", function!(configure, 1))?;
+     module.define_module_function("_reset", function!(reset, 0))?;
+     module.define_module_function("_config_hash", function!(config_hash, 0))?;
+     module.define_module_function("_load_config", function!(load_config, 1))?;
+
+     // New instance-based function
+     module.define_module_function("_tokenize_with_config", function!(tokenize_with_config, 2))?;
+
+     Ok(())
+ }
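The hash keys parsed above ("strategy", "regex", "min_gram", "max_gram", "delimiter", "split_on_chars", "lowercase", "remove_punctuation", "preserve_patterns") are plain strings, and every parsed config passes through validate_config before it is used. A hypothetical test module, offered only as a sketch and assuming it were appended to lib.rs so the crate-private items are in scope, illustrates what that validation rejects:

#[cfg(test)]
mod validation_sketch {
    use super::validate_config;
    use crate::config::{TokenizerConfig, TokenizerStrategy};

    #[test]
    fn rejects_a_zero_min_gram() {
        // min_gram == 0 trips the InvalidNgramConfig branch above.
        let config = TokenizerConfig {
            strategy: TokenizerStrategy::EdgeNgram { min_gram: 0, max_gram: 5 },
            lowercase: true,
            remove_punctuation: false,
            preserve_patterns: Vec::new(),
        };
        assert!(validate_config(&config).is_err());
    }

    #[test]
    fn rejects_an_invalid_preserve_pattern() {
        // An unclosed character class fails regex compilation during validation.
        let mut config = TokenizerConfig::default();
        config.preserve_patterns = vec!["[unclosed".to_string()];
        assert!(validate_config(&config).is_err());
    }
}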
data/ext/tokenkit/src/tokenizer/base.rs
@@ -0,0 +1,41 @@
+ use crate::config::TokenizerConfig;
+ use regex::Regex;
+
+ /// Common functionality for tokenizers that support preserve_patterns
+ /// Note: Since we validate patterns in validate_config(), they're guaranteed to be valid here
+ pub fn create_preserve_patterns(config: &TokenizerConfig) -> Vec<Regex> {
+     config
+         .preserve_patterns
+         .iter()
+         .map(|p| {
+             // Safe to unwrap because patterns are validated in validate_config()
+             Regex::new(p).expect("Pattern should have been validated")
+         })
+         .collect()
+ }
+
+ /// Base fields that most tokenizers need
+ pub struct BaseTokenizerFields {
+     pub config: TokenizerConfig,
+     pub preserve_patterns: Vec<Regex>,
+ }
+
+ impl BaseTokenizerFields {
+     pub fn new(config: TokenizerConfig) -> Self {
+         let preserve_patterns = create_preserve_patterns(&config);
+         Self {
+             config,
+             preserve_patterns,
+         }
+     }
+
+     pub fn has_preserve_patterns(&self) -> bool {
+         !self.preserve_patterns.is_empty()
+     }
+
+     pub fn preserve_patterns(&self) -> &[Regex] {
+         &self.preserve_patterns
+     }
+ }
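BaseTokenizerFields is the small struct the concrete tokenizers below embed: it takes ownership of the config and compiles the preserve_patterns regexes once at construction, relying on validate_config having already rejected bad patterns. A hedged sketch of that behaviour, written as a hypothetical in-crate test:

#[cfg(test)]
mod base_fields_sketch {
    use super::BaseTokenizerFields;
    use crate::config::TokenizerConfig;

    #[test]
    fn compiles_preserve_patterns_up_front() {
        let mut config = TokenizerConfig::default();
        // A pre-validated pattern; new() would panic on an invalid one.
        config.preserve_patterns = vec![r"\bv\d+\.\d+\b".to_string()];

        let fields = BaseTokenizerFields::new(config);
        assert!(fields.has_preserve_patterns());
        assert_eq!(fields.preserve_patterns().len(), 1);
    }
}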
data/ext/tokenkit/src/tokenizer/char_group.rs
@@ -0,0 +1,62 @@
+ use super::{apply_preserve_patterns_with_tokenizer, post_process, BaseTokenizerFields, Tokenizer};
+ use crate::config::TokenizerConfig;
+ use std::collections::HashSet;
+
+ pub struct CharGroupTokenizer {
+     base: BaseTokenizerFields,
+     split_chars: HashSet<char>,
+ }
+
+ impl CharGroupTokenizer {
+     pub fn new(config: TokenizerConfig, split_on_chars: String) -> Self {
+         // Note: Empty split_on_chars is valid - it makes the tokenizer behave like
+         // a keyword tokenizer (no splitting, returns whole text as single token)
+         let split_chars: HashSet<char> = split_on_chars.chars().collect();
+
+         Self {
+             base: BaseTokenizerFields::new(config),
+             split_chars,
+         }
+     }
+
+     fn tokenize_text(&self, text: &str) -> Vec<String> {
+         let mut tokens = Vec::new();
+         let mut current_token = String::new();
+
+         for ch in text.chars() {
+             if self.split_chars.contains(&ch) {
+                 if !current_token.is_empty() {
+                     tokens.push(current_token.clone());
+                     current_token.clear();
+                 }
+             } else {
+                 current_token.push(ch);
+             }
+         }
+
+         if !current_token.is_empty() {
+             tokens.push(current_token);
+         }
+
+         tokens
+     }
+ }
+
+ impl Tokenizer for CharGroupTokenizer {
+     fn tokenize(&self, text: &str) -> Vec<String> {
+         let tokens = self.tokenize_text(text);
+
+         if self.base.has_preserve_patterns() {
+             apply_preserve_patterns_with_tokenizer(
+                 tokens,
+                 self.base.preserve_patterns(),
+                 text,
+                 &self.base.config,
+                 |t| self.tokenize_text(t),
+             )
+         } else {
+             post_process(tokens, &self.base.config)
+         }
+     }
+ }
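CharGroupTokenizer splits only on the characters handed to it and keeps everything else inside a token. A hypothetical in-crate test sketches the expected shape of the output, assuming post_process only applies the lowercase and remove_punctuation flags:

#[cfg(test)]
mod char_group_sketch {
    use super::CharGroupTokenizer;
    use crate::config::{TokenizerConfig, TokenizerStrategy};

    #[test]
    fn splits_only_on_listed_characters() {
        let config = TokenizerConfig {
            strategy: TokenizerStrategy::CharGroup { split_on_chars: "-_ ".to_string() },
            lowercase: true,
            remove_punctuation: false,
            preserve_patterns: Vec::new(),
        };
        let tok = CharGroupTokenizer::new(config, "-_ ".to_string());
        // Hyphen, underscore and space are delimiters; the dot is not.
        assert_eq!(
            tok.tokenize("foo-bar_baz v1.2"),
            vec!["foo", "bar", "baz", "v1.2"]
        );
    }
}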
data/ext/tokenkit/src/tokenizer/edge_ngram.rs
@@ -0,0 +1,73 @@
+ use super::Tokenizer;
+ use crate::config::TokenizerConfig;
+
+ pub struct EdgeNgramTokenizer {
+     config: TokenizerConfig,
+     min_gram: usize,
+     max_gram: usize,
+ }
+
+ impl EdgeNgramTokenizer {
+     pub fn new(config: TokenizerConfig, min_gram: usize, max_gram: usize) -> Self {
+         // Validate and sanitize parameters
+         let min_gram = min_gram.max(1); // Minimum 1 character
+         let max_gram = max_gram.max(min_gram); // Ensure max >= min
+
+         Self { config, min_gram, max_gram }
+     }
+
+     fn generate_edge_ngrams(&self, text: &str) -> Vec<String> {
+         let mut ngrams = Vec::new();
+         let chars: Vec<char> = text.chars().collect();
+         let text_len = chars.len();
+
+         if text_len == 0 {
+             return ngrams;
+         }
+
+         let max = self.max_gram.min(text_len);
+
+         for gram_size in self.min_gram..=max {
+             let ngram: String = chars.iter().take(gram_size).collect();
+             ngrams.push(ngram);
+         }
+
+         ngrams
+     }
+ }
+
+ impl Tokenizer for EdgeNgramTokenizer {
+     fn tokenize(&self, text: &str) -> Vec<String> {
+         let mut all_ngrams = Vec::new();
+
+         for word in text.split_whitespace() {
+             if word.is_empty() {
+                 continue;
+             }
+
+             let processed_word = if self.config.remove_punctuation {
+                 word.chars()
+                     .filter(|c| !c.is_ascii_punctuation())
+                     .collect()
+             } else {
+                 word.to_string()
+             };
+
+             if processed_word.is_empty() {
+                 continue;
+             }
+
+             let ngrams = self.generate_edge_ngrams(&processed_word);
+             all_ngrams.extend(ngrams);
+         }
+
+         let mut result = all_ngrams;
+
+         if self.config.lowercase {
+             result = result.into_iter().map(|t| t.to_lowercase()).collect();
+         }
+
+         result
+     }
+ }
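The edge n-gram tokenizer emits prefixes per whitespace-separated word, clamped to the word length, and handles lowercasing itself rather than through post_process. A hypothetical in-crate test sketches the output for min_gram 2 and max_gram 4:

#[cfg(test)]
mod edge_ngram_sketch {
    use super::EdgeNgramTokenizer;
    use crate::config::{TokenizerConfig, TokenizerStrategy};

    #[test]
    fn emits_prefixes_for_each_word() {
        let config = TokenizerConfig {
            strategy: TokenizerStrategy::EdgeNgram { min_gram: 2, max_gram: 4 },
            lowercase: true,
            remove_punctuation: false,
            preserve_patterns: Vec::new(),
        };
        let tok = EdgeNgramTokenizer::new(config, 2, 4);
        // Prefixes of length 2..=4 per word, lowercased at the end.
        assert_eq!(
            tok.tokenize("Search engine"),
            vec!["se", "sea", "sear", "en", "eng", "engi"]
        );
    }
}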
data/ext/tokenkit/src/tokenizer/grapheme.rs
@@ -0,0 +1,26 @@
+ use super::{post_process, Tokenizer};
+ use crate::config::TokenizerConfig;
+ use unicode_segmentation::UnicodeSegmentation;
+
+ pub struct GraphemeTokenizer {
+     config: TokenizerConfig,
+     extended: bool,
+ }
+
+ impl GraphemeTokenizer {
+     pub fn new(config: TokenizerConfig, extended: bool) -> Self {
+         Self { config, extended }
+     }
+ }
+
+ impl Tokenizer for GraphemeTokenizer {
+     fn tokenize(&self, text: &str) -> Vec<String> {
+         let graphemes: Vec<String> = text
+             .graphemes(self.extended)
+             .map(|s| s.to_string())
+             .collect();
+
+         post_process(graphemes, &self.config)
+     }
+ }
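GraphemeTokenizer defers entirely to unicode-segmentation, so combining marks stay attached to their base character. A sketch as a hypothetical in-crate test, assuming post_process leaves these single-cluster tokens unchanged under the default flags:

#[cfg(test)]
mod grapheme_sketch {
    use super::GraphemeTokenizer;
    use crate::config::TokenizerConfig;

    #[test]
    fn keeps_combining_marks_with_their_base() {
        let tok = GraphemeTokenizer::new(TokenizerConfig::default(), true);
        // "e" + U+0301 (combining acute) is a single extended grapheme cluster.
        assert_eq!(tok.tokenize("e\u{0301}x"), vec!["e\u{0301}", "x"]);
    }
}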
data/ext/tokenkit/src/tokenizer/keyword.rs
@@ -0,0 +1,25 @@
+ use super::{post_process, Tokenizer};
+ use crate::config::TokenizerConfig;
+
+ pub struct KeywordTokenizer {
+     config: TokenizerConfig,
+ }
+
+ impl KeywordTokenizer {
+     pub fn new(config: TokenizerConfig) -> Self {
+         Self { config }
+     }
+ }
+
+ impl Tokenizer for KeywordTokenizer {
+     fn tokenize(&self, text: &str) -> Vec<String> {
+         let trimmed = text.trim();
+         if trimmed.is_empty() {
+             return vec![];
+         }
+
+         let tokens = vec![trimmed.to_string()];
+         post_process(tokens, &self.config)
+     }
+ }
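KeywordTokenizer is the degenerate case: trim, then hand back the whole input as a single token, or nothing for blank input. A sketch as a hypothetical in-crate test, assuming post_process only applies the lowercase flag here:

#[cfg(test)]
mod keyword_sketch {
    use super::KeywordTokenizer;
    use crate::config::{TokenizerConfig, TokenizerStrategy};

    #[test]
    fn whole_input_becomes_one_token() {
        let config = TokenizerConfig {
            strategy: TokenizerStrategy::Keyword,
            lowercase: true,
            remove_punctuation: false,
            preserve_patterns: Vec::new(),
        };
        let tok = KeywordTokenizer::new(config);
        assert_eq!(tok.tokenize("  New York  "), vec!["new york"]);
        assert!(tok.tokenize("   ").is_empty());
    }
}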
data/ext/tokenkit/src/tokenizer/letter.rs
@@ -0,0 +1,41 @@
+ use super::{apply_preserve_patterns, post_process, BaseTokenizerFields, Tokenizer};
+ use crate::config::TokenizerConfig;
+
+ pub struct LetterTokenizer {
+     base: BaseTokenizerFields,
+ }
+
+ impl LetterTokenizer {
+     pub fn new(config: TokenizerConfig) -> Self {
+         Self {
+             base: BaseTokenizerFields::new(config),
+         }
+     }
+ }
+
+ impl Tokenizer for LetterTokenizer {
+     fn tokenize(&self, text: &str) -> Vec<String> {
+         let mut tokens = Vec::new();
+         let mut current_token = String::new();
+
+         for ch in text.chars() {
+             if ch.is_alphabetic() {
+                 current_token.push(ch);
+             } else if !current_token.is_empty() {
+                 tokens.push(current_token.clone());
+                 current_token.clear();
+             }
+         }
+
+         if !current_token.is_empty() {
+             tokens.push(current_token);
+         }
+
+         if self.base.has_preserve_patterns() {
+             apply_preserve_patterns(tokens, self.base.preserve_patterns(), text, &self.base.config)
+         } else {
+             post_process(tokens, &self.base.config)
+         }
+     }
+ }
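LetterTokenizer treats every non-alphabetic character as a delimiter, so digits and apostrophes split tokens unless a preserve pattern rescues them. A sketch of the default behaviour as a hypothetical in-crate test, assuming post_process only lowercases:

#[cfg(test)]
mod letter_sketch {
    use super::LetterTokenizer;
    use crate::config::{TokenizerConfig, TokenizerStrategy};

    #[test]
    fn splits_on_every_non_alphabetic_character() {
        let config = TokenizerConfig {
            strategy: TokenizerStrategy::Letter,
            lowercase: true,
            remove_punctuation: false,
            preserve_patterns: Vec::new(),
        };
        let tok = LetterTokenizer::new(config);
        // The digit and the apostrophe both end the current token.
        assert_eq!(tok.tokenize("HTTP2 isn't"), vec!["http", "isn", "t"]);
    }
}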
data/ext/tokenkit/src/tokenizer/lowercase.rs
@@ -0,0 +1,51 @@
+ use super::{apply_preserve_patterns, BaseTokenizerFields, Tokenizer};
+ use crate::config::TokenizerConfig;
+
+ pub struct LowercaseTokenizer {
+     base: BaseTokenizerFields,
+ }
+
+ impl LowercaseTokenizer {
+     pub fn new(config: TokenizerConfig) -> Self {
+         Self {
+             base: BaseTokenizerFields::new(config),
+         }
+     }
+ }
+
+ impl Tokenizer for LowercaseTokenizer {
+     fn tokenize(&self, text: &str) -> Vec<String> {
+         let mut tokens = Vec::new();
+         let mut current_token = String::new();
+
+         for ch in text.chars() {
+             if ch.is_alphabetic() {
+                 for lowercase_ch in ch.to_lowercase() {
+                     current_token.push(lowercase_ch);
+                 }
+             } else if !current_token.is_empty() {
+                 tokens.push(current_token.clone());
+                 current_token.clear();
+             }
+         }
+
+         if !current_token.is_empty() {
+             tokens.push(current_token);
+         }
+
+         // Lowercase tokenizer always lowercases, ignore config.lowercase
+         // Note: remove_punctuation has no effect since we already split on non-alphabetic
+         // characters, but we keep it for consistency with the Tokenizer interface
+
+         if self.base.has_preserve_patterns() {
+             // Pass a modified config that forces lowercasing, so apply_preserve_patterns
+             // lowercases non-preserved tokens while preserved matches keep their original form
+             let mut modified_config = self.base.config.clone();
+             modified_config.lowercase = true; // Force lowercase for non-preserved tokens
+             apply_preserve_patterns(tokens, self.base.preserve_patterns(), text, &modified_config)
+         } else {
+             tokens
+         }
+     }
+ }
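LowercaseTokenizer splits exactly like the letter tokenizer but lowercases inline and deliberately ignores config.lowercase. Because the no-pattern path returns the tokens directly, the following hypothetical in-crate test is determined by the code above alone:

#[cfg(test)]
mod lowercase_sketch {
    use super::LowercaseTokenizer;
    use crate::config::{TokenizerConfig, TokenizerStrategy};

    #[test]
    fn lowercases_even_with_the_flag_off() {
        let config = TokenizerConfig {
            strategy: TokenizerStrategy::Lowercase,
            lowercase: false, // ignored by this tokenizer
            remove_punctuation: false,
            preserve_patterns: Vec::new(),
        };
        let tok = LowercaseTokenizer::new(config);
        // Digits split tokens; letters are lowercased as they are collected.
        assert_eq!(tok.tokenize("FooBar123Baz"), vec!["foobar", "baz"]);
    }
}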