tokenkit 0.1.0.pre.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (43)
  1. checksums.yaml +7 -0
  2. data/.rspec +3 -0
  3. data/.standard.yml +3 -0
  4. data/.yardopts +12 -0
  5. data/CODE_OF_CONDUCT.md +132 -0
  6. data/LICENSE.txt +21 -0
  7. data/README.md +644 -0
  8. data/Rakefile +18 -0
  9. data/benchmarks/cache_test.rb +63 -0
  10. data/benchmarks/final_comparison.rb +83 -0
  11. data/benchmarks/tokenizer_benchmark.rb +250 -0
  12. data/docs/ARCHITECTURE.md +469 -0
  13. data/docs/PERFORMANCE.md +382 -0
  14. data/docs/README.md +118 -0
  15. data/ext/tokenkit/Cargo.toml +21 -0
  16. data/ext/tokenkit/extconf.rb +4 -0
  17. data/ext/tokenkit/src/config.rs +37 -0
  18. data/ext/tokenkit/src/error.rs +67 -0
  19. data/ext/tokenkit/src/lib.rs +346 -0
  20. data/ext/tokenkit/src/tokenizer/base.rs +41 -0
  21. data/ext/tokenkit/src/tokenizer/char_group.rs +62 -0
  22. data/ext/tokenkit/src/tokenizer/edge_ngram.rs +73 -0
  23. data/ext/tokenkit/src/tokenizer/grapheme.rs +26 -0
  24. data/ext/tokenkit/src/tokenizer/keyword.rs +25 -0
  25. data/ext/tokenkit/src/tokenizer/letter.rs +41 -0
  26. data/ext/tokenkit/src/tokenizer/lowercase.rs +51 -0
  27. data/ext/tokenkit/src/tokenizer/mod.rs +254 -0
  28. data/ext/tokenkit/src/tokenizer/ngram.rs +80 -0
  29. data/ext/tokenkit/src/tokenizer/path_hierarchy.rs +187 -0
  30. data/ext/tokenkit/src/tokenizer/pattern.rs +38 -0
  31. data/ext/tokenkit/src/tokenizer/sentence.rs +89 -0
  32. data/ext/tokenkit/src/tokenizer/unicode.rs +36 -0
  33. data/ext/tokenkit/src/tokenizer/url_email.rs +108 -0
  34. data/ext/tokenkit/src/tokenizer/whitespace.rs +31 -0
  35. data/lib/tokenkit/config.rb +74 -0
  36. data/lib/tokenkit/config_builder.rb +209 -0
  37. data/lib/tokenkit/config_compat.rb +52 -0
  38. data/lib/tokenkit/configuration.rb +194 -0
  39. data/lib/tokenkit/regex_converter.rb +58 -0
  40. data/lib/tokenkit/version.rb +5 -0
  41. data/lib/tokenkit.rb +336 -0
  42. data/sig/tokenkit.rbs +4 -0
  43. metadata +172 -0
data/ext/tokenkit/src/tokenizer/mod.rs
@@ -0,0 +1,254 @@
+ mod base;
+ mod whitespace;
+ mod unicode;
+ mod pattern;
+ mod sentence;
+ mod grapheme;
+ mod keyword;
+ mod edge_ngram;
+ mod ngram;
+ mod path_hierarchy;
+ mod url_email;
+ mod char_group;
+ mod letter;
+ mod lowercase;
+
+ pub(crate) use base::BaseTokenizerFields;
+
+ pub use whitespace::WhitespaceTokenizer;
+ pub use unicode::UnicodeTokenizer;
+ pub use pattern::PatternTokenizer;
+ pub use sentence::SentenceTokenizer;
+ pub use grapheme::GraphemeTokenizer;
+ pub use keyword::KeywordTokenizer;
+ pub use edge_ngram::EdgeNgramTokenizer;
+ pub use ngram::NgramTokenizer;
+ pub use path_hierarchy::PathHierarchyTokenizer;
+ pub use url_email::UrlEmailTokenizer;
+ pub use char_group::CharGroupTokenizer;
+ pub use letter::LetterTokenizer;
+ pub use lowercase::LowercaseTokenizer;
+
+ use crate::config::{TokenizerConfig, TokenizerStrategy};
+ use crate::error::Result;
+ use regex::Regex;
+
+ pub trait Tokenizer: Send + Sync {
+     fn tokenize(&self, text: &str) -> Vec<String>;
+ }
+
+ pub fn from_config(config: TokenizerConfig) -> Result<Box<dyn Tokenizer>> {
+     match config.strategy.clone() {
+         TokenizerStrategy::Whitespace => Ok(Box::new(WhitespaceTokenizer::new(config))),
+         TokenizerStrategy::Unicode => Ok(Box::new(UnicodeTokenizer::new(config))),
+         TokenizerStrategy::Pattern { regex } => {
+             PatternTokenizer::new(&regex, config)
+                 .map(|t| Box::new(t) as Box<dyn Tokenizer>)
+         }
+         TokenizerStrategy::Sentence => Ok(Box::new(SentenceTokenizer::new(config))),
+         TokenizerStrategy::Grapheme { extended } => {
+             Ok(Box::new(GraphemeTokenizer::new(config, extended)))
+         }
+         TokenizerStrategy::Keyword => Ok(Box::new(KeywordTokenizer::new(config))),
+         TokenizerStrategy::EdgeNgram { min_gram, max_gram } => {
+             Ok(Box::new(EdgeNgramTokenizer::new(config, min_gram, max_gram)))
+         }
+         TokenizerStrategy::PathHierarchy { delimiter } => {
+             Ok(Box::new(PathHierarchyTokenizer::new(config, delimiter)))
+         }
+         TokenizerStrategy::UrlEmail => {
+             Ok(Box::new(UrlEmailTokenizer::new(config)))
+         }
+         TokenizerStrategy::Ngram { min_gram, max_gram } => {
+             Ok(Box::new(NgramTokenizer::new(config, min_gram, max_gram)))
+         }
+         TokenizerStrategy::CharGroup { split_on_chars } => {
+             Ok(Box::new(CharGroupTokenizer::new(config, split_on_chars)))
+         }
+         TokenizerStrategy::Letter => Ok(Box::new(LetterTokenizer::new(config))),
+         TokenizerStrategy::Lowercase => Ok(Box::new(LowercaseTokenizer::new(config))),
+     }
+ }
+
+ pub(crate) fn merge_overlapping_spans(mut spans: Vec<(usize, usize, String)>) -> Vec<(usize, usize, String)> {
+     if spans.is_empty() {
+         return spans;
+     }
+
+     spans.sort_by(|a, b| {
+         a.0.cmp(&b.0)
+             .then_with(|| b.1.cmp(&a.1))
+     });
+
+     let mut merged = Vec::new();
+     let mut current = spans[0].clone();
+
+     for span in spans.into_iter().skip(1) {
+         if span.0 < current.1 {
+             if span.1 > current.1 {
+                 current = span;
+             }
+         } else {
+             merged.push(current);
+             current = span;
+         }
+     }
+     merged.push(current);
+
+     merged
+ }
+
+ // Optimized version that works with indices only
+ fn merge_overlapping_spans_optimized(mut spans: Vec<(usize, usize)>) -> Vec<(usize, usize)> {
+     if spans.is_empty() {
+         return spans;
+     }
+
+     spans.sort_unstable_by(|a, b| {
+         a.0.cmp(&b.0)
+             .then_with(|| b.1.cmp(&a.1))
+     });
+
+     let mut merged = Vec::with_capacity(spans.len());
+     let mut current = spans[0];
+
+     for span in spans.into_iter().skip(1) {
+         if span.0 < current.1 {
+             if span.1 > current.1 {
+                 current.1 = span.1;
+             }
+         } else {
+             merged.push(current);
+             current = span;
+         }
+     }
+     merged.push(current);
+     merged
+ }
+
+ pub(crate) fn apply_preserve_patterns(
+     tokens: Vec<String>,
+     preserve_patterns: &[Regex],
+     original_text: &str,
+     config: &TokenizerConfig,
+ ) -> Vec<String> {
+     apply_preserve_patterns_with_tokenizer(
+         tokens,
+         preserve_patterns,
+         original_text,
+         config,
+         tokenize_simple,
+     )
+ }
+
+ pub(crate) fn apply_preserve_patterns_with_tokenizer<F>(
+     tokens: Vec<String>,
+     preserve_patterns: &[Regex],
+     original_text: &str,
+     config: &TokenizerConfig,
+     tokenizer_fn: F,
+ ) -> Vec<String>
+ where
+     F: Fn(&str) -> Vec<String>,
+ {
+     if preserve_patterns.is_empty() {
+         return tokens;
+     }
+
+     // Use indices instead of allocating strings upfront
+     let mut preserved_spans: Vec<(usize, usize)> = Vec::with_capacity(32);
+     for pattern in preserve_patterns {
+         for mat in pattern.find_iter(original_text) {
+             preserved_spans.push((mat.start(), mat.end()));
+         }
+     }
+
+     if preserved_spans.is_empty() {
+         return tokens;
+     }
+
+     let preserved_spans = merge_overlapping_spans_optimized(preserved_spans);
+
+     // Pre-allocate result vector with estimated capacity
+     let mut result = Vec::with_capacity(tokens.len() + preserved_spans.len());
+     let mut pos = 0;
+
+     for (start, end) in preserved_spans {
+         if start > pos {
+             let before = &original_text[pos..start];
+             let mut before_tokens = tokenizer_fn(before);
+             post_process_in_place(&mut before_tokens, config);
+             result.extend(before_tokens);
+         }
+         // Extract preserved text only when needed
+         result.push(original_text[start..end].to_string());
+         pos = end;
+     }
+
+     if pos < original_text.len() {
+         let remaining = &original_text[pos..];
+         let mut remaining_tokens = tokenizer_fn(remaining);
+         post_process_in_place(&mut remaining_tokens, config);
+         result.extend(remaining_tokens);
+     }
+
+     result
+ }
+
+ fn tokenize_simple(text: &str) -> Vec<String> {
+     text.split_whitespace()
+         .filter(|s| !s.is_empty())
+         .map(|s| s.to_string())
+         .collect()
+ }
+
+ pub(crate) fn post_process(tokens: Vec<String>, config: &TokenizerConfig) -> Vec<String> {
+     post_process_with_preserved(tokens, config, None)
+ }
+
+ // In-place version to avoid allocation
+ fn post_process_in_place(tokens: &mut Vec<String>, config: &TokenizerConfig) {
+     if config.lowercase {
+         for token in tokens.iter_mut() {
+             *token = token.to_lowercase();
+         }
+     }
+
+     if config.remove_punctuation {
+         tokens.retain_mut(|token| {
+             token.retain(|c| !c.is_ascii_punctuation());
+             !token.is_empty()
+         });
+     }
+ }
+
+ pub(crate) fn post_process_with_preserved(
+     mut tokens: Vec<String>,
+     config: &TokenizerConfig,
+     preserve_chars: Option<&str>,
+ ) -> Vec<String> {
+     if config.lowercase {
+         tokens = tokens.into_iter().map(|t| t.to_lowercase()).collect();
+     }
+
+     if config.remove_punctuation {
+         tokens = tokens
+             .into_iter()
+             .map(|t| {
+                 t.chars()
+                     .filter(|c| {
+                         if let Some(preserved) = preserve_chars {
+                             if preserved.contains(*c) {
+                                 return true;
+                             }
+                         }
+                         !c.is_ascii_punctuation()
+                     })
+                     .collect()
+             })
+             .filter(|s: &String| !s.is_empty())
+             .collect();
+     }
+
+     tokens
+ }
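
The preserve-pattern helpers above first collect all regex match spans, merge any that overlap, and then splice the preserved text back between re-tokenized gaps. A minimal standalone sketch of that sort-then-sweep merge step (illustrative function name, not part of the gem's API):

    // Sketch of the merge used by merge_overlapping_spans_optimized:
    // sort by start ascending, then end descending, and sweep once.
    fn merge_spans(mut spans: Vec<(usize, usize)>) -> Vec<(usize, usize)> {
        if spans.is_empty() {
            return spans;
        }
        spans.sort_unstable_by(|a, b| a.0.cmp(&b.0).then_with(|| b.1.cmp(&a.1)));

        let mut merged = Vec::with_capacity(spans.len());
        let mut current = spans[0];
        for span in spans.into_iter().skip(1) {
            if span.0 < current.1 {
                // Overlap: extend the current span if the new one reaches further.
                current.1 = current.1.max(span.1);
            } else {
                merged.push(current);
                current = span;
            }
        }
        merged.push(current);
        merged
    }

    fn main() {
        // Two overlapping matches and one disjoint match.
        assert_eq!(
            merge_spans(vec![(10, 20), (15, 25), (30, 35)]),
            vec![(10, 25), (30, 35)]
        );
    }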
data/ext/tokenkit/src/tokenizer/ngram.rs
@@ -0,0 +1,80 @@
+ use super::Tokenizer;
+ use crate::config::TokenizerConfig;
+
+ pub struct NgramTokenizer {
+     config: TokenizerConfig,
+     min_gram: usize,
+     max_gram: usize,
+ }
+
+ impl NgramTokenizer {
+     pub fn new(config: TokenizerConfig, min_gram: usize, max_gram: usize) -> Self {
+         // Validate and sanitize parameters
+         let min_gram = min_gram.max(1); // Minimum 1 character
+         let max_gram = max_gram.max(min_gram); // Ensure max >= min
+
+         Self {
+             config,
+             min_gram,
+             max_gram,
+         }
+     }
+
+     fn generate_ngrams(&self, text: &str) -> Vec<String> {
+         let mut ngrams = Vec::new();
+         let chars: Vec<char> = text.chars().collect();
+         let text_len = chars.len();
+
+         if text_len == 0 {
+             return ngrams;
+         }
+
+         let max = self.max_gram.min(text_len);
+
+         for gram_size in self.min_gram..=max {
+             for start in 0..=(text_len - gram_size) {
+                 let ngram: String = chars.iter().skip(start).take(gram_size).collect();
+                 ngrams.push(ngram);
+             }
+         }
+
+         ngrams
+     }
+ }
+
+ impl Tokenizer for NgramTokenizer {
+     fn tokenize(&self, text: &str) -> Vec<String> {
+         let mut all_ngrams = Vec::new();
+
+         for word in text.split_whitespace() {
+             if word.is_empty() {
+                 continue;
+             }
+
+             let processed_word = if self.config.remove_punctuation {
+                 word.chars()
+                     .filter(|c| !c.is_ascii_punctuation())
+                     .collect()
+             } else {
+                 word.to_string()
+             };
+
+             if processed_word.is_empty() {
+                 continue;
+             }
+
+             let ngrams = self.generate_ngrams(&processed_word);
+             all_ngrams.extend(ngrams);
+         }
+
+         // Apply lowercase if needed. Note: remove_punctuation already handled above.
+         let mut result = all_ngrams;
+
+         if self.config.lowercase {
+             result = result.into_iter().map(|t| t.to_lowercase()).collect();
+         }
+
+         result
+     }
+
+ }
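
The nested loop in generate_ngrams emits every contiguous character window of each word, from min_gram up to max_gram characters. A self-contained sketch of that same idea (not the gem's API; the input values are examples):

    // Illustrative re-implementation of the per-word n-gram loop.
    fn ngrams(word: &str, min_gram: usize, max_gram: usize) -> Vec<String> {
        let chars: Vec<char> = word.chars().collect();
        let len = chars.len();
        let mut out = Vec::new();
        if len == 0 {
            return out;
        }
        let max = max_gram.min(len);
        for size in min_gram..=max {
            for start in 0..=(len - size) {
                out.push(chars[start..start + size].iter().collect());
            }
        }
        out
    }

    fn main() {
        // With min_gram = 2 and max_gram = 3, "cat" yields its 2-grams then its 3-gram.
        assert_eq!(ngrams("cat", 2, 3), vec!["ca", "at", "cat"]);
    }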
data/ext/tokenkit/src/tokenizer/path_hierarchy.rs
@@ -0,0 +1,187 @@
+ use super::{post_process_with_preserved, BaseTokenizerFields, Tokenizer};
+ use crate::config::TokenizerConfig;
+
+ pub struct PathHierarchyTokenizer {
+     base: BaseTokenizerFields,
+     delimiter: String,
+ }
+
+ impl PathHierarchyTokenizer {
+     pub fn new(config: TokenizerConfig, delimiter: String) -> Self {
+         Self {
+             base: BaseTokenizerFields::new(config),
+             delimiter,
+         }
+     }
+
+     fn generate_hierarchy(&self, path: &str) -> Vec<String> {
+         let mut tokens = Vec::new();
+         let parts: Vec<&str> = path.split(&self.delimiter).collect();
+
+         let mut current_path = String::new();
+         let starts_with_delimiter = path.starts_with(&self.delimiter);
+
+         for part in parts.iter() {
+             if part.is_empty() {
+                 continue;
+             }
+
+             if !current_path.is_empty() {
+                 current_path.push_str(&self.delimiter);
+             } else if starts_with_delimiter {
+                 current_path.push_str(&self.delimiter);
+             }
+
+             current_path.push_str(part);
+             tokens.push(current_path.clone());
+         }
+
+         tokens
+     }
+
+     fn apply_patterns_to_hierarchy(&self, text: &str) -> Vec<String> {
+         if self.base.preserve_patterns().is_empty() {
+             return self.generate_hierarchy(text);
+         }
+
+         // Generate all hierarchical tokens first
+         let all_tokens = self.generate_hierarchy(text);
+
+         // Find which tokens are completely matched by preserve patterns
+         let mut preserved_tokens = Vec::new();
+         for token in &all_tokens {
+             for pattern in self.base.preserve_patterns() {
+                 if let Some(mat) = pattern.find(token) {
+                     if mat.as_str() == token {
+                         preserved_tokens.push(token.clone());
+                         break;
+                     }
+                 }
+             }
+         }
+
+         // Now build the result, applying lowercase where appropriate
+         let mut result = Vec::new();
+         for token in all_tokens {
+             // Check if this token should be included
+             // Include if: it's a preserved token OR it extends beyond a preserved token
+             let should_include;
+             let mut apply_lowercase = self.base.config.lowercase;
+
+             if preserved_tokens.contains(&token) {
+                 should_include = true;
+                 apply_lowercase = false; // Don't lowercase preserved tokens
+             } else {
+                 // Check if this token extends a preserved token
+                 let mut extends_preserved = false;
+                 for preserved in &preserved_tokens {
+                     if token.starts_with(preserved) && token.len() > preserved.len() {
+                         extends_preserved = true;
+                         break;
+                     }
+                 }
+
+                 if extends_preserved {
+                     should_include = true;
+                 } else {
+                     // Include if no preserved token is a prefix of this one
+                     let mut has_preserved_prefix = false;
+                     for preserved in &preserved_tokens {
+                         if preserved.starts_with(&token) && preserved != &token {
+                             has_preserved_prefix = true;
+                             break;
+                         }
+                     }
+                     should_include = !has_preserved_prefix;
+                 }
+             }
+
+             if should_include {
+                 if apply_lowercase && !preserved_tokens.contains(&token) {
+                     // Apply lowercase to non-preserved parts
+                     let mut lowercased = String::new();
+                     let starts_with_delim = token.starts_with(&self.delimiter);
+                     let parts: Vec<&str> = token.split(&self.delimiter).collect();
+
+                     for (i, part) in parts.iter().enumerate() {
+                         if part.is_empty() {
+                             if i == 0 && starts_with_delim {
+                                 // Path starts with delimiter, add it once
+                                 lowercased.push_str(&self.delimiter);
+                             }
+                             continue;
+                         }
+
+                         if i > 0 || (i == 0 && starts_with_delim) {
+                             if !lowercased.is_empty() && !lowercased.ends_with(&self.delimiter) {
+                                 lowercased.push_str(&self.delimiter);
+                             }
+                         }
+
+                         // Check if this specific part should be preserved
+                         let mut preserve_part = false;
+                         for pattern in self.base.preserve_patterns() {
+                             if pattern.is_match(part) {
+                                 preserve_part = true;
+                                 break;
+                             }
+                         }
+
+                         if preserve_part {
+                             lowercased.push_str(part);
+                         } else {
+                             lowercased.push_str(&part.to_lowercase());
+                         }
+                     }
+                     result.push(lowercased);
+                 } else {
+                     result.push(token);
+                 }
+             }
+         }
+
+         result
+     }
+ }
+
+ impl Tokenizer for PathHierarchyTokenizer {
+     fn tokenize(&self, text: &str) -> Vec<String> {
+         let trimmed = text.trim();
+         if trimmed.is_empty() {
+             return vec![];
+         }
+
+         if self.base.has_preserve_patterns() {
+             let mut tokens = self.apply_patterns_to_hierarchy(trimmed);
+
+             // Apply remove_punctuation if needed (but preserve delimiters)
+             if self.base.config.remove_punctuation {
+                 tokens = tokens.into_iter().map(|token| {
+                     let parts: Vec<&str> = token.split(&self.delimiter).collect();
+                     let processed: Vec<String> = parts.iter().map(|part| {
+                         if part.is_empty() {
+                             String::new()
+                         } else {
+                             // Check if this part should be preserved
+                             let should_preserve = self.base.preserve_patterns().iter().any(|p| p.is_match(part));
+                             if should_preserve {
+                                 part.to_string()
+                             } else {
+                                 part.chars()
+                                     .filter(|c| !c.is_ascii_punctuation() || self.delimiter.contains(*c))
+                                     .collect()
+                             }
+                         }
+                     }).collect();
+                     processed.join(&self.delimiter)
+                 }).filter(|s| !s.is_empty() && s != &self.delimiter).collect();
+             }
+
+             tokens
+         } else {
+             let tokens = self.generate_hierarchy(trimmed);
+             post_process_with_preserved(tokens, &self.base.config, Some(&self.delimiter))
+         }
+     }
+
+ }
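
The core of this tokenizer is generate_hierarchy, which accumulates each path component into a growing prefix and emits every prefix as a token. A condensed sketch of that prefix-accumulation idea (the function name, delimiter, and path below are illustrative, not gem defaults):

    // Prefix-accumulation idea behind generate_hierarchy.
    fn hierarchy(path: &str, delimiter: &str) -> Vec<String> {
        let starts_with_delim = path.starts_with(delimiter);
        let mut current = String::new();
        let mut tokens = Vec::new();
        for part in path.split(delimiter).filter(|p| !p.is_empty()) {
            // Re-attach the delimiter between parts (and at the front for absolute paths).
            if !current.is_empty() || starts_with_delim {
                current.push_str(delimiter);
            }
            current.push_str(part);
            tokens.push(current.clone());
        }
        tokens
    }

    fn main() {
        assert_eq!(
            hierarchy("/usr/local/bin", "/"),
            vec!["/usr", "/usr/local", "/usr/local/bin"]
        );
    }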
data/ext/tokenkit/src/tokenizer/pattern.rs
@@ -0,0 +1,38 @@
+ use super::{apply_preserve_patterns, post_process, BaseTokenizerFields, Tokenizer};
+ use crate::config::TokenizerConfig;
+ use crate::error::Result;
+ use regex::Regex;
+
+ pub struct PatternTokenizer {
+     base: BaseTokenizerFields,
+     pattern: Regex,
+ }
+
+ impl PatternTokenizer {
+     pub fn new(regex: &str, config: TokenizerConfig) -> Result<Self> {
+         // Pattern is already validated in validate_config(), safe to unwrap
+         let pattern = Regex::new(regex).expect("Pattern should have been validated");
+
+         Ok(Self {
+             base: BaseTokenizerFields::new(config),
+             pattern,
+         })
+     }
+ }
+
+ impl Tokenizer for PatternTokenizer {
+     fn tokenize(&self, text: &str) -> Vec<String> {
+         let tokens: Vec<String> = self
+             .pattern
+             .find_iter(text)
+             .map(|mat| mat.as_str().to_string())
+             .collect();
+
+         if self.base.has_preserve_patterns() {
+             apply_preserve_patterns(tokens, self.base.preserve_patterns(), text, &self.base.config)
+         } else {
+             post_process(tokens, &self.base.config)
+         }
+     }
+
+ }
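
PatternTokenizer treats every regex match as a token (matches, not delimiters). A small sketch of that match-based approach using the regex crate; the pattern and input string below are examples only:

    use regex::Regex;

    fn main() {
        // Each match of the pattern becomes one token.
        let pattern = Regex::new(r"[A-Za-z0-9]+").expect("valid example pattern");
        let tokens: Vec<String> = pattern
            .find_iter("state-of-the-art NLP, 2024!")
            .map(|m| m.as_str().to_string())
            .collect();
        assert_eq!(tokens, vec!["state", "of", "the", "art", "NLP", "2024"]);
    }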
data/ext/tokenkit/src/tokenizer/sentence.rs
@@ -0,0 +1,89 @@
+ use super::{post_process, BaseTokenizerFields, Tokenizer};
+ use crate::config::TokenizerConfig;
+ use unicode_segmentation::UnicodeSegmentation;
+
+ pub struct SentenceTokenizer {
+     base: BaseTokenizerFields,
+ }
+
+ impl SentenceTokenizer {
+     pub fn new(config: TokenizerConfig) -> Self {
+         Self {
+             base: BaseTokenizerFields::new(config),
+         }
+     }
+ }
+
+ impl SentenceTokenizer {
+     fn apply_patterns_to_sentence(&self, sentence: &str) -> String {
+         if self.base.preserve_patterns().is_empty() || !self.base.config.lowercase {
+             return sentence.to_string();
+         }
+
+         // Find all matches in the sentence
+         let mut preserved_spans: Vec<(usize, usize, String)> = Vec::new();
+         for pattern in self.base.preserve_patterns() {
+             for mat in pattern.find_iter(sentence) {
+                 preserved_spans.push((mat.start(), mat.end(), mat.as_str().to_string()));
+             }
+         }
+
+         if preserved_spans.is_empty() {
+             return sentence.to_string();
+         }
+
+         // Sort and merge overlapping spans
+         preserved_spans.sort_by(|a, b| a.0.cmp(&b.0).then_with(|| b.1.cmp(&a.1)));
+
+         let mut result = String::new();
+         let mut pos = 0;
+
+         for (start, end, preserved) in preserved_spans {
+             if start > pos {
+                 // Lowercase the text before the preserved pattern
+                 result.push_str(&sentence[pos..start].to_lowercase());
+             }
+             // Keep the preserved pattern as-is
+             result.push_str(&preserved);
+             pos = end.max(pos); // Handle overlaps
+         }
+
+         if pos < sentence.len() {
+             // Lowercase the remaining text
+             result.push_str(&sentence[pos..].to_lowercase());
+         }
+
+         result
+     }
+ }
+
+ impl Tokenizer for SentenceTokenizer {
+     fn tokenize(&self, text: &str) -> Vec<String> {
+         let mut sentences: Vec<String> = text
+             .unicode_sentences()
+             .map(|s| s.to_string())
+             .collect();
+
+         // Apply preserve patterns to each sentence
+         if self.base.has_preserve_patterns() && self.base.config.lowercase {
+             sentences = sentences
+                 .into_iter()
+                 .map(|sentence| self.apply_patterns_to_sentence(&sentence))
+                 .collect();
+
+             // Don't call post_process since we already handled lowercasing with patterns
+             // Just handle remove_punctuation if needed
+             if self.base.config.remove_punctuation {
+                 sentences = sentences
+                     .into_iter()
+                     .map(|s| s.chars().filter(|c| !c.is_ascii_punctuation()).collect())
+                     .filter(|s: &String| !s.is_empty())
+                     .collect();
+             }
+             sentences
+         } else {
+             post_process(sentences, &self.base.config)
+         }
+     }
+
+ }
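
SentenceTokenizer delegates boundary detection to unicode_sentences() (UAX #29 segmentation) and only afterwards lowercases around any preserved spans. A minimal sketch of the segmentation call itself, with an example input string:

    use unicode_segmentation::UnicodeSegmentation;

    fn main() {
        let text = "Hello world. How are you? Fine!";
        // unicode_sentences() yields sentence slices of the original text.
        let sentences: Vec<&str> = text.unicode_sentences().collect();
        // This input typically segments into three sentences.
        println!("{} sentences: {:?}", sentences.len(), sentences);
    }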
data/ext/tokenkit/src/tokenizer/unicode.rs
@@ -0,0 +1,36 @@
+ use super::{apply_preserve_patterns, post_process, BaseTokenizerFields, Tokenizer};
+ use crate::config::TokenizerConfig;
+ use unicode_segmentation::UnicodeSegmentation;
+
+ pub struct UnicodeTokenizer {
+     base: BaseTokenizerFields,
+ }
+
+ impl UnicodeTokenizer {
+     pub fn new(config: TokenizerConfig) -> Self {
+         Self {
+             base: BaseTokenizerFields::new(config),
+         }
+     }
+ }
+
+ impl Tokenizer for UnicodeTokenizer {
+     fn tokenize(&self, text: &str) -> Vec<String> {
+         if self.base.has_preserve_patterns() {
+             let tokens = text
+                 .unicode_words()
+                 .map(|s| s.to_string())
+                 .collect();
+
+             return apply_preserve_patterns(tokens, self.base.preserve_patterns(), text, &self.base.config);
+         }
+
+         let tokens: Vec<String> = text
+             .unicode_words()
+             .map(|s| s.to_string())
+             .collect();
+
+         post_process(tokens, &self.base.config)
+     }
+
+ }
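
UnicodeTokenizer builds on unicode_words() from the unicode-segmentation crate, which splits on UAX #29 word boundaries and drops punctuation and whitespace runs. A quick illustrative call (the input string is an example):

    use unicode_segmentation::UnicodeSegmentation;

    fn main() {
        let words: Vec<&str> = "Hello, world! It's 2024.".unicode_words().collect();
        // Punctuation-only runs are dropped; "It's" stays a single word.
        assert_eq!(words, vec!["Hello", "world", "It's", "2024"]);
    }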