tokenkit 0.1.0.pre.1
This diff shows the content of publicly available package versions released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.standard.yml +3 -0
- data/.yardopts +12 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE.txt +21 -0
- data/README.md +644 -0
- data/Rakefile +18 -0
- data/benchmarks/cache_test.rb +63 -0
- data/benchmarks/final_comparison.rb +83 -0
- data/benchmarks/tokenizer_benchmark.rb +250 -0
- data/docs/ARCHITECTURE.md +469 -0
- data/docs/PERFORMANCE.md +382 -0
- data/docs/README.md +118 -0
- data/ext/tokenkit/Cargo.toml +21 -0
- data/ext/tokenkit/extconf.rb +4 -0
- data/ext/tokenkit/src/config.rs +37 -0
- data/ext/tokenkit/src/error.rs +67 -0
- data/ext/tokenkit/src/lib.rs +346 -0
- data/ext/tokenkit/src/tokenizer/base.rs +41 -0
- data/ext/tokenkit/src/tokenizer/char_group.rs +62 -0
- data/ext/tokenkit/src/tokenizer/edge_ngram.rs +73 -0
- data/ext/tokenkit/src/tokenizer/grapheme.rs +26 -0
- data/ext/tokenkit/src/tokenizer/keyword.rs +25 -0
- data/ext/tokenkit/src/tokenizer/letter.rs +41 -0
- data/ext/tokenkit/src/tokenizer/lowercase.rs +51 -0
- data/ext/tokenkit/src/tokenizer/mod.rs +254 -0
- data/ext/tokenkit/src/tokenizer/ngram.rs +80 -0
- data/ext/tokenkit/src/tokenizer/path_hierarchy.rs +187 -0
- data/ext/tokenkit/src/tokenizer/pattern.rs +38 -0
- data/ext/tokenkit/src/tokenizer/sentence.rs +89 -0
- data/ext/tokenkit/src/tokenizer/unicode.rs +36 -0
- data/ext/tokenkit/src/tokenizer/url_email.rs +108 -0
- data/ext/tokenkit/src/tokenizer/whitespace.rs +31 -0
- data/lib/tokenkit/config.rb +74 -0
- data/lib/tokenkit/config_builder.rb +209 -0
- data/lib/tokenkit/config_compat.rb +52 -0
- data/lib/tokenkit/configuration.rb +194 -0
- data/lib/tokenkit/regex_converter.rb +58 -0
- data/lib/tokenkit/version.rb +5 -0
- data/lib/tokenkit.rb +336 -0
- data/sig/tokenkit.rbs +4 -0
- metadata +172 -0

data/ext/tokenkit/src/tokenizer/mod.rs
@@ -0,0 +1,254 @@
+mod base;
+mod whitespace;
+mod unicode;
+mod pattern;
+mod sentence;
+mod grapheme;
+mod keyword;
+mod edge_ngram;
+mod ngram;
+mod path_hierarchy;
+mod url_email;
+mod char_group;
+mod letter;
+mod lowercase;
+
+pub(crate) use base::BaseTokenizerFields;
+
+pub use whitespace::WhitespaceTokenizer;
+pub use unicode::UnicodeTokenizer;
+pub use pattern::PatternTokenizer;
+pub use sentence::SentenceTokenizer;
+pub use grapheme::GraphemeTokenizer;
+pub use keyword::KeywordTokenizer;
+pub use edge_ngram::EdgeNgramTokenizer;
+pub use ngram::NgramTokenizer;
+pub use path_hierarchy::PathHierarchyTokenizer;
+pub use url_email::UrlEmailTokenizer;
+pub use char_group::CharGroupTokenizer;
+pub use letter::LetterTokenizer;
+pub use lowercase::LowercaseTokenizer;
+
+use crate::config::{TokenizerConfig, TokenizerStrategy};
+use crate::error::Result;
+use regex::Regex;
+
+pub trait Tokenizer: Send + Sync {
+    fn tokenize(&self, text: &str) -> Vec<String>;
+}
+
+pub fn from_config(config: TokenizerConfig) -> Result<Box<dyn Tokenizer>> {
+    match config.strategy.clone() {
+        TokenizerStrategy::Whitespace => Ok(Box::new(WhitespaceTokenizer::new(config))),
+        TokenizerStrategy::Unicode => Ok(Box::new(UnicodeTokenizer::new(config))),
+        TokenizerStrategy::Pattern { regex } => {
+            PatternTokenizer::new(&regex, config)
+                .map(|t| Box::new(t) as Box<dyn Tokenizer>)
+        }
+        TokenizerStrategy::Sentence => Ok(Box::new(SentenceTokenizer::new(config))),
+        TokenizerStrategy::Grapheme { extended } => {
+            Ok(Box::new(GraphemeTokenizer::new(config, extended)))
+        }
+        TokenizerStrategy::Keyword => Ok(Box::new(KeywordTokenizer::new(config))),
+        TokenizerStrategy::EdgeNgram { min_gram, max_gram } => {
+            Ok(Box::new(EdgeNgramTokenizer::new(config, min_gram, max_gram)))
+        }
+        TokenizerStrategy::PathHierarchy { delimiter } => {
+            Ok(Box::new(PathHierarchyTokenizer::new(config, delimiter)))
+        }
+        TokenizerStrategy::UrlEmail => {
+            Ok(Box::new(UrlEmailTokenizer::new(config)))
+        }
+        TokenizerStrategy::Ngram { min_gram, max_gram } => {
+            Ok(Box::new(NgramTokenizer::new(config, min_gram, max_gram)))
+        }
+        TokenizerStrategy::CharGroup { split_on_chars } => {
+            Ok(Box::new(CharGroupTokenizer::new(config, split_on_chars)))
+        }
+        TokenizerStrategy::Letter => Ok(Box::new(LetterTokenizer::new(config))),
+        TokenizerStrategy::Lowercase => Ok(Box::new(LowercaseTokenizer::new(config))),
+    }
+}
+
+pub(crate) fn merge_overlapping_spans(mut spans: Vec<(usize, usize, String)>) -> Vec<(usize, usize, String)> {
+    if spans.is_empty() {
+        return spans;
+    }
+
+    spans.sort_by(|a, b| {
+        a.0.cmp(&b.0)
+            .then_with(|| b.1.cmp(&a.1))
+    });
+
+    let mut merged = Vec::new();
+    let mut current = spans[0].clone();
+
+    for span in spans.into_iter().skip(1) {
+        if span.0 < current.1 {
+            if span.1 > current.1 {
+                current = span;
+            }
+        } else {
+            merged.push(current);
+            current = span;
+        }
+    }
+    merged.push(current);
+
+    merged
+}
+
+// Optimized version that works with indices only
+fn merge_overlapping_spans_optimized(mut spans: Vec<(usize, usize)>) -> Vec<(usize, usize)> {
+    if spans.is_empty() {
+        return spans;
+    }
+
+    spans.sort_unstable_by(|a, b| {
+        a.0.cmp(&b.0)
+            .then_with(|| b.1.cmp(&a.1))
+    });
+
+    let mut merged = Vec::with_capacity(spans.len());
+    let mut current = spans[0];
+
+    for span in spans.into_iter().skip(1) {
+        if span.0 < current.1 {
+            if span.1 > current.1 {
+                current.1 = span.1;
+            }
+        } else {
+            merged.push(current);
+            current = span;
+        }
+    }
+    merged.push(current);
+    merged
+}
+
+pub(crate) fn apply_preserve_patterns(
+    tokens: Vec<String>,
+    preserve_patterns: &[Regex],
+    original_text: &str,
+    config: &TokenizerConfig,
+) -> Vec<String> {
+    apply_preserve_patterns_with_tokenizer(
+        tokens,
+        preserve_patterns,
+        original_text,
+        config,
+        tokenize_simple,
+    )
+}
+
+pub(crate) fn apply_preserve_patterns_with_tokenizer<F>(
+    tokens: Vec<String>,
+    preserve_patterns: &[Regex],
+    original_text: &str,
+    config: &TokenizerConfig,
+    tokenizer_fn: F,
+) -> Vec<String>
+where
+    F: Fn(&str) -> Vec<String>,
+{
+    if preserve_patterns.is_empty() {
+        return tokens;
+    }
+
+    // Use indices instead of allocating strings upfront
+    let mut preserved_spans: Vec<(usize, usize)> = Vec::with_capacity(32);
+    for pattern in preserve_patterns {
+        for mat in pattern.find_iter(original_text) {
+            preserved_spans.push((mat.start(), mat.end()));
+        }
+    }
+
+    if preserved_spans.is_empty() {
+        return tokens;
+    }
+
+    let preserved_spans = merge_overlapping_spans_optimized(preserved_spans);
+
+    // Pre-allocate result vector with estimated capacity
+    let mut result = Vec::with_capacity(tokens.len() + preserved_spans.len());
+    let mut pos = 0;
+
+    for (start, end) in preserved_spans {
+        if start > pos {
+            let before = &original_text[pos..start];
+            let mut before_tokens = tokenizer_fn(before);
+            post_process_in_place(&mut before_tokens, config);
+            result.extend(before_tokens);
+        }
+        // Extract preserved text only when needed
+        result.push(original_text[start..end].to_string());
+        pos = end;
+    }
+
+    if pos < original_text.len() {
+        let remaining = &original_text[pos..];
+        let mut remaining_tokens = tokenizer_fn(remaining);
+        post_process_in_place(&mut remaining_tokens, config);
+        result.extend(remaining_tokens);
+    }
+
+    result
+}
+
+fn tokenize_simple(text: &str) -> Vec<String> {
+    text.split_whitespace()
+        .filter(|s| !s.is_empty())
+        .map(|s| s.to_string())
+        .collect()
+}
+
+pub(crate) fn post_process(tokens: Vec<String>, config: &TokenizerConfig) -> Vec<String> {
+    post_process_with_preserved(tokens, config, None)
+}
+
+// In-place version to avoid allocation
+fn post_process_in_place(tokens: &mut Vec<String>, config: &TokenizerConfig) {
+    if config.lowercase {
+        for token in tokens.iter_mut() {
+            *token = token.to_lowercase();
+        }
+    }
+
+    if config.remove_punctuation {
+        tokens.retain_mut(|token| {
+            token.retain(|c| !c.is_ascii_punctuation());
+            !token.is_empty()
+        });
+    }
+}
+
+pub(crate) fn post_process_with_preserved(
+    mut tokens: Vec<String>,
+    config: &TokenizerConfig,
+    preserve_chars: Option<&str>,
+) -> Vec<String> {
+    if config.lowercase {
+        tokens = tokens.into_iter().map(|t| t.to_lowercase()).collect();
+    }
+
+    if config.remove_punctuation {
+        tokens = tokens
+            .into_iter()
+            .map(|t| {
+                t.chars()
+                    .filter(|c| {
+                        if let Some(preserved) = preserve_chars {
+                            if preserved.contains(*c) {
+                                return true;
+                            }
+                        }
+                        !c.is_ascii_punctuation()
+                    })
+                    .collect()
+            })
+            .filter(|s: &String| !s.is_empty())
+            .collect();
+    }
+
+    tokens
+}
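
The preserve-pattern helpers above collect regex match ranges over the original text, collapse overlapping ranges, and then re-tokenize only the gaps between the preserved ranges. As a rough standalone illustration of the sort-then-sweep merge used by merge_overlapping_spans_optimized (merge_spans below is a hypothetical sketch written for this diff, not part of the gem):

    // Hypothetical standalone sketch mirroring merge_overlapping_spans_optimized:
    // sort by start ascending (widest span first on ties), then sweep once.
    fn merge_spans(mut spans: Vec<(usize, usize)>) -> Vec<(usize, usize)> {
        if spans.is_empty() {
            return spans;
        }
        spans.sort_unstable_by(|a, b| a.0.cmp(&b.0).then_with(|| b.1.cmp(&a.1)));
        let mut merged = Vec::with_capacity(spans.len());
        let mut current = spans[0];
        for span in spans.into_iter().skip(1) {
            if span.0 < current.1 {
                // Overlap: extend the current range if the new one reaches further.
                current.1 = current.1.max(span.1);
            } else {
                merged.push(current);
                current = span;
            }
        }
        merged.push(current);
        merged
    }

    fn main() {
        // (3, 9) overlaps (0, 5), so the two collapse to (0, 9); (12, 15) stands alone.
        let merged = merge_spans(vec![(3, 9), (0, 5), (12, 15)]);
        assert_eq!(merged, vec![(0, 9), (12, 15)]);
        println!("{:?}", merged);
    }

Merging before re-tokenizing keeps the gap slices handed to tokenizer_fn from cutting into a preserved match.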

data/ext/tokenkit/src/tokenizer/ngram.rs
@@ -0,0 +1,80 @@
+use super::Tokenizer;
+use crate::config::TokenizerConfig;
+
+pub struct NgramTokenizer {
+    config: TokenizerConfig,
+    min_gram: usize,
+    max_gram: usize,
+}
+
+impl NgramTokenizer {
+    pub fn new(config: TokenizerConfig, min_gram: usize, max_gram: usize) -> Self {
+        // Validate and sanitize parameters
+        let min_gram = min_gram.max(1); // Minimum 1 character
+        let max_gram = max_gram.max(min_gram); // Ensure max >= min
+
+        Self {
+            config,
+            min_gram,
+            max_gram,
+        }
+    }
+
+    fn generate_ngrams(&self, text: &str) -> Vec<String> {
+        let mut ngrams = Vec::new();
+        let chars: Vec<char> = text.chars().collect();
+        let text_len = chars.len();
+
+        if text_len == 0 {
+            return ngrams;
+        }
+
+        let max = self.max_gram.min(text_len);
+
+        for gram_size in self.min_gram..=max {
+            for start in 0..=(text_len - gram_size) {
+                let ngram: String = chars.iter().skip(start).take(gram_size).collect();
+                ngrams.push(ngram);
+            }
+        }
+
+        ngrams
+    }
+}
+
+impl Tokenizer for NgramTokenizer {
+    fn tokenize(&self, text: &str) -> Vec<String> {
+        let mut all_ngrams = Vec::new();
+
+        for word in text.split_whitespace() {
+            if word.is_empty() {
+                continue;
+            }
+
+            let processed_word = if self.config.remove_punctuation {
+                word.chars()
+                    .filter(|c| !c.is_ascii_punctuation())
+                    .collect()
+            } else {
+                word.to_string()
+            };
+
+            if processed_word.is_empty() {
+                continue;
+            }
+
+            let ngrams = self.generate_ngrams(&processed_word);
+            all_ngrams.extend(ngrams);
+        }
+
+        // Apply lowercase if needed. Note: remove_punctuation already handled above.
+        let mut result = all_ngrams;
+
+        if self.config.lowercase {
+            result = result.into_iter().map(|t| t.to_lowercase()).collect();
+        }
+
+        result
+    }
+
+}
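
generate_ngrams above expands every window of min_gram..=max_gram characters within each whitespace-separated word. A minimal standalone sketch of that character-window expansion (char_ngrams is a hypothetical helper written for illustration, not the gem's API):

    // Hypothetical sketch of character n-gram expansion over a single word.
    fn char_ngrams(word: &str, min_gram: usize, max_gram: usize) -> Vec<String> {
        let chars: Vec<char> = word.chars().collect();
        let mut out = Vec::new();
        // Never ask for windows longer than the word itself.
        let max = max_gram.min(chars.len());
        for size in min_gram..=max {
            for start in 0..=(chars.len() - size) {
                out.push(chars[start..start + size].iter().collect());
            }
        }
        out
    }

    fn main() {
        // With min_gram = 2 and max_gram = 3, "fox" yields: fo, ox, fox.
        assert_eq!(char_ngrams("fox", 2, 3), vec!["fo", "ox", "fox"]);
    }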

data/ext/tokenkit/src/tokenizer/path_hierarchy.rs
@@ -0,0 +1,187 @@
+use super::{post_process_with_preserved, BaseTokenizerFields, Tokenizer};
+use crate::config::TokenizerConfig;
+
+pub struct PathHierarchyTokenizer {
+    base: BaseTokenizerFields,
+    delimiter: String,
+}
+
+impl PathHierarchyTokenizer {
+    pub fn new(config: TokenizerConfig, delimiter: String) -> Self {
+        Self {
+            base: BaseTokenizerFields::new(config),
+            delimiter,
+        }
+    }
+
+    fn generate_hierarchy(&self, path: &str) -> Vec<String> {
+        let mut tokens = Vec::new();
+        let parts: Vec<&str> = path.split(&self.delimiter).collect();
+
+        let mut current_path = String::new();
+        let starts_with_delimiter = path.starts_with(&self.delimiter);
+
+        for part in parts.iter() {
+            if part.is_empty() {
+                continue;
+            }
+
+            if !current_path.is_empty() {
+                current_path.push_str(&self.delimiter);
+            } else if starts_with_delimiter {
+                current_path.push_str(&self.delimiter);
+            }
+
+            current_path.push_str(part);
+            tokens.push(current_path.clone());
+        }
+
+        tokens
+    }
+
+    fn apply_patterns_to_hierarchy(&self, text: &str) -> Vec<String> {
+        if self.base.preserve_patterns().is_empty() {
+            return self.generate_hierarchy(text);
+        }
+
+        // Generate all hierarchical tokens first
+        let all_tokens = self.generate_hierarchy(text);
+
+        // Find which tokens are completely matched by preserve patterns
+        let mut preserved_tokens = Vec::new();
+        for token in &all_tokens {
+            for pattern in self.base.preserve_patterns() {
+                if let Some(mat) = pattern.find(token) {
+                    if mat.as_str() == token {
+                        preserved_tokens.push(token.clone());
+                        break;
+                    }
+                }
+            }
+        }
+
+        // Now build the result, applying lowercase where appropriate
+        let mut result = Vec::new();
+        for token in all_tokens {
+            // Check if this token should be included
+            // Include if: it's a preserved token OR it extends beyond a preserved token
+            let should_include;
+            let mut apply_lowercase = self.base.config.lowercase;
+
+            if preserved_tokens.contains(&token) {
+                should_include = true;
+                apply_lowercase = false; // Don't lowercase preserved tokens
+            } else {
+                // Check if this token extends a preserved token
+                let mut extends_preserved = false;
+                for preserved in &preserved_tokens {
+                    if token.starts_with(preserved) && token.len() > preserved.len() {
+                        extends_preserved = true;
+                        break;
+                    }
+                }
+
+                if extends_preserved {
+                    should_include = true;
+                } else {
+                    // Include if no preserved token is a prefix of this one
+                    let mut has_preserved_prefix = false;
+                    for preserved in &preserved_tokens {
+                        if preserved.starts_with(&token) && preserved != &token {
+                            has_preserved_prefix = true;
+                            break;
+                        }
+                    }
+                    should_include = !has_preserved_prefix;
+                }
+            }
+
+            if should_include {
+                if apply_lowercase && !preserved_tokens.contains(&token) {
+                    // Apply lowercase to non-preserved parts
+                    let mut lowercased = String::new();
+                    let starts_with_delim = token.starts_with(&self.delimiter);
+                    let parts: Vec<&str> = token.split(&self.delimiter).collect();
+
+                    for (i, part) in parts.iter().enumerate() {
+                        if part.is_empty() {
+                            if i == 0 && starts_with_delim {
+                                // Path starts with delimiter, add it once
+                                lowercased.push_str(&self.delimiter);
+                            }
+                            continue;
+                        }
+
+                        if i > 0 || (i == 0 && starts_with_delim) {
+                            if !lowercased.is_empty() && !lowercased.ends_with(&self.delimiter) {
+                                lowercased.push_str(&self.delimiter);
+                            }
+                        }
+
+                        // Check if this specific part should be preserved
+                        let mut preserve_part = false;
+                        for pattern in self.base.preserve_patterns() {
+                            if pattern.is_match(part) {
+                                preserve_part = true;
+                                break;
+                            }
+                        }
+
+                        if preserve_part {
+                            lowercased.push_str(part);
+                        } else {
+                            lowercased.push_str(&part.to_lowercase());
+                        }
+                    }
+                    result.push(lowercased);
+                } else {
+                    result.push(token);
+                }
+            }
+        }
+
+        result
+    }
+}
+
+impl Tokenizer for PathHierarchyTokenizer {
+    fn tokenize(&self, text: &str) -> Vec<String> {
+        let trimmed = text.trim();
+        if trimmed.is_empty() {
+            return vec![];
+        }
+
+        if self.base.has_preserve_patterns() {
+            let mut tokens = self.apply_patterns_to_hierarchy(trimmed);
+
+            // Apply remove_punctuation if needed (but preserve delimiters)
+            if self.base.config.remove_punctuation {
+                tokens = tokens.into_iter().map(|token| {
+                    let parts: Vec<&str> = token.split(&self.delimiter).collect();
+                    let processed: Vec<String> = parts.iter().map(|part| {
+                        if part.is_empty() {
+                            String::new()
+                        } else {
+                            // Check if this part should be preserved
+                            let should_preserve = self.base.preserve_patterns().iter().any(|p| p.is_match(part));
+                            if should_preserve {
+                                part.to_string()
+                            } else {
+                                part.chars()
+                                    .filter(|c| !c.is_ascii_punctuation() || self.delimiter.contains(*c))
+                                    .collect()
+                            }
+                        }
+                    }).collect();
+                    processed.join(&self.delimiter)
+                }).filter(|s| !s.is_empty() && s != &self.delimiter).collect();
+            }
+
+            tokens
+        } else {
+            let tokens = self.generate_hierarchy(trimmed);
+            post_process_with_preserved(tokens, &self.base.config, Some(&self.delimiter))
+        }
+    }
+
+}
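
generate_hierarchy above emits one token per path prefix, re-attaching the leading delimiter when the input starts with one. A standalone sketch of that prefix expansion (path_hierarchy below is a hypothetical illustration taking the delimiter as a plain &str, not the gem's API):

    // Hypothetical sketch of path-hierarchy expansion.
    fn path_hierarchy(path: &str, delimiter: &str) -> Vec<String> {
        let starts_with_delim = path.starts_with(delimiter);
        let mut current = String::new();
        let mut tokens = Vec::new();
        for part in path.split(delimiter).filter(|p| !p.is_empty()) {
            // Prepend the delimiter between parts, or up front for absolute paths.
            if !current.is_empty() || starts_with_delim {
                current.push_str(delimiter);
            }
            current.push_str(part);
            tokens.push(current.clone());
        }
        tokens
    }

    fn main() {
        // An absolute path expands to every ancestor prefix plus the full path.
        assert_eq!(
            path_hierarchy("/usr/local/bin", "/"),
            vec!["/usr", "/usr/local", "/usr/local/bin"]
        );
    }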

data/ext/tokenkit/src/tokenizer/pattern.rs
@@ -0,0 +1,38 @@
+use super::{apply_preserve_patterns, post_process, BaseTokenizerFields, Tokenizer};
+use crate::config::TokenizerConfig;
+use crate::error::Result;
+use regex::Regex;
+
+pub struct PatternTokenizer {
+    base: BaseTokenizerFields,
+    pattern: Regex,
+}
+
+impl PatternTokenizer {
+    pub fn new(regex: &str, config: TokenizerConfig) -> Result<Self> {
+        // Pattern is already validated in validate_config(), safe to unwrap
+        let pattern = Regex::new(regex).expect("Pattern should have been validated");
+
+        Ok(Self {
+            base: BaseTokenizerFields::new(config),
+            pattern,
+        })
+    }
+}
+
+impl Tokenizer for PatternTokenizer {
+    fn tokenize(&self, text: &str) -> Vec<String> {
+        let tokens: Vec<String> = self
+            .pattern
+            .find_iter(text)
+            .map(|mat| mat.as_str().to_string())
+            .collect();
+
+        if self.base.has_preserve_patterns() {
+            apply_preserve_patterns(tokens, self.base.preserve_patterns(), text, &self.base.config)
+        } else {
+            post_process(tokens, &self.base.config)
+        }
+    }
+
+}
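
PatternTokenizer treats every match of the configured regex as a token, so the pattern describes the tokens themselves rather than the separators between them. A minimal standalone sketch using the regex crate the extension already depends on (the \w+ pattern here is only an example, not a gem default):

    use regex::Regex;

    fn main() {
        // Every run of word characters becomes a token; hyphens and spaces separate them.
        let pattern = Regex::new(r"\w+").expect("valid example pattern");
        let tokens: Vec<&str> = pattern
            .find_iter("state-of-the-art NLP")
            .map(|m| m.as_str())
            .collect();
        assert_eq!(tokens, vec!["state", "of", "the", "art", "NLP"]);
    }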

data/ext/tokenkit/src/tokenizer/sentence.rs
@@ -0,0 +1,89 @@
+use super::{post_process, BaseTokenizerFields, Tokenizer};
+use crate::config::TokenizerConfig;
+use unicode_segmentation::UnicodeSegmentation;
+
+pub struct SentenceTokenizer {
+    base: BaseTokenizerFields,
+}
+
+impl SentenceTokenizer {
+    pub fn new(config: TokenizerConfig) -> Self {
+        Self {
+            base: BaseTokenizerFields::new(config),
+        }
+    }
+}
+
+impl SentenceTokenizer {
+    fn apply_patterns_to_sentence(&self, sentence: &str) -> String {
+        if self.base.preserve_patterns().is_empty() || !self.base.config.lowercase {
+            return sentence.to_string();
+        }
+
+        // Find all matches in the sentence
+        let mut preserved_spans: Vec<(usize, usize, String)> = Vec::new();
+        for pattern in self.base.preserve_patterns() {
+            for mat in pattern.find_iter(sentence) {
+                preserved_spans.push((mat.start(), mat.end(), mat.as_str().to_string()));
+            }
+        }
+
+        if preserved_spans.is_empty() {
+            return sentence.to_string();
+        }
+
+        // Sort and merge overlapping spans
+        preserved_spans.sort_by(|a, b| a.0.cmp(&b.0).then_with(|| b.1.cmp(&a.1)));
+
+        let mut result = String::new();
+        let mut pos = 0;
+
+        for (start, end, preserved) in preserved_spans {
+            if start > pos {
+                // Lowercase the text before the preserved pattern
+                result.push_str(&sentence[pos..start].to_lowercase());
+            }
+            // Keep the preserved pattern as-is
+            result.push_str(&preserved);
+            pos = end.max(pos); // Handle overlaps
+        }
+
+        if pos < sentence.len() {
+            // Lowercase the remaining text
+            result.push_str(&sentence[pos..].to_lowercase());
+        }
+
+        result
+    }
+}
+
+impl Tokenizer for SentenceTokenizer {
+    fn tokenize(&self, text: &str) -> Vec<String> {
+        let mut sentences: Vec<String> = text
+            .unicode_sentences()
+            .map(|s| s.to_string())
+            .collect();
+
+        // Apply preserve patterns to each sentence
+        if self.base.has_preserve_patterns() && self.base.config.lowercase {
+            sentences = sentences
+                .into_iter()
+                .map(|sentence| self.apply_patterns_to_sentence(&sentence))
+                .collect();
+
+            // Don't call post_process since we already handled lowercasing with patterns
+            // Just handle remove_punctuation if needed
+            if self.base.config.remove_punctuation {
+                sentences = sentences
+                    .into_iter()
+                    .map(|s| s.chars().filter(|c| !c.is_ascii_punctuation()).collect())
+                    .filter(|s: &String| !s.is_empty())
+                    .collect();
+            }
+            sentences
+        } else {
+            post_process(sentences, &self.base.config)
+        }
+    }
+
+}
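
SentenceTokenizer delegates boundary detection to unicode_sentences from the unicode-segmentation crate, i.e. UAX #29 default sentence boundaries; whitespace between sentences stays attached to a segment rather than being dropped. A standalone sketch of that behavior (not gem code; the sample text is arbitrary):

    use unicode_segmentation::UnicodeSegmentation;

    fn main() {
        let text = "Rust is fast. Ruby is friendly!";
        // UAX #29 default rules split this into two sentence segments.
        let sentences: Vec<&str> = text.unicode_sentences().collect();
        assert_eq!(sentences.len(), 2);
        assert_eq!(sentences[0].trim_end(), "Rust is fast.");
        assert_eq!(sentences[1].trim_start(), "Ruby is friendly!");
        println!("{:?}", sentences);
    }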

data/ext/tokenkit/src/tokenizer/unicode.rs
@@ -0,0 +1,36 @@
+use super::{apply_preserve_patterns, post_process, BaseTokenizerFields, Tokenizer};
+use crate::config::TokenizerConfig;
+use unicode_segmentation::UnicodeSegmentation;
+
+pub struct UnicodeTokenizer {
+    base: BaseTokenizerFields,
+}
+
+impl UnicodeTokenizer {
+    pub fn new(config: TokenizerConfig) -> Self {
+        Self {
+            base: BaseTokenizerFields::new(config),
+        }
+    }
+}
+
+impl Tokenizer for UnicodeTokenizer {
+    fn tokenize(&self, text: &str) -> Vec<String> {
+        if self.base.has_preserve_patterns() {
+            let tokens = text
+                .unicode_words()
+                .map(|s| s.to_string())
+                .collect();
+
+            return apply_preserve_patterns(tokens, self.base.preserve_patterns(), text, &self.base.config);
+        }
+
+        let tokens: Vec<String> = text
+            .unicode_words()
+            .map(|s| s.to_string())
+            .collect();
+
+        post_process(tokens, &self.base.config)
+    }
+
+}
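
UnicodeTokenizer leans on unicode_words, the UAX #29 word-boundary iterator from the same unicode-segmentation crate, which drops punctuation-only segments while keeping interior apostrophes and decimal numbers intact. A standalone sketch (not gem code):

    use unicode_segmentation::UnicodeSegmentation;

    fn main() {
        let words: Vec<&str> = "The fox can't jump 32.3 feet, right?"
            .unicode_words()
            .collect();
        // Punctuation-only segments disappear; "can't" and "32.3" stay whole.
        assert_eq!(words, vec!["The", "fox", "can't", "jump", "32.3", "feet", "right"]);
        println!("{:?}", words);
    }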