tokenkit 0.1.0.pre.1
This diff shows the content of publicly available package versions as released to the supported registries. It is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +7 -0
- data/.rspec +3 -0
- data/.standard.yml +3 -0
- data/.yardopts +12 -0
- data/CODE_OF_CONDUCT.md +132 -0
- data/LICENSE.txt +21 -0
- data/README.md +644 -0
- data/Rakefile +18 -0
- data/benchmarks/cache_test.rb +63 -0
- data/benchmarks/final_comparison.rb +83 -0
- data/benchmarks/tokenizer_benchmark.rb +250 -0
- data/docs/ARCHITECTURE.md +469 -0
- data/docs/PERFORMANCE.md +382 -0
- data/docs/README.md +118 -0
- data/ext/tokenkit/Cargo.toml +21 -0
- data/ext/tokenkit/extconf.rb +4 -0
- data/ext/tokenkit/src/config.rs +37 -0
- data/ext/tokenkit/src/error.rs +67 -0
- data/ext/tokenkit/src/lib.rs +346 -0
- data/ext/tokenkit/src/tokenizer/base.rs +41 -0
- data/ext/tokenkit/src/tokenizer/char_group.rs +62 -0
- data/ext/tokenkit/src/tokenizer/edge_ngram.rs +73 -0
- data/ext/tokenkit/src/tokenizer/grapheme.rs +26 -0
- data/ext/tokenkit/src/tokenizer/keyword.rs +25 -0
- data/ext/tokenkit/src/tokenizer/letter.rs +41 -0
- data/ext/tokenkit/src/tokenizer/lowercase.rs +51 -0
- data/ext/tokenkit/src/tokenizer/mod.rs +254 -0
- data/ext/tokenkit/src/tokenizer/ngram.rs +80 -0
- data/ext/tokenkit/src/tokenizer/path_hierarchy.rs +187 -0
- data/ext/tokenkit/src/tokenizer/pattern.rs +38 -0
- data/ext/tokenkit/src/tokenizer/sentence.rs +89 -0
- data/ext/tokenkit/src/tokenizer/unicode.rs +36 -0
- data/ext/tokenkit/src/tokenizer/url_email.rs +108 -0
- data/ext/tokenkit/src/tokenizer/whitespace.rs +31 -0
- data/lib/tokenkit/config.rb +74 -0
- data/lib/tokenkit/config_builder.rb +209 -0
- data/lib/tokenkit/config_compat.rb +52 -0
- data/lib/tokenkit/configuration.rb +194 -0
- data/lib/tokenkit/regex_converter.rb +58 -0
- data/lib/tokenkit/version.rb +5 -0
- data/lib/tokenkit.rb +336 -0
- data/sig/tokenkit.rbs +4 -0
- metadata +172 -0
@@ -0,0 +1,346 @@ data/ext/tokenkit/src/lib.rs

mod config;
mod error;
mod tokenizer;

use config::{TokenizerConfig, TokenizerStrategy};
use error::TokenizerError;
use magnus::{define_module, function, Error, RArray, RHash, TryConvert};
use std::sync::Mutex;
use once_cell::sync::Lazy;

// Store the default configuration and a cached tokenizer
struct TokenizerCache {
    config: TokenizerConfig,
    tokenizer: Option<Box<dyn tokenizer::Tokenizer + Send + Sync>>,
}

static DEFAULT_CACHE: Lazy<Mutex<TokenizerCache>> = Lazy::new(|| {
    Mutex::new(TokenizerCache {
        config: TokenizerConfig::default(),
        tokenizer: None,
    })
});

// Use cached tokenizer if config hasn't changed
fn tokenize(text: String) -> std::result::Result<Vec<String>, Error> {
    let mut cache = DEFAULT_CACHE
        .lock()
        .map_err(|e| TokenizerError::MutexError(e.to_string()))?;

    // Check if we need to create a new tokenizer
    if cache.tokenizer.is_none() {
        let tokenizer = tokenizer::from_config(cache.config.clone())?;
        cache.tokenizer = Some(tokenizer);
    }

    // Use the cached tokenizer
    let result = cache
        .tokenizer
        .as_ref()
        .unwrap()
        .tokenize(&text);

    Ok(result)
}

// Configure sets the default configuration and invalidates cache
fn configure(config_hash: RHash) -> std::result::Result<(), Error> {
    let config = parse_config_from_hash(config_hash)?;

    // Update cache with new config and clear tokenizer
    let mut cache = DEFAULT_CACHE
        .lock()
        .map_err(|e| TokenizerError::MutexError(e.to_string()))?;
    cache.config = config;
    cache.tokenizer = None; // Invalidate cached tokenizer

    Ok(())
}

// Reset to factory defaults
fn reset() -> std::result::Result<(), Error> {
    let mut cache = DEFAULT_CACHE
        .lock()
        .map_err(|e| TokenizerError::MutexError(e.to_string()))?;
    cache.config = TokenizerConfig::default();
    cache.tokenizer = None; // Clear cached tokenizer
    Ok(())
}

// Get current default configuration
fn config_hash() -> std::result::Result<RHash, Error> {
    let cache = DEFAULT_CACHE
        .lock()
        .map_err(|e| TokenizerError::MutexError(e.to_string()))?;

    config_to_hash(&cache.config)
}

// Helper function to convert config to RHash
fn config_to_hash(config: &TokenizerConfig) -> std::result::Result<RHash, Error> {
    let hash = RHash::new();

    let strategy_str = match &config.strategy {
        TokenizerStrategy::Whitespace => "whitespace",
        TokenizerStrategy::Unicode => "unicode",
        TokenizerStrategy::Pattern { .. } => "pattern",
        TokenizerStrategy::Sentence => "sentence",
        TokenizerStrategy::Grapheme { .. } => "grapheme",
        TokenizerStrategy::Keyword => "keyword",
        TokenizerStrategy::EdgeNgram { .. } => "edge_ngram",
        TokenizerStrategy::Ngram { .. } => "ngram",
        TokenizerStrategy::PathHierarchy { .. } => "path_hierarchy",
        TokenizerStrategy::UrlEmail => "url_email",
        TokenizerStrategy::CharGroup { .. } => "char_group",
        TokenizerStrategy::Letter => "letter",
        TokenizerStrategy::Lowercase => "lowercase",
    };
    hash.aset("strategy", strategy_str)?;

    if let TokenizerStrategy::Pattern { regex } = &config.strategy {
        hash.aset("regex", regex.as_str())?;
    }

    if let TokenizerStrategy::Grapheme { extended } = &config.strategy {
        hash.aset("extended", *extended)?;
    }

    if let TokenizerStrategy::EdgeNgram { min_gram, max_gram } = &config.strategy {
        hash.aset("min_gram", *min_gram)?;
        hash.aset("max_gram", *max_gram)?;
    }

    if let TokenizerStrategy::PathHierarchy { delimiter } = &config.strategy {
        hash.aset("delimiter", delimiter.as_str())?;
    }

    if let TokenizerStrategy::Ngram { min_gram, max_gram } = &config.strategy {
        hash.aset("min_gram", *min_gram)?;
        hash.aset("max_gram", *max_gram)?;
    }

    if let TokenizerStrategy::CharGroup { split_on_chars } = &config.strategy {
        hash.aset("split_on_chars", split_on_chars.as_str())?;
    }

    hash.aset("lowercase", config.lowercase)?;
    hash.aset("remove_punctuation", config.remove_punctuation)?;

    let patterns = RArray::new();
    for pattern in &config.preserve_patterns {
        patterns.push(pattern.as_str())?;
    }
    hash.aset("preserve_patterns", patterns)?;

    Ok(hash)
}

// Parse config from Ruby hash
fn parse_config_from_hash(config_hash: RHash) -> std::result::Result<TokenizerConfig, Error> {
    let strategy_val = config_hash.get("strategy");
    let strategy = if let Some(val) = strategy_val {
        let strategy_str: String = TryConvert::try_convert(val)?;
        match strategy_str.as_str() {
            "whitespace" => TokenizerStrategy::Whitespace,
            "unicode" => TokenizerStrategy::Unicode,
            "pattern" => {
                let regex_val = config_hash
                    .get("regex")
                    .ok_or_else(|| {
                        TokenizerError::InvalidConfiguration(
                            "pattern strategy requires regex parameter".to_string()
                        )
                    })?;
                let regex: String = TryConvert::try_convert(regex_val)?;
                TokenizerStrategy::Pattern { regex }
            }
            "sentence" => TokenizerStrategy::Sentence,
            "grapheme" => {
                let extended_val = config_hash.get("extended");
                let extended = if let Some(val) = extended_val {
                    TryConvert::try_convert(val)?
                } else {
                    true
                };
                TokenizerStrategy::Grapheme { extended }
            }
            "keyword" => TokenizerStrategy::Keyword,
            "edge_ngram" => {
                let min_gram_val = config_hash.get("min_gram");
                let min_gram = if let Some(val) = min_gram_val {
                    TryConvert::try_convert(val)?
                } else {
                    2
                };
                let max_gram_val = config_hash.get("max_gram");
                let max_gram = if let Some(val) = max_gram_val {
                    TryConvert::try_convert(val)?
                } else {
                    10
                };
                TokenizerStrategy::EdgeNgram { min_gram, max_gram }
            }
            "path_hierarchy" => {
                let delimiter_val = config_hash.get("delimiter");
                let delimiter = if let Some(val) = delimiter_val {
                    TryConvert::try_convert(val)?
                } else {
                    "/".to_string()
                };
                TokenizerStrategy::PathHierarchy { delimiter }
            }
            "url_email" => TokenizerStrategy::UrlEmail,
            "ngram" => {
                let min_gram_val = config_hash.get("min_gram");
                let min_gram = if let Some(val) = min_gram_val {
                    TryConvert::try_convert(val)?
                } else {
                    2
                };
                let max_gram_val = config_hash.get("max_gram");
                let max_gram = if let Some(val) = max_gram_val {
                    TryConvert::try_convert(val)?
                } else {
                    10
                };
                TokenizerStrategy::Ngram { min_gram, max_gram }
            }
            "char_group" => {
                let split_on_chars_val = config_hash.get("split_on_chars");
                let split_on_chars = if let Some(val) = split_on_chars_val {
                    TryConvert::try_convert(val)?
                } else {
                    " \t\n\r".to_string()
                };
                TokenizerStrategy::CharGroup { split_on_chars }
            }
            "letter" => TokenizerStrategy::Letter,
            "lowercase" => TokenizerStrategy::Lowercase,
            _ => {
                return Err(TokenizerError::UnknownStrategy(strategy_str).into())
            }
        }
    } else {
        TokenizerStrategy::Unicode
    };

    let lowercase_val = config_hash.get("lowercase");
    let lowercase = if let Some(val) = lowercase_val {
        TryConvert::try_convert(val)?
    } else {
        true
    };

    let remove_punctuation_val = config_hash.get("remove_punctuation");
    let remove_punctuation = if let Some(val) = remove_punctuation_val {
        TryConvert::try_convert(val)?
    } else {
        false
    };

    let preserve_patterns_val = config_hash.get("preserve_patterns");
    let preserve_patterns = if let Some(val) = preserve_patterns_val {
        let array: RArray = TryConvert::try_convert(val)?;
        let mut patterns = Vec::new();
        for idx in 0..array.len() {
            let item = array.entry(idx as isize)?;
            let pattern_str: String = TryConvert::try_convert(item)?;
            patterns.push(pattern_str);
        }
        patterns
    } else {
        Vec::new()
    };

    let config = TokenizerConfig {
        strategy,
        lowercase,
        remove_punctuation,
        preserve_patterns,
    };

    // Validate the configuration
    validate_config(&config)?;

    Ok(config)
}

// Validate configuration parameters
fn validate_config(config: &TokenizerConfig) -> std::result::Result<(), TokenizerError> {
    use TokenizerStrategy::*;

    match &config.strategy {
        EdgeNgram { min_gram, max_gram } | Ngram { min_gram, max_gram } => {
            if *min_gram == 0 {
                return Err(TokenizerError::InvalidNgramConfig {
                    min: *min_gram,
                    max: *max_gram,
                });
            }
            if min_gram > max_gram {
                return Err(TokenizerError::InvalidNgramConfig {
                    min: *min_gram,
                    max: *max_gram,
                });
            }
        }
        PathHierarchy { delimiter } => {
            if delimiter.is_empty() {
                return Err(TokenizerError::EmptyDelimiter {
                    tokenizer: "PathHierarchy".to_string(),
                });
            }
        }
        Pattern { regex } => {
            // Validate regex pattern
            regex::Regex::new(regex).map_err(|e| TokenizerError::InvalidRegex {
                pattern: regex.clone(),
                error: e.to_string(),
            })?;
        }
        _ => {}
    }

    // Validate preserve patterns
    for pattern in &config.preserve_patterns {
        regex::Regex::new(pattern).map_err(|e| TokenizerError::InvalidRegex {
            pattern: pattern.clone(),
            error: e.to_string(),
        })?;
    }

    Ok(())
}

// Load config is just an alias for configure (for backward compat)
fn load_config(config_hash: RHash) -> std::result::Result<(), Error> {
    configure(config_hash)
}

// Tokenize with a specific config (creates fresh tokenizer)
fn tokenize_with_config(text: String, config_hash: RHash) -> std::result::Result<Vec<String>, Error> {
    let config = parse_config_from_hash(config_hash)?;

    // Create fresh tokenizer from config
    let tokenizer = tokenizer::from_config(config)?;

    // Tokenize and return
    Ok(tokenizer.tokenize(&text))
}

#[magnus::init]
fn init(_ruby: &magnus::Ruby) -> std::result::Result<(), Error> {
    let module = define_module("TokenKit")?;

    // Public API functions
    module.define_module_function("_tokenize", function!(tokenize, 1))?;
    module.define_module_function("_configure", function!(configure, 1))?;
    module.define_module_function("_reset", function!(reset, 0))?;
    module.define_module_function("_config_hash", function!(config_hash, 0))?;
    module.define_module_function("_load_config", function!(load_config, 1))?;

    // New instance-based function
    module.define_module_function("_tokenize_with_config", function!(tokenize_with_config, 2))?;

    Ok(())
}
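Editor's note: the sketch below (not part of the package) illustrates how the hash keys handled by parse_config_from_hash map onto the internal TokenizerConfig, and how tokenize_with_config then drives a tokenizer from it. It assumes TokenizerConfig's fields are crate-visible and that TokenizerError converts into magnus::Error, as the code above relies on.

// Sketch only: mirrors what tokenize_with_config does once the Ruby hash
// {strategy: "edge_ngram", min_gram: 2, max_gram: 10} has been parsed.
fn tokenize_like_edge_ngram(text: &str) -> std::result::Result<Vec<String>, magnus::Error> {
    let config = TokenizerConfig {
        strategy: TokenizerStrategy::EdgeNgram { min_gram: 2, max_gram: 10 },
        lowercase: true,           // default applied when the key is absent
        remove_punctuation: false, // default applied when the key is absent
        preserve_patterns: Vec::new(),
    };
    validate_config(&config)?; // same checks parse_config_from_hash runs
    let tokenizer = tokenizer::from_config(config)?;
    Ok(tokenizer.tokenize(text))
}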
@@ -0,0 +1,41 @@ data/ext/tokenkit/src/tokenizer/base.rs

use crate::config::TokenizerConfig;
use regex::Regex;

/// Common functionality for tokenizers that support preserve_patterns
/// Note: Since we validate patterns in validate_config(), they're guaranteed to be valid here
pub fn create_preserve_patterns(config: &TokenizerConfig) -> Vec<Regex> {
    config
        .preserve_patterns
        .iter()
        .map(|p| {
            // Safe to unwrap because patterns are validated in validate_config()
            Regex::new(p).expect("Pattern should have been validated")
        })
        .collect()
}

/// Base fields that most tokenizers need
pub struct BaseTokenizerFields {
    pub config: TokenizerConfig,
    pub preserve_patterns: Vec<Regex>,
}

impl BaseTokenizerFields {
    pub fn new(config: TokenizerConfig) -> Self {
        let preserve_patterns = create_preserve_patterns(&config);
        Self {
            config,
            preserve_patterns,
        }
    }

    pub fn has_preserve_patterns(&self) -> bool {
        !self.preserve_patterns.is_empty()
    }

    pub fn preserve_patterns(&self) -> &[Regex] {
        &self.preserve_patterns
    }
}
@@ -0,0 +1,62 @@ data/ext/tokenkit/src/tokenizer/char_group.rs

use super::{apply_preserve_patterns_with_tokenizer, post_process, BaseTokenizerFields, Tokenizer};
use crate::config::TokenizerConfig;
use std::collections::HashSet;

pub struct CharGroupTokenizer {
    base: BaseTokenizerFields,
    split_chars: HashSet<char>,
}

impl CharGroupTokenizer {
    pub fn new(config: TokenizerConfig, split_on_chars: String) -> Self {
        // Note: Empty split_on_chars is valid - it makes the tokenizer behave like
        // a keyword tokenizer (no splitting, returns whole text as single token)
        let split_chars: HashSet<char> = split_on_chars.chars().collect();

        Self {
            base: BaseTokenizerFields::new(config),
            split_chars,
        }
    }

    fn tokenize_text(&self, text: &str) -> Vec<String> {
        let mut tokens = Vec::new();
        let mut current_token = String::new();

        for ch in text.chars() {
            if self.split_chars.contains(&ch) {
                if !current_token.is_empty() {
                    tokens.push(current_token.clone());
                    current_token.clear();
                }
            } else {
                current_token.push(ch);
            }
        }

        if !current_token.is_empty() {
            tokens.push(current_token);
        }

        tokens
    }
}

impl Tokenizer for CharGroupTokenizer {
    fn tokenize(&self, text: &str) -> Vec<String> {
        let tokens = self.tokenize_text(text);

        if self.base.has_preserve_patterns() {
            apply_preserve_patterns_with_tokenizer(
                tokens,
                self.base.preserve_patterns(),
                text,
                &self.base.config,
                |t| self.tokenize_text(t),
            )
        } else {
            post_process(tokens, &self.base.config)
        }
    }
}
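Editor's note: an illustrative sketch (not part of the package) of the splitting behavior above, written as a test that would sit alongside these modules. It assumes TokenizerConfig's fields are visible there and that post_process leaves already-lowercase alphanumeric tokens unchanged when remove_punctuation is false.

// Sketch only: CharGroupTokenizer splits on every character in split_on_chars
// and never emits empty tokens between consecutive separators.
#[test]
fn char_group_splits_on_custom_set() {
    let config = TokenizerConfig {
        strategy: TokenizerStrategy::CharGroup { split_on_chars: ",; ".to_string() },
        lowercase: true,
        remove_punctuation: false,
        preserve_patterns: Vec::new(),
    };
    let tokenizer = CharGroupTokenizer::new(config, ",; ".to_string());
    assert_eq!(tokenizer.tokenize("foo,,bar; baz"), vec!["foo", "bar", "baz"]);
}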
@@ -0,0 +1,73 @@ data/ext/tokenkit/src/tokenizer/edge_ngram.rs

use super::{Tokenizer};
use crate::config::TokenizerConfig;

pub struct EdgeNgramTokenizer {
    config: TokenizerConfig,
    min_gram: usize,
    max_gram: usize,
}

impl EdgeNgramTokenizer {
    pub fn new(config: TokenizerConfig, min_gram: usize, max_gram: usize) -> Self {
        // Validate and sanitize parameters
        let min_gram = min_gram.max(1); // Minimum 1 character
        let max_gram = max_gram.max(min_gram); // Ensure max >= min

        Self { config, min_gram, max_gram }
    }

    fn generate_edge_ngrams(&self, text: &str) -> Vec<String> {
        let mut ngrams = Vec::new();
        let chars: Vec<char> = text.chars().collect();
        let text_len = chars.len();

        if text_len == 0 {
            return ngrams;
        }

        let max = self.max_gram.min(text_len);

        for gram_size in self.min_gram..=max {
            let ngram: String = chars.iter().take(gram_size).collect();
            ngrams.push(ngram);
        }

        ngrams
    }
}

impl Tokenizer for EdgeNgramTokenizer {
    fn tokenize(&self, text: &str) -> Vec<String> {
        let mut all_ngrams = Vec::new();

        for word in text.split_whitespace() {
            if word.is_empty() {
                continue;
            }

            let processed_word = if self.config.remove_punctuation {
                word.chars()
                    .filter(|c| !c.is_ascii_punctuation())
                    .collect()
            } else {
                word.to_string()
            };

            if processed_word.is_empty() {
                continue;
            }

            let ngrams = self.generate_edge_ngrams(&processed_word);
            all_ngrams.extend(ngrams);
        }

        let mut result = all_ngrams;

        if self.config.lowercase {
            result = result.into_iter().map(|t| t.to_lowercase()).collect();
        }

        result
    }
}
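Editor's note: a worked example (not part of the package) of the prefix generation above. With min_gram 2 and max_gram 4, each whitespace-separated word contributes its 2- through 4-character prefixes, lowercased when config.lowercase is set; TokenizerConfig's fields are assumed visible from a test placed alongside these modules.

// Sketch only: "Search" has prefixes "Se", "Sea", "Sear" for gram sizes 2..=4,
// which the lowercase flag then folds to lowercase.
#[test]
fn edge_ngram_emits_word_prefixes() {
    let config = TokenizerConfig {
        strategy: TokenizerStrategy::EdgeNgram { min_gram: 2, max_gram: 4 },
        lowercase: true,
        remove_punctuation: false,
        preserve_patterns: Vec::new(),
    };
    let tokenizer = EdgeNgramTokenizer::new(config, 2, 4);
    assert_eq!(tokenizer.tokenize("Search"), vec!["se", "sea", "sear"]);
}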
@@ -0,0 +1,26 @@ data/ext/tokenkit/src/tokenizer/grapheme.rs

use super::{post_process, Tokenizer};
use crate::config::TokenizerConfig;
use unicode_segmentation::UnicodeSegmentation;

pub struct GraphemeTokenizer {
    config: TokenizerConfig,
    extended: bool,
}

impl GraphemeTokenizer {
    pub fn new(config: TokenizerConfig, extended: bool) -> Self {
        Self { config, extended }
    }
}

impl Tokenizer for GraphemeTokenizer {
    fn tokenize(&self, text: &str) -> Vec<String> {
        let graphemes: Vec<String> = text
            .graphemes(self.extended)
            .map(|s| s.to_string())
            .collect();

        post_process(graphemes, &self.config)
    }
}
@@ -0,0 +1,25 @@ data/ext/tokenkit/src/tokenizer/keyword.rs

use super::{post_process, Tokenizer};
use crate::config::TokenizerConfig;

pub struct KeywordTokenizer {
    config: TokenizerConfig,
}

impl KeywordTokenizer {
    pub fn new(config: TokenizerConfig) -> Self {
        Self { config }
    }
}

impl Tokenizer for KeywordTokenizer {
    fn tokenize(&self, text: &str) -> Vec<String> {
        let trimmed = text.trim();
        if trimmed.is_empty() {
            return vec![];
        }

        let tokens = vec![trimmed.to_string()];
        post_process(tokens, &self.config)
    }
}
@@ -0,0 +1,41 @@ data/ext/tokenkit/src/tokenizer/letter.rs

use super::{apply_preserve_patterns, post_process, BaseTokenizerFields, Tokenizer};
use crate::config::TokenizerConfig;

pub struct LetterTokenizer {
    base: BaseTokenizerFields,
}

impl LetterTokenizer {
    pub fn new(config: TokenizerConfig) -> Self {
        Self {
            base: BaseTokenizerFields::new(config),
        }
    }
}

impl Tokenizer for LetterTokenizer {
    fn tokenize(&self, text: &str) -> Vec<String> {
        let mut tokens = Vec::new();
        let mut current_token = String::new();

        for ch in text.chars() {
            if ch.is_alphabetic() {
                current_token.push(ch);
            } else if !current_token.is_empty() {
                tokens.push(current_token.clone());
                current_token.clear();
            }
        }

        if !current_token.is_empty() {
            tokens.push(current_token);
        }

        if self.base.has_preserve_patterns() {
            apply_preserve_patterns(tokens, self.base.preserve_patterns(), text, &self.base.config)
        } else {
            post_process(tokens, &self.base.config)
        }
    }
}
@@ -0,0 +1,51 @@ data/ext/tokenkit/src/tokenizer/lowercase.rs

use super::{apply_preserve_patterns, BaseTokenizerFields, Tokenizer};
use crate::config::TokenizerConfig;

pub struct LowercaseTokenizer {
    base: BaseTokenizerFields,
}

impl LowercaseTokenizer {
    pub fn new(config: TokenizerConfig) -> Self {
        Self {
            base: BaseTokenizerFields::new(config),
        }
    }
}

impl Tokenizer for LowercaseTokenizer {
    fn tokenize(&self, text: &str) -> Vec<String> {
        let mut tokens = Vec::new();
        let mut current_token = String::new();

        for ch in text.chars() {
            if ch.is_alphabetic() {
                for lowercase_ch in ch.to_lowercase() {
                    current_token.push(lowercase_ch);
                }
            } else if !current_token.is_empty() {
                tokens.push(current_token.clone());
                current_token.clear();
            }
        }

        if !current_token.is_empty() {
            tokens.push(current_token);
        }

        // Lowercase tokenizer always lowercases, ignore config.lowercase
        // Note: remove_punctuation has no effect since we already split on non-alphabetic
        // characters, but we keep it for consistency with the Tokenizer interface

        if self.base.has_preserve_patterns() {
            // For preserve_patterns, we need to pass a modified config that doesn't lowercase
            // because apply_preserve_patterns handles lowercasing for non-preserved tokens
            let mut modified_config = self.base.config.clone();
            modified_config.lowercase = true; // Force lowercase for non-preserved tokens
            apply_preserve_patterns(tokens, self.base.preserve_patterns(), text, &modified_config)
        } else {
            tokens
        }
    }
}
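Editor's note: to make the "always lowercases" note above concrete, an illustrative sketch (not part of the package) follows. Without preserve_patterns the tokenizer returns the lowercased alphabetic runs directly, regardless of config.lowercase; TokenizerConfig's fields are assumed visible from a test placed alongside these modules.

// Sketch only: non-alphabetic characters act as separators and every kept
// character is lowercased, even with config.lowercase set to false.
#[test]
fn lowercase_tokenizer_ignores_config_lowercase() {
    let config = TokenizerConfig {
        strategy: TokenizerStrategy::Lowercase,
        lowercase: false,
        remove_punctuation: false,
        preserve_patterns: Vec::new(),
    };
    let tokenizer = LowercaseTokenizer::new(config);
    assert_eq!(tokenizer.tokenize("Hello, World! 42"), vec!["hello", "world"]);
}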