tokenizers 0.5.3 → 0.5.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,19 +22,32 @@ use magnus::{function, method, prelude::*, value::Lazy, Error, RModule, Ruby};
22
22
 
23
23
  type RbResult<T> = Result<T, Error>;
24
24
 
25
- static TOKENIZERS: Lazy<RModule> = Lazy::new(|ruby| ruby.class_object().const_get("Tokenizers").unwrap());
25
+ static TOKENIZERS: Lazy<RModule> =
26
+ Lazy::new(|ruby| ruby.class_object().const_get("Tokenizers").unwrap());
26
27
 
27
- static DECODERS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Decoders").unwrap());
28
+ static DECODERS: Lazy<RModule> =
29
+ Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Decoders").unwrap());
28
30
 
29
- static MODELS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Models").unwrap());
31
+ static MODELS: Lazy<RModule> =
32
+ Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Models").unwrap());
30
33
 
31
- static NORMALIZERS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Normalizers").unwrap());
34
+ static NORMALIZERS: Lazy<RModule> = Lazy::new(|ruby| {
35
+ ruby.get_inner(&TOKENIZERS)
36
+ .const_get("Normalizers")
37
+ .unwrap()
38
+ });
32
39
 
33
- static PRE_TOKENIZERS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("PreTokenizers").unwrap());
40
+ static PRE_TOKENIZERS: Lazy<RModule> = Lazy::new(|ruby| {
41
+ ruby.get_inner(&TOKENIZERS)
42
+ .const_get("PreTokenizers")
43
+ .unwrap()
44
+ });
34
45
 
35
- static PROCESSORS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Processors").unwrap());
46
+ static PROCESSORS: Lazy<RModule> =
47
+ Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Processors").unwrap());
36
48
 
37
- static TRAINERS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Trainers").unwrap());
49
+ static TRAINERS: Lazy<RModule> =
50
+ Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Trainers").unwrap());
38
51
 
39
52
  #[magnus::init]
40
53
  fn init(ruby: &Ruby) -> RbResult<()> {
@@ -56,12 +69,15 @@ fn init(ruby: &Ruby) -> RbResult<()> {
56
69
  class.define_method("_decode", method!(RbTokenizer::decode, 2))?;
57
70
  class.define_method("_decode_batch", method!(RbTokenizer::decode_batch, 2))?;
58
71
  class.define_method("model", method!(RbTokenizer::get_model, 0))?;
59
- class.define_method("model=", method!(RbTokenizer::set_model,1))?;
72
+ class.define_method("model=", method!(RbTokenizer::set_model, 1))?;
60
73
  class.define_method("decoder", method!(RbTokenizer::get_decoder, 0))?;
61
74
  class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
62
75
  class.define_method("pre_tokenizer", method!(RbTokenizer::get_pre_tokenizer, 0))?;
63
76
  class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
64
- class.define_method("post_processor", method!(RbTokenizer::get_post_processor, 0))?;
77
+ class.define_method(
78
+ "post_processor",
79
+ method!(RbTokenizer::get_post_processor, 0),
80
+ )?;
65
81
  class.define_method(
66
82
  "post_processor=",
67
83
  method!(RbTokenizer::set_post_processor, 1),
@@ -73,13 +89,22 @@ fn init(ruby: &Ruby) -> RbResult<()> {
73
89
  class.define_method("_enable_padding", method!(RbTokenizer::enable_padding, 1))?;
74
90
  class.define_method("padding", method!(RbTokenizer::padding, 0))?;
75
91
  class.define_method("no_padding", method!(RbTokenizer::no_padding, 0))?;
76
- class.define_method("_enable_truncation", method!(RbTokenizer::enable_truncation, 2))?;
92
+ class.define_method(
93
+ "_enable_truncation",
94
+ method!(RbTokenizer::enable_truncation, 2),
95
+ )?;
77
96
  class.define_method("truncation", method!(RbTokenizer::truncation, 0))?;
78
97
  class.define_method("no_truncation", method!(RbTokenizer::no_truncation, 0))?;
79
- class.define_method("num_special_tokens_to_add", method!(RbTokenizer::num_special_tokens_to_add, 1))?;
98
+ class.define_method(
99
+ "num_special_tokens_to_add",
100
+ method!(RbTokenizer::num_special_tokens_to_add, 1),
101
+ )?;
80
102
  class.define_method("_vocab", method!(RbTokenizer::vocab, 1))?;
81
103
  class.define_method("_vocab_size", method!(RbTokenizer::vocab_size, 1))?;
82
- class.define_method("added_tokens_decoder", method!(RbTokenizer::get_added_tokens_decoder, 0))?;
104
+ class.define_method(
105
+ "added_tokens_decoder",
106
+ method!(RbTokenizer::get_added_tokens_decoder, 0),
107
+ )?;
83
108
  class.define_method("_to_s", method!(RbTokenizer::to_str, 1))?;
84
109
 
85
110
  let class = module.define_class("Encoding", ruby.class_object())?;
@@ -5,18 +5,19 @@ use std::sync::{Arc, RwLock};
5
5
  use crate::trainers::RbTrainer;
6
6
  use magnus::prelude::*;
7
7
  use magnus::{
8
- data_type_builder, exception, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Error, Module, Object,
9
- RClass, RHash, RModule, Ruby, Symbol, TryConvert, TypedData, Value,
8
+ data_type_builder, exception, function, method, value::Lazy, Class, DataType,
9
+ DataTypeFunctions, Error, Module, Object, RClass, RHash, RModule, Ruby, Symbol, TryConvert,
10
+ TypedData, Value,
10
11
  };
11
12
  use serde::{Deserialize, Serialize};
12
13
  use tk::models::bpe::{BpeBuilder, Merges, Vocab, BPE};
13
- use tk::models::ModelWrapper;
14
14
  use tk::models::unigram::Unigram;
15
15
  use tk::models::wordlevel::WordLevel;
16
16
  use tk::models::wordpiece::{WordPiece, WordPieceBuilder};
17
+ use tk::models::ModelWrapper;
17
18
  use tk::{Model, Token};
18
19
 
19
- use super::{MODELS, RbError, RbResult};
20
+ use super::{RbError, RbResult, MODELS};
20
21
 
21
22
  #[derive(DataTypeFunctions, Clone, Serialize, Deserialize)]
22
23
  pub struct RbModel {
@@ -187,7 +188,12 @@ impl RbModel {
187
188
  }
188
189
 
189
190
  pub fn bpe_set_continuing_subword_prefix(&self, continuing_subword_prefix: Option<String>) {
190
- setter!(self, BPE, continuing_subword_prefix, continuing_subword_prefix);
191
+ setter!(
192
+ self,
193
+ BPE,
194
+ continuing_subword_prefix,
195
+ continuing_subword_prefix
196
+ );
191
197
  }
192
198
 
193
199
  pub fn bpe_end_of_word_suffix(&self) -> Option<String> {
@@ -219,7 +225,12 @@ impl RbModel {
219
225
  }
220
226
 
221
227
  pub fn word_piece_set_continuing_subword_prefix(&self, continuing_subword_prefix: String) {
222
- setter!(self, WordPiece, continuing_subword_prefix, continuing_subword_prefix);
228
+ setter!(
229
+ self,
230
+ WordPiece,
231
+ continuing_subword_prefix,
232
+ continuing_subword_prefix
233
+ );
223
234
  }
224
235
 
225
236
  pub fn word_piece_max_input_chars_per_word(&self) -> usize {
@@ -227,21 +238,34 @@ impl RbModel {
227
238
  }
228
239
 
229
240
  pub fn word_piece_set_max_input_chars_per_word(&self, max_input_chars_per_word: usize) {
230
- setter!(self, WordPiece, max_input_chars_per_word, max_input_chars_per_word);
241
+ setter!(
242
+ self,
243
+ WordPiece,
244
+ max_input_chars_per_word,
245
+ max_input_chars_per_word
246
+ );
231
247
  }
232
248
  }
233
249
 
234
250
  pub struct RbUnigram {}
235
251
 
236
252
  impl RbUnigram {
237
- fn new(vocab: Option<Vec<(String, f64)>>, unk_id: Option<usize>, byte_fallback: Option<bool>) -> RbResult<RbModel> {
253
+ fn new(
254
+ vocab: Option<Vec<(String, f64)>>,
255
+ unk_id: Option<usize>,
256
+ byte_fallback: Option<bool>,
257
+ ) -> RbResult<RbModel> {
238
258
  match (vocab, unk_id, byte_fallback) {
239
259
  (Some(vocab), unk_id, byte_fallback) => {
240
- let model = Unigram::from(vocab, unk_id, byte_fallback.unwrap_or(false)).map_err(RbError::from)?;
260
+ let model = Unigram::from(vocab, unk_id, byte_fallback.unwrap_or(false))
261
+ .map_err(RbError::from)?;
241
262
  Ok(model.into())
242
263
  }
243
264
  (None, None, _) => Ok(Unigram::default().into()),
244
- _ => Err(Error::new(exception::arg_error(), "`vocab` and `unk_id` must be both specified")),
265
+ _ => Err(Error::new(
266
+ exception::arg_error(),
267
+ "`vocab` and `unk_id` must be both specified",
268
+ )),
245
269
  }
246
270
  }
247
271
  }
@@ -249,7 +273,10 @@ impl RbUnigram {
249
273
  pub struct RbWordLevel {}
250
274
 
251
275
  impl RbWordLevel {
252
- pub fn new(vocab: Option<HashMap<String, u32>>, unk_token: Option<String>) -> RbResult<RbModel> {
276
+ pub fn new(
277
+ vocab: Option<HashMap<String, u32>>,
278
+ unk_token: Option<String>,
279
+ ) -> RbResult<RbModel> {
253
280
  let mut builder = WordLevel::builder();
254
281
  if let Some(vocab) = vocab {
255
282
  builder = builder.vocab(vocab);
@@ -316,15 +343,16 @@ impl RbWordPiece {
316
343
  unsafe impl TypedData for RbModel {
317
344
  fn class(ruby: &Ruby) -> RClass {
318
345
  static CLASS: Lazy<RClass> = Lazy::new(|ruby| {
319
- let class: RClass = ruby.get_inner(&MODELS).const_get("Model").unwrap();
320
- class.undef_default_alloc_func();
321
- class
346
+ let class: RClass = ruby.get_inner(&MODELS).const_get("Model").unwrap();
347
+ class.undef_default_alloc_func();
348
+ class
322
349
  });
323
350
  ruby.get_inner(&CLASS)
324
351
  }
325
352
 
326
353
  fn data_type() -> &'static DataType {
327
- static DATA_TYPE: DataType = data_type_builder!(RbModel, "Tokenizers::Models::Model").build();
354
+ static DATA_TYPE: DataType =
355
+ data_type_builder!(RbModel, "Tokenizers::Models::Model").build();
328
356
  &DATA_TYPE
329
357
  }
330
358
 
@@ -368,10 +396,22 @@ pub fn init_models(ruby: &Ruby, module: &RModule) -> RbResult<()> {
368
396
  class.define_method("dropout=", method!(RbModel::bpe_set_dropout, 1))?;
369
397
  class.define_method("unk_token", method!(RbModel::bpe_unk_token, 0))?;
370
398
  class.define_method("unk_token=", method!(RbModel::bpe_set_unk_token, 1))?;
371
- class.define_method("continuing_subword_prefix", method!(RbModel::bpe_continuing_subword_prefix, 0))?;
372
- class.define_method("continuing_subword_prefix=", method!(RbModel::bpe_set_continuing_subword_prefix, 1))?;
373
- class.define_method("end_of_word_suffix", method!(RbModel::bpe_end_of_word_suffix, 0))?;
374
- class.define_method("end_of_word_suffix=", method!(RbModel::bpe_set_end_of_word_suffix, 1))?;
399
+ class.define_method(
400
+ "continuing_subword_prefix",
401
+ method!(RbModel::bpe_continuing_subword_prefix, 0),
402
+ )?;
403
+ class.define_method(
404
+ "continuing_subword_prefix=",
405
+ method!(RbModel::bpe_set_continuing_subword_prefix, 1),
406
+ )?;
407
+ class.define_method(
408
+ "end_of_word_suffix",
409
+ method!(RbModel::bpe_end_of_word_suffix, 0),
410
+ )?;
411
+ class.define_method(
412
+ "end_of_word_suffix=",
413
+ method!(RbModel::bpe_set_end_of_word_suffix, 1),
414
+ )?;
375
415
  class.define_method("fuse_unk", method!(RbModel::bpe_fuse_unk, 0))?;
376
416
  class.define_method("fuse_unk=", method!(RbModel::bpe_set_fuse_unk, 1))?;
377
417
  class.define_method("byte_fallback", method!(RbModel::bpe_byte_fallback, 0))?;
@@ -392,10 +432,22 @@ pub fn init_models(ruby: &Ruby, module: &RModule) -> RbResult<()> {
392
432
  class.define_singleton_method("_from_file", function!(RbWordPiece::from_file, 2))?;
393
433
  class.define_method("unk_token", method!(RbModel::word_piece_unk_token, 0))?;
394
434
  class.define_method("unk_token=", method!(RbModel::word_piece_set_unk_token, 1))?;
395
- class.define_method("continuing_subword_prefix", method!(RbModel::word_piece_continuing_subword_prefix, 0))?;
396
- class.define_method("continuing_subword_prefix=", method!(RbModel::word_piece_set_continuing_subword_prefix, 1))?;
397
- class.define_method("max_input_chars_per_word", method!(RbModel::word_piece_max_input_chars_per_word, 0))?;
398
- class.define_method("max_input_chars_per_word=", method!(RbModel::word_piece_set_max_input_chars_per_word, 1))?;
435
+ class.define_method(
436
+ "continuing_subword_prefix",
437
+ method!(RbModel::word_piece_continuing_subword_prefix, 0),
438
+ )?;
439
+ class.define_method(
440
+ "continuing_subword_prefix=",
441
+ method!(RbModel::word_piece_set_continuing_subword_prefix, 1),
442
+ )?;
443
+ class.define_method(
444
+ "max_input_chars_per_word",
445
+ method!(RbModel::word_piece_max_input_chars_per_word, 0),
446
+ )?;
447
+ class.define_method(
448
+ "max_input_chars_per_word=",
449
+ method!(RbModel::word_piece_set_max_input_chars_per_word, 1),
450
+ )?;
399
451
 
400
452
  Ok(())
401
453
  }
@@ -1,19 +1,19 @@
1
1
  use std::sync::{Arc, RwLock};
2
2
 
3
3
  use magnus::{
4
- data_type_builder, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Module, Object, RArray, RClass, RModule,
5
- Ruby, TryConvert, TypedData,
4
+ data_type_builder, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Module,
5
+ Object, RArray, RClass, RModule, Ruby, TryConvert, TypedData,
6
6
  };
7
7
  use serde::ser::SerializeStruct;
8
8
  use serde::{Deserialize, Serialize, Serializer};
9
9
  use tk::normalizers::{
10
- BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Replace, Prepend, Strip, StripAccents,
11
- NFC, NFD, NFKC, NFKD,
10
+ BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Prepend, Replace, Strip,
11
+ StripAccents, NFC, NFD, NFKC, NFKD,
12
12
  };
13
13
  use tk::{NormalizedString, Normalizer};
14
14
 
15
15
  use super::utils::*;
16
- use super::{NORMALIZERS, RbError, RbResult};
16
+ use super::{RbError, RbResult, NORMALIZERS};
17
17
 
18
18
  #[derive(DataTypeFunctions, Clone, Serialize, Deserialize)]
19
19
  pub struct RbNormalizer {
@@ -28,7 +28,9 @@ impl RbNormalizer {
28
28
 
29
29
  pub fn normalize_str(&self, sequence: String) -> RbResult<String> {
30
30
  let mut normalized = NormalizedString::from(sequence);
31
- self.normalizer.normalize(&mut normalized).map_err(RbError::from)?;
31
+ self.normalizer
32
+ .normalize(&mut normalized)
33
+ .map_err(RbError::from)?;
32
34
  Ok(normalized.get().to_owned())
33
35
  }
34
36
  }
@@ -43,7 +45,8 @@ macro_rules! getter {
43
45
  ($self: ident, $variant: ident, $name: ident) => {{
44
46
  if let RbNormalizerTypeWrapper::Single(ref norm) = &$self.normalizer {
45
47
  let wrapper = norm.read().unwrap();
46
- if let RbNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (*wrapper).clone() {
48
+ if let RbNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (*wrapper).clone()
49
+ {
47
50
  o.$name
48
51
  } else {
49
52
  unreachable!()
@@ -66,7 +69,6 @@ macro_rules! setter {
66
69
  }
67
70
 
68
71
  impl RbNormalizer {
69
-
70
72
  fn bert_clean_text(&self) -> bool {
71
73
  getter!(self, BertNormalizer, clean_text)
72
74
  }
@@ -101,7 +103,7 @@ impl RbNormalizer {
101
103
  }
102
104
 
103
105
  fn bert_set_lowercase(&self, lowercase: bool) {
104
- setter!(self, BertNormalizer, lowercase, lowercase)
106
+ setter!(self, BertNormalizer, lowercase, lowercase);
105
107
  }
106
108
 
107
109
  fn prepend_prepend(&self) -> String {
@@ -109,7 +111,7 @@ impl RbNormalizer {
109
111
  }
110
112
 
111
113
  fn prepend_set_prepend(&self, prepend: String) {
112
- setter!(self, Prepend, prepend, prepend)
114
+ setter!(self, Prepend, prepend, prepend);
113
115
  }
114
116
 
115
117
  fn strip_left(&self) -> bool {
@@ -117,7 +119,7 @@ impl RbNormalizer {
117
119
  }
118
120
 
119
121
  fn strip_set_left(&self, left: bool) {
120
- setter!(self, StripNormalizer, strip_left, left)
122
+ setter!(self, StripNormalizer, strip_left, left);
121
123
  }
122
124
 
123
125
  fn strip_right(&self) -> bool {
@@ -125,14 +127,19 @@ impl RbNormalizer {
125
127
  }
126
128
 
127
129
  fn strip_set_right(&self, right: bool) {
128
- setter!(self, StripNormalizer, strip_right, right)
130
+ setter!(self, StripNormalizer, strip_right, right);
129
131
  }
130
132
  }
131
133
 
132
134
  pub struct RbBertNormalizer {}
133
135
 
134
136
  impl RbBertNormalizer {
135
- pub fn new(clean_text: bool, handle_chinese_chars: bool, strip_accents: Option<bool>, lowercase: bool) -> RbNormalizer {
137
+ pub fn new(
138
+ clean_text: bool,
139
+ handle_chinese_chars: bool,
140
+ strip_accents: Option<bool>,
141
+ lowercase: bool,
142
+ ) -> RbNormalizer {
136
143
  BertNormalizer::new(clean_text, handle_chinese_chars, strip_accents, lowercase).into()
137
144
  }
138
145
  }
@@ -185,11 +192,28 @@ impl RbNmt {
185
192
  }
186
193
  }
187
194
 
195
+ pub struct RbPrecompiled {}
196
+
197
+ impl RbPrecompiled {
198
+ pub fn new(precompiled_charsmap: Vec<u8>) -> RbResult<RbNormalizer> {
199
+ Precompiled::from(&precompiled_charsmap)
200
+ .map_err(|e| {
201
+ RbError::new_err(format!(
202
+ "Error while attempting to build Precompiled normalizer: {}",
203
+ e
204
+ ))
205
+ })
206
+ .map(|v| v.into())
207
+ }
208
+ }
209
+
188
210
  pub struct RbReplace {}
189
211
 
190
212
  impl RbReplace {
191
213
  pub fn new(pattern: RbPattern, content: String) -> RbResult<RbNormalizer> {
192
- Replace::new(pattern, content).map(|v| v.into()).map_err(RbError::from)
214
+ Replace::new(pattern, content)
215
+ .map(|v| v.into())
216
+ .map_err(RbError::from)
193
217
  }
194
218
  }
195
219
 
@@ -222,14 +246,16 @@ pub struct RbSequence {}
222
246
  impl RbSequence {
223
247
  fn new(normalizers: RArray) -> RbResult<RbNormalizer> {
224
248
  let mut sequence = Vec::with_capacity(normalizers.len());
225
- for n in normalizers.into_iter() {
249
+ for n in normalizers {
226
250
  let normalizer: &RbNormalizer = TryConvert::try_convert(n)?;
227
251
  match &normalizer.normalizer {
228
252
  RbNormalizerTypeWrapper::Sequence(inner) => sequence.extend(inner.iter().cloned()),
229
253
  RbNormalizerTypeWrapper::Single(inner) => sequence.push(inner.clone()),
230
254
  }
231
255
  }
232
- Ok(RbNormalizer::new(RbNormalizerTypeWrapper::Sequence(sequence)))
256
+ Ok(RbNormalizer::new(RbNormalizerTypeWrapper::Sequence(
257
+ sequence,
258
+ )))
233
259
  }
234
260
  }
235
261
 
@@ -328,7 +354,10 @@ impl Normalizer for RbNormalizerWrapper {
328
354
  unsafe impl TypedData for RbNormalizer {
329
355
  fn class(ruby: &Ruby) -> RClass {
330
356
  static CLASS: Lazy<RClass> = Lazy::new(|ruby| {
331
- let class: RClass = ruby.get_inner(&NORMALIZERS).const_get("Normalizer").unwrap();
357
+ let class: RClass = ruby
358
+ .get_inner(&NORMALIZERS)
359
+ .const_get("Normalizer")
360
+ .unwrap();
332
361
  class.undef_default_alloc_func();
333
362
  class
334
363
  });
@@ -336,7 +365,8 @@ unsafe impl TypedData for RbNormalizer {
336
365
  }
337
366
 
338
367
  fn data_type() -> &'static DataType {
339
- static DATA_TYPE: DataType = data_type_builder!(RbNormalizer, "Tokenizers::Normalizers::Normalizer").build();
368
+ static DATA_TYPE: DataType =
369
+ data_type_builder!(RbNormalizer, "Tokenizers::Normalizers::Normalizer").build();
340
370
  &DATA_TYPE
341
371
  }
342
372
 
@@ -347,7 +377,10 @@ unsafe impl TypedData for RbNormalizer {
347
377
  class
348
378
  });
349
379
  static BERT_NORMALIZER: Lazy<RClass> = Lazy::new(|ruby| {
350
- let class: RClass = ruby.get_inner(&NORMALIZERS).const_get("BertNormalizer").unwrap();
380
+ let class: RClass = ruby
381
+ .get_inner(&NORMALIZERS)
382
+ .const_get("BertNormalizer")
383
+ .unwrap();
351
384
  class.undef_default_alloc_func();
352
385
  class
353
386
  });
@@ -381,6 +414,14 @@ unsafe impl TypedData for RbNormalizer {
381
414
  class.undef_default_alloc_func();
382
415
  class
383
416
  });
417
+ static PRECOMPILED: Lazy<RClass> = Lazy::new(|ruby| {
418
+ let class: RClass = ruby
419
+ .get_inner(&NORMALIZERS)
420
+ .const_get("Precompiled")
421
+ .unwrap();
422
+ class.undef_default_alloc_func();
423
+ class
424
+ });
384
425
  static REPLACE: Lazy<RClass> = Lazy::new(|ruby| {
385
426
  let class: RClass = ruby.get_inner(&NORMALIZERS).const_get("Replace").unwrap();
386
427
  class.undef_default_alloc_func();
@@ -397,7 +438,10 @@ unsafe impl TypedData for RbNormalizer {
397
438
  class
398
439
  });
399
440
  static STRIP_ACCENTS: Lazy<RClass> = Lazy::new(|ruby| {
400
- let class: RClass = ruby.get_inner(&NORMALIZERS).const_get("StripAccents").unwrap();
441
+ let class: RClass = ruby
442
+ .get_inner(&NORMALIZERS)
443
+ .const_get("StripAccents")
444
+ .unwrap();
401
445
  class.undef_default_alloc_func();
402
446
  class
403
447
  });
@@ -412,6 +456,7 @@ unsafe impl TypedData for RbNormalizer {
412
456
  NormalizerWrapper::NFKC(_) => ruby.get_inner(&NFKC),
413
457
  NormalizerWrapper::NFKD(_) => ruby.get_inner(&NFKD),
414
458
  NormalizerWrapper::Nmt(_) => ruby.get_inner(&NMT),
459
+ NormalizerWrapper::Precompiled(_) => ruby.get_inner(&PRECOMPILED),
415
460
  NormalizerWrapper::Replace(_) => ruby.get_inner(&REPLACE),
416
461
  NormalizerWrapper::Prepend(_) => ruby.get_inner(&PREPEND),
417
462
  NormalizerWrapper::StripNormalizer(_) => ruby.get_inner(&STRIP),
@@ -434,10 +479,22 @@ pub fn init_normalizers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
434
479
  class.define_singleton_method("_new", function!(RbBertNormalizer::new, 4))?;
435
480
  class.define_method("clean_text", method!(RbNormalizer::bert_clean_text, 0))?;
436
481
  class.define_method("clean_text=", method!(RbNormalizer::bert_set_clean_text, 1))?;
437
- class.define_method("handle_chinese_chars", method!(RbNormalizer::bert_handle_chinese_chars, 0))?;
438
- class.define_method("handle_chinese_chars=", method!(RbNormalizer::bert_set_handle_chinese_chars, 1))?;
439
- class.define_method("strip_accents", method!(RbNormalizer::bert_strip_accents, 0))?;
440
- class.define_method("strip_accents=", method!(RbNormalizer::bert_set_strip_accents, 1))?;
482
+ class.define_method(
483
+ "handle_chinese_chars",
484
+ method!(RbNormalizer::bert_handle_chinese_chars, 0),
485
+ )?;
486
+ class.define_method(
487
+ "handle_chinese_chars=",
488
+ method!(RbNormalizer::bert_set_handle_chinese_chars, 1),
489
+ )?;
490
+ class.define_method(
491
+ "strip_accents",
492
+ method!(RbNormalizer::bert_strip_accents, 0),
493
+ )?;
494
+ class.define_method(
495
+ "strip_accents=",
496
+ method!(RbNormalizer::bert_set_strip_accents, 1),
497
+ )?;
441
498
  class.define_method("lowercase", method!(RbNormalizer::bert_lowercase, 0))?;
442
499
  class.define_method("lowercase=", method!(RbNormalizer::bert_set_lowercase, 1))?;
443
500
 
@@ -459,6 +516,9 @@ pub fn init_normalizers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
459
516
  let class = module.define_class("Nmt", normalizer)?;
460
517
  class.define_singleton_method("new", function!(RbNmt::new, 0))?;
461
518
 
519
+ let class = module.define_class("Precompiled", normalizer)?;
520
+ class.define_singleton_method("new", function!(RbPrecompiled::new, 1))?;
521
+
462
522
  let class = module.define_class("Replace", normalizer)?;
463
523
  class.define_singleton_method("new", function!(RbReplace::new, 2))?;
464
524