tokenizers 0.5.3 → 0.5.4

Sign up to get free protection for your applications and to get access to all the features.
@@ -22,19 +22,32 @@ use magnus::{function, method, prelude::*, value::Lazy, Error, RModule, Ruby};
22
22
 
23
23
  type RbResult<T> = Result<T, Error>;
24
24
 
25
- static TOKENIZERS: Lazy<RModule> = Lazy::new(|ruby| ruby.class_object().const_get("Tokenizers").unwrap());
25
+ static TOKENIZERS: Lazy<RModule> =
26
+ Lazy::new(|ruby| ruby.class_object().const_get("Tokenizers").unwrap());
26
27
 
27
- static DECODERS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Decoders").unwrap());
28
+ static DECODERS: Lazy<RModule> =
29
+ Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Decoders").unwrap());
28
30
 
29
- static MODELS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Models").unwrap());
31
+ static MODELS: Lazy<RModule> =
32
+ Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Models").unwrap());
30
33
 
31
- static NORMALIZERS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Normalizers").unwrap());
34
+ static NORMALIZERS: Lazy<RModule> = Lazy::new(|ruby| {
35
+ ruby.get_inner(&TOKENIZERS)
36
+ .const_get("Normalizers")
37
+ .unwrap()
38
+ });
32
39
 
33
- static PRE_TOKENIZERS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("PreTokenizers").unwrap());
40
+ static PRE_TOKENIZERS: Lazy<RModule> = Lazy::new(|ruby| {
41
+ ruby.get_inner(&TOKENIZERS)
42
+ .const_get("PreTokenizers")
43
+ .unwrap()
44
+ });
34
45
 
35
- static PROCESSORS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Processors").unwrap());
46
+ static PROCESSORS: Lazy<RModule> =
47
+ Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Processors").unwrap());
36
48
 
37
- static TRAINERS: Lazy<RModule> = Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Trainers").unwrap());
49
+ static TRAINERS: Lazy<RModule> =
50
+ Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Trainers").unwrap());
38
51
 
39
52
  #[magnus::init]
40
53
  fn init(ruby: &Ruby) -> RbResult<()> {
@@ -56,12 +69,15 @@ fn init(ruby: &Ruby) -> RbResult<()> {
56
69
  class.define_method("_decode", method!(RbTokenizer::decode, 2))?;
57
70
  class.define_method("_decode_batch", method!(RbTokenizer::decode_batch, 2))?;
58
71
  class.define_method("model", method!(RbTokenizer::get_model, 0))?;
59
- class.define_method("model=", method!(RbTokenizer::set_model,1))?;
72
+ class.define_method("model=", method!(RbTokenizer::set_model, 1))?;
60
73
  class.define_method("decoder", method!(RbTokenizer::get_decoder, 0))?;
61
74
  class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
62
75
  class.define_method("pre_tokenizer", method!(RbTokenizer::get_pre_tokenizer, 0))?;
63
76
  class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
64
- class.define_method("post_processor", method!(RbTokenizer::get_post_processor, 0))?;
77
+ class.define_method(
78
+ "post_processor",
79
+ method!(RbTokenizer::get_post_processor, 0),
80
+ )?;
65
81
  class.define_method(
66
82
  "post_processor=",
67
83
  method!(RbTokenizer::set_post_processor, 1),
@@ -73,13 +89,22 @@ fn init(ruby: &Ruby) -> RbResult<()> {
73
89
  class.define_method("_enable_padding", method!(RbTokenizer::enable_padding, 1))?;
74
90
  class.define_method("padding", method!(RbTokenizer::padding, 0))?;
75
91
  class.define_method("no_padding", method!(RbTokenizer::no_padding, 0))?;
76
- class.define_method("_enable_truncation", method!(RbTokenizer::enable_truncation, 2))?;
92
+ class.define_method(
93
+ "_enable_truncation",
94
+ method!(RbTokenizer::enable_truncation, 2),
95
+ )?;
77
96
  class.define_method("truncation", method!(RbTokenizer::truncation, 0))?;
78
97
  class.define_method("no_truncation", method!(RbTokenizer::no_truncation, 0))?;
79
- class.define_method("num_special_tokens_to_add", method!(RbTokenizer::num_special_tokens_to_add, 1))?;
98
+ class.define_method(
99
+ "num_special_tokens_to_add",
100
+ method!(RbTokenizer::num_special_tokens_to_add, 1),
101
+ )?;
80
102
  class.define_method("_vocab", method!(RbTokenizer::vocab, 1))?;
81
103
  class.define_method("_vocab_size", method!(RbTokenizer::vocab_size, 1))?;
82
- class.define_method("added_tokens_decoder", method!(RbTokenizer::get_added_tokens_decoder, 0))?;
104
+ class.define_method(
105
+ "added_tokens_decoder",
106
+ method!(RbTokenizer::get_added_tokens_decoder, 0),
107
+ )?;
83
108
  class.define_method("_to_s", method!(RbTokenizer::to_str, 1))?;
84
109
 
85
110
  let class = module.define_class("Encoding", ruby.class_object())?;
@@ -5,18 +5,19 @@ use std::sync::{Arc, RwLock};
5
5
  use crate::trainers::RbTrainer;
6
6
  use magnus::prelude::*;
7
7
  use magnus::{
8
- data_type_builder, exception, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Error, Module, Object,
9
- RClass, RHash, RModule, Ruby, Symbol, TryConvert, TypedData, Value,
8
+ data_type_builder, exception, function, method, value::Lazy, Class, DataType,
9
+ DataTypeFunctions, Error, Module, Object, RClass, RHash, RModule, Ruby, Symbol, TryConvert,
10
+ TypedData, Value,
10
11
  };
11
12
  use serde::{Deserialize, Serialize};
12
13
  use tk::models::bpe::{BpeBuilder, Merges, Vocab, BPE};
13
- use tk::models::ModelWrapper;
14
14
  use tk::models::unigram::Unigram;
15
15
  use tk::models::wordlevel::WordLevel;
16
16
  use tk::models::wordpiece::{WordPiece, WordPieceBuilder};
17
+ use tk::models::ModelWrapper;
17
18
  use tk::{Model, Token};
18
19
 
19
- use super::{MODELS, RbError, RbResult};
20
+ use super::{RbError, RbResult, MODELS};
20
21
 
21
22
  #[derive(DataTypeFunctions, Clone, Serialize, Deserialize)]
22
23
  pub struct RbModel {
@@ -187,7 +188,12 @@ impl RbModel {
187
188
  }
188
189
 
189
190
  pub fn bpe_set_continuing_subword_prefix(&self, continuing_subword_prefix: Option<String>) {
190
- setter!(self, BPE, continuing_subword_prefix, continuing_subword_prefix);
191
+ setter!(
192
+ self,
193
+ BPE,
194
+ continuing_subword_prefix,
195
+ continuing_subword_prefix
196
+ );
191
197
  }
192
198
 
193
199
  pub fn bpe_end_of_word_suffix(&self) -> Option<String> {
@@ -219,7 +225,12 @@ impl RbModel {
219
225
  }
220
226
 
221
227
  pub fn word_piece_set_continuing_subword_prefix(&self, continuing_subword_prefix: String) {
222
- setter!(self, WordPiece, continuing_subword_prefix, continuing_subword_prefix);
228
+ setter!(
229
+ self,
230
+ WordPiece,
231
+ continuing_subword_prefix,
232
+ continuing_subword_prefix
233
+ );
223
234
  }
224
235
 
225
236
  pub fn word_piece_max_input_chars_per_word(&self) -> usize {
@@ -227,21 +238,34 @@ impl RbModel {
227
238
  }
228
239
 
229
240
  pub fn word_piece_set_max_input_chars_per_word(&self, max_input_chars_per_word: usize) {
230
- setter!(self, WordPiece, max_input_chars_per_word, max_input_chars_per_word);
241
+ setter!(
242
+ self,
243
+ WordPiece,
244
+ max_input_chars_per_word,
245
+ max_input_chars_per_word
246
+ );
231
247
  }
232
248
  }
233
249
 
234
250
  pub struct RbUnigram {}
235
251
 
236
252
  impl RbUnigram {
237
- fn new(vocab: Option<Vec<(String, f64)>>, unk_id: Option<usize>, byte_fallback: Option<bool>) -> RbResult<RbModel> {
253
+ fn new(
254
+ vocab: Option<Vec<(String, f64)>>,
255
+ unk_id: Option<usize>,
256
+ byte_fallback: Option<bool>,
257
+ ) -> RbResult<RbModel> {
238
258
  match (vocab, unk_id, byte_fallback) {
239
259
  (Some(vocab), unk_id, byte_fallback) => {
240
- let model = Unigram::from(vocab, unk_id, byte_fallback.unwrap_or(false)).map_err(RbError::from)?;
260
+ let model = Unigram::from(vocab, unk_id, byte_fallback.unwrap_or(false))
261
+ .map_err(RbError::from)?;
241
262
  Ok(model.into())
242
263
  }
243
264
  (None, None, _) => Ok(Unigram::default().into()),
244
- _ => Err(Error::new(exception::arg_error(), "`vocab` and `unk_id` must be both specified")),
265
+ _ => Err(Error::new(
266
+ exception::arg_error(),
267
+ "`vocab` and `unk_id` must be both specified",
268
+ )),
245
269
  }
246
270
  }
247
271
  }
@@ -249,7 +273,10 @@ impl RbUnigram {
249
273
  pub struct RbWordLevel {}
250
274
 
251
275
  impl RbWordLevel {
252
- pub fn new(vocab: Option<HashMap<String, u32>>, unk_token: Option<String>) -> RbResult<RbModel> {
276
+ pub fn new(
277
+ vocab: Option<HashMap<String, u32>>,
278
+ unk_token: Option<String>,
279
+ ) -> RbResult<RbModel> {
253
280
  let mut builder = WordLevel::builder();
254
281
  if let Some(vocab) = vocab {
255
282
  builder = builder.vocab(vocab);
@@ -316,15 +343,16 @@ impl RbWordPiece {
316
343
  unsafe impl TypedData for RbModel {
317
344
  fn class(ruby: &Ruby) -> RClass {
318
345
  static CLASS: Lazy<RClass> = Lazy::new(|ruby| {
319
- let class: RClass = ruby.get_inner(&MODELS).const_get("Model").unwrap();
320
- class.undef_default_alloc_func();
321
- class
346
+ let class: RClass = ruby.get_inner(&MODELS).const_get("Model").unwrap();
347
+ class.undef_default_alloc_func();
348
+ class
322
349
  });
323
350
  ruby.get_inner(&CLASS)
324
351
  }
325
352
 
326
353
  fn data_type() -> &'static DataType {
327
- static DATA_TYPE: DataType = data_type_builder!(RbModel, "Tokenizers::Models::Model").build();
354
+ static DATA_TYPE: DataType =
355
+ data_type_builder!(RbModel, "Tokenizers::Models::Model").build();
328
356
  &DATA_TYPE
329
357
  }
330
358
 
@@ -368,10 +396,22 @@ pub fn init_models(ruby: &Ruby, module: &RModule) -> RbResult<()> {
368
396
  class.define_method("dropout=", method!(RbModel::bpe_set_dropout, 1))?;
369
397
  class.define_method("unk_token", method!(RbModel::bpe_unk_token, 0))?;
370
398
  class.define_method("unk_token=", method!(RbModel::bpe_set_unk_token, 1))?;
371
- class.define_method("continuing_subword_prefix", method!(RbModel::bpe_continuing_subword_prefix, 0))?;
372
- class.define_method("continuing_subword_prefix=", method!(RbModel::bpe_set_continuing_subword_prefix, 1))?;
373
- class.define_method("end_of_word_suffix", method!(RbModel::bpe_end_of_word_suffix, 0))?;
374
- class.define_method("end_of_word_suffix=", method!(RbModel::bpe_set_end_of_word_suffix, 1))?;
399
+ class.define_method(
400
+ "continuing_subword_prefix",
401
+ method!(RbModel::bpe_continuing_subword_prefix, 0),
402
+ )?;
403
+ class.define_method(
404
+ "continuing_subword_prefix=",
405
+ method!(RbModel::bpe_set_continuing_subword_prefix, 1),
406
+ )?;
407
+ class.define_method(
408
+ "end_of_word_suffix",
409
+ method!(RbModel::bpe_end_of_word_suffix, 0),
410
+ )?;
411
+ class.define_method(
412
+ "end_of_word_suffix=",
413
+ method!(RbModel::bpe_set_end_of_word_suffix, 1),
414
+ )?;
375
415
  class.define_method("fuse_unk", method!(RbModel::bpe_fuse_unk, 0))?;
376
416
  class.define_method("fuse_unk=", method!(RbModel::bpe_set_fuse_unk, 1))?;
377
417
  class.define_method("byte_fallback", method!(RbModel::bpe_byte_fallback, 0))?;
@@ -392,10 +432,22 @@ pub fn init_models(ruby: &Ruby, module: &RModule) -> RbResult<()> {
392
432
  class.define_singleton_method("_from_file", function!(RbWordPiece::from_file, 2))?;
393
433
  class.define_method("unk_token", method!(RbModel::word_piece_unk_token, 0))?;
394
434
  class.define_method("unk_token=", method!(RbModel::word_piece_set_unk_token, 1))?;
395
- class.define_method("continuing_subword_prefix", method!(RbModel::word_piece_continuing_subword_prefix, 0))?;
396
- class.define_method("continuing_subword_prefix=", method!(RbModel::word_piece_set_continuing_subword_prefix, 1))?;
397
- class.define_method("max_input_chars_per_word", method!(RbModel::word_piece_max_input_chars_per_word, 0))?;
398
- class.define_method("max_input_chars_per_word=", method!(RbModel::word_piece_set_max_input_chars_per_word, 1))?;
435
+ class.define_method(
436
+ "continuing_subword_prefix",
437
+ method!(RbModel::word_piece_continuing_subword_prefix, 0),
438
+ )?;
439
+ class.define_method(
440
+ "continuing_subword_prefix=",
441
+ method!(RbModel::word_piece_set_continuing_subword_prefix, 1),
442
+ )?;
443
+ class.define_method(
444
+ "max_input_chars_per_word",
445
+ method!(RbModel::word_piece_max_input_chars_per_word, 0),
446
+ )?;
447
+ class.define_method(
448
+ "max_input_chars_per_word=",
449
+ method!(RbModel::word_piece_set_max_input_chars_per_word, 1),
450
+ )?;
399
451
 
400
452
  Ok(())
401
453
  }
@@ -1,19 +1,19 @@
1
1
  use std::sync::{Arc, RwLock};
2
2
 
3
3
  use magnus::{
4
- data_type_builder, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Module, Object, RArray, RClass, RModule,
5
- Ruby, TryConvert, TypedData,
4
+ data_type_builder, function, method, value::Lazy, Class, DataType, DataTypeFunctions, Module,
5
+ Object, RArray, RClass, RModule, Ruby, TryConvert, TypedData,
6
6
  };
7
7
  use serde::ser::SerializeStruct;
8
8
  use serde::{Deserialize, Serialize, Serializer};
9
9
  use tk::normalizers::{
10
- BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Replace, Prepend, Strip, StripAccents,
11
- NFC, NFD, NFKC, NFKD,
10
+ BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Precompiled, Prepend, Replace, Strip,
11
+ StripAccents, NFC, NFD, NFKC, NFKD,
12
12
  };
13
13
  use tk::{NormalizedString, Normalizer};
14
14
 
15
15
  use super::utils::*;
16
- use super::{NORMALIZERS, RbError, RbResult};
16
+ use super::{RbError, RbResult, NORMALIZERS};
17
17
 
18
18
  #[derive(DataTypeFunctions, Clone, Serialize, Deserialize)]
19
19
  pub struct RbNormalizer {
@@ -28,7 +28,9 @@ impl RbNormalizer {
28
28
 
29
29
  pub fn normalize_str(&self, sequence: String) -> RbResult<String> {
30
30
  let mut normalized = NormalizedString::from(sequence);
31
- self.normalizer.normalize(&mut normalized).map_err(RbError::from)?;
31
+ self.normalizer
32
+ .normalize(&mut normalized)
33
+ .map_err(RbError::from)?;
32
34
  Ok(normalized.get().to_owned())
33
35
  }
34
36
  }
@@ -43,7 +45,8 @@ macro_rules! getter {
43
45
  ($self: ident, $variant: ident, $name: ident) => {{
44
46
  if let RbNormalizerTypeWrapper::Single(ref norm) = &$self.normalizer {
45
47
  let wrapper = norm.read().unwrap();
46
- if let RbNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (*wrapper).clone() {
48
+ if let RbNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (*wrapper).clone()
49
+ {
47
50
  o.$name
48
51
  } else {
49
52
  unreachable!()
@@ -66,7 +69,6 @@ macro_rules! setter {
66
69
  }
67
70
 
68
71
  impl RbNormalizer {
69
-
70
72
  fn bert_clean_text(&self) -> bool {
71
73
  getter!(self, BertNormalizer, clean_text)
72
74
  }
@@ -101,7 +103,7 @@ impl RbNormalizer {
101
103
  }
102
104
 
103
105
  fn bert_set_lowercase(&self, lowercase: bool) {
104
- setter!(self, BertNormalizer, lowercase, lowercase)
106
+ setter!(self, BertNormalizer, lowercase, lowercase);
105
107
  }
106
108
 
107
109
  fn prepend_prepend(&self) -> String {
@@ -109,7 +111,7 @@ impl RbNormalizer {
109
111
  }
110
112
 
111
113
  fn prepend_set_prepend(&self, prepend: String) {
112
- setter!(self, Prepend, prepend, prepend)
114
+ setter!(self, Prepend, prepend, prepend);
113
115
  }
114
116
 
115
117
  fn strip_left(&self) -> bool {
@@ -117,7 +119,7 @@ impl RbNormalizer {
117
119
  }
118
120
 
119
121
  fn strip_set_left(&self, left: bool) {
120
- setter!(self, StripNormalizer, strip_left, left)
122
+ setter!(self, StripNormalizer, strip_left, left);
121
123
  }
122
124
 
123
125
  fn strip_right(&self) -> bool {
@@ -125,14 +127,19 @@ impl RbNormalizer {
125
127
  }
126
128
 
127
129
  fn strip_set_right(&self, right: bool) {
128
- setter!(self, StripNormalizer, strip_right, right)
130
+ setter!(self, StripNormalizer, strip_right, right);
129
131
  }
130
132
  }
131
133
 
132
134
  pub struct RbBertNormalizer {}
133
135
 
134
136
  impl RbBertNormalizer {
135
- pub fn new(clean_text: bool, handle_chinese_chars: bool, strip_accents: Option<bool>, lowercase: bool) -> RbNormalizer {
137
+ pub fn new(
138
+ clean_text: bool,
139
+ handle_chinese_chars: bool,
140
+ strip_accents: Option<bool>,
141
+ lowercase: bool,
142
+ ) -> RbNormalizer {
136
143
  BertNormalizer::new(clean_text, handle_chinese_chars, strip_accents, lowercase).into()
137
144
  }
138
145
  }
@@ -185,11 +192,28 @@ impl RbNmt {
185
192
  }
186
193
  }
187
194
 
195
+ pub struct RbPrecompiled {}
196
+
197
+ impl RbPrecompiled {
198
+ pub fn new(precompiled_charsmap: Vec<u8>) -> RbResult<RbNormalizer> {
199
+ Precompiled::from(&precompiled_charsmap)
200
+ .map_err(|e| {
201
+ RbError::new_err(format!(
202
+ "Error while attempting to build Precompiled normalizer: {}",
203
+ e
204
+ ))
205
+ })
206
+ .map(|v| v.into())
207
+ }
208
+ }
209
+
188
210
  pub struct RbReplace {}
189
211
 
190
212
  impl RbReplace {
191
213
  pub fn new(pattern: RbPattern, content: String) -> RbResult<RbNormalizer> {
192
- Replace::new(pattern, content).map(|v| v.into()).map_err(RbError::from)
214
+ Replace::new(pattern, content)
215
+ .map(|v| v.into())
216
+ .map_err(RbError::from)
193
217
  }
194
218
  }
195
219
 
@@ -222,14 +246,16 @@ pub struct RbSequence {}
222
246
  impl RbSequence {
223
247
  fn new(normalizers: RArray) -> RbResult<RbNormalizer> {
224
248
  let mut sequence = Vec::with_capacity(normalizers.len());
225
- for n in normalizers.into_iter() {
249
+ for n in normalizers {
226
250
  let normalizer: &RbNormalizer = TryConvert::try_convert(n)?;
227
251
  match &normalizer.normalizer {
228
252
  RbNormalizerTypeWrapper::Sequence(inner) => sequence.extend(inner.iter().cloned()),
229
253
  RbNormalizerTypeWrapper::Single(inner) => sequence.push(inner.clone()),
230
254
  }
231
255
  }
232
- Ok(RbNormalizer::new(RbNormalizerTypeWrapper::Sequence(sequence)))
256
+ Ok(RbNormalizer::new(RbNormalizerTypeWrapper::Sequence(
257
+ sequence,
258
+ )))
233
259
  }
234
260
  }
235
261
 
@@ -328,7 +354,10 @@ impl Normalizer for RbNormalizerWrapper {
328
354
  unsafe impl TypedData for RbNormalizer {
329
355
  fn class(ruby: &Ruby) -> RClass {
330
356
  static CLASS: Lazy<RClass> = Lazy::new(|ruby| {
331
- let class: RClass = ruby.get_inner(&NORMALIZERS).const_get("Normalizer").unwrap();
357
+ let class: RClass = ruby
358
+ .get_inner(&NORMALIZERS)
359
+ .const_get("Normalizer")
360
+ .unwrap();
332
361
  class.undef_default_alloc_func();
333
362
  class
334
363
  });
@@ -336,7 +365,8 @@ unsafe impl TypedData for RbNormalizer {
336
365
  }
337
366
 
338
367
  fn data_type() -> &'static DataType {
339
- static DATA_TYPE: DataType = data_type_builder!(RbNormalizer, "Tokenizers::Normalizers::Normalizer").build();
368
+ static DATA_TYPE: DataType =
369
+ data_type_builder!(RbNormalizer, "Tokenizers::Normalizers::Normalizer").build();
340
370
  &DATA_TYPE
341
371
  }
342
372
 
@@ -347,7 +377,10 @@ unsafe impl TypedData for RbNormalizer {
347
377
  class
348
378
  });
349
379
  static BERT_NORMALIZER: Lazy<RClass> = Lazy::new(|ruby| {
350
- let class: RClass = ruby.get_inner(&NORMALIZERS).const_get("BertNormalizer").unwrap();
380
+ let class: RClass = ruby
381
+ .get_inner(&NORMALIZERS)
382
+ .const_get("BertNormalizer")
383
+ .unwrap();
351
384
  class.undef_default_alloc_func();
352
385
  class
353
386
  });
@@ -381,6 +414,14 @@ unsafe impl TypedData for RbNormalizer {
381
414
  class.undef_default_alloc_func();
382
415
  class
383
416
  });
417
+ static PRECOMPILED: Lazy<RClass> = Lazy::new(|ruby| {
418
+ let class: RClass = ruby
419
+ .get_inner(&NORMALIZERS)
420
+ .const_get("Precompiled")
421
+ .unwrap();
422
+ class.undef_default_alloc_func();
423
+ class
424
+ });
384
425
  static REPLACE: Lazy<RClass> = Lazy::new(|ruby| {
385
426
  let class: RClass = ruby.get_inner(&NORMALIZERS).const_get("Replace").unwrap();
386
427
  class.undef_default_alloc_func();
@@ -397,7 +438,10 @@ unsafe impl TypedData for RbNormalizer {
397
438
  class
398
439
  });
399
440
  static STRIP_ACCENTS: Lazy<RClass> = Lazy::new(|ruby| {
400
- let class: RClass = ruby.get_inner(&NORMALIZERS).const_get("StripAccents").unwrap();
441
+ let class: RClass = ruby
442
+ .get_inner(&NORMALIZERS)
443
+ .const_get("StripAccents")
444
+ .unwrap();
401
445
  class.undef_default_alloc_func();
402
446
  class
403
447
  });
@@ -412,6 +456,7 @@ unsafe impl TypedData for RbNormalizer {
412
456
  NormalizerWrapper::NFKC(_) => ruby.get_inner(&NFKC),
413
457
  NormalizerWrapper::NFKD(_) => ruby.get_inner(&NFKD),
414
458
  NormalizerWrapper::Nmt(_) => ruby.get_inner(&NMT),
459
+ NormalizerWrapper::Precompiled(_) => ruby.get_inner(&PRECOMPILED),
415
460
  NormalizerWrapper::Replace(_) => ruby.get_inner(&REPLACE),
416
461
  NormalizerWrapper::Prepend(_) => ruby.get_inner(&PREPEND),
417
462
  NormalizerWrapper::StripNormalizer(_) => ruby.get_inner(&STRIP),
@@ -434,10 +479,22 @@ pub fn init_normalizers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
434
479
  class.define_singleton_method("_new", function!(RbBertNormalizer::new, 4))?;
435
480
  class.define_method("clean_text", method!(RbNormalizer::bert_clean_text, 0))?;
436
481
  class.define_method("clean_text=", method!(RbNormalizer::bert_set_clean_text, 1))?;
437
- class.define_method("handle_chinese_chars", method!(RbNormalizer::bert_handle_chinese_chars, 0))?;
438
- class.define_method("handle_chinese_chars=", method!(RbNormalizer::bert_set_handle_chinese_chars, 1))?;
439
- class.define_method("strip_accents", method!(RbNormalizer::bert_strip_accents, 0))?;
440
- class.define_method("strip_accents=", method!(RbNormalizer::bert_set_strip_accents, 1))?;
482
+ class.define_method(
483
+ "handle_chinese_chars",
484
+ method!(RbNormalizer::bert_handle_chinese_chars, 0),
485
+ )?;
486
+ class.define_method(
487
+ "handle_chinese_chars=",
488
+ method!(RbNormalizer::bert_set_handle_chinese_chars, 1),
489
+ )?;
490
+ class.define_method(
491
+ "strip_accents",
492
+ method!(RbNormalizer::bert_strip_accents, 0),
493
+ )?;
494
+ class.define_method(
495
+ "strip_accents=",
496
+ method!(RbNormalizer::bert_set_strip_accents, 1),
497
+ )?;
441
498
  class.define_method("lowercase", method!(RbNormalizer::bert_lowercase, 0))?;
442
499
  class.define_method("lowercase=", method!(RbNormalizer::bert_set_lowercase, 1))?;
443
500
 
@@ -459,6 +516,9 @@ pub fn init_normalizers(ruby: &Ruby, module: &RModule) -> RbResult<()> {
459
516
  let class = module.define_class("Nmt", normalizer)?;
460
517
  class.define_singleton_method("new", function!(RbNmt::new, 0))?;
461
518
 
519
+ let class = module.define_class("Precompiled", normalizer)?;
520
+ class.define_singleton_method("new", function!(RbPrecompiled::new, 1))?;
521
+
462
522
  let class = module.define_class("Replace", normalizer)?;
463
523
  class.define_singleton_method("new", function!(RbReplace::new, 2))?;
464
524