tokenizers 0.6.3 → 0.7.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: b7e76174884c06417a6829e89ef1f5785957d92f10d8a9acd6586e19ec84737c
4
- data.tar.gz: 04b8e76f25f59e404978f2af68829142890c7bdd4ffbe7cd86a5af4f72de2d5e
3
+ metadata.gz: e11d6d7b7b6adb870221d30c3f11e3dee86a57e1334cd2a419a38959a6523712
4
+ data.tar.gz: 04a6be127e354dcb9f8f4f0656c242a2e8df12ce91b5614379404d6e35bb219f
5
5
  SHA512:
6
- metadata.gz: b9b2583c6c2aac22d835c045f6674a95a1f1a9dcdddd7d2406b34c7f64cb04bd8900de05feaa85a8f9d5601392636b41ed300326f362e9e5de29098506801cc6
7
- data.tar.gz: '08f6e8e3c4187a5f3bd57e75141add2973857a1fe4211b86acd8ab43a3ec5fc5550309910c606d09267dfe3242c7088b7ffa2d5ccb0f94c5ee922deb0fd0c943'
6
+ metadata.gz: 364207bd71c3aa9fe2760d4ec7ae58666274e7ba7fe0e753b33316b1b61bb411e2592a8cf2c8dff1ae37a3082607d2d07259375a50ff345769274f8aeedd89c1
7
+ data.tar.gz: 6a6a572a5925f3d140dcbbd93c23bae774d28898921b743dbe1626b02ebff046f54ba5231886e78d5b2ea51ca5c235582c1a440be2a89f35e13584bdbcf186d0
data/CHANGELOG.md CHANGED
@@ -1,3 +1,14 @@
1
+ ## 0.7.0 (2026-04-27)
2
+
3
+ - Updated Tokenizers to 0.23.1
4
+ - Added support for releasing GVL
5
+ - Added `encode_batch_fast` method to `Tokenizer`
6
+ - Dropped support for Ruby < 3.3
7
+
8
+ ## 0.6.4 (2026-04-09)
9
+
10
+ - Fixed caching
11
+
1
12
  ## 0.6.3 (2026-01-05)
2
13
 
3
14
  - Updated Tokenizers to 0.22.2
data/Cargo.lock CHANGED
@@ -33,16 +33,14 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
33
33
 
34
34
  [[package]]
35
35
  name = "bindgen"
36
- version = "0.69.5"
36
+ version = "0.72.1"
37
37
  source = "registry+https://github.com/rust-lang/crates.io-index"
38
- checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
38
+ checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
39
39
  dependencies = [
40
40
  "bitflags",
41
41
  "cexpr",
42
42
  "clang-sys",
43
43
  "itertools 0.12.1",
44
- "lazy_static",
45
- "lazycell",
46
44
  "proc-macro2",
47
45
  "quote",
48
46
  "regex",
@@ -160,6 +158,12 @@ version = "0.8.21"
160
158
  source = "registry+https://github.com/rust-lang/crates.io-index"
161
159
  checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
162
160
 
161
+ [[package]]
162
+ name = "daachorse"
163
+ version = "1.0.1"
164
+ source = "registry+https://github.com/rust-lang/crates.io-index"
165
+ checksum = "6f55d7153ba3b507595872a3874803f07a8a81d1e888abed8e5db7da0597d6e2"
166
+
163
167
  [[package]]
164
168
  name = "darling"
165
169
  version = "0.20.11"
@@ -339,12 +343,6 @@ version = "1.5.0"
339
343
  source = "registry+https://github.com/rust-lang/crates.io-index"
340
344
  checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
341
345
 
342
- [[package]]
343
- name = "lazycell"
344
- version = "1.3.0"
345
- source = "registry+https://github.com/rust-lang/crates.io-index"
346
- checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
347
-
348
346
  [[package]]
349
347
  name = "libc"
350
348
  version = "0.2.172"
@@ -530,9 +528,9 @@ checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
530
528
 
531
529
  [[package]]
532
530
  name = "rand"
533
- version = "0.9.1"
531
+ version = "0.9.4"
534
532
  source = "registry+https://github.com/rust-lang/crates.io-index"
535
- checksum = "9fbfd9d094a40bf3ae768db9361049ace4c0e04a4fd6b359518bd7b73a73dd97"
533
+ checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea"
536
534
  dependencies = [
537
535
  "rand_chacha",
538
536
  "rand_core",
@@ -590,18 +588,18 @@ dependencies = [
590
588
 
591
589
  [[package]]
592
590
  name = "rb-sys"
593
- version = "0.9.124"
591
+ version = "0.9.127"
594
592
  source = "registry+https://github.com/rust-lang/crates.io-index"
595
- checksum = "c85c4188462601e2aa1469def389c17228566f82ea72f137ed096f21591bc489"
593
+ checksum = "d7d7c9560fe42dcffa576941394075f18a17dce89fcf718a2fa90b7dc2134d12"
596
594
  dependencies = [
597
595
  "rb-sys-build",
598
596
  ]
599
597
 
600
598
  [[package]]
601
599
  name = "rb-sys-build"
602
- version = "0.9.124"
600
+ version = "0.9.127"
603
601
  source = "registry+https://github.com/rust-lang/crates.io-index"
604
- checksum = "568068db4102230882e6d4ae8de6632e224ca75fe5970f6e026a04e91ed635d3"
602
+ checksum = "f1688e8f32967ba48c89e4dfa283b57f901075f542fc7ee9c3d7c5f9091ca1d9"
605
603
  dependencies = [
606
604
  "bindgen",
607
605
  "lazy_static",
@@ -649,9 +647,9 @@ checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
649
647
 
650
648
  [[package]]
651
649
  name = "rustc-hash"
652
- version = "1.1.0"
650
+ version = "2.1.2"
653
651
  source = "registry+https://github.com/rust-lang/crates.io-index"
654
- checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
652
+ checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
655
653
 
656
654
  [[package]]
657
655
  name = "rustversion"
@@ -778,13 +776,13 @@ dependencies = [
778
776
 
779
777
  [[package]]
780
778
  name = "tokenizers"
781
- version = "0.22.2"
779
+ version = "0.23.1"
782
780
  source = "registry+https://github.com/rust-lang/crates.io-index"
783
- checksum = "b238e22d44a15349529690fb07bd645cf58149a1b1e44d6cb5bd1641ff1a6223"
781
+ checksum = "44e5bea67576e04b6ff8564c5d9e09c2ef0cf476502245f2f120e497769d3112"
784
782
  dependencies = [
785
783
  "ahash",
786
- "aho-corasick",
787
784
  "compact_str",
785
+ "daachorse",
788
786
  "dary_heap",
789
787
  "derive_builder",
790
788
  "esaxx-rs",
@@ -812,11 +810,12 @@ dependencies = [
812
810
 
813
811
  [[package]]
814
812
  name = "tokenizers-ruby"
815
- version = "0.6.3"
813
+ version = "0.7.0"
816
814
  dependencies = [
817
815
  "ahash",
818
816
  "magnus",
819
817
  "onig",
818
+ "rb-sys",
820
819
  "serde",
821
820
  "tokenizers",
822
821
  ]
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "tokenizers-ruby"
3
- version = "0.6.3"
3
+ version = "0.7.0"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -15,9 +15,10 @@ crate-type = ["cdylib"]
15
15
  ahash = { version = "0.8.11", features = ["serde"] }
16
16
  magnus = "0.8"
17
17
  onig = { version = "6", default-features = false }
18
+ rb-sys = "0.9"
18
19
  serde = { version = "1", features = ["rc", "derive"] }
19
20
 
20
21
  [dependencies.tokenizers]
21
- version = "=0.22.2" # also update in from_pretrained.rb
22
+ version = "=0.23.1" # also update in from_pretrained.rb
22
23
  default-features = false
23
24
  features = ["progressbar", "onig", "esaxx_fast"]
@@ -23,8 +23,8 @@ use super::utils::*;
23
23
  use super::{RbError, RbResult, DECODERS};
24
24
 
25
25
  #[derive(DataTypeFunctions, Clone, Deserialize, Serialize)]
26
+ #[serde(transparent)]
26
27
  pub struct RbDecoder {
27
- #[serde(flatten)]
28
28
  pub(crate) decoder: RbDecoderWrapper,
29
29
  }
30
30
 
@@ -69,7 +69,7 @@ macro_rules! setter {
69
69
  }};
70
70
  }
71
71
  impl RbDecoder {
72
- pub fn bpe_suffix(&self) -> String {
72
+ pub fn bpe_get_suffix(&self) -> String {
73
73
  getter!(self, BPE, suffix.clone())
74
74
  }
75
75
 
@@ -77,7 +77,7 @@ impl RbDecoder {
77
77
  setter!(self, BPE, suffix, suffix);
78
78
  }
79
79
 
80
- pub fn ctc_cleanup(&self) -> bool {
80
+ pub fn ctc_get_cleanup(&self) -> bool {
81
81
  getter!(self, CTC, cleanup)
82
82
  }
83
83
 
@@ -85,7 +85,7 @@ impl RbDecoder {
85
85
  setter!(self, CTC, cleanup, cleanup);
86
86
  }
87
87
 
88
- pub fn ctc_pad_token(&self) -> String {
88
+ pub fn ctc_get_pad_token(&self) -> String {
89
89
  getter!(self, CTC, pad_token.clone())
90
90
  }
91
91
 
@@ -93,7 +93,7 @@ impl RbDecoder {
93
93
  setter!(self, CTC, pad_token, pad_token);
94
94
  }
95
95
 
96
- pub fn ctc_word_delimiter_token(&self) -> String {
96
+ pub fn ctc_get_word_delimiter_token(&self) -> String {
97
97
  getter!(self, CTC, word_delimiter_token.clone())
98
98
  }
99
99
 
@@ -101,31 +101,31 @@ impl RbDecoder {
101
101
  setter!(self, CTC, word_delimiter_token, word_delimiter_token);
102
102
  }
103
103
 
104
- fn strip_content(&self) -> char {
104
+ pub fn strip_get_content(&self) -> char {
105
105
  getter!(self, Strip, content)
106
106
  }
107
107
 
108
- fn strip_set_content(&self, content: char) {
108
+ pub fn strip_set_content(&self, content: char) {
109
109
  setter!(self, Strip, content, content);
110
110
  }
111
111
 
112
- fn strip_start(&self) -> usize {
112
+ pub fn strip_get_start(&self) -> usize {
113
113
  getter!(self, Strip, start)
114
114
  }
115
115
 
116
- fn strip_set_start(&self, start: usize) {
116
+ pub fn strip_set_start(&self, start: usize) {
117
117
  setter!(self, Strip, start, start);
118
118
  }
119
119
 
120
- fn strip_stop(&self) -> usize {
120
+ pub fn strip_get_stop(&self) -> usize {
121
121
  getter!(self, Strip, stop)
122
122
  }
123
123
 
124
- fn strip_set_stop(&self, stop: usize) {
124
+ pub fn strip_set_stop(&self, stop: usize) {
125
125
  setter!(self, Strip, stop, stop);
126
126
  }
127
127
 
128
- pub fn metaspace_replacement(&self) -> char {
128
+ pub fn metaspace_get_replacement(&self) -> char {
129
129
  getter!(self, Metaspace, get_replacement().clone())
130
130
  }
131
131
 
@@ -133,7 +133,7 @@ impl RbDecoder {
133
133
  setter!(self, Metaspace, @set_replacement, replacement);
134
134
  }
135
135
 
136
- pub fn metaspace_split(&self) -> bool {
136
+ pub fn metaspace_get_split(&self) -> bool {
137
137
  getter!(self, Metaspace, get_split())
138
138
  }
139
139
 
@@ -141,7 +141,7 @@ impl RbDecoder {
141
141
  setter!(self, Metaspace, @set_split, split);
142
142
  }
143
143
 
144
- pub fn metaspace_prepend_scheme(&self) -> String {
144
+ pub fn metaspace_get_prepend_scheme(&self) -> String {
145
145
  // Assuming Metaspace has a method to get the prepend_scheme as a string
146
146
  let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
147
147
  match scheme {
@@ -158,7 +158,7 @@ impl RbDecoder {
158
158
  Ok(())
159
159
  }
160
160
 
161
- pub fn word_piece_cleanup(&self) -> bool {
161
+ pub fn word_piece_get_cleanup(&self) -> bool {
162
162
  getter!(self, WordPiece, cleanup)
163
163
  }
164
164
 
@@ -166,7 +166,7 @@ impl RbDecoder {
166
166
  setter!(self, WordPiece, cleanup, cleanup);
167
167
  }
168
168
 
169
- pub fn word_piece_prefix(&self) -> String {
169
+ pub fn word_piece_get_prefix(&self) -> String {
170
170
  getter!(self, WordPiece, prefix.clone())
171
171
  }
172
172
 
@@ -371,7 +371,7 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
371
371
 
372
372
  let class = module.define_class("BPEDecoder", decoder)?;
373
373
  class.define_singleton_method("_new", function!(RbBPEDecoder::new, 1))?;
374
- class.define_method("suffix", method!(RbDecoder::bpe_suffix, 0))?;
374
+ class.define_method("suffix", method!(RbDecoder::bpe_get_suffix, 0))?;
375
375
  class.define_method("suffix=", method!(RbDecoder::bpe_set_suffix, 1))?;
376
376
 
377
377
  let class = module.define_class("ByteFallback", decoder)?;
@@ -382,13 +382,13 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
382
382
 
383
383
  let class = module.define_class("CTC", decoder)?;
384
384
  class.define_singleton_method("_new", function!(RbCTC::new, 3))?;
385
- class.define_method("cleanup", method!(RbDecoder::ctc_cleanup, 0))?;
385
+ class.define_method("cleanup", method!(RbDecoder::ctc_get_cleanup, 0))?;
386
386
  class.define_method("cleanup=", method!(RbDecoder::ctc_set_cleanup, 1))?;
387
- class.define_method("pad_token", method!(RbDecoder::ctc_pad_token, 0))?;
387
+ class.define_method("pad_token", method!(RbDecoder::ctc_get_pad_token, 0))?;
388
388
  class.define_method("pad_token=", method!(RbDecoder::ctc_set_pad_token, 1))?;
389
389
  class.define_method(
390
390
  "word_delimiter_token",
391
- method!(RbDecoder::ctc_word_delimiter_token, 0),
391
+ method!(RbDecoder::ctc_get_word_delimiter_token, 0),
392
392
  )?;
393
393
  class.define_method(
394
394
  "word_delimiter_token=",
@@ -402,18 +402,21 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
402
402
  class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 3))?;
403
403
  class.define_method(
404
404
  "prepend_scheme",
405
- method!(RbDecoder::metaspace_prepend_scheme, 0),
405
+ method!(RbDecoder::metaspace_get_prepend_scheme, 0),
406
406
  )?;
407
407
  class.define_method(
408
408
  "prepend_scheme=",
409
409
  method!(RbDecoder::metaspace_set_prepend_scheme, 1),
410
410
  )?;
411
- class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
411
+ class.define_method(
412
+ "replacement",
413
+ method!(RbDecoder::metaspace_get_replacement, 0),
414
+ )?;
412
415
  class.define_method(
413
416
  "replacement=",
414
417
  method!(RbDecoder::metaspace_set_replacement, 1),
415
418
  )?;
416
- class.define_method("split", method!(RbDecoder::metaspace_split, 0))?;
419
+ class.define_method("split", method!(RbDecoder::metaspace_get_split, 0))?;
417
420
  class.define_method("split=", method!(RbDecoder::metaspace_set_split, 1))?;
418
421
 
419
422
  let class = module.define_class("Replace", decoder)?;
@@ -421,18 +424,18 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
421
424
 
422
425
  let class = module.define_class("Strip", decoder)?;
423
426
  class.define_singleton_method("_new", function!(RbStripDecoder::new, 3))?;
424
- class.define_method("content", method!(RbDecoder::strip_content, 0))?;
427
+ class.define_method("content", method!(RbDecoder::strip_get_content, 0))?;
425
428
  class.define_method("content=", method!(RbDecoder::strip_set_content, 1))?;
426
- class.define_method("start", method!(RbDecoder::strip_start, 0))?;
429
+ class.define_method("start", method!(RbDecoder::strip_get_start, 0))?;
427
430
  class.define_method("start=", method!(RbDecoder::strip_set_start, 1))?;
428
- class.define_method("stop", method!(RbDecoder::strip_stop, 0))?;
431
+ class.define_method("stop", method!(RbDecoder::strip_get_stop, 0))?;
429
432
  class.define_method("stop=", method!(RbDecoder::strip_set_stop, 1))?;
430
433
 
431
434
  let class = module.define_class("WordPiece", decoder)?;
432
435
  class.define_singleton_method("_new", function!(RbWordPieceDecoder::new, 2))?;
433
- class.define_method("cleanup", method!(RbDecoder::word_piece_cleanup, 0))?;
436
+ class.define_method("cleanup", method!(RbDecoder::word_piece_get_cleanup, 0))?;
434
437
  class.define_method("cleanup=", method!(RbDecoder::word_piece_set_cleanup, 1))?;
435
- class.define_method("prefix", method!(RbDecoder::word_piece_prefix, 0))?;
438
+ class.define_method("prefix", method!(RbDecoder::word_piece_get_prefix, 0))?;
436
439
  class.define_method("prefix=", method!(RbDecoder::word_piece_set_prefix, 1))?;
437
440
 
438
441
  Ok(())
@@ -1,6 +1,8 @@
1
- use magnus::{RArray, Ruby};
1
+ use magnus::{method, Module, RArray, RModule, Ruby};
2
2
  use tk::{Encoding, Offsets};
3
3
 
4
+ use super::RbResult;
5
+
4
6
  #[magnus::wrap(class = "Tokenizers::Encoding")]
5
7
  #[repr(transparent)]
6
8
  pub struct RbEncoding {
@@ -14,43 +16,43 @@ impl From<Encoding> for RbEncoding {
14
16
  }
15
17
 
16
18
  impl RbEncoding {
17
- pub fn n_sequences(&self) -> usize {
19
+ pub fn get_n_sequences(&self) -> usize {
18
20
  self.encoding.n_sequences()
19
21
  }
20
22
 
21
- pub fn ids(&self) -> Vec<u32> {
23
+ pub fn get_ids(&self) -> Vec<u32> {
22
24
  self.encoding.get_ids().to_vec()
23
25
  }
24
26
 
25
- pub fn tokens(&self) -> Vec<String> {
27
+ pub fn get_tokens(&self) -> Vec<String> {
26
28
  self.encoding.get_tokens().to_vec()
27
29
  }
28
30
 
29
- pub fn word_ids(&self) -> Vec<Option<u32>> {
31
+ pub fn get_word_ids(&self) -> Vec<Option<u32>> {
30
32
  self.encoding.get_word_ids().to_vec()
31
33
  }
32
34
 
33
- pub fn sequence_ids(&self) -> Vec<Option<usize>> {
35
+ pub fn get_sequence_ids(&self) -> Vec<Option<usize>> {
34
36
  self.encoding.get_sequence_ids()
35
37
  }
36
38
 
37
- pub fn type_ids(&self) -> Vec<u32> {
39
+ pub fn get_type_ids(&self) -> Vec<u32> {
38
40
  self.encoding.get_type_ids().to_vec()
39
41
  }
40
42
 
41
- pub fn offsets(&self) -> Vec<(usize, usize)> {
43
+ pub fn get_offsets(&self) -> Vec<(usize, usize)> {
42
44
  self.encoding.get_offsets().to_vec()
43
45
  }
44
46
 
45
- pub fn special_tokens_mask(&self) -> Vec<u32> {
47
+ pub fn get_special_tokens_mask(&self) -> Vec<u32> {
46
48
  self.encoding.get_special_tokens_mask().to_vec()
47
49
  }
48
50
 
49
- pub fn attention_mask(&self) -> Vec<u32> {
51
+ pub fn get_attention_mask(&self) -> Vec<u32> {
50
52
  self.encoding.get_attention_mask().to_vec()
51
53
  }
52
54
 
53
- pub fn overflowing(ruby: &Ruby, rb_self: &Self) -> RArray {
55
+ pub fn get_overflowing(ruby: &Ruby, rb_self: &Self) -> RArray {
54
56
  ruby.ary_from_iter(
55
57
  rb_self
56
58
  .encoding
@@ -91,3 +93,32 @@ impl RbEncoding {
91
93
  self.encoding.char_to_word(char_pos, sequence_index)
92
94
  }
93
95
  }
96
+
97
+ pub fn init_encoding(ruby: &Ruby, module: &RModule) -> RbResult<()> {
98
+ let class = module.define_class("Encoding", ruby.class_object())?;
99
+ class.define_method("n_sequences", method!(RbEncoding::get_n_sequences, 0))?;
100
+ class.define_method("ids", method!(RbEncoding::get_ids, 0))?;
101
+ class.define_method("tokens", method!(RbEncoding::get_tokens, 0))?;
102
+ class.define_method("word_ids", method!(RbEncoding::get_word_ids, 0))?;
103
+ class.define_method("sequence_ids", method!(RbEncoding::get_sequence_ids, 0))?;
104
+ class.define_method("type_ids", method!(RbEncoding::get_type_ids, 0))?;
105
+ class.define_method("offsets", method!(RbEncoding::get_offsets, 0))?;
106
+ class.define_method(
107
+ "special_tokens_mask",
108
+ method!(RbEncoding::get_special_tokens_mask, 0),
109
+ )?;
110
+ class.define_method("attention_mask", method!(RbEncoding::get_attention_mask, 0))?;
111
+ class.define_method("overflowing", method!(RbEncoding::get_overflowing, 0))?;
112
+ class.define_method("_word_to_tokens", method!(RbEncoding::word_to_tokens, 2))?;
113
+ class.define_method("_word_to_chars", method!(RbEncoding::word_to_chars, 2))?;
114
+ class.define_method(
115
+ "token_to_sequence",
116
+ method!(RbEncoding::token_to_sequence, 1),
117
+ )?;
118
+ class.define_method("token_to_chars", method!(RbEncoding::token_to_chars, 1))?;
119
+ class.define_method("token_to_word", method!(RbEncoding::token_to_word, 1))?;
120
+ class.define_method("_char_to_token", method!(RbEncoding::char_to_token, 2))?;
121
+ class.define_method("_char_to_word", method!(RbEncoding::char_to_word, 2))?;
122
+
123
+ Ok(())
124
+ }
@@ -1,3 +1,5 @@
1
+ use std::borrow::Cow;
2
+
1
3
  use magnus::{prelude::*, value::Lazy, Error, ExceptionClass, Ruby};
2
4
 
3
5
  use super::TOKENIZERS;
@@ -7,17 +9,20 @@ pub struct RbError {}
7
9
  impl RbError {
8
10
  // convert to Error instead of Self
9
11
  pub fn from(e: Box<dyn std::error::Error + Send + Sync>) -> Error {
10
- Error::new(error(), e.to_string())
12
+ Error::new(error(&Ruby::get().unwrap()), e.to_string())
11
13
  }
12
14
 
13
- pub fn new_err(s: String) -> Error {
14
- Error::new(error(), s)
15
+ pub fn new_err<T>(s: T) -> Error
16
+ where
17
+ T: Into<Cow<'static, str>>,
18
+ {
19
+ Error::new(error(&Ruby::get().unwrap()), s)
15
20
  }
16
21
  }
17
22
 
18
23
  static ERROR: Lazy<ExceptionClass> =
19
24
  Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Error").unwrap());
20
25
 
21
- fn error() -> ExceptionClass {
22
- Ruby::get().unwrap().get_inner(&ERROR)
26
+ fn error(ruby: &Ruby) -> ExceptionClass {
27
+ ruby.get_inner(&ERROR)
23
28
  }
@@ -9,16 +9,15 @@ mod models;
9
9
  mod normalizers;
10
10
  mod pre_tokenizers;
11
11
  mod processors;
12
+ mod ruby;
12
13
  mod tokenizer;
13
14
  mod trainers;
14
15
  mod utils;
15
16
 
16
- use encoding::RbEncoding;
17
17
  use error::RbError;
18
- use tokenizer::{RbAddedToken, RbTokenizer};
19
18
  use utils::RbRegex;
20
19
 
21
- use magnus::{function, method, prelude::*, value::Lazy, Error, RModule, Ruby};
20
+ use magnus::{function, prelude::*, value::Lazy, Error, RModule, Ruby};
22
21
 
23
22
  type RbResult<T> = Result<T, Error>;
24
23
 
@@ -53,97 +52,9 @@ static TRAINERS: Lazy<RModule> =
53
52
  fn init(ruby: &Ruby) -> RbResult<()> {
54
53
  let module = ruby.define_module("Tokenizers")?;
55
54
 
56
- let class = module.define_class("Tokenizer", ruby.class_object())?;
57
- class.define_singleton_method("new", function!(RbTokenizer::from_model, 1))?;
58
- class.define_singleton_method("from_str", function!(RbTokenizer::from_str, 1))?;
59
- class.define_singleton_method("from_file", function!(RbTokenizer::from_file, 1))?;
60
- class.define_method(
61
- "add_special_tokens",
62
- method!(RbTokenizer::add_special_tokens, 1),
63
- )?;
64
- class.define_method("train", method!(RbTokenizer::train, 2))?;
65
- class.define_method("_save", method!(RbTokenizer::save, 2))?;
66
- class.define_method("add_tokens", method!(RbTokenizer::add_tokens, 1))?;
67
- class.define_method("_encode", method!(RbTokenizer::encode, 4))?;
68
- class.define_method("_encode_batch", method!(RbTokenizer::encode_batch, 3))?;
69
- class.define_method("_decode", method!(RbTokenizer::decode, 2))?;
70
- class.define_method("_decode_batch", method!(RbTokenizer::decode_batch, 2))?;
71
- class.define_method("model", method!(RbTokenizer::get_model, 0))?;
72
- class.define_method("model=", method!(RbTokenizer::set_model, 1))?;
73
- class.define_method("decoder", method!(RbTokenizer::get_decoder, 0))?;
74
- class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
75
- class.define_method("pre_tokenizer", method!(RbTokenizer::get_pre_tokenizer, 0))?;
76
- class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
77
- class.define_method(
78
- "post_processor",
79
- method!(RbTokenizer::get_post_processor, 0),
80
- )?;
81
- class.define_method(
82
- "post_processor=",
83
- method!(RbTokenizer::set_post_processor, 1),
84
- )?;
85
- class.define_method("normalizer", method!(RbTokenizer::get_normalizer, 0))?;
86
- class.define_method("normalizer=", method!(RbTokenizer::set_normalizer, 1))?;
87
- class.define_method("token_to_id", method!(RbTokenizer::token_to_id, 1))?;
88
- class.define_method("id_to_token", method!(RbTokenizer::id_to_token, 1))?;
89
- class.define_method("_enable_padding", method!(RbTokenizer::enable_padding, 1))?;
90
- class.define_method("padding", method!(RbTokenizer::padding, 0))?;
91
- class.define_method("no_padding", method!(RbTokenizer::no_padding, 0))?;
92
- class.define_method(
93
- "_enable_truncation",
94
- method!(RbTokenizer::enable_truncation, 2),
95
- )?;
96
- class.define_method("truncation", method!(RbTokenizer::truncation, 0))?;
97
- class.define_method("no_truncation", method!(RbTokenizer::no_truncation, 0))?;
98
- class.define_method(
99
- "num_special_tokens_to_add",
100
- method!(RbTokenizer::num_special_tokens_to_add, 1),
101
- )?;
102
- class.define_method("_vocab", method!(RbTokenizer::vocab, 1))?;
103
- class.define_method("_vocab_size", method!(RbTokenizer::vocab_size, 1))?;
104
- class.define_method(
105
- "added_tokens_decoder",
106
- method!(RbTokenizer::get_added_tokens_decoder, 0),
107
- )?;
108
- class.define_method("_to_s", method!(RbTokenizer::to_str, 1))?;
109
-
110
- let class = module.define_class("Encoding", ruby.class_object())?;
111
- class.define_method("n_sequences", method!(RbEncoding::n_sequences, 0))?;
112
- class.define_method("ids", method!(RbEncoding::ids, 0))?;
113
- class.define_method("tokens", method!(RbEncoding::tokens, 0))?;
114
- class.define_method("word_ids", method!(RbEncoding::word_ids, 0))?;
115
- class.define_method("sequence_ids", method!(RbEncoding::sequence_ids, 0))?;
116
- class.define_method("type_ids", method!(RbEncoding::type_ids, 0))?;
117
- class.define_method("offsets", method!(RbEncoding::offsets, 0))?;
118
- class.define_method(
119
- "special_tokens_mask",
120
- method!(RbEncoding::special_tokens_mask, 0),
121
- )?;
122
- class.define_method("attention_mask", method!(RbEncoding::attention_mask, 0))?;
123
- class.define_method("overflowing", method!(RbEncoding::overflowing, 0))?;
124
- class.define_method("_word_to_tokens", method!(RbEncoding::word_to_tokens, 2))?;
125
- class.define_method("_word_to_chars", method!(RbEncoding::word_to_chars, 2))?;
126
- class.define_method(
127
- "token_to_sequence",
128
- method!(RbEncoding::token_to_sequence, 1),
129
- )?;
130
- class.define_method("token_to_chars", method!(RbEncoding::token_to_chars, 1))?;
131
- class.define_method("token_to_word", method!(RbEncoding::token_to_word, 1))?;
132
- class.define_method("_char_to_token", method!(RbEncoding::char_to_token, 2))?;
133
- class.define_method("_char_to_word", method!(RbEncoding::char_to_word, 2))?;
134
-
135
55
  let class = module.define_class("Regex", ruby.class_object())?;
136
56
  class.define_singleton_method("new", function!(RbRegex::new, 1))?;
137
57
 
138
- let class = module.define_class("AddedToken", ruby.class_object())?;
139
- class.define_singleton_method("_new", function!(RbAddedToken::new, 2))?;
140
- class.define_method("content", method!(RbAddedToken::get_content, 0))?;
141
- class.define_method("rstrip", method!(RbAddedToken::get_rstrip, 0))?;
142
- class.define_method("lstrip", method!(RbAddedToken::get_lstrip, 0))?;
143
- class.define_method("single_word", method!(RbAddedToken::get_single_word, 0))?;
144
- class.define_method("normalized", method!(RbAddedToken::get_normalized, 0))?;
145
- class.define_method("special", method!(RbAddedToken::get_special, 0))?;
146
-
147
58
  let models = module.define_module("Models")?;
148
59
  let pre_tokenizers = module.define_module("PreTokenizers")?;
149
60
  let decoders = module.define_module("Decoders")?;
@@ -151,6 +62,8 @@ fn init(ruby: &Ruby) -> RbResult<()> {
151
62
  let normalizers = module.define_module("Normalizers")?;
152
63
  let trainers = module.define_module("Trainers")?;
153
64
 
65
+ tokenizer::init_tokenizer(ruby, &module)?;
66
+ encoding::init_encoding(ruby, &module)?;
154
67
  models::init_models(ruby, &models)?;
155
68
  pre_tokenizers::init_pre_tokenizers(ruby, &pre_tokenizers)?;
156
69
  decoders::init_decoders(ruby, &decoders)?;