tokenizers 0.6.4 → 0.7.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/Cargo.lock +21 -22
- data/ext/tokenizers/Cargo.toml +3 -2
- data/ext/tokenizers/src/decoders.rs +31 -28
- data/ext/tokenizers/src/encoding.rs +42 -11
- data/ext/tokenizers/src/error.rs +10 -5
- data/ext/tokenizers/src/lib.rs +4 -91
- data/ext/tokenizers/src/models.rs +21 -21
- data/ext/tokenizers/src/normalizers.rs +15 -15
- data/ext/tokenizers/src/pre_tokenizers.rs +15 -15
- data/ext/tokenizers/src/processors.rs +145 -15
- data/ext/tokenizers/src/ruby.rs +51 -0
- data/ext/tokenizers/src/tokenizer.rs +381 -244
- data/ext/tokenizers/src/trainers.rs +55 -49
- data/ext/tokenizers/src/utils/normalization.rs +2 -1
- data/ext/tokenizers/src/utils/regex.rs +2 -2
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/processors/sequence.rb +9 -0
- data/lib/tokenizers/tokenizer.rb +4 -0
- data/lib/tokenizers/version.rb +1 -1
- metadata +4 -2
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e11d6d7b7b6adb870221d30c3f11e3dee86a57e1334cd2a419a38959a6523712
|
|
4
|
+
data.tar.gz: 04a6be127e354dcb9f8f4f0656c242a2e8df12ce91b5614379404d6e35bb219f
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 364207bd71c3aa9fe2760d4ec7ae58666274e7ba7fe0e753b33316b1b61bb411e2592a8cf2c8dff1ae37a3082607d2d07259375a50ff345769274f8aeedd89c1
|
|
7
|
+
data.tar.gz: 6a6a572a5925f3d140dcbbd93c23bae774d28898921b743dbe1626b02ebff046f54ba5231886e78d5b2ea51ca5c235582c1a440be2a89f35e13584bdbcf186d0
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
|
@@ -33,16 +33,14 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
|
|
33
33
|
|
|
34
34
|
[[package]]
|
|
35
35
|
name = "bindgen"
|
|
36
|
-
version = "0.
|
|
36
|
+
version = "0.72.1"
|
|
37
37
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
38
|
-
checksum = "
|
|
38
|
+
checksum = "993776b509cfb49c750f11b8f07a46fa23e0a1386ffc01fb1e7d343efc387895"
|
|
39
39
|
dependencies = [
|
|
40
40
|
"bitflags",
|
|
41
41
|
"cexpr",
|
|
42
42
|
"clang-sys",
|
|
43
43
|
"itertools 0.12.1",
|
|
44
|
-
"lazy_static",
|
|
45
|
-
"lazycell",
|
|
46
44
|
"proc-macro2",
|
|
47
45
|
"quote",
|
|
48
46
|
"regex",
|
|
@@ -160,6 +158,12 @@ version = "0.8.21"
|
|
|
160
158
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
161
159
|
checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
|
|
162
160
|
|
|
161
|
+
[[package]]
|
|
162
|
+
name = "daachorse"
|
|
163
|
+
version = "1.0.1"
|
|
164
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
165
|
+
checksum = "6f55d7153ba3b507595872a3874803f07a8a81d1e888abed8e5db7da0597d6e2"
|
|
166
|
+
|
|
163
167
|
[[package]]
|
|
164
168
|
name = "darling"
|
|
165
169
|
version = "0.20.11"
|
|
@@ -339,12 +343,6 @@ version = "1.5.0"
|
|
|
339
343
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
340
344
|
checksum = "bbd2bcb4c963f2ddae06a2efc7e9f3591312473c50c6685e1f298068316e66fe"
|
|
341
345
|
|
|
342
|
-
[[package]]
|
|
343
|
-
name = "lazycell"
|
|
344
|
-
version = "1.3.0"
|
|
345
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
346
|
-
checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
|
347
|
-
|
|
348
346
|
[[package]]
|
|
349
347
|
name = "libc"
|
|
350
348
|
version = "0.2.172"
|
|
@@ -530,9 +528,9 @@ checksum = "69cdb34c158ceb288df11e18b4bd39de994f6657d83847bdffdbd7f346754b0f"
|
|
|
530
528
|
|
|
531
529
|
[[package]]
|
|
532
530
|
name = "rand"
|
|
533
|
-
version = "0.9.
|
|
531
|
+
version = "0.9.4"
|
|
534
532
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
535
|
-
checksum = "
|
|
533
|
+
checksum = "44c5af06bb1b7d3216d91932aed5265164bf384dc89cd6ba05cf59a35f5f76ea"
|
|
536
534
|
dependencies = [
|
|
537
535
|
"rand_chacha",
|
|
538
536
|
"rand_core",
|
|
@@ -590,18 +588,18 @@ dependencies = [
|
|
|
590
588
|
|
|
591
589
|
[[package]]
|
|
592
590
|
name = "rb-sys"
|
|
593
|
-
version = "0.9.
|
|
591
|
+
version = "0.9.127"
|
|
594
592
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
595
|
-
checksum = "
|
|
593
|
+
checksum = "d7d7c9560fe42dcffa576941394075f18a17dce89fcf718a2fa90b7dc2134d12"
|
|
596
594
|
dependencies = [
|
|
597
595
|
"rb-sys-build",
|
|
598
596
|
]
|
|
599
597
|
|
|
600
598
|
[[package]]
|
|
601
599
|
name = "rb-sys-build"
|
|
602
|
-
version = "0.9.
|
|
600
|
+
version = "0.9.127"
|
|
603
601
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
604
|
-
checksum = "
|
|
602
|
+
checksum = "f1688e8f32967ba48c89e4dfa283b57f901075f542fc7ee9c3d7c5f9091ca1d9"
|
|
605
603
|
dependencies = [
|
|
606
604
|
"bindgen",
|
|
607
605
|
"lazy_static",
|
|
@@ -649,9 +647,9 @@ checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
|
|
|
649
647
|
|
|
650
648
|
[[package]]
|
|
651
649
|
name = "rustc-hash"
|
|
652
|
-
version = "
|
|
650
|
+
version = "2.1.2"
|
|
653
651
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
654
|
-
checksum = "
|
|
652
|
+
checksum = "94300abf3f1ae2e2b8ffb7b58043de3d399c73fa6f4b73826402a5c457614dbe"
|
|
655
653
|
|
|
656
654
|
[[package]]
|
|
657
655
|
name = "rustversion"
|
|
@@ -778,13 +776,13 @@ dependencies = [
|
|
|
778
776
|
|
|
779
777
|
[[package]]
|
|
780
778
|
name = "tokenizers"
|
|
781
|
-
version = "0.
|
|
779
|
+
version = "0.23.1"
|
|
782
780
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
783
|
-
checksum = "
|
|
781
|
+
checksum = "44e5bea67576e04b6ff8564c5d9e09c2ef0cf476502245f2f120e497769d3112"
|
|
784
782
|
dependencies = [
|
|
785
783
|
"ahash",
|
|
786
|
-
"aho-corasick",
|
|
787
784
|
"compact_str",
|
|
785
|
+
"daachorse",
|
|
788
786
|
"dary_heap",
|
|
789
787
|
"derive_builder",
|
|
790
788
|
"esaxx-rs",
|
|
@@ -812,11 +810,12 @@ dependencies = [
|
|
|
812
810
|
|
|
813
811
|
[[package]]
|
|
814
812
|
name = "tokenizers-ruby"
|
|
815
|
-
version = "0.
|
|
813
|
+
version = "0.7.0"
|
|
816
814
|
dependencies = [
|
|
817
815
|
"ahash",
|
|
818
816
|
"magnus",
|
|
819
817
|
"onig",
|
|
818
|
+
"rb-sys",
|
|
820
819
|
"serde",
|
|
821
820
|
"tokenizers",
|
|
822
821
|
]
|
data/ext/tokenizers/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "tokenizers-ruby"
|
|
3
|
-
version = "0.
|
|
3
|
+
version = "0.7.0"
|
|
4
4
|
license = "Apache-2.0"
|
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
|
6
6
|
edition = "2021"
|
|
@@ -15,9 +15,10 @@ crate-type = ["cdylib"]
|
|
|
15
15
|
ahash = { version = "0.8.11", features = ["serde"] }
|
|
16
16
|
magnus = "0.8"
|
|
17
17
|
onig = { version = "6", default-features = false }
|
|
18
|
+
rb-sys = "0.9"
|
|
18
19
|
serde = { version = "1", features = ["rc", "derive"] }
|
|
19
20
|
|
|
20
21
|
[dependencies.tokenizers]
|
|
21
|
-
version = "=0.
|
|
22
|
+
version = "=0.23.1" # also update in from_pretrained.rb
|
|
22
23
|
default-features = false
|
|
23
24
|
features = ["progressbar", "onig", "esaxx_fast"]
|
data/ext/tokenizers/src/decoders.rs
CHANGED
|
@@ -23,8 +23,8 @@ use super::utils::*;
|
|
|
23
23
|
use super::{RbError, RbResult, DECODERS};
|
|
24
24
|
|
|
25
25
|
#[derive(DataTypeFunctions, Clone, Deserialize, Serialize)]
|
|
26
|
+
#[serde(transparent)]
|
|
26
27
|
pub struct RbDecoder {
|
|
27
|
-
#[serde(flatten)]
|
|
28
28
|
pub(crate) decoder: RbDecoderWrapper,
|
|
29
29
|
}
|
|
30
30
|
|
|
@@ -69,7 +69,7 @@ macro_rules! setter {
|
|
|
69
69
|
}};
|
|
70
70
|
}
|
|
71
71
|
impl RbDecoder {
|
|
72
|
-
pub fn
|
|
72
|
+
pub fn bpe_get_suffix(&self) -> String {
|
|
73
73
|
getter!(self, BPE, suffix.clone())
|
|
74
74
|
}
|
|
75
75
|
|
|
@@ -77,7 +77,7 @@ impl RbDecoder {
|
|
|
77
77
|
setter!(self, BPE, suffix, suffix);
|
|
78
78
|
}
|
|
79
79
|
|
|
80
|
-
pub fn
|
|
80
|
+
pub fn ctc_get_cleanup(&self) -> bool {
|
|
81
81
|
getter!(self, CTC, cleanup)
|
|
82
82
|
}
|
|
83
83
|
|
|
@@ -85,7 +85,7 @@ impl RbDecoder {
|
|
|
85
85
|
setter!(self, CTC, cleanup, cleanup);
|
|
86
86
|
}
|
|
87
87
|
|
|
88
|
-
pub fn
|
|
88
|
+
pub fn ctc_get_pad_token(&self) -> String {
|
|
89
89
|
getter!(self, CTC, pad_token.clone())
|
|
90
90
|
}
|
|
91
91
|
|
|
@@ -93,7 +93,7 @@ impl RbDecoder {
|
|
|
93
93
|
setter!(self, CTC, pad_token, pad_token);
|
|
94
94
|
}
|
|
95
95
|
|
|
96
|
-
pub fn
|
|
96
|
+
pub fn ctc_get_word_delimiter_token(&self) -> String {
|
|
97
97
|
getter!(self, CTC, word_delimiter_token.clone())
|
|
98
98
|
}
|
|
99
99
|
|
|
@@ -101,31 +101,31 @@ impl RbDecoder {
|
|
|
101
101
|
setter!(self, CTC, word_delimiter_token, word_delimiter_token);
|
|
102
102
|
}
|
|
103
103
|
|
|
104
|
-
fn
|
|
104
|
+
pub fn strip_get_content(&self) -> char {
|
|
105
105
|
getter!(self, Strip, content)
|
|
106
106
|
}
|
|
107
107
|
|
|
108
|
-
fn strip_set_content(&self, content: char) {
|
|
108
|
+
pub fn strip_set_content(&self, content: char) {
|
|
109
109
|
setter!(self, Strip, content, content);
|
|
110
110
|
}
|
|
111
111
|
|
|
112
|
-
fn
|
|
112
|
+
pub fn strip_get_start(&self) -> usize {
|
|
113
113
|
getter!(self, Strip, start)
|
|
114
114
|
}
|
|
115
115
|
|
|
116
|
-
fn strip_set_start(&self, start: usize) {
|
|
116
|
+
pub fn strip_set_start(&self, start: usize) {
|
|
117
117
|
setter!(self, Strip, start, start);
|
|
118
118
|
}
|
|
119
119
|
|
|
120
|
-
fn
|
|
120
|
+
pub fn strip_get_stop(&self) -> usize {
|
|
121
121
|
getter!(self, Strip, stop)
|
|
122
122
|
}
|
|
123
123
|
|
|
124
|
-
fn strip_set_stop(&self, stop: usize) {
|
|
124
|
+
pub fn strip_set_stop(&self, stop: usize) {
|
|
125
125
|
setter!(self, Strip, stop, stop);
|
|
126
126
|
}
|
|
127
127
|
|
|
128
|
-
pub fn
|
|
128
|
+
pub fn metaspace_get_replacement(&self) -> char {
|
|
129
129
|
getter!(self, Metaspace, get_replacement().clone())
|
|
130
130
|
}
|
|
131
131
|
|
|
@@ -133,7 +133,7 @@ impl RbDecoder {
|
|
|
133
133
|
setter!(self, Metaspace, @set_replacement, replacement);
|
|
134
134
|
}
|
|
135
135
|
|
|
136
|
-
pub fn
|
|
136
|
+
pub fn metaspace_get_split(&self) -> bool {
|
|
137
137
|
getter!(self, Metaspace, get_split())
|
|
138
138
|
}
|
|
139
139
|
|
|
@@ -141,7 +141,7 @@ impl RbDecoder {
|
|
|
141
141
|
setter!(self, Metaspace, @set_split, split);
|
|
142
142
|
}
|
|
143
143
|
|
|
144
|
-
pub fn
|
|
144
|
+
pub fn metaspace_get_prepend_scheme(&self) -> String {
|
|
145
145
|
// Assuming Metaspace has a method to get the prepend_scheme as a string
|
|
146
146
|
let scheme: PrependScheme = getter!(self, Metaspace, get_prepend_scheme());
|
|
147
147
|
match scheme {
|
|
@@ -158,7 +158,7 @@ impl RbDecoder {
|
|
|
158
158
|
Ok(())
|
|
159
159
|
}
|
|
160
160
|
|
|
161
|
-
pub fn
|
|
161
|
+
pub fn word_piece_get_cleanup(&self) -> bool {
|
|
162
162
|
getter!(self, WordPiece, cleanup)
|
|
163
163
|
}
|
|
164
164
|
|
|
@@ -166,7 +166,7 @@ impl RbDecoder {
|
|
|
166
166
|
setter!(self, WordPiece, cleanup, cleanup);
|
|
167
167
|
}
|
|
168
168
|
|
|
169
|
-
pub fn
|
|
169
|
+
pub fn word_piece_get_prefix(&self) -> String {
|
|
170
170
|
getter!(self, WordPiece, prefix.clone())
|
|
171
171
|
}
|
|
172
172
|
|
|
@@ -371,7 +371,7 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
371
371
|
|
|
372
372
|
let class = module.define_class("BPEDecoder", decoder)?;
|
|
373
373
|
class.define_singleton_method("_new", function!(RbBPEDecoder::new, 1))?;
|
|
374
|
-
class.define_method("suffix", method!(RbDecoder::
|
|
374
|
+
class.define_method("suffix", method!(RbDecoder::bpe_get_suffix, 0))?;
|
|
375
375
|
class.define_method("suffix=", method!(RbDecoder::bpe_set_suffix, 1))?;
|
|
376
376
|
|
|
377
377
|
let class = module.define_class("ByteFallback", decoder)?;
|
|
@@ -382,13 +382,13 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
382
382
|
|
|
383
383
|
let class = module.define_class("CTC", decoder)?;
|
|
384
384
|
class.define_singleton_method("_new", function!(RbCTC::new, 3))?;
|
|
385
|
-
class.define_method("cleanup", method!(RbDecoder::
|
|
385
|
+
class.define_method("cleanup", method!(RbDecoder::ctc_get_cleanup, 0))?;
|
|
386
386
|
class.define_method("cleanup=", method!(RbDecoder::ctc_set_cleanup, 1))?;
|
|
387
|
-
class.define_method("pad_token", method!(RbDecoder::
|
|
387
|
+
class.define_method("pad_token", method!(RbDecoder::ctc_get_pad_token, 0))?;
|
|
388
388
|
class.define_method("pad_token=", method!(RbDecoder::ctc_set_pad_token, 1))?;
|
|
389
389
|
class.define_method(
|
|
390
390
|
"word_delimiter_token",
|
|
391
|
-
method!(RbDecoder::
|
|
391
|
+
method!(RbDecoder::ctc_get_word_delimiter_token, 0),
|
|
392
392
|
)?;
|
|
393
393
|
class.define_method(
|
|
394
394
|
"word_delimiter_token=",
|
|
@@ -402,18 +402,21 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
402
402
|
class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 3))?;
|
|
403
403
|
class.define_method(
|
|
404
404
|
"prepend_scheme",
|
|
405
|
-
method!(RbDecoder::
|
|
405
|
+
method!(RbDecoder::metaspace_get_prepend_scheme, 0),
|
|
406
406
|
)?;
|
|
407
407
|
class.define_method(
|
|
408
408
|
"prepend_scheme=",
|
|
409
409
|
method!(RbDecoder::metaspace_set_prepend_scheme, 1),
|
|
410
410
|
)?;
|
|
411
|
-
class.define_method(
|
|
411
|
+
class.define_method(
|
|
412
|
+
"replacement",
|
|
413
|
+
method!(RbDecoder::metaspace_get_replacement, 0),
|
|
414
|
+
)?;
|
|
412
415
|
class.define_method(
|
|
413
416
|
"replacement=",
|
|
414
417
|
method!(RbDecoder::metaspace_set_replacement, 1),
|
|
415
418
|
)?;
|
|
416
|
-
class.define_method("split", method!(RbDecoder::
|
|
419
|
+
class.define_method("split", method!(RbDecoder::metaspace_get_split, 0))?;
|
|
417
420
|
class.define_method("split=", method!(RbDecoder::metaspace_set_split, 1))?;
|
|
418
421
|
|
|
419
422
|
let class = module.define_class("Replace", decoder)?;
|
|
@@ -421,18 +424,18 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
|
421
424
|
|
|
422
425
|
let class = module.define_class("Strip", decoder)?;
|
|
423
426
|
class.define_singleton_method("_new", function!(RbStripDecoder::new, 3))?;
|
|
424
|
-
class.define_method("content", method!(RbDecoder::
|
|
427
|
+
class.define_method("content", method!(RbDecoder::strip_get_content, 0))?;
|
|
425
428
|
class.define_method("content=", method!(RbDecoder::strip_set_content, 1))?;
|
|
426
|
-
class.define_method("start", method!(RbDecoder::
|
|
429
|
+
class.define_method("start", method!(RbDecoder::strip_get_start, 0))?;
|
|
427
430
|
class.define_method("start=", method!(RbDecoder::strip_set_start, 1))?;
|
|
428
|
-
class.define_method("stop", method!(RbDecoder::
|
|
431
|
+
class.define_method("stop", method!(RbDecoder::strip_get_stop, 0))?;
|
|
429
432
|
class.define_method("stop=", method!(RbDecoder::strip_set_stop, 1))?;
|
|
430
433
|
|
|
431
434
|
let class = module.define_class("WordPiece", decoder)?;
|
|
432
435
|
class.define_singleton_method("_new", function!(RbWordPieceDecoder::new, 2))?;
|
|
433
|
-
class.define_method("cleanup", method!(RbDecoder::
|
|
436
|
+
class.define_method("cleanup", method!(RbDecoder::word_piece_get_cleanup, 0))?;
|
|
434
437
|
class.define_method("cleanup=", method!(RbDecoder::word_piece_set_cleanup, 1))?;
|
|
435
|
-
class.define_method("prefix", method!(RbDecoder::
|
|
438
|
+
class.define_method("prefix", method!(RbDecoder::word_piece_get_prefix, 0))?;
|
|
436
439
|
class.define_method("prefix=", method!(RbDecoder::word_piece_set_prefix, 1))?;
|
|
437
440
|
|
|
438
441
|
Ok(())
|
data/ext/tokenizers/src/encoding.rs
CHANGED
|
@@ -1,6 +1,8 @@
|
|
|
1
|
-
use magnus::{RArray, Ruby};
|
|
1
|
+
use magnus::{method, Module, RArray, RModule, Ruby};
|
|
2
2
|
use tk::{Encoding, Offsets};
|
|
3
3
|
|
|
4
|
+
use super::RbResult;
|
|
5
|
+
|
|
4
6
|
#[magnus::wrap(class = "Tokenizers::Encoding")]
|
|
5
7
|
#[repr(transparent)]
|
|
6
8
|
pub struct RbEncoding {
|
|
@@ -14,43 +16,43 @@ impl From<Encoding> for RbEncoding {
|
|
|
14
16
|
}
|
|
15
17
|
|
|
16
18
|
impl RbEncoding {
|
|
17
|
-
pub fn
|
|
19
|
+
pub fn get_n_sequences(&self) -> usize {
|
|
18
20
|
self.encoding.n_sequences()
|
|
19
21
|
}
|
|
20
22
|
|
|
21
|
-
pub fn
|
|
23
|
+
pub fn get_ids(&self) -> Vec<u32> {
|
|
22
24
|
self.encoding.get_ids().to_vec()
|
|
23
25
|
}
|
|
24
26
|
|
|
25
|
-
pub fn
|
|
27
|
+
pub fn get_tokens(&self) -> Vec<String> {
|
|
26
28
|
self.encoding.get_tokens().to_vec()
|
|
27
29
|
}
|
|
28
30
|
|
|
29
|
-
pub fn
|
|
31
|
+
pub fn get_word_ids(&self) -> Vec<Option<u32>> {
|
|
30
32
|
self.encoding.get_word_ids().to_vec()
|
|
31
33
|
}
|
|
32
34
|
|
|
33
|
-
pub fn
|
|
35
|
+
pub fn get_sequence_ids(&self) -> Vec<Option<usize>> {
|
|
34
36
|
self.encoding.get_sequence_ids()
|
|
35
37
|
}
|
|
36
38
|
|
|
37
|
-
pub fn
|
|
39
|
+
pub fn get_type_ids(&self) -> Vec<u32> {
|
|
38
40
|
self.encoding.get_type_ids().to_vec()
|
|
39
41
|
}
|
|
40
42
|
|
|
41
|
-
pub fn
|
|
43
|
+
pub fn get_offsets(&self) -> Vec<(usize, usize)> {
|
|
42
44
|
self.encoding.get_offsets().to_vec()
|
|
43
45
|
}
|
|
44
46
|
|
|
45
|
-
pub fn
|
|
47
|
+
pub fn get_special_tokens_mask(&self) -> Vec<u32> {
|
|
46
48
|
self.encoding.get_special_tokens_mask().to_vec()
|
|
47
49
|
}
|
|
48
50
|
|
|
49
|
-
pub fn
|
|
51
|
+
pub fn get_attention_mask(&self) -> Vec<u32> {
|
|
50
52
|
self.encoding.get_attention_mask().to_vec()
|
|
51
53
|
}
|
|
52
54
|
|
|
53
|
-
pub fn
|
|
55
|
+
pub fn get_overflowing(ruby: &Ruby, rb_self: &Self) -> RArray {
|
|
54
56
|
ruby.ary_from_iter(
|
|
55
57
|
rb_self
|
|
56
58
|
.encoding
|
|
@@ -91,3 +93,32 @@ impl RbEncoding {
|
|
|
91
93
|
self.encoding.char_to_word(char_pos, sequence_index)
|
|
92
94
|
}
|
|
93
95
|
}
|
|
96
|
+
|
|
97
|
+
pub fn init_encoding(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
|
98
|
+
let class = module.define_class("Encoding", ruby.class_object())?;
|
|
99
|
+
class.define_method("n_sequences", method!(RbEncoding::get_n_sequences, 0))?;
|
|
100
|
+
class.define_method("ids", method!(RbEncoding::get_ids, 0))?;
|
|
101
|
+
class.define_method("tokens", method!(RbEncoding::get_tokens, 0))?;
|
|
102
|
+
class.define_method("word_ids", method!(RbEncoding::get_word_ids, 0))?;
|
|
103
|
+
class.define_method("sequence_ids", method!(RbEncoding::get_sequence_ids, 0))?;
|
|
104
|
+
class.define_method("type_ids", method!(RbEncoding::get_type_ids, 0))?;
|
|
105
|
+
class.define_method("offsets", method!(RbEncoding::get_offsets, 0))?;
|
|
106
|
+
class.define_method(
|
|
107
|
+
"special_tokens_mask",
|
|
108
|
+
method!(RbEncoding::get_special_tokens_mask, 0),
|
|
109
|
+
)?;
|
|
110
|
+
class.define_method("attention_mask", method!(RbEncoding::get_attention_mask, 0))?;
|
|
111
|
+
class.define_method("overflowing", method!(RbEncoding::get_overflowing, 0))?;
|
|
112
|
+
class.define_method("_word_to_tokens", method!(RbEncoding::word_to_tokens, 2))?;
|
|
113
|
+
class.define_method("_word_to_chars", method!(RbEncoding::word_to_chars, 2))?;
|
|
114
|
+
class.define_method(
|
|
115
|
+
"token_to_sequence",
|
|
116
|
+
method!(RbEncoding::token_to_sequence, 1),
|
|
117
|
+
)?;
|
|
118
|
+
class.define_method("token_to_chars", method!(RbEncoding::token_to_chars, 1))?;
|
|
119
|
+
class.define_method("token_to_word", method!(RbEncoding::token_to_word, 1))?;
|
|
120
|
+
class.define_method("_char_to_token", method!(RbEncoding::char_to_token, 2))?;
|
|
121
|
+
class.define_method("_char_to_word", method!(RbEncoding::char_to_word, 2))?;
|
|
122
|
+
|
|
123
|
+
Ok(())
|
|
124
|
+
}
|
data/ext/tokenizers/src/error.rs
CHANGED
|
@@ -1,3 +1,5 @@
|
|
|
1
|
+
use std::borrow::Cow;
|
|
2
|
+
|
|
1
3
|
use magnus::{prelude::*, value::Lazy, Error, ExceptionClass, Ruby};
|
|
2
4
|
|
|
3
5
|
use super::TOKENIZERS;
|
|
@@ -7,17 +9,20 @@ pub struct RbError {}
|
|
|
7
9
|
impl RbError {
|
|
8
10
|
// convert to Error instead of Self
|
|
9
11
|
pub fn from(e: Box<dyn std::error::Error + Send + Sync>) -> Error {
|
|
10
|
-
Error::new(error(), e.to_string())
|
|
12
|
+
Error::new(error(&Ruby::get().unwrap()), e.to_string())
|
|
11
13
|
}
|
|
12
14
|
|
|
13
|
-
pub fn new_err(s:
|
|
14
|
-
|
|
15
|
+
pub fn new_err<T>(s: T) -> Error
|
|
16
|
+
where
|
|
17
|
+
T: Into<Cow<'static, str>>,
|
|
18
|
+
{
|
|
19
|
+
Error::new(error(&Ruby::get().unwrap()), s)
|
|
15
20
|
}
|
|
16
21
|
}
|
|
17
22
|
|
|
18
23
|
static ERROR: Lazy<ExceptionClass> =
|
|
19
24
|
Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Error").unwrap());
|
|
20
25
|
|
|
21
|
-
fn error() -> ExceptionClass {
|
|
22
|
-
|
|
26
|
+
fn error(ruby: &Ruby) -> ExceptionClass {
|
|
27
|
+
ruby.get_inner(&ERROR)
|
|
23
28
|
}
|
data/ext/tokenizers/src/lib.rs
CHANGED
|
@@ -9,16 +9,15 @@ mod models;
|
|
|
9
9
|
mod normalizers;
|
|
10
10
|
mod pre_tokenizers;
|
|
11
11
|
mod processors;
|
|
12
|
+
mod ruby;
|
|
12
13
|
mod tokenizer;
|
|
13
14
|
mod trainers;
|
|
14
15
|
mod utils;
|
|
15
16
|
|
|
16
|
-
use encoding::RbEncoding;
|
|
17
17
|
use error::RbError;
|
|
18
|
-
use tokenizer::{RbAddedToken, RbTokenizer};
|
|
19
18
|
use utils::RbRegex;
|
|
20
19
|
|
|
21
|
-
use magnus::{function,
|
|
20
|
+
use magnus::{function, prelude::*, value::Lazy, Error, RModule, Ruby};
|
|
22
21
|
|
|
23
22
|
type RbResult<T> = Result<T, Error>;
|
|
24
23
|
|
|
@@ -53,97 +52,9 @@ static TRAINERS: Lazy<RModule> =
|
|
|
53
52
|
fn init(ruby: &Ruby) -> RbResult<()> {
|
|
54
53
|
let module = ruby.define_module("Tokenizers")?;
|
|
55
54
|
|
|
56
|
-
let class = module.define_class("Tokenizer", ruby.class_object())?;
|
|
57
|
-
class.define_singleton_method("new", function!(RbTokenizer::from_model, 1))?;
|
|
58
|
-
class.define_singleton_method("from_str", function!(RbTokenizer::from_str, 1))?;
|
|
59
|
-
class.define_singleton_method("from_file", function!(RbTokenizer::from_file, 1))?;
|
|
60
|
-
class.define_method(
|
|
61
|
-
"add_special_tokens",
|
|
62
|
-
method!(RbTokenizer::add_special_tokens, 1),
|
|
63
|
-
)?;
|
|
64
|
-
class.define_method("train", method!(RbTokenizer::train, 2))?;
|
|
65
|
-
class.define_method("_save", method!(RbTokenizer::save, 2))?;
|
|
66
|
-
class.define_method("add_tokens", method!(RbTokenizer::add_tokens, 1))?;
|
|
67
|
-
class.define_method("_encode", method!(RbTokenizer::encode, 4))?;
|
|
68
|
-
class.define_method("_encode_batch", method!(RbTokenizer::encode_batch, 3))?;
|
|
69
|
-
class.define_method("_decode", method!(RbTokenizer::decode, 2))?;
|
|
70
|
-
class.define_method("_decode_batch", method!(RbTokenizer::decode_batch, 2))?;
|
|
71
|
-
class.define_method("model", method!(RbTokenizer::get_model, 0))?;
|
|
72
|
-
class.define_method("model=", method!(RbTokenizer::set_model, 1))?;
|
|
73
|
-
class.define_method("decoder", method!(RbTokenizer::get_decoder, 0))?;
|
|
74
|
-
class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
|
|
75
|
-
class.define_method("pre_tokenizer", method!(RbTokenizer::get_pre_tokenizer, 0))?;
|
|
76
|
-
class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
|
|
77
|
-
class.define_method(
|
|
78
|
-
"post_processor",
|
|
79
|
-
method!(RbTokenizer::get_post_processor, 0),
|
|
80
|
-
)?;
|
|
81
|
-
class.define_method(
|
|
82
|
-
"post_processor=",
|
|
83
|
-
method!(RbTokenizer::set_post_processor, 1),
|
|
84
|
-
)?;
|
|
85
|
-
class.define_method("normalizer", method!(RbTokenizer::get_normalizer, 0))?;
|
|
86
|
-
class.define_method("normalizer=", method!(RbTokenizer::set_normalizer, 1))?;
|
|
87
|
-
class.define_method("token_to_id", method!(RbTokenizer::token_to_id, 1))?;
|
|
88
|
-
class.define_method("id_to_token", method!(RbTokenizer::id_to_token, 1))?;
|
|
89
|
-
class.define_method("_enable_padding", method!(RbTokenizer::enable_padding, 1))?;
|
|
90
|
-
class.define_method("padding", method!(RbTokenizer::padding, 0))?;
|
|
91
|
-
class.define_method("no_padding", method!(RbTokenizer::no_padding, 0))?;
|
|
92
|
-
class.define_method(
|
|
93
|
-
"_enable_truncation",
|
|
94
|
-
method!(RbTokenizer::enable_truncation, 2),
|
|
95
|
-
)?;
|
|
96
|
-
class.define_method("truncation", method!(RbTokenizer::truncation, 0))?;
|
|
97
|
-
class.define_method("no_truncation", method!(RbTokenizer::no_truncation, 0))?;
|
|
98
|
-
class.define_method(
|
|
99
|
-
"num_special_tokens_to_add",
|
|
100
|
-
method!(RbTokenizer::num_special_tokens_to_add, 1),
|
|
101
|
-
)?;
|
|
102
|
-
class.define_method("_vocab", method!(RbTokenizer::vocab, 1))?;
|
|
103
|
-
class.define_method("_vocab_size", method!(RbTokenizer::vocab_size, 1))?;
|
|
104
|
-
class.define_method(
|
|
105
|
-
"added_tokens_decoder",
|
|
106
|
-
method!(RbTokenizer::get_added_tokens_decoder, 0),
|
|
107
|
-
)?;
|
|
108
|
-
class.define_method("_to_s", method!(RbTokenizer::to_str, 1))?;
|
|
109
|
-
|
|
110
|
-
let class = module.define_class("Encoding", ruby.class_object())?;
|
|
111
|
-
class.define_method("n_sequences", method!(RbEncoding::n_sequences, 0))?;
|
|
112
|
-
class.define_method("ids", method!(RbEncoding::ids, 0))?;
|
|
113
|
-
class.define_method("tokens", method!(RbEncoding::tokens, 0))?;
|
|
114
|
-
class.define_method("word_ids", method!(RbEncoding::word_ids, 0))?;
|
|
115
|
-
class.define_method("sequence_ids", method!(RbEncoding::sequence_ids, 0))?;
|
|
116
|
-
class.define_method("type_ids", method!(RbEncoding::type_ids, 0))?;
|
|
117
|
-
class.define_method("offsets", method!(RbEncoding::offsets, 0))?;
|
|
118
|
-
class.define_method(
|
|
119
|
-
"special_tokens_mask",
|
|
120
|
-
method!(RbEncoding::special_tokens_mask, 0),
|
|
121
|
-
)?;
|
|
122
|
-
class.define_method("attention_mask", method!(RbEncoding::attention_mask, 0))?;
|
|
123
|
-
class.define_method("overflowing", method!(RbEncoding::overflowing, 0))?;
|
|
124
|
-
class.define_method("_word_to_tokens", method!(RbEncoding::word_to_tokens, 2))?;
|
|
125
|
-
class.define_method("_word_to_chars", method!(RbEncoding::word_to_chars, 2))?;
|
|
126
|
-
class.define_method(
|
|
127
|
-
"token_to_sequence",
|
|
128
|
-
method!(RbEncoding::token_to_sequence, 1),
|
|
129
|
-
)?;
|
|
130
|
-
class.define_method("token_to_chars", method!(RbEncoding::token_to_chars, 1))?;
|
|
131
|
-
class.define_method("token_to_word", method!(RbEncoding::token_to_word, 1))?;
|
|
132
|
-
class.define_method("_char_to_token", method!(RbEncoding::char_to_token, 2))?;
|
|
133
|
-
class.define_method("_char_to_word", method!(RbEncoding::char_to_word, 2))?;
|
|
134
|
-
|
|
135
55
|
let class = module.define_class("Regex", ruby.class_object())?;
|
|
136
56
|
class.define_singleton_method("new", function!(RbRegex::new, 1))?;
|
|
137
57
|
|
|
138
|
-
let class = module.define_class("AddedToken", ruby.class_object())?;
|
|
139
|
-
class.define_singleton_method("_new", function!(RbAddedToken::new, 2))?;
|
|
140
|
-
class.define_method("content", method!(RbAddedToken::get_content, 0))?;
|
|
141
|
-
class.define_method("rstrip", method!(RbAddedToken::get_rstrip, 0))?;
|
|
142
|
-
class.define_method("lstrip", method!(RbAddedToken::get_lstrip, 0))?;
|
|
143
|
-
class.define_method("single_word", method!(RbAddedToken::get_single_word, 0))?;
|
|
144
|
-
class.define_method("normalized", method!(RbAddedToken::get_normalized, 0))?;
|
|
145
|
-
class.define_method("special", method!(RbAddedToken::get_special, 0))?;
|
|
146
|
-
|
|
147
58
|
let models = module.define_module("Models")?;
|
|
148
59
|
let pre_tokenizers = module.define_module("PreTokenizers")?;
|
|
149
60
|
let decoders = module.define_module("Decoders")?;
|
|
@@ -151,6 +62,8 @@ fn init(ruby: &Ruby) -> RbResult<()> {
|
|
|
151
62
|
let normalizers = module.define_module("Normalizers")?;
|
|
152
63
|
let trainers = module.define_module("Trainers")?;
|
|
153
64
|
|
|
65
|
+
tokenizer::init_tokenizer(ruby, &module)?;
|
|
66
|
+
encoding::init_encoding(ruby, &module)?;
|
|
154
67
|
models::init_models(ruby, &models)?;
|
|
155
68
|
pre_tokenizers::init_pre_tokenizers(ruby, &pre_tokenizers)?;
|
|
156
69
|
decoders::init_decoders(ruby, &decoders)?;
|