tokenizers 0.5.1 → 0.5.2
This diff shows the changes between two publicly released versions of the package, as published to a supported registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/Cargo.lock +22 -19
- data/ext/tokenizers/Cargo.toml +1 -1
- data/ext/tokenizers/src/decoders.rs +7 -0
- data/ext/tokenizers/src/lib.rs +7 -0
- data/ext/tokenizers/src/tokenizer.rs +34 -1
- data/lib/tokenizers/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 556d084ad69603fa0d5ff61c4a03864d56fbe4d525d706390851d1a5761d173a
+  data.tar.gz: 4dd1ab9a2de88f60135aca333c7d48557fa1ad6c8e31c61065717e77a37f0cdd
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 3d6b7209189aeec8846a50f0e65a24e1089e7e0998d1f3d07026446c18b2b0c139d5b7566ca2fc721f72b67519c50595144d6deb3603d032fb41d20c7bc8c6e7
+  data.tar.gz: a7f04aa13c7cbc3c3973408140fc9f4c3330dacd150d4edd33f16e6c218e03a949c9d44ff03b7b4b6e63eedd2623b2abf417a647a8f2260aa67d64594f06c6fc
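For anyone mirroring this release, the new SHA-256 values above can be checked locally. A minimal Ruby sketch, assuming the gem's data.tar.gz is in the current directory (the path is a placeholder):

require "digest"

# Placeholder path to the downloaded archive member; adjust to your setup.
path = "data.tar.gz"

# Expected SHA-256 for data.tar.gz, taken from the checksums.yaml diff above.
expected = "4dd1ab9a2de88f60135aca333c7d48557fa1ad6c8e31c61065717e77a37f0cdd"

actual = Digest::SHA256.file(path).hexdigest
puts(actual == expected ? "checksum OK" : "checksum mismatch")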
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,10 @@
+## 0.5.2 (2024-08-26)
+
+- Added `from_str` method to `Tokenizer`
+- Added `model` and `model=` methods to `Tokenizer`
+- Added `decoder`, `pre_tokenizer`, `post_processor`, and `normalizer` methods to `Tokenizer`
+- Added `decode` method to `Decoder`
+
 ## 0.5.1 (2024-08-13)
 
 - Updated Tokenizers to 0.20.0
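The headline change is that each pipeline component gains a reader to match its existing writer. A minimal usage sketch, assuming a local tokenizer.json (the path is a placeholder and `Tokenizers.from_file` is the gem's existing loader):

require "tokenizers"

tokenizer = Tokenizers.from_file("tokenizer.json")

# New in 0.5.2: components can be read back, not just assigned.
tokenizer.model           # the wrapped model
tokenizer.decoder         # decoder, or nil if none is set
tokenizer.pre_tokenizer   # pre-tokenizer, or nil
tokenizer.post_processor  # post-processor, or nil
tokenizer.normalizer      # normalizer, or nil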
data/Cargo.lock
CHANGED
@@ -57,9 +57,12 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
 
 [[package]]
 name = "cc"
-version = "1.1.
+version = "1.1.15"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "57b6a275aa2903740dc87da01c62040406b8812552e97129a63ea8850a17c6e6"
+dependencies = [
+ "shlex",
+]
 
 [[package]]
 name = "cexpr"
@@ -301,9 +304,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
 
 [[package]]
 name = "libc"
-version = "0.2.
+version = "0.2.158"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439"
 
 [[package]]
 name = "libloading"
@@ -475,9 +478,9 @@ dependencies = [
 
 [[package]]
 name = "quote"
-version = "1.0.
+version = "1.0.37"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
 dependencies = [
  "proc-macro2",
 ]
@@ -545,18 +548,18 @@ dependencies = [
 
 [[package]]
 name = "rb-sys"
-version = "0.9.
+version = "0.9.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "df4dec4b1d304c3b308a2cd86b1216ea45dd4361f4e9fa056f108332d0a450c1"
 dependencies = [
  "rb-sys-build",
 ]
 
 [[package]]
 name = "rb-sys-build"
-version = "0.9.
+version = "0.9.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "1d71de3e29d174b8fb17b5d4470f27d7aa2605f8a9d05fda0d3aeff30e05a570"
 dependencies = [
  "bindgen",
  "lazy_static",
@@ -622,18 +625,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
 
 [[package]]
 name = "serde"
-version = "1.0.
+version = "1.0.209"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09"
 dependencies = [
  "serde_derive",
 ]
 
 [[package]]
 name = "serde_derive"
-version = "1.0.
+version = "1.0.209"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -642,9 +645,9 @@ dependencies = [
 
 [[package]]
 name = "serde_json"
-version = "1.0.
+version = "1.0.127"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad"
 dependencies = [
  "itoa",
  "memchr",
@@ -690,9 +693,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
 
 [[package]]
 name = "syn"
-version = "2.0.
+version = "2.0.76"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525"
 dependencies = [
  "proc-macro2",
  "quote",
@@ -721,7 +724,7 @@ dependencies = [
 
 [[package]]
 name = "tokenizers"
-version = "0.5.
+version = "0.5.2"
 dependencies = [
  "magnus",
  "onig",
data/ext/tokenizers/src/decoders.rs
CHANGED
@@ -34,6 +34,12 @@ impl Decoder for RbDecoder {
     }
 }
 
+impl RbDecoder {
+    pub fn decode(&self, tokens: Vec<String>) -> RbResult<String> {
+        self.decoder.decode(tokens).map_err(RbError::from)
+    }
+}
+
 macro_rules! getter {
     ($self: ident, $variant: ident, $($name: tt)+) => {{
         let decoder = &$self.decoder;
@@ -358,6 +364,7 @@ unsafe impl TypedData for RbDecoder {
 
 pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
     let decoder = module.define_class("Decoder", ruby.class_object())?;
+    decoder.define_method("decode", method!(RbDecoder::decode, 1))?;
 
     let class = module.define_class("BPEDecoder", decoder)?;
     class.define_singleton_method("_new", function!(RbBPEDecoder::new, 1))?;
data/ext/tokenizers/src/lib.rs
CHANGED
@@ -42,6 +42,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
 
     let class = module.define_class("Tokenizer", ruby.class_object())?;
     class.define_singleton_method("new", function!(RbTokenizer::from_model, 1))?;
+    class.define_singleton_method("from_str", function!(RbTokenizer::from_str, 1))?;
     class.define_singleton_method("from_file", function!(RbTokenizer::from_file, 1))?;
     class.define_method(
         "add_special_tokens",
@@ -54,12 +55,18 @@ fn init(ruby: &Ruby) -> RbResult<()> {
     class.define_method("_encode_batch", method!(RbTokenizer::encode_batch, 3))?;
     class.define_method("_decode", method!(RbTokenizer::decode, 2))?;
     class.define_method("_decode_batch", method!(RbTokenizer::decode_batch, 2))?;
+    class.define_method("model", method!(RbTokenizer::get_model, 0))?;
+    class.define_method("model=", method!(RbTokenizer::set_model, 1))?;
+    class.define_method("decoder", method!(RbTokenizer::get_decoder, 0))?;
     class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
+    class.define_method("pre_tokenizer", method!(RbTokenizer::get_pre_tokenizer, 0))?;
     class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
+    class.define_method("post_processor", method!(RbTokenizer::get_post_processor, 0))?;
     class.define_method(
         "post_processor=",
         method!(RbTokenizer::set_post_processor, 1),
     )?;
+    class.define_method("normalizer", method!(RbTokenizer::get_normalizer, 0))?;
     class.define_method("normalizer=", method!(RbTokenizer::set_normalizer, 1))?;
     class.define_method("token_to_id", method!(RbTokenizer::token_to_id, 1))?;
     class.define_method("id_to_token", method!(RbTokenizer::id_to_token, 1))?;
data/ext/tokenizers/src/tokenizer.rs
CHANGED
@@ -1,9 +1,10 @@
 use std::cell::RefCell;
 use std::collections::HashMap;
 use std::path::PathBuf;
+use std::str::FromStr;
 
 use magnus::prelude::*;
-use magnus::{exception, Error, RArray, RHash, Symbol, TryConvert, Value};
+use magnus::{exception, Error, RArray, RHash, RString, Symbol, TryConvert, Value};
 use tk::tokenizer::{
     Model, PaddingDirection, PaddingParams, PaddingStrategy,
     TruncationDirection, TruncationParams, TruncationStrategy, TokenizerImpl
@@ -203,6 +204,14 @@ impl RbTokenizer {
         RbTokenizer::new(TokenizerImpl::new(model.clone()))
     }
 
+    pub fn from_str(json: RString) -> RbResult<Self> {
+        Tokenizer::from_str(unsafe { json.as_str()? })
+            .map(|v| RbTokenizer {
+                tokenizer: RefCell::new(v),
+            })
+            .map_err(RbError::from)
+    }
+
     pub fn from_file(path: PathBuf) -> RbResult<Self> {
         Tokenizer::from_file(path)
             .map(|v| RbTokenizer {
@@ -319,22 +328,46 @@ impl RbTokenizer {
             .map_err(RbError::from)
     }
 
+    pub fn get_model(&self) -> RbModel {
+        self.tokenizer.borrow().get_model().clone()
+    }
+
+    pub fn set_model(&self, model: &RbModel) {
+        self.tokenizer.borrow_mut().with_model(model.clone());
+    }
+
+    pub fn get_decoder(&self) -> Option<RbDecoder> {
+        self.tokenizer.borrow().get_decoder().cloned()
+    }
+
     pub fn set_decoder(&self, decoder: Option<&RbDecoder>) {
         self.tokenizer.borrow_mut().with_decoder(decoder.cloned());
     }
 
+    pub fn get_pre_tokenizer(&self) -> Option<RbPreTokenizer> {
+        self.tokenizer.borrow().get_pre_tokenizer().cloned()
+    }
+
     pub fn set_pre_tokenizer(&self, pretok: Option<&RbPreTokenizer>) {
         self.tokenizer
             .borrow_mut()
             .with_pre_tokenizer(pretok.cloned());
     }
 
+    pub fn get_post_processor(&self) -> Option<RbPostProcessor> {
+        self.tokenizer.borrow().get_post_processor().cloned()
+    }
+
     pub fn set_post_processor(&self, processor: Option<&RbPostProcessor>) {
         self.tokenizer
             .borrow_mut()
             .with_post_processor(processor.cloned());
     }
 
+    pub fn get_normalizer(&self) -> Option<RbNormalizer> {
+        self.tokenizer.borrow().get_normalizer().cloned()
+    }
+
     pub fn set_normalizer(&self, normalizer: Option<&RbNormalizer>) {
         self.tokenizer
             .borrow_mut()
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.5.
+  version: 0.5.2
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-08-
+date: 2024-08-26 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys