tokenizers 0.5.1 → 0.5.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/Cargo.lock +22 -19
- data/ext/tokenizers/Cargo.toml +1 -1
- data/ext/tokenizers/src/decoders.rs +7 -0
- data/ext/tokenizers/src/lib.rs +7 -0
- data/ext/tokenizers/src/tokenizer.rs +34 -1
- data/lib/tokenizers/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 556d084ad69603fa0d5ff61c4a03864d56fbe4d525d706390851d1a5761d173a
|
4
|
+
data.tar.gz: 4dd1ab9a2de88f60135aca333c7d48557fa1ad6c8e31c61065717e77a37f0cdd
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 3d6b7209189aeec8846a50f0e65a24e1089e7e0998d1f3d07026446c18b2b0c139d5b7566ca2fc721f72b67519c50595144d6deb3603d032fb41d20c7bc8c6e7
|
7
|
+
data.tar.gz: a7f04aa13c7cbc3c3973408140fc9f4c3330dacd150d4edd33f16e6c218e03a949c9d44ff03b7b4b6e63eedd2623b2abf417a647a8f2260aa67d64594f06c6fc
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
## 0.5.2 (2024-08-26)
|
2
|
+
|
3
|
+
- Added `from_str` method to `Tokenizer`
|
4
|
+
- Added `model` and `model=` methods to `Tokenizer`
|
5
|
+
- Added `decoder`, `pre_tokenizer`, `post_processor`, and `normalizer` methods to `Tokenizer`
|
6
|
+
- Added `decode` method to `Decoder`
|
7
|
+
|
1
8
|
## 0.5.1 (2024-08-13)
|
2
9
|
|
3
10
|
- Updated Tokenizers to 0.20.0
|
data/Cargo.lock
CHANGED
@@ -57,9 +57,12 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
|
|
57
57
|
|
58
58
|
[[package]]
|
59
59
|
name = "cc"
|
60
|
-
version = "1.1.
|
60
|
+
version = "1.1.15"
|
61
61
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
62
|
-
checksum = "
|
62
|
+
checksum = "57b6a275aa2903740dc87da01c62040406b8812552e97129a63ea8850a17c6e6"
|
63
|
+
dependencies = [
|
64
|
+
"shlex",
|
65
|
+
]
|
63
66
|
|
64
67
|
[[package]]
|
65
68
|
name = "cexpr"
|
@@ -301,9 +304,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
|
301
304
|
|
302
305
|
[[package]]
|
303
306
|
name = "libc"
|
304
|
-
version = "0.2.
|
307
|
+
version = "0.2.158"
|
305
308
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
306
|
-
checksum = "
|
309
|
+
checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439"
|
307
310
|
|
308
311
|
[[package]]
|
309
312
|
name = "libloading"
|
@@ -475,9 +478,9 @@ dependencies = [
|
|
475
478
|
|
476
479
|
[[package]]
|
477
480
|
name = "quote"
|
478
|
-
version = "1.0.
|
481
|
+
version = "1.0.37"
|
479
482
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
480
|
-
checksum = "
|
483
|
+
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
|
481
484
|
dependencies = [
|
482
485
|
"proc-macro2",
|
483
486
|
]
|
@@ -545,18 +548,18 @@ dependencies = [
|
|
545
548
|
|
546
549
|
[[package]]
|
547
550
|
name = "rb-sys"
|
548
|
-
version = "0.9.
|
551
|
+
version = "0.9.102"
|
549
552
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
550
|
-
checksum = "
|
553
|
+
checksum = "df4dec4b1d304c3b308a2cd86b1216ea45dd4361f4e9fa056f108332d0a450c1"
|
551
554
|
dependencies = [
|
552
555
|
"rb-sys-build",
|
553
556
|
]
|
554
557
|
|
555
558
|
[[package]]
|
556
559
|
name = "rb-sys-build"
|
557
|
-
version = "0.9.
|
560
|
+
version = "0.9.102"
|
558
561
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
559
|
-
checksum = "
|
562
|
+
checksum = "1d71de3e29d174b8fb17b5d4470f27d7aa2605f8a9d05fda0d3aeff30e05a570"
|
560
563
|
dependencies = [
|
561
564
|
"bindgen",
|
562
565
|
"lazy_static",
|
@@ -622,18 +625,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
|
|
622
625
|
|
623
626
|
[[package]]
|
624
627
|
name = "serde"
|
625
|
-
version = "1.0.
|
628
|
+
version = "1.0.209"
|
626
629
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
627
|
-
checksum = "
|
630
|
+
checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09"
|
628
631
|
dependencies = [
|
629
632
|
"serde_derive",
|
630
633
|
]
|
631
634
|
|
632
635
|
[[package]]
|
633
636
|
name = "serde_derive"
|
634
|
-
version = "1.0.
|
637
|
+
version = "1.0.209"
|
635
638
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
636
|
-
checksum = "
|
639
|
+
checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170"
|
637
640
|
dependencies = [
|
638
641
|
"proc-macro2",
|
639
642
|
"quote",
|
@@ -642,9 +645,9 @@ dependencies = [
|
|
642
645
|
|
643
646
|
[[package]]
|
644
647
|
name = "serde_json"
|
645
|
-
version = "1.0.
|
648
|
+
version = "1.0.127"
|
646
649
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
647
|
-
checksum = "
|
650
|
+
checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad"
|
648
651
|
dependencies = [
|
649
652
|
"itoa",
|
650
653
|
"memchr",
|
@@ -690,9 +693,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
|
690
693
|
|
691
694
|
[[package]]
|
692
695
|
name = "syn"
|
693
|
-
version = "2.0.
|
696
|
+
version = "2.0.76"
|
694
697
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
695
|
-
checksum = "
|
698
|
+
checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525"
|
696
699
|
dependencies = [
|
697
700
|
"proc-macro2",
|
698
701
|
"quote",
|
@@ -721,7 +724,7 @@ dependencies = [
|
|
721
724
|
|
722
725
|
[[package]]
|
723
726
|
name = "tokenizers"
|
724
|
-
version = "0.5.1"
|
727
|
+
version = "0.5.2"
|
725
728
|
dependencies = [
|
726
729
|
"magnus",
|
727
730
|
"onig",
|
data/ext/tokenizers/Cargo.toml
CHANGED
data/ext/tokenizers/src/decoders.rs
CHANGED
@@ -34,6 +34,12 @@ impl Decoder for RbDecoder {
|
|
34
34
|
}
|
35
35
|
}
|
36
36
|
|
37
|
+
impl RbDecoder {
|
38
|
+
pub fn decode(&self, tokens: Vec<String>) -> RbResult<String> {
|
39
|
+
self.decoder.decode(tokens).map_err(RbError::from)
|
40
|
+
}
|
41
|
+
}
|
42
|
+
|
37
43
|
macro_rules! getter {
|
38
44
|
($self: ident, $variant: ident, $($name: tt)+) => {{
|
39
45
|
let decoder = &$self.decoder;
|
@@ -358,6 +364,7 @@ unsafe impl TypedData for RbDecoder {
|
|
358
364
|
|
359
365
|
pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
|
360
366
|
let decoder = module.define_class("Decoder", ruby.class_object())?;
|
367
|
+
decoder.define_method("decode", method!(RbDecoder::decode, 1))?;
|
361
368
|
|
362
369
|
let class = module.define_class("BPEDecoder", decoder)?;
|
363
370
|
class.define_singleton_method("_new", function!(RbBPEDecoder::new, 1))?;
|
data/ext/tokenizers/src/lib.rs
CHANGED
@@ -42,6 +42,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
|
|
42
42
|
|
43
43
|
let class = module.define_class("Tokenizer", ruby.class_object())?;
|
44
44
|
class.define_singleton_method("new", function!(RbTokenizer::from_model, 1))?;
|
45
|
+
class.define_singleton_method("from_str", function!(RbTokenizer::from_str, 1))?;
|
45
46
|
class.define_singleton_method("from_file", function!(RbTokenizer::from_file, 1))?;
|
46
47
|
class.define_method(
|
47
48
|
"add_special_tokens",
|
@@ -54,12 +55,18 @@ fn init(ruby: &Ruby) -> RbResult<()> {
|
|
54
55
|
class.define_method("_encode_batch", method!(RbTokenizer::encode_batch, 3))?;
|
55
56
|
class.define_method("_decode", method!(RbTokenizer::decode, 2))?;
|
56
57
|
class.define_method("_decode_batch", method!(RbTokenizer::decode_batch, 2))?;
|
58
|
+
class.define_method("model", method!(RbTokenizer::get_model, 0))?;
|
59
|
+
class.define_method("model=", method!(RbTokenizer::set_model,1))?;
|
60
|
+
class.define_method("decoder", method!(RbTokenizer::get_decoder, 0))?;
|
57
61
|
class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
|
62
|
+
class.define_method("pre_tokenizer", method!(RbTokenizer::get_pre_tokenizer, 0))?;
|
58
63
|
class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
|
64
|
+
class.define_method("post_processor", method!(RbTokenizer::get_post_processor, 0))?;
|
59
65
|
class.define_method(
|
60
66
|
"post_processor=",
|
61
67
|
method!(RbTokenizer::set_post_processor, 1),
|
62
68
|
)?;
|
69
|
+
class.define_method("normalizer", method!(RbTokenizer::get_normalizer, 0))?;
|
63
70
|
class.define_method("normalizer=", method!(RbTokenizer::set_normalizer, 1))?;
|
64
71
|
class.define_method("token_to_id", method!(RbTokenizer::token_to_id, 1))?;
|
65
72
|
class.define_method("id_to_token", method!(RbTokenizer::id_to_token, 1))?;
|
data/ext/tokenizers/src/tokenizer.rs
CHANGED
@@ -1,9 +1,10 @@
|
|
1
1
|
use std::cell::RefCell;
|
2
2
|
use std::collections::HashMap;
|
3
3
|
use std::path::PathBuf;
|
4
|
+
use std::str::FromStr;
|
4
5
|
|
5
6
|
use magnus::prelude::*;
|
6
|
-
use magnus::{exception, Error, RArray, RHash, Symbol, TryConvert, Value};
|
7
|
+
use magnus::{exception, Error, RArray, RHash, RString, Symbol, TryConvert, Value};
|
7
8
|
use tk::tokenizer::{
|
8
9
|
Model, PaddingDirection, PaddingParams, PaddingStrategy,
|
9
10
|
TruncationDirection, TruncationParams, TruncationStrategy, TokenizerImpl
|
@@ -203,6 +204,14 @@ impl RbTokenizer {
|
|
203
204
|
RbTokenizer::new(TokenizerImpl::new(model.clone()))
|
204
205
|
}
|
205
206
|
|
207
|
+
pub fn from_str(json: RString) -> RbResult<Self> {
|
208
|
+
Tokenizer::from_str(unsafe { json.as_str()? })
|
209
|
+
.map(|v| RbTokenizer {
|
210
|
+
tokenizer: RefCell::new(v),
|
211
|
+
})
|
212
|
+
.map_err(RbError::from)
|
213
|
+
}
|
214
|
+
|
206
215
|
pub fn from_file(path: PathBuf) -> RbResult<Self> {
|
207
216
|
Tokenizer::from_file(path)
|
208
217
|
.map(|v| RbTokenizer {
|
@@ -319,22 +328,46 @@ impl RbTokenizer {
|
|
319
328
|
.map_err(RbError::from)
|
320
329
|
}
|
321
330
|
|
331
|
+
pub fn get_model(&self) -> RbModel {
|
332
|
+
self.tokenizer.borrow().get_model().clone()
|
333
|
+
}
|
334
|
+
|
335
|
+
pub fn set_model(&self, model: &RbModel) {
|
336
|
+
self.tokenizer.borrow_mut().with_model(model.clone());
|
337
|
+
}
|
338
|
+
|
339
|
+
pub fn get_decoder(&self) -> Option<RbDecoder> {
|
340
|
+
self.tokenizer.borrow().get_decoder().cloned()
|
341
|
+
}
|
342
|
+
|
322
343
|
pub fn set_decoder(&self, decoder: Option<&RbDecoder>) {
|
323
344
|
self.tokenizer.borrow_mut().with_decoder(decoder.cloned());
|
324
345
|
}
|
325
346
|
|
347
|
+
pub fn get_pre_tokenizer(&self) -> Option<RbPreTokenizer> {
|
348
|
+
self.tokenizer.borrow().get_pre_tokenizer().cloned()
|
349
|
+
}
|
350
|
+
|
326
351
|
pub fn set_pre_tokenizer(&self, pretok: Option<&RbPreTokenizer>) {
|
327
352
|
self.tokenizer
|
328
353
|
.borrow_mut()
|
329
354
|
.with_pre_tokenizer(pretok.cloned());
|
330
355
|
}
|
331
356
|
|
357
|
+
pub fn get_post_processor(&self) -> Option<RbPostProcessor> {
|
358
|
+
self.tokenizer.borrow().get_post_processor().cloned()
|
359
|
+
}
|
360
|
+
|
332
361
|
pub fn set_post_processor(&self, processor: Option<&RbPostProcessor>) {
|
333
362
|
self.tokenizer
|
334
363
|
.borrow_mut()
|
335
364
|
.with_post_processor(processor.cloned());
|
336
365
|
}
|
337
366
|
|
367
|
+
pub fn get_normalizer(&self) -> Option<RbNormalizer> {
|
368
|
+
self.tokenizer.borrow().get_normalizer().cloned()
|
369
|
+
}
|
370
|
+
|
338
371
|
pub fn set_normalizer(&self, normalizer: Option<&RbNormalizer>) {
|
339
372
|
self.tokenizer
|
340
373
|
.borrow_mut()
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.1
|
4
|
+
version: 0.5.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08-13 00:00:00.000000000 Z
|
11
|
+
date: 2024-08-26 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|