tokenizers 0.5.1 → 0.5.2

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4846b5d3dc0fe8f5828ddffe46908b1f3812ebf6a03a939ca0395ad7748533bb
4
- data.tar.gz: 259795bfa6b13a36f62ab2ffb65e9feabd460e01efe9f59e7d6017c6dcd9b9b0
3
+ metadata.gz: 556d084ad69603fa0d5ff61c4a03864d56fbe4d525d706390851d1a5761d173a
4
+ data.tar.gz: 4dd1ab9a2de88f60135aca333c7d48557fa1ad6c8e31c61065717e77a37f0cdd
5
5
  SHA512:
6
- metadata.gz: 90f55feb8ceec81815bb61b7773e6f67924eb6d47869e5da67d579e2ac9df6a48fddee5e97f5a028e3fa1e39941f3bfe6ec6c04cf49fd1b87f18bade54911231
7
- data.tar.gz: 9c1895b43222494b393f3fbddaa6e78216e025f3d3dddebf0c0311d2d897b282a16b8b0044aacb2466790b0a93c8d01099b25d662750b96d5798d0a4a927267b
6
+ metadata.gz: 3d6b7209189aeec8846a50f0e65a24e1089e7e0998d1f3d07026446c18b2b0c139d5b7566ca2fc721f72b67519c50595144d6deb3603d032fb41d20c7bc8c6e7
7
+ data.tar.gz: a7f04aa13c7cbc3c3973408140fc9f4c3330dacd150d4edd33f16e6c218e03a949c9d44ff03b7b4b6e63eedd2623b2abf417a647a8f2260aa67d64594f06c6fc
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
1
+ ## 0.5.2 (2024-08-26)
2
+
3
+ - Added `from_str` method to `Tokenizer`
4
+ - Added `model` and `model=` methods to `Tokenizer`
5
+ - Added `decoder`, `pre_tokenizer`, `post_processor`, and `normalizer` methods to `Tokenizer`
6
+ - Added `decode` method to `Decoder`
7
+
1
8
  ## 0.5.1 (2024-08-13)
2
9
 
3
10
  - Updated Tokenizers to 0.20.0
data/Cargo.lock CHANGED
@@ -57,9 +57,12 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
57
57
 
58
58
  [[package]]
59
59
  name = "cc"
60
- version = "1.1.8"
60
+ version = "1.1.15"
61
61
  source = "registry+https://github.com/rust-lang/crates.io-index"
62
- checksum = "504bdec147f2cc13c8b57ed9401fd8a147cc66b67ad5cb241394244f2c947549"
62
+ checksum = "57b6a275aa2903740dc87da01c62040406b8812552e97129a63ea8850a17c6e6"
63
+ dependencies = [
64
+ "shlex",
65
+ ]
63
66
 
64
67
  [[package]]
65
68
  name = "cexpr"
@@ -301,9 +304,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
301
304
 
302
305
  [[package]]
303
306
  name = "libc"
304
- version = "0.2.155"
307
+ version = "0.2.158"
305
308
  source = "registry+https://github.com/rust-lang/crates.io-index"
306
- checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
309
+ checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439"
307
310
 
308
311
  [[package]]
309
312
  name = "libloading"
@@ -475,9 +478,9 @@ dependencies = [
475
478
 
476
479
  [[package]]
477
480
  name = "quote"
478
- version = "1.0.36"
481
+ version = "1.0.37"
479
482
  source = "registry+https://github.com/rust-lang/crates.io-index"
480
- checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
483
+ checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
481
484
  dependencies = [
482
485
  "proc-macro2",
483
486
  ]
@@ -545,18 +548,18 @@ dependencies = [
545
548
 
546
549
  [[package]]
547
550
  name = "rb-sys"
548
- version = "0.9.100"
551
+ version = "0.9.102"
549
552
  source = "registry+https://github.com/rust-lang/crates.io-index"
550
- checksum = "87f2ba20be84b32fad6b0ce397764bcdd0f2dca4431cf7035f6a6721e5747565"
553
+ checksum = "df4dec4b1d304c3b308a2cd86b1216ea45dd4361f4e9fa056f108332d0a450c1"
551
554
  dependencies = [
552
555
  "rb-sys-build",
553
556
  ]
554
557
 
555
558
  [[package]]
556
559
  name = "rb-sys-build"
557
- version = "0.9.100"
560
+ version = "0.9.102"
558
561
  source = "registry+https://github.com/rust-lang/crates.io-index"
559
- checksum = "7ecae2bdcb118ee721d9a3929f89e8578237fade298dfcf8c928609aa88abc48"
562
+ checksum = "1d71de3e29d174b8fb17b5d4470f27d7aa2605f8a9d05fda0d3aeff30e05a570"
560
563
  dependencies = [
561
564
  "bindgen",
562
565
  "lazy_static",
@@ -622,18 +625,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
622
625
 
623
626
  [[package]]
624
627
  name = "serde"
625
- version = "1.0.205"
628
+ version = "1.0.209"
626
629
  source = "registry+https://github.com/rust-lang/crates.io-index"
627
- checksum = "e33aedb1a7135da52b7c21791455563facbbcc43d0f0f66165b42c21b3dfb150"
630
+ checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09"
628
631
  dependencies = [
629
632
  "serde_derive",
630
633
  ]
631
634
 
632
635
  [[package]]
633
636
  name = "serde_derive"
634
- version = "1.0.205"
637
+ version = "1.0.209"
635
638
  source = "registry+https://github.com/rust-lang/crates.io-index"
636
- checksum = "692d6f5ac90220161d6774db30c662202721e64aed9058d2c394f451261420c1"
639
+ checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170"
637
640
  dependencies = [
638
641
  "proc-macro2",
639
642
  "quote",
@@ -642,9 +645,9 @@ dependencies = [
642
645
 
643
646
  [[package]]
644
647
  name = "serde_json"
645
- version = "1.0.122"
648
+ version = "1.0.127"
646
649
  source = "registry+https://github.com/rust-lang/crates.io-index"
647
- checksum = "784b6203951c57ff748476b126ccb5e8e2959a5c19e5c617ab1956be3dbc68da"
650
+ checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad"
648
651
  dependencies = [
649
652
  "itoa",
650
653
  "memchr",
@@ -690,9 +693,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
690
693
 
691
694
  [[package]]
692
695
  name = "syn"
693
- version = "2.0.72"
696
+ version = "2.0.76"
694
697
  source = "registry+https://github.com/rust-lang/crates.io-index"
695
- checksum = "dc4b9b9bf2add8093d3f2c0204471e951b2285580335de42f9d2534f3ae7a8af"
698
+ checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525"
696
699
  dependencies = [
697
700
  "proc-macro2",
698
701
  "quote",
@@ -721,7 +724,7 @@ dependencies = [
721
724
 
722
725
  [[package]]
723
726
  name = "tokenizers"
724
- version = "0.5.1"
727
+ version = "0.5.2"
725
728
  dependencies = [
726
729
  "magnus",
727
730
  "onig",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "tokenizers"
3
- version = "0.5.1"
3
+ version = "0.5.2"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -34,6 +34,12 @@ impl Decoder for RbDecoder {
34
34
  }
35
35
  }
36
36
 
37
+ impl RbDecoder {
38
+ pub fn decode(&self, tokens: Vec<String>) -> RbResult<String> {
39
+ self.decoder.decode(tokens).map_err(RbError::from)
40
+ }
41
+ }
42
+
37
43
  macro_rules! getter {
38
44
  ($self: ident, $variant: ident, $($name: tt)+) => {{
39
45
  let decoder = &$self.decoder;
@@ -358,6 +364,7 @@ unsafe impl TypedData for RbDecoder {
358
364
 
359
365
  pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
360
366
  let decoder = module.define_class("Decoder", ruby.class_object())?;
367
+ decoder.define_method("decode", method!(RbDecoder::decode, 1))?;
361
368
 
362
369
  let class = module.define_class("BPEDecoder", decoder)?;
363
370
  class.define_singleton_method("_new", function!(RbBPEDecoder::new, 1))?;
@@ -42,6 +42,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
42
42
 
43
43
  let class = module.define_class("Tokenizer", ruby.class_object())?;
44
44
  class.define_singleton_method("new", function!(RbTokenizer::from_model, 1))?;
45
+ class.define_singleton_method("from_str", function!(RbTokenizer::from_str, 1))?;
45
46
  class.define_singleton_method("from_file", function!(RbTokenizer::from_file, 1))?;
46
47
  class.define_method(
47
48
  "add_special_tokens",
@@ -54,12 +55,18 @@ fn init(ruby: &Ruby) -> RbResult<()> {
54
55
  class.define_method("_encode_batch", method!(RbTokenizer::encode_batch, 3))?;
55
56
  class.define_method("_decode", method!(RbTokenizer::decode, 2))?;
56
57
  class.define_method("_decode_batch", method!(RbTokenizer::decode_batch, 2))?;
58
+ class.define_method("model", method!(RbTokenizer::get_model, 0))?;
59
+ class.define_method("model=", method!(RbTokenizer::set_model,1))?;
60
+ class.define_method("decoder", method!(RbTokenizer::get_decoder, 0))?;
57
61
  class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
62
+ class.define_method("pre_tokenizer", method!(RbTokenizer::get_pre_tokenizer, 0))?;
58
63
  class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
64
+ class.define_method("post_processor", method!(RbTokenizer::get_post_processor, 0))?;
59
65
  class.define_method(
60
66
  "post_processor=",
61
67
  method!(RbTokenizer::set_post_processor, 1),
62
68
  )?;
69
+ class.define_method("normalizer", method!(RbTokenizer::get_normalizer, 0))?;
63
70
  class.define_method("normalizer=", method!(RbTokenizer::set_normalizer, 1))?;
64
71
  class.define_method("token_to_id", method!(RbTokenizer::token_to_id, 1))?;
65
72
  class.define_method("id_to_token", method!(RbTokenizer::id_to_token, 1))?;
@@ -1,9 +1,10 @@
1
1
  use std::cell::RefCell;
2
2
  use std::collections::HashMap;
3
3
  use std::path::PathBuf;
4
+ use std::str::FromStr;
4
5
 
5
6
  use magnus::prelude::*;
6
- use magnus::{exception, Error, RArray, RHash, Symbol, TryConvert, Value};
7
+ use magnus::{exception, Error, RArray, RHash, RString, Symbol, TryConvert, Value};
7
8
  use tk::tokenizer::{
8
9
  Model, PaddingDirection, PaddingParams, PaddingStrategy,
9
10
  TruncationDirection, TruncationParams, TruncationStrategy, TokenizerImpl
@@ -203,6 +204,14 @@ impl RbTokenizer {
203
204
  RbTokenizer::new(TokenizerImpl::new(model.clone()))
204
205
  }
205
206
 
207
+ pub fn from_str(json: RString) -> RbResult<Self> {
208
+ Tokenizer::from_str(unsafe { json.as_str()? })
209
+ .map(|v| RbTokenizer {
210
+ tokenizer: RefCell::new(v),
211
+ })
212
+ .map_err(RbError::from)
213
+ }
214
+
206
215
  pub fn from_file(path: PathBuf) -> RbResult<Self> {
207
216
  Tokenizer::from_file(path)
208
217
  .map(|v| RbTokenizer {
@@ -319,22 +328,46 @@ impl RbTokenizer {
319
328
  .map_err(RbError::from)
320
329
  }
321
330
 
331
+ pub fn get_model(&self) -> RbModel {
332
+ self.tokenizer.borrow().get_model().clone()
333
+ }
334
+
335
+ pub fn set_model(&self, model: &RbModel) {
336
+ self.tokenizer.borrow_mut().with_model(model.clone());
337
+ }
338
+
339
+ pub fn get_decoder(&self) -> Option<RbDecoder> {
340
+ self.tokenizer.borrow().get_decoder().cloned()
341
+ }
342
+
322
343
  pub fn set_decoder(&self, decoder: Option<&RbDecoder>) {
323
344
  self.tokenizer.borrow_mut().with_decoder(decoder.cloned());
324
345
  }
325
346
 
347
+ pub fn get_pre_tokenizer(&self) -> Option<RbPreTokenizer> {
348
+ self.tokenizer.borrow().get_pre_tokenizer().cloned()
349
+ }
350
+
326
351
  pub fn set_pre_tokenizer(&self, pretok: Option<&RbPreTokenizer>) {
327
352
  self.tokenizer
328
353
  .borrow_mut()
329
354
  .with_pre_tokenizer(pretok.cloned());
330
355
  }
331
356
 
357
+ pub fn get_post_processor(&self) -> Option<RbPostProcessor> {
358
+ self.tokenizer.borrow().get_post_processor().cloned()
359
+ }
360
+
332
361
  pub fn set_post_processor(&self, processor: Option<&RbPostProcessor>) {
333
362
  self.tokenizer
334
363
  .borrow_mut()
335
364
  .with_post_processor(processor.cloned());
336
365
  }
337
366
 
367
+ pub fn get_normalizer(&self) -> Option<RbNormalizer> {
368
+ self.tokenizer.borrow().get_normalizer().cloned()
369
+ }
370
+
338
371
  pub fn set_normalizer(&self, normalizer: Option<&RbNormalizer>) {
339
372
  self.tokenizer
340
373
  .borrow_mut()
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.5.1"
2
+ VERSION = "0.5.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-08-13 00:00:00.000000000 Z
11
+ date: 2024-08-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys