tokenizers 0.5.1 → 0.5.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4846b5d3dc0fe8f5828ddffe46908b1f3812ebf6a03a939ca0395ad7748533bb
4
- data.tar.gz: 259795bfa6b13a36f62ab2ffb65e9feabd460e01efe9f59e7d6017c6dcd9b9b0
3
+ metadata.gz: 556d084ad69603fa0d5ff61c4a03864d56fbe4d525d706390851d1a5761d173a
4
+ data.tar.gz: 4dd1ab9a2de88f60135aca333c7d48557fa1ad6c8e31c61065717e77a37f0cdd
5
5
  SHA512:
6
- metadata.gz: 90f55feb8ceec81815bb61b7773e6f67924eb6d47869e5da67d579e2ac9df6a48fddee5e97f5a028e3fa1e39941f3bfe6ec6c04cf49fd1b87f18bade54911231
7
- data.tar.gz: 9c1895b43222494b393f3fbddaa6e78216e025f3d3dddebf0c0311d2d897b282a16b8b0044aacb2466790b0a93c8d01099b25d662750b96d5798d0a4a927267b
6
+ metadata.gz: 3d6b7209189aeec8846a50f0e65a24e1089e7e0998d1f3d07026446c18b2b0c139d5b7566ca2fc721f72b67519c50595144d6deb3603d032fb41d20c7bc8c6e7
7
+ data.tar.gz: a7f04aa13c7cbc3c3973408140fc9f4c3330dacd150d4edd33f16e6c218e03a949c9d44ff03b7b4b6e63eedd2623b2abf417a647a8f2260aa67d64594f06c6fc
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
1
+ ## 0.5.2 (2024-08-26)
2
+
3
+ - Added `from_str` method to `Tokenizer`
4
+ - Added `model` and `model=` methods to `Tokenizer`
5
+ - Added `decoder`, `pre_tokenizer`, `post_processor`, and `normalizer` methods to `Tokenizer`
6
+ - Added `decode` method to `Decoder`
7
+
1
8
  ## 0.5.1 (2024-08-13)
2
9
 
3
10
  - Updated Tokenizers to 0.20.0
data/Cargo.lock CHANGED
@@ -57,9 +57,12 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
57
57
 
58
58
  [[package]]
59
59
  name = "cc"
60
- version = "1.1.8"
60
+ version = "1.1.15"
61
61
  source = "registry+https://github.com/rust-lang/crates.io-index"
62
- checksum = "504bdec147f2cc13c8b57ed9401fd8a147cc66b67ad5cb241394244f2c947549"
62
+ checksum = "57b6a275aa2903740dc87da01c62040406b8812552e97129a63ea8850a17c6e6"
63
+ dependencies = [
64
+ "shlex",
65
+ ]
63
66
 
64
67
  [[package]]
65
68
  name = "cexpr"
@@ -301,9 +304,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
301
304
 
302
305
  [[package]]
303
306
  name = "libc"
304
- version = "0.2.155"
307
+ version = "0.2.158"
305
308
  source = "registry+https://github.com/rust-lang/crates.io-index"
306
- checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
309
+ checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439"
307
310
 
308
311
  [[package]]
309
312
  name = "libloading"
@@ -475,9 +478,9 @@ dependencies = [
475
478
 
476
479
  [[package]]
477
480
  name = "quote"
478
- version = "1.0.36"
481
+ version = "1.0.37"
479
482
  source = "registry+https://github.com/rust-lang/crates.io-index"
480
- checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
483
+ checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
481
484
  dependencies = [
482
485
  "proc-macro2",
483
486
  ]
@@ -545,18 +548,18 @@ dependencies = [
545
548
 
546
549
  [[package]]
547
550
  name = "rb-sys"
548
- version = "0.9.100"
551
+ version = "0.9.102"
549
552
  source = "registry+https://github.com/rust-lang/crates.io-index"
550
- checksum = "87f2ba20be84b32fad6b0ce397764bcdd0f2dca4431cf7035f6a6721e5747565"
553
+ checksum = "df4dec4b1d304c3b308a2cd86b1216ea45dd4361f4e9fa056f108332d0a450c1"
551
554
  dependencies = [
552
555
  "rb-sys-build",
553
556
  ]
554
557
 
555
558
  [[package]]
556
559
  name = "rb-sys-build"
557
- version = "0.9.100"
560
+ version = "0.9.102"
558
561
  source = "registry+https://github.com/rust-lang/crates.io-index"
559
- checksum = "7ecae2bdcb118ee721d9a3929f89e8578237fade298dfcf8c928609aa88abc48"
562
+ checksum = "1d71de3e29d174b8fb17b5d4470f27d7aa2605f8a9d05fda0d3aeff30e05a570"
560
563
  dependencies = [
561
564
  "bindgen",
562
565
  "lazy_static",
@@ -622,18 +625,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
622
625
 
623
626
  [[package]]
624
627
  name = "serde"
625
- version = "1.0.205"
628
+ version = "1.0.209"
626
629
  source = "registry+https://github.com/rust-lang/crates.io-index"
627
- checksum = "e33aedb1a7135da52b7c21791455563facbbcc43d0f0f66165b42c21b3dfb150"
630
+ checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09"
628
631
  dependencies = [
629
632
  "serde_derive",
630
633
  ]
631
634
 
632
635
  [[package]]
633
636
  name = "serde_derive"
634
- version = "1.0.205"
637
+ version = "1.0.209"
635
638
  source = "registry+https://github.com/rust-lang/crates.io-index"
636
- checksum = "692d6f5ac90220161d6774db30c662202721e64aed9058d2c394f451261420c1"
639
+ checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170"
637
640
  dependencies = [
638
641
  "proc-macro2",
639
642
  "quote",
@@ -642,9 +645,9 @@ dependencies = [
642
645
 
643
646
  [[package]]
644
647
  name = "serde_json"
645
- version = "1.0.122"
648
+ version = "1.0.127"
646
649
  source = "registry+https://github.com/rust-lang/crates.io-index"
647
- checksum = "784b6203951c57ff748476b126ccb5e8e2959a5c19e5c617ab1956be3dbc68da"
650
+ checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad"
648
651
  dependencies = [
649
652
  "itoa",
650
653
  "memchr",
@@ -690,9 +693,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
690
693
 
691
694
  [[package]]
692
695
  name = "syn"
693
- version = "2.0.72"
696
+ version = "2.0.76"
694
697
  source = "registry+https://github.com/rust-lang/crates.io-index"
695
- checksum = "dc4b9b9bf2add8093d3f2c0204471e951b2285580335de42f9d2534f3ae7a8af"
698
+ checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525"
696
699
  dependencies = [
697
700
  "proc-macro2",
698
701
  "quote",
@@ -721,7 +724,7 @@ dependencies = [
721
724
 
722
725
  [[package]]
723
726
  name = "tokenizers"
724
- version = "0.5.1"
727
+ version = "0.5.2"
725
728
  dependencies = [
726
729
  "magnus",
727
730
  "onig",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "tokenizers"
3
- version = "0.5.1"
3
+ version = "0.5.2"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -34,6 +34,12 @@ impl Decoder for RbDecoder {
34
34
  }
35
35
  }
36
36
 
37
+ impl RbDecoder {
38
+ pub fn decode(&self, tokens: Vec<String>) -> RbResult<String> {
39
+ self.decoder.decode(tokens).map_err(RbError::from)
40
+ }
41
+ }
42
+
37
43
  macro_rules! getter {
38
44
  ($self: ident, $variant: ident, $($name: tt)+) => {{
39
45
  let decoder = &$self.decoder;
@@ -358,6 +364,7 @@ unsafe impl TypedData for RbDecoder {
358
364
 
359
365
  pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> {
360
366
  let decoder = module.define_class("Decoder", ruby.class_object())?;
367
+ decoder.define_method("decode", method!(RbDecoder::decode, 1))?;
361
368
 
362
369
  let class = module.define_class("BPEDecoder", decoder)?;
363
370
  class.define_singleton_method("_new", function!(RbBPEDecoder::new, 1))?;
@@ -42,6 +42,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
42
42
 
43
43
  let class = module.define_class("Tokenizer", ruby.class_object())?;
44
44
  class.define_singleton_method("new", function!(RbTokenizer::from_model, 1))?;
45
+ class.define_singleton_method("from_str", function!(RbTokenizer::from_str, 1))?;
45
46
  class.define_singleton_method("from_file", function!(RbTokenizer::from_file, 1))?;
46
47
  class.define_method(
47
48
  "add_special_tokens",
@@ -54,12 +55,18 @@ fn init(ruby: &Ruby) -> RbResult<()> {
54
55
  class.define_method("_encode_batch", method!(RbTokenizer::encode_batch, 3))?;
55
56
  class.define_method("_decode", method!(RbTokenizer::decode, 2))?;
56
57
  class.define_method("_decode_batch", method!(RbTokenizer::decode_batch, 2))?;
58
+ class.define_method("model", method!(RbTokenizer::get_model, 0))?;
59
+ class.define_method("model=", method!(RbTokenizer::set_model,1))?;
60
+ class.define_method("decoder", method!(RbTokenizer::get_decoder, 0))?;
57
61
  class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
62
+ class.define_method("pre_tokenizer", method!(RbTokenizer::get_pre_tokenizer, 0))?;
58
63
  class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
64
+ class.define_method("post_processor", method!(RbTokenizer::get_post_processor, 0))?;
59
65
  class.define_method(
60
66
  "post_processor=",
61
67
  method!(RbTokenizer::set_post_processor, 1),
62
68
  )?;
69
+ class.define_method("normalizer", method!(RbTokenizer::get_normalizer, 0))?;
63
70
  class.define_method("normalizer=", method!(RbTokenizer::set_normalizer, 1))?;
64
71
  class.define_method("token_to_id", method!(RbTokenizer::token_to_id, 1))?;
65
72
  class.define_method("id_to_token", method!(RbTokenizer::id_to_token, 1))?;
@@ -1,9 +1,10 @@
1
1
  use std::cell::RefCell;
2
2
  use std::collections::HashMap;
3
3
  use std::path::PathBuf;
4
+ use std::str::FromStr;
4
5
 
5
6
  use magnus::prelude::*;
6
- use magnus::{exception, Error, RArray, RHash, Symbol, TryConvert, Value};
7
+ use magnus::{exception, Error, RArray, RHash, RString, Symbol, TryConvert, Value};
7
8
  use tk::tokenizer::{
8
9
  Model, PaddingDirection, PaddingParams, PaddingStrategy,
9
10
  TruncationDirection, TruncationParams, TruncationStrategy, TokenizerImpl
@@ -203,6 +204,14 @@ impl RbTokenizer {
203
204
  RbTokenizer::new(TokenizerImpl::new(model.clone()))
204
205
  }
205
206
 
207
+ pub fn from_str(json: RString) -> RbResult<Self> {
208
+ Tokenizer::from_str(unsafe { json.as_str()? })
209
+ .map(|v| RbTokenizer {
210
+ tokenizer: RefCell::new(v),
211
+ })
212
+ .map_err(RbError::from)
213
+ }
214
+
206
215
  pub fn from_file(path: PathBuf) -> RbResult<Self> {
207
216
  Tokenizer::from_file(path)
208
217
  .map(|v| RbTokenizer {
@@ -319,22 +328,46 @@ impl RbTokenizer {
319
328
  .map_err(RbError::from)
320
329
  }
321
330
 
331
+ pub fn get_model(&self) -> RbModel {
332
+ self.tokenizer.borrow().get_model().clone()
333
+ }
334
+
335
+ pub fn set_model(&self, model: &RbModel) {
336
+ self.tokenizer.borrow_mut().with_model(model.clone());
337
+ }
338
+
339
+ pub fn get_decoder(&self) -> Option<RbDecoder> {
340
+ self.tokenizer.borrow().get_decoder().cloned()
341
+ }
342
+
322
343
  pub fn set_decoder(&self, decoder: Option<&RbDecoder>) {
323
344
  self.tokenizer.borrow_mut().with_decoder(decoder.cloned());
324
345
  }
325
346
 
347
+ pub fn get_pre_tokenizer(&self) -> Option<RbPreTokenizer> {
348
+ self.tokenizer.borrow().get_pre_tokenizer().cloned()
349
+ }
350
+
326
351
  pub fn set_pre_tokenizer(&self, pretok: Option<&RbPreTokenizer>) {
327
352
  self.tokenizer
328
353
  .borrow_mut()
329
354
  .with_pre_tokenizer(pretok.cloned());
330
355
  }
331
356
 
357
+ pub fn get_post_processor(&self) -> Option<RbPostProcessor> {
358
+ self.tokenizer.borrow().get_post_processor().cloned()
359
+ }
360
+
332
361
  pub fn set_post_processor(&self, processor: Option<&RbPostProcessor>) {
333
362
  self.tokenizer
334
363
  .borrow_mut()
335
364
  .with_post_processor(processor.cloned());
336
365
  }
337
366
 
367
+ pub fn get_normalizer(&self) -> Option<RbNormalizer> {
368
+ self.tokenizer.borrow().get_normalizer().cloned()
369
+ }
370
+
338
371
  pub fn set_normalizer(&self, normalizer: Option<&RbNormalizer>) {
339
372
  self.tokenizer
340
373
  .borrow_mut()
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.5.1"
2
+ VERSION = "0.5.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.2
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-08-13 00:00:00.000000000 Z
11
+ date: 2024-08-26 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys