tokenizers 0.5.2 → 0.5.3

checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
-   metadata.gz: 556d084ad69603fa0d5ff61c4a03864d56fbe4d525d706390851d1a5761d173a
-   data.tar.gz: 4dd1ab9a2de88f60135aca333c7d48557fa1ad6c8e31c61065717e77a37f0cdd
+   metadata.gz: 30c4558340092b3fe4b60adbfed6b042810e27df9a62bd8fe828c3a2c9b5cf7a
+   data.tar.gz: bc97136598b82cdb47b0d50de4ead4b5afd8500dc52b487496f3179dd48ecee8
  SHA512:
-   metadata.gz: 3d6b7209189aeec8846a50f0e65a24e1089e7e0998d1f3d07026446c18b2b0c139d5b7566ca2fc721f72b67519c50595144d6deb3603d032fb41d20c7bc8c6e7
-   data.tar.gz: a7f04aa13c7cbc3c3973408140fc9f4c3330dacd150d4edd33f16e6c218e03a949c9d44ff03b7b4b6e63eedd2623b2abf417a647a8f2260aa67d64594f06c6fc
+   metadata.gz: '0184d588343d823b0a2942828c0a496e131b5dfbae475d46ed7ebb2f3e89e5fd6d420705e88b31293331b247920c209653d0590b3aad618aab583a6a9ff49c8a'
+   data.tar.gz: a7c590677a968516ae075fb46a5153e301b93e2bd13cf372d5cf020c4bd0c9c0cde7a7118e708e853a61c42a8957fcec73afa32e8a2eebd517943254905d0621
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
+ ## 0.5.3 (2024-09-17)
+
+ - Added `AddedToken` class
+ - Added precompiled gem for Windows
+
  ## 0.5.2 (2024-08-26)

  - Added `from_str` method to `Tokenizer`
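
The headline change is the new `Tokenizers::AddedToken` class. A minimal usage sketch, based only on the bindings added in the diffs below; the token content and option values are illustrative:

```ruby
require "tokenizers"

# Options mirror the keywords handled on the Rust side:
# single_word, lstrip, rstrip, normalized, special.
token = Tokenizers::AddedToken.new("<special>", special: true, lstrip: true)

token.content # => "<special>"
token.special # => true
token.lstrip  # => true
```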
data/Cargo.lock CHANGED
@@ -724,7 +724,7 @@ dependencies = [

  [[package]]
  name = "tokenizers"
- version = "0.5.2"
+ version = "0.5.3"
  dependencies = [
   "magnus",
   "onig",
data/ext/tokenizers/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
  [package]
  name = "tokenizers"
- version = "0.5.2"
+ version = "0.5.3"
  license = "Apache-2.0"
  authors = ["Andrew Kane <andrew@ankane.org>"]
  edition = "2021"
data/ext/tokenizers/src/lib.rs CHANGED
@@ -15,7 +15,7 @@ mod utils;

  use encoding::RbEncoding;
  use error::RbError;
- use tokenizer::RbTokenizer;
+ use tokenizer::{RbAddedToken, RbTokenizer};
  use utils::RbRegex;

  use magnus::{function, method, prelude::*, value::Lazy, Error, RModule, Ruby};
@@ -79,6 +79,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
      class.define_method("num_special_tokens_to_add", method!(RbTokenizer::num_special_tokens_to_add, 1))?;
      class.define_method("_vocab", method!(RbTokenizer::vocab, 1))?;
      class.define_method("_vocab_size", method!(RbTokenizer::vocab_size, 1))?;
+     class.define_method("added_tokens_decoder", method!(RbTokenizer::get_added_tokens_decoder, 0))?;
      class.define_method("_to_s", method!(RbTokenizer::to_str, 1))?;

      let class = module.define_class("Encoding", ruby.class_object())?;
@@ -109,6 +110,15 @@ fn init(ruby: &Ruby) -> RbResult<()> {
      let class = module.define_class("Regex", ruby.class_object())?;
      class.define_singleton_method("new", function!(RbRegex::new, 1))?;

+     let class = module.define_class("AddedToken", ruby.class_object())?;
+     class.define_singleton_method("_new", function!(RbAddedToken::new, 2))?;
+     class.define_method("content", method!(RbAddedToken::get_content, 0))?;
+     class.define_method("rstrip", method!(RbAddedToken::get_rstrip, 0))?;
+     class.define_method("lstrip", method!(RbAddedToken::get_lstrip, 0))?;
+     class.define_method("single_word", method!(RbAddedToken::get_single_word, 0))?;
+     class.define_method("normalized", method!(RbAddedToken::get_normalized, 0))?;
+     class.define_method("special", method!(RbAddedToken::get_special, 0))?;
+
      let models = module.define_module("Models")?;
      let pre_tokenizers = module.define_module("PreTokenizers")?;
      let decoders = module.define_module("Decoders")?;
data/ext/tokenizers/src/tokenizer.rs CHANGED
@@ -22,9 +22,10 @@ use super::processors::RbPostProcessor;
  use super::trainers::RbTrainer;
  use super::{RbError, RbResult};

+ #[magnus::wrap(class = "Tokenizers::AddedToken")]
  pub struct RbAddedToken {
      pub content: String,
-     pub is_special_token: bool,
+     pub special: bool,
      pub single_word: Option<bool>,
      pub lstrip: Option<bool>,
      pub rstrip: Option<bool>,
@@ -32,10 +33,10 @@ pub struct RbAddedToken {
  }

  impl RbAddedToken {
-     pub fn from<S: Into<String>>(content: S, is_special_token: Option<bool>) -> Self {
+     pub fn from<S: Into<String>>(content: S, special: Option<bool>) -> Self {
          Self {
              content: content.into(),
-             is_special_token: is_special_token.unwrap_or(false),
+             special: special.unwrap_or(false),
              single_word: None,
              lstrip: None,
              rstrip: None,
@@ -44,7 +45,7 @@ impl RbAddedToken {
      }

      pub fn get_token(&self) -> tk::tokenizer::AddedToken {
-         let mut token = tk::AddedToken::from(&self.content, self.is_special_token);
+         let mut token = tk::AddedToken::from(&self.content, self.special);

          if let Some(sw) = self.single_word {
              token = token.single_word(sw);
@@ -71,11 +72,73 @@ impl From<tk::AddedToken> for RbAddedToken {
              lstrip: Some(token.lstrip),
              rstrip: Some(token.rstrip),
              normalized: Some(token.normalized),
-             is_special_token: !token.normalized,
+             special: !token.normalized,
          }
      }
  }

+ impl RbAddedToken {
+     pub fn new(content: Option<String>, kwargs: RHash) -> RbResult<Self> {
+         let mut token = RbAddedToken::from(content.unwrap_or("".to_string()), None);
+
+         let value: Value = kwargs.delete(Symbol::new("single_word"))?;
+         if !value.is_nil() {
+             token.single_word = TryConvert::try_convert(value)?;
+         }
+
+         let value: Value = kwargs.delete(Symbol::new("lstrip"))?;
+         if !value.is_nil() {
+             token.lstrip = TryConvert::try_convert(value)?;
+         }
+
+         let value: Value = kwargs.delete(Symbol::new("rstrip"))?;
+         if !value.is_nil() {
+             token.rstrip = TryConvert::try_convert(value)?;
+         }
+
+         let value: Value = kwargs.delete(Symbol::new("normalized"))?;
+         if !value.is_nil() {
+             token.normalized = TryConvert::try_convert(value)?;
+         }
+
+         let value: Value = kwargs.delete(Symbol::new("special"))?;
+         if !value.is_nil() {
+             token.special = TryConvert::try_convert(value)?;
+         }
+
+         if !kwargs.is_empty() {
+             // TODO improve message
+             return Err(Error::new(exception::arg_error(), "unknown keyword"));
+         }
+
+         Ok(token)
+     }
+
+     pub fn get_content(&self) -> String {
+         self.content.to_string()
+     }
+
+     pub fn get_rstrip(&self) -> bool {
+         self.get_token().rstrip
+     }
+
+     pub fn get_lstrip(&self) -> bool {
+         self.get_token().lstrip
+     }
+
+     pub fn get_single_word(&self) -> bool {
+         self.get_token().single_word
+     }
+
+     pub fn get_normalized(&self) -> bool {
+         self.get_token().normalized
+     }
+
+     pub fn get_special(&self) -> bool {
+         self.get_token().special
+     }
+ }
+
  struct TextInputSequence<'s>(tk::InputSequence<'s>);

  impl<'s> TryConvert for TextInputSequence<'s> {
@@ -536,4 +599,14 @@ impl RbTokenizer {
      pub fn vocab_size(&self, with_added_tokens: bool) -> usize {
          self.tokenizer.borrow().get_vocab_size(with_added_tokens)
      }
+
+     pub fn get_added_tokens_decoder(&self) -> RbResult<RHash> {
+         let sorted_map = RHash::new();
+
+         for (key, value) in self.tokenizer.borrow().get_added_tokens_decoder() {
+             sorted_map.aset::<u32, RbAddedToken>(key, value.into())?;
+         }
+
+         Ok(sorted_map)
+     }
  }
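
`get_added_tokens_decoder` backs the `added_tokens_decoder` method registered on `Tokenizer` in `lib.rs` above; it returns a hash mapping token ids to `AddedToken` objects. A sketch, assuming a pretrained model that defines special tokens (the model name and example output are illustrative):

```ruby
require "tokenizers"

tokenizer = Tokenizers.from_pretrained("bert-base-uncased")

tokenizer.added_tokens_decoder.each do |id, token|
  # Each value is a Tokenizers::AddedToken converted from the Rust side
  puts "#{id} => #{token.content} (special: #{token.special})"
end
# e.g. 0 => [PAD] (special: true)
```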
data/lib/tokenizers/added_token.rb ADDED
@@ -0,0 +1,7 @@
+ module Tokenizers
+   class AddedToken
+     def self.new(content, **kwargs)
+       _new(content, kwargs)
+     end
+   end
+ end
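
This shim exists because `_new` is registered on the Rust side with a fixed arity of 2 (`function!(RbAddedToken::new, 2)`), so Ruby keyword arguments are collected into a single hash before crossing the boundary. Any key not consumed by the `kwargs.delete` calls trips the `arg_error` branch shown earlier:

```ruby
Tokenizers::AddedToken.new("<pad>", special: true) # ok
Tokenizers::AddedToken.new("<pad>", specail: true) # misspelled keyword
# => ArgumentError: unknown keyword
```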
data/lib/tokenizers/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Tokenizers
-   VERSION = "0.5.2"
+   VERSION = "0.5.3"
  end
data/lib/tokenizers.rb CHANGED
@@ -42,6 +42,7 @@ require_relative "tokenizers/trainers/word_level_trainer"
  require_relative "tokenizers/trainers/word_piece_trainer"

  # other
+ require_relative "tokenizers/added_token"
  require_relative "tokenizers/char_bpe_tokenizer"
  require_relative "tokenizers/encoding"
  require_relative "tokenizers/from_pretrained"
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: tokenizers
  version: !ruby/object:Gem::Version
-   version: 0.5.2
+   version: 0.5.3
  platform: ruby
  authors:
  - Andrew Kane
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2024-08-26 00:00:00.000000000 Z
+ date: 2024-09-17 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: rb_sys
@@ -52,6 +52,7 @@ files:
  - ext/tokenizers/src/utils/normalization.rs
  - ext/tokenizers/src/utils/regex.rs
  - lib/tokenizers.rb
+ - lib/tokenizers/added_token.rb
  - lib/tokenizers/char_bpe_tokenizer.rb
  - lib/tokenizers/decoders/bpe_decoder.rb
  - lib/tokenizers/decoders/ctc.rb
@@ -100,7 +101,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
    version: '0'
  requirements: []
- rubygems_version: 3.5.11
+ rubygems_version: 3.5.16
  signing_key:
  specification_version: 4
  summary: Fast state-of-the-art tokenizers for Ruby