tokenizers 0.5.2 → 0.5.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
  ---
  SHA256:
- metadata.gz: 556d084ad69603fa0d5ff61c4a03864d56fbe4d525d706390851d1a5761d173a
- data.tar.gz: 4dd1ab9a2de88f60135aca333c7d48557fa1ad6c8e31c61065717e77a37f0cdd
+ metadata.gz: 30c4558340092b3fe4b60adbfed6b042810e27df9a62bd8fe828c3a2c9b5cf7a
+ data.tar.gz: bc97136598b82cdb47b0d50de4ead4b5afd8500dc52b487496f3179dd48ecee8
  SHA512:
- metadata.gz: 3d6b7209189aeec8846a50f0e65a24e1089e7e0998d1f3d07026446c18b2b0c139d5b7566ca2fc721f72b67519c50595144d6deb3603d032fb41d20c7bc8c6e7
- data.tar.gz: a7f04aa13c7cbc3c3973408140fc9f4c3330dacd150d4edd33f16e6c218e03a949c9d44ff03b7b4b6e63eedd2623b2abf417a647a8f2260aa67d64594f06c6fc
+ metadata.gz: '0184d588343d823b0a2942828c0a496e131b5dfbae475d46ed7ebb2f3e89e5fd6d420705e88b31293331b247920c209653d0590b3aad618aab583a6a9ff49c8a'
+ data.tar.gz: a7c590677a968516ae075fb46a5153e301b93e2bd13cf372d5cf020c4bd0c9c0cde7a7118e708e853a61c42a8957fcec73afa32e8a2eebd517943254905d0621
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
+ ## 0.5.3 (2024-09-17)
+
+ - Added `AddedToken` class
+ - Added precompiled gem for Windows
+
  ## 0.5.2 (2024-08-26)

  - Added `from_str` method to `Tokenizer`
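
For context, a minimal sketch of how the new `AddedToken` class can be used from Ruby, based on the bindings added in this diff (the token content and option values here are made up for illustration):

```ruby
require "tokenizers"

# AddedToken.new accepts the token content plus keyword options
# (single_word, lstrip, rstrip, normalized, special), matching the
# Rust constructor added below
token = Tokenizers::AddedToken.new("<custom>", special: true, lstrip: false)

token.content  # => "<custom>"
token.special  # => true
token.lstrip   # => false
```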
data/Cargo.lock CHANGED
@@ -724,7 +724,7 @@ dependencies = [

  [[package]]
  name = "tokenizers"
- version = "0.5.2"
+ version = "0.5.3"
  dependencies = [
   "magnus",
   "onig",
data/ext/tokenizers/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
  [package]
  name = "tokenizers"
- version = "0.5.2"
+ version = "0.5.3"
  license = "Apache-2.0"
  authors = ["Andrew Kane <andrew@ankane.org>"]
  edition = "2021"
data/ext/tokenizers/src/lib.rs CHANGED
@@ -15,7 +15,7 @@ mod utils;

  use encoding::RbEncoding;
  use error::RbError;
- use tokenizer::RbTokenizer;
+ use tokenizer::{RbAddedToken, RbTokenizer};
  use utils::RbRegex;

  use magnus::{function, method, prelude::*, value::Lazy, Error, RModule, Ruby};
@@ -79,6 +79,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
  class.define_method("num_special_tokens_to_add", method!(RbTokenizer::num_special_tokens_to_add, 1))?;
  class.define_method("_vocab", method!(RbTokenizer::vocab, 1))?;
  class.define_method("_vocab_size", method!(RbTokenizer::vocab_size, 1))?;
+ class.define_method("added_tokens_decoder", method!(RbTokenizer::get_added_tokens_decoder, 0))?;
  class.define_method("_to_s", method!(RbTokenizer::to_str, 1))?;

  let class = module.define_class("Encoding", ruby.class_object())?;
@@ -109,6 +110,15 @@ fn init(ruby: &Ruby) -> RbResult<()> {
  let class = module.define_class("Regex", ruby.class_object())?;
  class.define_singleton_method("new", function!(RbRegex::new, 1))?;

+ let class = module.define_class("AddedToken", ruby.class_object())?;
+ class.define_singleton_method("_new", function!(RbAddedToken::new, 2))?;
+ class.define_method("content", method!(RbAddedToken::get_content, 0))?;
+ class.define_method("rstrip", method!(RbAddedToken::get_rstrip, 0))?;
+ class.define_method("lstrip", method!(RbAddedToken::get_lstrip, 0))?;
+ class.define_method("single_word", method!(RbAddedToken::get_single_word, 0))?;
+ class.define_method("normalized", method!(RbAddedToken::get_normalized, 0))?;
+ class.define_method("special", method!(RbAddedToken::get_special, 0))?;
+
  let models = module.define_module("Models")?;
  let pre_tokenizers = module.define_module("PreTokenizers")?;
  let decoders = module.define_module("Decoders")?;
data/ext/tokenizers/src/tokenizer.rs CHANGED
@@ -22,9 +22,10 @@ use super::processors::RbPostProcessor;
  use super::trainers::RbTrainer;
  use super::{RbError, RbResult};

+ #[magnus::wrap(class = "Tokenizers::AddedToken")]
  pub struct RbAddedToken {
      pub content: String,
-     pub is_special_token: bool,
+     pub special: bool,
      pub single_word: Option<bool>,
      pub lstrip: Option<bool>,
      pub rstrip: Option<bool>,
@@ -32,10 +33,10 @@ pub struct RbAddedToken {
  }

  impl RbAddedToken {
-     pub fn from<S: Into<String>>(content: S, is_special_token: Option<bool>) -> Self {
+     pub fn from<S: Into<String>>(content: S, special: Option<bool>) -> Self {
          Self {
              content: content.into(),
-             is_special_token: is_special_token.unwrap_or(false),
+             special: special.unwrap_or(false),
              single_word: None,
              lstrip: None,
              rstrip: None,
@@ -44,7 +45,7 @@ impl RbAddedToken {
      }

      pub fn get_token(&self) -> tk::tokenizer::AddedToken {
-         let mut token = tk::AddedToken::from(&self.content, self.is_special_token);
+         let mut token = tk::AddedToken::from(&self.content, self.special);

          if let Some(sw) = self.single_word {
              token = token.single_word(sw);
@@ -71,11 +72,73 @@ impl From<tk::AddedToken> for RbAddedToken {
              lstrip: Some(token.lstrip),
              rstrip: Some(token.rstrip),
              normalized: Some(token.normalized),
-             is_special_token: !token.normalized,
+             special: !token.normalized,
          }
      }
  }

+ impl RbAddedToken {
+     pub fn new(content: Option<String>, kwargs: RHash) -> RbResult<Self> {
+         let mut token = RbAddedToken::from(content.unwrap_or("".to_string()), None);
+
+         let value: Value = kwargs.delete(Symbol::new("single_word"))?;
+         if !value.is_nil() {
+             token.single_word = TryConvert::try_convert(value)?;
+         }
+
+         let value: Value = kwargs.delete(Symbol::new("lstrip"))?;
+         if !value.is_nil() {
+             token.lstrip = TryConvert::try_convert(value)?;
+         }
+
+         let value: Value = kwargs.delete(Symbol::new("rstrip"))?;
+         if !value.is_nil() {
+             token.rstrip = TryConvert::try_convert(value)?;
+         }
+
+         let value: Value = kwargs.delete(Symbol::new("normalized"))?;
+         if !value.is_nil() {
+             token.normalized = TryConvert::try_convert(value)?;
+         }
+
+         let value: Value = kwargs.delete(Symbol::new("special"))?;
+         if !value.is_nil() {
+             token.special = TryConvert::try_convert(value)?;
+         }
+
+         if !kwargs.is_empty() {
+             // TODO improve message
+             return Err(Error::new(exception::arg_error(), "unknown keyword"));
+         }
+
+         Ok(token)
+     }
+
+     pub fn get_content(&self) -> String {
+         self.content.to_string()
+     }
+
+     pub fn get_rstrip(&self) -> bool {
+         self.get_token().rstrip
+     }
+
+     pub fn get_lstrip(&self) -> bool {
+         self.get_token().lstrip
+     }
+
+     pub fn get_single_word(&self) -> bool {
+         self.get_token().single_word
+     }
+
+     pub fn get_normalized(&self) -> bool {
+         self.get_token().normalized
+     }
+
+     pub fn get_special(&self) -> bool {
+         self.get_token().special
+     }
+ }
+
  struct TextInputSequence<'s>(tk::InputSequence<'s>);

  impl<'s> TryConvert for TextInputSequence<'s> {
@@ -536,4 +599,14 @@ impl RbTokenizer {
      pub fn vocab_size(&self, with_added_tokens: bool) -> usize {
          self.tokenizer.borrow().get_vocab_size(with_added_tokens)
      }
+
+     pub fn get_added_tokens_decoder(&self) -> RbResult<RHash> {
+         let sorted_map = RHash::new();
+
+         for (key, value) in self.tokenizer.borrow().get_added_tokens_decoder() {
+             sorted_map.aset::<u32, RbAddedToken>(key, value.into())?;
+         }
+
+         Ok(sorted_map)
+     }
  }
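
The `added_tokens_decoder` method registered earlier surfaces this as a Ruby hash mapping token ids to `AddedToken` objects. A hedged usage sketch (the pretrained model name is only an example; `from_pretrained` comes from the gem's existing API):

```ruby
require "tokenizers"

# example model name, for illustration only
tokenizer = Tokenizers.from_pretrained("bert-base-cased")

# id => Tokenizers::AddedToken, built by get_added_tokens_decoder above
tokenizer.added_tokens_decoder.each do |id, token|
  puts "#{id} => #{token.content} (special: #{token.special})"
end
```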
data/lib/tokenizers/added_token.rb ADDED
@@ -0,0 +1,7 @@
+ module Tokenizers
+   class AddedToken
+     def self.new(content, **kwargs)
+       _new(content, kwargs)
+     end
+   end
+ end
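
This shim exists because the Rust `_new` binding takes its options as a single hash: `self.new` collects Ruby keyword arguments and forwards them. One consequence, per the Rust constructor above, is that an unrecognized keyword raises `ArgumentError` (a sketch, with a deliberately misspelled keyword):

```ruby
# "specail" is a deliberate typo; the Rust constructor rejects any
# keywords left in the hash after the known ones are consumed
Tokenizers::AddedToken.new("<pad>", specail: true)
# => ArgumentError: unknown keyword
```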
data/lib/tokenizers/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Tokenizers
-   VERSION = "0.5.2"
+   VERSION = "0.5.3"
  end
data/lib/tokenizers.rb CHANGED
@@ -42,6 +42,7 @@ require_relative "tokenizers/trainers/word_level_trainer"
  require_relative "tokenizers/trainers/word_piece_trainer"

  # other
+ require_relative "tokenizers/added_token"
  require_relative "tokenizers/char_bpe_tokenizer"
  require_relative "tokenizers/encoding"
  require_relative "tokenizers/from_pretrained"
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: tokenizers
  version: !ruby/object:Gem::Version
- version: 0.5.2
+ version: 0.5.3
  platform: ruby
  authors:
  - Andrew Kane
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2024-08-26 00:00:00.000000000 Z
+ date: 2024-09-17 00:00:00.000000000 Z
  dependencies:
  - !ruby/object:Gem::Dependency
    name: rb_sys
@@ -52,6 +52,7 @@ files:
  - ext/tokenizers/src/utils/normalization.rs
  - ext/tokenizers/src/utils/regex.rs
  - lib/tokenizers.rb
+ - lib/tokenizers/added_token.rb
  - lib/tokenizers/char_bpe_tokenizer.rb
  - lib/tokenizers/decoders/bpe_decoder.rb
  - lib/tokenizers/decoders/ctc.rb
@@ -100,7 +101,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
  - !ruby/object:Gem::Version
    version: '0'
  requirements: []
- rubygems_version: 3.5.11
+ rubygems_version: 3.5.16
  signing_key:
  specification_version: 4
  summary: Fast state-of-the-art tokenizers for Ruby