tokenizers 0.2.2 → 0.2.3

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 197131371ec438d82623bc0aacb8fe82ba255904e847eeb9259358f38a7063f0
4
- data.tar.gz: 42ef490120e56fbb79d847ec1eb2b0a6b0ca7aa8f2ad90c09d2053d167491350
3
+ metadata.gz: e4f3cb98cb867df67a1c8a00b56f9ec5f4c6fafa178d760655dafb6735160773
4
+ data.tar.gz: 88c420f7a42f56330ce091df7f131878efd552488232282388e69d7a3c4b4aa2
5
5
  SHA512:
6
- metadata.gz: 0a21b4811cc9e31565209eb514e55d6b22302c350371a76205aeb3b67cf94ea6dabf85074cebd48c65f9eca56e8e750b83a1df841807e53afb1275961bca50ce
7
- data.tar.gz: 222bb9d759e3a2cc00ad7a4950c821fdbad1bbf6d4413f237bcf9cdc0698c2011022890b3f306be6df3d70b05abd446ad43066851ffa6c27387ddf3191f7557d
6
+ metadata.gz: 8e4746ccdf33dce78dc2b86d847f47f83576ca0d637671f825ad006a53b7ac3374654f7724f1e889618f322f9cfa5081e30083997ee9810eab282b9a8b99f807
7
+ data.tar.gz: 5dfe7b502d908f85ae16cfb28ebe1bd2ff51348c31151c7ee531504c00a0315dc22ea76fea963690de8c7390c7adb50d392e39de6db4a22101e91d31de1fa4e8
data/CHANGELOG.md CHANGED
@@ -1,3 +1,10 @@
1
+ ## 0.2.3 (2023-01-22)
2
+
3
+ - Added `add_special_tokens` option to `encode` method
4
+ - Added warning about `encode` method including special tokens by default in 0.3.0
5
+ - Added more methods to `Encoding`
6
+ - Fixed error with precompiled gem on Mac ARM
7
+
1
8
  ## 0.2.2 (2023-01-15)
2
9
 
3
10
  - Added precompiled gem for Linux ARM
data/Cargo.lock CHANGED
@@ -753,7 +753,7 @@ dependencies = [
753
753
 
754
754
  [[package]]
755
755
  name = "tokenizers"
756
- version = "0.2.2"
756
+ version = "0.2.3"
757
757
  dependencies = [
758
758
  "magnus",
759
759
  "tokenizers 0.13.2",
@@ -1,6 +1,6 @@
1
1
  [package]
2
2
  name = "tokenizers"
3
- version = "0.2.2"
3
+ version = "0.2.3"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -1,16 +1,90 @@
1
- use tk::Encoding;
1
+ use tk::{Encoding, Offsets};
2
2
 
3
3
  #[magnus::wrap(class = "Tokenizers::Encoding")]
4
+ #[repr(transparent)]
4
5
  pub struct RbEncoding {
5
6
  pub encoding: Encoding,
6
7
  }
7
8
 
9
+ impl From<Encoding> for RbEncoding {
10
+ fn from(v: Encoding) -> Self {
11
+ Self { encoding: v }
12
+ }
13
+ }
14
+
8
15
  impl RbEncoding {
16
+ pub fn n_sequences(&self) -> usize {
17
+ self.encoding.n_sequences()
18
+ }
19
+
9
20
  pub fn ids(&self) -> Vec<u32> {
10
- self.encoding.get_ids().into()
21
+ self.encoding.get_ids().to_vec()
11
22
  }
12
23
 
13
24
  pub fn tokens(&self) -> Vec<String> {
14
- self.encoding.get_tokens().into()
25
+ self.encoding.get_tokens().to_vec()
26
+ }
27
+
28
+ pub fn word_ids(&self) -> Vec<Option<u32>> {
29
+ self.encoding.get_word_ids().to_vec()
30
+ }
31
+
32
+ pub fn sequence_ids(&self) -> Vec<Option<usize>> {
33
+ self.encoding.get_sequence_ids()
34
+ }
35
+
36
+ pub fn type_ids(&self) -> Vec<u32> {
37
+ self.encoding.get_type_ids().to_vec()
38
+ }
39
+
40
+ pub fn offsets(&self) -> Vec<(usize, usize)> {
41
+ self.encoding.get_offsets().to_vec()
42
+ }
43
+
44
+ pub fn special_tokens_mask(&self) -> Vec<u32> {
45
+ self.encoding.get_special_tokens_mask().to_vec()
46
+ }
47
+
48
+ pub fn attention_mask(&self) -> Vec<u32> {
49
+ self.encoding.get_attention_mask().to_vec()
50
+ }
51
+
52
+ pub fn overflowing(&self) -> Vec<Self> {
53
+ self.encoding
54
+ .get_overflowing()
55
+ .clone()
56
+ .into_iter()
57
+ .map(|e| e.into())
58
+ .collect()
59
+ }
60
+
61
+ pub fn word_to_tokens(&self, word_index: u32, sequence_index: usize) -> Option<(usize, usize)> {
62
+ self.encoding.word_to_tokens(word_index, sequence_index)
63
+ }
64
+
65
+ pub fn word_to_chars(&self, word_index: u32, sequence_index: usize) -> Option<Offsets> {
66
+ self.encoding.word_to_chars(word_index, sequence_index)
67
+ }
68
+
69
+ pub fn token_to_sequence(&self, token_index: usize) -> Option<usize> {
70
+ self.encoding.token_to_sequence(token_index)
71
+ }
72
+
73
+ pub fn token_to_chars(&self, token_index: usize) -> Option<Offsets> {
74
+ let (_, offsets) = self.encoding.token_to_chars(token_index)?;
75
+ Some(offsets)
76
+ }
77
+
78
+ pub fn token_to_word(&self, token_index: usize) -> Option<u32> {
79
+ let (_, word_idx) = self.encoding.token_to_word(token_index)?;
80
+ Some(word_idx)
81
+ }
82
+
83
+ pub fn char_to_token(&self, char_pos: usize, sequence_index: usize) -> Option<usize> {
84
+ self.encoding.char_to_token(char_pos, sequence_index)
85
+ }
86
+
87
+ pub fn char_to_word(&self, char_pos: usize, sequence_index: usize) -> Option<u32> {
88
+ self.encoding.char_to_word(char_pos, sequence_index)
15
89
  }
16
90
  }
@@ -38,15 +38,39 @@ fn init() -> RbResult<()> {
38
38
  "add_special_tokens",
39
39
  method!(RbTokenizer::add_special_tokens, 1),
40
40
  )?;
41
- class.define_method("encode", method!(RbTokenizer::encode, 1))?;
41
+ class.define_method("add_tokens", method!(RbTokenizer::add_tokens, 1))?;
42
+ class.define_method("_encode", method!(RbTokenizer::encode, 2))?;
42
43
  class.define_method("decode", method!(RbTokenizer::decode, 1))?;
43
44
  class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
44
45
  class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
45
46
  class.define_method("normalizer=", method!(RbTokenizer::set_normalizer, 1))?;
47
+ class.define_method("token_to_id", method!(RbTokenizer::token_to_id, 1))?;
48
+ class.define_method("id_to_token", method!(RbTokenizer::id_to_token, 1))?;
46
49
 
47
50
  let class = module.define_class("Encoding", Default::default())?;
51
+ class.define_method("n_sequences", method!(RbEncoding::n_sequences, 0))?;
48
52
  class.define_method("ids", method!(RbEncoding::ids, 0))?;
49
53
  class.define_method("tokens", method!(RbEncoding::tokens, 0))?;
54
+ class.define_method("word_ids", method!(RbEncoding::word_ids, 0))?;
55
+ class.define_method("sequence_ids", method!(RbEncoding::sequence_ids, 0))?;
56
+ class.define_method("type_ids", method!(RbEncoding::type_ids, 0))?;
57
+ class.define_method("offsets", method!(RbEncoding::offsets, 0))?;
58
+ class.define_method(
59
+ "special_tokens_mask",
60
+ method!(RbEncoding::special_tokens_mask, 0),
61
+ )?;
62
+ class.define_method("attention_mask", method!(RbEncoding::attention_mask, 0))?;
63
+ class.define_method("overflowing", method!(RbEncoding::overflowing, 0))?;
64
+ class.define_method("_word_to_tokens", method!(RbEncoding::word_to_tokens, 2))?;
65
+ class.define_method("_word_to_chars", method!(RbEncoding::word_to_chars, 2))?;
66
+ class.define_method(
67
+ "token_to_sequence",
68
+ method!(RbEncoding::token_to_sequence, 1),
69
+ )?;
70
+ class.define_method("token_to_chars", method!(RbEncoding::token_to_chars, 1))?;
71
+ class.define_method("token_to_word", method!(RbEncoding::token_to_word, 1))?;
72
+ class.define_method("_char_to_token", method!(RbEncoding::char_to_token, 2))?;
73
+ class.define_method("_char_to_word", method!(RbEncoding::char_to_word, 2))?;
50
74
 
51
75
  let class = module.define_class("BPEDecoder", Default::default())?;
52
76
  class.define_singleton_method("new", function!(RbBPEDecoder::new, 0))?;
@@ -36,10 +36,16 @@ impl RbTokenizer {
36
36
  // TODO return self
37
37
  }
38
38
 
39
- pub fn encode(&self, text: String) -> RbResult<RbEncoding> {
39
+ pub fn add_tokens(&self, tokens: Vec<String>) {
40
+ let tokens: Vec<AddedToken> = tokens.iter().map(|t| AddedToken::from(t, true)).collect();
41
+ self.tokenizer.borrow_mut().add_tokens(&tokens);
42
+ // TODO return self
43
+ }
44
+
45
+ pub fn encode(&self, sequence: String, add_special_tokens: bool) -> RbResult<RbEncoding> {
40
46
  self.tokenizer
41
47
  .borrow()
42
- .encode(text, false)
48
+ .encode(sequence, add_special_tokens)
43
49
  .map(|v| RbEncoding { encoding: v })
44
50
  .map_err(RbError::from)
45
51
  }
@@ -68,4 +74,12 @@ impl RbTokenizer {
68
74
  .borrow_mut()
69
75
  .with_normalizer(normalizer.normalizer);
70
76
  }
77
+
78
+ pub fn token_to_id(&self, token: String) -> Option<u32> {
79
+ self.tokenizer.borrow().token_to_id(&token)
80
+ }
81
+
82
+ pub fn id_to_token(&self, id: u32) -> Option<String> {
83
+ self.tokenizer.borrow().id_to_token(id)
84
+ }
71
85
  }
@@ -8,8 +8,8 @@ module Tokenizers
8
8
  @tokenizer.decoder = BPEDecoder.new
9
9
  end
10
10
 
11
- def encode(text)
12
- @tokenizer.encode(text)
11
+ def encode(text, **options)
12
+ @tokenizer.encode(text, **options)
13
13
  end
14
14
 
15
15
  def decode(ids)
@@ -0,0 +1,19 @@
1
+ module Tokenizers
2
+ class Encoding
3
+ def word_to_tokens(word_index, sequence_index = 0)
4
+ _word_to_tokens(word_index, sequence_index)
5
+ end
6
+
7
+ def word_to_chars(word_index, sequence_index = 0)
8
+ _word_to_chars(word_index, sequence_index)
9
+ end
10
+
11
+ def char_to_token(char_pos, sequence_index = 0)
12
+ _char_to_token(char_pos, sequence_index)
13
+ end
14
+
15
+ def char_to_word(char_pos, sequence_index = 0)
16
+ _char_to_word(char_pos, sequence_index)
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,12 @@
1
+ module Tokenizers
2
+ class Tokenizer
3
+ # TODO change add_special_tokens default to true in 0.3.0
4
+ def encode(sequence, add_special_tokens: nil)
5
+ if add_special_tokens.nil?
6
+ warn "[tokenizers] add_special_tokens will default to true in 0.3.0. Pass add_special_tokens: true/false to silence this warning."
7
+ add_special_tokens = false
8
+ end
9
+ _encode(sequence, add_special_tokens)
10
+ end
11
+ end
12
+ end
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.2.2"
2
+ VERSION = "0.2.3"
3
3
  end
data/lib/tokenizers.rb CHANGED
@@ -1,14 +1,16 @@
1
1
  # ext
2
2
  begin
3
- require "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
3
+ require_relative "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
4
4
  rescue LoadError
5
- require "tokenizers/tokenizers"
5
+ require_relative "tokenizers/tokenizers"
6
6
  end
7
7
 
8
8
  # modules
9
- require "tokenizers/char_bpe_tokenizer"
10
- require "tokenizers/from_pretrained"
11
- require "tokenizers/version"
9
+ require_relative "tokenizers/char_bpe_tokenizer"
10
+ require_relative "tokenizers/encoding"
11
+ require_relative "tokenizers/from_pretrained"
12
+ require_relative "tokenizers/tokenizer"
13
+ require_relative "tokenizers/version"
12
14
 
13
15
  module Tokenizers
14
16
  class Error < StandardError; end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.2
4
+ version: 0.2.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-15 00:00:00.000000000 Z
11
+ date: 2023-01-22 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rb_sys
@@ -48,7 +48,9 @@ files:
48
48
  - ext/tokenizers/src/tokenizer.rs
49
49
  - lib/tokenizers.rb
50
50
  - lib/tokenizers/char_bpe_tokenizer.rb
51
+ - lib/tokenizers/encoding.rb
51
52
  - lib/tokenizers/from_pretrained.rb
53
+ - lib/tokenizers/tokenizer.rb
52
54
  - lib/tokenizers/version.rb
53
55
  homepage: https://github.com/ankane/tokenizers-ruby
54
56
  licenses: