tokenizers 0.2.1 → 0.2.3

data/Cargo.toml CHANGED
@@ -3,8 +3,3 @@ members = ["ext/tokenizers"]
 
 [profile.release]
 strip = true
-
-[patch.crates-io]
-number_prefix = { git = "https://github.com/ankane/rust-number-prefix", branch = "license-file" }
-rb-sys-env = { git = "https://github.com/oxidize-rb/rb-sys" }
-tokenizers = { git = "https://github.com/huggingface/tokenizers" }
data/README.md CHANGED
@@ -24,8 +24,8 @@ Encode
 
 ```ruby
 encoded = tokenizer.encode("I can feel the magic, can you?")
-encoded.ids
 encoded.tokens
+encoded.ids
 ```
 
 Decode
data/ext/tokenizers/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers"
-version = "0.2.1"
+version = "0.2.3"
 license = "Apache-2.0"
 authors = ["Andrew Kane <andrew@ankane.org>"]
 edition = "2021"
@@ -13,6 +13,7 @@ crate-type = ["cdylib"]
 magnus = "0.4"
 
 [dependencies.tokenizers]
-version = "0.13.2"
+version = "0.13.2" # also update in from_pretrained.rb
+git = "https://github.com/huggingface/tokenizers"
 default-features = false
-features = ["progressbar", "http", "onig", "esaxx_fast"]
+features = ["progressbar", "onig", "esaxx_fast"]
data/ext/tokenizers/src/encoding.rs CHANGED
@@ -1,16 +1,90 @@
-use tk::Encoding;
+use tk::{Encoding, Offsets};
 
 #[magnus::wrap(class = "Tokenizers::Encoding")]
+#[repr(transparent)]
 pub struct RbEncoding {
     pub encoding: Encoding,
 }
 
+impl From<Encoding> for RbEncoding {
+    fn from(v: Encoding) -> Self {
+        Self { encoding: v }
+    }
+}
+
 impl RbEncoding {
+    pub fn n_sequences(&self) -> usize {
+        self.encoding.n_sequences()
+    }
+
     pub fn ids(&self) -> Vec<u32> {
-        self.encoding.get_ids().into()
+        self.encoding.get_ids().to_vec()
     }
 
     pub fn tokens(&self) -> Vec<String> {
-        self.encoding.get_tokens().into()
+        self.encoding.get_tokens().to_vec()
+    }
+
+    pub fn word_ids(&self) -> Vec<Option<u32>> {
+        self.encoding.get_word_ids().to_vec()
+    }
+
+    pub fn sequence_ids(&self) -> Vec<Option<usize>> {
+        self.encoding.get_sequence_ids()
+    }
+
+    pub fn type_ids(&self) -> Vec<u32> {
+        self.encoding.get_type_ids().to_vec()
+    }
+
+    pub fn offsets(&self) -> Vec<(usize, usize)> {
+        self.encoding.get_offsets().to_vec()
+    }
+
+    pub fn special_tokens_mask(&self) -> Vec<u32> {
+        self.encoding.get_special_tokens_mask().to_vec()
+    }
+
+    pub fn attention_mask(&self) -> Vec<u32> {
+        self.encoding.get_attention_mask().to_vec()
+    }
+
+    pub fn overflowing(&self) -> Vec<Self> {
+        self.encoding
+            .get_overflowing()
+            .clone()
+            .into_iter()
+            .map(|e| e.into())
+            .collect()
+    }
+
+    pub fn word_to_tokens(&self, word_index: u32, sequence_index: usize) -> Option<(usize, usize)> {
+        self.encoding.word_to_tokens(word_index, sequence_index)
+    }
+
+    pub fn word_to_chars(&self, word_index: u32, sequence_index: usize) -> Option<Offsets> {
+        self.encoding.word_to_chars(word_index, sequence_index)
+    }
+
+    pub fn token_to_sequence(&self, token_index: usize) -> Option<usize> {
+        self.encoding.token_to_sequence(token_index)
+    }
+
+    pub fn token_to_chars(&self, token_index: usize) -> Option<Offsets> {
+        let (_, offsets) = self.encoding.token_to_chars(token_index)?;
+        Some(offsets)
+    }
+
+    pub fn token_to_word(&self, token_index: usize) -> Option<u32> {
+        let (_, word_idx) = self.encoding.token_to_word(token_index)?;
+        Some(word_idx)
+    }
+
+    pub fn char_to_token(&self, char_pos: usize, sequence_index: usize) -> Option<usize> {
+        self.encoding.char_to_token(char_pos, sequence_index)
+    }
+
+    pub fn char_to_word(&self, char_pos: usize, sequence_index: usize) -> Option<u32> {
+        self.encoding.char_to_word(char_pos, sequence_index)
     }
 }
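The new getters mirror the upstream `Encoding` API. A minimal sketch of how they surface in Ruby once bound (the `tokenizer.json` path and input text are only examples; `Tokenizers.from_file` is bound in lib.rs below):

```ruby
require "tokenizers"

tokenizer = Tokenizers.from_file("tokenizer.json")
encoded = tokenizer.encode("I can feel the magic, can you?", add_special_tokens: true)

encoded.n_sequences         # number of input sequences (1 here)
encoded.word_ids            # word index per token, nil for special tokens
encoded.type_ids            # segment ids, e.g. for sentence pairs
encoded.offsets             # character offsets per token
encoded.special_tokens_mask # 1 where a token was added by the tokenizer
encoded.attention_mask      # 1 for every non-padding token
```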
data/ext/tokenizers/src/lib.rs CHANGED
@@ -27,10 +27,7 @@ fn module() -> RModule {
 #[magnus::init]
 fn init() -> RbResult<()> {
     let module = module();
-    module.define_singleton_method(
-        "_from_pretrained",
-        function!(RbTokenizer::from_pretrained, 3),
-    )?;
+    module.define_singleton_method("from_file", function!(RbTokenizer::from_file, 1))?;
 
     let class = module.define_class("BPE", Default::default())?;
     class.define_singleton_method("new", function!(RbBPE::new, 2))?;
@@ -41,15 +38,39 @@ fn init() -> RbResult<()> {
         "add_special_tokens",
         method!(RbTokenizer::add_special_tokens, 1),
     )?;
-    class.define_method("encode", method!(RbTokenizer::encode, 1))?;
+    class.define_method("add_tokens", method!(RbTokenizer::add_tokens, 1))?;
+    class.define_method("_encode", method!(RbTokenizer::encode, 2))?;
     class.define_method("decode", method!(RbTokenizer::decode, 1))?;
     class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
     class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
     class.define_method("normalizer=", method!(RbTokenizer::set_normalizer, 1))?;
+    class.define_method("token_to_id", method!(RbTokenizer::token_to_id, 1))?;
+    class.define_method("id_to_token", method!(RbTokenizer::id_to_token, 1))?;
 
     let class = module.define_class("Encoding", Default::default())?;
+    class.define_method("n_sequences", method!(RbEncoding::n_sequences, 0))?;
     class.define_method("ids", method!(RbEncoding::ids, 0))?;
     class.define_method("tokens", method!(RbEncoding::tokens, 0))?;
+    class.define_method("word_ids", method!(RbEncoding::word_ids, 0))?;
+    class.define_method("sequence_ids", method!(RbEncoding::sequence_ids, 0))?;
+    class.define_method("type_ids", method!(RbEncoding::type_ids, 0))?;
+    class.define_method("offsets", method!(RbEncoding::offsets, 0))?;
+    class.define_method(
+        "special_tokens_mask",
+        method!(RbEncoding::special_tokens_mask, 0),
+    )?;
+    class.define_method("attention_mask", method!(RbEncoding::attention_mask, 0))?;
+    class.define_method("overflowing", method!(RbEncoding::overflowing, 0))?;
+    class.define_method("_word_to_tokens", method!(RbEncoding::word_to_tokens, 2))?;
+    class.define_method("_word_to_chars", method!(RbEncoding::word_to_chars, 2))?;
+    class.define_method(
+        "token_to_sequence",
+        method!(RbEncoding::token_to_sequence, 1),
+    )?;
+    class.define_method("token_to_chars", method!(RbEncoding::token_to_chars, 1))?;
+    class.define_method("token_to_word", method!(RbEncoding::token_to_word, 1))?;
+    class.define_method("_char_to_token", method!(RbEncoding::char_to_token, 2))?;
+    class.define_method("_char_to_word", method!(RbEncoding::char_to_word, 2))?;
 
     let class = module.define_class("BPEDecoder", Default::default())?;
     class.define_singleton_method("new", function!(RbBPEDecoder::new, 0))?;
data/ext/tokenizers/src/tokenizer.rs CHANGED
@@ -1,5 +1,5 @@
-use magnus::Module;
 use std::cell::RefCell;
+use std::path::PathBuf;
 use tk::tokenizer::Tokenizer;
 use tk::AddedToken;
 
@@ -8,7 +8,7 @@ use super::encoding::RbEncoding;
 use super::models::RbBPE;
 use super::normalizers::RbBertNormalizer;
 use super::pre_tokenizers::RbBertPreTokenizer;
-use super::{module, RbError, RbResult};
+use super::{RbError, RbResult};
 
 #[magnus::wrap(class = "Tokenizers::Tokenizer")]
 pub struct RbTokenizer {
@@ -22,22 +22,8 @@ impl RbTokenizer {
         }
     }
 
-    pub fn from_pretrained(
-        identifier: String,
-        revision: String,
-        auth_token: Option<String>,
-    ) -> RbResult<Self> {
-        let version = module().const_get("VERSION").unwrap();
-        let params = tk::FromPretrainedParameters {
-            revision,
-            auth_token,
-            user_agent: [("bindings", "Ruby".to_string()), ("version", version)]
-                .iter()
-                .map(|(k, v)| (k.to_string(), v.to_string()))
-                .collect(),
-        };
-
-        Tokenizer::from_pretrained(identifier, Some(params))
+    pub fn from_file(path: PathBuf) -> RbResult<Self> {
+        Tokenizer::from_file(path)
             .map(|v| RbTokenizer {
                 tokenizer: RefCell::new(v),
             })
@@ -50,10 +36,16 @@ impl RbTokenizer {
         // TODO return self
     }
 
-    pub fn encode(&self, text: String) -> RbResult<RbEncoding> {
+    pub fn add_tokens(&self, tokens: Vec<String>) {
+        let tokens: Vec<AddedToken> = tokens.iter().map(|t| AddedToken::from(t, true)).collect();
+        self.tokenizer.borrow_mut().add_tokens(&tokens);
+        // TODO return self
+    }
+
+    pub fn encode(&self, sequence: String, add_special_tokens: bool) -> RbResult<RbEncoding> {
        self.tokenizer
            .borrow()
-            .encode(text, false)
+            .encode(sequence, add_special_tokens)
            .map(|v| RbEncoding { encoding: v })
            .map_err(RbError::from)
    }
@@ -82,4 +74,12 @@ impl RbTokenizer {
            .borrow_mut()
            .with_normalizer(normalizer.normalizer);
    }
+
+    pub fn token_to_id(&self, token: String) -> Option<u32> {
+        self.tokenizer.borrow().token_to_id(&token)
+    }
+
+    pub fn id_to_token(&self, id: u32) -> Option<String> {
+        self.tokenizer.borrow().id_to_token(id)
+    }
 }
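The new vocabulary helpers round-trip between token strings and ids. A short sketch (token strings and ids depend entirely on the loaded vocabulary):

```ruby
id = tokenizer.token_to_id("magic")  # => Integer id, or nil if not in the vocab
tokenizer.id_to_token(id)            # => "magic", or nil for an unknown id

# tokens added here are marked special (AddedToken::from(t, true) above)
tokenizer.add_tokens(["<custom>"])
```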
data/lib/tokenizers/char_bpe_tokenizer.rb CHANGED
@@ -8,8 +8,8 @@ module Tokenizers
       @tokenizer.decoder = BPEDecoder.new
     end
 
-    def encode(text)
-      @tokenizer.encode(text)
+    def encode(text, **options)
+      @tokenizer.encode(text, **options)
     end
 
     def decode(ids)
data/lib/tokenizers/encoding.rb ADDED
@@ -0,0 +1,19 @@
+module Tokenizers
+  class Encoding
+    def word_to_tokens(word_index, sequence_index = 0)
+      _word_to_tokens(word_index, sequence_index)
+    end
+
+    def word_to_chars(word_index, sequence_index = 0)
+      _word_to_chars(word_index, sequence_index)
+    end
+
+    def char_to_token(char_pos, sequence_index = 0)
+      _char_to_token(char_pos, sequence_index)
+    end
+
+    def char_to_word(char_pos, sequence_index = 0)
+      _char_to_word(char_pos, sequence_index)
+    end
+  end
+end
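These Ruby wrappers exist only to give `sequence_index` a default of 0, so single-sequence callers can omit it. For example (positions are illustrative):

```ruby
encoded.word_to_tokens(2)   # token span for word 2 in sequence 0, or nil
encoded.char_to_token(6)    # index of the token covering character 6
encoded.char_to_word(6, 0)  # same lookup with the sequence index spelled out
```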
data/lib/tokenizers/from_pretrained.rb ADDED
@@ -0,0 +1,119 @@
+module Tokenizers
+  module FromPretrained
+    # for user agent
+    TOKENIZERS_VERSION = "0.13.2"
+
+    # use Ruby for downloads
+    # this avoids the need to vendor OpenSSL on Linux
+    # and reduces the extension size by about half
+    def from_pretrained(identifier, revision: "main", auth_token: nil)
+      require "cgi"
+      require "digest"
+      require "fileutils"
+      require "json"
+      require "open-uri"
+
+      cache_dir = ensure_cache_dir
+
+      # string options are headers
+      options = {
+        open_timeout: 3,
+        read_timeout: 30,
+        "User-Agent" => "tokenizers/#{TOKENIZERS_VERSION}; bindings/Ruby; version/#{VERSION}"
+      }
+      if auth_token
+        options["Authorization"] = "Bearer #{auth_token}"
+      end
+
+      url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }
+
+      path =
+        begin
+          cached_path(cache_dir, url, options)
+        rescue OpenURI::HTTPError
+          raise Error, "Model \"#{identifier}\" on the Hub doesn't have a tokenizer"
+        end
+
+      from_file(path)
+    end
+
+    private
+
+    # use same storage format as Rust version
+    # https://github.com/epwalsh/rust-cached-path
+    def cached_path(cache_dir, url, options)
+      fsum = Digest::SHA256.hexdigest(url)
+      meta_paths = Dir[File.join(cache_dir, "#{fsum}.*.meta")]
+      meta = meta_paths.map { |f| JSON.load_file(f) }.max_by { |m| m["creation_time"] }
+      etag = meta["etag"] if meta
+
+      if etag
+        esum = Digest::SHA256.hexdigest(etag)
+        resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
+        options["If-None-Match"] = etag if File.exist?(resource_path)
+      end
+
+      options[:content_length_proc] = -> (_) { puts "Downloading..." }
+
+      tempfile =
+        begin
+          URI.open(url, options)
+        rescue OpenURI::HTTPError => e
+          if e.message == "304 Not Modified"
+            return resource_path
+          else
+            raise e
+          end
+        end
+
+      etag = tempfile.meta["etag"]
+      esum = Digest::SHA256.hexdigest(etag)
+      resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
+      meta_path = "#{resource_path}.meta"
+
+      meta = {
+        resource: url,
+        resource_path: resource_path,
+        meta_path: meta_path,
+        etag: etag,
+        expires: nil,
+        creation_time: Time.now.to_f
+      }
+
+      File.write("#{resource_path}.lock", "")
+      File.open(resource_path, "wb") { |f| IO.copy_stream(tempfile, f) }
+      File.write(meta_path, JSON.generate(meta))
+
+      resource_path
+    end
+
+    def cache_dir
+      if ENV["TOKENIZERS_CACHE"]
+        ENV["TOKENIZERS_CACHE"]
+      else
+        # use same directory as Rust version
+        # https://docs.rs/dirs/latest/dirs/fn.cache_dir.html
+        dir =
+          if Gem.win_platform?
+            ENV.fetch("LOCALAPPDATA")
+          elsif mac?
+            File.join(ENV.fetch("HOME"), "Library", "Caches")
+          else
+            ENV["XDG_CACHE_HOME"] || File.join(ENV.fetch("HOME"), ".cache")
+          end
+
+        File.join(dir, "huggingface", "tokenizers")
+      end
+    end
+
+    def ensure_cache_dir
+      dir = cache_dir
+      FileUtils.mkdir_p(dir)
+      dir
+    end
+
+    def mac?
+      RbConfig::CONFIG["host_os"] =~ /darwin/i
+    end
+  end
+end
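With `Tokenizers` extended by this module (see lib/tokenizers.rb below), downloads happen in pure Ruby. A hedged usage sketch ("bert-base-cased", "me/my-model", and the env var are placeholders):

```ruby
# first call downloads tokenizer.json from the Hub and writes
# <cache_dir>/<sha256(url)>.<sha256(etag)> plus a .meta sidecar;
# later calls send If-None-Match and reuse the file on 304 Not Modified
tokenizer = Tokenizers.from_pretrained("bert-base-cased")

# private models: pass a token; pin a revision when needed
tokenizer = Tokenizers.from_pretrained("me/my-model", revision: "main", auth_token: ENV["HF_TOKEN"])
```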
data/lib/tokenizers/tokenizer.rb ADDED
@@ -0,0 +1,12 @@
+module Tokenizers
+  class Tokenizer
+    # TODO change add_special_tokens default to true in 0.3.0
+    def encode(sequence, add_special_tokens: nil)
+      if add_special_tokens.nil?
+        warn "[tokenizers] add_special_tokens will default to true in 0.3.0. Pass add_special_tokens: true/false to silence this warning."
+        add_special_tokens = false
+      end
+      _encode(sequence, add_special_tokens)
+    end
+  end
+end
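Until the 0.3.0 default change, passing the keyword explicitly keeps behavior stable and silences the deprecation warning:

```ruby
encoded = tokenizer.encode("I can feel the magic, can you?", add_special_tokens: true)
encoded = tokenizer.encode("I can feel the magic, can you?", add_special_tokens: false) # current default
```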
data/lib/tokenizers/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Tokenizers
-  VERSION = "0.2.1"
+  VERSION = "0.2.3"
 end
data/lib/tokenizers.rb CHANGED
@@ -1,18 +1,19 @@
 # ext
 begin
-  require "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
+  require_relative "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
 rescue LoadError
-  require "tokenizers/tokenizers"
+  require_relative "tokenizers/tokenizers"
 end
 
 # modules
-require "tokenizers/char_bpe_tokenizer"
-require "tokenizers/version"
+require_relative "tokenizers/char_bpe_tokenizer"
+require_relative "tokenizers/encoding"
+require_relative "tokenizers/from_pretrained"
+require_relative "tokenizers/tokenizer"
+require_relative "tokenizers/version"
 
 module Tokenizers
   class Error < StandardError; end
 
-  def self.from_pretrained(identifier, revision: "main", auth_token: nil)
-    _from_pretrained(identifier, revision, auth_token)
-  end
+  extend FromPretrained
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.3
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-01-12 00:00:00.000000000 Z
+date: 2023-01-22 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -48,6 +48,9 @@ files:
 - ext/tokenizers/src/tokenizer.rs
 - lib/tokenizers.rb
 - lib/tokenizers/char_bpe_tokenizer.rb
+- lib/tokenizers/encoding.rb
+- lib/tokenizers/from_pretrained.rb
+- lib/tokenizers/tokenizer.rb
 - lib/tokenizers/version.rb
 homepage: https://github.com/ankane/tokenizers-ruby
 licenses: