tokenizers 0.2.1 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/Cargo.lock +125 -1253
- data/Cargo.toml +0 -5
- data/README.md +1 -1
- data/ext/tokenizers/Cargo.toml +4 -3
- data/ext/tokenizers/src/encoding.rs +77 -3
- data/ext/tokenizers/src/lib.rs +26 -5
- data/ext/tokenizers/src/tokenizer.rs +20 -20
- data/lib/tokenizers/char_bpe_tokenizer.rb +2 -2
- data/lib/tokenizers/encoding.rb +19 -0
- data/lib/tokenizers/from_pretrained.rb +119 -0
- data/lib/tokenizers/tokenizer.rb +12 -0
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +8 -7
- metadata +5 -2
data/Cargo.toml
CHANGED
@@ -3,8 +3,3 @@ members = ["ext/tokenizers"]
|
|
3
3
|
|
4
4
|
[profile.release]
|
5
5
|
strip = true
|
6
|
-
|
7
|
-
[patch.crates-io]
|
8
|
-
number_prefix = { git = "https://github.com/ankane/rust-number-prefix", branch = "license-file" }
|
9
|
-
rb-sys-env = { git = "https://github.com/oxidize-rb/rb-sys" }
|
10
|
-
tokenizers = { git = "https://github.com/huggingface/tokenizers" }
|
data/README.md
CHANGED
data/ext/tokenizers/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "tokenizers"
|
3
|
-
version = "0.2.1"
|
3
|
+
version = "0.2.3"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -13,6 +13,7 @@ crate-type = ["cdylib"]
|
|
13
13
|
magnus = "0.4"
|
14
14
|
|
15
15
|
[dependencies.tokenizers]
|
16
|
-
version = "0.13.2"
|
16
|
+
version = "0.13.2" # also update in from_pretrained.rb
|
17
|
+
git = "https://github.com/huggingface/tokenizers"
|
17
18
|
default-features = false
|
18
|
-
features = ["progressbar", "http", "onig", "esaxx_fast"]
|
19
|
+
features = ["progressbar", "onig", "esaxx_fast"]
|
@@ -1,16 +1,90 @@
|
|
1
|
-
use tk::Encoding;
|
1
|
+
use tk::{Encoding, Offsets};
|
2
2
|
|
3
3
|
#[magnus::wrap(class = "Tokenizers::Encoding")]
|
4
|
+
#[repr(transparent)]
|
4
5
|
pub struct RbEncoding {
|
5
6
|
pub encoding: Encoding,
|
6
7
|
}
|
7
8
|
|
9
|
+
impl From<Encoding> for RbEncoding {
|
10
|
+
fn from(v: Encoding) -> Self {
|
11
|
+
Self { encoding: v }
|
12
|
+
}
|
13
|
+
}
|
14
|
+
|
8
15
|
impl RbEncoding {
|
16
|
+
pub fn n_sequences(&self) -> usize {
|
17
|
+
self.encoding.n_sequences()
|
18
|
+
}
|
19
|
+
|
9
20
|
pub fn ids(&self) -> Vec<u32> {
|
10
|
-
self.encoding.get_ids().
|
21
|
+
self.encoding.get_ids().to_vec()
|
11
22
|
}
|
12
23
|
|
13
24
|
pub fn tokens(&self) -> Vec<String> {
|
14
|
-
self.encoding.get_tokens().
|
25
|
+
self.encoding.get_tokens().to_vec()
|
26
|
+
}
|
27
|
+
|
28
|
+
pub fn word_ids(&self) -> Vec<Option<u32>> {
|
29
|
+
self.encoding.get_word_ids().to_vec()
|
30
|
+
}
|
31
|
+
|
32
|
+
pub fn sequence_ids(&self) -> Vec<Option<usize>> {
|
33
|
+
self.encoding.get_sequence_ids()
|
34
|
+
}
|
35
|
+
|
36
|
+
pub fn type_ids(&self) -> Vec<u32> {
|
37
|
+
self.encoding.get_type_ids().to_vec()
|
38
|
+
}
|
39
|
+
|
40
|
+
pub fn offsets(&self) -> Vec<(usize, usize)> {
|
41
|
+
self.encoding.get_offsets().to_vec()
|
42
|
+
}
|
43
|
+
|
44
|
+
pub fn special_tokens_mask(&self) -> Vec<u32> {
|
45
|
+
self.encoding.get_special_tokens_mask().to_vec()
|
46
|
+
}
|
47
|
+
|
48
|
+
pub fn attention_mask(&self) -> Vec<u32> {
|
49
|
+
self.encoding.get_attention_mask().to_vec()
|
50
|
+
}
|
51
|
+
|
52
|
+
pub fn overflowing(&self) -> Vec<Self> {
|
53
|
+
self.encoding
|
54
|
+
.get_overflowing()
|
55
|
+
.clone()
|
56
|
+
.into_iter()
|
57
|
+
.map(|e| e.into())
|
58
|
+
.collect()
|
59
|
+
}
|
60
|
+
|
61
|
+
pub fn word_to_tokens(&self, word_index: u32, sequence_index: usize) -> Option<(usize, usize)> {
|
62
|
+
self.encoding.word_to_tokens(word_index, sequence_index)
|
63
|
+
}
|
64
|
+
|
65
|
+
pub fn word_to_chars(&self, word_index: u32, sequence_index: usize) -> Option<Offsets> {
|
66
|
+
self.encoding.word_to_chars(word_index, sequence_index)
|
67
|
+
}
|
68
|
+
|
69
|
+
pub fn token_to_sequence(&self, token_index: usize) -> Option<usize> {
|
70
|
+
self.encoding.token_to_sequence(token_index)
|
71
|
+
}
|
72
|
+
|
73
|
+
/// Looks up the offsets for the token at `token_index`.
///
/// The wrapped encoding returns a pair; only its second element (the
/// offsets) is exposed. `None` from the wrapped call is passed through.
pub fn token_to_chars(&self, token_index: usize) -> Option<Offsets> {
    self.encoding
        .token_to_chars(token_index)
        .map(|(_, span)| span)
}
|
77
|
+
|
78
|
+
pub fn token_to_word(&self, token_index: usize) -> Option<u32> {
|
79
|
+
let (_, word_idx) = self.encoding.token_to_word(token_index)?;
|
80
|
+
Some(word_idx)
|
81
|
+
}
|
82
|
+
|
83
|
+
pub fn char_to_token(&self, char_pos: usize, sequence_index: usize) -> Option<usize> {
|
84
|
+
self.encoding.char_to_token(char_pos, sequence_index)
|
85
|
+
}
|
86
|
+
|
87
|
+
pub fn char_to_word(&self, char_pos: usize, sequence_index: usize) -> Option<u32> {
|
88
|
+
self.encoding.char_to_word(char_pos, sequence_index)
|
15
89
|
}
|
16
90
|
}
|
data/ext/tokenizers/src/lib.rs
CHANGED
@@ -27,10 +27,7 @@ fn module() -> RModule {
|
|
27
27
|
#[magnus::init]
|
28
28
|
fn init() -> RbResult<()> {
|
29
29
|
let module = module();
|
30
|
-
module.define_singleton_method(
|
31
|
-
"_from_pretrained",
|
32
|
-
function!(RbTokenizer::from_pretrained, 3),
|
33
|
-
)?;
|
30
|
+
module.define_singleton_method("from_file", function!(RbTokenizer::from_file, 1))?;
|
34
31
|
|
35
32
|
let class = module.define_class("BPE", Default::default())?;
|
36
33
|
class.define_singleton_method("new", function!(RbBPE::new, 2))?;
|
@@ -41,15 +38,39 @@ fn init() -> RbResult<()> {
|
|
41
38
|
"add_special_tokens",
|
42
39
|
method!(RbTokenizer::add_special_tokens, 1),
|
43
40
|
)?;
|
44
|
-
class.define_method("
|
41
|
+
class.define_method("add_tokens", method!(RbTokenizer::add_tokens, 1))?;
|
42
|
+
class.define_method("_encode", method!(RbTokenizer::encode, 2))?;
|
45
43
|
class.define_method("decode", method!(RbTokenizer::decode, 1))?;
|
46
44
|
class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
|
47
45
|
class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
|
48
46
|
class.define_method("normalizer=", method!(RbTokenizer::set_normalizer, 1))?;
|
47
|
+
class.define_method("token_to_id", method!(RbTokenizer::token_to_id, 1))?;
|
48
|
+
class.define_method("id_to_token", method!(RbTokenizer::id_to_token, 1))?;
|
49
49
|
|
50
50
|
let class = module.define_class("Encoding", Default::default())?;
|
51
|
+
class.define_method("n_sequences", method!(RbEncoding::n_sequences, 0))?;
|
51
52
|
class.define_method("ids", method!(RbEncoding::ids, 0))?;
|
52
53
|
class.define_method("tokens", method!(RbEncoding::tokens, 0))?;
|
54
|
+
class.define_method("word_ids", method!(RbEncoding::word_ids, 0))?;
|
55
|
+
class.define_method("sequence_ids", method!(RbEncoding::sequence_ids, 0))?;
|
56
|
+
class.define_method("type_ids", method!(RbEncoding::type_ids, 0))?;
|
57
|
+
class.define_method("offsets", method!(RbEncoding::offsets, 0))?;
|
58
|
+
class.define_method(
|
59
|
+
"special_tokens_mask",
|
60
|
+
method!(RbEncoding::special_tokens_mask, 0),
|
61
|
+
)?;
|
62
|
+
class.define_method("attention_mask", method!(RbEncoding::attention_mask, 0))?;
|
63
|
+
class.define_method("overflowing", method!(RbEncoding::overflowing, 0))?;
|
64
|
+
class.define_method("_word_to_tokens", method!(RbEncoding::word_to_tokens, 2))?;
|
65
|
+
class.define_method("_word_to_chars", method!(RbEncoding::word_to_chars, 2))?;
|
66
|
+
class.define_method(
|
67
|
+
"token_to_sequence",
|
68
|
+
method!(RbEncoding::token_to_sequence, 1),
|
69
|
+
)?;
|
70
|
+
class.define_method("token_to_chars", method!(RbEncoding::token_to_chars, 1))?;
|
71
|
+
class.define_method("token_to_word", method!(RbEncoding::token_to_word, 1))?;
|
72
|
+
class.define_method("_char_to_token", method!(RbEncoding::char_to_token, 2))?;
|
73
|
+
class.define_method("_char_to_word", method!(RbEncoding::char_to_word, 2))?;
|
53
74
|
|
54
75
|
let class = module.define_class("BPEDecoder", Default::default())?;
|
55
76
|
class.define_singleton_method("new", function!(RbBPEDecoder::new, 0))?;
|
@@ -1,5 +1,5 @@
|
|
1
|
-
use magnus::Module;
|
2
1
|
use std::cell::RefCell;
|
2
|
+
use std::path::PathBuf;
|
3
3
|
use tk::tokenizer::Tokenizer;
|
4
4
|
use tk::AddedToken;
|
5
5
|
|
@@ -8,7 +8,7 @@ use super::encoding::RbEncoding;
|
|
8
8
|
use super::models::RbBPE;
|
9
9
|
use super::normalizers::RbBertNormalizer;
|
10
10
|
use super::pre_tokenizers::RbBertPreTokenizer;
|
11
|
-
use super::{module, RbError, RbResult};
|
11
|
+
use super::{RbError, RbResult};
|
12
12
|
|
13
13
|
#[magnus::wrap(class = "Tokenizers::Tokenizer")]
|
14
14
|
pub struct RbTokenizer {
|
@@ -22,22 +22,8 @@ impl RbTokenizer {
|
|
22
22
|
}
|
23
23
|
}
|
24
24
|
|
25
|
-
pub fn from_pretrained(
|
26
|
-
identifier: String,
27
|
-
revision: String,
|
28
|
-
auth_token: Option<String>,
|
29
|
-
) -> RbResult<Self> {
|
30
|
-
let version = module().const_get("VERSION").unwrap();
|
31
|
-
let params = tk::FromPretrainedParameters {
|
32
|
-
revision,
|
33
|
-
auth_token,
|
34
|
-
user_agent: [("bindings", "Ruby".to_string()), ("version", version)]
|
35
|
-
.iter()
|
36
|
-
.map(|(k, v)| (k.to_string(), v.to_string()))
|
37
|
-
.collect(),
|
38
|
-
};
|
39
|
-
|
40
|
-
Tokenizer::from_pretrained(identifier, Some(params))
|
25
|
+
pub fn from_file(path: PathBuf) -> RbResult<Self> {
|
26
|
+
Tokenizer::from_file(path)
|
41
27
|
.map(|v| RbTokenizer {
|
42
28
|
tokenizer: RefCell::new(v),
|
43
29
|
})
|
@@ -50,10 +36,16 @@ impl RbTokenizer {
|
|
50
36
|
// TODO return self
|
51
37
|
}
|
52
38
|
|
53
|
-
pub fn
|
39
|
+
pub fn add_tokens(&self, tokens: Vec<String>) {
|
40
|
+
let tokens: Vec<AddedToken> = tokens.iter().map(|t| AddedToken::from(t, true)).collect();
|
41
|
+
self.tokenizer.borrow_mut().add_tokens(&tokens);
|
42
|
+
// TODO return self
|
43
|
+
}
|
44
|
+
|
45
|
+
pub fn encode(&self, sequence: String, add_special_tokens: bool) -> RbResult<RbEncoding> {
|
54
46
|
self.tokenizer
|
55
47
|
.borrow()
|
56
|
-
.encode(
|
48
|
+
.encode(sequence, add_special_tokens)
|
57
49
|
.map(|v| RbEncoding { encoding: v })
|
58
50
|
.map_err(RbError::from)
|
59
51
|
}
|
@@ -82,4 +74,12 @@ impl RbTokenizer {
|
|
82
74
|
.borrow_mut()
|
83
75
|
.with_normalizer(normalizer.normalizer);
|
84
76
|
}
|
77
|
+
|
78
|
+
pub fn token_to_id(&self, token: String) -> Option<u32> {
|
79
|
+
self.tokenizer.borrow().token_to_id(&token)
|
80
|
+
}
|
81
|
+
|
82
|
+
pub fn id_to_token(&self, id: u32) -> Option<String> {
|
83
|
+
self.tokenizer.borrow().id_to_token(id)
|
84
|
+
}
|
85
85
|
}
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Tokenizers
  # Ruby-side convenience wrappers for Encoding.
  #
  # Each public method supplies a default sequence_index of 0 and forwards
  # its arguments to the underscore-prefixed method of the same name.
  class Encoding
    # Forwards to _word_to_tokens(word_index, sequence_index).
    def word_to_tokens(word_index, sequence_index = 0)
      _word_to_tokens(word_index, sequence_index)
    end

    # Forwards to _word_to_chars(word_index, sequence_index).
    def word_to_chars(word_index, sequence_index = 0)
      _word_to_chars(word_index, sequence_index)
    end

    # Forwards to _char_to_token(char_pos, sequence_index).
    def char_to_token(char_pos, sequence_index = 0)
      _char_to_token(char_pos, sequence_index)
    end

    # Forwards to _char_to_word(char_pos, sequence_index).
    def char_to_word(char_pos, sequence_index = 0)
      # Pass the char_pos parameter; referencing word_index here would raise
      # NameError because no such local exists in this method.
      _char_to_word(char_pos, sequence_index)
    end
  end
end
|
@@ -0,0 +1,119 @@
|
|
1
|
+
module Tokenizers
  # Downloads a model's tokenizer.json from huggingface.co into a local cache
  # directory and loads it with from_file.
  module FromPretrained
    # for user agent
    TOKENIZERS_VERSION = "0.13.2"

    # use Ruby for downloads
    # this avoids the need to vendor OpenSSL on Linux
    # and reduces the extension size by about half
    #
    # identifier - model identifier, interpolated (CGI-escaped) into the URL
    # revision   - revision, interpolated (CGI-escaped) into the URL
    # auth_token - when given, sent as an "Authorization: Bearer" header
    #
    # Returns whatever from_file returns for the cached file path.
    # Raises Error when the download fails with an OpenURI::HTTPError.
    def from_pretrained(identifier, revision: "main", auth_token: nil)
      require "cgi"
      require "digest"
      require "fileutils"
      require "json"
      require "open-uri"

      cache_dir = ensure_cache_dir

      # string options are headers
      options = {
        open_timeout: 3,
        read_timeout: 30,
        "User-Agent" => "tokenizers/#{TOKENIZERS_VERSION}; bindings/Ruby; version/#{VERSION}"
      }
      if auth_token
        options["Authorization"] = "Bearer #{auth_token}"
      end

      url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }

      path =
        begin
          cached_path(cache_dir, url, options)
        rescue OpenURI::HTTPError
          raise Error, "Model \"#{identifier}\" on the Hub doesn't have a tokenizer"
        end

      from_file(path)
    end

    private

    # use same storage format as Rust version
    # https://github.com/epwalsh/rust-cached-path
    #
    # Returns the path of the cached copy of url inside cache_dir, downloading
    # it first unless the server answers 304 to a conditional request.
    # Mutates options (adds If-None-Match and :content_length_proc).
    def cached_path(cache_dir, url, options)
      fsum = Digest::SHA256.hexdigest(url)
      meta_paths = Dir[File.join(cache_dir, "#{fsum}.*.meta")]
      # newest metadata entry for this URL, if any
      meta = meta_paths.map { |f| JSON.load_file(f) }.max_by { |m| m["creation_time"] }
      etag = meta["etag"] if meta

      if etag
        esum = Digest::SHA256.hexdigest(etag)
        resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
        # only make the request conditional when the cached file is still present
        options["If-None-Match"] = etag if File.exist?(resource_path)
      end

      options[:content_length_proc] = -> (_) { puts "Downloading..." }

      tempfile =
        begin
          URI.open(url, options)
        rescue OpenURI::HTTPError => e
          # Check the numeric status code rather than the full message: the
          # reason phrase after the code is supplied by the server and is not
          # guaranteed to read "Not Modified".
          raise unless e.io.status.first == "304"

          return resource_path
        end

      begin
        etag = tempfile.meta["etag"]
        esum = Digest::SHA256.hexdigest(etag)
        resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
        meta_path = "#{resource_path}.meta"

        meta = {
          resource: url,
          resource_path: resource_path,
          meta_path: meta_path,
          etag: etag,
          expires: nil,
          creation_time: Time.now.to_f
        }

        File.write("#{resource_path}.lock", "")
        File.open(resource_path, "wb") { |f| IO.copy_stream(tempfile, f) }
        File.write(meta_path, JSON.generate(meta))
      ensure
        # release the download handle instead of leaving it open until GC
        tempfile.close
      end

      resource_path
    end

    # Returns ENV["TOKENIZERS_CACHE"] when set, otherwise
    # <platform cache dir>/huggingface/tokenizers.
    def cache_dir
      if ENV["TOKENIZERS_CACHE"]
        ENV["TOKENIZERS_CACHE"]
      else
        # use same directory as Rust version
        # https://docs.rs/dirs/latest/dirs/fn.cache_dir.html
        dir =
          if Gem.win_platform?
            ENV.fetch("LOCALAPPDATA")
          elsif mac?
            File.join(ENV.fetch("HOME"), "Library", "Caches")
          else
            ENV["XDG_CACHE_HOME"] || File.join(ENV.fetch("HOME"), ".cache")
          end

        File.join(dir, "huggingface", "tokenizers")
      end
    end

    # Creates the cache directory if needed and returns its path.
    def ensure_cache_dir
      dir = cache_dir
      FileUtils.mkdir_p(dir)
      dir
    end

    # Truthy when RbConfig's host_os matches /darwin/i.
    def mac?
      RbConfig::CONFIG["host_os"] =~ /darwin/i
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
module Tokenizers
  class Tokenizer
    # TODO change add_special_tokens default to true in 0.3.0
    #
    # Forwards to _encode. When add_special_tokens is not given (nil), a
    # deprecation warning is emitted and false is used.
    def encode(sequence, add_special_tokens: nil)
      # explicit true/false: forward as-is, no warning
      return _encode(sequence, add_special_tokens) unless add_special_tokens.nil?

      warn "[tokenizers] add_special_tokens will default to true in 0.3.0. Pass add_special_tokens: true/false to silence this warning."
      _encode(sequence, false)
    end
  end
end
|
data/lib/tokenizers/version.rb
CHANGED
data/lib/tokenizers.rb
CHANGED
@@ -1,18 +1,19 @@
|
|
1
1
|
# ext
|
2
2
|
begin
|
3
|
-
|
3
|
+
require_relative "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
|
4
4
|
rescue LoadError
|
5
|
-
|
5
|
+
require_relative "tokenizers/tokenizers"
|
6
6
|
end
|
7
7
|
|
8
8
|
# modules
|
9
|
-
|
10
|
-
|
9
|
+
require_relative "tokenizers/char_bpe_tokenizer"
|
10
|
+
require_relative "tokenizers/encoding"
|
11
|
+
require_relative "tokenizers/from_pretrained"
|
12
|
+
require_relative "tokenizers/tokenizer"
|
13
|
+
require_relative "tokenizers/version"
|
11
14
|
|
12
15
|
module Tokenizers
|
13
16
|
class Error < StandardError; end
|
14
17
|
|
15
|
-
|
16
|
-
_from_pretrained(identifier, revision, auth_token)
|
17
|
-
end
|
18
|
+
extend FromPretrained
|
18
19
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-01-
|
11
|
+
date: 2023-01-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -48,6 +48,9 @@ files:
|
|
48
48
|
- ext/tokenizers/src/tokenizer.rs
|
49
49
|
- lib/tokenizers.rb
|
50
50
|
- lib/tokenizers/char_bpe_tokenizer.rb
|
51
|
+
- lib/tokenizers/encoding.rb
|
52
|
+
- lib/tokenizers/from_pretrained.rb
|
53
|
+
- lib/tokenizers/tokenizer.rb
|
51
54
|
- lib/tokenizers/version.rb
|
52
55
|
homepage: https://github.com/ankane/tokenizers-ruby
|
53
56
|
licenses:
|