tokenizers 0.2.1 → 0.2.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/Cargo.lock +124 -1252
- data/Cargo.toml +0 -5
- data/README.md +1 -1
- data/ext/tokenizers/Cargo.toml +4 -3
- data/ext/tokenizers/src/lib.rs +1 -4
- data/ext/tokenizers/src/tokenizer.rs +4 -18
- data/lib/tokenizers/from_pretrained.rb +119 -0
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +2 -3
- metadata +3 -2
data/Cargo.toml
CHANGED
@@ -3,8 +3,3 @@ members = ["ext/tokenizers"]
|
|
3
3
|
|
4
4
|
[profile.release]
|
5
5
|
strip = true
|
6
|
-
|
7
|
-
[patch.crates-io]
|
8
|
-
number_prefix = { git = "https://github.com/ankane/rust-number-prefix", branch = "license-file" }
|
9
|
-
rb-sys-env = { git = "https://github.com/oxidize-rb/rb-sys" }
|
10
|
-
tokenizers = { git = "https://github.com/huggingface/tokenizers" }
|
data/README.md
CHANGED
data/ext/tokenizers/Cargo.toml
CHANGED
@@ -1,6 +1,6 @@
|
|
1
1
|
[package]
|
2
2
|
name = "tokenizers"
|
3
|
-
version = "0.2.1"
|
3
|
+
version = "0.2.2"
|
4
4
|
license = "Apache-2.0"
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
6
|
edition = "2021"
|
@@ -13,6 +13,7 @@ crate-type = ["cdylib"]
|
|
13
13
|
magnus = "0.4"
|
14
14
|
|
15
15
|
[dependencies.tokenizers]
|
16
|
-
version = "0.13.2"
|
16
|
+
version = "0.13.2" # also update in from_pretrained.rb
|
17
|
+
git = "https://github.com/huggingface/tokenizers"
|
17
18
|
default-features = false
|
18
|
-
features = ["progressbar", "http", "onig", "esaxx_fast"]
|
19
|
+
features = ["progressbar", "onig", "esaxx_fast"]
|
data/ext/tokenizers/src/lib.rs
CHANGED
@@ -27,10 +27,7 @@ fn module() -> RModule {
|
|
27
27
|
#[magnus::init]
|
28
28
|
fn init() -> RbResult<()> {
|
29
29
|
let module = module();
|
30
|
-
module.define_singleton_method(
|
31
|
-
"_from_pretrained",
|
32
|
-
function!(RbTokenizer::from_pretrained, 3),
|
33
|
-
)?;
|
30
|
+
module.define_singleton_method("from_file", function!(RbTokenizer::from_file, 1))?;
|
34
31
|
|
35
32
|
let class = module.define_class("BPE", Default::default())?;
|
36
33
|
class.define_singleton_method("new", function!(RbBPE::new, 2))?;
|
data/ext/tokenizers/src/tokenizer.rs
CHANGED
@@ -1,5 +1,5 @@
|
|
1
|
-
use magnus::Module;
|
2
1
|
use std::cell::RefCell;
|
2
|
+
use std::path::PathBuf;
|
3
3
|
use tk::tokenizer::Tokenizer;
|
4
4
|
use tk::AddedToken;
|
5
5
|
|
@@ -8,7 +8,7 @@ use super::encoding::RbEncoding;
|
|
8
8
|
use super::models::RbBPE;
|
9
9
|
use super::normalizers::RbBertNormalizer;
|
10
10
|
use super::pre_tokenizers::RbBertPreTokenizer;
|
11
|
-
use super::{module, RbError, RbResult};
|
11
|
+
use super::{RbError, RbResult};
|
12
12
|
|
13
13
|
#[magnus::wrap(class = "Tokenizers::Tokenizer")]
|
14
14
|
pub struct RbTokenizer {
|
@@ -22,22 +22,8 @@ impl RbTokenizer {
|
|
22
22
|
}
|
23
23
|
}
|
24
24
|
|
25
|
-
pub fn from_pretrained(
|
26
|
-
identifier: String,
|
27
|
-
revision: String,
|
28
|
-
auth_token: Option<String>,
|
29
|
-
) -> RbResult<Self> {
|
30
|
-
let version = module().const_get("VERSION").unwrap();
|
31
|
-
let params = tk::FromPretrainedParameters {
|
32
|
-
revision,
|
33
|
-
auth_token,
|
34
|
-
user_agent: [("bindings", "Ruby".to_string()), ("version", version)]
|
35
|
-
.iter()
|
36
|
-
.map(|(k, v)| (k.to_string(), v.to_string()))
|
37
|
-
.collect(),
|
38
|
-
};
|
39
|
-
|
40
|
-
Tokenizer::from_pretrained(identifier, Some(params))
|
25
|
+
pub fn from_file(path: PathBuf) -> RbResult<Self> {
|
26
|
+
Tokenizer::from_file(path)
|
41
27
|
.map(|v| RbTokenizer {
|
42
28
|
tokenizer: RefCell::new(v),
|
43
29
|
})
|
data/lib/tokenizers/from_pretrained.rb
ADDED
@@ -0,0 +1,119 @@
|
|
1
|
+
module Tokenizers
|
2
|
+
module FromPretrained
|
3
|
+
# for user agent
|
4
|
+
TOKENIZERS_VERSION = "0.13.2"
|
5
|
+
|
6
|
+
# use Ruby for downloads
|
7
|
+
# this avoids the need to vendor OpenSSL on Linux
|
8
|
+
# and reduces the extension size by about half
|
9
|
+
def from_pretrained(identifier, revision: "main", auth_token: nil)
|
10
|
+
require "cgi"
|
11
|
+
require "digest"
|
12
|
+
require "fileutils"
|
13
|
+
require "json"
|
14
|
+
require "open-uri"
|
15
|
+
|
16
|
+
cache_dir = ensure_cache_dir
|
17
|
+
|
18
|
+
# string options are headers
|
19
|
+
options = {
|
20
|
+
open_timeout: 3,
|
21
|
+
read_timeout: 30,
|
22
|
+
"User-Agent" => "tokenizers/#{TOKENIZERS_VERSION}; bindings/Ruby; version/#{VERSION}"
|
23
|
+
}
|
24
|
+
if auth_token
|
25
|
+
options["Authorization"] = "Bearer #{auth_token}"
|
26
|
+
end
|
27
|
+
|
28
|
+
url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }
|
29
|
+
|
30
|
+
path =
|
31
|
+
begin
|
32
|
+
cached_path(cache_dir, url, options)
|
33
|
+
rescue OpenURI::HTTPError
|
34
|
+
raise Error, "Model \"#{identifier}\" on the Hub doesn't have a tokenizer"
|
35
|
+
end
|
36
|
+
|
37
|
+
from_file(path)
|
38
|
+
end
|
39
|
+
|
40
|
+
private
|
41
|
+
|
42
|
+
# use same storage format as Rust version
|
43
|
+
# https://github.com/epwalsh/rust-cached-path
|
44
|
+
def cached_path(cache_dir, url, options)
|
45
|
+
fsum = Digest::SHA256.hexdigest(url)
|
46
|
+
meta_paths = Dir[File.join(cache_dir, "#{fsum}.*.meta")]
|
47
|
+
meta = meta_paths.map { |f| JSON.load_file(f) }.max_by { |m| m["creation_time"] }
|
48
|
+
etag = meta["etag"] if meta
|
49
|
+
|
50
|
+
if etag
|
51
|
+
esum = Digest::SHA256.hexdigest(etag)
|
52
|
+
resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
|
53
|
+
options["If-None-Match"] = etag if File.exist?(resource_path)
|
54
|
+
end
|
55
|
+
|
56
|
+
options[:content_length_proc] = -> (_) { puts "Downloading..." }
|
57
|
+
|
58
|
+
tempfile =
|
59
|
+
begin
|
60
|
+
URI.open(url, options)
|
61
|
+
rescue OpenURI::HTTPError => e
|
62
|
+
if e.message == "304 Not Modified"
|
63
|
+
return resource_path
|
64
|
+
else
|
65
|
+
raise e
|
66
|
+
end
|
67
|
+
end
|
68
|
+
|
69
|
+
etag = tempfile.meta["etag"]
|
70
|
+
esum = Digest::SHA256.hexdigest(etag)
|
71
|
+
resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
|
72
|
+
meta_path = "#{resource_path}.meta"
|
73
|
+
|
74
|
+
meta = {
|
75
|
+
resource: url,
|
76
|
+
resource_path: resource_path,
|
77
|
+
meta_path: meta_path,
|
78
|
+
etag: etag,
|
79
|
+
expires: nil,
|
80
|
+
creation_time: Time.now.to_f
|
81
|
+
}
|
82
|
+
|
83
|
+
File.write("#{resource_path}.lock", "")
|
84
|
+
File.open(resource_path, "wb") { |f| IO.copy_stream(tempfile, f) }
|
85
|
+
File.write(meta_path, JSON.generate(meta))
|
86
|
+
|
87
|
+
resource_path
|
88
|
+
end
|
89
|
+
|
90
|
+
def cache_dir
|
91
|
+
if ENV["TOKENIZERS_CACHE"]
|
92
|
+
ENV["TOKENIZERS_CACHE"]
|
93
|
+
else
|
94
|
+
# use same directory as Rust version
|
95
|
+
# https://docs.rs/dirs/latest/dirs/fn.cache_dir.html
|
96
|
+
dir =
|
97
|
+
if Gem.win_platform?
|
98
|
+
ENV.fetch("LOCALAPPDATA")
|
99
|
+
elsif mac?
|
100
|
+
File.join(ENV.fetch("HOME"), "Library", "Caches")
|
101
|
+
else
|
102
|
+
ENV["XDG_CACHE_HOME"] || File.join(ENV.fetch("HOME"), ".cache")
|
103
|
+
end
|
104
|
+
|
105
|
+
File.join(dir, "huggingface", "tokenizers")
|
106
|
+
end
|
107
|
+
end
|
108
|
+
|
109
|
+
def ensure_cache_dir
|
110
|
+
dir = cache_dir
|
111
|
+
FileUtils.mkdir_p(dir)
|
112
|
+
dir
|
113
|
+
end
|
114
|
+
|
115
|
+
def mac?
|
116
|
+
RbConfig::CONFIG["host_os"] =~ /darwin/i
|
117
|
+
end
|
118
|
+
end
|
119
|
+
end
|
data/lib/tokenizers/version.rb
CHANGED
data/lib/tokenizers.rb
CHANGED
@@ -7,12 +7,11 @@ end
|
|
7
7
|
|
8
8
|
# modules
|
9
9
|
require "tokenizers/char_bpe_tokenizer"
|
10
|
+
require "tokenizers/from_pretrained"
|
10
11
|
require "tokenizers/version"
|
11
12
|
|
12
13
|
module Tokenizers
|
13
14
|
class Error < StandardError; end
|
14
15
|
|
15
|
-
def self.from_pretrained(identifier, revision: "main", auth_token: nil)
|
16
|
-
_from_pretrained(identifier, revision, auth_token)
|
17
|
-
end
|
16
|
+
extend FromPretrained
|
18
17
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-01-
|
11
|
+
date: 2023-01-15 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -48,6 +48,7 @@ files:
|
|
48
48
|
- ext/tokenizers/src/tokenizer.rs
|
49
49
|
- lib/tokenizers.rb
|
50
50
|
- lib/tokenizers/char_bpe_tokenizer.rb
|
51
|
+
- lib/tokenizers/from_pretrained.rb
|
51
52
|
- lib/tokenizers/version.rb
|
52
53
|
homepage: https://github.com/ankane/tokenizers-ruby
|
53
54
|
licenses:
|