tokenizers 0.2.1-x86_64-linux → 0.2.2-x86_64-linux
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/Cargo.lock +124 -1252
- data/Cargo.toml +0 -5
- data/LICENSE-THIRD-PARTY.txt +6795 -23773
- data/README.md +1 -1
- data/lib/tokenizers/2.7/tokenizers.so +0 -0
- data/lib/tokenizers/3.0/tokenizers.so +0 -0
- data/lib/tokenizers/3.1/tokenizers.so +0 -0
- data/lib/tokenizers/3.2/tokenizers.so +0 -0
- data/lib/tokenizers/from_pretrained.rb +119 -0
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +2 -3
- metadata +3 -2
data/README.md
CHANGED
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -0,0 +1,119 @@
|
|
1
|
+
module Tokenizers
  module FromPretrained
    # for user agent
    TOKENIZERS_VERSION = "0.13.2"

    # use Ruby for downloads
    # this avoids the need to vendor OpenSSL on Linux
    # and reduces the extension size by about half
    #
    # Downloads tokenizer.json for +identifier+ from the Hugging Face Hub
    # (with local caching) and builds a tokenizer from it via +from_file+.
    #
    # identifier - model id on the Hub, e.g. "bert-base-cased"
    # revision   - branch, tag, or commit SHA (default: "main")
    # auth_token - optional Hub token, sent as a Bearer Authorization header
    #
    # Raises Tokenizers::Error when the model has no tokenizer.json.
    def from_pretrained(identifier, revision: "main", auth_token: nil)
      # lazy-load: only needed when a download actually happens
      require "cgi"
      require "digest"
      require "fileutils"
      require "json"
      require "open-uri"

      cache_dir = ensure_cache_dir

      # string options are headers
      options = {
        open_timeout: 3,
        read_timeout: 30,
        "User-Agent" => "tokenizers/#{TOKENIZERS_VERSION}; bindings/Ruby; version/#{VERSION}"
      }
      options["Authorization"] = "Bearer #{auth_token}" if auth_token

      url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }

      path =
        begin
          cached_path(cache_dir, url, options)
        rescue OpenURI::HTTPError
          raise Error, "Model \"#{identifier}\" on the Hub doesn't have a tokenizer"
        end

      from_file(path)
    end

    private

    # use same storage format as Rust version
    # https://github.com/epwalsh/rust-cached-path
    #
    # Returns the local path of the cached copy of +url+, downloading it
    # unless the cached copy is still current (validated via ETag and
    # If-None-Match). Mutates +options+ (adds conditional-request header
    # and progress proc); callers pass a throwaway hash.
    def cached_path(cache_dir, url, options)
      fsum = Digest::SHA256.hexdigest(url)
      meta_paths = Dir[File.join(cache_dir, "#{fsum}.*.meta")]
      # pick the most recently created metadata record for this URL, if any
      meta = meta_paths.map { |f| JSON.load_file(f) }.max_by { |m| m["creation_time"] }
      etag = meta["etag"] if meta

      if etag
        esum = Digest::SHA256.hexdigest(etag)
        resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
        # ask the server to skip the body when our copy is current
        options["If-None-Match"] = etag if File.exist?(resource_path)
      end

      options[:content_length_proc] = -> (_) { puts "Downloading..." }

      tempfile =
        begin
          URI.open(url, options)
        rescue OpenURI::HTTPError => e
          if e.message == "304 Not Modified"
            # server confirmed the cached copy is still valid
            return resource_path
          else
            raise e
          end
        end

      etag = tempfile.meta["etag"]
      # fix: a response without an ETag header previously crashed with an
      # obscure TypeError from Digest; fail with a clear library error instead
      raise Error, "Missing ETag header for #{url}" unless etag
      esum = Digest::SHA256.hexdigest(etag)
      resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
      meta_path = "#{resource_path}.meta"

      meta = {
        resource: url,
        resource_path: resource_path,
        meta_path: meta_path,
        etag: etag,
        expires: nil,
        creation_time: Time.now.to_f
      }

      # rust-cached-path also creates a .lock file; keep the same layout
      File.write("#{resource_path}.lock", "")
      File.open(resource_path, "wb") { |f| IO.copy_stream(tempfile, f) }
      File.write(meta_path, JSON.generate(meta))

      resource_path
    end

    # Cache directory for downloaded tokenizers.
    # TOKENIZERS_CACHE overrides the platform default.
    def cache_dir
      if ENV["TOKENIZERS_CACHE"]
        ENV["TOKENIZERS_CACHE"]
      else
        # use same directory as Rust version
        # https://docs.rs/dirs/latest/dirs/fn.cache_dir.html
        dir =
          if Gem.win_platform?
            ENV.fetch("LOCALAPPDATA")
          elsif mac?
            File.join(ENV.fetch("HOME"), "Library", "Caches")
          else
            ENV["XDG_CACHE_HOME"] || File.join(ENV.fetch("HOME"), ".cache")
          end

        File.join(dir, "huggingface", "tokenizers")
      end
    end

    # Creates the cache directory if needed and returns its path.
    def ensure_cache_dir
      dir = cache_dir
      FileUtils.mkdir_p(dir)
      dir
    end

    # True when running on macOS (truthy match index, nil otherwise).
    def mac?
      RbConfig::CONFIG["host_os"] =~ /darwin/i
    end
  end
end
|
data/lib/tokenizers/version.rb
CHANGED
data/lib/tokenizers.rb
CHANGED
@@ -7,12 +7,11 @@ end
|
|
7
7
|
|
8
8
|
# modules
|
9
9
|
require "tokenizers/char_bpe_tokenizer"
|
10
|
+
require "tokenizers/from_pretrained"
|
10
11
|
require "tokenizers/version"
|
11
12
|
|
12
13
|
module Tokenizers
|
13
14
|
class Error < StandardError; end
|
14
15
|
|
15
|
-
|
16
|
-
_from_pretrained(identifier, revision, auth_token)
|
17
|
-
end
|
16
|
+
extend FromPretrained
|
18
17
|
end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.1
|
4
|
+
version: 0.2.2
|
5
5
|
platform: x86_64-linux
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-01-
|
11
|
+
date: 2023-01-15 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email: andrew@ankane.org
|
@@ -28,6 +28,7 @@ files:
|
|
28
28
|
- lib/tokenizers/3.1/tokenizers.so
|
29
29
|
- lib/tokenizers/3.2/tokenizers.so
|
30
30
|
- lib/tokenizers/char_bpe_tokenizer.rb
|
31
|
+
- lib/tokenizers/from_pretrained.rb
|
31
32
|
- lib/tokenizers/version.rb
|
32
33
|
homepage: https://github.com/ankane/tokenizers-ruby
|
33
34
|
licenses:
|