tokenizers 0.4.1-x86_64-darwin → 0.4.2-x86_64-darwin
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Cargo.lock +70 -103
- data/Cargo.toml +1 -0
- data/LICENSE-THIRD-PARTY.txt +358 -1526
- data/lib/tokenizers/3.0/tokenizers.bundle +0 -0
- data/lib/tokenizers/3.1/tokenizers.bundle +0 -0
- data/lib/tokenizers/3.2/tokenizers.bundle +0 -0
- data/lib/tokenizers/from_pretrained.rb +23 -17
- data/lib/tokenizers/version.rb +1 -1
- metadata +2 -2
Binary file
|
Binary file
|
Binary file
|
@@ -1,7 +1,7 @@
|
|
1
1
|
module Tokenizers
|
2
2
|
module FromPretrained
|
3
3
|
# for user agent
|
4
|
-
TOKENIZERS_VERSION = "0.
|
4
|
+
TOKENIZERS_VERSION = "0.15.0"
|
5
5
|
|
6
6
|
# use Ruby for downloads
|
7
7
|
# this avoids the need to vendor OpenSSL on Linux
|
@@ -11,25 +11,27 @@ module Tokenizers
|
|
11
11
|
require "digest"
|
12
12
|
require "fileutils"
|
13
13
|
require "json"
|
14
|
+
require "net/http"
|
14
15
|
require "open-uri"
|
15
16
|
|
16
17
|
cache_dir = ensure_cache_dir
|
17
18
|
|
18
|
-
# string options are headers
|
19
19
|
options = {
|
20
20
|
open_timeout: 3,
|
21
|
-
read_timeout: 30
|
21
|
+
read_timeout: 30
|
22
|
+
}
|
23
|
+
headers = {
|
22
24
|
"User-Agent" => "tokenizers/#{TOKENIZERS_VERSION}; bindings/Ruby; version/#{VERSION}"
|
23
25
|
}
|
24
26
|
if auth_token
|
25
|
-
|
27
|
+
headers["Authorization"] = "Bearer #{auth_token}"
|
26
28
|
end
|
27
29
|
|
28
30
|
url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }
|
29
31
|
|
30
32
|
path =
|
31
33
|
begin
|
32
|
-
cached_path(cache_dir, url, options)
|
34
|
+
cached_path(cache_dir, url, headers, options)
|
33
35
|
rescue OpenURI::HTTPError
|
34
36
|
raise Error, "Model \"#{identifier}\" on the Hub doesn't have a tokenizer"
|
35
37
|
end
|
@@ -41,7 +43,7 @@ module Tokenizers
|
|
41
43
|
|
42
44
|
# use same storage format as Rust version
|
43
45
|
# https://github.com/epwalsh/rust-cached-path
|
44
|
-
def cached_path(cache_dir, url, options)
|
46
|
+
def cached_path(cache_dir, url, headers, options)
|
45
47
|
fsum = Digest::SHA256.hexdigest(url)
|
46
48
|
meta_paths = Dir[File.join(cache_dir, "#{fsum}.*.meta")]
|
47
49
|
meta = meta_paths.map { |f| JSON.parse(File.read(f)) }.max_by { |m| m["creation_time"] }
|
@@ -50,21 +52,25 @@ module Tokenizers
|
|
50
52
|
if etag
|
51
53
|
esum = Digest::SHA256.hexdigest(etag)
|
52
54
|
resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
|
53
|
-
|
55
|
+
if File.exist?(resource_path)
|
56
|
+
uri = URI(url)
|
57
|
+
req = Net::HTTP::Head.new(uri)
|
58
|
+
headers.each do |k, v|
|
59
|
+
req[k] = v
|
60
|
+
end
|
61
|
+
res = Net::HTTP.start(uri.hostname, uri.port, options.merge(use_ssl: true)) do |http|
|
62
|
+
http.request(req)
|
63
|
+
end
|
64
|
+
if res["etag"] == etag
|
65
|
+
return resource_path
|
66
|
+
end
|
67
|
+
end
|
54
68
|
end
|
55
69
|
|
56
70
|
options[:content_length_proc] = -> (_) { puts "Downloading..." }
|
57
71
|
|
58
|
-
|
59
|
-
|
60
|
-
URI.parse(url).open(options)
|
61
|
-
rescue OpenURI::HTTPError => e
|
62
|
-
if e.message == "304 Not Modified"
|
63
|
-
return resource_path
|
64
|
-
else
|
65
|
-
raise e
|
66
|
-
end
|
67
|
-
end
|
72
|
+
# string options are headers
|
73
|
+
tempfile = URI.parse(url).open(headers.merge(options))
|
68
74
|
|
69
75
|
etag = tempfile.meta["etag"]
|
70
76
|
esum = Digest::SHA256.hexdigest(etag)
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.1
|
4
|
+
version: 0.4.2
|
5
5
|
platform: x86_64-darwin
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-11-16 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email: andrew@ankane.org
|