tokenizers 0.4.0-x86_64-linux-musl → 0.4.2-x86_64-linux-musl

Sign up to get free protection for your applications and to get access to all the features.
Binary file
Binary file
Binary file
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.14.0"
4
+ TOKENIZERS_VERSION = "0.15.0"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -11,25 +11,27 @@ module Tokenizers
11
11
  require "digest"
12
12
  require "fileutils"
13
13
  require "json"
14
+ require "net/http"
14
15
  require "open-uri"
15
16
 
16
17
  cache_dir = ensure_cache_dir
17
18
 
18
- # string options are headers
19
19
  options = {
20
20
  open_timeout: 3,
21
- read_timeout: 30,
21
+ read_timeout: 30
22
+ }
23
+ headers = {
22
24
  "User-Agent" => "tokenizers/#{TOKENIZERS_VERSION}; bindings/Ruby; version/#{VERSION}"
23
25
  }
24
26
  if auth_token
25
- options["Authorization"] = "Bearer #{auth_token}"
27
+ headers["Authorization"] = "Bearer #{auth_token}"
26
28
  end
27
29
 
28
30
  url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }
29
31
 
30
32
  path =
31
33
  begin
32
- cached_path(cache_dir, url, options)
34
+ cached_path(cache_dir, url, headers, options)
33
35
  rescue OpenURI::HTTPError
34
36
  raise Error, "Model \"#{identifier}\" on the Hub doesn't have a tokenizer"
35
37
  end
@@ -41,7 +43,7 @@ module Tokenizers
41
43
 
42
44
  # use same storage format as Rust version
43
45
  # https://github.com/epwalsh/rust-cached-path
44
- def cached_path(cache_dir, url, options)
46
+ def cached_path(cache_dir, url, headers, options)
45
47
  fsum = Digest::SHA256.hexdigest(url)
46
48
  meta_paths = Dir[File.join(cache_dir, "#{fsum}.*.meta")]
47
49
  meta = meta_paths.map { |f| JSON.parse(File.read(f)) }.max_by { |m| m["creation_time"] }
@@ -50,21 +52,25 @@ module Tokenizers
50
52
  if etag
51
53
  esum = Digest::SHA256.hexdigest(etag)
52
54
  resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
53
- options["If-None-Match"] = etag if File.exist?(resource_path)
55
+ if File.exist?(resource_path)
56
+ uri = URI(url)
57
+ req = Net::HTTP::Head.new(uri)
58
+ headers.each do |k, v|
59
+ req[k] = v
60
+ end
61
+ res = Net::HTTP.start(uri.hostname, uri.port, options.merge(use_ssl: true)) do |http|
62
+ http.request(req)
63
+ end
64
+ if res["etag"] == etag
65
+ return resource_path
66
+ end
67
+ end
54
68
  end
55
69
 
56
70
  options[:content_length_proc] = -> (_) { puts "Downloading..." }
57
71
 
58
- tempfile =
59
- begin
60
- URI.parse(url).open(options)
61
- rescue OpenURI::HTTPError => e
62
- if e.message == "304 Not Modified"
63
- return resource_path
64
- else
65
- raise e
66
- end
67
- end
72
+ # string options are headers
73
+ tempfile = URI.parse(url).open(headers.merge(options))
68
74
 
69
75
  etag = tempfile.meta["etag"]
70
76
  esum = Digest::SHA256.hexdigest(etag)
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.4.0"
2
+ VERSION = "0.4.2"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.0
4
+ version: 0.4.2
5
5
  platform: x86_64-linux-musl
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-09-21 00:00:00.000000000 Z
11
+ date: 2023-11-16 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org