tokenizers 0.4.1-x86_64-darwin → 0.4.2-x86_64-darwin

This diff shows the changes between two publicly released versions of the package, as they appear in their public registries. It is provided for informational purposes only.
Binary files changed (3); contents not shown in this diff.
lib/tokenizers/from_pretrained.rb CHANGED
@@ -1,7 +1,7 @@
 module Tokenizers
   module FromPretrained
     # for user agent
-    TOKENIZERS_VERSION = "0.14.0"
+    TOKENIZERS_VERSION = "0.15.0"
 
     # use Ruby for downloads
     # this avoids the need to vendor OpenSSL on Linux
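
The bumped TOKENIZERS_VERSION constant exists only to build the User-Agent header (per the "# for user agent" comment above it). For illustration, with the values in this release the interpolation yields:

# Illustration only, using this release's constants:
# TOKENIZERS_VERSION = "0.15.0", VERSION = "0.4.2"
"tokenizers/#{TOKENIZERS_VERSION}; bindings/Ruby; version/#{VERSION}"
# => "tokenizers/0.15.0; bindings/Ruby; version/0.4.2"
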
@@ -11,25 +11,27 @@ module Tokenizers
       require "digest"
       require "fileutils"
       require "json"
+      require "net/http"
       require "open-uri"
 
       cache_dir = ensure_cache_dir
 
-      # string options are headers
       options = {
         open_timeout: 3,
-        read_timeout: 30,
+        read_timeout: 30
+      }
+      headers = {
         "User-Agent" => "tokenizers/#{TOKENIZERS_VERSION}; bindings/Ruby; version/#{VERSION}"
       }
       if auth_token
-        options["Authorization"] = "Bearer #{auth_token}"
+        headers["Authorization"] = "Bearer #{auth_token}"
       end
 
       url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }
 
       path =
         begin
-          cached_path(cache_dir, url, options)
+          cached_path(cache_dir, url, headers, options)
         rescue OpenURI::HTTPError
           raise Error, "Model \"#{identifier}\" on the Hub doesn't have a tokenizer"
         end
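
The change above splits what was one hash into separate headers and options hashes. open-uri accepts a single hash in which symbol keys are treated as options (timeouts and the like) while string keys are sent as HTTP request headers, which is what the removed "# string options are headers" comment alluded to. Keeping the two apart lets the new HEAD request in cached_path (below) reuse the headers with Net::HTTP, then merge them back for open-uri. A minimal sketch of the convention, with a placeholder URL:

require "open-uri"

# Sketch only: symbol keys are open-uri options, string keys become
# request headers; the URL is a placeholder, not the gem's.
options = { open_timeout: 3, read_timeout: 30 }
headers = { "User-Agent" => "tokenizers/0.15.0; bindings/Ruby; version/0.4.2" }
URI.parse("https://example.com/tokenizer.json").open(headers.merge(options))
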
@@ -41,7 +43,7 @@ module Tokenizers
 
     # use same storage format as Rust version
     # https://github.com/epwalsh/rust-cached-path
-    def cached_path(cache_dir, url, options)
+    def cached_path(cache_dir, url, headers, options)
       fsum = Digest::SHA256.hexdigest(url)
       meta_paths = Dir[File.join(cache_dir, "#{fsum}.*.meta")]
       meta = meta_paths.map { |f| JSON.parse(File.read(f)) }.max_by { |m| m["creation_time"] }
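
cached_path keeps the storage format of rust-cached-path: the cached resource is named from SHA-256 hashes of the URL and of the ETag, with a sibling .meta JSON file whose creation_time picks the newest entry. Roughly, under assumed example values (the cache_dir path, URL, and ETag here are illustrative, not taken from the gem):

require "digest"

# Sketch of the cache layout implied by the code above.
cache_dir = File.expand_path("~/.cache/tokenizers")  # assumed location
url  = "https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json"
etag = "\"0123abcd\""

fsum = Digest::SHA256.hexdigest(url)   # hash of the URL
esum = Digest::SHA256.hexdigest(etag)  # hash of the ETag
resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
meta_path = "#{resource_path}.meta"    # JSON metadata, includes "creation_time"
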
@@ -50,21 +52,25 @@ module Tokenizers
       if etag
         esum = Digest::SHA256.hexdigest(etag)
         resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
-        options["If-None-Match"] = etag if File.exist?(resource_path)
+        if File.exist?(resource_path)
+          uri = URI(url)
+          req = Net::HTTP::Head.new(uri)
+          headers.each do |k, v|
+            req[k] = v
+          end
+          res = Net::HTTP.start(uri.hostname, uri.port, options.merge(use_ssl: true)) do |http|
+            http.request(req)
+          end
+          if res["etag"] == etag
+            return resource_path
+          end
+        end
       end
 
       options[:content_length_proc] = -> (_) { puts "Downloading..." }
 
-      tempfile =
-        begin
-          URI.parse(url).open(options)
-        rescue OpenURI::HTTPError => e
-          if e.message == "304 Not Modified"
-            return resource_path
-          else
-            raise e
-          end
-        end
+      # string options are headers
+      tempfile = URI.parse(url).open(headers.merge(options))
 
       etag = tempfile.meta["etag"]
       esum = Digest::SHA256.hexdigest(etag)
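
This is the substantive change in 0.4.2. Previously the code sent a conditional GET with an If-None-Match header through open-uri and rescued the resulting "304 Not Modified" OpenURI::HTTPError; now it issues an explicit Net::HTTP HEAD request, compares ETags itself, and only downloads when they differ. The step in isolation, as a sketch (fresh_in_cache? is a hypothetical name, not a method in the gem):

require "net/http"

# Hypothetical helper mirroring the new HEAD-based check: true when the
# server's current ETag still matches the cached one.
def fresh_in_cache?(url, headers, cached_etag)
  uri = URI(url)
  req = Net::HTTP::Head.new(uri)
  headers.each { |k, v| req[k] = v }
  res = Net::HTTP.start(uri.hostname, uri.port, use_ssl: true) do |http|
    http.request(req)
  end
  res["etag"] == cached_etag
end

A likely motivation is robustness: the old path depended on OpenURI raising an HTTPError whose message is exactly "304 Not Modified", while the HEAD check keeps the control flow explicit.
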
lib/tokenizers/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Tokenizers
-  VERSION = "0.4.1"
+  VERSION = "0.4.2"
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.4.1
+  version: 0.4.2
 platform: x86_64-darwin
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-10-05 00:00:00.000000000 Z
+date: 2023-11-16 00:00:00.000000000 Z
 dependencies: []
 description:
 email: andrew@ankane.org