tokenizers 0.2.1-x86_64-darwin → 0.2.2-x86_64-darwin

Sign up to get free protection for your applications and to get access to all the features.
data/README.md CHANGED
@@ -24,8 +24,8 @@ Encode
24
24
 
25
25
  ```ruby
26
26
  encoded = tokenizer.encode("I can feel the magic, can you?")
27
- encoded.ids
28
27
  encoded.tokens
28
+ encoded.ids
29
29
  ```
30
30
 
31
31
  Decode
Binary file
Binary file
Binary file
Binary file
@@ -0,0 +1,119 @@
1
+ module Tokenizers
2
+ module FromPretrained
3
+ # for user agent
4
+ TOKENIZERS_VERSION = "0.13.2"
5
+
6
+ # use Ruby for downloads
7
+ # this avoids the need to vendor OpenSSL on Linux
8
+ # and reduces the extension size by about half
9
+ def from_pretrained(identifier, revision: "main", auth_token: nil)
10
+ require "cgi"
11
+ require "digest"
12
+ require "fileutils"
13
+ require "json"
14
+ require "open-uri"
15
+
16
+ cache_dir = ensure_cache_dir
17
+
18
+ # string options are headers
19
+ options = {
20
+ open_timeout: 3,
21
+ read_timeout: 30,
22
+ "User-Agent" => "tokenizers/#{TOKENIZERS_VERSION}; bindings/Ruby; version/#{VERSION}"
23
+ }
24
+ if auth_token
25
+ options["Authorization"] = "Bearer #{auth_token}"
26
+ end
27
+
28
+ url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }
29
+
30
+ path =
31
+ begin
32
+ cached_path(cache_dir, url, options)
33
+ rescue OpenURI::HTTPError
34
+ raise Error, "Model \"#{identifier}\" on the Hub doesn't have a tokenizer"
35
+ end
36
+
37
+ from_file(path)
38
+ end
39
+
40
+ private
41
+
42
+ # use same storage format as Rust version
43
+ # https://github.com/epwalsh/rust-cached-path
44
+ def cached_path(cache_dir, url, options)
45
+ fsum = Digest::SHA256.hexdigest(url)
46
+ meta_paths = Dir[File.join(cache_dir, "#{fsum}.*.meta")]
47
+ meta = meta_paths.map { |f| JSON.load_file(f) }.max_by { |m| m["creation_time"] }
48
+ etag = meta["etag"] if meta
49
+
50
+ if etag
51
+ esum = Digest::SHA256.hexdigest(etag)
52
+ resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
53
+ options["If-None-Match"] = etag if File.exist?(resource_path)
54
+ end
55
+
56
+ options[:content_length_proc] = -> (_) { puts "Downloading..." }
57
+
58
+ tempfile =
59
+ begin
60
+ URI.open(url, options)
61
+ rescue OpenURI::HTTPError => e
62
+ if e.message == "304 Not Modified"
63
+ return resource_path
64
+ else
65
+ raise e
66
+ end
67
+ end
68
+
69
+ etag = tempfile.meta["etag"]
70
+ esum = Digest::SHA256.hexdigest(etag)
71
+ resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
72
+ meta_path = "#{resource_path}.meta"
73
+
74
+ meta = {
75
+ resource: url,
76
+ resource_path: resource_path,
77
+ meta_path: meta_path,
78
+ etag: etag,
79
+ expires: nil,
80
+ creation_time: Time.now.to_f
81
+ }
82
+
83
+ File.write("#{resource_path}.lock", "")
84
+ File.open(resource_path, "wb") { |f| IO.copy_stream(tempfile, f) }
85
+ File.write(meta_path, JSON.generate(meta))
86
+
87
+ resource_path
88
+ end
89
+
90
+ def cache_dir
91
+ if ENV["TOKENIZERS_CACHE"]
92
+ ENV["TOKENIZERS_CACHE"]
93
+ else
94
+ # use same directory as Rust version
95
+ # https://docs.rs/dirs/latest/dirs/fn.cache_dir.html
96
+ dir =
97
+ if Gem.win_platform?
98
+ ENV.fetch("LOCALAPPDATA")
99
+ elsif mac?
100
+ File.join(ENV.fetch("HOME"), "Library", "Caches")
101
+ else
102
+ ENV["XDG_CACHE_HOME"] || File.join(ENV.fetch("HOME"), ".cache")
103
+ end
104
+
105
+ File.join(dir, "huggingface", "tokenizers")
106
+ end
107
+ end
108
+
109
+ def ensure_cache_dir
110
+ dir = cache_dir
111
+ FileUtils.mkdir_p(dir)
112
+ dir
113
+ end
114
+
115
+ def mac?
116
+ RbConfig::CONFIG["host_os"] =~ /darwin/i
117
+ end
118
+ end
119
+ end
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.2.1"
2
+ VERSION = "0.2.2"
3
3
  end
data/lib/tokenizers.rb CHANGED
@@ -7,12 +7,11 @@ end
7
7
 
8
8
  # modules
9
9
  require "tokenizers/char_bpe_tokenizer"
10
+ require "tokenizers/from_pretrained"
10
11
  require "tokenizers/version"
11
12
 
12
13
  module Tokenizers
13
14
  class Error < StandardError; end
14
15
 
15
- def self.from_pretrained(identifier, revision: "main", auth_token: nil)
16
- _from_pretrained(identifier, revision, auth_token)
17
- end
16
+ extend FromPretrained
18
17
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.2
5
5
  platform: x86_64-darwin
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-12 00:00:00.000000000 Z
11
+ date: 2023-01-15 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org
@@ -28,6 +28,7 @@ files:
28
28
  - lib/tokenizers/3.1/tokenizers.bundle
29
29
  - lib/tokenizers/3.2/tokenizers.bundle
30
30
  - lib/tokenizers/char_bpe_tokenizer.rb
31
+ - lib/tokenizers/from_pretrained.rb
31
32
  - lib/tokenizers/version.rb
32
33
  homepage: https://github.com/ankane/tokenizers-ruby
33
34
  licenses: