tokenizers 0.2.1-x86_64-linux → 0.2.2-x86_64-linux

data/README.md CHANGED
@@ -24,8 +24,8 @@ Encode
 
 ```ruby
 encoded = tokenizer.encode("I can feel the magic, can you?")
-encoded.ids
 encoded.tokens
+encoded.ids
 ```
 
 Decode
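
For readers skimming the README change: the edit only reorders the example so the human-readable tokens are shown before the numeric ids. A minimal sketch of what the two accessors return, assuming a pretrained WordPiece-style tokenizer (the exact tokens and ids depend on the model, and some models prepend special tokens such as [CLS]):

```ruby
require "tokenizers"

# "bert-base-uncased" is an illustrative model choice
tokenizer = Tokenizers.from_pretrained("bert-base-uncased")

encoded = tokenizer.encode("I can feel the magic, can you?")
encoded.tokens # subword strings, e.g. ["i", "can", "feel", "the", "magic", ",", "can", "you", "?"]
encoded.ids    # the matching vocabulary ids, one integer per token
```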
Binary file
Binary file
Binary file
Binary file
data/lib/tokenizers/from_pretrained.rb ADDED
@@ -0,0 +1,119 @@
+module Tokenizers
+  module FromPretrained
+    # for user agent
+    TOKENIZERS_VERSION = "0.13.2"
+
+    # use Ruby for downloads
+    # this avoids the need to vendor OpenSSL on Linux
+    # and reduces the extension size by about half
+    def from_pretrained(identifier, revision: "main", auth_token: nil)
+      require "cgi"
+      require "digest"
+      require "fileutils"
+      require "json"
+      require "open-uri"
+
+      cache_dir = ensure_cache_dir
+
+      # string options are headers
+      options = {
+        open_timeout: 3,
+        read_timeout: 30,
+        "User-Agent" => "tokenizers/#{TOKENIZERS_VERSION}; bindings/Ruby; version/#{VERSION}"
+      }
+      if auth_token
+        options["Authorization"] = "Bearer #{auth_token}"
+      end
+
+      url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }
+
+      path =
+        begin
+          cached_path(cache_dir, url, options)
+        rescue OpenURI::HTTPError
+          raise Error, "Model \"#{identifier}\" on the Hub doesn't have a tokenizer"
+        end
+
+      from_file(path)
+    end
+
+    private
+
+    # use same storage format as Rust version
+    # https://github.com/epwalsh/rust-cached-path
+    def cached_path(cache_dir, url, options)
+      fsum = Digest::SHA256.hexdigest(url)
+      meta_paths = Dir[File.join(cache_dir, "#{fsum}.*.meta")]
+      meta = meta_paths.map { |f| JSON.load_file(f) }.max_by { |m| m["creation_time"] }
+      etag = meta["etag"] if meta
+
+      if etag
+        esum = Digest::SHA256.hexdigest(etag)
+        resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
+        options["If-None-Match"] = etag if File.exist?(resource_path)
+      end
+
+      options[:content_length_proc] = -> (_) { puts "Downloading..." }
+
+      tempfile =
+        begin
+          URI.open(url, options)
+        rescue OpenURI::HTTPError => e
+          if e.message == "304 Not Modified"
+            return resource_path
+          else
+            raise e
+          end
+        end
+
+      etag = tempfile.meta["etag"]
+      esum = Digest::SHA256.hexdigest(etag)
+      resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
+      meta_path = "#{resource_path}.meta"
+
+      meta = {
+        resource: url,
+        resource_path: resource_path,
+        meta_path: meta_path,
+        etag: etag,
+        expires: nil,
+        creation_time: Time.now.to_f
+      }
+
+      File.write("#{resource_path}.lock", "")
+      File.open(resource_path, "wb") { |f| IO.copy_stream(tempfile, f) }
+      File.write(meta_path, JSON.generate(meta))
+
+      resource_path
+    end
+
+    def cache_dir
+      if ENV["TOKENIZERS_CACHE"]
+        ENV["TOKENIZERS_CACHE"]
+      else
+        # use same directory as Rust version
+        # https://docs.rs/dirs/latest/dirs/fn.cache_dir.html
+        dir =
+          if Gem.win_platform?
+            ENV.fetch("LOCALAPPDATA")
+          elsif mac?
+            File.join(ENV.fetch("HOME"), "Library", "Caches")
+          else
+            ENV["XDG_CACHE_HOME"] || File.join(ENV.fetch("HOME"), ".cache")
+          end
+
+        File.join(dir, "huggingface", "tokenizers")
+      end
+    end
+
+    def ensure_cache_dir
+      dir = cache_dir
+      FileUtils.mkdir_p(dir)
+      dir
+    end
+
+    def mac?
+      RbConfig::CONFIG["host_os"] =~ /darwin/i
+    end
+  end
+end
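
Taken together, `from_pretrained` fetches a model's `tokenizer.json` from the Hugging Face Hub with plain `open-uri`, stores it under a cache path derived from SHA-256 digests of the URL and the server's ETag, and revalidates with `If-None-Match` on later calls so unchanged files are not re-downloaded. A minimal usage sketch (the model id is illustrative; any Hub model that ships a `tokenizer.json` works):

```ruby
require "tokenizers"

# optional: override the cache location before downloading
# (default on Linux: $XDG_CACHE_HOME or ~/.cache, plus huggingface/tokenizers)
# ENV["TOKENIZERS_CACHE"] = "/tmp/tokenizers-cache"

# first call downloads tokenizer.json from the Hub; later calls reuse the
# cached copy, revalidated via the ETag with an If-None-Match request
tokenizer = Tokenizers.from_pretrained("bert-base-uncased")

# pin a revision, or authenticate for private models
tokenizer = Tokenizers.from_pretrained(
  "bert-base-uncased",
  revision: "main",
  auth_token: ENV["HF_TOKEN"] # HF_TOKEN is an illustrative env var name
)
```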
data/lib/tokenizers/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Tokenizers
-  VERSION = "0.2.1"
+  VERSION = "0.2.2"
 end
data/lib/tokenizers.rb CHANGED
@@ -7,12 +7,11 @@ end
 
 # modules
 require "tokenizers/char_bpe_tokenizer"
+require "tokenizers/from_pretrained"
 require "tokenizers/version"
 
 module Tokenizers
   class Error < StandardError; end
 
-  def self.from_pretrained(identifier, revision: "main", auth_token: nil)
-    _from_pretrained(identifier, revision, auth_token)
-  end
+  extend FromPretrained
 end
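
The `extend FromPretrained` line replaces the old hand-written wrapper around the native `_from_pretrained`: extending a module makes its public instance methods available as singleton methods on the extending object, so `Tokenizers.from_pretrained(...)` keeps working, now backed by the pure-Ruby implementation. A minimal sketch of the pattern with hypothetical names:

```ruby
# illustrative module and method names, not part of the gem
module Downloadable
  def fetch(name)
    "fetching #{name}"
  end
end

module Registry
  extend Downloadable # fetch becomes a method on Registry itself
end

Registry.fetch("tokenizer") # => "fetching tokenizer"
```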
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.2
 platform: x86_64-linux
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-01-12 00:00:00.000000000 Z
+date: 2023-01-15 00:00:00.000000000 Z
 dependencies: []
 description:
 email: andrew@ankane.org
@@ -28,6 +28,7 @@ files:
 - lib/tokenizers/3.1/tokenizers.so
 - lib/tokenizers/3.2/tokenizers.so
 - lib/tokenizers/char_bpe_tokenizer.rb
+- lib/tokenizers/from_pretrained.rb
 - lib/tokenizers/version.rb
 homepage: https://github.com/ankane/tokenizers-ruby
 licenses: