tokenizers 0.2.1-arm64-darwin → 0.2.3-arm64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
data/README.md CHANGED
@@ -24,8 +24,8 @@ Encode
24
24
 
25
25
  ```ruby
26
26
  encoded = tokenizer.encode("I can feel the magic, can you?")
27
- encoded.ids
28
27
  encoded.tokens
28
+ encoded.ids
29
29
  ```
30
30
 
31
31
  Decode
Binary file
Binary file
Binary file
Binary file
@@ -8,8 +8,8 @@ module Tokenizers
8
8
  @tokenizer.decoder = BPEDecoder.new
9
9
  end
10
10
 
11
- def encode(text)
12
- @tokenizer.encode(text)
11
+ def encode(text, **options)
12
+ @tokenizer.encode(text, **options)
13
13
  end
14
14
 
15
15
  def decode(ids)
@@ -0,0 +1,19 @@
1
+ module Tokenizers
2
+ class Encoding
3
+ def word_to_tokens(word_index, sequence_index = 0)
4
+ _word_to_tokens(word_index, sequence_index)
5
+ end
6
+
7
+ def word_to_chars(word_index, sequence_index = 0)
8
+ _word_to_chars(word_index, sequence_index)
9
+ end
10
+
11
+ def char_to_token(char_pos, sequence_index = 0)
12
+ _char_to_token(char_pos, sequence_index)
13
+ end
14
+
15
+ def char_to_word(char_pos, sequence_index = 0)
16
+ _char_to_word(char_pos, sequence_index)
17
+ end
18
+ end
19
+ end
@@ -0,0 +1,119 @@
1
+ module Tokenizers
2
+ module FromPretrained
3
+ # for user agent
4
+ TOKENIZERS_VERSION = "0.13.2"
5
+
6
+ # use Ruby for downloads
7
+ # this avoids the need to vendor OpenSSL on Linux
8
+ # and reduces the extension size by about half
9
+ def from_pretrained(identifier, revision: "main", auth_token: nil)
10
+ require "cgi"
11
+ require "digest"
12
+ require "fileutils"
13
+ require "json"
14
+ require "open-uri"
15
+
16
+ cache_dir = ensure_cache_dir
17
+
18
+ # string options are headers
19
+ options = {
20
+ open_timeout: 3,
21
+ read_timeout: 30,
22
+ "User-Agent" => "tokenizers/#{TOKENIZERS_VERSION}; bindings/Ruby; version/#{VERSION}"
23
+ }
24
+ if auth_token
25
+ options["Authorization"] = "Bearer #{auth_token}"
26
+ end
27
+
28
+ url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }
29
+
30
+ path =
31
+ begin
32
+ cached_path(cache_dir, url, options)
33
+ rescue OpenURI::HTTPError
34
+ raise Error, "Model \"#{identifier}\" on the Hub doesn't have a tokenizer"
35
+ end
36
+
37
+ from_file(path)
38
+ end
39
+
40
+ private
41
+
42
+ # use same storage format as Rust version
43
+ # https://github.com/epwalsh/rust-cached-path
44
+ def cached_path(cache_dir, url, options)
45
+ fsum = Digest::SHA256.hexdigest(url)
46
+ meta_paths = Dir[File.join(cache_dir, "#{fsum}.*.meta")]
47
+ meta = meta_paths.map { |f| JSON.load_file(f) }.max_by { |m| m["creation_time"] }
48
+ etag = meta["etag"] if meta
49
+
50
+ if etag
51
+ esum = Digest::SHA256.hexdigest(etag)
52
+ resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
53
+ options["If-None-Match"] = etag if File.exist?(resource_path)
54
+ end
55
+
56
+ options[:content_length_proc] = -> (_) { puts "Downloading..." }
57
+
58
+ tempfile =
59
+ begin
60
+ URI.open(url, options)
61
+ rescue OpenURI::HTTPError => e
62
+ if e.message == "304 Not Modified"
63
+ return resource_path
64
+ else
65
+ raise e
66
+ end
67
+ end
68
+
69
+ etag = tempfile.meta["etag"]
70
+ esum = Digest::SHA256.hexdigest(etag)
71
+ resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
72
+ meta_path = "#{resource_path}.meta"
73
+
74
+ meta = {
75
+ resource: url,
76
+ resource_path: resource_path,
77
+ meta_path: meta_path,
78
+ etag: etag,
79
+ expires: nil,
80
+ creation_time: Time.now.to_f
81
+ }
82
+
83
+ File.write("#{resource_path}.lock", "")
84
+ File.open(resource_path, "wb") { |f| IO.copy_stream(tempfile, f) }
85
+ File.write(meta_path, JSON.generate(meta))
86
+
87
+ resource_path
88
+ end
89
+
90
+ def cache_dir
91
+ if ENV["TOKENIZERS_CACHE"]
92
+ ENV["TOKENIZERS_CACHE"]
93
+ else
94
+ # use same directory as Rust version
95
+ # https://docs.rs/dirs/latest/dirs/fn.cache_dir.html
96
+ dir =
97
+ if Gem.win_platform?
98
+ ENV.fetch("LOCALAPPDATA")
99
+ elsif mac?
100
+ File.join(ENV.fetch("HOME"), "Library", "Caches")
101
+ else
102
+ ENV["XDG_CACHE_HOME"] || File.join(ENV.fetch("HOME"), ".cache")
103
+ end
104
+
105
+ File.join(dir, "huggingface", "tokenizers")
106
+ end
107
+ end
108
+
109
+ def ensure_cache_dir
110
+ dir = cache_dir
111
+ FileUtils.mkdir_p(dir)
112
+ dir
113
+ end
114
+
115
+ def mac?
116
+ RbConfig::CONFIG["host_os"] =~ /darwin/i
117
+ end
118
+ end
119
+ end
@@ -0,0 +1,12 @@
1
+ module Tokenizers
2
+ class Tokenizer
3
+ # TODO change add_special_tokens default to true in 0.3.0
4
+ def encode(sequence, add_special_tokens: nil)
5
+ if add_special_tokens.nil?
6
+ warn "[tokenizers] add_special_tokens will default to true in 0.3.0. Pass add_special_tokens: true/false to silence this warning."
7
+ add_special_tokens = false
8
+ end
9
+ _encode(sequence, add_special_tokens)
10
+ end
11
+ end
12
+ end
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.2.1"
2
+ VERSION = "0.2.3"
3
3
  end
data/lib/tokenizers.rb CHANGED
@@ -1,18 +1,19 @@
1
1
  # ext
2
2
  begin
3
- require "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
3
+ require_relative "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
4
4
  rescue LoadError
5
- require "tokenizers/tokenizers"
5
+ require_relative "tokenizers/tokenizers"
6
6
  end
7
7
 
8
8
  # modules
9
- require "tokenizers/char_bpe_tokenizer"
10
- require "tokenizers/version"
9
+ require_relative "tokenizers/char_bpe_tokenizer"
10
+ require_relative "tokenizers/encoding"
11
+ require_relative "tokenizers/from_pretrained"
12
+ require_relative "tokenizers/tokenizer"
13
+ require_relative "tokenizers/version"
11
14
 
12
15
  module Tokenizers
13
16
  class Error < StandardError; end
14
17
 
15
- def self.from_pretrained(identifier, revision: "main", auth_token: nil)
16
- _from_pretrained(identifier, revision, auth_token)
17
- end
18
+ extend FromPretrained
18
19
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.2.1
4
+ version: 0.2.3
5
5
  platform: arm64-darwin
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-01-12 00:00:00.000000000 Z
11
+ date: 2023-01-22 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org
@@ -28,6 +28,9 @@ files:
28
28
  - lib/tokenizers/3.1/tokenizers.bundle
29
29
  - lib/tokenizers/3.2/tokenizers.bundle
30
30
  - lib/tokenizers/char_bpe_tokenizer.rb
31
+ - lib/tokenizers/encoding.rb
32
+ - lib/tokenizers/from_pretrained.rb
33
+ - lib/tokenizers/tokenizer.rb
31
34
  - lib/tokenizers/version.rb
32
35
  homepage: https://github.com/ankane/tokenizers-ruby
33
36
  licenses:
@@ -51,7 +54,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
51
54
  - !ruby/object:Gem::Version
52
55
  version: '0'
53
56
  requirements: []
54
- rubygems_version: 3.4.3
57
+ rubygems_version: 3.4.4
55
58
  signing_key:
56
59
  specification_version: 4
57
60
  summary: Fast state-of-the-art tokenizers for Ruby