tokenizers 0.4.1-x86_64-darwin → 0.4.3-x86_64-darwin

This diff shows the changes between two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents exactly as they appear in their respective public registries.
data/README.md CHANGED
@@ -34,15 +34,51 @@ Decode
34
34
  tokenizer.decode(ids)
35
35
  ```
36
36
 
37
- Load a tokenizer from files
37
+ ## Training
38
+
39
+ Create a tokenizer
38
40
 
39
41
  ```ruby
40
- tokenizer = Tokenizers::CharBPETokenizer.new("vocab.json", "merges.txt")
42
+ tokenizer = Tokenizers::Tokenizer.new(Tokenizers::Models::BPE.new(unk_token: "[UNK]"))
41
43
  ```
42
44
 
43
- ## Training
45
+ Set the pre-tokenizer
46
+
47
+ ```ruby
48
+ tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::Whitespace.new
49
+ ```
50
+
51
+ Train the tokenizer ([example data](https://huggingface.co/docs/tokenizers/quicktour#build-a-tokenizer-from-scratch))
52
+
53
+ ```ruby
54
+ trainer = Tokenizers::Trainers::BpeTrainer.new(special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
55
+ tokenizer.train(["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"], trainer)
56
+ ```
57
+
58
+ Encode
59
+
60
+ ```ruby
61
+ output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
62
+ output.tokens
63
+ ```
64
+
65
+ Save the tokenizer to a file
66
+
67
+ ```ruby
68
+ tokenizer.save("tokenizer.json")
69
+ ```
70
+
71
+ Load a tokenizer from a file
72
+
73
+ ```ruby
74
+ tokenizer = Tokenizers.from_file("tokenizer.json")
75
+ ```
76
+
77
+ Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8) for more info
78
+
79
+ ## API
44
80
 
45
- Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8)
81
+ This library follows the [Tokenizers Python API](https://huggingface.co/docs/tokenizers/index). You can follow Python tutorials and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
46
82
 
47
83
  ## History
48
84
 
Binary file
Binary file
Binary file
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.14.0"
4
+ TOKENIZERS_VERSION = "0.15.0"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -11,25 +11,27 @@ module Tokenizers
11
11
  require "digest"
12
12
  require "fileutils"
13
13
  require "json"
14
+ require "net/http"
14
15
  require "open-uri"
15
16
 
16
17
  cache_dir = ensure_cache_dir
17
18
 
18
- # string options are headers
19
19
  options = {
20
20
  open_timeout: 3,
21
- read_timeout: 30,
21
+ read_timeout: 30
22
+ }
23
+ headers = {
22
24
  "User-Agent" => "tokenizers/#{TOKENIZERS_VERSION}; bindings/Ruby; version/#{VERSION}"
23
25
  }
24
26
  if auth_token
25
- options["Authorization"] = "Bearer #{auth_token}"
27
+ headers["Authorization"] = "Bearer #{auth_token}"
26
28
  end
27
29
 
28
30
  url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }
29
31
 
30
32
  path =
31
33
  begin
32
- cached_path(cache_dir, url, options)
34
+ cached_path(cache_dir, url, headers, options)
33
35
  rescue OpenURI::HTTPError
34
36
  raise Error, "Model \"#{identifier}\" on the Hub doesn't have a tokenizer"
35
37
  end
@@ -41,7 +43,7 @@ module Tokenizers
41
43
 
42
44
  # use same storage format as Rust version
43
45
  # https://github.com/epwalsh/rust-cached-path
44
- def cached_path(cache_dir, url, options)
46
+ def cached_path(cache_dir, url, headers, options)
45
47
  fsum = Digest::SHA256.hexdigest(url)
46
48
  meta_paths = Dir[File.join(cache_dir, "#{fsum}.*.meta")]
47
49
  meta = meta_paths.map { |f| JSON.parse(File.read(f)) }.max_by { |m| m["creation_time"] }
@@ -50,21 +52,25 @@ module Tokenizers
50
52
  if etag
51
53
  esum = Digest::SHA256.hexdigest(etag)
52
54
  resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
53
- options["If-None-Match"] = etag if File.exist?(resource_path)
55
+ if File.exist?(resource_path)
56
+ uri = URI(url)
57
+ req = Net::HTTP::Head.new(uri)
58
+ headers.each do |k, v|
59
+ req[k] = v
60
+ end
61
+ res = Net::HTTP.start(uri.hostname, uri.port, options.merge(use_ssl: true)) do |http|
62
+ http.request(req)
63
+ end
64
+ if res["etag"] == etag
65
+ return resource_path
66
+ end
67
+ end
54
68
  end
55
69
 
56
70
  options[:content_length_proc] = -> (_) { puts "Downloading..." }
57
71
 
58
- tempfile =
59
- begin
60
- URI.parse(url).open(options)
61
- rescue OpenURI::HTTPError => e
62
- if e.message == "304 Not Modified"
63
- return resource_path
64
- else
65
- raise e
66
- end
67
- end
72
+ # string options are headers
73
+ tempfile = URI.parse(url).open(headers.merge(options))
68
74
 
69
75
  etag = tempfile.meta["etag"]
70
76
  esum = Digest::SHA256.hexdigest(etag)
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.4.1"
2
+ VERSION = "0.4.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.4.1
4
+ version: 0.4.3
5
5
  platform: x86_64-darwin
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-10-05 00:00:00.000000000 Z
11
+ date: 2024-01-04 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org
@@ -26,6 +26,7 @@ files:
26
26
  - lib/tokenizers/3.0/tokenizers.bundle
27
27
  - lib/tokenizers/3.1/tokenizers.bundle
28
28
  - lib/tokenizers/3.2/tokenizers.bundle
29
+ - lib/tokenizers/3.3/tokenizers.bundle
29
30
  - lib/tokenizers/char_bpe_tokenizer.rb
30
31
  - lib/tokenizers/decoders/bpe_decoder.rb
31
32
  - lib/tokenizers/decoders/ctc.rb
@@ -70,7 +71,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
70
71
  version: '3.0'
71
72
  - - "<"
72
73
  - !ruby/object:Gem::Version
73
- version: 3.3.dev
74
+ version: 3.4.dev
74
75
  required_rubygems_version: !ruby/object:Gem::Requirement
75
76
  requirements:
76
77
  - - ">="