tokenizers 0.4.1-x86_64-linux-musl → 0.4.3-x86_64-linux-musl

data/README.md CHANGED
@@ -34,15 +34,51 @@ Decode
  tokenizer.decode(ids)
  ```

- Load a tokenizer from files
+ ## Training
+
+ Create a tokenizer

  ```ruby
- tokenizer = Tokenizers::CharBPETokenizer.new("vocab.json", "merges.txt")
+ tokenizer = Tokenizers::Tokenizer.new(Tokenizers::Models::BPE.new(unk_token: "[UNK]"))
  ```

- ## Training
+ Set the pre-tokenizer
+
+ ```ruby
+ tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::Whitespace.new
+ ```
+
+ Train the tokenizer ([example data](https://huggingface.co/docs/tokenizers/quicktour#build-a-tokenizer-from-scratch))
+
+ ```ruby
+ trainer = Tokenizers::Trainers::BpeTrainer.new(special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
+ tokenizer.train(["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"], trainer)
+ ```
+
+ Encode
+
+ ```ruby
+ output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
+ output.tokens
+ ```
+
+ Save the tokenizer to a file
+
+ ```ruby
+ tokenizer.save("tokenizer.json")
+ ```
+
+ Load a tokenizer from a file
+
+ ```ruby
+ tokenizer = Tokenizers.from_file("tokenizer.json")
+ ```
+
+ Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8) for more info
+
+ ## API

- Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8)
+ This library follows the [Tokenizers Python API](https://huggingface.co/docs/tokenizers/index). You can follow Python tutorials and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.

  ## History

Binary files changed (4 files; contents not shown)
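The `## API` note added to the README above says Python tutorials often translate directly to Ruby. As a hedged illustration (assuming the gem's `Tokenizers.from_pretrained` helper, which the `FromPretrained` module below provides), Python's `Tokenizer.from_pretrained("bert-base-cased")` maps to:

```ruby
require "tokenizers"

# download (or reuse a cached copy of) tokenizer.json from the Hugging Face Hub
tokenizer = Tokenizers.from_pretrained("bert-base-cased")

output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
puts output.tokens.inspect
```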
data/lib/tokenizers/from_pretrained.rb CHANGED
@@ -1,7 +1,7 @@
  module Tokenizers
    module FromPretrained
      # for user agent
-     TOKENIZERS_VERSION = "0.14.0"
+     TOKENIZERS_VERSION = "0.15.0"

      # use Ruby for downloads
      # this avoids the need to vendor OpenSSL on Linux
@@ -11,25 +11,27 @@ module Tokenizers
        require "digest"
        require "fileutils"
        require "json"
+       require "net/http"
        require "open-uri"

        cache_dir = ensure_cache_dir

-       # string options are headers
        options = {
          open_timeout: 3,
-         read_timeout: 30,
+         read_timeout: 30
+       }
+       headers = {
          "User-Agent" => "tokenizers/#{TOKENIZERS_VERSION}; bindings/Ruby; version/#{VERSION}"
        }
        if auth_token
-         options["Authorization"] = "Bearer #{auth_token}"
+         headers["Authorization"] = "Bearer #{auth_token}"
        end

        url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }

        path =
          begin
-           cached_path(cache_dir, url, options)
+           cached_path(cache_dir, url, headers, options)
          rescue OpenURI::HTTPError
            raise Error, "Model \"#{identifier}\" on the Hub doesn't have a tokenizer"
          end
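The hunk above splits what was a single open-uri options hash into separate `headers` and `options` hashes. The removed comment ("string options are headers") refers to open-uri's convention that string keys are sent as HTTP request headers while symbol keys configure the client. A minimal sketch of that convention, with `https://example.com` as a stand-in URL:

```ruby
require "open-uri"

# open-uri convention: string keys become request headers,
# symbol keys become client options (timeouts, callbacks, ...)
body = URI.parse("https://example.com").open(
  "User-Agent" => "tokenizers/0.15.0",  # sent as a header
  open_timeout: 3,                      # client option
  read_timeout: 30                      # client option
).read
```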
@@ -41,7 +43,7 @@ module Tokenizers

      # use same storage format as Rust version
      # https://github.com/epwalsh/rust-cached-path
-     def cached_path(cache_dir, url, options)
+     def cached_path(cache_dir, url, headers, options)
        fsum = Digest::SHA256.hexdigest(url)
        meta_paths = Dir[File.join(cache_dir, "#{fsum}.*.meta")]
        meta = meta_paths.map { |f| JSON.parse(File.read(f)) }.max_by { |m| m["creation_time"] }
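Per the comment above, the cache mirrors rust-cached-path's storage format: the filename stem is the SHA-256 of the URL, the suffix is the SHA-256 of the ETag, and a JSON `.meta` sidecar sits next to each entry. A minimal sketch of how the paths are derived (the URL and ETag values here are hypothetical):

```ruby
require "digest"

url  = "https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json"
etag = "\"0123abcd\""  # hypothetical ETag returned by the server

fsum = Digest::SHA256.hexdigest(url)   # one stem per URL
esum = Digest::SHA256.hexdigest(etag)  # one suffix per resource version

resource_path = File.join(Dir.home, ".cache", "#{fsum}.#{esum}")
meta_path     = "#{resource_path}.meta"  # JSON sidecar; the code reads its "etag" and "creation_time"
```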
@@ -50,21 +52,25 @@ module Tokenizers
        if etag
          esum = Digest::SHA256.hexdigest(etag)
          resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
-         options["If-None-Match"] = etag if File.exist?(resource_path)
+         if File.exist?(resource_path)
+           uri = URI(url)
+           req = Net::HTTP::Head.new(uri)
+           headers.each do |k, v|
+             req[k] = v
+           end
+           res = Net::HTTP.start(uri.hostname, uri.port, options.merge(use_ssl: true)) do |http|
+             http.request(req)
+           end
+           if res["etag"] == etag
+             return resource_path
+           end
+         end
        end

        options[:content_length_proc] = -> (_) { puts "Downloading..." }

-       tempfile =
-         begin
-           URI.parse(url).open(options)
-         rescue OpenURI::HTTPError => e
-           if e.message == "304 Not Modified"
-             return resource_path
-           else
-             raise e
-           end
-         end
+       # string options are headers
+       tempfile = URI.parse(url).open(headers.merge(options))

        etag = tempfile.meta["etag"]
        esum = Digest::SHA256.hexdigest(etag)
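The rewrite above replaces the old `If-None-Match` / 304 round trip with an explicit HEAD request that compares ETags before reusing the cached file. A standalone sketch of that revalidation pattern (the `cache_fresh?` helper name is hypothetical):

```ruby
require "net/http"
require "uri"

# true when the server's current ETag matches the cached one,
# i.e. the local copy can be reused without downloading again
def cache_fresh?(url, cached_etag, headers = {})
  uri = URI(url)
  req = Net::HTTP::Head.new(uri)
  headers.each { |k, v| req[k] = v }
  res = Net::HTTP.start(uri.hostname, uri.port, use_ssl: uri.scheme == "https") do |http|
    http.request(req)
  end
  res["etag"] == cached_etag
end
```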
data/lib/tokenizers/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Tokenizers
-   VERSION = "0.4.1"
+   VERSION = "0.4.3"
  end
metadata CHANGED
@@ -1,14 +1,14 @@
  --- !ruby/object:Gem::Specification
  name: tokenizers
  version: !ruby/object:Gem::Version
-   version: 0.4.1
+   version: 0.4.3
  platform: x86_64-linux-musl
  authors:
  - Andrew Kane
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2023-10-05 00:00:00.000000000 Z
+ date: 2024-01-04 00:00:00.000000000 Z
  dependencies: []
  description:
  email: andrew@ankane.org
@@ -26,6 +26,7 @@ files:
  - lib/tokenizers/3.0/tokenizers.so
  - lib/tokenizers/3.1/tokenizers.so
  - lib/tokenizers/3.2/tokenizers.so
+ - lib/tokenizers/3.3/tokenizers.so
  - lib/tokenizers/char_bpe_tokenizer.rb
  - lib/tokenizers/decoders/bpe_decoder.rb
  - lib/tokenizers/decoders/ctc.rb
@@ -70,7 +71,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
      version: '3.0'
  - - "<"
    - !ruby/object:Gem::Version
-     version: 3.3.dev
+     version: 3.4.dev
  required_rubygems_version: !ruby/object:Gem::Requirement
  requirements:
  - - ">="