tokenizers 0.4.1-arm64-darwin → 0.4.3-arm64-darwin
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/Cargo.lock +88 -115
- data/Cargo.toml +1 -0
- data/LICENSE-THIRD-PARTY.txt +577 -1505
- data/README.md +40 -4
- data/lib/tokenizers/3.0/tokenizers.bundle +0 -0
- data/lib/tokenizers/3.1/tokenizers.bundle +0 -0
- data/lib/tokenizers/3.2/tokenizers.bundle +0 -0
- data/lib/tokenizers/3.3/tokenizers.bundle +0 -0
- data/lib/tokenizers/from_pretrained.rb +23 -17
- data/lib/tokenizers/version.rb +1 -1
- metadata +4 -3
data/README.md
CHANGED
@@ -34,15 +34,51 @@ Decode
 tokenizer.decode(ids)
 ```
 
-
+## Training
+
+Create a tokenizer
 
 ```ruby
-tokenizer = Tokenizers::
+tokenizer = Tokenizers::Tokenizer.new(Tokenizers::Models::BPE.new(unk_token: "[UNK]"))
 ```
 
-
+Set the pre-tokenizer
+
+```ruby
+tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::Whitespace.new
+```
+
+Train the tokenizer ([example data](https://huggingface.co/docs/tokenizers/quicktour#build-a-tokenizer-from-scratch))
+
+```ruby
+trainer = Tokenizers::Trainers::BpeTrainer.new(special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
+tokenizer.train(["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"], trainer)
+```
+
+Encode
+
+```ruby
+output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
+output.tokens
+```
+
+Save the tokenizer to a file
+
+```ruby
+tokenizer.save("tokenizer.json")
+```
+
+Load a tokenizer from a file
+
+```ruby
+tokenizer = Tokenizers.from_file("tokenizer.json")
+```
+
+Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8) for more info
+
+## API
 
-
+This library follows the [Tokenizers Python API](https://huggingface.co/docs/tokenizers/index). You can follow Python tutorials and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
 
 ## History
 
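Taken together, the new README section documents a full training workflow. As a single script it looks like this (a sketch: it assumes the wiki.*.raw files from the linked quicktour data are present in the working directory):

```ruby
require "tokenizers"

# create a BPE tokenizer with an unknown token
tokenizer = Tokenizers::Tokenizer.new(Tokenizers::Models::BPE.new(unk_token: "[UNK]"))

# split input on whitespace before the model sees it
tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::Whitespace.new

# train on the example data, reserving the special tokens
trainer = Tokenizers::Trainers::BpeTrainer.new(special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
tokenizer.train(["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"], trainer)

# encode a sentence, then persist and reload the trained tokenizer
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
p output.tokens
tokenizer.save("tokenizer.json")
tokenizer = Tokenizers.from_file("tokenizer.json")
```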
data/lib/tokenizers/3.0/tokenizers.bundle
CHANGED
Binary file
data/lib/tokenizers/3.1/tokenizers.bundle
CHANGED
Binary file
data/lib/tokenizers/3.2/tokenizers.bundle
CHANGED
Binary file
data/lib/tokenizers/3.3/tokenizers.bundle
ADDED
Binary file
data/lib/tokenizers/from_pretrained.rb
CHANGED
@@ -1,7 +1,7 @@
 module Tokenizers
   module FromPretrained
     # for user agent
-    TOKENIZERS_VERSION = "0.
+    TOKENIZERS_VERSION = "0.15.0"
 
     # use Ruby for downloads
     # this avoids the need to vendor OpenSSL on Linux
@@ -11,25 +11,27 @@ module Tokenizers
       require "digest"
       require "fileutils"
       require "json"
+      require "net/http"
       require "open-uri"
 
       cache_dir = ensure_cache_dir
 
-      # string options are headers
       options = {
         open_timeout: 3,
-        read_timeout: 30,
+        read_timeout: 30
+      }
+      headers = {
         "User-Agent" => "tokenizers/#{TOKENIZERS_VERSION}; bindings/Ruby; version/#{VERSION}"
       }
       if auth_token
-        options["Authorization"] = "Bearer #{auth_token}"
+        headers["Authorization"] = "Bearer #{auth_token}"
       end
 
       url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }
 
       path =
         begin
-          cached_path(cache_dir, url, options)
+          cached_path(cache_dir, url, headers, options)
         rescue OpenURI::HTTPError
           raise Error, "Model \"#{identifier}\" on the Hub doesn't have a tokenizer"
         end
@@ -41,7 +43,7 @@ module Tokenizers
 
     # use same storage format as Rust version
     # https://github.com/epwalsh/rust-cached-path
-    def cached_path(cache_dir, url, options)
+    def cached_path(cache_dir, url, headers, options)
       fsum = Digest::SHA256.hexdigest(url)
       meta_paths = Dir[File.join(cache_dir, "#{fsum}.*.meta")]
       meta = meta_paths.map { |f| JSON.parse(File.read(f)) }.max_by { |m| m["creation_time"] }
@@ -50,21 +52,25 @@ module Tokenizers
       if etag
         esum = Digest::SHA256.hexdigest(etag)
         resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
-        options["If-None-Match"] = etag
+        if File.exist?(resource_path)
+          uri = URI(url)
+          req = Net::HTTP::Head.new(uri)
+          headers.each do |k, v|
+            req[k] = v
+          end
+          res = Net::HTTP.start(uri.hostname, uri.port, options.merge(use_ssl: true)) do |http|
+            http.request(req)
+          end
+          if res["etag"] == etag
+            return resource_path
+          end
+        end
       end
 
       options[:content_length_proc] = -> (_) { puts "Downloading..." }
 
-      tempfile =
-        begin
-          URI.parse(url).open(options)
-        rescue OpenURI::HTTPError => e
-          if e.message == "304 Not Modified"
-            return resource_path
-          else
-            raise e
-          end
-        end
+      # string options are headers
+      tempfile = URI.parse(url).open(headers.merge(options))
 
       etag = tempfile.meta["etag"]
       esum = Digest::SHA256.hexdigest(etag)
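The from_pretrained.rb change above does two things: it splits the old open-uri options hash, whose string keys doubled as request headers, into separate headers and options hashes, and it replaces the If-None-Match/304 Not Modified flow with an explicit Net::HTTP HEAD request that compares the server's ETag against the cached one before downloading. A minimal sketch of that revalidation pattern (the url, cached_etag, and header values here are illustrative, not taken from the gem):

```ruby
require "net/http"
require "uri"

# illustrative values -- the real code reads the ETag from a .meta file
# in its cache directory and builds the URL from the model identifier
url = "https://huggingface.co/bert-base-cased/resolve/main/tokenizer.json"
cached_etag = "\"0123abcd\""
headers = { "User-Agent" => "tokenizers/0.15.0; bindings/Ruby" }
options = { open_timeout: 3, read_timeout: 30 }

# issue a HEAD request with the same headers the download would use
uri = URI(url)
req = Net::HTTP::Head.new(uri)
headers.each { |k, v| req[k] = v }
res = Net::HTTP.start(uri.hostname, uri.port, options.merge(use_ssl: true)) do |http|
  http.request(req)
end

# reuse the cached file only while the server reports the same ETag
puts res["etag"] == cached_etag ? "cache still fresh" : "re-download"
```

A HEAD request costs one extra round trip, but it avoids re-downloading the full tokenizer.json when nothing changed and sidesteps open-uri's exception-based 304 handling.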
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.4.1
+  version: 0.4.3
 platform: arm64-darwin
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date:
+date: 2024-01-04 00:00:00.000000000 Z
 dependencies: []
 description:
 email: andrew@ankane.org
@@ -26,6 +26,7 @@ files:
 - lib/tokenizers/3.0/tokenizers.bundle
 - lib/tokenizers/3.1/tokenizers.bundle
 - lib/tokenizers/3.2/tokenizers.bundle
+- lib/tokenizers/3.3/tokenizers.bundle
 - lib/tokenizers/char_bpe_tokenizer.rb
 - lib/tokenizers/decoders/bpe_decoder.rb
 - lib/tokenizers/decoders/ctc.rb
@@ -70,7 +71,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
       version: '3.0'
   - - "<"
     - !ruby/object:Gem::Version
-      version: 3.3.dev
+      version: 3.4.dev
 required_rubygems_version: !ruby/object:Gem::Requirement
   requirements:
   - - ">="
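The required_ruby_version change, together with the new 3.3/tokenizers.bundle, means this precompiled gem now installs on Ruby 3.0 through 3.3. A quick way to see what the updated constraint admits, using the RubyGems API (the version numbers probed are arbitrary examples):

```ruby
require "rubygems"

# the constraint from the updated gemspec metadata
req = Gem::Requirement.new(">= 3.0", "< 3.4.dev")

p req.satisfied_by?(Gem::Version.new("3.2.2")) # => true
p req.satisfied_by?(Gem::Version.new("3.3.0")) # => true, newly in range
p req.satisfied_by?(Gem::Version.new("3.4.0")) # => false
```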