tokenizers 0.2.1 → 0.2.2

data/Cargo.toml CHANGED
@@ -3,8 +3,3 @@ members = ["ext/tokenizers"]
 
 [profile.release]
 strip = true
-
-[patch.crates-io]
-number_prefix = { git = "https://github.com/ankane/rust-number-prefix", branch = "license-file" }
-rb-sys-env = { git = "https://github.com/oxidize-rb/rb-sys" }
-tokenizers = { git = "https://github.com/huggingface/tokenizers" }
data/README.md CHANGED
@@ -24,8 +24,8 @@ Encode
 
 ```ruby
 encoded = tokenizer.encode("I can feel the magic, can you?")
-encoded.ids
 encoded.tokens
+encoded.ids
 ```
 
 Decode
data/ext/tokenizers/Cargo.toml CHANGED
@@ -1,6 +1,6 @@
 [package]
 name = "tokenizers"
-version = "0.2.1"
+version = "0.2.2"
 license = "Apache-2.0"
 authors = ["Andrew Kane <andrew@ankane.org>"]
 edition = "2021"
@@ -13,6 +13,7 @@ crate-type = ["cdylib"]
 magnus = "0.4"
 
 [dependencies.tokenizers]
-version = "0.13.2"
+version = "0.13.2" # also update in from_pretrained.rb
+git = "https://github.com/huggingface/tokenizers"
 default-features = false
-features = ["progressbar", "http", "onig", "esaxx_fast"]
+features = ["progressbar", "onig", "esaxx_fast"]
data/ext/tokenizers/src/lib.rs CHANGED
@@ -27,10 +27,7 @@ fn module() -> RModule {
 #[magnus::init]
 fn init() -> RbResult<()> {
     let module = module();
-    module.define_singleton_method(
-        "_from_pretrained",
-        function!(RbTokenizer::from_pretrained, 3),
-    )?;
+    module.define_singleton_method("from_file", function!(RbTokenizer::from_file, 1))?;
 
     let class = module.define_class("BPE", Default::default())?;
     class.define_singleton_method("new", function!(RbBPE::new, 2))?;
data/ext/tokenizers/src/tokenizer.rs CHANGED
@@ -1,5 +1,5 @@
-use magnus::Module;
 use std::cell::RefCell;
+use std::path::PathBuf;
 use tk::tokenizer::Tokenizer;
 use tk::AddedToken;
 
@@ -8,7 +8,7 @@ use super::encoding::RbEncoding;
 use super::models::RbBPE;
 use super::normalizers::RbBertNormalizer;
 use super::pre_tokenizers::RbBertPreTokenizer;
-use super::{module, RbError, RbResult};
+use super::{RbError, RbResult};
 
 #[magnus::wrap(class = "Tokenizers::Tokenizer")]
 pub struct RbTokenizer {
@@ -22,22 +22,8 @@ impl RbTokenizer {
         }
     }
 
-    pub fn from_pretrained(
-        identifier: String,
-        revision: String,
-        auth_token: Option<String>,
-    ) -> RbResult<Self> {
-        let version = module().const_get("VERSION").unwrap();
-        let params = tk::FromPretrainedParameters {
-            revision,
-            auth_token,
-            user_agent: [("bindings", "Ruby".to_string()), ("version", version)]
-                .iter()
-                .map(|(k, v)| (k.to_string(), v.to_string()))
-                .collect(),
-        };
-
-        Tokenizer::from_pretrained(identifier, Some(params))
+    pub fn from_file(path: PathBuf) -> RbResult<Self> {
+        Tokenizer::from_file(path)
             .map(|v| RbTokenizer {
                 tokenizer: RefCell::new(v),
             })
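On the Rust side, `from_pretrained` and its Hub-download logic are removed from the extension and replaced by a thin `from_file` binding around `Tokenizer::from_file`. A minimal usage sketch of the new entry point (the file path is a placeholder):

```ruby
require "tokenizers"

# Load a tokenizer saved as tokenizer.json (path is illustrative).
tokenizer = Tokenizers.from_file("path/to/tokenizer.json")

encoded = tokenizer.encode("I can feel the magic, can you?")
encoded.tokens
encoded.ids
```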
data/lib/tokenizers/from_pretrained.rb ADDED
@@ -0,0 +1,119 @@
+module Tokenizers
+  module FromPretrained
+    # for user agent
+    TOKENIZERS_VERSION = "0.13.2"
+
+    # use Ruby for downloads
+    # this avoids the need to vendor OpenSSL on Linux
+    # and reduces the extension size by about half
+    def from_pretrained(identifier, revision: "main", auth_token: nil)
+      require "cgi"
+      require "digest"
+      require "fileutils"
+      require "json"
+      require "open-uri"
+
+      cache_dir = ensure_cache_dir
+
+      # string options are headers
+      options = {
+        open_timeout: 3,
+        read_timeout: 30,
+        "User-Agent" => "tokenizers/#{TOKENIZERS_VERSION}; bindings/Ruby; version/#{VERSION}"
+      }
+      if auth_token
+        options["Authorization"] = "Bearer #{auth_token}"
+      end
+
+      url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }
+
+      path =
+        begin
+          cached_path(cache_dir, url, options)
+        rescue OpenURI::HTTPError
+          raise Error, "Model \"#{identifier}\" on the Hub doesn't have a tokenizer"
+        end
+
+      from_file(path)
+    end
+
+    private
+
+    # use same storage format as Rust version
+    # https://github.com/epwalsh/rust-cached-path
+    def cached_path(cache_dir, url, options)
+      fsum = Digest::SHA256.hexdigest(url)
+      meta_paths = Dir[File.join(cache_dir, "#{fsum}.*.meta")]
+      meta = meta_paths.map { |f| JSON.load_file(f) }.max_by { |m| m["creation_time"] }
+      etag = meta["etag"] if meta
+
+      if etag
+        esum = Digest::SHA256.hexdigest(etag)
+        resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
+        options["If-None-Match"] = etag if File.exist?(resource_path)
+      end
+
+      options[:content_length_proc] = -> (_) { puts "Downloading..." }
+
+      tempfile =
+        begin
+          URI.open(url, options)
+        rescue OpenURI::HTTPError => e
+          if e.message == "304 Not Modified"
+            return resource_path
+          else
+            raise e
+          end
+        end
+
+      etag = tempfile.meta["etag"]
+      esum = Digest::SHA256.hexdigest(etag)
+      resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
+      meta_path = "#{resource_path}.meta"
+
+      meta = {
+        resource: url,
+        resource_path: resource_path,
+        meta_path: meta_path,
+        etag: etag,
+        expires: nil,
+        creation_time: Time.now.to_f
+      }
+
+      File.write("#{resource_path}.lock", "")
+      File.open(resource_path, "wb") { |f| IO.copy_stream(tempfile, f) }
+      File.write(meta_path, JSON.generate(meta))
+
+      resource_path
+    end
+
+    def cache_dir
+      if ENV["TOKENIZERS_CACHE"]
+        ENV["TOKENIZERS_CACHE"]
+      else
+        # use same directory as Rust version
+        # https://docs.rs/dirs/latest/dirs/fn.cache_dir.html
+        dir =
+          if Gem.win_platform?
+            ENV.fetch("LOCALAPPDATA")
+          elsif mac?
+            File.join(ENV.fetch("HOME"), "Library", "Caches")
+          else
+            ENV["XDG_CACHE_HOME"] || File.join(ENV.fetch("HOME"), ".cache")
+          end
+
+        File.join(dir, "huggingface", "tokenizers")
+      end
+    end
+
+    def ensure_cache_dir
+      dir = cache_dir
+      FileUtils.mkdir_p(dir)
+      dir
+    end
+
+    def mac?
+      RbConfig::CONFIG["host_os"] =~ /darwin/i
+    end
+  end
+end
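Downloading now happens in Ruby via `open-uri`: the file is cached using the rust-cached-path naming scheme, `SHA256(url).SHA256(etag)` plus a `.meta` JSON sidecar, and later calls send `If-None-Match` so a `304 Not Modified` response reuses the cached copy. A usage sketch, assuming a public model that ships a `tokenizer.json` (the identifiers below are examples):

```ruby
require "tokenizers"

# First call downloads and caches tokenizer.json; repeat calls
# revalidate with If-None-Match and hit the cache on 304.
tokenizer = Tokenizers.from_pretrained("bert-base-uncased")

# Optional: custom cache location and private models.
# ENV["TOKENIZERS_CACHE"] = "/tmp/tokenizers"  # checked by cache_dir
# Tokenizers.from_pretrained("org/private-model", auth_token: "hf_...")
```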
data/lib/tokenizers/version.rb CHANGED
@@ -1,3 +1,3 @@
 module Tokenizers
-  VERSION = "0.2.1"
+  VERSION = "0.2.2"
 end
data/lib/tokenizers.rb CHANGED
@@ -7,12 +7,11 @@ end
 
 # modules
 require "tokenizers/char_bpe_tokenizer"
+require "tokenizers/from_pretrained"
 require "tokenizers/version"
 
 module Tokenizers
   class Error < StandardError; end
 
-  def self.from_pretrained(identifier, revision: "main", auth_token: nil)
-    _from_pretrained(identifier, revision, auth_token)
-  end
+  extend FromPretrained
 end
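`extend FromPretrained` copies the module's instance methods onto `Tokenizers` as singleton methods, so `Tokenizers.from_pretrained` keeps its previous signature while delegating to the new Ruby implementation. A generic sketch of the pattern (names are illustrative, not from the gem):

```ruby
module Fetchable
  def fetch(id)
    "fetching #{id}"
  end
end

module Client
  extend Fetchable # instance methods become Client singleton methods
end

Client.fetch("abc") # => "fetching abc"
```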
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.2.1
+  version: 0.2.2
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-01-12 00:00:00.000000000 Z
+date: 2023-01-15 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -48,6 +48,7 @@ files:
 - ext/tokenizers/src/tokenizer.rs
 - lib/tokenizers.rb
 - lib/tokenizers/char_bpe_tokenizer.rb
+- lib/tokenizers/from_pretrained.rb
 - lib/tokenizers/version.rb
 homepage: https://github.com/ankane/tokenizers-ruby
 licenses: