tokenizers 0.6.2 → 0.6.3

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: caf36f6de3318df84c66a3ff99963e1b57aeca78736892ad143f86c82bfde737
4
- data.tar.gz: 1e8689c437f736aeb85e2163f5b4eb7a30b1b01ba3ec6c472afde16a7dd25040
3
+ metadata.gz: b7e76174884c06417a6829e89ef1f5785957d92f10d8a9acd6586e19ec84737c
4
+ data.tar.gz: 04b8e76f25f59e404978f2af68829142890c7bdd4ffbe7cd86a5af4f72de2d5e
5
5
  SHA512:
6
- metadata.gz: 88a48300b336c3afaf6ba2119835d548945419133e3f194b3ec8ed78c9a6b477389a4f60325c3233358e5f52b2e7b3ad79550a8e5b2828436757cad6796093ab
7
- data.tar.gz: af05786f53957827b7094bbf464dd281c44839de13ecbb3811e2897a95c5fd42fe27573259b8d337b1c975746c20ea9051b128bf7a048784654bba6f5fa003e9
6
+ metadata.gz: b9b2583c6c2aac22d835c045f6674a95a1f1a9dcdddd7d2406b34c7f64cb04bd8900de05feaa85a8f9d5601392636b41ed300326f362e9e5de29098506801cc6
7
+ data.tar.gz: '08f6e8e3c4187a5f3bd57e75141add2973857a1fe4211b86acd8ab43a3ec5fc5550309910c606d09267dfe3242c7088b7ffa2d5ccb0f94c5ee922deb0fd0c943'
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.6.3 (2026-01-05)
2
+
3
+ - Updated Tokenizers to 0.22.2
4
+ - Added support for Ruby 4.0
5
+ - Fixed `from_pretrained` method with namespaces
6
+
1
7
  ## 0.6.2 (2025-11-24)
2
8
 
3
9
  - Updated Tokenizers to 0.22.1
data/Cargo.lock CHANGED
@@ -124,9 +124,9 @@ dependencies = [
124
124
 
125
125
  [[package]]
126
126
  name = "console"
127
- version = "0.15.11"
127
+ version = "0.16.1"
128
128
  source = "registry+https://github.com/rust-lang/crates.io-index"
129
- checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8"
129
+ checksum = "b430743a6eb14e9764d4260d4c0d8123087d504eeb9c48f2b2a5e810dd369df4"
130
130
  dependencies = [
131
131
  "encode_unicode",
132
132
  "libc",
@@ -288,14 +288,14 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
288
288
 
289
289
  [[package]]
290
290
  name = "indicatif"
291
- version = "0.17.11"
291
+ version = "0.18.3"
292
292
  source = "registry+https://github.com/rust-lang/crates.io-index"
293
- checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235"
293
+ checksum = "9375e112e4b463ec1b1c6c011953545c65a30164fbab5b581df32b3abf0dcb88"
294
294
  dependencies = [
295
295
  "console",
296
- "number_prefix",
297
296
  "portable-atomic",
298
297
  "unicode-width",
298
+ "unit-prefix",
299
299
  "web-time",
300
300
  ]
301
301
 
@@ -449,12 +449,6 @@ dependencies = [
449
449
  "minimal-lexical",
450
450
  ]
451
451
 
452
- [[package]]
453
- name = "number_prefix"
454
- version = "0.4.0"
455
- source = "registry+https://github.com/rust-lang/crates.io-index"
456
- checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
457
-
458
452
  [[package]]
459
453
  name = "once_cell"
460
454
  version = "1.21.3"
@@ -596,18 +590,18 @@ dependencies = [
596
590
 
597
591
  [[package]]
598
592
  name = "rb-sys"
599
- version = "0.9.117"
593
+ version = "0.9.124"
600
594
  source = "registry+https://github.com/rust-lang/crates.io-index"
601
- checksum = "f900d1ce4629a2ebffaf5de74bd8f9c1188d4c5ed406df02f97e22f77a006f44"
595
+ checksum = "c85c4188462601e2aa1469def389c17228566f82ea72f137ed096f21591bc489"
602
596
  dependencies = [
603
597
  "rb-sys-build",
604
598
  ]
605
599
 
606
600
  [[package]]
607
601
  name = "rb-sys-build"
608
- version = "0.9.117"
602
+ version = "0.9.124"
609
603
  source = "registry+https://github.com/rust-lang/crates.io-index"
610
- checksum = "ef1e9c857028f631056bcd6d88cec390c751e343ce2223ddb26d23eb4a151d59"
604
+ checksum = "568068db4102230882e6d4ae8de6632e224ca75fe5970f6e026a04e91ed635d3"
611
605
  dependencies = [
612
606
  "bindgen",
613
607
  "lazy_static",
@@ -784,20 +778,9 @@ dependencies = [
784
778
 
785
779
  [[package]]
786
780
  name = "tokenizers"
787
- version = "0.6.2"
788
- dependencies = [
789
- "ahash",
790
- "magnus",
791
- "onig",
792
- "serde",
793
- "tokenizers 0.22.1",
794
- ]
795
-
796
- [[package]]
797
- name = "tokenizers"
798
- version = "0.22.1"
781
+ version = "0.22.2"
799
782
  source = "registry+https://github.com/rust-lang/crates.io-index"
800
- checksum = "6475a27088c98ea96d00b39a9ddfb63780d1ad4cceb6f48374349a96ab2b7842"
783
+ checksum = "b238e22d44a15349529690fb07bd645cf58149a1b1e44d6cb5bd1641ff1a6223"
801
784
  dependencies = [
802
785
  "ahash",
803
786
  "aho-corasick",
@@ -827,6 +810,17 @@ dependencies = [
827
810
  "unicode_categories",
828
811
  ]
829
812
 
813
+ [[package]]
814
+ name = "tokenizers-ruby"
815
+ version = "0.6.3"
816
+ dependencies = [
817
+ "ahash",
818
+ "magnus",
819
+ "onig",
820
+ "serde",
821
+ "tokenizers",
822
+ ]
823
+
830
824
  [[package]]
831
825
  name = "unicode-ident"
832
826
  version = "1.0.18"
@@ -860,6 +854,12 @@ version = "0.1.1"
860
854
  source = "registry+https://github.com/rust-lang/crates.io-index"
861
855
  checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
862
856
 
857
+ [[package]]
858
+ name = "unit-prefix"
859
+ version = "0.5.2"
860
+ source = "registry+https://github.com/rust-lang/crates.io-index"
861
+ checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3"
862
+
863
863
  [[package]]
864
864
  name = "version_check"
865
865
  version = "0.9.5"
@@ -942,13 +942,19 @@ dependencies = [
942
942
  "wasm-bindgen",
943
943
  ]
944
944
 
945
+ [[package]]
946
+ name = "windows-link"
947
+ version = "0.2.1"
948
+ source = "registry+https://github.com/rust-lang/crates.io-index"
949
+ checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
950
+
945
951
  [[package]]
946
952
  name = "windows-sys"
947
- version = "0.59.0"
953
+ version = "0.61.2"
948
954
  source = "registry+https://github.com/rust-lang/crates.io-index"
949
- checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
955
+ checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
950
956
  dependencies = [
951
- "windows-targets",
957
+ "windows-link",
952
958
  ]
953
959
 
954
960
  [[package]]
@@ -1,6 +1,6 @@
1
1
  [package]
2
- name = "tokenizers"
3
- version = "0.6.2"
2
+ name = "tokenizers-ruby"
3
+ version = "0.6.3"
4
4
  license = "Apache-2.0"
5
5
  authors = ["Andrew Kane <andrew@ankane.org>"]
6
6
  edition = "2021"
@@ -8,6 +8,7 @@ rust-version = "1.63.0"
8
8
  publish = false
9
9
 
10
10
  [lib]
11
+ name = "tokenizers"
11
12
  crate-type = ["cdylib"]
12
13
 
13
14
  [dependencies]
@@ -17,6 +18,6 @@ onig = { version = "6", default-features = false }
17
18
  serde = { version = "1", features = ["rc", "derive"] }
18
19
 
19
20
  [dependencies.tokenizers]
20
- version = "=0.22.1" # also update in from_pretrained.rb
21
+ version = "=0.22.2" # also update in from_pretrained.rb
21
22
  default-features = false
22
23
  features = ["progressbar", "onig", "esaxx_fast"]
@@ -49,7 +49,7 @@ static PROCESSORS: Lazy<RModule> =
49
49
  static TRAINERS: Lazy<RModule> =
50
50
  Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Trainers").unwrap());
51
51
 
52
- #[magnus::init]
52
+ #[magnus::init(name = "tokenizers")]
53
53
  fn init(ruby: &Ruby) -> RbResult<()> {
54
54
  let module = ruby.define_module("Tokenizers")?;
55
55
 
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.22.1"
4
+ TOKENIZERS_VERSION = "0.22.2"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -27,7 +27,8 @@ module Tokenizers
27
27
  headers["Authorization"] = "Bearer #{auth_token}"
28
28
  end
29
29
 
30
- url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }
30
+ escaped_identifier = identifier.split("/", 2).map { |v| CGI.escape(v) }.join("/")
31
+ url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [escaped_identifier, CGI.escape(revision)]
31
32
 
32
33
  path =
33
34
  begin
@@ -53,14 +54,7 @@ module Tokenizers
53
54
  esum = Digest::SHA256.hexdigest(etag)
54
55
  resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
55
56
  if File.exist?(resource_path)
56
- uri = URI(url)
57
- req = Net::HTTP::Head.new(uri)
58
- headers.each do |k, v|
59
- req[k] = v
60
- end
61
- res = Net::HTTP.start(uri.hostname, uri.port, options.merge(use_ssl: true)) do |http|
62
- http.request(req)
63
- end
57
+ res = head_request(url, headers, options)
64
58
  if res["etag"] == etag
65
59
  return resource_path
66
60
  end
@@ -93,6 +87,25 @@ module Tokenizers
93
87
  resource_path
94
88
  end
95
89
 
90
+ def head_request(url, headers, options, redirects = 0)
91
+ uri = URI(url)
92
+ req = Net::HTTP::Head.new(uri)
93
+ headers.each do |k, v|
94
+ req[k] = v
95
+ end
96
+ res = Net::HTTP.start(uri.hostname, uri.port, options.merge(use_ssl: true)) do |http|
97
+ http.request(req)
98
+ end
99
+ if res.is_a?(Net::HTTPRedirection) && redirects < 3
100
+ location = URI.parse(res["location"])
101
+ # follow relative redirects only
102
+ if location.relative?
103
+ return head_request(uri.merge(location), headers, options, redirects + 1)
104
+ end
105
+ end
106
+ res
107
+ end
108
+
96
109
  def cache_dir
97
110
  cache_dir =
98
111
  if ENV["TOKENIZERS_CACHE"]
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.6.2"
2
+ VERSION = "0.6.3"
3
3
  end
metadata CHANGED
@@ -1,7 +1,7 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.6.3
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
@@ -98,7 +98,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
98
98
  - !ruby/object:Gem::Version
99
99
  version: '0'
100
100
  requirements: []
101
- rubygems_version: 3.6.9
101
+ rubygems_version: 4.0.3
102
102
  specification_version: 4
103
103
  summary: Fast state-of-the-art tokenizers for Ruby
104
104
  test_files: []