tokenizers 0.6.2-x86_64-darwin → 0.6.3-x86_64-darwin

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 0a793276da84b2b3c38df06471e9128072cf4c87e61c8f11f9c8885ecc79d5a6
4
- data.tar.gz: 9bb03308d9699b8464bdba326e45777d94a0d990ccad1997eef38ecbe113904f
3
+ metadata.gz: ab7df68f887ed98c7592e0553d82dcb0989033b3983aae43d093ffd8d9883ac0
4
+ data.tar.gz: e17dcd450c6e7f984cde6ecc9d2e9a5d8a695583c226d3e8ab2524a4a3c78c1d
5
5
  SHA512:
6
- metadata.gz: c63ff72dda8763dad39ec49ec18f3e7e9a05e0939f630a03d2fe9fdc70e590c1e8cd17108e3f5e9d6e9899241a1913c55d8c932abfc066ca632df19a28597fa4
7
- data.tar.gz: e1cf8d65b051f288d14d7e92622fe3ea07843acd1288b3aec39e324427b2fca88a38680a2c6b29e5559923614c6f25aac9c249692323da8534ba11696b7d936a
6
+ metadata.gz: 7067a8fc83ad516f1ac5f063b1c1b891ddd33490d5379e64b8b750aaf164c1a971a0e8053a91f7ae0b7c6eef4b311610c5cd6f3a30264f237b4f0b107557ff8d
7
+ data.tar.gz: '0986b6b549512d6308e3a273e66b7b68f2a84c3438fd390a3987b73d85b2449cb6478d35234a258c21c01c7f604bf1c5c61b18b0cb4c0d9969237da1dd12f50a'
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.6.3 (2026-01-05)
2
+
3
+ - Updated Tokenizers to 0.22.2
4
+ - Added support for Ruby 4.0
5
+ - Fixed `from_pretrained` method with namespaces
6
+
1
7
  ## 0.6.2 (2025-11-24)
2
8
 
3
9
  - Updated Tokenizers to 0.22.1
data/Cargo.lock CHANGED
@@ -124,9 +124,9 @@ dependencies = [
124
124
 
125
125
  [[package]]
126
126
  name = "console"
127
- version = "0.15.11"
127
+ version = "0.16.1"
128
128
  source = "registry+https://github.com/rust-lang/crates.io-index"
129
- checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8"
129
+ checksum = "b430743a6eb14e9764d4260d4c0d8123087d504eeb9c48f2b2a5e810dd369df4"
130
130
  dependencies = [
131
131
  "encode_unicode",
132
132
  "libc",
@@ -288,14 +288,14 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
288
288
 
289
289
  [[package]]
290
290
  name = "indicatif"
291
- version = "0.17.11"
291
+ version = "0.18.3"
292
292
  source = "registry+https://github.com/rust-lang/crates.io-index"
293
- checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235"
293
+ checksum = "9375e112e4b463ec1b1c6c011953545c65a30164fbab5b581df32b3abf0dcb88"
294
294
  dependencies = [
295
295
  "console",
296
- "number_prefix",
297
296
  "portable-atomic",
298
297
  "unicode-width",
298
+ "unit-prefix",
299
299
  "web-time",
300
300
  ]
301
301
 
@@ -449,12 +449,6 @@ dependencies = [
449
449
  "minimal-lexical",
450
450
  ]
451
451
 
452
- [[package]]
453
- name = "number_prefix"
454
- version = "0.4.0"
455
- source = "registry+https://github.com/rust-lang/crates.io-index"
456
- checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
457
-
458
452
  [[package]]
459
453
  name = "once_cell"
460
454
  version = "1.21.3"
@@ -596,18 +590,18 @@ dependencies = [
596
590
 
597
591
  [[package]]
598
592
  name = "rb-sys"
599
- version = "0.9.117"
593
+ version = "0.9.124"
600
594
  source = "registry+https://github.com/rust-lang/crates.io-index"
601
- checksum = "f900d1ce4629a2ebffaf5de74bd8f9c1188d4c5ed406df02f97e22f77a006f44"
595
+ checksum = "c85c4188462601e2aa1469def389c17228566f82ea72f137ed096f21591bc489"
602
596
  dependencies = [
603
597
  "rb-sys-build",
604
598
  ]
605
599
 
606
600
  [[package]]
607
601
  name = "rb-sys-build"
608
- version = "0.9.117"
602
+ version = "0.9.124"
609
603
  source = "registry+https://github.com/rust-lang/crates.io-index"
610
- checksum = "ef1e9c857028f631056bcd6d88cec390c751e343ce2223ddb26d23eb4a151d59"
604
+ checksum = "568068db4102230882e6d4ae8de6632e224ca75fe5970f6e026a04e91ed635d3"
611
605
  dependencies = [
612
606
  "bindgen",
613
607
  "lazy_static",
@@ -784,20 +778,9 @@ dependencies = [
784
778
 
785
779
  [[package]]
786
780
  name = "tokenizers"
787
- version = "0.6.2"
788
- dependencies = [
789
- "ahash",
790
- "magnus",
791
- "onig",
792
- "serde",
793
- "tokenizers 0.22.1",
794
- ]
795
-
796
- [[package]]
797
- name = "tokenizers"
798
- version = "0.22.1"
781
+ version = "0.22.2"
799
782
  source = "registry+https://github.com/rust-lang/crates.io-index"
800
- checksum = "6475a27088c98ea96d00b39a9ddfb63780d1ad4cceb6f48374349a96ab2b7842"
783
+ checksum = "b238e22d44a15349529690fb07bd645cf58149a1b1e44d6cb5bd1641ff1a6223"
801
784
  dependencies = [
802
785
  "ahash",
803
786
  "aho-corasick",
@@ -827,6 +810,17 @@ dependencies = [
827
810
  "unicode_categories",
828
811
  ]
829
812
 
813
+ [[package]]
814
+ name = "tokenizers-ruby"
815
+ version = "0.6.3"
816
+ dependencies = [
817
+ "ahash",
818
+ "magnus",
819
+ "onig",
820
+ "serde",
821
+ "tokenizers",
822
+ ]
823
+
830
824
  [[package]]
831
825
  name = "unicode-ident"
832
826
  version = "1.0.18"
@@ -860,6 +854,12 @@ version = "0.1.1"
860
854
  source = "registry+https://github.com/rust-lang/crates.io-index"
861
855
  checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
862
856
 
857
+ [[package]]
858
+ name = "unit-prefix"
859
+ version = "0.5.2"
860
+ source = "registry+https://github.com/rust-lang/crates.io-index"
861
+ checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3"
862
+
863
863
  [[package]]
864
864
  name = "version_check"
865
865
  version = "0.9.5"
@@ -942,13 +942,19 @@ dependencies = [
942
942
  "wasm-bindgen",
943
943
  ]
944
944
 
945
+ [[package]]
946
+ name = "windows-link"
947
+ version = "0.2.1"
948
+ source = "registry+https://github.com/rust-lang/crates.io-index"
949
+ checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
950
+
945
951
  [[package]]
946
952
  name = "windows-sys"
947
- version = "0.59.0"
953
+ version = "0.61.2"
948
954
  source = "registry+https://github.com/rust-lang/crates.io-index"
949
- checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
955
+ checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
950
956
  dependencies = [
951
- "windows-targets",
957
+ "windows-link",
952
958
  ]
953
959
 
954
960
  [[package]]
@@ -46,7 +46,7 @@ compact_str v0.9.0
46
46
  https://github.com/ParkMyCar/compact_str
47
47
  MIT
48
48
 
49
- console v0.15.11
49
+ console v0.16.1
50
50
  https://github.com/console-rs/console
51
51
  MIT
52
52
 
@@ -114,7 +114,7 @@ ident_case v1.0.1
114
114
  https://github.com/TedDriggs/ident_case
115
115
  MIT/Apache-2.0
116
116
 
117
- indicatif v0.17.11
117
+ indicatif v0.18.3
118
118
  https://github.com/console-rs/indicatif
119
119
  MIT
120
120
 
@@ -186,10 +186,6 @@ nom v7.1.3
186
186
  https://github.com/Geal/nom
187
187
  MIT
188
188
 
189
- number_prefix v0.4.0
190
- https://github.com/ogham/rust-number-prefix
191
- MIT
192
-
193
189
  once_cell v1.21.3
194
190
  https://github.com/matklad/once_cell
195
191
  MIT OR Apache-2.0
@@ -250,11 +246,11 @@ rayon-core v1.12.1
250
246
  https://github.com/rayon-rs/rayon
251
247
  MIT OR Apache-2.0
252
248
 
253
- rb-sys v0.9.117
249
+ rb-sys v0.9.124
254
250
  https://github.com/oxidize-rb/rb-sys
255
251
  MIT OR Apache-2.0
256
252
 
257
- rb-sys-build v0.9.117
253
+ rb-sys-build v0.9.124
258
254
  https://github.com/oxidize-rb/rb-sys
259
255
  MIT OR Apache-2.0
260
256
 
@@ -338,7 +334,7 @@ thiserror-impl v2.0.12
338
334
  https://github.com/dtolnay/thiserror
339
335
  MIT OR Apache-2.0
340
336
 
341
- tokenizers v0.22.1
337
+ tokenizers v0.22.2
342
338
  https://github.com/huggingface/tokenizers
343
339
  Apache-2.0
344
340
 
@@ -362,6 +358,10 @@ unicode_categories v0.1.1
362
358
  https://github.com/swgillespie/unicode-categories
363
359
  MIT OR Apache-2.0
364
360
 
361
+ unit-prefix v0.5.2
362
+ https://codeberg.org/commons-rs/unit-prefix
363
+ MIT
364
+
365
365
  version_check v0.9.5
366
366
  https://github.com/SergioBenitez/version_check
367
367
  MIT/Apache-2.0
@@ -8031,32 +8031,6 @@ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
8031
8031
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
8032
8032
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8033
8033
 
8034
- ================================================================================
8035
- number_prefix LICENCE
8036
- ================================================================================
8037
-
8038
- MIT License
8039
-
8040
- Copyright (c) 2018 Benjamin Sago
8041
-
8042
- Permission is hereby granted, free of charge, to any person obtaining a copy
8043
- of this software and associated documentation files (the "Software"), to deal
8044
- in the Software without restriction, including without limitation the rights
8045
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8046
- copies of the Software, and to permit persons to whom the Software is
8047
- furnished to do so, subject to the following conditions:
8048
-
8049
- The above copyright notice and this permission notice shall be included in all
8050
- copies or substantial portions of the Software.
8051
-
8052
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
8053
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
8054
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
8055
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
8056
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
8057
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
8058
- SOFTWARE.
8059
-
8060
8034
  ================================================================================
8061
8035
  once_cell LICENSE-APACHE
8062
8036
  ================================================================================
@@ -16984,6 +16958,32 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16984
16958
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
16985
16959
  SOFTWARE.
16986
16960
 
16961
+ ================================================================================
16962
+ unit-prefix LICENSE
16963
+ ================================================================================
16964
+
16965
+ MIT License
16966
+
16967
+ Copyright (c) 2024 Benjamin Sago, Fabio Valentini
16968
+
16969
+ Permission is hereby granted, free of charge, to any person obtaining a copy
16970
+ of this software and associated documentation files (the "Software"), to deal
16971
+ in the Software without restriction, including without limitation the rights
16972
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
16973
+ copies of the Software, and to permit persons to whom the Software is
16974
+ furnished to do so, subject to the following conditions:
16975
+
16976
+ The above copyright notice and this permission notice shall be included in all
16977
+ copies or substantial portions of the Software.
16978
+
16979
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16980
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16981
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16982
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
16983
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
16984
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
16985
+ SOFTWARE.
16986
+
16987
16987
  ================================================================================
16988
16988
  version_check LICENSE-APACHE
16989
16989
  ================================================================================
Binary file
Binary file
Binary file
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.22.1"
4
+ TOKENIZERS_VERSION = "0.22.2"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -27,7 +27,8 @@ module Tokenizers
27
27
  headers["Authorization"] = "Bearer #{auth_token}"
28
28
  end
29
29
 
30
- url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }
30
+ escaped_identifier = identifier.split("/", 2).map { |v| CGI.escape(v) }.join("/")
31
+ url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [escaped_identifier, CGI.escape(revision)]
31
32
 
32
33
  path =
33
34
  begin
@@ -53,14 +54,7 @@ module Tokenizers
53
54
  esum = Digest::SHA256.hexdigest(etag)
54
55
  resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
55
56
  if File.exist?(resource_path)
56
- uri = URI(url)
57
- req = Net::HTTP::Head.new(uri)
58
- headers.each do |k, v|
59
- req[k] = v
60
- end
61
- res = Net::HTTP.start(uri.hostname, uri.port, options.merge(use_ssl: true)) do |http|
62
- http.request(req)
63
- end
57
+ res = head_request(url, headers, options)
64
58
  if res["etag"] == etag
65
59
  return resource_path
66
60
  end
@@ -93,6 +87,25 @@ module Tokenizers
93
87
  resource_path
94
88
  end
95
89
 
90
+ def head_request(url, headers, options, redirects = 0)
91
+ uri = URI(url)
92
+ req = Net::HTTP::Head.new(uri)
93
+ headers.each do |k, v|
94
+ req[k] = v
95
+ end
96
+ res = Net::HTTP.start(uri.hostname, uri.port, options.merge(use_ssl: true)) do |http|
97
+ http.request(req)
98
+ end
99
+ if res.is_a?(Net::HTTPRedirection) && redirects < 3
100
+ location = URI.parse(res["location"])
101
+ # follow relative redirects only
102
+ if location.relative?
103
+ return head_request(uri.merge(location), headers, options, redirects + 1)
104
+ end
105
+ end
106
+ res
107
+ end
108
+
96
109
  def cache_dir
97
110
  cache_dir =
98
111
  if ENV["TOKENIZERS_CACHE"]
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.6.2"
2
+ VERSION = "0.6.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.6.3
5
5
  platform: x86_64-darwin
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-11-25 00:00:00.000000000 Z
11
+ date: 2026-01-06 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org
@@ -26,6 +26,7 @@ files:
26
26
  - lib/tokenizers/3.2/tokenizers.bundle
27
27
  - lib/tokenizers/3.3/tokenizers.bundle
28
28
  - lib/tokenizers/3.4/tokenizers.bundle
29
+ - lib/tokenizers/4.0/tokenizers.bundle
29
30
  - lib/tokenizers/added_token.rb
30
31
  - lib/tokenizers/char_bpe_tokenizer.rb
31
32
  - lib/tokenizers/decoders/bpe_decoder.rb
@@ -71,7 +72,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
71
72
  version: '3.2'
72
73
  - - "<"
73
74
  - !ruby/object:Gem::Version
74
- version: 3.5.dev
75
+ version: 4.1.dev
75
76
  required_rubygems_version: !ruby/object:Gem::Requirement
76
77
  requirements:
77
78
  - - ">="