tokenizers 0.6.2-x64-mingw-ucrt → 0.6.3-x64-mingw-ucrt

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 8aa07d83801712c72cfc0fac5601f8fd4d59b8672a50785408c14333d28992d5
4
- data.tar.gz: c6e76f9bc88ed251dbef80680390eb78c252edc3b8be1d72562ca25c9db13623
3
+ metadata.gz: 8a5126bb08eb492411710ed30f9dd3f8ee02907eceec777429495ff5b974fff4
4
+ data.tar.gz: 0eca6198e6a605eaa5599bf6d67c96134658ab7209708bdbb4176df320c424fd
5
5
  SHA512:
6
- metadata.gz: eec2eca0c73b18c679e9b5ce1241c8b5b44ebd2d7973d8c1f832b6063e46643178e6c1e288ce0cc5f5f998088651520103c20c817e9cfc876e64ffbeae7b2bd4
7
- data.tar.gz: a7707d2de91a1e09cd0b0d4113e32430f27a69e82935f74f61993d690768076e6fbabb976669e90d2e0a15de56e2eb85995c8b19c45ede1ce0772b01f6c5b1ee
6
+ metadata.gz: a97579658629bd534db971c49c76c0d26e3e70b41b883654e3ff8e3185a25ab7a532e7d444eb09ee62fc20f4a303694e21cabebeae93dd0200166e4b2e47811d
7
+ data.tar.gz: 3446592210d668e8388510dcbd8f3aede10d05291ad41df81bf4b82667f6a28826824e090a7b983eb3da1b19b7283835a472a88c8f2182baba9328c4c8c9e5c9
data/CHANGELOG.md CHANGED
@@ -1,3 +1,9 @@
1
+ ## 0.6.3 (2026-01-05)
2
+
3
+ - Updated Tokenizers to 0.22.2
4
+ - Added support for Ruby 4.0
5
+ - Fixed `from_pretrained` method with namespaces
6
+
1
7
  ## 0.6.2 (2025-11-24)
2
8
 
3
9
  - Updated Tokenizers to 0.22.1
data/Cargo.lock CHANGED
@@ -124,9 +124,9 @@ dependencies = [
124
124
 
125
125
  [[package]]
126
126
  name = "console"
127
- version = "0.15.11"
127
+ version = "0.16.1"
128
128
  source = "registry+https://github.com/rust-lang/crates.io-index"
129
- checksum = "054ccb5b10f9f2cbf51eb355ca1d05c2d279ce1804688d0db74b4733a5aeafd8"
129
+ checksum = "b430743a6eb14e9764d4260d4c0d8123087d504eeb9c48f2b2a5e810dd369df4"
130
130
  dependencies = [
131
131
  "encode_unicode",
132
132
  "libc",
@@ -288,14 +288,14 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
288
288
 
289
289
  [[package]]
290
290
  name = "indicatif"
291
- version = "0.17.11"
291
+ version = "0.18.3"
292
292
  source = "registry+https://github.com/rust-lang/crates.io-index"
293
- checksum = "183b3088984b400f4cfac3620d5e076c84da5364016b4f49473de574b2586235"
293
+ checksum = "9375e112e4b463ec1b1c6c011953545c65a30164fbab5b581df32b3abf0dcb88"
294
294
  dependencies = [
295
295
  "console",
296
- "number_prefix",
297
296
  "portable-atomic",
298
297
  "unicode-width",
298
+ "unit-prefix",
299
299
  "web-time",
300
300
  ]
301
301
 
@@ -449,12 +449,6 @@ dependencies = [
449
449
  "minimal-lexical",
450
450
  ]
451
451
 
452
- [[package]]
453
- name = "number_prefix"
454
- version = "0.4.0"
455
- source = "registry+https://github.com/rust-lang/crates.io-index"
456
- checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
457
-
458
452
  [[package]]
459
453
  name = "once_cell"
460
454
  version = "1.21.3"
@@ -596,18 +590,18 @@ dependencies = [
596
590
 
597
591
  [[package]]
598
592
  name = "rb-sys"
599
- version = "0.9.117"
593
+ version = "0.9.124"
600
594
  source = "registry+https://github.com/rust-lang/crates.io-index"
601
- checksum = "f900d1ce4629a2ebffaf5de74bd8f9c1188d4c5ed406df02f97e22f77a006f44"
595
+ checksum = "c85c4188462601e2aa1469def389c17228566f82ea72f137ed096f21591bc489"
602
596
  dependencies = [
603
597
  "rb-sys-build",
604
598
  ]
605
599
 
606
600
  [[package]]
607
601
  name = "rb-sys-build"
608
- version = "0.9.117"
602
+ version = "0.9.124"
609
603
  source = "registry+https://github.com/rust-lang/crates.io-index"
610
- checksum = "ef1e9c857028f631056bcd6d88cec390c751e343ce2223ddb26d23eb4a151d59"
604
+ checksum = "568068db4102230882e6d4ae8de6632e224ca75fe5970f6e026a04e91ed635d3"
611
605
  dependencies = [
612
606
  "bindgen",
613
607
  "lazy_static",
@@ -784,20 +778,9 @@ dependencies = [
784
778
 
785
779
  [[package]]
786
780
  name = "tokenizers"
787
- version = "0.6.2"
788
- dependencies = [
789
- "ahash",
790
- "magnus",
791
- "onig",
792
- "serde",
793
- "tokenizers 0.22.1",
794
- ]
795
-
796
- [[package]]
797
- name = "tokenizers"
798
- version = "0.22.1"
781
+ version = "0.22.2"
799
782
  source = "registry+https://github.com/rust-lang/crates.io-index"
800
- checksum = "6475a27088c98ea96d00b39a9ddfb63780d1ad4cceb6f48374349a96ab2b7842"
783
+ checksum = "b238e22d44a15349529690fb07bd645cf58149a1b1e44d6cb5bd1641ff1a6223"
801
784
  dependencies = [
802
785
  "ahash",
803
786
  "aho-corasick",
@@ -827,6 +810,17 @@ dependencies = [
827
810
  "unicode_categories",
828
811
  ]
829
812
 
813
+ [[package]]
814
+ name = "tokenizers-ruby"
815
+ version = "0.6.3"
816
+ dependencies = [
817
+ "ahash",
818
+ "magnus",
819
+ "onig",
820
+ "serde",
821
+ "tokenizers",
822
+ ]
823
+
830
824
  [[package]]
831
825
  name = "unicode-ident"
832
826
  version = "1.0.18"
@@ -860,6 +854,12 @@ version = "0.1.1"
860
854
  source = "registry+https://github.com/rust-lang/crates.io-index"
861
855
  checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
862
856
 
857
+ [[package]]
858
+ name = "unit-prefix"
859
+ version = "0.5.2"
860
+ source = "registry+https://github.com/rust-lang/crates.io-index"
861
+ checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3"
862
+
863
863
  [[package]]
864
864
  name = "version_check"
865
865
  version = "0.9.5"
@@ -942,13 +942,19 @@ dependencies = [
942
942
  "wasm-bindgen",
943
943
  ]
944
944
 
945
+ [[package]]
946
+ name = "windows-link"
947
+ version = "0.2.1"
948
+ source = "registry+https://github.com/rust-lang/crates.io-index"
949
+ checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
950
+
945
951
  [[package]]
946
952
  name = "windows-sys"
947
- version = "0.59.0"
953
+ version = "0.61.2"
948
954
  source = "registry+https://github.com/rust-lang/crates.io-index"
949
- checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
955
+ checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
950
956
  dependencies = [
951
- "windows-targets",
957
+ "windows-link",
952
958
  ]
953
959
 
954
960
  [[package]]
@@ -46,7 +46,7 @@ compact_str v0.9.0
46
46
  https://github.com/ParkMyCar/compact_str
47
47
  MIT
48
48
 
49
- console v0.15.11
49
+ console v0.16.1
50
50
  https://github.com/console-rs/console
51
51
  MIT
52
52
 
@@ -118,7 +118,7 @@ ident_case v1.0.1
118
118
  https://github.com/TedDriggs/ident_case
119
119
  MIT/Apache-2.0
120
120
 
121
- indicatif v0.17.11
121
+ indicatif v0.18.3
122
122
  https://github.com/console-rs/indicatif
123
123
  MIT
124
124
 
@@ -190,10 +190,6 @@ nom v7.1.3
190
190
  https://github.com/Geal/nom
191
191
  MIT
192
192
 
193
- number_prefix v0.4.0
194
- https://github.com/ogham/rust-number-prefix
195
- MIT
196
-
197
193
  once_cell v1.21.3
198
194
  https://github.com/matklad/once_cell
199
195
  MIT OR Apache-2.0
@@ -254,11 +250,11 @@ rayon-core v1.12.1
254
250
  https://github.com/rayon-rs/rayon
255
251
  MIT OR Apache-2.0
256
252
 
257
- rb-sys v0.9.117
253
+ rb-sys v0.9.124
258
254
  https://github.com/oxidize-rb/rb-sys
259
255
  MIT OR Apache-2.0
260
256
 
261
- rb-sys-build v0.9.117
257
+ rb-sys-build v0.9.124
262
258
  https://github.com/oxidize-rb/rb-sys
263
259
  MIT OR Apache-2.0
264
260
 
@@ -342,7 +338,7 @@ thiserror-impl v2.0.12
342
338
  https://github.com/dtolnay/thiserror
343
339
  MIT OR Apache-2.0
344
340
 
345
- tokenizers v0.22.1
341
+ tokenizers v0.22.2
346
342
  https://github.com/huggingface/tokenizers
347
343
  Apache-2.0
348
344
 
@@ -366,11 +362,19 @@ unicode_categories v0.1.1
366
362
  https://github.com/swgillespie/unicode-categories
367
363
  MIT OR Apache-2.0
368
364
 
365
+ unit-prefix v0.5.2
366
+ https://codeberg.org/commons-rs/unit-prefix
367
+ MIT
368
+
369
369
  version_check v0.9.5
370
370
  https://github.com/SergioBenitez/version_check
371
371
  MIT/Apache-2.0
372
372
 
373
- windows-sys v0.59.0
373
+ windows-link v0.2.1
374
+ https://github.com/microsoft/windows-rs
375
+ MIT OR Apache-2.0
376
+
377
+ windows-sys v0.61.2
374
378
  https://github.com/microsoft/windows-rs
375
379
  MIT OR Apache-2.0
376
380
 
@@ -8276,32 +8280,6 @@ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
8276
8280
  OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
8277
8281
  WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
8278
8282
 
8279
- ================================================================================
8280
- number_prefix LICENCE
8281
- ================================================================================
8282
-
8283
- MIT License
8284
-
8285
- Copyright (c) 2018 Benjamin Sago
8286
-
8287
- Permission is hereby granted, free of charge, to any person obtaining a copy
8288
- of this software and associated documentation files (the "Software"), to deal
8289
- in the Software without restriction, including without limitation the rights
8290
- to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
8291
- copies of the Software, and to permit persons to whom the Software is
8292
- furnished to do so, subject to the following conditions:
8293
-
8294
- The above copyright notice and this permission notice shall be included in all
8295
- copies or substantial portions of the Software.
8296
-
8297
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
8298
- IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
8299
- FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
8300
- AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
8301
- LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
8302
- OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
8303
- SOFTWARE.
8304
-
8305
8283
  ================================================================================
8306
8284
  once_cell LICENSE-APACHE
8307
8285
  ================================================================================
@@ -17229,6 +17207,32 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
17229
17207
  OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
17230
17208
  SOFTWARE.
17231
17209
 
17210
+ ================================================================================
17211
+ unit-prefix LICENSE
17212
+ ================================================================================
17213
+
17214
+ MIT License
17215
+
17216
+ Copyright (c) 2024 Benjamin Sago, Fabio Valentini
17217
+
17218
+ Permission is hereby granted, free of charge, to any person obtaining a copy
17219
+ of this software and associated documentation files (the "Software"), to deal
17220
+ in the Software without restriction, including without limitation the rights
17221
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17222
+ copies of the Software, and to permit persons to whom the Software is
17223
+ furnished to do so, subject to the following conditions:
17224
+
17225
+ The above copyright notice and this permission notice shall be included in all
17226
+ copies or substantial portions of the Software.
17227
+
17228
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17229
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17230
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17231
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17232
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
17233
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
17234
+ SOFTWARE.
17235
+
17232
17236
  ================================================================================
17233
17237
  version_check LICENSE-APACHE
17234
17238
  ================================================================================
@@ -17459,6 +17463,32 @@ COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
17459
17463
  IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
17460
17464
  CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
17461
17465
 
17466
+ ================================================================================
17467
+ windows-link license-mit
17468
+ ================================================================================
17469
+
17470
+ MIT License
17471
+
17472
+ Copyright (c) Microsoft Corporation.
17473
+
17474
+ Permission is hereby granted, free of charge, to any person obtaining a copy
17475
+ of this software and associated documentation files (the "Software"), to deal
17476
+ in the Software without restriction, including without limitation the rights
17477
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
17478
+ copies of the Software, and to permit persons to whom the Software is
17479
+ furnished to do so, subject to the following conditions:
17480
+
17481
+ The above copyright notice and this permission notice shall be included in all
17482
+ copies or substantial portions of the Software.
17483
+
17484
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17485
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17486
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
17487
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17488
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
17489
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
17490
+ SOFTWARE
17491
+
17462
17492
  ================================================================================
17463
17493
  windows-sys license-mit
17464
17494
  ================================================================================
Binary file
Binary file
Binary file
Binary file
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.22.1"
4
+ TOKENIZERS_VERSION = "0.22.2"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -27,7 +27,8 @@ module Tokenizers
27
27
  headers["Authorization"] = "Bearer #{auth_token}"
28
28
  end
29
29
 
30
- url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [identifier, revision].map { |v| CGI.escape(v) }
30
+ escaped_identifier = identifier.split("/", 2).map { |v| CGI.escape(v) }.join("/")
31
+ url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [escaped_identifier, CGI.escape(revision)]
31
32
 
32
33
  path =
33
34
  begin
@@ -53,14 +54,7 @@ module Tokenizers
53
54
  esum = Digest::SHA256.hexdigest(etag)
54
55
  resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
55
56
  if File.exist?(resource_path)
56
- uri = URI(url)
57
- req = Net::HTTP::Head.new(uri)
58
- headers.each do |k, v|
59
- req[k] = v
60
- end
61
- res = Net::HTTP.start(uri.hostname, uri.port, options.merge(use_ssl: true)) do |http|
62
- http.request(req)
63
- end
57
+ res = head_request(url, headers, options)
64
58
  if res["etag"] == etag
65
59
  return resource_path
66
60
  end
@@ -93,6 +87,25 @@ module Tokenizers
93
87
  resource_path
94
88
  end
95
89
 
90
+ def head_request(url, headers, options, redirects = 0)
91
+ uri = URI(url)
92
+ req = Net::HTTP::Head.new(uri)
93
+ headers.each do |k, v|
94
+ req[k] = v
95
+ end
96
+ res = Net::HTTP.start(uri.hostname, uri.port, options.merge(use_ssl: true)) do |http|
97
+ http.request(req)
98
+ end
99
+ if res.is_a?(Net::HTTPRedirection) && redirects < 3
100
+ location = URI.parse(res["location"])
101
+ # follow relative redirects only
102
+ if location.relative?
103
+ return head_request(uri.merge(location), headers, options, redirects + 1)
104
+ end
105
+ end
106
+ res
107
+ end
108
+
96
109
  def cache_dir
97
110
  cache_dir =
98
111
  if ENV["TOKENIZERS_CACHE"]
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.6.2"
2
+ VERSION = "0.6.3"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.6.2
4
+ version: 0.6.3
5
5
  platform: x64-mingw-ucrt
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2025-11-25 00:00:00.000000000 Z
11
+ date: 2026-01-06 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org
@@ -26,6 +26,7 @@ files:
26
26
  - lib/tokenizers/3.2/tokenizers.so
27
27
  - lib/tokenizers/3.3/tokenizers.so
28
28
  - lib/tokenizers/3.4/tokenizers.so
29
+ - lib/tokenizers/4.0/tokenizers.so
29
30
  - lib/tokenizers/added_token.rb
30
31
  - lib/tokenizers/char_bpe_tokenizer.rb
31
32
  - lib/tokenizers/decoders/bpe_decoder.rb
@@ -71,7 +72,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
71
72
  version: '3.2'
72
73
  - - "<"
73
74
  - !ruby/object:Gem::Version
74
- version: 3.5.dev
75
+ version: 4.1.dev
75
76
  required_rubygems_version: !ruby/object:Gem::Requirement
76
77
  requirements:
77
78
  - - ">="