tokenizers 0.6.2-x86_64-darwin → 0.6.3-x86_64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/Cargo.lock +37 -31
- data/LICENSE-THIRD-PARTY.txt +35 -35
- data/lib/tokenizers/3.2/tokenizers.bundle +0 -0
- data/lib/tokenizers/3.3/tokenizers.bundle +0 -0
- data/lib/tokenizers/3.4/tokenizers.bundle +0 -0
- data/lib/tokenizers/4.0/tokenizers.bundle +0 -0
- data/lib/tokenizers/from_pretrained.rb +23 -10
- data/lib/tokenizers/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: ab7df68f887ed98c7592e0553d82dcb0989033b3983aae43d093ffd8d9883ac0
|
|
4
|
+
data.tar.gz: e17dcd450c6e7f984cde6ecc9d2e9a5d8a695583c226d3e8ab2524a4a3c78c1d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 7067a8fc83ad516f1ac5f063b1c1b891ddd33490d5379e64b8b750aaf164c1a971a0e8053a91f7ae0b7c6eef4b311610c5cd6f3a30264f237b4f0b107557ff8d
|
|
7
|
+
data.tar.gz: '0986b6b549512d6308e3a273e66b7b68f2a84c3438fd390a3987b73d85b2449cb6478d35234a258c21c01c7f604bf1c5c61b18b0cb4c0d9969237da1dd12f50a'
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
|
@@ -124,9 +124,9 @@ dependencies = [
|
|
|
124
124
|
|
|
125
125
|
[[package]]
|
|
126
126
|
name = "console"
|
|
127
|
-
version = "0.
|
|
127
|
+
version = "0.16.1"
|
|
128
128
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
129
|
-
checksum = "
|
|
129
|
+
checksum = "b430743a6eb14e9764d4260d4c0d8123087d504eeb9c48f2b2a5e810dd369df4"
|
|
130
130
|
dependencies = [
|
|
131
131
|
"encode_unicode",
|
|
132
132
|
"libc",
|
|
@@ -288,14 +288,14 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39"
|
|
|
288
288
|
|
|
289
289
|
[[package]]
|
|
290
290
|
name = "indicatif"
|
|
291
|
-
version = "0.
|
|
291
|
+
version = "0.18.3"
|
|
292
292
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
293
|
-
checksum = "
|
|
293
|
+
checksum = "9375e112e4b463ec1b1c6c011953545c65a30164fbab5b581df32b3abf0dcb88"
|
|
294
294
|
dependencies = [
|
|
295
295
|
"console",
|
|
296
|
-
"number_prefix",
|
|
297
296
|
"portable-atomic",
|
|
298
297
|
"unicode-width",
|
|
298
|
+
"unit-prefix",
|
|
299
299
|
"web-time",
|
|
300
300
|
]
|
|
301
301
|
|
|
@@ -449,12 +449,6 @@ dependencies = [
|
|
|
449
449
|
"minimal-lexical",
|
|
450
450
|
]
|
|
451
451
|
|
|
452
|
-
[[package]]
|
|
453
|
-
name = "number_prefix"
|
|
454
|
-
version = "0.4.0"
|
|
455
|
-
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
456
|
-
checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3"
|
|
457
|
-
|
|
458
452
|
[[package]]
|
|
459
453
|
name = "once_cell"
|
|
460
454
|
version = "1.21.3"
|
|
@@ -596,18 +590,18 @@ dependencies = [
|
|
|
596
590
|
|
|
597
591
|
[[package]]
|
|
598
592
|
name = "rb-sys"
|
|
599
|
-
version = "0.9.
|
|
593
|
+
version = "0.9.124"
|
|
600
594
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
601
|
-
checksum = "
|
|
595
|
+
checksum = "c85c4188462601e2aa1469def389c17228566f82ea72f137ed096f21591bc489"
|
|
602
596
|
dependencies = [
|
|
603
597
|
"rb-sys-build",
|
|
604
598
|
]
|
|
605
599
|
|
|
606
600
|
[[package]]
|
|
607
601
|
name = "rb-sys-build"
|
|
608
|
-
version = "0.9.
|
|
602
|
+
version = "0.9.124"
|
|
609
603
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
610
|
-
checksum = "
|
|
604
|
+
checksum = "568068db4102230882e6d4ae8de6632e224ca75fe5970f6e026a04e91ed635d3"
|
|
611
605
|
dependencies = [
|
|
612
606
|
"bindgen",
|
|
613
607
|
"lazy_static",
|
|
@@ -784,20 +778,9 @@ dependencies = [
|
|
|
784
778
|
|
|
785
779
|
[[package]]
|
|
786
780
|
name = "tokenizers"
|
|
787
|
-
version = "0.
|
|
788
|
-
dependencies = [
|
|
789
|
-
"ahash",
|
|
790
|
-
"magnus",
|
|
791
|
-
"onig",
|
|
792
|
-
"serde",
|
|
793
|
-
"tokenizers 0.22.1",
|
|
794
|
-
]
|
|
795
|
-
|
|
796
|
-
[[package]]
|
|
797
|
-
name = "tokenizers"
|
|
798
|
-
version = "0.22.1"
|
|
781
|
+
version = "0.22.2"
|
|
799
782
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
800
|
-
checksum = "
|
|
783
|
+
checksum = "b238e22d44a15349529690fb07bd645cf58149a1b1e44d6cb5bd1641ff1a6223"
|
|
801
784
|
dependencies = [
|
|
802
785
|
"ahash",
|
|
803
786
|
"aho-corasick",
|
|
@@ -827,6 +810,17 @@ dependencies = [
|
|
|
827
810
|
"unicode_categories",
|
|
828
811
|
]
|
|
829
812
|
|
|
813
|
+
[[package]]
|
|
814
|
+
name = "tokenizers-ruby"
|
|
815
|
+
version = "0.6.3"
|
|
816
|
+
dependencies = [
|
|
817
|
+
"ahash",
|
|
818
|
+
"magnus",
|
|
819
|
+
"onig",
|
|
820
|
+
"serde",
|
|
821
|
+
"tokenizers",
|
|
822
|
+
]
|
|
823
|
+
|
|
830
824
|
[[package]]
|
|
831
825
|
name = "unicode-ident"
|
|
832
826
|
version = "1.0.18"
|
|
@@ -860,6 +854,12 @@ version = "0.1.1"
|
|
|
860
854
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
861
855
|
checksum = "39ec24b3121d976906ece63c9daad25b85969647682eee313cb5779fdd69e14e"
|
|
862
856
|
|
|
857
|
+
[[package]]
|
|
858
|
+
name = "unit-prefix"
|
|
859
|
+
version = "0.5.2"
|
|
860
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
861
|
+
checksum = "81e544489bf3d8ef66c953931f56617f423cd4b5494be343d9b9d3dda037b9a3"
|
|
862
|
+
|
|
863
863
|
[[package]]
|
|
864
864
|
name = "version_check"
|
|
865
865
|
version = "0.9.5"
|
|
@@ -942,13 +942,19 @@ dependencies = [
|
|
|
942
942
|
"wasm-bindgen",
|
|
943
943
|
]
|
|
944
944
|
|
|
945
|
+
[[package]]
|
|
946
|
+
name = "windows-link"
|
|
947
|
+
version = "0.2.1"
|
|
948
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
949
|
+
checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5"
|
|
950
|
+
|
|
945
951
|
[[package]]
|
|
946
952
|
name = "windows-sys"
|
|
947
|
-
version = "0.
|
|
953
|
+
version = "0.61.2"
|
|
948
954
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
949
|
-
checksum = "
|
|
955
|
+
checksum = "ae137229bcbd6cdf0f7b80a31df61766145077ddf49416a728b02cb3921ff3fc"
|
|
950
956
|
dependencies = [
|
|
951
|
-
"windows-
|
|
957
|
+
"windows-link",
|
|
952
958
|
]
|
|
953
959
|
|
|
954
960
|
[[package]]
|
data/LICENSE-THIRD-PARTY.txt
CHANGED
|
@@ -46,7 +46,7 @@ compact_str v0.9.0
|
|
|
46
46
|
https://github.com/ParkMyCar/compact_str
|
|
47
47
|
MIT
|
|
48
48
|
|
|
49
|
-
console v0.
|
|
49
|
+
console v0.16.1
|
|
50
50
|
https://github.com/console-rs/console
|
|
51
51
|
MIT
|
|
52
52
|
|
|
@@ -114,7 +114,7 @@ ident_case v1.0.1
|
|
|
114
114
|
https://github.com/TedDriggs/ident_case
|
|
115
115
|
MIT/Apache-2.0
|
|
116
116
|
|
|
117
|
-
indicatif v0.
|
|
117
|
+
indicatif v0.18.3
|
|
118
118
|
https://github.com/console-rs/indicatif
|
|
119
119
|
MIT
|
|
120
120
|
|
|
@@ -186,10 +186,6 @@ nom v7.1.3
|
|
|
186
186
|
https://github.com/Geal/nom
|
|
187
187
|
MIT
|
|
188
188
|
|
|
189
|
-
number_prefix v0.4.0
|
|
190
|
-
https://github.com/ogham/rust-number-prefix
|
|
191
|
-
MIT
|
|
192
|
-
|
|
193
189
|
once_cell v1.21.3
|
|
194
190
|
https://github.com/matklad/once_cell
|
|
195
191
|
MIT OR Apache-2.0
|
|
@@ -250,11 +246,11 @@ rayon-core v1.12.1
|
|
|
250
246
|
https://github.com/rayon-rs/rayon
|
|
251
247
|
MIT OR Apache-2.0
|
|
252
248
|
|
|
253
|
-
rb-sys v0.9.
|
|
249
|
+
rb-sys v0.9.124
|
|
254
250
|
https://github.com/oxidize-rb/rb-sys
|
|
255
251
|
MIT OR Apache-2.0
|
|
256
252
|
|
|
257
|
-
rb-sys-build v0.9.
|
|
253
|
+
rb-sys-build v0.9.124
|
|
258
254
|
https://github.com/oxidize-rb/rb-sys
|
|
259
255
|
MIT OR Apache-2.0
|
|
260
256
|
|
|
@@ -338,7 +334,7 @@ thiserror-impl v2.0.12
|
|
|
338
334
|
https://github.com/dtolnay/thiserror
|
|
339
335
|
MIT OR Apache-2.0
|
|
340
336
|
|
|
341
|
-
tokenizers v0.22.
|
|
337
|
+
tokenizers v0.22.2
|
|
342
338
|
https://github.com/huggingface/tokenizers
|
|
343
339
|
Apache-2.0
|
|
344
340
|
|
|
@@ -362,6 +358,10 @@ unicode_categories v0.1.1
|
|
|
362
358
|
https://github.com/swgillespie/unicode-categories
|
|
363
359
|
MIT OR Apache-2.0
|
|
364
360
|
|
|
361
|
+
unit-prefix v0.5.2
|
|
362
|
+
https://codeberg.org/commons-rs/unit-prefix
|
|
363
|
+
MIT
|
|
364
|
+
|
|
365
365
|
version_check v0.9.5
|
|
366
366
|
https://github.com/SergioBenitez/version_check
|
|
367
367
|
MIT/Apache-2.0
|
|
@@ -8031,32 +8031,6 @@ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
|
|
8031
8031
|
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
|
8032
8032
|
WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
|
8033
8033
|
|
|
8034
|
-
================================================================================
|
|
8035
|
-
number_prefix LICENCE
|
|
8036
|
-
================================================================================
|
|
8037
|
-
|
|
8038
|
-
MIT License
|
|
8039
|
-
|
|
8040
|
-
Copyright (c) 2018 Benjamin Sago
|
|
8041
|
-
|
|
8042
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
8043
|
-
of this software and associated documentation files (the "Software"), to deal
|
|
8044
|
-
in the Software without restriction, including without limitation the rights
|
|
8045
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
8046
|
-
copies of the Software, and to permit persons to whom the Software is
|
|
8047
|
-
furnished to do so, subject to the following conditions:
|
|
8048
|
-
|
|
8049
|
-
The above copyright notice and this permission notice shall be included in all
|
|
8050
|
-
copies or substantial portions of the Software.
|
|
8051
|
-
|
|
8052
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
8053
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
8054
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
8055
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
8056
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
8057
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
8058
|
-
SOFTWARE.
|
|
8059
|
-
|
|
8060
8034
|
================================================================================
|
|
8061
8035
|
once_cell LICENSE-APACHE
|
|
8062
8036
|
================================================================================
|
|
@@ -16984,6 +16958,32 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
|
16984
16958
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
16985
16959
|
SOFTWARE.
|
|
16986
16960
|
|
|
16961
|
+
================================================================================
|
|
16962
|
+
unit-prefix LICENSE
|
|
16963
|
+
================================================================================
|
|
16964
|
+
|
|
16965
|
+
MIT License
|
|
16966
|
+
|
|
16967
|
+
Copyright (c) 2024 Benjamin Sago, Fabio Valentini
|
|
16968
|
+
|
|
16969
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
16970
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
16971
|
+
in the Software without restriction, including without limitation the rights
|
|
16972
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
16973
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
16974
|
+
furnished to do so, subject to the following conditions:
|
|
16975
|
+
|
|
16976
|
+
The above copyright notice and this permission notice shall be included in all
|
|
16977
|
+
copies or substantial portions of the Software.
|
|
16978
|
+
|
|
16979
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
16980
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
16981
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
16982
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
16983
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
16984
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
16985
|
+
SOFTWARE.
|
|
16986
|
+
|
|
16987
16987
|
================================================================================
|
|
16988
16988
|
version_check LICENSE-APACHE
|
|
16989
16989
|
================================================================================
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
Binary file
|
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
module Tokenizers
|
|
2
2
|
module FromPretrained
|
|
3
3
|
# for user agent
|
|
4
|
-
TOKENIZERS_VERSION = "0.22.
|
|
4
|
+
TOKENIZERS_VERSION = "0.22.2"
|
|
5
5
|
|
|
6
6
|
# use Ruby for downloads
|
|
7
7
|
# this avoids the need to vendor OpenSSL on Linux
|
|
@@ -27,7 +27,8 @@ module Tokenizers
|
|
|
27
27
|
headers["Authorization"] = "Bearer #{auth_token}"
|
|
28
28
|
end
|
|
29
29
|
|
|
30
|
-
|
|
30
|
+
escaped_identifier = identifier.split("/", 2).map { |v| CGI.escape(v) }.join("/")
|
|
31
|
+
url = "https://huggingface.co/%s/resolve/%s/tokenizer.json" % [escaped_identifier, CGI.escape(revision)]
|
|
31
32
|
|
|
32
33
|
path =
|
|
33
34
|
begin
|
|
@@ -53,14 +54,7 @@ module Tokenizers
|
|
|
53
54
|
esum = Digest::SHA256.hexdigest(etag)
|
|
54
55
|
resource_path = File.join(cache_dir, "#{fsum}.#{esum}")
|
|
55
56
|
if File.exist?(resource_path)
|
|
56
|
-
|
|
57
|
-
req = Net::HTTP::Head.new(uri)
|
|
58
|
-
headers.each do |k, v|
|
|
59
|
-
req[k] = v
|
|
60
|
-
end
|
|
61
|
-
res = Net::HTTP.start(uri.hostname, uri.port, options.merge(use_ssl: true)) do |http|
|
|
62
|
-
http.request(req)
|
|
63
|
-
end
|
|
57
|
+
res = head_request(url, headers, options)
|
|
64
58
|
if res["etag"] == etag
|
|
65
59
|
return resource_path
|
|
66
60
|
end
|
|
@@ -93,6 +87,25 @@ module Tokenizers
|
|
|
93
87
|
resource_path
|
|
94
88
|
end
|
|
95
89
|
|
|
90
|
+
def head_request(url, headers, options, redirects = 0)
|
|
91
|
+
uri = URI(url)
|
|
92
|
+
req = Net::HTTP::Head.new(uri)
|
|
93
|
+
headers.each do |k, v|
|
|
94
|
+
req[k] = v
|
|
95
|
+
end
|
|
96
|
+
res = Net::HTTP.start(uri.hostname, uri.port, options.merge(use_ssl: true)) do |http|
|
|
97
|
+
http.request(req)
|
|
98
|
+
end
|
|
99
|
+
if res.is_a?(Net::HTTPRedirection) && redirects < 3
|
|
100
|
+
location = URI.parse(res["location"])
|
|
101
|
+
# follow relative redirects only
|
|
102
|
+
if location.relative?
|
|
103
|
+
return head_request(uri.merge(location), headers, options, redirects + 1)
|
|
104
|
+
end
|
|
105
|
+
end
|
|
106
|
+
res
|
|
107
|
+
end
|
|
108
|
+
|
|
96
109
|
def cache_dir
|
|
97
110
|
cache_dir =
|
|
98
111
|
if ENV["TOKENIZERS_CACHE"]
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tokenizers
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.6.
|
|
4
|
+
version: 0.6.3
|
|
5
5
|
platform: x86_64-darwin
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Kane
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2026-01-06 00:00:00.000000000 Z
|
|
12
12
|
dependencies: []
|
|
13
13
|
description:
|
|
14
14
|
email: andrew@ankane.org
|
|
@@ -26,6 +26,7 @@ files:
|
|
|
26
26
|
- lib/tokenizers/3.2/tokenizers.bundle
|
|
27
27
|
- lib/tokenizers/3.3/tokenizers.bundle
|
|
28
28
|
- lib/tokenizers/3.4/tokenizers.bundle
|
|
29
|
+
- lib/tokenizers/4.0/tokenizers.bundle
|
|
29
30
|
- lib/tokenizers/added_token.rb
|
|
30
31
|
- lib/tokenizers/char_bpe_tokenizer.rb
|
|
31
32
|
- lib/tokenizers/decoders/bpe_decoder.rb
|
|
@@ -71,7 +72,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
71
72
|
version: '3.2'
|
|
72
73
|
- - "<"
|
|
73
74
|
- !ruby/object:Gem::Version
|
|
74
|
-
version:
|
|
75
|
+
version: 4.1.dev
|
|
75
76
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
76
77
|
requirements:
|
|
77
78
|
- - ">="
|