tokenizers 0.5.1-x86_64-linux → 0.5.3-x86_64-linux
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -0
- data/Cargo.lock +22 -19
- data/LICENSE-THIRD-PARTY.txt +9 -9
- data/lib/tokenizers/3.1/tokenizers.so +0 -0
- data/lib/tokenizers/3.2/tokenizers.so +0 -0
- data/lib/tokenizers/3.3/tokenizers.so +0 -0
- data/lib/tokenizers/added_token.rb +7 -0
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +1 -0
- metadata +3 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: c5a23e4520e57dd74f97517733d0916919493906a2fa82cfc037a76ff203113d
|
4
|
+
data.tar.gz: d2cbc6265ac9d74a5724178da71d961ec81a0872c371839e22f1a3c2b097d637
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: de6f0db2c89c0f3476ac3a1f11d4c43387e8bdb9bd3b5790ed2be0c91ed9733ff9db8009d9f51c39d786f83e8e6bdde3af72e13d406ce6e644103fa260d5e97f
|
7
|
+
data.tar.gz: f9ebdc7f298ff9c8f39f63acbd861515b3c7a3aa7929973498be040aae27f9235ee3256f72b66f13a3bccfe658db9e10cb922bef7b77a531dd5e0d1babebd25f
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,15 @@
|
|
1
|
+
## 0.5.3 (2024-09-17)
|
2
|
+
|
3
|
+
- Added `AddedToken` class
|
4
|
+
- Added precompiled gem for Windows
|
5
|
+
|
6
|
+
## 0.5.2 (2024-08-26)
|
7
|
+
|
8
|
+
- Added `from_str` method to `Tokenizer`
|
9
|
+
- Added `model` and `model=` methods to `Tokenizer`
|
10
|
+
- Added `decoder`, `pre_tokenizer`, `post_processor`, and `normalizer` methods to `Tokenizer`
|
11
|
+
- Added `decode` method to `Decoder`
|
12
|
+
|
1
13
|
## 0.5.1 (2024-08-13)
|
2
14
|
|
3
15
|
- Updated Tokenizers to 0.20.0
|
data/Cargo.lock
CHANGED
@@ -57,9 +57,12 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
|
|
57
57
|
|
58
58
|
[[package]]
|
59
59
|
name = "cc"
|
60
|
-
version = "1.1.
|
60
|
+
version = "1.1.15"
|
61
61
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
62
|
-
checksum = "
|
62
|
+
checksum = "57b6a275aa2903740dc87da01c62040406b8812552e97129a63ea8850a17c6e6"
|
63
|
+
dependencies = [
|
64
|
+
"shlex",
|
65
|
+
]
|
63
66
|
|
64
67
|
[[package]]
|
65
68
|
name = "cexpr"
|
@@ -301,9 +304,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
|
301
304
|
|
302
305
|
[[package]]
|
303
306
|
name = "libc"
|
304
|
-
version = "0.2.
|
307
|
+
version = "0.2.158"
|
305
308
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
306
|
-
checksum = "
|
309
|
+
checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439"
|
307
310
|
|
308
311
|
[[package]]
|
309
312
|
name = "libloading"
|
@@ -475,9 +478,9 @@ dependencies = [
|
|
475
478
|
|
476
479
|
[[package]]
|
477
480
|
name = "quote"
|
478
|
-
version = "1.0.
|
481
|
+
version = "1.0.37"
|
479
482
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
480
|
-
checksum = "
|
483
|
+
checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
|
481
484
|
dependencies = [
|
482
485
|
"proc-macro2",
|
483
486
|
]
|
@@ -545,18 +548,18 @@ dependencies = [
|
|
545
548
|
|
546
549
|
[[package]]
|
547
550
|
name = "rb-sys"
|
548
|
-
version = "0.9.
|
551
|
+
version = "0.9.102"
|
549
552
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
550
|
-
checksum = "
|
553
|
+
checksum = "df4dec4b1d304c3b308a2cd86b1216ea45dd4361f4e9fa056f108332d0a450c1"
|
551
554
|
dependencies = [
|
552
555
|
"rb-sys-build",
|
553
556
|
]
|
554
557
|
|
555
558
|
[[package]]
|
556
559
|
name = "rb-sys-build"
|
557
|
-
version = "0.9.
|
560
|
+
version = "0.9.102"
|
558
561
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
559
|
-
checksum = "
|
562
|
+
checksum = "1d71de3e29d174b8fb17b5d4470f27d7aa2605f8a9d05fda0d3aeff30e05a570"
|
560
563
|
dependencies = [
|
561
564
|
"bindgen",
|
562
565
|
"lazy_static",
|
@@ -622,18 +625,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
|
|
622
625
|
|
623
626
|
[[package]]
|
624
627
|
name = "serde"
|
625
|
-
version = "1.0.
|
628
|
+
version = "1.0.209"
|
626
629
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
627
|
-
checksum = "
|
630
|
+
checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09"
|
628
631
|
dependencies = [
|
629
632
|
"serde_derive",
|
630
633
|
]
|
631
634
|
|
632
635
|
[[package]]
|
633
636
|
name = "serde_derive"
|
634
|
-
version = "1.0.
|
637
|
+
version = "1.0.209"
|
635
638
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
636
|
-
checksum = "
|
639
|
+
checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170"
|
637
640
|
dependencies = [
|
638
641
|
"proc-macro2",
|
639
642
|
"quote",
|
@@ -642,9 +645,9 @@ dependencies = [
|
|
642
645
|
|
643
646
|
[[package]]
|
644
647
|
name = "serde_json"
|
645
|
-
version = "1.0.
|
648
|
+
version = "1.0.127"
|
646
649
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
647
|
-
checksum = "
|
650
|
+
checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad"
|
648
651
|
dependencies = [
|
649
652
|
"itoa",
|
650
653
|
"memchr",
|
@@ -690,9 +693,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
|
|
690
693
|
|
691
694
|
[[package]]
|
692
695
|
name = "syn"
|
693
|
-
version = "2.0.
|
696
|
+
version = "2.0.76"
|
694
697
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
695
|
-
checksum = "
|
698
|
+
checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525"
|
696
699
|
dependencies = [
|
697
700
|
"proc-macro2",
|
698
701
|
"quote",
|
@@ -721,7 +724,7 @@ dependencies = [
|
|
721
724
|
|
722
725
|
[[package]]
|
723
726
|
name = "tokenizers"
|
724
|
-
version = "0.5.1"
|
727
|
+
version = "0.5.3"
|
725
728
|
dependencies = [
|
726
729
|
"magnus",
|
727
730
|
"onig",
|
data/LICENSE-THIRD-PARTY.txt
CHANGED
@@ -26,7 +26,7 @@ byteorder v1.5.0
|
|
26
26
|
https://github.com/BurntSushi/byteorder
|
27
27
|
Unlicense OR MIT
|
28
28
|
|
29
|
-
cc v1.1.
|
29
|
+
cc v1.1.15
|
30
30
|
https://github.com/rust-lang/cc-rs
|
31
31
|
MIT OR Apache-2.0
|
32
32
|
|
@@ -130,7 +130,7 @@ lazycell v1.3.0
|
|
130
130
|
https://github.com/indiv0/lazycell
|
131
131
|
MIT/Apache-2.0
|
132
132
|
|
133
|
-
libc v0.2.
|
133
|
+
libc v0.2.158
|
134
134
|
https://github.com/rust-lang/libc
|
135
135
|
MIT OR Apache-2.0
|
136
136
|
|
@@ -214,7 +214,7 @@ proc-macro2 v1.0.86
|
|
214
214
|
https://github.com/dtolnay/proc-macro2
|
215
215
|
MIT OR Apache-2.0
|
216
216
|
|
217
|
-
quote v1.0.
|
217
|
+
quote v1.0.37
|
218
218
|
https://github.com/dtolnay/quote
|
219
219
|
MIT OR Apache-2.0
|
220
220
|
|
@@ -242,11 +242,11 @@ rayon-core v1.12.1
|
|
242
242
|
https://github.com/rayon-rs/rayon
|
243
243
|
MIT OR Apache-2.0
|
244
244
|
|
245
|
-
rb-sys v0.9.
|
245
|
+
rb-sys v0.9.102
|
246
246
|
https://github.com/oxidize-rb/rb-sys
|
247
247
|
MIT OR Apache-2.0
|
248
248
|
|
249
|
-
rb-sys-build v0.9.
|
249
|
+
rb-sys-build v0.9.102
|
250
250
|
https://github.com/oxidize-rb/rb-sys
|
251
251
|
MIT OR Apache-2.0
|
252
252
|
|
@@ -278,15 +278,15 @@ seq-macro v0.3.5
|
|
278
278
|
https://github.com/dtolnay/seq-macro
|
279
279
|
MIT OR Apache-2.0
|
280
280
|
|
281
|
-
serde v1.0.
|
281
|
+
serde v1.0.209
|
282
282
|
https://serde.rs
|
283
283
|
MIT OR Apache-2.0
|
284
284
|
|
285
|
-
serde_derive v1.0.
|
285
|
+
serde_derive v1.0.209
|
286
286
|
https://serde.rs
|
287
287
|
MIT OR Apache-2.0
|
288
288
|
|
289
|
-
serde_json v1.0.
|
289
|
+
serde_json v1.0.127
|
290
290
|
https://github.com/serde-rs/json
|
291
291
|
MIT OR Apache-2.0
|
292
292
|
|
@@ -310,7 +310,7 @@ strsim v0.11.1
|
|
310
310
|
https://github.com/rapidfuzz/strsim-rs
|
311
311
|
MIT
|
312
312
|
|
313
|
-
syn v2.0.
|
313
|
+
syn v2.0.76
|
314
314
|
https://github.com/dtolnay/syn
|
315
315
|
MIT OR Apache-2.0
|
316
316
|
|
Binary file
|
Binary file
|
Binary file
|
data/lib/tokenizers/version.rb
CHANGED
data/lib/tokenizers.rb
CHANGED
@@ -42,6 +42,7 @@ require_relative "tokenizers/trainers/word_level_trainer"
|
|
42
42
|
require_relative "tokenizers/trainers/word_piece_trainer"
|
43
43
|
|
44
44
|
# other
|
45
|
+
require_relative "tokenizers/added_token"
|
45
46
|
require_relative "tokenizers/char_bpe_tokenizer"
|
46
47
|
require_relative "tokenizers/encoding"
|
47
48
|
require_relative "tokenizers/from_pretrained"
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.5.1
|
4
|
+
version: 0.5.3
|
5
5
|
platform: x86_64-linux
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2024-08-13 00:00:00.000000000 Z
|
11
|
+
date: 2024-09-17 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email: andrew@ankane.org
|
@@ -26,6 +26,7 @@ files:
|
|
26
26
|
- lib/tokenizers/3.1/tokenizers.so
|
27
27
|
- lib/tokenizers/3.2/tokenizers.so
|
28
28
|
- lib/tokenizers/3.3/tokenizers.so
|
29
|
+
- lib/tokenizers/added_token.rb
|
29
30
|
- lib/tokenizers/char_bpe_tokenizer.rb
|
30
31
|
- lib/tokenizers/decoders/bpe_decoder.rb
|
31
32
|
- lib/tokenizers/decoders/ctc.rb
|