tokenizers 0.5.1-x86_64-linux → 0.5.3-x86_64-linux

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e1ac97056f64913b16bd3d63a97b0dfa11daad03adabf7bc09fc1f1f59c3459d
4
- data.tar.gz: 022cb74e0108763c23f26715accff37148809f564813409db81b9c85a8273fd4
3
+ metadata.gz: c5a23e4520e57dd74f97517733d0916919493906a2fa82cfc037a76ff203113d
4
+ data.tar.gz: d2cbc6265ac9d74a5724178da71d961ec81a0872c371839e22f1a3c2b097d637
5
5
  SHA512:
6
- metadata.gz: f00e700813f6dffb830497943538d4d9d45fe4679b38a5132dd18564088fd644340b7c7f937a0ce9904f001d2914f3898cd63435ec3750c6b1b51198a5e0e207
7
- data.tar.gz: 0c4b13af17bc407d788d5b263a024ccbd92130d1174ac1aa78412d2458923c9c6e32bcc562a8f68188867631321b6a33399fb594211e7f55f3964386d8f95700
6
+ metadata.gz: de6f0db2c89c0f3476ac3a1f11d4c43387e8bdb9bd3b5790ed2be0c91ed9733ff9db8009d9f51c39d786f83e8e6bdde3af72e13d406ce6e644103fa260d5e97f
7
+ data.tar.gz: f9ebdc7f298ff9c8f39f63acbd861515b3c7a3aa7929973498be040aae27f9235ee3256f72b66f13a3bccfe658db9e10cb922bef7b77a531dd5e0d1babebd25f
data/CHANGELOG.md CHANGED
@@ -1,3 +1,15 @@
1
+ ## 0.5.3 (2024-09-17)
2
+
3
+ - Added `AddedToken` class
4
+ - Added precompiled gem for Windows
5
+
6
+ ## 0.5.2 (2024-08-26)
7
+
8
+ - Added `from_str` method to `Tokenizer`
9
+ - Added `model` and `model=` methods to `Tokenizer`
10
+ - Added `decoder`, `pre_tokenizer`, `post_processor`, and `normalizer` methods to `Tokenizer`
11
+ - Added `decode` method to `Decoder`
12
+
1
13
  ## 0.5.1 (2024-08-13)
2
14
 
3
15
  - Updated Tokenizers to 0.20.0
data/Cargo.lock CHANGED
@@ -57,9 +57,12 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b"
57
57
 
58
58
  [[package]]
59
59
  name = "cc"
60
- version = "1.1.8"
60
+ version = "1.1.15"
61
61
  source = "registry+https://github.com/rust-lang/crates.io-index"
62
- checksum = "504bdec147f2cc13c8b57ed9401fd8a147cc66b67ad5cb241394244f2c947549"
62
+ checksum = "57b6a275aa2903740dc87da01c62040406b8812552e97129a63ea8850a17c6e6"
63
+ dependencies = [
64
+ "shlex",
65
+ ]
63
66
 
64
67
  [[package]]
65
68
  name = "cexpr"
@@ -301,9 +304,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
301
304
 
302
305
  [[package]]
303
306
  name = "libc"
304
- version = "0.2.155"
307
+ version = "0.2.158"
305
308
  source = "registry+https://github.com/rust-lang/crates.io-index"
306
- checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c"
309
+ checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439"
307
310
 
308
311
  [[package]]
309
312
  name = "libloading"
@@ -475,9 +478,9 @@ dependencies = [
475
478
 
476
479
  [[package]]
477
480
  name = "quote"
478
- version = "1.0.36"
481
+ version = "1.0.37"
479
482
  source = "registry+https://github.com/rust-lang/crates.io-index"
480
- checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7"
483
+ checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af"
481
484
  dependencies = [
482
485
  "proc-macro2",
483
486
  ]
@@ -545,18 +548,18 @@ dependencies = [
545
548
 
546
549
  [[package]]
547
550
  name = "rb-sys"
548
- version = "0.9.100"
551
+ version = "0.9.102"
549
552
  source = "registry+https://github.com/rust-lang/crates.io-index"
550
- checksum = "87f2ba20be84b32fad6b0ce397764bcdd0f2dca4431cf7035f6a6721e5747565"
553
+ checksum = "df4dec4b1d304c3b308a2cd86b1216ea45dd4361f4e9fa056f108332d0a450c1"
551
554
  dependencies = [
552
555
  "rb-sys-build",
553
556
  ]
554
557
 
555
558
  [[package]]
556
559
  name = "rb-sys-build"
557
- version = "0.9.100"
560
+ version = "0.9.102"
558
561
  source = "registry+https://github.com/rust-lang/crates.io-index"
559
- checksum = "7ecae2bdcb118ee721d9a3929f89e8578237fade298dfcf8c928609aa88abc48"
562
+ checksum = "1d71de3e29d174b8fb17b5d4470f27d7aa2605f8a9d05fda0d3aeff30e05a570"
560
563
  dependencies = [
561
564
  "bindgen",
562
565
  "lazy_static",
@@ -622,18 +625,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
622
625
 
623
626
  [[package]]
624
627
  name = "serde"
625
- version = "1.0.205"
628
+ version = "1.0.209"
626
629
  source = "registry+https://github.com/rust-lang/crates.io-index"
627
- checksum = "e33aedb1a7135da52b7c21791455563facbbcc43d0f0f66165b42c21b3dfb150"
630
+ checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09"
628
631
  dependencies = [
629
632
  "serde_derive",
630
633
  ]
631
634
 
632
635
  [[package]]
633
636
  name = "serde_derive"
634
- version = "1.0.205"
637
+ version = "1.0.209"
635
638
  source = "registry+https://github.com/rust-lang/crates.io-index"
636
- checksum = "692d6f5ac90220161d6774db30c662202721e64aed9058d2c394f451261420c1"
639
+ checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170"
637
640
  dependencies = [
638
641
  "proc-macro2",
639
642
  "quote",
@@ -642,9 +645,9 @@ dependencies = [
642
645
 
643
646
  [[package]]
644
647
  name = "serde_json"
645
- version = "1.0.122"
648
+ version = "1.0.127"
646
649
  source = "registry+https://github.com/rust-lang/crates.io-index"
647
- checksum = "784b6203951c57ff748476b126ccb5e8e2959a5c19e5c617ab1956be3dbc68da"
650
+ checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad"
648
651
  dependencies = [
649
652
  "itoa",
650
653
  "memchr",
@@ -690,9 +693,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f"
690
693
 
691
694
  [[package]]
692
695
  name = "syn"
693
- version = "2.0.72"
696
+ version = "2.0.76"
694
697
  source = "registry+https://github.com/rust-lang/crates.io-index"
695
- checksum = "dc4b9b9bf2add8093d3f2c0204471e951b2285580335de42f9d2534f3ae7a8af"
698
+ checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525"
696
699
  dependencies = [
697
700
  "proc-macro2",
698
701
  "quote",
@@ -721,7 +724,7 @@ dependencies = [
721
724
 
722
725
  [[package]]
723
726
  name = "tokenizers"
724
- version = "0.5.1"
727
+ version = "0.5.3"
725
728
  dependencies = [
726
729
  "magnus",
727
730
  "onig",
@@ -26,7 +26,7 @@ byteorder v1.5.0
26
26
  https://github.com/BurntSushi/byteorder
27
27
  Unlicense OR MIT
28
28
 
29
- cc v1.1.8
29
+ cc v1.1.15
30
30
  https://github.com/rust-lang/cc-rs
31
31
  MIT OR Apache-2.0
32
32
 
@@ -130,7 +130,7 @@ lazycell v1.3.0
130
130
  https://github.com/indiv0/lazycell
131
131
  MIT/Apache-2.0
132
132
 
133
- libc v0.2.155
133
+ libc v0.2.158
134
134
  https://github.com/rust-lang/libc
135
135
  MIT OR Apache-2.0
136
136
 
@@ -214,7 +214,7 @@ proc-macro2 v1.0.86
214
214
  https://github.com/dtolnay/proc-macro2
215
215
  MIT OR Apache-2.0
216
216
 
217
- quote v1.0.36
217
+ quote v1.0.37
218
218
  https://github.com/dtolnay/quote
219
219
  MIT OR Apache-2.0
220
220
 
@@ -242,11 +242,11 @@ rayon-core v1.12.1
242
242
  https://github.com/rayon-rs/rayon
243
243
  MIT OR Apache-2.0
244
244
 
245
- rb-sys v0.9.100
245
+ rb-sys v0.9.102
246
246
  https://github.com/oxidize-rb/rb-sys
247
247
  MIT OR Apache-2.0
248
248
 
249
- rb-sys-build v0.9.100
249
+ rb-sys-build v0.9.102
250
250
  https://github.com/oxidize-rb/rb-sys
251
251
  MIT OR Apache-2.0
252
252
 
@@ -278,15 +278,15 @@ seq-macro v0.3.5
278
278
  https://github.com/dtolnay/seq-macro
279
279
  MIT OR Apache-2.0
280
280
 
281
- serde v1.0.205
281
+ serde v1.0.209
282
282
  https://serde.rs
283
283
  MIT OR Apache-2.0
284
284
 
285
- serde_derive v1.0.205
285
+ serde_derive v1.0.209
286
286
  https://serde.rs
287
287
  MIT OR Apache-2.0
288
288
 
289
- serde_json v1.0.122
289
+ serde_json v1.0.127
290
290
  https://github.com/serde-rs/json
291
291
  MIT OR Apache-2.0
292
292
 
@@ -310,7 +310,7 @@ strsim v0.11.1
310
310
  https://github.com/rapidfuzz/strsim-rs
311
311
  MIT
312
312
 
313
- syn v2.0.72
313
+ syn v2.0.76
314
314
  https://github.com/dtolnay/syn
315
315
  MIT OR Apache-2.0
316
316
 
Binary file
Binary file
Binary file
data/lib/tokenizers/added_token.rb ADDED
@@ -0,0 +1,7 @@
1
+ module Tokenizers
2
+ class AddedToken
3
+ def self.new(content, **kwargs)
4
+ _new(content, kwargs)
5
+ end
6
+ end
7
+ end
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.5.1"
2
+ VERSION = "0.5.3"
3
3
  end
data/lib/tokenizers.rb CHANGED
@@ -42,6 +42,7 @@ require_relative "tokenizers/trainers/word_level_trainer"
42
42
  require_relative "tokenizers/trainers/word_piece_trainer"
43
43
 
44
44
  # other
45
+ require_relative "tokenizers/added_token"
45
46
  require_relative "tokenizers/char_bpe_tokenizer"
46
47
  require_relative "tokenizers/encoding"
47
48
  require_relative "tokenizers/from_pretrained"
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.5.1
4
+ version: 0.5.3
5
5
  platform: x86_64-linux
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2024-08-13 00:00:00.000000000 Z
11
+ date: 2024-09-17 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org
@@ -26,6 +26,7 @@ files:
26
26
  - lib/tokenizers/3.1/tokenizers.so
27
27
  - lib/tokenizers/3.2/tokenizers.so
28
28
  - lib/tokenizers/3.3/tokenizers.so
29
+ - lib/tokenizers/added_token.rb
29
30
  - lib/tokenizers/char_bpe_tokenizer.rb
30
31
  - lib/tokenizers/decoders/bpe_decoder.rb
31
32
  - lib/tokenizers/decoders/ctc.rb