tokenizers 0.3.1 → 0.3.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -5
- data/Cargo.lock +129 -89
- data/ext/tokenizers/Cargo.toml +5 -6
- data/ext/tokenizers/src/decoders.rs +100 -1
- data/ext/tokenizers/src/models.rs +15 -0
- data/ext/tokenizers/src/normalizers.rs +28 -2
- data/lib/tokenizers/decoders/strip.rb +9 -0
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/normalizers/prepend.rb +9 -0
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +2 -0
- metadata +5 -3
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: e6e88ec5618e36e317434410c960603695806bb59dadb2252f2957d8dbf0525b
|
|
4
|
+
data.tar.gz: 33a04a4a5faada27e6e7246c16d836a4ff9f6793e89de3cfd4880e30c6c8ed0d
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: 88e4f2ad57fd1d66cd5fcf0d8b7ff6b1ea902258296fb02d207a446032134189e3445a104658074e94f914331c94f46cfdd09eed7c745c0483cb3b32b09e6abf
|
|
7
|
+
data.tar.gz: e8a1721ecbd36874322477077331743b0d1ba2de6f90076e07ad5456c230f76625d7f28ed6e6026c11395c6bb27701a6b8c0feedf2050387d32d9b777baa51fe
|
data/CHANGELOG.md
CHANGED
|
@@ -1,8 +1,18 @@
|
|
|
1
|
-
## 0.3.
|
|
1
|
+
## 0.3.3 (2023-04-09)
|
|
2
|
+
|
|
3
|
+
- Updated Tokenizers to 0.13.3
|
|
4
|
+
- Added `ByteFallback`, `Fuse`, `Replace`, and `Strip` decoders
|
|
5
|
+
- Added `Prepend` normalizer
|
|
6
|
+
|
|
7
|
+
## 0.3.2 (2023-03-06)
|
|
8
|
+
|
|
9
|
+
- Added precompiled gem for Linux x86-64 MUSL
|
|
10
|
+
|
|
11
|
+
## 0.3.1 (2023-02-08)
|
|
2
12
|
|
|
3
13
|
- Fixed error with Ruby 2.7
|
|
4
14
|
|
|
5
|
-
## 0.3.0 (
|
|
15
|
+
## 0.3.0 (2023-02-07)
|
|
6
16
|
|
|
7
17
|
- Added support for training tokenizers
|
|
8
18
|
- Added more methods to `Tokenizer`
|
|
@@ -11,20 +21,20 @@
|
|
|
11
21
|
- Changed `encode` method to include special tokens by default
|
|
12
22
|
- Changed how offsets are calculated for strings with multibyte characters
|
|
13
23
|
|
|
14
|
-
## 0.2.3 (
|
|
24
|
+
## 0.2.3 (2023-01-22)
|
|
15
25
|
|
|
16
26
|
- Added `add_special_tokens` option to `encode` method
|
|
17
27
|
- Added warning about `encode` method including special tokens by default in 0.3.0
|
|
18
28
|
- Added more methods to `Encoding`
|
|
19
29
|
- Fixed error with precompiled gem on Mac ARM
|
|
20
30
|
|
|
21
|
-
## 0.2.2 (
|
|
31
|
+
## 0.2.2 (2023-01-15)
|
|
22
32
|
|
|
23
33
|
- Added precompiled gem for Linux ARM
|
|
24
34
|
- Added `from_file` method
|
|
25
35
|
- Fixed error with precompiled gem on Linux x86-64
|
|
26
36
|
|
|
27
|
-
## 0.2.1 (
|
|
37
|
+
## 0.2.1 (2023-01-12)
|
|
28
38
|
|
|
29
39
|
- Added support for Ruby 3.2
|
|
30
40
|
|
data/Cargo.lock
CHANGED
|
@@ -71,9 +71,9 @@ checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd"
|
|
|
71
71
|
|
|
72
72
|
[[package]]
|
|
73
73
|
name = "clang-sys"
|
|
74
|
-
version = "1.
|
|
74
|
+
version = "1.6.1"
|
|
75
75
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
76
|
-
checksum = "
|
|
76
|
+
checksum = "c688fc74432808e3eb684cae8830a86be1d66a2bd58e1f248ed0960a590baf6f"
|
|
77
77
|
dependencies = [
|
|
78
78
|
"glob",
|
|
79
79
|
"libc",
|
|
@@ -95,9 +95,9 @@ dependencies = [
|
|
|
95
95
|
|
|
96
96
|
[[package]]
|
|
97
97
|
name = "crossbeam-channel"
|
|
98
|
-
version = "0.5.
|
|
98
|
+
version = "0.5.8"
|
|
99
99
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
100
|
-
checksum = "
|
|
100
|
+
checksum = "a33c2bf77f2df06183c3aa30d1e96c0695a313d4f9c453cc3762a6db39f99200"
|
|
101
101
|
dependencies = [
|
|
102
102
|
"cfg-if",
|
|
103
103
|
"crossbeam-utils",
|
|
@@ -105,9 +105,9 @@ dependencies = [
|
|
|
105
105
|
|
|
106
106
|
[[package]]
|
|
107
107
|
name = "crossbeam-deque"
|
|
108
|
-
version = "0.8.
|
|
108
|
+
version = "0.8.3"
|
|
109
109
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
110
|
-
checksum = "
|
|
110
|
+
checksum = "ce6fd6f855243022dcecf8702fef0c297d4338e226845fe067f6341ad9fa0cef"
|
|
111
111
|
dependencies = [
|
|
112
112
|
"cfg-if",
|
|
113
113
|
"crossbeam-epoch",
|
|
@@ -116,9 +116,9 @@ dependencies = [
|
|
|
116
116
|
|
|
117
117
|
[[package]]
|
|
118
118
|
name = "crossbeam-epoch"
|
|
119
|
-
version = "0.9.
|
|
119
|
+
version = "0.9.14"
|
|
120
120
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
121
|
-
checksum = "
|
|
121
|
+
checksum = "46bd5f3f85273295a9d14aedfb86f6aadbff6d8f5295c4a9edb08e819dcf5695"
|
|
122
122
|
dependencies = [
|
|
123
123
|
"autocfg",
|
|
124
124
|
"cfg-if",
|
|
@@ -129,18 +129,18 @@ dependencies = [
|
|
|
129
129
|
|
|
130
130
|
[[package]]
|
|
131
131
|
name = "crossbeam-utils"
|
|
132
|
-
version = "0.8.
|
|
132
|
+
version = "0.8.15"
|
|
133
133
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
134
|
-
checksum = "
|
|
134
|
+
checksum = "3c063cd8cc95f5c377ed0d4b49a4b21f632396ff690e8470c29b3359b346984b"
|
|
135
135
|
dependencies = [
|
|
136
136
|
"cfg-if",
|
|
137
137
|
]
|
|
138
138
|
|
|
139
139
|
[[package]]
|
|
140
140
|
name = "darling"
|
|
141
|
-
version = "0.14.
|
|
141
|
+
version = "0.14.4"
|
|
142
142
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
143
|
-
checksum = "
|
|
143
|
+
checksum = "7b750cb3417fd1b327431a470f388520309479ab0bf5e323505daf0290cd3850"
|
|
144
144
|
dependencies = [
|
|
145
145
|
"darling_core",
|
|
146
146
|
"darling_macro",
|
|
@@ -148,27 +148,27 @@ dependencies = [
|
|
|
148
148
|
|
|
149
149
|
[[package]]
|
|
150
150
|
name = "darling_core"
|
|
151
|
-
version = "0.14.
|
|
151
|
+
version = "0.14.4"
|
|
152
152
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
153
|
-
checksum = "
|
|
153
|
+
checksum = "109c1ca6e6b7f82cc233a97004ea8ed7ca123a9af07a8230878fcfda9b158bf0"
|
|
154
154
|
dependencies = [
|
|
155
155
|
"fnv",
|
|
156
156
|
"ident_case",
|
|
157
157
|
"proc-macro2",
|
|
158
158
|
"quote",
|
|
159
159
|
"strsim",
|
|
160
|
-
"syn",
|
|
160
|
+
"syn 1.0.109",
|
|
161
161
|
]
|
|
162
162
|
|
|
163
163
|
[[package]]
|
|
164
164
|
name = "darling_macro"
|
|
165
|
-
version = "0.14.
|
|
165
|
+
version = "0.14.4"
|
|
166
166
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
167
|
-
checksum = "
|
|
167
|
+
checksum = "a4aab4dbc9f7611d8b55048a3a16d2d010c2c8334e46304b40ac1cc14bf3b48e"
|
|
168
168
|
dependencies = [
|
|
169
169
|
"darling_core",
|
|
170
170
|
"quote",
|
|
171
|
-
"syn",
|
|
171
|
+
"syn 1.0.109",
|
|
172
172
|
]
|
|
173
173
|
|
|
174
174
|
[[package]]
|
|
@@ -189,7 +189,7 @@ dependencies = [
|
|
|
189
189
|
"darling",
|
|
190
190
|
"proc-macro2",
|
|
191
191
|
"quote",
|
|
192
|
-
"syn",
|
|
192
|
+
"syn 1.0.109",
|
|
193
193
|
]
|
|
194
194
|
|
|
195
195
|
[[package]]
|
|
@@ -199,7 +199,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
|
199
199
|
checksum = "ebcda35c7a396850a55ffeac740804b40ffec779b98fffbb1738f4033f0ee79e"
|
|
200
200
|
dependencies = [
|
|
201
201
|
"derive_builder_core",
|
|
202
|
-
"syn",
|
|
202
|
+
"syn 1.0.109",
|
|
203
203
|
]
|
|
204
204
|
|
|
205
205
|
[[package]]
|
|
@@ -231,9 +231,9 @@ checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1"
|
|
|
231
231
|
|
|
232
232
|
[[package]]
|
|
233
233
|
name = "getrandom"
|
|
234
|
-
version = "0.2.
|
|
234
|
+
version = "0.2.9"
|
|
235
235
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
236
|
-
checksum = "
|
|
236
|
+
checksum = "c85e1d9ab2eadba7e5040d4e09cbd6d072b76a557ad64e797c2cb9d4da21d7e4"
|
|
237
237
|
dependencies = [
|
|
238
238
|
"cfg-if",
|
|
239
239
|
"libc",
|
|
@@ -293,9 +293,9 @@ dependencies = [
|
|
|
293
293
|
|
|
294
294
|
[[package]]
|
|
295
295
|
name = "itoa"
|
|
296
|
-
version = "1.0.
|
|
296
|
+
version = "1.0.6"
|
|
297
297
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
298
|
-
checksum = "
|
|
298
|
+
checksum = "453ad9f582a441959e5f0d088b02ce04cfe8d51a8eaf077f12ac6d3e94164ca6"
|
|
299
299
|
|
|
300
300
|
[[package]]
|
|
301
301
|
name = "lazy_static"
|
|
@@ -311,9 +311,9 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55"
|
|
|
311
311
|
|
|
312
312
|
[[package]]
|
|
313
313
|
name = "libc"
|
|
314
|
-
version = "0.2.
|
|
314
|
+
version = "0.2.141"
|
|
315
315
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
316
|
-
checksum = "
|
|
316
|
+
checksum = "3304a64d199bb964be99741b7a14d26972741915b3649639149b2479bb46f4b5"
|
|
317
317
|
|
|
318
318
|
[[package]]
|
|
319
319
|
name = "libloading"
|
|
@@ -352,8 +352,9 @@ checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d"
|
|
|
352
352
|
|
|
353
353
|
[[package]]
|
|
354
354
|
name = "magnus"
|
|
355
|
-
version = "0.5.
|
|
356
|
-
source = "
|
|
355
|
+
version = "0.5.3"
|
|
356
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
357
|
+
checksum = "c8dc14463c2552e753ef562961f486ca76f17a857c121db40e9f3ade3f35ab81"
|
|
357
358
|
dependencies = [
|
|
358
359
|
"magnus-macros",
|
|
359
360
|
"rb-sys",
|
|
@@ -362,12 +363,13 @@ dependencies = [
|
|
|
362
363
|
|
|
363
364
|
[[package]]
|
|
364
365
|
name = "magnus-macros"
|
|
365
|
-
version = "0.
|
|
366
|
-
source = "
|
|
366
|
+
version = "0.4.1"
|
|
367
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
368
|
+
checksum = "6cc17af1d45442c011aa579d727ec6cff8a69aea8a6bbad26736e7112d749bfb"
|
|
367
369
|
dependencies = [
|
|
368
370
|
"proc-macro2",
|
|
369
371
|
"quote",
|
|
370
|
-
"syn",
|
|
372
|
+
"syn 1.0.109",
|
|
371
373
|
]
|
|
372
374
|
|
|
373
375
|
[[package]]
|
|
@@ -378,9 +380,9 @@ checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
|
|
|
378
380
|
|
|
379
381
|
[[package]]
|
|
380
382
|
name = "memoffset"
|
|
381
|
-
version = "0.
|
|
383
|
+
version = "0.8.0"
|
|
382
384
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
383
|
-
checksum = "
|
|
385
|
+
checksum = "d61c719bcfbcf5d62b3a09efa6088de8c54bc0bfcd3ea7ae39fcc186108b8de1"
|
|
384
386
|
dependencies = [
|
|
385
387
|
"autocfg",
|
|
386
388
|
]
|
|
@@ -391,6 +393,27 @@ version = "0.2.1"
|
|
|
391
393
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
392
394
|
checksum = "68354c5c6bd36d73ff3feceb05efa59b6acb7626617f4962be322a825e61f79a"
|
|
393
395
|
|
|
396
|
+
[[package]]
|
|
397
|
+
name = "monostate"
|
|
398
|
+
version = "0.1.6"
|
|
399
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
400
|
+
checksum = "0230b703f1ac35df1e24f6d0d2255472bcccaf657ecdfa4f1fcbcad1ad5bb98a"
|
|
401
|
+
dependencies = [
|
|
402
|
+
"monostate-impl",
|
|
403
|
+
"serde",
|
|
404
|
+
]
|
|
405
|
+
|
|
406
|
+
[[package]]
|
|
407
|
+
name = "monostate-impl"
|
|
408
|
+
version = "0.1.6"
|
|
409
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
410
|
+
checksum = "8795add3e14028f11f8e848bd3294898a8294767b3776b6f733560d33bd2530b"
|
|
411
|
+
dependencies = [
|
|
412
|
+
"proc-macro2",
|
|
413
|
+
"quote",
|
|
414
|
+
"syn 2.0.13",
|
|
415
|
+
]
|
|
416
|
+
|
|
394
417
|
[[package]]
|
|
395
418
|
name = "nom"
|
|
396
419
|
version = "7.1.3"
|
|
@@ -419,9 +442,9 @@ checksum = "17b02fc0ff9a9e4b35b3342880f48e896ebf69f2967921fe8646bf5b7125956a"
|
|
|
419
442
|
|
|
420
443
|
[[package]]
|
|
421
444
|
name = "once_cell"
|
|
422
|
-
version = "1.17.
|
|
445
|
+
version = "1.17.1"
|
|
423
446
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
424
|
-
checksum = "
|
|
447
|
+
checksum = "b7e5500299e16ebb147ae15a00a942af264cf3688f47923b8fc2cd5858f23ad3"
|
|
425
448
|
|
|
426
449
|
[[package]]
|
|
427
450
|
name = "onig"
|
|
@@ -447,9 +470,9 @@ dependencies = [
|
|
|
447
470
|
|
|
448
471
|
[[package]]
|
|
449
472
|
name = "paste"
|
|
450
|
-
version = "1.0.
|
|
473
|
+
version = "1.0.12"
|
|
451
474
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
452
|
-
checksum = "
|
|
475
|
+
checksum = "9f746c4065a8fa3fe23974dd82f15431cc8d40779821001404d10d2e79ca7d79"
|
|
453
476
|
|
|
454
477
|
[[package]]
|
|
455
478
|
name = "peeking_take_while"
|
|
@@ -471,18 +494,18 @@ checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de"
|
|
|
471
494
|
|
|
472
495
|
[[package]]
|
|
473
496
|
name = "proc-macro2"
|
|
474
|
-
version = "1.0.
|
|
497
|
+
version = "1.0.56"
|
|
475
498
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
476
|
-
checksum = "
|
|
499
|
+
checksum = "2b63bdb0cd06f1f4dedf69b254734f9b45af66e4a031e42a7480257d9898b435"
|
|
477
500
|
dependencies = [
|
|
478
501
|
"unicode-ident",
|
|
479
502
|
]
|
|
480
503
|
|
|
481
504
|
[[package]]
|
|
482
505
|
name = "quote"
|
|
483
|
-
version = "1.0.
|
|
506
|
+
version = "1.0.26"
|
|
484
507
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
485
|
-
checksum = "
|
|
508
|
+
checksum = "4424af4bf778aae2051a77b60283332f386554255d722233d09fbfc7e30da2fc"
|
|
486
509
|
dependencies = [
|
|
487
510
|
"proc-macro2",
|
|
488
511
|
]
|
|
@@ -519,9 +542,9 @@ dependencies = [
|
|
|
519
542
|
|
|
520
543
|
[[package]]
|
|
521
544
|
name = "rayon"
|
|
522
|
-
version = "1.
|
|
545
|
+
version = "1.7.0"
|
|
523
546
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
524
|
-
checksum = "
|
|
547
|
+
checksum = "1d2df5196e37bcc87abebc0053e20787d73847bb33134a69841207dd0a47f03b"
|
|
525
548
|
dependencies = [
|
|
526
549
|
"either",
|
|
527
550
|
"rayon-core",
|
|
@@ -540,9 +563,9 @@ dependencies = [
|
|
|
540
563
|
|
|
541
564
|
[[package]]
|
|
542
565
|
name = "rayon-core"
|
|
543
|
-
version = "1.
|
|
566
|
+
version = "1.11.0"
|
|
544
567
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
545
|
-
checksum = "
|
|
568
|
+
checksum = "4b8f95bd6966f5c87776639160a66bd8ab9895d9d4ab01ddba9fc60661aebe8d"
|
|
546
569
|
dependencies = [
|
|
547
570
|
"crossbeam-channel",
|
|
548
571
|
"crossbeam-deque",
|
|
@@ -552,22 +575,26 @@ dependencies = [
|
|
|
552
575
|
|
|
553
576
|
[[package]]
|
|
554
577
|
name = "rb-sys"
|
|
555
|
-
version = "0.9.
|
|
578
|
+
version = "0.9.71"
|
|
556
579
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
557
|
-
checksum = "
|
|
580
|
+
checksum = "156bfedced1e236600bcaad538477097ff2ed5c6b474e411d15b791e1d24c0f1"
|
|
558
581
|
dependencies = [
|
|
559
582
|
"rb-sys-build",
|
|
560
583
|
]
|
|
561
584
|
|
|
562
585
|
[[package]]
|
|
563
586
|
name = "rb-sys-build"
|
|
564
|
-
version = "0.9.
|
|
587
|
+
version = "0.9.71"
|
|
565
588
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
566
|
-
checksum = "
|
|
589
|
+
checksum = "5cb2e4a32cbc290b543a74567072ad24b708aff7bb5dde5a68d5690379cd7938"
|
|
567
590
|
dependencies = [
|
|
568
591
|
"bindgen",
|
|
592
|
+
"lazy_static",
|
|
593
|
+
"proc-macro2",
|
|
594
|
+
"quote",
|
|
569
595
|
"regex",
|
|
570
596
|
"shell-words",
|
|
597
|
+
"syn 1.0.109",
|
|
571
598
|
]
|
|
572
599
|
|
|
573
600
|
[[package]]
|
|
@@ -578,9 +605,9 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
|
|
|
578
605
|
|
|
579
606
|
[[package]]
|
|
580
607
|
name = "regex"
|
|
581
|
-
version = "1.7.
|
|
608
|
+
version = "1.7.3"
|
|
582
609
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
583
|
-
checksum = "
|
|
610
|
+
checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d"
|
|
584
611
|
dependencies = [
|
|
585
612
|
"aho-corasick",
|
|
586
613
|
"memchr",
|
|
@@ -589,9 +616,9 @@ dependencies = [
|
|
|
589
616
|
|
|
590
617
|
[[package]]
|
|
591
618
|
name = "regex-syntax"
|
|
592
|
-
version = "0.6.
|
|
619
|
+
version = "0.6.29"
|
|
593
620
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
594
|
-
checksum = "
|
|
621
|
+
checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
|
|
595
622
|
|
|
596
623
|
[[package]]
|
|
597
624
|
name = "rustc-hash"
|
|
@@ -601,9 +628,9 @@ checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2"
|
|
|
601
628
|
|
|
602
629
|
[[package]]
|
|
603
630
|
name = "ryu"
|
|
604
|
-
version = "1.0.
|
|
631
|
+
version = "1.0.13"
|
|
605
632
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
606
|
-
checksum = "
|
|
633
|
+
checksum = "f91339c0467de62360649f8d3e185ca8de4224ff281f66000de5eb2a77a79041"
|
|
607
634
|
|
|
608
635
|
[[package]]
|
|
609
636
|
name = "scopeguard"
|
|
@@ -613,29 +640,29 @@ checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
|
|
|
613
640
|
|
|
614
641
|
[[package]]
|
|
615
642
|
name = "serde"
|
|
616
|
-
version = "1.0.
|
|
643
|
+
version = "1.0.159"
|
|
617
644
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
618
|
-
checksum = "
|
|
645
|
+
checksum = "3c04e8343c3daeec41f58990b9d77068df31209f2af111e059e9fe9646693065"
|
|
619
646
|
dependencies = [
|
|
620
647
|
"serde_derive",
|
|
621
648
|
]
|
|
622
649
|
|
|
623
650
|
[[package]]
|
|
624
651
|
name = "serde_derive"
|
|
625
|
-
version = "1.0.
|
|
652
|
+
version = "1.0.159"
|
|
626
653
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
627
|
-
checksum = "
|
|
654
|
+
checksum = "4c614d17805b093df4b147b51339e7e44bf05ef59fba1e45d83500bcfb4d8585"
|
|
628
655
|
dependencies = [
|
|
629
656
|
"proc-macro2",
|
|
630
657
|
"quote",
|
|
631
|
-
"syn",
|
|
658
|
+
"syn 2.0.13",
|
|
632
659
|
]
|
|
633
660
|
|
|
634
661
|
[[package]]
|
|
635
662
|
name = "serde_json"
|
|
636
|
-
version = "1.0.
|
|
663
|
+
version = "1.0.95"
|
|
637
664
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
638
|
-
checksum = "
|
|
665
|
+
checksum = "d721eca97ac802aa7777b701877c8004d950fc142651367300d21c1cc0194744"
|
|
639
666
|
dependencies = [
|
|
640
667
|
"itoa",
|
|
641
668
|
"ryu",
|
|
@@ -680,9 +707,20 @@ checksum = "73473c0e59e6d5812c5dfe2a064a6444949f089e20eec9a2e5506596494e4623"
|
|
|
680
707
|
|
|
681
708
|
[[package]]
|
|
682
709
|
name = "syn"
|
|
683
|
-
version = "1.0.
|
|
710
|
+
version = "1.0.109"
|
|
684
711
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
685
|
-
checksum = "
|
|
712
|
+
checksum = "72b64191b275b66ffe2469e8af2c1cfe3bafa67b529ead792a6d0160888b4237"
|
|
713
|
+
dependencies = [
|
|
714
|
+
"proc-macro2",
|
|
715
|
+
"quote",
|
|
716
|
+
"unicode-ident",
|
|
717
|
+
]
|
|
718
|
+
|
|
719
|
+
[[package]]
|
|
720
|
+
name = "syn"
|
|
721
|
+
version = "2.0.13"
|
|
722
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
723
|
+
checksum = "4c9da457c5285ac1f936ebd076af6dac17a61cfe7826f2076b4d015cf47bc8ec"
|
|
686
724
|
dependencies = [
|
|
687
725
|
"proc-macro2",
|
|
688
726
|
"quote",
|
|
@@ -691,38 +729,39 @@ dependencies = [
|
|
|
691
729
|
|
|
692
730
|
[[package]]
|
|
693
731
|
name = "thiserror"
|
|
694
|
-
version = "1.0.
|
|
732
|
+
version = "1.0.40"
|
|
695
733
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
696
|
-
checksum = "
|
|
734
|
+
checksum = "978c9a314bd8dc99be594bc3c175faaa9794be04a5a5e153caba6915336cebac"
|
|
697
735
|
dependencies = [
|
|
698
736
|
"thiserror-impl",
|
|
699
737
|
]
|
|
700
738
|
|
|
701
739
|
[[package]]
|
|
702
740
|
name = "thiserror-impl"
|
|
703
|
-
version = "1.0.
|
|
741
|
+
version = "1.0.40"
|
|
704
742
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
705
|
-
checksum = "
|
|
743
|
+
checksum = "f9456a42c5b0d803c8cd86e73dd7cc9edd429499f37a3550d286d5e86720569f"
|
|
706
744
|
dependencies = [
|
|
707
745
|
"proc-macro2",
|
|
708
746
|
"quote",
|
|
709
|
-
"syn",
|
|
747
|
+
"syn 2.0.13",
|
|
710
748
|
]
|
|
711
749
|
|
|
712
750
|
[[package]]
|
|
713
751
|
name = "tokenizers"
|
|
714
|
-
version = "0.3.
|
|
752
|
+
version = "0.3.3"
|
|
715
753
|
dependencies = [
|
|
716
754
|
"magnus",
|
|
717
755
|
"onig",
|
|
718
756
|
"serde",
|
|
719
|
-
"tokenizers 0.13.
|
|
757
|
+
"tokenizers 0.13.3",
|
|
720
758
|
]
|
|
721
759
|
|
|
722
760
|
[[package]]
|
|
723
761
|
name = "tokenizers"
|
|
724
|
-
version = "0.13.
|
|
725
|
-
source = "
|
|
762
|
+
version = "0.13.3"
|
|
763
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
764
|
+
checksum = "5cf49017523bf0bc01c9966f172c5f120bbb7b96cccd1708772dd42e767fb9f5"
|
|
726
765
|
dependencies = [
|
|
727
766
|
"aho-corasick",
|
|
728
767
|
"derive_builder",
|
|
@@ -733,6 +772,7 @@ dependencies = [
|
|
|
733
772
|
"lazy_static",
|
|
734
773
|
"log",
|
|
735
774
|
"macro_rules_attribute",
|
|
775
|
+
"monostate",
|
|
736
776
|
"onig",
|
|
737
777
|
"paste",
|
|
738
778
|
"rand",
|
|
@@ -751,9 +791,9 @@ dependencies = [
|
|
|
751
791
|
|
|
752
792
|
[[package]]
|
|
753
793
|
name = "unicode-ident"
|
|
754
|
-
version = "1.0.
|
|
794
|
+
version = "1.0.8"
|
|
755
795
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
756
|
-
checksum = "
|
|
796
|
+
checksum = "e5464a87b239f13a63a501f2701565754bae92d243d4bb7eb12f6d57d2269bf4"
|
|
757
797
|
|
|
758
798
|
[[package]]
|
|
759
799
|
name = "unicode-normalization-alignments"
|
|
@@ -827,42 +867,42 @@ dependencies = [
|
|
|
827
867
|
|
|
828
868
|
[[package]]
|
|
829
869
|
name = "windows_aarch64_gnullvm"
|
|
830
|
-
version = "0.42.
|
|
870
|
+
version = "0.42.2"
|
|
831
871
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
832
|
-
checksum = "
|
|
872
|
+
checksum = "597a5118570b68bc08d8d59125332c54f1ba9d9adeedeef5b99b02ba2b0698f8"
|
|
833
873
|
|
|
834
874
|
[[package]]
|
|
835
875
|
name = "windows_aarch64_msvc"
|
|
836
|
-
version = "0.42.
|
|
876
|
+
version = "0.42.2"
|
|
837
877
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
838
|
-
checksum = "
|
|
878
|
+
checksum = "e08e8864a60f06ef0d0ff4ba04124db8b0fb3be5776a5cd47641e942e58c4d43"
|
|
839
879
|
|
|
840
880
|
[[package]]
|
|
841
881
|
name = "windows_i686_gnu"
|
|
842
|
-
version = "0.42.
|
|
882
|
+
version = "0.42.2"
|
|
843
883
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
844
|
-
checksum = "
|
|
884
|
+
checksum = "c61d927d8da41da96a81f029489353e68739737d3beca43145c8afec9a31a84f"
|
|
845
885
|
|
|
846
886
|
[[package]]
|
|
847
887
|
name = "windows_i686_msvc"
|
|
848
|
-
version = "0.42.
|
|
888
|
+
version = "0.42.2"
|
|
849
889
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
850
|
-
checksum = "
|
|
890
|
+
checksum = "44d840b6ec649f480a41c8d80f9c65108b92d89345dd94027bfe06ac444d1060"
|
|
851
891
|
|
|
852
892
|
[[package]]
|
|
853
893
|
name = "windows_x86_64_gnu"
|
|
854
|
-
version = "0.42.
|
|
894
|
+
version = "0.42.2"
|
|
855
895
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
856
|
-
checksum = "
|
|
896
|
+
checksum = "8de912b8b8feb55c064867cf047dda097f92d51efad5b491dfb98f6bbb70cb36"
|
|
857
897
|
|
|
858
898
|
[[package]]
|
|
859
899
|
name = "windows_x86_64_gnullvm"
|
|
860
|
-
version = "0.42.
|
|
900
|
+
version = "0.42.2"
|
|
861
901
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
862
|
-
checksum = "
|
|
902
|
+
checksum = "26d41b46a36d453748aedef1486d5c7a85db22e56aff34643984ea85514e94a3"
|
|
863
903
|
|
|
864
904
|
[[package]]
|
|
865
905
|
name = "windows_x86_64_msvc"
|
|
866
|
-
version = "0.42.
|
|
906
|
+
version = "0.42.2"
|
|
867
907
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
|
868
|
-
checksum = "
|
|
908
|
+
checksum = "9aec5da331524158c6d1a4ac0ab1541149c0b9505fde06423b02f5ef0106b9f0"
|
data/ext/tokenizers/Cargo.toml
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
[package]
|
|
2
2
|
name = "tokenizers"
|
|
3
|
-
version = "0.3.
|
|
3
|
+
version = "0.3.3"
|
|
4
4
|
license = "Apache-2.0"
|
|
5
5
|
authors = ["Andrew Kane <andrew@ankane.org>"]
|
|
6
6
|
edition = "2021"
|
|
@@ -10,12 +10,11 @@ publish = false
|
|
|
10
10
|
crate-type = ["cdylib"]
|
|
11
11
|
|
|
12
12
|
[dependencies]
|
|
13
|
-
magnus =
|
|
14
|
-
onig = { version = "6
|
|
15
|
-
serde = { version = "1
|
|
13
|
+
magnus = "0.5"
|
|
14
|
+
onig = { version = "6", default-features = false }
|
|
15
|
+
serde = { version = "1", features = ["rc", "derive"] }
|
|
16
16
|
|
|
17
17
|
[dependencies.tokenizers]
|
|
18
|
-
version = "0.13.
|
|
19
|
-
git = "https://github.com/huggingface/tokenizers"
|
|
18
|
+
version = "=0.13.3" # also update in from_pretrained.rb
|
|
20
19
|
default-features = false
|
|
21
20
|
features = ["progressbar", "onig", "esaxx_fast"]
|
data/ext/tokenizers/src/decoders.rs
CHANGED
|
@@ -7,14 +7,19 @@ use magnus::{
|
|
|
7
7
|
};
|
|
8
8
|
use serde::{Deserialize, Serialize};
|
|
9
9
|
use tk::decoders::bpe::BPEDecoder;
|
|
10
|
+
use tk::decoders::byte_fallback::ByteFallback;
|
|
10
11
|
use tk::decoders::byte_level::ByteLevel;
|
|
11
12
|
use tk::decoders::ctc::CTC;
|
|
13
|
+
use tk::decoders::fuse::Fuse;
|
|
12
14
|
use tk::decoders::metaspace::Metaspace;
|
|
15
|
+
use tk::decoders::strip::Strip;
|
|
13
16
|
use tk::decoders::wordpiece::WordPiece;
|
|
14
17
|
use tk::decoders::DecoderWrapper;
|
|
15
18
|
use tk::Decoder;
|
|
19
|
+
use tk::normalizers::replace::Replace;
|
|
16
20
|
|
|
17
|
-
use super::
|
|
21
|
+
use super::utils::*;
|
|
22
|
+
use super::{RbError, RbResult};
|
|
18
23
|
|
|
19
24
|
#[derive(DataTypeFunctions, Clone, Deserialize, Serialize)]
|
|
20
25
|
pub struct RbDecoder {
|
|
@@ -89,6 +94,30 @@ impl RbDecoder {
|
|
|
89
94
|
setter!(self, CTC, word_delimiter_token, word_delimiter_token);
|
|
90
95
|
}
|
|
91
96
|
|
|
97
|
+
fn strip_content(&self) -> char {
|
|
98
|
+
getter!(self, Strip, content)
|
|
99
|
+
}
|
|
100
|
+
|
|
101
|
+
fn strip_set_content(&self, content: char) {
|
|
102
|
+
setter!(self, Strip, content, content)
|
|
103
|
+
}
|
|
104
|
+
|
|
105
|
+
fn strip_start(&self) -> usize {
|
|
106
|
+
getter!(self, Strip, start)
|
|
107
|
+
}
|
|
108
|
+
|
|
109
|
+
fn strip_set_start(&self, start: usize) {
|
|
110
|
+
setter!(self, Strip, start, start)
|
|
111
|
+
}
|
|
112
|
+
|
|
113
|
+
fn strip_stop(&self) -> usize {
|
|
114
|
+
getter!(self, Strip, stop)
|
|
115
|
+
}
|
|
116
|
+
|
|
117
|
+
fn strip_set_stop(&self, stop: usize) {
|
|
118
|
+
setter!(self, Strip, stop, stop)
|
|
119
|
+
}
|
|
120
|
+
|
|
92
121
|
pub fn metaspace_replacement(&self) -> char {
|
|
93
122
|
getter!(self, Metaspace, get_replacement().clone())
|
|
94
123
|
}
|
|
@@ -130,6 +159,14 @@ impl RbBPEDecoder {
|
|
|
130
159
|
}
|
|
131
160
|
}
|
|
132
161
|
|
|
162
|
+
pub struct RbByteFallbackDecoder {}
|
|
163
|
+
|
|
164
|
+
impl RbByteFallbackDecoder {
|
|
165
|
+
pub fn new() -> RbDecoder {
|
|
166
|
+
ByteFallback::default().into()
|
|
167
|
+
}
|
|
168
|
+
}
|
|
169
|
+
|
|
133
170
|
pub struct RbByteLevelDecoder {}
|
|
134
171
|
|
|
135
172
|
impl RbByteLevelDecoder {
|
|
@@ -146,6 +183,14 @@ impl RbCTC {
|
|
|
146
183
|
}
|
|
147
184
|
}
|
|
148
185
|
|
|
186
|
+
pub struct RbFuse {}
|
|
187
|
+
|
|
188
|
+
impl RbFuse {
|
|
189
|
+
pub fn new() -> RbDecoder {
|
|
190
|
+
Fuse::default().into()
|
|
191
|
+
}
|
|
192
|
+
}
|
|
193
|
+
|
|
149
194
|
pub struct RbMetaspaceDecoder {}
|
|
150
195
|
|
|
151
196
|
impl RbMetaspaceDecoder {
|
|
@@ -154,6 +199,22 @@ impl RbMetaspaceDecoder {
|
|
|
154
199
|
}
|
|
155
200
|
}
|
|
156
201
|
|
|
202
|
+
pub struct RbReplaceDecoder {}
|
|
203
|
+
|
|
204
|
+
impl RbReplaceDecoder {
|
|
205
|
+
pub fn new(pattern: RbPattern, content: String) -> RbResult<RbDecoder> {
|
|
206
|
+
Replace::new(pattern, content).map(|v| v.into()).map_err(RbError::from)
|
|
207
|
+
}
|
|
208
|
+
}
|
|
209
|
+
|
|
210
|
+
pub struct RbStripDecoder {}
|
|
211
|
+
|
|
212
|
+
impl RbStripDecoder {
|
|
213
|
+
pub fn new(content: char, start: usize, stop: usize) -> RbDecoder {
|
|
214
|
+
Strip::new(content, start, stop).into()
|
|
215
|
+
}
|
|
216
|
+
}
|
|
217
|
+
|
|
157
218
|
pub struct RbWordPieceDecoder {}
|
|
158
219
|
|
|
159
220
|
impl RbWordPieceDecoder {
|
|
@@ -219,6 +280,11 @@ unsafe impl TypedData for RbDecoder {
|
|
|
219
280
|
class.undef_alloc_func();
|
|
220
281
|
class
|
|
221
282
|
}),
|
|
283
|
+
DecoderWrapper::ByteFallback(_) => *memoize!(RClass: {
|
|
284
|
+
let class: RClass = crate::decoders().const_get("ByteFallback").unwrap();
|
|
285
|
+
class.undef_alloc_func();
|
|
286
|
+
class
|
|
287
|
+
}),
|
|
222
288
|
DecoderWrapper::ByteLevel(_) => *memoize!(RClass: {
|
|
223
289
|
let class: RClass = crate::decoders().const_get("ByteLevel").unwrap();
|
|
224
290
|
class.undef_alloc_func();
|
|
@@ -229,11 +295,26 @@ unsafe impl TypedData for RbDecoder {
|
|
|
229
295
|
class.undef_alloc_func();
|
|
230
296
|
class
|
|
231
297
|
}),
|
|
298
|
+
DecoderWrapper::Fuse(_) => *memoize!(RClass: {
|
|
299
|
+
let class: RClass = crate::decoders().const_get("Fuse").unwrap();
|
|
300
|
+
class.undef_alloc_func();
|
|
301
|
+
class
|
|
302
|
+
}),
|
|
232
303
|
DecoderWrapper::Metaspace(_) => *memoize!(RClass: {
|
|
233
304
|
let class: RClass = crate::decoders().const_get("Metaspace").unwrap();
|
|
234
305
|
class.undef_alloc_func();
|
|
235
306
|
class
|
|
236
307
|
}),
|
|
308
|
+
DecoderWrapper::Replace(_) => *memoize!(RClass: {
|
|
309
|
+
let class: RClass = crate::decoders().const_get("Replace").unwrap();
|
|
310
|
+
class.undef_alloc_func();
|
|
311
|
+
class
|
|
312
|
+
}),
|
|
313
|
+
DecoderWrapper::Strip(_) => *memoize!(RClass: {
|
|
314
|
+
let class: RClass = crate::decoders().const_get("Strip").unwrap();
|
|
315
|
+
class.undef_alloc_func();
|
|
316
|
+
class
|
|
317
|
+
}),
|
|
237
318
|
DecoderWrapper::WordPiece(_) => *memoize!(RClass: {
|
|
238
319
|
let class: RClass = crate::decoders().const_get("WordPiece").unwrap();
|
|
239
320
|
class.undef_alloc_func();
|
|
@@ -253,6 +334,9 @@ pub fn decoders(module: &RModule) -> RbResult<()> {
|
|
|
253
334
|
class.define_method("suffix", method!(RbDecoder::bpe_suffix, 0))?;
|
|
254
335
|
class.define_method("suffix=", method!(RbDecoder::bpe_set_suffix, 1))?;
|
|
255
336
|
|
|
337
|
+
let class = module.define_class("ByteFallback", decoder)?;
|
|
338
|
+
class.define_singleton_method("new", function!(RbByteFallbackDecoder::new, 0))?;
|
|
339
|
+
|
|
256
340
|
let class = module.define_class("ByteLevel", decoder)?;
|
|
257
341
|
class.define_singleton_method("new", function!(RbByteLevelDecoder::new, 0))?;
|
|
258
342
|
|
|
@@ -265,6 +349,9 @@ pub fn decoders(module: &RModule) -> RbResult<()> {
|
|
|
265
349
|
class.define_method("word_delimiter_token", method!(RbDecoder::ctc_word_delimiter_token, 0))?;
|
|
266
350
|
class.define_method("word_delimiter_token=", method!(RbDecoder::ctc_set_word_delimiter_token, 1))?;
|
|
267
351
|
|
|
352
|
+
let class = module.define_class("Fuse", decoder)?;
|
|
353
|
+
class.define_singleton_method("new", function!(RbFuse::new, 0))?;
|
|
354
|
+
|
|
268
355
|
let class = module.define_class("Metaspace", decoder)?;
|
|
269
356
|
class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 2))?;
|
|
270
357
|
class.define_method("add_prefix_space", method!(RbDecoder::metaspace_add_prefix_space, 0))?;
|
|
@@ -272,6 +359,18 @@ pub fn decoders(module: &RModule) -> RbResult<()> {
|
|
|
272
359
|
class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
|
|
273
360
|
class.define_method("replacement=", method!(RbDecoder::metaspace_set_replacement, 1))?;
|
|
274
361
|
|
|
362
|
+
let class = module.define_class("Replace", decoder)?;
|
|
363
|
+
class.define_singleton_method("new", function!(RbReplaceDecoder::new, 2))?;
|
|
364
|
+
|
|
365
|
+
let class = module.define_class("Strip", decoder)?;
|
|
366
|
+
class.define_singleton_method("_new", function!(RbStripDecoder::new, 3))?;
|
|
367
|
+
class.define_method("content", method!(RbDecoder::strip_content, 0))?;
|
|
368
|
+
class.define_method("content=", method!(RbDecoder::strip_set_content, 1))?;
|
|
369
|
+
class.define_method("start", method!(RbDecoder::strip_start, 0))?;
|
|
370
|
+
class.define_method("start=", method!(RbDecoder::strip_set_start, 1))?;
|
|
371
|
+
class.define_method("stop", method!(RbDecoder::strip_stop, 0))?;
|
|
372
|
+
class.define_method("stop=", method!(RbDecoder::strip_set_stop, 1))?;
|
|
373
|
+
|
|
275
374
|
let class = module.define_class("WordPiece", decoder)?;
|
|
276
375
|
class.define_singleton_method("_new", function!(RbWordPieceDecoder::new, 2))?;
|
|
277
376
|
class.define_method("cleanup", method!(RbDecoder::word_piece_cleanup, 0))?;
|
|
@@ -101,6 +101,11 @@ impl RbBPE {
|
|
|
101
101
|
builder = builder.fuse_unk(value.try_convert()?);
|
|
102
102
|
}
|
|
103
103
|
|
|
104
|
+
let value: Value = kwargs.delete(Symbol::new("byte_fallback"))?;
|
|
105
|
+
if !value.is_nil() {
|
|
106
|
+
builder = builder.byte_fallback(value.try_convert()?);
|
|
107
|
+
}
|
|
108
|
+
|
|
104
109
|
if !kwargs.is_empty() {
|
|
105
110
|
// TODO improve message
|
|
106
111
|
return Err(Error::new(exception::arg_error(), "unknown keyword"));
|
|
@@ -169,6 +174,14 @@ impl RbModel {
|
|
|
169
174
|
setter!(self, BPE, fuse_unk, fuse_unk);
|
|
170
175
|
}
|
|
171
176
|
|
|
177
|
+
pub fn bpe_byte_fallback(&self) -> bool {
|
|
178
|
+
getter!(self, BPE, byte_fallback)
|
|
179
|
+
}
|
|
180
|
+
|
|
181
|
+
pub fn bpe_set_byte_fallback(&self, byte_fallback: bool) {
|
|
182
|
+
setter!(self, BPE, byte_fallback, byte_fallback);
|
|
183
|
+
}
|
|
184
|
+
|
|
172
185
|
pub fn bpe_continuing_subword_prefix(&self) -> Option<String> {
|
|
173
186
|
getter!(self, BPE, continuing_subword_prefix.clone())
|
|
174
187
|
}
|
|
@@ -355,6 +368,8 @@ pub fn models(module: &RModule) -> RbResult<()> {
|
|
|
355
368
|
class.define_method("end_of_word_suffix=", method!(RbModel::bpe_set_end_of_word_suffix, 1))?;
|
|
356
369
|
class.define_method("fuse_unk", method!(RbModel::bpe_fuse_unk, 0))?;
|
|
357
370
|
class.define_method("fuse_unk=", method!(RbModel::bpe_set_fuse_unk, 1))?;
|
|
371
|
+
class.define_method("byte_fallback", method!(RbModel::bpe_byte_fallback, 0))?;
|
|
372
|
+
class.define_method("byte_fallback=", method!(RbModel::bpe_set_byte_fallback, 1))?;
|
|
358
373
|
|
|
359
374
|
let class = module.define_class("Unigram", model)?;
|
|
360
375
|
class.define_singleton_method("_new", function!(RbUnigram::new, 2))?;
|
|
@@ -8,7 +8,7 @@ use magnus::{
|
|
|
8
8
|
use serde::ser::SerializeStruct;
|
|
9
9
|
use serde::{Deserialize, Serialize, Serializer};
|
|
10
10
|
use tk::normalizers::{
|
|
11
|
-
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Replace, Strip, StripAccents,
|
|
11
|
+
BertNormalizer, Lowercase, Nmt, NormalizerWrapper, Replace, Prepend, Strip, StripAccents,
|
|
12
12
|
NFC, NFD, NFKC, NFKD,
|
|
13
13
|
};
|
|
14
14
|
use tk::{NormalizedString, Normalizer};
|
|
@@ -44,7 +44,7 @@ macro_rules! getter {
|
|
|
44
44
|
($self: ident, $variant: ident, $name: ident) => {{
|
|
45
45
|
if let RbNormalizerTypeWrapper::Single(ref norm) = &$self.normalizer {
|
|
46
46
|
let wrapper = norm.read().unwrap();
|
|
47
|
-
if let RbNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = *wrapper {
|
|
47
|
+
if let RbNormalizerWrapper::Wrapped(NormalizerWrapper::$variant(o)) = (*wrapper).clone() {
|
|
48
48
|
o.$name
|
|
49
49
|
} else {
|
|
50
50
|
unreachable!()
|
|
@@ -105,6 +105,14 @@ impl RbNormalizer {
|
|
|
105
105
|
setter!(self, BertNormalizer, lowercase, lowercase)
|
|
106
106
|
}
|
|
107
107
|
|
|
108
|
+
fn prepend_prepend(&self) -> String {
|
|
109
|
+
getter!(self, Prepend, prepend)
|
|
110
|
+
}
|
|
111
|
+
|
|
112
|
+
fn prepend_set_prepend(&self, prepend: String) {
|
|
113
|
+
setter!(self, Prepend, prepend, prepend)
|
|
114
|
+
}
|
|
115
|
+
|
|
108
116
|
fn strip_left(&self) -> bool {
|
|
109
117
|
getter!(self, StripNormalizer, strip_left)
|
|
110
118
|
}
|
|
@@ -186,6 +194,14 @@ impl RbReplace {
|
|
|
186
194
|
}
|
|
187
195
|
}
|
|
188
196
|
|
|
197
|
+
pub struct RbPrepend {}
|
|
198
|
+
|
|
199
|
+
impl RbPrepend {
|
|
200
|
+
pub fn new(prepend: String) -> RbNormalizer {
|
|
201
|
+
Prepend::new(prepend).into()
|
|
202
|
+
}
|
|
203
|
+
}
|
|
204
|
+
|
|
189
205
|
pub struct RbStrip {}
|
|
190
206
|
|
|
191
207
|
impl RbStrip {
|
|
@@ -372,6 +388,11 @@ unsafe impl TypedData for RbNormalizer {
|
|
|
372
388
|
class.undef_alloc_func();
|
|
373
389
|
class
|
|
374
390
|
}),
|
|
391
|
+
NormalizerWrapper::Prepend(_) => *memoize!(RClass: {
|
|
392
|
+
let class: RClass = crate::normalizers().const_get("Prepend").unwrap();
|
|
393
|
+
class.undef_alloc_func();
|
|
394
|
+
class
|
|
395
|
+
}),
|
|
375
396
|
NormalizerWrapper::StripNormalizer(_) => *memoize!(RClass: {
|
|
376
397
|
let class: RClass = crate::normalizers().const_get("Strip").unwrap();
|
|
377
398
|
class.undef_alloc_func();
|
|
@@ -428,6 +449,11 @@ pub fn normalizers(module: &RModule) -> RbResult<()> {
|
|
|
428
449
|
let class = module.define_class("Replace", normalizer)?;
|
|
429
450
|
class.define_singleton_method("new", function!(RbReplace::new, 2))?;
|
|
430
451
|
|
|
452
|
+
let class = module.define_class("Prepend", normalizer)?;
|
|
453
|
+
class.define_singleton_method("_new", function!(RbPrepend::new, 1))?;
|
|
454
|
+
class.define_method("prepend", method!(RbNormalizer::prepend_prepend, 0))?;
|
|
455
|
+
class.define_method("prepend=", method!(RbNormalizer::prepend_set_prepend, 1))?;
|
|
456
|
+
|
|
431
457
|
let class = module.define_class("Strip", normalizer)?;
|
|
432
458
|
class.define_singleton_method("_new", function!(RbStrip::new, 2))?;
|
|
433
459
|
class.define_method("left", method!(RbNormalizer::strip_left, 0))?;
|
data/lib/tokenizers/version.rb
CHANGED
data/lib/tokenizers.rb
CHANGED
|
@@ -9,6 +9,7 @@ end
|
|
|
9
9
|
require_relative "tokenizers/decoders/bpe_decoder"
|
|
10
10
|
require_relative "tokenizers/decoders/ctc"
|
|
11
11
|
require_relative "tokenizers/decoders/metaspace"
|
|
12
|
+
require_relative "tokenizers/decoders/strip"
|
|
12
13
|
require_relative "tokenizers/decoders/word_piece"
|
|
13
14
|
|
|
14
15
|
# models
|
|
@@ -19,6 +20,7 @@ require_relative "tokenizers/models/unigram"
|
|
|
19
20
|
|
|
20
21
|
# normalizers
|
|
21
22
|
require_relative "tokenizers/normalizers/bert_normalizer"
|
|
23
|
+
require_relative "tokenizers/normalizers/prepend"
|
|
22
24
|
require_relative "tokenizers/normalizers/strip"
|
|
23
25
|
|
|
24
26
|
# pre-tokenizers
|
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: tokenizers
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.3.1
|
|
4
|
+
version: 0.3.3
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Kane
|
|
8
8
|
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date: 2023-02-08 00:00:00.000000000 Z
|
|
11
|
+
date: 2023-04-09 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rb_sys
|
|
@@ -56,6 +56,7 @@ files:
|
|
|
56
56
|
- lib/tokenizers/decoders/bpe_decoder.rb
|
|
57
57
|
- lib/tokenizers/decoders/ctc.rb
|
|
58
58
|
- lib/tokenizers/decoders/metaspace.rb
|
|
59
|
+
- lib/tokenizers/decoders/strip.rb
|
|
59
60
|
- lib/tokenizers/decoders/word_piece.rb
|
|
60
61
|
- lib/tokenizers/encoding.rb
|
|
61
62
|
- lib/tokenizers/from_pretrained.rb
|
|
@@ -64,6 +65,7 @@ files:
|
|
|
64
65
|
- lib/tokenizers/models/word_level.rb
|
|
65
66
|
- lib/tokenizers/models/word_piece.rb
|
|
66
67
|
- lib/tokenizers/normalizers/bert_normalizer.rb
|
|
68
|
+
- lib/tokenizers/normalizers/prepend.rb
|
|
67
69
|
- lib/tokenizers/normalizers/strip.rb
|
|
68
70
|
- lib/tokenizers/pre_tokenizers/byte_level.rb
|
|
69
71
|
- lib/tokenizers/pre_tokenizers/digits.rb
|
|
@@ -98,7 +100,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
|
98
100
|
- !ruby/object:Gem::Version
|
|
99
101
|
version: '0'
|
|
100
102
|
requirements: []
|
|
101
|
-
rubygems_version: 3.4.
|
|
103
|
+
rubygems_version: 3.4.10
|
|
102
104
|
signing_key:
|
|
103
105
|
specification_version: 4
|
|
104
106
|
summary: Fast state-of-the-art tokenizers for Ruby
|