tokenizers 0.5.3 → 0.5.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Cargo.lock +154 -83
- data/ext/tokenizers/Cargo.toml +2 -2
- data/ext/tokenizers/src/decoders.rs +32 -14
- data/ext/tokenizers/src/error.rs +6 -1
- data/ext/tokenizers/src/lib.rs +37 -12
- data/ext/tokenizers/src/models.rs +75 -23
- data/ext/tokenizers/src/normalizers.rs +84 -24
- data/ext/tokenizers/src/pre_tokenizers.rs +121 -42
- data/ext/tokenizers/src/processors.rs +22 -10
- data/ext/tokenizers/src/tokenizer.rs +63 -34
- data/ext/tokenizers/src/trainers.rs +215 -56
- data/ext/tokenizers/src/utils/regex.rs +6 -4
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/version.rb +1 -1
- metadata +3 -7
    
        checksums.yaml
    CHANGED
    
    | @@ -1,7 +1,7 @@ | |
| 1 1 | 
             
            ---
         | 
| 2 2 | 
             
            SHA256:
         | 
| 3 | 
            -
              metadata.gz:  | 
| 4 | 
            -
              data.tar.gz:  | 
| 3 | 
            +
              metadata.gz: 8394d394a6ebaa502c53d08508586044c1f6e0ea8cd8c6629e6a7c2bed38518e
         | 
| 4 | 
            +
              data.tar.gz: 1b54fa285fb6799c2cbc411c21c0f951db1fdacc3291b1da1971f420ada07820
         | 
| 5 5 | 
             
            SHA512:
         | 
| 6 | 
            -
              metadata.gz:  | 
| 7 | 
            -
              data.tar.gz:  | 
| 6 | 
            +
              metadata.gz: f75af568151b9aa3fb9b57c2020464e5d1270174a25adff113f9058b8a92c11288f4df22c1f382af47c9312c1cca83f90214b1875a37334e232bd35bbfd4785e
         | 
| 7 | 
            +
              data.tar.gz: a5b9d665dd2f985f03ea1056887169318c679a4fd30bc768f00109bd2d225958233f57f254a7c1e505c0b3b3fdb94b8a2fcd4027c55405088c6c1e88e3fd24be
         | 
    
        data/CHANGELOG.md
    CHANGED
    
    
    
        data/Cargo.lock
    CHANGED
    
    | @@ -19,9 +19,9 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8" | |
| 19 19 |  | 
| 20 20 | 
             
            [[package]]
         | 
| 21 21 | 
             
            name = "bindgen"
         | 
| 22 | 
            -
            version = "0.69. | 
| 22 | 
            +
            version = "0.69.5"
         | 
| 23 23 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 24 | 
            -
            checksum = " | 
| 24 | 
            +
            checksum = "271383c67ccabffb7381723dea0672a673f292304fcb45c01cc648c7a8d58088"
         | 
| 25 25 | 
             
            dependencies = [
         | 
| 26 26 | 
             
             "bitflags 2.6.0",
         | 
| 27 27 | 
             
             "cexpr",
         | 
| @@ -49,6 +49,12 @@ version = "2.6.0" | |
| 49 49 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 50 50 | 
             
            checksum = "b048fb63fd8b5923fc5aa7b340d8e156aec7ec02f0c78fa8a6ddc2613f6f71de"
         | 
| 51 51 |  | 
| 52 | 
            +
            [[package]]
         | 
| 53 | 
            +
            name = "bumpalo"
         | 
| 54 | 
            +
            version = "3.16.0"
         | 
| 55 | 
            +
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 56 | 
            +
            checksum = "79296716171880943b8470b5f8d03aa55eb2e645a4874bdbb28adb49162e012c"
         | 
| 57 | 
            +
             | 
| 52 58 | 
             
            [[package]]
         | 
| 53 59 | 
             
            name = "byteorder"
         | 
| 54 60 | 
             
            version = "1.5.0"
         | 
| @@ -57,9 +63,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" | |
| 57 63 |  | 
| 58 64 | 
             
            [[package]]
         | 
| 59 65 | 
             
            name = "cc"
         | 
| 60 | 
            -
            version = "1. | 
| 66 | 
            +
            version = "1.2.6"
         | 
| 61 67 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 62 | 
            -
            checksum = " | 
| 68 | 
            +
            checksum = "8d6dbb628b8f8555f86d0323c2eb39e3ec81901f4b83e091db8a6a76d316a333"
         | 
| 63 69 | 
             
            dependencies = [
         | 
| 64 70 | 
             
             "shlex",
         | 
| 65 71 | 
             
            ]
         | 
| @@ -92,22 +98,22 @@ dependencies = [ | |
| 92 98 |  | 
| 93 99 | 
             
            [[package]]
         | 
| 94 100 | 
             
            name = "console"
         | 
| 95 | 
            -
            version = "0.15. | 
| 101 | 
            +
            version = "0.15.10"
         | 
| 96 102 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 97 | 
            -
            checksum = " | 
| 103 | 
            +
            checksum = "ea3c6ecd8059b57859df5c69830340ed3c41d30e3da0c1cbed90a96ac853041b"
         | 
| 98 104 | 
             
            dependencies = [
         | 
| 99 105 | 
             
             "encode_unicode",
         | 
| 100 | 
            -
             "lazy_static",
         | 
| 101 106 | 
             
             "libc",
         | 
| 107 | 
            +
             "once_cell",
         | 
| 102 108 | 
             
             "unicode-width",
         | 
| 103 109 | 
             
             "windows-sys",
         | 
| 104 110 | 
             
            ]
         | 
| 105 111 |  | 
| 106 112 | 
             
            [[package]]
         | 
| 107 113 | 
             
            name = "crossbeam-deque"
         | 
| 108 | 
            -
            version = "0.8. | 
| 114 | 
            +
            version = "0.8.6"
         | 
| 109 115 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 110 | 
            -
            checksum = " | 
| 116 | 
            +
            checksum = "9dd111b7b7f7d55b72c0a6ae361660ee5853c9af73f70c3c2ef6858b950e2e51"
         | 
| 111 117 | 
             
            dependencies = [
         | 
| 112 118 | 
             
             "crossbeam-epoch",
         | 
| 113 119 | 
             
             "crossbeam-utils",
         | 
| @@ -124,9 +130,9 @@ dependencies = [ | |
| 124 130 |  | 
| 125 131 | 
             
            [[package]]
         | 
| 126 132 | 
             
            name = "crossbeam-utils"
         | 
| 127 | 
            -
            version = "0.8. | 
| 133 | 
            +
            version = "0.8.21"
         | 
| 128 134 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 129 | 
            -
            checksum = " | 
| 135 | 
            +
            checksum = "d0a5c400df2834b80a4c3327b3aad3a4c4cd4de0629063962b03235697506a28"
         | 
| 130 136 |  | 
| 131 137 | 
             
            [[package]]
         | 
| 132 138 | 
             
            name = "darling"
         | 
| @@ -165,18 +171,18 @@ dependencies = [ | |
| 165 171 |  | 
| 166 172 | 
             
            [[package]]
         | 
| 167 173 | 
             
            name = "derive_builder"
         | 
| 168 | 
            -
            version = "0.20. | 
| 174 | 
            +
            version = "0.20.2"
         | 
| 169 175 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 170 | 
            -
            checksum = " | 
| 176 | 
            +
            checksum = "507dfb09ea8b7fa618fcf76e953f4f5e192547945816d5358edffe39f6f94947"
         | 
| 171 177 | 
             
            dependencies = [
         | 
| 172 178 | 
             
             "derive_builder_macro",
         | 
| 173 179 | 
             
            ]
         | 
| 174 180 |  | 
| 175 181 | 
             
            [[package]]
         | 
| 176 182 | 
             
            name = "derive_builder_core"
         | 
| 177 | 
            -
            version = "0.20. | 
| 183 | 
            +
            version = "0.20.2"
         | 
| 178 184 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 179 | 
            -
            checksum = " | 
| 185 | 
            +
            checksum = "2d5bcf7b024d6835cfb3d473887cd966994907effbe9227e8c8219824d06c4e8"
         | 
| 180 186 | 
             
            dependencies = [
         | 
| 181 187 | 
             
             "darling",
         | 
| 182 188 | 
             
             "proc-macro2",
         | 
| @@ -186,9 +192,9 @@ dependencies = [ | |
| 186 192 |  | 
| 187 193 | 
             
            [[package]]
         | 
| 188 194 | 
             
            name = "derive_builder_macro"
         | 
| 189 | 
            -
            version = "0.20. | 
| 195 | 
            +
            version = "0.20.2"
         | 
| 190 196 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 191 | 
            -
            checksum = " | 
| 197 | 
            +
            checksum = "ab63b0e2bf4d5928aff72e83a7dace85d7bba5fe12dcc3c5a572d78caffd3f3c"
         | 
| 192 198 | 
             
            dependencies = [
         | 
| 193 199 | 
             
             "derive_builder_core",
         | 
| 194 200 | 
             
             "syn",
         | 
| @@ -202,9 +208,9 @@ checksum = "60b1af1c220855b6ceac025d3f6ecdd2b7c4894bfe9cd9bda4fbb4bc7c0d4cf0" | |
| 202 208 |  | 
| 203 209 | 
             
            [[package]]
         | 
| 204 210 | 
             
            name = "encode_unicode"
         | 
| 205 | 
            -
            version = "0. | 
| 211 | 
            +
            version = "1.0.0"
         | 
| 206 212 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 207 | 
            -
            checksum = " | 
| 213 | 
            +
            checksum = "34aa73646ffb006b8f5147f3dc182bd4bcb190227ce861fc4a4844bf8e3cb2c0"
         | 
| 208 214 |  | 
| 209 215 | 
             
            [[package]]
         | 
| 210 216 | 
             
            name = "esaxx-rs"
         | 
| @@ -234,9 +240,9 @@ dependencies = [ | |
| 234 240 |  | 
| 235 241 | 
             
            [[package]]
         | 
| 236 242 | 
             
            name = "glob"
         | 
| 237 | 
            -
            version = "0.3. | 
| 243 | 
            +
            version = "0.3.2"
         | 
| 238 244 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 239 | 
            -
            checksum = " | 
| 245 | 
            +
            checksum = "a8d1add55171497b4705a648c6b583acafb01d58050a51727785f0b2c8e0a2b2"
         | 
| 240 246 |  | 
| 241 247 | 
             
            [[package]]
         | 
| 242 248 | 
             
            name = "ident_case"
         | 
| @@ -246,24 +252,15 @@ checksum = "b9e0384b61958566e926dc50660321d12159025e767c18e043daf26b70104c39" | |
| 246 252 |  | 
| 247 253 | 
             
            [[package]]
         | 
| 248 254 | 
             
            name = "indicatif"
         | 
| 249 | 
            -
            version = "0.17. | 
| 255 | 
            +
            version = "0.17.9"
         | 
| 250 256 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 251 | 
            -
            checksum = " | 
| 257 | 
            +
            checksum = "cbf675b85ed934d3c67b5c5469701eec7db22689d0a2139d856e0925fa28b281"
         | 
| 252 258 | 
             
            dependencies = [
         | 
| 253 259 | 
             
             "console",
         | 
| 254 | 
            -
             "instant",
         | 
| 255 260 | 
             
             "number_prefix",
         | 
| 256 261 | 
             
             "portable-atomic",
         | 
| 257 262 | 
             
             "unicode-width",
         | 
| 258 | 
            -
             | 
| 259 | 
            -
             | 
| 260 | 
            -
            [[package]]
         | 
| 261 | 
            -
            name = "instant"
         | 
| 262 | 
            -
            version = "0.1.13"
         | 
| 263 | 
            -
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 264 | 
            -
            checksum = "e0242819d153cba4b4b05a5a8f2a7e9bbf97b6055b2a002b395c96b5ff3c0222"
         | 
| 265 | 
            -
            dependencies = [
         | 
| 266 | 
            -
             "cfg-if",
         | 
| 263 | 
            +
             "web-time",
         | 
| 267 264 | 
             
            ]
         | 
| 268 265 |  | 
| 269 266 | 
             
            [[package]]
         | 
| @@ -286,9 +283,19 @@ dependencies = [ | |
| 286 283 |  | 
| 287 284 | 
             
            [[package]]
         | 
| 288 285 | 
             
            name = "itoa"
         | 
| 289 | 
            -
            version = "1.0. | 
| 286 | 
            +
            version = "1.0.14"
         | 
| 290 287 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 291 | 
            -
            checksum = " | 
| 288 | 
            +
            checksum = "d75a2a4b1b190afb6f5425f10f6a8f959d2ea0b9c2b1d79553551850539e4674"
         | 
| 289 | 
            +
             | 
| 290 | 
            +
            [[package]]
         | 
| 291 | 
            +
            name = "js-sys"
         | 
| 292 | 
            +
            version = "0.3.76"
         | 
| 293 | 
            +
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 294 | 
            +
            checksum = "6717b6b5b077764fb5966237269cb3c64edddde4b14ce42647430a78ced9e7b7"
         | 
| 295 | 
            +
            dependencies = [
         | 
| 296 | 
            +
             "once_cell",
         | 
| 297 | 
            +
             "wasm-bindgen",
         | 
| 298 | 
            +
            ]
         | 
| 292 299 |  | 
| 293 300 | 
             
            [[package]]
         | 
| 294 301 | 
             
            name = "lazy_static"
         | 
| @@ -304,15 +311,15 @@ checksum = "830d08ce1d1d941e6b30645f1a0eb5643013d835ce3779a5fc208261dbe10f55" | |
| 304 311 |  | 
| 305 312 | 
             
            [[package]]
         | 
| 306 313 | 
             
            name = "libc"
         | 
| 307 | 
            -
            version = "0.2. | 
| 314 | 
            +
            version = "0.2.169"
         | 
| 308 315 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 309 | 
            -
            checksum = " | 
| 316 | 
            +
            checksum = "b5aba8db14291edd000dfcc4d620c7ebfb122c613afb886ca8803fa4e128a20a"
         | 
| 310 317 |  | 
| 311 318 | 
             
            [[package]]
         | 
| 312 319 | 
             
            name = "libloading"
         | 
| 313 | 
            -
            version = "0.8. | 
| 320 | 
            +
            version = "0.8.6"
         | 
| 314 321 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 315 | 
            -
            checksum = " | 
| 322 | 
            +
            checksum = "fc2f4eb4bc735547cfed7c0a4922cbd04a4655978c09b54f1f7b228750664c34"
         | 
| 316 323 | 
             
            dependencies = [
         | 
| 317 324 | 
             
             "cfg-if",
         | 
| 318 325 | 
             
             "windows-targets",
         | 
| @@ -414,9 +421,9 @@ checksum = "830b246a0e5f20af87141b25c173cd1b609bd7779a4617d6ec582abaf90870f3" | |
| 414 421 |  | 
| 415 422 | 
             
            [[package]]
         | 
| 416 423 | 
             
            name = "once_cell"
         | 
| 417 | 
            -
            version = "1. | 
| 424 | 
            +
            version = "1.20.2"
         | 
| 418 425 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 419 | 
            -
            checksum = " | 
| 426 | 
            +
            checksum = "1261fe7e33c73b354eab43b1273a57c8f967d0391e80353e51f764ac02cf6775"
         | 
| 420 427 |  | 
| 421 428 | 
             
            [[package]]
         | 
| 422 429 | 
             
            name = "onig"
         | 
| @@ -448,15 +455,15 @@ checksum = "57c0d7b74b563b49d38dae00a0c37d4d6de9b432382b2892f0574ddcae73fd0a" | |
| 448 455 |  | 
| 449 456 | 
             
            [[package]]
         | 
| 450 457 | 
             
            name = "pkg-config"
         | 
| 451 | 
            -
            version = "0.3. | 
| 458 | 
            +
            version = "0.3.31"
         | 
| 452 459 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 453 | 
            -
            checksum = " | 
| 460 | 
            +
            checksum = "953ec861398dccce10c670dfeaf3ec4911ca479e9c02154b3a215178c5f566f2"
         | 
| 454 461 |  | 
| 455 462 | 
             
            [[package]]
         | 
| 456 463 | 
             
            name = "portable-atomic"
         | 
| 457 | 
            -
            version = "1. | 
| 464 | 
            +
            version = "1.10.0"
         | 
| 458 465 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 459 | 
            -
            checksum = " | 
| 466 | 
            +
            checksum = "280dc24453071f1b63954171985a0b0d30058d287960968b9b2aca264c8d4ee6"
         | 
| 460 467 |  | 
| 461 468 | 
             
            [[package]]
         | 
| 462 469 | 
             
            name = "ppv-lite86"
         | 
| @@ -469,18 +476,18 @@ dependencies = [ | |
| 469 476 |  | 
| 470 477 | 
             
            [[package]]
         | 
| 471 478 | 
             
            name = "proc-macro2"
         | 
| 472 | 
            -
            version = "1.0. | 
| 479 | 
            +
            version = "1.0.92"
         | 
| 473 480 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 474 | 
            -
            checksum = " | 
| 481 | 
            +
            checksum = "37d3544b3f2748c54e147655edb5025752e2303145b5aefb3c3ea2c78b973bb0"
         | 
| 475 482 | 
             
            dependencies = [
         | 
| 476 483 | 
             
             "unicode-ident",
         | 
| 477 484 | 
             
            ]
         | 
| 478 485 |  | 
| 479 486 | 
             
            [[package]]
         | 
| 480 487 | 
             
            name = "quote"
         | 
| 481 | 
            -
            version = "1.0. | 
| 488 | 
            +
            version = "1.0.38"
         | 
| 482 489 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 483 | 
            -
            checksum = " | 
| 490 | 
            +
            checksum = "0e4dccaaaf89514f546c693ddc140f729f958c247918a13380cccc6078391acc"
         | 
| 484 491 | 
             
            dependencies = [
         | 
| 485 492 | 
             
             "proc-macro2",
         | 
| 486 493 | 
             
            ]
         | 
| @@ -548,18 +555,18 @@ dependencies = [ | |
| 548 555 |  | 
| 549 556 | 
             
            [[package]]
         | 
| 550 557 | 
             
            name = "rb-sys"
         | 
| 551 | 
            -
            version = "0.9. | 
| 558 | 
            +
            version = "0.9.105"
         | 
| 552 559 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 553 | 
            -
            checksum = " | 
| 560 | 
            +
            checksum = "4b3a1f3ce8e7c36d777d52fe7a99039fe4fea7c8ec355a4c4f3a17f92a14029f"
         | 
| 554 561 | 
             
            dependencies = [
         | 
| 555 562 | 
             
             "rb-sys-build",
         | 
| 556 563 | 
             
            ]
         | 
| 557 564 |  | 
| 558 565 | 
             
            [[package]]
         | 
| 559 566 | 
             
            name = "rb-sys-build"
         | 
| 560 | 
            -
            version = "0.9. | 
| 567 | 
            +
            version = "0.9.105"
         | 
| 561 568 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 562 | 
            -
            checksum = " | 
| 569 | 
            +
            checksum = "3e6b246c29c0809e1cbe60a1ba9e093da72a4676d02adc68469297d1e589bbf0"
         | 
| 563 570 | 
             
            dependencies = [
         | 
| 564 571 | 
             
             "bindgen",
         | 
| 565 572 | 
             
             "lazy_static",
         | 
| @@ -578,9 +585,9 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb" | |
| 578 585 |  | 
| 579 586 | 
             
            [[package]]
         | 
| 580 587 | 
             
            name = "regex"
         | 
| 581 | 
            -
            version = "1. | 
| 588 | 
            +
            version = "1.11.1"
         | 
| 582 589 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 583 | 
            -
            checksum = " | 
| 590 | 
            +
            checksum = "b544ef1b4eac5dc2db33ea63606ae9ffcfac26c1416a2806ae0bf5f56b201191"
         | 
| 584 591 | 
             
            dependencies = [
         | 
| 585 592 | 
             
             "aho-corasick",
         | 
| 586 593 | 
             
             "memchr",
         | 
| @@ -590,9 +597,9 @@ dependencies = [ | |
| 590 597 |  | 
| 591 598 | 
             
            [[package]]
         | 
| 592 599 | 
             
            name = "regex-automata"
         | 
| 593 | 
            -
            version = "0.4. | 
| 600 | 
            +
            version = "0.4.9"
         | 
| 594 601 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 595 | 
            -
            checksum = " | 
| 602 | 
            +
            checksum = "809e8dc61f6de73b46c85f4c96486310fe304c434cfa43669d7b40f711150908"
         | 
| 596 603 | 
             
            dependencies = [
         | 
| 597 604 | 
             
             "aho-corasick",
         | 
| 598 605 | 
             
             "memchr",
         | 
| @@ -601,9 +608,9 @@ dependencies = [ | |
| 601 608 |  | 
| 602 609 | 
             
            [[package]]
         | 
| 603 610 | 
             
            name = "regex-syntax"
         | 
| 604 | 
            -
            version = "0.8. | 
| 611 | 
            +
            version = "0.8.5"
         | 
| 605 612 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 606 | 
            -
            checksum = " | 
| 613 | 
            +
            checksum = "2b15c43186be67a4fd63bee50d0303afffcef381492ebe2c5d87f324e1b8815c"
         | 
| 607 614 |  | 
| 608 615 | 
             
            [[package]]
         | 
| 609 616 | 
             
            name = "rustc-hash"
         | 
| @@ -625,18 +632,18 @@ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4" | |
| 625 632 |  | 
| 626 633 | 
             
            [[package]]
         | 
| 627 634 | 
             
            name = "serde"
         | 
| 628 | 
            -
            version = "1.0. | 
| 635 | 
            +
            version = "1.0.217"
         | 
| 629 636 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 630 | 
            -
            checksum = " | 
| 637 | 
            +
            checksum = "02fc4265df13d6fa1d00ecff087228cc0a2b5f3c0e87e258d8b94a156e984c70"
         | 
| 631 638 | 
             
            dependencies = [
         | 
| 632 639 | 
             
             "serde_derive",
         | 
| 633 640 | 
             
            ]
         | 
| 634 641 |  | 
| 635 642 | 
             
            [[package]]
         | 
| 636 643 | 
             
            name = "serde_derive"
         | 
| 637 | 
            -
            version = "1.0. | 
| 644 | 
            +
            version = "1.0.217"
         | 
| 638 645 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 639 | 
            -
            checksum = " | 
| 646 | 
            +
            checksum = "5a9bf7cf98d04a2b28aead066b7496853d4779c9cc183c440dbac457641e19a0"
         | 
| 640 647 | 
             
            dependencies = [
         | 
| 641 648 | 
             
             "proc-macro2",
         | 
| 642 649 | 
             
             "quote",
         | 
| @@ -645,9 +652,9 @@ dependencies = [ | |
| 645 652 |  | 
| 646 653 | 
             
            [[package]]
         | 
| 647 654 | 
             
            name = "serde_json"
         | 
| 648 | 
            -
            version = "1.0. | 
| 655 | 
            +
            version = "1.0.134"
         | 
| 649 656 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 650 | 
            -
            checksum = " | 
| 657 | 
            +
            checksum = "d00f4175c42ee48b15416f6193a959ba3a0d67fc699a0db9ad12df9f83991c7d"
         | 
| 651 658 | 
             
            dependencies = [
         | 
| 652 659 | 
             
             "itoa",
         | 
| 653 660 | 
             
             "memchr",
         | 
| @@ -693,9 +700,9 @@ checksum = "7da8b5736845d9f2fcb837ea5d9e2628564b3b043a70948a3f0b778838c5fb4f" | |
| 693 700 |  | 
| 694 701 | 
             
            [[package]]
         | 
| 695 702 | 
             
            name = "syn"
         | 
| 696 | 
            -
            version = "2.0. | 
| 703 | 
            +
            version = "2.0.93"
         | 
| 697 704 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 698 | 
            -
            checksum = " | 
| 705 | 
            +
            checksum = "9c786062daee0d6db1132800e623df74274a0a87322d8e183338e01b3d98d058"
         | 
| 699 706 | 
             
            dependencies = [
         | 
| 700 707 | 
             
             "proc-macro2",
         | 
| 701 708 | 
             
             "quote",
         | 
| @@ -704,18 +711,18 @@ dependencies = [ | |
| 704 711 |  | 
| 705 712 | 
             
            [[package]]
         | 
| 706 713 | 
             
            name = "thiserror"
         | 
| 707 | 
            -
            version = "1.0. | 
| 714 | 
            +
            version = "1.0.69"
         | 
| 708 715 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 709 | 
            -
            checksum = " | 
| 716 | 
            +
            checksum = "b6aaf5339b578ea85b50e080feb250a3e8ae8cfcdff9a461c9ec2904bc923f52"
         | 
| 710 717 | 
             
            dependencies = [
         | 
| 711 718 | 
             
             "thiserror-impl",
         | 
| 712 719 | 
             
            ]
         | 
| 713 720 |  | 
| 714 721 | 
             
            [[package]]
         | 
| 715 722 | 
             
            name = "thiserror-impl"
         | 
| 716 | 
            -
            version = "1.0. | 
| 723 | 
            +
            version = "1.0.69"
         | 
| 717 724 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 718 | 
            -
            checksum = " | 
| 725 | 
            +
            checksum = "4fee6c4efc90059e10f81e6d42c60a18f76588c3d74cb83a0b242a2b6c7504c1"
         | 
| 719 726 | 
             
            dependencies = [
         | 
| 720 727 | 
             
             "proc-macro2",
         | 
| 721 728 | 
             
             "quote",
         | 
| @@ -724,19 +731,19 @@ dependencies = [ | |
| 724 731 |  | 
| 725 732 | 
             
            [[package]]
         | 
| 726 733 | 
             
            name = "tokenizers"
         | 
| 727 | 
            -
            version = "0.5. | 
| 734 | 
            +
            version = "0.5.4"
         | 
| 728 735 | 
             
            dependencies = [
         | 
| 729 736 | 
             
             "magnus",
         | 
| 730 737 | 
             
             "onig",
         | 
| 731 738 | 
             
             "serde",
         | 
| 732 | 
            -
             "tokenizers 0. | 
| 739 | 
            +
             "tokenizers 0.21.0",
         | 
| 733 740 | 
             
            ]
         | 
| 734 741 |  | 
| 735 742 | 
             
            [[package]]
         | 
| 736 743 | 
             
            name = "tokenizers"
         | 
| 737 | 
            -
            version = "0. | 
| 744 | 
            +
            version = "0.21.0"
         | 
| 738 745 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 739 | 
            -
            checksum = " | 
| 746 | 
            +
            checksum = "9ecededfed68a69bc657e486510089e255e53c3d38cc7d4d59c8742668ca2cae"
         | 
| 740 747 | 
             
            dependencies = [
         | 
| 741 748 | 
             
             "aho-corasick",
         | 
| 742 749 | 
             
             "derive_builder",
         | 
| @@ -766,9 +773,9 @@ dependencies = [ | |
| 766 773 |  | 
| 767 774 | 
             
            [[package]]
         | 
| 768 775 | 
             
            name = "unicode-ident"
         | 
| 769 | 
            -
            version = "1.0. | 
| 776 | 
            +
            version = "1.0.14"
         | 
| 770 777 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 771 | 
            -
            checksum = " | 
| 778 | 
            +
            checksum = "adb9e6ca4f869e1180728b7950e35922a7fc6397f7b641499e8f3ef06e50dc83"
         | 
| 772 779 |  | 
| 773 780 | 
             
            [[package]]
         | 
| 774 781 | 
             
            name = "unicode-normalization-alignments"
         | 
| @@ -781,15 +788,15 @@ dependencies = [ | |
| 781 788 |  | 
| 782 789 | 
             
            [[package]]
         | 
| 783 790 | 
             
            name = "unicode-segmentation"
         | 
| 784 | 
            -
            version = "1. | 
| 791 | 
            +
            version = "1.12.0"
         | 
| 785 792 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 786 | 
            -
            checksum = " | 
| 793 | 
            +
            checksum = "f6ccf251212114b54433ec949fd6a7841275f9ada20dddd2f29e9ceea4501493"
         | 
| 787 794 |  | 
| 788 795 | 
             
            [[package]]
         | 
| 789 796 | 
             
            name = "unicode-width"
         | 
| 790 | 
            -
            version = "0. | 
| 797 | 
            +
            version = "0.2.0"
         | 
| 791 798 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 792 | 
            -
            checksum = " | 
| 799 | 
            +
            checksum = "1fc81956842c57dac11422a97c3b8195a1ff727f06e85c84ed2e8aa277c9a0fd"
         | 
| 793 800 |  | 
| 794 801 | 
             
            [[package]]
         | 
| 795 802 | 
             
            name = "unicode_categories"
         | 
| @@ -803,11 +810,75 @@ version = "0.11.0+wasi-snapshot-preview1" | |
| 803 810 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 804 811 | 
             
            checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423"
         | 
| 805 812 |  | 
| 813 | 
            +
            [[package]]
         | 
| 814 | 
            +
            name = "wasm-bindgen"
         | 
| 815 | 
            +
            version = "0.2.99"
         | 
| 816 | 
            +
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 817 | 
            +
            checksum = "a474f6281d1d70c17ae7aa6a613c87fce69a127e2624002df63dcb39d6cf6396"
         | 
| 818 | 
            +
            dependencies = [
         | 
| 819 | 
            +
             "cfg-if",
         | 
| 820 | 
            +
             "once_cell",
         | 
| 821 | 
            +
             "wasm-bindgen-macro",
         | 
| 822 | 
            +
            ]
         | 
| 823 | 
            +
             | 
| 824 | 
            +
            [[package]]
         | 
| 825 | 
            +
            name = "wasm-bindgen-backend"
         | 
| 826 | 
            +
            version = "0.2.99"
         | 
| 827 | 
            +
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 828 | 
            +
            checksum = "5f89bb38646b4f81674e8f5c3fb81b562be1fd936d84320f3264486418519c79"
         | 
| 829 | 
            +
            dependencies = [
         | 
| 830 | 
            +
             "bumpalo",
         | 
| 831 | 
            +
             "log",
         | 
| 832 | 
            +
             "proc-macro2",
         | 
| 833 | 
            +
             "quote",
         | 
| 834 | 
            +
             "syn",
         | 
| 835 | 
            +
             "wasm-bindgen-shared",
         | 
| 836 | 
            +
            ]
         | 
| 837 | 
            +
             | 
| 838 | 
            +
            [[package]]
         | 
| 839 | 
            +
            name = "wasm-bindgen-macro"
         | 
| 840 | 
            +
            version = "0.2.99"
         | 
| 841 | 
            +
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 842 | 
            +
            checksum = "2cc6181fd9a7492eef6fef1f33961e3695e4579b9872a6f7c83aee556666d4fe"
         | 
| 843 | 
            +
            dependencies = [
         | 
| 844 | 
            +
             "quote",
         | 
| 845 | 
            +
             "wasm-bindgen-macro-support",
         | 
| 846 | 
            +
            ]
         | 
| 847 | 
            +
             | 
| 848 | 
            +
            [[package]]
         | 
| 849 | 
            +
            name = "wasm-bindgen-macro-support"
         | 
| 850 | 
            +
            version = "0.2.99"
         | 
| 851 | 
            +
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 852 | 
            +
            checksum = "30d7a95b763d3c45903ed6c81f156801839e5ee968bb07e534c44df0fcd330c2"
         | 
| 853 | 
            +
            dependencies = [
         | 
| 854 | 
            +
             "proc-macro2",
         | 
| 855 | 
            +
             "quote",
         | 
| 856 | 
            +
             "syn",
         | 
| 857 | 
            +
             "wasm-bindgen-backend",
         | 
| 858 | 
            +
             "wasm-bindgen-shared",
         | 
| 859 | 
            +
            ]
         | 
| 860 | 
            +
             | 
| 861 | 
            +
            [[package]]
         | 
| 862 | 
            +
            name = "wasm-bindgen-shared"
         | 
| 863 | 
            +
            version = "0.2.99"
         | 
| 864 | 
            +
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 865 | 
            +
            checksum = "943aab3fdaaa029a6e0271b35ea10b72b943135afe9bffca82384098ad0e06a6"
         | 
| 866 | 
            +
             | 
| 867 | 
            +
            [[package]]
         | 
| 868 | 
            +
            name = "web-time"
         | 
| 869 | 
            +
            version = "1.1.0"
         | 
| 870 | 
            +
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 871 | 
            +
            checksum = "5a6580f308b1fad9207618087a65c04e7a10bc77e02c8e84e9b00dd4b12fa0bb"
         | 
| 872 | 
            +
            dependencies = [
         | 
| 873 | 
            +
             "js-sys",
         | 
| 874 | 
            +
             "wasm-bindgen",
         | 
| 875 | 
            +
            ]
         | 
| 876 | 
            +
             | 
| 806 877 | 
             
            [[package]]
         | 
| 807 878 | 
             
            name = "windows-sys"
         | 
| 808 | 
            -
            version = "0. | 
| 879 | 
            +
            version = "0.59.0"
         | 
| 809 880 | 
             
            source = "registry+https://github.com/rust-lang/crates.io-index"
         | 
| 810 | 
            -
            checksum = " | 
| 881 | 
            +
            checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b"
         | 
| 811 882 | 
             
            dependencies = [
         | 
| 812 883 | 
             
             "windows-targets",
         | 
| 813 884 | 
             
            ]
         | 
    
        data/ext/tokenizers/Cargo.toml
    CHANGED
    
    | @@ -1,6 +1,6 @@ | |
| 1 1 | 
             
            [package]
         | 
| 2 2 | 
             
            name = "tokenizers"
         | 
| 3 | 
            -
            version = "0.5. | 
| 3 | 
            +
            version = "0.5.4"
         | 
| 4 4 | 
             
            license = "Apache-2.0"
         | 
| 5 5 | 
             
            authors = ["Andrew Kane <andrew@ankane.org>"]
         | 
| 6 6 | 
             
            edition = "2021"
         | 
| @@ -16,6 +16,6 @@ onig = { version = "6", default-features = false } | |
| 16 16 | 
             
            serde = { version = "1", features = ["rc", "derive"] }
         | 
| 17 17 |  | 
| 18 18 | 
             
            [dependencies.tokenizers]
         | 
| 19 | 
            -
            version = "=0. | 
| 19 | 
            +
            version = "=0.21.0" # also update in from_pretrained.rb
         | 
| 20 20 | 
             
            default-features = false
         | 
| 21 21 | 
             
            features = ["progressbar", "onig", "esaxx_fast"]
         | 
| @@ -3,8 +3,8 @@ use std::sync::{Arc, RwLock}; | |
| 3 3 | 
             
            use crate::pre_tokenizers::from_string;
         | 
| 4 4 | 
             
            use magnus::value::Lazy;
         | 
| 5 5 | 
             
            use magnus::{
         | 
| 6 | 
            -
                data_type_builder, function, method, Class, DataType, DataTypeFunctions, Module, Object, | 
| 7 | 
            -
                Ruby, TypedData,
         | 
| 6 | 
            +
                data_type_builder, function, method, Class, DataType, DataTypeFunctions, Module, Object,
         | 
| 7 | 
            +
                RClass, RModule, Ruby, TypedData,
         | 
| 8 8 | 
             
            };
         | 
| 9 9 | 
             
            use serde::{Deserialize, Serialize};
         | 
| 10 10 | 
             
            use tk::decoders::bpe::BPEDecoder;
         | 
| @@ -16,11 +16,11 @@ use tk::decoders::metaspace::{Metaspace, PrependScheme}; | |
| 16 16 | 
             
            use tk::decoders::strip::Strip;
         | 
| 17 17 | 
             
            use tk::decoders::wordpiece::WordPiece;
         | 
| 18 18 | 
             
            use tk::decoders::DecoderWrapper;
         | 
| 19 | 
            -
            use tk::Decoder;
         | 
| 20 19 | 
             
            use tk::normalizers::replace::Replace;
         | 
| 20 | 
            +
            use tk::Decoder;
         | 
| 21 21 |  | 
| 22 22 | 
             
            use super::utils::*;
         | 
| 23 | 
            -
            use super::{ | 
| 23 | 
            +
            use super::{RbError, RbResult, DECODERS};
         | 
| 24 24 |  | 
| 25 25 | 
             
            #[derive(DataTypeFunctions, Clone, Deserialize, Serialize)]
         | 
| 26 26 | 
             
            pub struct RbDecoder {
         | 
| @@ -106,7 +106,7 @@ impl RbDecoder { | |
| 106 106 | 
             
                }
         | 
| 107 107 |  | 
| 108 108 | 
             
                fn strip_set_content(&self, content: char) {
         | 
| 109 | 
            -
                    setter!(self, Strip, content, content)
         | 
| 109 | 
            +
                    setter!(self, Strip, content, content);
         | 
| 110 110 | 
             
                }
         | 
| 111 111 |  | 
| 112 112 | 
             
                fn strip_start(&self) -> usize {
         | 
| @@ -114,7 +114,7 @@ impl RbDecoder { | |
| 114 114 | 
             
                }
         | 
| 115 115 |  | 
| 116 116 | 
             
                fn strip_set_start(&self, start: usize) {
         | 
| 117 | 
            -
                    setter!(self, Strip, start, start)
         | 
| 117 | 
            +
                    setter!(self, Strip, start, start);
         | 
| 118 118 | 
             
                }
         | 
| 119 119 |  | 
| 120 120 | 
             
                fn strip_stop(&self) -> usize {
         | 
| @@ -122,7 +122,7 @@ impl RbDecoder { | |
| 122 122 | 
             
                }
         | 
| 123 123 |  | 
| 124 124 | 
             
                fn strip_set_stop(&self, stop: usize) {
         | 
| 125 | 
            -
                    setter!(self, Strip, stop, stop)
         | 
| 125 | 
            +
                    setter!(self, Strip, stop, stop);
         | 
| 126 126 | 
             
                }
         | 
| 127 127 |  | 
| 128 128 | 
             
                pub fn metaspace_replacement(&self) -> char {
         | 
| @@ -228,7 +228,9 @@ pub struct RbReplaceDecoder {} | |
| 228 228 |  | 
| 229 229 | 
             
            impl RbReplaceDecoder {
         | 
| 230 230 | 
             
                pub fn new(pattern: RbPattern, content: String) -> RbResult<RbDecoder> {
         | 
| 231 | 
            -
                    Replace::new(pattern, content) | 
| 231 | 
            +
                    Replace::new(pattern, content)
         | 
| 232 | 
            +
                        .map(|v| v.into())
         | 
| 233 | 
            +
                        .map_err(RbError::from)
         | 
| 232 234 | 
             
                }
         | 
| 233 235 | 
             
            }
         | 
| 234 236 |  | 
| @@ -295,7 +297,8 @@ unsafe impl TypedData for RbDecoder { | |
| 295 297 | 
             
                }
         | 
| 296 298 |  | 
| 297 299 | 
             
                fn data_type() -> &'static DataType {
         | 
| 298 | 
            -
                    static DATA_TYPE: DataType = | 
| 300 | 
            +
                    static DATA_TYPE: DataType =
         | 
| 301 | 
            +
                        data_type_builder!(RbDecoder, "Tokenizers::Decoders::Decoder").build();
         | 
| 299 302 | 
             
                    &DATA_TYPE
         | 
| 300 303 | 
             
                }
         | 
| 301 304 |  | 
| @@ -383,18 +386,33 @@ pub fn init_decoders(ruby: &Ruby, module: &RModule) -> RbResult<()> { | |
| 383 386 | 
             
                class.define_method("cleanup=", method!(RbDecoder::ctc_set_cleanup, 1))?;
         | 
| 384 387 | 
             
                class.define_method("pad_token", method!(RbDecoder::ctc_pad_token, 0))?;
         | 
| 385 388 | 
             
                class.define_method("pad_token=", method!(RbDecoder::ctc_set_pad_token, 1))?;
         | 
| 386 | 
            -
                class.define_method( | 
| 387 | 
            -
             | 
| 389 | 
            +
                class.define_method(
         | 
| 390 | 
            +
                    "word_delimiter_token",
         | 
| 391 | 
            +
                    method!(RbDecoder::ctc_word_delimiter_token, 0),
         | 
| 392 | 
            +
                )?;
         | 
| 393 | 
            +
                class.define_method(
         | 
| 394 | 
            +
                    "word_delimiter_token=",
         | 
| 395 | 
            +
                    method!(RbDecoder::ctc_set_word_delimiter_token, 1),
         | 
| 396 | 
            +
                )?;
         | 
| 388 397 |  | 
| 389 398 | 
             
                let class = module.define_class("Fuse", decoder)?;
         | 
| 390 399 | 
             
                class.define_singleton_method("new", function!(RbFuse::new, 0))?;
         | 
| 391 400 |  | 
| 392 401 | 
             
                let class = module.define_class("Metaspace", decoder)?;
         | 
| 393 402 | 
             
                class.define_singleton_method("_new", function!(RbMetaspaceDecoder::new, 3))?;
         | 
| 394 | 
            -
                class.define_method( | 
| 395 | 
            -
             | 
| 403 | 
            +
                class.define_method(
         | 
| 404 | 
            +
                    "prepend_scheme",
         | 
| 405 | 
            +
                    method!(RbDecoder::metaspace_prepend_scheme, 0),
         | 
| 406 | 
            +
                )?;
         | 
| 407 | 
            +
                class.define_method(
         | 
| 408 | 
            +
                    "prepend_scheme=",
         | 
| 409 | 
            +
                    method!(RbDecoder::metaspace_set_prepend_scheme, 1),
         | 
| 410 | 
            +
                )?;
         | 
| 396 411 | 
             
                class.define_method("replacement", method!(RbDecoder::metaspace_replacement, 0))?;
         | 
| 397 | 
            -
                class.define_method( | 
| 412 | 
            +
                class.define_method(
         | 
| 413 | 
            +
                    "replacement=",
         | 
| 414 | 
            +
                    method!(RbDecoder::metaspace_set_replacement, 1),
         | 
| 415 | 
            +
                )?;
         | 
| 398 416 | 
             
                class.define_method("split", method!(RbDecoder::metaspace_split, 0))?;
         | 
| 399 417 | 
             
                class.define_method("split=", method!(RbDecoder::metaspace_set_split, 1))?;
         | 
| 400 418 |  | 
    
        data/ext/tokenizers/src/error.rs
    CHANGED
    
    | @@ -9,9 +9,14 @@ impl RbError { | |
| 9 9 | 
             
                pub fn from(e: Box<dyn std::error::Error + Send + Sync>) -> Error {
         | 
| 10 10 | 
             
                    Error::new(error(), e.to_string())
         | 
| 11 11 | 
             
                }
         | 
| 12 | 
            +
             | 
| 13 | 
            +
                pub fn new_err(s: String) -> Error {
         | 
| 14 | 
            +
                    Error::new(error(), s)
         | 
| 15 | 
            +
                }
         | 
| 12 16 | 
             
            }
         | 
| 13 17 |  | 
| 14 | 
            -
            static ERROR: Lazy<ExceptionClass> = | 
| 18 | 
            +
            static ERROR: Lazy<ExceptionClass> =
         | 
| 19 | 
            +
                Lazy::new(|ruby| ruby.get_inner(&TOKENIZERS).const_get("Error").unwrap());
         | 
| 15 20 |  | 
| 16 21 | 
             
            fn error() -> ExceptionClass {
         | 
| 17 22 | 
             
                Ruby::get().unwrap().get_inner(&ERROR)
         |