tokenizers 0.3.0-x86_64-darwin → 0.3.2-x86_64-darwin
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +12 -4
- data/Cargo.lock +13 -8
- data/LICENSE-THIRD-PARTY.txt +3 -29
- data/lib/tokenizers/2.7/tokenizers.bundle +0 -0
- data/lib/tokenizers/3.0/tokenizers.bundle +0 -0
- data/lib/tokenizers/3.1/tokenizers.bundle +0 -0
- data/lib/tokenizers/3.2/tokenizers.bundle +0 -0
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/version.rb +1 -1
- metadata +2 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: a79dd7bc52f269fdb6dd01849a2e0bbdf916d92e0bcfb39a8f8a04d70d0e8c76
|
4
|
+
data.tar.gz: 1455ff72f0bbdfbe48bad1287086ad6fc7c3b7663a74ff320896762a3b13e05f
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 4439fa4958c377d9b936bfca4af93de721a8b3b7ff6994b612b023b5a95629b3467bafd43f30d93e179c798bf1c6fcf31bcb99b6bde72cfee556fb9f4e74e68c
|
7
|
+
data.tar.gz: af98eb6353bdcc291a0fd2d666525f1640c797b54a59010ecff7a1a7ae9356643980be20b6f6dd61e4ed0589c6e221cdfd12982cf2db9d0a0565ae00a25cbdf1
|
data/CHANGELOG.md
CHANGED
@@ -1,4 +1,12 @@
|
|
1
|
-
## 0.3.
|
1
|
+
## 0.3.2 (2023-03-06)
|
2
|
+
|
3
|
+
- Added precompiled gem for Linux x86-64 MUSL
|
4
|
+
|
5
|
+
## 0.3.1 (2023-02-08)
|
6
|
+
|
7
|
+
- Fixed error with Ruby 2.7
|
8
|
+
|
9
|
+
## 0.3.0 (2023-02-07)
|
2
10
|
|
3
11
|
- Added support for training tokenizers
|
4
12
|
- Added more methods to `Tokenizer`
|
@@ -7,20 +15,20 @@
|
|
7
15
|
- Changed `encode` method to include special tokens by default
|
8
16
|
- Changed how offsets are calculated for strings with multibyte characters
|
9
17
|
|
10
|
-
## 0.2.3 (
|
18
|
+
## 0.2.3 (2023-01-22)
|
11
19
|
|
12
20
|
- Added `add_special_tokens` option to `encode` method
|
13
21
|
- Added warning about `encode` method including special tokens by default in 0.3.0
|
14
22
|
- Added more methods to `Encoding`
|
15
23
|
- Fixed error with precompiled gem on Mac ARM
|
16
24
|
|
17
|
-
## 0.2.2 (
|
25
|
+
## 0.2.2 (2023-01-15)
|
18
26
|
|
19
27
|
- Added precompiled gem for Linux ARM
|
20
28
|
- Added `from_file` method
|
21
29
|
- Fixed error with precompiled gem on Linux x86-64
|
22
30
|
|
23
|
-
## 0.2.1 (
|
31
|
+
## 0.2.1 (2023-01-12)
|
24
32
|
|
25
33
|
- Added support for Ruby 3.2
|
26
34
|
|
data/Cargo.lock
CHANGED
@@ -353,7 +353,8 @@ checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d"
|
|
353
353
|
[[package]]
|
354
354
|
name = "magnus"
|
355
355
|
version = "0.5.0"
|
356
|
-
source = "
|
356
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
357
|
+
checksum = "af37419a942477f606d227d0e6e92f3b68458bfc68fec3bc2629df6a2c1ccdf9"
|
357
358
|
dependencies = [
|
358
359
|
"magnus-macros",
|
359
360
|
"rb-sys",
|
@@ -362,8 +363,9 @@ dependencies = [
|
|
362
363
|
|
363
364
|
[[package]]
|
364
365
|
name = "magnus-macros"
|
365
|
-
version = "0.
|
366
|
-
source = "
|
366
|
+
version = "0.4.0"
|
367
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
368
|
+
checksum = "85aa71c9891b2732ff1157e1860a1ee578459fd25811fd3d72cc6e32b3fbdfea"
|
367
369
|
dependencies = [
|
368
370
|
"proc-macro2",
|
369
371
|
"quote",
|
@@ -552,22 +554,25 @@ dependencies = [
|
|
552
554
|
|
553
555
|
[[package]]
|
554
556
|
name = "rb-sys"
|
555
|
-
version = "0.9.
|
557
|
+
version = "0.9.65"
|
556
558
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
557
|
-
checksum = "
|
559
|
+
checksum = "e8fe617bad8e88fd7e5d6f432e35f09e5f94144dfb8e8ee4adde82fb920dc59b"
|
558
560
|
dependencies = [
|
559
561
|
"rb-sys-build",
|
560
562
|
]
|
561
563
|
|
562
564
|
[[package]]
|
563
565
|
name = "rb-sys-build"
|
564
|
-
version = "0.9.
|
566
|
+
version = "0.9.65"
|
565
567
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
566
|
-
checksum = "
|
568
|
+
checksum = "007e63597f91c711cbb299e60fecbdb6f5ad4a066d6a20c81943893f1584c895"
|
567
569
|
dependencies = [
|
568
570
|
"bindgen",
|
571
|
+
"lazy_static",
|
572
|
+
"quote",
|
569
573
|
"regex",
|
570
574
|
"shell-words",
|
575
|
+
"syn",
|
571
576
|
]
|
572
577
|
|
573
578
|
[[package]]
|
@@ -711,7 +716,7 @@ dependencies = [
|
|
711
716
|
|
712
717
|
[[package]]
|
713
718
|
name = "tokenizers"
|
714
|
-
version = "0.
|
719
|
+
version = "0.3.1"
|
715
720
|
dependencies = [
|
716
721
|
"magnus",
|
717
722
|
"onig",
|
data/LICENSE-THIRD-PARTY.txt
CHANGED
@@ -154,7 +154,7 @@ magnus v0.5.0
|
|
154
154
|
https://github.com/matsadler/magnus
|
155
155
|
MIT
|
156
156
|
|
157
|
-
magnus-macros v0.
|
157
|
+
magnus-macros v0.4.0
|
158
158
|
https://github.com/matsadler/magnus
|
159
159
|
MIT
|
160
160
|
|
@@ -242,11 +242,11 @@ rayon-core v1.10.2
|
|
242
242
|
https://github.com/rayon-rs/rayon
|
243
243
|
MIT OR Apache-2.0
|
244
244
|
|
245
|
-
rb-sys v0.9.
|
245
|
+
rb-sys v0.9.65
|
246
246
|
https://github.com/oxidize-rb/rb-sys
|
247
247
|
MIT OR Apache-2.0
|
248
248
|
|
249
|
-
rb-sys-build v0.9.
|
249
|
+
rb-sys-build v0.9.65
|
250
250
|
https://github.com/oxidize-rb/rb-sys
|
251
251
|
MIT OR Apache-2.0
|
252
252
|
|
@@ -7795,32 +7795,6 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
7795
7795
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
7796
7796
|
SOFTWARE.
|
7797
7797
|
|
7798
|
-
================================================================================
|
7799
|
-
magnus magnus-macros/LICENSE
|
7800
|
-
================================================================================
|
7801
|
-
|
7802
|
-
MIT License
|
7803
|
-
|
7804
|
-
Copyright (c) 2022, 2021 Matthew Sadler
|
7805
|
-
|
7806
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7807
|
-
of this software and associated documentation files (the "Software"), to deal
|
7808
|
-
in the Software without restriction, including without limitation the rights
|
7809
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
7810
|
-
copies of the Software, and to permit persons to whom the Software is
|
7811
|
-
furnished to do so, subject to the following conditions:
|
7812
|
-
|
7813
|
-
The above copyright notice and this permission notice shall be included in all
|
7814
|
-
copies or substantial portions of the Software.
|
7815
|
-
|
7816
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
7817
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
7818
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
7819
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
7820
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
7821
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
7822
|
-
SOFTWARE.
|
7823
|
-
|
7824
7798
|
================================================================================
|
7825
7799
|
magnus-macros LICENSE
|
7826
7800
|
================================================================================
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
@@ -44,7 +44,7 @@ module Tokenizers
|
|
44
44
|
def cached_path(cache_dir, url, options)
|
45
45
|
fsum = Digest::SHA256.hexdigest(url)
|
46
46
|
meta_paths = Dir[File.join(cache_dir, "#{fsum}.*.meta")]
|
47
|
-
meta = meta_paths.map { |f| JSON.
|
47
|
+
meta = meta_paths.map { |f| JSON.parse(File.read(f)) }.max_by { |m| m["creation_time"] }
|
48
48
|
etag = meta["etag"] if meta
|
49
49
|
|
50
50
|
if etag
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.
|
4
|
+
version: 0.3.2
|
5
5
|
platform: x86_64-darwin
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-03-07 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email: andrew@ankane.org
|