tokenizers 0.4.2-x86_64-darwin → 0.4.3-x86_64-darwin
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +4 -0
- data/Cargo.lock +19 -13
- data/LICENSE-THIRD-PARTY.txt +246 -6
- data/README.md +40 -4
- data/lib/tokenizers/3.0/tokenizers.bundle +0 -0
- data/lib/tokenizers/3.1/tokenizers.bundle +0 -0
- data/lib/tokenizers/3.2/tokenizers.bundle +0 -0
- data/lib/tokenizers/3.3/tokenizers.bundle +0 -0
- data/lib/tokenizers/version.rb +1 -1
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 647eb16b11c84edccf58a7d1162fce958d824c5fcb741270dc2eff3a1c183ef3
|
4
|
+
data.tar.gz: '018a24e59be71749877bfe636fa08bc698f8a693e07ef4c2d653c1d30781f80b'
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: af6d7d2111da599f2d59361af2558510676b5251d61145ceb4aa72388d9200f99b95ede940c0d5bdbdd117885fdcadc90d89b006f3b9040951deb5b7f0431156
|
7
|
+
data.tar.gz: 5ef4e6be5321a6fffe6aaf8a1899a6584f93f4c9b7ace304362a0e7c3a9dd7aabba853460c87e832366033d63450ef34fec9f17be46e7ea336db69888d1b6faf
|
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
@@ -25,11 +25,11 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
|
|
25
25
|
|
26
26
|
[[package]]
|
27
27
|
name = "bindgen"
|
28
|
-
version = "0.
|
28
|
+
version = "0.69.1"
|
29
29
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
30
|
-
checksum = "
|
30
|
+
checksum = "9ffcebc3849946a7170a05992aac39da343a90676ab392c51a4280981d6379c2"
|
31
31
|
dependencies = [
|
32
|
-
"bitflags",
|
32
|
+
"bitflags 2.4.1",
|
33
33
|
"cexpr",
|
34
34
|
"clang-sys",
|
35
35
|
"lazy_static",
|
@@ -40,7 +40,7 @@ dependencies = [
|
|
40
40
|
"regex",
|
41
41
|
"rustc-hash",
|
42
42
|
"shlex",
|
43
|
-
"syn
|
43
|
+
"syn 2.0.38",
|
44
44
|
]
|
45
45
|
|
46
46
|
[[package]]
|
@@ -49,6 +49,12 @@ version = "1.3.2"
|
|
49
49
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
50
50
|
checksum = "bef38d45163c2f1dde094a7dfd33ccf595c92905c8f8f4fdc18d06fb1037718a"
|
51
51
|
|
52
|
+
[[package]]
|
53
|
+
name = "bitflags"
|
54
|
+
version = "2.4.1"
|
55
|
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
56
|
+
checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
|
57
|
+
|
52
58
|
[[package]]
|
53
59
|
name = "cc"
|
54
60
|
version = "1.0.79"
|
@@ -335,9 +341,9 @@ checksum = "b8dd856d451cc0da70e2ef2ce95a18e39a93b7558bedf10201ad28503f918568"
|
|
335
341
|
|
336
342
|
[[package]]
|
337
343
|
name = "magnus"
|
338
|
-
version = "0.6.
|
344
|
+
version = "0.6.2"
|
339
345
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
340
|
-
checksum = "
|
346
|
+
checksum = "4778544796676e8428e9c622460ebf284bea52d8b10db3aeb449d8b5e61b3a13"
|
341
347
|
dependencies = [
|
342
348
|
"magnus-macros",
|
343
349
|
"rb-sys",
|
@@ -426,7 +432,7 @@ version = "6.4.0"
|
|
426
432
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
427
433
|
checksum = "8c4b31c8722ad9171c6d77d3557db078cab2bd50afcc9d09c8b315c59df8ca4f"
|
428
434
|
dependencies = [
|
429
|
-
"bitflags",
|
435
|
+
"bitflags 1.3.2",
|
430
436
|
"libc",
|
431
437
|
"once_cell",
|
432
438
|
"onig_sys",
|
@@ -553,18 +559,18 @@ dependencies = [
|
|
553
559
|
|
554
560
|
[[package]]
|
555
561
|
name = "rb-sys"
|
556
|
-
version = "0.9.
|
562
|
+
version = "0.9.86"
|
557
563
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
558
|
-
checksum = "
|
564
|
+
checksum = "7285f2a7b92f58ab198e3fd59a71d2861478f9c4642f41e83582385818941697"
|
559
565
|
dependencies = [
|
560
566
|
"rb-sys-build",
|
561
567
|
]
|
562
568
|
|
563
569
|
[[package]]
|
564
570
|
name = "rb-sys-build"
|
565
|
-
version = "0.9.
|
571
|
+
version = "0.9.86"
|
566
572
|
source = "registry+https://github.com/rust-lang/crates.io-index"
|
567
|
-
checksum = "
|
573
|
+
checksum = "71583945f94dabb6c0dfa63f1b71e929c1901e1e288ef3739ab8bed3b7069550"
|
568
574
|
dependencies = [
|
569
575
|
"bindgen",
|
570
576
|
"lazy_static",
|
@@ -572,7 +578,7 @@ dependencies = [
|
|
572
578
|
"quote",
|
573
579
|
"regex",
|
574
580
|
"shell-words",
|
575
|
-
"syn
|
581
|
+
"syn 2.0.38",
|
576
582
|
]
|
577
583
|
|
578
584
|
[[package]]
|
@@ -745,7 +751,7 @@ dependencies = [
|
|
745
751
|
|
746
752
|
[[package]]
|
747
753
|
name = "tokenizers"
|
748
|
-
version = "0.4.2"
|
754
|
+
version = "0.4.3"
|
749
755
|
dependencies = [
|
750
756
|
"magnus",
|
751
757
|
"onig",
|
data/LICENSE-THIRD-PARTY.txt
CHANGED
@@ -14,7 +14,7 @@ base64 v0.13.1
|
|
14
14
|
https://github.com/marshallpierce/rust-base64
|
15
15
|
MIT/Apache-2.0
|
16
16
|
|
17
|
-
bindgen v0.
|
17
|
+
bindgen v0.69.1
|
18
18
|
https://rust-lang.github.io/rust-bindgen/
|
19
19
|
BSD-3-Clause
|
20
20
|
|
@@ -22,6 +22,10 @@ bitflags v1.3.2
|
|
22
22
|
https://github.com/bitflags/bitflags
|
23
23
|
MIT/Apache-2.0
|
24
24
|
|
25
|
+
bitflags v2.4.1
|
26
|
+
https://github.com/bitflags/bitflags
|
27
|
+
MIT OR Apache-2.0
|
28
|
+
|
25
29
|
cc v1.0.79
|
26
30
|
https://github.com/rust-lang/cc-rs
|
27
31
|
MIT OR Apache-2.0
|
@@ -142,7 +146,7 @@ macro_rules_attribute-proc_macro v0.2.0
|
|
142
146
|
https://github.com/danielhenrymantilla/macro_rules_attribute-rs
|
143
147
|
MIT
|
144
148
|
|
145
|
-
magnus v0.6.
|
149
|
+
magnus v0.6.2
|
146
150
|
https://github.com/matsadler/magnus
|
147
151
|
MIT
|
148
152
|
|
@@ -242,11 +246,11 @@ rayon-core v1.12.0
|
|
242
246
|
https://github.com/rayon-rs/rayon
|
243
247
|
MIT OR Apache-2.0
|
244
248
|
|
245
|
-
rb-sys v0.9.
|
249
|
+
rb-sys v0.9.86
|
246
250
|
https://github.com/oxidize-rb/rb-sys
|
247
251
|
MIT OR Apache-2.0
|
248
252
|
|
249
|
-
rb-sys-build v0.9.
|
253
|
+
rb-sys-build v0.9.86
|
250
254
|
https://github.com/oxidize-rb/rb-sys
|
251
255
|
MIT OR Apache-2.0
|
252
256
|
|
@@ -920,7 +924,243 @@ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
|
920
924
|
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
921
925
|
|
922
926
|
================================================================================
|
923
|
-
bitflags LICENSE-APACHE
|
927
|
+
bitflags v1.3.2 LICENSE-APACHE
|
928
|
+
================================================================================
|
929
|
+
|
930
|
+
Apache License
|
931
|
+
Version 2.0, January 2004
|
932
|
+
http://www.apache.org/licenses/
|
933
|
+
|
934
|
+
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
|
935
|
+
|
936
|
+
1. Definitions.
|
937
|
+
|
938
|
+
"License" shall mean the terms and conditions for use, reproduction,
|
939
|
+
and distribution as defined by Sections 1 through 9 of this document.
|
940
|
+
|
941
|
+
"Licensor" shall mean the copyright owner or entity authorized by
|
942
|
+
the copyright owner that is granting the License.
|
943
|
+
|
944
|
+
"Legal Entity" shall mean the union of the acting entity and all
|
945
|
+
other entities that control, are controlled by, or are under common
|
946
|
+
control with that entity. For the purposes of this definition,
|
947
|
+
"control" means (i) the power, direct or indirect, to cause the
|
948
|
+
direction or management of such entity, whether by contract or
|
949
|
+
otherwise, or (ii) ownership of fifty percent (50%) or more of the
|
950
|
+
outstanding shares, or (iii) beneficial ownership of such entity.
|
951
|
+
|
952
|
+
"You" (or "Your") shall mean an individual or Legal Entity
|
953
|
+
exercising permissions granted by this License.
|
954
|
+
|
955
|
+
"Source" form shall mean the preferred form for making modifications,
|
956
|
+
including but not limited to software source code, documentation
|
957
|
+
source, and configuration files.
|
958
|
+
|
959
|
+
"Object" form shall mean any form resulting from mechanical
|
960
|
+
transformation or translation of a Source form, including but
|
961
|
+
not limited to compiled object code, generated documentation,
|
962
|
+
and conversions to other media types.
|
963
|
+
|
964
|
+
"Work" shall mean the work of authorship, whether in Source or
|
965
|
+
Object form, made available under the License, as indicated by a
|
966
|
+
copyright notice that is included in or attached to the work
|
967
|
+
(an example is provided in the Appendix below).
|
968
|
+
|
969
|
+
"Derivative Works" shall mean any work, whether in Source or Object
|
970
|
+
form, that is based on (or derived from) the Work and for which the
|
971
|
+
editorial revisions, annotations, elaborations, or other modifications
|
972
|
+
represent, as a whole, an original work of authorship. For the purposes
|
973
|
+
of this License, Derivative Works shall not include works that remain
|
974
|
+
separable from, or merely link (or bind by name) to the interfaces of,
|
975
|
+
the Work and Derivative Works thereof.
|
976
|
+
|
977
|
+
"Contribution" shall mean any work of authorship, including
|
978
|
+
the original version of the Work and any modifications or additions
|
979
|
+
to that Work or Derivative Works thereof, that is intentionally
|
980
|
+
submitted to Licensor for inclusion in the Work by the copyright owner
|
981
|
+
or by an individual or Legal Entity authorized to submit on behalf of
|
982
|
+
the copyright owner. For the purposes of this definition, "submitted"
|
983
|
+
means any form of electronic, verbal, or written communication sent
|
984
|
+
to the Licensor or its representatives, including but not limited to
|
985
|
+
communication on electronic mailing lists, source code control systems,
|
986
|
+
and issue tracking systems that are managed by, or on behalf of, the
|
987
|
+
Licensor for the purpose of discussing and improving the Work, but
|
988
|
+
excluding communication that is conspicuously marked or otherwise
|
989
|
+
designated in writing by the copyright owner as "Not a Contribution."
|
990
|
+
|
991
|
+
"Contributor" shall mean Licensor and any individual or Legal Entity
|
992
|
+
on behalf of whom a Contribution has been received by Licensor and
|
993
|
+
subsequently incorporated within the Work.
|
994
|
+
|
995
|
+
2. Grant of Copyright License. Subject to the terms and conditions of
|
996
|
+
this License, each Contributor hereby grants to You a perpetual,
|
997
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
998
|
+
copyright license to reproduce, prepare Derivative Works of,
|
999
|
+
publicly display, publicly perform, sublicense, and distribute the
|
1000
|
+
Work and such Derivative Works in Source or Object form.
|
1001
|
+
|
1002
|
+
3. Grant of Patent License. Subject to the terms and conditions of
|
1003
|
+
this License, each Contributor hereby grants to You a perpetual,
|
1004
|
+
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
|
1005
|
+
(except as stated in this section) patent license to make, have made,
|
1006
|
+
use, offer to sell, sell, import, and otherwise transfer the Work,
|
1007
|
+
where such license applies only to those patent claims licensable
|
1008
|
+
by such Contributor that are necessarily infringed by their
|
1009
|
+
Contribution(s) alone or by combination of their Contribution(s)
|
1010
|
+
with the Work to which such Contribution(s) was submitted. If You
|
1011
|
+
institute patent litigation against any entity (including a
|
1012
|
+
cross-claim or counterclaim in a lawsuit) alleging that the Work
|
1013
|
+
or a Contribution incorporated within the Work constitutes direct
|
1014
|
+
or contributory patent infringement, then any patent licenses
|
1015
|
+
granted to You under this License for that Work shall terminate
|
1016
|
+
as of the date such litigation is filed.
|
1017
|
+
|
1018
|
+
4. Redistribution. You may reproduce and distribute copies of the
|
1019
|
+
Work or Derivative Works thereof in any medium, with or without
|
1020
|
+
modifications, and in Source or Object form, provided that You
|
1021
|
+
meet the following conditions:
|
1022
|
+
|
1023
|
+
(a) You must give any other recipients of the Work or
|
1024
|
+
Derivative Works a copy of this License; and
|
1025
|
+
|
1026
|
+
(b) You must cause any modified files to carry prominent notices
|
1027
|
+
stating that You changed the files; and
|
1028
|
+
|
1029
|
+
(c) You must retain, in the Source form of any Derivative Works
|
1030
|
+
that You distribute, all copyright, patent, trademark, and
|
1031
|
+
attribution notices from the Source form of the Work,
|
1032
|
+
excluding those notices that do not pertain to any part of
|
1033
|
+
the Derivative Works; and
|
1034
|
+
|
1035
|
+
(d) If the Work includes a "NOTICE" text file as part of its
|
1036
|
+
distribution, then any Derivative Works that You distribute must
|
1037
|
+
include a readable copy of the attribution notices contained
|
1038
|
+
within such NOTICE file, excluding those notices that do not
|
1039
|
+
pertain to any part of the Derivative Works, in at least one
|
1040
|
+
of the following places: within a NOTICE text file distributed
|
1041
|
+
as part of the Derivative Works; within the Source form or
|
1042
|
+
documentation, if provided along with the Derivative Works; or,
|
1043
|
+
within a display generated by the Derivative Works, if and
|
1044
|
+
wherever such third-party notices normally appear. The contents
|
1045
|
+
of the NOTICE file are for informational purposes only and
|
1046
|
+
do not modify the License. You may add Your own attribution
|
1047
|
+
notices within Derivative Works that You distribute, alongside
|
1048
|
+
or as an addendum to the NOTICE text from the Work, provided
|
1049
|
+
that such additional attribution notices cannot be construed
|
1050
|
+
as modifying the License.
|
1051
|
+
|
1052
|
+
You may add Your own copyright statement to Your modifications and
|
1053
|
+
may provide additional or different license terms and conditions
|
1054
|
+
for use, reproduction, or distribution of Your modifications, or
|
1055
|
+
for any such Derivative Works as a whole, provided Your use,
|
1056
|
+
reproduction, and distribution of the Work otherwise complies with
|
1057
|
+
the conditions stated in this License.
|
1058
|
+
|
1059
|
+
5. Submission of Contributions. Unless You explicitly state otherwise,
|
1060
|
+
any Contribution intentionally submitted for inclusion in the Work
|
1061
|
+
by You to the Licensor shall be under the terms and conditions of
|
1062
|
+
this License, without any additional terms or conditions.
|
1063
|
+
Notwithstanding the above, nothing herein shall supersede or modify
|
1064
|
+
the terms of any separate license agreement you may have executed
|
1065
|
+
with Licensor regarding such Contributions.
|
1066
|
+
|
1067
|
+
6. Trademarks. This License does not grant permission to use the trade
|
1068
|
+
names, trademarks, service marks, or product names of the Licensor,
|
1069
|
+
except as required for reasonable and customary use in describing the
|
1070
|
+
origin of the Work and reproducing the content of the NOTICE file.
|
1071
|
+
|
1072
|
+
7. Disclaimer of Warranty. Unless required by applicable law or
|
1073
|
+
agreed to in writing, Licensor provides the Work (and each
|
1074
|
+
Contributor provides its Contributions) on an "AS IS" BASIS,
|
1075
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
|
1076
|
+
implied, including, without limitation, any warranties or conditions
|
1077
|
+
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
|
1078
|
+
PARTICULAR PURPOSE. You are solely responsible for determining the
|
1079
|
+
appropriateness of using or redistributing the Work and assume any
|
1080
|
+
risks associated with Your exercise of permissions under this License.
|
1081
|
+
|
1082
|
+
8. Limitation of Liability. In no event and under no legal theory,
|
1083
|
+
whether in tort (including negligence), contract, or otherwise,
|
1084
|
+
unless required by applicable law (such as deliberate and grossly
|
1085
|
+
negligent acts) or agreed to in writing, shall any Contributor be
|
1086
|
+
liable to You for damages, including any direct, indirect, special,
|
1087
|
+
incidental, or consequential damages of any character arising as a
|
1088
|
+
result of this License or out of the use or inability to use the
|
1089
|
+
Work (including but not limited to damages for loss of goodwill,
|
1090
|
+
work stoppage, computer failure or malfunction, or any and all
|
1091
|
+
other commercial damages or losses), even if such Contributor
|
1092
|
+
has been advised of the possibility of such damages.
|
1093
|
+
|
1094
|
+
9. Accepting Warranty or Additional Liability. While redistributing
|
1095
|
+
the Work or Derivative Works thereof, You may choose to offer,
|
1096
|
+
and charge a fee for, acceptance of support, warranty, indemnity,
|
1097
|
+
or other liability obligations and/or rights consistent with this
|
1098
|
+
License. However, in accepting such obligations, You may act only
|
1099
|
+
on Your own behalf and on Your sole responsibility, not on behalf
|
1100
|
+
of any other Contributor, and only if You agree to indemnify,
|
1101
|
+
defend, and hold each Contributor harmless for any liability
|
1102
|
+
incurred by, or claims asserted against, such Contributor by reason
|
1103
|
+
of your accepting any such warranty or additional liability.
|
1104
|
+
|
1105
|
+
END OF TERMS AND CONDITIONS
|
1106
|
+
|
1107
|
+
APPENDIX: How to apply the Apache License to your work.
|
1108
|
+
|
1109
|
+
To apply the Apache License to your work, attach the following
|
1110
|
+
boilerplate notice, with the fields enclosed by brackets "[]"
|
1111
|
+
replaced with your own identifying information. (Don't include
|
1112
|
+
the brackets!) The text should be enclosed in the appropriate
|
1113
|
+
comment syntax for the file format. We also recommend that a
|
1114
|
+
file or class name and description of purpose be included on the
|
1115
|
+
same "printed page" as the copyright notice for easier
|
1116
|
+
identification within third-party archives.
|
1117
|
+
|
1118
|
+
Copyright [yyyy] [name of copyright owner]
|
1119
|
+
|
1120
|
+
Licensed under the Apache License, Version 2.0 (the "License");
|
1121
|
+
you may not use this file except in compliance with the License.
|
1122
|
+
You may obtain a copy of the License at
|
1123
|
+
|
1124
|
+
http://www.apache.org/licenses/LICENSE-2.0
|
1125
|
+
|
1126
|
+
Unless required by applicable law or agreed to in writing, software
|
1127
|
+
distributed under the License is distributed on an "AS IS" BASIS,
|
1128
|
+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
1129
|
+
See the License for the specific language governing permissions and
|
1130
|
+
limitations under the License.
|
1131
|
+
|
1132
|
+
================================================================================
|
1133
|
+
bitflags v1.3.2 LICENSE-MIT
|
1134
|
+
================================================================================
|
1135
|
+
|
1136
|
+
Copyright (c) 2014 The Rust Project Developers
|
1137
|
+
|
1138
|
+
Permission is hereby granted, free of charge, to any
|
1139
|
+
person obtaining a copy of this software and associated
|
1140
|
+
documentation files (the "Software"), to deal in the
|
1141
|
+
Software without restriction, including without
|
1142
|
+
limitation the rights to use, copy, modify, merge,
|
1143
|
+
publish, distribute, sublicense, and/or sell copies of
|
1144
|
+
the Software, and to permit persons to whom the Software
|
1145
|
+
is furnished to do so, subject to the following
|
1146
|
+
conditions:
|
1147
|
+
|
1148
|
+
The above copyright notice and this permission notice
|
1149
|
+
shall be included in all copies or substantial portions
|
1150
|
+
of the Software.
|
1151
|
+
|
1152
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
|
1153
|
+
ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
|
1154
|
+
TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
|
1155
|
+
PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
|
1156
|
+
SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
|
1157
|
+
CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
1158
|
+
OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
|
1159
|
+
IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
|
1160
|
+
DEALINGS IN THE SOFTWARE.
|
1161
|
+
|
1162
|
+
================================================================================
|
1163
|
+
bitflags v2.4.1 LICENSE-APACHE
|
924
1164
|
================================================================================
|
925
1165
|
|
926
1166
|
Apache License
|
@@ -1126,7 +1366,7 @@ See the License for the specific language governing permissions and
|
|
1126
1366
|
limitations under the License.
|
1127
1367
|
|
1128
1368
|
================================================================================
|
1129
|
-
bitflags LICENSE-MIT
|
1369
|
+
bitflags v2.4.1 LICENSE-MIT
|
1130
1370
|
================================================================================
|
1131
1371
|
|
1132
1372
|
Copyright (c) 2014 The Rust Project Developers
|
data/README.md
CHANGED
@@ -34,15 +34,51 @@ Decode
|
|
34
34
|
tokenizer.decode(ids)
|
35
35
|
```
|
36
36
|
|
37
|
-
|
37
|
+
## Training
|
38
|
+
|
39
|
+
Create a tokenizer
|
38
40
|
|
39
41
|
```ruby
|
40
|
-
tokenizer = Tokenizers::
|
42
|
+
tokenizer = Tokenizers::Tokenizer.new(Tokenizers::Models::BPE.new(unk_token: "[UNK]"))
|
41
43
|
```
|
42
44
|
|
43
|
-
|
45
|
+
Set the pre-tokenizer
|
46
|
+
|
47
|
+
```ruby
|
48
|
+
tokenizer.pre_tokenizer = Tokenizers::PreTokenizers::Whitespace.new
|
49
|
+
```
|
50
|
+
|
51
|
+
Train the tokenizer ([example data](https://huggingface.co/docs/tokenizers/quicktour#build-a-tokenizer-from-scratch))
|
52
|
+
|
53
|
+
```ruby
|
54
|
+
trainer = Tokenizers::Trainers::BpeTrainer.new(special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])
|
55
|
+
tokenizer.train(["wiki.train.raw", "wiki.valid.raw", "wiki.test.raw"], trainer)
|
56
|
+
```
|
57
|
+
|
58
|
+
Encode
|
59
|
+
|
60
|
+
```ruby
|
61
|
+
output = tokenizer.encode("Hello, y'all! How are you 😁 ?")
|
62
|
+
output.tokens
|
63
|
+
```
|
64
|
+
|
65
|
+
Save the tokenizer to a file
|
66
|
+
|
67
|
+
```ruby
|
68
|
+
tokenizer.save("tokenizer.json")
|
69
|
+
```
|
70
|
+
|
71
|
+
Load a tokenizer from a file
|
72
|
+
|
73
|
+
```ruby
|
74
|
+
tokenizer = Tokenizers.from_file("tokenizer.json")
|
75
|
+
```
|
76
|
+
|
77
|
+
Check out the [Quicktour](https://huggingface.co/docs/tokenizers/quicktour) and equivalent [Ruby code](https://github.com/ankane/tokenizers-ruby/blob/master/test/quicktour_test.rb#L8) for more info
|
78
|
+
|
79
|
+
## API
|
44
80
|
|
45
|
-
|
81
|
+
This library follows the [Tokenizers Python API](https://huggingface.co/docs/tokenizers/index). You can follow Python tutorials and convert the code to Ruby in many cases. Feel free to open an issue if you run into problems.
|
46
82
|
|
47
83
|
## History
|
48
84
|
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/tokenizers/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.4.2
|
4
|
+
version: 0.4.3
|
5
5
|
platform: x86_64-darwin
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2024-01-04 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email: andrew@ankane.org
|
@@ -26,6 +26,7 @@ files:
|
|
26
26
|
- lib/tokenizers/3.0/tokenizers.bundle
|
27
27
|
- lib/tokenizers/3.1/tokenizers.bundle
|
28
28
|
- lib/tokenizers/3.2/tokenizers.bundle
|
29
|
+
- lib/tokenizers/3.3/tokenizers.bundle
|
29
30
|
- lib/tokenizers/char_bpe_tokenizer.rb
|
30
31
|
- lib/tokenizers/decoders/bpe_decoder.rb
|
31
32
|
- lib/tokenizers/decoders/ctc.rb
|
@@ -70,7 +71,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
70
71
|
version: '3.0'
|
71
72
|
- - "<"
|
72
73
|
- !ruby/object:Gem::Version
|
73
|
-
version: 3.
|
74
|
+
version: 3.4.dev
|
74
75
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
75
76
|
requirements:
|
76
77
|
- - ">="
|