tokenizers 0.3.3-x86_64-linux → 0.4.0-x86_64-linux

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: 4041eb3f8e79ca25397620670357fce958823e6fb49c0ef7c3968f11b6ce99e1
4
- data.tar.gz: 17952cf748eff3f62165bfedc304753f127cfde805e2be3e8cfd7adcfd5e878c
3
+ metadata.gz: 32e0ffbced9ba6ede5acdd6ff7a1d2a39efaea0445dc7b42d23524a387c039da
4
+ data.tar.gz: 5c896851ce062d1fa4457038549f9742d16e6f1c32fe3b3f715f5989fc06ca69
5
5
  SHA512:
6
- metadata.gz: a3b4c35630874860ddc85b09b75c03f119dfd622dcf7ddd47fd682b547ba338a08656ec52b4bcad409ebdc6c128dbd640f78f639c4a11839ecc28dc4289a0da6
7
- data.tar.gz: 8c90c7e8a1e67ce65c2b545bbb879e4d277da908f6fcbc3ddc029711f36e2ac4d4e73d0bf76d71f333eaceb6a612ee9261708d80c507b0d597d8f924562ec912
6
+ metadata.gz: 0b9eb472f71f49273d6b1ade1e1017f66953c35159530b1e944ba67d6575697f27c5b8bd9534fbe506562541664f193ed9e9efaf7e6937db7fb482e98cd74d79
7
+ data.tar.gz: 375f812105aa5a78688e47c7e65ad69d86e65a9112821cb63b470c51a00afd3281bf69162a47e980d7f777e5cca1ad5d2c66ce16fd064aca5cf19b14115b106c
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
+ ## 0.4.0 (2023-07-20)
2
+
3
+ - Updated Tokenizers to 0.14.0
4
+ - Dropped support for Ruby < 3
5
+
1
6
  ## 0.3.3 (2023-04-09)
2
7
 
3
8
  - Updated Tokenizers to 0.13.3
data/Cargo.lock CHANGED
@@ -11,6 +11,15 @@ dependencies = [
11
11
  "memchr",
12
12
  ]
13
13
 
14
+ [[package]]
15
+ name = "aho-corasick"
16
+ version = "1.0.5"
17
+ source = "registry+https://github.com/rust-lang/crates.io-index"
18
+ checksum = "0c378d78423fdad8089616f827526ee33c19f2fddbd5de1629152c9593ba4783"
19
+ dependencies = [
20
+ "memchr",
21
+ ]
22
+
14
23
  [[package]]
15
24
  name = "autocfg"
16
25
  version = "1.1.0"
@@ -25,9 +34,9 @@ checksum = "9e1b586273c5702936fe7b7d6896644d8be71e6314cfe09d3167c95f712589e8"
25
34
 
26
35
  [[package]]
27
36
  name = "bindgen"
28
- version = "0.60.1"
37
+ version = "0.62.0"
29
38
  source = "registry+https://github.com/rust-lang/crates.io-index"
30
- checksum = "062dddbc1ba4aca46de6338e2bf87771414c335f7b2f2036e8f3e9befebf88e6"
39
+ checksum = "c6720a8b7b2d39dd533285ed438d458f65b31b5c257e6ac7bb3d7e82844dd722"
31
40
  dependencies = [
32
41
  "bitflags",
33
42
  "cexpr",
@@ -40,6 +49,7 @@ dependencies = [
40
49
  "regex",
41
50
  "rustc-hash",
42
51
  "shlex",
52
+ "syn 1.0.109",
43
53
  ]
44
54
 
45
55
  [[package]]
@@ -352,31 +362,32 @@ checksum = "58093314a45e00c77d5c508f76e77c3396afbbc0d01506e7fae47b018bac2b1d"
352
362
 
353
363
  [[package]]
354
364
  name = "magnus"
355
- version = "0.5.3"
365
+ version = "0.6.0"
356
366
  source = "registry+https://github.com/rust-lang/crates.io-index"
357
- checksum = "c8dc14463c2552e753ef562961f486ca76f17a857c121db40e9f3ade3f35ab81"
367
+ checksum = "68e9585bfe236e88e6b10b6d8eb5349bd0e0009f3f9dff8d2e99a82601b33743"
358
368
  dependencies = [
359
369
  "magnus-macros",
360
370
  "rb-sys",
361
371
  "rb-sys-env",
372
+ "seq-macro",
362
373
  ]
363
374
 
364
375
  [[package]]
365
376
  name = "magnus-macros"
366
- version = "0.4.1"
377
+ version = "0.6.0"
367
378
  source = "registry+https://github.com/rust-lang/crates.io-index"
368
- checksum = "6cc17af1d45442c011aa579d727ec6cff8a69aea8a6bbad26736e7112d749bfb"
379
+ checksum = "5968c820e2960565f647819f5928a42d6e874551cab9d88d75e3e0660d7f71e3"
369
380
  dependencies = [
370
381
  "proc-macro2",
371
382
  "quote",
372
- "syn 1.0.109",
383
+ "syn 2.0.13",
373
384
  ]
374
385
 
375
386
  [[package]]
376
387
  name = "memchr"
377
- version = "2.5.0"
388
+ version = "2.6.3"
378
389
  source = "registry+https://github.com/rust-lang/crates.io-index"
379
- checksum = "2dffe52ecf27772e601905b7522cb4ef790d2cc203488bbd0e2fe85fcb74566d"
390
+ checksum = "8f232d6ef707e1956a43342693d2a31e72989554d58299d7a88738cc95b0d35c"
380
391
 
381
392
  [[package]]
382
393
  name = "memoffset"
@@ -575,18 +586,18 @@ dependencies = [
575
586
 
576
587
  [[package]]
577
588
  name = "rb-sys"
578
- version = "0.9.71"
589
+ version = "0.9.79"
579
590
  source = "registry+https://github.com/rust-lang/crates.io-index"
580
- checksum = "156bfedced1e236600bcaad538477097ff2ed5c6b474e411d15b791e1d24c0f1"
591
+ checksum = "939fb78db3e4f26665c1d4c7b91ca66d3578335a19aba552d4a6445811d07072"
581
592
  dependencies = [
582
593
  "rb-sys-build",
583
594
  ]
584
595
 
585
596
  [[package]]
586
597
  name = "rb-sys-build"
587
- version = "0.9.71"
598
+ version = "0.9.79"
588
599
  source = "registry+https://github.com/rust-lang/crates.io-index"
589
- checksum = "5cb2e4a32cbc290b543a74567072ad24b708aff7bb5dde5a68d5690379cd7938"
600
+ checksum = "335a95eb0420d52fa94ef12019df3c2c250c6b19cbb3c60bd05cb7e9c362072c"
590
601
  dependencies = [
591
602
  "bindgen",
592
603
  "lazy_static",
@@ -605,20 +616,32 @@ checksum = "a35802679f07360454b418a5d1735c89716bde01d35b1560fc953c1415a0b3bb"
605
616
 
606
617
  [[package]]
607
618
  name = "regex"
608
- version = "1.7.3"
619
+ version = "1.9.5"
609
620
  source = "registry+https://github.com/rust-lang/crates.io-index"
610
- checksum = "8b1f693b24f6ac912f4893ef08244d70b6067480d2f1a46e950c9691e6749d1d"
621
+ checksum = "697061221ea1b4a94a624f67d0ae2bfe4e22b8a17b6a192afb11046542cc8c47"
611
622
  dependencies = [
612
- "aho-corasick",
623
+ "aho-corasick 1.0.5",
624
+ "memchr",
625
+ "regex-automata",
626
+ "regex-syntax",
627
+ ]
628
+
629
+ [[package]]
630
+ name = "regex-automata"
631
+ version = "0.3.8"
632
+ source = "registry+https://github.com/rust-lang/crates.io-index"
633
+ checksum = "c2f401f4955220693b56f8ec66ee9c78abffd8d1c4f23dc41a23839eb88f0795"
634
+ dependencies = [
635
+ "aho-corasick 1.0.5",
613
636
  "memchr",
614
637
  "regex-syntax",
615
638
  ]
616
639
 
617
640
  [[package]]
618
641
  name = "regex-syntax"
619
- version = "0.6.29"
642
+ version = "0.7.5"
620
643
  source = "registry+https://github.com/rust-lang/crates.io-index"
621
- checksum = "f162c6dd7b008981e4d40210aca20b4bd0f9b60ca9271061b07f78537722f2e1"
644
+ checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da"
622
645
 
623
646
  [[package]]
624
647
  name = "rustc-hash"
@@ -638,6 +661,12 @@ version = "1.1.0"
638
661
  source = "registry+https://github.com/rust-lang/crates.io-index"
639
662
  checksum = "d29ab0c6d3fc0ee92fe66e2d99f700eab17a8d57d1c1d3b748380fb20baa78cd"
640
663
 
664
+ [[package]]
665
+ name = "seq-macro"
666
+ version = "0.3.5"
667
+ source = "registry+https://github.com/rust-lang/crates.io-index"
668
+ checksum = "a3f0bf26fd526d2a95683cd0f87bf103b8539e2ca1ef48ce002d67aad59aa0b4"
669
+
641
670
  [[package]]
642
671
  name = "serde"
643
672
  version = "1.0.159"
@@ -749,21 +778,21 @@ dependencies = [
749
778
 
750
779
  [[package]]
751
780
  name = "tokenizers"
752
- version = "0.3.3"
781
+ version = "0.4.0"
753
782
  dependencies = [
754
783
  "magnus",
755
784
  "onig",
756
785
  "serde",
757
- "tokenizers 0.13.3",
786
+ "tokenizers 0.14.0",
758
787
  ]
759
788
 
760
789
  [[package]]
761
790
  name = "tokenizers"
762
- version = "0.13.3"
791
+ version = "0.14.0"
763
792
  source = "registry+https://github.com/rust-lang/crates.io-index"
764
- checksum = "5cf49017523bf0bc01c9966f172c5f120bbb7b96cccd1708772dd42e767fb9f5"
793
+ checksum = "12b515a66453a4d68f03398054f7204fd0dde6b93d3f20ea90b08025ab49b499"
765
794
  dependencies = [
766
- "aho-corasick",
795
+ "aho-corasick 0.7.20",
767
796
  "derive_builder",
768
797
  "esaxx-rs",
769
798
  "getrandom",
@@ -6,6 +6,10 @@ aho-corasick v0.7.20
6
6
  https://github.com/BurntSushi/aho-corasick
7
7
  Unlicense OR MIT
8
8
 
9
+ aho-corasick v1.0.5
10
+ https://github.com/BurntSushi/aho-corasick
11
+ Unlicense OR MIT
12
+
9
13
  autocfg v1.1.0
10
14
  https://github.com/cuviper/autocfg
11
15
  Apache-2.0 OR MIT
@@ -14,7 +18,7 @@ base64 v0.13.1
14
18
  https://github.com/marshallpierce/rust-base64
15
19
  MIT/Apache-2.0
16
20
 
17
- bindgen v0.60.1
21
+ bindgen v0.62.0
18
22
  https://rust-lang.github.io/rust-bindgen/
19
23
  BSD-3-Clause
20
24
 
@@ -150,17 +154,17 @@ macro_rules_attribute-proc_macro v0.1.3
150
154
  https://github.com/danielhenrymantilla/macro_rules_attribute-rs
151
155
  MIT
152
156
 
153
- magnus v0.5.3
157
+ magnus v0.6.0
154
158
  https://github.com/matsadler/magnus
155
159
  MIT
156
160
 
157
- magnus-macros v0.4.1
161
+ magnus-macros v0.6.0
158
162
  https://github.com/matsadler/magnus
159
163
  MIT
160
164
 
161
- memchr v2.5.0
165
+ memchr v2.6.3
162
166
  https://github.com/BurntSushi/memchr
163
- Unlicense/MIT
167
+ Unlicense OR MIT
164
168
 
165
169
  memoffset v0.8.0
166
170
  https://github.com/Gilnaa/memoffset
@@ -250,11 +254,11 @@ rayon-core v1.11.0
250
254
  https://github.com/rayon-rs/rayon
251
255
  MIT OR Apache-2.0
252
256
 
253
- rb-sys v0.9.71
257
+ rb-sys v0.9.79
254
258
  https://github.com/oxidize-rb/rb-sys
255
259
  MIT OR Apache-2.0
256
260
 
257
- rb-sys-build v0.9.71
261
+ rb-sys-build v0.9.79
258
262
  https://github.com/oxidize-rb/rb-sys
259
263
  MIT OR Apache-2.0
260
264
 
@@ -262,12 +266,16 @@ rb-sys-env v0.1.2
262
266
  https://github.com/oxidize-rb/rb-sys
263
267
  MIT OR Apache-2.0
264
268
 
265
- regex v1.7.3
269
+ regex v1.9.5
266
270
  https://github.com/rust-lang/regex
267
271
  MIT OR Apache-2.0
268
272
 
269
- regex-syntax v0.6.29
270
- https://github.com/rust-lang/regex
273
+ regex-automata v0.3.8
274
+ https://github.com/rust-lang/regex/tree/master/regex-automata
275
+ MIT OR Apache-2.0
276
+
277
+ regex-syntax v0.7.5
278
+ https://github.com/rust-lang/regex/tree/master/regex-syntax
271
279
  MIT OR Apache-2.0
272
280
 
273
281
  rustc-hash v1.1.0
@@ -282,6 +290,10 @@ scopeguard v1.1.0
282
290
  https://github.com/bluss/scopeguard
283
291
  MIT/Apache-2.0
284
292
 
293
+ seq-macro v0.3.5
294
+ https://github.com/dtolnay/seq-macro
295
+ MIT OR Apache-2.0
296
+
285
297
  serde v1.0.159
286
298
  https://serde.rs
287
299
  MIT OR Apache-2.0
@@ -330,7 +342,7 @@ thiserror-impl v1.0.40
330
342
  https://github.com/dtolnay/thiserror
331
343
  MIT OR Apache-2.0
332
344
 
333
- tokenizers v0.13.3
345
+ tokenizers v0.14.0
334
346
  https://github.com/huggingface/tokenizers
335
347
  Apache-2.0
336
348
 
@@ -355,7 +367,70 @@ https://github.com/swgillespie/unicode-categories
355
367
  MIT OR Apache-2.0
356
368
 
357
369
  ================================================================================
358
- aho-corasick COPYING
370
+ aho-corasick v0.7.20 COPYING
371
+ ================================================================================
372
+
373
+ This project is dual-licensed under the Unlicense and MIT licenses.
374
+
375
+ You may use this code under the terms of either license.
376
+
377
+ ================================================================================
378
+ aho-corasick v0.7.20 LICENSE-MIT
379
+ ================================================================================
380
+
381
+ The MIT License (MIT)
382
+
383
+ Copyright (c) 2015 Andrew Gallant
384
+
385
+ Permission is hereby granted, free of charge, to any person obtaining a copy
386
+ of this software and associated documentation files (the "Software"), to deal
387
+ in the Software without restriction, including without limitation the rights
388
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
389
+ copies of the Software, and to permit persons to whom the Software is
390
+ furnished to do so, subject to the following conditions:
391
+
392
+ The above copyright notice and this permission notice shall be included in
393
+ all copies or substantial portions of the Software.
394
+
395
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
396
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
397
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
398
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
399
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
400
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
401
+ THE SOFTWARE.
402
+
403
+ ================================================================================
404
+ aho-corasick v0.7.20 UNLICENSE
405
+ ================================================================================
406
+
407
+ This is free and unencumbered software released into the public domain.
408
+
409
+ Anyone is free to copy, modify, publish, use, compile, sell, or
410
+ distribute this software, either in source code form or as a compiled
411
+ binary, for any purpose, commercial or non-commercial, and by any
412
+ means.
413
+
414
+ In jurisdictions that recognize copyright laws, the author or authors
415
+ of this software dedicate any and all copyright interest in the
416
+ software to the public domain. We make this dedication for the benefit
417
+ of the public at large and to the detriment of our heirs and
418
+ successors. We intend this dedication to be an overt act of
419
+ relinquishment in perpetuity of all present and future rights to this
420
+ software under copyright law.
421
+
422
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
423
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
424
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
425
+ IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
426
+ OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
427
+ ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
428
+ OTHER DEALINGS IN THE SOFTWARE.
429
+
430
+ For more information, please refer to <http://unlicense.org/>
431
+
432
+ ================================================================================
433
+ aho-corasick v1.0.5 COPYING
359
434
  ================================================================================
360
435
 
361
436
  This project is dual-licensed under the Unlicense and MIT licenses.
@@ -363,7 +438,7 @@ This project is dual-licensed under the Unlicense and MIT licenses.
363
438
  You may use this code under the terms of either license.
364
439
 
365
440
  ================================================================================
366
- aho-corasick LICENSE-MIT
441
+ aho-corasick v1.0.5 LICENSE-MIT
367
442
  ================================================================================
368
443
 
369
444
  The MIT License (MIT)
@@ -389,7 +464,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
389
464
  THE SOFTWARE.
390
465
 
391
466
  ================================================================================
392
- aho-corasick UNLICENSE
467
+ aho-corasick v1.0.5 UNLICENSE
393
468
  ================================================================================
394
469
 
395
470
  This is free and unencumbered software released into the public domain.
@@ -7762,7 +7837,7 @@ magnus LICENSE
7762
7837
 
7763
7838
  MIT License
7764
7839
 
7765
- Copyright (c) 2022, 2021 Matthew Sadler
7840
+ Copyright (c) 2023, 2022, 2021 Matthew Sadler
7766
7841
 
7767
7842
  Permission is hereby granted, free of charge, to any person obtaining a copy
7768
7843
  of this software and associated documentation files (the "Software"), to deal
@@ -7788,7 +7863,7 @@ magnus-macros LICENSE
7788
7863
 
7789
7864
  MIT License
7790
7865
 
7791
- Copyright (c) 2022, 2021 Matthew Sadler
7866
+ Copyright (c) 2023, 2022, 2021 Matthew Sadler
7792
7867
 
7793
7868
  Permission is hereby granted, free of charge, to any person obtaining a copy
7794
7869
  of this software and associated documentation files (the "Software"), to deal
@@ -12820,28 +12895,240 @@ IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
12820
12895
  DEALINGS IN THE SOFTWARE.
12821
12896
 
12822
12897
  ================================================================================
12823
- regex src/testdata/LICENSE
12898
+ regex-automata LICENSE-APACHE
12824
12899
  ================================================================================
12825
12900
 
12826
- The following license covers testregex.c and all associated test data.
12901
+ Apache License
12902
+ Version 2.0, January 2004
12903
+ http://www.apache.org/licenses/
12827
12904
 
12828
- Permission is hereby granted, free of charge, to any person obtaining a
12829
- copy of THIS SOFTWARE FILE (the "Software"), to deal in the Software
12830
- without restriction, including without limitation the rights to use,
12831
- copy, modify, merge, publish, distribute, and/or sell copies of the
12832
- Software, and to permit persons to whom the Software is furnished to do
12833
- so, subject to the following disclaimer:
12834
-
12835
- THIS SOFTWARE IS PROVIDED BY AT&T ``AS IS'' AND ANY EXPRESS OR IMPLIED
12836
- WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
12837
- MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
12838
- IN NO EVENT SHALL AT&T BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
12839
- SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
12840
- LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
12841
- DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
12842
- THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
12843
- (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
12844
- OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
12905
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
12906
+
12907
+ 1. Definitions.
12908
+
12909
+ "License" shall mean the terms and conditions for use, reproduction,
12910
+ and distribution as defined by Sections 1 through 9 of this document.
12911
+
12912
+ "Licensor" shall mean the copyright owner or entity authorized by
12913
+ the copyright owner that is granting the License.
12914
+
12915
+ "Legal Entity" shall mean the union of the acting entity and all
12916
+ other entities that control, are controlled by, or are under common
12917
+ control with that entity. For the purposes of this definition,
12918
+ "control" means (i) the power, direct or indirect, to cause the
12919
+ direction or management of such entity, whether by contract or
12920
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
12921
+ outstanding shares, or (iii) beneficial ownership of such entity.
12922
+
12923
+ "You" (or "Your") shall mean an individual or Legal Entity
12924
+ exercising permissions granted by this License.
12925
+
12926
+ "Source" form shall mean the preferred form for making modifications,
12927
+ including but not limited to software source code, documentation
12928
+ source, and configuration files.
12929
+
12930
+ "Object" form shall mean any form resulting from mechanical
12931
+ transformation or translation of a Source form, including but
12932
+ not limited to compiled object code, generated documentation,
12933
+ and conversions to other media types.
12934
+
12935
+ "Work" shall mean the work of authorship, whether in Source or
12936
+ Object form, made available under the License, as indicated by a
12937
+ copyright notice that is included in or attached to the work
12938
+ (an example is provided in the Appendix below).
12939
+
12940
+ "Derivative Works" shall mean any work, whether in Source or Object
12941
+ form, that is based on (or derived from) the Work and for which the
12942
+ editorial revisions, annotations, elaborations, or other modifications
12943
+ represent, as a whole, an original work of authorship. For the purposes
12944
+ of this License, Derivative Works shall not include works that remain
12945
+ separable from, or merely link (or bind by name) to the interfaces of,
12946
+ the Work and Derivative Works thereof.
12947
+
12948
+ "Contribution" shall mean any work of authorship, including
12949
+ the original version of the Work and any modifications or additions
12950
+ to that Work or Derivative Works thereof, that is intentionally
12951
+ submitted to Licensor for inclusion in the Work by the copyright owner
12952
+ or by an individual or Legal Entity authorized to submit on behalf of
12953
+ the copyright owner. For the purposes of this definition, "submitted"
12954
+ means any form of electronic, verbal, or written communication sent
12955
+ to the Licensor or its representatives, including but not limited to
12956
+ communication on electronic mailing lists, source code control systems,
12957
+ and issue tracking systems that are managed by, or on behalf of, the
12958
+ Licensor for the purpose of discussing and improving the Work, but
12959
+ excluding communication that is conspicuously marked or otherwise
12960
+ designated in writing by the copyright owner as "Not a Contribution."
12961
+
12962
+ "Contributor" shall mean Licensor and any individual or Legal Entity
12963
+ on behalf of whom a Contribution has been received by Licensor and
12964
+ subsequently incorporated within the Work.
12965
+
12966
+ 2. Grant of Copyright License. Subject to the terms and conditions of
12967
+ this License, each Contributor hereby grants to You a perpetual,
12968
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
12969
+ copyright license to reproduce, prepare Derivative Works of,
12970
+ publicly display, publicly perform, sublicense, and distribute the
12971
+ Work and such Derivative Works in Source or Object form.
12972
+
12973
+ 3. Grant of Patent License. Subject to the terms and conditions of
12974
+ this License, each Contributor hereby grants to You a perpetual,
12975
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
12976
+ (except as stated in this section) patent license to make, have made,
12977
+ use, offer to sell, sell, import, and otherwise transfer the Work,
12978
+ where such license applies only to those patent claims licensable
12979
+ by such Contributor that are necessarily infringed by their
12980
+ Contribution(s) alone or by combination of their Contribution(s)
12981
+ with the Work to which such Contribution(s) was submitted. If You
12982
+ institute patent litigation against any entity (including a
12983
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
12984
+ or a Contribution incorporated within the Work constitutes direct
12985
+ or contributory patent infringement, then any patent licenses
12986
+ granted to You under this License for that Work shall terminate
12987
+ as of the date such litigation is filed.
12988
+
12989
+ 4. Redistribution. You may reproduce and distribute copies of the
12990
+ Work or Derivative Works thereof in any medium, with or without
12991
+ modifications, and in Source or Object form, provided that You
12992
+ meet the following conditions:
12993
+
12994
+ (a) You must give any other recipients of the Work or
12995
+ Derivative Works a copy of this License; and
12996
+
12997
+ (b) You must cause any modified files to carry prominent notices
12998
+ stating that You changed the files; and
12999
+
13000
+ (c) You must retain, in the Source form of any Derivative Works
13001
+ that You distribute, all copyright, patent, trademark, and
13002
+ attribution notices from the Source form of the Work,
13003
+ excluding those notices that do not pertain to any part of
13004
+ the Derivative Works; and
13005
+
13006
+ (d) If the Work includes a "NOTICE" text file as part of its
13007
+ distribution, then any Derivative Works that You distribute must
13008
+ include a readable copy of the attribution notices contained
13009
+ within such NOTICE file, excluding those notices that do not
13010
+ pertain to any part of the Derivative Works, in at least one
13011
+ of the following places: within a NOTICE text file distributed
13012
+ as part of the Derivative Works; within the Source form or
13013
+ documentation, if provided along with the Derivative Works; or,
13014
+ within a display generated by the Derivative Works, if and
13015
+ wherever such third-party notices normally appear. The contents
13016
+ of the NOTICE file are for informational purposes only and
13017
+ do not modify the License. You may add Your own attribution
13018
+ notices within Derivative Works that You distribute, alongside
13019
+ or as an addendum to the NOTICE text from the Work, provided
13020
+ that such additional attribution notices cannot be construed
13021
+ as modifying the License.
13022
+
13023
+ You may add Your own copyright statement to Your modifications and
13024
+ may provide additional or different license terms and conditions
13025
+ for use, reproduction, or distribution of Your modifications, or
13026
+ for any such Derivative Works as a whole, provided Your use,
13027
+ reproduction, and distribution of the Work otherwise complies with
13028
+ the conditions stated in this License.
13029
+
13030
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
13031
+ any Contribution intentionally submitted for inclusion in the Work
13032
+ by You to the Licensor shall be under the terms and conditions of
13033
+ this License, without any additional terms or conditions.
13034
+ Notwithstanding the above, nothing herein shall supersede or modify
13035
+ the terms of any separate license agreement you may have executed
13036
+ with Licensor regarding such Contributions.
13037
+
13038
+ 6. Trademarks. This License does not grant permission to use the trade
13039
+ names, trademarks, service marks, or product names of the Licensor,
13040
+ except as required for reasonable and customary use in describing the
13041
+ origin of the Work and reproducing the content of the NOTICE file.
13042
+
13043
+ 7. Disclaimer of Warranty. Unless required by applicable law or
13044
+ agreed to in writing, Licensor provides the Work (and each
13045
+ Contributor provides its Contributions) on an "AS IS" BASIS,
13046
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
13047
+ implied, including, without limitation, any warranties or conditions
13048
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
13049
+ PARTICULAR PURPOSE. You are solely responsible for determining the
13050
+ appropriateness of using or redistributing the Work and assume any
13051
+ risks associated with Your exercise of permissions under this License.
13052
+
13053
+ 8. Limitation of Liability. In no event and under no legal theory,
13054
+ whether in tort (including negligence), contract, or otherwise,
13055
+ unless required by applicable law (such as deliberate and grossly
13056
+ negligent acts) or agreed to in writing, shall any Contributor be
13057
+ liable to You for damages, including any direct, indirect, special,
13058
+ incidental, or consequential damages of any character arising as a
13059
+ result of this License or out of the use or inability to use the
13060
+ Work (including but not limited to damages for loss of goodwill,
13061
+ work stoppage, computer failure or malfunction, or any and all
13062
+ other commercial damages or losses), even if such Contributor
13063
+ has been advised of the possibility of such damages.
13064
+
13065
+ 9. Accepting Warranty or Additional Liability. While redistributing
13066
+ the Work or Derivative Works thereof, You may choose to offer,
13067
+ and charge a fee for, acceptance of support, warranty, indemnity,
13068
+ or other liability obligations and/or rights consistent with this
13069
+ License. However, in accepting such obligations, You may act only
13070
+ on Your own behalf and on Your sole responsibility, not on behalf
13071
+ of any other Contributor, and only if You agree to indemnify,
13072
+ defend, and hold each Contributor harmless for any liability
13073
+ incurred by, or claims asserted against, such Contributor by reason
13074
+ of your accepting any such warranty or additional liability.
13075
+
13076
+ END OF TERMS AND CONDITIONS
13077
+
13078
+ APPENDIX: How to apply the Apache License to your work.
13079
+
13080
+ To apply the Apache License to your work, attach the following
13081
+ boilerplate notice, with the fields enclosed by brackets "[]"
13082
+ replaced with your own identifying information. (Don't include
13083
+ the brackets!) The text should be enclosed in the appropriate
13084
+ comment syntax for the file format. We also recommend that a
13085
+ file or class name and description of purpose be included on the
13086
+ same "printed page" as the copyright notice for easier
13087
+ identification within third-party archives.
13088
+
13089
+ Copyright [yyyy] [name of copyright owner]
13090
+
13091
+ Licensed under the Apache License, Version 2.0 (the "License");
13092
+ you may not use this file except in compliance with the License.
13093
+ You may obtain a copy of the License at
13094
+
13095
+ http://www.apache.org/licenses/LICENSE-2.0
13096
+
13097
+ Unless required by applicable law or agreed to in writing, software
13098
+ distributed under the License is distributed on an "AS IS" BASIS,
13099
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13100
+ See the License for the specific language governing permissions and
13101
+ limitations under the License.
13102
+
13103
+ ================================================================================
13104
+ regex-automata LICENSE-MIT
13105
+ ================================================================================
13106
+
13107
+ Copyright (c) 2014 The Rust Project Developers
13108
+
13109
+ Permission is hereby granted, free of charge, to any
13110
+ person obtaining a copy of this software and associated
13111
+ documentation files (the "Software"), to deal in the
13112
+ Software without restriction, including without
13113
+ limitation the rights to use, copy, modify, merge,
13114
+ publish, distribute, sublicense, and/or sell copies of
13115
+ the Software, and to permit persons to whom the Software
13116
+ is furnished to do so, subject to the following
13117
+ conditions:
13118
+
13119
+ The above copyright notice and this permission notice
13120
+ shall be included in all copies or substantial portions
13121
+ of the Software.
13122
+
13123
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
13124
+ ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
13125
+ TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
13126
+ PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
13127
+ SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
13128
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
13129
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
13130
+ IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
13131
+ DEALINGS IN THE SOFTWARE.
12845
13132
 
12846
13133
  ================================================================================
12847
13134
  regex-syntax LICENSE-APACHE
@@ -13820,6 +14107,215 @@ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
13820
14107
  IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
13821
14108
  DEALINGS IN THE SOFTWARE.
13822
14109
 
14110
+ ================================================================================
14111
+ seq-macro LICENSE-APACHE
14112
+ ================================================================================
14113
+
14114
+ Apache License
14115
+ Version 2.0, January 2004
14116
+ http://www.apache.org/licenses/
14117
+
14118
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
14119
+
14120
+ 1. Definitions.
14121
+
14122
+ "License" shall mean the terms and conditions for use, reproduction,
14123
+ and distribution as defined by Sections 1 through 9 of this document.
14124
+
14125
+ "Licensor" shall mean the copyright owner or entity authorized by
14126
+ the copyright owner that is granting the License.
14127
+
14128
+ "Legal Entity" shall mean the union of the acting entity and all
14129
+ other entities that control, are controlled by, or are under common
14130
+ control with that entity. For the purposes of this definition,
14131
+ "control" means (i) the power, direct or indirect, to cause the
14132
+ direction or management of such entity, whether by contract or
14133
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
14134
+ outstanding shares, or (iii) beneficial ownership of such entity.
14135
+
14136
+ "You" (or "Your") shall mean an individual or Legal Entity
14137
+ exercising permissions granted by this License.
14138
+
14139
+ "Source" form shall mean the preferred form for making modifications,
14140
+ including but not limited to software source code, documentation
14141
+ source, and configuration files.
14142
+
14143
+ "Object" form shall mean any form resulting from mechanical
14144
+ transformation or translation of a Source form, including but
14145
+ not limited to compiled object code, generated documentation,
14146
+ and conversions to other media types.
14147
+
14148
+ "Work" shall mean the work of authorship, whether in Source or
14149
+ Object form, made available under the License, as indicated by a
14150
+ copyright notice that is included in or attached to the work
14151
+ (an example is provided in the Appendix below).
14152
+
14153
+ "Derivative Works" shall mean any work, whether in Source or Object
14154
+ form, that is based on (or derived from) the Work and for which the
14155
+ editorial revisions, annotations, elaborations, or other modifications
14156
+ represent, as a whole, an original work of authorship. For the purposes
14157
+ of this License, Derivative Works shall not include works that remain
14158
+ separable from, or merely link (or bind by name) to the interfaces of,
14159
+ the Work and Derivative Works thereof.
14160
+
14161
+ "Contribution" shall mean any work of authorship, including
14162
+ the original version of the Work and any modifications or additions
14163
+ to that Work or Derivative Works thereof, that is intentionally
14164
+ submitted to Licensor for inclusion in the Work by the copyright owner
14165
+ or by an individual or Legal Entity authorized to submit on behalf of
14166
+ the copyright owner. For the purposes of this definition, "submitted"
14167
+ means any form of electronic, verbal, or written communication sent
14168
+ to the Licensor or its representatives, including but not limited to
14169
+ communication on electronic mailing lists, source code control systems,
14170
+ and issue tracking systems that are managed by, or on behalf of, the
14171
+ Licensor for the purpose of discussing and improving the Work, but
14172
+ excluding communication that is conspicuously marked or otherwise
14173
+ designated in writing by the copyright owner as "Not a Contribution."
14174
+
14175
+ "Contributor" shall mean Licensor and any individual or Legal Entity
14176
+ on behalf of whom a Contribution has been received by Licensor and
14177
+ subsequently incorporated within the Work.
14178
+
14179
+ 2. Grant of Copyright License. Subject to the terms and conditions of
14180
+ this License, each Contributor hereby grants to You a perpetual,
14181
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
14182
+ copyright license to reproduce, prepare Derivative Works of,
14183
+ publicly display, publicly perform, sublicense, and distribute the
14184
+ Work and such Derivative Works in Source or Object form.
14185
+
14186
+ 3. Grant of Patent License. Subject to the terms and conditions of
14187
+ this License, each Contributor hereby grants to You a perpetual,
14188
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
14189
+ (except as stated in this section) patent license to make, have made,
14190
+ use, offer to sell, sell, import, and otherwise transfer the Work,
14191
+ where such license applies only to those patent claims licensable
14192
+ by such Contributor that are necessarily infringed by their
14193
+ Contribution(s) alone or by combination of their Contribution(s)
14194
+ with the Work to which such Contribution(s) was submitted. If You
14195
+ institute patent litigation against any entity (including a
14196
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
14197
+ or a Contribution incorporated within the Work constitutes direct
14198
+ or contributory patent infringement, then any patent licenses
14199
+ granted to You under this License for that Work shall terminate
14200
+ as of the date such litigation is filed.
14201
+
14202
+ 4. Redistribution. You may reproduce and distribute copies of the
14203
+ Work or Derivative Works thereof in any medium, with or without
14204
+ modifications, and in Source or Object form, provided that You
14205
+ meet the following conditions:
14206
+
14207
+ (a) You must give any other recipients of the Work or
14208
+ Derivative Works a copy of this License; and
14209
+
14210
+ (b) You must cause any modified files to carry prominent notices
14211
+ stating that You changed the files; and
14212
+
14213
+ (c) You must retain, in the Source form of any Derivative Works
14214
+ that You distribute, all copyright, patent, trademark, and
14215
+ attribution notices from the Source form of the Work,
14216
+ excluding those notices that do not pertain to any part of
14217
+ the Derivative Works; and
14218
+
14219
+ (d) If the Work includes a "NOTICE" text file as part of its
14220
+ distribution, then any Derivative Works that You distribute must
14221
+ include a readable copy of the attribution notices contained
14222
+ within such NOTICE file, excluding those notices that do not
14223
+ pertain to any part of the Derivative Works, in at least one
14224
+ of the following places: within a NOTICE text file distributed
14225
+ as part of the Derivative Works; within the Source form or
14226
+ documentation, if provided along with the Derivative Works; or,
14227
+ within a display generated by the Derivative Works, if and
14228
+ wherever such third-party notices normally appear. The contents
14229
+ of the NOTICE file are for informational purposes only and
14230
+ do not modify the License. You may add Your own attribution
14231
+ notices within Derivative Works that You distribute, alongside
14232
+ or as an addendum to the NOTICE text from the Work, provided
14233
+ that such additional attribution notices cannot be construed
14234
+ as modifying the License.
14235
+
14236
+ You may add Your own copyright statement to Your modifications and
14237
+ may provide additional or different license terms and conditions
14238
+ for use, reproduction, or distribution of Your modifications, or
14239
+ for any such Derivative Works as a whole, provided Your use,
14240
+ reproduction, and distribution of the Work otherwise complies with
14241
+ the conditions stated in this License.
14242
+
14243
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
14244
+ any Contribution intentionally submitted for inclusion in the Work
14245
+ by You to the Licensor shall be under the terms and conditions of
14246
+ this License, without any additional terms or conditions.
14247
+ Notwithstanding the above, nothing herein shall supersede or modify
14248
+ the terms of any separate license agreement you may have executed
14249
+ with Licensor regarding such Contributions.
14250
+
14251
+ 6. Trademarks. This License does not grant permission to use the trade
14252
+ names, trademarks, service marks, or product names of the Licensor,
14253
+ except as required for reasonable and customary use in describing the
14254
+ origin of the Work and reproducing the content of the NOTICE file.
14255
+
14256
+ 7. Disclaimer of Warranty. Unless required by applicable law or
14257
+ agreed to in writing, Licensor provides the Work (and each
14258
+ Contributor provides its Contributions) on an "AS IS" BASIS,
14259
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
14260
+ implied, including, without limitation, any warranties or conditions
14261
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
14262
+ PARTICULAR PURPOSE. You are solely responsible for determining the
14263
+ appropriateness of using or redistributing the Work and assume any
14264
+ risks associated with Your exercise of permissions under this License.
14265
+
14266
+ 8. Limitation of Liability. In no event and under no legal theory,
14267
+ whether in tort (including negligence), contract, or otherwise,
14268
+ unless required by applicable law (such as deliberate and grossly
14269
+ negligent acts) or agreed to in writing, shall any Contributor be
14270
+ liable to You for damages, including any direct, indirect, special,
14271
+ incidental, or consequential damages of any character arising as a
14272
+ result of this License or out of the use or inability to use the
14273
+ Work (including but not limited to damages for loss of goodwill,
14274
+ work stoppage, computer failure or malfunction, or any and all
14275
+ other commercial damages or losses), even if such Contributor
14276
+ has been advised of the possibility of such damages.
14277
+
14278
+ 9. Accepting Warranty or Additional Liability. While redistributing
14279
+ the Work or Derivative Works thereof, You may choose to offer,
14280
+ and charge a fee for, acceptance of support, warranty, indemnity,
14281
+ or other liability obligations and/or rights consistent with this
14282
+ License. However, in accepting such obligations, You may act only
14283
+ on Your own behalf and on Your sole responsibility, not on behalf
14284
+ of any other Contributor, and only if You agree to indemnify,
14285
+ defend, and hold each Contributor harmless for any liability
14286
+ incurred by, or claims asserted against, such Contributor by reason
14287
+ of your accepting any such warranty or additional liability.
14288
+
14289
+ END OF TERMS AND CONDITIONS
14290
+
14291
+ ================================================================================
14292
+ seq-macro LICENSE-MIT
14293
+ ================================================================================
14294
+
14295
+ Permission is hereby granted, free of charge, to any
14296
+ person obtaining a copy of this software and associated
14297
+ documentation files (the "Software"), to deal in the
14298
+ Software without restriction, including without
14299
+ limitation the rights to use, copy, modify, merge,
14300
+ publish, distribute, sublicense, and/or sell copies of
14301
+ the Software, and to permit persons to whom the Software
14302
+ is furnished to do so, subject to the following
14303
+ conditions:
14304
+
14305
+ The above copyright notice and this permission notice
14306
+ shall be included in all copies or substantial portions
14307
+ of the Software.
14308
+
14309
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF
14310
+ ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED
14311
+ TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A
14312
+ PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT
14313
+ SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
14314
+ CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
14315
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR
14316
+ IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
14317
+ DEALINGS IN THE SOFTWARE.
14318
+
13823
14319
  ================================================================================
13824
14320
  serde LICENSE-APACHE
13825
14321
  ================================================================================
Binary file
Binary file
Binary file
@@ -1,7 +1,7 @@
1
1
  module Tokenizers
2
2
  module FromPretrained
3
3
  # for user agent
4
- TOKENIZERS_VERSION = "0.13.3"
4
+ TOKENIZERS_VERSION = "0.14.0"
5
5
 
6
6
  # use Ruby for downloads
7
7
  # this avoids the need to vendor OpenSSL on Linux
@@ -1,8 +1,8 @@
1
1
  module Tokenizers
2
2
  module Models
3
3
  class Unigram
4
- def self.new(vocab: nil, unk_id: nil)
5
- _new(vocab, unk_id)
4
+ def self.new(vocab: nil, unk_id: nil, byte_fallback: nil)
5
+ _new(vocab, unk_id, byte_fallback)
6
6
  end
7
7
  end
8
8
  end
@@ -1,3 +1,3 @@
1
1
  module Tokenizers
2
- VERSION = "0.3.3"
2
+ VERSION = "0.4.0"
3
3
  end
data/lib/tokenizers.rb CHANGED
@@ -1,8 +1,8 @@
1
1
  # ext
2
2
  begin
3
- require_relative "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
3
+ require "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
4
4
  rescue LoadError
5
- require_relative "tokenizers/tokenizers"
5
+ require "tokenizers/tokenizers"
6
6
  end
7
7
 
8
8
  # decoders
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.3.3
4
+ version: 0.4.0
5
5
  platform: x86_64-linux
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2023-04-09 00:00:00.000000000 Z
11
+ date: 2023-09-21 00:00:00.000000000 Z
12
12
  dependencies: []
13
13
  description:
14
14
  email: andrew@ankane.org
@@ -23,7 +23,6 @@ files:
23
23
  - LICENSE.txt
24
24
  - README.md
25
25
  - lib/tokenizers.rb
26
- - lib/tokenizers/2.7/tokenizers.so
27
26
  - lib/tokenizers/3.0/tokenizers.so
28
27
  - lib/tokenizers/3.1/tokenizers.so
29
28
  - lib/tokenizers/3.2/tokenizers.so
@@ -68,7 +67,7 @@ required_ruby_version: !ruby/object:Gem::Requirement
68
67
  requirements:
69
68
  - - ">="
70
69
  - !ruby/object:Gem::Version
71
- version: '2.7'
70
+ version: '3.0'
72
71
  - - "<"
73
72
  - !ruby/object:Gem::Version
74
73
  version: 3.3.dev
Binary file