tokenizers 0.2.2-x86_64-linux → 0.2.3-x86_64-linux
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/Cargo.lock +1 -1
- data/LICENSE-THIRD-PARTY.txt +26 -26
- data/lib/tokenizers/2.7/tokenizers.so +0 -0
- data/lib/tokenizers/3.0/tokenizers.so +0 -0
- data/lib/tokenizers/3.1/tokenizers.so +0 -0
- data/lib/tokenizers/3.2/tokenizers.so +0 -0
- data/lib/tokenizers/char_bpe_tokenizer.rb +2 -2
- data/lib/tokenizers/encoding.rb +19 -0
- data/lib/tokenizers/tokenizer.rb +12 -0
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +7 -5
- metadata +5 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 51273c1f38d9a2fcbcda6df42b1f8eff718965e84ab49233b6e54bef0825aed4
|
4
|
+
data.tar.gz: 3ce5e8543e7ac32c6302fcdefde06fdebb249be42db234bbbbe8671bb414a69a
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 45946d725ed104ca0001cf323c4bc0146050f051e8f9150aba22f5169640cef2227729ccbd9f705c43ada82ff53703ddaceb52a77259f0096066739c93c13a07
|
7
|
+
data.tar.gz: b056d1024ab43363c3ba3b0ae1cec2874fab31630de474eb0a6694484b88418bad408155121bf2adc0ed0ed4a70064765565e630d3ba648cb23e6b5c34d9aefc
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
## 0.2.3 (2023-01-22)
|
2
|
+
|
3
|
+
- Added `add_special_tokens` option to `encode` method
|
4
|
+
- Added warning about `encode` method including special tokens by default in 0.3.0
|
5
|
+
- Added more methods to `Encoding`
|
6
|
+
- Fixed error with precompiled gem on Mac ARM
|
7
|
+
|
1
8
|
## 0.2.2 (2023-01-15)
|
2
9
|
|
3
10
|
- Added precompiled gem for Linux ARM
|
data/Cargo.lock
CHANGED
data/LICENSE-THIRD-PARTY.txt
CHANGED
@@ -12446,6 +12446,32 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
12446
12446
|
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
12447
12447
|
SOFTWARE.
|
12448
12448
|
|
12449
|
+
================================================================================
|
12450
|
+
rb-sys-env LICENSE-MIT
|
12451
|
+
================================================================================
|
12452
|
+
|
12453
|
+
The MIT License (MIT)
|
12454
|
+
|
12455
|
+
Copyright (c) 2021-2022 Ian Ker-Seymer
|
12456
|
+
|
12457
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
12458
|
+
of this software and associated documentation files (the "Software"), to deal
|
12459
|
+
in the Software without restriction, including without limitation the rights
|
12460
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
12461
|
+
copies of the Software, and to permit persons to whom the Software is
|
12462
|
+
furnished to do so, subject to the following conditions:
|
12463
|
+
|
12464
|
+
The above copyright notice and this permission notice shall be included in all
|
12465
|
+
copies or substantial portions of the Software.
|
12466
|
+
|
12467
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
12468
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
12469
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
12470
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
12471
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
12472
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
12473
|
+
SOFTWARE.
|
12474
|
+
|
12449
12475
|
================================================================================
|
12450
12476
|
rb-sys-env LICENSE-APACHE
|
12451
12477
|
================================================================================
|
@@ -12641,32 +12667,6 @@ rb-sys-env LICENSE-APACHE
|
|
12641
12667
|
See the License for the specific language governing permissions and
|
12642
12668
|
limitations under the License.
|
12643
12669
|
|
12644
|
-
================================================================================
|
12645
|
-
rb-sys-env LICENSE-MIT
|
12646
|
-
================================================================================
|
12647
|
-
|
12648
|
-
The MIT License (MIT)
|
12649
|
-
|
12650
|
-
Copyright (c) 2021-2022 Ian Ker-Seymer
|
12651
|
-
|
12652
|
-
Permission is hereby granted, free of charge, to any person obtaining a copy
|
12653
|
-
of this software and associated documentation files (the "Software"), to deal
|
12654
|
-
in the Software without restriction, including without limitation the rights
|
12655
|
-
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
12656
|
-
copies of the Software, and to permit persons to whom the Software is
|
12657
|
-
furnished to do so, subject to the following conditions:
|
12658
|
-
|
12659
|
-
The above copyright notice and this permission notice shall be included in all
|
12660
|
-
copies or substantial portions of the Software.
|
12661
|
-
|
12662
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
12663
|
-
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
12664
|
-
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
12665
|
-
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
12666
|
-
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
12667
|
-
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
12668
|
-
SOFTWARE.
|
12669
|
-
|
12670
12670
|
================================================================================
|
12671
12671
|
regex LICENSE-APACHE
|
12672
12672
|
================================================================================
|
Binary file
|
Binary file
|
Binary file
|
Binary file
|
data/lib/tokenizers/encoding.rb
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
module Tokenizers
  # Ruby-side wrappers around the underscore-prefixed Encoding methods.
  # Each wrapper only supplies a default of 0 for +sequence_index+ and
  # forwards both arguments unchanged; the underscore-prefixed methods are
  # defined outside this file (presumably by the native extension).
  class Encoding
    # Forwards to +_word_to_tokens+.
    #
    # @param word_index index of the word to look up
    # @param sequence_index index of the sequence, defaults to 0
    # @return whatever +_word_to_tokens+ returns
    def word_to_tokens(word_index, sequence_index = 0)
      _word_to_tokens(word_index, sequence_index)
    end

    # Forwards to +_word_to_chars+.
    #
    # @param word_index index of the word to look up
    # @param sequence_index index of the sequence, defaults to 0
    # @return whatever +_word_to_chars+ returns
    def word_to_chars(word_index, sequence_index = 0)
      _word_to_chars(word_index, sequence_index)
    end

    # Forwards to +_char_to_token+.
    #
    # @param char_pos character position to look up
    # @param sequence_index index of the sequence, defaults to 0
    # @return whatever +_char_to_token+ returns
    def char_to_token(char_pos, sequence_index = 0)
      _char_to_token(char_pos, sequence_index)
    end

    # Forwards to +_char_to_word+.
    #
    # @param char_pos character position to look up
    # @param sequence_index index of the sequence, defaults to 0
    # @return whatever +_char_to_word+ returns
    def char_to_word(char_pos, sequence_index = 0)
      # Forward char_pos: this call previously referenced `word_index`,
      # a name that is not defined in this method.
      _char_to_word(char_pos, sequence_index)
    end
  end
end
|
data/lib/tokenizers/tokenizer.rb
ADDED
@@ -0,0 +1,12 @@
|
|
1
|
+
module Tokenizers
  class Tokenizer
    # Encodes +sequence+ by delegating to +_encode+ (defined outside this file).
    #
    # +add_special_tokens+ uses nil as a "not supplied" marker: when the
    # caller omits it, a warning is printed and false is passed on instead.
    # An explicit true or false is forwarded as given, without a warning.
    #
    # TODO change add_special_tokens default to true in 0.3.0
    def encode(sequence, add_special_tokens: nil)
      # Caller chose a value explicitly: forward it untouched.
      return _encode(sequence, add_special_tokens) unless add_special_tokens.nil?

      warn "[tokenizers] add_special_tokens will default to true in 0.3.0. Pass add_special_tokens: true/false to silence this warning."
      _encode(sequence, false)
    end
  end
end
|
data/lib/tokenizers/version.rb
CHANGED
data/lib/tokenizers.rb
CHANGED
@@ -1,14 +1,16 @@
|
|
1
1
|
# ext
|
2
2
|
begin
|
3
|
-
|
3
|
+
require_relative "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
|
4
4
|
rescue LoadError
|
5
|
-
|
5
|
+
require_relative "tokenizers/tokenizers"
|
6
6
|
end
|
7
7
|
|
8
8
|
# modules
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
require_relative "tokenizers/char_bpe_tokenizer"
|
10
|
+
require_relative "tokenizers/encoding"
|
11
|
+
require_relative "tokenizers/from_pretrained"
|
12
|
+
require_relative "tokenizers/tokenizer"
|
13
|
+
require_relative "tokenizers/version"
|
12
14
|
|
13
15
|
module Tokenizers
|
14
16
|
class Error < StandardError; end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.2.3
|
5
5
|
platform: x86_64-linux
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-01-
|
11
|
+
date: 2023-01-22 00:00:00.000000000 Z
|
12
12
|
dependencies: []
|
13
13
|
description:
|
14
14
|
email: andrew@ankane.org
|
@@ -28,7 +28,9 @@ files:
|
|
28
28
|
- lib/tokenizers/3.1/tokenizers.so
|
29
29
|
- lib/tokenizers/3.2/tokenizers.so
|
30
30
|
- lib/tokenizers/char_bpe_tokenizer.rb
|
31
|
+
- lib/tokenizers/encoding.rb
|
31
32
|
- lib/tokenizers/from_pretrained.rb
|
33
|
+
- lib/tokenizers/tokenizer.rb
|
32
34
|
- lib/tokenizers/version.rb
|
33
35
|
homepage: https://github.com/ankane/tokenizers-ruby
|
34
36
|
licenses:
|
@@ -52,7 +54,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
52
54
|
- !ruby/object:Gem::Version
|
53
55
|
version: '0'
|
54
56
|
requirements: []
|
55
|
-
rubygems_version: 3.4.
|
57
|
+
rubygems_version: 3.4.4
|
56
58
|
signing_key:
|
57
59
|
specification_version: 4
|
58
60
|
summary: Fast state-of-the-art tokenizers for Ruby
|