tokenizers 0.3.2 → 0.4.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/Cargo.lock +160 -96
- data/ext/tokenizers/Cargo.toml +6 -6
- data/ext/tokenizers/src/decoders.rs +149 -39
- data/ext/tokenizers/src/error.rs +5 -3
- data/ext/tokenizers/src/lib.rs +21 -33
- data/ext/tokenizers/src/models.rs +71 -50
- data/ext/tokenizers/src/normalizers.rs +113 -74
- data/ext/tokenizers/src/pre_tokenizers.rs +85 -73
- data/ext/tokenizers/src/processors.rs +43 -38
- data/ext/tokenizers/src/tokenizer.rs +35 -28
- data/ext/tokenizers/src/trainers.rs +82 -80
- data/ext/tokenizers/src/utils/normalization.rs +4 -3
- data/ext/tokenizers/src/utils/regex.rs +5 -3
- data/lib/tokenizers/decoders/strip.rb +9 -0
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/models/unigram.rb +2 -2
- data/lib/tokenizers/normalizers/prepend.rb +9 -0
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +4 -2
- metadata +6 -4
data/lib/tokenizers/version.rb
CHANGED
data/lib/tokenizers.rb
CHANGED
@@ -1,14 +1,15 @@
|
|
1
1
|
# ext
|
2
2
|
begin
|
3
|
-
|
3
|
+
require "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
|
4
4
|
rescue LoadError
|
5
|
-
|
5
|
+
require "tokenizers/tokenizers"
|
6
6
|
end
|
7
7
|
|
8
8
|
# decoders
|
9
9
|
require_relative "tokenizers/decoders/bpe_decoder"
|
10
10
|
require_relative "tokenizers/decoders/ctc"
|
11
11
|
require_relative "tokenizers/decoders/metaspace"
|
12
|
+
require_relative "tokenizers/decoders/strip"
|
12
13
|
require_relative "tokenizers/decoders/word_piece"
|
13
14
|
|
14
15
|
# models
|
@@ -19,6 +20,7 @@ require_relative "tokenizers/models/unigram"
|
|
19
20
|
|
20
21
|
# normalizers
|
21
22
|
require_relative "tokenizers/normalizers/bert_normalizer"
|
23
|
+
require_relative "tokenizers/normalizers/prepend"
|
22
24
|
require_relative "tokenizers/normalizers/strip"
|
23
25
|
|
24
26
|
# pre-tokenizers
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.3.2
|
4
|
+
version: 0.4.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-
|
11
|
+
date: 2023-09-21 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -56,6 +56,7 @@ files:
|
|
56
56
|
- lib/tokenizers/decoders/bpe_decoder.rb
|
57
57
|
- lib/tokenizers/decoders/ctc.rb
|
58
58
|
- lib/tokenizers/decoders/metaspace.rb
|
59
|
+
- lib/tokenizers/decoders/strip.rb
|
59
60
|
- lib/tokenizers/decoders/word_piece.rb
|
60
61
|
- lib/tokenizers/encoding.rb
|
61
62
|
- lib/tokenizers/from_pretrained.rb
|
@@ -64,6 +65,7 @@ files:
|
|
64
65
|
- lib/tokenizers/models/word_level.rb
|
65
66
|
- lib/tokenizers/models/word_piece.rb
|
66
67
|
- lib/tokenizers/normalizers/bert_normalizer.rb
|
68
|
+
- lib/tokenizers/normalizers/prepend.rb
|
67
69
|
- lib/tokenizers/normalizers/strip.rb
|
68
70
|
- lib/tokenizers/pre_tokenizers/byte_level.rb
|
69
71
|
- lib/tokenizers/pre_tokenizers/digits.rb
|
@@ -91,14 +93,14 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
91
93
|
requirements:
|
92
94
|
- - ">="
|
93
95
|
- !ruby/object:Gem::Version
|
94
|
-
version: '
|
96
|
+
version: '3'
|
95
97
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
96
98
|
requirements:
|
97
99
|
- - ">="
|
98
100
|
- !ruby/object:Gem::Version
|
99
101
|
version: '0'
|
100
102
|
requirements: []
|
101
|
-
rubygems_version: 3.4.
|
103
|
+
rubygems_version: 3.4.10
|
102
104
|
signing_key:
|
103
105
|
specification_version: 4
|
104
106
|
summary: Fast state-of-the-art tokenizers for Ruby
|