tokenizers 0.2.3 → 0.3.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/Cargo.lock +32 -73
- data/README.md +4 -0
- data/ext/tokenizers/Cargo.toml +3 -1
- data/ext/tokenizers/src/decoders.rs +275 -6
- data/ext/tokenizers/src/encoding.rs +3 -2
- data/ext/tokenizers/src/error.rs +2 -2
- data/ext/tokenizers/src/lib.rs +64 -17
- data/ext/tokenizers/src/models.rs +372 -11
- data/ext/tokenizers/src/normalizers.rs +435 -7
- data/ext/tokenizers/src/pre_tokenizers.rs +470 -6
- data/ext/tokenizers/src/processors.rs +210 -0
- data/ext/tokenizers/src/tokenizer.rs +437 -23
- data/ext/tokenizers/src/trainers.rs +749 -0
- data/ext/tokenizers/src/utils/mod.rs +5 -0
- data/ext/tokenizers/src/utils/normalization.rs +85 -0
- data/ext/tokenizers/src/utils/regex.rs +22 -0
- data/lib/tokenizers/char_bpe_tokenizer.rb +9 -6
- data/lib/tokenizers/decoders/bpe_decoder.rb +9 -0
- data/lib/tokenizers/decoders/ctc.rb +9 -0
- data/lib/tokenizers/decoders/metaspace.rb +9 -0
- data/lib/tokenizers/decoders/word_piece.rb +9 -0
- data/lib/tokenizers/from_pretrained.rb +1 -1
- data/lib/tokenizers/models/bpe.rb +9 -0
- data/lib/tokenizers/models/unigram.rb +9 -0
- data/lib/tokenizers/models/word_level.rb +13 -0
- data/lib/tokenizers/models/word_piece.rb +9 -0
- data/lib/tokenizers/normalizers/bert_normalizer.rb +9 -0
- data/lib/tokenizers/normalizers/strip.rb +9 -0
- data/lib/tokenizers/pre_tokenizers/byte_level.rb +9 -0
- data/lib/tokenizers/pre_tokenizers/digits.rb +9 -0
- data/lib/tokenizers/pre_tokenizers/metaspace.rb +9 -0
- data/lib/tokenizers/pre_tokenizers/punctuation.rb +9 -0
- data/lib/tokenizers/pre_tokenizers/split.rb +9 -0
- data/lib/tokenizers/processors/byte_level.rb +9 -0
- data/lib/tokenizers/processors/roberta_processing.rb +9 -0
- data/lib/tokenizers/processors/template_processing.rb +9 -0
- data/lib/tokenizers/tokenizer.rb +40 -7
- data/lib/tokenizers/trainers/bpe_trainer.rb +9 -0
- data/lib/tokenizers/trainers/unigram_trainer.rb +26 -0
- data/lib/tokenizers/trainers/word_level_trainer.rb +9 -0
- data/lib/tokenizers/trainers/word_piece_trainer.rb +26 -0
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +42 -2
- metadata +30 -3
data/ext/tokenizers/src/utils/normalization.rs ADDED
@@ -0,0 +1,85 @@
+use super::regex::{regex, RbRegex};
+use crate::RbResult;
+use magnus::{exception, Error, TryConvert, Value};
+use tk::normalizer::SplitDelimiterBehavior;
+use tk::pattern::Pattern;
+
+#[derive(Clone)]
+pub enum RbPattern<'p> {
+    Str(String),
+    Regex(&'p RbRegex),
+}
+
+impl TryConvert for RbPattern<'_> {
+    fn try_convert(obj: Value) -> RbResult<Self> {
+        if obj.is_kind_of(regex()) {
+            Ok(RbPattern::Regex(obj.try_convert()?))
+        } else {
+            Ok(RbPattern::Str(obj.try_convert()?))
+        }
+    }
+}
+
+impl Pattern for RbPattern<'_> {
+    fn find_matches(&self, inside: &str) -> tk::Result<Vec<(tk::Offsets, bool)>> {
+        match self {
+            RbPattern::Str(s) => {
+                let mut chars = s.chars();
+                if let (Some(c), None) = (chars.next(), chars.next()) {
+                    c.find_matches(inside)
+                } else {
+                    s.find_matches(inside)
+                }
+            }
+            RbPattern::Regex(_r) => {
+                todo!()
+            }
+        }
+    }
+}
+
+impl From<RbPattern<'_>> for tk::normalizers::replace::ReplacePattern {
+    fn from(pattern: RbPattern<'_>) -> Self {
+        match pattern {
+            RbPattern::Str(s) => Self::String(s),
+            RbPattern::Regex(_r) => todo!(),
+        }
+    }
+}
+
+impl From<RbPattern<'_>> for tk::pre_tokenizers::split::SplitPattern {
+    fn from(pattern: RbPattern<'_>) -> Self {
+        match pattern {
+            RbPattern::Str(s) => Self::String(s),
+            RbPattern::Regex(_r) => todo!(),
+        }
+    }
+}
+
+#[derive(Clone)]
+pub struct RbSplitDelimiterBehavior(pub SplitDelimiterBehavior);
+
+impl TryConvert for RbSplitDelimiterBehavior {
+    fn try_convert(obj: Value) -> RbResult<Self> {
+        let s = obj.try_convert::<String>()?;
+
+        Ok(Self(match s.as_str() {
+            "removed" => Ok(SplitDelimiterBehavior::Removed),
+            "isolated" => Ok(SplitDelimiterBehavior::Isolated),
+            "merged_with_previous" => Ok(SplitDelimiterBehavior::MergedWithPrevious),
+            "merged_with_next" => Ok(SplitDelimiterBehavior::MergedWithNext),
+            "contiguous" => Ok(SplitDelimiterBehavior::Contiguous),
+            _ => Err(Error::new(
+                exception::arg_error(),
+                "Wrong value for SplitDelimiterBehavior, expected one of: \
+                `removed, isolated, merged_with_previous, merged_with_next, contiguous`",
+            )),
+        }?))
+    }
+}
+
+impl From<RbSplitDelimiterBehavior> for SplitDelimiterBehavior {
+    fn from(v: RbSplitDelimiterBehavior) -> Self {
+        v.0
+    }
+}
data/ext/tokenizers/src/utils/regex.rs ADDED
@@ -0,0 +1,22 @@
+use onig::Regex;
+use magnus::{exception, memoize, Error, Module, RClass};
+use crate::{module, RbResult};
+
+#[magnus::wrap(class = "Tokenizers::Regex")]
+pub struct RbRegex {
+    pub inner: Regex,
+    pub pattern: String,
+}
+
+impl RbRegex {
+    pub fn new(s: String) -> RbResult<Self> {
+        Ok(Self {
+            inner: Regex::new(&s).map_err(|e| Error::new(exception::runtime_error(), e.description().to_owned()))?,
+            pattern: s,
+        })
+    }
+}
+
+pub fn regex() -> RClass {
+    *memoize!(RClass: module().const_get("Regex").unwrap())
+}
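RbRegex wraps an onig::Regex and is exposed under the Tokenizers::Regex class named in the magnus wrap attribute. A hedged sketch, assuming the Ruby constructor is bound to RbRegex::new elsewhere in the extension (not shown in this diff):

  # Hypothetical: an invalid pattern surfaces the onig error as a RuntimeError.
  pattern = Tokenizers::Regex.new("\\d+")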
data/lib/tokenizers/char_bpe_tokenizer.rb CHANGED
@@ -1,11 +1,14 @@
 module Tokenizers
   class CharBPETokenizer
-    def initialize(vocab, merges)
-      @tokenizer =
-
-
-
-      @tokenizer.
+    def initialize(vocab, merges, unk_token: "<unk>", suffix: "</w>")
+      @tokenizer =
+        Tokenizer.new(
+          Models::BPE._from_file(vocab, merges, {unk_token: unk_token, end_of_word_suffix: suffix})
+        )
+      @tokenizer.add_special_tokens([unk_token])
+      @tokenizer.normalizer = Normalizers::BertNormalizer.new
+      @tokenizer.pre_tokenizer = PreTokenizers::BertPreTokenizer.new
+      @tokenizer.decoder = Decoders::BPEDecoder.new
     end

     def encode(text, **options)
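A short usage sketch of the new keyword arguments, with placeholder vocab/merges paths:

  # "vocab.json" and "merges.txt" are placeholders for an existing CharBPE vocabulary.
  tokenizer = Tokenizers::CharBPETokenizer.new(
    "vocab.json", "merges.txt",
    unk_token: "<unk>",  # default from the signature above
    suffix: "</w>"       # forwarded to the BPE model as end_of_word_suffix
  )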
data/lib/tokenizers/tokenizer.rb CHANGED
@@ -1,12 +1,45 @@
 module Tokenizers
   class Tokenizer
-
-
-
-
-
-
-
+    extend FromPretrained
+
+    def to_s(pretty: false)
+      _to_s(pretty)
+    end
+
+    def save(path, pretty: false)
+      _save(path, pretty)
+    end
+
+    def encode(sequence, pair = nil, is_pretokenized: false, add_special_tokens: true)
+      _encode(sequence, pair, is_pretokenized, add_special_tokens)
+    end
+
+    def encode_batch(input, is_pretokenized: false, add_special_tokens: true)
+      _encode_batch(input, is_pretokenized, add_special_tokens)
+    end
+
+    def decode(ids, skip_special_tokens: true)
+      _decode(ids, skip_special_tokens)
+    end
+
+    def decode_batch(sequences, skip_special_tokens: true)
+      _decode_batch(sequences, skip_special_tokens)
+    end
+
+    def enable_padding(**options)
+      _enable_padding(options)
+    end
+
+    def enable_truncation(max_length, **options)
+      _enable_truncation(max_length, options)
+    end
+
+    def vocab(with_added_tokens: true)
+      _vocab(with_added_tokens)
+    end
+
+    def vocab_size(with_added_tokens: true)
+      _vocab_size(with_added_tokens)
     end
   end
 end
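These wrappers translate Ruby keyword arguments into the positional and hash arguments the native underscore-prefixed methods expect. A hedged sketch of the resulting API (Encoding#ids and the truncation/padding option names are assumptions based on the upstream tokenizers API, not this hunk):

  encoding = tokenizer.encode("Hello world", add_special_tokens: true)
  batch    = tokenizer.encode_batch(["first text", "second text"])
  text     = tokenizer.decode(encoding.ids, skip_special_tokens: true)  # assumes Encoding#ids
  tokenizer.enable_truncation(128)   # extra keyword options are passed through as a hash
  tokenizer.save("tokenizer.json")   # pass pretty: true for indented JSON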
data/lib/tokenizers/trainers/unigram_trainer.rb ADDED
@@ -0,0 +1,26 @@
+module Tokenizers
+  module Trainers
+    class UnigramTrainer
+      def self.new(vocab_size: 8000,
+                   show_progress: true,
+                   special_tokens: [],
+                   initial_alphabet: [],
+                   shrinking_factor: 0.75,
+                   unk_token: nil,
+                   max_piece_length: 16,
+                   n_sub_iterations: 2)
+
+        _new({
+          vocab_size: vocab_size,
+          show_progress: show_progress,
+          special_tokens: special_tokens,
+          initial_alphabet: initial_alphabet,
+          shrinking_factor: shrinking_factor,
+          unk_token: unk_token,
+          max_piece_length: max_piece_length,
+          n_sub_iterations: n_sub_iterations
+        })
+      end
+    end
+  end
+end
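Every argument has a default, so only the options that differ need to be given; a minimal sketch based on the keyword list above:

  trainer = Tokenizers::Trainers::UnigramTrainer.new(
    vocab_size: 16000,
    special_tokens: ["<unk>", "<pad>"],
    unk_token: "<unk>"
  )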
data/lib/tokenizers/trainers/word_piece_trainer.rb ADDED
@@ -0,0 +1,26 @@
+module Tokenizers
+  module Trainers
+    class WordPieceTrainer
+      def self.new(vocab_size: 30000,
+                   min_frequency: 0,
+                   show_progress: true,
+                   special_tokens: [],
+                   limit_alphabet: nil,
+                   initial_alphabet: [],
+                   continuing_subword_prefix: "##",
+                   end_of_word_suffix: nil)
+
+        _new({
+          vocab_size: vocab_size,
+          min_frequency: min_frequency,
+          show_progress: show_progress,
+          special_tokens: special_tokens,
+          limit_alphabet: limit_alphabet,
+          initial_alphabet: initial_alphabet,
+          continuing_subword_prefix: continuing_subword_prefix,
+          end_of_word_suffix: end_of_word_suffix
+        })
+      end
+    end
+  end
+end
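The WordPiece trainer follows the same pattern, collecting its keyword arguments into a hash for the native _new; a short sketch:

  trainer = Tokenizers::Trainers::WordPieceTrainer.new(
    vocab_size: 20000,
    special_tokens: ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]
  )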
data/lib/tokenizers/version.rb CHANGED
data/lib/tokenizers.rb CHANGED
@@ -5,7 +5,41 @@ rescue LoadError
   require_relative "tokenizers/tokenizers"
 end

-#
+# decoders
+require_relative "tokenizers/decoders/bpe_decoder"
+require_relative "tokenizers/decoders/ctc"
+require_relative "tokenizers/decoders/metaspace"
+require_relative "tokenizers/decoders/word_piece"
+
+# models
+require_relative "tokenizers/models/bpe"
+require_relative "tokenizers/models/word_level"
+require_relative "tokenizers/models/word_piece"
+require_relative "tokenizers/models/unigram"
+
+# normalizers
+require_relative "tokenizers/normalizers/bert_normalizer"
+require_relative "tokenizers/normalizers/strip"
+
+# pre-tokenizers
+require_relative "tokenizers/pre_tokenizers/byte_level"
+require_relative "tokenizers/pre_tokenizers/digits"
+require_relative "tokenizers/pre_tokenizers/metaspace"
+require_relative "tokenizers/pre_tokenizers/punctuation"
+require_relative "tokenizers/pre_tokenizers/split"
+
+# processors
+require_relative "tokenizers/processors/byte_level"
+require_relative "tokenizers/processors/roberta_processing"
+require_relative "tokenizers/processors/template_processing"
+
+# trainers
+require_relative "tokenizers/trainers/bpe_trainer"
+require_relative "tokenizers/trainers/unigram_trainer"
+require_relative "tokenizers/trainers/word_level_trainer"
+require_relative "tokenizers/trainers/word_piece_trainer"
+
+# other
 require_relative "tokenizers/char_bpe_tokenizer"
 require_relative "tokenizers/encoding"
 require_relative "tokenizers/from_pretrained"
@@ -15,5 +49,11 @@ require_relative "tokenizers/version"
 module Tokenizers
   class Error < StandardError; end

-
+  def self.from_pretrained(...)
+    Tokenizer.from_pretrained(...)
+  end
+
+  def self.from_file(...)
+    Tokenizer.from_file(...)
+  end
 end
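The module-level helpers simply forward to Tokenizer, so both spellings work; a hedged example with a placeholder model name and file path:

  tokenizer = Tokenizers.from_pretrained("bert-base-cased")  # delegates to Tokenizer.from_pretrained
  # tokenizer = Tokenizers.from_file("tokenizer.json")       # delegates to Tokenizer.from_file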
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.
+  version: 0.3.0
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2023-
+date: 2023-02-08 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -45,12 +45,39 @@ files:
 - ext/tokenizers/src/models.rs
 - ext/tokenizers/src/normalizers.rs
 - ext/tokenizers/src/pre_tokenizers.rs
+- ext/tokenizers/src/processors.rs
 - ext/tokenizers/src/tokenizer.rs
+- ext/tokenizers/src/trainers.rs
+- ext/tokenizers/src/utils/mod.rs
+- ext/tokenizers/src/utils/normalization.rs
+- ext/tokenizers/src/utils/regex.rs
 - lib/tokenizers.rb
 - lib/tokenizers/char_bpe_tokenizer.rb
+- lib/tokenizers/decoders/bpe_decoder.rb
+- lib/tokenizers/decoders/ctc.rb
+- lib/tokenizers/decoders/metaspace.rb
+- lib/tokenizers/decoders/word_piece.rb
 - lib/tokenizers/encoding.rb
 - lib/tokenizers/from_pretrained.rb
+- lib/tokenizers/models/bpe.rb
+- lib/tokenizers/models/unigram.rb
+- lib/tokenizers/models/word_level.rb
+- lib/tokenizers/models/word_piece.rb
+- lib/tokenizers/normalizers/bert_normalizer.rb
+- lib/tokenizers/normalizers/strip.rb
+- lib/tokenizers/pre_tokenizers/byte_level.rb
+- lib/tokenizers/pre_tokenizers/digits.rb
+- lib/tokenizers/pre_tokenizers/metaspace.rb
+- lib/tokenizers/pre_tokenizers/punctuation.rb
+- lib/tokenizers/pre_tokenizers/split.rb
+- lib/tokenizers/processors/byte_level.rb
+- lib/tokenizers/processors/roberta_processing.rb
+- lib/tokenizers/processors/template_processing.rb
 - lib/tokenizers/tokenizer.rb
+- lib/tokenizers/trainers/bpe_trainer.rb
+- lib/tokenizers/trainers/unigram_trainer.rb
+- lib/tokenizers/trainers/word_level_trainer.rb
+- lib/tokenizers/trainers/word_piece_trainer.rb
 - lib/tokenizers/version.rb
 homepage: https://github.com/ankane/tokenizers-ruby
 licenses:
@@ -71,7 +98,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
   - !ruby/object:Gem::Version
     version: '0'
 requirements: []
-rubygems_version: 3.4.
+rubygems_version: 3.4.6
 signing_key:
 specification_version: 4
 summary: Fast state-of-the-art tokenizers for Ruby