tokenizers 0.2.2 → 0.2.3
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/Cargo.lock +1 -1
- data/ext/tokenizers/Cargo.toml +1 -1
- data/ext/tokenizers/src/encoding.rs +77 -3
- data/ext/tokenizers/src/lib.rs +25 -1
- data/ext/tokenizers/src/tokenizer.rs +16 -2
- data/lib/tokenizers/char_bpe_tokenizer.rb +2 -2
- data/lib/tokenizers/encoding.rb +19 -0
- data/lib/tokenizers/tokenizer.rb +12 -0
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +7 -5
- metadata +4 -2
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: e4f3cb98cb867df67a1c8a00b56f9ec5f4c6fafa178d760655dafb6735160773
|
4
|
+
data.tar.gz: 88c420f7a42f56330ce091df7f131878efd552488232282388e69d7a3c4b4aa2
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: 8e4746ccdf33dce78dc2b86d847f47f83576ca0d637671f825ad006a53b7ac3374654f7724f1e889618f322f9cfa5081e30083997ee9810eab282b9a8b99f807
|
7
|
+
data.tar.gz: 5dfe7b502d908f85ae16cfb28ebe1bd2ff51348c31151c7ee531504c00a0315dc22ea76fea963690de8c7390c7adb50d392e39de6db4a22101e91d31de1fa4e8
|
data/CHANGELOG.md
CHANGED
@@ -1,3 +1,10 @@
|
|
1
|
+
## 0.2.3 (2023-01-22)
|
2
|
+
|
3
|
+
- Added `add_special_tokens` option to `encode` method
|
4
|
+
- Added warning about `encode` method including special tokens by default in 0.3.0
|
5
|
+
- Added more methods to `Encoding`
|
6
|
+
- Fixed error with precompiled gem on Mac ARM
|
7
|
+
|
1
8
|
## 0.2.2 (2022-01-15)
|
2
9
|
|
3
10
|
- Added precompiled gem for Linux ARM
|
data/Cargo.lock
CHANGED
data/ext/tokenizers/Cargo.toml
CHANGED
@@ -1,16 +1,90 @@
|
|
1
|
-
use tk::Encoding;
|
1
|
+
use tk::{Encoding, Offsets};
|
2
2
|
|
3
3
|
#[magnus::wrap(class = "Tokenizers::Encoding")]
|
4
|
+
#[repr(transparent)]
|
4
5
|
pub struct RbEncoding {
|
5
6
|
pub encoding: Encoding,
|
6
7
|
}
|
7
8
|
|
9
|
+
impl From<Encoding> for RbEncoding {
|
10
|
+
fn from(v: Encoding) -> Self {
|
11
|
+
Self { encoding: v }
|
12
|
+
}
|
13
|
+
}
|
14
|
+
|
8
15
|
impl RbEncoding {
|
16
|
+
pub fn n_sequences(&self) -> usize {
|
17
|
+
self.encoding.n_sequences()
|
18
|
+
}
|
19
|
+
|
9
20
|
pub fn ids(&self) -> Vec<u32> {
|
10
|
-
self.encoding.get_ids().
|
21
|
+
self.encoding.get_ids().to_vec()
|
11
22
|
}
|
12
23
|
|
13
24
|
pub fn tokens(&self) -> Vec<String> {
|
14
|
-
self.encoding.get_tokens().
|
25
|
+
self.encoding.get_tokens().to_vec()
|
26
|
+
}
|
27
|
+
|
28
|
+
pub fn word_ids(&self) -> Vec<Option<u32>> {
|
29
|
+
self.encoding.get_word_ids().to_vec()
|
30
|
+
}
|
31
|
+
|
32
|
+
pub fn sequence_ids(&self) -> Vec<Option<usize>> {
|
33
|
+
self.encoding.get_sequence_ids()
|
34
|
+
}
|
35
|
+
|
36
|
+
pub fn type_ids(&self) -> Vec<u32> {
|
37
|
+
self.encoding.get_type_ids().to_vec()
|
38
|
+
}
|
39
|
+
|
40
|
+
pub fn offsets(&self) -> Vec<(usize, usize)> {
|
41
|
+
self.encoding.get_offsets().to_vec()
|
42
|
+
}
|
43
|
+
|
44
|
+
pub fn special_tokens_mask(&self) -> Vec<u32> {
|
45
|
+
self.encoding.get_special_tokens_mask().to_vec()
|
46
|
+
}
|
47
|
+
|
48
|
+
pub fn attention_mask(&self) -> Vec<u32> {
|
49
|
+
self.encoding.get_attention_mask().to_vec()
|
50
|
+
}
|
51
|
+
|
52
|
+
pub fn overflowing(&self) -> Vec<Self> {
|
53
|
+
self.encoding
|
54
|
+
.get_overflowing()
|
55
|
+
.clone()
|
56
|
+
.into_iter()
|
57
|
+
.map(|e| e.into())
|
58
|
+
.collect()
|
59
|
+
}
|
60
|
+
|
61
|
+
pub fn word_to_tokens(&self, word_index: u32, sequence_index: usize) -> Option<(usize, usize)> {
|
62
|
+
self.encoding.word_to_tokens(word_index, sequence_index)
|
63
|
+
}
|
64
|
+
|
65
|
+
pub fn word_to_chars(&self, word_index: u32, sequence_index: usize) -> Option<Offsets> {
|
66
|
+
self.encoding.word_to_chars(word_index, sequence_index)
|
67
|
+
}
|
68
|
+
|
69
|
+
pub fn token_to_sequence(&self, token_index: usize) -> Option<usize> {
|
70
|
+
self.encoding.token_to_sequence(token_index)
|
71
|
+
}
|
72
|
+
|
73
|
+
pub fn token_to_chars(&self, token_index: usize) -> Option<Offsets> {
|
74
|
+
let (_, offsets) = self.encoding.token_to_chars(token_index)?;
|
75
|
+
Some(offsets)
|
76
|
+
}
|
77
|
+
|
78
|
+
pub fn token_to_word(&self, token_index: usize) -> Option<u32> {
|
79
|
+
let (_, word_idx) = self.encoding.token_to_word(token_index)?;
|
80
|
+
Some(word_idx)
|
81
|
+
}
|
82
|
+
|
83
|
+
pub fn char_to_token(&self, char_pos: usize, sequence_index: usize) -> Option<usize> {
|
84
|
+
self.encoding.char_to_token(char_pos, sequence_index)
|
85
|
+
}
|
86
|
+
|
87
|
+
pub fn char_to_word(&self, char_pos: usize, sequence_index: usize) -> Option<u32> {
|
88
|
+
self.encoding.char_to_word(char_pos, sequence_index)
|
15
89
|
}
|
16
90
|
}
|
data/ext/tokenizers/src/lib.rs
CHANGED
@@ -38,15 +38,39 @@ fn init() -> RbResult<()> {
|
|
38
38
|
"add_special_tokens",
|
39
39
|
method!(RbTokenizer::add_special_tokens, 1),
|
40
40
|
)?;
|
41
|
-
class.define_method("
|
41
|
+
class.define_method("add_tokens", method!(RbTokenizer::add_tokens, 1))?;
|
42
|
+
class.define_method("_encode", method!(RbTokenizer::encode, 2))?;
|
42
43
|
class.define_method("decode", method!(RbTokenizer::decode, 1))?;
|
43
44
|
class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
|
44
45
|
class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
|
45
46
|
class.define_method("normalizer=", method!(RbTokenizer::set_normalizer, 1))?;
|
47
|
+
class.define_method("token_to_id", method!(RbTokenizer::token_to_id, 1))?;
|
48
|
+
class.define_method("id_to_token", method!(RbTokenizer::id_to_token, 1))?;
|
46
49
|
|
47
50
|
let class = module.define_class("Encoding", Default::default())?;
|
51
|
+
class.define_method("n_sequences", method!(RbEncoding::n_sequences, 0))?;
|
48
52
|
class.define_method("ids", method!(RbEncoding::ids, 0))?;
|
49
53
|
class.define_method("tokens", method!(RbEncoding::tokens, 0))?;
|
54
|
+
class.define_method("word_ids", method!(RbEncoding::word_ids, 0))?;
|
55
|
+
class.define_method("sequence_ids", method!(RbEncoding::sequence_ids, 0))?;
|
56
|
+
class.define_method("type_ids", method!(RbEncoding::type_ids, 0))?;
|
57
|
+
class.define_method("offsets", method!(RbEncoding::offsets, 0))?;
|
58
|
+
class.define_method(
|
59
|
+
"special_tokens_mask",
|
60
|
+
method!(RbEncoding::special_tokens_mask, 0),
|
61
|
+
)?;
|
62
|
+
class.define_method("attention_mask", method!(RbEncoding::attention_mask, 0))?;
|
63
|
+
class.define_method("overflowing", method!(RbEncoding::overflowing, 0))?;
|
64
|
+
class.define_method("_word_to_tokens", method!(RbEncoding::word_to_tokens, 2))?;
|
65
|
+
class.define_method("_word_to_chars", method!(RbEncoding::word_to_chars, 2))?;
|
66
|
+
class.define_method(
|
67
|
+
"token_to_sequence",
|
68
|
+
method!(RbEncoding::token_to_sequence, 1),
|
69
|
+
)?;
|
70
|
+
class.define_method("token_to_chars", method!(RbEncoding::token_to_chars, 1))?;
|
71
|
+
class.define_method("token_to_word", method!(RbEncoding::token_to_word, 1))?;
|
72
|
+
class.define_method("_char_to_token", method!(RbEncoding::char_to_token, 2))?;
|
73
|
+
class.define_method("_char_to_word", method!(RbEncoding::char_to_word, 2))?;
|
50
74
|
|
51
75
|
let class = module.define_class("BPEDecoder", Default::default())?;
|
52
76
|
class.define_singleton_method("new", function!(RbBPEDecoder::new, 0))?;
|
@@ -36,10 +36,16 @@ impl RbTokenizer {
|
|
36
36
|
// TODO return self
|
37
37
|
}
|
38
38
|
|
39
|
-
pub fn
|
39
|
+
pub fn add_tokens(&self, tokens: Vec<String>) {
|
40
|
+
let tokens: Vec<AddedToken> = tokens.iter().map(|t| AddedToken::from(t, true)).collect();
|
41
|
+
self.tokenizer.borrow_mut().add_tokens(&tokens);
|
42
|
+
// TODO return self
|
43
|
+
}
|
44
|
+
|
45
|
+
pub fn encode(&self, sequence: String, add_special_tokens: bool) -> RbResult<RbEncoding> {
|
40
46
|
self.tokenizer
|
41
47
|
.borrow()
|
42
|
-
.encode(
|
48
|
+
.encode(sequence, add_special_tokens)
|
43
49
|
.map(|v| RbEncoding { encoding: v })
|
44
50
|
.map_err(RbError::from)
|
45
51
|
}
|
@@ -68,4 +74,12 @@ impl RbTokenizer {
|
|
68
74
|
.borrow_mut()
|
69
75
|
.with_normalizer(normalizer.normalizer);
|
70
76
|
}
|
77
|
+
|
78
|
+
pub fn token_to_id(&self, token: String) -> Option<u32> {
|
79
|
+
self.tokenizer.borrow().token_to_id(&token)
|
80
|
+
}
|
81
|
+
|
82
|
+
pub fn id_to_token(&self, id: u32) -> Option<String> {
|
83
|
+
self.tokenizer.borrow().id_to_token(id)
|
84
|
+
}
|
71
85
|
}
|
@@ -0,0 +1,19 @@
|
|
1
|
+
module Tokenizers
  # Ruby-side wrappers for Encoding lookups. Each public method forwards to the
  # underscore-prefixed method of the same name (registered by the extension
  # with a fixed arity of 2) and supplies 0 when sequence_index is omitted.
  class Encoding
    # Forwards to _word_to_tokens; sequence_index defaults to 0.
    def word_to_tokens(word_index, sequence_index = 0)
      _word_to_tokens(word_index, sequence_index)
    end

    # Forwards to _word_to_chars; sequence_index defaults to 0.
    def word_to_chars(word_index, sequence_index = 0)
      _word_to_chars(word_index, sequence_index)
    end

    # Forwards to _char_to_token; sequence_index defaults to 0.
    def char_to_token(char_pos, sequence_index = 0)
      _char_to_token(char_pos, sequence_index)
    end

    # Forwards to _char_to_word; sequence_index defaults to 0.
    def char_to_word(char_pos, sequence_index = 0)
      # Pass the char_pos parameter: there is no word_index local in this
      # method, so referencing it raised NameError on every call.
      _char_to_word(char_pos, sequence_index)
    end
  end
end
|
@@ -0,0 +1,12 @@
|
|
1
|
+
module Tokenizers
  class Tokenizer
    # Encodes sequence via _encode.
    #
    # When add_special_tokens is given (true or false) it is passed straight
    # through. When it is left as nil, a deprecation-style warning is emitted
    # and false is used.
    #
    # TODO change add_special_tokens default to true in 0.3.0
    def encode(sequence, add_special_tokens: nil)
      return _encode(sequence, add_special_tokens) unless add_special_tokens.nil?

      warn "[tokenizers] add_special_tokens will default to true in 0.3.0. Pass add_special_tokens: true/false to silence this warning."
      _encode(sequence, false)
    end
  end
end
|
data/lib/tokenizers/version.rb
CHANGED
data/lib/tokenizers.rb
CHANGED
@@ -1,14 +1,16 @@
|
|
1
1
|
# ext
|
2
2
|
begin
|
3
|
-
|
3
|
+
require_relative "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
|
4
4
|
rescue LoadError
|
5
|
-
|
5
|
+
require_relative "tokenizers/tokenizers"
|
6
6
|
end
|
7
7
|
|
8
8
|
# modules
|
9
|
-
|
10
|
-
|
11
|
-
|
9
|
+
require_relative "tokenizers/char_bpe_tokenizer"
|
10
|
+
require_relative "tokenizers/encoding"
|
11
|
+
require_relative "tokenizers/from_pretrained"
|
12
|
+
require_relative "tokenizers/tokenizer"
|
13
|
+
require_relative "tokenizers/version"
|
12
14
|
|
13
15
|
module Tokenizers
|
14
16
|
class Error < StandardError; end
|
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.2.2
|
4
|
+
version: 0.2.3
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date: 2023-01-
|
11
|
+
date: 2023-01-22 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rb_sys
|
@@ -48,7 +48,9 @@ files:
|
|
48
48
|
- ext/tokenizers/src/tokenizer.rs
|
49
49
|
- lib/tokenizers.rb
|
50
50
|
- lib/tokenizers/char_bpe_tokenizer.rb
|
51
|
+
- lib/tokenizers/encoding.rb
|
51
52
|
- lib/tokenizers/from_pretrained.rb
|
53
|
+
- lib/tokenizers/tokenizer.rb
|
52
54
|
- lib/tokenizers/version.rb
|
53
55
|
homepage: https://github.com/ankane/tokenizers-ruby
|
54
56
|
licenses:
|