tokenizers 0.5.2 → 0.5.3
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Cargo.lock +1 -1
- data/ext/tokenizers/Cargo.toml +1 -1
- data/ext/tokenizers/src/lib.rs +11 -1
- data/ext/tokenizers/src/tokenizer.rs +78 -5
- data/lib/tokenizers/added_token.rb +7 -0
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +1 -0
- metadata +4 -3
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 30c4558340092b3fe4b60adbfed6b042810e27df9a62bd8fe828c3a2c9b5cf7a
+  data.tar.gz: bc97136598b82cdb47b0d50de4ead4b5afd8500dc52b487496f3179dd48ecee8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: '0184d588343d823b0a2942828c0a496e131b5dfbae475d46ed7ebb2f3e89e5fd6d420705e88b31293331b247920c209653d0590b3aad618aab583a6a9ff49c8a'
+  data.tar.gz: a7c590677a968516ae075fb46a5153e301b93e2bd13cf372d5cf020c4bd0c9c0cde7a7118e708e853a61c42a8957fcec73afa32e8a2eebd517943254905d0621
data/CHANGELOG.md CHANGED
data/Cargo.lock CHANGED
data/ext/tokenizers/Cargo.toml CHANGED
data/ext/tokenizers/src/lib.rs CHANGED
@@ -15,7 +15,7 @@ mod utils;
 use encoding::RbEncoding;
 use error::RbError;
-use tokenizer::RbTokenizer;
+use tokenizer::{RbAddedToken, RbTokenizer};
 use utils::RbRegex;
 
 use magnus::{function, method, prelude::*, value::Lazy, Error, RModule, Ruby};
@@ -79,6 +79,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
     class.define_method("num_special_tokens_to_add", method!(RbTokenizer::num_special_tokens_to_add, 1))?;
     class.define_method("_vocab", method!(RbTokenizer::vocab, 1))?;
     class.define_method("_vocab_size", method!(RbTokenizer::vocab_size, 1))?;
+    class.define_method("added_tokens_decoder", method!(RbTokenizer::get_added_tokens_decoder, 0))?;
     class.define_method("_to_s", method!(RbTokenizer::to_str, 1))?;
 
     let class = module.define_class("Encoding", ruby.class_object())?;
@@ -109,6 +110,15 @@ fn init(ruby: &Ruby) -> RbResult<()> {
     let class = module.define_class("Regex", ruby.class_object())?;
     class.define_singleton_method("new", function!(RbRegex::new, 1))?;
 
+    let class = module.define_class("AddedToken", ruby.class_object())?;
+    class.define_singleton_method("_new", function!(RbAddedToken::new, 2))?;
+    class.define_method("content", method!(RbAddedToken::get_content, 0))?;
+    class.define_method("rstrip", method!(RbAddedToken::get_rstrip, 0))?;
+    class.define_method("lstrip", method!(RbAddedToken::get_lstrip, 0))?;
+    class.define_method("single_word", method!(RbAddedToken::get_single_word, 0))?;
+    class.define_method("normalized", method!(RbAddedToken::get_normalized, 0))?;
+    class.define_method("special", method!(RbAddedToken::get_special, 0))?;
+
     let models = module.define_module("Models")?;
     let pre_tokenizers = module.define_module("PreTokenizers")?;
     let decoders = module.define_module("Decoders")?;
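These registrations are the entire public surface of the new Tokenizers::AddedToken class: one native constructor plus read-only attribute methods. A minimal sketch of how it reads from Ruby, assuming the Ruby-side AddedToken.new wrapper forwards to the native _new (with no keyword given, special defaults to false and normalized to its complement, true):

  require "tokenizers"

  token = Tokenizers::AddedToken.new("<custom>")
  token.content     # => "<custom>"
  token.special     # => false
  token.normalized  # => true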
data/ext/tokenizers/src/tokenizer.rs CHANGED
@@ -22,9 +22,10 @@ use super::processors::RbPostProcessor;
 use super::trainers::RbTrainer;
 use super::{RbError, RbResult};
 
+#[magnus::wrap(class = "Tokenizers::AddedToken")]
 pub struct RbAddedToken {
     pub content: String,
-    pub
+    pub special: bool,
     pub single_word: Option<bool>,
     pub lstrip: Option<bool>,
     pub rstrip: Option<bool>,
@@ -32,10 +33,10 @@ pub struct RbAddedToken {
 }
 
 impl RbAddedToken {
-    pub fn from<S: Into<String>>(content: S,
+    pub fn from<S: Into<String>>(content: S, special: Option<bool>) -> Self {
         Self {
             content: content.into(),
-
+            special: special.unwrap_or(false),
             single_word: None,
             lstrip: None,
             rstrip: None,
@@ -44,7 +45,7 @@ impl RbAddedToken {
     }
 
     pub fn get_token(&self) -> tk::tokenizer::AddedToken {
-        let mut token = tk::AddedToken::from(&self.content, self.
+        let mut token = tk::AddedToken::from(&self.content, self.special);
 
         if let Some(sw) = self.single_word {
             token = token.single_word(sw);
@@ -71,11 +72,73 @@ impl From<tk::AddedToken> for RbAddedToken {
             lstrip: Some(token.lstrip),
             rstrip: Some(token.rstrip),
             normalized: Some(token.normalized),
-
+            special: !token.normalized,
         }
     }
 }
 
+impl RbAddedToken {
+    pub fn new(content: Option<String>, kwargs: RHash) -> RbResult<Self> {
+        let mut token = RbAddedToken::from(content.unwrap_or("".to_string()), None);
+
+        let value: Value = kwargs.delete(Symbol::new("single_word"))?;
+        if !value.is_nil() {
+            token.single_word = TryConvert::try_convert(value)?;
+        }
+
+        let value: Value = kwargs.delete(Symbol::new("lstrip"))?;
+        if !value.is_nil() {
+            token.lstrip = TryConvert::try_convert(value)?;
+        }
+
+        let value: Value = kwargs.delete(Symbol::new("rstrip"))?;
+        if !value.is_nil() {
+            token.rstrip = TryConvert::try_convert(value)?;
+        }
+
+        let value: Value = kwargs.delete(Symbol::new("normalized"))?;
+        if !value.is_nil() {
+            token.normalized = TryConvert::try_convert(value)?;
+        }
+
+        let value: Value = kwargs.delete(Symbol::new("special"))?;
+        if !value.is_nil() {
+            token.special = TryConvert::try_convert(value)?;
+        }
+
+        if !kwargs.is_empty() {
+            // TODO improve message
+            return Err(Error::new(exception::arg_error(), "unknown keyword"));
+        }
+
+        Ok(token)
+    }
+
+    pub fn get_content(&self) -> String {
+        self.content.to_string()
+    }
+
+    pub fn get_rstrip(&self) -> bool {
+        self.get_token().rstrip
+    }
+
+    pub fn get_lstrip(&self) -> bool {
+        self.get_token().lstrip
+    }
+
+    pub fn get_single_word(&self) -> bool {
+        self.get_token().single_word
+    }
+
+    pub fn get_normalized(&self) -> bool {
+        self.get_token().normalized
+    }
+
+    pub fn get_special(&self) -> bool {
+        self.get_token().special
+    }
+}
+
 struct TextInputSequence<'s>(tk::InputSequence<'s>);
 
 impl<'s> TryConvert for TextInputSequence<'s> {
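The constructor deletes each recognized keyword from the Ruby hash as it goes, so any leftover key is rejected rather than silently ignored, and unset flags fall back to the tk::AddedToken defaults via get_token(). A hedged sketch of the resulting Ruby behavior, again assuming AddedToken.new forwards its keywords to the native _new:

  token = Tokenizers::AddedToken.new("<custom>", special: true, lstrip: false)
  token.special     # => true
  token.single_word # => false (never set, so the Rust default applies)

  Tokenizers::AddedToken.new("<custom>", specail: true)
  # raises ArgumentError ("unknown keyword") because the typo is never consumed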
@@ -536,4 +599,14 @@ impl RbTokenizer {
     pub fn vocab_size(&self, with_added_tokens: bool) -> usize {
         self.tokenizer.borrow().get_vocab_size(with_added_tokens)
     }
+
+    pub fn get_added_tokens_decoder(&self) -> RbResult<RHash> {
+        let sorted_map = RHash::new();
+
+        for (key, value) in self.tokenizer.borrow().get_added_tokens_decoder() {
+            sorted_map.aset::<u32, RbAddedToken>(key, value.into())?;
+        }
+
+        Ok(sorted_map)
+    }
 }
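In Ruby this surfaces as Tokenizers::Tokenizer#added_tokens_decoder, a Hash mapping token id to AddedToken. A rough usage sketch (the ids shown are illustrative):

  tokenizer = Tokenizers.from_pretrained("bert-base-cased")
  tokenizer.added_tokens_decoder.each do |id, token|
    puts "#{id} => #{token.content} (special: #{token.special})"
  end
  # e.g. 100 => [UNK] (special: true), 101 => [CLS] (special: true), ...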
data/lib/tokenizers/version.rb CHANGED
data/lib/tokenizers.rb CHANGED
@@ -42,6 +42,7 @@ require_relative "tokenizers/trainers/word_level_trainer"
 require_relative "tokenizers/trainers/word_piece_trainer"
 
 # other
+require_relative "tokenizers/added_token"
 require_relative "tokenizers/char_bpe_tokenizer"
 require_relative "tokenizers/encoding"
 require_relative "tokenizers/from_pretrained"
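The new lib/tokenizers/added_token.rb itself is not rendered in this diff (+7 lines per the summary above). Given the native _new(content, kwargs) entry point registered in lib.rs, a plausible shape — an assumption, not the actual file contents — is:

  module Tokenizers
    class AddedToken
      def self.new(content = "", **kwargs)
        _new(content, kwargs)  # hypothetical wrapper over the native constructor
      end
    end
  end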
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.5.2
+  version: 0.5.3
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-
+date: 2024-09-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -52,6 +52,7 @@ files:
 - ext/tokenizers/src/utils/normalization.rs
 - ext/tokenizers/src/utils/regex.rs
 - lib/tokenizers.rb
+- lib/tokenizers/added_token.rb
 - lib/tokenizers/char_bpe_tokenizer.rb
 - lib/tokenizers/decoders/bpe_decoder.rb
 - lib/tokenizers/decoders/ctc.rb
@@ -100,7 +101,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.5.
+rubygems_version: 3.5.16
 signing_key:
 specification_version: 4
 summary: Fast state-of-the-art tokenizers for Ruby