tokenizers 0.5.2 → 0.5.3
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +5 -0
- data/Cargo.lock +1 -1
- data/ext/tokenizers/Cargo.toml +1 -1
- data/ext/tokenizers/src/lib.rs +11 -1
- data/ext/tokenizers/src/tokenizer.rs +78 -5
- data/lib/tokenizers/added_token.rb +7 -0
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +1 -0
- metadata +4 -3
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
 ---
 SHA256:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: 30c4558340092b3fe4b60adbfed6b042810e27df9a62bd8fe828c3a2c9b5cf7a
+  data.tar.gz: bc97136598b82cdb47b0d50de4ead4b5afd8500dc52b487496f3179dd48ecee8
 SHA512:
-  metadata.gz:
-  data.tar.gz:
+  metadata.gz: '0184d588343d823b0a2942828c0a496e131b5dfbae475d46ed7ebb2f3e89e5fd6d420705e88b31293331b247920c209653d0590b3aad618aab583a6a9ff49c8a'
+  data.tar.gz: a7c590677a968516ae075fb46a5153e301b93e2bd13cf372d5cf020c4bd0c9c0cde7a7118e708e853a61c42a8957fcec73afa32e8a2eebd517943254905d0621
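These digests cover the metadata.gz and data.tar.gz entries inside the published .gem archive (a .gem is a tar file). A minimal Ruby sketch of checking the new data.tar.gz SHA256 against a locally fetched copy; the plain-ruby filename is an assumption, since tokenizers also ships platform-specific builds:

```ruby
require "digest"
require "rubygems/package"

# Assumes a local copy fetched with `gem fetch tokenizers -v 0.5.3`.
EXPECTED = "bc97136598b82cdb47b0d50de4ead4b5afd8500dc52b487496f3179dd48ecee8"

actual = nil
File.open("tokenizers-0.5.3.gem", "rb") do |io|
  Gem::Package::TarReader.new(io) do |tar|
    # data.tar.gz is one entry of the outer tar archive.
    tar.each do |entry|
      actual = Digest::SHA256.hexdigest(entry.read) if entry.full_name == "data.tar.gz"
    end
  end
end

puts actual == EXPECTED ? "checksum OK" : "checksum mismatch"
```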
data/CHANGELOG.md
CHANGED
data/Cargo.lock
CHANGED
data/ext/tokenizers/Cargo.toml
CHANGED
data/ext/tokenizers/src/lib.rs
CHANGED
@@ -15,7 +15,7 @@ mod utils;
 
 use encoding::RbEncoding;
 use error::RbError;
-use tokenizer::RbTokenizer;
+use tokenizer::{RbAddedToken, RbTokenizer};
 use utils::RbRegex;
 
 use magnus::{function, method, prelude::*, value::Lazy, Error, RModule, Ruby};
@@ -79,6 +79,7 @@ fn init(ruby: &Ruby) -> RbResult<()> {
     class.define_method("num_special_tokens_to_add", method!(RbTokenizer::num_special_tokens_to_add, 1))?;
     class.define_method("_vocab", method!(RbTokenizer::vocab, 1))?;
     class.define_method("_vocab_size", method!(RbTokenizer::vocab_size, 1))?;
+    class.define_method("added_tokens_decoder", method!(RbTokenizer::get_added_tokens_decoder, 0))?;
     class.define_method("_to_s", method!(RbTokenizer::to_str, 1))?;
 
     let class = module.define_class("Encoding", ruby.class_object())?;
@@ -109,6 +110,15 @@ fn init(ruby: &Ruby) -> RbResult<()> {
     let class = module.define_class("Regex", ruby.class_object())?;
     class.define_singleton_method("new", function!(RbRegex::new, 1))?;
 
+    let class = module.define_class("AddedToken", ruby.class_object())?;
+    class.define_singleton_method("_new", function!(RbAddedToken::new, 2))?;
+    class.define_method("content", method!(RbAddedToken::get_content, 0))?;
+    class.define_method("rstrip", method!(RbAddedToken::get_rstrip, 0))?;
+    class.define_method("lstrip", method!(RbAddedToken::get_lstrip, 0))?;
+    class.define_method("single_word", method!(RbAddedToken::get_single_word, 0))?;
+    class.define_method("normalized", method!(RbAddedToken::get_normalized, 0))?;
+    class.define_method("special", method!(RbAddedToken::get_special, 0))?;
+
     let models = module.define_module("Models")?;
     let pre_tokenizers = module.define_module("PreTokenizers")?;
     let decoders = module.define_module("Decoders")?;
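Taken together, these bindings register a Tokenizers::AddedToken class with a private `_new` constructor (content string plus options hash) and a reader per field. A hedged sketch of the resulting Ruby surface, assuming the new lib/tokenizers/added_token.rb (listed in this diff as +7 -0 but not shown) wraps `_new` with a keyword-argument `new`:

```ruby
require "tokenizers"

# Hypothetical usage; assumes the Ruby wrapper forwards
# `new(content, **kwargs)` to the native `_new(content, kwargs)`.
token = Tokenizers::AddedToken.new("<custom>", special: true, lstrip: true)
token.content      # => "<custom>"
token.special      # => true
token.lstrip       # => true
token.single_word  # => false (unset fields fall back to Rust-side defaults)
```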
data/ext/tokenizers/src/tokenizer.rs
CHANGED
@@ -22,9 +22,10 @@ use super::processors::RbPostProcessor;
 use super::trainers::RbTrainer;
 use super::{RbError, RbResult};
 
+#[magnus::wrap(class = "Tokenizers::AddedToken")]
 pub struct RbAddedToken {
     pub content: String,
-    pub
+    pub special: bool,
     pub single_word: Option<bool>,
     pub lstrip: Option<bool>,
     pub rstrip: Option<bool>,
@@ -32,10 +33,10 @@ pub struct RbAddedToken {
 }
 
 impl RbAddedToken {
-    pub fn from<S: Into<String>>(content: S,
+    pub fn from<S: Into<String>>(content: S, special: Option<bool>) -> Self {
         Self {
             content: content.into(),
-
+            special: special.unwrap_or(false),
             single_word: None,
             lstrip: None,
             rstrip: None,
@@ -44,7 +45,7 @@ impl RbAddedToken {
     }
 
     pub fn get_token(&self) -> tk::tokenizer::AddedToken {
-        let mut token = tk::AddedToken::from(&self.content, self.
+        let mut token = tk::AddedToken::from(&self.content, self.special);
 
         if let Some(sw) = self.single_word {
             token = token.single_word(sw);
@@ -71,11 +72,73 @@ impl From<tk::AddedToken> for RbAddedToken {
             lstrip: Some(token.lstrip),
             rstrip: Some(token.rstrip),
             normalized: Some(token.normalized),
-
+            special: !token.normalized,
         }
     }
 }
 
+impl RbAddedToken {
+    pub fn new(content: Option<String>, kwargs: RHash) -> RbResult<Self> {
+        let mut token = RbAddedToken::from(content.unwrap_or("".to_string()), None);
+
+        let value: Value = kwargs.delete(Symbol::new("single_word"))?;
+        if !value.is_nil() {
+            token.single_word = TryConvert::try_convert(value)?;
+        }
+
+        let value: Value = kwargs.delete(Symbol::new("lstrip"))?;
+        if !value.is_nil() {
+            token.lstrip = TryConvert::try_convert(value)?;
+        }
+
+        let value: Value = kwargs.delete(Symbol::new("rstrip"))?;
+        if !value.is_nil() {
+            token.rstrip = TryConvert::try_convert(value)?;
+        }
+
+        let value: Value = kwargs.delete(Symbol::new("normalized"))?;
+        if !value.is_nil() {
+            token.normalized = TryConvert::try_convert(value)?;
+        }
+
+        let value: Value = kwargs.delete(Symbol::new("special"))?;
+        if !value.is_nil() {
+            token.special = TryConvert::try_convert(value)?;
+        }
+
+        if !kwargs.is_empty() {
+            // TODO improve message
+            return Err(Error::new(exception::arg_error(), "unknown keyword"));
+        }
+
+        Ok(token)
+    }
+
+    pub fn get_content(&self) -> String {
+        self.content.to_string()
+    }
+
+    pub fn get_rstrip(&self) -> bool {
+        self.get_token().rstrip
+    }
+
+    pub fn get_lstrip(&self) -> bool {
+        self.get_token().lstrip
+    }
+
+    pub fn get_single_word(&self) -> bool {
+        self.get_token().single_word
+    }
+
+    pub fn get_normalized(&self) -> bool {
+        self.get_token().normalized
+    }
+
+    pub fn get_special(&self) -> bool {
+        self.get_token().special
+    }
+}
+
 struct TextInputSequence<'s>(tk::InputSequence<'s>);
 
 impl<'s> TryConvert for TextInputSequence<'s> {
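`RbAddedToken::new` consumes each recognized key from the Ruby options hash with `delete`, so anything left afterwards is an unknown keyword and trips the `!kwargs.is_empty()` guard; note also that the `From<tk::AddedToken>` impl infers `special: !token.normalized`, matching the upstream convention that special tokens are not normalized. A sketch of the error path from the Ruby side, assuming the keyword-forwarding wrapper described above:

```ruby
# Recognized keys are deleted from the hash as they are converted...
token = Tokenizers::AddedToken.new("<pad>", rstrip: true, special: true)

# ...so a leftover key raises ArgumentError (magnus arg_error) in Rust:
begin
  Tokenizers::AddedToken.new("<pad>", sepcial: true) # misspelled on purpose
rescue ArgumentError => e
  puts e.message # => "unknown keyword"
end
```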
@@ -536,4 +599,14 @@ impl RbTokenizer {
     pub fn vocab_size(&self, with_added_tokens: bool) -> usize {
         self.tokenizer.borrow().get_vocab_size(with_added_tokens)
     }
+
+    pub fn get_added_tokens_decoder(&self) -> RbResult<RHash> {
+        let sorted_map = RHash::new();
+
+        for (key, value) in self.tokenizer.borrow().get_added_tokens_decoder() {
+            sorted_map.aset::<u32, RbAddedToken>(key, value.into())?;
+        }
+
+        Ok(sorted_map)
+    }
 }
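`get_added_tokens_decoder` copies the id → AddedToken map from the underlying tokenizer into a Ruby Hash, converting each value through the `From<tk::AddedToken>` impl above. A hedged usage sketch; the exact ids and tokens depend on the loaded model:

```ruby
require "tokenizers"

tokenizer = Tokenizers.from_pretrained("bert-base-cased")

# Hash of token id => Tokenizers::AddedToken, built by the RHash loop
# in get_added_tokens_decoder.
tokenizer.added_tokens_decoder.each do |id, token|
  puts "#{id}: #{token.content} (special: #{token.special})"
end
# => e.g. "0: [PAD] (special: true)" for BERT-style vocabularies
```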
data/lib/tokenizers/version.rb
CHANGED
data/lib/tokenizers.rb
CHANGED
@@ -42,6 +42,7 @@ require_relative "tokenizers/trainers/word_level_trainer"
 require_relative "tokenizers/trainers/word_piece_trainer"
 
 # other
+require_relative "tokenizers/added_token"
 require_relative "tokenizers/char_bpe_tokenizer"
 require_relative "tokenizers/encoding"
 require_relative "tokenizers/from_pretrained"
metadata
CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.5.2
+  version: 0.5.3
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2024-
+date: 2024-09-17 00:00:00.000000000 Z
 dependencies:
 - !ruby/object:Gem::Dependency
   name: rb_sys
@@ -52,6 +52,7 @@ files:
 - ext/tokenizers/src/utils/normalization.rs
 - ext/tokenizers/src/utils/regex.rs
 - lib/tokenizers.rb
+- lib/tokenizers/added_token.rb
 - lib/tokenizers/char_bpe_tokenizer.rb
 - lib/tokenizers/decoders/bpe_decoder.rb
 - lib/tokenizers/decoders/ctc.rb
@@ -100,7 +101,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
     - !ruby/object:Gem::Version
       version: '0'
 requirements: []
-rubygems_version: 3.5.
+rubygems_version: 3.5.16
 signing_key:
 specification_version: 4
 summary: Fast state-of-the-art tokenizers for Ruby