tokenizers 0.1.3 → 0.2.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/Cargo.lock +649 -442
- data/Cargo.toml +8 -12
- data/README.md +1 -3
- data/ext/tokenizers/Cargo.toml +18 -0
- data/ext/tokenizers/extconf.rb +3 -10
- data/ext/tokenizers/src/decoders.rs +14 -0
- data/ext/tokenizers/src/encoding.rs +16 -0
- data/ext/tokenizers/src/error.rs +16 -0
- data/ext/tokenizers/src/lib.rs +64 -0
- data/ext/tokenizers/src/models.rs +19 -0
- data/ext/tokenizers/src/normalizers.rs +14 -0
- data/ext/tokenizers/src/pre_tokenizers.rs +14 -0
- data/ext/tokenizers/src/tokenizer.rs +85 -0
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +6 -2
- metadata +27 -5
- data/src/lib.rs +0 -290
data/Cargo.toml
CHANGED
@@ -1,14 +1,10 @@
|
|
1
|
-
[package]
|
2
|
-
|
3
|
-
version = "0.1.0"
|
4
|
-
authors = ["Andrew Kane <andrew@ankane.org>"]
|
5
|
-
edition = "2018"
|
1
|
+
[workspace]
|
2
|
+
members = ["ext/tokenizers"]
|
6
3
|
|
7
|
-
[lib]
|
8
|
-
|
9
|
-
crate-type = ["cdylib"]
|
4
|
+
[profile.release]
|
5
|
+
strip = true
|
10
6
|
|
11
|
-
[dependencies]
|
12
|
-
|
13
|
-
|
14
|
-
tokenizers = "
|
7
|
+
[patch.crates-io]
|
8
|
+
number_prefix = { git = "https://github.com/ankane/rust-number-prefix", branch = "license-file" }
|
9
|
+
rb-sys-env = { git = "https://github.com/oxidize-rb/rb-sys" }
|
10
|
+
tokenizers = { git = "https://github.com/huggingface/tokenizers" }
|
data/README.md
CHANGED
@@ -12,8 +12,6 @@ Add this line to your application’s Gemfile:
|
|
12
12
|
gem "tokenizers"
|
13
13
|
```
|
14
14
|
|
15
|
-
Note: Rust and pkg-config are currently required for installation, and it can take 5-10 minutes to compile the extension.
|
16
|
-
|
17
15
|
## Getting Started
|
18
16
|
|
19
17
|
Load a pretrained tokenizer
|
@@ -61,7 +59,7 @@ To get started with development:
|
|
61
59
|
git clone https://github.com/ankane/tokenizers-ruby.git
|
62
60
|
cd tokenizers-ruby
|
63
61
|
bundle install
|
64
|
-
bundle exec
|
62
|
+
bundle exec rake compile
|
65
63
|
bundle exec rake download:files
|
66
64
|
bundle exec rake test
|
67
65
|
```
|
data/ext/tokenizers/Cargo.toml
ADDED
@@ -0,0 +1,18 @@
|
|
1
|
+
[package]
|
2
|
+
name = "tokenizers"
|
3
|
+
version = "0.2.1"
|
4
|
+
license = "Apache-2.0"
|
5
|
+
authors = ["Andrew Kane <andrew@ankane.org>"]
|
6
|
+
edition = "2021"
|
7
|
+
publish = false
|
8
|
+
|
9
|
+
[lib]
|
10
|
+
crate-type = ["cdylib"]
|
11
|
+
|
12
|
+
[dependencies]
|
13
|
+
magnus = "0.4"
|
14
|
+
|
15
|
+
[dependencies.tokenizers]
|
16
|
+
version = "0.13.2"
|
17
|
+
default-features = false
|
18
|
+
features = ["progressbar", "http", "onig", "esaxx_fast"]
|
data/ext/tokenizers/extconf.rb
CHANGED
@@ -1,11 +1,4 @@
|
|
1
|
-
require "
|
1
|
+
require "mkmf"
|
2
|
+
require "rb_sys/mkmf"
|
2
3
|
|
3
|
-
|
4
|
-
|
5
|
-
File.write "Makefile", <<~EOS
|
6
|
-
all:
|
7
|
-
\tcargo build --release --target-dir target
|
8
|
-
install:
|
9
|
-
\tmv target/release/libtokenizers.#{RbConfig::CONFIG["SOEXT"]} #{dest}
|
10
|
-
clean:
|
11
|
-
EOS
|
4
|
+
create_rust_makefile("tokenizers/tokenizers")
|
data/ext/tokenizers/src/decoders.rs
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
use tk::decoders::bpe::BPEDecoder;
|
2
|
+
|
3
|
+
#[magnus::wrap(class = "Tokenizers::BPEDecoder")]
|
4
|
+
pub struct RbBPEDecoder {
|
5
|
+
pub decoder: BPEDecoder,
|
6
|
+
}
|
7
|
+
|
8
|
+
impl RbBPEDecoder {
|
9
|
+
pub fn new() -> Self {
|
10
|
+
RbBPEDecoder {
|
11
|
+
decoder: BPEDecoder::default(),
|
12
|
+
}
|
13
|
+
}
|
14
|
+
}
|
data/ext/tokenizers/src/encoding.rs
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
use tk::Encoding;
|
2
|
+
|
3
|
+
#[magnus::wrap(class = "Tokenizers::Encoding")]
|
4
|
+
pub struct RbEncoding {
|
5
|
+
pub encoding: Encoding,
|
6
|
+
}
|
7
|
+
|
8
|
+
impl RbEncoding {
|
9
|
+
pub fn ids(&self) -> Vec<u32> {
|
10
|
+
self.encoding.get_ids().into()
|
11
|
+
}
|
12
|
+
|
13
|
+
pub fn tokens(&self) -> Vec<String> {
|
14
|
+
self.encoding.get_tokens().into()
|
15
|
+
}
|
16
|
+
}
|
data/ext/tokenizers/src/error.rs
ADDED
@@ -0,0 +1,16 @@
|
|
1
|
+
use magnus::{exception, memoize, Error, ExceptionClass, Module};
|
2
|
+
|
3
|
+
use super::module;
|
4
|
+
|
5
|
+
pub struct RbError {}
|
6
|
+
|
7
|
+
impl RbError {
|
8
|
+
// convert to Error instead of Self
|
9
|
+
pub fn from(e: Box<dyn std::error::Error + Send + Sync>) -> Error {
|
10
|
+
Error::new(error(), e.to_string())
|
11
|
+
}
|
12
|
+
}
|
13
|
+
|
14
|
+
fn error() -> ExceptionClass {
|
15
|
+
*memoize!(ExceptionClass: module().define_error("Error", exception::standard_error()).unwrap())
|
16
|
+
}
|
data/ext/tokenizers/src/lib.rs
ADDED
@@ -0,0 +1,64 @@
|
|
1
|
+
extern crate tokenizers as tk;
|
2
|
+
|
3
|
+
mod decoders;
|
4
|
+
mod encoding;
|
5
|
+
mod error;
|
6
|
+
mod models;
|
7
|
+
mod normalizers;
|
8
|
+
mod pre_tokenizers;
|
9
|
+
mod tokenizer;
|
10
|
+
|
11
|
+
use decoders::RbBPEDecoder;
|
12
|
+
use encoding::RbEncoding;
|
13
|
+
use error::RbError;
|
14
|
+
use models::RbBPE;
|
15
|
+
use normalizers::RbBertNormalizer;
|
16
|
+
use pre_tokenizers::RbBertPreTokenizer;
|
17
|
+
use tokenizer::RbTokenizer;
|
18
|
+
|
19
|
+
use magnus::{define_module, function, memoize, method, prelude::*, Error, RModule};
|
20
|
+
|
21
|
+
type RbResult<T> = Result<T, Error>;
|
22
|
+
|
23
|
+
fn module() -> RModule {
|
24
|
+
*memoize!(RModule: define_module("Tokenizers").unwrap())
|
25
|
+
}
|
26
|
+
|
27
|
+
#[magnus::init]
|
28
|
+
fn init() -> RbResult<()> {
|
29
|
+
let module = module();
|
30
|
+
module.define_singleton_method(
|
31
|
+
"_from_pretrained",
|
32
|
+
function!(RbTokenizer::from_pretrained, 3),
|
33
|
+
)?;
|
34
|
+
|
35
|
+
let class = module.define_class("BPE", Default::default())?;
|
36
|
+
class.define_singleton_method("new", function!(RbBPE::new, 2))?;
|
37
|
+
|
38
|
+
let class = module.define_class("Tokenizer", Default::default())?;
|
39
|
+
class.define_singleton_method("new", function!(RbTokenizer::new, 1))?;
|
40
|
+
class.define_method(
|
41
|
+
"add_special_tokens",
|
42
|
+
method!(RbTokenizer::add_special_tokens, 1),
|
43
|
+
)?;
|
44
|
+
class.define_method("encode", method!(RbTokenizer::encode, 1))?;
|
45
|
+
class.define_method("decode", method!(RbTokenizer::decode, 1))?;
|
46
|
+
class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
|
47
|
+
class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
|
48
|
+
class.define_method("normalizer=", method!(RbTokenizer::set_normalizer, 1))?;
|
49
|
+
|
50
|
+
let class = module.define_class("Encoding", Default::default())?;
|
51
|
+
class.define_method("ids", method!(RbEncoding::ids, 0))?;
|
52
|
+
class.define_method("tokens", method!(RbEncoding::tokens, 0))?;
|
53
|
+
|
54
|
+
let class = module.define_class("BPEDecoder", Default::default())?;
|
55
|
+
class.define_singleton_method("new", function!(RbBPEDecoder::new, 0))?;
|
56
|
+
|
57
|
+
let class = module.define_class("BertPreTokenizer", Default::default())?;
|
58
|
+
class.define_singleton_method("new", function!(RbBertPreTokenizer::new, 0))?;
|
59
|
+
|
60
|
+
let class = module.define_class("BertNormalizer", Default::default())?;
|
61
|
+
class.define_singleton_method("new", function!(RbBertNormalizer::new, 0))?;
|
62
|
+
|
63
|
+
Ok(())
|
64
|
+
}
|
data/ext/tokenizers/src/models.rs
ADDED
@@ -0,0 +1,19 @@
|
|
1
|
+
use tk::models::bpe::BPE;
|
2
|
+
|
3
|
+
use super::{RbError, RbResult};
|
4
|
+
|
5
|
+
#[magnus::wrap(class = "Tokenizers::BPE")]
|
6
|
+
pub struct RbBPE {
|
7
|
+
pub model: BPE,
|
8
|
+
}
|
9
|
+
|
10
|
+
impl RbBPE {
|
11
|
+
pub fn new(vocab: String, merges: String) -> RbResult<Self> {
|
12
|
+
BPE::from_file(&vocab, &merges)
|
13
|
+
.unk_token("<unk>".into())
|
14
|
+
.end_of_word_suffix("</w>".into())
|
15
|
+
.build()
|
16
|
+
.map(|v| RbBPE { model: v })
|
17
|
+
.map_err(RbError::from)
|
18
|
+
}
|
19
|
+
}
|
data/ext/tokenizers/src/normalizers.rs
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
use tk::normalizers::BertNormalizer;
|
2
|
+
|
3
|
+
#[magnus::wrap(class = "Tokenizers::BertNormalizer")]
|
4
|
+
pub struct RbBertNormalizer {
|
5
|
+
pub normalizer: BertNormalizer,
|
6
|
+
}
|
7
|
+
|
8
|
+
impl RbBertNormalizer {
|
9
|
+
pub fn new() -> Self {
|
10
|
+
RbBertNormalizer {
|
11
|
+
normalizer: BertNormalizer::default(),
|
12
|
+
}
|
13
|
+
}
|
14
|
+
}
|
data/ext/tokenizers/src/pre_tokenizers.rs
ADDED
@@ -0,0 +1,14 @@
|
|
1
|
+
use tk::pre_tokenizers::bert::BertPreTokenizer;
|
2
|
+
|
3
|
+
#[magnus::wrap(class = "Tokenizers::BertPreTokenizer")]
|
4
|
+
pub struct RbBertPreTokenizer {
|
5
|
+
pub pretok: BertPreTokenizer,
|
6
|
+
}
|
7
|
+
|
8
|
+
impl RbBertPreTokenizer {
|
9
|
+
pub fn new() -> Self {
|
10
|
+
RbBertPreTokenizer {
|
11
|
+
pretok: BertPreTokenizer,
|
12
|
+
}
|
13
|
+
}
|
14
|
+
}
|
data/ext/tokenizers/src/tokenizer.rs
ADDED
@@ -0,0 +1,85 @@
|
|
1
|
+
use magnus::Module;
|
2
|
+
use std::cell::RefCell;
|
3
|
+
use tk::tokenizer::Tokenizer;
|
4
|
+
use tk::AddedToken;
|
5
|
+
|
6
|
+
use super::decoders::RbBPEDecoder;
|
7
|
+
use super::encoding::RbEncoding;
|
8
|
+
use super::models::RbBPE;
|
9
|
+
use super::normalizers::RbBertNormalizer;
|
10
|
+
use super::pre_tokenizers::RbBertPreTokenizer;
|
11
|
+
use super::{module, RbError, RbResult};
|
12
|
+
|
13
|
+
#[magnus::wrap(class = "Tokenizers::Tokenizer")]
|
14
|
+
pub struct RbTokenizer {
|
15
|
+
tokenizer: RefCell<Tokenizer>,
|
16
|
+
}
|
17
|
+
|
18
|
+
impl RbTokenizer {
|
19
|
+
pub fn new(model: &RbBPE) -> Self {
|
20
|
+
Self {
|
21
|
+
tokenizer: RefCell::new(Tokenizer::new(model.model.clone())),
|
22
|
+
}
|
23
|
+
}
|
24
|
+
|
25
|
+
pub fn from_pretrained(
|
26
|
+
identifier: String,
|
27
|
+
revision: String,
|
28
|
+
auth_token: Option<String>,
|
29
|
+
) -> RbResult<Self> {
|
30
|
+
let version = module().const_get("VERSION").unwrap();
|
31
|
+
let params = tk::FromPretrainedParameters {
|
32
|
+
revision,
|
33
|
+
auth_token,
|
34
|
+
user_agent: [("bindings", "Ruby".to_string()), ("version", version)]
|
35
|
+
.iter()
|
36
|
+
.map(|(k, v)| (k.to_string(), v.to_string()))
|
37
|
+
.collect(),
|
38
|
+
};
|
39
|
+
|
40
|
+
Tokenizer::from_pretrained(identifier, Some(params))
|
41
|
+
.map(|v| RbTokenizer {
|
42
|
+
tokenizer: RefCell::new(v),
|
43
|
+
})
|
44
|
+
.map_err(RbError::from)
|
45
|
+
}
|
46
|
+
|
47
|
+
pub fn add_special_tokens(&self, tokens: Vec<String>) {
|
48
|
+
let tokens: Vec<AddedToken> = tokens.iter().map(|t| AddedToken::from(t, true)).collect();
|
49
|
+
self.tokenizer.borrow_mut().add_special_tokens(&tokens);
|
50
|
+
// TODO return self
|
51
|
+
}
|
52
|
+
|
53
|
+
pub fn encode(&self, text: String) -> RbResult<RbEncoding> {
|
54
|
+
self.tokenizer
|
55
|
+
.borrow()
|
56
|
+
.encode(text, false)
|
57
|
+
.map(|v| RbEncoding { encoding: v })
|
58
|
+
.map_err(RbError::from)
|
59
|
+
}
|
60
|
+
|
61
|
+
pub fn decode(&self, ids: Vec<u32>) -> RbResult<String> {
|
62
|
+
self.tokenizer
|
63
|
+
.borrow()
|
64
|
+
.decode(ids, true)
|
65
|
+
.map_err(RbError::from)
|
66
|
+
}
|
67
|
+
|
68
|
+
pub fn set_decoder(&self, decoder: &RbBPEDecoder) {
|
69
|
+
self.tokenizer
|
70
|
+
.borrow_mut()
|
71
|
+
.with_decoder(decoder.decoder.clone());
|
72
|
+
}
|
73
|
+
|
74
|
+
pub fn set_pre_tokenizer(&self, pre_tokenizer: &RbBertPreTokenizer) {
|
75
|
+
self.tokenizer
|
76
|
+
.borrow_mut()
|
77
|
+
.with_pre_tokenizer(pre_tokenizer.pretok);
|
78
|
+
}
|
79
|
+
|
80
|
+
pub fn set_normalizer(&self, normalizer: &RbBertNormalizer) {
|
81
|
+
self.tokenizer
|
82
|
+
.borrow_mut()
|
83
|
+
.with_normalizer(normalizer.normalizer);
|
84
|
+
}
|
85
|
+
}
|
data/lib/tokenizers/version.rb
CHANGED
data/lib/tokenizers.rb
CHANGED
metadata
CHANGED
@@ -1,15 +1,29 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: tokenizers
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.1.3
|
4
|
+
version: 0.2.1
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
8
|
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
12
|
-
dependencies:
|
11
|
+
date: 2023-01-12 00:00:00.000000000 Z
|
12
|
+
dependencies:
|
13
|
+
- !ruby/object:Gem::Dependency
|
14
|
+
name: rb_sys
|
15
|
+
requirement: !ruby/object:Gem::Requirement
|
16
|
+
requirements:
|
17
|
+
- - ">="
|
18
|
+
- !ruby/object:Gem::Version
|
19
|
+
version: '0'
|
20
|
+
type: :runtime
|
21
|
+
prerelease: false
|
22
|
+
version_requirements: !ruby/object:Gem::Requirement
|
23
|
+
requirements:
|
24
|
+
- - ">="
|
25
|
+
- !ruby/object:Gem::Version
|
26
|
+
version: '0'
|
13
27
|
description:
|
14
28
|
email: andrew@ankane.org
|
15
29
|
executables: []
|
@@ -22,11 +36,19 @@ files:
|
|
22
36
|
- Cargo.toml
|
23
37
|
- LICENSE.txt
|
24
38
|
- README.md
|
39
|
+
- ext/tokenizers/Cargo.toml
|
25
40
|
- ext/tokenizers/extconf.rb
|
41
|
+
- ext/tokenizers/src/decoders.rs
|
42
|
+
- ext/tokenizers/src/encoding.rs
|
43
|
+
- ext/tokenizers/src/error.rs
|
44
|
+
- ext/tokenizers/src/lib.rs
|
45
|
+
- ext/tokenizers/src/models.rs
|
46
|
+
- ext/tokenizers/src/normalizers.rs
|
47
|
+
- ext/tokenizers/src/pre_tokenizers.rs
|
48
|
+
- ext/tokenizers/src/tokenizer.rs
|
26
49
|
- lib/tokenizers.rb
|
27
50
|
- lib/tokenizers/char_bpe_tokenizer.rb
|
28
51
|
- lib/tokenizers/version.rb
|
29
|
-
- src/lib.rs
|
30
52
|
homepage: https://github.com/ankane/tokenizers-ruby
|
31
53
|
licenses:
|
32
54
|
- Apache-2.0
|
@@ -46,7 +68,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
|
|
46
68
|
- !ruby/object:Gem::Version
|
47
69
|
version: '0'
|
48
70
|
requirements: []
|
49
|
-
rubygems_version: 3.
|
71
|
+
rubygems_version: 3.4.1
|
50
72
|
signing_key:
|
51
73
|
specification_version: 4
|
52
74
|
summary: Fast state-of-the-art tokenizers for Ruby
|