tokenizers 0.1.3 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/Cargo.lock +478 -400
- data/Cargo.toml +9 -12
- data/README.md +1 -3
- data/ext/tokenizers/Cargo.toml +17 -0
- data/ext/tokenizers/extconf.rb +3 -10
- data/ext/tokenizers/src/decoders.rs +14 -0
- data/ext/tokenizers/src/encoding.rs +16 -0
- data/ext/tokenizers/src/error.rs +16 -0
- data/ext/tokenizers/src/lib.rs +64 -0
- data/ext/tokenizers/src/models.rs +19 -0
- data/ext/tokenizers/src/normalizers.rs +14 -0
- data/ext/tokenizers/src/pre_tokenizers.rs +14 -0
- data/ext/tokenizers/src/tokenizer.rs +85 -0
- data/lib/tokenizers/version.rb +1 -1
- data/lib/tokenizers.rb +6 -2
- metadata +27 -5
- data/src/lib.rs +0 -290
data/Cargo.toml
CHANGED
@@ -1,14 +1,11 @@
-[
-
-version = "0.1.0"
-authors = ["Andrew Kane <andrew@ankane.org>"]
-edition = "2018"
+[workspace]
+members = ["ext/tokenizers"]
 
-[
-
-crate-type = ["cdylib"]
+[profile.release]
+strip = true
 
-[
-
-
-
+[patch.crates-io]
+magnus-macros = { git = "https://github.com/matsadler/magnus" }
+number_prefix = { git = "https://github.com/ankane/rust-number-prefix", branch = "license-file" }
+rb-sys-env = { git = "https://github.com/oxidize-rb/rb-sys" }
+tokenizers = { git = "https://github.com/huggingface/tokenizers" }
data/README.md
CHANGED
@@ -12,8 +12,6 @@ Add this line to your application’s Gemfile:
 gem "tokenizers"
 ```
 
-Note: Rust and pkg-config are currently required for installation, and it can take 5-10 minutes to compile the extension.
-
 ## Getting Started
 
 Load a pretrained tokenizer
@@ -61,7 +59,7 @@ To get started with development:
 git clone https://github.com/ankane/tokenizers-ruby.git
 cd tokenizers-ruby
 bundle install
-bundle exec
+bundle exec rake compile
 bundle exec rake download:files
 bundle exec rake test
 ```
data/ext/tokenizers/Cargo.toml
ADDED
@@ -0,0 +1,17 @@
+[package]
+name = "tokenizers"
+version = "0.2.0"
+authors = ["Andrew Kane <andrew@ankane.org>"]
+edition = "2021"
+publish = false
+
+[lib]
+crate-type = ["cdylib"]
+
+[dependencies]
+magnus = "0.4"
+
+[dependencies.tokenizers]
+version = "0.13.2"
+default-features = false
+features = ["progressbar", "http", "onig", "esaxx_fast"]
data/ext/tokenizers/extconf.rb
CHANGED
@@ -1,11 +1,4 @@
-require "
+require "mkmf"
+require "rb_sys/mkmf"
 
-
-
-File.write "Makefile", <<~EOS
-all:
-\tcargo build --release --target-dir target
-install:
-\tmv target/release/libtokenizers.#{RbConfig::CONFIG["SOEXT"]} #{dest}
-clean:
-EOS
+create_rust_makefile("tokenizers/tokenizers")
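
The rewritten extconf.rb drops the hand-written Makefile in favor of rb_sys's create_rust_makefile, which is what makes the README's new `bundle exec rake compile` step work. For context, a minimal sketch of the Rakefile side (not part of this diff; the task wiring and lib_dir are assumptions based on rb_sys conventions):

```ruby
# Hypothetical Rakefile wiring for `rake compile` with rb_sys.
require "rb_sys/extensiontask"

task default: :compile

RbSys::ExtensionTask.new("tokenizers") do |ext|
  # Assumption: the compiled library lands in lib/tokenizers,
  # matching the "tokenizers/tokenizers" target passed to extconf.rb.
  ext.lib_dir = "lib/tokenizers"
end
```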
data/ext/tokenizers/src/decoders.rs
ADDED
@@ -0,0 +1,14 @@
+use tk::decoders::bpe::BPEDecoder;
+
+#[magnus::wrap(class = "Tokenizers::BPEDecoder")]
+pub struct RbBPEDecoder {
+    pub decoder: BPEDecoder,
+}
+
+impl RbBPEDecoder {
+    pub fn new() -> Self {
+        RbBPEDecoder {
+            decoder: BPEDecoder::default(),
+        }
+    }
+}
data/ext/tokenizers/src/encoding.rs
ADDED
@@ -0,0 +1,16 @@
+use tk::Encoding;
+
+#[magnus::wrap(class = "Tokenizers::Encoding")]
+pub struct RbEncoding {
+    pub encoding: Encoding,
+}
+
+impl RbEncoding {
+    pub fn ids(&self) -> Vec<u32> {
+        self.encoding.get_ids().into()
+    }
+
+    pub fn tokens(&self) -> Vec<String> {
+        self.encoding.get_tokens().into()
+    }
+}
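
RbEncoding exposes only the two read accessors above, so on the Ruby side an encoding is effectively a pair of parallel arrays. A minimal sketch, assuming a tokenizer built elsewhere (the printed values are illustrative, not real output):

```ruby
encoding = tokenizer.encode("hello world")
encoding.tokens # => e.g. ["hello</w>", "world</w>"]
encoding.ids    # => e.g. [9021, 1174]
```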
data/ext/tokenizers/src/error.rs
ADDED
@@ -0,0 +1,16 @@
+use magnus::{exception, memoize, Error, ExceptionClass, Module};
+
+use super::module;
+
+pub struct RbError {}
+
+impl RbError {
+    // convert to Error instead of Self
+    pub fn from(e: Box<dyn std::error::Error + Send + Sync>) -> Error {
+        Error::new(error(), e.to_string())
+    }
+}
+
+fn error() -> ExceptionClass {
+    *memoize!(ExceptionClass: module().define_error("Error", exception::standard_error()).unwrap())
+}
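
Since error() registers Tokenizers::Error as a StandardError subclass and RbError::from funnels every Rust-side failure through it, extension errors can be rescued like ordinary Ruby exceptions. A sketch (the paths are deliberately bogus to force the error branch):

```ruby
begin
  # BPE.new maps BPE::from_file failures through RbError::from.
  Tokenizers::BPE.new("missing-vocab.json", "missing-merges.txt")
rescue Tokenizers::Error => e
  puts e.message
end
```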
data/ext/tokenizers/src/lib.rs
ADDED
@@ -0,0 +1,64 @@
+extern crate tokenizers as tk;
+
+mod decoders;
+mod encoding;
+mod error;
+mod models;
+mod normalizers;
+mod pre_tokenizers;
+mod tokenizer;
+
+use decoders::RbBPEDecoder;
+use encoding::RbEncoding;
+use error::RbError;
+use models::RbBPE;
+use normalizers::RbBertNormalizer;
+use pre_tokenizers::RbBertPreTokenizer;
+use tokenizer::RbTokenizer;
+
+use magnus::{define_module, function, memoize, method, prelude::*, Error, RModule};
+
+type RbResult<T> = Result<T, Error>;
+
+fn module() -> RModule {
+    *memoize!(RModule: define_module("Tokenizers").unwrap())
+}
+
+#[magnus::init]
+fn init() -> RbResult<()> {
+    let module = module();
+    module.define_singleton_method(
+        "_from_pretrained",
+        function!(RbTokenizer::from_pretrained, 3),
+    )?;
+
+    let class = module.define_class("BPE", Default::default())?;
+    class.define_singleton_method("new", function!(RbBPE::new, 2))?;
+
+    let class = module.define_class("Tokenizer", Default::default())?;
+    class.define_singleton_method("new", function!(RbTokenizer::new, 1))?;
+    class.define_method(
+        "add_special_tokens",
+        method!(RbTokenizer::add_special_tokens, 1),
+    )?;
+    class.define_method("encode", method!(RbTokenizer::encode, 1))?;
+    class.define_method("decode", method!(RbTokenizer::decode, 1))?;
+    class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
+    class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
+    class.define_method("normalizer=", method!(RbTokenizer::set_normalizer, 1))?;
+
+    let class = module.define_class("Encoding", Default::default())?;
+    class.define_method("ids", method!(RbEncoding::ids, 0))?;
+    class.define_method("tokens", method!(RbEncoding::tokens, 0))?;
+
+    let class = module.define_class("BPEDecoder", Default::default())?;
+    class.define_singleton_method("new", function!(RbBPEDecoder::new, 0))?;
+
+    let class = module.define_class("BertPreTokenizer", Default::default())?;
+    class.define_singleton_method("new", function!(RbBertPreTokenizer::new, 0))?;
+
+    let class = module.define_class("BertNormalizer", Default::default())?;
+    class.define_singleton_method("new", function!(RbBertNormalizer::new, 0))?;
+
+    Ok(())
+}
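
init() is the single registration point, so the Ruby-visible surface of the extension can be read straight off it. Roughly, as a hypothetical IRB transcript (return values are illustrative; the module also carries VERSION and the pure-Ruby CharBPETokenizer, which init() does not define):

```ruby
Tokenizers::Tokenizer.instance_methods(false).sort
# => [:add_special_tokens, :decode, :decoder=, :encode, :normalizer=, :pre_tokenizer=]
Tokenizers::Encoding.instance_methods(false).sort
# => [:ids, :tokens]
```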
data/ext/tokenizers/src/models.rs
ADDED
@@ -0,0 +1,19 @@
+use tk::models::bpe::BPE;
+
+use super::{RbError, RbResult};
+
+#[magnus::wrap(class = "Tokenizers::BPE")]
+pub struct RbBPE {
+    pub model: BPE,
+}
+
+impl RbBPE {
+    pub fn new(vocab: String, merges: String) -> RbResult<Self> {
+        BPE::from_file(&vocab, &merges)
+            .unk_token("<unk>".into())
+            .end_of_word_suffix("</w>".into())
+            .build()
+            .map(|v| RbBPE { model: v })
+            .map_err(RbError::from)
+    }
+}
data/ext/tokenizers/src/normalizers.rs
ADDED
@@ -0,0 +1,14 @@
+use tk::normalizers::BertNormalizer;
+
+#[magnus::wrap(class = "Tokenizers::BertNormalizer")]
+pub struct RbBertNormalizer {
+    pub normalizer: BertNormalizer,
+}
+
+impl RbBertNormalizer {
+    pub fn new() -> Self {
+        RbBertNormalizer {
+            normalizer: BertNormalizer::default(),
+        }
+    }
+}
data/ext/tokenizers/src/pre_tokenizers.rs
ADDED
@@ -0,0 +1,14 @@
+use tk::pre_tokenizers::bert::BertPreTokenizer;
+
+#[magnus::wrap(class = "Tokenizers::BertPreTokenizer")]
+pub struct RbBertPreTokenizer {
+    pub pretok: BertPreTokenizer,
+}
+
+impl RbBertPreTokenizer {
+    pub fn new() -> Self {
+        RbBertPreTokenizer {
+            pretok: BertPreTokenizer,
+        }
+    }
+}
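
Together, models.rs, decoders.rs, normalizers.rs, and pre_tokenizers.rs cover exactly one pipeline: character-level BPE. A sketch of assembling it from Ruby, mirroring what lib/tokenizers/char_bpe_tokenizer.rb presumably wraps (the vocab and merges paths are placeholders):

```ruby
# BPE.new hardcodes "<unk>" and the "</w>" suffix (see models.rs above).
model = Tokenizers::BPE.new("vocab.json", "merges.txt")

tokenizer = Tokenizers::Tokenizer.new(model)
tokenizer.decoder = Tokenizers::BPEDecoder.new
tokenizer.pre_tokenizer = Tokenizers::BertPreTokenizer.new
tokenizer.normalizer = Tokenizers::BertNormalizer.new
```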
data/ext/tokenizers/src/tokenizer.rs
ADDED
@@ -0,0 +1,85 @@
+use magnus::Module;
+use std::cell::RefCell;
+use tk::tokenizer::Tokenizer;
+use tk::AddedToken;
+
+use super::decoders::RbBPEDecoder;
+use super::encoding::RbEncoding;
+use super::models::RbBPE;
+use super::normalizers::RbBertNormalizer;
+use super::pre_tokenizers::RbBertPreTokenizer;
+use super::{module, RbError, RbResult};
+
+#[magnus::wrap(class = "Tokenizers::Tokenizer")]
+pub struct RbTokenizer {
+    tokenizer: RefCell<Tokenizer>,
+}
+
+impl RbTokenizer {
+    pub fn new(model: &RbBPE) -> Self {
+        Self {
+            tokenizer: RefCell::new(Tokenizer::new(model.model.clone())),
+        }
+    }
+
+    pub fn from_pretrained(
+        identifier: String,
+        revision: String,
+        auth_token: Option<String>,
+    ) -> RbResult<Self> {
+        let version = module().const_get("VERSION").unwrap();
+        let params = tk::FromPretrainedParameters {
+            revision,
+            auth_token,
+            user_agent: [("bindings", "Ruby".to_string()), ("version", version)]
+                .iter()
+                .map(|(k, v)| (k.to_string(), v.to_string()))
+                .collect(),
+        };
+
+        Tokenizer::from_pretrained(identifier, Some(params))
+            .map(|v| RbTokenizer {
+                tokenizer: RefCell::new(v),
+            })
+            .map_err(RbError::from)
+    }
+
+    pub fn add_special_tokens(&self, tokens: Vec<String>) {
+        let tokens: Vec<AddedToken> = tokens.iter().map(|t| AddedToken::from(t, true)).collect();
+        self.tokenizer.borrow_mut().add_special_tokens(&tokens);
+        // TODO return self
+    }
+
+    pub fn encode(&self, text: String) -> RbResult<RbEncoding> {
+        self.tokenizer
+            .borrow()
+            .encode(text, false)
+            .map(|v| RbEncoding { encoding: v })
+            .map_err(RbError::from)
+    }
+
+    pub fn decode(&self, ids: Vec<u32>) -> RbResult<String> {
+        self.tokenizer
+            .borrow()
+            .decode(ids, true)
+            .map_err(RbError::from)
+    }
+
+    pub fn set_decoder(&self, decoder: &RbBPEDecoder) {
+        self.tokenizer
+            .borrow_mut()
+            .with_decoder(decoder.decoder.clone());
+    }
+
+    pub fn set_pre_tokenizer(&self, pre_tokenizer: &RbBertPreTokenizer) {
+        self.tokenizer
+            .borrow_mut()
+            .with_pre_tokenizer(pre_tokenizer.pretok);
+    }
+
+    pub fn set_normalizer(&self, normalizer: &RbBertNormalizer) {
+        self.tokenizer
+            .borrow_mut()
+            .with_normalizer(normalizer.normalizer);
+    }
+}
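
from_pretrained takes the identifier, revision, and auth token positionally, matching the 3-arity _from_pretrained binding registered in lib.rs; the public wrapper in lib/tokenizers.rb presumably supplies defaults for the last two. A round-trip sketch (the model name is illustrative, and decode output depends on the model's vocabulary):

```ruby
tokenizer = Tokenizers._from_pretrained("bert-base-uncased", "main", nil)

# encode(text, false) above means no special tokens are added;
# decode(ids, true) skips any special tokens on the way back.
encoding = tokenizer.encode("hello world")
tokenizer.decode(encoding.ids) # => "hello world" (approximately)
```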
data/lib/tokenizers/version.rb
CHANGED
data/lib/tokenizers.rb
CHANGED
metadata
CHANGED
@@ -1,15 +1,29 @@
 --- !ruby/object:Gem::Specification
 name: tokenizers
 version: !ruby/object:Gem::Version
-  version: 0.1.3
+  version: 0.2.0
 platform: ruby
 authors:
 - Andrew Kane
 autorequire:
 bindir: bin
 cert_chain: []
-date: 2022-
-dependencies:
+date: 2022-12-11 00:00:00.000000000 Z
+dependencies:
+- !ruby/object:Gem::Dependency
+  name: rb_sys
+  requirement: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
+  type: :runtime
+  prerelease: false
+  version_requirements: !ruby/object:Gem::Requirement
+    requirements:
+    - - ">="
+      - !ruby/object:Gem::Version
+        version: '0'
 description:
 email: andrew@ankane.org
 executables: []
@@ -22,11 +36,19 @@ files:
 - Cargo.toml
 - LICENSE.txt
 - README.md
+- ext/tokenizers/Cargo.toml
 - ext/tokenizers/extconf.rb
+- ext/tokenizers/src/decoders.rs
+- ext/tokenizers/src/encoding.rs
+- ext/tokenizers/src/error.rs
+- ext/tokenizers/src/lib.rs
+- ext/tokenizers/src/models.rs
+- ext/tokenizers/src/normalizers.rs
+- ext/tokenizers/src/pre_tokenizers.rs
+- ext/tokenizers/src/tokenizer.rs
 - lib/tokenizers.rb
 - lib/tokenizers/char_bpe_tokenizer.rb
 - lib/tokenizers/version.rb
-- src/lib.rs
 homepage: https://github.com/ankane/tokenizers-ruby
 licenses:
 - Apache-2.0
@@ -46,7 +68,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
 - !ruby/object:Gem::Version
   version: '0'
 requirements: []
-rubygems_version: 3.3.
+rubygems_version: 3.3.26
 signing_key:
 specification_version: 4
 summary: Fast state-of-the-art tokenizers for Ruby