tokenizers 0.1.3 → 0.2.0

data/Cargo.toml CHANGED
@@ -1,14 +1,11 @@
- [package]
- name = "tokenizers-ruby"
- version = "0.1.0"
- authors = ["Andrew Kane <andrew@ankane.org>"]
- edition = "2018"
+ [workspace]
+ members = ["ext/tokenizers"]
 
- [lib]
- name = "tokenizers"
- crate-type = ["cdylib"]
+ [profile.release]
+ strip = true
 
- [dependencies]
- lazy_static = "1"
- rutie = "0.8.4"
- tokenizers = "0.13.1"
+ [patch.crates-io]
+ magnus-macros = { git = "https://github.com/matsadler/magnus" }
+ number_prefix = { git = "https://github.com/ankane/rust-number-prefix", branch = "license-file" }
+ rb-sys-env = { git = "https://github.com/oxidize-rb/rb-sys" }
+ tokenizers = { git = "https://github.com/huggingface/tokenizers" }
data/README.md CHANGED
@@ -12,8 +12,6 @@ Add this line to your application’s Gemfile:
  gem "tokenizers"
  ```
 
- Note: Rust and pkg-config are currently required for installation, and it can take 5-10 minutes to compile the extension.
-
  ## Getting Started
 
  Load a pretrained tokenizer
@@ -61,7 +59,7 @@ To get started with development:
  git clone https://github.com/ankane/tokenizers-ruby.git
  cd tokenizers-ruby
  bundle install
- bundle exec ruby ext/tokenizers/extconf.rb && make && make install
+ bundle exec rake compile
  bundle exec rake download:files
  bundle exec rake test
  ```
data/ext/tokenizers/Cargo.toml ADDED
@@ -0,0 +1,17 @@
+ [package]
+ name = "tokenizers"
+ version = "0.2.0"
+ authors = ["Andrew Kane <andrew@ankane.org>"]
+ edition = "2021"
+ publish = false
+
+ [lib]
+ crate-type = ["cdylib"]
+
+ [dependencies]
+ magnus = "0.4"
+
+ [dependencies.tokenizers]
+ version = "0.13.2"
+ default-features = false
+ features = ["progressbar", "http", "onig", "esaxx_fast"]
data/ext/tokenizers/extconf.rb CHANGED
@@ -1,11 +1,4 @@
- require "pathname"
+ require "mkmf"
+ require "rb_sys/mkmf"
 
- dest = Pathname.new(__dir__).relative_path_from(Pathname.pwd).join("../../lib/tokenizers/ext.#{RbConfig::CONFIG["DLEXT"]}")
-
- File.write "Makefile", <<~EOS
- all:
- \tcargo build --release --target-dir target
- install:
- \tmv target/release/libtokenizers.#{RbConfig::CONFIG["SOEXT"]} #{dest}
- clean:
- EOS
+ create_rust_makefile("tokenizers/tokenizers")
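
For context, `create_rust_makefile` comes from the rb_sys gem and generates a cargo-aware Makefile, which is what lets the README's `bundle exec rake compile` step replace the old hand-written Makefile. A minimal Rakefile wiring this up might look like the following sketch (hypothetical; the actual Rakefile is not part of this diff, and the gemspec filename is assumed):

  # Rakefile (sketch; rb_sys's ExtensionTask API assumed)
  require "rb_sys/extensiontask"

  GEMSPEC = Gem::Specification.load("tokenizers.gemspec")

  # Builds ext/tokenizers with cargo and copies the shared library into
  # lib/tokenizers, matching create_rust_makefile("tokenizers/tokenizers")
  RbSys::ExtensionTask.new("tokenizers", GEMSPEC) do |ext|
    ext.lib_dir = "lib/tokenizers"
  end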
data/ext/tokenizers/src/decoders.rs ADDED
@@ -0,0 +1,14 @@
+ use tk::decoders::bpe::BPEDecoder;
+
+ #[magnus::wrap(class = "Tokenizers::BPEDecoder")]
+ pub struct RbBPEDecoder {
+     pub decoder: BPEDecoder,
+ }
+
+ impl RbBPEDecoder {
+     pub fn new() -> Self {
+         RbBPEDecoder {
+             decoder: BPEDecoder::default(),
+         }
+     }
+ }
data/ext/tokenizers/src/encoding.rs ADDED
@@ -0,0 +1,16 @@
+ use tk::Encoding;
+
+ #[magnus::wrap(class = "Tokenizers::Encoding")]
+ pub struct RbEncoding {
+     pub encoding: Encoding,
+ }
+
+ impl RbEncoding {
+     pub fn ids(&self) -> Vec<u32> {
+         self.encoding.get_ids().into()
+     }
+
+     pub fn tokens(&self) -> Vec<String> {
+         self.encoding.get_tokens().into()
+     }
+ }
data/ext/tokenizers/src/error.rs ADDED
@@ -0,0 +1,16 @@
+ use magnus::{exception, memoize, Error, ExceptionClass, Module};
+
+ use super::module;
+
+ pub struct RbError {}
+
+ impl RbError {
+     // convert to Error instead of Self
+     pub fn from(e: Box<dyn std::error::Error + Send + Sync>) -> Error {
+         Error::new(error(), e.to_string())
+     }
+ }
+
+ fn error() -> ExceptionClass {
+     *memoize!(ExceptionClass: module().define_error("Error", exception::standard_error()).unwrap())
+ }
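
The memoized helper above lazily defines `Tokenizers::Error` as a subclass of `StandardError`, and the fallible binding methods in the files below funnel their failures through `RbError::from`. From Ruby, everything therefore surfaces as one exception class; a small sketch (file paths are placeholders):

  begin
    Tokenizers::BPE.new("missing-vocab.json", "missing-merges.txt")
  rescue Tokenizers::Error => e
    puts e.message  # the underlying tokenizers error, stringified
  end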
data/ext/tokenizers/src/lib.rs ADDED
@@ -0,0 +1,64 @@
+ extern crate tokenizers as tk;
+
+ mod decoders;
+ mod encoding;
+ mod error;
+ mod models;
+ mod normalizers;
+ mod pre_tokenizers;
+ mod tokenizer;
+
+ use decoders::RbBPEDecoder;
+ use encoding::RbEncoding;
+ use error::RbError;
+ use models::RbBPE;
+ use normalizers::RbBertNormalizer;
+ use pre_tokenizers::RbBertPreTokenizer;
+ use tokenizer::RbTokenizer;
+
+ use magnus::{define_module, function, memoize, method, prelude::*, Error, RModule};
+
+ type RbResult<T> = Result<T, Error>;
+
+ fn module() -> RModule {
+     *memoize!(RModule: define_module("Tokenizers").unwrap())
+ }
+
+ #[magnus::init]
+ fn init() -> RbResult<()> {
+     let module = module();
+     module.define_singleton_method(
+         "_from_pretrained",
+         function!(RbTokenizer::from_pretrained, 3),
+     )?;
+
+     let class = module.define_class("BPE", Default::default())?;
+     class.define_singleton_method("new", function!(RbBPE::new, 2))?;
+
+     let class = module.define_class("Tokenizer", Default::default())?;
+     class.define_singleton_method("new", function!(RbTokenizer::new, 1))?;
+     class.define_method(
+         "add_special_tokens",
+         method!(RbTokenizer::add_special_tokens, 1),
+     )?;
+     class.define_method("encode", method!(RbTokenizer::encode, 1))?;
+     class.define_method("decode", method!(RbTokenizer::decode, 1))?;
+     class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
+     class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
+     class.define_method("normalizer=", method!(RbTokenizer::set_normalizer, 1))?;
+
+     let class = module.define_class("Encoding", Default::default())?;
+     class.define_method("ids", method!(RbEncoding::ids, 0))?;
+     class.define_method("tokens", method!(RbEncoding::tokens, 0))?;
+
+     let class = module.define_class("BPEDecoder", Default::default())?;
+     class.define_singleton_method("new", function!(RbBPEDecoder::new, 0))?;
+
+     let class = module.define_class("BertPreTokenizer", Default::default())?;
+     class.define_singleton_method("new", function!(RbBertPreTokenizer::new, 0))?;
+
+     let class = module.define_class("BertNormalizer", Default::default())?;
+     class.define_singleton_method("new", function!(RbBertNormalizer::new, 0))?;
+
+     Ok(())
+ }
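
Taken together, the registrations above define the gem's Ruby surface. A usage sketch (file paths and input are placeholders, not taken from this diff):

  model = Tokenizers::BPE.new("vocab.json", "merges.txt")

  tokenizer = Tokenizers::Tokenizer.new(model)
  tokenizer.normalizer = Tokenizers::BertNormalizer.new
  tokenizer.pre_tokenizer = Tokenizers::BertPreTokenizer.new
  tokenizer.decoder = Tokenizers::BPEDecoder.new

  encoding = tokenizer.encode("Hello world")
  encoding.ids     # => array of integer token ids
  encoding.tokens  # => array of token strings
  tokenizer.decode(encoding.ids)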
data/ext/tokenizers/src/models.rs ADDED
@@ -0,0 +1,19 @@
+ use tk::models::bpe::BPE;
+
+ use super::{RbError, RbResult};
+
+ #[magnus::wrap(class = "Tokenizers::BPE")]
+ pub struct RbBPE {
+     pub model: BPE,
+ }
+
+ impl RbBPE {
+     pub fn new(vocab: String, merges: String) -> RbResult<Self> {
+         BPE::from_file(&vocab, &merges)
+             .unk_token("<unk>".into())
+             .end_of_word_suffix("</w>".into())
+             .build()
+             .map(|v| RbBPE { model: v })
+             .map_err(RbError::from)
+     }
+ }
data/ext/tokenizers/src/normalizers.rs ADDED
@@ -0,0 +1,14 @@
+ use tk::normalizers::BertNormalizer;
+
+ #[magnus::wrap(class = "Tokenizers::BertNormalizer")]
+ pub struct RbBertNormalizer {
+     pub normalizer: BertNormalizer,
+ }
+
+ impl RbBertNormalizer {
+     pub fn new() -> Self {
+         RbBertNormalizer {
+             normalizer: BertNormalizer::default(),
+         }
+     }
+ }
data/ext/tokenizers/src/pre_tokenizers.rs ADDED
@@ -0,0 +1,14 @@
+ use tk::pre_tokenizers::bert::BertPreTokenizer;
+
+ #[magnus::wrap(class = "Tokenizers::BertPreTokenizer")]
+ pub struct RbBertPreTokenizer {
+     pub pretok: BertPreTokenizer,
+ }
+
+ impl RbBertPreTokenizer {
+     pub fn new() -> Self {
+         RbBertPreTokenizer {
+             pretok: BertPreTokenizer,
+         }
+     }
+ }
data/ext/tokenizers/src/tokenizer.rs ADDED
@@ -0,0 +1,85 @@
+ use magnus::Module;
+ use std::cell::RefCell;
+ use tk::tokenizer::Tokenizer;
+ use tk::AddedToken;
+
+ use super::decoders::RbBPEDecoder;
+ use super::encoding::RbEncoding;
+ use super::models::RbBPE;
+ use super::normalizers::RbBertNormalizer;
+ use super::pre_tokenizers::RbBertPreTokenizer;
+ use super::{module, RbError, RbResult};
+
+ #[magnus::wrap(class = "Tokenizers::Tokenizer")]
+ pub struct RbTokenizer {
+     tokenizer: RefCell<Tokenizer>,
+ }
+
+ impl RbTokenizer {
+     pub fn new(model: &RbBPE) -> Self {
+         Self {
+             tokenizer: RefCell::new(Tokenizer::new(model.model.clone())),
+         }
+     }
+
+     pub fn from_pretrained(
+         identifier: String,
+         revision: String,
+         auth_token: Option<String>,
+     ) -> RbResult<Self> {
+         let version = module().const_get("VERSION").unwrap();
+         let params = tk::FromPretrainedParameters {
+             revision,
+             auth_token,
+             user_agent: [("bindings", "Ruby".to_string()), ("version", version)]
+                 .iter()
+                 .map(|(k, v)| (k.to_string(), v.to_string()))
+                 .collect(),
+         };
+
+         Tokenizer::from_pretrained(identifier, Some(params))
+             .map(|v| RbTokenizer {
+                 tokenizer: RefCell::new(v),
+             })
+             .map_err(RbError::from)
+     }
+
+     pub fn add_special_tokens(&self, tokens: Vec<String>) {
+         let tokens: Vec<AddedToken> = tokens.iter().map(|t| AddedToken::from(t, true)).collect();
+         self.tokenizer.borrow_mut().add_special_tokens(&tokens);
+         // TODO return self
+     }
+
+     pub fn encode(&self, text: String) -> RbResult<RbEncoding> {
+         self.tokenizer
+             .borrow()
+             .encode(text, false)
+             .map(|v| RbEncoding { encoding: v })
+             .map_err(RbError::from)
+     }
+
+     pub fn decode(&self, ids: Vec<u32>) -> RbResult<String> {
+         self.tokenizer
+             .borrow()
+             .decode(ids, true)
+             .map_err(RbError::from)
+     }
+
+     pub fn set_decoder(&self, decoder: &RbBPEDecoder) {
+         self.tokenizer
+             .borrow_mut()
+             .with_decoder(decoder.decoder.clone());
+     }
+
+     pub fn set_pre_tokenizer(&self, pre_tokenizer: &RbBertPreTokenizer) {
+         self.tokenizer
+             .borrow_mut()
+             .with_pre_tokenizer(pre_tokenizer.pretok);
+     }
+
+     pub fn set_normalizer(&self, normalizer: &RbBertNormalizer) {
+         self.tokenizer
+             .borrow_mut()
+             .with_normalizer(normalizer.normalizer);
+     }
+ }
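
Note that `from_pretrained` is registered as the private `Tokenizers._from_pretrained` with exactly three arguments, and the user agent it sends includes the gem's `VERSION` constant read back from Ruby. A direct call would look like this sketch (the public wrapper that supplies defaults lives in the Ruby layer, which this diff does not show):

  tokenizer = Tokenizers._from_pretrained("bert-base-cased", "main", nil)
  tokenizer.encode("Hello world").tokens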
data/lib/tokenizers/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Tokenizers
-   VERSION = "0.1.3"
+   VERSION = "0.2.0"
  end
data/lib/tokenizers.rb CHANGED
@@ -1,5 +1,9 @@
- # extlib
- require "tokenizers/ext"
+ # ext
+ begin
+   require "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
+ rescue LoadError
+   require "tokenizers/tokenizers"
+ end
 
  # modules
  require "tokenizers/char_bpe_tokenizer"
metadata CHANGED
@@ -1,15 +1,29 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: tokenizers
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.3
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
8
  autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2022-10-07 00:00:00.000000000 Z
12
- dependencies: []
11
+ date: 2022-12-11 00:00:00.000000000 Z
12
+ dependencies:
13
+ - !ruby/object:Gem::Dependency
14
+ name: rb_sys
15
+ requirement: !ruby/object:Gem::Requirement
16
+ requirements:
17
+ - - ">="
18
+ - !ruby/object:Gem::Version
19
+ version: '0'
20
+ type: :runtime
21
+ prerelease: false
22
+ version_requirements: !ruby/object:Gem::Requirement
23
+ requirements:
24
+ - - ">="
25
+ - !ruby/object:Gem::Version
26
+ version: '0'
13
27
  description:
14
28
  email: andrew@ankane.org
15
29
  executables: []
@@ -22,11 +36,19 @@ files:
22
36
  - Cargo.toml
23
37
  - LICENSE.txt
24
38
  - README.md
39
+ - ext/tokenizers/Cargo.toml
25
40
  - ext/tokenizers/extconf.rb
41
+ - ext/tokenizers/src/decoders.rs
42
+ - ext/tokenizers/src/encoding.rs
43
+ - ext/tokenizers/src/error.rs
44
+ - ext/tokenizers/src/lib.rs
45
+ - ext/tokenizers/src/models.rs
46
+ - ext/tokenizers/src/normalizers.rs
47
+ - ext/tokenizers/src/pre_tokenizers.rs
48
+ - ext/tokenizers/src/tokenizer.rs
26
49
  - lib/tokenizers.rb
27
50
  - lib/tokenizers/char_bpe_tokenizer.rb
28
51
  - lib/tokenizers/version.rb
29
- - src/lib.rs
30
52
  homepage: https://github.com/ankane/tokenizers-ruby
31
53
  licenses:
32
54
  - Apache-2.0
@@ -46,7 +68,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
46
68
  - !ruby/object:Gem::Version
47
69
  version: '0'
48
70
  requirements: []
49
- rubygems_version: 3.3.7
71
+ rubygems_version: 3.3.26
50
72
  signing_key:
51
73
  specification_version: 4
52
74
  summary: Fast state-of-the-art tokenizers for Ruby