tokenizers 0.1.3 → 0.2.0

data/Cargo.toml CHANGED
@@ -1,14 +1,11 @@
- [package]
- name = "tokenizers-ruby"
- version = "0.1.0"
- authors = ["Andrew Kane <andrew@ankane.org>"]
- edition = "2018"
+ [workspace]
+ members = ["ext/tokenizers"]

- [lib]
- name = "tokenizers"
- crate-type = ["cdylib"]
+ [profile.release]
+ strip = true

- [dependencies]
- lazy_static = "1"
- rutie = "0.8.4"
- tokenizers = "0.13.1"
+ [patch.crates-io]
+ magnus-macros = { git = "https://github.com/matsadler/magnus" }
+ number_prefix = { git = "https://github.com/ankane/rust-number-prefix", branch = "license-file" }
+ rb-sys-env = { git = "https://github.com/oxidize-rb/rb-sys" }
+ tokenizers = { git = "https://github.com/huggingface/tokenizers" }
data/README.md CHANGED
@@ -12,8 +12,6 @@ Add this line to your application’s Gemfile:
  gem "tokenizers"
  ```

- Note: Rust and pkg-config are currently required for installation, and it can take 5-10 minutes to compile the extension.
-
  ## Getting Started

  Load a pretrained tokenizer
@@ -61,7 +59,7 @@ To get started with development:
  git clone https://github.com/ankane/tokenizers-ruby.git
  cd tokenizers-ruby
  bundle install
- bundle exec ruby ext/tokenizers/extconf.rb && make && make install
+ bundle exec rake compile
  bundle exec rake download:files
  bundle exec rake test
  ```
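The development workflow now uses `rake compile`, but the Rakefile defining that task is not part of this diff. A minimal sketch of how it could be wired up, assuming rake-compiler's `Rake::ExtensionTask` (the rb_sys runtime dependency added in the metadata below suggests its cargo-aware variant may be used instead):

```ruby
# Hypothetical Rakefile excerpt -- not included in this diff.
require "rake/extensiontask"

# "tokenizers" matches the crate directory under ext/ and the name
# passed to create_rust_makefile in extconf.rb below.
Rake::ExtensionTask.new("tokenizers") do |ext|
  ext.ext_dir = "ext/tokenizers"  # where extconf.rb lives
  ext.lib_dir = "lib/tokenizers"  # where the built shared library is copied
end
```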
data/ext/tokenizers/Cargo.toml ADDED
@@ -0,0 +1,17 @@
+ [package]
+ name = "tokenizers"
+ version = "0.2.0"
+ authors = ["Andrew Kane <andrew@ankane.org>"]
+ edition = "2021"
+ publish = false
+
+ [lib]
+ crate-type = ["cdylib"]
+
+ [dependencies]
+ magnus = "0.4"
+
+ [dependencies.tokenizers]
+ version = "0.13.2"
+ default-features = false
+ features = ["progressbar", "http", "onig", "esaxx_fast"]
data/ext/tokenizers/extconf.rb CHANGED
@@ -1,11 +1,4 @@
- require "pathname"
+ require "mkmf"
+ require "rb_sys/mkmf"

- dest = Pathname.new(__dir__).relative_path_from(Pathname.pwd).join("../../lib/tokenizers/ext.#{RbConfig::CONFIG["DLEXT"]}")
-
- File.write "Makefile", <<~EOS
-   all:
-   \tcargo build --release --target-dir target
-   install:
-   \tmv target/release/libtokenizers.#{RbConfig::CONFIG["SOEXT"]} #{dest}
-   clean:
- EOS
+ create_rust_makefile("tokenizers/tokenizers")
data/ext/tokenizers/src/decoders.rs ADDED
@@ -0,0 +1,14 @@
+ use tk::decoders::bpe::BPEDecoder;
+
+ #[magnus::wrap(class = "Tokenizers::BPEDecoder")]
+ pub struct RbBPEDecoder {
+     pub decoder: BPEDecoder,
+ }
+
+ impl RbBPEDecoder {
+     pub fn new() -> Self {
+         RbBPEDecoder {
+             decoder: BPEDecoder::default(),
+         }
+     }
+ }
data/ext/tokenizers/src/encoding.rs ADDED
@@ -0,0 +1,16 @@
+ use tk::Encoding;
+
+ #[magnus::wrap(class = "Tokenizers::Encoding")]
+ pub struct RbEncoding {
+     pub encoding: Encoding,
+ }
+
+ impl RbEncoding {
+     pub fn ids(&self) -> Vec<u32> {
+         self.encoding.get_ids().into()
+     }
+
+     pub fn tokens(&self) -> Vec<String> {
+         self.encoding.get_tokens().into()
+     }
+ }
data/ext/tokenizers/src/error.rs ADDED
@@ -0,0 +1,16 @@
+ use magnus::{exception, memoize, Error, ExceptionClass, Module};
+
+ use super::module;
+
+ pub struct RbError {}
+
+ impl RbError {
+     // convert to Error instead of Self
+     pub fn from(e: Box<dyn std::error::Error + Send + Sync>) -> Error {
+         Error::new(error(), e.to_string())
+     }
+ }
+
+ fn error() -> ExceptionClass {
+     *memoize!(ExceptionClass: module().define_error("Error", exception::standard_error()).unwrap())
+ }
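`define_error` registers `Tokenizers::Error` as a subclass of `StandardError`, memoized so every conversion reuses the same class. A sketch of how a failure from the Rust side surfaces in Ruby (the file paths are placeholders):

```ruby
begin
  # BPE::from_file fails for nonexistent files; RbError::from wraps the
  # underlying message in a Tokenizers::Error.
  Tokenizers::BPE.new("no-such-vocab.json", "no-such-merges.txt")
rescue Tokenizers::Error => e
  puts e.message
end
```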
data/ext/tokenizers/src/lib.rs ADDED
@@ -0,0 +1,64 @@
+ extern crate tokenizers as tk;
+
+ mod decoders;
+ mod encoding;
+ mod error;
+ mod models;
+ mod normalizers;
+ mod pre_tokenizers;
+ mod tokenizer;
+
+ use decoders::RbBPEDecoder;
+ use encoding::RbEncoding;
+ use error::RbError;
+ use models::RbBPE;
+ use normalizers::RbBertNormalizer;
+ use pre_tokenizers::RbBertPreTokenizer;
+ use tokenizer::RbTokenizer;
+
+ use magnus::{define_module, function, memoize, method, prelude::*, Error, RModule};
+
+ type RbResult<T> = Result<T, Error>;
+
+ fn module() -> RModule {
+     *memoize!(RModule: define_module("Tokenizers").unwrap())
+ }
+
+ #[magnus::init]
+ fn init() -> RbResult<()> {
+     let module = module();
+     module.define_singleton_method(
+         "_from_pretrained",
+         function!(RbTokenizer::from_pretrained, 3),
+     )?;
+
+     let class = module.define_class("BPE", Default::default())?;
+     class.define_singleton_method("new", function!(RbBPE::new, 2))?;
+
+     let class = module.define_class("Tokenizer", Default::default())?;
+     class.define_singleton_method("new", function!(RbTokenizer::new, 1))?;
+     class.define_method(
+         "add_special_tokens",
+         method!(RbTokenizer::add_special_tokens, 1),
+     )?;
+     class.define_method("encode", method!(RbTokenizer::encode, 1))?;
+     class.define_method("decode", method!(RbTokenizer::decode, 1))?;
+     class.define_method("decoder=", method!(RbTokenizer::set_decoder, 1))?;
+     class.define_method("pre_tokenizer=", method!(RbTokenizer::set_pre_tokenizer, 1))?;
+     class.define_method("normalizer=", method!(RbTokenizer::set_normalizer, 1))?;
+
+     let class = module.define_class("Encoding", Default::default())?;
+     class.define_method("ids", method!(RbEncoding::ids, 0))?;
+     class.define_method("tokens", method!(RbEncoding::tokens, 0))?;
+
+     let class = module.define_class("BPEDecoder", Default::default())?;
+     class.define_singleton_method("new", function!(RbBPEDecoder::new, 0))?;
+
+     let class = module.define_class("BertPreTokenizer", Default::default())?;
+     class.define_singleton_method("new", function!(RbBertPreTokenizer::new, 0))?;
+
+     let class = module.define_class("BertNormalizer", Default::default())?;
+     class.define_singleton_method("new", function!(RbBertNormalizer::new, 0))?;
+
+     Ok(())
+ }
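Taken together, `init` exposes the wrappers above as Ruby classes and methods. A sketch of a round trip through these bindings (the vocab and merges paths are placeholders):

```ruby
model = Tokenizers::BPE.new("vocab.json", "merges.txt")

tokenizer = Tokenizers::Tokenizer.new(model)
tokenizer.pre_tokenizer = Tokenizers::BertPreTokenizer.new
tokenizer.normalizer = Tokenizers::BertNormalizer.new
tokenizer.decoder = Tokenizers::BPEDecoder.new
tokenizer.add_special_tokens(["<unk>"])

encoding = tokenizer.encode("hello world")  # Tokenizers::Encoding
encoding.ids                                # array of token ids
encoding.tokens                             # array of token strings
tokenizer.decode(encoding.ids)              # back to a string
```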
data/ext/tokenizers/src/models.rs ADDED
@@ -0,0 +1,19 @@
+ use tk::models::bpe::BPE;
+
+ use super::{RbError, RbResult};
+
+ #[magnus::wrap(class = "Tokenizers::BPE")]
+ pub struct RbBPE {
+     pub model: BPE,
+ }
+
+ impl RbBPE {
+     pub fn new(vocab: String, merges: String) -> RbResult<Self> {
+         BPE::from_file(&vocab, &merges)
+             .unk_token("<unk>".into())
+             .end_of_word_suffix("</w>".into())
+             .build()
+             .map(|v| RbBPE { model: v })
+             .map_err(RbError::from)
+     }
+ }
data/ext/tokenizers/src/normalizers.rs ADDED
@@ -0,0 +1,14 @@
+ use tk::normalizers::BertNormalizer;
+
+ #[magnus::wrap(class = "Tokenizers::BertNormalizer")]
+ pub struct RbBertNormalizer {
+     pub normalizer: BertNormalizer,
+ }
+
+ impl RbBertNormalizer {
+     pub fn new() -> Self {
+         RbBertNormalizer {
+             normalizer: BertNormalizer::default(),
+         }
+     }
+ }
data/ext/tokenizers/src/pre_tokenizers.rs ADDED
@@ -0,0 +1,14 @@
+ use tk::pre_tokenizers::bert::BertPreTokenizer;
+
+ #[magnus::wrap(class = "Tokenizers::BertPreTokenizer")]
+ pub struct RbBertPreTokenizer {
+     pub pretok: BertPreTokenizer,
+ }
+
+ impl RbBertPreTokenizer {
+     pub fn new() -> Self {
+         RbBertPreTokenizer {
+             pretok: BertPreTokenizer,
+         }
+     }
+ }
data/ext/tokenizers/src/tokenizer.rs ADDED
@@ -0,0 +1,85 @@
+ use magnus::Module;
+ use std::cell::RefCell;
+ use tk::tokenizer::Tokenizer;
+ use tk::AddedToken;
+
+ use super::decoders::RbBPEDecoder;
+ use super::encoding::RbEncoding;
+ use super::models::RbBPE;
+ use super::normalizers::RbBertNormalizer;
+ use super::pre_tokenizers::RbBertPreTokenizer;
+ use super::{module, RbError, RbResult};
+
+ #[magnus::wrap(class = "Tokenizers::Tokenizer")]
+ pub struct RbTokenizer {
+     tokenizer: RefCell<Tokenizer>,
+ }
+
+ impl RbTokenizer {
+     pub fn new(model: &RbBPE) -> Self {
+         Self {
+             tokenizer: RefCell::new(Tokenizer::new(model.model.clone())),
+         }
+     }
+
+     pub fn from_pretrained(
+         identifier: String,
+         revision: String,
+         auth_token: Option<String>,
+     ) -> RbResult<Self> {
+         let version = module().const_get("VERSION").unwrap();
+         let params = tk::FromPretrainedParameters {
+             revision,
+             auth_token,
+             user_agent: [("bindings", "Ruby".to_string()), ("version", version)]
+                 .iter()
+                 .map(|(k, v)| (k.to_string(), v.to_string()))
+                 .collect(),
+         };
+
+         Tokenizer::from_pretrained(identifier, Some(params))
+             .map(|v| RbTokenizer {
+                 tokenizer: RefCell::new(v),
+             })
+             .map_err(RbError::from)
+     }
+
+     pub fn add_special_tokens(&self, tokens: Vec<String>) {
+         let tokens: Vec<AddedToken> = tokens.iter().map(|t| AddedToken::from(t, true)).collect();
+         self.tokenizer.borrow_mut().add_special_tokens(&tokens);
+         // TODO return self
+     }
+
+     pub fn encode(&self, text: String) -> RbResult<RbEncoding> {
+         self.tokenizer
+             .borrow()
+             .encode(text, false)
+             .map(|v| RbEncoding { encoding: v })
+             .map_err(RbError::from)
+     }
+
+     pub fn decode(&self, ids: Vec<u32>) -> RbResult<String> {
+         self.tokenizer
+             .borrow()
+             .decode(ids, true)
+             .map_err(RbError::from)
+     }
+
+     pub fn set_decoder(&self, decoder: &RbBPEDecoder) {
+         self.tokenizer
+             .borrow_mut()
+             .with_decoder(decoder.decoder.clone());
+     }
+
+     pub fn set_pre_tokenizer(&self, pre_tokenizer: &RbBertPreTokenizer) {
+         self.tokenizer
+             .borrow_mut()
+             .with_pre_tokenizer(pre_tokenizer.pretok);
+     }
+
+     pub fn set_normalizer(&self, normalizer: &RbBertNormalizer) {
+         self.tokenizer
+             .borrow_mut()
+             .with_normalizer(normalizer.normalizer);
+     }
+ }
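`from_pretrained` is only bound as the underscore-prefixed module method `_from_pretrained` (see lib.rs above), taking an identifier, a revision, and an optional auth token; any friendlier wrapper would live in the Ruby layer outside this diff. A hypothetical direct call:

```ruby
# Arguments map to (identifier, revision, auth_token); the model name and
# revision here are illustrative. The request's user_agent includes
# Tokenizers::VERSION, read via const_get above.
tokenizer = Tokenizers._from_pretrained("bert-base-uncased", "main", nil)
```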
data/lib/tokenizers/version.rb CHANGED
@@ -1,3 +1,3 @@
  module Tokenizers
-   VERSION = "0.1.3"
+   VERSION = "0.2.0"
  end
data/lib/tokenizers.rb CHANGED
@@ -1,5 +1,9 @@
- # extlib
- require "tokenizers/ext"
+ # ext
+ begin
+   require "tokenizers/#{RUBY_VERSION.to_f}/tokenizers"
+ rescue LoadError
+   require "tokenizers/tokenizers"
+ end

  # modules
  require "tokenizers/char_bpe_tokenizer"
metadata CHANGED
@@ -1,15 +1,29 @@
  --- !ruby/object:Gem::Specification
  name: tokenizers
  version: !ruby/object:Gem::Version
-   version: 0.1.3
+   version: 0.2.0
  platform: ruby
  authors:
  - Andrew Kane
  autorequire:
  bindir: bin
  cert_chain: []
- date: 2022-10-07 00:00:00.000000000 Z
- dependencies: []
+ date: 2022-12-11 00:00:00.000000000 Z
+ dependencies:
+ - !ruby/object:Gem::Dependency
+   name: rb_sys
+   requirement: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
+   type: :runtime
+   prerelease: false
+   version_requirements: !ruby/object:Gem::Requirement
+     requirements:
+     - - ">="
+       - !ruby/object:Gem::Version
+         version: '0'
  description:
  email: andrew@ankane.org
  executables: []
@@ -22,11 +36,19 @@ files:
  - Cargo.toml
  - LICENSE.txt
  - README.md
+ - ext/tokenizers/Cargo.toml
  - ext/tokenizers/extconf.rb
+ - ext/tokenizers/src/decoders.rs
+ - ext/tokenizers/src/encoding.rs
+ - ext/tokenizers/src/error.rs
+ - ext/tokenizers/src/lib.rs
+ - ext/tokenizers/src/models.rs
+ - ext/tokenizers/src/normalizers.rs
+ - ext/tokenizers/src/pre_tokenizers.rs
+ - ext/tokenizers/src/tokenizer.rs
  - lib/tokenizers.rb
  - lib/tokenizers/char_bpe_tokenizer.rb
  - lib/tokenizers/version.rb
- - src/lib.rs
  homepage: https://github.com/ankane/tokenizers-ruby
  licenses:
  - Apache-2.0
@@ -46,7 +68,7 @@ required_rubygems_version: !ruby/object:Gem::Requirement
      - !ruby/object:Gem::Version
        version: '0'
  requirements: []
- rubygems_version: 3.3.7
+ rubygems_version: 3.3.26
  signing_key:
  specification_version: 4
  summary: Fast state-of-the-art tokenizers for Ruby