youtokentome 0.1.0 → 0.2.0

Sign up to get free protection for your applications and to get access to all the features.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e55b4f8edc8306370a1e5f7138bbbb00912b29234e7169d4dd7233fee04934cb
4
- data.tar.gz: dea649fc649c23a0955ed603867e7312a66d64f241fadad7fb057a9164bda285
3
+ metadata.gz: 9fd1ff636b8d8ede5a0dbae880595216656fe26fbd54004994d322e51d870d6a
4
+ data.tar.gz: b633849e81472eefcf36688547cf5e562e6a0dd6a3df8c554805f2289f0ceb07
5
5
  SHA512:
6
- metadata.gz: '080b09ffa1cb1721d321e7af92b980087e2bd77e74b76127e4f2131c1cb4a72895ea7ed121d4a2a79ded30989e862d40ab6e3a01c0489da130ef131f66f37b96'
7
- data.tar.gz: 741c1c809801be24105a52be8884dd74e273b068c9c9d004dfbede6a9e050ecc662acf0578bc3b2ec8f20002c117003be2f5d3e79ee399408c466e5fe57d2af4
6
+ metadata.gz: b619f8b2cc8fa0c9cf92da1b12d66227d87dc931a6cb89307cbe3d3a54800328fcab4d9591a5968f2c4da83ed184aa67d78cd6a78d01d1faa21e09447a0609bc
7
+ data.tar.gz: 4b38a4db6d2adf2e24f6044cae75fb34eb6505a8e6ce8a48fe5a4ddfed3596f130431b8d59473885029ba3f4c71ace4f11fd22139a6ae8476d77d79cd0727b90
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
- ## 0.1.0 (unreleased)
1
+ ## 0.2.0 (2021-05-17)
2
+
3
+ - Updated to Rice 4
4
+ - Dropped support for Ruby < 2.6
5
+
6
+ ## 0.1.0 (2020-02-23)
2
7
 
3
8
  - First release
data/LICENSE.txt CHANGED
@@ -1,22 +1,22 @@
1
- Copyright (c) 2020 Andrew Kane
2
-
3
1
  MIT License
4
2
 
5
- Permission is hereby granted, free of charge, to any person obtaining
6
- a copy of this software and associated documentation files (the
7
- "Software"), to deal in the Software without restriction, including
8
- without limitation the rights to use, copy, modify, merge, publish,
9
- distribute, sublicense, and/or sell copies of the Software, and to
10
- permit persons to whom the Software is furnished to do so, subject to
11
- the following conditions:
3
+ Copyright (c) 2019 VK.com
4
+ Copyright (c) 2020-2021 Andrew Kane
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
12
 
13
- The above copyright notice and this permission notice shall be
14
- included in all copies or substantial portions of the Software.
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
15
 
16
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
data/README.md CHANGED
@@ -1,6 +1,10 @@
1
1
  # YouTokenToMe
2
2
 
3
- :fire: [YouTokenToMe](https://github.com/VKCOM/YouTokenToMe) - the high performance unsupervised text tokenizer - for Ruby
3
+ [YouTokenToMe](https://github.com/VKCOM/YouTokenToMe) - high performance unsupervised text tokenization - for Ruby
4
+
5
+ Learn more about [how it works](https://medium.com/@vktech/youtokentome-a-tool-for-quick-text-tokenization-from-the-vk-team-aa6341215c5a)
6
+
7
+ [![Build Status](https://github.com/ankane/youtokentome/workflows/build/badge.svg?branch=master)](https://github.com/ankane/youtokentome/actions)
4
8
 
5
9
  ## Installation
6
10
 
@@ -12,6 +16,12 @@ gem 'youtokentome'
12
16
 
13
17
  ## Getting Started
14
18
 
19
+ Dump your text to a file
20
+
21
+ ```txt
22
+ Blazingly fast tokenization!
23
+ ```
24
+
15
25
  Train a model
16
26
 
17
27
  ```ruby
@@ -33,7 +43,7 @@ model.vocab
33
43
  Encode
34
44
 
35
45
  ```ruby
36
- model.encode(sentence)
46
+ model.encode(sentences)
37
47
  ```
38
48
 
39
49
  Decode
@@ -60,10 +70,10 @@ YouTokenToMe::BPE.train(
60
70
  vocab_size: 30000, # number of tokens in the final vocabulary
61
71
  coverage: 1.0, # fraction of characters covered by the model
62
72
  n_threads: -1, # number of parallel threads used to run (-1 uses all available threads)
63
- pad_id: 1, # reserved id for padding
64
- unk_id: 2, # reserved id for unknown symbols
65
- bos_id: 3, # reserved id for begin of sentence token
66
- eos_id: 4 # reserved id for end of sentence token
73
+ pad_id: 0, # reserved id for padding
74
+ unk_id: 1, # reserved id for unknown symbols
75
+ bos_id: 2, # reserved id for begin of sentence token
76
+ eos_id: 3 # reserved id for end of sentence token
67
77
  )
68
78
  ```
69
79
 
@@ -3,9 +3,8 @@
3
3
  #include <utils.h>
4
4
 
5
5
  // rice
6
- #include <rice/Array.hpp>
7
- #include <rice/Data_Type.hpp>
8
- #include <rice/Object.hpp>
6
+ #include <rice/rice.hpp>
7
+ #include <rice/stl.hpp>
9
8
 
10
9
  using Rice::define_class_under;
11
10
  using Rice::define_module;
@@ -20,44 +19,59 @@ void check_status(vkcom::Status& status) {
20
19
  }
21
20
  }
22
21
 
23
- template<>
24
- Object to_ruby<std::vector<std::string>>(std::vector<std::string> const & x)
22
+ namespace Rice::detail
25
23
  {
26
- Array ret;
27
- for (auto& v : x) {
28
- ret.push(v);
29
- }
30
- return ret;
31
- }
24
+ template<>
25
+ class To_Ruby<std::vector<std::string>>
26
+ {
27
+ public:
28
+ VALUE convert(std::vector<std::string> const & x)
29
+ {
30
+ Array ret;
31
+ for (auto& v : x) {
32
+ ret.push(v);
33
+ }
34
+ return ret;
35
+ }
36
+ };
32
37
 
33
- template<>
34
- std::vector<int> from_ruby<std::vector<int>>(Object x)
35
- {
36
- std::vector<int> ret;
37
- Array a = Array(x);
38
- for (size_t i = 0; i < a.size(); i++) {
39
- ret.push_back(from_ruby<int>(a[i]));
40
- }
41
- return ret;
42
- }
38
+ template<>
39
+ class From_Ruby<std::vector<int>>
40
+ {
41
+ public:
42
+ std::vector<int> convert(VALUE x)
43
+ {
44
+ Array a = Array(x);
45
+ std::vector<int> ret;
46
+ for (size_t i = 0; i < a.size(); i++) {
47
+ ret.push_back(Rice::detail::From_Ruby<int>().convert(a[i].value()));
48
+ }
49
+ return ret;
50
+ }
51
+ };
43
52
 
44
- template<>
45
- std::vector<std::string> from_ruby<std::vector<std::string>>(Object x)
46
- {
47
- std::vector<std::string> ret;
48
- Array a = Array(x);
49
- for (size_t i = 0; i < a.size(); i++) {
50
- ret.push_back(from_ruby<std::string>(a[i]));
51
- }
52
- return ret;
53
+ template<>
54
+ class From_Ruby<std::vector<std::string>>
55
+ {
56
+ public:
57
+ std::vector<std::string> convert(VALUE x)
58
+ {
59
+ Array a = Array(x);
60
+ std::vector<std::string> ret;
61
+ for (size_t i = 0; i < a.size(); i++) {
62
+ ret.push_back(Rice::detail::From_Ruby<std::string>().convert(a[i].value()));
63
+ }
64
+ return ret;
65
+ }
66
+ };
53
67
  }
54
68
 
55
69
  extern "C" void Init_ext() {
56
70
  Module rb_mYouTokenToMe = define_module("YouTokenToMe");
57
71
  Module rb_mExt = define_module_under(rb_mYouTokenToMe, "Ext")
58
- .define_singleton_method(
72
+ .define_singleton_function(
59
73
  "train_bpe",
60
- *[](std::string &input_path, std::string &model_path, int vocab_size, double coverage,
74
+ [](const std::string &input_path, const std::string &model_path, int vocab_size, double coverage,
61
75
  int n_threads, int pad_id, int unk_id, int bos_id, int eos_id) {
62
76
 
63
77
  vkcom::SpecialTokens special_tokens(pad_id, unk_id, bos_id, eos_id);
@@ -71,7 +85,7 @@ extern "C" void Init_ext() {
71
85
  .define_method("subword_to_id", &vkcom::BaseEncoder::subword_to_id)
72
86
  .define_method(
73
87
  "id_to_subword",
74
- *[](vkcom::BaseEncoder& self, int id) {
88
+ [](vkcom::BaseEncoder& self, int id) {
75
89
  std::string subword;
76
90
  auto status = self.id_to_subword(id, &subword);
77
91
  check_status(status);
@@ -79,7 +93,7 @@ extern "C" void Init_ext() {
79
93
  })
80
94
  .define_method(
81
95
  "decode",
82
- *[](vkcom::BaseEncoder& self, std::vector<int> ids) {
96
+ [](vkcom::BaseEncoder& self, std::vector<int> ids) {
83
97
  std::string sentence;
84
98
  const std::unordered_set<int> ignore_ids;
85
99
  auto status = self.decode(ids, &sentence, &ignore_ids);
@@ -91,7 +105,7 @@ extern "C" void Init_ext() {
91
105
  })
92
106
  .define_method(
93
107
  "encode_as_ids",
94
- *[](vkcom::BaseEncoder& self, std::vector<std::string> sentences, bool bos, bool eos, bool reverse, double dropout_prob) {
108
+ [](vkcom::BaseEncoder& self, std::vector<std::string> sentences, bool bos, bool eos, bool reverse, double dropout_prob) {
95
109
  std::vector<std::vector<int>> ids;
96
110
  auto status = self.encode_as_ids(sentences, &ids, bos, eos, reverse, dropout_prob);
97
111
  check_status(status);
@@ -108,7 +122,7 @@ extern "C" void Init_ext() {
108
122
  })
109
123
  .define_method(
110
124
  "encode_as_subwords",
111
- *[](vkcom::BaseEncoder& self, std::vector<std::string> sentences, bool bos, bool eos, bool reverse, double dropout_prob) {
125
+ [](vkcom::BaseEncoder& self, std::vector<std::string> sentences, bool bos, bool eos, bool reverse, double dropout_prob) {
112
126
  std::vector<std::vector<std::string>> subwords;
113
127
  auto status = self.encode_as_subwords(sentences, &subwords, bos, eos, reverse, dropout_prob);
114
128
  check_status(status);
@@ -124,9 +138,9 @@ extern "C" void Init_ext() {
124
138
  return ret;
125
139
  })
126
140
  .define_method("vocab", &vkcom::BaseEncoder::vocabulary)
127
- .define_singleton_method(
141
+ .define_singleton_function(
128
142
  "new",
129
- *[](std::string &model_path, int n_threads) {
143
+ [](std::string &model_path, int n_threads) {
130
144
  auto status = vkcom::Status();
131
145
  vkcom::BaseEncoder encoder(model_path, n_threads, &status);
132
146
  check_status(status);
@@ -1,6 +1,6 @@
1
1
  require "mkmf-rice"
2
2
 
3
- $CXXFLAGS << " -std=c++11"
3
+ $CXXFLAGS << " -std=c++17"
4
4
 
5
5
  ext = File.expand_path(".", __dir__)
6
6
  youtokentome = File.expand_path("../../vendor/YouTokenToMe/youtokentome/cpp", __dir__)
@@ -1,3 +1,3 @@
1
1
  module YouTokenToMe
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: youtokentome
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-02-24 00:00:00.000000000 Z
11
+ date: 2021-05-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rice
@@ -16,72 +16,16 @@ dependencies:
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '2.2'
19
+ version: 4.0.2
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '2.2'
27
- - !ruby/object:Gem::Dependency
28
- name: bundler
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :development
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: rake
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ">="
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :development
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- version: '0'
55
- - !ruby/object:Gem::Dependency
56
- name: rake-compiler
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - ">="
60
- - !ruby/object:Gem::Version
61
- version: '0'
62
- type: :development
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - ">="
67
- - !ruby/object:Gem::Version
68
- version: '0'
69
- - !ruby/object:Gem::Dependency
70
- name: minitest
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - ">="
74
- - !ruby/object:Gem::Version
75
- version: '5'
76
- type: :development
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - ">="
81
- - !ruby/object:Gem::Version
82
- version: '5'
83
- description:
84
- email: andrew@chartkick.com
26
+ version: 4.0.2
27
+ description:
28
+ email: andrew@ankane.org
85
29
  executables: []
86
30
  extensions:
87
31
  - ext/youtokentome/extconf.rb
@@ -94,7 +38,6 @@ files:
94
38
  - ext/youtokentome/extconf.rb
95
39
  - lib/youtokentome.rb
96
40
  - lib/youtokentome/bpe.rb
97
- - lib/youtokentome/ext.bundle
98
41
  - lib/youtokentome/version.rb
99
42
  - vendor/YouTokenToMe/LICENSE
100
43
  - vendor/YouTokenToMe/README.md
@@ -111,7 +54,7 @@ homepage: https://github.com/ankane/youtokentome
111
54
  licenses:
112
55
  - MIT
113
56
  metadata: {}
114
- post_install_message:
57
+ post_install_message:
115
58
  rdoc_options: []
116
59
  require_paths:
117
60
  - lib
@@ -119,15 +62,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
119
62
  requirements:
120
63
  - - ">="
121
64
  - !ruby/object:Gem::Version
122
- version: '2.4'
65
+ version: '2.6'
123
66
  required_rubygems_version: !ruby/object:Gem::Requirement
124
67
  requirements:
125
68
  - - ">="
126
69
  - !ruby/object:Gem::Version
127
70
  version: '0'
128
71
  requirements: []
129
- rubygems_version: 3.1.2
130
- signing_key:
72
+ rubygems_version: 3.2.3
73
+ signing_key:
131
74
  specification_version: 4
132
- summary: High performance unsupervised text tokenizer for Ruby
75
+ summary: High performance unsupervised text tokenization for Ruby
133
76
  test_files: []
Binary file