youtokentome 0.1.0 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml CHANGED
@@ -1,7 +1,7 @@
1
1
  ---
2
2
  SHA256:
3
- metadata.gz: e55b4f8edc8306370a1e5f7138bbbb00912b29234e7169d4dd7233fee04934cb
4
- data.tar.gz: dea649fc649c23a0955ed603867e7312a66d64f241fadad7fb057a9164bda285
3
+ metadata.gz: 9fd1ff636b8d8ede5a0dbae880595216656fe26fbd54004994d322e51d870d6a
4
+ data.tar.gz: b633849e81472eefcf36688547cf5e562e6a0dd6a3df8c554805f2289f0ceb07
5
5
  SHA512:
6
- metadata.gz: '080b09ffa1cb1721d321e7af92b980087e2bd77e74b76127e4f2131c1cb4a72895ea7ed121d4a2a79ded30989e862d40ab6e3a01c0489da130ef131f66f37b96'
7
- data.tar.gz: 741c1c809801be24105a52be8884dd74e273b068c9c9d004dfbede6a9e050ecc662acf0578bc3b2ec8f20002c117003be2f5d3e79ee399408c466e5fe57d2af4
6
+ metadata.gz: b619f8b2cc8fa0c9cf92da1b12d66227d87dc931a6cb89307cbe3d3a54800328fcab4d9591a5968f2c4da83ed184aa67d78cd6a78d01d1faa21e09447a0609bc
7
+ data.tar.gz: 4b38a4db6d2adf2e24f6044cae75fb34eb6505a8e6ce8a48fe5a4ddfed3596f130431b8d59473885029ba3f4c71ace4f11fd22139a6ae8476d77d79cd0727b90
data/CHANGELOG.md CHANGED
@@ -1,3 +1,8 @@
1
- ## 0.1.0 (unreleased)
1
+ ## 0.2.0 (2021-05-17)
2
+
3
+ - Updated to Rice 4
4
+ - Dropped support for Ruby < 2.6
5
+
6
+ ## 0.1.0 (2020-02-23)
2
7
 
3
8
  - First release
data/LICENSE.txt CHANGED
@@ -1,22 +1,22 @@
1
- Copyright (c) 2020 Andrew Kane
2
-
3
1
  MIT License
4
2
 
5
- Permission is hereby granted, free of charge, to any person obtaining
6
- a copy of this software and associated documentation files (the
7
- "Software"), to deal in the Software without restriction, including
8
- without limitation the rights to use, copy, modify, merge, publish,
9
- distribute, sublicense, and/or sell copies of the Software, and to
10
- permit persons to whom the Software is furnished to do so, subject to
11
- the following conditions:
3
+ Copyright (c) 2019 VK.com
4
+ Copyright (c) 2020-2021 Andrew Kane
5
+
6
+ Permission is hereby granted, free of charge, to any person obtaining a copy
7
+ of this software and associated documentation files (the "Software"), to deal
8
+ in the Software without restriction, including without limitation the rights
9
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
10
+ copies of the Software, and to permit persons to whom the Software is
11
+ furnished to do so, subject to the following conditions:
12
12
 
13
- The above copyright notice and this permission notice shall be
14
- included in all copies or substantial portions of the Software.
13
+ The above copyright notice and this permission notice shall be included in all
14
+ copies or substantial portions of the Software.
15
15
 
16
- THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
- EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
- MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
- NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
- LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
- OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
- WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
19
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
20
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
21
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
22
+ SOFTWARE.
data/README.md CHANGED
@@ -1,6 +1,10 @@
1
1
  # YouTokenToMe
2
2
 
3
- :fire: [YouTokenToMe](https://github.com/VKCOM/YouTokenToMe) - the high performance unsupervised text tokenizer - for Ruby
3
+ [YouTokenToMe](https://github.com/VKCOM/YouTokenToMe) - high performance unsupervised text tokenization - for Ruby
4
+
5
+ Learn more about [how it works](https://medium.com/@vktech/youtokentome-a-tool-for-quick-text-tokenization-from-the-vk-team-aa6341215c5a)
6
+
7
+ [![Build Status](https://github.com/ankane/youtokentome/workflows/build/badge.svg?branch=master)](https://github.com/ankane/youtokentome/actions)
4
8
 
5
9
  ## Installation
6
10
 
@@ -12,6 +16,12 @@ gem 'youtokentome'
12
16
 
13
17
  ## Getting Started
14
18
 
19
+ Dump your text to a file
20
+
21
+ ```txt
22
+ Blazingly fast tokenization!
23
+ ```
24
+
15
25
  Train a model
16
26
 
17
27
  ```ruby
@@ -33,7 +43,7 @@ model.vocab
33
43
  Encode
34
44
 
35
45
  ```ruby
36
- model.encode(sentence)
46
+ model.encode(sentences)
37
47
  ```
38
48
 
39
49
  Decode
@@ -60,10 +70,10 @@ YouTokenToMe::BPE.train(
60
70
  vocab_size: 30000, # number of tokens in the final vocabulary
61
71
  coverage: 1.0, # fraction of characters covered by the model
62
72
  n_threads: -1, # number of parallel threads used to run
63
- pad_id: 1, # reserved id for padding
64
- unk_id: 2, # reserved id for unknown symbols
65
- bos_id: 3, # reserved id for begin of sentence token
66
- eos_id: 4 # reserved id for end of sentence token
73
+ pad_id: 0, # reserved id for padding
74
+ unk_id: 1, # reserved id for unknown symbols
75
+ bos_id: 2, # reserved id for begin of sentence token
76
+ eos_id: 3 # reserved id for end of sentence token
67
77
  )
68
78
  ```
69
79
 
@@ -3,9 +3,8 @@
3
3
  #include <utils.h>
4
4
 
5
5
  // rice
6
- #include <rice/Array.hpp>
7
- #include <rice/Data_Type.hpp>
8
- #include <rice/Object.hpp>
6
+ #include <rice/rice.hpp>
7
+ #include <rice/stl.hpp>
9
8
 
10
9
  using Rice::define_class_under;
11
10
  using Rice::define_module;
@@ -20,44 +19,59 @@ void check_status(vkcom::Status& status) {
20
19
  }
21
20
  }
22
21
 
23
- template<>
24
- Object to_ruby<std::vector<std::string>>(std::vector<std::string> const & x)
22
+ namespace Rice::detail
25
23
  {
26
- Array ret;
27
- for (auto& v : x) {
28
- ret.push(v);
29
- }
30
- return ret;
31
- }
24
+ template<>
25
+ class To_Ruby<std::vector<std::string>>
26
+ {
27
+ public:
28
+ VALUE convert(std::vector<std::string> const & x)
29
+ {
30
+ Array ret;
31
+ for (auto& v : x) {
32
+ ret.push(v);
33
+ }
34
+ return ret;
35
+ }
36
+ };
32
37
 
33
- template<>
34
- std::vector<int> from_ruby<std::vector<int>>(Object x)
35
- {
36
- std::vector<int> ret;
37
- Array a = Array(x);
38
- for (size_t i = 0; i < a.size(); i++) {
39
- ret.push_back(from_ruby<int>(a[i]));
40
- }
41
- return ret;
42
- }
38
+ template<>
39
+ class From_Ruby<std::vector<int>>
40
+ {
41
+ public:
42
+ std::vector<int> convert(VALUE x)
43
+ {
44
+ Array a = Array(x);
45
+ std::vector<int> ret;
46
+ for (size_t i = 0; i < a.size(); i++) {
47
+ ret.push_back(Rice::detail::From_Ruby<int>().convert(a[i].value()));
48
+ }
49
+ return ret;
50
+ }
51
+ };
43
52
 
44
- template<>
45
- std::vector<std::string> from_ruby<std::vector<std::string>>(Object x)
46
- {
47
- std::vector<std::string> ret;
48
- Array a = Array(x);
49
- for (size_t i = 0; i < a.size(); i++) {
50
- ret.push_back(from_ruby<std::string>(a[i]));
51
- }
52
- return ret;
53
+ template<>
54
+ class From_Ruby<std::vector<std::string>>
55
+ {
56
+ public:
57
+ std::vector<std::string> convert(VALUE x)
58
+ {
59
+ Array a = Array(x);
60
+ std::vector<std::string> ret;
61
+ for (size_t i = 0; i < a.size(); i++) {
62
+ ret.push_back(Rice::detail::From_Ruby<std::string>().convert(a[i].value()));
63
+ }
64
+ return ret;
65
+ }
66
+ };
53
67
  }
54
68
 
55
69
  extern "C" void Init_ext() {
56
70
  Module rb_mYouTokenToMe = define_module("YouTokenToMe");
57
71
  Module rb_mExt = define_module_under(rb_mYouTokenToMe, "Ext")
58
- .define_singleton_method(
72
+ .define_singleton_function(
59
73
  "train_bpe",
60
- *[](std::string &input_path, std::string &model_path, int vocab_size, double coverage,
74
+ [](const std::string &input_path, const std::string &model_path, int vocab_size, double coverage,
61
75
  int n_threads, int pad_id, int unk_id, int bos_id, int eos_id) {
62
76
 
63
77
  vkcom::SpecialTokens special_tokens(pad_id, unk_id, bos_id, eos_id);
@@ -71,7 +85,7 @@ extern "C" void Init_ext() {
71
85
  .define_method("subword_to_id", &vkcom::BaseEncoder::subword_to_id)
72
86
  .define_method(
73
87
  "id_to_subword",
74
- *[](vkcom::BaseEncoder& self, int id) {
88
+ [](vkcom::BaseEncoder& self, int id) {
75
89
  std::string subword;
76
90
  auto status = self.id_to_subword(id, &subword);
77
91
  check_status(status);
@@ -79,7 +93,7 @@ extern "C" void Init_ext() {
79
93
  })
80
94
  .define_method(
81
95
  "decode",
82
- *[](vkcom::BaseEncoder& self, std::vector<int> ids) {
96
+ [](vkcom::BaseEncoder& self, std::vector<int> ids) {
83
97
  std::string sentence;
84
98
  const std::unordered_set<int> ignore_ids;
85
99
  auto status = self.decode(ids, &sentence, &ignore_ids);
@@ -91,7 +105,7 @@ extern "C" void Init_ext() {
91
105
  })
92
106
  .define_method(
93
107
  "encode_as_ids",
94
- *[](vkcom::BaseEncoder& self, std::vector<std::string> sentences, bool bos, bool eos, bool reverse, double dropout_prob) {
108
+ [](vkcom::BaseEncoder& self, std::vector<std::string> sentences, bool bos, bool eos, bool reverse, double dropout_prob) {
95
109
  std::vector<std::vector<int>> ids;
96
110
  auto status = self.encode_as_ids(sentences, &ids, bos, eos, reverse, dropout_prob);
97
111
  check_status(status);
@@ -108,7 +122,7 @@ extern "C" void Init_ext() {
108
122
  })
109
123
  .define_method(
110
124
  "encode_as_subwords",
111
- *[](vkcom::BaseEncoder& self, std::vector<std::string> sentences, bool bos, bool eos, bool reverse, double dropout_prob) {
125
+ [](vkcom::BaseEncoder& self, std::vector<std::string> sentences, bool bos, bool eos, bool reverse, double dropout_prob) {
112
126
  std::vector<std::vector<std::string>> subwords;
113
127
  auto status = self.encode_as_subwords(sentences, &subwords, bos, eos, reverse, dropout_prob);
114
128
  check_status(status);
@@ -124,9 +138,9 @@ extern "C" void Init_ext() {
124
138
  return ret;
125
139
  })
126
140
  .define_method("vocab", &vkcom::BaseEncoder::vocabulary)
127
- .define_singleton_method(
141
+ .define_singleton_function(
128
142
  "new",
129
- *[](std::string &model_path, int n_threads) {
143
+ [](std::string &model_path, int n_threads) {
130
144
  auto status = vkcom::Status();
131
145
  vkcom::BaseEncoder encoder(model_path, n_threads, &status);
132
146
  check_status(status);
@@ -1,6 +1,6 @@
1
1
  require "mkmf-rice"
2
2
 
3
- $CXXFLAGS << " -std=c++11"
3
+ $CXXFLAGS << " -std=c++17"
4
4
 
5
5
  ext = File.expand_path(".", __dir__)
6
6
  youtokentome = File.expand_path("../../vendor/YouTokenToMe/youtokentome/cpp", __dir__)
@@ -1,3 +1,3 @@
1
1
  module YouTokenToMe
2
- VERSION = "0.1.0"
2
+ VERSION = "0.2.0"
3
3
  end
metadata CHANGED
@@ -1,14 +1,14 @@
1
1
  --- !ruby/object:Gem::Specification
2
2
  name: youtokentome
3
3
  version: !ruby/object:Gem::Version
4
- version: 0.1.0
4
+ version: 0.2.0
5
5
  platform: ruby
6
6
  authors:
7
7
  - Andrew Kane
8
- autorequire:
8
+ autorequire:
9
9
  bindir: bin
10
10
  cert_chain: []
11
- date: 2020-02-24 00:00:00.000000000 Z
11
+ date: 2021-05-17 00:00:00.000000000 Z
12
12
  dependencies:
13
13
  - !ruby/object:Gem::Dependency
14
14
  name: rice
@@ -16,72 +16,16 @@ dependencies:
16
16
  requirements:
17
17
  - - ">="
18
18
  - !ruby/object:Gem::Version
19
- version: '2.2'
19
+ version: 4.0.2
20
20
  type: :runtime
21
21
  prerelease: false
22
22
  version_requirements: !ruby/object:Gem::Requirement
23
23
  requirements:
24
24
  - - ">="
25
25
  - !ruby/object:Gem::Version
26
- version: '2.2'
27
- - !ruby/object:Gem::Dependency
28
- name: bundler
29
- requirement: !ruby/object:Gem::Requirement
30
- requirements:
31
- - - ">="
32
- - !ruby/object:Gem::Version
33
- version: '0'
34
- type: :development
35
- prerelease: false
36
- version_requirements: !ruby/object:Gem::Requirement
37
- requirements:
38
- - - ">="
39
- - !ruby/object:Gem::Version
40
- version: '0'
41
- - !ruby/object:Gem::Dependency
42
- name: rake
43
- requirement: !ruby/object:Gem::Requirement
44
- requirements:
45
- - - ">="
46
- - !ruby/object:Gem::Version
47
- version: '0'
48
- type: :development
49
- prerelease: false
50
- version_requirements: !ruby/object:Gem::Requirement
51
- requirements:
52
- - - ">="
53
- - !ruby/object:Gem::Version
54
- version: '0'
55
- - !ruby/object:Gem::Dependency
56
- name: rake-compiler
57
- requirement: !ruby/object:Gem::Requirement
58
- requirements:
59
- - - ">="
60
- - !ruby/object:Gem::Version
61
- version: '0'
62
- type: :development
63
- prerelease: false
64
- version_requirements: !ruby/object:Gem::Requirement
65
- requirements:
66
- - - ">="
67
- - !ruby/object:Gem::Version
68
- version: '0'
69
- - !ruby/object:Gem::Dependency
70
- name: minitest
71
- requirement: !ruby/object:Gem::Requirement
72
- requirements:
73
- - - ">="
74
- - !ruby/object:Gem::Version
75
- version: '5'
76
- type: :development
77
- prerelease: false
78
- version_requirements: !ruby/object:Gem::Requirement
79
- requirements:
80
- - - ">="
81
- - !ruby/object:Gem::Version
82
- version: '5'
83
- description:
84
- email: andrew@chartkick.com
26
+ version: 4.0.2
27
+ description:
28
+ email: andrew@ankane.org
85
29
  executables: []
86
30
  extensions:
87
31
  - ext/youtokentome/extconf.rb
@@ -94,7 +38,6 @@ files:
94
38
  - ext/youtokentome/extconf.rb
95
39
  - lib/youtokentome.rb
96
40
  - lib/youtokentome/bpe.rb
97
- - lib/youtokentome/ext.bundle
98
41
  - lib/youtokentome/version.rb
99
42
  - vendor/YouTokenToMe/LICENSE
100
43
  - vendor/YouTokenToMe/README.md
@@ -111,7 +54,7 @@ homepage: https://github.com/ankane/youtokentome
111
54
  licenses:
112
55
  - MIT
113
56
  metadata: {}
114
- post_install_message:
57
+ post_install_message:
115
58
  rdoc_options: []
116
59
  require_paths:
117
60
  - lib
@@ -119,15 +62,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
119
62
  requirements:
120
63
  - - ">="
121
64
  - !ruby/object:Gem::Version
122
- version: '2.4'
65
+ version: '2.6'
123
66
  required_rubygems_version: !ruby/object:Gem::Requirement
124
67
  requirements:
125
68
  - - ">="
126
69
  - !ruby/object:Gem::Version
127
70
  version: '0'
128
71
  requirements: []
129
- rubygems_version: 3.1.2
130
- signing_key:
72
+ rubygems_version: 3.2.3
73
+ signing_key:
131
74
  specification_version: 4
132
- summary: High performance unsupervised text tokenizer for Ruby
75
+ summary: High performance unsupervised text tokenization for Ruby
133
76
  test_files: []
Binary file