youtokentome 0.1.0 → 0.2.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -1
- data/LICENSE.txt +18 -18
- data/README.md +16 -6
- data/ext/youtokentome/ext.cpp +52 -38
- data/ext/youtokentome/extconf.rb +1 -1
- data/lib/youtokentome/version.rb +1 -1
- metadata +12 -69
- data/lib/youtokentome/ext.bundle +0 -0
checksums.yaml
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
---
|
|
2
2
|
SHA256:
|
|
3
|
-
metadata.gz:
|
|
4
|
-
data.tar.gz:
|
|
3
|
+
metadata.gz: 9fd1ff636b8d8ede5a0dbae880595216656fe26fbd54004994d322e51d870d6a
|
|
4
|
+
data.tar.gz: b633849e81472eefcf36688547cf5e562e6a0dd6a3df8c554805f2289f0ceb07
|
|
5
5
|
SHA512:
|
|
6
|
-
metadata.gz:
|
|
7
|
-
data.tar.gz:
|
|
6
|
+
metadata.gz: b619f8b2cc8fa0c9cf92da1b12d66227d87dc931a6cb89307cbe3d3a54800328fcab4d9591a5968f2c4da83ed184aa67d78cd6a78d01d1faa21e09447a0609bc
|
|
7
|
+
data.tar.gz: 4b38a4db6d2adf2e24f6044cae75fb34eb6505a8e6ce8a48fe5a4ddfed3596f130431b8d59473885029ba3f4c71ace4f11fd22139a6ae8476d77d79cd0727b90
|
data/CHANGELOG.md
CHANGED
data/LICENSE.txt
CHANGED
|
@@ -1,22 +1,22 @@
|
|
|
1
|
-
Copyright (c) 2020 Andrew Kane
|
|
2
|
-
|
|
3
1
|
MIT License
|
|
4
2
|
|
|
5
|
-
|
|
6
|
-
|
|
7
|
-
|
|
8
|
-
|
|
9
|
-
|
|
10
|
-
|
|
11
|
-
|
|
3
|
+
Copyright (c) 2019 VK.com
|
|
4
|
+
Copyright (c) 2020-2021 Andrew Kane
|
|
5
|
+
|
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
|
8
|
+
in the Software without restriction, including without limitation the rights
|
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
|
11
|
+
furnished to do so, subject to the following conditions:
|
|
12
12
|
|
|
13
|
-
The above copyright notice and this permission notice shall be
|
|
14
|
-
|
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
|
14
|
+
copies or substantial portions of the Software.
|
|
15
15
|
|
|
16
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
OF
|
|
22
|
-
|
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
|
22
|
+
SOFTWARE.
|
data/README.md
CHANGED
|
@@ -1,6 +1,10 @@
|
|
|
1
1
|
# YouTokenToMe
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
[YouTokenToMe](https://github.com/VKCOM/YouTokenToMe) - high performance unsupervised text tokenization - for Ruby
|
|
4
|
+
|
|
5
|
+
Learn more about [how it works](https://medium.com/@vktech/youtokentome-a-tool-for-quick-text-tokenization-from-the-vk-team-aa6341215c5a)
|
|
6
|
+
|
|
7
|
+
[Build Status](https://github.com/ankane/youtokentome/actions)
|
|
4
8
|
|
|
5
9
|
## Installation
|
|
6
10
|
|
|
@@ -12,6 +16,12 @@ gem 'youtokentome'
|
|
|
12
16
|
|
|
13
17
|
## Getting Started
|
|
14
18
|
|
|
19
|
+
Dump your text to a file
|
|
20
|
+
|
|
21
|
+
```txt
|
|
22
|
+
Blazingly fast tokenization!
|
|
23
|
+
```
|
|
24
|
+
|
|
15
25
|
Train a model
|
|
16
26
|
|
|
17
27
|
```ruby
|
|
@@ -33,7 +43,7 @@ model.vocab
|
|
|
33
43
|
Encode
|
|
34
44
|
|
|
35
45
|
```ruby
|
|
36
|
-
model.encode(
|
|
46
|
+
model.encode(sentences)
|
|
37
47
|
```
|
|
38
48
|
|
|
39
49
|
Decode
|
|
@@ -60,10 +70,10 @@ YouTokenToMe::BPE.train(
|
|
|
60
70
|
vocab_size: 30000, # number of tokens in the final vocabulary
|
|
61
71
|
coverage: 1.0, # fraction of characters covered by the model
|
|
62
72
|
n_threads: -1, # number of parallel threads used to run
|
|
63
|
-
pad_id:
|
|
64
|
-
unk_id:
|
|
65
|
-
bos_id:
|
|
66
|
-
eos_id:
|
|
73
|
+
pad_id: 0, # reserved id for padding
|
|
74
|
+
unk_id: 1, # reserved id for unknown symbols
|
|
75
|
+
bos_id: 2, # reserved id for begin of sentence token
|
|
76
|
+
eos_id: 3 # reserved id for end of sentence token
|
|
67
77
|
)
|
|
68
78
|
```
|
|
69
79
|
|
data/ext/youtokentome/ext.cpp
CHANGED
|
@@ -3,9 +3,8 @@
|
|
|
3
3
|
#include <utils.h>
|
|
4
4
|
|
|
5
5
|
// rice
|
|
6
|
-
#include <rice/
|
|
7
|
-
#include <rice/
|
|
8
|
-
#include <rice/Object.hpp>
|
|
6
|
+
#include <rice/rice.hpp>
|
|
7
|
+
#include <rice/stl.hpp>
|
|
9
8
|
|
|
10
9
|
using Rice::define_class_under;
|
|
11
10
|
using Rice::define_module;
|
|
@@ -20,44 +19,59 @@ void check_status(vkcom::Status& status) {
|
|
|
20
19
|
}
|
|
21
20
|
}
|
|
22
21
|
|
|
23
|
-
|
|
24
|
-
Object to_ruby<std::vector<std::string>>(std::vector<std::string> const & x)
|
|
22
|
+
namespace Rice::detail
|
|
25
23
|
{
|
|
26
|
-
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
24
|
+
template<>
|
|
25
|
+
class To_Ruby<std::vector<std::string>>
|
|
26
|
+
{
|
|
27
|
+
public:
|
|
28
|
+
VALUE convert(std::vector<std::string> const & x)
|
|
29
|
+
{
|
|
30
|
+
Array ret;
|
|
31
|
+
for (auto& v : x) {
|
|
32
|
+
ret.push(v);
|
|
33
|
+
}
|
|
34
|
+
return ret;
|
|
35
|
+
}
|
|
36
|
+
};
|
|
32
37
|
|
|
33
|
-
template<>
|
|
34
|
-
|
|
35
|
-
{
|
|
36
|
-
|
|
37
|
-
|
|
38
|
-
|
|
39
|
-
|
|
40
|
-
|
|
41
|
-
|
|
42
|
-
|
|
38
|
+
template<>
|
|
39
|
+
class From_Ruby<std::vector<int>>
|
|
40
|
+
{
|
|
41
|
+
public:
|
|
42
|
+
std::vector<int> convert(VALUE x)
|
|
43
|
+
{
|
|
44
|
+
Array a = Array(x);
|
|
45
|
+
std::vector<int> ret;
|
|
46
|
+
for (size_t i = 0; i < a.size(); i++) {
|
|
47
|
+
ret.push_back(Rice::detail::From_Ruby<int>().convert(a[i].value()));
|
|
48
|
+
}
|
|
49
|
+
return ret;
|
|
50
|
+
}
|
|
51
|
+
};
|
|
43
52
|
|
|
44
|
-
template<>
|
|
45
|
-
|
|
46
|
-
{
|
|
47
|
-
|
|
48
|
-
|
|
49
|
-
|
|
50
|
-
|
|
51
|
-
|
|
52
|
-
|
|
53
|
+
template<>
|
|
54
|
+
class From_Ruby<std::vector<std::string>>
|
|
55
|
+
{
|
|
56
|
+
public:
|
|
57
|
+
std::vector<std::string> convert(VALUE x)
|
|
58
|
+
{
|
|
59
|
+
Array a = Array(x);
|
|
60
|
+
std::vector<std::string> ret;
|
|
61
|
+
for (size_t i = 0; i < a.size(); i++) {
|
|
62
|
+
ret.push_back(Rice::detail::From_Ruby<std::string>().convert(a[i].value()));
|
|
63
|
+
}
|
|
64
|
+
return ret;
|
|
65
|
+
}
|
|
66
|
+
};
|
|
53
67
|
}
|
|
54
68
|
|
|
55
69
|
extern "C" void Init_ext() {
|
|
56
70
|
Module rb_mYouTokenToMe = define_module("YouTokenToMe");
|
|
57
71
|
Module rb_mExt = define_module_under(rb_mYouTokenToMe, "Ext")
|
|
58
|
-
.
|
|
72
|
+
.define_singleton_function(
|
|
59
73
|
"train_bpe",
|
|
60
|
-
|
|
74
|
+
[](const std::string &input_path, const std::string &model_path, int vocab_size, double coverage,
|
|
61
75
|
int n_threads, int pad_id, int unk_id, int bos_id, int eos_id) {
|
|
62
76
|
|
|
63
77
|
vkcom::SpecialTokens special_tokens(pad_id, unk_id, bos_id, eos_id);
|
|
@@ -71,7 +85,7 @@ extern "C" void Init_ext() {
|
|
|
71
85
|
.define_method("subword_to_id", &vkcom::BaseEncoder::subword_to_id)
|
|
72
86
|
.define_method(
|
|
73
87
|
"id_to_subword",
|
|
74
|
-
|
|
88
|
+
[](vkcom::BaseEncoder& self, int id) {
|
|
75
89
|
std::string subword;
|
|
76
90
|
auto status = self.id_to_subword(id, &subword);
|
|
77
91
|
check_status(status);
|
|
@@ -79,7 +93,7 @@ extern "C" void Init_ext() {
|
|
|
79
93
|
})
|
|
80
94
|
.define_method(
|
|
81
95
|
"decode",
|
|
82
|
-
|
|
96
|
+
[](vkcom::BaseEncoder& self, std::vector<int> ids) {
|
|
83
97
|
std::string sentence;
|
|
84
98
|
const std::unordered_set<int> ignore_ids;
|
|
85
99
|
auto status = self.decode(ids, &sentence, &ignore_ids);
|
|
@@ -91,7 +105,7 @@ extern "C" void Init_ext() {
|
|
|
91
105
|
})
|
|
92
106
|
.define_method(
|
|
93
107
|
"encode_as_ids",
|
|
94
|
-
|
|
108
|
+
[](vkcom::BaseEncoder& self, std::vector<std::string> sentences, bool bos, bool eos, bool reverse, double dropout_prob) {
|
|
95
109
|
std::vector<std::vector<int>> ids;
|
|
96
110
|
auto status = self.encode_as_ids(sentences, &ids, bos, eos, reverse, dropout_prob);
|
|
97
111
|
check_status(status);
|
|
@@ -108,7 +122,7 @@ extern "C" void Init_ext() {
|
|
|
108
122
|
})
|
|
109
123
|
.define_method(
|
|
110
124
|
"encode_as_subwords",
|
|
111
|
-
|
|
125
|
+
[](vkcom::BaseEncoder& self, std::vector<std::string> sentences, bool bos, bool eos, bool reverse, double dropout_prob) {
|
|
112
126
|
std::vector<std::vector<std::string>> subwords;
|
|
113
127
|
auto status = self.encode_as_subwords(sentences, &subwords, bos, eos, reverse, dropout_prob);
|
|
114
128
|
check_status(status);
|
|
@@ -124,9 +138,9 @@ extern "C" void Init_ext() {
|
|
|
124
138
|
return ret;
|
|
125
139
|
})
|
|
126
140
|
.define_method("vocab", &vkcom::BaseEncoder::vocabulary)
|
|
127
|
-
.
|
|
141
|
+
.define_singleton_function(
|
|
128
142
|
"new",
|
|
129
|
-
|
|
143
|
+
[](std::string &model_path, int n_threads) {
|
|
130
144
|
auto status = vkcom::Status();
|
|
131
145
|
vkcom::BaseEncoder encoder(model_path, n_threads, &status);
|
|
132
146
|
check_status(status);
|
data/ext/youtokentome/extconf.rb
CHANGED
data/lib/youtokentome/version.rb
CHANGED
metadata
CHANGED
|
@@ -1,14 +1,14 @@
|
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
|
2
2
|
name: youtokentome
|
|
3
3
|
version: !ruby/object:Gem::Version
|
|
4
|
-
version: 0.
|
|
4
|
+
version: 0.2.0
|
|
5
5
|
platform: ruby
|
|
6
6
|
authors:
|
|
7
7
|
- Andrew Kane
|
|
8
|
-
autorequire:
|
|
8
|
+
autorequire:
|
|
9
9
|
bindir: bin
|
|
10
10
|
cert_chain: []
|
|
11
|
-
date:
|
|
11
|
+
date: 2021-05-17 00:00:00.000000000 Z
|
|
12
12
|
dependencies:
|
|
13
13
|
- !ruby/object:Gem::Dependency
|
|
14
14
|
name: rice
|
|
@@ -16,72 +16,16 @@ dependencies:
|
|
|
16
16
|
requirements:
|
|
17
17
|
- - ">="
|
|
18
18
|
- !ruby/object:Gem::Version
|
|
19
|
-
version:
|
|
19
|
+
version: 4.0.2
|
|
20
20
|
type: :runtime
|
|
21
21
|
prerelease: false
|
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
|
23
23
|
requirements:
|
|
24
24
|
- - ">="
|
|
25
25
|
- !ruby/object:Gem::Version
|
|
26
|
-
version:
|
|
27
|
-
|
|
28
|
-
|
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
|
30
|
-
requirements:
|
|
31
|
-
- - ">="
|
|
32
|
-
- !ruby/object:Gem::Version
|
|
33
|
-
version: '0'
|
|
34
|
-
type: :development
|
|
35
|
-
prerelease: false
|
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
37
|
-
requirements:
|
|
38
|
-
- - ">="
|
|
39
|
-
- !ruby/object:Gem::Version
|
|
40
|
-
version: '0'
|
|
41
|
-
- !ruby/object:Gem::Dependency
|
|
42
|
-
name: rake
|
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
|
44
|
-
requirements:
|
|
45
|
-
- - ">="
|
|
46
|
-
- !ruby/object:Gem::Version
|
|
47
|
-
version: '0'
|
|
48
|
-
type: :development
|
|
49
|
-
prerelease: false
|
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
51
|
-
requirements:
|
|
52
|
-
- - ">="
|
|
53
|
-
- !ruby/object:Gem::Version
|
|
54
|
-
version: '0'
|
|
55
|
-
- !ruby/object:Gem::Dependency
|
|
56
|
-
name: rake-compiler
|
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
|
58
|
-
requirements:
|
|
59
|
-
- - ">="
|
|
60
|
-
- !ruby/object:Gem::Version
|
|
61
|
-
version: '0'
|
|
62
|
-
type: :development
|
|
63
|
-
prerelease: false
|
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
65
|
-
requirements:
|
|
66
|
-
- - ">="
|
|
67
|
-
- !ruby/object:Gem::Version
|
|
68
|
-
version: '0'
|
|
69
|
-
- !ruby/object:Gem::Dependency
|
|
70
|
-
name: minitest
|
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
|
72
|
-
requirements:
|
|
73
|
-
- - ">="
|
|
74
|
-
- !ruby/object:Gem::Version
|
|
75
|
-
version: '5'
|
|
76
|
-
type: :development
|
|
77
|
-
prerelease: false
|
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
|
79
|
-
requirements:
|
|
80
|
-
- - ">="
|
|
81
|
-
- !ruby/object:Gem::Version
|
|
82
|
-
version: '5'
|
|
83
|
-
description:
|
|
84
|
-
email: andrew@chartkick.com
|
|
26
|
+
version: 4.0.2
|
|
27
|
+
description:
|
|
28
|
+
email: andrew@ankane.org
|
|
85
29
|
executables: []
|
|
86
30
|
extensions:
|
|
87
31
|
- ext/youtokentome/extconf.rb
|
|
@@ -94,7 +38,6 @@ files:
|
|
|
94
38
|
- ext/youtokentome/extconf.rb
|
|
95
39
|
- lib/youtokentome.rb
|
|
96
40
|
- lib/youtokentome/bpe.rb
|
|
97
|
-
- lib/youtokentome/ext.bundle
|
|
98
41
|
- lib/youtokentome/version.rb
|
|
99
42
|
- vendor/YouTokenToMe/LICENSE
|
|
100
43
|
- vendor/YouTokenToMe/README.md
|
|
@@ -111,7 +54,7 @@ homepage: https://github.com/ankane/youtokentome
|
|
|
111
54
|
licenses:
|
|
112
55
|
- MIT
|
|
113
56
|
metadata: {}
|
|
114
|
-
post_install_message:
|
|
57
|
+
post_install_message:
|
|
115
58
|
rdoc_options: []
|
|
116
59
|
require_paths:
|
|
117
60
|
- lib
|
|
@@ -119,15 +62,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
|
119
62
|
requirements:
|
|
120
63
|
- - ">="
|
|
121
64
|
- !ruby/object:Gem::Version
|
|
122
|
-
version: '2.
|
|
65
|
+
version: '2.6'
|
|
123
66
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
|
124
67
|
requirements:
|
|
125
68
|
- - ">="
|
|
126
69
|
- !ruby/object:Gem::Version
|
|
127
70
|
version: '0'
|
|
128
71
|
requirements: []
|
|
129
|
-
rubygems_version: 3.
|
|
130
|
-
signing_key:
|
|
72
|
+
rubygems_version: 3.2.3
|
|
73
|
+
signing_key:
|
|
131
74
|
specification_version: 4
|
|
132
|
-
summary: High performance unsupervised text
|
|
75
|
+
summary: High performance unsupervised text tokenization for Ruby
|
|
133
76
|
test_files: []
|
data/lib/youtokentome/ext.bundle
DELETED
|
Binary file
|