youtokentome 0.1.0 → 0.2.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -1
- data/LICENSE.txt +18 -18
- data/README.md +16 -6
- data/ext/youtokentome/ext.cpp +52 -38
- data/ext/youtokentome/extconf.rb +1 -1
- data/lib/youtokentome/version.rb +1 -1
- metadata +12 -69
- data/lib/youtokentome/ext.bundle +0 -0
checksums.yaml
CHANGED
@@ -1,7 +1,7 @@
|
|
1
1
|
---
|
2
2
|
SHA256:
|
3
|
-
metadata.gz:
|
4
|
-
data.tar.gz:
|
3
|
+
metadata.gz: 9fd1ff636b8d8ede5a0dbae880595216656fe26fbd54004994d322e51d870d6a
|
4
|
+
data.tar.gz: b633849e81472eefcf36688547cf5e562e6a0dd6a3df8c554805f2289f0ceb07
|
5
5
|
SHA512:
|
6
|
-
metadata.gz:
|
7
|
-
data.tar.gz:
|
6
|
+
metadata.gz: b619f8b2cc8fa0c9cf92da1b12d66227d87dc931a6cb89307cbe3d3a54800328fcab4d9591a5968f2c4da83ed184aa67d78cd6a78d01d1faa21e09447a0609bc
|
7
|
+
data.tar.gz: 4b38a4db6d2adf2e24f6044cae75fb34eb6505a8e6ce8a48fe5a4ddfed3596f130431b8d59473885029ba3f4c71ace4f11fd22139a6ae8476d77d79cd0727b90
|
data/CHANGELOG.md
CHANGED
data/LICENSE.txt
CHANGED
@@ -1,22 +1,22 @@
|
|
1
|
-
Copyright (c) 2020 Andrew Kane
|
2
|
-
|
3
1
|
MIT License
|
4
2
|
|
5
|
-
|
6
|
-
|
7
|
-
|
8
|
-
|
9
|
-
|
10
|
-
|
11
|
-
|
3
|
+
Copyright (c) 2019 VK.com
|
4
|
+
Copyright (c) 2020-2021 Andrew Kane
|
5
|
+
|
6
|
+
Permission is hereby granted, free of charge, to any person obtaining a copy
|
7
|
+
of this software and associated documentation files (the "Software"), to deal
|
8
|
+
in the Software without restriction, including without limitation the rights
|
9
|
+
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
10
|
+
copies of the Software, and to permit persons to whom the Software is
|
11
|
+
furnished to do so, subject to the following conditions:
|
12
12
|
|
13
|
-
The above copyright notice and this permission notice shall be
|
14
|
-
|
13
|
+
The above copyright notice and this permission notice shall be included in all
|
14
|
+
copies or substantial portions of the Software.
|
15
15
|
|
16
|
-
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
17
|
-
|
18
|
-
|
19
|
-
|
20
|
-
|
21
|
-
OF
|
22
|
-
|
16
|
+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
17
|
+
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
18
|
+
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
19
|
+
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
20
|
+
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
21
|
+
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
|
22
|
+
SOFTWARE.
|
data/README.md
CHANGED
@@ -1,6 +1,10 @@
|
|
1
1
|
# YouTokenToMe
|
2
2
|
|
3
|
-
|
3
|
+
[YouTokenToMe](https://github.com/VKCOM/YouTokenToMe) - high performance unsupervised text tokenization - for Ruby
|
4
|
+
|
5
|
+
Learn more about [how it works](https://medium.com/@vktech/youtokentome-a-tool-for-quick-text-tokenization-from-the-vk-team-aa6341215c5a)
|
6
|
+
|
7
|
+
[![Build Status](https://github.com/ankane/youtokentome/workflows/build/badge.svg?branch=master)](https://github.com/ankane/youtokentome/actions)
|
4
8
|
|
5
9
|
## Installation
|
6
10
|
|
@@ -12,6 +16,12 @@ gem 'youtokentome'
|
|
12
16
|
|
13
17
|
## Getting Started
|
14
18
|
|
19
|
+
Dump your text to a file
|
20
|
+
|
21
|
+
```txt
|
22
|
+
Blazingly fast tokenization!
|
23
|
+
```
|
24
|
+
|
15
25
|
Train a model
|
16
26
|
|
17
27
|
```ruby
|
@@ -33,7 +43,7 @@ model.vocab
|
|
33
43
|
Encode
|
34
44
|
|
35
45
|
```ruby
|
36
|
-
model.encode(
|
46
|
+
model.encode(sentences)
|
37
47
|
```
|
38
48
|
|
39
49
|
Decode
|
@@ -60,10 +70,10 @@ YouTokenToMe::BPE.train(
|
|
60
70
|
vocab_size: 30000, # number of tokens in the final vocabulary
|
61
71
|
coverage: 1.0, # fraction of characters covered by the model
|
62
72
|
n_threads: -1, # number of parallel threads used to run
|
63
|
-
pad_id:
|
64
|
-
unk_id:
|
65
|
-
bos_id:
|
66
|
-
eos_id:
|
73
|
+
pad_id: 0, # reserved id for padding
|
74
|
+
unk_id: 1, # reserved id for unknown symbols
|
75
|
+
bos_id: 2, # reserved id for begin of sentence token
|
76
|
+
eos_id: 3 # reserved id for end of sentence token
|
67
77
|
)
|
68
78
|
```
|
69
79
|
|
data/ext/youtokentome/ext.cpp
CHANGED
@@ -3,9 +3,8 @@
|
|
3
3
|
#include <utils.h>
|
4
4
|
|
5
5
|
// rice
|
6
|
-
#include <rice/
|
7
|
-
#include <rice/
|
8
|
-
#include <rice/Object.hpp>
|
6
|
+
#include <rice/rice.hpp>
|
7
|
+
#include <rice/stl.hpp>
|
9
8
|
|
10
9
|
using Rice::define_class_under;
|
11
10
|
using Rice::define_module;
|
@@ -20,44 +19,59 @@ void check_status(vkcom::Status& status) {
|
|
20
19
|
}
|
21
20
|
}
|
22
21
|
|
23
|
-
|
24
|
-
Object to_ruby<std::vector<std::string>>(std::vector<std::string> const & x)
|
22
|
+
namespace Rice::detail
|
25
23
|
{
|
26
|
-
|
27
|
-
|
28
|
-
|
29
|
-
|
30
|
-
|
31
|
-
|
24
|
+
template<>
|
25
|
+
class To_Ruby<std::vector<std::string>>
|
26
|
+
{
|
27
|
+
public:
|
28
|
+
VALUE convert(std::vector<std::string> const & x)
|
29
|
+
{
|
30
|
+
Array ret;
|
31
|
+
for (auto& v : x) {
|
32
|
+
ret.push(v);
|
33
|
+
}
|
34
|
+
return ret;
|
35
|
+
}
|
36
|
+
};
|
32
37
|
|
33
|
-
template<>
|
34
|
-
|
35
|
-
{
|
36
|
-
|
37
|
-
|
38
|
-
|
39
|
-
|
40
|
-
|
41
|
-
|
42
|
-
|
38
|
+
template<>
|
39
|
+
class From_Ruby<std::vector<int>>
|
40
|
+
{
|
41
|
+
public:
|
42
|
+
std::vector<int> convert(VALUE x)
|
43
|
+
{
|
44
|
+
Array a = Array(x);
|
45
|
+
std::vector<int> ret;
|
46
|
+
for (size_t i = 0; i < a.size(); i++) {
|
47
|
+
ret.push_back(Rice::detail::From_Ruby<int>().convert(a[i].value()));
|
48
|
+
}
|
49
|
+
return ret;
|
50
|
+
}
|
51
|
+
};
|
43
52
|
|
44
|
-
template<>
|
45
|
-
|
46
|
-
{
|
47
|
-
|
48
|
-
|
49
|
-
|
50
|
-
|
51
|
-
|
52
|
-
|
53
|
+
template<>
|
54
|
+
class From_Ruby<std::vector<std::string>>
|
55
|
+
{
|
56
|
+
public:
|
57
|
+
std::vector<std::string> convert(VALUE x)
|
58
|
+
{
|
59
|
+
Array a = Array(x);
|
60
|
+
std::vector<std::string> ret;
|
61
|
+
for (size_t i = 0; i < a.size(); i++) {
|
62
|
+
ret.push_back(Rice::detail::From_Ruby<std::string>().convert(a[i].value()));
|
63
|
+
}
|
64
|
+
return ret;
|
65
|
+
}
|
66
|
+
};
|
53
67
|
}
|
54
68
|
|
55
69
|
extern "C" void Init_ext() {
|
56
70
|
Module rb_mYouTokenToMe = define_module("YouTokenToMe");
|
57
71
|
Module rb_mExt = define_module_under(rb_mYouTokenToMe, "Ext")
|
58
|
-
.
|
72
|
+
.define_singleton_function(
|
59
73
|
"train_bpe",
|
60
|
-
|
74
|
+
[](const std::string &input_path, const std::string &model_path, int vocab_size, double coverage,
|
61
75
|
int n_threads, int pad_id, int unk_id, int bos_id, int eos_id) {
|
62
76
|
|
63
77
|
vkcom::SpecialTokens special_tokens(pad_id, unk_id, bos_id, eos_id);
|
@@ -71,7 +85,7 @@ extern "C" void Init_ext() {
|
|
71
85
|
.define_method("subword_to_id", &vkcom::BaseEncoder::subword_to_id)
|
72
86
|
.define_method(
|
73
87
|
"id_to_subword",
|
74
|
-
|
88
|
+
[](vkcom::BaseEncoder& self, int id) {
|
75
89
|
std::string subword;
|
76
90
|
auto status = self.id_to_subword(id, &subword);
|
77
91
|
check_status(status);
|
@@ -79,7 +93,7 @@ extern "C" void Init_ext() {
|
|
79
93
|
})
|
80
94
|
.define_method(
|
81
95
|
"decode",
|
82
|
-
|
96
|
+
[](vkcom::BaseEncoder& self, std::vector<int> ids) {
|
83
97
|
std::string sentence;
|
84
98
|
const std::unordered_set<int> ignore_ids;
|
85
99
|
auto status = self.decode(ids, &sentence, &ignore_ids);
|
@@ -91,7 +105,7 @@ extern "C" void Init_ext() {
|
|
91
105
|
})
|
92
106
|
.define_method(
|
93
107
|
"encode_as_ids",
|
94
|
-
|
108
|
+
[](vkcom::BaseEncoder& self, std::vector<std::string> sentences, bool bos, bool eos, bool reverse, double dropout_prob) {
|
95
109
|
std::vector<std::vector<int>> ids;
|
96
110
|
auto status = self.encode_as_ids(sentences, &ids, bos, eos, reverse, dropout_prob);
|
97
111
|
check_status(status);
|
@@ -108,7 +122,7 @@ extern "C" void Init_ext() {
|
|
108
122
|
})
|
109
123
|
.define_method(
|
110
124
|
"encode_as_subwords",
|
111
|
-
|
125
|
+
[](vkcom::BaseEncoder& self, std::vector<std::string> sentences, bool bos, bool eos, bool reverse, double dropout_prob) {
|
112
126
|
std::vector<std::vector<std::string>> subwords;
|
113
127
|
auto status = self.encode_as_subwords(sentences, &subwords, bos, eos, reverse, dropout_prob);
|
114
128
|
check_status(status);
|
@@ -124,9 +138,9 @@ extern "C" void Init_ext() {
|
|
124
138
|
return ret;
|
125
139
|
})
|
126
140
|
.define_method("vocab", &vkcom::BaseEncoder::vocabulary)
|
127
|
-
.
|
141
|
+
.define_singleton_function(
|
128
142
|
"new",
|
129
|
-
|
143
|
+
[](std::string &model_path, int n_threads) {
|
130
144
|
auto status = vkcom::Status();
|
131
145
|
vkcom::BaseEncoder encoder(model_path, n_threads, &status);
|
132
146
|
check_status(status);
|
data/ext/youtokentome/extconf.rb
CHANGED
data/lib/youtokentome/version.rb
CHANGED
metadata
CHANGED
@@ -1,14 +1,14 @@
|
|
1
1
|
--- !ruby/object:Gem::Specification
|
2
2
|
name: youtokentome
|
3
3
|
version: !ruby/object:Gem::Version
|
4
|
-
version: 0.
|
4
|
+
version: 0.2.0
|
5
5
|
platform: ruby
|
6
6
|
authors:
|
7
7
|
- Andrew Kane
|
8
|
-
autorequire:
|
8
|
+
autorequire:
|
9
9
|
bindir: bin
|
10
10
|
cert_chain: []
|
11
|
-
date:
|
11
|
+
date: 2021-05-17 00:00:00.000000000 Z
|
12
12
|
dependencies:
|
13
13
|
- !ruby/object:Gem::Dependency
|
14
14
|
name: rice
|
@@ -16,72 +16,16 @@ dependencies:
|
|
16
16
|
requirements:
|
17
17
|
- - ">="
|
18
18
|
- !ruby/object:Gem::Version
|
19
|
-
version:
|
19
|
+
version: 4.0.2
|
20
20
|
type: :runtime
|
21
21
|
prerelease: false
|
22
22
|
version_requirements: !ruby/object:Gem::Requirement
|
23
23
|
requirements:
|
24
24
|
- - ">="
|
25
25
|
- !ruby/object:Gem::Version
|
26
|
-
version:
|
27
|
-
|
28
|
-
|
29
|
-
requirement: !ruby/object:Gem::Requirement
|
30
|
-
requirements:
|
31
|
-
- - ">="
|
32
|
-
- !ruby/object:Gem::Version
|
33
|
-
version: '0'
|
34
|
-
type: :development
|
35
|
-
prerelease: false
|
36
|
-
version_requirements: !ruby/object:Gem::Requirement
|
37
|
-
requirements:
|
38
|
-
- - ">="
|
39
|
-
- !ruby/object:Gem::Version
|
40
|
-
version: '0'
|
41
|
-
- !ruby/object:Gem::Dependency
|
42
|
-
name: rake
|
43
|
-
requirement: !ruby/object:Gem::Requirement
|
44
|
-
requirements:
|
45
|
-
- - ">="
|
46
|
-
- !ruby/object:Gem::Version
|
47
|
-
version: '0'
|
48
|
-
type: :development
|
49
|
-
prerelease: false
|
50
|
-
version_requirements: !ruby/object:Gem::Requirement
|
51
|
-
requirements:
|
52
|
-
- - ">="
|
53
|
-
- !ruby/object:Gem::Version
|
54
|
-
version: '0'
|
55
|
-
- !ruby/object:Gem::Dependency
|
56
|
-
name: rake-compiler
|
57
|
-
requirement: !ruby/object:Gem::Requirement
|
58
|
-
requirements:
|
59
|
-
- - ">="
|
60
|
-
- !ruby/object:Gem::Version
|
61
|
-
version: '0'
|
62
|
-
type: :development
|
63
|
-
prerelease: false
|
64
|
-
version_requirements: !ruby/object:Gem::Requirement
|
65
|
-
requirements:
|
66
|
-
- - ">="
|
67
|
-
- !ruby/object:Gem::Version
|
68
|
-
version: '0'
|
69
|
-
- !ruby/object:Gem::Dependency
|
70
|
-
name: minitest
|
71
|
-
requirement: !ruby/object:Gem::Requirement
|
72
|
-
requirements:
|
73
|
-
- - ">="
|
74
|
-
- !ruby/object:Gem::Version
|
75
|
-
version: '5'
|
76
|
-
type: :development
|
77
|
-
prerelease: false
|
78
|
-
version_requirements: !ruby/object:Gem::Requirement
|
79
|
-
requirements:
|
80
|
-
- - ">="
|
81
|
-
- !ruby/object:Gem::Version
|
82
|
-
version: '5'
|
83
|
-
description:
|
84
|
-
email: andrew@chartkick.com
|
26
|
+
version: 4.0.2
|
27
|
+
description:
|
28
|
+
email: andrew@ankane.org
|
85
29
|
executables: []
|
86
30
|
extensions:
|
87
31
|
- ext/youtokentome/extconf.rb
|
@@ -94,7 +38,6 @@ files:
|
|
94
38
|
- ext/youtokentome/extconf.rb
|
95
39
|
- lib/youtokentome.rb
|
96
40
|
- lib/youtokentome/bpe.rb
|
97
|
-
- lib/youtokentome/ext.bundle
|
98
41
|
- lib/youtokentome/version.rb
|
99
42
|
- vendor/YouTokenToMe/LICENSE
|
100
43
|
- vendor/YouTokenToMe/README.md
|
@@ -111,7 +54,7 @@ homepage: https://github.com/ankane/youtokentome
|
|
111
54
|
licenses:
|
112
55
|
- MIT
|
113
56
|
metadata: {}
|
114
|
-
post_install_message:
|
57
|
+
post_install_message:
|
115
58
|
rdoc_options: []
|
116
59
|
require_paths:
|
117
60
|
- lib
|
@@ -119,15 +62,15 @@ required_ruby_version: !ruby/object:Gem::Requirement
|
|
119
62
|
requirements:
|
120
63
|
- - ">="
|
121
64
|
- !ruby/object:Gem::Version
|
122
|
-
version: '2.
|
65
|
+
version: '2.6'
|
123
66
|
required_rubygems_version: !ruby/object:Gem::Requirement
|
124
67
|
requirements:
|
125
68
|
- - ">="
|
126
69
|
- !ruby/object:Gem::Version
|
127
70
|
version: '0'
|
128
71
|
requirements: []
|
129
|
-
rubygems_version: 3.
|
130
|
-
signing_key:
|
72
|
+
rubygems_version: 3.2.3
|
73
|
+
signing_key:
|
131
74
|
specification_version: 4
|
132
|
-
summary: High performance unsupervised text
|
75
|
+
summary: High performance unsupervised text tokenization for Ruby
|
133
76
|
test_files: []
|
data/lib/youtokentome/ext.bundle
DELETED
Binary file
|