youtokentome 0.1.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
checksums.yaml ADDED
@@ -0,0 +1,7 @@
1
+ ---
2
+ SHA256:
3
+ metadata.gz: e55b4f8edc8306370a1e5f7138bbbb00912b29234e7169d4dd7233fee04934cb
4
+ data.tar.gz: dea649fc649c23a0955ed603867e7312a66d64f241fadad7fb057a9164bda285
5
+ SHA512:
6
+ metadata.gz: '080b09ffa1cb1721d321e7af92b980087e2bd77e74b76127e4f2131c1cb4a72895ea7ed121d4a2a79ded30989e862d40ab6e3a01c0489da130ef131f66f37b96'
7
+ data.tar.gz: 741c1c809801be24105a52be8884dd74e273b068c9c9d004dfbede6a9e050ecc662acf0578bc3b2ec8f20002c117003be2f5d3e79ee399408c466e5fe57d2af4
data/CHANGELOG.md ADDED
@@ -0,0 +1,3 @@
1
+ ## 0.1.0 (unreleased)
2
+
3
+ - First release
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
1
+ Copyright (c) 2020 Andrew Kane
2
+
3
+ MIT License
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining
6
+ a copy of this software and associated documentation files (the
7
+ "Software"), to deal in the Software without restriction, including
8
+ without limitation the rights to use, copy, modify, merge, publish,
9
+ distribute, sublicense, and/or sell copies of the Software, and to
10
+ permit persons to whom the Software is furnished to do so, subject to
11
+ the following conditions:
12
+
13
+ The above copyright notice and this permission notice shall be
14
+ included in all copies or substantial portions of the Software.
15
+
16
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
20
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
21
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
22
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,104 @@
1
+ # YouTokenToMe
2
+
3
+ :fire: [YouTokenToMe](https://github.com/VKCOM/YouTokenToMe) - the high performance unsupervised text tokenizer - for Ruby
4
+
5
+ ## Installation
6
+
7
+ Add this line to your application’s Gemfile:
8
+
9
+ ```ruby
10
+ gem 'youtokentome'
11
+ ```
12
+
13
+ ## Getting Started
14
+
15
+ Train a model
16
+
17
+ ```ruby
18
+ model = YouTokenToMe::BPE.train(data: "train.txt", model: "model.txt", vocab_size: 30000)
19
+ ```
20
+
21
+ Load a model
22
+
23
+ ```ruby
24
+ model = YouTokenToMe::BPE.new("model.txt")
25
+ ```
26
+
27
+ Get vocab
28
+
29
+ ```ruby
30
+ model.vocab
31
+ ```
32
+
33
+ Encode
34
+
35
+ ```ruby
36
+ model.encode(sentences)
37
+ ```
38
+
39
+ Decode
40
+
41
+ ```ruby
42
+ model.decode(ids)
43
+ ```
44
+
45
+ Convert between ids and subwords
46
+
47
+ ```ruby
48
+ model.subword_to_id(subword)
49
+ model.id_to_subword(id)
50
+ ```
51
+
52
+ ## Options
53
+
54
+ Train
55
+
56
+ ```ruby
57
+ YouTokenToMe::BPE.train(
58
+ data: "train.txt", # path to file with training data
59
+ model: "model.txt", # path to where the trained model will be saved
60
+ vocab_size: 30000, # number of tokens in the final vocabulary
61
+ coverage: 1.0, # fraction of characters covered by the model
62
+ n_threads: -1, # number of parallel threads used to run
63
+ pad_id: 0, # reserved id for padding
64
+ unk_id: 1, # reserved id for unknown symbols
65
+ bos_id: 2, # reserved id for begin of sentence token
66
+ eos_id: 3 # reserved id for end of sentence token
67
+ )
68
+ ```
69
+
70
+ Encode
71
+
72
+ ```ruby
73
+ model.encode(
74
+ sentences,
75
+ output_type: :id, # or :subword
76
+ bos: false, # add "beginning of sentence" token
77
+ eos: false, # add "end of sentence" token
78
+ reverse: false, # reverse output sequence of tokens
79
+ dropout_prob: 0.0 # BPE-dropout probability
80
+ )
81
+ ```
82
+
83
+ ## History
84
+
85
+ View the [changelog](https://github.com/ankane/youtokentome/blob/master/CHANGELOG.md)
86
+
87
+ ## Contributing
88
+
89
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
90
+
91
+ - [Report bugs](https://github.com/ankane/youtokentome/issues)
92
+ - Fix bugs and [submit pull requests](https://github.com/ankane/youtokentome/pulls)
93
+ - Write, clarify, or fix documentation
94
+ - Suggest or add new features
95
+
96
+ To get started with development:
97
+
98
+ ```sh
99
+ git clone https://github.com/ankane/youtokentome.git
100
+ cd youtokentome
101
+ bundle install
102
+ bundle exec rake compile
103
+ bundle exec rake test
104
+ ```
@@ -0,0 +1,135 @@
1
+ // youtokentome
2
+ #include <bpe.h>
3
+ #include <utils.h>
4
+
5
+ // rice
6
+ #include <rice/Array.hpp>
7
+ #include <rice/Data_Type.hpp>
8
+ #include <rice/Object.hpp>
9
+
10
+ using Rice::define_class_under;
11
+ using Rice::define_module;
12
+ using Rice::define_module_under;
13
+ using Rice::Array;
14
+ using Rice::Module;
15
+ using Rice::Object;
16
+
17
+ void check_status(vkcom::Status& status) {
18
+ if (!status.ok()) {
19
+ throw std::invalid_argument(status.error_message());
20
+ }
21
+ }
22
+
23
+ template<>
24
+ Object to_ruby<std::vector<std::string>>(std::vector<std::string> const & x)
25
+ {
26
+ Array ret;
27
+ for (auto& v : x) {
28
+ ret.push(v);
29
+ }
30
+ return ret;
31
+ }
32
+
33
+ template<>
34
+ std::vector<int> from_ruby<std::vector<int>>(Object x)
35
+ {
36
+ std::vector<int> ret;
37
+ Array a = Array(x);
38
+ for (size_t i = 0; i < a.size(); i++) {
39
+ ret.push_back(from_ruby<int>(a[i]));
40
+ }
41
+ return ret;
42
+ }
43
+
44
+ template<>
45
+ std::vector<std::string> from_ruby<std::vector<std::string>>(Object x)
46
+ {
47
+ std::vector<std::string> ret;
48
+ Array a = Array(x);
49
+ for (size_t i = 0; i < a.size(); i++) {
50
+ ret.push_back(from_ruby<std::string>(a[i]));
51
+ }
52
+ return ret;
53
+ }
54
+
55
+ extern "C" void Init_ext() {
56
+ Module rb_mYouTokenToMe = define_module("YouTokenToMe");
57
+ Module rb_mExt = define_module_under(rb_mYouTokenToMe, "Ext")
58
+ .define_singleton_method(
59
+ "train_bpe",
60
+ *[](std::string &input_path, std::string &model_path, int vocab_size, double coverage,
61
+ int n_threads, int pad_id, int unk_id, int bos_id, int eos_id) {
62
+
63
+ vkcom::SpecialTokens special_tokens(pad_id, unk_id, bos_id, eos_id);
64
+ vkcom::BpeConfig config(coverage, n_threads, special_tokens);
65
+ auto status = vkcom::train_bpe(input_path, model_path, vocab_size, config);
66
+ check_status(status);
67
+ });
68
+
69
+ define_class_under<vkcom::BaseEncoder>(rb_mExt, "BaseEncoder")
70
+ .define_method("vocab_size", &vkcom::BaseEncoder::vocab_size)
71
+ .define_method("subword_to_id", &vkcom::BaseEncoder::subword_to_id)
72
+ .define_method(
73
+ "id_to_subword",
74
+ *[](vkcom::BaseEncoder& self, int id) {
75
+ std::string subword;
76
+ auto status = self.id_to_subword(id, &subword);
77
+ check_status(status);
78
+ return subword;
79
+ })
80
+ .define_method(
81
+ "decode",
82
+ *[](vkcom::BaseEncoder& self, std::vector<int> ids) {
83
+ std::string sentence;
84
+ const std::unordered_set<int> ignore_ids;
85
+ auto status = self.decode(ids, &sentence, &ignore_ids);
86
+ check_status(status);
87
+
88
+ Array ret;
89
+ ret.push(sentence);
90
+ return ret;
91
+ })
92
+ .define_method(
93
+ "encode_as_ids",
94
+ *[](vkcom::BaseEncoder& self, std::vector<std::string> sentences, bool bos, bool eos, bool reverse, double dropout_prob) {
95
+ std::vector<std::vector<int>> ids;
96
+ auto status = self.encode_as_ids(sentences, &ids, bos, eos, reverse, dropout_prob);
97
+ check_status(status);
98
+
99
+ Array ret;
100
+ for (auto& v : ids) {
101
+ Array r;
102
+ for (auto& v2 : v) {
103
+ r.push(v2);
104
+ }
105
+ ret.push(r);
106
+ }
107
+ return ret;
108
+ })
109
+ .define_method(
110
+ "encode_as_subwords",
111
+ *[](vkcom::BaseEncoder& self, std::vector<std::string> sentences, bool bos, bool eos, bool reverse, double dropout_prob) {
112
+ std::vector<std::vector<std::string>> subwords;
113
+ auto status = self.encode_as_subwords(sentences, &subwords, bos, eos, reverse, dropout_prob);
114
+ check_status(status);
115
+
116
+ Array ret;
117
+ for (auto& v : subwords) {
118
+ Array r;
119
+ for (auto& v2 : v) {
120
+ r.push(v2);
121
+ }
122
+ ret.push(r);
123
+ }
124
+ return ret;
125
+ })
126
+ .define_method("vocab", &vkcom::BaseEncoder::vocabulary)
127
+ .define_singleton_method(
128
+ "new",
129
+ *[](std::string &model_path, int n_threads) {
130
+ auto status = vkcom::Status();
131
+ vkcom::BaseEncoder encoder(model_path, n_threads, &status);
132
+ check_status(status);
133
+ return encoder;
134
+ });
135
+ }
@@ -0,0 +1,12 @@
1
+ require "mkmf-rice"
2
+
3
+ $CXXFLAGS << " -std=c++11"
4
+
5
+ ext = File.expand_path(".", __dir__)
6
+ youtokentome = File.expand_path("../../vendor/YouTokenToMe/youtokentome/cpp", __dir__)
7
+
8
+ $srcs = Dir["{#{ext},#{youtokentome}}/*.{cc,cpp}"]
9
+ $INCFLAGS << " -I#{youtokentome}"
10
+ $VPATH << youtokentome
11
+
12
+ create_makefile("youtokentome/ext")
@@ -0,0 +1,10 @@
1
+ # ext
2
+ require "youtokentome/ext"
3
+
4
+ # modules
5
+ require "youtokentome/version"
6
+ require "youtokentome/bpe"
7
+
8
+ module YouTokenToMe
9
+ class Error < StandardError; end
10
+ end
@@ -0,0 +1,54 @@
1
+ module YouTokenToMe
2
+ class BPE
3
+ def initialize(model, n_threads: -1)
4
+ @encoder = Ext::BaseEncoder.new(model, n_threads)
5
+ end
6
+
7
+ def vocab_size
8
+ @encoder.vocab_size
9
+ end
10
+
11
+ def vocab
12
+ vocab = @encoder.vocab
13
+ vocab.each do |v|
14
+ v.force_encoding(Encoding::UTF_8)
15
+ end
16
+ vocab
17
+ end
18
+
19
+ def subword_to_id(subword)
20
+ @encoder.subword_to_id(subword)
21
+ end
22
+
23
+ def id_to_subword(id)
24
+ @encoder.id_to_subword(id)
25
+ end
26
+
27
+ def encode(sentences, output_type: :id, bos: false, eos: false, reverse: false, dropout_prob: 0)
28
+ case output_type
29
+ when :id
30
+ @encoder.encode_as_ids(sentences, bos, eos, reverse, dropout_prob)
31
+ when :subword
32
+ subwords = @encoder.encode_as_subwords(sentences, bos, eos, reverse, dropout_prob)
33
+ subwords.each do |s|
34
+ s.each do |v|
35
+ v.force_encoding(Encoding::UTF_8)
36
+ end
37
+ end
38
+ subwords
39
+ else
40
+ raise ArgumentError, "Unknown output type"
41
+ end
42
+ end
43
+
44
+ # TODO add ignore_ids
45
+ def decode(ids)
46
+ @encoder.decode(ids)
47
+ end
48
+
49
+ def self.train(data:, model:, vocab_size:, coverage: 1.0, n_threads: -1, pad_id: 0, unk_id: 1, bos_id: 2, eos_id: 3)
50
+ Ext.train_bpe(data, model, vocab_size, coverage, n_threads, pad_id, unk_id, bos_id, eos_id)
51
+ new(model, n_threads: n_threads)
52
+ end
53
+ end
54
+ end
Binary file
@@ -0,0 +1,3 @@
1
+ module YouTokenToMe
2
+ VERSION = "0.1.0"
3
+ end
@@ -0,0 +1,19 @@
1
+ Copyright (c) 2019 VK.com
2
+
3
+ Permission is hereby granted, free of charge, to any person obtaining a copy
4
+ of this software and associated documentation files (the "Software"), to deal
5
+ in the Software without restriction, including without limitation the rights
6
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7
+ copies of the Software, and to permit persons to whom the Software is
8
+ furnished to do so, subject to the following conditions:
9
+
10
+ The above copyright notice and this permission notice shall be included in all
11
+ copies or substantial portions of the Software.
12
+
13
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
19
+ SOFTWARE.
@@ -0,0 +1,304 @@
1
+ ![PyPI](https://img.shields.io/pypi/v/youtokentome.svg)
2
+ [![Downloads](https://pepy.tech/badge/youtokentome)](https://pepy.tech/project/youtokentome)
3
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black)
4
+ ![GitHub](https://img.shields.io/github/license/vkcom/youtokentome.svg)
5
+ [![Build Status](https://travis-ci.org/VKCOM/YouTokenToMe.svg?branch=master)](https://travis-ci.org/VKCOM/YouTokenToMe)
6
+
7
+ # YouTokenToMe
8
+
9
+ YouTokenToMe is an unsupervised text tokenizer focused on computational efficiency. It currently implements fast Byte Pair Encoding (BPE) [[Sennrich et al.](https://www.aclweb.org/anthology/P16-1162)].
10
+ Our implementation is much faster in training and tokenization than [Hugging Face](https://github.com/huggingface/tokenizers), [fastBPE](https://github.com/glample/fastBPE)
11
+ and [SentencePiece](https://github.com/google/sentencepiece). In some test cases, it is 90 times faster.
12
+ Check out our [benchmark](benchmark.md) results.
13
+
14
+ Key advantages:
15
+
16
+ * Multithreading for training and tokenization
17
+ * The algorithm has `O(N)` complexity, where `N` is the length of training data
18
+ * Highly efficient implementation in C++
19
+ * Python wrapper and command-line interface
20
+
21
+ Extra features:
22
+ * BPE-dropout (as described in [Provilkov et al, 2019](https://arxiv.org/abs/1910.13267))
23
+
24
+ As in the algorithm from the original paper, our implementation does not consider tokens
25
+ that cross word boundaries. Just like in [SentencePiece](https://github.com/google/sentencepiece), all space symbols are replaced by the meta symbol "▁" (U+2581). This allows sequences of tokens to be converted back to text and word boundaries to be restored.
26
+
27
+ For example, the phrase ```Blazingly fast tokenization!``` can be tokenized into
28
+
29
+ `['▁Bl', 'az', 'ingly', '▁fast', '▁token', 'ization', '!']`
30
+
31
+ ## Installation
32
+
33
+ ```bash
34
+ pip install youtokentome
35
+ ```
36
+ ## Python interface
37
+
38
+ ### Example
39
+ Let's start with a self-contained example.
40
+
41
+ ```python
42
+ import random
43
+
44
+ import youtokentome as yttm
45
+
46
+ train_data_path = "train_data.txt"
47
+ model_path = "example.model"
48
+
49
+ # Generating random file with training data
50
+ # 10000 lines with 100 characters in each line
51
+ n_lines = 10000
52
+ n_characters = 100
53
+ with open(train_data_path, "w") as fout:
54
+ for _ in range(n_lines):
55
+ print("".join([random.choice("abcd ") for _ in range(n_characters)]), file=fout)
56
+
57
+ # Generating random text
58
+ test_text = "".join([random.choice("abcde ") for _ in range(100)])
59
+
60
+ # Training model
61
+ yttm.BPE.train(data=train_data_path, vocab_size=5000, model=model_path)
62
+
63
+ # Loading model
64
+ bpe = yttm.BPE(model=model_path)
65
+
66
+ # Two types of tokenization
67
+ print(bpe.encode([test_text], output_type=yttm.OutputType.ID))
68
+ print(bpe.encode([test_text], output_type=yttm.OutputType.SUBWORD))
69
+ ```
70
+
71
+ &nbsp;
72
+ ### Training model
73
+ ```python
74
+ youtokentome.BPE.train(data, model, vocab_size, coverage, n_threads=-1, pad_id=0, unk_id=1, bos_id=2, eos_id=3)
75
+ ```
76
+ Trains BPE model and saves to file.
77
+
78
+ **Args:**
79
+
80
+ * `data`: string, path to file with training data
81
+ * `model`: string, path to where the trained model will be saved
82
+ * `vocab_size`: int, number of tokens in the final vocabulary
83
+ * `coverage`: float, fraction of characters covered by the model. Must be in the range [0, 1]. A good value to use is about 0.9999.
84
+ * `n_threads`: int, number of parallel threads used to run. If -1 is passed, then all available threads are going to be used. Note that the number of threads is limited to 8 (see [benchmark](benchmark.md#number-of-threads)).
85
+ * `pad_id`: int, reserved id for padding
86
+ * `unk_id`: int, reserved id for unknown symbols
87
+ * `bos_id`: int, reserved id for begin of sentence token
88
+ * `eos_id`: int, reserved id for end of sentence token
89
+
90
+ **Returns**: Class `youtokentome.BPE` with the loaded model.
91
+
92
+
93
+ &nbsp;
94
+
95
+ ### Model loading
96
+
97
+ ```python
98
+ youtokentome.BPE(model, n_threads=-1)
99
+ ```
100
+
101
+ Class constructor. Loads the trained model.
102
+
103
+ * `model`: string, path to the trained model
104
+ * `n_threads`: int, number of parallel threads used to run.
105
+ If equal to -1, then the maximum number of threads available will be used.
106
+
107
+ &nbsp;
108
+
109
+ ### Methods
110
+ Class `youtokentome.BPE` has the following methods:
111
+ #### encode
112
+ ```python
113
+ encode(self, sentences, output_type=yttm.OutputType.ID, bos=False, eos=False, reverse=False, dropout_prob=0)
114
+ ```
115
+
116
+ **Args:**
117
+
118
+ * `sentences`: list of strings, sentences for tokenization.
119
+ * `output_type`: enum, sentence can be tokenized to ids or subwords. Use `OutputType.ID` for ids and `OutputType.SUBWORD` for subwords.
120
+ * `bos`: bool, if True then token “beginning of sentence” will be added
121
+ * `eos`: bool, if True then token “end of sentence” will be added
122
+ * `reverse`: bool, if True the output sequence of tokens will be reversed
123
+ * `dropout_prob`: float, BPE-dropout probability (the probability of a merge being dropped). Must be in the range [0, 1].
124
+
125
+
126
+ **Returns:** If `output_type` is equal to `youtokentome.OutputType.ID` or `youtokentome.OutputType.SUBWORD`
127
+ then a list of lists of integers or list of lists of strings will be returned
128
+ respectively.
129
+
130
+ &nbsp;
131
+ #### vocab
132
+
133
+ ```python
134
+ vocab(self)
135
+ ```
136
+
137
+ **Returns:** A list of `vocab_size` strings. The i-th string in the list corresponds
138
+ to i-th subword.
139
+
140
+ &nbsp;
141
+ #### vocab_size
142
+
143
+ ```python
144
+ vocab_size(self)
145
+ ```
146
+
147
+ **Returns:** int. Size of vocabulary.
148
+
149
+ &nbsp;
150
+ #### subword_to_id
151
+
152
+ ```python
153
+ subword_to_id(self, subword)
154
+ ```
155
+ **Args:**
156
+ * `subword`: string.
157
+
158
+ **Returns:**
159
+ Integer from the range [0, vocab_size-1]. Id of subword or,
160
+ if there is no such subword in the vocabulary, `unk_id` will be
161
+ returned.
162
+
163
+ &nbsp;
164
+ #### id_to_subword
165
+
166
+ ```python
167
+ id_to_subword(self, id)
168
+ ```
169
+ **Args:**
170
+ * `id`: int, must be in the range [0, vocab_size-1]
171
+
172
+ **Returns:** string. Subword from vocabulary by id.
173
+
174
+ &nbsp;
175
+ #### decode
176
+ ```python
177
+ decode(self, ids, ignore_ids=None)
178
+ ```
179
+ Convert each id to subword and concatenate with space symbol.
180
+
181
+ **Args:**
182
+
183
+ * `ids`: list of lists of integers. All integers must be in the range [0, vocab_size-1]
184
+ * `ignore_ids`: collection of integers. These indices would be ignored during the decoding. All integers must be in the range [0, vocab_size-1] [default: None]
185
+
186
+
187
+ **Returns:** List of strings.
188
+
189
+ ## Command line interface
190
+
191
+ ### Example
192
+
193
+ ```bash
194
+ $ yttm bpe --data TRAINING_DATA_FILE --model OUTPUT_MODEL_FILE --vocab_size 2000
195
+ $ yttm encode --model OUTPUT_MODEL_FILE --output_type subword < TEST_DATA_FILE > ENCODED_DATA
196
+ ```
197
+
198
+
199
+ ### Supported commands
200
+
201
+ `YouTokenToMe` supports the following commands:
202
+
203
+ ```
204
+ $ yttm --help
205
+
206
+ Usage: yttm [OPTIONS] COMMAND [ARGS]...
207
+
208
+ Options:
209
+ --help Show this message and exit.
210
+
211
+ Commands:
212
+ bpe Train BPE model.
213
+ decode Decode ids to text.
214
+ encode Encode text to ids or subwords.
215
+ vocab Print list of learned subwords.
216
+ ```
217
+
218
+ Command `bpe` allows you to train a Byte Pair Encoding model based on a text file.
219
+
220
+ ```
221
+ $ yttm bpe --help
222
+
223
+ Usage: yttm bpe [OPTIONS]
224
+
225
+ Train BPE model.
226
+
227
+ Options:
228
+ --data PATH Training data file path. [required]
229
+ --model PATH Output model file path. [required]
230
+ --vocab_size INTEGER Number of tokens in the final vocabulary. [required]
231
+ --coverage FLOAT Fraction of characters covered by the model. [default: 1.0]
232
+ --n_threads INTEGER Number of threads. [default: -1]
233
+ --pad_id INTEGER Padding token id. [default: 0]
234
+ --unk_id INTEGER Unknown token id. [default: 1]
235
+ --bos_id INTEGER 'Begin of sentence' token id. [default: 2]
236
+ --eos_id INTEGER 'End of sentence' token id. [default: 3]
237
+ --help Show this message and exit.
238
+ ```
239
+
240
+
241
+ Apply BPE encoding for a corpus of sentences. Use `stdin` for input and `stdout` for output.
242
+
243
+ By default, encoding works in parallel using `n_threads` threads. The number of threads is limited to
244
+ 8 (see [benchmark](benchmark.md#number-of-threads)).
245
+
246
+ With the `--stream` option, `--n_threads` will be ignored and all sentences will be processed one by one.
247
+ Each sentence will be tokenized and written to the `stdout` before the next sentence is read.
248
+
249
+
250
+ ```
251
+ $ yttm encode --help
252
+
253
+ Usage: yttm encode [OPTIONS]
254
+
255
+ Encode text to ids or subwords.
256
+
257
+ Options:
258
+ --model PATH Path to file with learned model. [required]
259
+ --output_type TEXT 'id' or 'subword'. [required]
260
+ --n_threads INTEGER Number of threads. [default: -1]
261
+ --bos Add tab 'begin of sentence'.
262
+ --eos Add tab 'end of sentence'.
263
+ --reverse Reverse output sequence of tokens.
264
+ --stream Process each line before reading the next one.
265
+ --dropout_prob BPE-dropout probability (the probability of a merge being dropped). [default: 0]
266
+ --help Show this message and exit.
267
+ ```
268
+
269
+ Print vocabulary. This can be useful for understanding the model.
270
+
271
+ ```
272
+ $ yttm vocab --help
273
+
274
+ Usage: yttm vocab [OPTIONS]
275
+
276
+ Print list of learned subwords.
277
+
278
+ Options:
279
+ --model PATH Path to file with learned model. [required]
280
+ --verbose Add merging rules.
281
+ --help Show this message and exit.
282
+ ```
283
+
284
+ Convert ids back to text. Use `stdin` for input and `stdout` for output.
285
+
286
+ ```
287
+ $ yttm decode --help
288
+
289
+ Usage: yttm decode [OPTIONS]
290
+
291
+ Decode ids to text.
292
+
293
+ Options:
294
+ --model PATH Path to file with learned model. [required]
295
+ --ignore_ids List of indices to ignore for decoding. Example: --ignore_ids=1,2,3
296
+ --help Show this message and exit.
297
+ ```
298
+
299
+
300
+
301
+
302
+
303
+
304
+