youtokentome 0.1.0

checksums.yaml ADDED
@@ -0,0 +1,7 @@
+ ---
+ SHA256:
+   metadata.gz: e55b4f8edc8306370a1e5f7138bbbb00912b29234e7169d4dd7233fee04934cb
+   data.tar.gz: dea649fc649c23a0955ed603867e7312a66d64f241fadad7fb057a9164bda285
+ SHA512:
+   metadata.gz: '080b09ffa1cb1721d321e7af92b980087e2bd77e74b76127e4f2131c1cb4a72895ea7ed121d4a2a79ded30989e862d40ab6e3a01c0489da130ef131f66f37b96'
+   data.tar.gz: 741c1c809801be24105a52be8884dd74e273b068c9c9d004dfbede6a9e050ecc662acf0578bc3b2ec8f20002c117003be2f5d3e79ee399408c466e5fe57d2af4
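These are the standard RubyGems artifact digests. As a quick local check, the published values can be compared against freshly computed ones with Ruby's `Digest` (the file names here are illustrative, not paths shipped in the gem):

```ruby
require "digest"

# Compute digests of a local artifact and compare to the published values
puts Digest::SHA256.file("metadata.gz").hexdigest
puts Digest::SHA512.file("metadata.gz").hexdigest
```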
data/CHANGELOG.md ADDED
@@ -0,0 +1,3 @@
+ ## 0.1.0 (unreleased)
+
+ - First release
data/LICENSE.txt ADDED
@@ -0,0 +1,22 @@
+ Copyright (c) 2020 Andrew Kane
+
+ MIT License
+
+ Permission is hereby granted, free of charge, to any person obtaining
+ a copy of this software and associated documentation files (the
+ "Software"), to deal in the Software without restriction, including
+ without limitation the rights to use, copy, modify, merge, publish,
+ distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to
+ the following conditions:
+
+ The above copyright notice and this permission notice shall be
+ included in all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
data/README.md ADDED
@@ -0,0 +1,104 @@
+ # YouTokenToMe
+
+ :fire: [YouTokenToMe](https://github.com/VKCOM/YouTokenToMe) - the high-performance unsupervised text tokenizer - for Ruby
+
+ ## Installation
+
+ Add this line to your application’s Gemfile:
+
+ ```ruby
+ gem 'youtokentome'
+ ```
+
+ ## Getting Started
+
+ Train a model
+
+ ```ruby
+ model = YouTokenToMe::BPE.train(data: "train.txt", model: "model.txt", vocab_size: 30000)
+ ```
+
+ Load a model
+
+ ```ruby
+ model = YouTokenToMe::BPE.new("model.txt")
+ ```
+
+ Get vocab
+
+ ```ruby
+ model.vocab
+ ```
+
+ Encode
+
+ ```ruby
+ model.encode(sentences)
+ ```
+
+ Decode
+
+ ```ruby
+ model.decode(ids)
+ ```
+
+ Convert between ids and subwords
+
+ ```ruby
+ model.subword_to_id(subword)
+ model.id_to_subword(id)
+ ```
+
+ ## Options
+
+ Train
+
+ ```ruby
+ YouTokenToMe::BPE.train(
+   data: "train.txt",  # path to file with training data
+   model: "model.txt", # path to where the trained model will be saved
+   vocab_size: 30000,  # number of tokens in the final vocabulary
+   coverage: 1.0,      # fraction of characters covered by the model
+   n_threads: -1,      # number of parallel threads used to run
+   pad_id: 0,          # reserved id for padding
+   unk_id: 1,          # reserved id for unknown symbols
+   bos_id: 2,          # reserved id for "beginning of sentence" token
+   eos_id: 3           # reserved id for "end of sentence" token
+ )
+ ```
+
+ Encode
+
+ ```ruby
+ model.encode(
+   sentences,
+   output_type: :id,  # or :subword
+   bos: false,        # add "beginning of sentence" token
+   eos: false,        # add "end of sentence" token
+   reverse: false,    # reverse output sequence of tokens
+   dropout_prob: 0.0  # BPE-dropout probability
+ )
+ ```
+
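Putting these together, a minimal end-to-end sketch (it assumes a local `train.txt` with one sentence per line; the vocab size is arbitrary):

```ruby
require "youtokentome"

# train returns a loaded model
model = YouTokenToMe::BPE.train(data: "train.txt", model: "model.txt", vocab_size: 500)

# encode a batch of sentences to ids, then restore the text
ids = model.encode(["Blazingly fast tokenization!"], output_type: :id)
puts model.decode(ids.first)  # decode returns an array with the decoded text

# the same batch as subwords
p model.encode(["Blazingly fast tokenization!"], output_type: :subword)
```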
+ ## History
+
+ View the [changelog](https://github.com/ankane/youtokentome/blob/master/CHANGELOG.md)
+
+ ## Contributing
+
+ Everyone is encouraged to help improve this project. Here are a few ways you can help:
+
+ - [Report bugs](https://github.com/ankane/youtokentome/issues)
+ - Fix bugs and [submit pull requests](https://github.com/ankane/youtokentome/pulls)
+ - Write, clarify, or fix documentation
+ - Suggest or add new features
+
+ To get started with development:
+
+ ```sh
+ git clone https://github.com/ankane/youtokentome.git
+ cd youtokentome
+ bundle install
+ bundle exec rake compile
+ bundle exec rake test
+ ```
@@ -0,0 +1,135 @@
+ // youtokentome
+ #include <bpe.h>
+ #include <utils.h>
+
+ // rice
+ #include <rice/Array.hpp>
+ #include <rice/Data_Type.hpp>
+ #include <rice/Object.hpp>
+
+ using Rice::define_class_under;
+ using Rice::define_module;
+ using Rice::define_module_under;
+ using Rice::Array;
+ using Rice::Module;
+ using Rice::Object;
+
+ void check_status(vkcom::Status& status) {
+   if (!status.ok()) {
+     throw std::invalid_argument(status.error_message());
+   }
+ }
+
+ template<>
+ Object to_ruby<std::vector<std::string>>(std::vector<std::string> const & x)
+ {
+   Array ret;
+   for (auto& v : x) {
+     ret.push(v);
+   }
+   return ret;
+ }
+
+ template<>
+ std::vector<int> from_ruby<std::vector<int>>(Object x)
+ {
+   std::vector<int> ret;
+   Array a = Array(x);
+   for (size_t i = 0; i < a.size(); i++) {
+     ret.push_back(from_ruby<int>(a[i]));
+   }
+   return ret;
+ }
+
+ template<>
+ std::vector<std::string> from_ruby<std::vector<std::string>>(Object x)
+ {
+   std::vector<std::string> ret;
+   Array a = Array(x);
+   for (size_t i = 0; i < a.size(); i++) {
+     ret.push_back(from_ruby<std::string>(a[i]));
+   }
+   return ret;
+ }
+
+ extern "C" void Init_ext() {
+   Module rb_mYouTokenToMe = define_module("YouTokenToMe");
+   Module rb_mExt = define_module_under(rb_mYouTokenToMe, "Ext")
+     .define_singleton_method(
+       "train_bpe",
+       *[](std::string &input_path, std::string &model_path, int vocab_size, double coverage,
+           int n_threads, int pad_id, int unk_id, int bos_id, int eos_id) {
+
+         vkcom::SpecialTokens special_tokens(pad_id, unk_id, bos_id, eos_id);
+         vkcom::BpeConfig config(coverage, n_threads, special_tokens);
+         auto status = vkcom::train_bpe(input_path, model_path, vocab_size, config);
+         check_status(status);
+       });
+
+   define_class_under<vkcom::BaseEncoder>(rb_mExt, "BaseEncoder")
+     .define_method("vocab_size", &vkcom::BaseEncoder::vocab_size)
+     .define_method("subword_to_id", &vkcom::BaseEncoder::subword_to_id)
+     .define_method(
+       "id_to_subword",
+       *[](vkcom::BaseEncoder& self, int id) {
+         std::string subword;
+         auto status = self.id_to_subword(id, &subword);
+         check_status(status);
+         return subword;
+       })
+     .define_method(
+       "decode",
+       *[](vkcom::BaseEncoder& self, std::vector<int> ids) {
+         std::string sentence;
+         const std::unordered_set<int> ignore_ids;
+         auto status = self.decode(ids, &sentence, &ignore_ids);
+         check_status(status);
+
+         Array ret;
+         ret.push(sentence);
+         return ret;
+       })
+     .define_method(
+       "encode_as_ids",
+       *[](vkcom::BaseEncoder& self, std::vector<std::string> sentences, bool bos, bool eos, bool reverse, double dropout_prob) {
+         std::vector<std::vector<int>> ids;
+         auto status = self.encode_as_ids(sentences, &ids, bos, eos, reverse, dropout_prob);
+         check_status(status);
+
+         Array ret;
+         for (auto& v : ids) {
+           Array r;
+           for (auto& v2 : v) {
+             r.push(v2);
+           }
+           ret.push(r);
+         }
+         return ret;
+       })
+     .define_method(
+       "encode_as_subwords",
+       *[](vkcom::BaseEncoder& self, std::vector<std::string> sentences, bool bos, bool eos, bool reverse, double dropout_prob) {
+         std::vector<std::vector<std::string>> subwords;
+         auto status = self.encode_as_subwords(sentences, &subwords, bos, eos, reverse, dropout_prob);
+         check_status(status);
+
+         Array ret;
+         for (auto& v : subwords) {
+           Array r;
+           for (auto& v2 : v) {
+             r.push(v2);
+           }
+           ret.push(r);
+         }
+         return ret;
+       })
+     .define_method("vocab", &vkcom::BaseEncoder::vocabulary)
+     .define_singleton_method(
+       "new",
+       *[](std::string &model_path, int n_threads) {
+         auto status = vkcom::Status();
+         vkcom::BaseEncoder encoder(model_path, n_threads, &status);
+         check_status(status);
+         return encoder;
+       });
+ }
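For orientation, the bindings above surface as a low-level `YouTokenToMe::Ext` module whose positional arguments mirror the lambda signatures; the `BPE` class in `lib` wraps it. A rough sketch of calling it directly (paths are illustrative):

```ruby
require "youtokentome"

# train_bpe(input_path, model_path, vocab_size, coverage, n_threads, pad_id, unk_id, bos_id, eos_id)
YouTokenToMe::Ext.train_bpe("train.txt", "model.txt", 500, 1.0, -1, 0, 1, 2, 3)

# BaseEncoder.new(model_path, n_threads)
encoder = YouTokenToMe::Ext::BaseEncoder.new("model.txt", -1)

# encode_as_ids(sentences, bos, eos, reverse, dropout_prob)
p encoder.encode_as_ids(["Blazingly fast tokenization!"], false, false, false, 0.0)
```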
@@ -0,0 +1,12 @@
+ require "mkmf-rice"
+
+ $CXXFLAGS << " -std=c++11"
+
+ ext = File.expand_path(".", __dir__)
+ youtokentome = File.expand_path("../../vendor/YouTokenToMe/youtokentome/cpp", __dir__)
+
+ # compile the binding together with the vendored YouTokenToMe sources
+ $srcs = Dir["{#{ext},#{youtokentome}}/*.{cc,cpp}"]
+ $INCFLAGS << " -I#{youtokentome}"
+ $VPATH << youtokentome
+
+ create_makefile("youtokentome/ext")
@@ -0,0 +1,10 @@
+ # ext
+ require "youtokentome/ext"
+
+ # modules
+ require "youtokentome/version"
+ require "youtokentome/bpe"
+
+ module YouTokenToMe
+   class Error < StandardError; end
+ end
@@ -0,0 +1,54 @@
+ module YouTokenToMe
+   class BPE
+     def initialize(model, n_threads: -1)
+       @encoder = Ext::BaseEncoder.new(model, n_threads)
+     end
+
+     def vocab_size
+       @encoder.vocab_size
+     end
+
+     def vocab
+       vocab = @encoder.vocab
+       vocab.each do |v|
+         v.force_encoding(Encoding::UTF_8)
+       end
+       vocab
+     end
+
+     def subword_to_id(subword)
+       @encoder.subword_to_id(subword)
+     end
+
+     def id_to_subword(id)
+       @encoder.id_to_subword(id)
+     end
+
+     def encode(sentences, output_type: :id, bos: false, eos: false, reverse: false, dropout_prob: 0)
+       case output_type
+       when :id
+         @encoder.encode_as_ids(sentences, bos, eos, reverse, dropout_prob)
+       when :subword
+         subwords = @encoder.encode_as_subwords(sentences, bos, eos, reverse, dropout_prob)
+         subwords.each do |s|
+           s.each do |v|
+             v.force_encoding(Encoding::UTF_8)
+           end
+         end
+         subwords
+       else
+         raise ArgumentError, "Unknown output type"
+       end
+     end
+
+     # TODO add ignore_ids
+     def decode(ids)
+       @encoder.decode(ids)
+     end
+
+     def self.train(data:, model:, vocab_size:, coverage: 1.0, n_threads: -1, pad_id: 0, unk_id: 1, bos_id: 2, eos_id: 3)
+       Ext.train_bpe(data, model, vocab_size, coverage, n_threads, pad_id, unk_id, bos_id, eos_id)
+       new(model, n_threads: n_threads)
+     end
+   end
+ end
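A short sketch of the id/subword round trip this wrapper exposes (it assumes a previously trained `model.txt`; per the upstream docs, `subword_to_id` returns `unk_id` for unknown subwords):

```ruby
model = YouTokenToMe::BPE.new("model.txt")

model.vocab_size                  # => size of the learned vocabulary
id = model.subword_to_id("▁fast") # unk_id if "▁fast" was not learned
model.id_to_subword(id)           # back to the subword string
```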
Binary file
@@ -0,0 +1,3 @@
+ module YouTokenToMe
+   VERSION = "0.1.0"
+ end
@@ -0,0 +1,19 @@
+ Copyright (c) 2019 VK.com
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
@@ -0,0 +1,304 @@
+ ![PyPI](https://img.shields.io/pypi/v/youtokentome.svg)
+ [![Downloads](https://pepy.tech/badge/youtokentome)](https://pepy.tech/project/youtokentome)
+ [![Code style: black](https://img.shields.io/badge/code%20style-black-000000.svg)](https://github.com/python/black)
+ ![GitHub](https://img.shields.io/github/license/vkcom/youtokentome.svg)
+ [![Build Status](https://travis-ci.org/VKCOM/YouTokenToMe.svg?branch=master)](https://travis-ci.org/VKCOM/YouTokenToMe)
+
+ # YouTokenToMe
+
+ YouTokenToMe is an unsupervised text tokenizer focused on computational efficiency. It currently implements fast Byte Pair Encoding (BPE) [[Sennrich et al.](https://www.aclweb.org/anthology/P16-1162)].
+ Our implementation is much faster in training and tokenization than [Hugging Face](https://github.com/huggingface/tokenizers), [fastBPE](https://github.com/glample/fastBPE)
+ and [SentencePiece](https://github.com/google/sentencepiece). In some test cases, it is 90 times faster.
+ Check out our [benchmark](benchmark.md) results.
+
+ Key advantages:
+
+ * Multithreading for training and tokenization
+ * The algorithm has `O(N)` complexity, where `N` is the length of training data
+ * Highly efficient implementation in C++
+ * Python wrapper and command-line interface
+
+ Extra features:
+ * BPE-dropout (as described in [Provilkov et al., 2019](https://arxiv.org/abs/1910.13267))
+
+ As in the algorithm from the original paper, ours does not consider tokens
+ that cross word boundaries. Just like in [SentencePiece](https://github.com/google/sentencepiece), all space symbols are replaced by the meta symbol "▁" (U+2581). This allows sequences of tokens to be converted back to text with word boundaries restored.
+
+ For example, the phrase ```Blazingly fast tokenization!``` can be tokenized into
+
+ `['▁Bl', 'az', 'ingly', '▁fast', '▁token', 'ization', '!']`
+
+ ## Installation
+
+ ```bash
+ pip install youtokentome
+ ```
+ ## Python interface
+
+ ### Example
+ Let's start with a self-contained example.
+
+ ```python
+ import random
+
+ import youtokentome as yttm
+
+ train_data_path = "train_data.txt"
+ model_path = "example.model"
+
+ # Generating a random file with training data
+ # 10000 lines with 100 characters in each line
+ n_lines = 10000
+ n_characters = 100
+ with open(train_data_path, "w") as fout:
+     for _ in range(n_lines):
+         print("".join([random.choice("abcd ") for _ in range(n_characters)]), file=fout)
+
+ # Generating random text
+ test_text = "".join([random.choice("abcde ") for _ in range(100)])
+
+ # Training model
+ yttm.BPE.train(data=train_data_path, vocab_size=5000, model=model_path)
+
+ # Loading model
+ bpe = yttm.BPE(model=model_path)
+
+ # Two types of tokenization
+ print(bpe.encode([test_text], output_type=yttm.OutputType.ID))
+ print(bpe.encode([test_text], output_type=yttm.OutputType.SUBWORD))
+ ```
+
+ &nbsp;
+ ### Training model
+ ```python
+ youtokentome.BPE.train(data, model, vocab_size, coverage, n_threads=-1, pad_id=0, unk_id=1, bos_id=2, eos_id=3)
+ ```
+ Trains a BPE model and saves it to file.
+
+ **Args:**
+
+ * `data`: string, path to file with training data
+ * `model`: string, path to where the trained model will be saved
+ * `vocab_size`: int, number of tokens in the final vocabulary
+ * `coverage`: float, fraction of characters covered by the model. Must be in the range [0, 1]. A good value to use is about 0.9999.
+ * `n_threads`: int, number of parallel threads used to run. If -1 is passed, then all available threads are going to be used. Note that the number of threads is limited to 8 (see [benchmark](benchmark.md#number-of-threads)).
+ * `pad_id`: int, reserved id for padding
+ * `unk_id`: int, reserved id for unknown symbols
+ * `bos_id`: int, reserved id for the "beginning of sentence" token
+ * `eos_id`: int, reserved id for the "end of sentence" token
+
+ **Returns**: Class `youtokentome.BPE` with the loaded model.
+
+
+ &nbsp;
+
+ ### Model loading
+
+ ```python
+ youtokentome.BPE(model, n_threads=-1)
+ ```
+
+ Class constructor. Loads the trained model.
+
+ * `model`: string, path to the trained model
+ * `n_threads`: int, number of parallel threads used to run.
+   If equal to -1, then the maximum number of threads available will be used.
+
+ &nbsp;
+
+ ### Methods
+ Class `youtokentome.BPE` has the following methods:
+ #### encode
+ ```python
+ encode(self, sentences, output_type=yttm.OutputType.ID, bos=False, eos=False, reverse=False, dropout_prob=0)
+ ```
+
+ **Args:**
+
+ * `sentences`: list of strings, sentences for tokenization.
+ * `output_type`: enum, a sentence can be tokenized to ids or subwords. Use `OutputType.ID` for ids and `OutputType.SUBWORD` for subwords.
+ * `bos`: bool, if True then the "beginning of sentence" token will be added
+ * `eos`: bool, if True then the "end of sentence" token will be added
+ * `reverse`: bool, if True the output sequence of tokens will be reversed
+ * `dropout_prob`: float, BPE-dropout probability (the probability of a merge being dropped). Must be in the range [0, 1].
+
+
+ **Returns:** If `output_type` is equal to `youtokentome.OutputType.ID` or `youtokentome.OutputType.SUBWORD`,
+ then a list of lists of integers or a list of lists of strings will be returned,
+ respectively.
+
+ &nbsp;
+ #### vocab
+
+ ```python
+ vocab(self)
+ ```
+
+ **Returns:** A list of `vocab_size` strings. The i-th string in the list corresponds
+ to the i-th subword.
+
+ &nbsp;
+ #### vocab_size
+
+ ```python
+ vocab_size(self)
+ ```
+
+ **Returns:** int. Size of vocabulary.
+
+ &nbsp;
+ #### subword_to_id
+
+ ```python
+ subword_to_id(self, subword)
+ ```
+ **Args:**
+ * `subword`: string.
+
+ **Returns:**
+ An integer in the range [0, vocab_size-1]: the id of the subword, or
+ `unk_id` if the subword is not present in the vocabulary.
+
+ &nbsp;
+ #### id_to_subword
+
+ ```python
+ id_to_subword(self, id)
+ ```
+ **Args:**
+ * `id`: int, must be in the range [0, vocab_size-1]
+
+ **Returns:** string. The subword from the vocabulary with this id.
+
+ &nbsp;
+ #### decode
+ ```python
+ decode(self, ids, ignore_ids=None)
+ ```
+ Converts each id back to its subword and concatenates the subwords, restoring spaces from the meta symbol.
+
+ **Args:**
+
+ * `ids`: list of lists of integers. All integers must be in the range [0, vocab_size-1]
+ * `ignore_ids`: collection of integers. These ids will be ignored during decoding. All integers must be in the range [0, vocab_size-1] [default: None]
+
+
+ **Returns:** List of strings.
+
+ ## Command line interface
+
+ ### Example
+
+ ```bash
+ $ yttm bpe --data TRAINING_DATA_FILE --model OUTPUT_MODEL_FILE --vocab_size 2000
+ $ yttm encode --model OUTPUT_MODEL_FILE --output_type subword < TEST_DATA_FILE > ENCODED_DATA
+ ```
+
+
+ ### Supported commands
+
+ `YouTokenToMe` supports the following commands:
+
+ ```
+ $ yttm --help
+
+ Usage: yttm [OPTIONS] COMMAND [ARGS]...
+
+ Options:
+   --help  Show this message and exit.
+
+ Commands:
+   bpe     Train BPE model.
+   decode  Decode ids to text.
+   encode  Encode text to ids or subwords.
+   vocab   Print list of learned subwords.
+ ```
+
+ Command `bpe` allows you to train a Byte Pair Encoding model based on a text file.
+
+ ```
+ $ yttm bpe --help
+
+ Usage: yttm bpe [OPTIONS]
+
+   Train BPE model.
+
+ Options:
+   --data PATH           Training data file path.  [required]
+   --model PATH          Output model file path.  [required]
+   --vocab_size INTEGER  Number of tokens in the final vocabulary.  [required]
+   --coverage FLOAT      Fraction of characters covered by the model.  [default: 1.0]
+   --n_threads INTEGER   Number of threads.  [default: -1]
+   --pad_id INTEGER      Padding token id.  [default: 0]
+   --unk_id INTEGER      Unknown token id.  [default: 1]
+   --bos_id INTEGER      'Begin of sentence' token id.  [default: 2]
+   --eos_id INTEGER      'End of sentence' token id.  [default: 3]
+   --help                Show this message and exit.
+ ```
+
+
+ Apply BPE encoding to a corpus of sentences. Use `stdin` for input and `stdout` for output.
+
+ By default, encoding works in parallel using `n_threads` threads. The number of threads is limited to
+ 8 (see [benchmark](benchmark.md#number-of-threads)).
+
+ With the `--stream` option, `--n_threads` will be ignored and all sentences will be processed one by one.
+ Each sentence will be tokenized and written to `stdout` before the next sentence is read.
+
+
+ ```
+ $ yttm encode --help
+
+ Usage: yttm encode [OPTIONS]
+
+   Encode text to ids or subwords.
+
+ Options:
+   --model PATH         Path to file with learned model.  [required]
+   --output_type TEXT   'id' or 'subword'.  [required]
+   --n_threads INTEGER  Number of threads.  [default: -1]
+   --bos                Add the 'begin of sentence' token.
+   --eos                Add the 'end of sentence' token.
+   --reverse            Reverse output sequence of tokens.
+   --stream             Process each line before reading the next one.
+   --dropout_prob       BPE-dropout probability (the probability of a merge being dropped).  [default: 0]
+   --help               Show this message and exit.
+ ```
+
+ Print the vocabulary. This can be useful for understanding the model.
+
+ ```
+ $ yttm vocab --help
+
+ Usage: yttm vocab [OPTIONS]
+
+   Print list of learned subwords.
+
+ Options:
+   --model PATH  Path to file with learned model.  [required]
+   --verbose     Add merging rules.
+   --help        Show this message and exit.
+ ```
+
+ Convert ids back to text. Use `stdin` for input and `stdout` for output.
+
+ ```
+ $ yttm decode --help
+
+ Usage: yttm decode [OPTIONS]
+
+   Decode ids to text.
+
+ Options:
+   --model PATH  Path to file with learned model.  [required]
+   --ignore_ids  List of indices to ignore for decoding. Example: --ignore_ids=1,2,3
+   --help        Show this message and exit.
+ ```