robo-lib 0.0.4__tar.gz → 0.0.6__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- robo_lib-0.0.6/PKG-INFO +243 -0
- robo_lib-0.0.6/README.md +226 -0
- {robo_lib-0.0.4 → robo_lib-0.0.6}/pyproject.toml +2 -2
- {robo_lib-0.0.4 → robo_lib-0.0.6}/robo_lib/components.py +12 -6
- robo_lib-0.0.4/PKG-INFO +0 -18
- robo_lib-0.0.4/README.md +0 -1
- {robo_lib-0.0.4 → robo_lib-0.0.6}/LICENSE +0 -0
- {robo_lib-0.0.4 → robo_lib-0.0.6}/robo_lib/__init__.py +0 -0
- {robo_lib-0.0.4 → robo_lib-0.0.6}/tests/__init__.py +0 -0
robo_lib-0.0.6/PKG-INFO
ADDED
@@ -0,0 +1,243 @@
Metadata-Version: 2.3
Name: robo_lib
Version: 0.0.6
Summary: A package to create, configure, and train transformer models.
Project-URL: Homepage, https://github.com/hamburgerfish/robo_pack
Project-URL: Issues, https://github.com/hamburgerfish/robo_pack/issues
Author-email: Erik Papp <erik3papp@gmail.com>
License-File: LICENSE
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Classifier: Programming Language :: Python :: 3
Requires-Python: >=3.8
Requires-Dist: numpy
Requires-Dist: tokenizers
Requires-Dist: torch
Description-Content-Type: text/markdown

# robo-lib

Provides tools for creating, configuring, and training custom transformer models on any data available to you.

## Main features
- Customize and train tokenizers using an implementation of the features from the [tokenizers](https://pypi.org/project/tokenizers/#description) library.
- Customize a data processor that turns raw data into individual tensors, ready to be used to train transformers without further processing.
- Configure transformer models to fit specific requirements without having to write the internal logic.
- Use the three components together to create, train, and use custom transformers in different applications.

## Installation

```bash
pip install robo-lib
```

## Using robo-lib

Documentation can be found [here](https://github.com/hamburgerfish/robo_pack/wiki).

### Language translation example
- In this example, an encoder-decoder transformer is created for English-to-French translation.
- Training uses two .txt files: one with an English sentence on each line and the other with the equivalent French sentence on each line (delimited by "\n").
- Create, train, and save tokenizers using `TokenizerConstructor`.
- In this example, the WordLevel tokenizer is used, along with the default arguments of `TokenizerConstructor`.

```python
import robo_lib as rl

encoder_tok = rl.TokenizerConstructor(tokenizer_type="WordLevel")
encoder_tok.train("english_data.txt")

decoder_tok = rl.TokenizerConstructor(tokenizer_type="WordLevel")
decoder_tok.train("french_data.txt")

rl.save_component(encoder_tok, "tokenizers/encoder_tok.pkl")
rl.save_component(decoder_tok, "tokenizers/decoder_tok.pkl")
```

- The `DataProcessor` can be used to automatically process the data into a single torch.tensor that the transformer can use directly for training.
- The tokenizer(s) must be specified when initialising a DataProcessor. In this case both dec_tokenizer and enc_tokenizer are specified for an encoder-decoder transformer.
- The `process_list` method processes lists of string data, so our .txt files are read into lists to be processed by `process_list`.
- In this example, the data is split 90% / 10% between training and validation.

```python
proc = rl.DataProcessor(dec_tokenizer=decoder_tok, enc_tokenizer=encoder_tok)

# read training .txt files into lists
with open("english_data.txt", "r") as file:
    english_list = file.read().split("\n")

with open("french_data.txt", "r") as file:
    french_list = file.read().split("\n")

# split lists into train and validation sets
split = 0.9
n = int(len(english_list) * split)
english_train = english_list[:n]
french_train = french_list[:n]
english_val = english_list[n:]
french_val = french_list[n:]

# process and save training data as data/training*.pt
# block_size_exceeded_policy="skip" removes training samples longer than the specified block size
proc.process_list(
    save_path="data/training",
    dec_data=french_train,
    dec_max_block_size=100,
    dec_block_size_exceeded_policy="skip",
    enc_data=english_train,
    enc_max_block_size=100,
    enc_block_size_exceeded_policy="skip"
)

# process and save validation data as data/validation*.pt
proc.process_list(
    save_path="data/validation",
    dec_data=french_val,
    dec_max_block_size=100,
    dec_block_size_exceeded_policy="skip",
    enc_data=english_val,
    enc_max_block_size=100,
    enc_block_size_exceeded_policy="skip"
)
```

- The `RoboConstructor` class is used to create and configure transformer models before training.
- A separate .py file is recommended for training.
- If device is not specified, `RoboConstructor` will take the first available one out of ("cuda", "mps", "cpu"). The CUDA build of torch is not installed as a robo-lib dependency, so it is highly recommended to install it yourself, using this [link](https://pytorch.org/get-started/locally/), if you have a CUDA-compatible device; a generic sketch of this fallback is shown below.
- The `train` method is used to train the transformer and save it to `save_path` every `eval_interval` iterations.
- If a non-`TokenizerConstructor` tokenizer is used, the pad token of your tokenizer can be specified instead of the dec_tokenizer parameter (see the sketch after the training example below).
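
For reference, the device fallback described above corresponds roughly to the standard torch pattern; this is a generic sketch, not robo-lib's exact internal code:

```python
import torch

# Generic device-fallback sketch (assumed behaviour, not robo-lib's implementation):
# prefer CUDA, then Apple's MPS backend, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
```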
```python
import robo_lib as rl

encoder_tok = rl.load_component("tokenizers/encoder_tok.pkl")
decoder_tok = rl.load_component("tokenizers/decoder_tok.pkl")

robo = rl.RoboConstructor(
    n_embed=512,
    dec_n_blocks=6,
    dec_n_head=8,
    dec_vocab_size=decoder_tok.vocab_size,
    dec_block_size=100,
    enc_n_blocks=6,
    enc_n_head=8,
    enc_vocab_size=encoder_tok.vocab_size,
    enc_block_size=100
)

robo.train(
    max_iters=20000,
    eval_interval=200,
    batch_size=128,
    dec_training_path="data/training_decoder_data.pt",
    dec_eval_path="data/validation_decoder_data.pt",
    dec_training_masks_path="data/training_decoder_mask_data.pt",
    dec_eval_masks_path="data/validation_decoder_mask_data.pt",
    enc_training_path="data/training_encoder_data.pt",
    enc_eval_path="data/validation_encoder_data.pt",
    enc_training_masks_path="data/training_encoder_mask_data.pt",
    enc_eval_masks_path="data/validation_encoder_mask_data.pt",
    dec_tokenizer=decoder_tok,
    save_path="models/eng_to_fr_robo.pkl"
)
```
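
As noted in the list above, when the decoder-side tokenizer is not a `TokenizerConstructor`, its pad token id can be passed to `train` directly in place of `dec_tokenizer`. A minimal sketch, assuming a hypothetical tokenizer whose pad id is 0 (the id is a placeholder, not a robo-lib default):

```python
# Sketch: pass the pad token id directly instead of a TokenizerConstructor.
# pad_token=0 is a placeholder; use whatever id your tokenizer reserves for padding.
robo.train(
    max_iters=20000,
    eval_interval=200,
    batch_size=128,
    dec_training_path="data/training_decoder_data.pt",
    dec_eval_path="data/validation_decoder_data.pt",
    # encoder and mask paths as in the example above
    pad_token=0,
    save_path="models/eng_to_fr_robo.pkl"
)
```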

- For language translation, a loss of around 3 already shows good results.
- To use the trained transformer, the `generate` method can be employed.
- The temperature, top_k, and top_p values can be specified for this method, along with the tokenizers used (see the sketch after the script below).
- If a non-`TokenizerConstructor` tokenizer is used, the start, end, separator (decoder-only), and new-line tokens of your tokenizer can be specified.
- In this example, a simple command-line script is created: the user's English input is translated by the transformer and printed to the console in French.

```python
import robo_lib as rl

robo = rl.load_component("models/eng_to_fr_robo.pkl")
encoder_tok = rl.load_component("tokenizers/encoder_tok.pkl")
decoder_tok = rl.load_component("tokenizers/decoder_tok.pkl")

while True:
    query = input()
    print(robo.generate(query, dec_tokenizer=decoder_tok, enc_tokenizer=encoder_tok))
```
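
The sampling behaviour of `generate` can be adjusted in the same call; a sketch with illustrative values (the numbers are placeholders, not recommendations):

```python
# Sketch: the same translation call with sampling parameters set explicitly.
# The temperature, top_k, and top_p values here are illustrative placeholders.
print(robo.generate(
    query,
    dec_tokenizer=decoder_tok,
    enc_tokenizer=encoder_tok,
    temperature=0.8,
    top_k=50,
    top_p=0.9
))
```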

### Shakespeare dialogue generator example
- In this example, a decoder-only transformer is created and trained on a file containing all the dialogue written by William Shakespeare in his plays.
- The training data is a single .txt file containing the dialogue.
- The default BPE tokenizer is used in this case, so no argument is specified for `TokenizerConstructor`.

```python
import robo_lib as rl

tok = rl.TokenizerConstructor()
tok.train("shakespeare_dialogues.txt")

rl.save_component(tok, "tokenizers/shakespeare_tok.pkl")
```

- In this example, instead of having multiple pieces of training data, we have one large text file, from which random chunks of length `block_size` can be used for training. Therefore, a single large string is passed to the DataProcessor instead of a list of strings.
- Since this is a decoder-only transformer, encoder arguments are not given.
- Since the entire string should be processed as-is rather than split into blocks of training data, a max block size is not specified.
- dec_create_masks is set to False, as there will be no padding in the training data.

```python
proc = rl.DataProcessor(dec_tokenizer=tok)

# read training .txt file
with open("shakespeare_dialogues.txt", "r") as file:
    dialogues_str = file.read()

# split string into train and validation sets
split = 0.9
n = int(len(dialogues_str) * split)
train_data = dialogues_str[:n]
val_data = dialogues_str[n:]

# process and save training data as data/shakespeare_train*.pt
proc.process_list(
    save_path="data/shakespeare_train",
    dec_data=train_data,
    dec_create_masks=False
)

# process and save validation data as data/shakespeare_valid*.pt
proc.process_list(
    save_path="data/shakespeare_valid",
    dec_data=val_data,
    dec_create_masks=False
)
```

- Training the transformer:

```python
import robo_lib as rl

tok = rl.load_component("tokenizers/shakespeare_tok.pkl")

robo = rl.RoboConstructor(
    n_embed=1024,
    dec_n_blocks=8,
    dec_n_head=8,
    dec_vocab_size=tok.vocab_size,
    dec_block_size=200
)

robo.train(
    max_iters=20000,
    eval_interval=200,
    batch_size=64,
    dec_training_path="data/shakespeare_train_decoder_data.pt",
    dec_eval_path="data/shakespeare_valid_decoder_data.pt",
    dec_tokenizer=tok,
    save_path="models/shakespeare_robo.pkl"
)
```

- In this example, the user can specify the start of the generated Shakespeare play and the transformer will generate and print the rest, until `max_new_tokens` (1000) tokens are generated.
- Temperature and top_k are set to 1.2 and 2 respectively to generate a more "creative" output.

```python
import robo_lib as rl

robo = rl.load_component("models/shakespeare_robo.pkl")
tok = rl.load_component("tokenizers/shakespeare_tok.pkl")

while True:
    start = input()
    print(robo.generate(start, max_new_tokens=1000, dec_tokenizer=tok, temperature=1.2, top_k=2))
```
robo_lib-0.0.6/README.md
ADDED
@@ -0,0 +1,226 @@
(Contents identical to the README portion of robo_lib-0.0.6/PKG-INFO above.)
{robo_lib-0.0.4 → robo_lib-0.0.6}/pyproject.toml
@@ -4,11 +4,11 @@ build-backend = "hatchling.build"
 
 [project]
 name = "robo_lib"
-version = "0.0.4"
+version = "0.0.6"
 authors = [
   { name="Erik Papp", email="erik3papp@gmail.com" },
 ]
-description = "A package to configure, create and train transformer models."
+description = "A package to create, configure, and train transformer models."
 readme = "README.md"
 requires-python = ">=3.8"
 dependencies = ["torch", "tokenizers", "numpy"]
{robo_lib-0.0.4 → robo_lib-0.0.6}/robo_lib/components.py
@@ -202,8 +202,8 @@ class DataProcessor:
                  dec_create_masks:bool=True,
                  dec_block_size_exceeded_policy:str=None,
                  enc_data:list[str]=None,
-                 enc_create_masks=True,
                  enc_max_block_size:int=None,
+                 enc_create_masks:bool=True,
                  enc_block_size_exceeded_policy:str=None
                  ) -> None:
@@ -646,7 +646,14 @@ class RoboConstructor(nn.Module):
             self.encoder_blocks = MySequential(*[EncoderBlock(n_embed, enc_n_head, enc_expansion_factor, dropout=dropout) for _ in range(enc_n_blocks)])
         else:
             self.cross_attention = False
+            self.enc_n_blocks = None
+            self.enc_n_head = None
+            self.enc_expansion_factor = None
+            self.enc_vocab_size = None
             self.enc_block_size = None
+            self.enc_token_embedding_table = None
+            self.enc_positional_embedding_table = None
+            self.encoder_blocks = None
 
         self.decoder_blocks = MySequential(*[DecoderBlock(n_embed, dec_n_head, dec_expansion_factor, cross_attention=self.cross_attention, block_size=self.dec_block_size, dropout=dropout) for _ in range(dec_n_blocks)])
         self.ln = nn.LayerNorm(n_embed)
@@ -734,7 +741,7 @@ class RoboConstructor(nn.Module):
               eval_iters:int=3,
               learning_rate:float=1e-4,
               pad_token:int=None,
-
+              dec_tokenizer:TokenizerConstructor=None,
               save_path:str=None,
               label_smoothing:float=0.1
               ) -> None:
@@ -748,8 +755,8 @@ class RoboConstructor(nn.Module):
         enc_training_masks_data = torch.load(enc_training_masks_path, weights_only=True) if enc_training_masks_path != None else None
         enc_eval_masks_data = torch.load(enc_eval_masks_path, weights_only=True) if enc_eval_masks_path != None else None
 
-        if pad_token == None and
-            pad_token =
+        if pad_token == None and dec_tokenizer != None:
+            pad_token = dec_tokenizer.pad_token
 
         self.to(self.device)
@@ -797,7 +804,6 @@ class RoboConstructor(nn.Module):
 
         self.eval()
 
-    # use dec and enc tokenizers
     def generate(self,
                  inputs:list[int]|str,
                  max_new_tokens:int=None,
@@ -805,8 +811,8 @@ class RoboConstructor(nn.Module):
                  enc_tokenizer:TokenizerConstructor=None,
                  dec_start_token:int=None,
                  enc_start_token:int=None,
-                 enc_end_token:int=None,
                  dec_end_token:int=None,
+                 enc_end_token:int=None,
                  separator_token:int=None,
                  new_line_token:int=None,
                  temperature:float=1,
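
In short, `train` gains a `dec_tokenizer` parameter and falls back to its pad token when `pad_token` is not given; the effective behaviour is roughly this sketch:

```python
# Sketch of the pad-token fallback added to RoboConstructor.train (per the hunk above):
# an explicitly passed pad_token wins; otherwise it is taken from the decoder tokenizer.
if pad_token is None and dec_tokenizer is not None:
    pad_token = dec_tokenizer.pad_token
```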
robo_lib-0.0.4/PKG-INFO
DELETED
@@ -1,18 +0,0 @@
-Metadata-Version: 2.3
-Name: robo_lib
-Version: 0.0.4
-Summary: A package to configure, create and train transformer models.
-Project-URL: Homepage, https://github.com/hamburgerfish/robo_pack
-Project-URL: Issues, https://github.com/hamburgerfish/robo_pack/issues
-Author-email: Erik Papp <erik3papp@gmail.com>
-License-File: LICENSE
-Classifier: License :: OSI Approved :: MIT License
-Classifier: Operating System :: OS Independent
-Classifier: Programming Language :: Python :: 3
-Requires-Python: >=3.8
-Requires-Dist: numpy
-Requires-Dist: tokenizers
-Requires-Dist: torch
-Description-Content-Type: text/markdown
-
-# robo_pack
robo_lib-0.0.4/README.md
DELETED
@@ -1 +0,0 @@
|
|
1
|
-
# robo_pack
|
{robo_lib-0.0.4 → robo_lib-0.0.6}/LICENSE: File without changes
{robo_lib-0.0.4 → robo_lib-0.0.6}/robo_lib/__init__.py: File without changes
{robo_lib-0.0.4 → robo_lib-0.0.6}/tests/__init__.py: File without changes