robo-lib 1.0.0-py3-none-any.whl → 1.0.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- robo_lib/components.py +59 -88
- {robo_lib-1.0.0.dist-info → robo_lib-1.0.1.dist-info}/METADATA +8 -17
- robo_lib-1.0.1.dist-info/RECORD +6 -0
- robo_lib-1.0.0.dist-info/RECORD +0 -6
- {robo_lib-1.0.0.dist-info → robo_lib-1.0.1.dist-info}/WHEEL +0 -0
- {robo_lib-1.0.0.dist-info → robo_lib-1.0.1.dist-info}/licenses/LICENSE +0 -0
robo_lib/components.py
CHANGED
@@ -8,29 +8,24 @@ import pickle
 import itertools
 from pathlib import Path
 import os
+from typing import List, Literal
+
+pre_tokenizers = Literal["Whitespace", "IndividualDigit", "Digits", "BertPreTokenizer", "ByteLevel", "Metaspace", "Punctuation", "UnicodeScripts", "WhitespaceSplit"]
 
 class TokenizerConstructor:
     '''
-
     simple assembler for tokenizer using the tokenizers library
-    tokenizer parameters can be set using strings and list[
+    tokenizer parameters can be set using strings and list[str]s
     strings used for tokenizer_type, pre_tokenizers, normalizers arguments are the names of those present in the
     tokenizers library. Additionally "IndividualDigits" can be used in normalizers for tokenizers.pre_tokenizers.Digits(individual_digits=True)
 
-    train([paths]) function points to text files to be used for training the tokenizer instance
-
-    encode(string) function encodes string using trained tokenizer instance
-
-    decode(list[int]) function decodes list of tokenz using trained tokenizer instance
-
     vocab_size attribute returns the tokenizer instance's vocab_size (untrained tokenizer will have vocab_size=None)
 
-
     '''
     def __init__(self,
         min_frequency:int=2,
-        tokenizer_type:
-        pre_tokenizers:
+        tokenizer_type:Literal["BPE", "WordLevel", "WordPiece", "Unigram"] = "BPE",
+        pre_tokenizers: pre_tokenizers|List[pre_tokenizers]=["Whitespace"],
         normalizers:list[str]|str=["Lowercase", "NFD", "StripAccents", "Strip"],
         vocab:dict[str,int] = {},
         special_tokens:list[str]|str=[],
@@ -50,6 +45,7 @@ class TokenizerConstructor:
         self.start_token = self.special_tokens.index(start_token_string) if start_token_string is not None else None
         self.end_token = self.special_tokens.index(end_token_string) if end_token_string is not None else None
         self.pad_token = self.special_tokens.index(pad_token_string) if pad_token_string is not None else None
+        self.pad_token_string = pad_token_string
         self.new_line_token = self.special_tokens.index(new_line_token_string) if new_line_token_string is not None else None
 
         if tokenizer_type == "BPE":
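The two hunks above tighten `TokenizerConstructor.__init__`: `tokenizer_type` and `pre_tokenizers` are now `Literal`-typed, and the pad token string is kept on the instance for the padding calls added further down. A minimal usage sketch based on these signatures; the `special_tokens` / `*_token_string` keyword names are read from the surrounding `__init__` lines and the token strings are placeholders:

```python
from robo_lib.components import TokenizerConstructor

# tokenizer_type and pre_tokenizers must now match the Literal options declared above.
tok = TokenizerConstructor(
    min_frequency=2,
    tokenizer_type="BPE",
    pre_tokenizers=["Whitespace"],
    normalizers=["Lowercase", "NFD", "StripAccents", "Strip"],
    special_tokens=["<pad>", "<sos>", "<eos>"],  # placeholder token strings
    pad_token_string="<pad>",
    start_token_string="<sos>",
    end_token_string="<eos>",
)
```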
@@ -120,40 +116,50 @@ class TokenizerConstructor:
 
 
     def train(self, training_paths:list[str]|str) -> None:
+        '''
+        points to text files to be used for training the tokenizer instance
+        '''
         if isinstance(training_paths, str):
             training_paths = [training_paths]
         self.tokenizer_type.train(training_paths, trainer=self.trainer)
         self.vocab_size = self.tokenizer_type.get_vocab_size()
 
     def encode(self, inp:str) -> list[int]:
+        '''
+        encodes string using trained tokenizer instance
+        '''
         return self.tokenizer_type.encode(inp).ids
 
     def encode_batch(self, inp:list[str], max_length:int=None) -> list[list[int]]:
+        '''
+        encodes strings in parallel and truncates entries with length > max_length
+        '''
         if max_length is not None:
             self.tokenizer_type.enable_truncation(max_length=max_length)
+            self.tokenizer_type.enable_padding(pad_id=self.pad_token, pad_token=self.pad_token_string, length=max_length)
         out = [row.ids for row in self.tokenizer_type.encode_batch(inp)]
         self.tokenizer_type.no_truncation()
+        self.tokenizer_type.enable_padding(pad_id=self.pad_token, pad_token=self.pad_token_string)
         return out
 
     def decode(self, inp:list[int]) -> str:
+        '''
+        decodes list of tokenz using trained tokenizer instance
+        '''
         return self.tokenizer_type.decode(inp)
 
 
 
 def create_mask(row:list, block_size:int) -> list[bool]:
     '''
-
     creates a mask list of length block_size for row, asuming mask does cover the entire row input
-
     '''
     mask = [1]*len(row) + [0]*(block_size - len(row))
     return mask
 
 def pre_process_data(data:str, start_token_string:str, end_token_string:str) -> list[int]:
     '''
-
-    returns string row with the tokenizer's start and end tokens if they exist
-
+    returns data with the tokenizer's start and end tokens added to each row if they exist
     '''
     if start_token_string is None and end_token_string is None:
         return data
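`enable_truncation`, `enable_padding` and `no_truncation` are standard `tokenizers.Tokenizer` methods, so the two added lines make `encode_batch` return fixed-length rows whenever `max_length` is given (padding with the stored pad token) and fall back to pad-to-longest afterwards. A sketch of the resulting behaviour, assuming the tokenizer above was built with a pad token and trained on a text file (path hypothetical):

```python
tok.train("data/corpus.txt")                      # train() accepts one path or a list of paths

ids = tok.encode("hello world")                   # list[int]
text = tok.decode(ids)                            # back to a string

# With max_length set, long rows are truncated and short rows are padded,
# so every encoded row comes back with exactly 16 token ids.
batch = tok.encode_batch(["hello world", "a much longer example sentence"], max_length=16)
assert all(len(row) == 16 for row in batch)
```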
@@ -168,11 +174,8 @@ def pre_process_data(data:str, start_token_string:str, end_token_string:str) ->
 
 def safe_stack(tensor_list:list[torch.tensor]) -> torch.tensor:
     '''
-
     torch stack with check to ensure tensors are valid in input list
-
     returns torch.stack(out_list) for all valid torch tensors in tensor_list. raises error if no valid tensors
-
     '''
     out_list = [row for row in tensor_list if isinstance(row, torch.Tensor)]
     if len(out_list) == 0:
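`safe_stack` keeps only the `torch.Tensor` entries before stacking and raises if none remain, which is what lets the processing code drop skipped rows. Illustrative call:

```python
import torch
from robo_lib.components import safe_stack

rows = [torch.zeros(4), None, torch.ones(4)]   # e.g. a skipped row represented as None
stacked = safe_stack(rows)                     # shape (2, 4); the None entry is ignored
```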
@@ -182,21 +185,7 @@ def safe_stack(tensor_list:list[torch.tensor]) -> torch.tensor:
 
 class DataProcessor:
     '''
-
     data processor can be instantiated by specifying the tokenizer(s) for decoder and encoder data
-
-    process_list() function processes raw data in the form of list[str] or str for decoder and encoder simultaneously and
-    saves them to save_path as .pt files.
-    - encoder and decoder input data should have matching input and outputs so enc_data[n] should have its corresponding
-    decoder data at dec_data[n].
-    - max block size can be specified for both input and output, default takes the max
-    block size provided in the data respectively.
-    - if enc/dec_block_size is specified and enc/dec_block_size_exceeded_policy is not, an error will occur if a piece
-    of data larger than enc/dec_block_size is encountered. enc/dec_block_size_exceeded_policy can be set to "skip" or
-    "trim" to skip rows larger than enc/dec_block_size or truncate the row to specified enc/dec_block_size respectively.
-    - enc/dec_create_masks saves masks tensors to save_path as .pt files.
-
-
     '''
     def __init__(self,
         dec_tokenizer:TokenizerConstructor,
@@ -214,6 +203,14 @@ class DataProcessor:
         enc_create_masks:bool=True,
         save_path:str = "."
         ) -> None:
+        '''
+        processes raw data in the form of list[str] or str for decoder and encoder simultaneously and
+        saves them to save_path as .pt files.
+        - encoder and decoder input data should have matching input and outputs so enc_data[n] should have its corresponding
+        decoder data at dec_data[n].
+        - max block size can be specified for both input and output, default takes the max
+        block size provided in the data respectively. If data length > max_length, the data is trimmed.
+        '''
 
         if isinstance(dec_data, str):
             dec_data = [dec_data]
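With the `*_block_size_exceeded_policy` options removed, rows longer than the configured block size are now simply trimmed, as the relocated docstring says. A sketch mirroring the README example changed later in this diff; the `enc_tokenizer` keyword and the corpus paths are assumptions:

```python
from robo_lib.components import DataProcessor, TokenizerConstructor

english_train = ["hello world", "how are you"]
french_train = ["bonjour le monde", "comment ça va"]

decoder_tok = TokenizerConstructor(tokenizer_type="BPE")
decoder_tok.train("data/french.txt")            # hypothetical corpus paths
encoder_tok = TokenizerConstructor(tokenizer_type="BPE")
encoder_tok.train("data/english.txt")

proc = DataProcessor(dec_tokenizer=decoder_tok, enc_tokenizer=encoder_tok)
proc.process_list(
    save_path="data/training",                  # tensors are written as data/training*.pt
    dec_data=french_train,
    dec_max_block_size=100,                     # longer rows are trimmed, no policy argument needed
    enc_data=english_train,
    enc_max_block_size=100
)
```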
@@ -258,10 +255,8 @@ def get_valid_samples(random_samples:torch.Tensor,
         block_size:int
         ) -> list[int]:
     '''
-
     returns list of len(random_samples) with values corresponding to index values of masks that ensure minimum masked
     values when taking sample of length block_size
-
     '''
     valid_samples = [0 if sum(masks[row_num]) <= block_size else random.randint(0, sum(masks[row_num]) - block_size) for row_num in random_samples]
     return valid_samples
@@ -273,11 +268,9 @@ def get_batch(data:torch.Tensor,
         get_offset:bool=True
         ) -> tuple[torch.tensor]:
     '''
-
     returns random batches from data tensor using random sample for data selection.
     - returns corresponding batch offset by 1 unless get_offset=False
     - returns corresponding masks batch if masks data is specified
-
     '''
     batch_size = len(random_samples)
     if block_size is not None and block_size != data.shape[1]:
@@ -310,9 +303,6 @@ def top_kp_filter(logits: torch.Tensor,
         logits: (batch_size, vocab_size) tensor of raw logits.
         top_k: keep only top_k tokens with highest logits.
         top_p: keep the smallest set of tokens with cumulative probability >= top_p.
-
-    Returns:
-        selected: tensor of selected token indices (batch_size,)
     '''
     logits = logits.clone() # avoid modifying input logits in-place
 
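The `top_kp_filter` hunk only drops blank lines and the `Returns:` note; the function still keeps the `top_k` highest logits and the smallest set whose cumulative probability reaches `top_p`, then samples a token per row. The body is not shown in this diff, so the following is an independent sketch of that filtering idea, not robo_lib's implementation:

```python
import torch

def sample_top_kp(logits: torch.Tensor, top_k: int = 50, top_p: float = 0.9) -> torch.Tensor:
    """Illustration of combined top-k / top-p (nucleus) sampling."""
    probs = torch.softmax(logits, dim=-1)
    topk_probs, topk_idx = probs.topk(top_k, dim=-1)           # keep the k most likely tokens
    cumulative = topk_probs.cumsum(dim=-1)
    keep = (cumulative - topk_probs) < top_p                    # smallest set reaching top_p
    topk_probs = topk_probs * keep
    topk_probs = topk_probs / topk_probs.sum(dim=-1, keepdim=True)
    choice = torch.multinomial(topk_probs, num_samples=1)       # sample inside the filtered set
    return topk_idx.gather(-1, choice).squeeze(-1)              # (batch_size,) token indices

tokens = sample_top_kp(torch.randn(2, 1000))
```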
@@ -359,10 +349,8 @@ def top_kp_filter(logits: torch.Tensor,
 
 class SelfAttention(nn.Module):
     '''
-
     single self attention block of size head_size.
     triangle_mask=True to apply look-ahead mask of size block_size.
-
     '''
     def __init__(self,
         head_size:int,
@@ -390,10 +378,8 @@ class SelfAttention(nn.Module):
         mask:torch.Tensor=None
         ) -> torch.tensor:
         '''
-
         k, q and v are key, tensors to get key, query and value tensors.
         custom mask tensor can be applied.
-
         '''
         _,T,_ = k.shape
 
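These two hunks only strip blank docstring lines in `SelfAttention`. For reference, the "look-ahead mask of size block_size" the docstring mentions is the usual causal mask in scaled dot-product attention; the sketch below shows the general idea and is not the code inside `SelfAttention.forward`:

```python
import math
import torch
import torch.nn.functional as F

B, T, head_size = 2, 8, 16                                 # batch, sequence length, head size
q, k, v = (torch.randn(B, T, head_size) for _ in range(3))

scores = q @ k.transpose(-2, -1) / math.sqrt(head_size)   # (B, T, T) attention scores
tril = torch.tril(torch.ones(T, T, dtype=torch.bool))     # look-ahead (triangle) mask
scores = scores.masked_fill(~tril, float("-inf"))         # hide future positions
out = F.softmax(scores, dim=-1) @ v                       # (B, T, head_size)
```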
@@ -414,12 +400,10 @@ class SelfAttention(nn.Module):
 
 class MultiHeadAttention(nn.Module):
     '''
-
     multi-head attention block consisting of num_heads SelfAttention blocks and a linear layer to
     rejoin outputs.
     specified head_size, n_embed, dropout, block_size and triangle_mask values are passed through to
     SelfAttention blocks
-
     '''
     def __init__(self,
         num_heads:int,
@@ -441,10 +425,8 @@ class MultiHeadAttention(nn.Module):
         mask:torch.Tensor=None
         ) -> torch.tensor:
         '''
-
         k, q and v are key, tensors to get key, query and value tensors.
         custom mask tensor can be applied.
-
         '''
         out = torch.cat([h(k, q, v, mask=mask) for h in self.heads], dim=-1)
         out = self.dropout(self.proj(out))
@@ -452,11 +434,9 @@ class MultiHeadAttention(nn.Module):
 
 class FeedForward(nn.Module):
     '''
-
     feed forward layer used after multi-head attention consisting of 2 lieanr layers with
     a ReLU in between. Linear layers expand from n_embed to n_embed * expansion_factor and
     back to n_embed.
-
     '''
     def __init__(self,
         n_embed:int,
@@ -478,10 +458,8 @@ class FeedForward(nn.Module):
 
 class EncoderBlock(nn.Module):
     '''
-
     encoder block consists of a sequence of multi-head attention, LayerNorm, feed-forward, LayerNorm
     head_size is calculated from n_embed // n_head
-
     '''
     def __init__(self,
         n_embed:int,
@@ -509,13 +487,11 @@ class EncoderBlock(nn.Module):
 
 class DecoderBlock(nn.Module):
     '''
-
     decoder block consists of a sequence of multi-head attention, LayerNorm, feed-forward, LayerNorm
     if cross-attention is True, a multi-head attention block and layerNorm is added before feed-forward
     taking specified enc_k and enc_v tensors as value and key tensors. These values should be the output
     of an encoder block.
     head_size is calculated from n_embed // n_head
-
     '''
     def __init__(self,
         n_embed:int,
@@ -555,9 +531,7 @@ class DecoderBlock(nn.Module):
 
 class MySequential(nn.Sequential):
     '''
-
     MySequential serves the same purpose as nn.Sequential but allows for multiple inputs and outputs
-
     '''
     def forward(self, *input):
         for module in self._modules.values():
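`MySequential.forward` takes `*input` and walks `self._modules.values()`, which is what lets a stack of blocks pass several tensors along instead of the single tensor `nn.Sequential` allows. The rest of the loop is not shown in this diff, so here is a generic sketch of that pattern with a toy module:

```python
import torch
import torch.nn as nn

class AddOnePair(nn.Module):
    def forward(self, x, y):
        return x + 1, y + 1        # each module consumes and returns the same tuple shape

class PairSequential(nn.Sequential):
    def forward(self, *inputs):
        for module in self._modules.values():
            inputs = module(*inputs)
        return inputs

x, y = PairSequential(AddOnePair(), AddOnePair())(torch.zeros(3), torch.ones(3))
# x -> tensor of 2s, y -> tensor of 3s
```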
@@ -566,39 +540,12 @@ class MySequential(nn.Sequential):
 
 class RoboConstructor(nn.Module):
     '''
-
     RoboConstructor assembles an encoder-decoder or decoder-only transformer.
     if the enc_* variables are not specified, or enc_n_blocks==0, the transformer will be decoder-only.
     - if any of the dec_* variables are not specified (except dec_expansion_factor) an error will occur.
     - if enc_n_blocks > 0 and any of the enc_* variables are not specified (except enc_expansion_factor and enc_block_size) an error will occur.
     dropout can be specified, default=0.1.
     if device is not specified, device will default to first available among ("cuda", "mps", "cpu")
-
-    prep_data() function returns a batch of specified batch_size, from dec_data (and dec_masks, enc_data and enc_masks if specified)
-    - if encoder is configured in this instance, enc_data must be specified.
-    - dec_block_size must be specified.
-    - if enc_block_size is not specified, the entire block_size of enc_data will be used.
-    this function is for use in train_robo()
-
-    train_robo() function trains the RoboConstructor instance transformer.
-    - training parameters can be specified such as max_iters, eval_interval, batch_size, eval_iters, learning_rate, label_smoothing.
-    - paths must be specified for decoder training data (and encoder training data if encoder-decoder transformer)
-    - optional paths to specify: decoder and encoder masks, decoder and encoder validation data, decoder and encoder validation masks data
-    - if neither pad_token or tokenizer is specified (or tokenizer has no pad_token), any padding in labels will contribute towards the loss
-    which may cause unwanted results. Specifying pad_token and/or tokenizer allows loss to be calculated while ignoring any padding in labels
-    - specify save_path to save the model as a .pkl file every eval_interval iterations using the save_component function.
-
-    generate() function uses the tranformer model from the RoboConstructor instance to generate an output from an input.
-    - input can be in the form of a string if input tokenizer is specified (enc_tokenizer for encoder-decoder and dec_tokenizder for decoder-only),
-    otherwise, it must be in the form of a list of tokens.
-    - if dec_tokenizer is specified, output will be a string.
-    - new tokens are generated until the dec_end_token (or dec_tokenizer.end_token) is generated, or the number of tokens generated == max_new_tokens.
-    - if input tokenizer is not specified, or input tokenizer.start_token is None, enc_start_token must be specified for an encoder-decoder model.
-    - separator_token is used to separate the input and generated tokens for a decoder-only model. If this value is not specified, there
-    will be no distinction between input tokens and generated tokens to the transformer, even if dec_tokenizer is specified.
-    - if new_line_token is not specified, output will be returned in one line, without any "\n" line separators.
-    - temperature, top_k and top_p can be specified to adjust the output.
-
     '''
     def __init__(self,
         n_embed:int,
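Per the docstring kept above, leaving `enc_n_blocks` at 0 (or omitting the `enc_*` arguments) yields a decoder-only model, and `dropout`/`device` have defaults. A constructor sketch: apart from `n_embed`, `dropout`, `enc_n_blocks`, `enc_block_size`, `dec_block_size` and `dec_expansion_factor`, which the diff names, the `dec_*` keywords below are assumptions inferred from the docstring's "dec_* variables" wording:

```python
from robo_lib.components import RoboConstructor

robo = RoboConstructor(
    n_embed=256,
    dec_n_blocks=4,        # assumed keyword name
    dec_n_head=8,          # assumed keyword name
    dec_block_size=100,
    enc_n_blocks=0,        # 0 -> decoder-only transformer
    dropout=0.1,
)
```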
@@ -711,6 +658,13 @@ class RoboConstructor(nn.Module):
         enc_block_size:int=None,
         enc_masks:str=None
         ) -> tuple[torch.tensor]:
+        '''
+        returns a batch of specified batch_size, from dec_data (and dec_masks, enc_data and enc_masks if specified)
+        - if encoder is configured in this instance, enc_data must be specified.
+        - dec_block_size must be specified.
+        - if enc_block_size is not specified, the entire block_size of enc_data will be used.
+        this method is for use in train_robo()
+        '''
         random_samples = torch.randint(dec_data.shape[0], (batch_size,))
 
         dec_train_batch_in, dec_train_batch_out, dec_train_masks_in = get_batch(dec_data, random_samples, masks=dec_masks, block_size=dec_block_size, get_offset=True)
@@ -734,7 +688,7 @@ class RoboConstructor(nn.Module):
         eval_interval:int,
         batch_size:int,
         training_dir_path:str,
-        eval_dir_path:str,
+        eval_dir_path:str=None,
         eval_iters:int=3,
         learning_rate:float=1e-4,
         pad_token:int=None,
@@ -742,6 +696,15 @@ class RoboConstructor(nn.Module):
         save_path:str=None,
         label_smoothing:float=0.1
         ) -> None:
+        '''
+        trains the RoboConstructor instance transformer.
+        - training parameters can be specified such as max_iters, eval_interval, batch_size, eval_iters, learning_rate, label_smoothing.
+        - paths must be specified for decoder training data (and encoder training data if encoder-decoder transformer)
+        - optional paths to specify: decoder and encoder masks, decoder and encoder validation data, decoder and encoder validation masks data
+        - if neither pad_token or tokenizer is specified (or tokenizer has no pad_token), any padding in labels will contribute towards the loss
+        which may cause unwanted results. Specifying pad_token and/or tokenizer allows loss to be calculated while ignoring any padding in labels
+        - specify save_path to save the model as a .pkl file every eval_interval iterations using the save_component function.
+        '''
 
         dec_training_path = os.path.join(training_dir_path, "decoder_data.pt")
         dec_training_data = torch.load(dec_training_path, weights_only=True)
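`train_robo` now takes a training directory (and an optional evaluation directory) instead of several individual `.pt` paths, and loads files such as `decoder_data.pt` from inside it. The call below mirrors the updated README example later in this diff; `robo` and `decoder_tok` are the model and tokenizer built earlier in that walk-through:

```python
robo.train_robo(
    max_iters=20000,
    eval_interval=200,
    batch_size=128,
    training_dir_path="data/training",    # directory containing decoder_data.pt (plus encoder/mask tensors if used)
    eval_dir_path="data/validation",      # now optional; omit to train without an evaluation set
    dec_tokenizer=decoder_tok,            # lets the loss ignore padding in the labels
    save_path="models/eng_to_fr_robo.pkl"
)
```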
@@ -831,6 +794,18 @@ class RoboConstructor(nn.Module):
         top_k:int=None,
         top_p:float=None
         ) -> list[int]|str:
+        '''
+        uses the tranformer model from the RoboConstructor instance to generate an output from an input.
+        - input can be in the form of a string if input tokenizer is specified (enc_tokenizer for encoder-decoder and dec_tokenizder for decoder-only),
+        otherwise, it must be in the form of a list of tokens.
+        - if dec_tokenizer is specified, output will be a string.
+        - new tokens are generated until the dec_end_token (or dec_tokenizer.end_token) is generated, or the number of tokens generated == max_new_tokens.
+        - if input tokenizer is not specified, or input tokenizer.start_token is None, enc_start_token must be specified for an encoder-decoder model.
+        - separator_token is used to separate the input and generated tokens for a decoder-only model. If this value is not specified, there
+        will be no distinction between input tokens and generated tokens to the transformer, even if dec_tokenizer is specified.
+        - if new_line_token is not specified, output will be returned in one line, without any "\n" line separators.
+        - temperature, top_k and top_p can be specified to adjust the output.
+        '''
         max_new_tokens = self.dec_block_size if max_new_tokens is None else max_new_tokens
 
         if self.cross_attention:
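A hedged `generate` sketch based on the docstring moved in this hunk: `max_new_tokens`, `temperature`, `top_k` and `top_p` appear in the signature or docstring, while passing the prompt positionally and supplying `dec_tokenizer` as a keyword are assumptions, since the full signature is not visible here:

```python
output = robo.generate(
    "Hello, how are you?",      # a plain string works when a tokenizer is supplied (assumed positional)
    dec_tokenizer=decoder_tok,  # with a dec_tokenizer the return value is a str, otherwise list[int]
    max_new_tokens=100,
    temperature=0.8,
    top_k=50,
    top_p=0.9,
)
print(output)
```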
@@ -891,9 +866,7 @@ class RoboConstructor(nn.Module):
 
 def save_component(component, save_path:str) -> None:
     '''
-
     saves component (such as TokenizerConstructor or RoboConstructor) as .pkl file.
-
     '''
     save_path = save_path + ".pkl" if save_path[-4:] != ".pkl" else save_path
     with open(save_path, "wb") as comp:
@@ -901,9 +874,7 @@ def save_component(component, save_path:str) -> None:
 
 def load_component(load_path:str):
     '''
-
     loads saved .pkl file into variable.
-
     '''
     load_path = load_path + ".pkl" if load_path[-4:] != ".pkl" else load_path
     with open(load_path, "rb") as comp:
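`save_component` and `load_component` pickle any robo-lib object and append the `.pkl` extension when it is missing; reusing the tokenizer from the earlier sketch:

```python
from robo_lib.components import save_component, load_component

save_component(tok, "models/tokenizer")            # saved as models/tokenizer.pkl
tok_reloaded = load_component("models/tokenizer")  # ".pkl" is appended here too
```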
{robo_lib-1.0.0.dist-info → robo_lib-1.0.1.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: robo_lib
-Version: 1.0.0
+Version: 1.0.1
 Summary: A package to create, configure, and train transformer models.
 Project-URL: Homepage, https://github.com/hamburgerfish/robo_pack
 Project-URL: Issues, https://github.com/hamburgerfish/robo_pack/issues
@@ -13,6 +13,7 @@ Requires-Python: >=3.8
 Requires-Dist: numpy
 Requires-Dist: tokenizers
 Requires-Dist: torch
+Requires-Dist: typing
 Description-Content-Type: text/markdown
 
 # robo-lib
@@ -83,10 +84,8 @@ proc.process_list(
     save_path="data/training",
     dec_data=french_train,
     dec_max_block_size=100,
-    dec_block_size_exceeded_policy="skip",
     enc_data=english_train,
-    enc_max_block_size=100
-    enc_block_size_exceeded_policy="skip"
+    enc_max_block_size=100
 )
 
 # process and save validation data as data/validation*.pt
@@ -94,10 +93,8 @@ proc.process_list(
     save_path="data/validation",
     dec_data=french_val,
     dec_max_block_size=100,
-    dec_block_size_exceeded_policy="skip",
     enc_data=english_val,
-    enc_max_block_size=100
-    enc_block_size_exceeded_policy="skip"
+    enc_max_block_size=100
 )
 ```
 - The `RoboConstructor` class is used to create and configure transformer models before trainin.
@@ -128,14 +125,8 @@ robo.train_robo(
     max_iters=20000,
     eval_interval=200,
     batch_size=128,
-
-
-    dec_training_masks_path="data/training_decoder_mask_data.pt",
-    dec_eval_masks_path="data/validation_decoder_mask_data.pt",
-    enc_training_path="data/training_encoder_data.pt",
-    enc_eval_path="data/validation_encoder_data.pt",
-    enc_training_masks_path="data/training_encoder_mask_data.pt",
-    enc_eval_masks_path="data/validation_encoder_mask_data.pt",
+    training_dir_path="data/training",
+    eval_dir_path="data/validation",
     dec_tokenizer=decoder_tok,
     save_path="models/eng_to_fr_robo.pkl"
 )
@@ -223,8 +214,8 @@ robo.train(
     max_iters=20000,
     eval_interval=200,
     batch_size=64,
-
-
+    training_dir_path="data/shakespeare_train",
+    eval_dir_path="data/shakespeare_valid",
     dec_tokenizer=tok,
     save_path="models/shakespeare_robo.pkl"
 )
robo_lib-1.0.1.dist-info/RECORD
ADDED
@@ -0,0 +1,6 @@
+robo_lib/__init__.py,sha256=NnzWHWwpFcSJD_XRMWKKPQFAIrRBFYiCFN0pgUGPygc,968
+robo_lib/components.py,sha256=mfvdNC77d1k1vmlNwG3ri2MbfmEn3haACAnRf56b_c4,43164
+robo_lib-1.0.1.dist-info/METADATA,sha256=4CG07VLULgAcGlfNeNXS9Pjzs7SXP5gNf95ddgGbWqc,9051
+robo_lib-1.0.1.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+robo_lib-1.0.1.dist-info/licenses/LICENSE,sha256=4XzkkpFqPzH0GH3zxOqRTqc7xUKSEe7dWPOuJYW95ac,1089
+robo_lib-1.0.1.dist-info/RECORD,,
robo_lib-1.0.0.dist-info/RECORD
DELETED
@@ -1,6 +0,0 @@
-robo_lib/__init__.py,sha256=NnzWHWwpFcSJD_XRMWKKPQFAIrRBFYiCFN0pgUGPygc,968
-robo_lib/components.py,sha256=M_1M1Y56_W0bSElZlg3M6gRoJJPAnUchTO3N8AdsEV8,43091
-robo_lib-1.0.0.dist-info/METADATA,sha256=GAnmrynDr3-hv9KyCjXlpx5I8v2BLQJCIDXURoGFw2w,9633
-robo_lib-1.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-robo_lib-1.0.0.dist-info/licenses/LICENSE,sha256=4XzkkpFqPzH0GH3zxOqRTqc7xUKSEe7dWPOuJYW95ac,1089
-robo_lib-1.0.0.dist-info/RECORD,,
{robo_lib-1.0.0.dist-info → robo_lib-1.0.1.dist-info}/WHEEL
File without changes

{robo_lib-1.0.0.dist-info → robo_lib-1.0.1.dist-info}/licenses/LICENSE
File without changes