robo-lib 1.0.0-py3-none-any.whl → 1.0.2-py3-none-any.whl
This diff compares publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those versions as they appear in their respective public registries.
- robo_lib/components.py +61 -88
- {robo_lib-1.0.0.dist-info → robo_lib-1.0.2.dist-info}/METADATA +8 -17
- robo_lib-1.0.2.dist-info/RECORD +6 -0
- robo_lib-1.0.0.dist-info/RECORD +0 -6
- {robo_lib-1.0.0.dist-info → robo_lib-1.0.2.dist-info}/WHEEL +0 -0
- {robo_lib-1.0.0.dist-info → robo_lib-1.0.2.dist-info}/licenses/LICENSE +0 -0
robo_lib/components.py
CHANGED
@@ -8,29 +8,24 @@ import pickle
 import itertools
 from pathlib import Path
 import os
+from typing import List, Literal
+
+pre_tokenizers = Literal["Whitespace", "IndividualDigit", "Digits", "BertPreTokenizer", "ByteLevel", "Metaspace", "Punctuation", "UnicodeScripts", "WhitespaceSplit"]
 
 class TokenizerConstructor:
     '''
-
     simple assembler for tokenizer using the tokenizers library
-    tokenizer parameters can be set using strings and list[
+    tokenizer parameters can be set using strings and list[str]s
     strings used for tokenizer_type, pre_tokenizers, normalizers arguments are the names of those present in the
     tokenizers library. Additionally "IndividualDigits" can be used in normalizers for tokenizers.pre_tokenizers.Digits(individual_digits=True)
 
-    train([paths]) function points to text files to be used for training the tokenizer instance
-
-    encode(string) function encodes string using trained tokenizer instance
-
-    decode(list[int]) function decodes list of tokenz using trained tokenizer instance
-
     vocab_size attribute returns the tokenizer instance's vocab_size (untrained tokenizer will have vocab_size=None)
 
-
     '''
     def __init__(self,
                  min_frequency:int=2,
-                 tokenizer_type:
-                 pre_tokenizers:
+                 tokenizer_type:Literal["BPE", "WordLevel", "WordPiece", "Unigram"] = "BPE",
+                 pre_tokenizers: pre_tokenizers|List[pre_tokenizers]=["Whitespace"],
                  normalizers:list[str]|str=["Lowercase", "NFD", "StripAccents", "Strip"],
                  vocab:dict[str,int] = {},
                  special_tokens:list[str]|str=[],
@@ -50,6 +45,7 @@ class TokenizerConstructor:
         self.start_token = self.special_tokens.index(start_token_string) if start_token_string is not None else None
         self.end_token = self.special_tokens.index(end_token_string) if end_token_string is not None else None
         self.pad_token = self.special_tokens.index(pad_token_string) if pad_token_string is not None else None
+        self.pad_token_string = pad_token_string
         self.new_line_token = self.special_tokens.index(new_line_token_string) if new_line_token_string is not None else None
 
         if tokenizer_type == "BPE":
@@ -120,40 +116,52 @@ class TokenizerConstructor:
 
 
     def train(self, training_paths:list[str]|str) -> None:
+        '''
+        points to text files to be used for training the tokenizer instance
+        '''
         if isinstance(training_paths, str):
             training_paths = [training_paths]
         self.tokenizer_type.train(training_paths, trainer=self.trainer)
         self.vocab_size = self.tokenizer_type.get_vocab_size()
 
     def encode(self, inp:str) -> list[int]:
+        '''
+        encodes string using trained tokenizer instance
+        '''
         return self.tokenizer_type.encode(inp).ids
 
     def encode_batch(self, inp:list[str], max_length:int=None) -> list[list[int]]:
+        '''
+        encodes strings in parallel and truncates entries with length > max_length
+        '''
         if max_length is not None:
             self.tokenizer_type.enable_truncation(max_length=max_length)
+            if self.pad_token is not None:
+                self.tokenizer_type.enable_padding(pad_id=self.pad_token, pad_token=self.pad_token_string, length=max_length)
         out = [row.ids for row in self.tokenizer_type.encode_batch(inp)]
         self.tokenizer_type.no_truncation()
+        if self.pad_token is not None:
+            self.tokenizer_type.enable_padding(pad_id=self.pad_token, pad_token=self.pad_token_string)
         return out
 
     def decode(self, inp:list[int]) -> str:
+        '''
+        decodes list of tokenz using trained tokenizer instance
+        '''
         return self.tokenizer_type.decode(inp)
 
 
 
 def create_mask(row:list, block_size:int) -> list[bool]:
     '''
-
     creates a mask list of length block_size for row, asuming mask does cover the entire row input
-
     '''
     mask = [1]*len(row) + [0]*(block_size - len(row))
     return mask
 
 def pre_process_data(data:str, start_token_string:str, end_token_string:str) -> list[int]:
     '''
-
-    returns string row with the tokenizer's start and end tokens if they exist
-
+    returns data with the tokenizer's start and end tokens added to each row if they exist
     '''
     if start_token_string is None and end_token_string is None:
         return data
@@ -168,11 +176,8 @@ def pre_process_data(data:str, start_token_string:str, end_token_string:str) ->
 
 def safe_stack(tensor_list:list[torch.tensor]) -> torch.tensor:
     '''
-
     torch stack with check to ensure tensors are valid in input list
-
     returns torch.stack(out_list) for all valid torch tensors in tensor_list. raises error if no valid tensors
-
     '''
     out_list = [row for row in tensor_list if isinstance(row, torch.Tensor)]
     if len(out_list) == 0:
@@ -182,21 +187,7 @@ def safe_stack(tensor_list:list[torch.tensor]) -> torch.tensor:
 
 class DataProcessor:
     '''
-
     data processor can be instantiated by specifying the tokenizer(s) for decoder and encoder data
-
-    process_list() function processes raw data in the form of list[str] or str for decoder and encoder simultaneously and
-    saves them to save_path as .pt files.
-    - encoder and decoder input data should have matching input and outputs so enc_data[n] should have its corresponding
-    decoder data at dec_data[n].
-    - max block size can be specified for both input and output, default takes the max
-    block size provided in the data respectively.
-    - if enc/dec_block_size is specified and enc/dec_block_size_exceeded_policy is not, an error will occur if a piece
-    of data larger than enc/dec_block_size is encountered. enc/dec_block_size_exceeded_policy can be set to "skip" or
-    "trim" to skip rows larger than enc/dec_block_size or truncate the row to specified enc/dec_block_size respectively.
-    - enc/dec_create_masks saves masks tensors to save_path as .pt files.
-
-
     '''
     def __init__(self,
                  dec_tokenizer:TokenizerConstructor,
@@ -214,6 +205,14 @@ class DataProcessor:
                      enc_create_masks:bool=True,
                      save_path:str = "."
                      ) -> None:
+        '''
+        processes raw data in the form of list[str] or str for decoder and encoder simultaneously and
+        saves them to save_path as .pt files.
+        - encoder and decoder input data should have matching input and outputs so enc_data[n] should have its corresponding
+        decoder data at dec_data[n].
+        - max block size can be specified for both input and output, default takes the max
+        block size provided in the data respectively. If data length > max_length, the data is trimmed.
+        '''
 
         if isinstance(dec_data, str):
             dec_data = [dec_data]
@@ -258,10 +257,8 @@ def get_valid_samples(random_samples:torch.Tensor,
                       block_size:int
                       ) -> list[int]:
     '''
-
     returns list of len(random_samples) with values corresponding to index values of masks that ensure minimum masked
     values when taking sample of length block_size
-
     '''
     valid_samples = [0 if sum(masks[row_num]) <= block_size else random.randint(0, sum(masks[row_num]) - block_size) for row_num in random_samples]
     return valid_samples
@@ -273,11 +270,9 @@ def get_batch(data:torch.Tensor,
               get_offset:bool=True
               ) -> tuple[torch.tensor]:
     '''
-
     returns random batches from data tensor using random sample for data selection.
     - returns corresponding batch offset by 1 unless get_offset=False
     - returns corresponding masks batch if masks data is specified
-
     '''
     batch_size = len(random_samples)
     if block_size is not None and block_size != data.shape[1]:
@@ -310,9 +305,6 @@ def top_kp_filter(logits: torch.Tensor,
         logits: (batch_size, vocab_size) tensor of raw logits.
         top_k: keep only top_k tokens with highest logits.
         top_p: keep the smallest set of tokens with cumulative probability >= top_p.
-
-    Returns:
-        selected: tensor of selected token indices (batch_size,)
     '''
     logits = logits.clone() # avoid modifying input logits in-place
 
@@ -359,10 +351,8 @@ def top_kp_filter(logits: torch.Tensor,
 
 class SelfAttention(nn.Module):
     '''
-
     single self attention block of size head_size.
     triangle_mask=True to apply look-ahead mask of size block_size.
-
     '''
     def __init__(self,
                  head_size:int,
@@ -390,10 +380,8 @@ class SelfAttention(nn.Module):
                 mask:torch.Tensor=None
                 ) -> torch.tensor:
         '''
-
         k, q and v are key, tensors to get key, query and value tensors.
         custom mask tensor can be applied.
-
         '''
         _,T,_ = k.shape
 
@@ -414,12 +402,10 @@ class SelfAttention(nn.Module):
 
 class MultiHeadAttention(nn.Module):
     '''
-
     multi-head attention block consisting of num_heads SelfAttention blocks and a linear layer to
     rejoin outputs.
     specified head_size, n_embed, dropout, block_size and triangle_mask values are passed through to
     SelfAttention blocks
-
     '''
     def __init__(self,
                  num_heads:int,
@@ -441,10 +427,8 @@ class MultiHeadAttention(nn.Module):
                 mask:torch.Tensor=None
                 ) -> torch.tensor:
         '''
-
         k, q and v are key, tensors to get key, query and value tensors.
         custom mask tensor can be applied.
-
         '''
         out = torch.cat([h(k, q, v, mask=mask) for h in self.heads], dim=-1)
         out = self.dropout(self.proj(out))
@@ -452,11 +436,9 @@ class MultiHeadAttention(nn.Module):
 
 class FeedForward(nn.Module):
     '''
-
     feed forward layer used after multi-head attention consisting of 2 lieanr layers with
     a ReLU in between. Linear layers expand from n_embed to n_embed * expansion_factor and
     back to n_embed.
-
     '''
     def __init__(self,
                  n_embed:int,
@@ -478,10 +460,8 @@ class FeedForward(nn.Module):
 
 class EncoderBlock(nn.Module):
     '''
-
     encoder block consists of a sequence of multi-head attention, LayerNorm, feed-forward, LayerNorm
     head_size is calculated from n_embed // n_head
-
     '''
     def __init__(self,
                  n_embed:int,
@@ -509,13 +489,11 @@ class EncoderBlock(nn.Module):
 
 class DecoderBlock(nn.Module):
     '''
-
     decoder block consists of a sequence of multi-head attention, LayerNorm, feed-forward, LayerNorm
     if cross-attention is True, a multi-head attention block and layerNorm is added before feed-forward
     taking specified enc_k and enc_v tensors as value and key tensors. These values should be the output
     of an encoder block.
     head_size is calculated from n_embed // n_head
-
     '''
     def __init__(self,
                  n_embed:int,
@@ -555,9 +533,7 @@ class DecoderBlock(nn.Module):
 
 class MySequential(nn.Sequential):
     '''
-
     MySequential serves the same purpose as nn.Sequential but allows for multiple inputs and outputs
-
     '''
     def forward(self, *input):
         for module in self._modules.values():
@@ -566,39 +542,12 @@ class MySequential(nn.Sequential):
 
 class RoboConstructor(nn.Module):
     '''
-
     RoboConstructor assembles an encoder-decoder or decoder-only transformer.
     if the enc_* variables are not specified, or enc_n_blocks==0, the transformer will be decoder-only.
     - if any of the dec_* variables are not specified (except dec_expansion_factor) an error will occur.
     - if enc_n_blocks > 0 and any of the enc_* variables are not specified (except enc_expansion_factor and enc_block_size) an error will occur.
     dropout can be specified, default=0.1.
     if device is not specified, device will default to first available among ("cuda", "mps", "cpu")
-
-    prep_data() function returns a batch of specified batch_size, from dec_data (and dec_masks, enc_data and enc_masks if specified)
-    - if encoder is configured in this instance, enc_data must be specified.
-    - dec_block_size must be specified.
-    - if enc_block_size is not specified, the entire block_size of enc_data will be used.
-    this function is for use in train_robo()
-
-    train_robo() function trains the RoboConstructor instance transformer.
-    - training parameters can be specified such as max_iters, eval_interval, batch_size, eval_iters, learning_rate, label_smoothing.
-    - paths must be specified for decoder training data (and encoder training data if encoder-decoder transformer)
-    - optional paths to specify: decoder and encoder masks, decoder and encoder validation data, decoder and encoder validation masks data
-    - if neither pad_token or tokenizer is specified (or tokenizer has no pad_token), any padding in labels will contribute towards the loss
-    which may cause unwanted results. Specifying pad_token and/or tokenizer allows loss to be calculated while ignoring any padding in labels
-    - specify save_path to save the model as a .pkl file every eval_interval iterations using the save_component function.
-
-    generate() function uses the tranformer model from the RoboConstructor instance to generate an output from an input.
-    - input can be in the form of a string if input tokenizer is specified (enc_tokenizer for encoder-decoder and dec_tokenizder for decoder-only),
-    otherwise, it must be in the form of a list of tokens.
-    - if dec_tokenizer is specified, output will be a string.
-    - new tokens are generated until the dec_end_token (or dec_tokenizer.end_token) is generated, or the number of tokens generated == max_new_tokens.
-    - if input tokenizer is not specified, or input tokenizer.start_token is None, enc_start_token must be specified for an encoder-decoder model.
-    - separator_token is used to separate the input and generated tokens for a decoder-only model. If this value is not specified, there
-    will be no distinction between input tokens and generated tokens to the transformer, even if dec_tokenizer is specified.
-    - if new_line_token is not specified, output will be returned in one line, without any "\n" line separators.
-    - temperature, top_k and top_p can be specified to adjust the output.
-
     '''
     def __init__(self,
                  n_embed:int,
@@ -711,6 +660,13 @@ class RoboConstructor(nn.Module):
                   enc_block_size:int=None,
                   enc_masks:str=None
                   ) -> tuple[torch.tensor]:
+        '''
+        returns a batch of specified batch_size, from dec_data (and dec_masks, enc_data and enc_masks if specified)
+        - if encoder is configured in this instance, enc_data must be specified.
+        - dec_block_size must be specified.
+        - if enc_block_size is not specified, the entire block_size of enc_data will be used.
+        this method is for use in train_robo()
+        '''
         random_samples = torch.randint(dec_data.shape[0], (batch_size,))
 
         dec_train_batch_in, dec_train_batch_out, dec_train_masks_in = get_batch(dec_data, random_samples, masks=dec_masks, block_size=dec_block_size, get_offset=True)
@@ -734,7 +690,7 @@ class RoboConstructor(nn.Module):
                    eval_interval:int,
                    batch_size:int,
                    training_dir_path:str,
-                   eval_dir_path:str,
+                   eval_dir_path:str=None,
                    eval_iters:int=3,
                    learning_rate:float=1e-4,
                    pad_token:int=None,
@@ -742,6 +698,15 @@ class RoboConstructor(nn.Module):
                    save_path:str=None,
                    label_smoothing:float=0.1
                    ) -> None:
+        '''
+        trains the RoboConstructor instance transformer.
+        - training parameters can be specified such as max_iters, eval_interval, batch_size, eval_iters, learning_rate, label_smoothing.
+        - paths must be specified for decoder training data (and encoder training data if encoder-decoder transformer)
+        - optional paths to specify: decoder and encoder masks, decoder and encoder validation data, decoder and encoder validation masks data
+        - if neither pad_token or tokenizer is specified (or tokenizer has no pad_token), any padding in labels will contribute towards the loss
+        which may cause unwanted results. Specifying pad_token and/or tokenizer allows loss to be calculated while ignoring any padding in labels
+        - specify save_path to save the model as a .pkl file every eval_interval iterations using the save_component function.
+        '''
 
         dec_training_path = os.path.join(training_dir_path, "decoder_data.pt")
         dec_training_data = torch.load(dec_training_path, weights_only=True)
@@ -831,6 +796,18 @@ class RoboConstructor(nn.Module):
                  top_k:int=None,
                  top_p:float=None
                  ) -> list[int]|str:
+        '''
+        uses the tranformer model from the RoboConstructor instance to generate an output from an input.
+        - input can be in the form of a string if input tokenizer is specified (enc_tokenizer for encoder-decoder and dec_tokenizder for decoder-only),
+        otherwise, it must be in the form of a list of tokens.
+        - if dec_tokenizer is specified, output will be a string.
+        - new tokens are generated until the dec_end_token (or dec_tokenizer.end_token) is generated, or the number of tokens generated == max_new_tokens.
+        - if input tokenizer is not specified, or input tokenizer.start_token is None, enc_start_token must be specified for an encoder-decoder model.
+        - separator_token is used to separate the input and generated tokens for a decoder-only model. If this value is not specified, there
+        will be no distinction between input tokens and generated tokens to the transformer, even if dec_tokenizer is specified.
+        - if new_line_token is not specified, output will be returned in one line, without any "\n" line separators.
+        - temperature, top_k and top_p can be specified to adjust the output.
+        '''
         max_new_tokens = self.dec_block_size if max_new_tokens is None else max_new_tokens
 
         if self.cross_attention:
@@ -891,9 +868,7 @@ class RoboConstructor(nn.Module):
 
 def save_component(component, save_path:str) -> None:
     '''
-
     saves component (such as TokenizerConstructor or RoboConstructor) as .pkl file.
-
     '''
     save_path = save_path + ".pkl" if save_path[-4:] != ".pkl" else save_path
     with open(save_path, "wb") as comp:
@@ -901,9 +876,7 @@ def save_component(component, save_path:str) -> None:
 
 def load_component(load_path:str):
     '''
-
     loads saved .pkl file into variable.
-
     '''
     load_path = load_path + ".pkl" if load_path[-4:] != ".pkl" else load_path
     with open(load_path, "rb") as comp:
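Beyond moving the class-level docstrings onto the individual methods, the main behavioural change in components.py is in `TokenizerConstructor.encode_batch`: in 1.0.2, when the tokenizer was constructed with a `pad_token_string`, rows encoded with a `max_length` are padded up to that length as well as truncated. The following is a minimal sketch of the expected usage based only on the signatures visible in the hunks above; the file path, the special-token strings, and the constructor defaults are illustrative assumptions, not values taken from the package.

```python
from robo_lib.components import TokenizerConstructor

# Illustrative setup: parameter names come from the __init__ hunks above,
# but the token strings and the corpus path are assumptions.
tok = TokenizerConstructor(
    tokenizer_type="BPE",
    special_tokens=["<pad>", "<sos>", "<eos>"],
    pad_token_string="<pad>",
    start_token_string="<sos>",
    end_token_string="<eos>",
)
tok.train("corpus.txt")  # train() accepts a single path or a list of paths

# In 1.0.2, rows longer than max_length are truncated and, because a pad
# token was configured, shorter rows are padded up to max_length as well.
batch = tok.encode_batch(["a short line", "a noticeably longer line of text"], max_length=16)
print([len(row) for row in batch])  # expected: [16, 16]
```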
{robo_lib-1.0.0.dist-info → robo_lib-1.0.2.dist-info}/METADATA
CHANGED
@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: robo_lib
-Version: 1.0.0
+Version: 1.0.2
 Summary: A package to create, configure, and train transformer models.
 Project-URL: Homepage, https://github.com/hamburgerfish/robo_pack
 Project-URL: Issues, https://github.com/hamburgerfish/robo_pack/issues
@@ -13,6 +13,7 @@ Requires-Python: >=3.8
 Requires-Dist: numpy
 Requires-Dist: tokenizers
 Requires-Dist: torch
+Requires-Dist: typing
 Description-Content-Type: text/markdown
 
 # robo-lib
@@ -83,10 +84,8 @@ proc.process_list(
     save_path="data/training",
     dec_data=french_train,
     dec_max_block_size=100,
-    dec_block_size_exceeded_policy="skip",
     enc_data=english_train,
-    enc_max_block_size=100
-    enc_block_size_exceeded_policy="skip"
+    enc_max_block_size=100
 )
 
 # process and save validation data as data/validation*.pt
@@ -94,10 +93,8 @@ proc.process_list(
     save_path="data/validation",
     dec_data=french_val,
     dec_max_block_size=100,
-    dec_block_size_exceeded_policy="skip",
     enc_data=english_val,
-    enc_max_block_size=100
-    enc_block_size_exceeded_policy="skip"
+    enc_max_block_size=100
 )
 ```
 - The `RoboConstructor` class is used to create and configure transformer models before trainin.
@@ -128,14 +125,8 @@ robo.train_robo(
     max_iters=20000,
     eval_interval=200,
     batch_size=128,
-
-
-    dec_training_masks_path="data/training_decoder_mask_data.pt",
-    dec_eval_masks_path="data/validation_decoder_mask_data.pt",
-    enc_training_path="data/training_encoder_data.pt",
-    enc_eval_path="data/validation_encoder_data.pt",
-    enc_training_masks_path="data/training_encoder_mask_data.pt",
-    enc_eval_masks_path="data/validation_encoder_mask_data.pt",
+    training_dir_path="data/training",
+    eval_dir_path="data/validation",
     dec_tokenizer=decoder_tok,
     save_path="models/eng_to_fr_robo.pkl"
 )
@@ -223,8 +214,8 @@ robo.train(
     max_iters=20000,
     eval_interval=200,
     batch_size=64,
-
-
+    training_dir_path="data/shakespeare_train",
+    eval_dir_path="data/shakespeare_valid",
     dec_tokenizer=tok,
     save_path="models/shakespeare_robo.pkl"
 )
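The README hunks above track the matching signature change in `train_robo`: the per-file `*_path` arguments are replaced by a single `training_dir_path` plus an optional `eval_dir_path` (defaulting to `None` in 1.0.2), and the method derives file names itself, e.g. `os.path.join(training_dir_path, "decoder_data.pt")` in the components.py hunk further up. A hedged sketch of a decoder-only training call under the new signature follows; every path is hypothetical, and the directory layout is assumed to have been produced earlier by `DataProcessor.process_list(save_path=...)` and `save_component()`.

```python
from robo_lib.components import load_component

# Hypothetical artifacts from an earlier run: a pickled TokenizerConstructor
# and a pickled RoboConstructor saved with save_component().
tok = load_component("tokenizers/shakespeare_tok.pkl")
robo = load_component("models/shakespeare_robo.pkl")

robo.train_robo(
    max_iters=20000,
    eval_interval=200,
    batch_size=64,
    training_dir_path="data/shakespeare_train",  # train_robo() joins file names onto this directory
    eval_dir_path="data/shakespeare_valid",      # optional in 1.0.2 (defaults to None)
    dec_tokenizer=tok,
    save_path="models/shakespeare_robo.pkl",
)
```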
robo_lib-1.0.2.dist-info/RECORD
ADDED
@@ -0,0 +1,6 @@
+robo_lib/__init__.py,sha256=NnzWHWwpFcSJD_XRMWKKPQFAIrRBFYiCFN0pgUGPygc,968
+robo_lib/components.py,sha256=eJ6ZU8xvPXy3vFx26C7J_2jSSsMPPoIFk7EDTXZpfOI,43256
+robo_lib-1.0.2.dist-info/METADATA,sha256=j6aCx5KqUIZmy-j8r2AGafOB450-G0uBbQWdJUeyLKc,9051
+robo_lib-1.0.2.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
+robo_lib-1.0.2.dist-info/licenses/LICENSE,sha256=4XzkkpFqPzH0GH3zxOqRTqc7xUKSEe7dWPOuJYW95ac,1089
+robo_lib-1.0.2.dist-info/RECORD,,
robo_lib-1.0.0.dist-info/RECORD
DELETED
@@ -1,6 +0,0 @@
-robo_lib/__init__.py,sha256=NnzWHWwpFcSJD_XRMWKKPQFAIrRBFYiCFN0pgUGPygc,968
-robo_lib/components.py,sha256=M_1M1Y56_W0bSElZlg3M6gRoJJPAnUchTO3N8AdsEV8,43091
-robo_lib-1.0.0.dist-info/METADATA,sha256=GAnmrynDr3-hv9KyCjXlpx5I8v2BLQJCIDXURoGFw2w,9633
-robo_lib-1.0.0.dist-info/WHEEL,sha256=qtCwoSJWgHk21S1Kb4ihdzI2rlJ1ZKaIurTj_ngOhyQ,87
-robo_lib-1.0.0.dist-info/licenses/LICENSE,sha256=4XzkkpFqPzH0GH3zxOqRTqc7xUKSEe7dWPOuJYW95ac,1089
-robo_lib-1.0.0.dist-info/RECORD,,
{robo_lib-1.0.0.dist-info → robo_lib-1.0.2.dist-info}/WHEEL
File without changes

{robo_lib-1.0.0.dist-info → robo_lib-1.0.2.dist-info}/licenses/LICENSE
File without changes