gptmodel-1.0.0-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
gptmodel/__init__.py
ADDED
@@ -0,0 +1 @@
+from .gptmodel import *
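The wildcard import above means the package's public API is whatever gptmodel/gptmodel.py exports, so the GPTModel class defined in the module below can be imported directly from the top-level package. A minimal sketch (the class name follows from the module listed next):

    from gptmodel import GPTModel
    model = GPTModel()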
gptmodel/gptmodel.py
ADDED
@@ -0,0 +1,476 @@
+# This is a standard code of a GPT (Generative Pre-trained Transformer) model, developed by Sapiens Technology®️,
+# which faithfully follows the mathematical structure of the article “Attention Is All You Need” for the construction of the Transformer architecture
+# used in the pattern recognition of the model that is saved. Some optimizations that do not influence the Transformer architecture
+# were applied only to facilitate the adjustments of the parameters and variables of the training, saving, loading, fine-tuning and inference of the pre-trained model.
+# --------------------------> A SAPIENS TECHNOLOGY®️ PRODUCTION <--------------------------
+class GPTModel:
+    def __init__(self, embedding_dim=384, block_size=500, batch_size=32, number_heads=6, number_layers=6, dropout=0.1, learning_rate=3e-4, eval_interval=500, epochs=2000):
+        self.__embedding_dim = max((1, int(embedding_dim))) if type(embedding_dim) in (bool, int, float) else 384
+        self.__block_size = max((1, int(block_size))) if type(block_size) in (bool, int, float) else 500
+        self.__batch_size = max((1, int(batch_size))) if type(batch_size) in (bool, int, float) else 32
+        self.__number_heads = max((1, int(number_heads))) if type(number_heads) in (bool, int, float) else 6
+        self.__number_layers = max((1, int(number_layers))) if type(number_layers) in (bool, int, float) else 6
+        self.dropout = max((0, float(dropout))) if type(dropout) in (bool, int, float) else 0.1
+        self.__learning_rate = max((0, float(learning_rate))) if type(learning_rate) in (bool, int, float) else 3e-4
+        self.__eval_interval = max((1, int(eval_interval))) if type(eval_interval) in (bool, int, float) else 500
+        self.__epochs = max((1, int(epochs))) if type(epochs) in (bool, int, float) else 2000
+        from torch import cuda, device, backends
+        from torch.utils.data import Dataset, DataLoader
+        from torch.nn import Module, functional as Function, utils
+        from torch import nn as artificial_neural_network, triu, ones
+        from torch import tensor, no_grad, int64, multinomial, cat, topk, where, sort, cumsum, zeros_like, bool as torch_bool, save, load
+        from tiktoken import get_encoding
+        from json import load as json_load
+        from torch import optim
+        from tqdm import tqdm
+        from os import path as os_path, makedirs as os_makedirs
+        if cuda.is_available(): local_device = device('cuda')
+        elif backends.mps.is_available(): local_device = device('mps')
+        else: local_device = device('cpu')
+        self.__Dataset = Dataset
+        self.__Module = Module
+        self.__neural_network = artificial_neural_network
+        self.__tensor = tensor
+        self.__triu = triu
+        self.__ones = ones
+        self.__no_grad = no_grad
+        self.__device = local_device
+        self.__Function = Function
+        self.__int64 = int64
+        self.__multinomial = multinomial
+        self.__cat = cat
+        self.__topk = topk
+        self.__where = where
+        self.__sort = sort
+        self.__cumsum = cumsum
+        self.__zeros_like = zeros_like
+        self.__bool = torch_bool
+        self.__get_encoding = get_encoding
+        self.__json_load = json_load
+        self.__DataLoader = DataLoader
+        self.__optim = optim
+        self.__utils = utils
+        self.__tqdm = tqdm
+        self.__os_path = os_path
+        self.__os_makedirs = os_makedirs
+        self.__save = save
+        self.__load = load
+        self.__model = None
+        self.__encode = None
+        self.__decode = None
+        self.__end_tag = None
+        self.__string = ''
+        self.__vocab_size = 0
+        self.__char_to_idx = {}
+        self.__idx_to_char = {}
+        self.__tokenizer = 'gpt'
+        self.__optimizer = None
+        self.__train = False
+        self.parameters_number = 0
+        class TextDataset(self.__Dataset):
+            def __init__(self, data={}, block_size=0): self.data, self.block_size = data, block_size
+            def __len__(self): return len(self.data) - self.block_size
+            def __getitem__(self, index=0):
+                input_sequence = self.data[index:index + self.block_size]
+                target_sequence = self.data[index + 1:index + self.block_size + 1]
+                return input_sequence, target_sequence
+        class Transformer(self.__Module):
+            def __init__(self, outer=None, vocab_size=0, embedding_dim=0, number_heads=0, number_layers=0, dropout=None, block_size=0):
+                super().__init__()
+                self.outer = outer
+                self.positional_encoding = outer._GPTModel__neural_network.Parameter(outer._GPTModel__tensor([]).new_zeros(1, block_size, embedding_dim))
+                self.dropout = outer._GPTModel__neural_network.Dropout(dropout)
+                self.input_embedding = outer._GPTModel__neural_network.Embedding(vocab_size, embedding_dim)
+                self.multi_head_attention = outer._GPTModel__neural_network.TransformerDecoder(outer._GPTModel__neural_network.TransformerDecoderLayer(d_model=embedding_dim, nhead=number_heads, dropout=dropout), num_layers=number_layers)
+                self.output_function = outer._GPTModel__neural_network.Linear(embedding_dim, vocab_size)
+                self.block_size = block_size
+            def forward(self, input_tensor=[]):
+                outer = self.outer
+                batch_size, sequence_length = input_tensor.size()
+                positions = self.positional_encoding[:, :sequence_length, :].to(input_tensor.device)
+                output_embedding = self.dropout(self.input_embedding(input_tensor) + positions)
+                transposed = output_embedding.transpose(0, 1)
+                masked_multi_head_attention = outer._GPTModel__triu(outer._GPTModel__ones(sequence_length, sequence_length, device=input_tensor.device) * float('-inf'), diagonal=1)
+                add_and_norm = self.multi_head_attention(transposed, transposed, tgt_mask=masked_multi_head_attention)
+                add_and_norm = add_and_norm.transpose(0, 1)
+                return self.output_function(add_and_norm)
+        self.__TextDatasets = TextDataset
+        self.__Transformers = Transformer
+    def __compute_loss(self, loader=[]):
+        self.__model.eval()
+        total_loss = 0
+        with self.__no_grad():
+            for input_batch, target_batch in loader:
+                input_batch, target_batch = input_batch.to(self.__device), target_batch.to(self.__device)
+                logits = self.__model(input_batch)
+                loss = self.__Function.cross_entropy(logits.view(-1, logits.size(-1)), target_batch.view(-1))
+                total_loss += loss.item()
+        return total_loss / len(loader)
+    def __format_params(self, number_params=0):
+        if number_params < 1_000: return f'{number_params}U'
+        elif number_params < 1_000_000: return f'{number_params // 1_000}K'
+        elif number_params < 1_000_000_000: return f'{number_params // 1_000_000}M'
+        elif number_params < 1_000_000_000_000: return f'{number_params // 1_000_000_000}B'
+        else: return f'{number_params // 1_000_000_000_000}T'
+    def __get_found_end_tag(self, decoded_token='', decoded_tokens='', limits=[]):
+        if self.__end_tag is None: return False
+        decoded_token, decoded_tokens, limits = str(decoded_token).strip(), str(decoded_tokens).strip(), list(limits)
+        for limit in ['']+limits+[' ']:
+            if decoded_token.endswith(limit+self.__end_tag) or decoded_tokens.endswith(limit+self.__end_tag): return True
+            elif decoded_token.endswith(limit+self.__end_tag[0]) or decoded_tokens.endswith(limit+self.__end_tag[0]): return True
+        return False
+    def __generate_tokens_x(self, prompt='', max_tokens=500, temperature=1.0):
+        self.__model.eval()
+        encoded_prompt = self.__encode(prompt)
+        input_tensor = self.__tensor(encoded_prompt, dtype=self.__int64).unsqueeze(0).to(self.__device)
+        limits = ('.', '\n', '!', '?', ';')
+        with self.__no_grad():
+            tokens_generated, decoded_tokens = 0, ''
+            while True:
+                conditioned_input = input_tensor[:, -self.__block_size:] if input_tensor.size(1) > self.__block_size else input_tensor
+                logits = self.__model(conditioned_input)
+                logits = logits[:, -1, :] / temperature
+                output_probabilities = self.__Function.softmax(logits, dim=-1)
+                shifted_right = self.__multinomial(output_probabilities, num_samples=1)
+                input_tensor = self.__cat((input_tensor, shifted_right), dim=1)
+                token = shifted_right.item()
+                decoded_token, found_end_tag = self.__decode([token]), False
+                if tokens_generated == 0 and '\n' in decoded_token: continue
+                tokens_generated += 1
+                decoded_tokens += decoded_token
+                found_end_tag = self.__get_found_end_tag(decoded_token=decoded_token, decoded_tokens=decoded_tokens, limits=limits)
+                if found_end_tag and decoded_token.endswith(self.__end_tag[0]): decoded_token = decoded_token[:-1]
+                yield decoded_token
+                if found_end_tag or ((tokens_generated >= max_tokens) and (decoded_token[-1] in limits)) or (tokens_generated >= (max_tokens*2)): break
+    def __generate_tokens_y(self, prompt='', max_tokens=500, temperature=1.0, top_k=50, top_p=0.9):
+        self.__model.eval()
+        encoded_prompt = self.__encode(prompt)
+        input_tensor = self.__tensor(encoded_prompt, dtype=self.__int64).unsqueeze(0).to(self.__device)
+        limits = ('.', '\n', '!', '?', ';')
+        with self.__no_grad():
+            tokens_generated, decoded_tokens = 0, ''
+            while True:
+                conditioned_input = (input_tensor[:, -self.__block_size:] if input_tensor.size(1) > self.__block_size else input_tensor)
+                logits = self.__model(conditioned_input)
+                logits = logits[:, -1, :] / temperature
+                if top_k > 0:
+                    top_k = min(top_k, logits.size(-1))
+                    value, _ = self.__topk(logits, top_k)
+                    thresh = value[:, -1].unsqueeze(-1)
+                    logits = self.__where(logits < thresh, self.__tensor(float('-inf')).to(logits), logits)
+                if top_p < 1.0:
+                    sorted_logits, sorted_index = self.__sort(logits, dim=-1, descending=True)
+                    sorted_probabilities = self.__Function.softmax(sorted_logits, dim=-1)
+                    cumulative_probabilities = self.__cumsum(sorted_probabilities, dim=-1)
+                    sorted_mask = cumulative_probabilities > top_p
+                    sorted_mask[:, 0] = False
+                    mask = self.__zeros_like(logits, dtype=self.__bool)
+                    mask.scatter_(-1, sorted_index, sorted_mask)
+                    logits = logits.masked_fill(mask, float('-inf'))
+                output_probabilities = self.__Function.softmax(logits, dim=-1)
+                shifted_right = self.__multinomial(output_probabilities, num_samples=1)
+                input_tensor = self.__cat((input_tensor, shifted_right), dim=1)
+                token = shifted_right.item()
+                decoded_token, found_end_tag = self.__decode([token]), False
+                if tokens_generated == 0 and '\n' in decoded_token: continue
+                tokens_generated += 1
+                decoded_tokens += decoded_token
+                found_end_tag = self.__get_found_end_tag(decoded_token=decoded_token, decoded_tokens=decoded_tokens, limits=limits)
+                if found_end_tag and decoded_token.endswith(self.__end_tag[0]): decoded_token = decoded_token[:-1]
+                yield decoded_token
+                if found_end_tag or ((tokens_generated >= max_tokens) and (decoded_token[-1] in limits)) or (tokens_generated >= (max_tokens*2)): break
+    def __generate_tokens(self, prompt='', max_tokens=500, temperature=1.0, top_k=0, top_p=1.0):
+        prompt = '?' if len(str(prompt).strip()) < 1 else str(prompt).strip()
+        def get_last_n_tokens(text='', n=0):
+            if self.__tokenizer == 'sapi': return text[-n:]
+            else:
+                encoding = self.__get_encoding('gpt2')
+                tokens = encoding.encode(text)
+                last_n_tokens = tokens[-n:]
+                truncated_text = encoding.decode(last_n_tokens)
+                return truncated_text
+        prompt = get_last_n_tokens(text=prompt, n=self.__block_size)
+        if top_k > 0 or top_p < 1.0: return self.__generate_tokens_y(prompt=prompt, max_tokens=max_tokens, temperature=temperature, top_k=top_k, top_p=top_p)
+        else: return self.__generate_tokens_x(prompt=prompt, max_tokens=max_tokens, temperature=temperature)
+    def train(self, dataset_path='', string='', precision=0.9, tokenizer='gpt', context_window=500, end_tag=None, validate=0.0, progress=True):
+        try:
+            training_metrics = {'val_loss': 0.0, 'loss': 0.0, 'generalization_rate': 0.0, 'precision': 0.0}
+            dataset_path = str(dataset_path).strip()
+            string = str(string).strip()
+            precision = min((1.0, max((0.0, float(precision))))) if type(precision) in (bool, int, float) else 0.9
+            tokenizer = str(tokenizer).lower().strip()
+            self.__block_size = max((1, int(context_window))) if type(context_window) in (bool, int, float) else 500
+            if end_tag is not None and self.__end_tag is None: self.__end_tag = str(end_tag)
+            validate = min((1.0, max((0.0, float(validate))))) if type(validate) in (bool, int, float) else 0.0
+            progress = bool(progress) if type(progress) in (bool, int, float) else True
+            if tokenizer not in ('sapi', 'gpt'): tokenizer = 'gpt'
+            self.__string = str(self.__string+'\n\n'+string).strip()
+            loss_limit = min(1.0, max(0.0, 1.0 - precision))
+            is_txt, is_json, text_data = dataset_path.endswith('.txt'), dataset_path.endswith('.json'), ''
+            def prepare_json(json_data={}):
+                if type(json_data) == dict: pairs = json_data[list(json_data.keys())[0]]
+                else: pairs = json_data
+                if self.__end_tag is None: self.__end_tag = '<|end|>'
+                return '\n\n'.join([str(pair[list(pair.keys())[0]]+'\n'+pair[list(pair.keys())[1]]).replace(self.__end_tag, '').strip()+self.__end_tag for pair in pairs])
+            def is_web_address(url_path=''):
+                url_path = str(url_path).lower().strip()
+                return url_path.startswith('https://') or url_path.startswith('http://') or url_path.startswith('www.')
+            _is_web_address = is_web_address(url_path=dataset_path)
+            if _is_web_address:
+                is_json = True if '.json' in dataset_path.lower() else False
+                def read_remote_file(url_path=''):
+                    from urllib.request import urlopen
+                    with urlopen(url_path) as response: return str(response.read().decode('utf-8', errors='replace').replace('\r\n', '\n').replace('\r', '\n')).strip()
+                text_data = read_remote_file(url_path=dataset_path)
+                if is_json:
+                    def load_json(string_content=''):
+                        json_content = {}
+                        string_content = str(string_content)
+                        try:
+                            from json import loads
+                            json_content = loads(string_content)
+                        except:
+                            from ast import literal_eval
+                            json_content = literal_eval(string_content)
+                        return json_content
+                    json_data = load_json(string_content=text_data)
+                    text_data = prepare_json(json_data=json_data)
+            else:
+                if not is_txt and not is_json and len(self.__string) < 1: raise ValueError('Unsupported file format. Use .txt or .json.')
+                if is_txt:
+                    with open(dataset_path, 'r', encoding='utf-8') as file: text_data = str(file.read()).strip()
+                elif is_json:
+                    with open(dataset_path, 'r', encoding='utf-8') as file: json_data = self.__json_load(file)
+                    text_data = prepare_json(json_data=json_data)
+            if len(self.__string) > 0: text_data += '\n\n' + self.__string
+            text_data = text_data.strip()
+            if tokenizer == 'sapi':
+                chars = sorted(list(set(text_data)))
+                self.__vocab_size = len(chars)
+                self.__char_to_idx = {char: index for index, char in enumerate(chars)}
+                self.__idx_to_char = {index: char for index, char in enumerate(chars)}
+                self.__encode = lambda string: [self.__char_to_idx[char] for char in string]
+                self.__decode = lambda indices: ''.join([self.__idx_to_char[index] for index in indices])
+            else:
+                encode = self.__get_encoding('gpt2')
+                self.__vocab_size = encode.n_vocab
+                self.__encode = encode.encode
+                self.__decode = encode.decode
+            data = self.__tensor(self.__encode(text_data), dtype=self.__int64)
+            if validate > 0:
+                split_point = int((1-validate) * len(data))
+                train_data, validation_data = data[:split_point], data[split_point:]
+                minimum_length = min(len(train_data), len(validation_data))
+                if minimum_length >= 2:
+                    desired_block_size = int(context_window) if context_window else 500
+                    self.__block_size = max(1, min(desired_block_size, minimum_length - 1))
+                else: self.__block_size = 1
+            else:
+                train_data = data
+                data_length = len(train_data)
+                self.__block_size = max(1, min(self.__block_size, data_length - 1))
+            self.__tokenizer = tokenizer
+            train_dataset = self.__TextDatasets(train_data, self.__block_size)
+            if validate > 0: validation_dataset = self.__TextDatasets(validation_data, self.__block_size)
+            train_loader = self.__DataLoader(train_dataset, batch_size=self.__batch_size, shuffle=True)
+            if validate > 0: validation_loader = self.__DataLoader(validation_dataset, batch_size=self.__batch_size, shuffle=False)
+            self.__model = self.__Transformers(self, self.__vocab_size, self.__embedding_dim, self.__number_heads, self.__number_layers, self.dropout, self.__block_size).to(self.__device)
+            self.__optimizer = self.__optim.AdamW(self.__model.parameters(), lr=self.__learning_rate)
+            scheduler, feed_forward = self.__optim.lr_scheduler.ReduceLROnPlateau(self.__optimizer, mode='min', factor=0.5, patience=3), True
+            Nx, last_validation_loss, step, best_val_loss = 0, 1.0, 0, float('inf')
+            string_precision = f'{precision:.4f}'.ljust(5, '0')
+            formatted_string = '{desc}: {percentage:3.0f}%|{bar:10}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt:>9}]'
+            while feed_forward:
+                self.__model.train()
+                loss_item, total_train_loss = 1.0, 1.0
+                epoch = str(Nx+1).rjust(10, '0')
+                for input_batch, target_batch in train_loader:
+                    input_batch, target_batch = input_batch.to(self.__device), target_batch.to(self.__device)
+                    logits = self.__model(input_batch)
+                    loss = self.__Function.cross_entropy(logits.view(-1, logits.size(-1)), target_batch.view(-1))
+                    self.__optimizer.zero_grad()
+                    loss.backward()
+                    self.__utils.clip_grad_norm_(self.__model.parameters(), 1.0)
+                    self.__optimizer.step()
+                    loss_item = loss.item()
+                    total_train_loss += loss_item
+                last_validation_loss = validation_loss = self.__compute_loss(validation_loader) if validate > 0 else 1.0
+                training_metrics['generalization_rate'] = min((1.0, max((0.0, 1.0-validation_loss))))
+                if step > 0 and step % self.__eval_interval == 0:
+                    scheduler.step(validation_loss)
+                    if validation_loss < best_val_loss: best_val_loss = validation_loss
+                step += 1
+                current_precision = min(1.0, max(0.0, 1.0 - loss_item))
+                average_train_loss = total_train_loss / max((1, len(train_loader)))
+                if current_precision >= precision or average_train_loss <= loss_limit or Nx >= self.__epochs:
+                    training_metrics['loss'] = loss_item if current_precision >= precision else average_train_loss
+                    training_metrics['precision'] = current_precision
+                    if progress:
+                        description = f'Backpropagation epoch: {epoch} - current precision is '+f'{current_precision:.4f}'.ljust(5, '0')+f'; aiming for precision >= {string_precision} in training'
+                        self.__tqdm(train_loader, desc=description, unit='it', unit_scale=True, unit_divisor=1000, smoothing=0.1, bar_format=formatted_string).update(len(train_loader))
+                        print()
+                    break
+                elif progress:
+                    description = f'Backpropagation epoch: {epoch} - current precision is '+f'{current_precision:.4f}'.ljust(5, '0')+f'; aiming for precision >= {string_precision} in training'
+                    train_loader = self.__tqdm(train_loader, desc=description, unit='it', unit_scale=True, unit_divisor=1000, smoothing=0.1, bar_format=formatted_string)
+                Nx += 1
+            training_metrics['val_loss'] = best_val_loss if best_val_loss < 1.0 else min((1.0, max((0.0, last_validation_loss))))
+            self.__train = True
+            return training_metrics
+        except Exception as error:
+            print('ERROR in train: ' + str(error))
+            try: return training_metrics
+            except: return {'val_loss': 1.0, 'loss': 1.0, 'generalization_rate': 0.0, 'precision': 0.0}
+    def saveModel(self, model_path='', progress=True):
+        try:
+            model_path = str(model_path).strip()
+            progress = bool(progress) if type(progress) in (bool, int, float) else True
+            if self.__model is None: raise ValueError('Model is not initialized. Call train or loadModel first.')
+            self.parameters_number = sum(parameters.numel() for parameters in self.__model.parameters())
+            formatted_params = self.__format_params(self.parameters_number)
+            if len(model_path) > 0:
+                directory, file_name = self.__os_path.split(model_path)
+                if not file_name: file_name = 'model.gpt'
+                elif not file_name.endswith('.gpt'): file_name += '.gpt'
+            else: directory, file_name = str(model_path), 'model.gpt'
+            if directory and not self.__os_path.exists(directory): self.__os_makedirs(directory)
+            save_path = self.__os_path.join(directory, file_name)
+            save_dict = {
+                'tokenizer': str(self.__tokenizer).lower().strip(),
+                'embedding_dim': max((1, int(self.__embedding_dim))) if type(self.__embedding_dim) in (bool, int, float) else -1,
+                'vocab_size': max((0, int(self.__vocab_size))) if type(self.__vocab_size) in (bool, int, float) else 0,
+                'block_size': max((1, int(self.__block_size))) if type(self.__block_size) in (bool, int, float) else -1,
+                'end_tag': str(self.__end_tag) if self.__end_tag is not None else '',
+                'number_heads': max((1, int(self.__number_heads))) if type(self.__number_heads) in (bool, int, float) else -1,
+                'number_layers': max((1, int(self.__number_layers))) if type(self.__number_layers) in (bool, int, float) else -1,
+                'dropout': max((0, float(self.dropout))) if type(self.dropout) in (bool, int, float) else 0.1,
+                'parameters_number': max((0, int(self.parameters_number))) if type(self.parameters_number) in (bool, int, float) else 0,
+                'architecture_type': 'gpt_model',
+                'model_state_dict': self.__model.state_dict(),
+                'fine_tuning': [],
+                'precision': 1.0
+
+            }
+            if self.__tokenizer == 'sapi':
+                save_dict['char_to_idx'] = self.__char_to_idx if type(self.__char_to_idx) == dict else {}
+                save_dict['idx_to_char'] = self.__idx_to_char if type(self.__idx_to_char) == dict else {}
+            if progress:
+                for _ in self.__tqdm(range(10), desc=f'Saving model with {formatted_params} parameters', leave=False): self.__save(save_dict, save_path)
+            else: self.__save(save_dict, save_path)
+            return True
+        except Exception as error:
+            print('ERROR in saveModel: ' + str(error))
+            return False
+    def loadModel(self, model_path='', progress=True):
+        try:
+            model_path = str(model_path).strip()
+            progress = bool(progress) if type(progress) in (bool, int, float) else True
+            if len(model_path) > 0:
+                directory, file_name = self.__os_path.split(model_path)
+                if not file_name: file_name = 'model.gpt'
+                elif not file_name.endswith('.gpt'): file_name += '.gpt'
+            else: directory, file_name = str(model_path), 'model.gpt'
+            model_file = self.__os_path.join(directory, file_name)
+            if progress:
+                for _ in self.__tqdm(range(10), desc='Loading model', leave=False): checkpoint = self.__load(model_file, map_location=self.__device)
+            else: checkpoint = self.__load(model_file, map_location=self.__device)
+            try: self.__tokenizer = str(checkpoint['tokenizer']).lower().strip()
+            except: self.__tokenizer = 'gpt'
+            try: self.__embedding_dim = max((1, int(checkpoint['embedding_dim']))) if checkpoint['embedding_dim'] != -1 else None
+            except: self.__embedding_dim = None
+            try: self.__vocab_size = max((0, int(checkpoint['vocab_size']))) if type(checkpoint['vocab_size']) in (bool, int, float) else 0
+            except: self.__vocab_size = 0
+            try: self.__block_size = max((1, int(checkpoint['block_size']))) if checkpoint['block_size'] != -1 else None
+            except: self.__block_size = None
+            try: self.__end_tag = str(checkpoint['end_tag'])
+            except: self.__end_tag = ''
+            try: self.__number_heads = max((1, int(checkpoint['number_heads']))) if checkpoint['number_heads'] != -1 else None
+            except: self.__number_heads = None
+            try: self.__number_layers = max((1, int(checkpoint['number_layers']))) if checkpoint['number_layers'] != -1 else None
+            except: self.__number_layers = None
+            try: self.dropout = max((0, float(checkpoint['dropout']))) if type(checkpoint['dropout']) in (bool, int, float) else 0.1
+            except: self.dropout = 0.1
+            try: self.parameters_number = max((0, int(checkpoint['parameters_number']))) if type(checkpoint['parameters_number']) in (bool, int, float) else 0
+            except: self.parameters_number = 0
+            if self.__tokenizer == 'sapi':
+                try: self.__char_to_idx = dict(checkpoint['char_to_idx'])
+                except: self.__char_to_idx = {}
+                try: self.__idx_to_char = dict(checkpoint['idx_to_char'])
+                except: self.__idx_to_char = {}
+                self.__encode = lambda string: [self.__char_to_idx[char] for char in string]
+                self.__decode = lambda indexes: ''.join([self.__idx_to_char[index] for index in indexes])
+            else:
+                encode = self.__get_encoding('gpt2')
+                self.__encode = encode.encode
+                self.__decode = encode.decode
+            if len(self.__end_tag) < 1: self.__end_tag = None
+            self.__model = self.__Transformers(outer=self, vocab_size=self.__vocab_size, embedding_dim=self.__embedding_dim, number_heads=self.__number_heads, number_layers=self.__number_layers, dropout=self.dropout, block_size=self.__block_size).to(self.__device)
+            state_dict = checkpoint['model_state_dict']
+            self.__model.load_state_dict(state_dict)
+            self.__optimizer, self.__train = None, True
+            return True
+        except Exception as error:
+            print('ERROR in loadModel: ' + str(error))
+            return False
+    def addFit(self, prompt='', answer=''):
+        try:
+            prompt = str(prompt).strip()
+            answer = str(answer).strip()
+            if not self.__train:
+                if self.__end_tag is None: self.__end_tag = '<|end|>'
+                self.__string += prompt+'\n'+answer+self.__end_tag+'\n\n'
+            else:
+                if self.__model is None: raise ValueError('Model is not initialized. Call train or loadModel first.')
+                if self.__optimizer is None: self.__optimizer = self.__optim.AdamW(self.__model.parameters(), lr=self.__learning_rate)
+                if self.__end_tag is None: formatted = prompt+'\n'+answer+'\n\n'
+                else: formatted = prompt+'\n'+answer+self.__end_tag+'\n\n'
+                encoded = self.__encode(formatted)
+                if len(encoded) > self.__block_size: encoded = encoded[:self.__block_size]
+                input_tensor = self.__tensor(encoded[:-1], dtype=self.__int64).unsqueeze(0).to(self.__device)
+                target_tensor = self.__tensor(encoded[1:], dtype=self.__int64).unsqueeze(0).to(self.__device)
+                self.__model.train()
+                logits = self.__model(input_tensor)
+                loss = self.__Function.cross_entropy(logits.view(-1, logits.size(-1)), target_tensor.view(-1))
+                self.__optimizer.zero_grad()
+                loss.backward()
+                self.__utils.clip_grad_norm_(self.__model.parameters(), 1.0)
+                self.__optimizer.step()
+            return True
+        except Exception as error:
+            print('ERROR in addFit: ' + str(error))
+            return False
+    def predict(self, prompt='', max_tokens=500, temperature=0.5, top_k=0, top_p=1.0, stream=False):
+        try:
+            prompt = str(prompt).strip()
+            max_tokens = max((1, int(max_tokens))) if type(max_tokens) in (bool, int, float) else 500
+            temperature = max((0, float(temperature))) if type(temperature) in (bool, int, float) else 0.5
+            top_k = max((0, int(top_k))) if type(top_k) in (bool, int, float) else 0
+            top_p = min((1.0, max((0.0, float(top_p))))) if type(top_p) in (bool, int, float) else 1.0
+            stream = bool(stream) if type(stream) in (bool, int, float) else False
+            if self.__model is None: raise ValueError('Model is not initialized. Call train or loadModel first.')
+            if stream: return self.__generate_tokens(prompt=prompt, max_tokens=max_tokens, temperature=temperature, top_k=top_k, top_p=top_p)
+            tokens = list(self.__generate_tokens(prompt=prompt, max_tokens=max_tokens, temperature=temperature, top_k=top_k, top_p=top_p))
+            return ''.join(tokens)
+        except Exception as error:
+            print('ERROR in predict: ' + str(error))
+            return ''
+    def print_predict(self, prompt='', max_tokens=500, temperature=0.5, top_k=0, top_p=1.0, stream=False):
+        try:
+            prompt = str(prompt).strip()
+            max_tokens = max((1, int(max_tokens))) if type(max_tokens) in (bool, int, float) else 500
+            temperature = max((0, float(temperature))) if type(temperature) in (bool, int, float) else 0.5
+            top_k = max((0, int(top_k))) if type(top_k) in (bool, int, float) else 0
+            top_p = min((1.0, max((0.0, float(top_p))))) if type(top_p) in (bool, int, float) else 1.0
+            stream = bool(stream) if type(stream) in (bool, int, float) else False
+            if self.__model is None: raise ValueError('Model is not initialized. Call train or loadModel first.')
+            if stream:
+                [print(token, end='', flush=True) for token in self.__generate_tokens(prompt=prompt, max_tokens=max_tokens, temperature=temperature, top_k=top_k, top_p=top_p)]
+                print()
+            else: print(self.predict(prompt=prompt, max_tokens=max_tokens, temperature=temperature, top_k=top_k, top_p=top_p, stream=stream))
+        except Exception as error:
+            print('ERROR in print_predict: ' + str(error))
+# This is a standard code of a GPT (Generative Pre-trained Transformer) model, developed by Sapiens Technology®️,
+# which faithfully follows the mathematical structure of the article “Attention Is All You Need” for the construction of the Transformer architecture
+# used in the pattern recognition of the model that is saved. Some optimizations that do not influence the Transformer architecture
+# were applied only to facilitate the adjustments of the parameters and variables of the training, saving, loading, fine-tuning and inference of the pre-trained model.
+# --------------------------> A SAPIENS TECHNOLOGY®️ PRODUCTION <--------------------------
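The class above exposes a full train/save/load/fine-tune/infer cycle. The following is a minimal usage sketch based only on the public methods and defaults defined in gptmodel.py; the dataset path, model path, and hyperparameter values are hypothetical placeholders, not an official example shipped with the package.

    from gptmodel import GPTModel

    model = GPTModel(embedding_dim=384, block_size=128, batch_size=32, number_heads=6, number_layers=6, epochs=100)
    # train on a local .txt or .json file (or an http/https URL); 'dataset.txt' is a placeholder
    metrics = model.train(dataset_path='dataset.txt', precision=0.9, tokenizer='gpt', validate=0.1)
    print(metrics)  # dict with 'val_loss', 'loss', 'generalization_rate', 'precision'
    model.saveModel('./models/my_model.gpt')

    # later: reload the checkpoint and run inference
    model = GPTModel()
    model.loadModel('./models/my_model.gpt')
    model.addFit('What is your name?', 'My name is GPT.')  # single-pair fine-tuning step
    print(model.predict('What is your name?', max_tokens=50, temperature=0.5, top_k=50, top_p=0.9))
    model.print_predict('Hello!', stream=True)  # streams tokens as they are generated

Note that train() can also be called without a dataset_path if prompt/answer pairs were first accumulated through addFit() before training; otherwise it raises the unsupported-format error shown in the code above.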
gptmodel-1.0.0.dist-info/METADATA
ADDED
@@ -0,0 +1,14 @@
+Metadata-Version: 2.2
+Name: gptmodel
+Version: 1.0.0
+Home-page: https://github.com/
+Author: SAPIENS TECHNOLOGY
+License: Proprietary Software
+License-File: LICENSE.txt
+Requires-Dist: torch==2.4.1
+Requires-Dist: tiktoken==0.4.0
+Requires-Dist: tqdm==4.67.1
+Dynamic: author
+Dynamic: home-page
+Dynamic: license
+Dynamic: requires-dist
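The metadata pins exact dependency versions, so installing this wheel pulls those pins with it. A sketch, assuming the package is installed from a public registry with standard pip tooling (not a command from the package's own documentation):

    pip install gptmodel==1.0.0   # resolves torch==2.4.1, tiktoken==0.4.0, tqdm==4.67.1 as pinned above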
gptmodel-1.0.0.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@
+gptmodel/__init__.py,sha256=UCXG7pgytIN6ODBWxcfF5tQJkE6uMdzbvvVRDuRsSYs,24
+gptmodel/gptmodel.py,sha256=GBPnOgY547dZiI3LJfOpCHzbCSq1vzEzoQ_oeMWUxhw,33411
+gptmodel-1.0.0.dist-info/LICENSE.txt,sha256=WqB2vIA5tH5lqLTr53yT_oy1m0wYfuvCPQKxdDHWimg,115
+gptmodel-1.0.0.dist-info/METADATA,sha256=OXJF4uVDZbuVatpnD0_RJJJgR1VxS0RdT9EcO_WRPmo,328
+gptmodel-1.0.0.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+gptmodel-1.0.0.dist-info/top_level.txt,sha256=585C0QclguIkVPKKPpoeD2FxYAc5n5EAuvJNK4vMeQk,9
+gptmodel-1.0.0.dist-info/RECORD,,
gptmodel-1.0.0.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+gptmodel