sapiens-gpt 2.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
sapiens_gpt/__init__.py
ADDED
@@ -0,0 +1,11 @@
+# This is a standard code of a GPT (Generative Pre-trained Transformer) model, developed by Sapiens Technology®️,
+# which faithfully follows the mathematical structure of the article “Attention Is All You Need” for the construction of the Transformer architecture
+# used in the pattern recognition of the model that is saved. Some optimizations that do not influence the Transformer architecture
+# were applied only to facilitate the adjustments of the parameters and variables of the training, saving, loading, fine-tuning and inference of the pre-trained model.
+# --------------------------> A SAPIENS TECHNOLOGY®️ PRODUCTION <--------------------------
+from .sapiens_gpt import *
+# This is a standard code of a GPT (Generative Pre-trained Transformer) model, developed by Sapiens Technology®️,
+# which faithfully follows the mathematical structure of the article “Attention Is All You Need” for the construction of the Transformer architecture
+# used in the pattern recognition of the model that is saved. Some optimizations that do not influence the Transformer architecture
+# were applied only to facilitate the adjustments of the parameters and variables of the training, saving, loading, fine-tuning and inference of the pre-trained model.
+# --------------------------> A SAPIENS TECHNOLOGY®️ PRODUCTION <--------------------------
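
For context, the star import above re-exports everything defined in sapiens_gpt/sapiens_gpt.py (shown next), so the SapiensGPT class is available at the package's top level. A minimal import sketch, assuming the wheel has been installed with pip (the variable name `model` is illustrative, not part of the package):

from sapiens_gpt import SapiensGPT
model = SapiensGPT(show_errors=True, display_error_point=False)  # constructor arguments as defined in __init__ below
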
sapiens_gpt/sapiens_gpt.py
ADDED
@@ -0,0 +1,803 @@
+# This is a standard code of a GPT (Generative Pre-trained Transformer) model, developed by Sapiens Technology®️,
+# which faithfully follows the mathematical structure of the article “Attention Is All You Need” for the construction of the Transformer architecture
+# used in the pattern recognition of the model that is saved. Some optimizations that do not influence the Transformer architecture
+# were applied only to facilitate the adjustments of the parameters and variables of the training, saving, loading, fine-tuning and inference of the pre-trained model.
+# --------------------------> A SAPIENS TECHNOLOGY®️ PRODUCTION <--------------------------
+class SapiensGPT:
+    def __init__(self, show_errors=True, display_error_point=False):
+        try:
+            self.__show_errors = bool(show_errors) if type(show_errors) in (bool, int, float) else True
+            self.__display_error_point = bool(display_error_point) if type(display_error_point) in (bool, int, float) else False
+            try:
+                from warnings import filterwarnings
+                from logging import getLogger, ERROR, disable, CRITICAL
+                from os import environ
+                from dotenv import load_dotenv
+                filterwarnings('ignore')
+                filterwarnings('ignore', category=UserWarning, module='torch.distributed')
+                getLogger('torch.distributed.elastic.multiprocessing.redirects').setLevel(ERROR)
+                environ['DISABLE_MODEL_SOURCE_CHECK'] = 'True'
+                load_dotenv()
+                disable(CRITICAL)
+            except: pass
+            from traceback import print_exc
+            self.__print_exc = print_exc
+            self.embedding_dim = 384
+            self.block_size = 500
+            self.batch_size = 32
+            self.number_heads = 6
+            self.number_layers = 6
+            self.dropout = 0.1
+            self.learning_rate = 3e-4
+            self.eval_interval = 500
+            self.epochs = 2000
+            self.string = ''
+            self.precision = 1.0
+            self.tokenizer = 'gpt'
+            self.context_window = 500
+            self.end_tag = '<|end|>'
+            self.validate = 0.0
+            self.max_tokens = 500
+            self.temperature = 0.5
+            self.top_k = 0
+            self.top_p = 1.0
+            self.delay = 0.01
+            self.device = None
+            from torch import cuda, device, backends
+            from torch.utils.data import Dataset, DataLoader
+            from torch.nn import Module, functional as Function, utils
+            from torch import nn as artificial_neural_network, triu, ones
+            from torch import tensor, no_grad, int64, multinomial, cat, topk, where, sort, cumsum, zeros_like, bool as torch_bool, save, load
+            from tiktoken import get_encoding
+            from json import load as json_load
+            from torch import optim
+            from tqdm import tqdm
+            from os import path as os_path, makedirs as os_makedirs
+            try:
+                import torch_xla.core.xla_model as xm
+                self.__xm = xm
+            except ImportError: self.__xm = None
+            if not self.device:
+                if cuda.is_available(): self.device = device('cuda')
+                elif self.__xm is not None: self.device = self.__xm.xla_device()
+                elif backends.mps.is_available(): self.device = device('mps')
+                else: self.device = device('cpu')
+            self.__Dataset = Dataset
+            self.__Module = Module
+            self.__neural_network = artificial_neural_network
+            self.__tensor = tensor
+            self.__triu = triu
+            self.__ones = ones
+            self.__no_grad = no_grad
+            self.__Function = Function
+            self.__int64 = int64
+            self.__multinomial = multinomial
+            self.__cat = cat
+            self.__topk = topk
+            self.__where = where
+            self.__sort = sort
+            self.__cumsum = cumsum
+            self.__zeros_like = zeros_like
+            self.__bool = torch_bool
+            self.__get_encoding = get_encoding
+            self.__json_load = json_load
+            self.__DataLoader = DataLoader
+            self.__optim = optim
+            self.__utils = utils
+            self.__tqdm = tqdm
+            self.__os_path = os_path
+            self.__os_makedirs = os_makedirs
+            self.__save = save
+            self.__load = load
+            self.__model = None
+            self.__encode = None
+            self.__decode = None
+            self.__end_tag = None
+            self.__string = ''
+            self.__vocab_size = 0
+            self.__char_to_idx = {}
+            self.__idx_to_char = {}
+            self.__tokenizer = 'gpt'
+            self.__optimizer = None
+            self.__train = False
+            self.parameters_number = 0
+            class TextDataset(self.__Dataset):
+                def __init__(self, data={}, block_size=0): self.data, self.block_size = data, block_size
+                def __len__(self): return len(self.data) - self.block_size
+                def __getitem__(self, index=0):
+                    input_sequence = self.data[index:index + self.block_size]
+                    target_sequence = self.data[index + 1:index + self.block_size + 1]
+                    return input_sequence, target_sequence
+            class Transformer(self.__Module):
+                def __init__(self, outer=None, vocab_size=0, embedding_dim=0, number_heads=0, number_layers=0, dropout=None, block_size=0):
+                    super().__init__()
+                    self.outer = outer
+                    self.positional_encoding = outer._SapiensGPT__neural_network.Parameter(outer._SapiensGPT__tensor([]).new_zeros(1, block_size, embedding_dim))
+                    self.dropout = outer._SapiensGPT__neural_network.Dropout(dropout)
+                    self.input_embedding = outer._SapiensGPT__neural_network.Embedding(vocab_size, embedding_dim)
+                    self.multi_head_attention = outer._SapiensGPT__neural_network.TransformerDecoder(outer._SapiensGPT__neural_network.TransformerDecoderLayer(d_model=embedding_dim, nhead=number_heads, dropout=dropout), num_layers=number_layers)
+                    self.output_function = outer._SapiensGPT__neural_network.Linear(embedding_dim, vocab_size)
+                    self.block_size = block_size
+                def forward(self, input_tensor=[]):
+                    outer = self.outer
+                    batch_size, sequence_length = input_tensor.size()
+                    positions = self.positional_encoding[:, :sequence_length, :].to(input_tensor.device)
+                    output_embedding = self.dropout(self.input_embedding(input_tensor) + positions)
+                    transposed = output_embedding.transpose(0, 1)
+                    masked_multi_head_attention = outer._SapiensGPT__triu(outer._SapiensGPT__ones(sequence_length, sequence_length, device=input_tensor.device) * float('-inf'), diagonal=1)
+                    add_and_norm = self.multi_head_attention(transposed, transposed, tgt_mask=masked_multi_head_attention)
+                    add_and_norm = add_and_norm.transpose(0, 1)
+                    return self.output_function(add_and_norm)
+            self.__TextDatasets = TextDataset
+            self.__Transformers = Transformer
+        except Exception as error:
+            try:
+                if self.__show_errors:
+                    error_message = 'ERROR in SapiensGPT.__init__: '+str(error)
+                    print(error_message)
+                    try: self.__print_exc() if self.__display_error_point else None
+                    except: pass
+            except: pass
+    def __compute_loss(self, loader=[]):
+        try:
+            self.__model.eval()
+            total_loss = 0
+            with self.__no_grad():
+                for input_batch, target_batch in loader:
+                    input_batch, target_batch = input_batch.to(self.device), target_batch.to(self.device)
+                    logits = self.__model(input_batch)
+                    loss = self.__Function.cross_entropy(logits.view(-1, logits.size(-1)), target_batch.view(-1))
+                    total_loss += loss.item()
+            return total_loss / len(loader)
+        except Exception as error:
+            try:
+                if self.__show_errors:
+                    error_message = 'ERROR in SapiensGPT.__compute_loss: '+str(error)
+                    print(error_message)
+                    try: self.__print_exc() if self.__display_error_point else None
+                    except: pass
+            except: pass
+            return 0
+    def __format_params(self, number_params=0):
+        try:
+            if number_params < 1_000: return f'{number_params}U'
+            elif number_params < 1_000_000: return f'{number_params // 1_000}K'
+            elif number_params < 1_000_000_000: return f'{number_params // 1_000_000}M'
+            elif number_params < 1_000_000_000_000: return f'{number_params // 1_000_000_000}B'
+            else: return f'{number_params // 1_000_000_000_000}T'
+        except Exception as error:
+            try:
+                if self.__show_errors:
+                    error_message = 'ERROR in SapiensGPT.__format_params: '+str(error)
+                    print(error_message)
+                    try: self.__print_exc() if self.__display_error_point else None
+                    except: pass
+            except: pass
+            return f'{number_params}U'
+    def __get_found_end_tag(self, decoded_token='', decoded_tokens='', limits=[]):
+        try:
+            if self.__end_tag is None: return False
+            decoded_token, decoded_tokens, limits = str(decoded_token).strip(), str(decoded_tokens).strip(), list(limits)
+            for limit in ['']+limits+[' ']:
+                if decoded_token.endswith(limit+self.__end_tag) or decoded_tokens.endswith(limit+self.__end_tag): return True
+                elif decoded_token.endswith(limit+self.__end_tag[0]) or decoded_tokens.endswith(limit+self.__end_tag[0]): return True
+            return False
+        except Exception as error:
+            try:
+                if self.__show_errors:
+                    error_message = 'ERROR in SapiensGPT.__get_found_end_tag: '+str(error)
+                    print(error_message)
+                    try: self.__print_exc() if self.__display_error_point else None
+                    except: pass
+            except: pass
+            return False
+    def __generate_tokens_x(self, prompt='', max_tokens=500, temperature=1.0):
+        try:
+            self.__model.eval()
+            encoded_prompt = self.__encode(prompt)
+            input_tensor = self.__tensor(encoded_prompt, dtype=self.__int64).unsqueeze(0).to(self.device)
+            limits = ('.', '\n', '!', '?', ';')
+            with self.__no_grad():
+                tokens_generated, decoded_tokens = 0, ''
+                while True:
+                    conditioned_input = input_tensor[:, -self.block_size:] if input_tensor.size(1) > self.block_size else input_tensor
+                    logits = self.__model(conditioned_input)
+                    logits = logits[:, -1, :] / temperature
+                    output_probabilities = self.__Function.softmax(logits, dim=-1)
+                    shifted_right = self.__multinomial(output_probabilities, num_samples=1)
+                    input_tensor = self.__cat((input_tensor, shifted_right), dim=1)
+                    token = shifted_right.item()
+                    decoded_token, found_end_tag = self.__decode([token]), False
+                    if tokens_generated == 0 and '\n' in decoded_token: continue
+                    tokens_generated += 1
+                    decoded_tokens += decoded_token
+                    found_end_tag = self.__get_found_end_tag(decoded_token=decoded_token, decoded_tokens=decoded_tokens, limits=limits)
+                    if found_end_tag and decoded_token.endswith(self.__end_tag[0]): decoded_token = decoded_token[:-1]
+                    yield decoded_token
+                    if found_end_tag or ((tokens_generated >= max_tokens) and (decoded_token[-1] in limits)) or (tokens_generated >= (max_tokens*2)): break
+        except Exception as error:
+            try:
+                if self.__show_errors:
+                    error_message = 'ERROR in SapiensGPT.__generate_tokens_x: '+str(error)
+                    print(error_message)
+                    try: self.__print_exc() if self.__display_error_point else None
+                    except: pass
+            except: pass
+            return ''
+    def __generate_tokens_y(self, prompt='', max_tokens=500, temperature=1.0, top_k=50, top_p=0.9):
+        try:
+            self.__model.eval()
+            encoded_prompt = self.__encode(prompt)
+            input_tensor = self.__tensor(encoded_prompt, dtype=self.__int64).unsqueeze(0).to(self.device)
+            limits = ('.', '\n', '!', '?', ';')
+            with self.__no_grad():
+                tokens_generated, decoded_tokens = 0, ''
+                while True:
+                    conditioned_input = (input_tensor[:, -self.block_size:] if input_tensor.size(1) > self.block_size else input_tensor)
+                    logits = self.__model(conditioned_input)
+                    logits = logits[:, -1, :] / temperature
+                    if top_k > 0:
+                        top_k = min(top_k, logits.size(-1))
+                        value, _ = self.__topk(logits, top_k)
+                        thresh = value[:, -1].unsqueeze(-1)
+                        logits = self.__where(logits < thresh, self.__tensor(float('-inf')).to(logits), logits)
+                    if top_p < 1.0:
+                        sorted_logits, sorted_index = self.__sort(logits, dim=-1, descending=True)
+                        sorted_probabilities = self.__Function.softmax(sorted_logits, dim=-1)
+                        cumulative_probabilities = self.__cumsum(sorted_probabilities, dim=-1)
+                        sorted_mask = cumulative_probabilities > top_p
+                        sorted_mask[:, 0] = False
+                        mask = self.__zeros_like(logits, dtype=self.__bool)
+                        mask.scatter_(-1, sorted_index, sorted_mask)
+                        logits = logits.masked_fill(mask, float('-inf'))
+                    output_probabilities = self.__Function.softmax(logits, dim=-1)
+                    shifted_right = self.__multinomial(output_probabilities, num_samples=1)
+                    input_tensor = self.__cat((input_tensor, shifted_right), dim=1)
+                    token = shifted_right.item()
+                    decoded_token, found_end_tag = self.__decode([token]), False
+                    if tokens_generated == 0 and '\n' in decoded_token: continue
+                    tokens_generated += 1
+                    decoded_tokens += decoded_token
+                    found_end_tag = self.__get_found_end_tag(decoded_token=decoded_token, decoded_tokens=decoded_tokens, limits=limits)
+                    if found_end_tag and decoded_token.endswith(self.__end_tag[0]): decoded_token = decoded_token[:-1]
+                    yield decoded_token
+                    if found_end_tag or ((tokens_generated >= max_tokens) and (decoded_token[-1] in limits)) or (tokens_generated >= (max_tokens*2)): break
+        except Exception as error:
+            try:
+                if self.__show_errors:
+                    error_message = 'ERROR in SapiensGPT.__generate_tokens_y: '+str(error)
+                    print(error_message)
+                    try: self.__print_exc() if self.__display_error_point else None
+                    except: pass
+            except: pass
+            return ''
+    def __generate_tokens(self, prompt='', max_tokens=500, temperature=1.0, top_k=0, top_p=1.0):
+        try:
+            prompt = '?' if len(str(prompt).strip()) < 1 else str(prompt).strip()
+            def get_last_n_tokens(text='', n=0):
+                if self.__tokenizer == 'sapi': return text[-n:]
+                else:
+                    encoding = self.__get_encoding('gpt2')
+                    tokens = encoding.encode(text)
+                    last_n_tokens = tokens[-n:]
+                    truncated_text = encoding.decode(last_n_tokens)
+                    return truncated_text
+            prompt = get_last_n_tokens(text=prompt, n=self.block_size)
+            if top_k > 0 or top_p < 1.0: return self.__generate_tokens_y(prompt=prompt, max_tokens=max_tokens, temperature=temperature, top_k=top_k, top_p=top_p)
+            else: return self.__generate_tokens_x(prompt=prompt, max_tokens=max_tokens, temperature=temperature)
+        except Exception as error:
+            try:
+                if self.__show_errors:
+                    error_message = 'ERROR in SapiensGPT.__generate_tokens: '+str(error)
+                    print(error_message)
+                    try: self.__print_exc() if self.__display_error_point else None
+                    except: pass
+            except: pass
+            return ''
+    def __addFit(self, prompt='', answer=''):
+        try:
+            prompt = str(prompt).strip()
+            answer = str(answer).strip()
+            if not self.__train:
+                if self.__end_tag is None: self.__end_tag = '<|end|>'
+                self.__string += prompt+'\n'+answer+self.__end_tag+'\n\n'
+            else:
+                if self.__model is None: raise ValueError('Model is not initialized. Call train or loadModel first.')
+                if self.__optimizer is None: self.__optimizer = self.__optim.AdamW(self.__model.parameters(), lr=self.learning_rate)
+                if self.__end_tag is None: formatted = prompt+'\n'+answer+'\n\n'
+                else: formatted = prompt+'\n'+answer+self.__end_tag+'\n\n'
+                encoded = self.__encode(formatted)
+                if len(encoded) > self.block_size: encoded = encoded[:self.block_size]
+                input_tensor = self.__tensor(encoded[:-1], dtype=self.__int64).unsqueeze(0).to(self.device)
+                target_tensor = self.__tensor(encoded[1:], dtype=self.__int64).unsqueeze(0).to(self.device)
+                self.__model.train()
+                logits = self.__model(input_tensor)
+                loss = self.__Function.cross_entropy(logits.view(-1, logits.size(-1)), target_tensor.view(-1))
+                self.__optimizer.zero_grad()
+                loss.backward()
+                self.__utils.clip_grad_norm_(self.__model.parameters(), 1.0)
+                if self.__xm is not None or str(self.device).lower().strip() == 'tpu': self.device = self.__xm.xla_device()
+                if self.__xm is not None: self.__xm.optimizer_step(self.__optimizer)
+                else: self.__optimizer.step()
+            return True
+        except Exception as error:
+            try:
+                if self.__show_errors:
+                    error_message = 'ERROR in SapiensGPT.__addFit: '+str(error)
+                    print(error_message)
+                    try: self.__print_exc() if self.__display_error_point else None
+                    except: pass
+            except: pass
+            return False
+    def train(self, dataset_path='', progress=True):
+        try:
+            training_metrics = {'val_loss': 0.0, 'loss': 0.0, 'generalization_rate': 0.0, 'precision': 0.0}
+            if self.__train: return training_metrics
+            dataset_path = str(dataset_path).strip()
+            string = str(self.string).strip()
+            precision = min((1.0, max((0.0, float(self.precision))))) if type(self.precision) in (bool, int, float) else 1.0
+            tokenizer = str(self.tokenizer).lower().strip()
+            self.block_size = max((1, int(self.context_window))) if type(self.context_window) in (bool, int, float) else 500
+            if self.end_tag is not None and self.__end_tag is None: self.__end_tag = str(self.end_tag)
+            validate = min((1.0, max((0.0, float(self.validate))))) if type(self.validate) in (bool, int, float) else 0.0
+            progress = bool(progress) if type(progress) in (bool, int, float) else True
+            self.embedding_dim = max((1, int(self.embedding_dim))) if type(self.embedding_dim) in (bool, int, float) else 384
+            self.block_size = max((1, int(self.block_size))) if type(self.block_size) in (bool, int, float) else 500
+            self.batch_size = max((1, int(self.batch_size))) if type(self.batch_size) in (bool, int, float) else 32
+            self.number_heads = max((1, int(self.number_heads))) if type(self.number_heads) in (bool, int, float) else 6
+            self.number_layers = max((1, int(self.number_layers))) if type(self.number_layers) in (bool, int, float) else 6
+            self.dropout = max((0, float(self.dropout))) if type(self.dropout) in (bool, int, float) else 0.1
+            self.learning_rate = max((0, float(self.learning_rate))) if type(self.learning_rate) in (bool, int, float) else 3e-4
+            self.eval_interval = max((1, int(self.eval_interval))) if type(self.eval_interval) in (bool, int, float) else 500
+            self.epochs = max((1, int(self.epochs))) if type(self.epochs) in (bool, int, float) else 2000
+            if tokenizer not in ('sapi', 'gpt'): tokenizer = 'gpt'
+            self.__string = str(self.__string+'\n\n'+string).strip()
+            loss_limit = min(1.0, max(0.0, 1.0 - precision))
+            is_txt, is_json, text_data = dataset_path.endswith('.txt'), dataset_path.endswith('.json'), ''
+            def prepare_json(json_data={}):
+                if type(json_data) == dict: pairs = json_data[list(json_data.keys())[0]]
+                else: pairs = json_data
+                if self.__end_tag is None: self.__end_tag = '<|end|>'
+                return '\n\n'.join([str(pair[list(pair.keys())[0]]+'\n'+pair[list(pair.keys())[1]]).replace(self.__end_tag, '').strip()+self.__end_tag for pair in pairs])
+            def is_web_address(url_path=''):
+                url_path = str(url_path).lower().strip()
+                return url_path.startswith('https://') or url_path.startswith('http://') or url_path.startswith('www.')
+            _is_web_address = is_web_address(url_path=dataset_path)
+            if _is_web_address:
+                is_json = True if '.json' in dataset_path.lower() else False
+                def read_remote_file(url_path=''):
+                    from urllib.request import urlopen
+                    with urlopen(url_path) as response: return str(response.read().decode('utf-8', errors='replace').replace('\r\n', '\n').replace('\r', '\n')).strip()
+                text_data = read_remote_file(url_path=dataset_path)
+                if is_json:
+                    def load_json(string_content=''):
+                        json_content = {}
+                        string_content = str(string_content)
+                        try:
+                            from json import loads
+                            json_content = loads(string_content)
+                        except:
+                            from ast import literal_eval
+                            json_content = literal_eval(string_content)
+                        return json_content
+                    json_data = load_json(string_content=text_data)
+                    text_data = prepare_json(json_data=json_data)
+            else:
+                if not is_txt and not is_json and len(self.__string) < 1: raise ValueError('Unsupported file format. Use .txt or .json.')
+                if is_txt:
+                    with open(dataset_path, 'r', encoding='utf-8') as file: text_data = str(file.read()).strip()
+                elif is_json:
+                    with open(dataset_path, 'r', encoding='utf-8') as file: json_data = self.__json_load(file)
+                    text_data = prepare_json(json_data=json_data)
+            if len(self.__string) > 0: text_data += '\n\n' + self.__string
+            text_data = text_data.strip()
+            if tokenizer == 'sapi':
+                chars = sorted(list(set(text_data)))
+                self.__vocab_size = len(chars)
+                self.__char_to_idx = {char: index for index, char in enumerate(chars)}
+                self.__idx_to_char = {index: char for index, char in enumerate(chars)}
+                self.__encode = lambda string: [self.__char_to_idx[char] for char in string]
+                self.__decode = lambda indices: ''.join([self.__idx_to_char[index] for index in indices])
+            else:
+                encode = self.__get_encoding('gpt2')
+                self.__vocab_size = encode.n_vocab
+                self.__encode = encode.encode
+                self.__decode = encode.decode
+            data = self.__tensor(self.__encode(text_data), dtype=self.__int64)
+            if validate > 0:
+                split_point = int((1-validate) * len(data))
+                train_data, validation_data = data[:split_point], data[split_point:]
+                minimum_length = min(len(train_data), len(validation_data))
+                if minimum_length >= 2:
+                    desired_block_size = int(self.context_window) if self.context_window else 500
+                    self.block_size = max(1, min(desired_block_size, minimum_length - 1))
+                else: self.block_size = 1
+            else:
+                train_data = data
+                data_length = len(train_data)
+                self.block_size = max(1, min(self.block_size, data_length - 1))
+            self.__tokenizer = tokenizer
+            train_dataset = self.__TextDatasets(train_data, self.block_size)
+            if validate > 0: validation_dataset = self.__TextDatasets(validation_data, self.block_size)
+            train_loader = self.__DataLoader(train_dataset, batch_size=self.batch_size, shuffle=True)
+            if validate > 0: validation_loader = self.__DataLoader(validation_dataset, batch_size=self.batch_size, shuffle=False)
+            self.__model = self.__Transformers(self, self.__vocab_size, self.embedding_dim, self.number_heads, self.number_layers, self.dropout, self.block_size).to(self.device)
+            self.__optimizer = self.__optim.AdamW(self.__model.parameters(), lr=self.learning_rate)
+            scheduler, feed_forward = self.__optim.lr_scheduler.ReduceLROnPlateau(self.__optimizer, mode='min', factor=0.5, patience=3), True
+            Nx, last_validation_loss, step, best_val_loss = 0, 1.0, 0, float('inf')
+            string_precision = f'{precision:.4f}'.ljust(5, '0')
+            formatted_string = '{desc}: {percentage:3.0f}%|{bar:10}| {n_fmt}/{total_fmt} [{elapsed}<{remaining}, {rate_fmt:>9}]'
+            if self.__xm is not None or str(self.device).lower().strip() == 'tpu': self.device = self.__xm.xla_device()
+            while feed_forward:
+                self.__model.train()
+                loss_item, total_train_loss = 1.0, 1.0
+                epoch = str(Nx+1).rjust(10, '0')
+                for input_batch, target_batch in train_loader:
+                    input_batch, target_batch = input_batch.to(self.device), target_batch.to(self.device)
+                    logits = self.__model(input_batch)
+                    loss = self.__Function.cross_entropy(logits.view(-1, logits.size(-1)), target_batch.view(-1))
+                    self.__optimizer.zero_grad()
+                    loss.backward()
+                    self.__utils.clip_grad_norm_(self.__model.parameters(), 1.0)
+                    if self.__xm is not None: self.__xm.optimizer_step(self.__optimizer)
+                    else: self.__optimizer.step()
+                    loss_item = loss.item()
+                    total_train_loss += loss_item
+                last_validation_loss = validation_loss = self.__compute_loss(validation_loader) if validate > 0 else 1.0
+                training_metrics['generalization_rate'] = min((1.0, max((0.0, 1.0-validation_loss))))
+                if step > 0 and step % self.eval_interval == 0:
+                    scheduler.step(validation_loss)
+                    if validation_loss < best_val_loss: best_val_loss = validation_loss
+                step += 1
+                current_precision = min(1.0, max(0.0, 1.0 - loss_item))
+                average_train_loss = total_train_loss / max((1, len(train_loader)))
+                if current_precision >= precision or average_train_loss <= loss_limit or Nx >= self.epochs:
+                    training_metrics['loss'] = loss_item if current_precision >= precision else average_train_loss
+                    training_metrics['precision'] = current_precision
+                    if progress:
+                        description = f'Finalization of backpropagations... current precision is '+f'{current_precision:.4f}'.ljust(5, '0')+f'; aiming for precision >= {string_precision} in training'
+                        self.__tqdm(train_loader, desc=description, unit='it', unit_scale=True, unit_divisor=1000, smoothing=0.1, bar_format=formatted_string).update(len(train_loader))
+                        print()
+                    break
+                elif progress:
+                    description = f'Backpropagation epoch: {epoch} - current precision is '+f'{current_precision:.4f}'.ljust(5, '0')+f'; aiming for precision >= {string_precision} in training'
+                    train_loader = self.__tqdm(train_loader, desc=description, unit='it', unit_scale=True, unit_divisor=1000, smoothing=0.1, bar_format=formatted_string)
+                Nx += 1
+            training_metrics['val_loss'] = best_val_loss if best_val_loss < 1.0 else min((1.0, max((0.0, last_validation_loss))))
+            self.__train = True
+            return training_metrics
+        except Exception as error:
+            try:
+                if self.__show_errors:
+                    error_message = 'ERROR in SapiensGPT.train: '+str(error)
+                    print(error_message)
+                    try: self.__print_exc() if self.__display_error_point else None
+                    except: pass
+            except: pass
+            try: return training_metrics
+            except: return {'val_loss': 1.0, 'loss': 1.0, 'generalization_rate': 0.0, 'precision': 0.0}
+    def saveModel(self, model_path='', progress=True):
+        try:
+            model_path = str(model_path).strip()
+            progress = bool(progress) if type(progress) in (bool, int, float) else True
+            if self.__model is None: raise ValueError('Model is not initialized. Call train or loadModel first.')
+            self.parameters_number = sum(parameters.numel() for parameters in self.__model.parameters())
+            formatted_params = self.__format_params(self.parameters_number)
+            if len(model_path) > 0:
+                directory, file_name = self.__os_path.split(model_path)
+                if not file_name: file_name = 'model.gpt'
+                elif not file_name.endswith('.gpt'): file_name += '.gpt'
+            else: directory, file_name = str(model_path), 'model.gpt'
+            if directory and not self.__os_path.exists(directory): self.__os_makedirs(directory)
+            save_path = self.__os_path.join(directory, file_name)
+            save_dict = {
+                'tokenizer': str(self.__tokenizer).lower().strip(),
+                'embedding_dim': max((1, int(self.embedding_dim))) if type(self.embedding_dim) in (bool, int, float) else -1,
+                'vocab_size': max((0, int(self.__vocab_size))) if type(self.__vocab_size) in (bool, int, float) else 0,
+                'block_size': max((1, int(self.block_size))) if type(self.block_size) in (bool, int, float) else -1,
+                'end_tag': str(self.__end_tag) if self.__end_tag is not None else '',
+                'number_heads': max((1, int(self.number_heads))) if type(self.number_heads) in (bool, int, float) else -1,
+                'number_layers': max((1, int(self.number_layers))) if type(self.number_layers) in (bool, int, float) else -1,
+                'dropout': max((0.0, float(self.dropout))) if type(self.dropout) in (bool, int, float) else 0.1,
+                'parameters_number': max((0, int(self.parameters_number))) if type(self.parameters_number) in (bool, int, float) else 0,
+                'architecture_type': 'gpt_model',
+                'model_state_dict': self.__model.state_dict(),
+                'fine_tuning': [],
+                'precision': 1.0
+            }
+            if self.__tokenizer == 'sapi':
+                save_dict['char_to_idx'] = self.__char_to_idx if type(self.__char_to_idx) == dict else {}
+                save_dict['idx_to_char'] = self.__idx_to_char if type(self.__idx_to_char) == dict else {}
+            if progress:
+                for _ in self.__tqdm(range(10), desc=f'Saving model with {formatted_params} parameters', leave=False): self.__save(save_dict, save_path)
+            else: self.__save(save_dict, save_path)
+            return True
+        except Exception as error:
+            try:
+                if self.__show_errors:
+                    error_message = 'ERROR in SapiensGPT.saveModel: '+str(error)
+                    print(error_message)
+                    try: self.__print_exc() if self.__display_error_point else None
+                    except: pass
+            except: pass
+            return False
+    def loadModel(self, model_path='', progress=True):
+        try:
+            model_path = str(model_path).strip()
+            progress = bool(progress) if type(progress) in (bool, int, float) else True
+            if len(model_path) > 0:
+                directory, file_name = self.__os_path.split(model_path)
+                if not file_name: file_name = 'model.gpt'
+                elif not file_name.endswith('.gpt'): file_name += '.gpt'
+            else: directory, file_name = str(model_path), 'model.gpt'
+            model_file = self.__os_path.join(directory, file_name)
+            if progress:
+                for _ in self.__tqdm(range(10), desc='Loading model', leave=False):
+                    try: checkpoint = self.__load(model_file, map_location=self.device)
+                    except: checkpoint = self.__load(model_file)
+            else:
+                try: checkpoint = self.__load(model_file, map_location=self.device)
+                except: checkpoint = self.__load(model_file)
+            try: self.__tokenizer = str(checkpoint['tokenizer']).lower().strip()
+            except: self.__tokenizer = 'gpt'
+            try: self.embedding_dim = max((1, int(checkpoint['embedding_dim']))) if checkpoint['embedding_dim'] != -1 else None
+            except: self.embedding_dim = None
+            try: self.__vocab_size = max((0, int(checkpoint['vocab_size']))) if type(checkpoint['vocab_size']) in (bool, int, float) else 0
+            except: self.__vocab_size = 0
+            try: self.block_size = max((1, int(checkpoint['block_size']))) if checkpoint['block_size'] != -1 else None
+            except: self.block_size = None
+            try: self.__end_tag = str(checkpoint['end_tag'])
+            except: self.__end_tag = ''
+            try: self.number_heads = max((1, int(checkpoint['number_heads']))) if checkpoint['number_heads'] != -1 else None
+            except: self.number_heads = None
+            try: self.number_layers = max((1, int(checkpoint['number_layers']))) if checkpoint['number_layers'] != -1 else None
+            except: self.number_layers = None
+            try: self.dropout = max((0, float(checkpoint['dropout']))) if type(checkpoint['dropout']) in (bool, int, float) else 0.1
+            except: self.dropout = 0.1
+            try: self.parameters_number = max((0, int(checkpoint['parameters_number']))) if type(checkpoint['parameters_number']) in (bool, int, float) else 0
+            except: self.parameters_number = 0
+            if self.__tokenizer == 'sapi':
+                try: self.__char_to_idx = dict(checkpoint['char_to_idx'])
+                except: self.__char_to_idx = {}
+                try: self.__idx_to_char = dict(checkpoint['idx_to_char'])
+                except: self.__idx_to_char = {}
+                self.__encode = lambda string: [self.__char_to_idx[char] for char in string]
+                self.__decode = lambda indexes: ''.join([self.__idx_to_char[index] for index in indexes])
+            else:
+                encode = self.__get_encoding('gpt2')
+                self.__encode = encode.encode
+                self.__decode = encode.decode
+            if len(self.__end_tag) < 1: self.__end_tag = None
+            self.__model = self.__Transformers(outer=self, vocab_size=self.__vocab_size, embedding_dim=self.embedding_dim, number_heads=self.number_heads, number_layers=self.number_layers, dropout=self.dropout, block_size=self.block_size).to(self.device)
+            state_dict = checkpoint['model_state_dict']
+            self.__model.load_state_dict(state_dict)
+            self.__optimizer, self.__train = None, True
+            return True
+        except Exception as error:
+            try:
+                if self.__show_errors:
+                    error_message = 'ERROR in SapiensGPT.loadModel: '+str(error)
+                    print(error_message)
+                    try: self.__print_exc() if self.__display_error_point else None
+                    except: pass
+            except: pass
+            return False
+    def fineTuning(self, dataset_path='', progress=True):
+        try:
+            adjusted_model = False
+            dataset_path = str(dataset_path).strip()
+            progress = bool(progress) if type(progress) in (bool, int, float) else True
+            def _read_remote_file(remote_path=''):
+                remote_path = str(remote_path).strip()
+                from urllib.request import urlopen
+                try:
+                    from os import environ
+                    from certifi import where
+                    environ['SSL_CERT_FILE'] = where()
+                    from logging import getLogger, ERROR
+                    getLogger('urlopen').setLevel(ERROR)
+                except: pass
+                remote_stream = urlopen(remote_path)
+                content = remote_stream.read().decode('utf-8')
+                return str(content).strip()
+            is_json, string_content = dataset_path.lower().endswith('.json'), ''
+            if dataset_path.startswith(('https://', 'http://')): string_content = _read_remote_file(remote_path=dataset_path)
+            else:
+                with open(dataset_path, 'r', encoding='utf-8') as file_object: string_content = str(file_object.read()).strip()
+            if is_json:
+                fit_structure, data = [], []
+                if string_content:
+                    def _string_to_dictionary_or_json(string=''):
+                        dictionary_or_json = {}
+                        original_string = string
+                        string = str(string).strip()
+                        try:
+                            from json import loads
+                            try: dictionary_or_json = loads(string)
+                            except: dictionary_or_json = loads(original_string)
+                        except:
+                            from ast import literal_eval
+                            try: dictionary_or_json = literal_eval(string)
+                            except: dictionary_or_json = literal_eval(original_string)
+                        return dictionary_or_json
+                    fit_structure = _string_to_dictionary_or_json(string=string_content)
+                if fit_structure:
+                    if type(fit_structure) == dict:
+                        json_keys = list(fit_structure.keys())
+                        if 'data' in json_keys: data = list(fit_structure.get('data', []))
+                        else:
+                            data_key = str(json_keys[0]).strip()
+                            data = list(fit_structure.get(data_key, []))
+                    elif type(fit_structure) in (tuple, list): data = fit_structure
+                if data:
+                    from tqdm import tqdm
+                    total_length = len(data)
+                    with tqdm(total=total_length, unit='item', disable=not progress) as progress_bar:
+                        for input_output in data:
+                            _input, _output, _file_path = '', '', ''
+                            if input_output and type(input_output) == dict:
+                                json_keys = list(input_output.keys())
+                                if 'input' in json_keys: _input = str(input_output.get('input', '')).strip()
+                                elif 'Input' in json_keys: _input = str(input_output.get('Input', '')).strip()
+                                elif 'INPUT' in json_keys: _input = str(input_output.get('INPUT', '')).strip()
+                                elif 'question' in json_keys: _input = str(input_output.get('question', '')).strip()
+                                elif 'Question' in json_keys: _input = str(input_output.get('Question', '')).strip()
+                                elif 'QUESTION' in json_keys: _input = str(input_output.get('QUESTION', '')).strip()
+                                elif 'prompt' in json_keys: _input = str(input_output.get('prompt', '')).strip()
+                                elif 'Prompt' in json_keys: _input = str(input_output.get('Prompt', '')).strip()
+                                elif 'PROMPT' in json_keys: _input = str(input_output.get('PROMPT', '')).strip()
+                                if 'output' in json_keys: _output = str(input_output.get('output', '')).strip()
+                                elif 'Output' in json_keys: _output = str(input_output.get('Output', '')).strip()
+                                elif 'OUTPUT' in json_keys: _output = str(input_output.get('OUTPUT', '')).strip()
+                                elif 'answer' in json_keys: _output = str(input_output.get('answer', '')).strip()
+                                elif 'Answer' in json_keys: _output = str(input_output.get('Answer', '')).strip()
+                                elif 'ANSWER' in json_keys: _output = str(input_output.get('ANSWER', '')).strip()
+                                elif 'response' in json_keys: _output = str(input_output.get('response', '')).strip()
+                                elif 'Response' in json_keys: _output = str(input_output.get('Response', '')).strip()
+                                elif 'RESPONSE' in json_keys: _output = str(input_output.get('RESPONSE', '')).strip()
+                                if 'file_path' in json_keys: _file_path = str(input_output.get('file_path', '')).strip()
+                                elif 'File_path' in json_keys: _file_path = str(input_output.get('File_path', '')).strip()
+                                elif 'FILE_PATH' in json_keys: _file_path = str(input_output.get('FILE_PATH', '')).strip()
+                                if not _input: _input = str(input_output[json_keys[0]]).strip()
+                                if not _output: _output = str(input_output[json_keys[1]]).strip()
+                                if _input and _output: self.__addFit(prompt=_input, answer=_output)
+                            if progress:
+                                progress_bar.set_description('Adjusting')
+                                progress_bar.update(1)
+                    adjusted_model = total_length > 0
+            elif self.__show_errors:
+                print('The file must be a JSON with a "data" key containing an array of objects with the keys "input" and "output".')
+                adjusted_model = False
+            return adjusted_model
+        except Exception as error:
+            try:
+                if self.__show_errors:
+                    error_message = 'ERROR in SapiensGPT.fineTuning: '+str(error)
+                    print(error_message)
+                    try: self.__print_exc() if self.__display_error_point else None
+                    except: pass
+            except: pass
+            return False
+    def inference(self, prompt='', stream=True):
+        try:
+            prompt = str(prompt).strip()
+            max_tokens = max((1, int(self.max_tokens))) if type(self.max_tokens) in (bool, int, float) else 500
+            temperature = max((0, float(self.temperature))) if type(self.temperature) in (bool, int, float) else 0.5
+            top_k = max((0, int(self.top_k))) if type(self.top_k) in (bool, int, float) else 0
+            top_p = min((1.0, max((0.0, float(self.top_p))))) if type(self.top_p) in (bool, int, float) else 1.0
+            stream = bool(stream) if type(stream) in (bool, int, float) else False
+            if self.__model is None: raise ValueError('Model is not initialized. Call train or loadModel first.')
+            if stream: return self.__generate_tokens(prompt=prompt, max_tokens=max_tokens, temperature=temperature, top_k=top_k, top_p=top_p)
+            tokens = list(self.__generate_tokens(prompt=prompt, max_tokens=max_tokens, temperature=temperature, top_k=top_k, top_p=top_p))
+            return ''.join(tokens)
+        except Exception as error:
+            try:
+                if self.__show_errors:
+                    error_message = 'ERROR in SapiensGPT.inference: '+str(error)
+                    print(error_message)
+                    try: self.__print_exc() if self.__display_error_point else None
+                    except: pass
+            except: pass
+            return ''
+    def completeMessages(self, messages=[], stream=False):
+        try:
+            complete_messages = {'answer': '', 'messages': [], 'next_token': ''}
+            messages = list(messages) if type(messages) in (tuple, list, dict) else []
+            stream = bool(stream) if type(stream) in (bool, int, float) else False
+            complete_messages['messages'] = messages
+            if messages:
+                prompt_template = ''
+                end_tag = self.end_tag if self.end_tag else ''
+                for message in messages:
+                    role = str(message.get('role', '')).strip()
+                    content = str(message.get('content', '')).strip()
+                    if role and content: prompt_template += f'{role}:\n{content}{end_tag}\n'
+                prompt_template = prompt_template.strip()
+                def _get_stream(prompt_template='', complete_messages={}):
+                    current_answer, message = '', {'role': 'assistant', 'content': ''}
+                    complete_messages['messages'].append(message)
+                    inference_function = self.inference(prompt=prompt_template, stream=True)
+                    for token in inference_function:
+                        current_answer += token
+                        complete_messages['answer'] = current_answer
+                        complete_messages['messages'][-1]['content'] = current_answer
+                        complete_messages['next_token'] = token
+                        yield complete_messages
+                def _get_string(prompt_template='', complete_messages={}):
+                    inference_function = self.inference(prompt=prompt_template, stream=False)
+                    complete_messages['answer'] = inference_function
+                    message = {'role': 'assistant', 'content': inference_function}
+                    complete_messages['messages'].append(message)
+                    token = inference_function.split(chr(32))[-1].rstrip()
+                    complete_messages['next_token'] = token
+                    return complete_messages
+                if stream: complete_messages = _get_stream(prompt_template=prompt_template, complete_messages=complete_messages)
+                else: complete_messages = _get_string(prompt_template=prompt_template, complete_messages=complete_messages)
+            return complete_messages
+        except Exception as error:
+            try:
+                if self.__show_errors:
+                    error_message = 'ERROR in SapiensGPT.completeMessages: '+str(error)
+                    print(error_message)
+                    try: self.__print_exc() if self.__display_error_point else None
+                    except: pass
+            except: pass
+            return {'answer': '', 'messages': [], 'next_token': ''}
+    def printInference(self, prompt='', stream=True):
+        try:
+            prompt = str(prompt).strip()
+            max_tokens = max((1, int(self.max_tokens))) if type(self.max_tokens) in (bool, int, float) else 500
+            temperature = max((0, float(self.temperature))) if type(self.temperature) in (bool, int, float) else 0.5
+            top_k = max((0, int(self.top_k))) if type(self.top_k) in (bool, int, float) else 0
+            top_p = min((1.0, max((0.0, float(self.top_p))))) if type(self.top_p) in (bool, int, float) else 1.0
+            stream = bool(stream) if type(stream) in (bool, int, float) else False
+            if self.__model is None: raise ValueError('Model is not initialized. Call train or loadModel first.')
+            if stream:
+                [print(token, end='', flush=True) for token in self.__generate_tokens(prompt=prompt, max_tokens=max_tokens, temperature=temperature, top_k=top_k, top_p=top_p)]
+                print()
+            else: print(self.inference(prompt=prompt, stream=False))
+        except Exception as error:
+            try:
+                if self.__show_errors:
+                    error_message = 'ERROR in SapiensGPT.printInference: '+str(error)
+                    print(error_message)
+                    try: self.__print_exc() if self.__display_error_point else None
+                    except: pass
+            except: pass
+    def printCompleteMessages(self, messages=[], stream=True):
+        try:
+            inference = self.completeMessages(messages=messages, stream=stream)
+            if stream:
+                from time import sleep
+                for token in inference:
+                    print(token['next_token'], end='', flush=True)
+                    sleep(self.delay)
+                print()
+            else: print(inference['answer'])
+        except Exception as error:
+            try:
+                if self.__show_errors:
+                    error_message = 'ERROR in SapiensGPT.printCompleteMessages: '+str(error)
+                    print(error_message)
+                    try: self.__print_exc() if self.__display_error_point else None
+                    except: pass
+            except: pass
+    def close(self):
+        try:
+            self.__init__()
+            return True
+        except Exception as error:
+            try:
+                if self.__show_errors:
+                    error_message = 'ERROR in SapiensGPT.close: '+str(error)
+                    print(error_message)
+                    try: self.__print_exc() if self.__display_error_point else None
+                    except: pass
+            except: pass
+            return False
+# This is a standard code of a GPT (Generative Pre-trained Transformer) model, developed by Sapiens Technology®️,
+# which faithfully follows the mathematical structure of the article “Attention Is All You Need” for the construction of the Transformer architecture
+# used in the pattern recognition of the model that is saved. Some optimizations that do not influence the Transformer architecture
+# were applied only to facilitate the adjustments of the parameters and variables of the training, saving, loading, fine-tuning and inference of the pre-trained model.
+# --------------------------> A SAPIENS TECHNOLOGY®️ PRODUCTION <--------------------------
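
Taken together, the public methods in this file suggest the following end-to-end flow. This is a minimal usage sketch inferred only from the signatures and attribute defaults visible in this diff, not vendor documentation; the paths 'dataset.txt' and './models/model.gpt' and the attribute values chosen here are illustrative assumptions:

from sapiens_gpt import SapiensGPT

gpt = SapiensGPT()
gpt.epochs = 100                 # attributes defaulted in __init__ may be overridden before training
gpt.precision = 0.95             # train() stops once 1 - loss reaches this target (or epochs is hit)
gpt.validate = 0.1               # fraction of the tokenized data held out for validation loss
metrics = gpt.train(dataset_path='dataset.txt', progress=True)  # hypothetical local .txt dataset
print(metrics)                   # {'val_loss': ..., 'loss': ..., 'generalization_rate': ..., 'precision': ...}
gpt.saveModel('./models/model.gpt')

loaded = SapiensGPT()
loaded.loadModel('./models/model.gpt')
loaded.max_tokens, loaded.temperature = 200, 0.7   # sampling controls read by inference()
loaded.top_k, loaded.top_p = 50, 0.9               # nonzero top_k or top_p < 1.0 routes to the top-k/top-p sampler
print(loaded.inference(prompt='Hello, who are you?', stream=False))
loaded.printCompleteMessages(messages=[{'role': 'user', 'content': 'Hello, who are you?'}], stream=True)
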
sapiens_gpt-2.0.1.dist-info/METADATA
ADDED
@@ -0,0 +1,17 @@
+Metadata-Version: 2.2
+Name: sapiens_gpt
+Version: 2.0.1
+Home-page: https://github.com/sapiens-technology/sapiens_gpt
+Author: SAPIENS TECHNOLOGY
+License: Proprietary Software
+License-File: LICENSE.txt
+Requires-Dist: torch
+Requires-Dist: numpy
+Requires-Dist: tiktoken
+Requires-Dist: tqdm
+Requires-Dist: certifi
+Requires-Dist: torch-xla==2.7.0; platform_system == "Linux" and platform_machine == "x86_64"
+Dynamic: author
+Dynamic: home-page
+Dynamic: license
+Dynamic: requires-dist
sapiens_gpt-2.0.1.dist-info/RECORD
ADDED
@@ -0,0 +1,7 @@
+sapiens_gpt/__init__.py,sha256=_M8Z1PmKWzDj0sMmNRtpMZfP8YCN_gHS1dL1rlg_nIA,1361
+sapiens_gpt/sapiens_gpt.py,sha256=A1wTQ_5W9k7aF7dmxeEIUG8Qj-KYeydBNFBXHa30gwc,51718
+sapiens_gpt-2.0.1.dist-info/LICENSE.txt,sha256=WqB2vIA5tH5lqLTr53yT_oy1m0wYfuvCPQKxdDHWimg,115
+sapiens_gpt-2.0.1.dist-info/METADATA,sha256=15tefvU31tQ7Qj88ypt0d8Pot9Ee-XmBLG7gccckPvk,476
+sapiens_gpt-2.0.1.dist-info/WHEEL,sha256=52BFRY2Up02UkjOa29eZOS2VxUrpPORXg1pkohGGUS8,91
+sapiens_gpt-2.0.1.dist-info/top_level.txt,sha256=vgTSwuajeQYPx5xRnkIQIjjTH5bb5JMPIbfHnhSutBM,12
+sapiens_gpt-2.0.1.dist-info/RECORD,,
sapiens_gpt-2.0.1.dist-info/top_level.txt
ADDED
@@ -0,0 +1 @@
+sapiens_gpt