webscout-6.0-py3-none-any.whl → webscout-6.2-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of webscout might be problematic.
- webscout/AIauto.py +77 -259
- webscout/Agents/Onlinesearcher.py +22 -10
- webscout/Agents/functioncall.py +2 -2
- webscout/Bard.py +21 -21
- webscout/Extra/autollama.py +37 -20
- webscout/Local/__init__.py +6 -7
- webscout/Local/formats.py +406 -194
- webscout/Local/model.py +1074 -477
- webscout/Local/samplers.py +108 -144
- webscout/Local/thread.py +251 -410
- webscout/Local/ui.py +401 -0
- webscout/Local/utils.py +338 -136
- webscout/Provider/Amigo.py +51 -38
- webscout/Provider/Deepseek.py +7 -6
- webscout/Provider/EDITEE.py +2 -2
- webscout/Provider/GPTWeb.py +1 -1
- webscout/Provider/Llama3.py +1 -1
- webscout/Provider/NinjaChat.py +200 -0
- webscout/Provider/OLLAMA.py +1 -1
- webscout/Provider/Perplexity.py +1 -1
- webscout/Provider/Reka.py +12 -5
- webscout/Provider/TTI/AIuncensored.py +103 -0
- webscout/Provider/TTI/Nexra.py +3 -3
- webscout/Provider/TTI/__init__.py +4 -2
- webscout/Provider/TTI/aiforce.py +2 -2
- webscout/Provider/TTI/imgninza.py +136 -0
- webscout/Provider/TTI/talkai.py +116 -0
- webscout/Provider/TeachAnything.py +0 -3
- webscout/Provider/Youchat.py +1 -1
- webscout/Provider/__init__.py +16 -12
- webscout/Provider/{ChatHub.py → aimathgpt.py} +72 -88
- webscout/Provider/cerebras.py +143 -123
- webscout/Provider/cleeai.py +1 -1
- webscout/Provider/felo_search.py +1 -1
- webscout/Provider/gaurish.py +207 -0
- webscout/Provider/geminiprorealtime.py +160 -0
- webscout/Provider/genspark.py +1 -1
- webscout/Provider/julius.py +8 -3
- webscout/Provider/learnfastai.py +1 -1
- webscout/Provider/{aigames.py → llmchat.py} +74 -84
- webscout/Provider/promptrefine.py +3 -1
- webscout/Provider/talkai.py +196 -0
- webscout/Provider/turboseek.py +3 -8
- webscout/Provider/tutorai.py +1 -1
- webscout/__init__.py +2 -43
- webscout/exceptions.py +5 -1
- webscout/tempid.py +4 -73
- webscout/utils.py +3 -0
- webscout/version.py +1 -1
- webscout/webai.py +1 -1
- webscout/webscout_search.py +154 -123
- {webscout-6.0.dist-info → webscout-6.2.dist-info}/METADATA +164 -245
- {webscout-6.0.dist-info → webscout-6.2.dist-info}/RECORD +57 -55
- webscout/Local/rawdog.py +0 -946
- webscout/Provider/BasedGPT.py +0 -214
- webscout/Provider/TTI/amigo.py +0 -148
- webscout/Provider/bixin.py +0 -264
- webscout/Provider/xdash.py +0 -182
- webscout/websx_search.py +0 -19
- {webscout-6.0.dist-info → webscout-6.2.dist-info}/LICENSE.md +0 -0
- {webscout-6.0.dist-info → webscout-6.2.dist-info}/WHEEL +0 -0
- {webscout-6.0.dist-info → webscout-6.2.dist-info}/entry_points.txt +0 -0
- {webscout-6.0.dist-info → webscout-6.2.dist-info}/top_level.txt +0 -0
webscout/Local/model.py
CHANGED
@@ -1,77 +1,137 @@
-import json
-from ._version import __version__, __llama_cpp_version__
 
-"""Submodule containing the Model class to work with language models"""
 
+import os
 import sys
+import uuid
 
 import numpy as np
 
 from .utils import (
     _SupportsWriteAndFlush,
+    UnreachableException,
+    print_version_info,
+    QuickGGUFReader,
     print_warning,
     print_verbose,
-
+    assert_type,
+    NoneType,
+    truncate,
     softmax
 )
 
-from .samplers import SamplerSettings, DefaultSampling
 from llama_cpp import Llama, StoppingCriteriaList
-from typing import
-from
-from heapq import nlargest
+from typing import Generator, Optional
+from .samplers import SamplerSettings
 
-from os import cpu_count as os_cpu_count
 
+from webscout import exceptions
 
-class ModelUnloadedException(Exception):
-    """Exception raised when trying to use a Model that has been unloaded"""
-    def __init__(self, message):
-        self.message = message
-        self.tool_code_start = "```tool_code\n" # Define tool code markers
-        self.tool_code_end = "\n```tool_code```"
-        super().__init__(self.message)
-        self.add_note('Are you trying to use a Model that has been unloaded?')
 
 class Model:
     """
-    A high-level abstraction of a
-
-    This is just a brief overview of webscout.Local.Model.
-    To see a full description of each method and its parameters,
-    call help(Model), or see the relevant docstring.
+    A high-level abstraction of a Llama model
 
     The following methods are available:
-
-
-
-
-
-
-
-
-
+    - unload:
+        Unload the model from memory
+    - reload:
+        Re-load the model, optionally changing parameters
+    - load:
+        Load the model into memory
+    - is_loaded:
+        Return `True` if the model is fully loaded, `False` otherwise
+    - tokenize:
+        Tokenize the given text, from `str` to `list[int]`
+    - detokenize:
+        Detokenize the given text, from `list[int]` or `int` to `str`
+    - get_length:
+        Return the length of the given text as measured in tokens
+    - get_tokenization_mapping:
+        Return a mapping of token IDs to tokens for a given text
+    - print_tokenization_mapping:
+        Display the tokenization map for a given text
+    - generate:
+        Generate text from an input and return it all at once when finished
+    - stream:
+        Return a Generator that yields tokens as they are generated
+    - stream_print:
+        Stream tokens to a file as they are generated
+    - ingest:
+        Ingest the given text into the model's cache, reducing the latency of
+        future generations that start with the same text
+    - candidates:
+        Return a sorted list of candidates for the next token, along with
+        their normalized probabilities
+    - print_candidates:
+        Print a sorted list of candidates for the next token, along with
+        their normalized probabilities
+
     The following attributes are available:
-
-
-
-
-
-
-
-
-
-
-
+    - verbose `bool`:
+        Whether the model was loaded with `verbose=True`
+    - metadata `dict`:
+        A dictionary containing the GGUF metadata of the model
+    - context_length `int`:
+        The currently loaded context length of the model, in tokens
+    - n_ctx `int`:
+        Alias to context_length
+    - llama `llama_cpp.Llama`:
+        The underlying Llama instance
+    - vocab `list[str]`:
+        A list of all tokens in the model's vocabulary
+    - bos_token `int`:
+        The beginning-of-sequence token ID
+    - eos_token `int`:
+        The end-of-sequence token ID
+    - eot_token `int`:
+        The end-of-turn token ID (or `None` if not found)
+    - nl_token `int`:
+        The newline token ID (or `None` if not found)
+    - prefix_token `int`:
+        The infill prefix token ID (or `None` if not found)
+    - middle_token `int`:
+        The infill middle token ID (or `None` if not found)
+    - suffix_token `int`:
+        The infill suffix token ID (or `None` if not found)
+    - cls_token `int`:
+        The classifier token ID (or `None` if not found)
+    - sep_token `int`:
+        The separator token ID (or `None` if not found)
+    - filename `str`:
+        The name of the file the model was loaded from
+    - n_ctx_train `int`:
+        The native context length of the model
+    - rope_freq_base_train `float`:
+        The native RoPE frequency base (theta) value
+    - rope_freq_base `float`:
+        The currently loaded RoPE frequency base (theta) value
+    - flash_attn `bool`:
+        Whether the model was loaded with Flash Attention enabled
+    - n_vocab `int`:
+        The number of tokens in the model's vocabulary
+    - n_layer `int`:
+        The number of layers in the model
+    - n_gpu_layers `int`:
+        The number of layers offloaded to the GPU (-1 for all layers)
+    - type_k `int`:
+        The GGML data type used for the `K` cache. 1 == f16, q8_0 otherwise
+    - type_v `int`:
+        The GGML data type used for the `V` cache. 1 == f16, q8_0 otherwise
+    - n_gqa `int`:
+        The GQA (Grouped-Query Attention) factor of the model
+    - uuid `uuid.UUID`:
+        A randomly generated UUID, unique to this specific model instance
     """
 
     def __init__(
         self,
         model_path: str,
-        context_length: Optional[int] =
+        context_length: Optional[int] = 2048,
         n_gpu_layers: int = 0,
         offload_kqv: bool = True,
         flash_attn: bool = False,
+        quantize_kv_cache: bool = False,
         verbose: bool = False,
+        **kwargs
     ):
         """
         Given the path to a GGUF file, construct a Model instance.
@@ -79,28 +139,44 @@ class Model:
         The model must be in GGUF format.
 
         The following parameters are optional:
-        - context_length:
-
-
-
-
+        - context_length:
+            The context length at which to load the model, in tokens
+        - n_gpu_layers:
+            The number of layers to be offloaded to the GPU
+        - offload_kqv:
+            Whether the KQV cache (context) should be offloaded
+        - flash_attn:
+            Whether to use Flash Attention
+        - quantize_kv_cache:
+            Whether to use q8_0 values for KV cache
+        - verbose:
+            Whether to print additional backend information. `bool`
+
+        The following additional keyword arguments are also accepted:
+        - do_not_load:
+            If `True`, construct the model instance but do not load it into
+            memory yet. Call `Model.load()` before using the model
+        - debug:
+            If `True`, print additional backend information from llama.cpp
         """
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+        assert_type(verbose, bool, 'verbose', 'Model')
+        assert_type(model_path, str, 'model_path', 'Model')
+        if not os.path.exists(model_path):
+            raise FileNotFoundError(
+                f"Model: the given model_path {model_path!r} does not exist"
+            )
+        if os.path.isdir(model_path):
+            raise IsADirectoryError(
+                f"Model: the given model_path {model_path!r} is a directory, "
+                "not a GGUF file"
+            )
+        assert_type(context_length, (int, NoneType), 'context_length', 'Model')
+        assert_type(n_gpu_layers, int, 'n_gpu_layers', 'Model')
+        assert_type(offload_kqv, bool, 'offload_kqv', 'Model')
+        assert_type(flash_attn, bool, 'flash_attn', 'Model')
+        assert_type(quantize_kv_cache, bool, 'quantize_kv_cache', 'Model')
+
         # save __init__ parameters for __repr__
         self._model_path = model_path
         self._context_length = context_length
@@ -108,130 +184,203 @@ class Model:
         self._offload_kqv = offload_kqv
         self._flash_attn = flash_attn
         self._verbose = self.verbose = verbose
-        self.
-        # if context_length <= 0, use n_ctx_train
-        if isinstance(context_length, int) and context_length <= 0:
-            context_length = None
+        self._quantize_kv_cache = quantize_kv_cache
 
-
-        # metadata to determine some parameters of the Llama instance
-        # before it is created
-        self.metadata = GGUFReader.load_metadata(self, model_path)
-        metadata_keys = self.metadata.keys() # only read once
+        _kwargs_keys = kwargs.keys() # only read once
 
-
-
-
-
-
+        if '__uuid' not in _kwargs_keys:
+            self.uuid = uuid.uuid4()
+        else:
+            # Model.reload() passes this kwarg to preserve the UUID
+            self.uuid = kwargs.get('__uuid')
 
-        if
-
-
-
+        if 'do_not_load' in _kwargs_keys:
+            if kwargs.get('do_not_load') is True:
+                # only save __init__ params to be used later in self.load()
+                return
 
-
-
-            if key.endswith('.rope.freq_base'):
-                rope_freq_base_train = self.metadata[key]
-                break
+        if verbose:
+            print_version_info(file=sys.stderr)
 
-        if
-
-
-
-
-
-
+        if sys.byteorder == 'big':
+            print_warning(
+                "host is big-endian, please ensure your GGUF file is also "
+                "big-endian"
+            )
+        elif sys.byteorder == 'little':
+            if verbose:
+                print_verbose(
+                    "host is little-endian"
                 )
+        else:
+            print_warning(
+                f"unexpected value for sys.byteorder: {sys.byteorder!r}, "
+                "expected 'little' for little-endian host or 'big' for "
+                "big-endian host"
+            )
+
+        self._model_file_size_bytes = os.stat(model_path).st_size
+        self.metadata = QuickGGUFReader.load_metadata(model_path)
 
-
-
-
+        _debug = False
+        if 'debug' in _kwargs_keys:
+            _debug = bool(kwargs.get('debug'))
 
-
-
-        else:
-            self.context_length = context_length
-            rope_freq_base = rope_freq_base_train
+        if verbose and not _debug:
+            __class__._print_metadata(self.metadata)
 
-
-
-
+        n_ctx_train = None
+        rope_freq_base_train = None
+        n_layer = None
+        n_attn_heads = None
+        n_kv_heads = None
+        n_gqa = None
 
-
-
-
-
-
-
-
-
-
-
+        for key in self.metadata.keys():
+            if key.endswith('.context_length'):
+                n_ctx_train = int(self.metadata[key])
+            elif key.endswith('.rope.freq_base'):
+                rope_freq_base_train = float(self.metadata[key])
+            elif key.endswith('.block_count'):
+                n_layer = int(self.metadata[key])
+            elif key.endswith('.attention.head_count'):
+                n_attn_heads = int(self.metadata[key])
+            elif key.endswith('.attention.head_count_kv'):
+                n_kv_heads = int(self.metadata[key])
+
+        if n_layer is None:
+            exc = KeyError(
+                f"GGUF file metadata does not specify n_layer"
+            )
+            exc.add_note(
+                f"GGUF file is at {self._model_path!r}"
+            )
+            raise exc
 
-
-
-
-
-
-
-
-
-
-
-
+        if n_ctx_train is None:
+            exc = KeyError(
+                f"GGUF file metadata does not specify a context length"
+            )
+            exc.add_note(
+                f"GGUF file is at {self._model_path!r}"
+            )
+            raise exc
+
+        if n_attn_heads is not None and n_kv_heads is not None:
+            n_gqa = int(n_attn_heads / n_kv_heads)
+
+        if context_length <= 0:
+            context_length = None
+
+        rope_freq_base = __class__._calculate_rope_freq_base(
+            n_ctx_train,
+            context_length if context_length is not None else n_ctx_train,
+            rope_freq_base_train
+        )
 
-
+        if context_length is None:
+            if n_ctx_train > 32768:
                 print_warning(
-
-
+                    f"you did not specify a context length, and the native "
+                    f"context length of this model is very large "
+                    f"({n_ctx_train}). defaulting to 32768 to avoid "
+                    f"out-of-memory errors. you should specify a higher "
+                    f"context length if you need it"
                 )
+                self.context_length = self.n_ctx = 32768
+            else:
+                self.context_length = self.n_ctx = n_ctx_train
+
+        elif context_length <= n_ctx_train:
+            self.context_length = self.n_ctx = context_length
 
-
-            self.tokens: list[str] = self.metadata['tokenizer.ggml.tokens']
-        except KeyError:
-            print_warning(
-                "could not set Model.tokens, defaulting to None"
-            )
-            self.tokens = None
-        try:
-            self.bos_token: int = self.metadata['tokenizer.ggml.bos_token_id']
-        except KeyError:
-            print_warning(
-                "could not set Model.bos_token, defaulting to None"
-            )
-            self.bos_token = None
-        try:
-            self.eos_token: int = self.metadata['tokenizer.ggml.eos_token_id']
-        except KeyError:
+        elif context_length > n_ctx_train:
             print_warning(
-                "
+                f"you have specified a context length that is greater than "
+                f"the natively supported context length of this model "
+                f"({context_length} > {n_ctx_train}). the model will still "
+                f"work, but the quality of output may be subpar. consider "
+                f"decreasing the context length to {n_ctx_train} or lower "
+                f"for best results"
             )
-            self.
+            self.context_length = self.n_ctx = context_length
+
+        else:
+            raise UnreachableException
 
-        cpu_count =
+        cpu_count = int(os.cpu_count()) # only read once
+
+        if n_gpu_layers < 0 or n_gpu_layers > n_layer:
+            n_gpu_layers = n_layer
+
+        if n_gpu_layers == n_layer:
+            # fully offloaded
+            n_batch = 1024
+        else:
+            # partially offloaded
+            n_batch = 512
+
+        # NOTE: the optimal n_threads value (for text generation) is equal
+        #       to the number of physical cores (for homogenous CPUs) or
+        #       to the number of performance cores (for heterogenous CPUs)
+        #
+        #       the optimal n_threads_batch value (for prompt eval) is equal
+        #       to the total number of logical cores, regardless of
+        #       their type
 
-        # these values for n_threads and n_threads_batch are
-        # known to be optimal for most systems
-        n_batch = 512 # can this be optimized?
         n_threads = max(cpu_count//2, 1)
         n_threads_batch = cpu_count
 
         if flash_attn and n_gpu_layers == 0:
+            flash_attn = False
             print_warning(
                 "disabling flash_attn because n_gpu_layers == 0"
             )
-
+
+        if quantize_kv_cache:
+            # use q8_0 for K, V
+            if flash_attn:
+                type_k = 8
+                type_v = 8
+                if verbose:
+                    print_verbose(
+                        "using q8_0 KV cache"
+                    )
+            else: # llama.cpp requires flash_attn for V quantization
+                type_k = 8
+                type_v = 1
+                if verbose:
+                    print_verbose(
+                        "using q8_0 K cache, f16 V cache"
+                    )
+                    print_verbose(
+                        "to quantize V cache, flash_attn must be enabled"
+                    )
+        else:
+            # use f16 for K, V (default)
+            type_k = 1
+            type_v = 1
 
         # guard against models with no rope_freq_base
         if rope_freq_base is None:
             rope_freq_base = 0
+
+        if verbose:
+            print_verbose(
+                f"attempting to load model, offloading "
+                f"{n_gpu_layers}/{n_layer} layers..."
+            )
+
+        # llama.cpp needs -ngl set to `-1`, not just n_layer
+        if n_gpu_layers >= n_layer:
+            _llama_ngl = -1
+        else:
+            _llama_ngl = n_gpu_layers
 
-        self.llama
+        self.llama = Llama(
            model_path=model_path,
            n_ctx=self.context_length,
-            n_gpu_layers=
+            n_gpu_layers=_llama_ngl,
            use_mmap=True,
            use_mlock=False,
            logits_all=False,
@@ -242,222 +391,511 @@ class Model:
             mul_mat_q=True,
             offload_kqv=offload_kqv,
             flash_attn=flash_attn,
-
-
-
-            #type_v=8,
-            verbose=verbose
+            type_k=type_k,
+            type_v=type_v,
+            verbose=_debug
         )
 
-        #
-        #
-
+        # NOTE: llama.cpp uses the nearest multiple of 32 as the actual
+        #       context length. here we update self.context_length to reflect
+        #       this
+        self.context_length = self.n_ctx = self.llama.n_ctx()
 
-
-
-
-
-
+        if self.n_ctx < 512:
+            print_warning(
+                f'the currently loaded context length is less than 512 tokens '
+                f'({self.n_ctx} < 512). sometimes this can cause problems in '
+                f'llama.cpp. consider increasing the context length to at '
+                f'least 512 tokens'
+            )
 
-        if self.verbose:
-            print_verbose("new Model instance with the following attributes:")
-            print_verbose(f"model: {model_path}")
-            print_verbose(f"param: n_gpu_layers == {n_gpu_layers}")
-            print_verbose(f"param: offload_kqv == {offload_kqv}")
-            print_verbose(f"param: flash_attn == {flash_attn}")
-            print_verbose(f"param: n_batch == {n_batch}")
-            print_verbose(f"param: n_threads == {n_threads}")
-            print_verbose(f"param: n_threads_batch == {n_threads_batch}")
-            print_verbose(f" gguf: n_ctx_train == {n_ctx_train}")
-            print_verbose(f"param: self.context_length == {self.context_length}")
-            print_verbose(f" gguf: rope_freq_base_train == {rope_freq_base_train}")
-            print_verbose(f"param: rope_freq_base == {rope_freq_base}")
-    def register_tool(self, name: str, function: Callable):
-        """Registers a tool for function calling."""
-        self.tools[name] = function
-
-    def _extract_tool_code(self, text: str) -> dict:
-        """Extracts tool code from the model's output."""
         try:
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+            self.vocab: list[str] = self.metadata['tokenizer.ggml.tokens']
+        except (KeyError, TypeError, ValueError):
+            print_warning(
+                "could not set Model.vocab, constructing manually..."
+            )
+            self.vocab = [
+                self.llama._model.detokenize([i], special=True).decode(
+                    'utf-8', errors='ignore'
+                ) for i in range(self.llama._model.n_vocab())
+            ]
+        try:
+            self.bos_token = int(self.metadata['tokenizer.ggml.bos_token_id'])
+        except (KeyError, TypeError, ValueError):
+            self.bos_token = int(self.llama._model.token_bos())
+            if self.bos_token < 0:
+                self.bos_token = None
+                print_warning(
+                    "could not set Model.bos_token, defaulting to None"
+                )
+        try:
+            self.eos_token = int(self.metadata['tokenizer.ggml.eos_token_id'])
+        except (KeyError, TypeError, ValueError):
+            self.eos_token = int(self.llama._model.token_eos())
+            if self.eos_token < 0:
+                self.eos_token = None
+                print_warning(
+                    "could not set Model.eos_token, defaulting to None"
+                )
+
+        # These special tokens are optional
+
+        self.eot_token = int(self.llama._model.token_eot())
+        if self.eot_token < 0:
+            self.eot_token = None
+
+        self.nl_token = int(self.llama._model.token_nl())
+        if self.nl_token < 0:
+            self.nl_token = None
 
-
-
-
-
-
+        self.prefix_token = int(self.llama._model.token_prefix())
+        if self.prefix_token < 0:
+            self.prefix_token = None
+
+        self.middle_token = int(self.llama._model.token_middle())
+        if self.middle_token < 0:
+            self.middle_token = None
+
+        self.suffix_token = int(self.llama._model.token_suffix())
+        if self.suffix_token < 0:
+            self.suffix_token = None
+
+        self.cls_token = int(self.llama._model.token_cls())
+        if self.cls_token < 0:
+            self.cls_token = None
+
+        self.sep_token = int(self.llama._model.token_sep())
+        if self.sep_token < 0:
+            self.sep_token = None
+
+        # Misc. attributes
+        _add_bos_token = self.llama._model.add_bos_token()
+        if _add_bos_token == 1:
+            self.add_bos_token = True
+        elif _add_bos_token == 0:
+            self.add_bos_token = False
+        else:
+            self.add_bos_token = None
+            print_warning(
+                "Model.add_bos_token is unknown, defaulting to None"
+            )
+
+        _add_eos_token = self.llama._model.add_eos_token()
+        if _add_eos_token == 1:
+            self.add_eos_token = True
+        elif _add_eos_token == 0:
+            self.add_eos_token = False
+        else:
+            self.add_eos_token = None
+            print_warning(
+                "Model.add_eos_token is unknown, defaulting to None"
+            )
 
-
-
+        self.filename: str = os.path.basename(model_path)
+        self.n_ctx_train: int = n_ctx_train
+        self.rope_freq_base_train: float = rope_freq_base_train
+        self.rope_freq_base: float = rope_freq_base
+        self.n_batch: int = n_batch
+        self.n_threads: int = n_threads
+        self.n_threads_batch: int = n_threads_batch
+        self.flash_attn: bool = flash_attn
+        self.n_embd = self.llama._model.n_embd()
+        self.n_params = self.llama._model.n_params()
+        self.bpw = (8*self._model_file_size_bytes)/self.n_params
+        self.n_vocab: int = len(self.vocab)
+        self.n_layer: int = n_layer
+        self.n_gpu_layers: int = n_gpu_layers
+        self.offload_kqv = offload_kqv
+        self.is_native: bool = self.context_length <= self.n_ctx_train
+        self.type_k: int = type_k
+        self.type_v: int = type_v
+        self.n_gqa: int = n_gqa
+
+        if verbose:
+            print_verbose(
+                f"{'new' if '__uuid' not in _kwargs_keys else 'reloaded'} "
+                f"Model instance with the following attributes:"
+            )
+            print_verbose(f"   uuid                 == {self.uuid}")
+            print_verbose(f"   filename             == {self.filename}")
+            print_verbose(f"   n_params             == {self.n_params}")
+            print_verbose(
+                f"   bpw                  == {self.bpw} "
+                f"({__class__._get_bpw_quality_hint(self.bpw)})"
+            )
+            print_verbose(f"   n_gpu_layers         == {self.n_gpu_layers}")
+            print_verbose(f"   n_layer              == {self.n_layer}")
+            print_verbose(f"   offload_kqv          == {self.offload_kqv}")
+            print_verbose(f"   flash_attn           == {self.flash_attn}")
+            print_verbose(f"   n_gqa                == {self.n_gqa}")
+            print_verbose(
+                f"   type_k               == {self.type_k} "
+                f"({'f16' if self.type_k == 1 else 'q8_0'})"
+            )
+            print_verbose(
+                f"   type_v               == {self.type_v} "
+                f"({'f16' if self.type_v == 1 else 'q8_0'})"
+            )
+            print_verbose(f"   n_batch              == {self.n_batch}")
+            print_verbose(
+                f"   n_threads            == {self.n_threads}/{cpu_count}"
+            )
+            print_verbose(
+                f"   n_threads_batch      == {self.n_threads_batch}/{cpu_count}"
+            )
+            print_verbose(f"   n_ctx_train          == {self.n_ctx_train}")
+            print_verbose(f"   n_ctx                == {self.n_ctx}")
+            print_verbose(f"   rope_freq_base_train == {self.rope_freq_base_train}")
+            print_verbose(f"   rope_freq_base       == {self.rope_freq_base}")
+            print_verbose(f"   n_embd               == {self.n_embd}")
+            print_verbose(f"   n_vocab              == {self.n_vocab}")
+            print_verbose(f"   bos_token            == {self.bos_token}")
+            print_verbose(f"   eos_token            == {self.eos_token}")
+            if self.eot_token is not None:
+                print_verbose(f"   eot_token            == {self.eot_token}")
+            if self.nl_token is not None:
+                print_verbose(f"   nl_token             == {self.nl_token}")
+            if self.prefix_token is not None:
+                print_verbose(f"   prefix_token         == {self.prefix_token}")
+            if self.middle_token is not None:
+                print_verbose(f"   middle_token         == {self.middle_token}")
+            if self.suffix_token is not None:
+                print_verbose(f"   suffix_token         == {self.suffix_token}")
+            if self.cls_token is not None:
+                print_verbose(f"   cls_token            == {self.cls_token}")
+            if self.sep_token is not None:
+                print_verbose(f"   sep_token            == {self.sep_token}")
+            print_verbose(f"   add_bos_token        == {self.add_bos_token}")
+            print_verbose(f"   add_eos_token        == {self.add_eos_token}")
+
+
+    @staticmethod
+    def _calculate_rope_freq_base(
+        n_ctx_train: int,
+        n_ctx_load: int,
+        rope_freq_base_train: Optional[float]
+    ) -> float:
         """
-
-
-
-
-
-
-
-
-
-
-
-
-
+        Returns the rope_freq_base (theta) value at which model should be loaded
+        """
+        assert_type(n_ctx_train, int, 'n_ctx_train', '_calculate_rope_freq_base')
+        assert_type(n_ctx_load, int, 'n_ctx_load', '_calculate_rope_freq_base')
+        assert_type(rope_freq_base_train, (float, NoneType),
+                    'rope_freq_base_train', '_calculate_rope_freq_base')
+
+        if n_ctx_load <= n_ctx_train:
+            if rope_freq_base_train is None:
+                return 0.0
+            else:
+                return rope_freq_base_train
+
+        if rope_freq_base_train is None or rope_freq_base_train == 0.0:
+            raise ValueError(
+                'unable to load model with greater than native '
+                f'context length ({n_ctx_load} > {n_ctx_train}) '
+                'because model does not specify rope_freq_base. '
+                f'try again with context_length <= {n_ctx_train}'
+            )
+
+        return ((n_ctx_load/n_ctx_train)**(2**(1/4)))*rope_freq_base_train
+
+        # traditional formula:
+        #   return (n_ctx_load/n_ctx_train)*rope_freq_base_train
+        # experimental formula A:
+        #   return ((n_ctx_load/n_ctx_train)**2)*rope_freq_base_train
+        # experimental formula B:
+        #   return ((n_ctx_load/n_ctx_train)**(2**(1/4)))*rope_freq_base_train
+
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    @staticmethod
+    def _get_bpw_quality_hint(bpw: float) -> str:
+        if 0.0 < bpw < 2.0:
+            return 'terrible'
+        elif 2.0 <= bpw < 4.0:
+            return 'bad'
+        elif 4.0 <= bpw < 5.0:
+            return 'good'
+        elif 5.0 <= bpw < 16.0:
+            return 'great'
+        elif bpw >= 16.0:
+            return 'native'
+        else:
+            raise UnreachableException
+
+
+    @staticmethod
+    def _print_metadata(
+        metadata: dict,
+        file: _SupportsWriteAndFlush = sys.stderr
+    ) -> None:
+        max_len_key = max(len(k) for k in metadata.keys())
+        print(f'webscout.Local: read model metadata from GGUF file header:', file=file)
+        for k, v in metadata.items():
+            print(
+                f'webscout.Local: {k:<{max_len_key}} : {truncate(repr(v))}',
+                file=file
+            )
 
-
+
     def __repr__(self) -> str:
-        return
-            f"Model({
-            f"context_length={self._context_length}, "
-            f"n_gpu_layers={self._n_gpu_layers}, "
-            f"offload_kqv={self._offload_kqv}, "
-            f"flash_attn={self._flash_attn}, "
+        return (
+            f"Model({self._model_path!r}, "
+            f"context_length={self._context_length}, "
+            f"n_gpu_layers={self._n_gpu_layers}, "
+            f"offload_kqv={self._offload_kqv}, "
+            f"flash_attn={self._flash_attn}, "
+            f"quantize_kv_cache={self._quantize_kv_cache}, "
             f"verbose={self._verbose})"
+        )
+
+
+    def __sizeof__(self) -> int:
+        """Returns the size of the model file on disk, NOT the memory usage"""
+        return self._model_file_size_bytes
+
 
     def __del__(self):
+        if self.is_loaded():
             self.unload()
 
+
     def __enter__(self):
+        if not self.is_loaded():
+            self.load()
         return self
 
+
     def __exit__(self, *_):
-        self.
+        if self.is_loaded():
+            self.unload()
 
+
     def __call__(
         self,
-        prompt:
-        stops: list[
-        sampler: SamplerSettings =
+        prompt: str | list[int],
+        stops: Optional[list[str | int]] = None,
+        sampler: Optional[SamplerSettings] = None
     ) -> str:
         """
         `Model(...)` is a shorthand for `Model.generate(...)`
         """
-        return self.generate(prompt, stops, sampler)
+        return self.generate(prompt=prompt, stops=stops, sampler=sampler)
+
+
+    def __eq__(self, value: object, /) -> bool:
+        if not isinstance(value, __class__):
+            return NotImplemented
+        if not (hasattr(self, 'uuid') and hasattr(value, 'uuid')):
+            raise AttributeError(
+                "At least one of the models being compared is missing the "
+                "`.uuid` attribute"
+            )
+        return self.uuid == value.uuid
+
+
+    def __hash__(self, /) -> int:
+        return hash(self.uuid)
+
 
     def unload(self):
         """
         Unload the model from memory
+
+        Does nothing if the model is not loaded
         """
-
-
-
-        return
-        try:
-            if self.llama._model.model is not None:
-                # actually unload the model from memory
-                self.llama._model._llama_free_model(self.llama._model.model)
-                self.llama._model.model = None
-        except AttributeError:
-            # broken or already being destroyed by GC, abort
+        if not self.is_loaded():
+            if self.verbose:
+                print_verbose('model already unloaded')
             return
-
+
+        if self.verbose:
+            print_verbose('unloading model...')
+
+        self.llama.close()
+
+        while hasattr(self, 'llama'):
             delattr(self, 'llama')
+
         if self.verbose:
-            print_verbose('
+            print_verbose('model unloaded')
 
-
+
+    def reload(
         self,
-
-
-
+        context_length: Optional[int] = None,
+        n_gpu_layers: Optional[int] = None,
+        offload_kqv: Optional[bool] = None,
+        flash_attn: Optional[bool] = None,
+        quantize_kv_cache: Optional[bool] = None,
+        verbose: Optional[bool] = None
+    ):
+        """
+        Re-load the model into memory using the specified parameters
+
+        Any parameters unspecified will be unchanged
+        """
+        __uuid = self.uuid
+        self.unload()
+        self.__init__(
+            model_path = self._model_path,
+            context_length = (
+                self._context_length if context_length is None
+                else context_length
+            ),
+            n_gpu_layers = (
+                self._n_gpu_layers if n_gpu_layers is None
+                else n_gpu_layers
+            ),
+            offload_kqv = (
+                self._offload_kqv if offload_kqv is None
+                else offload_kqv
+            ),
+            flash_attn = (
+                self._flash_attn if flash_attn is None
+                else flash_attn
+            ),
+            quantize_kv_cache = (
+                self._quantize_kv_cache if quantize_kv_cache is None
+                else quantize_kv_cache
+            ),
+            verbose = (
+                self._verbose if verbose is None
+                else verbose
+            ),
+            __uuid = __uuid # do not change UUID on reload
+        )
+        assert_model_is_loaded(self)
+
 
+    def load(self) -> None:
         """
-
-
+        Load the model into memory
+
+        Does nothing if already loaded
+        """
+        if self.is_loaded():
+            if self.verbose:
+                print_verbose('model already loaded')
+        else:
+            self.reload()
+
 
-
-
-
+    def is_loaded(self) -> bool:
+        """
+        Return `True` if the model is fully loaded, `False` otherwise
+        """
+        try:
+            assert_model_is_loaded(self)
+        except exceptions.ModelUnloadedException:
+            return False
+        else:
+            return True
+
 
-
-
+    def tokenize(self, text: str) -> list[int]:
+        """
+        Tokenize the given text (from `str` to `list[int]`)
         """
+        assert_type(text, str, 'text', 'tokenize')
         assert_model_is_loaded(self)
-
-
-
+        tokens = self.llama._model.tokenize(
+            text.encode('utf-8'),
+            add_bos=(
+                self.add_bos_token if self.add_bos_token is not None
+                else True
+            ),
+            special=True
         )
+        # remove duplicate BOS tokens at the start of the text
+        while len(tokens) >= 2 and tokens[0] == self.bos_token and tokens[1] == self.bos_token:
+            tokens.pop(0)
+            if self.verbose:
+                print_verbose("tokenize: removed duplicate BOS token")
+        # remove duplicate EOS tokens at the end of the text
+        while len(tokens) >= 2 and tokens[-1] == self.eos_token and tokens[-2] == self.eos_token:
+            tokens.pop(-1)
+            if self.verbose:
+                print_verbose("tokenize: removed duplicate EOS token")
+        return tokens
+
 
-
-
-
-
-
-        if
-        #
-
-
-
-
-
-
-        if len(tokens_list) > trim_length and overwrite is not None:
-            # cut to trim_length
-            tokens_list = tokens_list[-trim_length:]
-            overwrite_tokens = self.llama.tokenize(overwrite.encode(
-                "utf-8",
-                errors="ignore"
+    def detokenize(self, tokens: list[int] | int) -> str:
+        """
+        Detokenize the given text (from `int` or `list[int]` to `str`)
+        """
+        assert_type(tokens, (list, int), 'tokens', 'detokenize')
+        if isinstance(tokens, int):
+            tokens = [tokens] # handle single tokens
+        for tok_id in tokens:
+            if not 0 <= tok_id < self.n_vocab:
+                raise ValueError(
+                    f"detokenize: token id {tok_id} is out of range. "
+                    f"acceptable values for this model are between 0 and "
+                    f"{self.n_vocab-1} inclusive"
                 )
-
-
-
-
-            "
-
-
+        # remove duplicate BOS tokens at the start of the text
+        while len(tokens) >= 2 and tokens[0] == self.bos_token and tokens[1] == self.bos_token:
+            tokens.pop(0)
+            if self.verbose:
+                print_verbose("detokenize: removed duplicate BOS token")
+        # remove duplicate EOS tokens at the end of the text
+        while len(tokens) >= 2 and tokens[-1] == self.eos_token and tokens[-2] == self.eos_token:
+            tokens.pop(-1)
+            if self.verbose:
+                print_verbose("detokenize: removed duplicate EOS token")
+        assert_model_is_loaded(self)
+        return self.llama._model.detokenize(
+            tokens,
+            special=True
+        ).decode('utf-8', errors='ignore')
+
 
     def get_length(self, text: str) -> int:
         """
-        Return the length of the given text in
-        including the appended BOS token.
+        Return the length of the given text in as measured in tokens
         """
-
-
-
-
-
-
-
+        return len(self.tokenize(text))
+
+
+    def get_tokenization_mapping(
+        self,
+        text: str
+    ) -> list[tuple[int, str]]:
+        """
+        Tokenize the given text and return a list of tuples where the first
+        item in the tuple is the token ID and the second item is the
+        corresponding text
+        """
+        token_id_list: list[int] = self.tokenize(text)
+
+        return list(
+            zip(
+                token_id_list,
+                [self.detokenize(tok_id) for tok_id in token_id_list]
+            )
+        )
+
 
+    def print_tokenization_mapping(self, text: str) -> None:
+        """
+        Tokenize the given text and display a mapping of each
+        token ID and its corresponding decoded text
+
+        This is meant to be equivalent to `llama.cpp/llama-tokenize`
+        """
+        token_mapping_list = self.get_tokenization_mapping(text)
+
+        for token_id, token_text in token_mapping_list:
+            print(f"{token_id:>7} -> '{token_text}'")
+        print(f"Total number of tokens: {len(token_mapping_list)}")
+
+
     def generate(
         self,
-        prompt:
-        stops: list[
-        sampler: SamplerSettings =
+        prompt: str | list[int],
+        stops: Optional[list[str | int]] = None,
+        sampler: Optional[SamplerSettings] = None
     ) -> str:
         """
         Given a prompt, return a generated string.
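The new `_calculate_rope_freq_base` helper in the hunk above rescales the trained RoPE theta when a model is loaded beyond its native context window. A minimal sketch of the same arithmetic, written outside the class for illustration (this simplifies the `None`/0.0 handling shown in the diff and is not part of the package):

    # theta_load = theta_train * (n_ctx_load / n_ctx_train) ** (2 ** (1/4))
    def scaled_rope_freq_base(n_ctx_train: int, n_ctx_load: int, theta_train: float) -> float:
        if n_ctx_load <= n_ctx_train:
            return theta_train  # within native context: keep the trained value
        return ((n_ctx_load / n_ctx_train) ** (2 ** (1 / 4))) * theta_train

    # example: a model trained at 8192 tokens, loaded at 16384 with theta_train=10000.0,
    # gives 10000 * 2**1.189... ≈ 22802 (the "traditional formula" in the diff would give 20000)
    print(scaled_rope_freq_base(8192, 16384, 10000.0))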
@@ -468,56 +906,76 @@ class Model:
         - stops: A list of strings and/or token IDs at which to end the generation early
         - sampler: The SamplerSettings object used to control text generation
         """
+
+        stops = [] if stops is None else stops
+        assert_type(stops, list, 'stops', 'generate')
+        for item in stops:
+            assert_type(
+                item,
+                (str, int),
+                "some item in parameter 'stops'",
+                'generate'
+            )
+
+        sampler = SamplerSettings() if sampler is None else sampler
 
-
-
+        if sampler.temp < 0.0:
+            print_warning(
+                f'generate: using negative temperature value {sampler.temp}'
+            )
+
+        assert_type(prompt, (str, list), 'prompt', 'generate')
         if isinstance(prompt, list):
-
-
-
-
-
-
+            prompt_tokens = prompt
+        else:
+            if self.verbose:
+                print_verbose(
+                    "generate: tokenizing prompt"
+                )
+            prompt_tokens = self.tokenize(prompt)
+
+        input_length = len(prompt_tokens)
+
+        if input_length > self.context_length:
+            print(f'webscout.Local: raw input: {prompt_tokens}')
+            raise exceptions.ExceededContextLengthException(
+                f"generate: length of input exceeds model's context length "
+                f"({input_length} > {self.context_length})"
+            )
+        elif input_length == self.context_length:
+            print(f'webscout.Local: raw input: {prompt_tokens}')
+            raise exceptions.ExceededContextLengthException(
+                f"generate: length of input is equal to model's context "
+                f"length ({input_length} == {self.context_length}). this "
+                f"leaves no room for any new tokens to be generated"
+            )
+        elif self.verbose:
+            print_verbose(
+                f"generate: received prompt with {input_length} tokens"
+            )
+
+        stop_strs: list[str] = [stop for stop in stops if isinstance(stop, str)]
+        stop_token_ids: list[int] = [tok_id for tok_id in stops if isinstance(tok_id, int)]
+        stopping_criteria = None
+        if stop_token_ids != []:
+            def stop_on_token_ids(tokens, *args, **kwargs):
+                return tokens[-1] in stop_token_ids
+            stopping_criteria = StoppingCriteriaList([stop_on_token_ids])
 
         if self.verbose:
-            print_verbose(f'using the following sampler settings
+            print_verbose(f'generate: using the following sampler settings:')
             print_verbose(f'max_len_tokens == {sampler.max_len_tokens}')
-            print_verbose(f'
+            print_verbose(f'top_k == {sampler.top_k}')
             print_verbose(f'top_p == {sampler.top_p}')
             print_verbose(f'min_p == {sampler.min_p}')
+            print_verbose(f'temp == {sampler.temp}')
             print_verbose(f'frequency_penalty == {sampler.frequency_penalty}')
             print_verbose(f'presence_penalty == {sampler.presence_penalty}')
             print_verbose(f'repeat_penalty == {sampler.repeat_penalty}')
-            print_verbose(f'top_k == {sampler.top_k}')
 
-        # if any stop item is a token ID (int)
-        if any(isinstance(stop, int) for stop in stops):
-            # stop_strs is a list of all stopping strings
-            stop_strs: list[str] = [stop for stop in stops if isinstance(stop, str)]
-            # stop_token_ids is a list of all stop token IDs
-            stop_token_ids: list[int] = [tok_id for tok_id in stops if isinstance(tok_id, int)]
-            def stop_on_token_ids(tokens, *args, **kwargs):
-                return tokens[-1] in stop_token_ids
-            stopping_criteria = StoppingCriteriaList([stop_on_token_ids])
-            assert_model_is_loaded(self)
-            return self.llama.create_completion(
-                prompt,
-                max_tokens=sampler.max_len_tokens,
-                temperature=sampler.temp,
-                top_p=sampler.top_p,
-                min_p=sampler.min_p,
-                frequency_penalty=sampler.frequency_penalty,
-                presence_penalty=sampler.presence_penalty,
-                repeat_penalty=sampler.repeat_penalty,
-                top_k=sampler.top_k,
-                stop=stop_strs,
-                stopping_criteria=stopping_criteria
-            )['choices'][0]['text']
-
-        # if stop items are only strings
         assert_model_is_loaded(self)
         return self.llama.create_completion(
-            prompt,
+            prompt=prompt_tokens,
             max_tokens=sampler.max_len_tokens,
             temperature=sampler.temp,
             top_p=sampler.top_p,
@@ -526,17 +984,17 @@ class Model:
             presence_penalty=sampler.presence_penalty,
             repeat_penalty=sampler.repeat_penalty,
             top_k=sampler.top_k,
-            stop=
+            stop=stop_strs,
+            stopping_criteria=stopping_criteria
         )['choices'][0]['text']
 
 
     def stream(
         self,
-        prompt:
-        stops: list[
-        sampler: SamplerSettings =
+        prompt: str | list[int],
+        stops: Optional[list[str | int]] = None,
+        sampler: Optional[SamplerSettings] = None
     ) -> Generator:
-
         """
         Given a prompt, return a Generator that yields dicts containing tokens.
 
@@ -551,55 +1009,75 @@ class Model:
         - sampler: The SamplerSettings object used to control text generation
         """
 
-
-
+        stops = [] if stops is None else stops
+        assert_type(stops, list, 'stops', 'stream')
+        for item in stops:
+            assert_type(
+                item,
+                (str, int),
+                "some item in parameter 'stops'",
+                'stream'
+            )
+
+        sampler = SamplerSettings() if sampler is None else sampler
+
+        if sampler.temp < 0.0:
+            print_warning(
+                f'stream: using negative temperature value {sampler.temp}'
+            )
+
+        assert_type(prompt, (str, list), 'prompt', 'stream')
         if isinstance(prompt, list):
-
-
-
-
-
-
+            prompt_tokens = prompt
+        else:
+            if self.verbose:
+                print_verbose(
+                    "stream: tokenizing prompt"
+                )
+            prompt_tokens = self.tokenize(prompt)
+
+        input_length = len(prompt_tokens)
+
+        if input_length > self.context_length:
+            print(f'webscout.Local: raw input: {prompt_tokens}')
+            raise exceptions.ExceededContextLengthException(
+                f"stream: length of input exceeds model's context length "
+                f"({input_length} > {self.context_length})"
+            )
+        elif input_length == self.context_length:
+            print(f'webscout.Local: raw input: {prompt_tokens}')
+            raise exceptions.ExceededContextLengthException(
+                f"stream: length of input is equal to model's context "
+                f"length ({input_length} == {self.context_length}). this "
+                f"leaves no room for any new tokens to be generated"
+            )
+        elif self.verbose:
+            print_verbose(
+                f"stream: received prompt with {input_length} tokens"
+            )
+
+        stop_strs: list[str] = [stop for stop in stops if isinstance(stop, str)]
+        stop_token_ids: list[int] = [tok_id for tok_id in stops if isinstance(tok_id, int)]
+        stopping_criteria = None
+        if stop_token_ids != []:
+            def stop_on_token_ids(tokens, *args, **kwargs):
+                return tokens[-1] in stop_token_ids
+            stopping_criteria = StoppingCriteriaList([stop_on_token_ids])
 
         if self.verbose:
-            print_verbose(f'using the following sampler settings
+            print_verbose(f'stream: using the following sampler settings:')
             print_verbose(f'max_len_tokens == {sampler.max_len_tokens}')
-            print_verbose(f'
+            print_verbose(f'top_k == {sampler.top_k}')
             print_verbose(f'top_p == {sampler.top_p}')
             print_verbose(f'min_p == {sampler.min_p}')
+            print_verbose(f'temp == {sampler.temp}')
             print_verbose(f'frequency_penalty == {sampler.frequency_penalty}')
             print_verbose(f'presence_penalty == {sampler.presence_penalty}')
             print_verbose(f'repeat_penalty == {sampler.repeat_penalty}')
-            print_verbose(f'top_k == {sampler.top_k}')
 
-        # if any stop item is a token ID (int)
-        if any(isinstance(stop, int) for stop in stops):
-            # stop_strs is a list of all stopping strings
-            stop_strs: list[str] = [stop for stop in stops if isinstance(stop, str)]
-            # stop_token_ids is a list of all stop token IDs
-            stop_token_ids: list[int] = [tok_id for tok_id in stops if isinstance(tok_id, int)]
-            def stop_on_token_ids(tokens, *args, **kwargs):
-                return tokens[-1] in stop_token_ids
-            stopping_criteria = StoppingCriteriaList([stop_on_token_ids])
-            assert_model_is_loaded(self)
-            return self.llama.create_completion(
-                prompt,
-                max_tokens=sampler.max_len_tokens,
-                temperature=sampler.temp,
-                top_p=sampler.top_p,
-                min_p=sampler.min_p,
-                frequency_penalty=sampler.frequency_penalty,
-                presence_penalty=sampler.presence_penalty,
-                repeat_penalty=sampler.repeat_penalty,
-                top_k=sampler.top_k,
-                stream=True,
-                stop=stop_strs,
-                stopping_criteria=stopping_criteria
-            )
-
         assert_model_is_loaded(self)
         return self.llama.create_completion(
-            prompt,
+            prompt=prompt_tokens,
             max_tokens=sampler.max_len_tokens,
             temperature=sampler.temp,
             top_p=sampler.top_p,
@@ -609,32 +1087,24 @@ class Model:
             repeat_penalty=sampler.repeat_penalty,
             top_k=sampler.top_k,
             stream=True,
-            stop=
+            stop=stop_strs,
+            stopping_criteria=stopping_criteria
         )
 
 
     def stream_print(
         self,
-        prompt:
-        stops: list[
-        sampler: SamplerSettings =
-        end: str =
-        file: _SupportsWriteAndFlush =
+        prompt: str | list[int],
+        stops: Optional[list[str | int]] = None,
+        sampler: Optional[SamplerSettings] = None,
+        end: str = '\n',
+        file: _SupportsWriteAndFlush = None,
         flush: bool = True
     ) -> str:
         """
-        Given a prompt, stream text as it is generated, and return
-        The returned string does not include the `end`
-
-        `Model.stream_print(...)` is a shorthand for:
-
-        ```
-        s = Model.stream(prompt, stops=stops, sampler=sampler)
-        for i in s:
-            tok = i['choices'][0]['text']
-            print(tok, end='', file=file, flush=flush)
-        print(end, end='', file=file, flush=True)
-        ```
+        Given a prompt, stream text to a file as it is generated, and return
+        the generated string. The returned string does not include the `end`
+        parameter.
 
         prompt: The text from which to generate
 
@@ -652,120 +1122,247 @@ class Model:
|
|
|
652
1122
|
sampler=sampler
|
|
653
1123
|
)
|
|
654
1124
|
|
|
655
|
-
|
|
1125
|
+
file = sys.stdout if file is None else file
|
|
1126
|
+
|
|
1127
|
+
response = ''
|
|
656
1128
|
for i in token_generator:
|
|
657
1129
|
tok = i['choices'][0]['text']
|
|
658
1130
|
print(tok, end='', file=file, flush=flush)
|
|
659
|
-
|
|
1131
|
+
response += tok
|
|
660
1132
|
|
|
661
1133
|
# print `end`, and always flush stream after generation is done
|
|
662
1134
|
print(end, end='', file=file, flush=True)
|
|
663
1135
|
|
|
664
|
-
return
|
|
1136
|
+
return response
|
|
665
1137
|
|
|
666
1138
|
|
|
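With the change above, `stream_print()` both prints the streamed tokens and returns the accumulated text (excluding `end`). A usage sketch; the GGUF path is hypothetical and the `SamplerSettings` keyword arguments are assumed to match the sampler fields shown in this diff:

```python
# Usage sketch for the updated stream_print(): it prints tokens as they
# arrive and returns the text it printed.
from webscout.Local.model import Model
from webscout.Local.samplers import SamplerSettings

model = Model("/path/to/model.gguf")             # hypothetical model file
sampler = SamplerSettings(temp=0.7, top_p=0.95)  # assumed keyword arguments

reply = model.stream_print(
    "Write one sentence about llamas.",
    stops=["\n"],        # stops may mix strings and token IDs
    sampler=sampler,
)
print(f"captured {len(reply)} characters")       # `reply` excludes `end`
```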
-    def ingest(self, text: str) -> None:
+    def ingest(self, text: str | list[int]) -> None:
         """
         Ingest the given text into the model's cache
         """
 
+        assert_type(text, (str, list), 'prompt', 'stream')
+        if isinstance(text, list):
+            tokens = text
+        else:
+            if self.verbose:
+                print_verbose(
+                    "ingest: tokenizing text"
+                )
+            tokens = self.tokenize(text)
+
+        input_length = len(tokens)
+
+        if input_length > self.context_length:
+            print(f'webscout.Local: raw input: {tokens}')
+            raise exceptions.ExceededContextLengthException(
+                f"ingest: length of input exceeds model's context length "
+                f"({input_length} > {self.context_length})"
+            )
+        elif input_length == self.context_length:
+            print(f'webscout.Local: raw input: {tokens}')
+            raise exceptions.ExceededContextLengthException(
+                f"ingest: length of input is equal to model's context "
+                f"length ({input_length} == {self.context_length}). this "
+                f"leaves no room for any new tokens to be generated"
+            )
+        elif self.verbose:
+            print_verbose(
+                f"ingest: ingesting {input_length} tokens"
+            )
+
         assert_model_is_loaded(self)
         self.llama.create_completion(
-
-            max_tokens=
+            prompt=tokens,
+            max_tokens=2,
             temperature=0.0
         )
-
+
 
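The new `ingest()` tokenizes its input (unless it already receives token IDs), rejects inputs that meet or exceed the context window, and then runs a tiny `create_completion` call (`max_tokens=2`, `temperature=0.0`) purely to push the tokens through the model and warm its cache. A standalone sketch of the length guard; the names here are illustrative, not the library's:

```python
# Standalone sketch of the context-length guard used by the new ingest()
# and candidates(): strictly longer than the context window is an error,
# and exactly filling it is also an error because nothing could be
# generated afterwards.
def check_fits_in_context(tokens: list[int], context_length: int) -> None:
    input_length = len(tokens)
    if input_length > context_length:
        raise ValueError(
            f"input exceeds context length ({input_length} > {context_length})"
        )
    if input_length == context_length:
        raise ValueError(
            f"input equals context length ({input_length}); "
            f"no room left to generate new tokens"
        )

check_fits_in_context(list(range(10)), context_length=4096)   # passes silently
```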
     def candidates(
         self,
         prompt: str,
-        k: int
+        k: int = 40,
+        temp: Optional[float] = None,
+        raw_token_ids: bool = False
     ) -> list[tuple[str, np.floating]]:
         """
         Given prompt `str` and k `int`, return a sorted list of the
         top k candidates for most likely next token, along with their
-        normalized probabilities
-        """
+        normalized probabilities (logprobs).
 
-
-
-
-            f"next_candidates: k should be int, not {type(k)}"
-        assert 0 < k <= len(self.tokens), \
-            f"next_candidates: k should be between 0 and {len(self.tokens)}"
+        The following parameters are optional:
+        - temp: The temperature to apply to the distribution
+        - raw_token_ids: If `True`, return raw token IDs instead of text tokens
 
+        If parameter `k` is <= 0, the probabilities for all tokens in the
+        vocabulary will be returned. Vocabulary sizes are often in the
+        hundred-thousands.
+        """
+
+        assert_type(prompt, str, 'prompt', 'candidates')
+        assert_type(k, int, 'k', 'candidates')
+        assert_type(temp, (float, NoneType), 'temp', 'candidates')
         assert_model_is_loaded(self)
-
-
-
-
+        if k <= 0:
+            k = self.n_vocab
+            if self.verbose:
+                print_verbose(
+                    f"candidates: k <= 0, using n_vocab ({self.n_vocab})"
+                )
+        if not 1 <= k <= self.n_vocab:
+            raise ValueError(
+                f"candidates: k should be between 1 and {self.n_vocab} "
+                f"inclusive"
+            )
+
+        prompt_tokens = self.tokenize(prompt)
+        input_length = len(prompt_tokens)
 
-
-
+        if input_length > self.context_length:
+            print(f'webscout.Local: raw input: {prompt_tokens}')
+            raise exceptions.ExceededContextLengthException(
+                f"candidates: length of input exceeds model's context length "
+                f"({input_length} > {self.context_length})"
+            )
+        elif input_length == self.context_length:
+            print(f'webscout.Local: raw input: {prompt_tokens}')
+            raise exceptions.ExceededContextLengthException(
+                f"candidates: length of input is equal to model's context "
+                f"length ({input_length} == {self.context_length}). this "
+                f"leaves no room for any new tokens to be generated"
+            )
+
+        # it is necessary to reset the model before calling llama.eval()
+        elif self.verbose:
+            print_verbose(
+                "candidates: reset model state..."
+            )
+        self.llama.reset()
+
+        if self.verbose:
+            print_verbose(
+                "candidates: eval..."
+            )
+        self.llama.eval(prompt_tokens) # single forward pass
 
-
-
+        scores = self.llama.scores[len(prompt_tokens) - 1]
+
+        # Get the top k indices based on raw scores
+        top_k_indices = np.argpartition(scores, -k)[-k:]
+
+        # Get the scores of the top k tokens
+        top_k_scores = scores[top_k_indices]
+
+        # Apply softmax to the top k scores
         if self.verbose:
-            print_verbose(
-
+            print_verbose(
+                f'candidates: compute softmax over {len(top_k_scores)} '
+                f'values...'
+            )
+        normalized_scores = softmax(z=top_k_scores, T=temp)
+
+        # consider only the top k tokens
+        logprobs = [
+            (
+                self.llama._model.detokenize(
+                    [tok_id], special=True
+                ).decode('utf-8', errors='ignore'),
+                normalized_scores[i]
+            ) for i, tok_id in enumerate(top_k_indices)
+        ] if not raw_token_ids else [
+            (
+                tok_id,
+                normalized_scores[i]
+            ) for i, tok_id in enumerate(top_k_indices)
+        ]
 
-        #
-
-        token_probs_list: list[tuple[str, np.floating]] = []
-        for tok_str in self.tokens:
-            token_probs_list.append((tok_str, normalized_scores[i]))
-            i += 1
+        # sort by probability
+        logprobs.sort(key=lambda x: x[1], reverse=True)
 
-
-        return nlargest(k, token_probs_list, key=lambda x:x[1])
+        return logprobs
 
 
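The rewritten `candidates()` ranks next-token candidates by taking the raw scores at the last evaluated position, selecting the top k with `np.argpartition`, applying a temperature-scaled softmax to just those k values, and sorting the result. A standalone NumPy sketch of that math; the `softmax` helper below is a stand-in for the library's own, and the scores are made up:

```python
# Standalone sketch: keep the top k raw scores with np.argpartition,
# softmax over just those k values (optionally temperature-scaled),
# then sort descending.
import numpy as np

def softmax(z: np.ndarray, T: float | None = None) -> np.ndarray:
    t = 1.0 if T is None else T
    e = np.exp((z - np.max(z)) / t)   # subtract the max for numerical stability
    return e / e.sum()

scores = np.array([1.2, -0.3, 4.1, 0.9, 3.3, -2.0])   # fake next-token logits
k = 3

top_k_indices = np.argpartition(scores, -k)[-k:]       # unordered top-k indices
top_k_scores = scores[top_k_indices]
probs = softmax(top_k_scores, T=0.8)

ranked = sorted(zip(top_k_indices.tolist(), probs), key=lambda x: x[1], reverse=True)
for tok_id, p in ranked:
    print(f"token id {tok_id}: {p:.3f}")
```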
     def print_candidates(
         self,
         prompt: str,
-        k: int,
-
-
+        k: int = 40,
+        temp: Optional[float] = None,
+        raw_token_ids: bool = False,
+        file: _SupportsWriteAndFlush = None,
     ) -> None:
         """
-
-
-
+        Given prompt `str` and k `int`, print a sorted list of the
+        top k candidates for most likely next token, along with their
+        normalized probabilities (logprobs).
 
-
-
-
-
-
-
-
-
-
-
-
+        The following parameters are optional:
+        - temp: The temperature to apply to the distribution
+        - raw_token_ids: If `True`, print raw token IDs instead of text tokens
+
+        If parameter `k` is <= 0, the probabilities for all tokens in the
+        vocabulary will be printed. Vocabulary sizes are often in the
+        hundred-thousands.
+        """
+        for _tuple in self.candidates(
+            prompt=prompt, k=k, temp=temp, raw_token_ids=raw_token_ids
+        ):
+            percent_as_string = f"{_tuple[1] * 100 :>7.3f}"
+            # do not print tokens with ~0.000% probability
+            if percent_as_string != "  0.000":
+                print(
+                    f"token {_tuple[0]!r:<32} has probability "
+                    f"{percent_as_string} %",
+                    file=sys.stdout if file is None else file,
+                )
 
 
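`print_candidates()` formats each probability as a right-aligned percentage with three decimals and skips entries that would render as `0.000 %`. A small standalone sketch of that formatting, using a made-up candidate list:

```python
# Sketch of the formatting used by print_candidates(): probabilities are
# rendered as right-aligned percentages with three decimals, and entries
# that would display as "  0.000" are skipped.
candidates = [("' the'", 0.4312), ("' a'", 0.2109), ("'\\n'", 0.0000017)]

for token, prob in candidates:
    percent_as_string = f"{prob * 100 :>7.3f}"
    if percent_as_string != "  0.000":          # drop the ~0 % entries
        print(f"token {token:<32} has probability {percent_as_string} %")
```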
-def assert_model_is_loaded(model
+def assert_model_is_loaded(model) -> None:
     """
-    Ensure the
-    `
+    Ensure the model is fully constructed, such that
+    `model.llama._model.model is not None` is guaranteed to be `True`
 
     Raise ModelUnloadedException otherwise
     """
-
-
+    try:
+        if model.llama._model.model is not None:
+            return
+    except AttributeError:
+        pass
+
+    if model is None:
+        exc = exceptions.ModelUnloadedException(
+            "model is None"
+        )
+    elif not hasattr(model, 'llama'):
+        exc = exceptions.ModelUnloadedException(
             "webscout.Local.Model instance has no attribute 'llama'"
         )
-
-
+    elif not hasattr(model.llama, '_model'):
+        exc = exceptions.ModelUnloadedException(
             "llama_cpp.Llama instance has no attribute '_model'"
         )
-
-
-            "llama_cpp._internals._LlamaModel instance has no attribute
+    elif not hasattr(model.llama._model, 'model'):
+        exc = exceptions.ModelUnloadedException(
+            "llama_cpp._internals._LlamaModel instance has no attribute "
+            "'model'"
         )
-
-
+    elif model.llama._model.model is None:
+        exc = exceptions.ModelUnloadedException(
            "llama_cpp._internals._LlamaModel.model is None"
         )
+    else:
+        raise UnreachableException
+
+    if not isinstance(model, Model):
+        exc.add_note(
+            'WARNING: `assert_model_is_loaded` was called on an object '
+            'that is NOT an instance of `webscout.Local.Model` '
+            f'(object had type {type(model)!r})'
+        )
+    else:
+        exc.add_note(
+            'Are you trying to use a model that has been unloaded?'
+        )
+
+    raise exc
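The rewritten `assert_model_is_loaded()` tries the happy path first, then walks the attribute chain to name the missing link, and attaches a hint with `Exception.add_note()` (Python 3.11+). A standalone sketch of that add_note pattern; `ModelUnloadedException` below is a stand-in for the class in `webscout.exceptions`:

```python
# Standalone sketch of the diagnostics pattern above: build the exception,
# attach a human-readable hint with add_note() (Python 3.11+), and raise.
class ModelUnloadedException(Exception):
    pass

def fail_like_assert_model_is_loaded() -> None:
    exc = ModelUnloadedException("llama_cpp._internals._LlamaModel.model is None")
    exc.add_note("Are you trying to use a model that has been unloaded?")
    raise exc

try:
    fail_like_assert_model_is_loaded()
except ModelUnloadedException as e:
    print(e)            # the original message
    print(e.__notes__)  # the attached hint(s)
```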