webscout 6.0-py3-none-any.whl → 6.1-py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of webscout might be problematic.

webscout/Local/model.py CHANGED
@@ -1,77 +1,137 @@
1
- import json
2
- from ._version import __version__, __llama_cpp_version__
3
1
 
4
- """Submodule containing the Model class to work with language models"""
5
2
 
3
+ import os
6
4
  import sys
5
+ import uuid
7
6
  import numpy as np
8
7
 
9
8
  from .utils import (
10
9
  _SupportsWriteAndFlush,
10
+ UnreachableException,
11
+ print_version_info,
12
+ QuickGGUFReader,
11
13
  print_warning,
12
14
  print_verbose,
13
- GGUFReader,
15
+ assert_type,
16
+ NoneType,
17
+ truncate,
14
18
  softmax
15
19
  )
16
20
 
17
- from .samplers import SamplerSettings, DefaultSampling
18
21
  from llama_cpp import Llama, StoppingCriteriaList
19
- from typing import Callable, Generator, Optional, Union
20
- from os.path import isdir, exists
21
- from heapq import nlargest
22
+ from typing import Generator, Optional
23
+ from .samplers import SamplerSettings
22
24
 
23
- from os import cpu_count as os_cpu_count
24
25
 
26
+ from webscout import exceptions
25
27
 
26
- class ModelUnloadedException(Exception):
27
- """Exception raised when trying to use a Model that has been unloaded"""
28
- def __init__(self, message):
29
- self.message = message
30
- self.tool_code_start = "```tool_code\n" # Define tool code markers
31
- self.tool_code_end = "\n```tool_code```"
32
- super().__init__(self.message)
33
- self.add_note('Are you trying to use a Model that has been unloaded?')
34
28
 
35
29
  class Model:
36
30
  """
37
- A high-level abstraction of a llama model
38
-
39
- This is just a brief overview of webscout.Local.Model.
40
- To see a full description of each method and its parameters,
41
- call help(Model), or see the relevant docstring.
31
+ A high-level abstraction of a Llama model
42
32
 
43
33
  The following methods are available:
44
- - `.generate()` - Generate text
45
- - `.get_length()` - Get the length of a given text in tokens
46
- - `.ingest()` - Ingest text into the model's cache
47
- - `.next_candidates()` - Get a list of the most likely next tokens (WIP)
48
- - `.stream()` - Return a Generator that can stream text as it is generated
49
- - `.stream_print()` - Print text as it is generated
50
- - `.trim()` - Trim a given text to the model's context length
51
- - `.unload()` - Unload the model from memory
52
-
34
+ - unload:
35
+ Unload the model from memory
36
+ - reload:
37
+ Re-load the model, optionally changing parameters
38
+ - load:
39
+ Load the model into memory
40
+ - is_loaded:
41
+ Return `True` if the model is fully loaded, `False` otherwise
42
+ - tokenize:
43
+ Tokenize the given text, from `str` to `list[int]`
44
+ - detokenize:
45
+ Detokenize the given text, from `list[int]` or `int` to `str`
46
+ - get_length:
47
+ Return the length of the given text as measured in tokens
48
+ - get_tokenization_mapping:
49
+ Return a mapping of token IDs to tokens for a given text
50
+ - print_tokenization_mapping:
51
+ Display the tokenization map for a given text
52
+ - generate:
53
+ Generate text from an input and return it all at once when finished
54
+ - stream:
55
+ Return a Generator that yields tokens as they are generated
56
+ - stream_print:
57
+ Stream tokens to a file as they are generated
58
+ - ingest:
59
+ Ingest the given text into the model's cache, reducing the latency of
60
+ future generations that start with the same text
61
+ - candidates:
62
+ Return a sorted list of candidates for the next token, along with
63
+ their normalized probabilities
64
+ - print_candidates:
65
+ Print a sorted list of candidates for the next token, along with
66
+ their normalized probabilities
67
+
53
68
  The following attributes are available:
54
- - `.bos_token` - The model's beginning-of-stream token ID
55
- - `.context_length` - The model's loaded context length
56
- - `.flash_attn` - Whether the model was loaded with `flash_attn=True`
57
- - `.eos_token` - The model's end-of-stream token ID
58
- - `.llama` - The underlying `llama_cpp.Llama` instance
59
- - `.metadata` - The GGUF metadata of the model
60
- - `.n_ctx_train` - The native context length of the model
61
- - `.rope_freq_base` - The model's loaded RoPE frequency base
62
- - `.rope_freq_base_train` - The model's native RoPE frequency base
63
- - `.tokens` - A list of all the tokens in the model's tokenizer
64
- - `.verbose` - Whether the model was loaded with `verbose=True`
69
+ - verbose `bool`:
70
+ Whether the model was loaded with `verbose=True`
71
+ - metadata `dict`:
72
+ A dictionary containing the GGUF metadata of the model
73
+ - context_length `int`:
74
+ The currently loaded context length of the model, in tokens
75
+ - n_ctx `int`:
76
+ Alias to context_length
77
+ - llama `llama_cpp.Llama`:
78
+ The underlying Llama instance
79
+ - vocab `list[str]`:
80
+ A list of all tokens in the model's vocabulary
81
+ - bos_token `int`:
82
+ The beginning-of-sequence token ID
83
+ - eos_token `int`:
84
+ The end-of-sequence token ID
85
+ - eot_token `int`:
86
+ The end-of-turn token ID (or `None` if not found)
87
+ - nl_token `int`:
88
+ The newline token ID (or `None` if not found)
89
+ - prefix_token `int`:
90
+ The infill prefix token ID (or `None` if not found)
91
+ - middle_token `int`:
92
+ The infill middle token ID (or `None` if not found)
93
+ - suffix_token `int`:
94
+ The infill suffix token ID (or `None` if not found)
95
+ - cls_token `int`:
96
+ The classifier token ID (or `None` if not found)
97
+ - sep_token `int`:
98
+ The separator token ID (or `None` if not found)
99
+ - filename `str`:
100
+ The name of the file the model was loaded from
101
+ - n_ctx_train `int`:
102
+ The native context length of the model
103
+ - rope_freq_base_train `float`:
104
+ The native RoPE frequency base (theta) value
105
+ - rope_freq_base `float`:
106
+ The currently loaded RoPE frequency base (theta) value
107
+ - flash_attn `bool`:
108
+ Whether the model was loaded with Flash Attention enabled
109
+ - n_vocab `int`:
110
+ The number of tokens in the model's vocabulary
111
+ - n_layer `int`:
112
+ The number of layers in the model
113
+ - n_gpu_layers `int`:
114
+ The number of layers offloaded to the GPU (-1 for all layers)
115
+ - type_k `int`:
116
+ The GGML data type used for the `K` cache. 1 == f16, q8_0 otherwise
117
+ - type_v `int`:
118
+ The GGML data type used for the `V` cache. 1 == f16, q8_0 otherwise
119
+ - n_gqa `int`:
120
+ The GQA (Grouped-Query Attention) factor of the model
121
+ - uuid `uuid.UUID`:
122
+ A randomly generated UUID, unique to this specific model instance
65
123
  """
66
124
 
67
125
  def __init__(
68
126
  self,
69
127
  model_path: str,
70
- context_length: Optional[int] = None,
128
+ context_length: Optional[int] = 2048,
71
129
  n_gpu_layers: int = 0,
72
130
  offload_kqv: bool = True,
73
131
  flash_attn: bool = False,
132
+ quantize_kv_cache: bool = False,
74
133
  verbose: bool = False,
134
+ **kwargs
75
135
  ):
76
136
  """
77
137
  Given the path to a GGUF file, construct a Model instance.
@@ -79,28 +139,44 @@ class Model:
79
139
  The model must be in GGUF format.
80
140
 
81
141
  The following parameters are optional:
82
- - context_length: The context length at which to load the model, in tokens
83
- - n_gpu_layers: The number of layers to be offloaded to the GPU
84
- - offload_kqv: Whether the KQV cache (context) should be offloaded
85
- - flash_attn: Whether to use Flash Attention
86
- - verbose: Whether to print additional backend information
142
+ - context_length:
143
+ The context length at which to load the model, in tokens
144
+ - n_gpu_layers:
145
+ The number of layers to be offloaded to the GPU
146
+ - offload_kqv:
147
+ Whether the KQV cache (context) should be offloaded
148
+ - flash_attn:
149
+ Whether to use Flash Attention
150
+ - quantize_kv_cache:
151
+ Whether to use q8_0 values for KV cache
152
+ - verbose:
153
+ Whether to print additional backend information. `bool`
154
+
155
+ The following additional keyword arguments are also accepted:
156
+ - do_not_load:
157
+ If `True`, construct the model instance but do not load it into
158
+ memory yet. Call `Model.load()` before using the model
159
+ - debug:
160
+ If `True`, print additional backend information from llama.cpp
87
161
  """
88
162
 
89
- if verbose:
90
- print_verbose(f"webscout.Local package version: {__version__}")
91
- print_verbose(f"llama_cpp package version: {__llama_cpp_version__}")
92
-
93
- assert isinstance(model_path, str), \
94
- f"Model: model_path should be a string, not {type(model_path)}"
95
- assert exists(model_path), \
96
- f"Model: the given model_path '{model_path}' does not exist"
97
- assert not isdir(model_path), \
98
- f"Model: the given model_path '{model_path}' is a directory, not a GGUF file"
99
- assert isinstance(context_length, (int, type(None))), \
100
- f"Model: context_length should be int or None, not {type(context_length)}"
101
- assert isinstance(flash_attn, bool), \
102
- f"Model: flash_attn should be bool (True or False), not {type(flash_attn)}"
103
-
163
+ assert_type(verbose, bool, 'verbose', 'Model')
164
+ assert_type(model_path, str, 'model_path', 'Model')
165
+ if not os.path.exists(model_path):
166
+ raise FileNotFoundError(
167
+ f"Model: the given model_path {model_path!r} does not exist"
168
+ )
169
+ if os.path.isdir(model_path):
170
+ raise IsADirectoryError(
171
+ f"Model: the given model_path {model_path!r} is a directory, "
172
+ "not a GGUF file"
173
+ )
174
+ assert_type(context_length, (int, NoneType), 'context_length', 'Model')
175
+ assert_type(n_gpu_layers, int, 'n_gpu_layers', 'Model')
176
+ assert_type(offload_kqv, bool, 'offload_kqv', 'Model')
177
+ assert_type(flash_attn, bool, 'flash_attn', 'Model')
178
+ assert_type(quantize_kv_cache, bool, 'quantize_kv_cache', 'Model')
179
+
104
180
  # save __init__ parameters for __repr__
105
181
  self._model_path = model_path
106
182
  self._context_length = context_length
@@ -108,130 +184,203 @@ class Model:
108
184
  self._offload_kqv = offload_kqv
109
185
  self._flash_attn = flash_attn
110
186
  self._verbose = self.verbose = verbose
111
- self.tools = {}
112
- # if context_length <= 0, use n_ctx_train
113
- if isinstance(context_length, int) and context_length <= 0:
114
- context_length = None
187
+ self._quantize_kv_cache = quantize_kv_cache
115
188
 
116
- # this does not use Llama.metadata because we want to use GGUF
117
- # metadata to determine some parameters of the Llama instance
118
- # before it is created
119
- self.metadata = GGUFReader.load_metadata(self, model_path)
120
- metadata_keys = self.metadata.keys() # only read once
189
+ _kwargs_keys = kwargs.keys() # only read once
121
190
 
122
- n_ctx_train = None
123
- for key in metadata_keys:
124
- if key.endswith('.context_length'):
125
- n_ctx_train = self.metadata[key]
126
- break
191
+ if '__uuid' not in _kwargs_keys:
192
+ self.uuid = uuid.uuid4()
193
+ else:
194
+ # Model.reload() passes this kwarg to preserve the UUID
195
+ self.uuid = kwargs.get('__uuid')
127
196
 
128
- if n_ctx_train is None:
129
- raise KeyError(
130
- "GGUF file does not specify a context length"
131
- )
197
+ if 'do_not_load' in _kwargs_keys:
198
+ if kwargs.get('do_not_load') is True:
199
+ # only save __init__ params to be used later in self.load()
200
+ return
132
201
 
133
- rope_freq_base_train = None
134
- for key in metadata_keys:
135
- if key.endswith('.rope.freq_base'):
136
- rope_freq_base_train = self.metadata[key]
137
- break
202
+ if verbose:
203
+ print_version_info(file=sys.stderr)
138
204
 
139
- if rope_freq_base_train is None and context_length is not None:
140
- if context_length > n_ctx_train:
141
- raise ValueError(
142
- 'unable to load model with greater than native ' + \
143
- f'context length ({context_length} > {n_ctx_train}) ' + \
144
- 'because model does not specify freq_base. ' + \
145
- f'try again with `context_length={n_ctx_train}`'
205
+ if sys.byteorder == 'big':
206
+ print_warning(
207
+ "host is big-endian, please ensure your GGUF file is also "
208
+ "big-endian"
209
+ )
210
+ elif sys.byteorder == 'little':
211
+ if verbose:
212
+ print_verbose(
213
+ "host is little-endian"
146
214
  )
215
+ else:
216
+ print_warning(
217
+ f"unexpected value for sys.byteorder: {sys.byteorder!r}, "
218
+ "expected 'little' for little-endian host or 'big' for "
219
+ "big-endian host"
220
+ )
221
+
222
+ self._model_file_size_bytes = os.stat(model_path).st_size
223
+ self.metadata = QuickGGUFReader.load_metadata(model_path)
147
224
 
148
- if rope_freq_base_train is None or context_length is None or \
149
- context_length <= n_ctx_train:
150
- # no need to do context scaling, load model normally
225
+ _debug = False
226
+ if 'debug' in _kwargs_keys:
227
+ _debug = bool(kwargs.get('debug'))
151
228
 
152
- if context_length is None:
153
- self.context_length = n_ctx_train
154
- else:
155
- self.context_length = context_length
156
- rope_freq_base = rope_freq_base_train
229
+ if verbose and not _debug:
230
+ __class__._print_metadata(self.metadata)
157
231
 
158
- elif context_length > n_ctx_train:
159
- # multiply rope_freq_base according to requested context length
160
- # because context length > n_ctx_train and rope freq base is known
232
+ n_ctx_train = None
233
+ rope_freq_base_train = None
234
+ n_layer = None
235
+ n_attn_heads = None
236
+ n_kv_heads = None
237
+ n_gqa = None
161
238
 
162
- rope_freq_base = (context_length/n_ctx_train)*rope_freq_base_train
163
- self.context_length = context_length
164
-
165
- if self.verbose:
166
- print_verbose(
167
- 'chosen context length is greater than native context '
168
- f'length ({context_length} > {n_ctx_train}), '
169
- 'rope_freq_base will be changed from '
170
- f'{rope_freq_base_train} to {rope_freq_base}'
171
- )
239
+ for key in self.metadata.keys():
240
+ if key.endswith('.context_length'):
241
+ n_ctx_train = int(self.metadata[key])
242
+ elif key.endswith('.rope.freq_base'):
243
+ rope_freq_base_train = float(self.metadata[key])
244
+ elif key.endswith('.block_count'):
245
+ n_layer = int(self.metadata[key])
246
+ elif key.endswith('.attention.head_count'):
247
+ n_attn_heads = int(self.metadata[key])
248
+ elif key.endswith('.attention.head_count_kv'):
249
+ n_kv_heads = int(self.metadata[key])
250
+
251
+ if n_layer is None:
252
+ exc = KeyError(
253
+ f"GGUF file metadata does not specify n_layer"
254
+ )
255
+ exc.add_note(
256
+ f"GGUF file is at {self._model_path!r}"
257
+ )
258
+ raise exc
172
259
 
173
- if 2 <= context_length/n_ctx_train < 4:
174
- print_warning(
175
- 'loading model with 2x native context length or more, '
176
- 'expect small loss of quality'
177
- )
178
-
179
- elif 4 <= context_length/n_ctx_train < 8:
180
- print_warning(
181
- 'loading model with 4x native context length or more, '
182
- 'expect moderate loss of quality'
183
- )
260
+ if n_ctx_train is None:
261
+ exc = KeyError(
262
+ f"GGUF file metadata does not specify a context length"
263
+ )
264
+ exc.add_note(
265
+ f"GGUF file is at {self._model_path!r}"
266
+ )
267
+ raise exc
268
+
269
+ if n_attn_heads is not None and n_kv_heads is not None:
270
+ n_gqa = int(n_attn_heads / n_kv_heads)
271
+
272
+ if context_length <= 0:
273
+ context_length = None
274
+
275
+ rope_freq_base = __class__._calculate_rope_freq_base(
276
+ n_ctx_train,
277
+ context_length if context_length is not None else n_ctx_train,
278
+ rope_freq_base_train
279
+ )
184
280
 
185
- elif context_length/n_ctx_train >= 8:
281
+ if context_length is None:
282
+ if n_ctx_train > 32768:
186
283
  print_warning(
187
- 'loading model with 8x native context length or more, '
188
- 'expect SIGNIFICANT loss of quality'
284
+ f"you did not specify a context length, and the native "
285
+ f"context length of this model is very large "
286
+ f"({n_ctx_train}). defaulting to 32768 to avoid "
287
+ f"out-of-memory errors. you should specify a higher "
288
+ f"context length if you need it"
189
289
  )
290
+ self.context_length = self.n_ctx = 32768
291
+ else:
292
+ self.context_length = self.n_ctx = n_ctx_train
293
+
294
+ elif context_length <= n_ctx_train:
295
+ self.context_length = self.n_ctx = context_length
190
296
 
191
- try:
192
- self.tokens: list[str] = self.metadata['tokenizer.ggml.tokens']
193
- except KeyError:
194
- print_warning(
195
- "could not set Model.tokens, defaulting to None"
196
- )
197
- self.tokens = None
198
- try:
199
- self.bos_token: int = self.metadata['tokenizer.ggml.bos_token_id']
200
- except KeyError:
201
- print_warning(
202
- "could not set Model.bos_token, defaulting to None"
203
- )
204
- self.bos_token = None
205
- try:
206
- self.eos_token: int = self.metadata['tokenizer.ggml.eos_token_id']
207
- except KeyError:
297
+ elif context_length > n_ctx_train:
208
298
  print_warning(
209
- "could not set Model.eos_token, defaulting to None"
299
+ f"you have specified a context length that is greater than "
300
+ f"the natively supported context length of this model "
301
+ f"({context_length} > {n_ctx_train}). the model will still "
302
+ f"work, but the quality of output may be subpar. consider "
303
+ f"decreasing the context length to {n_ctx_train} or lower "
304
+ f"for best results"
210
305
  )
211
- self.eos_token = None
306
+ self.context_length = self.n_ctx = context_length
307
+
308
+ else:
309
+ raise UnreachableException
212
310
 
213
- cpu_count = os_cpu_count()
311
+ cpu_count = int(os.cpu_count()) # only read once
312
+
313
+ if n_gpu_layers < 0 or n_gpu_layers > n_layer:
314
+ n_gpu_layers = n_layer
315
+
316
+ if n_gpu_layers == n_layer:
317
+ # fully offloaded
318
+ n_batch = 1024
319
+ else:
320
+ # partially offloaded
321
+ n_batch = 512
322
+
323
+ # NOTE: the optimal n_threads value (for text generation) is equal
324
+ # to the number of physical cores (for homogenous CPUs) or
325
+ # to the number of performance cores (for heterogenous CPUs)
326
+ #
327
+ # the optimal n_threads_batch value (for prompt eval) is equal
328
+ # to the total number of logical cores, regardless of
329
+ # their type
214
330
 
215
- # these values for n_threads and n_threads_batch are
216
- # known to be optimal for most systems
217
- n_batch = 512 # can this be optimized?
218
331
  n_threads = max(cpu_count//2, 1)
219
332
  n_threads_batch = cpu_count
220
333
 
221
334
  if flash_attn and n_gpu_layers == 0:
335
+ flash_attn = False
222
336
  print_warning(
223
337
  "disabling flash_attn because n_gpu_layers == 0"
224
338
  )
225
- flash_attn = False
339
+
340
+ if quantize_kv_cache:
341
+ # use q8_0 for K, V
342
+ if flash_attn:
343
+ type_k = 8
344
+ type_v = 8
345
+ if verbose:
346
+ print_verbose(
347
+ "using q8_0 KV cache"
348
+ )
349
+ else: # llama.cpp requires flash_attn for V quantization
350
+ type_k = 8
351
+ type_v = 1
352
+ if verbose:
353
+ print_verbose(
354
+ "using q8_0 K cache, f16 V cache"
355
+ )
356
+ print_verbose(
357
+ "to quantize V cache, flash_attn must be enabled"
358
+ )
359
+ else:
360
+ # use f16 for K, V (default)
361
+ type_k = 1
362
+ type_v = 1
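
The branch above maps `quantize_kv_cache` onto llama.cpp cache types (1 == f16, 8 == q8_0), and only quantizes the V cache when Flash Attention is enabled. A sketch of requesting this from the constructor (the path and full GPU offload are assumptions):

m = Model(
    '/path/to/model.gguf',
    n_gpu_layers=-1,         # offload all layers
    flash_attn=True,         # required by llama.cpp for a quantized V cache
    quantize_kv_cache=True   # K and V caches stored as q8_0
)
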
226
363
 
227
364
  # guard against models with no rope_freq_base
228
365
  if rope_freq_base is None:
229
366
  rope_freq_base = 0
367
+
368
+ if verbose:
369
+ print_verbose(
370
+ f"attempting to load model, offloading "
371
+ f"{n_gpu_layers}/{n_layer} layers..."
372
+ )
373
+
374
+ # llama.cpp needs -ngl set to `-1`, not just n_layer
375
+ if n_gpu_layers >= n_layer:
376
+ _llama_ngl = -1
377
+ else:
378
+ _llama_ngl = n_gpu_layers
230
379
 
231
- self.llama: Llama = Llama(
380
+ self.llama = Llama(
232
381
  model_path=model_path,
233
382
  n_ctx=self.context_length,
234
- n_gpu_layers=n_gpu_layers,
383
+ n_gpu_layers=_llama_ngl,
235
384
  use_mmap=True,
236
385
  use_mlock=False,
237
386
  logits_all=False,
@@ -242,222 +391,511 @@ class Model:
242
391
  mul_mat_q=True,
243
392
  offload_kqv=offload_kqv,
244
393
  flash_attn=flash_attn,
245
- # KV cache quantization
246
- # use 1 for F16 (default), 8 for q8_0, 2 for q4_0, 3 for q4_1
247
- #type_k=8,
248
- #type_v=8,
249
- verbose=verbose
394
+ type_k=type_k,
395
+ type_v=type_v,
396
+ verbose=_debug
250
397
  )
251
398
 
252
- # once model is loaded, replace metadata (as read using internal class)
253
- # with metadata (as read using the more robust llama-cpp-python code)
254
- self.metadata = self.llama.metadata
399
+ # NOTE: llama.cpp uses the nearest multiple of 32 as the actual
400
+ # context length. here we update self.context_length to reflect
401
+ # this
402
+ self.context_length = self.n_ctx = self.llama.n_ctx()
255
403
 
256
- # expose these values because they may be useful / informative
257
- self.n_ctx_train = n_ctx_train
258
- self.rope_freq_base_train = rope_freq_base_train
259
- self.rope_freq_base = rope_freq_base
260
- self.flash_attn = flash_attn
404
+ if self.n_ctx < 512:
405
+ print_warning(
406
+ f'the currently loaded context length is less than 512 tokens '
407
+ f'({self.n_ctx} < 512). sometimes this can cause problems in '
408
+ f'llama.cpp. consider increasing the context length to at '
409
+ f'least 512 tokens'
410
+ )
261
411
 
262
- if self.verbose:
263
- print_verbose("new Model instance with the following attributes:")
264
- print_verbose(f"model: {model_path}")
265
- print_verbose(f"param: n_gpu_layers == {n_gpu_layers}")
266
- print_verbose(f"param: offload_kqv == {offload_kqv}")
267
- print_verbose(f"param: flash_attn == {flash_attn}")
268
- print_verbose(f"param: n_batch == {n_batch}")
269
- print_verbose(f"param: n_threads == {n_threads}")
270
- print_verbose(f"param: n_threads_batch == {n_threads_batch}")
271
- print_verbose(f" gguf: n_ctx_train == {n_ctx_train}")
272
- print_verbose(f"param: self.context_length == {self.context_length}")
273
- print_verbose(f" gguf: rope_freq_base_train == {rope_freq_base_train}")
274
- print_verbose(f"param: rope_freq_base == {rope_freq_base}")
275
- def register_tool(self, name: str, function: Callable):
276
- """Registers a tool for function calling."""
277
- self.tools[name] = function
278
-
279
- def _extract_tool_code(self, text: str) -> dict:
280
- """Extracts tool code from the model's output."""
281
412
  try:
282
- start = text.find(self.tool_code_start) + len(self.tool_code_start)
283
- end = text.find(self.tool_code_end)
284
- tool_code_json = text[start:end]
285
- tool_code = json.loads(tool_code_json)
286
- return tool_code
287
- except (ValueError, json.JSONDecodeError):
288
- return None
289
- def _should_call_tool(self, response_text: str) -> bool:
290
- """Determines if the model suggests a tool call."""
291
- # Simple check for tool code markers in response
292
- return self.tool_code_start in response_text and self.tool_code_end in response_text
293
- def generate(
294
- self,
295
- prompt: Union[str, list[int]],
296
- stops: list[Union[str, int]] = [],
297
- sampler: SamplerSettings = DefaultSampling,
298
- max_iterations: int = 3, # Maximum iterations for tool calls
299
- ) -> str:
300
- """
301
- Generates text and handles tool calls.
413
+ self.vocab: list[str] = self.metadata['tokenizer.ggml.tokens']
414
+ except (KeyError, TypeError, ValueError):
415
+ print_warning(
416
+ "could not set Model.vocab, constructing manually..."
417
+ )
418
+ self.vocab = [
419
+ self.llama._model.detokenize([i], special=True).decode(
420
+ 'utf-8', errors='ignore'
421
+ ) for i in range(self.llama._model.n_vocab())
422
+ ]
423
+ try:
424
+ self.bos_token = int(self.metadata['tokenizer.ggml.bos_token_id'])
425
+ except (KeyError, TypeError, ValueError):
426
+ self.bos_token = int(self.llama._model.token_bos())
427
+ if self.bos_token < 0:
428
+ self.bos_token = None
429
+ print_warning(
430
+ "could not set Model.bos_token, defaulting to None"
431
+ )
432
+ try:
433
+ self.eos_token = int(self.metadata['tokenizer.ggml.eos_token_id'])
434
+ except (KeyError, TypeError, ValueError):
435
+ self.eos_token = int(self.llama._model.token_eos())
436
+ if self.eos_token < 0:
437
+ self.eos_token = None
438
+ print_warning(
439
+ "could not set Model.eos_token, defaulting to None"
440
+ )
441
+
442
+ # These special tokens are optional
443
+
444
+ self.eot_token = int(self.llama._model.token_eot())
445
+ if self.eot_token < 0:
446
+ self.eot_token = None
447
+
448
+ self.nl_token = int(self.llama._model.token_nl())
449
+ if self.nl_token < 0:
450
+ self.nl_token = None
302
451
 
303
- Args:
304
- prompt (Union[str, list[int]]): The input prompt.
305
- stops (list[Union[str, int]]): Stop sequences.
306
- sampler (SamplerSettings): Sampler settings.
307
- max_iterations (int): Maximum number of tool call iterations.
452
+ self.prefix_token = int(self.llama._model.token_prefix())
453
+ if self.prefix_token < 0:
454
+ self.prefix_token = None
455
+
456
+ self.middle_token = int(self.llama._model.token_middle())
457
+ if self.middle_token < 0:
458
+ self.middle_token = None
459
+
460
+ self.suffix_token = int(self.llama._model.token_suffix())
461
+ if self.suffix_token < 0:
462
+ self.suffix_token = None
463
+
464
+ self.cls_token = int(self.llama._model.token_cls())
465
+ if self.cls_token < 0:
466
+ self.cls_token = None
467
+
468
+ self.sep_token = int(self.llama._model.token_sep())
469
+ if self.sep_token < 0:
470
+ self.sep_token = None
471
+
472
+ # Misc. attributes
473
+ _add_bos_token = self.llama._model.add_bos_token()
474
+ if _add_bos_token == 1:
475
+ self.add_bos_token = True
476
+ elif _add_bos_token == 0:
477
+ self.add_bos_token = False
478
+ else:
479
+ self.add_bos_token = None
480
+ print_warning(
481
+ "Model.add_bos_token is unknown, defaulting to None"
482
+ )
483
+
484
+ _add_eos_token = self.llama._model.add_eos_token()
485
+ if _add_eos_token == 1:
486
+ self.add_eos_token = True
487
+ elif _add_eos_token == 0:
488
+ self.add_eos_token = False
489
+ else:
490
+ self.add_eos_token = None
491
+ print_warning(
492
+ "Model.add_eos_token is unknown, defaulting to None"
493
+ )
308
494
 
309
- Returns:
310
- str: The generated text.
495
+ self.filename: str = os.path.basename(model_path)
496
+ self.n_ctx_train: int = n_ctx_train
497
+ self.rope_freq_base_train: float = rope_freq_base_train
498
+ self.rope_freq_base: float = rope_freq_base
499
+ self.n_batch: int = n_batch
500
+ self.n_threads: int = n_threads
501
+ self.n_threads_batch: int = n_threads_batch
502
+ self.flash_attn: bool = flash_attn
503
+ self.n_embd = self.llama._model.n_embd()
504
+ self.n_params = self.llama._model.n_params()
505
+ self.bpw = (8*self._model_file_size_bytes)/self.n_params
506
+ self.n_vocab: int = len(self.vocab)
507
+ self.n_layer: int = n_layer
508
+ self.n_gpu_layers: int = n_gpu_layers
509
+ self.offload_kqv = offload_kqv
510
+ self.is_native: bool = self.context_length <= self.n_ctx_train
511
+ self.type_k: int = type_k
512
+ self.type_v: int = type_v
513
+ self.n_gqa: int = n_gqa
514
+
515
+ if verbose:
516
+ print_verbose(
517
+ f"{'new' if '__uuid' not in _kwargs_keys else 'reloaded'} "
518
+ f"Model instance with the following attributes:"
519
+ )
520
+ print_verbose(f" uuid == {self.uuid}")
521
+ print_verbose(f" filename == {self.filename}")
522
+ print_verbose(f" n_params == {self.n_params}")
523
+ print_verbose(
524
+ f" bpw == {self.bpw} "
525
+ f"({__class__._get_bpw_quality_hint(self.bpw)})"
526
+ )
527
+ print_verbose(f" n_gpu_layers == {self.n_gpu_layers}")
528
+ print_verbose(f" n_layer == {self.n_layer}")
529
+ print_verbose(f" offload_kqv == {self.offload_kqv}")
530
+ print_verbose(f" flash_attn == {self.flash_attn}")
531
+ print_verbose(f" n_gqa == {self.n_gqa}")
532
+ print_verbose(
533
+ f" type_k == {self.type_k} "
534
+ f"({'f16' if self.type_k == 1 else 'q8_0'})"
535
+ )
536
+ print_verbose(
537
+ f" type_v == {self.type_v} "
538
+ f"({'f16' if self.type_v == 1 else 'q8_0'})"
539
+ )
540
+ print_verbose(f" n_batch == {self.n_batch}")
541
+ print_verbose(
542
+ f" n_threads == {self.n_threads}/{cpu_count}"
543
+ )
544
+ print_verbose(
545
+ f" n_threads_batch == {self.n_threads_batch}/{cpu_count}"
546
+ )
547
+ print_verbose(f" n_ctx_train == {self.n_ctx_train}")
548
+ print_verbose(f" n_ctx == {self.n_ctx}")
549
+ print_verbose(f" rope_freq_base_train == {self.rope_freq_base_train}")
550
+ print_verbose(f" rope_freq_base == {self.rope_freq_base}")
551
+ print_verbose(f" n_embd == {self.n_embd}")
552
+ print_verbose(f" n_vocab == {self.n_vocab}")
553
+ print_verbose(f" bos_token == {self.bos_token}")
554
+ print_verbose(f" eos_token == {self.eos_token}")
555
+ if self.eot_token is not None:
556
+ print_verbose(f" eot_token == {self.eot_token}")
557
+ if self.nl_token is not None:
558
+ print_verbose(f" nl_token == {self.nl_token}")
559
+ if self.prefix_token is not None:
560
+ print_verbose(f" prefix_token == {self.prefix_token}")
561
+ if self.middle_token is not None:
562
+ print_verbose(f" middle_token == {self.middle_token}")
563
+ if self.suffix_token is not None:
564
+ print_verbose(f" suffix_token == {self.suffix_token}")
565
+ if self.cls_token is not None:
566
+ print_verbose(f" cls_token == {self.cls_token}")
567
+ if self.sep_token is not None:
568
+ print_verbose(f" sep_token == {self.sep_token}")
569
+ print_verbose(f" add_bos_token == {self.add_bos_token}")
570
+ print_verbose(f" add_eos_token == {self.add_eos_token}")
571
+
572
+
573
+ @staticmethod
574
+ def _calculate_rope_freq_base(
575
+ n_ctx_train: int,
576
+ n_ctx_load: int,
577
+ rope_freq_base_train: Optional[float]
578
+ ) -> float:
311
579
  """
312
- assert_model_is_loaded(self)
313
- response_text = self.llama.create_completion(
314
- prompt,
315
- max_tokens=sampler.max_len_tokens,
316
- temperature=sampler.temp,
317
- top_p=sampler.top_p,
318
- min_p=sampler.min_p,
319
- frequency_penalty=sampler.frequency_penalty,
320
- presence_penalty=sampler.presence_penalty,
321
- repeat_penalty=sampler.repeat_penalty,
322
- top_k=sampler.top_k,
323
- stop=stops
324
- )['choices'][0]['text']
580
+ Returns the rope_freq_base (theta) value at which model should be loaded
581
+ """
582
+ assert_type(n_ctx_train, int, 'n_ctx_train', '_calculate_rope_freq_base')
583
+ assert_type(n_ctx_load, int, 'n_ctx_load', '_calculate_rope_freq_base')
584
+ assert_type(rope_freq_base_train, (float, NoneType),
585
+ 'rope_freq_base_train', '_calculate_rope_freq_base')
586
+
587
+ if n_ctx_load <= n_ctx_train:
588
+ if rope_freq_base_train is None:
589
+ return 0.0
590
+ else:
591
+ return rope_freq_base_train
592
+
593
+ if rope_freq_base_train is None or rope_freq_base_train == 0.0:
594
+ raise ValueError(
595
+ 'unable to load model with greater than native '
596
+ f'context length ({n_ctx_load} > {n_ctx_train}) '
597
+ 'because model does not specify rope_freq_base. '
598
+ f'try again with context_length <= {n_ctx_train}'
599
+ )
600
+
601
+ return ((n_ctx_load/n_ctx_train)**(2**(1/4)))*rope_freq_base_train
602
+
603
+ # traditional formula:
604
+ # return (n_ctx_load/n_ctx_train)*rope_freq_base_train
605
+ # experimental formula A:
606
+ # return ((n_ctx_load/n_ctx_train)**2)*rope_freq_base_train
607
+ # experimental formula B:
608
+ # return ((n_ctx_load/n_ctx_train)**(2**(1/4)))*rope_freq_base_train
609
+
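
A numeric illustration of the formula returned above, with hypothetical values:

n_ctx_train = 8192
n_ctx_load = 16384
rope_freq_base_train = 10000.0
rope_freq_base = ((n_ctx_load / n_ctx_train) ** (2 ** (1 / 4))) * rope_freq_base_train
# -> about 22800.0 (the traditional linear formula would give 20000.0)
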
325
610
 
326
- iteration = 0
327
- while self._should_call_tool(response_text) and iteration < max_iterations:
328
- tool_code = self._extract_tool_code(response_text)
329
- if tool_code:
330
- tool_name = tool_code.get("function", {}).get("name")
331
- arguments = tool_code.get("function", {}).get("arguments", "")
332
- if tool_name and arguments and tool_name in self.tools:
333
- # Execute the tool and append its output
334
- tool_output = self.tools[tool_name](**json.loads(arguments))
335
- response_text = response_text.replace(
336
- f"{self.tool_code_start}{json.dumps(tool_code)}{self.tool_code_end}",
337
- tool_output
338
- )
339
- iteration += 1
611
+ @staticmethod
612
+ def _get_bpw_quality_hint(bpw: float) -> str:
613
+ if 0.0 < bpw < 2.0:
614
+ return 'terrible'
615
+ elif 2.0 <= bpw < 4.0:
616
+ return 'bad'
617
+ elif 4.0 <= bpw < 5.0:
618
+ return 'good'
619
+ elif 5.0 <= bpw < 16.0:
620
+ return 'great'
621
+ elif bpw >= 16.0:
622
+ return 'native'
623
+ else:
624
+ raise UnreachableException
625
+
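
A worked example of the bits-per-weight figure these hints classify, using the same formula as `self.bpw` in `__init__` (file size and parameter count are hypothetical):

file_size_bytes = 4_900_000_000   # a 4.9 GB GGUF file
n_params = 8_000_000_000          # an 8-billion-parameter model
bpw = (8 * file_size_bytes) / n_params   # == 4.9 -> 'good'
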
626
+
627
+ @staticmethod
628
+ def _print_metadata(
629
+ metadata: dict,
630
+ file: _SupportsWriteAndFlush = sys.stderr
631
+ ) -> None:
632
+ max_len_key = max(len(k) for k in metadata.keys())
633
+ print(f'webscout.Local: read model metadata from GGUF file header:', file=file)
634
+ for k, v in metadata.items():
635
+ print(
636
+ f'webscout.Local: {k:<{max_len_key}} : {truncate(repr(v))}',
637
+ file=file
638
+ )
340
639
 
341
- return response_text
640
+
342
641
  def __repr__(self) -> str:
343
- return \
344
- f"Model({repr(self._model_path)}, " + \
345
- f"context_length={self._context_length}, " + \
346
- f"n_gpu_layers={self._n_gpu_layers}, " + \
347
- f"offload_kqv={self._offload_kqv}, "+ \
348
- f"flash_attn={self._flash_attn}, " + \
642
+ return (
643
+ f"Model({self._model_path!r}, "
644
+ f"context_length={self._context_length}, "
645
+ f"n_gpu_layers={self._n_gpu_layers}, "
646
+ f"offload_kqv={self._offload_kqv}, "
647
+ f"flash_attn={self._flash_attn}, "
648
+ f"quantize_kv_cache={self._quantize_kv_cache}, "
349
649
  f"verbose={self._verbose})"
650
+ )
651
+
652
+
653
+ def __sizeof__(self) -> int:
654
+ """Returns the size of the model file on disk, NOT the memory usage"""
655
+ return self._model_file_size_bytes
656
+
350
657
 
351
658
  def __del__(self):
659
+ if self.is_loaded():
352
660
  self.unload()
353
661
 
662
+
354
663
  def __enter__(self):
664
+ if not self.is_loaded():
665
+ self.load()
355
666
  return self
356
667
 
668
+
357
669
  def __exit__(self, *_):
358
- self.unload()
670
+ if self.is_loaded():
671
+ self.unload()
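
With `__enter__`/`__exit__` defined this way, a Model can be used as a context manager that is loaded inside the block and unloaded afterwards; a brief sketch (the path is a placeholder):

from webscout.Local.model import Model

with Model('/path/to/model.gguf') as m:
    print(m.generate('Hello'))
# the model is unloaded again when the block exits
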
359
672
 
673
+
360
674
  def __call__(
361
675
  self,
362
- prompt: Union[str, list[int]],
363
- stops: list[Union[str, int]] = [],
364
- sampler: SamplerSettings = DefaultSampling
676
+ prompt: str | list[int],
677
+ stops: Optional[list[str | int]] = None,
678
+ sampler: Optional[SamplerSettings] = None
365
679
  ) -> str:
366
680
  """
367
681
  `Model(...)` is a shorthand for `Model.generate(...)`
368
682
  """
369
- return self.generate(prompt, stops, sampler)
683
+ return self.generate(prompt=prompt, stops=stops, sampler=sampler)
684
+
685
+
686
+ def __eq__(self, value: object, /) -> bool:
687
+ if not isinstance(value, __class__):
688
+ return NotImplemented
689
+ if not (hasattr(self, 'uuid') and hasattr(value, 'uuid')):
690
+ raise AttributeError(
691
+ "At least one of the models being compared is missing the "
692
+ "`.uuid` attribute"
693
+ )
694
+ return self.uuid == value.uuid
695
+
696
+
697
+ def __hash__(self, /) -> int:
698
+ return hash(self.uuid)
699
+
370
700
 
371
701
  def unload(self):
372
702
  """
373
703
  Unload the model from memory
704
+
705
+ Does nothing if the model is not loaded
374
706
  """
375
- # ref: llama_cpp._internals._LlamaModel.__del__()
376
- if not hasattr(self, 'llama'):
377
- # nothing can be done
378
- return
379
- try:
380
- if self.llama._model.model is not None:
381
- # actually unload the model from memory
382
- self.llama._model._llama_free_model(self.llama._model.model)
383
- self.llama._model.model = None
384
- except AttributeError:
385
- # broken or already being destroyed by GC, abort
707
+ if not self.is_loaded():
708
+ if self.verbose:
709
+ print_verbose('model already unloaded')
386
710
  return
387
- if hasattr(self, 'llama'):
711
+
712
+ if self.verbose:
713
+ print_verbose('unloading model...')
714
+
715
+ self.llama.close()
716
+
717
+ while hasattr(self, 'llama'):
388
718
  delattr(self, 'llama')
719
+
389
720
  if self.verbose:
390
- print_verbose('Model unloaded')
721
+ print_verbose('model unloaded')
391
722
 
392
- def trim(
723
+
724
+ def reload(
393
725
  self,
394
- text: str,
395
- overwrite: Optional[str] = None
396
- ) -> str:
726
+ context_length: Optional[int] = None,
727
+ n_gpu_layers: Optional[int] = None,
728
+ offload_kqv: Optional[bool] = None,
729
+ flash_attn: Optional[bool] = None,
730
+ quantize_kv_cache: Optional[bool] = None,
731
+ verbose: Optional[bool] = None
732
+ ):
733
+ """
734
+ Re-load the model into memory using the specified parameters
735
+
736
+ Any parameters unspecified will be unchanged
737
+ """
738
+ __uuid = self.uuid
739
+ self.unload()
740
+ self.__init__(
741
+ model_path = self._model_path,
742
+ context_length = (
743
+ self._context_length if context_length is None
744
+ else context_length
745
+ ),
746
+ n_gpu_layers = (
747
+ self._n_gpu_layers if n_gpu_layers is None
748
+ else n_gpu_layers
749
+ ),
750
+ offload_kqv = (
751
+ self._offload_kqv if offload_kqv is None
752
+ else offload_kqv
753
+ ),
754
+ flash_attn = (
755
+ self._flash_attn if flash_attn is None
756
+ else flash_attn
757
+ ),
758
+ quantize_kv_cache = (
759
+ self._quantize_kv_cache if quantize_kv_cache is None
760
+ else quantize_kv_cache
761
+ ),
762
+ verbose = (
763
+ self._verbose if verbose is None
764
+ else verbose
765
+ ),
766
+ __uuid = __uuid # do not change UUID on reload
767
+ )
768
+ assert_model_is_loaded(self)
769
+
397
770
 
771
+ def load(self) -> None:
398
772
  """
399
- Trim the given text to the context length of this model,
400
- leaving room for two extra tokens.
773
+ Load the model into memory
774
+
775
+ Does nothing if already loaded
776
+ """
777
+ if self.is_loaded():
778
+ if self.verbose:
779
+ print_verbose('model already loaded')
780
+ else:
781
+ self.reload()
782
+
401
783
 
402
- Optionally overwrite the oldest tokens with the text given in the
403
- `overwrite` parameter, which may be useful for keeping some
404
- information in context.
784
+ def is_loaded(self) -> bool:
785
+ """
786
+ Return `True` if the model is fully loaded, `False` otherwise
787
+ """
788
+ try:
789
+ assert_model_is_loaded(self)
790
+ except exceptions.ModelUnloadedException:
791
+ return False
792
+ else:
793
+ return True
794
+
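
A sketch of the load/unload/reload lifecycle defined above, together with the `do_not_load` keyword from `__init__` (the path and parameter values are illustrative):

m = Model('/path/to/model.gguf', context_length=2048, do_not_load=True)
print(m.is_loaded())            # False: only the parameters were saved
m.load()                        # load with the saved parameters
m.reload(context_length=4096)   # re-load, changing only the context length
m.unload()
print(m.is_loaded())            # False
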
405
795
 
406
- Does nothing if the text is equal to or shorter than
407
- (context_length - 2).
796
+ def tokenize(self, text: str) -> list[int]:
797
+ """
798
+ Tokenize the given text (from `str` to `list[int]`)
408
799
  """
800
+ assert_type(text, str, 'text', 'tokenize')
409
801
  assert_model_is_loaded(self)
410
- trim_length = self.context_length - 2
411
- tokens_list = self.llama.tokenize(
412
- text.encode("utf-8", errors="ignore")
802
+ tokens = self.llama._model.tokenize(
803
+ text.encode('utf-8'),
804
+ add_bos=(
805
+ self.add_bos_token if self.add_bos_token is not None
806
+ else True
807
+ ),
808
+ special=True
413
809
  )
810
+ # remove duplicate BOS tokens at the start of the text
811
+ while len(tokens) >= 2 and tokens[0] == self.bos_token and tokens[1] == self.bos_token:
812
+ tokens.pop(0)
813
+ if self.verbose:
814
+ print_verbose("tokenize: removed duplicate BOS token")
815
+ # remove duplicate EOS tokens at the end of the text
816
+ while len(tokens) >= 2 and tokens[-1] == self.eos_token and tokens[-2] == self.eos_token:
817
+ tokens.pop(-1)
818
+ if self.verbose:
819
+ print_verbose("tokenize: removed duplicate EOS token")
820
+ return tokens
821
+
414
822
 
415
- if len(tokens_list) <= trim_length:
416
- if overwrite is not None:
417
- text[0 : len(overwrite)] = overwrite
418
- return text
419
-
420
- if len(tokens_list) > trim_length and overwrite is None:
421
- # cut to trim_length
422
- tokens_list = tokens_list[-trim_length:]
423
- return self.llama.detokenize(tokens_list).decode(
424
- "utf-8",
425
- errors="ignore"
426
- )
427
-
428
- if len(tokens_list) > trim_length and overwrite is not None:
429
- # cut to trim_length
430
- tokens_list = tokens_list[-trim_length:]
431
- overwrite_tokens = self.llama.tokenize(overwrite.encode(
432
- "utf-8",
433
- errors="ignore"
823
+ def detokenize(self, tokens: list[int] | int) -> str:
824
+ """
825
+ Detokenize the given text (from `int` or `list[int]` to `str`)
826
+ """
827
+ assert_type(tokens, (list, int), 'tokens', 'detokenize')
828
+ if isinstance(tokens, int):
829
+ tokens = [tokens] # handle single tokens
830
+ for tok_id in tokens:
831
+ if not 0 <= tok_id < self.n_vocab:
832
+ raise ValueError(
833
+ f"detokenize: token id {tok_id} is out of range. "
834
+ f"acceptable values for this model are between 0 and "
835
+ f"{self.n_vocab-1} inclusive"
434
836
  )
435
- )
436
- # overwrite oldest tokens
437
- tokens_list[0 : len(overwrite_tokens)] = overwrite_tokens
438
- return self.llama.detokenize(tokens_list).decode(
439
- "utf-8",
440
- errors="ignore"
441
- )
837
+ # remove duplicate BOS tokens at the start of the text
838
+ while len(tokens) >= 2 and tokens[0] == self.bos_token and tokens[1] == self.bos_token:
839
+ tokens.pop(0)
840
+ if self.verbose:
841
+ print_verbose("detokenize: removed duplicate BOS token")
842
+ # remove duplicate EOS tokens at the end of the text
843
+ while len(tokens) >= 2 and tokens[-1] == self.eos_token and tokens[-2] == self.eos_token:
844
+ tokens.pop(-1)
845
+ if self.verbose:
846
+ print_verbose("detokenize: removed duplicate EOS token")
847
+ assert_model_is_loaded(self)
848
+ return self.llama._model.detokenize(
849
+ tokens,
850
+ special=True
851
+ ).decode('utf-8', errors='ignore')
852
+
442
853
 
443
854
  def get_length(self, text: str) -> int:
444
855
  """
445
- Return the length of the given text in tokens according to this model,
446
- including the appended BOS token.
856
+ Return the length of the given text as measured in tokens
447
857
  """
448
- assert_model_is_loaded(self)
449
- return len(self.llama.tokenize(
450
- text.encode(
451
- "utf-8",
452
- errors="ignore"
453
- )
454
- ))
858
+ return len(self.tokenize(text))
859
+
860
+
861
+ def get_tokenization_mapping(
862
+ self,
863
+ text: str
864
+ ) -> list[tuple[int, str]]:
865
+ """
866
+ Tokenize the given text and return a list of tuples where the first
867
+ item in the tuple is the token ID and the second item is the
868
+ corresponding text
869
+ """
870
+ token_id_list: list[int] = self.tokenize(text)
871
+
872
+ return list(
873
+ zip(
874
+ token_id_list,
875
+ [self.detokenize(tok_id) for tok_id in token_id_list]
876
+ )
877
+ )
878
+
455
879
 
880
+ def print_tokenization_mapping(self, text: str) -> None:
881
+ """
882
+ Tokenize the given text and display a mapping of each
883
+ token ID and its corresponding decoded text
884
+
885
+ This is meant to be equivalent to `llama.cpp/llama-tokenize`
886
+ """
887
+ token_mapping_list = self.get_tokenization_mapping(text)
888
+
889
+ for token_id, token_text in token_mapping_list:
890
+ print(f"{token_id:>7} -> '{token_text}'")
891
+ print(f"Total number of tokens: {len(token_mapping_list)}")
892
+
893
+
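
A short sketch of the tokenization helpers above (the token IDs shown are hypothetical and depend on the model's tokenizer; `m` is assumed to be a loaded Model instance):

ids = m.tokenize('Hello world')            # e.g. [1, 15043, 3186]
text = m.detokenize(ids)                   # back to a str
print(m.get_length('Hello world'))         # same as len(ids)
m.print_tokenization_mapping('Hello world')
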
456
894
  def generate(
457
895
  self,
458
- prompt: Union[str, list[int]],
459
- stops: list[Union[str, int]] = [],
460
- sampler: SamplerSettings = DefaultSampling
896
+ prompt: str | list[int],
897
+ stops: Optional[list[str | int]] = None,
898
+ sampler: Optional[SamplerSettings] = None
461
899
  ) -> str:
462
900
  """
463
901
  Given a prompt, return a generated string.
@@ -468,56 +906,76 @@ class Model:
468
906
  - stops: A list of strings and/or token IDs at which to end the generation early
469
907
  - sampler: The SamplerSettings object used to control text generation
470
908
  """
909
+
910
+ stops = [] if stops is None else stops
911
+ assert_type(stops, list, 'stops', 'generate')
912
+ for item in stops:
913
+ assert_type(
914
+ item,
915
+ (str, int),
916
+ "some item in parameter 'stops'",
917
+ 'generate'
918
+ )
919
+
920
+ sampler = SamplerSettings() if sampler is None else sampler
471
921
 
472
- assert isinstance(prompt, (str, list)), \
473
- f"generate: prompt should be string or list[int], not {type(prompt)}"
922
+ if sampler.temp < 0.0:
923
+ print_warning(
924
+ f'generate: using negative temperature value {sampler.temp}'
925
+ )
926
+
927
+ assert_type(prompt, (str, list), 'prompt', 'generate')
474
928
  if isinstance(prompt, list):
475
- assert all(isinstance(tok, int) for tok in prompt), \
476
- "generate: some token in prompt is not an integer"
477
- assert isinstance(stops, list), \
478
- f"generate: parameter `stops` should be a list, not {type(stops)}"
479
- assert all(isinstance(item, (str, int)) for item in stops), \
480
- f"generate: some item in parameter `stops` is not a string or int"
929
+ prompt_tokens = prompt
930
+ else:
931
+ if self.verbose:
932
+ print_verbose(
933
+ "generate: tokenizing prompt"
934
+ )
935
+ prompt_tokens = self.tokenize(prompt)
936
+
937
+ input_length = len(prompt_tokens)
938
+
939
+ if input_length > self.context_length:
940
+ print(f'webscout.Local: raw input: {prompt_tokens}')
941
+ raise exceptions.ExceededContextLengthException(
942
+ f"generate: length of input exceeds model's context length "
943
+ f"({input_length} > {self.context_length})"
944
+ )
945
+ elif input_length == self.context_length:
946
+ print(f'webscout.Local: raw input: {prompt_tokens}')
947
+ raise exceptions.ExceededContextLengthException(
948
+ f"generate: length of input is equal to model's context "
949
+ f"length ({input_length} == {self.context_length}). this "
950
+ f"leaves no room for any new tokens to be generated"
951
+ )
952
+ elif self.verbose:
953
+ print_verbose(
954
+ f"generate: received prompt with {input_length} tokens"
955
+ )
956
+
957
+ stop_strs: list[str] = [stop for stop in stops if isinstance(stop, str)]
958
+ stop_token_ids: list[int] = [tok_id for tok_id in stops if isinstance(tok_id, int)]
959
+ stopping_criteria = None
960
+ if stop_token_ids != []:
961
+ def stop_on_token_ids(tokens, *args, **kwargs):
962
+ return tokens[-1] in stop_token_ids
963
+ stopping_criteria = StoppingCriteriaList([stop_on_token_ids])
481
964
 
482
965
  if self.verbose:
483
- print_verbose(f'using the following sampler settings for Model.generate:')
966
+ print_verbose(f'generate: using the following sampler settings:')
484
967
  print_verbose(f'max_len_tokens == {sampler.max_len_tokens}')
485
- print_verbose(f'temp == {sampler.temp}')
968
+ print_verbose(f'top_k == {sampler.top_k}')
486
969
  print_verbose(f'top_p == {sampler.top_p}')
487
970
  print_verbose(f'min_p == {sampler.min_p}')
971
+ print_verbose(f'temp == {sampler.temp}')
488
972
  print_verbose(f'frequency_penalty == {sampler.frequency_penalty}')
489
973
  print_verbose(f'presence_penalty == {sampler.presence_penalty}')
490
974
  print_verbose(f'repeat_penalty == {sampler.repeat_penalty}')
491
- print_verbose(f'top_k == {sampler.top_k}')
492
975
 
493
- # if any stop item is a token ID (int)
494
- if any(isinstance(stop, int) for stop in stops):
495
- # stop_strs is a list of all stopping strings
496
- stop_strs: list[str] = [stop for stop in stops if isinstance(stop, str)]
497
- # stop_token_ids is a list of all stop token IDs
498
- stop_token_ids: list[int] = [tok_id for tok_id in stops if isinstance(tok_id, int)]
499
- def stop_on_token_ids(tokens, *args, **kwargs):
500
- return tokens[-1] in stop_token_ids
501
- stopping_criteria = StoppingCriteriaList([stop_on_token_ids])
502
- assert_model_is_loaded(self)
503
- return self.llama.create_completion(
504
- prompt,
505
- max_tokens=sampler.max_len_tokens,
506
- temperature=sampler.temp,
507
- top_p=sampler.top_p,
508
- min_p=sampler.min_p,
509
- frequency_penalty=sampler.frequency_penalty,
510
- presence_penalty=sampler.presence_penalty,
511
- repeat_penalty=sampler.repeat_penalty,
512
- top_k=sampler.top_k,
513
- stop=stop_strs,
514
- stopping_criteria=stopping_criteria
515
- )['choices'][0]['text']
516
-
517
- # if stop items are only strings
518
976
  assert_model_is_loaded(self)
519
977
  return self.llama.create_completion(
520
- prompt,
978
+ prompt=prompt_tokens,
521
979
  max_tokens=sampler.max_len_tokens,
522
980
  temperature=sampler.temp,
523
981
  top_p=sampler.top_p,
@@ -526,17 +984,17 @@ class Model:
526
984
  presence_penalty=sampler.presence_penalty,
527
985
  repeat_penalty=sampler.repeat_penalty,
528
986
  top_k=sampler.top_k,
529
- stop=stops
987
+ stop=stop_strs,
988
+ stopping_criteria=stopping_criteria
530
989
  )['choices'][0]['text']
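
A sketch of calling the reworked `generate()` with mixed stop strings and stop token IDs (the prompt text is illustrative; `m` is assumed to be a loaded Model instance; a `SamplerSettings()` instance may also be passed via `sampler=`):

text = m.generate(
    'Q: What does GGUF stand for?\nA:',
    stops=['\nQ:', m.eos_token]   # stop strings and stop token IDs may be mixed
)
print(text)
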
531
990
 
532
991
 
533
992
  def stream(
534
993
  self,
535
- prompt: Union[str, list[int]],
536
- stops: list[Union[str, int]] = [],
537
- sampler: SamplerSettings = DefaultSampling
994
+ prompt: str | list[int],
995
+ stops: Optional[list[str | int]] = None,
996
+ sampler: Optional[SamplerSettings] = None
538
997
  ) -> Generator:
539
-
540
998
  """
541
999
  Given a prompt, return a Generator that yields dicts containing tokens.
542
1000
 
@@ -551,55 +1009,75 @@ class Model:
551
1009
  - sampler: The SamplerSettings object used to control text generation
552
1010
  """
553
1011
 
554
- assert isinstance(prompt, (str, list)), \
555
- f"stream: prompt should be string or list[int], not {type(prompt)}"
1012
+ stops = [] if stops is None else stops
1013
+ assert_type(stops, list, 'stops', 'stream')
1014
+ for item in stops:
1015
+ assert_type(
1016
+ item,
1017
+ (str, int),
1018
+ "some item in parameter 'stops'",
1019
+ 'stream'
1020
+ )
1021
+
1022
+ sampler = SamplerSettings() if sampler is None else sampler
1023
+
1024
+ if sampler.temp < 0.0:
1025
+ print_warning(
1026
+ f'stream: using negative temperature value {sampler.temp}'
1027
+ )
1028
+
1029
+ assert_type(prompt, (str, list), 'prompt', 'stream')
556
1030
  if isinstance(prompt, list):
557
- assert all(isinstance(tok, int) for tok in prompt), \
558
- "stream: some token in prompt is not an integer"
559
- assert isinstance(stops, list), \
560
- f"stream: parameter `stops` should be a list, not {type(stops)}"
561
- assert all(isinstance(item, (str, int)) for item in stops), \
562
- f"stream: some item in parameter `stops` is not a string or int"
1031
+ prompt_tokens = prompt
1032
+ else:
1033
+ if self.verbose:
1034
+ print_verbose(
1035
+ "stream: tokenizing prompt"
1036
+ )
1037
+ prompt_tokens = self.tokenize(prompt)
1038
+
1039
+ input_length = len(prompt_tokens)
1040
+
1041
+ if input_length > self.context_length:
1042
+ print(f'webscout.Local: raw input: {prompt_tokens}')
1043
+ raise exceptions.ExceededContextLengthException(
1044
+ f"stream: length of input exceeds model's context length "
1045
+ f"({input_length} > {self.context_length})"
1046
+ )
1047
+ elif input_length == self.context_length:
1048
+ print(f'webscout.Local: raw input: {prompt_tokens}')
1049
+ raise exceptions.ExceededContextLengthException(
1050
+ f"stream: length of input is equal to model's context "
1051
+ f"length ({input_length} == {self.context_length}). this "
1052
+ f"leaves no room for any new tokens to be generated"
1053
+ )
1054
+ elif self.verbose:
1055
+ print_verbose(
1056
+ f"stream: received prompt with {input_length} tokens"
1057
+ )
1058
+
1059
+ stop_strs: list[str] = [stop for stop in stops if isinstance(stop, str)]
1060
+ stop_token_ids: list[int] = [tok_id for tok_id in stops if isinstance(tok_id, int)]
1061
+ stopping_criteria = None
1062
+ if stop_token_ids != []:
1063
+ def stop_on_token_ids(tokens, *args, **kwargs):
1064
+ return tokens[-1] in stop_token_ids
1065
+ stopping_criteria = StoppingCriteriaList([stop_on_token_ids])
563
1066
 
564
1067
  if self.verbose:
565
- print_verbose(f'using the following sampler settings for Model.stream:')
1068
+ print_verbose(f'stream: using the following sampler settings:')
566
1069
  print_verbose(f'max_len_tokens == {sampler.max_len_tokens}')
567
- print_verbose(f'temp == {sampler.temp}')
1070
+ print_verbose(f'top_k == {sampler.top_k}')
568
1071
  print_verbose(f'top_p == {sampler.top_p}')
569
1072
  print_verbose(f'min_p == {sampler.min_p}')
1073
+ print_verbose(f'temp == {sampler.temp}')
570
1074
  print_verbose(f'frequency_penalty == {sampler.frequency_penalty}')
571
1075
  print_verbose(f'presence_penalty == {sampler.presence_penalty}')
572
1076
  print_verbose(f'repeat_penalty == {sampler.repeat_penalty}')
573
- print_verbose(f'top_k == {sampler.top_k}')
574
1077
 
575
- # if any stop item is a token ID (int)
576
- if any(isinstance(stop, int) for stop in stops):
577
- # stop_strs is a list of all stopping strings
578
- stop_strs: list[str] = [stop for stop in stops if isinstance(stop, str)]
579
- # stop_token_ids is a list of all stop token IDs
580
- stop_token_ids: list[int] = [tok_id for tok_id in stops if isinstance(tok_id, int)]
581
- def stop_on_token_ids(tokens, *args, **kwargs):
582
- return tokens[-1] in stop_token_ids
583
- stopping_criteria = StoppingCriteriaList([stop_on_token_ids])
584
- assert_model_is_loaded(self)
585
- return self.llama.create_completion(
586
- prompt,
587
- max_tokens=sampler.max_len_tokens,
588
- temperature=sampler.temp,
589
- top_p=sampler.top_p,
590
- min_p=sampler.min_p,
591
- frequency_penalty=sampler.frequency_penalty,
592
- presence_penalty=sampler.presence_penalty,
593
- repeat_penalty=sampler.repeat_penalty,
594
- top_k=sampler.top_k,
595
- stream=True,
596
- stop=stop_strs,
597
- stopping_criteria=stopping_criteria
598
- )
599
-
600
1078
  assert_model_is_loaded(self)
601
1079
  return self.llama.create_completion(
602
- prompt,
1080
+ prompt=prompt_tokens,
603
1081
  max_tokens=sampler.max_len_tokens,
604
1082
  temperature=sampler.temp,
605
1083
  top_p=sampler.top_p,
@@ -609,32 +1087,24 @@ class Model:
609
1087
  repeat_penalty=sampler.repeat_penalty,
610
1088
  top_k=sampler.top_k,
611
1089
  stream=True,
612
- stop=stops
1090
+ stop=stop_strs,
1091
+ stopping_criteria=stopping_criteria
613
1092
  )
614
1093
 
615
1094
 
616
1095
  def stream_print(
617
1096
  self,
618
- prompt: Union[str, list[int]],
619
- stops: list[Union[str, int]] = [],
620
- sampler: SamplerSettings = DefaultSampling,
621
- end: str = "\n",
622
- file: _SupportsWriteAndFlush = sys.stdout,
1097
+ prompt: str | list[int],
1098
+ stops: Optional[list[str | int]] = None,
1099
+ sampler: Optional[SamplerSettings] = None,
1100
+ end: str = '\n',
1101
+ file: _SupportsWriteAndFlush = None,
623
1102
  flush: bool = True
624
1103
  ) -> str:
625
1104
  """
626
- Given a prompt, stream text as it is generated, and return the generated string.
627
- The returned string does not include the `end` parameter.
628
-
629
- `Model.stream_print(...)` is a shorthand for:
630
-
631
- ```
632
- s = Model.stream(prompt, stops=stops, sampler=sampler)
633
- for i in s:
634
- tok = i['choices'][0]['text']
635
- print(tok, end='', file=file, flush=flush)
636
- print(end, end='', file=file, flush=True)
637
- ```
1105
+ Given a prompt, stream text to a file as it is generated, and return
1106
+ the generated string. The returned string does not include the `end`
1107
+ parameter.
638
1108
 
639
1109
  prompt: The text from which to generate
640
1110
 
@@ -652,120 +1122,247 @@ class Model:
652
1122
  sampler=sampler
653
1123
  )
654
1124
 
655
- res = ''
1125
+ file = sys.stdout if file is None else file
1126
+
1127
+ response = ''
656
1128
  for i in token_generator:
657
1129
  tok = i['choices'][0]['text']
658
1130
  print(tok, end='', file=file, flush=flush)
659
- res += tok
1131
+ response += tok
660
1132
 
661
1133
  # print `end`, and always flush stream after generation is done
662
1134
  print(end, end='', file=file, flush=True)
663
1135
 
664
- return res
1136
+ return response
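
A sketch of consuming the `stream()` generator directly, mirroring what `stream_print()` does above (`m` is assumed to be a loaded Model instance):

for chunk in m.stream('Write a haiku about winter:', stops=['\n\n']):
    print(chunk['choices'][0]['text'], end='', flush=True)
print()
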
665
1137
 
666
1138
 
667
- def ingest(self, text: str) -> None:
1139
+ def ingest(self, text: str | list[int]) -> None:
668
1140
  """
669
1141
  Ingest the given text into the model's cache
670
1142
  """
671
1143
 
1144
+ assert_type(text, (str, list), 'text', 'ingest')
1145
+ if isinstance(text, list):
1146
+ tokens = text
1147
+ else:
1148
+ if self.verbose:
1149
+ print_verbose(
1150
+ "ingest: tokenizing text"
1151
+ )
1152
+ tokens = self.tokenize(text)
1153
+
1154
+ input_length = len(tokens)
1155
+
1156
+ if input_length > self.context_length:
1157
+ print(f'webscout.Local: raw input: {tokens}')
1158
+ raise exceptions.ExceededContextLengthException(
1159
+ f"ingest: length of input exceeds model's context length "
1160
+ f"({input_length} > {self.context_length})"
1161
+ )
1162
+ elif input_length == self.context_length:
1163
+ print(f'webscout.Local: raw input: {tokens}')
1164
+ raise exceptions.ExceededContextLengthException(
1165
+ f"ingest: length of input is equal to model's context "
1166
+ f"length ({input_length} == {self.context_length}). this "
1167
+ f"leaves no room for any new tokens to be generated"
1168
+ )
1169
+ elif self.verbose:
1170
+ print_verbose(
1171
+ f"ingest: ingesting {input_length} tokens"
1172
+ )
1173
+
672
1174
  assert_model_is_loaded(self)
673
1175
  self.llama.create_completion(
674
- text,
675
- max_tokens=1,
1176
+ prompt=tokens,
1177
+ max_tokens=2,
676
1178
  temperature=0.0
677
1179
  )
678
-
1180
+
679
1181
 
680
1182
  def candidates(
681
1183
  self,
682
1184
  prompt: str,
683
- k: int
1185
+ k: int = 40,
1186
+ temp: Optional[float] = None,
1187
+ raw_token_ids: bool = False
684
1188
  ) -> list[tuple[str, np.floating]]:
685
1189
  """
686
1190
  Given prompt `str` and k `int`, return a sorted list of the
687
1191
  top k candidates for most likely next token, along with their
688
- normalized probabilities
689
- """
1192
+ normalized probabilities (logprobs).
690
1193
 
691
- assert isinstance(prompt, str), \
692
- f"next_candidates: prompt should be str, not {type(prompt)}"
693
- assert isinstance(k, int), \
694
- f"next_candidates: k should be int, not {type(k)}"
695
- assert 0 < k <= len(self.tokens), \
696
- f"next_candidates: k should be between 0 and {len(self.tokens)}"
1194
+ The following parameters are optional:
1195
+ - temp: The temperature to apply to the distribution
1196
+ - raw_token_ids: If `True`, return raw token IDs instead of text tokens
697
1197
 
1198
+ If parameter `k` is <= 0, the probabilities for all tokens in the
1199
+ vocabulary will be returned. Vocabulary sizes are often in the
1200
+ hundreds of thousands.
1201
+ """
1202
+
1203
+ assert_type(prompt, str, 'prompt', 'candidates')
1204
+ assert_type(k, int, 'k', 'candidates')
1205
+ assert_type(temp, (float, NoneType), 'temp', 'candidates')
698
1206
  assert_model_is_loaded(self)
699
- prompt_tokens = self.llama.tokenize(prompt.encode('utf-8', errors='ignore'))
700
- self.llama.reset() # reset model state
701
- self.llama.eval(prompt_tokens)
702
- scores = self.llama.scores[len(prompt_tokens) - 1]
1207
+ if k <= 0:
1208
+ k = self.n_vocab
1209
+ if self.verbose:
1210
+ print_verbose(
1211
+ f"candidates: k <= 0, using n_vocab ({self.n_vocab})"
1212
+ )
1213
+ if not 1 <= k <= self.n_vocab:
1214
+ raise ValueError(
1215
+ f"candidates: k should be between 1 and {self.n_vocab} "
1216
+ f"inclusive"
1217
+ )
1218
+
1219
+ prompt_tokens = self.tokenize(prompt)
1220
+ input_length = len(prompt_tokens)
703
1221
 
704
- # len(self.llama.scores) == self.context_length
705
- # len(self.llama.scores[i]) == len(self.tokens)
1222
+ if input_length > self.context_length:
1223
+ print(f'webscout.Local: raw input: {prompt_tokens}')
1224
+ raise exceptions.ExceededContextLengthException(
1225
+ f"candidates: length of input exceeds model's context length "
1226
+ f"({input_length} > {self.context_length})"
1227
+ )
1228
+ elif input_length == self.context_length:
1229
+ print(f'webscout.Local: raw input: {prompt_tokens}')
1230
+ raise exceptions.ExceededContextLengthException(
1231
+ f"candidates: length of input is equal to model's context "
1232
+ f"length ({input_length} == {self.context_length}). this "
1233
+ f"leaves no room for any new tokens to be generated"
1234
+ )
1235
+
1236
+ # it is necessary to reset the model before calling llama.eval()
1237
+ elif self.verbose:
1238
+ print_verbose(
1239
+ "candidates: reset model state..."
1240
+ )
1241
+ self.llama.reset()
1242
+
1243
+ if self.verbose:
1244
+ print_verbose(
1245
+ "candidates: eval..."
1246
+ )
1247
+ self.llama.eval(prompt_tokens) # single forward pass
706
1248
 
707
- # normalize scores with softmax
708
- # must normalize over all tokens in vocab, not just top k
1249
+ scores = self.llama.scores[len(prompt_tokens) - 1]
1250
+
1251
+ # Get the top k indices based on raw scores
1252
+ top_k_indices = np.argpartition(scores, -k)[-k:]
1253
+
1254
+ # Get the scores of the top k tokens
1255
+ top_k_scores = scores[top_k_indices]
1256
+
1257
+ # Apply softmax to the top k scores
709
1258
  if self.verbose:
710
- print_verbose(f'calculating softmax over {len(scores)} values')
711
- normalized_scores: list[np.floating] = list(softmax(scores))
1259
+ print_verbose(
1260
+ f'candidates: compute softmax over {len(top_k_scores)} '
1261
+ f'values...'
1262
+ )
1263
+ normalized_scores = softmax(z=top_k_scores, T=temp)
1264
+
1265
+ # consider only the top k tokens
1266
+ logprobs = [
1267
+ (
1268
+ self.llama._model.detokenize(
1269
+ [tok_id], special=True
1270
+ ).decode('utf-8', errors='ignore'),
1271
+ normalized_scores[i]
1272
+ ) for i, tok_id in enumerate(top_k_indices)
1273
+ ] if not raw_token_ids else [
1274
+ (
1275
+ tok_id,
1276
+ normalized_scores[i]
1277
+ ) for i, tok_id in enumerate(top_k_indices)
1278
+ ]
712
1279
 
713
- # construct the final list
714
- i = 0
715
- token_probs_list: list[tuple[str, np.floating]] = []
716
- for tok_str in self.tokens:
717
- token_probs_list.append((tok_str, normalized_scores[i]))
718
- i += 1
1280
+ # sort by probability
1281
+ logprobs.sort(key=lambda x: x[1], reverse=True)
719
1282
 
720
- # return token_probs_list, sorted by probability, only top k
721
- return nlargest(k, token_probs_list, key=lambda x:x[1])
1283
+ return logprobs
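The key change above is that the softmax is computed over only the k highest raw scores instead of the full vocabulary. A self-contained numpy sketch of that idea follows; a plain softmax stands in for the library's `softmax(z, T)` helper so the snippet runs on its own, and the ranking of the top k tokens is unchanged — only their absolute probabilities differ from a full-vocabulary normalization.

```
import numpy as np

def top_k_probs(scores: np.ndarray, k: int, temp: float = 1.0):
    # Illustrative stand-in for the top-k path in Model.candidates().
    # Unsorted indices of the k largest raw scores:
    top_k_indices = np.argpartition(scores, -k)[-k:]
    top_k_scores = scores[top_k_indices] / temp
    # Softmax over only the k selected scores:
    exps = np.exp(top_k_scores - np.max(top_k_scores))
    probs = exps / exps.sum()
    # Pair token IDs with probabilities, most likely first:
    return sorted(zip(top_k_indices.tolist(), probs.tolist()),
                  key=lambda pair: pair[1], reverse=True)

# Toy "logits" over a 10-token vocabulary:
rng = np.random.default_rng(0)
print(top_k_probs(rng.normal(size=10), k=3))
```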
722
1284
 
723
1285
 
724
1286
  def print_candidates(
725
1287
  self,
726
1288
  prompt: str,
727
- k: int,
728
- file: _SupportsWriteAndFlush = sys.stdout,
729
- flush: bool = False
1289
+ k: int = 40,
1290
+ temp: Optional[float] = None,
1291
+ raw_token_ids: bool = False,
1292
+ file: _SupportsWriteAndFlush = None,
730
1293
  ) -> None:
731
1294
  """
732
- Like `Model.candidates()`, but print the values instead
733
- of returning them
734
- """
1295
+ Given prompt `str` and k `int`, print a sorted list of the
1296
+ top k candidates for most likely next token, along with their
1297
+ normalized probabilities (logprobs).
735
1298
 
736
- for _tuple in self.candidates(prompt, k):
737
- print(
738
- f"token {repr(_tuple[0])} has probability {_tuple[1]}",
739
- file=file,
740
- flush=flush
741
- )
742
-
743
- # if flush is False, then so far file is not flushed, but it should
744
- # always be flushed at the end of printing
745
- if not flush:
746
- file.flush()
1299
+ The following parameters are optional:
1300
+ - temp: The temperature to apply to the distribution
1301
+ - raw_token_ids: If `True`, print raw token IDs instead of text tokens
1302
+
1303
+ If parameter `k` is <= 0, the probabilities for all tokens in the
1304
+ vocabulary will be printed. Vocabulary sizes are often in the
1305
+ hundreds of thousands.
1306
+ """
1307
+ for _tuple in self.candidates(
1308
+ prompt=prompt, k=k, temp=temp, raw_token_ids=raw_token_ids
1309
+ ):
1310
+ percent_as_string = f"{_tuple[1] * 100 :>7.3f}"
1311
+ # do not print tokens with ~0.000% probability
1312
+ if percent_as_string != " 0.000":
1313
+ print(
1314
+ f"token {_tuple[0]!r:<32} has probability "
1315
+ f"{percent_as_string} %",
1316
+ file=sys.stdout if file is None else file,
1317
+ )
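A hedged usage sketch for `print_candidates`; the prompt is illustrative and the printed values are made up — only the line shape matches the f-string above.

```
# Hedged usage sketch -- `model` is assumed to be an already-loaded
# webscout.Local.Model instance.
model.print_candidates("The capital of France is", k=5)

# Illustrative output shape (values are made up, not real model output):
#   token ' Paris'                          has probability  97.321 %
#   token ' the'                            has probability   1.204 %
```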
747
1318
 
748
1319
 
749
- def assert_model_is_loaded(model: Model) -> None:
1320
+ def assert_model_is_loaded(model) -> None:
750
1321
  """
751
- Ensure the Model is fully constructed, such that
752
- `Model.llama._model.model is not None` is guaranteed to be `True`
1322
+ Ensure the model is fully constructed, such that
1323
+ `model.llama._model.model is not None` is guaranteed to be `True`
753
1324
 
754
1325
  Raise ModelUnloadedException otherwise
755
1326
  """
756
- if not hasattr(model, 'llama'):
757
- raise ModelUnloadedException(
1327
+ try:
1328
+ if model.llama._model.model is not None:
1329
+ return
1330
+ except AttributeError:
1331
+ pass
1332
+
1333
+ if model is None:
1334
+ exc = exceptions.ModelUnloadedException(
1335
+ "model is None"
1336
+ )
1337
+ elif not hasattr(model, 'llama'):
1338
+ exc = exceptions.ModelUnloadedException(
758
1339
  "webscout.Local.Model instance has no attribute 'llama'"
759
1340
  )
760
- if not hasattr(model.llama, '_model'):
761
- raise ModelUnloadedException(
1341
+ elif not hasattr(model.llama, '_model'):
1342
+ exc = exceptions.ModelUnloadedException(
762
1343
  "llama_cpp.Llama instance has no attribute '_model'"
763
1344
  )
764
- if not hasattr(model.llama._model, 'model'):
765
- raise ModelUnloadedException(
766
- "llama_cpp._internals._LlamaModel instance has no attribute 'model'"
1345
+ elif not hasattr(model.llama._model, 'model'):
1346
+ exc = exceptions.ModelUnloadedException(
1347
+ "llama_cpp._internals._LlamaModel instance has no attribute "
1348
+ "'model'"
767
1349
  )
768
- if model.llama._model.model is None:
769
- raise ModelUnloadedException(
1350
+ elif model.llama._model.model is None:
1351
+ exc = exceptions.ModelUnloadedException(
770
1352
  "llama_cpp._internals._LlamaModel.model is None"
771
1353
  )
1354
+ else:
1355
+ raise UnreachableException
1356
+
1357
+ if not isinstance(model, Model):
1358
+ exc.add_note(
1359
+ 'WARNING: `assert_model_is_loaded` was called on an object '
1360
+ 'that is NOT an instance of `webscout.Local.Model` '
1361
+ f'(object had type {type(model)!r})'
1362
+ )
1363
+ else:
1364
+ exc.add_note(
1365
+ 'Are you trying to use a model that has been unloaded?'
1366
+ )
1367
+
1368
+ raise exc
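Finally, a hedged sketch of how callers might hit and recover from this guard. It assumes that `Model.generate()` performs this check before touching the weights, consistent with how the other methods in this file call `assert_model_is_loaded`; the prompt is illustrative.

```
from webscout import exceptions

# Hedged usage sketch -- `model` is assumed to be a webscout.Local.Model
# that was previously loaded.
model.unload()
try:
    model.generate("Hello")                 # needs the weights -> raises
except exceptions.ModelUnloadedException:
    model.reload()                          # re-load with the prior parameters
    print(model.generate("Hello"))
```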