llama-cpp-python-win 0.3.16__cp314-cp314-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (75) hide show
  1. bin/convert_hf_to_gguf.py +8751 -0
  2. bin/ggml-base.dll +0 -0
  3. bin/ggml-cpu.dll +0 -0
  4. bin/ggml.dll +0 -0
  5. bin/llama-mtmd-cli.exe +0 -0
  6. bin/llama.dll +0 -0
  7. bin/mtmd.dll +0 -0
  8. include/ggml-alloc.h +76 -0
  9. include/ggml-backend.h +354 -0
  10. include/ggml-blas.h +25 -0
  11. include/ggml-cann.h +123 -0
  12. include/ggml-cpp.h +39 -0
  13. include/ggml-cpu.h +145 -0
  14. include/ggml-cuda.h +47 -0
  15. include/ggml-metal.h +66 -0
  16. include/ggml-opt.h +256 -0
  17. include/ggml-rpc.h +33 -0
  18. include/ggml-sycl.h +49 -0
  19. include/ggml-vulkan.h +29 -0
  20. include/ggml-webgpu.h +19 -0
  21. include/ggml.h +2467 -0
  22. include/gguf.h +202 -0
  23. include/llama-cpp.h +30 -0
  24. include/llama.h +1482 -0
  25. include/mtmd-helper.h +91 -0
  26. include/mtmd.h +298 -0
  27. lib/cmake/ggml/ggml-config.cmake +328 -0
  28. lib/cmake/ggml/ggml-version.cmake +65 -0
  29. lib/cmake/llama/llama-config.cmake +54 -0
  30. lib/cmake/llama/llama-version.cmake +65 -0
  31. lib/ggml-base.lib +0 -0
  32. lib/ggml-cpu.lib +0 -0
  33. lib/ggml.lib +0 -0
  34. lib/llama.lib +0 -0
  35. lib/mtmd.lib +0 -0
  36. lib/pkgconfig/llama.pc +10 -0
  37. llama_cpp/__init__.py +4 -0
  38. llama_cpp/_ctypes_extensions.py +131 -0
  39. llama_cpp/_ggml.py +12 -0
  40. llama_cpp/_internals.py +856 -0
  41. llama_cpp/_logger.py +47 -0
  42. llama_cpp/_utils.py +78 -0
  43. llama_cpp/lib/ggml-base.dll +0 -0
  44. llama_cpp/lib/ggml-base.lib +0 -0
  45. llama_cpp/lib/ggml-cpu.dll +0 -0
  46. llama_cpp/lib/ggml-cpu.lib +0 -0
  47. llama_cpp/lib/ggml.dll +0 -0
  48. llama_cpp/lib/ggml.lib +0 -0
  49. llama_cpp/lib/llama.dll +0 -0
  50. llama_cpp/lib/llama.lib +0 -0
  51. llama_cpp/lib/mtmd.dll +0 -0
  52. llama_cpp/lib/mtmd.lib +0 -0
  53. llama_cpp/llama.py +2422 -0
  54. llama_cpp/llama_cache.py +155 -0
  55. llama_cpp/llama_chat_format.py +3962 -0
  56. llama_cpp/llama_cpp.py +4374 -0
  57. llama_cpp/llama_grammar.py +953 -0
  58. llama_cpp/llama_speculative.py +64 -0
  59. llama_cpp/llama_tokenizer.py +120 -0
  60. llama_cpp/llama_types.py +316 -0
  61. llama_cpp/llava_cpp.py +158 -0
  62. llama_cpp/mtmd_cpp.py +280 -0
  63. llama_cpp/py.typed +0 -0
  64. llama_cpp/server/__init__.py +0 -0
  65. llama_cpp/server/__main__.py +100 -0
  66. llama_cpp/server/app.py +597 -0
  67. llama_cpp/server/cli.py +97 -0
  68. llama_cpp/server/errors.py +212 -0
  69. llama_cpp/server/model.py +312 -0
  70. llama_cpp/server/settings.py +240 -0
  71. llama_cpp/server/types.py +316 -0
  72. llama_cpp_python_win-0.3.16.dist-info/METADATA +856 -0
  73. llama_cpp_python_win-0.3.16.dist-info/RECORD +75 -0
  74. llama_cpp_python_win-0.3.16.dist-info/WHEEL +5 -0
  75. llama_cpp_python_win-0.3.16.dist-info/licenses/LICENSE.md +9 -0
@@ -0,0 +1,856 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import ctypes
5
+
6
+ from typing import (
7
+ Dict,
8
+ List,
9
+ Tuple,
10
+ Optional,
11
+ Sequence,
12
+ Callable,
13
+ Union,
14
+ )
15
+ from dataclasses import dataclass, field
16
+ from contextlib import ExitStack
17
+
18
+ import numpy as np
19
+ import numpy.typing as npt
20
+
21
+ from .llama_types import *
22
+ from .llama_grammar import LlamaGrammar
23
+ from ._utils import suppress_stdout_stderr
24
+
25
+ import llama_cpp.llama_cpp as llama_cpp
26
+
27
+
28
+ # Python wrappers over llama.h structs
29
+
30
+
31
class LlamaModel:
    """Intermediate Python wrapper for a llama.cpp llama_model.
    NOTE: For stability it's recommended you use the Llama class instead."""

    def __init__(
        self,
        *,
        path_model: str,
        params: llama_cpp.llama_model_params,
        verbose: bool = True,
    ):
        self.path_model = path_model
        self.params = params
        self.verbose = verbose
        # The ExitStack owns the native free; close()/__del__ both funnel
        # through it so the model is freed exactly once.
        self._exit_stack = ExitStack()

        model = None

        if not os.path.exists(path_model):
            raise ValueError(f"Model path does not exist: {path_model}")

        # llama.cpp logs to stdout/stderr while loading; silence unless verbose.
        with suppress_stdout_stderr(disable=verbose):
            model = llama_cpp.llama_model_load_from_file(
                self.path_model.encode("utf-8"), self.params
            )

        if model is None:
            raise ValueError(f"Failed to load model from file: {path_model}")

        vocab = llama_cpp.llama_model_get_vocab(model)

        if vocab is None:
            raise ValueError(f"Failed to get vocab from model: {path_model}")

        self.model = model
        self.vocab = vocab
        self.sampler = None  # LlamaModel doesn't use samplers, but some cleanup code expects this attribute

        def free_model():
            if self.model is None:
                return
            llama_cpp.llama_model_free(self.model)
            self.model = None

        self._exit_stack.callback(free_model)

    def close(self):
        """Free the native model. Safe to call more than once.

        FIX: the original iterated ``self.custom_samplers`` here, an attribute
        this class never defines (it belongs to LlamaSampler). The branch was
        dead because ``self.sampler`` is always None, but it would have raised
        AttributeError had a sampler ever been attached.
        """
        self._exit_stack.close()

    def __del__(self):
        self.close()

    # Model / vocab properties (thin ctypes wrappers)

    def vocab_type(self) -> int:
        return llama_cpp.llama_vocab_type(self.vocab)

    def n_vocab(self) -> int:
        return llama_cpp.llama_vocab_n_tokens(self.vocab)

    def n_ctx_train(self) -> int:
        return llama_cpp.llama_model_n_ctx_train(self.model)

    def n_embd(self) -> int:
        return llama_cpp.llama_model_n_embd(self.model)

    def rope_freq_scale_train(self) -> float:
        return llama_cpp.llama_model_rope_freq_scale_train(self.model)

    def desc(self) -> str:
        """Return a human-readable, one-line description of the model."""
        buf = ctypes.create_string_buffer(1024)
        llama_cpp.llama_model_desc(self.model, buf, 1024)
        return buf.value.decode("utf-8")

    def size(self) -> int:
        return llama_cpp.llama_model_size(self.model)

    def n_params(self) -> int:
        return llama_cpp.llama_model_n_params(self.model)

    def get_tensor(self, name: str) -> ctypes.c_void_p:
        raise NotImplementedError("get_tensor is not implemented in llama.cpp")

    # Vocab

    def token_get_text(self, token: int) -> str:
        return llama_cpp.llama_vocab_get_text(self.vocab, token).decode("utf-8")

    def token_get_score(self, token: int) -> float:
        return llama_cpp.llama_vocab_get_score(self.vocab, token)

    def token_get_attr(self, token: int) -> int:
        return llama_cpp.llama_vocab_get_attr(self.vocab, token)

    # Special tokens

    def token_bos(self) -> int:
        return llama_cpp.llama_vocab_bos(self.vocab)

    def token_eos(self) -> int:
        return llama_cpp.llama_vocab_eos(self.vocab)

    def token_cls(self) -> int:
        return llama_cpp.llama_vocab_cls(self.vocab)

    def token_sep(self) -> int:
        return llama_cpp.llama_vocab_sep(self.vocab)

    def token_nl(self) -> int:
        return llama_cpp.llama_vocab_nl(self.vocab)

    def token_prefix(self) -> int:
        return llama_cpp.llama_vocab_fim_pre(self.vocab)

    def token_middle(self) -> int:
        return llama_cpp.llama_vocab_fim_mid(self.vocab)

    def token_suffix(self) -> int:
        return llama_cpp.llama_vocab_fim_suf(self.vocab)

    def token_eot(self) -> int:
        return llama_cpp.llama_vocab_eot(self.vocab)

    def add_bos_token(self) -> bool:
        return llama_cpp.llama_vocab_get_add_bos(self.vocab)

    def add_eos_token(self) -> bool:
        return llama_cpp.llama_vocab_get_add_eos(self.vocab)

    # Tokenization

    def tokenize(self, text: bytes, add_bos: bool, special: bool):
        """Tokenize ``text`` and return the token ids as a list of ints.

        A first pass uses the training context size as the buffer; on
        overflow llama.cpp reports the required count (negated) and we retry
        once with an exact-size buffer.
        """
        n_ctx = self.n_ctx_train()
        tokens = (llama_cpp.llama_token * n_ctx)()
        n_tokens = llama_cpp.llama_tokenize(
            self.vocab, text, len(text), tokens, n_ctx, add_bos, special
        )
        if n_tokens < 0:
            n_tokens = abs(n_tokens)
            tokens = (llama_cpp.llama_token * n_tokens)()
            n_tokens = llama_cpp.llama_tokenize(
                self.vocab, text, len(text), tokens, n_tokens, add_bos, special
            )
            if n_tokens < 0:
                raise RuntimeError(
                    f'Failed to tokenize: text="{text}" n_tokens={n_tokens}'
                )
        return list(tokens[:n_tokens])

    def token_to_piece(self, token: int, special: bool = False) -> bytes:
        """Return the exact bytes of a single token's piece.

        FIX: the original returned the whole 32-byte buffer — trailing NUL
        padding included — and silently truncated pieces longer than 32
        bytes. We now slice by the length llama.cpp reports and grow the
        buffer on overflow (a negative return is the negated required size,
        the same convention ``tokenize`` above relies on).
        """
        size = 32
        buf = ctypes.create_string_buffer(size)
        n = llama_cpp.llama_token_to_piece(self.vocab, token, buf, size, 0, special)
        if n < 0:
            size = -n
            buf = ctypes.create_string_buffer(size)
            n = llama_cpp.llama_token_to_piece(
                self.vocab, token, buf, size, 0, special
            )
        return bytes(buf.raw[:n])

    def detokenize(self, tokens: List[int], special: bool = False) -> bytes:
        """Convert token ids back to bytes, concatenating per-token pieces."""
        output = b""
        size = 32
        buffer = (ctypes.c_char * size)()
        for token in tokens:
            n = llama_cpp.llama_token_to_piece(
                self.vocab, llama_cpp.llama_token(token), buffer, size, 0, special
            )
            assert n <= size
            output += bytes(buffer[:n])
        # NOTE: Llama1 models automatically added a space at the start of the prompt
        # this line removes a leading space if the first token is a beginning of sentence token
        return (
            output[1:]
            if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b" "
            else output
        )

    # Extra
    def metadata(self) -> Dict[str, str]:
        """Return all GGUF metadata key/value pairs as decoded strings."""
        metadata: Dict[str, str] = {}
        buffer_size = 1024
        # create_string_buffer zero-initializes, so no explicit wipe is needed.
        buffer = ctypes.create_string_buffer(buffer_size)
        # iterate over model keys
        for i in range(llama_cpp.llama_model_meta_count(self.model)):
            nbytes = llama_cpp.llama_model_meta_key_by_index(
                self.model, i, buffer, buffer_size
            )
            # FIX: the original only resized when nbytes > buffer_size; a
            # string of exactly buffer_size bytes left no room for the NUL
            # terminator and was silently truncated (the +1 below shows the
            # intended sizing).
            if nbytes >= buffer_size:
                buffer_size = nbytes + 1
                buffer = ctypes.create_string_buffer(buffer_size)
                nbytes = llama_cpp.llama_model_meta_key_by_index(
                    self.model, i, buffer, buffer_size
                )
            key = buffer.value.decode("utf-8")
            nbytes = llama_cpp.llama_model_meta_val_str_by_index(
                self.model, i, buffer, buffer_size
            )
            if nbytes >= buffer_size:
                buffer_size = nbytes + 1
                buffer = ctypes.create_string_buffer(buffer_size)
                nbytes = llama_cpp.llama_model_meta_val_str_by_index(
                    self.model, i, buffer, buffer_size
                )
            value = buffer.value.decode("utf-8")
            metadata[key] = value
        return metadata

    @staticmethod
    def default_params():
        """Get the default llama_model_params."""
        return llama_cpp.llama_model_default_params()
242
+
243
+
244
class LlamaContext:
    """Intermediate Python wrapper for a llama.cpp llama_context.
    NOTE: For stability it's recommended you use the Llama class instead."""

    def __init__(
        self,
        *,
        model: LlamaModel,
        params: llama_cpp.llama_context_params,
        verbose: bool = True,
    ):
        self.model = model
        self.params = params
        self.verbose = verbose
        # The ExitStack owns the native free; close()/__del__ both funnel
        # through it so the context is freed exactly once.
        self._exit_stack = ExitStack()

        ctx = llama_cpp.llama_init_from_model(self.model.model, self.params)

        if ctx is None:
            raise ValueError("Failed to create llama_context")

        self.ctx = ctx
        # Handle to the context's KV memory, used by the kv_cache_* methods below.
        self.memory = llama_cpp.llama_get_memory(self.ctx)
        self.sampler = None  # LlamaContext doesn't manage samplers directly, but some cleanup code expects this attribute

        def free_ctx():
            if self.ctx is None:
                return
            llama_cpp.llama_free(self.ctx)
            self.ctx = None

        self._exit_stack.callback(free_ctx)

    def close(self):
        """Free the native context. Safe to call more than once."""
        self._exit_stack.close()

    def __del__(self):
        self.close()

    def n_ctx(self) -> int:
        """Context size (in tokens) this context was created with."""
        return llama_cpp.llama_n_ctx(self.ctx)

    def pooling_type(self) -> int:
        """Embedding pooling type configured for this context."""
        return llama_cpp.llama_pooling_type(self.ctx)

    # KV-cache / memory management (thin wrappers over llama_memory_*)

    def kv_cache_clear(self):
        """Clear the whole KV cache (data included)."""
        assert self.memory is not None, "Memory is not initialized"
        llama_cpp.llama_memory_clear(self.memory, True)

    def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int):
        """Remove positions [p0, p1) of ``seq_id`` from the KV cache.

        NOTE(review): a negative seq_id is clamped to 0 here, whereas the
        llama.h memory API treats seq_id < 0 as "match any sequence" — confirm
        this restriction is intentional before relying on it.
        """
        assert self.memory is not None, "Memory is not initialized"
        seq_id = seq_id if seq_id >= 0 else 0
        llama_cpp.llama_memory_seq_rm(self.memory, seq_id, p0, p1)

    def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int):
        """Copy positions [p0, p1) from one sequence to another."""
        assert self.memory is not None, "Memory is not initialized"
        llama_cpp.llama_memory_seq_cp(self.memory, seq_id_src, seq_id_dst, p0, p1)

    def kv_cache_seq_keep(self, seq_id: int):
        """Drop every sequence except ``seq_id`` from the KV cache."""
        assert self.memory is not None, "Memory is not initialized"
        llama_cpp.llama_memory_seq_keep(self.memory, seq_id)

    def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int):
        """Add ``shift`` to the positions [p0, p1) of ``seq_id``."""
        assert self.memory is not None, "Memory is not initialized"
        llama_cpp.llama_memory_seq_add(self.memory, seq_id, p0, p1, shift)

    def get_state_size(self) -> int:
        """Size in bytes needed to serialize the full context state."""
        return llama_cpp.llama_state_get_size(self.ctx)

    # TODO: copy_state_data

    # TODO: set_state_data

    # TODO: llama_load_session_file

    # TODO: llama_save_session_file

    def decode(self, batch: LlamaBatch):
        """Run the decoder on ``batch``; raises RuntimeError on a non-zero status."""
        return_code = llama_cpp.llama_decode(
            self.ctx,
            batch.batch,
        )
        if return_code != 0:
            raise RuntimeError(f"llama_decode returned {return_code}")

    def encode(self, batch: LlamaBatch):
        """Run the encoder on ``batch``; raises RuntimeError on a non-zero status."""
        return_code = llama_cpp.llama_encode(
            self.ctx,
            batch.batch,
        )
        if return_code != 0:
            raise RuntimeError(f"llama_encode returned {return_code}")

    def set_n_threads(self, n_threads: int, n_threads_batch: int):
        """Set thread counts for single-token and batch processing."""
        llama_cpp.llama_set_n_threads(self.ctx, n_threads, n_threads_batch)

    # Output accessors — these return ctypes pointers into llama.cpp-owned
    # buffers that remain valid only until the next decode/encode call.

    def get_logits(self):
        return llama_cpp.llama_get_logits(self.ctx)

    def get_logits_ith(self, i: int):
        return llama_cpp.llama_get_logits_ith(self.ctx, i)

    def get_embeddings(self):
        return llama_cpp.llama_get_embeddings(self.ctx)

    def get_embeddings_ith(self, i: int):
        return llama_cpp.llama_get_embeddings_ith(self.ctx, i)

    def get_embeddings_seq(self, seq_id: int):
        return llama_cpp.llama_get_embeddings_seq(self.ctx, seq_id)

    # Sampling functions - deprecated, use LlamaSampler instead

    def set_rng_seed(self, seed: int):
        raise NotImplementedError("set_rng_seed is deprecated, use LlamaSampler instead")

    def sample_repetition_penalties(
        self,
        candidates: "_LlamaTokenDataArray",
        last_tokens_data: "llama_cpp.Array[llama_cpp.llama_token]",
        penalty_last_n: int,
        penalty_repeat: float,
        penalty_freq: float,
        penalty_present: float,
    ):
        raise NotImplementedError("sample_repetition_penalties is deprecated, use LlamaSampler instead")

    def sample_softmax(self, candidates: "_LlamaTokenDataArray"):
        raise NotImplementedError("sample_softmax is deprecated, use LlamaSampler instead")

    def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int):
        raise NotImplementedError("sample_top_k is deprecated, use LlamaSampler instead")

    def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
        raise NotImplementedError("sample_top_p is deprecated, use LlamaSampler instead")

    def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int):
        raise NotImplementedError("sample_min_p is deprecated, use LlamaSampler instead")

    def sample_typical(
        self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int
    ):
        raise NotImplementedError("sample_typical is deprecated, use LlamaSampler instead")

    def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float):
        raise NotImplementedError("sample_temp is deprecated, use LlamaSampler instead")

    def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar):
        raise NotImplementedError("sample_grammar is deprecated, use LlamaSampler instead")

    def sample_token_mirostat(
        self,
        candidates: "_LlamaTokenDataArray",
        tau: float,
        eta: float,
        m: int,
        mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float],
    ) -> int:
        raise NotImplementedError("sample_token_mirostat is deprecated, use LlamaSampler instead")

    def sample_token_mirostat_v2(
        self,
        candidates: "_LlamaTokenDataArray",
        tau: float,
        eta: float,
        mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float],
    ) -> int:
        raise NotImplementedError("sample_token_mirostat_v2 is deprecated, use LlamaSampler instead")

    def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int:
        raise NotImplementedError("sample_token_greedy is deprecated, use LlamaSampler instead")

    def sample_token(self, candidates: "_LlamaTokenDataArray") -> int:
        raise NotImplementedError("sample_token is deprecated, use LlamaSampler instead")

    # Grammar
    def grammar_accept_token(self, grammar: LlamaGrammar, token: int):
        raise NotImplementedError("grammar_accept_token is deprecated, use LlamaSampler instead")

    def reset_timings(self):
        """Reset llama.cpp's internal performance counters for this context."""
        llama_cpp.llama_perf_context_reset(self.ctx)

    def print_timings(self):
        """Print llama.cpp's internal performance counters for this context."""
        llama_cpp.llama_perf_context_print(self.ctx)

    # Utility functions
    @staticmethod
    def default_params():
        """Get the default llama_context_params."""
        return llama_cpp.llama_context_default_params()
434
+
435
+
436
class LlamaBatch:
    """Python wrapper over a llama.cpp llama_batch.

    Owns the native batch allocation; close()/__del__ free it exactly once
    via the ExitStack.
    """

    def __init__(
        self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True
    ):
        self._n_tokens = n_tokens
        self.embd = embd
        self.n_seq_max = n_seq_max
        self.verbose = verbose
        self._exit_stack = ExitStack()

        batch = llama_cpp.llama_batch_init(self._n_tokens, self.embd, self.n_seq_max)

        if batch is None:
            raise ValueError("Failed to create llama_batch")

        self.batch = batch
        self.sampler = None  # LlamaBatch doesn't use samplers, but some cleanup code expects this attribute

        def free_batch():
            if self.batch is None:
                return
            llama_cpp.llama_batch_free(self.batch)
            self.batch = None

        self._exit_stack.callback(free_batch)

    def close(self):
        """Free the native batch. Safe to call more than once."""
        self._exit_stack.close()

    def __del__(self):
        self.close()

    def n_tokens(self) -> int:
        """Number of tokens currently stored in the batch."""
        return self.batch.n_tokens

    def reset(self):
        """Empty the batch without freeing its buffers."""
        self.batch.n_tokens = 0

    def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool):
        """Replace the batch contents with ``batch`` as sequence 0.

        Positions continue from ``n_past``; logits are requested for every
        token when ``logits_all``, and always for the last token.
        """
        n_tokens = len(batch)
        self.batch.n_tokens = n_tokens
        for i in range(n_tokens):
            self.batch.token[i] = batch[i]
            self.batch.pos[i] = n_past + i
            self.batch.seq_id[i][0] = 0
            self.batch.n_seq_id[i] = 1
            self.batch.logits[i] = logits_all
        self.batch.logits[n_tokens - 1] = True

    def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool):
        """Append ``batch`` as sequence ``seq_id`` after any tokens already present.

        Positions restart at 0 for the new sequence; logits are requested for
        every appended token when ``logits_all``, and always for the last
        appended token.
        """
        n_tokens = len(batch)
        n_tokens0 = self.batch.n_tokens
        self.batch.n_tokens += n_tokens
        for i in range(n_tokens):
            j = n_tokens0 + i
            self.batch.token[j] = batch[i]
            self.batch.pos[j] = i
            self.batch.seq_id[j][0] = seq_id
            self.batch.n_seq_id[j] = 1
            self.batch.logits[j] = logits_all
        # FIX: was self.batch.logits[n_tokens - 1], which indexes from the
        # start of the whole batch; when tokens were already present it marked
        # a token inside a *previous* sequence instead of the last token of
        # the sequence just appended.
        self.batch.logits[n_tokens0 + n_tokens - 1] = True
497
+
498
+
499
class LlamaTokenDataArray:
    """Reusable llama_token_data_array whose storage is a numpy recarray."""

    def __init__(self, *, n_vocab: int):
        self.n_vocab = n_vocab
        # Aligned (id, logit, p) records; `align=True` is needed so the numpy
        # layout matches the C llama_token_data struct.
        self.candidates_data = np.recarray(
            (self.n_vocab,),
            dtype=np.dtype(
                [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True
            ),
        )
        # The C struct's `data` points directly into candidates_data's buffer
        # (zero-copy): candidates_data must stay alive as long as candidates
        # is in use, and writes to one are visible through the other.
        self.candidates = llama_cpp.llama_token_data_array(
            data=self.candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p),
            size=self.n_vocab,
            sorted=False,
        )
        # Precomputed defaults so copy_logits can reset ids/probabilities
        # in place without reallocating each call.
        self.default_candidates_data_id = np.arange(self.n_vocab, dtype=np.intc)  # type: ignore
        self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single)
        self.sampler = None  # LlamaTokenDataArray doesn't use samplers, but some cleanup code expects this attribute

    def copy_logits(self, logits: npt.NDArray[np.single]):
        """Load fresh logits and reset ids, probabilities, and sort state in place."""
        self.candidates_data.id[:] = self.default_candidates_data_id
        self.candidates_data.logit[:] = logits
        self.candidates_data.p[:] = self.default_candidates_data_p
        self.candidates.sorted = False
        self.candidates.size = self.n_vocab
523
+
524
+
525
+ # Embedding functions
526
+
527
+
528
def normalize_embedding(embedding):
    """L2-normalize ``embedding``.

    Returns the input unchanged when its Euclidean norm is zero (avoiding a
    division by zero); otherwise returns a new list with each component
    scaled to unit length.
    """
    magnitude = float(np.linalg.norm(embedding))
    return (
        embedding
        if magnitude == 0.0
        else [component / magnitude for component in embedding]
    )
533
+
534
+
535
+ # Python wrappers over common/sampling structs
536
+
537
+
538
@dataclass
class LlamaSamplingParams:
    """Sampling hyperparameters mirroring llama.cpp's common sampling params.

    Defaults match the values used by llama.cpp's examples; only consumers
    of this dataclass give the fields behavior.
    """

    # History / reporting
    n_prev: int = 64
    n_probs: int = 0
    # Truncation samplers (top-k / top-p / min-p / tail-free / typical)
    top_k: int = 40
    top_p: float = 0.95
    min_p: float = 0.05
    tfs_z: float = 1.00
    typical_p: float = 1.00
    # Temperature
    temp: float = 0.80
    # Repetition / frequency / presence penalties
    penalty_last_n: int = 64
    penalty_repeat: float = 1.0
    penalty_freq: float = 0.00
    penalty_present: float = 0.00
    # Mirostat (0 = disabled; 1/2 select the algorithm version)
    mirostat: int = 0
    mirostat_tau: float = 5.00
    mirostat_eta: float = 0.10
    penalize_nl: bool = True

    # GBNF grammar text ("" = no grammar)
    grammar: str = ""

    # Classifier-free guidance
    cfg_negative_prompt: str = ""
    cfg_scale: float = 1.00

    # Per-token logit offsets, keyed by token id
    logit_bias: dict[int, float] = field(default_factory=dict)
563
+
564
+
565
@dataclass
class LlamaSamplingContext:
    """Python-side sampling state mirroring llama.cpp's common sampling context."""

    params: LlamaSamplingParams = field(default_factory=LlamaSamplingParams)
    mirostat_mu: ctypes.c_float = field(default_factory=ctypes.c_float)
    grammar: Optional[LlamaGrammar] = None
    # NOTE: Missing parsed_grammar
    prev: list[int] = field(default_factory=list)
    cur: list[llama_cpp.llama_token_data] = field(default_factory=list)

    def reset(self):
        """Clear the token history and candidate list; restart the grammar, if any."""
        self.prev = []
        self.cur = []
        if self.grammar is not None:
            self.grammar.reset()

    def cp(self):
        """Return a copy: token histories are duplicated, params/grammar are shared."""
        return LlamaSamplingContext(
            params=self.params,
            mirostat_mu=self.mirostat_mu,
            grammar=self.grammar,
            prev=self.prev.copy(),
            cur=self.cur.copy(),
        )

    def last(self) -> Optional[int]:
        """Most recently accepted token id, or None if nothing has been accepted."""
        return self.prev[-1] if self.prev else None

    def prev_str(self, ctx_main: LlamaContext, n: int) -> str:
        """Decode the last ``n`` accepted tokens back to text."""
        recent_tokens = self.prev[-n:]
        return ctx_main.model.detokenize(recent_tokens).decode("utf-8")

    def sample(
        self,
        ctx_main: LlamaContext,
        idx: int = 0,
        logits_array: Optional[npt.NDArray[np.single]] = None,
    ):
        # This method is deprecated in favor of using LlamaSampler directly
        raise NotImplementedError("LlamaSamplingContext.sample is deprecated, use LlamaSampler instead")

    def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool):
        """Record ``id`` as the latest sampled token."""
        self.prev.append(id)
609
+
610
+
611
class CustomSampler:
    """Wraps a Python callable as a llama.cpp sampler via ctypes callbacks."""

    def __init__(
        self, apply_func: Callable[[llama_cpp.llama_token_data_array], None]
    ):
        self.apply_func = apply_func

        # C-callback shim: llama.cpp hands us the current token-data array and
        # we forward it to the user's Python function.
        def apply_wrapper(
            sampler: llama_cpp.llama_sampler_p,
            cur_p: llama_cpp.llama_token_data_array_p,
        ):
            self.apply_func(cur_p)

        # NOTE(review): free_wrapper is defined but never installed — the
        # `free` interface slot is set to NULL below.
        def free_wrapper(sampler: llama_cpp.llama_sampler_p):
            pass

        sampler_i = llama_cpp.llama_sampler_i()
        sampler_i.apply = llama_cpp.llama_sampler_i_apply(apply_wrapper)
        # Keep the CFUNCTYPE object alive on self: if it were collected, the
        # C side would invoke a dangling function pointer.
        self._apply_wrapper_ref = apply_wrapper

        # All other interface slots are NULL function pointers.
        sampler_i.name = llama_cpp.llama_sampler_i_name(0)
        sampler_i.accept = llama_cpp.llama_sampler_i_accept(0)
        sampler_i.reset = llama_cpp.llama_sampler_i_reset(0)
        sampler_i.clone = llama_cpp.llama_sampler_i_clone(0)
        sampler_i.free = llama_cpp.llama_sampler_i_free(0)

        self.sampler = llama_cpp.llama_sampler()
        # ctypes.pointer() records sampler_i in the struct's _objects, which
        # presumably is what keeps the iface table alive after this frame
        # returns — verify before refactoring this wiring.
        self.sampler.iface = ctypes.pointer(sampler_i)
        self.sampler.ctx = None

    def get_sampler(self) -> llama_cpp.llama_sampler_p:
        # NOTE(review): returns a pointer into self.sampler — this
        # CustomSampler instance must outlive any chain that uses it.
        return ctypes.pointer(self.sampler)
642
+
643
+
644
class LlamaSampler:
    """Python wrapper around a llama.cpp sampler chain.

    Each ``add_*`` method appends one native sampler to the chain. Per the
    cleanup note in ``free_sampler`` below, llama.cpp frees chained samplers
    itself, so Python-backed custom samplers must be detached before the
    chain is freed.
    """

    def __init__(self):
        params = llama_cpp.llama_sampler_chain_default_params()
        self.sampler = llama_cpp.llama_sampler_chain_init(params)
        # (chain index, CustomSampler) pairs that must be detached before free.
        self.custom_samplers: List[Tuple[int, CustomSampler]] = []
        self._exit_stack = ExitStack()

        def free_sampler():
            if self.sampler is not None:
                # NOTE: Must remove custom samplers before free or llama.cpp will try to free them
                for i, _ in reversed(self.custom_samplers):
                    llama_cpp.llama_sampler_chain_remove(self.sampler, i)
                llama_cpp.llama_sampler_free(self.sampler)
                self.sampler = None

        self._exit_stack.callback(free_sampler)

    def close(self):
        """Free the native sampler chain. Safe to call more than once."""
        self._exit_stack.close()

    def __del__(self):
        self.close()

    # Chain builders — each appends one native sampler; ownership of the
    # newly created sampler transfers to the chain.

    def add_greedy(self):
        sampler = llama_cpp.llama_sampler_init_greedy()
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_dist(self, seed: int):
        sampler = llama_cpp.llama_sampler_init_dist(seed)
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_softmax(self):
        sampler = llama_cpp.llama_sampler_init_softmax()
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_top_k(self, k: int):
        sampler = llama_cpp.llama_sampler_init_top_k(k)
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_top_p(self, p: float, min_keep: int = 1):
        sampler = llama_cpp.llama_sampler_init_top_p(p, min_keep)
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_min_p(self, p: float, min_keep: int = 1):
        sampler = llama_cpp.llama_sampler_init_min_p(p, min_keep)
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_typical(self, p: float, min_keep: int = 1):
        sampler = llama_cpp.llama_sampler_init_typical(p, min_keep)
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_temp(self, temp: float):
        sampler = llama_cpp.llama_sampler_init_temp(temp)
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_temp_ext(self, t: float, delta: float, exponent: float):
        sampler = llama_cpp.llama_sampler_init_temp_ext(t, delta, exponent)
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_xtc(self, p: float, t: float, min_keep: int, seed: int):
        sampler = llama_cpp.llama_sampler_init_xtc(p, t, min_keep, seed)
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_top_n_sigma(self, n: float):
        sampler = llama_cpp.llama_sampler_init_top_n_sigma(n)
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_mirostat(self, n_vocab: int, seed: int, tau: float, eta: float, m: int):
        sampler = llama_cpp.llama_sampler_init_mirostat(n_vocab, seed, tau, eta, m)
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_mirostat_v2(self, seed: int, tau: float, eta: float):
        sampler = llama_cpp.llama_sampler_init_mirostat_v2(seed, tau, eta)
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar):
        """Append a GBNF grammar-constrained sampler built from ``grammar``."""
        sampler = llama_cpp.llama_sampler_init_grammar(
            model.vocab, grammar._grammar.encode("utf-8"), grammar._root.encode("utf-8")
        )
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_grammar_lazy_patterns(
        self,
        model: LlamaModel,
        grammar: LlamaGrammar,
        trigger_patterns: List[str],
        trigger_tokens: List[int]
    ):
        """Append a lazy grammar sampler activated by trigger patterns/tokens."""
        # Convert patterns to C array
        pattern_ptrs = (ctypes.c_char_p * len(trigger_patterns))()
        for i, pattern in enumerate(trigger_patterns):
            pattern_ptrs[i] = pattern.encode("utf-8")

        # Convert tokens to C array
        token_array = (llama_cpp.llama_token * len(trigger_tokens))(*trigger_tokens)

        sampler = llama_cpp.llama_sampler_init_grammar_lazy_patterns(
            model.vocab,
            grammar._grammar.encode("utf-8"),
            grammar._root.encode("utf-8"),
            pattern_ptrs,
            len(trigger_patterns),
            token_array,
            len(trigger_tokens)
        )
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_penalties(
        self,
        penalty_last_n: int,
        penalty_repeat: float,
        penalty_freq: float,
        penalty_present: float,
    ):
        """Append repetition/frequency/presence penalty sampling."""
        sampler = llama_cpp.llama_sampler_init_penalties(
            penalty_last_n,
            penalty_repeat,
            penalty_freq,
            penalty_present,
        )
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_dry(
        self,
        model: LlamaModel,
        n_ctx_train: int,
        dry_multiplier: float,
        dry_base: float,
        dry_allowed_length: int,
        dry_penalty_last_n: int,
        seq_breakers: List[str]
    ):
        """Append a DRY (don't-repeat-yourself) repetition sampler."""
        # Convert seq_breakers to C array
        breaker_ptrs = (ctypes.c_char_p * len(seq_breakers))()
        for i, breaker in enumerate(seq_breakers):
            breaker_ptrs[i] = breaker.encode("utf-8")

        sampler = llama_cpp.llama_sampler_init_dry(
            model.vocab,
            n_ctx_train,
            dry_multiplier,
            dry_base,
            dry_allowed_length,
            dry_penalty_last_n,
            breaker_ptrs,
            len(seq_breakers)
        )
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_logit_bias(
        self,
        n_vocab: int,
        logit_bias: Dict[int, float]
    ):
        """Append a sampler applying per-token logit offsets from ``logit_bias``."""
        # Convert logit_bias dict to C array
        bias_array = (llama_cpp.llama_logit_bias * len(logit_bias))()
        for i, (token, bias) in enumerate(logit_bias.items()):
            bias_array[i].token = token
            bias_array[i].bias = bias

        sampler = llama_cpp.llama_sampler_init_logit_bias(
            n_vocab,
            len(logit_bias),
            bias_array
        )
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_infill(self, model: LlamaModel):
        sampler = llama_cpp.llama_sampler_init_infill(model.vocab)
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)

    def add_custom(
        self, apply_func: Callable[[llama_cpp.llama_token_data_array], None]
    ):
        """Append a Python-callable sampler, tracking it for detach-before-free."""
        custom_sampler = CustomSampler(apply_func)
        sampler = custom_sampler.get_sampler()
        llama_cpp.llama_sampler_chain_add(self.sampler, sampler)
        # NOTE: Must remove custom samplers before free or llama.cpp will try to free them
        self.custom_samplers.append(
            (llama_cpp.llama_sampler_chain_n(self.sampler) - 1, custom_sampler)
        )

    def get_seed(self) -> int:
        return llama_cpp.llama_sampler_get_seed(self.sampler)

    def sample(self, ctx: LlamaContext, idx: int = -1) -> int:
        """Sample a token from the logits at position ``idx`` of ``ctx``."""
        return llama_cpp.llama_sampler_sample(self.sampler, ctx.ctx, idx)

    def accept(self, token: int):
        """Inform the chain that ``token`` was accepted (updates sampler state)."""
        llama_cpp.llama_sampler_accept(self.sampler, token)

    def reset(self):
        llama_cpp.llama_sampler_reset(self.sampler)

    def clone(self):
        """Return a new LlamaSampler wrapping a native clone of this chain."""
        # NOTE: Custom samplers cannot be cloned due to Python callback limitations
        if self.custom_samplers:
            raise NotImplementedError("Cannot clone LlamaSampler that contains custom samplers")

        cloned_sampler = llama_cpp.llama_sampler_clone(self.sampler)
        # Create a new wrapper around the cloned sampler
        # __new__ bypasses __init__ so a second, unused chain is not allocated.
        new_sampler = LlamaSampler.__new__(LlamaSampler)
        new_sampler.sampler = cloned_sampler
        new_sampler.custom_samplers = []
        new_sampler._exit_stack = ExitStack()

        def free_sampler():
            if new_sampler.sampler is not None:
                llama_cpp.llama_sampler_free(new_sampler.sampler)
                new_sampler.sampler = None

        new_sampler._exit_stack.callback(free_sampler)
        return new_sampler