PyPI - llama-cpp-python - Versions diffs - 0.2.37__tar.gz → 0.2.39__tar.gz - Mend

llama-cpp-python 0.2.37__tar.gz → 0.2.39__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (711) hide show

llama_cpp_python-0.2.39/.git/FETCH_HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 34f31040f610925552a66b3a033e31320b6f6ad8 '34f31040f610925552a66b3a033e31320b6f6ad8' of https://github.com/abetlen/llama-cpp-python

llama_cpp_python-0.2.39/.git/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 34f31040f610925552a66b3a033e31320b6f6ad8

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.39}/.git/config RENAMED Viewed

@@ -9,7 +9,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX2kzdHlYMTBndHl4RXlDOFk1QmE3Nm1nVEdaSFNzQzFnRlh5TA==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX3J2Wk5WNnkzdklaWVhQWDVqc2F4NXZUUTJQdXpUcDRCUXVHSQ==
 [submodule "vendor/llama.cpp"]
 	active = true
 	url = https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.39/.git/index ADDED Viewed

Binary file

llama_cpp_python-0.2.39/.git/logs/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 34f31040f610925552a66b3a033e31320b6f6ad8 runner <runner@fv-az1019-910.m5ch4o5xfz2e5czaqtqza4205h.dx.internal.cloudapp.net> 1707241738 +0000 checkout: moving from master to refs/tags/v0.2.39

llama_cpp_python-0.2.39/.git/modules/vendor/llama.cpp/FETCH_HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ b08f22c882a1443e6b97081f3ce718a4d1a741f8 'b08f22c882a1443e6b97081f3ce718a4d1a741f8' of https://github.com/ggerganov/llama.cpp

llama_cpp_python-0.2.39/.git/modules/vendor/llama.cpp/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ b08f22c882a1443e6b97081f3ce718a4d1a741f8

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.39}/.git/modules/vendor/llama.cpp/config RENAMED Viewed

@@ -13,7 +13,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX2kzdHlYMTBndHl4RXlDOFk1QmE3Nm1nVEdaSFNzQzFnRlh5TA==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX3J2Wk5WNnkzdklaWVhQWDVqc2F4NXZUUTJQdXpUcDRCUXVHSQ==
 [url "https://github.com/"]
 	insteadOf = git@github.com:
 	insteadOf = org-6826477@github.com:

llama_cpp_python-0.2.39/.git/modules/vendor/llama.cpp/index ADDED Viewed

Binary file

llama_cpp_python-0.2.39/.git/modules/vendor/llama.cpp/logs/HEAD ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ 0000000000000000000000000000000000000000 17c97fb0620448b37516a3f53fea6c482b0a30a4 runner <runner@fv-az1019-910.m5ch4o5xfz2e5czaqtqza4205h.dx.internal.cloudapp.net> 1707241739 +0000 clone: from https://github.com/ggerganov/llama.cpp.git
2	+ 17c97fb0620448b37516a3f53fea6c482b0a30a4 b08f22c882a1443e6b97081f3ce718a4d1a741f8 runner <runner@fv-az1019-910.m5ch4o5xfz2e5czaqtqza4205h.dx.internal.cloudapp.net> 1707241741 +0000 checkout: moving from master to b08f22c882a1443e6b97081f3ce718a4d1a741f8

llama_cpp_python-0.2.39/.git/modules/vendor/llama.cpp/logs/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 17c97fb0620448b37516a3f53fea6c482b0a30a4 runner <runner@fv-az1019-910.m5ch4o5xfz2e5czaqtqza4205h.dx.internal.cloudapp.net> 1707241739 +0000 clone: from https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.39/.git/modules/vendor/llama.cpp/logs/refs/remotes/origin/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 17c97fb0620448b37516a3f53fea6c482b0a30a4 runner <runner@fv-az1019-910.m5ch4o5xfz2e5czaqtqza4205h.dx.internal.cloudapp.net> 1707241739 +0000 clone: from https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.39/.git/modules/vendor/llama.cpp/objects/95/161b3f4b3d6c1c35e9f423a1a933b0735bec4f ADDED Viewed

Binary file

llama_cpp_python-0.2.39/.git/modules/vendor/llama.cpp/objects/b0/8f22c882a1443e6b97081f3ce718a4d1a741f8 ADDED Viewed

Binary file

llama_cpp_python-0.2.39/.git/modules/vendor/llama.cpp/objects/dd/fe289fed86e1d59a21ea2d6f625ff44620eec5 ADDED Viewed

Binary file

llama_cpp_python-0.2.39/.git/modules/vendor/llama.cpp/objects/pack/pack-d5af55203ef291c34ac81ee488254e42e1f134d3.idx ADDED Viewed

Binary file

llama_cpp_python-0.2.37/.git/modules/vendor/llama.cpp/objects/pack/pack-2dab3d9b62e8dec5ea0d7cf3608572059e30b9db.pack → llama_cpp_python-0.2.39/.git/modules/vendor/llama.cpp/objects/pack/pack-d5af55203ef291c34ac81ee488254e42e1f134d3.pack RENAMED Viewed

Binary file

llama_cpp_python-0.2.39/.git/modules/vendor/llama.cpp/objects/pack/pack-d5af55203ef291c34ac81ee488254e42e1f134d3.rev ADDED Viewed

Binary file

llama_cpp_python-0.2.39/.git/modules/vendor/llama.cpp/packed-refs ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ # pack-refs with: peeled fully-peeled sorted
2	+ 17c97fb0620448b37516a3f53fea6c482b0a30a4 refs/remotes/origin/master

llama_cpp_python-0.2.39/.git/modules/vendor/llama.cpp/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 17c97fb0620448b37516a3f53fea6c482b0a30a4

llama_cpp_python-0.2.39/.git/modules/vendor/llama.cpp/shallow ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ 17c97fb0620448b37516a3f53fea6c482b0a30a4
2	+ b08f22c882a1443e6b97081f3ce718a4d1a741f8

llama_cpp_python-0.2.39/.git/objects/pack/pack-b1310bf828284f1e9291e80af9059046b7d97b6b.idx ADDED Viewed

Binary file

llama_cpp_python-0.2.39/.git/objects/pack/pack-b1310bf828284f1e9291e80af9059046b7d97b6b.pack ADDED Viewed

Binary file

llama_cpp_python-0.2.39/.git/objects/pack/pack-b1310bf828284f1e9291e80af9059046b7d97b6b.rev ADDED Viewed

Binary file

llama_cpp_python-0.2.39/.git/refs/tags/v0.2.39 ADDED Viewed

	@@ -0,0 +1 @@
1	+ 34f31040f610925552a66b3a033e31320b6f6ad8

llama_cpp_python-0.2.39/.git/shallow ADDED Viewed

	@@ -0,0 +1 @@
1	+ 34f31040f610925552a66b3a033e31320b6f6ad8

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.39}/CHANGELOG.md RENAMED Viewed

@@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [0.2.39]
+- feat: Update llama.cpp to ggerganov/llama.cpp@b08f22c882a1443e6b97081f3ce718a4d1a741f8
+- fix: Fix destructor logging bugs by using llama_log_callback to avoid suppress_stdout_stderr by @abetlen in 59760c85eddc72dfcc1839f43760ef72c23d6874
+## [0.2.38]
+- feat: Update llama.cpp to ggerganov/llama.cpp@1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915
+- feat: Add speculative decoding by @abetlen in #1120
+- fix: Pass raise_exception and add_generation_prompt to jinja2 chat template by @abetlen in 078cca0361bf5a94d2cf52ed04980d20e32d6f95
 ## [0.2.37]
 - feat: Update llama.cpp to ggerganov/llama.cpp@fea4fd4ba7f6b754ac795387b275e1a014a77bde

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.39}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama_cpp_python
-Version: 0.2.37
+Version: 0.2.39
 Summary: Python bindings for the llama.cpp library
 Author-Email: Andrei Betlen <abetlen@gmail.com>
 License: MIT
@@ -421,6 +421,24 @@ Then you'll need to use a custom chat handler to load the clip model and process
 )
 ```
+### Speculative Decoding
+`llama-cpp-python` supports speculative decoding which allows the model to generate completions based on a draft model.
+The fastest way to use speculative decoding is through the `LlamaPromptLookupDecoding` class.
+Just pass this as a draft model to the `Llama` class during initialization.
+```python
+from llama_cpp import Llama
+from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
+llama = Llama(
+    model_path="path/to/model.gguf",
+    draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) # num_pred_tokens is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines.
+)
+```
 ### Adjusting the Context Window
 The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements.

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.39}/README.md RENAMED Viewed

@@ -378,6 +378,24 @@ Then you'll need to use a custom chat handler to load the clip model and process
 )
 ```
+### Speculative Decoding
+`llama-cpp-python` supports speculative decoding which allows the model to generate completions based on a draft model.
+The fastest way to use speculative decoding is through the `LlamaPromptLookupDecoding` class.
+Just pass this as a draft model to the `Llama` class during initialization.
+```python
+from llama_cpp import Llama
+from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
+llama = Llama(
+    model_path="path/to/model.gguf",
+    draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) # num_pred_tokens is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines.
+)
+```
 ### Adjusting the Context Window
 The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements.

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.39}/examples/high_level_api/fastapi_server.py RENAMED Viewed

@@ -9,7 +9,7 @@ export MODEL=../models/7B/...
 Then run:
 ```
-uvicorn llama_cpp.server.app:app --reload
+uvicorn --factory llama_cpp.server.app:create_app --reload
 ```
 or

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.39}/llama_cpp/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
-__version__ = "0.2.37"
+__version__ = "0.2.39"

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.39}/llama_cpp/_internals.py RENAMED Viewed

@@ -18,8 +18,6 @@ from .llama_grammar import LlamaGrammar
 import llama_cpp.llama_cpp as llama_cpp
-from ._utils import suppress_stdout_stderr
 # Python wrappers over llama.h structs
@@ -30,7 +28,6 @@ class _LlamaModel:
     _llama_free_model = None
     # NOTE: this must be "saved" here to avoid exceptions when calling __del__
-    _suppress_stdout_stderr = suppress_stdout_stderr
     def __init__(
         self,
@@ -48,16 +45,14 @@ class _LlamaModel:
         if not os.path.exists(path_model):
             raise ValueError(f"Model path does not exist: {path_model}")
-        with self._suppress_stdout_stderr(disable=self.verbose):
-            self.model = llama_cpp.llama_load_model_from_file(
-                self.path_model.encode("utf-8"), self.params
-            )
+        self.model = llama_cpp.llama_load_model_from_file(
+            self.path_model.encode("utf-8"), self.params
+        )
     def __del__(self):
-        with self._suppress_stdout_stderr(disable=self.verbose):
-            if self.model is not None and self._llama_free_model is not None:
-                self._llama_free_model(self.model)
-                self.model = None
+        if self.model is not None and self._llama_free_model is not None:
+            self._llama_free_model(self.model)
+            self.model = None
     def vocab_type(self) -> int:
         assert self.model is not None
@@ -240,8 +235,6 @@ class _LlamaContext:
     NOTE: For stability it's recommended you use the Llama class instead."""
     _llama_free = None
-    # NOTE: this must be "saved" here to avoid exceptions when calling __del__
-    _suppress_stdout_stderr = suppress_stdout_stderr
     def __init__(
         self,
@@ -256,16 +249,16 @@ class _LlamaContext:
         self._llama_free = llama_cpp._lib.llama_free  # type: ignore
-        with self._suppress_stdout_stderr(disable=self.verbose):
-            self.ctx = llama_cpp.llama_new_context_with_model(
-                self.model.model, self.params
-            )
+        assert self.model.model is not None
+        self.ctx = llama_cpp.llama_new_context_with_model(
+            self.model.model, self.params
+        )
     def __del__(self):
-        with self._suppress_stdout_stderr(disable=self.verbose):
-            if self.ctx is not None and self._llama_free is not None:
-                self._llama_free(self.ctx)
-                self.ctx = None
+        if self.ctx is not None and self._llama_free is not None:
+            self._llama_free(self.ctx)
+            self.ctx = None
     def n_ctx(self) -> int:
         assert self.ctx is not None
@@ -493,8 +486,6 @@ class _LlamaContext:
 class _LlamaBatch:
     _llama_batch_free = None
-    # NOTE: this must be "saved" here to avoid exceptions when calling __del__
-    _suppress_stdout_stderr = suppress_stdout_stderr
     def __init__(
         self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True
@@ -506,16 +497,14 @@ class _LlamaBatch:
         self._llama_batch_free = llama_cpp._lib.llama_batch_free  # type: ignore
-        with self._suppress_stdout_stderr(disable=self.verbose):
-            self.batch = llama_cpp.llama_batch_init(
-                self.n_tokens, self.embd, self.n_seq_max
-            )
+        self.batch = llama_cpp.llama_batch_init(
+            self.n_tokens, self.embd, self.n_seq_max
+        )
     def __del__(self):
-        with self._suppress_stdout_stderr(disable=self.verbose):
-            if self.batch is not None and self._llama_batch_free is not None:
-                self._llama_batch_free(self.batch)
-                self.batch = None
+        if self.batch is not None and self._llama_batch_free is not None:
+            self._llama_batch_free(self.batch)
+            self.batch = None
     def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool):
         assert self.batch is not None

llama_cpp_python-0.2.39/llama_cpp/_logger.py ADDED Viewed

@@ -0,0 +1,37 @@
+import sys
+import ctypes
+import logging
+import llama_cpp
+# enum ggml_log_level {
+#     GGML_LOG_LEVEL_ERROR = 2,
+#     GGML_LOG_LEVEL_WARN = 3,
+#     GGML_LOG_LEVEL_INFO = 4,
+#     GGML_LOG_LEVEL_DEBUG = 5
+# };
+GGML_LOG_LEVEL_TO_LOGGING_LEVEL = {
+    2: logging.ERROR,
+    3: logging.WARNING,
+    4: logging.INFO,
+    5: logging.DEBUG,
+}
+logger = logging.getLogger("llama-cpp-python")
+@llama_cpp.llama_log_callback
+def llama_log_callback(
+    level: int,
+    text: bytes,
+    user_data: ctypes.c_void_p,
+):
+    if logger.level <= GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level]:
+        print(text.decode("utf-8"), end="", flush=True, file=sys.stderr)
+llama_cpp.llama_log_set(llama_log_callback, ctypes.c_void_p(0))
+def set_verbose(verbose: bool):
+    logger.setLevel(logging.DEBUG if verbose else logging.ERROR)

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.39}/llama_cpp/llama.py RENAMED Viewed

@@ -30,16 +30,20 @@ from .llama_cache import (
 import llama_cpp.llama_cpp as llama_cpp
 import llama_cpp.llama_chat_format as llama_chat_format
+from llama_cpp.llama_speculative import LlamaDraftModel
 import numpy as np
 import numpy.typing as npt
-from ._utils import suppress_stdout_stderr
 from ._internals import (
     _LlamaModel,  # type: ignore
     _LlamaContext,  # type: ignore
     _LlamaBatch,  # type: ignore
     _LlamaTokenDataArray,  # type: ignore
+    _LlamaSamplingParams,  # type: ignore
+    _LlamaSamplingContext,  # type: ignore
 )
+from ._logger import set_verbose
 class Llama:
@@ -89,6 +93,8 @@ class Llama:
         # Chat Format Params
         chat_format: Optional[str] = None,
         chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
+        # Speculative Decoding
+        draft_model: Optional[LlamaDraftModel] = None,
         # Misc
         verbose: bool = True,
         # Extra Params
@@ -152,6 +158,7 @@ class Llama:
             numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
             chat_format: String specifying the chat format to use when calling create_chat_completion.
             chat_handler: Optional chat handler to use when calling create_chat_completion.
+            draft_model: Optional draft model to use for speculative decoding.
             verbose: Print verbose output to stderr.
         Raises:
@@ -162,10 +169,11 @@ class Llama:
         """
         self.verbose = verbose
+        set_verbose(verbose)
         self.numa = numa
         if not Llama.__backend_initialized:
-            with suppress_stdout_stderr(disable=self.verbose):
-                llama_cpp.llama_backend_init(self.numa)
+            llama_cpp.llama_backend_init(self.numa)
             Llama.__backend_initialized = True
         self.model_path = model_path
@@ -315,6 +323,8 @@ class Llama:
         self.chat_format = chat_format
         self.chat_handler = chat_handler
+        self.draft_model = draft_model
         self._n_vocab = self.n_vocab()
         self._n_ctx = self.n_ctx()
@@ -503,6 +513,7 @@ class Llama:
         penalize_nl: bool = True,
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
+        idx: Optional[int] = None,
     ):
         """Sample a token from the model.
@@ -517,77 +528,46 @@ class Llama:
         """
         assert self._ctx is not None
         assert self.n_tokens > 0
-        last_n_tokens_data = [llama_cpp.llama_token(0)] * max(
-            0, self.last_n_tokens_size - self.n_tokens
-        ) + self._input_ids[-self.last_n_tokens_size :].tolist()
-        last_n_tokens_size = len(last_n_tokens_data)
-        n_vocab = self._n_vocab
-        n_ctx = self._n_ctx
-        top_k = n_vocab if top_k <= 0 else top_k
-        last_n_tokens_size = n_ctx if last_n_tokens_size < 0 else last_n_tokens_size
-        last_n_tokens_data_c = (llama_cpp.llama_token * last_n_tokens_size)(
-            *last_n_tokens_data
-        )
-        logits: npt.NDArray[np.single] = self._scores[-1, :]
+        if idx is None:
+            logits: npt.NDArray[np.single] = self._scores[-1, :]
+        else:
+            logits = self._scores[idx, :]
         if logits_processor is not None:
-            logits[:] = logits_processor(self._input_ids, logits)
-        nl_logit = logits[self._token_nl]
-        self._candidates.copy_logits(logits)
-        self._ctx.sample_repetition_penalties(
-            candidates=self._candidates,
-            last_tokens_data=last_n_tokens_data_c,
-            penalty_last_n=last_n_tokens_size,
+            logits[:] = (
+                logits_processor(self._input_ids, logits)
+                if idx is None
+                else logits_processor(self._input_ids[:idx], logits)
+            )
+        sampling_params = _LlamaSamplingParams(
+            top_k=top_k,
+            top_p=top_p,
+            min_p=min_p,
+            tfs_z=tfs_z,
+            typical_p=typical_p,
+            temp=temp,
+            penalty_last_n=self.last_n_tokens_size,
             penalty_repeat=repeat_penalty,
             penalty_freq=frequency_penalty,
             penalty_present=presence_penalty,
+            mirostat=mirostat_mode,
+            mirostat_tau=mirostat_tau,
+            mirostat_eta=mirostat_eta,
+            penalize_nl=penalize_nl,
+        )
+        sampling_context = _LlamaSamplingContext(
+            params=sampling_params,
+            grammar=grammar,
+        )
+        sampling_context.prev = list(self.eval_tokens)
+        id = sampling_context.sample(ctx_main=self._ctx, logits_array=logits)
+        sampling_context.accept(
+            ctx_main=self._ctx,
+            id=id,
+            apply_grammar=grammar is not None,
         )
-        if not penalize_nl:
-            self._candidates.candidates.data[self._token_nl].logit = llama_cpp.c_float(
-                nl_logit
-            )
-        if grammar is not None:
-            self._ctx.sample_grammar(
-                candidates=self._candidates,
-                grammar=grammar,
-            )
-        if temp < 0.0:
-            self._ctx.sample_softmax(candidates=self._candidates)
-            id = self._candidates.candidates.data[0].id
-        elif temp == 0.0:
-            id = self._ctx.sample_token_greedy(candidates=self._candidates)
-        elif mirostat_mode == 1:
-            self._ctx.sample_temp(candidates=self._candidates, temp=temp)
-            id = self._ctx.sample_token_mirostat(
-                candidates=self._candidates,
-                tau=mirostat_tau,
-                eta=mirostat_eta,
-                mu=ctypes.pointer(self._mirostat_mu),
-                m=100,
-            )
-        elif mirostat_mode == 2:
-            self._ctx.sample_temp(candidates=self._candidates, temp=temp)
-            id = self._ctx.sample_token_mirostat_v2(
-                candidates=self._candidates,
-                tau=mirostat_tau,
-                eta=mirostat_eta,
-                mu=ctypes.pointer(self._mirostat_mu),
-            )
-        else:
-            self._ctx.sample_top_k(candidates=self._candidates, k=top_k, min_keep=1)
-            self._ctx.sample_tail_free(candidates=self._candidates, z=tfs_z, min_keep=1)
-            self._ctx.sample_typical(
-                candidates=self._candidates, p=typical_p, min_keep=1
-            )
-            self._ctx.sample_top_p(candidates=self._candidates, p=top_p, min_keep=1)
-            self._ctx.sample_min_p(candidates=self._candidates, p=min_p, min_keep=1)
-            self._ctx.sample_temp(candidates=self._candidates, temp=temp)
-            id = self._ctx.sample_token(candidates=self._candidates)
-        if grammar is not None:
-            self._ctx.grammar_accept_token(grammar=grammar, token=id)
         return id
     def generate(
@@ -656,34 +636,56 @@ class Llama:
         if grammar is not None:
             grammar.reset()
+        sample_idx = self.n_tokens + len(tokens) - 1
+        tokens = list(tokens)
         # Eval and sample
         while True:
             self.eval(tokens)
-            token = self.sample(
-                top_k=top_k,
-                top_p=top_p,
-                min_p=min_p,
-                typical_p=typical_p,
-                temp=temp,
-                repeat_penalty=repeat_penalty,
-                frequency_penalty=frequency_penalty,
-                presence_penalty=presence_penalty,
-                tfs_z=tfs_z,
-                mirostat_mode=mirostat_mode,
-                mirostat_tau=mirostat_tau,
-                mirostat_eta=mirostat_eta,
-                logits_processor=logits_processor,
-                grammar=grammar,
-                penalize_nl=penalize_nl,
-            )
-            if stopping_criteria is not None and stopping_criteria(
-                self._input_ids, self._scores[-1, :]
-            ):
-                return
-            tokens_or_none = yield token
-            tokens = [token]
-            if tokens_or_none is not None:
-                tokens.extend(tokens_or_none)
+            while sample_idx < self.n_tokens:
+                token = self.sample(
+                    top_k=top_k,
+                    top_p=top_p,
+                    min_p=min_p,
+                    typical_p=typical_p,
+                    temp=temp,
+                    repeat_penalty=repeat_penalty,
+                    frequency_penalty=frequency_penalty,
+                    presence_penalty=presence_penalty,
+                    tfs_z=tfs_z,
+                    mirostat_mode=mirostat_mode,
+                    mirostat_tau=mirostat_tau,
+                    mirostat_eta=mirostat_eta,
+                    logits_processor=logits_processor,
+                    grammar=grammar,
+                    penalize_nl=penalize_nl,
+                    idx=sample_idx,
+                )
+                sample_idx += 1
+                if stopping_criteria is not None and stopping_criteria(
+                    self._input_ids, self._scores[-1, :]
+                ):
+                    return
+                tokens_or_none = yield token
+                tokens.clear()
+                tokens.append(token)
+                if tokens_or_none is not None:
+                    tokens.extend(tokens_or_none)
+                if sample_idx < self.n_tokens and token != self._input_ids[sample_idx]:
+                    self.n_tokens = sample_idx
+                    self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
+                    break
+            if self.draft_model is not None:
+                self.input_ids[self.n_tokens : self.n_tokens + len(tokens)] = tokens
+                draft_tokens = self.draft_model(self.input_ids[:self.n_tokens + len(tokens)])
+                tokens.extend(
+                    draft_tokens.astype(int)[
+                        : self._n_ctx - self.n_tokens - len(tokens)
+                    ]
+                )
     def create_embedding(
         self, input: Union[str, List[str]], model: Optional[str] = None

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.39}/llama_cpp/llama_chat_format.py RENAMED Viewed

@@ -185,16 +185,17 @@ class Jinja2ChatFormatter(ChatFormatter):
         messages: List[llama_types.ChatCompletionRequestMessage],
         **kwargs: Any,
     ) -> ChatFormatterResponse:
-        if self.add_generation_prompt:
-            messages = [
-                *messages,
-                llama_types.ChatCompletionRequestAssistantMessage(
-                    role="assistant", content=""
-                ),
-            ]
+        def raise_exception(message: str):
+            raise ValueError(message)
         prompt = self._environment.render(
-            messages=messages, eos_token=self.eos_token, bos_token=self.bos_token
+            messages=messages,
+            eos_token=self.eos_token,
+            bos_token=self.bos_token,
+            raise_exception=raise_exception,
+            add_generation_prompt=self.add_generation_prompt
         )
         return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token])
     def to_chat_handler(self) -> LlamaChatCompletionHandler:

llama-cpp-python 0.2.37__tar.gz → 0.2.39__tar.gz

llama-cpp-python 0.2.37__tar.gz → 0.2.39__tar.gz