PyPI - llama-cpp-python - Versions diffs - 0.2.37__tar.gz → 0.2.38__tar.gz - Mend

llama-cpp-python 0.2.37__tar.gz → 0.2.38__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (701) hide show

llama_cpp_python-0.2.38/.git/FETCH_HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 3322eadbf30a68731f6aafe0b4d055255b46d8f7 '3322eadbf30a68731f6aafe0b4d055255b46d8f7' of https://github.com/abetlen/llama-cpp-python

llama_cpp_python-0.2.38/.git/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 3322eadbf30a68731f6aafe0b4d055255b46d8f7

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.38}/.git/config RENAMED Viewed

@@ -9,7 +9,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX2kzdHlYMTBndHl4RXlDOFk1QmE3Nm1nVEdaSFNzQzFnRlh5TA==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX2ptaUFjRVlmS1RIZmJGRE9hdnltMDZJQ0p2MGVoTjFxOGFWNQ==
 [submodule "vendor/llama.cpp"]
 	active = true
 	url = https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.38/.git/index ADDED Viewed

Binary file

llama_cpp_python-0.2.38/.git/logs/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 3322eadbf30a68731f6aafe0b4d055255b46d8f7 runner <runner@fv-az1016-588.pv3vitign2bulj5h5vrau5ekvd.cx.internal.cloudapp.net> 1706732571 +0000 checkout: moving from master to refs/tags/v0.2.38

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.38}/.git/modules/vendor/llama.cpp/config RENAMED Viewed

@@ -13,7 +13,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX2kzdHlYMTBndHl4RXlDOFk1QmE3Nm1nVEdaSFNzQzFnRlh5TA==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX2ptaUFjRVlmS1RIZmJGRE9hdnltMDZJQ0p2MGVoTjFxOGFWNQ==
 [url "https://github.com/"]
 	insteadOf = git@github.com:
 	insteadOf = org-6826477@github.com:

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/index ADDED Viewed

Binary file

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/logs/HEAD ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ 0000000000000000000000000000000000000000 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915 runner <runner@fv-az1016-588.pv3vitign2bulj5h5vrau5ekvd.cx.internal.cloudapp.net> 1706732572 +0000 clone: from https://github.com/ggerganov/llama.cpp.git
2	+ 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915 runner <runner@fv-az1016-588.pv3vitign2bulj5h5vrau5ekvd.cx.internal.cloudapp.net> 1706732572 +0000 checkout: moving from master to 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/logs/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915 runner <runner@fv-az1016-588.pv3vitign2bulj5h5vrau5ekvd.cx.internal.cloudapp.net> 1706732572 +0000 clone: from https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/logs/refs/remotes/origin/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915 runner <runner@fv-az1016-588.pv3vitign2bulj5h5vrau5ekvd.cx.internal.cloudapp.net> 1706732572 +0000 clone: from https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/objects/pack/pack-840f4459d494ce7fd10b79596f309b54b31652b8.idx ADDED Viewed

Binary file

llama_cpp_python-0.2.37/.git/modules/vendor/llama.cpp/objects/pack/pack-2dab3d9b62e8dec5ea0d7cf3608572059e30b9db.pack → llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/objects/pack/pack-840f4459d494ce7fd10b79596f309b54b31652b8.pack RENAMED Viewed

Binary file

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/objects/pack/pack-840f4459d494ce7fd10b79596f309b54b31652b8.rev ADDED Viewed

Binary file

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/packed-refs ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ # pack-refs with: peeled fully-peeled sorted
2	+ 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915 refs/remotes/origin/master

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/shallow ADDED Viewed

	@@ -0,0 +1 @@
1	+ 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915

llama_cpp_python-0.2.38/.git/objects/pack/pack-a38d8e19feb0b2901a657ea0b79846878599b3fc.idx ADDED Viewed

Binary file

llama_cpp_python-0.2.38/.git/objects/pack/pack-a38d8e19feb0b2901a657ea0b79846878599b3fc.pack ADDED Viewed

Binary file

llama_cpp_python-0.2.38/.git/objects/pack/pack-a38d8e19feb0b2901a657ea0b79846878599b3fc.rev ADDED Viewed

Binary file

llama_cpp_python-0.2.38/.git/refs/tags/v0.2.38 ADDED Viewed

	@@ -0,0 +1 @@
1	+ 3322eadbf30a68731f6aafe0b4d055255b46d8f7

llama_cpp_python-0.2.38/.git/shallow ADDED Viewed

	@@ -0,0 +1 @@
1	+ 3322eadbf30a68731f6aafe0b4d055255b46d8f7

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.38}/CHANGELOG.md RENAMED Viewed

@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [0.2.38]
+- feat: Update llama.cpp to ggerganov/llama.cpp@1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915
+- feat: Add speculative decoding by @abetlen in #1120
+- fix: Pass raise_exception and add_generation_prompt to jinja2 chat template 078cca0361bf5a94d2cf52ed04980d20e32d6f95
 ## [0.2.37]
 - feat: Update llama.cpp to ggerganov/llama.cpp@fea4fd4ba7f6b754ac795387b275e1a014a77bde

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.38}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama_cpp_python
-Version: 0.2.37
+Version: 0.2.38
 Summary: Python bindings for the llama.cpp library
 Author-Email: Andrei Betlen <abetlen@gmail.com>
 License: MIT
@@ -421,6 +421,24 @@ Then you'll need to use a custom chat handler to load the clip model and process
 )
 ```
+### Speculative Decoding
+`llama-cpp-python` supports speculative decoding which allows the model to generate completions based on a draft model.
+The fastest way to use speculative decoding is through the `LlamaPromptLookupDecoding` class.
+Just pass this as a draft model to the `Llama` class during initialization.
+```python
+from llama_cpp import Llama
+from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
+llama = Llama(
+    model_path="path/to/model.gguf",
+    draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) # num_pred_tokens is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines.
+)
+```
 ### Adjusting the Context Window
 The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements.

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.38}/README.md RENAMED Viewed

@@ -378,6 +378,24 @@ Then you'll need to use a custom chat handler to load the clip model and process
 )
 ```
+### Speculative Decoding
+`llama-cpp-python` supports speculative decoding which allows the model to generate completions based on a draft model.
+The fastest way to use speculative decoding is through the `LlamaPromptLookupDecoding` class.
+Just pass this as a draft model to the `Llama` class during initialization.
+```python
+from llama_cpp import Llama
+from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
+llama = Llama(
+    model_path="path/to/model.gguf",
+    draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) # num_pred_tokens is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines.
+)
+```
 ### Adjusting the Context Window
 The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements.

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.38}/examples/high_level_api/fastapi_server.py RENAMED Viewed

@@ -9,7 +9,7 @@ export MODEL=../models/7B/...
 Then run:
 ```
-uvicorn llama_cpp.server.app:app --reload
+uvicorn --factory llama_cpp.server.app:create_app --reload
 ```
 or

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.38}/llama_cpp/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
-__version__ = "0.2.37"
+__version__ = "0.2.38"

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.38}/llama_cpp/llama.py RENAMED Viewed

@@ -30,6 +30,8 @@ from .llama_cache import (
 import llama_cpp.llama_cpp as llama_cpp
 import llama_cpp.llama_chat_format as llama_chat_format
+from llama_cpp.llama_speculative import LlamaDraftModel
 import numpy as np
 import numpy.typing as npt
@@ -39,6 +41,8 @@ from ._internals import (
     _LlamaContext,  # type: ignore
     _LlamaBatch,  # type: ignore
     _LlamaTokenDataArray,  # type: ignore
+    _LlamaSamplingParams,  # type: ignore
+    _LlamaSamplingContext,  # type: ignore
 )
@@ -89,6 +93,8 @@ class Llama:
         # Chat Format Params
         chat_format: Optional[str] = None,
         chat_handler: Optional[llama_chat_format.LlamaChatCompletionHandler] = None,
+        # Speculative Decoding
+        draft_model: Optional[LlamaDraftModel] = None,
         # Misc
         verbose: bool = True,
         # Extra Params
@@ -152,6 +158,7 @@ class Llama:
             numa: Enable NUMA support. (NOTE: The initial value of this parameter is used for the remainder of the program as this value is set in llama_backend_init)
             chat_format: String specifying the chat format to use when calling create_chat_completion.
             chat_handler: Optional chat handler to use when calling create_chat_completion.
+            draft_model: Optional draft model to use for speculative decoding.
             verbose: Print verbose output to stderr.
         Raises:
@@ -315,6 +322,8 @@ class Llama:
         self.chat_format = chat_format
         self.chat_handler = chat_handler
+        self.draft_model = draft_model
         self._n_vocab = self.n_vocab()
         self._n_ctx = self.n_ctx()
@@ -503,6 +512,7 @@ class Llama:
         penalize_nl: bool = True,
         logits_processor: Optional[LogitsProcessorList] = None,
         grammar: Optional[LlamaGrammar] = None,
+        idx: Optional[int] = None,
     ):
         """Sample a token from the model.
@@ -517,77 +527,46 @@ class Llama:
         """
         assert self._ctx is not None
         assert self.n_tokens > 0
-        last_n_tokens_data = [llama_cpp.llama_token(0)] * max(
-            0, self.last_n_tokens_size - self.n_tokens
-        ) + self._input_ids[-self.last_n_tokens_size :].tolist()
-        last_n_tokens_size = len(last_n_tokens_data)
-        n_vocab = self._n_vocab
-        n_ctx = self._n_ctx
-        top_k = n_vocab if top_k <= 0 else top_k
-        last_n_tokens_size = n_ctx if last_n_tokens_size < 0 else last_n_tokens_size
-        last_n_tokens_data_c = (llama_cpp.llama_token * last_n_tokens_size)(
-            *last_n_tokens_data
-        )
-        logits: npt.NDArray[np.single] = self._scores[-1, :]
+        if idx is None:
+            logits: npt.NDArray[np.single] = self._scores[-1, :]
+        else:
+            logits = self._scores[idx, :]
         if logits_processor is not None:
-            logits[:] = logits_processor(self._input_ids, logits)
-        nl_logit = logits[self._token_nl]
-        self._candidates.copy_logits(logits)
-        self._ctx.sample_repetition_penalties(
-            candidates=self._candidates,
-            last_tokens_data=last_n_tokens_data_c,
-            penalty_last_n=last_n_tokens_size,
+            logits[:] = (
+                logits_processor(self._input_ids, logits)
+                if idx is None
+                else logits_processor(self._input_ids[:idx], logits)
+            )
+        sampling_params = _LlamaSamplingParams(
+            top_k=top_k,
+            top_p=top_p,
+            min_p=min_p,
+            tfs_z=tfs_z,
+            typical_p=typical_p,
+            temp=temp,
+            penalty_last_n=self.last_n_tokens_size,
             penalty_repeat=repeat_penalty,
             penalty_freq=frequency_penalty,
             penalty_present=presence_penalty,
+            mirostat=mirostat_mode,
+            mirostat_tau=mirostat_tau,
+            mirostat_eta=mirostat_eta,
+            penalize_nl=penalize_nl,
+        )
+        sampling_context = _LlamaSamplingContext(
+            params=sampling_params,
+            grammar=grammar,
+        )
+        sampling_context.prev = list(self.eval_tokens)
+        id = sampling_context.sample(ctx_main=self._ctx, logits_array=logits)
+        sampling_context.accept(
+            ctx_main=self._ctx,
+            id=id,
+            apply_grammar=grammar is not None,
         )
-        if not penalize_nl:
-            self._candidates.candidates.data[self._token_nl].logit = llama_cpp.c_float(
-                nl_logit
-            )
-        if grammar is not None:
-            self._ctx.sample_grammar(
-                candidates=self._candidates,
-                grammar=grammar,
-            )
-        if temp < 0.0:
-            self._ctx.sample_softmax(candidates=self._candidates)
-            id = self._candidates.candidates.data[0].id
-        elif temp == 0.0:
-            id = self._ctx.sample_token_greedy(candidates=self._candidates)
-        elif mirostat_mode == 1:
-            self._ctx.sample_temp(candidates=self._candidates, temp=temp)
-            id = self._ctx.sample_token_mirostat(
-                candidates=self._candidates,
-                tau=mirostat_tau,
-                eta=mirostat_eta,
-                mu=ctypes.pointer(self._mirostat_mu),
-                m=100,
-            )
-        elif mirostat_mode == 2:
-            self._ctx.sample_temp(candidates=self._candidates, temp=temp)
-            id = self._ctx.sample_token_mirostat_v2(
-                candidates=self._candidates,
-                tau=mirostat_tau,
-                eta=mirostat_eta,
-                mu=ctypes.pointer(self._mirostat_mu),
-            )
-        else:
-            self._ctx.sample_top_k(candidates=self._candidates, k=top_k, min_keep=1)
-            self._ctx.sample_tail_free(candidates=self._candidates, z=tfs_z, min_keep=1)
-            self._ctx.sample_typical(
-                candidates=self._candidates, p=typical_p, min_keep=1
-            )
-            self._ctx.sample_top_p(candidates=self._candidates, p=top_p, min_keep=1)
-            self._ctx.sample_min_p(candidates=self._candidates, p=min_p, min_keep=1)
-            self._ctx.sample_temp(candidates=self._candidates, temp=temp)
-            id = self._ctx.sample_token(candidates=self._candidates)
-        if grammar is not None:
-            self._ctx.grammar_accept_token(grammar=grammar, token=id)
         return id
     def generate(
@@ -656,34 +635,56 @@ class Llama:
         if grammar is not None:
             grammar.reset()
+        sample_idx = self.n_tokens + len(tokens) - 1
+        tokens = list(tokens)
         # Eval and sample
         while True:
             self.eval(tokens)
-            token = self.sample(
-                top_k=top_k,
-                top_p=top_p,
-                min_p=min_p,
-                typical_p=typical_p,
-                temp=temp,
-                repeat_penalty=repeat_penalty,
-                frequency_penalty=frequency_penalty,
-                presence_penalty=presence_penalty,
-                tfs_z=tfs_z,
-                mirostat_mode=mirostat_mode,
-                mirostat_tau=mirostat_tau,
-                mirostat_eta=mirostat_eta,
-                logits_processor=logits_processor,
-                grammar=grammar,
-                penalize_nl=penalize_nl,
-            )
-            if stopping_criteria is not None and stopping_criteria(
-                self._input_ids, self._scores[-1, :]
-            ):
-                return
-            tokens_or_none = yield token
-            tokens = [token]
-            if tokens_or_none is not None:
-                tokens.extend(tokens_or_none)
+            while sample_idx < self.n_tokens:
+                token = self.sample(
+                    top_k=top_k,
+                    top_p=top_p,
+                    min_p=min_p,
+                    typical_p=typical_p,
+                    temp=temp,
+                    repeat_penalty=repeat_penalty,
+                    frequency_penalty=frequency_penalty,
+                    presence_penalty=presence_penalty,
+                    tfs_z=tfs_z,
+                    mirostat_mode=mirostat_mode,
+                    mirostat_tau=mirostat_tau,
+                    mirostat_eta=mirostat_eta,
+                    logits_processor=logits_processor,
+                    grammar=grammar,
+                    penalize_nl=penalize_nl,
+                    idx=sample_idx,
+                )
+                sample_idx += 1
+                if stopping_criteria is not None and stopping_criteria(
+                    self._input_ids, self._scores[-1, :]
+                ):
+                    return
+                tokens_or_none = yield token
+                tokens.clear()
+                tokens.append(token)
+                if tokens_or_none is not None:
+                    tokens.extend(tokens_or_none)
+                if sample_idx < self.n_tokens and token != self._input_ids[sample_idx]:
+                    self.n_tokens = sample_idx
+                    self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1)
+                    break
+            if self.draft_model is not None:
+                self.input_ids[self.n_tokens : self.n_tokens + len(tokens)] = tokens
+                draft_tokens = self.draft_model(self.input_ids[:self.n_tokens + len(tokens)])
+                tokens.extend(
+                    draft_tokens.astype(int)[
+                        : self._n_ctx - self.n_tokens - len(tokens)
+                    ]
+                )
     def create_embedding(
         self, input: Union[str, List[str]], model: Optional[str] = None

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.38}/llama_cpp/llama_chat_format.py RENAMED Viewed

@@ -185,16 +185,17 @@ class Jinja2ChatFormatter(ChatFormatter):
         messages: List[llama_types.ChatCompletionRequestMessage],
         **kwargs: Any,
     ) -> ChatFormatterResponse:
-        if self.add_generation_prompt:
-            messages = [
-                *messages,
-                llama_types.ChatCompletionRequestAssistantMessage(
-                    role="assistant", content=""
-                ),
-            ]
+        def raise_exception(message: str):
+            raise ValueError(message)
         prompt = self._environment.render(
-            messages=messages, eos_token=self.eos_token, bos_token=self.bos_token
+            messages=messages,
+            eos_token=self.eos_token,
+            bos_token=self.bos_token,
+            raise_exception=raise_exception,
+            add_generation_prompt=self.add_generation_prompt
         )
         return ChatFormatterResponse(prompt=prompt, stop=[self.eos_token])
     def to_chat_handler(self) -> LlamaChatCompletionHandler:

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.38}/llama_cpp/llama_cpp.py RENAMED Viewed

@@ -98,7 +98,7 @@ ggml_backend_sched_eval_callback = ctypes.CFUNCTYPE(c_bool, c_void_p, c_bool, c_
 # llama.h bindings
 _lib.llama_max_devices.argtypes = []
-_lib.llama_max_devices.restype = ctypes.c_int32
+_lib.llama_max_devices.restype = ctypes.c_size_t
 LLAMA_MAX_DEVICES = _lib.llama_max_devices()
@@ -390,7 +390,7 @@ class llama_model_kv_override(Structure):
 #     // LLAMA_SPLIT_LAYER: ignored
 #     int32_t main_gpu;
-#     // proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+#     // proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
 #     const float * tensor_split;
 #     // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
@@ -417,7 +417,7 @@ class llama_model_params(Structure):
         n_gpu_layers (int): number of layers to store in VRAM
         split_mode (int): how to split the model across multiple GPUs
         main_gpu (int): the GPU that is used for the entire model. main_gpu interpretation depends on split_mode: LLAMA_SPLIT_NONE: the GPU that is used for the entire model LLAMA_SPLIT_ROW: the GPU that is used for small tensors and intermediate results LLAMA_SPLIT_LAYER: ignored
-        tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: LLAMA_MAX_DEVICES
+        tensor_split (ctypes.Array[ctypes.c_float]): proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
         progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
         progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
         kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
@@ -760,16 +760,43 @@ _lib.llama_time_us.argtypes = []
 _lib.llama_time_us.restype = ctypes.c_int64
-# LLAMA_API int32_t  llama_max_devices(void);
+# LLAMA_API size_t llama_max_devices(void);
 def llama_max_devices() -> int:
     return _lib.llama_max_devices()
 _lib.llama_max_devices.argtypes = []
-_lib.llama_max_devices.restype = ctypes.c_int32
+_lib.llama_max_devices.restype = ctypes.c_size_t
-# LLAMA_API bool llama_mmap_supported (void);
+# LLAMA_API bool llama_supports_mmap       (void);
+def llama_supports_mmap() -> bool:
+    return _lib.llama_supports_mmap()
+_lib.llama_supports_mmap.argtypes = []
+_lib.llama_supports_mmap.restype = c_bool
+# LLAMA_API bool llama_supports_mlock      (void);
+def llama_supports_mlock() -> bool:
+    return _lib.llama_supports_mlock()
+_lib.llama_supports_mlock.argtypes = []
+_lib.llama_supports_mlock.restype = c_bool
+# LLAMA_API bool llama_supports_gpu_offload(void);
+def llama_supports_gpu_offload() -> bool:
+    return _lib.llama_supports_gpu_offload()
+_lib.llama_supports_gpu_offload.argtypes = []
+_lib.llama_supports_gpu_offload.restype = c_bool
+# LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
 def llama_mmap_supported() -> bool:
     return _lib.llama_mmap_supported()
@@ -778,7 +805,7 @@ _lib.llama_mmap_supported.argtypes = []
 _lib.llama_mmap_supported.restype = c_bool
-# LLAMA_API bool llama_mlock_supported(void);
+# LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
 def llama_mlock_supported() -> bool:
     return _lib.llama_mlock_supported()

llama_cpp_python-0.2.38/llama_cpp/llama_speculative.py ADDED Viewed

@@ -0,0 +1,64 @@
+import abc
+from typing import Any
+import numpy as np
+import numpy.typing as npt
+class LlamaDraftModel(abc.ABC):
+    @abc.abstractmethod
+    def __call__(
+        self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any
+    ) -> npt.NDArray[np.intc]:
+        raise NotImplementedError()
+class LlamaPromptLookupDecoding(LlamaDraftModel):
+    """Based on https://github.com/apoorvumang/prompt-lookup-decoding"""
+    def __init__(self, max_ngram_size: int = 2, num_pred_tokens: int = 10):
+        self.max_ngram_size = max_ngram_size
+        self.num_pred_tokens = num_pred_tokens
+    @staticmethod
+    def find_candidate_pred_tokens(
+        input_ids: npt.NDArray[np.intc],
+        max_ngram_size: int,
+        num_pred_tokens: int,
+    ):
+        input_length = input_ids.shape[0]
+        for ngram_size in range(min(max_ngram_size, input_length - 1), 0, -1):
+            # Create sliding windows of size ngram_size
+            windows = np.lib.stride_tricks.sliding_window_view(input_ids, (ngram_size,))
+            # Convert ngram to an array for comparison
+            ngram_array = input_ids[-ngram_size:]
+            # Find where the windows match the ngram
+            matches = np.all(windows == ngram_array, axis=1)
+            # Get the indices of matches
+            match_indices = np.nonzero(matches)[0]
+            # Iterate through match indices to find a valid continuation
+            for idx in match_indices:
+                start_idx = idx + ngram_size
+                end_idx = start_idx + num_pred_tokens
+                end_idx = min(end_idx, input_length)
+                if start_idx < end_idx:
+                    return input_ids[start_idx:end_idx]
+        # If no match is found, return an empty array
+        return np.array([], dtype=np.intc)
+    def __call__(
+        self, input_ids: npt.NDArray[np.intc], /, **kwargs: Any
+    ) -> npt.NDArray[np.intc]:
+        return self.find_candidate_pred_tokens(
+            input_ids=input_ids,
+            max_ngram_size=self.max_ngram_size,
+            num_pred_tokens=self.num_pred_tokens,
+        )

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.38}/llama_cpp/server/model.py RENAMED Viewed

@@ -5,6 +5,7 @@ import json
 from typing import Dict, Optional, Union, List
 import llama_cpp
+import llama_cpp.llama_speculative as llama_speculative
 from llama_cpp.server.settings import ModelSettings
@@ -92,6 +93,12 @@ class LlamaProxy:
                 )
             )
+        draft_model = None
+        if settings.draft_model is not None:
+            draft_model = llama_speculative.LlamaPromptLookupDecoding(
+                num_pred_tokens=settings.draft_model_num_pred_tokens
+            )
         kv_overrides: Optional[Dict[str, Union[bool, int, float]]] = None
         if settings.kv_overrides is not None:
             assert isinstance(settings.kv_overrides, list)
@@ -147,6 +154,8 @@ class LlamaProxy:
             # Chat Format Params
             chat_format=settings.chat_format,
             chat_handler=chat_handler,
+            # Speculative Decoding
+            draft_model=draft_model,
             # Misc
             verbose=settings.verbose,
         )

{llama_cpp_python-0.2.37 → llama_cpp_python-0.2.38}/llama_cpp/server/settings.py RENAMED Viewed

@@ -143,6 +143,15 @@ class ModelSettings(BaseSettings):
         default=None,
         description="The model name or path to a pretrained HuggingFace tokenizer model. Same as you would pass to AutoTokenizer.from_pretrained().",
     )
+    # Speculative Decoding
+    draft_model: Optional[str] = Field(
+        default=None,
+        description="Method to use for speculative decoding. One of (prompt-lookup-decoding).",
+    )
+    draft_model_num_pred_tokens: int = Field(
+        default=10,
+        description="Number of tokens to predict using the draft model.",
+    )
     # Misc
     verbose: bool = Field(
         default=True, description="Whether to print debug information."

llama_cpp_python-0.2.38/tests/test_llama_speculative.py ADDED Viewed

@@ -0,0 +1,16 @@
+import numpy as np
+from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
+def test_find_candidate_pred_tokens():
+    find_candidate_pred_tokens = LlamaPromptLookupDecoding.find_candidate_pred_tokens
+    # Test Case 1: Matching ngram is found
+    input_ids1 = np.array([1, 2, 3, 1, 2, 3, 1, 2, 3])
+    result1 = find_candidate_pred_tokens(input_ids1, max_ngram_size=3, num_pred_tokens=2)
+    assert np.array_equal(result1, np.array([1, 2]))
+    # Test Case 2: Matching ngram is not found
+    input_ids2 = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9])
+    result2 = find_candidate_pred_tokens(input_ids2, max_ngram_size=3, num_pred_tokens=2)
+    assert np.array_equal(result2, np.array([]))

llama-cpp-python 0.2.37__tar.gz → 0.2.38__tar.gz

llama-cpp-python 0.2.37__tar.gz → 0.2.38__tar.gz