PyPI - llama-cpp-python - Versions diffs - 0.2.53__tar.gz → 0.2.54__tar.gz - Mend

llama-cpp-python 0.2.53__tar.gz → 0.2.54__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (981) hide show

llama_cpp_python-0.2.54/.git/FETCH_HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ d5df431278433b580e52222dbf4174f5102585b1 'd5df431278433b580e52222dbf4174f5102585b1' of https://github.com/abetlen/llama-cpp-python

llama_cpp_python-0.2.54/.git/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ d5df431278433b580e52222dbf4174f5102585b1

{llama_cpp_python-0.2.53 → llama_cpp_python-0.2.54}/.git/config RENAMED Viewed

@@ -9,7 +9,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX1FVcUlkd2s5NjZYRlh5RXNidElxQTluejZXYXhOWDNTRUQyaQ==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzXzdpQXRKODgxbmFTbmVlTHlEM0pqN0pCRE1mazRldzBySUlxMw==
 [submodule "vendor/llama.cpp"]
 	active = true
 	url = https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.54/.git/index ADDED Viewed

Binary file

llama_cpp_python-0.2.54/.git/logs/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 d5df431278433b580e52222dbf4174f5102585b1 runner <runner@fv-az1240-729.ompflkn1s5uuratdiiqak2yqtf.ex.internal.cloudapp.net> 1709316972 +0000 checkout: moving from master to refs/tags/v0.2.54

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/FETCH_HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ c2224f003bf9cf558b1a3c57033563e11a4de9a5 'c2224f003bf9cf558b1a3c57033563e11a4de9a5' of https://github.com/ggerganov/llama.cpp

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ c2224f003bf9cf558b1a3c57033563e11a4de9a5

{llama_cpp_python-0.2.53 → llama_cpp_python-0.2.54}/.git/modules/vendor/llama.cpp/config RENAMED Viewed

@@ -16,7 +16,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX1FVcUlkd2s5NjZYRlh5RXNidElxQTluejZXYXhOWDNTRUQyaQ==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzXzdpQXRKODgxbmFTbmVlTHlEM0pqN0pCRE1mazRldzBySUlxMw==
 [url "https://github.com/"]
 	insteadOf = git@github.com:
 	insteadOf = org-6826477@github.com:

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/index ADDED Viewed

Binary file

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/logs/HEAD ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ 0000000000000000000000000000000000000000 38d16b142624bdd7c41d9955752b7f7b59c5e048 runner <runner@fv-az1240-729.ompflkn1s5uuratdiiqak2yqtf.ex.internal.cloudapp.net> 1709316973 +0000 clone: from https://github.com/ggerganov/llama.cpp.git
2	+ 38d16b142624bdd7c41d9955752b7f7b59c5e048 c2224f003bf9cf558b1a3c57033563e11a4de9a5 runner <runner@fv-az1240-729.ompflkn1s5uuratdiiqak2yqtf.ex.internal.cloudapp.net> 1709316974 +0000 checkout: moving from master to c2224f003bf9cf558b1a3c57033563e11a4de9a5

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/logs/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 38d16b142624bdd7c41d9955752b7f7b59c5e048 runner <runner@fv-az1240-729.ompflkn1s5uuratdiiqak2yqtf.ex.internal.cloudapp.net> 1709316973 +0000 clone: from https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/logs/refs/remotes/origin/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 38d16b142624bdd7c41d9955752b7f7b59c5e048 runner <runner@fv-az1240-729.ompflkn1s5uuratdiiqak2yqtf.ex.internal.cloudapp.net> 1709316973 +0000 clone: from https://github.com/ggerganov/llama.cpp.git

{llama_cpp_python-0.2.53 → llama_cpp_python-0.2.54}/.git/modules/vendor/llama.cpp/modules/kompute/config RENAMED Viewed

@@ -13,7 +13,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX1FVcUlkd2s5NjZYRlh5RXNidElxQTluejZXYXhOWDNTRUQyaQ==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzXzdpQXRKODgxbmFTbmVlTHlEM0pqN0pCRE1mazRldzBySUlxMw==
 [url "https://github.com/"]
 	insteadOf = git@github.com:
 	insteadOf = org-6826477@github.com:

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/modules/kompute/index ADDED Viewed

Binary file

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/modules/kompute/logs/HEAD ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ 0000000000000000000000000000000000000000 d1e3b0953cf66acc94b2e29693e221427b2c1f3f runner <runner@fv-az1240-729.ompflkn1s5uuratdiiqak2yqtf.ex.internal.cloudapp.net> 1709316974 +0000 clone: from https://github.com/nomic-ai/kompute.git
2	+ d1e3b0953cf66acc94b2e29693e221427b2c1f3f 4565194ed7c32d1d2efa32ceab4d3c6cae006306 runner <runner@fv-az1240-729.ompflkn1s5uuratdiiqak2yqtf.ex.internal.cloudapp.net> 1709316975 +0000 checkout: moving from master to 4565194ed7c32d1d2efa32ceab4d3c6cae006306

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/modules/kompute/logs/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 d1e3b0953cf66acc94b2e29693e221427b2c1f3f runner <runner@fv-az1240-729.ompflkn1s5uuratdiiqak2yqtf.ex.internal.cloudapp.net> 1709316974 +0000 clone: from https://github.com/nomic-ai/kompute.git

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/modules/kompute/logs/refs/remotes/origin/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 d1e3b0953cf66acc94b2e29693e221427b2c1f3f runner <runner@fv-az1240-729.ompflkn1s5uuratdiiqak2yqtf.ex.internal.cloudapp.net> 1709316974 +0000 clone: from https://github.com/nomic-ai/kompute.git

llama_cpp_python-0.2.53/.git/modules/vendor/llama.cpp/modules/kompute/objects/pack/pack-aea54470ccfced130dc113c076f9a5f9e05cddbf.idx → llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/modules/kompute/objects/pack/pack-dfe06cade21d4a3c314f514ca2e7bec04aebe5ea.idx RENAMED Viewed

Binary file

llama_cpp_python-0.2.53/.git/modules/vendor/llama.cpp/modules/kompute/objects/pack/pack-aea54470ccfced130dc113c076f9a5f9e05cddbf.pack → llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/modules/kompute/objects/pack/pack-dfe06cade21d4a3c314f514ca2e7bec04aebe5ea.pack RENAMED Viewed

Binary file

llama_cpp_python-0.2.53/.git/modules/vendor/llama.cpp/modules/kompute/objects/pack/pack-aea54470ccfced130dc113c076f9a5f9e05cddbf.rev → llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/modules/kompute/objects/pack/pack-dfe06cade21d4a3c314f514ca2e7bec04aebe5ea.rev RENAMED Viewed

Binary file

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/objects/4e/25511d01db635da1739c66e6c5551adccf8ebc ADDED Viewed

Binary file

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/objects/54/01e197f67d7e05c4ea5460287555c6722298d6 ADDED Viewed

Binary file

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/objects/60/7fe49d3ff151e661f52f83363b8f137d605995 ADDED Viewed

Binary file

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/objects/ad/35306c60c4e6ab83f1d6e1fb0cd94f05231f6b ADDED Viewed

Binary file

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/objects/c2/224f003bf9cf558b1a3c57033563e11a4de9a5 ADDED Viewed

Binary file

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/objects/ea/0610dbdc12d3c1527584fe710b172def2a22b2 ADDED Viewed

Binary file

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/objects/ec/711ee993236600dbf9b2d5c72692eac9425255 ADDED Viewed

Binary file

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/objects/pack/pack-1490421c864662f09473b15fde42fa3d90801bd4.idx ADDED Viewed

Binary file

llama_cpp_python-0.2.53/.git/modules/vendor/llama.cpp/objects/pack/pack-2d74f02a59f1943182604fe1cb36111024efff36.pack → llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/objects/pack/pack-1490421c864662f09473b15fde42fa3d90801bd4.pack RENAMED Viewed

Binary file

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/objects/pack/pack-1490421c864662f09473b15fde42fa3d90801bd4.rev ADDED Viewed

Binary file

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/packed-refs ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ # pack-refs with: peeled fully-peeled sorted
2	+ 38d16b142624bdd7c41d9955752b7f7b59c5e048 refs/remotes/origin/master

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 38d16b142624bdd7c41d9955752b7f7b59c5e048

llama_cpp_python-0.2.54/.git/modules/vendor/llama.cpp/shallow ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ 38d16b142624bdd7c41d9955752b7f7b59c5e048
2	+ c2224f003bf9cf558b1a3c57033563e11a4de9a5

llama_cpp_python-0.2.54/.git/objects/pack/pack-2f0cfa1ecc73c12b9a6897e7c351b84eb55c7590.idx ADDED Viewed

Binary file

llama_cpp_python-0.2.53/.git/objects/pack/pack-61676b95c7210237f5ac382e586f3f1575687cf6.pack → llama_cpp_python-0.2.54/.git/objects/pack/pack-2f0cfa1ecc73c12b9a6897e7c351b84eb55c7590.pack RENAMED Viewed

Binary file

llama_cpp_python-0.2.54/.git/objects/pack/pack-2f0cfa1ecc73c12b9a6897e7c351b84eb55c7590.rev ADDED Viewed

Binary file

llama_cpp_python-0.2.54/.git/refs/tags/v0.2.54 ADDED Viewed

	@@ -0,0 +1 @@
1	+ d5df431278433b580e52222dbf4174f5102585b1

llama_cpp_python-0.2.54/.git/shallow ADDED Viewed

	@@ -0,0 +1 @@
1	+ d5df431278433b580e52222dbf4174f5102585b1

{llama_cpp_python-0.2.53 → llama_cpp_python-0.2.54}/CHANGELOG.md RENAMED Viewed

@@ -7,6 +7,11 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [0.2.54]
+- feat: Update llama.cpp to ggerganov/llama.cpp@cb49e0f8c906e5da49e9f6d64a57742a9a241c6a
+- docs: fix typo in README.md embeddings example by @iamlemec in #1232
 ## [0.2.53]
 - feat: Update llama.cpp to ggerganov/llama.cpp@cb49e0f8c906e5da49e9f6d64a57742a9a241c6a

{llama_cpp_python-0.2.53 → llama_cpp_python-0.2.54}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama_cpp_python
-Version: 0.2.53
+Version: 0.2.54
 Summary: Python bindings for the llama.cpp library
 Author-Email: Andrei Betlen <abetlen@gmail.com>
 License: MIT
@@ -329,7 +329,16 @@ By default [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest
 The high-level API also provides a simple interface for chat completion.
-Note that `chat_format` option must be set for the particular model you are using.
+Chat completion requires that the model know how to format the messages into a single prompt.
+The `Llama` class does this using pre-registered chat formats (e.g. `chatml`, `llama-2`, `gemma`, etc.) or by providing a custom chat handler object.
+The model will format the messages into a single prompt using the following order of precedence:
+  - Use the `chat_handler` if provided
+  - Use the `chat_format` if provided
+  - Use the `tokenizer.chat_template` from the `gguf` model's metadata (should work for most new models, older models may not have this)
+  - else, fallback to the `llama-2` chat format
+Set `verbose=True` to see the selected chat format.
 ```python
 >>> from llama_cpp import Llama
@@ -568,7 +577,7 @@ To generate text embeddings use [`create_embedding`](http://localhost:8000/api-r
 ```python
 import llama_cpp
-llm = llama_cpp.Llama(model_path="path/to/model.gguf", embeddings=True)
+llm = llama_cpp.Llama(model_path="path/to/model.gguf", embedding=True)
 embeddings = llm.create_embedding("Hello, world!")

{llama_cpp_python-0.2.53 → llama_cpp_python-0.2.54}/README.md RENAMED Viewed

@@ -286,7 +286,16 @@ By default [`from_pretrained`](https://llama-cpp-python.readthedocs.io/en/latest
 The high-level API also provides a simple interface for chat completion.
-Note that `chat_format` option must be set for the particular model you are using.
+Chat completion requires that the model know how to format the messages into a single prompt.
+The `Llama` class does this using pre-registered chat formats (e.g. `chatml`, `llama-2`, `gemma`, etc.) or by providing a custom chat handler object.
+The model will format the messages into a single prompt using the following order of precedence:
+  - Use the `chat_handler` if provided
+  - Use the `chat_format` if provided
+  - Use the `tokenizer.chat_template` from the `gguf` model's metadata (should work for most new models, older models may not have this)
+  - else, fallback to the `llama-2` chat format
+Set `verbose=True` to see the selected chat format.
 ```python
 >>> from llama_cpp import Llama
@@ -525,7 +534,7 @@ To generate text embeddings use [`create_embedding`](http://localhost:8000/api-r
 ```python
 import llama_cpp
-llm = llama_cpp.Llama(model_path="path/to/model.gguf", embeddings=True)
+llm = llama_cpp.Llama(model_path="path/to/model.gguf", embedding=True)
 embeddings = llm.create_embedding("Hello, world!")

{llama_cpp_python-0.2.53 → llama_cpp_python-0.2.54}/llama_cpp/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
-__version__ = "0.2.53"
+__version__ = "0.2.54"

{llama_cpp_python-0.2.53 → llama_cpp_python-0.2.54}/llama_cpp/_internals.py RENAMED Viewed

@@ -357,21 +357,6 @@ class _LlamaContext:
             penalty_present,
         )
-    def sample_classifier_free_guidance(
-        self,
-        candidates: "_LlamaTokenDataArray",
-        guidance_ctx: "_LlamaContext",
-        scale: float,
-    ):
-        assert self.ctx is not None
-        assert guidance_ctx.ctx is not None
-        llama_cpp.llama_sample_classifier_free_guidance(
-            self.ctx,
-            llama_cpp.byref(candidates.candidates),
-            guidance_ctx.ctx,
-            scale,
-        )
     def sample_softmax(self, candidates: "_LlamaTokenDataArray"):
         assert self.ctx is not None
         llama_cpp.llama_sample_softmax(
@@ -720,7 +705,7 @@ class _LlamaSamplingContext:
         return ctx_main.model.detokenize(self.prev[-n:]).decode("utf-8")
     def sample(
-        self, ctx_main: _LlamaContext, ctx_cfg: Optional[_LlamaContext] = None, idx: int = 0, logits_array: Optional[npt.NDArray[np.single]] = None
+        self, ctx_main: _LlamaContext, idx: int = 0, logits_array: Optional[npt.NDArray[np.single]] = None
     ):
         n_vocab = ctx_main.model.n_vocab()
         id: int = 0
@@ -741,11 +726,6 @@ class _LlamaSamplingContext:
         )  # TODO: Only create this once
         token_data_array.copy_logits(logits_array)
-        if ctx_cfg is not None:
-            ctx_main.sample_classifier_free_guidance(
-                token_data_array, ctx_cfg, self.params.cfg_scale
-            )
         # apply penalties
         if len(self.prev) > 0:
             nl_token = ctx_main.model.token_nl()

{llama_cpp_python-0.2.53 → llama_cpp_python-0.2.54}/llama_cpp/llama.py RENAMED Viewed

@@ -86,7 +86,6 @@ class Llama:
         yarn_beta_fast: float = 32.0,
         yarn_beta_slow: float = 1.0,
         yarn_orig_ctx: int = 0,
-        mul_mat_q: bool = True,
         logits_all: bool = False,
         embedding: bool = False,
         offload_kqv: bool = True,
@@ -291,7 +290,6 @@ class Llama:
             yarn_beta_slow if yarn_beta_slow != 0.0 else 0
         )
         self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0
-        self.context_params.mul_mat_q = mul_mat_q
         self.context_params.logits_all = (
             logits_all if draft_model is None else True
         )  # Must be set to True for speculative decoding
@@ -412,7 +410,7 @@ class Llama:
                 bos_token = self._model.token_get_text(bos_token_id)
                 if self.verbose:
-                    print(f"Using chat template: {template}", file=sys.stderr)
+                    print(f"Using gguf chat template: {template}", file=sys.stderr)
                     print(f"Using chat eos_token: {eos_token}", file=sys.stderr)
                     print(f"Using chat bos_token: {bos_token}", file=sys.stderr)
@@ -422,6 +420,8 @@ class Llama:
         if self.chat_format is None and self.chat_handler is None:
             self.chat_format = "llama-2"
+            if self.verbose:
+                print(f"Using fallback chat format: {chat_format}", file=sys.stderr)
     @property
     def ctx(self) -> llama_cpp.llama_context_p:
@@ -1724,7 +1724,6 @@ class Llama:
             yarn_beta_fast=self.context_params.yarn_beta_fast,
             yarn_beta_slow=self.context_params.yarn_beta_slow,
             yarn_orig_ctx=self.context_params.yarn_orig_ctx,
-            mul_mat_q=self.context_params.mul_mat_q,
             logits_all=self.context_params.logits_all,
             embedding=self.context_params.embedding,
             # Sampling Params
@@ -1768,7 +1767,6 @@ class Llama:
             yarn_beta_fast=state["yarn_beta_fast"],
             yarn_beta_slow=state["yarn_beta_slow"],
             yarn_orig_ctx=state["yarn_orig_ctx"],
-            mul_mat_q=state["mul_mat_q"],
             logits_all=state["logits_all"],
             embedding=state["embedding"],
             # Sampling Params

{llama_cpp_python-0.2.53 → llama_cpp_python-0.2.54}/llama_cpp/llama_cpp.py RENAMED Viewed

@@ -111,6 +111,7 @@ if TYPE_CHECKING:
 F = TypeVar("F", bound=Callable[..., Any])
 def ctypes_function_for_shared_library(lib: ctypes.CDLL):
     def ctypes_function(
         name: str, argtypes: List[Any], restype: Any, enabled: bool = True
@@ -558,9 +559,7 @@ class llama_model_params(ctypes.Structure):
 #     enum ggml_type type_k; // data type for K cache
 #     enum ggml_type type_v; // data type for V cache
 #     // Keep the booleans together to avoid misalignment during copy-by-value.
-#     bool mul_mat_q;   // if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
 #     bool logits_all;  // the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 #     bool embedding;   // embedding mode only
 #     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
@@ -588,7 +587,6 @@ class llama_context_params(ctypes.Structure):
         cb_eval_user_data (ctypes.ctypes.c_void_p): user data for cb_eval
         type_k (int): data type for K cache
         type_v (int): data type for V cache
-        mul_mat_q (bool): if true, use experimental mul_mat_q kernels (DEPRECATED - always true)
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embedding (bool): embedding mode only
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
@@ -614,7 +612,6 @@ class llama_context_params(ctypes.Structure):
         ("cb_eval_user_data", ctypes.c_void_p),
         ("type_k", ctypes.c_int),
         ("type_v", ctypes.c_int),
-        ("mul_mat_q", ctypes.c_bool),
         ("logits_all", ctypes.c_bool),
         ("embedding", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
@@ -938,18 +935,6 @@ def llama_supports_gpu_offload() -> bool:
     ...
-# LLAMA_API DEPRECATED(bool llama_mmap_supported (void), "use llama_supports_mmap() instead");
-@ctypes_function("llama_mmap_supported", [], ctypes.c_bool)
-def llama_mmap_supported() -> bool:
-    ...
-# LLAMA_API DEPRECATED(bool llama_mlock_supported(void), "use llama_supports_mlock() instead");
-@ctypes_function("llama_mlock_supported", [], ctypes.c_bool)
-def llama_mlock_supported() -> bool:
-    ...
 # LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx);
 @ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes)
 def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]:
@@ -1158,47 +1143,6 @@ def llama_model_quantize(
     ...
-# // Apply a LoRA adapter to a loaded model
-# // path_base_model is the path to a higher quality model to use as a base for
-# // the layers modified by the adapter. Can be NULL to use the current loaded model.
-# // The model needs to be reloaded before applying a new adapter, otherwise the adapter
-# // will be applied on top of the previous one
-# // Returns 0 on success
-# LLAMA_API DEPRECATED(int32_t llama_apply_lora_from_file(
-#         struct llama_context * ctx,
-#                   const char * path_lora,
-#                        float   scale,
-#                   const char * path_base_model,
-#                      int32_t   n_threads),
-#         "use llama_model_apply_lora_from_file instead");
-@ctypes_function(
-    "llama_apply_lora_from_file",
-    [
-        llama_context_p_ctypes,
-        ctypes.c_char_p,
-        ctypes.c_float,
-        ctypes.c_char_p,
-        ctypes.c_int32,
-    ],
-    ctypes.c_int32,
-)
-def llama_apply_lora_from_file(
-    ctx: llama_context_p,
-    path_lora: Union[ctypes.c_char_p, bytes],
-    scale: Union[ctypes.c_float, float],
-    path_base_model: Union[ctypes.c_char_p, bytes],
-    n_threads: Union[ctypes.c_int32, int],
-    /,
-) -> int:
-    """Apply a LoRA adapter to a loaded model
-    path_base_model is the path to a higher quality model to use as a base for
-    the layers modified by the adapter. Can be NULL to use the current loaded model.
-    The model needs to be reloaded before applying a new adapter, otherwise the adapter
-    will be applied on top of the previous one
-    Returns 0 on success"""
-    ...
 # LLAMA_API int32_t llama_model_apply_lora_from_file(
 #         const struct llama_model * model,
 #                   const char * path_lora,
@@ -1220,7 +1164,7 @@ def llama_model_apply_lora_from_file(
     model: llama_model_p,
     path_lora: Union[ctypes.c_char_p, bytes],
     scale: Union[ctypes.c_float, float],
-    path_base_model: Union[ctypes.c_char_p, bytes],
+    path_base_model: Union[ctypes.c_char_p, bytes, None],
     n_threads: Union[ctypes.c_int32, int],
     /,
 ) -> int:
@@ -1571,11 +1515,11 @@ def llama_copy_state_data(
     ...
-# Set the state reading from the specified address
-# Returns the number of bytes read
+# // Set the state reading from the specified address
+# // Returns the number of bytes read
 # LLAMA_API size_t llama_set_state_data(
 #         struct llama_context * ctx,
-#                      uint8_t * src);
+#                const uint8_t * src);
 @ctypes_function(
     "llama_set_state_data",
     [llama_context_p_ctypes, ctypes.POINTER(ctypes.c_uint8)],
@@ -1647,72 +1591,6 @@ def llama_save_session_file(
 # //
-# // Run the llama inference to obtain the logits and probabilities for the next token(s).
-# // tokens + n_tokens is the provided batch of new tokens to process
-# // n_past is the number of tokens to use from previous eval calls
-# // Returns 0 on success
-# // DEPRECATED: use llama_decode() instead
-# LLAMA_API DEPRECATED(int llama_eval(
-#         struct llama_context * ctx,
-#                  llama_token * tokens,
-#                      int32_t   n_tokens,
-#                      int32_t   n_past),
-#         "use llama_decode() instead");
-@ctypes_function(
-    "llama_eval",
-    [
-        llama_context_p_ctypes,
-        llama_token_p,
-        ctypes.c_int32,
-        ctypes.c_int32,
-    ],
-    ctypes.c_int,
-)
-def llama_eval(
-    ctx: llama_context_p,
-    tokens: CtypesArray[llama_token],
-    n_tokens: Union[ctypes.c_int, int],
-    n_past: Union[ctypes.c_int, int],
-    /,
-) -> int:
-    """Run the llama inference to obtain the logits and probabilities for the next token(s).
-    tokens + n_tokens is the provided batch of new tokens to process
-    n_past is the number of tokens to use from previous eval calls
-    Returns 0 on success
-    DEPRECATED: use llama_decode() instead"""
-    ...
-# // Same as llama_eval, but use float matrix input directly.
-# // DEPRECATED: use llama_decode() instead
-# LLAMA_API DEPRECATED(int llama_eval_embd(
-#         struct llama_context * ctx,
-#                        float * embd,
-#                      int32_t   n_tokens,
-#                      int32_t   n_past),
-#         "use llama_decode() instead");
-@ctypes_function(
-    "llama_eval_embd",
-    [
-        llama_context_p_ctypes,
-        ctypes.POINTER(ctypes.c_float),
-        ctypes.c_int32,
-        ctypes.c_int32,
-    ],
-    ctypes.c_int,
-)
-def llama_eval_embd(
-    ctx: llama_context_p,
-    embd: CtypesArray[ctypes.c_float],
-    n_tokens: Union[ctypes.c_int, int],
-    n_past: Union[ctypes.c_int, int],
-    /,
-) -> int:
-    """Same as llama_eval, but use float matrix input directly.
-    DEPRECATED: use llama_decode() instead"""
-    ...
 # // Return batch for single sequence of tokens starting at pos_0
 # //
 # // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it
@@ -2247,35 +2125,6 @@ def llama_sample_apply_guidance(
     ...
-# LLAMA_API DEPRECATED(void llama_sample_classifier_free_guidance(
-#           struct llama_context * ctx,
-#         llama_token_data_array * candidates,
-#           struct llama_context * guidance_ctx,
-#                          float   scale),
-#           "use llama_sample_apply_guidance() instead");
-@ctypes_function(
-    "llama_sample_classifier_free_guidance",
-    [
-        llama_context_p_ctypes,
-        llama_token_data_array_p,
-        llama_context_p_ctypes,
-        ctypes.c_float,
-    ],
-    None,
-)
-def llama_sample_classifier_free_guidance(
-    ctx: llama_context_p,
-    candidates: Union[
-        CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array]
-    ],
-    guidance_ctx: llama_context_p,
-    scale: Union[ctypes.c_float, float],
-    /,
-):
-    """Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806"""
-    ...
 # /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.
 # LLAMA_API void llama_sample_softmax(
 #         struct llama_context * ctx,
@@ -2474,28 +2323,6 @@ def llama_sample_temp(
     ...
-# LLAMA_API DEPRECATED(void llama_sample_temperature(
-#             struct llama_context * ctx,
-#           llama_token_data_array * candidates,
-#                            float   temp),
-#         "use llama_sample_temp instead");
-@ctypes_function(
-    "llama_sample_temperature",
-    [llama_context_p_ctypes, llama_token_data_array_p, ctypes.c_float],
-    None,
-)
-def llama_sample_temperature(
-    ctx: llama_context_p,
-    candidates: Union[
-        CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array]
-    ],
-    temp: Union[ctypes.c_float, float],
-    /,
-):
-    """use llama_sample_temp instead"""
-    ...
 # /// @details Apply constraints from grammar
 # LLAMA_API void llama_sample_grammar(
 #         struct llama_context * ctx,

llama-cpp-python 0.2.53__tar.gz → 0.2.54__tar.gz

llama-cpp-python 0.2.53__tar.gz → 0.2.54__tar.gz