PyPI - llama-cpp-python - Versions diffs - 0.2.24__tar.gz → 0.2.26__tar.gz - Mend

llama-cpp-python 0.2.24__tar.gz → 0.2.26__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (578) hide show

llama_cpp_python-0.2.26/.git/FETCH_HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 011c3630f5a130505458c29d58f1654d5efba3bf '011c3630f5a130505458c29d58f1654d5efba3bf' of https://github.com/abetlen/llama-cpp-python

llama_cpp_python-0.2.26/.git/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 011c3630f5a130505458c29d58f1654d5efba3bf

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.26}/.git/config RENAMED Viewed

@@ -9,7 +9,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX3JrYlR4S25WQUV4a0N2bEZ4QVF0cVR6dlo0VTM5eDNENGxubg==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX1U4WUFIMlNCR2xPQ1NjYm1HcTR5ZlZod0ZKeEFUQjJxSmllTw==
 [submodule "vendor/llama.cpp"]
 	active = true
 	url = https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.26/.git/index ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/logs/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 011c3630f5a130505458c29d58f1654d5efba3bf runner <runner@fv-az532-920.yhfsaq54z0vebhuvdla3z0z0vh.cx.internal.cloudapp.net> 1703716575 +0000 checkout: moving from master to refs/tags/v0.2.26

llama_cpp_python-0.2.26/.git/modules/vendor/llama.cpp/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ f6793491b5af6da75edad34d6f503ef86d31b09f

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.26}/.git/modules/vendor/llama.cpp/config RENAMED Viewed

@@ -13,7 +13,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX3JrYlR4S25WQUV4a0N2bEZ4QVF0cVR6dlo0VTM5eDNENGxubg==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX1U4WUFIMlNCR2xPQ1NjYm1HcTR5ZlZod0ZKeEFUQjJxSmllTw==
 [url "https://github.com/"]
 	insteadOf = git@github.com:
 	insteadOf = org-6826477@github.com:

llama_cpp_python-0.2.26/.git/modules/vendor/llama.cpp/index ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/modules/vendor/llama.cpp/logs/HEAD ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ 0000000000000000000000000000000000000000 f6793491b5af6da75edad34d6f503ef86d31b09f runner <runner@fv-az532-920.yhfsaq54z0vebhuvdla3z0z0vh.cx.internal.cloudapp.net> 1703716576 +0000 clone: from https://github.com/ggerganov/llama.cpp.git
2	+ f6793491b5af6da75edad34d6f503ef86d31b09f f6793491b5af6da75edad34d6f503ef86d31b09f runner <runner@fv-az532-920.yhfsaq54z0vebhuvdla3z0z0vh.cx.internal.cloudapp.net> 1703716576 +0000 checkout: moving from master to f6793491b5af6da75edad34d6f503ef86d31b09f

llama_cpp_python-0.2.26/.git/modules/vendor/llama.cpp/logs/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 f6793491b5af6da75edad34d6f503ef86d31b09f runner <runner@fv-az532-920.yhfsaq54z0vebhuvdla3z0z0vh.cx.internal.cloudapp.net> 1703716576 +0000 clone: from https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.26/.git/modules/vendor/llama.cpp/logs/refs/remotes/origin/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 f6793491b5af6da75edad34d6f503ef86d31b09f runner <runner@fv-az532-920.yhfsaq54z0vebhuvdla3z0z0vh.cx.internal.cloudapp.net> 1703716576 +0000 clone: from https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.26/.git/modules/vendor/llama.cpp/objects/pack/pack-0111b9b091eb3bb79082a11785b10794d01ad8e7.idx ADDED Viewed

Binary file

llama_cpp_python-0.2.24/.git/modules/vendor/llama.cpp/objects/pack/pack-b630c306381137c42fbadc02d9f2c158165b39b1.pack → llama_cpp_python-0.2.26/.git/modules/vendor/llama.cpp/objects/pack/pack-0111b9b091eb3bb79082a11785b10794d01ad8e7.pack RENAMED Viewed

Binary file

llama_cpp_python-0.2.26/.git/modules/vendor/llama.cpp/objects/pack/pack-0111b9b091eb3bb79082a11785b10794d01ad8e7.rev ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/modules/vendor/llama.cpp/packed-refs ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ # pack-refs with: peeled fully-peeled sorted
2	+ f6793491b5af6da75edad34d6f503ef86d31b09f refs/remotes/origin/master

llama_cpp_python-0.2.26/.git/modules/vendor/llama.cpp/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ f6793491b5af6da75edad34d6f503ef86d31b09f

llama_cpp_python-0.2.26/.git/modules/vendor/llama.cpp/refs/tags/b1708 ADDED Viewed

	@@ -0,0 +1 @@
1	+ f6793491b5af6da75edad34d6f503ef86d31b09f

llama_cpp_python-0.2.26/.git/modules/vendor/llama.cpp/shallow ADDED Viewed

	@@ -0,0 +1 @@
1	+ f6793491b5af6da75edad34d6f503ef86d31b09f

llama_cpp_python-0.2.26/.git/objects/01/1c3630f5a130505458c29d58f1654d5efba3bf ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/0d/870969f4b23bd92a09ec29134d3fb454d38bec ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/22/8e3b99b75ebb836a7e001d2c1316eb7799f3de ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/2e/ead7f8523a78a1f4d8c776de42d230265bcf37 ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/34/6b4631ebd1f4af85e9988d4a528e00edba6375 ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/46/6e2cf6a1f3a75ef800a56f5fd5e85090a98786 ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/5d/b46643482a53bdce74c1e51b4ff3944dcf866b ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/75/800c0d80955f2bc82409ca16eda2b0dc405757 ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/7f/2e3d8c1e2a505f69b12024b9cc80712f447cdd ADDED Viewed

Binary file

llama_cpp_python-0.2.24/.git/objects/3d/07614e35e03d55630abf4e92857441fdcaf91f → llama_cpp_python-0.2.26/.git/objects/80/7b0f57a8a873e58ade0ff0f5b0bcf0ff66b7f9 RENAMED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/8e/32d2c0edce725a47b5845463133919cd766a61 ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/91/78a22255fef8acfe298d0f27640f65f647d400 ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/97/fe6e180b574c24eb4f07ef229981a3ac478bb7 ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/9d/3d3559849603efda6f3c8181684e4d19e0ec79 ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/b9/373b7ac641e6e9c8d8cc64951139205d91d8bc ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/c5/4e4eb5ce2636abd78df46a7616cfe9196a1198 ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/c8/4fd04498c2fb188ff7c2a59473035fc90eb990 ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/cd/351ba33849dcf6af35b493f7405962fa1625d4 ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/d3/efb3a6fe1e001db62ec08e5d31ce1d08567045 ADDED Viewed

Binary file

llama_cpp_python-0.2.24/.git/objects/5d/f12aaf53a0e85f55e1aa0e5167bc831ab32783 → llama_cpp_python-0.2.26/.git/objects/eb/0fb9662e690d0f9de4632cddd321b3f872a725 RENAMED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/f0/827d762e852a21f6406c469300899d5f509b8f ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/objects/fa/dfc5fb4fe6f5eb6d5d98b62519e374a5202b00 ADDED Viewed

Binary file

llama_cpp_python-0.2.26/.git/refs/tags/v0.2.26 ADDED Viewed

	@@ -0,0 +1 @@
1	+ 011c3630f5a130505458c29d58f1654d5efba3bf

llama_cpp_python-0.2.26/.git/shallow ADDED Viewed

	@@ -0,0 +1 @@
1	+ 011c3630f5a130505458c29d58f1654d5efba3bf

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.26}/.github/ISSUE_TEMPLATE/bug_report.md RENAMED Viewed

@@ -64,7 +64,7 @@ Try the following:
 1. `git clone https://github.com/abetlen/llama-cpp-python`
 2. `cd llama-cpp-python`
 3. `rm -rf _skbuild/` # delete any old builds
-4. `python setup.py develop`
+4. `python -m pip install .`
 5. `cd ./vendor/llama.cpp`
 6. Follow [llama.cpp's instructions](https://github.com/ggerganov/llama.cpp#build) to `cmake` llama.cpp
 7. Run llama.cpp's `./main` with the same arguments you previously passed to llama-cpp-python and see if you can reproduce the issue. If you can, [log an issue with llama.cpp](https://github.com/ggerganov/llama.cpp/issues)

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.26}/CHANGELOG.md RENAMED Viewed

@@ -7,6 +7,19 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [0.2.26]
+- feat: Update llama.cpp to ggerganov/llama.cpp@f6793491b5af6da75edad34d6f503ef86d31b09f
+## [0.2.25]
+- feat(server): Multi model support by @D4ve-R in #931
+- feat(server): Support none defaulting to infinity for completions by @swg in #111
+- feat(server): Implement openai api compatible authentication by @docmeth2 in #1010
+- fix: text_offset of multi-token characters by @twaka in #1037
+- fix: ctypes bindings for kv override by @phiharri in #1011
+- fix: ctypes definitions of llama_kv_cache_view_update and llama_kv_cache_view_free. by @e-c-d in #1028
 ## [0.2.24]
 - feat: Update llama.cpp to ggerganov/llama.cpp@0e18b2e7d0b5c0a509ea40098def234b8d4a938a

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.26}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama_cpp_python
-Version: 0.2.24
+Version: 0.2.26
 Summary: Python bindings for the llama.cpp library
 Author-Email: Andrei Betlen <abetlen@gmail.com>
 License: MIT
@@ -60,11 +60,13 @@ This package provides:
     - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
     - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
     - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
+    - [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
 Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest).
 ## Installation
 `llama-cpp-python` can be installed directly from PyPI as a source distribution by running:
@@ -374,6 +376,7 @@ For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_
 - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
 - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
 - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
+- [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
 ## Docker image

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.26}/README.md RENAMED Viewed

@@ -18,11 +18,13 @@ This package provides:
     - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
     - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
     - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
+    - [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
 Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest).
 ## Installation
 `llama-cpp-python` can be installed directly from PyPI as a source distribution by running:
@@ -332,6 +334,7 @@ For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_
 - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
 - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
 - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
+- [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
 ## Docker image

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.26}/docs/server.md RENAMED Viewed

@@ -32,6 +32,12 @@ python3 -m llama_cpp.server --help
 NOTE: All server options are also available as environment variables. For example, `--model` can be set by setting the `MODEL` environment variable.
+Check out the server config reference below for more information on the available options.
+CLI arguments and environment variables are available for all of the fields defined in [`ServerSettings`](#llama_cpp.server.settings.ServerSettings) and [`ModelSettings`](#llama_cpp.server.settings.ModelSettings)
+Additionally, the server supports configuration via a config file; check out the [configuration section](#configuration-and-multi-model-support) for more information and examples.
 ## Guides
 ### Code Completion
@@ -121,4 +127,92 @@ response = client.chat.completions.create(
     ],
 )
 print(response)
-```
+```
+## Configuration and Multi-Model Support
+The server supports configuration via a JSON config file that can be passed using the `--config_file` parameter or the `CONFIG_FILE` environment variable.
+```bash
+python3 -m llama_cpp.server --config_file <config_file>
+```
+Config files support all of the server and model options supported by the CLI and environment variables; however, instead of only a single model, the config file can specify multiple models.
+The server supports routing requests to multiple models based on the `model` parameter in the request which matches against the `model_alias` in the config file.
+At the moment only a single model is loaded into memory at a time; the server will automatically load and unload models as needed.
+```json
+{
+    "host": "0.0.0.0",
+    "port": 8080,
+    "models": [
+        {
+            "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
+            "model_alias": "gpt-3.5-turbo",
+            "chat_format": "chatml",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 512,
+            "n_ctx": 2048
+        },
+        {
+            "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
+            "model_alias": "gpt-4",
+            "chat_format": "chatml",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 512,
+            "n_ctx": 2048
+        },
+        {
+            "model": "models/ggml_llava-v1.5-7b/ggml-model-q4_k.gguf",
+            "model_alias": "gpt-4-vision-preview",
+            "chat_format": "llava-1-5",
+            "clip_model_path": "models/ggml_llava-v1.5-7b/mmproj-model-f16.gguf",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 512,
+            "n_ctx": 2048
+        },
+        {
+            "model": "models/mistral-7b-v0.1-GGUF/ggml-model-Q4_K.gguf",
+            "model_alias": "text-davinci-003",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 512,
+            "n_ctx": 2048
+        },
+        {
+            "model": "models/replit-code-v1_5-3b-GGUF/replit-code-v1_5-3b.Q4_0.gguf",
+            "model_alias": "copilot-codex",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 1024,
+            "n_ctx": 9216
+        }
+    ]
+}
+```
+The config file format is defined by the [`ConfigFileSettings`](#llama_cpp.server.settings.ConfigFileSettings) class.
+## Server Options Reference
+::: llama_cpp.server.settings.ConfigFileSettings
+    options:
+        show_if_no_docstring: true
+::: llama_cpp.server.settings.ServerSettings
+    options:
+        show_if_no_docstring: true
+::: llama_cpp.server.settings.ModelSettings
+    options:
+        show_if_no_docstring: true

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.26}/llama_cpp/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
-__version__ = "0.2.24"
+__version__ = "0.2.26"

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.26}/llama_cpp/llama.py RENAMED Viewed

@@ -850,7 +850,7 @@ class Llama:
         )  # 0x7FFFFFFF is INT32 max, will be auto set to all layers
         self.model_params.main_gpu = main_gpu
         self.tensor_split = tensor_split
-        self._p_tensor_split = None
+        self._c_tensor_split = None
         if self.tensor_split is not None:
             if len(self.tensor_split) > llama_cpp.LLAMA_MAX_DEVICES:
                 raise ValueError(
@@ -1551,11 +1551,13 @@ class Llama:
                             "utf-8", errors="ignore"
                         )
                         text_offset = len(prompt) + len(
-                            self.detokenize(completion_tokens[:returned_tokens])
+                            self.detokenize(completion_tokens[:returned_tokens]).decode(
+                                "utf-8", errors="ignore"
+                            )
                         )
                         token_offset = len(prompt_tokens) + returned_tokens
                         logits = self._scores[token_offset - 1, :]
-                        current_logprobs = Llama.logits_to_logprobs(logits)
+                        current_logprobs = Llama.logits_to_logprobs(logits).tolist()
                         sorted_logprobs = list(
                             sorted(
                                 zip(current_logprobs, range(len(current_logprobs))),
@@ -1674,7 +1676,7 @@ class Llama:
                     )
                     token_offset = len(prompt_tokens) + returned_tokens - 1
                     logits = self._scores[token_offset, :]
-                    current_logprobs = Llama.logits_to_logprobs(logits)
+                    current_logprobs = Llama.logits_to_logprobs(logits).tolist()
                     sorted_logprobs = list(
                         sorted(
                             zip(current_logprobs, range(len(current_logprobs))),
@@ -1789,13 +1791,19 @@ class Llama:
             ]
             all_logprobs = Llama.logits_to_logprobs(self._scores)[token_offset:]
             # TODO: may be able to change this loop to use np.take_along_dim
-            for token, token_str, logprobs_token in zip(
-                all_tokens, all_token_strs, all_logprobs
+            for idx, (token, token_str, logprobs_token) in enumerate(
+                zip(all_tokens, all_token_strs, all_logprobs)
             ):
                 if token == self.token_bos():
                     continue
-                text_offsets.append(text_offset)
-                text_offset += len(token_str)
+                text_offsets.append(
+                    text_offset
+                    + len(
+                        self.detokenize(all_tokens[:idx]).decode(
+                            "utf-8", errors="ignore"
+                        )
+                    )
+                )
                 tokens.append(token_str)
                 sorted_logprobs = list(
                     sorted(
@@ -1909,7 +1917,7 @@ class Llama:
         completion_or_chunks = self._create_completion(
             prompt=prompt,
             suffix=suffix,
-            max_tokens=max_tokens,
+            max_tokens=-1 if max_tokens is None else max_tokens,
             temperature=temperature,
             top_p=top_p,
             min_p=min_p,
@@ -1943,7 +1951,7 @@ class Llama:
         self,
         prompt: str,
         suffix: Optional[str] = None,
-        max_tokens: int = 128,
+        max_tokens: Optional[int] = 16,
         temperature: float = 0.8,
         top_p: float = 0.95,
         min_p: float = 0.05,

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.26}/llama_cpp/llama_cpp.py RENAMED Viewed

@@ -9,6 +9,7 @@ from ctypes import (
     c_int32,
     c_uint8,
     c_uint32,
+    c_int64,
     c_size_t,
     c_float,
     c_double,
@@ -16,6 +17,7 @@ from ctypes import (
     POINTER,
     _Pointer,  # type: ignore
     Structure,
+    Union as CtypesUnion,
     Array,
 )
 import pathlib
@@ -60,6 +62,9 @@ def _load_shared_library(lib_base_name: str):
         if "CUDA_PATH" in os.environ:
             os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))
             os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib"))
+        if "HIP_PATH" in os.environ:
+            os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin"))
+            os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "lib"))
         cdll_args["winmode"] = ctypes.RTLD_GLOBAL
     # Try to load the shared library, handling potential errors
@@ -88,9 +93,7 @@ c_size_t_p = POINTER(c_size_t)
 # llama.h bindings
-GGML_USE_CUBLAS = hasattr(_lib, "ggml_init_cublas")
-GGML_CUDA_MAX_DEVICES = 16
-LLAMA_MAX_DEVICES = GGML_CUDA_MAX_DEVICES if GGML_USE_CUBLAS else 1
+LLAMA_MAX_DEVICES = _lib.llama_max_devices()
 # define LLAMA_DEFAULT_SEED 0xFFFFFFFF
 LLAMA_DEFAULT_SEED = 0xFFFFFFFF
@@ -252,8 +255,8 @@ class llama_token_data_array(Structure):
 llama_token_data_array_p = POINTER(llama_token_data_array)
-# typedef void (*llama_progress_callback)(float progress, void *ctx);
-llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)
+# typedef bool (*llama_progress_callback)(float progress, void *ctx);
+llama_progress_callback = ctypes.CFUNCTYPE(c_bool, c_float, c_void_p)
 # // Input data for llama_decode
@@ -317,12 +320,9 @@ class llama_batch(Structure):
 #     LLAMA_KV_OVERRIDE_FLOAT,
 #     LLAMA_KV_OVERRIDE_BOOL,
 # };
-class llama_model_kv_override_type(Structure):
-    _fields_ = [
-        ("LLAMA_KV_OVERRIDE_INT", c_int),
-        ("LLAMA_KV_OVERRIDE_FLOAT", c_int),
-        ("LLAMA_KV_OVERRIDE_BOOL", c_int),
-    ]
+LLAMA_KV_OVERRIDE_INT = 0
+LLAMA_KV_OVERRIDE_FLOAT = 1
+LLAMA_KV_OVERRIDE_BOOL = 2
 # struct llama_model_kv_override {
 #     char key[128];
@@ -333,21 +333,28 @@ class llama_model_kv_override_type(Structure):
 #         bool bool_value;
 #     };
 # };
-class llama_model_kv_override(Structure):
+class llama_model_kv_override_value(CtypesUnion):
     _fields_ = [
-        ("key", ctypes.c_char * 128),
-        ("tag", llama_model_kv_override_type),
-        ("int_value", ctypes.c_int64),
+        ("int_value", c_int64),
         ("float_value", c_double),
         ("bool_value", c_bool),
     ]
+class llama_model_kv_override(Structure):
+    _fields_ = [
+        ("key", ctypes.c_char * 128),
+        ("tag", c_int),
+        ("value", llama_model_kv_override_value),
+    ]
 # struct llama_model_params {
 #     int32_t n_gpu_layers; // number of layers to store in VRAM
 #     int32_t main_gpu;     // the GPU that is used for scratch and small tensors
 #     const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
-#     // called with a progress value between 0 and 1, pass NULL to disable
+#     // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+#     // If the provided progress_callback returns true, model loading continues.
+#     // If it returns false, model loading is immediately aborted.
 #     llama_progress_callback progress_callback;
 #     // context pointer passed to the progress callback
 #     void * progress_callback_user_data;
@@ -367,7 +374,7 @@ class llama_model_params(Structure):
         n_gpu_layers (int): number of layers to store in VRAM
         main_gpu (int): the GPU that is used for scratch and small tensors
         tensor_split (ctypes.Array[ctypes.c_float]): how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
-        progress_callback (llama_progress_callback): called with a progress value between 0 and 1, pass NULL to disable
+        progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
         progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
         kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
         vocab_only (bool): only load the vocabulary, no weights
@@ -733,8 +740,14 @@ def llama_n_ctx(ctx: llama_context_p) -> int:
 _lib.llama_n_ctx.argtypes = [llama_context_p]
-_lib.llama_n_ctx.restype = c_int
+_lib.llama_n_ctx.restype = c_uint32
+# LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
+def llama_n_batch(ctx: llama_context_p) -> int:
+    return _lib.llama_n_batch(ctx)
+_lib.llama_n_batch.argtypes = [llama_context_p]
+_lib.llama_n_batch.restype = c_uint32
 # LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
 def llama_vocab_type(model: llama_model_p) -> int:
@@ -1041,6 +1054,9 @@ class llama_kv_cache_view(Structure):
     ]
+llama_kv_cache_view_p = POINTER(llama_kv_cache_view)
 # // Create an empty KV cache view. (use only for debugging purposes)
 # LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
 def llama_kv_cache_view_init(
@@ -1056,23 +1072,23 @@ _lib.llama_kv_cache_view_init.restype = llama_kv_cache_view
 # // Free a KV cache view. (use only for debugging purposes)
 # LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
-def llama_kv_cache_view_free(view: llama_kv_cache_view):
+def llama_kv_cache_view_free(view: "ctypes.pointer[llama_kv_cache_view]"): # type: ignore
     """Free a KV cache view. (use only for debugging purposes)"""
     return _lib.llama_kv_cache_view_free(view)
-_lib.llama_kv_cache_view_free.argtypes = [llama_kv_cache_view]
+_lib.llama_kv_cache_view_free.argtypes = [llama_kv_cache_view_p]
 _lib.llama_kv_cache_view_free.restype = None
 # // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
 # LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
-def llama_kv_cache_view_update(ctx: llama_context_p, view: llama_kv_cache_view):
+def llama_kv_cache_view_update(ctx: llama_context_p, view: "ctypes.pointer[llama_kv_cache_view]"): # type: ignore
     """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)"""
     return _lib.llama_kv_cache_view_update(ctx, view)
-_lib.llama_kv_cache_view_update.argtypes = [llama_context_p, llama_kv_cache_view]
+_lib.llama_kv_cache_view_update.argtypes = [llama_context_p, llama_kv_cache_view_p]
 _lib.llama_kv_cache_view_update.restype = None

llama-cpp-python 0.2.24__tar.gz → 0.2.26__tar.gz

llama-cpp-python 0.2.24__tar.gz → 0.2.26__tar.gz