PyPI - llama-cpp-python - Versions diffs - 0.2.24__tar.gz → 0.2.25__tar.gz - Mend

llama-cpp-python 0.2.24__tar.gz → 0.2.25__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (570) hide show

llama_cpp_python-0.2.25/.git/FETCH_HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 37556bf9c4f83f51e76682316ff4ea3aed58a279 '37556bf9c4f83f51e76682316ff4ea3aed58a279' of https://github.com/abetlen/llama-cpp-python

llama_cpp_python-0.2.25/.git/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 37556bf9c4f83f51e76682316ff4ea3aed58a279

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.25}/.git/config RENAMED Viewed

@@ -9,7 +9,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX3JrYlR4S25WQUV4a0N2bEZ4QVF0cVR6dlo0VTM5eDNENGxubg==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzXzMzOEY0bUxVSzA1UlkwTkhqNDgzaXlqZW8xRVBxMDFBQ1JITQ==
 [submodule "vendor/llama.cpp"]
 	active = true
 	url = https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.25/.git/index ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/logs/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 37556bf9c4f83f51e76682316ff4ea3aed58a279 runner <runner@fv-az1149-712.p0yfcspwqgdenibuqhnf5ysfnc.dx.internal.cloudapp.net> 1703276562 +0000 checkout: moving from master to refs/tags/v0.2.25

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 7082d24cec35e9ce9147535a2224dfc67ee0a78c

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.25}/.git/modules/vendor/llama.cpp/config RENAMED Viewed

@@ -13,7 +13,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX3JrYlR4S25WQUV4a0N2bEZ4QVF0cVR6dlo0VTM5eDNENGxubg==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzXzMzOEY0bUxVSzA1UlkwTkhqNDgzaXlqZW8xRVBxMDFBQ1JITQ==
 [url "https://github.com/"]
 	insteadOf = git@github.com:
 	insteadOf = org-6826477@github.com:

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/index ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/logs/HEAD ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ 0000000000000000000000000000000000000000 7082d24cec35e9ce9147535a2224dfc67ee0a78c runner <runner@fv-az1149-712.p0yfcspwqgdenibuqhnf5ysfnc.dx.internal.cloudapp.net> 1703276563 +0000 clone: from https://github.com/ggerganov/llama.cpp.git
2	+ 7082d24cec35e9ce9147535a2224dfc67ee0a78c 7082d24cec35e9ce9147535a2224dfc67ee0a78c runner <runner@fv-az1149-712.p0yfcspwqgdenibuqhnf5ysfnc.dx.internal.cloudapp.net> 1703276563 +0000 checkout: moving from master to 7082d24cec35e9ce9147535a2224dfc67ee0a78c

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/logs/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 7082d24cec35e9ce9147535a2224dfc67ee0a78c runner <runner@fv-az1149-712.p0yfcspwqgdenibuqhnf5ysfnc.dx.internal.cloudapp.net> 1703276563 +0000 clone: from https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/logs/refs/remotes/origin/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 7082d24cec35e9ce9147535a2224dfc67ee0a78c runner <runner@fv-az1149-712.p0yfcspwqgdenibuqhnf5ysfnc.dx.internal.cloudapp.net> 1703276563 +0000 clone: from https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/objects/pack/pack-53054ebe4aff53f3127333b245dfe71db566f164.idx ADDED Viewed

Binary file

llama_cpp_python-0.2.24/.git/modules/vendor/llama.cpp/objects/pack/pack-b630c306381137c42fbadc02d9f2c158165b39b1.pack → llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/objects/pack/pack-53054ebe4aff53f3127333b245dfe71db566f164.pack RENAMED Viewed

Binary file

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/objects/pack/pack-53054ebe4aff53f3127333b245dfe71db566f164.rev ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/packed-refs ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ # pack-refs with: peeled fully-peeled sorted
2	+ 7082d24cec35e9ce9147535a2224dfc67ee0a78c refs/remotes/origin/master

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 7082d24cec35e9ce9147535a2224dfc67ee0a78c

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/refs/tags/b1691 ADDED Viewed

	@@ -0,0 +1 @@
1	+ 7082d24cec35e9ce9147535a2224dfc67ee0a78c

llama_cpp_python-0.2.25/.git/modules/vendor/llama.cpp/shallow ADDED Viewed

	@@ -0,0 +1 @@
1	+ 7082d24cec35e9ce9147535a2224dfc67ee0a78c

llama_cpp_python-0.2.25/.git/objects/13/454a3a6bea90892a42064c32f7a1a60deb0806 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/1c/5efea21fad700ef81acb5682eb71efa64c7453 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/2b/14bc6783798c56c71db248c5a834c30fbbce21 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/34/6b4631ebd1f4af85e9988d4a528e00edba6375 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/37/556bf9c4f83f51e76682316ff4ea3aed58a279 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/64/b567b4f3142efeae284deeab2342122d7e62bd ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/78/8732bd3ba7ed71b0e49fb2dfe42d4ed781c0eb ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/7c/1dececc933fdfba00ee95b5bed81f447a21333 ADDED Viewed

Binary file

llama_cpp_python-0.2.24/.git/objects/3d/07614e35e03d55630abf4e92857441fdcaf91f → llama_cpp_python-0.2.25/.git/objects/80/7b0f57a8a873e58ade0ff0f5b0bcf0ff66b7f9 RENAMED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/85/21e7721390edb971bb04098cba2d50446b3d8f ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/8e/32d2c0edce725a47b5845463133919cd766a61 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/97/fe6e180b574c24eb4f07ef229981a3ac478bb7 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/9d/3d3559849603efda6f3c8181684e4d19e0ec79 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/b9/373b7ac641e6e9c8d8cc64951139205d91d8bc ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/c5/4e4eb5ce2636abd78df46a7616cfe9196a1198 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/c8/4fd04498c2fb188ff7c2a59473035fc90eb990 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/cd/351ba33849dcf6af35b493f7405962fa1625d4 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/f0/827d762e852a21f6406c469300899d5f509b8f ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/objects/fa/dfc5fb4fe6f5eb6d5d98b62519e374a5202b00 ADDED Viewed

Binary file

llama_cpp_python-0.2.25/.git/refs/tags/v0.2.25 ADDED Viewed

	@@ -0,0 +1 @@
1	+ 37556bf9c4f83f51e76682316ff4ea3aed58a279

llama_cpp_python-0.2.25/.git/shallow ADDED Viewed

	@@ -0,0 +1 @@
1	+ 37556bf9c4f83f51e76682316ff4ea3aed58a279

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.25}/CHANGELOG.md RENAMED Viewed

@@ -7,6 +7,15 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [0.2.25]
+- feat(server): Multi model support by @D4ve-R in #931
+- feat(server): Support none defaulting to infinity for completions by @swg in #111
+- feat(server): Implement openai api compatible authentication by @docmeth2 in #1010
+- fix: text_offset of multi-token characters by @twaka in #1037
+- fix: ctypes bindings for kv override by @phiharri in #1011
+- fix: ctypes definitions of llama_kv_cache_view_update and llama_kv_cache_view_free. by @e-c-d in #1028
 ## [0.2.24]
 - feat: Update llama.cpp to ggerganov/llama.cpp@0e18b2e7d0b5c0a509ea40098def234b8d4a938a

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.25}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama_cpp_python
-Version: 0.2.24
+Version: 0.2.25
 Summary: Python bindings for the llama.cpp library
 Author-Email: Andrei Betlen <abetlen@gmail.com>
 License: MIT
@@ -60,11 +60,13 @@ This package provides:
     - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
     - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
     - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
+    - [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
 Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest).
 ## Installation
 `llama-cpp-python` can be installed directly from PyPI as a source distribution by running:
@@ -374,6 +376,7 @@ For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_
 - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
 - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
 - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
+- [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
 ## Docker image

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.25}/README.md RENAMED Viewed

@@ -18,11 +18,13 @@ This package provides:
     - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
     - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
     - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
+    - [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
 Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest).
 ## Installation
 `llama-cpp-python` can be installed directly from PyPI as a source distribution by running:
@@ -332,6 +334,7 @@ For possible options, see [llama_cpp/llama_chat_format.py](llama_cpp/llama_chat_
 - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
 - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
 - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
+- [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
 ## Docker image

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.25}/docs/server.md RENAMED Viewed

@@ -32,6 +32,12 @@ python3 -m llama_cpp.server --help
 NOTE: All server options are also available as environment variables. For example, `--model` can be set by setting the `MODEL` environment variable.
+Check out the server options reference below for more information on the available settings.
+CLI arguments and environment variables are available for all of the fields defined in [`ServerSettings`](#llama_cpp.server.settings.ServerSettings) and [`ModelSettings`](#llama_cpp.server.settings.ModelSettings)
+Additionally, the server supports configuration via a config file; check out the [configuration section](#configuration-and-multi-model-support) for more information and examples.
 ## Guides
 ### Code Completion
@@ -121,4 +127,92 @@ response = client.chat.completions.create(
     ],
 )
 print(response)
-```
+```
+## Configuration and Multi-Model Support
+The server supports configuration via a JSON config file that can be passed using the `--config_file` parameter or the `CONFIG_FILE` environment variable.
+```bash
+python3 -m llama_cpp.server --config_file <config_file>
+```
+Config files support all of the server and model options supported by the CLI and environment variables; however, instead of only a single model, the config file can specify multiple models.
+The server supports routing requests to multiple models based on the `model` parameter in the request which matches against the `model_alias` in the config file.
+At the moment only a single model is loaded into memory at a time; the server will automatically load and unload models as needed.
+```json
+{
+    "host": "0.0.0.0",
+    "port": 8080,
+    "models": [
+        {
+            "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
+            "model_alias": "gpt-3.5-turbo",
+            "chat_format": "chatml",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 512,
+            "n_ctx": 2048
+        },
+        {
+            "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
+            "model_alias": "gpt-4",
+            "chat_format": "chatml",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 512,
+            "n_ctx": 2048
+        },
+        {
+            "model": "models/ggml_llava-v1.5-7b/ggml-model-q4_k.gguf",
+            "model_alias": "gpt-4-vision-preview",
+            "chat_format": "llava-1-5",
+            "clip_model_path": "models/ggml_llava-v1.5-7b/mmproj-model-f16.gguf",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 512,
+            "n_ctx": 2048
+        },
+        {
+            "model": "models/mistral-7b-v0.1-GGUF/ggml-model-Q4_K.gguf",
+            "model_alias": "text-davinci-003",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 512,
+            "n_ctx": 2048
+        },
+        {
+            "model": "models/replit-code-v1_5-3b-GGUF/replit-code-v1_5-3b.Q4_0.gguf",
+            "model_alias": "copilot-codex",
+            "n_gpu_layers": -1,
+            "offload_kqv": true,
+            "n_threads": 12,
+            "n_batch": 1024,
+            "n_ctx": 9216
+        }
+    ]
+}
+```
+The config file format is defined by the [`ConfigFileSettings`](#llama_cpp.server.settings.ConfigFileSettings) class.
+## Server Options Reference
+::: llama_cpp.server.settings.ConfigFileSettings
+    options:
+        show_if_no_docstring: true
+::: llama_cpp.server.settings.ServerSettings
+    options:
+        show_if_no_docstring: true
+::: llama_cpp.server.settings.ModelSettings
+    options:
+        show_if_no_docstring: true

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.25}/llama_cpp/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
-__version__ = "0.2.24"
+__version__ = "0.2.25"

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.25}/llama_cpp/llama.py RENAMED Viewed

@@ -1551,11 +1551,13 @@ class Llama:
                             "utf-8", errors="ignore"
                         )
                         text_offset = len(prompt) + len(
-                            self.detokenize(completion_tokens[:returned_tokens])
+                            self.detokenize(completion_tokens[:returned_tokens]).decode(
+                                "utf-8", errors="ignore"
+                            )
                         )
                         token_offset = len(prompt_tokens) + returned_tokens
                         logits = self._scores[token_offset - 1, :]
-                        current_logprobs = Llama.logits_to_logprobs(logits)
+                        current_logprobs = Llama.logits_to_logprobs(logits).tolist()
                         sorted_logprobs = list(
                             sorted(
                                 zip(current_logprobs, range(len(current_logprobs))),
@@ -1674,7 +1676,7 @@ class Llama:
                     )
                     token_offset = len(prompt_tokens) + returned_tokens - 1
                     logits = self._scores[token_offset, :]
-                    current_logprobs = Llama.logits_to_logprobs(logits)
+                    current_logprobs = Llama.logits_to_logprobs(logits).tolist()
                     sorted_logprobs = list(
                         sorted(
                             zip(current_logprobs, range(len(current_logprobs))),
@@ -1789,13 +1791,19 @@ class Llama:
             ]
             all_logprobs = Llama.logits_to_logprobs(self._scores)[token_offset:]
             # TODO: may be able to change this loop to use np.take_along_dim
-            for token, token_str, logprobs_token in zip(
-                all_tokens, all_token_strs, all_logprobs
+            for idx, (token, token_str, logprobs_token) in enumerate(
+                zip(all_tokens, all_token_strs, all_logprobs)
             ):
                 if token == self.token_bos():
                     continue
-                text_offsets.append(text_offset)
-                text_offset += len(token_str)
+                text_offsets.append(
+                    text_offset
+                    + len(
+                        self.detokenize(all_tokens[:idx]).decode(
+                            "utf-8", errors="ignore"
+                        )
+                    )
+                )
                 tokens.append(token_str)
                 sorted_logprobs = list(
                     sorted(
@@ -1909,7 +1917,7 @@ class Llama:
         completion_or_chunks = self._create_completion(
             prompt=prompt,
             suffix=suffix,
-            max_tokens=max_tokens,
+            max_tokens=-1 if max_tokens is None else max_tokens,
             temperature=temperature,
             top_p=top_p,
             min_p=min_p,
@@ -1943,7 +1951,7 @@ class Llama:
         self,
         prompt: str,
         suffix: Optional[str] = None,
-        max_tokens: int = 128,
+        max_tokens: Optional[int] = 16,
         temperature: float = 0.8,
         top_p: float = 0.95,
         min_p: float = 0.05,

{llama_cpp_python-0.2.24 → llama_cpp_python-0.2.25}/llama_cpp/llama_cpp.py RENAMED Viewed

@@ -9,6 +9,7 @@ from ctypes import (
     c_int32,
     c_uint8,
     c_uint32,
+    c_int64,
     c_size_t,
     c_float,
     c_double,
@@ -16,6 +17,7 @@ from ctypes import (
     POINTER,
     _Pointer,  # type: ignore
     Structure,
+    Union as CtypesUnion,
     Array,
 )
 import pathlib
@@ -252,8 +254,8 @@ class llama_token_data_array(Structure):
 llama_token_data_array_p = POINTER(llama_token_data_array)
-# typedef void (*llama_progress_callback)(float progress, void *ctx);
-llama_progress_callback = ctypes.CFUNCTYPE(None, c_float, c_void_p)
+# typedef bool (*llama_progress_callback)(float progress, void *ctx);
+llama_progress_callback = ctypes.CFUNCTYPE(c_bool, c_float, c_void_p)
 # // Input data for llama_decode
@@ -317,12 +319,9 @@ class llama_batch(Structure):
 #     LLAMA_KV_OVERRIDE_FLOAT,
 #     LLAMA_KV_OVERRIDE_BOOL,
 # };
-class llama_model_kv_override_type(Structure):
-    _fields_ = [
-        ("LLAMA_KV_OVERRIDE_INT", c_int),
-        ("LLAMA_KV_OVERRIDE_FLOAT", c_int),
-        ("LLAMA_KV_OVERRIDE_BOOL", c_int),
-    ]
+LLAMA_KV_OVERRIDE_INT = 0
+LLAMA_KV_OVERRIDE_FLOAT = 1
+LLAMA_KV_OVERRIDE_BOOL = 2
 # struct llama_model_kv_override {
 #     char key[128];
@@ -333,21 +332,28 @@ class llama_model_kv_override_type(Structure):
 #         bool bool_value;
 #     };
 # };
-class llama_model_kv_override(Structure):
+class llama_model_kv_override_value(CtypesUnion):
     _fields_ = [
-        ("key", ctypes.c_char * 128),
-        ("tag", llama_model_kv_override_type),
-        ("int_value", ctypes.c_int64),
+        ("int_value", c_int64),
         ("float_value", c_double),
         ("bool_value", c_bool),
     ]
+class llama_model_kv_override(Structure):
+    _fields_ = [
+        ("key", ctypes.c_char * 128),
+        ("tag", c_int),
+        ("value", llama_model_kv_override_value),
+    ]
 # struct llama_model_params {
 #     int32_t n_gpu_layers; // number of layers to store in VRAM
 #     int32_t main_gpu;     // the GPU that is used for scratch and small tensors
 #     const float * tensor_split; // how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
-#     // called with a progress value between 0 and 1, pass NULL to disable
+#     // Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
+#     // If the provided progress_callback returns true, model loading continues.
+#     // If it returns false, model loading is immediately aborted.
 #     llama_progress_callback progress_callback;
 #     // context pointer passed to the progress callback
 #     void * progress_callback_user_data;
@@ -367,7 +373,7 @@ class llama_model_params(Structure):
         n_gpu_layers (int): number of layers to store in VRAM
         main_gpu (int): the GPU that is used for scratch and small tensors
         tensor_split (ctypes.Array[ctypes.c_float]): how to split layers across multiple GPUs (size: LLAMA_MAX_DEVICES)
-        progress_callback (llama_progress_callback): called with a progress value between 0 and 1, pass NULL to disable
+        progress_callback (llama_progress_callback): called with a progress value between 0.0 and 1.0. Pass NULL to disable. If the provided progress_callback returns true, model loading continues. If it returns false, model loading is immediately aborted.
         progress_callback_user_data (ctypes.c_void_p): context pointer passed to the progress callback
         kv_overrides (ctypes.Array[llama_model_kv_override]): override key-value pairs of the model meta data
         vocab_only (bool): only load the vocabulary, no weights
@@ -733,8 +739,14 @@ def llama_n_ctx(ctx: llama_context_p) -> int:
 _lib.llama_n_ctx.argtypes = [llama_context_p]
-_lib.llama_n_ctx.restype = c_int
+_lib.llama_n_ctx.restype = c_uint32
+# LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
+def llama_n_batch(ctx: llama_context_p) -> int:
+    return _lib.llama_n_batch(ctx)
+_lib.llama_n_batch.argtypes = [llama_context_p]
+_lib.llama_n_batch.restype = c_uint32
 # LLAMA_API enum llama_vocab_type llama_vocab_type(const struct llama_model * model);
 def llama_vocab_type(model: llama_model_p) -> int:
@@ -1041,6 +1053,9 @@ class llama_kv_cache_view(Structure):
     ]
+llama_kv_cache_view_p = POINTER(llama_kv_cache_view)
 # // Create an empty KV cache view. (use only for debugging purposes)
 # LLAMA_API struct llama_kv_cache_view llama_kv_cache_view_init(const struct llama_context * ctx, int32_t n_max_seq);
 def llama_kv_cache_view_init(
@@ -1056,23 +1071,23 @@ _lib.llama_kv_cache_view_init.restype = llama_kv_cache_view
 # // Free a KV cache view. (use only for debugging purposes)
 # LLAMA_API void llama_kv_cache_view_free(struct llama_kv_cache_view * view);
-def llama_kv_cache_view_free(view: llama_kv_cache_view):
+def llama_kv_cache_view_free(view: "ctypes.pointer[llama_kv_cache_view]"): # type: ignore
     """Free a KV cache view. (use only for debugging purposes)"""
     return _lib.llama_kv_cache_view_free(view)
-_lib.llama_kv_cache_view_free.argtypes = [llama_kv_cache_view]
+_lib.llama_kv_cache_view_free.argtypes = [llama_kv_cache_view_p]
 _lib.llama_kv_cache_view_free.restype = None
 # // Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)
 # LLAMA_API void llama_kv_cache_view_update(const struct llama_context * ctx, struct llama_kv_cache_view * view);
-def llama_kv_cache_view_update(ctx: llama_context_p, view: llama_kv_cache_view):
+def llama_kv_cache_view_update(ctx: llama_context_p, view: "ctypes.pointer[llama_kv_cache_view]"): # type: ignore
     """Update the KV cache view structure with the current state of the KV cache. (use only for debugging purposes)"""
     return _lib.llama_kv_cache_view_update(ctx, view)
-_lib.llama_kv_cache_view_update.argtypes = [llama_context_p, llama_kv_cache_view]
+_lib.llama_kv_cache_view_update.argtypes = [llama_context_p, llama_kv_cache_view_p]
 _lib.llama_kv_cache_view_update.restype = None

llama_cpp_python-0.2.25/llama_cpp/server/__main__.py ADDED Viewed

@@ -0,0 +1,88 @@
+"""Example FastAPI server for llama.cpp.
+To run this example:
+```bash
+pip install fastapi uvicorn sse-starlette pydantic-settings
+export MODEL=../models/7B/...
+```
+Then run:
+```
+uvicorn llama_cpp.server.app:create_app --reload
+```
+or
+```
+python3 -m llama_cpp.server
+```
+Then visit http://localhost:8000/docs to see the interactive API docs.
+"""
+from __future__ import annotations
+import os
+import sys
+import argparse
+import uvicorn
+from llama_cpp.server.app import create_app
+from llama_cpp.server.settings import (
+    Settings,
+    ServerSettings,
+    ModelSettings,
+    ConfigFileSettings,
+)
+from llama_cpp.server.cli import add_args_from_model, parse_model_from_args
+def main():
+    description = "🦙 Llama.cpp python server. Host your own LLMs!🚀"
+    parser = argparse.ArgumentParser(description=description)
+    add_args_from_model(parser, Settings)
+    parser.add_argument(
+        "--config_file",
+        type=str,
+        help="Path to a config file to load.",
+    )
+    server_settings: ServerSettings | None = None
+    model_settings: list[ModelSettings] = []
+    args = parser.parse_args()
+    try:
+        # Load server settings from config_file if provided
+        config_file = os.environ.get("CONFIG_FILE", args.config_file)
+        if config_file:
+            if not os.path.exists(config_file):
+                raise ValueError(f"Config file {config_file} not found!")
+            with open(config_file, "rb") as f:
+                config_file_settings = ConfigFileSettings.model_validate_json(f.read())
+                server_settings = ServerSettings.model_validate(config_file_settings)
+                model_settings = config_file_settings.models
+        else:
+            server_settings = parse_model_from_args(ServerSettings, args)
+            model_settings = [parse_model_from_args(ModelSettings, args)]
+    except Exception as e:
+        print(e, file=sys.stderr)
+        parser.print_help()
+        sys.exit(1)
+    assert server_settings is not None
+    assert model_settings is not None
+    app = create_app(
+        server_settings=server_settings,
+        model_settings=model_settings,
+    )
+    uvicorn.run(
+        app,
+        host=os.getenv("HOST", server_settings.host),
+        port=int(os.getenv("PORT", server_settings.port)),
+        ssl_keyfile=server_settings.ssl_keyfile,
+        ssl_certfile=server_settings.ssl_certfile,
+    )
+if __name__ == "__main__":
+    main()

llama-cpp-python 0.2.24__tar.gz → 0.2.25__tar.gz

llama-cpp-python 0.2.24__tar.gz → 0.2.25__tar.gz