PyPI - llama-cpp-python - Versions diffs - 0.2.36__tar.gz → 0.2.38__tar.gz - Mend

llama-cpp-python 0.2.36__tar.gz → 0.2.38__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (703) hide show

llama_cpp_python-0.2.38/.git/FETCH_HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 3322eadbf30a68731f6aafe0b4d055255b46d8f7 '3322eadbf30a68731f6aafe0b4d055255b46d8f7' of https://github.com/abetlen/llama-cpp-python

llama_cpp_python-0.2.38/.git/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 3322eadbf30a68731f6aafe0b4d055255b46d8f7

{llama_cpp_python-0.2.36 → llama_cpp_python-0.2.38}/.git/config RENAMED Viewed

@@ -9,7 +9,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX1Y2cUFITVRmZ0hoM1VTanhaR2VKRTRhUGxXUHJjRzQwRWhTSg==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX2ptaUFjRVlmS1RIZmJGRE9hdnltMDZJQ0p2MGVoTjFxOGFWNQ==
 [submodule "vendor/llama.cpp"]
 	active = true
 	url = https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.38/.git/index ADDED Viewed

Binary file

llama_cpp_python-0.2.38/.git/logs/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 3322eadbf30a68731f6aafe0b4d055255b46d8f7 runner <runner@fv-az1016-588.pv3vitign2bulj5h5vrau5ekvd.cx.internal.cloudapp.net> 1706732571 +0000 checkout: moving from master to refs/tags/v0.2.38

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915

{llama_cpp_python-0.2.36 → llama_cpp_python-0.2.38}/.git/modules/vendor/llama.cpp/config RENAMED Viewed

@@ -13,7 +13,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX1Y2cUFITVRmZ0hoM1VTanhaR2VKRTRhUGxXUHJjRzQwRWhTSg==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX2ptaUFjRVlmS1RIZmJGRE9hdnltMDZJQ0p2MGVoTjFxOGFWNQ==
 [url "https://github.com/"]
 	insteadOf = git@github.com:
 	insteadOf = org-6826477@github.com:

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/index ADDED Viewed

Binary file

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/logs/HEAD ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ 0000000000000000000000000000000000000000 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915 runner <runner@fv-az1016-588.pv3vitign2bulj5h5vrau5ekvd.cx.internal.cloudapp.net> 1706732572 +0000 clone: from https://github.com/ggerganov/llama.cpp.git
2	+ 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915 runner <runner@fv-az1016-588.pv3vitign2bulj5h5vrau5ekvd.cx.internal.cloudapp.net> 1706732572 +0000 checkout: moving from master to 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/logs/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915 runner <runner@fv-az1016-588.pv3vitign2bulj5h5vrau5ekvd.cx.internal.cloudapp.net> 1706732572 +0000 clone: from https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/logs/refs/remotes/origin/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915 runner <runner@fv-az1016-588.pv3vitign2bulj5h5vrau5ekvd.cx.internal.cloudapp.net> 1706732572 +0000 clone: from https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/objects/pack/pack-840f4459d494ce7fd10b79596f309b54b31652b8.idx ADDED Viewed

Binary file

llama_cpp_python-0.2.36/.git/modules/vendor/llama.cpp/objects/pack/pack-45c5d7da4d130e32bb1f98f3b58ea9cd2784fad3.pack → llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/objects/pack/pack-840f4459d494ce7fd10b79596f309b54b31652b8.pack RENAMED Viewed

Binary file

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/objects/pack/pack-840f4459d494ce7fd10b79596f309b54b31652b8.rev ADDED Viewed

Binary file

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/packed-refs ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ # pack-refs with: peeled fully-peeled sorted
2	+ 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915 refs/remotes/origin/master

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915

llama_cpp_python-0.2.38/.git/modules/vendor/llama.cpp/shallow ADDED Viewed

	@@ -0,0 +1 @@
1	+ 1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915

llama_cpp_python-0.2.38/.git/objects/pack/pack-a38d8e19feb0b2901a657ea0b79846878599b3fc.idx ADDED Viewed

Binary file

llama_cpp_python-0.2.38/.git/objects/pack/pack-a38d8e19feb0b2901a657ea0b79846878599b3fc.pack ADDED Viewed

Binary file

llama_cpp_python-0.2.38/.git/objects/pack/pack-a38d8e19feb0b2901a657ea0b79846878599b3fc.rev ADDED Viewed

Binary file

llama_cpp_python-0.2.38/.git/refs/tags/v0.2.38 ADDED Viewed

	@@ -0,0 +1 @@
1	+ 3322eadbf30a68731f6aafe0b4d055255b46d8f7

llama_cpp_python-0.2.38/.git/shallow ADDED Viewed

	@@ -0,0 +1 @@
1	+ 3322eadbf30a68731f6aafe0b4d055255b46d8f7

{llama_cpp_python-0.2.36 → llama_cpp_python-0.2.38}/CHANGELOG.md RENAMED Viewed

@@ -7,6 +7,17 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [0.2.38]
+- feat: Update llama.cpp to ggerganov/llama.cpp@1cfb5372cf5707c8ec6dde7c874f4a44a6c4c915
+- feat: Add speculative decoding by @abetlen in #1120
+- fix: Pass raise_exception and add_generation_prompt to jinja2 chat template 078cca0361bf5a94d2cf52ed04980d20e32d6f95
+## [0.2.37]
+- feat: Update llama.cpp to ggerganov/llama.cpp@fea4fd4ba7f6b754ac795387b275e1a014a77bde
+- feat: Automatically set chat format from gguf by @abetlen in #1110
 ## [0.2.36]
 - feat: Update llama.cpp to ggerganov/llama.cpp@2aed77eb06a329f0d82bb1c467f4244904d4073f

{llama_cpp_python-0.2.36 → llama_cpp_python-0.2.38}/Makefile RENAMED Viewed

@@ -30,6 +30,12 @@ build.metal:
 build.vulkan:
 	CMAKE_ARGS="-DLLAMA_VULKAN=on" python3 -m pip install --verbose -e .
+build.kompute:
+	CMAKE_ARGS="-DLLAMA_KOMPUTE=on" python3 -m pip install --verbose -e .
+build.sycl:
+	CMAKE_ARGS="-DLLAMA_SYCL=on" python3 -m pip install --verbose -e .
 build.sdist:
 	python3 -m build --sdist

{llama_cpp_python-0.2.36 → llama_cpp_python-0.2.38}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama_cpp_python
-Version: 0.2.36
+Version: 0.2.38
 Summary: Python bindings for the llama.cpp library
 Author-Email: Andrei Betlen <abetlen@gmail.com>
 License: MIT
@@ -55,20 +55,17 @@ This package provides:
 - Low-level access to C API via `ctypes` interface.
 - High-level Python API for text completion
-    - OpenAI-like API
-    - [LangChain compatibility](https://python.langchain.com/docs/integrations/llms/llamacpp)
-    - [LlamaIndex compatibility](https://docs.llamaindex.ai/en/stable/examples/llm/llama_2_llama_cpp.html)
+  - OpenAI-like API
+  - [LangChain compatibility](https://python.langchain.com/docs/integrations/llms/llamacpp)
+  - [LlamaIndex compatibility](https://docs.llamaindex.ai/en/stable/examples/llm/llama_2_llama_cpp.html)
 - OpenAI compatible web server
-    - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
-    - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
-    - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
-    - [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
+  - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
+  - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
+  - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
+  - [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
 Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest).
 ## Installation
 `llama-cpp-python` can be installed directly from PyPI as a source distribution by running:
@@ -81,7 +78,6 @@ This will build `llama.cpp` from source using cmake and your system's c compiler
 If you run into issues during installation add the `--verbose` flag to the `pip install` command to see the full cmake build log.
 ### Installation with Specific Hardware Acceleration (BLAS, CUDA, Metal, etc)
 The default pip install behaviour is to build `llama.cpp` for CPU only on Linux and Windows and use Metal on MacOS.
@@ -114,7 +110,7 @@ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-
 #### cuBLAS
-To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing:
+To install with cuBLAS, set the `LLAMA_CUBLAS=on` environment variable before installing:
 ```bash
 CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
@@ -130,7 +126,7 @@ CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python
 #### CLBlast
-To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing:
+To install with CLBlast, set the `LLAMA_CLBLAST=on` environment variable before installing:
 ```bash
 CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python
@@ -144,13 +140,37 @@ To install with hipBLAS / ROCm support for AMD cards, set the `LLAMA_HIPBLAS=on`
 CMAKE_ARGS="-DLLAMA_HIPBLAS=on" pip install llama-cpp-python
 ```
+#### Vulkan
+To install with Vulkan support, set the `LLAMA_VULKAN=on` environment variable before installing:
+```bash
+CMAKE_ARGS="-DLLAMA_VULKAN=on" pip install llama-cpp-python
+```
+#### Kompute
+To install with Kompute support, set the `LLAMA_KOMPUTE=on` environment variable before installing:
+```bash
+CMAKE_ARGS="-DLLAMA_KOMPUTE=on" pip install llama-cpp-python
+```
+#### SYCL
+To install with SYCL support, set the `LLAMA_SYCL=on` environment variable before installing:
+```bash
+CMAKE_ARGS="-DLLAMA_SYCL=on" pip install llama-cpp-python
+```
 ### Windows Notes
 If you run into issues where it complains it can't find `'nmake'` `'?'` or CMAKE_C_COMPILER, you can extract w64devkit as [mentioned in llama.cpp repo](https://github.com/ggerganov/llama.cpp#openblas) and add those manually to CMAKE_ARGS before running `pip` install:
 ```ps
 $env:CMAKE_GENERATOR = "MinGW Makefiles"
-$env:CMAKE_ARGS = "-DLLAMA_OPENBLAS=on -DCMAKE_C_COMPILER=C:/w64devkit/bin/gcc.exe -DCMAKE_CXX_COMPILER=C:/w64devkit/bin/g++.exe"
+$env:CMAKE_ARGS = "-DLLAMA_OPENBLAS=on -DCMAKE_C_COMPILER=C:/w64devkit/bin/gcc.exe -DCMAKE_CXX_COMPILER=C:/w64devkit/bin/g++.exe"
 ```
 See the above instructions and set `CMAKE_ARGS` to the BLAS backend you want to use.
@@ -200,7 +220,7 @@ Below is a short example demonstrating how to use the high-level API to for basi
 >>> from llama_cpp import Llama
 >>> llm = Llama(
       model_path="./models/7B/llama-model.gguf",
-      # n_gpu_layers=-1, # Uncomment to use GPU acceleration
+      # n_gpu_layers=-1, # Uncomment to use GPU acceleration
       # seed=1337, # Uncomment to set a specific seed
       # n_ctx=2048, # Uncomment to increase the context window
 )
@@ -319,7 +339,6 @@ The high-level API also provides a simple interface for function calling.
 Note that the only model that supports full function calling at this time is "functionary".
 The gguf-converted files for this model can be found here: [functionary-7b-v1](https://huggingface.co/abetlen/functionary-7b-v1-GGUF)
 ```python
 >>> from llama_cpp import Llama
 >>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", chat_format="functionary")
@@ -328,7 +347,7 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h
         {
           "role": "system",
           "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"
         },
         {
           "role": "user",
@@ -367,7 +386,6 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h
 ### Multi-modal Models
 `llama-cpp-python` supports the llava1.5 family of multi-modal models which allow the language model to
 read information from both text and images.
@@ -403,6 +421,24 @@ Then you'll need to use a custom chat handler to load the clip model and process
 )
 ```
+### Speculative Decoding
+`llama-cpp-python` supports speculative decoding which allows the model to generate completions based on a draft model.
+The fastest way to use speculative decoding is through the `LlamaPromptLookupDecoding` class.
+Just pass this as a draft model to the `Llama` class during initialization.
+```python
+from llama_cpp import Llama
+from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
+llama = Llama(
+    model_path="path/to/model.gguf",
+    draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) # num_pred_tokens is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines.
+)
+```
 ### Adjusting the Context Window
 The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements.
@@ -413,7 +449,6 @@ For instance, if you want to work with larger contexts, you can expand the conte
 llm = Llama(model_path="./models/7B/llama-model.gguf", n_ctx=2048)
 ```
 ## OpenAI Compatible Web Server
 `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API.
@@ -461,7 +496,8 @@ A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python).
 ```bash
 docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/llama-model.gguf ghcr.io/abetlen/llama-cpp-python:latest
 ```
-[Docker on termux (requires root)](https://gist.github.com/FreddieOliveira/efe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones, see [termux support issue](https://github.com/abetlen/llama-cpp-python/issues/389)
+[Docker on termux (requires root)](https://gist.github.com/FreddieOliveira/efe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones, see [termux support issue](https://github.com/abetlen/llama-cpp-python/issues/389)
 ## Low-level API
@@ -489,7 +525,6 @@ Below is a short example demonstrating how to use the low-level API to tokenize
 Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API.
 ## Documentation
 Documentation is available via [https://llama-cpp-python.readthedocs.io/](https://llama-cpp-python.readthedocs.io/).

{llama_cpp_python-0.2.36 → llama_cpp_python-0.2.38}/README.md RENAMED Viewed

@@ -12,20 +12,17 @@ This package provides:
 - Low-level access to C API via `ctypes` interface.
 - High-level Python API for text completion
-    - OpenAI-like API
-    - [LangChain compatibility](https://python.langchain.com/docs/integrations/llms/llamacpp)
-    - [LlamaIndex compatibility](https://docs.llamaindex.ai/en/stable/examples/llm/llama_2_llama_cpp.html)
+  - OpenAI-like API
+  - [LangChain compatibility](https://python.langchain.com/docs/integrations/llms/llamacpp)
+  - [LlamaIndex compatibility](https://docs.llamaindex.ai/en/stable/examples/llm/llama_2_llama_cpp.html)
 - OpenAI compatible web server
-    - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
-    - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
-    - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
-    - [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
+  - [Local Copilot replacement](https://llama-cpp-python.readthedocs.io/en/latest/server/#code-completion)
+  - [Function Calling support](https://llama-cpp-python.readthedocs.io/en/latest/server/#function-calling)
+  - [Vision API support](https://llama-cpp-python.readthedocs.io/en/latest/server/#multimodal-models)
+  - [Multiple Models](https://llama-cpp-python.readthedocs.io/en/latest/server/#configuration-and-multi-model-support)
 Documentation is available at [https://llama-cpp-python.readthedocs.io/en/latest](https://llama-cpp-python.readthedocs.io/en/latest).
 ## Installation
 `llama-cpp-python` can be installed directly from PyPI as a source distribution by running:
@@ -38,7 +35,6 @@ This will build `llama.cpp` from source using cmake and your system's c compiler
 If you run into issues during installation add the `--verbose` flag to the `pip install` command to see the full cmake build log.
 ### Installation with Specific Hardware Acceleration (BLAS, CUDA, Metal, etc)
 The default pip install behaviour is to build `llama.cpp` for CPU only on Linux and Windows and use Metal on MacOS.
@@ -71,7 +67,7 @@ CMAKE_ARGS="-DLLAMA_BLAS=ON -DLLAMA_BLAS_VENDOR=OpenBLAS" pip install llama-cpp-
 #### cuBLAS
-To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing:
+To install with cuBLAS, set the `LLAMA_CUBLAS=on` environment variable before installing:
 ```bash
 CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python
@@ -87,7 +83,7 @@ CMAKE_ARGS="-DLLAMA_METAL=on" pip install llama-cpp-python
 #### CLBlast
-To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing:
+To install with CLBlast, set the `LLAMA_CLBLAST=on` environment variable before installing:
 ```bash
 CMAKE_ARGS="-DLLAMA_CLBLAST=on" pip install llama-cpp-python
@@ -101,13 +97,37 @@ To install with hipBLAS / ROCm support for AMD cards, set the `LLAMA_HIPBLAS=on`
 CMAKE_ARGS="-DLLAMA_HIPBLAS=on" pip install llama-cpp-python
 ```
+#### Vulkan
+To install with Vulkan support, set the `LLAMA_VULKAN=on` environment variable before installing:
+```bash
+CMAKE_ARGS="-DLLAMA_VULKAN=on" pip install llama-cpp-python
+```
+#### Kompute
+To install with Kompute support, set the `LLAMA_KOMPUTE=on` environment variable before installing:
+```bash
+CMAKE_ARGS="-DLLAMA_KOMPUTE=on" pip install llama-cpp-python
+```
+#### SYCL
+To install with SYCL support, set the `LLAMA_SYCL=on` environment variable before installing:
+```bash
+CMAKE_ARGS="-DLLAMA_SYCL=on" pip install llama-cpp-python
+```
 ### Windows Notes
 If you run into issues where it complains it can't find `'nmake'` `'?'` or CMAKE_C_COMPILER, you can extract w64devkit as [mentioned in llama.cpp repo](https://github.com/ggerganov/llama.cpp#openblas) and add those manually to CMAKE_ARGS before running `pip` install:
 ```ps
 $env:CMAKE_GENERATOR = "MinGW Makefiles"
-$env:CMAKE_ARGS = "-DLLAMA_OPENBLAS=on -DCMAKE_C_COMPILER=C:/w64devkit/bin/gcc.exe -DCMAKE_CXX_COMPILER=C:/w64devkit/bin/g++.exe"
+$env:CMAKE_ARGS = "-DLLAMA_OPENBLAS=on -DCMAKE_C_COMPILER=C:/w64devkit/bin/gcc.exe -DCMAKE_CXX_COMPILER=C:/w64devkit/bin/g++.exe"
 ```
 See the above instructions and set `CMAKE_ARGS` to the BLAS backend you want to use.
@@ -157,7 +177,7 @@ Below is a short example demonstrating how to use the high-level API to for basi
 >>> from llama_cpp import Llama
 >>> llm = Llama(
       model_path="./models/7B/llama-model.gguf",
-      # n_gpu_layers=-1, # Uncomment to use GPU acceleration
+      # n_gpu_layers=-1, # Uncomment to use GPU acceleration
       # seed=1337, # Uncomment to set a specific seed
       # n_ctx=2048, # Uncomment to increase the context window
 )
@@ -276,7 +296,6 @@ The high-level API also provides a simple interface for function calling.
 Note that the only model that supports full function calling at this time is "functionary".
 The gguf-converted files for this model can be found here: [functionary-7b-v1](https://huggingface.co/abetlen/functionary-7b-v1-GGUF)
 ```python
 >>> from llama_cpp import Llama
 >>> llm = Llama(model_path="path/to/functionary/llama-model.gguf", chat_format="functionary")
@@ -285,7 +304,7 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h
         {
           "role": "system",
           "content": "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. The assistant calls functions with appropriate input when necessary"
         },
         {
           "role": "user",
@@ -324,7 +343,6 @@ The gguf-converted files for this model can be found here: [functionary-7b-v1](h
 ### Multi-modal Models
 `llama-cpp-python` supports the llava1.5 family of multi-modal models which allow the language model to
 read information from both text and images.
@@ -360,6 +378,24 @@ Then you'll need to use a custom chat handler to load the clip model and process
 )
 ```
+### Speculative Decoding
+`llama-cpp-python` supports speculative decoding which allows the model to generate completions based on a draft model.
+The fastest way to use speculative decoding is through the `LlamaPromptLookupDecoding` class.
+Just pass this as a draft model to the `Llama` class during initialization.
+```python
+from llama_cpp import Llama
+from llama_cpp.llama_speculative import LlamaPromptLookupDecoding
+llama = Llama(
+    model_path="path/to/model.gguf",
+    draft_model=LlamaPromptLookupDecoding(num_pred_tokens=10) # num_pred_tokens is the number of tokens to predict 10 is the default and generally good for gpu, 2 performs better for cpu-only machines.
+)
+```
 ### Adjusting the Context Window
 The context window of the Llama models determines the maximum number of tokens that can be processed at once. By default, this is set to 512 tokens, but can be adjusted based on your requirements.
@@ -370,7 +406,6 @@ For instance, if you want to work with larger contexts, you can expand the conte
 llm = Llama(model_path="./models/7B/llama-model.gguf", n_ctx=2048)
 ```
 ## OpenAI Compatible Web Server
 `llama-cpp-python` offers a web server which aims to act as a drop-in replacement for the OpenAI API.
@@ -418,7 +453,8 @@ A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python).
 ```bash
 docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/llama-model.gguf ghcr.io/abetlen/llama-cpp-python:latest
 ```
-[Docker on termux (requires root)](https://gist.github.com/FreddieOliveira/efe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones, see [termux support issue](https://github.com/abetlen/llama-cpp-python/issues/389)
+[Docker on termux (requires root)](https://gist.github.com/FreddieOliveira/efe850df7ff3951cb62d74bd770dce27) is currently the only known way to run this on phones, see [termux support issue](https://github.com/abetlen/llama-cpp-python/issues/389)
 ## Low-level API
@@ -446,7 +482,6 @@ Below is a short example demonstrating how to use the low-level API to tokenize
 Check out the [examples folder](examples/low_level_api) for more examples of using the low-level API.
 ## Documentation
 Documentation is available via [https://llama-cpp-python.readthedocs.io/](https://llama-cpp-python.readthedocs.io/).

{llama_cpp_python-0.2.36 → llama_cpp_python-0.2.38}/examples/high_level_api/fastapi_server.py RENAMED Viewed

@@ -9,7 +9,7 @@ export MODEL=../models/7B/...
 Then run:
 ```
-uvicorn llama_cpp.server.app:app --reload
+uvicorn --factory llama_cpp.server.app:create_app --reload
 ```
 or

{llama_cpp_python-0.2.36 → llama_cpp_python-0.2.38}/llama_cpp/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
-__version__ = "0.2.36"
+__version__ = "0.2.38"

{llama_cpp_python-0.2.36 → llama_cpp_python-0.2.38}/llama_cpp/_internals.py RENAMED Viewed

@@ -216,13 +216,13 @@ class _LlamaModel:
         for i in range(llama_cpp.llama_model_meta_count(self.model)):
             nbytes = llama_cpp.llama_model_meta_key_by_index(self.model, i, buffer, buffer_size)
             if nbytes > buffer_size:
-                buffer_size = nbytes
+                buffer_size = nbytes + 1
                 buffer = ctypes.create_string_buffer(buffer_size)
                 nbytes = llama_cpp.llama_model_meta_key_by_index(self.model, i, buffer, buffer_size)
             key = buffer.value.decode("utf-8")
             nbytes = llama_cpp.llama_model_meta_val_str_by_index(self.model, i, buffer, buffer_size)
             if nbytes > buffer_size:
-                buffer_size = nbytes
+                buffer_size = nbytes + 1
                 buffer = ctypes.create_string_buffer(buffer_size)
                 nbytes = llama_cpp.llama_model_meta_val_str_by_index(self.model, i, buffer, buffer_size)
             value = buffer.value.decode("utf-8")

llama-cpp-python 0.2.36__tar.gz → 0.2.38__tar.gz

llama-cpp-python 0.2.36__tar.gz → 0.2.38__tar.gz