PyPI - llama-cpp-python - Versions diffs - 0.1.57__tar.gz → 0.1.59__tar.gz - Mend

llama-cpp-python 0.1.57__tar.gz → 0.1.59__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (185) hide show

{llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/.gitignore RENAMED Viewed

@@ -164,3 +164,6 @@ cython_debug/
 #  and can be added to the global gitignore or merged into this file.  For a more nuclear
 #  option (not recommended) you can uncomment the following to ignore the entire idea folder.
 .idea/
+# downloaded model .bin files
+docker/open_llama/*.bin

llama_cpp_python-0.1.59/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,43 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [Unreleased]
+## [v0.1.59]
+### Added
+- (llama.cpp) k-quants support
+- (server) mirostat sampling parameters to server
+### Fixed
+- Support both `.so` and `.dylib` for `libllama` on MacOS
+## [v0.1.58]
+### Added
+- (llama.cpp) Metal Silicon support
+## [v0.1.57]
+### Added
+- (llama.cpp) OpenLlama 3B support
+## [v0.1.56]
+### Added
+- (misc) Added first version of the changelog
+- (server) Use async routes
+- (python-api) Use numpy for internal buffers to reduce memory usage and improve performance.
+### Fixed
+- (python-api) Performance bug in stop sequence check slowing down streaming.

{llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/CMakeLists.txt RENAMED Viewed

@@ -27,5 +27,7 @@ else()
         TARGETS llama
         LIBRARY DESTINATION llama_cpp
         RUNTIME DESTINATION llama_cpp
+        ARCHIVE DESTINATION llama_cpp
+        FRAMEWORK DESTINATION llama_cpp
     )
 endif()

{llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/Makefile RENAMED Viewed

@@ -20,6 +20,9 @@ build.openblas:
 build.blis:
 	CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 setup.py develop
+build.metal:
+	CMAKE_ARGS="-DLLAMA_METAL=on" FORCE_CMAKE=1 python3 setup.py develop
 build.sdist:
 	python3 setup.py sdist
@@ -34,7 +37,9 @@ clean:
 	- cd vendor/llama.cpp && make clean
 	- cd vendor/llama.cpp && rm libllama.so
 	- rm -rf _skbuild
-	- rm llama_cpp/libllama.so
+	- rm llama_cpp/*.so
+	- rm llama_cpp/*.dylib
+	- rm llama_cpp/*.dll
 .PHONY: \
 	update \

{llama_cpp_python-0.1.57 → llama_cpp_python-0.1.59}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama_cpp_python
-Version: 0.1.57
+Version: 0.1.59
 Summary: A Python wrapper for llama.cpp
 Author: Andrei Betlen
 Author-email: abetlen@gmail.com

llama_cpp_python-0.1.59/docker/README.md ADDED Viewed

@@ -0,0 +1,66 @@
+# Install Docker Server
+**Note #1:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR!
+[Install Docker Engine](https://docs.docker.com/engine/install)
+**Note #2:** NVidia GPU CuBLAS support requires an NVidia GPU with sufficient VRAM (approximately as much as the size in the table below) and Docker NVidia support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
+# Simple Dockerfiles for building the llama-cpp-python server with external model bin files
+## openblas_simple - a simple Dockerfile for non-GPU OpenBLAS, where the model is located outside the Docker image
+```
+cd ./openblas_simple
+docker build -t openblas_simple .
+docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t openblas_simple
+```
+where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
+## cuda_simple - a simple Dockerfile for CUDA accelerated CuBLAS, where the model is located outside the Docker image
+```
+cd ./cuda_simple
+docker build -t cuda_simple .
+docker run -e USE_MLOCK=0 -e MODEL=/var/model/<model-path> -v <model-root-path>:/var/model -t cuda_simple
+```
+where `<model-root-path>/<model-path>` is the full path to the model file on the Docker host system.
+# "Open-Llama-in-a-box"
+## Download an Apache V2.0 licensed 3B parameter Open Llama model and install into a Docker image that runs an OpenBLAS-enabled llama-cpp-python server
+```
+$ cd ./open_llama
+./build.sh
+./start.sh
+```
+# Manually choose your own Llama model from Hugging Face
+`python3 ./hug_model.py -a TheBloke -t llama`
+You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
+```
+docker $ ls -lh *.bin
+-rw-rw-r-- 1 user user 4.8G May 23 18:30 <downloaded-model-file>q5_1.bin
+lrwxrwxrwx 1 user user   24 May 23 18:30 model.bin -> <downloaded-model-file>q5_1.bin
+```
+**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least
+**TWICE** as much disk space as the size of the model:
+| Model |  Quantized size |
+|------:|----------------:|
+|    3B |            3 GB |
+|    7B |            5 GB |
+|   13B |           10 GB |
+|   33B |           25 GB |
+|   65B |           50 GB |
+**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
+## Use OpenBLAS
+Use this if you don't have an NVidia GPU. Defaults to `python:3-slim-bullseye` Docker base image and OpenBLAS:
+### Build:
+`docker build -t openblas .`
+### Run:
+`docker run --cap-add SYS_RESOURCE -t openblas`
+## Use CuBLAS
+### Build:
+`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .`
+### Run:
+`docker run --cap-add SYS_RESOURCE -t cublas`

llama_cpp_python-0.1.57/docker/Dockerfile.cuda_simple → llama_cpp_python-0.1.59/docker/cuda_simple/Dockerfile RENAMED Viewed

@@ -1,5 +1,5 @@
 ARG CUDA_IMAGE="12.1.1-devel-ubuntu22.04"
-FROM ${CUDA_IMAGE}
+FROM nvidia/cuda:${CUDA_IMAGE}
 # We need to set the host to 0.0.0.0 to allow outside access
 ENV HOST 0.0.0.0
@@ -10,7 +10,7 @@ COPY . .
 RUN apt update && apt install -y python3 python3-pip
 RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette
-RUN LLAMA_CUBLAS=1 python3 setup.py develop
+RUN LLAMA_CUBLAS=1 pip install llama-cpp-python
 # Run the server
 CMD python3 -m llama_cpp.server

llama_cpp_python-0.1.59/docker/open_llama/build.sh ADDED Viewed

@@ -0,0 +1,14 @@
+#!/bin/sh
+MODEL="open_llama_3b"
+# Get  open_llama_3b_ggml q5_1 quantization
+python3 ./hug_model.py -a SlyEcho -s ${MODEL} -f "q5_1"
+ls -lh *.bin
+# Build the default OpenBLAS image
+docker build -t $MODEL .
+docker images | egrep "^(REPOSITORY|$MODEL)"
+echo
+echo "To start the docker container run:"
+echo "docker run -t -p 8000:8000 $MODEL"

{llama_cpp_python-0.1.57/docker → llama_cpp_python-0.1.59/docker/open_llama}/hug_model.py RENAMED Viewed

@@ -2,6 +2,7 @@ import requests
 import json
 import os
 import struct
+import argparse
 def make_request(url, params=None):
     print(f"Making request to {url}...")
@@ -69,21 +70,30 @@ def get_user_choice(model_list):
     return None
-import argparse
 def main():
     # Create an argument parser
-    parser = argparse.ArgumentParser(description='Process the model version.')
+    parser = argparse.ArgumentParser(description='Process some parameters.')
+    # Arguments
     parser.add_argument('-v', '--version', type=int, default=0x0003,
-                        help='an integer for the version to be used')
+                        help='hexadecimal version number of ggml file')
+    parser.add_argument('-a', '--author', type=str, default='TheBloke',
+                        help='HuggingFace author filter')
+    parser.add_argument('-t', '--tag', type=str, default='llama',
+                        help='HuggingFace tag filter')
+    parser.add_argument('-s', '--search', type=str, default='',
+                        help='HuggingFace search filter')
+    parser.add_argument('-f', '--filename', type=str, default='q5_1',
+                        help='HuggingFace model repository filename substring match')
     # Parse the arguments
     args = parser.parse_args()
     # Define the parameters
     params = {
-        "author": "TheBloke",  # Filter by author
-        "tags": "llama"
+        "author": args.author,
+        "tags": args.tag,
+        "search": args.search
     }
     models = make_request('https://huggingface.co/api/models', params=params)
@@ -100,17 +110,30 @@ def main():
         for sibling in model_info.get('siblings', []):
             rfilename = sibling.get('rfilename')
-            if rfilename and 'q5_1' in rfilename:
+            if rfilename and args.filename in rfilename:
                 model_list.append((model_id, rfilename))
-    model_choice = get_user_choice(model_list)
+    # Choose the model
+    model_list.sort(key=lambda x: x[0])
+    if len(model_list) == 0:
+        print("No models found")
+        exit(1)
+    elif len(model_list) == 1:
+        model_choice = model_list[0]
+    else:
+        model_choice = get_user_choice(model_list)
     if model_choice is not None:
         model_id, rfilename = model_choice
         url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
-        download_file(url, rfilename)
-        _, version = check_magic_and_version(rfilename)
+        dest = f"{model_id.replace('/', '_')}_{rfilename}"
+        download_file(url, dest)
+        _, version = check_magic_and_version(dest)
         if version != args.version:
-            print(f"Warning: Expected version {args.version}, but found different version in the file.")
+             print(f"Warning: Expected version {args.version}, but found different version in the file.")
+    else:
+        print("Error - model choice was None")
+        exit(2)
 if __name__ == '__main__':
     main()

llama_cpp_python-0.1.59/docker/open_llama/start.sh ADDED Viewed

@@ -0,0 +1,28 @@
+#!/bin/sh
+MODEL="open_llama_3b"
+# Start Docker container
+docker run --cap-add SYS_RESOURCE -p 8000:8000 -t $MODEL &
+sleep 10
+echo
+docker ps | egrep "(^CONTAINER|$MODEL)"
+# Test the model works
+echo
+curl -X 'POST'   'http://localhost:8000/v1/completions'   -H 'accept: application/json'   -H 'Content-Type: application/json'   -d '{
+  "prompt": "\n\n### Instructions:\nWhat is the capital of France?\n\n### Response:\n",
+  "stop": [
+    "\n",
+    "###"
+  ]
+}' | grep Paris
+if [ $? -eq 0 ]
+then
+    echo
+    echo "$MODEL is working!!"
+else
+    echo
+    echo "ERROR: $MODEL not replying."
+    exit 1
+fi

{llama_cpp_python-0.1.57/docker → llama_cpp_python-0.1.59/docker/open_llama}/start_server.sh RENAMED Viewed

@@ -1,6 +1,6 @@
 #!/bin/sh
-# For mmap support
+# For mlock support
 ulimit -l unlimited
 if [ "$IMAGE" = "python:3-slim-bullseye" ]; then

llama_cpp_python-0.1.57/docker/Dockerfile.openblas_simple → llama_cpp_python-0.1.59/docker/openblas_simple/Dockerfile RENAMED Viewed

@@ -9,7 +9,7 @@ COPY . .
 RUN apt update && apt install -y libopenblas-dev ninja-build build-essential
 RUN python -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette
-RUN LLAMA_OPENBLAS=1 python3 setup.py develop
+RUN LLAMA_OPENBLAS=1 pip install llama_cpp_python --verbose
 # Run the server
 CMD python3 -m llama_cpp.server

llama-cpp-python 0.1.57__tar.gz → 0.1.59__tar.gz

llama-cpp-python 0.1.57__tar.gz → 0.1.59__tar.gz