PyPI - llama-cpp-python - Versions diffs - 0.1.54__tar.gz → 0.1.56__tar.gz - Mend

llama-cpp-python 0.1.54__tar.gz → 0.1.56__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (171) hide show

llama_cpp_python-0.1.56/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,20 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [Unreleased]
+## [v0.1.56]
+### Added
+- Added first version of the changelog
+- Server: Use async routes
+- Use numpy for internal buffers to reduce memory usage and improve performance.
+### Fixed
+- Performance bug in stop sequence check slowing down streaming.

llama_cpp_python-0.1.56/Makefile ADDED Viewed

@@ -0,0 +1,49 @@
+# Task shortcuts for developing, building and releasing the package.
+# Every target here is a command rather than a file, so each one is listed in .PHONY below.
+# Install the Python dependencies and fetch the git submodules.
+update:
+	poetry install
+	git submodule update --init --recursive
+# Pull the latest master of the vendored llama.cpp checkout.
+update.vendor:
+	cd vendor/llama.cpp && git pull origin master
+# Development build with no extra CMake flags.
+build:
+	python3 setup.py develop
+# Backend-specific development builds: each passes its flags through CMAKE_ARGS with FORCE_CMAKE=1.
+build.cuda:
+	CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 setup.py develop
+build.opencl:
+	CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 python3 setup.py develop
+build.openblas:
+	CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python3 setup.py develop
+build.blis:
+	CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 setup.py develop
+# Create the source distribution.
+build.sdist:
+	python3 setup.py sdist
+# Upload everything in dist/ with twine.
+deploy.pypi:
+	python3 -m twine upload dist/*
+# Build the mkdocs site and publish it with gh-deploy.
+deploy.gh-docs:
+	mkdocs build
+	mkdocs gh-deploy
+# Remove build products. Each line is '-'-prefixed because any of these files may not exist yet.
+# $(MAKE) instead of a literal `make` keeps -n/-j and the jobserver working in the sub-make.
+clean:
+	- cd vendor/llama.cpp && $(MAKE) clean
+	- cd vendor/llama.cpp && rm libllama.so
+	- rm -rf _skbuild
+	- rm llama_cpp/libllama.so
+.PHONY: \
+	update \
+	update.vendor \
+	build \
+	build.cuda \
+	build.opencl \
+	build.openblas \
+	build.blis \
+	build.sdist \
+	deploy.pypi \
+	deploy.gh-docs \
+	clean

{llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama_cpp_python
-Version: 0.1.54
+Version: 0.1.56
 Summary: A Python wrapper for llama.cpp
 Author: Andrei Betlen
 Author-email: abetlen@gmail.com
@@ -173,6 +173,17 @@ To get started, clone the repository and install the package in development mode
 ```bash
 git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git
+# Install with pip
+pip install -e .
+# if you want to use the fastapi / openapi server
+pip install -e .[server]
+# If you're a poetry user, installing will also include a virtual environment
+poetry install --all-extras
+. .venv/bin/activate
 # Will need to be re-run any time vendor/llama.cpp is updated
 python3 setup.py develop
 ```

{llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/README.md RENAMED Viewed

@@ -155,6 +155,17 @@ To get started, clone the repository and install the package in development mode
 ```bash
 git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git
+# Install with pip
+pip install -e .
+# if you want to use the fastapi / openapi server
+pip install -e .[server]
+# If you're a poetry user, installing will also include a virtual environment
+poetry install --all-extras
+. .venv/bin/activate
 # Will need to be re-run any time vendor/llama.cpp is updated
 python3 setup.py develop
 ```

llama_cpp_python-0.1.56/docker/Dockerfile ADDED Viewed

@@ -0,0 +1,51 @@
+# Single Dockerfile for both server variants: the base image chosen through the
+# IMAGE build argument decides whether the OpenBLAS or the CuBLAS install branch runs.
+# Define the image argument and provide a default value
+ARG IMAGE=python:3-slim-bullseye
+# Use the image as specified
+FROM ${IMAGE}
+# Re-declare the ARG after FROM
+ARG IMAGE
+# Build arguments are not present in the running container, but /app/start_server.sh
+# branches on $IMAGE at start-up, so persist the value as an environment variable.
+ENV IMAGE=${IMAGE}
+# Update and upgrade the existing packages
+RUN apt-get update && apt-get upgrade -y && apt-get install -y --no-install-recommends \
+    python3 \
+    python3-pip \
+    ninja-build \
+    build-essential
+RUN python3 -m pip install --upgrade pip pytest cmake scikit-build setuptools fastapi uvicorn sse-starlette
+# Perform the conditional installations based on the image
+RUN echo "Image: ${IMAGE}" && \
+    if [ "${IMAGE}" = "python:3-slim-bullseye" ] ; then \
+    echo "OpenBLAS install:" && \
+    apt-get install -y --no-install-recommends libopenblas-dev && \
+    LLAMA_OPENBLAS=1 pip install llama-cpp-python --verbose; \
+else \
+    echo "CuBLAS install:" && \
+    LLAMA_CUBLAS=1 pip install llama-cpp-python --verbose; \
+fi
+# Clean up apt cache
+RUN rm -rf /var/lib/apt/lists/*
+# Set a working directory for better clarity
+WORKDIR /app
+# Copy files to the app directory
+RUN echo "Installing model...this can take some time..."
+COPY ./model.bin /app/model.bin
+COPY ./start_server.sh /app/start_server.sh
+# Make the server start script executable
+RUN chmod +x /app/start_server.sh
+# Set environment variable for the host
+ENV HOST=0.0.0.0
+# Expose a port for the server
+EXPOSE 8000
+# Run the server start script
+CMD ["/bin/sh", "/app/start_server.sh"]

llama_cpp_python-0.1.56/docker/README.md ADDED Viewed

@@ -0,0 +1,46 @@
+# Dockerfiles for building the llama-cpp-python server
+- `Dockerfile.openblas_simple` - a simple Dockerfile for non-GPU OpenBLAS
+- `Dockerfile.cuda_simple` - a simple Dockerfile for CUDA accelerated CuBLAS
+- `hug_model.py` - a Python utility for interactively choosing and downloading the latest `5_1` quantized models from [huggingface.co/TheBloke](https://huggingface.co/TheBloke)
+- `Dockerfile` - a single OpenBLAS and CuBLAS combined Dockerfile that automatically installs a previously downloaded model `model.bin`
+# Get model from Hugging Face
+`python3 ./hug_model.py`
+You should now have a model in the current directory and `model.bin` symlinked to it for the subsequent Docker build and copy step. e.g.
+```
+docker $ ls -lh *.bin
+-rw-rw-r-- 1 user user 4.8G May 23 18:30 <downloaded-model-file>.q5_1.bin
+lrwxrwxrwx 1 user user   24 May 23 18:30 model.bin -> <downloaded-model-file>.q5_1.bin
+```
+**Note #1:** Make sure you have enough disk space to download the model. As the model is then copied into the image you will need at least
+**TWICE** as much disk space as the size of the model:
+| Model |  Quantized size |
+|------:|----------------:|
+|    7B |            5 GB |
+|   13B |           10 GB |
+|   30B |           25 GB |
+|   65B |           50 GB |
+**Note #2:** If you want to pass or tune additional parameters, customise `./start_server.sh` before running `docker build ...`
+# Install Docker Server
+**Note #3:** This was tested with Docker running on Linux. If you can get it working on Windows or MacOS, please update this `README.md` with a PR!
+[Install Docker Engine](https://docs.docker.com/engine/install)
+# Use OpenBLAS
+Use this if you don't have an NVIDIA GPU. It defaults to the `python:3-slim-bullseye` Docker base image and OpenBLAS:
+## Build:
+`docker build -t openblas .`
+## Run:
+`docker run --cap-add SYS_RESOURCE -t openblas`
+# Use CuBLAS
+Requires an NVIDIA GPU with sufficient VRAM (approximately as much as the quantized model size listed above) and Docker NVIDIA support (see [container-toolkit/install-guide](https://docs.nvidia.com/datacenter/cloud-native/container-toolkit/install-guide.html))
+## Build:
+`docker build --build-arg IMAGE=nvidia/cuda:12.1.1-devel-ubuntu22.04 -t cublas .`
+## Run:
+`docker run --cap-add SYS_RESOURCE -t cublas`

llama_cpp_python-0.1.56/docker/hug_model.py ADDED Viewed

@@ -0,0 +1,116 @@
+import requests
+import json
+import os
+import struct
+def make_request(url, params=None):
+    print(f"Making request to {url}...")
+    response = requests.get(url, params=params)
+    if response.status_code == 200:
+        return json.loads(response.text)
+    else:
+        print(f"Request failed with status code {response.status_code}")
+        return None
+def check_magic_and_version(filename):
+    with open(filename, 'rb') as f:
+        # Read the first 6 bytes from the file
+        data = f.read(6)
+    # Unpack the binary data, interpreting the first 4 bytes as a little-endian unsigned int
+    # and the next 2 bytes as a little-endian unsigned short
+    magic, version = struct.unpack('<I H', data)
+    print(f"magic: 0x{magic:08x}, version: 0x{version:04x}, file: {filename}")
+    return magic, version
+def download_file(url, destination):
+    print(f"Downloading {url} to {destination}...")
+    response = requests.get(url, stream=True)
+    if response.status_code == 200:
+        with open(destination, 'wb') as f:
+            total_downloaded = 0
+            for chunk in response.iter_content(chunk_size=1024):
+                if chunk:  # filter out keep-alive new chunks
+                    f.write(chunk)
+                    total_downloaded += len(chunk)
+                    if total_downloaded >= 10485760:  # 10 MB
+                        print('.', end='', flush=True)
+                        total_downloaded = 0
+        print("\nDownload complete.")
+        # Creating a symbolic link from destination to "model.bin"
+        if os.path.isfile("model.bin"):
+            os.remove("model.bin")  # remove the existing link if any
+        os.symlink(destination, "model.bin")
+    else:
+        print(f"Download failed with status code {response.status_code}")
+def get_user_choice(model_list):
+    # Print the enumerated list
+    print("\n")
+    for i, (model_id, rfilename) in enumerate(model_list):
+        print(f"{i+1}: Model ID: {model_id}, RFilename: {rfilename}")
+    # Get user's choice
+    choice = input("Choose a model to download by entering the corresponding number: ")
+    try:
+        index = int(choice) - 1
+        if 0 <= index < len(model_list):
+            # Return the chosen model
+            return model_list[index]
+        else:
+            print("Invalid choice.")
+    except ValueError:
+        print("Invalid input. Please enter a number corresponding to a model.")
+    except IndexError:
+        print("Invalid choice. Index out of range.")
+    return None
+import argparse
+def main():
+    # Create an argument parser
+    parser = argparse.ArgumentParser(description='Process the model version.')
+    parser.add_argument('-v', '--version', type=int, default=0x0003,
+                        help='an integer for the version to be used')
+    # Parse the arguments
+    args = parser.parse_args()
+    # Define the parameters
+    params = {
+        "author": "TheBloke",  # Filter by author
+        "tags": "llama"
+    }
+    models = make_request('https://huggingface.co/api/models', params=params)
+    if models is None:
+        return
+    model_list = []
+    # Iterate over the models
+    for model in models:
+        model_id = model['id']
+        model_info = make_request(f'https://huggingface.co/api/models/{model_id}')
+        if model_info is None:
+            continue
+        for sibling in model_info.get('siblings', []):
+            rfilename = sibling.get('rfilename')
+            if rfilename and 'q5_1' in rfilename:
+                model_list.append((model_id, rfilename))
+    model_choice = get_user_choice(model_list)
+    if model_choice is not None:
+        model_id, rfilename = model_choice
+        url = f"https://huggingface.co/{model_id}/resolve/main/{rfilename}"
+        download_file(url, rfilename)
+        _, version = check_magic_and_version(rfilename)
+        if version != args.version:
+            print(f"Warning: Expected version {args.version}, but found different version in the file.")
+if __name__ == '__main__':
+    main()

llama_cpp_python-0.1.56/docker/start_server.sh ADDED Viewed

@@ -0,0 +1,11 @@
+#!/bin/sh
+# For mmap support
+ulimit -l unlimited
+if [ "$IMAGE" = "python:3-slim-bullseye" ]; then
+    python3 -B -m llama_cpp.server --model /app/model.bin
+else
+    # You may have to reduce --n_gpu_layers=1000 to 20 or less if you don't have enough VRAM
+    python3 -B -m llama_cpp.server --model /app/model.bin --n_gpu_layers=1000
+fi

{llama_cpp_python-0.1.54 → llama_cpp_python-0.1.56}/examples/low_level_api/low_level_api_chat_cpp.py RENAMED Viewed

@@ -368,10 +368,10 @@ n_keep = {self.params.n_keep}
 						id = llama_cpp.llama_sample_token_mirostat_v2(self.ctx, candidates_p, llama_cpp.c_float(self.params.mirostat_tau), llama_cpp.c_float(self.params.mirostat_eta), llama_cpp.c_float(mirostat_mu))
 					else:
 						# Temperature sampling
-						llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k)
-						llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z))
-						llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p))
-						llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p))
+						llama_cpp.llama_sample_top_k(self.ctx, candidates_p, top_k, min_keep=llama_cpp.c_size_t(1))
+						llama_cpp.llama_sample_tail_free(self.ctx, candidates_p, llama_cpp.c_float(self.params.tfs_z), min_keep=llama_cpp.c_size_t(1))
+						llama_cpp.llama_sample_typical(self.ctx, candidates_p, llama_cpp.c_float(self.params.typical_p), min_keep=llama_cpp.c_size_t(1))
+						llama_cpp.llama_sample_top_p(self.ctx, candidates_p, llama_cpp.c_float(self.params.top_p), min_keep=llama_cpp.c_size_t(1))
 						llama_cpp.llama_sample_temperature(self.ctx, candidates_p, llama_cpp.c_float(self.params.temp))
 						id = llama_cpp.llama_sample_token(self.ctx, candidates_p)
 				# print("`{}`".format(candidates_p.size))
@@ -382,12 +382,15 @@ n_keep = {self.params.n_keep}
 				# replace end of text token with newline token when in interactive mode
 				if (id == llama_cpp.llama_token_eos() and self.params.interactive and not self.params.instruct):
 					id = self.llama_token_newline[0]
+					self.embd.append(id)
 					if (self.use_antiprompt()):
 						# tokenize and inject first reverse prompt
 						self.embd_inp += self.first_antiprompt[0]
-				# add it to the context
-				self.embd.append(id)
+						for id in self.first_antiprompt[0]:
+							self.embd.append(id)
+				else:
+					# add it to the context
+					self.embd.append(id)
 				# echo this to console
 				self.output_echo = True
@@ -493,7 +496,7 @@ n_keep = {self.params.n_keep}
 			# Contains multi-byte UTF8
 			for num, pattern in [(2, 192), (3, 224), (4, 240)]:
 				# Bitwise AND check
-				if pattern & int.from_bytes(cur_char) == pattern:
+				if pattern & int.from_bytes(cur_char, 'little') == pattern:
 					self.multibyte_fix = [cur_char] + ([None] * (num-1))
 			# Stop incomplete bytes from passing

llama-cpp-python 0.1.54__tar.gz → 0.1.56__tar.gz

llama-cpp-python 0.1.54__tar.gz → 0.1.56__tar.gz