PyPI - llama-cpp-python - Versions diffs - 0.1.55__tar.gz → 0.1.56__tar.gz - Mend

llama-cpp-python 0.1.55__tar.gz → 0.1.56__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (171) hide show

llama_cpp_python-0.1.56/CHANGELOG.md ADDED Viewed

@@ -0,0 +1,20 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
+and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
+## [Unreleased]
+## [v0.1.56]
+### Added
+- Added first version of the changelog
+- Server: Use async routes
+- Use numpy for internal buffers to reduce memory usage and improve performance.
+### Fixed
+- Performance bug in stop sequence check slowing down streaming.

llama_cpp_python-0.1.56/Makefile ADDED Viewed

@@ -0,0 +1,49 @@
+# Developer task runner: every target here is a command, not a file, so all
+# of them are listed in .PHONY at the bottom.
+
+# Install Python dependencies with poetry and fetch all git submodules.
+update:
+	poetry install
+	git submodule update --init --recursive
+# Pull the latest upstream master into the vendored llama.cpp checkout.
+update.vendor:
+	cd vendor/llama.cpp && git pull origin master
+# Install the package in development mode with no extra CMake arguments.
+build:
+	python3 setup.py develop
+# The build.<backend> variants pass a backend switch through CMAKE_ARGS and
+# set FORCE_CMAKE=1 for the same development-mode install.
+build.cuda:
+	CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 python3 setup.py develop
+build.opencl:
+	CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 python3 setup.py develop
+build.openblas:
+	CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 python3 setup.py develop
+build.blis:
+	CMAKE_ARGS="-DLLAMA_OPENBLAS=on -DLLAMA_OPENBLAS_VENDOR=blis" FORCE_CMAKE=1 python3 setup.py develop
+# Create a source distribution via setup.py.
+build.sdist:
+	python3 setup.py sdist
+# Upload everything under dist/ with twine.
+deploy.pypi:
+	python3 -m twine upload dist/*
+# Build the mkdocs site, then publish it with `mkdocs gh-deploy`.
+deploy.gh-docs:
+	mkdocs build
+	mkdocs gh-deploy
+# Best-effort cleanup: each step keeps its `-` prefix so one failing step
+# (e.g. the vendor directory is missing) does not stop the remaining ones.
+# $(MAKE) is used for the sub-make so flags such as -n and -j propagate, and
+# $(RM) (rm -f) stays quiet when a file is already gone.
+clean:
+	- cd vendor/llama.cpp && $(MAKE) clean
+	- $(RM) vendor/llama.cpp/libllama.so
+	- $(RM) -r _skbuild
+	- $(RM) llama_cpp/libllama.so
+.PHONY: \
+	update \
+	update.vendor \
+	build \
+	build.cuda \
+	build.opencl \
+	build.openblas \
+	build.blis \
+	build.sdist \
+	deploy.pypi \
+	deploy.gh-docs \
+	clean

{llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama_cpp_python
-Version: 0.1.55
+Version: 0.1.56
 Summary: A Python wrapper for llama.cpp
 Author: Andrei Betlen
 Author-email: abetlen@gmail.com
@@ -173,6 +173,17 @@ To get started, clone the repository and install the package in development mode
 ```bash
 git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git
+# Install with pip
+pip install -e .
+# if you want to use the fastapi / openapi server
+pip install -e .[server]
+# If you're a poetry user, installing will also include a virtual environment
+poetry install --all-extras
+. .venv/bin/activate
 # Will need to be re-run any time vendor/llama.cpp is updated
 python3 setup.py develop
 ```

{llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/README.md RENAMED Viewed

@@ -155,6 +155,17 @@ To get started, clone the repository and install the package in development mode
 ```bash
 git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git
+# Install with pip
+pip install -e .
+# if you want to use the fastapi / openapi server
+pip install -e .[server]
+# If you're a poetry user, installing will also include a virtual environment
+poetry install --all-extras
+. .venv/bin/activate
 # Will need to be re-run any time vendor/llama.cpp is updated
 python3 setup.py develop
 ```

{llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp/llama.py RENAMED Viewed

@@ -20,6 +20,9 @@ from collections import deque, OrderedDict
 from . import llama_cpp
 from .llama_types import *
+import numpy as np
+import numpy.typing as npt
 class LlamaCache:
     """Cache for a llama.cpp model."""
@@ -73,11 +76,15 @@ class LlamaState:
         self,
         eval_tokens: Deque[int],
         eval_logits: Deque[List[float]],
+        input_ids: npt.NDArray[np.intc],
+        scores: npt.NDArray[np.single],
         llama_state,  # type: llama_cpp.Array[llama_cpp.c_uint8]
         llama_state_size: int,
     ):
         self.eval_tokens = eval_tokens
         self.eval_logits = eval_logits
+        self.input_ids = input_ids
+        self.scores = scores
         self.llama_state = llama_state
         self.llama_state_size = llama_state_size
@@ -207,20 +214,17 @@ class Llama:
         self._n_vocab = self.n_vocab()
         self._n_ctx = self.n_ctx()
-        data = (llama_cpp.llama_token_data * self._n_vocab)(
-            *[
-                llama_cpp.llama_token_data(
-                    id=llama_cpp.llama_token(i),
-                    logit=llama_cpp.c_float(0.0),
-                    p=llama_cpp.c_float(0.0),
-                )
-                for i in range(self._n_vocab)
-            ]
-        )
         size = llama_cpp.c_size_t(self._n_vocab)
-        sorted = False
+        sorted = llama_cpp.c_bool(False)
+        self._candidates_data = np.array(
+            [],
+            dtype=np.dtype(
+                [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True
+            ),
+        )
+        self._candidates_data.resize(3, self._n_vocab)
         candidates = llama_cpp.llama_token_data_array(
-            data=data,
+            data=self._candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p),
             size=size,
             sorted=sorted,
         )
@@ -228,6 +232,9 @@ class Llama:
         self._token_nl = Llama.token_nl()
         self._token_eos = Llama.token_eos()
+        self._input_ids = np.array([], dtype=np.intc)
+        self._scores = np.ndarray((0, self._n_vocab), dtype=np.single)
     def tokenize(self, text: bytes, add_bos: bool = True) -> List[int]:
         """Tokenize a string.
@@ -295,6 +302,8 @@ class Llama:
         """Reset the model state."""
         self.eval_tokens.clear()
         self.eval_logits.clear()
+        self._input_ids = np.array([], dtype=np.intc)
+        self._scores = np.ndarray((0, self._n_vocab), dtype=np.single)
     def eval(self, tokens: Sequence[int]):
         """Evaluate a list of tokens.
@@ -306,7 +315,7 @@ class Llama:
         n_ctx = self._n_ctx
         for i in range(0, len(tokens), self.n_batch):
             batch = tokens[i : min(len(tokens), i + self.n_batch)]
-            n_past = min(n_ctx - len(batch), len(self.eval_tokens))
+            n_past = min(n_ctx - len(batch), len(self._input_ids))
             n_tokens = len(batch)
             return_code = llama_cpp.llama_eval(
                 ctx=self.ctx,
@@ -319,6 +328,9 @@ class Llama:
                 raise RuntimeError(f"llama_eval returned {return_code}")
             # Save tokens
             self.eval_tokens.extend(batch)
+            self._input_ids: npt.NDArray[np.intc] = np.concatenate(
+                (self._input_ids, np.array(batch, dtype=np.intc)), axis=0
+            )
             # Save logits
             rows = n_tokens if self.params.logits_all else 1
             n_vocab = self._n_vocab
@@ -326,6 +338,9 @@ class Llama:
             logits_view = llama_cpp.llama_get_logits(self.ctx)
             logits = [logits_view[i * cols : (i + 1) * cols] for i in range(rows)]
             self.eval_logits.extend(logits)
+            self._scores: npt.NDArray[np.single] = np.concatenate(
+                (self._scores, np.array(logits, dtype=np.single)), axis=0
+            )
     def _sample(
         self,
@@ -346,6 +361,7 @@ class Llama:
     ):
         assert self.ctx is not None
         assert len(self.eval_logits) > 0
+        assert self._scores.shape[0] > 0
         n_vocab = self._n_vocab
         n_ctx = self._n_ctx
         top_k = llama_cpp.c_int(n_vocab) if top_k.value <= 0 else top_k
@@ -354,18 +370,23 @@ class Llama:
             if last_n_tokens_size.value < 0
             else last_n_tokens_size
         )
-        logits = self.eval_logits[-1]
+        logits: npt.NDArray[np.single] = self._scores[-1, :]
         if logits_processor is not None:
-            logits = logits_processor(list(self.eval_tokens), logits)
-            self.eval_logits[-1] = logits
+            logits = np.array(
+                logits_processor(self._input_ids.tolist(), logits.tolist()),
+                dtype=np.single,
+            )
+            self._scores[-1, :] = logits
+            self.eval_logits[-1] = logits.tolist()
         nl_logit = logits[self._token_nl]
         candidates = self._candidates
-        for i, logit in enumerate(logits):
-            candidates.data[i].id = llama_cpp.llama_token(i)
-            candidates.data[i].logit = llama_cpp.c_float(logit)
-            candidates.data[i].p = llama_cpp.c_float(0.0)
+        candidates_data = self._candidates_data
+        candidates_data["id"] = np.arange(n_vocab, dtype=np.intc)  # type: ignore
+        candidates_data["logit"] = logits
+        candidates_data["p"] = np.zeros(n_vocab, dtype=np.single)
+        candidates.data = candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p)
         candidates.sorted = llama_cpp.c_bool(False)
         candidates.size = llama_cpp.c_size_t(n_vocab)
         llama_cpp.llama_sample_repetition_penalty(
@@ -483,8 +504,8 @@ class Llama:
         """
         assert self.ctx is not None
         last_n_tokens_data = [llama_cpp.llama_token(0)] * max(
-            0, self.last_n_tokens_size - len(self.eval_tokens)
-        ) + list(self.eval_tokens)[-self.last_n_tokens_size :]
+            0, self.last_n_tokens_size - len(self._input_ids)
+        ) + self._input_ids[-self.last_n_tokens_size :].tolist()
         return self._sample(
             last_n_tokens_data=(llama_cpp.llama_token * self.last_n_tokens_size)(
                 *last_n_tokens_data
@@ -542,9 +563,9 @@ class Llama:
         """
         assert self.ctx is not None
-        if reset and len(self.eval_tokens) > 0:
+        if reset and len(self._input_ids) > 0:
             longest_prefix = 0
-            for a, b in zip(self.eval_tokens, tokens[:-1]):
+            for a, b in zip(self._input_ids, tokens[:-1]):
                 if a == b:
                     longest_prefix += 1
                 else:
@@ -554,6 +575,8 @@ class Llama:
                     print("Llama.generate: prefix-match hit", file=sys.stderr)
                 reset = False
                 tokens = tokens[longest_prefix:]
+                self._input_ids = self._input_ids[:longest_prefix]
+                self._scores = self._scores[:longest_prefix, :]
                 for _ in range(len(self.eval_tokens) - longest_prefix):
                     self.eval_tokens.pop()
                     try:
@@ -580,7 +603,7 @@ class Llama:
                 logits_processor=logits_processor,
             )
             if stopping_criteria is not None and stopping_criteria(
-                list(self.eval_tokens), self.eval_logits[-1]
+                self._input_ids.tolist(), self._scores[-1, :].tolist()
             ):
                 return
             tokens_or_none = yield token
@@ -715,10 +738,10 @@ class Llama:
             try:
                 cache_item = self.cache[prompt_tokens]
                 cache_prefix_len = Llama.longest_token_prefix(
-                    cache_item.eval_tokens, prompt_tokens
+                    cache_item.input_ids.tolist(), prompt_tokens
                 )
                 eval_prefix_len = Llama.longest_token_prefix(
-                    self.eval_tokens, prompt_tokens
+                    self._input_ids.tolist(), prompt_tokens
                 )
                 if cache_prefix_len > eval_prefix_len:
                     self.load_state(cache_item)
@@ -775,20 +798,22 @@ class Llama:
                 break
             if stream:
+                remaining_tokens = completion_tokens[returned_tokens:]
+                remaining_text = self.detokenize(remaining_tokens)
+                remaining_length = len(remaining_text)
                 # We want to avoid yielding any characters from
                 # the generated text if they are part of a stop
                 # sequence.
                 first_stop_position = 0
                 for s in stop_sequences:
-                    for i in range(len(s), 0, -1):
-                        if all_text.endswith(s[:i]):
+                    for i in range(min(len(s), remaining_length), 0, -1):
+                        if remaining_text.endswith(s[:i]):
                             if i > first_stop_position:
                                 first_stop_position = i
                             break
                 token_end_position = 0
-                remaining_tokens = completion_tokens[returned_tokens:]
-                remaining_length = len(self.detokenize(remaining_tokens))
                 for token in remaining_tokens:
                     token_end_position += len(self.detokenize([token]))
                     # Check if stop sequence is in the token
@@ -805,7 +830,7 @@ class Llama:
                             self.detokenize(completion_tokens[:returned_tokens])
                         )
                         token_offset = len(prompt_tokens) + returned_tokens
-                        logits = self.eval_logits[token_offset - 1]
+                        logits = self._scores[token_offset - 1, :].tolist()
                         current_logprobs = Llama.logits_to_logprobs(logits)
                         sorted_logprobs = list(
                             sorted(
@@ -854,7 +879,7 @@ class Llama:
                 break
         if stopping_criteria is not None and stopping_criteria(
-            list(self.eval_tokens), self.eval_logits[-1]
+            self._input_ids.tolist(), self._scores[-1, :].tolist()
         ):
             text = self.detokenize(completion_tokens)
             finish_reason = "stop"
@@ -884,7 +909,7 @@ class Llama:
                         self.detokenize(completion_tokens[:returned_tokens])
                     )
                     token_offset = len(prompt_tokens) + returned_tokens - 1
-                    logits = self.eval_logits[token_offset]
+                    logits = self._scores[token_offset, :].tolist()
                     current_logprobs = Llama.logits_to_logprobs(logits)
                     sorted_logprobs = list(
                         sorted(
@@ -986,8 +1011,7 @@ class Llama:
                 for token in all_tokens
             ]
             all_logprobs = [
-                Llama.logits_to_logprobs(list(map(float, row)))
-                for row in self.eval_logits
+                Llama.logits_to_logprobs(row.tolist()) for row in self._scores
             ][token_offset:]
             for token, token_str, logprobs_token in zip(
                 all_tokens, all_token_strs, all_logprobs
@@ -1371,6 +1395,8 @@ class Llama:
         return LlamaState(
             eval_tokens=self.eval_tokens.copy(),
             eval_logits=self.eval_logits.copy(),
+            scores=self._scores.copy(),
+            input_ids=self._input_ids.copy(),
             llama_state=llama_state_compact,
             llama_state_size=n_bytes,
         )
@@ -1379,6 +1405,8 @@ class Llama:
         assert self.ctx is not None
         self.eval_tokens = state.eval_tokens.copy()
         self.eval_logits = state.eval_logits.copy()
+        self._scores = state.scores.copy()
+        self._input_ids = state.input_ids.copy()
         state_size = state.llama_state_size
         if llama_cpp.llama_set_state_data(self.ctx, state.llama_state) != state_size:
             raise RuntimeError("Failed to set llama state data")

{llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp/server/app.py RENAMED Viewed

@@ -1,13 +1,16 @@
 import json
-import logging
 import multiprocessing
 from threading import Lock
-from typing import List, Optional, Union, Iterator, Dict
+from functools import partial
+from typing import Iterator, List, Optional, Union, Dict
 from typing_extensions import TypedDict, Literal
 import llama_cpp
-from fastapi import Depends, FastAPI, APIRouter
+import anyio
+from anyio.streams.memory import MemoryObjectSendStream
+from starlette.concurrency import run_in_threadpool, iterate_in_threadpool
+from fastapi import Depends, FastAPI, APIRouter, Request
 from fastapi.middleware.cors import CORSMiddleware
 from pydantic import BaseModel, BaseSettings, Field, create_model_from_typeddict
 from sse_starlette.sse import EventSourceResponse
@@ -242,35 +245,49 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
     "/v1/completions",
     response_model=CreateCompletionResponse,
 )
-def create_completion(
-    request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama)
+async def create_completion(
+    request: Request,
+    body: CreateCompletionRequest,
+    llama: llama_cpp.Llama = Depends(get_llama),
 ):
-    if isinstance(request.prompt, list):
-        assert len(request.prompt) <= 1
-        request.prompt = request.prompt[0] if len(request.prompt) > 0 else ""
-    completion_or_chunks = llama(
-        **request.dict(
-            exclude={
-                "n",
-                "best_of",
-                "logit_bias",
-                "user",
-            }
-        )
-    )
-    if request.stream:
-        async def server_sent_events(
-            chunks: Iterator[llama_cpp.CompletionChunk],
-        ):
-            for chunk in chunks:
-                yield dict(data=json.dumps(chunk))
+    if isinstance(body.prompt, list):
+        assert len(body.prompt) <= 1
+        body.prompt = body.prompt[0] if len(body.prompt) > 0 else ""
+    exclude = {
+        "n",
+        "best_of",
+        "logit_bias",
+        "user",
+    }
+    kwargs = body.dict(exclude=exclude)
+    if body.stream:
+        send_chan, recv_chan = anyio.create_memory_object_stream(10)
+        async def event_publisher(inner_send_chan: MemoryObjectSendStream):
+            async with inner_send_chan:
+                try:
+                    iterator: Iterator[llama_cpp.CompletionChunk] = await run_in_threadpool(llama, **kwargs)  # type: ignore
+                    async for chunk in iterate_in_threadpool(iterator):
+                        await inner_send_chan.send(dict(data=json.dumps(chunk)))
+                        if await request.is_disconnected():
+                            raise anyio.get_cancelled_exc_class()()
+                    await inner_send_chan.send(dict(data="[DONE]"))
+                except anyio.get_cancelled_exc_class() as e:
+                    print("disconnected")
+                    with anyio.move_on_after(1, shield=True):
+                        print(
+                            f"Disconnected from client (via refresh/close) {request.client}"
+                        )
+                        await inner_send_chan.send(dict(closing=True))
+                        raise e
-        chunks: Iterator[llama_cpp.CompletionChunk] = completion_or_chunks  # type: ignore
-        return EventSourceResponse(server_sent_events(chunks))
-    completion: llama_cpp.Completion = completion_or_chunks  # type: ignore
-    return completion
+        return EventSourceResponse(
+            recv_chan, data_sender_callable=partial(event_publisher, send_chan)
+        )
+    else:
+        completion: llama_cpp.Completion = await run_in_threadpool(llama, **kwargs)  # type: ignore
+        return completion
 class CreateEmbeddingRequest(BaseModel):
@@ -293,10 +310,12 @@ CreateEmbeddingResponse = create_model_from_typeddict(llama_cpp.Embedding)
     "/v1/embeddings",
     response_model=CreateEmbeddingResponse,
 )
-def create_embedding(
+async def create_embedding(
     request: CreateEmbeddingRequest, llama: llama_cpp.Llama = Depends(get_llama)
 ):
-    return llama.create_embedding(**request.dict(exclude={"user"}))
+    return await run_in_threadpool(
+        llama.create_embedding, **request.dict(exclude={"user"})
+    )
 class ChatCompletionRequestMessage(BaseModel):
@@ -350,36 +369,47 @@ CreateChatCompletionResponse = create_model_from_typeddict(llama_cpp.ChatComplet
     "/v1/chat/completions",
     response_model=CreateChatCompletionResponse,
 )
-def create_chat_completion(
-    request: CreateChatCompletionRequest,
+async def create_chat_completion(
+    request: Request,
+    body: CreateChatCompletionRequest,
     llama: llama_cpp.Llama = Depends(get_llama),
 ) -> Union[llama_cpp.ChatCompletion, EventSourceResponse]:
-    completion_or_chunks = llama.create_chat_completion(
-        **request.dict(
-            exclude={
-                "n",
-                "logit_bias",
-                "user",
-            }
-        ),
-    )
-    if request.stream:
-        async def server_sent_events(
-            chat_chunks: Iterator[llama_cpp.ChatCompletionChunk],
-        ):
-            for chat_chunk in chat_chunks:
-                yield dict(data=json.dumps(chat_chunk))
-            yield dict(data="[DONE]")
-        chunks: Iterator[llama_cpp.ChatCompletionChunk] = completion_or_chunks  # type: ignore
+    exclude = {
+        "n",
+        "logit_bias",
+        "user",
+    }
+    kwargs = body.dict(exclude=exclude)
+    if body.stream:
+        send_chan, recv_chan = anyio.create_memory_object_stream(10)
+        async def event_publisher(inner_send_chan: MemoryObjectSendStream):
+            async with inner_send_chan:
+                try:
+                    iterator: Iterator[llama_cpp.ChatCompletionChunk] = await run_in_threadpool(llama.create_chat_completion, **kwargs)  # type: ignore
+                    async for chat_chunk in iterate_in_threadpool(iterator):
+                        await inner_send_chan.send(dict(data=json.dumps(chat_chunk)))
+                        if await request.is_disconnected():
+                            raise anyio.get_cancelled_exc_class()()
+                    await inner_send_chan.send(dict(data="[DONE]"))
+                except anyio.get_cancelled_exc_class() as e:
+                    print("disconnected")
+                    with anyio.move_on_after(1, shield=True):
+                        print(
+                            f"Disconnected from client (via refresh/close) {request.client}"
+                        )
+                        await inner_send_chan.send(dict(closing=True))
+                        raise e
         return EventSourceResponse(
-            server_sent_events(chunks),
+            recv_chan,
+            data_sender_callable=partial(event_publisher, send_chan),
+        )
+    else:
+        completion: llama_cpp.ChatCompletion = await run_in_threadpool(
+            llama.create_chat_completion, **kwargs  # type: ignore
         )
-    completion: llama_cpp.ChatCompletion = completion_or_chunks  # type: ignore
-    return completion
+        return completion
 class ModelData(TypedDict):
@@ -398,7 +428,7 @@ GetModelResponse = create_model_from_typeddict(ModelList)
 @router.get("/v1/models", response_model=GetModelResponse)
-def get_models(
+async def get_models(
     settings: Settings = Depends(get_settings),
     llama: llama_cpp.Llama = Depends(get_llama),
 ) -> ModelList:

{llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp_python.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama-cpp-python
-Version: 0.1.55
+Version: 0.1.56
 Summary: A Python wrapper for llama.cpp
 Author: Andrei Betlen
 Author-email: abetlen@gmail.com
@@ -173,6 +173,17 @@ To get started, clone the repository and install the package in development mode
 ```bash
 git clone --recurse-submodules git@github.com:abetlen/llama-cpp-python.git
+# Install with pip
+pip install -e .
+# if you want to use the fastapi / openapi server
+pip install -e .[server]
+# If you're a poetry user, installing will also include a virtual environment
+poetry install --all-extras
+. .venv/bin/activate
 # Will need to be re-run any time vendor/llama.cpp is updated
 python3 setup.py develop
 ```

{llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp_python.egg-info/SOURCES.txt RENAMED Viewed

@@ -1,11 +1,14 @@
 .dockerignore
 .gitignore
 .gitmodules
+CHANGELOG.md
 CMakeLists.txt
 LICENSE.md
+Makefile
 README.md
 mkdocs.yml
 poetry.lock
+poetry.toml
 pyproject.toml
 setup.py
 .github/dependabot.yml

{llama_cpp_python-0.1.55 → llama_cpp_python-0.1.56}/llama_cpp_python.egg-info/requires.txt RENAMED Viewed

@@ -1,4 +1,5 @@
 typing-extensions>=4.5.0
+numpy>=1.20.0
 [server]
 uvicorn>=0.21.1

llama-cpp-python 0.1.55__tar.gz → 0.1.56__tar.gz

llama-cpp-python 0.1.55__tar.gz → 0.1.56__tar.gz