PyPI - llama-cpp-python - Versions diffs - 0.1.48__tar.gz → 0.1.50__tar.gz - Mend

llama-cpp-python 0.1.48__tar.gz → 0.1.50__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (157)

{llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama_cpp_python
-Version: 0.1.48
+Version: 0.1.50
 Summary: A Python wrapper for llama.cpp
 Author: Andrei Betlen
 Author-email: abetlen@gmail.com
@@ -53,19 +53,19 @@ Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and ins
 To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing:
 ```bash
-LLAMA_OPENBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python
+CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
 ```
 To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing:
 ```bash
-LLAMA_CUBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python
+CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
 ```
 To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing:
 ```bash
-LLAMA_CLBLAST=1 FORCE_CMAKE=1 pip install llama-cpp-python
+CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python
 ```
@@ -120,7 +120,7 @@ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the
 A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server:
 ```bash
-docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
+docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
 ```
 ## Low-level API

{llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/README.md RENAMED Viewed

@@ -35,19 +35,19 @@ Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and ins
 To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing:
 ```bash
-LLAMA_OPENBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python
+CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
 ```
 To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing:
 ```bash
-LLAMA_CUBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python
+CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
 ```
 To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing:
 ```bash
-LLAMA_CLBLAST=1 FORCE_CMAKE=1 pip install llama-cpp-python
+CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python
 ```
@@ -102,7 +102,7 @@ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the
 A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server:
 ```bash
-docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
+docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
 ```
 ## Low-level API

llama_cpp_python-0.1.50/examples/high_level_api/fastapi_server.py ADDED Viewed

@@ -0,0 +1,37 @@
+"""Example FastAPI server for llama.cpp.
+To run this example:
+```bash
+pip install fastapi uvicorn sse-starlette
+export MODEL=../models/7B/...
+```
+Then run:
+```
+uvicorn llama_cpp.server.app:app --reload
+```
+or
+```
+python3 -m llama_cpp.server
+```
+Then visit http://localhost:8000/docs to see the interactive API docs.
+To actually see the implementation of the server, see llama_cpp/server/app.py
+"""
+import os
+import uvicorn
+from llama_cpp.server.app import create_app
+if __name__ == "__main__":
+    app = create_app()
+    uvicorn.run(
+        app, host=os.getenv("HOST", "localhost"), port=int(os.getenv("PORT", 8000))
+    )

{llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp/llama.py RENAMED Viewed

@@ -83,6 +83,7 @@ class Llama:
         # NOTE: These parameters are likely to change in the future.
         n_ctx: int = 512,
         n_parts: int = -1,
+        n_gpu_layers: int = 0,
         seed: int = 1337,
         f16_kv: bool = True,
         logits_all: bool = False,
@@ -129,6 +130,7 @@ class Llama:
         self.params = llama_cpp.llama_context_default_params()
         self.params.n_ctx = n_ctx
         self.params.n_parts = n_parts
+        self.params.n_gpu_layers = n_gpu_layers
         self.params.seed = seed
         self.params.f16_kv = f16_kv
         self.params.logits_all = logits_all
@@ -174,7 +176,9 @@ class Llama:
         if self.verbose:
             print(llama_cpp.llama_print_system_info().decode("utf-8"), file=sys.stderr)
-    def tokenize(self, text: bytes) -> List[llama_cpp.llama_token]:
+    def tokenize(
+        self, text: bytes, add_bos: bool = True
+    ) -> List[llama_cpp.llama_token]:
         """Tokenize a string.
         Args:
@@ -194,10 +198,22 @@ class Llama:
             text,
             tokens,
             n_ctx,
-            llama_cpp.c_bool(True),
+            llama_cpp.c_bool(add_bos),
         )
         if int(n_tokens) < 0:
-            raise RuntimeError(f'Failed to tokenize: text="{text}" n_tokens={n_tokens}')
+            n_tokens = abs(n_tokens)
+            tokens = (llama_cpp.llama_token * int(n_tokens))()
+            n_tokens = llama_cpp.llama_tokenize(
+                self.ctx,
+                text,
+                tokens,
+                llama_cpp.c_int(n_tokens),
+                llama_cpp.c_bool(add_bos),
+            )
+            if n_tokens < 0:
+                raise RuntimeError(
+                    f'Failed to tokenize: text="{text}" n_tokens={n_tokens}'
+                )
         return list(tokens[:n_tokens])
     def detokenize(self, tokens: List[llama_cpp.llama_token]) -> bytes:
@@ -268,9 +284,13 @@ class Llama:
         top_k: llama_cpp.c_int,
         top_p: llama_cpp.c_float,
         temp: llama_cpp.c_float,
+        tfs_z: llama_cpp.c_float,
         repeat_penalty: llama_cpp.c_float,
         frequency_penalty: llama_cpp.c_float,
         presence_penalty: llama_cpp.c_float,
+        mirostat_mode: llama_cpp.c_int,
+        mirostat_tau: llama_cpp.c_float,
+        mirostat_eta: llama_cpp.c_float,
     ):
         assert self.ctx is not None
         assert len(self.eval_logits) > 0
@@ -308,11 +328,41 @@ class Llama:
             alpha_frequency=frequency_penalty,
             alpha_presence=presence_penalty,
         )
-        if float(temp.value) == 0.0:
+        if temp.value == 0.0:
             return llama_cpp.llama_sample_token_greedy(
                 ctx=self.ctx,
                 candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
             )
+        elif mirostat_mode.value == 1:
+            mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value)
+            mirostat_m = llama_cpp.c_int(100)
+            llama_cpp.llama_sample_temperature(
+                ctx=self.ctx,
+                candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
+                temp=temp,
+            )
+            return llama_cpp.llama_sample_token_mirostat(
+                ctx=self.ctx,
+                candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
+                tau=mirostat_tau,
+                eta=mirostat_eta,
+                mu=llama_cpp.ctypes.byref(mirostat_mu),  # type: ignore
+                m=mirostat_m,
+            )
+        elif mirostat_mode.value == 2:
+            mirostat_mu = llama_cpp.c_float(2.0 * mirostat_tau.value)
+            llama_cpp.llama_sample_temperature(
+                ctx=self.ctx,
+                candidates=llama_cpp.ctypes.pointer(candidates),
+                temp=temp,
+            )
+            return llama_cpp.llama_sample_token_mirostat_v2(
+                ctx=self.ctx,
+                candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
+                tau=mirostat_tau,
+                eta=mirostat_eta,
+                mu=llama_cpp.ctypes.byref(mirostat_mu),  # type: ignore
+            )
         else:
             llama_cpp.llama_sample_top_k(
                 ctx=self.ctx,
@@ -323,7 +373,7 @@ class Llama:
             llama_cpp.llama_sample_tail_free(
                 ctx=self.ctx,
                 candidates=llama_cpp.ctypes.byref(candidates),  # type: ignore
-                z=llama_cpp.c_float(1.0),
+                z=tfs_z,
                 min_keep=llama_cpp.c_size_t(1),
             )
             llama_cpp.llama_sample_typical(
@@ -350,12 +400,16 @@ class Llama:
     def sample(
         self,
-        top_k: int,
-        top_p: float,
-        temp: float,
-        repeat_penalty: float,
+        top_k: int = 40,
+        top_p: float = 0.95,
+        temp: float = 0.80,
+        repeat_penalty: float = 1.1,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_eta: float = 0.1,
+        mirostat_tau: float = 5.0,
     ):
         """Sample a token from the model.
@@ -380,9 +434,13 @@ class Llama:
             top_k=llama_cpp.c_int(top_k),
             top_p=llama_cpp.c_float(top_p),
             temp=llama_cpp.c_float(temp),
+            tfs_z=llama_cpp.c_float(tfs_z),
             repeat_penalty=llama_cpp.c_float(repeat_penalty),
             frequency_penalty=llama_cpp.c_float(frequency_penalty),
             presence_penalty=llama_cpp.c_float(presence_penalty),
+            mirostat_mode=llama_cpp.c_int(mirostat_mode),
+            mirostat_tau=llama_cpp.c_float(mirostat_tau),
+            mirostat_eta=llama_cpp.c_float(mirostat_eta),
         )
     def generate(
@@ -392,9 +450,13 @@ class Llama:
         top_p: float,
         temp: float,
         repeat_penalty: float,
+        reset: bool = True,
         frequency_penalty: float = 0.0,
         presence_penalty: float = 0.0,
-        reset: bool = True,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_tau: float = 5.0,
+        mirostat_eta: float = 0.1,
     ) -> Generator[
         llama_cpp.llama_token, Optional[Sequence[llama_cpp.llama_token]], None
     ]:
@@ -447,9 +509,13 @@ class Llama:
                 top_k=top_k,
                 top_p=top_p,
                 temp=temp,
+                repeat_penalty=repeat_penalty,
                 frequency_penalty=frequency_penalty,
                 presence_penalty=presence_penalty,
-                repeat_penalty=repeat_penalty,
+                tfs_z=tfs_z,
+                mirostat_mode=mirostat_mode,
+                mirostat_tau=mirostat_tau,
+                mirostat_eta=mirostat_eta,
             )
             tokens_or_none = yield token
             tokens = [token]
@@ -528,6 +594,10 @@ class Llama:
         repeat_penalty: float = 1.1,
         top_k: int = 40,
         stream: bool = False,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_tau: float = 5.0,
+        mirostat_eta: float = 0.1,
     ) -> Union[Iterator[Completion], Iterator[CompletionChunk]]:
         assert self.ctx is not None
         completion_id: str = f"cmpl-{str(uuid.uuid4())}"
@@ -583,6 +653,10 @@ class Llama:
             top_k=top_k,
             top_p=top_p,
             temp=temperature,
+            tfs_z=tfs_z,
+            mirostat_mode=mirostat_mode,
+            mirostat_tau=mirostat_tau,
+            mirostat_eta=mirostat_eta,
             frequency_penalty=frequency_penalty,
             presence_penalty=presence_penalty,
             repeat_penalty=repeat_penalty,
@@ -655,6 +729,9 @@ class Llama:
                 print("Llama._create_completion: cache save", file=sys.stderr)
             self.cache[prompt_tokens + completion_tokens] = self.save_state()
+        if self.verbose:
+            llama_cpp.llama_print_timings(self.ctx)
         if stream:
             yield {
                 "id": completion_id,
@@ -726,9 +803,6 @@ class Llama:
                 "top_logprobs": top_logprobs,
             }
-        if self.verbose:
-            llama_cpp.llama_print_timings(self.ctx)
         yield {
             "id": completion_id,
             "object": "text_completion",
@@ -764,6 +838,10 @@ class Llama:
         repeat_penalty: float = 1.1,
         top_k: int = 40,
         stream: bool = False,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_tau: float = 5.0,
+        mirostat_eta: float = 0.1,
     ) -> Union[Completion, Iterator[CompletionChunk]]:
         """Generate text from a prompt.
@@ -801,6 +879,10 @@ class Llama:
             repeat_penalty=repeat_penalty,
             top_k=top_k,
             stream=stream,
+            tfs_z=tfs_z,
+            mirostat_mode=mirostat_mode,
+            mirostat_tau=mirostat_tau,
+            mirostat_eta=mirostat_eta,
         )
         if stream:
             chunks: Iterator[CompletionChunk] = completion_or_chunks
@@ -823,6 +905,10 @@ class Llama:
         repeat_penalty: float = 1.1,
         top_k: int = 40,
         stream: bool = False,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_tau: float = 5.0,
+        mirostat_eta: float = 0.1,
     ) -> Union[Completion, Iterator[CompletionChunk]]:
         """Generate text from a prompt.
@@ -860,6 +946,10 @@ class Llama:
             repeat_penalty=repeat_penalty,
             top_k=top_k,
             stream=stream,
+            tfs_z=tfs_z,
+            mirostat_mode=mirostat_mode,
+            mirostat_tau=mirostat_tau,
+            mirostat_eta=mirostat_eta,
         )
     def _convert_text_completion_to_chat(
@@ -932,6 +1022,10 @@ class Llama:
         presence_penalty: float = 0.0,
         frequency_penalty: float = 0.0,
         repeat_penalty: float = 1.1,
+        tfs_z: float = 1.0,
+        mirostat_mode: int = 0,
+        mirostat_tau: float = 5.0,
+        mirostat_eta: float = 0.1,
     ) -> Union[ChatCompletion, Iterator[ChatCompletionChunk]]:
         """Generate a chat completion from a list of messages.
@@ -966,6 +1060,10 @@ class Llama:
             repeat_penalty=repeat_penalty,
             presence_penalty=presence_penalty,
             frequency_penalty=frequency_penalty,
+            tfs_z=tfs_z,
+            mirostat_mode=mirostat_mode,
+            mirostat_tau=mirostat_tau,
+            mirostat_eta=mirostat_eta,
         )
         if stream:
             chunks: Iterator[CompletionChunk] = completion_or_chunks  # type: ignore
@@ -985,6 +1083,7 @@ class Llama:
             model_path=self.model_path,
             n_ctx=self.params.n_ctx,
             n_parts=self.params.n_parts,
+            n_gpu_layers=self.params.n_gpu_layers,
             seed=self.params.seed,
             f16_kv=self.params.f16_kv,
             logits_all=self.params.logits_all,
@@ -1004,6 +1103,7 @@ class Llama:
             model_path=state["model_path"],
             n_ctx=state["n_ctx"],
             n_parts=state["n_parts"],
+            n_gpu_layers=state["n_gpu_layers"],
             seed=state["seed"],
             f16_kv=state["f16_kv"],
             logits_all=state["logits_all"],

{llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp/llama_cpp.py RENAMED Viewed

@@ -68,7 +68,7 @@ _lib_base_name = "llama"
 _lib = _load_shared_library(_lib_base_name)
 # C types
-LLAMA_FILE_VERSION = c_int(1)
+LLAMA_FILE_VERSION = c_int(2)
 LLAMA_FILE_MAGIC = b"ggjt"
 LLAMA_FILE_MAGIC_UNVERSIONED = b"ggml"
 LLAMA_SESSION_MAGIC = b"ggsn"
@@ -109,6 +109,7 @@ class llama_context_params(Structure):
     _fields_ = [
         ("n_ctx", c_int),  # text context
         ("n_parts", c_int),  # -1 for default
+        ("n_gpu_layers", c_int),  # number of layers to store in VRAM
         ("seed", c_int),  # RNG seed, 0 for random
         ("f16_kv", c_bool),  # use fp16 for KV cache
         (
@@ -135,7 +136,7 @@ LLAMA_FTYPE_MOSTLY_Q4_1 = c_int(3)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q4_1_SOME_F16 = c_int(
     4
 )  # tok_embeddings.weight and output.weight are F16
-LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5)  # except 1d tensors
+# LLAMA_FTYPE_MOSTLY_Q4_2 = c_int(5)  # except 1d tensors
 # LLAMA_FTYPE_MOSTYL_Q4_3 = c_int(6)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q8_0 = c_int(7)  # except 1d tensors
 LLAMA_FTYPE_MOSTLY_Q5_0 = c_int(8)  # except 1d tensors
@@ -259,9 +260,9 @@ _lib.llama_get_state_size.restype = c_size_t
 # Destination needs to have allocated enough memory.
 # Returns the number of bytes copied
 def llama_copy_state_data(
-    ctx: llama_context_p, dest  # type: Array[c_uint8]
+    ctx: llama_context_p, dst  # type: Array[c_uint8]
 ) -> int:
-    return _lib.llama_copy_state_data(ctx, dest)
+    return _lib.llama_copy_state_data(ctx, dst)
 _lib.llama_copy_state_data.argtypes = [llama_context_p, c_uint8_p]
@@ -350,7 +351,7 @@ def llama_tokenize(
     tokens,  # type: Array[llama_token]
     n_max_tokens: c_int,
     add_bos: c_bool,
-) -> c_int:
+) -> int:
     return _lib.llama_tokenize(ctx, text, tokens, n_max_tokens, add_bos)

{llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp/server/app.py RENAMED Viewed

@@ -17,6 +17,11 @@ class Settings(BaseSettings):
         description="The path to the model to use for generating completions."
     )
     n_ctx: int = Field(default=2048, ge=1, description="The context size.")
+    n_gpu_layers: int = Field(
+        default=0,
+        ge=0,
+        description="The number of layers to put on the GPU. The rest will be on the CPU.",
+    )
     n_batch: int = Field(
         default=512, ge=1, description="The batch size to use per eval."
     )
@@ -80,6 +85,7 @@ def create_app(settings: Optional[Settings] = None):
     global llama
     llama = llama_cpp.Llama(
         model_path=settings.model,
+        n_gpu_layers=settings.n_gpu_layers,
         f16_kv=settings.f16_kv,
         use_mlock=settings.use_mlock,
         use_mmap=settings.use_mmap,
@@ -152,9 +158,23 @@ repeat_penalty_field = Field(
     + "Repeat penalty is a hyperparameter used to penalize the repetition of token sequences during text generation. It helps prevent the model from generating repetitive or monotonous text. A higher value (e.g., 1.5) will penalize repetitions more strongly, while a lower value (e.g., 0.9) will be more lenient.",
 )
+presence_penalty_field = Field(
+    default=0.0,
+    ge=-2.0,
+    le=2.0,
+    description="Positive values penalize new tokens based on whether they appear in the text so far, increasing the model's likelihood to talk about new topics.",
+)
+frequency_penalty_field = Field(
+    default=0.0,
+    ge=-2.0,
+    le=2.0,
+    description="Positive values penalize new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.",
+)
 class CreateCompletionRequest(BaseModel):
-    prompt: Optional[str] = Field(
+    prompt: Union[str, List[str]] = Field(
         default="", description="The prompt to generate completions for."
     )
     suffix: Optional[str] = Field(
@@ -175,13 +195,13 @@ class CreateCompletionRequest(BaseModel):
         ge=0,
         description="The number of logprobs to generate. If None, no logprobs are generated.",
     )
+    presence_penalty: Optional[float] = presence_penalty_field
+    frequency_penalty: Optional[float] = frequency_penalty_field
     # ignored or currently unsupported
     model: Optional[str] = model_field
     n: Optional[int] = 1
     logprobs: Optional[int] = Field(None)
-    presence_penalty: Optional[float] = 0
-    frequency_penalty: Optional[float] = 0
     best_of: Optional[int] = 1
     logit_bias: Optional[Dict[str, float]] = Field(None)
     user: Optional[str] = Field(None)
@@ -209,6 +229,10 @@ CreateCompletionResponse = create_model_from_typeddict(llama_cpp.Completion)
 def create_completion(
     request: CreateCompletionRequest, llama: llama_cpp.Llama = Depends(get_llama)
 ):
+    if isinstance(request.prompt, list):
+        assert len(request.prompt) <= 1
+        request.prompt = request.prompt[0] if len(request.prompt) > 0 else ""
     completion_or_chunks = llama(
         **request.dict(
             exclude={
@@ -269,12 +293,12 @@ class CreateChatCompletionRequest(BaseModel):
     top_p: float = top_p_field
     stop: Optional[List[str]] = stop_field
     stream: bool = stream_field
+    presence_penalty: Optional[float] = presence_penalty_field
+    frequency_penalty: Optional[float] = frequency_penalty_field
     # ignored or currently unsupported
     model: Optional[str] = model_field
     n: Optional[int] = 1
-    presence_penalty: Optional[float] = 0
-    frequency_penalty: Optional[float] = 0
     logit_bias: Optional[Dict[str, float]] = Field(None)
     user: Optional[str] = Field(None)

{llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp_python.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama-cpp-python
-Version: 0.1.48
+Version: 0.1.50
 Summary: A Python wrapper for llama.cpp
 Author: Andrei Betlen
 Author-email: abetlen@gmail.com
@@ -53,19 +53,19 @@ Use the `FORCE_CMAKE=1` environment variable to force the use of `cmake` and ins
 To install with OpenBLAS, set the `LLAMA_OPENBLAS=1` environment variable before installing:
 ```bash
-LLAMA_OPENBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python
+CMAKE_ARGS="-DLLAMA_OPENBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
 ```
 To install with cuBLAS, set the `LLAMA_CUBLAS=1` environment variable before installing:
 ```bash
-LLAMA_CUBLAS=1 FORCE_CMAKE=1 pip install llama-cpp-python
+CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python
 ```
 To install with CLBlast, set the `LLAMA_CLBLAST=1` environment variable before installing:
 ```bash
-LLAMA_CLBLAST=1 FORCE_CMAKE=1 pip install llama-cpp-python
+CMAKE_ARGS="-DLLAMA_CLBLAST=on" FORCE_CMAKE=1 pip install llama-cpp-python
 ```
@@ -120,7 +120,7 @@ Navigate to [http://localhost:8000/docs](http://localhost:8000/docs) to see the
 A Docker image is available on [GHCR](https://ghcr.io/abetlen/llama-cpp-python). To run the server:
 ```bash
-docker run --rm -it -p8000:8000 -v /path/to/models:/models -eMODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
+docker run --rm -it -p 8000:8000 -v /path/to/models:/models -e MODEL=/models/ggml-model-name.bin ghcr.io/abetlen/llama-cpp-python:latest
 ```
 ## Low-level API

{llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/llama_cpp_python.egg-info/SOURCES.txt RENAMED Viewed

@@ -47,6 +47,7 @@ llama_cpp_python.egg-info/dependency_links.txt
 llama_cpp_python.egg-info/requires.txt
 llama_cpp_python.egg-info/top_level.txt
 tests/test_llama.py
+vendor/llama.cpp/.clang-tidy
 vendor/llama.cpp/.dockerignore
 vendor/llama.cpp/.ecrc
 vendor/llama.cpp/.editorconfig
@@ -80,6 +81,8 @@ vendor/llama.cpp/.github/ISSUE_TEMPLATE/custom.md
 vendor/llama.cpp/.github/workflows/build.yml
 vendor/llama.cpp/.github/workflows/docker.yml
 vendor/llama.cpp/.github/workflows/editorconfig.yml
+vendor/llama.cpp/.github/workflows/tidy-post.yml
+vendor/llama.cpp/.github/workflows/tidy-review.yml
 vendor/llama.cpp/examples/CMakeLists.txt
 vendor/llama.cpp/examples/Miku.sh
 vendor/llama.cpp/examples/alpaca.sh
@@ -90,6 +93,8 @@ vendor/llama.cpp/examples/common.cpp
 vendor/llama.cpp/examples/common.h
 vendor/llama.cpp/examples/gpt4all.sh
 vendor/llama.cpp/examples/reason-act.sh
+vendor/llama.cpp/examples/baby-llama/CMakeLists.txt
+vendor/llama.cpp/examples/baby-llama/baby-llama.cpp
 vendor/llama.cpp/examples/benchmark/CMakeLists.txt
 vendor/llama.cpp/examples/benchmark/benchmark-matmult.cpp
 vendor/llama.cpp/examples/embedding/CMakeLists.txt
@@ -128,16 +133,21 @@ vendor/llama.cpp/prompts/chat-with-bob.txt
 vendor/llama.cpp/prompts/chat-with-vicuna-v0.txt
 vendor/llama.cpp/prompts/chat-with-vicuna-v1.txt
 vendor/llama.cpp/prompts/chat.txt
+vendor/llama.cpp/prompts/dan-modified.txt
 vendor/llama.cpp/prompts/dan.txt
 vendor/llama.cpp/prompts/reason-act.txt
 vendor/llama.cpp/scripts/build-info.cmake
 vendor/llama.cpp/scripts/build-info.h.in
 vendor/llama.cpp/scripts/build-info.sh
+vendor/llama.cpp/scripts/perf-run-all.sh
+vendor/llama.cpp/scripts/ppl-run-all.sh
 vendor/llama.cpp/scripts/sync-ggml.sh
 vendor/llama.cpp/scripts/verify-checksum-models.py
 vendor/llama.cpp/spm-headers/llama.h
 vendor/llama.cpp/tests/CMakeLists.txt
 vendor/llama.cpp/tests/test-double-float.c
+vendor/llama.cpp/tests/test-grad0.c
+vendor/llama.cpp/tests/test-opt.c
 vendor/llama.cpp/tests/test-quantize-fns.cpp
 vendor/llama.cpp/tests/test-quantize-perf.cpp
 vendor/llama.cpp/tests/test-sampling.cpp

{llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/poetry.lock RENAMED Viewed

@@ -773,14 +773,14 @@ mkdocs = ">=1.1"
 [[package]]
 name = "mkdocs-material"
-version = "9.1.9"
+version = "9.1.11"
 description = "Documentation that simply works"
 category = "dev"
 optional = false
 python-versions = ">=3.7"
 files = [
-    {file = "mkdocs_material-9.1.9-py3-none-any.whl", hash = "sha256:7db24261cb17400e132c46d17eea712bfe71056d892a9beba32cf68210297141"},
-    {file = "mkdocs_material-9.1.9.tar.gz", hash = "sha256:74d8da1371ab3a326868fe47bae3cbc4aa22e93c048b4ca5117e6817b88bd734"},
+    {file = "mkdocs_material-9.1.11-py3-none-any.whl", hash = "sha256:fbc86d50ec2cf34d40d5c4365780f290ceedde23f1a0704323b34e7f16b0c0dd"},
+    {file = "mkdocs_material-9.1.11.tar.gz", hash = "sha256:f5d473eb79d6640a5e668d4b2ab5b9de5e76ae0a0e2d864112df0cfe9016dc1d"},
 ]
 [package.dependencies]
@@ -1439,4 +1439,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.8.1"
-content-hash = "e87403dcd0a0b8484436b02c392326adfaf22b8d7e182d77e4a155c67a7435bc"
+content-hash = "6bea74d847b958639276d4be527c2b65dafeb0a455b6e3d1f29fee5171ce73b2"

{llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama_cpp_python"
-version = "0.1.48"
+version = "0.1.50"
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen <abetlen@gmail.com>"]
 license = "MIT"
@@ -22,7 +22,7 @@ black = "^23.3.0"
 twine = "^4.0.2"
 mkdocs = "^1.4.3"
 mkdocstrings = {extras = ["python"], version = "^0.21.2"}
-mkdocs-material = "^9.1.9"
+mkdocs-material = "^9.1.11"
 pytest = "^7.3.1"
 httpx = "^0.24.0"

{llama_cpp_python-0.1.48 → llama_cpp_python-0.1.50}/setup.py RENAMED Viewed

@@ -10,7 +10,7 @@ setup(
     description="A Python wrapper for llama.cpp",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="0.1.48",
+    version="0.1.50",
     author="Andrei Betlen",
     author_email="abetlen@gmail.com",
     license="MIT",

llama-cpp-python 0.1.48__tar.gz → 0.1.50__tar.gz

llama-cpp-python 0.1.48__tar.gz → 0.1.50__tar.gz