PyPI - llama-cpp-python - Versions diffs - 0.1.7__tar.gz → 0.1.9__tar.gz - Mend

llama-cpp-python 0.1.7.tar.gz → 0.1.9.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (47) hide show

{llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama_cpp_python
-Version: 0.1.7
+Version: 0.1.9
 Summary: A Python wrapper for llama.cpp
 Author: Andrei Betlen
 Author-email: abetlen@gmail.com

{llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/README.md RENAMED Viewed

@@ -1,5 +1,6 @@
 # 🦙 Python Bindings for `llama.cpp`
+[![Documentation](https://img.shields.io/badge/docs-passing-green.svg)](https://abetlen.github.io/llama-cpp-python)
 [![PyPI](https://img.shields.io/pypi/v/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![PyPI - Python Version](https://img.shields.io/pypi/pyversions/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)
 [![PyPI - License](https://img.shields.io/pypi/l/llama-cpp-python)](https://pypi.org/project/llama-cpp-python/)

llama_cpp_python-0.1.9/llama_cpp/llama.py ADDED Viewed

@@ -0,0 +1,343 @@
+import os
+import uuid
+import time
+import multiprocessing
+from typing import List, Optional
+from collections import deque
+from . import llama_cpp
+class Llama:
+    """High-level Python wrapper for a llama.cpp model."""
+    def __init__(
+        self,
+        model_path: str,
+        # NOTE: The following parameters are likely to change in the future.
+        n_ctx: int = 512,
+        n_parts: int = -1,
+        seed: int = 1337,
+        f16_kv: bool = False,
+        logits_all: bool = False,
+        vocab_only: bool = False,
+        use_mlock: bool = False,
+        embedding: bool = False,
+        n_threads: Optional[int] = None,
+    ) -> "Llama":
+        """Load a llama.cpp model from `model_path`.
+        Args:
+            model_path: Path to the model.
+            n_ctx: Maximum context size.
+            n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined.
+            seed: Random seed. 0 for random.
+            f16_kv: Use half-precision for key/value cache.
+            logits_all: Return logits for all tokens, not just the last token.
+            vocab_only: Only load the vocabulary no weights.
+            use_mlock: Force the system to keep the model in RAM.
+            embedding: Embedding mode only.
+            n_threads: Number of threads to use. If None, the number of threads is automatically determined.
+        Raises:
+            ValueError: If the model path does not exist.
+        Returns:
+            A Llama instance.
+        """
+        self.model_path = model_path
+        self.params = llama_cpp.llama_context_default_params()
+        self.params.n_ctx = n_ctx
+        self.params.n_parts = n_parts
+        self.params.seed = seed
+        self.params.f16_kv = f16_kv
+        self.params.logits_all = logits_all
+        self.params.vocab_only = vocab_only
+        self.params.use_mlock = use_mlock
+        self.params.embedding = embedding
+        self.last_n = 64
+        self.max_chunk_size = n_ctx
+        self.n_threads = n_threads or multiprocessing.cpu_count()
+        if not os.path.exists(model_path):
+            raise ValueError(f"Model path does not exist: {model_path}")
+        self.ctx = llama_cpp.llama_init_from_file(
+            self.model_path.encode("utf-8"), self.params
+        )
+    def tokenize(self, text: bytes) -> List[int]:
+        """Tokenize a string.
+        Args:
+            text: The utf-8 encoded string to tokenize.
+        Returns:
+            A list of tokens.
+        """
+        n_ctx = llama_cpp.llama_n_ctx(self.ctx)
+        tokens = (llama_cpp.llama_token * n_ctx)()
+        n_tokens = llama_cpp.llama_tokenize(
+            self.ctx,
+            text,
+            tokens,
+            n_ctx,
+            True,
+        )
+        if n_tokens < 0:
+            raise RuntimeError(f'Failed to tokenize: text="{text}" n_tokens={n_tokens}')
+        return list(tokens[:n_tokens])
+    def detokenize(self, tokens: List[int]) -> bytes:
+        """Detokenize a list of tokens.
+        Args:
+            tokens: The list of tokens to detokenize.
+        Returns:
+            The detokenized string.
+        """
+        output = b""
+        for token in tokens:
+            output += llama_cpp.llama_token_to_str(self.ctx, token)
+        return output
+    def embed(self, text: str):
+        """Embed a string.
+        Args:
+            text: The utf-8 encoded string to embed.
+        Returns:
+            A list of embeddings.
+        """
+        tokens = self.tokenize(text.encode("utf-8"))
+        self._eval(tokens, 0)
+        embeddings = llama_cpp.llama_get_embeddings(self.ctx)
+        return embeddings[:llama_cpp.llama_n_embd(self.ctx)]
+    def _eval(self, tokens: List[int], n_past):
+        rc = llama_cpp.llama_eval(
+            self.ctx,
+            (llama_cpp.llama_token * len(tokens))(*tokens),
+            len(tokens),
+            n_past,
+            self.n_threads,
+        )
+        if rc != 0:
+            raise RuntimeError(f"Failed to evaluate: {rc}")
+    def _sample(self, last_n_tokens, top_p, top_k, temp, repeat_penalty):
+        return llama_cpp.llama_sample_top_p_top_k(
+            self.ctx,
+            (llama_cpp.llama_token * len(last_n_tokens))(*last_n_tokens),
+            len(last_n_tokens),
+            top_k=top_k,
+            top_p=top_p,
+            temp=temp,
+            repeat_penalty=repeat_penalty,
+        )
+    def _generate(self, past_tokens, max_tokens, top_p, top_k, temp, repeat_penalty):
+        last_n_tokens = deque([0] * self.last_n, maxlen=self.last_n)
+        last_n_tokens.extend(past_tokens)
+        for i in range(max_tokens):
+            token = self._sample(
+                last_n_tokens,
+                top_p=top_p,
+                top_k=top_k,
+                temp=temp,
+                repeat_penalty=repeat_penalty,
+            )
+            yield token
+            self._eval([token], len(past_tokens) + i)
+    def _call(
+        self,
+        prompt: str,
+        suffix: Optional[str] = None,
+        max_tokens: int = 16,
+        temperature: float = 0.8,
+        top_p: float = 0.95,
+        logprobs: Optional[int] = None,
+        echo: bool = False,
+        stop: List[str] = [],
+        repeat_penalty: float = 1.1,
+        top_k: int = 40,
+        stream: bool = False,
+    ):
+        completion_id = f"cmpl-{str(uuid.uuid4())}"
+        created = int(time.time())
+        completion_tokens = []
+        prompt_tokens = self.tokenize(prompt.encode("utf-8"))
+        if len(prompt_tokens) + max_tokens > llama_cpp.llama_n_ctx(self.ctx):
+            raise ValueError(
+                f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
+            )
+        # Process prompt in chunks to avoid running out of memory
+        for i in range(0, len(prompt_tokens), self.max_chunk_size):
+            chunk = prompt_tokens[i : min(len(prompt_tokens), i + self.max_chunk_size)]
+            self._eval(chunk, n_past=i)
+        if stop is not None:
+            stop = [s.encode("utf-8") for s in stop]
+        finish_reason = None
+        for token in self._generate(
+            prompt_tokens, max_tokens, top_p, top_k, temperature, repeat_penalty
+        ):
+            if token == llama_cpp.llama_token_eos():
+                finish_reason = "stop"
+                break
+            completion_tokens.append(token)
+            text = self.detokenize(completion_tokens)
+            any_stop = [s for s in stop if s in text]
+            if len(any_stop) > 0:
+                first_stop = any_stop[0]
+                text = text[: text.index(first_stop)]
+                finish_reason = "stop"
+                break
+            if stream:
+                start = len(self.detokenize(completion_tokens[:-1]))
+                longest = 0
+                for s in stop:
+                    for i in range(len(s), 0, -1):
+                        if s[-i:] == text[-i:]:
+                            if i > longest:
+                                longest = i
+                            break
+                yield {
+                    "id": completion_id,
+                    "object": "text_completion",
+                    "created": created,
+                    "model": self.model_path,
+                    "choices": [
+                        {
+                            "text": text[start : len(text) - longest].decode("utf-8"),
+                            "index": 0,
+                            "logprobs": None,
+                            "finish_reason": None,
+                        }
+                    ],
+                }
+        if finish_reason is None:
+            finish_reason = "length"
+        if stream:
+            if finish_reason == "stop":
+                start = len(self.detokenize(completion_tokens[:-1]))
+                text = text[start:].decode("utf-8")
+            else:
+                text = ""
+            yield {
+                "id": completion_id,
+                "object": "text_completion",
+                "created": created,
+                "model": self.model_path,
+                "choices": [
+                    {
+                        "text": text,
+                        "index": 0,
+                        "logprobs": None,
+                        "finish_reason": finish_reason,
+                    }
+                ],
+            }
+            return
+        text = text.decode("utf-8")
+        if echo:
+            text = prompt + text
+        if suffix is not None:
+            text = text + suffix
+        if logprobs is not None:
+            logprobs = llama_cpp.llama_get_logits(
+                self.ctx,
+            )[:logprobs]
+        yield {
+            "id": completion_id,
+            "object": "text_completion",
+            "created": created,
+            "model": self.model_path,
+            "choices": [
+                {
+                    "text": text,
+                    "index": 0,
+                    "logprobs": logprobs,
+                    "finish_reason": finish_reason,
+                }
+            ],
+            "usage": {
+                "prompt_tokens": len(prompt_tokens),
+                "completion_tokens": len(completion_tokens),
+                "total_tokens": len(prompt_tokens) + len(completion_tokens),
+            },
+        }
+    def __call__(
+        self,
+        prompt: str,
+        suffix: Optional[str] = None,
+        max_tokens: int = 16,
+        temperature: float = 0.8,
+        top_p: float = 0.95,
+        logprobs: Optional[int] = None,
+        echo: bool = False,
+        stop: List[str] = [],
+        repeat_penalty: float = 1.1,
+        top_k: int = 40,
+        stream: bool = False,
+    ):
+        """Generate text from a prompt.
+        Args:
+            prompt: The prompt to generate text from.
+            suffix: A suffix to append to the generated text. If None, no suffix is appended.
+            max_tokens: The maximum number of tokens to generate.
+            temperature: The temperature to use for sampling.
+            top_p: The top-p value to use for sampling.
+            logprobs: The number of logprobs to return. If None, no logprobs are returned.
+            echo: Whether to echo the prompt.
+            stop: A list of strings to stop generation when encountered.
+            repeat_penalty: The penalty to apply to repeated tokens.
+            top_k: The top-k value to use for sampling.
+            stream: Whether to stream the results.
+        Raises:
+            ValueError: If the requested tokens exceed the context window.
+            RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
+        Returns:
+            Response object containing the generated text.
+        """
+        call = self._call(
+            prompt=prompt,
+            suffix=suffix,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            logprobs=logprobs,
+            echo=echo,
+            stop=stop,
+            repeat_penalty=repeat_penalty,
+            top_k=top_k,
+            stream=stream,
+        )
+        if stream:
+            return call
+        return next(call)
+    def __del__(self):
+        llama_cpp.llama_free(self.ctx)

{llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/llama_cpp_python.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama-cpp-python
-Version: 0.1.7
+Version: 0.1.9
 Summary: A Python wrapper for llama.cpp
 Author: Andrei Betlen
 Author-email: abetlen@gmail.com

{llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama_cpp"
-version = "0.1.7"
+version = "0.1.9"
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen <abetlen@gmail.com>"]
 license = "MIT"

{llama_cpp_python-0.1.7 → llama_cpp_python-0.1.9}/setup.py RENAMED Viewed

@@ -3,7 +3,7 @@ from skbuild import setup
 setup(
     name="llama_cpp_python",
     description="A Python wrapper for llama.cpp",
-    version="0.1.7",
+    version="0.1.9",
     author="Andrei Betlen",
     author_email="abetlen@gmail.com",
     license="MIT",

llama_cpp_python-0.1.7/llama_cpp/llama.py DELETED Viewed

@@ -1,216 +0,0 @@
-import os
-import uuid
-import time
-import multiprocessing
-from typing import List, Optional
-from . import llama_cpp
-class Llama:
-    """High-level Python wrapper for a llama.cpp model."""
-    def __init__(
-        self,
-        model_path: str,
-        # NOTE: The following parameters are likely to change in the future.
-        n_ctx: int = 512,
-        n_parts: int = -1,
-        seed: int = 1337,
-        f16_kv: bool = False,
-        logits_all: bool = False,
-        vocab_only: bool = False,
-        use_mlock: bool = False,
-        embedding: bool = False,
-        n_threads: Optional[int] = None,
-    ) -> "Llama":
-        """Load a llama.cpp model from `model_path`.
-        Args:
-            model_path: Path to the model.
-            n_ctx: Maximum context size.
-            n_parts: Number of parts to split the model into. If -1, the number of parts is automatically determined.
-            seed: Random seed. 0 for random.
-            f16_kv: Use half-precision for key/value cache.
-            logits_all: Return logits for all tokens, not just the last token.
-            vocab_only: Only load the vocabulary no weights.
-            use_mlock: Force the system to keep the model in RAM.
-            embedding: Embedding mode only.
-            n_threads: Number of threads to use. If None, the number of threads is automatically determined.
-        Raises:
-            ValueError: If the model path does not exist.
-        Returns:
-            A Llama instance.
-        """
-        self.model_path = model_path
-        # Size of the recent-token window passed to the sampler in __call__.
-        self.last_n = 64
-        # Number of prompt tokens evaluated per llama_eval call in __call__.
-        self.max_chunk_size = 32
-        self.params = llama_cpp.llama_context_default_params()
-        self.params.n_ctx = n_ctx
-        self.params.n_parts = n_parts
-        self.params.seed = seed
-        self.params.f16_kv = f16_kv
-        self.params.logits_all = logits_all
-        self.params.vocab_only = vocab_only
-        self.params.use_mlock = use_mlock
-        self.params.embedding = embedding
-        # Falls back to the CPU count when n_threads is None (or 0).
-        self.n_threads = n_threads or multiprocessing.cpu_count()
-        # One token buffer of n_ctx entries, reused by every __call__ for
-        # both the prompt tokens and the tokens generated after them.
-        self.tokens = (llama_cpp.llama_token * self.params.n_ctx)()
-        # NOTE(review): if this raises, self.ctx is never assigned, yet
-        # __del__ reads self.ctx unconditionally.
-        if not os.path.exists(model_path):
-            raise ValueError(f"Model path does not exist: {model_path}")
-        self.ctx = llama_cpp.llama_init_from_file(
-            self.model_path.encode("utf-8"), self.params
-        )
-    def __call__(
-        self,
-        prompt: str,
-        suffix: Optional[str] = None,
-        max_tokens: int = 16,
-        temperature: float = 0.8,
-        top_p: float = 0.95,
-        logprobs: Optional[int] = None,
-        echo: bool = False,
-        stop: List[str] = [],
-        repeat_penalty: float = 1.1,
-        top_k: int = 40,
-    ):
-        """Generate text from a prompt.
-        Args:
-            prompt: The prompt to generate text from.
-            suffix: A suffix to append to the generated text. If None, no suffix is appended.
-            max_tokens: The maximum number of tokens to generate.
-            temperature: The temperature to use for sampling.
-            top_p: The top-p value to use for sampling.
-            logprobs: The number of logprobs to return. If None, no logprobs are returned.
-            echo: Whether to echo the prompt.
-            stop: A list of strings to stop generation when encountered.
-            repeat_penalty: The penalty to apply to repeated tokens.
-            top_k: The top-k value to use for sampling.
-        Raises:
-            ValueError: If the requested tokens exceed the context window.
-            RuntimeError: If the prompt fails to tokenize or the model fails to evaluate the prompt.
-        Returns:
-            Response object containing the generated text.
-        """
-        # Generated text is accumulated as bytes and decoded once at the end.
-        text = b""
-        finish_reason = "length"
-        completion_tokens = 0
-        # NOTE(review): stop=None skips this encoding step, but the
-        # `for s in stop` comprehension below still iterates it.
-        if stop is not None:
-            stop = [s.encode("utf-8") for s in stop]
-        # Here prompt_tokens is a token COUNT; the tokens themselves are
-        # written into self.tokens.
-        prompt_tokens = llama_cpp.llama_tokenize(
-            self.ctx,
-            prompt.encode("utf-8"),
-            self.tokens,
-            llama_cpp.llama_n_ctx(self.ctx),
-            True,
-        )
-        if prompt_tokens < 0:
-            raise RuntimeError(f"Failed to tokenize prompt: {prompt_tokens}")
-        if prompt_tokens + max_tokens > self.params.n_ctx:
-            raise ValueError(
-                f"Requested tokens exceed context window of {llama_cpp.llama_n_ctx(self.ctx)}"
-            )
-        # Process prompt in chunks to avoid running out of memory
-        for i in range(0, prompt_tokens, self.max_chunk_size):
-            chunk = self.tokens[i : min(prompt_tokens, i + self.max_chunk_size)]
-            # NOTE(review): the fourth argument is max(0, i - 1), while i
-            # tokens precede this chunk; for i > 0 that looks one short
-            # if the argument is the past-token count. Confirm against llama_eval.
-            rc = llama_cpp.llama_eval(
-                self.ctx,
-                (llama_cpp.llama_token * len(chunk))(*chunk),
-                len(chunk),
-                max(0, i - 1),
-                self.n_threads,
-            )
-            if rc != 0:
-                raise RuntimeError(f"Failed to evaluate prompt: {rc}")
-        for i in range(max_tokens):
-            tokens_seen = prompt_tokens + completion_tokens
-            # Window of the last `self.last_n` tokens seen so far,
-            # left-padded with 0 when fewer have been seen.
-            last_n_tokens = [0] * max(0, self.last_n - tokens_seen) + [
-                self.tokens[j]
-                for j in range(max(tokens_seen - self.last_n, 0), tokens_seen)
-            ]
-            token = llama_cpp.llama_sample_top_p_top_k(
-                self.ctx,
-                (llama_cpp.llama_token * len(last_n_tokens))(*last_n_tokens),
-                len(last_n_tokens),
-                top_k=top_k,
-                top_p=top_p,
-                temp=temperature,
-                repeat_penalty=repeat_penalty,
-            )
-            if token == llama_cpp.llama_token_eos():
-                finish_reason = "stop"
-                break
-            text += llama_cpp.llama_token_to_str(self.ctx, token)
-            # Store the generated token right after the prompt tokens.
-            self.tokens[prompt_tokens + i] = token
-            completion_tokens += 1
-            any_stop = [s for s in stop if s in text]
-            if len(any_stop) > 0:
-                # Cut the text at the first matching entry in list order
-                # (not necessarily the earliest match in the text).
-                first_stop = any_stop[0]
-                text = text[: text.index(first_stop)]
-                finish_reason = "stop"
-                break
-            # NOTE(review): completion_tokens was already incremented, so
-            # the fourth argument equals prompt_tokens + i + 1 while the
-            # token sits at index prompt_tokens + i. Confirm intended offset.
-            rc = llama_cpp.llama_eval(
-                self.ctx,
-                (llama_cpp.llama_token * 1)(self.tokens[prompt_tokens + i]),
-                1,
-                prompt_tokens + completion_tokens,
-                self.n_threads,
-            )
-            if rc != 0:
-                raise RuntimeError(f"Failed to evaluate next token: {rc}")
-        text = text.decode("utf-8")
-        if echo:
-            text = prompt + text
-        if suffix is not None:
-            text = text + suffix
-        if logprobs is not None:
-            # Takes the first `logprobs` values returned by llama_get_logits.
-            logprobs = llama_cpp.llama_get_logits(
-                self.ctx,
-            )[:logprobs]
-        return {
-            "id": f"cmpl-{str(uuid.uuid4())}",  # Likely to change
-            "object": "text_completion",
-            "created": int(time.time()),
-            "model": self.model_path,
-            "choices": [
-                {
-                    "text": text,
-                    "index": 0,
-                    "logprobs": logprobs,
-                    "finish_reason": finish_reason,
-                }
-            ],
-            "usage": {
-                "prompt_tokens": prompt_tokens,
-                "completion_tokens": completion_tokens,
-                "total_tokens": prompt_tokens + completion_tokens,
-            },
-        }
-    def __del__(self):
-        # Frees the llama.cpp context created in __init__.
-        llama_cpp.llama_free(self.ctx)