PyPI - llama-cpp-python - Versions diffs - 0.1.14__tar.gz → 0.1.16__tar.gz - Mend

llama-cpp-python 0.1.14.tar.gz → 0.1.16.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (48)

{llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama_cpp_python
-Version: 0.1.14
+Version: 0.1.16
 Summary: A Python wrapper for llama.cpp
 Author: Andrei Betlen
 Author-email: abetlen@gmail.com

llama_cpp_python-0.1.16/_skbuild/linux-x86_64-3.8/cmake-install/llama_cpp/libllama.so ADDED Viewed

Binary file

{llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/llama_cpp/llama.py RENAMED Viewed

@@ -286,6 +286,7 @@ class Llama:
         # Add blank space to start of prompt to match OG llama tokenizer
         prompt_tokens = self.tokenize(b" " + prompt.encode("utf-8"))
         text = b""
+        returned_characters = 0
         if len(prompt_tokens) + max_tokens > int(llama_cpp.llama_n_ctx(self.ctx)):
             raise ValueError(
@@ -293,9 +294,9 @@ class Llama:
             )
         if stop != []:
-            stop_bytes = [s.encode("utf-8") for s in stop]
+            stop_sequences = [s.encode("utf-8") for s in stop]
         else:
-            stop_bytes = []
+            stop_sequences = []
         finish_reason = None
         for token in self.generate(
@@ -306,28 +307,33 @@ class Llama:
             repeat_penalty=repeat_penalty,
         ):
             if token == llama_cpp.llama_token_eos():
+                text = self.detokenize(completion_tokens)
                 finish_reason = "stop"
                 break
             completion_tokens.append(token)
-            text = self.detokenize(completion_tokens)
-            any_stop = [s for s in stop_bytes if s in text]
+            all_text = self.detokenize(completion_tokens)
+            any_stop = [s for s in stop_sequences if s in all_text]
             if len(any_stop) > 0:
                 first_stop = any_stop[0]
-                text = text[: text.index(first_stop)]
+                text = all_text[: all_text.index(first_stop)]
                 finish_reason = "stop"
                 break
             if stream:
-                start = len(self.detokenize(completion_tokens[:-1]))
+                start = returned_characters
                 longest = 0
-                # TODO: Clean up this mess
-                for s in stop_bytes:
+                # We want to avoid yielding any characters from
+                # the generated text if they are part of a stop
+                # sequence.
+                for s in stop_sequences:
                     for i in range(len(s), 0, -1):
-                        if s[-i:] == text[-i:]:
+                        if all_text.endswith(s[:i]):
                             if i > longest:
                                 longest = i
                             break
+                text = all_text[: len(all_text) - longest]
+                returned_characters += len(text[start:])
                 yield {
                     "id": completion_id,
                     "object": "text_completion",
@@ -335,23 +341,22 @@ class Llama:
                     "model": self.model_path,
                     "choices": [
                         {
-                            "text": text[start : len(text) - longest].decode("utf-8"),
+                            "text": text[start :].decode("utf-8"),
                             "index": 0,
                             "logprobs": None,
                             "finish_reason": None,
                         }
                     ],
                 }
+            if len(completion_tokens) >= max_tokens:
+                text = self.detokenize(completion_tokens)
+                finish_reason = "length"
+                break
         if finish_reason is None:
             finish_reason = "length"
         if stream:
-            if finish_reason == "stop":
-                start = len(self.detokenize(completion_tokens[:-1]))
-                text = text[start:].decode("utf-8")
-            else:
-                text = ""
             yield {
                 "id": completion_id,
                 "object": "text_completion",
@@ -359,7 +364,7 @@ class Llama:
                 "model": self.model_path,
                 "choices": [
                     {
-                        "text": text,
+                        "text": text[returned_characters:].decode("utf-8"),
                         "index": 0,
                         "logprobs": None,
                         "finish_reason": finish_reason,

{llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/llama_cpp/llama_cpp.py RENAMED Viewed

@@ -1,6 +1,6 @@
 import ctypes
-from ctypes import c_int, c_float, c_char_p, c_void_p, c_bool, POINTER, Structure, Array
+from ctypes import c_int, c_float, c_char_p, c_void_p, c_bool, POINTER, Structure, Array, c_uint8, c_size_t
 import pathlib
 from itertools import chain
@@ -101,6 +101,36 @@ def llama_model_quantize(
 _lib.llama_model_quantize.argtypes = [c_char_p, c_char_p, c_int, c_int]
 _lib.llama_model_quantize.restype = c_int
+# Returns the KV cache that will contain the context for the
+# ongoing prediction with the model.
+def llama_get_kv_cache(ctx: llama_context_p):
+    return _lib.llama_get_kv_cache(ctx)
+_lib.llama_get_kv_cache.argtypes = [llama_context_p]
+_lib.llama_get_kv_cache.restype = POINTER(c_uint8)
+# Returns the size of the KV cache
+def llama_get_kv_cache_size(ctx: llama_context_p) -> c_size_t:
+    return _lib.llama_get_kv_cache_size(ctx)
+_lib.llama_get_kv_cache_size.argtypes = [llama_context_p]
+_lib.llama_get_kv_cache_size.restype = c_size_t
+# Returns the number of tokens in the KV cache
+def llama_get_kv_cache_token_count(ctx: llama_context_p) -> c_int:
+    return _lib.llama_get_kv_cache_token_count(ctx)
+_lib.llama_get_kv_cache_token_count.argtypes = [llama_context_p]
+_lib.llama_get_kv_cache_token_count.restype = c_int
+# Sets the KV cache containing the current context for the model
+def llama_set_kv_cache(ctx: llama_context_p, kv_cache, n_size: c_size_t, n_token_count: c_int):
+    return _lib.llama_set_kv_cache(ctx, kv_cache, n_size, n_token_count)
+_lib.llama_set_kv_cache.argtypes = [llama_context_p, POINTER(c_uint8), c_size_t, c_int]
+_lib.llama_set_kv_cache.restype = None
 # Run the llama inference to obtain the logits and probabilities for the next token.
 # tokens + n_tokens is the provided batch of new tokens to process

{llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/llama_cpp_python.egg-info/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama-cpp-python
-Version: 0.1.14
+Version: 0.1.16
 Summary: A Python wrapper for llama.cpp
 Author: Andrei Betlen
 Author-email: abetlen@gmail.com

{llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/pyproject.toml RENAMED Viewed

@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "llama_cpp"
-version = "0.1.14"
+version = "0.1.16"
 description = "Python bindings for the llama.cpp library"
 authors = ["Andrei Betlen <abetlen@gmail.com>"]
 license = "MIT"

{llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/setup.py RENAMED Viewed

@@ -10,7 +10,7 @@ setup(
     description="A Python wrapper for llama.cpp",
     long_description=long_description,
     long_description_content_type="text/markdown",
-    version="0.1.14",
+    version="0.1.16",
     author="Andrei Betlen",
     author_email="abetlen@gmail.com",
     license="MIT",

{llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/Makefile RENAMED Viewed

@@ -70,95 +70,8 @@ endif
 # TODO: probably these flags need to be tweaked on some architectures
 #       feel free to update the Makefile for your architecture and send a pull request or issue
 ifeq ($(UNAME_M),$(filter $(UNAME_M),x86_64 i686))
-	ifeq ($(UNAME_S),Darwin)
-		F16C_M := $(shell sysctl machdep.cpu.features)
-		ifneq (,$(findstring F16C,$(F16C_M)))
-		    CFLAGS += -mf16c
-		endif
-		AVX1_M := $(shell sysctl machdep.cpu.features)
-		ifneq (,$(findstring FMA,$(AVX1_M)))
-			CFLAGS += -mfma
-		endif
-		ifneq (,$(findstring AVX1.0,$(AVX1_M)))
-			CFLAGS += -mavx
-		endif
-		AVX2_M := $(shell sysctl machdep.cpu.leaf7_features)
-		ifneq (,$(findstring AVX2,$(AVX2_M)))
-			CFLAGS += -mavx2
-		endif
-	else ifeq ($(UNAME_S),Linux)
-		AVX1_M := $(shell grep "avx " /proc/cpuinfo)
-		ifneq (,$(findstring avx,$(AVX1_M)))
-			CFLAGS += -mavx
-		endif
-		AVX2_M := $(shell grep "avx2 " /proc/cpuinfo)
-		ifneq (,$(findstring avx2,$(AVX2_M)))
-			CFLAGS += -mavx2
-		endif
-		FMA_M := $(shell grep "fma " /proc/cpuinfo)
-		ifneq (,$(findstring fma,$(FMA_M)))
-			CFLAGS += -mfma
-		endif
-		F16C_M := $(shell grep "f16c " /proc/cpuinfo)
-		ifneq (,$(findstring f16c,$(F16C_M)))
-			CFLAGS += -mf16c
-		endif
-		SSE3_M := $(shell grep "sse3 " /proc/cpuinfo)
-		ifneq (,$(findstring sse3,$(SSE3_M)))
-			CFLAGS += -msse3
-		endif
-		AVX512F_M := $(shell grep "avx512f " /proc/cpuinfo)
-		ifneq (,$(findstring avx512f,$(AVX512F_M)))
-			CFLAGS += -mavx512f
-		endif
-		AVX512BW_M := $(shell grep "avx512bw " /proc/cpuinfo)
-		ifneq (,$(findstring avx512bw,$(AVX512BW_M)))
-			CFLAGS += -mavx512bw
-		endif
-		AVX512DQ_M := $(shell grep "avx512dq " /proc/cpuinfo)
-		ifneq (,$(findstring avx512dq,$(AVX512DQ_M)))
-			CFLAGS += -mavx512dq
-		endif
-		AVX512VL_M := $(shell grep "avx512vl " /proc/cpuinfo)
-		ifneq (,$(findstring avx512vl,$(AVX512VL_M)))
-			CFLAGS += -mavx512vl
-		endif
-		AVX512CD_M := $(shell grep "avx512cd " /proc/cpuinfo)
-		ifneq (,$(findstring avx512cd,$(AVX512CD_M)))
-			CFLAGS += -mavx512cd
-		endif
-		AVX512ER_M := $(shell grep "avx512er " /proc/cpuinfo)
-		ifneq (,$(findstring avx512er,$(AVX512ER_M)))
-			CFLAGS += -mavx512er
-		endif
-		AVX512IFMA_M := $(shell grep "avx512ifma " /proc/cpuinfo)
-		ifneq (,$(findstring avx512ifma,$(AVX512IFMA_M)))
-			CFLAGS += -mavx512ifma
-		endif
-		AVX512PF_M := $(shell grep "avx512pf " /proc/cpuinfo)
-		ifneq (,$(findstring avx512pf,$(AVX512PF_M)))
-			CFLAGS += -mavx512pf
-		endif
-	else ifeq ($(UNAME_S),Haiku)
-		AVX1_M := $(shell sysinfo -cpu | grep -w "AVX")
-		ifneq (,$(findstring AVX,$(AVX1_M)))
-			CFLAGS += -mavx
-		endif
-		AVX2_M := $(shell sysinfo -cpu | grep -w "AVX2")
-		ifneq (,$(findstring AVX2,$(AVX2_M)))
-			CFLAGS += -mavx2
-		endif
-		FMA_M := $(shell sysinfo -cpu | grep -w "FMA")
-		ifneq (,$(findstring FMA,$(FMA_M)))
-			CFLAGS += -mfma
-		endif
-		F16C_M := $(shell sysinfo -cpu | grep -w "F16C")
-		ifneq (,$(findstring F16C,$(F16C_M)))
-			CFLAGS += -mf16c
-		endif
-	else
-		CFLAGS += -mfma -mf16c -mavx -mavx2
-	endif
+	# Use all CPU extensions that are available:
+	CFLAGS += -march=native -mtune=native
 endif
 ifneq ($(filter ppc64%,$(UNAME_M)),)
 	POWER9_M := $(shell grep "POWER9" /proc/cpuinfo)

{llama_cpp_python-0.1.14 → llama_cpp_python-0.1.16}/vendor/llama.cpp/README.md RENAMED Viewed

@@ -232,13 +232,15 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
 - Obtain the `gpt4all-lora-quantized.bin` model
 - It is distributed in the old `ggml` format which is now obsoleted
-- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py):
+- You have to convert it to the new format using [./convert-gpt4all-to-ggml.py](./convert-gpt4all-to-ggml.py). You may also need to
+convert the model from the old format to the new format with [./migrate-ggml-2023-03-30-pr613.py](./migrate-ggml-2023-03-30-pr613.py):
   ```bash
   python3 convert-gpt4all-to-ggml.py models/gpt4all-7B/gpt4all-lora-quantized.bin ./models/tokenizer.model
+  python3 migrate-ggml-2023-03-30-pr613.py models/gpt4all-7B/gpt4all-lora-quantized.bin models/gpt4all-7B/gpt4all-lora-quantized-new.bin
   ```
-- You can now use the newly generated `gpt4all-lora-quantized.bin` model in exactly the same way as all other models
+- You can now use the newly generated `gpt4all-lora-quantized-new.bin` model in exactly the same way as all other models
 - The original model is saved in the same folder with a suffix `.orig`
 ### Obtaining and verifying the Facebook LLaMA original model and Stanford Alpaca model data

llama-cpp-python 0.1.14__tar.gz → 0.1.16__tar.gz

llama-cpp-python 0.1.14.tar.gz → 0.1.16.tar.gz