PyPI - llama-cpp-python - Versions diffs - 0.2.67__tar.gz → 0.2.68__tar.gz - Mend

llama-cpp-python 0.2.67__tar.gz → 0.2.68__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1081) hide show

llama_cpp_python-0.2.68/.git/FETCH_HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ b14dd98922c7f18468ae202eadbaf58fe17f5320 'b14dd98922c7f18468ae202eadbaf58fe17f5320' of https://github.com/abetlen/llama-cpp-python

llama_cpp_python-0.2.68/.git/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ b14dd98922c7f18468ae202eadbaf58fe17f5320

{llama_cpp_python-0.2.67 → llama_cpp_python-0.2.68}/.git/config RENAMED Viewed

@@ -9,7 +9,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX1Z3cnF2cFBtMHRHQmp6b0tWNHNLd1ZOYXVFdDYyYzF3amxueQ==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX0JDY0xlczRzUW4zRThuS0x6TEpwOFJRcjhPNDlpdTNwNFdKUg==
 [submodule "vendor/llama.cpp"]
 	active = true
 	url = https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.68/.git/index ADDED Viewed

Binary file

llama_cpp_python-0.2.68/.git/logs/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 b14dd98922c7f18468ae202eadbaf58fe17f5320 runner <runner@fv-az847-33.(none)> 1714484689 +0000 checkout: moving from master to refs/tags/v0.2.68

llama_cpp_python-0.2.68/.git/modules/vendor/llama.cpp/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 77e15bec6217a39be59b9cc83d6b9afb6b0d8167

{llama_cpp_python-0.2.67 → llama_cpp_python-0.2.68}/.git/modules/vendor/llama.cpp/config RENAMED Viewed

@@ -16,7 +16,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX1Z3cnF2cFBtMHRHQmp6b0tWNHNLd1ZOYXVFdDYyYzF3amxueQ==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX0JDY0xlczRzUW4zRThuS0x6TEpwOFJRcjhPNDlpdTNwNFdKUg==
 [url "https://github.com/"]
 	insteadOf = git@github.com:
 	insteadOf = org-6826477@github.com:

llama_cpp_python-0.2.68/.git/modules/vendor/llama.cpp/index ADDED Viewed

Binary file

llama_cpp_python-0.2.68/.git/modules/vendor/llama.cpp/logs/HEAD ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ 0000000000000000000000000000000000000000 77e15bec6217a39be59b9cc83d6b9afb6b0d8167 runner <runner@fv-az847-33.(none)> 1714484690 +0000 clone: from https://github.com/ggerganov/llama.cpp.git
2	+ 77e15bec6217a39be59b9cc83d6b9afb6b0d8167 77e15bec6217a39be59b9cc83d6b9afb6b0d8167 runner <runner@fv-az847-33.(none)> 1714484691 +0000 checkout: moving from master to 77e15bec6217a39be59b9cc83d6b9afb6b0d8167

llama_cpp_python-0.2.68/.git/modules/vendor/llama.cpp/logs/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 77e15bec6217a39be59b9cc83d6b9afb6b0d8167 runner <runner@fv-az847-33.(none)> 1714484690 +0000 clone: from https://github.com/ggerganov/llama.cpp.git

llama_cpp_python-0.2.68/.git/modules/vendor/llama.cpp/logs/refs/remotes/origin/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 77e15bec6217a39be59b9cc83d6b9afb6b0d8167 runner <runner@fv-az847-33.(none)> 1714484690 +0000 clone: from https://github.com/ggerganov/llama.cpp.git

{llama_cpp_python-0.2.67 → llama_cpp_python-0.2.68}/.git/modules/vendor/llama.cpp/modules/kompute/config RENAMED Viewed

@@ -13,7 +13,7 @@
 [gc]
 	auto = 0
 [http "https://github.com/"]
-	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX1Z3cnF2cFBtMHRHQmp6b0tWNHNLd1ZOYXVFdDYyYzF3amxueQ==
+	extraheader = AUTHORIZATION: basic eC1hY2Nlc3MtdG9rZW46Z2hzX0JDY0xlczRzUW4zRThuS0x6TEpwOFJRcjhPNDlpdTNwNFdKUg==
 [url "https://github.com/"]
 	insteadOf = git@github.com:
 	insteadOf = org-6826477@github.com:

llama_cpp_python-0.2.68/.git/modules/vendor/llama.cpp/modules/kompute/index ADDED Viewed

Binary file

llama_cpp_python-0.2.68/.git/modules/vendor/llama.cpp/modules/kompute/logs/HEAD ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ 0000000000000000000000000000000000000000 d1e3b0953cf66acc94b2e29693e221427b2c1f3f runner <runner@fv-az847-33.(none)> 1714484691 +0000 clone: from https://github.com/nomic-ai/kompute.git
2	+ d1e3b0953cf66acc94b2e29693e221427b2c1f3f 4565194ed7c32d1d2efa32ceab4d3c6cae006306 runner <runner@fv-az847-33.(none)> 1714484692 +0000 checkout: moving from master to 4565194ed7c32d1d2efa32ceab4d3c6cae006306

llama_cpp_python-0.2.68/.git/modules/vendor/llama.cpp/modules/kompute/logs/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 d1e3b0953cf66acc94b2e29693e221427b2c1f3f runner <runner@fv-az847-33.(none)> 1714484691 +0000 clone: from https://github.com/nomic-ai/kompute.git

llama_cpp_python-0.2.68/.git/modules/vendor/llama.cpp/modules/kompute/logs/refs/remotes/origin/HEAD ADDED Viewed

	@@ -0,0 +1 @@
1	+ 0000000000000000000000000000000000000000 d1e3b0953cf66acc94b2e29693e221427b2c1f3f runner <runner@fv-az847-33.(none)> 1714484691 +0000 clone: from https://github.com/nomic-ai/kompute.git

llama_cpp_python-0.2.67/.git/modules/vendor/llama.cpp/modules/kompute/objects/pack/pack-dfe06cade21d4a3c314f514ca2e7bec04aebe5ea.idx → llama_cpp_python-0.2.68/.git/modules/vendor/llama.cpp/modules/kompute/objects/pack/pack-aea54470ccfced130dc113c076f9a5f9e05cddbf.idx RENAMED Viewed

Binary file

llama_cpp_python-0.2.67/.git/modules/vendor/llama.cpp/modules/kompute/objects/pack/pack-dfe06cade21d4a3c314f514ca2e7bec04aebe5ea.pack → llama_cpp_python-0.2.68/.git/modules/vendor/llama.cpp/modules/kompute/objects/pack/pack-aea54470ccfced130dc113c076f9a5f9e05cddbf.pack RENAMED Viewed

Binary file

llama_cpp_python-0.2.67/.git/modules/vendor/llama.cpp/modules/kompute/objects/pack/pack-dfe06cade21d4a3c314f514ca2e7bec04aebe5ea.rev → llama_cpp_python-0.2.68/.git/modules/vendor/llama.cpp/modules/kompute/objects/pack/pack-aea54470ccfced130dc113c076f9a5f9e05cddbf.rev RENAMED Viewed

Binary file

llama_cpp_python-0.2.68/.git/modules/vendor/llama.cpp/objects/pack/pack-80678416707e3403714c6fedf67fc0629e198f4c.idx ADDED Viewed

Binary file

llama_cpp_python-0.2.67/.git/modules/vendor/llama.cpp/objects/pack/pack-b2ae82d9afbb0ac8787798ddae3e8dbf0bc0b0bf.pack → llama_cpp_python-0.2.68/.git/modules/vendor/llama.cpp/objects/pack/pack-80678416707e3403714c6fedf67fc0629e198f4c.pack RENAMED Viewed

Binary file

llama_cpp_python-0.2.68/.git/modules/vendor/llama.cpp/objects/pack/pack-80678416707e3403714c6fedf67fc0629e198f4c.rev ADDED Viewed

Binary file

llama_cpp_python-0.2.68/.git/modules/vendor/llama.cpp/packed-refs ADDED Viewed

	@@ -0,0 +1,2 @@
1	+ # pack-refs with: peeled fully-peeled sorted
2	+ 77e15bec6217a39be59b9cc83d6b9afb6b0d8167 refs/remotes/origin/master

llama_cpp_python-0.2.68/.git/modules/vendor/llama.cpp/refs/heads/master ADDED Viewed

	@@ -0,0 +1 @@
1	+ 77e15bec6217a39be59b9cc83d6b9afb6b0d8167

llama_cpp_python-0.2.68/.git/modules/vendor/llama.cpp/shallow ADDED Viewed

	@@ -0,0 +1 @@
1	+ 77e15bec6217a39be59b9cc83d6b9afb6b0d8167

llama_cpp_python-0.2.68/.git/objects/pack/pack-d80e9c2842087fe2b118d96efa116f60e3086b09.idx ADDED Viewed

Binary file

llama_cpp_python-0.2.67/.git/objects/pack/pack-4d004f221f5b5a3d64e532ac150c9f2f741e1616.pack → llama_cpp_python-0.2.68/.git/objects/pack/pack-d80e9c2842087fe2b118d96efa116f60e3086b09.pack RENAMED Viewed

Binary file

llama_cpp_python-0.2.68/.git/objects/pack/pack-d80e9c2842087fe2b118d96efa116f60e3086b09.rev ADDED Viewed

Binary file

llama_cpp_python-0.2.68/.git/refs/tags/v0.2.68 ADDED Viewed

	@@ -0,0 +1 @@
1	+ b14dd98922c7f18468ae202eadbaf58fe17f5320

llama_cpp_python-0.2.68/.git/shallow ADDED Viewed

	@@ -0,0 +1 @@
1	+ b14dd98922c7f18468ae202eadbaf58fe17f5320

{llama_cpp_python-0.2.67 → llama_cpp_python-0.2.68}/.github/workflows/build-and-release.yaml RENAMED Viewed

@@ -39,7 +39,7 @@ jobs:
       - uses: actions/upload-artifact@v4
         with:
-          name: wheels
+          name: wheels-${{ matrix.os }}
           path: ./wheelhouse/*.whl
   build_wheels_arm64:
@@ -100,14 +100,9 @@ jobs:
     runs-on: ubuntu-latest
     steps:
-      - name: Merge Artifacts
-        uses: actions/upload-artifact/merge@v4
+      - uses: actions/download-artifact@v4
         with:
-          name: release
-      - uses: actions/download-artifact@v3
-        with:
-          name: release
+          merge-multiple: true
           path: dist
       - uses: softprops/action-gh-release@v2

{llama_cpp_python-0.2.67 → llama_cpp_python-0.2.68}/CHANGELOG.md RENAMED Viewed

@@ -7,6 +7,12 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 ## [Unreleased]
+## [0.2.68]
+- feat: Update llama.cpp to ggerganov/llama.cpp@77e15bec6217a39be59b9cc83d6b9afb6b0d8167
+- feat: Add option to enable flash_attn to Llama params and ModelSettings by @abetlen in 22d77eefd2edaf0148f53374d0cac74d0e25d06e
+- fix(ci): Fix build-and-release.yaml by @Smartappli in #1413
 ## [0.2.67]
 - fix: Ensure image renders before text in chat formats regardless of message content order by @abetlen in 3489ef09d3775f4a87fb7114f619e8ba9cb6b656

{llama_cpp_python-0.2.67 → llama_cpp_python-0.2.68}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.1
 Name: llama_cpp_python
-Version: 0.2.67
+Version: 0.2.68
 Summary: Python bindings for the llama.cpp library
 Author-Email: Andrei Betlen <abetlen@gmail.com>
 License: MIT
@@ -165,7 +165,7 @@ CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python
 It is also possible to install a pre-built wheel with CUDA support. As long as your system meets some requirements:
-- CUDA Version is 12.1, 12.2 or 12.3
+- CUDA Version is 12.1, 12.2, 12.3, or 12.4
 - Python Version is 3.10, 3.11 or 3.12
 ```bash
@@ -177,6 +177,7 @@ Where `<cuda-version>` is one of the following:
 - `cu121`: CUDA 12.1
 - `cu122`: CUDA 12.2
 - `cu123`: CUDA 12.3
+- `cu124`: CUDA 12.4
 For example, to install the CUDA 12.1 wheel:

{llama_cpp_python-0.2.67 → llama_cpp_python-0.2.68}/README.md RENAMED Viewed

@@ -121,7 +121,7 @@ CMAKE_ARGS="-DLLAMA_CUDA=on" pip install llama-cpp-python
 It is also possible to install a pre-built wheel with CUDA support. As long as your system meets some requirements:
-- CUDA Version is 12.1, 12.2 or 12.3
+- CUDA Version is 12.1, 12.2, 12.3, or 12.4
 - Python Version is 3.10, 3.11 or 3.12
 ```bash
@@ -133,6 +133,7 @@ Where `<cuda-version>` is one of the following:
 - `cu121`: CUDA 12.1
 - `cu122`: CUDA 12.2
 - `cu123`: CUDA 12.3
+- `cu124`: CUDA 12.4
 For example, to install the CUDA 12.1 wheel:

{llama_cpp_python-0.2.67 → llama_cpp_python-0.2.68}/llama_cpp/__init__.py RENAMED Viewed

@@ -1,4 +1,4 @@
 from .llama_cpp import *
 from .llama import *
-__version__ = "0.2.67"
+__version__ = "0.2.68"

{llama_cpp_python-0.2.67 → llama_cpp_python-0.2.68}/llama_cpp/llama.py RENAMED Viewed

@@ -92,6 +92,7 @@ class Llama:
         logits_all: bool = False,
         embedding: bool = False,
         offload_kqv: bool = True,
+        flash_attn: bool = False,
         # Sampling Params
         last_n_tokens_size: int = 64,
         # LoRA Params
@@ -168,6 +169,7 @@ class Llama:
             logits_all: Return logits for all tokens, not just the last token. Must be True for completion to return logprobs.
             embedding: Embedding mode only.
             offload_kqv: Offload K, Q, V to GPU.
+            flash_attn: Use flash attention.
             last_n_tokens_size: Maximum number of tokens to keep in the last_n_tokens deque.
             lora_base: Optional path to base model, useful if using a quantized base model and you want to apply LoRA to an f16 model.
             lora_path: Path to a LoRA file to apply to the model.
@@ -310,6 +312,7 @@ class Llama:
         )  # Must be set to True for speculative decoding
         self.context_params.embeddings = embedding # TODO: Rename to embeddings
         self.context_params.offload_kqv = offload_kqv
+        self.context_params.flash_attn = flash_attn
         #  KV cache quantization
         if type_k is not None:
             self.context_params.type_k = type_k
@@ -1774,6 +1777,7 @@ class Llama:
             logits_all=self.context_params.logits_all,
             embedding=self.context_params.embeddings,
             offload_kqv=self.context_params.offload_kqv,
+            flash_attn=self.context_params.flash_attn,
             # Sampling Params
             last_n_tokens_size=self.last_n_tokens_size,
             # LoRA Params

{llama_cpp_python-0.2.67 → llama_cpp_python-0.2.68}/llama_cpp/llama_cpp.py RENAMED Viewed

@@ -242,8 +242,8 @@ LLAMA_FILE_MAGIC_GGSQ = 0x67677371
 # define LLAMA_SESSION_MAGIC   LLAMA_FILE_MAGIC_GGSN
 LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN
-# define LLAMA_SESSION_VERSION 5
-LLAMA_SESSION_VERSION = 5
+# define LLAMA_SESSION_VERSION 6
+LLAMA_SESSION_VERSION = 6
 # define LLAMA_STATE_SEQ_MAGIC   LLAMA_FILE_MAGIC_GGSQ
 LLAMA_STATE_SEQ_MAGIC = LLAMA_FILE_MAGIC_GGSQ
@@ -730,6 +730,7 @@ class llama_model_params(ctypes.Structure):
 #     bool logits_all;  // the llama_decode() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
 #     bool embeddings;  // if true, extract embeddings (together with logits)
 #     bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU
+#     bool flash_attn;  // whether to use flash attention
 #     // Abort callback
@@ -766,6 +767,7 @@ class llama_context_params(ctypes.Structure):
         logits_all (bool): the llama_eval() call computes all logits, not just the last one (DEPRECATED - set llama_batch.logits instead)
         embeddings (bool): if true, extract embeddings (together with logits)
         offload_kqv (bool): whether to offload the KQV ops (including the KV cache) to GPU
+        flash_attn (bool): whether to use flash attention
         abort_callback (ggml_abort_callback): abort callback if it returns true, execution of llama_decode() will be aborted
         abort_callback_data (ctypes.ctypes.c_void_p): data for abort_callback
     """
@@ -795,6 +797,7 @@ class llama_context_params(ctypes.Structure):
         logits_all: bool
         embeddings: bool
         offload_kqv: bool
+        flash_attn: bool
         abort_callback: Callable[[ctypes.c_void_p], bool]
         abort_callback_data: ctypes.c_void_p
@@ -823,6 +826,7 @@ class llama_context_params(ctypes.Structure):
         ("logits_all", ctypes.c_bool),
         ("embeddings", ctypes.c_bool),
         ("offload_kqv", ctypes.c_bool),
+        ("flash_attn", ctypes.c_bool),
         ("abort_callback", ggml_abort_callback),
         ("abort_callback_data", ctypes.c_void_p),
     ]
@@ -1615,7 +1619,7 @@ def llama_get_kv_cache_used_cells(ctx: llama_context_p, /) -> int:
     ...
-# // Clear the KV cache
+# // Clear the KV cache - both cell info is erased and KV data is zeroed
 # LLAMA_API void llama_kv_cache_clear(
 #         struct llama_context * ctx);
 @ctypes_function("llama_kv_cache_clear", [llama_context_p_ctypes], None)

{llama_cpp_python-0.2.67 → llama_cpp_python-0.2.68}/llama_cpp/server/settings.py RENAMED Viewed

@@ -96,6 +96,9 @@ class ModelSettings(BaseSettings):
     offload_kqv: bool = Field(
         default=True, description="Whether to offload kqv to the GPU."
     )
+    flash_attn: bool = Field(
+        default=False, description="Whether to use flash attention."
+    )
     # Sampling Params
     last_n_tokens_size: int = Field(
         default=64,

{llama_cpp_python-0.2.67 → llama_cpp_python-0.2.68}/vendor/llama.cpp/ci/run.sh RENAMED Viewed

@@ -336,7 +336,8 @@ function gg_run_open_llama_3b_v2 {
     (time ./bin/imatrix --model ${model_f16} -f ${wiki_test_60} -c 128 -b 128 --chunks 1 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
-    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state     --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -fa --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
     function check_ppl {
         qnt="$1"
@@ -517,7 +518,10 @@ function gg_run_open_llama_7b_v2 {
     (time ./bin/imatrix --model ${model_f16} -f ${wiki_test} -t 1 -ngl 999 -c 2048 -b 512 --chunks 4 ) 2>&1 | tee -a $OUT/${ci}-imatrix.log
-    (time ./bin/save-load-state --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state     -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -fa -ngl 10 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state     -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
+    (time ./bin/save-load-state -fa -ngl 99 --model ${model_q4_0} ) 2>&1 | tee -a $OUT/${ci}-save-load-state.log
     function check_ppl {
         qnt="$1"

{llama_cpp_python-0.2.67 → llama_cpp_python-0.2.68}/vendor/llama.cpp/common/build-info.cpp RENAMED Viewed

@@ -1,4 +1,4 @@
 int LLAMA_BUILD_NUMBER = 1;
-char const *LLAMA_COMMIT = "8843a98";
+char const *LLAMA_COMMIT = "77e15be";
 char const *LLAMA_COMPILER = "cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0";
 char const *LLAMA_BUILD_TARGET = "x86_64-linux-gnu";

{llama_cpp_python-0.2.67 → llama_cpp_python-0.2.68}/vendor/llama.cpp/common/common.cpp RENAMED Viewed

@@ -947,6 +947,10 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
         params.cont_batching = true;
         return true;
     }
+    if (arg == "-fa" || arg == "--flash-attn") {
+        params.flash_attn = true;
+        return true;
+    }
     if (arg == "--color") {
         params.use_color = true;
         return true;
@@ -1513,6 +1517,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     printf("  -ns N, --sequences N  number of sequences to decode (default: %d)\n", params.n_sequences);
     printf("  -ps N, --p-split N    speculative decoding split probability (default: %.1f)\n", (double)params.p_split);
     printf("  -cb, --cont-batching  enable continuous batching (a.k.a dynamic batching) (default: disabled)\n");
+    printf("  -fa, --flash-attn     enable Flash Attention (default: %s)\n", params.flash_attn ? "enabled" : "disabled");
     printf("  --mmproj MMPROJ_FILE  path to a multimodal projector file for LLaVA. see examples/llava/README.md\n");
     printf("  --image IMAGE_FILE    path to an image file. use with multimodal models. Specify multiple times for batching\n");
     if (llama_supports_mlock()) {
@@ -1885,6 +1890,7 @@ struct llama_context_params llama_context_params_from_gpt_params(const gpt_param
     cparams.cb_eval           = params.cb_eval;
     cparams.cb_eval_user_data = params.cb_eval_user_data;
     cparams.offload_kqv       = !params.no_kv_offload;
+    cparams.flash_attn        = params.flash_attn;
     cparams.type_k = kv_cache_type_from_str(params.cache_type_k);
     cparams.type_v = kv_cache_type_from_str(params.cache_type_v);
@@ -2707,6 +2713,7 @@ void dump_non_result_info_yaml(FILE * stream, const gpt_params & params, const l
     fprintf(stream, "seed: %u # default: -1 (random seed)\n", params.seed);
     fprintf(stream, "simple_io: %s # default: false\n", params.simple_io ? "true" : "false");
     fprintf(stream, "cont_batching: %s # default: false\n", params.cont_batching ? "true" : "false");
+    fprintf(stream, "flash_attn: %s # default: false\n", params.flash_attn ? "true" : "false");
     fprintf(stream, "temp: %f # default: 0.8\n", sparams.temp);
     const std::vector<float> tensor_split_vector(params.tensor_split, params.tensor_split + llama_max_devices());

{llama_cpp_python-0.2.67 → llama_cpp_python-0.2.68}/vendor/llama.cpp/common/common.h RENAMED Viewed

@@ -150,6 +150,7 @@ struct gpt_params {
     bool multiline_input   = false; // reverse the usage of `\`
     bool simple_io         = false; // improves compatibility with subprocesses and limited consoles
     bool cont_batching     = true;  // insert new sequences for decoding on-the-fly
+    bool flash_attn        = false; // flash attention
     bool input_prefix_bos  = false; // prefix BOS to user inputs, preceding input_prefix
     bool ignore_eos        = false; // ignore generated EOS tokens

{llama_cpp_python-0.2.67 → llama_cpp_python-0.2.68}/vendor/llama.cpp/convert-hf-to-gguf-update.py RENAMED Viewed

@@ -128,7 +128,7 @@ for model in models:
     print(f"chkhsh: {chkhsh}")
     # print the "pre_tokenizer" content from the tokenizer.json
-    with open(f"models/tokenizers/{name}/tokenizer.json", "r") as f:
+    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
         cfg = json.load(f)
         pre_tokenizer = cfg["pre_tokenizer"]
         print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
@@ -156,15 +156,19 @@ src_func +=  "        print(f\"chkhsh: {chkhsh}\")\n"
 src_func +=  "\n"
 src_func +=  "        res = None\n"
 src_func +=  "\n"
-src_func +=  "        # NOTE: if you get an error here, you need to add the model to the if-elif chain below\n"
-src_func +=  "        #       don't do this manually - use the convert-hf-to-gguf-update.py script!\n"
+src_func +=  "        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n"
+src_func +=  "        #       or pull the latest version of the model from Huggingface\n"
+src_func +=  "        #       don't edit the hashes manually!\n"
 src_func += f"{src_ifs}\n"
 src_func +=  "        if res is None:\n"
 src_func +=  "            print(\"\\n\")\n"
 src_func +=  "            print(\"**************************************************************************************\")\n"
 src_func +=  "            print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
-src_func +=  "            print(\"**          This means that it was not added yet or you are using an older version.\")\n"
-src_func +=  "            print(\"**          Check convert-hf-to-gguf-update.py and update it accordingly.\")\n"
+src_func +=  "            print(\"**          There are 2 possible reasons for this:\")\n"
+src_func +=  "            print(\"**          - the model has not been added to convert-hf-to-gguf-update.py yet\")\n"
+src_func +=  "            print(\"**          - the pre-tokenization config has changed upstream\")\n"
+src_func +=  "            print(\"**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n"
+src_func +=  "            print(\"** ref:     https://github.com/ggerganov/llama.cpp/pull/6920\")\n"
 src_func +=  "            print(\"**\")\n"
 src_func +=  "            print(f\"** chkhsh:  {chkhsh}\")\n"
 src_func +=  "            print(\"**************************************************************************************\")\n"
@@ -249,7 +253,7 @@ for model in models:
     from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
-    with open(f"models/ggml-vocab-{name}.gguf.inp", "w") as f:
+    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
         for text in tests:
             f.write(f"{text}")
             f.write("\n__ggml_vocab_test__\n")

{llama_cpp_python-0.2.67 → llama_cpp_python-0.2.68}/vendor/llama.cpp/convert-hf-to-gguf.py RENAMED Viewed

@@ -279,8 +279,9 @@ class Model(ABC):
         res = None
-        # NOTE: if you get an error here, you need to add the model to the if-elif chain below
-        #       don't do this manually - use the convert-hf-to-gguf-update.py script!
+        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
+        #       or pull the latest version of the model from Huggingface
+        #       don't edit the hashes manually!
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"
@@ -310,8 +311,11 @@ class Model(ABC):
             print("\n")
             print("**************************************************************************************")
             print("** WARNING: The BPE pre-tokenizer was not recognized!")
-            print("**          This means that it was not added yet or you are using an older version.")
-            print("**          Check convert-hf-to-gguf-update.py and update it accordingly.")
+            print("**          There are 2 possible reasons for this:")
+            print("**          - the model has not been added to convert-hf-to-gguf-update.py yet")
+            print("**          - the pre-tokenization config has changed upstream")
+            print("**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
+            print("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
             print("**")
             print(f"** chkhsh:  {chkhsh}")
             print("**************************************************************************************")

{llama_cpp_python-0.2.67 → llama_cpp_python-0.2.68}/vendor/llama.cpp/examples/batched-bench/batched-bench.cpp RENAMED Viewed

@@ -32,7 +32,7 @@ int main(int argc, char ** argv) {
     gpt_params params;
     if (argc == 1 || argv[1][0] == '-') {
-        printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
+        printf("usage: %s MODEL_PATH [N_KV_MAX] [N_BATCH] [N_UBATCH] [FATTN] [IS_PP_SHARED] [NGL] <PP> <TG> <PL>\n" , argv[0]);
         printf("  <PP>, <TG> and PL are comma-separated lists of numbers without spaces\n\n");
         printf("  example: %s ggml-model-f16.gguf 2048 2048 512 0 999 128,256,512 128,256 1,2,4,8,16,32\n\n", argv[0]);
         return 1 ;
@@ -41,6 +41,7 @@ int main(int argc, char ** argv) {
     int n_kv_max     = 2048;
     int n_batch      = 2048;
     int n_ubatch     = 512;
+    bool flash_attn  = false;
     int is_pp_shared = 0;
     int n_gpu_layers = 0;
@@ -66,23 +67,27 @@ int main(int argc, char ** argv) {
     }
     if (argc >= 6) {
-        is_pp_shared = std::atoi(argv[5]);
+        flash_attn = std::atoi(argv[5]);
     }
     if (argc >= 7) {
-        n_gpu_layers = std::atoi(argv[6]);
+        is_pp_shared = std::atoi(argv[6]);
     }
     if (argc >= 8) {
-        n_pp = parse_list(argv[7]);
+        n_gpu_layers = std::atoi(argv[7]);
     }
     if (argc >= 9) {
-        n_tg = parse_list(argv[8]);
+        n_pp = parse_list(argv[8]);
     }
     if (argc >= 10) {
-        n_pl = parse_list(argv[9]);
+        n_tg = parse_list(argv[9]);
+    }
+    if (argc >= 11) {
+        n_pl = parse_list(argv[10]);
     }
     // init LLM
@@ -108,10 +113,11 @@ int main(int argc, char ** argv) {
     llama_context_params ctx_params = llama_context_default_params();
-    ctx_params.seed      = 1234;
-    ctx_params.n_ctx     = n_kv_max;
-    ctx_params.n_batch   = n_batch;
-    ctx_params.n_ubatch  = n_ubatch;
+    ctx_params.seed       = 1234;
+    ctx_params.n_ctx      = n_kv_max;
+    ctx_params.n_batch    = n_batch;
+    ctx_params.n_ubatch   = n_ubatch;
+    ctx_params.flash_attn = flash_attn;
     ctx_params.n_threads       = params.n_threads;
     ctx_params.n_threads_batch = params.n_threads_batch == -1 ? params.n_threads : params.n_threads_batch;
@@ -169,7 +175,7 @@ int main(int argc, char ** argv) {
     }
     LOG_TEE("\n");
-    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
+    LOG_TEE("%s: n_kv_max = %d, n_batch = %d, n_ubatch = %d, flash_attn = %d, is_pp_shared = %d, n_gpu_layers = %d, n_threads = %u, n_threads_batch = %u\n", __func__, n_kv_max, n_batch, n_ubatch, flash_attn, is_pp_shared, n_gpu_layers, ctx_params.n_threads, ctx_params.n_threads_batch);
     LOG_TEE("\n");
     LOG_TEE("|%6s | %6s | %4s | %6s | %8s | %8s | %8s | %8s | %8s | %8s |\n", "PP",     "TG",     "B",    "N_KV",     "T_PP s",   "S_PP t/s", "T_TG s",   "S_TG t/s", "T s",      "S t/s");

llama-cpp-python 0.2.67__tar.gz → 0.2.68__tar.gz

llama-cpp-python 0.2.67__tar.gz → 0.2.68__tar.gz