PyPI - llama-cpp-bin - Versions diffs - 9672.0.0__tar.gz → 9692.0.0__tar.gz - Mend

llama-cpp-bin 9672.0.0__tar.gz → 9692.0.0__tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (1069)

{llama_cpp_bin-9672.0.0 → llama_cpp_bin-9692.0.0}/PKG-INFO RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.4
 Name: llama-cpp-bin
-Version: 9672.0.0
+Version: 9692.0.0
 Summary: llama.cpp server binary built from source
 Author-email: Vladyslav Tkachenko <vladlearns@gmail.com>
 License: AGPL-3.0-or-later

{llama_cpp_bin-9672.0.0 → llama_cpp_bin-9692.0.0}/llama.cpp/common/download.cpp RENAMED Viewed

@@ -997,3 +997,87 @@ std::vector<common_cached_model_info> common_list_cached_models() {
     return result;
 }
+bool common_download_remove(const std::string & hf_repo_with_tag) {
+    namespace fs = std::filesystem;
+    auto [repo_id, tag] = common_download_split_repo_tag(hf_repo_with_tag);
+    if (tag.empty()) {
+        return hf_cache::remove_cached_repo(repo_id);
+    }
+    std::string tag_upper = tag;
+    for (char & c : tag_upper) {
+        c = (char) std::toupper((unsigned char) c);
+    }
+    auto files = hf_cache::get_cached_files(repo_id);
+    if (files.empty()) {
+        return false;
+    }
+    // collect snapshot entries whose tag matches
+    std::vector<fs::path> to_remove;
+    for (const auto & f : files) {
+        auto split = get_gguf_split_info(f.path);
+        if (split.tag == tag_upper) {
+            to_remove.emplace_back(f.local_path);
+        }
+    }
+    if (to_remove.empty()) {
+        return false;
+    }
+    // resolve blob paths from symlinks before deleting snapshot entries
+    std::vector<fs::path> blobs_to_check;
+    for (const auto & p : to_remove) {
+        std::error_code ec;
+        if (fs::is_symlink(p, ec)) {
+            auto target = fs::read_symlink(p, ec);
+            if (!ec) {
+                blobs_to_check.push_back((p.parent_path() / target).lexically_normal());
+            }
+        }
+    }
+    // remove snapshot entries
+    for (const auto & p : to_remove) {
+        std::error_code ec;
+        fs::remove(p, ec);
+        if (ec) {
+            LOG_WRN("%s: failed to remove %s: %s\n", __func__, p.string().c_str(), ec.message().c_str());
+        }
+    }
+    if (blobs_to_check.empty()) {
+        return true;
+    }
+    // collect blobs still referenced by remaining snapshot entries
+    std::unordered_set<std::string> still_referenced;
+    for (const auto & f : hf_cache::get_cached_files(repo_id)) {
+        fs::path p(f.local_path);
+        std::error_code ec;
+        if (fs::is_symlink(p, ec)) {
+            auto target = fs::read_symlink(p, ec);
+            if (!ec) {
+                still_referenced.insert((p.parent_path() / target).lexically_normal().string());
+            }
+        }
+    }
+    // remove orphaned blobs
+    for (const auto & blob : blobs_to_check) {
+        if (still_referenced.find(blob.string()) == still_referenced.end()) {
+            std::error_code ec;
+            fs::remove(blob, ec);
+            if (ec) {
+                LOG_WRN("%s: failed to remove blob %s: %s\n", __func__, blob.string().c_str(), ec.message().c_str());
+            }
+        }
+    }
+    return true;
+}

{llama_cpp_bin-9672.0.0 → llama_cpp_bin-9692.0.0}/llama.cpp/common/download.h RENAMED Viewed

@@ -115,3 +115,10 @@ int common_download_file_single(const std::string & url,
 // resolve and download model from Docker registry
 // return local path to downloaded model file
 std::string common_docker_resolve_model(const std::string & docker);
+// Remove a cached model from disk
+// input format: "user/model" or "user/model:tag"
+// - if tag is omitted, removes the entire repo cache directory
+// - if tag is present, removes only files matching that tag (and orphaned blobs)
+// returns true if anything was removed
+bool common_download_remove(const std::string & hf_repo_with_tag);

{llama_cpp_bin-9672.0.0 → llama_cpp_bin-9692.0.0}/llama.cpp/common/hf-cache.cpp RENAMED Viewed

@@ -495,4 +495,19 @@ std::string finalize_file(const hf_file & file) {
     return file.final_path;
 }
+bool remove_cached_repo(const std::string & repo_id) {
+    if (!is_valid_repo_id(repo_id)) {
+        LOG_WRN("%s: invalid repository: %s\n", __func__, repo_id.c_str());
+        return false;
+    }
+    fs::path repo_path = get_repo_path(repo_id);
+    std::error_code ec;
+    auto removed = fs::remove_all(repo_path, ec);
+    if (ec) {
+        LOG_ERR("%s: failed to remove repo cache %s: %s\n", __func__, repo_path.string().c_str(), ec.message().c_str());
+        return false;
+    }
+    return removed > 0;
+}
 } // namespace hf_cache

{llama_cpp_bin-9672.0.0 → llama_cpp_bin-9692.0.0}/llama.cpp/common/hf-cache.h RENAMED Viewed

@@ -29,4 +29,7 @@ hf_files get_cached_files(const std::string & repo_id = {});
 // Create snapshot path (link or move/copy) and return it
 std::string finalize_file(const hf_file & file);
+// Remove the entire cached directory for a repo, returns true if removed
+bool remove_cached_repo(const std::string & repo_id);
 } // namespace hf_cache

{llama_cpp_bin-9672.0.0 → llama_cpp_bin-9692.0.0}/llama.cpp/common/log.cpp RENAMED Viewed

@@ -11,8 +11,13 @@
 #include <sstream>
 #include <thread>
 #include <vector>
+#include <algorithm>
 #if defined(_WIN32)
+#    define WIN32_LEAN_AND_MEAN
+#    ifndef NOMINMAX
+#       define NOMINMAX
+#    endif
 #    include <io.h>
 #    include <windows.h>
 #    define isatty _isatty
@@ -62,16 +67,15 @@ static const char* g_col[] = {
 };
 struct common_log_entry {
-    enum ggml_log_level level;
-    bool prefix;
-    int64_t timestamp;
+    enum ggml_log_level level {GGML_LOG_LEVEL_INFO};
     std::vector<char> msg;
-    // signals the worker thread to stop
-    bool is_end;
+    int64_t timestamp { 0 };
+    bool is_end       { false }; // signals the worker thread to stop
+    bool prefix       { false };
+    common_log_entry(size_t size = 256) : msg(size) { }
     void print(FILE * file = nullptr) const {
         FILE * fcur = file;
@@ -122,22 +126,15 @@ struct common_log_entry {
 };
 struct common_log {
-    // default capacity - will be expanded if needed
-    common_log() : common_log(256) {}
-    common_log(size_t capacity) {
-        file = nullptr;
-        prefix = false;
+    // default capacity
+    common_log(size_t capacity = 512) {
+        file       = nullptr;
+        prefix     = false;
         timestamps = false;
-        running = false;
-        t_start = t_us();
-        // initial message size - will be expanded if longer messages arrive
-        entries.resize(capacity);
-        for (auto & entry : entries) {
-            entry.msg.resize(256);
-        }
+        running    = false;
+        t_start    = t_us();
+        queue.resize(capacity, common_log_entry(256));
         head = 0;
         tail = 0;
@@ -152,9 +149,10 @@ struct common_log {
     }
 private:
-    std::mutex mtx;
-    std::thread thrd;
-    std::condition_variable cv;
+    std::mutex              mtx;
+    std::thread             thrd;
+    std::condition_variable cv_new;  // new entry
+    std::condition_variable cv_full; // wait on full
     FILE * file;
@@ -164,24 +162,53 @@ private:
     int64_t t_start;
-    // ring buffer of entries
-    std::vector<common_log_entry> entries;
+    // queue of entries
+    std::vector<common_log_entry> queue;
     size_t head;
     size_t tail;
-    // worker thread copies into this
-    common_log_entry cur;
+    bool print_entry(const common_log_entry & e) const {
+        if (e.is_end) return true;
+        e.print();
+        if (file) {
+            e.print(file);
+        }
+        return false;
+    }
+    bool flush_queue(size_t start_head, size_t end_tail, size_t & out_head) const {
+        bool stop = false;
+        size_t h = start_head;
+        while (h != end_tail && !stop) {
+            stop = print_entry(queue[h]);
+            h = (h + 1) % queue.size();
+        }
+        out_head = h;
+        return stop;
+    }
 public:
+    bool is_full() const {
+        return ((tail + 1) % queue.size()) == head;
+    }
+    bool is_empty() const {
+        return head == tail;
+    }
     void add(enum ggml_log_level level, const char * fmt, va_list args) {
-        std::lock_guard<std::mutex> lock(mtx);
+        std::unique_lock<std::mutex> lock(mtx);
+        // block if the queue is full
+        cv_full.wait(lock, [this]() { return !running || !is_full(); });
         if (!running) {
             // discard messages while the worker thread is paused
             return;
         }
-        auto & entry = entries[tail];
+        auto & entry = queue[tail];
         {
             // cannot use args twice, so make a copy in case we need to expand the buffer
@@ -216,38 +243,16 @@ public:
             va_end(args_copy);
         }
-        entry.level = level;
-        entry.prefix = prefix;
+        entry.is_end    = false;
+        entry.level     = level;
+        entry.prefix    = prefix;
         entry.timestamp = 0;
         if (timestamps) {
             entry.timestamp = t_us() - t_start;
         }
-        entry.is_end = false;
-        tail = (tail + 1) % entries.size();
-        if (tail == head) {
-            // expand the buffer
-            std::vector<common_log_entry> new_entries(2*entries.size());
-            size_t new_tail = 0;
-            do {
-                new_entries[new_tail] = std::move(entries[head]);
-                head     = (head     + 1) % entries.size();
-                new_tail = (new_tail + 1);
-            } while (head != tail);
-            head = 0;
-            tail = new_tail;
-            for (size_t i = tail; i < new_entries.size(); i++) {
-                new_entries[i].msg.resize(256);
-            }
-            entries = std::move(new_entries);
-        }
-        cv.notify_one();
+        tail = (tail + 1) % queue.size();
+        cv_new.notify_one();
     }
     void resume() {
@@ -261,22 +266,23 @@ public:
         thrd = std::thread([this]() {
             while (true) {
-                {
-                    std::unique_lock<std::mutex> lock(mtx);
-                    cv.wait(lock, [this]() { return head != tail; });
-                    cur = entries[head];
+                std::unique_lock<std::mutex> lock(mtx);
+                cv_new.wait(lock, [this]() { return !is_empty(); });
-                    head = (head + 1) % entries.size();
-                }
+                size_t cached_head = head;
+                size_t cached_tail = tail;
-                if (cur.is_end) {
-                    break;
-                }
+                lock.unlock(); // drop the lock during flush
+                size_t next_head;
+                bool stop = flush_queue(cached_head, cached_tail, next_head);
-                cur.print(); // stdout and stderr
+                lock.lock();
+                head = next_head;
+                cv_full.notify_all();
-                if (file) {
-                    cur.print(file);
+                if (stop) {
+                    break;
                 }
             }
         });
@@ -293,13 +299,13 @@ public:
             running = false;
             // push an entry to signal the worker thread to stop
-            {
-                auto & entry = entries[tail];
-                entry.is_end = true;
+            auto & entry = queue[tail];
+            entry.is_end = true;
+            tail = (tail + 1) % queue.size();
-                tail = (tail + 1) % entries.size();
-            }
-            cv.notify_one();
+            // wakeup everyone
+            cv_new.notify_one();
+            cv_full.notify_all();
         }
         thrd.join();

{llama_cpp_bin-9672.0.0 → llama_cpp_bin-9692.0.0}/llama.cpp/ggml/src/CMakeLists.txt RENAMED Viewed

@@ -438,7 +438,14 @@ if (GGML_CPU_ALL_VARIANTS)
             ggml_add_cpu_backend_variant(power8_2       POWER8  VSX)
             ggml_add_cpu_backend_variant(power9         POWER9  VSX)
             ggml_add_cpu_backend_variant(power10        POWER10 VSX)
-            ggml_add_cpu_backend_variant(power11        POWER11 VSX)
+            # POWER11 backend: only if compiler supports -mcpu=power11
+            check_cxx_compiler_flag("-mcpu=power11" GGML_CXX_SUPPORTS_POWER11)
+            if (GGML_CXX_SUPPORTS_POWER11)
+                message(STATUS "Compiler supports -mcpu=power11, enabling POWER11 backend")
+                ggml_add_cpu_backend_variant(power11 POWER11 VSX)
+            else()
+                message(STATUS "Skipping POWER11 backend: compiler does not support -mcpu=power11")
+            endif()
         else()
             message(FATAL_ERROR "Unsupported PowerPC target OS: ${CMAKE_SYSTEM_NAME}")
         endif()

{llama_cpp_bin-9672.0.0 → llama_cpp_bin-9692.0.0}/llama.cpp/ggml/src/ggml-cpu/CMakeLists.txt RENAMED Viewed

@@ -389,7 +389,7 @@ function(ggml_add_cpu_backend_variant_impl tag_name)
             string(REGEX MATCHALL "POWER *([0-9]+)" MATCHED_STRING "${POWER10_M_UPPER}")
             string(REGEX REPLACE "POWER *([0-9]+)" "\\1" EXTRACTED_NUMBER "${MATCHED_STRING}")
-            if (EXTRACTED_NUMBER GREATER_EQUAL 10)
+            if (EXTRACTED_NUMBER EQUAL 10 OR EXTRACTED_NUMBER EQUAL 11)
                 list(APPEND ARCH_FLAGS -mcpu=power10)
             elseif (EXTRACTED_NUMBER EQUAL 9)
                 list(APPEND ARCH_FLAGS -mcpu=power9)

{llama_cpp_bin-9672.0.0 → llama_cpp_bin-9692.0.0}/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.cpp RENAMED Viewed

@@ -66,7 +66,6 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_base(ggml
     const char * op_str = "undefined";
     switch (op) {
         case GGML_OP_ADD_ID: op_str = "add_id"; break;
-        case GGML_OP_CONCAT: op_str = "concat"; break;
         default: GGML_ABORT("fatal error");
     };
@@ -211,6 +210,21 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat(ggml_meta
     return res;
 }
+ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_concat(ggml_metal_library_t lib, ggml_type tsrc) {
+    char base[256];
+    char name[256];
+    snprintf(base, 256, "kernel_concat_%s", ggml_type_name(tsrc));
+    snprintf(name, 256, "%s", base);
+    ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
+    if (!res.pipeline) {
+        res = ggml_metal_library_compile_pipeline(lib, base, name, nullptr);
+    }
+    return res;
+}
 ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary(ggml_metal_library_t lib, const ggml_tensor * op) {
     char base[256];
     char name[256];
@@ -1689,7 +1703,9 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_norm(ggml_metal_
 }
 ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope(ggml_metal_library_t lib, const ggml_tensor * op) {
-    assert(op->op == GGML_OP_ROPE);
+    assert(op->op == GGML_OP_ROPE || op->op == GGML_OP_ROPE_BACK);
+    const bool is_back = op->op == GGML_OP_ROPE_BACK;
     char base[256];
     char name[256];
@@ -1713,13 +1729,14 @@ ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_rope(ggml_metal_
         snprintf(base, 256, "kernel_rope_norm_%s", ggml_type_name(op->src[0]->type));
     }
-    snprintf(name, 256, "%s_imrope=%d", base, is_imrope ? 1 : 0);
+    snprintf(name, 256, "%s_imrope=%d_is_back=%d", base, is_imrope ? 1 : 0, is_back ? 1 : 0);
     ggml_metal_pipeline_with_params res = ggml_metal_library_get_pipeline(lib, name);
     if (!res.pipeline) {
         ggml_metal_cv_t cv = ggml_metal_cv_init();
         ggml_metal_cv_set_bool(cv, is_imrope, FC_ROPE + 0);
+        ggml_metal_cv_set_bool(cv, is_back,   FC_ROPE + 1);
         res = ggml_metal_library_compile_pipeline(lib, base, name, cv);

{llama_cpp_bin-9672.0.0 → llama_cpp_bin-9692.0.0}/llama.cpp/ggml/src/ggml-metal/ggml-metal-device.h RENAMED Viewed

@@ -115,6 +115,7 @@ struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_get_rows
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_set_rows          (ggml_metal_library_t lib, enum ggml_type tidx, enum ggml_type tdst);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_diag              (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_repeat            (ggml_metal_library_t lib, enum ggml_type tsrc);
+struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_concat            (ggml_metal_library_t lib, enum ggml_type tsrc);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_unary             (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_glu               (ggml_metal_library_t lib, const struct ggml_tensor * op);
 struct ggml_metal_pipeline_with_params ggml_metal_library_get_pipeline_sum               (ggml_metal_library_t lib, const struct ggml_tensor * op);

{llama_cpp_bin-9672.0.0 → llama_cpp_bin-9692.0.0}/llama.cpp/ggml/src/ggml-metal/ggml-metal-ops.cpp RENAMED Viewed

@@ -375,6 +375,7 @@ static int ggml_metal_op_encode_impl(ggml_metal_op_t ctx, int idx) {
                 n_fuse = ggml_metal_op_norm(ctx, idx);
             } break;
         case GGML_OP_ROPE:
+        case GGML_OP_ROPE_BACK:
             {
                 n_fuse = ggml_metal_op_rope(ctx, idx);
             } break;
@@ -556,7 +557,7 @@ int ggml_metal_op_concat(ggml_metal_op_t ctx, int idx) {
         /*.dim  =*/ dim,
     };
-    auto pipeline = ggml_metal_library_get_pipeline_base(lib, GGML_OP_CONCAT);
+    auto pipeline = ggml_metal_library_get_pipeline_concat(lib, op->type);
     ggml_metal_encoder_set_pipeline(enc, pipeline);
     ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);

{llama_cpp_bin-9672.0.0 → llama_cpp_bin-9692.0.0}/llama.cpp/ggml/src/ggml-metal/ggml-metal.metal RENAMED Viewed

@@ -4358,6 +4358,7 @@ template [[host_name("kernel_mul_mv_bf16_bf16_short")]] kernel mul_mv_t_t_short_
 #endif
 constant bool FC_rope_is_imrope [[function_constant(FC_ROPE + 0)]];
+constant bool FC_rope_is_back   [[function_constant(FC_ROPE + 1)]];
 static float rope_yarn_ramp(const float low, const float high, const int i0) {
     const float y = (i0 / 2 - low) / max(0.001f, high - low);
@@ -4381,6 +4382,9 @@ static void rope_yarn(
     }
     *cos_theta = cos(theta) * mscale;
     *sin_theta = sin(theta) * mscale;
+    if (FC_rope_is_back) {
+        *sin_theta *= -1.0f;
+    }
 }
 // Apparently solving `n_rot = 2pi * x * base^((2 * max_pos_emb) / n_dims)` for x, we get
@@ -7513,14 +7517,15 @@ template [[host_name("kernel_cpy_q5_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<
 template [[host_name("kernel_cpy_q5_1_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q5_1, 2, dequantize_q5_1>;
 template [[host_name("kernel_cpy_q8_0_f16")]] kernel cpy_q_f_t kernel_cpy_q_f32<half4x4, block_q8_0, 2, dequantize_q8_0>;
+template<typename T>
 kernel void kernel_concat(
-    constant ggml_metal_kargs_concat & args,
-    device  const char * src0,
-    device  const char * src1,
-    device        char * dst,
-    uint3   tgpig[[threadgroup_position_in_grid]],
-    ushort3 tpitg[[thread_position_in_threadgroup]],
-    ushort3   ntg[[threads_per_threadgroup]]) {
+        constant ggml_metal_kargs_concat & args,
+        device  const char * src0,
+        device  const char * src1,
+        device        char * dst,
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
     const int i3 = tgpig.z;
     const int i2 = tgpig.y;
@@ -7533,21 +7538,31 @@ kernel void kernel_concat(
     int o[4] = {0, 0, 0, 0};
     o[args.dim] = args.dim == 0 ? args.ne00 : (args.dim == 1 ? args.ne01 : (args.dim == 2 ? args.ne02 : args.ne03));
-    device const float * x;
     for (int i0 = tpitg.x; i0 < args.ne0; i0 += ntg.x) {
+        device const T * x;
         if (i0 < args.ne00 && i1 < args.ne01 && i2 < args.ne02 && i3 < args.ne03) {
-            x = (device const float *)(src0 + (i3       )*args.nb03 + (i2       )*args.nb02 + (i1       )*args.nb01 + (i0       )*args.nb00);
+            x = (device const T *)(src0 + (i3       )*args.nb03 + (i2       )*args.nb02 + (i1       )*args.nb01 + (i0       )*args.nb00);
         } else {
-            x = (device const float *)(src1 + (i3 - o[3])*args.nb13 + (i2 - o[2])*args.nb12 + (i1 - o[1])*args.nb11 + (i0 - o[0])*args.nb10);
+            x = (device const T *)(src1 + (i3 - o[3])*args.nb13 + (i2 - o[2])*args.nb12 + (i1 - o[1])*args.nb11 + (i0 - o[0])*args.nb10);
         }
-        device float * y = (device float *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
+        device T * y = (device T *)(dst + i3*args.nb3 + i2*args.nb2 + i1*args.nb1 + i0*args.nb0);
         *y = *x;
     }
 }
+typedef decltype(kernel_concat<float>) kernel_concat_t;
+template [[host_name("kernel_concat_f32")]]  kernel kernel_concat_t kernel_concat<float>;
+template [[host_name("kernel_concat_f16")]]  kernel kernel_concat_t kernel_concat<half>;
+template [[host_name("kernel_concat_bf16")]] kernel kernel_concat_t kernel_concat<bfloat>;
+template [[host_name("kernel_concat_i8")]]   kernel kernel_concat_t kernel_concat<char>;
+template [[host_name("kernel_concat_i16")]]  kernel kernel_concat_t kernel_concat<short>;
+template [[host_name("kernel_concat_i32")]]  kernel kernel_concat_t kernel_concat<int>;
+template [[host_name("kernel_concat_i64")]]  kernel kernel_concat_t kernel_concat<long>;
 template<int nr0, typename args_t>
 void kernel_mul_mv_q2_K_f32_impl(
         args_t args,

{llama_cpp_bin-9672.0.0 → llama_cpp_bin-9692.0.0}/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp RENAMED Viewed

@@ -564,6 +564,9 @@ struct ggml_backend_opencl_context {
     cl_kernel kernel_mul_mat_f16_f32_1row;
     cl_kernel kernel_mul_mat_f16_f32;
     cl_kernel kernel_mul_mat_f16_f32_l4;
+    cl_kernel kernel_mul_mat_f16_f32_l4_dr;
+    cl_kernel kernel_mul_mat_f16_f32_l4_dr_ls;
+    cl_kernel kernel_mul_mat_f16_f32_l4_dr_lq;
     cl_kernel kernel_mul_mat_f16_f32_tiled;
     cl_kernel kernel_adreno_xmem_pack_src_f32;
     cl_kernel kernel_adreno_xmem_prepack_weight_f16;
@@ -1787,6 +1790,11 @@ static void load_cl_kernels(ggml_backend_opencl_context *backend_ctx) {
             build_program_from_source(backend_ctx->context, backend_ctx->device, kernel_src.c_str(), compile_opts);
         CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4   = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4", &err), err));
+        CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr", &err), err));
+        if (backend_ctx->gpu_family == ADRENO) {
+            CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr_ls", &err), err));
+            CL_CHECK((backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq = clCreateKernel(backend_ctx->program_mul_mv_f16_f32_l4, "kernel_mul_mat_f16_f32_l4_dr_lq", &err), err));
+        }
         GGML_LOG_CONT(".");
     }
@@ -14570,11 +14578,31 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
             }
             if (src1t == GGML_TYPE_F32) {
+                // heuristic for packing more work for Adreno
+                const bool adreno_use_lane_split =
+                    backend_ctx->gpu_family == ADRENO &&
+                    ne11 == 1 &&
+                    ne01 >= 8 &&
+                    ne00 % 4 == 0 &&
+                    r3 == 1 && r2 >= 1 && r2 <= 8 &&
+                    (ne12 % r2) == 0;
                 if (ne11 * ne12 < 4) {
                     kernel = backend_ctx->kernel_mul_mat_f16_f32_1row;
+                } else if (adreno_use_lane_split && ne00 >= 64 && ne00 <= 128) {
+                    kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq;
+                    nrows  = 1;
+                } else if (adreno_use_lane_split && r2 >= 2 && ne00 > 128 && ne00 <= 256) {
+                    kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls;
+                    nrows  = 1;
                 } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
-                    kernel = backend_ctx->kernel_mul_mat_f16_f32_l4;
-                    nrows = ne11;
+                    if (ne11 == 1) {
+                        kernel = backend_ctx->kernel_mul_mat_f16_f32_l4_dr;
+                        nrows  = 1; // not used by this kernel
+                    } else {
+                        kernel = backend_ctx->kernel_mul_mat_f16_f32_l4;
+                        nrows  = ne11;
+                    }
                 } else {
                     kernel = backend_ctx->kernel_mul_mat_f16_f32;
                     nrows = 4;
@@ -15353,12 +15381,30 @@ static void ggml_cl_mul_mat(ggml_backend_t backend, const ggml_tensor * src0, co
         backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
     } else {
-        int64_t ny = (ne11 + nrows - 1)/nrows;
+        if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr) {
+            const int NDST_DR = 4;
+            size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, NDST_DR)*nth0, (size_t)nth1, (size_t)ne12*ne13};
+            size_t local_work_size[]  = {(size_t)nth0, (size_t)nth1, 1};
-        size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
-        size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+            backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+        } else if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr_ls) {
+            size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, 2)*nth0, (size_t)nth1, (size_t)ne02*ne03};
+            size_t local_work_size[]  = {(size_t)nth0, (size_t)nth1, 1};
-        backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+            backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+        } else if (kernel == backend_ctx->kernel_mul_mat_f16_f32_l4_dr_lq) {
+            size_t global_work_size[] = {(size_t)CEIL_DIV(ne01, 4)*nth0, (size_t)nth1, (size_t)ne02*ne03};
+            size_t local_work_size[]  = {(size_t)nth0, (size_t)nth1, 1};
+            backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+        } else {
+            int64_t ny = (ne11 + nrows - 1)/nrows;
+            size_t global_work_size[] = {(size_t)ne01*nth0, (size_t)ny*nth1, (size_t)ne12*ne13};
+            size_t local_work_size[] = {(size_t)nth0, (size_t)nth1, 1};
+            backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
+        }
     }
 }

llama-cpp-bin 9672.0.0__tar.gz → 9692.0.0__tar.gz

llama-cpp-bin 9672.0.0__tar.gz → 9692.0.0__tar.gz