RubyGems - whispercpp - Versions diffs - 1.3.4 → 1.3.5 - Mend

whispercpp 1.3.4 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (630) hide show

data/ext/sources/ggml/src/ggml-hexagon/op-desc.h ADDED Viewed

@@ -0,0 +1,153 @@
+#ifndef OP_DESC_H
+#define OP_DESC_H
+#define GGML_COMMON_IMPL_CPP
+#include "ggml-backend-impl.h"
+#include "ggml-common.h"
+#include <string>
+#include <stdio.h>
+// Human-readable description of a ggml op. Each field holds one aspect (strides, dims,
+// types, buffer names, tensor names) rendered as "src0 x src1 x ... -> dst"; the
+// "src... -> " prefix is omitted when the op has no src[0].
+//
+// All formatters write through snprintf() with an explicit capacity, so an op with many
+// sources, long tensor/buffer names or very large strides is truncated instead of
+// overrunning the fixed-size fields below. The optional `size` argument of every helper
+// is the capacity of `str` in bytes (terminator included). When it is omitted the helper
+// is effectively unbounded, which keeps existing two-argument calls working as before.
+struct op_desc {
+    char strides[64 * GGML_MAX_SRC];
+    char dims[64 * GGML_MAX_SRC];
+    char types[16 * GGML_MAX_SRC];
+    char buffs[64 * GGML_MAX_SRC];
+    char names[64 * GGML_MAX_SRC];
+    // capacity assumed when the caller does not say how large its buffer is
+    enum : size_t { no_limit = 0x7fffffff };
+    // Convert an snprintf() result `n` into the number of characters that were actually
+    // stored (terminator excluded) in a destination of `cap` bytes.
+    static int stored(int n, size_t cap) {
+        if (n <= 0 || cap == 0) {
+            return 0;
+        }
+        return (size_t) n < cap ? n : (int) (cap - 1);
+    }
+    // Move the write cursor past the `n` characters that were just stored.
+    // `n` is always < `left`, so at least one byte stays available for the terminator.
+    static void advance(char *& p, size_t & left, int n) {
+        p    += n;
+        left -= (size_t) n;
+    }
+    // Append the string `s` at the cursor, truncating if it does not fit.
+    static void append(char *& p, size_t & left, const char * s) {
+        advance(p, left, stored(snprintf(p, left, "%s", s), left));
+    }
+    // Write "ne0:ne1" (or all four extents when ne[2] or ne[3] differs from 1).
+    // Returns the number of characters stored in `str`.
+    int format_tensor_dims(char * str, const struct ggml_tensor * t, size_t size = no_limit) {
+        int n;
+        if (t->ne[2] == 1 && t->ne[3] == 1) {
+            n = snprintf(str, size, "%d:%d", (int) t->ne[0], (int) t->ne[1]);
+        } else {
+            n = snprintf(str, size, "%d:%d:%d:%d", (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], (int) t->ne[3]);
+        }
+        return stored(n, size);
+    }
+    // Write the dims of every source followed by the dims of `t` itself.
+    void format_op_dims(char * str, const struct ggml_tensor * t, size_t size = no_limit) {
+        if (size == 0) {
+            return;
+        }
+        char * p    = str;
+        size_t left = size;
+        // append all sources (if any), stopping at the first empty slot
+        if (t->src[0]) {
+            advance(p, left, format_tensor_dims(p, t->src[0], left));
+            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+                append(p, left, " x ");
+                advance(p, left, format_tensor_dims(p, t->src[i], left));
+            }
+            append(p, left, " -> ");
+        }
+        // the op's own dims go straight into the destination (no fixed-size temporary)
+        format_tensor_dims(p, t, left);
+    }
+    // Write "nb0:nb1" (or all four strides when ne[2] or ne[3] differs from 1), followed
+    // by "!" when the tensor is not contiguous. Returns the number of characters stored.
+    int format_tensor_strides(char * str, const struct ggml_tensor * t, size_t size = no_limit) {
+        const char * c = ggml_is_contiguous(t) ? "" : "!";
+        int n;
+        if (t->ne[2] == 1 && t->ne[3] == 1) {
+            n = snprintf(str, size, "%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], c);
+        } else {
+            n = snprintf(str, size, "%zu:%zu:%zu:%zu%s", (size_t) t->nb[0], (size_t) t->nb[1], (size_t) t->nb[2], (size_t) t->nb[3], c);
+        }
+        return stored(n, size);
+    }
+    // Write the strides of every source followed by the strides of `t` itself.
+    void format_op_strides(char * str, const struct ggml_tensor * t, size_t size = no_limit) {
+        if (size == 0) {
+            return;
+        }
+        char * p    = str;
+        size_t left = size;
+        // append all sources (if any), stopping at the first empty slot
+        if (t->src[0]) {
+            advance(p, left, format_tensor_strides(p, t->src[0], left));
+            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+                append(p, left, " x ");
+                advance(p, left, format_tensor_strides(p, t->src[i], left));
+            }
+            append(p, left, " -> ");
+        }
+        // the op's own strides go straight into the destination (no fixed-size temporary)
+        format_tensor_strides(p, t, left);
+    }
+    // Write the type name of every source followed by the type name of `t` itself.
+    void format_op_types(char * str, const struct ggml_tensor * t, size_t size = no_limit) {
+        if (size == 0) {
+            return;
+        }
+        char * p    = str;
+        size_t left = size;
+        // append all sources (if any), stopping at the first empty slot
+        if (t->src[0]) {
+            append(p, left, ggml_type_name(t->src[0]->type));
+            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+                append(p, left, " x ");
+                append(p, left, ggml_type_name(t->src[i]->type));
+            }
+            append(p, left, " -> ");
+        }
+        append(p, left, ggml_type_name(t->type));
+    }
+    // Name of the backend buffer holding `t`, or "NONE" when the tensor has no buffer.
+    const char * tensor_buff_name(const struct ggml_tensor * t) {
+        if (t->buffer) {
+            return ggml_backend_buffer_name(t->buffer);
+        }
+        return "NONE";
+    }
+    // Write the buffer name of every source followed by the buffer name of `t` itself.
+    void format_op_buffs(char * str, const struct ggml_tensor * t, size_t size = no_limit) {
+        if (size == 0) {
+            return;
+        }
+        char * p    = str;
+        size_t left = size;
+        // append all sources (if any), stopping at the first empty slot
+        if (t->src[0]) {
+            append(p, left, tensor_buff_name(t->src[0]));
+            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+                append(p, left, " x ");
+                append(p, left, tensor_buff_name(t->src[i]));
+            }
+            append(p, left, " -> ");
+        }
+        append(p, left, tensor_buff_name(t));
+    }
+    // Write the tensor name of every source followed by the name of `t` itself.
+    void format_op_names(char * str, const struct ggml_tensor * t, size_t size = no_limit) {
+        if (size == 0) {
+            return;
+        }
+        char * p    = str;
+        size_t left = size;
+        // append all sources (if any), stopping at the first empty slot
+        if (t->src[0]) {
+            append(p, left, t->src[0]->name);
+            for (int i = 1; i < GGML_MAX_SRC && t->src[i]; i++) {
+                append(p, left, " x ");
+                append(p, left, t->src[i]->name);
+            }
+            append(p, left, " -> ");
+        }
+        append(p, left, t->name);
+    }
+    // Fill every field from `op`, bounding each write by the size of its field.
+    void format(const ggml_tensor * op) {
+        format_op_dims(dims, op, sizeof(dims));
+        format_op_strides(strides, op, sizeof(strides));
+        format_op_types(types, op, sizeof(types));
+        format_op_buffs(buffs, op, sizeof(buffs));
+        format_op_names(names, op, sizeof(names));
+    }
+    // A default-constructed description holds empty strings rather than indeterminate bytes.
+    op_desc() {
+        strides[0] = '\0';
+        dims[0]    = '\0';
+        types[0]   = '\0';
+        buffs[0]   = '\0';
+        names[0]   = '\0';
+    }
+    op_desc(const ggml_tensor * op) { format(op); }
+};
+#endif // OP_DESC_H

data/ext/sources/ggml/src/ggml-hip/CMakeLists.txt CHANGED Viewed

@@ -28,9 +28,12 @@ if (CXX_IS_HIPCC)
                 " Prefer setting the HIP compiler directly. See README for details.")
     endif()
 else()
-    # Forward AMDGPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
-    if (AMDGPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
-        set(CMAKE_HIP_ARCHITECTURES ${AMDGPU_TARGETS})
+    # Forward (AMD)GPU_TARGETS to CMAKE_HIP_ARCHITECTURES.
+    if(AMDGPU_TARGETS AND NOT GPU_TARGETS)
+        set(GPU_TARGETS ${AMDGPU_TARGETS})
+    endif()
+    if(GPU_TARGETS AND NOT CMAKE_HIP_ARCHITECTURES)
+        set(CMAKE_HIP_ARCHITECTURES ${GPU_TARGETS})
     endif()
     cmake_minimum_required(VERSION 3.21)
     enable_language(HIP)
@@ -39,12 +42,6 @@ endif()
 find_package(hip     REQUIRED)
 find_package(hipblas REQUIRED)
 find_package(rocblas REQUIRED)
-if (GGML_HIP_ROCWMMA_FATTN)
-    CHECK_INCLUDE_FILE_CXX("rocwmma/rocwmma.hpp" FOUND_ROCWMMA)
-    if (NOT ${FOUND_ROCWMMA})
-        message(FATAL_ERROR "rocwmma has not been found")
-    endif()
-endif()
 if (${hip_VERSION} VERSION_LESS 6.1)
     message(FATAL_ERROR "At least ROCM/HIP V6.1 is required")
@@ -59,6 +56,8 @@ file(GLOB   GGML_HEADERS_ROCM "../ggml-cuda/*.cuh")
 list(APPEND GGML_HEADERS_ROCM "../../include/ggml-cuda.h")
 file(GLOB   GGML_SOURCES_ROCM "../ggml-cuda/*.cu")
+file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-tile*.cu")
+list(APPEND GGML_SOURCES_ROCM ${SRCS})
 file(GLOB   SRCS "../ggml-cuda/template-instances/fattn-mma*.cu")
 list(APPEND GGML_SOURCES_ROCM ${SRCS})
 file(GLOB   SRCS "../ggml-cuda/template-instances/mmq*.cu")
@@ -117,10 +116,6 @@ if (NOT GGML_HIP_MMQ_MFMA)
     add_compile_definitions(GGML_HIP_NO_MMQ_MFMA)
 endif()
-if (GGML_HIP_FORCE_ROCWMMA_FATTN_GFX12 OR ${hip_VERSION} VERSION_GREATER_EQUAL 7.0)
-    add_compile_definitions(GGML_HIP_ROCWMMA_FATTN_GFX12)
-endif()
 if (GGML_HIP_EXPORT_METRICS)
     set(CMAKE_HIP_FLAGS "${CMAKE_HIP_FLAGS} -Rpass-analysis=kernel-resource-usage --save-temps")
 endif()

data/ext/sources/ggml/src/ggml-impl.h CHANGED Viewed

@@ -24,10 +24,6 @@
 #include <arm_neon.h>
 #endif
-#if defined(__F16C__)
-#include <immintrin.h>
-#endif
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -102,6 +98,9 @@ static bool ggml_op_is_empty(enum ggml_op op) {
     }
 }
+// Softplus, log(1 + exp(input)); inputs above 20 are returned unchanged.
+static inline float ggml_compute_softplus_f32(float input) {
+    if (input > 20.0f) {
+        return input;
+    }
+    return logf(1 + expf(input));
+}
 //
 // logging
 //
@@ -562,14 +561,23 @@ static inline ggml_bf16_t ggml_compute_fp32_to_bf16(float s) {
 #define GGML_FP32_TO_BF16(x) ggml_compute_fp32_to_bf16(x)
 #define GGML_BF16_TO_FP32(x) ggml_compute_bf16_to_fp32(x)
+// Use count recorded for the node at node_idx, or 0 when the node's slot in the
+// graph's visited hash set is not marked as used.
+static inline int32_t ggml_node_get_use_count(const struct ggml_cgraph * cgraph, int node_idx) {
+    const struct ggml_tensor * tensor = cgraph->nodes[node_idx];
+    const size_t slot = ggml_hash_find(&cgraph->visited_hash_set, tensor);
+    if (ggml_bitset_get(cgraph->visited_hash_set.used, slot)) {
+        return cgraph->use_counts[slot];
+    }
+    return 0;
+}
 // return true if the node's results are only used by N other nodes
 // and can be fused into their calculations.
 static inline bool ggml_node_has_n_uses(const struct ggml_cgraph * cgraph, int node_idx, int32_t n_uses) {
     const struct ggml_tensor * node = cgraph->nodes[node_idx];
     // check the use count against how many we're replacing
-    size_t hash_pos = ggml_hash_find(&cgraph->visited_hash_set, node);
-    if (!ggml_bitset_get(cgraph->visited_hash_set.used, hash_pos) || cgraph->use_counts[hash_pos] != n_uses) {
+    if (ggml_node_get_use_count(cgraph, node_idx) != n_uses) {
         return false;
     }
@@ -635,11 +643,42 @@ static inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx
     return ggml_can_fuse_ext(cgraph, idxs, ops, num_ops);
 }
+// Declaration only: takes an explicit list of `count` node indices. The inline
+// ggml_can_fuse_subgraph() wrapper forwards a contiguous range of indices to it.
+GGML_API bool ggml_can_fuse_subgraph_ext(const struct ggml_cgraph * cgraph,
+                                         const int *                node_idxs,
+                                         int                        count,
+                                         const enum ggml_op *       ops,
+                                         const int *                outputs,
+                                         int                        num_outputs);
+// Returns true if the `count` consecutive nodes starting at node_idx can be fused.
+// Checks whether all nodes which are not part of `outputs` can be elided,
+// i.e. whether their uses are confined to the subgraph.
+// Returns false when the range would run past the end of the graph; otherwise the
+// decision is delegated to ggml_can_fuse_subgraph_ext().
+static inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph * cgraph,
+                                          int                        node_idx,
+                                          int                        count,
+                                          const enum ggml_op *       ops,
+                                          const int *                outputs,
+                                          int                        num_outputs) {
+    GGML_ASSERT(count < 32);
+    const int end_idx = node_idx + count;
+    if (end_idx > cgraph->n_nodes) {
+        return false;
+    }
+    // expand the contiguous range [node_idx, end_idx) into an explicit index list
+    int idxs[32];
+    int n_idxs = 0;
+    for (int idx = node_idx; idx < end_idx; ++idx) {
+        idxs[n_idxs++] = idx;
+    }
+    return ggml_can_fuse_subgraph_ext(cgraph, idxs, count, ops, outputs, num_outputs);
+}
 #ifdef __cplusplus
 }
 #endif
 #ifdef __cplusplus
+#include <array>
 #include <initializer_list>
 #include <vector>
@@ -648,6 +687,28 @@ inline bool ggml_can_fuse(const struct ggml_cgraph * cgraph, int node_idx, std::
     return ggml_can_fuse(cgraph, node_idx, ops.begin(), (int)ops.size());
 }
+// Convenience overload: checks whether the ops listed in `ops`, starting at start_idx,
+// can be fused; `outputs` optionally lists the output nodes and defaults to empty.
+inline bool ggml_can_fuse_subgraph(const struct ggml_cgraph *          cgraph,
+                                   int                                 start_idx,
+                                   std::initializer_list<enum ggml_op> ops,
+                                   std::initializer_list<int>          outputs = {}) {
+    // initializer_list::size() is size_t while the C entry point takes int counts;
+    // cast explicitly, as the ggml_can_fuse() overload does
+    return ggml_can_fuse_subgraph(cgraph, start_idx, (int) ops.size(), ops.begin(), outputs.begin(), (int) outputs.size());
+}
+// Return true if the edges in the graph match expectations.
+// Each edge is {dst_node, src_idx, src_node}, with node positions given relative to
+// start_idx: nodes[start_idx + dst_node]->src[src_idx] must be nodes[start_idx + src_node].
+inline bool ggml_check_edges(const struct ggml_cgraph *                cgraph,
+                             int                                       start_idx,
+                             std::initializer_list<std::array<int, 3>> edges) {
+    for (const std::array<int, 3> & e : edges) {
+        const struct ggml_tensor * consumer = cgraph->nodes[start_idx + e[0]];
+        const struct ggml_tensor * producer = cgraph->nodes[start_idx + e[2]];
+        if (consumer->src[e[1]] != producer) {
+            return false;
+        }
+    }
+    return true;
+}
 // expose GGUF internals for test code
 GGML_API size_t gguf_type_size(enum gguf_type type);
 GGML_API struct gguf_context * gguf_init_from_file_impl(FILE * file, struct gguf_init_params params);

data/ext/sources/ggml/src/ggml-metal/ggml-metal-common.cpp CHANGED Viewed

@@ -112,7 +112,7 @@ static bool ggml_mem_ranges_add_dst(ggml_mem_ranges_t mrs, const ggml_tensor * t
 }
 bool ggml_mem_ranges_add(ggml_mem_ranges_t mrs, const ggml_tensor * tensor) {
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
         if (tensor->src[i]) {
             ggml_mem_ranges_add_src(mrs, tensor->src[i]);
         }
@@ -173,7 +173,7 @@ static bool ggml_mem_ranges_check_dst(ggml_mem_ranges_t mrs, const ggml_tensor *
 }
 bool ggml_mem_ranges_check(ggml_mem_ranges_t mrs, const ggml_tensor * tensor) {
-    for (int i = 0; i < GGML_MAX_DIMS; i++) {
+    for (int i = 0; i < GGML_MAX_SRC; i++) {
         if (tensor->src[i]) {
             if (!ggml_mem_ranges_check_src(mrs, tensor->src[i])) {
                 return false;

data/ext/sources/ggml/src/ggml-metal/ggml-metal-context.m CHANGED Viewed

@@ -24,9 +24,6 @@ struct ggml_metal_command_buffer {
 };
 struct ggml_metal {
-    id<MTLDevice>       device;
-    id<MTLCommandQueue> queue; // currently a pointer to the device queue, but might become separate queue [TAG_QUEUE_PER_BACKEND]
     ggml_metal_device_t  dev;
     ggml_metal_library_t lib;
@@ -35,7 +32,6 @@ struct ggml_metal {
     // additional, inference-time compiled pipelines
     ggml_metal_pipelines_t pipelines_ext;
-    bool use_bfloat;
     bool use_fusion;
     bool use_concurrency;
     bool use_graph_optimize;
@@ -92,15 +88,15 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
     // init context
     ggml_metal_t res = calloc(1, sizeof(struct ggml_metal));
-    res->device = ggml_metal_device_get_obj(dev);
+    id<MTLDevice> device = ggml_metal_device_get_obj(dev);
-    GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[res->device name] UTF8String]);
+    GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
     // TODO: would it be better to have one queue for the backend and one queue for the device?
     //       the graph encoders and async ops would use the backend queue while the sync ops would use the device queue?
     //res->queue = [device newCommandQueue]; [TAG_QUEUE_PER_BACKEND]
-    res->queue = ggml_metal_device_get_queue(dev);
-    if (res->queue == nil) {
+    id<MTLCommandQueue> queue = ggml_metal_device_get_queue(dev);
+    if (queue == nil) {
         GGML_LOG_ERROR("%s: error: failed to create command queue\n", __func__);
         return NULL;
     }
@@ -121,11 +117,10 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
         }
     }
-    const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
+    //const struct ggml_metal_device_props * props_dev = ggml_metal_device_get_props(dev);
     res->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
-    res->use_bfloat      = props_dev->has_bfloat;
     res->use_fusion      = getenv("GGML_METAL_FUSION_DISABLE") == nil;
     res->use_concurrency = getenv("GGML_METAL_CONCURRENCY_DISABLE") == nil;
@@ -147,7 +142,6 @@ ggml_metal_t ggml_metal_init(ggml_metal_device_t dev) {
     memset(res->fuse_cnt, 0, sizeof(res->fuse_cnt));
-    GGML_LOG_INFO("%s: use bfloat         = %s\n", __func__, res->use_bfloat         ? "true" : "false");
     GGML_LOG_INFO("%s: use fusion         = %s\n", __func__, res->use_fusion         ? "true" : "false");
     GGML_LOG_INFO("%s: use concurrency    = %s\n", __func__, res->use_concurrency    ? "true" : "false");
     GGML_LOG_INFO("%s: use graph optimize = %s\n", __func__, res->use_graph_optimize ? "true" : "false");
@@ -277,7 +271,8 @@ static struct ggml_metal_buffer_id ggml_metal_get_buffer_id(const struct ggml_te
 void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
     @autoreleasepool {
         // wrap the source data into a Metal buffer
-        id<MTLBuffer> buf_src = [ctx->device newBufferWithBytes:data
+        id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
+        id<MTLBuffer> buf_src = [device newBufferWithBytes:data
                                                          length:size
                                                         options:MTLResourceStorageModeShared];
@@ -292,7 +287,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
         // queue the copy operation into the queue of the Metal context
         // this will be queued at the end, after any currently ongoing GPU operations
-        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+        id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
+        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
         id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
         [encoder copyFromBuffer:buf_src
@@ -303,6 +299,7 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
         [encoder endEncoding];
         [cmd_buf commit];
+        [buf_src release];
         // do not wait here for completion
         //[cmd_buf waitUntilCompleted];
@@ -317,7 +314,8 @@ void ggml_metal_set_tensor_async(ggml_metal_t ctx, struct ggml_tensor * tensor,
 void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
     @autoreleasepool {
-        id<MTLBuffer> buf_dst = [ctx->device newBufferWithBytesNoCopy:data
+        id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
+        id<MTLBuffer> buf_dst = [device newBufferWithBytesNoCopy:data
                                                                length:size
                                                               options:MTLResourceStorageModeShared
                                                           deallocator:nil];
@@ -333,7 +331,8 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
         // queue the copy operation into the queue of the Metal context
         // this will be queued at the end, after any currently ongoing GPU operations
-        id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+        id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
+        id<MTLCommandBuffer> cmd_buf = [queue commandBuffer];
         id<MTLBlitCommandEncoder> encoder = [cmd_buf blitCommandEncoder];
         [encoder copyFromBuffer:bid_src.metal
@@ -344,6 +343,7 @@ void ggml_metal_get_tensor_async(ggml_metal_t ctx, const struct ggml_tensor * te
         [encoder endEncoding];
         [cmd_buf commit];
+        [buf_dst release];
         // do not wait here for completion
         //[cmd_buf waitUntilCompleted];
@@ -363,6 +363,9 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
     // number of threads in addition to the main thread
     const int n_cb = ctx->n_cb;
+    // keep the memory wired
+    ggml_metal_device_rsets_keep_alive(ctx->dev);
     // submit the ggml compute graph to the GPU by creating command buffers and encoding the ops in them
     // the first n_nodes_0 are encoded and submitted for processing directly by the calling thread
     // while these nodes are processing, we start n_cb threads to enqueue the rest of the nodes
@@ -390,7 +393,8 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
             if (!ctx->capture_started) {
                 // create capture scope
-                ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:ctx->device];
+                id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
+                ctx->capture_scope = [[MTLCaptureManager sharedCaptureManager] newCaptureScopeWithDevice:device];
                 MTLCaptureDescriptor * descriptor = [MTLCaptureDescriptor new];
                 descriptor.captureObject = ctx->capture_scope;
@@ -407,10 +411,13 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
             }
         }
+        // short-hand
+        id<MTLCommandQueue> queue = ggml_metal_device_get_queue(ctx->dev);
         // the main thread commits the first few commands immediately
         // cmd_buf[n_cb]
         {
-            id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+            id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
             [cmd_buf retain];
             if (ctx->cmd_bufs[n_cb].obj) {
@@ -429,7 +436,7 @@ enum ggml_status ggml_metal_graph_compute(ggml_metal_t ctx, struct ggml_cgraph *
         // prepare the rest of the command buffers asynchronously (optional)
         // cmd_buf[0.. n_cb)
         for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) {
-            id<MTLCommandBuffer> cmd_buf = [ctx->queue commandBufferWithUnretainedReferences];
+            id<MTLCommandBuffer> cmd_buf = [queue commandBufferWithUnretainedReferences];
             [cmd_buf retain];
             if (ctx->cmd_bufs[cb_idx].obj) {
@@ -590,9 +597,11 @@ void ggml_metal_set_abort_callback(ggml_metal_t ctx, ggml_abort_callback abort_c
 }
 bool ggml_metal_supports_family(ggml_metal_t ctx, int family) {
-    GGML_ASSERT(ctx->device != nil);
+    GGML_ASSERT(ctx->dev != nil);
+    id<MTLDevice> device = ggml_metal_device_get_obj(ctx->dev);
-    return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
+    return [device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
 }
 void ggml_metal_capture_next_compute(ggml_metal_t ctx) {