cui-llama.rn 1.7.4 → 1.7.6

This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
Files changed (276):
  1. package/README.md +217 -17
  2. package/android/src/main/CMakeLists.txt +34 -15
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +79 -5
  4. package/android/src/main/java/com/rnllama/RNLlama.java +237 -0
  5. package/android/src/main/jni.cpp +213 -14
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  14. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +35 -0
  15. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +34 -0
  16. package/cpp/README.md +1 -1
  17. package/cpp/chat-parser.cpp +385 -0
  18. package/cpp/chat-parser.h +120 -0
  19. package/cpp/chat.cpp +726 -596
  20. package/cpp/chat.h +71 -6
  21. package/cpp/common.cpp +56 -38
  22. package/cpp/common.h +9 -3
  23. package/cpp/ggml-backend-reg.cpp +5 -0
  24. package/cpp/ggml-backend.cpp +10 -2
  25. package/cpp/ggml-common.h +4 -0
  26. package/cpp/ggml-cpu/amx/amx.cpp +1 -1
  27. package/cpp/ggml-cpu/amx/mmq.cpp +11 -10
  28. package/cpp/ggml-cpu/arch/arm/cpu-feats.cpp +94 -0
  29. package/cpp/ggml-cpu/arch/arm/quants.c +4114 -0
  30. package/cpp/ggml-cpu/arch/arm/repack.cpp +2163 -0
  31. package/cpp/ggml-cpu/arch/x86/cpu-feats.cpp +327 -0
  32. package/cpp/ggml-cpu/arch/x86/quants.c +4311 -0
  33. package/cpp/ggml-cpu/{ggml-cpu-aarch64.cpp → arch/x86/repack.cpp} +79 -3225
  34. package/cpp/ggml-cpu/arch-fallback.h +184 -0
  35. package/cpp/ggml-cpu/common.h +4 -3
  36. package/cpp/ggml-cpu/ggml-cpu-impl.h +21 -16
  37. package/cpp/ggml-cpu/ggml-cpu.c +123 -104
  38. package/cpp/ggml-cpu/ggml-cpu.cpp +11 -8
  39. package/cpp/ggml-cpu/ops.cpp +330 -148
  40. package/cpp/ggml-cpu/ops.h +1 -0
  41. package/cpp/ggml-cpu/quants.c +1158 -0
  42. package/cpp/ggml-cpu/{ggml-cpu-quants.h → quants.h} +26 -0
  43. package/cpp/ggml-cpu/repack.cpp +1571 -0
  44. package/cpp/ggml-cpu/repack.h +98 -0
  45. package/cpp/ggml-cpu/simd-mappings.h +330 -38
  46. package/cpp/ggml-cpu/{ggml-cpu-traits.cpp → traits.cpp} +1 -1
  47. package/cpp/ggml-cpu/vec.cpp +87 -18
  48. package/cpp/ggml-cpu/vec.h +249 -94
  49. package/cpp/ggml-cpu.h +1 -0
  50. package/cpp/ggml-impl.h +63 -183
  51. package/cpp/ggml-llama-sim.metallib +0 -0
  52. package/cpp/ggml-llama.metallib +0 -0
  53. package/cpp/ggml-metal.m +152 -45
  54. package/cpp/ggml-quants.c +0 -2
  55. package/cpp/ggml.c +61 -21
  56. package/cpp/ggml.h +22 -3
  57. package/cpp/gguf.cpp +24 -3
  58. package/cpp/json-partial.cpp +256 -0
  59. package/cpp/json-partial.h +38 -0
  60. package/cpp/json-schema-to-grammar.cpp +5 -47
  61. package/cpp/json-schema-to-grammar.h +4 -4
  62. package/cpp/llama-arch.cpp +153 -3
  63. package/cpp/llama-arch.h +27 -1
  64. package/cpp/llama-batch.cpp +741 -272
  65. package/cpp/llama-batch.h +112 -54
  66. package/cpp/llama-chat.cpp +30 -8
  67. package/cpp/llama-chat.h +1 -0
  68. package/cpp/llama-context.cpp +524 -339
  69. package/cpp/llama-context.h +38 -17
  70. package/cpp/llama-cparams.cpp +4 -0
  71. package/cpp/llama-cparams.h +2 -0
  72. package/cpp/llama-grammar.cpp +12 -2
  73. package/cpp/llama-graph.cpp +431 -356
  74. package/cpp/llama-graph.h +126 -58
  75. package/cpp/llama-hparams.cpp +10 -2
  76. package/cpp/llama-hparams.h +19 -2
  77. package/cpp/llama-kv-cache-unified-iswa.cpp +279 -0
  78. package/cpp/llama-kv-cache-unified-iswa.h +128 -0
  79. package/cpp/llama-kv-cache-unified.cpp +1841 -0
  80. package/cpp/llama-kv-cache-unified.h +303 -0
  81. package/cpp/llama-kv-cells.h +439 -0
  82. package/cpp/llama-memory-hybrid.cpp +246 -0
  83. package/cpp/llama-memory-hybrid.h +138 -0
  84. package/cpp/llama-memory-recurrent.cpp +1112 -0
  85. package/cpp/llama-memory-recurrent.h +183 -0
  86. package/cpp/llama-memory.cpp +41 -0
  87. package/cpp/llama-memory.h +86 -5
  88. package/cpp/llama-mmap.cpp +1 -1
  89. package/cpp/llama-model-loader.cpp +42 -17
  90. package/cpp/llama-model-saver.cpp +1 -0
  91. package/cpp/llama-model.cpp +1639 -513
  92. package/cpp/llama-model.h +26 -0
  93. package/cpp/llama-sampling.cpp +2 -2
  94. package/cpp/llama-vocab.cpp +65 -28
  95. package/cpp/llama-vocab.h +1 -0
  96. package/cpp/llama.cpp +11 -7
  97. package/cpp/llama.h +150 -42
  98. package/cpp/minja/chat-template.hpp +1 -1
  99. package/cpp/minja/minja.hpp +1 -1
  100. package/cpp/{json.hpp → nlohmann/json.hpp} +3027 -2267
  101. package/cpp/nlohmann/json_fwd.hpp +187 -0
  102. package/cpp/regex-partial.cpp +204 -0
  103. package/cpp/regex-partial.h +56 -0
  104. package/cpp/rn-llama.cpp +646 -35
  105. package/cpp/rn-llama.h +32 -1
  106. package/cpp/rn-tts.h +39 -0
  107. package/cpp/sampling.cpp +7 -8
  108. package/cpp/tools/mtmd/clip-impl.h +5 -0
  109. package/cpp/tools/mtmd/clip.cpp +572 -436
  110. package/cpp/tools/mtmd/clip.h +14 -4
  111. package/cpp/tools/mtmd/mtmd-audio.cpp +0 -86
  112. package/cpp/tools/mtmd/mtmd-audio.h +2 -17
  113. package/cpp/tools/mtmd/mtmd-helper.cpp +175 -12
  114. package/cpp/tools/mtmd/mtmd-helper.h +91 -0
  115. package/cpp/tools/mtmd/mtmd.cpp +368 -248
  116. package/cpp/tools/mtmd/mtmd.h +6 -70
  117. package/cpp/unicode.cpp +5 -0
  118. package/ios/CMakeLists.txt +26 -6
  119. package/ios/RNLlama.h +1 -1
  120. package/ios/RNLlama.mm +153 -3
  121. package/ios/RNLlamaContext.h +9 -1
  122. package/ios/RNLlamaContext.mm +112 -9
  123. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  124. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/chat.h +71 -6
  125. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/common.h +9 -3
  126. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  127. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  128. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  129. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/ggml.h +22 -3
  130. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  131. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  132. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  133. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  134. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  135. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  136. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  137. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  138. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  139. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  140. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  141. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  142. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  143. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  144. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  145. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  146. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  147. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama.h +150 -42
  148. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  149. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  150. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/{json.hpp → nlohmann/json.hpp} +3027 -2267
  151. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  152. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  153. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  154. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  155. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  156. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/rnllama +0 -0
  157. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  158. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  159. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  160. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  161. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  162. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  163. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  164. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  165. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  166. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  167. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  168. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  169. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  170. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  171. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  172. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  173. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  174. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  175. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  176. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  177. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  178. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  179. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  180. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  181. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  182. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  183. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  184. package/ios/rnllama.xcframework/{tvos-arm64/rnllama.framework/Headers → ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  185. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  186. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  187. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  188. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  189. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  190. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  191. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat-parser.h +120 -0
  192. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/chat.h +71 -6
  193. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/common.h +9 -3
  194. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-common.h +4 -0
  195. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-cpu.h +1 -0
  196. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml-impl.h +63 -183
  197. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/ggml.h +22 -3
  198. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-partial.h +38 -0
  199. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  200. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-arch.h +27 -1
  201. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-batch.h +112 -54
  202. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-chat.h +1 -0
  203. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-context.h +38 -17
  204. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-cparams.h +2 -0
  205. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-graph.h +126 -58
  206. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-hparams.h +19 -2
  207. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  208. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  209. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  210. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  211. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  212. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-memory.h +86 -5
  213. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-model.h +26 -0
  214. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-vocab.h +1 -0
  215. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama.h +150 -42
  216. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  217. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/minja/minja.hpp +1 -1
  218. package/ios/rnllama.xcframework/{ios-arm64_x86_64-simulator/rnllama.framework/Headers → tvos-arm64/rnllama.framework/Headers/nlohmann}/json.hpp +3027 -2267
  219. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  220. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/regex-partial.h +56 -0
  221. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-llama.h +32 -1
  222. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/rn-tts.h +39 -0
  223. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/ggml-llama.metallib +0 -0
  224. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/rnllama +0 -0
  225. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat-parser.h +120 -0
  226. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/chat.h +71 -6
  227. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/common.h +9 -3
  228. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-common.h +4 -0
  229. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-cpu.h +1 -0
  230. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml-impl.h +63 -183
  231. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/ggml.h +22 -3
  232. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-partial.h +38 -0
  233. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json-schema-to-grammar.h +4 -4
  234. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-arch.h +27 -1
  235. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-batch.h +112 -54
  236. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-chat.h +1 -0
  237. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-context.h +38 -17
  238. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-cparams.h +2 -0
  239. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-graph.h +126 -58
  240. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-hparams.h +19 -2
  241. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified-iswa.h +128 -0
  242. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache-unified.h +303 -0
  243. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cells.h +439 -0
  244. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-hybrid.h +138 -0
  245. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory-recurrent.h +183 -0
  246. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-memory.h +86 -5
  247. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-model.h +26 -0
  248. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-vocab.h +1 -0
  249. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama.h +150 -42
  250. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/chat-template.hpp +1 -1
  251. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/minja/minja.hpp +1 -1
  252. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json.hpp +25526 -0
  253. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/nlohmann/json_fwd.hpp +187 -0
  254. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/regex-partial.h +56 -0
  255. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-llama.h +32 -1
  256. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/rn-tts.h +39 -0
  257. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/ggml-llama-sim.metallib +0 -0
  258. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/rnllama +0 -0
  259. package/jest/mock.js +24 -0
  260. package/package.json +1 -1
  261. package/src/NativeRNLlama.ts +46 -2
  262. package/src/index.ts +105 -1
  263. package/cpp/ggml-cpu/ggml-cpu-aarch64.h +0 -8
  264. package/cpp/ggml-cpu/ggml-cpu-quants.c +0 -13326
  265. package/cpp/ggml-cpu/sgemm.cpp +0 -3544
  266. package/cpp/ggml-cpu/sgemm.h +0 -14
  267. package/cpp/llama-kv-cache.cpp +0 -2827
  268. package/cpp/llama-kv-cache.h +0 -515
  269. package/ios/rnllama.xcframework/ios-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  270. package/ios/rnllama.xcframework/ios-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  271. package/ios/rnllama.xcframework/tvos-arm64/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  272. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/json.hpp +0 -24766
  273. package/ios/rnllama.xcframework/tvos-arm64_x86_64-simulator/rnllama.framework/Headers/llama-kv-cache.h +0 -515
  274. /package/cpp/ggml-cpu/{ggml-cpu-traits.h → traits.h} +0 -0
  275. /package/cpp/tools/mtmd/{miniaudio.h → miniaudio/miniaudio.h} +0 -0
  276. /package/cpp/tools/mtmd/{stb_image.h → stb/stb_image.h} +0 -0
package/cpp/ggml-metal.m CHANGED
@@ -48,22 +48,28 @@ static struct lm_ggml_backend_metal_device_context {
     int mtl_device_ref_count;
     id<MTLLibrary> mtl_library;
 
+    NSLock * mtl_lock;
+
     bool has_simdgroup_reduction;
     bool has_simdgroup_mm;
     bool has_residency_sets;
     bool has_bfloat;
     bool use_bfloat;
 
+    size_t max_size;
+
     char name[128];
 } g_lm_ggml_ctx_dev_main = {
     /*.mtl_device =*/ nil,
     /*.mtl_device_ref_count =*/ 0,
     /*.mtl_library =*/ nil,
+    /*.mtl_lock =*/ nil,
     /*.has_simdgroup_reduction =*/ false,
     /*.has_simdgroup_mm =*/ false,
     /*.has_residency_sets =*/ false,
     /*.has_bfloat =*/ false,
     /*.use_bfloat =*/ false,
+    /*.max_size =*/ 0,
     /*.name =*/ "",
 };
 
@@ -71,6 +77,10 @@ static struct lm_ggml_backend_metal_device_context {
 static id<MTLDevice> lm_ggml_backend_metal_device_acq(struct lm_ggml_backend_metal_device_context * ctx) {
     assert(ctx != NULL);
 
+    if (ctx->mtl_lock == nil) {
+        ctx->mtl_lock = [[NSLock alloc] init];
+    }
+
     if (ctx->mtl_device == nil) {
         ctx->mtl_device = MTLCreateSystemDefaultDevice();
     }
@@ -94,6 +104,8 @@ static id<MTLDevice> lm_ggml_backend_metal_device_acq(struct lm_ggml_backend_met
         ctx->use_bfloat = false;
 #endif
 
+        ctx->max_size = ctx->mtl_device.maxBufferLength;
+
         strncpy(ctx->name, [[ctx->mtl_device name] UTF8String], sizeof(ctx->name) - 1);
     }
 
@@ -110,6 +122,11 @@ static void lm_ggml_backend_metal_device_rel(struct lm_ggml_backend_metal_device
     ctx->mtl_device_ref_count--;
 
     if (ctx->mtl_device_ref_count == 0) {
+        if (ctx->mtl_lock) {
+            [ctx->mtl_lock release];
+            ctx->mtl_lock = nil;
+        }
+
         if (ctx->mtl_library) {
             [ctx->mtl_library release];
             ctx->mtl_library = nil;
@@ -194,11 +211,14 @@ enum lm_ggml_metal_kernel_type {
     LM_GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32,
     LM_GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32,
     LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32,
+    LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32_C4,
     LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32,
+    LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_C4,
     LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW,
     LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4,
     LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16,
     LM_GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32,
+    LM_GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_C4,
     LM_GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW,
     LM_GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4,
     LM_GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16,
@@ -498,6 +518,7 @@ enum lm_ggml_metal_kernel_type {
     LM_GGML_METAL_KERNEL_TYPE_COS,
     LM_GGML_METAL_KERNEL_TYPE_NEG,
     LM_GGML_METAL_KERNEL_TYPE_SUM_ROWS,
+    LM_GGML_METAL_KERNEL_TYPE_MEAN,
     LM_GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32,
     LM_GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32,
     LM_GGML_METAL_KERNEL_TYPE_ARGMAX,
@@ -976,7 +997,7 @@ static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(lm_ggml_backend
     struct lm_ggml_backend_metal_context * ctx = calloc(1, sizeof(struct lm_ggml_backend_metal_context));
     struct lm_ggml_backend_metal_device_context * ctx_dev = dev->context;
 
-    id<MTLDevice> device = lm_ggml_backend_metal_device_acq(ctx_dev);
+    id<MTLDevice> device = ctx_dev->mtl_device;
 
     LM_GGML_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
@@ -990,9 +1011,16 @@ static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(lm_ggml_backend
     ctx->d_queue = dispatch_queue_create("ggml-metal", DISPATCH_QUEUE_CONCURRENT);
 
     // load library
-    if (ctx_dev->mtl_library == nil) {
-        ctx_dev->mtl_library = lm_ggml_metal_load_library(device, ctx_dev->use_bfloat);
+    {
+        [ctx_dev->mtl_lock lock];
+
+        if (ctx_dev->mtl_library == nil) {
+            ctx_dev->mtl_library = lm_ggml_metal_load_library(device, ctx_dev->use_bfloat);
+        }
+
+        [ctx_dev->mtl_lock unlock];
     }
+
     id<MTLLibrary> metal_library = ctx_dev->mtl_library;
     if (metal_library == nil) {
         LM_GGML_LOG_ERROR("%s: error: metal library is nil\n", __func__);
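Note on the hunks above: the library load is now guarded by the per-device `mtl_lock`, so two contexts initializing concurrently can no longer both invoke `lm_ggml_metal_load_library`. A minimal sketch of the same lazy-init-under-lock pattern in plain C (a pthread mutex stands in for `NSLock`; all names here are illustrative, not from the package):

```c
#include <pthread.h>
#include <stddef.h>

static pthread_mutex_t lib_lock = PTHREAD_MUTEX_INITIALIZER;
static void *lib = NULL; /* plays the role of ctx_dev->mtl_library */

/* only the first caller pays the load cost; later callers reuse the result */
static void *load_library_once(void *(*load)(void)) {
    pthread_mutex_lock(&lib_lock);
    if (lib == NULL) {
        lib = load();
    }
    pthread_mutex_unlock(&lib_lock);
    return lib;
}
```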
@@ -1150,11 +1178,14 @@ static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(lm_ggml_backend
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_RWKV_WKV6_F32, rwkv_wkv6_f32, true);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_RWKV_WKV7_F32, rwkv_wkv7_f32, true);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32, mul_mv_f32_f32, has_simdgroup_reduction);
+        LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32_C4, mul_mv_f32_f32_c4, true);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32, mul_mv_bf16_f32, has_simdgroup_reduction && use_bfloat);
+        LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_C4, mul_mv_bf16_f32_c4, use_bfloat);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW, mul_mv_bf16_f32_1row, has_simdgroup_reduction && use_bfloat);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4, mul_mv_bf16_f32_l4, has_simdgroup_reduction && use_bfloat);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_BF16, mul_mv_bf16_bf16, has_simdgroup_reduction && use_bfloat);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32, mul_mv_f16_f32, has_simdgroup_reduction);
+        LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_C4, mul_mv_f16_f32_c4, true);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW, mul_mv_f16_f32_1row, has_simdgroup_reduction);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4, mul_mv_f16_f32_l4, has_simdgroup_reduction);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F16, mul_mv_f16_f16, has_simdgroup_reduction);
@@ -1454,6 +1485,7 @@ static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(lm_ggml_backend
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_COS, cos, true);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_NEG, neg, true);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
+        LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MEAN, mean, true);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ARGMAX, argmax, true);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_POOL_2D_AVG_F32, pool_2d_avg_f32, true);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_POOL_2D_MAX_F32, pool_2d_max_f32, true);
@@ -1653,6 +1685,7 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_backend_metal_device_
         case LM_GGML_OP_LOG:
             return false; // TODO: implement
         case LM_GGML_OP_SUM_ROWS:
+        case LM_GGML_OP_MEAN:
         case LM_GGML_OP_SOFT_MAX:
         case LM_GGML_OP_GROUP_NORM:
             return has_simdgroup_reduction && lm_ggml_is_contiguous(op->src[0]);
@@ -2400,11 +2433,31 @@ static bool lm_ggml_metal_encode_node(
                 [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
             } break;
         case LM_GGML_OP_SUM_ROWS:
+        case LM_GGML_OP_MEAN:
             {
                 LM_GGML_ASSERT(src0->nb[0] == lm_ggml_type_size(src0->type));
 
-                id<MTLComputePipelineState> pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
+                id<MTLComputePipelineState> pipeline = nil;
+
+                switch (dst->op) {
+                    case LM_GGML_OP_SUM_ROWS:
+                        pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SUM_ROWS].pipeline;
+                        break;
+                    case LM_GGML_OP_MEAN:
+                        pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MEAN].pipeline;
+                        break;
+                    default:
+                        LM_GGML_ABORT("fatal error");
+                }
+
+                int nth = 32; // SIMD width
+
+                while (nth < ne00 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) {
+                    nth *= 2;
+                }
 
+                nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup);
+                nth = MIN(nth, ne00);
 
                 lm_ggml_metal_kargs_sum_rows args = {
                     /*.ne00 =*/ ne00,
@@ -2434,11 +2487,12 @@ static bool lm_ggml_metal_encode_node(
                 };
 
                 [encoder setComputePipelineState:pipeline];
-                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
-                [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
-                [encoder setBytes:&args length:sizeof(args) atIndex:2];
+                [encoder setBytes:&args length:sizeof(args) atIndex:0];
+                [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
+                [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+                [encoder setThreadgroupMemoryLength:32*sizeof(float) atIndex:0];
 
-                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
             } break;
         case LM_GGML_OP_SOFT_MAX:
             {
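The two hunks above move SUM_ROWS (and the new MEAN) from a single-thread threadgroup to a sized one: `nth` starts at the 32-wide SIMD width, doubles until it covers the row or hits the pipeline's thread limit, and is then clamped; the dispatch also reserves 32 floats of threadgroup memory for the simdgroup reduction. A standalone sketch of that sizing rule (illustrative values; `pick_nth` is a hypothetical helper name):

```c
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* mirrors the nth selection in the SUM_ROWS/MEAN hunk above */
static int pick_nth(int ne00, int max_threads_per_tg) {
    int nth = 32; /* SIMD width */
    while (nth < ne00 && nth < max_threads_per_tg) {
        nth *= 2;
    }
    nth = MIN(nth, max_threads_per_tg);
    nth = MIN(nth, ne00);
    return nth;
}

int main(void) {
    printf("%d\n", pick_nth(300, 1024));  /* doubles 32 -> 512, clamps to 300 */
    printf("%d\n", pick_nth(4096, 1024)); /* capped by the pipeline limit: 1024 */
    return 0;
}
```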
@@ -3063,14 +3117,23 @@ static bool lm_ggml_metal_encode_node(
                         nsg = 1;
                         nr0 = 1;
                         nr1 = 4;
-                        pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline;
+                        if (ne00 == 4) {
+                            nr0 = 32;
+                            pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32_C4].pipeline;
+                        } else {
+                            pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F32_F32].pipeline;
+                        }
                     } break;
                 case LM_GGML_TYPE_F16:
                     {
                         nsg = 1;
                         nr0 = 1;
                         if (src1t == LM_GGML_TYPE_F32) {
-                            if (ne11 * ne12 < 4) {
+                            if (ne00 == 4) {
+                                nr0 = 32;
+                                nr1 = 4;
+                                pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_C4].pipeline;
+                            } else if (ne11 * ne12 < 4) {
                                 pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_1ROW].pipeline;
                             } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
                                 pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_F16_F32_L4].pipeline;
@@ -3089,7 +3152,11 @@ static bool lm_ggml_metal_encode_node(
                         nsg = 1;
                         nr0 = 1;
                         if (src1t == LM_GGML_TYPE_F32) {
-                            if (ne11 * ne12 < 4) {
+                            if (ne00 == 4) {
+                                nr0 = 32;
+                                nr1 = 4;
+                                pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_C4].pipeline;
+                            } else if (ne11 * ne12 < 4) {
                                 pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_1ROW].pipeline;
                             } else if (ne00 >= 128 && ne01 >= 8 && ne00%4 == 0) {
                                 pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_MV_BF16_F32_L4].pipeline;
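The two hunks above (together with the F32 case) route matrix-vector products whose rows hold exactly four elements (`ne00 == 4`) to the new `_c4` kernels, widening `nr0` to 32 so each threadgroup covers many short rows at once. A condensed sketch of the F16 selection order (hypothetical enum and function names; the real code picks Metal pipeline states):

```c
/* hypothetical condensation of the F16-src0 / F32-src1 kernel choice above */
typedef enum { MV_F16_F32_C4, MV_F16_F32_1ROW, MV_F16_F32_L4, MV_F16_F32 } mv_kernel;

static mv_kernel pick_f16_mv_kernel(int ne00, int ne01, int ne11, int ne12) {
    if (ne00 == 4)                                 return MV_F16_F32_C4;   /* 4-wide rows */
    if (ne11 * ne12 < 4)                           return MV_F16_F32_1ROW; /* small src1 batch */
    if (ne00 >= 128 && ne01 >= 8 && ne00 % 4 == 0) return MV_F16_F32_L4;   /* long rows */
    return MV_F16_F32;
}
```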
@@ -3733,6 +3800,7 @@ static bool lm_ggml_metal_encode_node(
                     nth *= 2;
                 }
 
+                nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup);
                 nth = MIN(nth, ne00/4);
 
                 lm_ggml_metal_kargs_rms_norm args = {
@@ -3769,6 +3837,7 @@ static bool lm_ggml_metal_encode_node(
                     nth *= 2;
                 }
 
+                nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup);
                 nth = MIN(nth, ne00/4);
 
                 lm_ggml_metal_kargs_l2_norm args = {
@@ -3841,6 +3910,7 @@ static bool lm_ggml_metal_encode_node(
                     nth *= 2;
                 }
 
+                nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup);
                 nth = MIN(nth, ne00/4);
 
                 lm_ggml_metal_kargs_norm args = {
@@ -4766,6 +4836,8 @@ static bool lm_ggml_metal_encode_node(
                 LM_GGML_ASSERT(nqptg % 8 == 0);
                 LM_GGML_ASSERT(ncpsg % 32 == 0);
 
+                const int is_q = lm_ggml_is_quantized(src1->type) ? 1 : 0;
+
                 // 2*(2*ncpsg + nqptg)*(nsg)
                 // ncpsg soft_max values + ncpsg mask values + a diagonal scaling matrix (in float)
                 //
@@ -4773,7 +4845,7 @@ static bool lm_ggml_metal_encode_node(
                 // the shared memory needed for the simdgroups to load the KV cache
                 // each thread loads (dequantizes) 16 head elements, there are 32 threads in th SG
                 //
-#define FATTN_SMEM(nsg) (LM_GGML_PAD((nqptg*(ne00 + 2*(2*ncpsg + nqptg)*(nsg)) + 16*32*(nsg))*(sizeof(float)/2), 16))
+#define FATTN_SMEM(nsg) (LM_GGML_PAD((nqptg*(2*ne00 + 2*(2*ncpsg + nqptg)*(nsg)) + is_q*(16*32*(nsg)))*(sizeof(float)/2), 16))
 
                 int64_t nsgmax = 2;
 
@@ -4810,9 +4882,9 @@ static bool lm_ggml_metal_encode_node(
                 // and store the soft_max values and the mask
                 //
                 // ne00*(nsg)
-                // each simdgroup has a full f16 head vector in shared mem to accumulate results
+                // each simdgroup has a full f32 head vector in shared mem to accumulate results
                 //
-#define FATTN_SMEM(nsg) (LM_GGML_PAD((nqptg*(LM_GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + ne20*(nsg))*(sizeof(float)/2), 16))
+#define FATTN_SMEM(nsg) (LM_GGML_PAD((nqptg*(LM_GGML_PAD(ne00, 128) + 4*ncpsg*(nsg)) + 2*ne20*(nsg))*(sizeof(float)/2), 16))
 
                 int64_t nsgmax = 2;
                 while (true) {
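The FATTN_SMEM hunks above change the flash-attention shared-memory budget: the per-simdgroup accumulator moves from f16 to f32, which is why the head-vector term doubles (the macro's `sizeof(float)/2` factor converts element counts to 2-byte units), and the 16x32 dequantization scratch is now charged only when the KV cache is quantized (`is_q`). A small C check that evaluates the new vector-kernel formula under illustrative sizes (all parameter values here are assumptions for the example):

```c
#include <stdio.h>
#include <stddef.h>

#define PAD(x, n) (((x) + (n) - 1) / (n) * (n))

/* the updated vector-kernel FATTN_SMEM from the hunk above, as a function */
static size_t fattn_smem_vec(int nqptg, int ne00, int ne20, int ncpsg, int nsg) {
    return PAD((size_t)(nqptg*(PAD(ne00, 128) + 4*ncpsg*nsg) + 2*ne20*nsg)*(sizeof(float)/2), 16);
}

int main(void) {
    /* illustrative: head size 128, 1 query per tg, 32 cache values, 2 simdgroups:
       (1*(128 + 4*32*2) + 2*128*2) * 2 bytes = (384 + 512) * 2 = 1792 */
    printf("%zu bytes\n", fattn_smem_vec(1, 128, 128, 32, 2));
    return 0;
}
```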
@@ -4925,8 +4997,39 @@ static bool lm_ggml_metal_encode_node(
                 default: LM_GGML_ABORT("not implemented");
             }
 
+            LM_GGML_ASSERT(ne00 % lm_ggml_blck_size(src0->type) == 0);
+
+            // TODO: support
+            //const int32_t nk00 = ne00/lm_ggml_blck_size(dst->type);
+            const int32_t nk00 = ne00;
+
+            int nth = 32; // SIMD width
+
+            while (nth < nk00 && nth < (int) pipeline.maxTotalThreadsPerThreadgroup) {
+                nth *= 2;
+            }
+
+            nth = MIN(nth, (int) pipeline.maxTotalThreadsPerThreadgroup);
+
+            // when rows are small, we can batch them together in a single threadgroup
+            int nrptg = 1;
+
+            // TODO: relax this constraint in the future
+            if (lm_ggml_blck_size(src0->type) == 1 && lm_ggml_blck_size(dst->type) == 1) {
+                if (nth > nk00) {
+                    nrptg = (nth + nk00 - 1)/nk00;
+                    nth   = nk00;
+
+                    if (nrptg*nth > (int) pipeline.maxTotalThreadsPerThreadgroup) {
+                        nrptg--;
+                    }
+                }
+            }
+
+            nth = MIN(nth, nk00);
+
             lm_ggml_metal_kargs_cpy args = {
-                /*.ne00 =*/ ne00,
+                /*.ne00 =*/ nk00,
                 /*.ne01 =*/ ne01,
                 /*.ne02 =*/ ne02,
                 /*.ne03 =*/ ne03,
@@ -4949,11 +5052,7 @@ static bool lm_ggml_metal_encode_node(
             [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1];
             [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
 
-            LM_GGML_ASSERT(ne00 % lm_ggml_blck_size(src0->type) == 0);
-            int nth = MIN(1024, ne00/lm_ggml_blck_size(src0->type));
-
-            [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
-
+            [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nrptg - 1)/nrptg, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, nrptg, 1)];
         } break;
     case LM_GGML_OP_SET:
         {
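In the CPY hunks above, the comment "when rows are small, we can batch them together in a single threadgroup" is implemented through `nrptg`: if the chosen `nth` exceeds the row width `nk00`, several rows share one threadgroup and the grid shrinks to `(ne01 + nrptg - 1)/nrptg` along the row axis. A standalone sketch of that computation (illustrative names and values):

```c
#include <stdio.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* mirrors the nth/nrptg selection above for blck_size == 1 tensors */
static void pick_cpy_tg(int nk00, int max_threads_per_tg, int *nth_out, int *nrptg_out) {
    int nth = 32;
    while (nth < nk00 && nth < max_threads_per_tg) {
        nth *= 2;
    }
    nth = MIN(nth, max_threads_per_tg);

    int nrptg = 1;
    if (nth > nk00) {                    /* rows narrower than the threadgroup */
        nrptg = (nth + nk00 - 1)/nk00;
        nth   = nk00;
        if (nrptg*nth > max_threads_per_tg) {
            nrptg--;
        }
    }
    *nth_out   = MIN(nth, nk00);
    *nrptg_out = nrptg;
}

int main(void) {
    int nth, nrptg;
    pick_cpy_tg(24, 1024, &nth, &nrptg);
    printf("nth=%d nrptg=%d\n", nth, nrptg); /* 24-wide rows -> 2 rows per threadgroup */
    return 0;
}
```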
@@ -5259,7 +5358,6 @@ static void lm_ggml_backend_metal_buffer_free_buffer(lm_ggml_backend_buffer_t bu
     }
 
     lm_ggml_backend_metal_buffer_rset_free(ctx);
-    lm_ggml_backend_metal_device_rel(buffer->buft->device->context);
 
     if (ctx->owned) {
 #if TARGET_OS_OSX
@@ -5368,7 +5466,10 @@ static lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_type_alloc_buffer(l
     }
 
     struct lm_ggml_backend_metal_device_context * ctx_dev = (struct lm_ggml_backend_metal_device_context *)buft->device->context;
-    id<MTLDevice> device = lm_ggml_backend_metal_device_acq(ctx_dev);
+
+    LM_GGML_ASSERT(ctx_dev->mtl_device != nil);
+
+    id<MTLDevice> device = ctx_dev->mtl_device;
 
     ctx->all_data = lm_ggml_metal_host_malloc(size_aligned);
     ctx->all_size = size_aligned;
@@ -5391,14 +5492,12 @@ static lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_type_alloc_buffer(l
     if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
         LM_GGML_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
         free(ctx);
-        lm_ggml_backend_metal_device_rel(ctx_dev);
         return NULL;
     }
 
     if (!lm_ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
         LM_GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
         free(ctx);
-        lm_ggml_backend_metal_device_rel(ctx_dev);
         return NULL;
     }
 
@@ -5409,17 +5508,14 @@ static lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_type_alloc_buffer(l
 
 static size_t lm_ggml_backend_metal_buffer_type_get_alignment(lm_ggml_backend_buffer_type_t buft) {
     return 32;
+
     LM_GGML_UNUSED(buft);
 }
 
 static size_t lm_ggml_backend_metal_buffer_type_get_max_size(lm_ggml_backend_buffer_type_t buft) {
-    id<MTLDevice> device = lm_ggml_backend_metal_device_acq(buft->device->context);
-    const size_t max_size = device.maxBufferLength;
-    lm_ggml_backend_metal_device_rel(buft->device->context);
+    const size_t max_size = ((struct lm_ggml_backend_metal_device_context *)buft->device->context)->max_size;
 
     return max_size;
-
-    LM_GGML_UNUSED(buft);
 }
 
 static bool lm_ggml_backend_metal_buffer_type_is_host(lm_ggml_backend_buffer_type_t buft) {
@@ -5492,7 +5588,10 @@ lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_from_ptr(void * data, size
     }
 
     struct lm_ggml_backend_metal_device_context * ctx_dev = &g_lm_ggml_ctx_dev_main;
-    id<MTLDevice> device = lm_ggml_backend_metal_device_acq(ctx_dev);
+
+    LM_GGML_ASSERT(ctx_dev->mtl_device != nil);
+
+    id<MTLDevice> device = ctx_dev->mtl_device;
 
     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {
@@ -5548,7 +5647,6 @@ lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_from_ptr(void * data, size
     if (!lm_ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
         LM_GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
         free(ctx);
-        lm_ggml_backend_metal_device_rel(ctx_dev);
         return NULL;
     }
 
@@ -5564,10 +5662,8 @@ static const char * lm_ggml_backend_metal_name(lm_ggml_backend_t backend) {
 }
 
 static void lm_ggml_backend_metal_free(lm_ggml_backend_t backend) {
-    struct lm_ggml_backend_metal_context * ctx = backend->context;
-    struct lm_ggml_backend_metal_device_context * ctx_dev = backend->device->context;
+    struct lm_ggml_backend_metal_context * ctx = backend->context;
 
-    lm_ggml_backend_metal_device_rel(ctx_dev);
     lm_ggml_metal_free(ctx);
 
     free(backend);
@@ -5707,6 +5803,8 @@ bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family
 
     struct lm_ggml_backend_metal_device_context * ctx_dev = backend->device->context;
 
+    LM_GGML_ASSERT(ctx_dev->mtl_device != nil);
+
     return [ctx_dev->mtl_device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
 }
 
@@ -5726,10 +5824,7 @@ static const char * lm_ggml_backend_metal_device_get_name(lm_ggml_backend_dev_t
 }
 
 static const char * lm_ggml_backend_metal_device_get_description(lm_ggml_backend_dev_t dev) {
-    // acq/rel just to populate ctx->name in case it hasn't been done yet
     struct lm_ggml_backend_metal_device_context * ctx_dev = (struct lm_ggml_backend_metal_device_context *)dev->context;
-    lm_ggml_backend_metal_device_acq(ctx_dev);
-    lm_ggml_backend_metal_device_rel(ctx_dev);
 
     return ctx_dev->name;
 }
@@ -5737,12 +5832,10 @@ static const char * lm_ggml_backend_metal_device_get_description(lm_ggml_backend
 static void lm_ggml_backend_metal_device_get_memory(lm_ggml_backend_dev_t dev, size_t * free, size_t * total) {
     if (@available(macOS 10.12, iOS 16.0, *)) {
         struct lm_ggml_backend_metal_device_context * ctx_dev = (struct lm_ggml_backend_metal_device_context *)dev->context;
-        id<MTLDevice> device = lm_ggml_backend_metal_device_acq(ctx_dev);
+        id<MTLDevice> device = ctx_dev->mtl_device;
 
         *total = device.recommendedMaxWorkingSetSize;
         *free = *total - device.currentAllocatedSize;
-
-        lm_ggml_backend_metal_device_rel(ctx_dev);
     } else {
         *free = 1;
         *total = 1;
@@ -5820,7 +5913,10 @@ static lm_ggml_backend_buffer_t lm_ggml_backend_metal_device_buffer_from_ptr(lm_
     }
 
     struct lm_ggml_backend_metal_device_context * ctx_dev = (struct lm_ggml_backend_metal_device_context *)dev->context;
-    id<MTLDevice> device = lm_ggml_backend_metal_device_acq(ctx_dev);
+
+    LM_GGML_ASSERT(ctx_dev->mtl_device != nil);
+
+    id<MTLDevice> device = ctx_dev->mtl_device;
 
     // the buffer fits into the max buffer size allowed by the device
     if (size_aligned <= device.maxBufferLength) {
@@ -5876,7 +5972,6 @@ static lm_ggml_backend_buffer_t lm_ggml_backend_metal_device_buffer_from_ptr(lm_
     if (!lm_ggml_backend_metal_buffer_rset_init(ctx, ctx_dev, device)) {
         LM_GGML_LOG_ERROR("%s: error: failed to initialize residency set\n", __func__);
         free(ctx);
-        lm_ggml_backend_metal_device_rel(ctx_dev);
         return NULL;
     }
 
@@ -5890,8 +5985,9 @@ static bool lm_ggml_backend_metal_device_supports_op(lm_ggml_backend_dev_t dev,
 }
 
 static bool lm_ggml_backend_metal_device_supports_buft(lm_ggml_backend_dev_t dev, lm_ggml_backend_buffer_type_t buft) {
-    return buft->iface.get_name == lm_ggml_backend_metal_buffer_type_get_name ||
-           buft->iface.get_name == lm_ggml_backend_metal_buffer_from_ptr_type_get_name;
+    return
+        buft->iface.get_name == lm_ggml_backend_metal_buffer_type_get_name ||
+        buft->iface.get_name == lm_ggml_backend_metal_buffer_from_ptr_type_get_name;
 
     LM_GGML_UNUSED(dev);
 }
@@ -5976,8 +6072,19 @@ static struct lm_ggml_backend_reg_i lm_ggml_backend_metal_reg_i = {
     /* .get_proc_address = */ lm_ggml_backend_metal_get_proc_address,
 };
 
+// called upon program exit
+static void lm_ggml_metal_cleanup(void) {
+    lm_ggml_backend_metal_device_rel(&g_lm_ggml_ctx_dev_main);
+}
+
+// TODO: make thread-safe
 lm_ggml_backend_reg_t lm_ggml_backend_metal_reg(void) {
-    // TODO: make this thread-safe somehow?
+    lm_ggml_backend_metal_device_acq(&g_lm_ggml_ctx_dev_main);
+
+    // register cleanup callback
+    // TODO: not ideal, but not sure if there is a better way to do this in Objective-C
+    atexit(lm_ggml_metal_cleanup);
+
     {
         g_lm_ggml_backend_metal_reg = (struct lm_ggml_backend_reg) {
             /* .api_version = */ LM_GGML_BACKEND_API_VERSION,
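The final hunk above moves device ownership to registration time: `lm_ggml_backend_metal_reg` acquires the device once, and an `atexit` hook releases it at process exit, which is what allows the per-buffer acquire/release pairs earlier in this diff to be deleted. The shape of that pattern in plain C (illustrative names, not the package's API):

```c
#include <stdio.h>
#include <stdlib.h>

static int device_refs = 0; /* plays the role of mtl_device_ref_count */

static void device_acq(void) { device_refs++; }

static void device_rel(void) {
    if (--device_refs == 0) {
        printf("releasing device, lock and library\n");
    }
}

/* acquire once at registration; release via atexit instead of per buffer */
static void backend_reg(void) {
    device_acq();
    atexit(device_rel);
}

int main(void) {
    backend_reg();
    return 0; /* device_rel runs here through the atexit hook */
}
```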
package/cpp/ggml-quants.c CHANGED
@@ -2425,8 +2425,6 @@ void dequantize_row_iq1_m(const block_iq1_m * LM_GGML_RESTRICT x, float * LM_GGM
     }
 }
 
-static const int8_t kvalues_iq4nl[16] = {-127, -104, -83, -65, -49, -35, -22, -10, 1, 13, 25, 38, 53, 69, 89, 113};
-
 void dequantize_row_iq4_nl(const block_iq4_nl * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int64_t k) {
     assert(k % QK4_NL == 0);
     const int64_t nb = k / QK4_NL;