cui-llama.rn 1.1.2 → 1.1.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/ggml-alloc.h CHANGED
@@ -7,8 +7,8 @@ extern "C" {
  #endif
 
  typedef struct lm_ggml_backend_buffer_type * lm_ggml_backend_buffer_type_t;
- typedef struct lm_ggml_backend_buffer * lm_ggml_backend_buffer_t;
- typedef struct lm_ggml_backend * lm_ggml_backend_t;
+ typedef struct lm_ggml_backend_buffer * lm_ggml_backend_buffer_t;
+ typedef struct lm_ggml_backend * lm_ggml_backend_t;
 
  // Tensor allocator
  struct lm_ggml_tallocr {
@@ -722,9 +722,11 @@ lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_hbm_buffer_type(void) {
  #endif
 
  struct lm_ggml_backend_cpu_context {
- int n_threads;
- void * work_data;
- size_t work_size;
+ int n_threads;
+ lm_ggml_threadpool_t threadpool;
+
+ void * work_data;
+ size_t work_size;
 
  lm_ggml_abort_callback abort_callback;
  void * abort_callback_data;
@@ -759,7 +761,7 @@ LM_GGML_CALL static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_
 
  struct lm_ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct lm_ggml_backend_plan_cpu));
 
- cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+ cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
  cpu_plan->cgraph = *cgraph; // FIXME: deep copy
 
  if (cpu_plan->cplan.work_size > 0) {
@@ -796,7 +798,7 @@ LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(l
  LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
  struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
 
- struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+ struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
 
  if (cpu_ctx->work_size < cplan.work_size) {
  free(cpu_ctx->work_data);
@@ -825,6 +827,10 @@ LM_GGML_CALL static bool lm_ggml_backend_cpu_supports_op(lm_ggml_backend_t backe
  op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
  case LM_GGML_OP_MUL_MAT:
  return op->src[1]->type == LM_GGML_TYPE_F32 || op->src[1]->type == lm_ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+ case LM_GGML_OP_ROPE_BACK:
+ return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
+ case LM_GGML_OP_IM2COL_BACK:
+ return op->src[0]->type == LM_GGML_TYPE_F32 && op->src[1]->type == LM_GGML_TYPE_F32;
  default:
  return true;
  }
@@ -873,6 +879,7 @@ lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
  }
 
  ctx->n_threads = LM_GGML_DEFAULT_N_THREADS;
+ ctx->threadpool = NULL;
  ctx->work_data = NULL;
  ctx->work_size = 0;
  ctx->abort_callback = NULL;
@@ -903,6 +910,18 @@ void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_thre
  ctx->n_threads = n_threads;
  }
 
+ void lm_ggml_backend_cpu_set_threadpool(lm_ggml_backend_t backend_cpu, lm_ggml_threadpool_t threadpool) {
+ LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
+
+ struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
+
+ if (ctx->threadpool && ctx->threadpool != threadpool) {
+ // already had a different threadpool, pause/suspend it before switching
+ lm_ggml_threadpool_pause(ctx->threadpool);
+ }
+ ctx->threadpool = threadpool;
+ }
+
  void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data) {
  LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
 
@@ -1150,6 +1169,11 @@ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sch
  }
  }
 
+ if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
+ // since the tensor is pre-allocated, it cannot be moved to another backend
+ LM_GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
+ }
+
  // graph input
  if (tensor->flags & LM_GGML_TENSOR_FLAG_INPUT) {
  cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
@@ -1629,7 +1653,7 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
  sched->prev_leaf_backend_ids = tmp;
  }
 
- int graph_size = graph->n_nodes + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
+ int graph_size = MAX(graph->n_nodes, graph->n_leafs) + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
  if (sched->graph.size < graph_size) {
  sched->graph.size = graph_size;
  sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct lm_ggml_tensor *));
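The resized formula above now has to cover the leaf array and every scheduler copy of the split inputs, not just the node list. A quick arithmetic sketch with hypothetical numbers (the actual LM_GGML_SCHED_MAX_SPLIT_INPUTS value and graph dimensions below are illustrative only):

    /* Sketch only: the new sizing formula with made-up values. */
    enum { N_NODES = 1200, N_LEAFS = 1500, N_SPLITS = 4, N_COPIES = 2, MAX_SPLIT_INPUTS = 30 };
    #define MAX(a, b) ((a) > (b) ? (a) : (b))
    /* old: 1200 + 4*30*2              = 1440  (ignores leafs and extra graph copies)
     * new: MAX(1200, 1500) + 4*30*2*2 = 1980  (also covers the larger leaf list)    */
    int graph_size = MAX(N_NODES, N_LEAFS) + N_SPLITS*MAX_SPLIT_INPUTS*2*N_COPIES;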
@@ -1681,6 +1705,7 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
  for (int c = 0; c < sched->n_copies; c++) {
  struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
  sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+ assert(graph_copy->size > graph_copy->n_leafs);
  graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
  }
  }
@@ -1694,6 +1719,7 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
  for (int c = 0; c < sched->n_copies; c++) {
  struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
  sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+ assert(graph_copy->size > graph_copy->n_leafs);
  graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
  }
  }
@@ -1704,6 +1730,7 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
  for (int i = 0; i < graph->n_leafs; i++) {
  struct lm_ggml_tensor * leaf = graph->leafs[i];
  sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
+ assert(graph_copy->size > graph_copy->n_leafs);
  graph_copy->leafs[graph_copy->n_leafs++] = leaf;
  }
  }
@@ -63,6 +63,7 @@ extern "C" {
  LM_GGML_API void lm_ggml_backend_tensor_set_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
  LM_GGML_API void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);
 
+ // "offset" refers to the offset of the tensor data for setting/getting data
  LM_GGML_API LM_GGML_CALL void lm_ggml_backend_tensor_set( struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
  LM_GGML_API LM_GGML_CALL void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);
 
@@ -102,6 +103,7 @@ extern "C" {
 
  LM_GGML_API LM_GGML_CALL bool lm_ggml_backend_is_cpu (lm_ggml_backend_t backend);
  LM_GGML_API void lm_ggml_backend_cpu_set_n_threads (lm_ggml_backend_t backend_cpu, int n_threads);
+ LM_GGML_API void lm_ggml_backend_cpu_set_threadpool (lm_ggml_backend_t backend_cpu, lm_ggml_threadpool_t threadpool);
  LM_GGML_API void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data);
 
  // Create a backend buffer from an existing pointer
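A rough sketch of how the new CPU-backend threadpool setter might be used together with the threadpool-aware lm_ggml_graph_plan shown in the earlier hunks. The lm_ggml_threadpool_params_default, lm_ggml_threadpool_new and lm_ggml_threadpool_free helpers are assumed to come from this ggml revision's ggml.h; treat this as an illustration rather than the package's documented API:

    #include "ggml.h"
    #include "ggml-backend.h"

    /* Sketch: attach a shared threadpool to the CPU backend so graph computation
     * reuses its workers instead of spawning threads on every call.
     * Assumes the lm_ggml_threadpool_* helpers exist in this ggml revision. */
    static void compute_with_threadpool(struct lm_ggml_cgraph * graph) {
        struct lm_ggml_threadpool_params params = lm_ggml_threadpool_params_default(4);
        lm_ggml_threadpool_t threadpool = lm_ggml_threadpool_new(&params);

        lm_ggml_backend_t backend = lm_ggml_backend_cpu_init();
        lm_ggml_backend_cpu_set_n_threads(backend, 4);
        lm_ggml_backend_cpu_set_threadpool(backend, threadpool);

        lm_ggml_backend_graph_compute(backend, graph); // the plan now carries the threadpool

        lm_ggml_backend_free(backend);
        lm_ggml_threadpool_free(threadpool);
    }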
package/cpp/ggml-common.h CHANGED
@@ -227,6 +227,25 @@ typedef struct {
  } block_q8_0x8;
  static_assert(sizeof(block_q8_0x8) == 8 * sizeof(lm_ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
 
+ //
+ // Ternary quantization
+ //
+
+ // 1.6875 bpw
+ typedef struct {
+ uint8_t qs[(QK_K - 4 * QK_K / 64) / 5]; // 5 elements per byte (3^5 = 243 < 256)
+ uint8_t qh[QK_K/64]; // 4 elements per byte
+ lm_ggml_half d;
+ } block_tq1_0;
+ static_assert(sizeof(block_tq1_0) == sizeof(lm_ggml_half) + QK_K / 64 + (QK_K - 4 * QK_K / 64) / 5, "wrong tq1_0 block size/padding");
+
+ // 2.0625 bpw
+ typedef struct {
+ uint8_t qs[QK_K/4]; // 2 bits per element
+ lm_ggml_half d;
+ } block_tq2_0;
+ static_assert(sizeof(block_tq2_0) == sizeof(lm_ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding");
+
  //
  // Super-block quantization structures
  //
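The bpw figures in the comments above follow directly from the block sizes. A small sanity check, assuming the usual QK_K == 256 super-block length and a 2-byte lm_ggml_half:

    /* Worked numbers for QK_K == 256 (illustrative, matching the static_asserts above):
     * block_tq1_0: 2 (d) + 256/64 = 4 (qh) + (256 - 16)/5 = 48 (qs) = 54 bytes
     *              54 bytes * 8 / 256 weights = 1.6875 bits per weight
     * block_tq2_0: 2 (d) + 256/4 = 64 (qs)                          = 66 bytes
     *              66 bytes * 8 / 256 weights = 2.0625 bits per weight            */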
@@ -361,6 +380,7 @@ typedef struct {
  } block_iq3_s;
  static_assert(sizeof(block_iq3_s) == sizeof(lm_ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
 
+ // 1.5625 bpw
  typedef struct {
  lm_ggml_half d;
  uint8_t qs[QK_K/8];
package/cpp/ggml-impl.h CHANGED
@@ -175,7 +175,7 @@ typedef __fp16 lm_ggml_fp16_internal_t;
 
  // 32-bit ARM compatibility
 
- // vaddvq_s16
+ // vaddlvq_s16
  // vpaddq_s16
  // vpaddq_s32
  // vaddvq_s32
@@ -185,12 +185,9 @@ typedef __fp16 lm_ggml_fp16_internal_t;
  // vzip1_u8
  // vzip2_u8
 
- inline static int32_t vaddvq_s16(int16x8_t v) {
- return
- (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
- (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
- (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
- (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+ inline static int32_t vaddlvq_s16(int16x8_t v) {
+ int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
+ return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
  }
 
  inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
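The 32-bit ARM fallback above now mirrors the AArch64 vaddlvq_s16 intrinsic: a widening sum of all eight int16 lanes, computed as two pairwise-widening adds followed by adding the two low 32-bit halves, which cannot overflow for eight int16 inputs. A scalar reference of the intended result, as an illustration rather than code from the package:

    #include <stdint.h>

    /* Scalar reference for vaddlvq_s16: widen each int16 lane to int32 and sum. */
    static int32_t vaddlvq_s16_ref(const int16_t v[8]) {
        int32_t sum = 0;
        for (int i = 0; i < 8; i++) {
            sum += (int32_t) v[i];
        }
        return sum;
    }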
@@ -632,8 +629,16 @@ inline static float lm_ggml_lookup_fp16_to_fp32(lm_ggml_fp16_t f) {
  #define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
  #endif
 
+ enum lm_ggml_cgraph_eval_order {
+ LM_GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+ LM_GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+ LM_GGML_CGRAPH_EVAL_ORDER_COUNT
+ };
+
  // bitset
 
+ typedef uint32_t lm_ggml_bitset_t;
+
  static_assert(sizeof(lm_ggml_bitset_t) == 4, "bitset_t constants must be updated");
  #define BITSET_SHR 5 // log2(sizeof(lm_ggml_bitset_t)*8)
  #define BITSET_MASK (sizeof(lm_ggml_bitset_t)*8 - 1)
@@ -659,6 +664,12 @@ static inline void lm_ggml_bitset_clear(lm_ggml_bitset_t * bitset, size_t i) {
  #define LM_GGML_HASHSET_FULL ((size_t)-1)
  #define LM_GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
 
+ struct lm_ggml_hash_set {
+ size_t size;
+ lm_ggml_bitset_t * used; // whether or not the keys are in use i.e. set
+ struct lm_ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if lm_ggml_bitset_get(used, i)
+ };
+
  struct lm_ggml_hash_set lm_ggml_hash_set_new(size_t size);
  void lm_ggml_hash_set_free(struct lm_ggml_hash_set * hash_set);
 
@@ -748,6 +759,24 @@ static size_t lm_ggml_hash_find_or_insert(struct lm_ggml_hash_set * hash_set, st
  LM_GGML_ABORT("fatal error");
  }
 
+ // computation graph
+
+ struct lm_ggml_cgraph {
+ int size;
+ int n_nodes;
+ int n_leafs;
+
+ struct lm_ggml_tensor ** nodes;
+ struct lm_ggml_tensor ** grads;
+ struct lm_ggml_tensor ** leafs;
+
+ struct lm_ggml_hash_set visited_hash_set;
+
+ enum lm_ggml_cgraph_eval_order order;
+ };
+
+ struct lm_ggml_cgraph lm_ggml_graph_view(struct lm_ggml_cgraph * cgraph, int i0, int i1);
+
  #ifdef __cplusplus
  }
  #endif
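With the lm_ggml_cgraph definition and lm_ggml_graph_view now visible to internal callers such as the backend scheduler, a sub-range of a graph can be walked through a view. A hedged sketch, assuming the usual ggml semantics where the view shares the parent's node array over [i0, i1):

    /* Sketch (assumed semantics): visit nodes [i0, i1) of a graph through a view. */
    static void visit_range(struct lm_ggml_cgraph * graph, int i0, int i1) {
        struct lm_ggml_cgraph view = lm_ggml_graph_view(graph, i0, i1);
        for (int i = 0; i < view.n_nodes; i++) {
            struct lm_ggml_tensor * node = view.nodes[i];
            (void) node; // e.g. assign a backend or encode the op here
        }
    }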
package/cpp/ggml-metal.m CHANGED
@@ -1,7 +1,7 @@
  #import "ggml-metal.h"
 
+ #import "ggml-impl.h"
  #import "ggml-backend-impl.h"
- #import "ggml.h"
 
  #import <Foundation/Foundation.h>
 
@@ -17,8 +17,8 @@
  #define LM_GGML_METAL_LOG_WARN(...)
  #define LM_GGML_METAL_LOG_ERROR(...)
  #else
- #define LM_GGML_METAL_LOG_INFO(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_INFO, __VA_ARGS__)
- #define LM_GGML_METAL_LOG_WARN(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_WARN, __VA_ARGS__)
+ #define LM_GGML_METAL_LOG_INFO(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+ #define LM_GGML_METAL_LOG_WARN(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_WARN, __VA_ARGS__)
  #define LM_GGML_METAL_LOG_ERROR(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
  #endif
 
@@ -31,6 +31,8 @@ struct lm_ggml_metal_kernel {
  enum lm_ggml_metal_kernel_type {
  LM_GGML_METAL_KERNEL_TYPE_ADD,
  LM_GGML_METAL_KERNEL_TYPE_ADD_ROW,
+ LM_GGML_METAL_KERNEL_TYPE_SUB,
+ LM_GGML_METAL_KERNEL_TYPE_SUB_ROW,
  LM_GGML_METAL_KERNEL_TYPE_MUL,
  LM_GGML_METAL_KERNEL_TYPE_MUL_ROW,
  LM_GGML_METAL_KERNEL_TYPE_DIV,
@@ -207,6 +209,9 @@ enum lm_ggml_metal_kernel_type {
  LM_GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL,
  LM_GGML_METAL_KERNEL_TYPE_CONCAT,
  LM_GGML_METAL_KERNEL_TYPE_SQR,
+ LM_GGML_METAL_KERNEL_TYPE_SQRT,
+ LM_GGML_METAL_KERNEL_TYPE_SIN,
+ LM_GGML_METAL_KERNEL_TYPE_COS,
  LM_GGML_METAL_KERNEL_TYPE_SUM_ROWS,
 
  LM_GGML_METAL_KERNEL_TYPE_COUNT
@@ -493,6 +498,8 @@ static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(int n_cb) {
 
  LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ADD, add, true);
  LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ADD_ROW, add_row, true);
+ LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SUB, sub, true);
+ LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SUB_ROW, sub_row, true);
  LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL, mul, true);
  LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true);
  LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_DIV, div, true);
@@ -669,6 +676,9 @@ static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(int n_cb) {
  LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL, cpy_f32_iq4_nl, true);
  LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_CONCAT, concat, true);
  LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SQR, sqr, true);
+ LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SQRT, sqrt, true);
+ LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SIN, sin, true);
+ LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_COS, cos, true);
  LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
  }
 
@@ -769,23 +779,29 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_backend_metal_context
  case LM_GGML_OP_PERMUTE:
  case LM_GGML_OP_CONCAT:
  case LM_GGML_OP_ADD:
+ case LM_GGML_OP_SUB:
  case LM_GGML_OP_ACC:
  case LM_GGML_OP_MUL:
  case LM_GGML_OP_DIV:
  case LM_GGML_OP_REPEAT:
  case LM_GGML_OP_SCALE:
  case LM_GGML_OP_CLAMP:
+ return true;
  case LM_GGML_OP_SQR:
+ case LM_GGML_OP_SQRT:
+ case LM_GGML_OP_SIN:
+ case LM_GGML_OP_COS:
+ return lm_ggml_is_contiguous(op->src[0]);
  case LM_GGML_OP_SUM_ROWS:
- return true;
  case LM_GGML_OP_SOFT_MAX:
  case LM_GGML_OP_RMS_NORM:
  case LM_GGML_OP_GROUP_NORM:
  return ctx->support_simdgroup_reduction;
  case LM_GGML_OP_NORM:
  case LM_GGML_OP_ROPE:
- case LM_GGML_OP_IM2COL:
  return true;
+ case LM_GGML_OP_IM2COL:
+ return op->src[0]->type == LM_GGML_TYPE_F16;
  case LM_GGML_OP_POOL_1D:
  case LM_GGML_OP_POOL_2D:
  return false;
@@ -866,7 +882,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
  // create multiple command buffers and enqueue them
  // then, we encode the graph into the command buffers in parallel
 
- const int n_nodes = gf->n_nodes;
+ const int n_nodes = gf->n_nodes;
  const int n_cb = ctx->n_cb;
  const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
 
@@ -1057,6 +1073,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
  [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
  } break;
  case LM_GGML_OP_ADD:
+ case LM_GGML_OP_SUB:
  case LM_GGML_OP_MUL:
  case LM_GGML_OP_DIV:
  {
@@ -1080,6 +1097,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
  nb = ne00 / 4;
  switch (dst->op) {
  case LM_GGML_OP_ADD: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
+ case LM_GGML_OP_SUB: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break;
  case LM_GGML_OP_MUL: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
  case LM_GGML_OP_DIV: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
  default: LM_GGML_ABORT("fatal error");
@@ -1089,6 +1107,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
  } else {
  switch (dst->op) {
  case LM_GGML_OP_ADD: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
+ case LM_GGML_OP_SUB: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
  case LM_GGML_OP_MUL: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
  case LM_GGML_OP_DIV: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
  default: LM_GGML_ABORT("fatal error");
@@ -1416,6 +1435,48 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
 
  const int64_t n = lm_ggml_nelements(dst);
 
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+ } break;
+ case LM_GGML_OP_SQRT:
+ {
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(src0));
+
+ id<MTLComputePipelineState> pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SQRT].pipeline;
+
+ [encoder setComputePipelineState:pipeline];
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+
+ const int64_t n = lm_ggml_nelements(dst);
+
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+ } break;
+ case LM_GGML_OP_SIN:
+ {
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(src0));
+
+ id<MTLComputePipelineState> pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SIN].pipeline;
+
+ [encoder setComputePipelineState:pipeline];
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+
+ const int64_t n = lm_ggml_nelements(dst);
+
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+ } break;
+ case LM_GGML_OP_COS:
+ {
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(src0));
+
+ id<MTLComputePipelineState> pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_COS].pipeline;
+
+ [encoder setComputePipelineState:pipeline];
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+
+ const int64_t n = lm_ggml_nelements(dst);
+
  [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
  } break;
  case LM_GGML_OP_SUM_ROWS:
@@ -2978,8 +3039,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
  if (status != MTLCommandBufferStatusCompleted) {
  LM_GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
  if (status == MTLCommandBufferStatusError) {
- NSString * error_code = [command_buffer error].localizedDescription;
- LM_GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]);
+ LM_GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
  }
 
  return LM_GGML_STATUS_FAILED;