cui-llama.rn 1.1.2 → 1.1.5
- package/android/src/main/CMakeLists.txt +1 -2
- package/android/src/main/jni.cpp +26 -21
- package/cpp/common.cpp +181 -1584
- package/cpp/common.h +131 -52
- package/cpp/ggml-aarch64.c +612 -0
- package/cpp/ggml-alloc.h +2 -2
- package/cpp/ggml-backend.c +33 -6
- package/cpp/ggml-backend.h +2 -0
- package/cpp/ggml-common.h +20 -0
- package/cpp/ggml-impl.h +36 -7
- package/cpp/ggml-metal.m +68 -8
- package/cpp/ggml-quants.c +932 -50
- package/cpp/ggml-quants.h +15 -0
- package/cpp/ggml.c +1712 -325
- package/cpp/ggml.h +169 -100
- package/cpp/llama-grammar.cpp +721 -122
- package/cpp/llama-grammar.h +120 -15
- package/cpp/llama-impl.h +132 -1
- package/cpp/llama-sampling.cpp +1483 -354
- package/cpp/llama-sampling.h +20 -48
- package/cpp/llama-vocab.cpp +140 -7
- package/cpp/llama-vocab.h +3 -2
- package/cpp/llama.cpp +824 -327
- package/cpp/llama.h +235 -256
- package/cpp/rn-llama.hpp +18 -14
- package/cpp/sampling.cpp +353 -354
- package/cpp/sampling.h +62 -143
- package/cpp/sgemm.cpp +153 -0
- package/package.json +1 -1
- package/cpp/grammar-parser.cpp +0 -539
- package/cpp/grammar-parser.h +0 -29
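
Taken together, the stats point to an upstream llama.cpp sync: a persistent threadpool for the CPU backend (ggml-backend.c/h), new TQ1_0/TQ2_0 ternary quantization types (ggml-common.h, ggml-quants.c), SUB/SQRT/SIN/COS kernels for Metal (ggml-metal.m), and a sampling rewrite that retires the standalone grammar-parser in favor of the expanded llama-grammar and llama-sampling modules.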
package/cpp/ggml-alloc.h
CHANGED
```diff
@@ -7,8 +7,8 @@ extern "C" {
 #endif
 
 typedef struct lm_ggml_backend_buffer_type * lm_ggml_backend_buffer_type_t;
-typedef struct
-typedef struct
+typedef struct lm_ggml_backend_buffer * lm_ggml_backend_buffer_t;
+typedef struct lm_ggml_backend * lm_ggml_backend_t;
 
 // Tensor allocator
 struct lm_ggml_tallocr {
```
package/cpp/ggml-backend.c
CHANGED
```diff
@@ -722,9 +722,11 @@ lm_ggml_backend_buffer_type_t lm_ggml_backend_cpu_hbm_buffer_type(void) {
 #endif
 
 struct lm_ggml_backend_cpu_context {
-    int n_threads;
-    void * work_data;
-    size_t work_size;
+    int n_threads;
+    lm_ggml_threadpool_t threadpool;
+
+    void * work_data;
+    size_t work_size;
 
     lm_ggml_abort_callback abort_callback;
     void * abort_callback_data;
@@ -759,7 +761,7 @@ LM_GGML_CALL static lm_ggml_backend_graph_plan_t lm_ggml_backend_cpu_graph_plan_
 
     struct lm_ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct lm_ggml_backend_plan_cpu));
 
-    cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+    cpu_plan->cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
     cpu_plan->cgraph = *cgraph; // FIXME: deep copy
 
     if (cpu_plan->cplan.work_size > 0) {
@@ -796,7 +798,7 @@ LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_cpu_graph_plan_compute(l
 LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_cpu_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
     struct lm_ggml_backend_cpu_context * cpu_ctx = (struct lm_ggml_backend_cpu_context *)backend->context;
 
-    struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+    struct lm_ggml_cplan cplan = lm_ggml_graph_plan(cgraph, cpu_ctx->n_threads, cpu_ctx->threadpool);
 
     if (cpu_ctx->work_size < cplan.work_size) {
         free(cpu_ctx->work_data);
@@ -825,6 +827,10 @@ LM_GGML_CALL static bool lm_ggml_backend_cpu_supports_op(lm_ggml_backend_t backe
                 op->type != LM_GGML_TYPE_IQ1_M; // missing type_traits.from_float
         case LM_GGML_OP_MUL_MAT:
             return op->src[1]->type == LM_GGML_TYPE_F32 || op->src[1]->type == lm_ggml_internal_get_type_traits(op->src[0]->type).vec_dot_type;
+        case LM_GGML_OP_ROPE_BACK:
+            return op->src[2] == NULL && (op->op_params[2] & 4) == 0;
+        case LM_GGML_OP_IM2COL_BACK:
+            return op->src[0]->type == LM_GGML_TYPE_F32 && op->src[1]->type == LM_GGML_TYPE_F32;
         default:
             return true;
     }
@@ -873,6 +879,7 @@ lm_ggml_backend_t lm_ggml_backend_cpu_init(void) {
     }
 
     ctx->n_threads = LM_GGML_DEFAULT_N_THREADS;
+    ctx->threadpool = NULL;
     ctx->work_data = NULL;
     ctx->work_size = 0;
     ctx->abort_callback = NULL;
@@ -903,6 +910,18 @@ void lm_ggml_backend_cpu_set_n_threads(lm_ggml_backend_t backend_cpu, int n_thre
     ctx->n_threads = n_threads;
 }
 
+void lm_ggml_backend_cpu_set_threadpool(lm_ggml_backend_t backend_cpu, lm_ggml_threadpool_t threadpool) {
+    LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
+
+    struct lm_ggml_backend_cpu_context * ctx = (struct lm_ggml_backend_cpu_context *)backend_cpu->context;
+
+    if (ctx->threadpool && ctx->threadpool != threadpool) {
+        // already had a different threadpool, pause/suspend it before switching
+        lm_ggml_threadpool_pause(ctx->threadpool);
+    }
+    ctx->threadpool = threadpool;
+}
+
 void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data) {
     LM_GGML_ASSERT(lm_ggml_backend_is_cpu(backend_cpu));
 
@@ -1150,6 +1169,11 @@ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sch
         }
     }
 
+    if (tensor->buffer || (tensor->view_src && tensor->view_src->buffer)) {
+        // since the tensor is pre-allocated, it cannot be moved to another backend
+        LM_GGML_ABORT("pre-allocated tensor in a backend that cannot run the operation");
+    }
+
     // graph input
     if (tensor->flags & LM_GGML_TENSOR_FLAG_INPUT) {
         cur_backend_id = sched->n_backends - 1; // last backend (assumed CPU)
@@ -1629,7 +1653,7 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
         sched->prev_leaf_backend_ids = tmp;
     }
 
-    int graph_size = graph->n_nodes + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2;
+    int graph_size = MAX(graph->n_nodes, graph->n_leafs) + sched->n_splits*LM_GGML_SCHED_MAX_SPLIT_INPUTS*2*sched->n_copies;
     if (sched->graph.size < graph_size) {
         sched->graph.size = graph_size;
         sched->graph.nodes = realloc(sched->graph.nodes, graph_size * sizeof(struct lm_ggml_tensor *));
@@ -1681,6 +1705,7 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
             for (int c = 0; c < sched->n_copies; c++) {
                 struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                 sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                assert(graph_copy->size > graph_copy->n_leafs);
                 graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
             }
         }
@@ -1694,6 +1719,7 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
             for (int c = 0; c < sched->n_copies; c++) {
                 struct lm_ggml_tensor * input_cpy = tensor_id_copy(id, backend_id, c);
                 sched->leaf_backend_ids[graph_copy->n_leafs] = backend_id;
+                assert(graph_copy->size > graph_copy->n_leafs);
                 graph_copy->leafs[graph_copy->n_leafs++] = input_cpy;
             }
         }
@@ -1704,6 +1730,7 @@ static void lm_ggml_backend_sched_split_graph(lm_ggml_backend_sched_t sched, str
     for (int i = 0; i < graph->n_leafs; i++) {
         struct lm_ggml_tensor * leaf = graph->leafs[i];
         sched->leaf_backend_ids[graph_copy->n_leafs] = tensor_backend_id(leaf);
+        assert(graph_copy->size > graph_copy->n_leafs);
        graph_copy->leafs[graph_copy->n_leafs++] = leaf;
     }
 }
```
package/cpp/ggml-backend.h
CHANGED
```diff
@@ -63,6 +63,7 @@ extern "C" {
     LM_GGML_API void lm_ggml_backend_tensor_set_async(lm_ggml_backend_t backend, struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     LM_GGML_API void lm_ggml_backend_tensor_get_async(lm_ggml_backend_t backend, const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);
 
+    // "offset" refers to the offset of the tensor data for setting/getting data
     LM_GGML_API LM_GGML_CALL void lm_ggml_backend_tensor_set( struct lm_ggml_tensor * tensor, const void * data, size_t offset, size_t size);
     LM_GGML_API LM_GGML_CALL void lm_ggml_backend_tensor_get(const struct lm_ggml_tensor * tensor, void * data, size_t offset, size_t size);
 
@@ -102,6 +103,7 @@ extern "C" {
 
     LM_GGML_API LM_GGML_CALL bool lm_ggml_backend_is_cpu (lm_ggml_backend_t backend);
     LM_GGML_API void lm_ggml_backend_cpu_set_n_threads (lm_ggml_backend_t backend_cpu, int n_threads);
+    LM_GGML_API void lm_ggml_backend_cpu_set_threadpool (lm_ggml_backend_t backend_cpu, lm_ggml_threadpool_t threadpool);
     LM_GGML_API void lm_ggml_backend_cpu_set_abort_callback(lm_ggml_backend_t backend_cpu, lm_ggml_abort_callback abort_callback, void * abort_callback_data);
 
     // Create a backend buffer from an existing pointer
```
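
The new `lm_ggml_backend_cpu_set_threadpool` hook lets callers reuse one set of worker threads across graph evaluations instead of paying thread-spawn cost on every call; both `lm_ggml_graph_plan` call sites above now forward the pool. A minimal usage sketch, assuming the `lm_`-prefixed threadpool helpers (`lm_ggml_threadpool_params_default`, `lm_ggml_threadpool_new`, `lm_ggml_threadpool_free`) arrive in `ggml.h` with this sync as they do upstream:

```c
#include "ggml.h"
#include "ggml-backend.h"

// Sketch: attach a persistent threadpool to the CPU backend so that
// repeated graph evaluations reuse the same workers. The helper names
// below come from the upstream llama.cpp threadpool API and are assumed
// to exist here with the lm_ prefix.
static void compute_with_pool(struct lm_ggml_cgraph * graph) {
    lm_ggml_backend_t backend = lm_ggml_backend_cpu_init();

    struct lm_ggml_threadpool_params params = lm_ggml_threadpool_params_default(4);
    lm_ggml_threadpool_t pool = lm_ggml_threadpool_new(&params);

    lm_ggml_backend_cpu_set_threadpool(backend, pool); // new in this release

    lm_ggml_backend_graph_compute(backend, graph);     // plan is built with the pool

    lm_ggml_backend_free(backend);
    lm_ggml_threadpool_free(pool);
}
```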
package/cpp/ggml-common.h
CHANGED
```diff
@@ -227,6 +227,25 @@ typedef struct {
 } block_q8_0x8;
 static_assert(sizeof(block_q8_0x8) == 8 * sizeof(lm_ggml_half) + QK8_0 * 8, "wrong q8_0x8 block size/padding");
 
+//
+// Ternary quantization
+//
+
+// 1.6875 bpw
+typedef struct {
+    uint8_t qs[(QK_K - 4 * QK_K / 64) / 5]; // 5 elements per byte (3^5 = 243 < 256)
+    uint8_t qh[QK_K/64]; // 4 elements per byte
+    lm_ggml_half d;
+} block_tq1_0;
+static_assert(sizeof(block_tq1_0) == sizeof(lm_ggml_half) + QK_K / 64 + (QK_K - 4 * QK_K / 64) / 5, "wrong tq1_0 block size/padding");
+
+// 2.0625 bpw
+typedef struct {
+    uint8_t qs[QK_K/4]; // 2 bits per element
+    lm_ggml_half d;
+} block_tq2_0;
+static_assert(sizeof(block_tq2_0) == sizeof(lm_ggml_half) + QK_K / 4, "wrong tq2_0 block size/padding");
+
 //
 // Super-block quantization structures
 //
@@ -361,6 +380,7 @@ typedef struct {
 } block_iq3_s;
 static_assert(sizeof(block_iq3_s) == sizeof(lm_ggml_half) + 13*(QK_K/32) + IQ3S_N_SCALE, "wrong iq3_s block size/padding");
 
+// 1.5625 bpw
 typedef struct {
     lm_ggml_half d;
     uint8_t qs[QK_K/8];
```
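
The advertised bits-per-weight figures follow directly from the struct layouts with the usual `QK_K == 256`: `block_tq1_0` stores 240 values at five trits per byte (3^5 = 243 fits in a byte) plus 16 values at four per byte and an fp16 scale, 54 bytes for 256 weights; `block_tq2_0` spends two bits per value plus the scale, 66 bytes. A compile-time check of that arithmetic, as a sketch rather than package code:

```c
// Verifies the 1.6875 / 2.0625 bpw claims for the ternary blocks,
// assuming the standard super-block size QK_K == 256.
#include <assert.h>

#define QK_K 256

static_assert((QK_K - 4 * QK_K / 64) / 5 == 48, "tq1_0 qs: 240 values, 5 per byte");
static_assert(QK_K / 64 == 4,                   "tq1_0 qh: 16 values, 4 per byte");
// (48 + 4 + 2) bytes = 432 bits over 256 weights -> 1.6875 bpw (= 27/16)
static_assert((48 + 4 + 2) * 8 * 16 == 27 * QK_K,   "tq1_0 is 1.6875 bpw");
// (64 + 2) bytes = 528 bits over 256 weights -> 2.0625 bpw (= 33/16)
static_assert((QK_K / 4 + 2) * 8 * 16 == 33 * QK_K, "tq2_0 is 2.0625 bpw");
```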
package/cpp/ggml-impl.h
CHANGED
```diff
@@ -175,7 +175,7 @@ typedef __fp16 lm_ggml_fp16_internal_t;
 
 // 32-bit ARM compatibility
 
-// vaddvq_s16
+// vaddlvq_s16
 // vpaddq_s16
 // vpaddq_s32
 // vaddvq_s32
@@ -185,12 +185,9 @@ typedef __fp16 lm_ggml_fp16_internal_t;
 // vzip1_u8
 // vzip2_u8
 
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+inline static int32_t vaddlvq_s16(int16x8_t v) {
+    int32x4_t v0 = vreinterpretq_s32_s64(vpaddlq_s32(vpaddlq_s16(v)));
+    return vgetq_lane_s32(v0, 0) + vgetq_lane_s32(v0, 2);
 }
 
 inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
@@ -632,8 +629,16 @@ inline static float lm_ggml_lookup_fp16_to_fp32(lm_ggml_fp16_t f) {
 #define LM_GGML_FP32_TO_FP16(x) LM_GGML_COMPUTE_FP32_TO_FP16(x)
 #endif
 
+enum lm_ggml_cgraph_eval_order {
+    LM_GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+    LM_GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+    LM_GGML_CGRAPH_EVAL_ORDER_COUNT
+};
+
 // bitset
 
+typedef uint32_t lm_ggml_bitset_t;
+
 static_assert(sizeof(lm_ggml_bitset_t) == 4, "bitset_t constants must be updated");
 #define BITSET_SHR 5 // log2(sizeof(lm_ggml_bitset_t)*8)
 #define BITSET_MASK (sizeof(lm_ggml_bitset_t)*8 - 1)
@@ -659,6 +664,12 @@ static inline void lm_ggml_bitset_clear(lm_ggml_bitset_t * bitset, size_t i) {
 #define LM_GGML_HASHSET_FULL ((size_t)-1)
 #define LM_GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
 
+struct lm_ggml_hash_set {
+    size_t size;
+    lm_ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
+    struct lm_ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if lm_ggml_bitset_get(used, i)
+};
+
 struct lm_ggml_hash_set lm_ggml_hash_set_new(size_t size);
 void lm_ggml_hash_set_free(struct lm_ggml_hash_set * hash_set);
 
@@ -748,6 +759,24 @@ static size_t lm_ggml_hash_find_or_insert(struct lm_ggml_hash_set * hash_set, st
     LM_GGML_ABORT("fatal error");
 }
 
+// computation graph
+
+struct lm_ggml_cgraph {
+    int size;
+    int n_nodes;
+    int n_leafs;
+
+    struct lm_ggml_tensor ** nodes;
+    struct lm_ggml_tensor ** grads;
+    struct lm_ggml_tensor ** leafs;
+
+    struct lm_ggml_hash_set visited_hash_set;
+
+    enum lm_ggml_cgraph_eval_order order;
+};
+
+struct lm_ggml_cgraph lm_ggml_graph_view(struct lm_ggml_cgraph * cgraph, int i0, int i1);
+
 #ifdef __cplusplus
 }
 #endif
```
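
The rewritten 32-bit ARM shim replaces eight per-lane extractions with two pairwise widening adds: `vpaddlq_s16` folds the eight 16-bit lanes into four 32-bit sums, `vpaddlq_s32` folds those into two 64-bit sums, and reinterpreting the result as 32-bit lanes lets lanes 0 and 2 (the low halves on little-endian) be added directly, which is safe because the total of eight int16 values is at most 8 * 32768 and always fits in 32 bits. A scalar model of the same dataflow, for illustration only:

```c
#include <stdint.h>

// Scalar model of the new vaddlvq_s16: two pairwise widening adds
// instead of eight lane extractions. Illustrates the dataflow; the
// real code uses NEON intrinsics.
int32_t vaddlvq_s16_model(const int16_t v[8]) {
    int32_t s32[4]; // vpaddlq_s16: adjacent int16 pairs -> int32 sums
    for (int i = 0; i < 4; i++) {
        s32[i] = (int32_t)v[2*i] + (int32_t)v[2*i + 1];
    }
    // vpaddlq_s32: adjacent int32 pairs -> int64 sums
    int64_t lo = (int64_t)s32[0] + s32[1];
    int64_t hi = (int64_t)s32[2] + s32[3];
    // taking the low 32 bits of each sum loses nothing: |total| <= 8 * 32768
    return (int32_t)lo + (int32_t)hi;
}
```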
package/cpp/ggml-metal.m
CHANGED
```diff
@@ -1,7 +1,7 @@
 #import "ggml-metal.h"
 
+#import "ggml-impl.h"
 #import "ggml-backend-impl.h"
-#import "ggml.h"
 
 #import <Foundation/Foundation.h>
 
@@ -17,8 +17,8 @@
 #define LM_GGML_METAL_LOG_WARN(...)
 #define LM_GGML_METAL_LOG_ERROR(...)
 #else
-#define LM_GGML_METAL_LOG_INFO(...)  lm_ggml_metal_log(LM_GGML_LOG_LEVEL_INFO,
-#define LM_GGML_METAL_LOG_WARN(...)  lm_ggml_metal_log(LM_GGML_LOG_LEVEL_WARN,
+#define LM_GGML_METAL_LOG_INFO(...)  lm_ggml_metal_log(LM_GGML_LOG_LEVEL_INFO, __VA_ARGS__)
+#define LM_GGML_METAL_LOG_WARN(...)  lm_ggml_metal_log(LM_GGML_LOG_LEVEL_WARN, __VA_ARGS__)
 #define LM_GGML_METAL_LOG_ERROR(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
 #endif
 
@@ -31,6 +31,8 @@ struct lm_ggml_metal_kernel {
 enum lm_ggml_metal_kernel_type {
     LM_GGML_METAL_KERNEL_TYPE_ADD,
     LM_GGML_METAL_KERNEL_TYPE_ADD_ROW,
+    LM_GGML_METAL_KERNEL_TYPE_SUB,
+    LM_GGML_METAL_KERNEL_TYPE_SUB_ROW,
     LM_GGML_METAL_KERNEL_TYPE_MUL,
     LM_GGML_METAL_KERNEL_TYPE_MUL_ROW,
     LM_GGML_METAL_KERNEL_TYPE_DIV,
@@ -207,6 +209,9 @@ enum lm_ggml_metal_kernel_type {
     LM_GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL,
     LM_GGML_METAL_KERNEL_TYPE_CONCAT,
     LM_GGML_METAL_KERNEL_TYPE_SQR,
+    LM_GGML_METAL_KERNEL_TYPE_SQRT,
+    LM_GGML_METAL_KERNEL_TYPE_SIN,
+    LM_GGML_METAL_KERNEL_TYPE_COS,
     LM_GGML_METAL_KERNEL_TYPE_SUM_ROWS,
 
     LM_GGML_METAL_KERNEL_TYPE_COUNT
@@ -493,6 +498,8 @@ static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(int n_cb) {
 
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ADD, add, true);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_ADD_ROW, add_row, true);
+        LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SUB, sub, true);
+        LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SUB_ROW, sub_row, true);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL, mul, true);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_MUL_ROW, mul_row, true);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_DIV, div, true);
@@ -669,6 +676,9 @@ static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(int n_cb) {
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_CPY_F32_IQ4_NL, cpy_f32_iq4_nl, true);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_CONCAT, concat, true);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SQR, sqr, true);
+        LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SQRT, sqrt, true);
+        LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SIN, sin, true);
+        LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_COS, cos, true);
         LM_GGML_METAL_ADD_KERNEL(LM_GGML_METAL_KERNEL_TYPE_SUM_ROWS, sum_rows, true);
     }
 
@@ -769,23 +779,29 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_backend_metal_context
         case LM_GGML_OP_PERMUTE:
         case LM_GGML_OP_CONCAT:
         case LM_GGML_OP_ADD:
+        case LM_GGML_OP_SUB:
         case LM_GGML_OP_ACC:
         case LM_GGML_OP_MUL:
        case LM_GGML_OP_DIV:
         case LM_GGML_OP_REPEAT:
         case LM_GGML_OP_SCALE:
         case LM_GGML_OP_CLAMP:
+            return true;
         case LM_GGML_OP_SQR:
+        case LM_GGML_OP_SQRT:
+        case LM_GGML_OP_SIN:
+        case LM_GGML_OP_COS:
+            return lm_ggml_is_contiguous(op->src[0]);
         case LM_GGML_OP_SUM_ROWS:
-            return true;
         case LM_GGML_OP_SOFT_MAX:
         case LM_GGML_OP_RMS_NORM:
         case LM_GGML_OP_GROUP_NORM:
             return ctx->support_simdgroup_reduction;
         case LM_GGML_OP_NORM:
         case LM_GGML_OP_ROPE:
-        case LM_GGML_OP_IM2COL:
             return true;
+        case LM_GGML_OP_IM2COL:
+            return op->src[0]->type == LM_GGML_TYPE_F16;
         case LM_GGML_OP_POOL_1D:
         case LM_GGML_OP_POOL_2D:
             return false;
@@ -866,7 +882,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
     // create multiple command buffers and enqueue them
     // then, we encode the graph into the command buffers in parallel
 
-    const int n_nodes
+    const int n_nodes = gf->n_nodes;
     const int n_cb = ctx->n_cb;
     const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
 
@@ -1057,6 +1073,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
                     [encoder dispatchThreadgroups:MTLSizeMake(ne1, ne2, ne3) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
                 } break;
             case LM_GGML_OP_ADD:
+            case LM_GGML_OP_SUB:
             case LM_GGML_OP_MUL:
             case LM_GGML_OP_DIV:
                 {
@@ -1080,6 +1097,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
                         nb = ne00 / 4;
                         switch (dst->op) {
                             case LM_GGML_OP_ADD: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_ADD_ROW].pipeline; break;
+                            case LM_GGML_OP_SUB: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SUB_ROW].pipeline; break;
                             case LM_GGML_OP_MUL: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL_ROW].pipeline; break;
                             case LM_GGML_OP_DIV: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_DIV_ROW].pipeline; break;
                             default: LM_GGML_ABORT("fatal error");
@@ -1089,6 +1107,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
                     } else {
                         switch (dst->op) {
                             case LM_GGML_OP_ADD: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_ADD].pipeline; break;
+                            case LM_GGML_OP_SUB: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SUB].pipeline; break;
                             case LM_GGML_OP_MUL: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_MUL].pipeline; break;
                             case LM_GGML_OP_DIV: pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_DIV].pipeline; break;
                             default: LM_GGML_ABORT("fatal error");
@@ -1416,6 +1435,48 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
 
                     const int64_t n = lm_ggml_nelements(dst);
 
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+            case LM_GGML_OP_SQRT:
+                {
+                    LM_GGML_ASSERT(lm_ggml_is_contiguous(src0));
+
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SQRT].pipeline;
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+
+                    const int64_t n = lm_ggml_nelements(dst);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+            case LM_GGML_OP_SIN:
+                {
+                    LM_GGML_ASSERT(lm_ggml_is_contiguous(src0));
+
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_SIN].pipeline;
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+
+                    const int64_t n = lm_ggml_nelements(dst);
+
+                    [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+                } break;
+            case LM_GGML_OP_COS:
+                {
+                    LM_GGML_ASSERT(lm_ggml_is_contiguous(src0));
+
+                    id<MTLComputePipelineState> pipeline = ctx->kernels[LM_GGML_METAL_KERNEL_TYPE_COS].pipeline;
+
+                    [encoder setComputePipelineState:pipeline];
+                    [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+                    [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+
+                    const int64_t n = lm_ggml_nelements(dst);
+
                     [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
                 } break;
             case LM_GGML_OP_SUM_ROWS:
@@ -2978,8 +3039,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
         if (status != MTLCommandBufferStatusCompleted) {
             LM_GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
             if (status == MTLCommandBufferStatusError) {
-                NSString * error_code = [command_buffer error].localizedDescription;
-                LM_GGML_METAL_LOG_INFO("error: %s\n", [error_code UTF8String]);
+                LM_GGML_METAL_LOG_INFO("error: %s\n", [[command_buffer error].localizedDescription UTF8String]);
             }
 
             return LM_GGML_STATUS_FAILED;
```