llama_cpp 0.2.1 → 0.3.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +32 -0
- data/README.md +39 -6
- data/examples/README.md +32 -0
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +38 -0
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +231 -132
- data/ext/llama_cpp/src/ggml-cuda.cu +844 -337
- data/ext/llama_cpp/src/ggml-metal.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.m +193 -49
- data/ext/llama_cpp/src/ggml-metal.metal +477 -84
- data/ext/llama_cpp/src/ggml-opencl.cpp +493 -4
- data/ext/llama_cpp/src/ggml.c +1565 -430
- data/ext/llama_cpp/src/ggml.h +208 -14
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +194 -101
- data/ext/llama_cpp/src/llama.h +41 -14
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +12 -17
- metadata +3 -3
- data/lib/llama_cpp/client.rb +0 -172
@@ -41,12 +41,15 @@ void ggml_metal_free(struct ggml_metal_context * ctx);
|
|
41
41
|
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
|
42
42
|
// - the mapping is used during computation to determine the arguments of the compute kernels
|
43
43
|
// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
|
44
|
+
// - max_size specifies the maximum size of a tensor and is used to create shared views such
|
45
|
+
// that it is guaranteed that the tensor will fit in at least one of the views
|
44
46
|
//
|
45
47
|
bool ggml_metal_add_buffer(
|
46
48
|
struct ggml_metal_context * ctx,
|
47
49
|
const char * name,
|
48
50
|
void * data,
|
49
|
-
size_t size
|
51
|
+
size_t size,
|
52
|
+
size_t max_size);
|
50
53
|
|
51
54
|
// set data from host memory into the device
|
52
55
|
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
|
@@ -51,23 +51,26 @@ struct ggml_metal_context {
|
|
51
51
|
GGML_METAL_DECL_KERNEL(get_rows_f16);
|
52
52
|
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
|
53
53
|
GGML_METAL_DECL_KERNEL(get_rows_q4_1);
|
54
|
-
GGML_METAL_DECL_KERNEL(
|
55
|
-
GGML_METAL_DECL_KERNEL(
|
56
|
-
GGML_METAL_DECL_KERNEL(
|
57
|
-
GGML_METAL_DECL_KERNEL(
|
58
|
-
GGML_METAL_DECL_KERNEL(
|
54
|
+
GGML_METAL_DECL_KERNEL(get_rows_q2_K);
|
55
|
+
GGML_METAL_DECL_KERNEL(get_rows_q3_K);
|
56
|
+
GGML_METAL_DECL_KERNEL(get_rows_q4_K);
|
57
|
+
GGML_METAL_DECL_KERNEL(get_rows_q5_K);
|
58
|
+
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
|
59
59
|
GGML_METAL_DECL_KERNEL(rms_norm);
|
60
|
+
GGML_METAL_DECL_KERNEL(norm);
|
60
61
|
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
|
61
62
|
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
|
62
63
|
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
|
63
|
-
GGML_METAL_DECL_KERNEL(
|
64
|
-
GGML_METAL_DECL_KERNEL(
|
65
|
-
GGML_METAL_DECL_KERNEL(
|
66
|
-
GGML_METAL_DECL_KERNEL(
|
67
|
-
GGML_METAL_DECL_KERNEL(
|
64
|
+
GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
|
65
|
+
GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
|
66
|
+
GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
|
67
|
+
GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
|
68
|
+
GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
|
68
69
|
GGML_METAL_DECL_KERNEL(rope);
|
70
|
+
GGML_METAL_DECL_KERNEL(alibi_f32);
|
69
71
|
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
|
70
72
|
GGML_METAL_DECL_KERNEL(cpy_f32_f32);
|
73
|
+
GGML_METAL_DECL_KERNEL(cpy_f16_f16);
|
71
74
|
|
72
75
|
#undef GGML_METAL_DECL_KERNEL
|
73
76
|
};
|
@@ -129,7 +132,13 @@ struct ggml_metal_context * ggml_metal_init(void) {
|
|
129
132
|
exit(1);
|
130
133
|
}
|
131
134
|
|
135
|
+
#ifdef GGML_QKK_64
|
136
|
+
MTLCompileOptions* options = [MTLCompileOptions new];
|
137
|
+
options.preprocessorMacros = @{ @"QK_K" : @(64) };
|
138
|
+
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
139
|
+
#else
|
132
140
|
ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
|
141
|
+
#endif
|
133
142
|
if (error) {
|
134
143
|
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
135
144
|
exit(1);
|
@@ -156,27 +165,38 @@ struct ggml_metal_context * ggml_metal_init(void) {
|
|
156
165
|
GGML_METAL_ADD_KERNEL(get_rows_f16);
|
157
166
|
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
|
158
167
|
GGML_METAL_ADD_KERNEL(get_rows_q4_1);
|
159
|
-
GGML_METAL_ADD_KERNEL(
|
160
|
-
GGML_METAL_ADD_KERNEL(
|
161
|
-
GGML_METAL_ADD_KERNEL(
|
162
|
-
GGML_METAL_ADD_KERNEL(
|
163
|
-
GGML_METAL_ADD_KERNEL(
|
168
|
+
GGML_METAL_ADD_KERNEL(get_rows_q2_K);
|
169
|
+
GGML_METAL_ADD_KERNEL(get_rows_q3_K);
|
170
|
+
GGML_METAL_ADD_KERNEL(get_rows_q4_K);
|
171
|
+
GGML_METAL_ADD_KERNEL(get_rows_q5_K);
|
172
|
+
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
|
164
173
|
GGML_METAL_ADD_KERNEL(rms_norm);
|
174
|
+
GGML_METAL_ADD_KERNEL(norm);
|
165
175
|
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
|
166
176
|
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
|
167
177
|
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
|
168
|
-
GGML_METAL_ADD_KERNEL(
|
169
|
-
GGML_METAL_ADD_KERNEL(
|
170
|
-
GGML_METAL_ADD_KERNEL(
|
171
|
-
GGML_METAL_ADD_KERNEL(
|
172
|
-
GGML_METAL_ADD_KERNEL(
|
178
|
+
GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
|
179
|
+
GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
|
180
|
+
GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
|
181
|
+
GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
|
182
|
+
GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
|
173
183
|
GGML_METAL_ADD_KERNEL(rope);
|
184
|
+
GGML_METAL_ADD_KERNEL(alibi_f32);
|
174
185
|
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
|
175
186
|
GGML_METAL_ADD_KERNEL(cpy_f32_f32);
|
187
|
+
GGML_METAL_ADD_KERNEL(cpy_f16_f16);
|
176
188
|
|
177
189
|
#undef GGML_METAL_ADD_KERNEL
|
178
190
|
}
|
179
191
|
|
192
|
+
fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
193
|
+
fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
194
|
+
if (ctx->device.maxTransferRate != 0) {
|
195
|
+
fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
196
|
+
} else {
|
197
|
+
fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__);
|
198
|
+
}
|
199
|
+
|
180
200
|
return ctx;
|
181
201
|
}
|
182
202
|
|
@@ -193,10 +213,13 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
193
213
|
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
|
194
214
|
//fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
|
195
215
|
|
216
|
+
const int64_t tsize = ggml_nbytes(t);
|
217
|
+
|
218
|
+
// find the view that contains the tensor fully
|
196
219
|
for (int i = 0; i < ctx->n_buffers; ++i) {
|
197
220
|
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
|
198
221
|
|
199
|
-
if (ioffs >= 0 && ioffs
|
222
|
+
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
|
200
223
|
*offs = (size_t) ioffs;
|
201
224
|
|
202
225
|
//fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
|
@@ -214,7 +237,8 @@ bool ggml_metal_add_buffer(
|
|
214
237
|
struct ggml_metal_context * ctx,
|
215
238
|
const char * name,
|
216
239
|
void * data,
|
217
|
-
size_t size
|
240
|
+
size_t size,
|
241
|
+
size_t max_size) {
|
218
242
|
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
|
219
243
|
fprintf(stderr, "%s: too many buffers\n", __func__);
|
220
244
|
return false;
|
@@ -231,30 +255,68 @@ bool ggml_metal_add_buffer(
|
|
231
255
|
}
|
232
256
|
}
|
233
257
|
|
234
|
-
size_t
|
235
|
-
|
236
|
-
|
237
|
-
|
258
|
+
const size_t size_page = getpagesize();
|
259
|
+
|
260
|
+
size_t size_aligned = size;
|
261
|
+
if ((size_aligned % size_page) != 0) {
|
262
|
+
size_aligned += (size_page - (size_aligned % size_page));
|
238
263
|
}
|
239
264
|
|
240
|
-
|
241
|
-
ctx->
|
242
|
-
|
265
|
+
// the buffer fits into the max buffer size allowed by the device
|
266
|
+
if (size_aligned <= ctx->device.maxBufferLength) {
|
267
|
+
ctx->buffers[ctx->n_buffers].name = name;
|
268
|
+
ctx->buffers[ctx->n_buffers].data = data;
|
269
|
+
ctx->buffers[ctx->n_buffers].size = size;
|
243
270
|
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
271
|
+
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
272
|
+
|
273
|
+
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
274
|
+
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
|
275
|
+
return false;
|
276
|
+
}
|
249
277
|
|
250
|
-
|
251
|
-
|
252
|
-
|
278
|
+
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
|
279
|
+
|
280
|
+
++ctx->n_buffers;
|
253
281
|
} else {
|
254
|
-
|
282
|
+
// this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
|
283
|
+
// one of the views
|
284
|
+
const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
|
285
|
+
const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
|
286
|
+
const size_t size_view = ctx->device.maxBufferLength;
|
287
|
+
|
288
|
+
for (size_t i = 0; i < size; i += size_step) {
|
289
|
+
const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
|
290
|
+
|
291
|
+
ctx->buffers[ctx->n_buffers].name = name;
|
292
|
+
ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
|
293
|
+
ctx->buffers[ctx->n_buffers].size = size_step_aligned;
|
294
|
+
|
295
|
+
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
296
|
+
|
297
|
+
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
298
|
+
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
|
299
|
+
return false;
|
300
|
+
}
|
301
|
+
|
302
|
+
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
|
303
|
+
if (i + size_step < size) {
|
304
|
+
fprintf(stderr, "\n");
|
305
|
+
}
|
306
|
+
|
307
|
+
++ctx->n_buffers;
|
308
|
+
}
|
255
309
|
}
|
256
310
|
|
257
|
-
|
311
|
+
fprintf(stderr, ", (%8.2f / %8.2f)",
|
312
|
+
ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
|
313
|
+
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
314
|
+
|
315
|
+
if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
|
316
|
+
fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
|
317
|
+
} else {
|
318
|
+
fprintf(stderr, "\n");
|
319
|
+
}
|
258
320
|
}
|
259
321
|
|
260
322
|
return true;
|
@@ -606,7 +668,7 @@ void ggml_metal_graph_compute(
|
|
606
668
|
|
607
669
|
nth0 = 4;
|
608
670
|
nth1 = 16;
|
609
|
-
[encoder setComputePipelineState:ctx->
|
671
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
|
610
672
|
} break;
|
611
673
|
case GGML_TYPE_Q3_K:
|
612
674
|
{
|
@@ -615,7 +677,7 @@ void ggml_metal_graph_compute(
|
|
615
677
|
|
616
678
|
nth0 = 4;
|
617
679
|
nth1 = 16;
|
618
|
-
[encoder setComputePipelineState:ctx->
|
680
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
|
619
681
|
} break;
|
620
682
|
case GGML_TYPE_Q4_K:
|
621
683
|
{
|
@@ -624,7 +686,7 @@ void ggml_metal_graph_compute(
|
|
624
686
|
|
625
687
|
nth0 = 4;
|
626
688
|
nth1 = 16;
|
627
|
-
[encoder setComputePipelineState:ctx->
|
689
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
|
628
690
|
} break;
|
629
691
|
case GGML_TYPE_Q5_K:
|
630
692
|
{
|
@@ -633,7 +695,7 @@ void ggml_metal_graph_compute(
|
|
633
695
|
|
634
696
|
nth0 = 4;
|
635
697
|
nth1 = 16;
|
636
|
-
[encoder setComputePipelineState:ctx->
|
698
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
|
637
699
|
} break;
|
638
700
|
case GGML_TYPE_Q6_K:
|
639
701
|
{
|
@@ -642,7 +704,7 @@ void ggml_metal_graph_compute(
|
|
642
704
|
|
643
705
|
nth0 = 4;
|
644
706
|
nth1 = 16;
|
645
|
-
[encoder setComputePipelineState:ctx->
|
707
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
|
646
708
|
} break;
|
647
709
|
default:
|
648
710
|
{
|
@@ -694,11 +756,11 @@ void ggml_metal_graph_compute(
|
|
694
756
|
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
|
695
757
|
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
|
696
758
|
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
|
697
|
-
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->
|
698
|
-
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->
|
699
|
-
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->
|
700
|
-
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->
|
701
|
-
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->
|
759
|
+
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
|
760
|
+
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
|
761
|
+
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
|
762
|
+
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
|
763
|
+
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
|
702
764
|
default: GGML_ASSERT(false && "not implemented");
|
703
765
|
}
|
704
766
|
|
@@ -735,6 +797,70 @@ void ggml_metal_graph_compute(
|
|
735
797
|
|
736
798
|
[encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
737
799
|
} break;
|
800
|
+
case GGML_OP_NORM:
|
801
|
+
{
|
802
|
+
if (encoder == nil) {
|
803
|
+
encoder = [command_buffer computeCommandEncoder];
|
804
|
+
}
|
805
|
+
|
806
|
+
const float eps = 1e-5f;
|
807
|
+
|
808
|
+
const int nth = 256;
|
809
|
+
|
810
|
+
[encoder setComputePipelineState:ctx->pipeline_norm];
|
811
|
+
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
812
|
+
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
813
|
+
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
814
|
+
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
|
815
|
+
[encoder setBytes:&eps length:sizeof( float) atIndex:4];
|
816
|
+
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
|
817
|
+
|
818
|
+
const int64_t nrows = ggml_nrows(src0);
|
819
|
+
|
820
|
+
[encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
821
|
+
} break;
|
822
|
+
case GGML_OP_ALIBI:
|
823
|
+
{
|
824
|
+
if (encoder == nil) {
|
825
|
+
encoder = [command_buffer computeCommandEncoder];
|
826
|
+
}
|
827
|
+
|
828
|
+
GGML_ASSERT((src0t == GGML_TYPE_F32));
|
829
|
+
|
830
|
+
const int n_past = ((int32_t *) src1->data)[0]; UNUSED(n_past);
|
831
|
+
const int n_head = ((int32_t *) src1->data)[1];
|
832
|
+
const float max_bias = ((float *) src1->data)[2];
|
833
|
+
|
834
|
+
if (__builtin_popcount(n_head) != 1) {
|
835
|
+
GGML_ASSERT(false && "only power-of-two n_head implemented");
|
836
|
+
}
|
837
|
+
|
838
|
+
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
839
|
+
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
840
|
+
|
841
|
+
[encoder setComputePipelineState:ctx->pipeline_alibi_f32];
|
842
|
+
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
843
|
+
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
844
|
+
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
845
|
+
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
|
846
|
+
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
|
847
|
+
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
|
848
|
+
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
|
849
|
+
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
|
850
|
+
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
|
851
|
+
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
|
852
|
+
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
|
853
|
+
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
|
854
|
+
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
|
855
|
+
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
|
856
|
+
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
|
857
|
+
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
|
858
|
+
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
|
859
|
+
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
860
|
+
[encoder setBytes:&m0 length:sizeof( float) atIndex:18];
|
861
|
+
const int nth = 32;
|
862
|
+
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
863
|
+
} break;
|
738
864
|
case GGML_OP_ROPE:
|
739
865
|
{
|
740
866
|
if (encoder == nil) {
|
@@ -788,6 +914,14 @@ void ggml_metal_graph_compute(
|
|
788
914
|
default: GGML_ASSERT(false && "not implemented");
|
789
915
|
};
|
790
916
|
} break;
|
917
|
+
case GGML_TYPE_F16:
|
918
|
+
{
|
919
|
+
switch (dstt) {
|
920
|
+
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break;
|
921
|
+
case GGML_TYPE_F32: GGML_ASSERT(false && "cpy_f16_f32 not implemented"); break;
|
922
|
+
default: GGML_ASSERT(false && "not implemented");
|
923
|
+
};
|
924
|
+
} break;
|
791
925
|
default: GGML_ASSERT(false && "not implemented");
|
792
926
|
}
|
793
927
|
|
@@ -831,4 +965,14 @@ void ggml_metal_graph_compute(
|
|
831
965
|
dispatch_barrier_sync(queue, ^{});
|
832
966
|
|
833
967
|
[command_buffers[n_cb - 1] waitUntilCompleted];
|
968
|
+
|
969
|
+
// check status of command buffers
|
970
|
+
// needed to detect if the device ran out-of-memory for example (#1881)
|
971
|
+
for (int i = 0; i < n_cb; i++) {
|
972
|
+
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
|
973
|
+
if (status != MTLCommandBufferStatusCompleted) {
|
974
|
+
fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
975
|
+
GGML_ASSERT(false);
|
976
|
+
}
|
977
|
+
}
|
834
978
|
}
|