llama_cpp 0.2.1 → 0.3.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +32 -0
- data/README.md +39 -6
- data/examples/README.md +32 -0
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +38 -0
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +231 -132
- data/ext/llama_cpp/src/ggml-cuda.cu +844 -337
- data/ext/llama_cpp/src/ggml-metal.h +4 -1
- data/ext/llama_cpp/src/ggml-metal.m +193 -49
- data/ext/llama_cpp/src/ggml-metal.metal +477 -84
- data/ext/llama_cpp/src/ggml-opencl.cpp +493 -4
- data/ext/llama_cpp/src/ggml.c +1565 -430
- data/ext/llama_cpp/src/ggml.h +208 -14
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +194 -101
- data/ext/llama_cpp/src/llama.h +41 -14
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +12 -17
- metadata +3 -3
- data/lib/llama_cpp/client.rb +0 -172
@@ -41,12 +41,15 @@ void ggml_metal_free(struct ggml_metal_context * ctx);
|
|
41
41
|
// - make sure to map all buffers used in the graph before calling ggml_metal_graph_compute
|
42
42
|
// - the mapping is used during computation to determine the arguments of the compute kernels
|
43
43
|
// - you don't need to keep the host memory buffer allocated as it is never accessed by Metal
|
44
|
+
// - max_size specifies the maximum size of a tensor and is used to create shared views such
|
45
|
+
// that it is guaranteed that the tensor will fit in at least one of the views
|
44
46
|
//
|
45
47
|
bool ggml_metal_add_buffer(
|
46
48
|
struct ggml_metal_context * ctx,
|
47
49
|
const char * name,
|
48
50
|
void * data,
|
49
|
-
size_t size
|
51
|
+
size_t size,
|
52
|
+
size_t max_size);
|
50
53
|
|
51
54
|
// set data from host memory into the device
|
52
55
|
void ggml_metal_set_tensor(struct ggml_metal_context * ctx, struct ggml_tensor * t);
|
@@ -51,23 +51,26 @@ struct ggml_metal_context {
|
|
51
51
|
GGML_METAL_DECL_KERNEL(get_rows_f16);
|
52
52
|
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
|
53
53
|
GGML_METAL_DECL_KERNEL(get_rows_q4_1);
|
54
|
-
GGML_METAL_DECL_KERNEL(
|
55
|
-
GGML_METAL_DECL_KERNEL(
|
56
|
-
GGML_METAL_DECL_KERNEL(
|
57
|
-
GGML_METAL_DECL_KERNEL(
|
58
|
-
GGML_METAL_DECL_KERNEL(
|
54
|
+
GGML_METAL_DECL_KERNEL(get_rows_q2_K);
|
55
|
+
GGML_METAL_DECL_KERNEL(get_rows_q3_K);
|
56
|
+
GGML_METAL_DECL_KERNEL(get_rows_q4_K);
|
57
|
+
GGML_METAL_DECL_KERNEL(get_rows_q5_K);
|
58
|
+
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
|
59
59
|
GGML_METAL_DECL_KERNEL(rms_norm);
|
60
|
+
GGML_METAL_DECL_KERNEL(norm);
|
60
61
|
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
|
61
62
|
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
|
62
63
|
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
|
63
|
-
GGML_METAL_DECL_KERNEL(
|
64
|
-
GGML_METAL_DECL_KERNEL(
|
65
|
-
GGML_METAL_DECL_KERNEL(
|
66
|
-
GGML_METAL_DECL_KERNEL(
|
67
|
-
GGML_METAL_DECL_KERNEL(
|
64
|
+
GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
|
65
|
+
GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
|
66
|
+
GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
|
67
|
+
GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
|
68
|
+
GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
|
68
69
|
GGML_METAL_DECL_KERNEL(rope);
|
70
|
+
GGML_METAL_DECL_KERNEL(alibi_f32);
|
69
71
|
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
|
70
72
|
GGML_METAL_DECL_KERNEL(cpy_f32_f32);
|
73
|
+
GGML_METAL_DECL_KERNEL(cpy_f16_f16);
|
71
74
|
|
72
75
|
#undef GGML_METAL_DECL_KERNEL
|
73
76
|
};
|
@@ -129,7 +132,13 @@ struct ggml_metal_context * ggml_metal_init(void) {
|
|
129
132
|
exit(1);
|
130
133
|
}
|
131
134
|
|
135
|
+
#ifdef GGML_QKK_64
|
136
|
+
MTLCompileOptions* options = [MTLCompileOptions new];
|
137
|
+
options.preprocessorMacros = @{ @"QK_K" : @(64) };
|
138
|
+
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
139
|
+
#else
|
132
140
|
ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
|
141
|
+
#endif
|
133
142
|
if (error) {
|
134
143
|
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
135
144
|
exit(1);
|
@@ -156,27 +165,38 @@ struct ggml_metal_context * ggml_metal_init(void) {
|
|
156
165
|
GGML_METAL_ADD_KERNEL(get_rows_f16);
|
157
166
|
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
|
158
167
|
GGML_METAL_ADD_KERNEL(get_rows_q4_1);
|
159
|
-
GGML_METAL_ADD_KERNEL(
|
160
|
-
GGML_METAL_ADD_KERNEL(
|
161
|
-
GGML_METAL_ADD_KERNEL(
|
162
|
-
GGML_METAL_ADD_KERNEL(
|
163
|
-
GGML_METAL_ADD_KERNEL(
|
168
|
+
GGML_METAL_ADD_KERNEL(get_rows_q2_K);
|
169
|
+
GGML_METAL_ADD_KERNEL(get_rows_q3_K);
|
170
|
+
GGML_METAL_ADD_KERNEL(get_rows_q4_K);
|
171
|
+
GGML_METAL_ADD_KERNEL(get_rows_q5_K);
|
172
|
+
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
|
164
173
|
GGML_METAL_ADD_KERNEL(rms_norm);
|
174
|
+
GGML_METAL_ADD_KERNEL(norm);
|
165
175
|
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
|
166
176
|
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
|
167
177
|
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
|
168
|
-
GGML_METAL_ADD_KERNEL(
|
169
|
-
GGML_METAL_ADD_KERNEL(
|
170
|
-
GGML_METAL_ADD_KERNEL(
|
171
|
-
GGML_METAL_ADD_KERNEL(
|
172
|
-
GGML_METAL_ADD_KERNEL(
|
178
|
+
GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
|
179
|
+
GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
|
180
|
+
GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
|
181
|
+
GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
|
182
|
+
GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
|
173
183
|
GGML_METAL_ADD_KERNEL(rope);
|
184
|
+
GGML_METAL_ADD_KERNEL(alibi_f32);
|
174
185
|
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
|
175
186
|
GGML_METAL_ADD_KERNEL(cpy_f32_f32);
|
187
|
+
GGML_METAL_ADD_KERNEL(cpy_f16_f16);
|
176
188
|
|
177
189
|
#undef GGML_METAL_ADD_KERNEL
|
178
190
|
}
|
179
191
|
|
192
|
+
fprintf(stderr, "%s: recommendedMaxWorkingSetSize = %8.2f MB\n", __func__, ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
193
|
+
fprintf(stderr, "%s: hasUnifiedMemory = %s\n", __func__, ctx->device.hasUnifiedMemory ? "true" : "false");
|
194
|
+
if (ctx->device.maxTransferRate != 0) {
|
195
|
+
fprintf(stderr, "%s: maxTransferRate = %8.2f MB/s\n", __func__, ctx->device.maxTransferRate / 1024.0 / 1024.0);
|
196
|
+
} else {
|
197
|
+
fprintf(stderr, "%s: maxTransferRate = built-in GPU\n", __func__);
|
198
|
+
}
|
199
|
+
|
180
200
|
return ctx;
|
181
201
|
}
|
182
202
|
|
@@ -193,10 +213,13 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
193
213
|
static id<MTLBuffer> ggml_metal_get_buffer(struct ggml_metal_context * ctx, struct ggml_tensor * t, size_t * offs) {
|
194
214
|
//fprintf(stderr, "%s: data tensor '%16s', offs_data = %8ld, offs_eval = %8ld, offs_cach = %8ld\n", __func__, t->name, offs_data, offs_eval, offs_cach);
|
195
215
|
|
216
|
+
const int64_t tsize = ggml_nbytes(t);
|
217
|
+
|
218
|
+
// find the view that contains the tensor fully
|
196
219
|
for (int i = 0; i < ctx->n_buffers; ++i) {
|
197
220
|
const int64_t ioffs = (int64_t) t->data - (int64_t) ctx->buffers[i].data;
|
198
221
|
|
199
|
-
if (ioffs >= 0 && ioffs
|
222
|
+
if (ioffs >= 0 && ioffs + tsize <= (int64_t) ctx->buffers[i].size) {
|
200
223
|
*offs = (size_t) ioffs;
|
201
224
|
|
202
225
|
//fprintf(stderr, "%s: '%s' tensor '%16s', offs = %8ld\n", __func__, ctx->buffers[i].name, t->name, *offs);
|
@@ -214,7 +237,8 @@ bool ggml_metal_add_buffer(
|
|
214
237
|
struct ggml_metal_context * ctx,
|
215
238
|
const char * name,
|
216
239
|
void * data,
|
217
|
-
size_t size
|
240
|
+
size_t size,
|
241
|
+
size_t max_size) {
|
218
242
|
if (ctx->n_buffers >= GGML_METAL_MAX_BUFFERS) {
|
219
243
|
fprintf(stderr, "%s: too many buffers\n", __func__);
|
220
244
|
return false;
|
@@ -231,30 +255,68 @@ bool ggml_metal_add_buffer(
|
|
231
255
|
}
|
232
256
|
}
|
233
257
|
|
234
|
-
size_t
|
235
|
-
|
236
|
-
|
237
|
-
|
258
|
+
const size_t size_page = getpagesize();
|
259
|
+
|
260
|
+
size_t size_aligned = size;
|
261
|
+
if ((size_aligned % size_page) != 0) {
|
262
|
+
size_aligned += (size_page - (size_aligned % size_page));
|
238
263
|
}
|
239
264
|
|
240
|
-
|
241
|
-
ctx->
|
242
|
-
|
265
|
+
// the buffer fits into the max buffer size allowed by the device
|
266
|
+
if (size_aligned <= ctx->device.maxBufferLength) {
|
267
|
+
ctx->buffers[ctx->n_buffers].name = name;
|
268
|
+
ctx->buffers[ctx->n_buffers].data = data;
|
269
|
+
ctx->buffers[ctx->n_buffers].size = size;
|
243
270
|
|
244
|
-
|
245
|
-
|
246
|
-
|
247
|
-
|
248
|
-
|
271
|
+
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
272
|
+
|
273
|
+
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
274
|
+
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_aligned / 1024.0 / 1024.0);
|
275
|
+
return false;
|
276
|
+
}
|
249
277
|
|
250
|
-
|
251
|
-
|
252
|
-
|
278
|
+
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB", __func__, name, size_aligned / 1024.0 / 1024.0);
|
279
|
+
|
280
|
+
++ctx->n_buffers;
|
253
281
|
} else {
|
254
|
-
|
282
|
+
// this overlap between the views will guarantee that the tensor with the maximum size will fully fit into
|
283
|
+
// one of the views
|
284
|
+
const size_t size_ovlp = ((max_size + size_page - 1) / size_page + 1) * size_page; // round-up 2 pages just in case
|
285
|
+
const size_t size_step = ctx->device.maxBufferLength - size_ovlp;
|
286
|
+
const size_t size_view = ctx->device.maxBufferLength;
|
287
|
+
|
288
|
+
for (size_t i = 0; i < size; i += size_step) {
|
289
|
+
const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
|
290
|
+
|
291
|
+
ctx->buffers[ctx->n_buffers].name = name;
|
292
|
+
ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
|
293
|
+
ctx->buffers[ctx->n_buffers].size = size_step_aligned;
|
294
|
+
|
295
|
+
ctx->buffers[ctx->n_buffers].metal = [ctx->device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
296
|
+
|
297
|
+
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
298
|
+
fprintf(stderr, "%s: failed to allocate '%-16s' buffer, size = %8.2f MB\n", __func__, name, size_step_aligned / 1024.0 / 1024.0);
|
299
|
+
return false;
|
300
|
+
}
|
301
|
+
|
302
|
+
fprintf(stderr, "%s: allocated '%-16s' buffer, size = %8.2f MB, offs = %12ld", __func__, name, size_step_aligned / 1024.0 / 1024.0, i);
|
303
|
+
if (i + size_step < size) {
|
304
|
+
fprintf(stderr, "\n");
|
305
|
+
}
|
306
|
+
|
307
|
+
++ctx->n_buffers;
|
308
|
+
}
|
255
309
|
}
|
256
310
|
|
257
|
-
|
311
|
+
fprintf(stderr, ", (%8.2f / %8.2f)",
|
312
|
+
ctx->device.currentAllocatedSize / 1024.0 / 1024.0,
|
313
|
+
ctx->device.recommendedMaxWorkingSetSize / 1024.0 / 1024.0);
|
314
|
+
|
315
|
+
if (ctx->device.currentAllocatedSize > ctx->device.recommendedMaxWorkingSetSize) {
|
316
|
+
fprintf(stderr, ", warning: current allocated size is greater than the recommended max working set size\n");
|
317
|
+
} else {
|
318
|
+
fprintf(stderr, "\n");
|
319
|
+
}
|
258
320
|
}
|
259
321
|
|
260
322
|
return true;
|
@@ -606,7 +668,7 @@ void ggml_metal_graph_compute(
|
|
606
668
|
|
607
669
|
nth0 = 4;
|
608
670
|
nth1 = 16;
|
609
|
-
[encoder setComputePipelineState:ctx->
|
671
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
|
610
672
|
} break;
|
611
673
|
case GGML_TYPE_Q3_K:
|
612
674
|
{
|
@@ -615,7 +677,7 @@ void ggml_metal_graph_compute(
|
|
615
677
|
|
616
678
|
nth0 = 4;
|
617
679
|
nth1 = 16;
|
618
|
-
[encoder setComputePipelineState:ctx->
|
680
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
|
619
681
|
} break;
|
620
682
|
case GGML_TYPE_Q4_K:
|
621
683
|
{
|
@@ -624,7 +686,7 @@ void ggml_metal_graph_compute(
|
|
624
686
|
|
625
687
|
nth0 = 4;
|
626
688
|
nth1 = 16;
|
627
|
-
[encoder setComputePipelineState:ctx->
|
689
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
|
628
690
|
} break;
|
629
691
|
case GGML_TYPE_Q5_K:
|
630
692
|
{
|
@@ -633,7 +695,7 @@ void ggml_metal_graph_compute(
|
|
633
695
|
|
634
696
|
nth0 = 4;
|
635
697
|
nth1 = 16;
|
636
|
-
[encoder setComputePipelineState:ctx->
|
698
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
|
637
699
|
} break;
|
638
700
|
case GGML_TYPE_Q6_K:
|
639
701
|
{
|
@@ -642,7 +704,7 @@ void ggml_metal_graph_compute(
|
|
642
704
|
|
643
705
|
nth0 = 4;
|
644
706
|
nth1 = 16;
|
645
|
-
[encoder setComputePipelineState:ctx->
|
707
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
|
646
708
|
} break;
|
647
709
|
default:
|
648
710
|
{
|
@@ -694,11 +756,11 @@ void ggml_metal_graph_compute(
|
|
694
756
|
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
|
695
757
|
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
|
696
758
|
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
|
697
|
-
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->
|
698
|
-
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->
|
699
|
-
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->
|
700
|
-
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->
|
701
|
-
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->
|
759
|
+
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
|
760
|
+
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
|
761
|
+
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
|
762
|
+
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
|
763
|
+
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
|
702
764
|
default: GGML_ASSERT(false && "not implemented");
|
703
765
|
}
|
704
766
|
|
@@ -735,6 +797,70 @@ void ggml_metal_graph_compute(
|
|
735
797
|
|
736
798
|
[encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
737
799
|
} break;
|
800
|
+
case GGML_OP_NORM:
|
801
|
+
{
|
802
|
+
if (encoder == nil) {
|
803
|
+
encoder = [command_buffer computeCommandEncoder];
|
804
|
+
}
|
805
|
+
|
806
|
+
const float eps = 1e-5f;
|
807
|
+
|
808
|
+
const int nth = 256;
|
809
|
+
|
810
|
+
[encoder setComputePipelineState:ctx->pipeline_norm];
|
811
|
+
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
812
|
+
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
813
|
+
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
814
|
+
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:3];
|
815
|
+
[encoder setBytes:&eps length:sizeof( float) atIndex:4];
|
816
|
+
[encoder setThreadgroupMemoryLength:nth*sizeof(float) atIndex:0];
|
817
|
+
|
818
|
+
const int64_t nrows = ggml_nrows(src0);
|
819
|
+
|
820
|
+
[encoder dispatchThreadgroups:MTLSizeMake(nrows, 1, 1) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
821
|
+
} break;
|
822
|
+
case GGML_OP_ALIBI:
|
823
|
+
{
|
824
|
+
if (encoder == nil) {
|
825
|
+
encoder = [command_buffer computeCommandEncoder];
|
826
|
+
}
|
827
|
+
|
828
|
+
GGML_ASSERT((src0t == GGML_TYPE_F32));
|
829
|
+
|
830
|
+
const int n_past = ((int32_t *) src1->data)[0]; UNUSED(n_past);
|
831
|
+
const int n_head = ((int32_t *) src1->data)[1];
|
832
|
+
const float max_bias = ((float *) src1->data)[2];
|
833
|
+
|
834
|
+
if (__builtin_popcount(n_head) != 1) {
|
835
|
+
GGML_ASSERT(false && "only power-of-two n_head implemented");
|
836
|
+
}
|
837
|
+
|
838
|
+
const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
|
839
|
+
const float m0 = powf(2.0f, -(max_bias) / n_heads_log2_floor);
|
840
|
+
|
841
|
+
[encoder setComputePipelineState:ctx->pipeline_alibi_f32];
|
842
|
+
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
843
|
+
[encoder setBuffer:id_dst offset:offs_dst atIndex:1];
|
844
|
+
[encoder setBytes:&ne00 length:sizeof( int64_t) atIndex:2];
|
845
|
+
[encoder setBytes:&ne01 length:sizeof( int64_t) atIndex:3];
|
846
|
+
[encoder setBytes:&ne02 length:sizeof( int64_t) atIndex:4];
|
847
|
+
[encoder setBytes:&ne03 length:sizeof( int64_t) atIndex:5];
|
848
|
+
[encoder setBytes:&nb00 length:sizeof(uint64_t) atIndex:6];
|
849
|
+
[encoder setBytes:&nb01 length:sizeof(uint64_t) atIndex:7];
|
850
|
+
[encoder setBytes:&nb02 length:sizeof(uint64_t) atIndex:8];
|
851
|
+
[encoder setBytes:&nb03 length:sizeof(uint64_t) atIndex:9];
|
852
|
+
[encoder setBytes:&ne0 length:sizeof( int64_t) atIndex:10];
|
853
|
+
[encoder setBytes:&ne1 length:sizeof( int64_t) atIndex:11];
|
854
|
+
[encoder setBytes:&ne2 length:sizeof( int64_t) atIndex:12];
|
855
|
+
[encoder setBytes:&ne3 length:sizeof( int64_t) atIndex:13];
|
856
|
+
[encoder setBytes:&nb0 length:sizeof(uint64_t) atIndex:14];
|
857
|
+
[encoder setBytes:&nb1 length:sizeof(uint64_t) atIndex:15];
|
858
|
+
[encoder setBytes:&nb2 length:sizeof(uint64_t) atIndex:16];
|
859
|
+
[encoder setBytes:&nb3 length:sizeof(uint64_t) atIndex:17];
|
860
|
+
[encoder setBytes:&m0 length:sizeof( float) atIndex:18];
|
861
|
+
const int nth = 32;
|
862
|
+
[encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth, 1, 1)];
|
863
|
+
} break;
|
738
864
|
case GGML_OP_ROPE:
|
739
865
|
{
|
740
866
|
if (encoder == nil) {
|
@@ -788,6 +914,14 @@ void ggml_metal_graph_compute(
|
|
788
914
|
default: GGML_ASSERT(false && "not implemented");
|
789
915
|
};
|
790
916
|
} break;
|
917
|
+
case GGML_TYPE_F16:
|
918
|
+
{
|
919
|
+
switch (dstt) {
|
920
|
+
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_cpy_f16_f16]; break;
|
921
|
+
case GGML_TYPE_F32: GGML_ASSERT(false && "cpy_f16_f32 not implemented"); break;
|
922
|
+
default: GGML_ASSERT(false && "not implemented");
|
923
|
+
};
|
924
|
+
} break;
|
791
925
|
default: GGML_ASSERT(false && "not implemented");
|
792
926
|
}
|
793
927
|
|
@@ -831,4 +965,14 @@ void ggml_metal_graph_compute(
|
|
831
965
|
dispatch_barrier_sync(queue, ^{});
|
832
966
|
|
833
967
|
[command_buffers[n_cb - 1] waitUntilCompleted];
|
968
|
+
|
969
|
+
// check status of command buffers
|
970
|
+
// needed to detect if the device ran out-of-memory for example (#1881)
|
971
|
+
for (int i = 0; i < n_cb; i++) {
|
972
|
+
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [command_buffers[i] status];
|
973
|
+
if (status != MTLCommandBufferStatusCompleted) {
|
974
|
+
fprintf(stderr, "%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
975
|
+
GGML_ASSERT(false);
|
976
|
+
}
|
977
|
+
}
|
834
978
|
}
|