llama_cpp 0.12.0 → 0.12.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +6 -0
- data/ext/llama_cpp/llama_cpp.cpp +14 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -2
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +758 -39
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +86 -7
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-quants.c +635 -1
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -1
- data/vendor/tmp/llama.cpp/ggml.c +91 -52
- data/vendor/tmp/llama.cpp/ggml.h +14 -11
- data/vendor/tmp/llama.cpp/llama.cpp +79 -30
- data/vendor/tmp/llama.cpp/llama.h +14 -0
- metadata +2 -2
@@ -87,7 +87,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
|
|
87
87
|
|
88
88
|
// same as ggml_graph_compute but uses Metal
|
89
89
|
// creates gf->n_threads command buffers in parallel
|
90
|
-
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
90
|
+
bool ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
91
91
|
|
92
92
|
//
|
93
93
|
// backend API
|
@@ -88,6 +88,8 @@ struct ggml_metal_context {
|
|
88
88
|
GGML_METAL_DECL_KERNEL(get_rows_q5_K);
|
89
89
|
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
|
90
90
|
GGML_METAL_DECL_KERNEL(get_rows_i32);
|
91
|
+
GGML_METAL_DECL_KERNEL(get_rows_iq2_xxs);
|
92
|
+
GGML_METAL_DECL_KERNEL(get_rows_iq2_xs);
|
91
93
|
GGML_METAL_DECL_KERNEL(rms_norm);
|
92
94
|
GGML_METAL_DECL_KERNEL(group_norm);
|
93
95
|
GGML_METAL_DECL_KERNEL(norm);
|
@@ -106,6 +108,8 @@ struct ggml_metal_context {
|
|
106
108
|
GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32);
|
107
109
|
GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32);
|
108
110
|
GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32);
|
111
|
+
GGML_METAL_DECL_KERNEL(mul_mv_iq2_xxs_f32);
|
112
|
+
GGML_METAL_DECL_KERNEL(mul_mv_iq2_xs_f32);
|
109
113
|
GGML_METAL_DECL_KERNEL(mul_mv_id_f32_f32);
|
110
114
|
//GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f16);
|
111
115
|
GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32);
|
@@ -121,6 +125,8 @@ struct ggml_metal_context {
|
|
121
125
|
GGML_METAL_DECL_KERNEL(mul_mv_id_q4_K_f32);
|
122
126
|
GGML_METAL_DECL_KERNEL(mul_mv_id_q5_K_f32);
|
123
127
|
GGML_METAL_DECL_KERNEL(mul_mv_id_q6_K_f32);
|
128
|
+
GGML_METAL_DECL_KERNEL(mul_mv_id_iq2_xxs_f32);
|
129
|
+
GGML_METAL_DECL_KERNEL(mul_mv_id_iq2_xs_f32);
|
124
130
|
GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
|
125
131
|
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
|
126
132
|
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
|
@@ -133,6 +139,8 @@ struct ggml_metal_context {
|
|
133
139
|
GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32);
|
134
140
|
GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32);
|
135
141
|
GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32);
|
142
|
+
GGML_METAL_DECL_KERNEL(mul_mm_iq2_xxs_f32);
|
143
|
+
GGML_METAL_DECL_KERNEL(mul_mm_iq2_xs_f32);
|
136
144
|
GGML_METAL_DECL_KERNEL(mul_mm_id_f32_f32);
|
137
145
|
GGML_METAL_DECL_KERNEL(mul_mm_id_f16_f32);
|
138
146
|
GGML_METAL_DECL_KERNEL(mul_mm_id_q4_0_f32);
|
@@ -145,6 +153,8 @@ struct ggml_metal_context {
|
|
145
153
|
GGML_METAL_DECL_KERNEL(mul_mm_id_q4_K_f32);
|
146
154
|
GGML_METAL_DECL_KERNEL(mul_mm_id_q5_K_f32);
|
147
155
|
GGML_METAL_DECL_KERNEL(mul_mm_id_q6_K_f32);
|
156
|
+
GGML_METAL_DECL_KERNEL(mul_mm_id_iq2_xxs_f32);
|
157
|
+
GGML_METAL_DECL_KERNEL(mul_mm_id_iq2_xs_f32);
|
148
158
|
GGML_METAL_DECL_KERNEL(rope_f32);
|
149
159
|
GGML_METAL_DECL_KERNEL(rope_f16);
|
150
160
|
GGML_METAL_DECL_KERNEL(alibi_f32);
|
@@ -258,14 +268,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
258
268
|
bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
259
269
|
#endif
|
260
270
|
NSError * error = nil;
|
261
|
-
NSString * libPath = [bundle pathForResource:@"ggml" ofType:@"metallib"];
|
271
|
+
NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
|
262
272
|
if (libPath != nil) {
|
263
273
|
// pre-compiled library found
|
264
274
|
NSURL * libURL = [NSURL fileURLWithPath:libPath];
|
265
275
|
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
|
266
276
|
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
267
277
|
} else {
|
268
|
-
GGML_METAL_LOG_INFO("%s: ggml.metallib not found, loading from source\n", __func__);
|
278
|
+
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
269
279
|
|
270
280
|
NSString * sourcePath;
|
271
281
|
NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
|
@@ -295,7 +305,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
295
305
|
#endif
|
296
306
|
// try to disable fast-math
|
297
307
|
// NOTE: this seems to have no effect whatsoever
|
298
|
-
// instead, in order to disable fast-math, we have to build ggml.metallib from the command line
|
308
|
+
// instead, in order to disable fast-math, we have to build default.metallib from the command line
|
299
309
|
// using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
|
300
310
|
// and go through the "pre-compiled library found" path above
|
301
311
|
//[options setFastMathEnabled:false];
|
@@ -379,6 +389,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
379
389
|
GGML_METAL_ADD_KERNEL(get_rows_q5_K);
|
380
390
|
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
|
381
391
|
GGML_METAL_ADD_KERNEL(get_rows_i32);
|
392
|
+
GGML_METAL_ADD_KERNEL(get_rows_iq2_xxs);
|
393
|
+
GGML_METAL_ADD_KERNEL(get_rows_iq2_xs);
|
382
394
|
GGML_METAL_ADD_KERNEL(rms_norm);
|
383
395
|
GGML_METAL_ADD_KERNEL(group_norm);
|
384
396
|
GGML_METAL_ADD_KERNEL(norm);
|
@@ -397,6 +409,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
397
409
|
GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
|
398
410
|
GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
|
399
411
|
GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
|
412
|
+
GGML_METAL_ADD_KERNEL(mul_mv_iq2_xxs_f32);
|
413
|
+
GGML_METAL_ADD_KERNEL(mul_mv_iq2_xs_f32);
|
400
414
|
GGML_METAL_ADD_KERNEL(mul_mv_id_f32_f32);
|
401
415
|
//GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f16);
|
402
416
|
GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32);
|
@@ -412,6 +426,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
412
426
|
GGML_METAL_ADD_KERNEL(mul_mv_id_q4_K_f32);
|
413
427
|
GGML_METAL_ADD_KERNEL(mul_mv_id_q5_K_f32);
|
414
428
|
GGML_METAL_ADD_KERNEL(mul_mv_id_q6_K_f32);
|
429
|
+
GGML_METAL_ADD_KERNEL(mul_mv_id_iq2_xxs_f32);
|
430
|
+
GGML_METAL_ADD_KERNEL(mul_mv_id_iq2_xs_f32);
|
415
431
|
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
|
416
432
|
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
|
417
433
|
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
@@ -425,6 +441,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
425
441
|
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
|
426
442
|
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
|
427
443
|
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
|
444
|
+
GGML_METAL_ADD_KERNEL(mul_mm_iq2_xxs_f32);
|
445
|
+
GGML_METAL_ADD_KERNEL(mul_mm_iq2_xs_f32);
|
428
446
|
GGML_METAL_ADD_KERNEL(mul_mm_id_f32_f32);
|
429
447
|
GGML_METAL_ADD_KERNEL(mul_mm_id_f16_f32);
|
430
448
|
GGML_METAL_ADD_KERNEL(mul_mm_id_q4_0_f32);
|
@@ -437,6 +455,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
437
455
|
GGML_METAL_ADD_KERNEL(mul_mm_id_q4_K_f32);
|
438
456
|
GGML_METAL_ADD_KERNEL(mul_mm_id_q5_K_f32);
|
439
457
|
GGML_METAL_ADD_KERNEL(mul_mm_id_q6_K_f32);
|
458
|
+
GGML_METAL_ADD_KERNEL(mul_mm_id_iq2_xxs_f32);
|
459
|
+
GGML_METAL_ADD_KERNEL(mul_mm_id_iq2_xs_f32);
|
440
460
|
}
|
441
461
|
GGML_METAL_ADD_KERNEL(rope_f32);
|
442
462
|
GGML_METAL_ADD_KERNEL(rope_f16);
|
@@ -502,6 +522,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
502
522
|
GGML_METAL_DEL_KERNEL(get_rows_q5_K);
|
503
523
|
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
|
504
524
|
GGML_METAL_DEL_KERNEL(get_rows_i32);
|
525
|
+
GGML_METAL_DEL_KERNEL(get_rows_iq2_xxs);
|
526
|
+
GGML_METAL_DEL_KERNEL(get_rows_iq2_xs);
|
505
527
|
GGML_METAL_DEL_KERNEL(rms_norm);
|
506
528
|
GGML_METAL_DEL_KERNEL(group_norm);
|
507
529
|
GGML_METAL_DEL_KERNEL(norm);
|
@@ -520,6 +542,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
520
542
|
GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
|
521
543
|
GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
|
522
544
|
GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
|
545
|
+
GGML_METAL_DEL_KERNEL(mul_mv_iq2_xxs_f32);
|
546
|
+
GGML_METAL_DEL_KERNEL(mul_mv_iq2_xs_f32);
|
523
547
|
GGML_METAL_DEL_KERNEL(mul_mv_id_f32_f32);
|
524
548
|
//GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f16);
|
525
549
|
GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32);
|
@@ -535,6 +559,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
535
559
|
GGML_METAL_DEL_KERNEL(mul_mv_id_q4_K_f32);
|
536
560
|
GGML_METAL_DEL_KERNEL(mul_mv_id_q5_K_f32);
|
537
561
|
GGML_METAL_DEL_KERNEL(mul_mv_id_q6_K_f32);
|
562
|
+
GGML_METAL_DEL_KERNEL(mul_mv_id_iq2_xxs_f32);
|
563
|
+
GGML_METAL_DEL_KERNEL(mul_mv_id_iq2_xs_f32);
|
538
564
|
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
|
539
565
|
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
|
540
566
|
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
@@ -548,6 +574,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
548
574
|
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
|
549
575
|
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
|
550
576
|
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
|
577
|
+
GGML_METAL_DEL_KERNEL(mul_mm_iq2_xxs_f32);
|
578
|
+
GGML_METAL_DEL_KERNEL(mul_mm_iq2_xs_f32);
|
551
579
|
GGML_METAL_DEL_KERNEL(mul_mm_id_f32_f32);
|
552
580
|
GGML_METAL_DEL_KERNEL(mul_mm_id_f16_f32);
|
553
581
|
GGML_METAL_DEL_KERNEL(mul_mm_id_q4_0_f32);
|
@@ -560,6 +588,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
560
588
|
GGML_METAL_DEL_KERNEL(mul_mm_id_q4_K_f32);
|
561
589
|
GGML_METAL_DEL_KERNEL(mul_mm_id_q5_K_f32);
|
562
590
|
GGML_METAL_DEL_KERNEL(mul_mm_id_q6_K_f32);
|
591
|
+
GGML_METAL_DEL_KERNEL(mul_mm_id_iq2_xxs_f32);
|
592
|
+
GGML_METAL_DEL_KERNEL(mul_mm_id_iq2_xs_f32);
|
563
593
|
}
|
564
594
|
GGML_METAL_DEL_KERNEL(rope_f32);
|
565
595
|
GGML_METAL_DEL_KERNEL(rope_f16);
|
@@ -977,7 +1007,7 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
|
|
977
1007
|
return false;
|
978
1008
|
}
|
979
1009
|
}
|
980
|
-
void ggml_metal_graph_compute(
|
1010
|
+
bool ggml_metal_graph_compute(
|
981
1011
|
struct ggml_metal_context * ctx,
|
982
1012
|
struct ggml_cgraph * gf) {
|
983
1013
|
@autoreleasepool {
|
@@ -1052,6 +1082,10 @@ void ggml_metal_graph_compute(
|
|
1052
1082
|
GGML_ASSERT(!"unsupported op");
|
1053
1083
|
}
|
1054
1084
|
|
1085
|
+
#ifndef GGML_METAL_NDEBUG
|
1086
|
+
[encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]];
|
1087
|
+
#endif
|
1088
|
+
|
1055
1089
|
const int64_t ne00 = src0 ? src0->ne[0] : 0;
|
1056
1090
|
const int64_t ne01 = src0 ? src0->ne[1] : 0;
|
1057
1091
|
const int64_t ne02 = src0 ? src0->ne[2] : 0;
|
@@ -1541,6 +1575,8 @@ void ggml_metal_graph_compute(
|
|
1541
1575
|
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break;
|
1542
1576
|
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
|
1543
1577
|
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
|
1578
|
+
case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_mul_mm_iq2_xxs_f32]; break;
|
1579
|
+
case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_mul_mm_iq2_xs_f32]; break;
|
1544
1580
|
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
|
1545
1581
|
}
|
1546
1582
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
@@ -1653,6 +1689,18 @@ void ggml_metal_graph_compute(
|
|
1653
1689
|
nth1 = 32;
|
1654
1690
|
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32];
|
1655
1691
|
} break;
|
1692
|
+
case GGML_TYPE_IQ2_XXS:
|
1693
|
+
{
|
1694
|
+
nth0 = 4;
|
1695
|
+
nth1 = 16;
|
1696
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mv_iq2_xxs_f32];
|
1697
|
+
} break;
|
1698
|
+
case GGML_TYPE_IQ2_XS:
|
1699
|
+
{
|
1700
|
+
nth0 = 4;
|
1701
|
+
nth1 = 16;
|
1702
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mv_iq2_xs_f32];
|
1703
|
+
} break;
|
1656
1704
|
default:
|
1657
1705
|
{
|
1658
1706
|
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
|
@@ -1689,6 +1737,11 @@ void ggml_metal_graph_compute(
|
|
1689
1737
|
src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
|
1690
1738
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1691
1739
|
}
|
1740
|
+
else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
|
1741
|
+
const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
|
1742
|
+
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
1743
|
+
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1744
|
+
}
|
1692
1745
|
else if (src0t == GGML_TYPE_Q4_K) {
|
1693
1746
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1694
1747
|
}
|
@@ -1778,6 +1831,8 @@ void ggml_metal_graph_compute(
|
|
1778
1831
|
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_K_f32]; break;
|
1779
1832
|
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_K_f32]; break;
|
1780
1833
|
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q6_K_f32]; break;
|
1834
|
+
case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_iq2_xxs_f32]; break;
|
1835
|
+
case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_iq2_xs_f32]; break;
|
1781
1836
|
default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
|
1782
1837
|
}
|
1783
1838
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
@@ -1893,6 +1948,18 @@ void ggml_metal_graph_compute(
|
|
1893
1948
|
nth1 = 32;
|
1894
1949
|
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q6_K_f32];
|
1895
1950
|
} break;
|
1951
|
+
case GGML_TYPE_IQ2_XXS:
|
1952
|
+
{
|
1953
|
+
nth0 = 4;
|
1954
|
+
nth1 = 16;
|
1955
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_iq2_xxs_f32];
|
1956
|
+
} break;
|
1957
|
+
case GGML_TYPE_IQ2_XS:
|
1958
|
+
{
|
1959
|
+
nth0 = 4;
|
1960
|
+
nth1 = 16;
|
1961
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_iq2_xs_f32];
|
1962
|
+
} break;
|
1896
1963
|
default:
|
1897
1964
|
{
|
1898
1965
|
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
|
@@ -1945,6 +2012,11 @@ void ggml_metal_graph_compute(
|
|
1945
2012
|
src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) {
|
1946
2013
|
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1947
2014
|
}
|
2015
|
+
else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
|
2016
|
+
const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
|
2017
|
+
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
2018
|
+
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
2019
|
+
}
|
1948
2020
|
else if (src2t == GGML_TYPE_Q4_K) {
|
1949
2021
|
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1950
2022
|
}
|
@@ -1982,6 +2054,8 @@ void ggml_metal_graph_compute(
|
|
1982
2054
|
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
|
1983
2055
|
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
|
1984
2056
|
case GGML_TYPE_I32: [encoder setComputePipelineState:ctx->pipeline_get_rows_i32]; break;
|
2057
|
+
case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_get_rows_iq2_xxs]; break;
|
2058
|
+
case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_get_rows_iq2_xs]; break;
|
1985
2059
|
default: GGML_ASSERT(false && "not implemented");
|
1986
2060
|
}
|
1987
2061
|
|
@@ -2383,6 +2457,10 @@ void ggml_metal_graph_compute(
|
|
2383
2457
|
GGML_ASSERT(false);
|
2384
2458
|
}
|
2385
2459
|
}
|
2460
|
+
|
2461
|
+
#ifndef GGML_METAL_NDEBUG
|
2462
|
+
[encoder popDebugGroup];
|
2463
|
+
#endif
|
2386
2464
|
}
|
2387
2465
|
|
2388
2466
|
if (encoder != nil) {
|
@@ -2405,10 +2483,11 @@ void ggml_metal_graph_compute(
|
|
2405
2483
|
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
|
2406
2484
|
if (status != MTLCommandBufferStatusCompleted) {
|
2407
2485
|
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
2408
|
-
|
2486
|
+
return false;
|
2409
2487
|
}
|
2410
2488
|
}
|
2411
2489
|
|
2490
|
+
return true;
|
2412
2491
|
}
|
2413
2492
|
}
|
2414
2493
|
|
@@ -2688,10 +2767,10 @@ static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggm
|
|
2688
2767
|
UNUSED(backend);
|
2689
2768
|
}
|
2690
2769
|
|
2691
|
-
static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
2770
|
+
static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
2692
2771
|
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
2693
2772
|
|
2694
|
-
ggml_metal_graph_compute(metal_ctx, cgraph);
|
2773
|
+
return ggml_metal_graph_compute(metal_ctx, cgraph);
|
2695
2774
|
}
|
2696
2775
|
|
2697
2776
|
static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|