llama_cpp 0.11.1 → 0.12.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +15 -0
- data/README.md +3 -3
- data/examples/chat.rb +6 -2
- data/examples/embedding.rb +5 -1
- data/examples/simple.rb +4 -1
- data/ext/llama_cpp/llama_cpp.cpp +63 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +2 -2
- data/sig/llama_cpp.rbs +5 -0
- data/vendor/tmp/llama.cpp/Makefile +8 -2
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-backend.c +7 -3
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +758 -39
- data/vendor/tmp/llama.cpp/ggml-metal.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +86 -7
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-quants.c +635 -1
- data/vendor/tmp/llama.cpp/ggml-quants.h +25 -1
- data/vendor/tmp/llama.cpp/ggml.c +91 -52
- data/vendor/tmp/llama.cpp/ggml.h +14 -11
- data/vendor/tmp/llama.cpp/llama.cpp +79 -30
- data/vendor/tmp/llama.cpp/llama.h +14 -0
- metadata +2 -2
@@ -87,7 +87,7 @@ int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
|
|
87
87
|
|
88
88
|
// same as ggml_graph_compute but uses Metal
|
89
89
|
// creates gf->n_threads command buffers in parallel
|
90
|
-
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
90
|
+
bool ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
|
91
91
|
|
92
92
|
//
|
93
93
|
// backend API
|
@@ -88,6 +88,8 @@ struct ggml_metal_context {
|
|
88
88
|
GGML_METAL_DECL_KERNEL(get_rows_q5_K);
|
89
89
|
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
|
90
90
|
GGML_METAL_DECL_KERNEL(get_rows_i32);
|
91
|
+
GGML_METAL_DECL_KERNEL(get_rows_iq2_xxs);
|
92
|
+
GGML_METAL_DECL_KERNEL(get_rows_iq2_xs);
|
91
93
|
GGML_METAL_DECL_KERNEL(rms_norm);
|
92
94
|
GGML_METAL_DECL_KERNEL(group_norm);
|
93
95
|
GGML_METAL_DECL_KERNEL(norm);
|
@@ -106,6 +108,8 @@ struct ggml_metal_context {
|
|
106
108
|
GGML_METAL_DECL_KERNEL(mul_mv_q4_K_f32);
|
107
109
|
GGML_METAL_DECL_KERNEL(mul_mv_q5_K_f32);
|
108
110
|
GGML_METAL_DECL_KERNEL(mul_mv_q6_K_f32);
|
111
|
+
GGML_METAL_DECL_KERNEL(mul_mv_iq2_xxs_f32);
|
112
|
+
GGML_METAL_DECL_KERNEL(mul_mv_iq2_xs_f32);
|
109
113
|
GGML_METAL_DECL_KERNEL(mul_mv_id_f32_f32);
|
110
114
|
//GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f16);
|
111
115
|
GGML_METAL_DECL_KERNEL(mul_mv_id_f16_f32);
|
@@ -121,6 +125,8 @@ struct ggml_metal_context {
|
|
121
125
|
GGML_METAL_DECL_KERNEL(mul_mv_id_q4_K_f32);
|
122
126
|
GGML_METAL_DECL_KERNEL(mul_mv_id_q5_K_f32);
|
123
127
|
GGML_METAL_DECL_KERNEL(mul_mv_id_q6_K_f32);
|
128
|
+
GGML_METAL_DECL_KERNEL(mul_mv_id_iq2_xxs_f32);
|
129
|
+
GGML_METAL_DECL_KERNEL(mul_mv_id_iq2_xs_f32);
|
124
130
|
GGML_METAL_DECL_KERNEL(mul_mm_f32_f32);
|
125
131
|
GGML_METAL_DECL_KERNEL(mul_mm_f16_f32);
|
126
132
|
GGML_METAL_DECL_KERNEL(mul_mm_q4_0_f32);
|
@@ -133,6 +139,8 @@ struct ggml_metal_context {
|
|
133
139
|
GGML_METAL_DECL_KERNEL(mul_mm_q4_K_f32);
|
134
140
|
GGML_METAL_DECL_KERNEL(mul_mm_q5_K_f32);
|
135
141
|
GGML_METAL_DECL_KERNEL(mul_mm_q6_K_f32);
|
142
|
+
GGML_METAL_DECL_KERNEL(mul_mm_iq2_xxs_f32);
|
143
|
+
GGML_METAL_DECL_KERNEL(mul_mm_iq2_xs_f32);
|
136
144
|
GGML_METAL_DECL_KERNEL(mul_mm_id_f32_f32);
|
137
145
|
GGML_METAL_DECL_KERNEL(mul_mm_id_f16_f32);
|
138
146
|
GGML_METAL_DECL_KERNEL(mul_mm_id_q4_0_f32);
|
@@ -145,6 +153,8 @@ struct ggml_metal_context {
|
|
145
153
|
GGML_METAL_DECL_KERNEL(mul_mm_id_q4_K_f32);
|
146
154
|
GGML_METAL_DECL_KERNEL(mul_mm_id_q5_K_f32);
|
147
155
|
GGML_METAL_DECL_KERNEL(mul_mm_id_q6_K_f32);
|
156
|
+
GGML_METAL_DECL_KERNEL(mul_mm_id_iq2_xxs_f32);
|
157
|
+
GGML_METAL_DECL_KERNEL(mul_mm_id_iq2_xs_f32);
|
148
158
|
GGML_METAL_DECL_KERNEL(rope_f32);
|
149
159
|
GGML_METAL_DECL_KERNEL(rope_f16);
|
150
160
|
GGML_METAL_DECL_KERNEL(alibi_f32);
|
@@ -258,14 +268,14 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
258
268
|
bundle = [NSBundle bundleForClass:[GGMLMetalClass class]];
|
259
269
|
#endif
|
260
270
|
NSError * error = nil;
|
261
|
-
NSString * libPath = [bundle pathForResource:@"ggml" ofType:@"metallib"];
|
271
|
+
NSString * libPath = [bundle pathForResource:@"default" ofType:@"metallib"];
|
262
272
|
if (libPath != nil) {
|
263
273
|
// pre-compiled library found
|
264
274
|
NSURL * libURL = [NSURL fileURLWithPath:libPath];
|
265
275
|
GGML_METAL_LOG_INFO("%s: loading '%s'\n", __func__, [libPath UTF8String]);
|
266
276
|
ctx->library = [ctx->device newLibraryWithURL:libURL error:&error];
|
267
277
|
} else {
|
268
|
-
GGML_METAL_LOG_INFO("%s: ggml.metallib not found, loading from source\n", __func__);
|
278
|
+
GGML_METAL_LOG_INFO("%s: default.metallib not found, loading from source\n", __func__);
|
269
279
|
|
270
280
|
NSString * sourcePath;
|
271
281
|
NSString * ggmlMetalPathResources = [[NSProcessInfo processInfo].environment objectForKey:@"GGML_METAL_PATH_RESOURCES"];
|
@@ -295,7 +305,7 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
295
305
|
#endif
|
296
306
|
// try to disable fast-math
|
297
307
|
// NOTE: this seems to have no effect whatsoever
|
298
|
-
// instead, in order to disable fast-math, we have to build ggml.metallib from the command line
|
308
|
+
// instead, in order to disable fast-math, we have to build default.metallib from the command line
|
299
309
|
// using xcrun -sdk macosx metal -fno-fast-math -c ggml-metal.metal -o ggml-metal.air
|
300
310
|
// and go through the "pre-compiled library found" path above
|
301
311
|
//[options setFastMathEnabled:false];
|
@@ -379,6 +389,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
379
389
|
GGML_METAL_ADD_KERNEL(get_rows_q5_K);
|
380
390
|
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
|
381
391
|
GGML_METAL_ADD_KERNEL(get_rows_i32);
|
392
|
+
GGML_METAL_ADD_KERNEL(get_rows_iq2_xxs);
|
393
|
+
GGML_METAL_ADD_KERNEL(get_rows_iq2_xs);
|
382
394
|
GGML_METAL_ADD_KERNEL(rms_norm);
|
383
395
|
GGML_METAL_ADD_KERNEL(group_norm);
|
384
396
|
GGML_METAL_ADD_KERNEL(norm);
|
@@ -397,6 +409,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
397
409
|
GGML_METAL_ADD_KERNEL(mul_mv_q4_K_f32);
|
398
410
|
GGML_METAL_ADD_KERNEL(mul_mv_q5_K_f32);
|
399
411
|
GGML_METAL_ADD_KERNEL(mul_mv_q6_K_f32);
|
412
|
+
GGML_METAL_ADD_KERNEL(mul_mv_iq2_xxs_f32);
|
413
|
+
GGML_METAL_ADD_KERNEL(mul_mv_iq2_xs_f32);
|
400
414
|
GGML_METAL_ADD_KERNEL(mul_mv_id_f32_f32);
|
401
415
|
//GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f16);
|
402
416
|
GGML_METAL_ADD_KERNEL(mul_mv_id_f16_f32);
|
@@ -412,6 +426,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
412
426
|
GGML_METAL_ADD_KERNEL(mul_mv_id_q4_K_f32);
|
413
427
|
GGML_METAL_ADD_KERNEL(mul_mv_id_q5_K_f32);
|
414
428
|
GGML_METAL_ADD_KERNEL(mul_mv_id_q6_K_f32);
|
429
|
+
GGML_METAL_ADD_KERNEL(mul_mv_id_iq2_xxs_f32);
|
430
|
+
GGML_METAL_ADD_KERNEL(mul_mv_id_iq2_xs_f32);
|
415
431
|
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
|
416
432
|
GGML_METAL_ADD_KERNEL(mul_mm_f32_f32);
|
417
433
|
GGML_METAL_ADD_KERNEL(mul_mm_f16_f32);
|
@@ -425,6 +441,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
425
441
|
GGML_METAL_ADD_KERNEL(mul_mm_q4_K_f32);
|
426
442
|
GGML_METAL_ADD_KERNEL(mul_mm_q5_K_f32);
|
427
443
|
GGML_METAL_ADD_KERNEL(mul_mm_q6_K_f32);
|
444
|
+
GGML_METAL_ADD_KERNEL(mul_mm_iq2_xxs_f32);
|
445
|
+
GGML_METAL_ADD_KERNEL(mul_mm_iq2_xs_f32);
|
428
446
|
GGML_METAL_ADD_KERNEL(mul_mm_id_f32_f32);
|
429
447
|
GGML_METAL_ADD_KERNEL(mul_mm_id_f16_f32);
|
430
448
|
GGML_METAL_ADD_KERNEL(mul_mm_id_q4_0_f32);
|
@@ -437,6 +455,8 @@ struct ggml_metal_context * ggml_metal_init(int n_cb) {
|
|
437
455
|
GGML_METAL_ADD_KERNEL(mul_mm_id_q4_K_f32);
|
438
456
|
GGML_METAL_ADD_KERNEL(mul_mm_id_q5_K_f32);
|
439
457
|
GGML_METAL_ADD_KERNEL(mul_mm_id_q6_K_f32);
|
458
|
+
GGML_METAL_ADD_KERNEL(mul_mm_id_iq2_xxs_f32);
|
459
|
+
GGML_METAL_ADD_KERNEL(mul_mm_id_iq2_xs_f32);
|
440
460
|
}
|
441
461
|
GGML_METAL_ADD_KERNEL(rope_f32);
|
442
462
|
GGML_METAL_ADD_KERNEL(rope_f16);
|
@@ -502,6 +522,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
502
522
|
GGML_METAL_DEL_KERNEL(get_rows_q5_K);
|
503
523
|
GGML_METAL_DEL_KERNEL(get_rows_q6_K);
|
504
524
|
GGML_METAL_DEL_KERNEL(get_rows_i32);
|
525
|
+
GGML_METAL_DEL_KERNEL(get_rows_iq2_xxs);
|
526
|
+
GGML_METAL_DEL_KERNEL(get_rows_iq2_xs);
|
505
527
|
GGML_METAL_DEL_KERNEL(rms_norm);
|
506
528
|
GGML_METAL_DEL_KERNEL(group_norm);
|
507
529
|
GGML_METAL_DEL_KERNEL(norm);
|
@@ -520,6 +542,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
520
542
|
GGML_METAL_DEL_KERNEL(mul_mv_q4_K_f32);
|
521
543
|
GGML_METAL_DEL_KERNEL(mul_mv_q5_K_f32);
|
522
544
|
GGML_METAL_DEL_KERNEL(mul_mv_q6_K_f32);
|
545
|
+
GGML_METAL_DEL_KERNEL(mul_mv_iq2_xxs_f32);
|
546
|
+
GGML_METAL_DEL_KERNEL(mul_mv_iq2_xs_f32);
|
523
547
|
GGML_METAL_DEL_KERNEL(mul_mv_id_f32_f32);
|
524
548
|
//GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f16);
|
525
549
|
GGML_METAL_DEL_KERNEL(mul_mv_id_f16_f32);
|
@@ -535,6 +559,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
535
559
|
GGML_METAL_DEL_KERNEL(mul_mv_id_q4_K_f32);
|
536
560
|
GGML_METAL_DEL_KERNEL(mul_mv_id_q5_K_f32);
|
537
561
|
GGML_METAL_DEL_KERNEL(mul_mv_id_q6_K_f32);
|
562
|
+
GGML_METAL_DEL_KERNEL(mul_mv_id_iq2_xxs_f32);
|
563
|
+
GGML_METAL_DEL_KERNEL(mul_mv_id_iq2_xs_f32);
|
538
564
|
if ([ctx->device supportsFamily:MTLGPUFamilyApple7]) {
|
539
565
|
GGML_METAL_DEL_KERNEL(mul_mm_f32_f32);
|
540
566
|
GGML_METAL_DEL_KERNEL(mul_mm_f16_f32);
|
@@ -548,6 +574,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
548
574
|
GGML_METAL_DEL_KERNEL(mul_mm_q4_K_f32);
|
549
575
|
GGML_METAL_DEL_KERNEL(mul_mm_q5_K_f32);
|
550
576
|
GGML_METAL_DEL_KERNEL(mul_mm_q6_K_f32);
|
577
|
+
GGML_METAL_DEL_KERNEL(mul_mm_iq2_xxs_f32);
|
578
|
+
GGML_METAL_DEL_KERNEL(mul_mm_iq2_xs_f32);
|
551
579
|
GGML_METAL_DEL_KERNEL(mul_mm_id_f32_f32);
|
552
580
|
GGML_METAL_DEL_KERNEL(mul_mm_id_f16_f32);
|
553
581
|
GGML_METAL_DEL_KERNEL(mul_mm_id_q4_0_f32);
|
@@ -560,6 +588,8 @@ void ggml_metal_free(struct ggml_metal_context * ctx) {
|
|
560
588
|
GGML_METAL_DEL_KERNEL(mul_mm_id_q4_K_f32);
|
561
589
|
GGML_METAL_DEL_KERNEL(mul_mm_id_q5_K_f32);
|
562
590
|
GGML_METAL_DEL_KERNEL(mul_mm_id_q6_K_f32);
|
591
|
+
GGML_METAL_DEL_KERNEL(mul_mm_id_iq2_xxs_f32);
|
592
|
+
GGML_METAL_DEL_KERNEL(mul_mm_id_iq2_xs_f32);
|
563
593
|
}
|
564
594
|
GGML_METAL_DEL_KERNEL(rope_f32);
|
565
595
|
GGML_METAL_DEL_KERNEL(rope_f16);
|
@@ -977,7 +1007,7 @@ static bool ggml_metal_supports_op(const struct ggml_tensor * op) {
|
|
977
1007
|
return false;
|
978
1008
|
}
|
979
1009
|
}
|
980
|
-
void ggml_metal_graph_compute(
|
1010
|
+
bool ggml_metal_graph_compute(
|
981
1011
|
struct ggml_metal_context * ctx,
|
982
1012
|
struct ggml_cgraph * gf) {
|
983
1013
|
@autoreleasepool {
|
@@ -1052,6 +1082,10 @@ void ggml_metal_graph_compute(
|
|
1052
1082
|
GGML_ASSERT(!"unsupported op");
|
1053
1083
|
}
|
1054
1084
|
|
1085
|
+
#ifndef GGML_METAL_NDEBUG
|
1086
|
+
[encoder pushDebugGroup:[NSString stringWithCString:ggml_op_desc(dst) encoding:NSUTF8StringEncoding]];
|
1087
|
+
#endif
|
1088
|
+
|
1055
1089
|
const int64_t ne00 = src0 ? src0->ne[0] : 0;
|
1056
1090
|
const int64_t ne01 = src0 ? src0->ne[1] : 0;
|
1057
1091
|
const int64_t ne02 = src0 ? src0->ne[2] : 0;
|
@@ -1541,6 +1575,8 @@ void ggml_metal_graph_compute(
|
|
1541
1575
|
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q4_K_f32]; break;
|
1542
1576
|
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q5_K_f32]; break;
|
1543
1577
|
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_q6_K_f32]; break;
|
1578
|
+
case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_mul_mm_iq2_xxs_f32]; break;
|
1579
|
+
case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_mul_mm_iq2_xs_f32]; break;
|
1544
1580
|
default: GGML_ASSERT(false && "MUL MAT-MAT not implemented");
|
1545
1581
|
}
|
1546
1582
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
@@ -1653,6 +1689,18 @@ void ggml_metal_graph_compute(
|
|
1653
1689
|
nth1 = 32;
|
1654
1690
|
[encoder setComputePipelineState:ctx->pipeline_mul_mv_q6_K_f32];
|
1655
1691
|
} break;
|
1692
|
+
case GGML_TYPE_IQ2_XXS:
|
1693
|
+
{
|
1694
|
+
nth0 = 4;
|
1695
|
+
nth1 = 16;
|
1696
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mv_iq2_xxs_f32];
|
1697
|
+
} break;
|
1698
|
+
case GGML_TYPE_IQ2_XS:
|
1699
|
+
{
|
1700
|
+
nth0 = 4;
|
1701
|
+
nth1 = 16;
|
1702
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mv_iq2_xs_f32];
|
1703
|
+
} break;
|
1656
1704
|
default:
|
1657
1705
|
{
|
1658
1706
|
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src0t);
|
@@ -1689,6 +1737,11 @@ void ggml_metal_graph_compute(
|
|
1689
1737
|
src0t == GGML_TYPE_Q2_K) { // || src0t == GGML_TYPE_Q4_K) {
|
1690
1738
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1691
1739
|
}
|
1740
|
+
else if (src0t == GGML_TYPE_IQ2_XXS || src0t == GGML_TYPE_IQ2_XS) {
|
1741
|
+
const int mem_size = src0t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
|
1742
|
+
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
1743
|
+
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 7)/8, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1744
|
+
}
|
1692
1745
|
else if (src0t == GGML_TYPE_Q4_K) {
|
1693
1746
|
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + 3)/4, ne11, ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1694
1747
|
}
|
@@ -1778,6 +1831,8 @@ void ggml_metal_graph_compute(
|
|
1778
1831
|
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q4_K_f32]; break;
|
1779
1832
|
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q5_K_f32]; break;
|
1780
1833
|
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_q6_K_f32]; break;
|
1834
|
+
case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_iq2_xxs_f32]; break;
|
1835
|
+
case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_mul_mm_id_iq2_xs_f32]; break;
|
1781
1836
|
default: GGML_ASSERT(false && "MUL_MAT_ID not implemented");
|
1782
1837
|
}
|
1783
1838
|
[encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
|
@@ -1893,6 +1948,18 @@ void ggml_metal_graph_compute(
|
|
1893
1948
|
nth1 = 32;
|
1894
1949
|
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_q6_K_f32];
|
1895
1950
|
} break;
|
1951
|
+
case GGML_TYPE_IQ2_XXS:
|
1952
|
+
{
|
1953
|
+
nth0 = 4;
|
1954
|
+
nth1 = 16;
|
1955
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_iq2_xxs_f32];
|
1956
|
+
} break;
|
1957
|
+
case GGML_TYPE_IQ2_XS:
|
1958
|
+
{
|
1959
|
+
nth0 = 4;
|
1960
|
+
nth1 = 16;
|
1961
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mv_id_iq2_xs_f32];
|
1962
|
+
} break;
|
1896
1963
|
default:
|
1897
1964
|
{
|
1898
1965
|
GGML_METAL_LOG_ERROR("Asserting on type %d\n", (int)src2t);
|
@@ -1945,6 +2012,11 @@ void ggml_metal_graph_compute(
|
|
1945
2012
|
src2t == GGML_TYPE_Q2_K) { // || src2t == GGML_TYPE_Q4_K) {
|
1946
2013
|
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1947
2014
|
}
|
2015
|
+
else if (src2t == GGML_TYPE_IQ2_XXS || src2t == GGML_TYPE_IQ2_XS) {
|
2016
|
+
const int mem_size = src2t == GGML_TYPE_IQ2_XXS ? 256*8+128 : 512*8+128;
|
2017
|
+
[encoder setThreadgroupMemoryLength:mem_size atIndex:0];
|
2018
|
+
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 7)/8, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
2019
|
+
}
|
1948
2020
|
else if (src2t == GGML_TYPE_Q4_K) {
|
1949
2021
|
[encoder dispatchThreadgroups:MTLSizeMake((ne21 + 3)/4, _ne1, ne01*ne12*ne13) threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
|
1950
2022
|
}
|
@@ -1982,6 +2054,8 @@ void ggml_metal_graph_compute(
|
|
1982
2054
|
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
|
1983
2055
|
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
|
1984
2056
|
case GGML_TYPE_I32: [encoder setComputePipelineState:ctx->pipeline_get_rows_i32]; break;
|
2057
|
+
case GGML_TYPE_IQ2_XXS: [encoder setComputePipelineState:ctx->pipeline_get_rows_iq2_xxs]; break;
|
2058
|
+
case GGML_TYPE_IQ2_XS : [encoder setComputePipelineState:ctx->pipeline_get_rows_iq2_xs]; break;
|
1985
2059
|
default: GGML_ASSERT(false && "not implemented");
|
1986
2060
|
}
|
1987
2061
|
|
@@ -2383,6 +2457,10 @@ void ggml_metal_graph_compute(
|
|
2383
2457
|
GGML_ASSERT(false);
|
2384
2458
|
}
|
2385
2459
|
}
|
2460
|
+
|
2461
|
+
#ifndef GGML_METAL_NDEBUG
|
2462
|
+
[encoder popDebugGroup];
|
2463
|
+
#endif
|
2386
2464
|
}
|
2387
2465
|
|
2388
2466
|
if (encoder != nil) {
|
@@ -2405,10 +2483,11 @@ void ggml_metal_graph_compute(
|
|
2405
2483
|
MTLCommandBufferStatus status = (MTLCommandBufferStatus) [ctx->command_buffers[i] status];
|
2406
2484
|
if (status != MTLCommandBufferStatusCompleted) {
|
2407
2485
|
GGML_METAL_LOG_INFO("%s: command buffer %d failed with status %lu\n", __func__, i, status);
|
2408
|
-
GGML_ASSERT(false);
|
2486
|
+
return false;
|
2409
2487
|
}
|
2410
2488
|
}
|
2411
2489
|
|
2490
|
+
return true;
|
2412
2491
|
}
|
2413
2492
|
}
|
2414
2493
|
|
@@ -2688,10 +2767,10 @@ static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggm
|
|
2688
2767
|
UNUSED(backend);
|
2689
2768
|
}
|
2690
2769
|
|
2691
|
-
static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
2770
|
+
static bool ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
|
2692
2771
|
struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
|
2693
2772
|
|
2694
|
-
ggml_metal_graph_compute(metal_ctx, cgraph);
|
2773
|
+
return ggml_metal_graph_compute(metal_ctx, cgraph);
|
2695
2774
|
}
|
2696
2775
|
|
2697
2776
|
static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
|