llama_cpp 0.2.2 → 0.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -51,21 +51,21 @@ struct ggml_metal_context {
51
51
  GGML_METAL_DECL_KERNEL(get_rows_f16);
52
52
  GGML_METAL_DECL_KERNEL(get_rows_q4_0);
53
53
  GGML_METAL_DECL_KERNEL(get_rows_q4_1);
54
- GGML_METAL_DECL_KERNEL(get_rows_q2_k);
55
- GGML_METAL_DECL_KERNEL(get_rows_q3_k);
56
- GGML_METAL_DECL_KERNEL(get_rows_q4_k);
57
- GGML_METAL_DECL_KERNEL(get_rows_q5_k);
58
- GGML_METAL_DECL_KERNEL(get_rows_q6_k);
54
+ GGML_METAL_DECL_KERNEL(get_rows_q2_K);
55
+ GGML_METAL_DECL_KERNEL(get_rows_q3_K);
56
+ GGML_METAL_DECL_KERNEL(get_rows_q4_K);
57
+ GGML_METAL_DECL_KERNEL(get_rows_q5_K);
58
+ GGML_METAL_DECL_KERNEL(get_rows_q6_K);
59
59
  GGML_METAL_DECL_KERNEL(rms_norm);
60
60
  GGML_METAL_DECL_KERNEL(norm);
61
61
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
62
62
  GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
63
63
  GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
64
- GGML_METAL_DECL_KERNEL(mul_mat_q2_k_f32);
65
- GGML_METAL_DECL_KERNEL(mul_mat_q3_k_f32);
66
- GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32);
67
- GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
68
- GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
64
+ GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
65
+ GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
66
+ GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
67
+ GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
68
+ GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
69
69
  GGML_METAL_DECL_KERNEL(rope);
70
70
  GGML_METAL_DECL_KERNEL(alibi_f32);
71
71
  GGML_METAL_DECL_KERNEL(cpy_f32_f16);
@@ -132,7 +132,13 @@ struct ggml_metal_context * ggml_metal_init(void) {
132
132
  exit(1);
133
133
  }
134
134
 
135
+ #ifdef GGML_QKK_64
136
+ MTLCompileOptions* options = [MTLCompileOptions new];
137
+ options.preprocessorMacros = @{ @"QK_K" : @(64) };
138
+ ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
139
+ #else
135
140
  ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
141
+ #endif
136
142
  if (error) {
137
143
  fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
138
144
  exit(1);
@@ -159,21 +165,21 @@ struct ggml_metal_context * ggml_metal_init(void) {
159
165
  GGML_METAL_ADD_KERNEL(get_rows_f16);
160
166
  GGML_METAL_ADD_KERNEL(get_rows_q4_0);
161
167
  GGML_METAL_ADD_KERNEL(get_rows_q4_1);
162
- GGML_METAL_ADD_KERNEL(get_rows_q2_k);
163
- GGML_METAL_ADD_KERNEL(get_rows_q3_k);
164
- GGML_METAL_ADD_KERNEL(get_rows_q4_k);
165
- GGML_METAL_ADD_KERNEL(get_rows_q5_k);
166
- GGML_METAL_ADD_KERNEL(get_rows_q6_k);
168
+ GGML_METAL_ADD_KERNEL(get_rows_q2_K);
169
+ GGML_METAL_ADD_KERNEL(get_rows_q3_K);
170
+ GGML_METAL_ADD_KERNEL(get_rows_q4_K);
171
+ GGML_METAL_ADD_KERNEL(get_rows_q5_K);
172
+ GGML_METAL_ADD_KERNEL(get_rows_q6_K);
167
173
  GGML_METAL_ADD_KERNEL(rms_norm);
168
174
  GGML_METAL_ADD_KERNEL(norm);
169
175
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
170
176
  GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
171
177
  GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
172
- GGML_METAL_ADD_KERNEL(mul_mat_q2_k_f32);
173
- GGML_METAL_ADD_KERNEL(mul_mat_q3_k_f32);
174
- GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32);
175
- GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
176
- GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
178
+ GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
179
+ GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
180
+ GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
181
+ GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
182
+ GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
177
183
  GGML_METAL_ADD_KERNEL(rope);
178
184
  GGML_METAL_ADD_KERNEL(alibi_f32);
179
185
  GGML_METAL_ADD_KERNEL(cpy_f32_f16);
@@ -662,7 +668,7 @@ void ggml_metal_graph_compute(
662
668
 
663
669
  nth0 = 4;
664
670
  nth1 = 16;
665
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
671
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
666
672
  } break;
667
673
  case GGML_TYPE_Q3_K:
668
674
  {
@@ -671,7 +677,7 @@ void ggml_metal_graph_compute(
671
677
 
672
678
  nth0 = 4;
673
679
  nth1 = 16;
674
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
680
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
675
681
  } break;
676
682
  case GGML_TYPE_Q4_K:
677
683
  {
@@ -680,7 +686,7 @@ void ggml_metal_graph_compute(
680
686
 
681
687
  nth0 = 4;
682
688
  nth1 = 16;
683
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
689
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
684
690
  } break;
685
691
  case GGML_TYPE_Q5_K:
686
692
  {
@@ -689,7 +695,7 @@ void ggml_metal_graph_compute(
689
695
 
690
696
  nth0 = 4;
691
697
  nth1 = 16;
692
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
698
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
693
699
  } break;
694
700
  case GGML_TYPE_Q6_K:
695
701
  {
@@ -698,7 +704,7 @@ void ggml_metal_graph_compute(
698
704
 
699
705
  nth0 = 4;
700
706
  nth1 = 16;
701
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
707
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
702
708
  } break;
703
709
  default:
704
710
  {
@@ -750,11 +756,11 @@ void ggml_metal_graph_compute(
750
756
  case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
751
757
  case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
752
758
  case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
753
- case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
754
- case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
755
- case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
756
- case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
757
- case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
759
+ case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
760
+ case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
761
+ case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
762
+ case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
763
+ case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
758
764
  default: GGML_ASSERT(false && "not implemented");
759
765
  }
760
766