llama_cpp 0.2.2 → 0.3.1

Sign up to get free protection for your applications and to get access to all the features.
@@ -29,6 +29,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
29
29
  void ggml_cuda_free_data(struct ggml_tensor * tensor);
30
30
  void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
31
31
  void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
32
+ void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
32
33
  void ggml_cuda_set_main_device(int main_device);
33
34
  void ggml_cuda_set_scratch_size(size_t scratch_size);
34
35
  void ggml_cuda_free_scratch(void);
@@ -51,21 +51,21 @@ struct ggml_metal_context {
51
51
  GGML_METAL_DECL_KERNEL(get_rows_f16);
52
52
  GGML_METAL_DECL_KERNEL(get_rows_q4_0);
53
53
  GGML_METAL_DECL_KERNEL(get_rows_q4_1);
54
- GGML_METAL_DECL_KERNEL(get_rows_q2_k);
55
- GGML_METAL_DECL_KERNEL(get_rows_q3_k);
56
- GGML_METAL_DECL_KERNEL(get_rows_q4_k);
57
- GGML_METAL_DECL_KERNEL(get_rows_q5_k);
58
- GGML_METAL_DECL_KERNEL(get_rows_q6_k);
54
+ GGML_METAL_DECL_KERNEL(get_rows_q2_K);
55
+ GGML_METAL_DECL_KERNEL(get_rows_q3_K);
56
+ GGML_METAL_DECL_KERNEL(get_rows_q4_K);
57
+ GGML_METAL_DECL_KERNEL(get_rows_q5_K);
58
+ GGML_METAL_DECL_KERNEL(get_rows_q6_K);
59
59
  GGML_METAL_DECL_KERNEL(rms_norm);
60
60
  GGML_METAL_DECL_KERNEL(norm);
61
61
  GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
62
62
  GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
63
63
  GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
64
- GGML_METAL_DECL_KERNEL(mul_mat_q2_k_f32);
65
- GGML_METAL_DECL_KERNEL(mul_mat_q3_k_f32);
66
- GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32);
67
- GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
68
- GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
64
+ GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
65
+ GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
66
+ GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
67
+ GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
68
+ GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
69
69
  GGML_METAL_DECL_KERNEL(rope);
70
70
  GGML_METAL_DECL_KERNEL(alibi_f32);
71
71
  GGML_METAL_DECL_KERNEL(cpy_f32_f16);
@@ -132,7 +132,13 @@ struct ggml_metal_context * ggml_metal_init(void) {
132
132
  exit(1);
133
133
  }
134
134
 
135
+ #ifdef GGML_QKK_64
136
+ MTLCompileOptions* options = [MTLCompileOptions new];
137
+ options.preprocessorMacros = @{ @"QK_K" : @(64) };
138
+ ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
139
+ #else
135
140
  ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
141
+ #endif
136
142
  if (error) {
137
143
  fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
138
144
  exit(1);
@@ -159,21 +165,21 @@ struct ggml_metal_context * ggml_metal_init(void) {
159
165
  GGML_METAL_ADD_KERNEL(get_rows_f16);
160
166
  GGML_METAL_ADD_KERNEL(get_rows_q4_0);
161
167
  GGML_METAL_ADD_KERNEL(get_rows_q4_1);
162
- GGML_METAL_ADD_KERNEL(get_rows_q2_k);
163
- GGML_METAL_ADD_KERNEL(get_rows_q3_k);
164
- GGML_METAL_ADD_KERNEL(get_rows_q4_k);
165
- GGML_METAL_ADD_KERNEL(get_rows_q5_k);
166
- GGML_METAL_ADD_KERNEL(get_rows_q6_k);
168
+ GGML_METAL_ADD_KERNEL(get_rows_q2_K);
169
+ GGML_METAL_ADD_KERNEL(get_rows_q3_K);
170
+ GGML_METAL_ADD_KERNEL(get_rows_q4_K);
171
+ GGML_METAL_ADD_KERNEL(get_rows_q5_K);
172
+ GGML_METAL_ADD_KERNEL(get_rows_q6_K);
167
173
  GGML_METAL_ADD_KERNEL(rms_norm);
168
174
  GGML_METAL_ADD_KERNEL(norm);
169
175
  GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
170
176
  GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
171
177
  GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
172
- GGML_METAL_ADD_KERNEL(mul_mat_q2_k_f32);
173
- GGML_METAL_ADD_KERNEL(mul_mat_q3_k_f32);
174
- GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32);
175
- GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
176
- GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
178
+ GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
179
+ GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
180
+ GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
181
+ GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
182
+ GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
177
183
  GGML_METAL_ADD_KERNEL(rope);
178
184
  GGML_METAL_ADD_KERNEL(alibi_f32);
179
185
  GGML_METAL_ADD_KERNEL(cpy_f32_f16);
@@ -662,7 +668,7 @@ void ggml_metal_graph_compute(
662
668
 
663
669
  nth0 = 4;
664
670
  nth1 = 16;
665
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
671
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
666
672
  } break;
667
673
  case GGML_TYPE_Q3_K:
668
674
  {
@@ -671,7 +677,7 @@ void ggml_metal_graph_compute(
671
677
 
672
678
  nth0 = 4;
673
679
  nth1 = 16;
674
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
680
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
675
681
  } break;
676
682
  case GGML_TYPE_Q4_K:
677
683
  {
@@ -680,7 +686,7 @@ void ggml_metal_graph_compute(
680
686
 
681
687
  nth0 = 4;
682
688
  nth1 = 16;
683
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
689
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
684
690
  } break;
685
691
  case GGML_TYPE_Q5_K:
686
692
  {
@@ -689,7 +695,7 @@ void ggml_metal_graph_compute(
689
695
 
690
696
  nth0 = 4;
691
697
  nth1 = 16;
692
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
698
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
693
699
  } break;
694
700
  case GGML_TYPE_Q6_K:
695
701
  {
@@ -698,7 +704,7 @@ void ggml_metal_graph_compute(
698
704
 
699
705
  nth0 = 4;
700
706
  nth1 = 16;
701
- [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
707
+ [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
702
708
  } break;
703
709
  default:
704
710
  {
@@ -750,11 +756,11 @@ void ggml_metal_graph_compute(
750
756
  case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
751
757
  case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
752
758
  case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
753
- case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
754
- case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
755
- case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
756
- case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
757
- case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
759
+ case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
760
+ case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
761
+ case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
762
+ case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
763
+ case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
758
764
  default: GGML_ASSERT(false && "not implemented");
759
765
  }
760
766