llama_cpp 0.2.2 → 0.3.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +34 -0
- data/README.md +39 -6
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +3 -2
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +305 -133
- data/ext/llama_cpp/src/ggml-cuda.cu +367 -69
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +36 -30
- data/ext/llama_cpp/src/ggml-metal.metal +328 -84
- data/ext/llama_cpp/src/ggml-opencl.cpp +352 -175
- data/ext/llama_cpp/src/ggml.c +800 -303
- data/ext/llama_cpp/src/ggml.h +68 -5
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +262 -291
- data/ext/llama_cpp/src/llama.h +49 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +14 -17
- metadata +2 -3
- data/lib/llama_cpp/client.rb +0 -172
@@ -29,6 +29,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
|
|
29
29
|
void ggml_cuda_free_data(struct ggml_tensor * tensor);
|
30
30
|
void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
|
31
31
|
void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
|
32
|
+
void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
|
32
33
|
void ggml_cuda_set_main_device(int main_device);
|
33
34
|
void ggml_cuda_set_scratch_size(size_t scratch_size);
|
34
35
|
void ggml_cuda_free_scratch(void);
|
@@ -51,21 +51,21 @@ struct ggml_metal_context {
|
|
51
51
|
GGML_METAL_DECL_KERNEL(get_rows_f16);
|
52
52
|
GGML_METAL_DECL_KERNEL(get_rows_q4_0);
|
53
53
|
GGML_METAL_DECL_KERNEL(get_rows_q4_1);
|
54
|
-
GGML_METAL_DECL_KERNEL(get_rows_q2_k);
|
55
|
-
GGML_METAL_DECL_KERNEL(get_rows_q3_k);
|
56
|
-
GGML_METAL_DECL_KERNEL(get_rows_q4_k);
|
57
|
-
GGML_METAL_DECL_KERNEL(get_rows_q5_k);
|
58
|
-
GGML_METAL_DECL_KERNEL(get_rows_q6_k);
|
54
|
+
GGML_METAL_DECL_KERNEL(get_rows_q2_K);
|
55
|
+
GGML_METAL_DECL_KERNEL(get_rows_q3_K);
|
56
|
+
GGML_METAL_DECL_KERNEL(get_rows_q4_K);
|
57
|
+
GGML_METAL_DECL_KERNEL(get_rows_q5_K);
|
58
|
+
GGML_METAL_DECL_KERNEL(get_rows_q6_K);
|
59
59
|
GGML_METAL_DECL_KERNEL(rms_norm);
|
60
60
|
GGML_METAL_DECL_KERNEL(norm);
|
61
61
|
GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
|
62
62
|
GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
|
63
63
|
GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
|
64
|
-
GGML_METAL_DECL_KERNEL(mul_mat_q2_k_f32);
|
65
|
-
GGML_METAL_DECL_KERNEL(mul_mat_q3_k_f32);
|
66
|
-
GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32);
|
67
|
-
GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
|
68
|
-
GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
|
64
|
+
GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
|
65
|
+
GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
|
66
|
+
GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
|
67
|
+
GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
|
68
|
+
GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
|
69
69
|
GGML_METAL_DECL_KERNEL(rope);
|
70
70
|
GGML_METAL_DECL_KERNEL(alibi_f32);
|
71
71
|
GGML_METAL_DECL_KERNEL(cpy_f32_f16);
|
@@ -132,7 +132,13 @@ struct ggml_metal_context * ggml_metal_init(void) {
|
|
132
132
|
exit(1);
|
133
133
|
}
|
134
134
|
|
135
|
+
#ifdef GGML_QKK_64
|
136
|
+
MTLCompileOptions* options = [MTLCompileOptions new];
|
137
|
+
options.preprocessorMacros = @{ @"QK_K" : @(64) };
|
138
|
+
ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
|
139
|
+
#else
|
135
140
|
ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
|
141
|
+
#endif
|
136
142
|
if (error) {
|
137
143
|
fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
|
138
144
|
exit(1);
|
@@ -159,21 +165,21 @@ struct ggml_metal_context * ggml_metal_init(void) {
|
|
159
165
|
GGML_METAL_ADD_KERNEL(get_rows_f16);
|
160
166
|
GGML_METAL_ADD_KERNEL(get_rows_q4_0);
|
161
167
|
GGML_METAL_ADD_KERNEL(get_rows_q4_1);
|
162
|
-
GGML_METAL_ADD_KERNEL(get_rows_q2_k);
|
163
|
-
GGML_METAL_ADD_KERNEL(get_rows_q3_k);
|
164
|
-
GGML_METAL_ADD_KERNEL(get_rows_q4_k);
|
165
|
-
GGML_METAL_ADD_KERNEL(get_rows_q5_k);
|
166
|
-
GGML_METAL_ADD_KERNEL(get_rows_q6_k);
|
168
|
+
GGML_METAL_ADD_KERNEL(get_rows_q2_K);
|
169
|
+
GGML_METAL_ADD_KERNEL(get_rows_q3_K);
|
170
|
+
GGML_METAL_ADD_KERNEL(get_rows_q4_K);
|
171
|
+
GGML_METAL_ADD_KERNEL(get_rows_q5_K);
|
172
|
+
GGML_METAL_ADD_KERNEL(get_rows_q6_K);
|
167
173
|
GGML_METAL_ADD_KERNEL(rms_norm);
|
168
174
|
GGML_METAL_ADD_KERNEL(norm);
|
169
175
|
GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
|
170
176
|
GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
|
171
177
|
GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
|
172
|
-
GGML_METAL_ADD_KERNEL(mul_mat_q2_k_f32);
|
173
|
-
GGML_METAL_ADD_KERNEL(mul_mat_q3_k_f32);
|
174
|
-
GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32);
|
175
|
-
GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
|
176
|
-
GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
|
178
|
+
GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
|
179
|
+
GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
|
180
|
+
GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
|
181
|
+
GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
|
182
|
+
GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
|
177
183
|
GGML_METAL_ADD_KERNEL(rope);
|
178
184
|
GGML_METAL_ADD_KERNEL(alibi_f32);
|
179
185
|
GGML_METAL_ADD_KERNEL(cpy_f32_f16);
|
@@ -662,7 +668,7 @@ void ggml_metal_graph_compute(
|
|
662
668
|
|
663
669
|
nth0 = 4;
|
664
670
|
nth1 = 16;
|
665
|
-
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
|
671
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
|
666
672
|
} break;
|
667
673
|
case GGML_TYPE_Q3_K:
|
668
674
|
{
|
@@ -671,7 +677,7 @@ void ggml_metal_graph_compute(
|
|
671
677
|
|
672
678
|
nth0 = 4;
|
673
679
|
nth1 = 16;
|
674
|
-
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
|
680
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
|
675
681
|
} break;
|
676
682
|
case GGML_TYPE_Q4_K:
|
677
683
|
{
|
@@ -680,7 +686,7 @@ void ggml_metal_graph_compute(
|
|
680
686
|
|
681
687
|
nth0 = 4;
|
682
688
|
nth1 = 16;
|
683
|
-
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
|
689
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
|
684
690
|
} break;
|
685
691
|
case GGML_TYPE_Q5_K:
|
686
692
|
{
|
@@ -689,7 +695,7 @@ void ggml_metal_graph_compute(
|
|
689
695
|
|
690
696
|
nth0 = 4;
|
691
697
|
nth1 = 16;
|
692
|
-
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
|
698
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
|
693
699
|
} break;
|
694
700
|
case GGML_TYPE_Q6_K:
|
695
701
|
{
|
@@ -698,7 +704,7 @@ void ggml_metal_graph_compute(
|
|
698
704
|
|
699
705
|
nth0 = 4;
|
700
706
|
nth1 = 16;
|
701
|
-
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
|
707
|
+
[encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
|
702
708
|
} break;
|
703
709
|
default:
|
704
710
|
{
|
@@ -750,11 +756,11 @@ void ggml_metal_graph_compute(
|
|
750
756
|
case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
|
751
757
|
case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
|
752
758
|
case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
|
753
|
-
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
|
754
|
-
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
|
755
|
-
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
|
756
|
-
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
|
757
|
-
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
|
759
|
+
case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
|
760
|
+
case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
|
761
|
+
case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
|
762
|
+
case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
|
763
|
+
case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
|
758
764
|
default: GGML_ASSERT(false && "not implemented");
|
759
765
|
}
|
760
766
|
|