llama_cpp 0.2.2 → 0.3.1
This diff compares two publicly released versions of the package as they appear in their public registry. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +34 -0
- data/README.md +39 -6
- data/examples/chat.rb +2 -1
- data/examples/embedding.rb +3 -2
- data/ext/llama_cpp/extconf.rb +13 -0
- data/ext/llama_cpp/llama_cpp.cpp +305 -133
- data/ext/llama_cpp/src/ggml-cuda.cu +367 -69
- data/ext/llama_cpp/src/ggml-cuda.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +36 -30
- data/ext/llama_cpp/src/ggml-metal.metal +328 -84
- data/ext/llama_cpp/src/ggml-opencl.cpp +352 -175
- data/ext/llama_cpp/src/ggml.c +800 -303
- data/ext/llama_cpp/src/ggml.h +68 -5
- data/ext/llama_cpp/src/k_quants.c +1712 -56
- data/ext/llama_cpp/src/k_quants.h +41 -6
- data/ext/llama_cpp/src/llama-util.h +19 -5
- data/ext/llama_cpp/src/llama.cpp +262 -291
- data/ext/llama_cpp/src/llama.h +49 -11
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +0 -2
- data/sig/llama_cpp.rbs +14 -17
- metadata +2 -3
- data/lib/llama_cpp/client.rb +0 -172
```diff
--- a/data/ext/llama_cpp/src/ggml-cuda.h
+++ b/data/ext/llama_cpp/src/ggml-cuda.h
@@ -29,6 +29,7 @@ void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor);
 void ggml_cuda_free_data(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers(struct ggml_tensor * tensor);
 void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor);
+void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor);
 void ggml_cuda_set_main_device(int main_device);
 void ggml_cuda_set_scratch_size(size_t scratch_size);
 void ggml_cuda_free_scratch(void);
```
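The single addition to ggml-cuda.h is a new offload helper, `ggml_cuda_assign_buffers_force_inplace`. As a rough sketch of how this family of calls is used (the graph-building fragment below is illustrative and not taken from this package): `ggml_cuda_assign_buffers` tags a node for placement in the CUDA scratch buffer while the graph is being built, while the new `_force_inplace` variant would be used where a node must write into its source's device buffer, such as a KV-cache-style update.

```c
#include "ggml.h"
#include "ggml-cuda.h"

/* Hypothetical graph-building fragment; tensor names are illustrative. */
static void offload_nodes(struct ggml_context * ctx0,
                          struct ggml_tensor  * w,
                          struct ggml_tensor  * x,
                          struct ggml_tensor  * k_cur,
                          struct ggml_tensor  * k_view) {
    /* Regular node: result goes into a scratch-backed device buffer. */
    struct ggml_tensor * cur = ggml_mul_mat(ctx0, w, x);
    ggml_cuda_assign_buffers(cur);

    /* A copy that must overwrite its destination in place on device:
       the new variant forces in-place placement instead of a fresh
       buffer. */
    struct ggml_tensor * cpy = ggml_cpy(ctx0, k_cur, k_view);
    ggml_cuda_assign_buffers_force_inplace(cpy);
}
```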
```diff
--- a/data/ext/llama_cpp/src/ggml-metal.m
+++ b/data/ext/llama_cpp/src/ggml-metal.m
@@ -51,21 +51,21 @@ struct ggml_metal_context {
     GGML_METAL_DECL_KERNEL(get_rows_f16);
     GGML_METAL_DECL_KERNEL(get_rows_q4_0);
     GGML_METAL_DECL_KERNEL(get_rows_q4_1);
-    GGML_METAL_DECL_KERNEL(get_rows_q2_k);
-    GGML_METAL_DECL_KERNEL(get_rows_q3_k);
-    GGML_METAL_DECL_KERNEL(get_rows_q4_k);
-    GGML_METAL_DECL_KERNEL(get_rows_q5_k);
-    GGML_METAL_DECL_KERNEL(get_rows_q6_k);
+    GGML_METAL_DECL_KERNEL(get_rows_q2_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q3_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q4_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q5_K);
+    GGML_METAL_DECL_KERNEL(get_rows_q6_K);
     GGML_METAL_DECL_KERNEL(rms_norm);
     GGML_METAL_DECL_KERNEL(norm);
     GGML_METAL_DECL_KERNEL(mul_mat_f16_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_DECL_KERNEL(mul_mat_q4_1_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q2_k_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q3_k_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q4_k_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q5_k_f32);
-    GGML_METAL_DECL_KERNEL(mul_mat_q6_k_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q2_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q3_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q4_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q5_K_f32);
+    GGML_METAL_DECL_KERNEL(mul_mat_q6_K_f32);
     GGML_METAL_DECL_KERNEL(rope);
     GGML_METAL_DECL_KERNEL(alibi_f32);
     GGML_METAL_DECL_KERNEL(cpy_f32_f16);
```
```diff
@@ -132,7 +132,13 @@ struct ggml_metal_context * ggml_metal_init(void) {
         exit(1);
     }
 
+#ifdef GGML_QKK_64
+        MTLCompileOptions* options = [MTLCompileOptions new];
+        options.preprocessorMacros = @{ @"QK_K" : @(64) };
+        ctx->library = [ctx->device newLibraryWithSource:src options:options error:&error];
+#else
         ctx->library = [ctx->device newLibraryWithSource:src options:nil error:&error];
+#endif
     if (error) {
         fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
         exit(1);
```
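The new `GGML_QKK_64` branch compiles the runtime-generated Metal library with the preprocessor macro `QK_K` pinned to 64 (the k-quant super-block size) instead of the default 256, via `MTLCompileOptions.preprocessorMacros`. A self-contained sketch of the same mechanism, using a placeholder shader rather than the real ggml source:

```objc
#import <Metal/Metal.h>

int main(void) {
    id<MTLDevice> device = MTLCreateSystemDefaultDevice();

    // Placeholder shader: compiles only if QK_K was injected as 64.
    NSString * src = @"#if !defined(QK_K) || QK_K != 64\n"
                      "#error QK_K must be 64\n"
                      "#endif\n"
                      "kernel void noop(void) {}\n";

    // Inject the macro into the source compiled at runtime.
    MTLCompileOptions * options = [MTLCompileOptions new];
    options.preprocessorMacros = @{ @"QK_K" : @(64) };

    NSError * error = nil;
    id<MTLLibrary> library = [device newLibraryWithSource:src
                                                  options:options
                                                    error:&error];
    return (library != nil && error == nil) ? 0 : 1;
}
```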
```diff
@@ -159,21 +165,21 @@ struct ggml_metal_context * ggml_metal_init(void) {
     GGML_METAL_ADD_KERNEL(get_rows_f16);
     GGML_METAL_ADD_KERNEL(get_rows_q4_0);
     GGML_METAL_ADD_KERNEL(get_rows_q4_1);
-    GGML_METAL_ADD_KERNEL(get_rows_q2_k);
-    GGML_METAL_ADD_KERNEL(get_rows_q3_k);
-    GGML_METAL_ADD_KERNEL(get_rows_q4_k);
-    GGML_METAL_ADD_KERNEL(get_rows_q5_k);
-    GGML_METAL_ADD_KERNEL(get_rows_q6_k);
+    GGML_METAL_ADD_KERNEL(get_rows_q2_K);
+    GGML_METAL_ADD_KERNEL(get_rows_q3_K);
+    GGML_METAL_ADD_KERNEL(get_rows_q4_K);
+    GGML_METAL_ADD_KERNEL(get_rows_q5_K);
+    GGML_METAL_ADD_KERNEL(get_rows_q6_K);
     GGML_METAL_ADD_KERNEL(rms_norm);
     GGML_METAL_ADD_KERNEL(norm);
     GGML_METAL_ADD_KERNEL(mul_mat_f16_f32);
     GGML_METAL_ADD_KERNEL(mul_mat_q4_0_f32);
     GGML_METAL_ADD_KERNEL(mul_mat_q4_1_f32);
-    GGML_METAL_ADD_KERNEL(mul_mat_q2_k_f32);
-    GGML_METAL_ADD_KERNEL(mul_mat_q3_k_f32);
-    GGML_METAL_ADD_KERNEL(mul_mat_q4_k_f32);
-    GGML_METAL_ADD_KERNEL(mul_mat_q5_k_f32);
-    GGML_METAL_ADD_KERNEL(mul_mat_q6_k_f32);
+    GGML_METAL_ADD_KERNEL(mul_mat_q2_K_f32);
+    GGML_METAL_ADD_KERNEL(mul_mat_q3_K_f32);
+    GGML_METAL_ADD_KERNEL(mul_mat_q4_K_f32);
+    GGML_METAL_ADD_KERNEL(mul_mat_q5_K_f32);
+    GGML_METAL_ADD_KERNEL(mul_mat_q6_K_f32);
     GGML_METAL_ADD_KERNEL(rope);
     GGML_METAL_ADD_KERNEL(alibi_f32);
     GGML_METAL_ADD_KERNEL(cpy_f32_f16);
```
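For context, `GGML_METAL_DECL_KERNEL` and `GGML_METAL_ADD_KERNEL` in ggml-metal.m expand along the lines of the sketch below (paraphrased, not part of this diff): the first declares `function_<name>`/`pipeline_<name>` members on the context, the second looks up `kernel_<name>` in the compiled library and builds its pipeline state. Renaming `q2_k` to `q2_K` therefore renames both the Metal function that gets looked up and the `ctx->pipeline_...` fields used in the hunks below.

```objc
// Rough paraphrase of the two macros, not the verbatim definitions:
#define GGML_METAL_DECL_KERNEL(name) \
    id<MTLFunction>             function_##name; \
    id<MTLComputePipelineState> pipeline_##name

#define GGML_METAL_ADD_KERNEL(name) \
    ctx->function_##name = [ctx->library newFunctionWithName:@"kernel_"#name]; \
    ctx->pipeline_##name = [ctx->device newComputePipelineStateWithFunction:ctx->function_##name \
                                                                      error:&error]
```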
```diff
@@ -662,7 +668,7 @@ void ggml_metal_graph_compute(
 
                                 nth0 = 4;
                                 nth1 = 16;
-                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_k_f32];
+                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_q2_K_f32];
                             } break;
                         case GGML_TYPE_Q3_K:
                             {
@@ -671,7 +677,7 @@ void ggml_metal_graph_compute(
 
                                 nth0 = 4;
                                 nth1 = 16;
-                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_k_f32];
+                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_q3_K_f32];
                             } break;
                         case GGML_TYPE_Q4_K:
                             {
@@ -680,7 +686,7 @@ void ggml_metal_graph_compute(
 
                                 nth0 = 4;
                                 nth1 = 16;
-                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_k_f32];
+                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_q4_K_f32];
                             } break;
                         case GGML_TYPE_Q5_K:
                             {
@@ -689,7 +695,7 @@ void ggml_metal_graph_compute(
 
                                 nth0 = 4;
                                 nth1 = 16;
-                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_k_f32];
+                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_q5_K_f32];
                             } break;
                         case GGML_TYPE_Q6_K:
                             {
@@ -698,7 +704,7 @@ void ggml_metal_graph_compute(
 
                                 nth0 = 4;
                                 nth1 = 16;
-                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_k_f32];
+                                [encoder setComputePipelineState:ctx->pipeline_mul_mat_q6_K_f32];
                             } break;
                         default:
                             {
```
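Each of the k-quant cases above only swaps the pipeline name; `nth0`/`nth1` keep their values and presumably still become the threads-per-threadgroup dimensions when the node is dispatched further down in `ggml_metal_graph_compute`, roughly as sketched here (the threadgroup-count expression is illustrative and differs per quant type in the real file):

```objc
// Illustrative dispatch: nth0 x nth1 threads per threadgroup, with
// enough threadgroups to cover the ne01 rows of the weight matrix.
[encoder dispatchThreadgroups:MTLSizeMake((ne01 + nth0 - 1) / nth0, ne11, 1)
        threadsPerThreadgroup:MTLSizeMake(nth0, nth1, 1)];
```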
```diff
@@ -750,11 +756,11 @@ void ggml_metal_graph_compute(
                     case GGML_TYPE_F16: [encoder setComputePipelineState:ctx->pipeline_get_rows_f16]; break;
                     case GGML_TYPE_Q4_0: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_0]; break;
                     case GGML_TYPE_Q4_1: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_1]; break;
-                    case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_k]; break;
-                    case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_k]; break;
-                    case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_k]; break;
-                    case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_k]; break;
-                    case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_k]; break;
+                    case GGML_TYPE_Q2_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q2_K]; break;
+                    case GGML_TYPE_Q3_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q3_K]; break;
+                    case GGML_TYPE_Q4_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q4_K]; break;
+                    case GGML_TYPE_Q5_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q5_K]; break;
+                    case GGML_TYPE_Q6_K: [encoder setComputePipelineState:ctx->pipeline_get_rows_q6_K]; break;
                     default: GGML_ASSERT(false && "not implemented");
                 }
 
```