llama_cpp 0.3.1 → 0.3.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +121 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +451 -101
- data/ext/llama_cpp/src/ggml-cuda.h +0 -4
- data/ext/llama_cpp/src/ggml-metal.m +3 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +11 -7
- data/ext/llama_cpp/src/ggml.c +690 -1512
- data/ext/llama_cpp/src/ggml.h +88 -62
- data/ext/llama_cpp/src/llama.cpp +103 -39
- data/ext/llama_cpp/src/llama.h +15 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +15 -12
- data/sig/llama_cpp.rbs +19 -1
- metadata +3 -2
data/ext/llama_cpp/src/ggml-cuda.h
@@ -8,10 +8,6 @@ extern "C" {
 
 #define GGML_CUDA_MAX_DEVICES 16
 
-struct ggml_tensor_extra_gpu {
-    void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
-};
-
 void ggml_init_cublas(void);
 void ggml_cuda_set_tensor_split(const float * tensor_split);
 
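The four removed lines drop struct ggml_tensor_extra_gpu from the public ggml-cuda.h interface; given the large rewrite of ggml-cuda.cu in this release (+451/-101), the type presumably lives on there as an implementation detail. As a reminder of what the field encodes, here is a hedged C++ sketch of a per-device pointer table for a tensor whose data is split across GPUs; the names and the cleanup loop are illustrative, not ggml's code:

// Illustrative sketch only: a split tensor keeps one device allocation per GPU,
// so tearing it down means freeing every non-null slot, not a single pointer.
#define MAX_DEVICES 16   // stands in for GGML_CUDA_MAX_DEVICES

struct tensor_extra_gpu_sketch {
    void * data_device[MAX_DEVICES]; // slot i holds device i's slice (or nullptr)
};

// device_free stands in for cudaFree(); the per-slot loop is the point.
static void free_split_tensor(tensor_extra_gpu_sketch * extra,
                              void (*device_free)(void *)) {
    for (int i = 0; i < MAX_DEVICES; ++i) {
        if (extra->data_device[i] != nullptr) {
            device_free(extra->data_device[i]);
            extra->data_device[i] = nullptr;
        }
    }
}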
data/ext/llama_cpp/src/ggml-metal.m
@@ -202,7 +202,9 @@ struct ggml_metal_context * ggml_metal_init(void) {
 
 void ggml_metal_free(struct ggml_metal_context * ctx) {
     fprintf(stderr, "%s: deallocating\n", __func__);
-
+    for (int i = 0; i < ctx->n_buffers; ++i) {
+        [ctx->buffers[i].metal release];
+    }
     free(ctx);
 }
 
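This hunk plugs a leak in ggml_metal_free: the context was freed without first releasing the Metal buffers it had retained. The same release-each-element-then-free-the-container pattern, as a hedged plain-C++ sketch (struct layout and names are illustrative, not the Objective-C code above):

// Illustrative sketch: a context owns n_buffers allocations; freeing only the
// context struct would leak each of them, so every buffer is released first.
#include <cstdlib>

struct buffer_sketch { void * data; };

struct context_sketch {
    int           n_buffers;
    buffer_sketch buffers[8];
};

static void context_free(context_sketch * ctx) {
    for (int i = 0; i < ctx->n_buffers; ++i) {
        std::free(ctx->buffers[i].data);   // mirrors [ctx->buffers[i].metal release]
        ctx->buffers[i].data = nullptr;
    }
    std::free(ctx);                        // only now is the container itself freed
}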
data/ext/llama_cpp/src/ggml-opencl.cpp
@@ -653,13 +653,17 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
     const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
     const int in = tid - step*im; // 0...15 or 0...7
 
-#if K_QUANTS_PER_ITERATION == 1
+\n#if K_QUANTS_PER_ITERATION == 1\n
     const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
     const int is = 0;
-
+
+\n#else\n
+
     const int l0 = 4 * in; // 0, 4, 8, ..., 28
     const int is = in / 4;
-
+
+\n#endif\n
+
     const int ql_offset = 64*im + l0;
     const int qh_offset = 32*im + l0;
     const int s_offset = 8*im + is;
@@ -676,7 +680,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
 
     const float d = vload_half(0, &x[i].d);
 
-#if K_QUANTS_PER_ITERATION == 1
+\n#if K_QUANTS_PER_ITERATION == 1\n
     float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
              + y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
              + y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
@@ -686,7 +690,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
              + y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
              +y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
     tmp[16 * ix + tid] += sum;
-#else
+\n#else\n
     float sum = 0;
     for (int l = 0; l < 4; ++l) {
         sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
@@ -695,7 +699,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
              + y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
     }
     tmp[16 * ix + tid] += sum;
-#endif
+\n#endif\n
 
 }
 
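Every ggml-opencl.cpp hunk above makes the same kind of change: the preprocessor directives inside the dequantize_mul_mat_vec_q6_K kernel string gain explicit \n escapes. A plausible reading (hedged, since the code that assembles the program string is not part of this diff) is that the kernel text can end up concatenated without real line breaks, and the OpenCL compiler needs #if / #else / #endif to start on lines of their own, which the embedded "\n" guarantees. A minimal C++ sketch of the failure mode and the fix, using made-up fragments rather than ggml's actual build code:

// Minimal sketch (not ggml's build machinery): kernel text pieced together on
// one line breaks preprocessor directives; embedding "\n" keeps each directive
// on its own line in the source handed to the OpenCL compiler.
#include <cstdio>
#include <string>

int main() {
    // Broken: "#if" lands in the middle of a line of kernel source.
    std::string broken =
        std::string("const int in = tid - step*im; ") +
        "#if K_QUANTS_PER_ITERATION == 1 " +
        "const int l0 = K_QUANTS_PER_ITERATION*in; " +
        "#endif";

    // Fixed: explicit newlines around each directive, as in the hunks above.
    std::string fixed =
        std::string("const int in = tid - step*im; ") +
        "\n#if K_QUANTS_PER_ITERATION == 1\n" +
        "const int l0 = K_QUANTS_PER_ITERATION*in; " +
        "\n#endif\n";

    std::printf("broken:\n%s\n\nfixed:\n%s\n", broken.c_str(), fixed.c_str());
    return 0;
}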
data/ext/llama_cpp/src/ggml-opencl.cpp
@@ -1376,7 +1380,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     const int64_t ne00 = src0->ne[0];
     const int64_t ne01 = src0->ne[1];
     const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[2];
+    const int64_t ne03 = src0->ne[3];
     const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
     const int64_t ne10 = src1->ne[0];
     const int64_t ne11 = src1->ne[1];
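The final hunk fixes ggml_cl_mul_f32 so that ne03 is read from src0->ne[3]; the line it replaces pulled ne03 from the wrong element of src0->ne, which makes the total element count computed on the next line, ne0 = ne00 * ne01 * ne02 * ne03, wrong for genuinely four-dimensional inputs. A small hedged C++ sketch of the arithmetic, with a made-up shape:

// Illustrative only: shows how reusing ne[2] for ne03 miscounts elements
// whenever the 4th dimension differs from the 3rd.
#include <cstdint>
#include <cstdio>

int main() {
    int64_t ne[4] = {4096, 32, 8, 2};   // hypothetical 4-D tensor shape

    int64_t wrong = ne[0] * ne[1] * ne[2] * ne[2]; // ne03 taken from ne[2]
    int64_t right = ne[0] * ne[1] * ne[2] * ne[3]; // ne03 taken from ne[3]

    std::printf("wrong element count: %lld\n", (long long) wrong);  // 8388608
    std::printf("right element count: %lld\n", (long long) right);  // 2097152
    return 0;
}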