llama_cpp 0.3.1 → 0.3.2
This diff shows the changes between two publicly available versions of the package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +28 -0
- data/README.md +9 -0
- data/examples/chat.rb +1 -1
- data/examples/embedding.rb +1 -1
- data/examples/prompt_jp.txt +8 -0
- data/ext/llama_cpp/extconf.rb +2 -2
- data/ext/llama_cpp/llama_cpp.cpp +121 -1
- data/ext/llama_cpp/src/ggml-cuda.cu +451 -101
- data/ext/llama_cpp/src/ggml-cuda.h +0 -4
- data/ext/llama_cpp/src/ggml-metal.m +3 -1
- data/ext/llama_cpp/src/ggml-opencl.cpp +11 -7
- data/ext/llama_cpp/src/ggml.c +690 -1512
- data/ext/llama_cpp/src/ggml.h +88 -62
- data/ext/llama_cpp/src/llama.cpp +103 -39
- data/ext/llama_cpp/src/llama.h +15 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +15 -12
- data/sig/llama_cpp.rbs +19 -1
- metadata +3 -2
@@ -8,10 +8,6 @@ extern "C" {
|
|
8
8
|
|
9
9
|
#define GGML_CUDA_MAX_DEVICES 16
|
10
10
|
|
11
|
-
struct ggml_tensor_extra_gpu {
|
12
|
-
void * data_device[GGML_CUDA_MAX_DEVICES]; // 1 pointer for each device for split tensors
|
13
|
-
};
|
14
|
-
|
15
11
|
void ggml_init_cublas(void);
|
16
12
|
void ggml_cuda_set_tensor_split(const float * tensor_split);
|
17
13
|
|
@@ -202,7 +202,9 @@ struct ggml_metal_context * ggml_metal_init(void) {
|
|
202
202
|
|
203
203
|
void ggml_metal_free(struct ggml_metal_context * ctx) {
|
204
204
|
fprintf(stderr, "%s: deallocating\n", __func__);
|
205
|
-
|
205
|
+
for (int i = 0; i < ctx->n_buffers; ++i) {
|
206
|
+
[ctx->buffers[i].metal release];
|
207
|
+
}
|
206
208
|
free(ctx);
|
207
209
|
}
|
208
210
|
|
@@ -653,13 +653,17 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
|
|
653
653
|
const int im = tid/step; // 0 or 1. 0 computes 0..., 1 computes 128...
|
654
654
|
const int in = tid - step*im; // 0...15 or 0...7
|
655
655
|
|
656
|
-
#if K_QUANTS_PER_ITERATION == 1
|
656
|
+
\n#if K_QUANTS_PER_ITERATION == 1\n
|
657
657
|
const int l0 = K_QUANTS_PER_ITERATION*in; // 0...15
|
658
658
|
const int is = 0;
|
659
|
-
|
659
|
+
|
660
|
+
\n#else\n
|
661
|
+
|
660
662
|
const int l0 = 4 * in; // 0, 4, 8, ..., 28
|
661
663
|
const int is = in / 4;
|
662
|
-
|
664
|
+
|
665
|
+
\n#endif\n
|
666
|
+
|
663
667
|
const int ql_offset = 64*im + l0;
|
664
668
|
const int qh_offset = 32*im + l0;
|
665
669
|
const int s_offset = 8*im + is;
|
@@ -676,7 +680,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
|
|
676
680
|
|
677
681
|
const float d = vload_half(0, &x[i].d);
|
678
682
|
|
679
|
-
#if K_QUANTS_PER_ITERATION == 1
|
683
|
+
\n#if K_QUANTS_PER_ITERATION == 1\n
|
680
684
|
float sum = y[ 0] * s[0] * d * ((int8_t)((ql[ 0] & 0xF) | ((qh[ 0] & 0x03) << 4)) - 32)
|
681
685
|
+ y[16] * s[1] * d * ((int8_t)((ql[16] & 0xF) | ((qh[16] & 0x03) << 4)) - 32)
|
682
686
|
+ y[32] * s[2] * d * ((int8_t)((ql[32] & 0xF) | ((qh[ 0] & 0x0c) << 2)) - 32)
|
@@ -686,7 +690,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
|
|
686
690
|
+ y[96] * s[6] * d * ((int8_t)((ql[32] >> 4) | ((qh[ 0] & 0xc0) >> 2)) - 32)
|
687
691
|
+y[112] * s[7] * d * ((int8_t)((ql[48] >> 4) | ((qh[16] & 0xc0) >> 2)) - 32);
|
688
692
|
tmp[16 * ix + tid] += sum;
|
689
|
-
#else
|
693
|
+
\n#else\n
|
690
694
|
float sum = 0;
|
691
695
|
for (int l = 0; l < 4; ++l) {
|
692
696
|
sum += y[l+ 0] * s[0] * d * ((int8_t)((ql[l+ 0] & 0xF) | (((qh[l] >> 0) & 3) << 4)) - 32)
|
@@ -695,7 +699,7 @@ __kernel void dequantize_mul_mat_vec_q6_K(__global const struct block_q6_K * xx,
|
|
695
699
|
+ y[l+96] * s[6] * d * ((int8_t)((ql[l+32] >> 4) | (((qh[l] >> 6) & 3) << 4)) - 32);
|
696
700
|
}
|
697
701
|
tmp[16 * ix + tid] += sum;
|
698
|
-
#endif
|
702
|
+
\n#endif\n
|
699
703
|
|
700
704
|
}
|
701
705
|
|
@@ -1376,7 +1380,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
|
|
1376
1380
|
const int64_t ne00 = src0->ne[0];
|
1377
1381
|
const int64_t ne01 = src0->ne[1];
|
1378
1382
|
const int64_t ne02 = src0->ne[2];
|
1379
|
-
const int64_t ne03 = src0->ne[
|
1383
|
+
const int64_t ne03 = src0->ne[3];
|
1380
1384
|
const int64_t ne0 = ne00 * ne01 * ne02 * ne03;
|
1381
1385
|
const int64_t ne10 = src1->ne[0];
|
1382
1386
|
const int64_t ne11 = src1->ne[1];
|