llama_cpp 0.12.5 → 0.12.6
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +46 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -1
- data/vendor/tmp/llama.cpp/ggml-alloc.c +563 -490
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +250 -262
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-metal.m +2 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +347 -40
- data/vendor/tmp/llama.cpp/ggml-quants.h +14 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +14 -61
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +89 -6
- data/vendor/tmp/llama.cpp/ggml.c +134 -60
- data/vendor/tmp/llama.cpp/ggml.h +26 -6
- data/vendor/tmp/llama.cpp/llama.cpp +654 -130
- data/vendor/tmp/llama.cpp/llama.h +6 -0
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-sycl.cpp

@@ -11578,11 +11578,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
     }
     char * dst_ptr = (char *) dst;

-    const int64_t ne0 = src->ne[0];
-    const int64_t nb0 = src->nb[0];
-    const int64_t nb1 = src->nb[1];
-    const int64_t nb2 = src->nb[2];
-    const int64_t nb3 = src->nb[3];
+    GGML_TENSOR_LOCALS_1(int64_t, ne, src, ne);
+    GGML_TENSOR_LOCALS(int64_t, nb, src, nb);
     const enum ggml_type type = src->type;
     const int64_t ts = ggml_type_size(type);
     const int64_t bs = ggml_blck_size(type);
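Note: this and the following ggml-sycl.cpp hunks replace hand-written shape/stride locals with the GGML_TENSOR_LOCALS* helpers from ggml.h. As a rough sketch (ggml.h is authoritative), GGML_TENSOR_LOCALS_1 declares only the first dimension, GGML_TENSOR_LOCALS_3 the first three, and GGML_TENSOR_LOCALS all four, each followed by GGML_UNUSED so unreferenced dimensions do not warn:

    // Illustrative sketch only; see ggml.h in the vendored sources for the
    // authoritative macro definitions.
    #include "ggml.h"

    static void example_tensor_locals(const struct ggml_tensor * src) {
        GGML_TENSOR_LOCALS_1(int64_t, ne, src, ne); // ~ const int64_t ne0 = src->ne[0];
        GGML_TENSOR_LOCALS(int64_t, nb, src, nb);   // ~ const int64_t nb0..nb3 = src->nb[0..3];
        // Each declaration carries its own GGML_UNUSED(...), which is why the
        // SYCL code can also drop its manual GGML_UNUSED(nbXX) calls below.
    }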
@@ -12426,9 +12423,7 @@ inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);

-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
+    GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
     const int64_t nrows = ggml_nrows(src0);

     //const int n_past = ((int32_t *) dst->op_params)[0];
@@ -12758,15 +12753,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
                                  ggml_sycl_op_mul_mat_t op,
                                  const bool convert_src1_to_q8_1) try {

-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);

-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
     const int64_t nrows1 = ggml_nrows(src1);

     GGML_ASSERT(ne03 == ne13);
@@ -13337,23 +13326,13 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);

-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);

-
-    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
-    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
+    GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb);

-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);

-
-    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
-    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+    GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);

     const int64_t ne1 = ggml_nelements(src1);
     const int64_t ne = ggml_nelements(dst);
@@ -13655,23 +13634,15 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
     GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);

-    const int64_t ne00 = src00->ne[0];
-    const int64_t ne01 = src00->ne[1];
-    const int64_t ne02 = src00->ne[2];
-    const int64_t ne03 = src00->ne[3];
+    GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);

     //const int64_t nb01 = src00->nb[1];
-
-    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
+    GGML_TENSOR_LOCALS(int64_t, nb0, src00, nb);

-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);

+    GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
     //const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
-    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);

     const int64_t ne1 = ggml_nelements(src1);
     const int64_t ne = ggml_nelements(dst);
@@ -13940,25 +13911,7 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
     GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
     GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);

-    const int64_t ne00 = src0->ne[0];
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-
-    const int64_t nb00 = src0->nb[0];
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2];
-    const int64_t nb03 = src0->nb[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-
-
-    const int64_t nb10 = src1->nb[0];
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2];
-    const int64_t nb13 = src1->nb[3];
+    GGML_TENSOR_BINARY_OP_LOCALS;

     SYCL_CHECK(ggml_sycl_set_device(g_main_device));
     dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
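GGML_TENSOR_BINARY_OP_LOCALS goes a step further: a single line stands in for the whole block of src0/src1 shape and stride locals deleted in the hunk above (and, assuming the definition in ggml.h, locals for dst as well). A minimal sketch of its use:

    // Minimal sketch, assuming ggml.h's GGML_TENSOR_BINARY_OP_LOCALS; the names in
    // the comments mirror the hand-written locals removed in the hunk above.
    #include "ggml.h"

    static void example_binary_op_locals(const struct ggml_tensor * src0,
                                         const struct ggml_tensor * src1,
                                         struct ggml_tensor * dst) {
        GGML_TENSOR_BINARY_OP_LOCALS;   // ne00..ne03 / nb00..nb03 for src0,
                                        // ne10..ne13 / nb10..nb13 for src1,
                                        // ne0..ne3   / nb0..nb3   for dst
    }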
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp

@@ -27,6 +27,7 @@
 #define CEIL_DIV(M, N) (((M) + (N)-1) / (N))

 #define VK_VENDOR_ID_AMD 0x1002
+#define VK_VENDOR_ID_APPLE 0x106b
 #define VK_VENDOR_ID_INTEL 0x8086
 #define VK_VENDOR_ID_NVIDIA 0x10de

@@ -2034,18 +2035,100 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ctx
     return ctx->pipeline_matmul_f32_aligned_l.align;
 }

+static vk_pipeline* ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
+    if (bit16_x && bit16_y) {
+        if (m <= 32 || n <= 32) {
+#ifdef GGML_VULKAN_DEBUG
+            std::cerr << " S" << std::endl;
+#endif
+            return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
+        }
+#ifdef GGML_VULKAN_DEBUG
+        std::cerr << " M" << std::endl;
+#endif
+        return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
+    }
+    if (bit16_x && !bit16_y) {
+        if (m <= 32 || n <= 32) {
+#ifdef GGML_VULKAN_DEBUG
+            std::cerr << " S" << std::endl;
+#endif
+            return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
+        }
+#ifdef GGML_VULKAN_DEBUG
+        std::cerr << " M" << std::endl;
+#endif
+        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
+    }
+    if (!bit16_x && bit16_y) {
+        GGML_ASSERT(false);
+    }
+
+    if (m <= 32 || n <= 32) {
+#ifdef GGML_VULKAN_DEBUG
+        std::cerr << " S" << std::endl;
+#endif
+        return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
+    }
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << " M" << std::endl;
+#endif
+    return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
+}
+
+static vk_pipeline* ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << " M" << std::endl;
+#endif
+    if (bit16_x && bit16_y) {
+        return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
+    }
+    if (bit16_x && !bit16_y) {
+        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
+    }
+    if (!bit16_x && bit16_y) {
+        GGML_ASSERT(false);
+    }
+    return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
+}
+
+static vk_pipeline* ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << " S" << std::endl;
+#endif
+    if (bit16_x && bit16_y) {
+        return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
+    }
+    if (bit16_x && !bit16_y) {
+        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
+    }
+    if (!bit16_x && bit16_y) {
+        GGML_ASSERT(false);
+    }
+    return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
+}
+
 static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
 #endif
+    switch (ctx->device.lock()->vendor_id) {
+    case VK_VENDOR_ID_AMD:
+        return ggml_vk_guess_matmul_pipeline_amd(ctx, bit16_x, bit16_y, m, n, aligned);
+    case VK_VENDOR_ID_APPLE:
+        return ggml_vk_guess_matmul_pipeline_apple(ctx, bit16_x, bit16_y, aligned);
+    case VK_VENDOR_ID_INTEL:
+        return ggml_vk_guess_matmul_pipeline_intel(ctx, bit16_x, bit16_y, aligned);
+    }
+
     if (bit16_x && bit16_y) {
-        if (
+        if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
             std::cerr << " S" << std::endl;
 #endif
             return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
         }
-        if (
+        if (m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
             std::cerr << " M" << std::endl;
 #endif
@@ -2057,13 +2140,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
         return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l;
     }
     if (bit16_x && !bit16_y) {
-        if (
+        if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
             std::cerr << " S" << std::endl;
 #endif
             return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
         }
-        if (
+        if (m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
             std::cerr << " M" << std::endl;
 #endif
@@ -2078,13 +2161,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
         GGML_ASSERT(false);
     }

-    if (
+    if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
         std::cerr << " S" << std::endl;
 #endif
         return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
     }
-    if (
+    if (m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
         std::cerr << " M" << std::endl;
 #endif
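For orientation, the Vulkan changes amount to vendor-specific shader-size selection: Apple GPUs always get the medium ("M") matmul pipelines, Intel always the small ("S") ones, AMD keeps an m/n threshold but never picks the large ("L") variants, and NVIDIA or unrecognized vendors fall through the new switch to the original three-way heuristic. A condensed, hypothetical summary (the enum and function names here are invented for illustration):

    // Hypothetical condensed view of the new selection logic; the real code is the
    // set of ggml_vk_guess_matmul_pipeline_* functions in the hunks above.
    #include <cstdint>

    #define VK_VENDOR_ID_AMD   0x1002
    #define VK_VENDOR_ID_APPLE 0x106b
    #define VK_VENDOR_ID_INTEL 0x8086

    enum class MatmulSize { S, M, L };

    static MatmulSize sketch_pick_matmul_size(uint32_t vendor_id, int m, int n) {
        switch (vendor_id) {
        case VK_VENDOR_ID_APPLE: return MatmulSize::M;  // always the medium pipelines
        case VK_VENDOR_ID_INTEL: return MatmulSize::S;  // always the small pipelines
        case VK_VENDOR_ID_AMD:                          // small or medium, never large
            return (m <= 32 || n <= 32) ? MatmulSize::S : MatmulSize::M;
        }
        // NVIDIA and unknown vendors keep the original size-based heuristic.
        if (m <= 32 || n <= 32) { return MatmulSize::S; }
        if (m <= 64 || n <= 64) { return MatmulSize::M; }
        return MatmulSize::L;
    }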