llama_cpp 0.12.5 → 0.12.6
This diff shows the changes between two publicly released versions of the package as they appear in their public registry; it is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +7 -0
- data/ext/llama_cpp/llama_cpp.cpp +46 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +9 -1
- data/vendor/tmp/llama.cpp/ggml-alloc.c +563 -490
- data/vendor/tmp/llama.cpp/ggml-alloc.h +39 -65
- data/vendor/tmp/llama.cpp/ggml-backend.c +250 -262
- data/vendor/tmp/llama.cpp/ggml-backend.h +8 -12
- data/vendor/tmp/llama.cpp/ggml-metal.m +2 -0
- data/vendor/tmp/llama.cpp/ggml-quants.c +347 -40
- data/vendor/tmp/llama.cpp/ggml-quants.h +14 -14
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +14 -61
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +89 -6
- data/vendor/tmp/llama.cpp/ggml.c +134 -60
- data/vendor/tmp/llama.cpp/ggml.h +26 -6
- data/vendor/tmp/llama.cpp/llama.cpp +654 -130
- data/vendor/tmp/llama.cpp/llama.h +6 -0
- data/vendor/tmp/llama.cpp/unicode.h +42 -30
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-sycl.cpp

@@ -11578,11 +11578,8 @@ static dpct::err0 ggml_sycl_cpy_tensor_2d(void *dst,
     }
     char * dst_ptr = (char *) dst;
 
-
-
-    const int64_t nb1 = src->nb[1];
-    const int64_t nb2 = src->nb[2];
-    const int64_t nb3 = src->nb[3];
+    GGML_TENSOR_LOCALS_1(int64_t, ne, src, ne);
+    GGML_TENSOR_LOCALS(int64_t, nb, src, nb);
     const enum ggml_type type = src->type;
     const int64_t ts = ggml_type_size(type);
     const int64_t bs = ggml_blck_size(type);
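This and the following SYCL hunks replace hand-written per-dimension locals with the GGML_TENSOR_LOCALS helper macros. As a rough, simplified sketch of what those helpers look like (the authoritative definitions live in data/vendor/tmp/llama.cpp/ggml.h and may differ in detail):

```cpp
// Simplified sketch of the ggml.h helpers -- not the verbatim macro text.
#define GGML_UNUSED(x) (void)(x)

#define GGML_TENSOR_LOCALS_1(type, prefix, pointer, array) \
    const type prefix##0 = (pointer)->array[0]; GGML_UNUSED(prefix##0);

#define GGML_TENSOR_LOCALS(type, prefix, pointer, array)                \
    const type prefix##0 = (pointer)->array[0]; GGML_UNUSED(prefix##0); \
    const type prefix##1 = (pointer)->array[1]; GGML_UNUSED(prefix##1); \
    const type prefix##2 = (pointer)->array[2]; GGML_UNUSED(prefix##2); \
    const type prefix##3 = (pointer)->array[3]; GGML_UNUSED(prefix##3);

// GGML_TENSOR_LOCALS(int64_t, nb, src, nb) thus declares nb0..nb3 from src->nb[0..3];
// GGML_TENSOR_LOCALS_1 declares only the first dimension, and the _2/_3 variants stop
// at the second/third, which is what GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne) in
// the alibi hunk below relies on.
```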
@@ -12426,9 +12423,7 @@ inline void ggml_sycl_op_alibi(const ggml_tensor *src0, const ggml_tensor *src1,
     GGML_ASSERT(src0->type == GGML_TYPE_F32);
     GGML_ASSERT( dst->type == GGML_TYPE_F32);
 
-
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
+    GGML_TENSOR_LOCALS_3(int64_t, ne0, src0, ne);
     const int64_t nrows = ggml_nrows(src0);
 
     //const int n_past = ((int32_t *) dst->op_params)[0];
@@ -12758,15 +12753,9 @@ static void ggml_sycl_op_mul_mat(const ggml_tensor *src0,
                                  ggml_sycl_op_mul_mat_t op,
                                  const bool convert_src1_to_q8_1) try {
 
-
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
 
-
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
     const int64_t nrows1 = ggml_nrows(src1);
 
     GGML_ASSERT(ne03 == ne13);
@@ -13337,23 +13326,13 @@ static void ggml_sycl_mul_mat_mat_batched_sycl(const ggml_tensor *src0,
     GGML_ASSERT(src0->type == GGML_TYPE_F16);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
-
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-    const int64_t ne03 = src0->ne[3];
+    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne);
 
-
-    const int64_t nb02 = src0->nb[2]; GGML_UNUSED(nb02);
-    const int64_t nb03 = src0->nb[3]; GGML_UNUSED(nb03);
+    GGML_TENSOR_LOCALS(int64_t, nb0, src0, nb);
 
-
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
 
-
-    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
-    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
+    GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
 
     const int64_t ne1 = ggml_nelements(src1);
     const int64_t ne = ggml_nelements(dst);
@@ -13655,23 +13634,15 @@ static void ggml_sycl_mul_mat_id_sycl(ggml_tensor * dst) {
     GGML_ASSERT(src00->backend != GGML_BACKEND_GPU_SPLIT);
     GGML_ASSERT(src1->type == GGML_TYPE_F32);
 
-
-    const int64_t ne01 = src00->ne[1];
-    const int64_t ne02 = src00->ne[2];
-    const int64_t ne03 = src00->ne[3];
+    GGML_TENSOR_LOCALS(int64_t, ne0, src00, ne);
 
     //const int64_t nb01 = src00->nb[1];
-
-    const int64_t nb03 = src00->nb[3]; GGML_UNUSED(nb03);
+    GGML_TENSOR_LOCALS(int64_t, nb0, src00, nb);
 
-
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-    const int64_t ne13 = src1->ne[3];
+    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne);
 
+    GGML_TENSOR_LOCALS(int64_t, nb1, src1, nb);
     //const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2]; GGML_UNUSED(nb12);
-    const int64_t nb13 = src1->nb[3]; GGML_UNUSED(nb13);
 
     const int64_t ne1 = ggml_nelements(src1);
     const int64_t ne = ggml_nelements(dst);
@@ -13940,25 +13911,7 @@ static void ggml_sycl_cpy(const ggml_tensor *src0, const ggml_tensor *src1,
     GGML_ASSERT(ggml_nbytes(src0) <= INT_MAX);
     GGML_ASSERT(ggml_nbytes(src1) <= INT_MAX);
 
-
-    const int64_t ne01 = src0->ne[1];
-    const int64_t ne02 = src0->ne[2];
-
-
-    const int64_t nb00 = src0->nb[0];
-    const int64_t nb01 = src0->nb[1];
-    const int64_t nb02 = src0->nb[2];
-    const int64_t nb03 = src0->nb[3];
-
-    const int64_t ne10 = src1->ne[0];
-    const int64_t ne11 = src1->ne[1];
-    const int64_t ne12 = src1->ne[2];
-
-
-    const int64_t nb10 = src1->nb[0];
-    const int64_t nb11 = src1->nb[1];
-    const int64_t nb12 = src1->nb[2];
-    const int64_t nb13 = src1->nb[3];
+    GGML_TENSOR_BINARY_OP_LOCALS;
 
     SYCL_CHECK(ggml_sycl_set_device(g_main_device));
     dpct::queue_ptr main_stream = g_syclStreams[g_main_device_index][0];
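The ggml_sycl_cpy hunk uses the broader GGML_TENSOR_BINARY_OP_LOCALS convenience macro, which bundles the per-tensor pattern from the earlier sketch for the conventional src0/src1/dst names. Again as an approximate sketch, assuming the GGML_TENSOR_LOCALS definition shown above (see ggml.h for the exact definition):

```cpp
// Approximate shape of the binary-op bundle in ggml.h (a sketch, not the verbatim definition).
#define GGML_TENSOR_BINARY_OP_LOCALS           \
    GGML_TENSOR_LOCALS(int64_t, ne0, src0, ne) \
    GGML_TENSOR_LOCALS(size_t,  nb0, src0, nb) \
    GGML_TENSOR_LOCALS(int64_t, ne1, src1, ne) \
    GGML_TENSOR_LOCALS(size_t,  nb1, src1, nb) \
    GGML_TENSOR_LOCALS(int64_t, ne,  dst,  ne) \
    GGML_TENSOR_LOCALS(size_t,  nb,  dst,  nb)

// Expanded inside ggml_sycl_cpy it yields ne00..ne03 / nb00..nb03 for src0 and
// ne10..ne13 / nb10..nb13 for src1 (plus dst locals), replacing the block of
// individual declarations deleted above.
```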
data/vendor/tmp/llama.cpp/ggml-vulkan.cpp

@@ -27,6 +27,7 @@
 #define CEIL_DIV(M, N) (((M) + (N)-1) / (N))
 
 #define VK_VENDOR_ID_AMD 0x1002
+#define VK_VENDOR_ID_APPLE 0x106b
 #define VK_VENDOR_ID_INTEL 0x8086
 #define VK_VENDOR_ID_NVIDIA 0x10de
 
@@ -2034,18 +2035,100 @@ static uint32_t ggml_vk_guess_matmul_pipeline_align(ggml_backend_vk_context * ct
     return ctx->pipeline_matmul_f32_aligned_l.align;
 }
 
+static vk_pipeline* ggml_vk_guess_matmul_pipeline_amd(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
+    if (bit16_x && bit16_y) {
+        if (m <= 32 || n <= 32) {
+#ifdef GGML_VULKAN_DEBUG
+            std::cerr << " S" << std::endl;
+#endif
+            return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
+        }
+#ifdef GGML_VULKAN_DEBUG
+        std::cerr << " M" << std::endl;
+#endif
+        return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
+    }
+    if (bit16_x && !bit16_y) {
+        if (m <= 32 || n <= 32) {
+#ifdef GGML_VULKAN_DEBUG
+            std::cerr << " S" << std::endl;
+#endif
+            return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
+        }
+#ifdef GGML_VULKAN_DEBUG
+        std::cerr << " M" << std::endl;
+#endif
+        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
+    }
+    if (!bit16_x && bit16_y) {
+        GGML_ASSERT(false);
+    }
+
+    if (m <= 32 || n <= 32) {
+#ifdef GGML_VULKAN_DEBUG
+        std::cerr << " S" << std::endl;
+#endif
+        return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
+    }
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << " M" << std::endl;
+#endif
+    return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
+}
+
+static vk_pipeline* ggml_vk_guess_matmul_pipeline_apple(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << " M" << std::endl;
+#endif
+    if (bit16_x && bit16_y) {
+        return aligned ? &ctx->pipeline_matmul_f16_aligned_m : &ctx->pipeline_matmul_f16_m;
+    }
+    if (bit16_x && !bit16_y) {
+        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_m : &ctx->pipeline_matmul_f16_f32_m;
+    }
+    if (!bit16_x && bit16_y) {
+        GGML_ASSERT(false);
+    }
+    return aligned ? &ctx->pipeline_matmul_f32_aligned_m : &ctx->pipeline_matmul_f32_m;
+}
+
+static vk_pipeline* ggml_vk_guess_matmul_pipeline_intel(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, bool aligned) {
+#ifdef GGML_VULKAN_DEBUG
+    std::cerr << " S" << std::endl;
+#endif
+    if (bit16_x && bit16_y) {
+        return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
+    }
+    if (bit16_x && !bit16_y) {
+        return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
+    }
+    if (!bit16_x && bit16_y) {
+        GGML_ASSERT(false);
+    }
+    return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
+}
+
 static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx, bool bit16_x, bool bit16_y, int m, int n, bool aligned) {
 #ifdef GGML_VULKAN_DEBUG
     std::cerr << "ggml_vk_guess_matmul_pipeline(" << bit16_x << ", " << bit16_y << ", " << m << ", " << n << ", " << aligned << ")";
 #endif
+    switch (ctx->device.lock()->vendor_id) {
+    case VK_VENDOR_ID_AMD:
+        return ggml_vk_guess_matmul_pipeline_amd(ctx, bit16_x, bit16_y, m, n, aligned);
+    case VK_VENDOR_ID_APPLE:
+        return ggml_vk_guess_matmul_pipeline_apple(ctx, bit16_x, bit16_y, aligned);
+    case VK_VENDOR_ID_INTEL:
+        return ggml_vk_guess_matmul_pipeline_intel(ctx, bit16_x, bit16_y, aligned);
+    }
+
     if (bit16_x && bit16_y) {
-        if (
+        if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
             std::cerr << " S" << std::endl;
 #endif
             return aligned ? &ctx->pipeline_matmul_f16_aligned_s : &ctx->pipeline_matmul_f16_s;
         }
-        if (
+        if (m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
             std::cerr << " M" << std::endl;
 #endif
@@ -2057,13 +2140,13 @@ static vk_pipeline* ggml_vk_guess_matmul_pipeline(ggml_backend_vk_context * ctx,
         return aligned ? &ctx->pipeline_matmul_f16_aligned_l : &ctx->pipeline_matmul_f16_l;
     }
     if (bit16_x && !bit16_y) {
-        if (
+        if (m <= 32 || n <= 32) {
 #ifdef GGML_VULKAN_DEBUG
             std::cerr << " S" << std::endl;
 #endif
             return aligned ? &ctx->pipeline_matmul_f16_f32_aligned_s : &ctx->pipeline_matmul_f16_f32_s;
         }
-        if (
+        if (m <= 64 || n <= 64) {
 #ifdef GGML_VULKAN_DEBUG
             std::cerr << " M" << std::endl;
 #endif
|
|
2078
2161
|
GGML_ASSERT(false);
|
2079
2162
|
}
|
2080
2163
|
|
2081
|
-
if (
|
2164
|
+
if (m <= 32 || n <= 32) {
|
2082
2165
|
#ifdef GGML_VULKAN_DEBUG
|
2083
2166
|
std::cerr << " S" << std::endl;
|
2084
2167
|
#endif
|
2085
2168
|
return aligned ? &ctx->pipeline_matmul_f32_aligned_s : &ctx->pipeline_matmul_f32_s;
|
2086
2169
|
}
|
2087
|
-
if (
|
2170
|
+
if (m <= 64 || n <= 64) {
|
2088
2171
|
#ifdef GGML_VULKAN_DEBUG
|
2089
2172
|
std::cerr << " M" << std::endl;
|
2090
2173
|
#endif
|
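Taken together, the ggml-vulkan.cpp hunks route matmul pipeline selection through per-vendor helpers (AMD, Apple, Intel) keyed on the device's vendor_id, while the generic fallback now picks its S/M/L shader variant purely from the matrix dimensions. The size-class thresholds those paths encode boil down to the following standalone sketch; the enum and function names here are hypothetical, and the real code returns vk_pipeline pointers out of ggml_backend_vk_context as shown in the hunks above:

```cpp
// Illustrative sketch of the S/M/L size-class thresholds used above (hypothetical names).
enum class mm_size { S, M, L };

static mm_size pick_matmul_size(int m, int n) {
    if (m <= 32 || n <= 32) { return mm_size::S; }  // small shader variant for skinny matrices
    if (m <= 64 || n <= 64) { return mm_size::M; }  // medium variant
    return mm_size::L;                              // large variant for big GEMMs
}
```

Per the new helpers, Apple devices always receive the M variant and Intel the S variant regardless of m and n, AMD keeps the dimension-based choice, and other vendors (e.g. NVIDIA) fall through the switch to the generic path.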