llama_cpp 0.5.0 → 0.5.2
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -2
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +30 -0
- data/ext/llama_cpp/src/ggml-alloc.c +101 -24
- data/ext/llama_cpp/src/ggml-cuda.cu +1094 -678
- data/ext/llama_cpp/src/ggml-metal.m +89 -23
- data/ext/llama_cpp/src/ggml-metal.metal +398 -211
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +32 -56
- data/ext/llama_cpp/src/ggml.h +1 -1
- data/ext/llama_cpp/src/k_quants.c +49 -13
- data/ext/llama_cpp/src/llama.cpp +833 -281
- data/ext/llama_cpp/src/llama.h +11 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
data/ext/llama_cpp/src/ggml-opencl.cpp
CHANGED
@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
         return;
     }
 
-    cl_mem mem = (cl_mem)tensor->data;
+    cl_mem mem = (cl_mem)tensor->extra;
     clReleaseMemObject(mem);
 }
 
@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     size_t d_size;
 
     cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
-    cl_mem d_Y = (cl_mem) src1->data; // src1 is already on device, broadcasted.
+    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
     cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
 
 
@@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->data;
+        d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             events.emplace_back();
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
         } else if (src0->backend == GGML_BACKEND_GPU) {
-            d_Q = (cl_mem) src0->data;
+            d_Q = (cl_mem) src0->extra;
         } else {
             GGML_ASSERT(false);
         }
@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
 
     CL_CHECK(clFinish(queue));
 
-    tensor->data = dst;
+    tensor->extra = dst;
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
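Together, these ggml-opencl.cpp hunks move the device buffer handle out of tensor->data and into the dedicated tensor->extra field, so data is no longer overloaded to hold a cl_mem. A minimal sketch of the resulting convention (the helper name is hypothetical; GGML_ASSERT, GGML_BACKEND_GPU, and the cl_mem cast come from the diff):

    #include <CL/cl.h>
    #include "ggml.h"

    // After this change, GPU-backed tensors carry their cl_mem handle in
    // tensor->extra; host-side tensors keep using tensor->data as before.
    static cl_mem ggml_cl_get_mem(const struct ggml_tensor * tensor) {
        GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
        return (cl_mem) tensor->extra;
    }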
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -1,4 +1,3 @@
-#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
 #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
 
 #include "ggml.h"
@@ -47,6 +46,10 @@
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
 #pragma warning(disable: 4244 4267)
+
+// disable POSIX deprecation warnigns
+// these functions are never going away, anyway
+#pragma warning(disable: 4996)
 #endif
 
 #if defined(_WIN32)
@@ -103,6 +106,9 @@ typedef void * thread_ret_t;
 #include <sys/stat.h>
 #include <unistd.h>
 
+#endif
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
 #endif
 
 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -192,9 +198,15 @@ typedef void * thread_ret_t;
 #define GGML_ALIGNED_FREE(ptr)    _aligned_free(ptr)
 #else
 inline static void * ggml_aligned_malloc(size_t size) {
+    if (size == 0) {
+        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
+        return NULL;
+    }
     void * aligned_memory = NULL;
-#ifdef GGML_USE_METAL
-    int result = posix_memalign(&aligned_memory, getpagesize(), size);
+#ifdef GGML_USE_CPU_HBM
+    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+#elif GGML_USE_METAL
+    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
 #else
     int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
 #endif
@@ -215,8 +227,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
     return aligned_memory;
 }
 #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+#ifdef GGML_USE_CPU_HBM
+#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
+#else
 #define GGML_ALIGNED_FREE(ptr) free(ptr)
 #endif
+#endif
 
 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
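With GGML_USE_CPU_HBM defined, allocation and free are now routed through libmemkind's hbwmalloc API as a matched pair. A usage sketch under that assumption (GGML_ALIGNED_MALLOC and GGML_ALIGNED_FREE are internal ggml.c macros, shown here purely for illustration):

    // Inside ggml.c, after the definitions above:
    void * buf = GGML_ALIGNED_MALLOC(1024); // hbw_posix_memalign(&p, 16, 1024) on HBM builds
    if (buf != NULL) {
        // ... use buf as aligned scratch memory ...
        GGML_ALIGNED_FREE(buf);             // hbw_free(buf) on HBM builds, free(buf) otherwise
    }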
@@ -267,7 +283,7 @@ typedef double ggml_float;
 // 16-bit float
 // on Arm, we use __fp16
 // on x86, we use uint16_t
-#ifdef __ARM_NEON
+#if defined(__ARM_NEON) && !defined(_MSC_VER)
 
 // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
 //
@@ -294,12 +310,14 @@ typedef double ggml_float;
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
 #if !defined(__riscv)
 #include <immintrin.h>
 #endif
 #endif
 #endif
 #endif
+#endif
 
 #ifdef __riscv_v_intrinsic
 #include <riscv_vector.h>
@@ -817,46 +835,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 
 #if !defined(__aarch64__)
 
-inline static uint16_t vaddvq_u8(uint8x16_t v) {
-    return
-        (uint16_t)vgetq_lane_u8(v, 0)  + (uint16_t)vgetq_lane_u8(v, 1)  +
-        (uint16_t)vgetq_lane_u8(v, 2)  + (uint16_t)vgetq_lane_u8(v, 3)  +
-        (uint16_t)vgetq_lane_u8(v, 4)  + (uint16_t)vgetq_lane_u8(v, 5)  +
-        (uint16_t)vgetq_lane_u8(v, 6)  + (uint16_t)vgetq_lane_u8(v, 7)  +
-        (uint16_t)vgetq_lane_u8(v, 8)  + (uint16_t)vgetq_lane_u8(v, 9)  +
-        (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
-        (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
-        (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
-}
-
-inline static int16_t vaddvq_s8(int8x16_t v) {
-    return
-        (int16_t)vgetq_lane_s8(v, 0)  + (int16_t)vgetq_lane_s8(v, 1)  +
-        (int16_t)vgetq_lane_s8(v, 2)  + (int16_t)vgetq_lane_s8(v, 3)  +
-        (int16_t)vgetq_lane_s8(v, 4)  + (int16_t)vgetq_lane_s8(v, 5)  +
-        (int16_t)vgetq_lane_s8(v, 6)  + (int16_t)vgetq_lane_s8(v, 7)  +
-        (int16_t)vgetq_lane_s8(v, 8)  + (int16_t)vgetq_lane_s8(v, 9)  +
-        (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
-        (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
-        (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
-}
-
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static uint32_t vaddvq_u16(uint16x8_t v) {
-    return
-        (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
-        (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
-        (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
-        (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
-}
-
 inline static int32_t vaddvq_s32(int32x4_t v) {
     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
 }
@@ -865,12 +843,6 @@ inline static float vaddvq_f32(float32x4_t v) {
     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
 }
 
-inline static float vminvq_f32(float32x4_t v) {
-    return
-        MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
 inline static float vmaxvq_f32(float32x4_t v) {
     return
         MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
@@ -4612,6 +4584,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         return NULL;
     }
 
+    // allow to call ggml_init with 0 size
+    if (params.mem_size == 0) {
+        params.mem_size = GGML_MEM_ALIGN;
+    }
+
     const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
 
     *ctx = (struct ggml_context) {
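This guard makes a zero-sized context legal by bumping mem_size up to GGML_MEM_ALIGN, which is convenient for metadata-only contexts. A minimal sketch against the public ggml API:

    #include "ggml.h"

    int main(void) {
        // After this change, mem_size 0 no longer fails outright:
        // ggml_init() rounds it up to GGML_MEM_ALIGN internally.
        struct ggml_init_params params = {
            /*.mem_size   =*/ 0,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,   // tensor metadata only, no data buffers
        };
        struct ggml_context * ctx = ggml_init(params);
        if (ctx == NULL) {
            return 1;
        }
        ggml_free(ctx);
        return 0;
    }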
@@ -4814,7 +4791,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 
     size_t obj_alloc_size = 0;
 
-    if (view_src == NULL && ctx->no_alloc == false) {
+    if (view_src == NULL && !ctx->no_alloc) {
         if (ctx->scratch.data != NULL) {
             // allocate tensor data in the scratch buffer
             if (ctx->scratch.offs + data_size > ctx->scratch.size) {
@@ -5515,7 +5492,7 @@ static struct ggml_tensor * ggml_mul_impl(
     }
 
     if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -5558,7 +5535,7 @@ static struct ggml_tensor * ggml_div_impl(
     }
 
     if (inplace) {
-        GGML_ASSERT(is_node == false);
+        GGML_ASSERT(!is_node);
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -18900,7 +18877,6 @@ static enum ggml_opt_result linesearch_backtracking(
                 // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
                 return count;
             }
-            return count;
         }
     }
 
@@ -20003,7 +19979,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
         struct ggml_tensor * data = NULL;
 
-        if (params.no_alloc == false) {
+        if (!params.no_alloc) {
             data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
 
             ok = ok && data != NULL;
@@ -20044,7 +20020,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         }
 
         // point the data member to the appropriate location in the binary blob using the tensor infos
-        if (params.no_alloc == false) {
+        if (!params.no_alloc) {
             //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
             cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
         }
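Both hunks replace awkward `== false` comparisons with plain negation on the no_alloc path, which skips creating the backing I8 blob and leaves tensor data unloaded. A sketch of metadata-only loading with the gguf API from this version (error handling trimmed for brevity):

    #include "ggml.h"

    // Inspect a GGUF file's metadata without loading tensor data.
    void inspect_gguf(const char * fname) {
        struct ggml_context * meta = NULL;
        struct gguf_init_params params = {
            /*.no_alloc =*/ true,   // take the branches above: no I8 blob, data stays unset
            /*.ctx      =*/ &meta,
        };
        struct gguf_context * gctx = gguf_init_from_file(fname, params);
        if (gctx == NULL) {
            return;
        }
        // ... query keys and tensor infos via the gguf_* accessors ...
        gguf_free(gctx);
        ggml_free(meta);
    }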
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -270,7 +270,7 @@ extern "C" {
 
 #if defined(__ARM_NEON) && defined(__CUDACC__)
 typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_NEON) && !defined(_MSC_VER)
 typedef __fp16 ggml_fp16_t;
 #else
 typedef uint16_t ggml_fp16_t;
data/ext/llama_cpp/src/k_quants.c
CHANGED
@@ -13,6 +13,26 @@
 //
 #include <arm_neon.h>
 
+#if !defined(__aarch64__)
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+    return vcombine_s16(a0, b0);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+#endif
+
 #else
 
 #ifdef __wasm_simd128__
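These shims back-fill AArch64-only horizontal intrinsics for 32-bit ARM builds of k_quants.c. A quick standalone check of the vaddvq_s16 fallback (assumes a NEON-capable toolchain; on AArch64 the native instruction is used instead):

    #include <arm_neon.h>
    #include <stdio.h>

    int main(void) {
        const int16_t data[8] = {1, 2, 3, 4, 5, 6, 7, 8};
        int16x8_t v = vld1q_s16(data);
        // vaddvq_s16 sums all eight lanes: prints 36.
        printf("%d\n", (int)vaddvq_s16(v));
        return 0;
    }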
@@ -63,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
         float ax = fabsf(x[i]);
         if (ax > amax) { amax = ax; max = x[i]; }
     }
-    if (!amax) { // all zero
+    if (amax < 1e-30f) { // all zero
         for (int i = 0; i < n; ++i) {
             L[i] = 0;
         }
@@ -183,13 +203,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
         int ntry, float alpha) {
     float min = x[0];
     float max = x[0];
-    float sum_x = 0;
-    float sum_x2 = 0;
     for (int i = 1; i < n; ++i) {
         if (x[i] < min) min = x[i];
         if (x[i] > max) max = x[i];
-        sum_x += x[i];
-        sum_x2 += x[i]*x[i];
     }
     if (max == min) {
         for (int i = 0; i < n; ++i) L[i] = 0;
@@ -1070,6 +1086,13 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
 
         }
 
+        if (!max_abs_scale) {
+            memset(&y[i], 0, sizeof(block_q6_K));
+            y[i].d = ggml_fp32_to_fp16(0.f);
+            x += QK_K;
+            continue;
+        }
+
         float iscale = -128.f/max_scale;
         y[i].d = ggml_fp32_to_fp16(1/iscale);
         for (int ib = 0; ib < QK_K/16; ++ib) {
@@ -1306,7 +1329,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
     const uint8x16_t m4 = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t  vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x2_t q2bytes;
     uint8_t aux[16];
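The recurring pattern in this and the following hunks: vzero/mzero feed only the vdotq_s32 path, so declaring them unconditionally triggers unused-variable warnings on builds without __ARM_FEATURE_DOTPROD. A sketch of the guarded idiom (a hypothetical helper, not code from the diff):

    #include <arm_neon.h>

    // Sum of pairwise int8 products, accumulated into acc.
    static int32x4_t dot_accumulate(int8x16_t a, int8x16_t b, int32x4_t acc) {
    #if defined(__ARM_FEATURE_DOTPROD)
        const int32x4_t vzero = vdupq_n_s32(0);        // only the dotprod path needs it
        return vaddq_s32(acc, vdotq_s32(vzero, a, b));
    #else
        // Fallback: widening multiply + pairwise add, no vdotq_s32 required.
        const int16x8_t p0 = vmull_s8(vget_low_s8(a),  vget_low_s8(b));
        const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
        return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
    #endif
    }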
@@ -1612,7 +1637,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q2bytes;
 
@@ -2060,7 +2087,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
     __m256 acc = _mm256_setzero_ps();
 
-    uint32_t *aux;
+    const uint32_t *aux;
 
     for (int i = 0; i < nb; ++i) {
 
@@ -2070,7 +2097,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const int8_t * restrict q8 = y[i].qs;
 
         // Set up scales
-        aux = (uint32_t *)x[i].scales;
+        aux = (const uint32_t *)x[i].scales;
         __m128i scales128 = _mm_set_epi32(
             ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
             ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
@@ -2582,7 +2609,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
         memcpy(utmp, x[i].scales, 12);
 
-        const uint32x2_t mins8 = {utmp[1] & kmask1, ((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4)};
+        uint32x2_t mins8 = { 0 };
+        mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
+        mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
+
         utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
         utmp[0] &= kmask1;
 
@@ -2596,8 +2626,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         const uint8_t * restrict q4 = x[i].qs;
         const int8_t  * restrict q8 = y[i].qs;
 
-        //int32x4_t isum = mzero;
-
         int32_t sumi1 = 0;
         int32_t sumi2 = 0;
 
@@ -3096,9 +3124,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mone = vdupq_n_u8(1);
     const uint8x16_t mtwo = vdupq_n_u8(2);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
 
@@ -3441,8 +3471,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mh = vdupq_n_u8(16);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
     uint8x16x4_t q5h;
@@ -3660,7 +3692,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
     //const int8x16_t m32s = vdupq_n_s8(32);
 
     const uint8x16_t mone = vdupq_n_u8(3);
@@ -4049,8 +4083,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
-    const int32x4_t vzero = vdupq_n_s32(0);
     const int8x16_t  m32s = vdupq_n_s8(32);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     const uint8x16_t mone = vdupq_n_u8(3);
 