llama_cpp 0.5.0 → 0.5.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -2
- data/examples/prompt_jp.txt +1 -1
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +30 -0
- data/ext/llama_cpp/src/ggml-alloc.c +101 -24
- data/ext/llama_cpp/src/ggml-cuda.cu +1094 -678
- data/ext/llama_cpp/src/ggml-metal.m +89 -23
- data/ext/llama_cpp/src/ggml-metal.metal +398 -211
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +32 -56
- data/ext/llama_cpp/src/ggml.h +1 -1
- data/ext/llama_cpp/src/k_quants.c +49 -13
- data/ext/llama_cpp/src/llama.cpp +833 -281
- data/ext/llama_cpp/src/llama.h +11 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +4 -0
- metadata +2 -2
@@ -1334,7 +1334,7 @@ void ggml_cl_free_data(const struct ggml_tensor* tensor) {
         return;
     }
 
-    cl_mem mem = (cl_mem)tensor->
+    cl_mem mem = (cl_mem)tensor->extra;
     clReleaseMemObject(mem);
 }
 
@@ -1393,7 +1393,7 @@ static void ggml_cl_mul_f32(const ggml_tensor * src0, const ggml_tensor * src1,
     size_t d_size;
 
     cl_mem d_X = ggml_cl_pool_malloc(ne0 * sizeof(float), &x_size); // src0
-    cl_mem d_Y = (cl_mem) src1->
+    cl_mem d_Y = (cl_mem) src1->extra; // src1 is already on device, broadcasted.
     cl_mem d_D = ggml_cl_pool_malloc(ne0 * sizeof(float), &d_size); // dst
 
 
@@ -1491,9 +1491,9 @@ static void ggml_cl_mul_mat_f32(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->
+        d_X = (cl_mem) src0->extra;
     } else {
-        d_X = ggml_cl_pool_malloc(sizeof(
+        d_X = ggml_cl_pool_malloc(sizeof(float) * x_ne, &x_size);
     }
     cl_mem d_Y = ggml_cl_pool_malloc(sizeof(float) * y_ne, &y_size);
     cl_mem d_D = ggml_cl_pool_malloc(sizeof(float) * d_ne, &d_size);
@@ -1567,7 +1567,7 @@ static void ggml_cl_mul_mat_f16(const ggml_tensor * src0, const ggml_tensor * sr
     size_t d_size;
     cl_mem d_X;
     if (src0->backend == GGML_BACKEND_GPU) { // NOLINT
-        d_X = (cl_mem) src0->
+        d_X = (cl_mem) src0->extra;
     } else {
         d_X = ggml_cl_pool_malloc(sizeof(ggml_fp16_t) * x_ne, &x_size);
     }
@@ -1697,7 +1697,7 @@ static void ggml_cl_mul_mat_q_f32(const ggml_tensor * src0, const ggml_tensor *
             events.emplace_back();
             CL_CHECK(ggml_cl_h2d_tensor_2d(queue, d_Q, 0, src0, i03, i02, events.data() + ev_idx++));
         } else if (src0->backend == GGML_BACKEND_GPU) {
-            d_Q = (cl_mem) src0->
+            d_Q = (cl_mem) src0->extra;
        } else {
             GGML_ASSERT(false);
         }
@@ -1860,6 +1860,6 @@ void ggml_cl_transform_tensor(void * data, ggml_tensor * tensor) {
 
     CL_CHECK(clFinish(queue));
 
-    tensor->
+    tensor->extra = dst;
     GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
 }
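Every changed line in ggml-opencl.cpp follows the same pattern: the device-side cl_mem handle is now read from and written to the tensor's extra field rather than the pointer it was stored in before. A minimal sketch of that lookup, using only the ggml_tensor fields visible in the hunks above (the helper name is hypothetical):

    #include <CL/cl.h>
    #include "ggml.h"

    // Hypothetical helper: return the device buffer that ggml_cl_transform_tensor
    // stored in tensor->extra, or NULL for tensors that live in host memory.
    static cl_mem ggml_cl_device_buffer(const struct ggml_tensor * tensor) {
        if (tensor->backend != GGML_BACKEND_GPU) {
            return NULL;
        }
        return (cl_mem) tensor->extra;
    }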
data/ext/llama_cpp/src/ggml.c
CHANGED
@@ -1,4 +1,3 @@
-#define _GNU_SOURCE // Defines CLOCK_MONOTONIC on Linux
 #define _CRT_SECURE_NO_DEPRECATE // Disables ridiculous "unsafe" warnigns on Windows
 
 #include "ggml.h"
@@ -47,6 +46,10 @@
 // disable "possible loss of data" to avoid hundreds of casts
 // we should just be careful :)
 #pragma warning(disable: 4244 4267)
+
+// disable POSIX deprecation warnigns
+// these functions are never going away, anyway
+#pragma warning(disable: 4996)
 #endif
 
 #if defined(_WIN32)
@@ -103,6 +106,9 @@ typedef void * thread_ret_t;
 #include <sys/stat.h>
 #include <unistd.h>
 
+#endif
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
 #endif
 
 // __FMA__ and __F16C__ are not defined in MSVC, however they are implied with AVX2/AVX512
@@ -192,9 +198,15 @@ typedef void * thread_ret_t;
 #define GGML_ALIGNED_FREE(ptr) _aligned_free(ptr)
 #else
 inline static void * ggml_aligned_malloc(size_t size) {
+    if (size == 0) {
+        GGML_PRINT("WARNING: Behavior may be unexpected when allocating 0 bytes for ggml_aligned_malloc!\n");
+        return NULL;
+    }
     void * aligned_memory = NULL;
-#ifdef
-    int result =
+#ifdef GGML_USE_CPU_HBM
+    int result = hbw_posix_memalign(&aligned_memory, 16, size);
+#elif GGML_USE_METAL
+    int result = posix_memalign(&aligned_memory, sysconf(_SC_PAGESIZE), size);
 #else
     int result = posix_memalign(&aligned_memory, GGML_MEM_ALIGN, size);
 #endif
@@ -215,8 +227,12 @@ inline static void * ggml_aligned_malloc(size_t size) {
     return aligned_memory;
 }
 #define GGML_ALIGNED_MALLOC(size) ggml_aligned_malloc(size)
+#ifdef GGML_USE_CPU_HBM
+#define GGML_ALIGNED_FREE(ptr) if(NULL != ptr) hbw_free(ptr)
+#else
 #define GGML_ALIGNED_FREE(ptr) free(ptr)
 #endif
+#endif
 
 #define UNUSED GGML_UNUSED
 #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
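The allocator hunks add an optional high-bandwidth-memory path behind GGML_USE_CPU_HBM (hbwmalloc from memkind) and a warning for zero-byte requests. A standalone sketch of the same selection logic, assuming only POSIX and, when enabled, hbwmalloc:

    #include <stdio.h>
    #include <stdlib.h>
    #ifdef GGML_USE_CPU_HBM
    #include <hbwmalloc.h>
    #endif

    // Sketch of the fallback chain used by ggml_aligned_malloc in this release:
    // prefer HBM when compiled in, otherwise plain posix_memalign.
    static void * aligned_malloc_sketch(size_t size) {
        if (size == 0) {
            fprintf(stderr, "warning: zero-byte allocation\n");
            return NULL;
        }
        void * ptr = NULL;
    #ifdef GGML_USE_CPU_HBM
        int rc = hbw_posix_memalign(&ptr, 16, size);
    #else
        int rc = posix_memalign(&ptr, 16, size);
    #endif
        return rc == 0 ? ptr : NULL;
    }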
@@ -267,7 +283,7 @@ typedef double ggml_float;
 // 16-bit float
 // on Arm, we use __fp16
 // on x86, we use uint16_t
-#
+#if defined(__ARM_NEON) && !defined(_MSC_VER)
 
 // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
 //
@@ -294,12 +310,14 @@ typedef double ggml_float;
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <intrin.h>
 #else
+#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
 #if !defined(__riscv)
 #include <immintrin.h>
 #endif
 #endif
 #endif
 #endif
+#endif
 
 #ifdef __riscv_v_intrinsic
 #include <riscv_vector.h>
@@ -817,46 +835,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 
 #if !defined(__aarch64__)
 
-inline static uint16_t vaddvq_u8(uint8x16_t v) {
-    return
-        (uint16_t)vgetq_lane_u8(v, 0) + (uint16_t)vgetq_lane_u8(v, 1) +
-        (uint16_t)vgetq_lane_u8(v, 2) + (uint16_t)vgetq_lane_u8(v, 3) +
-        (uint16_t)vgetq_lane_u8(v, 4) + (uint16_t)vgetq_lane_u8(v, 5) +
-        (uint16_t)vgetq_lane_u8(v, 6) + (uint16_t)vgetq_lane_u8(v, 7) +
-        (uint16_t)vgetq_lane_u8(v, 8) + (uint16_t)vgetq_lane_u8(v, 9) +
-        (uint16_t)vgetq_lane_u8(v, 10) + (uint16_t)vgetq_lane_u8(v, 11) +
-        (uint16_t)vgetq_lane_u8(v, 12) + (uint16_t)vgetq_lane_u8(v, 13) +
-        (uint16_t)vgetq_lane_u8(v, 14) + (uint16_t)vgetq_lane_u8(v, 15);
-}
-
-inline static int16_t vaddvq_s8(int8x16_t v) {
-    return
-        (int16_t)vgetq_lane_s8(v, 0) + (int16_t)vgetq_lane_s8(v, 1) +
-        (int16_t)vgetq_lane_s8(v, 2) + (int16_t)vgetq_lane_s8(v, 3) +
-        (int16_t)vgetq_lane_s8(v, 4) + (int16_t)vgetq_lane_s8(v, 5) +
-        (int16_t)vgetq_lane_s8(v, 6) + (int16_t)vgetq_lane_s8(v, 7) +
-        (int16_t)vgetq_lane_s8(v, 8) + (int16_t)vgetq_lane_s8(v, 9) +
-        (int16_t)vgetq_lane_s8(v, 10) + (int16_t)vgetq_lane_s8(v, 11) +
-        (int16_t)vgetq_lane_s8(v, 12) + (int16_t)vgetq_lane_s8(v, 13) +
-        (int16_t)vgetq_lane_s8(v, 14) + (int16_t)vgetq_lane_s8(v, 15);
-}
-
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static uint32_t vaddvq_u16(uint16x8_t v) {
-    return
-        (uint32_t)vgetq_lane_u16(v, 0) + (uint32_t)vgetq_lane_u16(v, 1) +
-        (uint32_t)vgetq_lane_u16(v, 2) + (uint32_t)vgetq_lane_u16(v, 3) +
-        (uint32_t)vgetq_lane_u16(v, 4) + (uint32_t)vgetq_lane_u16(v, 5) +
-        (uint32_t)vgetq_lane_u16(v, 6) + (uint32_t)vgetq_lane_u16(v, 7);
-}
-
 inline static int32_t vaddvq_s32(int32x4_t v) {
     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
 }
@@ -865,12 +843,6 @@ inline static float vaddvq_f32(float32x4_t v) {
     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
 }
 
-inline static float vminvq_f32(float32x4_t v) {
-    return
-        MIN(MIN(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MIN(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
 inline static float vmaxvq_f32(float32x4_t v) {
     return
         MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
@@ -4612,6 +4584,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
         return NULL;
     }
 
+    // allow to call ggml_init with 0 size
+    if (params.mem_size == 0) {
+        params.mem_size = GGML_MEM_ALIGN;
+    }
+
     const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
 
     *ctx = (struct ggml_context) {
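With this change ggml_init accepts a zero mem_size: per the added comment, the request is bumped to GGML_MEM_ALIGN internally, which is convenient for contexts that only carry metadata. A minimal usage sketch against the public ggml API:

    #include "ggml.h"

    int main(void) {
        // As of this version a zero-size context is promoted to GGML_MEM_ALIGN
        // internally instead of tripping the allocator.
        struct ggml_init_params params = {
            /*.mem_size   =*/ 0,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,
        };
        struct ggml_context * ctx = ggml_init(params);
        if (ctx == NULL) {
            return 1;
        }
        ggml_free(ctx);
        return 0;
    }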
@@ -4814,7 +4791,7 @@ static struct ggml_tensor * ggml_new_tensor_impl(
 
     size_t obj_alloc_size = 0;
 
-    if (view_src == NULL && ctx->no_alloc
+    if (view_src == NULL && !ctx->no_alloc) {
         if (ctx->scratch.data != NULL) {
             // allocate tensor data in the scratch buffer
             if (ctx->scratch.offs + data_size > ctx->scratch.size) {
@@ -5515,7 +5492,7 @@ static struct ggml_tensor * ggml_mul_impl(
     }
 
     if (inplace) {
-        GGML_ASSERT(is_node
+        GGML_ASSERT(!is_node);
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -5558,7 +5535,7 @@ static struct ggml_tensor * ggml_div_impl(
     }
 
     if (inplace) {
-        GGML_ASSERT(is_node
+        GGML_ASSERT(!is_node);
     }
 
     struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
@@ -18900,7 +18877,6 @@ static enum ggml_opt_result linesearch_backtracking(
                 // strong Wolfe condition (GGML_LINESEARCH_BACKTRACKING_STRONG_WOLFE)
                 return count;
             }
-            return count;
         }
     }
 
@@ -20003,7 +19979,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
 
         struct ggml_tensor * data = NULL;
 
-        if (params.no_alloc
+        if (!params.no_alloc) {
             data = ggml_new_tensor_1d(ctx_data, GGML_TYPE_I8, ctx->size);
 
             ok = ok && data != NULL;
@@ -20044,7 +20020,7 @@ struct gguf_context * gguf_init_from_file(const char * fname, struct gguf_init_p
         }
 
         // point the data member to the appropriate location in the binary blob using the tensor infos
-        if (params.no_alloc
+        if (!params.no_alloc) {
            //cur->data = (char *) data->data + ctx->infos[i].offset - ctx->offset; // offset from start of file
            cur->data = (char *) data->data + ctx->infos[i].offset; // offset from data
         }
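The last two hunks fix an inverted boolean: tensor data is allocated and pointed into the blob only when no_alloc is false. A sketch of what that flag means on the caller side, assuming the gguf API declared in this version's ggml.h (field names as in gguf_init_params):

    #include <stdio.h>
    #include "ggml.h"

    // Sketch: read only the GGUF metadata. With no_alloc set, the tensors
    // created in meta_ctx describe shapes and types but carry no data.
    static int inspect_gguf(const char * fname) {
        struct ggml_context * meta_ctx = NULL;
        struct gguf_init_params params = {
            /*.no_alloc =*/ true,
            /*.ctx      =*/ &meta_ctx,
        };
        struct gguf_context * gctx = gguf_init_from_file(fname, params);
        if (gctx == NULL) {
            return 1;
        }
        printf("tensors: %d\n", gguf_get_n_tensors(gctx));
        gguf_free(gctx);
        ggml_free(meta_ctx);
        return 0;
    }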
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -270,7 +270,7 @@ extern "C" {
 
 #if defined(__ARM_NEON) && defined(__CUDACC__)
     typedef half ggml_fp16_t;
-#elif defined(__ARM_NEON)
+#elif defined(__ARM_NEON) && !defined(_MSC_VER)
     typedef __fp16 ggml_fp16_t;
 #else
     typedef uint16_t ggml_fp16_t;
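The header change stops assuming __fp16 is available whenever __ARM_NEON is defined; MSVC targeting ARM now falls back to the uint16_t representation. Code that goes through the conversion helpers is unaffected by which typedef gets picked, as in this minimal sketch (ggml_fp32_to_fp16 and ggml_fp16_to_fp32 are the existing ggml.h helpers):

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        // Round-trips through half precision regardless of whether ggml_fp16_t
        // is __fp16 (GCC/Clang on ARM) or uint16_t (MSVC, x86).
        ggml_fp16_t h = ggml_fp32_to_fp16(3.14159f);
        printf("%f\n", ggml_fp16_to_fp32(h));
        return 0;
    }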
data/ext/llama_cpp/src/k_quants.c
CHANGED
@@ -13,6 +13,26 @@
 //
 #include <arm_neon.h>
 
+#if !defined(__aarch64__)
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+    return vcombine_s16(a0, b0);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+#endif
+
 #else
 
 #ifdef __wasm_simd128__
@@ -63,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
         float ax = fabsf(x[i]);
         if (ax > amax) { amax = ax; max = x[i]; }
     }
-    if (
+    if (amax < 1e-30f) { // all zero
         for (int i = 0; i < n; ++i) {
             L[i] = 0;
         }
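The new condition handles blocks whose weights are all (near) zero: amax stays 0, so a 1/amax-style scale would blow up; instead the quantizer emits zero codes and a zero scale. A minimal numeric sketch of the guard, independent of the k-quants block layout:

    #include <math.h>
    #include <stdint.h>

    // Sketch: map n floats to codes in [0, 2*nmax-1] with one scale per block,
    // returning a 0.f scale for an all-zero block instead of dividing by ~0.
    static float quantize_block_sketch(int n, int nmax, const float * x, int8_t * L) {
        float amax = 0.f, max = 0.f;
        for (int i = 0; i < n; ++i) {
            float ax = fabsf(x[i]);
            if (ax > amax) { amax = ax; max = x[i]; }
        }
        if (amax < 1e-30f) { // all zero
            for (int i = 0; i < n; ++i) L[i] = 0;
            return 0.f;
        }
        const float iscale = -nmax / max;
        for (int i = 0; i < n; ++i) {
            int l = (int) roundf(iscale * x[i]);
            l = l < -nmax ? -nmax : (l > nmax - 1 ? nmax - 1 : l);
            L[i] = (int8_t)(l + nmax);
        }
        return 1.f / iscale;
    }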
@@ -183,13 +203,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
         int ntry, float alpha) {
     float min = x[0];
     float max = x[0];
-    float sum_x = 0;
-    float sum_x2 = 0;
     for (int i = 1; i < n; ++i) {
         if (x[i] < min) min = x[i];
         if (x[i] > max) max = x[i];
-        sum_x += x[i];
-        sum_x2 += x[i]*x[i];
     }
     if (max == min) {
         for (int i = 0; i < n; ++i) L[i] = 0;
@@ -1070,6 +1086,13 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
 
         }
 
+        if (!max_abs_scale) {
+            memset(&y[i], 0, sizeof(block_q6_K));
+            y[i].d = ggml_fp32_to_fp16(0.f);
+            x += QK_K;
+            continue;
+        }
+
         float iscale = -128.f/max_scale;
         y[i].d = ggml_fp32_to_fp16(1/iscale);
         for (int ib = 0; ib < QK_K/16; ++ib) {
@@ -1306,7 +1329,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
     const uint8x16_t m4 = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x2_t q2bytes;
     uint8_t aux[16];
@@ -1612,7 +1637,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q2bytes;
 
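Several of the NEON kernels below get the same treatment: the zero accumulator (vzero/mzero) is only consumed by vdotq_s32, so its declaration is now wrapped in #if defined(__ARM_FEATURE_DOTPROD) to avoid an unused-variable warning on targets without the dot-product extension. A minimal sketch of the pattern, assuming an AArch64 target (so vaddvq_s32 is available):

    #include <arm_neon.h>

    // Sketch: horizontal dot product of two s8 vectors, via the SDOT extension
    // when present and a widening multiply-add fallback otherwise.
    static int32_t dot_s8_sketch(int8x16_t a, int8x16_t b) {
    #if defined(__ARM_FEATURE_DOTPROD)
        const int32x4_t vzero = vdupq_n_s32(0); // only needed on this path
        return vaddvq_s32(vdotq_s32(vzero, a, b));
    #else
        const int16x8_t p0 = vmull_s8(vget_low_s8(a),  vget_low_s8(b));
        const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
        return vaddvq_s32(vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
    #endif
    }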
@@ -2060,7 +2087,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
     __m256 acc = _mm256_setzero_ps();
 
-    uint32_t *aux;
+    const uint32_t *aux;
 
     for (int i = 0; i < nb; ++i) {
 
@@ -2070,7 +2097,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const int8_t * restrict q8 = y[i].qs;
 
         // Set up scales
-        aux = (uint32_t *)x[i].scales;
+        aux = (const uint32_t *)x[i].scales;
         __m128i scales128 = _mm_set_epi32(
             ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
             ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
@@ -2582,7 +2609,10 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
 
         memcpy(utmp, x[i].scales, 12);
 
-
+        uint32x2_t mins8 = { 0 };
+        mins8 = vset_lane_u32(utmp[1] & kmask1, mins8, 0);
+        mins8 = vset_lane_u32(((utmp[2] >> 4) & kmask2) | (((utmp[1] >> 6) & kmask3) << 4), mins8, 1);
+
         utmp[1] = (utmp[2] & kmask2) | (((utmp[0] >> 6) & kmask3) << 4);
         utmp[0] &= kmask1;
 
@@ -2596,8 +2626,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         const uint8_t * restrict q4 = x[i].qs;
         const int8_t * restrict q8 = y[i].qs;
 
-        //int32x4_t isum = mzero;
-
         int32_t sumi1 = 0;
         int32_t sumi2 = 0;
 
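The q4_K hunk builds the mins vector with explicit vset_lane_u32 calls on a zero-initialized uint32x2_t rather than a brace initializer with runtime expressions, which not every compiler accepts for NEON vector types. A minimal sketch of the same construction, with w0/w1 standing in for the decoded scale/min words (hypothetical inputs):

    #include <arm_neon.h>
    #include <stdint.h>

    // Sketch: pack two runtime 32-bit words into a vector lane by lane, then
    // view them as eight bytes, as the q4_K kernel does for its mins.
    static uint8x8_t pack_words_sketch(uint32_t w0, uint32_t w1) {
        uint32x2_t v = vdup_n_u32(0);
        v = vset_lane_u32(w0, v, 0);
        v = vset_lane_u32(w1, v, 1);
        return vreinterpret_u8_u32(v);
    }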
@@ -3096,9 +3124,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mone = vdupq_n_u8(1);
     const uint8x16_t mtwo = vdupq_n_u8(2);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
 
@@ -3441,8 +3471,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mh = vdupq_n_u8(16);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
     uint8x16x4_t q5h;
@@ -3660,7 +3692,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
     //const int8x16_t m32s = vdupq_n_s8(32);
 
     const uint8x16_t mone = vdupq_n_u8(3);
@@ -4049,8 +4083,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
-    const int32x4_t vzero = vdupq_n_s32(0);
     const int8x16_t m32s = vdupq_n_s8(32);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     const uint8x16_t mone = vdupq_n_u8(3);
 