llama_cpp 0.14.6 → 0.14.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/ext/llama_cpp/llama_cpp.cpp +37 -2
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +4 -0
- data/vendor/tmp/llama.cpp/Makefile +11 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.c +8 -8
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -10
- data/vendor/tmp/llama.cpp/ggml-impl.h +262 -4
- data/vendor/tmp/llama.cpp/ggml-quants.c +0 -293
- data/vendor/tmp/llama.cpp/ggml.c +3 -17
- data/vendor/tmp/llama.cpp/llama.cpp +379 -66
- data/vendor/tmp/llama.cpp/llama.h +19 -6
- data/vendor/tmp/llama.cpp/sgemm.cpp +404 -553
- metadata +2 -2
data/vendor/tmp/llama.cpp/ggml-quants.c
CHANGED
@@ -14,47 +14,6 @@
 #include <stdlib.h> // for qsort
 #include <stdio.h>  // for GGML_ASSERT
 
-#ifdef __ARM_NEON
-
-// if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
-//
-//   $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
-//
-#include <arm_neon.h>
-
-#else
-
-#ifdef __wasm_simd128__
-#include <wasm_simd128.h>
-#else
-#if defined(__POWER9_VECTOR__) || defined(__powerpc64__)
-#include <altivec.h>
-#undef bool
-#define bool _Bool
-#else
-#if defined(_MSC_VER) || defined(__MINGW32__)
-#include <intrin.h>
-#else
-#if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
-#if !defined(__riscv)
-#include <immintrin.h>
-#endif
-#endif
-#endif
-#endif
-#endif
-#endif
-
-#ifdef __riscv_v_intrinsic
-#include <riscv_vector.h>
-#endif
-
-#undef MIN
-#undef MAX
-
-#define MIN(a, b) ((a) < (b) ? (a) : (b))
-#define MAX(a, b) ((a) > (b) ? (a) : (b))
-
 #define UNUSED GGML_UNUSED
 
 // some compilers don't provide _mm256_set_m128i, e.g. gcc 7
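Note: the block removed above is ggml's per-architecture intrinsics include ladder (plus the MIN/MAX helpers). It is not dropped but centralized, consistent with the +262 lines ggml-impl.h gains in this release. The following standalone C sketch (not gem code) mirrors the ladder's decision order and prints which header would be selected for the current compiler:

    #include <stdio.h>

    int main(void) {
        // same decision order as the removed ladder: NEON first, then WASM,
        // POWER, MSVC/MinGW, and finally AVX/SSE (skipped on RISC-V)
    #if defined(__ARM_NEON)
        puts("arm_neon.h");
    #elif defined(__wasm_simd128__)
        puts("wasm_simd128.h");
    #elif defined(__POWER9_VECTOR__) || defined(__powerpc64__)
        puts("altivec.h");
    #elif defined(_MSC_VER) || defined(__MINGW32__)
        puts("intrin.h");
    #elif (defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || \
           defined(__SSSE3__) || defined(__SSE3__)) && !defined(__riscv)
        puts("immintrin.h");
    #else
        puts("no SIMD header");
    #endif
        return 0;
    }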
@@ -276,258 +235,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
 #endif // __AVX__ || __AVX2__ || __AVX512F__
 #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)
 
-#if defined(__ARM_NEON)
-
-#ifdef _MSC_VER
-
-#define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
-
-#else
-
-#define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
-
-#endif
-
-#if !defined(__aarch64__)
-
-// 64-bit compatibility
-
-// vaddvq_s16
-// vpaddq_s16
-// vpaddq_s32
-// vaddvq_s32
-// vaddvq_f32
-// vmaxvq_f32
-// vcvtnq_s32_f32
-// vzip1_u8
-// vzip2_u8
-
-inline static int32_t vaddvq_s16(int16x8_t v) {
-    return
-        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
-}
-
-inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
-    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
-    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
-    return vcombine_s16(a0, b0);
-}
-
-inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
-    int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
-    int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
-    return vcombine_s32(a0, b0);
-}
-
-inline static int32_t vaddvq_s32(int32x4_t v) {
-    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
-}
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-inline static float vmaxvq_f32(float32x4_t v) {
-    return
-        MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-            MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
-}
-
-inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
-    int32x4_t res;
-
-    res[0] = roundf(vgetq_lane_f32(v, 0));
-    res[1] = roundf(vgetq_lane_f32(v, 1));
-    res[2] = roundf(vgetq_lane_f32(v, 2));
-    res[3] = roundf(vgetq_lane_f32(v, 3));
-
-    return res;
-}
-
-inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[0]; res[1] = b[0];
-    res[2] = a[1]; res[3] = b[1];
-    res[4] = a[2]; res[5] = b[2];
-    res[6] = a[3]; res[7] = b[3];
-
-    return res;
-}
-
-inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
-    uint8x8_t res;
-
-    res[0] = a[4]; res[1] = b[4];
-    res[2] = a[5]; res[3] = b[5];
-    res[4] = a[6]; res[5] = b[6];
-    res[6] = a[7]; res[7] = b[7];
-
-    return res;
-}
-
-// vld1q_s16_x2
-// vld1q_u8_x2
-// vld1q_u8_x4
-// vld1q_s8_x2
-// vld1q_s8_x4
-// TODO: double-check these work correctly
-
-typedef struct ggml_int16x8x2_t {
-    int16x8_t val[2];
-} ggml_int16x8x2_t;
-
-inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
-    ggml_int16x8x2_t res;
-
-    res.val[0] = vld1q_s16(ptr + 0);
-    res.val[1] = vld1q_s16(ptr + 8);
-
-    return res;
-}
-
-typedef struct ggml_uint8x16x2_t {
-    uint8x16_t val[2];
-} ggml_uint8x16x2_t;
-
-inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
-    ggml_uint8x16x2_t res;
-
-    res.val[0] = vld1q_u8(ptr + 0);
-    res.val[1] = vld1q_u8(ptr + 16);
-
-    return res;
-}
-
-typedef struct ggml_uint8x16x4_t {
-    uint8x16_t val[4];
-} ggml_uint8x16x4_t;
-
-inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
-    ggml_uint8x16x4_t res;
-
-    res.val[0] = vld1q_u8(ptr + 0);
-    res.val[1] = vld1q_u8(ptr + 16);
-    res.val[2] = vld1q_u8(ptr + 32);
-    res.val[3] = vld1q_u8(ptr + 48);
-
-    return res;
-}
-
-typedef struct ggml_int8x16x2_t {
-    int8x16_t val[2];
-} ggml_int8x16x2_t;
-
-inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
-    ggml_int8x16x2_t res;
-
-    res.val[0] = vld1q_s8(ptr + 0);
-    res.val[1] = vld1q_s8(ptr + 16);
-
-    return res;
-}
-
-typedef struct ggml_int8x16x4_t {
-    int8x16_t val[4];
-} ggml_int8x16x4_t;
-
-inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
-    ggml_int8x16x4_t res;
-
-    res.val[0] = vld1q_s8(ptr + 0);
-    res.val[1] = vld1q_s8(ptr + 16);
-    res.val[2] = vld1q_s8(ptr + 32);
-    res.val[3] = vld1q_s8(ptr + 48);
-
-    return res;
-}
-
-// NOTE: not tested
-inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
-    int8x16_t res;
-
-    res[ 0] = a[b[ 0]];
-    res[ 1] = a[b[ 1]];
-    res[ 2] = a[b[ 2]];
-    res[ 3] = a[b[ 3]];
-    res[ 4] = a[b[ 4]];
-    res[ 5] = a[b[ 5]];
-    res[ 6] = a[b[ 6]];
-    res[ 7] = a[b[ 7]];
-    res[ 8] = a[b[ 8]];
-    res[ 9] = a[b[ 9]];
-    res[10] = a[b[10]];
-    res[11] = a[b[11]];
-    res[12] = a[b[12]];
-    res[13] = a[b[13]];
-    res[14] = a[b[14]];
-    res[15] = a[b[15]];
-
-    return res;
-}
-
-// NOTE: not tested
-inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
-    uint8x16_t res;
-
-    res[ 0] = a[b[ 0]];
-    res[ 1] = a[b[ 1]];
-    res[ 2] = a[b[ 2]];
-    res[ 3] = a[b[ 3]];
-    res[ 4] = a[b[ 4]];
-    res[ 5] = a[b[ 5]];
-    res[ 6] = a[b[ 6]];
-    res[ 7] = a[b[ 7]];
-    res[ 8] = a[b[ 8]];
-    res[ 9] = a[b[ 9]];
-    res[10] = a[b[10]];
-    res[11] = a[b[11]];
-    res[12] = a[b[12]];
-    res[13] = a[b[13]];
-    res[14] = a[b[14]];
-    res[15] = a[b[15]];
-
-    return res;
-}
-
-#else
-
-#define ggml_int16x8x2_t int16x8x2_t
-#define ggml_uint8x16x2_t uint8x16x2_t
-#define ggml_uint8x16x4_t uint8x16x4_t
-#define ggml_int8x16x2_t int8x16x2_t
-#define ggml_int8x16x4_t int8x16x4_t
-
-#define ggml_vld1q_s16_x2 vld1q_s16_x2
-#define ggml_vld1q_u8_x2 vld1q_u8_x2
-#define ggml_vld1q_u8_x4 vld1q_u8_x4
-#define ggml_vld1q_s8_x2 vld1q_s8_x2
-#define ggml_vld1q_s8_x4 vld1q_s8_x4
-#define ggml_vqtbl1q_s8 vqtbl1q_s8
-#define ggml_vqtbl1q_u8 vqtbl1q_u8
-
-#endif
-
-#if !defined(__ARM_FEATURE_DOTPROD)
-
-inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
-    const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
-    const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
-
-    return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
-}
-
-#else
-
-#define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
-
-#endif
-
-#endif
-
 #if defined(__ARM_NEON) || defined(__wasm_simd128__)
 #define B1(c,s,n)  0x ## n ## c ,  0x ## n ## s
 #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
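Note: everything this hunk removes is the 32-bit ARM compatibility layer: scalar polyfills for AArch64-only across-vector intrinsics (vaddvq_*, vmaxvq_f32, ...), struct-based replacements for the vld1q_*_x2/_x4 multi-register loads, and a ggml_vdotq_s32 fallback for CPUs without the ARMv8.2 dot-product extension. A plain-C sketch of what that last fallback computes (no NEON required; the emulated_ name is ours, not ggml's):

    #include <stdint.h>
    #include <stdio.h>

    // Mirrors the removed ggml_vdotq_s32 fallback: vmull_s8 on each half,
    // vpaddlq_s16 on the two products, then vaddq_s32 into the accumulator.
    // The lane grouping differs from a real vdotq_s32 (pairs from the low
    // and high halves land in the same lane), but the sum across all four
    // lanes -- the only thing the quant kernels reduce to -- is the same
    // full 16-element dot product.
    static void emulated_ggml_vdotq_s32(int32_t acc[4], const int8_t a[16], const int8_t b[16]) {
        int16_t p[16];
        for (int i = 0; i < 16; i++) {
            p[i] = (int16_t)((int16_t)a[i] * (int16_t)b[i]); // widening multiply (vmull_s8)
        }
        for (int i = 0; i < 4; i++) {
            // vpaddlq_s16 on each half, then vaddq_s32
            acc[i] += (p[2*i] + p[2*i + 1]) + (p[8 + 2*i] + p[8 + 2*i + 1]);
        }
    }

    int main(void) {
        int8_t a[16], b[16];
        int32_t acc[4] = {0};
        for (int i = 0; i < 16; i++) { a[i] = (int8_t)(i - 8); b[i] = (int8_t)(3 - i); }
        emulated_ggml_vdotq_s32(acc, a, b);
        printf("%d\n", acc[0] + acc[1] + acc[2] + acc[3]); // equals the plain dot product
        return 0;
    }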
data/vendor/tmp/llama.cpp/ggml.c
CHANGED
@@ -858,18 +858,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
 // simd mappings
 //
 
-#if defined(__ARM_NEON)
-#if !defined(__aarch64__)
-
-// 64-bit compatibility
-
-inline static float vaddvq_f32(float32x4_t v) {
-    return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
-}
-
-#endif
-#endif
-
 // we define a common set of C macros which map to specific intrinsics based on the current architecture
 // we then implement the fundamental computation operations below using only these macros
 // adding support for new architectures requires to define the corresponding SIMD macros
@@ -10825,7 +10813,7 @@ static void ggml_compute_forward_mul_mat(
 #endif
 
 #if GGML_USE_LLAMAFILE
-    if (
+    if (src1_cont) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
                 if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
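Note: the replaced condition (its old multi-line form is truncated to "if (" in this rendering) gates the llamafile_sgemm fast path on src1_cont alone; upstream llama.cpp computes that flag as ggml_is_contiguous(src1). A toy version of the contiguity test it relies on (illustrative names; the real ggml check also accounts for quantized block sizes):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef struct {
        int64_t ne[4];    // elements per dimension
        size_t  nb[4];    // stride in bytes per dimension
        size_t  type_size;
    } toy_tensor;

    // packed row-major: each stride must equal the previous stride
    // times that dimension's extent, with no padding anywhere
    static bool toy_is_contiguous(const toy_tensor *t) {
        return t->nb[0] == t->type_size                &&
               t->nb[1] == t->nb[0] * (size_t)t->ne[0] &&
               t->nb[2] == t->nb[1] * (size_t)t->ne[1] &&
               t->nb[3] == t->nb[2] * (size_t)t->ne[2];
    }

    int main(void) {
        toy_tensor t = { {8, 4, 1, 1}, {4, 32, 128, 128}, sizeof(float) }; // packed 8x4 f32
        printf("contiguous: %d\n", toy_is_contiguous(&t)); // 1
        return 0;
    }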
@@ -10878,15 +10866,13 @@ UseGgmlGemm1:;
     const size_t row_size = ggml_row_size(vec_dot_type, ne10);
 
 #if GGML_USE_LLAMAFILE
-    if (
+    if (src1->type != vec_dot_type) {
         for (int64_t i13 = 0; i13 < ne13; i13++)
             for (int64_t i12 = 0; i12 < ne12; i12++)
                 if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
                                      (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
                                      nb01/ggml_type_size(src0->type),
-                                     (const char *)wdata +
-                                         nb12/ggml_type_size(src1->type)*i12 +
-                                         nb13/ggml_type_size(src1->type)*i13),
+                                     (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
                                      row_size/ggml_type_size(vec_dot_type),
                                      (char *)dst->data + i12*nb2 + i13*nb3,
                                      nb1/ggml_type_size(dst->type),
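Note: the second change in this hunk drops byte-stride arithmetic on src1 in favor of whole-row indexing into wdata, the scratch buffer holding src1 already converted to vec_dot_type: the buffer stores ne11 rows per i12 slice and ne12*ne11 rows per i13 slice, each row_size bytes long. A minimal sketch of the new batch offset (hypothetical helper, not gem code):

    #include <stdint.h>
    #include <stdio.h>

    // start of the converted src1 matrix for batch (i12, i13), in bytes --
    // exactly the (i12*ne11 + i13*ne12*ne11)*row_size expression the + line adds
    static size_t wdata_batch_offset(int64_t i12, int64_t i13,
                                     int64_t ne11, int64_t ne12, size_t row_size) {
        return (size_t)(i12*ne11 + i13*ne12*ne11) * row_size;
    }

    int main(void) {
        // e.g. 32 rows per matrix, 4 matrices per i13 slice, 144-byte quantized rows
        printf("%zu\n", wdata_batch_offset(1, 2, 32, 4, 144)); // (32 + 256)*144 = 41472
        return 0;
    }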