llama_cpp 0.14.6 → 0.14.7

@@ -14,47 +14,6 @@
  #include <stdlib.h> // for qsort
  #include <stdio.h> // for GGML_ASSERT

- #ifdef __ARM_NEON
-
- // if YCM cannot find <arm_neon.h>, make a symbolic link to it, for example:
- //
- // $ ln -sfn /Library/Developer/CommandLineTools/usr/lib/clang/13.1.6/include/arm_neon.h ./src/
- //
- #include <arm_neon.h>
-
- #else
-
- #ifdef __wasm_simd128__
- #include <wasm_simd128.h>
- #else
- #if defined(__POWER9_VECTOR__) || defined(__powerpc64__)
- #include <altivec.h>
- #undef bool
- #define bool _Bool
- #else
- #if defined(_MSC_VER) || defined(__MINGW32__)
- #include <intrin.h>
- #else
- #if defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__) || defined(__SSE3__)
- #if !defined(__riscv)
- #include <immintrin.h>
- #endif
- #endif
- #endif
- #endif
- #endif
- #endif
-
- #ifdef __riscv_v_intrinsic
- #include <riscv_vector.h>
- #endif
-
- #undef MIN
- #undef MAX
-
- #define MIN(a, b) ((a) < (b) ? (a) : (b))
- #define MAX(a, b) ((a) > (b) ? (a) : (b))
-
  #define UNUSED GGML_UNUSED

  // some compilers don't provide _mm256_set_m128i, e.g. gcc 7
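
For readers who hit the gcc 7 gap mentioned in the context line above, a
minimal sketch of the usual portable fallback, using only AVX intrinsics
that are guaranteed to exist (the helper name is illustrative, not part
of this diff):

#include <immintrin.h>

// Build a 256-bit vector from two 128-bit halves; equivalent to
// _mm256_set_m128i(hi, lo) on compilers that provide it.
static inline __m256i my_mm256_set_m128i(__m128i hi, __m128i lo) {
    // place `lo` in the low lane, then insert `hi` into lane 1
    return _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);
}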
@@ -276,258 +235,6 @@ static inline float hsum_float_4x4(const __m128 a, const __m128 b, const __m128
  #endif // __AVX__ || __AVX2__ || __AVX512F__
  #endif // defined(__AVX__) || defined(__AVX2__) || defined(__AVX512F__) || defined(__SSSE3__)

- #if defined(__ARM_NEON)
-
- #ifdef _MSC_VER
-
- #define ggml_vld1q_u32(w,x,y,z) { ((w) + ((uint64_t)(x) << 32)), ((y) + ((uint64_t)(z) << 32)) }
-
- #else
-
- #define ggml_vld1q_u32(w,x,y,z) { (w), (x), (y), (z) }
-
- #endif
-
- #if !defined(__aarch64__)
-
- // 64-bit compatibility
-
- // vaddvq_s16
- // vpaddq_s16
- // vpaddq_s32
- // vaddvq_s32
- // vaddvq_f32
- // vmaxvq_f32
- // vcvtnq_s32_f32
- // vzip1_u8
- // vzip2_u8
-
- inline static int32_t vaddvq_s16(int16x8_t v) {
-     return
-         (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
-         (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
-         (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
-         (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
- }
-
- inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
-     int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
-     int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
-     return vcombine_s16(a0, b0);
- }
-
- inline static int32x4_t vpaddq_s32(int32x4_t a, int32x4_t b) {
-     int32x2_t a0 = vpadd_s32(vget_low_s32(a), vget_high_s32(a));
-     int32x2_t b0 = vpadd_s32(vget_low_s32(b), vget_high_s32(b));
-     return vcombine_s32(a0, b0);
- }
-
- inline static int32_t vaddvq_s32(int32x4_t v) {
-     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
- }
-
- inline static float vaddvq_f32(float32x4_t v) {
-     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
- }
-
- inline static float vmaxvq_f32(float32x4_t v) {
-     return
-         MAX(MAX(vgetq_lane_f32(v, 0), vgetq_lane_f32(v, 1)),
-             MAX(vgetq_lane_f32(v, 2), vgetq_lane_f32(v, 3)));
- }
-
- inline static int32x4_t vcvtnq_s32_f32(float32x4_t v) {
-     int32x4_t res;
-
-     res[0] = roundf(vgetq_lane_f32(v, 0));
-     res[1] = roundf(vgetq_lane_f32(v, 1));
-     res[2] = roundf(vgetq_lane_f32(v, 2));
-     res[3] = roundf(vgetq_lane_f32(v, 3));
-
-     return res;
- }
-
- inline static uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
-     uint8x8_t res;
-
-     res[0] = a[0]; res[1] = b[0];
-     res[2] = a[1]; res[3] = b[1];
-     res[4] = a[2]; res[5] = b[2];
-     res[6] = a[3]; res[7] = b[3];
-
-     return res;
- }
-
- inline static uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
-     uint8x8_t res;
-
-     res[0] = a[4]; res[1] = b[4];
-     res[2] = a[5]; res[3] = b[5];
-     res[4] = a[6]; res[5] = b[6];
-     res[6] = a[7]; res[7] = b[7];
-
-     return res;
- }
-
- // vld1q_s16_x2
- // vld1q_u8_x2
- // vld1q_u8_x4
- // vld1q_s8_x2
- // vld1q_s8_x4
- // TODO: double-check these work correctly
-
- typedef struct ggml_int16x8x2_t {
-     int16x8_t val[2];
- } ggml_int16x8x2_t;
-
- inline static ggml_int16x8x2_t ggml_vld1q_s16_x2(const int16_t * ptr) {
-     ggml_int16x8x2_t res;
-
-     res.val[0] = vld1q_s16(ptr + 0);
-     res.val[1] = vld1q_s16(ptr + 8);
-
-     return res;
- }
-
- typedef struct ggml_uint8x16x2_t {
-     uint8x16_t val[2];
- } ggml_uint8x16x2_t;
-
- inline static ggml_uint8x16x2_t ggml_vld1q_u8_x2(const uint8_t * ptr) {
-     ggml_uint8x16x2_t res;
-
-     res.val[0] = vld1q_u8(ptr + 0);
-     res.val[1] = vld1q_u8(ptr + 16);
-
-     return res;
- }
-
- typedef struct ggml_uint8x16x4_t {
-     uint8x16_t val[4];
- } ggml_uint8x16x4_t;
-
- inline static ggml_uint8x16x4_t ggml_vld1q_u8_x4(const uint8_t * ptr) {
-     ggml_uint8x16x4_t res;
-
-     res.val[0] = vld1q_u8(ptr + 0);
-     res.val[1] = vld1q_u8(ptr + 16);
-     res.val[2] = vld1q_u8(ptr + 32);
-     res.val[3] = vld1q_u8(ptr + 48);
-
-     return res;
- }
-
- typedef struct ggml_int8x16x2_t {
-     int8x16_t val[2];
- } ggml_int8x16x2_t;
-
- inline static ggml_int8x16x2_t ggml_vld1q_s8_x2(const int8_t * ptr) {
-     ggml_int8x16x2_t res;
-
-     res.val[0] = vld1q_s8(ptr + 0);
-     res.val[1] = vld1q_s8(ptr + 16);
-
-     return res;
- }
-
- typedef struct ggml_int8x16x4_t {
-     int8x16_t val[4];
- } ggml_int8x16x4_t;
-
- inline static ggml_int8x16x4_t ggml_vld1q_s8_x4(const int8_t * ptr) {
-     ggml_int8x16x4_t res;
-
-     res.val[0] = vld1q_s8(ptr + 0);
-     res.val[1] = vld1q_s8(ptr + 16);
-     res.val[2] = vld1q_s8(ptr + 32);
-     res.val[3] = vld1q_s8(ptr + 48);
-
-     return res;
- }
-
- // NOTE: not tested
- inline static int8x16_t ggml_vqtbl1q_s8(int8x16_t a, uint8x16_t b) {
-     int8x16_t res;
-
-     res[ 0] = a[b[ 0]];
-     res[ 1] = a[b[ 1]];
-     res[ 2] = a[b[ 2]];
-     res[ 3] = a[b[ 3]];
-     res[ 4] = a[b[ 4]];
-     res[ 5] = a[b[ 5]];
-     res[ 6] = a[b[ 6]];
-     res[ 7] = a[b[ 7]];
-     res[ 8] = a[b[ 8]];
-     res[ 9] = a[b[ 9]];
-     res[10] = a[b[10]];
-     res[11] = a[b[11]];
-     res[12] = a[b[12]];
-     res[13] = a[b[13]];
-     res[14] = a[b[14]];
-     res[15] = a[b[15]];
-
-     return res;
- }
-
- // NOTE: not tested
- inline static uint8x16_t ggml_vqtbl1q_u8(uint8x16_t a, uint8x16_t b) {
-     uint8x16_t res;
-
-     res[ 0] = a[b[ 0]];
-     res[ 1] = a[b[ 1]];
-     res[ 2] = a[b[ 2]];
-     res[ 3] = a[b[ 3]];
-     res[ 4] = a[b[ 4]];
-     res[ 5] = a[b[ 5]];
-     res[ 6] = a[b[ 6]];
-     res[ 7] = a[b[ 7]];
-     res[ 8] = a[b[ 8]];
-     res[ 9] = a[b[ 9]];
-     res[10] = a[b[10]];
-     res[11] = a[b[11]];
-     res[12] = a[b[12]];
-     res[13] = a[b[13]];
-     res[14] = a[b[14]];
-     res[15] = a[b[15]];
-
-     return res;
- }
-
- #else
-
- #define ggml_int16x8x2_t int16x8x2_t
- #define ggml_uint8x16x2_t uint8x16x2_t
- #define ggml_uint8x16x4_t uint8x16x4_t
- #define ggml_int8x16x2_t int8x16x2_t
- #define ggml_int8x16x4_t int8x16x4_t
-
- #define ggml_vld1q_s16_x2 vld1q_s16_x2
- #define ggml_vld1q_u8_x2 vld1q_u8_x2
- #define ggml_vld1q_u8_x4 vld1q_u8_x4
- #define ggml_vld1q_s8_x2 vld1q_s8_x2
- #define ggml_vld1q_s8_x4 vld1q_s8_x4
- #define ggml_vqtbl1q_s8 vqtbl1q_s8
- #define ggml_vqtbl1q_u8 vqtbl1q_u8
-
- #endif
-
- #if !defined(__ARM_FEATURE_DOTPROD)
-
- inline static int32x4_t ggml_vdotq_s32(int32x4_t acc, int8x16_t a, int8x16_t b) {
-     const int16x8_t p0 = vmull_s8(vget_low_s8 (a), vget_low_s8 (b));
-     const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
-
-     return vaddq_s32(acc, vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
- }
-
- #else
-
- #define ggml_vdotq_s32(a, b, c) vdotq_s32(a, b, c)
-
- #endif
-
- #endif
-
  #if defined(__ARM_NEON) || defined(__wasm_simd128__)
  #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
  #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
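
The B1/B2 context lines above build lookup tables by token pasting; a
short worked expansion may help (it assumes companion macros B3..B8 that
continue the same pattern, as upstream ggml defines them):

// Each level appends one two-digit hex "byte", chosen as c or s:
//   B1(00, 10, 00)  ->  0x0000, 0x0010
//   B2(00, 10, 00)  ->  B1(00,10,0000), B1(00,10,0010)
//                   ->  0x000000, 0x000010, 0x001000, 0x001010
// Chained up to B8, this enumerates 256 64-bit constants: entry i holds
// eight bytes, each equal to s or c depending on one bit of i -- a
// precomputed table for expanding 8 bits into 8 bytes.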
@@ -858,18 +858,6 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
  // simd mappings
  //

- #if defined(__ARM_NEON)
- #if !defined(__aarch64__)
-
- // 64-bit compatibility
-
- inline static float vaddvq_f32(float32x4_t v) {
-     return vgetq_lane_f32(v, 0) + vgetq_lane_f32(v, 1) + vgetq_lane_f32(v, 2) + vgetq_lane_f32(v, 3);
- }
-
- #endif
- #endif
-
  // we define a common set of C macros which map to specific intrinsics based on the current architecture
  // we then implement the fundamental computation operations below using only these macros
  // adding support for new architectures requires to define the corresponding SIMD macros
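
As an illustration of the macro-mapping scheme those comments describe
(names invented for this sketch; ggml's real macros are the GGML_F32_*
family), the pattern looks like this:

// One set of generic vector macros, defined per architecture:
#if defined(__AVX__)
    #define MY_F32_VEC           __m256
    #define MY_F32_VEC_LOAD(p)   _mm256_loadu_ps(p)
    #define MY_F32_VEC_ADD(a, b) _mm256_add_ps(a, b)
#elif defined(__ARM_NEON)
    #define MY_F32_VEC           float32x4_t
    #define MY_F32_VEC_LOAD(p)   vld1q_f32(p)
    #define MY_F32_VEC_ADD(a, b) vaddq_f32(a, b)
#endif
// Compute kernels are then written once against MY_F32_VEC_*; porting
// to a new architecture only means supplying another #elif branch.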
@@ -10825,7 +10813,7 @@ static void ggml_compute_forward_mul_mat(
  #endif

  #if GGML_USE_LLAMAFILE
-     if (nb10 == ggml_type_size(src1->type)) {
+     if (src1_cont) {
          for (int64_t i13 = 0; i13 < ne13; i13++)
              for (int64_t i12 = 0; i12 < ne12; i12++)
                  if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
@@ -10878,15 +10866,13 @@ UseGgmlGemm1:;
      const size_t row_size = ggml_row_size(vec_dot_type, ne10);

  #if GGML_USE_LLAMAFILE
-     if (nb10 == ggml_type_size(src1->type) || src1->type != vec_dot_type) {
+     if (src1->type != vec_dot_type) {
          for (int64_t i13 = 0; i13 < ne13; i13++)
              for (int64_t i12 = 0; i12 < ne12; i12++)
                  if (!llamafile_sgemm(ne01, ne11, ne00/ggml_blck_size(src0->type),
                                       (const char *)src0->data + i12/r2*nb02 + i13/r3*nb03,
                                       nb01/ggml_type_size(src0->type),
-                                      (const char *)wdata + ggml_row_size(vec_dot_type,
-                                          nb12/ggml_type_size(src1->type)*i12 +
-                                          nb13/ggml_type_size(src1->type)*i13),
+                                      (const char *)wdata + (i12*ne11 + i13*ne12*ne11)*row_size,
                                       row_size/ggml_type_size(vec_dot_type),
                                       (char *)dst->data + i12*nb2 + i13*nb3,
                                       nb1/ggml_type_size(dst->type),
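
Taken together, the two llamafile hunks tighten the fast-path guard
(src1 must be fully contiguous, per the src1_cont flag, before it is
handed to llamafile_sgemm) and fix how rows of the converted src1 are
located in wdata. A minimal sketch of the corrected addressing, reusing
the local names from the diff (not a drop-in ggml function):

#include <stddef.h>
#include <stdint.h>

// After src1 is converted to vec_dot_type, its rows are packed densely
// into wdata at a fixed stride of row_size bytes, so the base of batch
// (i12, i13) is a simple linear offset:
static inline const char * src1_wdata_row(const char * wdata,
                                          int64_t i12, int64_t i13,
                                          int64_t ne11, int64_t ne12,
                                          size_t row_size) {
    return wdata + ((size_t)i12*ne11 + (size_t)i13*ne12*ne11)*row_size;
}

The replaced expression derived the offset from nb12/nb13, the byte
strides of the unconverted src1 tensor, which mis-addresses wdata
whenever the converted row size differs from the original.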