llama_cpp 0.0.7 → 0.1.0

Sign up to get free protection for your applications and to get access to all the features.
@@ -135,14 +135,6 @@ inline static void* ggml_aligned_malloc(size_t size) {
135
135
  #define UNUSED(x) (void)(x)
136
136
  #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
137
137
 
138
- #define GGML_ASSERT(x) \
139
- do { \
140
- if (!(x)) { \
141
- fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
142
- abort(); \
143
- } \
144
- } while (0)
145
-
146
138
  #if defined(GGML_USE_ACCELERATE)
147
139
  #include <Accelerate/Accelerate.h>
148
140
  #elif defined(GGML_USE_OPENBLAS)
@@ -188,9 +180,13 @@ typedef double ggml_float;
188
180
  #undef bool
189
181
  #define bool _Bool
190
182
  #else
183
+ #if defined(_MSC_VER) || defined(__MINGW32__)
184
+ #include <intrin.h>
185
+ #else
191
186
  #include <immintrin.h>
192
187
  #endif
193
188
  #endif
189
+ #endif
194
190
 
195
191
  #ifdef __F16C__
196
192
 
@@ -330,7 +326,7 @@ static ggml_fp16_t table_exp_f16[1 << 16];
330
326
  // precomputed f32 table for f16 (256 KB)
331
327
  static float table_f32_f16[1 << 16];
332
328
 
333
- #if defined(__ARM_NEON)
329
+ #if defined(__ARM_NEON) || defined(__wasm_simd128__)
334
330
  #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
335
331
  #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
336
332
  #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
@@ -370,6 +366,32 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
370
366
  return GGML_FP32_TO_FP16(x);
371
367
  }
372
368
 
369
+ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) {
370
+ for (size_t i = 0; i < n; i++) {
371
+ y[i] = GGML_FP16_TO_FP32(x[i]);
372
+ }
373
+ }
374
+
375
+ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
376
+ size_t i = 0;
377
+ #if defined(__F16C__)
378
+ for (; i + 7 < n; i += 8) {
379
+ __m256 x_vec = _mm256_loadu_ps(x + i);
380
+ __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
381
+ _mm_storeu_si128((__m128i *)(y + i), y_vec);
382
+ }
383
+ for(; i + 3 < n; i += 4) {
384
+ __m128 x_vec = _mm_loadu_ps(x + i);
385
+ __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
386
+ _mm_storel_epi64((__m128i *)(y + i), y_vec);
387
+ }
388
+ #endif
389
+ for (; i < n; i++) {
390
+ y[i] = GGML_FP32_TO_FP16(x[i]);
391
+ }
392
+ }
393
+
394
+
373
395
  //
374
396
  // timing
375
397
  //
@@ -653,19 +675,102 @@ float vmaxvq_f32(float32x4_t v) {
653
675
  }
654
676
 
655
677
  int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
656
- return vget_low_s8(vcombine_s8(a, b));
678
+ int8x8_t res;
679
+
680
+ res[0] = a[0]; res[1] = b[0];
681
+ res[2] = a[1]; res[3] = b[1];
682
+ res[4] = a[2]; res[5] = b[2];
683
+ res[6] = a[3]; res[7] = b[3];
684
+
685
+ return res;
657
686
  }
658
687
 
659
688
  int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
660
- return vget_high_s8(vcombine_s8(a, b));
689
+ int8x8_t res;
690
+
691
+ res[0] = a[4]; res[1] = b[4];
692
+ res[2] = a[5]; res[3] = b[5];
693
+ res[4] = a[6]; res[5] = b[6];
694
+ res[6] = a[7]; res[7] = b[7];
695
+
696
+ return res;
661
697
  }
662
698
 
663
699
  uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
664
- return vget_low_u8(vcombine_u8(a, b));
700
+ uint8x8_t res;
701
+
702
+ res[0] = a[0]; res[1] = b[0];
703
+ res[2] = a[1]; res[3] = b[1];
704
+ res[4] = a[2]; res[5] = b[2];
705
+ res[6] = a[3]; res[7] = b[3];
706
+
707
+ return res;
665
708
  }
666
709
 
667
710
  uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
668
- return vget_high_u8(vcombine_u8(a, b));
711
+ uint8x8_t res;
712
+
713
+ res[0] = a[4]; res[1] = b[4];
714
+ res[2] = a[5]; res[3] = b[5];
715
+ res[4] = a[6]; res[5] = b[6];
716
+ res[6] = a[7]; res[7] = b[7];
717
+
718
+ return res;
719
+ }
720
+
721
+ int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
722
+ int8x16_t res;
723
+
724
+ res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1];
725
+ res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3];
726
+ res[8] = a[4]; res[9] = b[4]; res[10] = a[5]; res[11] = b[5];
727
+ res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
728
+
729
+ return res;
730
+ }
731
+
732
+ int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
733
+ int8x16_t res;
734
+
735
+ res[0] = a[8]; res[1] = b[8]; res[2] = a[9]; res[3] = b[9];
736
+ res[4] = a[10]; res[5] = b[10]; res[6] = a[11]; res[7] = b[11];
737
+ res[8] = a[12]; res[9] = b[12]; res[10] = a[13]; res[11] = b[13];
738
+ res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
739
+
740
+ return res;
741
+ }
742
+
743
+ uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) {
744
+ uint8x16_t res;
745
+
746
+ res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1];
747
+ res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3];
748
+ res[8] = a[4]; res[9] = b[4]; res[10] = a[5]; res[11] = b[5];
749
+ res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
750
+
751
+ return res;
752
+ }
753
+
754
+ uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
755
+ uint8x16_t res;
756
+
757
+ res[0] = a[8]; res[1] = b[8]; res[2] = a[9]; res[3] = b[9];
758
+ res[4] = a[10]; res[5] = b[10]; res[6] = a[11]; res[7] = b[11];
759
+ res[8] = a[12]; res[9] = b[12]; res[10] = a[13]; res[11] = b[13];
760
+ res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
761
+
762
+ return res;
763
+ }
764
+
765
+ int32x4_t vcvtnq_s32_f32(float32x4_t v) {
766
+ int32x4_t res;
767
+
768
+ res[0] = roundf(vgetq_lane_f32(v, 0));
769
+ res[1] = roundf(vgetq_lane_f32(v, 1));
770
+ res[2] = roundf(vgetq_lane_f32(v, 2));
771
+ res[3] = roundf(vgetq_lane_f32(v, 3));
772
+
773
+ return res;
669
774
  }
670
775
 
671
776
  #endif
@@ -694,14 +799,6 @@ typedef struct {
694
799
  } block_q4_2;
695
800
  static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
696
801
 
697
- #define QK4_3 16
698
- typedef struct {
699
- ggml_fp16_t d; // delta
700
- ggml_fp16_t m; // min
701
- uint8_t qs[QK4_3 / 2]; // nibbles / quants
702
- } block_q4_3;
703
- static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
704
-
705
802
  #define QK5_0 32
706
803
  typedef struct {
707
804
  ggml_fp16_t d; // delta
@@ -789,6 +886,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
789
886
  float max = 0.0f;
790
887
  float min = 0.0f;
791
888
 
889
+ vector float asrcv [8];
792
890
  vector float srcv [8];
793
891
  vector float maxv[8];
794
892
  vector float minv[8];
@@ -1068,7 +1166,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
1068
1166
  const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id));
1069
1167
  const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
1070
1168
  const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
1071
- const v128_t vc = wasm_i32x4_min_u(vi, wasm_i32x4_splat(15));
1169
+ const v128_t vc = wasm_i32x4_min(vi, wasm_i32x4_splat(15));
1072
1170
 
1073
1171
  y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4);
1074
1172
  y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4);
@@ -1291,49 +1389,6 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int
1291
1389
  quantize_row_q4_2_reference(x, y, k);
1292
1390
  }
1293
1391
 
1294
- static void quantize_row_q4_3_reference(const float * restrict x, block_q4_3 * restrict y, int k) {
1295
- assert(k % QK4_3 == 0);
1296
- const int nb = k / QK4_3;
1297
-
1298
- for (int i = 0; i < nb; i++) {
1299
- float min = FLT_MAX;
1300
- float max = -FLT_MAX;
1301
-
1302
- for (int l = 0; l < QK4_3; l++) {
1303
- const float v = x[i*QK4_3 + l];
1304
- if (v < min) min = v;
1305
- if (v > max) max = v;
1306
- }
1307
-
1308
- const float d = (max - min) / ((1 << 4) - 1);
1309
- const float id = d ? 1.0f/d : 0.0f;
1310
-
1311
- y[i].d = GGML_FP32_TO_FP16(d);
1312
- y[i].m = GGML_FP32_TO_FP16(min);
1313
-
1314
- for (int l = 0; l < QK4_3; l += 2) {
1315
- const float v0 = (x[i*QK4_3 + l + 0] - min)*id;
1316
- const float v1 = (x[i*QK4_3 + l + 1] - min)*id;
1317
-
1318
- const uint8_t vi0 = (int) (v0 + 0.5f);
1319
- const uint8_t vi1 = (int) (v1 + 0.5f);
1320
-
1321
- assert(vi0 < 16);
1322
- assert(vi1 < 16);
1323
-
1324
- y[i].qs[l/2] = vi0 | (vi1 << 4);
1325
- }
1326
- }
1327
- }
1328
-
1329
- static void quantize_row_q4_3(const float * restrict x, void * restrict vy, int k) {
1330
- assert(k % QK4_3 == 0);
1331
-
1332
- block_q4_3 * restrict y = vy;
1333
-
1334
- quantize_row_q4_3_reference(x, y, k);
1335
- }
1336
-
1337
1392
  static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
1338
1393
  assert(k % QK5_0 == 0);
1339
1394
  const int nb = k / QK5_0;
@@ -1458,15 +1513,135 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
1458
1513
  }
1459
1514
 
1460
1515
  static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
1516
+ assert(QK8_0 == 32);
1461
1517
  assert(k % QK8_0 == 0);
1518
+ const int nb = k / QK8_0;
1462
1519
 
1463
1520
  block_q8_0 * restrict y = vy;
1464
1521
 
1522
+ #if defined(__ARM_NEON)
1523
+ for (int i = 0; i < nb; i++) {
1524
+ float32x4_t srcv [8];
1525
+ float32x4_t asrcv[8];
1526
+ float32x4_t amaxv[8];
1527
+
1528
+ for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l);
1529
+ for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);
1530
+
1531
+ for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
1532
+ for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
1533
+ for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
1534
+
1535
+ const float amax = vmaxvq_f32(amaxv[0]);
1536
+
1537
+ const float d = amax / ((1 << 7) - 1);
1538
+ const float id = d ? 1.0f/d : 0.0f;
1539
+
1540
+ y[i].d = d;
1541
+
1542
+ for (int l = 0; l < 8; l++) {
1543
+ const float32x4_t v = vmulq_n_f32(srcv[l], id);
1544
+ const int32x4_t vi = vcvtnq_s32_f32(v);
1545
+
1546
+ y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
1547
+ y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);
1548
+ y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
1549
+ y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
1550
+ }
1551
+ }
1552
+ #elif defined(__AVX2__) || defined(__AVX__)
1553
+ for (int i = 0; i < nb; i++) {
1554
+ // Load elements into 4 AVX vectors
1555
+ __m256 v0 = _mm256_loadu_ps( x );
1556
+ __m256 v1 = _mm256_loadu_ps( x + 8 );
1557
+ __m256 v2 = _mm256_loadu_ps( x + 16 );
1558
+ __m256 v3 = _mm256_loadu_ps( x + 24 );
1559
+ x += 32;
1560
+
1561
+ // Compute max(abs(e)) for the block
1562
+ const __m256 signBit = _mm256_set1_ps( -0.0f );
1563
+ __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
1564
+ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
1565
+ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
1566
+ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
1567
+
1568
+ __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
1569
+ max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
1570
+ max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
1571
+ const float maxScalar = _mm_cvtss_f32( max4 );
1572
+
1573
+ // Quantize these floats
1574
+ const float d = maxScalar / 127.f;
1575
+ y[i].d = d;
1576
+ const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
1577
+ const __m256 mul = _mm256_set1_ps( id );
1578
+
1579
+ // Apply the multiplier
1580
+ v0 = _mm256_mul_ps( v0, mul );
1581
+ v1 = _mm256_mul_ps( v1, mul );
1582
+ v2 = _mm256_mul_ps( v2, mul );
1583
+ v3 = _mm256_mul_ps( v3, mul );
1584
+
1585
+ // Round to nearest integer
1586
+ v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
1587
+ v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
1588
+ v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
1589
+ v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
1590
+
1591
+ // Convert floats to integers
1592
+ __m256i i0 = _mm256_cvtps_epi32( v0 );
1593
+ __m256i i1 = _mm256_cvtps_epi32( v1 );
1594
+ __m256i i2 = _mm256_cvtps_epi32( v2 );
1595
+ __m256i i3 = _mm256_cvtps_epi32( v3 );
1596
+
1597
+ #if defined(__AVX2__)
1598
+ // Convert int32 to int16
1599
+ i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
1600
+ i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31
1601
+ // Convert int16 to int8
1602
+ i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
1603
+
1604
+ // We got our precious signed bytes, but the order is now wrong
1605
+ // These AVX2 pack instructions process 16-byte pieces independently
1606
+ // The following instruction is fixing the order
1607
+ const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
1608
+ i0 = _mm256_permutevar8x32_epi32( i0, perm );
1609
+
1610
+ _mm256_storeu_si256((__m256i *)y[i].qs, i0);
1611
+ #else
1612
+ // Since we don't have in AVX some necessary functions,
1613
+ // we split the registers in half and call AVX2 analogs from SSE
1614
+ __m128i ni0 = _mm256_castsi256_si128( i0 );
1615
+ __m128i ni1 = _mm256_extractf128_si256( i0, 1);
1616
+ __m128i ni2 = _mm256_castsi256_si128( i1 );
1617
+ __m128i ni3 = _mm256_extractf128_si256( i1, 1);
1618
+ __m128i ni4 = _mm256_castsi256_si128( i2 );
1619
+ __m128i ni5 = _mm256_extractf128_si256( i2, 1);
1620
+ __m128i ni6 = _mm256_castsi256_si128( i3 );
1621
+ __m128i ni7 = _mm256_extractf128_si256( i3, 1);
1622
+
1623
+ // Convert int32 to int16
1624
+ ni0 = _mm_packs_epi32( ni0, ni1 );
1625
+ ni2 = _mm_packs_epi32( ni2, ni3 );
1626
+ ni4 = _mm_packs_epi32( ni4, ni5 );
1627
+ ni6 = _mm_packs_epi32( ni6, ni7 );
1628
+ // Convert int16 to int8
1629
+ ni0 = _mm_packs_epi16( ni0, ni2 );
1630
+ ni4 = _mm_packs_epi16( ni4, ni6 );
1631
+
1632
+ _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0);
1633
+ _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
1634
+ #endif
1635
+ }
1636
+ #else
1637
+ // scalar
1465
1638
  quantize_row_q8_0_reference(x, y, k);
1639
+ #endif
1466
1640
  }
1467
1641
 
1468
1642
  // reference implementation for deterministic creation of model files
1469
1643
  static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) {
1644
+ assert(QK8_1 == 32);
1470
1645
  assert(k % QK8_1 == 0);
1471
1646
  const int nb = k / QK8_1;
1472
1647
 
@@ -1917,36 +2092,6 @@ static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, in
1917
2092
  }
1918
2093
  }
1919
2094
 
1920
- static void dequantize_row_q4_3(const void * restrict vx, float * restrict y, int k) {
1921
- assert(k % QK4_3 == 0);
1922
- const int nb = k / QK4_3;
1923
-
1924
- const block_q4_3 * restrict x = vx;
1925
-
1926
- for (int i = 0; i < nb; i++) {
1927
- const float d = GGML_FP16_TO_FP32(x[i].d);
1928
- const float m = GGML_FP16_TO_FP32(x[i].m);
1929
-
1930
- const uint8_t * restrict pp = x[i].qs;
1931
-
1932
- for (int l = 0; l < QK4_3; l += 2) {
1933
- const uint8_t vi = pp[l/2];
1934
-
1935
- const int8_t vi0 = vi & 0x0F;
1936
- const int8_t vi1 = vi >> 4;
1937
-
1938
- const float v0 = vi0*d + m;
1939
- const float v1 = vi1*d + m;
1940
-
1941
- y[i*QK4_3 + l + 0] = v0;
1942
- y[i*QK4_3 + l + 1] = v1;
1943
-
1944
- assert(!isnan(y[i*QK4_3 + l + 0]));
1945
- assert(!isnan(y[i*QK4_3 + l + 1]));
1946
- }
1947
- }
1948
- }
1949
-
1950
2095
  static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, int k) {
1951
2096
  assert(k % QK5_0 == 0);
1952
2097
  const int nb = k / QK5_0;
@@ -1965,8 +2110,8 @@ static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, in
1965
2110
  const uint8_t vi = pp[l/2];
1966
2111
 
1967
2112
  // extract the 5-th bit from qh
1968
- const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
1969
- const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
2113
+ const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
2114
+ const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
1970
2115
 
1971
2116
  const int8_t vi0 = (vi & 0x0F) | vh0;
1972
2117
  const int8_t vi1 = (vi >> 4) | vh1;
@@ -2002,8 +2147,8 @@ static void dequantize_row_q5_1(const void * restrict vx, float * restrict y, in
2002
2147
  const uint8_t vi = pp[l/2];
2003
2148
 
2004
2149
  // extract the 5-th bit from qh
2005
- const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
2006
- const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
2150
+ const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
2151
+ const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
2007
2152
 
2008
2153
  const uint8_t vi0 = (vi & 0x0F) | vh0;
2009
2154
  const uint8_t vi1 = (vi >> 4) | vh1;
@@ -2040,7 +2185,6 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
2040
2185
  static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
2041
2186
  static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
2042
2187
  static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
2043
- static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
2044
2188
  static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
2045
2189
  static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
2046
2190
  static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -2070,14 +2214,6 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
2070
2214
  .vec_dot_q = ggml_vec_dot_q4_2_q8_0,
2071
2215
  .vec_dot_type = GGML_TYPE_Q8_0,
2072
2216
  },
2073
- [GGML_TYPE_Q4_3] = {
2074
- .dequantize_row_q = dequantize_row_q4_3,
2075
- .quantize_row_q = quantize_row_q4_3,
2076
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_3_reference,
2077
- .quantize_row_q_dot = quantize_row_q8_1,
2078
- .vec_dot_q = ggml_vec_dot_q4_3_q8_1,
2079
- .vec_dot_type = GGML_TYPE_Q8_1,
2080
- },
2081
2217
  [GGML_TYPE_Q5_0] = {
2082
2218
  .dequantize_row_q = dequantize_row_q5_0,
2083
2219
  .quantize_row_q = quantize_row_q5_0,
@@ -2748,35 +2884,35 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2748
2884
  const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
2749
2885
  const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
2750
2886
 
2887
+ // interleave
2888
+ const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs);
2889
+ const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs);
2890
+ const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs);
2891
+ const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs);
2892
+
2751
2893
  // load y
2752
2894
  const int8x16_t v1_0l = vld1q_s8(y0->qs);
2753
2895
  const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
2754
2896
  const int8x16_t v1_1l = vld1q_s8(y1->qs);
2755
2897
  const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
2756
2898
 
2757
- // interleave
2758
- const int8x16_t v1_0ls = vuzp1q_s8(v1_0l, v1_0h);
2759
- const int8x16_t v1_0hs = vuzp2q_s8(v1_0l, v1_0h);
2760
- const int8x16_t v1_1ls = vuzp1q_s8(v1_1l, v1_1h);
2761
- const int8x16_t v1_1hs = vuzp2q_s8(v1_1l, v1_1h);
2762
-
2763
2899
  #if defined(__ARM_FEATURE_DOTPROD)
2764
2900
  // dot product into int32x4_t
2765
- const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls), v0_0hs, v1_0hs);
2766
- const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls), v0_1hs, v1_1hs);
2901
+ const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h);
2902
+ const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h);
2767
2903
 
2768
2904
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
2769
2905
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
2770
2906
  #else
2771
- const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
2772
- const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
2773
- const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
2774
- const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
2907
+ const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
2908
+ const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
2909
+ const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
2910
+ const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
2775
2911
 
2776
- const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
2777
- const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
2778
- const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
2779
- const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
2912
+ const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
2913
+ const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
2914
+ const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
2915
+ const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));
2780
2916
 
2781
2917
  const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
2782
2918
  const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
@@ -3171,136 +3307,6 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
3171
3307
  #endif
3172
3308
  }
3173
3309
 
3174
- static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
3175
- const int nb = n / QK8_1;
3176
-
3177
- assert(n % QK8_1 == 0);
3178
- assert(nb % 2 == 0);
3179
- assert(QK8_1 == 2*QK4_3);
3180
-
3181
- const block_q4_3 * restrict x = vx;
3182
- const block_q8_1 * restrict y = vy;
3183
-
3184
- #if defined(__ARM_NEON)
3185
- float32x4_t sumv0 = vdupq_n_f32(0.0f);
3186
- float32x4_t sumv1 = vdupq_n_f32(0.0f);
3187
-
3188
- float summs0 = 0.0f;
3189
- float summs1 = 0.0f;
3190
-
3191
- for (int i = 0; i < nb; ++i) {
3192
- const block_q4_3 * restrict x0_0 = &x[2*(i + 0) + 0];
3193
- const block_q4_3 * restrict x0_1 = &x[2*(i + 0) + 1];
3194
-
3195
- const block_q8_1 * restrict y0 = &y[i + 0];
3196
-
3197
- summs0 += GGML_FP16_TO_FP32(x0_0->m) * y0->s0;
3198
- summs1 += GGML_FP16_TO_FP32(x0_1->m) * y0->s1;
3199
-
3200
- const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs));
3201
-
3202
- // 4-bit -> 8-bit
3203
- const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, vdupq_n_u8(0x0F)));
3204
- const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
3205
-
3206
- // interleave
3207
- const int8x16_t v0_0lz = vzip1q_s8(v0_0l, v0_0h);
3208
- const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h);
3209
-
3210
- // load y
3211
- const int8x16_t v1_0l = vld1q_s8(y0->qs);
3212
- const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
3213
-
3214
- const float x0_0d = GGML_FP16_TO_FP32(x0_0->d);
3215
- const float x0_1d = GGML_FP16_TO_FP32(x0_1->d);
3216
-
3217
- #if defined(__ARM_FEATURE_DOTPROD)
3218
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), x0_0d*y0->d);
3219
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), x0_1d*y0->d);
3220
- #else
3221
- const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
3222
- const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
3223
- const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
3224
- const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
3225
-
3226
- const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
3227
- const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
3228
-
3229
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(pl0), x0_0d*y0->d);
3230
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(ph0), x0_1d*y0->d);
3231
- #endif
3232
- }
3233
-
3234
- *s = vaddvq_f32(vaddq_f32(sumv0, sumv1)) + summs0 + summs1;
3235
- #elif defined(__AVX2__)
3236
- // Initialize accumulator with zeros
3237
- __m256 acc = _mm256_setzero_ps();
3238
- float summs = 0.0f;
3239
-
3240
- // Main loop
3241
- for (int i = 0; i < nb; i++) {
3242
- const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d));
3243
- const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d));
3244
- const __m256 dx = _mm256_set_m128(d1, d0);
3245
-
3246
- summs += GGML_FP16_TO_FP32(x[2*i + 0].m) * y[i].s0
3247
- + GGML_FP16_TO_FP32(x[2*i + 1].m) * y[i].s1;
3248
-
3249
- const __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs);
3250
- const __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs);
3251
- const __m256i bx = _mm256_set_m128i(bx1, bx0);
3252
-
3253
- const __m256 dy = _mm256_broadcast_ss(&y[i].d);
3254
- const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
3255
-
3256
- const __m256 q = mul_sum_i8_pairs_float(bx, by);
3257
-
3258
- acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
3259
- }
3260
-
3261
- *s = hsum_float_8(acc) + summs;
3262
- #else
3263
- // scalar
3264
- float sumf = 0.0;
3265
- for (int i = 0; i < nb; i++) {
3266
- const uint8_t * restrict x0 = x[2*i + 0].qs;
3267
- const uint8_t * restrict x1 = x[2*i + 1].qs;
3268
- const int8_t * restrict y0 = y[i].qs;
3269
-
3270
- const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d);
3271
- const float m0 = GGML_FP16_TO_FP32(x[2*i + 0].m);
3272
- const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d);
3273
- const float m1 = GGML_FP16_TO_FP32(x[2*i + 1].m);
3274
-
3275
- int sxy_0 = 0;
3276
- int sxy_1 = 0;
3277
-
3278
- for (int j = 0; j < QK8_1/4; j++) {
3279
- const uint8_t v0 = x0[j];
3280
- const uint8_t v1 = x1[j];
3281
-
3282
- const int x0_0 = v0 & 0x0F;
3283
- const int x1_0 = v0 >> 4;
3284
-
3285
- const int x0_1 = v1 & 0x0F;
3286
- const int x1_1 = v1 >> 4;
3287
-
3288
- const int y0_0 = y0[2*j + 0];
3289
- const int y1_0 = y0[2*j + 1];
3290
-
3291
- const int y0_1 = y0[2*(j + QK8_1/4) + 0];
3292
- const int y1_1 = y0[2*(j + QK8_1/4) + 1];
3293
-
3294
- sxy_0 += x0_0*y0_0 + x1_0*y1_0;
3295
- sxy_1 += x0_1*y0_1 + x1_1*y1_1;
3296
- }
3297
-
3298
- sumf += (d0*sxy_0 + d1*sxy_1)*y[i].d + m0*y[i].s0 + m1*y[i].s1;
3299
- }
3300
- *s = sumf;
3301
- #endif
3302
- }
3303
-
3304
3310
  static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
3305
3311
  const int nb = n / QK8_0;
3306
3312
 
@@ -3373,6 +3379,72 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
3373
3379
  }
3374
3380
 
3375
3381
  *s = vaddvq_f32(sumv);
3382
+ #elif defined(__wasm_simd128__)
3383
+ v128_t sumv = wasm_f32x4_splat(0.0f);
3384
+
3385
+ uint64_t tmp[4];
3386
+
3387
+ for (int i = 0; i < nb; ++i) {
3388
+ const block_q5_0 * restrict x0 = &x[i];
3389
+ const block_q8_0 * restrict y0 = &y[i];
3390
+
3391
+ const v128_t m4b = wasm_i8x16_splat(0x0F);
3392
+ const v128_t s16b = wasm_i8x16_splat(0x10);
3393
+
3394
+ // extract the 5th bit
3395
+ uint32_t qh;
3396
+ memcpy(&qh, x0->qh, sizeof(qh));
3397
+
3398
+ tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
3399
+ tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
3400
+ tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
3401
+ tmp[3] = table_b2b_u[(qh >> 24) ];
3402
+
3403
+ const v128_t qhl = wasm_v128_load(tmp + 0);
3404
+ const v128_t qhh = wasm_v128_load(tmp + 2);
3405
+
3406
+ const v128_t v0 = wasm_v128_load(x0->qs);
3407
+
3408
+ // 4-bit -> 8-bit
3409
+ const v128_t v0l = wasm_v128_and (v0, m4b);
3410
+ const v128_t v0h = wasm_u8x16_shr(v0, 4);
3411
+
3412
+ // interleave
3413
+ const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
3414
+ const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
3415
+
3416
+ // add high bit and sub 16
3417
+ const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b);
3418
+ const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b);
3419
+
3420
+ // load y
3421
+ const v128_t v1l = wasm_v128_load(y0->qs);
3422
+ const v128_t v1h = wasm_v128_load(y0->qs + 16);
3423
+
3424
+ // int8x16 -> int16x8
3425
+ const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
3426
+ const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
3427
+ const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
3428
+ const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
3429
+
3430
+ const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
3431
+ const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
3432
+ const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
3433
+ const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
3434
+
3435
+ const float x0d = GGML_FP16_TO_FP32(x0->d);
3436
+
3437
+ // dot product
3438
+ sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
3439
+ wasm_i32x4_add(
3440
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
3441
+ wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
3442
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
3443
+ wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
3444
+ }
3445
+
3446
+ *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
3447
+ wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
3376
3448
  #elif defined(__AVX2__)
3377
3449
  // Initialize accumulator with zeros
3378
3450
  __m256 acc = _mm256_setzero_ps();
@@ -3413,8 +3485,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
3413
3485
  for (int j = 0; j < QK8_0/2; j++) {
3414
3486
  const uint8_t v0 = x0[j];
3415
3487
 
3416
- const int x0_0h = ((qh & (1 << (2*j + 0))) >> (2*j + 0)) << 4;
3417
- const int x1_0h = ((qh & (1 << (2*j + 1))) >> (2*j + 1)) << 4;
3488
+ const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
3489
+ const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
3418
3490
 
3419
3491
  const int x0_0 = ((v0 & 0x0F) | x0_0h) - 16;
3420
3492
  const int x1_0 = ((v0 >> 4) | x1_0h) - 16;
@@ -3504,6 +3576,77 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3504
3576
  }
3505
3577
 
3506
3578
  *s = vaddvq_f32(sumv) + summs;
3579
+ #elif defined(__wasm_simd128__)
3580
+ v128_t sumv = wasm_f32x4_splat(0.0f);
3581
+
3582
+ float summs = 0.0f;
3583
+
3584
+ uint64_t tmp[4];
3585
+
3586
+ for (int i = 0; i < nb; ++i) {
3587
+ const block_q5_1 * restrict x0 = &x[i];
3588
+ const block_q8_1 * restrict y0 = &y[i];
3589
+
3590
+ summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
3591
+
3592
+ const v128_t m4b = wasm_i8x16_splat(0x0F);
3593
+
3594
+ // extract the 5th bit
3595
+ uint32_t qh;
3596
+ memcpy(&qh, x0->qh, sizeof(qh));
3597
+
3598
+ tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
3599
+ tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
3600
+ tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
3601
+ tmp[3] = table_b2b_u[(qh >> 24) ];
3602
+
3603
+ const v128_t qhl = wasm_v128_load(tmp + 0);
3604
+ const v128_t qhh = wasm_v128_load(tmp + 2);
3605
+
3606
+ const v128_t v0 = wasm_v128_load(x0->qs);
3607
+
3608
+ // 4-bit -> 8-bit
3609
+ const v128_t v0l = wasm_v128_and (v0, m4b);
3610
+ const v128_t v0h = wasm_u8x16_shr(v0, 4);
3611
+
3612
+ static bool x = true;
3613
+
3614
+ // interleave
3615
+ const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
3616
+ const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
3617
+
3618
+ // add high bit
3619
+ const v128_t v0lf = wasm_v128_or(v0lz, qhl);
3620
+ const v128_t v0hf = wasm_v128_or(v0hz, qhh);
3621
+
3622
+ // load y
3623
+ const v128_t v1l = wasm_v128_load(y0->qs);
3624
+ const v128_t v1h = wasm_v128_load(y0->qs + 16);
3625
+
3626
+ // int8x16 -> int16x8
3627
+ const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
3628
+ const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
3629
+ const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
3630
+ const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
3631
+
3632
+ const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
3633
+ const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
3634
+ const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
3635
+ const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
3636
+
3637
+ const float x0d = GGML_FP16_TO_FP32(x0->d);
3638
+
3639
+ // dot product
3640
+ sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
3641
+ wasm_i32x4_add(
3642
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
3643
+ wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
3644
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
3645
+ wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
3646
+ }
3647
+
3648
+ *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
3649
+ wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
3507
3650
  #elif defined(__AVX2__)
3508
3651
  // Initialize accumulator with zeros
3509
3652
  __m256 acc = _mm256_setzero_ps();
@@ -3547,8 +3690,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3547
3690
  for (int j = 0; j < QK8_1/2; j++) {
3548
3691
  const uint8_t v0 = x0[j];
3549
3692
 
3550
- const int x0_0h = ((qh & (1 << (2*j + 0))) >> (2*j + 0)) << 4;
3551
- const int x1_0h = ((qh & (1 << (2*j + 1))) >> (2*j + 1)) << 4;
3693
+ const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
3694
+ const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
3552
3695
 
3553
3696
  const int x0_0 = (v0 & 0x0F) | x0_0h;
3554
3697
  const int x1_0 = (v0 >> 4) | x1_0h;
@@ -3925,7 +4068,6 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
3925
4068
  [GGML_TYPE_Q4_0] = QK4_0,
3926
4069
  [GGML_TYPE_Q4_1] = QK4_1,
3927
4070
  [GGML_TYPE_Q4_2] = QK4_2,
3928
- [GGML_TYPE_Q4_3] = QK4_3,
3929
4071
  [GGML_TYPE_Q5_0] = QK5_0,
3930
4072
  [GGML_TYPE_Q5_1] = QK5_1,
3931
4073
  [GGML_TYPE_Q8_0] = QK8_0,
@@ -3942,7 +4084,6 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
3942
4084
  [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
3943
4085
  [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
3944
4086
  [GGML_TYPE_Q4_2] = sizeof(block_q4_2),
3945
- [GGML_TYPE_Q4_3] = sizeof(block_q4_3),
3946
4087
  [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
3947
4088
  [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
3948
4089
  [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
@@ -3960,7 +4101,6 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
3960
4101
  [GGML_TYPE_Q4_0] = "q4_0",
3961
4102
  [GGML_TYPE_Q4_1] = "q4_1",
3962
4103
  [GGML_TYPE_Q4_2] = "q4_2",
3963
- [GGML_TYPE_Q4_3] = "q4_3",
3964
4104
  [GGML_TYPE_Q5_0] = "q5_0",
3965
4105
  [GGML_TYPE_Q5_1] = "q5_1",
3966
4106
  [GGML_TYPE_Q8_0] = "q8_0",
@@ -3977,7 +4117,6 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
3977
4117
  [GGML_TYPE_Q4_0] = true,
3978
4118
  [GGML_TYPE_Q4_1] = true,
3979
4119
  [GGML_TYPE_Q4_2] = true,
3980
- [GGML_TYPE_Q4_3] = true,
3981
4120
  [GGML_TYPE_Q5_0] = true,
3982
4121
  [GGML_TYPE_Q5_1] = true,
3983
4122
  [GGML_TYPE_Q8_0] = true,
@@ -4024,6 +4163,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
4024
4163
  "DIAG_MASK_INF",
4025
4164
  "SOFT_MAX",
4026
4165
  "ROPE",
4166
+ "ALIBI",
4027
4167
  "CONV_1D_1S",
4028
4168
  "CONV_1D_2S",
4029
4169
 
@@ -4034,7 +4174,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
4034
4174
  "MAP_BINARY",
4035
4175
  };
4036
4176
 
4037
- static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
4177
+ static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39");
4038
4178
 
4039
4179
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
4040
4180
  "none",
@@ -4072,6 +4212,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
4072
4212
  "diag_mask_inf(x)",
4073
4213
  "soft_max(x)",
4074
4214
  "rope(x)",
4215
+ "alibi(x)",
4075
4216
  "conv_1d_1s(x)",
4076
4217
  "conv_1d_2s(x)",
4077
4218
 
@@ -4082,7 +4223,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
4082
4223
  "f(x,y)",
4083
4224
  };
4084
4225
 
4085
- static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
4226
+ static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39");
4086
4227
 
4087
4228
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
4088
4229
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -4252,6 +4393,27 @@ bool ggml_is_quantized(enum ggml_type type) {
4252
4393
  return GGML_IS_QUANTIZED[type];
4253
4394
  }
4254
4395
 
4396
+ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
4397
+ enum ggml_type wtype = GGML_TYPE_COUNT;
4398
+
4399
+ switch (ftype) {
4400
+ case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
4401
+ case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
4402
+ case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
4403
+ case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
4404
+ case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break;
4405
+ case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
4406
+ case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
4407
+ case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
4408
+ case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
4409
+ case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
4410
+ }
4411
+
4412
+ GGML_ASSERT(wtype != GGML_TYPE_COUNT);
4413
+
4414
+ return wtype;
4415
+ }
4416
+
4255
4417
  static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
4256
4418
  return tensor->nb[0] > tensor->nb[1];
4257
4419
  }
@@ -4362,12 +4524,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4362
4524
  GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
4363
4525
  }
4364
4526
 
4365
- // initialize cuBLAS
4366
- #if defined(GGML_USE_CUBLAS)
4527
+ #if defined(GGML_USE_CUBLAS)
4367
4528
  ggml_init_cublas();
4368
- #elif defined(GGML_USE_CLBLAST)
4529
+ #elif defined(GGML_USE_CLBLAST)
4369
4530
  ggml_cl_init();
4370
- #endif
4531
+ #endif
4371
4532
 
4372
4533
  is_first_call = false;
4373
4534
  }
@@ -4448,7 +4609,7 @@ void ggml_free(struct ggml_context * ctx) {
4448
4609
  }
4449
4610
 
4450
4611
  size_t ggml_used_mem(const struct ggml_context * ctx) {
4451
- return ctx->objects_end->offs + ctx->objects_end->size;
4612
+ return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
4452
4613
  }
4453
4614
 
4454
4615
  size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
@@ -4561,6 +4722,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
4561
4722
  /*.perf_cycles =*/ 0,
4562
4723
  /*.perf_time_us =*/ 0,
4563
4724
  /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
4725
+ /*.name =*/ { 0 },
4564
4726
  /*.pad =*/ { 0 },
4565
4727
  };
4566
4728
 
@@ -4915,6 +5077,15 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
4915
5077
  return (float *)(tensor->data);
4916
5078
  }
4917
5079
 
5080
+ const char * ggml_get_name(const struct ggml_tensor * tensor) {
5081
+ return tensor->name;
5082
+ }
5083
+
5084
+ void ggml_set_name(struct ggml_tensor * tensor, const char * name) {
5085
+ strncpy(tensor->name, name, sizeof(tensor->name));
5086
+ tensor->name[sizeof(tensor->name) - 1] = '\0';
5087
+ }
5088
+
4918
5089
  struct ggml_tensor * ggml_view_tensor(
4919
5090
  struct ggml_context * ctx,
4920
5091
  const struct ggml_tensor * src) {
@@ -6014,6 +6185,7 @@ struct ggml_tensor * ggml_diag_mask_inf(
6014
6185
  //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6015
6186
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6016
6187
  struct ggml_tensor * b = ggml_new_i32(ctx, n_past);
6188
+ ggml_set_name(b, "n_past");
6017
6189
 
6018
6190
  result->op = GGML_OP_DIAG_MASK_INF;
6019
6191
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6071,6 +6243,7 @@ struct ggml_tensor * ggml_rope(
6071
6243
  ((int32_t *) b->data)[0] = n_past;
6072
6244
  ((int32_t *) b->data)[1] = n_dims;
6073
6245
  ((int32_t *) b->data)[2] = mode;
6246
+ ggml_set_name(b, "n_past, n_dims, mode");
6074
6247
 
6075
6248
  result->op = GGML_OP_ROPE;
6076
6249
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6080,6 +6253,37 @@ struct ggml_tensor * ggml_rope(
6080
6253
  return result;
6081
6254
  }
6082
6255
 
6256
+ // ggml_alibi
6257
+
6258
+ struct ggml_tensor * ggml_alibi(
6259
+ struct ggml_context * ctx,
6260
+ struct ggml_tensor * a,
6261
+ int n_past,
6262
+ int n_head) {
6263
+ GGML_ASSERT(n_past >= 0);
6264
+ bool is_node = false;
6265
+
6266
+ if (a->grad) {
6267
+ GGML_ASSERT(false); // TODO: implement backward
6268
+ is_node = true;
6269
+ }
6270
+
6271
+ // TODO: when implement backward, fix this:
6272
+ //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6273
+ struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6274
+
6275
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6276
+ ((int32_t *) b->data)[0] = n_past;
6277
+ ((int32_t *) b->data)[1] = n_head;
6278
+
6279
+ result->op = GGML_OP_ALIBI;
6280
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6281
+ result->src0 = a;
6282
+ result->src1 = b;
6283
+
6284
+ return result;
6285
+ }
6286
+
6083
6287
  // ggml_conv_1d_1s
6084
6288
 
6085
6289
  struct ggml_tensor * ggml_conv_1d_1s(
@@ -7199,7 +7403,6 @@ static void ggml_compute_forward_add(
7199
7403
  case GGML_TYPE_Q4_0:
7200
7404
  case GGML_TYPE_Q4_1:
7201
7405
  case GGML_TYPE_Q4_2:
7202
- case GGML_TYPE_Q4_3:
7203
7406
  case GGML_TYPE_Q5_0:
7204
7407
  case GGML_TYPE_Q5_1:
7205
7408
  case GGML_TYPE_Q8_0:
@@ -8108,7 +8311,7 @@ static void ggml_compute_forward_rms_norm(
8108
8311
 
8109
8312
  // ggml_compute_forward_mul_mat
8110
8313
 
8111
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
8314
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8112
8315
  // helper function to determine if it is better to use BLAS or not
8113
8316
  // for large matrices, BLAS is faster
8114
8317
  static bool ggml_compute_forward_mul_mat_use_blas(
@@ -8125,7 +8328,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
8125
8328
 
8126
8329
  // TODO: find the optimal values for these
8127
8330
  if (ggml_is_contiguous(src0) &&
8128
- ggml_is_contiguous(src1) && ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
8331
+ ggml_is_contiguous(src1) &&
8332
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
8129
8333
 
8130
8334
  /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
8131
8335
  return true;
@@ -8133,7 +8337,6 @@ static bool ggml_compute_forward_mul_mat_use_blas(
8133
8337
 
8134
8338
  return false;
8135
8339
  }
8136
-
8137
8340
  #endif
8138
8341
 
8139
8342
  static void ggml_compute_forward_mul_mat_f32(
@@ -8149,7 +8352,7 @@ static void ggml_compute_forward_mul_mat_f32(
8149
8352
  const int64_t ne02 = src0->ne[2];
8150
8353
  const int64_t ne03 = src0->ne[3];
8151
8354
 
8152
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
8355
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8153
8356
  const int64_t ne10 = src1->ne[0];
8154
8357
  #endif
8155
8358
  const int64_t ne11 = src1->ne[1];
@@ -8206,7 +8409,16 @@ static void ggml_compute_forward_mul_mat_f32(
8206
8409
  // nb01 >= nb00 - src0 is not transposed
8207
8410
  // compute by src0 rows
8208
8411
 
8209
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
8412
+ #if defined(GGML_USE_CUBLAS)
8413
+ if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
8414
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
8415
+ ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
8416
+ }
8417
+ return;
8418
+ }
8419
+ #endif
8420
+
8421
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8210
8422
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
8211
8423
  if (params->ith != 0) {
8212
8424
  return;
@@ -8220,42 +8432,13 @@ static void ggml_compute_forward_mul_mat_f32(
8220
8432
  return;
8221
8433
  }
8222
8434
 
8223
- #if defined(GGML_USE_CUBLAS)
8224
- const float alpha = 1.0f;
8225
- const float beta = 0.0f;
8226
- const int x_ne = ne01 * ne10;
8227
- const int y_ne = ne11 * ne10;
8228
- const int d_ne = ne11 * ne01;
8229
-
8230
- size_t x_size, y_size, d_size;
8231
- float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
8232
- float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
8233
- float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
8234
- #endif
8235
-
8236
8435
  for (int64_t i03 = 0; i03 < ne03; i03++) {
8237
8436
  for (int64_t i02 = 0; i02 < ne02; i02++) {
8238
8437
  const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
8239
8438
  const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
8240
-
8241
8439
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
8242
8440
 
8243
- #if defined(GGML_USE_CUBLAS)
8244
- // copy data to device
8245
- CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, g_cudaStream));
8246
- CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
8247
-
8248
- // compute
8249
- CUBLAS_CHECK(
8250
- cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
8251
- ne01, ne11, ne10,
8252
- &alpha, d_X, ne00,
8253
- d_Y, ne10,
8254
- &beta, d_D, ne01));
8255
-
8256
- // copy data to host
8257
- CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
8258
- #elif defined(GGML_USE_CLBLAST)
8441
+ #if defined(GGML_USE_CLBLAST)
8259
8442
  // zT = y * xT
8260
8443
  ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
8261
8444
  ne11, ne01, ne10,
@@ -8272,12 +8455,6 @@ static void ggml_compute_forward_mul_mat_f32(
8272
8455
  #endif
8273
8456
  }
8274
8457
  }
8275
- #if defined(GGML_USE_CUBLAS)
8276
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
8277
- ggml_cuda_pool_free(d_X, x_size);
8278
- ggml_cuda_pool_free(d_Y, y_size);
8279
- ggml_cuda_pool_free(d_D, d_size);
8280
- #endif
8281
8458
  //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
8282
8459
 
8283
8460
  return;
@@ -8407,7 +8584,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
8407
8584
  // nb01 >= nb00 - src0 is not transposed
8408
8585
  // compute by src0 rows
8409
8586
 
8410
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
8587
+ #if defined(GGML_USE_CUBLAS)
8588
+ if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
8589
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
8590
+ ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
8591
+ }
8592
+ return;
8593
+ }
8594
+ #endif
8595
+
8596
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8411
8597
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
8412
8598
  GGML_ASSERT(nb10 == sizeof(float));
8413
8599
 
@@ -8423,35 +8609,9 @@ static void ggml_compute_forward_mul_mat_f16_f32(
8423
8609
  return;
8424
8610
  }
8425
8611
 
8426
- #if defined(GGML_USE_CUBLAS)
8427
- ggml_fp16_t * const wdata = params->wdata;
8428
-
8429
- const float alpha = 1.0f;
8430
- const float beta = 0.0f;
8431
- const int x_ne = ne01 * ne10;
8432
- const int y_ne = ne11 * ne10;
8433
- const int d_ne = ne11 * ne01;
8434
-
8435
- size_t x_size, y_size, d_size;
8436
- float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
8437
- float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
8438
- float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
8439
- #else
8440
- float * const wdata = params->wdata;
8441
- #endif
8442
8612
  for (int64_t i03 = 0; i03 < ne03; i03++) {
8443
8613
  for (int64_t i02 = 0; i02 < ne02; i02++) {
8444
- #if defined(GGML_USE_CUBLAS)
8445
- // with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16
8446
- {
8447
- size_t id = 0;
8448
- for (int64_t i01 = 0; i01 < ne11; ++i01) {
8449
- for (int64_t i00 = 0; i00 < ne10; ++i00) {
8450
- wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10));
8451
- }
8452
- }
8453
- }
8454
- #else
8614
+ float * const wdata = params->wdata;
8455
8615
  {
8456
8616
  size_t id = 0;
8457
8617
  for (int64_t i01 = 0; i01 < ne01; ++i01) {
@@ -8459,32 +8619,11 @@ static void ggml_compute_forward_mul_mat_f16_f32(
8459
8619
  wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
8460
8620
  }
8461
8621
  }
8462
- }
8463
- #endif
8464
-
8465
- #if defined(GGML_USE_CUBLAS)
8466
- const ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + i02*nb02 + i03*nb03);
8467
- const ggml_fp16_t * y = (ggml_fp16_t *) wdata;
8468
8622
 
8469
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
8623
+ assert(id*sizeof(float) <= params->wsize);
8624
+ }
8470
8625
 
8471
- // copy data to device
8472
- CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(ggml_fp16_t) * x_ne, cudaMemcpyHostToDevice, g_cudaStream));
8473
- CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
8474
-
8475
- // compute
8476
- CUBLAS_CHECK(
8477
- cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
8478
- ne01, ne11, ne10,
8479
- &alpha, d_X, CUDA_R_16F, ne00,
8480
- d_Y, CUDA_R_16F, ne10,
8481
- &beta, d_D, CUDA_R_32F, ne01,
8482
- CUBLAS_COMPUTE_32F,
8483
- CUBLAS_GEMM_DEFAULT));
8484
-
8485
- // copy data to host
8486
- CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
8487
- #elif defined(GGML_USE_CLBLAST)
8626
+ #if defined(GGML_USE_CLBLAST)
8488
8627
  const float * x = wdata;
8489
8628
  const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
8490
8629
 
@@ -8513,12 +8652,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
8513
8652
  }
8514
8653
  }
8515
8654
 
8516
- #if defined(GGML_USE_CUBLAS)
8517
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
8518
- ggml_cuda_pool_free(d_X, x_size);
8519
- ggml_cuda_pool_free(d_Y, y_size);
8520
- ggml_cuda_pool_free(d_D, d_size);
8521
- #endif
8522
8655
  /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/
8523
8656
 
8524
8657
  return;
@@ -8671,7 +8804,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
8671
8804
  // nb01 >= nb00 - src0 is not transposed
8672
8805
  // compute by src0 rows
8673
8806
 
8674
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
8807
+ #if defined(GGML_USE_CUBLAS)
8808
+ if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
8809
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
8810
+ ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
8811
+ }
8812
+ return;
8813
+ }
8814
+ #endif
8815
+
8816
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8675
8817
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
8676
8818
  if (params->ith != 0) {
8677
8819
  return;
@@ -8685,48 +8827,8 @@ static void ggml_compute_forward_mul_mat_q_f32(
8685
8827
  return;
8686
8828
  }
8687
8829
 
8688
- #if defined(GGML_USE_CUBLAS)
8689
- const float alpha = 1.0f;
8690
- const float beta = 0.0f;
8691
- const int x_ne = ne01 * ne10;
8692
- const int y_ne = ne11 * ne10;
8693
- const int d_ne = ne11 * ne01;
8694
-
8695
- size_t x_size, y_size, d_size, q_size;
8696
- float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
8697
- float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
8698
- float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
8699
- float *d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
8700
-
8701
- void (*dequantize_row_q_cuda)(const void * x, float * y, int k, cudaStream_t stream) = NULL;
8702
- if (type == GGML_TYPE_Q4_0) {
8703
- dequantize_row_q_cuda = dequantize_row_q4_0_cuda;
8704
- }
8705
- else if (type == GGML_TYPE_Q4_1) {
8706
- dequantize_row_q_cuda = dequantize_row_q4_1_cuda;
8707
- }
8708
- else if (type == GGML_TYPE_Q4_2) {
8709
- dequantize_row_q_cuda = dequantize_row_q4_2_cuda;
8710
- }
8711
- else if (type == GGML_TYPE_Q4_3) {
8712
- dequantize_row_q_cuda = dequantize_row_q4_3_cuda;
8713
- }
8714
- else if (type == GGML_TYPE_Q5_0) {
8715
- dequantize_row_q_cuda = dequantize_row_q5_0_cuda;
8716
- }
8717
- else if (type == GGML_TYPE_Q5_1) {
8718
- dequantize_row_q_cuda = dequantize_row_q5_1_cuda;
8719
- }
8720
- else if (type == GGML_TYPE_Q8_0) {
8721
- dequantize_row_q_cuda = dequantize_row_q8_0_cuda;
8722
- }
8723
- else {
8724
- GGML_ASSERT(false);
8725
- }
8726
- #elif !defined(GGML_USE_CLBLAST)
8727
8830
  float * const wdata = params->wdata;
8728
8831
  dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
8729
- #endif
8730
8832
 
8731
8833
  for (int64_t i03 = 0; i03 < ne03; i03++) {
8732
8834
  for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -8734,15 +8836,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
8734
8836
 
8735
8837
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
8736
8838
 
8737
- #if defined(GGML_USE_CUBLAS)
8738
- // copy and dequantize on device
8739
- CUDA_CHECK(
8740
- cudaMemcpyAsync(d_Q, (char *) src0->data + i03*nb03 + i02*nb02,
8741
- GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], cudaMemcpyHostToDevice, g_cudaStream));
8742
-
8743
- dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream);
8744
- CUDA_CHECK(cudaGetLastError());
8745
- #elif defined(GGML_USE_CLBLAST)
8839
+ #if defined(GGML_USE_CLBLAST)
8746
8840
  const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
8747
8841
  #else
8748
8842
  {
@@ -8751,26 +8845,14 @@ static void ggml_compute_forward_mul_mat_q_f32(
8751
8845
  dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
8752
8846
  id += ne00;
8753
8847
  }
8848
+
8849
+ assert(id*sizeof(float) <= params->wsize);
8754
8850
  }
8851
+
8755
8852
  const float * x = wdata;
8756
8853
  #endif
8757
8854
 
8758
-
8759
- #if defined(GGML_USE_CUBLAS)
8760
- // copy data to device
8761
- CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
8762
-
8763
- // compute
8764
- CUBLAS_CHECK(
8765
- cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
8766
- ne01, ne11, ne10,
8767
- &alpha, d_X, ne00,
8768
- d_Y, ne10,
8769
- &beta, d_D, ne01));
8770
-
8771
- // copy data to host
8772
- CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
8773
- #elif defined(GGML_USE_CLBLAST)
8855
+ #if defined(GGML_USE_CLBLAST)
8774
8856
  // zT = y * xT
8775
8857
  ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
8776
8858
  ne11, ne01, ne10,
@@ -8788,13 +8870,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
8788
8870
  }
8789
8871
  }
8790
8872
 
8791
- #if defined(GGML_USE_CUBLAS)
8792
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
8793
- ggml_cuda_pool_free(d_X, x_size);
8794
- ggml_cuda_pool_free(d_Y, y_size);
8795
- ggml_cuda_pool_free(d_D, d_size);
8796
- ggml_cuda_pool_free(d_Q, q_size);
8797
- #endif
8798
8873
  //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
8799
8874
 
8800
8875
  return;
@@ -8883,7 +8958,6 @@ static void ggml_compute_forward_mul_mat(
8883
8958
  case GGML_TYPE_Q4_0:
8884
8959
  case GGML_TYPE_Q4_1:
8885
8960
  case GGML_TYPE_Q4_2:
8886
- case GGML_TYPE_Q4_3:
8887
8961
  case GGML_TYPE_Q5_0:
8888
8962
  case GGML_TYPE_Q5_1:
8889
8963
  case GGML_TYPE_Q8_0:
@@ -9115,7 +9189,6 @@ static void ggml_compute_forward_get_rows(
9115
9189
  case GGML_TYPE_Q4_0:
9116
9190
  case GGML_TYPE_Q4_1:
9117
9191
  case GGML_TYPE_Q4_2:
9118
- case GGML_TYPE_Q4_3:
9119
9192
  case GGML_TYPE_Q5_0:
9120
9193
  case GGML_TYPE_Q5_1:
9121
9194
  case GGML_TYPE_Q8_0:
@@ -9300,6 +9373,161 @@ static void ggml_compute_forward_soft_max(
9300
9373
  }
9301
9374
  }
9302
9375
 
9376
+ // ggml_compute_forward_alibi
9377
+
9378
+ static void ggml_compute_forward_alibi_f32(
9379
+ const struct ggml_compute_params * params,
9380
+ const struct ggml_tensor * src0,
9381
+ const struct ggml_tensor * src1,
9382
+ struct ggml_tensor * dst) {
9383
+ assert(params->ith == 0);
9384
+ assert(src1->type == GGML_TYPE_I32);
9385
+ assert(ggml_nelements(src1) == 2);
9386
+
9387
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9388
+ return;
9389
+ }
9390
+
9391
+ const int n_past = ((int32_t *) src1->data)[0];
9392
+ const int n_head = ((int32_t *) src1->data)[1];
9393
+
9394
+ const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
9395
+ const int ne1 = src0->ne[1]; // seq_len_without_past
9396
+ //const int ne2 = src0->ne[2]; // n_head -> this is k
9397
+ //const int ne3 = src0->ne[3]; // 1 -> bsz
9398
+
9399
+ const int n = ggml_nrows(src0);
9400
+ const int ne2_ne3 = n/ne1; // ne2*ne3
9401
+
9402
+ const int nb0 = src0->nb[0];
9403
+ const int nb1 = src0->nb[1];
9404
+ const int nb2 = src0->nb[2];
9405
+ //const int nb3 = src0->nb[3];
9406
+
9407
+ assert(nb0 == sizeof(float));
9408
+ assert(ne1 + n_past == ne0); (void) n_past;
9409
+
9410
+ // add alibi to src0 (KQ_scaled)
9411
+ const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
9412
+
9413
+ const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
9414
+ const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
9415
+
9416
+ for (int i = 0; i < ne0; i++) {
9417
+ for (int j = 0; j < ne1; j++) {
9418
+ for (int k = 0; k < ne2_ne3; k++) {
9419
+ float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
9420
+ float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
9421
+
9422
+ // TODO: k*nb2 or k*nb3
9423
+
9424
+ float m_k;
9425
+
9426
+ if (k < n_heads_log2_floor) {
9427
+ m_k = powf(m0, k + 1);
9428
+ } else {
9429
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
9430
+ }
9431
+
9432
+ pdst[0] = (j+1) * m_k + src[0];
9433
+ }
9434
+ }
9435
+ }
9436
+ }
9437
+
9438
+
9439
+ static void ggml_compute_forward_alibi_f16(
9440
+ const struct ggml_compute_params * params,
9441
+ const struct ggml_tensor * src0,
9442
+ const struct ggml_tensor * src1,
9443
+ struct ggml_tensor * dst) {
9444
+ assert(params->ith == 0);
9445
+ assert(src1->type == GGML_TYPE_I32);
9446
+ assert(ggml_nelements(src1) == 2);
9447
+
9448
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9449
+ return;
9450
+ }
9451
+
9452
+ const int n_past = ((int32_t *) src1->data)[0];
9453
+ const int n_head = ((int32_t *) src1->data)[1];
9454
+
9455
+ const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
9456
+ const int ne1 = src0->ne[1]; // seq_len_without_past
9457
+ //const int ne2 = src0->ne[2]; // n_head -> this is k
9458
+ //const int ne3 = src0->ne[3]; // 1 -> bsz
9459
+
9460
+ const int n = ggml_nrows(src0);
9461
+ const int ne2_ne3 = n/ne1; // ne2*ne3
9462
+
9463
+ const int nb0 = src0->nb[0];
9464
+ const int nb1 = src0->nb[1];
9465
+ const int nb2 = src0->nb[2];
9466
+ //const int nb3 = src0->nb[3];
9467
+
9468
+ assert(nb0 == sizeof(ggml_fp16_t));
9469
+ assert(ne1 + n_past == ne0); (void) n_past;
9470
+
9471
+ // add alibi to src0 (KQ_scaled)
9472
+ const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
9473
+
9474
+ const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
9475
+ const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
9476
+
9477
+ for (int i = 0; i < ne0; i++) {
9478
+ for (int j = 0; j < ne1; j++) {
9479
+ for (int k = 0; k < ne2_ne3; k++) {
9480
+ ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
9481
+ float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
9482
+
9483
+ // TODO: k*nb2 or k*nb3
9484
+
9485
+ float m_k;
9486
+
9487
+ if (k < n_heads_log2_floor) {
9488
+ m_k = powf(m0, k + 1);
9489
+ } else {
9490
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
9491
+ }
9492
+
9493
+ // we return F32
9494
+ pdst[0] = (j+1) * m_k + GGML_FP16_TO_FP32(src[0]);
9495
+ }
9496
+ }
9497
+ }
9498
+ }
9499
+
9500
+ static void ggml_compute_forward_alibi(
9501
+ const struct ggml_compute_params * params,
9502
+ const struct ggml_tensor * src0,
9503
+ const struct ggml_tensor * src1,
9504
+ struct ggml_tensor * dst) {
9505
+ switch (src0->type) {
9506
+ case GGML_TYPE_F16:
9507
+ {
9508
+ ggml_compute_forward_alibi_f16(params, src0, src1, dst);
9509
+ } break;
9510
+ case GGML_TYPE_F32:
9511
+ {
9512
+ ggml_compute_forward_alibi_f32(params, src0, src1, dst);
9513
+ } break;
9514
+ case GGML_TYPE_Q4_0:
9515
+ case GGML_TYPE_Q4_1:
9516
+ case GGML_TYPE_Q4_2:
9517
+ case GGML_TYPE_Q5_0:
9518
+ case GGML_TYPE_Q5_1:
9519
+ case GGML_TYPE_Q8_0:
9520
+ case GGML_TYPE_Q8_1:
9521
+ case GGML_TYPE_I8:
9522
+ case GGML_TYPE_I16:
9523
+ case GGML_TYPE_I32:
9524
+ case GGML_TYPE_COUNT:
9525
+ {
9526
+ GGML_ASSERT(false);
9527
+ } break;
9528
+ }
9529
+ }
9530
+
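The new ALiBi kernels above bias the scaled KQ matrix per attention head: head k adds (j+1)*m_k to column j, where the slopes m_k form a geometric sequence anchored at the largest power of two not exceeding n_head. A small self-contained sketch of that slope schedule (alibi_slope and the main driver are illustrative, not ggml API):

#include <math.h>
#include <stdio.h>

// Slope for head k (0-based) out of n_head heads, mirroring the m0/m1
// schedule in ggml_compute_forward_alibi_f32/f16 above.
static float alibi_slope(int k, int n_head) {
    const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
    const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
    const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
    if (k < n_heads_log2_floor) {
        return powf(m0, k + 1);
    }
    // interleaved slopes for the extra heads when n_head is not a power of two
    return powf(m1, 2*(k - n_heads_log2_floor) + 1);
}

int main(void) {
    const int n_head = 12; // example head count
    for (int k = 0; k < n_head; k++) {
        printf("head %2d: slope %.6f\n", k, alibi_slope(k, n_head));
    }
    return 0;
}

For a power-of-two head count such as n_head = 8 this yields 1/2, 1/4, ..., 1/256, the schedule from the original ALiBi paper.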
9303
9531
  // ggml_compute_forward_rope
9304
9532
 
9305
9533
  static void ggml_compute_forward_rope_f32(
@@ -10938,6 +11166,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
10938
11166
  {
10939
11167
  ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor);
10940
11168
  } break;
11169
+ case GGML_OP_ALIBI:
11170
+ {
11171
+ ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
11172
+ } break;
10941
11173
  case GGML_OP_CONV_1D_1S:
10942
11174
  {
10943
11175
  ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
@@ -11140,6 +11372,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
11140
11372
  {
11141
11373
  GGML_ASSERT(false); // TODO: not implemented
11142
11374
  } break;
11375
+ case GGML_OP_ALIBI:
11376
+ {
11377
+ GGML_ASSERT(false); // TODO: not implemented
11378
+ } break;
11143
11379
  case GGML_OP_SILU:
11144
11380
  {
11145
11381
  GGML_ASSERT(false); // TODO: not implemented
@@ -11617,15 +11853,21 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
11617
11853
 
11618
11854
  size_t cur = 0;
11619
11855
 
11856
+ #if defined(GGML_USE_CUBLAS)
11857
+ if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
11858
+ node->n_tasks = 1; // TODO: this actually is doing nothing
11859
+ // the threads are still spinning
11860
+ cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
11861
+ }
11862
+ else
11863
+ #endif
11620
11864
  if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
11621
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
11865
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
11622
11866
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
11623
11867
  node->n_tasks = 1; // TODO: this actually is doing nothing
11624
11868
  // the threads are still spinning
11869
+ // here we need memory just for single 2D matrix from src0
11625
11870
  cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
11626
- //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
11627
- //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
11628
- //printf("cur = %zu\n", cur);
11629
11871
  } else {
11630
11872
  cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
11631
11873
  }
@@ -11634,8 +11876,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
11634
11876
  #endif
11635
11877
  } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
11636
11878
  cur = 0;
11879
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
11880
+ if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
11881
+ node->n_tasks = 1;
11882
+ }
11883
+ #endif
11637
11884
  } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
11638
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
11885
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
11639
11886
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
11640
11887
  node->n_tasks = 1;
11641
11888
  cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
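The work-size planning above now routes eligible mat-muls to cuBLAS up front (ggml_cuda_can_mul_mat / ggml_cuda_mul_mat_get_wsize) and only then falls back to the BLAS/CPU sizing. A condensed sketch of the fallback decisions visible in these hunks; mul_mat_wsize, src_kind and fp16_t are illustrative stand-ins, not ggml types, and src1 is assumed to be F32 as in the code above.

#include <stddef.h>
#include <stdint.h>

typedef uint16_t fp16_t; // stand-in for ggml_fp16_t

enum src_kind { SRC_F16, SRC_F32, SRC_QUANTIZED };

// Illustrative work-buffer sizing for a mul_mat node, covering only the
// branches shown in the hunks above.
static size_t mul_mat_wsize(enum src_kind src0_kind,
                            int64_t ne00, int64_t ne01,  // src0 dims
                            int64_t src1_nelements,
                            int     use_blas) {          // BLAS path usable for this node?
    switch (src0_kind) {
        case SRC_F16:
            // BLAS: room for one F32 copy of a src0 matrix;
            // otherwise: room for src1 converted to F16.
            return use_blas ? sizeof(float)  * (size_t)(ne00*ne01)
                            : sizeof(fp16_t) * (size_t) src1_nelements;
        case SRC_F32:
            return 0; // BLAS only forces n_tasks = 1, no scratch needed
        case SRC_QUANTIZED:
            if (use_blas) {
                return sizeof(float) * (size_t)(ne00*ne01);
            }
            break; // non-BLAS quantized sizing lies outside these hunks
    }
    return 0;
}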
@@ -11673,6 +11920,10 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
11673
11920
  {
11674
11921
  node->n_tasks = n_threads;
11675
11922
  } break;
11923
+ case GGML_OP_ALIBI:
11924
+ {
11925
+ node->n_tasks = 1; //TODO
11926
+ } break;
11676
11927
  case GGML_OP_CONV_1D_1S:
11677
11928
  case GGML_OP_CONV_1D_2S:
11678
11929
  {
@@ -12060,10 +12311,16 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
12060
12311
  snprintf(color, sizeof(color), "white");
12061
12312
  }
12062
12313
 
12063
- fprintf(fp, " \"%p\" [ \
12064
- style = filled; fillcolor = %s; shape = record; \
12065
- label=\"%d [%" PRId64 ", %" PRId64 "] | <x>%s",
12066
- (void *) node, color,
12314
+ fprintf(fp, " \"%p\" [ "
12315
+ "style = filled; fillcolor = %s; shape = record; "
12316
+ "label=\"",
12317
+ (void *) node, color);
12318
+
12319
+ if (strlen(node->name) > 0) {
12320
+ fprintf(fp, "%s |", node->name);
12321
+ }
12322
+
12323
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s",
12067
12324
  i, node->ne[0], node->ne[1],
12068
12325
  GGML_OP_SYMBOL[node->op]);
12069
12326
 
@@ -12079,18 +12336,26 @@ label=\"%d [%" PRId64 ", %" PRId64 "] | <x>%s",
12079
12336
 
12080
12337
  snprintf(color, sizeof(color), "pink");
12081
12338
 
12339
+ fprintf(fp, " \"%p\" [ "
12340
+ "style = filled; fillcolor = %s; shape = record; "
12341
+ "label=\"<x>",
12342
+ (void *) node, color);
12343
+
12344
+ if (strlen(node->name) > 0) {
12345
+ fprintf(fp, "%s | ", node->name);
12346
+ }
12082
12347
  if (ggml_nelements(node) == 1) {
12083
- fprintf(fp, " \"%p\" [ \
12084
- style = filled; fillcolor = %s; shape = record; \
12085
- label=\"<x>%.1e\"; ]\n",
12086
- (void *) node, color, (double)ggml_get_f32_1d(node, 0));
12087
- } else {
12088
- fprintf(fp, " \"%p\" [ \
12089
- style = filled; fillcolor = %s; shape = record; \
12090
- label=\"<x>CONST %d [%" PRId64 ", %" PRId64 "]\"; ]\n",
12091
- (void *) node, color,
12092
- i, node->ne[0], node->ne[1]);
12348
+ if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
12349
+ fprintf(fp, "%d", ggml_get_i32_1d(node, 0));
12350
+ }
12351
+ else {
12352
+ fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0));
12353
+ }
12093
12354
  }
12355
+ else {
12356
+ fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
12357
+ }
12358
+ fprintf(fp, "\"; ]\n");
12094
12359
  }
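ggml_graph_dump_dot now prefixes each record label with the tensor's name (when one is set) and prints one-element integer leaves with %d instead of forcing %.1e. A small sketch of just that leaf-label logic; the leaf struct and print_leaf_label are hypothetical simplifications of the real ggml_tensor fields used above.

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

// Hypothetical, simplified view of a leaf node, for illustration only.
struct leaf {
    char    name[32];
    int     is_integer;   // corresponds to GGML_TYPE_I8/I16/I32 above
    int64_t ne[2];
    int64_t nelements;
    int     ival;
    double  fval;
};

// Build the record label the same way the new dump code does: optional
// "<name> | " prefix, scalar value for 1-element leaves, CONST summary otherwise.
static void print_leaf_label(FILE * fp, const struct leaf * node, int i) {
    fprintf(fp, "label=\"<x>");
    if (strlen(node->name) > 0) {
        fprintf(fp, "%s | ", node->name);
    }
    if (node->nelements == 1) {
        if (node->is_integer) {
            fprintf(fp, "%d", node->ival);
        } else {
            fprintf(fp, "%.1e", node->fval);
        }
    } else {
        fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
    }
    fprintf(fp, "\"; ]\n");
}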
12095
12360
 
12096
12361
  for (int i = 0; i < gb->n_nodes; i++) {
@@ -12889,29 +13154,6 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
12889
13154
  return (n/QK4_2*sizeof(block_q4_2));
12890
13155
  }
12891
13156
 
12892
- size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist) {
12893
- assert(k % QK4_3 == 0);
12894
- const int nb = k / QK4_3;
12895
-
12896
- for (int j = 0; j < n; j += k) {
12897
- block_q4_3 * restrict y = (block_q4_3 *)dst + j/QK4_3;
12898
-
12899
- quantize_row_q4_3_reference(src + j, y, k);
12900
-
12901
- for (int i = 0; i < nb; i++) {
12902
- for (int l = 0; l < QK4_3; l += 2) {
12903
- const uint8_t vi0 = y[i].qs[l/2] & 0x0F;
12904
- const uint8_t vi1 = y[i].qs[l/2] >> 4;
12905
-
12906
- hist[vi0]++;
12907
- hist[vi1]++;
12908
- }
12909
- }
12910
- }
12911
-
12912
- return (n/QK4_3*sizeof(block_q4_3));
12913
- }
12914
-
12915
13157
  size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
12916
13158
  assert(k % QK5_0 == 0);
12917
13159
  const int nb = k / QK5_0;
@@ -12926,8 +13168,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
12926
13168
  memcpy(&qh, &y[i].qh, sizeof(qh));
12927
13169
 
12928
13170
  for (int l = 0; l < QK5_0; l += 2) {
12929
- const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
12930
- const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
13171
+ const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
13172
+ const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
12931
13173
 
12932
13174
  // cast to 16 bins
12933
13175
  const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
@@ -12956,8 +13198,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
12956
13198
  memcpy(&qh, &y[i].qh, sizeof(qh));
12957
13199
 
12958
13200
  for (int l = 0; l < QK5_1; l += 2) {
12959
- const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
12960
- const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
13201
+ const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
13202
+ const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
12961
13203
 
12962
13204
  // cast to 16 bins
12963
13205
  const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
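Both histogram loops above switch to an unsigned 1u when masking the fifth bit out of qh: for the last element pair the shift amount reaches 31, and 1 << 31 on a signed int is undefined behaviour, while 1u << 31 is well defined. A compact sketch of how a 5-bit Q5 value is reassembled from the packed low nibble plus the per-block high-bit word and folded into one of the 16 histogram bins; q5_to_bin and the example values are illustrative only.

#include <stdint.h>
#include <stdio.h>

// Reassemble the l-th 5-bit value of a Q5 block from the packed low nibbles
// (qs) and the 32-bit high-bit word (qh), then fold it into one of 16 bins
// the way the histogram code above does.
static uint8_t q5_to_bin(const uint8_t * qs, uint32_t qh, int l) {
    const uint8_t lo = (l % 2 == 0) ? (qs[l/2] & 0x0F) : (qs[l/2] >> 4);
    const uint8_t hi = (uint8_t)(((qh & (1u << l)) >> l) << 4); // 1u keeps l == 31 defined
    return (uint8_t)((lo | hi) / 2); // 32 quantization levels -> 16 histogram bins
}

int main(void) {
    uint8_t  qs[16] = {0};
    uint32_t qh     = 0;
    qs[0] = 0x2F;        // element 0 -> low nibble 0xF, element 1 -> 0x2
    qh   |= 1u << 0;     // element 0 also has its high bit set
    printf("bin(0) = %u, bin(1) = %u\n", q5_to_bin(qs, qh, 0), q5_to_bin(qs, qh, 1));
    return 0;
}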
@@ -13014,12 +13256,6 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
13014
13256
  block_q4_2 * block = (block_q4_2*)dst + start / QK4_2;
13015
13257
  result = ggml_quantize_q4_2(src + start, block, n, n, hist);
13016
13258
  } break;
13017
- case GGML_TYPE_Q4_3:
13018
- {
13019
- GGML_ASSERT(start % QK4_3 == 0);
13020
- block_q4_3 * block = (block_q4_3*)dst + start / QK4_3;
13021
- result = ggml_quantize_q4_3(src + start, block, n, n, hist);
13022
- } break;
13023
13259
  case GGML_TYPE_Q5_0:
13024
13260
  {
13025
13261
  GGML_ASSERT(start % QK5_0 == 0);