llama_cpp 0.0.7 → 0.1.0

This diff reflects the changes between publicly released versions of the package as they appear in their public registry, and is provided for informational purposes only.
@@ -135,14 +135,6 @@ inline static void* ggml_aligned_malloc(size_t size) {
135
135
  #define UNUSED(x) (void)(x)
136
136
  #define SWAP(x, y, T) do { T SWAP = x; x = y; y = SWAP; } while (0)
137
137
 
138
- #define GGML_ASSERT(x) \
139
- do { \
140
- if (!(x)) { \
141
- fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
142
- abort(); \
143
- } \
144
- } while (0)
145
-
146
138
  #if defined(GGML_USE_ACCELERATE)
147
139
  #include <Accelerate/Accelerate.h>
148
140
  #elif defined(GGML_USE_OPENBLAS)
@@ -188,9 +180,13 @@ typedef double ggml_float;
188
180
  #undef bool
189
181
  #define bool _Bool
190
182
  #else
183
+ #if defined(_MSC_VER) || defined(__MINGW32__)
184
+ #include <intrin.h>
185
+ #else
191
186
  #include <immintrin.h>
192
187
  #endif
193
188
  #endif
189
+ #endif
194
190
 
195
191
  #ifdef __F16C__
196
192
 
@@ -330,7 +326,7 @@ static ggml_fp16_t table_exp_f16[1 << 16];
330
326
  // precomputed f32 table for f16 (256 KB)
331
327
  static float table_f32_f16[1 << 16];
332
328
 
333
- #if defined(__ARM_NEON)
329
+ #if defined(__ARM_NEON) || defined(__wasm_simd128__)
334
330
  #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s
335
331
  #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s)
336
332
  #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s)
@@ -370,6 +366,32 @@ ggml_fp16_t ggml_fp32_to_fp16(float x) {
370
366
  return GGML_FP32_TO_FP16(x);
371
367
  }
372
368
 
369
+ void ggml_fp16_to_fp32_row(const ggml_fp16_t * x, float * y, size_t n) {
370
+ for (size_t i = 0; i < n; i++) {
371
+ y[i] = GGML_FP16_TO_FP32(x[i]);
372
+ }
373
+ }
374
+
375
+ void ggml_fp32_to_fp16_row(const float * x, ggml_fp16_t * y, size_t n) {
376
+ size_t i = 0;
377
+ #if defined(__F16C__)
378
+ for (; i + 7 < n; i += 8) {
379
+ __m256 x_vec = _mm256_loadu_ps(x + i);
380
+ __m128i y_vec = _mm256_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
381
+ _mm_storeu_si128((__m128i *)(y + i), y_vec);
382
+ }
383
+ for(; i + 3 < n; i += 4) {
384
+ __m128 x_vec = _mm_loadu_ps(x + i);
385
+ __m128i y_vec = _mm_cvtps_ph(x_vec, _MM_FROUND_TO_NEAREST_INT);
386
+ _mm_storel_epi64((__m128i *)(y + i), y_vec);
387
+ }
388
+ #endif
389
+ for (; i < n; i++) {
390
+ y[i] = GGML_FP32_TO_FP16(x[i]);
391
+ }
392
+ }
393
+
394
+
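
A minimal usage sketch of the two row-conversion helpers added above, assuming the ggml.h header from this release is on the include path (the buffer contents and the function name are illustrative):

    #include "ggml.h"

    // Round-trip a small buffer: f32 -> f16 takes the F16C fast path when the
    // compiler provides it, f16 -> f32 converts element by element.
    void fp16_row_roundtrip(void) {
        float       src[8]  = {0.0f, 0.5f, 1.0f, -2.0f, 3.14f, -0.25f, 4096.0f, 1e-3f};
        ggml_fp16_t half[8];
        float       back[8];

        ggml_fp32_to_fp16_row(src, half, 8);
        ggml_fp16_to_fp32_row(half, back, 8);
    }
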
373
395
  //
374
396
  // timing
375
397
  //
@@ -653,19 +675,102 @@ float vmaxvq_f32(float32x4_t v) {
653
675
  }
654
676
 
655
677
  int8x8_t vzip1_s8(int8x8_t a, int8x8_t b) {
656
- return vget_low_s8(vcombine_s8(a, b));
678
+ int8x8_t res;
679
+
680
+ res[0] = a[0]; res[1] = b[0];
681
+ res[2] = a[1]; res[3] = b[1];
682
+ res[4] = a[2]; res[5] = b[2];
683
+ res[6] = a[3]; res[7] = b[3];
684
+
685
+ return res;
657
686
  }
658
687
 
659
688
  int8x8_t vzip2_s8(int8x8_t a, int8x8_t b) {
660
- return vget_high_s8(vcombine_s8(a, b));
689
+ int8x8_t res;
690
+
691
+ res[0] = a[4]; res[1] = b[4];
692
+ res[2] = a[5]; res[3] = b[5];
693
+ res[4] = a[6]; res[5] = b[6];
694
+ res[6] = a[7]; res[7] = b[7];
695
+
696
+ return res;
661
697
  }
662
698
 
663
699
  uint8x8_t vzip1_u8(uint8x8_t a, uint8x8_t b) {
664
- return vget_low_u8(vcombine_u8(a, b));
700
+ uint8x8_t res;
701
+
702
+ res[0] = a[0]; res[1] = b[0];
703
+ res[2] = a[1]; res[3] = b[1];
704
+ res[4] = a[2]; res[5] = b[2];
705
+ res[6] = a[3]; res[7] = b[3];
706
+
707
+ return res;
665
708
  }
666
709
 
667
710
  uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
668
- return vget_high_u8(vcombine_u8(a, b));
711
+ uint8x8_t res;
712
+
713
+ res[0] = a[4]; res[1] = b[4];
714
+ res[2] = a[5]; res[3] = b[5];
715
+ res[4] = a[6]; res[5] = b[6];
716
+ res[6] = a[7]; res[7] = b[7];
717
+
718
+ return res;
719
+ }
720
+
721
+ int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
722
+ int8x16_t res;
723
+
724
+ res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1];
725
+ res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3];
726
+ res[8] = a[4]; res[9] = b[4]; res[10] = a[5]; res[11] = b[5];
727
+ res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
728
+
729
+ return res;
730
+ }
731
+
732
+ int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
733
+ int8x16_t res;
734
+
735
+ res[0] = a[8]; res[1] = b[8]; res[2] = a[9]; res[3] = b[9];
736
+ res[4] = a[10]; res[5] = b[10]; res[6] = a[11]; res[7] = b[11];
737
+ res[8] = a[12]; res[9] = b[12]; res[10] = a[13]; res[11] = b[13];
738
+ res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
739
+
740
+ return res;
741
+ }
742
+
743
+ uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) {
744
+ uint8x16_t res;
745
+
746
+ res[0] = a[0]; res[1] = b[0]; res[2] = a[1]; res[3] = b[1];
747
+ res[4] = a[2]; res[5] = b[2]; res[6] = a[3]; res[7] = b[3];
748
+ res[8] = a[4]; res[9] = b[4]; res[10] = a[5]; res[11] = b[5];
749
+ res[12] = a[6]; res[13] = b[6]; res[14] = a[7]; res[15] = b[7];
750
+
751
+ return res;
752
+ }
753
+
754
+ uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
755
+ uint8x16_t res;
756
+
757
+ res[0] = a[8]; res[1] = b[8]; res[2] = a[9]; res[3] = b[9];
758
+ res[4] = a[10]; res[5] = b[10]; res[6] = a[11]; res[7] = b[11];
759
+ res[8] = a[12]; res[9] = b[12]; res[10] = a[13]; res[11] = b[13];
760
+ res[12] = a[14]; res[13] = b[14]; res[14] = a[15]; res[15] = b[15];
761
+
762
+ return res;
763
+ }
764
+
765
+ int32x4_t vcvtnq_s32_f32(float32x4_t v) {
766
+ int32x4_t res;
767
+
768
+ res[0] = roundf(vgetq_lane_f32(v, 0));
769
+ res[1] = roundf(vgetq_lane_f32(v, 1));
770
+ res[2] = roundf(vgetq_lane_f32(v, 2));
771
+ res[3] = roundf(vgetq_lane_f32(v, 3));
772
+
773
+ return res;
669
774
  }
670
775
 
671
776
  #endif
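
The element-wise fallbacks above mimic the A64 intrinsics on targets where they are missing; roughly, their intended semantics are (lane values are examples only):

    // vzip1q_s8(a, b) -> { a0,b0, a1,b1, ..., a7,b7  }   low halves interleaved
    // vzip2q_s8(a, b) -> { a8,b8, a9,b9, ..., a15,b15 }  high halves interleaved
    // vcvtnq_s32_f32({1.5f, -0.4f, 2.6f, -2.5f}) -> {2, 0, 3, -3}
    //   (this shim uses roundf, i.e. ties round away from zero)
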
@@ -694,14 +799,6 @@ typedef struct {
694
799
  } block_q4_2;
695
800
  static_assert(sizeof(block_q4_2) == sizeof(ggml_fp16_t) + QK4_2 / 2, "wrong q4_2 block size/padding");
696
801
 
697
- #define QK4_3 16
698
- typedef struct {
699
- ggml_fp16_t d; // delta
700
- ggml_fp16_t m; // min
701
- uint8_t qs[QK4_3 / 2]; // nibbles / quants
702
- } block_q4_3;
703
- static_assert(sizeof(block_q4_3) == 2 * sizeof(ggml_fp16_t) + QK4_3 / 2, "wrong q4_3 block size/padding");
704
-
705
802
  #define QK5_0 32
706
803
  typedef struct {
707
804
  ggml_fp16_t d; // delta
@@ -789,6 +886,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
789
886
  float max = 0.0f;
790
887
  float min = 0.0f;
791
888
 
889
+ vector float asrcv [8];
792
890
  vector float srcv [8];
793
891
  vector float maxv[8];
794
892
  vector float minv[8];
@@ -1068,7 +1166,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int
1068
1166
  const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id));
1069
1167
  const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f));
1070
1168
  const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf);
1071
- const v128_t vc = wasm_i32x4_min_u(vi, wasm_i32x4_splat(15));
1169
+ const v128_t vc = wasm_i32x4_min(vi, wasm_i32x4_splat(15));
1072
1170
 
1073
1171
  y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4);
1074
1172
  y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4);
@@ -1291,49 +1389,6 @@ static void quantize_row_q4_2(const float * restrict x, void * restrict vy, int
1291
1389
  quantize_row_q4_2_reference(x, y, k);
1292
1390
  }
1293
1391
 
1294
- static void quantize_row_q4_3_reference(const float * restrict x, block_q4_3 * restrict y, int k) {
1295
- assert(k % QK4_3 == 0);
1296
- const int nb = k / QK4_3;
1297
-
1298
- for (int i = 0; i < nb; i++) {
1299
- float min = FLT_MAX;
1300
- float max = -FLT_MAX;
1301
-
1302
- for (int l = 0; l < QK4_3; l++) {
1303
- const float v = x[i*QK4_3 + l];
1304
- if (v < min) min = v;
1305
- if (v > max) max = v;
1306
- }
1307
-
1308
- const float d = (max - min) / ((1 << 4) - 1);
1309
- const float id = d ? 1.0f/d : 0.0f;
1310
-
1311
- y[i].d = GGML_FP32_TO_FP16(d);
1312
- y[i].m = GGML_FP32_TO_FP16(min);
1313
-
1314
- for (int l = 0; l < QK4_3; l += 2) {
1315
- const float v0 = (x[i*QK4_3 + l + 0] - min)*id;
1316
- const float v1 = (x[i*QK4_3 + l + 1] - min)*id;
1317
-
1318
- const uint8_t vi0 = (int) (v0 + 0.5f);
1319
- const uint8_t vi1 = (int) (v1 + 0.5f);
1320
-
1321
- assert(vi0 < 16);
1322
- assert(vi1 < 16);
1323
-
1324
- y[i].qs[l/2] = vi0 | (vi1 << 4);
1325
- }
1326
- }
1327
- }
1328
-
1329
- static void quantize_row_q4_3(const float * restrict x, void * restrict vy, int k) {
1330
- assert(k % QK4_3 == 0);
1331
-
1332
- block_q4_3 * restrict y = vy;
1333
-
1334
- quantize_row_q4_3_reference(x, y, k);
1335
- }
1336
-
1337
1392
  static void quantize_row_q5_0_reference(const float * restrict x, block_q5_0 * restrict y, int k) {
1338
1393
  assert(k % QK5_0 == 0);
1339
1394
  const int nb = k / QK5_0;
@@ -1458,15 +1513,135 @@ static void quantize_row_q8_0_reference(const float * restrict x, block_q8_0 * r
1458
1513
  }
1459
1514
 
1460
1515
  static void quantize_row_q8_0(const float * restrict x, void * restrict vy, int k) {
1516
+ assert(QK8_0 == 32);
1461
1517
  assert(k % QK8_0 == 0);
1518
+ const int nb = k / QK8_0;
1462
1519
 
1463
1520
  block_q8_0 * restrict y = vy;
1464
1521
 
1522
+ #if defined(__ARM_NEON)
1523
+ for (int i = 0; i < nb; i++) {
1524
+ float32x4_t srcv [8];
1525
+ float32x4_t asrcv[8];
1526
+ float32x4_t amaxv[8];
1527
+
1528
+ for (int l = 0; l < 8; l++) srcv[l] = vld1q_f32(x + i*32 + 4*l);
1529
+ for (int l = 0; l < 8; l++) asrcv[l] = vabsq_f32(srcv[l]);
1530
+
1531
+ for (int l = 0; l < 4; l++) amaxv[2*l] = vmaxq_f32(asrcv[2*l], asrcv[2*l+1]);
1532
+ for (int l = 0; l < 2; l++) amaxv[4*l] = vmaxq_f32(amaxv[4*l], amaxv[4*l+2]);
1533
+ for (int l = 0; l < 1; l++) amaxv[8*l] = vmaxq_f32(amaxv[8*l], amaxv[8*l+4]);
1534
+
1535
+ const float amax = vmaxvq_f32(amaxv[0]);
1536
+
1537
+ const float d = amax / ((1 << 7) - 1);
1538
+ const float id = d ? 1.0f/d : 0.0f;
1539
+
1540
+ y[i].d = d;
1541
+
1542
+ for (int l = 0; l < 8; l++) {
1543
+ const float32x4_t v = vmulq_n_f32(srcv[l], id);
1544
+ const int32x4_t vi = vcvtnq_s32_f32(v);
1545
+
1546
+ y[i].qs[4*l + 0] = vgetq_lane_s32(vi, 0);
1547
+ y[i].qs[4*l + 1] = vgetq_lane_s32(vi, 1);
1548
+ y[i].qs[4*l + 2] = vgetq_lane_s32(vi, 2);
1549
+ y[i].qs[4*l + 3] = vgetq_lane_s32(vi, 3);
1550
+ }
1551
+ }
1552
+ #elif defined(__AVX2__) || defined(__AVX__)
1553
+ for (int i = 0; i < nb; i++) {
1554
+ // Load elements into 4 AVX vectors
1555
+ __m256 v0 = _mm256_loadu_ps( x );
1556
+ __m256 v1 = _mm256_loadu_ps( x + 8 );
1557
+ __m256 v2 = _mm256_loadu_ps( x + 16 );
1558
+ __m256 v3 = _mm256_loadu_ps( x + 24 );
1559
+ x += 32;
1560
+
1561
+ // Compute max(abs(e)) for the block
1562
+ const __m256 signBit = _mm256_set1_ps( -0.0f );
1563
+ __m256 maxAbs = _mm256_andnot_ps( signBit, v0 );
1564
+ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v1 ) );
1565
+ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v2 ) );
1566
+ maxAbs = _mm256_max_ps( maxAbs, _mm256_andnot_ps( signBit, v3 ) );
1567
+
1568
+ __m128 max4 = _mm_max_ps( _mm256_extractf128_ps( maxAbs, 1 ), _mm256_castps256_ps128( maxAbs ) );
1569
+ max4 = _mm_max_ps( max4, _mm_movehl_ps( max4, max4 ) );
1570
+ max4 = _mm_max_ss( max4, _mm_movehdup_ps( max4 ) );
1571
+ const float maxScalar = _mm_cvtss_f32( max4 );
1572
+
1573
+ // Quantize these floats
1574
+ const float d = maxScalar / 127.f;
1575
+ y[i].d = d;
1576
+ const float id = ( maxScalar != 0.0f ) ? 127.f / maxScalar : 0.0f;
1577
+ const __m256 mul = _mm256_set1_ps( id );
1578
+
1579
+ // Apply the multiplier
1580
+ v0 = _mm256_mul_ps( v0, mul );
1581
+ v1 = _mm256_mul_ps( v1, mul );
1582
+ v2 = _mm256_mul_ps( v2, mul );
1583
+ v3 = _mm256_mul_ps( v3, mul );
1584
+
1585
+ // Round to nearest integer
1586
+ v0 = _mm256_round_ps( v0, _MM_ROUND_NEAREST );
1587
+ v1 = _mm256_round_ps( v1, _MM_ROUND_NEAREST );
1588
+ v2 = _mm256_round_ps( v2, _MM_ROUND_NEAREST );
1589
+ v3 = _mm256_round_ps( v3, _MM_ROUND_NEAREST );
1590
+
1591
+ // Convert floats to integers
1592
+ __m256i i0 = _mm256_cvtps_epi32( v0 );
1593
+ __m256i i1 = _mm256_cvtps_epi32( v1 );
1594
+ __m256i i2 = _mm256_cvtps_epi32( v2 );
1595
+ __m256i i3 = _mm256_cvtps_epi32( v3 );
1596
+
1597
+ #if defined(__AVX2__)
1598
+ // Convert int32 to int16
1599
+ i0 = _mm256_packs_epi32( i0, i1 ); // 0, 1, 2, 3, 8, 9, 10, 11, 4, 5, 6, 7, 12, 13, 14, 15
1600
+ i2 = _mm256_packs_epi32( i2, i3 ); // 16, 17, 18, 19, 24, 25, 26, 27, 20, 21, 22, 23, 28, 29, 30, 31
1601
+ // Convert int16 to int8
1602
+ i0 = _mm256_packs_epi16( i0, i2 ); // 0, 1, 2, 3, 8, 9, 10, 11, 16, 17, 18, 19, 24, 25, 26, 27, 4, 5, 6, 7, 12, 13, 14, 15, 20, 21, 22, 23, 28, 29, 30, 31
1603
+
1604
+ // We got our precious signed bytes, but the order is now wrong
1605
+ // These AVX2 pack instructions process 16-byte pieces independently
1606
+ // The following instruction is fixing the order
1607
+ const __m256i perm = _mm256_setr_epi32( 0, 4, 1, 5, 2, 6, 3, 7 );
1608
+ i0 = _mm256_permutevar8x32_epi32( i0, perm );
1609
+
1610
+ _mm256_storeu_si256((__m256i *)y[i].qs, i0);
1611
+ #else
1612
+ // Since we don't have in AVX some necessary functions,
1613
+ // we split the registers in half and call AVX2 analogs from SSE
1614
+ __m128i ni0 = _mm256_castsi256_si128( i0 );
1615
+ __m128i ni1 = _mm256_extractf128_si256( i0, 1);
1616
+ __m128i ni2 = _mm256_castsi256_si128( i1 );
1617
+ __m128i ni3 = _mm256_extractf128_si256( i1, 1);
1618
+ __m128i ni4 = _mm256_castsi256_si128( i2 );
1619
+ __m128i ni5 = _mm256_extractf128_si256( i2, 1);
1620
+ __m128i ni6 = _mm256_castsi256_si128( i3 );
1621
+ __m128i ni7 = _mm256_extractf128_si256( i3, 1);
1622
+
1623
+ // Convert int32 to int16
1624
+ ni0 = _mm_packs_epi32( ni0, ni1 );
1625
+ ni2 = _mm_packs_epi32( ni2, ni3 );
1626
+ ni4 = _mm_packs_epi32( ni4, ni5 );
1627
+ ni6 = _mm_packs_epi32( ni6, ni7 );
1628
+ // Convert int16 to int8
1629
+ ni0 = _mm_packs_epi16( ni0, ni2 );
1630
+ ni4 = _mm_packs_epi16( ni4, ni6 );
1631
+
1632
+ _mm_storeu_si128((__m128i *)(y[i].qs + 0), ni0);
1633
+ _mm_storeu_si128((__m128i *)(y[i].qs + 16), ni4);
1634
+ #endif
1635
+ }
1636
+ #else
1637
+ // scalar
1465
1638
  quantize_row_q8_0_reference(x, y, k);
1639
+ #endif
1466
1640
  }
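
All of the SIMD paths above compute the same per-block math as the scalar reference; a hedged sketch for a single 32-element block (the helper name is illustrative):

    #include <math.h>
    #include <stdint.h>

    // One Q8_0 block: d = absmax/127, each quant is round(x[l]/d) stored as int8.
    static void q8_0_block_sketch(const float * x, float * d_out, int8_t qs[32]) {
        float amax = 0.0f;
        for (int l = 0; l < 32; l++) amax = fmaxf(amax, fabsf(x[l]));
        const float d  = amax / 127.0f;
        const float id = d ? 1.0f/d : 0.0f;
        *d_out = d;
        for (int l = 0; l < 32; l++) qs[l] = (int8_t) roundf(x[l]*id);
    }
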
1467
1641
 
1468
1642
  // reference implementation for deterministic creation of model files
1469
1643
  static void quantize_row_q8_1_reference(const float * restrict x, block_q8_1 * restrict y, int k) {
1644
+ assert(QK8_1 == 32);
1470
1645
  assert(k % QK8_1 == 0);
1471
1646
  const int nb = k / QK8_1;
1472
1647
 
@@ -1917,36 +2092,6 @@ static void dequantize_row_q4_2(const void * restrict vx, float * restrict y, in
1917
2092
  }
1918
2093
  }
1919
2094
 
1920
- static void dequantize_row_q4_3(const void * restrict vx, float * restrict y, int k) {
1921
- assert(k % QK4_3 == 0);
1922
- const int nb = k / QK4_3;
1923
-
1924
- const block_q4_3 * restrict x = vx;
1925
-
1926
- for (int i = 0; i < nb; i++) {
1927
- const float d = GGML_FP16_TO_FP32(x[i].d);
1928
- const float m = GGML_FP16_TO_FP32(x[i].m);
1929
-
1930
- const uint8_t * restrict pp = x[i].qs;
1931
-
1932
- for (int l = 0; l < QK4_3; l += 2) {
1933
- const uint8_t vi = pp[l/2];
1934
-
1935
- const int8_t vi0 = vi & 0x0F;
1936
- const int8_t vi1 = vi >> 4;
1937
-
1938
- const float v0 = vi0*d + m;
1939
- const float v1 = vi1*d + m;
1940
-
1941
- y[i*QK4_3 + l + 0] = v0;
1942
- y[i*QK4_3 + l + 1] = v1;
1943
-
1944
- assert(!isnan(y[i*QK4_3 + l + 0]));
1945
- assert(!isnan(y[i*QK4_3 + l + 1]));
1946
- }
1947
- }
1948
- }
1949
-
1950
2095
  static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, int k) {
1951
2096
  assert(k % QK5_0 == 0);
1952
2097
  const int nb = k / QK5_0;
@@ -1965,8 +2110,8 @@ static void dequantize_row_q5_0(const void * restrict vx, float * restrict y, in
1965
2110
  const uint8_t vi = pp[l/2];
1966
2111
 
1967
2112
  // extract the 5-th bit from qh
1968
- const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
1969
- const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
2113
+ const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
2114
+ const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
1970
2115
 
1971
2116
  const int8_t vi0 = (vi & 0x0F) | vh0;
1972
2117
  const int8_t vi1 = (vi >> 4) | vh1;
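
The change from 1 << ... to 1u << ... above is not cosmetic: l + 1 reaches 31 in this loop, and shifting the signed literal 1 into the sign bit is undefined behavior. The fixed pattern in isolation (the helper name is illustrative):

    #include <stdint.h>

    // Extract bit j of the packed high bits and move it to bit position 4;
    // well-defined for j in [0, 31] because the shifted literal is unsigned.
    static inline uint8_t fifth_bit(uint32_t qh, int j) {
        return (uint8_t) (((qh & (1u << j)) >> j) << 4);
    }
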
@@ -2002,8 +2147,8 @@ static void dequantize_row_q5_1(const void * restrict vx, float * restrict y, in
2002
2147
  const uint8_t vi = pp[l/2];
2003
2148
 
2004
2149
  // extract the 5-th bit from qh
2005
- const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
2006
- const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
2150
+ const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
2151
+ const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
2007
2152
 
2008
2153
  const uint8_t vi0 = (vi & 0x0F) | vh0;
2009
2154
  const uint8_t vi1 = (vi >> 4) | vh1;
@@ -2040,7 +2185,6 @@ static void dequantize_row_q8_0(const void * restrict vx, float * restrict y, in
2040
2185
  static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
2041
2186
  static void ggml_vec_dot_q4_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
2042
2187
  static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
2043
- static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
2044
2188
  static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
2045
2189
  static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
2046
2190
  static void ggml_vec_dot_q8_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy);
@@ -2070,14 +2214,6 @@ static const quantize_fns_t quantize_fns[GGML_TYPE_COUNT] = {
2070
2214
  .vec_dot_q = ggml_vec_dot_q4_2_q8_0,
2071
2215
  .vec_dot_type = GGML_TYPE_Q8_0,
2072
2216
  },
2073
- [GGML_TYPE_Q4_3] = {
2074
- .dequantize_row_q = dequantize_row_q4_3,
2075
- .quantize_row_q = quantize_row_q4_3,
2076
- .quantize_row_q_reference = (quantize_row_q_t) quantize_row_q4_3_reference,
2077
- .quantize_row_q_dot = quantize_row_q8_1,
2078
- .vec_dot_q = ggml_vec_dot_q4_3_q8_1,
2079
- .vec_dot_type = GGML_TYPE_Q8_1,
2080
- },
2081
2217
  [GGML_TYPE_Q5_0] = {
2082
2218
  .dequantize_row_q = dequantize_row_q5_0,
2083
2219
  .quantize_row_q = quantize_row_q5_0,
@@ -2748,35 +2884,35 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
2748
2884
  const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
2749
2885
  const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);
2750
2886
 
2887
+ // interleave
2888
+ const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs);
2889
+ const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs);
2890
+ const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs);
2891
+ const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs);
2892
+
2751
2893
  // load y
2752
2894
  const int8x16_t v1_0l = vld1q_s8(y0->qs);
2753
2895
  const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
2754
2896
  const int8x16_t v1_1l = vld1q_s8(y1->qs);
2755
2897
  const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);
2756
2898
 
2757
- // interleave
2758
- const int8x16_t v1_0ls = vuzp1q_s8(v1_0l, v1_0h);
2759
- const int8x16_t v1_0hs = vuzp2q_s8(v1_0l, v1_0h);
2760
- const int8x16_t v1_1ls = vuzp1q_s8(v1_1l, v1_1h);
2761
- const int8x16_t v1_1hs = vuzp2q_s8(v1_1l, v1_1h);
2762
-
2763
2899
  #if defined(__ARM_FEATURE_DOTPROD)
2764
2900
  // dot product into int32x4_t
2765
- const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls), v0_0hs, v1_0hs);
2766
- const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls), v0_1hs, v1_1hs);
2901
+ const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h);
2902
+ const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h);
2767
2903
 
2768
2904
  sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
2769
2905
  sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
2770
2906
  #else
2771
- const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
2772
- const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
2773
- const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
2774
- const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
2907
+ const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
2908
+ const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
2909
+ const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
2910
+ const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
2775
2911
 
2776
- const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
2777
- const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
2778
- const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
2779
- const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
2912
+ const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
2913
+ const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
2914
+ const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
2915
+ const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));
2780
2916
 
2781
2917
  const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
2782
2918
  const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
@@ -3171,136 +3307,6 @@ static void ggml_vec_dot_q4_2_q8_0(const int n, float * restrict s, const void *
3171
3307
  #endif
3172
3308
  }
3173
3309
 
3174
- static void ggml_vec_dot_q4_3_q8_1(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
3175
- const int nb = n / QK8_1;
3176
-
3177
- assert(n % QK8_1 == 0);
3178
- assert(nb % 2 == 0);
3179
- assert(QK8_1 == 2*QK4_3);
3180
-
3181
- const block_q4_3 * restrict x = vx;
3182
- const block_q8_1 * restrict y = vy;
3183
-
3184
- #if defined(__ARM_NEON)
3185
- float32x4_t sumv0 = vdupq_n_f32(0.0f);
3186
- float32x4_t sumv1 = vdupq_n_f32(0.0f);
3187
-
3188
- float summs0 = 0.0f;
3189
- float summs1 = 0.0f;
3190
-
3191
- for (int i = 0; i < nb; ++i) {
3192
- const block_q4_3 * restrict x0_0 = &x[2*(i + 0) + 0];
3193
- const block_q4_3 * restrict x0_1 = &x[2*(i + 0) + 1];
3194
-
3195
- const block_q8_1 * restrict y0 = &y[i + 0];
3196
-
3197
- summs0 += GGML_FP16_TO_FP32(x0_0->m) * y0->s0;
3198
- summs1 += GGML_FP16_TO_FP32(x0_1->m) * y0->s1;
3199
-
3200
- const uint8x16_t v0_0 = vcombine_u8(vld1_u8(x0_0->qs), vld1_u8(x0_1->qs));
3201
-
3202
- // 4-bit -> 8-bit
3203
- const int8x16_t v0_0l = vreinterpretq_s8_u8(vandq_u8 (v0_0, vdupq_n_u8(0x0F)));
3204
- const int8x16_t v0_0h = vreinterpretq_s8_u8(vshrq_n_u8(v0_0, 4));
3205
-
3206
- // interleave
3207
- const int8x16_t v0_0lz = vzip1q_s8(v0_0l, v0_0h);
3208
- const int8x16_t v0_0hz = vzip2q_s8(v0_0l, v0_0h);
3209
-
3210
- // load y
3211
- const int8x16_t v1_0l = vld1q_s8(y0->qs);
3212
- const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
3213
-
3214
- const float x0_0d = GGML_FP16_TO_FP32(x0_0->d);
3215
- const float x0_1d = GGML_FP16_TO_FP32(x0_1->d);
3216
-
3217
- #if defined(__ARM_FEATURE_DOTPROD)
3218
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l)), x0_0d*y0->d);
3219
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(vdotq_s32(vdupq_n_s32(0), v0_0hz, v1_0h)), x0_1d*y0->d);
3220
- #else
3221
- const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
3222
- const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
3223
- const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
3224
- const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));
3225
-
3226
- const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
3227
- const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
3228
-
3229
- sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(pl0), x0_0d*y0->d);
3230
- sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(ph0), x0_1d*y0->d);
3231
- #endif
3232
- }
3233
-
3234
- *s = vaddvq_f32(vaddq_f32(sumv0, sumv1)) + summs0 + summs1;
3235
- #elif defined(__AVX2__)
3236
- // Initialize accumulator with zeros
3237
- __m256 acc = _mm256_setzero_ps();
3238
- float summs = 0.0f;
3239
-
3240
- // Main loop
3241
- for (int i = 0; i < nb; i++) {
3242
- const __m128 d0 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 0].d));
3243
- const __m128 d1 = _mm_set1_ps(GGML_FP16_TO_FP32(x[2*i + 1].d));
3244
- const __m256 dx = _mm256_set_m128(d1, d0);
3245
-
3246
- summs += GGML_FP16_TO_FP32(x[2*i + 0].m) * y[i].s0
3247
- + GGML_FP16_TO_FP32(x[2*i + 1].m) * y[i].s1;
3248
-
3249
- const __m128i bx0 = bytes_from_nibbles_16(x[2*i + 0].qs);
3250
- const __m128i bx1 = bytes_from_nibbles_16(x[2*i + 1].qs);
3251
- const __m256i bx = _mm256_set_m128i(bx1, bx0);
3252
-
3253
- const __m256 dy = _mm256_broadcast_ss(&y[i].d);
3254
- const __m256i by = _mm256_loadu_si256((const __m256i *)y[i].qs);
3255
-
3256
- const __m256 q = mul_sum_i8_pairs_float(bx, by);
3257
-
3258
- acc = _mm256_fmadd_ps(q, _mm256_mul_ps(dx, dy), acc);
3259
- }
3260
-
3261
- *s = hsum_float_8(acc) + summs;
3262
- #else
3263
- // scalar
3264
- float sumf = 0.0;
3265
- for (int i = 0; i < nb; i++) {
3266
- const uint8_t * restrict x0 = x[2*i + 0].qs;
3267
- const uint8_t * restrict x1 = x[2*i + 1].qs;
3268
- const int8_t * restrict y0 = y[i].qs;
3269
-
3270
- const float d0 = GGML_FP16_TO_FP32(x[2*i + 0].d);
3271
- const float m0 = GGML_FP16_TO_FP32(x[2*i + 0].m);
3272
- const float d1 = GGML_FP16_TO_FP32(x[2*i + 1].d);
3273
- const float m1 = GGML_FP16_TO_FP32(x[2*i + 1].m);
3274
-
3275
- int sxy_0 = 0;
3276
- int sxy_1 = 0;
3277
-
3278
- for (int j = 0; j < QK8_1/4; j++) {
3279
- const uint8_t v0 = x0[j];
3280
- const uint8_t v1 = x1[j];
3281
-
3282
- const int x0_0 = v0 & 0x0F;
3283
- const int x1_0 = v0 >> 4;
3284
-
3285
- const int x0_1 = v1 & 0x0F;
3286
- const int x1_1 = v1 >> 4;
3287
-
3288
- const int y0_0 = y0[2*j + 0];
3289
- const int y1_0 = y0[2*j + 1];
3290
-
3291
- const int y0_1 = y0[2*(j + QK8_1/4) + 0];
3292
- const int y1_1 = y0[2*(j + QK8_1/4) + 1];
3293
-
3294
- sxy_0 += x0_0*y0_0 + x1_0*y1_0;
3295
- sxy_1 += x0_1*y0_1 + x1_1*y1_1;
3296
- }
3297
-
3298
- sumf += (d0*sxy_0 + d1*sxy_1)*y[i].d + m0*y[i].s0 + m1*y[i].s1;
3299
- }
3300
- *s = sumf;
3301
- #endif
3302
- }
3303
-
3304
3310
  static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * restrict vx, const void * restrict vy) {
3305
3311
  const int nb = n / QK8_0;
3306
3312
 
@@ -3373,6 +3379,72 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
3373
3379
  }
3374
3380
 
3375
3381
  *s = vaddvq_f32(sumv);
3382
+ #elif defined(__wasm_simd128__)
3383
+ v128_t sumv = wasm_f32x4_splat(0.0f);
3384
+
3385
+ uint64_t tmp[4];
3386
+
3387
+ for (int i = 0; i < nb; ++i) {
3388
+ const block_q5_0 * restrict x0 = &x[i];
3389
+ const block_q8_0 * restrict y0 = &y[i];
3390
+
3391
+ const v128_t m4b = wasm_i8x16_splat(0x0F);
3392
+ const v128_t s16b = wasm_i8x16_splat(0x10);
3393
+
3394
+ // extract the 5th bit
3395
+ uint32_t qh;
3396
+ memcpy(&qh, x0->qh, sizeof(qh));
3397
+
3398
+ tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
3399
+ tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
3400
+ tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
3401
+ tmp[3] = table_b2b_u[(qh >> 24) ];
3402
+
3403
+ const v128_t qhl = wasm_v128_load(tmp + 0);
3404
+ const v128_t qhh = wasm_v128_load(tmp + 2);
3405
+
3406
+ const v128_t v0 = wasm_v128_load(x0->qs);
3407
+
3408
+ // 4-bit -> 8-bit
3409
+ const v128_t v0l = wasm_v128_and (v0, m4b);
3410
+ const v128_t v0h = wasm_u8x16_shr(v0, 4);
3411
+
3412
+ // interleave
3413
+ const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
3414
+ const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
3415
+
3416
+ // add high bit and sub 16
3417
+ const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b);
3418
+ const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b);
3419
+
3420
+ // load y
3421
+ const v128_t v1l = wasm_v128_load(y0->qs);
3422
+ const v128_t v1h = wasm_v128_load(y0->qs + 16);
3423
+
3424
+ // int8x16 -> int16x8
3425
+ const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
3426
+ const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
3427
+ const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
3428
+ const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
3429
+
3430
+ const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
3431
+ const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
3432
+ const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
3433
+ const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
3434
+
3435
+ const float x0d = GGML_FP16_TO_FP32(x0->d);
3436
+
3437
+ // dot product
3438
+ sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
3439
+ wasm_i32x4_add(
3440
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
3441
+ wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
3442
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
3443
+ wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
3444
+ }
3445
+
3446
+ *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
3447
+ wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3);
3376
3448
  #elif defined(__AVX2__)
3377
3449
  // Initialize accumulator with zeros
3378
3450
  __m256 acc = _mm256_setzero_ps();
@@ -3413,8 +3485,8 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void *
3413
3485
  for (int j = 0; j < QK8_0/2; j++) {
3414
3486
  const uint8_t v0 = x0[j];
3415
3487
 
3416
- const int x0_0h = ((qh & (1 << (2*j + 0))) >> (2*j + 0)) << 4;
3417
- const int x1_0h = ((qh & (1 << (2*j + 1))) >> (2*j + 1)) << 4;
3488
+ const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
3489
+ const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
3418
3490
 
3419
3491
  const int x0_0 = ((v0 & 0x0F) | x0_0h) - 16;
3420
3492
  const int x1_0 = ((v0 >> 4) | x1_0h) - 16;
@@ -3504,6 +3576,77 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3504
3576
  }
3505
3577
 
3506
3578
  *s = vaddvq_f32(sumv) + summs;
3579
+ #elif defined(__wasm_simd128__)
3580
+ v128_t sumv = wasm_f32x4_splat(0.0f);
3581
+
3582
+ float summs = 0.0f;
3583
+
3584
+ uint64_t tmp[4];
3585
+
3586
+ for (int i = 0; i < nb; ++i) {
3587
+ const block_q5_1 * restrict x0 = &x[i];
3588
+ const block_q8_1 * restrict y0 = &y[i];
3589
+
3590
+ summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1);
3591
+
3592
+ const v128_t m4b = wasm_i8x16_splat(0x0F);
3593
+
3594
+ // extract the 5th bit
3595
+ uint32_t qh;
3596
+ memcpy(&qh, x0->qh, sizeof(qh));
3597
+
3598
+ tmp[0] = table_b2b_u[(qh >> 0) & 0xFF];
3599
+ tmp[1] = table_b2b_u[(qh >> 8) & 0xFF];
3600
+ tmp[2] = table_b2b_u[(qh >> 16) & 0xFF];
3601
+ tmp[3] = table_b2b_u[(qh >> 24) ];
3602
+
3603
+ const v128_t qhl = wasm_v128_load(tmp + 0);
3604
+ const v128_t qhh = wasm_v128_load(tmp + 2);
3605
+
3606
+ const v128_t v0 = wasm_v128_load(x0->qs);
3607
+
3608
+ // 4-bit -> 8-bit
3609
+ const v128_t v0l = wasm_v128_and (v0, m4b);
3610
+ const v128_t v0h = wasm_u8x16_shr(v0, 4);
3611
+
3612
+ static bool x = true;
3613
+
3614
+ // interleave
3615
+ const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
3616
+ const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
3617
+
3618
+ // add high bit
3619
+ const v128_t v0lf = wasm_v128_or(v0lz, qhl);
3620
+ const v128_t v0hf = wasm_v128_or(v0hz, qhh);
3621
+
3622
+ // load y
3623
+ const v128_t v1l = wasm_v128_load(y0->qs);
3624
+ const v128_t v1h = wasm_v128_load(y0->qs + 16);
3625
+
3626
+ // int8x16 -> int16x8
3627
+ const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf);
3628
+ const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf);
3629
+ const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf);
3630
+ const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf);
3631
+
3632
+ const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l);
3633
+ const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l);
3634
+ const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h);
3635
+ const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h);
3636
+
3637
+ const float x0d = GGML_FP16_TO_FP32(x0->d);
3638
+
3639
+ // dot product
3640
+ sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4(
3641
+ wasm_i32x4_add(
3642
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll),
3643
+ wasm_i32x4_dot_i16x8(v0lfh, v1lh)),
3644
+ wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl),
3645
+ wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d)));
3646
+ }
3647
+
3648
+ *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) +
3649
+ wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs;
3507
3650
  #elif defined(__AVX2__)
3508
3651
  // Initialize accumulator with zeros
3509
3652
  __m256 acc = _mm256_setzero_ps();
@@ -3547,8 +3690,8 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void *
3547
3690
  for (int j = 0; j < QK8_1/2; j++) {
3548
3691
  const uint8_t v0 = x0[j];
3549
3692
 
3550
- const int x0_0h = ((qh & (1 << (2*j + 0))) >> (2*j + 0)) << 4;
3551
- const int x1_0h = ((qh & (1 << (2*j + 1))) >> (2*j + 1)) << 4;
3693
+ const int x0_0h = ((qh & (1u << (2*j + 0))) >> (2*j + 0)) << 4;
3694
+ const int x1_0h = ((qh & (1u << (2*j + 1))) >> (2*j + 1)) << 4;
3552
3695
 
3553
3696
  const int x0_0 = (v0 & 0x0F) | x0_0h;
3554
3697
  const int x1_0 = (v0 >> 4) | x1_0h;
@@ -3925,7 +4068,6 @@ static const int GGML_BLCK_SIZE[GGML_TYPE_COUNT] = {
3925
4068
  [GGML_TYPE_Q4_0] = QK4_0,
3926
4069
  [GGML_TYPE_Q4_1] = QK4_1,
3927
4070
  [GGML_TYPE_Q4_2] = QK4_2,
3928
- [GGML_TYPE_Q4_3] = QK4_3,
3929
4071
  [GGML_TYPE_Q5_0] = QK5_0,
3930
4072
  [GGML_TYPE_Q5_1] = QK5_1,
3931
4073
  [GGML_TYPE_Q8_0] = QK8_0,
@@ -3942,7 +4084,6 @@ static const size_t GGML_TYPE_SIZE[GGML_TYPE_COUNT] = {
3942
4084
  [GGML_TYPE_Q4_0] = sizeof(block_q4_0),
3943
4085
  [GGML_TYPE_Q4_1] = sizeof(block_q4_1),
3944
4086
  [GGML_TYPE_Q4_2] = sizeof(block_q4_2),
3945
- [GGML_TYPE_Q4_3] = sizeof(block_q4_3),
3946
4087
  [GGML_TYPE_Q5_0] = sizeof(block_q5_0),
3947
4088
  [GGML_TYPE_Q5_1] = sizeof(block_q5_1),
3948
4089
  [GGML_TYPE_Q8_0] = sizeof(block_q8_0),
@@ -3960,7 +4101,6 @@ static const char * GGML_TYPE_NAME[GGML_TYPE_COUNT] = {
3960
4101
  [GGML_TYPE_Q4_0] = "q4_0",
3961
4102
  [GGML_TYPE_Q4_1] = "q4_1",
3962
4103
  [GGML_TYPE_Q4_2] = "q4_2",
3963
- [GGML_TYPE_Q4_3] = "q4_3",
3964
4104
  [GGML_TYPE_Q5_0] = "q5_0",
3965
4105
  [GGML_TYPE_Q5_1] = "q5_1",
3966
4106
  [GGML_TYPE_Q8_0] = "q8_0",
@@ -3977,7 +4117,6 @@ static bool GGML_IS_QUANTIZED[GGML_TYPE_COUNT] = {
3977
4117
  [GGML_TYPE_Q4_0] = true,
3978
4118
  [GGML_TYPE_Q4_1] = true,
3979
4119
  [GGML_TYPE_Q4_2] = true,
3980
- [GGML_TYPE_Q4_3] = true,
3981
4120
  [GGML_TYPE_Q5_0] = true,
3982
4121
  [GGML_TYPE_Q5_1] = true,
3983
4122
  [GGML_TYPE_Q8_0] = true,
@@ -4024,6 +4163,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
4024
4163
  "DIAG_MASK_INF",
4025
4164
  "SOFT_MAX",
4026
4165
  "ROPE",
4166
+ "ALIBI",
4027
4167
  "CONV_1D_1S",
4028
4168
  "CONV_1D_2S",
4029
4169
 
@@ -4034,7 +4174,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
4034
4174
  "MAP_BINARY",
4035
4175
  };
4036
4176
 
4037
- static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
4177
+ static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39");
4038
4178
 
4039
4179
  static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
4040
4180
  "none",
@@ -4072,6 +4212,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
4072
4212
  "diag_mask_inf(x)",
4073
4213
  "soft_max(x)",
4074
4214
  "rope(x)",
4215
+ "alibi(x)",
4075
4216
  "conv_1d_1s(x)",
4076
4217
  "conv_1d_2s(x)",
4077
4218
 
@@ -4082,7 +4223,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
4082
4223
  "f(x,y)",
4083
4224
  };
4084
4225
 
4085
- static_assert(GGML_OP_COUNT == 38, "GGML_OP_COUNT != 38");
4226
+ static_assert(GGML_OP_COUNT == 39, "GGML_OP_COUNT != 39");
4086
4227
 
4087
4228
  static_assert(sizeof(struct ggml_object)%GGML_MEM_ALIGN == 0, "ggml_object size must be a multiple of GGML_MEM_ALIGN");
4088
4229
  static_assert(sizeof(struct ggml_tensor)%GGML_MEM_ALIGN == 0, "ggml_tensor size must be a multiple of GGML_MEM_ALIGN");
@@ -4252,6 +4393,27 @@ bool ggml_is_quantized(enum ggml_type type) {
4252
4393
  return GGML_IS_QUANTIZED[type];
4253
4394
  }
4254
4395
 
4396
+ enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) {
4397
+ enum ggml_type wtype = GGML_TYPE_COUNT;
4398
+
4399
+ switch (ftype) {
4400
+ case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break;
4401
+ case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break;
4402
+ case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break;
4403
+ case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break;
4404
+ case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break;
4405
+ case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break;
4406
+ case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break;
4407
+ case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break;
4408
+ case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break;
4409
+ case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break;
4410
+ }
4411
+
4412
+ GGML_ASSERT(wtype != GGML_TYPE_COUNT);
4413
+
4414
+ return wtype;
4415
+ }
4416
+
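
A small sketch of how the new mapping might be used when deciding the weight type for a model file; the specific ftype value is only an example, and the enum constants are assumed to be declared in ggml.h of this release:

    #include "ggml.h"

    // Unknown or mixed file types trip the GGML_ASSERT inside the function.
    enum ggml_type pick_weight_type(void) {
        return ggml_ftype_to_ggml_type(GGML_FTYPE_MOSTLY_Q5_1); // -> GGML_TYPE_Q5_1
    }
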
4255
4417
  static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) {
4256
4418
  return tensor->nb[0] > tensor->nb[1];
4257
4419
  }
@@ -4362,12 +4524,11 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
4362
4524
  GGML_PRINT_DEBUG("%s: g_state initialized in %f ms\n", __func__, (t_end - t_start)/1000.0f);
4363
4525
  }
4364
4526
 
4365
- // initialize cuBLAS
4366
- #if defined(GGML_USE_CUBLAS)
4527
+ #if defined(GGML_USE_CUBLAS)
4367
4528
  ggml_init_cublas();
4368
- #elif defined(GGML_USE_CLBLAST)
4529
+ #elif defined(GGML_USE_CLBLAST)
4369
4530
  ggml_cl_init();
4370
- #endif
4531
+ #endif
4371
4532
 
4372
4533
  is_first_call = false;
4373
4534
  }
@@ -4448,7 +4609,7 @@ void ggml_free(struct ggml_context * ctx) {
4448
4609
  }
4449
4610
 
4450
4611
  size_t ggml_used_mem(const struct ggml_context * ctx) {
4451
- return ctx->objects_end->offs + ctx->objects_end->size;
4612
+ return ctx->objects_end == NULL ? 0 : ctx->objects_end->offs + ctx->objects_end->size;
4452
4613
  }
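
The added NULL check covers a context in which no objects have been allocated yet; a hedged illustration (the init parameters are arbitrary):

    #include "ggml.h"

    // Before the fix, calling ggml_used_mem() on a fresh context dereferenced a
    // NULL objects_end pointer; with the fix it simply reports 0 bytes used.
    size_t used_mem_on_fresh_ctx(void) {
        struct ggml_init_params params = { 0 };
        params.mem_size = 16*1024*1024;          // 16 MiB arena, arbitrary
        struct ggml_context * ctx = ggml_init(params);
        const size_t used = ggml_used_mem(ctx);  // 0 - nothing allocated yet
        ggml_free(ctx);
        return used;
    }
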
4453
4614
 
4454
4615
  size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch) {
@@ -4561,6 +4722,7 @@ struct ggml_tensor * ggml_new_tensor_impl(
4561
4722
  /*.perf_cycles =*/ 0,
4562
4723
  /*.perf_time_us =*/ 0,
4563
4724
  /*.data =*/ (data == NULL && !ctx->no_alloc) ? (void *)(result + 1) : data,
4725
+ /*.name =*/ { 0 },
4564
4726
  /*.pad =*/ { 0 },
4565
4727
  };
4566
4728
 
@@ -4915,6 +5077,15 @@ float * ggml_get_data_f32(const struct ggml_tensor * tensor) {
4915
5077
  return (float *)(tensor->data);
4916
5078
  }
4917
5079
 
5080
+ const char * ggml_get_name(const struct ggml_tensor * tensor) {
5081
+ return tensor->name;
5082
+ }
5083
+
5084
+ void ggml_set_name(struct ggml_tensor * tensor, const char * name) {
5085
+ strncpy(tensor->name, name, sizeof(tensor->name));
5086
+ tensor->name[sizeof(tensor->name) - 1] = '\0';
5087
+ }
5088
+
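
A minimal sketch of the new naming helpers, assuming a valid context ctx; the tensor shape and name are arbitrary:

    #include <stdio.h>
    #include "ggml.h"

    // The name lives in a fixed-size field of ggml_tensor; ggml_set_name()
    // truncates long names and guarantees NUL termination.
    void name_a_tensor(struct ggml_context * ctx) {
        struct ggml_tensor * t = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
        ggml_set_name(t, "example.weight");
        printf("%s\n", ggml_get_name(t)); // prints "example.weight"
    }
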
4918
5089
  struct ggml_tensor * ggml_view_tensor(
4919
5090
  struct ggml_context * ctx,
4920
5091
  const struct ggml_tensor * src) {
@@ -6014,6 +6185,7 @@ struct ggml_tensor * ggml_diag_mask_inf(
6014
6185
  //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6015
6186
  struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6016
6187
  struct ggml_tensor * b = ggml_new_i32(ctx, n_past);
6188
+ ggml_set_name(b, "n_past");
6017
6189
 
6018
6190
  result->op = GGML_OP_DIAG_MASK_INF;
6019
6191
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6071,6 +6243,7 @@ struct ggml_tensor * ggml_rope(
6071
6243
  ((int32_t *) b->data)[0] = n_past;
6072
6244
  ((int32_t *) b->data)[1] = n_dims;
6073
6245
  ((int32_t *) b->data)[2] = mode;
6246
+ ggml_set_name(b, "n_past, n_dims, mode");
6074
6247
 
6075
6248
  result->op = GGML_OP_ROPE;
6076
6249
  result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
@@ -6080,6 +6253,37 @@ struct ggml_tensor * ggml_rope(
6080
6253
  return result;
6081
6254
  }
6082
6255
 
6256
+ // ggml_alibi
6257
+
6258
+ struct ggml_tensor * ggml_alibi(
6259
+ struct ggml_context * ctx,
6260
+ struct ggml_tensor * a,
6261
+ int n_past,
6262
+ int n_head) {
6263
+ GGML_ASSERT(n_past >= 0);
6264
+ bool is_node = false;
6265
+
6266
+ if (a->grad) {
6267
+ GGML_ASSERT(false); // TODO: implement backward
6268
+ is_node = true;
6269
+ }
6270
+
6271
+ // TODO: when implement backward, fix this:
6272
+ //struct ggml_tensor * result = inplace ? ggml_view_tensor(ctx, a) : ggml_dup_tensor(ctx, a);
6273
+ struct ggml_tensor * result = ggml_view_tensor(ctx, a);
6274
+
6275
+ struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, 2);
6276
+ ((int32_t *) b->data)[0] = n_past;
6277
+ ((int32_t *) b->data)[1] = n_head;
6278
+
6279
+ result->op = GGML_OP_ALIBI;
6280
+ result->grad = is_node ? ggml_dup_tensor(ctx, result) : NULL;
6281
+ result->src0 = a;
6282
+ result->src1 = b;
6283
+
6284
+ return result;
6285
+ }
6286
+
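
ALiBi replaces positional embeddings with a per-head linear bias on the attention scores. A hedged sketch of where the new op would sit in a graph; KQ_scaled and the surrounding hyperparameters are assumptions taken from a typical attention block, not from this diff:

    #include "ggml.h"

    // KQ_scaled has shape [n_past + n, n, n_head, 1]; ggml_alibi() returns a view
    // of its input with the head-dependent slopes added during the forward pass.
    struct ggml_tensor * apply_alibi(struct ggml_context * ctx,
                                     struct ggml_tensor  * KQ_scaled,
                                     int n_past, int n_head) {
        return ggml_alibi(ctx, KQ_scaled, n_past, n_head);
    }
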
6083
6287
  // ggml_conv_1d_1s
6084
6288
 
6085
6289
  struct ggml_tensor * ggml_conv_1d_1s(
@@ -7199,7 +7403,6 @@ static void ggml_compute_forward_add(
7199
7403
  case GGML_TYPE_Q4_0:
7200
7404
  case GGML_TYPE_Q4_1:
7201
7405
  case GGML_TYPE_Q4_2:
7202
- case GGML_TYPE_Q4_3:
7203
7406
  case GGML_TYPE_Q5_0:
7204
7407
  case GGML_TYPE_Q5_1:
7205
7408
  case GGML_TYPE_Q8_0:
@@ -8108,7 +8311,7 @@ static void ggml_compute_forward_rms_norm(
8108
8311
 
8109
8312
  // ggml_compute_forward_mul_mat
8110
8313
 
8111
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
8314
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8112
8315
  // helper function to determine if it is better to use BLAS or not
8113
8316
  // for large matrices, BLAS is faster
8114
8317
  static bool ggml_compute_forward_mul_mat_use_blas(
@@ -8125,7 +8328,8 @@ static bool ggml_compute_forward_mul_mat_use_blas(
8125
8328
 
8126
8329
  // TODO: find the optimal values for these
8127
8330
  if (ggml_is_contiguous(src0) &&
8128
- ggml_is_contiguous(src1) && ((ne0 >= 32 && ne1 >= 32 && ne10 >= 32))) {
8331
+ ggml_is_contiguous(src1) &&
8332
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
8129
8333
 
8130
8334
  /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
8131
8335
  return true;
@@ -8133,7 +8337,6 @@ static bool ggml_compute_forward_mul_mat_use_blas(
8133
8337
 
8134
8338
  return false;
8135
8339
  }
8136
-
8137
8340
  #endif
8138
8341
 
8139
8342
  static void ggml_compute_forward_mul_mat_f32(
@@ -8149,7 +8352,7 @@ static void ggml_compute_forward_mul_mat_f32(
8149
8352
  const int64_t ne02 = src0->ne[2];
8150
8353
  const int64_t ne03 = src0->ne[3];
8151
8354
 
8152
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
8355
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8153
8356
  const int64_t ne10 = src1->ne[0];
8154
8357
  #endif
8155
8358
  const int64_t ne11 = src1->ne[1];
@@ -8206,7 +8409,16 @@ static void ggml_compute_forward_mul_mat_f32(
8206
8409
  // nb01 >= nb00 - src0 is not transposed
8207
8410
  // compute by src0 rows
8208
8411
 
8209
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
8412
+ #if defined(GGML_USE_CUBLAS)
8413
+ if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
8414
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
8415
+ ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
8416
+ }
8417
+ return;
8418
+ }
8419
+ #endif
8420
+
8421
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8210
8422
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
8211
8423
  if (params->ith != 0) {
8212
8424
  return;
@@ -8220,42 +8432,13 @@ static void ggml_compute_forward_mul_mat_f32(
8220
8432
  return;
8221
8433
  }
8222
8434
 
8223
- #if defined(GGML_USE_CUBLAS)
8224
- const float alpha = 1.0f;
8225
- const float beta = 0.0f;
8226
- const int x_ne = ne01 * ne10;
8227
- const int y_ne = ne11 * ne10;
8228
- const int d_ne = ne11 * ne01;
8229
-
8230
- size_t x_size, y_size, d_size;
8231
- float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
8232
- float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
8233
- float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
8234
- #endif
8235
-
8236
8435
  for (int64_t i03 = 0; i03 < ne03; i03++) {
8237
8436
  for (int64_t i02 = 0; i02 < ne02; i02++) {
8238
8437
  const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
8239
8438
  const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
8240
-
8241
8439
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
8242
8440
 
8243
- #if defined(GGML_USE_CUBLAS)
8244
- // copy data to device
8245
- CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(float) * x_ne, cudaMemcpyHostToDevice, g_cudaStream));
8246
- CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
8247
-
8248
- // compute
8249
- CUBLAS_CHECK(
8250
- cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
8251
- ne01, ne11, ne10,
8252
- &alpha, d_X, ne00,
8253
- d_Y, ne10,
8254
- &beta, d_D, ne01));
8255
-
8256
- // copy data to host
8257
- CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
8258
- #elif defined(GGML_USE_CLBLAST)
8441
+ #if defined(GGML_USE_CLBLAST)
8259
8442
  // zT = y * xT
8260
8443
  ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
8261
8444
  ne11, ne01, ne10,
@@ -8272,12 +8455,6 @@ static void ggml_compute_forward_mul_mat_f32(
8272
8455
  #endif
8273
8456
  }
8274
8457
  }
8275
- #if defined(GGML_USE_CUBLAS)
8276
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
8277
- ggml_cuda_pool_free(d_X, x_size);
8278
- ggml_cuda_pool_free(d_Y, y_size);
8279
- ggml_cuda_pool_free(d_D, d_size);
8280
- #endif
8281
8458
  //printf("CBLAS F32 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
8282
8459
 
8283
8460
  return;
@@ -8407,7 +8584,16 @@ static void ggml_compute_forward_mul_mat_f16_f32(
8407
8584
  // nb01 >= nb00 - src0 is not transposed
8408
8585
  // compute by src0 rows
8409
8586
 
8410
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
8587
+ #if defined(GGML_USE_CUBLAS)
8588
+ if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
8589
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
8590
+ ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
8591
+ }
8592
+ return;
8593
+ }
8594
+ #endif
8595
+
8596
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8411
8597
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
8412
8598
  GGML_ASSERT(nb10 == sizeof(float));
8413
8599
 
@@ -8423,35 +8609,9 @@ static void ggml_compute_forward_mul_mat_f16_f32(
8423
8609
  return;
8424
8610
  }
8425
8611
 
8426
- #if defined(GGML_USE_CUBLAS)
8427
- ggml_fp16_t * const wdata = params->wdata;
8428
-
8429
- const float alpha = 1.0f;
8430
- const float beta = 0.0f;
8431
- const int x_ne = ne01 * ne10;
8432
- const int y_ne = ne11 * ne10;
8433
- const int d_ne = ne11 * ne01;
8434
-
8435
- size_t x_size, y_size, d_size;
8436
- float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
8437
- float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
8438
- float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
8439
- #else
8440
- float * const wdata = params->wdata;
8441
- #endif
8442
8612
  for (int64_t i03 = 0; i03 < ne03; i03++) {
8443
8613
  for (int64_t i02 = 0; i02 < ne02; i02++) {
8444
- #if defined(GGML_USE_CUBLAS)
8445
- // with cuBlAS, instead of converting src0 to fp32, we convert src1 to fp16
8446
- {
8447
- size_t id = 0;
8448
- for (int64_t i01 = 0; i01 < ne11; ++i01) {
8449
- for (int64_t i00 = 0; i00 < ne10; ++i00) {
8450
- wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10));
8451
- }
8452
- }
8453
- }
8454
- #else
8614
+ float * const wdata = params->wdata;
8455
8615
  {
8456
8616
  size_t id = 0;
8457
8617
  for (int64_t i01 = 0; i01 < ne01; ++i01) {
@@ -8459,32 +8619,11 @@ static void ggml_compute_forward_mul_mat_f16_f32(
8459
8619
  wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
8460
8620
  }
8461
8621
  }
8462
- }
8463
- #endif
8464
-
8465
- #if defined(GGML_USE_CUBLAS)
8466
- const ggml_fp16_t * x = (ggml_fp16_t *) ((char *) src0->data + i02*nb02 + i03*nb03);
8467
- const ggml_fp16_t * y = (ggml_fp16_t *) wdata;
8468
8622
 
8469
- float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
8623
+ assert(id*sizeof(float) <= params->wsize);
8624
+ }
8470
8625
 
8471
- // copy data to device
8472
- CUDA_CHECK(cudaMemcpyAsync(d_X, x, sizeof(ggml_fp16_t) * x_ne, cudaMemcpyHostToDevice, g_cudaStream));
8473
- CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(ggml_fp16_t) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
8474
-
8475
- // compute
8476
- CUBLAS_CHECK(
8477
- cublasGemmEx(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
8478
- ne01, ne11, ne10,
8479
- &alpha, d_X, CUDA_R_16F, ne00,
8480
- d_Y, CUDA_R_16F, ne10,
8481
- &beta, d_D, CUDA_R_32F, ne01,
8482
- CUBLAS_COMPUTE_32F,
8483
- CUBLAS_GEMM_DEFAULT));
8484
-
8485
- // copy data to host
8486
- CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
8487
- #elif defined(GGML_USE_CLBLAST)
8626
+ #if defined(GGML_USE_CLBLAST)
8488
8627
  const float * x = wdata;
8489
8628
  const float * y = (float *) ((char *) src1->data + i02*nb12 + i03*nb13);
8490
8629
 
@@ -8513,12 +8652,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
8513
8652
  }
8514
8653
  }
8515
8654
 
8516
- #if defined(GGML_USE_CUBLAS)
8517
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
8518
- ggml_cuda_pool_free(d_X, x_size);
8519
- ggml_cuda_pool_free(d_Y, y_size);
8520
- ggml_cuda_pool_free(d_D, d_size);
8521
- #endif
8522
8655
  /*printf("CBLAS F16 = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);*/
8523
8656
 
8524
8657
  return;
@@ -8671,7 +8804,16 @@ static void ggml_compute_forward_mul_mat_q_f32(
8671
8804
  // nb01 >= nb00 - src0 is not transposed
8672
8805
  // compute by src0 rows
8673
8806
 
8674
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
8807
+ #if defined(GGML_USE_CUBLAS)
8808
+ if (ggml_cuda_can_mul_mat(src0, src1, dst)) {
8809
+ if (params->ith == 0 && params->type == GGML_TASK_COMPUTE) {
8810
+ ggml_cuda_mul_mat(src0, src1, dst, params->wdata, params->wsize);
8811
+ }
8812
+ return;
8813
+ }
8814
+ #endif
8815
+
8816
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
8675
8817
  if (ggml_compute_forward_mul_mat_use_blas(src0, src1, dst)) {
8676
8818
  if (params->ith != 0) {
8677
8819
  return;
@@ -8685,48 +8827,8 @@ static void ggml_compute_forward_mul_mat_q_f32(
8685
8827
  return;
8686
8828
  }
8687
8829
 
8688
- #if defined(GGML_USE_CUBLAS)
8689
- const float alpha = 1.0f;
8690
- const float beta = 0.0f;
8691
- const int x_ne = ne01 * ne10;
8692
- const int y_ne = ne11 * ne10;
8693
- const int d_ne = ne11 * ne01;
8694
-
8695
- size_t x_size, y_size, d_size, q_size;
8696
- float *d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
8697
- float *d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
8698
- float *d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
8699
- float *d_Q = ggml_cuda_pool_malloc(GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], &q_size);
8700
-
8701
- void (*dequantize_row_q_cuda)(const void * x, float * y, int k, cudaStream_t stream) = NULL;
8702
- if (type == GGML_TYPE_Q4_0) {
8703
- dequantize_row_q_cuda = dequantize_row_q4_0_cuda;
8704
- }
8705
- else if (type == GGML_TYPE_Q4_1) {
8706
- dequantize_row_q_cuda = dequantize_row_q4_1_cuda;
8707
- }
8708
- else if (type == GGML_TYPE_Q4_2) {
8709
- dequantize_row_q_cuda = dequantize_row_q4_2_cuda;
8710
- }
8711
- else if (type == GGML_TYPE_Q4_3) {
8712
- dequantize_row_q_cuda = dequantize_row_q4_3_cuda;
8713
- }
8714
- else if (type == GGML_TYPE_Q5_0) {
8715
- dequantize_row_q_cuda = dequantize_row_q5_0_cuda;
8716
- }
8717
- else if (type == GGML_TYPE_Q5_1) {
8718
- dequantize_row_q_cuda = dequantize_row_q5_1_cuda;
8719
- }
8720
- else if (type == GGML_TYPE_Q8_0) {
8721
- dequantize_row_q_cuda = dequantize_row_q8_0_cuda;
8722
- }
8723
- else {
8724
- GGML_ASSERT(false);
8725
- }
8726
- #elif !defined(GGML_USE_CLBLAST)
8727
8830
  float * const wdata = params->wdata;
8728
8831
  dequantize_row_q_t const dequantize_row_q = quantize_fns[type].dequantize_row_q;
8729
- #endif
8730
8832
 
8731
8833
  for (int64_t i03 = 0; i03 < ne03; i03++) {
8732
8834
  for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -8734,15 +8836,7 @@ static void ggml_compute_forward_mul_mat_q_f32(
8734
8836
 
8735
8837
  float * d = (float *) ((char *) dst->data + i02*nb2 + i03*nb3);
8736
8838
 
8737
- #if defined(GGML_USE_CUBLAS)
8738
- // copy and dequantize on device
8739
- CUDA_CHECK(
8740
- cudaMemcpyAsync(d_Q, (char *) src0->data + i03*nb03 + i02*nb02,
8741
- GGML_TYPE_SIZE[type] * x_ne / GGML_BLCK_SIZE[type], cudaMemcpyHostToDevice, g_cudaStream));
8742
-
8743
- dequantize_row_q_cuda(d_Q, d_X, ne01 * ne00, g_cudaStream);
8744
- CUDA_CHECK(cudaGetLastError());
8745
- #elif defined(GGML_USE_CLBLAST)
8839
+ #if defined(GGML_USE_CLBLAST)
8746
8840
  const void* x = (char *) src0->data + i03*nb03 + i02*nb02;
8747
8841
  #else
8748
8842
  {
@@ -8751,26 +8845,14 @@ static void ggml_compute_forward_mul_mat_q_f32(
8751
8845
  dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
8752
8846
  id += ne00;
8753
8847
  }
8848
+
8849
+ assert(id*sizeof(float) <= params->wsize);
8754
8850
  }
8851
+
8755
8852
  const float * x = wdata;
8756
8853
  #endif
8757
8854
 
8758
-
8759
- #if defined(GGML_USE_CUBLAS)
8760
- // copy data to device
8761
- CUDA_CHECK(cudaMemcpyAsync(d_Y, y, sizeof(float) * y_ne, cudaMemcpyHostToDevice, g_cudaStream));
8762
-
8763
- // compute
8764
- CUBLAS_CHECK(
8765
- cublasSgemm(g_cublasH, CUBLAS_OP_T, CUBLAS_OP_N,
8766
- ne01, ne11, ne10,
8767
- &alpha, d_X, ne00,
8768
- d_Y, ne10,
8769
- &beta, d_D, ne01));
8770
-
8771
- // copy data to host
8772
- CUDA_CHECK(cudaMemcpyAsync(d, d_D, sizeof(float) * d_ne, cudaMemcpyDeviceToHost, g_cudaStream));
8773
- #elif defined(GGML_USE_CLBLAST)
8855
+ #if defined(GGML_USE_CLBLAST)
8774
8856
  // zT = y * xT
8775
8857
  ggml_cl_sgemm_wrapper(GGML_BLAS_ORDER_ROW_MAJOR, GGML_BLAS_OP_N, GGML_BLAS_OP_T,
8776
8858
  ne11, ne01, ne10,
@@ -8788,13 +8870,6 @@ static void ggml_compute_forward_mul_mat_q_f32(
8788
8870
  }
8789
8871
  }
8790
8872
 
8791
- #if defined(GGML_USE_CUBLAS)
8792
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStream));
8793
- ggml_cuda_pool_free(d_X, x_size);
8794
- ggml_cuda_pool_free(d_Y, y_size);
8795
- ggml_cuda_pool_free(d_D, d_size);
8796
- ggml_cuda_pool_free(d_Q, q_size);
8797
- #endif
8798
8873
  //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
8799
8874
 
8800
8875
  return;
@@ -8883,7 +8958,6 @@ static void ggml_compute_forward_mul_mat(
8883
8958
  case GGML_TYPE_Q4_0:
8884
8959
  case GGML_TYPE_Q4_1:
8885
8960
  case GGML_TYPE_Q4_2:
8886
- case GGML_TYPE_Q4_3:
8887
8961
  case GGML_TYPE_Q5_0:
8888
8962
  case GGML_TYPE_Q5_1:
8889
8963
  case GGML_TYPE_Q8_0:
@@ -9115,7 +9189,6 @@ static void ggml_compute_forward_get_rows(
9115
9189
  case GGML_TYPE_Q4_0:
9116
9190
  case GGML_TYPE_Q4_1:
9117
9191
  case GGML_TYPE_Q4_2:
9118
- case GGML_TYPE_Q4_3:
9119
9192
  case GGML_TYPE_Q5_0:
9120
9193
  case GGML_TYPE_Q5_1:
9121
9194
  case GGML_TYPE_Q8_0:
@@ -9300,6 +9373,161 @@ static void ggml_compute_forward_soft_max(
9300
9373
  }
9301
9374
  }
9302
9375
 
9376
+ // ggml_compute_forward_alibi
9377
+
9378
+ static void ggml_compute_forward_alibi_f32(
9379
+ const struct ggml_compute_params * params,
9380
+ const struct ggml_tensor * src0,
9381
+ const struct ggml_tensor * src1,
9382
+ struct ggml_tensor * dst) {
9383
+ assert(params->ith == 0);
9384
+ assert(src1->type == GGML_TYPE_I32);
9385
+ assert(ggml_nelements(src1) == 2);
9386
+
9387
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9388
+ return;
9389
+ }
9390
+
9391
+ const int n_past = ((int32_t *) src1->data)[0];
9392
+ const int n_head = ((int32_t *) src1->data)[1];
9393
+
9394
+ const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
9395
+ const int ne1 = src0->ne[1]; // seq_len_without_past
9396
+ //const int ne2 = src0->ne[2]; // n_head -> this is k
9397
+ //const int ne3 = src0->ne[3]; // 1 -> bsz
9398
+
9399
+ const int n = ggml_nrows(src0);
9400
+ const int ne2_ne3 = n/ne1; // ne2*ne3
9401
+
9402
+ const int nb0 = src0->nb[0];
9403
+ const int nb1 = src0->nb[1];
9404
+ const int nb2 = src0->nb[2];
9405
+ //const int nb3 = src0->nb[3];
9406
+
9407
+ assert(nb0 == sizeof(float));
9408
+ assert(ne1 + n_past == ne0); (void) n_past;
9409
+
9410
+ // add alibi to src0 (KQ_scaled)
9411
+ const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
9412
+
9413
+ const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
9414
+ const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
9415
+
9416
+ for (int i = 0; i < ne0; i++) {
9417
+ for (int j = 0; j < ne1; j++) {
9418
+ for (int k = 0; k < ne2_ne3; k++) {
9419
+ float * const src = (float *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
9420
+ float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
9421
+
9422
+ // TODO: k*nb2 or k*nb3
9423
+
9424
+ float m_k;
9425
+
9426
+ if (k < n_heads_log2_floor) {
9427
+ m_k = powf(m0, k + 1);
9428
+ } else {
9429
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
9430
+ }
9431
+
9432
+ pdst[0] = (j+1) * m_k + src[0];
9433
+ }
9434
+ }
9435
+ }
9436
+ }
9437
+
9438
+
9439
+ static void ggml_compute_forward_alibi_f16(
9440
+ const struct ggml_compute_params * params,
9441
+ const struct ggml_tensor * src0,
9442
+ const struct ggml_tensor * src1,
9443
+ struct ggml_tensor * dst) {
9444
+ assert(params->ith == 0);
9445
+ assert(src1->type == GGML_TYPE_I32);
9446
+ assert(ggml_nelements(src1) == 2);
9447
+
9448
+ if (params->type == GGML_TASK_INIT || params->type == GGML_TASK_FINALIZE) {
9449
+ return;
9450
+ }
9451
+
9452
+ const int n_past = ((int32_t *) src1->data)[0];
9453
+ const int n_head = ((int32_t *) src1->data)[1];
9454
+
9455
+ const int ne0 = src0->ne[0]; // all_seq_len = n_past + ne1
9456
+ const int ne1 = src0->ne[1]; // seq_len_without_past
9457
+ //const int ne2 = src0->ne[2]; // n_head -> this is k
9458
+ //const int ne3 = src0->ne[3]; // 1 -> bsz
9459
+
9460
+ const int n = ggml_nrows(src0);
9461
+ const int ne2_ne3 = n/ne1; // ne2*ne3
9462
+
9463
+ const int nb0 = src0->nb[0];
9464
+ const int nb1 = src0->nb[1];
9465
+ const int nb2 = src0->nb[2];
9466
+ //const int nb3 = src0->nb[3];
9467
+
9468
+ assert(nb0 == sizeof(ggml_fp16_t));
9469
+ assert(ne1 + n_past == ne0); (void) n_past;
9470
+
9471
+ // add alibi to src0 (KQ_scaled)
9472
+ const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
9473
+
9474
+ const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);
9475
+ const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);
9476
+
9477
+ for (int i = 0; i < ne0; i++) {
9478
+ for (int j = 0; j < ne1; j++) {
9479
+ for (int k = 0; k < ne2_ne3; k++) {
9480
+ ggml_fp16_t * const src = (ggml_fp16_t *)((char *) src0->data + i*nb0 + j*nb1 + k*nb2);
9481
+ float * pdst = (float *)((char *) dst->data + i*nb0 + j*nb1 + k*nb2);
9482
+
9483
+ // TODO: k*nb2 or k*nb3
9484
+
9485
+ float m_k;
9486
+
9487
+ if (k < n_heads_log2_floor) {
9488
+ m_k = powf(m0, k + 1);
9489
+ } else {
9490
+ m_k = powf(m1, 2 * (k - n_heads_log2_floor) + 1);
9491
+ }
9492
+
9493
+ // we return F32
9494
+ pdst[0] = (j+1) * m_k + GGML_FP16_TO_FP32(src[0]);
9495
+ }
9496
+ }
9497
+ }
9498
+ }
9499
+
9500
+ static void ggml_compute_forward_alibi(
9501
+ const struct ggml_compute_params * params,
9502
+ const struct ggml_tensor * src0,
9503
+ const struct ggml_tensor * src1,
9504
+ struct ggml_tensor * dst) {
9505
+ switch (src0->type) {
9506
+ case GGML_TYPE_F16:
9507
+ {
9508
+ ggml_compute_forward_alibi_f16(params, src0, src1, dst);
9509
+ } break;
9510
+ case GGML_TYPE_F32:
9511
+ {
9512
+ ggml_compute_forward_alibi_f32(params, src0, src1, dst);
9513
+ } break;
9514
+ case GGML_TYPE_Q4_0:
9515
+ case GGML_TYPE_Q4_1:
9516
+ case GGML_TYPE_Q4_2:
9517
+ case GGML_TYPE_Q5_0:
9518
+ case GGML_TYPE_Q5_1:
9519
+ case GGML_TYPE_Q8_0:
9520
+ case GGML_TYPE_Q8_1:
9521
+ case GGML_TYPE_I8:
9522
+ case GGML_TYPE_I16:
9523
+ case GGML_TYPE_I32:
9524
+ case GGML_TYPE_COUNT:
9525
+ {
9526
+ GGML_ASSERT(false);
9527
+ } break;
9528
+ }
9529
+ }
9530
+
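The new ALiBi op adds a per-head, per-position bias on top of KQ_scaled. The slope schedule in both kernels is: with n_heads_log2_floor = 2^floor(log2(n_head)), head k uses m0^(k+1) with m0 = 2^(-8/n_heads_log2_floor) while k is below n_heads_log2_floor, and m1^(2*(k - n_heads_log2_floor)+1) with m1 = 2^(-4/n_heads_log2_floor) for the remaining heads; the value added to each element is then (j+1)*m_k. A small standalone check of the slope values (the head count of 12 is only an example):

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const int n_head = 12;                                          /* example only */
        const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));  /* = 8 here */

        const float m0 = powf(2.0f, -8.0f / n_heads_log2_floor);        /* 2^-1   */
        const float m1 = powf(2.0f, -4.0f / n_heads_log2_floor);        /* 2^-0.5 */

        for (int k = 0; k < n_head; k++) {
            /* same per-head slope selection as ggml_compute_forward_alibi_f32/f16 */
            const float m_k = (k < n_heads_log2_floor)
                ? powf(m0, k + 1)
                : powf(m1, 2 * (k - n_heads_log2_floor) + 1);
            printf("head %2d: slope %g\n", k, m_k);
        }
        return 0;
    }

For this example the first eight heads get slopes 1/2, 1/4, ..., 1/256 and the last four get 2^-0.5, 2^-1.5, 2^-2.5, 2^-3.5, which is the interleaving ALiBi uses when n_head is not a power of two.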
9303
9531
  // ggml_compute_forward_rope
9304
9532
 
9305
9533
  static void ggml_compute_forward_rope_f32(
@@ -10938,6 +11166,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
10938
11166
  {
10939
11167
  ggml_compute_forward_rope(params, tensor->src0, tensor->src1, tensor);
10940
11168
  } break;
11169
+ case GGML_OP_ALIBI:
11170
+ {
11171
+ ggml_compute_forward_alibi(params, tensor->src0, tensor->src1, tensor);
11172
+ } break;
10941
11173
  case GGML_OP_CONV_1D_1S:
10942
11174
  {
10943
11175
  ggml_compute_forward_conv_1d_1s(params, tensor->src0, tensor->src1, tensor);
@@ -11140,6 +11372,10 @@ static void ggml_compute_backward(struct ggml_context * ctx, struct ggml_tensor
11140
11372
  {
11141
11373
  GGML_ASSERT(false); // TODO: not implemented
11142
11374
  } break;
11375
+ case GGML_OP_ALIBI:
11376
+ {
11377
+ GGML_ASSERT(false); // TODO: not implemented
11378
+ } break;
11143
11379
  case GGML_OP_SILU:
11144
11380
  {
11145
11381
  GGML_ASSERT(false); // TODO: not implemented
@@ -11617,15 +11853,21 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
11617
11853
 
11618
11854
  size_t cur = 0;
11619
11855
 
11856
+ #if defined(GGML_USE_CUBLAS)
11857
+ if (ggml_cuda_can_mul_mat(node->src0, node->src1, node)) {
11858
+ node->n_tasks = 1; // TODO: this actually is doing nothing
11859
+ // the threads are still spinning
11860
+ cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
11861
+ }
11862
+ else
11863
+ #endif
11620
11864
  if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
11621
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
11865
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
11622
11866
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
11623
11867
  node->n_tasks = 1; // TODO: this actually is doing nothing
11624
11868
  // the threads are still spinning
11869
+ // here we need memory just for single 2D matrix from src0
11625
11870
  cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
11626
- //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
11627
- //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
11628
- //printf("cur = %zu\n", cur);
11629
11871
  } else {
11630
11872
  cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
11631
11873
  }
@@ -11634,8 +11876,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
11634
11876
  #endif
11635
11877
  } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
11636
11878
  cur = 0;
11879
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
11880
+ if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
11881
+ node->n_tasks = 1;
11882
+ }
11883
+ #endif
11637
11884
  } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) {
11638
- #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
11885
+ #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CLBLAST)
11639
11886
  if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
11640
11887
  node->n_tasks = 1;
11641
11888
  cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
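In this planning hunk the cuBLAS case is now decided up front: when ggml_cuda_can_mul_mat accepts the node, n_tasks is forced to 1 and the work size is whatever ggml_cuda_mul_mat_get_wsize reports. For the CPU BLAS paths, an f16 or quantized src0 that is routed through sgemm needs scratch for one dequantized 2-D slice of src0 in f32, i.e. 4 * ne[0] * ne[1] bytes regardless of the quantization format. A quick check of that arithmetic (the 4096 x 4096 shape is only an example):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const int64_t ne0 = 4096, ne1 = 4096;   /* example weight slice */
        /* mirrors cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(ne0*ne1) */
        const size_t cur = sizeof(float) * (size_t) (ne0 * ne1);
        printf("scratch per slice: %zu bytes (%.1f MiB)\n", cur, cur / (1024.0 * 1024.0));
        return 0;
    }

For the example shape that is 67,108,864 bytes, i.e. 64 MiB of work-buffer demand from this one node.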
@@ -11673,6 +11920,10 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
11673
11920
  {
11674
11921
  node->n_tasks = n_threads;
11675
11922
  } break;
11923
+ case GGML_OP_ALIBI:
11924
+ {
11925
+ node->n_tasks = 1; //TODO
11926
+ } break;
11676
11927
  case GGML_OP_CONV_1D_1S:
11677
11928
  case GGML_OP_CONV_1D_2S:
11678
11929
  {
@@ -12060,10 +12311,16 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph
12060
12311
  snprintf(color, sizeof(color), "white");
12061
12312
  }
12062
12313
 
12063
- fprintf(fp, " \"%p\" [ \
12064
- style = filled; fillcolor = %s; shape = record; \
12065
- label=\"%d [%" PRId64 ", %" PRId64 "] | <x>%s",
12066
- (void *) node, color,
12314
+ fprintf(fp, " \"%p\" [ "
12315
+ "style = filled; fillcolor = %s; shape = record; "
12316
+ "label=\"",
12317
+ (void *) node, color);
12318
+
12319
+ if (strlen(node->name) > 0) {
12320
+ fprintf(fp, "%s |", node->name);
12321
+ }
12322
+
12323
+ fprintf(fp, "%d [%" PRId64 ", %" PRId64 "] | <x>%s",
12067
12324
  i, node->ne[0], node->ne[1],
12068
12325
  GGML_OP_SYMBOL[node->op]);
12069
12326
 
@@ -12079,18 +12336,26 @@ label=\"%d [%" PRId64 ", %" PRId64 "] | <x>%s",
12079
12336
 
12080
12337
  snprintf(color, sizeof(color), "pink");
12081
12338
 
12339
+ fprintf(fp, " \"%p\" [ "
12340
+ "style = filled; fillcolor = %s; shape = record; "
12341
+ "label=\"<x>",
12342
+ (void *) node, color);
12343
+
12344
+ if (strlen(node->name) > 0) {
12345
+ fprintf(fp, "%s | ", node->name);
12346
+ }
12082
12347
  if (ggml_nelements(node) == 1) {
12083
- fprintf(fp, " \"%p\" [ \
12084
- style = filled; fillcolor = %s; shape = record; \
12085
- label=\"<x>%.1e\"; ]\n",
12086
- (void *) node, color, (double)ggml_get_f32_1d(node, 0));
12087
- } else {
12088
- fprintf(fp, " \"%p\" [ \
12089
- style = filled; fillcolor = %s; shape = record; \
12090
- label=\"<x>CONST %d [%" PRId64 ", %" PRId64 "]\"; ]\n",
12091
- (void *) node, color,
12092
- i, node->ne[0], node->ne[1]);
12348
+ if (node->type == GGML_TYPE_I8 || node->type == GGML_TYPE_I16 || node->type == GGML_TYPE_I32) {
12349
+ fprintf(fp, "%d", ggml_get_i32_1d(node, 0));
12350
+ }
12351
+ else {
12352
+ fprintf(fp, "%.1e", (double)ggml_get_f32_1d(node, 0));
12353
+ }
12093
12354
  }
12355
+ else {
12356
+ fprintf(fp, "CONST %d [%" PRId64 ", %" PRId64 "]", i, node->ne[0], node->ne[1]);
12357
+ }
12358
+ fprintf(fp, "\"; ]\n");
12094
12359
  }
12095
12360
 
12096
12361
  for (int i = 0; i < gb->n_nodes; i++) {
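The leaf-node branch of ggml_graph_dump_dot now assembles its record label in pieces, so a tensor's name (when non-empty) is shown and integer scalars are printed as integers rather than through the float path. The snippet below simply replays that fprintf sequence for a hypothetical named constant so the resulting DOT record is easy to picture; the name, index and shape are invented for illustration:

    #include <stdio.h>
    #include <string.h>

    int main(void) {
        const char *    name = "tok_embeddings.weight";  /* hypothetical tensor name */
        const int       i    = 0;
        const long long ne0  = 4096, ne1 = 32000;

        /* same order as the new code: open the record, optional "name | ",
           then the CONST summary for multi-element leaves, then close it */
        printf(" \"%p\" [ style = filled; fillcolor = pink; shape = record; label=\"<x>", (void *) name);
        if (strlen(name) > 0) {
            printf("%s | ", name);
        }
        printf("CONST %d [%lld, %lld]", i, ne0, ne1);
        printf("\"; ]\n");
        return 0;
    }

The output is a single DOT statement along the lines of "0x55..." [ style = filled; fillcolor = pink; shape = record; label="<x>tok_embeddings.weight | CONST 0 [4096, 32000]"; ], where the real code uses the node pointer and the loop index instead of the placeholders above.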
@@ -12889,29 +13154,6 @@ size_t ggml_quantize_q4_2(const float * src, void * dst, int n, int k, int64_t *
12889
13154
  return (n/QK4_2*sizeof(block_q4_2));
12890
13155
  }
12891
13156
 
12892
- size_t ggml_quantize_q4_3(const float * src, void * dst, int n, int k, int64_t * hist) {
12893
- assert(k % QK4_3 == 0);
12894
- const int nb = k / QK4_3;
12895
-
12896
- for (int j = 0; j < n; j += k) {
12897
- block_q4_3 * restrict y = (block_q4_3 *)dst + j/QK4_3;
12898
-
12899
- quantize_row_q4_3_reference(src + j, y, k);
12900
-
12901
- for (int i = 0; i < nb; i++) {
12902
- for (int l = 0; l < QK4_3; l += 2) {
12903
- const uint8_t vi0 = y[i].qs[l/2] & 0x0F;
12904
- const uint8_t vi1 = y[i].qs[l/2] >> 4;
12905
-
12906
- hist[vi0]++;
12907
- hist[vi1]++;
12908
- }
12909
- }
12910
- }
12911
-
12912
- return (n/QK4_3*sizeof(block_q4_3));
12913
- }
12914
-
12915
13157
  size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t * hist) {
12916
13158
  assert(k % QK5_0 == 0);
12917
13159
  const int nb = k / QK5_0;
@@ -12926,8 +13168,8 @@ size_t ggml_quantize_q5_0(const float * src, void * dst, int n, int k, int64_t *
12926
13168
  memcpy(&qh, &y[i].qh, sizeof(qh));
12927
13169
 
12928
13170
  for (int l = 0; l < QK5_0; l += 2) {
12929
- const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
12930
- const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
13171
+ const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
13172
+ const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
12931
13173
 
12932
13174
  // cast to 16 bins
12933
13175
  const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
@@ -12956,8 +13198,8 @@ size_t ggml_quantize_q5_1(const float * src, void * dst, int n, int k, int64_t *
12956
13198
  memcpy(&qh, &y[i].qh, sizeof(qh));
12957
13199
 
12958
13200
  for (int l = 0; l < QK5_1; l += 2) {
12959
- const uint8_t vh0 = ((qh & (1 << (l + 0))) >> (l + 0)) << 4;
12960
- const uint8_t vh1 = ((qh & (1 << (l + 1))) >> (l + 1)) << 4;
13201
+ const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
13202
+ const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;
12961
13203
 
12962
13204
  // cast to 16 bins
12963
13205
  const uint8_t vi0 = ((y[i].qs[l/2] & 0x0F) | vh0) / 2;
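The only change in these two histogram loops is 1 << (l + ...) becoming 1u << (l + ...). QK5_0 and QK5_1 are 32, so l + 1 reaches 31, and shifting the signed literal 1 into bit 31 is undefined behavior in C; the unsigned literal keeps the whole mask computation in uint32_t. A minimal illustration of the boundary case (the sample qh value is arbitrary):

    #include <stdint.h>
    #include <stdio.h>

    int main(void) {
        const uint32_t qh = 0x80000001u;  /* arbitrary: high bits set for elements 0 and 31 */
        const int l = 30;                 /* last iteration of the l += 2 loop, so l + 1 == 31 */

        /* with a plain int literal, 1 << 31 would overflow signed int; 1u stays unsigned */
        const uint8_t vh0 = ((qh & (1u << (l + 0))) >> (l + 0)) << 4;
        const uint8_t vh1 = ((qh & (1u << (l + 1))) >> (l + 1)) << 4;

        printf("vh0 = 0x%02x, vh1 = 0x%02x\n", vh0, vh1);  /* prints 0x00, 0x10 */
        return 0;
    }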
@@ -13014,12 +13256,6 @@ size_t ggml_quantize_chunk(enum ggml_type type, const float * src, void * dst, i
13014
13256
  block_q4_2 * block = (block_q4_2*)dst + start / QK4_2;
13015
13257
  result = ggml_quantize_q4_2(src + start, block, n, n, hist);
13016
13258
  } break;
13017
- case GGML_TYPE_Q4_3:
13018
- {
13019
- GGML_ASSERT(start % QK4_3 == 0);
13020
- block_q4_3 * block = (block_q4_3*)dst + start / QK4_3;
13021
- result = ggml_quantize_q4_3(src + start, block, n, n, hist);
13022
- } break;
13023
13259
  case GGML_TYPE_Q5_0:
13024
13260
  {
13025
13261
  GGML_ASSERT(start % QK5_0 == 0);