llama_cpp 0.4.0 → 0.5.1

@@ -130,13 +130,16 @@
  //   The data of the tensor is accessed via the "data" pointer. For example:
  //
  //       {
- //           struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
+ //           const int nx = 2;
+ //           const int ny = 3;
  //
- //           // a[2, 1] = 1.0f;
- //           *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
+ //           struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
  //
- //           // a[0, 2] = 2.0f;
- //           *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
+ //           for (int y = 0; y < ny; y++) {
+ //               for (int x = 0; x < nx; x++) {
+ //                   *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
+ //               }
+ //           }
  //
  //           ...
  //       }
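The revised comment replaces two hard-coded element writes with a loop driven by the nb[] byte strides, the pattern that stays correct for non-contiguous tensors such as views and permutations. A minimal standalone sketch of the same access pattern, assuming ggml.h from the bundled llama.cpp sources is on the include path:

    #include <stdio.h>
    #include "ggml.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 16*1024*1024,
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ false,
        };
        struct ggml_context * ctx = ggml_init(params);

        const int nx = 2;
        const int ny = 3;
        struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);

        for (int y = 0; y < ny; y++) {
            for (int x = 0; x < nx; x++) {
                // nb[0] = byte stride between elements in dim 0, nb[1] = stride between rows
                *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
            }
        }

        // read back a[1, 2] (x = 1, y = 2)
        printf("a[1, 2] = %f\n", *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]));

        ggml_free(ctx);
        return 0;
    }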
@@ -211,12 +214,17 @@
  #define GGML_MAX_OP_PARAMS     32
  #define GGML_DEFAULT_N_THREADS 4
 
+ #if UINTPTR_MAX == 0xFFFFFFFF
+     #define GGML_MEM_ALIGN 4
+ #else
+     #define GGML_MEM_ALIGN 16
+ #endif
 
  #define GGML_EXIT_SUCCESS 0
  #define GGML_EXIT_ABORTED 1
 
  #define GGUF_MAGIC   0x46554747 // "GGUF"
- #define GGUF_VERSION 1
+ #define GGUF_VERSION 2
 
  #define GGUF_DEFAULT_ALIGNMENT 32
 
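Two header-level changes here: GGML_MEM_ALIGN is now 4 bytes on platforms with 32-bit pointers (where UINTPTR_MAX from <stdint.h> equals 0xFFFFFFFF) and 16 bytes elsewhere, and the GGUF file-format version is bumped to 2, matching the new 64-bit key/value types added further down in this diff. The pointer-width test in isolation (MEM_ALIGN is a stand-in name for this sketch):

    #include <stdint.h>
    #include <stdio.h>

    #if UINTPTR_MAX == 0xFFFFFFFF
        #define MEM_ALIGN 4
    #else
        #define MEM_ALIGN 16
    #endif

    int main(void) {
        printf("pointers: %d bits, alignment: %d bytes\n",
               (int) (sizeof(void *) * 8), MEM_ALIGN);
        return 0;
    }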
@@ -471,6 +479,9 @@ extern "C" {
          int64_t perf_cycles;
          int64_t perf_time_us;
 
+         struct ggml_tensor * view_src;
+         size_t               view_offs;
+
          void * data;
 
          char name[GGML_MAX_NAME];
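Every tensor now records which tensor it is a view of (view_src) and the byte offset into that tensor's data (view_offs); this bookkeeping is also why ggml_view_tensor loses the const qualifier on its src parameter in the next hunk. A hedged sketch of what the fields mean, assuming t is not itself a view and that a full view shares the source buffer:

    #include <assert.h>
    #include "ggml.h"

    // check_view is illustrative; only the field names come from the diff.
    static void check_view(struct ggml_context * ctx, struct ggml_tensor * t) {
        struct ggml_tensor * v = ggml_view_tensor(ctx, t);
        assert(v->view_src  == t);       // the tensor this one is a view of
        assert(v->view_offs == 0);       // byte offset into view_src's data
        assert(v->data      == t->data); // a full view shares the source buffer
    }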
@@ -653,7 +664,7 @@ extern "C" {
      GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
 
      GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
-     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
+     GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
 
      GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
 
@@ -944,11 +955,11 @@ extern "C" {
 
      // a - x
      // b - dy
-     // TODO: update with configurable eps
      GGML_API struct ggml_tensor * ggml_rms_norm_back(
              struct ggml_context * ctx,
              struct ggml_tensor  * a,
-             struct ggml_tensor  * b);
+             struct ggml_tensor  * b,
+             float                 eps);
 
      // A: n columns, m rows
      // B: n columns, p rows (i.e. we transpose it internally)
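The TODO is resolved: the RMS-norm backward pass now takes its epsilon explicitly, so 0.5.x callers must supply it. A sketch of an updated call site; the 1e-6f value is illustrative, not a library default:

    #include "ggml.h"

    // Before (0.4.0): ggml_rms_norm_back(ctx, x, dy);
    // After  (0.5.1): the epsilon is explicit.
    static struct ggml_tensor * rms_norm_grad(struct ggml_context * ctx,
                                              struct ggml_tensor * x,
                                              struct ggml_tensor * dy) {
        return ggml_rms_norm_back(ctx, x, dy, 1e-6f);
    }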
@@ -1604,7 +1615,8 @@ extern "C" {
              struct ggml_tensor * tensor);
 
 
-     GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+     GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+     GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
 
      GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
      GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
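The new ggml_build_backward_expand builds the backward graph into a caller-provided cgraph instead of returning it by value, which matters because struct ggml_cgraph is large enough that stack copies are costly. A sketch of the in-place pattern; starting gb as a copy of gf mirrors how ggml_build_backward itself is expected to behave:

    #include "ggml.h"

    // gf is assumed zero-initialized before the forward expand.
    static void build_graphs(struct ggml_context * ctx, struct ggml_tensor * loss,
                             struct ggml_cgraph * gf, struct ggml_cgraph * gb) {
        ggml_build_forward_expand(gf, loss);
        *gb = *gf; // the backward graph starts as a copy of the forward graph
        ggml_build_backward_expand(ctx, gf, gb, /*keep =*/ true);
    }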
@@ -1669,6 +1681,8 @@ extern "C" {
          GGML_LINESEARCH_INVALID_PARAMETERS,
      };
 
+     typedef void (*ggml_opt_callback)(void * data, float * sched);
+
      // optimization parameters
      //
      //   see ggml.c (ggml_opt_default_params) for default values
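The optimizer gains a per-iteration callback that receives an opaque data pointer and may rewrite the schedule multiplier through the float pointer. An illustrative callback; the warmup_state struct and the 100-step linear warm-up policy are assumptions for this sketch, only the signature comes from the header:

    #include "ggml.h"

    struct warmup_state {
        int step;
    };

    // Linearly warm the schedule multiplier up to 1.0 over the first 100 calls.
    static void my_opt_callback(void * data, float * sched) {
        struct warmup_state * st = (struct warmup_state *) data;
        st->step++;
        *sched = st->step < 100 ? (float) st->step / 100.0f : 1.0f;
    }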
@@ -1704,12 +1718,14 @@ extern "C" {
 
              float sched; // schedule multiplier (fixed, decay or warmup)
              float decay; // weight decay for AdamW, use 0.0f to disable
+             int   decay_min_ndim; // minimum number of tensor dimensions to apply weight decay
              float alpha; // learning rate
              float beta1;
              float beta2;
              float eps;   // epsilon for numerical stability
              float eps_f; // epsilon for convergence test
              float eps_g; // epsilon for convergence test
+             float gclip; // gradient clipping
          } adam;
 
          // LBFGS parameters
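Two new Adam knobs: decay_min_ndim restricts weight decay to tensors with at least that many dimensions (typically sparing 1-D biases and norm weights), and gclip enables gradient clipping. A sketch of setting them on top of the library defaults; the concrete values are illustrative, not recommendations:

    #include "ggml.h"

    static struct ggml_opt_params make_adam_params(void) {
        struct ggml_opt_params params = ggml_opt_default_params(GGML_OPT_ADAM);
        params.adam.decay          = 0.01f; // AdamW weight decay
        params.adam.decay_min_ndim = 2;     // only decay tensors with >= 2 dimensions
        params.adam.gclip          = 1.0f;  // clip gradients at this magnitude
        return params;
    }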
@@ -1737,14 +1753,12 @@ extern "C" {
 
          bool just_initialized;
 
+         float loss_before;
+         float loss_after;
+
          struct {
-             struct ggml_tensor * x;  // view of the parameters
-             struct ggml_tensor * g1; // gradient
-             struct ggml_tensor * g2; // gradient squared
              struct ggml_tensor * m;  // first moment
              struct ggml_tensor * v;  // second moment
-             struct ggml_tensor * mh; // first moment hat
-             struct ggml_tensor * vh; // second moment hat
              struct ggml_tensor * pf; // past function values
              float fx_best;
              float fx_prev;
@@ -1781,10 +1795,10 @@ extern "C" {
 
      // initialize optimizer context
      GGML_API void ggml_opt_init(
-             struct ggml_context * ctx,
+             struct ggml_context     * ctx,
              struct ggml_opt_context * opt,
-             struct ggml_opt_params params,
-             int64_t nx);
+             struct ggml_opt_params    params,
+             int64_t                   nx);
 
      // continue optimizing the function defined by the tensor f
      GGML_API enum ggml_opt_result ggml_opt_resume(
@@ -1798,7 +1812,9 @@ extern "C" {
              struct ggml_opt_context * opt,
              struct ggml_tensor * f,
              struct ggml_cgraph * gf,
-             struct ggml_cgraph * gb);
+             struct ggml_cgraph * gb,
+             ggml_opt_callback callback,
+             void * callback_data);
 
      //
      // quantization
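ggml_opt_resume_g now threads the callback and its data pointer through to the optimizer. A sketch wiring in my_opt_callback and struct warmup_state from the callback example above; ctx, opt, f, gf, and gb are assumed to be initialized elsewhere:

    #include "ggml.h"

    static enum ggml_opt_result run_step(struct ggml_context * ctx,
                                         struct ggml_opt_context * opt,
                                         struct ggml_tensor * f,
                                         struct ggml_cgraph * gf,
                                         struct ggml_cgraph * gb,
                                         struct warmup_state * st) {
        return ggml_opt_resume_g(ctx, opt, f, gf, gb, my_opt_callback, st);
    }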
@@ -1827,6 +1843,9 @@ extern "C" {
          GGUF_TYPE_BOOL    = 7,
          GGUF_TYPE_STRING  = 8,
          GGUF_TYPE_ARRAY   = 9,
+         GGUF_TYPE_UINT64  = 10,
+         GGUF_TYPE_INT64   = 11,
+         GGUF_TYPE_FLOAT64 = 12,
          GGUF_TYPE_COUNT, // marks the end of the enum
      };
 
@@ -1867,6 +1886,9 @@ extern "C" {
      GGML_API uint32_t     gguf_get_val_u32 (struct gguf_context * ctx, int i);
      GGML_API int32_t      gguf_get_val_i32 (struct gguf_context * ctx, int i);
      GGML_API float        gguf_get_val_f32 (struct gguf_context * ctx, int i);
+     GGML_API uint64_t     gguf_get_val_u64 (struct gguf_context * ctx, int i);
+     GGML_API int64_t      gguf_get_val_i64 (struct gguf_context * ctx, int i);
+     GGML_API double       gguf_get_val_f64 (struct gguf_context * ctx, int i);
      GGML_API bool         gguf_get_val_bool(struct gguf_context * ctx, int i);
      GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
      GGML_API int          gguf_get_arr_n   (struct gguf_context * ctx, int i);
@@ -1886,6 +1908,9 @@ extern "C" {
      GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
      GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
      GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
+     GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
+     GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
+     GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
      GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
      GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
      GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
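GGUF v2 (the version bump earlier in this diff) adds 64-bit integer and double-precision key/value types, with matching getters above and setters here. A usage sketch; the key name "example.param_count" is invented, and gguf_find_key is the existing lookup that maps a key to the index the getters take:

    #include <inttypes.h>
    #include <stdio.h>
    #include "ggml.h"

    static void demo_u64(struct gguf_context * gguf) {
        gguf_set_val_u64(gguf, "example.param_count", 7000000000ULL);

        const int i = gguf_find_key(gguf, "example.param_count");
        if (i >= 0) {
            printf("param_count = %" PRIu64 "\n", gguf_get_val_u64(gguf, i));
        }
    }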
@@ -1944,6 +1969,7 @@ extern "C" {
      GGML_API int ggml_cpu_has_clblast    (void);
      GGML_API int ggml_cpu_has_gpublas    (void);
      GGML_API int ggml_cpu_has_sse3       (void);
+     GGML_API int ggml_cpu_has_ssse3      (void);
      GGML_API int ggml_cpu_has_vsx        (void);
 
      //
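The hunks above are against the vendored ggml.h; the hunks below are against the k-quants quantization kernels (k_quants.c in the bundled llama.cpp sources), starting with ARM NEON compatibility shims.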
@@ -13,6 +13,26 @@
  //
  #include <arm_neon.h>
 
+ #if !defined(__aarch64__)
+ inline static int32_t vaddvq_s16(int16x8_t v) {
+     return
+         (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+         (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+         (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+         (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+ }
+
+ inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+     int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+     int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+     return vcombine_s16(a0, b0);
+ }
+
+ inline static int32_t vaddvq_s32(int32x4_t v) {
+     return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+ }
+ #endif
+
  #else
 
  #ifdef __wasm_simd128__
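On 32-bit ARM (where __aarch64__ is undefined), the AArch64-only horizontal-add and pairwise-add intrinsics vaddvq_s16, vpaddq_s16, and vaddvq_s32 do not exist, so the diff adds emulations built from 32-bit-safe intrinsics so the kernels below still compile. For reference, a plain-C statement of what each shim computes (illustration only, no NEON required):

    #include <stdint.h>

    // vaddvq_s32: horizontal add of all four lanes
    static int32_t horizontal_add_s32x4(const int32_t v[4]) {
        return v[0] + v[1] + v[2] + v[3];
    }

    // vpaddq_s16: low half = pairwise sums of a, high half = pairwise sums of b
    static void pairwise_add_s16x8(const int16_t a[8], const int16_t b[8], int16_t out[8]) {
        for (int i = 0; i < 4; i++) out[i]     = (int16_t)(a[2*i] + a[2*i + 1]);
        for (int i = 0; i < 4; i++) out[i + 4] = (int16_t)(b[2*i] + b[2*i + 1]);
    }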
@@ -63,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
          float ax = fabsf(x[i]);
          if (ax > amax) { amax = ax; max = x[i]; }
      }
-     if (!amax) { // all zero
+     if (amax < 1e-30f) { // all zero
          for (int i = 0; i < n; ++i) {
              L[i] = 0;
          }
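Testing amax < 1e-30f instead of !amax also catches tiny-but-nonzero inputs: for a denormal amax the old guard does not fire, yet the quantizer's scale (on the order of nmax/amax) overflows float. A small demonstration:

    #include <stdio.h>

    int main(void) {
        const float amax = 1e-39f; // denormal: tiny but nonzero
        printf("!amax         -> %d\n", !amax);                // 0: old guard does not fire
        printf("amax < 1e-30f -> %d\n", amax < 1e-30f);        // 1: new guard does
        printf("1.0f/amax     -> %g\n", (double)(1.0f/amax));  // inf: the scale blows up
        return 0;
    }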
@@ -183,13 +203,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
                                 int ntry, float alpha) {
      float min = x[0];
      float max = x[0];
-     float sum_x = 0;
-     float sum_x2 = 0;
      for (int i = 1; i < n; ++i) {
          if (x[i] < min) min = x[i];
          if (x[i] > max) max = x[i];
-         sum_x += x[i];
-         sum_x2 += x[i]*x[i];
      }
      if (max == min) {
          for (int i = 0; i < n; ++i) L[i] = 0;
@@ -1070,6 +1086,13 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
 
          }
 
+         if (!max_abs_scale) {
+             memset(&y[i], 0, sizeof(block_q6_K));
+             y[i].d = ggml_fp32_to_fp16(0.f);
+             x += QK_K;
+             continue;
+         }
+
          float iscale = -128.f/max_scale;
          y[i].d = ggml_fp32_to_fp16(1/iscale);
          for (int ib = 0; ib < QK_K/16; ++ib) {
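Without this early-out, an all-zero super-block leaves max_scale at 0 and the very next line divides by it, poisoning the block with infinities; the guard writes a zeroed block and moves on. What it prevents, in miniature:

    #include <math.h>
    #include <stdio.h>

    int main(void) {
        const float max_scale = 0.0f;           // all-zero super-block
        const float iscale = -128.f/max_scale;  // IEEE float division by zero: -inf
        printf("iscale = %f (isinf = %d)\n", iscale, isinf(iscale));
        return 0;
    }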
@@ -1306,7 +1329,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
      const uint8x16_t m3 = vdupq_n_u8(0x3);
      const uint8x16_t m4 = vdupq_n_u8(0xF);
+ #if defined(__ARM_FEATURE_DOTPROD)
      const int32x4_t vzero = vdupq_n_s32(0);
+ #endif
 
      int8x16x2_t q2bytes;
      uint8_t aux[16];
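vzero (and mzero in the q5_K/q6_K kernels below) is only consumed by the dot-product intrinsic inside #if defined(__ARM_FEATURE_DOTPROD) branches, so declaring it unconditionally produced unused-variable warnings on NEON targets without that extension; the same guard recurs in several hunks below. The pattern in isolation (ARM-only sketch; dot_or_zero is a hypothetical helper, and the fallback arithmetic just illustrates what the unguarded branches do by other means):

    #include <arm_neon.h>

    static int32x4_t dot_or_zero(int8x16_t a, int8x16_t b) {
    #if defined(__ARM_FEATURE_DOTPROD)
        const int32x4_t vzero = vdupq_n_s32(0); // only needed on this path
        return vdotq_s32(vzero, a, b);          // fused signed 8-bit dot product
    #else
        // without dot-product support: widen, multiply, pairwise-add
        const int16x8_t p0 = vmull_s8(vget_low_s8(a),  vget_low_s8(b));
        const int16x8_t p1 = vmull_s8(vget_high_s8(a), vget_high_s8(b));
        return vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1));
    #endif
    }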
@@ -1612,7 +1637,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
      const uint8x16_t m3 = vdupq_n_u8(0x3);
+ #if defined(__ARM_FEATURE_DOTPROD)
      const int32x4_t vzero = vdupq_n_s32(0);
+ #endif
 
      int8x16x4_t q2bytes;
 
@@ -2060,7 +2087,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
      __m256 acc = _mm256_setzero_ps();
 
-     uint32_t *aux;
+     const uint32_t *aux;
 
      for (int i = 0; i < nb; ++i) {
 
@@ -2070,7 +2097,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
      const int8_t * restrict q8 = y[i].qs;
 
      // Set up scales
-     aux = (uint32_t *)x[i].scales;
+     aux = (const uint32_t *)x[i].scales;
      __m128i scales128 = _mm_set_epi32(
          ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
          ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
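A const-correctness fix: the block data is reached through a const pointer, so casting x[i].scales to plain uint32_t * silently stripped the qualifier; the code only ever reads through aux, and the const cast makes that explicit.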
@@ -2596,8 +2623,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
      const uint8_t * restrict q4 = x[i].qs;
      const int8_t  * restrict q8 = y[i].qs;
 
-     //int32x4_t isum = mzero;
-
      int32_t sumi1 = 0;
      int32_t sumi2 = 0;
 
@@ -2694,13 +2719,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
          const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
          __m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
          p16l = _mm256_madd_epi16(scale_l, p16l);
-         sumi = _mm256_add_epi32(sumi, p16l);
 
          const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
          __m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
          p16h = _mm256_madd_epi16(scale_h, p16h);
-         sumi = _mm256_add_epi32(sumi, p16h);
+         const __m256i sumj = _mm256_add_epi32(p16l, p16h);
 
+         sumi = _mm256_add_epi32(sumi, sumj);
      }
 
      __m256 vd = _mm256_set1_ps(d);
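The reassociation here sums the two independent products (p16l + p16h) before folding the result into the running sumi, presumably to shorten the loop-carried dependency chain on the accumulator; the result is unchanged because 32-bit integer addition is associative.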
@@ -3096,9 +3121,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
      const uint8x16_t m4b = vdupq_n_u8(0xf);
-     const int32x4_t mzero = vdupq_n_s32(0);
      const uint8x16_t mone = vdupq_n_u8(1);
      const uint8x16_t mtwo = vdupq_n_u8(2);
+ #if defined(__ARM_FEATURE_DOTPROD)
+     const int32x4_t mzero = vdupq_n_s32(0);
+ #endif
 
      int8x16x4_t q5bytes;
 
@@ -3441,8 +3468,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
      const uint8x16_t m4b = vdupq_n_u8(0xf);
-     const int32x4_t mzero = vdupq_n_s32(0);
      const uint8x16_t mh = vdupq_n_u8(16);
+ #if defined(__ARM_FEATURE_DOTPROD)
+     const int32x4_t mzero = vdupq_n_s32(0);
+ #endif
 
      int8x16x4_t q5bytes;
      uint8x16x4_t q5h;
@@ -3660,7 +3689,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
      float sum = 0;
 
      const uint8x16_t m4b = vdupq_n_u8(0xF);
+ #if defined(__ARM_FEATURE_DOTPROD)
      const int32x4_t vzero = vdupq_n_s32(0);
+ #endif
      //const int8x16_t m32s = vdupq_n_s8(32);
 
      const uint8x16_t mone = vdupq_n_u8(3);
@@ -4049,8 +4080,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
      float sum = 0;
 
      const uint8x16_t m4b = vdupq_n_u8(0xF);
-     const int32x4_t vzero = vdupq_n_s32(0);
      const int8x16_t m32s = vdupq_n_s8(32);
+ #if defined(__ARM_FEATURE_DOTPROD)
+     const int32x4_t vzero = vdupq_n_s32(0);
+ #endif
 
      const uint8x16_t mone = vdupq_n_u8(3);