llama_cpp 0.4.0 → 0.5.1

This diff shows the changes between publicly released versions of this package as they appear in the supported public registries. It is provided for informational purposes only.
@@ -130,13 +130,16 @@
130
130
  // The data of the tensor is accessed via the "data" pointer. For example:
131
131
  //
132
132
  // {
133
- // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 2, 3);
133
+ // const int nx = 2;
134
+ // const int ny = 3;
134
135
  //
135
- // // a[2, 1] = 1.0f;
136
- // *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
136
+ // struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
137
137
  //
138
- // // a[0, 2] = 2.0f;
139
- // *(float *) ((char *) a->data + 0*a->nb[1] + 2*a->nb[0]) = 2.0f;
138
+ // for (int y = 0; y < ny; y++) {
139
+ // for (int x = 0; x < nx; x++) {
140
+ // *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
141
+ // }
142
+ // }
140
143
  //
141
144
  // ...
142
145
  // }
@@ -211,12 +214,17 @@
211
214
  #define GGML_MAX_OP_PARAMS 32
212
215
  #define GGML_DEFAULT_N_THREADS 4
213
216
 
217
+ #if UINTPTR_MAX == 0xFFFFFFFF
218
+ #define GGML_MEM_ALIGN 4
219
+ #else
220
+ #define GGML_MEM_ALIGN 16
221
+ #endif
214
222
 
215
223
  #define GGML_EXIT_SUCCESS 0
216
224
  #define GGML_EXIT_ABORTED 1
217
225
 
218
226
  #define GGUF_MAGIC 0x46554747 // "GGUF"
219
- #define GGUF_VERSION 1
227
+ #define GGUF_VERSION 2
220
228
 
221
229
  #define GGUF_DEFAULT_ALIGNMENT 32
222
230
 
@@ -471,6 +479,9 @@ extern "C" {
471
479
  int64_t perf_cycles;
472
480
  int64_t perf_time_us;
473
481
 
482
+ struct ggml_tensor * view_src;
483
+ size_t view_offs;
484
+
474
485
  void * data;
475
486
 
476
487
  char name[GGML_MAX_NAME];
@@ -653,7 +664,7 @@ extern "C" {
653
664
  GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
654
665
 
655
666
  GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
656
- GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
667
+ GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
657
668
 
658
669
  GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
659
670
 
@@ -944,11 +955,11 @@ extern "C" {
944
955
 
945
956
  // a - x
946
957
  // b - dy
947
- // TODO: update with configurable eps
948
958
  GGML_API struct ggml_tensor * ggml_rms_norm_back(
949
959
  struct ggml_context * ctx,
950
960
  struct ggml_tensor * a,
951
- struct ggml_tensor * b);
961
+ struct ggml_tensor * b,
962
+ float eps);
952
963
 
953
964
  // A: n columns, m rows
954
965
  // B: n columns, p rows (i.e. we transpose it internally)
@@ -1604,7 +1615,8 @@ extern "C" {
1604
1615
  struct ggml_tensor * tensor);
1605
1616
 
1606
1617
 
1607
- GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
1618
+ GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
1619
+ GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
1608
1620
 
1609
1621
  GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
1610
1622
  GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
@@ -1669,6 +1681,8 @@ extern "C" {
1669
1681
  GGML_LINESEARCH_INVALID_PARAMETERS,
1670
1682
  };
1671
1683
 
1684
+ typedef void (*ggml_opt_callback)(void * data, float * sched);
1685
+
1672
1686
  // optimization parameters
1673
1687
  //
1674
1688
  // see ggml.c (ggml_opt_default_params) for default values
@@ -1704,12 +1718,14 @@ extern "C" {
1704
1718
 
1705
1719
  float sched; // schedule multiplier (fixed, decay or warmup)
1706
1720
  float decay; // weight decay for AdamW, use 0.0f to disable
1721
+ int decay_min_ndim; // minimum number of tensor dimension to apply weight decay
1707
1722
  float alpha; // learning rate
1708
1723
  float beta1;
1709
1724
  float beta2;
1710
1725
  float eps; // epsilon for numerical stability
1711
1726
  float eps_f; // epsilon for convergence test
1712
1727
  float eps_g; // epsilon for convergence test
1728
+ float gclip; // gradient clipping
1713
1729
  } adam;
1714
1730
 
1715
1731
  // LBFGS parameters
@@ -1737,14 +1753,12 @@ extern "C" {
1737
1753
 
1738
1754
  bool just_initialized;
1739
1755
 
1756
+ float loss_before;
1757
+ float loss_after;
1758
+
1740
1759
  struct {
1741
- struct ggml_tensor * x; // view of the parameters
1742
- struct ggml_tensor * g1; // gradient
1743
- struct ggml_tensor * g2; // gradient squared
1744
1760
  struct ggml_tensor * m; // first moment
1745
1761
  struct ggml_tensor * v; // second moment
1746
- struct ggml_tensor * mh; // first moment hat
1747
- struct ggml_tensor * vh; // second moment hat
1748
1762
  struct ggml_tensor * pf; // past function values
1749
1763
  float fx_best;
1750
1764
  float fx_prev;
@@ -1781,10 +1795,10 @@ extern "C" {
1781
1795
 
1782
1796
  // initialize optimizer context
1783
1797
  GGML_API void ggml_opt_init(
1784
- struct ggml_context * ctx,
1798
+ struct ggml_context * ctx,
1785
1799
  struct ggml_opt_context * opt,
1786
- struct ggml_opt_params params,
1787
- int64_t nx);
1800
+ struct ggml_opt_params params,
1801
+ int64_t nx);
1788
1802
 
1789
1803
  // continue optimizing the function defined by the tensor f
1790
1804
  GGML_API enum ggml_opt_result ggml_opt_resume(
@@ -1798,7 +1812,9 @@ extern "C" {
1798
1812
  struct ggml_opt_context * opt,
1799
1813
  struct ggml_tensor * f,
1800
1814
  struct ggml_cgraph * gf,
1801
- struct ggml_cgraph * gb);
1815
+ struct ggml_cgraph * gb,
1816
+ ggml_opt_callback callback,
1817
+ void * callback_data);
1802
1818
 
1803
1819
  //
1804
1820
  // quantization
@@ -1827,6 +1843,9 @@ extern "C" {
1827
1843
  GGUF_TYPE_BOOL = 7,
1828
1844
  GGUF_TYPE_STRING = 8,
1829
1845
  GGUF_TYPE_ARRAY = 9,
1846
+ GGUF_TYPE_UINT64 = 10,
1847
+ GGUF_TYPE_INT64 = 11,
1848
+ GGUF_TYPE_FLOAT64 = 12,
1830
1849
  GGUF_TYPE_COUNT, // marks the end of the enum
1831
1850
  };
1832
1851
 
@@ -1867,6 +1886,9 @@ extern "C" {
1867
1886
  GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
1868
1887
  GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
1869
1888
  GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
1889
+ GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i);
1890
+ GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i);
1891
+ GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i);
1870
1892
  GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
1871
1893
  GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
1872
1894
  GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
@@ -1886,6 +1908,9 @@ extern "C" {
1886
1908
  GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
1887
1909
  GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
1888
1910
  GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
1911
+ GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
1912
+ GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
1913
+ GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
1889
1914
  GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
1890
1915
  GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
1891
1916
  GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
@@ -1944,6 +1969,7 @@ extern "C" {
1944
1969
  GGML_API int ggml_cpu_has_clblast (void);
1945
1970
  GGML_API int ggml_cpu_has_gpublas (void);
1946
1971
  GGML_API int ggml_cpu_has_sse3 (void);
1972
+ GGML_API int ggml_cpu_has_ssse3 (void);
1947
1973
  GGML_API int ggml_cpu_has_vsx (void);
1948
1974
 
1949
1975
  //
@@ -13,6 +13,26 @@
13
13
  //
14
14
  #include <arm_neon.h>
15
15
 
16
+ #if !defined(__aarch64__)
17
+ inline static int32_t vaddvq_s16(int16x8_t v) {
18
+ return
19
+ (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
20
+ (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
21
+ (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
22
+ (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
23
+ }
24
+
25
+ inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
26
+ int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
27
+ int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
28
+ return vcombine_s16(a0, b0);
29
+ }
30
+
31
+ inline static int32_t vaddvq_s32(int32x4_t v) {
32
+ return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
33
+ }
34
+ #endif
35
+
16
36
  #else
17
37
 
18
38
  #ifdef __wasm_simd128__
@@ -63,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
63
83
  float ax = fabsf(x[i]);
64
84
  if (ax > amax) { amax = ax; max = x[i]; }
65
85
  }
66
- if (!amax) { // all zero
86
+ if (amax < 1e-30f) { // all zero
67
87
  for (int i = 0; i < n; ++i) {
68
88
  L[i] = 0;
69
89
  }
@@ -183,13 +203,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
183
203
  int ntry, float alpha) {
184
204
  float min = x[0];
185
205
  float max = x[0];
186
- float sum_x = 0;
187
- float sum_x2 = 0;
188
206
  for (int i = 1; i < n; ++i) {
189
207
  if (x[i] < min) min = x[i];
190
208
  if (x[i] > max) max = x[i];
191
- sum_x += x[i];
192
- sum_x2 += x[i]*x[i];
193
209
  }
194
210
  if (max == min) {
195
211
  for (int i = 0; i < n; ++i) L[i] = 0;
@@ -1070,6 +1086,13 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
1070
1086
 
1071
1087
  }
1072
1088
 
1089
+ if (!max_abs_scale) {
1090
+ memset(&y[i], 0, sizeof(block_q6_K));
1091
+ y[i].d = ggml_fp32_to_fp16(0.f);
1092
+ x += QK_K;
1093
+ continue;
1094
+ }
1095
+
1073
1096
  float iscale = -128.f/max_scale;
1074
1097
  y[i].d = ggml_fp32_to_fp16(1/iscale);
1075
1098
  for (int ib = 0; ib < QK_K/16; ++ib) {
@@ -1306,7 +1329,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
1306
1329
 
1307
1330
  const uint8x16_t m3 = vdupq_n_u8(0x3);
1308
1331
  const uint8x16_t m4 = vdupq_n_u8(0xF);
1332
+ #if defined(__ARM_FEATURE_DOTPROD)
1309
1333
  const int32x4_t vzero = vdupq_n_s32(0);
1334
+ #endif
1310
1335
 
1311
1336
  int8x16x2_t q2bytes;
1312
1337
  uint8_t aux[16];
@@ -1612,7 +1637,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
1612
1637
  #ifdef __ARM_NEON
1613
1638
 
1614
1639
  const uint8x16_t m3 = vdupq_n_u8(0x3);
1640
+ #if defined(__ARM_FEATURE_DOTPROD)
1615
1641
  const int32x4_t vzero = vdupq_n_s32(0);
1642
+ #endif
1616
1643
 
1617
1644
  int8x16x4_t q2bytes;
1618
1645
 
@@ -2060,7 +2087,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
2060
2087
 
2061
2088
  __m256 acc = _mm256_setzero_ps();
2062
2089
 
2063
- uint32_t *aux;
2090
+ const uint32_t *aux;
2064
2091
 
2065
2092
  for (int i = 0; i < nb; ++i) {
2066
2093
 
@@ -2070,7 +2097,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
2070
2097
  const int8_t * restrict q8 = y[i].qs;
2071
2098
 
2072
2099
  // Set up scales
2073
- aux = (uint32_t *)x[i].scales;
2100
+ aux = (const uint32_t *)x[i].scales;
2074
2101
  __m128i scales128 = _mm_set_epi32(
2075
2102
  ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
2076
2103
  ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
@@ -2596,8 +2623,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
2596
2623
  const uint8_t * restrict q4 = x[i].qs;
2597
2624
  const int8_t * restrict q8 = y[i].qs;
2598
2625
 
2599
- //int32x4_t isum = mzero;
2600
-
2601
2626
  int32_t sumi1 = 0;
2602
2627
  int32_t sumi2 = 0;
2603
2628
 
@@ -2694,13 +2719,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
2694
2719
  const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
2695
2720
  __m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
2696
2721
  p16l = _mm256_madd_epi16(scale_l, p16l);
2697
- sumi = _mm256_add_epi32(sumi, p16l);
2698
2722
 
2699
2723
  const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
2700
2724
  __m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
2701
2725
  p16h = _mm256_madd_epi16(scale_h, p16h);
2702
- sumi = _mm256_add_epi32(sumi, p16h);
2726
+ const __m256i sumj = _mm256_add_epi32(p16l, p16h);
2703
2727
 
2728
+ sumi = _mm256_add_epi32(sumi, sumj);
2704
2729
  }
2705
2730
 
2706
2731
  __m256 vd = _mm256_set1_ps(d);
@@ -3096,9 +3121,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
3096
3121
  #ifdef __ARM_NEON
3097
3122
 
3098
3123
  const uint8x16_t m4b = vdupq_n_u8(0xf);
3099
- const int32x4_t mzero = vdupq_n_s32(0);
3100
3124
  const uint8x16_t mone = vdupq_n_u8(1);
3101
3125
  const uint8x16_t mtwo = vdupq_n_u8(2);
3126
+ #if defined(__ARM_FEATURE_DOTPROD)
3127
+ const int32x4_t mzero = vdupq_n_s32(0);
3128
+ #endif
3102
3129
 
3103
3130
  int8x16x4_t q5bytes;
3104
3131
 
@@ -3441,8 +3468,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
3441
3468
  #ifdef __ARM_NEON
3442
3469
 
3443
3470
  const uint8x16_t m4b = vdupq_n_u8(0xf);
3444
- const int32x4_t mzero = vdupq_n_s32(0);
3445
3471
  const uint8x16_t mh = vdupq_n_u8(16);
3472
+ #if defined(__ARM_FEATURE_DOTPROD)
3473
+ const int32x4_t mzero = vdupq_n_s32(0);
3474
+ #endif
3446
3475
 
3447
3476
  int8x16x4_t q5bytes;
3448
3477
  uint8x16x4_t q5h;
@@ -3660,7 +3689,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
3660
3689
  float sum = 0;
3661
3690
 
3662
3691
  const uint8x16_t m4b = vdupq_n_u8(0xF);
3692
+ #if defined(__ARM_FEATURE_DOTPROD)
3663
3693
  const int32x4_t vzero = vdupq_n_s32(0);
3694
+ #endif
3664
3695
  //const int8x16_t m32s = vdupq_n_s8(32);
3665
3696
 
3666
3697
  const uint8x16_t mone = vdupq_n_u8(3);
@@ -4049,8 +4080,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
4049
4080
  float sum = 0;
4050
4081
 
4051
4082
  const uint8x16_t m4b = vdupq_n_u8(0xF);
4052
- const int32x4_t vzero = vdupq_n_s32(0);
4053
4083
  const int8x16_t m32s = vdupq_n_s8(32);
4084
+ #if defined(__ARM_FEATURE_DOTPROD)
4085
+ const int32x4_t vzero = vdupq_n_s32(0);
4086
+ #endif
4054
4087
 
4055
4088
  const uint8x16_t mone = vdupq_n_u8(3);
4056
4089