llama_cpp 0.4.0 → 0.5.1
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +118 -73
- data/ext/llama_cpp/src/ggml-cuda.cu +106 -34
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +165 -72
- data/ext/llama_cpp/src/ggml-metal.metal +160 -89
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +661 -380
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +47 -14
- data/ext/llama_cpp/src/llama.cpp +571 -166
- data/ext/llama_cpp/src/llama.h +54 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -130,13 +130,16 @@
|
|
130
130
|
// The data of the tensor is accessed via the "data" pointer. For example:
|
131
131
|
//
|
132
132
|
// {
|
133
|
-
//
|
133
|
+
// const int nx = 2;
|
134
|
+
// const int ny = 3;
|
134
135
|
//
|
135
|
-
//
|
136
|
-
// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
|
136
|
+
// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
|
137
137
|
//
|
138
|
-
//
|
139
|
-
//
|
138
|
+
// for (int y = 0; y < ny; y++) {
|
139
|
+
// for (int x = 0; x < nx; x++) {
|
140
|
+
// *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
|
141
|
+
// }
|
142
|
+
// }
|
140
143
|
//
|
141
144
|
// ...
|
142
145
|
// }
|
@@ -211,12 +214,17 @@
|
|
211
214
|
#define GGML_MAX_OP_PARAMS 32
|
212
215
|
#define GGML_DEFAULT_N_THREADS 4
|
213
216
|
|
217
|
+
#if UINTPTR_MAX == 0xFFFFFFFF
|
218
|
+
#define GGML_MEM_ALIGN 4
|
219
|
+
#else
|
220
|
+
#define GGML_MEM_ALIGN 16
|
221
|
+
#endif
|
214
222
|
|
215
223
|
#define GGML_EXIT_SUCCESS 0
|
216
224
|
#define GGML_EXIT_ABORTED 1
|
217
225
|
|
218
226
|
#define GGUF_MAGIC 0x46554747 // "GGUF"
|
219
|
-
#define GGUF_VERSION
|
227
|
+
#define GGUF_VERSION 2
|
220
228
|
|
221
229
|
#define GGUF_DEFAULT_ALIGNMENT 32
|
222
230
|
|
@@ -471,6 +479,9 @@ extern "C" {
|
|
471
479
|
int64_t perf_cycles;
|
472
480
|
int64_t perf_time_us;
|
473
481
|
|
482
|
+
struct ggml_tensor * view_src;
|
483
|
+
size_t view_offs;
|
484
|
+
|
474
485
|
void * data;
|
475
486
|
|
476
487
|
char name[GGML_MAX_NAME];
|
@@ -653,7 +664,7 @@ extern "C" {
|
|
653
664
|
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
654
665
|
|
655
666
|
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
656
|
-
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx,
|
667
|
+
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
|
657
668
|
|
658
669
|
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
659
670
|
|
@@ -944,11 +955,11 @@ extern "C" {
|
|
944
955
|
|
945
956
|
// a - x
|
946
957
|
// b - dy
|
947
|
-
// TODO: update with configurable eps
|
948
958
|
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
949
959
|
struct ggml_context * ctx,
|
950
960
|
struct ggml_tensor * a,
|
951
|
-
struct ggml_tensor * b
|
961
|
+
struct ggml_tensor * b,
|
962
|
+
float eps);
|
952
963
|
|
953
964
|
// A: n columns, m rows
|
954
965
|
// B: n columns, p rows (i.e. we transpose it internally)
|
@@ -1604,7 +1615,8 @@ extern "C" {
|
|
1604
1615
|
struct ggml_tensor * tensor);
|
1605
1616
|
|
1606
1617
|
|
1607
|
-
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
1618
|
+
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
1619
|
+
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
|
1608
1620
|
|
1609
1621
|
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
1610
1622
|
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
@@ -1669,6 +1681,8 @@ extern "C" {
|
|
1669
1681
|
GGML_LINESEARCH_INVALID_PARAMETERS,
|
1670
1682
|
};
|
1671
1683
|
|
1684
|
+
typedef void (*ggml_opt_callback)(void * data, float * sched);
|
1685
|
+
|
1672
1686
|
// optimization parameters
|
1673
1687
|
//
|
1674
1688
|
// see ggml.c (ggml_opt_default_params) for default values
|
@@ -1704,12 +1718,14 @@ extern "C" {
|
|
1704
1718
|
|
1705
1719
|
float sched; // schedule multiplier (fixed, decay or warmup)
|
1706
1720
|
float decay; // weight decay for AdamW, use 0.0f to disable
|
1721
|
+
int decay_min_ndim; // minimum number of tensor dimension to apply weight decay
|
1707
1722
|
float alpha; // learning rate
|
1708
1723
|
float beta1;
|
1709
1724
|
float beta2;
|
1710
1725
|
float eps; // epsilon for numerical stability
|
1711
1726
|
float eps_f; // epsilon for convergence test
|
1712
1727
|
float eps_g; // epsilon for convergence test
|
1728
|
+
float gclip; // gradient clipping
|
1713
1729
|
} adam;
|
1714
1730
|
|
1715
1731
|
// LBFGS parameters
|
@@ -1737,14 +1753,12 @@ extern "C" {
|
|
1737
1753
|
|
1738
1754
|
bool just_initialized;
|
1739
1755
|
|
1756
|
+
float loss_before;
|
1757
|
+
float loss_after;
|
1758
|
+
|
1740
1759
|
struct {
|
1741
|
-
struct ggml_tensor * x; // view of the parameters
|
1742
|
-
struct ggml_tensor * g1; // gradient
|
1743
|
-
struct ggml_tensor * g2; // gradient squared
|
1744
1760
|
struct ggml_tensor * m; // first moment
|
1745
1761
|
struct ggml_tensor * v; // second moment
|
1746
|
-
struct ggml_tensor * mh; // first moment hat
|
1747
|
-
struct ggml_tensor * vh; // second moment hat
|
1748
1762
|
struct ggml_tensor * pf; // past function values
|
1749
1763
|
float fx_best;
|
1750
1764
|
float fx_prev;
|
@@ -1781,10 +1795,10 @@ extern "C" {
|
|
1781
1795
|
|
1782
1796
|
// initialize optimizer context
|
1783
1797
|
GGML_API void ggml_opt_init(
|
1784
|
-
struct ggml_context
|
1798
|
+
struct ggml_context * ctx,
|
1785
1799
|
struct ggml_opt_context * opt,
|
1786
|
-
struct ggml_opt_params
|
1787
|
-
int64_t
|
1800
|
+
struct ggml_opt_params params,
|
1801
|
+
int64_t nx);
|
1788
1802
|
|
1789
1803
|
// continue optimizing the function defined by the tensor f
|
1790
1804
|
GGML_API enum ggml_opt_result ggml_opt_resume(
|
@@ -1798,7 +1812,9 @@ extern "C" {
|
|
1798
1812
|
struct ggml_opt_context * opt,
|
1799
1813
|
struct ggml_tensor * f,
|
1800
1814
|
struct ggml_cgraph * gf,
|
1801
|
-
struct ggml_cgraph * gb
|
1815
|
+
struct ggml_cgraph * gb,
|
1816
|
+
ggml_opt_callback callback,
|
1817
|
+
void * callback_data);
|
1802
1818
|
|
1803
1819
|
//
|
1804
1820
|
// quantization
|
@@ -1827,6 +1843,9 @@ extern "C" {
|
|
1827
1843
|
GGUF_TYPE_BOOL = 7,
|
1828
1844
|
GGUF_TYPE_STRING = 8,
|
1829
1845
|
GGUF_TYPE_ARRAY = 9,
|
1846
|
+
GGUF_TYPE_UINT64 = 10,
|
1847
|
+
GGUF_TYPE_INT64 = 11,
|
1848
|
+
GGUF_TYPE_FLOAT64 = 12,
|
1830
1849
|
GGUF_TYPE_COUNT, // marks the end of the enum
|
1831
1850
|
};
|
1832
1851
|
|
@@ -1867,6 +1886,9 @@ extern "C" {
|
|
1867
1886
|
GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
|
1868
1887
|
GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
|
1869
1888
|
GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
|
1889
|
+
GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i);
|
1890
|
+
GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i);
|
1891
|
+
GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i);
|
1870
1892
|
GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
|
1871
1893
|
GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
|
1872
1894
|
GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
|
@@ -1886,6 +1908,9 @@ extern "C" {
|
|
1886
1908
|
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
|
1887
1909
|
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
|
1888
1910
|
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
|
1911
|
+
GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
|
1912
|
+
GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
|
1913
|
+
GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
|
1889
1914
|
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
|
1890
1915
|
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
|
1891
1916
|
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
|
@@ -1944,6 +1969,7 @@ extern "C" {
|
|
1944
1969
|
GGML_API int ggml_cpu_has_clblast (void);
|
1945
1970
|
GGML_API int ggml_cpu_has_gpublas (void);
|
1946
1971
|
GGML_API int ggml_cpu_has_sse3 (void);
|
1972
|
+
GGML_API int ggml_cpu_has_ssse3 (void);
|
1947
1973
|
GGML_API int ggml_cpu_has_vsx (void);
|
1948
1974
|
|
1949
1975
|
//
|
@@ -13,6 +13,26 @@
|
|
13
13
|
//
|
14
14
|
#include <arm_neon.h>
|
15
15
|
|
16
|
+
#if !defined(__aarch64__)
|
17
|
+
inline static int32_t vaddvq_s16(int16x8_t v) {
|
18
|
+
return
|
19
|
+
(int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
|
20
|
+
(int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
|
21
|
+
(int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
|
22
|
+
(int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
|
23
|
+
}
|
24
|
+
|
25
|
+
inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
|
26
|
+
int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
|
27
|
+
int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
|
28
|
+
return vcombine_s16(a0, b0);
|
29
|
+
}
|
30
|
+
|
31
|
+
inline static int32_t vaddvq_s32(int32x4_t v) {
|
32
|
+
return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
|
33
|
+
}
|
34
|
+
#endif
|
35
|
+
|
16
36
|
#else
|
17
37
|
|
18
38
|
#ifdef __wasm_simd128__
|
@@ -63,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
|
|
63
83
|
float ax = fabsf(x[i]);
|
64
84
|
if (ax > amax) { amax = ax; max = x[i]; }
|
65
85
|
}
|
66
|
-
if (
|
86
|
+
if (amax < 1e-30f) { // all zero
|
67
87
|
for (int i = 0; i < n; ++i) {
|
68
88
|
L[i] = 0;
|
69
89
|
}
|
@@ -183,13 +203,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
|
|
183
203
|
int ntry, float alpha) {
|
184
204
|
float min = x[0];
|
185
205
|
float max = x[0];
|
186
|
-
float sum_x = 0;
|
187
|
-
float sum_x2 = 0;
|
188
206
|
for (int i = 1; i < n; ++i) {
|
189
207
|
if (x[i] < min) min = x[i];
|
190
208
|
if (x[i] > max) max = x[i];
|
191
|
-
sum_x += x[i];
|
192
|
-
sum_x2 += x[i]*x[i];
|
193
209
|
}
|
194
210
|
if (max == min) {
|
195
211
|
for (int i = 0; i < n; ++i) L[i] = 0;
|
@@ -1070,6 +1086,13 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
|
|
1070
1086
|
|
1071
1087
|
}
|
1072
1088
|
|
1089
|
+
if (!max_abs_scale) {
|
1090
|
+
memset(&y[i], 0, sizeof(block_q6_K));
|
1091
|
+
y[i].d = ggml_fp32_to_fp16(0.f);
|
1092
|
+
x += QK_K;
|
1093
|
+
continue;
|
1094
|
+
}
|
1095
|
+
|
1073
1096
|
float iscale = -128.f/max_scale;
|
1074
1097
|
y[i].d = ggml_fp32_to_fp16(1/iscale);
|
1075
1098
|
for (int ib = 0; ib < QK_K/16; ++ib) {
|
@@ -1306,7 +1329,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
|
1306
1329
|
|
1307
1330
|
const uint8x16_t m3 = vdupq_n_u8(0x3);
|
1308
1331
|
const uint8x16_t m4 = vdupq_n_u8(0xF);
|
1332
|
+
#if defined(__ARM_FEATURE_DOTPROD)
|
1309
1333
|
const int32x4_t vzero = vdupq_n_s32(0);
|
1334
|
+
#endif
|
1310
1335
|
|
1311
1336
|
int8x16x2_t q2bytes;
|
1312
1337
|
uint8_t aux[16];
|
@@ -1612,7 +1637,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
|
|
1612
1637
|
#ifdef __ARM_NEON
|
1613
1638
|
|
1614
1639
|
const uint8x16_t m3 = vdupq_n_u8(0x3);
|
1640
|
+
#if defined(__ARM_FEATURE_DOTPROD)
|
1615
1641
|
const int32x4_t vzero = vdupq_n_s32(0);
|
1642
|
+
#endif
|
1616
1643
|
|
1617
1644
|
int8x16x4_t q2bytes;
|
1618
1645
|
|
@@ -2060,7 +2087,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
|
2060
2087
|
|
2061
2088
|
__m256 acc = _mm256_setzero_ps();
|
2062
2089
|
|
2063
|
-
uint32_t *aux;
|
2090
|
+
const uint32_t *aux;
|
2064
2091
|
|
2065
2092
|
for (int i = 0; i < nb; ++i) {
|
2066
2093
|
|
@@ -2070,7 +2097,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
|
|
2070
2097
|
const int8_t * restrict q8 = y[i].qs;
|
2071
2098
|
|
2072
2099
|
// Set up scales
|
2073
|
-
aux = (uint32_t *)x[i].scales;
|
2100
|
+
aux = (const uint32_t *)x[i].scales;
|
2074
2101
|
__m128i scales128 = _mm_set_epi32(
|
2075
2102
|
((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
|
2076
2103
|
((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
|
@@ -2596,8 +2623,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|
2596
2623
|
const uint8_t * restrict q4 = x[i].qs;
|
2597
2624
|
const int8_t * restrict q8 = y[i].qs;
|
2598
2625
|
|
2599
|
-
//int32x4_t isum = mzero;
|
2600
|
-
|
2601
2626
|
int32_t sumi1 = 0;
|
2602
2627
|
int32_t sumi2 = 0;
|
2603
2628
|
|
@@ -2694,13 +2719,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|
2694
2719
|
const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
2695
2720
|
__m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
|
2696
2721
|
p16l = _mm256_madd_epi16(scale_l, p16l);
|
2697
|
-
sumi = _mm256_add_epi32(sumi, p16l);
|
2698
2722
|
|
2699
2723
|
const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
2700
2724
|
__m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
|
2701
2725
|
p16h = _mm256_madd_epi16(scale_h, p16h);
|
2702
|
-
|
2726
|
+
const __m256i sumj = _mm256_add_epi32(p16l, p16h);
|
2703
2727
|
|
2728
|
+
sumi = _mm256_add_epi32(sumi, sumj);
|
2704
2729
|
}
|
2705
2730
|
|
2706
2731
|
__m256 vd = _mm256_set1_ps(d);
|
@@ -3096,9 +3121,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
|
3096
3121
|
#ifdef __ARM_NEON
|
3097
3122
|
|
3098
3123
|
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
3099
|
-
const int32x4_t mzero = vdupq_n_s32(0);
|
3100
3124
|
const uint8x16_t mone = vdupq_n_u8(1);
|
3101
3125
|
const uint8x16_t mtwo = vdupq_n_u8(2);
|
3126
|
+
#if defined(__ARM_FEATURE_DOTPROD)
|
3127
|
+
const int32x4_t mzero = vdupq_n_s32(0);
|
3128
|
+
#endif
|
3102
3129
|
|
3103
3130
|
int8x16x4_t q5bytes;
|
3104
3131
|
|
@@ -3441,8 +3468,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
|
|
3441
3468
|
#ifdef __ARM_NEON
|
3442
3469
|
|
3443
3470
|
const uint8x16_t m4b = vdupq_n_u8(0xf);
|
3444
|
-
const int32x4_t mzero = vdupq_n_s32(0);
|
3445
3471
|
const uint8x16_t mh = vdupq_n_u8(16);
|
3472
|
+
#if defined(__ARM_FEATURE_DOTPROD)
|
3473
|
+
const int32x4_t mzero = vdupq_n_s32(0);
|
3474
|
+
#endif
|
3446
3475
|
|
3447
3476
|
int8x16x4_t q5bytes;
|
3448
3477
|
uint8x16x4_t q5h;
|
@@ -3660,7 +3689,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|
3660
3689
|
float sum = 0;
|
3661
3690
|
|
3662
3691
|
const uint8x16_t m4b = vdupq_n_u8(0xF);
|
3692
|
+
#if defined(__ARM_FEATURE_DOTPROD)
|
3663
3693
|
const int32x4_t vzero = vdupq_n_s32(0);
|
3694
|
+
#endif
|
3664
3695
|
//const int8x16_t m32s = vdupq_n_s8(32);
|
3665
3696
|
|
3666
3697
|
const uint8x16_t mone = vdupq_n_u8(3);
|
@@ -4049,8 +4080,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
|
|
4049
4080
|
float sum = 0;
|
4050
4081
|
|
4051
4082
|
const uint8x16_t m4b = vdupq_n_u8(0xF);
|
4052
|
-
const int32x4_t vzero = vdupq_n_s32(0);
|
4053
4083
|
const int8x16_t m32s = vdupq_n_s8(32);
|
4084
|
+
#if defined(__ARM_FEATURE_DOTPROD)
|
4085
|
+
const int32x4_t vzero = vdupq_n_s32(0);
|
4086
|
+
#endif
|
4054
4087
|
|
4055
4088
|
const uint8x16_t mone = vdupq_n_u8(3);
|
4056
4089
|
|