llama_cpp 0.4.0 → 0.5.1
This diff shows the changes between publicly available package versions as they appear in their respective public registries. It is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +13 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +118 -73
- data/ext/llama_cpp/src/ggml-cuda.cu +106 -34
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +165 -72
- data/ext/llama_cpp/src/ggml-metal.metal +160 -89
- data/ext/llama_cpp/src/ggml-opencl.cpp +7 -7
- data/ext/llama_cpp/src/ggml.c +661 -380
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +47 -14
- data/ext/llama_cpp/src/llama.cpp +571 -166
- data/ext/llama_cpp/src/llama.h +54 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -130,13 +130,16 @@
 // The data of the tensor is accessed via the "data" pointer. For example:
 //
 // {
-//
+//     const int nx = 2;
+//     const int ny = 3;
 //
-//
-//     *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
+//     struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
 //
-//
-//
+//     for (int y = 0; y < ny; y++) {
+//         for (int x = 0; x < nx; x++) {
+//             *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
+//         }
+//     }
 //
 // ...
 // }
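The revised header comment walks a full 2x3 tensor instead of poking a single element. As a sanity check of the stride arithmetic it describes, here is a self-contained sketch using a plain C buffer in place of a ggml tensor (nb0/nb1 mirror ggml's byte strides nb[0]/nb[1]; the names are illustrative):

#include <stdio.h>
#include <stdlib.h>

int main(void) {
    const int nx = 2; // columns (fastest-varying dimension)
    const int ny = 3; // rows
    // byte strides, as in ggml: nb[0] = element size, nb[1] = row size in bytes
    const size_t nb0 = sizeof(float);
    const size_t nb1 = nx * sizeof(float);
    char * data = malloc(ny * nb1);

    for (int y = 0; y < ny; y++) {
        for (int x = 0; x < nx; x++) {
            *(float *) (data + y*nb1 + x*nb0) = (float)(x + y);
        }
    }
    printf("a[1][1] = %f\n", *(float *) (data + 1*nb1 + 1*nb0)); // prints 2.0
    free(data);
    return 0;
}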
@@ -211,12 +214,17 @@
 #define GGML_MAX_OP_PARAMS 32
 #define GGML_DEFAULT_N_THREADS 4
 
+#if UINTPTR_MAX == 0xFFFFFFFF
+    #define GGML_MEM_ALIGN 4
+#else
+    #define GGML_MEM_ALIGN 16
+#endif
 
 #define GGML_EXIT_SUCCESS 0
 #define GGML_EXIT_ABORTED 1
 
 #define GGUF_MAGIC   0x46554747 // "GGUF"
-#define GGUF_VERSION 1
+#define GGUF_VERSION 2
 
 #define GGUF_DEFAULT_ALIGNMENT 32
 
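The new GGML_MEM_ALIGN block picks the allocation alignment from the pointer width at preprocessing time: 4 bytes when UINTPTR_MAX == 0xFFFFFFFF (a 32-bit target), 16 bytes otherwise. A standalone sketch of the same test (MEM_ALIGN is a stand-in name):

#include <stdint.h>
#include <stdio.h>

// same selection logic as the new ggml.h block
#if UINTPTR_MAX == 0xFFFFFFFF
    #define MEM_ALIGN 4
#else
    #define MEM_ALIGN 16
#endif

int main(void) {
    printf("pointers are %zu bytes, alignment %d\n", sizeof(void *), MEM_ALIGN);
    return 0;
}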
@@ -471,6 +479,9 @@ extern "C" {
         int64_t perf_cycles;
         int64_t perf_time_us;
 
+        struct ggml_tensor * view_src;
+        size_t               view_offs;
+
         void * data;
 
         char name[GGML_MAX_NAME];
@@ -653,7 +664,7 @@ extern "C" {
    GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
 
    GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
-    GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
+    GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
 
    GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
 
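ggml_view_tensor losing its const qualifier lines up with the new view_src/view_offs fields above: a view now records a mutable back-pointer to its source tensor. A minimal sketch of the aliasing behavior, assuming the usual ggml_init/ggml_new_tensor_1d/ggml_free API and an arbitrary scratch size:

#include <assert.h>
#include "ggml.h"

int main(void) {
    struct ggml_init_params params = {
        .mem_size   = 16 * 1024 * 1024, // arbitrary scratch size
        .mem_buffer = NULL,
        .no_alloc   = false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * v = ggml_view_tensor(ctx, a);

    // a view aliases the source's storage rather than copying it;
    // per this diff, v->view_src now records the source tensor as well
    assert(v->data == a->data);

    ggml_free(ctx);
    return 0;
}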
@@ -944,11 +955,11 @@ extern "C" {
 
     // a - x
     // b - dy
-    // TODO: update with configurable eps
     GGML_API struct ggml_tensor * ggml_rms_norm_back(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
-            struct ggml_tensor  * b);
+            struct ggml_tensor  * b,
+            float                 eps);
 
     // A: n columns, m rows
     // B: n columns, p rows (i.e. we transpose it internally)
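ggml_rms_norm_back now accepts the eps that the removed TODO promised. For orientation, RMS normalization scales by 1/sqrt(mean(x^2) + eps); a scalar sketch of the forward pass showing where eps enters (illustrative only, not the ggml kernel):

#include <math.h>
#include <stdio.h>

// scalar RMS norm: y[i] = x[i] / sqrt(mean(x^2) + eps)
static void rms_norm(const float * x, float * y, int n, float eps) {
    float sum = 0.0f;
    for (int i = 0; i < n; i++) {
        sum += x[i] * x[i];
    }
    const float scale = 1.0f / sqrtf(sum / n + eps); // eps guards an all-zero row
    for (int i = 0; i < n; i++) {
        y[i] = x[i] * scale;
    }
}

int main(void) {
    const float x[4] = {0.0f, 0.0f, 0.0f, 0.0f};
    float y[4];
    rms_norm(x, y, 4, 1e-6f); // finite output even for all-zero input
    printf("y[0] = %f\n", y[0]);
    return 0;
}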
@@ -1604,7 +1615,8 @@ extern "C" {
             struct ggml_tensor * tensor);
 
 
-    GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+    GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
+    GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
 
     GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
     GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
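ggml_build_backward_expand builds the backward graph into a caller-provided cgraph instead of returning the large fixed-size struct by value. A hedged usage sketch (the gb-starts-as-a-copy-of-gf convention follows the ggml training examples; buffer size is arbitrary):

#include "ggml.h"

int main(void) {
    struct ggml_init_params params = { 128 * 1024 * 1024, NULL, false };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
    ggml_set_param(ctx, x);                       // mark x as trainable
    struct ggml_tensor * f = ggml_mul(ctx, x, x); // f = x*x

    struct ggml_cgraph gf = ggml_build_forward(f);
    struct ggml_cgraph gb = gf;                   // backward graph extends the forward one
    ggml_build_backward_expand(ctx, &gf, &gb, /*keep =*/ false);

    ggml_free(ctx);
    return 0;
}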
@@ -1669,6 +1681,8 @@ extern "C" {
         GGML_LINESEARCH_INVALID_PARAMETERS,
     };
 
+    typedef void (*ggml_opt_callback)(void * data, float * sched);
+
     // optimization parameters
     //
     //   see ggml.c (ggml_opt_default_params) for default values
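The new ggml_opt_callback gives the caller a hook into the optimization loop, with write access to the schedule multiplier. A sketch of a conforming callback (the payload struct and warmup policy are hypothetical; when and how often ggml invokes it is defined in ggml.c):

#include <stdio.h>

// matches: typedef void (*ggml_opt_callback)(void * data, float * sched);
struct train_state { int step; }; // hypothetical user payload

static void my_opt_callback(void * data, float * sched) {
    struct train_state * st = data;
    st->step++;
    // example policy: linear warmup over the first 100 steps
    if (st->step < 100) {
        *sched = (float) st->step / 100.0f;
    }
    printf("step %d, sched %.3f\n", st->step, *sched);
}

int main(void) {
    struct train_state st = { 0 };
    float sched = 1.0f;
    my_opt_callback(&st, &sched); // in practice passed to ggml_opt_resume_g below
    return 0;
}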
@@ -1704,12 +1718,14 @@ extern "C" {
 
             float sched; // schedule multiplier (fixed, decay or warmup)
             float decay; // weight decay for AdamW, use 0.0f to disable
+            int   decay_min_ndim; // minimum number of tensor dimension to apply weight decay
             float alpha; // learning rate
             float beta1;
             float beta2;
             float eps;   // epsilon for numerical stability
             float eps_f; // epsilon for convergence test
             float eps_g; // epsilon for convergence test
+            float gclip; // gradient clipping
         } adam;
 
         // LBFGS parameters
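Two new Adam knobs: decay_min_ndim restricts weight decay to tensors of at least that many dimensions (e.g. skipping 1-D bias and norm vectors), and gclip clips gradients. A scalar sketch of clipping by global norm, the usual form of such a parameter (hedged: the exact rule ggml applies lives in ggml.c):

#include <math.h>

// scale the whole gradient down when its L2 norm exceeds gclip
static void clip_gradient(float * g, int n, float gclip) {
    float norm2 = 0.0f;
    for (int i = 0; i < n; i++) norm2 += g[i] * g[i];
    const float norm = sqrtf(norm2);
    if (gclip > 0.0f && norm > gclip) {
        const float scale = gclip / norm;
        for (int i = 0; i < n; i++) g[i] *= scale;
    }
}

int main(void) {
    float g[3] = {3.0f, 4.0f, 0.0f}; // norm 5
    clip_gradient(g, 3, 1.0f);       // rescaled to unit norm
    return 0;
}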
@@ -1737,14 +1753,12 @@ extern "C" {
 
         bool just_initialized;
 
+        float loss_before;
+        float loss_after;
+
         struct {
-            struct ggml_tensor * x;  // view of the parameters
-            struct ggml_tensor * g1; // gradient
-            struct ggml_tensor * g2; // gradient squared
             struct ggml_tensor * m;  // first moment
             struct ggml_tensor * v;  // second moment
-            struct ggml_tensor * mh; // first moment hat
-            struct ggml_tensor * vh; // second moment hat
             struct ggml_tensor * pf; // past function values
             float fx_best;
             float fx_prev;
@@ -1781,10 +1795,10 @@ extern "C" {
 
     // initialize optimizer context
     GGML_API void ggml_opt_init(
-            struct ggml_context * ctx,
+            struct ggml_context     * ctx,
             struct ggml_opt_context * opt,
-            struct ggml_opt_params params,
-            int64_t nx);
+            struct ggml_opt_params    params,
+            int64_t                   nx);
 
     // continue optimizing the function defined by the tensor f
     GGML_API enum ggml_opt_result ggml_opt_resume(
@@ -1798,7 +1812,9 @@ extern "C" {
             struct ggml_opt_context * opt,
             struct ggml_tensor * f,
             struct ggml_cgraph * gf,
-            struct ggml_cgraph * gb);
+            struct ggml_cgraph * gb,
+            ggml_opt_callback callback,
+            void * callback_data);
 
     //
     // quantization
@@ -1827,6 +1843,9 @@ extern "C" {
         GGUF_TYPE_BOOL    = 7,
         GGUF_TYPE_STRING  = 8,
         GGUF_TYPE_ARRAY   = 9,
+        GGUF_TYPE_UINT64  = 10,
+        GGUF_TYPE_INT64   = 11,
+        GGUF_TYPE_FLOAT64 = 12,
         GGUF_TYPE_COUNT,       // marks the end of the enum
     };
 
@@ -1867,6 +1886,9 @@ extern "C" {
     GGML_API uint32_t     gguf_get_val_u32 (struct gguf_context * ctx, int i);
     GGML_API int32_t      gguf_get_val_i32 (struct gguf_context * ctx, int i);
     GGML_API float        gguf_get_val_f32 (struct gguf_context * ctx, int i);
+    GGML_API uint64_t     gguf_get_val_u64 (struct gguf_context * ctx, int i);
+    GGML_API int64_t      gguf_get_val_i64 (struct gguf_context * ctx, int i);
+    GGML_API double       gguf_get_val_f64 (struct gguf_context * ctx, int i);
     GGML_API bool         gguf_get_val_bool(struct gguf_context * ctx, int i);
     GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
     GGML_API int          gguf_get_arr_n   (struct gguf_context * ctx, int i);
@@ -1886,6 +1908,9 @@ extern "C" {
     GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
     GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
     GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
+    GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
+    GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
+    GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
     GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
     GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
     GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
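With the GGUF v2 bump above, keys can carry 64-bit and double-precision values end to end. A minimal round-trip sketch using the new setters and getters (key names are made up; assumes the gguf_init_empty/gguf_find_key/gguf_free helpers from this same header):

#include <assert.h>
#include <stdint.h>
#include "ggml.h"

int main(void) {
    struct gguf_context * ctx = gguf_init_empty();

    // new in GGUF v2: 64-bit integer and double-precision values
    gguf_set_val_u64(ctx, "general.file_size", 1234567890123ULL);
    gguf_set_val_f64(ctx, "general.some_ratio", 0.125);

    const int i = gguf_find_key(ctx, "general.file_size");
    assert(i >= 0);
    assert(gguf_get_val_u64(ctx, i) == 1234567890123ULL);

    gguf_free(ctx);
    return 0;
}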
@@ -1944,6 +1969,7 @@ extern "C" {
     GGML_API int ggml_cpu_has_clblast    (void);
     GGML_API int ggml_cpu_has_gpublas    (void);
     GGML_API int ggml_cpu_has_sse3       (void);
+    GGML_API int ggml_cpu_has_ssse3      (void);
     GGML_API int ggml_cpu_has_vsx        (void);
 
     //
data/ext/llama_cpp/src/k_quants.c
CHANGED
@@ -13,6 +13,26 @@
 //
 #include <arm_neon.h>
 
+#if !defined(__aarch64__)
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+    return vcombine_s16(a0, b0);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+#endif
+
 #else
 
 #ifdef __wasm_simd128__
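The #if !defined(__aarch64__) block backfills a few AArch64-only horizontal intrinsics for 32-bit ARM NEON builds. Their semantics are plain reductions and pairwise adds; a portable scalar model (hypothetical names, for illustration only):

#include <stdint.h>
#include <stdio.h>

// scalar model of vaddvq_s16: horizontal sum of 8 signed 16-bit lanes
static int32_t vaddvq_s16_model(const int16_t v[8]) {
    int32_t sum = 0;
    for (int i = 0; i < 8; i++) sum += v[i];
    return sum;
}

// scalar model of vpaddq_s16: pairwise sums of a, then of b, concatenated
static void vpaddq_s16_model(const int16_t a[8], const int16_t b[8], int16_t out[8]) {
    for (int i = 0; i < 4; i++) out[i]     = (int16_t)(a[2*i] + a[2*i + 1]);
    for (int i = 0; i < 4; i++) out[4 + i] = (int16_t)(b[2*i] + b[2*i + 1]);
}

int main(void) {
    const int16_t v[8] = {1, 2, 3, 4, 5, 6, 7, 8};
    printf("vaddvq_s16 = %d\n", vaddvq_s16_model(v)); // 36
    return 0;
}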
@@ -63,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
         float ax = fabsf(x[i]);
         if (ax > amax) { amax = ax; max = x[i]; }
     }
-    if (!amax) { // all zero
+    if (amax < 1e-30f) { // all zero
         for (int i = 0; i < n; ++i) {
             L[i] = 0;
         }
@@ -183,13 +203,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
         int ntry, float alpha) {
     float min = x[0];
     float max = x[0];
-    float sum_x = 0;
-    float sum_x2 = 0;
     for (int i = 1; i < n; ++i) {
         if (x[i] < min) min = x[i];
         if (x[i] > max) max = x[i];
-        sum_x += x[i];
-        sum_x2 += x[i]*x[i];
     }
     if (max == min) {
         for (int i = 0; i < n; ++i) L[i] = 0;
@@ -1070,6 +1086,13 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
 
         }
 
+        if (!max_abs_scale) {
+            memset(&y[i], 0, sizeof(block_q6_K));
+            y[i].d = ggml_fp32_to_fp16(0.f);
+            x += QK_K;
+            continue;
+        }
+
         float iscale = -128.f/max_scale;
         y[i].d = ggml_fp32_to_fp16(1/iscale);
         for (int ib = 0; ib < QK_K/16; ++ib) {
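Like the amax < 1e-30f guard added to make_qx_quants above, this early-out catches an all-zero super-block before iscale = -128.f/max_scale divides by zero. A two-line demonstration of the failure mode being avoided:

#include <stdio.h>

int main(void) {
    float max_scale = 0.0f;            // an all-zero block
    float iscale = -128.f / max_scale; // IEEE 754: -inf, which would poison d
    printf("iscale = %f\n", iscale);
    return 0;
}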
@@ -1306,7 +1329,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
     const uint8x16_t m4 = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x2_t q2bytes;
     uint8_t aux[16];
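This is the recurring fix in the remaining hunks: vzero (or mzero) only seeds vdotq_s32, so its declaration moves under #if defined(__ARM_FEATURE_DOTPROD) and stops being an unused variable on NEON builds without the dot-product extension. A sketch of the pattern (ARM-only code; the fallback path assumes AArch64 or the vaddvq polyfills added above):

#include <arm_neon.h>
#include <stdint.h>
#include <stdio.h>

static int32_t dot_q8(const int8_t * a, const int8_t * b) {
#if defined(__ARM_FEATURE_DOTPROD)
    const int32x4_t vzero = vdupq_n_s32(0); // accumulator seed, used only on this path
    return vaddvq_s32(vdotq_s32(vzero, vld1q_s8(a), vld1q_s8(b)));
#else
    // widening multiply-accumulate path; no vzero needed
    const int8x16_t va = vld1q_s8(a);
    const int8x16_t vb = vld1q_s8(b);
    const int16x8_t p0 = vmull_s8(vget_low_s8(va),  vget_low_s8(vb));
    const int16x8_t p1 = vmull_s8(vget_high_s8(va), vget_high_s8(vb));
    return vaddvq_s32(vaddq_s32(vpaddlq_s16(p0), vpaddlq_s16(p1)));
#endif
}

int main(void) {
    int8_t a[16], b[16];
    for (int i = 0; i < 16; i++) { a[i] = (int8_t) i; b[i] = 1; }
    printf("dot = %d\n", (int) dot_q8(a, b)); // 0+1+...+15 = 120
    return 0;
}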
@@ -1612,7 +1637,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q2bytes;
 
@@ -2060,7 +2087,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
     __m256 acc = _mm256_setzero_ps();
 
-    uint32_t *aux;
+    const uint32_t *aux;
 
     for (int i = 0; i < nb; ++i) {
 
@@ -2070,7 +2097,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const int8_t * restrict q8 = y[i].qs;
 
         // Set up scales
-        aux = (uint32_t *)x[i].scales;
+        aux = (const uint32_t *)x[i].scales;
         __m128i scales128 = _mm_set_epi32(
                 ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
                 ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
@@ -2596,8 +2623,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         const uint8_t * restrict q4 = x[i].qs;
         const int8_t  * restrict q8 = y[i].qs;
 
-        //int32x4_t isum = mzero;
-
         int32_t sumi1 = 0;
         int32_t sumi2 = 0;
 
@@ -2694,13 +2719,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
             const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
             __m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
             p16l = _mm256_madd_epi16(scale_l, p16l);
-            sumi = _mm256_add_epi32(sumi, p16l);
 
             const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
             __m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
             p16h = _mm256_madd_epi16(scale_h, p16h);
-            sumi = _mm256_add_epi32(sumi, p16h);
+            const __m256i sumj = _mm256_add_epi32(p16l, p16h);
 
+            sumi = _mm256_add_epi32(sumi, sumj);
         }
 
         __m256 vd = _mm256_set1_ps(d);
@@ -3096,9 +3121,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mone = vdupq_n_u8(1);
     const uint8x16_t mtwo = vdupq_n_u8(2);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
 
@@ -3441,8 +3468,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mh = vdupq_n_u8(16);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
     uint8x16x4_t q5h;
@@ -3660,7 +3689,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
     //const int8x16_t m32s = vdupq_n_s8(32);
 
     const uint8x16_t mone = vdupq_n_u8(3);
@@ -4049,8 +4080,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
-    const int32x4_t vzero = vdupq_n_s32(0);
     const int8x16_t  m32s = vdupq_n_s8(32);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     const uint8x16_t mone = vdupq_n_u8(3);
 