llama_cpp 0.4.0 → 0.5.0
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +13 -50
- data/ext/llama_cpp/src/ggml-cuda.cu +23 -11
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +130 -61
- data/ext/llama_cpp/src/ggml-metal.metal +44 -26
- data/ext/llama_cpp/src/ggml.c +637 -328
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +2 -2
- data/ext/llama_cpp/src/llama.cpp +426 -97
- data/ext/llama_cpp/src/llama.h +51 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -130,13 +130,16 @@
|
|
130
130
|
// The data of the tensor is accessed via the "data" pointer. For example:
|
131
131
|
//
|
132
132
|
// {
|
133
|
-
//
|
133
|
+
// const int nx = 2;
|
134
|
+
// const int ny = 3;
|
134
135
|
//
|
135
|
-
//
|
136
|
-
// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
|
136
|
+
// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
|
137
137
|
//
|
138
|
-
//
|
139
|
-
//
|
138
|
+
// for (int y = 0; y < ny; y++) {
|
139
|
+
// for (int x = 0; x < nx; x++) {
|
140
|
+
// *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
|
141
|
+
// }
|
142
|
+
// }
|
140
143
|
//
|
141
144
|
// ...
|
142
145
|
// }
|
@@ -211,12 +214,17 @@
|
|
211
214
|
#define GGML_MAX_OP_PARAMS 32
|
212
215
|
#define GGML_DEFAULT_N_THREADS 4
|
213
216
|
|
217
|
+
#if UINTPTR_MAX == 0xFFFFFFFF
|
218
|
+
#define GGML_MEM_ALIGN 4
|
219
|
+
#else
|
220
|
+
#define GGML_MEM_ALIGN 16
|
221
|
+
#endif
|
214
222
|
|
215
223
|
#define GGML_EXIT_SUCCESS 0
|
216
224
|
#define GGML_EXIT_ABORTED 1
|
217
225
|
|
218
226
|
#define GGUF_MAGIC 0x46554747 // "GGUF"
|
219
|
-
#define GGUF_VERSION 1
|
227
|
+
#define GGUF_VERSION 2
|
220
228
|
|
221
229
|
#define GGUF_DEFAULT_ALIGNMENT 32
|
222
230
|
|
@@ -471,6 +479,9 @@ extern "C" {
|
|
471
479
|
int64_t perf_cycles;
|
472
480
|
int64_t perf_time_us;
|
473
481
|
|
482
|
+
struct ggml_tensor * view_src;
|
483
|
+
size_t view_offs;
|
484
|
+
|
474
485
|
void * data;
|
475
486
|
|
476
487
|
char name[GGML_MAX_NAME];
|
@@ -653,7 +664,7 @@ extern "C" {
|
|
653
664
|
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
654
665
|
|
655
666
|
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
656
|
-
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
|
667
|
+
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
|
657
668
|
|
658
669
|
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
659
670
|
|
@@ -944,11 +955,11 @@ extern "C" {
|
|
944
955
|
|
945
956
|
// a - x
|
946
957
|
// b - dy
|
947
|
-
// TODO: update with configurable eps
|
948
958
|
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
949
959
|
struct ggml_context * ctx,
|
950
960
|
struct ggml_tensor * a,
|
951
|
-
struct ggml_tensor * b);
|
961
|
+
struct ggml_tensor * b,
|
962
|
+
float eps);
|
952
963
|
|
953
964
|
// A: n columns, m rows
|
954
965
|
// B: n columns, p rows (i.e. we transpose it internally)
|
@@ -1604,7 +1615,8 @@ extern "C" {
|
|
1604
1615
|
struct ggml_tensor * tensor);
|
1605
1616
|
|
1606
1617
|
|
1607
|
-
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
1618
|
+
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
1619
|
+
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
|
1608
1620
|
|
1609
1621
|
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
1610
1622
|
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
@@ -1669,6 +1681,8 @@ extern "C" {
|
|
1669
1681
|
GGML_LINESEARCH_INVALID_PARAMETERS,
|
1670
1682
|
};
|
1671
1683
|
|
1684
|
+
typedef void (*ggml_opt_callback)(void * data, float * sched);
|
1685
|
+
|
1672
1686
|
// optimization parameters
|
1673
1687
|
//
|
1674
1688
|
// see ggml.c (ggml_opt_default_params) for default values
|
@@ -1704,12 +1718,14 @@ extern "C" {
|
|
1704
1718
|
|
1705
1719
|
float sched; // schedule multiplier (fixed, decay or warmup)
|
1706
1720
|
float decay; // weight decay for AdamW, use 0.0f to disable
|
1721
|
+
int decay_min_ndim; // minimum number of tensor dimension to apply weight decay
|
1707
1722
|
float alpha; // learning rate
|
1708
1723
|
float beta1;
|
1709
1724
|
float beta2;
|
1710
1725
|
float eps; // epsilon for numerical stability
|
1711
1726
|
float eps_f; // epsilon for convergence test
|
1712
1727
|
float eps_g; // epsilon for convergence test
|
1728
|
+
float gclip; // gradient clipping
|
1713
1729
|
} adam;
|
1714
1730
|
|
1715
1731
|
// LBFGS parameters
|
@@ -1737,14 +1753,12 @@ extern "C" {
|
|
1737
1753
|
|
1738
1754
|
bool just_initialized;
|
1739
1755
|
|
1756
|
+
float loss_before;
|
1757
|
+
float loss_after;
|
1758
|
+
|
1740
1759
|
struct {
|
1741
|
-
struct ggml_tensor * x; // view of the parameters
|
1742
|
-
struct ggml_tensor * g1; // gradient
|
1743
|
-
struct ggml_tensor * g2; // gradient squared
|
1744
1760
|
struct ggml_tensor * m; // first moment
|
1745
1761
|
struct ggml_tensor * v; // second moment
|
1746
|
-
struct ggml_tensor * mh; // first moment hat
|
1747
|
-
struct ggml_tensor * vh; // second moment hat
|
1748
1762
|
struct ggml_tensor * pf; // past function values
|
1749
1763
|
float fx_best;
|
1750
1764
|
float fx_prev;
|
@@ -1781,10 +1795,10 @@ extern "C" {
|
|
1781
1795
|
|
1782
1796
|
// initialize optimizer context
|
1783
1797
|
GGML_API void ggml_opt_init(
|
1784
|
-
struct ggml_context * ctx,
|
1798
|
+
struct ggml_context * ctx,
|
1785
1799
|
struct ggml_opt_context * opt,
|
1786
|
-
struct ggml_opt_params params,
|
1787
|
-
int64_t nx);
|
1800
|
+
struct ggml_opt_params params,
|
1801
|
+
int64_t nx);
|
1788
1802
|
|
1789
1803
|
// continue optimizing the function defined by the tensor f
|
1790
1804
|
GGML_API enum ggml_opt_result ggml_opt_resume(
|
@@ -1798,7 +1812,9 @@ extern "C" {
|
|
1798
1812
|
struct ggml_opt_context * opt,
|
1799
1813
|
struct ggml_tensor * f,
|
1800
1814
|
struct ggml_cgraph * gf,
|
1801
|
-
struct ggml_cgraph * gb);
|
1815
|
+
struct ggml_cgraph * gb,
|
1816
|
+
ggml_opt_callback callback,
|
1817
|
+
void * callback_data);
|
1802
1818
|
|
1803
1819
|
//
|
1804
1820
|
// quantization
|
@@ -1827,6 +1843,9 @@ extern "C" {
|
|
1827
1843
|
GGUF_TYPE_BOOL = 7,
|
1828
1844
|
GGUF_TYPE_STRING = 8,
|
1829
1845
|
GGUF_TYPE_ARRAY = 9,
|
1846
|
+
GGUF_TYPE_UINT64 = 10,
|
1847
|
+
GGUF_TYPE_INT64 = 11,
|
1848
|
+
GGUF_TYPE_FLOAT64 = 12,
|
1830
1849
|
GGUF_TYPE_COUNT, // marks the end of the enum
|
1831
1850
|
};
|
1832
1851
|
|
@@ -1867,6 +1886,9 @@ extern "C" {
|
|
1867
1886
|
GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
|
1868
1887
|
GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
|
1869
1888
|
GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
|
1889
|
+
GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i);
|
1890
|
+
GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i);
|
1891
|
+
GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i);
|
1870
1892
|
GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
|
1871
1893
|
GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
|
1872
1894
|
GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
|
@@ -1886,6 +1908,9 @@ extern "C" {
|
|
1886
1908
|
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
|
1887
1909
|
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
|
1888
1910
|
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
|
1911
|
+
GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
|
1912
|
+
GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
|
1913
|
+
GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
|
1889
1914
|
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
|
1890
1915
|
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
|
1891
1916
|
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
|
@@ -1944,6 +1969,7 @@ extern "C" {
|
|
1944
1969
|
GGML_API int ggml_cpu_has_clblast (void);
|
1945
1970
|
GGML_API int ggml_cpu_has_gpublas (void);
|
1946
1971
|
GGML_API int ggml_cpu_has_sse3 (void);
|
1972
|
+
GGML_API int ggml_cpu_has_ssse3 (void);
|
1947
1973
|
GGML_API int ggml_cpu_has_vsx (void);
|
1948
1974
|
|
1949
1975
|
//
|
data/ext/llama_cpp/src/k_quants.c
CHANGED
@@ -2694,13 +2694,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|
2694
2694
|
const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
2695
2695
|
__m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
|
2696
2696
|
p16l = _mm256_madd_epi16(scale_l, p16l);
|
2697
|
-
sumi = _mm256_add_epi32(sumi, p16l);
|
2698
2697
|
|
2699
2698
|
const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
2700
2699
|
__m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
|
2701
2700
|
p16h = _mm256_madd_epi16(scale_h, p16h);
|
2702
|
-
|
2701
|
+
const __m256i sumj = _mm256_add_epi32(p16l, p16h);
|
2703
2702
|
|
2703
|
+
sumi = _mm256_add_epi32(sumi, sumj);
|
2704
2704
|
}
|
2705
2705
|
|
2706
2706
|
__m256 vd = _mm256_set1_ps(d);
|