llama_cpp 0.4.0 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +9 -0
- data/examples/chat.rb +2 -2
- data/ext/llama_cpp/extconf.rb +1 -1
- data/ext/llama_cpp/llama_cpp.cpp +23 -11
- data/ext/llama_cpp/src/ggml-alloc.c +13 -50
- data/ext/llama_cpp/src/ggml-cuda.cu +23 -11
- data/ext/llama_cpp/src/ggml-metal.h +1 -0
- data/ext/llama_cpp/src/ggml-metal.m +130 -61
- data/ext/llama_cpp/src/ggml-metal.metal +44 -26
- data/ext/llama_cpp/src/ggml.c +637 -328
- data/ext/llama_cpp/src/ggml.h +45 -19
- data/ext/llama_cpp/src/k_quants.c +2 -2
- data/ext/llama_cpp/src/llama.cpp +426 -97
- data/ext/llama_cpp/src/llama.h +51 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/lib/llama_cpp.rb +1 -1
- data/sig/llama_cpp.rbs +5 -3
- metadata +2 -2
data/ext/llama_cpp/src/ggml.h
CHANGED
@@ -130,13 +130,16 @@
|
|
130
130
|
// The data of the tensor is accessed via the "data" pointer. For example:
|
131
131
|
//
|
132
132
|
// {
|
133
|
-
//
|
133
|
+
// const int nx = 2;
|
134
|
+
// const int ny = 3;
|
134
135
|
//
|
135
|
-
//
|
136
|
-
// *(float *) ((char *) a->data + 2*a->nb[1] + 1*a->nb[0]) = 1.0f;
|
136
|
+
// struct ggml_tensor * a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, nx, ny);
|
137
137
|
//
|
138
|
-
//
|
139
|
-
//
|
138
|
+
// for (int y = 0; y < ny; y++) {
|
139
|
+
// for (int x = 0; x < nx; x++) {
|
140
|
+
// *(float *) ((char *) a->data + y*a->nb[1] + x*a->nb[0]) = x + y;
|
141
|
+
// }
|
142
|
+
// }
|
140
143
|
//
|
141
144
|
// ...
|
142
145
|
// }
|
@@ -211,12 +214,17 @@
|
|
211
214
|
#define GGML_MAX_OP_PARAMS 32
|
212
215
|
#define GGML_DEFAULT_N_THREADS 4
|
213
216
|
|
217
|
+
#if UINTPTR_MAX == 0xFFFFFFFF
|
218
|
+
#define GGML_MEM_ALIGN 4
|
219
|
+
#else
|
220
|
+
#define GGML_MEM_ALIGN 16
|
221
|
+
#endif
|
214
222
|
|
215
223
|
#define GGML_EXIT_SUCCESS 0
|
216
224
|
#define GGML_EXIT_ABORTED 1
|
217
225
|
|
218
226
|
#define GGUF_MAGIC 0x46554747 // "GGUF"
|
219
|
-
#define GGUF_VERSION
|
227
|
+
#define GGUF_VERSION 2
|
220
228
|
|
221
229
|
#define GGUF_DEFAULT_ALIGNMENT 32
|
222
230
|
|
@@ -471,6 +479,9 @@ extern "C" {
|
|
471
479
|
int64_t perf_cycles;
|
472
480
|
int64_t perf_time_us;
|
473
481
|
|
482
|
+
struct ggml_tensor * view_src;
|
483
|
+
size_t view_offs;
|
484
|
+
|
474
485
|
void * data;
|
475
486
|
|
476
487
|
char name[GGML_MAX_NAME];
|
@@ -653,7 +664,7 @@ extern "C" {
|
|
653
664
|
GGML_API struct ggml_tensor * ggml_new_f32(struct ggml_context * ctx, float value);
|
654
665
|
|
655
666
|
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
|
656
|
-
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx,
|
667
|
+
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, struct ggml_tensor * src);
|
657
668
|
|
658
669
|
GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
|
659
670
|
|
@@ -944,11 +955,11 @@ extern "C" {
|
|
944
955
|
|
945
956
|
// a - x
|
946
957
|
// b - dy
|
947
|
-
// TODO: update with configurable eps
|
948
958
|
GGML_API struct ggml_tensor * ggml_rms_norm_back(
|
949
959
|
struct ggml_context * ctx,
|
950
960
|
struct ggml_tensor * a,
|
951
|
-
struct ggml_tensor * b
|
961
|
+
struct ggml_tensor * b,
|
962
|
+
float eps);
|
952
963
|
|
953
964
|
// A: n columns, m rows
|
954
965
|
// B: n columns, p rows (i.e. we transpose it internally)
|
@@ -1604,7 +1615,8 @@ extern "C" {
|
|
1604
1615
|
struct ggml_tensor * tensor);
|
1605
1616
|
|
1606
1617
|
|
1607
|
-
GGML_API void ggml_build_forward_expand(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
1618
|
+
GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
|
1619
|
+
GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
|
1608
1620
|
|
1609
1621
|
GGML_API struct ggml_cgraph ggml_build_forward (struct ggml_tensor * tensor);
|
1610
1622
|
GGML_API struct ggml_cgraph ggml_build_backward(struct ggml_context * ctx, struct ggml_cgraph * gf, bool keep);
|
@@ -1669,6 +1681,8 @@ extern "C" {
|
|
1669
1681
|
GGML_LINESEARCH_INVALID_PARAMETERS,
|
1670
1682
|
};
|
1671
1683
|
|
1684
|
+
typedef void (*ggml_opt_callback)(void * data, float * sched);
|
1685
|
+
|
1672
1686
|
// optimization parameters
|
1673
1687
|
//
|
1674
1688
|
// see ggml.c (ggml_opt_default_params) for default values
|
@@ -1704,12 +1718,14 @@ extern "C" {
|
|
1704
1718
|
|
1705
1719
|
float sched; // schedule multiplier (fixed, decay or warmup)
|
1706
1720
|
float decay; // weight decay for AdamW, use 0.0f to disable
|
1721
|
+
int decay_min_ndim; // minimum number of tensor dimension to apply weight decay
|
1707
1722
|
float alpha; // learning rate
|
1708
1723
|
float beta1;
|
1709
1724
|
float beta2;
|
1710
1725
|
float eps; // epsilon for numerical stability
|
1711
1726
|
float eps_f; // epsilon for convergence test
|
1712
1727
|
float eps_g; // epsilon for convergence test
|
1728
|
+
float gclip; // gradient clipping
|
1713
1729
|
} adam;
|
1714
1730
|
|
1715
1731
|
// LBFGS parameters
|
@@ -1737,14 +1753,12 @@ extern "C" {
|
|
1737
1753
|
|
1738
1754
|
bool just_initialized;
|
1739
1755
|
|
1756
|
+
float loss_before;
|
1757
|
+
float loss_after;
|
1758
|
+
|
1740
1759
|
struct {
|
1741
|
-
struct ggml_tensor * x; // view of the parameters
|
1742
|
-
struct ggml_tensor * g1; // gradient
|
1743
|
-
struct ggml_tensor * g2; // gradient squared
|
1744
1760
|
struct ggml_tensor * m; // first moment
|
1745
1761
|
struct ggml_tensor * v; // second moment
|
1746
|
-
struct ggml_tensor * mh; // first moment hat
|
1747
|
-
struct ggml_tensor * vh; // second moment hat
|
1748
1762
|
struct ggml_tensor * pf; // past function values
|
1749
1763
|
float fx_best;
|
1750
1764
|
float fx_prev;
|
@@ -1781,10 +1795,10 @@ extern "C" {
|
|
1781
1795
|
|
1782
1796
|
// initialize optimizer context
|
1783
1797
|
GGML_API void ggml_opt_init(
|
1784
|
-
struct ggml_context
|
1798
|
+
struct ggml_context * ctx,
|
1785
1799
|
struct ggml_opt_context * opt,
|
1786
|
-
struct ggml_opt_params
|
1787
|
-
int64_t
|
1800
|
+
struct ggml_opt_params params,
|
1801
|
+
int64_t nx);
|
1788
1802
|
|
1789
1803
|
// continue optimizing the function defined by the tensor f
|
1790
1804
|
GGML_API enum ggml_opt_result ggml_opt_resume(
|
@@ -1798,7 +1812,9 @@ extern "C" {
|
|
1798
1812
|
struct ggml_opt_context * opt,
|
1799
1813
|
struct ggml_tensor * f,
|
1800
1814
|
struct ggml_cgraph * gf,
|
1801
|
-
struct ggml_cgraph * gb
|
1815
|
+
struct ggml_cgraph * gb,
|
1816
|
+
ggml_opt_callback callback,
|
1817
|
+
void * callback_data);
|
1802
1818
|
|
1803
1819
|
//
|
1804
1820
|
// quantization
|
@@ -1827,6 +1843,9 @@ extern "C" {
|
|
1827
1843
|
GGUF_TYPE_BOOL = 7,
|
1828
1844
|
GGUF_TYPE_STRING = 8,
|
1829
1845
|
GGUF_TYPE_ARRAY = 9,
|
1846
|
+
GGUF_TYPE_UINT64 = 10,
|
1847
|
+
GGUF_TYPE_INT64 = 11,
|
1848
|
+
GGUF_TYPE_FLOAT64 = 12,
|
1830
1849
|
GGUF_TYPE_COUNT, // marks the end of the enum
|
1831
1850
|
};
|
1832
1851
|
|
@@ -1867,6 +1886,9 @@ extern "C" {
|
|
1867
1886
|
GGML_API uint32_t gguf_get_val_u32 (struct gguf_context * ctx, int i);
|
1868
1887
|
GGML_API int32_t gguf_get_val_i32 (struct gguf_context * ctx, int i);
|
1869
1888
|
GGML_API float gguf_get_val_f32 (struct gguf_context * ctx, int i);
|
1889
|
+
GGML_API uint64_t gguf_get_val_u64 (struct gguf_context * ctx, int i);
|
1890
|
+
GGML_API int64_t gguf_get_val_i64 (struct gguf_context * ctx, int i);
|
1891
|
+
GGML_API double gguf_get_val_f64 (struct gguf_context * ctx, int i);
|
1870
1892
|
GGML_API bool gguf_get_val_bool(struct gguf_context * ctx, int i);
|
1871
1893
|
GGML_API const char * gguf_get_val_str (struct gguf_context * ctx, int i);
|
1872
1894
|
GGML_API int gguf_get_arr_n (struct gguf_context * ctx, int i);
|
@@ -1886,6 +1908,9 @@ extern "C" {
|
|
1886
1908
|
GGML_API void gguf_set_val_u32 (struct gguf_context * ctx, const char * key, uint32_t val);
|
1887
1909
|
GGML_API void gguf_set_val_i32 (struct gguf_context * ctx, const char * key, int32_t val);
|
1888
1910
|
GGML_API void gguf_set_val_f32 (struct gguf_context * ctx, const char * key, float val);
|
1911
|
+
GGML_API void gguf_set_val_u64 (struct gguf_context * ctx, const char * key, uint64_t val);
|
1912
|
+
GGML_API void gguf_set_val_i64 (struct gguf_context * ctx, const char * key, int64_t val);
|
1913
|
+
GGML_API void gguf_set_val_f64 (struct gguf_context * ctx, const char * key, double val);
|
1889
1914
|
GGML_API void gguf_set_val_bool(struct gguf_context * ctx, const char * key, bool val);
|
1890
1915
|
GGML_API void gguf_set_val_str (struct gguf_context * ctx, const char * key, const char * val);
|
1891
1916
|
GGML_API void gguf_set_arr_data(struct gguf_context * ctx, const char * key, enum gguf_type type, const void * data, int n);
|
@@ -1944,6 +1969,7 @@ extern "C" {
|
|
1944
1969
|
GGML_API int ggml_cpu_has_clblast (void);
|
1945
1970
|
GGML_API int ggml_cpu_has_gpublas (void);
|
1946
1971
|
GGML_API int ggml_cpu_has_sse3 (void);
|
1972
|
+
GGML_API int ggml_cpu_has_ssse3 (void);
|
1947
1973
|
GGML_API int ggml_cpu_has_vsx (void);
|
1948
1974
|
|
1949
1975
|
//
|
data/ext/llama_cpp/src/k_quants.c
CHANGED
@@ -2694,13 +2694,13 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
|
|
2694
2694
|
const __m256i q8l = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
2695
2695
|
__m256i p16l = _mm256_maddubs_epi16(q4l, q8l);
|
2696
2696
|
p16l = _mm256_madd_epi16(scale_l, p16l);
|
2697
|
-
sumi = _mm256_add_epi32(sumi, p16l);
|
2698
2697
|
|
2699
2698
|
const __m256i q8h = _mm256_loadu_si256((const __m256i*)q8); q8 += 32;
|
2700
2699
|
__m256i p16h = _mm256_maddubs_epi16(q4h, q8h);
|
2701
2700
|
p16h = _mm256_madd_epi16(scale_h, p16h);
|
2702
|
-
|
2701
|
+
const __m256i sumj = _mm256_add_epi32(p16l, p16h);
|
2703
2702
|
|
2703
|
+
sumi = _mm256_add_epi32(sumi, sumj);
|
2704
2704
|
}
|
2705
2705
|
|
2706
2706
|
__m256 vd = _mm256_set1_ps(d);
|