llama_cpp 0.9.5 → 0.10.0
This diff shows the changes between publicly released versions of the package as they appear in their public registries, and is provided for informational purposes only.
- checksums.yaml +4 -4
- data/CHANGELOG.md +10 -0
- data/ext/llama_cpp/llama_cpp.cpp +121 -15
- data/ext/llama_cpp/src/ggml-alloc.c +42 -7
- data/ext/llama_cpp/src/ggml-alloc.h +7 -0
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1140 -355
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +506 -158
- data/ext/llama_cpp/src/ggml-metal.metal +795 -144
- data/ext/llama_cpp/src/ggml.c +331 -111
- data/ext/llama_cpp/src/ggml.h +49 -4
- data/ext/llama_cpp/src/llama.cpp +749 -329
- data/ext/llama_cpp/src/llama.h +28 -5
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +20 -2
- metadata +2 -2
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -74,6 +74,7 @@
 #include <set>
 #include <sstream>
 #include <thread>
+#include <type_traits>
 #include <unordered_map>

 #if defined(_MSC_VER)
@@ -192,6 +193,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
+    LLM_ARCH_QWEN,
     LLM_ARCH_UNKNOWN,
 };

@@ -208,6 +210,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT,   "refact"   },
     { LLM_ARCH_BLOOM,    "bloom"    },
     { LLM_ARCH_STABLELM, "stablelm" },
+    { LLM_ARCH_QWEN,     "qwen"     },
 };

 enum llm_kv {
@@ -518,6 +521,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_QWEN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },

     {
         LLM_ARCH_UNKNOWN,
@@ -572,21 +591,6 @@ struct LLM_TN {
 // gguf helpers
 //

-#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-    do { \
-        const std::string skey(key); \
-        const int kid = gguf_find_key(ctx, skey.c_str()); \
-        if (kid >= 0) { \
-            enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
-            if (ktype != (type)) { \
-                throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
-            } \
-            (dst) = func(ctx, kid); \
-        } else if (req) { \
-            throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
-        } \
-    } while (0)
-
 static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
     { LLAMA_ROPE_SCALING_NONE, "none" },
     { LLAMA_ROPE_SCALING_LINEAR, "linear" },
@@ -620,7 +624,7 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int
     }
 }

-static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
     const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);

     switch (type) {
@@ -1222,6 +1226,7 @@ struct llama_cparams {
     float yarn_beta_slow;

     bool mul_mat_q;
+    bool offload_kqv;
 };

 struct llama_layer {
@@ -1243,6 +1248,9 @@ struct llama_layer {
     struct ggml_tensor * wqkv;

     // attention bias
+    struct ggml_tensor * bq;
+    struct ggml_tensor * bk;
+    struct ggml_tensor * bv;
     struct ggml_tensor * bo;
     struct ggml_tensor * bqkv;

@@ -1287,8 +1295,8 @@ struct llama_kv_cache {

     std::vector<llama_kv_cell> cells;

-    struct ggml_tensor
-    struct ggml_tensor
+    std::vector<struct ggml_tensor *> k_l; // per layer
+    std::vector<struct ggml_tensor *> v_l;

     struct ggml_context * ctx = NULL;

@@ -1301,8 +1309,10 @@ struct llama_kv_cache {

 #ifdef GGML_USE_CUBLAS
         if (ggml_cublas_loaded()) {
-
-
+            for (size_t i = 0; i < k_l.size(); ++i) {
+                ggml_cuda_free_data(k_l[i]);
+                ggml_cuda_free_data(v_l[i]);
+            }
         }
 #endif
     }
@@ -1492,9 +1502,11 @@ struct llama_context {
 static bool llama_kv_cache_init(
         const struct llama_hparams & hparams,
              struct llama_kv_cache & cache,
-                         ggml_type
+                         ggml_type   ktype,
+                         ggml_type   vtype,
                           uint32_t   n_ctx,
-                               int   n_gpu_layers
+                               int   n_gpu_layers,
+                              bool   offload) {
     const uint32_t n_embd  = hparams.n_embd_gqa();
     const uint32_t n_layer = hparams.n_layer;

@@ -1510,7 +1522,7 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);

-    cache.buf.resize(
+    cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead());
     memset(cache.buf.data, 0, cache.buf.size);

     struct ggml_init_params params;
@@ -1520,37 +1532,44 @@ static bool llama_kv_cache_init(

     cache.ctx = ggml_init(params);

+    size_t vram_kv_cache = 0;
+
     if (!cache.ctx) {
         LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
         return false;
     }

-    cache.
-    cache.
-    ggml_set_name(cache.k, "cache_k");
-    ggml_set_name(cache.v, "cache_v");
+    cache.k_l.reserve(n_layer);
+    cache.v_l.reserve(n_layer);

-    (
+    const int i_gpu_start = (int) n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start);

-
-    if (ggml_cublas_loaded()) {
-        size_t vram_kv_cache = 0;
+    GGML_UNUSED(offload);

-
-
-
-
-
-
-
-
-
-
-
-
+    for (int i = 0; i < (int) n_layer; i++) {
+        ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
+        ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx);
+        ggml_format_name(k, "cache_k_l%d", i);
+        ggml_format_name(v, "cache_v_l%d", i);
+        cache.k_l.push_back(k);
+        cache.v_l.push_back(v);
+#ifdef GGML_USE_CUBLAS
+        if (i >= i_gpu_start) {
+            if (offload) {
+                ggml_cuda_assign_buffers_no_scratch(k);
+                vram_kv_cache += ggml_nbytes(k);
+                ggml_cuda_assign_buffers_no_scratch(v);
+                vram_kv_cache += ggml_nbytes(v);
+            }
         }
+#endif // GGML_USE_CUBLAS
     }
-
+
+    if (vram_kv_cache > 0) {
+        LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+    }
+
+    GGML_UNUSED(n_gpu_layers);

     return true;
 }
@@ -1771,6 +1790,169 @@ static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
     return buf;
 }

+namespace GGUFMeta {
+    template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
+    struct GKV_Base_Type {
+        static constexpr gguf_type gt = gt_;
+
+        static T getter(const gguf_context * ctx, const int kid) {
+            return gfun(ctx, kid);
+        }
+    };
+
+    template<typename T> struct GKV_Base;
+
+    template<> struct GKV_Base<bool        >: GKV_Base_Type<bool,         GGUF_TYPE_BOOL,    gguf_get_val_bool> {};
+    template<> struct GKV_Base<uint8_t     >: GKV_Base_Type<uint8_t,      GGUF_TYPE_UINT8,   gguf_get_val_u8  > {};
+    template<> struct GKV_Base<uint16_t    >: GKV_Base_Type<uint16_t,     GGUF_TYPE_UINT16,  gguf_get_val_u16 > {};
+    template<> struct GKV_Base<uint32_t    >: GKV_Base_Type<uint32_t,     GGUF_TYPE_UINT32,  gguf_get_val_u32 > {};
+    template<> struct GKV_Base<uint64_t    >: GKV_Base_Type<uint64_t,     GGUF_TYPE_UINT64,  gguf_get_val_u64 > {};
+    template<> struct GKV_Base<int8_t      >: GKV_Base_Type<int8_t,       GGUF_TYPE_INT8,    gguf_get_val_i8  > {};
+    template<> struct GKV_Base<int16_t     >: GKV_Base_Type<int16_t,      GGUF_TYPE_INT16,   gguf_get_val_i16 > {};
+    template<> struct GKV_Base<int32_t     >: GKV_Base_Type<int32_t,      GGUF_TYPE_INT32,   gguf_get_val_i32 > {};
+    template<> struct GKV_Base<int64_t     >: GKV_Base_Type<int64_t,      GGUF_TYPE_INT64,   gguf_get_val_i64 > {};
+    template<> struct GKV_Base<float       >: GKV_Base_Type<float,        GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
+    template<> struct GKV_Base<double      >: GKV_Base_Type<double,       GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
+    template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING,  gguf_get_val_str > {};
+
+    template<> struct GKV_Base<std::string> {
+        static constexpr gguf_type gt = GGUF_TYPE_STRING;
+
+        static std::string getter(const gguf_context * ctx, const int kid) {
+            return gguf_get_val_str(ctx, kid);
+        }
+    };
+
+    struct ArrayInfo{
+        const gguf_type gt;
+        const size_t length;
+        const void * data;
+    };
+
+    template<> struct GKV_Base<ArrayInfo> {
+        public:
+        static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
+        static ArrayInfo getter(const gguf_context *ctx, const int k) {
+            return ArrayInfo {
+                gguf_get_arr_type(ctx, k),
+                size_t(gguf_get_arr_n(ctx, k)),
+                gguf_get_arr_data(ctx, k),
+            };
+        }
+    };
+
+    template<typename T>
+    class GKV: public GKV_Base<T> {
+        GKV() = delete;
+
+        public:
+        static T get_kv(const gguf_context * ctx, const int k) {
+            const enum gguf_type kt = gguf_get_kv_type(ctx, k);
+
+            if (kt != GKV::gt) {
+                throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
+                    gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
+            }
+            return GKV::getter(ctx, k);
+        }
+
+        static const char * override_type_to_str(const llama_model_kv_override_type ty) {
+            switch (ty) {
+                case LLAMA_KV_OVERRIDE_BOOL:  return "bool";
+                case LLAMA_KV_OVERRIDE_INT:   return "int";
+                case LLAMA_KV_OVERRIDE_FLOAT: return "float";
+            }
+            return "unknown";
+        }
+
+        static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) {
+            if (!override) { return false; }
+            if (override->tag == expected_type) {
+                LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
+                    __func__, override_type_to_str(override->tag), override->key);
+                switch (override->tag) {
+                    case LLAMA_KV_OVERRIDE_BOOL:  {
+                        printf("%s\n", override->bool_value ? "true" : "false");
+                    } break;
+                    case LLAMA_KV_OVERRIDE_INT:   {
+                        printf("%" PRId64 "\n", override->int_value);
+                    } break;
+                    case LLAMA_KV_OVERRIDE_FLOAT: {
+                        printf("%.6f\n", override->float_value);
+                    } break;
+                    default:
+                        // Shouldn't be possible to end up here, but just in case...
+                        throw std::runtime_error(
+                            format("Unsupported attempt to override %s type for metadata key %s\n",
+                                override_type_to_str(override->tag), override->key));
+                }
+                return true;
+            }
+            LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
+                __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag));
+            return false;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
+        try_override(OT & target, const struct llama_model_kv_override *override) {
+            if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
+                target = override->bool_value;
+                return true;
+            }
+            return true;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
+        try_override(OT & target, const struct llama_model_kv_override *override) {
+            if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
+                target = override->int_value;
+                return true;
+            }
+            return false;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
+        try_override(T & target, const struct llama_model_kv_override *override) {
+            if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
+                target = override->float_value;
+                return true;
+            }
+            return false;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
+        try_override(T & target, const struct llama_model_kv_override *override) {
+            (void)target;
+            (void)override;
+            if (!override) { return false; }
+            // Currently, we should never end up here so it would be a bug if we do.
+            throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
+                override ? override->key : "NULL"));
+        }
+
+        static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) {
+            if (try_override<T>(target, override)) {
+                return true;
+            }
+            if (k < 0) { return false; }
+            target = get_kv(ctx, k);
+            return true;
+        }
+
+        static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) {
+            return set(ctx, gguf_find_key(ctx, key), target, override);
+        }
+
+        static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) {
+            return set(ctx, key.c_str(), target, override);
+        }
+    };
+}
+
 struct llama_model_loader {
     int n_kv      = 0;
     int n_tensors = 0;
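
The GKV_Base specializations added above form a type-to-tag trait table: each C++ type is bound to the GGUF type it expects and to the getter that reads it, and GKV<T>::get_kv refuses to read a value whose stored tag disagrees. In case the template machinery is hard to follow in diff form, here is a self-contained sketch of the same pattern with made-up names; it is not part of the package.

    // Minimal illustration of the trait pattern used by GGUFMeta above (names are invented).
    #include <cstdint>
    #include <stdexcept>
    #include <string>

    enum kv_type { KV_U32, KV_F32, KV_STR };

    struct kv_cell { kv_type t; uint32_t u32; float f32; std::string str; };

    template <typename T> struct kv_trait;
    template <> struct kv_trait<uint32_t>    { static constexpr kv_type tag = KV_U32; static uint32_t    get(const kv_cell & c) { return c.u32; } };
    template <> struct kv_trait<float>       { static constexpr kv_type tag = KV_F32; static float       get(const kv_cell & c) { return c.f32; } };
    template <> struct kv_trait<std::string> { static constexpr kv_type tag = KV_STR; static std::string get(const kv_cell & c) { return c.str; } };

    template <typename T>
    T read_kv(const kv_cell & c) {
        // type-checked read, analogous to GKV<T>::get_kv throwing on a tag mismatch
        if (c.t != kv_trait<T>::tag) {
            throw std::runtime_error("kv has wrong type");
        }
        return kv_trait<T>::get(c);
    }

    int main() {
        kv_cell c{KV_U32, 4096, 0.0f, ""};
        return read_kv<uint32_t>(c) == 4096 ? 0 : 1;
    }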
@@ -1786,21 +1968,34 @@ struct llama_model_loader {
     llama_fver fver;

     std::unique_ptr<llama_mmap> mapping;
+    std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;

     struct gguf_context * ctx_gguf = NULL;
     struct ggml_context * ctx_meta = NULL;

-
+    std::string arch_name;
+    LLM_KV      llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
+
+    llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
         struct gguf_init_params params = {
             /*.no_alloc = */ true,
             /*.ctx      = */ &ctx_meta,
         };

+        if (param_overrides_p != nullptr) {
+            for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
+                kv_overrides.insert({std::string(p->key), *p});
+            }
+        }
+
         ctx_gguf = gguf_init_from_file(fname.c_str(), params);
         if (!ctx_gguf) {
             throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
         }

+        get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+        llm_kv = LLM_KV(llm_arch_from_string(arch_name));
+
         n_kv      = gguf_get_n_kv(ctx_gguf);
         n_tensors = gguf_get_n_tensors(ctx_gguf);

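
The constructor above copies caller-supplied overrides into kv_overrides, stopping at the first entry whose key begins with '\0'. A hedged sketch of how a caller could build such an array follows; the llama_model_kv_override field names (key, tag, bool_value) appear in this diff, but the key string used and the exact declarations in llama.h are assumptions for illustration.

    // Illustrative only: terminate the array with an entry whose key[0] == 0,
    // matching the p->key[0] != 0 loop in the constructor above.
    #include <cstring>
    #include <vector>
    #include "llama.h"   // assumed to declare llama_model_kv_override / llama_model_params

    std::vector<llama_model_kv_override> make_overrides() {
        std::vector<llama_model_kv_override> kvo(2);   // zero-initialized POD entries

        // assumed GGUF key name; overriding a bool-typed metadata value
        std::strncpy(kvo[0].key, "tokenizer.ggml.add_bos_token", sizeof(kvo[0].key) - 1);
        kvo[0].tag        = LLAMA_KV_OVERRIDE_BOOL;
        kvo[0].bool_value = false;

        kvo[1].key[0] = 0;   // sentinel entry
        return kvo;          // point llama_model_params::kv_overrides at kvo.data()
    }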
@@ -1868,6 +2063,7 @@ struct llama_model_loader {
         }
     }

+    LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
     for (int i = 0; i < n_kv; i++) {
         const char * name           = gguf_get_key(ctx_gguf, i);
         const enum gguf_type type   = gguf_get_kv_type(ctx_gguf, i);
@@ -1913,19 +2109,59 @@ struct llama_model_loader {
         }
     }

-
-
+    template<typename T>
+    typename std::enable_if<std::is_integral<T>::value, bool>::type
+    get_arr_n(const std::string & key, T & result, const bool required = true) {
+        const int kid = gguf_find_key(ctx_gguf, key.c_str());
+
+        if (kid < 0) {
+            if (required) {
+                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+            }
+            return false;
+        }
+
+        struct GGUFMeta::ArrayInfo arr_info =
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);

-        std::string arch_name;
-        GGUF_GET_KEY(ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE));

+        result = arr_info.length;
+        return true;
+    }
+
+    template<typename T>
+    typename std::enable_if<std::is_integral<T>::value, bool>::type
+    get_arr_n(const enum llm_kv kid, T & result, const bool required = true) {
+        return get_arr_n(llm_kv(kid), result, required);
+    }
+
+    template<typename T>
+    bool get_key(const std::string & key, T & result, const bool required = true) {
+        auto it = kv_overrides.find(key);
+
+        const struct llama_model_kv_override * override =
+            it != kv_overrides.end() ? &it->second : nullptr;
+
+        const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
+
+        if (required && !found) {
+            throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+        }
+
+        return found;
+    }
+
+    template<typename T>
+    bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
+        return get_key(llm_kv(kid), result, required);
+    }
+
+    std::string get_arch_name() const {
         return arch_name;
     }

     enum llm_arch get_arch() const {
-
-
-        return llm_arch_from_string(arch_name);
+        return llm_kv.arch;
     }

     const char * get_tensor_name(int i) const {
@@ -1965,10 +2201,13 @@ struct llama_model_loader {
         return tensor;
     }

-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool required = true) {
         struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());

         if (cur == NULL) {
+            if (!required) {
+                return NULL;
+            }
             throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
         }

@@ -2172,11 +2411,8 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
 static void llm_load_hparams(
         llama_model_loader & ml,
         llama_model & model) {
-    struct gguf_context * ctx = ml.ctx_gguf;
-
-    const auto kv = LLM_KV(model.arch);
-
     auto & hparams = model.hparams;
+    const gguf_context * ctx = ml.ctx_gguf;

     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -2190,42 +2426,41 @@ static void llm_load_hparams(
     }

     // get general kv
-
+    ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);

     // get hparams kv
-
-
-
-
-
-
+    ml.get_arr_n(LLM_KV_TOKENIZER_LIST,          hparams.n_vocab);
+    ml.get_key  (LLM_KV_CONTEXT_LENGTH,          hparams.n_ctx_train);
+    ml.get_key  (LLM_KV_EMBEDDING_LENGTH,        hparams.n_embd);
+    ml.get_key  (LLM_KV_FEED_FORWARD_LENGTH,     hparams.n_ff);
+    ml.get_key  (LLM_KV_ATTENTION_HEAD_COUNT,    hparams.n_head);
+    ml.get_key  (LLM_KV_BLOCK_COUNT,             hparams.n_layer);

     // n_head_kv is optional, default to n_head
     hparams.n_head_kv = hparams.n_head;
-
+    ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false);

-
-
-
+    bool rope_finetuned = false;
+    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
+    hparams.rope_finetuned = rope_finetuned;

     hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
-
-        kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN));
+    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);

     // rope_freq_base (optional)
     hparams.rope_freq_base_train = 10000.0f;
-
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);

     std::string rope_scaling("linear");
-
+    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
     hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
     GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);

     // rope_freq_scale (inverse of the kv) is optional
     float ropescale = 0.0f;
-
-
-
+    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
+        // try the old key name
+        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
     }
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;

@@ -2233,7 +2468,7 @@ static void llm_load_hparams(
     {
         hparams.n_rot = hparams.n_embd / hparams.n_head;

-
+        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);

         if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
             if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
@@ -2248,7 +2483,7 @@ static void llm_load_hparams(
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
-
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);

                 switch (hparams.n_layer) {
                     case 26: model.type = e_model::MODEL_3B; break;
@@ -2262,7 +2497,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_FALCON:
             {
-
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
@@ -2272,7 +2507,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_BAICHUAN:
             {
-
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
                     case 40: model.type = e_model::MODEL_13B; break;
@@ -2281,7 +2516,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_STARCODER:
             {
-
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 36: model.type = e_model::MODEL_3B; break;
@@ -2292,7 +2527,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_PERSIMMON:
             {
-
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 switch (hparams.n_layer) {
                     case 36: model.type = e_model::MODEL_8B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
@@ -2300,7 +2535,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_REFACT:
             {
-
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_1B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
@@ -2308,7 +2543,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_BLOOM:
             {
-
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
@@ -2323,9 +2558,9 @@ static void llm_load_hparams(
             {
                 hparams.f_clamp_kqv = 0.0f;

-
-
-
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false);
+                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);

                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
@@ -2335,13 +2570,23 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_STABLELM:
             {
-
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);

                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_3B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_QWEN:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;

         default: (void)0;
     }
@@ -2383,7 +2628,7 @@ static void llm_load_vocab(
     {
         std::string tokenizer_name;

-
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);

         if (tokenizer_name == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
@@ -2473,34 +2718,31 @@ static void llm_load_vocab(
     };
     for (const auto & it : special_token_types) {
         const std::string & key = kv(std::get<0>(it));
-        int32_t & id = std::get<1>(it)
+        int32_t & id = std::get<1>(it);

-
-
-
-
-        if (
-            LLAMA_LOG_WARN("%s: bad special token: '%s' = %
-                __func__, key.c_str(),
-
+        uint32_t new_id;
+        if (!ml.get_key(std::get<0>(it), new_id, false)) {
+            continue;
+        }
+        if (new_id >= vocab.id_to_token.size()) {
+            LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
+                __func__, key.c_str(), new_id, id);
+        } else {
+            id = new_id;
         }

     }

     // Handle add_bos_token and add_eos_token
-
-
-
-
-
-
-
-
-
-        ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
-        vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
-        if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
-            LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
+    {
+        bool temp = true;
+
+        if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
+            vocab.special_add_bos = int(temp);
+        }
+        if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
+            vocab.special_add_eos = int(temp);
+        }
     }
 }

@@ -2733,14 +2975,7 @@ static void llm_load_tensors(
             ggml_backend_type backend_output;

             if (n_gpu_layers > int(n_layer)) {
-
-                // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                backend_norm = llama_backend_offload;
-#else
-                backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                backend_norm = llama_backend_offload;
                 backend_output = llama_backend_offload_split;
             } else {
                 backend_norm = GGML_BACKEND_CPU;
@@ -2777,6 +3012,12 @@ static void llm_load_tensors(
                         layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, backend_split);
                         layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);

+                        // optional bias tensors
+                        layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     backend, false);
+                        layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, backend, false);
+                        layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, backend, false);
+                        layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     backend, false);
+
                         layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);

                         layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
@@ -2785,9 +3026,14 @@ static void llm_load_tensors(

                         if (backend == GGML_BACKEND_GPU) {
                             vram_weights +=
-                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq)
-                                ggml_nbytes(layer.wv)
-
+                                ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                                ggml_nbytes(layer.wv)        + ggml_nbytes(layer.wo) +
+                                (layer.bq ? ggml_nbytes(layer.bq) : 0) +
+                                (layer.bk ? ggml_nbytes(layer.bk) : 0) +
+                                (layer.bv ? ggml_nbytes(layer.bv) : 0) +
+                                (layer.bo ? ggml_nbytes(layer.bo) : 0) +
+                                ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
+                                ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
                         }
                     }
                 } break;
@@ -2799,14 +3045,7 @@ static void llm_load_tensors(
             ggml_backend_type backend_output;

             if (n_gpu_layers > int(n_layer)) {
-
-                // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                backend_norm = llama_backend_offload;
-#else
-                backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                backend_norm = llama_backend_offload;
                 backend_output = llama_backend_offload_split;
             } else {
                 backend_norm = GGML_BACKEND_CPU;
@@ -2869,14 +3108,7 @@ static void llm_load_tensors(
             ggml_backend_type backend_output;

             if (n_gpu_layers > int(n_layer)) {
-
-                // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                backend_norm = llama_backend_offload;
-#else
-                backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                backend_norm = llama_backend_offload;
                 backend_output = llama_backend_offload_split;
             } else {
                 backend_norm = GGML_BACKEND_CPU;
@@ -2946,14 +3178,7 @@ static void llm_load_tensors(
            ggml_backend_type backend_output;

             if (n_gpu_layers > int(n_layer)) {
-
-                // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                backend_norm = llama_backend_offload;
-#else
-                backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                backend_norm = llama_backend_offload;
                 backend_output = llama_backend_offload_split;
             } else {
                 backend_norm = GGML_BACKEND_CPU;
@@ -3023,21 +3248,7 @@ static void llm_load_tensors(
             ggml_backend_type backend_output;

             if (n_gpu_layers > int(n_layer)) {
-
-                if (n_gpu_layers > int(n_layer + 1)) {
-                    LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
-                        __func__, n_layer + 1);
-                    throw std::runtime_error("Persimmon CUDA offload failed");
-                }
-#endif
-                // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
-                // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                backend_norm = llama_backend_offload;
-#else
-                backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                backend_norm = llama_backend_offload;
                 backend_output = llama_backend_offload_split;
             } else {
                 backend_norm = GGML_BACKEND_CPU;
@@ -3096,14 +3307,7 @@ static void llm_load_tensors(
             ggml_backend_type backend_output;

             if (n_gpu_layers > int(n_layer)) {
-
-                // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                backend_norm = llama_backend_offload;
-#else
-                backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                backend_norm = llama_backend_offload;
                 backend_output = llama_backend_offload_split;
             } else {
                 backend_norm = GGML_BACKEND_CPU;
@@ -3174,14 +3378,7 @@ static void llm_load_tensors(
             ggml_backend_type backend_output;

             if (n_gpu_layers > int(n_layer)) {
-
-                // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                backend_norm = llama_backend_offload;
-#else
-                backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                backend_norm = llama_backend_offload;
                 backend_output = llama_backend_offload_split;
             } else {
                 backend_norm = GGML_BACKEND_CPU;
@@ -3241,14 +3438,7 @@ static void llm_load_tensors(
             ggml_backend_type backend_output;

             if (n_gpu_layers > int(n_layer)) {
-
-                // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                backend_norm = llama_backend_offload;
-#else
-                backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                backend_norm = llama_backend_offload;
                 backend_output = llama_backend_offload_split;
             } else {
                 backend_norm = GGML_BACKEND_CPU;
@@ -3305,6 +3495,64 @@ static void llm_load_tensors(
                     }
                 }
             } break;
+        case LLM_ARCH_QWEN:
+            {
+                model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        backend_norm = llama_backend_offload;
+                        backend_output = llama_backend_offload_split;
+                    } else {
+                        backend_norm = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
+                    model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff / 2;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd * 3},         backend);
+                    layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
+                    layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wqkv) + ggml_nbytes(layer.bqkv) +
+                            ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
+                            ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                    }
+                }
+            } break;

         default:
             throw std::runtime_error("unknown architecture");
@@ -3331,8 +3579,8 @@ static void llm_load_tensors(
     }

 #ifdef GGML_USE_CUBLAS
-    const int max_backend_supported_layers = hparams.n_layer +
-    const int max_offloadable_layers = hparams.n_layer +
+    const int max_backend_supported_layers = hparams.n_layer + 1;
+    const int max_offloadable_layers = hparams.n_layer + 1;
 #elif GGML_USE_CLBLAST
     const int max_backend_supported_layers = hparams.n_layer + 1;
     const int max_offloadable_layers = hparams.n_layer + 1;
@@ -3373,7 +3621,7 @@ static void llm_load_tensors(

 static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
     try {
-        llama_model_loader ml(fname, params.use_mmap);
+        llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);

         model.hparams.vocab_only = params.vocab_only;

@@ -3500,11 +3748,11 @@ static void llm_build_k_shift(
         struct ggml_tensor * tmp =
             // we rotate only the first n_rot dimensions
             ggml_rope_custom_inplace(ctx,
-                    ggml_view_3d(ctx, kv.
+                    ggml_view_3d(ctx, kv.k_l[il],
                         n_embd_head, n_head_kv, n_ctx,
-
-
-
+                        ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
+                        ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
+                        0),
                     K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow);
         cb(tmp, "K_shifted", il);
@@ -3531,13 +3779,13 @@ static void llm_build_kv_store(
     //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
     cb(v_cur_t, "v_cur_t", il);

-    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.
-            (
+    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
+            (ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head);
     cb(k_cache_view, "k_cache_view", il);

-    struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.
-            (
-            (
+    struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
+            (  n_ctx)*ggml_element_size(kv.v_l[il]),
+            (kv_head)*ggml_element_size(kv.v_l[il]));
     cb(v_cache_view, "v_cache_view", il);

     // important: storing RoPE-ed version of K in the KV cache!
@@ -3689,11 +3937,11 @@ static struct ggml_tensor * llm_build_kqv(
     cb(q, "q", il);

     struct ggml_tensor * k =
-        ggml_view_3d(ctx, kv.
+        ggml_view_3d(ctx, kv.k_l[il],
                 n_embd_head, n_kv, n_head_kv,
-
-
-
+                ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
+                ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
+                0);
     cb(k, "k", il);

     struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
@@ -3724,11 +3972,11 @@ static struct ggml_tensor * llm_build_kqv(

     // split cached v into n_head heads
     struct ggml_tensor * v =
-        ggml_view_3d(ctx, kv.
+        ggml_view_3d(ctx, kv.v_l[il],
                 n_kv, n_embd_head, n_head_kv,
-                ggml_element_size(kv.
-                ggml_element_size(kv.
-
+                ggml_element_size(kv.v_l[il])*n_ctx,
+                ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head,
+                0);
     cb(v, "v", il);

     struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
@@ -3886,12 +4134,24 @@ struct llm_build_context {
         // compute Q and K and RoPE them
         struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
         cb(Qcur, "Qcur", il);
+        if (model.layers[il].bq) {
+            Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+            cb(Qcur, "Qcur", il);
+        }

         struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
         cb(Kcur, "Kcur", il);
+        if (model.layers[il].bk) {
+            Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+            cb(Kcur, "Kcur", il);
+        }

         struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
         cb(Vcur, "Vcur", il);
+        if (model.layers[il].bv) {
+            Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+            cb(Vcur, "Vcur", il);
+        }

         Qcur = ggml_rope_custom(
             ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
@@ -3910,7 +4170,7 @@ struct llm_build_context {
         llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);

         cur = llm_build_kqv(ctx0, hparams, kv_self,
-                model.layers[il].wo,
+                model.layers[il].wo, model.layers[il].bo,
                 Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
         cb(cur, "kqv_out", il);
     }
@@ -4308,6 +4568,7 @@ struct llm_build_context {
         inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
         cb(inpL, "imp_embd", -1);

+        // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
         cb(inp_pos, "inp_pos", -1);

@@ -4315,6 +4576,7 @@ struct llm_build_context {
         struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
         cb(KQ_scale, "KQ_scale", -1);

+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
         cb(KQ_mask, "KQ_mask", -1);

@@ -4903,6 +5165,121 @@ struct llm_build_context {

         return gf;
     }
+
+    struct ggml_cgraph * build_qwen() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_scale
+        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+        cb(KQ_scale, "KQ_scale", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                // using mode = 2 for neox mode
+                Qcur = ggml_rope_custom(
+                    ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                        model.layers[il].wo, NULL,
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward forward
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up, NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };

 //
@@ -4913,8 +5290,8 @@ struct llm_build_context {
 enum llm_offload_func_e {
     OFFLOAD_FUNC_NOP,
     OFFLOAD_FUNC,
-
-
+    OFFLOAD_FUNC_FRC, // force offload
+    OFFLOAD_FUNC_KQV,
     OFFLOAD_FUNC_NR,
     OFFLOAD_FUNC_EMB,
     OFFLOAD_FUNC_OUT,
@@ -5000,11 +5377,12 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
     //{ "inp_embd",        OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
     { "pos_embd",          OFFLOAD_FUNC_NR },

-    { "inp_pos",
-    { "KQ_scale",
-    { "KQ_mask",
-    { "K_shift",
-
+    { "inp_pos",           OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
+    { "KQ_scale",          OFFLOAD_FUNC_FRC },
+    { "KQ_mask",           OFFLOAD_FUNC_FRC },
+    { "K_shift",           OFFLOAD_FUNC_FRC },
+
+    { "K_shifted",         OFFLOAD_FUNC },

     { "inp_norm",          OFFLOAD_FUNC_NR },
     { "inp_norm_w",        OFFLOAD_FUNC_NR },
@@ -5017,38 +5395,38 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
     { "attn_norm",         OFFLOAD_FUNC },
     { "attn_norm_2",       OFFLOAD_FUNC },

-    { "wqkv",
-    { "bqkv",
-    { "wqkv_clamped",
-
-    { "tmpk",
-    { "tmpq",
-    { "tmpv",
-    { "Kcur",
-    { "Qcur",
-    { "Vcur",
-
-    { "krot",
-    { "qrot",
-    { "kpass",
-    { "qpass",
-    { "krotated",
-    { "qrotated",
-
-    { "q",
-    { "k",
-    { "kq",
-    { "kq_scaled",
-    { "kq_scaled_alibi",
-    { "kq_masked",
-    { "kq_soft_max",
-    { "kq_soft_max_ext",
-    { "v",
-    { "kqv",
-    { "kqv_merged",
-    { "kqv_merged_cont",
-    { "kqv_wo",
-    { "kqv_out",
+    { "wqkv",              OFFLOAD_FUNC_KQV },
+    { "bqkv",              OFFLOAD_FUNC_KQV },
+    { "wqkv_clamped",      OFFLOAD_FUNC_KQV },
+
+    { "tmpk",              OFFLOAD_FUNC_KQV },
+    { "tmpq",              OFFLOAD_FUNC_KQV },
+    { "tmpv",              OFFLOAD_FUNC_KQV },
+    { "Kcur",              OFFLOAD_FUNC_KQV },
+    { "Qcur",              OFFLOAD_FUNC_KQV },
+    { "Vcur",              OFFLOAD_FUNC_KQV },
+
+    { "krot",              OFFLOAD_FUNC_KQV },
+    { "qrot",              OFFLOAD_FUNC_KQV },
+    { "kpass",             OFFLOAD_FUNC_KQV },
+    { "qpass",             OFFLOAD_FUNC_KQV },
+    { "krotated",          OFFLOAD_FUNC_KQV },
+    { "qrotated",          OFFLOAD_FUNC_KQV },
+
+    { "q",                 OFFLOAD_FUNC_KQV },
+    { "k",                 OFFLOAD_FUNC_KQV },
+    { "kq",                OFFLOAD_FUNC_KQV },
+    { "kq_scaled",         OFFLOAD_FUNC_KQV },
+    { "kq_scaled_alibi",   OFFLOAD_FUNC_KQV },
+    { "kq_masked",         OFFLOAD_FUNC_KQV },
+    { "kq_soft_max",       OFFLOAD_FUNC_KQV },
+    { "kq_soft_max_ext",   OFFLOAD_FUNC_KQV },
+    { "v",                 OFFLOAD_FUNC_KQV },
+    { "kqv",               OFFLOAD_FUNC_KQV },
+    { "kqv_merged",        OFFLOAD_FUNC_KQV },
+    { "kqv_merged_cont",   OFFLOAD_FUNC_KQV },
+    { "kqv_wo",            OFFLOAD_FUNC_KQV },
+    { "kqv_out",           OFFLOAD_FUNC_KQV },

     { "ffn_inp",           OFFLOAD_FUNC },
     { "ffn_norm",          OFFLOAD_FUNC },
@@ -5240,15 +5618,15 @@ static struct ggml_cgraph * llama_build_graph(
         { OFFLOAD_FUNC_NOP, "CPU" },
         { OFFLOAD_FUNC_OUT, "CPU" },
 #ifdef GGML_USE_CUBLAS
-        { OFFLOAD_FUNC,     "GPU (CUDA)"
-        {
-        {
-        { OFFLOAD_FUNC_NR,  "GPU (CUDA) NR"
+        { OFFLOAD_FUNC,     "GPU (CUDA)" },
+        { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
+        { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
+        { OFFLOAD_FUNC_NR,  "GPU (CUDA) NR" },
         { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
 #else
         { OFFLOAD_FUNC,     "CPU" },
-        {
-        {
+        { OFFLOAD_FUNC_FRC, "CPU" },
+        { OFFLOAD_FUNC_KQV, "CPU" },
         { OFFLOAD_FUNC_NR,  "CPU" },
         { OFFLOAD_FUNC_EMB, "CPU" },
 #endif // GGML_USE_CUBLAS
@@ -5281,18 +5659,23 @@ static struct ggml_cgraph * llama_build_graph(
                     }
                 }
                 break;
-            case
-                if (
+            case OFFLOAD_FUNC_FRC:
+                if (!lctx.cparams.offload_kqv) {
                     func_e = OFFLOAD_FUNC_NOP;
-                }
-
-
-                if (n_gpu_layers <= n_layer + 1) {
+                } break;
+            case OFFLOAD_FUNC_KQV:
+                if (!lctx.cparams.offload_kqv) {
                     func_e = OFFLOAD_FUNC_NOP;
+                } else {
+                    if (n_gpu_layers < n_layer) {
+                        if (il < i_gpu_start) {
+                            func_e = OFFLOAD_FUNC_NOP;
+                        }
+                    }
                 }
                 break;
-            case
-                if (n_gpu_layers <= n_layer +
+            case OFFLOAD_FUNC_NR:
+                if (n_gpu_layers <= n_layer + 0) {
                     func_e = OFFLOAD_FUNC_NOP;
                 }
                 break;
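The switch above downgrades a tensor's offload category to a no-op when KV offloading is disabled or when the tensor's layer falls below the first GPU-resident layer. A self-contained sketch of that decision, where the names offload_kqv, n_gpu_layers, i_gpu_start, and il follow the diff and everything else is illustrative:

#include <cstdio>

enum offload_func_e { OFFLOAD_FUNC_NOP, OFFLOAD_FUNC_FRC, OFFLOAD_FUNC_KQV };

// decide whether a KQV-class tensor in layer `il` is actually offloaded
static offload_func_e resolve_kqv(bool offload_kqv, int n_gpu_layers, int n_layer, int i_gpu_start, int il) {
    if (!offload_kqv) {
        return OFFLOAD_FUNC_NOP;  // KV offload disabled entirely
    }
    if (n_gpu_layers < n_layer && il < i_gpu_start) {
        return OFFLOAD_FUNC_NOP;  // this layer stays on the CPU
    }
    return OFFLOAD_FUNC_KQV;      // keep the GPU offload
}

int main() {
    // 32-layer model with 24 layers on the GPU, so layers 0..7 stay on the CPU
    const int n_layer = 32, n_gpu_layers = 24, i_gpu_start = n_layer - n_gpu_layers;
    const int layers[] = { 0, 7, 8, 31 };
    for (int il : layers) {
        const bool on_gpu = resolve_kqv(true, n_gpu_layers, n_layer, i_gpu_start, il) == OFFLOAD_FUNC_KQV;
        std::printf("layer %2d -> %s\n", il, on_gpu ? "GPU (KQV)" : "CPU (NOP)");
    }
    return 0;
}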
@@ -5317,8 +5700,8 @@ static struct ggml_cgraph * llama_build_graph(
             case OFFLOAD_FUNC_NOP:
             case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break;
             case OFFLOAD_FUNC:
-            case
-            case
+            case OFFLOAD_FUNC_KQV:
+            case OFFLOAD_FUNC_FRC:
             case OFFLOAD_FUNC_NR:
             case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
             default: GGML_ASSERT(false);
@@ -5377,6 +5760,10 @@ static struct ggml_cgraph * llama_build_graph(
                 {
                     result = llm.build_stablelm();
                 } break;
+            case LLM_ARCH_QWEN:
+                {
+                    result = llm.build_qwen();
+                } break;
             default:
                 GGML_ASSERT(false);
         }
@@ -5499,8 +5886,8 @@ static int llama_decode_internal(
     // a heuristic, to avoid attending the full cache if it is not yet utilized
     // after enough generations, the benefit from this heuristic disappears
    // if we start defragmenting the cache, the benefit from this will be more important
-
-    kv_self.n =
+    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
+    //kv_self.n = llama_kv_cache_cell_max(kv_self);

    //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);

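The new kv_self.n heuristic rounds the highest used cache cell up to a multiple of 32 (via GGML_PAD), keeps at least 32 cells, and never exceeds the context size. A small standalone sketch of that arithmetic, where the pad32 helper stands in for GGML_PAD and the cell counts are made up:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// round x up to the next multiple of n, like GGML_PAD in the diff
static int32_t pad32(int32_t x, int32_t n) { return ((x + n - 1) / n) * n; }

int main() {
    const int32_t n_ctx = 4096;
    const int32_t cell_max_values[] = { 1, 31, 33, 100, 5000 };
    for (int32_t cell_max : cell_max_values) {
        // clamp into [32, n_ctx] after padding to a multiple of 32
        const int32_t n = std::min(n_ctx, std::max((int32_t) 32, pad32(cell_max, 32)));
        std::printf("cell_max = %4d -> kv_self.n = %4d\n", cell_max, n);
    }
    return 0;
}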
@@ -5551,7 +5938,7 @@ static int llama_decode_internal(
         n_threads = std::min(4, n_threads);
     }

-    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer +
+    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
     if (ggml_cpu_has_cublas() && fully_offloaded) {
         n_threads = 1;
     }
@@ -6410,14 +6797,13 @@ struct llama_grammar_candidate {
 // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
 static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
-        const
-        size_t n_src,
+        const std::string & src,
         llama_partial_utf8 partial_start) {
     static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
-    const char * pos = src;
+    const char * pos = src.c_str();
     std::vector<uint32_t> code_points;
     // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
-    code_points.reserve(
+    code_points.reserve(src.size() + 1);
     uint32_t value = partial_start.value;
     int n_remain = partial_start.n_remain;

@@ -6468,13 +6854,6 @@ static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
     return std::make_pair(std::move(code_points), llama_partial_utf8{ value, n_remain });
 }

-static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
-        std::string src,
-        llama_partial_utf8 partial_start
-) {
-    return decode_utf8(src.c_str(), src.size(), partial_start);
-}
-
 // returns true iff pos points to the end of one of the definitions of a rule
 static bool llama_grammar_is_end_of_sequence(const llama_grammar_element * pos) {
     switch (pos->type) {
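decode_utf8 above derives the byte length of each UTF-8 sequence from the high nibble of its first byte, using the same 16-entry lookup table shown in the diff. A standalone sketch of that table in use, with an arbitrary input string:

#include <cstdint>
#include <cstdio>
#include <string>

int main() {
    // high nibble -> sequence length; 0 marks a continuation byte (0b10xxxxxx)
    static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };

    const std::string src = "a\xC3\xA9\xE2\x82\xAC"; // 'a', then 2-byte and 3-byte sequences
    for (size_t i = 0; i < src.size(); ) {
        const uint8_t first = (uint8_t) src[i];
        const int len = lookup[first >> 4];
        std::printf("offset %zu: first byte 0x%02X -> %d-byte sequence\n", i, first, len);
        i += len > 0 ? (size_t) len : 1; // skip stray continuation bytes defensively
    }
    return 0;
}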
@@ -7113,11 +7492,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
     const llama_token eos = llama_token_eos(&ctx->model);

     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
+    candidates_decoded.reserve(candidates->size);
     std::vector<llama_grammar_candidate> candidates_grammar;
+    candidates_grammar.reserve(candidates->size);

     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id = candidates->data[i].id;
-        const std::string piece =
+        const std::string & piece = ctx->model.vocab.id_to_token[id].text;
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
@@ -7329,7 +7710,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }

-    const std::string piece =
+    const std::string & piece = ctx->model.vocab.id_to_token[token].text;

     // Note terminating 0 in decoded string
     const auto decoded = decode_utf8(piece, grammar->partial_utf8);
@@ -7650,18 +8031,21 @@ static void llama_convert_tensor_internal(
         return;
     }

-
-
+    size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
+    size_t block_size_bytes = ggml_type_size(tensor->type);

     GGML_ASSERT(nelements % block_size == 0);
-
-
-
+    size_t nblocks = nelements / block_size;
+    size_t blocks_per_thread = nblocks / nthread;
+    size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+    size_t in_buff_offs = 0;
+    size_t out_buff_offs = 0;

-    for (
-
-
-
+    for (int tnum = 0; tnum < nthread; tnum++) {
+        size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+        size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
+        size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread

         auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
             if (typ == GGML_TYPE_F16) {
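The reworked llama_convert_tensor_internal splits a tensor's blocks evenly across worker threads and folds any remainder into the last thread. A minimal sketch of that partitioning, with arbitrary block and thread counts:

#include <cstdio>

int main() {
    const size_t nblocks = 103; // deliberately not divisible by the thread count
    const int    nthread = 4;

    const size_t blocks_per_thread = nblocks / nthread;
    const size_t spare_blocks      = nblocks - blocks_per_thread * nthread;

    size_t offs = 0;
    for (int tnum = 0; tnum < nthread; tnum++) {
        // the last thread picks up the spare blocks
        const size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0);
        std::printf("thread %d: blocks [%zu, %zu)\n", tnum, offs, offs + thr_blocks);
        offs += thr_blocks;
    }
    return 0;
}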
@@ -7831,7 +8215,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     constexpr bool use_mmap = false;
 #endif

-    llama_model_loader ml(fname_inp, use_mmap);
+    llama_model_loader ml(fname_inp, use_mmap, NULL);
     if (ml.use_mmap) {
         ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
     }
@@ -8127,7 +8511,7 @@ static int llama_apply_lora_from_file_internal(
     std::vector<uint8_t> base_buf;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL));

         size_t ctx_size;
         size_t mmapped_size;
@@ -8355,6 +8739,7 @@ struct llama_model_params llama_model_default_params() {
         /*.tensor_split =*/ nullptr,
         /*.progress_callback =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
+        /*.kv_overrides =*/ nullptr,
         /*.vocab_only =*/ false,
         /*.use_mmap =*/ true,
         /*.use_mlock =*/ false,
@@ -8382,10 +8767,12 @@ struct llama_context_params llama_context_default_params() {
         /*.yarn_beta_fast =*/ 32.0f,
         /*.yarn_beta_slow =*/ 1.0f,
         /*.yarn_orig_ctx =*/ 0,
+        /*.type_k =*/ GGML_TYPE_F16,
+        /*.type_v =*/ GGML_TYPE_F16,
         /*.mul_mat_q =*/ true,
-        /*.f16_kv =*/ true,
         /*.logits_all =*/ false,
         /*.embedding =*/ false,
+        /*.offload_kqv =*/ true,
     };

     return result;
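With this change the context defaults expose separate K and V cache types plus an offload_kqv flag in place of the old f16_kv boolean. A hedged sketch of how a caller might override them, assuming the field names shown in this diff and omitting model loading and error handling:

#include "llama.h"

int main() {
    llama_context_params cparams = llama_context_default_params();

    cparams.n_ctx       = 4096;
    cparams.type_k      = GGML_TYPE_Q8_0;  // quantized K cache
    cparams.type_v      = GGML_TYPE_F16;   // keep V in f16
    cparams.offload_kqv = true;            // keep the KV cache and KQV ops on the GPU

    // ... load a model and pass cparams to llama_new_context_with_model(model, cparams) ...
    return 0;
}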
@@ -8502,6 +8889,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.yarn_beta_fast = params.yarn_beta_fast;
     cparams.yarn_beta_slow = params.yarn_beta_slow;
     cparams.mul_mat_q = params.mul_mat_q;
+    cparams.offload_kqv = params.offload_kqv;

     cparams.n_ctx = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
     cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -8535,19 +8923,36 @@ struct llama_context * llama_new_context_with_model(
     ctx->rng = std::mt19937(params.seed);
     ctx->logits_all = params.logits_all;

-    ggml_type
+    const ggml_type type_k = params.type_k;
+    const ggml_type type_v = params.type_v;
+
+    GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0);
+    GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0);

     // reserve memory for context buffers
     if (!hparams.vocab_only) {
-        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self,
+        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
         }

         {
-
-
+            size_t memory_size_k = 0;
+            size_t memory_size_v = 0;
+
+            for (auto & k : ctx->kv_self.k_l) {
+                memory_size_k += ggml_nbytes(k);
+            }
+
+            for (auto & v : ctx->kv_self.v_l) {
+                memory_size_v += ggml_nbytes(v);
+            }
+
+            LLAMA_LOG_INFO("%s: KV self size = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+                (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+                ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
+                ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
         }

         // resized during inference
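The new logging block sums the per-layer K and V tensor sizes and reports them in MiB. A rough standalone sketch of that accounting for an f16 cache; the model dimensions below are illustrative, and real sizes depend on the cache types chosen:

#include <cstdio>

int main() {
    // illustrative model/context dimensions
    const size_t n_layer = 32, n_embd_gqa = 4096, n_ctx = 4096;
    const size_t bytes_per_elem = 2; // f16

    size_t memory_size_k = 0, memory_size_v = 0;
    for (size_t il = 0; il < n_layer; ++il) {
        memory_size_k += n_embd_gqa * n_ctx * bytes_per_elem; // one K tensor per layer
        memory_size_v += n_embd_gqa * n_ctx * bytes_per_elem; // one V tensor per layer
    }

    std::printf("KV self size = %7.2f MiB (K: %7.2f MiB, V: %7.2f MiB)\n",
                (double) (memory_size_k + memory_size_v) / (1024.0 * 1024.0),
                (double) memory_size_k / (1024.0 * 1024.0),
                (double) memory_size_v / (1024.0 * 1024.0));
    return 0;
}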
@@ -8618,8 +9023,12 @@ struct llama_context * llama_new_context_with_model(
         }

         size_t kv_vram_size = 0;
-
-
+        for (auto & k : ctx->kv_self.k_l) {
+            add_tensor(k, kv_vram_size);
+        }
+        for (auto & v : ctx->kv_self.v_l) {
+            add_tensor(v, kv_vram_size);
+        }

         size_t ctx_vram_size = alloc_size + kv_vram_size;
         size_t total_vram_size = model_vram_size + ctx_vram_size;
@@ -9089,37 +9498,45 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     data_ctx->write(&kv_used, sizeof(kv_used));

     if (kv_buf_size) {
-        const size_t elt_size = ggml_element_size(kv_self.
+        const size_t elt_size = ggml_element_size(kv_self.k_l[0]);

-        ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+        ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
         ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

-
-        std::vector<uint8_t
-
+        std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
+        std::vector<std::vector<uint8_t>> vout2d_data(n_layer);
+
+        for (int il = 0; il < (int) n_layer; ++il) {
+            ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+            kout2d_data[il].resize(ggml_nbytes(kout2d));
+            kout2d->data = kout2d_data[il].data();

-
-
-
+            ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+            vout2d_data[il].resize(ggml_nbytes(vout2d));
+            vout2d->data = vout2d_data[il].data();

-
-
-
+            ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
+                n_embd, kv_head,
+                elt_size*n_embd, 0);

-
-
-
+            ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
+                kv_head, n_embd,
+                elt_size*n_ctx, 0);
+
+            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
+            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
+        }

-        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
-        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
         ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);

         ggml_free(cpy_ctx);

-        // our data is now in the
+        // our data is now in the kout2d_data and vout2d_data buffers
         // write them to file
-
-
+        for (uint32_t il = 0; il < n_layer; ++il) {
+            data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
+            data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
+        }
     }

     for (uint32_t i = 0; i < kv_size; ++i) {
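The save path above copies the used portion of each layer's K and V tensors into temporary buffers and writes them out layer by layer; the restore hunk below reads them back in the same order. A simplified sketch of that round-trip layout using plain byte buffers instead of a ggml copy graph, with illustrative sizes:

#include <cstdint>
#include <cstring>
#include <vector>

int main() {
    const size_t n_layer = 4, k_bytes = 256, v_bytes = 256;

    // stand-in per-layer K/V contents
    std::vector<std::vector<uint8_t>> k_l(n_layer, std::vector<uint8_t>(k_bytes, 0xAA));
    std::vector<std::vector<uint8_t>> v_l(n_layer, std::vector<uint8_t>(v_bytes, 0xBB));

    // save: append K then V for each layer, in layer order
    std::vector<uint8_t> state;
    for (size_t il = 0; il < n_layer; ++il) {
        state.insert(state.end(), k_l[il].begin(), k_l[il].end());
        state.insert(state.end(), v_l[il].begin(), v_l[il].end());
    }

    // restore: walk the flat buffer in the same order
    const uint8_t * inp = state.data();
    for (size_t il = 0; il < n_layer; ++il) {
        std::memcpy(k_l[il].data(), inp, k_bytes); inp += k_bytes;
        std::memcpy(v_l[il].data(), inp, v_bytes); inp += v_bytes;
    }
    return 0;
}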
@@ -9219,29 +9636,32 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     if (kv_buf_size) {
         GGML_ASSERT(kv_self.buf.size == kv_buf_size);

-        const size_t elt_size = ggml_element_size(kv_self.
+        const size_t elt_size = ggml_element_size(kv_self.k_l[0]);

-        ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+        ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
         ggml_cgraph * gf = ggml_new_graph(cpy_ctx);

-
-
-
+        for (int il = 0; il < n_layer; ++il) {
+            ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+            kin2d->data = (void *) inp;
+            inp += ggml_nbytes(kin2d);

-
-
-
+            ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+            vin2d->data = (void *) inp;
+            inp += ggml_nbytes(vin2d);

-
-
-
+            ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
+                n_embd, kv_head,
+                elt_size*n_embd, 0);

-
-
-
+            ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
+                kv_head, n_embd,
+                elt_size*n_ctx, 0);
+
+            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
+            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
+        }

-        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
-        ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
         ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);

         ggml_free(cpy_ctx);