llama_cpp 0.9.4 → 0.10.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +121 -15
- data/ext/llama_cpp/src/ggml-alloc.c +43 -8
- data/ext/llama_cpp/src/ggml-alloc.h +7 -0
- data/ext/llama_cpp/src/ggml-backend-impl.h +46 -21
- data/ext/llama_cpp/src/ggml-backend.c +563 -156
- data/ext/llama_cpp/src/ggml-backend.h +62 -17
- data/ext/llama_cpp/src/ggml-cuda.cu +1270 -434
- data/ext/llama_cpp/src/ggml-cuda.h +9 -1
- data/ext/llama_cpp/src/ggml-impl.h +1 -1
- data/ext/llama_cpp/src/ggml-metal.h +6 -0
- data/ext/llama_cpp/src/ggml-metal.m +535 -175
- data/ext/llama_cpp/src/ggml-metal.metal +888 -237
- data/ext/llama_cpp/src/ggml-opencl.cpp +5 -7
- data/ext/llama_cpp/src/ggml.c +393 -127
- data/ext/llama_cpp/src/ggml.h +59 -7
- data/ext/llama_cpp/src/llama.cpp +791 -357
- data/ext/llama_cpp/src/llama.h +29 -6
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +20 -2
- metadata +3 -3
data/ext/llama_cpp/src/llama.cpp
CHANGED
@@ -46,7 +46,6 @@
 #endif
 #include <windows.h>
 #include <io.h>
-#include <stdio.h> // for _fseeki64
 #endif
 
 #include <algorithm>
@@ -75,6 +74,7 @@
 #include <set>
 #include <sstream>
 #include <thread>
+#include <type_traits>
 #include <unordered_map>
 
 #if defined(_MSC_VER)
@@ -193,6 +193,7 @@ enum llm_arch {
     LLM_ARCH_REFACT,
     LLM_ARCH_BLOOM,
     LLM_ARCH_STABLELM,
+    LLM_ARCH_QWEN,
     LLM_ARCH_UNKNOWN,
 };
 
@@ -209,6 +210,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
     { LLM_ARCH_REFACT,   "refact"   },
     { LLM_ARCH_BLOOM,    "bloom"    },
     { LLM_ARCH_STABLELM, "stablelm" },
+    { LLM_ARCH_QWEN,     "qwen"     },
 };
 
 enum llm_kv {
@@ -519,6 +521,22 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_QWEN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,  "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT,      "output" },
+            { LLM_TENSOR_ROPE_FREQS,  "rope_freqs" },
+            { LLM_TENSOR_ATTN_NORM,   "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,    "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,    "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,    "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_GATE,    "blk.%d.ffn_gate" },
+            { LLM_TENSOR_FFN_DOWN,    "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,      "blk.%d.ffn_up" },
+        },
+    },
 
     {
         LLM_ARCH_UNKNOWN,
@@ -573,21 +591,6 @@ struct LLM_TN {
 // gguf helpers
 //
 
-#define GGUF_GET_KEY(ctx, dst, func, type, req, key) \
-do { \
-    const std::string skey(key); \
-    const int kid = gguf_find_key(ctx, skey.c_str()); \
-    if (kid >= 0) { \
-        enum gguf_type ktype = gguf_get_kv_type(ctx, kid); \
-        if (ktype != (type)) { \
-            throw std::runtime_error(format("key %s has wrong type: %s", skey.c_str(), gguf_type_name(ktype))); \
-        } \
-        (dst) = func(ctx, kid); \
-    } else if (req) { \
-        throw std::runtime_error(format("key not found in model: %s", skey.c_str())); \
-    } \
-} while (0)
-
 static std::map<int8_t, std::string> LLAMA_ROPE_SCALING_TYPES = {
     { LLAMA_ROPE_SCALING_NONE,   "none"   },
     { LLAMA_ROPE_SCALING_LINEAR, "linear" },
@@ -621,7 +624,7 @@ static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
     }
 }
 
-static std::string gguf_kv_to_str(struct gguf_context * ctx_gguf, int i) {
+static std::string gguf_kv_to_str(const struct gguf_context * ctx_gguf, int i) {
     const enum gguf_type type = gguf_get_kv_type(ctx_gguf, i);
 
     switch (type) {
@@ -1113,6 +1116,12 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
 //
 
 struct llama_state {
+    llama_state() {
+#ifdef GGML_USE_METAL
+        ggml_metal_log_set_callback(log_callback, log_callback_user_data);
+#endif
+    }
+
     // We save the log callback globally
     ggml_log_callback log_callback = llama_log_callback_default;
     void * log_callback_user_data = nullptr;
@@ -1217,6 +1226,7 @@ struct llama_cparams {
     float yarn_beta_slow;
 
     bool mul_mat_q;
+    bool offload_kqv;
 };
 
 struct llama_layer {
@@ -1238,6 +1248,9 @@ struct llama_layer {
     struct ggml_tensor * wqkv;
 
     // attention bias
+    struct ggml_tensor * bq;
+    struct ggml_tensor * bk;
+    struct ggml_tensor * bv;
     struct ggml_tensor * bo;
     struct ggml_tensor * bqkv;
 
|
|
1282
1295
|
|
1283
1296
|
std::vector<llama_kv_cell> cells;
|
1284
1297
|
|
1285
|
-
struct ggml_tensor
|
1286
|
-
struct ggml_tensor
|
1298
|
+
std::vector<struct ggml_tensor *> k_l; // per layer
|
1299
|
+
std::vector<struct ggml_tensor *> v_l;
|
1287
1300
|
|
1288
1301
|
struct ggml_context * ctx = NULL;
|
1289
1302
|
|
@@ -1296,8 +1309,10 @@ struct llama_kv_cache {
 
 #ifdef GGML_USE_CUBLAS
         if (ggml_cublas_loaded()) {
-            ggml_cuda_free_data(k);
-            ggml_cuda_free_data(v);
+            for (size_t i = 0; i < k_l.size(); ++i) {
+                ggml_cuda_free_data(k_l[i]);
+                ggml_cuda_free_data(v_l[i]);
+            }
         }
 #endif
     }
@@ -1487,9 +1502,11 @@ struct llama_context {
 static bool llama_kv_cache_init(
         const struct llama_hparams & hparams,
              struct llama_kv_cache & cache,
-                         ggml_type   wtype,
+                         ggml_type   ktype,
+                         ggml_type   vtype,
                           uint32_t   n_ctx,
-                               int   n_gpu_layers) {
+                               int   n_gpu_layers,
+                              bool   offload) {
     const uint32_t n_embd  = hparams.n_embd_gqa();
     const uint32_t n_layer = hparams.n_layer;
 
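Splitting the former single cache type into ktype and vtype lets the K and V caches use different element types. A minimal sketch of the resulting size arithmetic (a hypothetical helper, not code from this diff):

    #include <cstddef>
    #include <cstdint>

    // Hypothetical helper: bytes for one layer's K and V cache when the two
    // caches may use different per-element sizes (cf. ggml_type_sizef below).
    static size_t kv_layer_bytes(uint32_t n_embd_gqa, uint32_t n_ctx,
                                 double k_elem_bytes, double v_elem_bytes) {
        const size_t n_elements = (size_t) n_embd_gqa * n_ctx; // per tensor
        return (size_t) (n_elements * (k_elem_bytes + v_elem_bytes));
    }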
@@ -1505,7 +1522,7 @@ static bool llama_kv_cache_init(
     cache.cells.clear();
     cache.cells.resize(n_ctx);
 
-    cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*ggml_tensor_overhead());
+    cache.buf.resize(n_elements*(ggml_type_sizef(ktype) + ggml_type_sizef(vtype)) + 2u*n_layer*ggml_tensor_overhead());
     memset(cache.buf.data, 0, cache.buf.size);
 
     struct ggml_init_params params;
@@ -1515,37 +1532,44 @@ static bool llama_kv_cache_init(
 
     cache.ctx = ggml_init(params);
 
+    size_t vram_kv_cache = 0;
+
     if (!cache.ctx) {
         LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
         return false;
     }
 
-    cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-    cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
-    ggml_set_name(cache.k, "cache_k");
-    ggml_set_name(cache.v, "cache_v");
+    cache.k_l.reserve(n_layer);
+    cache.v_l.reserve(n_layer);
 
-    (void) n_gpu_layers;
+    const int i_gpu_start = (int) n_layer - n_gpu_layers; GGML_UNUSED(i_gpu_start);
 
-#ifdef GGML_USE_CUBLAS
-    if (ggml_cublas_loaded()) {
-        size_t vram_kv_cache = 0;
+    GGML_UNUSED(offload);
 
-        if (n_gpu_layers > (int) n_layer + 1) {
-            ggml_cuda_assign_buffers_no_scratch(cache.v);
-            LLAMA_LOG_INFO("%s: offloading v cache to GPU\n", __func__);
-            vram_kv_cache += ggml_nbytes(cache.v);
-        }
-        if (n_gpu_layers > (int) n_layer + 2) {
-            ggml_cuda_assign_buffers_no_scratch(cache.k);
-            LLAMA_LOG_INFO("%s: offloading k cache to GPU\n", __func__);
-            vram_kv_cache += ggml_nbytes(cache.k);
-        }
-        if (vram_kv_cache > 0) {
-            LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
-        }
+    for (int i = 0; i < (int) n_layer; i++) {
+        ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd*n_ctx);
+        ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd*n_ctx);
+        ggml_format_name(k, "cache_k_l%d", i);
+        ggml_format_name(v, "cache_v_l%d", i);
+        cache.k_l.push_back(k);
+        cache.v_l.push_back(v);
+#ifdef GGML_USE_CUBLAS
+        if (i >= i_gpu_start) {
+            if (offload) {
+                ggml_cuda_assign_buffers_no_scratch(k);
+                vram_kv_cache += ggml_nbytes(k);
+                ggml_cuda_assign_buffers_no_scratch(v);
+                vram_kv_cache += ggml_nbytes(v);
+            }
+        }
+#endif // GGML_USE_CUBLAS
     }
-#endif
+
+    if (vram_kv_cache > 0) {
+        LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+    }
+
+    GGML_UNUSED(n_gpu_layers);
 
     return true;
 }
@@ -1766,6 +1790,169 @@ static std::string llama_format_tensor_shape(const struct ggml_tensor * t) {
     return buf;
 }
 
+namespace GGUFMeta {
+    template <typename T, gguf_type gt_, T (*gfun)(const gguf_context *, const int)>
+    struct GKV_Base_Type {
+        static constexpr gguf_type gt = gt_;
+
+        static T getter(const gguf_context * ctx, const int kid) {
+            return gfun(ctx, kid);
+        }
+    };
+
+    template<typename T> struct GKV_Base;
+
+    template<> struct GKV_Base<bool        >: GKV_Base_Type<bool,         GGUF_TYPE_BOOL,    gguf_get_val_bool> {};
+    template<> struct GKV_Base<uint8_t     >: GKV_Base_Type<uint8_t,      GGUF_TYPE_UINT8,   gguf_get_val_u8  > {};
+    template<> struct GKV_Base<uint16_t    >: GKV_Base_Type<uint16_t,     GGUF_TYPE_UINT16,  gguf_get_val_u16 > {};
+    template<> struct GKV_Base<uint32_t    >: GKV_Base_Type<uint32_t,     GGUF_TYPE_UINT32,  gguf_get_val_u32 > {};
+    template<> struct GKV_Base<uint64_t    >: GKV_Base_Type<uint64_t,     GGUF_TYPE_UINT64,  gguf_get_val_u64 > {};
+    template<> struct GKV_Base<int8_t      >: GKV_Base_Type<int8_t,       GGUF_TYPE_INT8,    gguf_get_val_i8  > {};
+    template<> struct GKV_Base<int16_t     >: GKV_Base_Type<int16_t,      GGUF_TYPE_INT16,   gguf_get_val_i16 > {};
+    template<> struct GKV_Base<int32_t     >: GKV_Base_Type<int32_t,      GGUF_TYPE_INT32,   gguf_get_val_i32 > {};
+    template<> struct GKV_Base<int64_t     >: GKV_Base_Type<int64_t,      GGUF_TYPE_INT64,   gguf_get_val_i64 > {};
+    template<> struct GKV_Base<float       >: GKV_Base_Type<float,        GGUF_TYPE_FLOAT32, gguf_get_val_f32 > {};
+    template<> struct GKV_Base<double      >: GKV_Base_Type<double,       GGUF_TYPE_FLOAT64, gguf_get_val_f64 > {};
+    template<> struct GKV_Base<const char *>: GKV_Base_Type<const char *, GGUF_TYPE_STRING,  gguf_get_val_str > {};
+
+    template<> struct GKV_Base<std::string> {
+        static constexpr gguf_type gt = GGUF_TYPE_STRING;
+
+        static std::string getter(const gguf_context * ctx, const int kid) {
+            return gguf_get_val_str(ctx, kid);
+        }
+    };
+
+    struct ArrayInfo{
+        const gguf_type gt;
+        const size_t length;
+        const void * data;
+    };
+
+    template<> struct GKV_Base<ArrayInfo> {
+        public:
+        static constexpr gguf_type gt = GGUF_TYPE_ARRAY;
+        static ArrayInfo getter(const gguf_context *ctx, const int k) {
+            return ArrayInfo {
+                gguf_get_arr_type(ctx, k),
+                size_t(gguf_get_arr_n(ctx, k)),
+                gguf_get_arr_data(ctx, k),
+            };
+        }
+    };
+
+    template<typename T>
+    class GKV: public GKV_Base<T> {
+        GKV() = delete;
+
+        public:
+        static T get_kv(const gguf_context * ctx, const int k) {
+            const enum gguf_type kt = gguf_get_kv_type(ctx, k);
+
+            if (kt != GKV::gt) {
+                throw std::runtime_error(format("key %s has wrong type %s but expected type %s",
+                    gguf_get_key(ctx, k), gguf_type_name(kt), gguf_type_name(GKV::gt)));
+            }
+            return GKV::getter(ctx, k);
+        }
+
+        static const char * override_type_to_str(const llama_model_kv_override_type ty) {
+            switch (ty) {
+                case LLAMA_KV_OVERRIDE_BOOL:  return "bool";
+                case LLAMA_KV_OVERRIDE_INT:   return "int";
+                case LLAMA_KV_OVERRIDE_FLOAT: return "float";
+            }
+            return "unknown";
+        }
+
+        static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *override) {
+            if (!override) { return false; }
+            if (override->tag == expected_type) {
+                LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
+                    __func__, override_type_to_str(override->tag), override->key);
+                switch (override->tag) {
+                    case LLAMA_KV_OVERRIDE_BOOL:  {
+                        printf("%s\n", override->bool_value ? "true" : "false");
+                    } break;
+                    case LLAMA_KV_OVERRIDE_INT:   {
+                        printf("%" PRId64 "\n", override->int_value);
+                    } break;
+                    case LLAMA_KV_OVERRIDE_FLOAT: {
+                        printf("%.6f\n", override->float_value);
+                    } break;
+                    default:
+                        // Shouldn't be possible to end up here, but just in case...
+                        throw std::runtime_error(
+                            format("Unsupported attempt to override %s type for metadata key %s\n",
+                                override_type_to_str(override->tag), override->key));
+                }
+                return true;
+            }
+            LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
+                __func__, override->key, override_type_to_str(expected_type), override_type_to_str(override->tag));
+            return false;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
+        try_override(OT & target, const struct llama_model_kv_override *override) {
+            if (validate_override(LLAMA_KV_OVERRIDE_BOOL, override)) {
+                target = override->bool_value;
+                return true;
+            }
+            return true;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
+        try_override(OT & target, const struct llama_model_kv_override *override) {
+            if (validate_override(LLAMA_KV_OVERRIDE_INT, override)) {
+                target = override->int_value;
+                return true;
+            }
+            return false;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
+        try_override(T & target, const struct llama_model_kv_override *override) {
+            if (validate_override(LLAMA_KV_OVERRIDE_FLOAT, override)) {
+                target = override->float_value;
+                return true;
+            }
+            return false;
+        }
+
+        template<typename OT>
+        static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
+        try_override(T & target, const struct llama_model_kv_override *override) {
+            (void)target;
+            (void)override;
+            if (!override) { return false; }
+            // Currently, we should never end up here so it would be a bug if we do.
+            throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
+                override ? override->key : "NULL"));
+        }
+
+        static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *override = nullptr) {
+            if (try_override<T>(target, override)) {
+                return true;
+            }
+            if (k < 0) { return false; }
+            target = get_kv(ctx, k);
+            return true;
+        }
+
+        static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *override = nullptr) {
+            return set(ctx, gguf_find_key(ctx, key), target, override);
+        }
+
+        static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *override = nullptr) {
+            return set(ctx, key.c_str(), target, override);
+        }
+    };
+}
+
 struct llama_model_loader {
     int n_kv      = 0;
     int n_tensors = 0;
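GKV_Base binds each C++ type to its GGUF getter at compile time, and GKV<T>::set() consults a command-line override before falling back to the file's key-value store. The dispatch pattern in miniature (illustrative names, not llama.cpp API):

    #include <cstdint>
    #include <string>

    // Miniature of the GKV_Base specialization pattern: mapped types pick the
    // right parser automatically; unmapped types fail to compile.
    template <typename T> struct GetterFor;

    template <> struct GetterFor<uint32_t> {
        static uint32_t get(const std::string & raw) { return (uint32_t) std::stoul(raw); }
    };
    template <> struct GetterFor<float> {
        static float get(const std::string & raw) { return std::stof(raw); }
    };

    template <typename T>
    T typed_get(const std::string & raw) { return GetterFor<T>::get(raw); }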
@@ -1781,21 +1968,34 @@ struct llama_model_loader {
     llama_fver fver;
 
     std::unique_ptr<llama_mmap> mapping;
+    std::unordered_map<std::string, struct llama_model_kv_override> kv_overrides;
 
     struct gguf_context * ctx_gguf = NULL;
     struct ggml_context * ctx_meta = NULL;
 
-    llama_model_loader(const std::string & fname, bool use_mmap) : file(fname.c_str(), "rb") {
+    std::string arch_name;
+    LLM_KV      llm_kv    = LLM_KV(LLM_ARCH_UNKNOWN);
+
+    llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
         struct gguf_init_params params = {
             /*.no_alloc = */ true,
             /*.ctx      = */ &ctx_meta,
         };
 
+        if (param_overrides_p != nullptr) {
+            for (const struct llama_model_kv_override *p = param_overrides_p; p->key[0] != 0; p++) {
+                kv_overrides.insert({std::string(p->key), *p});
+            }
+        }
+
         ctx_gguf = gguf_init_from_file(fname.c_str(), params);
         if (!ctx_gguf) {
             throw std::runtime_error(format("%s: failed to load model from %s\n", __func__, fname.c_str()));
         }
 
+        get_key(llm_kv(LLM_KV_GENERAL_ARCHITECTURE), arch_name, false);
+        llm_kv = LLM_KV(llm_arch_from_string(arch_name));
+
         n_kv      = gguf_get_n_kv(ctx_gguf);
         n_tensors = gguf_get_n_tensors(ctx_gguf);
 
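The constructor walks a sentinel-terminated array: scanning stops at the first entry whose key begins with a NUL byte. A hedged sketch of how a caller might fill such an array, using only the llama_model_kv_override members visible in this diff (key, tag, int_value); the exact struct layout lives in llama.h:

    #include <cstring>

    // Sketch only: a two-slot override list ending in the NUL-key sentinel
    // that the loop above (p->key[0] != 0) checks for.
    static void fill_example_overrides(struct llama_model_kv_override * kvo /* size >= 2 */) {
        std::strncpy(kvo[0].key, "some.integer.key", sizeof(kvo[0].key) - 1);
        kvo[0].tag       = LLAMA_KV_OVERRIDE_INT;
        kvo[0].int_value = 42;

        kvo[1].key[0] = 0; // terminator
    }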
@@ -1863,6 +2063,7 @@ struct llama_model_loader {
             }
         }
 
+        LLAMA_LOG_INFO("%s: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n", __func__);
         for (int i = 0; i < n_kv; i++) {
             const char * name           = gguf_get_key(ctx_gguf, i);
             const enum gguf_type type   = gguf_get_kv_type(ctx_gguf, i);
@@ -1908,19 +2109,59 @@ struct llama_model_loader {
         }
     }
 
-    std::string get_arch_name() const {
-        const auto kv = LLM_KV(LLM_ARCH_UNKNOWN);
+    template<typename T>
+    typename std::enable_if<std::is_integral<T>::value, bool>::type
+    get_arr_n(const std::string & key, T & result, const bool required = true) {
+        const int kid = gguf_find_key(ctx_gguf, key.c_str());
+
+        if (kid < 0) {
+            if (required) {
+                throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+            }
+            return false;
+        }
 
-        std::string arch_name;
-        GGUF_GET_KEY(ctx_gguf, arch_name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_ARCHITECTURE));
+        struct GGUFMeta::ArrayInfo arr_info =
+            GGUFMeta::GKV<GGUFMeta::ArrayInfo>::get_kv(ctx_gguf, kid);
 
+        result = arr_info.length;
+        return true;
+    }
+
+    template<typename T>
+    typename std::enable_if<std::is_integral<T>::value, bool>::type
+    get_arr_n(const enum llm_kv kid, T & result, const bool required = true) {
+        return get_arr_n(llm_kv(kid), result, required);
+    }
+
+    template<typename T>
+    bool get_key(const std::string & key, T & result, const bool required = true) {
+        auto it = kv_overrides.find(key);
+
+        const struct llama_model_kv_override * override =
+            it != kv_overrides.end() ? &it->second : nullptr;
+
+        const bool found = GGUFMeta::GKV<T>::set(ctx_gguf, key, result, override);
+
+        if (required && !found) {
+            throw std::runtime_error(format("key not found in model: %s", key.c_str()));
+        }
+
+        return found;
+    }
+
+    template<typename T>
+    bool get_key(const enum llm_kv kid, T & result, const bool required = true) {
+        return get_key(llm_kv(kid), result, required);
+    }
+
+    std::string get_arch_name() const {
         return arch_name;
     }
 
     enum llm_arch get_arch() const {
-        const std::string arch_name = get_arch_name();
-
-        return llm_arch_from_string(arch_name);
+        return llm_kv.arch;
     }
 
     const char * get_tensor_name(int i) const {
@@ -1960,10 +2201,13 @@ struct llama_model_loader {
         return tensor;
     }
 
-    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend) {
+    struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool required = true) {
         struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
 
         if (cur == NULL) {
+            if (!required) {
+                return NULL;
+            }
             throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
         }
 
@@ -2167,11 +2411,8 @@ static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
 static void llm_load_hparams(
         llama_model_loader & ml,
         llama_model & model) {
-    struct gguf_context * ctx = ml.ctx_gguf;
-
-    const auto kv = LLM_KV(model.arch);
-
     auto & hparams = model.hparams;
+    const gguf_context * ctx = ml.ctx_gguf;
 
     // get metadata as string
     for (int i = 0; i < gguf_get_n_kv(ctx); i++) {
@@ -2185,42 +2426,41 @@ static void llm_load_hparams(
     }
 
     // get general kv
-    GGUF_GET_KEY(ctx, model.name, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_GENERAL_NAME));
+    ml.get_key(LLM_KV_GENERAL_NAME, model.name, false);
 
     // get hparams kv
-    GGUF_GET_KEY(ctx, hparams.n_vocab,     gguf_get_arr_n,   GGUF_TYPE_ARRAY,  true, kv(LLM_KV_TOKENIZER_LIST));
-    GGUF_GET_KEY(ctx, hparams.n_ctx_train, gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_CONTEXT_LENGTH));
-    GGUF_GET_KEY(ctx, hparams.n_embd,      gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_EMBEDDING_LENGTH));
-    GGUF_GET_KEY(ctx, hparams.n_ff,        gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_FEED_FORWARD_LENGTH));
-    GGUF_GET_KEY(ctx, hparams.n_head,      gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_ATTENTION_HEAD_COUNT));
-    GGUF_GET_KEY(ctx, hparams.n_layer,     gguf_get_val_u32, GGUF_TYPE_UINT32, true, kv(LLM_KV_BLOCK_COUNT));
+    ml.get_arr_n(LLM_KV_TOKENIZER_LIST,       hparams.n_vocab);
+    ml.get_key  (LLM_KV_CONTEXT_LENGTH,       hparams.n_ctx_train);
+    ml.get_key  (LLM_KV_EMBEDDING_LENGTH,     hparams.n_embd);
+    ml.get_key  (LLM_KV_FEED_FORWARD_LENGTH,  hparams.n_ff);
+    ml.get_key  (LLM_KV_ATTENTION_HEAD_COUNT, hparams.n_head);
+    ml.get_key  (LLM_KV_BLOCK_COUNT,          hparams.n_layer);
 
     // n_head_kv is optional, default to n_head
     hparams.n_head_kv = hparams.n_head;
-    GGUF_GET_KEY(ctx, hparams.n_head_kv, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ATTENTION_HEAD_COUNT_KV));
+    ml.get_key(LLM_KV_ATTENTION_HEAD_COUNT_KV, hparams.n_head_kv, false);
 
-    hparams.rope_finetuned = false;
-    GGUF_GET_KEY(ctx, hparams.rope_finetuned, gguf_get_val_bool, GGUF_TYPE_BOOL, false,
-                 kv(LLM_KV_ROPE_SCALING_FINETUNED));
+    bool rope_finetuned = false;
+    ml.get_key(LLM_KV_ROPE_SCALING_FINETUNED, rope_finetuned, false);
+    hparams.rope_finetuned = rope_finetuned;
 
     hparams.n_yarn_orig_ctx = hparams.n_ctx_train;
-    GGUF_GET_KEY(ctx, hparams.n_yarn_orig_ctx, gguf_get_val_u32, GGUF_TYPE_UINT32, false,
-                 kv(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN));
+    ml.get_key(LLM_KV_ROPE_SCALING_ORIG_CTX_LEN, hparams.n_yarn_orig_ctx, false);
 
     // rope_freq_base (optional)
     hparams.rope_freq_base_train = 10000.0f;
-    GGUF_GET_KEY(ctx, hparams.rope_freq_base_train, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_FREQ_BASE));
+    ml.get_key(LLM_KV_ROPE_FREQ_BASE, hparams.rope_freq_base_train, false);
 
     std::string rope_scaling("linear");
-    GGUF_GET_KEY(ctx, rope_scaling, gguf_get_val_str, GGUF_TYPE_STRING, false, kv(LLM_KV_ROPE_SCALING_TYPE));
+    ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
     hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
     GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_UNSPECIFIED);
 
     // rope_freq_scale (inverse of the kv) is optional
     float ropescale = 0.0f;
-    GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALING_FACTOR));
-    if (ropescale == 0.0f) { // try the old key name
-        GGUF_GET_KEY(ctx, ropescale, gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ROPE_SCALE_LINEAR));
+    if (!ml.get_key(LLM_KV_ROPE_SCALING_FACTOR, ropescale, false)) {
+        // try the old key name
+        ml.get_key(LLM_KV_ROPE_SCALE_LINEAR, ropescale, false);
     }
     hparams.rope_freq_scale_train = ropescale == 0.0f ? 1.0f : 1.0f/ropescale;
 
@@ -2228,7 +2468,7 @@ static void llm_load_hparams(
     {
         hparams.n_rot = hparams.n_embd / hparams.n_head;
 
-        GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
+        ml.get_key(LLM_KV_ROPE_DIMENSION_COUNT, hparams.n_rot, false);
 
         if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
             if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
@@ -2243,7 +2483,7 @@ static void llm_load_hparams(
     switch (model.arch) {
         case LLM_ARCH_LLAMA:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
 
                 switch (hparams.n_layer) {
                     case 26: model.type = e_model::MODEL_3B; break;
@@ -2257,7 +2497,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_FALCON:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
@@ -2267,7 +2507,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_BAICHUAN:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
                     case 40: model.type = e_model::MODEL_13B; break;
@@ -2276,7 +2516,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_STARCODER:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
                     case 36: model.type = e_model::MODEL_3B; break;
@@ -2287,7 +2527,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_PERSIMMON:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
                 switch (hparams.n_layer) {
                     case 36: model.type = e_model::MODEL_8B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
@@ -2295,7 +2535,7 @@ static void llm_load_hparams(
             } break;
        case LLM_ARCH_REFACT:
            {
-                GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
                switch (hparams.n_layer) {
                    case 32: model.type = e_model::MODEL_1B; break;
                    default: model.type = e_model::MODEL_UNKNOWN;
@@ -2303,7 +2543,7 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_BLOOM:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
                     case 24: model.type = e_model::MODEL_1B; break;
@@ -2318,9 +2558,9 @@ static void llm_load_hparams(
             {
                 hparams.f_clamp_kqv = 0.0f;
 
-                GGUF_GET_KEY(ctx, hparams.f_norm_eps,       gguf_get_val_f32, GGUF_TYPE_FLOAT32, true,  kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
-                GGUF_GET_KEY(ctx, hparams.f_clamp_kqv,      gguf_get_val_f32, GGUF_TYPE_FLOAT32, false, kv(LLM_KV_ATTENTION_CLAMP_KQV));
-                GGUF_GET_KEY(ctx, hparams.f_max_alibi_bias, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true,  kv(LLM_KV_ATTENTION_MAX_ALIBI_BIAS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,  hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CLAMP_KQV,      hparams.f_clamp_kqv, false);
+                ml.get_key(LLM_KV_ATTENTION_MAX_ALIBI_BIAS, hparams.f_max_alibi_bias);
 
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_7B; break;
@@ -2330,13 +2570,23 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_STABLELM:
             {
-                GGUF_GET_KEY(ctx, hparams.f_norm_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_EPS));
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
                 switch (hparams.n_layer) {
                     case 32: model.type = e_model::MODEL_3B; break;
                     default: model.type = e_model::MODEL_UNKNOWN;
                 }
             } break;
+        case LLM_ARCH_QWEN:
+            {
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS, hparams.f_norm_rms_eps);
+
+                switch (hparams.n_layer) {
+                    case 32: model.type = e_model::MODEL_7B; break;
+                    case 40: model.type = e_model::MODEL_13B; break;
+                    default: model.type = e_model::MODEL_UNKNOWN;
+                }
+            } break;
 
         default: (void)0;
     }
@@ -2378,7 +2628,7 @@ static void llm_load_vocab(
     {
         std::string tokenizer_name;
 
-        GGUF_GET_KEY(ctx, tokenizer_name, gguf_get_val_str, GGUF_TYPE_STRING, true, kv(LLM_KV_TOKENIZER_MODEL));
+        ml.get_key(LLM_KV_TOKENIZER_MODEL, tokenizer_name);
 
         if (tokenizer_name == "llama") {
             vocab.type = LLAMA_VOCAB_TYPE_SPM;
@@ -2468,34 +2718,31 @@ static void llm_load_vocab(
         };
         for (const auto & it : special_token_types) {
             const std::string & key = kv(std::get<0>(it));
-            int32_t & id = std::get<1>(it), old_id = id;
+            int32_t & id = std::get<1>(it);
 
-            GGUF_GET_KEY(ctx, id, gguf_get_val_u32, GGUF_TYPE_UINT32, false, key);
-            // Must be >= -1 and < vocab size. Since the key is unsigned, -1
-            // can only come from the default value, so there's no point in
-            // validating that.
-            if (size_t(id + 1) > vocab.id_to_token.size()) {
-                LLAMA_LOG_WARN("%s: bad special token: '%s' = %d, using default id %d\n",
-                    __func__, key.c_str(), id, old_id);
-                id = old_id;
+            uint32_t new_id;
+            if (!ml.get_key(std::get<0>(it), new_id, false)) {
+                continue;
+            }
+            if (new_id >= vocab.id_to_token.size()) {
+                LLAMA_LOG_WARN("%s: bad special token: '%s' = %ud, using default id %d\n",
+                    __func__, key.c_str(), new_id, id);
+            } else {
+                id = new_id;
             }
         }
 
         // Handle add_bos_token and add_eos_token
-        std::string key = kv(LLM_KV_TOKENIZER_ADD_BOS);
-        int kid = gguf_find_key(ctx, key.c_str());
-        enum gguf_type ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
-        vocab.special_add_bos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
-        if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
-            LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
-        }
-        key = kv(LLM_KV_TOKENIZER_ADD_EOS);
-        kid = gguf_find_key(ctx, key.c_str());
-        ktype = kid < 0 ? GGUF_TYPE_COUNT : gguf_get_kv_type(ctx, kid);
-        vocab.special_add_eos = ktype == GGUF_TYPE_BOOL ? gguf_get_val_bool(ctx, kid) : -1;
-        if (ktype != GGUF_TYPE_BOOL && ktype != GGUF_TYPE_COUNT) {
-            LLAMA_LOG_WARN("%s: bad field type %d for '%s' - ignoring\n", __func__, ktype, key.c_str());
+        {
+            bool temp = true;
+
+            if (ml.get_key(LLM_KV_TOKENIZER_ADD_BOS, temp, false)) {
+                vocab.special_add_bos = int(temp);
+            }
+            if (ml.get_key(LLM_KV_TOKENIZER_ADD_EOS, temp, false)) {
+                vocab.special_add_eos = int(temp);
+            }
         }
     }
 
@@ -2634,15 +2881,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     }
 
     // general kv
-    LLAMA_LOG_INFO("%s: general.name   = %s\n",    __func__, model.name.c_str());
+    LLAMA_LOG_INFO("%s: general.name     = %s\n", __func__, model.name.c_str());
 
     // special tokens
-    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
-    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
-    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
-    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
-    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
-    if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token  = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str()    ); }
+    if (vocab.special_bos_id != -1) { LLAMA_LOG_INFO( "%s: BOS token        = %d '%s'\n", __func__, vocab.special_bos_id, vocab.id_to_token[vocab.special_bos_id].text.c_str() ); }
+    if (vocab.special_eos_id != -1) { LLAMA_LOG_INFO( "%s: EOS token        = %d '%s'\n", __func__, vocab.special_eos_id, vocab.id_to_token[vocab.special_eos_id].text.c_str() ); }
+    if (vocab.special_unk_id != -1) { LLAMA_LOG_INFO( "%s: UNK token        = %d '%s'\n", __func__, vocab.special_unk_id, vocab.id_to_token[vocab.special_unk_id].text.c_str() ); }
+    if (vocab.special_sep_id != -1) { LLAMA_LOG_INFO( "%s: SEP token        = %d '%s'\n", __func__, vocab.special_sep_id, vocab.id_to_token[vocab.special_sep_id].text.c_str() ); }
+    if (vocab.special_pad_id != -1) { LLAMA_LOG_INFO( "%s: PAD token        = %d '%s'\n", __func__, vocab.special_pad_id, vocab.id_to_token[vocab.special_pad_id].text.c_str() ); }
+    if (vocab.linefeed_id    != -1) { LLAMA_LOG_INFO( "%s: LF token         = %d '%s'\n", __func__, vocab.linefeed_id,    vocab.id_to_token[vocab.linefeed_id].text.c_str()    ); }
 }
 
 static void llm_load_tensors(
@@ -2728,14 +2975,7 @@ static void llm_load_tensors(
                 ggml_backend_type backend_output;
 
                 if (n_gpu_layers > int(n_layer)) {
-                    // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
-                    // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                    backend_norm = llama_backend_offload;
-#else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                    backend_norm = llama_backend_offload;
                     backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm = GGML_BACKEND_CPU;
@@ -2772,6 +3012,12 @@ static void llm_load_tensors(
                     layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "weight", i), {n_embd, n_embd_gqa}, backend_split);
                     layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);
 
+                    // optional bias tensors
+                    layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd},     backend, false);
+                    layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, backend, false);
+                    layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, backend, false);
+                    layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd},     backend, false);
+
                     layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
 
                     layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
@@ -2780,9 +3026,14 @@ static void llm_load_tensors(
 
                     if (backend == GGML_BACKEND_GPU) {
                         vram_weights +=
-                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq)       + ggml_nbytes(layer.wk)       +
-                            ggml_nbytes(layer.wv)        + ggml_nbytes(layer.wo)       + ggml_nbytes(layer.ffn_norm) +
-                            ggml_nbytes(layer.ffn_gate)  + ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
+                            ggml_nbytes(layer.wv)        + ggml_nbytes(layer.wo) +
+                            (layer.bq ? ggml_nbytes(layer.bq) : 0) +
+                            (layer.bk ? ggml_nbytes(layer.bk) : 0) +
+                            (layer.bv ? ggml_nbytes(layer.bv) : 0) +
+                            (layer.bo ? ggml_nbytes(layer.bo) : 0) +
+                            ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
+                            ggml_nbytes(layer.ffn_down) + ggml_nbytes(layer.ffn_up);
                     }
                 }
             } break;
@@ -2794,14 +3045,7 @@ static void llm_load_tensors(
                 ggml_backend_type backend_output;
 
                 if (n_gpu_layers > int(n_layer)) {
-                    // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
-                    // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                    backend_norm = llama_backend_offload;
-#else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                    backend_norm = llama_backend_offload;
                     backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm = GGML_BACKEND_CPU;
@@ -2864,14 +3108,7 @@ static void llm_load_tensors(
                 ggml_backend_type backend_output;
 
                 if (n_gpu_layers > int(n_layer)) {
-                    // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
-                    // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                    backend_norm = llama_backend_offload;
-#else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                    backend_norm = llama_backend_offload;
                     backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm = GGML_BACKEND_CPU;
@@ -2941,14 +3178,7 @@ static void llm_load_tensors(
                 ggml_backend_type backend_output;
 
                 if (n_gpu_layers > int(n_layer)) {
-                    // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
-                    // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                    backend_norm = llama_backend_offload;
-#else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                    backend_norm = llama_backend_offload;
                     backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm = GGML_BACKEND_CPU;
@@ -3018,21 +3248,7 @@ static void llm_load_tensors(
                 ggml_backend_type backend_output;
 
                 if (n_gpu_layers > int(n_layer)) {
-#ifdef GGML_USE_CUBLAS
-                    if (n_gpu_layers > int(n_layer + 1)) {
-                        LLAMA_LOG_ERROR("%s: CUDA backend missing Persimmon CUDA ops, can offload at most %ld layers. See: https://github.com/ggerganov/llama.cpp/issues/4038\n",
-                            __func__, n_layer + 1);
-                        throw std::runtime_error("Persimmon CUDA offload failed");
-                    }
-#endif
-                    // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
-                    // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                    backend_norm = llama_backend_offload;
-#else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                    backend_norm = llama_backend_offload;
                     backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm = GGML_BACKEND_CPU;
@@ -3091,14 +3307,7 @@ static void llm_load_tensors(
                 ggml_backend_type backend_output;
 
                 if (n_gpu_layers > int(n_layer)) {
-                    // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
-                    // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                    backend_norm = llama_backend_offload;
-#else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                    backend_norm = llama_backend_offload;
                     backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm = GGML_BACKEND_CPU;
@@ -3169,14 +3378,7 @@ static void llm_load_tensors(
                 ggml_backend_type backend_output;
 
                 if (n_gpu_layers > int(n_layer)) {
-                    // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
-                    // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                    backend_norm = llama_backend_offload;
-#else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                    backend_norm = llama_backend_offload;
                     backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm = GGML_BACKEND_CPU;
@@ -3236,14 +3438,7 @@ static void llm_load_tensors(
                 ggml_backend_type backend_output;
 
                 if (n_gpu_layers > int(n_layer)) {
-                    // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
-                    // on Windows however this is detrimental unless everything is on the GPU
-#ifndef _WIN32
-                    backend_norm = llama_backend_offload;
-#else
-                    backend_norm = n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : llama_backend_offload;
-#endif // _WIN32
-
+                    backend_norm = llama_backend_offload;
                     backend_output = llama_backend_offload_split;
                 } else {
                     backend_norm = GGML_BACKEND_CPU;
@@ -3300,6 +3495,64 @@ static void llm_load_tensors(
                     }
                 }
             } break;
+        case LLM_ARCH_QWEN:
+            {
+                model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+                {
+                    ggml_backend_type backend_norm;
+                    ggml_backend_type backend_output;
+
+                    if (n_gpu_layers > int(n_layer)) {
+                        backend_norm   = llama_backend_offload;
+                        backend_output = llama_backend_offload_split;
+                    } else {
+                        backend_norm   = GGML_BACKEND_CPU;
+                        backend_output = GGML_BACKEND_CPU;
+                    }
+
+                    model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd},          backend_norm);
+                    model.output      = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT,      "weight"), {n_embd, n_vocab}, backend_output);
+
+                    if (backend_norm == GGML_BACKEND_GPU) {
+                        vram_weights += ggml_nbytes(model.output_norm);
+                    }
+                    if (backend_output == GGML_BACKEND_GPU_SPLIT) {
+                        vram_weights += ggml_nbytes(model.output);
+                    }
+                }
+
+                const uint32_t n_ff = hparams.n_ff / 2;
+
+                const int i_gpu_start = n_layer - n_gpu_layers;
+
+                model.layers.resize(n_layer);
+
+                for (uint32_t i = 0; i < n_layer; ++i) {
+                    const ggml_backend_type backend       = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
+                    const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+
+                    auto & layer = model.layers[i];
+
+                    layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split);
+                    layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i),   {n_embd * 3},         backend);
+                    layer.wo   = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd},     backend_split);
+
+                    layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+
+                    layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
+                    layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {  n_ff, n_embd}, backend_split);
+                    layer.ffn_up   = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP,   "weight", i), {n_embd, n_ff}, backend_split);
+
+                    if (backend == GGML_BACKEND_GPU) {
+                        vram_weights +=
+                            ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wqkv)     + ggml_nbytes(layer.bqkv) +
+                            ggml_nbytes(layer.wo)        + ggml_nbytes(layer.ffn_norm) + ggml_nbytes(layer.ffn_gate) +
+                            ggml_nbytes(layer.ffn_down)  + ggml_nbytes(layer.ffn_up);
+                    }
+                }
+            } break;
 
         default:
             throw std::runtime_error("unknown architecture");
|
|
3326
3579
|
}
|
3327
3580
|
|
3328
3581
|
#ifdef GGML_USE_CUBLAS
|
3329
|
-
const int max_backend_supported_layers = hparams.n_layer +
|
3330
|
-
const int max_offloadable_layers = hparams.n_layer +
|
3582
|
+
const int max_backend_supported_layers = hparams.n_layer + 1;
|
3583
|
+
const int max_offloadable_layers = hparams.n_layer + 1;
|
3331
3584
|
#elif GGML_USE_CLBLAST
|
3332
3585
|
const int max_backend_supported_layers = hparams.n_layer + 1;
|
3333
3586
|
const int max_offloadable_layers = hparams.n_layer + 1;
|
@@ -3368,7 +3621,7 @@ static void llm_load_tensors(
 
 static bool llama_model_load(const std::string & fname, llama_model & model, const llama_model_params & params) {
     try {
-        llama_model_loader ml(fname, params.use_mmap);
+        llama_model_loader ml(fname, params.use_mmap, params.kv_overrides);
 
         model.hparams.vocab_only = params.vocab_only;
 
@@ -3464,7 +3717,7 @@ static void llm_build_k_shift(
        struct ggml_cgraph * graph,
             llm_rope_type   type,
                   int64_t   n_ctx,
-                  int64_t   n_rot,
+                      int   n_rot,
                     float   freq_base,
                     float   freq_scale,
        const llm_build_cb & cb) {
@@ -3495,11 +3748,11 @@ static void llm_build_k_shift(
         struct ggml_tensor * tmp =
             // we rotate only the first n_rot dimensions
             ggml_rope_custom_inplace(ctx,
-                    ggml_view_3d(ctx, kv.k,
-                        n_embd_head, n_head_kv, n_ctx,
-                        ggml_element_size(kv.k)*n_embd_head,
-                        ggml_element_size(kv.k)*n_embd_gqa,
-                        ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il),
+                    ggml_view_3d(ctx, kv.k_l[il],
+                        n_embd_head, n_head_kv, n_ctx,
+                        ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
+                        ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
+                        0),
                 K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow);
         cb(tmp, "K_shifted", il);
@@ -3526,13 +3779,13 @@ static void llm_build_kv_store(
     //struct ggml_tensor * v_cur_t = ggml_transpose(ctx, v_cur); // TODO: reshape above is likely not needed
     cb(v_cur_t, "v_cur_t", il);
 
-    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k, n_tokens*n_embd_gqa,
-            (ggml_element_size(kv.k)*n_embd_gqa)*(il*n_ctx + kv_head));
+    struct ggml_tensor * k_cache_view = ggml_view_1d(ctx, kv.k_l[il], n_tokens*n_embd_gqa,
+            (ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa)*kv_head);
     cb(k_cache_view, "k_cache_view", il);
 
-    struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v, n_tokens, n_embd_gqa,
-            (   n_ctx)*ggml_element_size(kv.v),
-            (il*n_ctx*n_embd_gqa + kv_head)*ggml_element_size(kv.v));
+    struct ggml_tensor * v_cache_view = ggml_view_2d(ctx, kv.v_l[il], n_tokens, n_embd_gqa,
+            (  n_ctx)*ggml_element_size(kv.v_l[il]),
+            (kv_head)*ggml_element_size(kv.v_l[il]));
     cb(v_cache_view, "v_cache_view", il);
 
     // important: storing RoPE-ed version of K in the KV cache!
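Per-layer cache tensors also simplify these offsets: the K write now lands n_embd_gqa*kv_head elements into the layer's own tensor, and V, stored transposed with row stride n_ctx, starts at column kv_head, with no il*n_ctx term any more. As byte arithmetic (a sketch under those assumptions, not llama.cpp code):

    #include <cstddef>

    // Assumes one K and one V tensor per layer; elem_size is bytes per element.
    static size_t k_write_offset(size_t elem_size, size_t n_embd_gqa, size_t kv_head) {
        return elem_size * n_embd_gqa * kv_head;   // cf. k_cache_view above
    }
    static size_t v_write_offset(size_t elem_size, size_t kv_head) {
        return elem_size * kv_head;                // column start; rows stride by n_ctx
    }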
@@ -3684,40 +3937,46 @@ static struct ggml_tensor * llm_build_kqv(
     cb(q, "q", il);
 
     struct ggml_tensor * k =
-        ggml_view_3d(ctx, kv.k,
+        ggml_view_3d(ctx, kv.k_l[il],
             n_embd_head, n_kv, n_head_kv,
-            ggml_element_size(kv.k)*n_embd_gqa,
-            ggml_element_size(kv.k)*n_embd_head,
-            ggml_element_size(kv.k)*n_embd_gqa*n_ctx*il);
+            ggml_type_sizef(kv.k_l[il]->type)*n_embd_gqa,
+            ggml_type_sizef(kv.k_l[il]->type)*n_embd_head,
+            0);
     cb(k, "k", il);
 
     struct ggml_tensor * kq = ggml_mul_mat(ctx, k, q);
     cb(kq, "kq", il);
 
-    kq = ggml_scale(ctx, kq, kq_scale);
-    cb(kq, "kq_scaled", il);
-
     if (max_alibi_bias > 0.0f) {
-        // TODO: n_head or n_head_kv
-        // TODO: K-shift is likely not working
-        // TODO: change to ggml_add
-        kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
-        cb(kq, "kq_scaled_alibi", il);
-    }
+        // temporary branch until we figure out how to handle ggml_alibi through ggml_add
+        kq = ggml_scale(ctx, kq, kq_scale);
+        cb(kq, "kq_scaled", il);
 
-    kq = ggml_add(ctx, kq, kq_mask);
-    cb(kq, "kq_masked", il);
+        if (max_alibi_bias > 0.0f) {
+            // TODO: n_head or n_head_kv
+            // TODO: K-shift is likely not working
+            // TODO: change to ggml_add
+            kq = ggml_alibi(ctx, kq, /*n_past*/ 0, n_head, max_alibi_bias);
+            cb(kq, "kq_scaled_alibi", il);
+        }
 
-    kq = ggml_soft_max(ctx, kq);
-    cb(kq, "kq_soft_max", il);
+        kq = ggml_add(ctx, kq, kq_mask);
+        cb(kq, "kq_masked", il);
+
+        kq = ggml_soft_max(ctx, kq);
+        cb(kq, "kq_soft_max", il);
+    } else {
+        kq = ggml_soft_max_ext(ctx, kq, kq_mask, 1.0f/sqrtf(float(n_embd_head)));
+        cb(kq, "kq_soft_max_ext", il);
+    }
 
     // split cached v into n_head heads
     struct ggml_tensor * v =
-        ggml_view_3d(ctx, kv.v,
+        ggml_view_3d(ctx, kv.v_l[il],
             n_kv, n_embd_head, n_head_kv,
-            ggml_element_size(kv.v)*n_ctx,
-            ggml_element_size(kv.v)*n_ctx*n_embd_head,
-            ggml_element_size(kv.v)*n_ctx*n_embd_gqa*il);
+            ggml_element_size(kv.v_l[il])*n_ctx,
+            ggml_element_size(kv.v_l[il])*n_ctx*n_embd_head,
+            0);
     cb(v, "v", il);
 
     struct ggml_tensor * kqv = ggml_mul_mat(ctx, v, kq);
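The non-ALiBi path now calls ggml_soft_max_ext, which fuses the scale and the mask into the softmax instead of materializing kq_scaled and kq_masked. A scalar reference of what the fused branch computes, assuming it matches the unfused branch above (row-wise softmax over scale*x + mask):

    #include <algorithm>
    #include <cmath>
    #include <vector>

    static void soft_max_ext_ref(std::vector<float> & row,
                                 const std::vector<float> & mask, float scale) {
        float maxv = -INFINITY;
        for (size_t i = 0; i < row.size(); ++i) {
            row[i] = row[i] * scale + mask[i]; // fused scale + mask
            maxv = std::max(maxv, row[i]);
        }
        float sum = 0.0f;
        for (float & x : row) { x = std::exp(x - maxv); sum += x; }
        for (float & x : row) { x /= sum; }
    }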
@@ -3875,12 +4134,24 @@ struct llm_build_context {
             // compute Q and K and RoPE them
             struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
             cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }
 
             struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
             cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }
 
             struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
             cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }
 
             Qcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
@@ -3899,7 +4170,7 @@ struct llm_build_context {
             llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
 
             cur = llm_build_kqv(ctx0, hparams, kv_self,
-                    model.layers[il].wo,
+                    model.layers[il].wo, model.layers[il].bo,
                     Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
             cb(cur, "kqv_out", il);
         }
@@ -4297,6 +4568,7 @@ struct llm_build_context {
         inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
         cb(inpL, "imp_embd", -1);
 
+        // inp_pos - contains the positions
         struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
         cb(inp_pos, "inp_pos", -1);
 
@@ -4304,6 +4576,7 @@ struct llm_build_context {
         struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
         cb(KQ_scale, "KQ_scale", -1);
 
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
         struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
         cb(KQ_mask, "KQ_mask", -1);
 
@@ -4892,6 +5165,121 @@ struct llm_build_context {
 
         return gf;
     }
+
+    struct ggml_cgraph * build_qwen() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        struct ggml_tensor * cur;
+        struct ggml_tensor * inpL;
+
+        inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
+        cb(inpL, "inp_embd", -1);
+
+        // inp_pos - contains the positions
+        struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        cb(inp_pos, "inp_pos", -1);
+
+        // KQ_scale
+        struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
+        cb(KQ_scale, "KQ_scale", -1);
+
+        // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
+        struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+        cb(KQ_mask, "KQ_mask", -1);
+
+        // shift the entire K-cache if needed
+        if (do_rope_shift) {
+            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
+        }
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * inpSA = inpL;
+
+            cur = llm_build_norm(ctx0, inpL, hparams,
+                    model.layers[il].attn_norm, NULL,
+                    LLM_NORM_RMS, cb, il);
+            cb(cur, "attn_norm", il);
+
+            // self-attention
+            {
+                cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
+                cb(cur, "wqkv", il);
+
+                cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+                cb(cur, "bqkv", il);
+
+                struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+                struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*(n_embd)));
+
+                cb(Qcur, "Qcur", il);
+                cb(Kcur, "Kcur", il);
+                cb(Vcur, "Vcur", il);
+
+                Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
+                Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
+
+                // using mode = 2 for neox mode
+                Qcur = ggml_rope_custom(
+                    ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Qcur, "Qcur", il);
+
+                Kcur = ggml_rope_custom(
+                    ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
+                    freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
+                );
+                cb(Kcur, "Kcur", il);
+
+                llm_build_kv_store(ctx0, hparams, kv_self, gf, Kcur, Vcur, n_ctx, n_tokens, kv_head, cb, il);
+
+                cur = llm_build_kqv(ctx0, hparams, kv_self,
+                        model.layers[il].wo, NULL,
+                        Qcur, KQ_scale, KQ_mask, n_ctx, n_tokens, n_kv, -1.0f, cb, il);
+                cb(cur, "kqv_out", il);
+            }
+
+            struct ggml_tensor * ffn_inp = ggml_add(ctx0, cur, inpSA);
+            cb(ffn_inp, "ffn_inp", il);
+
+            // feed-forward forward
+            {
+                cur = llm_build_norm(ctx0, ffn_inp, hparams,
+                        model.layers[il].ffn_norm, NULL,
+                        LLM_NORM_RMS, cb, il);
+                cb(cur, "ffn_norm", il);
+
+                cur = llm_build_ffn(ctx0, cur,
+                        model.layers[il].ffn_up,   NULL,
+                        model.layers[il].ffn_gate, NULL,
+                        model.layers[il].ffn_down, NULL,
+                        LLM_FFN_SILU, LLM_FFN_PAR, cb, il);
+                cb(cur, "ffn_out", il);
+            }
+
+            cur = ggml_add(ctx0, cur, ffn_inp);
+            cb(cur, "l_out", il);
+
+            // input for next layer
+            inpL = cur;
+        }
+
+        cur = inpL;
+
+        cur = llm_build_norm(ctx0, cur, hparams,
+                model.output_norm, NULL,
+                LLM_NORM_RMS, cb, -1);
+        cb(cur, "result_norm", -1);
+
+        // lm_head
+        cur = ggml_mul_mat(ctx0, model.output, cur);
+        cb(cur, "result_output", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
 };
 
 //
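Qwen keeps Q, K and V fused in a single attn_qkv projection; the three ggml_view_2d calls above slice each token's 3*n_embd output at element offsets 0, n_embd and 2*n_embd. The same split in miniature (illustrative, not llama.cpp code):

    #include <cstddef>
    #include <vector>

    // One row of 3*n_embd floats per token, sliced into Q/K/V segments
    // (cf. the 0/1/2 * sizeof(float)*n_embd byte offsets above).
    struct qkv_views { const float * q; const float * k; const float * v; };

    static qkv_views split_qkv(const std::vector<float> & fused_row, size_t n_embd) {
        return { fused_row.data(), fused_row.data() + n_embd, fused_row.data() + 2 * n_embd };
    }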
@@ -4902,8 +5290,8 @@ struct llm_build_context {
 enum llm_offload_func_e {
     OFFLOAD_FUNC_NOP,
     OFFLOAD_FUNC,
-    OFFLOAD_FUNC_KQ,
-    OFFLOAD_FUNC_V,
+    OFFLOAD_FUNC_FRC, // force offload
+    OFFLOAD_FUNC_KQV,
     OFFLOAD_FUNC_NR,
     OFFLOAD_FUNC_EMB,
     OFFLOAD_FUNC_OUT,
@@ -4989,11 +5377,12 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
     //{ "inp_embd",                 OFFLOAD_FUNC_NR  }, // TODO: missing K-quants get_rows kernel
     { "pos_embd",                   OFFLOAD_FUNC_NR  },
 
-    { "inp_pos",                    OFFLOAD_FUNC_KQ  }, // this is often used for KQ ops (e.g. rope)
-    { "KQ_scale",                   OFFLOAD_FUNC_KQ  },
-    { "KQ_mask",                    OFFLOAD_FUNC_KQ  },
-    { "K_shift",                    OFFLOAD_FUNC_KQ  },
-    { "K_shifted",                  OFFLOAD_FUNC_KQ  },
+    { "inp_pos",                    OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
+    { "KQ_scale",                   OFFLOAD_FUNC_FRC },
+    { "KQ_mask",                    OFFLOAD_FUNC_FRC },
+    { "K_shift",                    OFFLOAD_FUNC_FRC },
+
+    { "K_shifted",                  OFFLOAD_FUNC     },
 
     { "inp_norm",                   OFFLOAD_FUNC_NR  },
     { "inp_norm_w",                 OFFLOAD_FUNC_NR  },
@@ -5006,37 +5395,38 @@ static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map
     { "attn_norm",                  OFFLOAD_FUNC     },
     { "attn_norm_2",                OFFLOAD_FUNC     },
 
-    { "wqkv",                       OFFLOAD_FUNC_KQ  },
-    { "bqkv",                       OFFLOAD_FUNC_KQ  },
-    { "wqkv_clamped",               OFFLOAD_FUNC_KQ  },
-
-    { "tmpk",                       OFFLOAD_FUNC_KQ  },
-    { "tmpq",                       OFFLOAD_FUNC_KQ  },
-    { "tmpv",                       OFFLOAD_FUNC_V   },
-    { "Kcur",                       OFFLOAD_FUNC_KQ  },
-    { "Qcur",                       OFFLOAD_FUNC_KQ  },
-    { "Vcur",                       OFFLOAD_FUNC_V   },
-
-    { "krot",                       OFFLOAD_FUNC_KQ  },
-    { "qrot",                       OFFLOAD_FUNC_KQ  },
-    { "kpass",                      OFFLOAD_FUNC_KQ  },
-    { "qpass",                      OFFLOAD_FUNC_KQ  },
-    { "krotated",                   OFFLOAD_FUNC_KQ  },
-    { "qrotated",                   OFFLOAD_FUNC_KQ  },
-
-    { "q",                          OFFLOAD_FUNC_KQ  },
-    { "k",                          OFFLOAD_FUNC_KQ  },
-    { "kq",                         OFFLOAD_FUNC_KQ  },
-    { "kq_scaled",                  OFFLOAD_FUNC_KQ  },
-    { "kq_scaled_alibi",            OFFLOAD_FUNC_KQ  },
-    { "kq_masked",                  OFFLOAD_FUNC_KQ  },
-    { "kq_soft_max",                OFFLOAD_FUNC_KQ  },
-    { "v",                          OFFLOAD_FUNC_V   },
-    { "kqv",                        OFFLOAD_FUNC_V   },
-    { "kqv_merged",                 OFFLOAD_FUNC_V   },
-    { "kqv_merged_cont",            OFFLOAD_FUNC_V   },
-    { "kqv_wo",                     OFFLOAD_FUNC_V   },
-    { "kqv_out",                    OFFLOAD_FUNC_V   },
+    { "wqkv",                       OFFLOAD_FUNC_KQV },
+    { "bqkv",                       OFFLOAD_FUNC_KQV },
+    { "wqkv_clamped",               OFFLOAD_FUNC_KQV },
+
+    { "tmpk",                       OFFLOAD_FUNC_KQV },
+    { "tmpq",                       OFFLOAD_FUNC_KQV },
+    { "tmpv",                       OFFLOAD_FUNC_KQV },
+    { "Kcur",                       OFFLOAD_FUNC_KQV },
+    { "Qcur",                       OFFLOAD_FUNC_KQV },
+    { "Vcur",                       OFFLOAD_FUNC_KQV },
+
+    { "krot",                       OFFLOAD_FUNC_KQV },
+    { "qrot",                       OFFLOAD_FUNC_KQV },
+    { "kpass",                      OFFLOAD_FUNC_KQV },
+    { "qpass",                      OFFLOAD_FUNC_KQV },
+    { "krotated",                   OFFLOAD_FUNC_KQV },
+    { "qrotated",                   OFFLOAD_FUNC_KQV },
+
+    { "q",                          OFFLOAD_FUNC_KQV },
+    { "k",                          OFFLOAD_FUNC_KQV },
+    { "kq",                         OFFLOAD_FUNC_KQV },
+    { "kq_scaled",                  OFFLOAD_FUNC_KQV },
+    { "kq_scaled_alibi",            OFFLOAD_FUNC_KQV },
+    { "kq_masked",                  OFFLOAD_FUNC_KQV },
+    { "kq_soft_max",                OFFLOAD_FUNC_KQV },
+    { "kq_soft_max_ext",            OFFLOAD_FUNC_KQV },
+    { "v",                          OFFLOAD_FUNC_KQV },
+    { "kqv",                        OFFLOAD_FUNC_KQV },
+    { "kqv_merged",                 OFFLOAD_FUNC_KQV },
+    { "kqv_merged_cont",            OFFLOAD_FUNC_KQV },
+    { "kqv_wo",                     OFFLOAD_FUNC_KQV },
+    { "kqv_out",                    OFFLOAD_FUNC_KQV },
 
     { "ffn_inp",                    OFFLOAD_FUNC     },
     { "ffn_norm",                   OFFLOAD_FUNC     },
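Note: this table keys the debug names passed to cb(...) in the graph builders to an offload category; the rewrite collapses every K/Q/V intermediate into the single OFFLOAD_FUNC_KQV bucket so that one flag can toggle them all. A sketch of the lookup pattern with std::string keys (the real table uses const char * literals and the full enum):

#include <string>
#include <unordered_map>

enum offload_func_e { FUNC_NOP, FUNC, FUNC_FRC, FUNC_KQV }; // abbreviated stand-in

static const std::unordered_map<std::string, offload_func_e> k_map = {
    { "KQ_mask", FUNC_FRC },  // force-offloaded whenever possible
    { "Qcur",    FUNC_KQV },  // offloaded only if offload_kqv is set
};

offload_func_e classify(const std::string & name) {
    auto it = k_map.find(name);
    return it == k_map.end() ? FUNC_NOP : it->second;
}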
@@ -5228,15 +5618,15 @@ static struct ggml_cgraph * llama_build_graph(
         { OFFLOAD_FUNC_NOP, "CPU" },
         { OFFLOAD_FUNC_OUT, "CPU" },
 #ifdef GGML_USE_CUBLAS
-        { OFFLOAD_FUNC,    "GPU (CUDA)" },
-        { OFFLOAD_FUNC_KQ, "GPU (CUDA) KQ" },
-        { OFFLOAD_FUNC_V,  "GPU (CUDA) V" },
-        { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
+        { OFFLOAD_FUNC,     "GPU (CUDA)" },
+        { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
+        { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
+        { OFFLOAD_FUNC_NR,  "GPU (CUDA) NR" },
         { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
 #else
         { OFFLOAD_FUNC,     "CPU" },
-        { OFFLOAD_FUNC_KQ,  "CPU" },
-        { OFFLOAD_FUNC_V,   "CPU" },
+        { OFFLOAD_FUNC_FRC, "CPU" },
+        { OFFLOAD_FUNC_KQV, "CPU" },
         { OFFLOAD_FUNC_NR,  "CPU" },
         { OFFLOAD_FUNC_EMB, "CPU" },
 #endif // GGML_USE_CUBLAS
@@ -5269,18 +5659,23 @@ static struct ggml_cgraph * llama_build_graph(
                     }
                 }
                 break;
-            case OFFLOAD_FUNC_NR:
-                if (n_gpu_layers <= n_layer + 0) {
+            case OFFLOAD_FUNC_FRC:
+                if (!lctx.cparams.offload_kqv) {
                     func_e = OFFLOAD_FUNC_NOP;
-                }
-                break;
-            case OFFLOAD_FUNC_V:
-                if (n_gpu_layers <= n_layer + 1) {
+                } break;
+            case OFFLOAD_FUNC_KQV:
+                if (!lctx.cparams.offload_kqv) {
                     func_e = OFFLOAD_FUNC_NOP;
+                } else {
+                    if (n_gpu_layers < n_layer) {
+                        if (il < i_gpu_start) {
+                            func_e = OFFLOAD_FUNC_NOP;
+                        }
+                    }
                 }
                 break;
-            case OFFLOAD_FUNC_KQ:
-                if (n_gpu_layers <= n_layer + 2) {
+            case OFFLOAD_FUNC_NR:
+                if (n_gpu_layers <= n_layer + 0) {
                     func_e = OFFLOAD_FUNC_NOP;
                 }
                 break;
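Note: this dispatch is where the new cparams.offload_kqv flag takes effect: FRC tensors drop to NOP only when the flag is off, while KQV tensors additionally respect the per-layer GPU window. A condensed sketch of the same decision logic with simplified names:

enum offload_func_e { FUNC_NOP, FUNC_FRC, FUNC_KQV };

offload_func_e resolve(offload_func_e e, bool offload_kqv,
                       int n_gpu_layers, int n_layer, int il, int i_gpu_start) {
    switch (e) {
        case FUNC_FRC:
            if (!offload_kqv) { e = FUNC_NOP; }
            break;
        case FUNC_KQV:
            if (!offload_kqv) {
                e = FUNC_NOP;
            } else if (n_gpu_layers < n_layer && il < i_gpu_start) {
                // layer sits below the offloaded window, so keep it on the CPU
                e = FUNC_NOP;
            }
            break;
        default:
            break;
    }
    return e;
}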
@@ -5305,8 +5700,8 @@ static struct ggml_cgraph * llama_build_graph(
             case OFFLOAD_FUNC_NOP:
             case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break;
             case OFFLOAD_FUNC:
-            case OFFLOAD_FUNC_KQ:
-            case OFFLOAD_FUNC_V:
+            case OFFLOAD_FUNC_KQV:
+            case OFFLOAD_FUNC_FRC:
             case OFFLOAD_FUNC_NR:
             case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
             default: GGML_ASSERT(false);
@@ -5365,6 +5760,10 @@ static struct ggml_cgraph * llama_build_graph(
                 {
                     result = llm.build_stablelm();
                 } break;
+            case LLM_ARCH_QWEN:
+                {
+                    result = llm.build_qwen();
+                } break;
             default:
                 GGML_ASSERT(false);
         }
@@ -5487,8 +5886,8 @@ static int llama_decode_internal(
     // a heuristic, to avoid attending the full cache if it is not yet utilized
     // after enough generations, the benefit from this heuristic disappears
     // if we start defragmenting the cache, the benefit from this will be more important
-    //kv_self.n = std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32));   // TODO: this might be better for CUDA?
-    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, llama_kv_cache_cell_max(kv_self)));
+    kv_self.n = std::min((int32_t) cparams.n_ctx, std::max(32, GGML_PAD(llama_kv_cache_cell_max(kv_self), 32)));
+    //kv_self.n = llama_kv_cache_cell_max(kv_self);
 
     //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
 
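Note: GGML_PAD rounds the KV-cache high-water mark up to a multiple of 32 before clamping to n_ctx, so the attention kernels always see aligned cell ranges. A worked example, with a local pad() standing in for the ggml macro:

#include <algorithm>
#include <cstdio>

// stand-in for ggml's GGML_PAD: round x up to a multiple of n (n a power of two)
static int pad(int x, int n) { return (x + n - 1) & ~(n - 1); }

int main() {
    const int n_ctx    = 4096;
    const int cell_max = 45;  // highest used KV cell, hypothetical
    const int n        = std::min(n_ctx, std::max(32, pad(cell_max, 32)));
    std::printf("kv_self.n = %d\n", n);  // 45 is padded to 64, well below n_ctx
    return 0;
}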
@@ -5539,18 +5938,8 @@ static int llama_decode_internal(
         n_threads = std::min(4, n_threads);
     }
 
-    // If all tensors can be run on the GPU then using more than 1 thread is detrimental.
-    const bool full_offload_supported =
-        model.arch == LLM_ARCH_LLAMA     ||
-        model.arch == LLM_ARCH_BAICHUAN  ||
-        model.arch == LLM_ARCH_FALCON    ||
-        model.arch == LLM_ARCH_REFACT    ||
-        model.arch == LLM_ARCH_MPT       ||
-        model.arch == LLM_ARCH_STARCODER ||
-        model.arch == LLM_ARCH_STABLELM;
-
-    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 3;
-    if (ggml_cpu_has_cublas() && full_offload_supported && fully_offloaded) {
+    const bool fully_offloaded = model.n_gpu_layers >= (int) hparams.n_layer + 1;
+    if (ggml_cpu_has_cublas() && fully_offloaded) {
         n_threads = 1;
     }
 
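Note: with per-layer KV offload, full offload no longer needs the old n_layer + 3 margin or the per-architecture allow-list; any model with n_gpu_layers >= n_layer + 1 now drops to a single CPU thread, since the CPU only feeds the GPU. Condensed sketch of the heuristic:

// sketch: pick the CPU thread count during decode (simplified)
int pick_n_threads(int n_threads, bool has_cublas, int n_gpu_layers, int n_layer) {
    const bool fully_offloaded = n_gpu_layers >= n_layer + 1;
    if (has_cublas && fully_offloaded) {
        return 1; // the CPU only orchestrates; extra threads just add contention
    }
    return n_threads;
}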
@@ -6408,11 +6797,13 @@ struct llama_grammar_candidate {
 // Decodes a UTF-8 string which may end in an incomplete sequence. Adds a terminating 0 for use as
 // pointer. If an invalid sequence is encountered, returns `llama_partial_utf8.n_remain == -1`.
 static std::pair<std::vector<uint32_t>, llama_partial_utf8> decode_utf8(
-        const char         * src,
+        const std::string & src,
         llama_partial_utf8   partial_start) {
     static const int      lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 2, 2, 3, 4 };
-    const char          * pos      = src;
+    const char          * pos      = src.c_str();
     std::vector<uint32_t> code_points;
+    // common english strings have the same number of codepoints and bytes. `+ 1` for the terminating 0.
+    code_points.reserve(src.size() + 1);
     uint32_t              value    = partial_start.value;
     int                   n_remain = partial_start.n_remain;
 
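Note: taking const std::string & lets the decoder size its output up front: ASCII is one code point per byte, so src.size() + 1 (the + 1 covers the terminating 0) is exact for English text and a safe upper bound otherwise. A sketch of that bound:

#include <cstdint>
#include <string>
#include <vector>

// sketch: upper-bound reserve for a UTF-8 to code-point conversion
std::vector<uint32_t> decode_ascii_fast(const std::string & src) {
    std::vector<uint32_t> code_points;
    code_points.reserve(src.size() + 1);  // exact for ASCII, upper bound otherwise
    for (unsigned char c : src) {
        code_points.push_back(c);         // the real decoder handles multi-byte sequences
    }
    code_points.push_back(0);             // terminating 0, as in decode_utf8
    return code_points;
}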
@@ -7016,6 +7407,7 @@ void llama_sample_typical(struct llama_context * ctx, llama_token_data_array * c
     // Replace the data in candidates with the new_candidates data
     std::copy(new_candidates.begin(), new_candidates.end(), candidates->data);
     candidates->size = new_candidates.size();
+    candidates->sorted = false;
 
     if (ctx) {
         ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
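Note: llama_sample_typical rewrites candidates->data in typicality order, which is not logit order, so the sorted flag must be cleared or a later sampler could skip a needed re-sort. Sketch with stand-in structs mirroring llama_token_data_array:

#include <algorithm>
#include <cstddef>

struct token_data { int id; float logit; float p; };            // mirrors llama_token_data
struct token_data_array { token_data * data; size_t size; bool sorted; };

void overwrite_candidates(token_data_array * candidates,
                          const token_data * src, size_t n) {
    std::copy(src, src + n, candidates->data);
    candidates->size   = n;
    candidates->sorted = false; // new order is by typicality, not by logit
}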
@@ -7100,11 +7492,13 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
     const llama_token eos = llama_token_eos(&ctx->model);
 
     std::vector<std::pair<std::vector<uint32_t>, llama_partial_utf8>> candidates_decoded;
+    candidates_decoded.reserve(candidates->size);
     std::vector<llama_grammar_candidate>                              candidates_grammar;
+    candidates_grammar.reserve(candidates->size);
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id      = candidates->data[i].id;
-        const std::string piece   = ctx->model.vocab.id_to_token[id].text;
+        const std::string & piece = ctx->model.vocab.id_to_token[id].text;
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
@@ -7112,7 +7506,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
         } else if (piece.empty() || piece[0] == 0) {
             candidates->data[i].logit = -INFINITY;
         } else {
-            candidates_decoded.push_back(decode_utf8(piece.c_str(), grammar->partial_utf8));
+            candidates_decoded.push_back(decode_utf8(piece, grammar->partial_utf8));
             candidates_grammar.push_back({ i, candidates_decoded.back().first.data(), candidates_decoded.back().second });
         }
     }
@@ -7316,10 +7710,10 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string piece   = ctx->model.vocab.id_to_token[token].text;
+    const std::string & piece = ctx->model.vocab.id_to_token[token].text;
 
     // Note terminating 0 in decoded string
-    const auto   decoded     = decode_utf8(piece.c_str(), grammar->partial_utf8);
+    const auto   decoded     = decode_utf8(piece, grammar->partial_utf8);
     const auto & code_points = decoded.first;
     for (auto it = code_points.begin(), end = code_points.end() - 1; it != end; ++it) {
         grammar->stacks = llama_grammar_accept(grammar->rules, grammar->stacks, *it);
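Note: both grammar paths now bind the vocab text by reference and pass the std::string straight into decode_utf8, saving one string copy per candidate token; the reserve() calls above remove reallocation from the same hot loop. Sketch of the reference-binding pattern:

#include <string>
#include <vector>

struct vocab_entry { std::string text; };

// sketch: binding by reference avoids one std::string copy per candidate
void score_candidates(const std::vector<vocab_entry> & vocab,
                      const std::vector<int> & candidate_ids) {
    std::vector<std::string> decoded;
    decoded.reserve(candidate_ids.size());          // no reallocation in the loop
    for (int id : candidate_ids) {
        const std::string & piece = vocab[id].text; // reference, not a copy
        decoded.push_back(piece);                   // copy happens only where needed
    }
}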
@@ -7637,18 +8031,21 @@ static void llama_convert_tensor_internal(
         return;
     }
 
-    auto block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
-    auto block_size_bytes = ggml_type_size(tensor->type);
+    size_t block_size = tensor->type == GGML_TYPE_F16 ? 1 : (size_t)ggml_blck_size(tensor->type);
+    size_t block_size_bytes = ggml_type_size(tensor->type);
 
     GGML_ASSERT(nelements % block_size == 0);
-    auto nblocks = nelements / block_size;
-    auto blocks_per_thread = nblocks / nthread;
-    auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+    size_t nblocks = nelements / block_size;
+    size_t blocks_per_thread = nblocks / nthread;
+    size_t spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
+
+    size_t in_buff_offs = 0;
+    size_t out_buff_offs = 0;
 
-    for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
-        auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
-        auto thr_elems = thr_blocks * block_size; // number of elements for this thread
-        auto thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
+    for (int tnum = 0; tnum < nthread; tnum++) {
+        size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
+        size_t thr_elems = thr_blocks * block_size; // number of elements for this thread
+        size_t thr_block_bytes = thr_blocks * block_size_bytes; // number of input bytes for this thread
 
         auto compute = [qtype] (ggml_type typ, uint8_t * inbuf, float * outbuf, int nels) {
             if (typ == GGML_TYPE_F16) {
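Note: the old loop deduced its counters with auto (ending up as int), which could overflow on multi-gigabyte tensors; the rewrite uses explicit size_t and hands the remainder blocks to the last thread. Worked sketch of the partitioning:

#include <cstddef>
#include <cstdio>

int main() {
    const size_t nblocks = 1003;  // hypothetical block count
    const int    nthread = 4;
    const size_t blocks_per_thread = nblocks / nthread;
    const size_t spare_blocks      = nblocks - blocks_per_thread * nthread;

    for (int tnum = 0; tnum < nthread; tnum++) {
        // the last thread picks up the remainder, exactly as in the hunk above
        const size_t thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0);
        std::printf("thread %d: %zu blocks\n", tnum, thr_blocks); // 250 250 250 253
    }
    return 0;
}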
@@ -7818,7 +8215,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     constexpr bool use_mmap = false;
 #endif
 
-    llama_model_loader ml(fname_inp, use_mmap);
+    llama_model_loader ml(fname_inp, use_mmap, NULL);
     if (ml.use_mmap) {
         ml.mapping.reset(new llama_mmap(&ml.file, /* prefetch */ 0, ggml_is_numa()));
     }
@@ -8114,7 +8511,7 @@ static int llama_apply_lora_from_file_internal(
     std::vector<uint8_t> base_buf;
     if (path_base_model) {
         LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
-        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true));
+        ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ NULL));
 
         size_t ctx_size;
         size_t mmapped_size;
@@ -8342,6 +8739,7 @@ struct llama_model_params llama_model_default_params() {
         /*.tensor_split                =*/ nullptr,
         /*.progress_callback           =*/ nullptr,
         /*.progress_callback_user_data =*/ nullptr,
+        /*.kv_overrides                =*/ nullptr,
         /*.vocab_only                  =*/ false,
         /*.use_mmap                    =*/ true,
         /*.use_mlock                   =*/ false,
@@ -8369,10 +8767,12 @@ struct llama_context_params llama_context_default_params() {
         /*.yarn_beta_fast            =*/ 32.0f,
         /*.yarn_beta_slow            =*/ 1.0f,
         /*.yarn_orig_ctx             =*/ 0,
+        /*.type_k                    =*/ GGML_TYPE_F16,
+        /*.type_v                    =*/ GGML_TYPE_F16,
         /*.mul_mat_q                 =*/ true,
-        /*.f16_kv                    =*/ true,
         /*.logits_all                =*/ false,
         /*.embedding                 =*/ false,
+        /*.offload_kqv               =*/ true,
     };
 
     return result;
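Note: the boolean f16_kv is gone; callers now pick the K and V cache types independently and can keep the cache on the GPU via offload_kqv. A usage sketch against the updated C API (the field values here are illustrative; at this release only the K cache is expected to tolerate quantized types):

#include "llama.h"

llama_context_params make_ctx_params() {
    llama_context_params cparams = llama_context_default_params();
    cparams.type_k      = GGML_TYPE_Q8_0; // quantized K cache to save memory
    cparams.type_v      = GGML_TYPE_F16;  // V cache stays f16 in this release
    cparams.offload_kqv = true;           // keep the KV cache and KQV ops on the GPU
    return cparams;
}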
@@ -8489,6 +8889,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.yarn_beta_fast = params.yarn_beta_fast;
     cparams.yarn_beta_slow = params.yarn_beta_slow;
     cparams.mul_mat_q      = params.mul_mat_q;
+    cparams.offload_kqv    = params.offload_kqv;
 
     cparams.n_ctx          = params.n_ctx == 0 ? hparams.n_ctx_train : params.n_ctx;
     cparams.rope_freq_base = params.rope_freq_base == 0.0f ? hparams.rope_freq_base_train : params.rope_freq_base;
@@ -8522,19 +8923,36 @@ struct llama_context * llama_new_context_with_model(
     ctx->rng = std::mt19937(params.seed);
     ctx->logits_all = params.logits_all;
 
-    ggml_type memory_type = params.f16_kv ? GGML_TYPE_F16 : GGML_TYPE_F32;
+    const ggml_type type_k = params.type_k;
+    const ggml_type type_v = params.type_v;
+
+    GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_k) == 0);
+    GGML_ASSERT(hparams.n_embd_head() % ggml_blck_size(type_v) == 0);
 
     // reserve memory for context buffers
     if (!hparams.vocab_only) {
-        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, memory_type, cparams.n_ctx, model->n_gpu_layers)) {
+        if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v, cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
             LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
             llama_free(ctx);
             return nullptr;
         }
 
         {
-            const size_t memory_size = ggml_nbytes(ctx->kv_self.k) + ggml_nbytes(ctx->kv_self.v);
-            LLAMA_LOG_INFO("%s: kv self size  = %7.2f MiB\n", __func__, memory_size / 1024.0 / 1024.0);
+            size_t memory_size_k = 0;
+            size_t memory_size_v = 0;
+
+            for (auto & k : ctx->kv_self.k_l) {
+                memory_size_k += ggml_nbytes(k);
+            }
+
+            for (auto & v : ctx->kv_self.v_l) {
+                memory_size_v += ggml_nbytes(v);
+            }
+
+            LLAMA_LOG_INFO("%s: KV self size  = %7.2f MiB, K (%s): %7.2f MiB, V (%s): %7.2f MiB\n", __func__,
+                (float)(memory_size_k + memory_size_v) / (1024.0f * 1024.0f),
+                ggml_type_name(type_k), (float)memory_size_k / (1024.0f * 1024.0f),
+                ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
         }
 
         // resized during inference
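Note: the log line now reports K and V separately because the two caches may use different types. Back-of-the-envelope sketch for a hypothetical 7B-class model (32 layers, n_embd 4096, n_ctx 4096, f16 for both caches):

#include <cstdio>

int main() {
    // hypothetical 7B-class shapes
    const size_t n_layer = 32, n_ctx = 4096, n_embd = 4096;
    const size_t bytes_per_elt = 2; // f16

    const size_t k_bytes = n_layer * n_ctx * n_embd * bytes_per_elt;
    const size_t v_bytes = n_layer * n_ctx * n_embd * bytes_per_elt;

    std::printf("K: %.2f MiB, V: %.2f MiB, total: %.2f MiB\n",
                k_bytes / (1024.0 * 1024.0),
                v_bytes / (1024.0 * 1024.0),
                (k_bytes + v_bytes) / (1024.0 * 1024.0)); // 1024, 1024, 2048
    return 0;
}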
@@ -8564,8 +8982,6 @@ struct llama_context * llama_new_context_with_model(
 
 #ifdef GGML_USE_METAL
     if (model->n_gpu_layers > 0) {
-        ggml_metal_log_set_callback(llama_log_callback_default, NULL);
-
         ctx->ctx_metal = ggml_metal_init(1);
         if (!ctx->ctx_metal) {
             LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
@@ -8607,8 +9023,12 @@ struct llama_context * llama_new_context_with_model(
         }
 
         size_t kv_vram_size = 0;
-        add_tensor(ctx->kv_self.k, kv_vram_size);
-        add_tensor(ctx->kv_self.v, kv_vram_size);
+        for (auto & k : ctx->kv_self.k_l) {
+            add_tensor(k, kv_vram_size);
+        }
+        for (auto & v : ctx->kv_self.v_l) {
+            add_tensor(v, kv_vram_size);
+        }
 
         size_t ctx_vram_size   = alloc_size + kv_vram_size;
         size_t total_vram_size = model_vram_size + ctx_vram_size;
@@ -9078,37 +9498,45 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
         data_ctx->write(&kv_used,     sizeof(kv_used));
 
         if (kv_buf_size) {
-            const size_t elt_size = ggml_element_size(kv_self.k);
+            const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
 
-            ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+            ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
             ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
-            ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
-            std::vector<uint8_t> kout3d_data(ggml_nbytes(kout3d), 0);
-            kout3d->data = kout3d_data.data();
+            std::vector<std::vector<uint8_t>> kout2d_data(n_layer);
+            std::vector<std::vector<uint8_t>> vout2d_data(n_layer);
 
-            ggml_tensor * vout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
-            std::vector<uint8_t> vout3d_data(ggml_nbytes(vout3d), 0);
-            vout3d->data = vout3d_data.data();
+            for (int il = 0; il < (int) n_layer; ++il) {
+                ggml_tensor * kout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+                kout2d_data[il].resize(ggml_nbytes(kout2d));
+                kout2d->data = kout2d_data[il].data();
 
-            ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                n_embd, kv_head, n_layer,
-                elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+                ggml_tensor * vout2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+                vout2d_data[il].resize(ggml_nbytes(vout2d));
+                vout2d->data = vout2d_data[il].data();
 
-            ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-                kv_head, n_embd, n_layer,
-                elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+                ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
+                    n_embd, kv_head,
+                    elt_size*n_embd, 0);
+
+                ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
+                    kv_head, n_embd,
+                    elt_size*n_ctx, 0);
+
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d));
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d));
+            }
 
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k3d, kout3d));
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v3d, vout3d));
             ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
             ggml_free(cpy_ctx);
 
-            // our data is now in the kout3d_data and vout3d_data buffers
+            // our data is now in the kout2d_data and vout2d_data buffers
             // write them to file
-            data_ctx->write(kout3d_data.data(), kout3d_data.size());
-            data_ctx->write(vout3d_data.data(), vout3d_data.size());
+            for (uint32_t il = 0; il < n_layer; ++il) {
+                data_ctx->write(kout2d_data[il].data(), kout2d_data[il].size());
+                data_ctx->write(vout2d_data[il].data(), vout2d_data[il].size());
+            }
         }
 
         for (uint32_t i = 0; i < kv_size; ++i) {
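Note: with the cache split into per-layer tensors k_l[il]/v_l[il], each possibly quantized, session save can no longer do a single 3-D copy; it builds a small per-layer copy graph and writes the buffers in K,V order per layer. The restore hunk below mirrors this, so the on-disk layout is simply K0,V0,K1,V1,... Sketch of the layout bookkeeping:

#include <cstddef>
#include <cstdint>
#include <vector>

// sketch: serialize per-layer KV buffers in a fixed K,V interleaved order
struct kv_writer {
    std::vector<uint8_t> out;
    void write(const uint8_t * p, size_t n) { out.insert(out.end(), p, p + n); }
};

void save_kv(kv_writer & w,
             const std::vector<std::vector<uint8_t>> & k_l,
             const std::vector<std::vector<uint8_t>> & v_l) {
    for (size_t il = 0; il < k_l.size(); ++il) {
        w.write(k_l[il].data(), k_l[il].size()); // layer il: K first ...
        w.write(v_l[il].data(), v_l[il].size()); // ... then V, matching the load order
    }
}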
@@ -9208,29 +9636,32 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         if (kv_buf_size) {
             GGML_ASSERT(kv_self.buf.size == kv_buf_size);
 
-            const size_t elt_size = ggml_element_size(kv_self.k);
+            const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
 
-            ggml_context * cpy_ctx = ggml_init({ 6*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
+            ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
             ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
 
-            ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_head, n_layer);
-            kin3d->data = (void *) inp;
-            inp += ggml_nbytes(kin3d);
+            for (int il = 0; il < n_layer; ++il) {
+                ggml_tensor * kin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd, kv_head);
+                kin2d->data = (void *) inp;
+                inp += ggml_nbytes(kin2d);
+
+                ggml_tensor * vin2d = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd);
+                vin2d->data = (void *) inp;
+                inp += ggml_nbytes(vin2d);
 
-            ggml_tensor * vin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.v->type, kv_head, n_embd, n_layer);
-            vin3d->data = (void *) inp;
-            inp += ggml_nbytes(vin3d);
+                ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
+                    n_embd, kv_head,
+                    elt_size*n_embd, 0);
 
-            ggml_tensor * k3d = ggml_view_3d(cpy_ctx, kv_self.k,
-                n_embd, kv_head, n_layer,
-                elt_size*n_embd, elt_size*n_embd*n_ctx, 0);
+                ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
+                    kv_head, n_embd,
+                    elt_size*n_ctx, 0);
 
-            ggml_tensor * v3d = ggml_view_3d(cpy_ctx, kv_self.v,
-                kv_head, n_embd, n_layer,
-                elt_size*n_ctx, elt_size*n_ctx*n_embd, 0);
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d, k2d));
+                ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d, v2d));
+            }
 
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin3d, k3d));
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin3d, v3d));
             ggml_graph_compute_helper(ctx->work_buffer, gf, /*n_threads*/ 1);
 
             ggml_free(cpy_ctx);
@@ -9701,6 +10132,9 @@ const std::vector<std::pair<std::string, struct ggml_tensor *>> & llama_internal
 void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
     g_state.log_callback_user_data = user_data;
+#ifdef GGML_USE_METAL
+    ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
+#endif
 }
 
 static void llama_log_internal_v(ggml_log_level level, const char * format, va_list args) {