llama_cpp 0.12.7 → 0.13.0
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/llama_cpp.cpp +72 -262
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -25
- data/vendor/tmp/llama.cpp/Makefile +8 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -2
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +96 -15
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1049 -38
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +25 -25
- data/vendor/tmp/llama.cpp/ggml-quants.c +1873 -218
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +292 -221
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +64 -52
- data/vendor/tmp/llama.cpp/ggml.c +318 -195
- data/vendor/tmp/llama.cpp/ggml.h +35 -19
- data/vendor/tmp/llama.cpp/llama.cpp +806 -531
- data/vendor/tmp/llama.cpp/llama.h +53 -65
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
--- a/data/vendor/tmp/llama.cpp/llama.cpp
+++ b/data/vendor/tmp/llama.cpp/llama.cpp
@@ -68,10 +68,12 @@
 #include <cstdio>
 #include <cstring>
 #include <ctime>
+#include <cwctype>
 #include <forward_list>
 #include <fstream>
 #include <functional>
 #include <initializer_list>
+#include <locale>
 #include <map>
 #include <memory>
 #include <mutex>
@@ -850,9 +852,9 @@ struct LLM_TN {
 //

 static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
-    {
-    {
-    {
+    { LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
+    { LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
+    { LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
 };

 static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
@@ -862,7 +864,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
         }
     }

-    return
+    return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
 }

 static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
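Note: these hunks belong to a repo-wide rename that adds a `TYPE` segment to enum value names (compare the removed `LLAMA_POOLING_MEAN` versus the added `LLAMA_POOLING_TYPE_MEAN` later in this diff), and the previously bare `return` now yields an explicit `LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED` sentinel. A minimal, self-contained sketch of the same lookup-with-fallback pattern; the names below are invented stand-ins, not the llama.cpp identifiers:

    // Sketch: map enum -> string, look up by value, fall back to a sentinel.
    #include <cstdint>
    #include <map>
    #include <string>

    enum scaling_type : int32_t { SCALING_UNSPECIFIED = -1, SCALING_NONE, SCALING_LINEAR, SCALING_YARN };

    static const std::map<int32_t, const char *> SCALING_TYPES = {
        { SCALING_NONE, "none" }, { SCALING_LINEAR, "linear" }, { SCALING_YARN, "yarn" },
    };

    static int32_t scaling_type_from_string(const std::string & name) {
        for (const auto & kv : SCALING_TYPES) {
            if (kv.second == name) { return kv.first; } // match by string value, return the enum key
        }
        return SCALING_UNSPECIFIED; // explicit sentinel instead of a bare return
    }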
@@ -1550,8 +1552,9 @@ static const size_t MiB = 1024*kiB;
 static const size_t GiB = 1024*MiB;

 struct llama_hparams {
-    bool
-    bool
+    bool vocab_only;
+    bool rope_finetuned;
+
     uint32_t n_vocab;
     uint32_t n_ctx_train; // context size the model was trained on
     uint32_t n_embd;
@@ -1580,7 +1583,8 @@ struct llama_hparams {
     bool causal_attn = true;
     bool need_kq_pos = false;

-
+    enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
+    enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;

     bool operator!=(const llama_hparams & other) const {
         if (this->vocab_only != other.vocab_only) return true;
@@ -1639,8 +1643,8 @@ struct llama_cparams {
     float yarn_attn_factor;
     float yarn_beta_fast;
     float yarn_beta_slow;
+    float defrag_thold;

-    bool mul_mat_q;
     bool offload_kqv;
     bool do_pooling;

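Note: `defrag_thold` is the per-context knob behind the KV-cache defragmentation machinery added throughout this diff; the decode path (see the final hunk of this section) only computes fragmentation when `defrag_thold >= 0.0f`, so a negative threshold disables the feature. A hedged sketch of the gate, reusing the formula that appears at the end of this diff (the comparison direction is assumed, since that hunk is truncated here):

    // Sketch: fragmentation gate mirroring the check in llama_decode_internal.
    // 'n' is the active KV window, 'used' the occupied cells; illustrative only.
    static bool should_defrag(uint32_t n, uint32_t used, uint32_t n_tokens, float defrag_thold) {
        if (defrag_thold < 0.0f) {
            return false; // negative threshold disables defragmentation
        }
        // tiny windows are skipped, where the measure would be noisy
        const float fragmentation = n >= 128 ? 1.0f - float(used + n_tokens)/float(n) : 0.0f;
        return fragmentation > defrag_thold; // assumed comparison
    }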
@@ -1707,11 +1711,20 @@ struct llama_kv_cell {
     bool has_seq_id(const llama_seq_id & id) const {
         return seq_id.find(id) != seq_id.end();
     }
+
+    bool is_empty() const {
+        return seq_id.empty();
+    }
+
+    bool is_same_seq(const llama_kv_cell & other) const {
+        return seq_id == other.seq_id;
+    }
 };

 // ring-buffer of cached KV data
 struct llama_kv_cache {
     bool has_shift = false;
+    bool do_defrag = false;

     // Note: The value of head isn't only used to optimize searching
     // for a free KV slot. llama_decode_internal also uses it, so it
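Note: `is_empty()` replaces the open-coded emptiness checks that several later hunks switch over to, and `is_same_seq()` supports the new defragmentation logic when deciding whether cells are interchangeable. A self-contained sketch of the cell semantics; the field types are assumptions read off the usage above (`llama_pos`/`llama_seq_id` are integer typedefs, `seq_id` behaves like a `std::set`):

    #include <cstdint>
    #include <set>

    using llama_pos    = int32_t;
    using llama_seq_id = int32_t;

    struct kv_cell {
        llama_pos pos   = -1;
        llama_pos delta = 0;
        std::set<llama_seq_id> seq_id;

        bool has_seq_id(const llama_seq_id & id) const { return seq_id.find(id) != seq_id.end(); }
        bool is_empty() const { return seq_id.empty(); }                      // no sequence uses this cell
        bool is_same_seq(const kv_cell & other) const { return seq_id == other.seq_id; }
    };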
@@ -1723,6 +1736,9 @@ struct llama_kv_cache {
     // computed before each graph build
     uint32_t n = 0;

+    ggml_type type_k = GGML_TYPE_F16;
+    ggml_type type_v = GGML_TYPE_F16;
+
     std::vector<llama_kv_cell> cells;

     std::vector<struct ggml_tensor *> k_l; // per layer
@@ -1958,8 +1974,8 @@ struct llama_context {
 static bool llama_kv_cache_init(
     struct llama_kv_cache & cache,
     const llama_model & model,
-    ggml_type
-    ggml_type
+    ggml_type type_k,
+    ggml_type type_v,
     uint32_t n_ctx,
     bool offload) {
     const struct llama_hparams & hparams = model.hparams;
@@ -1974,6 +1990,9 @@ static bool llama_kv_cache_init(
     cache.size = n_ctx;
     cache.used = 0;

+    cache.type_k = type_k;
+    cache.type_v = type_v;
+
     cache.cells.clear();
     cache.cells.resize(n_ctx);

@@ -2014,8 +2033,8 @@ static bool llama_kv_cache_init(

     for (int i = 0; i < (int) n_layer; i++) {
         struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
-        ggml_tensor * k = ggml_new_tensor_1d(ctx,
-        ggml_tensor * v = ggml_new_tensor_1d(ctx,
+        ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx);
+        ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx);
         ggml_format_name(k, "cache_k_l%d", i);
         ggml_format_name(v, "cache_v_l%d", i);
         cache.k_l.push_back(k);
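Note: the cache now records its element types and allocates one K and one V tensor per layer, each with `n_embd_k_gqa*n_ctx` (resp. `n_embd_v_gqa*n_ctx`) elements of the requested `type_k`/`type_v`. A rough sizing sketch under the F16 default (2 bytes per element; the model numbers below are illustrative, not from the diff):

    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint64_t n_layer = 32, n_ctx = 4096;
        const uint64_t n_embd_k_gqa = 4096, n_embd_v_gqa = 4096; // no GQA shrink in this example
        const uint64_t bytes_f16 = 2;
        const uint64_t total = n_layer * n_ctx * (n_embd_k_gqa + n_embd_v_gqa) * bytes_f16;
        printf("%.2f GiB\n", total / (1024.0*1024.0*1024.0)); // prints 2.00 GiB
    }

A quantized `type_k`/`type_v` would shrink this proportionally to its bits per weight.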
@@ -2099,7 +2118,7 @@ static bool llama_kv_cache_find_slot(
 // find how many cells are currently in use
 static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
     for (uint32_t i = cache.size - 1; i > 0; --i) {
-        if (cache.cells[i].pos >= 0 && !cache.cells[i].
+        if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) {
             return i + 1;
         }
     }
@@ -2135,7 +2154,7 @@ static void llama_kv_cache_seq_rm(
         } else {
             continue;
         }
-        if (cache.cells[i].
+        if (cache.cells[i].is_empty()) {
             // keep count of the number of used cells
             if (cache.cells[i].pos >= 0) cache.used--;

@@ -2186,7 +2205,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
     if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
 }

-static void
+static void llama_kv_cache_seq_add(
     struct llama_kv_cache & cache,
     llama_seq_id seq_id,
     llama_pos p0,
@@ -2204,10 +2223,14 @@ static void llama_kv_cache_seq_shift(
             cache.cells[i].delta += delta;

             if (cache.cells[i].pos < 0) {
-                if (!cache.cells[i].
+                if (!cache.cells[i].is_empty()) {
+                    cache.used--;
+                }
                 cache.cells[i].pos = -1;
                 cache.cells[i].seq_id.clear();
-                if (new_head == cache.size)
+                if (new_head == cache.size) {
+                    new_head = i;
+                }
             }
         }
     }
@@ -2239,6 +2262,22 @@ static void llama_kv_cache_seq_div(
         }
     }

+static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
+    llama_pos result = 0;
+
+    for (uint32_t i = 0; i < cache.size; ++i) {
+        if (cache.cells[i].has_seq_id(seq_id)) {
+            result = std::max(result, cache.cells[i].pos);
+        }
+    }
+
+    return result;
+}
+
+static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
+    cache.do_defrag = true;
+}
+
 //
 // model loading and saving
 //
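Note: `llama_kv_cache_defrag` only raises the `do_defrag` flag; the actual cell moves happen later, when the decode path calls `llama_kv_cache_update` (added further down in this diff). A self-contained mirror of that flag-then-update pattern, reduced to its shape:

    // Illustrative only: flag now, pay later.
    struct cache { bool do_defrag = false; };

    static void cache_defrag(cache & c) { c.do_defrag = true; } // cheap: just sets a flag

    static void cache_update(cache & c) {
        if (c.do_defrag) {
            // ... build and run the defrag graph here (elided) ...
            c.do_defrag = false;
        }
    }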
@@ -2310,7 +2349,7 @@ namespace GGUFMeta {
         }
     };

-    struct ArrayInfo{
+    struct ArrayInfo {
         const gguf_type gt;
         const size_t length;
         const void * data;
@@ -2329,7 +2368,7 @@ namespace GGUFMeta {
     };

     template<typename T>
-    class GKV: public GKV_Base<T> {
+    class GKV : public GKV_Base<T> {
         GKV() = delete;

         public:
@@ -2345,46 +2384,46 @@ namespace GGUFMeta {

         static const char * override_type_to_str(const llama_model_kv_override_type ty) {
             switch (ty) {
-                case
-                case
-                case
+                case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
+                case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
+                case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
             }
             return "unknown";
         }

-        static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *
-            if (!
-            if (
+        static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
+            if (!ovrd) { return false; }
+            if (ovrd->tag == expected_type) {
                 LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
-                    __func__, override_type_to_str(
-                switch (
-                    case
-                        LLAMA_LOG_INFO("%s\n",
+                    __func__, override_type_to_str(ovrd->tag), ovrd->key);
+                switch (ovrd->tag) {
+                    case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
+                        LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
                     } break;
-                    case
-                        LLAMA_LOG_INFO("%" PRId64 "\n",
+                    case LLAMA_KV_OVERRIDE_TYPE_INT: {
+                        LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
                     } break;
-                    case
-                        LLAMA_LOG_INFO("%.6f\n",
+                    case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
+                        LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
                     } break;
                     default:
                         // Shouldn't be possible to end up here, but just in case...
                         throw std::runtime_error(
                             format("Unsupported attempt to override %s type for metadata key %s\n",
-                                override_type_to_str(
+                                override_type_to_str(ovrd->tag), ovrd->key));
                 }
                 return true;
             }
             LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
-                __func__,
+                __func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
             return false;
         }

         template<typename OT>
         static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
-        try_override(OT & target, const struct llama_model_kv_override *
-            if (validate_override(
-                target =
+        try_override(OT & target, const struct llama_model_kv_override * ovrd) {
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
+                target = ovrd->bool_value;
                 return true;
             }
             return false;
@@ -2392,9 +2431,9 @@ namespace GGUFMeta {

         template<typename OT>
         static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
-        try_override(OT & target, const struct llama_model_kv_override *
-            if (validate_override(
-                target =
+        try_override(OT & target, const struct llama_model_kv_override * ovrd) {
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
+                target = ovrd->int_value;
                 return true;
             }
             return false;
@@ -2402,9 +2441,9 @@ namespace GGUFMeta {

         template<typename OT>
         static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
-        try_override(T & target, const struct llama_model_kv_override *
-            if (validate_override(
-                target =
+        try_override(T & target, const struct llama_model_kv_override * ovrd) {
+            if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
+                target = ovrd->float_value;
                 return true;
             }
             return false;
@@ -2412,17 +2451,17 @@ namespace GGUFMeta {

         template<typename OT>
         static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
-        try_override(T & target, const struct llama_model_kv_override *
+        try_override(T & target, const struct llama_model_kv_override * ovrd) {
             (void)target;
-            (void)
-            if (!
+            (void)ovrd;
+            if (!ovrd) { return false; }
             // Currently, we should never end up here so it would be a bug if we do.
             throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
-
+                ovrd ? ovrd->key : "NULL"));
         }

-        static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *
-            if (try_override<T>(target,
+        static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
+            if (try_override<T>(target, ovrd)) {
                 return true;
             }
             if (k < 0) { return false; }
@@ -2430,12 +2469,12 @@ namespace GGUFMeta {
             return true;
         }

-        static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *
-            return set(ctx, gguf_find_key(ctx, key), target,
+        static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
+            return set(ctx, gguf_find_key(ctx, key), target, ovrd);
         }

-        static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *
-            return set(ctx, key.c_str(), target,
+        static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
+            return set(ctx, key.c_str(), target, ovrd);
         }
     };
 }
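Note: the restored `ovrd` parameter defaults to `nullptr`, so existing `set(...)` callers are unaffected; an override only takes effect when a populated `llama_model_kv_override` is passed and its `tag` matches the expected type. A hedged sketch of populating one (the `tag`, `key`, and `bool_value` fields appear in the hunks above; the exact key buffer layout is an assumption):

    // Sketch: a boolean metadata override. Illustrative, not library docs.
    #include <cstdio>

    llama_model_kv_override ovrd{};
    ovrd.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
    std::snprintf(ovrd.key, sizeof(ovrd.key), "%s", "tokenizer.ggml.add_bos_token");
    ovrd.bool_value = false;
    // Passing &ovrd down to GKV<bool>::set(...) would now log
    // "Using metadata override" and write 'false' into the target.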
@@ -2542,9 +2581,12 @@ struct llama_model_loader {
             case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
             case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
             case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
+            case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
             case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
             case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
             case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
+            case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
+            case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
             default:
                 {
                     LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2845,6 +2887,15 @@ struct llama_model_loader {
     }
 };

+template<>
+bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
+    uint32_t tmp;
+    const bool found = get_key(kid, tmp, required);
+    result = (enum llama_pooling_type) tmp;
+    return found;
+}
+
+
 //
 // load LLaMA models
 //
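Note: GGUF stores this key as an unsigned integer, so the specialization above round-trips through a `uint32_t` before casting to the enum; reading directly into an `enum` reference would not match any existing integral `get_key` overload. The same adapter shape in isolation (stand-in names, not the library API):

    #include <cstdint>

    enum class pooling { none, mean, cls };

    template <typename ReadU32>
    static bool get_pooling(ReadU32 && read_u32, pooling & result) {
        uint32_t tmp = 0;
        const bool found = read_u32(tmp);      // underlying integral read
        result = static_cast<pooling>(tmp);    // then cast to the enum
        return found;
    }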
@@ -2886,10 +2937,15 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
-        case
+        case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
         case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
+        case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";

         default: return "unknown, may not work";
     }
@@ -2923,16 +2979,16 @@ static const char * llama_model_type_name(e_model type) {
         default: return "?B";
     }
 }
+
 static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
     switch (type) {
-        case LLAMA_VOCAB_TYPE_SPM:
-        case LLAMA_VOCAB_TYPE_BPE:
-        case LLAMA_VOCAB_TYPE_WPM:
-        default:
+        case LLAMA_VOCAB_TYPE_SPM: return "SPM";
+        case LLAMA_VOCAB_TYPE_BPE: return "BPE";
+        case LLAMA_VOCAB_TYPE_WPM: return "WPM";
+        default: return "unknown";
     }
 }

-
 static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
     model.arch = ml.get_arch();
     if (model.arch == LLM_ARCH_UNKNOWN) {
@@ -2996,7 +3052,7 @@ static void llm_load_hparams(
     std::string rope_scaling("linear");
     ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
     hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
-    GGML_ASSERT(hparams.rope_scaling_type_train !=
+    GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);

     // rope_freq_scale (inverse of the kv) is optional
     float ropescale = 0.0f;
@@ -3109,10 +3165,10 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_BERT:
             {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,
-                ml.get_key(LLM_KV_ATTENTION_CAUSAL,
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-                ml.get_key(LLM_KV_POOLING_TYPE,
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);

                 switch (hparams.n_layer) {
                     case 3:
@@ -3130,10 +3186,10 @@ static void llm_load_hparams(
             } break;
         case LLM_ARCH_NOMIC_BERT:
             {
-                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,
-                ml.get_key(LLM_KV_ATTENTION_CAUSAL,
+                ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
+                ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
                 ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
-                ml.get_key(LLM_KV_POOLING_TYPE,
+                ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);

                 if (hparams.n_layer == 12 && hparams.n_embd == 768) {
                     model.type = e_model::MODEL_137M;
@@ -3272,6 +3328,8 @@ static void llm_load_hparams(
     if (hparams.f_max_alibi_bias > 0.0f) {
         hparams.need_kq_pos = true;
     }
+
+    hparams.rope_type = llama_rope_type(&model);
 }

 // TODO: This should probably be in llama.h
@@ -3574,6 +3632,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
     LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
     LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
     LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
+    LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
+    LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
     LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
     LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
     LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
@@ -3640,7 +3700,7 @@ static bool llm_load_tensors(
         model.buft_layer[i] = llama_default_buffer_type_cpu(true);
     }

-    if (split_mode ==
+    if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
         // calculate the split points
         int device_count = llama_get_device_count();
         bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
@@ -3679,10 +3739,10 @@ static bool llm_load_tensors(
         }
     } else {
         ggml_backend_buffer_type_t split_buft;
-        if (split_mode ==
+        if (split_mode == LLAMA_SPLIT_MODE_ROW) {
             split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
         } else {
-            //
+            // LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
             split_buft = llama_default_buffer_type_offload(main_gpu);
         }
         // assign the repeating layers
@@ -4595,12 +4655,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam

 using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;

-enum llm_rope_type {
-    LLM_ROPE,
-    LLM_ROPE_NEOX,
-    LLM_ROPE_GLM,
-};
-
 enum llm_ffn_op_type {
     LLM_FFN_SILU,
     LLM_FFN_GELU,
@@ -4646,55 +4700,6 @@ static struct ggml_tensor * llm_build_inp_embd(
     return inpL;
 }

-// Persimmon: n_rot = n_embd_head_k/2
-// Other: n_rot = n_embd_head_k
-static void llm_build_k_shift(
-    struct ggml_context * ctx,
-    const llama_hparams & hparams,
-    const llama_cparams & cparams,
-    const llama_kv_cache & kv,
-    struct ggml_cgraph * graph,
-    struct ggml_tensor * K_shift,
-    llm_rope_type type,
-    int64_t n_ctx,
-    float freq_base,
-    float freq_scale,
-    const llm_build_cb & cb) {
-    const int64_t n_layer = hparams.n_layer;
-    const int64_t n_head_kv = hparams.n_head_kv;
-    const int64_t n_embd_head_k = hparams.n_embd_head_k;
-    const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
-    const int32_t n_rot = hparams.n_rot;
-    const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
-    const float ext_factor = cparams.yarn_ext_factor;
-    const float attn_factor = cparams.yarn_attn_factor;
-    const float beta_fast = cparams.yarn_beta_fast;
-    const float beta_slow = cparams.yarn_beta_slow;
-
-    int rope_type = 0;
-
-    switch (type) {
-        case LLM_ROPE: rope_type = 0; break;
-        case LLM_ROPE_NEOX: rope_type = 2; break;
-        case LLM_ROPE_GLM: rope_type = 4; break;
-    }
-
-    for (int il = 0; il < n_layer; ++il) {
-        struct ggml_tensor * tmp =
-            // we rotate only the first n_rot dimensions
-            ggml_rope_custom_inplace(ctx,
-                    ggml_view_3d(ctx, kv.k_l[il],
-                        n_embd_head_k, n_head_kv, n_ctx,
-                        ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
-                        ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
-                        0),
-                    K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
-                    ext_factor, attn_factor, beta_fast, beta_slow);
-        cb(tmp, "K_shifted", il);
-        ggml_build_forward_expand(graph, tmp);
-    }
-}
-
 static void llm_build_kv_store(
     struct ggml_context * ctx,
     const llama_hparams & hparams,
@@ -4896,8 +4901,8 @@ static struct ggml_tensor * llm_build_kqv(
         ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
     }

-#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE)
-#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan,
+#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE)
+#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, and Kompute")
 #pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
 #pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
     if (hparams.f_max_alibi_bias > 0.0f) {
@@ -4998,6 +5003,7 @@ struct llm_build_context {

     const int64_t n_embd;
     const int64_t n_layer;
+    const int64_t n_rot;
     const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
     const int64_t n_head;
     const int64_t n_head_kv;
@@ -5022,8 +5028,8 @@ struct llm_build_context {
     const int32_t kv_head; // index of where we store new KV data in the cache
     const int32_t n_orig_ctx;

-    const
-    const
+    const enum llama_pooling_type pooling_type;
+    const enum llama_rope_type rope_type;

     const llm_build_cb & cb;

@@ -5045,6 +5051,7 @@ struct llm_build_context {
         kv_self (lctx.kv_self),
         n_embd (hparams.n_embd),
         n_layer (hparams.n_layer),
+        n_rot (hparams.n_rot),
         n_ctx (cparams.n_ctx),
         n_head (hparams.n_head),
         n_head_kv (hparams.n_head_kv),
@@ -5066,8 +5073,8 @@ struct llm_build_context {
         n_kv (worst_case ? n_ctx : kv_self.n),
         kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
         n_orig_ctx (cparams.n_yarn_orig_ctx),
-
-
+        pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE),
+        rope_type (hparams.rope_type),
         cb (cb),
         buf_compute_meta (lctx.buf_compute_meta) {
             // all initializations should be done in init()
@@ -5090,6 +5097,76 @@ struct llm_build_context {
         }
     }

+    struct ggml_cgraph * build_k_shift() {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        for (int il = 0; il < n_layer; ++il) {
+            struct ggml_tensor * tmp =
+                // we rotate only the first n_rot dimensions
+                ggml_rope_custom_inplace(ctx0,
+                        ggml_view_3d(ctx0, kv_self.k_l[il],
+                            n_embd_head_k, n_head_kv, n_ctx,
+                            ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
+                            ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                            0),
+                        lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
+                        ext_factor, attn_factor, beta_fast, beta_slow);
+            cb(tmp, "K_shifted", il);
+            ggml_build_forward_expand(gf, tmp);
+        }
+
+        return gf;
+    }
+
+    struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
+        struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
+
+        for (uint32_t i = 0; i < ids.size(); ++i) {
+            const uint32_t id = ids[i];
+
+            if (i == id || id == ids.size()) {
+                continue;
+            }
+
+            uint32_t nm = 1;
+
+            while (i + nm < ids.size() && ids[i + nm] == id + nm) {
+                nm++;
+            }
+
+            for (int il = 0; il < n_layer; ++il) {
+                ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
+                        n_embd_k_gqa, nm,
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
+
+                ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
+                        n_embd_k_gqa, nm,
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
+                        ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
+
+                ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
+                        nm, n_embd_v_gqa,
+                        ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                        ggml_row_size(kv_self.v_l[il]->type, i));
+
+                ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
+                        nm, n_embd_v_gqa,
+                        ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
+                        ggml_row_size(kv_self.v_l[il]->type, id));
+
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
+            }
+
+            i += nm - 1;
+        }
+
+        //LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
+
+        return gf;
+    }
+
     struct ggml_cgraph * build_llama() {
         struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

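Note: in `build_defrag`, `ids` maps each current cell index to its destination: `ids[i] == i` means the cell stays put, `ids[i] == ids.size()` marks a cell that is not moved, and runs of `nm` consecutive cells landing on consecutive destinations are copied as a single 2-D view per layer (one K copy and one V copy). A self-contained sketch of the same run-coalescing walk over a plain array (data values are illustrative):

    #include <cstdio>
    #include <vector>

    int main() {
        // cells 4 and 5 move down to slots 2 and 3; everything else stays or is skipped
        const std::vector<unsigned> ids = {0, 1, 7, 7, 2, 3, 7}; // ids.size() == 7
        for (unsigned i = 0; i < ids.size(); ++i) {
            const unsigned id = ids[i];
            if (i == id || id == ids.size()) continue; // in place, or not moved
            unsigned nm = 1;                           // coalesce a consecutive run
            while (i + nm < ids.size() && ids[i + nm] == id + nm) nm++;
            printf("move %u cells: src %u -> dst %u\n", nm, i, id); // "move 2 cells: src 4 -> dst 2"
            i += nm - 1;
        }
    }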
@@ -5111,11 +5188,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -5151,14 +5223,14 @@ struct llm_build_context {

                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -5299,11 +5371,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
         cb(KQ_pos, "KQ_pos", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -5327,12 +5394,12 @@ struct llm_build_context {
             case MODEL_7B:
                 Qcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 Kcur = ggml_rope_custom(
                     ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-
+                    n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                     ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 break;
@@ -5417,11 +5484,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * attn_norm;

@@ -5460,13 +5522,13 @@ struct llm_build_context {

             // using mode = 2 for neox mode
             Qcur = ggml_rope_custom(
-                ctx0, Qcur, inp_pos,
+                ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
                 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);

             Kcur = ggml_rope_custom(
-                ctx0, Kcur, inp_pos,
+                ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
                 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Kcur, "Kcur", il);
@@ -5636,10 +5698,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * residual = inpL;

@@ -5697,7 +5755,7 @@ struct llm_build_context {

             // RoPE the first n_rot of q/k, pass the other half, and concat.
             struct ggml_tensor * qrot = ggml_view_3d(
-                ctx0, tmpq,
+                ctx0, tmpq, n_rot, n_head, n_tokens,
                 ggml_element_size(tmpq) * n_embd_head,
                 ggml_element_size(tmpq) * n_embd_head * n_head,
                 0
@@ -5705,7 +5763,7 @@ struct llm_build_context {
             cb(qrot, "qrot", il);

             struct ggml_tensor * krot = ggml_view_3d(
-                ctx0, tmpk,
+                ctx0, tmpk, n_rot, n_head, n_tokens,
                 ggml_element_size(tmpk) * n_embd_head,
                 ggml_element_size(tmpk) * n_embd_head * n_head,
                 0
@@ -5714,29 +5772,29 @@ struct llm_build_context {

             // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
             struct ggml_tensor * qpass = ggml_view_3d(
-                ctx0, tmpq,
+                ctx0, tmpq, n_rot, n_head, n_tokens,
                 ggml_element_size(tmpq) * n_embd_head,
                 ggml_element_size(tmpq) * n_embd_head * n_head,
-                ggml_element_size(tmpq) *
+                ggml_element_size(tmpq) * n_rot
             );
             cb(qpass, "qpass", il);

             struct ggml_tensor * kpass = ggml_view_3d(
-                ctx0, tmpk,
+                ctx0, tmpk, n_rot, n_head, n_tokens,
                 ggml_element_size(tmpk) * n_embd_head,
                 ggml_element_size(tmpk) * n_embd_head * n_head,
-                ggml_element_size(tmpk) *
+                ggml_element_size(tmpk) * n_rot
             );
             cb(kpass, "kpass", il);

             struct ggml_tensor * qrotated = ggml_rope_custom(
-                ctx0, qrot, inp_pos,
+                ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
                 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(qrotated, "qrotated", il);

             struct ggml_tensor * krotated = ggml_rope_custom(
-                ctx0, krot, inp_pos,
+                ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
                 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(krotated, "krotated", il);
@@ -5988,14 +6046,14 @@ struct llm_build_context {

             Qcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-
+                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);

             Kcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-
+                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Kcur, "Kcur", il);
@@ -6047,12 +6105,12 @@ struct llm_build_context {
         cur = inpL;

         // pooling layer
-        if (pooling_type ==
+        if (pooling_type == LLAMA_POOLING_TYPE_MEAN) {
             cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
-        } else if (pooling_type ==
+        } else if (pooling_type == LLAMA_POOLING_TYPE_CLS) {
             cur = ggml_get_rows(ctx0, cur, inp_cls);
         } else {
-            GGML_ASSERT(pooling_type ==
+            GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type");
         }
         cb(cur, "result_embd", -1);

@@ -6284,11 +6342,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -6325,14 +6378,14 @@ struct llm_build_context {

             Qcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-
+                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);

             Kcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-
+                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Kcur, "Kcur", il);
@@ -6407,11 +6460,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -6441,13 +6489,13 @@ struct llm_build_context {

             // using mode = 2 for neox mode
             Qcur = ggml_rope_custom(
-                ctx0, Qcur, inp_pos,
+                ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
                 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);

             Kcur = ggml_rope_custom(
-                ctx0, Kcur, inp_pos,
+                ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
                 freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Kcur, "Kcur", il);
@@ -6521,11 +6569,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -6561,14 +6604,14 @@ struct llm_build_context {

             Qcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-
+                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);

             Kcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-
+                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Kcur, "Kcur", il);
@@ -6642,11 +6685,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
                 model.layers[il].attn_norm,
@@ -6684,7 +6722,7 @@ struct llm_build_context {
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);

                 Qcur = ggml_rope_custom(
-                    ctx0, Qcur, inp_pos,
+                    ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Qcur, "Qcur", il);
@@ -6695,7 +6733,7 @@ struct llm_build_context {
                 cb(Qcur, "Qcur", il);

                 Kcur = ggml_rope_custom(
-                    ctx0, Kcur, inp_pos,
+                    ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
                     freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
                 );
                 cb(Kcur, "Kcur", il);
@@ -6764,11 +6802,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {

             // norm
@@ -6792,14 +6825,14 @@ struct llm_build_context {
             cb(Vcur, "Vcur", il);

             Qcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Qcur,
-                n_embd_head,
+                ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
+                n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Qcur, "Qcur", il);

             Kcur = ggml_rope_custom(
-                ctx0, ggml_reshape_3d(ctx0, Kcur,
-                n_embd_head,
+                ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
+                n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Kcur, "Kcur", il);

@@ -6969,11 +7002,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             cur = llm_build_norm(ctx0, inpL, hparams,
                 model.layers[il].attn_norm,
@@ -6999,14 +7027,14 @@ struct llm_build_context {

             struct ggml_tensor * Qcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
-
+                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);

             struct ggml_tensor * Kcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
-
+                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Kcur, "Kcur", il);
@@ -7077,11 +7105,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
        cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -7117,14 +7140,14 @@ struct llm_build_context {

             Qcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-
+                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);

             Kcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-
+                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Kcur, "Kcur", il);
@@ -7196,11 +7219,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -7236,14 +7254,14 @@ struct llm_build_context {

             Qcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-
+                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);

             Kcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-
+                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Kcur, "Kcur", il);
@@ -7328,11 +7346,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {
             struct ggml_tensor * inpSA = inpL;

@@ -7368,14 +7381,14 @@ struct llm_build_context {

             Qcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-
+                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Qcur, "Qcur", il);

             Kcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-
+                n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow
             );
             cb(Kcur, "Kcur", il);
@@ -7464,11 +7477,6 @@ struct llm_build_context {
         struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
         cb(KQ_mask, "KQ_mask", -1);

-        // shift the entire K-cache if needed
-        if (do_rope_shift) {
-            llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
-        }
-
         for (int il = 0; il < n_layer; ++il) {

             // norm
@@ -7491,7 +7499,7 @@ struct llm_build_context {

             Qcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
-                n_embd_head_k,
+                n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Qcur, "Qcur", il);

@@ -7500,7 +7508,7 @@ struct llm_build_context {

             Kcur = ggml_rope_custom(
                 ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
-                n_embd_head_k,
+                n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
                 ext_factor, attn_factor, beta_fast, beta_slow);
             cb(Kcur, "Kcur", il);

@@ -7553,6 +7561,40 @@ struct llm_build_context {
     }
 };

+static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
+    llama_batch dummy;
+    dummy.n_tokens = 0;
+
+    llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
+
+    struct llm_build_context llm(lctx, dummy, cb, false);
+
+    llm.init();
+
+    struct ggml_cgraph * result = llm.build_defrag(ids);
+
+    llm.free();
+
+    return result;
+}
+
+static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
+    llama_batch dummy;
+    dummy.n_tokens = 0;
+
+    llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
+
+    struct llm_build_context llm(lctx, dummy, cb, false);
+
+    llm.init();
+
+    struct ggml_cgraph * result = llm.build_k_shift();
+
+    llm.free();
+
+    return result;
+}
+
 static struct ggml_cgraph * llama_build_graph(
     llama_context & lctx,
     const llama_batch & batch,
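Note: both wrappers above build a one-off graph outside the normal decode path, so they feed `llm_build_context` a dummy zero-token batch and a no-op callback; the context is only used for its hyperparameters and KV-cache views. The shared init/build/free bracket, reduced to a schematic with stand-in types (an assumption about the shape, not the library API):

    #include <functional>

    struct tensor; struct graph;
    using build_cb = std::function<void(tensor *, const char *, int)>;

    template <typename Ctx, typename BuildFn>
    graph * build_one_off(Ctx & llm, BuildFn build) {
        build_cb cb = [](tensor *, const char *, int) { }; // no-op callback
        (void)cb;
        llm.init();                   // scratch allocations
        graph * result = build(llm);  // record ops into a fresh graph
        llm.free();                   // drop the scratch; the graph is returned
        return result;
    }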
@@ -7672,6 +7714,20 @@ static struct ggml_cgraph * llama_build_graph(
|
|
7672
7714
|
return result;
|
7673
7715
|
}
|
7674
7716
|
|
7717
|
+
static void llama_set_k_shift(llama_context & lctx) {
|
7718
|
+
const auto & cparams = lctx.cparams;
|
7719
|
+
|
7720
|
+
const int64_t n_ctx = cparams.n_ctx;
|
7721
|
+
|
7722
|
+
assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
7723
|
+
|
7724
|
+
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
7725
|
+
|
7726
|
+
for (int i = 0; i < n_ctx; ++i) {
|
7727
|
+
data[i] = lctx.kv_self.cells[i].delta;
|
7728
|
+
}
|
7729
|
+
}
|
7730
|
+
|
7675
7731
|
static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
7676
7732
|
//
|
7677
7733
|
// set input data
|
@@ -7739,19 +7795,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
7739
7795
|
}
|
7740
7796
|
}
|
7741
7797
|
|
7742
|
-
if (
|
7743
|
-
const int64_t n_ctx = cparams.n_ctx;
|
7744
|
-
|
7745
|
-
assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
7746
|
-
|
7747
|
-
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
7748
|
-
|
7749
|
-
for (int i = 0; i < n_ctx; ++i) {
|
7750
|
-
data[i] = lctx.kv_self.cells[i].delta;
|
7751
|
-
}
|
7752
|
-
}
|
7753
|
-
|
7754
|
-
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
|
7798
|
+
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
|
7755
7799
|
const int64_t n_tokens = batch.n_tokens;
|
7756
7800
|
|
7757
7801
|
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
|
@@ -7779,7 +7823,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
7779
7823
|
}
|
7780
7824
|
}
|
7781
7825
|
|
7782
|
-
if (cparams.do_pooling && hparams.pooling_type ==
|
7826
|
+
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
|
7783
7827
|
const int64_t n_tokens = batch.n_tokens;
|
7784
7828
|
|
7785
7829
|
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
|
@@ -7795,6 +7839,34 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
         }
     }
 
+static void llama_graph_compute(
+        llama_context & lctx,
+          ggml_cgraph * gf,
+                  int   n_threads) {
+#ifdef GGML_USE_MPI
+    const int64_t n_layer = lctx.model.hparams.n_layer;
+    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
+#endif
+
+#ifdef GGML_USE_METAL
+    if (ggml_backend_is_metal(lctx.backend_metal)) {
+        ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
+    }
+#endif
+
+    if (lctx.backend_cpu != nullptr) {
+        ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
+    }
+
+    ggml_backend_sched_graph_compute(lctx.sched, gf);
+
+    // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
+
+#ifdef GGML_USE_MPI
+    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
+#endif
+}
+
 // decode a batch of tokens by evaluating the transformer
 //
 //   - lctx:      llama context
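Note: llama_graph_compute gathers the per-backend setup (MPI pre/post hooks, Metal callback count, CPU thread count) that llama_decode_internal previously inlined, so the K-shift and defrag paths below can reuse it. A minimal caller, assuming lctx, gf, batch, and n_threads are prepared as in llama_decode_internal:

    llama_set_inputs(lctx, batch);            // upload the input tensors to the backend buffers
    llama_graph_compute(lctx, gf, n_threads); // dispatch the graph via ggml_backend_sched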
@@ -7821,9 +7893,9 @@ static int llama_decode_internal(
     const auto n_batch = cparams.n_batch;
 
     GGML_ASSERT(n_tokens <= n_batch);
+    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
     int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
-    GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
 
     const int64_t t_start_us = ggml_time_us();
 
@@ -7872,6 +7944,8 @@ static int llama_decode_internal(
         batch.seq_id = seq_id_arr.data();
     }
 
+    llama_kv_cache_update(&lctx);
+
     // if we have enough unused cells before the current head ->
     //   better to start searching from the beginning of the cache, hoping to fill it
     if (kv_self.head > kv_self.used + 2*n_tokens) {
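Note: running llama_kv_cache_update at the top of each decode guarantees that any queued K-shift or defragmentation is applied before new cells are allocated for the batch.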
@@ -7896,8 +7970,9 @@ static int llama_decode_internal(
     ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
 
     // the output is always the last tensor in the graph
-    struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * res        = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
+
     if (strcmp(res->name, "result_output") == 0) {
         // the embeddings could be the second to last tensor, or the third to last tensor
         if (strcmp(embeddings->name, "result_norm") != 0) {
@@ -7924,40 +7999,12 @@ static int llama_decode_internal(
         n_threads = std::min(4, n_threads);
     }
 
-#ifdef GGML_USE_MPI
-    const int64_t n_layer = hparams.n_layer;
-    ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
-#endif
-
-#ifdef GGML_USE_METAL
-    if (ggml_backend_is_metal(lctx.backend_metal)) {
-        ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
-    }
-#endif
-
-    if (lctx.backend_cpu != nullptr) {
-        ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
-    }
-
     llama_set_inputs(lctx, batch);
 
-    ggml_backend_sched_graph_compute(lctx.sched, gf);
-
-    // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
-
-#ifdef GGML_USE_MPI
-    ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
-#endif
+    llama_graph_compute(lctx, gf, n_threads);
 
     // update the kv ring buffer
     {
-        if (kv_self.has_shift) {
-            kv_self.has_shift = false;
-            for (uint32_t i = 0; i < kv_self.size; ++i) {
-                kv_self.cells[i].delta = 0;
-            }
-        }
-
         kv_self.head += n_tokens;
 
         // Ensure kv cache head points to a valid index.
@@ -7966,6 +8013,18 @@ static int llama_decode_internal(
         }
     }
 
+    // decide if we need to defrag the kv cache
+    if (cparams.defrag_thold >= 0.0f) {
+        const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens)/float(kv_self.n) : 0.0f;
+
+        // queue defragmentation for next llama_kv_cache_update
+        if (fragmentation > cparams.defrag_thold) {
+            //LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
+
+            llama_kv_cache_defrag(kv_self);
+        }
+    }
+
 #ifdef GGML_PERF
     // print timing information per ggml operation (for debugging purposes)
     // requires GGML_PERF to be defined
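Note: the fragmentation measure above is simply the unused fraction of the active KV window. For example, with kv_self.n = 512 cells and kv_self.used + n_tokens = 384, fragmentation = 1.0 - 384/512 = 0.25, so a defrag_thold of 0.1 queues a defrag, while the default of -1.0 disables the check entirely; windows smaller than 128 cells always report 0.0.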
@@ -8053,6 +8112,245 @@ static int llama_decode_internal(
     return 0;
 }
 
+// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
+static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
+    auto & kv_self = lctx.kv_self;
+
+    const auto & hparams = lctx.model.hparams;
+
+    const uint32_t n_layer = hparams.n_layer;
+
+    const uint32_t n_kv   = llama_kv_cache_cell_max(kv_self);
+    const uint32_t n_used = kv_self.used;
+
+    assert(n_used <= n_kv);
+
+    //const int64_t t_start = ggml_time_us();
+
+    // number of cells moved
+    uint32_t n_moves = 0;
+
+    // determine which KV cells to move where
+    //
+    //  cell i moves to ids[i]
+    //
+    //  if ids[i] == i || ids[i] == n_kv, then cell i is not moved
+    //
+    std::vector<uint32_t> ids(n_kv, n_kv);
+
+    for (uint32_t i0 = 0; i0 < n_used; ++i0) {
+        const auto & cell0 = kv_self.cells[i0];
+
+        if (!cell0.is_empty()) {
+            ids[i0] = i0;
+
+            continue;
+        }
+
+        // found a hole - fill it with data from the end of the cache
+
+        uint32_t nh = 1;
+
+        // determine the size of the hole
+        while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
+            nh++;
+        }
+
+        // each move requires 6*n_layer tensors (see build_defrag)
+        //   - source view, destination view, copy operation
+        //   - x2 for keys and values
+        //
+        if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
+            // the graph is too big, we cannot move more cells
+            break;
+        }
+
+        uint32_t nf = 0;
+        uint32_t is = n_kv - 1;
+
+        // starting from the end, find nh non-empty cells
+        for (; is > i0; --is) {
+            const auto & cell1 = kv_self.cells[is];
+
+            if (cell1.is_empty() || ids[is] != n_kv) {
+                continue;
+            }
+
+            // non-empty cell which is not yet moved
+            nf++;
+
+            if (nf == nh) {
+                break;
+            }
+        }
+
+        // this can only happen if `n_used` is not accurate, which would be a bug
+        GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
+
+        nf = 0;
+
+        uint32_t i1 = is;
+
+        // are we moving a continuous block of memory?
+        bool cont = false;
+
+        // go back and move the nf cells to the hole
+        for (; i1 < n_kv; ++i1) {
+            auto & cell1 = kv_self.cells[i1];
+
+            if (cell1.is_empty() || ids[i1] != n_kv) {
+                cont = false;
+                continue;
+            }
+
+            // this cell goes to (i0 + nf)
+            ids[i1] = i0 + nf;
+
+            // move the cell meta data
+            kv_self.cells[i0 + nf] = cell1;
+
+            // clear the old cell and move the head there
+            cell1 = llama_kv_cell();
+            kv_self.head = n_used;
+
+            if (!cont) {
+                n_moves++;
+                cont = true;
+            }
+
+            nf++;
+
+            if (nf == nh) {
+                break;
+            }
+        }
+
+        //LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
+
+        i0 += nh - 1;
+    }
+
+    if (n_moves == 0) {
+        return;
+    }
+
+    //LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
+
+    //LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
+
+#if 0
+    // CPU defrag
+    //
+    // TODO: optimizations are possible:
+    //       - multiple threads
+    //       - avoid copying to the host memory when already there
+    //
+    // likely not worth the effort, as we have ggml_graph based defrag
+    //
+
+    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+
+    const uint32_t kv_size = kv_self.size;
+
+    std::vector<uint8_t> buf_k;
+    std::vector<uint8_t> buf_v;
+
+    for (uint32_t il = 0; il < n_layer; ++il) {
+        const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
+        const size_t k_size     = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);
+
+        const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
+        const size_t v_size    = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size);
+
+        buf_k.resize(k_size);
+        buf_v.resize(v_size);
+
+        ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
+        ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
+
+        // batch move [i, i+nm) to [id, id+nm)
+        //  note: cells can move only to a lower index
+        for (uint32_t i = 0; i < n_kv; ++i) {
+            const uint32_t id = ids[i];
+
+            if (i == id || id == n_kv) {
+                continue;
+            }
+
+            uint32_t nm = 1;
+
+            while (i + nm < n_kv && ids[i + nm] == id + nm) {
+                nm++;
+            }
+
+            // move keys
+            {
+                const int64_t os =  i*k_size_row;
+                const int64_t od = id*k_size_row;
+
+                memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
+            }
+
+            // move values (note: they are transposed)
+            {
+                const int64_t os =  i;
+                const int64_t od = id;
+
+                for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
+                    memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
+                }
+            }
+
+            i += nm - 1;
+        }
+
+        ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
+        ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
+    }
+#else
+    // ggml_graph defrag
+
+    ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
+
+    llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+#endif
+
+    //const int64_t t_end = ggml_time_us();
+
+    //LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
+}
+
+static void llama_kv_cache_update_internal(struct llama_context & lctx) {
+    // apply K-shift if needed
+    if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
+        llama_set_k_shift(lctx);
+
+        {
+            ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
+
+            llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
+        }
+
+        {
+            auto & kv_self = lctx.kv_self;
+
+            kv_self.has_shift = false;
+
+            for (uint32_t i = 0; i < kv_self.size; ++i) {
+                kv_self.cells[i].delta = 0;
+            }
+        }
+    }
+
+    // defragment the KV cache if needed
+    if (lctx.kv_self.do_defrag) {
+        llama_kv_cache_defrag_internal(lctx);
+
+        lctx.kv_self.do_defrag = false;
+    }
+}
+
 //
 // tokenizer
 //
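Note: the ids vector is the entire move plan: ids[i] == i means cell i stays put, ids[i] == n_kv means it is not moved (empty or past the window), and any other value is the destination slot. As a small worked example, for cells [A, _, _, B, C] with n_kv = 5 and n_used = 3, the scan yields ids = {0, 5, 5, 1, 2}, i.e. B and C are copied down into the two-cell hole; since they form one contiguous source block, n_moves is 1 and the defrag graph needs 6*1*n_layer tensors.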
@@ -8644,37 +8942,46 @@ struct llm_tokenizer_wpm {
     }
 
     std::vector<std::string> preprocess(const std::string & text) {
-        std::string ori_str = normalize(text);
-        uint64_t ori_size = ori_str.size();
+        // normalization form D
+        std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
+        std::vector<uint32_t> nfd_codepoints;
+        for (uint32_t code : codepoints) {
+            auto it = nfd_map.equal_range(code);
+            if (it.first != it.second) {
+                for (auto jt = it.first; jt != it.second; jt++) {
+                    nfd_codepoints.push_back(jt->second);
+                }
+            } else {
+                nfd_codepoints.push_back(code);
+            }
+        }
 
-        // single punct / single symbol / single digit
-        // baseline: add whitespace on the left and right of punct and chinese characters
-        std::vector<std::string> words;
+        // strip accents, strip control, uniformize whitespace,
+        // to lowercase, pad chinese characters, pad punctuation
         std::string new_str = "";
-        uint64_t i = 0;
-        while (i < ori_size) {
-            int utf_char_len = utf8_len(ori_str[i]);
-            if ((utf_char_len == 1) && ispunct(ori_str[i])) {
-                new_str += " ";
-                new_str += ori_str[i];
-                new_str += " ";
-                i += 1;
+        for (uint32_t code : nfd_codepoints) {
+            int type = codepoint_type(code);
+            if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
+                continue;
             }
-            else if ((utf_char_len == 3) && is_chinese_char(ori_str.substr(i, 3))) {
+            code = to_lower(code);
+            if (type == CODEPOINT_TYPE_WHITESPACE) {
+                code = ' ';
+            }
+            std::string s = codepoint_to_utf8(code);
+            if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
                 new_str += " ";
-                new_str += ori_str.substr(i, 3);
+                new_str += s;
                 new_str += " ";
-                i += 3;
-            }
-            else {
-                new_str += ori_str[i];
-                i += 1;
+            } else {
+                new_str += s;
             }
         }
 
         // split by whitespace
         uint64_t l = 0;
         uint64_t r = 0;
+        std::vector<std::string> words;
         while (r < new_str.size()) {
             // if is whitespace
             if (isspace(new_str[r])) {
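Note: with the rewritten pipeline an input such as "Héllo, 世界" first NFD-decomposes "é" into "e" plus a combining accent (dropped as CODEPOINT_TYPE_ACCENT_MARK), is lowercased, and has the comma and each CJK character padded with spaces, so the whitespace split produces {"hello", ",", "世", "界"}.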
@@ -8692,47 +8999,21 @@ struct llm_tokenizer_wpm {
         return words;
     }
 
-    std::string normalize(const std::string & text) {
-        // TODO: handle chinese characters?
-        std::string text2 = strip_accents(text);
-        for (size_t i = 0; i < text2.size(); i += utf8_len(text2[i])) {
-            char c = text2[i];
-            if (c >= 'A' && c <= 'Z') {
-                text2[i] = c - 'A' + 'a';
-            }
+    uint32_t to_lower(uint32_t code) {
+        static const std::locale locale("en_US.UTF-8");
+#if defined(_WIN32)
+        if (code > 0xFFFF) {
+            return code;
         }
-        return text2;
+#endif
+        return std::tolower(wchar_t(code), locale);
     }
 
-    bool is_chinese_char(const std::string & str) {
-        int len = str.length();
-        unsigned int codepoint = 0;
-        int num_bytes = 0;
-        int i = 0;
-        unsigned char ch = static_cast<unsigned char>(str[i]);
-        if (ch <= 0x7f) {
-            codepoint = ch;
-            num_bytes = 1;
-        } else if ((ch >> 5) == 0x06) {
-            codepoint = ch & 0x1f;
-            num_bytes = 2;
-        } else if ((ch >> 4) == 0x0e) {
-            codepoint = ch & 0x0f;
-            num_bytes = 3;
-        } else if ((ch >> 3) == 0x1e) {
-            codepoint = ch & 0x07;
-            num_bytes = 4;
-        }
-        for (int j = 1; j < num_bytes; ++j) {
-            if (i + j >= len) {
-                return false; // incomplete UTF-8 character
-            }
-            unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
-            if ((next_ch >> 6) != 0x02) {
-                return false; // invalid trailing byte
-            }
-            codepoint = (codepoint << 6) | (next_ch & 0x3f);
-        }
+    bool is_ascii_punct(uint32_t code) {
+        return code < 256 && ispunct(code);
+    }
+
+    bool is_chinese_char(uint32_t codepoint) {
         if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
             (codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
             (codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
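Note: to_lower delegates to std::tolower(wchar_t, locale) with a static en_US.UTF-8 locale; the _WIN32 early-out exists because wchar_t is only 16 bits there, so code points above 0xFFFF are returned unchanged rather than truncated.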
@@ -8748,41 +9029,6 @@ struct llm_tokenizer_wpm {
         return false;
     }
 
-    std::string strip_accents(const std::string & input_string) {
-        std::string resultString;
-        std::map<std::string, char> accent_map = {
-            {"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
-            {"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
-            {"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
-            {"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
-            {"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
-            {"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
-            {"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
-            {"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
-            {"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
-        };
-
-        for (size_t i = 0; i < input_string.length();) {
-            int len = utf8_len(input_string[i]);
-            std::string curChar = input_string.substr(i, len);
-            auto iter = accent_map.find(curChar);
-            if (iter != accent_map.end()) {
-                resultString += iter->second;
-            } else {
-                resultString += curChar;
-            }
-            i += len;
-        }
-
-        return resultString;
-    }
-
-    static size_t utf8_len(char src) {
-        const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
-        uint8_t highbits = static_cast<uint8_t>(src) >> 4;
-        return lookup[highbits];
-    }
-
     const llama_vocab & vocab;
 };
 
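Note: the hand-rolled Latin-1 accent_map and the local utf8_len helper are superseded by the NFD table (nfd_map) and the codepoint utilities (codepoints_from_utf8, codepoint_type, codepoint_to_utf8) that this release pulls in via the expanded vendored unicode.h.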
@@ -9816,10 +10062,6 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
     }
 }
 
-void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
-    llama_sample_temp(ctx, candidates_p, temp);
-}
-
 void llama_sample_repetition_penalties(
             struct llama_context * ctx,
           llama_token_data_array * candidates,
@@ -9946,38 +10188,6 @@ void llama_sample_apply_guidance(
     ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
 }
 
-void llama_sample_classifier_free_guidance(
-          struct llama_context * ctx,
-        llama_token_data_array * candidates,
-          struct llama_context * guidance_ctx,
-                         float   scale) {
-    GGML_ASSERT(ctx);
-    int64_t t_start_sample_us;
-
-    t_start_sample_us = ggml_time_us();
-    const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
-
-    GGML_ASSERT(n_vocab == candidates->size);
-    GGML_ASSERT(!candidates->sorted);
-
-    std::vector<float> logits_base(n_vocab);
-    for (size_t i = 0; i < n_vocab; ++i) {
-        logits_base[i] = candidates->data[i].logit;
-    }
-
-    float * logits_guidance = llama_get_logits(guidance_ctx);
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-    llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
-    t_start_sample_us = ggml_time_us();
-
-    for (size_t i = 0; i < n_vocab; ++i) {
-        candidates->data[i].logit = logits_base[i];
-    }
-
-    ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
-}
-
 llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
     GGML_ASSERT(ctx);
 
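Note: llama_sample_classifier_free_guidance is gone; llama_sample_apply_guidance (which the removed wrapper already called internally) operates on raw logit arrays instead of a llama_token_data_array. A migration sketch, assuming ctx and guidance_ctx are already-decoded contexts and scale is the guidance strength:

    float * logits_base     = llama_get_logits(ctx);          // modified in place
    float * logits_guidance = llama_get_logits(guidance_ctx);
    llama_sample_apply_guidance(ctx, logits_base, logits_guidance, scale);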
@@ -10508,31 +10718,47 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
+                 ftype == LLAMA_FTYPE_MOSTLY_IQ1_S   || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S  || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if (new_type != GGML_TYPE_Q8_0) {
             new_type = GGML_TYPE_Q6_K;
         }
     } else if (name == "token_embd.weight") {
-        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
+            ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
             new_type = GGML_TYPE_Q2_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
+            new_type = GGML_TYPE_IQ3_S;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = GGML_TYPE_Q2_K;
+            new_type = GGML_TYPE_IQ3_S;
         }
-    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
+    } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
+               ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
         if (name.find("attn_v.weight") != std::string::npos) {
             if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
-            else new_type = GGML_TYPE_Q2_K;
+            else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
             ++qs.i_attention_wv;
         }
+        else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (name.find("ffn_down") != std::string::npos) {
-            if (qs.i_ffn_down < qs.n_ffn_down/8) new_type = GGML_TYPE_Q2_K;
+            if (qs.i_ffn_down < qs.n_ffn_down/8) {
+                new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
+            }
             ++qs.i_ffn_down;
         }
         else if (name.find("attn_output.weight") != std::string::npos) {
-            if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
+            if (qs.model.hparams.n_expert == 8) {
+                new_type = GGML_TYPE_Q5_K;
+            } else {
+                if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
+            }
         }
     } else if (name.find("attn_v.weight") != std::string::npos) {
         if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
@@ -10542,13 +10768,25 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_Q3_K : GGML_TYPE_IQ3_XXS;
+            new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
+            new_type = GGML_TYPE_Q4_K;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
             new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
             new_type = GGML_TYPE_Q5_K;
         }
         else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
@@ -10574,14 +10812,24 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
-            new_type = GGML_TYPE_IQ3_XXS;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+            new_type = GGML_TYPE_IQ3_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = GGML_TYPE_IQ2_S;
+        }
+    } else if (name.find("attn_q.weight") != std::string::npos) {
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
+            new_type = GGML_TYPE_IQ3_XXS;
+        }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
+            new_type = GGML_TYPE_IQ2_S;
         }
     } else if (name.find("ffn_down") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
         int i_layer = info.first, n_layer = info.second;
         if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
             if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
@@ -10592,6 +10840,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
                      : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
+        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
+                    (qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
             new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
         }
@@ -10603,8 +10855,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
                 if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
             }
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
-            new_type = GGML_TYPE_Q5_K;
+        else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
+            new_type = GGML_TYPE_Q5_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
@@ -10621,39 +10873,43 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     } else if (name.find("attn_output.weight") != std::string::npos) {
         if (arch != LLM_ARCH_FALCON) {
             if (qs.model.hparams.n_expert == 8) {
-                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
                     ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
-                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S  ||
+                    ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
                     new_type = GGML_TYPE_Q5_K;
                 }
             } else {
-                if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   ) new_type = GGML_TYPE_Q3_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_XXS;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
-                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
+                if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   ) new_type = GGML_TYPE_Q3_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M  ) new_type = GGML_TYPE_Q4_K;
             }
         } else {
             if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
         }
     }
     else if (name.find("attn_qkv.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
+        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
+            new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
     else if (name.find("ffn_gate") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
-            new_type = GGML_TYPE_IQ3_XXS;
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+            new_type = GGML_TYPE_IQ3_XXS;
         }
         ++qs.i_ffn_gate;
     }
     else if (name.find("ffn_up") != std::string::npos) {
         auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
         int i_layer = info.first, n_layer = info.second;
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
-            new_type = GGML_TYPE_IQ3_XXS;
+        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
+            new_type = GGML_TYPE_IQ3_XXS;
         }
         ++qs.i_ffn_up;
     }
@@ -10671,9 +10927,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
     //}
     bool convert_incompatible_tensor = false;
     if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
-        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
-        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
-        new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
+        new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
+        new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
+        new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
         int nx = tensor->ne[0];
         int ny = tensor->ne[1];
         if (nx % QK_K != 0) {
@@ -10687,13 +10943,16 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         switch (new_type) {
             case GGML_TYPE_IQ2_XXS:
             case GGML_TYPE_IQ2_XS:
+            case GGML_TYPE_IQ2_S:
             case GGML_TYPE_IQ3_XXS:
+            case GGML_TYPE_IQ3_S:
             case GGML_TYPE_IQ1_S:
             case GGML_TYPE_Q2_K:
-            case GGML_TYPE_Q3_K: new_type = GGML_TYPE_IQ4_NL; break;
-            case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
-            case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
-            case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
+            case GGML_TYPE_Q3_K:
+            case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
+            case GGML_TYPE_Q4_K:   new_type = GGML_TYPE_Q5_0;   break;
+            case GGML_TYPE_Q5_K:   new_type = GGML_TYPE_Q5_1;   break;
+            case GGML_TYPE_Q6_K:   new_type = GGML_TYPE_Q8_0;   break;
             default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
         }
         LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
@@ -10719,7 +10978,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K_S:
         case LLAMA_FTYPE_MOSTLY_Q2_K:    quantized_type = GGML_TYPE_Q2_K;    break;
-        case LLAMA_FTYPE_MOSTLY_Q3_K_XS: quantized_type = GGML_TYPE_Q3_K;    break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_XS:  quantized_type = GGML_TYPE_IQ3_S;   break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
         case LLAMA_FTYPE_MOSTLY_Q3_K_L:  quantized_type = GGML_TYPE_Q3_K;    break;
@@ -10730,9 +10989,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         case LLAMA_FTYPE_MOSTLY_Q6_K:    quantized_type = GGML_TYPE_Q6_K;    break;
         case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
         case LLAMA_FTYPE_MOSTLY_IQ2_XS:  quantized_type = GGML_TYPE_IQ2_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_S:   quantized_type = GGML_TYPE_IQ2_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ2_M:   quantized_type = GGML_TYPE_IQ2_S;   break;
         case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
         case LLAMA_FTYPE_MOSTLY_IQ1_S:   quantized_type = GGML_TYPE_IQ1_S;   break;
         case LLAMA_FTYPE_MOSTLY_IQ4_NL:  quantized_type = GGML_TYPE_IQ4_NL;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ4_XS:  quantized_type = GGML_TYPE_IQ4_XS;  break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_S:   quantized_type = GGML_TYPE_IQ3_S;   break;
+        case LLAMA_FTYPE_MOSTLY_IQ3_M:   quantized_type = GGML_TYPE_IQ3_S;   break;
 
         default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
     }
@@ -10862,7 +11126,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         quantize &= !params->only_copy;
 
         // do not quantize expert gating tensors
-        quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_FFN_GATE_INP, "weight");
+        // NOTE: can't use LLM_TN here because the layer number is not known
+        quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
 
         // do not quantize positional embeddings and token types (BERT)
         quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
@@ -10906,6 +11171,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
             if ((new_type == GGML_TYPE_IQ2_XXS ||
                  new_type == GGML_TYPE_IQ2_XS  ||
+                 new_type == GGML_TYPE_IQ2_S   ||
                  new_type == GGML_TYPE_IQ1_S   ||
                 (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
                 LLAMA_LOG_ERROR("\n\n============================================================\n");
@@ -11327,7 +11593,7 @@ static int llama_apply_lora_from_file_internal(
 struct llama_model_params llama_model_default_params() {
     struct llama_model_params result = {
         /*.n_gpu_layers                =*/ 0,
-        /*.split_mode                  =*/ LLAMA_SPLIT_LAYER,
+        /*.split_mode                  =*/ LLAMA_SPLIT_MODE_LAYER,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
         /*.progress_callback           =*/ nullptr,
@@ -11353,7 +11619,7 @@ struct llama_context_params llama_context_default_params() {
         /*.n_batch                     =*/ 512,
         /*.n_threads                   =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
         /*.n_threads_batch             =*/ GGML_DEFAULT_N_THREADS,
-        /*.rope_scaling_type           =*/ LLAMA_ROPE_SCALING_UNSPECIFIED,
+        /*.rope_scaling_type           =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
         /*.rope_freq_base              =*/ 0.0f,
         /*.rope_freq_scale             =*/ 0.0f,
         /*.yarn_ext_factor             =*/ -1.0f,
@@ -11361,11 +11627,11 @@ struct llama_context_params llama_context_default_params() {
         /*.yarn_beta_fast              =*/ 32.0f,
         /*.yarn_beta_slow              =*/ 1.0f,
         /*.yarn_orig_ctx               =*/ 0,
+        /*.defrag_thold                =*/ -1.0f,
         /*.cb_eval                     =*/ nullptr,
         /*.cb_eval_user_data           =*/ nullptr,
         /*.type_k                      =*/ GGML_TYPE_F16,
         /*.type_v                      =*/ GGML_TYPE_F16,
-        /*.mul_mat_q                   =*/ true,
         /*.logits_all                  =*/ false,
         /*.embedding                   =*/ false,
         /*.offload_kqv                 =*/ true,
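Note: defrag_thold defaults to -1.0f, i.e. automatic KV-cache defragmentation is off unless opted into, and the mul_mat_q toggle is gone. Opting in is a one-liner (a sketch; the 0.1f threshold is illustrative):

    llama_context_params cparams = llama_context_default_params();
    cparams.defrag_thold = 0.1f; // defragment once more than 10% of the KV window is unused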
@@ -11421,15 +11687,6 @@ bool llama_supports_gpu_offload(void) {
 #endif
 }
 
-// deprecated:
-bool llama_mmap_supported(void) {
-    return llama_supports_mmap();
-}
-
-bool llama_mlock_supported(void) {
-    return llama_supports_mlock();
-}
-
 void llama_backend_init(void) {
     ggml_time_init();
 
@@ -11525,7 +11782,7 @@ struct llama_context * llama_new_context_with_model(
     cparams.yarn_attn_factor = params.yarn_attn_factor;
     cparams.yarn_beta_fast   = params.yarn_beta_fast;
     cparams.yarn_beta_slow   = params.yarn_beta_slow;
-    cparams.mul_mat_q        = params.mul_mat_q;
+    cparams.defrag_thold     = params.defrag_thold;
     cparams.offload_kqv      = params.offload_kqv;
     cparams.do_pooling       = params.do_pooling;
@@ -11541,16 +11798,16 @@ struct llama_context * llama_new_context_with_model(
     cparams.cb_eval_user_data = params.cb_eval_user_data;
 
     auto rope_scaling_type = params.rope_scaling_type;
-    if (rope_scaling_type == LLAMA_ROPE_SCALING_UNSPECIFIED) {
+    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
         rope_scaling_type = hparams.rope_scaling_type_train;
     }
 
-    if (rope_scaling_type == LLAMA_ROPE_SCALING_NONE) {
+    if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
         cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
     }
 
     if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
-        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_YARN ? 1.0f : 0.0f;
+        cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
     }
 
     if (params.seed == LLAMA_DEFAULT_SEED) {
@@ -11584,8 +11841,8 @@ struct llama_context * llama_new_context_with_model(
     }
 #elif defined(GGML_USE_CUBLAS)
     if (model->n_gpu_layers > 0) {
-        // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
-        if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) {
+        // with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
+        if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
             ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
             if (backend == nullptr) {
                 LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
@@ -11594,7 +11851,7 @@ struct llama_context * llama_new_context_with_model(
             }
             ctx->backends.push_back(backend);
         } else {
-            // LLAMA_SPLIT_LAYER requires a backend for each GPU
+            // LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
             for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
                 ggml_backend_t backend = ggml_backend_cuda_init(device);
                 if (backend == nullptr) {
@@ -11647,8 +11904,7 @@ struct llama_context * llama_new_context_with_model(
     }
     ctx->backends.push_back(ctx->backend_cpu);
 
-    if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
-                             cparams.n_ctx, cparams.offload_kqv)) {
+    if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) {
         LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
         llama_free(ctx);
         return nullptr;
@@ -11727,7 +11983,7 @@ struct llama_context * llama_new_context_with_model(
     }
 
     // buffer used to store the computation graph and the tensor meta data
-    ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
+    ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
 
     ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
 
@@ -11796,6 +12052,49 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
     return model->vocab.type;
 }
 
+enum llama_rope_type llama_rope_type(const struct llama_model * model) {
+    switch (model->arch) {
+        // these models do not use RoPE
+        case LLM_ARCH_GPT2:
+        case LLM_ARCH_GPTJ:
+        case LLM_ARCH_GPTNEOX:
+        case LLM_ARCH_MPT:
+        case LLM_ARCH_REFACT:
+        case LLM_ARCH_BLOOM:
+            return LLAMA_ROPE_TYPE_NONE;
+
+        // use what we call a normal RoPE, operating on pairs of consecutive head values
+        case LLM_ARCH_LLAMA:
+        case LLM_ARCH_BAICHUAN:
+        case LLM_ARCH_STARCODER:
+        case LLM_ARCH_PLAMO:
+        case LLM_ARCH_CODESHELL:
+        case LLM_ARCH_ORION:
+        case LLM_ARCH_INTERNLM2:
+        case LLM_ARCH_MINICPM:
+            return LLAMA_ROPE_TYPE_NORM;
+
+        // the pairs of head values are offset by n_rot/2
+        case LLM_ARCH_FALCON:
+        case LLM_ARCH_PERSIMMON:
+        case LLM_ARCH_BERT:
+        case LLM_ARCH_NOMIC_BERT:
+        case LLM_ARCH_STABLELM:
+        case LLM_ARCH_QWEN:
+        case LLM_ARCH_QWEN2:
+        case LLM_ARCH_PHI2:
+        case LLM_ARCH_GEMMA:
+            return LLAMA_ROPE_TYPE_NEOX;
+
+        // all model arches should be listed explicitly here
+        case LLM_ARCH_UNKNOWN:
+            GGML_ASSERT(false && "unknown architecture");
+            break;
+    }
+
+    return LLAMA_ROPE_TYPE_NONE;
+}
+
 int32_t llama_n_vocab(const struct llama_model * model) {
     return model->vocab.id_to_token.size();
 }
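Note: llama_rope_type exposes the per-architecture RoPE behaviour to API consumers. A typical guard before attempting an in-cache position shift (a sketch; ctx is an existing llama_context):

    if (llama_rope_type(llama_get_model(ctx)) == LLAMA_ROPE_TYPE_NONE) {
        // positions are absolute for this architecture, so a K-shift is not applicable
    }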
@@ -11898,15 +12197,6 @@ uint32_t llama_model_quantize(
     }
 }
 
-int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
-    try {
-        return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
-    } catch (const std::exception & err) {
-        LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
-        return 1;
-    }
-}
-
 int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
     try {
         return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
@@ -12038,12 +12328,12 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
     llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
 }
 
-void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
+void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
     if (delta == 0) {
         return;
     }
 
-    llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
+    llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta);
 }
 
 void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
@@ -12054,6 +12344,19 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla
     llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
 }
 
+llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) {
+    return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id);
+}
+
+void llama_kv_cache_defrag(struct llama_context * ctx) {
+    llama_kv_cache_defrag(ctx->kv_self);
+}
+
+void llama_kv_cache_update(struct llama_context * ctx) {
+    llama_kv_cache_update_internal(*ctx);
+}
+
+
 // Returns the *maximum* size of the state
 size_t llama_get_state_size(const struct llama_context * ctx) {
     // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
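Note: defrag and update are deliberately split: llama_kv_cache_defrag only queues the work, and llama_kv_cache_update (also invoked at the start of every decode) performs it together with any pending K-shift. A typical context-shift sequence, assuming an illustrative n_drop count of cells to evict from sequence 0:

    llama_kv_cache_seq_rm (ctx, 0, 0, n_drop);            // drop the oldest positions
    llama_kv_cache_seq_add(ctx, 0, n_drop, -1, -n_drop);  // shift survivors down (ex llama_kv_cache_seq_shift)
    llama_kv_cache_defrag (ctx);                          // queue compaction
    llama_kv_cache_update (ctx);                          // apply shift + defrag now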
@@ -12180,10 +12483,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     const auto & hparams = ctx->model.hparams;
     const auto & cparams = ctx->cparams;
 
-    const auto n_layer      = hparams.n_layer;
-    const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
-    const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
-    const auto n_ctx        = cparams.n_ctx;
+    const uint32_t n_layer      = hparams.n_layer;
+    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const uint32_t n_ctx        = cparams.n_ctx;
 
     const size_t   kv_buf_size = kv_self.total_size();
     const uint32_t kv_head     = kv_self.head;
@@ -12198,14 +12501,16 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     if (kv_buf_size) {
         std::vector<uint8_t> tmp_buf;
         for (int il = 0; il < (int) n_layer; ++il) {
-            size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+            const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+
             tmp_buf.resize(k_size);
             ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
             data_ctx->write(tmp_buf.data(), tmp_buf.size());
 
             // v is not contiguous, copy row by row
-            size_t v_row_size   = ggml_row_size(kv_self.v_l[il]->type, kv_head);
-            size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+            const size_t v_row_size   = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+            const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+
             tmp_buf.resize(v_row_size);
             for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
                 ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
@@ -12238,8 +12543,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
 }
 
 // Sets the state reading from the specified source address
-size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
-    uint8_t * inp = src;
+size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
+    const uint8_t * inp = src;
 
     // set rng
     {
@@ -12248,7 +12553,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
 
         GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
 
-        std::string rng_str((char *)inp, rng_size); inp += rng_size;
+        std::string rng_str((const char *)inp, rng_size); inp += rng_size;
 
         std::istringstream rng_ss(rng_str);
         rng_ss >> ctx->rng;
@@ -12292,10 +12597,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     const auto & hparams = ctx->model.hparams;
     const auto & cparams = ctx->cparams;
 
-    const auto n_layer      = hparams.n_layer;
-    const auto n_embd_k_gqa = hparams.n_embd_k_gqa();
-    const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
-    const auto n_ctx        = cparams.n_ctx;
+    const uint32_t n_layer      = hparams.n_layer;
+    const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+    const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
+    const uint32_t n_ctx        = cparams.n_ctx;
 
     size_t   kv_buf_size;
     uint32_t kv_head;
@@ -12311,13 +12616,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         GGML_ASSERT(kv_self.total_size() == kv_buf_size);
 
         for (int il = 0; il < (int) n_layer; ++il) {
-            size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+            const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
+
             ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
             inp += k_size;
 
             // v is not contiguous, copy row by row
-            size_t v_row_size   = ggml_row_size(kv_self.v_l[il]->type, kv_head);
-            size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+            const size_t v_row_size   = ggml_row_size(kv_self.v_l[il]->type, kv_head);
+            const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
+
             for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
                 ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
                 inp += v_row_size;
@@ -12439,38 +12746,6 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
     return true;
 }
 
-int llama_eval(
-        struct llama_context * ctx,
-                 llama_token * tokens,
-                     int32_t   n_tokens,
-                     int32_t   n_past) {
-    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
-
-    const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
-    if (ret < 0) {
-        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
-    }
-
-    return ret;
-}
-
-int llama_eval_embd(
-        struct llama_context * ctx,
-                       float * embd,
-                     int32_t   n_tokens,
-                     int32_t   n_past) {
-    llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
-
-    llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
-
-    const int ret = llama_decode_internal(*ctx, batch);
-    if (ret < 0) {
-        LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
-    }
-
-    return ret;
-}
-
 void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
     ctx->cparams.n_threads       = n_threads;
     ctx->cparams.n_threads_batch = n_threads_batch;