llama_cpp 0.12.7 → 0.13.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +11 -0
- data/ext/llama_cpp/llama_cpp.cpp +72 -262
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +23 -25
- data/vendor/tmp/llama.cpp/Makefile +8 -3
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-backend.c +14 -2
- data/vendor/tmp/llama.cpp/ggml-backend.h +1 -1
- data/vendor/tmp/llama.cpp/ggml-kompute.cpp +7 -1
- data/vendor/tmp/llama.cpp/ggml-metal.m +96 -15
- data/vendor/tmp/llama.cpp/ggml-metal.metal +1049 -38
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +25 -25
- data/vendor/tmp/llama.cpp/ggml-quants.c +1873 -218
- data/vendor/tmp/llama.cpp/ggml-quants.h +52 -0
- data/vendor/tmp/llama.cpp/ggml-sycl.cpp +292 -221
- data/vendor/tmp/llama.cpp/ggml-vulkan.cpp +64 -52
- data/vendor/tmp/llama.cpp/ggml.c +318 -195
- data/vendor/tmp/llama.cpp/ggml.h +35 -19
- data/vendor/tmp/llama.cpp/llama.cpp +806 -531
- data/vendor/tmp/llama.cpp/llama.h +53 -65
- data/vendor/tmp/llama.cpp/unicode.h +310 -1
- metadata +2 -2
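
The bulk of this release is the vendored llama.cpp update shown in the hunks below: several public enums are renamed (`LLAMA_ROPE_SCALING_TYPE_*`, `LLAMA_POOLING_TYPE_*`, `LLAMA_SPLIT_MODE_*`, `LLAMA_KV_OVERRIDE_TYPE_*`), the KV-cache tensor types become configurable (`type_k` / `type_v`), and a KV-cache defragmentation path is added (`llama_kv_cache_defrag`, `build_defrag`, `llama_build_graph_defrag`). As rough orientation only, here is a minimal, self-contained sketch of the deferred-defrag pattern visible in those hunks; the struct is a hypothetical stand-in, not the real `llama_kv_cache`, and the surrounding update/decode machinery is assumed rather than reproduced.

```cpp
#include <cstdio>

// Hypothetical stand-in for the real llama_kv_cache, which has many more fields.
struct kv_cache_sketch {
    bool do_defrag = false;   // raised by the defrag request, consumed later
};

// Mirrors the new llama_kv_cache_defrag() in the diff: it only raises a flag;
// the hunks suggest the actual cell moves are built as a separate ggml graph
// and executed during decode (see llama_kv_cache_update in the diff).
static void request_defrag(kv_cache_sketch & cache) {
    cache.do_defrag = true;
}

int main() {
    kv_cache_sketch cache;
    request_defrag(cache);
    std::printf("do_defrag = %d\n", cache.do_defrag ? 1 : 0);
    return 0;
}
```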
@@ -68,10 +68,12 @@
|
|
68
68
|
#include <cstdio>
|
69
69
|
#include <cstring>
|
70
70
|
#include <ctime>
|
71
|
+
#include <cwctype>
|
71
72
|
#include <forward_list>
|
72
73
|
#include <fstream>
|
73
74
|
#include <functional>
|
74
75
|
#include <initializer_list>
|
76
|
+
#include <locale>
|
75
77
|
#include <map>
|
76
78
|
#include <memory>
|
77
79
|
#include <mutex>
|
@@ -850,9 +852,9 @@ struct LLM_TN {
|
|
850
852
|
//
|
851
853
|
|
852
854
|
static std::map<int32_t, const char *> LLAMA_ROPE_SCALING_TYPES = {
|
853
|
-
{
|
854
|
-
{
|
855
|
-
{
|
855
|
+
{ LLAMA_ROPE_SCALING_TYPE_NONE, "none" },
|
856
|
+
{ LLAMA_ROPE_SCALING_TYPE_LINEAR, "linear" },
|
857
|
+
{ LLAMA_ROPE_SCALING_TYPE_YARN, "yarn" },
|
856
858
|
};
|
857
859
|
|
858
860
|
static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
|
@@ -862,7 +864,7 @@ static int32_t llama_rope_scaling_type_from_string(const std::string & name) {
|
|
862
864
|
}
|
863
865
|
}
|
864
866
|
|
865
|
-
return
|
867
|
+
return LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED;
|
866
868
|
}
|
867
869
|
|
868
870
|
static std::string gguf_data_to_str(enum gguf_type type, const void * data, int i) {
|
@@ -1550,8 +1552,9 @@ static const size_t MiB = 1024*kiB;
|
|
1550
1552
|
static const size_t GiB = 1024*MiB;
|
1551
1553
|
|
1552
1554
|
struct llama_hparams {
|
1553
|
-
bool
|
1554
|
-
bool
|
1555
|
+
bool vocab_only;
|
1556
|
+
bool rope_finetuned;
|
1557
|
+
|
1555
1558
|
uint32_t n_vocab;
|
1556
1559
|
uint32_t n_ctx_train; // context size the model was trained on
|
1557
1560
|
uint32_t n_embd;
|
@@ -1580,7 +1583,8 @@ struct llama_hparams {
|
|
1580
1583
|
bool causal_attn = true;
|
1581
1584
|
bool need_kq_pos = false;
|
1582
1585
|
|
1583
|
-
|
1586
|
+
enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
|
1587
|
+
enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
|
1584
1588
|
|
1585
1589
|
bool operator!=(const llama_hparams & other) const {
|
1586
1590
|
if (this->vocab_only != other.vocab_only) return true;
|
@@ -1639,8 +1643,8 @@ struct llama_cparams {
|
|
1639
1643
|
float yarn_attn_factor;
|
1640
1644
|
float yarn_beta_fast;
|
1641
1645
|
float yarn_beta_slow;
|
1646
|
+
float defrag_thold;
|
1642
1647
|
|
1643
|
-
bool mul_mat_q;
|
1644
1648
|
bool offload_kqv;
|
1645
1649
|
bool do_pooling;
|
1646
1650
|
|
@@ -1707,11 +1711,20 @@ struct llama_kv_cell {
|
|
1707
1711
|
bool has_seq_id(const llama_seq_id & id) const {
|
1708
1712
|
return seq_id.find(id) != seq_id.end();
|
1709
1713
|
}
|
1714
|
+
|
1715
|
+
bool is_empty() const {
|
1716
|
+
return seq_id.empty();
|
1717
|
+
}
|
1718
|
+
|
1719
|
+
bool is_same_seq(const llama_kv_cell & other) const {
|
1720
|
+
return seq_id == other.seq_id;
|
1721
|
+
}
|
1710
1722
|
};
|
1711
1723
|
|
1712
1724
|
// ring-buffer of cached KV data
|
1713
1725
|
struct llama_kv_cache {
|
1714
1726
|
bool has_shift = false;
|
1727
|
+
bool do_defrag = false;
|
1715
1728
|
|
1716
1729
|
// Note: The value of head isn't only used to optimize searching
|
1717
1730
|
// for a free KV slot. llama_decode_internal also uses it, so it
|
@@ -1723,6 +1736,9 @@ struct llama_kv_cache {
|
|
1723
1736
|
// computed before each graph build
|
1724
1737
|
uint32_t n = 0;
|
1725
1738
|
|
1739
|
+
ggml_type type_k = GGML_TYPE_F16;
|
1740
|
+
ggml_type type_v = GGML_TYPE_F16;
|
1741
|
+
|
1726
1742
|
std::vector<llama_kv_cell> cells;
|
1727
1743
|
|
1728
1744
|
std::vector<struct ggml_tensor *> k_l; // per layer
|
@@ -1958,8 +1974,8 @@ struct llama_context {
|
|
1958
1974
|
static bool llama_kv_cache_init(
|
1959
1975
|
struct llama_kv_cache & cache,
|
1960
1976
|
const llama_model & model,
|
1961
|
-
ggml_type
|
1962
|
-
ggml_type
|
1977
|
+
ggml_type type_k,
|
1978
|
+
ggml_type type_v,
|
1963
1979
|
uint32_t n_ctx,
|
1964
1980
|
bool offload) {
|
1965
1981
|
const struct llama_hparams & hparams = model.hparams;
|
@@ -1974,6 +1990,9 @@ static bool llama_kv_cache_init(
|
|
1974
1990
|
cache.size = n_ctx;
|
1975
1991
|
cache.used = 0;
|
1976
1992
|
|
1993
|
+
cache.type_k = type_k;
|
1994
|
+
cache.type_v = type_v;
|
1995
|
+
|
1977
1996
|
cache.cells.clear();
|
1978
1997
|
cache.cells.resize(n_ctx);
|
1979
1998
|
|
@@ -2014,8 +2033,8 @@ static bool llama_kv_cache_init(
|
|
2014
2033
|
|
2015
2034
|
for (int i = 0; i < (int) n_layer; i++) {
|
2016
2035
|
struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
|
2017
|
-
ggml_tensor * k = ggml_new_tensor_1d(ctx,
|
2018
|
-
ggml_tensor * v = ggml_new_tensor_1d(ctx,
|
2036
|
+
ggml_tensor * k = ggml_new_tensor_1d(ctx, type_k, n_embd_k_gqa*n_ctx);
|
2037
|
+
ggml_tensor * v = ggml_new_tensor_1d(ctx, type_v, n_embd_v_gqa*n_ctx);
|
2019
2038
|
ggml_format_name(k, "cache_k_l%d", i);
|
2020
2039
|
ggml_format_name(v, "cache_v_l%d", i);
|
2021
2040
|
cache.k_l.push_back(k);
|
@@ -2099,7 +2118,7 @@ static bool llama_kv_cache_find_slot(
|
|
2099
2118
|
// find how many cells are currently in use
|
2100
2119
|
static int32_t llama_kv_cache_cell_max(const struct llama_kv_cache & cache) {
|
2101
2120
|
for (uint32_t i = cache.size - 1; i > 0; --i) {
|
2102
|
-
if (cache.cells[i].pos >= 0 && !cache.cells[i].
|
2121
|
+
if (cache.cells[i].pos >= 0 && !cache.cells[i].is_empty()) {
|
2103
2122
|
return i + 1;
|
2104
2123
|
}
|
2105
2124
|
}
|
@@ -2135,7 +2154,7 @@ static void llama_kv_cache_seq_rm(
|
|
2135
2154
|
} else {
|
2136
2155
|
continue;
|
2137
2156
|
}
|
2138
|
-
if (cache.cells[i].
|
2157
|
+
if (cache.cells[i].is_empty()) {
|
2139
2158
|
// keep count of the number of used cells
|
2140
2159
|
if (cache.cells[i].pos >= 0) cache.used--;
|
2141
2160
|
|
@@ -2186,7 +2205,7 @@ static void llama_kv_cache_seq_keep(struct llama_kv_cache & cache, llama_seq_id
|
|
2186
2205
|
if (new_head != cache.size && new_head < cache.head) cache.head = new_head;
|
2187
2206
|
}
|
2188
2207
|
|
2189
|
-
static void
|
2208
|
+
static void llama_kv_cache_seq_add(
|
2190
2209
|
struct llama_kv_cache & cache,
|
2191
2210
|
llama_seq_id seq_id,
|
2192
2211
|
llama_pos p0,
|
@@ -2204,10 +2223,14 @@ static void llama_kv_cache_seq_shift(
|
|
2204
2223
|
cache.cells[i].delta += delta;
|
2205
2224
|
|
2206
2225
|
if (cache.cells[i].pos < 0) {
|
2207
|
-
if (!cache.cells[i].
|
2226
|
+
if (!cache.cells[i].is_empty()) {
|
2227
|
+
cache.used--;
|
2228
|
+
}
|
2208
2229
|
cache.cells[i].pos = -1;
|
2209
2230
|
cache.cells[i].seq_id.clear();
|
2210
|
-
if (new_head == cache.size)
|
2231
|
+
if (new_head == cache.size) {
|
2232
|
+
new_head = i;
|
2233
|
+
}
|
2211
2234
|
}
|
2212
2235
|
}
|
2213
2236
|
}
|
@@ -2239,6 +2262,22 @@ static void llama_kv_cache_seq_div(
|
|
2239
2262
|
}
|
2240
2263
|
}
|
2241
2264
|
|
2265
|
+
static llama_pos llama_kv_cache_seq_pos_max(struct llama_kv_cache & cache, llama_seq_id seq_id) {
|
2266
|
+
llama_pos result = 0;
|
2267
|
+
|
2268
|
+
for (uint32_t i = 0; i < cache.size; ++i) {
|
2269
|
+
if (cache.cells[i].has_seq_id(seq_id)) {
|
2270
|
+
result = std::max(result, cache.cells[i].pos);
|
2271
|
+
}
|
2272
|
+
}
|
2273
|
+
|
2274
|
+
return result;
|
2275
|
+
}
|
2276
|
+
|
2277
|
+
static void llama_kv_cache_defrag(struct llama_kv_cache & cache) {
|
2278
|
+
cache.do_defrag = true;
|
2279
|
+
}
|
2280
|
+
|
2242
2281
|
//
|
2243
2282
|
// model loading and saving
|
2244
2283
|
//
|
@@ -2310,7 +2349,7 @@ namespace GGUFMeta {
|
|
2310
2349
|
}
|
2311
2350
|
};
|
2312
2351
|
|
2313
|
-
struct ArrayInfo{
|
2352
|
+
struct ArrayInfo {
|
2314
2353
|
const gguf_type gt;
|
2315
2354
|
const size_t length;
|
2316
2355
|
const void * data;
|
@@ -2329,7 +2368,7 @@ namespace GGUFMeta {
|
|
2329
2368
|
};
|
2330
2369
|
|
2331
2370
|
template<typename T>
|
2332
|
-
class GKV: public GKV_Base<T> {
|
2371
|
+
class GKV : public GKV_Base<T> {
|
2333
2372
|
GKV() = delete;
|
2334
2373
|
|
2335
2374
|
public:
|
@@ -2345,46 +2384,46 @@ namespace GGUFMeta {
|
|
2345
2384
|
|
2346
2385
|
static const char * override_type_to_str(const llama_model_kv_override_type ty) {
|
2347
2386
|
switch (ty) {
|
2348
|
-
case
|
2349
|
-
case
|
2350
|
-
case
|
2387
|
+
case LLAMA_KV_OVERRIDE_TYPE_BOOL: return "bool";
|
2388
|
+
case LLAMA_KV_OVERRIDE_TYPE_INT: return "int";
|
2389
|
+
case LLAMA_KV_OVERRIDE_TYPE_FLOAT: return "float";
|
2351
2390
|
}
|
2352
2391
|
return "unknown";
|
2353
2392
|
}
|
2354
2393
|
|
2355
|
-
static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override *
|
2356
|
-
if (!
|
2357
|
-
if (
|
2394
|
+
static bool validate_override(const llama_model_kv_override_type expected_type, const struct llama_model_kv_override * ovrd) {
|
2395
|
+
if (!ovrd) { return false; }
|
2396
|
+
if (ovrd->tag == expected_type) {
|
2358
2397
|
LLAMA_LOG_INFO("%s: Using metadata override (%5s) '%s' = ",
|
2359
|
-
__func__, override_type_to_str(
|
2360
|
-
switch (
|
2361
|
-
case
|
2362
|
-
LLAMA_LOG_INFO("%s\n",
|
2398
|
+
__func__, override_type_to_str(ovrd->tag), ovrd->key);
|
2399
|
+
switch (ovrd->tag) {
|
2400
|
+
case LLAMA_KV_OVERRIDE_TYPE_BOOL: {
|
2401
|
+
LLAMA_LOG_INFO("%s\n", ovrd->bool_value ? "true" : "false");
|
2363
2402
|
} break;
|
2364
|
-
case
|
2365
|
-
LLAMA_LOG_INFO("%" PRId64 "\n",
|
2403
|
+
case LLAMA_KV_OVERRIDE_TYPE_INT: {
|
2404
|
+
LLAMA_LOG_INFO("%" PRId64 "\n", ovrd->int_value);
|
2366
2405
|
} break;
|
2367
|
-
case
|
2368
|
-
LLAMA_LOG_INFO("%.6f\n",
|
2406
|
+
case LLAMA_KV_OVERRIDE_TYPE_FLOAT: {
|
2407
|
+
LLAMA_LOG_INFO("%.6f\n", ovrd->float_value);
|
2369
2408
|
} break;
|
2370
2409
|
default:
|
2371
2410
|
// Shouldn't be possible to end up here, but just in case...
|
2372
2411
|
throw std::runtime_error(
|
2373
2412
|
format("Unsupported attempt to override %s type for metadata key %s\n",
|
2374
|
-
override_type_to_str(
|
2413
|
+
override_type_to_str(ovrd->tag), ovrd->key));
|
2375
2414
|
}
|
2376
2415
|
return true;
|
2377
2416
|
}
|
2378
2417
|
LLAMA_LOG_WARN("%s: Warning: Bad metadata override type for key '%s', expected %s but got %s\n",
|
2379
|
-
__func__,
|
2418
|
+
__func__, ovrd->key, override_type_to_str(expected_type), override_type_to_str(ovrd->tag));
|
2380
2419
|
return false;
|
2381
2420
|
}
|
2382
2421
|
|
2383
2422
|
template<typename OT>
|
2384
2423
|
static typename std::enable_if<std::is_same<OT, bool>::value, bool>::type
|
2385
|
-
try_override(OT & target, const struct llama_model_kv_override *
|
2386
|
-
if (validate_override(
|
2387
|
-
target =
|
2424
|
+
try_override(OT & target, const struct llama_model_kv_override * ovrd) {
|
2425
|
+
if (validate_override(LLAMA_KV_OVERRIDE_TYPE_BOOL, ovrd)) {
|
2426
|
+
target = ovrd->bool_value;
|
2388
2427
|
return true;
|
2389
2428
|
}
|
2390
2429
|
return false;
|
@@ -2392,9 +2431,9 @@ namespace GGUFMeta {
|
|
2392
2431
|
|
2393
2432
|
template<typename OT>
|
2394
2433
|
static typename std::enable_if<!std::is_same<OT, bool>::value && std::is_integral<OT>::value, bool>::type
|
2395
|
-
try_override(OT & target, const struct llama_model_kv_override *
|
2396
|
-
if (validate_override(
|
2397
|
-
target =
|
2434
|
+
try_override(OT & target, const struct llama_model_kv_override * ovrd) {
|
2435
|
+
if (validate_override(LLAMA_KV_OVERRIDE_TYPE_INT, ovrd)) {
|
2436
|
+
target = ovrd->int_value;
|
2398
2437
|
return true;
|
2399
2438
|
}
|
2400
2439
|
return false;
|
@@ -2402,9 +2441,9 @@ namespace GGUFMeta {
|
|
2402
2441
|
|
2403
2442
|
template<typename OT>
|
2404
2443
|
static typename std::enable_if<std::is_floating_point<OT>::value, bool>::type
|
2405
|
-
try_override(T & target, const struct llama_model_kv_override *
|
2406
|
-
if (validate_override(
|
2407
|
-
target =
|
2444
|
+
try_override(T & target, const struct llama_model_kv_override * ovrd) {
|
2445
|
+
if (validate_override(LLAMA_KV_OVERRIDE_TYPE_FLOAT, ovrd)) {
|
2446
|
+
target = ovrd->float_value;
|
2408
2447
|
return true;
|
2409
2448
|
}
|
2410
2449
|
return false;
|
@@ -2412,17 +2451,17 @@ namespace GGUFMeta {
|
|
2412
2451
|
|
2413
2452
|
template<typename OT>
|
2414
2453
|
static typename std::enable_if<std::is_same<OT, std::string>::value, bool>::type
|
2415
|
-
try_override(T & target, const struct llama_model_kv_override *
|
2454
|
+
try_override(T & target, const struct llama_model_kv_override * ovrd) {
|
2416
2455
|
(void)target;
|
2417
|
-
(void)
|
2418
|
-
if (!
|
2456
|
+
(void)ovrd;
|
2457
|
+
if (!ovrd) { return false; }
|
2419
2458
|
// Currently, we should never end up here so it would be a bug if we do.
|
2420
2459
|
throw std::runtime_error(format("Unsupported attempt to override string type for metadata key %s\n",
|
2421
|
-
|
2460
|
+
ovrd ? ovrd->key : "NULL"));
|
2422
2461
|
}
|
2423
2462
|
|
2424
|
-
static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override *
|
2425
|
-
if (try_override<T>(target,
|
2463
|
+
static bool set(const gguf_context * ctx, const int k, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
|
2464
|
+
if (try_override<T>(target, ovrd)) {
|
2426
2465
|
return true;
|
2427
2466
|
}
|
2428
2467
|
if (k < 0) { return false; }
|
@@ -2430,12 +2469,12 @@ namespace GGUFMeta {
|
|
2430
2469
|
return true;
|
2431
2470
|
}
|
2432
2471
|
|
2433
|
-
static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override *
|
2434
|
-
return set(ctx, gguf_find_key(ctx, key), target,
|
2472
|
+
static bool set(const gguf_context * ctx, const char * key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
|
2473
|
+
return set(ctx, gguf_find_key(ctx, key), target, ovrd);
|
2435
2474
|
}
|
2436
2475
|
|
2437
|
-
static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override *
|
2438
|
-
return set(ctx, key.c_str(), target,
|
2476
|
+
static bool set(const gguf_context * ctx, const std::string & key, T & target, const struct llama_model_kv_override * ovrd = nullptr) {
|
2477
|
+
return set(ctx, key.c_str(), target, ovrd);
|
2439
2478
|
}
|
2440
2479
|
};
|
2441
2480
|
}
|
@@ -2542,9 +2581,12 @@ struct llama_model_loader {
|
|
2542
2581
|
case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
|
2543
2582
|
case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
|
2544
2583
|
case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
|
2584
|
+
case GGML_TYPE_IQ2_S: ftype = LLAMA_FTYPE_MOSTLY_IQ2_S; break;
|
2545
2585
|
case GGML_TYPE_IQ3_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ3_XXS; break;
|
2546
2586
|
case GGML_TYPE_IQ1_S: ftype = LLAMA_FTYPE_MOSTLY_IQ1_S; break;
|
2547
2587
|
case GGML_TYPE_IQ4_NL: ftype = LLAMA_FTYPE_MOSTLY_IQ4_NL; break;
|
2588
|
+
case GGML_TYPE_IQ4_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ4_XS; break;
|
2589
|
+
case GGML_TYPE_IQ3_S: ftype = LLAMA_FTYPE_MOSTLY_IQ3_S; break;
|
2548
2590
|
default:
|
2549
2591
|
{
|
2550
2592
|
LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
|
@@ -2845,6 +2887,15 @@ struct llama_model_loader {
|
|
2845
2887
|
}
|
2846
2888
|
};
|
2847
2889
|
|
2890
|
+
template<>
|
2891
|
+
bool llama_model_loader::get_key(const enum llm_kv kid, enum llama_pooling_type & result, const bool required) {
|
2892
|
+
uint32_t tmp;
|
2893
|
+
const bool found = get_key(kid, tmp, required);
|
2894
|
+
result = (enum llama_pooling_type) tmp;
|
2895
|
+
return found;
|
2896
|
+
}
|
2897
|
+
|
2898
|
+
|
2848
2899
|
//
|
2849
2900
|
// load LLaMA models
|
2850
2901
|
//
|
@@ -2886,10 +2937,15 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
|
|
2886
2937
|
case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
|
2887
2938
|
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XXS - 2.0625 bpw";
|
2888
2939
|
case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
|
2889
|
-
case
|
2940
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_S: return "IQ2_S - 2.5 bpw";
|
2941
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_M: return "IQ2_M - 2.7 bpw";
|
2942
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_XS: return "IQ3_XS - 3.3 bpw";
|
2890
2943
|
case LLAMA_FTYPE_MOSTLY_IQ3_XXS:return "IQ3_XXS - 3.0625 bpw";
|
2891
2944
|
case LLAMA_FTYPE_MOSTLY_IQ1_S :return "IQ1_S - 1.5625 bpw";
|
2892
2945
|
case LLAMA_FTYPE_MOSTLY_IQ4_NL: return "IQ4_NL - 4.5 bpw";
|
2946
|
+
case LLAMA_FTYPE_MOSTLY_IQ4_XS: return "IQ4_XS - 4.25 bpw";
|
2947
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_S: return "IQ3_S - 3.4375 bpw";
|
2948
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_M: return "IQ3_S mix - 3.66 bpw";
|
2893
2949
|
|
2894
2950
|
default: return "unknown, may not work";
|
2895
2951
|
}
|
@@ -2923,16 +2979,16 @@ static const char * llama_model_type_name(e_model type) {
|
|
2923
2979
|
default: return "?B";
|
2924
2980
|
}
|
2925
2981
|
}
|
2982
|
+
|
2926
2983
|
static const char * llama_model_vocab_type_name(enum llama_vocab_type type){
|
2927
2984
|
switch (type) {
|
2928
|
-
case LLAMA_VOCAB_TYPE_SPM:
|
2929
|
-
case LLAMA_VOCAB_TYPE_BPE:
|
2930
|
-
case LLAMA_VOCAB_TYPE_WPM:
|
2931
|
-
default:
|
2985
|
+
case LLAMA_VOCAB_TYPE_SPM: return "SPM";
|
2986
|
+
case LLAMA_VOCAB_TYPE_BPE: return "BPE";
|
2987
|
+
case LLAMA_VOCAB_TYPE_WPM: return "WPM";
|
2988
|
+
default: return "unknown";
|
2932
2989
|
}
|
2933
2990
|
}
|
2934
2991
|
|
2935
|
-
|
2936
2992
|
static void llm_load_arch(llama_model_loader & ml, llama_model & model) {
|
2937
2993
|
model.arch = ml.get_arch();
|
2938
2994
|
if (model.arch == LLM_ARCH_UNKNOWN) {
|
@@ -2996,7 +3052,7 @@ static void llm_load_hparams(
|
|
2996
3052
|
std::string rope_scaling("linear");
|
2997
3053
|
ml.get_key(LLM_KV_ROPE_SCALING_TYPE, rope_scaling, false);
|
2998
3054
|
hparams.rope_scaling_type_train = llama_rope_scaling_type_from_string(rope_scaling);
|
2999
|
-
GGML_ASSERT(hparams.rope_scaling_type_train !=
|
3055
|
+
GGML_ASSERT(hparams.rope_scaling_type_train != LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED);
|
3000
3056
|
|
3001
3057
|
// rope_freq_scale (inverse of the kv) is optional
|
3002
3058
|
float ropescale = 0.0f;
|
@@ -3109,10 +3165,10 @@ static void llm_load_hparams(
|
|
3109
3165
|
} break;
|
3110
3166
|
case LLM_ARCH_BERT:
|
3111
3167
|
{
|
3112
|
-
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,
|
3113
|
-
ml.get_key(LLM_KV_ATTENTION_CAUSAL,
|
3168
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
3169
|
+
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
3114
3170
|
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
3115
|
-
ml.get_key(LLM_KV_POOLING_TYPE,
|
3171
|
+
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
3116
3172
|
|
3117
3173
|
switch (hparams.n_layer) {
|
3118
3174
|
case 3:
|
@@ -3130,10 +3186,10 @@ static void llm_load_hparams(
|
|
3130
3186
|
} break;
|
3131
3187
|
case LLM_ARCH_NOMIC_BERT:
|
3132
3188
|
{
|
3133
|
-
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS,
|
3134
|
-
ml.get_key(LLM_KV_ATTENTION_CAUSAL,
|
3189
|
+
ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
|
3190
|
+
ml.get_key(LLM_KV_ATTENTION_CAUSAL, hparams.causal_attn);
|
3135
3191
|
ml.get_key(LLM_KV_TOKENIZER_TOKEN_TYPE_COUNT, hparams.n_vocab_type);
|
3136
|
-
ml.get_key(LLM_KV_POOLING_TYPE,
|
3192
|
+
ml.get_key(LLM_KV_POOLING_TYPE, hparams.pooling_type);
|
3137
3193
|
|
3138
3194
|
if (hparams.n_layer == 12 && hparams.n_embd == 768) {
|
3139
3195
|
model.type = e_model::MODEL_137M;
|
@@ -3272,6 +3328,8 @@ static void llm_load_hparams(
|
|
3272
3328
|
if (hparams.f_max_alibi_bias > 0.0f) {
|
3273
3329
|
hparams.need_kq_pos = true;
|
3274
3330
|
}
|
3331
|
+
|
3332
|
+
hparams.rope_type = llama_rope_type(&model);
|
3275
3333
|
}
|
3276
3334
|
|
3277
3335
|
// TODO: This should probably be in llama.h
|
@@ -3574,6 +3632,8 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
|
|
3574
3632
|
LLAMA_LOG_INFO("%s: n_ff = %u\n", __func__, hparams.n_ff);
|
3575
3633
|
LLAMA_LOG_INFO("%s: n_expert = %u\n", __func__, hparams.n_expert);
|
3576
3634
|
LLAMA_LOG_INFO("%s: n_expert_used = %u\n", __func__, hparams.n_expert_used);
|
3635
|
+
LLAMA_LOG_INFO("%s: pooling type = %d\n", __func__, hparams.pooling_type);
|
3636
|
+
LLAMA_LOG_INFO("%s: rope type = %d\n", __func__, hparams.rope_type);
|
3577
3637
|
LLAMA_LOG_INFO("%s: rope scaling = %s\n", __func__, rope_scaling_type);
|
3578
3638
|
LLAMA_LOG_INFO("%s: freq_base_train = %.1f\n", __func__, hparams.rope_freq_base_train);
|
3579
3639
|
LLAMA_LOG_INFO("%s: freq_scale_train = %g\n", __func__, hparams.rope_freq_scale_train);
|
@@ -3640,7 +3700,7 @@ static bool llm_load_tensors(
|
|
3640
3700
|
model.buft_layer[i] = llama_default_buffer_type_cpu(true);
|
3641
3701
|
}
|
3642
3702
|
|
3643
|
-
if (split_mode ==
|
3703
|
+
if (split_mode == LLAMA_SPLIT_MODE_LAYER) {
|
3644
3704
|
// calculate the split points
|
3645
3705
|
int device_count = llama_get_device_count();
|
3646
3706
|
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
|
@@ -3679,10 +3739,10 @@ static bool llm_load_tensors(
|
|
3679
3739
|
}
|
3680
3740
|
} else {
|
3681
3741
|
ggml_backend_buffer_type_t split_buft;
|
3682
|
-
if (split_mode ==
|
3742
|
+
if (split_mode == LLAMA_SPLIT_MODE_ROW) {
|
3683
3743
|
split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
|
3684
3744
|
} else {
|
3685
|
-
//
|
3745
|
+
// LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_LAYER in backends where it is not supported
|
3686
3746
|
split_buft = llama_default_buffer_type_offload(main_gpu);
|
3687
3747
|
}
|
3688
3748
|
// assign the repeating layers
|
@@ -4595,12 +4655,6 @@ static int llama_model_load(const std::string & fname, llama_model & model, llam
|
|
4595
4655
|
|
4596
4656
|
using llm_build_cb = std::function<void(struct ggml_tensor * cur, const char * name, int nl)>;
|
4597
4657
|
|
4598
|
-
enum llm_rope_type {
|
4599
|
-
LLM_ROPE,
|
4600
|
-
LLM_ROPE_NEOX,
|
4601
|
-
LLM_ROPE_GLM,
|
4602
|
-
};
|
4603
|
-
|
4604
4658
|
enum llm_ffn_op_type {
|
4605
4659
|
LLM_FFN_SILU,
|
4606
4660
|
LLM_FFN_GELU,
|
@@ -4646,55 +4700,6 @@ static struct ggml_tensor * llm_build_inp_embd(
|
|
4646
4700
|
return inpL;
|
4647
4701
|
}
|
4648
4702
|
|
4649
|
-
// Persimmon: n_rot = n_embd_head_k/2
|
4650
|
-
// Other: n_rot = n_embd_head_k
|
4651
|
-
static void llm_build_k_shift(
|
4652
|
-
struct ggml_context * ctx,
|
4653
|
-
const llama_hparams & hparams,
|
4654
|
-
const llama_cparams & cparams,
|
4655
|
-
const llama_kv_cache & kv,
|
4656
|
-
struct ggml_cgraph * graph,
|
4657
|
-
struct ggml_tensor * K_shift,
|
4658
|
-
llm_rope_type type,
|
4659
|
-
int64_t n_ctx,
|
4660
|
-
float freq_base,
|
4661
|
-
float freq_scale,
|
4662
|
-
const llm_build_cb & cb) {
|
4663
|
-
const int64_t n_layer = hparams.n_layer;
|
4664
|
-
const int64_t n_head_kv = hparams.n_head_kv;
|
4665
|
-
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
4666
|
-
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
4667
|
-
const int32_t n_rot = hparams.n_rot;
|
4668
|
-
const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
|
4669
|
-
const float ext_factor = cparams.yarn_ext_factor;
|
4670
|
-
const float attn_factor = cparams.yarn_attn_factor;
|
4671
|
-
const float beta_fast = cparams.yarn_beta_fast;
|
4672
|
-
const float beta_slow = cparams.yarn_beta_slow;
|
4673
|
-
|
4674
|
-
int rope_type = 0;
|
4675
|
-
|
4676
|
-
switch (type) {
|
4677
|
-
case LLM_ROPE: rope_type = 0; break;
|
4678
|
-
case LLM_ROPE_NEOX: rope_type = 2; break;
|
4679
|
-
case LLM_ROPE_GLM: rope_type = 4; break;
|
4680
|
-
}
|
4681
|
-
|
4682
|
-
for (int il = 0; il < n_layer; ++il) {
|
4683
|
-
struct ggml_tensor * tmp =
|
4684
|
-
// we rotate only the first n_rot dimensions
|
4685
|
-
ggml_rope_custom_inplace(ctx,
|
4686
|
-
ggml_view_3d(ctx, kv.k_l[il],
|
4687
|
-
n_embd_head_k, n_head_kv, n_ctx,
|
4688
|
-
ggml_row_size(kv.k_l[il]->type, n_embd_head_k),
|
4689
|
-
ggml_row_size(kv.k_l[il]->type, n_embd_k_gqa),
|
4690
|
-
0),
|
4691
|
-
K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
4692
|
-
ext_factor, attn_factor, beta_fast, beta_slow);
|
4693
|
-
cb(tmp, "K_shifted", il);
|
4694
|
-
ggml_build_forward_expand(graph, tmp);
|
4695
|
-
}
|
4696
|
-
}
|
4697
|
-
|
4698
4703
|
static void llm_build_kv_store(
|
4699
4704
|
struct ggml_context * ctx,
|
4700
4705
|
const llama_hparams & hparams,
|
@@ -4896,8 +4901,8 @@ static struct ggml_tensor * llm_build_kqv(
|
|
4896
4901
|
ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
|
4897
4902
|
}
|
4898
4903
|
|
4899
|
-
#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE)
|
4900
|
-
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan,
|
4904
|
+
#if defined(GGML_USE_VULKAN) || defined(GGML_USE_KOMPUTE)
|
4905
|
+
#pragma message("TODO: ALiBi support in ggml_soft_max_ext is not implemented for Vulkan, and Kompute")
|
4901
4906
|
#pragma message(" Falling back to ggml_alibi(). Will become an error in Mar 2024")
|
4902
4907
|
#pragma message("ref: https://github.com/ggerganov/llama.cpp/pull/5488")
|
4903
4908
|
if (hparams.f_max_alibi_bias > 0.0f) {
|
@@ -4998,6 +5003,7 @@ struct llm_build_context {
|
|
4998
5003
|
|
4999
5004
|
const int64_t n_embd;
|
5000
5005
|
const int64_t n_layer;
|
5006
|
+
const int64_t n_rot;
|
5001
5007
|
const int64_t n_ctx; // user-specified context size (can be different from n_ctx_train)
|
5002
5008
|
const int64_t n_head;
|
5003
5009
|
const int64_t n_head_kv;
|
@@ -5022,8 +5028,8 @@ struct llm_build_context {
|
|
5022
5028
|
const int32_t kv_head; // index of where we store new KV data in the cache
|
5023
5029
|
const int32_t n_orig_ctx;
|
5024
5030
|
|
5025
|
-
const
|
5026
|
-
const
|
5031
|
+
const enum llama_pooling_type pooling_type;
|
5032
|
+
const enum llama_rope_type rope_type;
|
5027
5033
|
|
5028
5034
|
const llm_build_cb & cb;
|
5029
5035
|
|
@@ -5045,6 +5051,7 @@ struct llm_build_context {
|
|
5045
5051
|
kv_self (lctx.kv_self),
|
5046
5052
|
n_embd (hparams.n_embd),
|
5047
5053
|
n_layer (hparams.n_layer),
|
5054
|
+
n_rot (hparams.n_rot),
|
5048
5055
|
n_ctx (cparams.n_ctx),
|
5049
5056
|
n_head (hparams.n_head),
|
5050
5057
|
n_head_kv (hparams.n_head_kv),
|
@@ -5066,8 +5073,8 @@ struct llm_build_context {
|
|
5066
5073
|
n_kv (worst_case ? n_ctx : kv_self.n),
|
5067
5074
|
kv_head (worst_case ? n_ctx - n_tokens : kv_self.head),
|
5068
5075
|
n_orig_ctx (cparams.n_yarn_orig_ctx),
|
5069
|
-
|
5070
|
-
|
5076
|
+
pooling_type (cparams.do_pooling ? hparams.pooling_type : LLAMA_POOLING_TYPE_NONE),
|
5077
|
+
rope_type (hparams.rope_type),
|
5071
5078
|
cb (cb),
|
5072
5079
|
buf_compute_meta (lctx.buf_compute_meta) {
|
5073
5080
|
// all initializations should be done in init()
|
@@ -5090,6 +5097,76 @@ struct llm_build_context {
|
|
5090
5097
|
}
|
5091
5098
|
}
|
5092
5099
|
|
5100
|
+
struct ggml_cgraph * build_k_shift() {
|
5101
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5102
|
+
|
5103
|
+
for (int il = 0; il < n_layer; ++il) {
|
5104
|
+
struct ggml_tensor * tmp =
|
5105
|
+
// we rotate only the first n_rot dimensions
|
5106
|
+
ggml_rope_custom_inplace(ctx0,
|
5107
|
+
ggml_view_3d(ctx0, kv_self.k_l[il],
|
5108
|
+
n_embd_head_k, n_head_kv, n_ctx,
|
5109
|
+
ggml_row_size(kv_self.k_l[il]->type, n_embd_head_k),
|
5110
|
+
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
5111
|
+
0),
|
5112
|
+
lctx.inp_K_shift, n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
5113
|
+
ext_factor, attn_factor, beta_fast, beta_slow);
|
5114
|
+
cb(tmp, "K_shifted", il);
|
5115
|
+
ggml_build_forward_expand(gf, tmp);
|
5116
|
+
}
|
5117
|
+
|
5118
|
+
return gf;
|
5119
|
+
}
|
5120
|
+
|
5121
|
+
struct ggml_cgraph * build_defrag(const std::vector<uint32_t> & ids) {
|
5122
|
+
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5123
|
+
|
5124
|
+
for (uint32_t i = 0; i < ids.size(); ++i) {
|
5125
|
+
const uint32_t id = ids[i];
|
5126
|
+
|
5127
|
+
if (i == id || id == ids.size()) {
|
5128
|
+
continue;
|
5129
|
+
}
|
5130
|
+
|
5131
|
+
uint32_t nm = 1;
|
5132
|
+
|
5133
|
+
while (i + nm < ids.size() && ids[i + nm] == id + nm) {
|
5134
|
+
nm++;
|
5135
|
+
}
|
5136
|
+
|
5137
|
+
for (int il = 0; il < n_layer; ++il) {
|
5138
|
+
ggml_tensor * view_k_src = ggml_view_2d(ctx0, kv_self.k_l[il],
|
5139
|
+
n_embd_k_gqa, nm,
|
5140
|
+
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
5141
|
+
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*i));
|
5142
|
+
|
5143
|
+
ggml_tensor * view_k_dst = ggml_view_2d(ctx0, kv_self.k_l[il],
|
5144
|
+
n_embd_k_gqa, nm,
|
5145
|
+
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa),
|
5146
|
+
ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*id));
|
5147
|
+
|
5148
|
+
ggml_tensor * view_v_src = ggml_view_2d(ctx0, kv_self.v_l[il],
|
5149
|
+
nm, n_embd_v_gqa,
|
5150
|
+
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
|
5151
|
+
ggml_row_size(kv_self.v_l[il]->type, i));
|
5152
|
+
|
5153
|
+
ggml_tensor * view_v_dst = ggml_view_2d(ctx0, kv_self.v_l[il],
|
5154
|
+
nm, n_embd_v_gqa,
|
5155
|
+
ggml_row_size(kv_self.v_l[il]->type, kv_self.size),
|
5156
|
+
ggml_row_size(kv_self.v_l[il]->type, id));
|
5157
|
+
|
5158
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_k_src, view_k_dst));
|
5159
|
+
ggml_build_forward_expand(gf, ggml_cpy(ctx0, view_v_src, view_v_dst));
|
5160
|
+
}
|
5161
|
+
|
5162
|
+
i += nm - 1;
|
5163
|
+
}
|
5164
|
+
|
5165
|
+
//LLAMA_LOG_INFO("gf->n_nodes = %d\n", gf->n_nodes);
|
5166
|
+
|
5167
|
+
return gf;
|
5168
|
+
}
|
5169
|
+
|
5093
5170
|
struct ggml_cgraph * build_llama() {
|
5094
5171
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
5095
5172
|
|
@@ -5111,11 +5188,6 @@ struct llm_build_context {
|
|
5111
5188
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
5112
5189
|
cb(KQ_mask, "KQ_mask", -1);
|
5113
5190
|
|
5114
|
-
// shift the entire K-cache if needed
|
5115
|
-
if (do_rope_shift) {
|
5116
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
5117
|
-
}
|
5118
|
-
|
5119
5191
|
for (int il = 0; il < n_layer; ++il) {
|
5120
5192
|
struct ggml_tensor * inpSA = inpL;
|
5121
5193
|
|
@@ -5151,14 +5223,14 @@ struct llm_build_context {
|
|
5151
5223
|
|
5152
5224
|
Qcur = ggml_rope_custom(
|
5153
5225
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
5154
|
-
|
5226
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
5155
5227
|
ext_factor, attn_factor, beta_fast, beta_slow
|
5156
5228
|
);
|
5157
5229
|
cb(Qcur, "Qcur", il);
|
5158
5230
|
|
5159
5231
|
Kcur = ggml_rope_custom(
|
5160
5232
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
5161
|
-
|
5233
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
5162
5234
|
ext_factor, attn_factor, beta_fast, beta_slow
|
5163
5235
|
);
|
5164
5236
|
cb(Kcur, "Kcur", il);
|
@@ -5299,11 +5371,6 @@ struct llm_build_context {
|
|
5299
5371
|
struct ggml_tensor * KQ_pos = ggml_view_1d(ctx0, lctx.inp_KQ_pos, n_kv, 0);
|
5300
5372
|
cb(KQ_pos, "KQ_pos", -1);
|
5301
5373
|
|
5302
|
-
// shift the entire K-cache if needed
|
5303
|
-
if (do_rope_shift) {
|
5304
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
5305
|
-
}
|
5306
|
-
|
5307
5374
|
for (int il = 0; il < n_layer; ++il) {
|
5308
5375
|
struct ggml_tensor * inpSA = inpL;
|
5309
5376
|
|
@@ -5327,12 +5394,12 @@ struct llm_build_context {
|
|
5327
5394
|
case MODEL_7B:
|
5328
5395
|
Qcur = ggml_rope_custom(
|
5329
5396
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
5330
|
-
|
5397
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
5331
5398
|
ext_factor, attn_factor, beta_fast, beta_slow
|
5332
5399
|
);
|
5333
5400
|
Kcur = ggml_rope_custom(
|
5334
5401
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
5335
|
-
|
5402
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
5336
5403
|
ext_factor, attn_factor, beta_fast, beta_slow
|
5337
5404
|
);
|
5338
5405
|
break;
|
@@ -5417,11 +5484,6 @@ struct llm_build_context {
|
|
5417
5484
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
5418
5485
|
cb(KQ_mask, "KQ_mask", -1);
|
5419
5486
|
|
5420
|
-
// shift the entire K-cache if needed
|
5421
|
-
if (do_rope_shift) {
|
5422
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
5423
|
-
}
|
5424
|
-
|
5425
5487
|
for (int il = 0; il < n_layer; ++il) {
|
5426
5488
|
struct ggml_tensor * attn_norm;
|
5427
5489
|
|
@@ -5460,13 +5522,13 @@ struct llm_build_context {
|
|
5460
5522
|
|
5461
5523
|
// using mode = 2 for neox mode
|
5462
5524
|
Qcur = ggml_rope_custom(
|
5463
|
-
ctx0, Qcur, inp_pos,
|
5525
|
+
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
5464
5526
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
5465
5527
|
);
|
5466
5528
|
cb(Qcur, "Qcur", il);
|
5467
5529
|
|
5468
5530
|
Kcur = ggml_rope_custom(
|
5469
|
-
ctx0, Kcur, inp_pos,
|
5531
|
+
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
5470
5532
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
5471
5533
|
);
|
5472
5534
|
cb(Kcur, "Kcur", il);
|
@@ -5636,10 +5698,6 @@ struct llm_build_context {
|
|
5636
5698
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
5637
5699
|
cb(KQ_mask, "KQ_mask", -1);
|
5638
5700
|
|
5639
|
-
if (do_rope_shift) {
|
5640
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
5641
|
-
}
|
5642
|
-
|
5643
5701
|
for (int il = 0; il < n_layer; ++il) {
|
5644
5702
|
struct ggml_tensor * residual = inpL;
|
5645
5703
|
|
@@ -5697,7 +5755,7 @@ struct llm_build_context {
|
|
5697
5755
|
|
5698
5756
|
// RoPE the first n_rot of q/k, pass the other half, and concat.
|
5699
5757
|
struct ggml_tensor * qrot = ggml_view_3d(
|
5700
|
-
ctx0, tmpq,
|
5758
|
+
ctx0, tmpq, n_rot, n_head, n_tokens,
|
5701
5759
|
ggml_element_size(tmpq) * n_embd_head,
|
5702
5760
|
ggml_element_size(tmpq) * n_embd_head * n_head,
|
5703
5761
|
0
|
@@ -5705,7 +5763,7 @@ struct llm_build_context {
|
|
5705
5763
|
cb(qrot, "qrot", il);
|
5706
5764
|
|
5707
5765
|
struct ggml_tensor * krot = ggml_view_3d(
|
5708
|
-
ctx0, tmpk,
|
5766
|
+
ctx0, tmpk, n_rot, n_head, n_tokens,
|
5709
5767
|
ggml_element_size(tmpk) * n_embd_head,
|
5710
5768
|
ggml_element_size(tmpk) * n_embd_head * n_head,
|
5711
5769
|
0
|
@@ -5714,29 +5772,29 @@ struct llm_build_context {
|
|
5714
5772
|
|
5715
5773
|
// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
|
5716
5774
|
struct ggml_tensor * qpass = ggml_view_3d(
|
5717
|
-
ctx0, tmpq,
|
5775
|
+
ctx0, tmpq, n_rot, n_head, n_tokens,
|
5718
5776
|
ggml_element_size(tmpq) * n_embd_head,
|
5719
5777
|
ggml_element_size(tmpq) * n_embd_head * n_head,
|
5720
|
-
ggml_element_size(tmpq) *
|
5778
|
+
ggml_element_size(tmpq) * n_rot
|
5721
5779
|
);
|
5722
5780
|
cb(qpass, "qpass", il);
|
5723
5781
|
|
5724
5782
|
struct ggml_tensor * kpass = ggml_view_3d(
|
5725
|
-
ctx0, tmpk,
|
5783
|
+
ctx0, tmpk, n_rot, n_head, n_tokens,
|
5726
5784
|
ggml_element_size(tmpk) * n_embd_head,
|
5727
5785
|
ggml_element_size(tmpk) * n_embd_head * n_head,
|
5728
|
-
ggml_element_size(tmpk) *
|
5786
|
+
ggml_element_size(tmpk) * n_rot
|
5729
5787
|
);
|
5730
5788
|
cb(kpass, "kpass", il);
|
5731
5789
|
|
5732
5790
|
struct ggml_tensor * qrotated = ggml_rope_custom(
|
5733
|
-
ctx0, qrot, inp_pos,
|
5791
|
+
ctx0, qrot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
5734
5792
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
5735
5793
|
);
|
5736
5794
|
cb(qrotated, "qrotated", il);
|
5737
5795
|
|
5738
5796
|
struct ggml_tensor * krotated = ggml_rope_custom(
|
5739
|
-
ctx0, krot, inp_pos,
|
5797
|
+
ctx0, krot, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
5740
5798
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
5741
5799
|
);
|
5742
5800
|
cb(krotated, "krotated", il);
|
@@ -5988,14 +6046,14 @@ struct llm_build_context {
|
|
5988
6046
|
|
5989
6047
|
Qcur = ggml_rope_custom(
|
5990
6048
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
5991
|
-
|
6049
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
5992
6050
|
ext_factor, attn_factor, beta_fast, beta_slow
|
5993
6051
|
);
|
5994
6052
|
cb(Qcur, "Qcur", il);
|
5995
6053
|
|
5996
6054
|
Kcur = ggml_rope_custom(
|
5997
6055
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
5998
|
-
|
6056
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
5999
6057
|
ext_factor, attn_factor, beta_fast, beta_slow
|
6000
6058
|
);
|
6001
6059
|
cb(Kcur, "Kcur", il);
|
@@ -6047,12 +6105,12 @@ struct llm_build_context {
|
|
6047
6105
|
cur = inpL;
|
6048
6106
|
|
6049
6107
|
// pooling layer
|
6050
|
-
if (pooling_type ==
|
6108
|
+
if (pooling_type == LLAMA_POOLING_TYPE_MEAN) {
|
6051
6109
|
cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, cur)), inp_mean);
|
6052
|
-
} else if (pooling_type ==
|
6110
|
+
} else if (pooling_type == LLAMA_POOLING_TYPE_CLS) {
|
6053
6111
|
cur = ggml_get_rows(ctx0, cur, inp_cls);
|
6054
6112
|
} else {
|
6055
|
-
GGML_ASSERT(pooling_type ==
|
6113
|
+
GGML_ASSERT(pooling_type == LLAMA_POOLING_TYPE_NONE && "Invalid pooling type");
|
6056
6114
|
}
|
6057
6115
|
cb(cur, "result_embd", -1);
|
6058
6116
|
|
@@ -6284,11 +6342,6 @@ struct llm_build_context {
|
|
6284
6342
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
6285
6343
|
cb(KQ_mask, "KQ_mask", -1);
|
6286
6344
|
|
6287
|
-
// shift the entire K-cache if needed
|
6288
|
-
if (do_rope_shift) {
|
6289
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
6290
|
-
}
|
6291
|
-
|
6292
6345
|
for (int il = 0; il < n_layer; ++il) {
|
6293
6346
|
struct ggml_tensor * inpSA = inpL;
|
6294
6347
|
|
@@ -6325,14 +6378,14 @@ struct llm_build_context {
|
|
6325
6378
|
|
6326
6379
|
Qcur = ggml_rope_custom(
|
6327
6380
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6328
|
-
|
6381
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6329
6382
|
ext_factor, attn_factor, beta_fast, beta_slow
|
6330
6383
|
);
|
6331
6384
|
cb(Qcur, "Qcur", il);
|
6332
6385
|
|
6333
6386
|
Kcur = ggml_rope_custom(
|
6334
6387
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6335
|
-
|
6388
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6336
6389
|
ext_factor, attn_factor, beta_fast, beta_slow
|
6337
6390
|
);
|
6338
6391
|
cb(Kcur, "Kcur", il);
|
@@ -6407,11 +6460,6 @@ struct llm_build_context {
|
|
6407
6460
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
6408
6461
|
cb(KQ_mask, "KQ_mask", -1);
|
6409
6462
|
|
6410
|
-
// shift the entire K-cache if needed
|
6411
|
-
if (do_rope_shift) {
|
6412
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
6413
|
-
}
|
6414
|
-
|
6415
6463
|
for (int il = 0; il < n_layer; ++il) {
|
6416
6464
|
struct ggml_tensor * inpSA = inpL;
|
6417
6465
|
|
@@ -6441,13 +6489,13 @@ struct llm_build_context {
|
|
6441
6489
|
|
6442
6490
|
// using mode = 2 for neox mode
|
6443
6491
|
Qcur = ggml_rope_custom(
|
6444
|
-
ctx0, Qcur, inp_pos,
|
6492
|
+
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
6445
6493
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
6446
6494
|
);
|
6447
6495
|
cb(Qcur, "Qcur", il);
|
6448
6496
|
|
6449
6497
|
Kcur = ggml_rope_custom(
|
6450
|
-
ctx0, Kcur, inp_pos,
|
6498
|
+
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
6451
6499
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
6452
6500
|
);
|
6453
6501
|
cb(Kcur, "Kcur", il);
|
@@ -6521,11 +6569,6 @@ struct llm_build_context {
|
|
6521
6569
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
6522
6570
|
cb(KQ_mask, "KQ_mask", -1);
|
6523
6571
|
|
6524
|
-
// shift the entire K-cache if needed
|
6525
|
-
if (do_rope_shift) {
|
6526
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
6527
|
-
}
|
6528
|
-
|
6529
6572
|
for (int il = 0; il < n_layer; ++il) {
|
6530
6573
|
struct ggml_tensor * inpSA = inpL;
|
6531
6574
|
|
@@ -6561,14 +6604,14 @@ struct llm_build_context {
|
|
6561
6604
|
|
6562
6605
|
Qcur = ggml_rope_custom(
|
6563
6606
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
6564
|
-
|
6607
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6565
6608
|
ext_factor, attn_factor, beta_fast, beta_slow
|
6566
6609
|
);
|
6567
6610
|
cb(Qcur, "Qcur", il);
|
6568
6611
|
|
6569
6612
|
Kcur = ggml_rope_custom(
|
6570
6613
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
6571
|
-
|
6614
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6572
6615
|
ext_factor, attn_factor, beta_fast, beta_slow
|
6573
6616
|
);
|
6574
6617
|
cb(Kcur, "Kcur", il);
|
@@ -6642,11 +6685,6 @@ struct llm_build_context {
|
|
6642
6685
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
6643
6686
|
cb(KQ_mask, "KQ_mask", -1);
|
6644
6687
|
|
6645
|
-
// shift the entire K-cache if needed
|
6646
|
-
if (do_rope_shift) {
|
6647
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
6648
|
-
}
|
6649
|
-
|
6650
6688
|
for (int il = 0; il < n_layer; ++il) {
|
6651
6689
|
attn_norm_output = llm_build_norm(ctx0, inpL, hparams,
|
6652
6690
|
model.layers[il].attn_norm,
|
@@ -6684,7 +6722,7 @@ struct llm_build_context {
|
|
6684
6722
|
Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
|
6685
6723
|
|
6686
6724
|
Qcur = ggml_rope_custom(
|
6687
|
-
ctx0, Qcur, inp_pos,
|
6725
|
+
ctx0, Qcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
6688
6726
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
6689
6727
|
);
|
6690
6728
|
cb(Qcur, "Qcur", il);
|
@@ -6695,7 +6733,7 @@ struct llm_build_context {
|
|
6695
6733
|
cb(Qcur, "Qcur", il);
|
6696
6734
|
|
6697
6735
|
Kcur = ggml_rope_custom(
|
6698
|
-
ctx0, Kcur, inp_pos,
|
6736
|
+
ctx0, Kcur, inp_pos, n_rot, rope_type, 0, n_orig_ctx,
|
6699
6737
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
6700
6738
|
);
|
6701
6739
|
cb(Kcur, "Kcur", il);
|
@@ -6764,11 +6802,6 @@ struct llm_build_context {
|
|
6764
6802
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
6765
6803
|
cb(KQ_mask, "KQ_mask", -1);
|
6766
6804
|
|
6767
|
-
// shift the entire K-cache if needed
|
6768
|
-
if (do_rope_shift) {
|
6769
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
6770
|
-
}
|
6771
|
-
|
6772
6805
|
for (int il = 0; il < n_layer; ++il) {
|
6773
6806
|
|
6774
6807
|
// norm
|
@@ -6792,14 +6825,14 @@ struct llm_build_context {
|
|
6792
6825
|
cb(Vcur, "Vcur", il);
|
6793
6826
|
|
6794
6827
|
Qcur = ggml_rope_custom(
|
6795
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur,
|
6796
|
-
n_embd_head,
|
6828
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, n_rot, n_head, n_tokens), inp_pos,
|
6829
|
+
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6797
6830
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
6798
6831
|
cb(Qcur, "Qcur", il);
|
6799
6832
|
|
6800
6833
|
Kcur = ggml_rope_custom(
|
6801
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur,
|
6802
|
-
n_embd_head,
|
6834
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, n_rot, n_head_kv, n_tokens), inp_pos,
|
6835
|
+
n_embd_head, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
6803
6836
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
6804
6837
|
cb(Kcur, "Kcur", il);
|
6805
6838
|
|
@@ -6969,11 +7002,6 @@ struct llm_build_context {
|
|
6969
7002
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
6970
7003
|
cb(KQ_mask, "KQ_mask", -1);
|
6971
7004
|
|
6972
|
-
// shift the entire K-cache if needed
|
6973
|
-
if (do_rope_shift) {
|
6974
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
6975
|
-
}
|
6976
|
-
|
6977
7005
|
for (int il = 0; il < n_layer; ++il) {
|
6978
7006
|
cur = llm_build_norm(ctx0, inpL, hparams,
|
6979
7007
|
model.layers[il].attn_norm,
|
@@ -6999,14 +7027,14 @@ struct llm_build_context {
|
|
6999
7027
|
|
7000
7028
|
struct ggml_tensor * Qcur = ggml_rope_custom(
|
7001
7029
|
ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, n_tokens), inp_pos,
|
7002
|
-
|
7030
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7003
7031
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7004
7032
|
);
|
7005
7033
|
cb(Qcur, "Qcur", il);
|
7006
7034
|
|
7007
7035
|
struct ggml_tensor * Kcur = ggml_rope_custom(
|
7008
7036
|
ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7009
|
-
|
7037
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7010
7038
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7011
7039
|
);
|
7012
7040
|
cb(Kcur, "Kcur", il);
|
@@ -7077,11 +7105,6 @@ struct llm_build_context {
|
|
7077
7105
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
7078
7106
|
cb(KQ_mask, "KQ_mask", -1);
|
7079
7107
|
|
7080
|
-
// shift the entire K-cache if needed
|
7081
|
-
if (do_rope_shift) {
|
7082
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
7083
|
-
}
|
7084
|
-
|
7085
7108
|
for (int il = 0; il < n_layer; ++il) {
|
7086
7109
|
struct ggml_tensor * inpSA = inpL;
|
7087
7110
|
|
@@ -7117,14 +7140,14 @@ struct llm_build_context {
|
|
7117
7140
|
|
7118
7141
|
Qcur = ggml_rope_custom(
|
7119
7142
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7120
|
-
|
7143
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7121
7144
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7122
7145
|
);
|
7123
7146
|
cb(Qcur, "Qcur", il);
|
7124
7147
|
|
7125
7148
|
Kcur = ggml_rope_custom(
|
7126
7149
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7127
|
-
|
7150
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7128
7151
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7129
7152
|
);
|
7130
7153
|
cb(Kcur, "Kcur", il);
|
@@ -7196,11 +7219,6 @@ struct llm_build_context {
|
|
7196
7219
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
7197
7220
|
cb(KQ_mask, "KQ_mask", -1);
|
7198
7221
|
|
7199
|
-
// shift the entire K-cache if needed
|
7200
|
-
if (do_rope_shift) {
|
7201
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
7202
|
-
}
|
7203
|
-
|
7204
7222
|
for (int il = 0; il < n_layer; ++il) {
|
7205
7223
|
struct ggml_tensor * inpSA = inpL;
|
7206
7224
|
|
@@ -7236,14 +7254,14 @@ struct llm_build_context {
|
|
7236
7254
|
|
7237
7255
|
Qcur = ggml_rope_custom(
|
7238
7256
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7239
|
-
|
7257
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7240
7258
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7241
7259
|
);
|
7242
7260
|
cb(Qcur, "Qcur", il);
|
7243
7261
|
|
7244
7262
|
Kcur = ggml_rope_custom(
|
7245
7263
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7246
|
-
|
7264
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7247
7265
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7248
7266
|
);
|
7249
7267
|
cb(Kcur, "Kcur", il);
|
@@ -7328,11 +7346,6 @@ struct llm_build_context {
|
|
7328
7346
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
7329
7347
|
cb(KQ_mask, "KQ_mask", -1);
|
7330
7348
|
|
7331
|
-
// shift the entire K-cache if needed
|
7332
|
-
if (do_rope_shift) {
|
7333
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
7334
|
-
}
|
7335
|
-
|
7336
7349
|
for (int il = 0; il < n_layer; ++il) {
|
7337
7350
|
struct ggml_tensor * inpSA = inpL;
|
7338
7351
|
|
@@ -7368,14 +7381,14 @@ struct llm_build_context {
|
|
7368
7381
|
|
7369
7382
|
Qcur = ggml_rope_custom(
|
7370
7383
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
7371
|
-
|
7384
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7372
7385
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7373
7386
|
);
|
7374
7387
|
cb(Qcur, "Qcur", il);
|
7375
7388
|
|
7376
7389
|
Kcur = ggml_rope_custom(
|
7377
7390
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
7378
|
-
|
7391
|
+
n_rot, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7379
7392
|
ext_factor, attn_factor, beta_fast, beta_slow
|
7380
7393
|
);
|
7381
7394
|
cb(Kcur, "Kcur", il);
|
@@ -7464,11 +7477,6 @@ struct llm_build_context {
|
|
7464
7477
|
struct ggml_tensor * KQ_mask = ggml_view_2d(ctx0, lctx.inp_KQ_mask, n_kv, n_tokens, n_kv*ggml_type_size(lctx.inp_KQ_mask->type), 0);
|
7465
7478
|
cb(KQ_mask, "KQ_mask", -1);
|
7466
7479
|
|
7467
|
-
// shift the entire K-cache if needed
|
7468
|
-
if (do_rope_shift) {
|
7469
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, lctx.inp_K_shift, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
7470
|
-
}
|
7471
|
-
|
7472
7480
|
for (int il = 0; il < n_layer; ++il) {
|
7473
7481
|
|
7474
7482
|
// norm
|
@@ -7491,7 +7499,7 @@ struct llm_build_context {
|
|
7491
7499
|
|
7492
7500
|
Qcur = ggml_rope_custom(
|
7493
7501
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head_k, n_head, n_tokens), inp_pos,
|
7494
|
-
n_embd_head_k,
|
7502
|
+
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7495
7503
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
7496
7504
|
cb(Qcur, "Qcur", il);
|
7497
7505
|
|
@@ -7500,7 +7508,7 @@ struct llm_build_context {
|
|
7500
7508
|
|
7501
7509
|
Kcur = ggml_rope_custom(
|
7502
7510
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head_k, n_head_kv, n_tokens), inp_pos,
|
7503
|
-
n_embd_head_k,
|
7511
|
+
n_embd_head_k, rope_type, 0, n_orig_ctx, freq_base, freq_scale,
|
7504
7512
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
7505
7513
|
cb(Kcur, "Kcur", il);
|
7506
7514
|
|
@@ -7553,6 +7561,40 @@ struct llm_build_context {
|
|
7553
7561
|
}
|
7554
7562
|
};
|
7555
7563
|
|
7564
|
+
static struct ggml_cgraph * llama_build_graph_defrag(llama_context & lctx, const std::vector<uint32_t> & ids) {
|
7565
|
+
llama_batch dummy;
|
7566
|
+
dummy.n_tokens = 0;
|
7567
|
+
|
7568
|
+
llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
|
7569
|
+
|
7570
|
+
struct llm_build_context llm(lctx, dummy, cb, false);
|
7571
|
+
|
7572
|
+
llm.init();
|
7573
|
+
|
7574
|
+
struct ggml_cgraph * result = llm.build_defrag(ids);
|
7575
|
+
|
7576
|
+
llm.free();
|
7577
|
+
|
7578
|
+
return result;
|
7579
|
+
}
|
7580
|
+
|
7581
|
+
static struct ggml_cgraph * llama_build_graph_k_shift(llama_context & lctx) {
|
7582
|
+
llama_batch dummy;
|
7583
|
+
dummy.n_tokens = 0;
|
7584
|
+
|
7585
|
+
llm_build_cb cb = [&](struct ggml_tensor * , const char * , int ) { };
|
7586
|
+
|
7587
|
+
struct llm_build_context llm(lctx, dummy, cb, false);
|
7588
|
+
|
7589
|
+
llm.init();
|
7590
|
+
|
7591
|
+
struct ggml_cgraph * result = llm.build_k_shift();
|
7592
|
+
|
7593
|
+
llm.free();
|
7594
|
+
|
7595
|
+
return result;
|
7596
|
+
}
|
7597
|
+
|
7556
7598
|
static struct ggml_cgraph * llama_build_graph(
|
7557
7599
|
llama_context & lctx,
|
7558
7600
|
const llama_batch & batch,
|
@@ -7672,6 +7714,20 @@ static struct ggml_cgraph * llama_build_graph(
|
|
7672
7714
|
return result;
|
7673
7715
|
}
|
7674
7716
|
|
7717
|
+
static void llama_set_k_shift(llama_context & lctx) {
|
7718
|
+
const auto & cparams = lctx.cparams;
|
7719
|
+
|
7720
|
+
const int64_t n_ctx = cparams.n_ctx;
|
7721
|
+
|
7722
|
+
assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
7723
|
+
|
7724
|
+
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
7725
|
+
|
7726
|
+
for (int i = 0; i < n_ctx; ++i) {
|
7727
|
+
data[i] = lctx.kv_self.cells[i].delta;
|
7728
|
+
}
|
7729
|
+
}
|
7730
|
+
|
7675
7731
|
static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
7676
7732
|
//
|
7677
7733
|
// set input data
|
@@ -7739,19 +7795,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
7739
7795
|
}
|
7740
7796
|
}
|
7741
7797
|
|
7742
|
-
if (
|
7743
|
-
const int64_t n_ctx = cparams.n_ctx;
|
7744
|
-
|
7745
|
-
assert(ggml_backend_buffer_is_host(lctx.inp_K_shift->buffer));
|
7746
|
-
|
7747
|
-
int32_t * data = (int32_t *) lctx.inp_K_shift->data;
|
7748
|
-
|
7749
|
-
for (int i = 0; i < n_ctx; ++i) {
|
7750
|
-
data[i] = lctx.kv_self.cells[i].delta;
|
7751
|
-
}
|
7752
|
-
}
|
7753
|
-
|
7754
|
-
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_MEAN) {
|
7798
|
+
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_MEAN) {
|
7755
7799
|
const int64_t n_tokens = batch.n_tokens;
|
7756
7800
|
|
7757
7801
|
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_mean->buffer));
|
@@ -7779,7 +7823,7 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
7779
7823
|
}
|
7780
7824
|
}
|
7781
7825
|
|
7782
|
-
if (cparams.do_pooling && hparams.pooling_type ==
|
7826
|
+
if (cparams.do_pooling && hparams.pooling_type == LLAMA_POOLING_TYPE_CLS) {
|
7783
7827
|
const int64_t n_tokens = batch.n_tokens;
|
7784
7828
|
|
7785
7829
|
GGML_ASSERT(ggml_backend_buffer_is_host(lctx.inp_cls->buffer));
|
@@ -7795,6 +7839,34 @@ static void llama_set_inputs(llama_context & lctx, const llama_batch & batch) {
|
|
7795
7839
|
}
|
7796
7840
|
}
|
7797
7841
|
|
7842
|
+
static void llama_graph_compute(
|
7843
|
+
llama_context & lctx,
|
7844
|
+
ggml_cgraph * gf,
|
7845
|
+
int n_threads) {
|
7846
|
+
#ifdef GGML_USE_MPI
|
7847
|
+
const int64_t n_layer = lctx.model.hparams.n_layer;
|
7848
|
+
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
7849
|
+
#endif
|
7850
|
+
|
7851
|
+
#ifdef GGML_USE_METAL
|
7852
|
+
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
7853
|
+
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
7854
|
+
}
|
7855
|
+
#endif
|
7856
|
+
|
7857
|
+
if (lctx.backend_cpu != nullptr) {
|
7858
|
+
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
7859
|
+
}
|
7860
|
+
|
7861
|
+
ggml_backend_sched_graph_compute(lctx.sched, gf);
|
7862
|
+
|
7863
|
+
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
7864
|
+
|
7865
|
+
#ifdef GGML_USE_MPI
|
7866
|
+
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
7867
|
+
#endif
|
7868
|
+
}
|
7869
|
+
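The new llama_graph_compute() gathers the backend bookkeeping (MPI pre/post hooks, Metal callback count, CPU thread count) that used to be inlined in llama_decode_internal, so the K-shift and defrag graphs introduced below can reuse it. The thread counts it receives still come from the context parameters and can be tuned at runtime; a minimal sketch with placeholder values:

    // assumed caller-side tuning: 8 threads for single-token decode, 12 for batched prompts
    llama_set_n_threads(ctx, /*n_threads*/ 8, /*n_threads_batch*/ 12);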
|
7798
7870
|
// decode a batch of tokens by evaluating the transformer
|
7799
7871
|
//
|
7800
7872
|
// - lctx: llama context
|
@@ -7821,9 +7893,9 @@ static int llama_decode_internal(
|
|
7821
7893
|
const auto n_batch = cparams.n_batch;
|
7822
7894
|
|
7823
7895
|
GGML_ASSERT(n_tokens <= n_batch);
|
7896
|
+
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
7824
7897
|
|
7825
7898
|
int n_threads = n_tokens == 1 ? cparams.n_threads : cparams.n_threads_batch;
|
7826
|
-
GGML_ASSERT((!batch.token && batch.embd) || (batch.token && !batch.embd)); // NOLINT
|
7827
7899
|
|
7828
7900
|
const int64_t t_start_us = ggml_time_us();
|
7829
7901
|
|
@@ -7872,6 +7944,8 @@ static int llama_decode_internal(
|
|
7872
7944
|
batch.seq_id = seq_id_arr.data();
|
7873
7945
|
}
|
7874
7946
|
|
7947
|
+
llama_kv_cache_update(&lctx);
|
7948
|
+
|
7875
7949
|
// if we have enough unused cells before the current head ->
|
7876
7950
|
// better to start searching from the beginning of the cache, hoping to fill it
|
7877
7951
|
if (kv_self.head > kv_self.used + 2*n_tokens) {
|
@@ -7896,8 +7970,9 @@ static int llama_decode_internal(
|
|
7896
7970
|
ggml_cgraph * gf = llama_build_graph(lctx, batch, false);
|
7897
7971
|
|
7898
7972
|
// the output is always the last tensor in the graph
|
7899
|
-
struct ggml_tensor * res
|
7973
|
+
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
7900
7974
|
struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
|
7975
|
+
|
7901
7976
|
if (strcmp(res->name, "result_output") == 0) {
|
7902
7977
|
// the embeddings could be the second to last tensor, or the third to last tensor
|
7903
7978
|
if (strcmp(embeddings->name, "result_norm") != 0) {
|
@@ -7924,40 +7999,12 @@ static int llama_decode_internal(
|
|
7924
7999
|
n_threads = std::min(4, n_threads);
|
7925
8000
|
}
|
7926
8001
|
|
7927
|
-
#ifdef GGML_USE_MPI
|
7928
|
-
const int64_t n_layer = hparams.n_layer;
|
7929
|
-
ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
|
7930
|
-
#endif
|
7931
|
-
|
7932
|
-
#ifdef GGML_USE_METAL
|
7933
|
-
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
7934
|
-
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
7935
|
-
}
|
7936
|
-
#endif
|
7937
|
-
|
7938
|
-
if (lctx.backend_cpu != nullptr) {
|
7939
|
-
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
7940
|
-
}
|
7941
|
-
|
7942
8002
|
llama_set_inputs(lctx, batch);
|
7943
8003
|
|
7944
|
-
|
7945
|
-
|
7946
|
-
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
7947
|
-
|
7948
|
-
#ifdef GGML_USE_MPI
|
7949
|
-
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
7950
|
-
#endif
|
8004
|
+
llama_graph_compute(lctx, gf, n_threads);
|
7951
8005
|
|
7952
8006
|
// update the kv ring buffer
|
7953
8007
|
{
|
7954
|
-
if (kv_self.has_shift) {
|
7955
|
-
kv_self.has_shift = false;
|
7956
|
-
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
7957
|
-
kv_self.cells[i].delta = 0;
|
7958
|
-
}
|
7959
|
-
}
|
7960
|
-
|
7961
8008
|
kv_self.head += n_tokens;
|
7962
8009
|
|
7963
8010
|
// Ensure kv cache head points to a valid index.
|
@@ -7966,6 +8013,18 @@ static int llama_decode_internal(
|
|
7966
8013
|
}
|
7967
8014
|
}
|
7968
8015
|
|
8016
|
+
// decide if we need to defrag the kv cache
|
8017
|
+
if (cparams.defrag_thold >= 0.0f) {
|
8018
|
+
const float fragmentation = kv_self.n >= 128 ? 1.0f - float(kv_self.used + n_tokens)/float(kv_self.n) : 0.0f;
|
8019
|
+
|
8020
|
+
// queue defragmentation for next llama_kv_cache_update
|
8021
|
+
if (fragmentation > cparams.defrag_thold) {
|
8022
|
+
//LLAMA_LOG_INFO("fragmentation: %.2f\n", fragmentation);
|
8023
|
+
|
8024
|
+
llama_kv_cache_defrag(kv_self);
|
8025
|
+
}
|
8026
|
+
}
|
8027
|
+
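The fragmentation estimate is simply the fraction of the currently visible KV window (kv_self.n) that is not covered by used cells, and it is only computed once the window spans at least 128 cells. A worked example with assumed numbers:

    // illustrative numbers only: a 512-cell window with 448 cells in use after this batch
    const float fragmentation = 1.0f - 448.0f/512.0f;   // = 0.125
    // with cparams.defrag_thold = 0.1f this exceeds the threshold, so a defrag is queued
    // and carried out by the next llama_kv_cache_update()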
|
7969
8028
|
#ifdef GGML_PERF
|
7970
8029
|
// print timing information per ggml operation (for debugging purposes)
|
7971
8030
|
// requires GGML_PERF to be defined
|
@@ -8053,6 +8112,245 @@ static int llama_decode_internal(
|
|
8053
8112
|
return 0;
|
8054
8113
|
}
|
8055
8114
|
|
8115
|
+
// find holes from the beginning of the KV cache and fill them by moving data from the end of the cache
|
8116
|
+
static void llama_kv_cache_defrag_internal(struct llama_context & lctx) {
|
8117
|
+
auto & kv_self = lctx.kv_self;
|
8118
|
+
|
8119
|
+
const auto & hparams = lctx.model.hparams;
|
8120
|
+
|
8121
|
+
const uint32_t n_layer = hparams.n_layer;
|
8122
|
+
|
8123
|
+
const uint32_t n_kv = llama_kv_cache_cell_max(kv_self);
|
8124
|
+
const uint32_t n_used = kv_self.used;
|
8125
|
+
|
8126
|
+
assert(n_used <= n_kv);
|
8127
|
+
|
8128
|
+
//const int64_t t_start = ggml_time_us();
|
8129
|
+
|
8130
|
+
// number of cells moved
|
8131
|
+
uint32_t n_moves = 0;
|
8132
|
+
|
8133
|
+
// determine which KV cells to move where
|
8134
|
+
//
|
8135
|
+
// cell i moves to ids[i]
|
8136
|
+
//
|
8137
|
+
// if ids[i] == i || ids[i] == n_kv, then cell i is not moved
|
8138
|
+
//
|
8139
|
+
std::vector<uint32_t> ids(n_kv, n_kv);
|
8140
|
+
|
8141
|
+
for (uint32_t i0 = 0; i0 < n_used; ++i0) {
|
8142
|
+
const auto & cell0 = kv_self.cells[i0];
|
8143
|
+
|
8144
|
+
if (!cell0.is_empty()) {
|
8145
|
+
ids[i0] = i0;
|
8146
|
+
|
8147
|
+
continue;
|
8148
|
+
}
|
8149
|
+
|
8150
|
+
// found a hole - fill it with data from the end of the cache
|
8151
|
+
|
8152
|
+
uint32_t nh = 1;
|
8153
|
+
|
8154
|
+
// determine the size of the hole
|
8155
|
+
while (i0 + nh < n_used && kv_self.cells[i0 + nh].is_empty()) {
|
8156
|
+
nh++;
|
8157
|
+
}
|
8158
|
+
|
8159
|
+
// each move requires 6*n_layer tensors (see build_defrag)
|
8160
|
+
// - source view, destination view, copy operation
|
8161
|
+
// - x2 for keys and values
|
8162
|
+
//
|
8163
|
+
if (6*(n_moves + nh)*n_layer >= LLAMA_MAX_NODES) {
|
8164
|
+
// the graph is too big, we cannot move more cells
|
8165
|
+
break;
|
8166
|
+
}
|
8167
|
+
|
8168
|
+
uint32_t nf = 0;
|
8169
|
+
uint32_t is = n_kv - 1;
|
8170
|
+
|
8171
|
+
// starting from the end, find nh non-empty cells
|
8172
|
+
for (; is > i0; --is) {
|
8173
|
+
const auto & cell1 = kv_self.cells[is];
|
8174
|
+
|
8175
|
+
if (cell1.is_empty() || ids[is] != n_kv) {
|
8176
|
+
continue;
|
8177
|
+
}
|
8178
|
+
|
8179
|
+
// non-empty cell which is not yet moved
|
8180
|
+
nf++;
|
8181
|
+
|
8182
|
+
if (nf == nh) {
|
8183
|
+
break;
|
8184
|
+
}
|
8185
|
+
}
|
8186
|
+
|
8187
|
+
// this can only happen if `n_used` is not accurate, which would be a bug
|
8188
|
+
GGML_ASSERT(nf == nh && "KV defrag bug: nf != nh");
|
8189
|
+
|
8190
|
+
nf = 0;
|
8191
|
+
|
8192
|
+
uint32_t i1 = is;
|
8193
|
+
|
8194
|
+
// are we moving a continuous block of memory?
|
8195
|
+
bool cont = false;
|
8196
|
+
|
8197
|
+
// go back and move the nf cells to the hole
|
8198
|
+
for (; i1 < n_kv; ++i1) {
|
8199
|
+
auto & cell1 = kv_self.cells[i1];
|
8200
|
+
|
8201
|
+
if (cell1.is_empty() || ids[i1] != n_kv) {
|
8202
|
+
cont = false;
|
8203
|
+
continue;
|
8204
|
+
}
|
8205
|
+
|
8206
|
+
// this cell goes to (i0 + nf)
|
8207
|
+
ids[i1] = i0 + nf;
|
8208
|
+
|
8209
|
+
// move the cell meta data
|
8210
|
+
kv_self.cells[i0 + nf] = cell1;
|
8211
|
+
|
8212
|
+
// clear the old cell and move the head there
|
8213
|
+
cell1 = llama_kv_cell();
|
8214
|
+
kv_self.head = n_used;
|
8215
|
+
|
8216
|
+
if (!cont) {
|
8217
|
+
n_moves++;
|
8218
|
+
cont = true;
|
8219
|
+
}
|
8220
|
+
|
8221
|
+
nf++;
|
8222
|
+
|
8223
|
+
if (nf == nh) {
|
8224
|
+
break;
|
8225
|
+
}
|
8226
|
+
}
|
8227
|
+
|
8228
|
+
//LLAMA_LOG_INFO("(tmp log) KV defrag: move [%u, %u) to [%u, %u)\n", is, i1 + 1, i0, i0 + nh);
|
8229
|
+
|
8230
|
+
i0 += nh - 1;
|
8231
|
+
}
|
8232
|
+
|
8233
|
+
if (n_moves == 0) {
|
8234
|
+
return;
|
8235
|
+
}
|
8236
|
+
|
8237
|
+
//LLAMA_LOG_INFO("(tmp log) KV defrag cell moves: %u\n", n_moves);
|
8238
|
+
|
8239
|
+
//LLAMA_LOG_INFO("expected gf nodes: %u\n", 6*n_moves*n_layer);
|
8240
|
+
|
8241
|
+
#if 0
|
8242
|
+
// CPU defrag
|
8243
|
+
//
|
8244
|
+
// TODO: optimizations are possible:
|
8245
|
+
// - multiple threads
|
8246
|
+
// - avoid copying to the host memory when already there
|
8247
|
+
//
|
8248
|
+
// likely not worth the effort, as we have ggml_graph based defrag
|
8249
|
+
//
|
8250
|
+
|
8251
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
8252
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
8253
|
+
|
8254
|
+
const uint32_t kv_size = kv_self.size;
|
8255
|
+
|
8256
|
+
std::vector<uint8_t> buf_k;
|
8257
|
+
std::vector<uint8_t> buf_v;
|
8258
|
+
|
8259
|
+
for (uint32_t il = 0; il < n_layer; ++il) {
|
8260
|
+
const size_t k_size_row = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa);
|
8261
|
+
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_size);
|
8262
|
+
|
8263
|
+
const size_t v_size_el = ggml_type_size(kv_self.v_l[il]->type);
|
8264
|
+
const size_t v_size = ggml_row_size (kv_self.v_l[il]->type, n_embd_v_gqa*kv_size);
|
8265
|
+
|
8266
|
+
buf_k.resize(k_size);
|
8267
|
+
buf_v.resize(v_size);
|
8268
|
+
|
8269
|
+
ggml_backend_tensor_get(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
|
8270
|
+
ggml_backend_tensor_get(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
|
8271
|
+
|
8272
|
+
// batch move [i, i+nm) to [id, id+nm)
|
8273
|
+
// note: cells can move only to a lower index
|
8274
|
+
for (uint32_t i = 0; i < n_kv; ++i) {
|
8275
|
+
const uint32_t id = ids[i];
|
8276
|
+
|
8277
|
+
if (i == id || id == n_kv) {
|
8278
|
+
continue;
|
8279
|
+
}
|
8280
|
+
|
8281
|
+
uint32_t nm = 1;
|
8282
|
+
|
8283
|
+
while (i + nm < n_kv && ids[i + nm] == id + nm) {
|
8284
|
+
nm++;
|
8285
|
+
}
|
8286
|
+
|
8287
|
+
// move keys
|
8288
|
+
{
|
8289
|
+
const int64_t os = i*k_size_row;
|
8290
|
+
const int64_t od = id*k_size_row;
|
8291
|
+
|
8292
|
+
memcpy(buf_k.data() + od, buf_k.data() + os, nm*k_size_row);
|
8293
|
+
}
|
8294
|
+
|
8295
|
+
// move values (note: they are transposed)
|
8296
|
+
{
|
8297
|
+
const int64_t os = i;
|
8298
|
+
const int64_t od = id;
|
8299
|
+
|
8300
|
+
for (uint32_t j = 0; j < n_embd_v_gqa; ++j) {
|
8301
|
+
memcpy(buf_v.data() + (od + j*kv_size)*v_size_el, buf_v.data() + (os + j*kv_size)*v_size_el, nm*v_size_el);
|
8302
|
+
}
|
8303
|
+
}
|
8304
|
+
|
8305
|
+
i += nm - 1;
|
8306
|
+
}
|
8307
|
+
|
8308
|
+
ggml_backend_tensor_set(kv_self.k_l[il], buf_k.data(), 0, buf_k.size());
|
8309
|
+
ggml_backend_tensor_set(kv_self.v_l[il], buf_v.data(), 0, buf_v.size());
|
8310
|
+
}
|
8311
|
+
#else
|
8312
|
+
// ggml_graph defrag
|
8313
|
+
|
8314
|
+
ggml_cgraph * gf = llama_build_graph_defrag(lctx, ids);
|
8315
|
+
|
8316
|
+
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
|
8317
|
+
#endif
|
8318
|
+
|
8319
|
+
//const int64_t t_end = ggml_time_us();
|
8320
|
+
|
8321
|
+
//LLAMA_LOG_INFO("(tmp log) KV defrag time: %.3f ms\n", (t_end - t_start)/1000.0);
|
8322
|
+
}
|
8323
|
+
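llama_kv_cache_defrag_internal() first plans every move (ids[i] is the destination of cell i) by pulling non-empty cells from the end of the cache into holes, batching contiguous runs so the resulting ggml graph stays below LLAMA_MAX_NODES, and only then moves the data, either on the CPU or through llama_build_graph_defrag. Below is a deliberately simplified, self-contained sketch of the planning step; it ignores the run batching and the node limit, and may order cells within a hole differently than the code above:

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        // 1 = occupied KV cell, 0 = hole
        std::vector<int> used = {1, 0, 0, 1, 1, 0, 1, 1};
        const uint32_t n_kv = (uint32_t) used.size();

        uint32_t n_used = 0;
        for (int u : used) n_used += u;

        std::vector<uint32_t> ids(n_kv, n_kv); // ids[i] == n_kv -> cell i does not move

        for (uint32_t i0 = 0; i0 < n_used; ++i0) {
            if (used[i0]) { ids[i0] = i0; continue; }

            // measure the hole starting at i0 (holes at or past n_used never need filling)
            uint32_t nh = 1;
            while (i0 + nh < n_used && !used[i0 + nh]) nh++;

            // pull nh occupied, not-yet-moved cells from the end of the cache into the hole
            uint32_t nf = 0;
            for (uint32_t is = n_kv - 1; is > i0 && nf < nh; --is) {
                if (!used[is] || ids[is] != n_kv) continue;
                ids[is]       = i0 + nf;
                used[i0 + nf] = 1;
                used[is]      = 0;
                nf++;
            }

            i0 += nh - 1;
        }

        for (uint32_t i = 0; i < n_kv; ++i) {
            if (ids[i] != n_kv && ids[i] != i) printf("cell %u -> cell %u\n", i, ids[i]);
        }
        return 0;
    }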
|
8324
|
+
static void llama_kv_cache_update_internal(struct llama_context & lctx) {
|
8325
|
+
// apply K-shift if needed
|
8326
|
+
if (lctx.model.hparams.rope_type != LLAMA_ROPE_TYPE_NONE && lctx.kv_self.has_shift) {
|
8327
|
+
llama_set_k_shift(lctx);
|
8328
|
+
|
8329
|
+
{
|
8330
|
+
ggml_cgraph * gf = llama_build_graph_k_shift(lctx);
|
8331
|
+
|
8332
|
+
llama_graph_compute(lctx, gf, lctx.cparams.n_threads);
|
8333
|
+
}
|
8334
|
+
|
8335
|
+
{
|
8336
|
+
auto & kv_self = lctx.kv_self;
|
8337
|
+
|
8338
|
+
kv_self.has_shift = false;
|
8339
|
+
|
8340
|
+
for (uint32_t i = 0; i < kv_self.size; ++i) {
|
8341
|
+
kv_self.cells[i].delta = 0;
|
8342
|
+
}
|
8343
|
+
}
|
8344
|
+
}
|
8345
|
+
|
8346
|
+
// defragment the KV cache if needed
|
8347
|
+
if (lctx.kv_self.do_defrag) {
|
8348
|
+
llama_kv_cache_defrag_internal(lctx);
|
8349
|
+
|
8350
|
+
lctx.kv_self.do_defrag = false;
|
8351
|
+
}
|
8352
|
+
}
|
8353
|
+
|
8056
8354
|
//
|
8057
8355
|
// tokenizer
|
8058
8356
|
//
|
@@ -8644,37 +8942,46 @@ struct llm_tokenizer_wpm {
|
|
8644
8942
|
}
|
8645
8943
|
|
8646
8944
|
std::vector<std::string> preprocess(const std::string & text) {
|
8647
|
-
|
8648
|
-
|
8945
|
+
// normalization form D
|
8946
|
+
std::vector<uint32_t> codepoints = codepoints_from_utf8(text);
|
8947
|
+
std::vector<uint32_t> nfd_codepoints;
|
8948
|
+
for (uint32_t code : codepoints) {
|
8949
|
+
auto it = nfd_map.equal_range(code);
|
8950
|
+
if (it.first != it.second) {
|
8951
|
+
for (auto jt = it.first; jt != it.second; jt++) {
|
8952
|
+
nfd_codepoints.push_back(jt->second);
|
8953
|
+
}
|
8954
|
+
} else {
|
8955
|
+
nfd_codepoints.push_back(code);
|
8956
|
+
}
|
8957
|
+
}
|
8649
8958
|
|
8650
|
-
//
|
8651
|
-
//
|
8652
|
-
std::vector<std::string> words;
|
8959
|
+
// strip accents, strip control, uniformize whitespace,
|
8960
|
+
// to lowercase, pad chinese characters, pad punctuation
|
8653
8961
|
std::string new_str = "";
|
8654
|
-
|
8655
|
-
|
8656
|
-
|
8657
|
-
|
8658
|
-
new_str += " ";
|
8659
|
-
new_str += ori_str[i];
|
8660
|
-
new_str += " ";
|
8661
|
-
i += 1;
|
8962
|
+
for (uint32_t code : nfd_codepoints) {
|
8963
|
+
int type = codepoint_type(code);
|
8964
|
+
if (type == CODEPOINT_TYPE_ACCENT_MARK || type == CODEPOINT_TYPE_CONTROL) {
|
8965
|
+
continue;
|
8662
8966
|
}
|
8663
|
-
|
8967
|
+
code = to_lower(code);
|
8968
|
+
if (type == CODEPOINT_TYPE_WHITESPACE) {
|
8969
|
+
code = ' ';
|
8970
|
+
}
|
8971
|
+
std::string s = codepoint_to_utf8(code);
|
8972
|
+
if (type == CODEPOINT_TYPE_PUNCTUATION || is_ascii_punct(code) || is_chinese_char(code)) {
|
8664
8973
|
new_str += " ";
|
8665
|
-
new_str +=
|
8974
|
+
new_str += s;
|
8666
8975
|
new_str += " ";
|
8667
|
-
|
8668
|
-
|
8669
|
-
else {
|
8670
|
-
new_str += ori_str[i];
|
8671
|
-
i += 1;
|
8976
|
+
} else {
|
8977
|
+
new_str += s;
|
8672
8978
|
}
|
8673
8979
|
}
|
8674
8980
|
|
8675
8981
|
// split by whitespace
|
8676
8982
|
uint64_t l = 0;
|
8677
8983
|
uint64_t r = 0;
|
8984
|
+
std::vector<std::string> words;
|
8678
8985
|
while (r < new_str.size()) {
|
8679
8986
|
// if is whitespace
|
8680
8987
|
if (isspace(new_str[r])) {
|
@@ -8692,47 +8999,21 @@ struct llm_tokenizer_wpm {
|
|
8692
8999
|
return words;
|
8693
9000
|
}
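The rewritten preprocess() replaces the old hand-rolled accent table with a real NFD pass driven by nfd_map from the vendored unicode.h, then classifies each decomposed codepoint: accent marks and control characters are dropped, whitespace collapses to a single space, everything is lowercased, and punctuation as well as CJK characters get padded with spaces. A self-contained illustration of the NFD step, using a tiny stand-in table rather than the real nfd_map:

    #include <cstdint>
    #include <cstdio>
    #include <map>
    #include <vector>

    int main() {
        // stand-in for nfd_map: U+00E9 ("é") decomposes to U+0065 ("e") + U+0301 (combining acute)
        std::multimap<uint32_t, uint32_t> nfd = {{0x00E9, 0x0065}, {0x00E9, 0x0301}};

        std::vector<uint32_t> in = {0x0043, 0x00E9, 0x0064};   // "Céd"
        std::vector<uint32_t> out;
        for (uint32_t cp : in) {
            auto range = nfd.equal_range(cp);
            if (range.first != range.second) {
                for (auto it = range.first; it != range.second; ++it) out.push_back(it->second);
            } else {
                out.push_back(cp);
            }
        }
        for (uint32_t cp : out) printf("U+%04X ", cp);   // U+0043 U+0065 U+0301 U+0064
        printf("\n");                                    // the accent mark is then dropped by codepoint_type()
        return 0;
    }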
|
8694
9001
|
|
8695
|
-
|
8696
|
-
|
8697
|
-
|
8698
|
-
|
8699
|
-
|
8700
|
-
if (c >= 'A' && c <= 'Z') {
|
8701
|
-
text2[i] = c - 'A' + 'a';
|
8702
|
-
}
|
9002
|
+
uint32_t to_lower(uint32_t code) {
|
9003
|
+
static const std::locale locale("en_US.UTF-8");
|
9004
|
+
#if defined(_WIN32)
|
9005
|
+
if (code > 0xFFFF) {
|
9006
|
+
return code;
|
8703
9007
|
}
|
8704
|
-
|
9008
|
+
#endif
|
9009
|
+
return std::tolower(wchar_t(code), locale);
|
8705
9010
|
}
|
8706
9011
|
|
8707
|
-
bool
|
8708
|
-
|
8709
|
-
|
8710
|
-
|
8711
|
-
|
8712
|
-
unsigned char ch = static_cast<unsigned char>(str[i]);
|
8713
|
-
if (ch <= 0x7f) {
|
8714
|
-
codepoint = ch;
|
8715
|
-
num_bytes = 1;
|
8716
|
-
} else if ((ch >> 5) == 0x06) {
|
8717
|
-
codepoint = ch & 0x1f;
|
8718
|
-
num_bytes = 2;
|
8719
|
-
} else if ((ch >> 4) == 0x0e) {
|
8720
|
-
codepoint = ch & 0x0f;
|
8721
|
-
num_bytes = 3;
|
8722
|
-
} else if ((ch >> 3) == 0x1e) {
|
8723
|
-
codepoint = ch & 0x07;
|
8724
|
-
num_bytes = 4;
|
8725
|
-
}
|
8726
|
-
for (int j = 1; j < num_bytes; ++j) {
|
8727
|
-
if (i + j >= len) {
|
8728
|
-
return false; // incomplete UTF-8 character
|
8729
|
-
}
|
8730
|
-
unsigned char next_ch = static_cast<unsigned char>(str[i + j]);
|
8731
|
-
if ((next_ch >> 6) != 0x02) {
|
8732
|
-
return false; // invalid trailing byte
|
8733
|
-
}
|
8734
|
-
codepoint = (codepoint << 6) | (next_ch & 0x3f);
|
8735
|
-
}
|
9012
|
+
bool is_ascii_punct(uint32_t code) {
|
9013
|
+
return code < 256 && ispunct(code);
|
9014
|
+
}
|
9015
|
+
|
9016
|
+
bool is_chinese_char(uint32_t codepoint) {
|
8736
9017
|
if ((codepoint >= 0x4E00 && codepoint <= 0x9FFF) ||
|
8737
9018
|
(codepoint >= 0x3400 && codepoint <= 0x4DBF) ||
|
8738
9019
|
(codepoint >= 0x20000 && codepoint <= 0x2A6DF) ||
|
@@ -8748,41 +9029,6 @@ struct llm_tokenizer_wpm {
|
|
8748
9029
|
return false;
|
8749
9030
|
}
|
8750
9031
|
|
8751
|
-
std::string strip_accents(const std::string & input_string) {
|
8752
|
-
std::string resultString;
|
8753
|
-
std::map<std::string, char> accent_map = {
|
8754
|
-
{"À", 'A'}, {"Á", 'A'}, {"Â", 'A'}, {"Ã", 'A'}, {"Ä", 'A'}, {"Å", 'A'},
|
8755
|
-
{"à", 'a'}, {"á", 'a'}, {"â", 'a'}, {"ã", 'a'}, {"ä", 'a'}, {"å", 'a'},
|
8756
|
-
{"È", 'E'}, {"É", 'E'}, {"Ê", 'E'}, {"Ë", 'E'}, {"è", 'e'}, {"é", 'e'},
|
8757
|
-
{"ê", 'e'}, {"ë", 'e'}, {"Ì", 'I'}, {"Í", 'I'}, {"Î", 'I'}, {"Ï", 'I'},
|
8758
|
-
{"ì", 'i'}, {"í", 'i'}, {"î", 'i'}, {"ï", 'i'}, {"Ò", 'O'}, {"Ó", 'O'},
|
8759
|
-
{"Ô", 'O'}, {"Õ", 'O'}, {"Ö", 'O'}, {"ò", 'o'}, {"ó", 'o'}, {"ô", 'o'},
|
8760
|
-
{"õ", 'o'}, {"ö", 'o'}, {"Ù", 'U'}, {"Ú", 'U'}, {"Û", 'U'}, {"Ü", 'U'},
|
8761
|
-
{"ù", 'u'}, {"ú", 'u'}, {"û", 'u'}, {"ü", 'u'}, {"Ý", 'Y'}, {"ý", 'y'},
|
8762
|
-
{"Ç", 'C'}, {"ç", 'c'}, {"Ñ", 'N'}, {"ñ", 'n'},
|
8763
|
-
};
|
8764
|
-
|
8765
|
-
for (size_t i = 0; i < input_string.length();) {
|
8766
|
-
int len = utf8_len(input_string[i]);
|
8767
|
-
std::string curChar = input_string.substr(i, len);
|
8768
|
-
auto iter = accent_map.find(curChar);
|
8769
|
-
if (iter != accent_map.end()) {
|
8770
|
-
resultString += iter->second;
|
8771
|
-
} else {
|
8772
|
-
resultString += curChar;
|
8773
|
-
}
|
8774
|
-
i += len;
|
8775
|
-
}
|
8776
|
-
|
8777
|
-
return resultString;
|
8778
|
-
}
|
8779
|
-
|
8780
|
-
static size_t utf8_len(char src) {
|
8781
|
-
const size_t lookup[] = {1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4};
|
8782
|
-
uint8_t highbits = static_cast<uint8_t>(src) >> 4;
|
8783
|
-
return lookup[highbits];
|
8784
|
-
}
|
8785
|
-
|
8786
9032
|
const llama_vocab & vocab;
|
8787
9033
|
};
|
8788
9034
|
|
@@ -9816,10 +10062,6 @@ void llama_sample_temp(struct llama_context * ctx, llama_token_data_array * cand
|
|
9816
10062
|
}
|
9817
10063
|
}
|
9818
10064
|
|
9819
|
-
void llama_sample_temperature(struct llama_context * ctx, llama_token_data_array * candidates_p, float temp) {
|
9820
|
-
llama_sample_temp(ctx, candidates_p, temp);
|
9821
|
-
}
|
9822
|
-
|
9823
10065
|
void llama_sample_repetition_penalties(
|
9824
10066
|
struct llama_context * ctx,
|
9825
10067
|
llama_token_data_array * candidates,
|
@@ -9946,38 +10188,6 @@ void llama_sample_apply_guidance(
|
|
9946
10188
|
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
9947
10189
|
}
|
9948
10190
|
|
9949
|
-
void llama_sample_classifier_free_guidance(
|
9950
|
-
struct llama_context * ctx,
|
9951
|
-
llama_token_data_array * candidates,
|
9952
|
-
struct llama_context * guidance_ctx,
|
9953
|
-
float scale) {
|
9954
|
-
GGML_ASSERT(ctx);
|
9955
|
-
int64_t t_start_sample_us;
|
9956
|
-
|
9957
|
-
t_start_sample_us = ggml_time_us();
|
9958
|
-
const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
|
9959
|
-
|
9960
|
-
GGML_ASSERT(n_vocab == candidates->size);
|
9961
|
-
GGML_ASSERT(!candidates->sorted);
|
9962
|
-
|
9963
|
-
std::vector<float> logits_base(n_vocab);
|
9964
|
-
for (size_t i = 0; i < n_vocab; ++i) {
|
9965
|
-
logits_base[i] = candidates->data[i].logit;
|
9966
|
-
}
|
9967
|
-
|
9968
|
-
float * logits_guidance = llama_get_logits(guidance_ctx);
|
9969
|
-
|
9970
|
-
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
9971
|
-
llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
|
9972
|
-
t_start_sample_us = ggml_time_us();
|
9973
|
-
|
9974
|
-
for (size_t i = 0; i < n_vocab; ++i) {
|
9975
|
-
candidates->data[i].logit = logits_base[i];
|
9976
|
-
}
|
9977
|
-
|
9978
|
-
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
9979
|
-
}
|
9980
|
-
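Both removed samplers were thin wrappers over functions that stay in the API, so callers can migrate mechanically; the one semantic difference is that llama_sample_apply_guidance operates on raw logit arrays instead of a llama_token_data_array. A hedged migration sketch, where candidates, logits_base and logits_guidance are assumed caller-side variables:

    llama_sample_temp(ctx, &candidates, 0.8f);                              // was llama_sample_temperature()
    llama_sample_apply_guidance(ctx, logits_base, logits_guidance, 1.5f);   // was llama_sample_classifier_free_guidance()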
|
9981
10191
|
llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
|
9982
10192
|
GGML_ASSERT(ctx);
|
9983
10193
|
|
@@ -10508,31 +10718,47 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
10508
10718
|
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
10509
10719
|
new_type = GGML_TYPE_Q8_0;
|
10510
10720
|
}
|
10511
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype ==
|
10721
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
10722
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
10512
10723
|
new_type = GGML_TYPE_Q5_K;
|
10513
10724
|
}
|
10514
10725
|
else if (new_type != GGML_TYPE_Q8_0) {
|
10515
10726
|
new_type = GGML_TYPE_Q6_K;
|
10516
10727
|
}
|
10517
10728
|
} else if (name == "token_embd.weight") {
|
10518
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
|
10729
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS ||
|
10730
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
|
10519
10731
|
new_type = GGML_TYPE_Q2_K;
|
10520
10732
|
}
|
10733
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
10734
|
+
new_type = GGML_TYPE_IQ3_S;
|
10735
|
+
}
|
10521
10736
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
10522
|
-
new_type =
|
10737
|
+
new_type = GGML_TYPE_IQ3_S;
|
10523
10738
|
}
|
10524
|
-
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S
|
10739
|
+
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
|
10740
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) {
|
10525
10741
|
if (name.find("attn_v.weight") != std::string::npos) {
|
10526
10742
|
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
|
10527
|
-
else new_type = GGML_TYPE_Q2_K;
|
10743
|
+
else new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
|
10528
10744
|
++qs.i_attention_wv;
|
10529
10745
|
}
|
10746
|
+
else if (qs.model.hparams.n_expert == 8 && name.find("attn_k.weight") != std::string::npos) {
|
10747
|
+
new_type = GGML_TYPE_Q4_K;
|
10748
|
+
}
|
10530
10749
|
else if (name.find("ffn_down") != std::string::npos) {
|
10531
|
-
if (qs.i_ffn_down < qs.n_ffn_down/8)
|
10750
|
+
if (qs.i_ffn_down < qs.n_ffn_down/8) {
|
10751
|
+
new_type = ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M ? GGML_TYPE_IQ3_S : GGML_TYPE_Q2_K;
|
10752
|
+
}
|
10532
10753
|
++qs.i_ffn_down;
|
10533
10754
|
}
|
10534
10755
|
else if (name.find("attn_output.weight") != std::string::npos) {
|
10535
|
-
if (
|
10756
|
+
if (qs.model.hparams.n_expert == 8) {
|
10757
|
+
new_type = GGML_TYPE_Q5_K;
|
10758
|
+
} else {
|
10759
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) new_type = GGML_TYPE_IQ2_XXS;
|
10760
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || ftype == LLAMA_FTYPE_MOSTLY_IQ2_M) new_type = GGML_TYPE_IQ3_S;
|
10761
|
+
}
|
10536
10762
|
}
|
10537
10763
|
} else if (name.find("attn_v.weight") != std::string::npos) {
|
10538
10764
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) {
|
@@ -10542,13 +10768,25 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
10542
10768
|
new_type = GGML_TYPE_Q4_K;
|
10543
10769
|
}
|
10544
10770
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
10545
|
-
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ?
|
10771
|
+
new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
|
10772
|
+
}
|
10773
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
|
10774
|
+
new_type = GGML_TYPE_Q4_K;
|
10775
|
+
}
|
10776
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
10777
|
+
new_type = GGML_TYPE_Q4_K;
|
10778
|
+
}
|
10779
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
|
10780
|
+
new_type = GGML_TYPE_Q4_K;
|
10781
|
+
}
|
10782
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
10783
|
+
new_type = GGML_TYPE_Q4_K;
|
10546
10784
|
}
|
10547
10785
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
10548
10786
|
new_type = qs.i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
10549
10787
|
}
|
10550
10788
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
10551
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && qs.model.hparams.n_gqa() >= 4) {
|
10789
|
+
else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && qs.model.hparams.n_gqa() >= 4) {
|
10552
10790
|
new_type = GGML_TYPE_Q5_K;
|
10553
10791
|
}
|
10554
10792
|
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
|
@@ -10574,14 +10812,24 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
10574
10812
|
// TODO: explore better strategies
|
10575
10813
|
new_type = GGML_TYPE_Q8_0;
|
10576
10814
|
}
|
10577
|
-
else if (ftype ==
|
10578
|
-
new_type =
|
10815
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
|
10816
|
+
new_type = GGML_TYPE_IQ3_XXS;
|
10817
|
+
}
|
10818
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
10819
|
+
new_type = GGML_TYPE_IQ2_S;
|
10820
|
+
}
|
10821
|
+
} else if (name.find("attn_q.weight") != std::string::npos) {
|
10822
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS) {
|
10823
|
+
new_type = GGML_TYPE_IQ3_XXS;
|
10824
|
+
}
|
10825
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
|
10826
|
+
new_type = GGML_TYPE_IQ2_S;
|
10579
10827
|
}
|
10580
10828
|
} else if (name.find("ffn_down") != std::string::npos) {
|
10581
10829
|
auto info = layer_info(qs.i_ffn_down, qs.n_ffn_down, name.c_str());
|
10582
10830
|
int i_layer = info.first, n_layer = info.second;
|
10583
10831
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
10584
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S
|
10832
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
|
10585
10833
|
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
|
10586
10834
|
}
|
10587
10835
|
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS && !qs.has_imatrix) {
|
@@ -10592,6 +10840,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
10592
10840
|
: arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
|
10593
10841
|
: GGML_TYPE_Q3_K;
|
10594
10842
|
}
|
10843
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M && (i_layer < n_layer/8 ||
|
10844
|
+
(qs.model.hparams.n_expert == 8 && use_more_bits(i_layer, n_layer)))) {
|
10845
|
+
new_type = GGML_TYPE_Q4_K;
|
10846
|
+
}
|
10595
10847
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
|
10596
10848
|
new_type = arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
|
10597
10849
|
}
|
@@ -10603,8 +10855,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
10603
10855
|
if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
|
10604
10856
|
}
|
10605
10857
|
}
|
10606
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL && !qs.has_imatrix) {
|
10607
|
-
|
10858
|
+
else if (i_layer < n_layer/8 && (ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) && !qs.has_imatrix) {
|
10859
|
+
new_type = GGML_TYPE_Q5_K;
|
10608
10860
|
}
|
10609
10861
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
|
10610
10862
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
|
@@ -10621,39 +10873,43 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
10621
10873
|
} else if (name.find("attn_output.weight") != std::string::npos) {
|
10622
10874
|
if (arch != LLM_ARCH_FALCON) {
|
10623
10875
|
if (qs.model.hparams.n_expert == 8) {
|
10624
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype ==
|
10876
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS ||
|
10625
10877
|
ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_NL ||
|
10626
|
-
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M
|
10878
|
+
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S ||
|
10879
|
+
ftype == LLAMA_FTYPE_MOSTLY_IQ3_M || ftype == LLAMA_FTYPE_MOSTLY_IQ4_XS) {
|
10627
10880
|
new_type = GGML_TYPE_Q5_K;
|
10628
10881
|
}
|
10629
10882
|
} else {
|
10630
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K
|
10631
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type =
|
10632
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
10633
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
10883
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
10884
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) new_type = GGML_TYPE_IQ3_S;
|
10885
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ) new_type = GGML_TYPE_Q4_K;
|
10886
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L ) new_type = GGML_TYPE_Q5_K;
|
10887
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M ) new_type = GGML_TYPE_Q4_K;
|
10634
10888
|
}
|
10635
10889
|
} else {
|
10636
10890
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
|
10637
10891
|
}
|
10638
10892
|
}
|
10639
10893
|
else if (name.find("attn_qkv.weight") != std::string::npos) {
|
10640
|
-
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L
|
10894
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L || ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
|
10895
|
+
new_type = GGML_TYPE_Q4_K;
|
10896
|
+
}
|
10641
10897
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
|
10642
10898
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
|
10643
10899
|
}
|
10644
10900
|
else if (name.find("ffn_gate") != std::string::npos) {
|
10645
10901
|
auto info = layer_info(qs.i_ffn_gate, qs.n_ffn_gate, name.c_str());
|
10646
10902
|
int i_layer = info.first, n_layer = info.second;
|
10647
|
-
if (ftype ==
|
10648
|
-
new_type =
|
10903
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
|
10904
|
+
new_type = GGML_TYPE_IQ3_XXS;
|
10649
10905
|
}
|
10650
10906
|
++qs.i_ffn_gate;
|
10651
10907
|
}
|
10652
10908
|
else if (name.find("ffn_up") != std::string::npos) {
|
10653
10909
|
auto info = layer_info(qs.i_ffn_up, qs.n_ffn_up, name.c_str());
|
10654
10910
|
int i_layer = info.first, n_layer = info.second;
|
10655
|
-
if (ftype ==
|
10656
|
-
new_type =
|
10911
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS && (i_layer >= n_layer/8 && i_layer < 7*n_layer/8)) {
|
10912
|
+
new_type = GGML_TYPE_IQ3_XXS;
|
10657
10913
|
}
|
10658
10914
|
++qs.i_ffn_up;
|
10659
10915
|
}
|
@@ -10671,9 +10927,9 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
10671
10927
|
//}
|
10672
10928
|
bool convert_incompatible_tensor = false;
|
10673
10929
|
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
10674
|
-
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
|
10675
|
-
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS ||
|
10676
|
-
new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S) {
|
10930
|
+
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K || new_type == GGML_TYPE_IQ4_XS ||
|
10931
|
+
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_S ||
|
10932
|
+
new_type == GGML_TYPE_IQ3_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || new_type == GGML_TYPE_IQ3_S) {
|
10677
10933
|
int nx = tensor->ne[0];
|
10678
10934
|
int ny = tensor->ne[1];
|
10679
10935
|
if (nx % QK_K != 0) {
|
@@ -10687,13 +10943,16 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
10687
10943
|
switch (new_type) {
|
10688
10944
|
case GGML_TYPE_IQ2_XXS:
|
10689
10945
|
case GGML_TYPE_IQ2_XS:
|
10946
|
+
case GGML_TYPE_IQ2_S:
|
10690
10947
|
case GGML_TYPE_IQ3_XXS:
|
10948
|
+
case GGML_TYPE_IQ3_S:
|
10691
10949
|
case GGML_TYPE_IQ1_S:
|
10692
10950
|
case GGML_TYPE_Q2_K:
|
10693
|
-
case GGML_TYPE_Q3_K:
|
10694
|
-
case
|
10695
|
-
case
|
10696
|
-
case
|
10951
|
+
case GGML_TYPE_Q3_K:
|
10952
|
+
case GGML_TYPE_IQ4_XS: new_type = GGML_TYPE_IQ4_NL; break;
|
10953
|
+
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
|
10954
|
+
case GGML_TYPE_Q5_K: new_type = GGML_TYPE_Q5_1; break;
|
10955
|
+
case GGML_TYPE_Q6_K: new_type = GGML_TYPE_Q8_0; break;
|
10697
10956
|
default: throw std::runtime_error("\nUnsupported tensor size encountered\n");
|
10698
10957
|
}
|
10699
10958
|
LLAMA_LOG_WARN(" - using fallback quantization %s\n", ggml_type_name(new_type));
|
@@ -10719,7 +10978,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
10719
10978
|
// K-quants
|
10720
10979
|
case LLAMA_FTYPE_MOSTLY_Q2_K_S:
|
10721
10980
|
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
|
10722
|
-
case
|
10981
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_XS: quantized_type = GGML_TYPE_IQ3_S; break;
|
10723
10982
|
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
10724
10983
|
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
10725
10984
|
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
|
@@ -10730,9 +10989,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
10730
10989
|
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
10731
10990
|
case LLAMA_FTYPE_MOSTLY_IQ2_XXS: quantized_type = GGML_TYPE_IQ2_XXS; break;
|
10732
10991
|
case LLAMA_FTYPE_MOSTLY_IQ2_XS: quantized_type = GGML_TYPE_IQ2_XS; break;
|
10992
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_S: quantized_type = GGML_TYPE_IQ2_XS; break;
|
10993
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_M: quantized_type = GGML_TYPE_IQ2_S; break;
|
10733
10994
|
case LLAMA_FTYPE_MOSTLY_IQ3_XXS: quantized_type = GGML_TYPE_IQ3_XXS; break;
|
10734
10995
|
case LLAMA_FTYPE_MOSTLY_IQ1_S: quantized_type = GGML_TYPE_IQ1_S; break;
|
10735
10996
|
case LLAMA_FTYPE_MOSTLY_IQ4_NL: quantized_type = GGML_TYPE_IQ4_NL; break;
|
10997
|
+
case LLAMA_FTYPE_MOSTLY_IQ4_XS: quantized_type = GGML_TYPE_IQ4_XS; break;
|
10998
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_S: quantized_type = GGML_TYPE_IQ3_S; break;
|
10999
|
+
case LLAMA_FTYPE_MOSTLY_IQ3_M: quantized_type = GGML_TYPE_IQ3_S; break;
|
10736
11000
|
|
10737
11001
|
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
10738
11002
|
}
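The switch above maps the new file types onto the new ggml tensor types: IQ2_S and IQ2_M reuse GGML_TYPE_IQ2_XS and GGML_TYPE_IQ2_S respectively, while both IQ3_S and IQ3_M are stored as GGML_TYPE_IQ3_S. A hedged usage sketch for requesting one of them through the public quantize entry point (file names are placeholders):

    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_IQ3_M;   // stored as GGML_TYPE_IQ3_S per the switch above
    llama_model_quantize("model-f16.gguf", "model-iq3_m.gguf", &qparams);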
|
@@ -10862,7 +11126,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
10862
11126
|
quantize &= !params->only_copy;
|
10863
11127
|
|
10864
11128
|
// do not quantize expert gating tensors
|
10865
|
-
|
11129
|
+
// NOTE: can't use LLM_TN here because the layer number is not known
|
11130
|
+
quantize &= name.find("ffn_gate_inp.weight") == std::string::npos;
|
10866
11131
|
|
10867
11132
|
// do not quantize positional embeddings and token types (BERT)
|
10868
11133
|
quantize &= name != LLM_TN(model.arch)(LLM_TENSOR_POS_EMBD, "weight");
|
@@ -10906,6 +11171,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
10906
11171
|
}
|
10907
11172
|
if ((new_type == GGML_TYPE_IQ2_XXS ||
|
10908
11173
|
new_type == GGML_TYPE_IQ2_XS ||
|
11174
|
+
new_type == GGML_TYPE_IQ2_S ||
|
10909
11175
|
new_type == GGML_TYPE_IQ1_S ||
|
10910
11176
|
(new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
|
10911
11177
|
LLAMA_LOG_ERROR("\n\n============================================================\n");
|
@@ -11327,7 +11593,7 @@ static int llama_apply_lora_from_file_internal(
|
|
11327
11593
|
struct llama_model_params llama_model_default_params() {
|
11328
11594
|
struct llama_model_params result = {
|
11329
11595
|
/*.n_gpu_layers =*/ 0,
|
11330
|
-
/*.split_mode =*/
|
11596
|
+
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
|
11331
11597
|
/*.main_gpu =*/ 0,
|
11332
11598
|
/*.tensor_split =*/ nullptr,
|
11333
11599
|
/*.progress_callback =*/ nullptr,
|
@@ -11353,7 +11619,7 @@ struct llama_context_params llama_context_default_params() {
|
|
11353
11619
|
/*.n_batch =*/ 512,
|
11354
11620
|
/*.n_threads =*/ GGML_DEFAULT_N_THREADS, // TODO: better default
|
11355
11621
|
/*.n_threads_batch =*/ GGML_DEFAULT_N_THREADS,
|
11356
|
-
/*.rope_scaling_type =*/
|
11622
|
+
/*.rope_scaling_type =*/ LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED,
|
11357
11623
|
/*.rope_freq_base =*/ 0.0f,
|
11358
11624
|
/*.rope_freq_scale =*/ 0.0f,
|
11359
11625
|
/*.yarn_ext_factor =*/ -1.0f,
|
@@ -11361,11 +11627,11 @@ struct llama_context_params llama_context_default_params() {
|
|
11361
11627
|
/*.yarn_beta_fast =*/ 32.0f,
|
11362
11628
|
/*.yarn_beta_slow =*/ 1.0f,
|
11363
11629
|
/*.yarn_orig_ctx =*/ 0,
|
11630
|
+
/*.defrag_thold =*/ -1.0f,
|
11364
11631
|
/*.cb_eval =*/ nullptr,
|
11365
11632
|
/*.cb_eval_user_data =*/ nullptr,
|
11366
11633
|
/*.type_k =*/ GGML_TYPE_F16,
|
11367
11634
|
/*.type_v =*/ GGML_TYPE_F16,
|
11368
|
-
/*.mul_mat_q =*/ true,
|
11369
11635
|
/*.logits_all =*/ false,
|
11370
11636
|
/*.embedding =*/ false,
|
11371
11637
|
/*.offload_kqv =*/ true,
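Besides the renamed enum defaults, the context parameters gain defrag_thold (default -1.0f, i.e. automatic defragmentation disabled) and drop the obsolete mul_mat_q flag. A sketch of opting in to automatic KV-cache defragmentation, assuming model is an already-loaded llama_model:

    llama_context_params cparams = llama_context_default_params();
    cparams.defrag_thold = 0.1f;   // defrag once more than ~10% of the used KV window is holes
    llama_context * ctx = llama_new_context_with_model(model, cparams);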
|
@@ -11421,15 +11687,6 @@ bool llama_supports_gpu_offload(void) {
|
|
11421
11687
|
#endif
|
11422
11688
|
}
|
11423
11689
|
|
11424
|
-
// deprecated:
|
11425
|
-
bool llama_mmap_supported(void) {
|
11426
|
-
return llama_supports_mmap();
|
11427
|
-
}
|
11428
|
-
|
11429
|
-
bool llama_mlock_supported(void) {
|
11430
|
-
return llama_supports_mlock();
|
11431
|
-
}
|
11432
|
-
|
11433
11690
|
void llama_backend_init(void) {
|
11434
11691
|
ggml_time_init();
|
11435
11692
|
|
@@ -11525,7 +11782,7 @@ struct llama_context * llama_new_context_with_model(
|
|
11525
11782
|
cparams.yarn_attn_factor = params.yarn_attn_factor;
|
11526
11783
|
cparams.yarn_beta_fast = params.yarn_beta_fast;
|
11527
11784
|
cparams.yarn_beta_slow = params.yarn_beta_slow;
|
11528
|
-
cparams.
|
11785
|
+
cparams.defrag_thold = params.defrag_thold;
|
11529
11786
|
cparams.offload_kqv = params.offload_kqv;
|
11530
11787
|
cparams.do_pooling = params.do_pooling;
|
11531
11788
|
|
@@ -11541,16 +11798,16 @@ struct llama_context * llama_new_context_with_model(
|
|
11541
11798
|
cparams.cb_eval_user_data = params.cb_eval_user_data;
|
11542
11799
|
|
11543
11800
|
auto rope_scaling_type = params.rope_scaling_type;
|
11544
|
-
if (rope_scaling_type ==
|
11801
|
+
if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_UNSPECIFIED) {
|
11545
11802
|
rope_scaling_type = hparams.rope_scaling_type_train;
|
11546
11803
|
}
|
11547
11804
|
|
11548
|
-
if (rope_scaling_type ==
|
11805
|
+
if (rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_NONE) {
|
11549
11806
|
cparams.rope_freq_scale = 1.0f; // never scale if scaling type is none
|
11550
11807
|
}
|
11551
11808
|
|
11552
11809
|
if (cparams.yarn_ext_factor < 0.0f) { // negative indicates 'not set'
|
11553
|
-
cparams.yarn_ext_factor = rope_scaling_type ==
|
11810
|
+
cparams.yarn_ext_factor = rope_scaling_type == LLAMA_ROPE_SCALING_TYPE_YARN ? 1.0f : 0.0f;
|
11554
11811
|
}
|
11555
11812
|
|
11556
11813
|
if (params.seed == LLAMA_DEFAULT_SEED) {
|
@@ -11584,8 +11841,8 @@ struct llama_context * llama_new_context_with_model(
|
|
11584
11841
|
}
|
11585
11842
|
#elif defined(GGML_USE_CUBLAS)
|
11586
11843
|
if (model->n_gpu_layers > 0) {
|
11587
|
-
// with split_mode
|
11588
|
-
if (model->split_mode ==
|
11844
|
+
// with split_mode LLAMA_SPLIT_MODE_NONE or LLAMA_SPLIT_MODE_ROW, only the main GPU backend is used
|
11845
|
+
if (model->split_mode == LLAMA_SPLIT_MODE_NONE || model->split_mode == LLAMA_SPLIT_MODE_ROW) {
|
11589
11846
|
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
|
11590
11847
|
if (backend == nullptr) {
|
11591
11848
|
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
|
@@ -11594,7 +11851,7 @@ struct llama_context * llama_new_context_with_model(
|
|
11594
11851
|
}
|
11595
11852
|
ctx->backends.push_back(backend);
|
11596
11853
|
} else {
|
11597
|
-
//
|
11854
|
+
// LLAMA_SPLIT_MODE_LAYER requires a backend for each GPU
|
11598
11855
|
for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
|
11599
11856
|
ggml_backend_t backend = ggml_backend_cuda_init(device);
|
11600
11857
|
if (backend == nullptr) {
|
@@ -11647,8 +11904,7 @@ struct llama_context * llama_new_context_with_model(
|
|
11647
11904
|
}
|
11648
11905
|
ctx->backends.push_back(ctx->backend_cpu);
|
11649
11906
|
|
11650
|
-
if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
|
11651
|
-
cparams.n_ctx, cparams.offload_kqv)) {
|
11907
|
+
if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v, cparams.n_ctx, cparams.offload_kqv)) {
|
11652
11908
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
11653
11909
|
llama_free(ctx);
|
11654
11910
|
return nullptr;
|
@@ -11727,7 +11983,7 @@ struct llama_context * llama_new_context_with_model(
|
|
11727
11983
|
}
|
11728
11984
|
|
11729
11985
|
// buffer used to store the computation graph and the tensor meta data
|
11730
|
-
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES +
|
11986
|
+
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead_custom(LLAMA_MAX_NODES, false));
|
11731
11987
|
|
11732
11988
|
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
|
11733
11989
|
|
@@ -11796,6 +12052,49 @@ enum llama_vocab_type llama_vocab_type(const struct llama_model * model) {
|
|
11796
12052
|
return model->vocab.type;
|
11797
12053
|
}
|
11798
12054
|
|
12055
|
+
enum llama_rope_type llama_rope_type(const struct llama_model * model) {
|
12056
|
+
switch (model->arch) {
|
12057
|
+
// these models do not use RoPE
|
12058
|
+
case LLM_ARCH_GPT2:
|
12059
|
+
case LLM_ARCH_GPTJ:
|
12060
|
+
case LLM_ARCH_GPTNEOX:
|
12061
|
+
case LLM_ARCH_MPT:
|
12062
|
+
case LLM_ARCH_REFACT:
|
12063
|
+
case LLM_ARCH_BLOOM:
|
12064
|
+
return LLAMA_ROPE_TYPE_NONE;
|
12065
|
+
|
12066
|
+
// use what we call a normal RoPE, operating on pairs of consecutive head values
|
12067
|
+
case LLM_ARCH_LLAMA:
|
12068
|
+
case LLM_ARCH_BAICHUAN:
|
12069
|
+
case LLM_ARCH_STARCODER:
|
12070
|
+
case LLM_ARCH_PLAMO:
|
12071
|
+
case LLM_ARCH_CODESHELL:
|
12072
|
+
case LLM_ARCH_ORION:
|
12073
|
+
case LLM_ARCH_INTERNLM2:
|
12074
|
+
case LLM_ARCH_MINICPM:
|
12075
|
+
return LLAMA_ROPE_TYPE_NORM;
|
12076
|
+
|
12077
|
+
// the pairs of head values are offset by n_rot/2
|
12078
|
+
case LLM_ARCH_FALCON:
|
12079
|
+
case LLM_ARCH_PERSIMMON:
|
12080
|
+
case LLM_ARCH_BERT:
|
12081
|
+
case LLM_ARCH_NOMIC_BERT:
|
12082
|
+
case LLM_ARCH_STABLELM:
|
12083
|
+
case LLM_ARCH_QWEN:
|
12084
|
+
case LLM_ARCH_QWEN2:
|
12085
|
+
case LLM_ARCH_PHI2:
|
12086
|
+
case LLM_ARCH_GEMMA:
|
12087
|
+
return LLAMA_ROPE_TYPE_NEOX;
|
12088
|
+
|
12089
|
+
// all model arches should be listed explicitly here
|
12090
|
+
case LLM_ARCH_UNKNOWN:
|
12091
|
+
GGML_ASSERT(false && "unknown architecture");
|
12092
|
+
break;
|
12093
|
+
}
|
12094
|
+
|
12095
|
+
return LLAMA_ROPE_TYPE_NONE;
|
12096
|
+
}
|
12097
|
+
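llama_rope_type() exposes per-architecture RoPE behaviour to API users, which matters because the K-shift above is skipped entirely for LLAMA_ROPE_TYPE_NONE models. A minimal usage sketch:

    // query the new accessor before attempting anything RoPE-specific
    if (llama_rope_type(model) == LLAMA_ROPE_TYPE_NONE) {
        // e.g. BERT- or GPT-2-style models: cached positions cannot be shifted
    }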
|
11799
12098
|
int32_t llama_n_vocab(const struct llama_model * model) {
|
11800
12099
|
return model->vocab.id_to_token.size();
|
11801
12100
|
}
|
@@ -11898,15 +12197,6 @@ uint32_t llama_model_quantize(
|
|
11898
12197
|
}
|
11899
12198
|
}
|
11900
12199
|
|
11901
|
-
int32_t llama_apply_lora_from_file(struct llama_context * ctx, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
|
11902
|
-
try {
|
11903
|
-
return llama_apply_lora_from_file_internal(ctx->model, path_lora, scale, path_base_model, n_threads);
|
11904
|
-
} catch (const std::exception & err) {
|
11905
|
-
LLAMA_LOG_ERROR("%s: failed to apply lora adapter: %s\n", __func__, err.what());
|
11906
|
-
return 1;
|
11907
|
-
}
|
11908
|
-
}
|
11909
|
-
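The context-level LoRA entry point is gone; only the model-level variant shown next remains. A hedged migration sketch with placeholder paths:

    // was: llama_apply_lora_from_file(ctx, "adapter.bin", 1.0f, nullptr, 4);
    llama_model_apply_lora_from_file(model, "adapter.bin", 1.0f, /*path_base_model*/ nullptr, /*n_threads*/ 4);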
|
11910
12200
|
int32_t llama_model_apply_lora_from_file(const struct llama_model * model, const char * path_lora, float scale, const char * path_base_model, int32_t n_threads) {
|
11911
12201
|
try {
|
11912
12202
|
return llama_apply_lora_from_file_internal(*model, path_lora, scale, path_base_model, n_threads);
|
@@ -12038,12 +12328,12 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
|
|
12038
12328
|
llama_kv_cache_seq_keep(ctx->kv_self, seq_id);
|
12039
12329
|
}
|
12040
12330
|
|
12041
|
-
void
|
12331
|
+
void llama_kv_cache_seq_add(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
|
12042
12332
|
if (delta == 0) {
|
12043
12333
|
return;
|
12044
12334
|
}
|
12045
12335
|
|
12046
|
-
|
12336
|
+
llama_kv_cache_seq_add(ctx->kv_self, seq_id, p0, p1, delta);
|
12047
12337
|
}
|
12048
12338
|
|
12049
12339
|
void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
|
@@ -12054,6 +12344,19 @@ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, lla
|
|
12054
12344
|
llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
|
12055
12345
|
}
|
12056
12346
|
|
12347
|
+
llama_pos llama_kv_cache_seq_pos_max(struct llama_context * ctx, llama_seq_id seq_id) {
|
12348
|
+
return llama_kv_cache_seq_pos_max(ctx->kv_self, seq_id);
|
12349
|
+
}
|
12350
|
+
|
12351
|
+
void llama_kv_cache_defrag(struct llama_context * ctx) {
|
12352
|
+
llama_kv_cache_defrag(ctx->kv_self);
|
12353
|
+
}
|
12354
|
+
|
12355
|
+
void llama_kv_cache_update(struct llama_context * ctx) {
|
12356
|
+
llama_kv_cache_update_internal(*ctx);
|
12357
|
+
}
|
12358
|
+
|
12359
|
+
|
12057
12360
|
// Returns the *maximum* size of the state
|
12058
12361
|
size_t llama_get_state_size(const struct llama_context * ctx) {
|
12059
12362
|
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
|
@@ -12180,10 +12483,10 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
12180
12483
|
const auto & hparams = ctx->model.hparams;
|
12181
12484
|
const auto & cparams = ctx->cparams;
|
12182
12485
|
|
12183
|
-
const
|
12184
|
-
const
|
12185
|
-
const
|
12186
|
-
const
|
12486
|
+
const uint32_t n_layer = hparams.n_layer;
|
12487
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
12488
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
12489
|
+
const uint32_t n_ctx = cparams.n_ctx;
|
12187
12490
|
|
12188
12491
|
const size_t kv_buf_size = kv_self.total_size();
|
12189
12492
|
const uint32_t kv_head = kv_self.head;
|
@@ -12198,14 +12501,16 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
12198
12501
|
if (kv_buf_size) {
|
12199
12502
|
std::vector<uint8_t> tmp_buf;
|
12200
12503
|
for (int il = 0; il < (int) n_layer; ++il) {
|
12201
|
-
size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
12504
|
+
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
12505
|
+
|
12202
12506
|
tmp_buf.resize(k_size);
|
12203
12507
|
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
|
12204
12508
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
12205
12509
|
|
12206
12510
|
// v is not contiguous, copy row by row
|
12207
|
-
size_t v_row_size
|
12208
|
-
size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
|
12511
|
+
const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
12512
|
+
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
|
12513
|
+
|
12209
12514
|
tmp_buf.resize(v_row_size);
|
12210
12515
|
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
12211
12516
|
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*v_row_stride, tmp_buf.size());
|
@@ -12238,8 +12543,8 @@ size_t llama_copy_state_data(struct llama_context * ctx, uint8_t * dst) {
|
|
12238
12543
|
}
|
12239
12544
|
|
12240
12545
|
// Sets the state reading from the specified source address
|
12241
|
-
size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
12242
|
-
uint8_t * inp = src;
|
12546
|
+
size_t llama_set_state_data(struct llama_context * ctx, const uint8_t * src) {
|
12547
|
+
const uint8_t * inp = src;
|
12243
12548
|
|
12244
12549
|
// set rng
|
12245
12550
|
{
|
@@ -12248,7 +12553,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
12248
12553
|
|
12249
12554
|
GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
|
12250
12555
|
|
12251
|
-
std::string rng_str((char *)inp, rng_size); inp += rng_size;
|
12556
|
+
std::string rng_str((const char *)inp, rng_size); inp += rng_size;
|
12252
12557
|
|
12253
12558
|
std::istringstream rng_ss(rng_str);
|
12254
12559
|
rng_ss >> ctx->rng;
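llama_set_state_data() now takes a const source buffer, so saved state can be restored from read-only memory without a cast. A round-trip sketch using its unchanged companions:

    std::vector<uint8_t> state(llama_get_state_size(ctx));
    llama_copy_state_data(ctx, state.data());     // snapshot
    // ... later, possibly in another context created with identical parameters ...
    llama_set_state_data(ctx, state.data());      // src is now const uint8_t *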
|
@@ -12292,10 +12597,10 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
12292
12597
|
const auto & hparams = ctx->model.hparams;
|
12293
12598
|
const auto & cparams = ctx->cparams;
|
12294
12599
|
|
12295
|
-
const
|
12296
|
-
const
|
12297
|
-
const
|
12298
|
-
const
|
12600
|
+
const uint32_t n_layer = hparams.n_layer;
|
12601
|
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
12602
|
+
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
12603
|
+
const uint32_t n_ctx = cparams.n_ctx;
|
12299
12604
|
|
12300
12605
|
size_t kv_buf_size;
|
12301
12606
|
uint32_t kv_head;
|
@@ -12311,13 +12616,15 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
12311
12616
|
GGML_ASSERT(kv_self.total_size() == kv_buf_size);
|
12312
12617
|
|
12313
12618
|
for (int il = 0; il < (int) n_layer; ++il) {
|
12314
|
-
size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
12619
|
+
const size_t k_size = ggml_row_size(kv_self.k_l[il]->type, n_embd_k_gqa*kv_head);
|
12620
|
+
|
12315
12621
|
ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
|
12316
12622
|
inp += k_size;
|
12317
12623
|
|
12318
12624
|
// v is not contiguous, copy row by row
|
12319
|
-
size_t v_row_size
|
12320
|
-
size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
|
12625
|
+
const size_t v_row_size = ggml_row_size(kv_self.v_l[il]->type, kv_head);
|
12626
|
+
const size_t v_row_stride = ggml_row_size(kv_self.v_l[il]->type, n_ctx);
|
12627
|
+
|
12321
12628
|
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
12322
12629
|
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*v_row_stride, v_row_size);
|
12323
12630
|
inp += v_row_size;
|
@@ -12439,38 +12746,6 @@ bool llama_save_session_file(struct llama_context * ctx, const char * path_sessi
|
|
12439
12746
|
return true;
|
12440
12747
|
}
|
12441
12748
|
|
12442
|
-
int llama_eval(
|
12443
|
-
struct llama_context * ctx,
|
12444
|
-
llama_token * tokens,
|
12445
|
-
int32_t n_tokens,
|
12446
|
-
int32_t n_past) {
|
12447
|
-
llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
|
12448
|
-
|
12449
|
-
const int ret = llama_decode_internal(*ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0));
|
12450
|
-
if (ret < 0) {
|
12451
|
-
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
|
12452
|
-
}
|
12453
|
-
|
12454
|
-
return ret;
|
12455
|
-
}
|
12456
|
-
|
12457
|
-
int llama_eval_embd(
|
12458
|
-
struct llama_context * ctx,
|
12459
|
-
float * embd,
|
12460
|
-
int32_t n_tokens,
|
12461
|
-
int32_t n_past) {
|
12462
|
-
llama_kv_cache_seq_rm(ctx->kv_self, -1, n_past, -1);
|
12463
|
-
|
12464
|
-
llama_batch batch = { n_tokens, nullptr, embd, nullptr, nullptr, nullptr, nullptr, n_past, 1, 0, };
|
12465
|
-
|
12466
|
-
const int ret = llama_decode_internal(*ctx, batch);
|
12467
|
-
if (ret < 0) {
|
12468
|
-
LLAMA_LOG_ERROR("%s: failed to decode, ret = %d\n", __func__, ret);
|
12469
|
-
}
|
12470
|
-
|
12471
|
-
return ret;
|
12472
|
-
}
|
12473
|
-
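The long-deprecated llama_eval()/llama_eval_embd() helpers are removed; their bodies above already show the replacement, which callers can inline directly. A migration sketch with assumed caller variables tokens, n_tokens and n_past:

    llama_kv_cache_seq_rm(ctx, -1, n_past, -1);   // drop cached entries past n_past, all sequences
    if (llama_decode(ctx, llama_batch_get_one(tokens, n_tokens, n_past, 0)) < 0) {
        // handle decode failure
    }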
|
12474
12749
|
void llama_set_n_threads(struct llama_context * ctx, uint32_t n_threads, uint32_t n_threads_batch) {
|
12475
12750
|
ctx->cparams.n_threads = n_threads;
|
12476
12751
|
ctx->cparams.n_threads_batch = n_threads_batch;
|