llama_cpp 0.12.1 → 0.12.2
Sign up to get free protection for your applications and to get access to all the features.
- checksums.yaml +4 -4
- data/CHANGELOG.md +8 -0
- data/ext/llama_cpp/llama_cpp.cpp +64 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +7 -0
- data/vendor/tmp/llama.cpp/Makefile +0 -9
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +510 -263
- data/vendor/tmp/llama.cpp/ggml-backend.h +42 -32
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +692 -476
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1860 -2073
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +1638 -134
- data/vendor/tmp/llama.cpp/ggml-quants.h +15 -4
- data/vendor/tmp/llama.cpp/ggml.c +142 -64
- data/vendor/tmp/llama.cpp/ggml.h +47 -29
- data/vendor/tmp/llama.cpp/llama.cpp +1219 -1615
- data/vendor/tmp/llama.cpp/llama.h +30 -8
- metadata +2 -2
@@ -1,5 +1,4 @@
|
|
1
1
|
#define LLAMA_API_INTERNAL
|
2
|
-
//#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - enables ggml-cuda through ggml-backend, disables partial offloading
|
3
2
|
#include "llama.h"
|
4
3
|
|
5
4
|
#include "unicode.h"
|
@@ -152,10 +151,6 @@ static bool is_float_close(float a, float b, float abs_tol) {
|
|
152
151
|
return std::fabs(b - a) <= abs_tol;
|
153
152
|
}
|
154
153
|
|
155
|
-
#ifdef GGML_USE_CPU_HBM
|
156
|
-
#include <hbwmalloc.h>
|
157
|
-
#endif
|
158
|
-
|
159
154
|
static void zeros(std::ofstream & file, size_t n) {
|
160
155
|
char zero = 0;
|
161
156
|
for (size_t i = 0; i < n; ++i) {
|
@@ -579,6 +574,9 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
|
|
579
574
|
{ LLM_TENSOR_OUTPUT, "output" },
|
580
575
|
{ LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
|
581
576
|
{ LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
|
577
|
+
{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
|
578
|
+
{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
|
579
|
+
{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
|
582
580
|
{ LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
|
583
581
|
{ LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
|
584
582
|
{ LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
|
@@ -988,20 +986,29 @@ struct llama_mmap {
|
|
988
986
|
throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
|
989
987
|
}
|
990
988
|
|
991
|
-
#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
992
989
|
if (prefetch > 0) {
|
993
|
-
|
994
|
-
|
995
|
-
|
996
|
-
|
997
|
-
|
998
|
-
|
999
|
-
|
1000
|
-
|
990
|
+
#if _WIN32_WINNT >= 0x602
|
991
|
+
// PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
|
992
|
+
BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
|
993
|
+
HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
|
994
|
+
|
995
|
+
// may fail on pre-Windows 8 systems
|
996
|
+
pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
|
997
|
+
|
998
|
+
if (pPrefetchVirtualMemory) {
|
999
|
+
// advise the kernel to preload the mapped memory
|
1000
|
+
WIN32_MEMORY_RANGE_ENTRY range;
|
1001
|
+
range.VirtualAddress = addr;
|
1002
|
+
range.NumberOfBytes = (SIZE_T) std::min(size, prefetch);
|
1003
|
+
if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
|
1004
|
+
LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n",
|
1005
|
+
llama_format_win_err(GetLastError()).c_str());
|
1006
|
+
}
|
1007
|
+
}
|
1008
|
+
#else
|
1009
|
+
throw std::runtime_error("PrefetchVirtualMemory unavailable");
|
1010
|
+
#endif
|
1001
1011
|
}
|
1002
|
-
#else
|
1003
|
-
#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
|
1004
|
-
#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
|
1005
1012
|
}
|
1006
1013
|
|
1007
1014
|
void unmap_fragment(size_t first, size_t last) {
|
@@ -1107,7 +1114,7 @@ struct llama_mlock {
|
|
1107
1114
|
suggest = false;
|
1108
1115
|
}
|
1109
1116
|
|
1110
|
-
|
1117
|
+
LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
|
1111
1118
|
size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
|
1112
1119
|
return false;
|
1113
1120
|
}
|
@@ -1116,7 +1123,7 @@ struct llama_mlock {
|
|
1116
1123
|
|
1117
1124
|
static void raw_unlock(void * addr, size_t size) {
|
1118
1125
|
if (munlock(addr, size)) {
|
1119
|
-
|
1126
|
+
LLAMA_LOG_WARN("warning: failed to munlock buffer: %s\n", std::strerror(errno));
|
1120
1127
|
}
|
1121
1128
|
}
|
1122
1129
|
#elif defined(_WIN32)
|
@@ -1134,7 +1141,7 @@ struct llama_mlock {
|
|
1134
1141
|
return true;
|
1135
1142
|
}
|
1136
1143
|
if (tries == 2) {
|
1137
|
-
|
1144
|
+
LLAMA_LOG_WARN("warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
|
1138
1145
|
len, size, llama_format_win_err(GetLastError()).c_str());
|
1139
1146
|
return false;
|
1140
1147
|
}
|
@@ -1143,7 +1150,7 @@ struct llama_mlock {
|
|
1143
1150
|
// set size and try again.
|
1144
1151
|
SIZE_T min_ws_size, max_ws_size;
|
1145
1152
|
if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
|
1146
|
-
|
1153
|
+
LLAMA_LOG_WARN("warning: GetProcessWorkingSetSize failed: %s\n",
|
1147
1154
|
llama_format_win_err(GetLastError()).c_str());
|
1148
1155
|
return false;
|
1149
1156
|
}
|
@@ -1156,7 +1163,7 @@ struct llama_mlock {
|
|
1156
1163
|
min_ws_size += increment;
|
1157
1164
|
max_ws_size += increment;
|
1158
1165
|
if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
|
1159
|
-
|
1166
|
+
LLAMA_LOG_WARN("warning: SetProcessWorkingSetSize failed: %s\n",
|
1160
1167
|
llama_format_win_err(GetLastError()).c_str());
|
1161
1168
|
return false;
|
1162
1169
|
}
|
@@ -1165,7 +1172,7 @@ struct llama_mlock {
|
|
1165
1172
|
|
1166
1173
|
static void raw_unlock(void * ptr, size_t len) {
|
1167
1174
|
if (!VirtualUnlock(ptr, len)) {
|
1168
|
-
|
1175
|
+
LLAMA_LOG_WARN("warning: failed to VirtualUnlock buffer: %s\n",
|
1169
1176
|
llama_format_win_err(GetLastError()).c_str());
|
1170
1177
|
}
|
1171
1178
|
}
|
@@ -1177,7 +1184,7 @@ struct llama_mlock {
|
|
1177
1184
|
}
|
1178
1185
|
|
1179
1186
|
bool raw_lock(const void * addr, size_t len) const {
|
1180
|
-
|
1187
|
+
LLAMA_LOG_WARN("warning: mlock not supported on this system\n");
|
1181
1188
|
return false;
|
1182
1189
|
}
|
1183
1190
|
|
@@ -1185,12 +1192,6 @@ struct llama_mlock {
|
|
1185
1192
|
#endif
|
1186
1193
|
};
|
1187
1194
|
|
1188
|
-
typedef void (*offload_func_t)(struct ggml_tensor * tensor);
|
1189
|
-
|
1190
|
-
static void ggml_offload_nop(struct ggml_tensor * tensor) {
|
1191
|
-
(void) tensor;
|
1192
|
-
}
|
1193
|
-
|
1194
1195
|
static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
|
1195
1196
|
std::vector<char> result(8, 0);
|
1196
1197
|
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
|
@@ -1206,19 +1207,14 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
|
|
1206
1207
|
return std::string(result.data(), result.size());
|
1207
1208
|
}
|
1208
1209
|
|
1209
|
-
static ggml_backend_buffer_type_t
|
1210
|
+
static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
|
1210
1211
|
ggml_backend_buffer_type_t buft = nullptr;
|
1211
1212
|
|
1212
|
-
#
|
1213
|
-
|
1214
|
-
|
1215
|
-
|
1216
|
-
#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
1217
|
-
if (n_gpu_layers > 0) {
|
1218
|
-
buft = ggml_backend_cuda_buffer_type(0);
|
1213
|
+
#if defined(GGML_USE_CUBLAS)
|
1214
|
+
// host buffers should only be used when data is expected to be copied to/from the GPU
|
1215
|
+
if (host_buffer) {
|
1216
|
+
buft = ggml_backend_cuda_host_buffer_type();
|
1219
1217
|
}
|
1220
|
-
#elif defined(GGML_USE_CUBLAS)
|
1221
|
-
buft = ggml_backend_cuda_host_buffer_type();
|
1222
1218
|
#elif defined(GGML_USE_CPU_HBM)
|
1223
1219
|
buft = ggml_backend_cpu_hbm_buffer_type();
|
1224
1220
|
#endif
|
@@ -1226,10 +1222,45 @@ static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
|
|
1226
1222
|
if (buft == nullptr) {
|
1227
1223
|
buft = ggml_backend_cpu_buffer_type();
|
1228
1224
|
}
|
1225
|
+
return buft;
|
1226
|
+
|
1227
|
+
GGML_UNUSED(host_buffer);
|
1228
|
+
}
|
1229
|
+
|
1230
|
+
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
|
1231
|
+
ggml_backend_buffer_type_t buft = nullptr;
|
1232
|
+
|
1233
|
+
#ifdef GGML_USE_METAL
|
1234
|
+
buft = ggml_backend_metal_buffer_type();
|
1235
|
+
#elif defined(GGML_USE_CUBLAS)
|
1236
|
+
buft = ggml_backend_cuda_buffer_type(gpu);
|
1237
|
+
#elif defined(GGML_USE_CLBLAST)
|
1238
|
+
buft = ggml_backend_opencl_buffer_type();
|
1239
|
+
#endif
|
1240
|
+
|
1241
|
+
if (buft == nullptr) {
|
1242
|
+
buft = llama_default_buffer_type_cpu(true);
|
1243
|
+
}
|
1244
|
+
return buft;
|
1245
|
+
|
1246
|
+
GGML_UNUSED(gpu);
|
1247
|
+
}
|
1248
|
+
|
1249
|
+
static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
|
1250
|
+
ggml_backend_buffer_type_t buft = nullptr;
|
1251
|
+
|
1252
|
+
#ifdef GGML_USE_CUBLAS
|
1253
|
+
if (ggml_backend_cuda_get_device_count() > 1) {
|
1254
|
+
buft = ggml_backend_cuda_split_buffer_type(tensor_split);
|
1255
|
+
}
|
1256
|
+
#endif
|
1229
1257
|
|
1258
|
+
if (buft == nullptr) {
|
1259
|
+
buft = llama_default_buffer_type_offload(fallback_gpu);
|
1260
|
+
}
|
1230
1261
|
return buft;
|
1231
1262
|
|
1232
|
-
GGML_UNUSED(
|
1263
|
+
GGML_UNUSED(tensor_split);
|
1233
1264
|
}
|
1234
1265
|
|
1235
1266
|
//
|
@@ -1239,7 +1270,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
|
|
1239
1270
|
struct llama_state {
|
1240
1271
|
llama_state() {
|
1241
1272
|
#ifdef GGML_USE_METAL
|
1242
|
-
|
1273
|
+
ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
|
1243
1274
|
#endif
|
1244
1275
|
}
|
1245
1276
|
|
@@ -1440,24 +1471,24 @@ struct llama_kv_cache {
|
|
1440
1471
|
std::vector<struct ggml_tensor *> k_l; // per layer
|
1441
1472
|
std::vector<struct ggml_tensor *> v_l;
|
1442
1473
|
|
1443
|
-
struct ggml_context
|
1474
|
+
std::vector<struct ggml_context *> ctxs;
|
1475
|
+
std::vector<ggml_backend_buffer_t> bufs;
|
1444
1476
|
|
1445
|
-
|
1477
|
+
size_t total_size() const {
|
1478
|
+
size_t size = 0;
|
1479
|
+
for (ggml_backend_buffer_t buf : bufs) {
|
1480
|
+
size += ggml_backend_buffer_get_size(buf);
|
1481
|
+
}
|
1482
|
+
return size;
|
1483
|
+
}
|
1446
1484
|
|
1447
1485
|
~llama_kv_cache() {
|
1448
|
-
|
1449
|
-
if (ggml_cublas_loaded()) {
|
1450
|
-
for (size_t i = 0; i < k_l.size(); ++i) {
|
1451
|
-
ggml_cuda_free_data(k_l[i]);
|
1452
|
-
ggml_cuda_free_data(v_l[i]);
|
1453
|
-
}
|
1454
|
-
}
|
1455
|
-
#endif
|
1456
|
-
if (ctx) {
|
1486
|
+
for (struct ggml_context * ctx : ctxs) {
|
1457
1487
|
ggml_free(ctx);
|
1458
1488
|
}
|
1459
|
-
|
1460
|
-
|
1489
|
+
for (ggml_backend_buffer_t buf : bufs) {
|
1490
|
+
ggml_backend_buffer_free(buf);
|
1491
|
+
}
|
1461
1492
|
}
|
1462
1493
|
};
|
1463
1494
|
|
@@ -1534,16 +1565,32 @@ struct llama_model {
|
|
1534
1565
|
|
1535
1566
|
std::vector<llama_layer> layers;
|
1536
1567
|
|
1568
|
+
llama_split_mode split_mode;
|
1569
|
+
int main_gpu;
|
1537
1570
|
int n_gpu_layers;
|
1538
1571
|
|
1539
1572
|
// gguf metadata
|
1540
1573
|
std::unordered_map<std::string, std::string> gguf_kv;
|
1541
1574
|
|
1542
|
-
//
|
1543
|
-
struct
|
1575
|
+
// layer -> buffer type mapping
|
1576
|
+
struct layer_buft {
|
1577
|
+
layer_buft() : buft_matrix(nullptr), buft(nullptr) {}
|
1578
|
+
layer_buft(ggml_backend_buffer_type_t matrix) : buft_matrix(matrix), buft(matrix) {}
|
1579
|
+
layer_buft(ggml_backend_buffer_type_t matrix, ggml_backend_buffer_type_t other) : buft_matrix(matrix), buft(other) {}
|
1580
|
+
|
1581
|
+
ggml_backend_buffer_type_t buft_matrix; // matrices only - used by split buffers and backends that support only matrix multiplication
|
1582
|
+
ggml_backend_buffer_type_t buft; // everything else
|
1583
|
+
};
|
1584
|
+
|
1585
|
+
layer_buft buft_input;
|
1586
|
+
layer_buft buft_output;
|
1587
|
+
std::vector<layer_buft> buft_layer;
|
1544
1588
|
|
1545
|
-
// the model
|
1546
|
-
|
1589
|
+
// contexts where the model tensors metadata is stored
|
1590
|
+
std::vector<struct ggml_context *> ctxs;
|
1591
|
+
|
1592
|
+
// the model memory buffers for the tensor data
|
1593
|
+
std::vector<ggml_backend_buffer_t> bufs;
|
1547
1594
|
|
1548
1595
|
// model memory mapped file
|
1549
1596
|
std::unique_ptr<llama_mmap> mapping;
|
@@ -1559,39 +1606,32 @@ struct llama_model {
|
|
1559
1606
|
int64_t t_start_us = 0;
|
1560
1607
|
|
1561
1608
|
~llama_model() {
|
1562
|
-
|
1563
|
-
if (ggml_cublas_loaded()) {
|
1564
|
-
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
|
1565
|
-
ggml_cuda_free_data(tensors_by_name[i].second);
|
1566
|
-
}
|
1567
|
-
ggml_cuda_free_scratch();
|
1568
|
-
}
|
1569
|
-
#endif
|
1570
|
-
|
1571
|
-
#if defined(GGML_USE_CLBLAST)
|
1572
|
-
for (size_t i = 0; i < tensors_by_name.size(); ++i) {
|
1573
|
-
ggml_cl_free_data(tensors_by_name[i].second);
|
1574
|
-
}
|
1575
|
-
#endif
|
1576
|
-
if (ctx) {
|
1609
|
+
for (struct ggml_context * ctx : ctxs) {
|
1577
1610
|
ggml_free(ctx);
|
1578
1611
|
}
|
1579
|
-
|
1580
|
-
|
1612
|
+
for (ggml_backend_buffer_t buf : bufs) {
|
1613
|
+
ggml_backend_buffer_free(buf);
|
1614
|
+
}
|
1581
1615
|
}
|
1582
1616
|
};
|
1583
1617
|
|
1584
1618
|
struct llama_context {
|
1585
1619
|
llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
|
1586
1620
|
~llama_context() {
|
1587
|
-
|
1588
|
-
|
1589
|
-
|
1621
|
+
ggml_backend_sched_free(sched);
|
1622
|
+
|
1623
|
+
for (ggml_backend_t backend : backends) {
|
1624
|
+
ggml_backend_free(backend);
|
1625
|
+
}
|
1590
1626
|
}
|
1591
1627
|
|
1592
1628
|
llama_cparams cparams;
|
1593
1629
|
|
1594
|
-
ggml_backend_t
|
1630
|
+
std::vector<ggml_backend_t> backends;
|
1631
|
+
#ifdef GGML_USE_METAL
|
1632
|
+
ggml_backend_t backend_metal = nullptr;
|
1633
|
+
#endif
|
1634
|
+
ggml_backend_t backend_cpu = nullptr;
|
1595
1635
|
|
1596
1636
|
const llama_model & model;
|
1597
1637
|
|
@@ -1625,8 +1665,9 @@ struct llama_context {
|
|
1625
1665
|
|
1626
1666
|
// memory buffers used to evaluate the model
|
1627
1667
|
std::vector<uint8_t> buf_compute_meta;
|
1628
|
-
|
1629
|
-
|
1668
|
+
ggml_backend_sched_t sched = nullptr;
|
1669
|
+
// allocator for the input tensors
|
1670
|
+
ggml_tallocr * alloc = nullptr;
|
1630
1671
|
|
1631
1672
|
// temporary buffer for copying data to/from the backend
|
1632
1673
|
std::vector<no_init<uint8_t>> buf_copy;
|
@@ -1641,16 +1682,17 @@ struct llama_context {
|
|
1641
1682
|
//
|
1642
1683
|
|
1643
1684
|
static bool llama_kv_cache_init(
|
1644
|
-
const struct llama_hparams & hparams,
|
1645
1685
|
struct llama_kv_cache & cache,
|
1686
|
+
const llama_model & model,
|
1646
1687
|
ggml_type ktype,
|
1647
1688
|
ggml_type vtype,
|
1648
1689
|
uint32_t n_ctx,
|
1649
|
-
int n_gpu_layers,
|
1650
1690
|
bool offload) {
|
1691
|
+
const struct llama_hparams & hparams = model.hparams;
|
1692
|
+
|
1651
1693
|
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
1652
1694
|
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
1653
|
-
const
|
1695
|
+
const int64_t n_layer = hparams.n_layer;
|
1654
1696
|
|
1655
1697
|
cache.has_shift = false;
|
1656
1698
|
|
@@ -1661,62 +1703,65 @@ static bool llama_kv_cache_init(
|
|
1661
1703
|
cache.cells.clear();
|
1662
1704
|
cache.cells.resize(n_ctx);
|
1663
1705
|
|
1664
|
-
|
1665
|
-
|
1666
|
-
|
1667
|
-
params.no_alloc = true;
|
1668
|
-
|
1669
|
-
cache.ctx = ggml_init(params);
|
1706
|
+
#ifdef GGML_USE_CLBLAST
|
1707
|
+
offload = false;
|
1708
|
+
#endif
|
1670
1709
|
|
1671
|
-
|
1710
|
+
// count used buffer types
|
1711
|
+
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
|
1712
|
+
if (offload) {
|
1713
|
+
for (int64_t i = 0; i < n_layer; ++i) {
|
1714
|
+
buft_layer_count[model.buft_layer[i].buft]++;
|
1715
|
+
}
|
1716
|
+
} else {
|
1717
|
+
buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
|
1718
|
+
}
|
1672
1719
|
|
1673
|
-
|
1674
|
-
|
1675
|
-
|
1720
|
+
// create a context for each buffer type
|
1721
|
+
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
1722
|
+
for (auto & it : buft_layer_count) {
|
1723
|
+
int n_layers = it.second;
|
1724
|
+
struct ggml_init_params params = {
|
1725
|
+
/*.mem_size =*/ 2u*n_layers*ggml_tensor_overhead(),
|
1726
|
+
/*.mem_buffer =*/ NULL,
|
1727
|
+
/*.no_alloc =*/ true,
|
1728
|
+
};
|
1729
|
+
ggml_context * ctx = ggml_init(params);
|
1730
|
+
if (!ctx) {
|
1731
|
+
LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache\n", __func__);
|
1732
|
+
return false;
|
1733
|
+
}
|
1734
|
+
ctx_map[it.first] = ctx;
|
1735
|
+
cache.ctxs.push_back(ctx);
|
1676
1736
|
}
|
1677
1737
|
|
1678
1738
|
cache.k_l.reserve(n_layer);
|
1679
1739
|
cache.v_l.reserve(n_layer);
|
1680
1740
|
|
1681
|
-
const int i_gpu_start = (int) n_layer - n_gpu_layers;
|
1682
|
-
|
1683
1741
|
for (int i = 0; i < (int) n_layer; i++) {
|
1684
|
-
|
1685
|
-
ggml_tensor *
|
1742
|
+
struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
|
1743
|
+
ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx);
|
1744
|
+
ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx);
|
1686
1745
|
ggml_format_name(k, "cache_k_l%d", i);
|
1687
1746
|
ggml_format_name(v, "cache_v_l%d", i);
|
1688
1747
|
cache.k_l.push_back(k);
|
1689
1748
|
cache.v_l.push_back(v);
|
1690
|
-
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
1691
|
-
if (i >= i_gpu_start) {
|
1692
|
-
if (offload) {
|
1693
|
-
ggml_cuda_assign_buffers_no_scratch(k);
|
1694
|
-
ggml_cuda_assign_buffers_no_scratch(v);
|
1695
|
-
vram_kv_cache += ggml_nbytes(k);
|
1696
|
-
vram_kv_cache += ggml_nbytes(v);
|
1697
|
-
// HACK: mark tensor as allocated
|
1698
|
-
k->data = v->data = (void *)(uintptr_t)1;
|
1699
|
-
}
|
1700
|
-
}
|
1701
|
-
#endif // GGML_USE_CUBLAS
|
1702
|
-
}
|
1703
|
-
|
1704
|
-
// allocate tensors
|
1705
|
-
cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, llama_default_buffer_type(n_gpu_layers));
|
1706
|
-
|
1707
|
-
// buf may be NULL with full offload
|
1708
|
-
if (cache.buf) {
|
1709
|
-
// initialize the buffer to avoid NaNs in the padding
|
1710
|
-
ggml_backend_buffer_clear(cache.buf, 0);
|
1711
1749
|
}
|
1712
1750
|
|
1713
|
-
|
1714
|
-
|
1751
|
+
// allocate tensors and initialize the buffers to avoid NaNs in the padding
|
1752
|
+
for (auto it : ctx_map) {
|
1753
|
+
ggml_backend_buffer_type_t buft = it.first;
|
1754
|
+
ggml_context * ctx = it.second;
|
1755
|
+
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
1756
|
+
if (!buf) {
|
1757
|
+
LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
|
1758
|
+
return false;
|
1759
|
+
}
|
1760
|
+
ggml_backend_buffer_clear(buf, 0);
|
1761
|
+
LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
|
1762
|
+
cache.bufs.push_back(buf);
|
1715
1763
|
}
|
1716
1764
|
|
1717
|
-
GGML_UNUSED(i_gpu_start);
|
1718
|
-
GGML_UNUSED(offload);
|
1719
|
-
|
1720
1765
|
return true;
|
1721
1766
|
}
|
1722
1767
|
|
@@ -2040,13 +2085,13 @@ namespace GGUFMeta {
|
|
2040
2085
|
__func__, override_type_to_str(override->tag), override->key);
|
2041
2086
|
switch (override->tag) {
|
2042
2087
|
case LLAMA_KV_OVERRIDE_BOOL: {
|
2043
|
-
|
2088
|
+
LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false");
|
2044
2089
|
} break;
|
2045
2090
|
case LLAMA_KV_OVERRIDE_INT: {
|
2046
|
-
|
2091
|
+
LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value);
|
2047
2092
|
} break;
|
2048
2093
|
case LLAMA_KV_OVERRIDE_FLOAT: {
|
2049
|
-
|
2094
|
+
LLAMA_LOG_INFO("%.6f\n", override->float_value);
|
2050
2095
|
} break;
|
2051
2096
|
default:
|
2052
2097
|
// Shouldn't be possible to end up here, but just in case...
|
@@ -2145,6 +2190,11 @@ struct llama_model_loader {
|
|
2145
2190
|
LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
|
2146
2191
|
|
2147
2192
|
llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
|
2193
|
+
int trace = 0;
|
2194
|
+
if (getenv("LLAMA_TRACE")) {
|
2195
|
+
trace = atoi(getenv("LLAMA_TRACE"));
|
2196
|
+
}
|
2197
|
+
|
2148
2198
|
struct gguf_init_params params = {
|
2149
2199
|
/*.no_alloc = */ true,
|
2150
2200
|
/*.ctx = */ &ctx_meta,
|
@@ -2197,11 +2247,10 @@ struct llama_model_loader {
|
|
2197
2247
|
type_max = type;
|
2198
2248
|
}
|
2199
2249
|
|
2200
|
-
|
2201
|
-
|
2202
|
-
|
2203
|
-
|
2204
|
-
#endif
|
2250
|
+
if (trace > 0) {
|
2251
|
+
struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
|
2252
|
+
LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
|
2253
|
+
}
|
2205
2254
|
}
|
2206
2255
|
|
2207
2256
|
switch (type_max) {
|
@@ -2349,9 +2398,8 @@ struct llama_model_loader {
|
|
2349
2398
|
return get_tensor_meta(get_tensor_name(i));
|
2350
2399
|
}
|
2351
2400
|
|
2352
|
-
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta
|
2401
|
+
struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) {
|
2353
2402
|
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
|
2354
|
-
tensor->backend = backend; // TODO: ggml_set_backend
|
2355
2403
|
ggml_set_name(tensor, ggml_get_name(meta));
|
2356
2404
|
|
2357
2405
|
n_created++;
|
@@ -2359,7 +2407,7 @@ struct llama_model_loader {
|
|
2359
2407
|
return tensor;
|
2360
2408
|
}
|
2361
2409
|
|
2362
|
-
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne,
|
2410
|
+
struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
|
2363
2411
|
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
|
2364
2412
|
|
2365
2413
|
if (cur == NULL) {
|
@@ -2369,12 +2417,6 @@ struct llama_model_loader {
|
|
2369
2417
|
throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
|
2370
2418
|
}
|
2371
2419
|
|
2372
|
-
if (backend == GGML_BACKEND_GPU_SPLIT) {
|
2373
|
-
if (ne.size() == 1) {
|
2374
|
-
throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str()));
|
2375
|
-
}
|
2376
|
-
}
|
2377
|
-
|
2378
2420
|
{
|
2379
2421
|
bool is_ok = true;
|
2380
2422
|
for (size_t i = 0; i < ne.size(); ++i) {
|
@@ -2392,7 +2434,7 @@ struct llama_model_loader {
|
|
2392
2434
|
}
|
2393
2435
|
}
|
2394
2436
|
|
2395
|
-
return create_tensor_for(ctx, cur
|
2437
|
+
return create_tensor_for(ctx, cur);
|
2396
2438
|
}
|
2397
2439
|
|
2398
2440
|
void done_getting_tensors() const {
|
@@ -2411,25 +2453,35 @@ struct llama_model_loader {
|
|
2411
2453
|
return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
|
2412
2454
|
}
|
2413
2455
|
|
2414
|
-
void init_mapping(bool prefetch = true) {
|
2415
|
-
|
2416
|
-
// prefetch only CPU tensors
|
2456
|
+
void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
|
2457
|
+
// prefetch the whole file - all the data is needed anyway
|
2417
2458
|
if (use_mmap) {
|
2418
|
-
|
2459
|
+
mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
|
2460
|
+
}
|
2419
2461
|
|
2420
|
-
|
2421
|
-
|
2422
|
-
|
2423
|
-
|
2424
|
-
|
2425
|
-
|
2462
|
+
// compute the total size of all tensors for progress reporting
|
2463
|
+
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
|
2464
|
+
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
|
2465
|
+
size_data += ggml_nbytes(cur);
|
2466
|
+
}
|
2467
|
+
|
2468
|
+
if (use_mmap && mapping) {
|
2469
|
+
if (lmlock) {
|
2470
|
+
lmlock->init(mapping->addr);
|
2426
2471
|
}
|
2427
|
-
|
2472
|
+
mmap_used_first = mapping->size;
|
2428
2473
|
}
|
2429
|
-
|
2430
|
-
|
2431
|
-
|
2432
|
-
|
2474
|
+
}
|
2475
|
+
|
2476
|
+
void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
|
2477
|
+
GGML_ASSERT(mapping);
|
2478
|
+
|
2479
|
+
*first = mapping->size;
|
2480
|
+
*last = 0;
|
2481
|
+
for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
|
2482
|
+
const size_t offs = file_offset(ggml_get_name(tensor));
|
2483
|
+
*first = std::min(*first, offs);
|
2484
|
+
*last = std::max(*last, offs + ggml_nbytes(tensor));
|
2433
2485
|
}
|
2434
2486
|
}
|
2435
2487
|
|
@@ -2438,8 +2490,11 @@ struct llama_model_loader {
|
|
2438
2490
|
const size_t offs = file_offset(ggml_get_name(cur));
|
2439
2491
|
|
2440
2492
|
if (use_mmap && mapping) {
|
2441
|
-
|
2442
|
-
|
2493
|
+
if (cur->data == nullptr) {
|
2494
|
+
cur->data = (uint8_t *)mapping->addr + offs;
|
2495
|
+
} else {
|
2496
|
+
memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
|
2497
|
+
}
|
2443
2498
|
} else {
|
2444
2499
|
GGML_ASSERT(cur->data != nullptr);
|
2445
2500
|
file.seek(offs, SEEK_SET);
|
@@ -2447,37 +2502,23 @@ struct llama_model_loader {
|
|
2447
2502
|
}
|
2448
2503
|
}
|
2449
2504
|
|
2450
|
-
|
2451
|
-
|
2452
|
-
|
2453
|
-
|
2454
|
-
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
|
2455
|
-
struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
|
2456
|
-
size_data += ggml_nbytes(cur);
|
2457
|
-
}
|
2458
|
-
|
2459
|
-
if (use_mmap && buf_mmap) {
|
2460
|
-
if (lmlock) {
|
2461
|
-
lmlock->init(mapping->addr);
|
2462
|
-
}
|
2463
|
-
}
|
2505
|
+
size_t size_done = 0;
|
2506
|
+
size_t size_data = 0;
|
2507
|
+
size_t mmap_used_first = -1;
|
2508
|
+
size_t mmap_used_last = 0;
|
2464
2509
|
|
2465
|
-
|
2466
|
-
|
2467
|
-
|
2468
|
-
const bool legacy_offload = false;
|
2469
|
-
#endif
|
2510
|
+
// Returns false if cancelled by progress_callback
|
2511
|
+
bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) {
|
2512
|
+
GGML_ASSERT(size_data != 0 && "call init_mapping() first");
|
2470
2513
|
|
2471
2514
|
std::vector<no_init<uint8_t>> read_buf;
|
2472
2515
|
|
2473
|
-
size_t size_done = 0;
|
2474
|
-
|
2475
|
-
size_t mmap_first = -1;
|
2476
|
-
size_t mmap_last = 0;
|
2477
|
-
|
2478
2516
|
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
|
2479
2517
|
struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
|
2480
|
-
|
2518
|
+
if (!cur) {
|
2519
|
+
// some tensors may be allocated in a different context
|
2520
|
+
continue;
|
2521
|
+
}
|
2481
2522
|
|
2482
2523
|
if (progress_callback) {
|
2483
2524
|
if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
|
@@ -2487,67 +2528,48 @@ struct llama_model_loader {
|
|
2487
2528
|
|
2488
2529
|
const size_t offs = file_offset(ggml_get_name(cur));
|
2489
2530
|
|
2490
|
-
if (
|
2491
|
-
if (
|
2492
|
-
|
2493
|
-
|
2494
|
-
|
2495
|
-
lmlock->grow_to(offs + ggml_nbytes(cur));
|
2496
|
-
}
|
2497
|
-
mmap_first = std::min(mmap_first, offs);
|
2498
|
-
mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur));
|
2499
|
-
} else {
|
2500
|
-
ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
|
2531
|
+
if (use_mmap && mapping) {
|
2532
|
+
if (buf_mmap && cur->data == nullptr) {
|
2533
|
+
ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
|
2534
|
+
if (lmlock) {
|
2535
|
+
lmlock->grow_to(offs + ggml_nbytes(cur));
|
2501
2536
|
}
|
2537
|
+
mmap_used_first = std::min(mmap_used_first, offs);
|
2538
|
+
mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur));
|
2502
2539
|
} else {
|
2503
|
-
|
2504
|
-
file.seek(offs, SEEK_SET);
|
2505
|
-
file.read_raw(cur->data, ggml_nbytes(cur));
|
2506
|
-
} else {
|
2507
|
-
read_buf.resize(ggml_nbytes(cur));
|
2508
|
-
file.seek(offs, SEEK_SET);
|
2509
|
-
file.read_raw(read_buf.data(), ggml_nbytes(cur));
|
2510
|
-
ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
|
2511
|
-
}
|
2540
|
+
ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
|
2512
2541
|
}
|
2513
2542
|
} else {
|
2514
|
-
|
2515
|
-
|
2516
|
-
|
2517
|
-
if (use_mmap && mapping) {
|
2518
|
-
data = (uint8_t *) mapping->addr + offs;
|
2543
|
+
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
2544
|
+
file.seek(offs, SEEK_SET);
|
2545
|
+
file.read_raw(cur->data, ggml_nbytes(cur));
|
2519
2546
|
} else {
|
2520
2547
|
read_buf.resize(ggml_nbytes(cur));
|
2521
2548
|
file.seek(offs, SEEK_SET);
|
2522
2549
|
file.read_raw(read_buf.data(), ggml_nbytes(cur));
|
2523
|
-
|
2550
|
+
ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
|
2524
2551
|
}
|
2525
|
-
|
2526
|
-
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
2527
|
-
ggml_cuda_transform_tensor(data, cur);
|
2528
|
-
#elif defined(GGML_USE_CLBLAST)
|
2529
|
-
GGML_ASSERT(cur->backend == GGML_BACKEND_GPU);
|
2530
|
-
ggml_cl_transform_tensor(data, cur);
|
2531
|
-
#else
|
2532
|
-
GGML_ASSERT(!"GPU tensor without a GPU backend");
|
2533
|
-
GGML_UNUSED(data);
|
2534
|
-
#endif
|
2535
2552
|
}
|
2536
2553
|
|
2537
2554
|
size_done += ggml_nbytes(cur);
|
2538
2555
|
}
|
2539
2556
|
|
2540
|
-
//
|
2541
|
-
if (
|
2542
|
-
|
2543
|
-
|
2557
|
+
// check if this is the last call and do final cleanup
|
2558
|
+
if (size_done >= size_data) {
|
2559
|
+
// unmap offloaded tensors and metadata
|
2560
|
+
if (use_mmap && mapping) {
|
2561
|
+
mapping->unmap_fragment(0, mmap_used_first);
|
2562
|
+
if (mmap_used_last != 0) {
|
2563
|
+
mapping->unmap_fragment(mmap_used_last, mapping->size);
|
2564
|
+
}
|
2565
|
+
}
|
2566
|
+
if (progress_callback) {
|
2567
|
+
// Even though the model is done loading, we still honor
|
2568
|
+
// cancellation since we need to free allocations.
|
2569
|
+
return progress_callback(1.0f, progress_callback_user_data);
|
2570
|
+
}
|
2544
2571
|
}
|
2545
2572
|
|
2546
|
-
if (progress_callback) {
|
2547
|
-
// Even though the model is done loading, we still honor
|
2548
|
-
// cancellation since we need to free allocations.
|
2549
|
-
return progress_callback(1.0f, progress_callback_user_data);
|
2550
|
-
}
|
2551
2573
|
return true;
|
2552
2574
|
}
|
2553
2575
|
};
|
@@ -3176,6 +3198,7 @@ static bool llm_load_tensors(
|
|
3176
3198
|
llama_model_loader & ml,
|
3177
3199
|
llama_model & model,
|
3178
3200
|
int n_gpu_layers,
|
3201
|
+
enum llama_split_mode split_mode,
|
3179
3202
|
int main_gpu,
|
3180
3203
|
const float * tensor_split,
|
3181
3204
|
bool use_mlock,
|
@@ -3183,702 +3206,574 @@ static bool llm_load_tensors(
|
|
3183
3206
|
void * progress_callback_user_data) {
|
3184
3207
|
model.t_start_us = ggml_time_us();
|
3185
3208
|
|
3186
|
-
auto & ctx = model.ctx;
|
3187
3209
|
auto & hparams = model.hparams;
|
3188
3210
|
|
3211
|
+
model.split_mode = split_mode;
|
3212
|
+
model.main_gpu = main_gpu;
|
3189
3213
|
model.n_gpu_layers = n_gpu_layers;
|
3190
3214
|
|
3191
|
-
|
3215
|
+
const int64_t n_layer = hparams.n_layer;
|
3216
|
+
const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
|
3192
3217
|
|
3193
|
-
|
3218
|
+
// there is very little benefit to offloading the input layer, so always keep it on the CPU
|
3219
|
+
model.buft_input = llama_default_buffer_type_cpu(true);
|
3194
3220
|
|
3195
|
-
|
3221
|
+
model.buft_layer.resize(n_layer);
|
3222
|
+
|
3223
|
+
// assign cpu layers
|
3224
|
+
for (int64_t i = 0; i < i_gpu_start; ++i) {
|
3225
|
+
model.buft_layer[i] = llama_default_buffer_type_cpu(true);
|
3226
|
+
}
|
3227
|
+
|
3228
|
+
#ifdef GGML_USE_CUBLAS
|
3229
|
+
if (split_mode == LLAMA_SPLIT_LAYER) {
|
3230
|
+
// calculate the split points
|
3231
|
+
int device_count = ggml_backend_cuda_get_device_count();
|
3232
|
+
bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
|
3233
|
+
float splits[GGML_CUDA_MAX_DEVICES];
|
3234
|
+
if (all_zero) {
|
3235
|
+
// default split, by free memory
|
3236
|
+
for (int i = 0; i < device_count; ++i) {
|
3237
|
+
size_t total;
|
3238
|
+
size_t free;
|
3239
|
+
ggml_backend_cuda_get_device_memory(i, &total, &free);
|
3240
|
+
splits[i] = free;
|
3241
|
+
}
|
3242
|
+
} else {
|
3243
|
+
std::copy(tensor_split, tensor_split + device_count, splits);
|
3244
|
+
}
|
3245
|
+
|
3246
|
+
// sum and normalize the splits to get the split points
|
3247
|
+
float split_sum = 0.0f;
|
3248
|
+
for (int i = 0; i < device_count; ++i) {
|
3249
|
+
split_sum += splits[i];
|
3250
|
+
splits[i] = split_sum;
|
3251
|
+
}
|
3252
|
+
for (int i = 0; i < device_count; ++i) {
|
3253
|
+
splits[i] /= split_sum;
|
3254
|
+
}
|
3255
|
+
|
3256
|
+
// assign the repeating layers to the devices according to the splits
|
3257
|
+
int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
|
3258
|
+
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
|
3259
|
+
int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits;
|
3260
|
+
model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
|
3261
|
+
}
|
3262
|
+
// assign the output layer
|
3263
|
+
if (n_gpu_layers > n_layer) {
|
3264
|
+
int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits;
|
3265
|
+
model.buft_output = llama_default_buffer_type_offload(layer_gpu);
|
3266
|
+
} else {
|
3267
|
+
model.buft_output = llama_default_buffer_type_cpu(true);
|
3268
|
+
}
|
3269
|
+
} else
|
3270
|
+
#endif
|
3196
3271
|
{
|
3272
|
+
ggml_backend_buffer_type_t split_buft;
|
3273
|
+
if (split_mode == LLAMA_SPLIT_ROW) {
|
3274
|
+
split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
|
3275
|
+
} else {
|
3276
|
+
// LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported
|
3277
|
+
split_buft = llama_default_buffer_type_offload(main_gpu);
|
3278
|
+
}
|
3279
|
+
// assign the repeating layers
|
3280
|
+
for (int64_t i = i_gpu_start; i < n_layer; ++i) {
|
3281
|
+
model.buft_layer[i] = {
|
3282
|
+
split_buft,
|
3283
|
+
llama_default_buffer_type_offload(main_gpu)
|
3284
|
+
};
|
3285
|
+
}
|
3286
|
+
// assign the output layer
|
3287
|
+
if (n_gpu_layers > n_layer) {
|
3288
|
+
model.buft_output = {
|
3289
|
+
split_buft,
|
3290
|
+
llama_default_buffer_type_offload(main_gpu)
|
3291
|
+
};
|
3292
|
+
} else {
|
3293
|
+
model.buft_output = llama_default_buffer_type_cpu(true);
|
3294
|
+
}
|
3295
|
+
}
|
3296
|
+
|
3297
|
+
// count used buffer types
|
3298
|
+
std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
|
3299
|
+
buft_layer_count[model.buft_input.buft]++;
|
3300
|
+
buft_layer_count[model.buft_input.buft_matrix]++;
|
3301
|
+
buft_layer_count[model.buft_output.buft]++;
|
3302
|
+
buft_layer_count[model.buft_output.buft_matrix]++;
|
3303
|
+
for (int64_t i = 0; i < n_layer; ++i) {
|
3304
|
+
buft_layer_count[model.buft_layer[i].buft]++;
|
3305
|
+
buft_layer_count[model.buft_layer[i].buft_matrix]++;
|
3306
|
+
}
|
3307
|
+
|
3308
|
+
// create one context per buffer type
|
3309
|
+
size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
|
3310
|
+
std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
|
3311
|
+
for (auto & it : buft_layer_count) {
|
3197
3312
|
struct ggml_init_params params = {
|
3198
3313
|
/*.mem_size =*/ ctx_size,
|
3199
3314
|
/*.mem_buffer =*/ NULL,
|
3200
3315
|
/*.no_alloc =*/ true,
|
3201
3316
|
};
|
3202
|
-
|
3203
|
-
|
3204
|
-
|
3205
|
-
throw std::runtime_error(format("ggml_init() failed"));
|
3317
|
+
ggml_context * ctx = ggml_init(params);
|
3318
|
+
if (!ctx) {
|
3319
|
+
throw std::runtime_error(format("failed to create context"));
|
3206
3320
|
}
|
3321
|
+
ctx_map[it.first] = ctx;
|
3322
|
+
model.ctxs.push_back(ctx);
|
3207
3323
|
}
|
3208
3324
|
|
3209
|
-
(
|
3210
|
-
|
3211
|
-
enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
|
3212
|
-
enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
|
3213
|
-
|
3214
|
-
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
3215
|
-
if (ggml_cublas_loaded()) {
|
3216
|
-
LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
|
3217
|
-
ggml_cuda_set_main_device(main_gpu);
|
3218
|
-
|
3219
|
-
llama_backend_offload = GGML_BACKEND_GPU;
|
3220
|
-
llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
|
3221
|
-
}
|
3222
|
-
#elif defined(GGML_USE_CLBLAST)
|
3223
|
-
LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
|
3224
|
-
llama_backend_offload = GGML_BACKEND_GPU;
|
3225
|
-
llama_backend_offload_split = GGML_BACKEND_GPU;
|
3226
|
-
#endif
|
3325
|
+
LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);
|
3227
3326
|
|
3228
3327
|
// create tensors for the weights
|
3229
3328
|
{
|
3230
3329
|
const int64_t n_embd = hparams.n_embd;
|
3231
3330
|
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
3232
3331
|
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
|
3233
|
-
const int64_t
|
3332
|
+
const int64_t n_embd_gqa = n_embd_v_gqa;
|
3234
3333
|
const int64_t n_vocab = hparams.n_vocab;
|
3334
|
+
const int64_t n_ff = hparams.n_ff;
|
3335
|
+
|
3336
|
+
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
3337
|
+
|
3338
|
+
ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
|
3339
|
+
ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
|
3340
|
+
ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
|
3341
|
+
auto ctx_for_layer = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); };
|
3342
|
+
auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); };
|
3343
|
+
|
3344
|
+
model.layers.resize(n_layer);
|
3235
3345
|
|
3236
3346
|
const auto tn = LLM_TN(model.arch);
|
3237
3347
|
switch (model.arch) {
|
3238
3348
|
case LLM_ARCH_LLAMA:
|
3239
3349
|
case LLM_ARCH_REFACT:
|
3240
3350
|
{
|
3241
|
-
model.tok_embd = ml.create_tensor(
|
3351
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
3242
3352
|
|
3243
3353
|
// output
|
3244
3354
|
{
|
3245
|
-
|
3246
|
-
|
3247
|
-
|
3248
|
-
if (n_gpu_layers > int(n_layer)) {
|
3249
|
-
backend_norm = llama_backend_offload;
|
3250
|
-
backend_output = llama_backend_offload_split;
|
3251
|
-
} else {
|
3252
|
-
backend_norm = GGML_BACKEND_CPU;
|
3253
|
-
backend_output = GGML_BACKEND_CPU;
|
3254
|
-
}
|
3255
|
-
|
3256
|
-
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
3257
|
-
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
3355
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
3356
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
3258
3357
|
}
|
3259
3358
|
|
3260
|
-
|
3261
|
-
|
3262
|
-
|
3263
|
-
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
3264
|
-
|
3265
|
-
const int i_gpu_start = n_layer - n_gpu_layers;
|
3266
|
-
|
3267
|
-
model.layers.resize(n_layer);
|
3268
|
-
|
3269
|
-
for (uint32_t i = 0; i < n_layer; ++i) {
|
3270
|
-
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
|
3271
|
-
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
3359
|
+
for (int i = 0; i < n_layer; ++i) {
|
3360
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
3361
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
3272
3362
|
|
3273
3363
|
auto & layer = model.layers[i];
|
3274
3364
|
|
3275
|
-
layer.attn_norm = ml.create_tensor(
|
3365
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
3276
3366
|
|
3277
|
-
layer.wq = ml.create_tensor(
|
3278
|
-
layer.wk = ml.create_tensor(
|
3279
|
-
layer.wv = ml.create_tensor(
|
3280
|
-
layer.wo = ml.create_tensor(
|
3367
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
3368
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
3369
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
3370
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
3281
3371
|
|
3282
3372
|
// optional bias tensors
|
3283
|
-
layer.bq = ml.create_tensor(
|
3284
|
-
layer.bk = ml.create_tensor(
|
3285
|
-
layer.bv = ml.create_tensor(
|
3286
|
-
layer.bo = ml.create_tensor(
|
3373
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
|
3374
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
|
3375
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
|
3376
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
|
3287
3377
|
|
3288
|
-
layer.ffn_norm = ml.create_tensor(
|
3378
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
3289
3379
|
|
3290
|
-
layer.ffn_gate_inp = ml.create_tensor(
|
3380
|
+
layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
|
3291
3381
|
|
3292
3382
|
if (layer.ffn_gate_inp == nullptr) {
|
3293
3383
|
GGML_ASSERT(hparams.n_expert == 0);
|
3294
3384
|
GGML_ASSERT(hparams.n_expert_used == 0);
|
3295
3385
|
|
3296
|
-
layer.ffn_gate = ml.create_tensor(
|
3297
|
-
layer.ffn_down = ml.create_tensor(
|
3298
|
-
layer.ffn_up = ml.create_tensor(
|
3386
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
3387
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
3388
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
3299
3389
|
} else {
|
3300
3390
|
GGML_ASSERT(hparams.n_expert > 0);
|
3301
3391
|
GGML_ASSERT(hparams.n_expert_used > 0);
|
3302
3392
|
|
3303
3393
|
// MoE branch
|
3304
3394
|
for (uint32_t x = 0; x < hparams.n_expert; ++x) {
|
3305
|
-
layer.ffn_gate_exp[x] = ml.create_tensor(
|
3306
|
-
layer.ffn_down_exp[x] = ml.create_tensor(
|
3307
|
-
layer.ffn_up_exp[x] = ml.create_tensor(
|
3395
|
+
layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff});
|
3396
|
+
layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd});
|
3397
|
+
layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
|
3308
3398
|
}
|
3309
3399
|
}
|
3310
3400
|
}
|
3311
3401
|
} break;
|
3312
3402
|
case LLM_ARCH_BAICHUAN:
|
3313
3403
|
{
|
3314
|
-
model.tok_embd = ml.create_tensor(
|
3404
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
3315
3405
|
{
|
3316
|
-
|
3317
|
-
|
3318
|
-
|
3319
|
-
if (n_gpu_layers > int(n_layer)) {
|
3320
|
-
backend_norm = llama_backend_offload;
|
3321
|
-
backend_output = llama_backend_offload_split;
|
3322
|
-
} else {
|
3323
|
-
backend_norm = GGML_BACKEND_CPU;
|
3324
|
-
backend_output = GGML_BACKEND_CPU;
|
3325
|
-
}
|
3326
|
-
|
3327
|
-
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
3328
|
-
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
3406
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
3407
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
3329
3408
|
}
|
3330
3409
|
|
3331
|
-
|
3332
|
-
|
3333
|
-
|
3334
|
-
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
3335
|
-
|
3336
|
-
const int i_gpu_start = n_layer - n_gpu_layers;
|
3337
|
-
|
3338
|
-
model.layers.resize(n_layer);
|
3339
|
-
|
3340
|
-
for (uint32_t i = 0; i < n_layer; ++i) {
|
3341
|
-
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
|
3342
|
-
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
3410
|
+
for (int i = 0; i < n_layer; ++i) {
|
3411
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
3412
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
3343
3413
|
|
3344
3414
|
auto & layer = model.layers[i];
|
3345
3415
|
|
3346
|
-
layer.attn_norm = ml.create_tensor(
|
3416
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
3347
3417
|
|
3348
|
-
layer.wq = ml.create_tensor(
|
3349
|
-
layer.wk = ml.create_tensor(
|
3350
|
-
layer.wv = ml.create_tensor(
|
3351
|
-
layer.wo = ml.create_tensor(
|
3418
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
3419
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
3420
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
3421
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
3352
3422
|
|
3353
|
-
layer.ffn_norm = ml.create_tensor(
|
3423
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
3354
3424
|
|
3355
|
-
layer.ffn_gate = ml.create_tensor(
|
3356
|
-
layer.ffn_down = ml.create_tensor(
|
3357
|
-
layer.ffn_up = ml.create_tensor(
|
3425
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
3426
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
3427
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
3358
3428
|
}
|
3359
3429
|
} break;
|
3360
3430
|
case LLM_ARCH_FALCON:
|
3361
3431
|
{
|
3362
|
-
model.tok_embd = ml.create_tensor(
|
3432
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
3363
3433
|
|
3364
3434
|
// output
|
3365
3435
|
{
|
3366
|
-
|
3367
|
-
|
3368
|
-
|
3369
|
-
if (n_gpu_layers > int(n_layer)) {
|
3370
|
-
backend_norm = llama_backend_offload;
|
3371
|
-
backend_output = llama_backend_offload_split;
|
3372
|
-
} else {
|
3373
|
-
backend_norm = GGML_BACKEND_CPU;
|
3374
|
-
backend_output = GGML_BACKEND_CPU;
|
3375
|
-
}
|
3376
|
-
|
3377
|
-
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
3378
|
-
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
3379
|
-
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
3436
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
3437
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
3438
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
3380
3439
|
}
|
3381
3440
|
|
3382
|
-
|
3383
|
-
|
3384
|
-
|
3385
|
-
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
3386
|
-
|
3387
|
-
const int i_gpu_start = n_layer - n_gpu_layers;
|
3388
|
-
|
3389
|
-
model.layers.resize(n_layer);
|
3390
|
-
|
3391
|
-
for (uint32_t i = 0; i < n_layer; ++i) {
|
3392
|
-
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
|
3393
|
-
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
3441
|
+
for (int i = 0; i < n_layer; ++i) {
|
3442
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
3443
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
3394
3444
|
|
3395
3445
|
auto & layer = model.layers[i];
|
3396
3446
|
|
3397
|
-
layer.attn_norm = ml.create_tensor(
|
3398
|
-
layer.attn_norm_b = ml.create_tensor(
|
3447
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
3448
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
3399
3449
|
|
3400
3450
|
if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
|
3401
|
-
layer.attn_norm_2 = ml.create_tensor(
|
3402
|
-
layer.attn_norm_2_b = ml.create_tensor(
|
3451
|
+
layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
|
3452
|
+
layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
|
3403
3453
|
}
|
3404
3454
|
|
3405
|
-
layer.wqkv = ml.create_tensor(
|
3406
|
-
layer.wo = ml.create_tensor(
|
3455
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
3456
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
3407
3457
|
|
3408
|
-
layer.ffn_down = ml.create_tensor(
|
3409
|
-
layer.ffn_up = ml.create_tensor(
|
3458
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
3459
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
3410
3460
|
}
|
3411
3461
|
} break;
|
3412
3462
|
case LLM_ARCH_STARCODER:
|
3413
3463
|
{
|
3414
|
-
model.tok_embd = ml.create_tensor(
|
3415
|
-
model.pos_embd = ml.create_tensor(
|
3464
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
3465
|
+
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
|
3416
3466
|
|
3417
3467
|
// output
|
3418
3468
|
{
|
3419
|
-
|
3420
|
-
|
3421
|
-
|
3422
|
-
if (n_gpu_layers > int(n_layer)) {
|
3423
|
-
backend_norm = llama_backend_offload;
|
3424
|
-
backend_output = llama_backend_offload_split;
|
3425
|
-
} else {
|
3426
|
-
backend_norm = GGML_BACKEND_CPU;
|
3427
|
-
backend_output = GGML_BACKEND_CPU;
|
3428
|
-
}
|
3429
|
-
|
3430
|
-
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
3431
|
-
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
3432
|
-
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
3469
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
3470
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
3471
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
3433
3472
|
}
|
3434
3473
|
|
3435
|
-
|
3436
|
-
|
3437
|
-
|
3438
|
-
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
3439
|
-
|
3440
|
-
const int i_gpu_start = n_layer - n_gpu_layers;
|
3441
|
-
|
3442
|
-
model.layers.resize(n_layer);
|
3443
|
-
|
3444
|
-
for (uint32_t i = 0; i < n_layer; ++i) {
|
3445
|
-
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
|
3446
|
-
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
3474
|
+
for (int i = 0; i < n_layer; ++i) {
|
3475
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
3476
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
3447
3477
|
|
3448
3478
|
auto & layer = model.layers[i];
|
3449
3479
|
|
3450
|
-
layer.attn_norm = ml.create_tensor(
|
3451
|
-
layer.attn_norm_b = ml.create_tensor(
|
3480
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
3481
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
3452
3482
|
|
3453
|
-
layer.wqkv = ml.create_tensor(
|
3454
|
-
-layer.bqkv = ml.create_tensor(
+layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});

-layer.wo = ml.create_tensor(
-layer.bo = ml.create_tensor(
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});

-layer.ffn_norm = ml.create_tensor(
-layer.ffn_norm_b = ml.create_tensor(
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});

-layer.ffn_down = ml.create_tensor(
-layer.ffn_down_b = ml.create_tensor(
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});

-layer.ffn_up
-layer.ffn_up_b
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
}
} break;
case LLM_ARCH_PERSIMMON:
{
-model.tok_embd = ml.create_tensor(
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

{
-
-
-
-if (n_gpu_layers > int(n_layer)) {
-backend_norm = llama_backend_offload;
-backend_output = llama_backend_offload_split;
-} else {
-backend_norm = GGML_BACKEND_CPU;
-backend_output = GGML_BACKEND_CPU;
-}
-
-model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
-model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
-model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}

-
-
-
-GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);

-const int i_gpu_start = n_layer - n_gpu_layers;
-model.layers.resize(n_layer);
-for (uint32_t i = 0; i < n_layer; ++i) {
-const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
-const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
auto & layer = model.layers[i];
-
-layer.
-layer.
-
-layer.
-layer.
-
-layer.
-layer.
-
-layer.
-layer.
-
-layer.
-layer.
-
+
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
+
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
+layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
+
+layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
+layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
}
} break;
case LLM_ARCH_BLOOM:
{
-model.tok_embd = ml.create_tensor(
-model.tok_norm = ml.create_tensor(
-model.tok_norm_b = ml.create_tensor(
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
+model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});

// output
{
-
-
-
-if (n_gpu_layers > int(n_layer)) {
-backend_norm = llama_backend_offload;
-backend_output = llama_backend_offload_split;
-} else {
-backend_norm = GGML_BACKEND_CPU;
-backend_output = GGML_BACKEND_CPU;
-}
-
-model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
-model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
-model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}

-
-
-
-GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
-const int i_gpu_start = n_layer - n_gpu_layers;
-
-model.layers.resize(n_layer);
-
-for (uint32_t i = 0; i < n_layer; ++i) {
-const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
-const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);

auto & layer = model.layers[i];

-layer.attn_norm = ml.create_tensor(
-layer.attn_norm_b = ml.create_tensor(
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

-layer.wqkv = ml.create_tensor(
-layer.bqkv = ml.create_tensor(
+layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});

-layer.wo = ml.create_tensor(
-layer.bo = ml.create_tensor(
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});

-layer.ffn_norm = ml.create_tensor(
-layer.ffn_norm_b = ml.create_tensor(
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});

-layer.ffn_down = ml.create_tensor(
-layer.ffn_down_b = ml.create_tensor(
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});

-layer.ffn_up
-layer.ffn_up_b
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
}
} break;
case LLM_ARCH_MPT:
{
-model.tok_embd = ml.create_tensor(
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
// output
{
-
-
-
-if (n_gpu_layers > int(n_layer)) {
-backend_norm = llama_backend_offload;
-backend_output = llama_backend_offload_split;
-} else {
-backend_norm = GGML_BACKEND_CPU;
-backend_output = GGML_BACKEND_CPU;
-}
-
-model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
-model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}

-
-
-
-GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
-const int i_gpu_start = n_layer - n_gpu_layers;
-
-model.layers.resize(n_layer);
-
-for (uint32_t i = 0; i < n_layer; ++i) {
-const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
-const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);

auto & layer = model.layers[i];

-layer.attn_norm = ml.create_tensor(
-layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
-layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

-layer.
+layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

-layer.
-layer.
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});

// AWQ ScaleActivation layer
-layer.ffn_act = ml.create_tensor(
+layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
}
} break;
case LLM_ARCH_STABLELM:
{
-model.tok_embd = ml.create_tensor(
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

// output
{
-
-
-
-if (n_gpu_layers > int(n_layer)) {
-backend_norm = llama_backend_offload;
-backend_output = llama_backend_offload_split;
-} else {
-backend_norm = GGML_BACKEND_CPU;
-backend_output = GGML_BACKEND_CPU;
-}
-
-model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
-model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
-model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}

-
-
-
-GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
-const int i_gpu_start = n_layer - n_gpu_layers;
-
-model.layers.resize(n_layer);
-
-for (uint32_t i = 0; i < n_layer; ++i) {
-/*
-llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
-*/
-const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
-const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);

auto & layer = model.layers[i];

-layer.attn_norm =
-layer.attn_norm_b = ml.create_tensor(
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

-layer.wq = ml.create_tensor(
-layer.wk = ml.create_tensor(
-layer.wv = ml.create_tensor(
-layer.wo = ml.create_tensor(
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

-layer.ffn_norm
-layer.ffn_norm_b = ml.create_tensor(
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});

-layer.ffn_gate = ml.create_tensor(
-layer.ffn_down = ml.create_tensor(
-layer.ffn_up
+layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
}
} break;
case LLM_ARCH_QWEN:
{
-model.tok_embd = ml.create_tensor(
-{
-ggml_backend_type backend_norm;
-ggml_backend_type backend_output;
-
-if (n_gpu_layers > int(n_layer)) {
-backend_norm = llama_backend_offload;
-backend_output = llama_backend_offload_split;
-} else {
-backend_norm = GGML_BACKEND_CPU;
-backend_output = GGML_BACKEND_CPU;
-}
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

-
-
-
-
-
-
-const int i_gpu_start = n_layer - n_gpu_layers;
-
-model.layers.resize(n_layer);
+// output
+{
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+}

-for (
-
-
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);

auto & layer = model.layers[i];

-layer.attn_norm = ml.create_tensor(
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

-layer.wqkv = ml.create_tensor(
-layer.bqkv = ml.create_tensor(
-layer.wo = ml.create_tensor(
+layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3});
+layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

-layer.ffn_norm = ml.create_tensor(
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});

-layer.ffn_gate = ml.create_tensor(
-layer.ffn_down = ml.create_tensor(
-layer.ffn_up = ml.create_tensor(
+layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2});
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd});
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2});
}
} break;
case LLM_ARCH_PHI2:
{
-model.tok_embd = ml.create_tensor(
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

// output
{
-
-
-
-
-backend_norm = llama_backend_offload;
-backend_output = llama_backend_offload;
-} else {
-backend_norm = GGML_BACKEND_CPU;
-backend_output = GGML_BACKEND_CPU;
-}
-
-model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
-model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
-model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
-model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+model.output_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab});
}

-
-
-
-GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);

-
+auto & layer = model.layers[i];

-
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

-
-
-const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
+layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);

-
+if (layer.wqkv == nullptr) {
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});

-
-
+layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});

-
-
+layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
+}

-layer.wo = ml.create_tensor(
-layer.bo = ml.create_tensor(
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});

-layer.ffn_down = ml.create_tensor(
-layer.ffn_down_b = ml.create_tensor(
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});

-layer.ffn_up
-layer.ffn_up_b
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
}
} break;
case LLM_ARCH_PLAMO:
{
-model.tok_embd = ml.create_tensor(
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});

// output
{
-
-
-
-if (n_gpu_layers > int(n_layer)) {
-backend_norm = llama_backend_offload;
-backend_output = llama_backend_offload_split;
-} else {
-backend_norm = GGML_BACKEND_CPU;
-backend_output = GGML_BACKEND_CPU;
-}
-
-model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
-model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}

-
-
-
-GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
-const int i_gpu_start = n_layer - n_gpu_layers;
-
-model.layers.resize(n_layer);
-
-for (uint32_t i = 0; i < n_layer; ++i) {
-const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
-const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);

auto & layer = model.layers[i];

-layer.attn_norm = ml.create_tensor(
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});

-layer.wq = ml.create_tensor(
-layer.wk = ml.create_tensor(
-layer.wv = ml.create_tensor(
-layer.wo = ml.create_tensor(
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});

-layer.ffn_gate = ml.create_tensor(
-layer.ffn_down = ml.create_tensor(
-layer.ffn_up = ml.create_tensor(
+layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
}
} break;
case LLM_ARCH_GPT2:
{
-model.tok_embd = ml.create_tensor(
-model.pos_embd = ml.create_tensor(
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});

// output
{
-
-
-
-if (n_gpu_layers > int(n_layer)) {
-backend_norm = llama_backend_offload;
-backend_output = llama_backend_offload_split;
-} else {
-backend_norm = GGML_BACKEND_CPU;
-backend_output = GGML_BACKEND_CPU;
-}
-
-model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
-model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
-model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}

-
-
-
-GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
-const int i_gpu_start = n_layer - n_gpu_layers;
-
-model.layers.resize(n_layer);
-
-for (uint32_t i = 0; i < n_layer; ++i) {
-const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
-const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);

auto & layer = model.layers[i];

-layer.attn_norm = ml.create_tensor(
-layer.attn_norm_b = ml.create_tensor(
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});

-layer.wqkv = ml.create_tensor(
-layer.bqkv = ml.create_tensor(
+layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});

-layer.wo = ml.create_tensor(
-layer.bo = ml.create_tensor(
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});

-layer.ffn_norm = ml.create_tensor(
-layer.ffn_norm_b = ml.create_tensor(
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});

-layer.ffn_down = ml.create_tensor(
-layer.ffn_down_b = ml.create_tensor(
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});

-layer.ffn_up
-layer.ffn_up_b
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
}
} break;
default:
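Every architecture case above follows the same replacement pattern: the old per-tensor ggml_backend_type tags (backend, backend_split, backend_norm, backend_output) disappear, and each tensor is instead created inside a ggml_context chosen for it: ctx_input/ctx_output for the embedding and output tensors, ctx_layer for per-layer tensors that stay with their layer, and ctx_split for the large matrices that may be row-split across devices. The helpers ctx_for_layer and ctx_for_layer_split are not shown in this excerpt; the sketch below is an assumption about how such a lookup could be organized (one metadata-only context per backend buffer type), not the gem's verbatim code.

    // hypothetical sketch: one no_alloc ggml_context per backend buffer type
    #include <map>
    #include "ggml.h"
    #include "ggml-backend.h"

    static ggml_context * ctx_for_buft(std::map<ggml_backend_buffer_type_t, ggml_context *> & ctx_map,
                                       ggml_backend_buffer_type_t buft, size_t n_tensors) {
        auto it = ctx_map.find(buft);
        if (it != ctx_map.end()) {
            return it->second;
        }
        // no_alloc = true: only tensor metadata lives here; the data buffers are
        // allocated later, one ggml_backend_buffer_t per buffer type
        struct ggml_init_params params = {
            /*.mem_size   =*/ n_tensors * ggml_tensor_overhead(),
            /*.mem_buffer =*/ nullptr,
            /*.no_alloc   =*/ true,
        };
        ggml_context * ctx = ggml_init(params);
        ctx_map[buft] = ctx;
        return ctx;
    }

With that in place, ctx_for_layer(i) would simply resolve the buffer type assigned to layer i (CPU, a GPU, or a split buffer, depending on n_gpu_layers and split_mode) and return the matching context.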
@@ -3888,78 +3783,51 @@ static bool llm_load_tensors(

ml.done_getting_tensors();

-ml.init_mapping();
+ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);

-//
-
-size_t buf_size = 0;
+// create the backend buffers
+std::vector<std::pair<ggml_context *, ggml_backend_buffer_t>> ctx_bufs;

-
+for (auto & it : ctx_map) {
+ggml_backend_buffer_type_t buft = it.first;
+ggml_context * ctx = it.second;
+ggml_backend_buffer_t buf = nullptr;

-
-//
-
-
-
-
+// only the mmap region containing the tensors in the model is mapped to the backend buffer
+// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
+// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
+if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
+size_t first, last;
+ml.get_mapping_range(&first, &last, ctx);
+buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
}
-}
-
-// create backend buffer
-ggml_backend_buffer_t buf_mmap = nullptr;
-
#ifdef GGML_USE_METAL
-
-if (ml.use_mmap) {
+else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
const size_t max_size = ggml_get_max_tensor_size(ctx);
-
-
-
-model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
+size_t first, last;
+ml.get_mapping_range(&first, &last, ctx);
+buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size);
}
-}
-#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
-// for testing only
-if (n_gpu_layers > 0) {
-model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0));
-}
#endif
-
-
-
-
-
-buf_mmap = model.buf;
-} else {
-// allocate only CPU tensors
-model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
-ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf);
-for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
-if (t->backend == GGML_BACKEND_CPU) {
-ggml_tallocr_alloc(alloc, t);
-}
+else {
+buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
+model.mlock_buf.init (ggml_backend_buffer_get_base(buf));
+model.mlock_buf.grow_to(ggml_backend_buffer_get_size(buf));
}
-ggml_tallocr_free(alloc);
}
-
-
-
-
-
+if (buf == nullptr) {
+throw std::runtime_error("failed to allocate buffer");
+}
+// indicate that this buffer contains weights
+// this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
+ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+model.bufs.push_back(buf);
+ctx_bufs.emplace_back(ctx, buf);
}

// print memory requirements
{
-size_t sys_mem_required = ctx_size + buf_size;
-
-if (sys_mem_required > 0) {
-LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0);
-}
-if (vram_weights > 0) {
-LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
-}
-
-#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));

LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -3971,23 +3839,26 @@ static bool llm_load_tensors(
const int max_offloadable_layers = hparams.n_layer + 1;

LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
-}

-
-
-
-
-#endif // GGML_USE_CUBLAS
+for (ggml_backend_buffer_t buf : model.bufs) {
+LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
+}
+}

// populate tensors_by_name
-for (
-
-
+for (ggml_context * ctx : model.ctxs) {
+for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
+}
}

-
-
+// load tensor data
+for (auto & it : ctx_bufs) {
+ggml_context * ctx = it.first;
+ggml_backend_buffer_t buf = it.second;
+if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) {
+return false;
+}
}

model.mapping = std::move(ml.mapping);
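The net effect of the two hunks above: instead of one monolithic model.buf, llm_load_tensors now ends with one backend buffer per (buffer type, context) pair collected in ctx_bufs, and the tensor data is streamed in per context via ml.load_all_data. For mmap-able CPU and Metal buffer types the file mapping itself is wrapped into a buffer (ggml_backend_cpu_buffer_from_ptr / ggml_backend_metal_buffer_from_ptr); everything else goes through the generic path, reduced below to a free-standing sketch that uses only calls appearing in this diff (the function name is illustrative, not from the gem).

    #include <stdexcept>
    #include "ggml.h"
    #include "ggml-backend.h"

    // sketch of the non-mmap branch: allocate every tensor of `ctx` in a buffer of type `buft`
    // and mark the buffer as weights so ggml_backend_sched schedules ops next to their weights
    static ggml_backend_buffer_t alloc_weights_buffer(ggml_context * ctx, ggml_backend_buffer_type_t buft) {
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (buf == nullptr) {
            throw std::runtime_error("failed to allocate buffer");
        }
        ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
        return buf;
    }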
@@ -4021,13 +3892,13 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
}

if (!llm_load_tensors(
-ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
+ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
params.progress_callback, params.progress_callback_user_data
)) {
return -2;
}
} catch (const std::exception & err) {
-LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
+LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
return -1;
}

@@ -4099,7 +3970,6 @@ static void llm_build_k_shift(
struct ggml_cgraph * graph,
llm_rope_type type,
int64_t n_ctx,
-int n_rot,
float freq_base,
float freq_scale,
const llm_build_cb & cb) {
@@ -4107,14 +3977,13 @@ static void llm_build_k_shift(
const int64_t n_head_kv = hparams.n_head_kv;
const int64_t n_embd_head_k = hparams.n_embd_head_k;
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
+const int32_t n_rot = hparams.n_rot;
const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
const float ext_factor = cparams.yarn_ext_factor;
const float attn_factor = cparams.yarn_attn_factor;
const float beta_fast = cparams.yarn_beta_fast;
const float beta_slow = cparams.yarn_beta_slow;

-GGML_ASSERT(n_embd_head_k % n_rot == 0);
-
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
cb(K_shift, "K_shift", -1);

@@ -4473,8 +4342,6 @@ struct llm_build_context {
do_rope_shift (worst_case || kv_self.has_shift),
cb (cb),
buf_compute_meta (lctx.buf_compute_meta) {
-GGML_ASSERT(!!kv_self.ctx);
-
// all initializations should be done in init()
}

@@ -4518,7 +4385,7 @@ struct llm_build_context {

// shift the entire K-cache if needed
if (do_rope_shift) {
-llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx,
+llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
}

for (int il = 0; il < n_layer; ++il) {
@@ -4554,16 +4421,22 @@ struct llm_build_context {
cb(Vcur, "Vcur", il);
}

+// these nodes are added to the graph together so that they are not reordered
+// by doing so, the number of splits in the graph is reduced
+ggml_build_forward_expand(gf, Qcur);
+ggml_build_forward_expand(gf, Kcur);
+ggml_build_forward_expand(gf, Vcur);
+
Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-
+hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);

Kcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-
+hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
@@ -4686,6 +4559,7 @@ struct llm_build_context {

const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+GGML_ASSERT(n_embd_head == hparams.n_rot);

struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -4703,7 +4577,7 @@ struct llm_build_context {

// shift the entire K-cache if needed
if (do_rope_shift) {
-llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx,
+llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
}

for (int il = 0; il < n_layer; ++il) {
@@ -4729,12 +4603,12 @@ struct llm_build_context {
case MODEL_7B:
Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
-
+hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
Kcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
-
+hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow
);
break;
@@ -4807,6 +4681,7 @@ struct llm_build_context {
const int64_t n_embd_head = hparams.n_embd_head_v;
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+GGML_ASSERT(n_embd_head == hparams.n_rot);

struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -4824,7 +4699,7 @@ struct llm_build_context {

// shift the entire K-cache if needed
if (do_rope_shift) {
-llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx,
+llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
}

for (int il = 0; il < n_layer; ++il) {
@@ -4865,13 +4740,13 @@ struct llm_build_context {

// using mode = 2 for neox mode
Qcur = ggml_rope_custom(
-ctx0, Qcur, inp_pos,
+ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);

Kcur = ggml_rope_custom(
-ctx0, Kcur, inp_pos,
+ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
@@ -5028,15 +4903,14 @@ struct llm_build_context {
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);

const int64_t n_embd_head = hparams.n_embd_head_v;
-GGML_ASSERT(n_embd_head
-
-const int64_t n_rot = n_embd_head_k / 2;
+GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+GGML_ASSERT(n_embd_head/2 == hparams.n_rot);

struct ggml_tensor * cur;
struct ggml_tensor * inpL;

inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
-cb(inpL, "
+cb(inpL, "inp_embd", -1);

// inp_pos - contains the positions
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
@@ -5047,7 +4921,7 @@ struct llm_build_context {
cb(KQ_mask, "KQ_mask", -1);

if (do_rope_shift) {
-llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx,
+llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
}

for (int il = 0; il < n_layer; ++il) {
@@ -5107,7 +4981,7 @@ struct llm_build_context {

// RoPE the first n_rot of q/k, pass the other half, and concat.
struct ggml_tensor * qrot = ggml_view_3d(
-ctx0, tmpq, n_rot, n_head, n_tokens,
+ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
ggml_element_size(tmpq) * n_embd_head,
ggml_element_size(tmpq) * n_embd_head * n_head,
0
@@ -5115,7 +4989,7 @@ struct llm_build_context {
cb(qrot, "qrot", il);

struct ggml_tensor * krot = ggml_view_3d(
-ctx0, tmpk, n_rot, n_head, n_tokens,
+ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
ggml_element_size(tmpk) * n_embd_head,
ggml_element_size(tmpk) * n_embd_head * n_head,
0
@@ -5124,29 +4998,29 @@ struct llm_build_context {

// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
struct ggml_tensor * qpass = ggml_view_3d(
-ctx0, tmpq, n_rot, n_head, n_tokens,
+ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
ggml_element_size(tmpq) * n_embd_head,
ggml_element_size(tmpq) * n_embd_head * n_head,
-ggml_element_size(tmpq) * n_rot
+ggml_element_size(tmpq) * hparams.n_rot
);
cb(qpass, "qpass", il);

struct ggml_tensor * kpass = ggml_view_3d(
-ctx0, tmpk, n_rot, n_head, n_tokens,
+ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
ggml_element_size(tmpk) * n_embd_head,
ggml_element_size(tmpk) * n_embd_head * n_head,
-ggml_element_size(tmpk) * n_rot
+ggml_element_size(tmpk) * hparams.n_rot
);
cb(kpass, "kpass", il);

struct ggml_tensor * qrotated = ggml_rope_custom(
-ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx,
+ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
);
cb(qrotated, "qrotated", il);

struct ggml_tensor * krotated = ggml_rope_custom(
-ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx,
+ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
);
cb(krotated, "krotated", il);
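In this partial-rotary builder the locally derived n_rot (previously n_embd_head_k / 2) is dropped in favour of hparams.n_rot, guarded by the new assert that it equals half the head size. The view offsets used for the rotate/pass-through split are unchanged; purely for illustration, the arithmetic pulled out into a self-contained helper (the names are mine, not the library's):

    #include <cstddef>
    #include <cstdint>

    // Offsets into a [n_embd_head, n_head, n_tokens] activation, matching the views above:
    // the first n_rot values of every head are rotated, the remainder pass through unchanged.
    struct rot_pass_offsets {
        size_t nb1;      // byte stride from one head to the next
        size_t nb2;      // byte stride from one token to the next
        size_t off_pass; // byte offset of the pass-through half inside a head
    };

    static rot_pass_offsets make_offsets(int64_t n_embd_head, int64_t n_head, int64_t n_rot, size_t elem_size) {
        return {
            elem_size * (size_t) n_embd_head,
            elem_size * (size_t) n_embd_head * (size_t) n_head,
            elem_size * (size_t) n_rot, // the rotated slice itself starts at offset 0
        };
    }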
@@ -5543,7 +5417,7 @@ struct llm_build_context {

// shift the entire K-cache if needed
if (do_rope_shift) {
-llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx,
+llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
}

for (int il = 0; il < n_layer; ++il) {
@@ -5656,7 +5530,7 @@ struct llm_build_context {

// shift the entire K-cache if needed
if (do_rope_shift) {
-llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx,
+llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
}

for (int il = 0; il < n_layer; ++il) {
@@ -5688,13 +5562,13 @@ struct llm_build_context {

// using mode = 2 for neox mode
Qcur = ggml_rope_custom(
-ctx0, Qcur, inp_pos,
+ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Qcur, "Qcur", il);

Kcur = ggml_rope_custom(
-ctx0, Kcur, inp_pos,
+ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
);
cb(Kcur, "Kcur", il);
@@ -5773,7 +5647,7 @@ struct llm_build_context {

// shift the entire K-cache if needed
if (do_rope_shift) {
-llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx,
+llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
}

for (int il = 0; il < n_layer; ++il) {
@@ -5785,15 +5659,25 @@ struct llm_build_context {

// self-attention
{
-
-
+struct ggml_tensor * Qcur = nullptr;
+struct ggml_tensor * Kcur = nullptr;
+struct ggml_tensor * Vcur = nullptr;

-
-
+if (model.layers[il].wqkv) {
+cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
+cb(cur, "wqkv", il);

-
-
-
+cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
+cb(cur, "bqkv", il);
+
+Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
+Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
+Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
+} else {
+Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
+Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
+Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
+}

cb(Qcur, "Qcur", il);
cb(Kcur, "Kcur", il);
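The tensor-creation hunk for this architecture earlier in the diff marks wqkv/bqkv as optional (required = false), and the builder above now handles both checkpoint layouts: a fused QKV projection sliced into Q, K and V with ggml_view_2d, or separate wq/wk/wv projections with their biases. In the fused case the float offsets of the three slices within one row are simply 0, n_embd and n_embd + n_embd_gqa. A tiny standalone illustration of that layout (the sizes are invented for the example, not read from any model file):

    #include <cstdint>
    #include <cstdio>

    int main() {
        // one row of the fused QKV output: [ Q (n_embd) | K (n_embd_gqa) | V (n_embd_gqa) ]
        const int64_t n_embd     = 2560;
        const int64_t n_embd_gqa = 2560;  // equals n_embd when there is no grouped-query attention
        const size_t  q_off = 0;
        const size_t  k_off = sizeof(float) * (size_t) n_embd;
        const size_t  v_off = sizeof(float) * (size_t) (n_embd + n_embd_gqa);
        std::printf("Q at byte %zu, K at byte %zu, V at byte %zu per row\n", q_off, k_off, v_off);
        return 0;
    }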
@@ -5869,6 +5753,7 @@ struct llm_build_context {

const int64_t n_embd_head = hparams.n_embd_head_v;
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
+GGML_ASSERT(n_embd_head == hparams.n_rot);

struct ggml_tensor * cur;
struct ggml_tensor * inpL;
@@ -5886,7 +5771,7 @@ struct llm_build_context {

// shift the entire K-cache if needed
if (do_rope_shift) {
-llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx,
+llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
}

for (int il = 0; il < n_layer; ++il) {
@@ -5912,13 +5797,13 @@ struct llm_build_context {
cb(Vcur, "Vcur", il);

Qcur = ggml_rope_custom(
-ctx0, ggml_reshape_3d(ctx0, Qcur,
+ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos,
n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Qcur, "Qcur", il);

Kcur = ggml_rope_custom(
-ctx0, ggml_reshape_3d(ctx0, Kcur,
+ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos,
n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
ext_factor, attn_factor, beta_fast, beta_slow);
cb(Kcur, "Kcur", il);
@@ -6072,199 +5957,13 @@ struct llm_build_context {
}
};

-//
-// tensor offloading helpers
-//
-// TODO: will be removed with backend v2
-
-enum llm_offload_func_e {
-OFFLOAD_FUNC_NOP,
-OFFLOAD_FUNC,
-OFFLOAD_FUNC_FRC, // force offload
-OFFLOAD_FUNC_KQV,
-OFFLOAD_FUNC_NR,
-OFFLOAD_FUNC_EMB, // embeddings
-OFFLOAD_FUNC_OUT,
-};
-
-// TODO: will be removed with backend v2
-struct llm_offload_trie {
-struct node {
-~node() {
-for (int i = 0; i < 256; ++i) {
-if (children[i]) {
-delete children[i];
-}
-}
-}
-
-node * children[256] = { nullptr };
-llm_offload_func_e func = OFFLOAD_FUNC_NOP;
-};
-
-llm_offload_trie() {
-root = new node;
-}
-
-llm_offload_trie(const std::unordered_map<const char *, llm_offload_func_e> & map) {
-root = new node;
-
-for (const auto & kv : map) {
-add(kv.first, kv.second);
-}
-}
-
-~llm_offload_trie() {
-delete root;
-}
-
-void add(const char * name, llm_offload_func_e func) {
-node * cur = root;
-
-for (int i = 0; ; ++i) {
-const uint8_t c = name[i];
-
-if (!c) {
-break;
-}
-
-if (!cur->children[c]) {
-cur->children[c] = new node;
-}
-
-cur = cur->children[c];
-}
-
-cur->func = func;
-}
-
-llm_offload_func_e find(const char * name) const {
-const node * cur = root;
-
-for (int i = 0; ; ++i) {
-const uint8_t c = name[i];
-
-if (!c) {
-break;
-}
-
-if (!cur->children[c]) {
-return OFFLOAD_FUNC_NOP;
-}
-
-cur = cur->children[c];
-}
-
-return cur->func;
-}
-
-node * root = nullptr;
-};
-
-// TODO: will be removed with backend v2
-static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map = {
-//{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
-//{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
-{ "pos_embd", OFFLOAD_FUNC_NR },
-
-{ "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
-{ "KQ_mask", OFFLOAD_FUNC_FRC },
-{ "K_shift", OFFLOAD_FUNC_FRC },
-
-{ "K_shifted", OFFLOAD_FUNC },
-
-{ "inp_norm", OFFLOAD_FUNC_NR },
-{ "inp_norm_w", OFFLOAD_FUNC_NR },
-{ "inp_norm_wb", OFFLOAD_FUNC_NR },
-
-{ "norm", OFFLOAD_FUNC },
-{ "norm_w", OFFLOAD_FUNC },
-{ "norm_wb", OFFLOAD_FUNC },
-
-{ "attn_norm", OFFLOAD_FUNC },
-{ "attn_norm_2", OFFLOAD_FUNC },
-
-{ "wqkv", OFFLOAD_FUNC_KQV },
-{ "bqkv", OFFLOAD_FUNC_KQV },
-{ "wqkv_clamped", OFFLOAD_FUNC_KQV },
-
-{ "tmpk", OFFLOAD_FUNC_KQV },
-{ "tmpq", OFFLOAD_FUNC_KQV },
-{ "tmpv", OFFLOAD_FUNC_KQV },
-{ "Kcur", OFFLOAD_FUNC_KQV },
-{ "Qcur", OFFLOAD_FUNC_KQV },
-{ "Vcur", OFFLOAD_FUNC_KQV },
-
-{ "krot", OFFLOAD_FUNC_KQV },
-{ "qrot", OFFLOAD_FUNC_KQV },
-{ "kpass", OFFLOAD_FUNC_KQV },
-{ "qpass", OFFLOAD_FUNC_KQV },
-{ "krotated", OFFLOAD_FUNC_KQV },
-{ "qrotated", OFFLOAD_FUNC_KQV },
-
-{ "q", OFFLOAD_FUNC_KQV },
-{ "k", OFFLOAD_FUNC_KQV },
-{ "kq", OFFLOAD_FUNC_KQV },
-{ "kq_scaled", OFFLOAD_FUNC_KQV },
-{ "kq_scaled_alibi", OFFLOAD_FUNC_KQV },
-{ "kq_masked", OFFLOAD_FUNC_KQV },
-{ "kq_soft_max", OFFLOAD_FUNC_KQV },
-{ "kq_soft_max_ext", OFFLOAD_FUNC_KQV },
-{ "v", OFFLOAD_FUNC_KQV },
-{ "kqv", OFFLOAD_FUNC_KQV },
-{ "kqv_merged", OFFLOAD_FUNC_KQV },
-{ "kqv_merged_cont", OFFLOAD_FUNC_KQV },
-{ "kqv_wo", OFFLOAD_FUNC_KQV },
-{ "kqv_out", OFFLOAD_FUNC_KQV },
-
-{ "ffn_inp", OFFLOAD_FUNC },
-{ "ffn_norm", OFFLOAD_FUNC },
-
-{ "ffn_up", OFFLOAD_FUNC },
-{ "ffn_up_b", OFFLOAD_FUNC },
-{ "ffn_gate", OFFLOAD_FUNC },
-{ "ffn_gate_b", OFFLOAD_FUNC },
-{ "ffn_gate_par", OFFLOAD_FUNC },
-{ "ffn_act", OFFLOAD_FUNC },
-{ "ffn_down", OFFLOAD_FUNC },
-{ "ffn_down_b", OFFLOAD_FUNC },
-{ "ffn_out", OFFLOAD_FUNC },
-
-{ "ffn_silu", OFFLOAD_FUNC },
-{ "ffn_gelu", OFFLOAD_FUNC },
-{ "ffn_relu", OFFLOAD_FUNC },
-{ "ffn_sqr(relu)", OFFLOAD_FUNC },
-
-{ "ffn_moe_logits", OFFLOAD_FUNC },
-{ "ffn_moe_probs", OFFLOAD_FUNC },
-{ "ffn_moe_argsort", OFFLOAD_FUNC },
-{ "ffn_moe_weights", OFFLOAD_FUNC },
-{ "ffn_moe_weights_sum", OFFLOAD_FUNC },
-{ "ffn_moe_weights_norm", OFFLOAD_FUNC },
-{ "ffn_moe_weighted", OFFLOAD_FUNC },
-{ "ffn_moe_up", OFFLOAD_FUNC },
-{ "ffn_moe_gate", OFFLOAD_FUNC },
-{ "ffn_moe_silu", OFFLOAD_FUNC },
-{ "ffn_moe_gate_par", OFFLOAD_FUNC },
-{ "ffn_moe_down", OFFLOAD_FUNC },
-{ "ffn_moe_out", OFFLOAD_FUNC },
-
-{ "l_out", OFFLOAD_FUNC },
-
-{ "result_norm", OFFLOAD_FUNC_EMB },
-{ "result_output_no_bias", OFFLOAD_FUNC_EMB },
-{ "result_output", OFFLOAD_FUNC_OUT },
-};
-
-static llm_offload_trie k_offload_func_trie(k_offload_map);
-
static struct ggml_cgraph * llama_build_graph(
llama_context & lctx,
const llama_batch & batch) {
const auto & model = lctx.model;

// check if we should build the worst-case graph (for memory measurement)
-const bool worst_case =
+const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);

// keep track of the input that has already been allocated
bool alloc_inp_tokens = false;
@@ -6273,16 +5972,8 @@ static struct ggml_cgraph * llama_build_graph(
bool alloc_inp_KQ_mask = false;
bool alloc_inp_K_shift = false;

-#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
-const bool do_offload = true;
-#else
-const bool do_offload = true; // TODO: set to false after finishing refactoring
-#endif
-
-int n_non_view = 0; // number of non-view tensors that have been processed by the callback
-
// this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
-// TODO:
+// TODO: improve handling of input and output tensors, then replace this with ggml_set_name
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
if (il >= 0) {
ggml_format_name(cur, "%s-%d", name, il);
@@ -6293,12 +5984,11 @@ static struct ggml_cgraph * llama_build_graph(
//
// allocate input tensors and set input data
//
-// TODO: will be removed with backend v2

if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) {
-
+ggml_tallocr_alloc(lctx.alloc, cur);

-if (!
+if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) {
const int64_t n_tokens = cur->ne[0];

ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
@@ -6307,10 +5997,10 @@ static struct ggml_cgraph * llama_build_graph(
alloc_inp_tokens = true;
}

-if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) {
-
+if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) {
+ggml_tallocr_alloc(lctx.alloc, cur);

-if (!
+if (!ggml_tallocr_is_measure(lctx.alloc) && batch.embd) {
const int64_t n_embd = cur->ne[0];
const int64_t n_tokens = cur->ne[1];

@@ -6321,9 +6011,9 @@ static struct ggml_cgraph * llama_build_graph(
}

if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) {
-
+ggml_tallocr_alloc(lctx.alloc, cur);

-if (!
+if (!ggml_tallocr_is_measure(lctx.alloc) && batch.pos) {
const int64_t n_tokens = cur->ne[0];

static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
@@ -6334,9 +6024,9 @@ static struct ggml_cgraph * llama_build_graph(
}

if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
-
+ggml_tallocr_alloc(lctx.alloc, cur);

-if (!
+if (!ggml_tallocr_is_measure(lctx.alloc)) {
const int64_t n_kv = cur->ne[0];
const int64_t n_tokens = cur->ne[1];

@@ -6369,165 +6059,35 @@ static struct ggml_cgraph * llama_build_graph(
|
|
6369
6059
|
ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
|
6370
6060
|
}
|
6371
6061
|
}
|
6372
|
-
|
6373
|
-
alloc_inp_KQ_mask = true;
|
6374
|
-
}
|
6375
|
-
|
6376
|
-
if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) {
|
6377
|
-
ggml_allocr_alloc(lctx.alloc, cur);
|
6378
|
-
|
6379
|
-
if (!ggml_allocr_is_measure(lctx.alloc)) {
|
6380
|
-
const int64_t n_ctx = cur->ne[0];
|
6381
|
-
|
6382
|
-
int32_t * data;
|
6383
|
-
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
6384
|
-
data = (int32_t *) cur->data;
|
6385
|
-
} else {
|
6386
|
-
lctx.buf_copy.resize(ggml_nbytes(cur));
|
6387
|
-
data = (int32_t *) lctx.buf_copy.data();
|
6388
|
-
}
|
6389
|
-
|
6390
|
-
for (int i = 0; i < n_ctx; ++i) {
|
6391
|
-
data[i] = lctx.kv_self.cells[i].delta;
|
6392
|
-
}
|
6393
|
-
|
6394
|
-
if (data != cur->data) {
|
6395
|
-
ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
|
6396
|
-
}
|
6397
|
-
}
|
6398
|
-
|
6399
|
-
alloc_inp_K_shift = true;
|
6400
|
-
}
|
6401
|
-
|
6402
|
-
// view tensors are not processed further
|
6403
|
-
if (cur->view_src != nullptr) {
|
6404
|
-
return;
|
6405
|
-
}
|
6406
|
-
|
6407
|
-
if (cur->op != GGML_OP_NONE) {
|
6408
|
-
n_non_view++;
|
6409
|
-
}
|
6410
|
-
|
6411
|
-
//
|
6412
|
-
// offload layers
|
6413
|
-
//
|
6414
|
-
// TODO: will be removed with backend v2
|
6415
|
-
|
6416
|
-
//#define LLAMA_OFFLOAD_DEBUG
|
6417
|
-
|
6418
|
-
if (!do_offload) {
|
6419
|
-
return;
|
6420
|
-
}
|
6421
|
-
|
6422
|
-
const int n_layer = model.hparams.n_layer;
|
6423
|
-
|
6424
|
-
const int n_gpu_layers = model.n_gpu_layers;
|
6425
|
-
const int i_gpu_start = n_layer - n_gpu_layers;
|
6426
|
-
|
6427
|
-
// should we offload the final norm? yes if we are not computing embeddings
|
6428
|
-
const bool offload_emb = lctx.embedding.empty();
|
6429
|
-
|
6430
|
-
static const std::unordered_map<llm_offload_func_e, std::string, std::hash<int>> k_offload_func_name = {
|
6431
|
-
{ OFFLOAD_FUNC_NOP, "CPU" },
|
6432
|
-
{ OFFLOAD_FUNC_OUT, "CPU" },
|
6433
|
-
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
6434
|
-
{ OFFLOAD_FUNC, "GPU (CUDA)" },
|
6435
|
-
{ OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
|
6436
|
-
{ OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
|
6437
|
-
{ OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
|
6438
|
-
{ OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
|
6439
|
-
#else
|
6440
|
-
{ OFFLOAD_FUNC, "CPU" },
|
6441
|
-
{ OFFLOAD_FUNC_FRC, "CPU" },
|
6442
|
-
{ OFFLOAD_FUNC_KQV, "CPU" },
|
6443
|
-
{ OFFLOAD_FUNC_NR, "CPU" },
|
6444
|
-
{ OFFLOAD_FUNC_EMB, "CPU" },
|
6445
|
-
#endif // GGML_USE_CUBLAS
|
6446
|
-
};
|
6447
|
-
|
6448
|
-
// check the global map for what offload function to use for this tensor
|
6449
|
-
llm_offload_func_e func_e = k_offload_func_trie.find(name);
|
6450
|
-
|
6451
|
-
if (func_e == OFFLOAD_FUNC_NOP) {
|
6452
|
-
#ifdef LLAMA_OFFLOAD_DEBUG
|
6453
|
-
// if a tensor hasn't been offloaded, we warn the user
|
6454
|
-
if (worst_case) {
|
6455
|
-
LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__,
|
6456
|
-
cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837");
|
6457
|
-
}
|
6458
|
-
#endif
|
6459
|
-
|
6460
|
-
return;
|
6461
|
-
}
|
6462
|
-
|
6463
|
-
// count the number of layers and respect the provided n_gpu_layers
|
6464
|
-
switch (func_e) {
|
6465
|
-
case OFFLOAD_FUNC_NOP:
|
6466
|
-
case OFFLOAD_FUNC_OUT:
|
6467
|
-
break;
|
6468
|
-
case OFFLOAD_FUNC:
|
6469
|
-
if (n_gpu_layers < n_layer) {
|
6470
|
-
if (il < i_gpu_start) {
|
6471
|
-
func_e = OFFLOAD_FUNC_NOP;
|
6472
|
-
}
|
6473
|
-
}
|
6474
|
-
break;
|
6475
|
-
case OFFLOAD_FUNC_FRC:
|
6476
|
-
if (!lctx.cparams.offload_kqv) {
|
6477
|
-
func_e = OFFLOAD_FUNC_NOP;
|
6478
|
-
} break;
|
6479
|
-
case OFFLOAD_FUNC_KQV:
|
6480
|
-
if (!lctx.cparams.offload_kqv) {
|
6481
|
-
func_e = OFFLOAD_FUNC_NOP;
|
6482
|
-
} else {
|
6483
|
-
if (n_gpu_layers < n_layer) {
|
6484
|
-
if (il < i_gpu_start) {
|
6485
|
-
func_e = OFFLOAD_FUNC_NOP;
|
6486
|
-
}
|
6487
|
-
}
|
6488
|
-
}
|
6489
|
-
break;
|
6490
|
-
case OFFLOAD_FUNC_NR:
|
6491
|
-
if (n_gpu_layers <= n_layer + 0) {
|
6492
|
-
func_e = OFFLOAD_FUNC_NOP;
|
6493
|
-
}
|
6494
|
-
break;
|
6495
|
-
case OFFLOAD_FUNC_EMB:
|
6496
|
-
if (!offload_emb || n_gpu_layers < n_layer) {
|
6497
|
-
func_e = OFFLOAD_FUNC_NOP;
|
6498
|
-
}
|
6499
|
-
break;
|
6500
|
-
default: GGML_ASSERT(false);
|
6062
|
+
|
6063
|
+
alloc_inp_KQ_mask = true;
|
6501
6064
|
}
|
6502
6065
|
|
6503
|
-
|
6066
|
+
if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) {
|
6067
|
+
ggml_tallocr_alloc(lctx.alloc, cur);
|
6504
6068
|
|
6505
|
-
|
6506
|
-
|
6507
|
-
static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc;
|
6508
|
-
#else
|
6509
|
-
static offload_func_t ggml_offload_gpu = ggml_offload_nop;
|
6510
|
-
#endif
|
6069
|
+
if (!ggml_tallocr_is_measure(lctx.alloc)) {
|
6070
|
+
const int64_t n_ctx = cur->ne[0];
|
6511
6071
|
|
6512
|
-
|
6513
|
-
|
6514
|
-
|
6515
|
-
|
6516
|
-
|
6517
|
-
|
6518
|
-
|
6519
|
-
|
6520
|
-
|
6521
|
-
|
6072
|
+
int32_t * data;
|
6073
|
+
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
6074
|
+
data = (int32_t *) cur->data;
|
6075
|
+
} else {
|
6076
|
+
lctx.buf_copy.resize(ggml_nbytes(cur));
|
6077
|
+
data = (int32_t *) lctx.buf_copy.data();
|
6078
|
+
}
|
6079
|
+
|
6080
|
+
for (int i = 0; i < n_ctx; ++i) {
|
6081
|
+
data[i] = lctx.kv_self.cells[i].delta;
|
6082
|
+
}
|
6522
6083
|
|
6523
|
-
|
6524
|
-
|
6084
|
+
if (data != cur->data) {
|
6085
|
+
ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
|
6086
|
+
}
|
6087
|
+
}
|
6525
6088
|
|
6526
|
-
|
6527
|
-
if (worst_case) {
|
6528
|
-
LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str());
|
6089
|
+
alloc_inp_K_shift = true;
|
6529
6090
|
}
|
6530
|
-
#endif
|
6531
6091
|
};
|
6532
6092
|
|
6533
6093
|
struct ggml_cgraph * result = NULL;
|
@@ -6595,27 +6155,6 @@ static struct ggml_cgraph * llama_build_graph(
|
|
6595
6155
|
|
6596
6156
|
llm.free();
|
6597
6157
|
|
6598
|
-
if (worst_case) {
|
6599
|
-
int n_non_view_total = 0;
|
6600
|
-
|
6601
|
-
for (int i = 0; i < result->n_nodes; ++i) {
|
6602
|
-
if (result->nodes[i]->view_src == nullptr) {
|
6603
|
-
n_non_view_total++;
|
6604
|
-
}
|
6605
|
-
}
|
6606
|
-
|
6607
|
-
LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total);
|
6608
|
-
|
6609
|
-
if (n_non_view != n_non_view_total) {
|
6610
|
-
LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__);
|
6611
|
-
LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__);
|
6612
|
-
LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__);
|
6613
|
-
LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__);
|
6614
|
-
LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__);
|
6615
|
-
LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__);
|
6616
|
-
}
|
6617
|
-
}
|
6618
|
-
|
6619
6158
|
return result;
|
6620
6159
|
}
|
6621
6160
|
|
@@ -6661,8 +6200,6 @@ static int llama_decode_internal(
|
|
6661
6200
|
|
6662
6201
|
auto & kv_self = lctx.kv_self;
|
6663
6202
|
|
6664
|
-
GGML_ASSERT(!!kv_self.ctx);
|
6665
|
-
|
6666
6203
|
const int64_t n_embd = hparams.n_embd;
|
6667
6204
|
const int64_t n_vocab = hparams.n_vocab;
|
6668
6205
|
|
@@ -6716,12 +6253,10 @@ static int llama_decode_internal(
|
|
6716
6253
|
|
6717
6254
|
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
|
6718
6255
|
|
6719
|
-
|
6256
|
+
ggml_backend_sched_reset(lctx.sched);
|
6720
6257
|
|
6721
6258
|
ggml_cgraph * gf = llama_build_graph(lctx, batch);
|
6722
6259
|
|
6723
|
-
ggml_allocr_alloc_graph(lctx.alloc, gf);
|
6724
|
-
|
6725
6260
|
// the output is always the last tensor in the graph
|
6726
6261
|
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
6727
6262
|
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
|
@@ -6733,30 +6268,6 @@ static int llama_decode_internal(
|
|
6733
6268
|
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
6734
6269
|
}
|
6735
6270
|
|
6736
|
-
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
6737
|
-
char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc);
|
6738
|
-
for (int i = 0; i < gf->n_leafs; i++) {
|
6739
|
-
ggml_tensor * node = gf->leafs[i];
|
6740
|
-
if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
|
6741
|
-
ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
|
6742
|
-
ggml_cuda_copy_to_device(node);
|
6743
|
-
}
|
6744
|
-
}
|
6745
|
-
|
6746
|
-
for (int i = 0; i < gf->n_nodes; i++) {
|
6747
|
-
ggml_tensor * node = gf->nodes[i];
|
6748
|
-
if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
|
6749
|
-
ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
|
6750
|
-
}
|
6751
|
-
}
|
6752
|
-
|
6753
|
-
// HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
|
6754
|
-
if (!lctx.embedding.empty()) {
|
6755
|
-
embeddings->backend = GGML_BACKEND_CPU;
|
6756
|
-
}
|
6757
|
-
res->backend = GGML_BACKEND_CPU;
|
6758
|
-
#endif
|
6759
|
-
|
6760
6271
|
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
6761
6272
|
|
6762
6273
|
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
@@ -6779,15 +6290,17 @@ static int llama_decode_internal(
|
|
6779
6290
|
#endif
|
6780
6291
|
|
6781
6292
|
#ifdef GGML_USE_METAL
|
6782
|
-
if (ggml_backend_is_metal(lctx.
|
6783
|
-
ggml_backend_metal_set_n_cb(lctx.
|
6293
|
+
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
6294
|
+
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
6784
6295
|
}
|
6785
6296
|
#endif
|
6786
6297
|
|
6787
|
-
if (
|
6788
|
-
ggml_backend_cpu_set_n_threads(lctx.
|
6298
|
+
if (lctx.backend_cpu != nullptr) {
|
6299
|
+
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
6789
6300
|
}
|
6790
|
-
|
6301
|
+
ggml_backend_sched_graph_compute(lctx.sched, gf);
|
6302
|
+
|
6303
|
+
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
6791
6304
|
|
6792
6305
|
#ifdef GGML_USE_MPI
|
6793
6306
|
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
@@ -6835,30 +6348,33 @@ static int llama_decode_internal(
|
|
6835
6348
|
logits_out.clear();
|
6836
6349
|
#endif
|
6837
6350
|
|
6351
|
+
ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
|
6352
|
+
GGML_ASSERT(res_backend != nullptr);
|
6838
6353
|
if (batch.logits) {
|
6839
6354
|
logits_out.resize(n_vocab * n_tokens);
|
6840
6355
|
for (uint32_t i = 0; i < n_tokens; i++) {
|
6841
6356
|
if (batch.logits[i] == 0) {
|
6842
6357
|
continue;
|
6843
6358
|
}
|
6844
|
-
|
6359
|
+
ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
|
6845
6360
|
#ifndef NDEBUG
|
6846
6361
|
logits_valid[i] = true;
|
6847
6362
|
#endif
|
6848
6363
|
}
|
6849
6364
|
} else if (lctx.logits_all) {
|
6850
6365
|
logits_out.resize(n_vocab * n_tokens);
|
6851
|
-
|
6366
|
+
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
|
6852
6367
|
#ifndef NDEBUG
|
6853
6368
|
std::fill(logits_valid.begin(), logits_valid.end(), true);
|
6854
6369
|
#endif
|
6855
6370
|
} else {
|
6856
6371
|
logits_out.resize(n_vocab);
|
6857
|
-
|
6372
|
+
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
|
6858
6373
|
#ifndef NDEBUG
|
6859
6374
|
logits_valid[0] = true;
|
6860
6375
|
#endif
|
6861
6376
|
}
|
6377
|
+
ggml_backend_synchronize(res_backend);
|
6862
6378
|
}
|
6863
6379
|
|
6864
6380
|
// extract embeddings
|
@@ -6866,7 +6382,9 @@ static int llama_decode_internal(
|
|
6866
6382
|
auto & embedding_out = lctx.embedding;
|
6867
6383
|
|
6868
6384
|
embedding_out.resize(n_embd);
|
6869
|
-
|
6385
|
+
ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
|
6386
|
+
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
|
6387
|
+
ggml_backend_synchronize(embeddings_backend);
|
6870
6388
|
}
|
6871
6389
|
|
6872
6390
|
// measure the performance only for the single-token evals
|
@@ -6937,15 +6455,15 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
|
6937
6455
|
static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
|
6938
6456
|
static const char * hex = "0123456789ABCDEF";
|
6939
6457
|
switch (llama_vocab_get_type(vocab)) {
|
6940
|
-
|
6941
|
-
|
6942
|
-
|
6943
|
-
|
6944
|
-
|
6945
|
-
|
6946
|
-
|
6947
|
-
|
6948
|
-
|
6458
|
+
case LLAMA_VOCAB_TYPE_SPM: {
|
6459
|
+
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
|
6460
|
+
return vocab.token_to_id.at(buf);
|
6461
|
+
}
|
6462
|
+
case LLAMA_VOCAB_TYPE_BPE: {
|
6463
|
+
return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
|
6464
|
+
}
|
6465
|
+
default:
|
6466
|
+
GGML_ASSERT(false);
|
6949
6467
|
}
|
6950
6468
|
}
|
6951
6469
|
|
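The rewritten `llama_byte_to_token` above maps a raw byte to a vocab entry in two ways: SPM vocabularies look it up through the piece `<0xNN>`, while BPE vocabularies go through `bytes_to_unicode_bpe`. Below is a minimal standalone sketch of just the SPM byte-piece construction; the helper name and the example byte are illustrative and not part of the library.

```cpp
#include <cstdio>
#include <string>

// Build the "<0xNN>" piece an SPM vocab uses for a raw byte,
// mirroring the buffer construction in llama_byte_to_token above.
static std::string spm_byte_piece(unsigned char ch) {
    static const char * hex = "0123456789ABCDEF";
    const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
    return std::string(buf);
}

int main() {
    printf("%s\n", spm_byte_piece(0x0A).c_str()); // prints "<0x0A>"
    return 0;
}
```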
@@ -7479,7 +6997,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
7479
6997
|
if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
|
7480
6998
|
|
7481
6999
|
#ifdef PRETOKENIZERDEBUG
|
7482
|
-
|
7000
|
+
LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
|
7483
7001
|
#endif
|
7484
7002
|
auto source = std::distance(buffer.begin(), it);
|
7485
7003
|
|
@@ -7492,7 +7010,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
7492
7010
|
buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
|
7493
7011
|
|
7494
7012
|
#ifdef PRETOKENIZERDEBUG
|
7495
|
-
|
7013
|
+
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
|
7496
7014
|
#endif
|
7497
7015
|
it++;
|
7498
7016
|
}
|
@@ -7508,7 +7026,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
7508
7026
|
buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
|
7509
7027
|
|
7510
7028
|
#ifdef PRETOKENIZERDEBUG
|
7511
|
-
|
7029
|
+
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
|
7512
7030
|
#endif
|
7513
7031
|
|
7514
7032
|
it++;
|
@@ -7524,7 +7042,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
7524
7042
|
raw_text_base_length = right_reminder_length;
|
7525
7043
|
|
7526
7044
|
#ifdef PRETOKENIZERDEBUG
|
7527
|
-
|
7045
|
+
LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
|
7528
7046
|
#endif
|
7529
7047
|
} else {
|
7530
7048
|
if (source == 0) {
|
@@ -7581,7 +7099,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
7581
7099
|
}
|
7582
7100
|
|
7583
7101
|
#ifdef PRETOKENIZERDEBUG
|
7584
|
-
|
7102
|
+
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
|
7585
7103
|
#endif
|
7586
7104
|
llm_tokenizer_spm tokenizer(vocab);
|
7587
7105
|
llama_escape_whitespace(raw_text);
|
@@ -7602,7 +7120,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
7602
7120
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
7603
7121
|
|
7604
7122
|
#ifdef PRETOKENIZERDEBUG
|
7605
|
-
|
7123
|
+
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
|
7606
7124
|
#endif
|
7607
7125
|
llm_tokenizer_bpe tokenizer(vocab);
|
7608
7126
|
tokenizer.tokenize(raw_text, output);
|
@@ -8380,39 +7898,59 @@ static void llama_log_softmax(float * array, size_t size) {
|
|
8380
7898
|
}
|
8381
7899
|
}
|
8382
7900
|
|
7901
|
+
void llama_sample_apply_guidance(
|
7902
|
+
struct llama_context * ctx,
|
7903
|
+
float * logits,
|
7904
|
+
float * logits_guidance,
|
7905
|
+
float scale) {
|
7906
|
+
GGML_ASSERT(ctx);
|
7907
|
+
|
7908
|
+
const auto t_start_sample_us = ggml_time_us();
|
7909
|
+
const auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
7910
|
+
|
7911
|
+
llama_log_softmax(logits, n_vocab);
|
7912
|
+
llama_log_softmax(logits_guidance, n_vocab);
|
7913
|
+
|
7914
|
+
for (int i = 0; i < n_vocab; ++i) {
|
7915
|
+
auto & l = logits[i];
|
7916
|
+
const auto & g = logits_guidance[i];
|
7917
|
+
|
7918
|
+
l = scale * (l - g) + g;
|
7919
|
+
}
|
7920
|
+
|
7921
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
7922
|
+
}
|
7923
|
+
|
8383
7924
|
void llama_sample_classifier_free_guidance(
|
8384
7925
|
struct llama_context * ctx,
|
8385
7926
|
llama_token_data_array * candidates,
|
8386
7927
|
struct llama_context * guidance_ctx,
|
8387
7928
|
float scale) {
|
8388
|
-
int64_t t_start_sample_us = ggml_time_us();
|
8389
|
-
|
8390
7929
|
GGML_ASSERT(ctx);
|
7930
|
+
int64_t t_start_sample_us;
|
8391
7931
|
|
8392
|
-
|
7932
|
+
t_start_sample_us = ggml_time_us();
|
7933
|
+
const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
|
8393
7934
|
|
8394
|
-
GGML_ASSERT(n_vocab ==
|
7935
|
+
GGML_ASSERT(n_vocab == candidates->size);
|
8395
7936
|
GGML_ASSERT(!candidates->sorted);
|
8396
7937
|
|
8397
|
-
std::vector<float> logits_base;
|
8398
|
-
|
8399
|
-
|
8400
|
-
logits_base.push_back(candidates->data[i].logit);
|
7938
|
+
std::vector<float> logits_base(n_vocab);
|
7939
|
+
for (size_t i = 0; i < n_vocab; ++i) {
|
7940
|
+
logits_base[i] = candidates->data[i].logit;
|
8401
7941
|
}
|
8402
|
-
llama_log_softmax(logits_base.data(), candidates->size);
|
8403
7942
|
|
8404
|
-
float* logits_guidance = llama_get_logits(guidance_ctx);
|
8405
|
-
llama_log_softmax(logits_guidance, n_vocab);
|
7943
|
+
float * logits_guidance = llama_get_logits(guidance_ctx);
|
8406
7944
|
|
8407
|
-
|
8408
|
-
|
8409
|
-
|
8410
|
-
candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
|
8411
|
-
}
|
7945
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
7946
|
+
llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
|
7947
|
+
t_start_sample_us = ggml_time_us();
|
8412
7948
|
|
8413
|
-
|
8414
|
-
|
7949
|
+
for (size_t i = 0; i < n_vocab; ++i) {
|
7950
|
+
candidates->data[i].logit = logits_base[i];
|
8415
7951
|
}
|
7952
|
+
|
7953
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
8416
7954
|
}
|
8417
7955
|
|
8418
7956
|
llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
|
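The new `llama_sample_apply_guidance` above works in log-probability space: both logit arrays are log-softmaxed and each entry is blended as `l = scale * (l - g) + g`, so a scale above 1 pushes the distribution away from the guidance context. The following is a minimal standalone sketch of that arithmetic with made-up toy logits and scale; it deliberately avoids the llama.cpp API and only mirrors the math shown above.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Plain log-softmax: x_i -> x_i - max(x) - log(sum_j exp(x_j - max(x))).
static void log_softmax(std::vector<float> & v) {
    const float mx = *std::max_element(v.begin(), v.end());
    float sum = 0.0f;
    for (float x : v) sum += std::exp(x - mx);
    const float lse = mx + std::log(sum);
    for (float & x : v) x -= lse;
}

int main() {
    // toy "vocabulary" of 4 tokens (values are placeholders)
    std::vector<float> logits          = { 1.0f, 2.0f, 0.5f, -1.0f }; // with the full prompt
    std::vector<float> logits_guidance = { 1.0f, 0.5f, 0.5f, -1.0f }; // with the guidance (negative) prompt
    const float scale = 1.5f;                                         // > 1 moves away from the guidance distribution

    log_softmax(logits);
    log_softmax(logits_guidance);

    // l = scale * (l - g) + g, as in llama_sample_apply_guidance above
    for (size_t i = 0; i < logits.size(); ++i) {
        logits[i] = scale * (logits[i] - logits_guidance[i]) + logits_guidance[i];
    }

    for (float l : logits) printf("%.4f\n", l);
    return 0;
}
```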
@@ -8836,6 +8374,8 @@ struct quantize_state_internal {
|
|
8836
8374
|
int n_k_quantized = 0;
|
8837
8375
|
int n_fallback = 0;
|
8838
8376
|
|
8377
|
+
bool has_imatrix = false;
|
8378
|
+
|
8839
8379
|
quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
|
8840
8380
|
: model(model)
|
8841
8381
|
, params(params)
|
@@ -8919,9 +8459,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
8919
8459
|
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
8920
8460
|
new_type = GGML_TYPE_Q8_0;
|
8921
8461
|
}
|
8462
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
8463
|
+
new_type = GGML_TYPE_Q5_K;
|
8464
|
+
}
|
8922
8465
|
else if (new_type != GGML_TYPE_Q8_0) {
|
8923
8466
|
new_type = GGML_TYPE_Q6_K;
|
8924
8467
|
}
|
8468
|
+
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
8469
|
+
if (name.find("attn_v.weight") != std::string::npos) {
|
8470
|
+
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
|
8471
|
+
else new_type = GGML_TYPE_Q2_K;
|
8472
|
+
++qs.i_attention_wv;
|
8473
|
+
}
|
8474
|
+
else if (name.find("ffn_down") != std::string::npos) {
|
8475
|
+
if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
|
8476
|
+
++qs.i_feed_forward_w2;
|
8477
|
+
}
|
8478
|
+
else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
|
8925
8479
|
} else if (name.find("attn_v.weight") != std::string::npos) {
|
8926
8480
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
8927
8481
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
@@ -8952,13 +8506,31 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
8952
8506
|
new_type = GGML_TYPE_Q8_0;
|
8953
8507
|
}
|
8954
8508
|
} else if (name.find("ffn_down") != std::string::npos) {
|
8509
|
+
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
|
8510
|
+
int i_layer, n_layer;
|
8511
|
+
if (n_expert == 1) {
|
8512
|
+
i_layer = qs.i_feed_forward_w2;
|
8513
|
+
n_layer = qs.n_feed_forward_w2;
|
8514
|
+
} else {
|
8515
|
+
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but iccasionally randomly
// (comment continues) sic: "iccasionally" in the upstream source should read "occasionally"
|
8516
|
+
// sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
|
8517
|
+
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
8518
|
+
// tensor name.
|
8519
|
+
n_layer = qs.n_feed_forward_w2 / n_expert;
|
8520
|
+
if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
|
8521
|
+
throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
|
8522
|
+
}
|
8523
|
+
if (i_layer < 0 || i_layer >= n_layer) {
|
8524
|
+
throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
|
8525
|
+
}
|
8526
|
+
}
|
8955
8527
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
8956
8528
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
|
8957
|
-
if (
|
8529
|
+
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
|
8958
8530
|
}
|
8959
8531
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
8960
|
-
new_type =
|
8961
|
-
: arch != LLM_ARCH_FALCON || use_more_bits(
|
8532
|
+
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
|
8533
|
+
: arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
|
8962
8534
|
: GGML_TYPE_Q3_K;
|
8963
8535
|
}
|
8964
8536
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
|
@@ -8966,22 +8538,36 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
8966
8538
|
}
|
8967
8539
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
|
8968
8540
|
if (arch == LLM_ARCH_FALCON) {
|
8969
|
-
new_type =
|
8970
|
-
use_more_bits(
|
8541
|
+
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
|
8542
|
+
use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
8971
8543
|
} else {
|
8972
|
-
if (use_more_bits(
|
8544
|
+
if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
|
8973
8545
|
}
|
8974
8546
|
}
|
8975
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(
|
8976
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON &&
|
8547
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
|
8548
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
|
8977
8549
|
new_type = GGML_TYPE_Q5_K;
|
8978
8550
|
}
|
8551
|
+
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
|
8552
|
+
&& qs.has_imatrix && i_layer < n_layer/8) {
|
8553
|
+
// Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
|
8554
|
+
// We only do it when an imatrix is provided because a) we want to make sure that one can always get the
|
8555
|
+
// same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
|
8556
|
+
new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
|
8557
|
+
}
|
8979
8558
|
++qs.i_feed_forward_w2;
|
8980
8559
|
} else if (name.find("attn_output.weight") != std::string::npos) {
|
8981
8560
|
if (arch != LLM_ARCH_FALCON) {
|
8982
|
-
if
|
8983
|
-
|
8984
|
-
|
8561
|
+
if (qs.model.hparams.n_expert == 8) {
|
8562
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
|
8563
|
+
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
|
8564
|
+
new_type = GGML_TYPE_Q5_K;
|
8565
|
+
}
|
8566
|
+
} else {
|
8567
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
8568
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
8569
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
8570
|
+
}
|
8985
8571
|
} else {
|
8986
8572
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
|
8987
8573
|
}
|
@@ -9002,7 +8588,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
9002
8588
|
//}
|
9003
8589
|
bool convert_incompatible_tensor = false;
|
9004
8590
|
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
9005
|
-
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K
|
8591
|
+
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
|
8592
|
+
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) {
|
9006
8593
|
int nx = tensor->ne[0];
|
9007
8594
|
int ny = tensor->ne[1];
|
9008
8595
|
if (nx % QK_K != 0) {
|
@@ -9014,6 +8601,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
9014
8601
|
}
|
9015
8602
|
if (convert_incompatible_tensor) {
|
9016
8603
|
switch (new_type) {
|
8604
|
+
case GGML_TYPE_IQ2_XXS:
|
8605
|
+
case GGML_TYPE_IQ2_XS:
|
9017
8606
|
case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
|
9018
8607
|
case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
|
9019
8608
|
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
|
@@ -9084,6 +8673,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
9084
8673
|
if (params->only_copy) {
|
9085
8674
|
ftype = model.ftype;
|
9086
8675
|
}
|
8676
|
+
const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
|
8677
|
+
if (params->imatrix) {
|
8678
|
+
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
|
8679
|
+
if (imatrix_data) {
|
8680
|
+
LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
|
8681
|
+
qs.has_imatrix = true;
|
8682
|
+
}
|
8683
|
+
}
|
9087
8684
|
|
9088
8685
|
const size_t align = GGUF_DEFAULT_ALIGNMENT;
|
9089
8686
|
struct gguf_context * ctx_out = gguf_init_empty();
|
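The quantization path above now accepts an importance matrix through `params->imatrix`, which it interprets as a `std::unordered_map<std::string, std::vector<float>>` keyed by tensor name, with one float per column (`tensor->ne[0]`). Below is a hedged sketch of how a caller might populate and pass it. The tensor name, vector size and file paths are placeholders, the weight values are not real calibration data, and the `llama_model_quantize` / `llama_model_quantize_default_params` usage should be checked against the llama.h bundled with this gem.

```cpp
#include <string>
#include <unordered_map>
#include <vector>

#include "llama.h"

// Hypothetical stand-in for loading an imatrix file produced by a calibration run.
static std::unordered_map<std::string, std::vector<float>> load_imatrix_stub() {
    std::unordered_map<std::string, std::vector<float>> data;
    // a real run needs an entry for every tensor quantized to a very low-bit type,
    // and each vector must have exactly tensor->ne[0] entries
    data["blk.0.ffn_down.weight"] = std::vector<float>(11008, 1.0f);
    return data;
}

int main() {
    auto imatrix_data = load_imatrix_stub();

    llama_model_quantize_params params = llama_model_quantize_default_params();
    params.ftype   = LLAMA_FTYPE_MOSTLY_IQ2_XS; // very low-bit types require an imatrix (see the check below)
    params.imatrix = &imatrix_data;             // passed as a void *, cast back inside the quantizer

    // quantize "model-f16.gguf" into "model-iq2_xs.gguf" (illustrative paths)
    return (int) llama_model_quantize("model-f16.gguf", "model-iq2_xs.gguf", &params);
}
```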
@@ -9141,6 +8738,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
9141
8738
|
// placeholder for the meta data
|
9142
8739
|
::zeros(fout, meta_size);
|
9143
8740
|
|
8741
|
+
std::set<ggml_type> used_iq2;
|
8742
|
+
|
9144
8743
|
for (int i = 0; i < ml.n_tensors; ++i) {
|
9145
8744
|
struct ggml_tensor * tensor = ml.get_tensor_meta(i);
|
9146
8745
|
|
@@ -9193,6 +8792,35 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
9193
8792
|
} else {
|
9194
8793
|
const size_t nelements = ggml_nelements(tensor);
|
9195
8794
|
|
8795
|
+
if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) {
|
8796
|
+
ggml_init_iq2_quantization(new_type);
|
8797
|
+
used_iq2.insert(new_type);
|
8798
|
+
}
|
8799
|
+
|
8800
|
+
const float * imatrix = nullptr;
|
8801
|
+
if (imatrix_data) {
|
8802
|
+
auto it = imatrix_data->find(tensor->name);
|
8803
|
+
if (it == imatrix_data->end()) {
|
8804
|
+
LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
|
8805
|
+
} else {
|
8806
|
+
if (it->second.size() == (size_t)tensor->ne[0]) {
|
8807
|
+
imatrix = it->second.data();
|
8808
|
+
} else {
|
8809
|
+
LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
|
8810
|
+
int(it->second.size()), int(tensor->ne[0]), tensor->name);
|
8811
|
+
}
|
8812
|
+
}
|
8813
|
+
}
|
8814
|
+
if ((new_type == GGML_TYPE_IQ2_XXS ||
|
8815
|
+
new_type == GGML_TYPE_IQ2_XS ||
|
8816
|
+
(new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
|
8817
|
+
LLAMA_LOG_ERROR("\n\n============================================================\n");
|
8818
|
+
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
|
8819
|
+
LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
|
8820
|
+
LLAMA_LOG_ERROR("============================================================\n\n");
|
8821
|
+
throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
|
8822
|
+
}
|
8823
|
+
|
9196
8824
|
float * f32_data;
|
9197
8825
|
|
9198
8826
|
if (tensor->type == GGML_TYPE_F32) {
|
@@ -9213,21 +8841,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
9213
8841
|
new_data = work.data();
|
9214
8842
|
std::array<int64_t, 1 << 4> hist_cur = {};
|
9215
8843
|
|
9216
|
-
|
8844
|
+
const int n_per_row = tensor->ne[0];
|
8845
|
+
const int nrows = nelements / n_per_row;
|
8846
|
+
|
8847
|
+
static const int min_chunk_size = 32 * 512;
|
8848
|
+
const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
|
8849
|
+
|
9217
8850
|
const int nchunk = (nelements + chunk_size - 1)/chunk_size;
|
9218
8851
|
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
|
9219
8852
|
if (nthread_use < 2) {
|
9220
|
-
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0,
|
8853
|
+
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
|
9221
8854
|
} else {
|
9222
|
-
|
8855
|
+
int counter = 0;
|
9223
8856
|
new_size = 0;
|
9224
|
-
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data,
|
8857
|
+
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
|
8858
|
+
nrows, n_per_row, imatrix]() {
|
9225
8859
|
std::array<int64_t, 1 << 4> local_hist = {};
|
8860
|
+
const int nrows_per_chunk = chunk_size / n_per_row;
|
9226
8861
|
size_t local_size = 0;
|
9227
8862
|
while (true) {
|
9228
8863
|
std::unique_lock<std::mutex> lock(mutex);
|
9229
|
-
|
9230
|
-
if (
|
8864
|
+
int first_row = counter; counter += nrows_per_chunk;
|
8865
|
+
if (first_row >= nrows) {
|
9231
8866
|
if (local_size > 0) {
|
9232
8867
|
for (int j=0; j<int(local_hist.size()); ++j) {
|
9233
8868
|
hist_cur[j] += local_hist[j];
|
@@ -9237,8 +8872,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
9237
8872
|
break;
|
9238
8873
|
}
|
9239
8874
|
lock.unlock();
|
9240
|
-
|
9241
|
-
local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
|
8875
|
+
const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
|
8876
|
+
local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
|
8877
|
+
first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
|
9242
8878
|
}
|
9243
8879
|
};
|
9244
8880
|
for (int it = 0; it < nthread_use - 1; ++it) {
|
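The threaded loop above now hands out whole rows to `ggml_quantize_chunk`: a chunk is at least 32*512 values rounded up to full rows, and a mutex-protected counter acts as the shared work queue. Here is a self-contained sketch of that chunking pattern with the actual quantization call replaced by a placeholder; all sizes are made up.

```cpp
#include <algorithm>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

int main() {
    const int n_per_row = 4096; // values per row (placeholder)
    const int nrows     = 320;  // rows in the tensor (placeholder)
    const int nthread   = 8;

    // same sizing rule as above: at least 32*512 values, rounded up to whole rows
    static const int min_chunk_size = 32 * 512;
    const int chunk_size = n_per_row >= min_chunk_size
        ? n_per_row
        : n_per_row * ((min_chunk_size + n_per_row - 1) / n_per_row);
    const int nrows_per_chunk = chunk_size / n_per_row;

    std::mutex mutex;
    int    counter  = 0; // next row to hand out
    size_t new_size = 0; // bytes produced, merged under the lock

    auto compute = [&]() {
        size_t local_size = 0;
        while (true) {
            std::unique_lock<std::mutex> lock(mutex);
            const int first_row = counter;
            counter += nrows_per_chunk;
            if (first_row >= nrows) {
                new_size += local_size; // still holding the lock here
                break;
            }
            lock.unlock();
            const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
            // stand-in for ggml_quantize_chunk(): pretend each row quantizes to n_per_row/2 bytes
            local_size += (size_t) this_nrow * n_per_row / 2;
        }
    };

    std::vector<std::thread> workers;
    for (int it = 0; it < nthread - 1; ++it) workers.emplace_back(compute);
    compute(); // the calling thread participates as well
    for (auto & w : workers) w.join();

    printf("quantized size: %zu bytes\n", new_size);
    return 0;
}
```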
@@ -9249,7 +8885,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
9249
8885
|
workers.clear();
|
9250
8886
|
}
|
9251
8887
|
|
9252
|
-
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB
|
8888
|
+
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
9253
8889
|
int64_t tot_count = 0;
|
9254
8890
|
for (size_t i = 0; i < hist_cur.size(); i++) {
|
9255
8891
|
hist_all[i] += hist_cur[i];
|
@@ -9257,6 +8893,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
9257
8893
|
}
|
9258
8894
|
|
9259
8895
|
if (tot_count > 0) {
|
8896
|
+
LLAMA_LOG_INFO(" | hist: ");
|
9260
8897
|
for (size_t i = 0; i < hist_cur.size(); i++) {
|
9261
8898
|
LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
|
9262
8899
|
}
|
@@ -9285,6 +8922,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
9285
8922
|
|
9286
8923
|
fout.close();
|
9287
8924
|
|
8925
|
+
for (auto type : used_iq2) {
|
8926
|
+
ggml_deinit_iq2_quantization(type);
|
8927
|
+
}
|
8928
|
+
|
9288
8929
|
gguf_free(ctx_out);
|
9289
8930
|
|
9290
8931
|
LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
|
@@ -9342,48 +8983,23 @@ static int llama_apply_lora_from_file_internal(
|
|
9342
8983
|
|
9343
8984
|
LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
9344
8985
|
|
9345
|
-
// create a name -> tensor map of the model to accelerate lookups
|
9346
|
-
// find the max tensor size to estimate the required temporary buffer size
|
9347
|
-
size_t max_tensor_size = 0;
|
9348
|
-
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
|
9349
|
-
for (const auto & kv : model.tensors_by_name) {
|
9350
|
-
model_tensors.insert(kv);
|
9351
|
-
size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
|
9352
|
-
max_tensor_size = std::max(max_tensor_size, f32_size);
|
9353
|
-
}
|
9354
|
-
|
9355
|
-
// create a temporary ggml context to store the lora tensors
|
9356
|
-
// TODO: use ggml-alloc
|
9357
|
-
size_t lora_ctx_size = max_tensor_size * 3;
|
9358
|
-
LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
|
9359
|
-
std::vector<uint8_t> lora_buf(lora_ctx_size);
|
9360
|
-
|
9361
|
-
struct ggml_init_params params;
|
9362
|
-
params.mem_size = lora_buf.size();
|
9363
|
-
params.mem_buffer = lora_buf.data();
|
9364
|
-
params.no_alloc = false;
|
9365
|
-
|
9366
|
-
using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;
|
9367
|
-
|
9368
|
-
unique_context lora_ctx(nullptr, ggml_free);
|
9369
|
-
lora_ctx.reset(ggml_init(params));
|
9370
|
-
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
|
9371
|
-
|
9372
8986
|
// load base model
|
9373
8987
|
std::unique_ptr<llama_model_loader> ml;
|
9374
|
-
|
9375
|
-
if (path_base_model) {
|
8988
|
+
if (path_base_model) {
|
9376
8989
|
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
9377
8990
|
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
|
9378
|
-
ml->init_mapping(false); // no prefetching
|
8991
|
+
ml->init_mapping(/*prefetch*/ false); // no prefetching
|
9379
8992
|
}
|
9380
8993
|
|
9381
|
-
|
9382
|
-
|
9383
|
-
|
9384
|
-
|
9385
|
-
|
8994
|
+
struct tensor_meta {
|
8995
|
+
std::string name;
|
8996
|
+
ggml_type type;
|
8997
|
+
int32_t ne[2];
|
8998
|
+
size_t offset;
|
8999
|
+
};
|
9000
|
+
std::map<std::string, tensor_meta> tensor_meta_map;
|
9386
9001
|
|
9002
|
+
// load all tensor meta
|
9387
9003
|
while (true) {
|
9388
9004
|
if (fin.tell() == fin.size) {
|
9389
9005
|
// eof
|
@@ -9396,7 +9012,7 @@ static int llama_apply_lora_from_file_internal(
|
|
9396
9012
|
|
9397
9013
|
fin.read_raw(&n_dims, sizeof(n_dims));
|
9398
9014
|
fin.read_raw(&name_len, sizeof(name_len));
|
9399
|
-
fin.read_raw(&ftype,
|
9015
|
+
fin.read_raw(&ftype, sizeof(ftype));
|
9400
9016
|
|
9401
9017
|
if (n_dims != 1 && n_dims != 2) {
|
9402
9018
|
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
@@ -9410,31 +9026,23 @@ static int llama_apply_lora_from_file_internal(
|
|
9410
9026
|
|
9411
9027
|
std::string name;
|
9412
9028
|
{
|
9413
|
-
GGML_ASSERT(name_len
|
9414
|
-
char buf[
|
9029
|
+
GGML_ASSERT(name_len < GGML_MAX_NAME);
|
9030
|
+
char buf[GGML_MAX_NAME];
|
9415
9031
|
fin.read_raw(buf, name_len);
|
9416
9032
|
name = std::string(buf, name_len);
|
9417
9033
|
}
|
9418
9034
|
|
9419
|
-
// check for lora suffix
|
9420
|
-
|
9421
|
-
|
9422
|
-
|
9423
|
-
LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
|
9424
|
-
return 1;
|
9035
|
+
// check for lora suffix
|
9036
|
+
std::string lora_suffix;
|
9037
|
+
if (name.length() > 6) {
|
9038
|
+
lora_suffix = name.substr(name.length() - 6);
|
9425
9039
|
}
|
9426
|
-
|
9427
|
-
|
9428
|
-
std::string base_name = name;
|
9429
|
-
base_name.erase(pos);
|
9430
|
-
// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());
|
9431
|
-
|
9432
|
-
if (model_tensors.find(base_name) == model_tensors.end()) {
|
9433
|
-
LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
|
9040
|
+
if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
|
9041
|
+
LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
|
9434
9042
|
return 1;
|
9435
9043
|
}
|
9436
9044
|
|
9437
|
-
//
|
9045
|
+
// tensor type
|
9438
9046
|
ggml_type wtype;
|
9439
9047
|
switch (ftype) {
|
9440
9048
|
case 0: wtype = GGML_TYPE_F32; break;
|
@@ -9446,122 +9054,177 @@ static int llama_apply_lora_from_file_internal(
|
|
9446
9054
|
return false;
|
9447
9055
|
}
|
9448
9056
|
}
|
9449
|
-
ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
|
9450
|
-
ggml_set_name(lora_tensor, name.c_str());
|
9451
9057
|
|
9452
|
-
//
|
9058
|
+
// data offset
|
9453
9059
|
size_t offset = fin.tell();
|
9454
|
-
size_t tensor_data_size = ggml_nbytes(lora_tensor);
|
9455
9060
|
offset = (offset + 31) & -32;
|
9456
|
-
fin.seek(offset, SEEK_SET);
|
9457
|
-
fin.read_raw(lora_tensor->data, tensor_data_size);
|
9458
9061
|
|
9459
|
-
|
9062
|
+
// skip tensor data
|
9063
|
+
fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
|
9460
9064
|
|
9461
|
-
|
9462
|
-
|
9463
|
-
lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
|
9065
|
+
tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
|
9066
|
+
}
|
9464
9067
|
|
9465
|
-
|
9068
|
+
bool warned = false;
|
9069
|
+
int n_tensors = 0;
|
9466
9070
|
|
9467
|
-
|
9468
|
-
|
9071
|
+
// apply
|
9072
|
+
ggml_backend_t backend_cpu = ggml_backend_cpu_init();
|
9073
|
+
if (backend_cpu == nullptr) {
|
9074
|
+
LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
|
9075
|
+
return 1;
|
9076
|
+
}
|
9077
|
+
ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
|
9469
9078
|
|
9470
|
-
|
9471
|
-
|
9472
|
-
|
9473
|
-
|
9474
|
-
"%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models. dest_t->type: %d", __func__, dest_t->type));
|
9475
|
-
}
|
9476
|
-
offload_func = ggml_cuda_assign_buffers;
|
9477
|
-
offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
|
9478
|
-
}
|
9479
|
-
#endif // GGML_USE_CUBLAS
|
9079
|
+
std::vector<no_init<uint8_t>> read_buf;
|
9080
|
+
for (const auto & it : model.tensors_by_name) {
|
9081
|
+
const std::string & base_name = it.first;
|
9082
|
+
ggml_tensor * model_t = it.second;
|
9480
9083
|
|
9481
|
-
|
9482
|
-
|
9483
|
-
|
9084
|
+
if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
|
9085
|
+
tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
|
9086
|
+
continue;
|
9087
|
+
}
|
9484
9088
|
|
9485
|
-
|
9486
|
-
|
9487
|
-
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
9488
|
-
return 1;
|
9489
|
-
}
|
9089
|
+
tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
|
9090
|
+
tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
|
9490
9091
|
|
9491
|
-
|
9492
|
-
|
9493
|
-
|
9494
|
-
|
9495
|
-
|
9092
|
+
ggml_init_params lora_init_params = {
|
9093
|
+
/* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
|
9094
|
+
/* .mem_buffer */ nullptr,
|
9095
|
+
/* .no_alloc */ true,
|
9096
|
+
};
|
9097
|
+
ggml_context * lora_ctx = ggml_init(lora_init_params);
|
9098
|
+
if (lora_ctx == nullptr) {
|
9099
|
+
LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
|
9100
|
+
ggml_backend_free(backend_cpu);
|
9101
|
+
return 1;
|
9102
|
+
}
|
9496
9103
|
|
9497
|
-
|
9498
|
-
|
9499
|
-
|
9500
|
-
|
9501
|
-
|
9502
|
-
|
9104
|
+
// create tensors
|
9105
|
+
ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
|
9106
|
+
ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
|
9107
|
+
ggml_set_name(loraA, metaA.name.c_str());
|
9108
|
+
ggml_set_name(loraB, metaB.name.c_str());
|
9109
|
+
|
9110
|
+
ggml_tensor * base_t;
|
9111
|
+
if (ml) {
|
9112
|
+
if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
|
9113
|
+
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
9114
|
+
return 1;
|
9503
9115
|
}
|
9116
|
+
base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
|
9117
|
+
} else {
|
9118
|
+
base_t = ggml_dup_tensor(lora_ctx, model_t);
|
9119
|
+
}
|
9120
|
+
ggml_set_name(base_t, base_name.c_str());
|
9504
9121
|
|
9505
|
-
|
9506
|
-
|
9507
|
-
|
9122
|
+
// allocate in backend buffer
|
9123
|
+
ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
|
9124
|
+
if (lora_buf == nullptr) {
|
9125
|
+
LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
|
9126
|
+
return 1;
|
9127
|
+
}
|
9508
9128
|
|
9509
|
-
|
9510
|
-
|
9511
|
-
|
9129
|
+
// load tensor data
|
9130
|
+
auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
|
9131
|
+
read_buf.resize(ggml_nbytes(tensor));
|
9132
|
+
fin.seek(tensor_meta.offset, SEEK_SET);
|
9133
|
+
fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
|
9134
|
+
ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
|
9135
|
+
};
|
9136
|
+
load_tensor(metaA, loraA);
|
9137
|
+
load_tensor(metaB, loraB);
|
9512
9138
|
|
9513
|
-
|
9514
|
-
|
9515
|
-
|
9516
|
-
|
9517
|
-
|
9139
|
+
// load base model tensor data
|
9140
|
+
if (ml) {
|
9141
|
+
ml->load_data_for(base_t);
|
9142
|
+
} else {
|
9143
|
+
ggml_backend_tensor_copy(model_t, base_t);
|
9144
|
+
}
|
9518
9145
|
|
9146
|
+
if (ggml_is_quantized(base_t->type) && !warned) {
|
9147
|
+
LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
|
9148
|
+
"use a f16 or f32 base model with --lora-base\n", __func__);
|
9149
|
+
warned = true;
|
9150
|
+
}
|
9151
|
+
|
9152
|
+
if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
|
9153
|
+
LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
|
9154
|
+
" are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
|
9155
|
+
ggml_free(lora_ctx);
|
9156
|
+
ggml_backend_buffer_free(lora_buf);
|
9157
|
+
ggml_backend_free(backend_cpu);
|
9158
|
+
return 1;
|
9159
|
+
}
|
9160
|
+
|
9161
|
+
auto build_lora_graph = [&]() {
|
9519
9162
|
// w = w + BA*s
|
9520
|
-
ggml_tensor * BA = ggml_mul_mat(lora_ctx
|
9521
|
-
offload_func(BA);
|
9163
|
+
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
|
9522
9164
|
ggml_set_name(BA, "BA");
|
9523
9165
|
|
9524
9166
|
if (scaling != 1.0f) {
|
9525
|
-
BA =
|
9526
|
-
offload_func(BA);
|
9167
|
+
BA = ggml_scale(lora_ctx, BA, scaling);
|
9527
9168
|
ggml_set_name(BA, "BA_scaled");
|
9528
9169
|
}
|
9529
9170
|
|
9530
9171
|
ggml_tensor * r;
|
9531
|
-
|
9532
|
-
|
9533
|
-
offload_func_force_inplace(r);
|
9534
|
-
ggml_set_name(r, "r_add_inplace");
|
9535
|
-
}
|
9536
|
-
else {
|
9537
|
-
r = ggml_add(lora_ctx.get(), base_t, BA);
|
9538
|
-
offload_func(r);
|
9539
|
-
ggml_set_name(r, "r_add");
|
9172
|
+
r = ggml_add_inplace(lora_ctx, base_t, BA);
|
9173
|
+
ggml_set_name(r, "r_add");
|
9540
9174
|
|
9541
|
-
|
9542
|
-
|
9543
|
-
|
9175
|
+
if (base_t->type != model_t->type) {
|
9176
|
+
// convert the result to the model type
|
9177
|
+
r = ggml_cast(lora_ctx, r, model_t->type);
|
9178
|
+
ggml_set_name(r, "r_cast");
|
9544
9179
|
}
|
9545
9180
|
|
9546
|
-
|
9547
|
-
|
9181
|
+
return r;
|
9182
|
+
};
|
9183
|
+
|
9184
|
+
ggml_cgraph * gf = ggml_new_graph(lora_ctx);
|
9185
|
+
ggml_tensor * r = build_lora_graph();
|
9186
|
+
ggml_build_forward_expand(gf, r);
|
9548
9187
|
|
9549
|
-
|
9188
|
+
ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
|
9189
|
+
if (graph_buf == nullptr) {
|
9190
|
+
LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
|
9191
|
+
ggml_free(lora_ctx);
|
9192
|
+
ggml_backend_buffer_free(lora_buf);
|
9193
|
+
ggml_backend_free(backend_cpu);
|
9194
|
+
return 1;
|
9195
|
+
}
|
9550
9196
|
|
9551
|
-
|
9552
|
-
GGML_ASSERT(lora_tensors.size() == 2);
|
9197
|
+
ggml_backend_graph_compute(backend_cpu, gf);
|
9553
9198
|
|
9554
|
-
|
9555
|
-
lora_ctx.reset(ggml_init(params));
|
9556
|
-
lora_tensors.clear();
|
9199
|
+
ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
|
9557
9200
|
|
9558
|
-
|
9559
|
-
|
9560
|
-
|
9561
|
-
|
9201
|
+
#if 0
|
9202
|
+
// TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
|
9203
|
+
//ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
|
9204
|
+
|
9205
|
+
// sched compute
|
9206
|
+
ggml_build_forward_expand(gf, build_graph());
|
9207
|
+
ggml_backend_sched_init_measure(sched, gf);
|
9208
|
+
|
9209
|
+
// create the graph again, since the previous one was destroyed by the measure
|
9210
|
+
ggml_graph_clear(gf);
|
9211
|
+
ggml_build_forward_expand(gf, build_graph());
|
9212
|
+
ggml_backend_sched_graph_compute(sched, gf);
|
9213
|
+
ggml_backend_sched_free(sched);
|
9214
|
+
#endif
|
9215
|
+
|
9216
|
+
ggml_backend_buffer_free(lora_buf);
|
9217
|
+
ggml_backend_buffer_free(graph_buf);
|
9218
|
+
ggml_free(lora_ctx);
|
9219
|
+
|
9220
|
+
n_tensors++;
|
9221
|
+
if (n_tensors % 4 == 0) {
|
9222
|
+
LLAMA_LOG_INFO(".");
|
9562
9223
|
}
|
9563
9224
|
}
|
9564
9225
|
|
9226
|
+
ggml_backend_free(backend_cpu);
|
9227
|
+
|
9565
9228
|
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
|
9566
9229
|
LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
|
9567
9230
|
|
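The rebuilt LoRA path above constructs a small ggml graph per tensor that computes `w = w + BA*s` on the CPU backend and writes the result back into the model tensor. Below is a toy standalone sketch of that update on plain row-major float matrices; ggml's tensor layout conventions are ignored, and all dimensions and values are made up.

```cpp
#include <cstdio>
#include <vector>

// Toy dense LoRA update: W (n_out x n_in) += scaling * B (n_out x r) * A (r x n_in).
// This is the math build_lora_graph() above expresses with ggml_mul_mat / ggml_scale / ggml_add_inplace.
int main() {
    const int n_out = 4, n_in = 3, r = 2;
    const float scaling = 0.5f; // corresponds to the scaling value the loader logs above

    std::vector<float> W(n_out * n_in, 1.0f);
    std::vector<float> A(r * n_in,     0.1f);
    std::vector<float> B(n_out * r,    0.2f);

    for (int i = 0; i < n_out; ++i) {
        for (int j = 0; j < n_in; ++j) {
            float ba = 0.0f;
            for (int k = 0; k < r; ++k) {
                ba += B[i*r + k] * A[k*n_in + j];
            }
            W[i*n_in + j] += scaling * ba; // w = w + BA*s
        }
    }

    for (int i = 0; i < n_out; ++i) {
        for (int j = 0; j < n_in; ++j) printf("%.3f ", W[i*n_in + j]);
        printf("\n");
    }
    return 0;
}
```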
@@ -9574,6 +9237,7 @@ static int llama_apply_lora_from_file_internal(
|
|
9574
9237
|
struct llama_model_params llama_model_default_params() {
|
9575
9238
|
struct llama_model_params result = {
|
9576
9239
|
/*.n_gpu_layers =*/ 0,
|
9240
|
+
/*.split_mode =*/ LLAMA_SPLIT_LAYER,
|
9577
9241
|
/*.main_gpu =*/ 0,
|
9578
9242
|
/*.tensor_split =*/ nullptr,
|
9579
9243
|
/*.progress_callback =*/ nullptr,
|
@@ -9585,7 +9249,8 @@ struct llama_model_params llama_model_default_params() {
|
|
9585
9249
|
};
|
9586
9250
|
|
9587
9251
|
#ifdef GGML_USE_METAL
|
9588
|
-
|
9252
|
+
// note: we usually have plenty of VRAM, so by default offload all layers to the GPU
|
9253
|
+
result.n_gpu_layers = 999;
|
9589
9254
|
#endif
|
9590
9255
|
|
9591
9256
|
return result;
|
@@ -9625,6 +9290,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|
9625
9290
|
/*.quantize_output_tensor =*/ true,
|
9626
9291
|
/*.only_copy =*/ false,
|
9627
9292
|
/*.pure =*/ false,
|
9293
|
+
/*.imatrix =*/ nullptr,
|
9628
9294
|
};
|
9629
9295
|
|
9630
9296
|
return result;
|
@@ -9775,41 +9441,53 @@ struct llama_context * llama_new_context_with_model(
|
|
9775
9441
|
GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
|
9776
9442
|
GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
|
9777
9443
|
|
9778
|
-
// reserve memory for context buffers
|
9779
9444
|
if (!hparams.vocab_only) {
|
9780
|
-
// initialize
|
9445
|
+
// initialize backends
|
9781
9446
|
#ifdef GGML_USE_METAL
|
9782
9447
|
if (model->n_gpu_layers > 0) {
|
9783
|
-
ctx->
|
9784
|
-
if (ctx->
|
9448
|
+
ctx->backend_metal = ggml_backend_metal_init();
|
9449
|
+
if (ctx->backend_metal == nullptr) {
|
9785
9450
|
LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
|
9451
|
+
llama_free(ctx);
|
9452
|
+
return nullptr;
|
9786
9453
|
}
|
9454
|
+
ctx->backends.push_back(ctx->backend_metal);
|
9787
9455
|
}
|
9788
|
-
#elif defined(GGML_USE_CUBLAS)
|
9789
|
-
// for testing only
|
9456
|
+
#elif defined(GGML_USE_CUBLAS)
|
9790
9457
|
if (model->n_gpu_layers > 0) {
|
9791
|
-
|
9792
|
-
if (
|
9793
|
-
|
9458
|
+
// with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
|
9459
|
+
if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) {
|
9460
|
+
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
|
9461
|
+
if (backend == nullptr) {
|
9462
|
+
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
|
9463
|
+
llama_free(ctx);
|
9464
|
+
return nullptr;
|
9465
|
+
}
|
9466
|
+
ctx->backends.push_back(backend);
|
9467
|
+
} else {
|
9468
|
+
// LLAMA_SPLIT_LAYER requires a backend for each GPU
|
9469
|
+
for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
|
9470
|
+
ggml_backend_t backend = ggml_backend_cuda_init(device);
|
9471
|
+
if (backend == nullptr) {
|
9472
|
+
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
|
9473
|
+
llama_free(ctx);
|
9474
|
+
return nullptr;
|
9475
|
+
}
|
9476
|
+
ctx->backends.push_back(backend);
|
9477
|
+
}
|
9794
9478
|
}
|
9795
9479
|
}
|
9796
9480
|
#endif
|
9797
|
-
|
9798
|
-
if (ctx->
|
9799
|
-
|
9800
|
-
|
9801
|
-
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
9802
|
-
}
|
9803
|
-
}
|
9804
|
-
|
9805
|
-
if (ctx->backend == nullptr) {
|
9806
|
-
LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__);
|
9807
|
-
delete ctx;
|
9481
|
+
ctx->backend_cpu = ggml_backend_cpu_init();
|
9482
|
+
if (ctx->backend_cpu == nullptr) {
|
9483
|
+
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
9484
|
+
llama_free(ctx);
|
9808
9485
|
return nullptr;
|
9809
9486
|
}
|
9487
|
+
ctx->backends.push_back(ctx->backend_cpu);
|
9810
9488
|
|
9811
|
-
if (!llama_kv_cache_init(ctx->
|
9812
|
-
cparams.n_ctx,
|
9489
|
+
if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
|
9490
|
+
cparams.n_ctx, cparams.offload_kqv)) {
|
9813
9491
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
9814
9492
|
llama_free(ctx);
|
9815
9493
|
return nullptr;
|
@@ -9833,23 +9511,30 @@ struct llama_context * llama_new_context_with_model(
|
|
9833
9511
|
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
|
9834
9512
|
}
|
9835
9513
|
|
9836
|
-
// resized during inference
|
9837
|
-
|
9838
|
-
ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
|
9839
|
-
} else {
|
9840
|
-
ctx->logits.reserve(hparams.n_vocab);
|
9841
|
-
}
|
9514
|
+
// resized during inference, reserve maximum
|
9515
|
+
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
|
9842
9516
|
|
9843
9517
|
if (params.embedding){
|
9844
9518
|
ctx->embedding.resize(hparams.n_embd);
|
9845
9519
|
}
|
9846
9520
|
|
9847
9521
|
{
|
9848
|
-
//
|
9522
|
+
// buffer types used for the compute buffer of each backend
|
9523
|
+
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
9524
|
+
for (auto * backend : ctx->backends) {
|
9525
|
+
if (ggml_backend_is_cpu(backend)) {
|
9526
|
+
// use host buffers for the CPU backend compute buffer
|
9527
|
+
backend_buft.push_back(llama_default_buffer_type_cpu(true));
|
9528
|
+
} else {
|
9529
|
+
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
|
9530
|
+
}
|
9531
|
+
}
|
9532
|
+
|
9533
|
+
// buffer used to store the computation graph and the tensor meta data
|
9849
9534
|
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
|
9850
9535
|
|
9851
|
-
|
9852
|
-
ctx->alloc =
|
9536
|
+
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
|
9537
|
+
ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
|
9853
9538
|
|
9854
9539
|
// build worst-case graph
|
9855
9540
|
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
|
@@ -9857,50 +9542,19 @@ struct llama_context * llama_new_context_with_model(
             llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
             ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));

-            //
-
-
-
-
-
-            ggml_allocr_free(ctx->alloc);
-
-            ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size);
-            ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
-#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
-            if (model->n_gpu_layers > 0) {
-                // the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets
-                ggml_cuda_set_scratch_size(alloc_size + 64);
-                LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
-
-                // calculate total VRAM usage
-                auto add_tensor = [](const ggml_tensor * t, size_t & size) {
-                    if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
-                        size += ggml_nbytes(t);
-                    }
-                };
-                size_t model_vram_size = 0;
-                for (const auto & kv : model->tensors_by_name) {
-                    add_tensor(kv.second, model_vram_size);
-                }
-
-                size_t kv_vram_size = 0;
-                for (auto & k : ctx->kv_self.k_l) {
-                    add_tensor(k, kv_vram_size);
-                }
-                for (auto & v : ctx->kv_self.v_l) {
-                    add_tensor(v, kv_vram_size);
-                }
-
-                size_t ctx_vram_size = alloc_size + kv_vram_size;
-                size_t total_vram_size = model_vram_size + ctx_vram_size;
+            // initialize scheduler with the worst-case graph
+            ggml_backend_sched_init_measure(ctx->sched, gf);
+            // note: the number of splits during measure is higher than during inference due to the kv shift
+            int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
+            LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
+            ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);

-
-
-
-
+            for (ggml_backend_t backend : ctx->backends) {
+                ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
+                LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+                        ggml_backend_buffer_name(buf),
+                        ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
             }
-#endif
         }
     }

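A minimal sketch (not part of the gem) of the ggml-backend scheduler pattern this hunk adopts, using only the calls visible in the diff above. The helper name `make_sched` and its parameters `backends`, `gf`, and `graph_size` are illustrative and supplied by the caller; llama.cpp itself additionally substitutes a pinned host buffer type for the CPU backend.

// make_sched: collect one compute-buffer type per backend, create a scheduler,
// and size its buffers once with a worst-case graph, as llama.cpp now does.
#include "ggml.h"
#include "ggml-backend.h"

#include <cstdio>
#include <vector>

static ggml_backend_sched_t make_sched(std::vector<ggml_backend_t> & backends,
                                       ggml_cgraph * gf, size_t graph_size) {
    // one compute-buffer type per backend, in the same order as `backends`
    std::vector<ggml_backend_buffer_type_t> bufts;
    for (ggml_backend_t backend : backends) {
        bufts.push_back(ggml_backend_get_default_buffer_type(backend));
    }

    ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), bufts.data(), (int) backends.size(), graph_size);

    // measuring once with a worst-case graph sizes every compute buffer up front
    ggml_backend_sched_init_measure(sched, gf);
    printf("graph splits (measure): %d\n", ggml_backend_sched_get_n_splits(sched));

    for (ggml_backend_t backend : backends) {
        ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(sched, backend);
        printf("%10s compute buffer size = %8.2f MiB\n",
               ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
    }
    return sched;
}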
@@ -9997,9 +9651,8 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
 }

 int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
-    return snprintf(buf, buf_size, "%s %s
+    return snprintf(buf, buf_size, "%s %s %s",
          llama_model_arch_name(model->arch).c_str(),
-         model->hparams.n_expert > 0 ? (std::to_string(model->hparams.n_expert) + "x").c_str() : "",
          llama_model_type_name(model->type),
          llama_model_ftype_name(model->ftype).c_str());
 }
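A short usage sketch (not from the gem) for the reworked call above: llama_model_desc() now formats the architecture, type, and ftype names into the caller's buffer. The buffer size here is arbitrary.

#include "llama.h"

#include <cstdio>

static void print_model_desc(const struct llama_model * model) {
    char desc[128];
    llama_model_desc(model, desc, sizeof(desc)); // fills desc with "<arch> <type> <ftype>"
    printf("model: %s\n", desc);
}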
@@ -10021,7 +9674,14 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
 }

 struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
-
+    auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
+            [name](const std::pair<std::string, struct ggml_tensor *> & it) {
+                return it.first == name;
+            });
+    if (it == model->tensors_by_name.end()) {
+        return nullptr;
+    }
+    return it->second;
 }

 uint32_t llama_model_quantize(
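A usage sketch (not from the gem) for the new lookup above. The tensor name is a typical GGUF-style name and purely illustrative; since the lookup is a linear scan over tensors_by_name, callers may want to cache the result.

#include "llama.h"
#include "ggml.h"

#include <cstdio>

static void inspect_tensor(struct llama_model * model) {
    struct ggml_tensor * t = llama_get_model_tensor(model, "output_norm.weight");
    if (t == NULL) {
        // std::find_if reached the end, so the function now returns nullptr
        printf("tensor not found\n");
        return;
    }
    printf("tensor found: %zu bytes\n", ggml_nbytes(t));
}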
@@ -10199,19 +9859,18 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
     // for reference, std::mt19937(1337) serializes to 6701 bytes.
     const size_t s_rng_size        = sizeof(size_t);
     const size_t s_rng             = LLAMA_MAX_RNG_STATE;
-    const size_t s_logits_capacity = sizeof(size_t);
     const size_t s_logits_size     = sizeof(size_t);
+    // assume worst case for logits although only currently set ones are serialized
     const size_t s_logits          = ctx->logits.capacity() * sizeof(float);
     const size_t s_embedding_size  = sizeof(size_t);
     const size_t s_embedding       = ctx->embedding.size() * sizeof(float);
     const size_t s_kv_size         = sizeof(size_t);
     const size_t s_kv_ntok         = sizeof(int);
-    const size_t s_kv              =
+    const size_t s_kv              = ctx->kv_self.total_size();

     const size_t s_total = (
         + s_rng_size
         + s_rng
-        + s_logits_capacity
         + s_logits_size
         + s_logits
         + s_embedding_size
@@ -10280,37 +9939,27 @@ struct llama_data_file_context : llama_data_context {
 static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
     // copy rng
     {
-        std::
+        std::ostringstream rng_ss;
         rng_ss << ctx->rng;

-        const
-
+        const std::string & rng_str  = rng_ss.str();
+        const size_t        rng_size = rng_str.size();

-
-        memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
+        GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);

-        data_ctx->write(&rng_size,
-        data_ctx->write(
+        data_ctx->write(&rng_size, sizeof(rng_size));
+        data_ctx->write(rng_str.data(), rng_size);
     }

     // copy logits
     {
-        const size_t logits_cap  = ctx->logits.capacity();
         const size_t logits_size = ctx->logits.size();

-        data_ctx->write(&logits_cap, sizeof(logits_cap));
         data_ctx->write(&logits_size, sizeof(logits_size));

         if (logits_size) {
             data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
         }
-
-        // If there is a gap between the size and the capacity, write padding
-        size_t padding_size = (logits_cap - logits_size) * sizeof(float);
-        if (padding_size > 0) {
-            std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
-            data_ctx->write(padding.data(), padding_size);
-        }
     }

     // copy embeddings
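A standalone standard-library demo of the stream round-trip the new RNG code relies on: std::mt19937 serializes and deserializes through operator<< and operator>>, which is what allows the state to be written as a plain string of rng_size bytes.

#include <cassert>
#include <random>
#include <sstream>
#include <string>

int main() {
    std::mt19937 rng(1337);

    std::ostringstream out;
    out << rng;                          // serialize the generator state to text
    const std::string state = out.str();

    std::mt19937 restored;
    std::istringstream in(state);
    in >> restored;                      // restore the state from text
    assert(!in.fail());

    assert(rng() == restored());         // both generators now produce the same sequence
    return 0;
}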
@@ -10335,7 +9984,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     const auto   n_embd_v_gqa = hparams.n_embd_v_gqa();
     const auto   n_ctx        = cparams.n_ctx;

-    const size_t kv_buf_size =
+    const size_t kv_buf_size = kv_self.total_size();
     const uint32_t kv_head   = kv_self.head;
     const uint32_t kv_size   = kv_self.size;
     const uint32_t kv_used   = kv_self.used;
@@ -10348,46 +9997,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
     if (kv_buf_size) {
         const size_t elt_size = ggml_element_size(kv_self.k_l[0]);

-        ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
-        ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
-
-        std::vector<struct ggml_tensor *> kout2d(n_layer);
-        std::vector<struct ggml_tensor *> vout2d(n_layer);
-
-        for (int il = 0; il < (int) n_layer; ++il) {
-            kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
-            vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
-
-            ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
-                    n_embd_k_gqa, kv_head,
-                    elt_size*n_embd_k_gqa, 0);
-
-            ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
-                    kv_head, n_embd_v_gqa,
-                    elt_size*n_ctx, 0);
-
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il]));
-        }
-
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
-
-        ggml_backend_graph_compute(ctx->backend, gf);
-
         std::vector<uint8_t> tmp_buf;
         for (int il = 0; il < (int) n_layer; ++il) {
-            tmp_buf.resize(
-            ggml_backend_tensor_get(
+            tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head);
+            ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
             data_ctx->write(tmp_buf.data(), tmp_buf.size());

-
-
-
+            // v is not contiguous, copy row by row
+            tmp_buf.resize(elt_size*kv_head);
+            for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
+                ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
+                data_ctx->write(tmp_buf.data(), tmp_buf.size());
+            }
         }
-
-        ggml_free(cpy_ctx);
-
-        ggml_backend_buffer_free(buf);
     }

     for (uint32_t i = 0; i < kv_size; ++i) {
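A generic sketch of the row-by-row copy above: only the first `used` elements of each row of the V cache are live, but the rows are laid out with a stride of `stride` elements, so one contiguous read is not possible. All names here are illustrative; the real code reads through ggml_backend_tensor_get with the offset ir*elt_size*n_ctx.

#include <cstddef>
#include <cstring>
#include <vector>

static void copy_used_rows(const float * src, size_t n_rows, size_t stride, size_t used,
                           std::vector<float> & dst) {
    dst.resize(n_rows * used);
    for (size_t r = 0; r < n_rows; ++r) {
        // read `used` elements starting at the beginning of row r (byte offset r*stride*sizeof(float))
        std::memcpy(dst.data() + r * used, src + r * stride, used * sizeof(float));
    }
}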
@@ -10420,13 +10042,13 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     // set rng
     {
         size_t rng_size;
-
+        memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);

-
-        memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
+        GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);

-        std::
-
+        std::string rng_str((char *)inp, rng_size); inp += rng_size;
+
+        std::istringstream rng_ss(rng_str);
         rng_ss >> ctx->rng;

         GGML_ASSERT(!rng_ss.fail());
@@ -10434,20 +10056,18 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {

     // set logits
     {
-        size_t logits_cap;
         size_t logits_size;

-        memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
         memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);

-        GGML_ASSERT(ctx->logits.capacity()
+        GGML_ASSERT(ctx->logits.capacity() >= logits_size);

         if (logits_size) {
             ctx->logits.resize(logits_size);
+
             memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
+            inp += logits_size * sizeof(float);
         }
-
-        inp += logits_cap * sizeof(float);
     }

     // set embeddings
@@ -10486,48 +10106,22 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
     memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);

     if (kv_buf_size) {
-        GGML_ASSERT(
+        GGML_ASSERT(kv_self.total_size() == kv_buf_size);

         const size_t elt_size = ggml_element_size(kv_self.k_l[0]);

-
-
-
-
-
-
-
-
-
-
-
-                n_embd_k_gqa, kv_head,
-                elt_size*n_embd_k_gqa, 0);
-
-            ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
-                kv_head, n_embd_v_gqa,
-                elt_size*n_ctx, 0);
-
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
-            ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d));
-        }
-
-        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
-
-        // load data into the tensors
-        for (int il = 0; il < n_layer; ++il) {
-            ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il]));
-            inp += ggml_nbytes(kin2d[il]);
-
-            ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il]));
-            inp += ggml_nbytes(vin2d[il]);
+        for (int il = 0; il < (int) n_layer; ++il) {
+            size_t k_size = elt_size*n_embd_k_gqa*kv_head;
+            ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
+            inp += k_size;
+
+            // v is not contiguous, copy row by row
+            size_t v_row_size = elt_size*kv_head;
+            for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
+                ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
+                inp += v_row_size;
+            }
         }
-
-        ggml_backend_graph_compute(ctx->backend, gf);
-
-        ggml_free(cpy_ctx);
-
-        ggml_backend_buffer_free(buf);
     }

     ctx->kv_self.head = kv_head;
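A caller-side sketch (not from the gem) of the session save/restore path these hunks change: the buffer is sized with llama_get_state_size() (a worst-case bound), filled with llama_copy_state_data(), and later replayed with llama_set_state_data(). The helper names are illustrative.

#include "llama.h"

#include <cstdint>
#include <vector>

static std::vector<uint8_t> save_state(struct llama_context * ctx) {
    std::vector<uint8_t> buf(llama_get_state_size(ctx));
    const size_t written = llama_copy_state_data(ctx, buf.data());
    buf.resize(written); // logits are no longer padded to capacity, so this can shrink
    return buf;
}

static void restore_state(struct llama_context * ctx, std::vector<uint8_t> & buf) {
    llama_set_state_data(ctx, buf.data());
}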
@@ -10843,6 +10437,8 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
     if (0 <= token && token < llama_n_vocab(model)) {
         switch (llama_vocab_get_type(model->vocab)) {
         case LLAMA_VOCAB_TYPE_SPM: {
+            // NOTE: we accept all unsupported token types,
+            // suppressing them like CONTROL tokens.
             if (llama_is_normal_token(model->vocab, token)) {
                 std::string result = model->vocab.id_to_token[token].text;
                 llama_unescape_whitespace(result);
@@ -10851,6 +10447,13 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
+            } else if (llama_is_user_defined_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
             } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
                 if (length < 3) {
                     return -3;
@@ -10865,14 +10468,12 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 buf[0] = llama_token_to_byte(model->vocab, token);
                 return 1;
-            } else {
-                // TODO: for now we accept all unsupported token types,
-                // suppressing them like CONTROL tokens.
-                // GGML_ASSERT(false);
             }
             break;
         }
         case LLAMA_VOCAB_TYPE_BPE: {
+            // NOTE: we accept all unsupported token types,
+            // suppressing them like CONTROL tokens.
             if (llama_is_normal_token(model->vocab, token)) {
                 std::string result = model->vocab.id_to_token[token].text;
                 result = llama_decode_text(result);
@@ -10881,12 +10482,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
                 }
                 memcpy(buf, result.c_str(), result.length());
                 return result.length();
+            } else if (llama_is_user_defined_token(model->vocab, token)) {
+                std::string result = model->vocab.id_to_token[token].text;
+                if (length < (int) result.length()) {
+                    return -result.length();
+                }
+                memcpy(buf, result.c_str(), result.length());
+                return result.length();
             } else if (llama_is_control_token(model->vocab, token)) {
                 ;
-            } else {
-                // TODO: for now we accept all unsupported token types,
-                // suppressing them like CONTROL tokens.
-                // GGML_ASSERT(false);
             }
             break;
         }
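A caller-side sketch (not from the gem) of the negative-length convention used above: when the destination buffer is too small, llama_token_to_piece() returns the negated required length, so the caller can resize and retry. The wrapper name is illustrative.

#include "llama.h"

#include <string>
#include <vector>

static std::string token_to_piece(const struct llama_model * model, llama_token token) {
    std::vector<char> buf(8, '\0');
    int32_t n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size());
    if (n < 0) {
        // buffer too small: -n is the required length, resize and try again
        buf.resize((size_t) -n);
        n = llama_token_to_piece(model, token, buf.data(), (int32_t) buf.size());
    }
    return std::string(buf.data(), n > 0 ? (size_t) n : 0);
}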
@@ -10998,7 +10602,7 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
     g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
     g_state.log_callback_user_data = user_data;
 #ifdef GGML_USE_METAL
-
+    ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
 #endif
 }
