llama_cpp 0.12.0 → 0.12.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- checksums.yaml +4 -4
- data/CHANGELOG.md +14 -0
- data/ext/llama_cpp/llama_cpp.cpp +78 -0
- data/lib/llama_cpp/version.rb +2 -2
- data/sig/llama_cpp.rbs +11 -0
- data/vendor/tmp/llama.cpp/Makefile +7 -10
- data/vendor/tmp/llama.cpp/ggml-alloc.c +28 -6
- data/vendor/tmp/llama.cpp/ggml-alloc.h +3 -1
- data/vendor/tmp/llama.cpp/ggml-backend-impl.h +36 -36
- data/vendor/tmp/llama.cpp/ggml-backend.c +512 -261
- data/vendor/tmp/llama.cpp/ggml-backend.h +43 -33
- data/vendor/tmp/llama.cpp/ggml-cuda.cu +1494 -559
- data/vendor/tmp/llama.cpp/ggml-cuda.h +18 -30
- data/vendor/tmp/llama.cpp/ggml-impl.h +2 -0
- data/vendor/tmp/llama.cpp/ggml-metal.h +4 -56
- data/vendor/tmp/llama.cpp/ggml-metal.m +1868 -2002
- data/vendor/tmp/llama.cpp/ggml-metal.metal +692 -8
- data/vendor/tmp/llama.cpp/ggml-opencl.cpp +321 -14
- data/vendor/tmp/llama.cpp/ggml-opencl.h +13 -3
- data/vendor/tmp/llama.cpp/ggml-quants.c +2182 -44
- data/vendor/tmp/llama.cpp/ggml-quants.h +36 -1
- data/vendor/tmp/llama.cpp/ggml.c +222 -105
- data/vendor/tmp/llama.cpp/ggml.h +56 -35
- data/vendor/tmp/llama.cpp/llama.cpp +1271 -1618
- data/vendor/tmp/llama.cpp/llama.h +44 -8
- metadata +2 -2
@@ -1,5 +1,4 @@
 #define LLAMA_API_INTERNAL
-//#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - enables ggml-cuda through ggml-backend, disables partial offloading
 #include "llama.h"
 
 #include "unicode.h"
@@ -152,10 +151,6 @@ static bool is_float_close(float a, float b, float abs_tol) {
 return std::fabs(b - a) <= abs_tol;
 }
 
-#ifdef GGML_USE_CPU_HBM
-#include <hbwmalloc.h>
-#endif
-
 static void zeros(std::ofstream & file, size_t n) {
 char zero = 0;
 for (size_t i = 0; i < n; ++i) {
@@ -579,6 +574,9 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
 { LLM_TENSOR_OUTPUT, "output" },
 { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
 { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+{ LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+{ LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+{ LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
 { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
 { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
 { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
@@ -988,20 +986,29 @@ struct llama_mmap {
 throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
 }
 
-#if _WIN32_WINNT >= _WIN32_WINNT_WIN8
 if (prefetch > 0) {
-
-
-
-
-
-
-
+#if _WIN32_WINNT >= 0x602
+// PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
+BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
+HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
+
+// may fail on pre-Windows 8 systems
+pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
+
+if (pPrefetchVirtualMemory) {
+// advise the kernel to preload the mapped memory
+WIN32_MEMORY_RANGE_ENTRY range;
+range.VirtualAddress = addr;
+range.NumberOfBytes = (SIZE_T) std::min(size, prefetch);
+if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n",
+llama_format_win_err(GetLastError()).c_str());
+}
 }
+#else
+throw std::runtime_error("PrefetchVirtualMemory unavailable");
+#endif
 }
-#else
-#pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
-#endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
 }
 
 void unmap_fragment(size_t first, size_t last) {
@@ -1107,7 +1114,7 @@ struct llama_mlock {
 suggest = false;
 }
 
-
+LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
 size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
 return false;
 }
@@ -1116,7 +1123,7 @@ struct llama_mlock {
 
 static void raw_unlock(void * addr, size_t size) {
 if (munlock(addr, size)) {
-
+LLAMA_LOG_WARN("warning: failed to munlock buffer: %s\n", std::strerror(errno));
 }
 }
 #elif defined(_WIN32)
@@ -1134,7 +1141,7 @@ struct llama_mlock {
 return true;
 }
 if (tries == 2) {
-
+LLAMA_LOG_WARN("warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
 len, size, llama_format_win_err(GetLastError()).c_str());
 return false;
 }
@@ -1143,7 +1150,7 @@ struct llama_mlock {
 // set size and try again.
 SIZE_T min_ws_size, max_ws_size;
 if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
-
+LLAMA_LOG_WARN("warning: GetProcessWorkingSetSize failed: %s\n",
 llama_format_win_err(GetLastError()).c_str());
 return false;
 }
@@ -1156,7 +1163,7 @@ struct llama_mlock {
 min_ws_size += increment;
 max_ws_size += increment;
 if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
-
+LLAMA_LOG_WARN("warning: SetProcessWorkingSetSize failed: %s\n",
 llama_format_win_err(GetLastError()).c_str());
 return false;
 }
@@ -1165,7 +1172,7 @@ struct llama_mlock {
 
 static void raw_unlock(void * ptr, size_t len) {
 if (!VirtualUnlock(ptr, len)) {
-
+LLAMA_LOG_WARN("warning: failed to VirtualUnlock buffer: %s\n",
 llama_format_win_err(GetLastError()).c_str());
 }
 }
@@ -1177,7 +1184,7 @@ struct llama_mlock {
 }
 
 bool raw_lock(const void * addr, size_t len) const {
-
+LLAMA_LOG_WARN("warning: mlock not supported on this system\n");
 return false;
 }
 
@@ -1185,12 +1192,6 @@ struct llama_mlock {
 #endif
 };
 
-typedef void (*offload_func_t)(struct ggml_tensor * tensor);
-
-static void ggml_offload_nop(struct ggml_tensor * tensor) {
-(void) tensor;
-}
-
 static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
 std::vector<char> result(8, 0);
 const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
@@ -1206,19 +1207,14 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
 return std::string(result.data(), result.size());
 }
 
-static ggml_backend_buffer_type_t
+static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
 ggml_backend_buffer_type_t buft = nullptr;
 
-#
-
-
-
-#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
-if (n_gpu_layers > 0) {
-buft = ggml_backend_cuda_buffer_type(0);
+#if defined(GGML_USE_CUBLAS)
+// host buffers should only be used when data is expected to be copied to/from the GPU
+if (host_buffer) {
+buft = ggml_backend_cuda_host_buffer_type();
 }
-#elif defined(GGML_USE_CUBLAS)
-buft = ggml_backend_cuda_host_buffer_type();
 #elif defined(GGML_USE_CPU_HBM)
 buft = ggml_backend_cpu_hbm_buffer_type();
 #endif
@@ -1226,10 +1222,45 @@ static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
 if (buft == nullptr) {
 buft = ggml_backend_cpu_buffer_type();
 }
+return buft;
+
+GGML_UNUSED(host_buffer);
+}
+
+static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
+ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_METAL
+buft = ggml_backend_metal_buffer_type();
+#elif defined(GGML_USE_CUBLAS)
+buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_CLBLAST)
+buft = ggml_backend_opencl_buffer_type();
+#endif
 
+if (buft == nullptr) {
+buft = llama_default_buffer_type_cpu(true);
+}
 return buft;
 
-GGML_UNUSED(
+GGML_UNUSED(gpu);
+}
+
+static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
+ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_CUBLAS
+if (ggml_backend_cuda_get_device_count() > 1) {
+buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+}
+#endif
+
+if (buft == nullptr) {
+buft = llama_default_buffer_type_offload(fallback_gpu);
+}
+return buft;
+
+GGML_UNUSED(tensor_split);
 }
 
 //
@@ -1239,7 +1270,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
 struct llama_state {
 llama_state() {
 #ifdef GGML_USE_METAL
-
+ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
 #endif
 }
 
@@ -1440,24 +1471,24 @@ struct llama_kv_cache {
 std::vector<struct ggml_tensor *> k_l; // per layer
 std::vector<struct ggml_tensor *> v_l;
 
-struct ggml_context
+std::vector<struct ggml_context *> ctxs;
+std::vector<ggml_backend_buffer_t> bufs;
 
-
+size_t total_size() const {
+size_t size = 0;
+for (ggml_backend_buffer_t buf : bufs) {
+size += ggml_backend_buffer_get_size(buf);
+}
+return size;
+}
 
 ~llama_kv_cache() {
-
-if (ggml_cublas_loaded()) {
-for (size_t i = 0; i < k_l.size(); ++i) {
-ggml_cuda_free_data(k_l[i]);
-ggml_cuda_free_data(v_l[i]);
-}
-}
-#endif
-if (ctx) {
+for (struct ggml_context * ctx : ctxs) {
 ggml_free(ctx);
 }
-
-
+for (ggml_backend_buffer_t buf : bufs) {
+ggml_backend_buffer_free(buf);
+}
 }
 };
 
@@ -1534,16 +1565,32 @@ struct llama_model {
 
 std::vector<llama_layer> layers;
 
+llama_split_mode split_mode;
+int main_gpu;
 int n_gpu_layers;
 
 // gguf metadata
 std::unordered_map<std::string, std::string> gguf_kv;
 
-//
-struct
+// layer -> buffer type mapping
+struct layer_buft {
+layer_buft() : buft_matrix(nullptr), buft(nullptr) {}
+layer_buft(ggml_backend_buffer_type_t matrix) : buft_matrix(matrix), buft(matrix) {}
+layer_buft(ggml_backend_buffer_type_t matrix, ggml_backend_buffer_type_t other) : buft_matrix(matrix), buft(other) {}
+
+ggml_backend_buffer_type_t buft_matrix; // matrices only - used by split buffers and backends that support only matrix multiplication
+ggml_backend_buffer_type_t buft; // everything else
+};
+
+layer_buft buft_input;
+layer_buft buft_output;
+std::vector<layer_buft> buft_layer;
+
+// contexts where the model tensors metadata is stored
+std::vector<struct ggml_context *> ctxs;
 
-// the model memory
-ggml_backend_buffer_t
+// the model memory buffers for the tensor data
+std::vector<ggml_backend_buffer_t> bufs;
 
 // model memory mapped file
 std::unique_ptr<llama_mmap> mapping;
@@ -1559,39 +1606,32 @@ struct llama_model {
 int64_t t_start_us = 0;
 
 ~llama_model() {
-
-if (ggml_cublas_loaded()) {
-for (size_t i = 0; i < tensors_by_name.size(); ++i) {
-ggml_cuda_free_data(tensors_by_name[i].second);
-}
-ggml_cuda_free_scratch();
-}
-#endif
-
-#if defined(GGML_USE_CLBLAST)
-for (size_t i = 0; i < tensors_by_name.size(); ++i) {
-ggml_cl_free_data(tensors_by_name[i].second);
-}
-#endif
-if (ctx) {
+for (struct ggml_context * ctx : ctxs) {
 ggml_free(ctx);
 }
-
-
+for (ggml_backend_buffer_t buf : bufs) {
+ggml_backend_buffer_free(buf);
+}
 }
 };
 
 struct llama_context {
 llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
 ~llama_context() {
-
-
-
+ggml_backend_sched_free(sched);
+
+for (ggml_backend_t backend : backends) {
+ggml_backend_free(backend);
+}
 }
 
 llama_cparams cparams;
 
-ggml_backend_t
+std::vector<ggml_backend_t> backends;
+#ifdef GGML_USE_METAL
+ggml_backend_t backend_metal = nullptr;
+#endif
+ggml_backend_t backend_cpu = nullptr;
 
 const llama_model & model;
 
@@ -1625,8 +1665,9 @@ struct llama_context {
 
 // memory buffers used to evaluate the model
 std::vector<uint8_t> buf_compute_meta;
-
-
+ggml_backend_sched_t sched = nullptr;
+// allocator for the input tensors
+ggml_tallocr * alloc = nullptr;
 
 // temporary buffer for copying data to/from the backend
 std::vector<no_init<uint8_t>> buf_copy;
@@ -1641,16 +1682,17 @@ struct llama_context {
 //
 
 static bool llama_kv_cache_init(
-const struct llama_hparams & hparams,
 struct llama_kv_cache & cache,
+const llama_model & model,
 ggml_type ktype,
 ggml_type vtype,
 uint32_t n_ctx,
-int n_gpu_layers,
 bool offload) {
+const struct llama_hparams & hparams = model.hparams;
+
 const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
 const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-const
+const int64_t n_layer = hparams.n_layer;
 
 cache.has_shift = false;
 
@@ -1661,62 +1703,65 @@ static bool llama_kv_cache_init(
 cache.cells.clear();
 cache.cells.resize(n_ctx);
 
-
-
-
-params.no_alloc = true;
-
-cache.ctx = ggml_init(params);
+#ifdef GGML_USE_CLBLAST
+offload = false;
+#endif
 
-
+// count used buffer types
+std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
+if (offload) {
+for (int64_t i = 0; i < n_layer; ++i) {
+buft_layer_count[model.buft_layer[i].buft]++;
+}
+} else {
+buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
+}
 
-
-
-
+// create a context for each buffer type
+std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+for (auto & it : buft_layer_count) {
+int n_layers = it.second;
+struct ggml_init_params params = {
+/*.mem_size =*/ 2u*n_layers*ggml_tensor_overhead(),
+/*.mem_buffer =*/ NULL,
+/*.no_alloc =*/ true,
+};
+ggml_context * ctx = ggml_init(params);
+if (!ctx) {
+LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache\n", __func__);
+return false;
+}
+ctx_map[it.first] = ctx;
+cache.ctxs.push_back(ctx);
 }
 
 cache.k_l.reserve(n_layer);
 cache.v_l.reserve(n_layer);
 
-const int i_gpu_start = (int) n_layer - n_gpu_layers;
-
 for (int i = 0; i < (int) n_layer; i++) {
-
-ggml_tensor *
+struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
+ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx);
+ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx);
 ggml_format_name(k, "cache_k_l%d", i);
 ggml_format_name(v, "cache_v_l%d", i);
 cache.k_l.push_back(k);
 cache.v_l.push_back(v);
-#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
-if (i >= i_gpu_start) {
-if (offload) {
-ggml_cuda_assign_buffers_no_scratch(k);
-ggml_cuda_assign_buffers_no_scratch(v);
-vram_kv_cache += ggml_nbytes(k);
-vram_kv_cache += ggml_nbytes(v);
-// HACK: mark tensor as allocated
-k->data = v->data = (void *)(uintptr_t)1;
-}
-}
-#endif // GGML_USE_CUBLAS
 }
 
-// allocate tensors
-
-
-
-
-
-
-
-
-
-LLAMA_LOG_INFO("%s:
+// allocate tensors and initialize the buffers to avoid NaNs in the padding
+for (auto it : ctx_map) {
+ggml_backend_buffer_type_t buft = it.first;
+ggml_context * ctx = it.second;
+ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+if (!buf) {
+LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
+return false;
+}
+ggml_backend_buffer_clear(buf, 0);
+LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+cache.bufs.push_back(buf);
 }
 
-GGML_UNUSED(i_gpu_start);
-GGML_UNUSED(offload);
-
 return true;
 }
 
@@ -1898,6 +1943,28 @@ static void llama_kv_cache_seq_shift(
 cache.head = new_head != cache.size ? new_head : 0;
 }
 
+static void llama_kv_cache_seq_div(
+struct llama_kv_cache & cache,
+llama_seq_id seq_id,
+llama_pos p0,
+llama_pos p1,
+int d) {
+if (p0 < 0) p0 = 0;
+if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
+
+for (uint32_t i = 0; i < cache.size; ++i) {
+if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
+cache.has_shift = true;
+
+{
+llama_pos p_old = cache.cells[i].pos;
+cache.cells[i].pos /= d;
+cache.cells[i].delta += cache.cells[i].pos - p_old;
+}
+}
+}
+}
+
 //
 // model loading and saving
 //
@@ -2018,13 +2085,13 @@ namespace GGUFMeta {
 __func__, override_type_to_str(override->tag), override->key);
 switch (override->tag) {
 case LLAMA_KV_OVERRIDE_BOOL: {
-
+LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false");
 } break;
 case LLAMA_KV_OVERRIDE_INT: {
-
+LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value);
 } break;
 case LLAMA_KV_OVERRIDE_FLOAT: {
-
+LLAMA_LOG_INFO("%.6f\n", override->float_value);
 } break;
 default:
 // Shouldn't be possible to end up here, but just in case...
@@ -2123,6 +2190,11 @@ struct llama_model_loader {
 LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
 
 llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
+int trace = 0;
+if (getenv("LLAMA_TRACE")) {
+trace = atoi(getenv("LLAMA_TRACE"));
+}
+
 struct gguf_init_params params = {
 /*.no_alloc = */ true,
 /*.ctx      = */ &ctx_meta,
@@ -2175,7 +2247,10 @@ struct llama_model_loader {
 type_max = type;
 }
 
-
+if (trace > 0) {
+struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+}
 }
 
 switch (type_max) {
@@ -2191,6 +2266,8 @@ struct llama_model_loader {
 case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
 case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
 case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
+case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
+case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
 default:
 {
 LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2321,9 +2398,8 @@ struct llama_model_loader {
 return get_tensor_meta(get_tensor_name(i));
 }
 
-struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta
+struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) {
 struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
-tensor->backend = backend; // TODO: ggml_set_backend
 ggml_set_name(tensor, ggml_get_name(meta));
 
 n_created++;
@@ -2331,7 +2407,7 @@ struct llama_model_loader {
 return tensor;
 }
 
-struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne,
+struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
 struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
 
 if (cur == NULL) {
@@ -2341,12 +2417,6 @@ struct llama_model_loader {
 throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
 }
 
-if (backend == GGML_BACKEND_GPU_SPLIT) {
-if (ne.size() == 1) {
-throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str()));
-}
-}
-
 {
 bool is_ok = true;
 for (size_t i = 0; i < ne.size(); ++i) {
@@ -2364,7 +2434,7 @@ struct llama_model_loader {
 }
 }
 
-return create_tensor_for(ctx, cur
+return create_tensor_for(ctx, cur);
 }
 
 void done_getting_tensors() const {
@@ -2383,25 +2453,35 @@ struct llama_model_loader {
 return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
 }
 
-void init_mapping(bool prefetch = true) {
-
-// prefetch only CPU tensors
+void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
+// prefetch the whole file - all the data is needed anyway
 if (use_mmap) {
-
+mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
+}
 
-
-
-
-
-
-
+// compute the total size of all tensors for progress reporting
+for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
+struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+size_data += ggml_nbytes(cur);
+}
+
+if (use_mmap && mapping) {
+if (lmlock) {
+lmlock->init(mapping->addr);
 }
-
+mmap_used_first = mapping->size;
 }
-
-
-
-
+}
+
+void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
+GGML_ASSERT(mapping);
+
+*first = mapping->size;
+*last = 0;
+for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
+const size_t offs = file_offset(ggml_get_name(tensor));
+*first = std::min(*first, offs);
+*last = std::max(*last, offs + ggml_nbytes(tensor));
 }
 }
 
@@ -2410,8 +2490,11 @@ struct llama_model_loader {
 const size_t offs = file_offset(ggml_get_name(cur));
 
 if (use_mmap && mapping) {
-
-
+if (cur->data == nullptr) {
+cur->data = (uint8_t *)mapping->addr + offs;
+} else {
+memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
+}
 } else {
 GGML_ASSERT(cur->data != nullptr);
 file.seek(offs, SEEK_SET);
@@ -2419,37 +2502,23 @@ struct llama_model_loader {
 }
 }
 
-
-
-
-
-for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
-struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
-size_data += ggml_nbytes(cur);
-}
-
-if (use_mmap && buf_mmap) {
-if (lmlock) {
-lmlock->init(mapping->addr);
-}
-}
+size_t size_done = 0;
+size_t size_data = 0;
+size_t mmap_used_first = -1;
+size_t mmap_used_last = 0;
 
-
-
-
-const bool legacy_offload = false;
-#endif
+// Returns false if cancelled by progress_callback
+bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) {
+GGML_ASSERT(size_data != 0 && "call init_mapping() first");
 
 std::vector<no_init<uint8_t>> read_buf;
 
-size_t size_done = 0;
-
-size_t mmap_first = -1;
-size_t mmap_last = 0;
-
 for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
 struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
-
+if (!cur) {
+// some tensors may be allocated in a different context
+continue;
+}
 
 if (progress_callback) {
 if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
@@ -2459,67 +2528,48 @@ struct llama_model_loader {
 
 const size_t offs = file_offset(ggml_get_name(cur));
 
-if (
-if (
-
-
-
-lmlock->grow_to(offs + ggml_nbytes(cur));
-}
-mmap_first = std::min(mmap_first, offs);
-mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur));
-} else {
-ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
+if (use_mmap && mapping) {
+if (buf_mmap && cur->data == nullptr) {
+ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
+if (lmlock) {
+lmlock->grow_to(offs + ggml_nbytes(cur));
 }
+mmap_used_first = std::min(mmap_used_first, offs);
+mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur));
 } else {
-
-file.seek(offs, SEEK_SET);
-file.read_raw(cur->data, ggml_nbytes(cur));
-} else {
-read_buf.resize(ggml_nbytes(cur));
-file.seek(offs, SEEK_SET);
-file.read_raw(read_buf.data(), ggml_nbytes(cur));
-ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
-}
+ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
 }
 } else {
-
-
-
-if (use_mmap && mapping) {
-data = (uint8_t *) mapping->addr + offs;
+if (ggml_backend_buffer_is_host(cur->buffer)) {
+file.seek(offs, SEEK_SET);
+file.read_raw(cur->data, ggml_nbytes(cur));
 } else {
 read_buf.resize(ggml_nbytes(cur));
 file.seek(offs, SEEK_SET);
 file.read_raw(read_buf.data(), ggml_nbytes(cur));
-
+ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
 }
-
-#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
-ggml_cuda_transform_tensor(data, cur);
-#elif defined(GGML_USE_CLBLAST)
-GGML_ASSERT(cur->backend == GGML_BACKEND_GPU);
-ggml_cl_transform_tensor(data, cur);
-#else
-GGML_ASSERT(!"GPU tensor without a GPU backend");
-GGML_UNUSED(data);
-#endif
 }
 
 size_done += ggml_nbytes(cur);
 }
 
-//
-if (
-
-
+// check if this is the last call and do final cleanup
+if (size_done >= size_data) {
+// unmap offloaded tensors and metadata
+if (use_mmap && mapping) {
+mapping->unmap_fragment(0, mmap_used_first);
+if (mmap_used_last != 0) {
+mapping->unmap_fragment(mmap_used_last, mapping->size);
+}
+}
+if (progress_callback) {
+// Even though the model is done loading, we still honor
+// cancellation since we need to free allocations.
+return progress_callback(1.0f, progress_callback_user_data);
+}
 }
 
-if (progress_callback) {
-// Even though the model is done loading, we still honor
-// cancellation since we need to free allocations.
-return progress_callback(1.0f, progress_callback_user_data);
-}
 return true;
 }
 };
@@ -2553,7 +2603,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
 
 // K-quants
-case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
+case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
+case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
 case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
 case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
 case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
@@ -2562,6 +2613,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
 case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
 case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
 case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
+case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
+case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
 
 default: return "unknown, may not work";
 }
@@ -2796,6 +2849,7 @@ static void llm_load_hparams(
 ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
 
 switch (hparams.n_layer) {
+case 24: model.type = e_model::MODEL_1B; break;
 case 32: model.type = e_model::MODEL_3B; break;
 default: model.type = e_model::MODEL_UNKNOWN;
 }
@@ -3112,7 +3166,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
 LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
 LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
 LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
-
+if (ml.n_elements >= 1e12) {
+LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
+} else if (ml.n_elements >= 1e9) {
+LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
+} else if (ml.n_elements >= 1e6) {
+LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
+} else {
+LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
+}
 if (ml.n_bytes < GiB) {
 LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
 } else {
@@ -3136,6 +3198,7 @@ static bool llm_load_tensors(
 llama_model_loader & ml,
 llama_model & model,
 int n_gpu_layers,
+enum llama_split_mode split_mode,
 int main_gpu,
 const float * tensor_split,
 bool use_mlock,
@@ -3143,702 +3206,574 @@ static bool llm_load_tensors(
 void * progress_callback_user_data) {
 model.t_start_us = ggml_time_us();
 
-auto & ctx = model.ctx;
 auto & hparams = model.hparams;
 
+model.split_mode = split_mode;
+model.main_gpu = main_gpu;
 model.n_gpu_layers = n_gpu_layers;
 
-
+const int64_t n_layer = hparams.n_layer;
+const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
 
-
+// there is very little benefit to offloading the input layer, so always keep it on the CPU
+model.buft_input = llama_default_buffer_type_cpu(true);
 
-
+model.buft_layer.resize(n_layer);
+
+// assign cpu layers
+for (int64_t i = 0; i < i_gpu_start; ++i) {
+model.buft_layer[i] = llama_default_buffer_type_cpu(true);
+}
+
+#ifdef GGML_USE_CUBLAS
+if (split_mode == LLAMA_SPLIT_LAYER) {
+// calculate the split points
+int device_count = ggml_backend_cuda_get_device_count();
+bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
+float splits[GGML_CUDA_MAX_DEVICES];
+if (all_zero) {
+// default split, by free memory
+for (int i = 0; i < device_count; ++i) {
+size_t total;
+size_t free;
+ggml_backend_cuda_get_device_memory(i, &total, &free);
+splits[i] = free;
+}
+} else {
+std::copy(tensor_split, tensor_split + device_count, splits);
+}
+
+// sum and normalize the splits to get the split points
+float split_sum = 0.0f;
+for (int i = 0; i < device_count; ++i) {
+split_sum += splits[i];
+splits[i] = split_sum;
+}
+for (int i = 0; i < device_count; ++i) {
+splits[i] /= split_sum;
+}
+
+// assign the repeating layers to the devices according to the splits
+int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
+for (int64_t i = i_gpu_start; i < n_layer; ++i) {
+int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits;
+model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
+}
+// assign the output layer
+if (n_gpu_layers > n_layer) {
+int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits;
+model.buft_output = llama_default_buffer_type_offload(layer_gpu);
+} else {
+model.buft_output = llama_default_buffer_type_cpu(true);
+}
+} else
+#endif
 {
+ggml_backend_buffer_type_t split_buft;
+if (split_mode == LLAMA_SPLIT_ROW) {
+split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
+} else {
+// LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported
+split_buft = llama_default_buffer_type_offload(main_gpu);
+}
+// assign the repeating layers
+for (int64_t i = i_gpu_start; i < n_layer; ++i) {
+model.buft_layer[i] = {
+split_buft,
+llama_default_buffer_type_offload(main_gpu)
+};
+}
+// assign the output layer
+if (n_gpu_layers > n_layer) {
+model.buft_output = {
+split_buft,
+llama_default_buffer_type_offload(main_gpu)
+};
+} else {
+model.buft_output = llama_default_buffer_type_cpu(true);
+}
+}
+
+// count used buffer types
+std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
+buft_layer_count[model.buft_input.buft]++;
+buft_layer_count[model.buft_input.buft_matrix]++;
+buft_layer_count[model.buft_output.buft]++;
+buft_layer_count[model.buft_output.buft_matrix]++;
+for (int64_t i = 0; i < n_layer; ++i) {
+buft_layer_count[model.buft_layer[i].buft]++;
+buft_layer_count[model.buft_layer[i].buft_matrix]++;
+}
+
+// create one context per buffer type
+size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
+std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+for (auto & it : buft_layer_count) {
 struct ggml_init_params params = {
 /*.mem_size =*/ ctx_size,
 /*.mem_buffer =*/ NULL,
 /*.no_alloc =*/ true,
 };
-
-
-
-throw std::runtime_error(format("ggml_init() failed"));
+ggml_context * ctx = ggml_init(params);
+if (!ctx) {
+throw std::runtime_error(format("failed to create context"));
 }
+ctx_map[it.first] = ctx;
+model.ctxs.push_back(ctx);
 }
 
-(
-
-enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
-enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
-
-#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
-if (ggml_cublas_loaded()) {
-LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
-ggml_cuda_set_main_device(main_gpu);
-
-llama_backend_offload = GGML_BACKEND_GPU;
-llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
-}
-#elif defined(GGML_USE_CLBLAST)
-LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
-llama_backend_offload = GGML_BACKEND_GPU;
-llama_backend_offload_split = GGML_BACKEND_GPU;
-#endif
+LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);
 
 // create tensors for the weights
 {
 const int64_t n_embd = hparams.n_embd;
 const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
 const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
-const int64_t
+const int64_t n_embd_gqa = n_embd_v_gqa;
 const int64_t n_vocab = hparams.n_vocab;
+const int64_t n_ff = hparams.n_ff;
+
+GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
+
+ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
+ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
+ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
+auto ctx_for_layer = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); };
+auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); };
+
+model.layers.resize(n_layer);
 
 const auto tn = LLM_TN(model.arch);
 switch (model.arch) {
 case LLM_ARCH_LLAMA:
 case LLM_ARCH_REFACT:
 {
-model.tok_embd = ml.create_tensor(
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 
 // output
 {
-
-
-
-if (n_gpu_layers > int(n_layer)) {
-backend_norm = llama_backend_offload;
-backend_output = llama_backend_offload_split;
-} else {
-backend_norm = GGML_BACKEND_CPU;
-backend_output = GGML_BACKEND_CPU;
-}
-
-model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
-model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
 }
 
-
-
-
-GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
-const int i_gpu_start = n_layer - n_gpu_layers;
-
-model.layers.resize(n_layer);
-
-for (uint32_t i = 0; i < n_layer; ++i) {
-const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
-const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
 
 auto & layer = model.layers[i];
 
-layer.attn_norm = ml.create_tensor(
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 
-layer.wq = ml.create_tensor(
-layer.wk = ml.create_tensor(
-layer.wv = ml.create_tensor(
-layer.wo = ml.create_tensor(
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
 
 // optional bias tensors
-layer.bq = ml.create_tensor(
-layer.bk = ml.create_tensor(
-layer.bv = ml.create_tensor(
-layer.bo = ml.create_tensor(
+layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
+layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
+layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
+layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
 
-layer.ffn_norm = ml.create_tensor(
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
 
-layer.ffn_gate_inp = ml.create_tensor(
+layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
 
 if (layer.ffn_gate_inp == nullptr) {
 GGML_ASSERT(hparams.n_expert == 0);
 GGML_ASSERT(hparams.n_expert_used == 0);
 
-layer.ffn_gate = ml.create_tensor(
-layer.ffn_down = ml.create_tensor(
-layer.ffn_up = ml.create_tensor(
+layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
 } else {
 GGML_ASSERT(hparams.n_expert > 0);
 GGML_ASSERT(hparams.n_expert_used > 0);
 
 // MoE branch
 for (uint32_t x = 0; x < hparams.n_expert; ++x) {
-layer.ffn_gate_exp[x] = ml.create_tensor(
-layer.ffn_down_exp[x] = ml.create_tensor(
-layer.ffn_up_exp[x] = ml.create_tensor(
+layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff});
+layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd});
+layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
 }
 }
 }
 } break;
 case LLM_ARCH_BAICHUAN:
 {
-model.tok_embd = ml.create_tensor(
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 {
-
-
-
-if (n_gpu_layers > int(n_layer)) {
-backend_norm = llama_backend_offload;
-backend_output = llama_backend_offload_split;
-} else {
-backend_norm = GGML_BACKEND_CPU;
-backend_output = GGML_BACKEND_CPU;
-}
-
-model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
-model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
 }
 
-
-
-
-GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
-const int i_gpu_start = n_layer - n_gpu_layers;
-
-model.layers.resize(n_layer);
-
-for (uint32_t i = 0; i < n_layer; ++i) {
-const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
-const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
 
 auto & layer = model.layers[i];
 
-layer.attn_norm = ml.create_tensor(
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
 
-layer.wq = ml.create_tensor(
-layer.wk = ml.create_tensor(
-layer.wv = ml.create_tensor(
-layer.wo = ml.create_tensor(
+layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
 
-layer.ffn_norm = ml.create_tensor(
+layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
 
-layer.ffn_gate = ml.create_tensor(
-layer.ffn_down = ml.create_tensor(
-layer.ffn_up = ml.create_tensor(
+layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
 }
 } break;
 case LLM_ARCH_FALCON:
 {
-model.tok_embd = ml.create_tensor(
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
 
 // output
 {
-
-
-
-if (n_gpu_layers > int(n_layer)) {
-backend_norm = llama_backend_offload;
-backend_output = llama_backend_offload_split;
-} else {
-backend_norm = GGML_BACKEND_CPU;
-backend_output = GGML_BACKEND_CPU;
-}
-
-model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
-model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
-model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
 }
 
-
-
-
-GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
-const int i_gpu_start = n_layer - n_gpu_layers;
-
-model.layers.resize(n_layer);
-
-for (uint32_t i = 0; i < n_layer; ++i) {
-const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
-const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
 
 auto & layer = model.layers[i];
 
-layer.attn_norm = ml.create_tensor(
-layer.attn_norm_b = ml.create_tensor(
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
 
 if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
-layer.attn_norm_2 = ml.create_tensor(
-layer.attn_norm_2_b = ml.create_tensor(
+layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
+layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
 }
 
-layer.wqkv = ml.create_tensor(
-layer.wo = ml.create_tensor(
+layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
 
-layer.ffn_down = ml.create_tensor(
-layer.ffn_up = ml.create_tensor(
+layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
 }
 } break;
 case LLM_ARCH_STARCODER:
 {
-model.tok_embd = ml.create_tensor(
-model.pos_embd = ml.create_tensor(
+model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
 
 // output
 {
-
-
-
-if (n_gpu_layers > int(n_layer)) {
-backend_norm = llama_backend_offload;
-backend_output = llama_backend_offload_split;
-} else {
-backend_norm = GGML_BACKEND_CPU;
-backend_output = GGML_BACKEND_CPU;
-}
-
-model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
-model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
-model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
 }
 
-
-
-
-GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
-const int i_gpu_start = n_layer - n_gpu_layers;
-
-model.layers.resize(n_layer);
-
-for (uint32_t i = 0; i < n_layer; ++i) {
-const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
-const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+for (int i = 0; i < n_layer; ++i) {
+ggml_context * ctx_layer = ctx_for_layer(i);
+ggml_context * ctx_split = ctx_for_layer_split(i);
 
 auto & layer = model.layers[i];
 
-layer.attn_norm = ml.create_tensor(
-layer.attn_norm_b = ml.create_tensor(
+layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
 
-layer.wqkv = ml.create_tensor(
-layer.bqkv = ml.create_tensor(
+layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
 
-layer.wo = ml.create_tensor(
-layer.bo = ml.create_tensor(
|
|
3486
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
|
3487
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
|
3418
3488
|
|
|
3419
|
-
layer.ffn_norm = ml.create_tensor(
|
|
3420
|
-
layer.ffn_norm_b = ml.create_tensor(
|
|
3489
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
|
3490
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
|
3421
3491
|
|
|
3422
|
-
layer.ffn_down = ml.create_tensor(
|
|
3423
|
-
layer.ffn_down_b = ml.create_tensor(
|
|
3492
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
|
3493
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
|
3424
3494
|
|
|
3425
|
-
layer.ffn_up
|
|
3426
|
-
layer.ffn_up_b
|
|
3495
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
3496
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
|
3427
3497
|
}
|
|
3428
3498
|
} break;
|
|
3429
3499
|
case LLM_ARCH_PERSIMMON:
|
|
3430
3500
|
{
|
|
3431
|
-
model.tok_embd = ml.create_tensor(
|
|
3501
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
3432
3502
|
|
|
3433
3503
|
{
|
|
3434
|
-
|
|
3435
|
-
|
|
3436
|
-
|
|
3437
|
-
if (n_gpu_layers > int(n_layer)) {
|
|
3438
|
-
backend_norm = llama_backend_offload;
|
|
3439
|
-
backend_output = llama_backend_offload_split;
|
|
3440
|
-
} else {
|
|
3441
|
-
backend_norm = GGML_BACKEND_CPU;
|
|
3442
|
-
backend_output = GGML_BACKEND_CPU;
|
|
3443
|
-
}
|
|
3444
|
-
|
|
3445
|
-
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
|
3446
|
-
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
|
3447
|
-
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
|
3504
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
3505
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
|
3506
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
|
3448
3507
|
}
|
|
3449
3508
|
|
|
3450
|
-
|
|
3451
|
-
|
|
3452
|
-
|
|
3453
|
-
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
|
3509
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3510
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
|
3511
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
|
3454
3512
|
|
|
3455
|
-
const int i_gpu_start = n_layer - n_gpu_layers;
|
|
3456
|
-
model.layers.resize(n_layer);
|
|
3457
|
-
for (uint32_t i = 0; i < n_layer; ++i) {
|
|
3458
|
-
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
|
|
3459
|
-
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
|
|
3460
3513
|
auto & layer = model.layers[i];
|
|
3461
|
-
|
|
3462
|
-
layer.
|
|
3463
|
-
layer.
|
|
3464
|
-
|
|
3465
|
-
layer.
|
|
3466
|
-
layer.
|
|
3467
|
-
|
|
3468
|
-
layer.
|
|
3469
|
-
layer.
|
|
3470
|
-
|
|
3471
|
-
layer.
|
|
3472
|
-
layer.
|
|
3473
|
-
|
|
3474
|
-
layer.
|
|
3475
|
-
layer.
|
|
3476
|
-
|
|
3514
|
+
|
|
3515
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
|
3516
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
|
3517
|
+
|
|
3518
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
|
3519
|
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
|
|
3520
|
+
|
|
3521
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
|
3522
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
|
3523
|
+
|
|
3524
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
|
3525
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
|
3526
|
+
|
|
3527
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
3528
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
|
3529
|
+
|
|
3530
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
|
3531
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
|
3532
|
+
|
|
3533
|
+
layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
|
|
3534
|
+
layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
|
|
3535
|
+
|
|
3536
|
+
layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
|
|
3537
|
+
layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
|
|
3477
3538
|
}
|
|
3478
3539
|
} break;
|
|
3479
3540
|
case LLM_ARCH_BLOOM:
|
|
3480
3541
|
{
|
|
3481
|
-
model.tok_embd = ml.create_tensor(
|
|
3482
|
-
model.tok_norm = ml.create_tensor(
|
|
3483
|
-
model.tok_norm_b = ml.create_tensor(
|
|
3542
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
3543
|
+
model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
|
|
3544
|
+
model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
|
|
3484
3545
|
|
|
3485
3546
|
// output
|
|
3486
3547
|
{
|
|
3487
|
-
|
|
3488
|
-
|
|
3489
|
-
|
|
3490
|
-
if (n_gpu_layers > int(n_layer)) {
|
|
3491
|
-
backend_norm = llama_backend_offload;
|
|
3492
|
-
backend_output = llama_backend_offload_split;
|
|
3493
|
-
} else {
|
|
3494
|
-
backend_norm = GGML_BACKEND_CPU;
|
|
3495
|
-
backend_output = GGML_BACKEND_CPU;
|
|
3496
|
-
}
|
|
3497
|
-
|
|
3498
|
-
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
|
3499
|
-
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
|
3500
|
-
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
|
3548
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
3549
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
|
3550
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
|
3501
3551
|
}
|
|
3502
3552
|
|
|
3503
|
-
|
|
3504
|
-
|
|
3505
|
-
|
|
3506
|
-
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
|
3507
|
-
|
|
3508
|
-
const int i_gpu_start = n_layer - n_gpu_layers;
|
|
3509
|
-
|
|
3510
|
-
model.layers.resize(n_layer);
|
|
3511
|
-
|
|
3512
|
-
for (uint32_t i = 0; i < n_layer; ++i) {
|
|
3513
|
-
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
|
|
3514
|
-
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
|
3553
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3554
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
|
3555
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
|
3515
3556
|
|
|
3516
3557
|
auto & layer = model.layers[i];
|
|
3517
3558
|
|
|
3518
|
-
layer.attn_norm = ml.create_tensor(
|
|
3519
|
-
layer.attn_norm_b = ml.create_tensor(
|
|
3559
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
|
3560
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
|
3520
3561
|
|
|
3521
|
-
layer.wqkv = ml.create_tensor(
|
|
3522
|
-
layer.bqkv = ml.create_tensor(
|
|
3562
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
|
3563
|
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
|
|
3523
3564
|
|
|
3524
|
-
layer.wo = ml.create_tensor(
|
|
3525
|
-
layer.bo = ml.create_tensor(
|
|
3565
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
|
3566
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
|
3526
3567
|
|
|
3527
|
-
layer.ffn_norm = ml.create_tensor(
|
|
3528
|
-
layer.ffn_norm_b = ml.create_tensor(
|
|
3568
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
|
3569
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
|
3529
3570
|
|
|
3530
|
-
layer.ffn_down = ml.create_tensor(
|
|
3531
|
-
layer.ffn_down_b = ml.create_tensor(
|
|
3571
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
|
3572
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
|
3532
3573
|
|
|
3533
|
-
layer.ffn_up
|
|
3534
|
-
layer.ffn_up_b
|
|
3574
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
3575
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
|
3535
3576
|
}
|
|
3536
3577
|
} break;
|
|
3537
3578
|
case LLM_ARCH_MPT:
|
|
3538
3579
|
{
|
|
3539
|
-
model.tok_embd = ml.create_tensor(
|
|
3580
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
3581
|
+
|
|
3540
3582
|
// output
|
|
3541
3583
|
{
|
|
3542
|
-
|
|
3543
|
-
|
|
3544
|
-
|
|
3545
|
-
if (n_gpu_layers > int(n_layer)) {
|
|
3546
|
-
backend_norm = llama_backend_offload;
|
|
3547
|
-
backend_output = llama_backend_offload_split;
|
|
3548
|
-
} else {
|
|
3549
|
-
backend_norm = GGML_BACKEND_CPU;
|
|
3550
|
-
backend_output = GGML_BACKEND_CPU;
|
|
3551
|
-
}
|
|
3552
|
-
|
|
3553
|
-
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
|
3554
|
-
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
|
3584
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
3585
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
|
3555
3586
|
}
|
|
3556
3587
|
|
|
3557
|
-
|
|
3558
|
-
|
|
3559
|
-
|
|
3560
|
-
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
|
3561
|
-
|
|
3562
|
-
const int i_gpu_start = n_layer - n_gpu_layers;
|
|
3563
|
-
|
|
3564
|
-
model.layers.resize(n_layer);
|
|
3565
|
-
|
|
3566
|
-
for (uint32_t i = 0; i < n_layer; ++i) {
|
|
3567
|
-
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
|
|
3568
|
-
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
|
3588
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3589
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
|
3590
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
|
3569
3591
|
|
|
3570
3592
|
auto & layer = model.layers[i];
|
|
3571
3593
|
|
|
3572
|
-
layer.attn_norm = ml.create_tensor(
|
|
3573
|
-
layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
|
|
3574
|
-
layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
|
|
3594
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
|
3575
3595
|
|
|
3576
|
-
layer.
|
|
3596
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
|
3597
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
|
3577
3598
|
|
|
3578
|
-
layer.
|
|
3579
|
-
layer.
|
|
3599
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
|
3600
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
|
3601
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
3580
3602
|
|
|
3581
3603
|
// AWQ ScaleActivation layer
|
|
3582
|
-
layer.ffn_act = ml.create_tensor(
|
|
3604
|
+
layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
|
|
3583
3605
|
}
|
|
3584
3606
|
} break;
|
|
3585
3607
|
case LLM_ARCH_STABLELM:
|
|
3586
3608
|
{
|
|
3587
|
-
model.tok_embd = ml.create_tensor(
|
|
3609
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
3588
3610
|
|
|
3589
3611
|
// output
|
|
3590
3612
|
{
|
|
3591
|
-
|
|
3592
|
-
|
|
3593
|
-
|
|
3594
|
-
if (n_gpu_layers > int(n_layer)) {
|
|
3595
|
-
backend_norm = llama_backend_offload;
|
|
3596
|
-
backend_output = llama_backend_offload_split;
|
|
3597
|
-
} else {
|
|
3598
|
-
backend_norm = GGML_BACKEND_CPU;
|
|
3599
|
-
backend_output = GGML_BACKEND_CPU;
|
|
3600
|
-
}
|
|
3601
|
-
|
|
3602
|
-
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
|
3603
|
-
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
|
3604
|
-
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
|
3613
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
|
3614
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
3615
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
|
3605
3616
|
}
|
|
3606
3617
|
|
|
3607
|
-
|
|
3608
|
-
|
|
3609
|
-
|
|
3610
|
-
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
|
3611
|
-
|
|
3612
|
-
const int i_gpu_start = n_layer - n_gpu_layers;
|
|
3613
|
-
|
|
3614
|
-
model.layers.resize(n_layer);
|
|
3615
|
-
|
|
3616
|
-
for (uint32_t i = 0; i < n_layer; ++i) {
|
|
3617
|
-
/*
|
|
3618
|
-
llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
|
|
3619
|
-
*/
|
|
3620
|
-
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
|
|
3621
|
-
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
|
3618
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3619
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
|
3620
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
|
3622
3621
|
|
|
3623
3622
|
auto & layer = model.layers[i];
|
|
3624
3623
|
|
|
3625
|
-
layer.attn_norm =
|
|
3626
|
-
layer.attn_norm_b = ml.create_tensor(
|
|
3624
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
|
3625
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
|
3627
3626
|
|
|
3628
|
-
layer.wq = ml.create_tensor(
|
|
3629
|
-
layer.wk = ml.create_tensor(
|
|
3630
|
-
layer.wv = ml.create_tensor(
|
|
3631
|
-
layer.wo = ml.create_tensor(
|
|
3627
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
|
3628
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
|
3629
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
|
3630
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
|
3632
3631
|
|
|
3633
|
-
layer.ffn_norm
|
|
3634
|
-
layer.ffn_norm_b = ml.create_tensor(
|
|
3632
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
|
3633
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
|
3635
3634
|
|
|
3636
|
-
layer.ffn_gate = ml.create_tensor(
|
|
3637
|
-
layer.ffn_down = ml.create_tensor(
|
|
3638
|
-
layer.ffn_up
|
|
3635
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
|
3636
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
|
3637
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
3639
3638
|
}
|
|
3640
3639
|
} break;
|
|
3641
3640
|
case LLM_ARCH_QWEN:
|
|
3642
3641
|
{
|
|
3643
|
-
model.tok_embd = ml.create_tensor(
|
|
3644
|
-
{
|
|
3645
|
-
ggml_backend_type backend_norm;
|
|
3646
|
-
ggml_backend_type backend_output;
|
|
3647
|
-
|
|
3648
|
-
if (n_gpu_layers > int(n_layer)) {
|
|
3649
|
-
backend_norm = llama_backend_offload;
|
|
3650
|
-
backend_output = llama_backend_offload_split;
|
|
3651
|
-
} else {
|
|
3652
|
-
backend_norm = GGML_BACKEND_CPU;
|
|
3653
|
-
backend_output = GGML_BACKEND_CPU;
|
|
3654
|
-
}
|
|
3655
|
-
|
|
3656
|
-
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
|
3657
|
-
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
|
3658
|
-
}
|
|
3659
|
-
|
|
3660
|
-
const uint32_t n_ff = hparams.n_ff / 2;
|
|
3661
|
-
|
|
3662
|
-
const int i_gpu_start = n_layer - n_gpu_layers;
|
|
3642
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
3663
3643
|
|
|
3664
|
-
|
|
3644
|
+
// output
|
|
3645
|
+
{
|
|
3646
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
3647
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
|
3648
|
+
}
|
|
3665
3649
|
|
|
3666
|
-
for (
|
|
3667
|
-
|
|
3668
|
-
|
|
3650
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3651
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
|
3652
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
|
3669
3653
|
|
|
3670
3654
|
auto & layer = model.layers[i];
|
|
3671
3655
|
|
|
3672
|
-
layer.attn_norm = ml.create_tensor(
|
|
3656
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
|
3673
3657
|
|
|
3674
|
-
layer.wqkv = ml.create_tensor(
|
|
3675
|
-
layer.bqkv = ml.create_tensor(
|
|
3676
|
-
layer.wo = ml.create_tensor(
|
|
3658
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3});
|
|
3659
|
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3});
|
|
3660
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
|
3677
3661
|
|
|
3678
|
-
layer.ffn_norm = ml.create_tensor(
|
|
3662
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
|
3679
3663
|
|
|
3680
|
-
layer.ffn_gate = ml.create_tensor(
|
|
3681
|
-
layer.ffn_down = ml.create_tensor(
|
|
3682
|
-
layer.ffn_up = ml.create_tensor(
|
|
3664
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2});
|
|
3665
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd});
|
|
3666
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2});
|
|
3683
3667
|
}
|
|
3684
3668
|
} break;
|
|
3685
3669
|
case LLM_ARCH_PHI2:
|
|
3686
3670
|
{
|
|
3687
|
-
model.tok_embd = ml.create_tensor(
|
|
3671
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
3688
3672
|
|
|
3689
3673
|
// output
|
|
3690
3674
|
{
|
|
3691
|
-
|
|
3692
|
-
|
|
3693
|
-
|
|
3694
|
-
|
|
3695
|
-
backend_norm = llama_backend_offload;
|
|
3696
|
-
backend_output = llama_backend_offload;
|
|
3697
|
-
} else {
|
|
3698
|
-
backend_norm = GGML_BACKEND_CPU;
|
|
3699
|
-
backend_output = GGML_BACKEND_CPU;
|
|
3700
|
-
}
|
|
3701
|
-
|
|
3702
|
-
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
|
3703
|
-
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
|
3704
|
-
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
|
3705
|
-
model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
|
|
3675
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
3676
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
|
3677
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
|
3678
|
+
model.output_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab});
|
|
3706
3679
|
}
|
|
3707
3680
|
|
|
3708
|
-
|
|
3709
|
-
|
|
3710
|
-
|
|
3711
|
-
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
|
3681
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3682
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
|
3683
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
|
3712
3684
|
|
|
3713
|
-
|
|
3685
|
+
auto & layer = model.layers[i];
|
|
3714
3686
|
|
|
3715
|
-
|
|
3687
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
|
3688
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
|
3716
3689
|
|
|
3717
|
-
|
|
3718
|
-
|
|
3719
|
-
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
|
3690
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
|
|
3691
|
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
|
|
3720
3692
|
|
|
3721
|
-
|
|
3693
|
+
if (layer.wqkv == nullptr) {
|
|
3694
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
|
3695
|
+
layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
|
|
3722
3696
|
|
|
3723
|
-
|
|
3724
|
-
|
|
3697
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
|
3698
|
+
layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
|
|
3725
3699
|
|
|
3726
|
-
|
|
3727
|
-
|
|
3700
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
|
3701
|
+
layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
|
|
3702
|
+
}
|
|
3728
3703
|
|
|
3729
|
-
layer.wo = ml.create_tensor(
|
|
3730
|
-
layer.bo = ml.create_tensor(
|
|
3704
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
|
3705
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
|
3731
3706
|
|
|
3732
|
-
layer.ffn_down = ml.create_tensor(
|
|
3733
|
-
layer.ffn_down_b = ml.create_tensor(
|
|
3707
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
|
3708
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
|
3734
3709
|
|
|
3735
|
-
layer.ffn_up
|
|
3736
|
-
layer.ffn_up_b
|
|
3710
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
3711
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
|
3737
3712
|
}
|
|
3738
3713
|
} break;
|
|
3739
3714
|
case LLM_ARCH_PLAMO:
|
|
3740
3715
|
{
|
|
3741
|
-
model.tok_embd = ml.create_tensor(
|
|
3716
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
3742
3717
|
|
|
3743
3718
|
// output
|
|
3744
3719
|
{
|
|
3745
|
-
|
|
3746
|
-
|
|
3747
|
-
|
|
3748
|
-
if (n_gpu_layers > int(n_layer)) {
|
|
3749
|
-
backend_norm = llama_backend_offload;
|
|
3750
|
-
backend_output = llama_backend_offload_split;
|
|
3751
|
-
} else {
|
|
3752
|
-
backend_norm = GGML_BACKEND_CPU;
|
|
3753
|
-
backend_output = GGML_BACKEND_CPU;
|
|
3754
|
-
}
|
|
3755
|
-
|
|
3756
|
-
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
|
3757
|
-
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
|
3720
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
3721
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
|
3758
3722
|
}
|
|
3759
3723
|
|
|
3760
|
-
|
|
3761
|
-
|
|
3762
|
-
|
|
3763
|
-
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
|
3764
|
-
|
|
3765
|
-
const int i_gpu_start = n_layer - n_gpu_layers;
|
|
3766
|
-
|
|
3767
|
-
model.layers.resize(n_layer);
|
|
3768
|
-
|
|
3769
|
-
for (uint32_t i = 0; i < n_layer; ++i) {
|
|
3770
|
-
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
|
|
3771
|
-
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
|
3724
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3725
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
|
3726
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
|
3772
3727
|
|
|
3773
3728
|
auto & layer = model.layers[i];
|
|
3774
3729
|
|
|
3775
|
-
layer.attn_norm = ml.create_tensor(
|
|
3730
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
|
3776
3731
|
|
|
3777
|
-
layer.wq = ml.create_tensor(
|
|
3778
|
-
layer.wk = ml.create_tensor(
|
|
3779
|
-
layer.wv = ml.create_tensor(
|
|
3780
|
-
layer.wo = ml.create_tensor(
|
|
3732
|
+
layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
|
|
3733
|
+
layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
|
|
3734
|
+
layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
|
|
3735
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
|
3781
3736
|
|
|
3782
|
-
layer.ffn_gate = ml.create_tensor(
|
|
3783
|
-
layer.ffn_down = ml.create_tensor(
|
|
3784
|
-
layer.ffn_up = ml.create_tensor(
|
|
3737
|
+
layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
|
|
3738
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
|
|
3739
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
3785
3740
|
}
|
|
3786
3741
|
} break;
|
|
3787
3742
|
case LLM_ARCH_GPT2:
|
|
3788
3743
|
{
|
|
3789
|
-
model.tok_embd = ml.create_tensor(
|
|
3790
|
-
model.pos_embd = ml.create_tensor(
|
|
3744
|
+
model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
|
|
3745
|
+
model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
|
|
3791
3746
|
|
|
3792
3747
|
// output
|
|
3793
3748
|
{
|
|
3794
|
-
|
|
3795
|
-
|
|
3796
|
-
|
|
3797
|
-
if (n_gpu_layers > int(n_layer)) {
|
|
3798
|
-
backend_norm = llama_backend_offload;
|
|
3799
|
-
backend_output = llama_backend_offload_split;
|
|
3800
|
-
} else {
|
|
3801
|
-
backend_norm = GGML_BACKEND_CPU;
|
|
3802
|
-
backend_output = GGML_BACKEND_CPU;
|
|
3803
|
-
}
|
|
3804
|
-
|
|
3805
|
-
model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
|
|
3806
|
-
model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
|
|
3807
|
-
model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
|
|
3749
|
+
model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
|
|
3750
|
+
model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
|
|
3751
|
+
model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
|
|
3808
3752
|
}
|
|
3809
3753
|
|
|
3810
|
-
|
|
3811
|
-
|
|
3812
|
-
|
|
3813
|
-
GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
|
|
3814
|
-
|
|
3815
|
-
const int i_gpu_start = n_layer - n_gpu_layers;
|
|
3816
|
-
|
|
3817
|
-
model.layers.resize(n_layer);
|
|
3818
|
-
|
|
3819
|
-
for (uint32_t i = 0; i < n_layer; ++i) {
|
|
3820
|
-
const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
|
|
3821
|
-
const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
|
|
3754
|
+
for (int i = 0; i < n_layer; ++i) {
|
|
3755
|
+
ggml_context * ctx_layer = ctx_for_layer(i);
|
|
3756
|
+
ggml_context * ctx_split = ctx_for_layer_split(i);
|
|
3822
3757
|
|
|
3823
3758
|
auto & layer = model.layers[i];
|
|
3824
3759
|
|
|
3825
|
-
layer.attn_norm = ml.create_tensor(
|
|
3826
|
-
layer.attn_norm_b = ml.create_tensor(
|
|
3760
|
+
layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
|
|
3761
|
+
layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
|
|
3827
3762
|
|
|
3828
|
-
layer.wqkv = ml.create_tensor(
|
|
3829
|
-
layer.bqkv = ml.create_tensor(
|
|
3763
|
+
layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
|
|
3764
|
+
layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
|
|
3830
3765
|
|
|
3831
|
-
layer.wo = ml.create_tensor(
|
|
3832
|
-
layer.bo = ml.create_tensor(
|
|
3766
|
+
layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
|
|
3767
|
+
layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
|
|
3833
3768
|
|
|
3834
|
-
layer.ffn_norm = ml.create_tensor(
|
|
3835
|
-
layer.ffn_norm_b = ml.create_tensor(
|
|
3769
|
+
layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
|
|
3770
|
+
layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
|
|
3836
3771
|
|
|
3837
|
-
layer.ffn_down = ml.create_tensor(
|
|
3838
|
-
layer.ffn_down_b = ml.create_tensor(
|
|
3772
|
+
layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
|
|
3773
|
+
layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
|
|
3839
3774
|
|
|
3840
|
-
layer.ffn_up
|
|
3841
|
-
layer.ffn_up_b
|
|
3775
|
+
layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
|
|
3776
|
+
layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
|
|
3842
3777
|
}
|
|
3843
3778
|
} break;
|
|
3844
3779
|
default:
|
|
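The hunks above all apply the same transformation: `ml.create_tensor` no longer takes a per-tensor `ggml_backend_type`, and the `n_gpu_layers`/`i_gpu_start` bookkeeping disappears from every architecture. Each tensor is instead created inside the `ggml_context` that matches its role: `ctx_input`/`ctx_output` for the embeddings and output head, `ctx_for_layer(i)` for small per-layer tensors (norms, biases), and `ctx_for_layer_split(i)` for the large matrices that may be row-split across devices. A condensed sketch of the per-layer pattern, assuming the surrounding `llm_load_tensors` scope (the `ml`, `model`, `tn` and `ctx_for_*` helpers used above):

    for (int i = 0; i < n_layer; ++i) {
        ggml_context * ctx_layer = ctx_for_layer(i);        // norms and biases: stay with the layer's backend
        ggml_context * ctx_split = ctx_for_layer_split(i);  // big matmul weights: may be row-split

        auto & layer = model.layers[i];

        layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
        layer.wq        = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q,    "weight", i), {n_embd, n_embd});
        layer.ffn_down  = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN,  "weight", i), {n_ff, n_embd});
    }

Which backend each context ends up on is decided once, outside the per-architecture switch, so the per-architecture code no longer needs to know anything about GPU offloading.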
@@ -3848,78 +3783,51 @@ static bool llm_load_tensors(
|
|
|
3848
3783
|
|
|
3849
3784
|
ml.done_getting_tensors();
|
|
3850
3785
|
|
|
3851
|
-
ml.init_mapping();
|
|
3786
|
+
ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);
|
|
3852
3787
|
|
|
3853
|
-
//
|
|
3854
|
-
|
|
3855
|
-
size_t buf_size = 0;
|
|
3788
|
+
// create the backend buffers
|
|
3789
|
+
std::vector<std::pair<ggml_context *, ggml_backend_buffer_t>> ctx_bufs;
|
|
3856
3790
|
|
|
3857
|
-
|
|
3791
|
+
for (auto & it : ctx_map) {
|
|
3792
|
+
ggml_backend_buffer_type_t buft = it.first;
|
|
3793
|
+
ggml_context * ctx = it.second;
|
|
3794
|
+
ggml_backend_buffer_t buf = nullptr;
|
|
3858
3795
|
|
|
3859
|
-
|
|
3860
|
-
//
|
|
3861
|
-
|
|
3862
|
-
|
|
3863
|
-
|
|
3864
|
-
|
|
3796
|
+
// only the mmap region containing the tensors in the model is mapped to the backend buffer
|
|
3797
|
+
// this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
|
|
3798
|
+
// this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
|
|
3799
|
+
if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
|
|
3800
|
+
size_t first, last;
|
|
3801
|
+
ml.get_mapping_range(&first, &last, ctx);
|
|
3802
|
+
buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
|
|
3865
3803
|
}
|
|
3866
|
-
}
|
|
3867
|
-
|
|
3868
|
-
// create backend buffer
|
|
3869
|
-
ggml_backend_buffer_t buf_mmap = nullptr;
|
|
3870
|
-
|
|
3871
3804
|
#ifdef GGML_USE_METAL
|
|
3872
|
-
|
|
3873
|
-
if (ml.use_mmap) {
|
|
3805
|
+
else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
|
|
3874
3806
|
const size_t max_size = ggml_get_max_tensor_size(ctx);
|
|
3875
|
-
|
|
3876
|
-
|
|
3877
|
-
|
|
3878
|
-
model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
|
|
3807
|
+
size_t first, last;
|
|
3808
|
+
ml.get_mapping_range(&first, &last, ctx);
|
|
3809
|
+
buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size);
|
|
3879
3810
|
}
|
|
3880
|
-
}
|
|
3881
|
-
#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
|
3882
|
-
// for testing only
|
|
3883
|
-
if (n_gpu_layers > 0) {
|
|
3884
|
-
model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0));
|
|
3885
|
-
}
|
|
3886
3811
|
#endif
|
|
3887
|
-
|
|
3888
|
-
|
|
3889
|
-
|
|
3890
|
-
|
|
3891
|
-
|
|
3892
|
-
buf_mmap = model.buf;
|
|
3893
|
-
} else {
|
|
3894
|
-
// allocate only CPU tensors
|
|
3895
|
-
model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
|
|
3896
|
-
ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf);
|
|
3897
|
-
for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
|
|
3898
|
-
if (t->backend == GGML_BACKEND_CPU) {
|
|
3899
|
-
ggml_tallocr_alloc(alloc, t);
|
|
3900
|
-
}
|
|
3812
|
+
else {
|
|
3813
|
+
buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
|
|
3814
|
+
if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
|
|
3815
|
+
model.mlock_buf.init (ggml_backend_buffer_get_base(buf));
|
|
3816
|
+
model.mlock_buf.grow_to(ggml_backend_buffer_get_size(buf));
|
|
3901
3817
|
}
|
|
3902
|
-
ggml_tallocr_free(alloc);
|
|
3903
3818
|
}
|
|
3904
|
-
|
|
3905
|
-
|
|
3906
|
-
|
|
3907
|
-
|
|
3908
|
-
|
|
3819
|
+
if (buf == nullptr) {
|
|
3820
|
+
throw std::runtime_error("failed to allocate buffer");
|
|
3821
|
+
}
|
|
3822
|
+
// indicate that this buffer contains weights
|
|
3823
|
+
// this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
|
|
3824
|
+
ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
|
|
3825
|
+
model.bufs.push_back(buf);
|
|
3826
|
+
ctx_bufs.emplace_back(ctx, buf);
|
|
3909
3827
|
}
|
|
3910
3828
|
|
|
3911
3829
|
// print memory requirements
|
|
3912
3830
|
{
|
|
3913
|
-
size_t sys_mem_required = ctx_size + buf_size;
|
|
3914
|
-
|
|
3915
|
-
if (sys_mem_required > 0) {
|
|
3916
|
-
LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0);
|
|
3917
|
-
}
|
|
3918
|
-
if (vram_weights > 0) {
|
|
3919
|
-
LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
|
|
3920
|
-
}
|
|
3921
|
-
|
|
3922
|
-
#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
|
|
3923
3831
|
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
|
|
3924
3832
|
|
|
3925
3833
|
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
|
|
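This hunk replaces the manual buffer-size accounting with one backend buffer per (buffer type, context) pair: mmapped CPU and Metal regions are wrapped in place with `ggml_backend_cpu_buffer_from_ptr` / `ggml_backend_metal_buffer_from_ptr`, everything else goes through `ggml_backend_alloc_ctx_tensors_from_buft`, and each buffer is tagged as holding weights so that `ggml_backend_sched` schedules ops onto the backend that already owns them. A trimmed sketch of the non-mmap branch, assuming the surrounding scope (`ctx_map`, `model`, `use_mlock`):

    for (auto & it : ctx_map) {
        ggml_backend_buffer_type_t buft = it.first;
        ggml_context *             ctx  = it.second;

        // allocate backing memory of the matching type and place this context's tensors in it
        ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
        if (buf == nullptr) {
            throw std::runtime_error("failed to allocate buffer");
        }
        if (use_mlock && ggml_backend_buffer_is_host(buf)) {
            model.mlock_buf.init   (ggml_backend_buffer_get_base(buf));
            model.mlock_buf.grow_to(ggml_backend_buffer_get_size(buf));
        }

        // scheduling hint: prefer running ops where their weights already live
        ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
        model.bufs.push_back(buf);
    }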
@@ -3931,23 +3839,26 @@ static bool llm_load_tensors(
|
|
|
3931
3839
|
const int max_offloadable_layers = hparams.n_layer + 1;
|
|
3932
3840
|
|
|
3933
3841
|
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
|
|
3934
|
-
#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
|
|
3935
|
-
}
|
|
3936
3842
|
|
|
3937
|
-
|
|
3938
|
-
|
|
3939
|
-
|
|
3940
|
-
|
|
3941
|
-
#endif // GGML_USE_CUBLAS
|
|
3843
|
+
for (ggml_backend_buffer_t buf : model.bufs) {
|
|
3844
|
+
LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
|
|
3845
|
+
}
|
|
3846
|
+
}
|
|
3942
3847
|
|
|
3943
3848
|
// populate tensors_by_name
|
|
3944
|
-
for (
|
|
3945
|
-
|
|
3946
|
-
|
|
3849
|
+
for (ggml_context * ctx : model.ctxs) {
|
|
3850
|
+
for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
|
|
3851
|
+
model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
|
|
3852
|
+
}
|
|
3947
3853
|
}
|
|
3948
3854
|
|
|
3949
|
-
|
|
3950
|
-
|
|
3855
|
+
// load tensor data
|
|
3856
|
+
for (auto & it : ctx_bufs) {
|
|
3857
|
+
ggml_context * ctx = it.first;
|
|
3858
|
+
ggml_backend_buffer_t buf = it.second;
|
|
3859
|
+
if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) {
|
|
3860
|
+
return false;
|
|
3861
|
+
}
|
|
3951
3862
|
}
|
|
3952
3863
|
|
|
3953
3864
|
model.mapping = std::move(ml.mapping);
|
|
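Tensor data is then loaded per context/buffer pair, which is what lets the mmap-backed buffers avoid a copy altogether, and the old "system memory used / VRAM used" estimates are replaced by a per-buffer size log reported by each backend. A minimal sketch of the final loading stage, under the same scope assumptions as the sketches above:

    for (auto & it : ctx_bufs) {
        ggml_context *        ctx = it.first;
        ggml_backend_buffer_t buf = it.second;

        // read (or map) the data of every tensor created in this context into its buffer
        if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf,
                              use_mlock ? &model.mlock_mmap : NULL)) {
            return false;
        }
    }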
@@ -3981,13 +3892,13 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
|
|
|
3981
3892
|
}
|
|
3982
3893
|
|
|
3983
3894
|
if (!llm_load_tensors(
|
|
3984
|
-
ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
|
|
3895
|
+
ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
|
|
3985
3896
|
params.progress_callback, params.progress_callback_user_data
|
|
3986
3897
|
)) {
|
|
3987
3898
|
return -2;
|
|
3988
3899
|
}
|
|
3989
3900
|
} catch (const std::exception & err) {
|
|
3990
|
-
LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
|
|
3901
|
+
LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
|
|
3991
3902
|
return -1;
|
|
3992
3903
|
}
|
|
3993
3904
|
|
|
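`llm_load_tensors` now also receives `params.split_mode`, the new `llama_model_params` field added by the llama.h changes in this release, which chooses how weights are distributed across multiple GPUs. A hypothetical caller-side sketch; the enum spellings (`LLAMA_SPLIT_LAYER`, `LLAMA_SPLIT_ROW`) and the model path are assumptions, not part of this diff:

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;                // offload as many layers as will fit
    mparams.split_mode   = LLAMA_SPLIT_LAYER; // whole layers per device; LLAMA_SPLIT_ROW splits the large matrices
    mparams.main_gpu     = 0;                 // device that hosts whatever is not split

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);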
@@ -4059,7 +3970,6 @@ static void llm_build_k_shift(
|
|
|
4059
3970
|
struct ggml_cgraph * graph,
|
|
4060
3971
|
llm_rope_type type,
|
|
4061
3972
|
int64_t n_ctx,
|
|
4062
|
-
int n_rot,
|
|
4063
3973
|
float freq_base,
|
|
4064
3974
|
float freq_scale,
|
|
4065
3975
|
const llm_build_cb & cb) {
|
|
@@ -4067,14 +3977,13 @@ static void llm_build_k_shift(
|
|
|
4067
3977
|
const int64_t n_head_kv = hparams.n_head_kv;
|
|
4068
3978
|
const int64_t n_embd_head_k = hparams.n_embd_head_k;
|
|
4069
3979
|
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
|
|
3980
|
+
const int32_t n_rot = hparams.n_rot;
|
|
4070
3981
|
const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
|
|
4071
3982
|
const float ext_factor = cparams.yarn_ext_factor;
|
|
4072
3983
|
const float attn_factor = cparams.yarn_attn_factor;
|
|
4073
3984
|
const float beta_fast = cparams.yarn_beta_fast;
|
|
4074
3985
|
const float beta_slow = cparams.yarn_beta_slow;
|
|
4075
3986
|
|
|
4076
|
-
GGML_ASSERT(n_embd_head_k % n_rot == 0);
|
|
4077
|
-
|
|
4078
3987
|
struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
|
|
4079
3988
|
cb(K_shift, "K_shift", -1);
|
|
4080
3989
|
|
|
@@ -4433,8 +4342,6 @@ struct llm_build_context {
|
|
|
4433
4342
|
do_rope_shift (worst_case || kv_self.has_shift),
|
|
4434
4343
|
cb (cb),
|
|
4435
4344
|
buf_compute_meta (lctx.buf_compute_meta) {
|
|
4436
|
-
GGML_ASSERT(!!kv_self.ctx);
|
|
4437
|
-
|
|
4438
4345
|
// all initializations should be done in init()
|
|
4439
4346
|
}
|
|
4440
4347
|
|
|
@@ -4478,7 +4385,7 @@ struct llm_build_context {
|
|
|
4478
4385
|
|
|
4479
4386
|
// shift the entire K-cache if needed
|
|
4480
4387
|
if (do_rope_shift) {
|
|
4481
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx,
|
|
4388
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
|
4482
4389
|
}
|
|
4483
4390
|
|
|
4484
4391
|
for (int il = 0; il < n_layer; ++il) {
|
|
@@ -4514,16 +4421,22 @@ struct llm_build_context {
|
|
|
4514
4421
|
cb(Vcur, "Vcur", il);
|
|
4515
4422
|
}
|
|
4516
4423
|
|
|
4424
|
+
// these nodes are added to the graph together so that they are not reordered
|
|
4425
|
+
// by doing so, the number of splits in the graph is reduced
|
|
4426
|
+
ggml_build_forward_expand(gf, Qcur);
|
|
4427
|
+
ggml_build_forward_expand(gf, Kcur);
|
|
4428
|
+
ggml_build_forward_expand(gf, Vcur);
|
|
4429
|
+
|
|
4517
4430
|
Qcur = ggml_rope_custom(
|
|
4518
4431
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
4519
|
-
|
|
4432
|
+
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
|
4520
4433
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
4521
4434
|
);
|
|
4522
4435
|
cb(Qcur, "Qcur", il);
|
|
4523
4436
|
|
|
4524
4437
|
Kcur = ggml_rope_custom(
|
|
4525
4438
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
4526
|
-
|
|
4439
|
+
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
|
4527
4440
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
4528
4441
|
);
|
|
4529
4442
|
cb(Kcur, "Kcur", il);
|
|
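The comment added here is the noteworthy part: expanding Qcur/Kcur/Vcur into the graph back to back keeps the three projections adjacent, so the scheduler produces fewer graph splits (fewer backend switches per layer). The RoPE calls also now pass the rotation width explicitly from `hparams.n_rot`. A sketch of the pattern, assuming the builder scope shown above:

    // keep the three projections adjacent in the graph -> fewer scheduler splits
    ggml_build_forward_expand(gf, Qcur);
    ggml_build_forward_expand(gf, Kcur);
    ggml_build_forward_expand(gf, Vcur);

    Qcur = ggml_rope_custom(
        ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
        hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
        ext_factor, attn_factor, beta_fast, beta_slow);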
@@ -4646,6 +4559,7 @@ struct llm_build_context {
|
|
|
4646
4559
|
|
|
4647
4560
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
4648
4561
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
4562
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
4649
4563
|
|
|
4650
4564
|
struct ggml_tensor * cur;
|
|
4651
4565
|
struct ggml_tensor * inpL;
|
|
@@ -4663,7 +4577,7 @@ struct llm_build_context {
|
|
|
4663
4577
|
|
|
4664
4578
|
// shift the entire K-cache if needed
|
|
4665
4579
|
if (do_rope_shift) {
|
|
4666
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx,
|
|
4580
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
|
4667
4581
|
}
|
|
4668
4582
|
|
|
4669
4583
|
for (int il = 0; il < n_layer; ++il) {
|
|
@@ -4689,12 +4603,12 @@ struct llm_build_context {
|
|
|
4689
4603
|
case MODEL_7B:
|
|
4690
4604
|
Qcur = ggml_rope_custom(
|
|
4691
4605
|
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
|
|
4692
|
-
|
|
4606
|
+
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
|
4693
4607
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
4694
4608
|
);
|
|
4695
4609
|
Kcur = ggml_rope_custom(
|
|
4696
4610
|
ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
|
|
4697
|
-
|
|
4611
|
+
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
|
|
4698
4612
|
ext_factor, attn_factor, beta_fast, beta_slow
|
|
4699
4613
|
);
|
|
4700
4614
|
break;
|
|
@@ -4767,7 +4681,7 @@ struct llm_build_context {
|
|
|
4767
4681
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
4768
4682
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
4769
4683
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
4770
|
-
GGML_ASSERT(
|
|
4684
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
4771
4685
|
|
|
4772
4686
|
struct ggml_tensor * cur;
|
|
4773
4687
|
struct ggml_tensor * inpL;
|
|
@@ -4785,7 +4699,7 @@ struct llm_build_context {
|
|
|
4785
4699
|
|
|
4786
4700
|
// shift the entire K-cache if needed
|
|
4787
4701
|
if (do_rope_shift) {
|
|
4788
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx,
|
|
4702
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
|
4789
4703
|
}
|
|
4790
4704
|
|
|
4791
4705
|
for (int il = 0; il < n_layer; ++il) {
|
|
@@ -4826,13 +4740,13 @@ struct llm_build_context {
|
|
|
4826
4740
|
|
|
4827
4741
|
// using mode = 2 for neox mode
|
|
4828
4742
|
Qcur = ggml_rope_custom(
|
|
4829
|
-
ctx0, Qcur, inp_pos,
|
|
4743
|
+
ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
|
4830
4744
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
4831
4745
|
);
|
|
4832
4746
|
cb(Qcur, "Qcur", il);
|
|
4833
4747
|
|
|
4834
4748
|
Kcur = ggml_rope_custom(
|
|
4835
|
-
ctx0, Kcur, inp_pos,
|
|
4749
|
+
ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
|
4836
4750
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
4837
4751
|
);
|
|
4838
4752
|
cb(Kcur, "Kcur", il);
|
|
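For the NeoX-style call sites the previously defaulted arguments are now written out in full. An annotated sketch of one such call; the per-argument notes are my reading of the ggml_rope_custom parameters as used here:

    Qcur = ggml_rope_custom(
        ctx0, Qcur, inp_pos,
        hparams.n_rot,  // how many leading dimensions of each head are rotated
        2,              // mode 2 = NeoX rotation ("using mode = 2 for neox mode" above)
        0,              // n_ctx, passed as 0 at these call sites
        n_orig_ctx,     // original training context (cparams.n_yarn_orig_ctx), used for YaRN scaling
        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);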
@@ -4891,7 +4805,6 @@ struct llm_build_context {
|
|
|
4891
4805
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
4892
4806
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
4893
4807
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
4894
|
-
GGML_ASSERT(n_embd_gqa == n_embd);
|
|
4895
4808
|
|
|
4896
4809
|
struct ggml_tensor * cur;
|
|
4897
4810
|
struct ggml_tensor * pos;
|
|
@@ -4990,17 +4903,14 @@ struct llm_build_context {
|
|
|
4990
4903
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
|
4991
4904
|
|
|
4992
4905
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
4993
|
-
|
|
4994
|
-
GGML_ASSERT(n_embd_head == hparams.
|
|
4995
|
-
GGML_ASSERT(n_embd_gqa == n_embd);
|
|
4996
|
-
|
|
4997
|
-
const int64_t n_rot = n_embd_head_k / 2;
|
|
4906
|
+
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
4907
|
+
GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
|
|
4998
4908
|
|
|
4999
4909
|
struct ggml_tensor * cur;
|
|
5000
4910
|
struct ggml_tensor * inpL;
|
|
5001
4911
|
|
|
5002
4912
|
inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
|
|
5003
|
-
cb(inpL, "
|
|
4913
|
+
cb(inpL, "inp_embd", -1);
|
|
5004
4914
|
|
|
5005
4915
|
// inp_pos - contains the positions
|
|
5006
4916
|
struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
|
|
@@ -5011,7 +4921,7 @@ struct llm_build_context {
|
|
|
5011
4921
|
cb(KQ_mask, "KQ_mask", -1);
|
|
5012
4922
|
|
|
5013
4923
|
if (do_rope_shift) {
|
|
5014
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx,
|
|
4924
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
|
5015
4925
|
}
|
|
5016
4926
|
|
|
5017
4927
|
for (int il = 0; il < n_layer; ++il) {
|
|
@@ -5071,7 +4981,7 @@ struct llm_build_context {
|
|
|
5071
4981
|
|
|
5072
4982
|
// RoPE the first n_rot of q/k, pass the other half, and concat.
|
|
5073
4983
|
struct ggml_tensor * qrot = ggml_view_3d(
|
|
5074
|
-
ctx0, tmpq, n_rot, n_head, n_tokens,
|
|
4984
|
+
ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
|
|
5075
4985
|
ggml_element_size(tmpq) * n_embd_head,
|
|
5076
4986
|
ggml_element_size(tmpq) * n_embd_head * n_head,
|
|
5077
4987
|
0
|
|
@@ -5079,7 +4989,7 @@ struct llm_build_context {
|
|
|
5079
4989
|
cb(qrot, "qrot", il);
|
|
5080
4990
|
|
|
5081
4991
|
struct ggml_tensor * krot = ggml_view_3d(
|
|
5082
|
-
ctx0, tmpk, n_rot, n_head, n_tokens,
|
|
4992
|
+
ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
|
|
5083
4993
|
ggml_element_size(tmpk) * n_embd_head,
|
|
5084
4994
|
ggml_element_size(tmpk) * n_embd_head * n_head,
|
|
5085
4995
|
0
|
|
@@ -5088,29 +4998,29 @@ struct llm_build_context {
|
|
|
5088
4998
|
|
|
5089
4999
|
// get the second half of tmpq, e.g tmpq[n_rot:, :, :]
|
|
5090
5000
|
struct ggml_tensor * qpass = ggml_view_3d(
|
|
5091
|
-
ctx0, tmpq, n_rot, n_head, n_tokens,
|
|
5001
|
+
ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
|
|
5092
5002
|
ggml_element_size(tmpq) * n_embd_head,
|
|
5093
5003
|
ggml_element_size(tmpq) * n_embd_head * n_head,
|
|
5094
|
-
ggml_element_size(tmpq) * n_rot
|
|
5004
|
+
ggml_element_size(tmpq) * hparams.n_rot
|
|
5095
5005
|
);
|
|
5096
5006
|
cb(qpass, "qpass", il);
|
|
5097
5007
|
|
|
5098
5008
|
struct ggml_tensor * kpass = ggml_view_3d(
|
|
5099
|
-
ctx0, tmpk, n_rot, n_head, n_tokens,
|
|
5009
|
+
ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
|
|
5100
5010
|
ggml_element_size(tmpk) * n_embd_head,
|
|
5101
5011
|
ggml_element_size(tmpk) * n_embd_head * n_head,
|
|
5102
|
-
ggml_element_size(tmpk) * n_rot
|
|
5012
|
+
ggml_element_size(tmpk) * hparams.n_rot
|
|
5103
5013
|
);
|
|
5104
5014
|
cb(kpass, "kpass", il);
|
|
5105
5015
|
|
|
5106
5016
|
struct ggml_tensor * qrotated = ggml_rope_custom(
|
|
5107
|
-
ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx,
|
|
5017
|
+
ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
|
5108
5018
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
5109
5019
|
);
|
|
5110
5020
|
cb(qrotated, "qrotated", il);
|
|
5111
5021
|
|
|
5112
5022
|
struct ggml_tensor * krotated = ggml_rope_custom(
|
|
5113
|
-
ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx,
|
|
5023
|
+
ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
|
5114
5024
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
5115
5025
|
);
|
|
5116
5026
|
cb(krotated, "krotated", il);
|
|
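The Persimmon builder keeps its partial-rotary scheme but reads the rotation width from `hparams.n_rot` instead of recomputing `n_embd_head_k / 2` locally; the new `GGML_ASSERT(n_embd_head/2 == hparams.n_rot)` above pins down that equivalence. A compressed sketch of the split-rotate idea for the query, assuming the builder scope:

    // first hparams.n_rot dims of each head get rotated ...
    struct ggml_tensor * qrot = ggml_view_3d(
        ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
        ggml_element_size(tmpq) * n_embd_head,
        ggml_element_size(tmpq) * n_embd_head * n_head,
        0);

    // ... the remaining dims are viewed separately, passed through and concatenated back later
    struct ggml_tensor * qpass = ggml_view_3d(
        ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
        ggml_element_size(tmpq) * n_embd_head,
        ggml_element_size(tmpq) * n_embd_head * n_head,
        ggml_element_size(tmpq) * hparams.n_rot);

    struct ggml_tensor * qrotated = ggml_rope_custom(
        ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
        freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow);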
@@ -5204,9 +5114,7 @@ struct llm_build_context {
|
|
|
5204
5114
|
struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
|
|
5205
5115
|
|
|
5206
5116
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5207
|
-
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
5208
5117
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
5209
|
-
GGML_ASSERT(n_embd_gqa == n_embd);
|
|
5210
5118
|
|
|
5211
5119
|
struct ggml_tensor * cur;
|
|
5212
5120
|
struct ggml_tensor * inpL;
|
|
@@ -5299,7 +5207,6 @@ struct llm_build_context {
|
|
|
5299
5207
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5300
5208
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
5301
5209
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
5302
|
-
GGML_ASSERT(n_embd_gqa == n_embd);
|
|
5303
5210
|
|
|
5304
5211
|
struct ggml_tensor * cur;
|
|
5305
5212
|
struct ggml_tensor * inpL;
|
|
@@ -5395,7 +5302,6 @@ struct llm_build_context {
|
|
|
5395
5302
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5396
5303
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
5397
5304
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
5398
|
-
GGML_ASSERT(n_embd_gqa == n_embd);
|
|
5399
5305
|
|
|
5400
5306
|
struct ggml_tensor * cur;
|
|
5401
5307
|
struct ggml_tensor * inpL;
|
|
@@ -5511,7 +5417,7 @@ struct llm_build_context {
|
|
|
5511
5417
|
|
|
5512
5418
|
// shift the entire K-cache if needed
|
|
5513
5419
|
if (do_rope_shift) {
|
|
5514
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx,
|
|
5420
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
|
5515
5421
|
}
|
|
5516
5422
|
|
|
5517
5423
|
for (int il = 0; il < n_layer; ++il) {
|
|
@@ -5624,7 +5530,7 @@ struct llm_build_context {
|
|
|
5624
5530
|
|
|
5625
5531
|
// shift the entire K-cache if needed
|
|
5626
5532
|
if (do_rope_shift) {
|
|
5627
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx,
|
|
5533
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
|
5628
5534
|
}
|
|
5629
5535
|
|
|
5630
5536
|
for (int il = 0; il < n_layer; ++il) {
|
|
@@ -5656,13 +5562,13 @@ struct llm_build_context {
|
|
|
5656
5562
|
|
|
5657
5563
|
// using mode = 2 for neox mode
|
|
5658
5564
|
Qcur = ggml_rope_custom(
|
|
5659
|
-
ctx0, Qcur, inp_pos,
|
|
5565
|
+
ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
|
5660
5566
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
5661
5567
|
);
|
|
5662
5568
|
cb(Qcur, "Qcur", il);
|
|
5663
5569
|
|
|
5664
5570
|
Kcur = ggml_rope_custom(
|
|
5665
|
-
ctx0, Kcur, inp_pos,
|
|
5571
|
+
ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
|
|
5666
5572
|
freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
|
|
5667
5573
|
);
|
|
5668
5574
|
cb(Kcur, "Kcur", il);
|
|
@@ -5722,7 +5628,6 @@ struct llm_build_context {
|
|
|
5722
5628
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5723
5629
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
5724
5630
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
5725
|
-
GGML_ASSERT(n_embd_gqa == n_embd);
|
|
5726
5631
|
|
|
5727
5632
|
struct ggml_tensor * cur;
|
|
5728
5633
|
struct ggml_tensor * attn_norm_output;
|
|
@@ -5742,7 +5647,7 @@ struct llm_build_context {
|
|
|
5742
5647
|
|
|
5743
5648
|
// shift the entire K-cache if needed
|
|
5744
5649
|
if (do_rope_shift) {
|
|
5745
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx,
|
|
5650
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
|
|
5746
5651
|
}
|
|
5747
5652
|
|
|
5748
5653
|
for (int il = 0; il < n_layer; ++il) {
|
|
@@ -5754,15 +5659,25 @@ struct llm_build_context {
|
|
|
5754
5659
|
|
|
5755
5660
|
// self-attention
|
|
5756
5661
|
{
|
|
5757
|
-
|
|
5758
|
-
|
|
5662
|
+
struct ggml_tensor * Qcur = nullptr;
|
|
5663
|
+
struct ggml_tensor * Kcur = nullptr;
|
|
5664
|
+
struct ggml_tensor * Vcur = nullptr;
|
|
5759
5665
|
|
|
5760
|
-
|
|
5761
|
-
|
|
5666
|
+
if (model.layers[il].wqkv) {
|
|
5667
|
+
cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
|
|
5668
|
+
cb(cur, "wqkv", il);
|
|
5762
5669
|
|
|
5763
|
-
|
|
5764
|
-
|
|
5765
|
-
|
|
5670
|
+
cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
|
|
5671
|
+
cb(cur, "bqkv", il);
|
|
5672
|
+
|
|
5673
|
+
Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
|
|
5674
|
+
Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
|
|
5675
|
+
Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
|
|
5676
|
+
} else {
|
|
5677
|
+
Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
|
|
5678
|
+
Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
|
|
5679
|
+
Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
|
|
5680
|
+
}
|
|
5766
5681
|
|
|
5767
5682
|
cb(Qcur, "Qcur", il);
|
|
5768
5683
|
cb(Kcur, "Kcur", il);
|
|
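The fused-QKV branch above carves Q, K and V out of the single wqkv result row by row: Q starts at float offset 0, K at n_embd, and V at n_embd + n_embd_gqa. A minimal standalone C++ sketch of the same offset arithmetic on a plain buffer (toy sizes, variable names and the [Q|K|V]-per-row layout assumption are illustrative only, not llama.cpp API):

#include <cstdio>
#include <vector>

int main() {
    const int n_embd     = 8;   // toy sizes for illustration
    const int n_embd_gqa = 4;
    const int n_tokens   = 2;
    const int row_size   = n_embd + 2*n_embd_gqa;   // fused [Q|K|V] row

    std::vector<float> qkv(row_size * n_tokens);
    for (size_t i = 0; i < qkv.size(); ++i) qkv[i] = (float) i;

    for (int t = 0; t < n_tokens; ++t) {
        const float * row = qkv.data() + t*row_size;
        const float * q = row;                        // byte offset 0
        const float * k = row + n_embd;               // byte offset 1*sizeof(float)*n_embd
        const float * v = row + n_embd + n_embd_gqa;  // byte offset sizeof(float)*(n_embd + n_embd_gqa)
        printf("token %d: q[0]=%g k[0]=%g v[0]=%g\n", t, q[0], k[0], v[0]);
    }
    return 0;
}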
@@ -5838,6 +5753,7 @@ struct llm_build_context {
|
|
|
5838
5753
|
|
|
5839
5754
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5840
5755
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
5756
|
+
GGML_ASSERT(n_embd_head == hparams.n_rot);
|
|
5841
5757
|
|
|
5842
5758
|
struct ggml_tensor * cur;
|
|
5843
5759
|
struct ggml_tensor * inpL;
|
|
@@ -5855,7 +5771,7 @@ struct llm_build_context {
|
|
|
5855
5771
|
|
|
5856
5772
|
// shift the entire K-cache if needed
|
|
5857
5773
|
if (do_rope_shift) {
|
|
5858
|
-
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx,
|
|
5774
|
+
llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
|
|
5859
5775
|
}
|
|
5860
5776
|
|
|
5861
5777
|
for (int il = 0; il < n_layer; ++il) {
|
|
@@ -5881,13 +5797,13 @@ struct llm_build_context {
|
|
|
5881
5797
|
cb(Vcur, "Vcur", il);
|
|
5882
5798
|
|
|
5883
5799
|
Qcur = ggml_rope_custom(
|
|
5884
|
-
ctx0, ggml_reshape_3d(ctx0, Qcur,
|
|
5800
|
+
ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos,
|
|
5885
5801
|
n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
|
5886
5802
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
5887
5803
|
cb(Qcur, "Qcur", il);
|
|
5888
5804
|
|
|
5889
5805
|
Kcur = ggml_rope_custom(
|
|
5890
|
-
ctx0, ggml_reshape_3d(ctx0, Kcur,
|
|
5806
|
+
ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos,
|
|
5891
5807
|
n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
|
|
5892
5808
|
ext_factor, attn_factor, beta_fast, beta_slow);
|
|
5893
5809
|
cb(Kcur, "Kcur", il);
|
|
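The rope calls above first reshape the flat [n_rot*n_head, n_tokens] projection into [n_rot, n_head, n_tokens] so each head is rotated independently. A toy sketch of the index mapping such a reshape implies, assuming a contiguous layout with the first dimension fastest (sizes are made up):

#include <cstdio>
#include <vector>

int main() {
    // A [n_embd_head*n_head, n_tokens] matrix viewed as [n_embd_head, n_head, n_tokens]
    // without moving any data.
    const int n_embd_head = 4;
    const int n_head      = 3;
    const int n_tokens    = 2;

    std::vector<float> q(n_embd_head * n_head * n_tokens);
    for (size_t i = 0; i < q.size(); ++i) q[i] = (float) i;

    // element (i, h, t) of the 3-D view
    auto at = [&](int i, int h, int t) {
        return q[i + h*n_embd_head + t*n_embd_head*n_head];
    };
    printf("q(1, 2, 1) = %g\n", at(1, 2, 1));   // same storage, new indexing
    return 0;
}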
@@ -5946,7 +5862,6 @@ struct llm_build_context {
|
|
|
5946
5862
|
const int64_t n_embd_head = hparams.n_embd_head_v;
|
|
5947
5863
|
const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
|
|
5948
5864
|
GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
|
|
5949
|
-
GGML_ASSERT(n_embd_gqa == n_embd);
|
|
5950
5865
|
|
|
5951
5866
|
struct ggml_tensor * cur;
|
|
5952
5867
|
struct ggml_tensor * pos;
|
|
@@ -6042,199 +5957,13 @@ struct llm_build_context {
|
|
|
6042
5957
|
}
|
|
6043
5958
|
};
|
|
6044
5959
|
|
|
6045
|
-
//
|
|
6046
|
-
// tensor offloading helpers
|
|
6047
|
-
//
|
|
6048
|
-
// TODO: will be removed with backend v2
|
|
6049
|
-
|
|
6050
|
-
enum llm_offload_func_e {
|
|
6051
|
-
OFFLOAD_FUNC_NOP,
|
|
6052
|
-
OFFLOAD_FUNC,
|
|
6053
|
-
OFFLOAD_FUNC_FRC, // force offload
|
|
6054
|
-
OFFLOAD_FUNC_KQV,
|
|
6055
|
-
OFFLOAD_FUNC_NR,
|
|
6056
|
-
OFFLOAD_FUNC_EMB, // embeddings
|
|
6057
|
-
OFFLOAD_FUNC_OUT,
|
|
6058
|
-
};
|
|
6059
|
-
|
|
6060
|
-
// TODO: will be removed with backend v2
|
|
6061
|
-
struct llm_offload_trie {
|
|
6062
|
-
struct node {
|
|
6063
|
-
~node() {
|
|
6064
|
-
for (int i = 0; i < 256; ++i) {
|
|
6065
|
-
if (children[i]) {
|
|
6066
|
-
delete children[i];
|
|
6067
|
-
}
|
|
6068
|
-
}
|
|
6069
|
-
}
|
|
6070
|
-
|
|
6071
|
-
node * children[256] = { nullptr };
|
|
6072
|
-
llm_offload_func_e func = OFFLOAD_FUNC_NOP;
|
|
6073
|
-
};
|
|
6074
|
-
|
|
6075
|
-
llm_offload_trie() {
|
|
6076
|
-
root = new node;
|
|
6077
|
-
}
|
|
6078
|
-
|
|
6079
|
-
llm_offload_trie(const std::unordered_map<const char *, llm_offload_func_e> & map) {
|
|
6080
|
-
root = new node;
|
|
6081
|
-
|
|
6082
|
-
for (const auto & kv : map) {
|
|
6083
|
-
add(kv.first, kv.second);
|
|
6084
|
-
}
|
|
6085
|
-
}
|
|
6086
|
-
|
|
6087
|
-
~llm_offload_trie() {
|
|
6088
|
-
delete root;
|
|
6089
|
-
}
|
|
6090
|
-
|
|
6091
|
-
void add(const char * name, llm_offload_func_e func) {
|
|
6092
|
-
node * cur = root;
|
|
6093
|
-
|
|
6094
|
-
for (int i = 0; ; ++i) {
|
|
6095
|
-
const uint8_t c = name[i];
|
|
6096
|
-
|
|
6097
|
-
if (!c) {
|
|
6098
|
-
break;
|
|
6099
|
-
}
|
|
6100
|
-
|
|
6101
|
-
if (!cur->children[c]) {
|
|
6102
|
-
cur->children[c] = new node;
|
|
6103
|
-
}
|
|
6104
|
-
|
|
6105
|
-
cur = cur->children[c];
|
|
6106
|
-
}
|
|
6107
|
-
|
|
6108
|
-
cur->func = func;
|
|
6109
|
-
}
|
|
6110
|
-
|
|
6111
|
-
llm_offload_func_e find(const char * name) const {
|
|
6112
|
-
const node * cur = root;
|
|
6113
|
-
|
|
6114
|
-
for (int i = 0; ; ++i) {
|
|
6115
|
-
const uint8_t c = name[i];
|
|
6116
|
-
|
|
6117
|
-
if (!c) {
|
|
6118
|
-
break;
|
|
6119
|
-
}
|
|
6120
|
-
|
|
6121
|
-
if (!cur->children[c]) {
|
|
6122
|
-
return OFFLOAD_FUNC_NOP;
|
|
6123
|
-
}
|
|
6124
|
-
|
|
6125
|
-
cur = cur->children[c];
|
|
6126
|
-
}
|
|
6127
|
-
|
|
6128
|
-
return cur->func;
|
|
6129
|
-
}
|
|
6130
|
-
|
|
6131
|
-
node * root = nullptr;
|
|
6132
|
-
};
|
|
6133
|
-
|
|
6134
|
-
// TODO: will be removed with backend v2
|
|
6135
|
-
static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map = {
|
|
6136
|
-
//{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
|
|
6137
|
-
//{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
|
|
6138
|
-
{ "pos_embd", OFFLOAD_FUNC_NR },
|
|
6139
|
-
|
|
6140
|
-
{ "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
|
|
6141
|
-
{ "KQ_mask", OFFLOAD_FUNC_FRC },
|
|
6142
|
-
{ "K_shift", OFFLOAD_FUNC_FRC },
|
|
6143
|
-
|
|
6144
|
-
{ "K_shifted", OFFLOAD_FUNC },
|
|
6145
|
-
|
|
6146
|
-
{ "inp_norm", OFFLOAD_FUNC_NR },
|
|
6147
|
-
{ "inp_norm_w", OFFLOAD_FUNC_NR },
|
|
6148
|
-
{ "inp_norm_wb", OFFLOAD_FUNC_NR },
|
|
6149
|
-
|
|
6150
|
-
{ "norm", OFFLOAD_FUNC },
|
|
6151
|
-
{ "norm_w", OFFLOAD_FUNC },
|
|
6152
|
-
{ "norm_wb", OFFLOAD_FUNC },
|
|
6153
|
-
|
|
6154
|
-
{ "attn_norm", OFFLOAD_FUNC },
|
|
6155
|
-
{ "attn_norm_2", OFFLOAD_FUNC },
|
|
6156
|
-
|
|
6157
|
-
{ "wqkv", OFFLOAD_FUNC_KQV },
|
|
6158
|
-
{ "bqkv", OFFLOAD_FUNC_KQV },
|
|
6159
|
-
{ "wqkv_clamped", OFFLOAD_FUNC_KQV },
|
|
6160
|
-
|
|
6161
|
-
{ "tmpk", OFFLOAD_FUNC_KQV },
|
|
6162
|
-
{ "tmpq", OFFLOAD_FUNC_KQV },
|
|
6163
|
-
{ "tmpv", OFFLOAD_FUNC_KQV },
|
|
6164
|
-
{ "Kcur", OFFLOAD_FUNC_KQV },
|
|
6165
|
-
{ "Qcur", OFFLOAD_FUNC_KQV },
|
|
6166
|
-
{ "Vcur", OFFLOAD_FUNC_KQV },
|
|
6167
|
-
|
|
6168
|
-
{ "krot", OFFLOAD_FUNC_KQV },
|
|
6169
|
-
{ "qrot", OFFLOAD_FUNC_KQV },
|
|
6170
|
-
{ "kpass", OFFLOAD_FUNC_KQV },
|
|
6171
|
-
{ "qpass", OFFLOAD_FUNC_KQV },
|
|
6172
|
-
{ "krotated", OFFLOAD_FUNC_KQV },
|
|
6173
|
-
{ "qrotated", OFFLOAD_FUNC_KQV },
|
|
6174
|
-
|
|
6175
|
-
{ "q", OFFLOAD_FUNC_KQV },
|
|
6176
|
-
{ "k", OFFLOAD_FUNC_KQV },
|
|
6177
|
-
{ "kq", OFFLOAD_FUNC_KQV },
|
|
6178
|
-
{ "kq_scaled", OFFLOAD_FUNC_KQV },
|
|
6179
|
-
{ "kq_scaled_alibi", OFFLOAD_FUNC_KQV },
|
|
6180
|
-
{ "kq_masked", OFFLOAD_FUNC_KQV },
|
|
6181
|
-
{ "kq_soft_max", OFFLOAD_FUNC_KQV },
|
|
6182
|
-
{ "kq_soft_max_ext", OFFLOAD_FUNC_KQV },
|
|
6183
|
-
{ "v", OFFLOAD_FUNC_KQV },
|
|
6184
|
-
{ "kqv", OFFLOAD_FUNC_KQV },
|
|
6185
|
-
{ "kqv_merged", OFFLOAD_FUNC_KQV },
|
|
6186
|
-
{ "kqv_merged_cont", OFFLOAD_FUNC_KQV },
|
|
6187
|
-
{ "kqv_wo", OFFLOAD_FUNC_KQV },
|
|
6188
|
-
{ "kqv_out", OFFLOAD_FUNC_KQV },
|
|
6189
|
-
|
|
6190
|
-
{ "ffn_inp", OFFLOAD_FUNC },
|
|
6191
|
-
{ "ffn_norm", OFFLOAD_FUNC },
|
|
6192
|
-
|
|
6193
|
-
{ "ffn_up", OFFLOAD_FUNC },
|
|
6194
|
-
{ "ffn_up_b", OFFLOAD_FUNC },
|
|
6195
|
-
{ "ffn_gate", OFFLOAD_FUNC },
|
|
6196
|
-
{ "ffn_gate_b", OFFLOAD_FUNC },
|
|
6197
|
-
{ "ffn_gate_par", OFFLOAD_FUNC },
|
|
6198
|
-
{ "ffn_act", OFFLOAD_FUNC },
|
|
6199
|
-
{ "ffn_down", OFFLOAD_FUNC },
|
|
6200
|
-
{ "ffn_down_b", OFFLOAD_FUNC },
|
|
6201
|
-
{ "ffn_out", OFFLOAD_FUNC },
|
|
6202
|
-
|
|
6203
|
-
{ "ffn_silu", OFFLOAD_FUNC },
|
|
6204
|
-
{ "ffn_gelu", OFFLOAD_FUNC },
|
|
6205
|
-
{ "ffn_relu", OFFLOAD_FUNC },
|
|
6206
|
-
{ "ffn_sqr(relu)", OFFLOAD_FUNC },
|
|
6207
|
-
|
|
6208
|
-
{ "ffn_moe_logits", OFFLOAD_FUNC },
|
|
6209
|
-
{ "ffn_moe_probs", OFFLOAD_FUNC },
|
|
6210
|
-
{ "ffn_moe_argsort", OFFLOAD_FUNC },
|
|
6211
|
-
{ "ffn_moe_weights", OFFLOAD_FUNC },
|
|
6212
|
-
{ "ffn_moe_weights_sum", OFFLOAD_FUNC },
|
|
6213
|
-
{ "ffn_moe_weights_norm", OFFLOAD_FUNC },
|
|
6214
|
-
{ "ffn_moe_weighted", OFFLOAD_FUNC },
|
|
6215
|
-
{ "ffn_moe_up", OFFLOAD_FUNC },
|
|
6216
|
-
{ "ffn_moe_gate", OFFLOAD_FUNC },
|
|
6217
|
-
{ "ffn_moe_silu", OFFLOAD_FUNC },
|
|
6218
|
-
{ "ffn_moe_gate_par", OFFLOAD_FUNC },
|
|
6219
|
-
{ "ffn_moe_down", OFFLOAD_FUNC },
|
|
6220
|
-
{ "ffn_moe_out", OFFLOAD_FUNC },
|
|
6221
|
-
|
|
6222
|
-
{ "l_out", OFFLOAD_FUNC },
|
|
6223
|
-
|
|
6224
|
-
{ "result_norm", OFFLOAD_FUNC_EMB },
|
|
6225
|
-
{ "result_output_no_bias", OFFLOAD_FUNC_EMB },
|
|
6226
|
-
{ "result_output", OFFLOAD_FUNC_OUT },
|
|
6227
|
-
};
|
|
6228
|
-
|
|
6229
|
-
static llm_offload_trie k_offload_func_trie(k_offload_map);
|
|
6230
|
-
|
|
6231
5960
|
static struct ggml_cgraph * llama_build_graph(
|
|
6232
5961
|
llama_context & lctx,
|
|
6233
5962
|
const llama_batch & batch) {
|
|
6234
5963
|
const auto & model = lctx.model;
|
|
6235
5964
|
|
|
6236
5965
|
// check if we should build the worst-case graph (for memory measurement)
|
|
6237
|
-
const bool worst_case =
|
|
5966
|
+
const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
|
|
6238
5967
|
|
|
6239
5968
|
// keep track of the input that has already been allocated
|
|
6240
5969
|
bool alloc_inp_tokens = false;
|
|
@@ -6243,16 +5972,8 @@ static struct ggml_cgraph * llama_build_graph(
|
|
|
6243
5972
|
bool alloc_inp_KQ_mask = false;
|
|
6244
5973
|
bool alloc_inp_K_shift = false;
|
|
6245
5974
|
|
|
6246
|
-
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
|
6247
|
-
const bool do_offload = true;
|
|
6248
|
-
#else
|
|
6249
|
-
const bool do_offload = true; // TODO: set to false after finishing refactoring
|
|
6250
|
-
#endif
|
|
6251
|
-
|
|
6252
|
-
int n_non_view = 0; // number of non-view tensors that have been processed by the callback
|
|
6253
|
-
|
|
6254
5975
|
// this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
|
|
6255
|
-
// TODO:
|
|
5976
|
+
// TODO: improve handling of input and output tensors, then replace this with ggml_set_name
|
|
6256
5977
|
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
|
|
6257
5978
|
if (il >= 0) {
|
|
6258
5979
|
ggml_format_name(cur, "%s-%d", name, il);
|
|
@@ -6263,12 +5984,11 @@ static struct ggml_cgraph * llama_build_graph(
|
|
|
6263
5984
|
//
|
|
6264
5985
|
// allocate input tensors and set input data
|
|
6265
5986
|
//
|
|
6266
|
-
// TODO: will be removed with backend v2
|
|
6267
5987
|
|
|
6268
5988
|
if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) {
|
|
6269
|
-
|
|
5989
|
+
ggml_tallocr_alloc(lctx.alloc, cur);
|
|
6270
5990
|
|
|
6271
|
-
if (!
|
|
5991
|
+
if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) {
|
|
6272
5992
|
const int64_t n_tokens = cur->ne[0];
|
|
6273
5993
|
|
|
6274
5994
|
ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
|
|
@@ -6277,10 +5997,10 @@ static struct ggml_cgraph * llama_build_graph(
|
|
|
6277
5997
|
alloc_inp_tokens = true;
|
|
6278
5998
|
}
|
|
6279
5999
|
|
|
6280
|
-
if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) {
|
|
6281
|
-
|
|
6000
|
+
if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) {
|
|
6001
|
+
ggml_tallocr_alloc(lctx.alloc, cur);
|
|
6282
6002
|
|
|
6283
|
-
if (!
|
|
6003
|
+
if (!ggml_tallocr_is_measure(lctx.alloc) && batch.embd) {
|
|
6284
6004
|
const int64_t n_embd = cur->ne[0];
|
|
6285
6005
|
const int64_t n_tokens = cur->ne[1];
|
|
6286
6006
|
|
|
@@ -6291,9 +6011,9 @@ static struct ggml_cgraph * llama_build_graph(
|
|
|
6291
6011
|
}
|
|
6292
6012
|
|
|
6293
6013
|
if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) {
|
|
6294
|
-
|
|
6014
|
+
ggml_tallocr_alloc(lctx.alloc, cur);
|
|
6295
6015
|
|
|
6296
|
-
if (!
|
|
6016
|
+
if (!ggml_tallocr_is_measure(lctx.alloc) && batch.pos) {
|
|
6297
6017
|
const int64_t n_tokens = cur->ne[0];
|
|
6298
6018
|
|
|
6299
6019
|
static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
|
|
@@ -6304,9 +6024,9 @@ static struct ggml_cgraph * llama_build_graph(
|
|
|
6304
6024
|
}
|
|
6305
6025
|
|
|
6306
6026
|
if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
|
|
6307
|
-
|
|
6027
|
+
ggml_tallocr_alloc(lctx.alloc, cur);
|
|
6308
6028
|
|
|
6309
|
-
if (!
|
|
6029
|
+
if (!ggml_tallocr_is_measure(lctx.alloc)) {
|
|
6310
6030
|
const int64_t n_kv = cur->ne[0];
|
|
6311
6031
|
const int64_t n_tokens = cur->ne[1];
|
|
6312
6032
|
|
|
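In the callback above every graph input follows the same pattern: allocate the tensor through lctx.alloc, then write real data only when the allocator is not in measure mode. A toy, library-free sketch of that control flow (the toy_alloc type is invented for illustration and is not the ggml-alloc API):

#include <cstdio>
#include <vector>

// Stand-in for an allocator that can run a size-measuring pass first.
struct toy_alloc {
    bool measure;                 // stands in for ggml_tallocr_is_measure(lctx.alloc)
    std::vector<float> storage;
    float * alloc(size_t n) {
        if (measure) return nullptr;      // measure pass only records sizes in this toy
        storage.resize(n);
        return storage.data();
    }
};

int main() {
    toy_alloc alloc { /*measure=*/ false, {} };
    const size_t n_tokens = 4;

    float * inp = alloc.alloc(n_tokens);  // always "allocate" the input
    if (!alloc.measure && inp) {          // ...but only fill real data outside the measure pass
        for (size_t i = 0; i < n_tokens; ++i) inp[i] = (float) i;
        printf("filled %zu input values\n", n_tokens);
    }
    return 0;
}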
@@ -6344,160 +6064,30 @@ static struct ggml_cgraph * llama_build_graph(
|
|
|
6344
6064
|
}
|
|
6345
6065
|
|
|
6346
6066
|
if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) {
|
|
6347
|
-
|
|
6067
|
+
ggml_tallocr_alloc(lctx.alloc, cur);
|
|
6348
6068
|
|
|
6349
|
-
if (!
|
|
6069
|
+
if (!ggml_tallocr_is_measure(lctx.alloc)) {
|
|
6350
6070
|
const int64_t n_ctx = cur->ne[0];
|
|
6351
6071
|
|
|
6352
6072
|
int32_t * data;
|
|
6353
6073
|
if (ggml_backend_buffer_is_host(cur->buffer)) {
|
|
6354
6074
|
data = (int32_t *) cur->data;
|
|
6355
6075
|
} else {
|
|
6356
|
-
lctx.buf_copy.resize(ggml_nbytes(cur));
|
|
6357
|
-
data = (int32_t *) lctx.buf_copy.data();
|
|
6358
|
-
}
|
|
6359
|
-
|
|
6360
|
-
for (int i = 0; i < n_ctx; ++i) {
|
|
6361
|
-
data[i] = lctx.kv_self.cells[i].delta;
|
|
6362
|
-
}
|
|
6363
|
-
|
|
6364
|
-
if (data != cur->data) {
|
|
6365
|
-
ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
|
|
6366
|
-
}
|
|
6367
|
-
}
|
|
6368
|
-
|
|
6369
|
-
alloc_inp_K_shift = true;
|
|
6370
|
-
}
|
|
6371
|
-
|
|
6372
|
-
// view tensors are not processed further
|
|
6373
|
-
if (cur->view_src != nullptr) {
|
|
6374
|
-
return;
|
|
6375
|
-
}
|
|
6376
|
-
|
|
6377
|
-
if (cur->op != GGML_OP_NONE) {
|
|
6378
|
-
n_non_view++;
|
|
6379
|
-
}
|
|
6380
|
-
|
|
6381
|
-
//
|
|
6382
|
-
// offload layers
|
|
6383
|
-
//
|
|
6384
|
-
// TODO: will be removed with backend v2
|
|
6385
|
-
|
|
6386
|
-
//#define LLAMA_OFFLOAD_DEBUG
|
|
6387
|
-
|
|
6388
|
-
if (!do_offload) {
|
|
6389
|
-
return;
|
|
6390
|
-
}
|
|
6391
|
-
|
|
6392
|
-
const int n_layer = model.hparams.n_layer;
|
|
6393
|
-
|
|
6394
|
-
const int n_gpu_layers = model.n_gpu_layers;
|
|
6395
|
-
const int i_gpu_start = n_layer - n_gpu_layers;
|
|
6396
|
-
|
|
6397
|
-
// should we offload the final norm? yes if we are not computing embeddings
|
|
6398
|
-
const bool offload_emb = lctx.embedding.empty();
|
|
6399
|
-
|
|
6400
|
-
static const std::unordered_map<llm_offload_func_e, std::string, std::hash<int>> k_offload_func_name = {
|
|
6401
|
-
{ OFFLOAD_FUNC_NOP, "CPU" },
|
|
6402
|
-
{ OFFLOAD_FUNC_OUT, "CPU" },
|
|
6403
|
-
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
|
6404
|
-
{ OFFLOAD_FUNC, "GPU (CUDA)" },
|
|
6405
|
-
{ OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
|
|
6406
|
-
{ OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
|
|
6407
|
-
{ OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
|
|
6408
|
-
{ OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
|
|
6409
|
-
#else
|
|
6410
|
-
{ OFFLOAD_FUNC, "CPU" },
|
|
6411
|
-
{ OFFLOAD_FUNC_FRC, "CPU" },
|
|
6412
|
-
{ OFFLOAD_FUNC_KQV, "CPU" },
|
|
6413
|
-
{ OFFLOAD_FUNC_NR, "CPU" },
|
|
6414
|
-
{ OFFLOAD_FUNC_EMB, "CPU" },
|
|
6415
|
-
#endif // GGML_USE_CUBLAS
|
|
6416
|
-
};
|
|
6417
|
-
|
|
6418
|
-
// check the global map for what offload function to use for this tensor
|
|
6419
|
-
llm_offload_func_e func_e = k_offload_func_trie.find(name);
|
|
6420
|
-
|
|
6421
|
-
if (func_e == OFFLOAD_FUNC_NOP) {
|
|
6422
|
-
#ifdef LLAMA_OFFLOAD_DEBUG
|
|
6423
|
-
// if a tensor hasn't been offloaded, we warn the user
|
|
6424
|
-
if (worst_case) {
|
|
6425
|
-
LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__,
|
|
6426
|
-
cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837");
|
|
6427
|
-
}
|
|
6428
|
-
#endif
|
|
6429
|
-
|
|
6430
|
-
return;
|
|
6431
|
-
}
|
|
6432
|
-
|
|
6433
|
-
// count the number of layers and respect the provided n_gpu_layers
|
|
6434
|
-
switch (func_e) {
|
|
6435
|
-
case OFFLOAD_FUNC_NOP:
|
|
6436
|
-
case OFFLOAD_FUNC_OUT:
|
|
6437
|
-
break;
|
|
6438
|
-
case OFFLOAD_FUNC:
|
|
6439
|
-
if (n_gpu_layers < n_layer) {
|
|
6440
|
-
if (il < i_gpu_start) {
|
|
6441
|
-
func_e = OFFLOAD_FUNC_NOP;
|
|
6442
|
-
}
|
|
6443
|
-
}
|
|
6444
|
-
break;
|
|
6445
|
-
case OFFLOAD_FUNC_FRC:
|
|
6446
|
-
if (!lctx.cparams.offload_kqv) {
|
|
6447
|
-
func_e = OFFLOAD_FUNC_NOP;
|
|
6448
|
-
} break;
|
|
6449
|
-
case OFFLOAD_FUNC_KQV:
|
|
6450
|
-
if (!lctx.cparams.offload_kqv) {
|
|
6451
|
-
func_e = OFFLOAD_FUNC_NOP;
|
|
6452
|
-
} else {
|
|
6453
|
-
if (n_gpu_layers < n_layer) {
|
|
6454
|
-
if (il < i_gpu_start) {
|
|
6455
|
-
func_e = OFFLOAD_FUNC_NOP;
|
|
6456
|
-
}
|
|
6457
|
-
}
|
|
6458
|
-
}
|
|
6459
|
-
break;
|
|
6460
|
-
case OFFLOAD_FUNC_NR:
|
|
6461
|
-
if (n_gpu_layers <= n_layer + 0) {
|
|
6462
|
-
func_e = OFFLOAD_FUNC_NOP;
|
|
6463
|
-
}
|
|
6464
|
-
break;
|
|
6465
|
-
case OFFLOAD_FUNC_EMB:
|
|
6466
|
-
if (!offload_emb || n_gpu_layers < n_layer) {
|
|
6467
|
-
func_e = OFFLOAD_FUNC_NOP;
|
|
6468
|
-
}
|
|
6469
|
-
break;
|
|
6470
|
-
default: GGML_ASSERT(false);
|
|
6471
|
-
}
|
|
6472
|
-
|
|
6473
|
-
offload_func_t func = ggml_offload_nop;
|
|
6474
|
-
|
|
6475
|
-
// this is needed for compatibility with Metal for example
|
|
6476
|
-
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
|
6477
|
-
static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc;
|
|
6478
|
-
#else
|
|
6479
|
-
static offload_func_t ggml_offload_gpu = ggml_offload_nop;
|
|
6480
|
-
#endif
|
|
6076
|
+
lctx.buf_copy.resize(ggml_nbytes(cur));
|
|
6077
|
+
data = (int32_t *) lctx.buf_copy.data();
|
|
6078
|
+
}
|
|
6481
6079
|
|
|
6482
|
-
|
|
6483
|
-
|
|
6484
|
-
|
|
6485
|
-
case OFFLOAD_FUNC:
|
|
6486
|
-
case OFFLOAD_FUNC_KQV:
|
|
6487
|
-
case OFFLOAD_FUNC_FRC:
|
|
6488
|
-
case OFFLOAD_FUNC_NR:
|
|
6489
|
-
case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
|
|
6490
|
-
default: GGML_ASSERT(false);
|
|
6491
|
-
}
|
|
6080
|
+
for (int i = 0; i < n_ctx; ++i) {
|
|
6081
|
+
data[i] = lctx.kv_self.cells[i].delta;
|
|
6082
|
+
}
|
|
6492
6083
|
|
|
6493
|
-
|
|
6494
|
-
|
|
6084
|
+
if (data != cur->data) {
|
|
6085
|
+
ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
|
|
6086
|
+
}
|
|
6087
|
+
}
|
|
6495
6088
|
|
|
6496
|
-
|
|
6497
|
-
if (worst_case) {
|
|
6498
|
-
LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str());
|
|
6089
|
+
alloc_inp_K_shift = true;
|
|
6499
6090
|
}
|
|
6500
|
-
#endif
|
|
6501
6091
|
};
|
|
6502
6092
|
|
|
6503
6093
|
struct ggml_cgraph * result = NULL;
|
|
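The K_shift fill above writes directly into the tensor when its backend buffer is host-resident and otherwise stages the values in lctx.buf_copy before a single backend copy. A library-free sketch of that staging pattern (upload() and buffer_is_host stand in for the ggml backend calls and are not real API):

#include <cstdint>
#include <cstring>
#include <vector>

// Illustrative stand-in for copying staged data to a device tensor
// (in the diff this is ggml_backend_tensor_set).
static void upload(int32_t * device_dst, const int32_t * src, size_t n) {
    std::memcpy(device_dst, src, n * sizeof(int32_t));
}

int main() {
    const int n_ctx = 16;
    std::vector<int32_t> device_storage(n_ctx);      // pretend tensor storage
    std::vector<int32_t> cell_delta(n_ctx, 0);       // stand-in for kv_self.cells[i].delta
    bool buffer_is_host = false;                     // ggml_backend_buffer_is_host(...) in the diff

    std::vector<int32_t> staging;                    // lctx.buf_copy in the diff
    int32_t * data;
    if (buffer_is_host) {
        data = device_storage.data();                // write in place
    } else {
        staging.resize(n_ctx);                       // stage on the host first
        data = staging.data();
    }
    for (int i = 0; i < n_ctx; ++i) {
        data[i] = cell_delta[i];
    }
    if (data != device_storage.data()) {
        upload(device_storage.data(), data, n_ctx);  // one explicit copy to the backend
    }
    return 0;
}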
@@ -6565,27 +6155,6 @@ static struct ggml_cgraph * llama_build_graph(
|
|
|
6565
6155
|
|
|
6566
6156
|
llm.free();
|
|
6567
6157
|
|
|
6568
|
-
if (worst_case) {
|
|
6569
|
-
int n_non_view_total = 0;
|
|
6570
|
-
|
|
6571
|
-
for (int i = 0; i < result->n_nodes; ++i) {
|
|
6572
|
-
if (result->nodes[i]->view_src == nullptr) {
|
|
6573
|
-
n_non_view_total++;
|
|
6574
|
-
}
|
|
6575
|
-
}
|
|
6576
|
-
|
|
6577
|
-
LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total);
|
|
6578
|
-
|
|
6579
|
-
if (n_non_view != n_non_view_total) {
|
|
6580
|
-
LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__);
|
|
6581
|
-
LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__);
|
|
6582
|
-
LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__);
|
|
6583
|
-
LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__);
|
|
6584
|
-
LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__);
|
|
6585
|
-
LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__);
|
|
6586
|
-
}
|
|
6587
|
-
}
|
|
6588
|
-
|
|
6589
6158
|
return result;
|
|
6590
6159
|
}
|
|
6591
6160
|
|
|
@@ -6631,8 +6200,6 @@ static int llama_decode_internal(
|
|
|
6631
6200
|
|
|
6632
6201
|
auto & kv_self = lctx.kv_self;
|
|
6633
6202
|
|
|
6634
|
-
GGML_ASSERT(!!kv_self.ctx);
|
|
6635
|
-
|
|
6636
6203
|
const int64_t n_embd = hparams.n_embd;
|
|
6637
6204
|
const int64_t n_vocab = hparams.n_vocab;
|
|
6638
6205
|
|
|
@@ -6686,12 +6253,10 @@ static int llama_decode_internal(
|
|
|
6686
6253
|
|
|
6687
6254
|
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
|
|
6688
6255
|
|
|
6689
|
-
|
|
6256
|
+
ggml_backend_sched_reset(lctx.sched);
|
|
6690
6257
|
|
|
6691
6258
|
ggml_cgraph * gf = llama_build_graph(lctx, batch);
|
|
6692
6259
|
|
|
6693
|
-
ggml_allocr_alloc_graph(lctx.alloc, gf);
|
|
6694
|
-
|
|
6695
6260
|
// the output is always the last tensor in the graph
|
|
6696
6261
|
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
|
|
6697
6262
|
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
|
|
@@ -6703,30 +6268,6 @@ static int llama_decode_internal(
|
|
|
6703
6268
|
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
|
|
6704
6269
|
}
|
|
6705
6270
|
|
|
6706
|
-
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
|
6707
|
-
char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc);
|
|
6708
|
-
for (int i = 0; i < gf->n_leafs; i++) {
|
|
6709
|
-
ggml_tensor * node = gf->leafs[i];
|
|
6710
|
-
if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
|
|
6711
|
-
ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
|
|
6712
|
-
ggml_cuda_copy_to_device(node);
|
|
6713
|
-
}
|
|
6714
|
-
}
|
|
6715
|
-
|
|
6716
|
-
for (int i = 0; i < gf->n_nodes; i++) {
|
|
6717
|
-
ggml_tensor * node = gf->nodes[i];
|
|
6718
|
-
if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
|
|
6719
|
-
ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
|
|
6720
|
-
}
|
|
6721
|
-
}
|
|
6722
|
-
|
|
6723
|
-
// HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
|
|
6724
|
-
if (!lctx.embedding.empty()) {
|
|
6725
|
-
embeddings->backend = GGML_BACKEND_CPU;
|
|
6726
|
-
}
|
|
6727
|
-
res->backend = GGML_BACKEND_CPU;
|
|
6728
|
-
#endif
|
|
6729
|
-
|
|
6730
6271
|
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
|
|
6731
6272
|
|
|
6732
6273
|
// for big prompts, if BLAS is enabled, it is better to use only one thread
|
|
@@ -6749,15 +6290,17 @@ static int llama_decode_internal(
|
|
|
6749
6290
|
#endif
|
|
6750
6291
|
|
|
6751
6292
|
#ifdef GGML_USE_METAL
|
|
6752
|
-
if (ggml_backend_is_metal(lctx.
|
|
6753
|
-
ggml_backend_metal_set_n_cb(lctx.
|
|
6293
|
+
if (ggml_backend_is_metal(lctx.backend_metal)) {
|
|
6294
|
+
ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
|
|
6754
6295
|
}
|
|
6755
6296
|
#endif
|
|
6756
6297
|
|
|
6757
|
-
if (
|
|
6758
|
-
ggml_backend_cpu_set_n_threads(lctx.
|
|
6298
|
+
if (lctx.backend_cpu != nullptr) {
|
|
6299
|
+
ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
|
|
6759
6300
|
}
|
|
6760
|
-
|
|
6301
|
+
ggml_backend_sched_graph_compute(lctx.sched, gf);
|
|
6302
|
+
|
|
6303
|
+
// fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
|
|
6761
6304
|
|
|
6762
6305
|
#ifdef GGML_USE_MPI
|
|
6763
6306
|
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
|
|
@@ -6805,30 +6348,33 @@ static int llama_decode_internal(
|
|
|
6805
6348
|
logits_out.clear();
|
|
6806
6349
|
#endif
|
|
6807
6350
|
|
|
6351
|
+
ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
|
|
6352
|
+
GGML_ASSERT(res_backend != nullptr);
|
|
6808
6353
|
if (batch.logits) {
|
|
6809
6354
|
logits_out.resize(n_vocab * n_tokens);
|
|
6810
6355
|
for (uint32_t i = 0; i < n_tokens; i++) {
|
|
6811
6356
|
if (batch.logits[i] == 0) {
|
|
6812
6357
|
continue;
|
|
6813
6358
|
}
|
|
6814
|
-
|
|
6359
|
+
ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
|
|
6815
6360
|
#ifndef NDEBUG
|
|
6816
6361
|
logits_valid[i] = true;
|
|
6817
6362
|
#endif
|
|
6818
6363
|
}
|
|
6819
6364
|
} else if (lctx.logits_all) {
|
|
6820
6365
|
logits_out.resize(n_vocab * n_tokens);
|
|
6821
|
-
|
|
6366
|
+
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
|
|
6822
6367
|
#ifndef NDEBUG
|
|
6823
6368
|
std::fill(logits_valid.begin(), logits_valid.end(), true);
|
|
6824
6369
|
#endif
|
|
6825
6370
|
} else {
|
|
6826
6371
|
logits_out.resize(n_vocab);
|
|
6827
|
-
|
|
6372
|
+
ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
|
|
6828
6373
|
#ifndef NDEBUG
|
|
6829
6374
|
logits_valid[0] = true;
|
|
6830
6375
|
#endif
|
|
6831
6376
|
}
|
|
6377
|
+
ggml_backend_synchronize(res_backend);
|
|
6832
6378
|
}
|
|
6833
6379
|
|
|
6834
6380
|
// extract embeddings
|
|
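The logits readback above pulls rows out of a [n_vocab, n_tokens] float result: row i lives at byte offset n_vocab*i*sizeof(float) and spans n_vocab*sizeof(float), and the last-token-only case uses i = n_tokens - 1. A small standalone sketch of the same offset arithmetic (toy sizes, plain memcpy instead of the async backend call):

#include <cstddef>
#include <cstdio>
#include <cstring>
#include <vector>

int main() {
    const size_t n_vocab  = 5;   // toy sizes
    const size_t n_tokens = 3;

    // result laid out row-major: token 0 logits, then token 1, ...
    std::vector<float> result(n_vocab * n_tokens);
    for (size_t i = 0; i < result.size(); ++i) result[i] = (float) i;

    // per-token readback: row i starts at byte offset n_vocab*i*sizeof(float)
    std::vector<float> logits_out(n_vocab);
    const size_t i = 1;
    std::memcpy(logits_out.data(),
                (const char *) result.data() + n_vocab*i*sizeof(float),
                n_vocab*sizeof(float));
    printf("token %zu, first logit: %g\n", i, logits_out[0]);

    // last-token-only readback uses the offset n_vocab*(n_tokens - 1)
    std::memcpy(logits_out.data(),
                (const char *) result.data() + n_vocab*(n_tokens - 1)*sizeof(float),
                n_vocab*sizeof(float));
    printf("last token, first logit: %g\n", logits_out[0]);
    return 0;
}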
@@ -6836,7 +6382,9 @@ static int llama_decode_internal(
|
|
|
6836
6382
|
auto & embedding_out = lctx.embedding;
|
|
6837
6383
|
|
|
6838
6384
|
embedding_out.resize(n_embd);
|
|
6839
|
-
|
|
6385
|
+
ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
|
|
6386
|
+
ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
|
|
6387
|
+
ggml_backend_synchronize(embeddings_backend);
|
|
6840
6388
|
}
|
|
6841
6389
|
|
|
6842
6390
|
// measure the performance only for the single-token evals
|
|
@@ -6907,15 +6455,15 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
|
|
|
6907
6455
|
static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
|
|
6908
6456
|
static const char * hex = "0123456789ABCDEF";
|
|
6909
6457
|
switch (llama_vocab_get_type(vocab)) {
|
|
6910
|
-
|
|
6911
|
-
|
|
6912
|
-
|
|
6913
|
-
|
|
6914
|
-
|
|
6915
|
-
|
|
6916
|
-
|
|
6917
|
-
|
|
6918
|
-
|
|
6458
|
+
case LLAMA_VOCAB_TYPE_SPM: {
|
|
6459
|
+
const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
|
|
6460
|
+
return vocab.token_to_id.at(buf);
|
|
6461
|
+
}
|
|
6462
|
+
case LLAMA_VOCAB_TYPE_BPE: {
|
|
6463
|
+
return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
|
|
6464
|
+
}
|
|
6465
|
+
default:
|
|
6466
|
+
GGML_ASSERT(false);
|
|
6919
6467
|
}
|
|
6920
6468
|
}
|
|
6921
6469
|
|
|
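For SPM vocabularies a raw byte is looked up as the piece "<0xXX>", built above from a 7-character buffer of hex digits; BPE vocabularies go through a byte-to-unicode table instead. A standalone sketch of the SPM piece construction:

#include <cstdio>
#include <string>

// Build the "<0xXX>" piece that SPM vocabs use for a raw byte,
// mirroring the buf[7] construction in the hunk above.
static std::string spm_byte_token(unsigned char ch) {
    static const char * hex = "0123456789ABCDEF";
    const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
    return std::string(buf);
}

int main() {
    printf("%s\n", spm_byte_token(0x41).c_str()); // prints <0x41> for the byte 'A'
    return 0;
}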
@@ -7449,7 +6997,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
|
7449
6997
|
if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
|
|
7450
6998
|
|
|
7451
6999
|
#ifdef PRETOKENIZERDEBUG
|
|
7452
|
-
|
|
7000
|
+
LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
|
|
7453
7001
|
#endif
|
|
7454
7002
|
auto source = std::distance(buffer.begin(), it);
|
|
7455
7003
|
|
|
@@ -7462,7 +7010,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
|
7462
7010
|
buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
|
|
7463
7011
|
|
|
7464
7012
|
#ifdef PRETOKENIZERDEBUG
|
|
7465
|
-
|
|
7013
|
+
LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
|
|
7466
7014
|
#endif
|
|
7467
7015
|
it++;
|
|
7468
7016
|
}
|
|
@@ -7478,7 +7026,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
|
7478
7026
|
buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
|
|
7479
7027
|
|
|
7480
7028
|
#ifdef PRETOKENIZERDEBUG
|
|
7481
|
-
|
|
7029
|
+
LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
|
|
7482
7030
|
#endif
|
|
7483
7031
|
|
|
7484
7032
|
it++;
|
|
@@ -7494,7 +7042,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
|
|
|
7494
7042
|
raw_text_base_length = right_reminder_length;
|
|
7495
7043
|
|
|
7496
7044
|
#ifdef PRETOKENIZERDEBUG
|
|
7497
|
-
|
|
7045
|
+
LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
|
|
7498
7046
|
#endif
|
|
7499
7047
|
} else {
|
|
7500
7048
|
if (source == 0) {
|
|
@@ -7551,7 +7099,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
|
7551
7099
|
}
|
|
7552
7100
|
|
|
7553
7101
|
#ifdef PRETOKENIZERDEBUG
|
|
7554
|
-
|
|
7102
|
+
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
|
|
7555
7103
|
#endif
|
|
7556
7104
|
llm_tokenizer_spm tokenizer(vocab);
|
|
7557
7105
|
llama_escape_whitespace(raw_text);
|
|
@@ -7572,7 +7120,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
|
|
|
7572
7120
|
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
|
|
7573
7121
|
|
|
7574
7122
|
#ifdef PRETOKENIZERDEBUG
|
|
7575
|
-
|
|
7123
|
+
LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
|
|
7576
7124
|
#endif
|
|
7577
7125
|
llm_tokenizer_bpe tokenizer(vocab);
|
|
7578
7126
|
tokenizer.tokenize(raw_text, output);
|
|
@@ -8350,39 +7898,59 @@ static void llama_log_softmax(float * array, size_t size) {
|
|
|
8350
7898
|
}
|
|
8351
7899
|
}
|
|
8352
7900
|
|
|
7901
|
+
void llama_sample_apply_guidance(
|
|
7902
|
+
struct llama_context * ctx,
|
|
7903
|
+
float * logits,
|
|
7904
|
+
float * logits_guidance,
|
|
7905
|
+
float scale) {
|
|
7906
|
+
GGML_ASSERT(ctx);
|
|
7907
|
+
|
|
7908
|
+
const auto t_start_sample_us = ggml_time_us();
|
|
7909
|
+
const auto n_vocab = llama_n_vocab(llama_get_model(ctx));
|
|
7910
|
+
|
|
7911
|
+
llama_log_softmax(logits, n_vocab);
|
|
7912
|
+
llama_log_softmax(logits_guidance, n_vocab);
|
|
7913
|
+
|
|
7914
|
+
for (int i = 0; i < n_vocab; ++i) {
|
|
7915
|
+
auto & l = logits[i];
|
|
7916
|
+
const auto & g = logits_guidance[i];
|
|
7917
|
+
|
|
7918
|
+
l = scale * (l - g) + g;
|
|
7919
|
+
}
|
|
7920
|
+
|
|
7921
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
7922
|
+
}
|
|
7923
|
+
|
|
8353
7924
|
void llama_sample_classifier_free_guidance(
|
|
8354
7925
|
struct llama_context * ctx,
|
|
8355
7926
|
llama_token_data_array * candidates,
|
|
8356
7927
|
struct llama_context * guidance_ctx,
|
|
8357
7928
|
float scale) {
|
|
8358
|
-
int64_t t_start_sample_us = ggml_time_us();
|
|
8359
|
-
|
|
8360
7929
|
GGML_ASSERT(ctx);
|
|
7930
|
+
int64_t t_start_sample_us;
|
|
8361
7931
|
|
|
8362
|
-
|
|
7932
|
+
t_start_sample_us = ggml_time_us();
|
|
7933
|
+
const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
|
|
8363
7934
|
|
|
8364
|
-
GGML_ASSERT(n_vocab ==
|
|
7935
|
+
GGML_ASSERT(n_vocab == candidates->size);
|
|
8365
7936
|
GGML_ASSERT(!candidates->sorted);
|
|
8366
7937
|
|
|
8367
|
-
std::vector<float> logits_base;
|
|
8368
|
-
|
|
8369
|
-
|
|
8370
|
-
logits_base.push_back(candidates->data[i].logit);
|
|
7938
|
+
std::vector<float> logits_base(n_vocab);
|
|
7939
|
+
for (size_t i = 0; i < n_vocab; ++i) {
|
|
7940
|
+
logits_base[i] = candidates->data[i].logit;
|
|
8371
7941
|
}
|
|
8372
|
-
llama_log_softmax(logits_base.data(), candidates->size);
|
|
8373
7942
|
|
|
8374
|
-
float* logits_guidance = llama_get_logits(guidance_ctx);
|
|
8375
|
-
llama_log_softmax(logits_guidance, n_vocab);
|
|
7943
|
+
float * logits_guidance = llama_get_logits(guidance_ctx);
|
|
8376
7944
|
|
|
8377
|
-
|
|
8378
|
-
|
|
8379
|
-
|
|
8380
|
-
candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
|
|
8381
|
-
}
|
|
7945
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
7946
|
+
llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
|
|
7947
|
+
t_start_sample_us = ggml_time_us();
|
|
8382
7948
|
|
|
8383
|
-
|
|
8384
|
-
|
|
7949
|
+
for (size_t i = 0; i < n_vocab; ++i) {
|
|
7950
|
+
candidates->data[i].logit = logits_base[i];
|
|
8385
7951
|
}
|
|
7952
|
+
|
|
7953
|
+
ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
|
|
8386
7954
|
}
|
|
8387
7955
|
|
|
8388
7956
|
llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
|
|
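llama_sample_apply_guidance above log-softmaxes both logit vectors and then blends them as l = scale*(l - g) + g, so scale == 1 reproduces the base logits and larger values extrapolate away from the guidance logits. A self-contained numeric sketch of that computation (toy vectors, plain std::vector instead of the llama_context buffers):

#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

// Plain log-softmax, equivalent in spirit to llama_log_softmax in the diff.
static void log_softmax(std::vector<float> & v) {
    float max_l = v[0];
    for (float x : v) max_l = std::max(max_l, x);
    float sum = 0.0f;
    for (float x : v) sum += std::exp(x - max_l);
    const float log_sum = std::log(sum) + max_l;
    for (float & x : v) x -= log_sum;
}

int main() {
    std::vector<float> logits          = { 1.0f, 2.0f, 0.5f };
    std::vector<float> logits_guidance = { 0.2f, 2.5f, 0.1f };
    const float scale = 1.5f;   // 1 keeps the base logits; larger extrapolates away from the guidance

    log_softmax(logits);
    log_softmax(logits_guidance);
    for (size_t i = 0; i < logits.size(); ++i) {
        const float g = logits_guidance[i];
        logits[i] = scale * (logits[i] - g) + g;   // the guidance blend from the diff
    }
    for (float x : logits) printf("%g ", x);
    printf("\n");
    return 0;
}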
@@ -8806,6 +8374,8 @@ struct quantize_state_internal {
|
|
|
8806
8374
|
int n_k_quantized = 0;
|
|
8807
8375
|
int n_fallback = 0;
|
|
8808
8376
|
|
|
8377
|
+
bool has_imatrix = false;
|
|
8378
|
+
|
|
8809
8379
|
quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
|
|
8810
8380
|
: model(model)
|
|
8811
8381
|
, params(params)
|
|
@@ -8889,9 +8459,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
|
8889
8459
|
if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
|
|
8890
8460
|
new_type = GGML_TYPE_Q8_0;
|
|
8891
8461
|
}
|
|
8462
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
|
8463
|
+
new_type = GGML_TYPE_Q5_K;
|
|
8464
|
+
}
|
|
8892
8465
|
else if (new_type != GGML_TYPE_Q8_0) {
|
|
8893
8466
|
new_type = GGML_TYPE_Q6_K;
|
|
8894
8467
|
}
|
|
8468
|
+
} else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
|
|
8469
|
+
if (name.find("attn_v.weight") != std::string::npos) {
|
|
8470
|
+
if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
|
|
8471
|
+
else new_type = GGML_TYPE_Q2_K;
|
|
8472
|
+
++qs.i_attention_wv;
|
|
8473
|
+
}
|
|
8474
|
+
else if (name.find("ffn_down") != std::string::npos) {
|
|
8475
|
+
if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
|
|
8476
|
+
++qs.i_feed_forward_w2;
|
|
8477
|
+
}
|
|
8478
|
+
else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
|
|
8895
8479
|
} else if (name.find("attn_v.weight") != std::string::npos) {
|
|
8896
8480
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
|
8897
8481
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
|
@@ -8921,11 +8505,32 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
|
8921
8505
|
// TODO: explore better strategies
|
|
8922
8506
|
new_type = GGML_TYPE_Q8_0;
|
|
8923
8507
|
}
|
|
8924
|
-
} else if (name.find("ffn_down
|
|
8508
|
+
} else if (name.find("ffn_down") != std::string::npos) {
|
|
8509
|
+
const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
|
|
8510
|
+
int i_layer, n_layer;
|
|
8511
|
+
if (n_expert == 1) {
|
|
8512
|
+
i_layer = qs.i_feed_forward_w2;
|
|
8513
|
+
n_layer = qs.n_feed_forward_w2;
|
|
8514
|
+
} else {
|
|
8515
|
+
// Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
|
|
8516
|
+
// sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
|
|
8517
|
+
// for getting the current layer as I initially thought, and we need to resort to parsing the
|
|
8518
|
+
// tensor name.
|
|
8519
|
+
n_layer = qs.n_feed_forward_w2 / n_expert;
|
|
8520
|
+
if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
|
|
8521
|
+
throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
|
|
8522
|
+
}
|
|
8523
|
+
if (i_layer < 0 || i_layer >= n_layer) {
|
|
8524
|
+
throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
|
|
8525
|
+
}
|
|
8526
|
+
}
|
|
8925
8527
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
|
8528
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
|
|
8529
|
+
if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
|
|
8530
|
+
}
|
|
8926
8531
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
|
|
8927
|
-
new_type =
|
|
8928
|
-
: arch != LLM_ARCH_FALCON || use_more_bits(
|
|
8532
|
+
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
|
|
8533
|
+
: arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
|
|
8929
8534
|
: GGML_TYPE_Q3_K;
|
|
8930
8535
|
}
|
|
8931
8536
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
|
|
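Because MoE ffn_down experts are not stored consecutively, the hunk above recovers the layer index by parsing the tensor name with sscanf instead of dividing the running counter by n_expert. A standalone sketch of that parse (the example tensor name is hypothetical):

#include <cstdio>
#include <string>

int main() {
    // For MoE models the layer index is recovered from the tensor name,
    // as in the "blk.%d.ffn_down" sscanf above.
    const std::string name = "blk.17.ffn_down.3.weight";   // hypothetical expert tensor name
    int i_layer = -1;
    if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
        printf("failed to determine layer for %s\n", name.c_str());
        return 1;
    }
    printf("tensor %s belongs to layer %d\n", name.c_str(), i_layer);
    return 0;
}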
@@ -8933,22 +8538,36 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
|
8933
8538
|
}
|
|
8934
8539
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
|
|
8935
8540
|
if (arch == LLM_ARCH_FALCON) {
|
|
8936
|
-
new_type =
|
|
8937
|
-
use_more_bits(
|
|
8541
|
+
new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
|
|
8542
|
+
use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
|
|
8938
8543
|
} else {
|
|
8939
|
-
if (use_more_bits(
|
|
8544
|
+
if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
|
|
8940
8545
|
}
|
|
8941
8546
|
}
|
|
8942
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(
|
|
8943
|
-
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON &&
|
|
8547
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
|
|
8548
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
|
|
8944
8549
|
new_type = GGML_TYPE_Q5_K;
|
|
8945
8550
|
}
|
|
8551
|
+
else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
|
|
8552
|
+
&& qs.has_imatrix && i_layer < n_layer/8) {
|
|
8553
|
+
// Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
|
|
8554
|
+
// We only do it when an imatrix is provided because a) we want to make sure that one can always get the
|
|
8555
|
+
// same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
|
|
8556
|
+
new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
|
|
8557
|
+
}
|
|
8946
8558
|
++qs.i_feed_forward_w2;
|
|
8947
8559
|
} else if (name.find("attn_output.weight") != std::string::npos) {
|
|
8948
8560
|
if (arch != LLM_ARCH_FALCON) {
|
|
8949
|
-
if
|
|
8950
|
-
|
|
8951
|
-
|
|
8561
|
+
if (qs.model.hparams.n_expert == 8) {
|
|
8562
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
|
|
8563
|
+
ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
|
|
8564
|
+
new_type = GGML_TYPE_Q5_K;
|
|
8565
|
+
}
|
|
8566
|
+
} else {
|
|
8567
|
+
if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
|
|
8568
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
|
|
8569
|
+
else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
|
|
8570
|
+
}
|
|
8952
8571
|
} else {
|
|
8953
8572
|
if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
|
|
8954
8573
|
}
|
|
@@ -8958,9 +8577,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
|
8958
8577
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
|
|
8959
8578
|
else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
|
|
8960
8579
|
}
|
|
8961
|
-
|
|
8962
|
-
|
|
8963
|
-
|
|
8580
|
+
// IK: let's remove this, else Q2_K is almost the same as Q3_K_S
|
|
8581
|
+
//else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
|
|
8582
|
+
// if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
|
|
8583
|
+
//}
|
|
8964
8584
|
// This can be used to reduce the size of the Q5_K_S model.
|
|
8965
8585
|
// The associated PPL increase is fully in line with the size reduction
|
|
8966
8586
|
//else {
|
|
@@ -8968,7 +8588,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
|
8968
8588
|
//}
|
|
8969
8589
|
bool convert_incompatible_tensor = false;
|
|
8970
8590
|
if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
|
|
8971
|
-
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K
|
|
8591
|
+
new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
|
|
8592
|
+
new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) {
|
|
8972
8593
|
int nx = tensor->ne[0];
|
|
8973
8594
|
int ny = tensor->ne[1];
|
|
8974
8595
|
if (nx % QK_K != 0) {
|
|
@@ -8980,6 +8601,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
|
|
|
8980
8601
|
}
|
|
8981
8602
|
if (convert_incompatible_tensor) {
|
|
8982
8603
|
switch (new_type) {
|
|
8604
|
+
case GGML_TYPE_IQ2_XXS:
|
|
8605
|
+
case GGML_TYPE_IQ2_XS:
|
|
8983
8606
|
case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
|
|
8984
8607
|
case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
|
|
8985
8608
|
case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
|
|
@@ -9009,6 +8632,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
9009
8632
|
|
|
9010
8633
|
// K-quants
|
|
9011
8634
|
case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
|
|
8635
|
+
case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
|
|
9012
8636
|
case LLAMA_FTYPE_MOSTLY_Q3_K_S:
|
|
9013
8637
|
case LLAMA_FTYPE_MOSTLY_Q3_K_M:
|
|
9014
8638
|
case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
|
|
@@ -9017,6 +8641,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
9017
8641
|
case LLAMA_FTYPE_MOSTLY_Q5_K_S:
|
|
9018
8642
|
case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
|
|
9019
8643
|
case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
|
|
8644
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
|
|
8645
|
+
case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
|
|
9020
8646
|
|
|
9021
8647
|
default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
|
|
9022
8648
|
}
|
|
@@ -9047,6 +8673,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
9047
8673
|
if (params->only_copy) {
|
|
9048
8674
|
ftype = model.ftype;
|
|
9049
8675
|
}
|
|
8676
|
+
const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
|
|
8677
|
+
if (params->imatrix) {
|
|
8678
|
+
imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
|
|
8679
|
+
if (imatrix_data) {
|
|
8680
|
+
LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
|
|
8681
|
+
qs.has_imatrix = true;
|
|
8682
|
+
}
|
|
8683
|
+
}
|
|
9050
8684
|
|
|
9051
8685
|
const size_t align = GGUF_DEFAULT_ALIGNMENT;
|
|
9052
8686
|
struct gguf_context * ctx_out = gguf_init_empty();
|
|
@@ -9065,7 +8699,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
9065
8699
|
if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
|
|
9066
8700
|
++qs.n_attention_wv;
|
|
9067
8701
|
}
|
|
9068
|
-
else if (name.find("ffn_down
|
|
8702
|
+
else if (name.find("ffn_down") != std::string::npos) {
|
|
9069
8703
|
++qs.n_feed_forward_w2;
|
|
9070
8704
|
}
|
|
9071
8705
|
}
|
|
@@ -9104,6 +8738,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
9104
8738
|
// placeholder for the meta data
|
|
9105
8739
|
::zeros(fout, meta_size);
|
|
9106
8740
|
|
|
8741
|
+
std::set<ggml_type> used_iq2;
|
|
8742
|
+
|
|
9107
8743
|
for (int i = 0; i < ml.n_tensors; ++i) {
|
|
9108
8744
|
struct ggml_tensor * tensor = ml.get_tensor_meta(i);
|
|
9109
8745
|
|
|
@@ -9156,6 +8792,35 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
9156
8792
|
} else {
|
|
9157
8793
|
const size_t nelements = ggml_nelements(tensor);
|
|
9158
8794
|
|
|
8795
|
+
if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) {
|
|
8796
|
+
ggml_init_iq2_quantization(new_type);
|
|
8797
|
+
used_iq2.insert(new_type);
|
|
8798
|
+
}
|
|
8799
|
+
|
|
8800
|
+
const float * imatrix = nullptr;
|
|
8801
|
+
if (imatrix_data) {
|
|
8802
|
+
auto it = imatrix_data->find(tensor->name);
|
|
8803
|
+
if (it == imatrix_data->end()) {
|
|
8804
|
+
LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
|
|
8805
|
+
} else {
|
|
8806
|
+
if (it->second.size() == (size_t)tensor->ne[0]) {
|
|
8807
|
+
imatrix = it->second.data();
|
|
8808
|
+
} else {
|
|
8809
|
+
LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
|
|
8810
|
+
int(it->second.size()), int(tensor->ne[0]), tensor->name);
|
|
8811
|
+
}
|
|
8812
|
+
}
|
|
8813
|
+
}
|
|
8814
|
+
if ((new_type == GGML_TYPE_IQ2_XXS ||
|
|
8815
|
+
new_type == GGML_TYPE_IQ2_XS ||
|
|
8816
|
+
(new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
|
|
8817
|
+
LLAMA_LOG_ERROR("\n\n============================================================\n");
|
|
8818
|
+
LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
|
|
8819
|
+
LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
|
|
8820
|
+
LLAMA_LOG_ERROR("============================================================\n\n");
|
|
8821
|
+
throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
|
|
8822
|
+
}
|
|
8823
|
+
|
|
9159
8824
|
float * f32_data;
|
|
9160
8825
|
|
|
9161
8826
|
if (tensor->type == GGML_TYPE_F32) {
|
|
@@ -9176,21 +8841,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
9176
8841
|
new_data = work.data();
|
|
9177
8842
|
std::array<int64_t, 1 << 4> hist_cur = {};
|
|
9178
8843
|
|
|
9179
|
-
|
|
8844
|
+
const int n_per_row = tensor->ne[0];
|
|
8845
|
+
const int nrows = nelements / n_per_row;
|
|
8846
|
+
|
|
8847
|
+
static const int min_chunk_size = 32 * 512;
|
|
8848
|
+
const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
|
|
8849
|
+
|
|
9180
8850
|
const int nchunk = (nelements + chunk_size - 1)/chunk_size;
|
|
9181
8851
|
const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
|
|
9182
8852
|
if (nthread_use < 2) {
|
|
9183
|
-
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0,
|
|
8853
|
+
new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
|
|
9184
8854
|
} else {
|
|
9185
|
-
|
|
8855
|
+
int counter = 0;
|
|
9186
8856
|
new_size = 0;
|
|
9187
|
-
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data,
|
|
8857
|
+
auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
|
|
8858
|
+
nrows, n_per_row, imatrix]() {
|
|
9188
8859
|
std::array<int64_t, 1 << 4> local_hist = {};
|
|
8860
|
+
const int nrows_per_chunk = chunk_size / n_per_row;
|
|
9189
8861
|
size_t local_size = 0;
|
|
9190
8862
|
while (true) {
|
|
9191
8863
|
std::unique_lock<std::mutex> lock(mutex);
|
|
9192
|
-
|
|
9193
|
-
if (
|
|
8864
|
+
int first_row = counter; counter += nrows_per_chunk;
|
|
8865
|
+
if (first_row >= nrows) {
|
|
9194
8866
|
if (local_size > 0) {
|
|
9195
8867
|
for (int j=0; j<int(local_hist.size()); ++j) {
|
|
9196
8868
|
hist_cur[j] += local_hist[j];
|
|
@@ -9200,8 +8872,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
9200
8872
|
break;
|
|
9201
8873
|
}
|
|
9202
8874
|
lock.unlock();
|
|
9203
|
-
|
|
9204
|
-
local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
|
|
8875
|
+
const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
|
|
8876
|
+
local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
|
|
8877
|
+
first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
|
|
9205
8878
|
}
|
|
9206
8879
|
};
|
|
9207
8880
|
for (int it = 0; it < nthread_use - 1; ++it) {
|
|
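The multithreaded quantization above now hands out whole rows: chunk_size is n_per_row rounded up to at least 32*512 elements, nrows_per_chunk = chunk_size / n_per_row, and each worker claims first_row from a shared counter and processes min(nrows - first_row, nrows_per_chunk) rows. A single-threaded sketch of just that partitioning arithmetic (toy tensor shape):

#include <algorithm>
#include <cstdio>

int main() {
    // Toy tensor shape: nelements = nrows * n_per_row
    const int n_per_row = 4096;
    const int nrows     = 1000;

    // Same chunking arithmetic as in the diff above.
    static const int min_chunk_size = 32 * 512;
    const int chunk_size = n_per_row >= min_chunk_size
        ? n_per_row
        : n_per_row * ((min_chunk_size + n_per_row - 1) / n_per_row);
    const int nrows_per_chunk = chunk_size / n_per_row;

    // Each worker repeatedly claims [first_row, first_row + this_nrow) until rows run out.
    int counter  = 0;
    int n_chunks = 0;
    while (true) {
        const int first_row = counter;
        counter += nrows_per_chunk;
        if (first_row >= nrows) break;
        const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
        ++n_chunks;
        if (n_chunks <= 3) {
            printf("chunk %d: rows [%d, %d)\n", n_chunks, first_row, first_row + this_nrow);
        }
    }
    printf("total chunks: %d (chunk_size = %d elements, %d rows each)\n",
           n_chunks, chunk_size, nrows_per_chunk);
    return 0;
}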
@@ -9212,7 +8885,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
9212
8885
|
workers.clear();
|
|
9213
8886
|
}
|
|
9214
8887
|
|
|
9215
|
-
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB
|
|
8888
|
+
LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
|
|
9216
8889
|
int64_t tot_count = 0;
|
|
9217
8890
|
for (size_t i = 0; i < hist_cur.size(); i++) {
|
|
9218
8891
|
hist_all[i] += hist_cur[i];
|
|
@@ -9220,6 +8893,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
9220
8893
|
}
|
|
9221
8894
|
|
|
9222
8895
|
if (tot_count > 0) {
|
|
8896
|
+
LLAMA_LOG_INFO(" | hist: ");
|
|
9223
8897
|
for (size_t i = 0; i < hist_cur.size(); i++) {
|
|
9224
8898
|
LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
|
|
9225
8899
|
}
|
|
@@ -9248,6 +8922,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
|
|
|
9248
8922
|
|
|
9249
8923
|
fout.close();
|
|
9250
8924
|
|
|
8925
|
+
for (auto type : used_iq2) {
|
|
8926
|
+
ggml_deinit_iq2_quantization(type);
|
|
8927
|
+
}
|
|
8928
|
+
|
|
9251
8929
|
gguf_free(ctx_out);
|
|
9252
8930
|
|
|
9253
8931
|
LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
|
|
@@ -9305,48 +8983,23 @@ static int llama_apply_lora_from_file_internal(
|
|
|
9305
8983
|
|
|
9306
8984
|
LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
|
|
9307
8985
|
|
|
9308
|
-
// create a name -> tensor map of the model to accelerate lookups
|
|
9309
|
-
// find the max tensor size to estimate the required temporary buffer size
|
|
9310
|
-
size_t max_tensor_size = 0;
|
|
9311
|
-
std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
|
|
9312
|
-
for (const auto & kv : model.tensors_by_name) {
|
|
9313
|
-
model_tensors.insert(kv);
|
|
9314
|
-
size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
|
|
9315
|
-
max_tensor_size = std::max(max_tensor_size, f32_size);
|
|
9316
|
-
}
|
|
9317
|
-
|
|
9318
|
-
// create a temporary ggml context to store the lora tensors
|
|
9319
|
-
// TODO: use ggml-alloc
|
|
9320
|
-
size_t lora_ctx_size = max_tensor_size * 3;
|
|
9321
|
-
LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
|
|
9322
|
-
std::vector<uint8_t> lora_buf(lora_ctx_size);
|
|
9323
|
-
|
|
9324
|
-
struct ggml_init_params params;
|
|
9325
|
-
params.mem_size = lora_buf.size();
|
|
9326
|
-
params.mem_buffer = lora_buf.data();
|
|
9327
|
-
params.no_alloc = false;
|
|
9328
|
-
|
|
9329
|
-
using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;
|
|
9330
|
-
|
|
9331
|
-
unique_context lora_ctx(nullptr, ggml_free);
|
|
9332
|
-
lora_ctx.reset(ggml_init(params));
|
|
9333
|
-
std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
|
|
9334
|
-
|
|
9335
8986
|
// load base model
|
|
9336
8987
|
std::unique_ptr<llama_model_loader> ml;
|
|
9337
|
-
|
|
9338
|
-
if (path_base_model) {
|
|
8988
|
+
if (path_base_model) {
|
|
9339
8989
|
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
|
|
9340
8990
|
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
|
|
9341
|
-
ml->init_mapping(false); // no prefetching
|
|
8991
|
+
ml->init_mapping(/*prefetch*/ false); // no prefetching
|
|
9342
8992
|
}
|
|
9343
8993
|
|
|
9344
|
-
|
|
9345
|
-
|
|
9346
|
-
|
|
9347
|
-
|
|
9348
|
-
|
|
8994
|
+
struct tensor_meta {
|
|
8995
|
+
std::string name;
|
|
8996
|
+
ggml_type type;
|
|
8997
|
+
int32_t ne[2];
|
|
8998
|
+
size_t offset;
|
|
8999
|
+
};
|
|
9000
|
+
std::map<std::string, tensor_meta> tensor_meta_map;
|
|
9349
9001
|
|
|
9002
|
+
// load all tensor meta
|
|
9350
9003
|
while (true) {
|
|
9351
9004
|
if (fin.tell() == fin.size) {
|
|
9352
9005
|
// eof
|
|
@@ -9359,7 +9012,7 @@ static int llama_apply_lora_from_file_internal(
|
|
|
9359
9012
|
|
|
9360
9013
|
fin.read_raw(&n_dims, sizeof(n_dims));
|
|
9361
9014
|
fin.read_raw(&name_len, sizeof(name_len));
|
|
9362
|
-
fin.read_raw(&ftype,
|
|
9015
|
+
fin.read_raw(&ftype, sizeof(ftype));
|
|
9363
9016
|
|
|
9364
9017
|
if (n_dims != 1 && n_dims != 2) {
|
|
9365
9018
|
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
|
|
@@ -9373,31 +9026,23 @@ static int llama_apply_lora_from_file_internal(
|
|
|
9373
9026
|
|
|
9374
9027
|
std::string name;
|
|
9375
9028
|
{
|
|
9376
|
-
GGML_ASSERT(name_len
|
|
9377
|
-
char buf[
|
|
9029
|
+
GGML_ASSERT(name_len < GGML_MAX_NAME);
|
|
9030
|
+
char buf[GGML_MAX_NAME];
|
|
9378
9031
|
fin.read_raw(buf, name_len);
|
|
9379
9032
|
name = std::string(buf, name_len);
|
|
9380
9033
|
}
|
|
9381
9034
|
|
|
9382
|
-
// check for lora suffix
|
|
9383
|
-
|
|
9384
|
-
|
|
9385
|
-
|
|
9386
|
-
LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
|
|
9387
|
-
return 1;
|
|
9035
|
+
// check for lora suffix
|
|
9036
|
+
std::string lora_suffix;
|
|
9037
|
+
if (name.length() > 6) {
|
|
9038
|
+
lora_suffix = name.substr(name.length() - 6);
|
|
9388
9039
|
}
|
|
9389
|
-
|
|
9390
|
-
|
|
9391
|
-
std::string base_name = name;
|
|
9392
|
-
base_name.erase(pos);
|
|
9393
|
-
// LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());
|
|
9394
|
-
|
|
9395
|
-
if (model_tensors.find(base_name) == model_tensors.end()) {
|
|
9396
|
-
LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
|
|
9040
|
+
if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
|
|
9041
|
+
LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
|
|
9397
9042
|
return 1;
|
|
9398
9043
|
}
|
|
9399
9044
|
|
|
9400
|
-
//
|
|
9045
|
+
// tensor type
|
|
9401
9046
|
ggml_type wtype;
|
|
9402
9047
|
switch (ftype) {
|
|
9403
9048
|
case 0: wtype = GGML_TYPE_F32; break;
|
|
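The loader above treats a LoRA tensor name as the base model tensor name plus a fixed 6-character ".loraA" or ".loraB" suffix. A standalone sketch of that split and validation (the example names are hypothetical):

#include <cstdio>
#include <string>

// Mirror of the suffix handling in the hunk above: a LoRA tensor name is the
// base model tensor name plus ".loraA" or ".loraB".
static bool split_lora_name(const std::string & name, std::string & base, std::string & suffix) {
    if (name.length() <= 6) {
        return false;
    }
    suffix = name.substr(name.length() - 6);
    if (suffix != ".loraA" && suffix != ".loraB") {
        return false;
    }
    base = name.substr(0, name.length() - 6);
    return true;
}

int main() {
    std::string base, suffix;
    if (split_lora_name("blk.0.attn_q.weight.loraA", base, suffix)) {
        printf("base = %s, suffix = %s\n", base.c_str(), suffix.c_str());
    }
    if (!split_lora_name("tok_embeddings.weight", base, suffix)) {
        printf("not a lora tensor\n");
    }
    return 0;
}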
@@ -9409,122 +9054,177 @@ static int llama_apply_lora_from_file_internal(
|
|
|
9409
9054
|
return false;
|
|
9410
9055
|
}
|
|
9411
9056
|
}
|
|
9412
|
-
ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
|
|
9413
|
-
ggml_set_name(lora_tensor, name.c_str());
|
|
9414
9057
|
|
|
9415
|
-
//
|
|
9058
|
+
// data offset
|
|
9416
9059
|
size_t offset = fin.tell();
|
|
9417
|
-
size_t tensor_data_size = ggml_nbytes(lora_tensor);
|
|
9418
9060
|
offset = (offset + 31) & -32;
|
|
9419
|
-
fin.seek(offset, SEEK_SET);
|
|
9420
|
-
fin.read_raw(lora_tensor->data, tensor_data_size);
|
|
9421
9061
|
|
|
9422
|
-
|
|
9062
|
+
// skip tensor data
|
|
9063
|
+
fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
|
|
9064
|
+
|
|
9065
|
+
tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
|
|
9066
|
+
}
|
|
9067
|
+
|
|
9068
|
+
bool warned = false;
|
|
9069
|
+
int n_tensors = 0;
|
|
9423
9070
|
|
|
9424
|
-
|
|
9425
|
-
|
|
9426
|
-
|
|
9071
|
+
// apply
|
|
9072
|
+
ggml_backend_t backend_cpu = ggml_backend_cpu_init();
|
|
9073
|
+
if (backend_cpu == nullptr) {
|
|
9074
|
+
LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
|
|
9075
|
+
return 1;
|
|
9076
|
+
}
|
|
9077
|
+
ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
|
|
9427
9078
|
|
|
9428
|
-
|
|
9079
|
+
std::vector<no_init<uint8_t>> read_buf;
|
|
9080
|
+
for (const auto & it : model.tensors_by_name) {
|
|
9081
|
+
const std::string & base_name = it.first;
|
|
9082
|
+
ggml_tensor * model_t = it.second;
|
|
9429
9083
|
|
|
9430
|
-
|
|
9431
|
-
|
|
9084
|
+
if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
|
|
9085
|
+
tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
|
|
9086
|
+
continue;
|
|
9087
|
+
}
|
|
9432
9088
|
|
|
9433
|
-
|
|
9434
|
-
|
|
9435
|
-
if (dest_t->type != GGML_TYPE_F16) {
|
|
9436
|
-
throw std::runtime_error(format(
|
|
9437
|
-
"%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models. dest_t->type: %d", __func__, dest_t->type));
|
|
9438
|
-
}
|
|
9439
|
-
offload_func = ggml_cuda_assign_buffers;
|
|
9440
|
-
offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
|
|
9441
|
-
}
|
|
9442
|
-
#endif // GGML_USE_CUBLAS
|
|
9089
|
+
tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
|
|
9090
|
+
tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
|
|
9443
9091
|
|
|
9444
|
-
|
|
9445
|
-
|
|
9446
|
-
|
|
9092
|
+
ggml_init_params lora_init_params = {
|
|
9093
|
+
/* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
|
|
9094
|
+
/* .mem_buffer */ nullptr,
|
|
9095
|
+
/* .no_alloc */ true,
|
|
9096
|
+
};
|
|
9097
|
+
ggml_context * lora_ctx = ggml_init(lora_init_params);
|
|
9098
|
+
if (lora_ctx == nullptr) {
|
|
9099
|
+
LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
|
|
9100
|
+
ggml_backend_free(backend_cpu);
|
|
9101
|
+
return 1;
|
|
9102
|
+
}
|
|
9447
9103
|
|
|
9448
|
-
|
|
9449
|
-
|
|
9450
|
-
|
|
9451
|
-
|
|
9452
|
-
|
|
9104
|
+
// create tensors
|
|
9105
|
+
ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
|
|
9106
|
+
ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
|
|
9107
|
+
ggml_set_name(loraA, metaA.name.c_str());
|
|
9108
|
+
ggml_set_name(loraB, metaB.name.c_str());
|
|
9453
9109
|
|
|
9454
|
-
|
|
9455
|
-
|
|
9456
|
-
|
|
9457
|
-
|
|
9110
|
+
ggml_tensor * base_t;
|
|
9111
|
+
if (ml) {
|
|
9112
|
+
if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
|
|
9113
|
+
LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
|
|
9114
|
+
return 1;
|
|
9458
9115
|
}
|
|
9116
|
+
base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
|
|
9117
|
+
} else {
|
|
9118
|
+
base_t = ggml_dup_tensor(lora_ctx, model_t);
|
|
9119
|
+
}
|
|
9120
|
+
ggml_set_name(base_t, base_name.c_str());
|
|
9459
9121
|
|
|
9460
|
-
|
|
9461
|
-
|
|
9462
|
-
|
|
9463
|
-
|
|
9464
|
-
|
|
9465
|
-
|
|
9466
|
-
}
|
|
9122
|
+
// allocate in backend buffer
|
|
9123
|
+
ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
|
|
9124
|
+
if (lora_buf == nullptr) {
|
|
9125
|
+
LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
|
|
9126
|
+
return 1;
|
|
9127
|
+
}
|
|
9467
9128
|
|
|
9468
|
-
|
|
9469
|
-
|
|
9470
|
-
|
|
9129
|
+
// load tensor data
|
|
9130
|
+
auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
|
|
9131
|
+
read_buf.resize(ggml_nbytes(tensor));
|
|
9132
|
+
fin.seek(tensor_meta.offset, SEEK_SET);
|
|
9133
|
+
fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
|
|
9134
|
+
ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
|
|
9135
|
+
};
|
|
9136
|
+
load_tensor(metaA, loraA);
|
|
9137
|
+
load_tensor(metaB, loraB);
|
|
9471
9138
|
|
|
9472
|
-
|
|
9473
|
-
|
|
9474
|
-
|
|
9139
|
+
// load base model tensor data
|
|
9140
|
+
if (ml) {
|
|
9141
|
+
ml->load_data_for(base_t);
|
|
9142
|
+
} else {
|
|
9143
|
+
ggml_backend_tensor_copy(model_t, base_t);
|
|
9144
|
+
}
|
|
9475
9145
|
|
|
9476
|
-
|
|
9477
|
-
|
|
9478
|
-
|
|
9479
|
-
|
|
9480
|
-
|
|
9146
|
+
if (ggml_is_quantized(base_t->type) && !warned) {
|
|
9147
|
+
LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
|
|
9148
|
+
"use a f16 or f32 base model with --lora-base\n", __func__);
|
|
9149
|
+
warned = true;
|
|
9150
|
+
}
|
|
9151
|
+
|
|
9152
|
+
if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
|
|
9153
|
+
LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
|
|
9154
|
+
" are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
|
|
9155
|
+
ggml_free(lora_ctx);
|
|
9156
|
+
ggml_backend_buffer_free(lora_buf);
|
|
9157
|
+
ggml_backend_free(backend_cpu);
|
|
9158
|
+
return 1;
|
|
9159
|
+
}
|
|
9481
9160
|
|
|
9161
|
+
auto build_lora_graph = [&]() {
|
|
9482
9162
|
// w = w + BA*s
|
|
9483
|
-
ggml_tensor * BA = ggml_mul_mat(lora_ctx
|
|
9484
|
-
offload_func(BA);
|
|
9163
|
+
ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
|
|
9485
9164
|
ggml_set_name(BA, "BA");
|
|
9486
9165
|
|
|
9487
9166
|
if (scaling != 1.0f) {
|
|
9488
|
-
BA =
|
|
9489
|
-
offload_func(BA);
|
|
9167
|
+
BA = ggml_scale(lora_ctx, BA, scaling);
|
|
9490
9168
|
ggml_set_name(BA, "BA_scaled");
|
|
9491
9169
|
}
|
|
9492
9170
|
|
|
9493
9171
|
ggml_tensor * r;
|
|
9494
|
-
|
|
9495
|
-
|
|
9496
|
-
offload_func_force_inplace(r);
|
|
9497
|
-
ggml_set_name(r, "r_add_inplace");
|
|
9498
|
-
}
|
|
9499
|
-
else {
|
|
9500
|
-
r = ggml_add(lora_ctx.get(), base_t, BA);
|
|
9501
|
-
offload_func(r);
|
|
9502
|
-
ggml_set_name(r, "r_add");
|
|
9172
|
+
r = ggml_add_inplace(lora_ctx, base_t, BA);
|
|
9173
|
+
ggml_set_name(r, "r_add");
|
|
9503
9174
|
|
|
9504
|
-
|
|
9505
|
-
|
|
9506
|
-
|
|
9175
|
+
if (base_t->type != model_t->type) {
|
|
9176
|
+
// convert the result to the model type
|
|
9177
|
+
r = ggml_cast(lora_ctx, r, model_t->type);
|
|
9178
|
+
ggml_set_name(r, "r_cast");
|
|
9507
9179
|
}
|
|
9508
9180
|
|
|
9509
|
-
|
|
9510
|
-
|
|
9181
|
+
return r;
|
|
9182
|
+
};
|
|
9183
|
+
|
|
9184
|
+
ggml_cgraph * gf = ggml_new_graph(lora_ctx);
|
|
9185
|
+
ggml_tensor * r = build_lora_graph();
|
|
9186
|
+
ggml_build_forward_expand(gf, r);
|
|
9511
9187
|
|
|
9512
|
-
|
|
9188
|
+
ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
|
|
9189
|
+
if (graph_buf == nullptr) {
|
|
9190
|
+
LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
|
|
9191
|
+
ggml_free(lora_ctx);
|
|
9192
|
+
ggml_backend_buffer_free(lora_buf);
|
|
9193
|
+
ggml_backend_free(backend_cpu);
|
|
9194
|
+
return 1;
|
|
9195
|
+
}
|
|
9513
9196
|
|
|
9514
|
-
|
|
9515
|
-
GGML_ASSERT(lora_tensors.size() == 2);
|
|
9197
|
+
ggml_backend_graph_compute(backend_cpu, gf);
|
|
9516
9198
|
|
|
9517
|
-
|
|
9518
|
-
lora_ctx.reset(ggml_init(params));
|
|
9519
|
-
lora_tensors.clear();
|
|
9199
|
+
ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
|
|
9520
9200
|
|
|
9521
|
-
|
|
9522
|
-
|
|
9523
|
-
|
|
9524
|
-
|
|
9201
|
+
#if 0
|
|
9202
|
+
// TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
|
|
9203
|
+
//ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
|
|
9204
|
+
|
|
9205
|
+
// sched compute
|
|
9206
|
+
ggml_build_forward_expand(gf, build_graph());
|
|
9207
|
+
ggml_backend_sched_init_measure(sched, gf);
|
|
9208
|
+
|
|
9209
|
+
// create the graph again, since the previous one was destroyed by the measure
|
|
9210
|
+
ggml_graph_clear(gf);
|
|
9211
|
+
ggml_build_forward_expand(gf, build_graph());
|
|
9212
|
+
ggml_backend_sched_graph_compute(sched, gf);
|
|
9213
|
+
ggml_backend_sched_free(sched);
|
|
9214
|
+
#endif
|
|
9215
|
+
|
|
9216
|
+
ggml_backend_buffer_free(lora_buf);
|
|
9217
|
+
ggml_backend_buffer_free(graph_buf);
|
|
9218
|
+
ggml_free(lora_ctx);
|
|
9219
|
+
|
|
9220
|
+
n_tensors++;
|
|
9221
|
+
if (n_tensors % 4 == 0) {
|
|
9222
|
+
LLAMA_LOG_INFO(".");
|
|
9525
9223
|
}
|
|
9526
9224
|
}
|
|
9527
9225
|
|
|
9226
|
+
ggml_backend_free(backend_cpu);
|
|
9227
|
+
|
|
9528
9228
|
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
|
|
9529
9229
|
LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
|
|
9530
9230
|
|
|
@@ -9537,6 +9237,7 @@ static int llama_apply_lora_from_file_internal(
|
|
|
9537
9237
|
struct llama_model_params llama_model_default_params() {
|
|
9538
9238
|
struct llama_model_params result = {
|
|
9539
9239
|
/*.n_gpu_layers =*/ 0,
|
|
9240
|
+
/*.split_mode =*/ LLAMA_SPLIT_LAYER,
|
|
9540
9241
|
/*.main_gpu =*/ 0,
|
|
9541
9242
|
/*.tensor_split =*/ nullptr,
|
|
9542
9243
|
/*.progress_callback =*/ nullptr,
|
|
@@ -9548,7 +9249,8 @@ struct llama_model_params llama_model_default_params() {
|
|
|
9548
9249
|
};
|
|
9549
9250
|
|
|
9550
9251
|
#ifdef GGML_USE_METAL
|
|
9551
|
-
|
|
9252
|
+
// note: we usually have plenty of VRAM, so by default offload all layers to the GPU
|
|
9253
|
+
result.n_gpu_layers = 999;
|
|
9552
9254
|
#endif
|
|
9553
9255
|
|
|
9554
9256
|
return result;
|
|
@@ -9588,6 +9290,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
|
|
|
9588
9290
|
/*.quantize_output_tensor =*/ true,
|
|
9589
9291
|
/*.only_copy =*/ false,
|
|
9590
9292
|
/*.pure =*/ false,
|
|
9293
|
+
/*.imatrix =*/ nullptr,
|
|
9591
9294
|
};
|
|
9592
9295
|
|
|
9593
9296
|
return result;
|
|
@@ -9738,41 +9441,53 @@ struct llama_context * llama_new_context_with_model(
|
|
|
9738
9441
|
GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
|
|
9739
9442
|
GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
|
|
9740
9443
|
|
|
9741
|
-
// reserve memory for context buffers
|
|
9742
9444
|
if (!hparams.vocab_only) {
|
|
9743
|
-
// initialize
|
|
9445
|
+
// initialize backends
|
|
9744
9446
|
#ifdef GGML_USE_METAL
|
|
9745
9447
|
if (model->n_gpu_layers > 0) {
|
|
9746
|
-
ctx->
|
|
9747
|
-
if (ctx->
|
|
9448
|
+
ctx->backend_metal = ggml_backend_metal_init();
|
|
9449
|
+
if (ctx->backend_metal == nullptr) {
|
|
9748
9450
|
LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
|
|
9451
|
+
llama_free(ctx);
|
|
9452
|
+
return nullptr;
|
|
9749
9453
|
}
|
|
9454
|
+
ctx->backends.push_back(ctx->backend_metal);
|
|
9750
9455
|
}
|
|
9751
|
-
#elif defined(GGML_USE_CUBLAS)
|
|
9752
|
-
// for testing only
|
|
9456
|
+
#elif defined(GGML_USE_CUBLAS)
|
|
9753
9457
|
if (model->n_gpu_layers > 0) {
|
|
9754
|
-
|
|
9755
|
-
if (
|
|
9756
|
-
|
|
9458
|
+
// with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
|
|
9459
|
+
if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) {
|
|
9460
|
+
ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
|
|
9461
|
+
if (backend == nullptr) {
|
|
9462
|
+
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
|
|
9463
|
+
llama_free(ctx);
|
|
9464
|
+
return nullptr;
|
|
9465
|
+
}
|
|
9466
|
+
ctx->backends.push_back(backend);
|
|
9467
|
+
} else {
|
|
9468
|
+
// LLAMA_SPLIT_LAYER requires a backend for each GPU
|
|
9469
|
+
for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
|
|
9470
|
+
ggml_backend_t backend = ggml_backend_cuda_init(device);
|
|
9471
|
+
if (backend == nullptr) {
|
|
9472
|
+
LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
|
|
9473
|
+
llama_free(ctx);
|
|
9474
|
+
return nullptr;
|
|
9475
|
+
}
|
|
9476
|
+
ctx->backends.push_back(backend);
|
|
9477
|
+
}
|
|
9757
9478
|
}
|
|
9758
9479
|
}
|
|
9759
9480
|
#endif
|
|
9760
|
-
|
|
9761
|
-
if (ctx->
|
|
9762
|
-
|
|
9763
|
-
|
|
9764
|
-
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
|
9765
|
-
}
|
|
9766
|
-
}
|
|
9767
|
-
|
|
9768
|
-
if (ctx->backend == nullptr) {
|
|
9769
|
-
LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__);
|
|
9770
|
-
delete ctx;
|
|
9481
|
+
ctx->backend_cpu = ggml_backend_cpu_init();
|
|
9482
|
+
if (ctx->backend_cpu == nullptr) {
|
|
9483
|
+
LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
|
|
9484
|
+
llama_free(ctx);
|
|
9771
9485
|
return nullptr;
|
|
9772
9486
|
}
|
|
9487
|
+
ctx->backends.push_back(ctx->backend_cpu);
|
|
9773
9488
|
|
|
9774
|
-
if (!llama_kv_cache_init(ctx->
|
|
9775
|
-
cparams.n_ctx,
|
|
9489
|
+
if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
|
|
9490
|
+
cparams.n_ctx, cparams.offload_kqv)) {
|
|
9776
9491
|
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
|
|
9777
9492
|
llama_free(ctx);
|
|
9778
9493
|
return nullptr;
|
|
@@ -9796,23 +9511,30 @@ struct llama_context * llama_new_context_with_model(
|
|
|
9796
9511
|
ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
|
|
9797
9512
|
}
|
|
9798
9513
|
|
|
9799
|
-
// resized during inference
|
|
9800
|
-
|
|
9801
|
-
ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
|
|
9802
|
-
} else {
|
|
9803
|
-
ctx->logits.reserve(hparams.n_vocab);
|
|
9804
|
-
}
|
|
9514
|
+
// resized during inference, reserve maximum
|
|
9515
|
+
ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
|
|
9805
9516
|
|
|
9806
9517
|
if (params.embedding){
|
|
9807
9518
|
ctx->embedding.resize(hparams.n_embd);
|
|
9808
9519
|
}
|
|
9809
9520
|
|
|
9810
9521
|
{
|
|
9811
|
-
//
|
|
9522
|
+
// buffer types used for the compute buffer of each backend
|
|
9523
|
+
std::vector<ggml_backend_buffer_type_t> backend_buft;
|
|
9524
|
+
for (auto * backend : ctx->backends) {
|
|
9525
|
+
if (ggml_backend_is_cpu(backend)) {
|
|
9526
|
+
// use host buffers for the CPU backend compute buffer
|
|
9527
|
+
backend_buft.push_back(llama_default_buffer_type_cpu(true));
|
|
9528
|
+
} else {
|
|
9529
|
+
backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
|
|
9530
|
+
}
|
|
9531
|
+
}
|
|
9532
|
+
|
|
9533
|
+
// buffer used to store the computation graph and the tensor meta data
|
|
9812
9534
|
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
|
|
9813
9535
|
|
|
9814
|
-
|
|
9815
|
-
ctx->alloc =
|
|
9536
|
+
ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
|
|
9537
|
+
ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
|
|
9816
9538
|
|
|
9817
9539
|
// build worst-case graph
|
|
9818
9540
|
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
|
|
@@ -9820,50 +9542,19 @@ struct llama_context * llama_new_context_with_model(
|
|
|
9820
9542
|
llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
|
|
9821
9543
|
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
|
|
9822
9544
|
|
|
9823
|
-
//
|
|
9824
|
-
|
|
9825
|
-
|
|
9826
|
-
|
|
9827
|
-
|
|
9828
|
-
|
|
9829
|
-
ggml_allocr_free(ctx->alloc);
|
|
9830
|
-
|
|
9831
|
-
ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size);
|
|
9832
|
-
ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
|
|
9833
|
-
#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
|
|
9834
|
-
if (model->n_gpu_layers > 0) {
|
|
9835
|
-
// the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets
|
|
9836
|
-
ggml_cuda_set_scratch_size(alloc_size + 64);
|
|
9837
|
-
LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
|
|
9838
|
-
|
|
9839
|
-
// calculate total VRAM usage
|
|
9840
|
-
auto add_tensor = [](const ggml_tensor * t, size_t & size) {
|
|
9841
|
-
if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
|
|
9842
|
-
size += ggml_nbytes(t);
|
|
9843
|
-
}
|
|
9844
|
-
};
|
|
9845
|
-
size_t model_vram_size = 0;
|
|
9846
|
-
for (const auto & kv : model->tensors_by_name) {
|
|
9847
|
-
add_tensor(kv.second, model_vram_size);
|
|
9848
|
-
}
|
|
9849
|
-
|
|
9850
|
-
size_t kv_vram_size = 0;
|
|
9851
|
-
for (auto & k : ctx->kv_self.k_l) {
|
|
9852
|
-
add_tensor(k, kv_vram_size);
|
|
9853
|
-
}
|
|
9854
|
-
for (auto & v : ctx->kv_self.v_l) {
|
|
9855
|
-
add_tensor(v, kv_vram_size);
|
|
9856
|
-
}
|
|
9857
|
-
|
|
9858
|
-
size_t ctx_vram_size = alloc_size + kv_vram_size;
|
|
9859
|
-
size_t total_vram_size = model_vram_size + ctx_vram_size;
|
|
9545
|
+
// initialize scheduler with the worst-case graph
|
|
9546
|
+
ggml_backend_sched_init_measure(ctx->sched, gf);
|
|
9547
|
+
// note: the number of splits during measure is higher than during inference due to the kv shift
|
|
9548
|
+
int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
|
|
9549
|
+
LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
|
|
9550
|
+
ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
|
|
9860
9551
|
|
|
9861
|
-
|
|
9862
|
-
|
|
9863
|
-
|
|
9864
|
-
|
|
9552
|
+
for (ggml_backend_t backend : ctx->backends) {
|
|
9553
|
+
ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
|
|
9554
|
+
LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
|
|
9555
|
+
ggml_backend_buffer_name(buf),
|
|
9556
|
+
ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
|
|
9865
9557
|
}
|
|
9866
|
-
#endif
|
|
9867
9558
|
}
|
|
9868
9559
|
}
|
|
9869
9560
|
|
|
@@ -9960,9 +9651,8 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
|
|
|
9960
9651
|
}
|
|
9961
9652
|
|
|
9962
9653
|
int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
|
|
9963
|
-
return snprintf(buf, buf_size, "%s %s
|
|
9654
|
+
return snprintf(buf, buf_size, "%s %s %s",
|
|
9964
9655
|
llama_model_arch_name(model->arch).c_str(),
|
|
9965
|
-
model->hparams.n_expert > 0 ? (std::to_string(model->hparams.n_expert) + "x").c_str() : "",
|
|
9966
9656
|
llama_model_type_name(model->type),
|
|
9967
9657
|
llama_model_ftype_name(model->ftype).c_str());
|
|
9968
9658
|
}
|
|
@@ -9984,7 +9674,14 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
|
|
|
9984
9674
|
}
|
|
9985
9675
|
|
|
9986
9676
|
struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
|
|
9987
|
-
|
|
9677
|
+
auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
|
|
9678
|
+
[name](const std::pair<std::string, struct ggml_tensor *> & it) {
|
|
9679
|
+
return it.first == name;
|
|
9680
|
+
});
|
|
9681
|
+
if (it == model->tensors_by_name.end()) {
|
|
9682
|
+
return nullptr;
|
|
9683
|
+
}
|
|
9684
|
+
return it->second;
|
|
9988
9685
|
}
|
|
9989
9686
|
|
|
9990
9687
|
uint32_t llama_model_quantize(
|
|
@@ -10141,28 +9838,39 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
|
|
|
10141
9838
|
}
|
|
10142
9839
|
|
|
10143
9840
|
void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
|
|
9841
|
+
if (delta == 0) {
|
|
9842
|
+
return;
|
|
9843
|
+
}
|
|
9844
|
+
|
|
10144
9845
|
llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
|
|
10145
9846
|
}
|
|
10146
9847
|
|
|
9848
|
+
void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
|
|
9849
|
+
if (d == 1) {
|
|
9850
|
+
return;
|
|
9851
|
+
}
|
|
9852
|
+
|
|
9853
|
+
llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
|
|
9854
|
+
}
|
|
9855
|
+
|
|
10147
9856
|
// Returns the *maximum* size of the state
|
|
10148
9857
|
size_t llama_get_state_size(const struct llama_context * ctx) {
|
|
10149
9858
|
// we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
|
|
10150
9859
|
// for reference, std::mt19937(1337) serializes to 6701 bytes.
|
|
10151
9860
|
const size_t s_rng_size = sizeof(size_t);
|
|
10152
9861
|
const size_t s_rng = LLAMA_MAX_RNG_STATE;
|
|
10153
|
-
const size_t s_logits_capacity = sizeof(size_t);
|
|
10154
9862
|
const size_t s_logits_size = sizeof(size_t);
|
|
9863
|
+
// assume worst case for logits although only currently set ones are serialized
|
|
10155
9864
|
const size_t s_logits = ctx->logits.capacity() * sizeof(float);
|
|
10156
9865
|
const size_t s_embedding_size = sizeof(size_t);
|
|
10157
9866
|
const size_t s_embedding = ctx->embedding.size() * sizeof(float);
|
|
10158
9867
|
const size_t s_kv_size = sizeof(size_t);
|
|
10159
9868
|
const size_t s_kv_ntok = sizeof(int);
|
|
10160
|
-
const size_t s_kv =
|
|
9869
|
+
const size_t s_kv = ctx->kv_self.total_size();
|
|
10161
9870
|
|
|
10162
9871
|
const size_t s_total = (
|
|
10163
9872
|
+ s_rng_size
|
|
10164
9873
|
+ s_rng
|
|
10165
|
-
+ s_logits_capacity
|
|
10166
9874
|
+ s_logits_size
|
|
10167
9875
|
+ s_logits
|
|
10168
9876
|
+ s_embedding_size
|
|
@@ -10231,37 +9939,27 @@ struct llama_data_file_context : llama_data_context {
|
|
|
10231
9939
|
static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
|
|
10232
9940
|
// copy rng
|
|
10233
9941
|
{
|
|
10234
|
-
std::
|
|
9942
|
+
std::ostringstream rng_ss;
|
|
10235
9943
|
rng_ss << ctx->rng;
|
|
10236
9944
|
|
|
10237
|
-
const
|
|
10238
|
-
|
|
9945
|
+
const std::string & rng_str = rng_ss.str();
|
|
9946
|
+
const size_t rng_size = rng_str.size();
|
|
10239
9947
|
|
|
10240
|
-
|
|
10241
|
-
memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
|
|
9948
|
+
GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
|
|
10242
9949
|
|
|
10243
|
-
data_ctx->write(&rng_size,
|
|
10244
|
-
data_ctx->write(
|
|
9950
|
+
data_ctx->write(&rng_size, sizeof(rng_size));
|
|
9951
|
+
data_ctx->write(rng_str.data(), rng_size);
|
|
10245
9952
|
}
|
|
10246
9953
|
|
|
10247
9954
|
// copy logits
|
|
10248
9955
|
{
|
|
10249
|
-
const size_t logits_cap = ctx->logits.capacity();
|
|
10250
9956
|
const size_t logits_size = ctx->logits.size();
|
|
10251
9957
|
|
|
10252
|
-
data_ctx->write(&logits_cap, sizeof(logits_cap));
|
|
10253
9958
|
data_ctx->write(&logits_size, sizeof(logits_size));
|
|
10254
9959
|
|
|
10255
9960
|
if (logits_size) {
|
|
10256
9961
|
data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
|
|
10257
9962
|
}
|
|
10258
|
-
|
|
10259
|
-
// If there is a gap between the size and the capacity, write padding
|
|
10260
|
-
size_t padding_size = (logits_cap - logits_size) * sizeof(float);
|
|
10261
|
-
if (padding_size > 0) {
|
|
10262
|
-
std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
|
|
10263
|
-
data_ctx->write(padding.data(), padding_size);
|
|
10264
|
-
}
|
|
10265
9963
|
}
|
|
10266
9964
|
|
|
10267
9965
|
// copy embeddings
|
|
@@ -10286,7 +9984,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
|
10286
9984
|
const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
|
|
10287
9985
|
const auto n_ctx = cparams.n_ctx;
|
|
10288
9986
|
|
|
10289
|
-
const size_t kv_buf_size =
|
|
9987
|
+
const size_t kv_buf_size = kv_self.total_size();
|
|
10290
9988
|
const uint32_t kv_head = kv_self.head;
|
|
10291
9989
|
const uint32_t kv_size = kv_self.size;
|
|
10292
9990
|
const uint32_t kv_used = kv_self.used;
|
|
@@ -10299,46 +9997,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
|
|
|
10299
9997
|
if (kv_buf_size) {
|
|
10300
9998
|
const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
|
|
10301
9999
|
|
|
10302
|
-
ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
|
|
10303
|
-
ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
|
|
10304
|
-
|
|
10305
|
-
std::vector<struct ggml_tensor *> kout2d(n_layer);
|
|
10306
|
-
std::vector<struct ggml_tensor *> vout2d(n_layer);
|
|
10307
|
-
|
|
10308
|
-
for (int il = 0; il < (int) n_layer; ++il) {
|
|
10309
|
-
kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
|
|
10310
|
-
vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
|
|
10311
|
-
|
|
10312
|
-
ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
|
|
10313
|
-
n_embd_k_gqa, kv_head,
|
|
10314
|
-
elt_size*n_embd_k_gqa, 0);
|
|
10315
|
-
|
|
10316
|
-
ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
|
|
10317
|
-
kv_head, n_embd_v_gqa,
|
|
10318
|
-
elt_size*n_ctx, 0);
|
|
10319
|
-
|
|
10320
|
-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
|
|
10321
|
-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il]));
|
|
10322
|
-
}
|
|
10323
|
-
|
|
10324
|
-
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
|
|
10325
|
-
|
|
10326
|
-
ggml_backend_graph_compute(ctx->backend, gf);
|
|
10327
|
-
|
|
10328
10000
|
std::vector<uint8_t> tmp_buf;
|
|
10329
10001
|
for (int il = 0; il < (int) n_layer; ++il) {
|
|
10330
|
-
tmp_buf.resize(
|
|
10331
|
-
ggml_backend_tensor_get(
|
|
10002
|
+
tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head);
|
|
10003
|
+
ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
|
|
10332
10004
|
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
|
10333
10005
|
|
|
10334
|
-
|
|
10335
|
-
|
|
10336
|
-
|
|
10006
|
+
// v is not contiguous, copy row by row
|
|
10007
|
+
tmp_buf.resize(elt_size*kv_head);
|
|
10008
|
+
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
|
10009
|
+
ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
|
|
10010
|
+
data_ctx->write(tmp_buf.data(), tmp_buf.size());
|
|
10011
|
+
}
|
|
10337
10012
|
}
|
|
10338
|
-
|
|
10339
|
-
ggml_free(cpy_ctx);
|
|
10340
|
-
|
|
10341
|
-
ggml_backend_buffer_free(buf);
|
|
10342
10013
|
}
|
|
10343
10014
|
|
|
10344
10015
|
for (uint32_t i = 0; i < kv_size; ++i) {
|
|
@@ -10371,13 +10042,13 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
|
10371
10042
|
// set rng
|
|
10372
10043
|
{
|
|
10373
10044
|
size_t rng_size;
|
|
10374
|
-
|
|
10045
|
+
memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
|
|
10375
10046
|
|
|
10376
|
-
|
|
10377
|
-
memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
|
|
10047
|
+
GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
|
|
10378
10048
|
|
|
10379
|
-
std::
|
|
10380
|
-
|
|
10049
|
+
std::string rng_str((char *)inp, rng_size); inp += rng_size;
|
|
10050
|
+
|
|
10051
|
+
std::istringstream rng_ss(rng_str);
|
|
10381
10052
|
rng_ss >> ctx->rng;
|
|
10382
10053
|
|
|
10383
10054
|
GGML_ASSERT(!rng_ss.fail());
|
|
@@ -10385,20 +10056,18 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
|
10385
10056
|
|
|
10386
10057
|
// set logits
|
|
10387
10058
|
{
|
|
10388
|
-
size_t logits_cap;
|
|
10389
10059
|
size_t logits_size;
|
|
10390
10060
|
|
|
10391
|
-
memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
|
|
10392
10061
|
memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
|
|
10393
10062
|
|
|
10394
|
-
GGML_ASSERT(ctx->logits.capacity()
|
|
10063
|
+
GGML_ASSERT(ctx->logits.capacity() >= logits_size);
|
|
10395
10064
|
|
|
10396
10065
|
if (logits_size) {
|
|
10397
10066
|
ctx->logits.resize(logits_size);
|
|
10067
|
+
|
|
10398
10068
|
memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
|
|
10069
|
+
inp += logits_size * sizeof(float);
|
|
10399
10070
|
}
|
|
10400
|
-
|
|
10401
|
-
inp += logits_cap * sizeof(float);
|
|
10402
10071
|
}
|
|
10403
10072
|
|
|
10404
10073
|
// set embeddings
|
|
@@ -10437,48 +10106,22 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
|
|
|
10437
10106
|
memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
|
|
10438
10107
|
|
|
10439
10108
|
if (kv_buf_size) {
|
|
10440
|
-
GGML_ASSERT(
|
|
10109
|
+
GGML_ASSERT(kv_self.total_size() == kv_buf_size);
|
|
10441
10110
|
|
|
10442
10111
|
const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
|
|
10443
10112
|
|
|
10444
|
-
|
|
10445
|
-
|
|
10446
|
-
|
|
10447
|
-
|
|
10448
|
-
|
|
10449
|
-
|
|
10450
|
-
|
|
10451
|
-
|
|
10452
|
-
|
|
10453
|
-
|
|
10454
|
-
|
|
10455
|
-
n_embd_k_gqa, kv_head,
|
|
10456
|
-
elt_size*n_embd_k_gqa, 0);
|
|
10457
|
-
|
|
10458
|
-
ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
|
|
10459
|
-
kv_head, n_embd_v_gqa,
|
|
10460
|
-
elt_size*n_ctx, 0);
|
|
10461
|
-
|
|
10462
|
-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
|
|
10463
|
-
ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d));
|
|
10464
|
-
}
|
|
10465
|
-
|
|
10466
|
-
ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
|
|
10467
|
-
|
|
10468
|
-
// load data into the tensors
|
|
10469
|
-
for (int il = 0; il < n_layer; ++il) {
|
|
10470
|
-
ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il]));
|
|
10471
|
-
inp += ggml_nbytes(kin2d[il]);
|
|
10472
|
-
|
|
10473
|
-
ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il]));
|
|
10474
|
-
inp += ggml_nbytes(vin2d[il]);
|
|
10113
|
+
for (int il = 0; il < (int) n_layer; ++il) {
|
|
10114
|
+
size_t k_size = elt_size*n_embd_k_gqa*kv_head;
|
|
10115
|
+
ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
|
|
10116
|
+
inp += k_size;
|
|
10117
|
+
|
|
10118
|
+
// v is not contiguous, copy row by row
|
|
10119
|
+
size_t v_row_size = elt_size*kv_head;
|
|
10120
|
+
for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
|
|
10121
|
+
ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
|
|
10122
|
+
inp += v_row_size;
|
|
10123
|
+
}
|
|
10475
10124
|
}
|
|
10476
|
-
|
|
10477
|
-
ggml_backend_graph_compute(ctx->backend, gf);
|
|
10478
|
-
|
|
10479
|
-
ggml_free(cpy_ctx);
|
|
10480
|
-
|
|
10481
|
-
ggml_backend_buffer_free(buf);
|
|
10482
10125
|
}
|
|
10483
10126
|
|
|
10484
10127
|
ctx->kv_self.head = kv_head;
|
|
@@ -10794,6 +10437,8 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
|
|
|
10794
10437
|
if (0 <= token && token < llama_n_vocab(model)) {
|
|
10795
10438
|
switch (llama_vocab_get_type(model->vocab)) {
|
|
10796
10439
|
case LLAMA_VOCAB_TYPE_SPM: {
|
|
10440
|
+
// NOTE: we accept all unsupported token types,
|
|
10441
|
+
// suppressing them like CONTROL tokens.
|
|
10797
10442
|
if (llama_is_normal_token(model->vocab, token)) {
|
|
10798
10443
|
std::string result = model->vocab.id_to_token[token].text;
|
|
10799
10444
|
llama_unescape_whitespace(result);
|
|
@@ -10802,6 +10447,13 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
|
|
|
10802
10447
|
}
|
|
10803
10448
|
memcpy(buf, result.c_str(), result.length());
|
|
10804
10449
|
return result.length();
|
|
10450
|
+
} else if (llama_is_user_defined_token(model->vocab, token)) {
|
|
10451
|
+
std::string result = model->vocab.id_to_token[token].text;
|
|
10452
|
+
if (length < (int) result.length()) {
|
|
10453
|
+
return -result.length();
|
|
10454
|
+
}
|
|
10455
|
+
memcpy(buf, result.c_str(), result.length());
|
|
10456
|
+
return result.length();
|
|
10805
10457
|
} else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
|
|
10806
10458
|
if (length < 3) {
|
|
10807
10459
|
return -3;
|
|
@@ -10816,14 +10468,12 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
|
|
|
10816
10468
|
}
|
|
10817
10469
|
buf[0] = llama_token_to_byte(model->vocab, token);
|
|
10818
10470
|
return 1;
|
|
10819
|
-
} else {
|
|
10820
|
-
// TODO: for now we accept all unsupported token types,
|
|
10821
|
-
// suppressing them like CONTROL tokens.
|
|
10822
|
-
// GGML_ASSERT(false);
|
|
10823
10471
|
}
|
|
10824
10472
|
break;
|
|
10825
10473
|
}
|
|
10826
10474
|
case LLAMA_VOCAB_TYPE_BPE: {
|
|
10475
|
+
// NOTE: we accept all unsupported token types,
|
|
10476
|
+
// suppressing them like CONTROL tokens.
|
|
10827
10477
|
if (llama_is_normal_token(model->vocab, token)) {
|
|
10828
10478
|
std::string result = model->vocab.id_to_token[token].text;
|
|
10829
10479
|
result = llama_decode_text(result);
|
|
@@ -10832,12 +10482,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
|
|
|
10832
10482
|
}
|
|
10833
10483
|
memcpy(buf, result.c_str(), result.length());
|
|
10834
10484
|
return result.length();
|
|
10485
|
+
} else if (llama_is_user_defined_token(model->vocab, token)) {
|
|
10486
|
+
std::string result = model->vocab.id_to_token[token].text;
|
|
10487
|
+
if (length < (int) result.length()) {
|
|
10488
|
+
return -result.length();
|
|
10489
|
+
}
|
|
10490
|
+
memcpy(buf, result.c_str(), result.length());
|
|
10491
|
+
return result.length();
|
|
10835
10492
|
} else if (llama_is_control_token(model->vocab, token)) {
|
|
10836
10493
|
;
|
|
10837
|
-
} else {
|
|
10838
|
-
// TODO: for now we accept all unsupported token types,
|
|
10839
|
-
// suppressing them like CONTROL tokens.
|
|
10840
|
-
// GGML_ASSERT(false);
|
|
10841
10494
|
}
|
|
10842
10495
|
break;
|
|
10843
10496
|
}
|
|
@@ -10876,7 +10529,7 @@ void llama_print_timings(struct llama_context * ctx) {
|
|
|
10876
10529
|
__func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
|
|
10877
10530
|
LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
|
|
10878
10531
|
__func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
|
|
10879
|
-
LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
|
|
10532
|
+
LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
|
|
10880
10533
|
}
|
|
10881
10534
|
|
|
10882
10535
|
void llama_reset_timings(struct llama_context * ctx) {
|
|
@@ -10949,7 +10602,7 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
|
|
|
10949
10602
|
g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
|
|
10950
10603
|
g_state.log_callback_user_data = user_data;
|
|
10951
10604
|
#ifdef GGML_USE_METAL
|
|
10952
|
-
|
|
10605
|
+
ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
|
|
10953
10606
|
#endif
|
|
10954
10607
|
}
|
|
10955
10608
|
|