llama_cpp 0.12.0 → 0.12.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -1,5 +1,4 @@
  #define LLAMA_API_INTERNAL
- //#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - enables ggml-cuda through ggml-backend, disables partial offloading
  #include "llama.h"
 
  #include "unicode.h"
@@ -152,10 +151,6 @@ static bool is_float_close(float a, float b, float abs_tol) {
  return std::fabs(b - a) <= abs_tol;
  }
 
- #ifdef GGML_USE_CPU_HBM
- #include <hbwmalloc.h>
- #endif
-
  static void zeros(std::ofstream & file, size_t n) {
  char zero = 0;
  for (size_t i = 0; i < n; ++i) {
@@ -579,6 +574,9 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_OUTPUT, "output" },
  { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
@@ -988,20 +986,29 @@ struct llama_mmap {
  throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
  }
 
- #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
  if (prefetch > 0) {
- // Advise the kernel to preload the mapped memory
- WIN32_MEMORY_RANGE_ENTRY range;
- range.VirtualAddress = addr;
- range.NumberOfBytes = (SIZE_T)size;
- if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
- fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
- llama_format_win_err(GetLastError()).c_str());
+ #if _WIN32_WINNT >= 0x602
+ // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
+ BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
+ HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
+
+ // may fail on pre-Windows 8 systems
+ pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
+
+ if (pPrefetchVirtualMemory) {
+ // advise the kernel to preload the mapped memory
+ WIN32_MEMORY_RANGE_ENTRY range;
+ range.VirtualAddress = addr;
+ range.NumberOfBytes = (SIZE_T) std::min(size, prefetch);
+ if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+ LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n",
+ llama_format_win_err(GetLastError()).c_str());
+ }
  }
+ #else
+ throw std::runtime_error("PrefetchVirtualMemory unavailable");
+ #endif
  }
- #else
- #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
- #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
  }
 
  void unmap_fragment(size_t first, size_t last) {
@@ -1107,7 +1114,7 @@ struct llama_mlock {
  suggest = false;
  }
 
- fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+ LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
  size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
  return false;
  }
@@ -1116,7 +1123,7 @@ struct llama_mlock {
 
  static void raw_unlock(void * addr, size_t size) {
  if (munlock(addr, size)) {
- fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
+ LLAMA_LOG_WARN("warning: failed to munlock buffer: %s\n", std::strerror(errno));
  }
  }
  #elif defined(_WIN32)
@@ -1134,7 +1141,7 @@ struct llama_mlock {
  return true;
  }
  if (tries == 2) {
- fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
+ LLAMA_LOG_WARN("warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
  len, size, llama_format_win_err(GetLastError()).c_str());
  return false;
  }
@@ -1143,7 +1150,7 @@ struct llama_mlock {
  // set size and try again.
  SIZE_T min_ws_size, max_ws_size;
  if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
- fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
+ LLAMA_LOG_WARN("warning: GetProcessWorkingSetSize failed: %s\n",
  llama_format_win_err(GetLastError()).c_str());
  return false;
  }
@@ -1156,7 +1163,7 @@ struct llama_mlock {
  min_ws_size += increment;
  max_ws_size += increment;
  if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
- fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
+ LLAMA_LOG_WARN("warning: SetProcessWorkingSetSize failed: %s\n",
  llama_format_win_err(GetLastError()).c_str());
  return false;
  }
@@ -1165,7 +1172,7 @@ struct llama_mlock {
 
  static void raw_unlock(void * ptr, size_t len) {
  if (!VirtualUnlock(ptr, len)) {
- fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
+ LLAMA_LOG_WARN("warning: failed to VirtualUnlock buffer: %s\n",
  llama_format_win_err(GetLastError()).c_str());
  }
  }
@@ -1177,7 +1184,7 @@ struct llama_mlock {
  }
 
  bool raw_lock(const void * addr, size_t len) const {
- fprintf(stderr, "warning: mlock not supported on this system\n");
+ LLAMA_LOG_WARN("warning: mlock not supported on this system\n");
  return false;
  }
 
@@ -1185,12 +1192,6 @@ struct llama_mlock {
  #endif
  };
 
- typedef void (*offload_func_t)(struct ggml_tensor * tensor);
-
- static void ggml_offload_nop(struct ggml_tensor * tensor) {
- (void) tensor;
- }
-
  static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
  std::vector<char> result(8, 0);
  const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
@@ -1206,19 +1207,14 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
  return std::string(result.data(), result.size());
  }
 
- static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
+ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
  ggml_backend_buffer_type_t buft = nullptr;
 
- #ifdef GGML_USE_METAL
- if (n_gpu_layers > 0) {
- buft = ggml_backend_metal_buffer_type();
- }
- #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- if (n_gpu_layers > 0) {
- buft = ggml_backend_cuda_buffer_type(0);
+ #if defined(GGML_USE_CUBLAS)
+ // host buffers should only be used when data is expected to be copied to/from the GPU
+ if (host_buffer) {
+ buft = ggml_backend_cuda_host_buffer_type();
  }
- #elif defined(GGML_USE_CUBLAS)
- buft = ggml_backend_cuda_host_buffer_type();
  #elif defined(GGML_USE_CPU_HBM)
  buft = ggml_backend_cpu_hbm_buffer_type();
  #endif
@@ -1226,10 +1222,45 @@ static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
  if (buft == nullptr) {
  buft = ggml_backend_cpu_buffer_type();
  }
+ return buft;
+
+ GGML_UNUSED(host_buffer);
+ }
+
+ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ #ifdef GGML_USE_METAL
+ buft = ggml_backend_metal_buffer_type();
+ #elif defined(GGML_USE_CUBLAS)
+ buft = ggml_backend_cuda_buffer_type(gpu);
+ #elif defined(GGML_USE_CLBLAST)
+ buft = ggml_backend_opencl_buffer_type();
+ #endif
 
+ if (buft == nullptr) {
+ buft = llama_default_buffer_type_cpu(true);
+ }
  return buft;
 
- GGML_UNUSED(n_gpu_layers);
+ GGML_UNUSED(gpu);
+ }
+
+ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ #ifdef GGML_USE_CUBLAS
+ if (ggml_backend_cuda_get_device_count() > 1) {
+ buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+ }
+ #endif
+
+ if (buft == nullptr) {
+ buft = llama_default_buffer_type_offload(fallback_gpu);
+ }
+ return buft;
+
+ GGML_UNUSED(tensor_split);
  }
 
  //
@@ -1239,7 +1270,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
  struct llama_state {
  llama_state() {
  #ifdef GGML_USE_METAL
- ggml_metal_log_set_callback(log_callback, log_callback_user_data);
+ ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
  #endif
  }
 
@@ -1440,24 +1471,24 @@ struct llama_kv_cache {
  std::vector<struct ggml_tensor *> k_l; // per layer
  std::vector<struct ggml_tensor *> v_l;
 
- struct ggml_context * ctx = NULL;
+ std::vector<struct ggml_context *> ctxs;
+ std::vector<ggml_backend_buffer_t> bufs;
 
- ggml_backend_buffer_t buf = NULL;
+ size_t total_size() const {
+ size_t size = 0;
+ for (ggml_backend_buffer_t buf : bufs) {
+ size += ggml_backend_buffer_get_size(buf);
+ }
+ return size;
+ }
 
  ~llama_kv_cache() {
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- if (ggml_cublas_loaded()) {
- for (size_t i = 0; i < k_l.size(); ++i) {
- ggml_cuda_free_data(k_l[i]);
- ggml_cuda_free_data(v_l[i]);
- }
- }
- #endif
- if (ctx) {
+ for (struct ggml_context * ctx : ctxs) {
  ggml_free(ctx);
  }
-
- ggml_backend_buffer_free(buf);
+ for (ggml_backend_buffer_t buf : bufs) {
+ ggml_backend_buffer_free(buf);
+ }
  }
  };
 
@@ -1534,16 +1565,32 @@ struct llama_model {
 
  std::vector<llama_layer> layers;
 
+ llama_split_mode split_mode;
+ int main_gpu;
  int n_gpu_layers;
 
  // gguf metadata
  std::unordered_map<std::string, std::string> gguf_kv;
 
- // context
- struct ggml_context * ctx = NULL;
+ // layer -> buffer type mapping
+ struct layer_buft {
+ layer_buft() : buft_matrix(nullptr), buft(nullptr) {}
+ layer_buft(ggml_backend_buffer_type_t matrix) : buft_matrix(matrix), buft(matrix) {}
+ layer_buft(ggml_backend_buffer_type_t matrix, ggml_backend_buffer_type_t other) : buft_matrix(matrix), buft(other) {}
+
+ ggml_backend_buffer_type_t buft_matrix; // matrices only - used by split buffers and backends that support only matrix multiplication
+ ggml_backend_buffer_type_t buft; // everything else
+ };
+
+ layer_buft buft_input;
+ layer_buft buft_output;
+ std::vector<layer_buft> buft_layer;
+
+ // contexts where the model tensors metadata is stored
+ std::vector<struct ggml_context *> ctxs;
 
- // the model memory buffer
- ggml_backend_buffer_t buf = NULL;
+ // the model memory buffers for the tensor data
+ std::vector<ggml_backend_buffer_t> bufs;
 
  // model memory mapped file
  std::unique_ptr<llama_mmap> mapping;
@@ -1559,39 +1606,32 @@ struct llama_model {
  int64_t t_start_us = 0;
 
  ~llama_model() {
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- if (ggml_cublas_loaded()) {
- for (size_t i = 0; i < tensors_by_name.size(); ++i) {
- ggml_cuda_free_data(tensors_by_name[i].second);
- }
- ggml_cuda_free_scratch();
- }
- #endif
-
- #if defined(GGML_USE_CLBLAST)
- for (size_t i = 0; i < tensors_by_name.size(); ++i) {
- ggml_cl_free_data(tensors_by_name[i].second);
- }
- #endif
- if (ctx) {
+ for (struct ggml_context * ctx : ctxs) {
  ggml_free(ctx);
  }
-
- ggml_backend_buffer_free(buf);
+ for (ggml_backend_buffer_t buf : bufs) {
+ ggml_backend_buffer_free(buf);
+ }
  }
  };
 
  struct llama_context {
  llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
  ~llama_context() {
- ggml_allocr_free(alloc);
- ggml_backend_buffer_free(buf_alloc);
- ggml_backend_free(backend);
+ ggml_backend_sched_free(sched);
+
+ for (ggml_backend_t backend : backends) {
+ ggml_backend_free(backend);
+ }
  }
 
  llama_cparams cparams;
 
- ggml_backend_t backend = nullptr;
+ std::vector<ggml_backend_t> backends;
+ #ifdef GGML_USE_METAL
+ ggml_backend_t backend_metal = nullptr;
+ #endif
+ ggml_backend_t backend_cpu = nullptr;
 
  const llama_model & model;
 
@@ -1625,8 +1665,9 @@ struct llama_context {
 
  // memory buffers used to evaluate the model
  std::vector<uint8_t> buf_compute_meta;
- ggml_backend_buffer_t buf_alloc = NULL;
- ggml_allocr * alloc = NULL;
+ ggml_backend_sched_t sched = nullptr;
+ // allocator for the input tensors
+ ggml_tallocr * alloc = nullptr;
 
  // temporary buffer for copying data to/from the backend
  std::vector<no_init<uint8_t>> buf_copy;
@@ -1641,16 +1682,17 @@ struct llama_context {
  //
 
  static bool llama_kv_cache_init(
- const struct llama_hparams & hparams,
  struct llama_kv_cache & cache,
+ const llama_model & model,
  ggml_type ktype,
  ggml_type vtype,
  uint32_t n_ctx,
- int n_gpu_layers,
  bool offload) {
+ const struct llama_hparams & hparams = model.hparams;
+
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
- const uint32_t n_layer = hparams.n_layer;
+ const int64_t n_layer = hparams.n_layer;
 
  cache.has_shift = false;
 
@@ -1661,62 +1703,65 @@ static bool llama_kv_cache_init(
1661
1703
  cache.cells.clear();
1662
1704
  cache.cells.resize(n_ctx);
1663
1705
 
1664
- struct ggml_init_params params;
1665
- params.mem_size = 2u*n_layer*ggml_tensor_overhead();
1666
- params.mem_buffer = NULL;
1667
- params.no_alloc = true;
1668
-
1669
- cache.ctx = ggml_init(params);
1706
+ #ifdef GGML_USE_CLBLAST
1707
+ offload = false;
1708
+ #endif
1670
1709
 
1671
- size_t vram_kv_cache = 0;
1710
+ // count used buffer types
1711
+ std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
1712
+ if (offload) {
1713
+ for (int64_t i = 0; i < n_layer; ++i) {
1714
+ buft_layer_count[model.buft_layer[i].buft]++;
1715
+ }
1716
+ } else {
1717
+ buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
1718
+ }
1672
1719
 
1673
- if (!cache.ctx) {
1674
- LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
1675
- return false;
1720
+ // create a context for each buffer type
1721
+ std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
1722
+ for (auto & it : buft_layer_count) {
1723
+ int n_layers = it.second;
1724
+ struct ggml_init_params params = {
1725
+ /*.mem_size =*/ 2u*n_layers*ggml_tensor_overhead(),
1726
+ /*.mem_buffer =*/ NULL,
1727
+ /*.no_alloc =*/ true,
1728
+ };
1729
+ ggml_context * ctx = ggml_init(params);
1730
+ if (!ctx) {
1731
+ LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache\n", __func__);
1732
+ return false;
1733
+ }
1734
+ ctx_map[it.first] = ctx;
1735
+ cache.ctxs.push_back(ctx);
1676
1736
  }
1677
1737
 
1678
1738
  cache.k_l.reserve(n_layer);
1679
1739
  cache.v_l.reserve(n_layer);
1680
1740
 
1681
- const int i_gpu_start = (int) n_layer - n_gpu_layers;
1682
-
1683
1741
  for (int i = 0; i < (int) n_layer; i++) {
1684
- ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd_k_gqa*n_ctx);
1685
- ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd_v_gqa*n_ctx);
1742
+ struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
1743
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx);
1744
+ ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx);
1686
1745
  ggml_format_name(k, "cache_k_l%d", i);
1687
1746
  ggml_format_name(v, "cache_v_l%d", i);
1688
1747
  cache.k_l.push_back(k);
1689
1748
  cache.v_l.push_back(v);
1690
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
1691
- if (i >= i_gpu_start) {
1692
- if (offload) {
1693
- ggml_cuda_assign_buffers_no_scratch(k);
1694
- ggml_cuda_assign_buffers_no_scratch(v);
1695
- vram_kv_cache += ggml_nbytes(k);
1696
- vram_kv_cache += ggml_nbytes(v);
1697
- // HACK: mark tensor as allocated
1698
- k->data = v->data = (void *)(uintptr_t)1;
1699
- }
1700
- }
1701
- #endif // GGML_USE_CUBLAS
1702
1749
  }
1703
1750
 
1704
- // allocate tensors
1705
- cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, llama_default_buffer_type(n_gpu_layers));
1706
-
1707
- // buf may be NULL with full offload
1708
- if (cache.buf) {
1709
- // initialize the buffer to avoid NaNs in the padding
1710
- ggml_backend_buffer_clear(cache.buf, 0);
1711
- }
1712
-
1713
- if (vram_kv_cache > 0) {
1714
- LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
1751
+ // allocate tensors and initialize the buffers to avoid NaNs in the padding
1752
+ for (auto it : ctx_map) {
1753
+ ggml_backend_buffer_type_t buft = it.first;
1754
+ ggml_context * ctx = it.second;
1755
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
1756
+ if (!buf) {
1757
+ LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
1758
+ return false;
1759
+ }
1760
+ ggml_backend_buffer_clear(buf, 0);
1761
+ LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
1762
+ cache.bufs.push_back(buf);
1715
1763
  }
1716
1764
 
1717
- GGML_UNUSED(i_gpu_start);
1718
- GGML_UNUSED(offload);
1719
-
1720
1765
  return true;
1721
1766
  }
1722
1767
 
@@ -1898,6 +1943,28 @@ static void llama_kv_cache_seq_shift(
1898
1943
  cache.head = new_head != cache.size ? new_head : 0;
1899
1944
  }
1900
1945
 
1946
+ static void llama_kv_cache_seq_div(
1947
+ struct llama_kv_cache & cache,
1948
+ llama_seq_id seq_id,
1949
+ llama_pos p0,
1950
+ llama_pos p1,
1951
+ int d) {
1952
+ if (p0 < 0) p0 = 0;
1953
+ if (p1 < 0) p1 = std::numeric_limits<llama_pos>::max();
1954
+
1955
+ for (uint32_t i = 0; i < cache.size; ++i) {
1956
+ if (cache.cells[i].has_seq_id(seq_id) && cache.cells[i].pos >= p0 && cache.cells[i].pos < p1) {
1957
+ cache.has_shift = true;
1958
+
1959
+ {
1960
+ llama_pos p_old = cache.cells[i].pos;
1961
+ cache.cells[i].pos /= d;
1962
+ cache.cells[i].delta += cache.cells[i].pos - p_old;
1963
+ }
1964
+ }
1965
+ }
1966
+ }
1967
+
1901
1968
  //
1902
1969
  // model loading and saving
1903
1970
  //
@@ -2018,13 +2085,13 @@ namespace GGUFMeta {
2018
2085
  __func__, override_type_to_str(override->tag), override->key);
2019
2086
  switch (override->tag) {
2020
2087
  case LLAMA_KV_OVERRIDE_BOOL: {
2021
- printf("%s\n", override->bool_value ? "true" : "false");
2088
+ LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false");
2022
2089
  } break;
2023
2090
  case LLAMA_KV_OVERRIDE_INT: {
2024
- printf("%" PRId64 "\n", override->int_value);
2091
+ LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value);
2025
2092
  } break;
2026
2093
  case LLAMA_KV_OVERRIDE_FLOAT: {
2027
- printf("%.6f\n", override->float_value);
2094
+ LLAMA_LOG_INFO("%.6f\n", override->float_value);
2028
2095
  } break;
2029
2096
  default:
2030
2097
  // Shouldn't be possible to end up here, but just in case...
@@ -2123,6 +2190,11 @@ struct llama_model_loader {
2123
2190
  LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);
2124
2191
 
2125
2192
  llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
2193
+ int trace = 0;
2194
+ if (getenv("LLAMA_TRACE")) {
2195
+ trace = atoi(getenv("LLAMA_TRACE"));
2196
+ }
2197
+
2126
2198
  struct gguf_init_params params = {
2127
2199
  /*.no_alloc = */ true,
2128
2200
  /*.ctx = */ &ctx_meta,
@@ -2175,7 +2247,10 @@ struct llama_model_loader {
2175
2247
  type_max = type;
2176
2248
  }
2177
2249
 
2178
- // LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, name, ggml_type_name(meta->type), llama_format_tensor_shape(meta).c_str());
2250
+ if (trace > 0) {
2251
+ struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
2252
+ LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
2253
+ }
2179
2254
  }
2180
2255
 
2181
2256
  switch (type_max) {
@@ -2191,6 +2266,8 @@ struct llama_model_loader {
2191
2266
  case GGML_TYPE_Q4_K: ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; break;
2192
2267
  case GGML_TYPE_Q5_K: ftype = LLAMA_FTYPE_MOSTLY_Q5_K_M; break;
2193
2268
  case GGML_TYPE_Q6_K: ftype = LLAMA_FTYPE_MOSTLY_Q6_K; break;
2269
+ case GGML_TYPE_IQ2_XXS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XXS; break;
2270
+ case GGML_TYPE_IQ2_XS: ftype = LLAMA_FTYPE_MOSTLY_IQ2_XS; break;
2194
2271
  default:
2195
2272
  {
2196
2273
  LLAMA_LOG_WARN("%s: unknown type %s\n", __func__, ggml_type_name(type_max));
@@ -2321,9 +2398,8 @@ struct llama_model_loader {
2321
2398
  return get_tensor_meta(get_tensor_name(i));
2322
2399
  }
2323
2400
 
2324
- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
2401
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) {
2325
2402
  struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
2326
- tensor->backend = backend; // TODO: ggml_set_backend
2327
2403
  ggml_set_name(tensor, ggml_get_name(meta));
2328
2404
 
2329
2405
  n_created++;
@@ -2331,7 +2407,7 @@ struct llama_model_loader {
2331
2407
  return tensor;
2332
2408
  }
2333
2409
 
2334
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool required = true) {
2410
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
2335
2411
  struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
2336
2412
 
2337
2413
  if (cur == NULL) {
@@ -2341,12 +2417,6 @@ struct llama_model_loader {
2341
2417
  throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
2342
2418
  }
2343
2419
 
2344
- if (backend == GGML_BACKEND_GPU_SPLIT) {
2345
- if (ne.size() == 1) {
2346
- throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str()));
2347
- }
2348
- }
2349
-
2350
2420
  {
2351
2421
  bool is_ok = true;
2352
2422
  for (size_t i = 0; i < ne.size(); ++i) {
@@ -2364,7 +2434,7 @@ struct llama_model_loader {
2364
2434
  }
2365
2435
  }
2366
2436
 
2367
- return create_tensor_for(ctx, cur, backend);
2437
+ return create_tensor_for(ctx, cur);
2368
2438
  }
2369
2439
 
2370
2440
  void done_getting_tensors() const {
@@ -2383,25 +2453,35 @@ struct llama_model_loader {
2383
2453
  return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
2384
2454
  }
2385
2455
 
2386
- void init_mapping(bool prefetch = true) {
2387
- /*
2388
- // prefetch only CPU tensors
2456
+ void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
2457
+ // prefetch the whole file - all the data is needed anyway
2389
2458
  if (use_mmap) {
2390
- size_t size_pref = 0; // prefetch
2459
+ mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
2460
+ }
2391
2461
 
2392
- for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
2393
- struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
2394
- if (cur->backend == GGML_BACKEND_CPU) {
2395
- size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + ggml_nbytes(cur);
2396
- size_pref = std::max(size_pref, tensor_end);
2397
- }
2462
+ // compute the total size of all tensors for progress reporting
2463
+ for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
2464
+ struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
2465
+ size_data += ggml_nbytes(cur);
2466
+ }
2467
+
2468
+ if (use_mmap && mapping) {
2469
+ if (lmlock) {
2470
+ lmlock->init(mapping->addr);
2398
2471
  }
2399
- mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa()));
2472
+ mmap_used_first = mapping->size;
2400
2473
  }
2401
- */
2402
- // prefetch the whole file - all the data is needed anyway
2403
- if (use_mmap) {
2404
- mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
2474
+ }
2475
+
2476
+ void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
2477
+ GGML_ASSERT(mapping);
2478
+
2479
+ *first = mapping->size;
2480
+ *last = 0;
2481
+ for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
2482
+ const size_t offs = file_offset(ggml_get_name(tensor));
2483
+ *first = std::min(*first, offs);
2484
+ *last = std::max(*last, offs + ggml_nbytes(tensor));
2405
2485
  }
2406
2486
  }
2407
2487
 
@@ -2410,8 +2490,11 @@ struct llama_model_loader {
2410
2490
  const size_t offs = file_offset(ggml_get_name(cur));
2411
2491
 
2412
2492
  if (use_mmap && mapping) {
2413
- GGML_ASSERT(cur->data == nullptr);
2414
- cur->data = (uint8_t *)mapping->addr + offs;
2493
+ if (cur->data == nullptr) {
2494
+ cur->data = (uint8_t *)mapping->addr + offs;
2495
+ } else {
2496
+ memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
2497
+ }
2415
2498
  } else {
2416
2499
  GGML_ASSERT(cur->data != nullptr);
2417
2500
  file.seek(offs, SEEK_SET);
@@ -2419,37 +2502,23 @@ struct llama_model_loader {
2419
2502
  }
2420
2503
  }
2421
2504
 
2422
- // Returns false if cancelled by progress_callback
2423
- bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
2424
- size_t size_data = 0;
2425
-
2426
- for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
2427
- struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
2428
- size_data += ggml_nbytes(cur);
2429
- }
2430
-
2431
- if (use_mmap && buf_mmap) {
2432
- if (lmlock) {
2433
- lmlock->init(mapping->addr);
2434
- }
2435
- }
2505
+ size_t size_done = 0;
2506
+ size_t size_data = 0;
2507
+ size_t mmap_used_first = -1;
2508
+ size_t mmap_used_last = 0;
2436
2509
 
2437
- #if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
2438
- const bool legacy_offload = true;
2439
- #else
2440
- const bool legacy_offload = false;
2441
- #endif
2510
+ // Returns false if cancelled by progress_callback
2511
+ bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) {
2512
+ GGML_ASSERT(size_data != 0 && "call init_mapping() first");
2442
2513
 
2443
2514
  std::vector<no_init<uint8_t>> read_buf;
2444
2515
 
2445
- size_t size_done = 0;
2446
-
2447
- size_t mmap_first = -1;
2448
- size_t mmap_last = 0;
2449
-
2450
2516
  for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
2451
2517
  struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
2452
- GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
2518
+ if (!cur) {
2519
+ // some tensors may be allocated in a different context
2520
+ continue;
2521
+ }
2453
2522
 
2454
2523
  if (progress_callback) {
2455
2524
  if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
@@ -2459,67 +2528,48 @@ struct llama_model_loader {
2459
2528
 
2460
2529
  const size_t offs = file_offset(ggml_get_name(cur));
2461
2530
 
2462
- if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) {
2463
- if (use_mmap && mapping) {
2464
- if (buf_mmap) {
2465
- ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
2466
- if (lmlock) {
2467
- lmlock->grow_to(offs + ggml_nbytes(cur));
2468
- }
2469
- mmap_first = std::min(mmap_first, offs);
2470
- mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur));
2471
- } else {
2472
- ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
2531
+ if (use_mmap && mapping) {
2532
+ if (buf_mmap && cur->data == nullptr) {
2533
+ ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
2534
+ if (lmlock) {
2535
+ lmlock->grow_to(offs + ggml_nbytes(cur));
2473
2536
  }
2537
+ mmap_used_first = std::min(mmap_used_first, offs);
2538
+ mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur));
2474
2539
  } else {
2475
- if (ggml_backend_buffer_is_host(cur->buffer)) {
2476
- file.seek(offs, SEEK_SET);
2477
- file.read_raw(cur->data, ggml_nbytes(cur));
2478
- } else {
2479
- read_buf.resize(ggml_nbytes(cur));
2480
- file.seek(offs, SEEK_SET);
2481
- file.read_raw(read_buf.data(), ggml_nbytes(cur));
2482
- ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
2483
- }
2540
+ ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
2484
2541
  }
2485
2542
  } else {
2486
- // HACK: mark tensor as allocated
2487
- cur->data = (void *)(uintptr_t)1;
2488
- void * data;
2489
- if (use_mmap && mapping) {
2490
- data = (uint8_t *) mapping->addr + offs;
2543
+ if (ggml_backend_buffer_is_host(cur->buffer)) {
2544
+ file.seek(offs, SEEK_SET);
2545
+ file.read_raw(cur->data, ggml_nbytes(cur));
2491
2546
  } else {
2492
2547
  read_buf.resize(ggml_nbytes(cur));
2493
2548
  file.seek(offs, SEEK_SET);
2494
2549
  file.read_raw(read_buf.data(), ggml_nbytes(cur));
2495
- data = read_buf.data();
2550
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
2496
2551
  }
2497
-
2498
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
2499
- ggml_cuda_transform_tensor(data, cur);
2500
- #elif defined(GGML_USE_CLBLAST)
2501
- GGML_ASSERT(cur->backend == GGML_BACKEND_GPU);
2502
- ggml_cl_transform_tensor(data, cur);
2503
- #else
2504
- GGML_ASSERT(!"GPU tensor without a GPU backend");
2505
- GGML_UNUSED(data);
2506
- #endif
2507
2552
  }
2508
2553
 
2509
2554
  size_done += ggml_nbytes(cur);
2510
2555
  }
2511
2556
 
2512
- // unmap offloaded tensors and metadata
2513
- if (use_mmap && mapping) {
2514
- mapping->unmap_fragment(0, mmap_first);
2515
- mapping->unmap_fragment(mmap_last, mapping->size);
2557
+ // check if this is the last call and do final cleanup
2558
+ if (size_done >= size_data) {
2559
+ // unmap offloaded tensors and metadata
2560
+ if (use_mmap && mapping) {
2561
+ mapping->unmap_fragment(0, mmap_used_first);
2562
+ if (mmap_used_last != 0) {
2563
+ mapping->unmap_fragment(mmap_used_last, mapping->size);
2564
+ }
2565
+ }
2566
+ if (progress_callback) {
2567
+ // Even though the model is done loading, we still honor
2568
+ // cancellation since we need to free allocations.
2569
+ return progress_callback(1.0f, progress_callback_user_data);
2570
+ }
2516
2571
  }
2517
2572
 
2518
- if (progress_callback) {
2519
- // Even though the model is done loading, we still honor
2520
- // cancellation since we need to free allocations.
2521
- return progress_callback(1.0f, progress_callback_user_data);
2522
- }
2523
2573
  return true;
2524
2574
  }
2525
2575
  };
@@ -2553,7 +2603,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2553
2603
  case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
2554
2604
 
2555
2605
  // K-quants
2556
- case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K";
2606
+ case LLAMA_FTYPE_MOSTLY_Q2_K: return "Q2_K - Medium";
2607
+ case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
2557
2608
  case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
2558
2609
  case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
2559
2610
  case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
@@ -2562,6 +2613,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
2562
2613
  case LLAMA_FTYPE_MOSTLY_Q5_K_S: return "Q5_K - Small";
2563
2614
  case LLAMA_FTYPE_MOSTLY_Q5_K_M: return "Q5_K - Medium";
2564
2615
  case LLAMA_FTYPE_MOSTLY_Q6_K: return "Q6_K";
2616
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS:return "IQ2_XSS - 2.0625 bpw";
2617
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS: return "IQ2_XS - 2.3125 bpw";
2565
2618
 
2566
2619
  default: return "unknown, may not work";
2567
2620
  }
@@ -2796,6 +2849,7 @@ static void llm_load_hparams(
2796
2849
  ml.get_key(LLM_KV_ATTENTION_LAYERNORM_EPS, hparams.f_norm_eps);
2797
2850
 
2798
2851
  switch (hparams.n_layer) {
2852
+ case 24: model.type = e_model::MODEL_1B; break;
2799
2853
  case 32: model.type = e_model::MODEL_3B; break;
2800
2854
  default: model.type = e_model::MODEL_UNKNOWN;
2801
2855
  }
@@ -3112,7 +3166,15 @@ static void llm_load_print_meta(llama_model_loader & ml, llama_model & model) {
3112
3166
  LLAMA_LOG_INFO("%s: rope_finetuned = %s\n", __func__, hparams.rope_finetuned ? "yes" : "unknown");
3113
3167
  LLAMA_LOG_INFO("%s: model type = %s\n", __func__, llama_model_type_name(model.type));
3114
3168
  LLAMA_LOG_INFO("%s: model ftype = %s\n", __func__, llama_model_ftype_name(model.ftype).c_str());
3115
- LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
3169
+ if (ml.n_elements >= 1e12) {
3170
+ LLAMA_LOG_INFO("%s: model params = %.2f T\n", __func__, ml.n_elements*1e-12);
3171
+ } else if (ml.n_elements >= 1e9) {
3172
+ LLAMA_LOG_INFO("%s: model params = %.2f B\n", __func__, ml.n_elements*1e-9);
3173
+ } else if (ml.n_elements >= 1e6) {
3174
+ LLAMA_LOG_INFO("%s: model params = %.2f M\n", __func__, ml.n_elements*1e-6);
3175
+ } else {
3176
+ LLAMA_LOG_INFO("%s: model params = %.2f K\n", __func__, ml.n_elements*1e-3);
3177
+ }
3116
3178
  if (ml.n_bytes < GiB) {
3117
3179
  LLAMA_LOG_INFO("%s: model size = %.2f MiB (%.2f BPW) \n", __func__, ml.n_bytes/1024.0/1024.0, ml.n_bytes*8.0/ml.n_elements);
3118
3180
  } else {
@@ -3136,6 +3198,7 @@ static bool llm_load_tensors(
3136
3198
  llama_model_loader & ml,
3137
3199
  llama_model & model,
3138
3200
  int n_gpu_layers,
3201
+ enum llama_split_mode split_mode,
3139
3202
  int main_gpu,
3140
3203
  const float * tensor_split,
3141
3204
  bool use_mlock,
@@ -3143,702 +3206,574 @@ static bool llm_load_tensors(
3143
3206
  void * progress_callback_user_data) {
3144
3207
  model.t_start_us = ggml_time_us();
3145
3208
 
3146
- auto & ctx = model.ctx;
3147
3209
  auto & hparams = model.hparams;
3148
3210
 
3211
+ model.split_mode = split_mode;
3212
+ model.main_gpu = main_gpu;
3149
3213
  model.n_gpu_layers = n_gpu_layers;
3150
3214
 
3151
- size_t ctx_size = ggml_tensor_overhead() * ml.n_tensors;
3215
+ const int64_t n_layer = hparams.n_layer;
3216
+ const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
3152
3217
 
3153
- LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
3218
+ // there is very little benefit to offloading the input layer, so always keep it on the CPU
3219
+ model.buft_input = llama_default_buffer_type_cpu(true);
3154
3220
 
3155
- // create the ggml context
3221
+ model.buft_layer.resize(n_layer);
3222
+
3223
+ // assign cpu layers
3224
+ for (int64_t i = 0; i < i_gpu_start; ++i) {
3225
+ model.buft_layer[i] = llama_default_buffer_type_cpu(true);
3226
+ }
3227
+
3228
+ #ifdef GGML_USE_CUBLAS
3229
+ if (split_mode == LLAMA_SPLIT_LAYER) {
3230
+ // calculate the split points
3231
+ int device_count = ggml_backend_cuda_get_device_count();
3232
+ bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
3233
+ float splits[GGML_CUDA_MAX_DEVICES];
3234
+ if (all_zero) {
3235
+ // default split, by free memory
3236
+ for (int i = 0; i < device_count; ++i) {
3237
+ size_t total;
3238
+ size_t free;
3239
+ ggml_backend_cuda_get_device_memory(i, &total, &free);
3240
+ splits[i] = free;
3241
+ }
3242
+ } else {
3243
+ std::copy(tensor_split, tensor_split + device_count, splits);
3244
+ }
3245
+
3246
+ // sum and normalize the splits to get the split points
3247
+ float split_sum = 0.0f;
3248
+ for (int i = 0; i < device_count; ++i) {
3249
+ split_sum += splits[i];
3250
+ splits[i] = split_sum;
3251
+ }
3252
+ for (int i = 0; i < device_count; ++i) {
3253
+ splits[i] /= split_sum;
3254
+ }
3255
+
3256
+ // assign the repeating layers to the devices according to the splits
3257
+ int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
3258
+ for (int64_t i = i_gpu_start; i < n_layer; ++i) {
3259
+ int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits;
3260
+ model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
3261
+ }
3262
+ // assign the output layer
3263
+ if (n_gpu_layers > n_layer) {
3264
+ int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits;
3265
+ model.buft_output = llama_default_buffer_type_offload(layer_gpu);
3266
+ } else {
3267
+ model.buft_output = llama_default_buffer_type_cpu(true);
3268
+ }
3269
+ } else
3270
+ #endif
3156
3271
  {
3272
+ ggml_backend_buffer_type_t split_buft;
3273
+ if (split_mode == LLAMA_SPLIT_ROW) {
3274
+ split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
3275
+ } else {
3276
+ // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported
3277
+ split_buft = llama_default_buffer_type_offload(main_gpu);
3278
+ }
3279
+ // assign the repeating layers
3280
+ for (int64_t i = i_gpu_start; i < n_layer; ++i) {
3281
+ model.buft_layer[i] = {
3282
+ split_buft,
3283
+ llama_default_buffer_type_offload(main_gpu)
3284
+ };
3285
+ }
3286
+ // assign the output layer
3287
+ if (n_gpu_layers > n_layer) {
3288
+ model.buft_output = {
3289
+ split_buft,
3290
+ llama_default_buffer_type_offload(main_gpu)
3291
+ };
3292
+ } else {
3293
+ model.buft_output = llama_default_buffer_type_cpu(true);
3294
+ }
3295
+ }
3296
+
3297
+ // count used buffer types
3298
+ std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
3299
+ buft_layer_count[model.buft_input.buft]++;
3300
+ buft_layer_count[model.buft_input.buft_matrix]++;
3301
+ buft_layer_count[model.buft_output.buft]++;
3302
+ buft_layer_count[model.buft_output.buft_matrix]++;
3303
+ for (int64_t i = 0; i < n_layer; ++i) {
3304
+ buft_layer_count[model.buft_layer[i].buft]++;
3305
+ buft_layer_count[model.buft_layer[i].buft_matrix]++;
3306
+ }
3307
+
3308
+ // create one context per buffer type
3309
+ size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
3310
+ std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
3311
+ for (auto & it : buft_layer_count) {
3157
3312
  struct ggml_init_params params = {
3158
3313
  /*.mem_size =*/ ctx_size,
3159
3314
  /*.mem_buffer =*/ NULL,
3160
3315
  /*.no_alloc =*/ true,
3161
3316
  };
3162
-
3163
- model.ctx = ggml_init(params);
3164
- if (!model.ctx) {
3165
- throw std::runtime_error(format("ggml_init() failed"));
3317
+ ggml_context * ctx = ggml_init(params);
3318
+ if (!ctx) {
3319
+ throw std::runtime_error(format("failed to create context"));
3166
3320
  }
3321
+ ctx_map[it.first] = ctx;
3322
+ model.ctxs.push_back(ctx);
3167
3323
  }
3168
3324
 
3169
- (void) main_gpu;
3170
-
3171
- enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
3172
- enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
3173
-
3174
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
3175
- if (ggml_cublas_loaded()) {
3176
- LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
3177
- ggml_cuda_set_main_device(main_gpu);
3178
-
3179
- llama_backend_offload = GGML_BACKEND_GPU;
3180
- llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
3181
- }
3182
- #elif defined(GGML_USE_CLBLAST)
3183
- LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
3184
- llama_backend_offload = GGML_BACKEND_GPU;
3185
- llama_backend_offload_split = GGML_BACKEND_GPU;
3186
- #endif
3325
+ LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);
3187
3326
 
3188
3327
  // create tensors for the weights
3189
3328
  {
3190
3329
  const int64_t n_embd = hparams.n_embd;
3191
3330
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
3192
3331
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
3193
- const int64_t n_layer = hparams.n_layer;
3332
+ const int64_t n_embd_gqa = n_embd_v_gqa;
3194
3333
  const int64_t n_vocab = hparams.n_vocab;
3334
+ const int64_t n_ff = hparams.n_ff;
3335
+
3336
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3337
+
3338
+ ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
3339
+ ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
3340
+ ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
3341
+ auto ctx_for_layer = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); };
3342
+ auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); };
3343
+
3344
+ model.layers.resize(n_layer);
3195
3345
 
3196
3346
  const auto tn = LLM_TN(model.arch);
3197
3347
  switch (model.arch) {
3198
3348
  case LLM_ARCH_LLAMA:
3199
3349
  case LLM_ARCH_REFACT:
3200
3350
  {
3201
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3351
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3202
3352
 
3203
3353
  // output
3204
3354
  {
3205
- ggml_backend_type backend_norm;
3206
- ggml_backend_type backend_output;
3207
-
3208
- if (n_gpu_layers > int(n_layer)) {
3209
- backend_norm = llama_backend_offload;
3210
- backend_output = llama_backend_offload_split;
3211
- } else {
3212
- backend_norm = GGML_BACKEND_CPU;
3213
- backend_output = GGML_BACKEND_CPU;
3214
- }
3215
-
3216
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3217
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3355
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3356
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3218
3357
  }
3219
3358
 
3220
- const uint32_t n_ff = hparams.n_ff;
3221
- const int64_t n_embd_gqa = n_embd_v_gqa;
3222
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3223
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3224
-
3225
- const int i_gpu_start = n_layer - n_gpu_layers;
3226
-
3227
- model.layers.resize(n_layer);
3228
-
3229
- for (uint32_t i = 0; i < n_layer; ++i) {
3230
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3231
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3359
+ for (int i = 0; i < n_layer; ++i) {
3360
+ ggml_context * ctx_layer = ctx_for_layer(i);
3361
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3232
3362
 
3233
3363
  auto & layer = model.layers[i];
3234
3364
 
3235
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3365
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3236
3366
 
3237
- layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
3238
- layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3239
- layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3240
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3367
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3368
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3369
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3370
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3241
3371
 
3242
3372
  // optional bias tensors
3243
- layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend, false);
3244
- layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend, false);
3245
- layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend, false);
3246
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend, false);
3373
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
3374
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
3375
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
3376
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
3247
3377
 
3248
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3378
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3249
3379
 
3250
- layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
3380
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
3251
3381
 
3252
3382
  if (layer.ffn_gate_inp == nullptr) {
3253
3383
  GGML_ASSERT(hparams.n_expert == 0);
3254
3384
  GGML_ASSERT(hparams.n_expert_used == 0);
3255
3385
 
3256
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3257
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3258
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3386
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
3387
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
3388
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3259
3389
  } else {
3260
3390
  GGML_ASSERT(hparams.n_expert > 0);
3261
3391
  GGML_ASSERT(hparams.n_expert_used > 0);
3262
3392
 
3263
3393
  // MoE branch
3264
3394
  for (uint32_t x = 0; x < hparams.n_expert; ++x) {
3265
- layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
3266
- layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split);
3267
- layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
3395
+ layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff});
3396
+ layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd});
3397
+ layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
3268
3398
  }
3269
3399
  }
3270
3400
  }
3271
3401
  } break;
3272
3402
  case LLM_ARCH_BAICHUAN:
3273
3403
  {
3274
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3404
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3275
3405
  {
3276
- ggml_backend_type backend_norm;
3277
- ggml_backend_type backend_output;
3278
-
3279
- if (n_gpu_layers > int(n_layer)) {
3280
- backend_norm = llama_backend_offload;
3281
- backend_output = llama_backend_offload_split;
3282
- } else {
3283
- backend_norm = GGML_BACKEND_CPU;
3284
- backend_output = GGML_BACKEND_CPU;
3285
- }
3286
-
3287
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3288
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3406
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3407
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3289
3408
  }
3290
3409
 
3291
- const uint32_t n_ff = hparams.n_ff;
3292
- const int64_t n_embd_gqa = n_embd_v_gqa;
3293
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3294
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3295
-
3296
- const int i_gpu_start = n_layer - n_gpu_layers;
3297
-
3298
- model.layers.resize(n_layer);
3299
-
3300
- for (uint32_t i = 0; i < n_layer; ++i) {
3301
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3302
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3410
+ for (int i = 0; i < n_layer; ++i) {
3411
+ ggml_context * ctx_layer = ctx_for_layer(i);
3412
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3303
3413
 
3304
3414
  auto & layer = model.layers[i];
3305
3415
 
3306
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3416
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3307
3417
 
3308
- layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
3309
- layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3310
- layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3311
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3418
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3419
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3420
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3421
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3312
3422
 
3313
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3423
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3314
3424
 
3315
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3316
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3317
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3425
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
3426
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
3427
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3318
3428
  }
3319
3429
  } break;
3320
3430
  case LLM_ARCH_FALCON:
3321
3431
  {
3322
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3432
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3323
3433
 
3324
3434
  // output
3325
3435
  {
3326
- ggml_backend_type backend_norm;
3327
- ggml_backend_type backend_output;
3328
-
3329
- if (n_gpu_layers > int(n_layer)) {
3330
- backend_norm = llama_backend_offload;
3331
- backend_output = llama_backend_offload_split;
3332
- } else {
3333
- backend_norm = GGML_BACKEND_CPU;
3334
- backend_output = GGML_BACKEND_CPU;
3335
- }
3336
-
3337
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3338
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3339
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3436
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3437
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
3438
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3340
3439
  }
3341
3440
 
3342
- const uint32_t n_ff = hparams.n_ff;
3343
- const int64_t n_embd_gqa = n_embd_v_gqa;
3344
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3345
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3346
-
3347
- const int i_gpu_start = n_layer - n_gpu_layers;
3348
-
3349
- model.layers.resize(n_layer);
3350
-
3351
- for (uint32_t i = 0; i < n_layer; ++i) {
3352
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3353
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3441
+ for (int i = 0; i < n_layer; ++i) {
3442
+ ggml_context * ctx_layer = ctx_for_layer(i);
3443
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3354
3444
 
3355
3445
  auto & layer = model.layers[i];
3356
3446
 
3357
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3358
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3447
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3448
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3359
3449
 
3360
3450
  if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
3361
- layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
3362
- layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
3451
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
3452
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
3363
3453
  }
3364
3454
 
3365
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
3366
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3455
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
3456
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3367
3457
 
3368
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3369
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3458
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
3459
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3370
3460
  }
3371
3461
  } break;
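Editor's note on the pattern repeated in the architecture cases above and below: each case drops the per-tensor `ggml_backend_type` arguments (`backend`, `backend_split`) and the `i_gpu_start` bookkeeping, and instead passes a `ggml_context *` to `ml.create_tensor()` — `ctx_input` for embeddings, `ctx_output` / `ctx_output_split` for the output head, and `ctx_for_layer(i)` / `ctx_for_layer_split(i)` for the repeating layers. Those helpers are not defined in this hunk; the following is a minimal sketch of what they plausibly look like, assuming one metadata-only `ggml_context` per backend buffer type (consistent with the `ctx_map` allocation loop later in this diff). The names `buft_layer`, `buft_layer_split` and `ctx_for_buft` are assumptions, not taken from the diff.

    // sketch only — assumed shape of the context-selection helpers used above
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    const size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;

    auto ctx_for_buft = [&](ggml_backend_buffer_type_t buft) -> ggml_context * {
        if (ctx_map.count(buft) == 0) {
            struct ggml_init_params params = {
                /*.mem_size   =*/ ctx_size,
                /*.mem_buffer =*/ nullptr,
                /*.no_alloc   =*/ true,   // metadata only; buffers are allocated later, one per buffer type
            };
            ctx_map[buft] = ggml_init(params);
            model.ctxs.push_back(ctx_map[buft]);
        }
        return ctx_map[buft];
    };

    // one lookup per layer, so each layer can live on a different backend
    auto ctx_for_layer       = [&](int i) { return ctx_for_buft(buft_layer[i]);       }; // buft_layer: assumed per-layer buffer-type table
    auto ctx_for_layer_split = [&](int i) { return ctx_for_buft(buft_layer_split[i]); };

With this split, placing a weight becomes a choice of buffer type per layer rather than a per-tensor enum, and the actual allocation happens once per buffer type after all tensors have been declared, as the loop over `ctx_map` further down shows.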
3372
3462
  case LLM_ARCH_STARCODER:
3373
3463
  {
3374
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3375
- model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
3464
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3465
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
3376
3466
 
3377
3467
  // output
3378
3468
  {
3379
- ggml_backend_type backend_norm;
3380
- ggml_backend_type backend_output;
3381
-
3382
- if (n_gpu_layers > int(n_layer)) {
3383
- backend_norm = llama_backend_offload;
3384
- backend_output = llama_backend_offload_split;
3385
- } else {
3386
- backend_norm = GGML_BACKEND_CPU;
3387
- backend_output = GGML_BACKEND_CPU;
3388
- }
3389
-
3390
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3391
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3392
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3469
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3470
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
3471
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3393
3472
  }
3394
3473
 
3395
- const uint32_t n_ff = hparams.n_ff;
3396
- const int64_t n_embd_gqa = n_embd_v_gqa;
3397
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3398
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3399
-
3400
- const int i_gpu_start = n_layer - n_gpu_layers;
3401
-
3402
- model.layers.resize(n_layer);
3403
-
3404
- for (uint32_t i = 0; i < n_layer; ++i) {
3405
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3406
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3474
+ for (int i = 0; i < n_layer; ++i) {
3475
+ ggml_context * ctx_layer = ctx_for_layer(i);
3476
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3407
3477
 
3408
3478
  auto & layer = model.layers[i];
3409
3479
 
3410
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3411
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3480
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3481
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3412
3482
 
3413
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
3414
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
3483
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
3484
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
3415
3485
 
3416
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3417
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
3486
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3487
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
3418
3488
 
3419
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3420
- layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
3489
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3490
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
3421
3491
 
3422
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
3423
- layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
3492
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3493
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
3424
3494
 
3425
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3426
- layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
3495
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3496
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3427
3497
  }
3428
3498
  } break;
3429
3499
  case LLM_ARCH_PERSIMMON:
3430
3500
  {
3431
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3501
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3432
3502
 
3433
3503
  {
3434
- ggml_backend_type backend_norm;
3435
- ggml_backend_type backend_output;
3436
-
3437
- if (n_gpu_layers > int(n_layer)) {
3438
- backend_norm = llama_backend_offload;
3439
- backend_output = llama_backend_offload_split;
3440
- } else {
3441
- backend_norm = GGML_BACKEND_CPU;
3442
- backend_output = GGML_BACKEND_CPU;
3443
- }
3444
-
3445
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3446
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3447
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3504
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3505
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
3506
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3448
3507
  }
3449
3508
 
3450
- const uint32_t n_ff = hparams.n_ff;
3451
- const int64_t n_embd_gqa = n_embd_v_gqa;
3452
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3453
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3509
+ for (int i = 0; i < n_layer; ++i) {
3510
+ ggml_context * ctx_layer = ctx_for_layer(i);
3511
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3454
3512
 
3455
- const int i_gpu_start = n_layer - n_gpu_layers;
3456
- model.layers.resize(n_layer);
3457
- for (uint32_t i = 0; i < n_layer; ++i) {
3458
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
3459
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
3460
3513
  auto & layer = model.layers[i];
3461
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3462
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3463
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
3464
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
3465
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3466
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
3467
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
3468
- layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
3469
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3470
- layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
3471
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3472
- layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
3473
- layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
3474
- layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend);
3475
- layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend);
3476
- layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
3514
+
3515
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3516
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3517
+
3518
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
3519
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
3520
+
3521
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3522
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
3523
+
3524
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3525
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
3526
+
3527
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3528
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3529
+
3530
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3531
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
3532
+
3533
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
3534
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
3535
+
3536
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
3537
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
3477
3538
  }
3478
3539
  } break;
3479
3540
  case LLM_ARCH_BLOOM:
3480
3541
  {
3481
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3482
- model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
3483
- model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
3542
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3543
+ model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
3544
+ model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
3484
3545
 
3485
3546
  // output
3486
3547
  {
3487
- ggml_backend_type backend_norm;
3488
- ggml_backend_type backend_output;
3489
-
3490
- if (n_gpu_layers > int(n_layer)) {
3491
- backend_norm = llama_backend_offload;
3492
- backend_output = llama_backend_offload_split;
3493
- } else {
3494
- backend_norm = GGML_BACKEND_CPU;
3495
- backend_output = GGML_BACKEND_CPU;
3496
- }
3497
-
3498
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3499
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3500
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3548
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3549
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
3550
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3501
3551
  }
3502
3552
 
3503
- const uint32_t n_ff = hparams.n_ff;
3504
- const int64_t n_embd_gqa = n_embd_v_gqa;
3505
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3506
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3507
-
3508
- const int i_gpu_start = n_layer - n_gpu_layers;
3509
-
3510
- model.layers.resize(n_layer);
3511
-
3512
- for (uint32_t i = 0; i < n_layer; ++i) {
3513
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3514
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3553
+ for (int i = 0; i < n_layer; ++i) {
3554
+ ggml_context * ctx_layer = ctx_for_layer(i);
3555
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3515
3556
 
3516
3557
  auto & layer = model.layers[i];
3517
3558
 
3518
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3519
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3559
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3560
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3520
3561
 
3521
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
3522
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
3562
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
3563
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
3523
3564
 
3524
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3525
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
3565
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3566
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
3526
3567
 
3527
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3528
- layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
3568
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3569
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
3529
3570
 
3530
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
3531
- layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
3571
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3572
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
3532
3573
 
3533
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3534
- layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
3574
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3575
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3535
3576
  }
3536
3577
  } break;
3537
3578
  case LLM_ARCH_MPT:
3538
3579
  {
3539
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3580
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3581
+
3540
3582
  // output
3541
3583
  {
3542
- ggml_backend_type backend_norm;
3543
- ggml_backend_type backend_output;
3544
-
3545
- if (n_gpu_layers > int(n_layer)) {
3546
- backend_norm = llama_backend_offload;
3547
- backend_output = llama_backend_offload_split;
3548
- } else {
3549
- backend_norm = GGML_BACKEND_CPU;
3550
- backend_output = GGML_BACKEND_CPU;
3551
- }
3552
-
3553
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3554
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3584
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3585
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3555
3586
  }
3556
3587
 
3557
- const uint32_t n_ff = hparams.n_ff;
3558
- const int64_t n_embd_gqa = n_embd_v_gqa;
3559
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3560
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3561
-
3562
- const int i_gpu_start = n_layer - n_gpu_layers;
3563
-
3564
- model.layers.resize(n_layer);
3565
-
3566
- for (uint32_t i = 0; i < n_layer; ++i) {
3567
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3568
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3588
+ for (int i = 0; i < n_layer; ++i) {
3589
+ ggml_context * ctx_layer = ctx_for_layer(i);
3590
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3569
3591
 
3570
3592
  auto & layer = model.layers[i];
3571
3593
 
3572
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3573
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
3574
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3594
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3575
3595
 
3576
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3596
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
3597
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3577
3598
 
3578
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3579
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3599
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3600
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
3601
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3580
3602
 
3581
3603
  // AWQ ScaleActivation layer
3582
- layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend, false);
3604
+ layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
3583
3605
  }
3584
3606
  } break;
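The trailing `false` kept on the `ffn_act` call above appears to be the loader's "optional tensor" flag: if the GGUF carries no AWQ activation scales, `create_tensor()` leaves the field null instead of failing. A minimal sketch of how such an optional tensor would typically be consumed at graph-build time (assumed, not part of this hunk):

    // sketch: guard on the optional tensor before using it
    struct ggml_tensor * act_scales = model.layers[il].ffn_act; // nullptr when the model ships no AWQ scales
    if (act_scales != nullptr) {
        // e.g. rescale the FFN activations by the stored scales before the down projection
        cur = ggml_div(ctx0, cur, act_scales);
    }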
3585
3607
  case LLM_ARCH_STABLELM:
3586
3608
  {
3587
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3609
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3588
3610
 
3589
3611
  // output
3590
3612
  {
3591
- ggml_backend_type backend_norm;
3592
- ggml_backend_type backend_output;
3593
-
3594
- if (n_gpu_layers > int(n_layer)) {
3595
- backend_norm = llama_backend_offload;
3596
- backend_output = llama_backend_offload_split;
3597
- } else {
3598
- backend_norm = GGML_BACKEND_CPU;
3599
- backend_output = GGML_BACKEND_CPU;
3600
- }
3601
-
3602
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3603
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3604
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3613
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
3614
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3615
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3605
3616
  }
3606
3617
 
3607
- const uint32_t n_ff = hparams.n_ff;
3608
- const int64_t n_embd_gqa = n_embd_v_gqa;
3609
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3610
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3611
-
3612
- const int i_gpu_start = n_layer - n_gpu_layers;
3613
-
3614
- model.layers.resize(n_layer);
3615
-
3616
- for (uint32_t i = 0; i < n_layer; ++i) {
3617
- /*
3618
- llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
3619
- */
3620
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3621
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3618
+ for (int i = 0; i < n_layer; ++i) {
3619
+ ggml_context * ctx_layer = ctx_for_layer(i);
3620
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3622
3621
 
3623
3622
  auto & layer = model.layers[i];
3624
3623
 
3625
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3626
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3624
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3625
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3627
3626
 
3628
- layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
3629
- layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3630
- layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3631
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3627
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3628
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3629
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3630
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3632
3631
 
3633
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3634
- layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
3632
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3633
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
3635
3634
 
3636
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3637
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3638
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3635
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
3636
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
3637
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3639
3638
  }
3640
3639
  } break;
3641
3640
  case LLM_ARCH_QWEN:
3642
3641
  {
3643
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3644
- {
3645
- ggml_backend_type backend_norm;
3646
- ggml_backend_type backend_output;
3647
-
3648
- if (n_gpu_layers > int(n_layer)) {
3649
- backend_norm = llama_backend_offload;
3650
- backend_output = llama_backend_offload_split;
3651
- } else {
3652
- backend_norm = GGML_BACKEND_CPU;
3653
- backend_output = GGML_BACKEND_CPU;
3654
- }
3655
-
3656
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3657
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3658
- }
3659
-
3660
- const uint32_t n_ff = hparams.n_ff / 2;
3661
-
3662
- const int i_gpu_start = n_layer - n_gpu_layers;
3642
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3663
3643
 
3664
- model.layers.resize(n_layer);
3644
+ // output
3645
+ {
3646
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3647
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3648
+ }
3665
3649
 
3666
- for (uint32_t i = 0; i < n_layer; ++i) {
3667
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3668
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3650
+ for (int i = 0; i < n_layer; ++i) {
3651
+ ggml_context * ctx_layer = ctx_for_layer(i);
3652
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3669
3653
 
3670
3654
  auto & layer = model.layers[i];
3671
3655
 
3672
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3656
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3673
3657
 
3674
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split);
3675
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}, backend);
3676
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3658
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3});
3659
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3});
3660
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3677
3661
 
3678
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3662
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3679
3663
 
3680
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3681
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3682
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3664
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2});
3665
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd});
3666
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2});
3683
3667
  }
3684
3668
  } break;
3685
3669
  case LLM_ARCH_PHI2:
3686
3670
  {
3687
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3671
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3688
3672
 
3689
3673
  // output
3690
3674
  {
3691
- ggml_backend_type backend_norm;
3692
- ggml_backend_type backend_output;
3693
-
3694
- if (n_gpu_layers > int(n_layer)) {
3695
- backend_norm = llama_backend_offload;
3696
- backend_output = llama_backend_offload;
3697
- } else {
3698
- backend_norm = GGML_BACKEND_CPU;
3699
- backend_output = GGML_BACKEND_CPU;
3700
- }
3701
-
3702
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3703
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3704
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3705
- model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
3675
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3676
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
3677
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3678
+ model.output_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab});
3706
3679
  }
3707
3680
 
3708
- const uint32_t n_ff = hparams.n_ff;
3709
- const int64_t n_embd_gqa = n_embd_v_gqa;
3710
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3711
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3681
+ for (int i = 0; i < n_layer; ++i) {
3682
+ ggml_context * ctx_layer = ctx_for_layer(i);
3683
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3712
3684
 
3713
- const int i_gpu_start = n_layer - n_gpu_layers;
3685
+ auto & layer = model.layers[i];
3714
3686
 
3715
- model.layers.resize(n_layer);
3687
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3688
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3716
3689
 
3717
- for (uint32_t i = 0; i < n_layer; ++i) {
3718
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3719
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3690
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
3691
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
3720
3692
 
3721
- auto & layer = model.layers[i];
3693
+ if (layer.wqkv == nullptr) {
3694
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3695
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
3722
3696
 
3723
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3724
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3697
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3698
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
3725
3699
 
3726
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
3727
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
3700
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3701
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
3702
+ }
3728
3703
 
3729
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3730
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
3704
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3705
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
3731
3706
 
3732
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
3733
- layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
3707
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3708
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
3734
3709
 
3735
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3736
- layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
3710
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3711
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3737
3712
  }
3738
3713
  } break;
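For Phi-2 the fused QKV weight and bias are now requested with the same optional flag (the trailing `false` on the `wqkv`/`bqkv` calls), and when they are absent the loader falls back to separate Q/K/V projections with biases, so GGUF conversions that export either layout load. The matching read side at graph-build time is not part of this hunk; a hedged sketch of what the loader change implies there:

    // sketch: use the fused projection when present, otherwise the split ones
    struct ggml_tensor * Qcur;
    struct ggml_tensor * Kcur;
    struct ggml_tensor * Vcur;

    if (model.layers[il].wqkv) {
        struct ggml_tensor * qkv = ggml_mul_mat(ctx0, model.layers[il].wqkv, cur);
        qkv = ggml_add(ctx0, qkv, model.layers[il].bqkv);
        // slice the fused [n_embd + 2*n_embd_gqa, n_tokens] result into Q, K, V
        Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd,     n_tokens, qkv->nb[1], 0));
        Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, qkv->nb[1], sizeof(float)*n_embd));
        Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, qkv, n_embd_gqa, n_tokens, qkv->nb[1], sizeof(float)*(n_embd + n_embd_gqa)));
    } else {
        Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, cur), model.layers[il].bq);
        Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, cur), model.layers[il].bk);
        Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, cur), model.layers[il].bv);
    }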
3739
3714
  case LLM_ARCH_PLAMO:
3740
3715
  {
3741
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3716
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3742
3717
 
3743
3718
  // output
3744
3719
  {
3745
- ggml_backend_type backend_norm;
3746
- ggml_backend_type backend_output;
3747
-
3748
- if (n_gpu_layers > int(n_layer)) {
3749
- backend_norm = llama_backend_offload;
3750
- backend_output = llama_backend_offload_split;
3751
- } else {
3752
- backend_norm = GGML_BACKEND_CPU;
3753
- backend_output = GGML_BACKEND_CPU;
3754
- }
3755
-
3756
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3757
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3720
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3721
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3758
3722
  }
3759
3723
 
3760
- const uint32_t n_ff = hparams.n_ff;
3761
- const int64_t n_embd_gqa = n_embd_v_gqa;
3762
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3763
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3764
-
3765
- const int i_gpu_start = n_layer - n_gpu_layers;
3766
-
3767
- model.layers.resize(n_layer);
3768
-
3769
- for (uint32_t i = 0; i < n_layer; ++i) {
3770
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3771
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3724
+ for (int i = 0; i < n_layer; ++i) {
3725
+ ggml_context * ctx_layer = ctx_for_layer(i);
3726
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3772
3727
 
3773
3728
  auto & layer = model.layers[i];
3774
3729
 
3775
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3730
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3776
3731
 
3777
- layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
3778
- layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3779
- layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3780
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3732
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3733
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3734
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3735
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3781
3736
 
3782
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3783
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3784
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3737
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
3738
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
3739
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3785
3740
  }
3786
3741
  } break;
3787
3742
  case LLM_ARCH_GPT2:
3788
3743
  {
3789
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3790
- model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
3744
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3745
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
3791
3746
 
3792
3747
  // output
3793
3748
  {
3794
- ggml_backend_type backend_norm;
3795
- ggml_backend_type backend_output;
3796
-
3797
- if (n_gpu_layers > int(n_layer)) {
3798
- backend_norm = llama_backend_offload;
3799
- backend_output = llama_backend_offload_split;
3800
- } else {
3801
- backend_norm = GGML_BACKEND_CPU;
3802
- backend_output = GGML_BACKEND_CPU;
3803
- }
3804
-
3805
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3806
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3807
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3749
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3750
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
3751
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3808
3752
  }
3809
3753
 
3810
- const uint32_t n_ff = hparams.n_ff;
3811
- const int64_t n_embd_gqa = n_embd_v_gqa;
3812
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3813
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3814
-
3815
- const int i_gpu_start = n_layer - n_gpu_layers;
3816
-
3817
- model.layers.resize(n_layer);
3818
-
3819
- for (uint32_t i = 0; i < n_layer; ++i) {
3820
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3821
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3754
+ for (int i = 0; i < n_layer; ++i) {
3755
+ ggml_context * ctx_layer = ctx_for_layer(i);
3756
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3822
3757
 
3823
3758
  auto & layer = model.layers[i];
3824
3759
 
3825
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3826
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3760
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3761
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3827
3762
 
3828
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
3829
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
3763
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
3764
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
3830
3765
 
3831
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3832
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
3766
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3767
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
3833
3768
 
3834
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3835
- layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
3769
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3770
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
3836
3771
 
3837
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
3838
- layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
3772
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3773
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
3839
3774
 
3840
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3841
- layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
3775
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3776
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3842
3777
  }
3843
3778
  } break;
3844
3779
  default:
@@ -3848,78 +3783,51 @@ static bool llm_load_tensors(
3848
3783
 
3849
3784
  ml.done_getting_tensors();
3850
3785
 
3851
- ml.init_mapping();
3786
+ ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);
3852
3787
 
3853
- // allocate tensors
3854
- size_t vram_weights = 0;
3855
- size_t buf_size = 0;
3788
+ // create the backend buffers
3789
+ std::vector<std::pair<ggml_context *, ggml_backend_buffer_t>> ctx_bufs;
3856
3790
 
3857
- ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers);
3791
+ for (auto & it : ctx_map) {
3792
+ ggml_backend_buffer_type_t buft = it.first;
3793
+ ggml_context * ctx = it.second;
3794
+ ggml_backend_buffer_t buf = nullptr;
3858
3795
 
3859
- for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
3860
- // GGML_BACKEND_GPU tensors are for CUDA and OpenCL only, which are handled separately without ggml-backend
3861
- if (t->backend == GGML_BACKEND_CPU) {
3862
- buf_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft));
3863
- } else {
3864
- vram_weights += ggml_nbytes(t);
3796
+ // only the mmap region containing the tensors in the model is mapped to the backend buffer
3797
+ // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
3798
+ // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
3799
+ if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
3800
+ size_t first, last;
3801
+ ml.get_mapping_range(&first, &last, ctx);
3802
+ buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
3865
3803
  }
3866
- }
3867
-
3868
- // create backend buffer
3869
- ggml_backend_buffer_t buf_mmap = nullptr;
3870
-
3871
3804
  #ifdef GGML_USE_METAL
3872
- if (n_gpu_layers > 0) {
3873
- if (ml.use_mmap) {
3805
+ else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
3874
3806
  const size_t max_size = ggml_get_max_tensor_size(ctx);
3875
- model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size);
3876
- buf_mmap = model.buf;
3877
- } else {
3878
- model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
3807
+ size_t first, last;
3808
+ ml.get_mapping_range(&first, &last, ctx);
3809
+ buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size);
3879
3810
  }
3880
- }
3881
- #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
3882
- // for testing only
3883
- if (n_gpu_layers > 0) {
3884
- model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0));
3885
- }
3886
3811
  #endif
3887
-
3888
- if (model.buf == nullptr) {
3889
- // CPU backend, and indirectly CUDA and OpenCL
3890
- if (ml.use_mmap) {
3891
- model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size);
3892
- buf_mmap = model.buf;
3893
- } else {
3894
- // allocate only CPU tensors
3895
- model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
3896
- ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf);
3897
- for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
3898
- if (t->backend == GGML_BACKEND_CPU) {
3899
- ggml_tallocr_alloc(alloc, t);
3900
- }
3812
+ else {
3813
+ buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
3814
+ if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
3815
+ model.mlock_buf.init (ggml_backend_buffer_get_base(buf));
3816
+ model.mlock_buf.grow_to(ggml_backend_buffer_get_size(buf));
3901
3817
  }
3902
- ggml_tallocr_free(alloc);
3903
3818
  }
3904
- }
3905
-
3906
- if (use_mlock && ggml_backend_buffer_is_host(model.buf)) {
3907
- model.mlock_buf.init (ggml_backend_buffer_get_base(model.buf));
3908
- model.mlock_buf.grow_to(ggml_backend_buffer_get_size(model.buf));
3819
+ if (buf == nullptr) {
3820
+ throw std::runtime_error("failed to allocate buffer");
3821
+ }
3822
+ // indicate that this buffer contains weights
3823
+ // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
3824
+ ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
3825
+ model.bufs.push_back(buf);
3826
+ ctx_bufs.emplace_back(ctx, buf);
3909
3827
  }
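In the buffer-creation loop above, only the byte range of the mapping that actually backs a context's tensors is wrapped in a backend buffer via `ml.get_mapping_range(&first, &last, ctx)`, which is what lets CPU-resident and Metal-offloaded layers share one mmap. The helper itself is outside this hunk; a sketch of what it plausibly computes follows — `file_offset()` is a hypothetical placeholder for looking up a tensor's data offset in the file.

    // inside llama_model_loader (sketch, assumption):
    void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
        GGML_ASSERT(mapping);

        *first = mapping->size;
        *last  = 0;
        for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
            const size_t offs = file_offset(ggml_get_name(t)); // hypothetical lookup of the tensor's file offset
            *first = std::min(*first, offs);
            *last  = std::max(*last,  offs + ggml_nbytes(t));
        }
    }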
3910
3828
 
3911
3829
  // print memory requirements
3912
3830
  {
3913
- size_t sys_mem_required = ctx_size + buf_size;
3914
-
3915
- if (sys_mem_required > 0) {
3916
- LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0);
3917
- }
3918
- if (vram_weights > 0) {
3919
- LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
3920
- }
3921
-
3922
- #if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
3923
3831
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
3924
3832
 
3925
3833
  LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -3931,23 +3839,26 @@ static bool llm_load_tensors(
3931
3839
  const int max_offloadable_layers = hparams.n_layer + 1;
3932
3840
 
3933
3841
  LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
3934
- #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
3935
- }
3936
3842
 
3937
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
3938
- ggml_cuda_set_tensor_split(tensor_split);
3939
- #else
3940
- GGML_UNUSED(tensor_split);
3941
- #endif // GGML_USE_CUBLAS
3843
+ for (ggml_backend_buffer_t buf : model.bufs) {
3844
+ LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
3845
+ }
3846
+ }
3942
3847
 
3943
3848
  // populate tensors_by_name
3944
- for (int i = 0; i < ml.n_tensors; ++i) {
3945
- struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
3946
- model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
3849
+ for (ggml_context * ctx : model.ctxs) {
3850
+ for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
3851
+ model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
3852
+ }
3947
3853
  }
3948
3854
 
3949
- if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
3950
- return false;
3855
+ // load tensor data
3856
+ for (auto & it : ctx_bufs) {
3857
+ ggml_context * ctx = it.first;
3858
+ ggml_backend_buffer_t buf = it.second;
3859
+ if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) {
3860
+ return false;
3861
+ }
3951
3862
  }
3952
3863
 
3953
3864
  model.mapping = std::move(ml.mapping);
@@ -3981,13 +3892,13 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
3981
3892
  }
3982
3893
 
3983
3894
  if (!llm_load_tensors(
3984
- ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
3895
+ ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
3985
3896
  params.progress_callback, params.progress_callback_user_data
3986
3897
  )) {
3987
3898
  return -2;
3988
3899
  }
3989
3900
  } catch (const std::exception & err) {
3990
- LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
3901
+ LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
3991
3902
  return -1;
3992
3903
  }
3993
3904
 
@@ -4059,7 +3970,6 @@ static void llm_build_k_shift(
4059
3970
  struct ggml_cgraph * graph,
4060
3971
  llm_rope_type type,
4061
3972
  int64_t n_ctx,
4062
- int n_rot,
4063
3973
  float freq_base,
4064
3974
  float freq_scale,
4065
3975
  const llm_build_cb & cb) {
@@ -4067,14 +3977,13 @@ static void llm_build_k_shift(
4067
3977
  const int64_t n_head_kv = hparams.n_head_kv;
4068
3978
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
4069
3979
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
3980
+ const int32_t n_rot = hparams.n_rot;
4070
3981
  const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
4071
3982
  const float ext_factor = cparams.yarn_ext_factor;
4072
3983
  const float attn_factor = cparams.yarn_attn_factor;
4073
3984
  const float beta_fast = cparams.yarn_beta_fast;
4074
3985
  const float beta_slow = cparams.yarn_beta_slow;
4075
3986
 
4076
- GGML_ASSERT(n_embd_head_k % n_rot == 0);
4077
-
4078
3987
  struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
4079
3988
  cb(K_shift, "K_shift", -1);
4080
3989
 
@@ -4433,8 +4342,6 @@ struct llm_build_context {
4433
4342
  do_rope_shift (worst_case || kv_self.has_shift),
4434
4343
  cb (cb),
4435
4344
  buf_compute_meta (lctx.buf_compute_meta) {
4436
- GGML_ASSERT(!!kv_self.ctx);
4437
-
4438
4345
  // all initializations should be done in init()
4439
4346
  }
4440
4347
 
@@ -4478,7 +4385,7 @@ struct llm_build_context {
4478
4385
 
4479
4386
  // shift the entire K-cache if needed
4480
4387
  if (do_rope_shift) {
4481
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
4388
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
4482
4389
  }
4483
4390
 
4484
4391
  for (int il = 0; il < n_layer; ++il) {
@@ -4514,16 +4421,22 @@ struct llm_build_context {
4514
4421
  cb(Vcur, "Vcur", il);
4515
4422
  }
4516
4423
 
4424
+ // these nodes are added to the graph together so that they are not reordered
4425
+ // by doing so, the number of splits in the graph is reduced
4426
+ ggml_build_forward_expand(gf, Qcur);
4427
+ ggml_build_forward_expand(gf, Kcur);
4428
+ ggml_build_forward_expand(gf, Vcur);
4429
+
4517
4430
  Qcur = ggml_rope_custom(
4518
4431
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
4519
- n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
4432
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
4520
4433
  ext_factor, attn_factor, beta_fast, beta_slow
4521
4434
  );
4522
4435
  cb(Qcur, "Qcur", il);
4523
4436
 
4524
4437
  Kcur = ggml_rope_custom(
4525
4438
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
4526
- n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
4439
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
4527
4440
  ext_factor, attn_factor, beta_fast, beta_slow
4528
4441
  );
4529
4442
  cb(Kcur, "Kcur", il);
@@ -4646,6 +4559,7 @@ struct llm_build_context {
4646
4559
 
4647
4560
  const int64_t n_embd_head = hparams.n_embd_head_v;
4648
4561
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
4562
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
4649
4563
 
4650
4564
  struct ggml_tensor * cur;
4651
4565
  struct ggml_tensor * inpL;
@@ -4663,7 +4577,7 @@ struct llm_build_context {
4663
4577
 
4664
4578
  // shift the entire K-cache if needed
4665
4579
  if (do_rope_shift) {
4666
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
4580
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
4667
4581
  }
4668
4582
 
4669
4583
  for (int il = 0; il < n_layer; ++il) {
@@ -4689,12 +4603,12 @@ struct llm_build_context {
4689
4603
  case MODEL_7B:
4690
4604
  Qcur = ggml_rope_custom(
4691
4605
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
4692
- n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
4606
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
4693
4607
  ext_factor, attn_factor, beta_fast, beta_slow
4694
4608
  );
4695
4609
  Kcur = ggml_rope_custom(
4696
4610
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
4697
- n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
4611
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
4698
4612
  ext_factor, attn_factor, beta_fast, beta_slow
4699
4613
  );
4700
4614
  break;
@@ -4767,7 +4681,7 @@ struct llm_build_context {
4767
4681
  const int64_t n_embd_head = hparams.n_embd_head_v;
4768
4682
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
4769
4683
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
4770
- GGML_ASSERT(n_embd_gqa == n_embd);
4684
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
4771
4685
 
4772
4686
  struct ggml_tensor * cur;
4773
4687
  struct ggml_tensor * inpL;
@@ -4785,7 +4699,7 @@ struct llm_build_context {
4785
4699
 
4786
4700
  // shift the entire K-cache if needed
4787
4701
  if (do_rope_shift) {
4788
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
4702
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
4789
4703
  }
4790
4704
 
4791
4705
  for (int il = 0; il < n_layer; ++il) {
@@ -4826,13 +4740,13 @@ struct llm_build_context {
4826
4740
 
4827
4741
  // using mode = 2 for neox mode
4828
4742
  Qcur = ggml_rope_custom(
4829
- ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
4743
+ ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
4830
4744
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
4831
4745
  );
4832
4746
  cb(Qcur, "Qcur", il);
4833
4747
 
4834
4748
  Kcur = ggml_rope_custom(
4835
- ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
4749
+ ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
4836
4750
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
4837
4751
  );
4838
4752
  cb(Kcur, "Kcur", il);
@@ -4891,7 +4805,6 @@ struct llm_build_context {
4891
4805
  const int64_t n_embd_head = hparams.n_embd_head_v;
4892
4806
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
4893
4807
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
4894
- GGML_ASSERT(n_embd_gqa == n_embd);
4895
4808
 
4896
4809
  struct ggml_tensor * cur;
4897
4810
  struct ggml_tensor * pos;
@@ -4990,17 +4903,14 @@ struct llm_build_context {
4990
4903
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
4991
4904
 
4992
4905
  const int64_t n_embd_head = hparams.n_embd_head_v;
4993
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
4994
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
4995
- GGML_ASSERT(n_embd_gqa == n_embd);
4996
-
4997
- const int64_t n_rot = n_embd_head_k / 2;
4906
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
4907
+ GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
4998
4908
 
4999
4909
  struct ggml_tensor * cur;
5000
4910
  struct ggml_tensor * inpL;
5001
4911
 
5002
4912
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5003
- cb(inpL, "imp_embd", -1);
4913
+ cb(inpL, "inp_embd", -1);
5004
4914
 
5005
4915
  // inp_pos - contains the positions
5006
4916
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
@@ -5011,7 +4921,7 @@ struct llm_build_context {
5011
4921
  cb(KQ_mask, "KQ_mask", -1);
5012
4922
 
5013
4923
  if (do_rope_shift) {
5014
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
4924
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5015
4925
  }
5016
4926
 
5017
4927
  for (int il = 0; il < n_layer; ++il) {
@@ -5071,7 +4981,7 @@ struct llm_build_context {
5071
4981
 
5072
4982
  // RoPE the first n_rot of q/k, pass the other half, and concat.
5073
4983
  struct ggml_tensor * qrot = ggml_view_3d(
5074
- ctx0, tmpq, n_rot, n_head, n_tokens,
4984
+ ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
5075
4985
  ggml_element_size(tmpq) * n_embd_head,
5076
4986
  ggml_element_size(tmpq) * n_embd_head * n_head,
5077
4987
  0
@@ -5079,7 +4989,7 @@ struct llm_build_context {
5079
4989
  cb(qrot, "qrot", il);
5080
4990
 
5081
4991
  struct ggml_tensor * krot = ggml_view_3d(
5082
- ctx0, tmpk, n_rot, n_head, n_tokens,
4992
+ ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
5083
4993
  ggml_element_size(tmpk) * n_embd_head,
5084
4994
  ggml_element_size(tmpk) * n_embd_head * n_head,
5085
4995
  0
@@ -5088,29 +4998,29 @@ struct llm_build_context {
5088
4998
 
5089
4999
  // get the second half of tmpq, e.g. tmpq[n_rot:, :, :]
5090
5000
  struct ggml_tensor * qpass = ggml_view_3d(
5091
- ctx0, tmpq, n_rot, n_head, n_tokens,
5001
+ ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
5092
5002
  ggml_element_size(tmpq) * n_embd_head,
5093
5003
  ggml_element_size(tmpq) * n_embd_head * n_head,
5094
- ggml_element_size(tmpq) * n_rot
5004
+ ggml_element_size(tmpq) * hparams.n_rot
5095
5005
  );
5096
5006
  cb(qpass, "qpass", il);
5097
5007
 
5098
5008
  struct ggml_tensor * kpass = ggml_view_3d(
5099
- ctx0, tmpk, n_rot, n_head, n_tokens,
5009
+ ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
5100
5010
  ggml_element_size(tmpk) * n_embd_head,
5101
5011
  ggml_element_size(tmpk) * n_embd_head * n_head,
5102
- ggml_element_size(tmpk) * n_rot
5012
+ ggml_element_size(tmpk) * hparams.n_rot
5103
5013
  );
5104
5014
  cb(kpass, "kpass", il);
5105
5015
 
5106
5016
  struct ggml_tensor * qrotated = ggml_rope_custom(
5107
- ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx,
5017
+ ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5108
5018
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5109
5019
  );
5110
5020
  cb(qrotated, "qrotated", il);
5111
5021
 
5112
5022
  struct ggml_tensor * krotated = ggml_rope_custom(
5113
- ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx,
5023
+ ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5114
5024
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5115
5025
  );
5116
5026
  cb(krotated, "krotated", il);
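The Persimmon graph above splits each head into a rotated slice of width hparams.n_rot and an un-rotated remainder, using strided ggml_view_3d views over the same tensor. Below is a minimal standalone sketch of those two views; it is not code from the package, the dimensions are made up for illustration, and it assumes the ggml.h header shipped with this release:

    #include "ggml.h"

    int main() {
        // small context just to hold a few tensors (sizes are illustrative)
        struct ggml_init_params params = { 16*1024*1024, nullptr, false };
        struct ggml_context * ctx = ggml_init(params);

        const int n_embd_head = 64, n_head = 8, n_tokens = 4;
        const int n_rot = n_embd_head/2; // the graph above asserts n_embd_head/2 == hparams.n_rot

        struct ggml_tensor * q = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd_head, n_head, n_tokens);

        // first n_rot features of every head: this slice gets RoPE applied
        struct ggml_tensor * qrot = ggml_view_3d(ctx, q, n_rot, n_head, n_tokens,
                ggml_element_size(q)*n_embd_head,
                ggml_element_size(q)*n_embd_head*n_head,
                0);

        // remaining features: passed through unrotated (the offset is in bytes)
        struct ggml_tensor * qpass = ggml_view_3d(ctx, q, n_embd_head - n_rot, n_head, n_tokens,
                ggml_element_size(q)*n_embd_head,
                ggml_element_size(q)*n_embd_head*n_head,
                ggml_element_size(q)*n_rot);

        (void) qrot; (void) qpass;
        ggml_free(ctx);
        return 0;
    }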
@@ -5204,9 +5114,7 @@ struct llm_build_context {
5204
5114
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5205
5115
 
5206
5116
  const int64_t n_embd_head = hparams.n_embd_head_v;
5207
- const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
5208
5117
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5209
- GGML_ASSERT(n_embd_gqa == n_embd);
5210
5118
 
5211
5119
  struct ggml_tensor * cur;
5212
5120
  struct ggml_tensor * inpL;
@@ -5299,7 +5207,6 @@ struct llm_build_context {
5299
5207
  const int64_t n_embd_head = hparams.n_embd_head_v;
5300
5208
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
5301
5209
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5302
- GGML_ASSERT(n_embd_gqa == n_embd);
5303
5210
 
5304
5211
  struct ggml_tensor * cur;
5305
5212
  struct ggml_tensor * inpL;
@@ -5395,7 +5302,6 @@ struct llm_build_context {
5395
5302
  const int64_t n_embd_head = hparams.n_embd_head_v;
5396
5303
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
5397
5304
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5398
- GGML_ASSERT(n_embd_gqa == n_embd);
5399
5305
 
5400
5306
  struct ggml_tensor * cur;
5401
5307
  struct ggml_tensor * inpL;
@@ -5511,7 +5417,7 @@ struct llm_build_context {
5511
5417
 
5512
5418
  // shift the entire K-cache if needed
5513
5419
  if (do_rope_shift) {
5514
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb);
5420
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5515
5421
  }
5516
5422
 
5517
5423
  for (int il = 0; il < n_layer; ++il) {
@@ -5624,7 +5530,7 @@ struct llm_build_context {
5624
5530
 
5625
5531
  // shift the entire K-cache if needed
5626
5532
  if (do_rope_shift) {
5627
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
5533
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5628
5534
  }
5629
5535
 
5630
5536
  for (int il = 0; il < n_layer; ++il) {
@@ -5656,13 +5562,13 @@ struct llm_build_context {
5656
5562
 
5657
5563
  // using mode = 2 for neox mode
5658
5564
  Qcur = ggml_rope_custom(
5659
- ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
5565
+ ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5660
5566
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5661
5567
  );
5662
5568
  cb(Qcur, "Qcur", il);
5663
5569
 
5664
5570
  Kcur = ggml_rope_custom(
5665
- ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
5571
+ ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5666
5572
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5667
5573
  );
5668
5574
  cb(Kcur, "Kcur", il);
@@ -5722,7 +5628,6 @@ struct llm_build_context {
5722
5628
  const int64_t n_embd_head = hparams.n_embd_head_v;
5723
5629
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
5724
5630
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5725
- GGML_ASSERT(n_embd_gqa == n_embd);
5726
5631
 
5727
5632
  struct ggml_tensor * cur;
5728
5633
  struct ggml_tensor * attn_norm_output;
@@ -5742,7 +5647,7 @@ struct llm_build_context {
5742
5647
 
5743
5648
  // shift the entire K-cache if needed
5744
5649
  if (do_rope_shift) {
5745
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
5650
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5746
5651
  }
5747
5652
 
5748
5653
  for (int il = 0; il < n_layer; ++il) {
@@ -5754,15 +5659,25 @@ struct llm_build_context {
5754
5659
 
5755
5660
  // self-attention
5756
5661
  {
5757
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
5758
- cb(cur, "wqkv", il);
5662
+ struct ggml_tensor * Qcur = nullptr;
5663
+ struct ggml_tensor * Kcur = nullptr;
5664
+ struct ggml_tensor * Vcur = nullptr;
5759
5665
 
5760
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5761
- cb(cur, "bqkv", il);
5666
+ if (model.layers[il].wqkv) {
5667
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
5668
+ cb(cur, "wqkv", il);
5762
5669
 
5763
- struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5764
- struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5765
- struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5670
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5671
+ cb(cur, "bqkv", il);
5672
+
5673
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5674
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5675
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5676
+ } else {
5677
+ Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
5678
+ Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
5679
+ Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
5680
+ }
5766
5681
 
5767
5682
  cb(Qcur, "Qcur", il);
5768
5683
  cb(Kcur, "Kcur", il);
@@ -5838,6 +5753,7 @@ struct llm_build_context {
5838
5753
 
5839
5754
  const int64_t n_embd_head = hparams.n_embd_head_v;
5840
5755
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5756
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
5841
5757
 
5842
5758
  struct ggml_tensor * cur;
5843
5759
  struct ggml_tensor * inpL;
@@ -5855,7 +5771,7 @@ struct llm_build_context {
5855
5771
 
5856
5772
  // shift the entire K-cache if needed
5857
5773
  if (do_rope_shift) {
5858
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
5774
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
5859
5775
  }
5860
5776
 
5861
5777
  for (int il = 0; il < n_layer; ++il) {
@@ -5881,13 +5797,13 @@ struct llm_build_context {
5881
5797
  cb(Vcur, "Vcur", il);
5882
5798
 
5883
5799
  Qcur = ggml_rope_custom(
5884
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5800
+ ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos,
5885
5801
  n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
5886
5802
  ext_factor, attn_factor, beta_fast, beta_slow);
5887
5803
  cb(Qcur, "Qcur", il);
5888
5804
 
5889
5805
  Kcur = ggml_rope_custom(
5890
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5806
+ ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos,
5891
5807
  n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
5892
5808
  ext_factor, attn_factor, beta_fast, beta_slow);
5893
5809
  cb(Kcur, "Kcur", il);
@@ -5946,7 +5862,6 @@ struct llm_build_context {
5946
5862
  const int64_t n_embd_head = hparams.n_embd_head_v;
5947
5863
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
5948
5864
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5949
- GGML_ASSERT(n_embd_gqa == n_embd);
5950
5865
 
5951
5866
  struct ggml_tensor * cur;
5952
5867
  struct ggml_tensor * pos;
@@ -6042,199 +5957,13 @@ struct llm_build_context {
6042
5957
  }
6043
5958
  };
6044
5959
 
6045
- //
6046
- // tensor offloading helpers
6047
- //
6048
- // TODO: will be removed with backend v2
6049
-
6050
- enum llm_offload_func_e {
6051
- OFFLOAD_FUNC_NOP,
6052
- OFFLOAD_FUNC,
6053
- OFFLOAD_FUNC_FRC, // force offload
6054
- OFFLOAD_FUNC_KQV,
6055
- OFFLOAD_FUNC_NR,
6056
- OFFLOAD_FUNC_EMB, // embeddings
6057
- OFFLOAD_FUNC_OUT,
6058
- };
6059
-
6060
- // TODO: will be removed with backend v2
6061
- struct llm_offload_trie {
6062
- struct node {
6063
- ~node() {
6064
- for (int i = 0; i < 256; ++i) {
6065
- if (children[i]) {
6066
- delete children[i];
6067
- }
6068
- }
6069
- }
6070
-
6071
- node * children[256] = { nullptr };
6072
- llm_offload_func_e func = OFFLOAD_FUNC_NOP;
6073
- };
6074
-
6075
- llm_offload_trie() {
6076
- root = new node;
6077
- }
6078
-
6079
- llm_offload_trie(const std::unordered_map<const char *, llm_offload_func_e> & map) {
6080
- root = new node;
6081
-
6082
- for (const auto & kv : map) {
6083
- add(kv.first, kv.second);
6084
- }
6085
- }
6086
-
6087
- ~llm_offload_trie() {
6088
- delete root;
6089
- }
6090
-
6091
- void add(const char * name, llm_offload_func_e func) {
6092
- node * cur = root;
6093
-
6094
- for (int i = 0; ; ++i) {
6095
- const uint8_t c = name[i];
6096
-
6097
- if (!c) {
6098
- break;
6099
- }
6100
-
6101
- if (!cur->children[c]) {
6102
- cur->children[c] = new node;
6103
- }
6104
-
6105
- cur = cur->children[c];
6106
- }
6107
-
6108
- cur->func = func;
6109
- }
6110
-
6111
- llm_offload_func_e find(const char * name) const {
6112
- const node * cur = root;
6113
-
6114
- for (int i = 0; ; ++i) {
6115
- const uint8_t c = name[i];
6116
-
6117
- if (!c) {
6118
- break;
6119
- }
6120
-
6121
- if (!cur->children[c]) {
6122
- return OFFLOAD_FUNC_NOP;
6123
- }
6124
-
6125
- cur = cur->children[c];
6126
- }
6127
-
6128
- return cur->func;
6129
- }
6130
-
6131
- node * root = nullptr;
6132
- };
6133
-
6134
- // TODO: will be removed with backend v2
6135
- static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map = {
6136
- //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
6137
- //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
6138
- { "pos_embd", OFFLOAD_FUNC_NR },
6139
-
6140
- { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
6141
- { "KQ_mask", OFFLOAD_FUNC_FRC },
6142
- { "K_shift", OFFLOAD_FUNC_FRC },
6143
-
6144
- { "K_shifted", OFFLOAD_FUNC },
6145
-
6146
- { "inp_norm", OFFLOAD_FUNC_NR },
6147
- { "inp_norm_w", OFFLOAD_FUNC_NR },
6148
- { "inp_norm_wb", OFFLOAD_FUNC_NR },
6149
-
6150
- { "norm", OFFLOAD_FUNC },
6151
- { "norm_w", OFFLOAD_FUNC },
6152
- { "norm_wb", OFFLOAD_FUNC },
6153
-
6154
- { "attn_norm", OFFLOAD_FUNC },
6155
- { "attn_norm_2", OFFLOAD_FUNC },
6156
-
6157
- { "wqkv", OFFLOAD_FUNC_KQV },
6158
- { "bqkv", OFFLOAD_FUNC_KQV },
6159
- { "wqkv_clamped", OFFLOAD_FUNC_KQV },
6160
-
6161
- { "tmpk", OFFLOAD_FUNC_KQV },
6162
- { "tmpq", OFFLOAD_FUNC_KQV },
6163
- { "tmpv", OFFLOAD_FUNC_KQV },
6164
- { "Kcur", OFFLOAD_FUNC_KQV },
6165
- { "Qcur", OFFLOAD_FUNC_KQV },
6166
- { "Vcur", OFFLOAD_FUNC_KQV },
6167
-
6168
- { "krot", OFFLOAD_FUNC_KQV },
6169
- { "qrot", OFFLOAD_FUNC_KQV },
6170
- { "kpass", OFFLOAD_FUNC_KQV },
6171
- { "qpass", OFFLOAD_FUNC_KQV },
6172
- { "krotated", OFFLOAD_FUNC_KQV },
6173
- { "qrotated", OFFLOAD_FUNC_KQV },
6174
-
6175
- { "q", OFFLOAD_FUNC_KQV },
6176
- { "k", OFFLOAD_FUNC_KQV },
6177
- { "kq", OFFLOAD_FUNC_KQV },
6178
- { "kq_scaled", OFFLOAD_FUNC_KQV },
6179
- { "kq_scaled_alibi", OFFLOAD_FUNC_KQV },
6180
- { "kq_masked", OFFLOAD_FUNC_KQV },
6181
- { "kq_soft_max", OFFLOAD_FUNC_KQV },
6182
- { "kq_soft_max_ext", OFFLOAD_FUNC_KQV },
6183
- { "v", OFFLOAD_FUNC_KQV },
6184
- { "kqv", OFFLOAD_FUNC_KQV },
6185
- { "kqv_merged", OFFLOAD_FUNC_KQV },
6186
- { "kqv_merged_cont", OFFLOAD_FUNC_KQV },
6187
- { "kqv_wo", OFFLOAD_FUNC_KQV },
6188
- { "kqv_out", OFFLOAD_FUNC_KQV },
6189
-
6190
- { "ffn_inp", OFFLOAD_FUNC },
6191
- { "ffn_norm", OFFLOAD_FUNC },
6192
-
6193
- { "ffn_up", OFFLOAD_FUNC },
6194
- { "ffn_up_b", OFFLOAD_FUNC },
6195
- { "ffn_gate", OFFLOAD_FUNC },
6196
- { "ffn_gate_b", OFFLOAD_FUNC },
6197
- { "ffn_gate_par", OFFLOAD_FUNC },
6198
- { "ffn_act", OFFLOAD_FUNC },
6199
- { "ffn_down", OFFLOAD_FUNC },
6200
- { "ffn_down_b", OFFLOAD_FUNC },
6201
- { "ffn_out", OFFLOAD_FUNC },
6202
-
6203
- { "ffn_silu", OFFLOAD_FUNC },
6204
- { "ffn_gelu", OFFLOAD_FUNC },
6205
- { "ffn_relu", OFFLOAD_FUNC },
6206
- { "ffn_sqr(relu)", OFFLOAD_FUNC },
6207
-
6208
- { "ffn_moe_logits", OFFLOAD_FUNC },
6209
- { "ffn_moe_probs", OFFLOAD_FUNC },
6210
- { "ffn_moe_argsort", OFFLOAD_FUNC },
6211
- { "ffn_moe_weights", OFFLOAD_FUNC },
6212
- { "ffn_moe_weights_sum", OFFLOAD_FUNC },
6213
- { "ffn_moe_weights_norm", OFFLOAD_FUNC },
6214
- { "ffn_moe_weighted", OFFLOAD_FUNC },
6215
- { "ffn_moe_up", OFFLOAD_FUNC },
6216
- { "ffn_moe_gate", OFFLOAD_FUNC },
6217
- { "ffn_moe_silu", OFFLOAD_FUNC },
6218
- { "ffn_moe_gate_par", OFFLOAD_FUNC },
6219
- { "ffn_moe_down", OFFLOAD_FUNC },
6220
- { "ffn_moe_out", OFFLOAD_FUNC },
6221
-
6222
- { "l_out", OFFLOAD_FUNC },
6223
-
6224
- { "result_norm", OFFLOAD_FUNC_EMB },
6225
- { "result_output_no_bias", OFFLOAD_FUNC_EMB },
6226
- { "result_output", OFFLOAD_FUNC_OUT },
6227
- };
6228
-
6229
- static llm_offload_trie k_offload_func_trie(k_offload_map);
6230
-
6231
5960
  static struct ggml_cgraph * llama_build_graph(
6232
5961
  llama_context & lctx,
6233
5962
  const llama_batch & batch) {
6234
5963
  const auto & model = lctx.model;
6235
5964
 
6236
5965
  // check if we should build the worst-case graph (for memory measurement)
6237
- const bool worst_case = ggml_allocr_is_measure(lctx.alloc);
5966
+ const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
6238
5967
 
6239
5968
  // keep track of the input that has already been allocated
6240
5969
  bool alloc_inp_tokens = false;
@@ -6243,16 +5972,8 @@ static struct ggml_cgraph * llama_build_graph(
6243
5972
  bool alloc_inp_KQ_mask = false;
6244
5973
  bool alloc_inp_K_shift = false;
6245
5974
 
6246
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
6247
- const bool do_offload = true;
6248
- #else
6249
- const bool do_offload = true; // TODO: set to false after finishing refactoring
6250
- #endif
6251
-
6252
- int n_non_view = 0; // number of non-view tensors that have been processed by the callback
6253
-
6254
5975
  // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
6255
- // TODO: will be removed with backend v2
5976
+ // TODO: improve handling of input and output tensors, then replace this with ggml_set_name
6256
5977
  llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
6257
5978
  if (il >= 0) {
6258
5979
  ggml_format_name(cur, "%s-%d", name, il);
@@ -6263,12 +5984,11 @@ static struct ggml_cgraph * llama_build_graph(
6263
5984
  //
6264
5985
  // allocate input tensors and set input data
6265
5986
  //
6266
- // TODO: will be removed with backend v2
6267
5987
 
6268
5988
  if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) {
6269
- ggml_allocr_alloc(lctx.alloc, cur);
5989
+ ggml_tallocr_alloc(lctx.alloc, cur);
6270
5990
 
6271
- if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) {
5991
+ if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) {
6272
5992
  const int64_t n_tokens = cur->ne[0];
6273
5993
 
6274
5994
  ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
@@ -6277,10 +5997,10 @@ static struct ggml_cgraph * llama_build_graph(
6277
5997
  alloc_inp_tokens = true;
6278
5998
  }
6279
5999
 
6280
- if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) {
6281
- ggml_allocr_alloc(lctx.alloc, cur);
6000
+ if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) {
6001
+ ggml_tallocr_alloc(lctx.alloc, cur);
6282
6002
 
6283
- if (!ggml_allocr_is_measure(lctx.alloc) && batch.embd) {
6003
+ if (!ggml_tallocr_is_measure(lctx.alloc) && batch.embd) {
6284
6004
  const int64_t n_embd = cur->ne[0];
6285
6005
  const int64_t n_tokens = cur->ne[1];
6286
6006
 
@@ -6291,9 +6011,9 @@ static struct ggml_cgraph * llama_build_graph(
6291
6011
  }
6292
6012
 
6293
6013
  if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) {
6294
- ggml_allocr_alloc(lctx.alloc, cur);
6014
+ ggml_tallocr_alloc(lctx.alloc, cur);
6295
6015
 
6296
- if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) {
6016
+ if (!ggml_tallocr_is_measure(lctx.alloc) && batch.pos) {
6297
6017
  const int64_t n_tokens = cur->ne[0];
6298
6018
 
6299
6019
  static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
@@ -6304,9 +6024,9 @@ static struct ggml_cgraph * llama_build_graph(
6304
6024
  }
6305
6025
 
6306
6026
  if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
6307
- ggml_allocr_alloc(lctx.alloc, cur);
6027
+ ggml_tallocr_alloc(lctx.alloc, cur);
6308
6028
 
6309
- if (!ggml_allocr_is_measure(lctx.alloc)) {
6029
+ if (!ggml_tallocr_is_measure(lctx.alloc)) {
6310
6030
  const int64_t n_kv = cur->ne[0];
6311
6031
  const int64_t n_tokens = cur->ne[1];
6312
6032
 
@@ -6344,160 +6064,30 @@ static struct ggml_cgraph * llama_build_graph(
6344
6064
  }
6345
6065
 
6346
6066
  if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) {
6347
- ggml_allocr_alloc(lctx.alloc, cur);
6067
+ ggml_tallocr_alloc(lctx.alloc, cur);
6348
6068
 
6349
- if (!ggml_allocr_is_measure(lctx.alloc)) {
6069
+ if (!ggml_tallocr_is_measure(lctx.alloc)) {
6350
6070
  const int64_t n_ctx = cur->ne[0];
6351
6071
 
6352
6072
  int32_t * data;
6353
6073
  if (ggml_backend_buffer_is_host(cur->buffer)) {
6354
6074
  data = (int32_t *) cur->data;
6355
6075
  } else {
6356
- lctx.buf_copy.resize(ggml_nbytes(cur));
6357
- data = (int32_t *) lctx.buf_copy.data();
6358
- }
6359
-
6360
- for (int i = 0; i < n_ctx; ++i) {
6361
- data[i] = lctx.kv_self.cells[i].delta;
6362
- }
6363
-
6364
- if (data != cur->data) {
6365
- ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
6366
- }
6367
- }
6368
-
6369
- alloc_inp_K_shift = true;
6370
- }
6371
-
6372
- // view tensors are not processed further
6373
- if (cur->view_src != nullptr) {
6374
- return;
6375
- }
6376
-
6377
- if (cur->op != GGML_OP_NONE) {
6378
- n_non_view++;
6379
- }
6380
-
6381
- //
6382
- // offload layers
6383
- //
6384
- // TODO: will be removed with backend v2
6385
-
6386
- //#define LLAMA_OFFLOAD_DEBUG
6387
-
6388
- if (!do_offload) {
6389
- return;
6390
- }
6391
-
6392
- const int n_layer = model.hparams.n_layer;
6393
-
6394
- const int n_gpu_layers = model.n_gpu_layers;
6395
- const int i_gpu_start = n_layer - n_gpu_layers;
6396
-
6397
- // should we offload the final norm? yes if we are not computing embeddings
6398
- const bool offload_emb = lctx.embedding.empty();
6399
-
6400
- static const std::unordered_map<llm_offload_func_e, std::string, std::hash<int>> k_offload_func_name = {
6401
- { OFFLOAD_FUNC_NOP, "CPU" },
6402
- { OFFLOAD_FUNC_OUT, "CPU" },
6403
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
6404
- { OFFLOAD_FUNC, "GPU (CUDA)" },
6405
- { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
6406
- { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
6407
- { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
6408
- { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
6409
- #else
6410
- { OFFLOAD_FUNC, "CPU" },
6411
- { OFFLOAD_FUNC_FRC, "CPU" },
6412
- { OFFLOAD_FUNC_KQV, "CPU" },
6413
- { OFFLOAD_FUNC_NR, "CPU" },
6414
- { OFFLOAD_FUNC_EMB, "CPU" },
6415
- #endif // GGML_USE_CUBLAS
6416
- };
6417
-
6418
- // check the global map for what offload function to use for this tensor
6419
- llm_offload_func_e func_e = k_offload_func_trie.find(name);
6420
-
6421
- if (func_e == OFFLOAD_FUNC_NOP) {
6422
- #ifdef LLAMA_OFFLOAD_DEBUG
6423
- // if a tensor hasn't been offloaded, we warn the user
6424
- if (worst_case) {
6425
- LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__,
6426
- cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837");
6427
- }
6428
- #endif
6429
-
6430
- return;
6431
- }
6432
-
6433
- // count the number of layers and respect the provided n_gpu_layers
6434
- switch (func_e) {
6435
- case OFFLOAD_FUNC_NOP:
6436
- case OFFLOAD_FUNC_OUT:
6437
- break;
6438
- case OFFLOAD_FUNC:
6439
- if (n_gpu_layers < n_layer) {
6440
- if (il < i_gpu_start) {
6441
- func_e = OFFLOAD_FUNC_NOP;
6442
- }
6443
- }
6444
- break;
6445
- case OFFLOAD_FUNC_FRC:
6446
- if (!lctx.cparams.offload_kqv) {
6447
- func_e = OFFLOAD_FUNC_NOP;
6448
- } break;
6449
- case OFFLOAD_FUNC_KQV:
6450
- if (!lctx.cparams.offload_kqv) {
6451
- func_e = OFFLOAD_FUNC_NOP;
6452
- } else {
6453
- if (n_gpu_layers < n_layer) {
6454
- if (il < i_gpu_start) {
6455
- func_e = OFFLOAD_FUNC_NOP;
6456
- }
6457
- }
6458
- }
6459
- break;
6460
- case OFFLOAD_FUNC_NR:
6461
- if (n_gpu_layers <= n_layer + 0) {
6462
- func_e = OFFLOAD_FUNC_NOP;
6463
- }
6464
- break;
6465
- case OFFLOAD_FUNC_EMB:
6466
- if (!offload_emb || n_gpu_layers < n_layer) {
6467
- func_e = OFFLOAD_FUNC_NOP;
6468
- }
6469
- break;
6470
- default: GGML_ASSERT(false);
6471
- }
6472
-
6473
- offload_func_t func = ggml_offload_nop;
6474
-
6475
- // this is needed for compatibility with Metal for example
6476
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
6477
- static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc;
6478
- #else
6479
- static offload_func_t ggml_offload_gpu = ggml_offload_nop;
6480
- #endif
6076
+ lctx.buf_copy.resize(ggml_nbytes(cur));
6077
+ data = (int32_t *) lctx.buf_copy.data();
6078
+ }
6481
6079
 
6482
- switch (func_e) {
6483
- case OFFLOAD_FUNC_NOP:
6484
- case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break;
6485
- case OFFLOAD_FUNC:
6486
- case OFFLOAD_FUNC_KQV:
6487
- case OFFLOAD_FUNC_FRC:
6488
- case OFFLOAD_FUNC_NR:
6489
- case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
6490
- default: GGML_ASSERT(false);
6491
- }
6080
+ for (int i = 0; i < n_ctx; ++i) {
6081
+ data[i] = lctx.kv_self.cells[i].delta;
6082
+ }
6492
6083
 
6493
- // apply offload function to the tensor
6494
- func(cur);
6084
+ if (data != cur->data) {
6085
+ ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
6086
+ }
6087
+ }
6495
6088
 
6496
- #ifdef LLAMA_OFFLOAD_DEBUG
6497
- if (worst_case) {
6498
- LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str());
6089
+ alloc_inp_K_shift = true;
6499
6090
  }
6500
- #endif
6501
6091
  };
6502
6092
 
6503
6093
  struct ggml_cgraph * result = NULL;
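In the input-allocation callback above, K_shift (like the other input tensors) is written in place when its buffer is host-visible and otherwise staged through lctx.buf_copy and uploaded with ggml_backend_tensor_set. A compact sketch of that fill pattern follows; the helper name, the std::vector staging buffer, and the dummy value written are illustrative, while the ggml-backend calls are the ones visible in the diff:

    #include <cstdint>
    #include <vector>
    #include "ggml-backend.h"

    // Fill an I32 input tensor with host-computed values, staging through a
    // temporary buffer only when its backend buffer is not host-visible.
    static void fill_i32_input(struct ggml_tensor * t, std::vector<uint8_t> & staging) {
        int32_t * data;
        if (ggml_backend_buffer_is_host(t->buffer)) {
            data = (int32_t *) t->data;      // write straight into the tensor
        } else {
            staging.resize(ggml_nbytes(t));  // stage on the host first
            data = (int32_t *) staging.data();
        }

        for (int64_t i = 0; i < t->ne[0]; ++i) {
            data[i] = 0; // placeholder value; the diff writes kv_self.cells[i].delta here
        }

        if ((void *) data != t->data) {
            // upload the staged values into the backend buffer
            ggml_backend_tensor_set(t, data, 0, ggml_nbytes(t));
        }
    }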
@@ -6565,27 +6155,6 @@ static struct ggml_cgraph * llama_build_graph(
6565
6155
 
6566
6156
  llm.free();
6567
6157
 
6568
- if (worst_case) {
6569
- int n_non_view_total = 0;
6570
-
6571
- for (int i = 0; i < result->n_nodes; ++i) {
6572
- if (result->nodes[i]->view_src == nullptr) {
6573
- n_non_view_total++;
6574
- }
6575
- }
6576
-
6577
- LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total);
6578
-
6579
- if (n_non_view != n_non_view_total) {
6580
- LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__);
6581
- LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__);
6582
- LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__);
6583
- LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__);
6584
- LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__);
6585
- LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__);
6586
- }
6587
- }
6588
-
6589
6158
  return result;
6590
6159
  }
6591
6160
 
@@ -6631,8 +6200,6 @@ static int llama_decode_internal(
6631
6200
 
6632
6201
  auto & kv_self = lctx.kv_self;
6633
6202
 
6634
- GGML_ASSERT(!!kv_self.ctx);
6635
-
6636
6203
  const int64_t n_embd = hparams.n_embd;
6637
6204
  const int64_t n_vocab = hparams.n_vocab;
6638
6205
 
@@ -6686,12 +6253,10 @@ static int llama_decode_internal(
6686
6253
 
6687
6254
  //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
6688
6255
 
6689
- ggml_allocr_reset(lctx.alloc);
6256
+ ggml_backend_sched_reset(lctx.sched);
6690
6257
 
6691
6258
  ggml_cgraph * gf = llama_build_graph(lctx, batch);
6692
6259
 
6693
- ggml_allocr_alloc_graph(lctx.alloc, gf);
6694
-
6695
6260
  // the output is always the last tensor in the graph
6696
6261
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
6697
6262
  GGML_ASSERT(strcmp(res->name, "result_output") == 0);
@@ -6703,30 +6268,6 @@ static int llama_decode_internal(
6703
6268
  GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
6704
6269
  }
6705
6270
 
6706
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
6707
- char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc);
6708
- for (int i = 0; i < gf->n_leafs; i++) {
6709
- ggml_tensor * node = gf->leafs[i];
6710
- if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
6711
- ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
6712
- ggml_cuda_copy_to_device(node);
6713
- }
6714
- }
6715
-
6716
- for (int i = 0; i < gf->n_nodes; i++) {
6717
- ggml_tensor * node = gf->nodes[i];
6718
- if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
6719
- ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
6720
- }
6721
- }
6722
-
6723
- // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
6724
- if (!lctx.embedding.empty()) {
6725
- embeddings->backend = GGML_BACKEND_CPU;
6726
- }
6727
- res->backend = GGML_BACKEND_CPU;
6728
- #endif
6729
-
6730
6271
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
6731
6272
 
6732
6273
  // for big prompts, if BLAS is enabled, it is better to use only one thread
@@ -6749,15 +6290,17 @@ static int llama_decode_internal(
6749
6290
  #endif
6750
6291
 
6751
6292
  #ifdef GGML_USE_METAL
6752
- if (ggml_backend_is_metal(lctx.backend)) {
6753
- ggml_backend_metal_set_n_cb(lctx.backend, n_threads);
6293
+ if (ggml_backend_is_metal(lctx.backend_metal)) {
6294
+ ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
6754
6295
  }
6755
6296
  #endif
6756
6297
 
6757
- if (ggml_backend_is_cpu(lctx.backend)) {
6758
- ggml_backend_cpu_set_n_threads(lctx.backend, n_threads);
6298
+ if (lctx.backend_cpu != nullptr) {
6299
+ ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
6759
6300
  }
6760
- ggml_backend_graph_compute(lctx.backend, gf);
6301
+ ggml_backend_sched_graph_compute(lctx.sched, gf);
6302
+
6303
+ // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
6761
6304
 
6762
6305
  #ifdef GGML_USE_MPI
6763
6306
  ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
@@ -6805,30 +6348,33 @@ static int llama_decode_internal(
6805
6348
  logits_out.clear();
6806
6349
  #endif
6807
6350
 
6351
+ ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
6352
+ GGML_ASSERT(res_backend != nullptr);
6808
6353
  if (batch.logits) {
6809
6354
  logits_out.resize(n_vocab * n_tokens);
6810
6355
  for (uint32_t i = 0; i < n_tokens; i++) {
6811
6356
  if (batch.logits[i] == 0) {
6812
6357
  continue;
6813
6358
  }
6814
- ggml_backend_tensor_get(res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
6359
+ ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
6815
6360
  #ifndef NDEBUG
6816
6361
  logits_valid[i] = true;
6817
6362
  #endif
6818
6363
  }
6819
6364
  } else if (lctx.logits_all) {
6820
6365
  logits_out.resize(n_vocab * n_tokens);
6821
- ggml_backend_tensor_get(res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
6366
+ ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
6822
6367
  #ifndef NDEBUG
6823
6368
  std::fill(logits_valid.begin(), logits_valid.end(), true);
6824
6369
  #endif
6825
6370
  } else {
6826
6371
  logits_out.resize(n_vocab);
6827
- ggml_backend_tensor_get(res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
6372
+ ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
6828
6373
  #ifndef NDEBUG
6829
6374
  logits_valid[0] = true;
6830
6375
  #endif
6831
6376
  }
6377
+ ggml_backend_synchronize(res_backend);
6832
6378
  }
6833
6379
 
6834
6380
  // extract embeddings
@@ -6836,7 +6382,9 @@ static int llama_decode_internal(
6836
6382
  auto & embedding_out = lctx.embedding;
6837
6383
 
6838
6384
  embedding_out.resize(n_embd);
6839
- ggml_backend_tensor_get(embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
6385
+ ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
6386
+ ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
6387
+ ggml_backend_synchronize(embeddings_backend);
6840
6388
  }
6841
6389
 
6842
6390
  // measure the performance only for the single-token evals
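The hunks above move graph execution to ggml_backend_sched and change the logits/embeddings readback to ggml_backend_tensor_get_async followed by ggml_backend_synchronize on the backend that produced the node. The sketch below isolates that readback pattern; the helper name and the shape arguments are illustrative, and only the ggml-backend calls themselves come from the diff:

    #include <vector>
    #include "ggml-backend.h"

    // Hypothetical helper: copy the last row of a 2D result tensor (e.g. the
    // logits of the final token) back to the host, mirroring the async
    // readback pattern used in llama_decode_internal above.
    static std::vector<float> read_last_row(ggml_backend_sched_t sched,
                                            struct ggml_tensor * res,
                                            int64_t n_cols, int64_t n_rows) {
        std::vector<float> out(n_cols);

        // the scheduler knows which backend produced this node
        ggml_backend_t backend = ggml_backend_sched_get_node_backend(sched, res);

        // queue the copy of the last row (offset and size are in bytes)
        ggml_backend_tensor_get_async(backend, res, out.data(),
                                      n_cols*(n_rows - 1)*sizeof(float),
                                      n_cols*sizeof(float));

        // block until the copy has completed before the caller uses the data
        ggml_backend_synchronize(backend);
        return out;
    }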
@@ -6907,15 +6455,15 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
6907
6455
  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
6908
6456
  static const char * hex = "0123456789ABCDEF";
6909
6457
  switch (llama_vocab_get_type(vocab)) {
6910
- case LLAMA_VOCAB_TYPE_SPM: {
6911
- const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
6912
- return vocab.token_to_id.at(buf);
6913
- }
6914
- case LLAMA_VOCAB_TYPE_BPE: {
6915
- return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
6916
- }
6917
- default:
6918
- GGML_ASSERT(false);
6458
+ case LLAMA_VOCAB_TYPE_SPM: {
6459
+ const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
6460
+ return vocab.token_to_id.at(buf);
6461
+ }
6462
+ case LLAMA_VOCAB_TYPE_BPE: {
6463
+ return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
6464
+ }
6465
+ default:
6466
+ GGML_ASSERT(false);
6919
6467
  }
6920
6468
  }
6921
6469
 
@@ -7449,7 +6997,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
7449
6997
  if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
7450
6998
 
7451
6999
  #ifdef PRETOKENIZERDEBUG
7452
- fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
7000
+ LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
7453
7001
  #endif
7454
7002
  auto source = std::distance(buffer.begin(), it);
7455
7003
 
@@ -7462,7 +7010,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
7462
7010
  buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
7463
7011
 
7464
7012
  #ifdef PRETOKENIZERDEBUG
7465
- fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
7013
+ LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
7466
7014
  #endif
7467
7015
  it++;
7468
7016
  }
@@ -7478,7 +7026,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
7478
7026
  buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
7479
7027
 
7480
7028
  #ifdef PRETOKENIZERDEBUG
7481
- fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
7029
+ LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
7482
7030
  #endif
7483
7031
 
7484
7032
  it++;
@@ -7494,7 +7042,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
7494
7042
  raw_text_base_length = right_reminder_length;
7495
7043
 
7496
7044
  #ifdef PRETOKENIZERDEBUG
7497
- fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
7045
+ LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
7498
7046
  #endif
7499
7047
  } else {
7500
7048
  if (source == 0) {
@@ -7551,7 +7099,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
7551
7099
  }
7552
7100
 
7553
7101
  #ifdef PRETOKENIZERDEBUG
7554
- fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
7102
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
7555
7103
  #endif
7556
7104
  llm_tokenizer_spm tokenizer(vocab);
7557
7105
  llama_escape_whitespace(raw_text);
@@ -7572,7 +7120,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
7572
7120
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
7573
7121
 
7574
7122
  #ifdef PRETOKENIZERDEBUG
7575
- fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
7123
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
7576
7124
  #endif
7577
7125
  llm_tokenizer_bpe tokenizer(vocab);
7578
7126
  tokenizer.tokenize(raw_text, output);
@@ -8350,39 +7898,59 @@ static void llama_log_softmax(float * array, size_t size) {
8350
7898
  }
8351
7899
  }
8352
7900
 
7901
+ void llama_sample_apply_guidance(
7902
+ struct llama_context * ctx,
7903
+ float * logits,
7904
+ float * logits_guidance,
7905
+ float scale) {
7906
+ GGML_ASSERT(ctx);
7907
+
7908
+ const auto t_start_sample_us = ggml_time_us();
7909
+ const auto n_vocab = llama_n_vocab(llama_get_model(ctx));
7910
+
7911
+ llama_log_softmax(logits, n_vocab);
7912
+ llama_log_softmax(logits_guidance, n_vocab);
7913
+
7914
+ for (int i = 0; i < n_vocab; ++i) {
7915
+ auto & l = logits[i];
7916
+ const auto & g = logits_guidance[i];
7917
+
7918
+ l = scale * (l - g) + g;
7919
+ }
7920
+
7921
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
7922
+ }
7923
+
8353
7924
  void llama_sample_classifier_free_guidance(
8354
7925
  struct llama_context * ctx,
8355
7926
  llama_token_data_array * candidates,
8356
7927
  struct llama_context * guidance_ctx,
8357
7928
  float scale) {
8358
- int64_t t_start_sample_us = ggml_time_us();
8359
-
8360
7929
  GGML_ASSERT(ctx);
7930
+ int64_t t_start_sample_us;
8361
7931
 
8362
- auto n_vocab = llama_n_vocab(llama_get_model(ctx));
7932
+ t_start_sample_us = ggml_time_us();
7933
+ const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
8363
7934
 
8364
- GGML_ASSERT(n_vocab == (int)candidates->size);
7935
+ GGML_ASSERT(n_vocab == candidates->size);
8365
7936
  GGML_ASSERT(!candidates->sorted);
8366
7937
 
8367
- std::vector<float> logits_base;
8368
- logits_base.reserve(candidates->size);
8369
- for (size_t i = 0; i < candidates->size; ++i) {
8370
- logits_base.push_back(candidates->data[i].logit);
7938
+ std::vector<float> logits_base(n_vocab);
7939
+ for (size_t i = 0; i < n_vocab; ++i) {
7940
+ logits_base[i] = candidates->data[i].logit;
8371
7941
  }
8372
- llama_log_softmax(logits_base.data(), candidates->size);
8373
7942
 
8374
- float* logits_guidance = llama_get_logits(guidance_ctx);
8375
- llama_log_softmax(logits_guidance, n_vocab);
7943
+ float * logits_guidance = llama_get_logits(guidance_ctx);
8376
7944
 
8377
- for (int i = 0; i < n_vocab; ++i) {
8378
- float logit_guidance = logits_guidance[i];
8379
- float logit_base = logits_base[i];
8380
- candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
8381
- }
7945
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
7946
+ llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
7947
+ t_start_sample_us = ggml_time_us();
8382
7948
 
8383
- if (ctx) {
8384
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
7949
+ for (size_t i = 0; i < n_vocab; ++i) {
7950
+ candidates->data[i].logit = logits_base[i];
8385
7951
  }
7952
+
7953
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
8386
7954
  }
8387
7955
 
8388
7956
  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
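The new llama_sample_apply_guidance entry point above reduces classifier-free guidance to a logit transform: both vectors are log-softmax normalized and then combined as l = scale * (l - g) + g, and the rewritten llama_sample_classifier_free_guidance now just wraps it. A small self-contained sketch of that arithmetic follows; the helper names are illustrative, only the formula and the log-softmax normalization are taken from the diff:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // In-place log-softmax: subtract the log-sum-exp, which is equivalent to
    // what llama_log_softmax computes.
    static void log_softmax(std::vector<float> & v) {
        const float mx = *std::max_element(v.begin(), v.end());
        float sum = 0.0f;
        for (float x : v) sum += std::exp(x - mx);
        const float lse = mx + std::log(sum);
        for (float & x : v) x -= lse;
    }

    // Blend the main logits with the guidance logits: scale == 1 reproduces the
    // original distribution, larger values push further away from the guidance.
    static void apply_guidance(std::vector<float> & logits,
                               std::vector<float> guidance, float scale) {
        log_softmax(logits);
        log_softmax(guidance);
        for (size_t i = 0; i < logits.size(); ++i) {
            logits[i] = scale * (logits[i] - guidance[i]) + guidance[i];
        }
    }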
@@ -8806,6 +8374,8 @@ struct quantize_state_internal {
8806
8374
  int n_k_quantized = 0;
8807
8375
  int n_fallback = 0;
8808
8376
 
8377
+ bool has_imatrix = false;
8378
+
8809
8379
  quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
8810
8380
  : model(model)
8811
8381
  , params(params)
@@ -8889,9 +8459,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8889
8459
  if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
8890
8460
  new_type = GGML_TYPE_Q8_0;
8891
8461
  }
8462
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
8463
+ new_type = GGML_TYPE_Q5_K;
8464
+ }
8892
8465
  else if (new_type != GGML_TYPE_Q8_0) {
8893
8466
  new_type = GGML_TYPE_Q6_K;
8894
8467
  }
8468
+ } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
8469
+ if (name.find("attn_v.weight") != std::string::npos) {
8470
+ if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
8471
+ else new_type = GGML_TYPE_Q2_K;
8472
+ ++qs.i_attention_wv;
8473
+ }
8474
+ else if (name.find("ffn_down") != std::string::npos) {
8475
+ if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
8476
+ ++qs.i_feed_forward_w2;
8477
+ }
8478
+ else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
8895
8479
  } else if (name.find("attn_v.weight") != std::string::npos) {
8896
8480
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
8897
8481
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -8921,11 +8505,32 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8921
8505
  // TODO: explore better strategies
8922
8506
  new_type = GGML_TYPE_Q8_0;
8923
8507
  }
8924
- } else if (name.find("ffn_down.weight") != std::string::npos) {
8508
+ } else if (name.find("ffn_down") != std::string::npos) {
8509
+ const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
8510
+ int i_layer, n_layer;
8511
+ if (n_expert == 1) {
8512
+ i_layer = qs.i_feed_forward_w2;
8513
+ n_layer = qs.n_feed_forward_w2;
8514
+ } else {
8515
+ // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
8516
+ // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
8517
+ // for getting the current layer as I initially thought, and we need to resort to parsing the
8518
+ // tensor name.
8519
+ n_layer = qs.n_feed_forward_w2 / n_expert;
8520
+ if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
8521
+ throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
8522
+ }
8523
+ if (i_layer < 0 || i_layer >= n_layer) {
8524
+ throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
8525
+ }
8526
+ }
8925
8527
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
8528
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
8529
+ if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
8530
+ }
8926
8531
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
8927
- new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
8928
- : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
8532
+ new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
8533
+ : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
8929
8534
  : GGML_TYPE_Q3_K;
8930
8535
  }
8931
8536
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -8933,22 +8538,36 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8933
8538
  }
8934
8539
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
8935
8540
  if (arch == LLM_ARCH_FALCON) {
8936
- new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
8937
- use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
8541
+ new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
8542
+ use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
8938
8543
  } else {
8939
- if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
8544
+ if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
8940
8545
  }
8941
8546
  }
8942
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
8943
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
8547
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
8548
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
8944
8549
  new_type = GGML_TYPE_Q5_K;
8945
8550
  }
8551
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
8552
+ && qs.has_imatrix && i_layer < n_layer/8) {
8553
+ // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
8554
+ // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
8555
+ // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
8556
+ new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
8557
+ }
8946
8558
  ++qs.i_feed_forward_w2;
8947
8559
  } else if (name.find("attn_output.weight") != std::string::npos) {
8948
8560
  if (arch != LLM_ARCH_FALCON) {
8949
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
8950
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
8951
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
8561
+ if (qs.model.hparams.n_expert == 8) {
8562
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
8563
+ ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
8564
+ new_type = GGML_TYPE_Q5_K;
8565
+ }
8566
+ } else {
8567
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
8568
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
8569
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
8570
+ }
8952
8571
  } else {
8953
8572
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
8954
8573
  }
@@ -8958,9 +8577,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8958
8577
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
8959
8578
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
8960
8579
  }
8961
- else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
8962
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
8963
- }
8580
+ // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
8581
+ //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
8582
+ // if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
8583
+ //}
8964
8584
  // This can be used to reduce the size of the Q5_K_S model.
8965
8585
  // The associated PPL increase is fully in line with the size reduction
8966
8586
  //else {
@@ -8968,7 +8588,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8968
8588
  //}
8969
8589
  bool convert_incompatible_tensor = false;
8970
8590
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
8971
- new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
8591
+ new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
8592
+ new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) {
8972
8593
  int nx = tensor->ne[0];
8973
8594
  int ny = tensor->ne[1];
8974
8595
  if (nx % QK_K != 0) {
@@ -8980,6 +8601,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8980
8601
  }
8981
8602
  if (convert_incompatible_tensor) {
8982
8603
  switch (new_type) {
8604
+ case GGML_TYPE_IQ2_XXS:
8605
+ case GGML_TYPE_IQ2_XS:
8983
8606
  case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
8984
8607
  case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
8985
8608
  case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
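Earlier in this function, the ffn_down branch notes that MoE expert tensors are not stored consecutively, so the layer index is recovered by parsing the tensor name with sscanf("blk.%d.ffn_down", ...) instead of counting processed tensors. A minimal sketch of that parse is below; the helper and its error messages are illustrative, the format string is the one used in the diff:

    #include <cstdio>
    #include <stdexcept>
    #include <string>

    // Recover the layer index from a tensor name such as "blk.17.ffn_down.weight".
    // Note that sscanf's return value counts assigned conversions, so the
    // trailing ".ffn_down" literal is not strictly validated here.
    static int ffn_down_layer(const std::string & name, int n_layer) {
        int i_layer = -1;
        if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
            throw std::runtime_error("failed to determine layer for tensor " + name);
        }
        if (i_layer < 0 || i_layer >= n_layer) {
            throw std::runtime_error("bad layer " + std::to_string(i_layer) + " for tensor " + name);
        }
        return i_layer;
    }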
@@ -9009,6 +8632,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9009
8632
 
9010
8633
  // K-quants
9011
8634
  case LLAMA_FTYPE_MOSTLY_Q2_K: quantized_type = GGML_TYPE_Q2_K; break;
8635
+ case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
9012
8636
  case LLAMA_FTYPE_MOSTLY_Q3_K_S:
9013
8637
  case LLAMA_FTYPE_MOSTLY_Q3_K_M:
9014
8638
  case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -9017,6 +8641,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9017
8641
  case LLAMA_FTYPE_MOSTLY_Q5_K_S:
9018
8642
  case LLAMA_FTYPE_MOSTLY_Q5_K_M: quantized_type = GGML_TYPE_Q5_K; break;
9019
8643
  case LLAMA_FTYPE_MOSTLY_Q6_K: quantized_type = GGML_TYPE_Q6_K; break;
8644
+ case LLAMA_FTYPE_MOSTLY_IQ2_XXS:quantized_type = GGML_TYPE_IQ2_XXS; break;
8645
+ case LLAMA_FTYPE_MOSTLY_IQ2_XS :quantized_type = GGML_TYPE_IQ2_XS; break;
9020
8646
 
9021
8647
  default: throw std::runtime_error(format("invalid output file type %d\n", ftype));
9022
8648
  }
@@ -9047,6 +8673,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9047
8673
  if (params->only_copy) {
9048
8674
  ftype = model.ftype;
9049
8675
  }
8676
+ const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
8677
+ if (params->imatrix) {
8678
+ imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
8679
+ if (imatrix_data) {
8680
+ LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
8681
+ qs.has_imatrix = true;
8682
+ }
8683
+ }
9050
8684
 
9051
8685
  const size_t align = GGUF_DEFAULT_ALIGNMENT;
9052
8686
  struct gguf_context * ctx_out = gguf_init_empty();
@@ -9065,7 +8699,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9065
8699
  if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
9066
8700
  ++qs.n_attention_wv;
9067
8701
  }
9068
- else if (name.find("ffn_down.weight") != std::string::npos) {
8702
+ else if (name.find("ffn_down") != std::string::npos) {
9069
8703
  ++qs.n_feed_forward_w2;
9070
8704
  }
9071
8705
  }
@@ -9104,6 +8738,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9104
8738
  // placeholder for the meta data
9105
8739
  ::zeros(fout, meta_size);
9106
8740
 
8741
+ std::set<ggml_type> used_iq2;
8742
+
9107
8743
  for (int i = 0; i < ml.n_tensors; ++i) {
9108
8744
  struct ggml_tensor * tensor = ml.get_tensor_meta(i);
9109
8745
 
@@ -9156,6 +8792,35 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9156
8792
  } else {
9157
8793
  const size_t nelements = ggml_nelements(tensor);
9158
8794
 
8795
+ if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) {
8796
+ ggml_init_iq2_quantization(new_type);
8797
+ used_iq2.insert(new_type);
8798
+ }
8799
+
8800
+ const float * imatrix = nullptr;
8801
+ if (imatrix_data) {
8802
+ auto it = imatrix_data->find(tensor->name);
8803
+ if (it == imatrix_data->end()) {
8804
+ LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
8805
+ } else {
8806
+ if (it->second.size() == (size_t)tensor->ne[0]) {
8807
+ imatrix = it->second.data();
8808
+ } else {
8809
+ LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
8810
+ int(it->second.size()), int(tensor->ne[0]), tensor->name);
8811
+ }
8812
+ }
8813
+ }
8814
+ if ((new_type == GGML_TYPE_IQ2_XXS ||
8815
+ new_type == GGML_TYPE_IQ2_XS ||
8816
+ (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
8817
+ LLAMA_LOG_ERROR("\n\n============================================================\n");
8818
+ LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
8819
+ LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
8820
+ LLAMA_LOG_ERROR("============================================================\n\n");
8821
+ throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
8822
+ }
8823
+
9159
8824
  float * f32_data;
9160
8825
 
9161
8826
  if (tensor->type == GGML_TYPE_F32) {
@@ -9176,21 +8841,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9176
8841
  new_data = work.data();
9177
8842
  std::array<int64_t, 1 << 4> hist_cur = {};
9178
8843
 
9179
- static const int chunk_size = 32 * 512;
8844
+ const int n_per_row = tensor->ne[0];
8845
+ const int nrows = nelements / n_per_row;
8846
+
8847
+ static const int min_chunk_size = 32 * 512;
8848
+ const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
8849
+
9180
8850
  const int nchunk = (nelements + chunk_size - 1)/chunk_size;
9181
8851
  const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
9182
8852
  if (nthread_use < 2) {
9183
- new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
8853
+ new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
9184
8854
  } else {
9185
- size_t counter = 0;
8855
+ int counter = 0;
9186
8856
  new_size = 0;
9187
- auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
8857
+ auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
8858
+ nrows, n_per_row, imatrix]() {
9188
8859
  std::array<int64_t, 1 << 4> local_hist = {};
8860
+ const int nrows_per_chunk = chunk_size / n_per_row;
9189
8861
  size_t local_size = 0;
9190
8862
  while (true) {
9191
8863
  std::unique_lock<std::mutex> lock(mutex);
9192
- size_t first = counter; counter += chunk_size;
9193
- if (first >= nelements) {
8864
+ int first_row = counter; counter += nrows_per_chunk;
8865
+ if (first_row >= nrows) {
9194
8866
  if (local_size > 0) {
9195
8867
  for (int j=0; j<int(local_hist.size()); ++j) {
9196
8868
  hist_cur[j] += local_hist[j];
@@ -9200,8 +8872,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9200
8872
  break;
9201
8873
  }
9202
8874
  lock.unlock();
9203
- size_t last = std::min(nelements, first + chunk_size);
9204
- local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
8875
+ const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
8876
+ local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
8877
+ first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
9205
8878
  }
9206
8879
  };
9207
8880
  for (int it = 0; it < nthread_use - 1; ++it) {
@@ -9212,7 +8885,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9212
8885
  workers.clear();
9213
8886
  }
9214
8887
 
9215
- LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
8888
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
9216
8889
  int64_t tot_count = 0;
9217
8890
  for (size_t i = 0; i < hist_cur.size(); i++) {
9218
8891
  hist_all[i] += hist_cur[i];
@@ -9220,6 +8893,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9220
8893
  }
9221
8894
 
9222
8895
  if (tot_count > 0) {
8896
+ LLAMA_LOG_INFO(" | hist: ");
9223
8897
  for (size_t i = 0; i < hist_cur.size(); i++) {
9224
8898
  LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
9225
8899
  }
@@ -9248,6 +8922,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9248
8922
 
9249
8923
  fout.close();
9250
8924
 
8925
+ for (auto type : used_iq2) {
8926
+ ggml_deinit_iq2_quantization(type);
8927
+ }
8928
+
9251
8929
  gguf_free(ctx_out);
9252
8930
 
9253
8931
  LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
@@ -9305,48 +8983,23 @@ static int llama_apply_lora_from_file_internal(
9305
8983
 
9306
8984
  LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
9307
8985
 
9308
- // create a name -> tensor map of the model to accelerate lookups
9309
- // find the max tensor size to estimate the required temporary buffer size
9310
- size_t max_tensor_size = 0;
9311
- std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
9312
- for (const auto & kv : model.tensors_by_name) {
9313
- model_tensors.insert(kv);
9314
- size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
9315
- max_tensor_size = std::max(max_tensor_size, f32_size);
9316
- }
9317
-
9318
- // create a temporary ggml context to store the lora tensors
9319
- // TODO: use ggml-alloc
9320
- size_t lora_ctx_size = max_tensor_size * 3;
9321
- LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
9322
- std::vector<uint8_t> lora_buf(lora_ctx_size);
9323
-
9324
- struct ggml_init_params params;
9325
- params.mem_size = lora_buf.size();
9326
- params.mem_buffer = lora_buf.data();
9327
- params.no_alloc = false;
9328
-
9329
- using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;
9330
-
9331
- unique_context lora_ctx(nullptr, ggml_free);
9332
- lora_ctx.reset(ggml_init(params));
9333
- std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
9334
-
9335
8986
  // load base model
9336
8987
  std::unique_ptr<llama_model_loader> ml;
9337
-
9338
- if (path_base_model) {
8988
+ if (path_base_model) {
9339
8989
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
9340
8990
  ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
9341
- ml->init_mapping(false); // no prefetching
8991
+ ml->init_mapping(/*prefetch*/ false); // no prefetching
9342
8992
  }
9343
8993
 
9344
- // read tensors and apply
9345
- bool warned = false;
9346
- int n_tensors = 0;
9347
-
9348
- std::vector<uint8_t> work_buffer;
8994
+ struct tensor_meta {
8995
+ std::string name;
8996
+ ggml_type type;
8997
+ int32_t ne[2];
8998
+ size_t offset;
8999
+ };
9000
+ std::map<std::string, tensor_meta> tensor_meta_map;
9349
9001
 
9002
+ // load all tensor meta
9350
9003
  while (true) {
9351
9004
  if (fin.tell() == fin.size) {
9352
9005
  // eof
@@ -9359,7 +9012,7 @@ static int llama_apply_lora_from_file_internal(
9359
9012
 
9360
9013
  fin.read_raw(&n_dims, sizeof(n_dims));
9361
9014
  fin.read_raw(&name_len, sizeof(name_len));
9362
- fin.read_raw(&ftype, sizeof(ftype));
9015
+ fin.read_raw(&ftype, sizeof(ftype));
9363
9016
 
9364
9017
  if (n_dims != 1 && n_dims != 2) {
9365
9018
  LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
@@ -9373,31 +9026,23 @@ static int llama_apply_lora_from_file_internal(
9373
9026
 
9374
9027
  std::string name;
9375
9028
  {
9376
- GGML_ASSERT(name_len <= 1024);
9377
- char buf[1024];
9029
+ GGML_ASSERT(name_len < GGML_MAX_NAME);
9030
+ char buf[GGML_MAX_NAME];
9378
9031
  fin.read_raw(buf, name_len);
9379
9032
  name = std::string(buf, name_len);
9380
9033
  }
9381
9034
 
9382
- // check for lora suffix and get the type of tensor
9383
- const std::string lora_suffix = ".lora";
9384
- size_t pos = name.rfind(lora_suffix);
9385
- if (pos == std::string::npos) {
9386
- LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
9387
- return 1;
9035
+ // check for lora suffix
9036
+ std::string lora_suffix;
9037
+ if (name.length() > 6) {
9038
+ lora_suffix = name.substr(name.length() - 6);
9388
9039
  }
9389
-
9390
- std::string lora_type = name.substr(pos + lora_suffix.length());
9391
- std::string base_name = name;
9392
- base_name.erase(pos);
9393
- // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());
9394
-
9395
- if (model_tensors.find(base_name) == model_tensors.end()) {
9396
- LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
9040
+ if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
9041
+ LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
9397
9042
  return 1;
9398
9043
  }
9399
9044
 
9400
- // create ggml tensor
9045
+ // tensor type
9401
9046
  ggml_type wtype;
9402
9047
  switch (ftype) {
9403
9048
  case 0: wtype = GGML_TYPE_F32; break;
@@ -9409,122 +9054,177 @@ static int llama_apply_lora_from_file_internal(
9409
9054
  return false;
9410
9055
  }
9411
9056
  }
9412
- ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
9413
- ggml_set_name(lora_tensor, name.c_str());
9414
9057
 
9415
- // load tensor data
9058
+ // data offset
9416
9059
  size_t offset = fin.tell();
9417
- size_t tensor_data_size = ggml_nbytes(lora_tensor);
9418
9060
  offset = (offset + 31) & -32;
9419
- fin.seek(offset, SEEK_SET);
9420
- fin.read_raw(lora_tensor->data, tensor_data_size);
9421
9061
 
9422
- lora_tensors[name] = lora_tensor;
9062
+ // skip tensor data
9063
+ fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
9064
+
9065
+ tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
9066
+ }
9067
+
9068
+ bool warned = false;
9069
+ int n_tensors = 0;
9423
9070
 
9424
- // check if we have both A and B tensors and apply
9425
- if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
9426
- lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
9071
+ // apply
9072
+ ggml_backend_t backend_cpu = ggml_backend_cpu_init();
9073
+ if (backend_cpu == nullptr) {
9074
+ LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
9075
+ return 1;
9076
+ }
9077
+ ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
9427
9078
 
9428
- ggml_tensor * dest_t = model_tensors[base_name];
9079
+ std::vector<no_init<uint8_t>> read_buf;
9080
+ for (const auto & it : model.tensors_by_name) {
9081
+ const std::string & base_name = it.first;
9082
+ ggml_tensor * model_t = it.second;
9429
9083
 
9430
- offload_func_t offload_func = ggml_offload_nop;
9431
- offload_func_t offload_func_force_inplace = ggml_offload_nop;
9084
+ if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
9085
+ tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
9086
+ continue;
9087
+ }
9432
9088
 
9433
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
9434
- if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
9435
- if (dest_t->type != GGML_TYPE_F16) {
9436
- throw std::runtime_error(format(
9437
- "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models. dest_t->type: %d", __func__, dest_t->type));
9438
- }
9439
- offload_func = ggml_cuda_assign_buffers;
9440
- offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
9441
- }
9442
- #endif // GGML_USE_CUBLAS
9089
+ tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
9090
+ tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
9443
9091
 
9444
- ggml_tensor * base_t;
9445
- if (ml) {
9446
- struct gguf_context * ctx_gguf = ml->ctx_gguf;
9092
+ ggml_init_params lora_init_params = {
9093
+ /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
9094
+ /* .mem_buffer */ nullptr,
9095
+ /* .no_alloc */ true,
9096
+ };
9097
+ ggml_context * lora_ctx = ggml_init(lora_init_params);
9098
+ if (lora_ctx == nullptr) {
9099
+ LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
9100
+ ggml_backend_free(backend_cpu);
9101
+ return 1;
9102
+ }
9447
9103
 
9448
- // load from base model
9449
- if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
9450
- LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
9451
- return 1;
9452
- }
9104
+ // create tensors
9105
+ ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
9106
+ ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
9107
+ ggml_set_name(loraA, metaA.name.c_str());
9108
+ ggml_set_name(loraB, metaB.name.c_str());
9453
9109
 
9454
- base_t = ml->get_tensor_meta(base_name.c_str());
9455
- ml->load_data_for(base_t);
9456
- } else {
9457
- base_t = dest_t;
9110
+ ggml_tensor * base_t;
9111
+ if (ml) {
9112
+ if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
9113
+ LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
9114
+ return 1;
9458
9115
  }
9116
+ base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
9117
+ } else {
9118
+ base_t = ggml_dup_tensor(lora_ctx, model_t);
9119
+ }
9120
+ ggml_set_name(base_t, base_name.c_str());
9459
9121
 
9460
- if (ggml_is_quantized(base_t->type)) {
9461
- if (!warned) {
9462
- LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
9463
- "use a f16 or f32 base model with --lora-base\n", __func__);
9464
- warned = true;
9465
- }
9466
- }
9122
+ // allocate in backend buffer
9123
+ ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
9124
+ if (lora_buf == nullptr) {
9125
+ LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
9126
+ return 1;
9127
+ }
9467
9128
 
9468
- ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
9469
- GGML_ASSERT(loraA->type == GGML_TYPE_F32);
9470
- ggml_set_name(loraA, "loraA");
9129
+ // load tensor data
9130
+ auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
9131
+ read_buf.resize(ggml_nbytes(tensor));
9132
+ fin.seek(tensor_meta.offset, SEEK_SET);
9133
+ fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
9134
+ ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
9135
+ };
9136
+ load_tensor(metaA, loraA);
9137
+ load_tensor(metaB, loraB);
9471
9138
 
9472
- ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
9473
- GGML_ASSERT(loraB->type == GGML_TYPE_F32);
9474
- ggml_set_name(loraB, "loraB");
9139
+ // load base model tensor data
9140
+ if (ml) {
9141
+ ml->load_data_for(base_t);
9142
+ } else {
9143
+ ggml_backend_tensor_copy(model_t, base_t);
9144
+ }
9475
9145
 
9476
- if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
9477
- LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
9478
- " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
9479
- return 1;
9480
- }
9146
+ if (ggml_is_quantized(base_t->type) && !warned) {
9147
+ LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
9148
+ "use a f16 or f32 base model with --lora-base\n", __func__);
9149
+ warned = true;
9150
+ }
9151
+
9152
+ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
9153
+ LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
9154
+ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
9155
+ ggml_free(lora_ctx);
9156
+ ggml_backend_buffer_free(lora_buf);
9157
+ ggml_backend_free(backend_cpu);
9158
+ return 1;
9159
+ }
9481
9160
 
9161
+ auto build_lora_graph = [&]() {
9482
9162
  // w = w + BA*s
9483
- ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB);
9484
- offload_func(BA);
9163
+ ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
9485
9164
  ggml_set_name(BA, "BA");
9486
9165
 
9487
9166
  if (scaling != 1.0f) {
9488
- BA = ggml_scale_inplace(lora_ctx.get(), BA, scaling);
9489
- offload_func(BA);
9167
+ BA = ggml_scale(lora_ctx, BA, scaling);
9490
9168
  ggml_set_name(BA, "BA_scaled");
9491
9169
  }
9492
9170
 
9493
9171
  ggml_tensor * r;
9494
- if (base_t == dest_t) {
9495
- r = ggml_add_inplace(lora_ctx.get(), dest_t, BA);
9496
- offload_func_force_inplace(r);
9497
- ggml_set_name(r, "r_add_inplace");
9498
- }
9499
- else {
9500
- r = ggml_add(lora_ctx.get(), base_t, BA);
9501
- offload_func(r);
9502
- ggml_set_name(r, "r_add");
9172
+ r = ggml_add_inplace(lora_ctx, base_t, BA);
9173
+ ggml_set_name(r, "r_add");
9503
9174
 
9504
- r = ggml_cpy(lora_ctx.get(), r, dest_t);
9505
- offload_func(r);
9506
- ggml_set_name(r, "r_cpy");
9175
+ if (base_t->type != model_t->type) {
9176
+ // convert the result to the model type
9177
+ r = ggml_cast(lora_ctx, r, model_t->type);
9178
+ ggml_set_name(r, "r_cast");
9507
9179
  }
9508
9180
 
9509
- struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get());
9510
- ggml_build_forward_expand(gf, r);
9181
+ return r;
9182
+ };
9183
+
9184
+ ggml_cgraph * gf = ggml_new_graph(lora_ctx);
9185
+ ggml_tensor * r = build_lora_graph();
9186
+ ggml_build_forward_expand(gf, r);
9511
9187
 
9512
- ggml_graph_compute_helper(work_buffer, gf, n_threads);
9188
+ ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
9189
+ if (graph_buf == nullptr) {
9190
+ LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
9191
+ ggml_free(lora_ctx);
9192
+ ggml_backend_buffer_free(lora_buf);
9193
+ ggml_backend_free(backend_cpu);
9194
+ return 1;
9195
+ }
9513
9196
 
9514
- // the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other
9515
- GGML_ASSERT(lora_tensors.size() == 2);
9197
+ ggml_backend_graph_compute(backend_cpu, gf);
9516
9198
 
9517
- // we won't need these tensors again, reset the context to save memory
9518
- lora_ctx.reset(ggml_init(params));
9519
- lora_tensors.clear();
9199
+ ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
9520
9200
 
9521
- n_tensors++;
9522
- if (n_tensors % 4 == 0) {
9523
- LLAMA_LOG_INFO(".");
9524
- }
9201
+ #if 0
9202
+ // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
9203
+ //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
9204
+
9205
+ // sched compute
9206
+ ggml_build_forward_expand(gf, build_graph());
9207
+ ggml_backend_sched_init_measure(sched, gf);
9208
+
9209
+ // create the graph again, since the previous one was destroyed by the measure
9210
+ ggml_graph_clear(gf);
9211
+ ggml_build_forward_expand(gf, build_graph());
9212
+ ggml_backend_sched_graph_compute(sched, gf);
9213
+ ggml_backend_sched_free(sched);
9214
+ #endif
9215
+
9216
+ ggml_backend_buffer_free(lora_buf);
9217
+ ggml_backend_buffer_free(graph_buf);
9218
+ ggml_free(lora_ctx);
9219
+
9220
+ n_tensors++;
9221
+ if (n_tensors % 4 == 0) {
9222
+ LLAMA_LOG_INFO(".");
9525
9223
  }
9526
9224
  }
9527
9225
 
9226
+ ggml_backend_free(backend_cpu);
9227
+
9528
9228
  const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
9529
9229
  LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
9530
9230
 
@@ -9537,6 +9237,7 @@ static int llama_apply_lora_from_file_internal(
9537
9237
  struct llama_model_params llama_model_default_params() {
9538
9238
  struct llama_model_params result = {
9539
9239
  /*.n_gpu_layers =*/ 0,
9240
+ /*.split_mode =*/ LLAMA_SPLIT_LAYER,
9540
9241
  /*.main_gpu =*/ 0,
9541
9242
  /*.tensor_split =*/ nullptr,
9542
9243
  /*.progress_callback =*/ nullptr,
@@ -9548,7 +9249,8 @@ struct llama_model_params llama_model_default_params() {
9548
9249
  };
9549
9250
 
9550
9251
  #ifdef GGML_USE_METAL
9551
- result.n_gpu_layers = 1;
9252
+ // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
9253
+ result.n_gpu_layers = 999;
9552
9254
  #endif
9553
9255
 
9554
9256
  return result;
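
Two defaults change here: llama_model_params gains a split_mode field (defaulting to LLAMA_SPLIT_LAYER) and Metal builds now offload every layer instead of one. A hedged sketch of overriding them from the caller's side (values illustrative):

    llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 35;               // offload only part of the model
    mparams.split_mode   = LLAMA_SPLIT_NONE; // keep all offloaded layers on main_gpu
    mparams.main_gpu     = 0;
    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
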
@@ -9588,6 +9290,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
9588
9290
  /*.quantize_output_tensor =*/ true,
9589
9291
  /*.only_copy =*/ false,
9590
9292
  /*.pure =*/ false,
9293
+ /*.imatrix =*/ nullptr,
9591
9294
  };
9592
9295
 
9593
9296
  return result;
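
llama_model_quantize_params picks up an imatrix pointer, which the quantization loop above forwards to ggml_quantize_chunk. A hedged usage sketch (paths hypothetical; the importance data itself is assumed to have been collected separately):

    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype   = LLAMA_FTYPE_MOSTLY_Q4_K_M;
    qparams.nthread = 8;
    // qparams.imatrix stays nullptr unless importance data is supplied
    const uint32_t rc = llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams);
    if (rc != 0) {
        // quantization failed
    }
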
@@ -9738,41 +9441,53 @@ struct llama_context * llama_new_context_with_model(
9738
9441
  GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
9739
9442
  GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
9740
9443
 
9741
- // reserve memory for context buffers
9742
9444
  if (!hparams.vocab_only) {
9743
- // initialize backend
9445
+ // initialize backends
9744
9446
  #ifdef GGML_USE_METAL
9745
9447
  if (model->n_gpu_layers > 0) {
9746
- ctx->backend = ggml_backend_metal_init();
9747
- if (ctx->backend == nullptr) {
9448
+ ctx->backend_metal = ggml_backend_metal_init();
9449
+ if (ctx->backend_metal == nullptr) {
9748
9450
  LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
9451
+ llama_free(ctx);
9452
+ return nullptr;
9749
9453
  }
9454
+ ctx->backends.push_back(ctx->backend_metal);
9750
9455
  }
9751
- #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
9752
- // for testing only
9456
+ #elif defined(GGML_USE_CUBLAS)
9753
9457
  if (model->n_gpu_layers > 0) {
9754
- ctx->backend = ggml_backend_cuda_init(0);
9755
- if (ctx->backend == nullptr) {
9756
- LLAMA_LOG_ERROR("%s: failed to initialize CUDA backend\n", __func__);
9458
+ // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
9459
+ if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) {
9460
+ ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
9461
+ if (backend == nullptr) {
9462
+ LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
9463
+ llama_free(ctx);
9464
+ return nullptr;
9465
+ }
9466
+ ctx->backends.push_back(backend);
9467
+ } else {
9468
+ // LLAMA_SPLIT_LAYER requires a backend for each GPU
9469
+ for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
9470
+ ggml_backend_t backend = ggml_backend_cuda_init(device);
9471
+ if (backend == nullptr) {
9472
+ LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
9473
+ llama_free(ctx);
9474
+ return nullptr;
9475
+ }
9476
+ ctx->backends.push_back(backend);
9477
+ }
9757
9478
  }
9758
9479
  }
9759
9480
  #endif
9760
-
9761
- if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) {
9762
- ctx->backend = ggml_backend_cpu_init();
9763
- if (ctx->backend == nullptr) {
9764
- LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
9765
- }
9766
- }
9767
-
9768
- if (ctx->backend == nullptr) {
9769
- LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__);
9770
- delete ctx;
9481
+ ctx->backend_cpu = ggml_backend_cpu_init();
9482
+ if (ctx->backend_cpu == nullptr) {
9483
+ LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
9484
+ llama_free(ctx);
9771
9485
  return nullptr;
9772
9486
  }
9487
+ ctx->backends.push_back(ctx->backend_cpu);
9773
9488
 
9774
- if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v,
9775
- cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
9489
+ if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
9490
+ cparams.n_ctx, cparams.offload_kqv)) {
9776
9491
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
9777
9492
  llama_free(ctx);
9778
9493
  return nullptr;
@@ -9796,23 +9511,30 @@ struct llama_context * llama_new_context_with_model(
9796
9511
  ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
9797
9512
  }
9798
9513
 
9799
- // resized during inference
9800
- if (params.logits_all) {
9801
- ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
9802
- } else {
9803
- ctx->logits.reserve(hparams.n_vocab);
9804
- }
9514
+ // resized during inference, reserve maximum
9515
+ ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
9805
9516
 
9806
9517
  if (params.embedding){
9807
9518
  ctx->embedding.resize(hparams.n_embd);
9808
9519
  }
9809
9520
 
9810
9521
  {
9811
- // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
9522
+ // buffer types used for the compute buffer of each backend
9523
+ std::vector<ggml_backend_buffer_type_t> backend_buft;
9524
+ for (auto * backend : ctx->backends) {
9525
+ if (ggml_backend_is_cpu(backend)) {
9526
+ // use host buffers for the CPU backend compute buffer
9527
+ backend_buft.push_back(llama_default_buffer_type_cpu(true));
9528
+ } else {
9529
+ backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
9530
+ }
9531
+ }
9532
+
9533
+ // buffer used to store the computation graph and the tensor meta data
9812
9534
  ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
9813
9535
 
9814
- // create measure allocator
9815
- ctx->alloc = ggml_allocr_new_measure_from_backend(ctx->backend);
9536
+ ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
9537
+ ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
9816
9538
 
9817
9539
  // build worst-case graph
9818
9540
  int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
@@ -9820,50 +9542,19 @@ struct llama_context * llama_new_context_with_model(
9820
9542
  llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
9821
9543
  ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
9822
9544
 
9823
- // measure memory requirements for the graph
9824
- size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf);
9825
-
9826
- LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute_meta.size() + alloc_size) / 1024.0 / 1024.0);
9827
-
9828
- // create allocator again with exact memory requirements
9829
- ggml_allocr_free(ctx->alloc);
9830
-
9831
- ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size);
9832
- ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
9833
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
9834
- if (model->n_gpu_layers > 0) {
9835
- // the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets
9836
- ggml_cuda_set_scratch_size(alloc_size + 64);
9837
- LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
9838
-
9839
- // calculate total VRAM usage
9840
- auto add_tensor = [](const ggml_tensor * t, size_t & size) {
9841
- if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
9842
- size += ggml_nbytes(t);
9843
- }
9844
- };
9845
- size_t model_vram_size = 0;
9846
- for (const auto & kv : model->tensors_by_name) {
9847
- add_tensor(kv.second, model_vram_size);
9848
- }
9849
-
9850
- size_t kv_vram_size = 0;
9851
- for (auto & k : ctx->kv_self.k_l) {
9852
- add_tensor(k, kv_vram_size);
9853
- }
9854
- for (auto & v : ctx->kv_self.v_l) {
9855
- add_tensor(v, kv_vram_size);
9856
- }
9857
-
9858
- size_t ctx_vram_size = alloc_size + kv_vram_size;
9859
- size_t total_vram_size = model_vram_size + ctx_vram_size;
9545
+ // initialize scheduler with the worst-case graph
9546
+ ggml_backend_sched_init_measure(ctx->sched, gf);
9547
+ // note: the number of splits during measure is higher than during inference due to the kv shift
9548
+ int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
9549
+ LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
9550
+ ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
9860
9551
 
9861
- LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
9862
- total_vram_size / 1024.0 / 1024.0,
9863
- model_vram_size / 1024.0 / 1024.0,
9864
- ctx_vram_size / 1024.0 / 1024.0);
9552
+ for (ggml_backend_t backend : ctx->backends) {
9553
+ ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
9554
+ LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
9555
+ ggml_backend_buffer_name(buf),
9556
+ ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
9865
9557
  }
9866
- #endif
9867
9558
  }
9868
9559
  }
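
Context creation above now builds a list of backends (Metal, one CUDA backend per device under LLAMA_SPLIT_LAYER, and always a CPU backend) and hands them to ggml_backend_sched, which is measured once against a worst-case graph and then reports a compute-buffer size per backend in place of the old single VRAM-scratch estimate. The public entry point is unchanged; a hedged sketch (field values illustrative):

    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx       = 4096;
    cparams.n_batch     = 512;
    cparams.offload_kqv = true;   // keep the KV cache on the GPU backends when possible
    struct llama_context * ctx = llama_new_context_with_model(model, cparams);
    if (ctx == nullptr) {
        // backend or KV-cache initialization failed (see the error paths above)
    }
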
9869
9560
 
@@ -9960,9 +9651,8 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
9960
9651
  }
9961
9652
 
9962
9653
  int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
9963
- return snprintf(buf, buf_size, "%s %s%s %s",
9654
+ return snprintf(buf, buf_size, "%s %s %s",
9964
9655
  llama_model_arch_name(model->arch).c_str(),
9965
- model->hparams.n_expert > 0 ? (std::to_string(model->hparams.n_expert) + "x").c_str() : "",
9966
9656
  llama_model_type_name(model->type),
9967
9657
  llama_model_ftype_name(model->ftype).c_str());
9968
9658
  }
@@ -9984,7 +9674,14 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
9984
9674
  }
9985
9675
 
9986
9676
  struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
9987
- return ggml_get_tensor(model->ctx, name);
9677
+ auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
9678
+ [name](const std::pair<std::string, struct ggml_tensor *> & it) {
9679
+ return it.first == name;
9680
+ });
9681
+ if (it == model->tensors_by_name.end()) {
9682
+ return nullptr;
9683
+ }
9684
+ return it->second;
9988
9685
  }
9989
9686
 
9990
9687
  uint32_t llama_model_quantize(
@@ -10141,28 +9838,39 @@ void llama_kv_cache_seq_keep(struct llama_context * ctx, llama_seq_id seq_id) {
10141
9838
  }
10142
9839
 
10143
9840
  void llama_kv_cache_seq_shift(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta) {
9841
+ if (delta == 0) {
9842
+ return;
9843
+ }
9844
+
10144
9845
  llama_kv_cache_seq_shift(ctx->kv_self, seq_id, p0, p1, delta);
10145
9846
  }
10146
9847
 
9848
+ void llama_kv_cache_seq_div(struct llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, int d) {
9849
+ if (d == 1) {
9850
+ return;
9851
+ }
9852
+
9853
+ llama_kv_cache_seq_div(ctx->kv_self, seq_id, p0, p1, d);
9854
+ }
9855
+
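
llama_kv_cache_seq_div is new public API: it divides the positions of sequence seq_id in the range [p0, p1) by d (a no-op when d == 1), complementing llama_kv_cache_seq_shift, which now also short-circuits on delta == 0. A hedged sketch of using the pair for position compression (values illustrative):

    const llama_seq_id seq = 0;
    // compress the first 1024 positions of the sequence by a factor of 4 ...
    llama_kv_cache_seq_div  (ctx, seq, /*p0*/    0, /*p1*/ 1024, /*d*/     4);
    // ... then pull the remaining cells back so positions stay contiguous;
    // p1 < 0 is conventionally treated as "to the end of the sequence"
    llama_kv_cache_seq_shift(ctx, seq, /*p0*/ 1024, /*p1*/   -1, /*delta*/ -768);
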
10147
9856
  // Returns the *maximum* size of the state
10148
9857
  size_t llama_get_state_size(const struct llama_context * ctx) {
10149
9858
  // we don't know size of rng until we actually serialize it. so reserve more than enough memory for its serialized state.
10150
9859
  // for reference, std::mt19937(1337) serializes to 6701 bytes.
10151
9860
  const size_t s_rng_size = sizeof(size_t);
10152
9861
  const size_t s_rng = LLAMA_MAX_RNG_STATE;
10153
- const size_t s_logits_capacity = sizeof(size_t);
10154
9862
  const size_t s_logits_size = sizeof(size_t);
9863
+ // assume worst case for logits although only currently set ones are serialized
10155
9864
  const size_t s_logits = ctx->logits.capacity() * sizeof(float);
10156
9865
  const size_t s_embedding_size = sizeof(size_t);
10157
9866
  const size_t s_embedding = ctx->embedding.size() * sizeof(float);
10158
9867
  const size_t s_kv_size = sizeof(size_t);
10159
9868
  const size_t s_kv_ntok = sizeof(int);
10160
- const size_t s_kv = ggml_backend_buffer_get_size(ctx->kv_self.buf);
9869
+ const size_t s_kv = ctx->kv_self.total_size();
10161
9870
 
10162
9871
  const size_t s_total = (
10163
9872
  + s_rng_size
10164
9873
  + s_rng
10165
- + s_logits_capacity
10166
9874
  + s_logits_size
10167
9875
  + s_logits
10168
9876
  + s_embedding_size
@@ -10231,37 +9939,27 @@ struct llama_data_file_context : llama_data_context {
10231
9939
  static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
10232
9940
  // copy rng
10233
9941
  {
10234
- std::stringstream rng_ss;
9942
+ std::ostringstream rng_ss;
10235
9943
  rng_ss << ctx->rng;
10236
9944
 
10237
- const size_t rng_size = rng_ss.str().size();
10238
- char rng_buf[LLAMA_MAX_RNG_STATE];
9945
+ const std::string & rng_str = rng_ss.str();
9946
+ const size_t rng_size = rng_str.size();
10239
9947
 
10240
- memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
10241
- memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
9948
+ GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
10242
9949
 
10243
- data_ctx->write(&rng_size, sizeof(rng_size));
10244
- data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE);
9950
+ data_ctx->write(&rng_size, sizeof(rng_size));
9951
+ data_ctx->write(rng_str.data(), rng_size);
10245
9952
  }
10246
9953
 
10247
9954
  // copy logits
10248
9955
  {
10249
- const size_t logits_cap = ctx->logits.capacity();
10250
9956
  const size_t logits_size = ctx->logits.size();
10251
9957
 
10252
- data_ctx->write(&logits_cap, sizeof(logits_cap));
10253
9958
  data_ctx->write(&logits_size, sizeof(logits_size));
10254
9959
 
10255
9960
  if (logits_size) {
10256
9961
  data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
10257
9962
  }
10258
-
10259
- // If there is a gap between the size and the capacity, write padding
10260
- size_t padding_size = (logits_cap - logits_size) * sizeof(float);
10261
- if (padding_size > 0) {
10262
- std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
10263
- data_ctx->write(padding.data(), padding_size);
10264
- }
10265
9963
  }
10266
9964
 
10267
9965
  // copy embeddings
@@ -10286,7 +9984,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
10286
9984
  const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
10287
9985
  const auto n_ctx = cparams.n_ctx;
10288
9986
 
10289
- const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
9987
+ const size_t kv_buf_size = kv_self.total_size();
10290
9988
  const uint32_t kv_head = kv_self.head;
10291
9989
  const uint32_t kv_size = kv_self.size;
10292
9990
  const uint32_t kv_used = kv_self.used;
@@ -10299,46 +9997,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
10299
9997
  if (kv_buf_size) {
10300
9998
  const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
10301
9999
 
10302
- ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
10303
- ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
10304
-
10305
- std::vector<struct ggml_tensor *> kout2d(n_layer);
10306
- std::vector<struct ggml_tensor *> vout2d(n_layer);
10307
-
10308
- for (int il = 0; il < (int) n_layer; ++il) {
10309
- kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
10310
- vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
10311
-
10312
- ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
10313
- n_embd_k_gqa, kv_head,
10314
- elt_size*n_embd_k_gqa, 0);
10315
-
10316
- ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
10317
- kv_head, n_embd_v_gqa,
10318
- elt_size*n_ctx, 0);
10319
-
10320
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
10321
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il]));
10322
- }
10323
-
10324
- ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
10325
-
10326
- ggml_backend_graph_compute(ctx->backend, gf);
10327
-
10328
10000
  std::vector<uint8_t> tmp_buf;
10329
10001
  for (int il = 0; il < (int) n_layer; ++il) {
10330
- tmp_buf.resize(ggml_nbytes(kout2d[il]));
10331
- ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size());
10002
+ tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head);
10003
+ ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
10332
10004
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
10333
10005
 
10334
- tmp_buf.resize(ggml_nbytes(vout2d[il]));
10335
- ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size());
10336
- data_ctx->write(tmp_buf.data(), tmp_buf.size());
10006
+ // v is not contiguous, copy row by row
10007
+ tmp_buf.resize(elt_size*kv_head);
10008
+ for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
10009
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
10010
+ data_ctx->write(tmp_buf.data(), tmp_buf.size());
10011
+ }
10337
10012
  }
10338
-
10339
- ggml_free(cpy_ctx);
10340
-
10341
- ggml_backend_buffer_free(buf);
10342
10013
  }
10343
10014
 
10344
10015
  for (uint32_t i = 0; i < kv_size; ++i) {
@@ -10371,13 +10042,13 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
10371
10042
  // set rng
10372
10043
  {
10373
10044
  size_t rng_size;
10374
- char rng_buf[LLAMA_MAX_RNG_STATE];
10045
+ memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
10375
10046
 
10376
- memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
10377
- memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
10047
+ GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
10378
10048
 
10379
- std::stringstream rng_ss;
10380
- rng_ss.str(std::string(&rng_buf[0], rng_size));
10049
+ std::string rng_str((char *)inp, rng_size); inp += rng_size;
10050
+
10051
+ std::istringstream rng_ss(rng_str);
10381
10052
  rng_ss >> ctx->rng;
10382
10053
 
10383
10054
  GGML_ASSERT(!rng_ss.fail());
@@ -10385,20 +10056,18 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
10385
10056
 
10386
10057
  // set logits
10387
10058
  {
10388
- size_t logits_cap;
10389
10059
  size_t logits_size;
10390
10060
 
10391
- memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
10392
10061
  memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
10393
10062
 
10394
- GGML_ASSERT(ctx->logits.capacity() == logits_cap);
10063
+ GGML_ASSERT(ctx->logits.capacity() >= logits_size);
10395
10064
 
10396
10065
  if (logits_size) {
10397
10066
  ctx->logits.resize(logits_size);
10067
+
10398
10068
  memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
10069
+ inp += logits_size * sizeof(float);
10399
10070
  }
10400
-
10401
- inp += logits_cap * sizeof(float);
10402
10071
  }
10403
10072
 
10404
10073
  // set embeddings
@@ -10437,48 +10106,22 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
10437
10106
  memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
10438
10107
 
10439
10108
  if (kv_buf_size) {
10440
- GGML_ASSERT(ggml_backend_buffer_get_size(kv_self.buf) == kv_buf_size);
10109
+ GGML_ASSERT(kv_self.total_size() == kv_buf_size);
10441
10110
 
10442
10111
  const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
10443
10112
 
10444
- ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
10445
- ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
10446
-
10447
- std::vector<struct ggml_tensor *> kin2d(n_layer);
10448
- std::vector<struct ggml_tensor *> vin2d(n_layer);
10449
-
10450
- for (int il = 0; il < n_layer; ++il) {
10451
- kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
10452
- vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
10453
-
10454
- ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
10455
- n_embd_k_gqa, kv_head,
10456
- elt_size*n_embd_k_gqa, 0);
10457
-
10458
- ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
10459
- kv_head, n_embd_v_gqa,
10460
- elt_size*n_ctx, 0);
10461
-
10462
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
10463
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d));
10464
- }
10465
-
10466
- ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
10467
-
10468
- // load data into the tensors
10469
- for (int il = 0; il < n_layer; ++il) {
10470
- ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il]));
10471
- inp += ggml_nbytes(kin2d[il]);
10472
-
10473
- ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il]));
10474
- inp += ggml_nbytes(vin2d[il]);
10113
+ for (int il = 0; il < (int) n_layer; ++il) {
10114
+ size_t k_size = elt_size*n_embd_k_gqa*kv_head;
10115
+ ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
10116
+ inp += k_size;
10117
+
10118
+ // v is not contiguous, copy row by row
10119
+ size_t v_row_size = elt_size*kv_head;
10120
+ for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
10121
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
10122
+ inp += v_row_size;
10123
+ }
10475
10124
  }
10476
-
10477
- ggml_backend_graph_compute(ctx->backend, gf);
10478
-
10479
- ggml_free(cpy_ctx);
10480
-
10481
- ggml_backend_buffer_free(buf);
10482
10125
  }
10483
10126
 
10484
10127
  ctx->kv_self.head = kv_head;
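
The session-state hunks above drop the fixed-size rng buffer and the logits capacity/padding from the serialized format, and copy the KV cache straight out of (and back into) the backend tensors: K contiguously, V row by row, because the V cache is stored transposed with a row stride of n_ctx elements of which only the first kv_head are in use. The public entry points are unchanged; a hedged usage sketch:

    // llama_get_state_size() is an upper bound; the actual state may be smaller
    std::vector<uint8_t> state(llama_get_state_size(ctx));
    const size_t n_written = llama_copy_state_data(ctx, state.data());
    state.resize(n_written);

    // later: restore into a context created with the same model and parameters
    llama_set_state_data(ctx, state.data());
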
@@ -10794,6 +10437,8 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
10794
10437
  if (0 <= token && token < llama_n_vocab(model)) {
10795
10438
  switch (llama_vocab_get_type(model->vocab)) {
10796
10439
  case LLAMA_VOCAB_TYPE_SPM: {
10440
+ // NOTE: we accept all unsupported token types,
10441
+ // suppressing them like CONTROL tokens.
10797
10442
  if (llama_is_normal_token(model->vocab, token)) {
10798
10443
  std::string result = model->vocab.id_to_token[token].text;
10799
10444
  llama_unescape_whitespace(result);
@@ -10802,6 +10447,13 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
10802
10447
  }
10803
10448
  memcpy(buf, result.c_str(), result.length());
10804
10449
  return result.length();
10450
+ } else if (llama_is_user_defined_token(model->vocab, token)) {
10451
+ std::string result = model->vocab.id_to_token[token].text;
10452
+ if (length < (int) result.length()) {
10453
+ return -result.length();
10454
+ }
10455
+ memcpy(buf, result.c_str(), result.length());
10456
+ return result.length();
10805
10457
  } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
10806
10458
  if (length < 3) {
10807
10459
  return -3;
@@ -10816,14 +10468,12 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
10816
10468
  }
10817
10469
  buf[0] = llama_token_to_byte(model->vocab, token);
10818
10470
  return 1;
10819
- } else {
10820
- // TODO: for now we accept all unsupported token types,
10821
- // suppressing them like CONTROL tokens.
10822
- // GGML_ASSERT(false);
10823
10471
  }
10824
10472
  break;
10825
10473
  }
10826
10474
  case LLAMA_VOCAB_TYPE_BPE: {
10475
+ // NOTE: we accept all unsupported token types,
10476
+ // suppressing them like CONTROL tokens.
10827
10477
  if (llama_is_normal_token(model->vocab, token)) {
10828
10478
  std::string result = model->vocab.id_to_token[token].text;
10829
10479
  result = llama_decode_text(result);
@@ -10832,12 +10482,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
10832
10482
  }
10833
10483
  memcpy(buf, result.c_str(), result.length());
10834
10484
  return result.length();
10485
+ } else if (llama_is_user_defined_token(model->vocab, token)) {
10486
+ std::string result = model->vocab.id_to_token[token].text;
10487
+ if (length < (int) result.length()) {
10488
+ return -result.length();
10489
+ }
10490
+ memcpy(buf, result.c_str(), result.length());
10491
+ return result.length();
10835
10492
  } else if (llama_is_control_token(model->vocab, token)) {
10836
10493
  ;
10837
- } else {
10838
- // TODO: for now we accept all unsupported token types,
10839
- // suppressing them like CONTROL tokens.
10840
- // GGML_ASSERT(false);
10841
10494
  }
10842
10495
  break;
10843
10496
  }
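
llama_token_to_piece now renders user-defined tokens verbatim for both the SPM and BPE vocabularies rather than silently dropping them, while keeping the existing protocol of returning the negative required length when the caller's buffer is too small. A hedged sketch of that protocol:

    char buf[64];
    int32_t n = llama_token_to_piece(model, token, buf, (int32_t) sizeof(buf));
    std::string piece;
    if (n >= 0) {
        piece.assign(buf, (size_t) n);          // the result is not null-terminated
    } else {
        piece.resize((size_t) -n);              // -n is the required buffer size
        n = llama_token_to_piece(model, token, &piece[0], (int32_t) piece.size());
    }
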
@@ -10876,7 +10529,7 @@ void llama_print_timings(struct llama_context * ctx) {
10876
10529
  __func__, timings.t_p_eval_ms, timings.n_p_eval, timings.t_p_eval_ms / timings.n_p_eval, 1e3 / timings.t_p_eval_ms * timings.n_p_eval);
10877
10530
  LLAMA_LOG_INFO("%s: eval time = %10.2f ms / %5d runs (%8.2f ms per token, %8.2f tokens per second)\n",
10878
10531
  __func__, timings.t_eval_ms, timings.n_eval, timings.t_eval_ms / timings.n_eval, 1e3 / timings.t_eval_ms * timings.n_eval);
10879
- LLAMA_LOG_INFO("%s: total time = %10.2f ms\n", __func__, (timings.t_end_ms - timings.t_start_ms));
10532
+ LLAMA_LOG_INFO("%s: total time = %10.2f ms / %5d tokens\n", __func__, (timings.t_end_ms - timings.t_start_ms), (timings.n_p_eval + timings.n_eval));
10880
10533
  }
10881
10534
 
10882
10535
  void llama_reset_timings(struct llama_context * ctx) {
@@ -10949,7 +10602,7 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
10949
10602
  g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
10950
10603
  g_state.log_callback_user_data = user_data;
10951
10604
  #ifdef GGML_USE_METAL
10952
- ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
10605
+ ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
10953
10606
  #endif
10954
10607
  }
10955
10608