llama_cpp 0.12.1 → 0.12.2

@@ -1,5 +1,4 @@
  #define LLAMA_API_INTERNAL
- //#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - enables ggml-cuda through ggml-backend, disables partial offloading
  #include "llama.h"

  #include "unicode.h"
@@ -152,10 +151,6 @@ static bool is_float_close(float a, float b, float abs_tol) {
  return std::fabs(b - a) <= abs_tol;
  }

- #ifdef GGML_USE_CPU_HBM
- #include <hbwmalloc.h>
- #endif
-
  static void zeros(std::ofstream & file, size_t n) {
  char zero = 0;
  for (size_t i = 0; i < n; ++i) {
@@ -579,6 +574,9 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_OUTPUT, "output" },
  { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
  { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
  { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
  { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
@@ -988,20 +986,29 @@ struct llama_mmap {
  throw std::runtime_error(format("MapViewOfFile failed: %s", llama_format_win_err(error).c_str()));
  }

- #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
  if (prefetch > 0) {
- // Advise the kernel to preload the mapped memory
- WIN32_MEMORY_RANGE_ENTRY range;
- range.VirtualAddress = addr;
- range.NumberOfBytes = (SIZE_T)size;
- if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
- fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
- llama_format_win_err(GetLastError()).c_str());
- }
+ #if _WIN32_WINNT >= 0x602
+ // PrefetchVirtualMemory is only present on Windows 8 and above, so we dynamically load it
+ BOOL (WINAPI *pPrefetchVirtualMemory) (HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
+ HMODULE hKernel32 = GetModuleHandleW(L"kernel32.dll");
+
+ // may fail on pre-Windows 8 systems
+ pPrefetchVirtualMemory = reinterpret_cast<decltype(pPrefetchVirtualMemory)> (GetProcAddress(hKernel32, "PrefetchVirtualMemory"));
+
+ if (pPrefetchVirtualMemory) {
+ // advise the kernel to preload the mapped memory
+ WIN32_MEMORY_RANGE_ENTRY range;
+ range.VirtualAddress = addr;
+ range.NumberOfBytes = (SIZE_T) std::min(size, prefetch);
+ if (!pPrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
+ LLAMA_LOG_WARN("warning: PrefetchVirtualMemory failed: %s\n",
+ llama_format_win_err(GetLastError()).c_str());
+ }
+ }
+ #else
+ throw std::runtime_error("PrefetchVirtualMemory unavailable");
+ #endif
  }
- #else
- #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
- #endif // _WIN32_WINNT >= _WIN32_WINNT_WIN8
  }

  void unmap_fragment(size_t first, size_t last) {
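Note: the new Windows prefetch path above no longer calls PrefetchVirtualMemory directly; it looks the function up at run time so the same binary still starts on Windows versions that predate it. A minimal sketch of that lookup pattern on its own, using the same Win32 APIs as the hunk (addr, size and prefetch stand in for the surrounding mapping state):

    // resolve an optional Win32 API at run time instead of taking a hard link-time dependency
    typedef BOOL (WINAPI *prefetch_fn_t)(HANDLE, ULONG_PTR, PWIN32_MEMORY_RANGE_ENTRY, ULONG);
    HMODULE kernel32 = GetModuleHandleW(L"kernel32.dll");
    prefetch_fn_t prefetch_fn = reinterpret_cast<prefetch_fn_t>(GetProcAddress(kernel32, "PrefetchVirtualMemory"));
    if (prefetch_fn) {
        WIN32_MEMORY_RANGE_ENTRY range;
        range.VirtualAddress = addr;                             // start of the mapped region
        range.NumberOfBytes  = (SIZE_T) std::min(size, prefetch);
        prefetch_fn(GetCurrentProcess(), 1, &range, 0);          // advisory; a failure only warrants a warning
    }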
@@ -1107,7 +1114,7 @@ struct llama_mlock {
  suggest = false;
  }

- fprintf(stderr, "warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
+ LLAMA_LOG_WARN("warning: failed to mlock %zu-byte buffer (after previously locking %zu bytes): %s\n%s",
  size, this->size, errmsg, suggest ? MLOCK_SUGGESTION : "");
  return false;
  }
@@ -1116,7 +1123,7 @@ struct llama_mlock {

  static void raw_unlock(void * addr, size_t size) {
  if (munlock(addr, size)) {
- fprintf(stderr, "warning: failed to munlock buffer: %s\n", std::strerror(errno));
+ LLAMA_LOG_WARN("warning: failed to munlock buffer: %s\n", std::strerror(errno));
  }
  }
  #elif defined(_WIN32)
@@ -1134,7 +1141,7 @@ struct llama_mlock {
  return true;
  }
  if (tries == 2) {
- fprintf(stderr, "warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
+ LLAMA_LOG_WARN("warning: failed to VirtualLock %zu-byte buffer (after previously locking %zu bytes): %s\n",
  len, size, llama_format_win_err(GetLastError()).c_str());
  return false;
  }
@@ -1143,7 +1150,7 @@ struct llama_mlock {
  // set size and try again.
  SIZE_T min_ws_size, max_ws_size;
  if (!GetProcessWorkingSetSize(GetCurrentProcess(), &min_ws_size, &max_ws_size)) {
- fprintf(stderr, "warning: GetProcessWorkingSetSize failed: %s\n",
+ LLAMA_LOG_WARN("warning: GetProcessWorkingSetSize failed: %s\n",
  llama_format_win_err(GetLastError()).c_str());
  return false;
  }
@@ -1156,7 +1163,7 @@ struct llama_mlock {
  min_ws_size += increment;
  max_ws_size += increment;
  if (!SetProcessWorkingSetSize(GetCurrentProcess(), min_ws_size, max_ws_size)) {
- fprintf(stderr, "warning: SetProcessWorkingSetSize failed: %s\n",
+ LLAMA_LOG_WARN("warning: SetProcessWorkingSetSize failed: %s\n",
  llama_format_win_err(GetLastError()).c_str());
  return false;
  }
@@ -1165,7 +1172,7 @@ struct llama_mlock {

  static void raw_unlock(void * ptr, size_t len) {
  if (!VirtualUnlock(ptr, len)) {
- fprintf(stderr, "warning: failed to VirtualUnlock buffer: %s\n",
+ LLAMA_LOG_WARN("warning: failed to VirtualUnlock buffer: %s\n",
  llama_format_win_err(GetLastError()).c_str());
  }
  }
@@ -1177,7 +1184,7 @@ struct llama_mlock {
  }

  bool raw_lock(const void * addr, size_t len) const {
- fprintf(stderr, "warning: mlock not supported on this system\n");
+ LLAMA_LOG_WARN("warning: mlock not supported on this system\n");
  return false;
  }

@@ -1185,12 +1192,6 @@ struct llama_mlock {
  #endif
  };

- typedef void (*offload_func_t)(struct ggml_tensor * tensor);
-
- static void ggml_offload_nop(struct ggml_tensor * tensor) {
- (void) tensor;
- }
-
  static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
  std::vector<char> result(8, 0);
  const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
@@ -1206,19 +1207,14 @@ static std::string llama_token_to_piece(const struct llama_context * ctx, llama_
  return std::string(result.data(), result.size());
  }

- static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
+ static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
  ggml_backend_buffer_type_t buft = nullptr;

- #ifdef GGML_USE_METAL
- if (n_gpu_layers > 0) {
- buft = ggml_backend_metal_buffer_type();
- }
- #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- if (n_gpu_layers > 0) {
- buft = ggml_backend_cuda_buffer_type(0);
+ #if defined(GGML_USE_CUBLAS)
+ // host buffers should only be used when data is expected to be copied to/from the GPU
+ if (host_buffer) {
+ buft = ggml_backend_cuda_host_buffer_type();
  }
- #elif defined(GGML_USE_CUBLAS)
- buft = ggml_backend_cuda_host_buffer_type();
  #elif defined(GGML_USE_CPU_HBM)
  buft = ggml_backend_cpu_hbm_buffer_type();
  #endif
@@ -1226,10 +1222,45 @@ static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
  if (buft == nullptr) {
  buft = ggml_backend_cpu_buffer_type();
  }
+ return buft;
+
+ GGML_UNUSED(host_buffer);
+ }
+
+ static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ #ifdef GGML_USE_METAL
+ buft = ggml_backend_metal_buffer_type();
+ #elif defined(GGML_USE_CUBLAS)
+ buft = ggml_backend_cuda_buffer_type(gpu);
+ #elif defined(GGML_USE_CLBLAST)
+ buft = ggml_backend_opencl_buffer_type();
+ #endif
+
+ if (buft == nullptr) {
+ buft = llama_default_buffer_type_cpu(true);
+ }
+ return buft;
+
+ GGML_UNUSED(gpu);
+ }
+
+ static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
+ ggml_backend_buffer_type_t buft = nullptr;
+
+ #ifdef GGML_USE_CUBLAS
+ if (ggml_backend_cuda_get_device_count() > 1) {
+ buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+ }
+ #endif

+ if (buft == nullptr) {
+ buft = llama_default_buffer_type_offload(fallback_gpu);
+ }
  return buft;

- GGML_UNUSED(n_gpu_layers);
+ GGML_UNUSED(tensor_split);
  }

  //
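The three helpers above form a fallback chain: llama_default_buffer_type_split() falls back to llama_default_buffer_type_offload(), which falls back to llama_default_buffer_type_cpu(), so a build without CUDA, Metal or OpenCL always ends up with a valid CPU buffer type. A hedged sketch of how a caller could pick a buffer type per layer on top of this chain (pick_layer_buft and its parameters are illustrative, not part of the diff):

    // decide where a layer's weights should live; everything degrades gracefully to CPU
    static ggml_backend_buffer_type_t pick_layer_buft(int il, int n_layer, int n_gpu_layers,
                                                      int main_gpu, const float * tensor_split) {
        const int i_gpu_start = n_layer - n_gpu_layers;
        if (il < i_gpu_start) {
            return llama_default_buffer_type_cpu(/*host_buffer=*/true);
        }
        // split matrices across devices when a tensor split is given, otherwise use the main GPU
        return tensor_split ? llama_default_buffer_type_split(main_gpu, tensor_split)
                            : llama_default_buffer_type_offload(main_gpu);
    }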
@@ -1239,7 +1270,7 @@ static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
  struct llama_state {
  llama_state() {
  #ifdef GGML_USE_METAL
- ggml_metal_log_set_callback(log_callback, log_callback_user_data);
+ ggml_backend_metal_log_set_callback(log_callback, log_callback_user_data);
  #endif
  }

@@ -1440,24 +1471,24 @@ struct llama_kv_cache {
  std::vector<struct ggml_tensor *> k_l; // per layer
  std::vector<struct ggml_tensor *> v_l;

- struct ggml_context * ctx = NULL;
+ std::vector<struct ggml_context *> ctxs;
+ std::vector<ggml_backend_buffer_t> bufs;

- ggml_backend_buffer_t buf = NULL;
+ size_t total_size() const {
+ size_t size = 0;
+ for (ggml_backend_buffer_t buf : bufs) {
+ size += ggml_backend_buffer_get_size(buf);
+ }
+ return size;
+ }

  ~llama_kv_cache() {
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- if (ggml_cublas_loaded()) {
- for (size_t i = 0; i < k_l.size(); ++i) {
- ggml_cuda_free_data(k_l[i]);
- ggml_cuda_free_data(v_l[i]);
- }
- }
- #endif
- if (ctx) {
+ for (struct ggml_context * ctx : ctxs) {
  ggml_free(ctx);
  }
-
- ggml_backend_buffer_free(buf);
+ for (ggml_backend_buffer_t buf : bufs) {
+ ggml_backend_buffer_free(buf);
+ }
  }
  };

@@ -1534,16 +1565,32 @@ struct llama_model {

  std::vector<llama_layer> layers;

+ llama_split_mode split_mode;
+ int main_gpu;
  int n_gpu_layers;

  // gguf metadata
  std::unordered_map<std::string, std::string> gguf_kv;

- // context
- struct ggml_context * ctx = NULL;
+ // layer -> buffer type mapping
+ struct layer_buft {
+ layer_buft() : buft_matrix(nullptr), buft(nullptr) {}
+ layer_buft(ggml_backend_buffer_type_t matrix) : buft_matrix(matrix), buft(matrix) {}
+ layer_buft(ggml_backend_buffer_type_t matrix, ggml_backend_buffer_type_t other) : buft_matrix(matrix), buft(other) {}
+
+ ggml_backend_buffer_type_t buft_matrix; // matrices only - used by split buffers and backends that support only matrix multiplication
+ ggml_backend_buffer_type_t buft; // everything else
+ };
+
+ layer_buft buft_input;
+ layer_buft buft_output;
+ std::vector<layer_buft> buft_layer;

- // the model memory buffer
- ggml_backend_buffer_t buf = NULL;
+ // contexts where the model tensors metadata is stored
+ std::vector<struct ggml_context *> ctxs;
+
+ // the model memory buffers for the tensor data
+ std::vector<ggml_backend_buffer_t> bufs;

  // model memory mapped file
  std::unique_ptr<llama_mmap> mapping;
@@ -1559,39 +1606,32 @@ struct llama_model {
  int64_t t_start_us = 0;

  ~llama_model() {
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- if (ggml_cublas_loaded()) {
- for (size_t i = 0; i < tensors_by_name.size(); ++i) {
- ggml_cuda_free_data(tensors_by_name[i].second);
- }
- ggml_cuda_free_scratch();
- }
- #endif
-
- #if defined(GGML_USE_CLBLAST)
- for (size_t i = 0; i < tensors_by_name.size(); ++i) {
- ggml_cl_free_data(tensors_by_name[i].second);
- }
- #endif
- if (ctx) {
+ for (struct ggml_context * ctx : ctxs) {
  ggml_free(ctx);
  }
-
- ggml_backend_buffer_free(buf);
+ for (ggml_backend_buffer_t buf : bufs) {
+ ggml_backend_buffer_free(buf);
+ }
  }
  };

  struct llama_context {
  llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
  ~llama_context() {
- ggml_allocr_free(alloc);
- ggml_backend_buffer_free(buf_alloc);
- ggml_backend_free(backend);
+ ggml_backend_sched_free(sched);
+
+ for (ggml_backend_t backend : backends) {
+ ggml_backend_free(backend);
+ }
  }

  llama_cparams cparams;

- ggml_backend_t backend = nullptr;
+ std::vector<ggml_backend_t> backends;
+ #ifdef GGML_USE_METAL
+ ggml_backend_t backend_metal = nullptr;
+ #endif
+ ggml_backend_t backend_cpu = nullptr;

  const llama_model & model;

@@ -1625,8 +1665,9 @@ struct llama_context {

  // memory buffers used to evaluate the model
  std::vector<uint8_t> buf_compute_meta;
- ggml_backend_buffer_t buf_alloc = NULL;
- ggml_allocr * alloc = NULL;
+ ggml_backend_sched_t sched = nullptr;
+ // allocator for the input tensors
+ ggml_tallocr * alloc = nullptr;

  // temporary buffer for copying data to/from the backend
  std::vector<no_init<uint8_t>> buf_copy;
@@ -1641,16 +1682,17 @@ struct llama_context {
  //

  static bool llama_kv_cache_init(
- const struct llama_hparams & hparams,
  struct llama_kv_cache & cache,
+ const llama_model & model,
  ggml_type ktype,
  ggml_type vtype,
  uint32_t n_ctx,
- int n_gpu_layers,
  bool offload) {
+ const struct llama_hparams & hparams = model.hparams;
+
  const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
  const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
- const uint32_t n_layer = hparams.n_layer;
+ const int64_t n_layer = hparams.n_layer;

  cache.has_shift = false;

@@ -1661,62 +1703,65 @@ static bool llama_kv_cache_init(
  cache.cells.clear();
  cache.cells.resize(n_ctx);

- struct ggml_init_params params;
- params.mem_size = 2u*n_layer*ggml_tensor_overhead();
- params.mem_buffer = NULL;
- params.no_alloc = true;
-
- cache.ctx = ggml_init(params);
+ #ifdef GGML_USE_CLBLAST
+ offload = false;
+ #endif

- size_t vram_kv_cache = 0;
+ // count used buffer types
+ std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
+ if (offload) {
+ for (int64_t i = 0; i < n_layer; ++i) {
+ buft_layer_count[model.buft_layer[i].buft]++;
+ }
+ } else {
+ buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
+ }

- if (!cache.ctx) {
- LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
- return false;
+ // create a context for each buffer type
+ std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+ for (auto & it : buft_layer_count) {
+ int n_layers = it.second;
+ struct ggml_init_params params = {
+ /*.mem_size =*/ 2u*n_layers*ggml_tensor_overhead(),
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+ ggml_context * ctx = ggml_init(params);
+ if (!ctx) {
+ LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache\n", __func__);
+ return false;
+ }
+ ctx_map[it.first] = ctx;
+ cache.ctxs.push_back(ctx);
  }

  cache.k_l.reserve(n_layer);
  cache.v_l.reserve(n_layer);

- const int i_gpu_start = (int) n_layer - n_gpu_layers;
-
  for (int i = 0; i < (int) n_layer; i++) {
- ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd_k_gqa*n_ctx);
- ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd_v_gqa*n_ctx);
+ struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx);
+ ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx);
  ggml_format_name(k, "cache_k_l%d", i);
  ggml_format_name(v, "cache_v_l%d", i);
  cache.k_l.push_back(k);
  cache.v_l.push_back(v);
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- if (i >= i_gpu_start) {
- if (offload) {
- ggml_cuda_assign_buffers_no_scratch(k);
- ggml_cuda_assign_buffers_no_scratch(v);
- vram_kv_cache += ggml_nbytes(k);
- vram_kv_cache += ggml_nbytes(v);
- // HACK: mark tensor as allocated
- k->data = v->data = (void *)(uintptr_t)1;
- }
- }
- #endif // GGML_USE_CUBLAS
- }
-
- // allocate tensors
- cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, llama_default_buffer_type(n_gpu_layers));
-
- // buf may be NULL with full offload
- if (cache.buf) {
- // initialize the buffer to avoid NaNs in the padding
- ggml_backend_buffer_clear(cache.buf, 0);
  }

- if (vram_kv_cache > 0) {
- LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+ // allocate tensors and initialize the buffers to avoid NaNs in the padding
+ for (auto it : ctx_map) {
+ ggml_backend_buffer_type_t buft = it.first;
+ ggml_context * ctx = it.second;
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+ if (!buf) {
+ LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
+ return false;
+ }
+ ggml_backend_buffer_clear(buf, 0);
+ LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+ cache.bufs.push_back(buf);
  }

- GGML_UNUSED(i_gpu_start);
- GGML_UNUSED(offload);
-
  return true;
  }

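The rewritten llama_kv_cache_init() above groups the per-layer K/V tensors by the buffer type of their layer, creates one metadata-only ggml context per buffer type, and then allocates a single backend buffer per context. A short sketch of the same grouping idea in isolation, assuming a kv_buft(il) helper that returns the buffer type chosen for layer il (that name is illustrative):

    // one metadata-only context per buffer type; each holds 2*n_layers tensor headers (K and V)
    std::map<ggml_backend_buffer_type_t, int> count_by_buft;
    for (int64_t il = 0; il < n_layer; ++il) {
        count_by_buft[kv_buft(il)]++;
    }
    for (const auto & it : count_by_buft) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ 2u*it.second*ggml_tensor_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true,   // tensors created here carry metadata only, no data
        };
        ggml_context * ctx = ggml_init(params);
        // ... create the K/V tensors of the layers mapped to it.first in ctx, then allocate them all at once:
        // ggml_backend_alloc_ctx_tensors_from_buft(ctx, it.first);
    }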
@@ -2040,13 +2085,13 @@ namespace GGUFMeta {
  __func__, override_type_to_str(override->tag), override->key);
  switch (override->tag) {
  case LLAMA_KV_OVERRIDE_BOOL: {
- printf("%s\n", override->bool_value ? "true" : "false");
+ LLAMA_LOG_INFO("%s\n", override->bool_value ? "true" : "false");
  } break;
  case LLAMA_KV_OVERRIDE_INT: {
- printf("%" PRId64 "\n", override->int_value);
+ LLAMA_LOG_INFO("%" PRId64 "\n", override->int_value);
  } break;
  case LLAMA_KV_OVERRIDE_FLOAT: {
- printf("%.6f\n", override->float_value);
+ LLAMA_LOG_INFO("%.6f\n", override->float_value);
  } break;
  default:
  // Shouldn't be possible to end up here, but just in case...
@@ -2145,6 +2190,11 @@ struct llama_model_loader {
  LLM_KV llm_kv = LLM_KV(LLM_ARCH_UNKNOWN);

  llama_model_loader(const std::string & fname, bool use_mmap, const struct llama_model_kv_override * param_overrides_p) : file(fname.c_str(), "rb") {
+ int trace = 0;
+ if (getenv("LLAMA_TRACE")) {
+ trace = atoi(getenv("LLAMA_TRACE"));
+ }
+
  struct gguf_init_params params = {
  /*.no_alloc = */ true,
  /*.ctx = */ &ctx_meta,
@@ -2197,11 +2247,10 @@ struct llama_model_loader {
  type_max = type;
  }

- // TODO: make runtime configurable
- #if 0
- struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
- LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
- #endif
+ if (trace > 0) {
+ struct ggml_tensor * meta = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+ LLAMA_LOG_INFO("%s: - tensor %4d: %32s %-8s [ %s ]\n", __func__, i, ggml_get_name(meta), ggml_type_name(type), llama_format_tensor_shape(meta).c_str());
+ }
  }

  switch (type_max) {
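With the change above, the per-tensor dump during model load is no longer a compile-time #if 0 block: it is enabled at run time by setting the LLAMA_TRACE environment variable to a value greater than zero before the model is loaded. A tiny sketch of the same opt-in pattern as a reusable helper (env_int is an illustrative name, not part of the diff):

    #include <cstdlib>

    // read an integer environment variable, falling back to a default when it is unset
    static int env_int(const char * name, int def) {
        const char * val = std::getenv(name);
        return val ? std::atoi(val) : def;
    }

    const int trace = env_int("LLAMA_TRACE", 0);   // > 0 prints one line per tensor while loading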
@@ -2349,9 +2398,8 @@ struct llama_model_loader {
  return get_tensor_meta(get_tensor_name(i));
  }

- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) {
  struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
- tensor->backend = backend; // TODO: ggml_set_backend
  ggml_set_name(tensor, ggml_get_name(meta));

  n_created++;
@@ -2359,7 +2407,7 @@ struct llama_model_loader {
  return tensor;
  }

- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool required = true) {
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
  struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());

  if (cur == NULL) {
@@ -2369,12 +2417,6 @@ struct llama_model_loader {
  throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
  }

- if (backend == GGML_BACKEND_GPU_SPLIT) {
- if (ne.size() == 1) {
- throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str()));
- }
- }
-
  {
  bool is_ok = true;
  for (size_t i = 0; i < ne.size(); ++i) {
@@ -2392,7 +2434,7 @@ struct llama_model_loader {
  }
  }

- return create_tensor_for(ctx, cur, backend);
+ return create_tensor_for(ctx, cur);
  }

  void done_getting_tensors() const {
@@ -2411,25 +2453,35 @@ struct llama_model_loader {
  return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
  }

- void init_mapping(bool prefetch = true) {
- /*
- // prefetch only CPU tensors
+ void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
+ // prefetch the whole file - all the data is needed anyway
  if (use_mmap) {
- size_t size_pref = 0; // prefetch
+ mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
+ }

- for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
- struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
- if (cur->backend == GGML_BACKEND_CPU) {
- size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + ggml_nbytes(cur);
- size_pref = std::max(size_pref, tensor_end);
- }
+ // compute the total size of all tensors for progress reporting
+ for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
+ struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+ size_data += ggml_nbytes(cur);
+ }
+
+ if (use_mmap && mapping) {
+ if (lmlock) {
+ lmlock->init(mapping->addr);
  }
- mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa()));
+ mmap_used_first = mapping->size;
  }
- */
- // prefetch the whole file - all the data is needed anyway
- if (use_mmap) {
- mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
+ }
+
+ void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
+ GGML_ASSERT(mapping);
+
+ *first = mapping->size;
+ *last = 0;
+ for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
+ const size_t offs = file_offset(ggml_get_name(tensor));
+ *first = std::min(*first, offs);
+ *last = std::max(*last, offs + ggml_nbytes(tensor));
  }
  }

@@ -2438,8 +2490,11 @@ struct llama_model_loader {
  const size_t offs = file_offset(ggml_get_name(cur));

  if (use_mmap && mapping) {
- GGML_ASSERT(cur->data == nullptr);
- cur->data = (uint8_t *)mapping->addr + offs;
+ if (cur->data == nullptr) {
+ cur->data = (uint8_t *)mapping->addr + offs;
+ } else {
+ memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
+ }
  } else {
  GGML_ASSERT(cur->data != nullptr);
  file.seek(offs, SEEK_SET);
@@ -2447,37 +2502,23 @@ struct llama_model_loader {
  }
  }

- // Returns false if cancelled by progress_callback
- bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
- size_t size_data = 0;
-
- for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
- struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
- size_data += ggml_nbytes(cur);
- }
-
- if (use_mmap && buf_mmap) {
- if (lmlock) {
- lmlock->init(mapping->addr);
- }
- }
+ size_t size_done = 0;
+ size_t size_data = 0;
+ size_t mmap_used_first = -1;
+ size_t mmap_used_last = 0;

- #if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
- const bool legacy_offload = true;
- #else
- const bool legacy_offload = false;
- #endif
+ // Returns false if cancelled by progress_callback
+ bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) {
+ GGML_ASSERT(size_data != 0 && "call init_mapping() first");

  std::vector<no_init<uint8_t>> read_buf;

- size_t size_done = 0;
-
- size_t mmap_first = -1;
- size_t mmap_last = 0;
-
  for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
  struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
- GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
+ if (!cur) {
+ // some tensors may be allocated in a different context
+ continue;
+ }

  if (progress_callback) {
  if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
@@ -2487,67 +2528,48 @@ struct llama_model_loader {

  const size_t offs = file_offset(ggml_get_name(cur));

- if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) {
- if (use_mmap && mapping) {
- if (buf_mmap) {
- ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
- if (lmlock) {
- lmlock->grow_to(offs + ggml_nbytes(cur));
- }
- mmap_first = std::min(mmap_first, offs);
- mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur));
- } else {
- ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
+ if (use_mmap && mapping) {
+ if (buf_mmap && cur->data == nullptr) {
+ ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
+ if (lmlock) {
+ lmlock->grow_to(offs + ggml_nbytes(cur));
  }
+ mmap_used_first = std::min(mmap_used_first, offs);
+ mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur));
  } else {
- if (ggml_backend_buffer_is_host(cur->buffer)) {
- file.seek(offs, SEEK_SET);
- file.read_raw(cur->data, ggml_nbytes(cur));
- } else {
- read_buf.resize(ggml_nbytes(cur));
- file.seek(offs, SEEK_SET);
- file.read_raw(read_buf.data(), ggml_nbytes(cur));
- ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
- }
+ ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
  }
  } else {
- // HACK: mark tensor as allocated
- cur->data = (void *)(uintptr_t)1;
- void * data;
- if (use_mmap && mapping) {
- data = (uint8_t *) mapping->addr + offs;
+ if (ggml_backend_buffer_is_host(cur->buffer)) {
+ file.seek(offs, SEEK_SET);
+ file.read_raw(cur->data, ggml_nbytes(cur));
  } else {
  read_buf.resize(ggml_nbytes(cur));
  file.seek(offs, SEEK_SET);
  file.read_raw(read_buf.data(), ggml_nbytes(cur));
- data = read_buf.data();
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
  }
-
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- ggml_cuda_transform_tensor(data, cur);
- #elif defined(GGML_USE_CLBLAST)
- GGML_ASSERT(cur->backend == GGML_BACKEND_GPU);
- ggml_cl_transform_tensor(data, cur);
- #else
- GGML_ASSERT(!"GPU tensor without a GPU backend");
- GGML_UNUSED(data);
- #endif
  }

  size_done += ggml_nbytes(cur);
  }

- // unmap offloaded tensors and metadata
- if (use_mmap && mapping) {
- mapping->unmap_fragment(0, mmap_first);
- mapping->unmap_fragment(mmap_last, mapping->size);
+ // check if this is the last call and do final cleanup
+ if (size_done >= size_data) {
+ // unmap offloaded tensors and metadata
+ if (use_mmap && mapping) {
+ mapping->unmap_fragment(0, mmap_used_first);
+ if (mmap_used_last != 0) {
+ mapping->unmap_fragment(mmap_used_last, mapping->size);
+ }
+ }
+ if (progress_callback) {
+ // Even though the model is done loading, we still honor
+ // cancellation since we need to free allocations.
+ return progress_callback(1.0f, progress_callback_user_data);
+ }
  }

- if (progress_callback) {
- // Even though the model is done loading, we still honor
- // cancellation since we need to free allocations.
- return progress_callback(1.0f, progress_callback_user_data);
- }
  return true;
  }
  };
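Note that load_all_data() is now stateful: init_mapping() pre-computes size_data, each call accumulates size_done, and the final unmapping plus the 100% progress report only happen once size_done reaches size_data, so the loader can be invoked once per model context. A hedged sketch of that calling pattern (model_ctxs, buf_for() and the callback names are illustrative):

    // one call per context; intermediate calls just stream their tensors in
    for (ggml_context * ctx : model_ctxs) {
        if (!loader.load_all_data(ctx, progress_cb, progress_ud, buf_for(ctx), use_mlock ? &mlock : nullptr)) {
            return false;   // the progress callback asked to cancel
        }
    }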
@@ -3176,6 +3198,7 @@ static bool llm_load_tensors(
  llama_model_loader & ml,
  llama_model & model,
  int n_gpu_layers,
+ enum llama_split_mode split_mode,
  int main_gpu,
  const float * tensor_split,
  bool use_mlock,
@@ -3183,702 +3206,574 @@ static bool llm_load_tensors(
3183
3206
  void * progress_callback_user_data) {
3184
3207
  model.t_start_us = ggml_time_us();
3185
3208
 
3186
- auto & ctx = model.ctx;
3187
3209
  auto & hparams = model.hparams;
3188
3210
 
3211
+ model.split_mode = split_mode;
3212
+ model.main_gpu = main_gpu;
3189
3213
  model.n_gpu_layers = n_gpu_layers;
3190
3214
 
3191
- size_t ctx_size = ggml_tensor_overhead() * ml.n_tensors;
3215
+ const int64_t n_layer = hparams.n_layer;
3216
+ const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
3192
3217
 
3193
- LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
3218
+ // there is very little benefit to offloading the input layer, so always keep it on the CPU
3219
+ model.buft_input = llama_default_buffer_type_cpu(true);
3194
3220
 
3195
- // create the ggml context
3221
+ model.buft_layer.resize(n_layer);
3222
+
3223
+ // assign cpu layers
3224
+ for (int64_t i = 0; i < i_gpu_start; ++i) {
3225
+ model.buft_layer[i] = llama_default_buffer_type_cpu(true);
3226
+ }
3227
+
3228
+ #ifdef GGML_USE_CUBLAS
3229
+ if (split_mode == LLAMA_SPLIT_LAYER) {
3230
+ // calculate the split points
3231
+ int device_count = ggml_backend_cuda_get_device_count();
3232
+ bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
3233
+ float splits[GGML_CUDA_MAX_DEVICES];
3234
+ if (all_zero) {
3235
+ // default split, by free memory
3236
+ for (int i = 0; i < device_count; ++i) {
3237
+ size_t total;
3238
+ size_t free;
3239
+ ggml_backend_cuda_get_device_memory(i, &total, &free);
3240
+ splits[i] = free;
3241
+ }
3242
+ } else {
3243
+ std::copy(tensor_split, tensor_split + device_count, splits);
3244
+ }
3245
+
3246
+ // sum and normalize the splits to get the split points
3247
+ float split_sum = 0.0f;
3248
+ for (int i = 0; i < device_count; ++i) {
3249
+ split_sum += splits[i];
3250
+ splits[i] = split_sum;
3251
+ }
3252
+ for (int i = 0; i < device_count; ++i) {
3253
+ splits[i] /= split_sum;
3254
+ }
3255
+
3256
+ // assign the repeating layers to the devices according to the splits
3257
+ int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
3258
+ for (int64_t i = i_gpu_start; i < n_layer; ++i) {
3259
+ int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits;
3260
+ model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
3261
+ }
3262
+ // assign the output layer
3263
+ if (n_gpu_layers > n_layer) {
3264
+ int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits;
3265
+ model.buft_output = llama_default_buffer_type_offload(layer_gpu);
3266
+ } else {
3267
+ model.buft_output = llama_default_buffer_type_cpu(true);
3268
+ }
3269
+ } else
3270
+ #endif
3196
3271
  {
3272
+ ggml_backend_buffer_type_t split_buft;
3273
+ if (split_mode == LLAMA_SPLIT_ROW) {
3274
+ split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
3275
+ } else {
3276
+ // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported
3277
+ split_buft = llama_default_buffer_type_offload(main_gpu);
3278
+ }
3279
+ // assign the repeating layers
3280
+ for (int64_t i = i_gpu_start; i < n_layer; ++i) {
3281
+ model.buft_layer[i] = {
3282
+ split_buft,
3283
+ llama_default_buffer_type_offload(main_gpu)
3284
+ };
3285
+ }
3286
+ // assign the output layer
3287
+ if (n_gpu_layers > n_layer) {
3288
+ model.buft_output = {
3289
+ split_buft,
3290
+ llama_default_buffer_type_offload(main_gpu)
3291
+ };
3292
+ } else {
3293
+ model.buft_output = llama_default_buffer_type_cpu(true);
3294
+ }
3295
+ }
3296
+
3297
+ // count used buffer types
3298
+ std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
3299
+ buft_layer_count[model.buft_input.buft]++;
3300
+ buft_layer_count[model.buft_input.buft_matrix]++;
3301
+ buft_layer_count[model.buft_output.buft]++;
3302
+ buft_layer_count[model.buft_output.buft_matrix]++;
3303
+ for (int64_t i = 0; i < n_layer; ++i) {
3304
+ buft_layer_count[model.buft_layer[i].buft]++;
3305
+ buft_layer_count[model.buft_layer[i].buft_matrix]++;
3306
+ }
3307
+
3308
+ // create one context per buffer type
3309
+ size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
3310
+ std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
3311
+ for (auto & it : buft_layer_count) {
3197
3312
  struct ggml_init_params params = {
3198
3313
  /*.mem_size =*/ ctx_size,
3199
3314
  /*.mem_buffer =*/ NULL,
3200
3315
  /*.no_alloc =*/ true,
3201
3316
  };
3202
-
3203
- model.ctx = ggml_init(params);
3204
- if (!model.ctx) {
3205
- throw std::runtime_error(format("ggml_init() failed"));
3317
+ ggml_context * ctx = ggml_init(params);
3318
+ if (!ctx) {
3319
+ throw std::runtime_error(format("failed to create context"));
3206
3320
  }
3321
+ ctx_map[it.first] = ctx;
3322
+ model.ctxs.push_back(ctx);
3207
3323
  }
3208
3324
 
3209
- (void) main_gpu;
3210
-
3211
- enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
3212
- enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
3213
-
3214
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
3215
- if (ggml_cublas_loaded()) {
3216
- LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
3217
- ggml_cuda_set_main_device(main_gpu);
3218
-
3219
- llama_backend_offload = GGML_BACKEND_GPU;
3220
- llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
3221
- }
3222
- #elif defined(GGML_USE_CLBLAST)
3223
- LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
3224
- llama_backend_offload = GGML_BACKEND_GPU;
3225
- llama_backend_offload_split = GGML_BACKEND_GPU;
3226
- #endif
3325
+ LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);
3227
3326
 
3228
3327
  // create tensors for the weights
3229
3328
  {
3230
3329
  const int64_t n_embd = hparams.n_embd;
3231
3330
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
3232
3331
  const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
3233
- const int64_t n_layer = hparams.n_layer;
3332
+ const int64_t n_embd_gqa = n_embd_v_gqa;
3234
3333
  const int64_t n_vocab = hparams.n_vocab;
3334
+ const int64_t n_ff = hparams.n_ff;
3335
+
3336
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3337
+
3338
+ ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
3339
+ ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
3340
+ ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
3341
+ auto ctx_for_layer = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); };
3342
+ auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); };
3343
+
3344
+ model.layers.resize(n_layer);
3235
3345
 
3236
3346
  const auto tn = LLM_TN(model.arch);
3237
3347
  switch (model.arch) {
3238
3348
  case LLM_ARCH_LLAMA:
3239
3349
  case LLM_ARCH_REFACT:
3240
3350
  {
3241
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3351
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3242
3352
 
3243
3353
  // output
3244
3354
  {
3245
- ggml_backend_type backend_norm;
3246
- ggml_backend_type backend_output;
3247
-
3248
- if (n_gpu_layers > int(n_layer)) {
3249
- backend_norm = llama_backend_offload;
3250
- backend_output = llama_backend_offload_split;
3251
- } else {
3252
- backend_norm = GGML_BACKEND_CPU;
3253
- backend_output = GGML_BACKEND_CPU;
3254
- }
3255
-
3256
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3257
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3355
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3356
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3258
3357
  }
3259
3358
 
3260
- const uint32_t n_ff = hparams.n_ff;
3261
- const int64_t n_embd_gqa = n_embd_v_gqa;
3262
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3263
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3264
-
3265
- const int i_gpu_start = n_layer - n_gpu_layers;
3266
-
3267
- model.layers.resize(n_layer);
3268
-
3269
- for (uint32_t i = 0; i < n_layer; ++i) {
3270
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3271
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3359
+ for (int i = 0; i < n_layer; ++i) {
3360
+ ggml_context * ctx_layer = ctx_for_layer(i);
3361
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3272
3362
 
3273
3363
  auto & layer = model.layers[i];
3274
3364
 
3275
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3365
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3276
3366
 
3277
- layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
3278
- layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3279
- layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3280
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3367
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3368
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3369
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3370
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3281
3371
 
3282
3372
  // optional bias tensors
3283
- layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend, false);
3284
- layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend, false);
3285
- layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend, false);
3286
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend, false);
3373
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
3374
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
3375
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
3376
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
3287
3377
 
3288
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3378
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3289
3379
 
3290
- layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
3380
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
3291
3381
 
3292
3382
  if (layer.ffn_gate_inp == nullptr) {
3293
3383
  GGML_ASSERT(hparams.n_expert == 0);
3294
3384
  GGML_ASSERT(hparams.n_expert_used == 0);
3295
3385
 
3296
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3297
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3298
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3386
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
3387
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
3388
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3299
3389
  } else {
3300
3390
  GGML_ASSERT(hparams.n_expert > 0);
3301
3391
  GGML_ASSERT(hparams.n_expert_used > 0);
3302
3392
 
3303
3393
  // MoE branch
3304
3394
  for (uint32_t x = 0; x < hparams.n_expert; ++x) {
3305
- layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
3306
- layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split);
3307
- layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
3395
+ layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff});
3396
+ layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd});
3397
+ layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
3308
3398
  }
3309
3399
  }
3310
3400
  }
3311
3401
  } break;
3312
3402
  case LLM_ARCH_BAICHUAN:
3313
3403
  {
3314
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3404
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3315
3405
  {
3316
- ggml_backend_type backend_norm;
3317
- ggml_backend_type backend_output;
3318
-
3319
- if (n_gpu_layers > int(n_layer)) {
3320
- backend_norm = llama_backend_offload;
3321
- backend_output = llama_backend_offload_split;
3322
- } else {
3323
- backend_norm = GGML_BACKEND_CPU;
3324
- backend_output = GGML_BACKEND_CPU;
3325
- }
3326
-
3327
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3328
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3406
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3407
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3329
3408
  }
3330
3409
 
3331
- const uint32_t n_ff = hparams.n_ff;
3332
- const int64_t n_embd_gqa = n_embd_v_gqa;
3333
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3334
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3335
-
3336
- const int i_gpu_start = n_layer - n_gpu_layers;
3337
-
3338
- model.layers.resize(n_layer);
3339
-
3340
- for (uint32_t i = 0; i < n_layer; ++i) {
3341
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3342
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3410
+ for (int i = 0; i < n_layer; ++i) {
3411
+ ggml_context * ctx_layer = ctx_for_layer(i);
3412
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3343
3413
 
3344
3414
  auto & layer = model.layers[i];
3345
3415
 
3346
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3416
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3347
3417
 
3348
- layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
3349
- layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3350
- layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3351
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3418
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3419
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3420
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3421
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3352
3422
 
3353
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3423
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3354
3424
 
3355
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3356
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3357
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3425
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
3426
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
3427
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3358
3428
  }
3359
3429
  } break;
3360
3430
  case LLM_ARCH_FALCON:
3361
3431
  {
3362
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3432
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3363
3433
 
3364
3434
  // output
3365
3435
  {
3366
- ggml_backend_type backend_norm;
3367
- ggml_backend_type backend_output;
3368
-
3369
- if (n_gpu_layers > int(n_layer)) {
3370
- backend_norm = llama_backend_offload;
3371
- backend_output = llama_backend_offload_split;
3372
- } else {
3373
- backend_norm = GGML_BACKEND_CPU;
3374
- backend_output = GGML_BACKEND_CPU;
3375
- }
3376
-
3377
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3378
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3379
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3436
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3437
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
3438
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3380
3439
  }
3381
3440
 
3382
- const uint32_t n_ff = hparams.n_ff;
3383
- const int64_t n_embd_gqa = n_embd_v_gqa;
3384
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3385
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3386
-
3387
- const int i_gpu_start = n_layer - n_gpu_layers;
3388
-
3389
- model.layers.resize(n_layer);
3390
-
3391
- for (uint32_t i = 0; i < n_layer; ++i) {
3392
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3393
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3441
+ for (int i = 0; i < n_layer; ++i) {
3442
+ ggml_context * ctx_layer = ctx_for_layer(i);
3443
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3394
3444
 
3395
3445
  auto & layer = model.layers[i];
3396
3446
 
3397
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3398
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3447
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3448
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3399
3449
 
3400
3450
  if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
3401
- layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
3402
- layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
3451
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
3452
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
3403
3453
  }
3404
3454
 
3405
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
3406
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3455
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
3456
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3407
3457
 
3408
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3409
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3458
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
3459
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3410
3460
  }
3411
3461
  } break;
3412
3462
  case LLM_ARCH_STARCODER:
3413
3463
  {
3414
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3415
- model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
3464
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3465
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
3416
3466
 
3417
3467
  // output
3418
3468
  {
3419
- ggml_backend_type backend_norm;
3420
- ggml_backend_type backend_output;
3421
-
3422
- if (n_gpu_layers > int(n_layer)) {
3423
- backend_norm = llama_backend_offload;
3424
- backend_output = llama_backend_offload_split;
3425
- } else {
3426
- backend_norm = GGML_BACKEND_CPU;
3427
- backend_output = GGML_BACKEND_CPU;
3428
- }
3429
-
3430
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3431
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3432
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3469
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3470
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
3471
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3433
3472
  }
3434
3473
 
3435
- const uint32_t n_ff = hparams.n_ff;
3436
- const int64_t n_embd_gqa = n_embd_v_gqa;
3437
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3438
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3439
-
3440
- const int i_gpu_start = n_layer - n_gpu_layers;
3441
-
3442
- model.layers.resize(n_layer);
3443
-
3444
- for (uint32_t i = 0; i < n_layer; ++i) {
3445
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3446
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3474
+ for (int i = 0; i < n_layer; ++i) {
3475
+ ggml_context * ctx_layer = ctx_for_layer(i);
3476
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3447
3477
 
3448
3478
  auto & layer = model.layers[i];
3449
3479
 
3450
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3451
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3480
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3481
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3452
3482
 
3453
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
3454
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
3483
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
3484
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
3455
3485
 
3456
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3457
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
3486
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3487
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
3458
3488
 
3459
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3460
- layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
3489
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3490
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
3461
3491
 
3462
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
3463
- layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
3492
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3493
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
3464
3494
 
3465
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3466
- layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
3495
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3496
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3467
3497
  }
3468
3498
  } break;
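// Note: this hunk (and the analogous hunks for the other architectures below) makes the same
// change everywhere in llm_load_tensors: instead of computing a ggml_backend_type per tensor
// from n_gpu_layers, each layer is assigned a ggml_context up front and ml.create_tensor no
// longer takes a backend argument. A minimal sketch of the idea; ctx_map, buft_layer and
// buft_layer_split are illustrative names, the real helpers are defined earlier in
// llm_load_tensors, outside this diff:

#include <map>
#include <vector>
#include "ggml-backend.h"

struct layer_placement_sketch {
    // one ggml_context per backend buffer type; a layer's tensors go into the context of
    // whichever buffer type that layer was assigned to (e.g. CPU below i_gpu_start, GPU above)
    std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
    std::vector<ggml_backend_buffer_type_t> buft_layer;        // per-layer buffer type
    std::vector<ggml_backend_buffer_type_t> buft_layer_split;  // per-layer split buffer type

    ggml_context * ctx_for_layer      (int il) const { return ctx_map.at(buft_layer[il]); }
    ggml_context * ctx_for_layer_split(int il) const { return ctx_map.at(buft_layer_split[il]); }
};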
3469
3499
  case LLM_ARCH_PERSIMMON:
3470
3500
  {
3471
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3501
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3472
3502
 
3473
3503
  {
3474
- ggml_backend_type backend_norm;
3475
- ggml_backend_type backend_output;
3476
-
3477
- if (n_gpu_layers > int(n_layer)) {
3478
- backend_norm = llama_backend_offload;
3479
- backend_output = llama_backend_offload_split;
3480
- } else {
3481
- backend_norm = GGML_BACKEND_CPU;
3482
- backend_output = GGML_BACKEND_CPU;
3483
- }
3484
-
3485
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3486
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3487
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3504
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3505
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
3506
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3488
3507
  }
3489
3508
 
3490
- const uint32_t n_ff = hparams.n_ff;
3491
- const int64_t n_embd_gqa = n_embd_v_gqa;
3492
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3493
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3509
+ for (int i = 0; i < n_layer; ++i) {
3510
+ ggml_context * ctx_layer = ctx_for_layer(i);
3511
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3494
3512
 
3495
- const int i_gpu_start = n_layer - n_gpu_layers;
3496
- model.layers.resize(n_layer);
3497
- for (uint32_t i = 0; i < n_layer; ++i) {
3498
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
3499
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
3500
3513
  auto & layer = model.layers[i];
3501
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3502
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3503
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
3504
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
3505
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3506
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
3507
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
3508
- layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
3509
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3510
- layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
3511
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3512
- layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
3513
- layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
3514
- layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend);
3515
- layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend);
3516
- layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
3514
+
3515
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3516
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3517
+
3518
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
3519
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
3520
+
3521
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3522
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
3523
+
3524
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3525
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
3526
+
3527
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3528
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3529
+
3530
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3531
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
3532
+
3533
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
3534
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
3535
+
3536
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
3537
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
3517
3538
  }
3518
3539
  } break;
3519
3540
  case LLM_ARCH_BLOOM:
3520
3541
  {
3521
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3522
- model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
3523
- model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
3542
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3543
+ model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
3544
+ model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
3524
3545
 
3525
3546
  // output
3526
3547
  {
3527
- ggml_backend_type backend_norm;
3528
- ggml_backend_type backend_output;
3529
-
3530
- if (n_gpu_layers > int(n_layer)) {
3531
- backend_norm = llama_backend_offload;
3532
- backend_output = llama_backend_offload_split;
3533
- } else {
3534
- backend_norm = GGML_BACKEND_CPU;
3535
- backend_output = GGML_BACKEND_CPU;
3536
- }
3537
-
3538
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3539
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3540
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3548
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3549
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
3550
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3541
3551
  }
3542
3552
 
3543
- const uint32_t n_ff = hparams.n_ff;
3544
- const int64_t n_embd_gqa = n_embd_v_gqa;
3545
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3546
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3547
-
3548
- const int i_gpu_start = n_layer - n_gpu_layers;
3549
-
3550
- model.layers.resize(n_layer);
3551
-
3552
- for (uint32_t i = 0; i < n_layer; ++i) {
3553
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3554
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3553
+ for (int i = 0; i < n_layer; ++i) {
3554
+ ggml_context * ctx_layer = ctx_for_layer(i);
3555
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3555
3556
 
3556
3557
  auto & layer = model.layers[i];
3557
3558
 
3558
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3559
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3559
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3560
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3560
3561
 
3561
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
3562
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
3562
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
3563
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
3563
3564
 
3564
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3565
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
3565
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3566
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
3566
3567
 
3567
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3568
- layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
3568
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3569
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
3569
3570
 
3570
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
3571
- layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
3571
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3572
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
3572
3573
 
3573
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3574
- layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
3574
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3575
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3575
3576
  }
3576
3577
  } break;
3577
3578
  case LLM_ARCH_MPT:
3578
3579
  {
3579
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3580
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3581
+
3580
3582
  // output
3581
3583
  {
3582
- ggml_backend_type backend_norm;
3583
- ggml_backend_type backend_output;
3584
-
3585
- if (n_gpu_layers > int(n_layer)) {
3586
- backend_norm = llama_backend_offload;
3587
- backend_output = llama_backend_offload_split;
3588
- } else {
3589
- backend_norm = GGML_BACKEND_CPU;
3590
- backend_output = GGML_BACKEND_CPU;
3591
- }
3592
-
3593
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3594
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3584
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3585
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3595
3586
  }
3596
3587
 
3597
- const uint32_t n_ff = hparams.n_ff;
3598
- const int64_t n_embd_gqa = n_embd_v_gqa;
3599
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3600
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3601
-
3602
- const int i_gpu_start = n_layer - n_gpu_layers;
3603
-
3604
- model.layers.resize(n_layer);
3605
-
3606
- for (uint32_t i = 0; i < n_layer; ++i) {
3607
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3608
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3588
+ for (int i = 0; i < n_layer; ++i) {
3589
+ ggml_context * ctx_layer = ctx_for_layer(i);
3590
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3609
3591
 
3610
3592
  auto & layer = model.layers[i];
3611
3593
 
3612
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3613
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
3614
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3594
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3615
3595
 
3616
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3596
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
3597
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3617
3598
 
3618
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3619
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3599
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3600
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
3601
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3620
3602
 
3621
3603
  // AWQ ScaleActivation layer
3622
- layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend, false);
3604
+ layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
3623
3605
  }
3624
3606
  } break;
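// Note: the trailing false on the ffn_act line is the "required" flag -- with the backend
// argument removed, it is now the last parameter of create_tensor, and a missing optional
// tensor (here the AWQ activation scales) is simply left unset instead of failing the load.
// Assumed shape of the loader helper after this change (a sketch, not copied from the source):
//
//     struct ggml_tensor * llama_model_loader::create_tensor(
//             struct ggml_context * ctx, const std::string & name,
//             const std::vector<int64_t> & ne, bool required = true);  // NULL if absent and not required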
3625
3607
  case LLM_ARCH_STABLELM:
3626
3608
  {
3627
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3609
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3628
3610
 
3629
3611
  // output
3630
3612
  {
3631
- ggml_backend_type backend_norm;
3632
- ggml_backend_type backend_output;
3633
-
3634
- if (n_gpu_layers > int(n_layer)) {
3635
- backend_norm = llama_backend_offload;
3636
- backend_output = llama_backend_offload_split;
3637
- } else {
3638
- backend_norm = GGML_BACKEND_CPU;
3639
- backend_output = GGML_BACKEND_CPU;
3640
- }
3641
-
3642
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3643
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3644
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3613
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
3614
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3615
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3645
3616
  }
3646
3617
 
3647
- const uint32_t n_ff = hparams.n_ff;
3648
- const int64_t n_embd_gqa = n_embd_v_gqa;
3649
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3650
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3651
-
3652
- const int i_gpu_start = n_layer - n_gpu_layers;
3653
-
3654
- model.layers.resize(n_layer);
3655
-
3656
- for (uint32_t i = 0; i < n_layer; ++i) {
3657
- /*
3658
- llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
3659
- */
3660
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3661
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3618
+ for (int i = 0; i < n_layer; ++i) {
3619
+ ggml_context * ctx_layer = ctx_for_layer(i);
3620
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3662
3621
 
3663
3622
  auto & layer = model.layers[i];
3664
3623
 
3665
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3666
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3624
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3625
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3667
3626
 
3668
- layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
3669
- layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3670
- layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3671
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3627
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3628
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3629
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3630
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3672
3631
 
3673
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3674
- layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
3632
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3633
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
3675
3634
 
3676
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3677
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3678
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3635
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
3636
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
3637
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3679
3638
  }
3680
3639
  } break;
3681
3640
  case LLM_ARCH_QWEN:
3682
3641
  {
3683
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3684
- {
3685
- ggml_backend_type backend_norm;
3686
- ggml_backend_type backend_output;
3687
-
3688
- if (n_gpu_layers > int(n_layer)) {
3689
- backend_norm = llama_backend_offload;
3690
- backend_output = llama_backend_offload_split;
3691
- } else {
3692
- backend_norm = GGML_BACKEND_CPU;
3693
- backend_output = GGML_BACKEND_CPU;
3694
- }
3642
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3695
3643
 
3696
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3697
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3698
- }
3699
-
3700
- const uint32_t n_ff = hparams.n_ff / 2;
3701
-
3702
- const int i_gpu_start = n_layer - n_gpu_layers;
3703
-
3704
- model.layers.resize(n_layer);
3644
+ // output
3645
+ {
3646
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3647
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3648
+ }
3705
3649
 
3706
- for (uint32_t i = 0; i < n_layer; ++i) {
3707
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3708
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3650
+ for (int i = 0; i < n_layer; ++i) {
3651
+ ggml_context * ctx_layer = ctx_for_layer(i);
3652
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3709
3653
 
3710
3654
  auto & layer = model.layers[i];
3711
3655
 
3712
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3656
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3713
3657
 
3714
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split);
3715
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}, backend);
3716
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3658
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3});
3659
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3});
3660
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3717
3661
 
3718
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3662
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3719
3663
 
3720
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3721
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3722
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3664
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2});
3665
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd});
3666
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2});
3723
3667
  }
3724
3668
  } break;
3725
3669
  case LLM_ARCH_PHI2:
3726
3670
  {
3727
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3671
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3728
3672
 
3729
3673
  // output
3730
3674
  {
3731
- ggml_backend_type backend_norm;
3732
- ggml_backend_type backend_output;
3733
-
3734
- if (n_gpu_layers > int(n_layer)) {
3735
- backend_norm = llama_backend_offload;
3736
- backend_output = llama_backend_offload;
3737
- } else {
3738
- backend_norm = GGML_BACKEND_CPU;
3739
- backend_output = GGML_BACKEND_CPU;
3740
- }
3741
-
3742
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3743
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3744
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3745
- model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
3675
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3676
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
3677
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3678
+ model.output_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab});
3746
3679
  }
3747
3680
 
3748
- const uint32_t n_ff = hparams.n_ff;
3749
- const int64_t n_embd_gqa = n_embd_v_gqa;
3750
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3751
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3681
+ for (int i = 0; i < n_layer; ++i) {
3682
+ ggml_context * ctx_layer = ctx_for_layer(i);
3683
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3752
3684
 
3753
- const int i_gpu_start = n_layer - n_gpu_layers;
3685
+ auto & layer = model.layers[i];
3754
3686
 
3755
- model.layers.resize(n_layer);
3687
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3688
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3756
3689
 
3757
- for (uint32_t i = 0; i < n_layer; ++i) {
3758
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3759
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3690
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, false);
3691
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, false);
3760
3692
 
3761
- auto & layer = model.layers[i];
3693
+ if (layer.wqkv == nullptr) {
3694
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3695
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd});
3762
3696
 
3763
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3764
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3697
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3698
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa});
3765
3699
 
3766
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
3767
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
3700
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3701
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa});
3702
+ }
3768
3703
 
3769
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3770
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
3704
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3705
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
3771
3706
 
3772
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
3773
- layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
3707
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3708
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
3774
3709
 
3775
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3776
- layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
3710
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3711
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3777
3712
  }
3778
3713
  } break;
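// Note: for Phi-2 the fused QKV tensors are requested with required == false, so wqkv/bqkv come
// back as NULL when the GGUF only ships separate projections, and the loader falls back to
// per-tensor wq/wk/wv plus biases as in the added block above. The graph builder mirrors the
// same branch further down ("if (model.layers[il].wqkv) { ... } else { ... }"), so both tensor
// layouts produce the same Qcur/Kcur/Vcur. Condensed from the hunk above:
//
//     layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i),
//                                   {n_embd, n_embd + 2*n_embd_gqa}, /*required =*/ false);
//     if (layer.wqkv == nullptr) {
//         layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
//         layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
//         layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
//     }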
3779
3714
  case LLM_ARCH_PLAMO:
3780
3715
  {
3781
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3716
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3782
3717
 
3783
3718
  // output
3784
3719
  {
3785
- ggml_backend_type backend_norm;
3786
- ggml_backend_type backend_output;
3787
-
3788
- if (n_gpu_layers > int(n_layer)) {
3789
- backend_norm = llama_backend_offload;
3790
- backend_output = llama_backend_offload_split;
3791
- } else {
3792
- backend_norm = GGML_BACKEND_CPU;
3793
- backend_output = GGML_BACKEND_CPU;
3794
- }
3795
-
3796
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3797
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3720
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3721
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3798
3722
  }
3799
3723
 
3800
- const uint32_t n_ff = hparams.n_ff;
3801
- const int64_t n_embd_gqa = n_embd_v_gqa;
3802
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3803
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3804
-
3805
- const int i_gpu_start = n_layer - n_gpu_layers;
3806
-
3807
- model.layers.resize(n_layer);
3808
-
3809
- for (uint32_t i = 0; i < n_layer; ++i) {
3810
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3811
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3724
+ for (int i = 0; i < n_layer; ++i) {
3725
+ ggml_context * ctx_layer = ctx_for_layer(i);
3726
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3812
3727
 
3813
3728
  auto & layer = model.layers[i];
3814
3729
 
3815
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3730
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3816
3731
 
3817
- layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
3818
- layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3819
- layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
3820
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3732
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
3733
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
3734
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
3735
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3821
3736
 
3822
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
3823
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
3824
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3737
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
3738
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
3739
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3825
3740
  }
3826
3741
  } break;
3827
3742
  case LLM_ARCH_GPT2:
3828
3743
  {
3829
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
3830
- model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
3744
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
3745
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
3831
3746
 
3832
3747
  // output
3833
3748
  {
3834
- ggml_backend_type backend_norm;
3835
- ggml_backend_type backend_output;
3836
-
3837
- if (n_gpu_layers > int(n_layer)) {
3838
- backend_norm = llama_backend_offload;
3839
- backend_output = llama_backend_offload_split;
3840
- } else {
3841
- backend_norm = GGML_BACKEND_CPU;
3842
- backend_output = GGML_BACKEND_CPU;
3843
- }
3844
-
3845
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
3846
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
3847
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
3749
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
3750
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
3751
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
3848
3752
  }
3849
3753
 
3850
- const uint32_t n_ff = hparams.n_ff;
3851
- const int64_t n_embd_gqa = n_embd_v_gqa;
3852
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
3853
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
3854
-
3855
- const int i_gpu_start = n_layer - n_gpu_layers;
3856
-
3857
- model.layers.resize(n_layer);
3858
-
3859
- for (uint32_t i = 0; i < n_layer; ++i) {
3860
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
3861
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
3754
+ for (int i = 0; i < n_layer; ++i) {
3755
+ ggml_context * ctx_layer = ctx_for_layer(i);
3756
+ ggml_context * ctx_split = ctx_for_layer_split(i);
3862
3757
 
3863
3758
  auto & layer = model.layers[i];
3864
3759
 
3865
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
3866
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
3760
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
3761
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
3867
3762
 
3868
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
3869
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
3763
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
3764
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
3870
3765
 
3871
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
3872
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
3766
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
3767
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
3873
3768
 
3874
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
3875
- layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
3769
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
3770
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
3876
3771
 
3877
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
3878
- layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
3772
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
3773
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
3879
3774
 
3880
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
3881
- layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
3775
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
3776
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
3882
3777
  }
3883
3778
  } break;
3884
3779
  default:
@@ -3888,78 +3783,51 @@ static bool llm_load_tensors(
3888
3783
 
3889
3784
  ml.done_getting_tensors();
3890
3785
 
3891
- ml.init_mapping();
3786
+ ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);
3892
3787
 
3893
- // allocate tensors
3894
- size_t vram_weights = 0;
3895
- size_t buf_size = 0;
3788
+ // create the backend buffers
3789
+ std::vector<std::pair<ggml_context *, ggml_backend_buffer_t>> ctx_bufs;
3896
3790
 
3897
- ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers);
3791
+ for (auto & it : ctx_map) {
3792
+ ggml_backend_buffer_type_t buft = it.first;
3793
+ ggml_context * ctx = it.second;
3794
+ ggml_backend_buffer_t buf = nullptr;
3898
3795
 
3899
- for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
3900
- // GGML_BACKEND_GPU tensors are for CUDA and OpenCL only, which are handled separately without ggml-backend
3901
- if (t->backend == GGML_BACKEND_CPU) {
3902
- buf_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft));
3903
- } else {
3904
- vram_weights += ggml_nbytes(t);
3796
+ // only the mmap region containing the tensors in the model is mapped to the backend buffer
3797
+ // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
3798
+ // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
3799
+ if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
3800
+ size_t first, last;
3801
+ ml.get_mapping_range(&first, &last, ctx);
3802
+ buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
3905
3803
  }
3906
- }
3907
-
3908
- // create backend buffer
3909
- ggml_backend_buffer_t buf_mmap = nullptr;
3910
-
3911
3804
  #ifdef GGML_USE_METAL
3912
- if (n_gpu_layers > 0) {
3913
- if (ml.use_mmap) {
3805
+ else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
3914
3806
  const size_t max_size = ggml_get_max_tensor_size(ctx);
3915
- model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size);
3916
- buf_mmap = model.buf;
3917
- } else {
3918
- model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
3807
+ size_t first, last;
3808
+ ml.get_mapping_range(&first, &last, ctx);
3809
+ buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size);
3919
3810
  }
3920
- }
3921
- #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
3922
- // for testing only
3923
- if (n_gpu_layers > 0) {
3924
- model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0));
3925
- }
3926
3811
  #endif
3927
-
3928
- if (model.buf == nullptr) {
3929
- // CPU backend, and indirectly CUDA and OpenCL
3930
- if (ml.use_mmap) {
3931
- model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size);
3932
- buf_mmap = model.buf;
3933
- } else {
3934
- // allocate only CPU tensors
3935
- model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
3936
- ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf);
3937
- for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
3938
- if (t->backend == GGML_BACKEND_CPU) {
3939
- ggml_tallocr_alloc(alloc, t);
3940
- }
3812
+ else {
3813
+ buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
3814
+ if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
3815
+ model.mlock_buf.init (ggml_backend_buffer_get_base(buf));
3816
+ model.mlock_buf.grow_to(ggml_backend_buffer_get_size(buf));
3941
3817
  }
3942
- ggml_tallocr_free(alloc);
3943
3818
  }
3944
- }
3945
-
3946
- if (use_mlock && ggml_backend_buffer_is_host(model.buf)) {
3947
- model.mlock_buf.init (ggml_backend_buffer_get_base(model.buf));
3948
- model.mlock_buf.grow_to(ggml_backend_buffer_get_size(model.buf));
3819
+ if (buf == nullptr) {
3820
+ throw std::runtime_error("failed to allocate buffer");
3821
+ }
3822
+ // indicate that this buffer contains weights
3823
+ // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
3824
+ ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
3825
+ model.bufs.push_back(buf);
3826
+ ctx_bufs.emplace_back(ctx, buf);
3949
3827
  }
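// Note: this loop is the core of the new allocation scheme -- exactly one backend buffer per
// ggml_context collected in ctx_map. When the file is mmapped and the buffer type is the host
// CPU type (or the Metal type), only the byte range of the mapping that holds this context's
// tensors is wrapped in a buffer, which is what allows partial offloading when the model is
// larger than a single Metal buffer. A reduced sketch of wrapping such a sub-range (not a
// drop-in function; it uses only calls that appear in the hunk above):

// given the mapping and the [first, last) byte range of this context's tensors
size_t first, last;
ml.get_mapping_range(&first, &last, ctx);
ggml_backend_buffer_t buf =
    ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
// mark the buffer as holding weights so ggml_backend_sched prefers scheduling ops next to them
ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);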
3950
3828
 
3951
3829
  // print memory requirements
3952
3830
  {
3953
- size_t sys_mem_required = ctx_size + buf_size;
3954
-
3955
- if (sys_mem_required > 0) {
3956
- LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0);
3957
- }
3958
- if (vram_weights > 0) {
3959
- LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
3960
- }
3961
-
3962
- #if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
3963
3831
  const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
3964
3832
 
3965
3833
  LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
@@ -3971,23 +3839,26 @@ static bool llm_load_tensors(
3971
3839
  const int max_offloadable_layers = hparams.n_layer + 1;
3972
3840
 
3973
3841
  LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
3974
- #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
3975
- }
3976
3842
 
3977
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
3978
- ggml_cuda_set_tensor_split(tensor_split);
3979
- #else
3980
- GGML_UNUSED(tensor_split);
3981
- #endif // GGML_USE_CUBLAS
3843
+ for (ggml_backend_buffer_t buf : model.bufs) {
3844
+ LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
3845
+ }
3846
+ }
3982
3847
 
3983
3848
  // populate tensors_by_name
3984
- for (int i = 0; i < ml.n_tensors; ++i) {
3985
- struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
3986
- model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
3849
+ for (ggml_context * ctx : model.ctxs) {
3850
+ for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
3851
+ model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
3852
+ }
3987
3853
  }
3988
3854
 
3989
- if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
3990
- return false;
3855
+ // load tensor data
3856
+ for (auto & it : ctx_bufs) {
3857
+ ggml_context * ctx = it.first;
3858
+ ggml_backend_buffer_t buf = it.second;
3859
+ if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) {
3860
+ return false;
3861
+ }
3991
3862
  }
3992
3863
 
3993
3864
  model.mapping = std::move(ml.mapping);
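// Note: with the weights spread over several ggml_contexts, anything that used to index
// ml.n_tensors now walks each context with the standard ggml iteration pair
// ggml_get_first_tensor / ggml_get_next_tensor, and load_all_data runs once per
// (context, buffer) pair from ctx_bufs. The same idiom works for any per-context accounting;
// for illustration, totalling the weight bytes across the model:

size_t total_bytes = 0;
for (ggml_context * ctx : model.ctxs) {
    for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
        total_bytes += ggml_nbytes(t);
    }
}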
@@ -4021,13 +3892,13 @@ static int llama_model_load(const std::string & fname, llama_model & model, cons
4021
3892
  }
4022
3893
 
4023
3894
  if (!llm_load_tensors(
4024
- ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
3895
+ ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
4025
3896
  params.progress_callback, params.progress_callback_user_data
4026
3897
  )) {
4027
3898
  return -2;
4028
3899
  }
4029
3900
  } catch (const std::exception & err) {
4030
- LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
3901
+ LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
4031
3902
  return -1;
4032
3903
  }
4033
3904
 
@@ -4099,7 +3970,6 @@ static void llm_build_k_shift(
4099
3970
  struct ggml_cgraph * graph,
4100
3971
  llm_rope_type type,
4101
3972
  int64_t n_ctx,
4102
- int n_rot,
4103
3973
  float freq_base,
4104
3974
  float freq_scale,
4105
3975
  const llm_build_cb & cb) {
@@ -4107,14 +3977,13 @@ static void llm_build_k_shift(
4107
3977
  const int64_t n_head_kv = hparams.n_head_kv;
4108
3978
  const int64_t n_embd_head_k = hparams.n_embd_head_k;
4109
3979
  const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
3980
+ const int32_t n_rot = hparams.n_rot;
4110
3981
  const int32_t n_orig_ctx = cparams.n_yarn_orig_ctx;
4111
3982
  const float ext_factor = cparams.yarn_ext_factor;
4112
3983
  const float attn_factor = cparams.yarn_attn_factor;
4113
3984
  const float beta_fast = cparams.yarn_beta_fast;
4114
3985
  const float beta_slow = cparams.yarn_beta_slow;
4115
3986
 
4116
- GGML_ASSERT(n_embd_head_k % n_rot == 0);
4117
-
4118
3987
  struct ggml_tensor * K_shift = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_ctx);
4119
3988
  cb(K_shift, "K_shift", -1);
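// Note: llm_build_k_shift no longer takes the rotation width as a parameter -- it reads
// hparams.n_rot itself, and the old n_embd_head_k % n_rot assertion moves out to the
// individual graph builders (the GGML_ASSERT(n_embd_head == hparams.n_rot) lines added below).
// Call sites therefore drop one argument:
//
//     // before
//     llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
//     // after
//     llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);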
4120
3989
 
@@ -4473,8 +4342,6 @@ struct llm_build_context {
4473
4342
  do_rope_shift (worst_case || kv_self.has_shift),
4474
4343
  cb (cb),
4475
4344
  buf_compute_meta (lctx.buf_compute_meta) {
4476
- GGML_ASSERT(!!kv_self.ctx);
4477
-
4478
4345
  // all initializations should be done in init()
4479
4346
  }
4480
4347
 
@@ -4518,7 +4385,7 @@ struct llm_build_context {
4518
4385
 
4519
4386
  // shift the entire K-cache if needed
4520
4387
  if (do_rope_shift) {
4521
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
4388
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
4522
4389
  }
4523
4390
 
4524
4391
  for (int il = 0; il < n_layer; ++il) {
@@ -4554,16 +4421,22 @@ struct llm_build_context {
4554
4421
  cb(Vcur, "Vcur", il);
4555
4422
  }
4556
4423
 
4424
+ // these nodes are added to the graph together so that they are not reordered
4425
+ // by doing so, the number of splits in the graph is reduced
4426
+ ggml_build_forward_expand(gf, Qcur);
4427
+ ggml_build_forward_expand(gf, Kcur);
4428
+ ggml_build_forward_expand(gf, Vcur);
4429
+
4557
4430
  Qcur = ggml_rope_custom(
4558
4431
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
4559
- n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
4432
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
4560
4433
  ext_factor, attn_factor, beta_fast, beta_slow
4561
4434
  );
4562
4435
  cb(Qcur, "Qcur", il);
4563
4436
 
4564
4437
  Kcur = ggml_rope_custom(
4565
4438
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
4566
- n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
4439
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
4567
4440
  ext_factor, attn_factor, beta_fast, beta_slow
4568
4441
  );
4569
4442
  cb(Kcur, "Kcur", il);
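// Note: the three ggml_build_forward_expand calls pin Qcur, Kcur and Vcur into the graph next
// to each other before the RoPE nodes are appended; with the graph executed through the backend
// scheduler, keeping nodes that belong on the same backend adjacent reduces the number of graph
// splits (and therefore backend switches) per layer. The pattern is simply:
//
//     ggml_build_forward_expand(gf, Qcur);
//     ggml_build_forward_expand(gf, Kcur);
//     ggml_build_forward_expand(gf, Vcur);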
@@ -4686,6 +4559,7 @@ struct llm_build_context {
4686
4559
 
4687
4560
  const int64_t n_embd_head = hparams.n_embd_head_v;
4688
4561
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
4562
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
4689
4563
 
4690
4564
  struct ggml_tensor * cur;
4691
4565
  struct ggml_tensor * inpL;
@@ -4703,7 +4577,7 @@ struct llm_build_context {
4703
4577
 
4704
4578
  // shift the entire K-cache if needed
4705
4579
  if (do_rope_shift) {
4706
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
4580
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
4707
4581
  }
4708
4582
 
4709
4583
  for (int il = 0; il < n_layer; ++il) {
@@ -4729,12 +4603,12 @@ struct llm_build_context {
4729
4603
  case MODEL_7B:
4730
4604
  Qcur = ggml_rope_custom(
4731
4605
  ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
4732
- n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
4606
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
4733
4607
  ext_factor, attn_factor, beta_fast, beta_slow
4734
4608
  );
4735
4609
  Kcur = ggml_rope_custom(
4736
4610
  ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
4737
- n_embd_head, 0, 0, n_orig_ctx, freq_base, freq_scale,
4611
+ hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
4738
4612
  ext_factor, attn_factor, beta_fast, beta_slow
4739
4613
  );
4740
4614
  break;
@@ -4807,6 +4681,7 @@ struct llm_build_context {
4807
4681
  const int64_t n_embd_head = hparams.n_embd_head_v;
4808
4682
  const int64_t n_embd_gqa = hparams.n_embd_v_gqa();
4809
4683
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
4684
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
4810
4685
 
4811
4686
  struct ggml_tensor * cur;
4812
4687
  struct ggml_tensor * inpL;
@@ -4824,7 +4699,7 @@ struct llm_build_context {
4824
4699
 
4825
4700
  // shift the entire K-cache if needed
4826
4701
  if (do_rope_shift) {
4827
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
4702
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
4828
4703
  }
4829
4704
 
4830
4705
  for (int il = 0; il < n_layer; ++il) {
@@ -4865,13 +4740,13 @@ struct llm_build_context {
4865
4740
 
4866
4741
  // using mode = 2 for neox mode
4867
4742
  Qcur = ggml_rope_custom(
4868
- ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
4743
+ ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
4869
4744
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
4870
4745
  );
4871
4746
  cb(Qcur, "Qcur", il);
4872
4747
 
4873
4748
  Kcur = ggml_rope_custom(
4874
- ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
4749
+ ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
4875
4750
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
4876
4751
  );
4877
4752
  cb(Kcur, "Kcur", il);
@@ -5028,15 +4903,14 @@ struct llm_build_context {
5028
4903
  struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, LLAMA_MAX_NODES, false);
5029
4904
 
5030
4905
  const int64_t n_embd_head = hparams.n_embd_head_v;
5031
- GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5032
-
5033
- const int64_t n_rot = n_embd_head_k / 2;
4906
+ GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
4907
+ GGML_ASSERT(n_embd_head/2 == hparams.n_rot);
5034
4908
 
5035
4909
  struct ggml_tensor * cur;
5036
4910
  struct ggml_tensor * inpL;
5037
4911
 
5038
4912
  inpL = llm_build_inp_embd(ctx0, hparams, batch, model.tok_embd, cb);
5039
- cb(inpL, "imp_embd", -1);
4913
+ cb(inpL, "inp_embd", -1);
5040
4914
 
5041
4915
  // inp_pos - contains the positions
5042
4916
  struct ggml_tensor * inp_pos = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
@@ -5047,7 +4921,7 @@ struct llm_build_context {
5047
4921
  cb(KQ_mask, "KQ_mask", -1);
5048
4922
 
5049
4923
  if (do_rope_shift) {
5050
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
4924
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5051
4925
  }
5052
4926
 
5053
4927
  for (int il = 0; il < n_layer; ++il) {
@@ -5107,7 +4981,7 @@ struct llm_build_context {
5107
4981
 
5108
4982
  // RoPE the first n_rot of q/k, pass the other half, and concat.
5109
4983
  struct ggml_tensor * qrot = ggml_view_3d(
5110
- ctx0, tmpq, n_rot, n_head, n_tokens,
4984
+ ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
5111
4985
  ggml_element_size(tmpq) * n_embd_head,
5112
4986
  ggml_element_size(tmpq) * n_embd_head * n_head,
5113
4987
  0
@@ -5115,7 +4989,7 @@ struct llm_build_context {
5115
4989
  cb(qrot, "qrot", il);
5116
4990
 
5117
4991
  struct ggml_tensor * krot = ggml_view_3d(
5118
- ctx0, tmpk, n_rot, n_head, n_tokens,
4992
+ ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
5119
4993
  ggml_element_size(tmpk) * n_embd_head,
5120
4994
  ggml_element_size(tmpk) * n_embd_head * n_head,
5121
4995
  0
@@ -5124,29 +4998,29 @@ struct llm_build_context {
5124
4998
 
5125
4999
  // get the second half of tmpq, e.g tmpq[n_rot:, :, :]
5126
5000
  struct ggml_tensor * qpass = ggml_view_3d(
5127
- ctx0, tmpq, n_rot, n_head, n_tokens,
5001
+ ctx0, tmpq, hparams.n_rot, n_head, n_tokens,
5128
5002
  ggml_element_size(tmpq) * n_embd_head,
5129
5003
  ggml_element_size(tmpq) * n_embd_head * n_head,
5130
- ggml_element_size(tmpq) * n_rot
5004
+ ggml_element_size(tmpq) * hparams.n_rot
5131
5005
  );
5132
5006
  cb(qpass, "qpass", il);
5133
5007
 
5134
5008
  struct ggml_tensor * kpass = ggml_view_3d(
5135
- ctx0, tmpk, n_rot, n_head, n_tokens,
5009
+ ctx0, tmpk, hparams.n_rot, n_head, n_tokens,
5136
5010
  ggml_element_size(tmpk) * n_embd_head,
5137
5011
  ggml_element_size(tmpk) * n_embd_head * n_head,
5138
- ggml_element_size(tmpk) * n_rot
5012
+ ggml_element_size(tmpk) * hparams.n_rot
5139
5013
  );
5140
5014
  cb(kpass, "kpass", il);
5141
5015
 
5142
5016
  struct ggml_tensor * qrotated = ggml_rope_custom(
5143
- ctx0, qrot, inp_pos, n_rot, 2, 0, n_orig_ctx,
5017
+ ctx0, qrot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5144
5018
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5145
5019
  );
5146
5020
  cb(qrotated, "qrotated", il);
5147
5021
 
5148
5022
  struct ggml_tensor * krotated = ggml_rope_custom(
5149
- ctx0, krot, inp_pos, n_rot, 2, 0, n_orig_ctx,
5023
+ ctx0, krot, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5150
5024
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5151
5025
  );
5152
5026
  cb(krotated, "krotated", il);
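// Note: Persimmon applies RoPE only to the first hparams.n_rot features of each head and passes
// the rest through unchanged; the builder previously recomputed n_rot = n_embd_head_k / 2
// locally, now it uses hparams.n_rot and checks it once via the added
// GGML_ASSERT(n_embd_head/2 == hparams.n_rot). Per head, the qrot/qpass (and krot/kpass) views
// split tmpq like this:
//
//     // columns [0, n_rot)            -> qrot  -> ggml_rope_custom(...)
//     // columns [n_rot, n_embd_head)  -> qpass -> concatenated back without rotation
//     // byte offset of the pass-through half inside tmpq:
//     const size_t qpass_offset = ggml_element_size(tmpq) * hparams.n_rot;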
@@ -5543,7 +5417,7 @@ struct llm_build_context {
5543
5417
 
5544
5418
  // shift the entire K-cache if needed
5545
5419
  if (do_rope_shift) {
5546
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, hparams.n_rot, freq_base, freq_scale, cb);
5420
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5547
5421
  }
5548
5422
 
5549
5423
  for (int il = 0; il < n_layer; ++il) {
@@ -5656,7 +5530,7 @@ struct llm_build_context {
5656
5530
 
5657
5531
  // shift the entire K-cache if needed
5658
5532
  if (do_rope_shift) {
5659
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
5533
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5660
5534
  }
5661
5535
 
5662
5536
  for (int il = 0; il < n_layer; ++il) {
@@ -5688,13 +5562,13 @@ struct llm_build_context {
5688
5562
 
5689
5563
  // using mode = 2 for neox mode
5690
5564
  Qcur = ggml_rope_custom(
5691
- ctx0, Qcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
5565
+ ctx0, Qcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5692
5566
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5693
5567
  );
5694
5568
  cb(Qcur, "Qcur", il);
5695
5569
 
5696
5570
  Kcur = ggml_rope_custom(
5697
- ctx0, Kcur, inp_pos, n_embd_head, 2, 0, n_orig_ctx,
5571
+ ctx0, Kcur, inp_pos, hparams.n_rot, 2, 0, n_orig_ctx,
5698
5572
  freq_base, freq_scale, ext_factor, attn_factor, beta_fast, beta_slow
5699
5573
  );
5700
5574
  cb(Kcur, "Kcur", il);
@@ -5773,7 +5647,7 @@ struct llm_build_context {
5773
5647
 
5774
5648
  // shift the entire K-cache if needed
5775
5649
  if (do_rope_shift) {
5776
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, n_embd_head, freq_base, freq_scale, cb);
5650
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE_NEOX, n_ctx, freq_base, freq_scale, cb);
5777
5651
  }
5778
5652
 
5779
5653
  for (int il = 0; il < n_layer; ++il) {
@@ -5785,15 +5659,25 @@ struct llm_build_context {
5785
5659
 
5786
5660
  // self-attention
5787
5661
  {
5788
- cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
5789
- cb(cur, "wqkv", il);
5662
+ struct ggml_tensor * Qcur = nullptr;
5663
+ struct ggml_tensor * Kcur = nullptr;
5664
+ struct ggml_tensor * Vcur = nullptr;
5790
5665
 
5791
- cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5792
- cb(cur, "bqkv", il);
5666
+ if (model.layers[il].wqkv) {
5667
+ cur = ggml_mul_mat(ctx0, model.layers[il].wqkv, attn_norm_output);
5668
+ cb(cur, "wqkv", il);
5793
5669
 
5794
- struct ggml_tensor * Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5795
- struct ggml_tensor * Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5796
- struct ggml_tensor * Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5670
+ cur = ggml_add(ctx0, cur, model.layers[il].bqkv);
5671
+ cb(cur, "bqkv", il);
5672
+
5673
+ Qcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*(n_embd)));
5674
+ Kcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd)));
5675
+ Vcur = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, n_embd_gqa, n_tokens, cur->nb[1], 1*sizeof(float)*(n_embd + n_embd_gqa)));
5676
+ } else {
5677
+ Qcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wq, attn_norm_output), model.layers[il].bq);
5678
+ Kcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wk, attn_norm_output), model.layers[il].bk);
5679
+ Vcur = ggml_add(ctx0, ggml_mul_mat(ctx0, model.layers[il].wv, attn_norm_output), model.layers[il].bv);
5680
+ }
5797
5681
 
5798
5682
  cb(Qcur, "Qcur", il);
5799
5683
  cb(Kcur, "Kcur", il);
@@ -5869,6 +5753,7 @@ struct llm_build_context {
5869
5753
 
5870
5754
  const int64_t n_embd_head = hparams.n_embd_head_v;
5871
5755
  GGML_ASSERT(n_embd_head == hparams.n_embd_head_k);
5756
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
5872
5757
 
5873
5758
  struct ggml_tensor * cur;
5874
5759
  struct ggml_tensor * inpL;
@@ -5886,7 +5771,7 @@ struct llm_build_context {
5886
5771
 
5887
5772
  // shift the entire K-cache if needed
5888
5773
  if (do_rope_shift) {
5889
- llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, n_embd_head, freq_base, freq_scale, cb);
5774
+ llm_build_k_shift(ctx0, hparams, cparams, kv_self, gf, LLM_ROPE, n_ctx, freq_base, freq_scale, cb);
5890
5775
  }
5891
5776
 
5892
5777
  for (int il = 0; il < n_layer; ++il) {
@@ -5912,13 +5797,13 @@ struct llm_build_context {
5912
5797
  cb(Vcur, "Vcur", il);
5913
5798
 
5914
5799
  Qcur = ggml_rope_custom(
5915
- ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
5800
+ ctx0, ggml_reshape_3d(ctx0, Qcur, hparams.n_rot, n_head, n_tokens), inp_pos,
5916
5801
  n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
5917
5802
  ext_factor, attn_factor, beta_fast, beta_slow);
5918
5803
  cb(Qcur, "Qcur", il);
5919
5804
 
5920
5805
  Kcur = ggml_rope_custom(
5921
- ctx0, ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens), inp_pos,
5806
+ ctx0, ggml_reshape_3d(ctx0, Kcur, hparams.n_rot, n_head_kv, n_tokens), inp_pos,
5922
5807
  n_embd_head, 2, 0, n_orig_ctx, freq_base, freq_scale,
5923
5808
  ext_factor, attn_factor, beta_fast, beta_slow);
5924
5809
  cb(Kcur, "Kcur", il);
@@ -6072,199 +5957,13 @@ struct llm_build_context {
6072
5957
  }
6073
5958
  };
6074
5959
 
6075
- //
6076
- // tensor offloading helpers
6077
- //
6078
- // TODO: will be removed with backend v2
6079
-
6080
- enum llm_offload_func_e {
6081
- OFFLOAD_FUNC_NOP,
6082
- OFFLOAD_FUNC,
6083
- OFFLOAD_FUNC_FRC, // force offload
6084
- OFFLOAD_FUNC_KQV,
6085
- OFFLOAD_FUNC_NR,
6086
- OFFLOAD_FUNC_EMB, // embeddings
6087
- OFFLOAD_FUNC_OUT,
6088
- };
6089
-
6090
- // TODO: will be removed with backend v2
6091
- struct llm_offload_trie {
6092
- struct node {
6093
- ~node() {
6094
- for (int i = 0; i < 256; ++i) {
6095
- if (children[i]) {
6096
- delete children[i];
6097
- }
6098
- }
6099
- }
6100
-
6101
- node * children[256] = { nullptr };
6102
- llm_offload_func_e func = OFFLOAD_FUNC_NOP;
6103
- };
6104
-
6105
- llm_offload_trie() {
6106
- root = new node;
6107
- }
6108
-
6109
- llm_offload_trie(const std::unordered_map<const char *, llm_offload_func_e> & map) {
6110
- root = new node;
6111
-
6112
- for (const auto & kv : map) {
6113
- add(kv.first, kv.second);
6114
- }
6115
- }
6116
-
6117
- ~llm_offload_trie() {
6118
- delete root;
6119
- }
6120
-
6121
- void add(const char * name, llm_offload_func_e func) {
6122
- node * cur = root;
6123
-
6124
- for (int i = 0; ; ++i) {
6125
- const uint8_t c = name[i];
6126
-
6127
- if (!c) {
6128
- break;
6129
- }
6130
-
6131
- if (!cur->children[c]) {
6132
- cur->children[c] = new node;
6133
- }
6134
-
6135
- cur = cur->children[c];
6136
- }
6137
-
6138
- cur->func = func;
6139
- }
6140
-
6141
- llm_offload_func_e find(const char * name) const {
6142
- const node * cur = root;
6143
-
6144
- for (int i = 0; ; ++i) {
6145
- const uint8_t c = name[i];
6146
-
6147
- if (!c) {
6148
- break;
6149
- }
6150
-
6151
- if (!cur->children[c]) {
6152
- return OFFLOAD_FUNC_NOP;
6153
- }
6154
-
6155
- cur = cur->children[c];
6156
- }
6157
-
6158
- return cur->func;
6159
- }
6160
-
6161
- node * root = nullptr;
6162
- };
6163
-
6164
- // TODO: will be removed with backend v2
6165
- static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map = {
6166
- //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
6167
- //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
6168
- { "pos_embd", OFFLOAD_FUNC_NR },
6169
-
6170
- { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
6171
- { "KQ_mask", OFFLOAD_FUNC_FRC },
6172
- { "K_shift", OFFLOAD_FUNC_FRC },
6173
-
6174
- { "K_shifted", OFFLOAD_FUNC },
6175
-
6176
- { "inp_norm", OFFLOAD_FUNC_NR },
6177
- { "inp_norm_w", OFFLOAD_FUNC_NR },
6178
- { "inp_norm_wb", OFFLOAD_FUNC_NR },
6179
-
6180
- { "norm", OFFLOAD_FUNC },
6181
- { "norm_w", OFFLOAD_FUNC },
6182
- { "norm_wb", OFFLOAD_FUNC },
6183
-
6184
- { "attn_norm", OFFLOAD_FUNC },
6185
- { "attn_norm_2", OFFLOAD_FUNC },
6186
-
6187
- { "wqkv", OFFLOAD_FUNC_KQV },
6188
- { "bqkv", OFFLOAD_FUNC_KQV },
6189
- { "wqkv_clamped", OFFLOAD_FUNC_KQV },
6190
-
6191
- { "tmpk", OFFLOAD_FUNC_KQV },
6192
- { "tmpq", OFFLOAD_FUNC_KQV },
6193
- { "tmpv", OFFLOAD_FUNC_KQV },
6194
- { "Kcur", OFFLOAD_FUNC_KQV },
6195
- { "Qcur", OFFLOAD_FUNC_KQV },
6196
- { "Vcur", OFFLOAD_FUNC_KQV },
6197
-
6198
- { "krot", OFFLOAD_FUNC_KQV },
6199
- { "qrot", OFFLOAD_FUNC_KQV },
6200
- { "kpass", OFFLOAD_FUNC_KQV },
6201
- { "qpass", OFFLOAD_FUNC_KQV },
6202
- { "krotated", OFFLOAD_FUNC_KQV },
6203
- { "qrotated", OFFLOAD_FUNC_KQV },
6204
-
6205
- { "q", OFFLOAD_FUNC_KQV },
6206
- { "k", OFFLOAD_FUNC_KQV },
6207
- { "kq", OFFLOAD_FUNC_KQV },
6208
- { "kq_scaled", OFFLOAD_FUNC_KQV },
6209
- { "kq_scaled_alibi", OFFLOAD_FUNC_KQV },
6210
- { "kq_masked", OFFLOAD_FUNC_KQV },
6211
- { "kq_soft_max", OFFLOAD_FUNC_KQV },
6212
- { "kq_soft_max_ext", OFFLOAD_FUNC_KQV },
6213
- { "v", OFFLOAD_FUNC_KQV },
6214
- { "kqv", OFFLOAD_FUNC_KQV },
6215
- { "kqv_merged", OFFLOAD_FUNC_KQV },
6216
- { "kqv_merged_cont", OFFLOAD_FUNC_KQV },
6217
- { "kqv_wo", OFFLOAD_FUNC_KQV },
6218
- { "kqv_out", OFFLOAD_FUNC_KQV },
6219
-
6220
- { "ffn_inp", OFFLOAD_FUNC },
6221
- { "ffn_norm", OFFLOAD_FUNC },
6222
-
6223
- { "ffn_up", OFFLOAD_FUNC },
6224
- { "ffn_up_b", OFFLOAD_FUNC },
6225
- { "ffn_gate", OFFLOAD_FUNC },
6226
- { "ffn_gate_b", OFFLOAD_FUNC },
6227
- { "ffn_gate_par", OFFLOAD_FUNC },
6228
- { "ffn_act", OFFLOAD_FUNC },
6229
- { "ffn_down", OFFLOAD_FUNC },
6230
- { "ffn_down_b", OFFLOAD_FUNC },
6231
- { "ffn_out", OFFLOAD_FUNC },
6232
-
6233
- { "ffn_silu", OFFLOAD_FUNC },
6234
- { "ffn_gelu", OFFLOAD_FUNC },
6235
- { "ffn_relu", OFFLOAD_FUNC },
6236
- { "ffn_sqr(relu)", OFFLOAD_FUNC },
6237
-
6238
- { "ffn_moe_logits", OFFLOAD_FUNC },
6239
- { "ffn_moe_probs", OFFLOAD_FUNC },
6240
- { "ffn_moe_argsort", OFFLOAD_FUNC },
6241
- { "ffn_moe_weights", OFFLOAD_FUNC },
6242
- { "ffn_moe_weights_sum", OFFLOAD_FUNC },
6243
- { "ffn_moe_weights_norm", OFFLOAD_FUNC },
6244
- { "ffn_moe_weighted", OFFLOAD_FUNC },
6245
- { "ffn_moe_up", OFFLOAD_FUNC },
6246
- { "ffn_moe_gate", OFFLOAD_FUNC },
6247
- { "ffn_moe_silu", OFFLOAD_FUNC },
6248
- { "ffn_moe_gate_par", OFFLOAD_FUNC },
6249
- { "ffn_moe_down", OFFLOAD_FUNC },
6250
- { "ffn_moe_out", OFFLOAD_FUNC },
6251
-
6252
- { "l_out", OFFLOAD_FUNC },
6253
-
6254
- { "result_norm", OFFLOAD_FUNC_EMB },
6255
- { "result_output_no_bias", OFFLOAD_FUNC_EMB },
6256
- { "result_output", OFFLOAD_FUNC_OUT },
6257
- };
6258
-
6259
- static llm_offload_trie k_offload_func_trie(k_offload_map);
6260
-
6261
5960
  static struct ggml_cgraph * llama_build_graph(
6262
5961
  llama_context & lctx,
6263
5962
  const llama_batch & batch) {
6264
5963
  const auto & model = lctx.model;
6265
5964
 
6266
5965
  // check if we should build the worst-case graph (for memory measurement)
6267
- const bool worst_case = ggml_allocr_is_measure(lctx.alloc);
5966
+ const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
6268
5967
 
6269
5968
  // keep track of the input that has already been allocated
6270
5969
  bool alloc_inp_tokens = false;
@@ -6273,16 +5972,8 @@ static struct ggml_cgraph * llama_build_graph(
6273
5972
  bool alloc_inp_KQ_mask = false;
6274
5973
  bool alloc_inp_K_shift = false;
6275
5974
 
6276
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
6277
- const bool do_offload = true;
6278
- #else
6279
- const bool do_offload = true; // TODO: set to false after finishing refactoring
6280
- #endif
6281
-
6282
- int n_non_view = 0; // number of non-view tensors that have been processed by the callback
6283
-
6284
5975
  // this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
6285
- // TODO: will be removed with backend v2
5976
+ // TODO: improve handling of input and output tensors, then replace this with ggml_set_name
6286
5977
  llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
6287
5978
  if (il >= 0) {
6288
5979
  ggml_format_name(cur, "%s-%d", name, il);
@@ -6293,12 +5984,11 @@ static struct ggml_cgraph * llama_build_graph(
6293
5984
  //
6294
5985
  // allocate input tensors and set input data
6295
5986
  //
6296
- // TODO: will be removed with backend v2
6297
5987
 
6298
5988
  if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) {
6299
- ggml_allocr_alloc(lctx.alloc, cur);
5989
+ ggml_tallocr_alloc(lctx.alloc, cur);
6300
5990
 
6301
- if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) {
5991
+ if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) {
6302
5992
  const int64_t n_tokens = cur->ne[0];
6303
5993
 
6304
5994
  ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
@@ -6307,10 +5997,10 @@ static struct ggml_cgraph * llama_build_graph(
6307
5997
  alloc_inp_tokens = true;
6308
5998
  }
6309
5999
 
6310
- if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) {
6311
- ggml_allocr_alloc(lctx.alloc, cur);
6000
+ if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) {
6001
+ ggml_tallocr_alloc(lctx.alloc, cur);
6312
6002
 
6313
- if (!ggml_allocr_is_measure(lctx.alloc) && batch.embd) {
6003
+ if (!ggml_tallocr_is_measure(lctx.alloc) && batch.embd) {
6314
6004
  const int64_t n_embd = cur->ne[0];
6315
6005
  const int64_t n_tokens = cur->ne[1];
6316
6006
 
@@ -6321,9 +6011,9 @@ static struct ggml_cgraph * llama_build_graph(
6321
6011
  }
6322
6012
 
6323
6013
  if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) {
6324
- ggml_allocr_alloc(lctx.alloc, cur);
6014
+ ggml_tallocr_alloc(lctx.alloc, cur);
6325
6015
 
6326
- if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) {
6016
+ if (!ggml_tallocr_is_measure(lctx.alloc) && batch.pos) {
6327
6017
  const int64_t n_tokens = cur->ne[0];
6328
6018
 
6329
6019
  static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
@@ -6334,9 +6024,9 @@ static struct ggml_cgraph * llama_build_graph(
6334
6024
  }
6335
6025
 
6336
6026
  if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
6337
- ggml_allocr_alloc(lctx.alloc, cur);
6027
+ ggml_tallocr_alloc(lctx.alloc, cur);
6338
6028
 
6339
- if (!ggml_allocr_is_measure(lctx.alloc)) {
6029
+ if (!ggml_tallocr_is_measure(lctx.alloc)) {
6340
6030
  const int64_t n_kv = cur->ne[0];
6341
6031
  const int64_t n_tokens = cur->ne[1];
6342
6032
 
@@ -6369,165 +6059,35 @@ static struct ggml_cgraph * llama_build_graph(
6369
6059
  ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
6370
6060
  }
6371
6061
  }
6372
-
6373
- alloc_inp_KQ_mask = true;
6374
- }
6375
-
6376
- if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) {
6377
- ggml_allocr_alloc(lctx.alloc, cur);
6378
-
6379
- if (!ggml_allocr_is_measure(lctx.alloc)) {
6380
- const int64_t n_ctx = cur->ne[0];
6381
-
6382
- int32_t * data;
6383
- if (ggml_backend_buffer_is_host(cur->buffer)) {
6384
- data = (int32_t *) cur->data;
6385
- } else {
6386
- lctx.buf_copy.resize(ggml_nbytes(cur));
6387
- data = (int32_t *) lctx.buf_copy.data();
6388
- }
6389
-
6390
- for (int i = 0; i < n_ctx; ++i) {
6391
- data[i] = lctx.kv_self.cells[i].delta;
6392
- }
6393
-
6394
- if (data != cur->data) {
6395
- ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
6396
- }
6397
- }
6398
-
6399
- alloc_inp_K_shift = true;
6400
- }
6401
-
6402
- // view tensors are not processed further
6403
- if (cur->view_src != nullptr) {
6404
- return;
6405
- }
6406
-
6407
- if (cur->op != GGML_OP_NONE) {
6408
- n_non_view++;
6409
- }
6410
-
6411
- //
6412
- // offload layers
6413
- //
6414
- // TODO: will be removed with backend v2
6415
-
6416
- //#define LLAMA_OFFLOAD_DEBUG
6417
-
6418
- if (!do_offload) {
6419
- return;
6420
- }
6421
-
6422
- const int n_layer = model.hparams.n_layer;
6423
-
6424
- const int n_gpu_layers = model.n_gpu_layers;
6425
- const int i_gpu_start = n_layer - n_gpu_layers;
6426
-
6427
- // should we offload the final norm? yes if we are not computing embeddings
6428
- const bool offload_emb = lctx.embedding.empty();
6429
-
6430
- static const std::unordered_map<llm_offload_func_e, std::string, std::hash<int>> k_offload_func_name = {
6431
- { OFFLOAD_FUNC_NOP, "CPU" },
6432
- { OFFLOAD_FUNC_OUT, "CPU" },
6433
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
6434
- { OFFLOAD_FUNC, "GPU (CUDA)" },
6435
- { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
6436
- { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
6437
- { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
6438
- { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
6439
- #else
6440
- { OFFLOAD_FUNC, "CPU" },
6441
- { OFFLOAD_FUNC_FRC, "CPU" },
6442
- { OFFLOAD_FUNC_KQV, "CPU" },
6443
- { OFFLOAD_FUNC_NR, "CPU" },
6444
- { OFFLOAD_FUNC_EMB, "CPU" },
6445
- #endif // GGML_USE_CUBLAS
6446
- };
6447
-
6448
- // check the global map for what offload function to use for this tensor
6449
- llm_offload_func_e func_e = k_offload_func_trie.find(name);
6450
-
6451
- if (func_e == OFFLOAD_FUNC_NOP) {
6452
- #ifdef LLAMA_OFFLOAD_DEBUG
6453
- // if a tensor hasn't been offloaded, we warn the user
6454
- if (worst_case) {
6455
- LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__,
6456
- cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837");
6457
- }
6458
- #endif
6459
-
6460
- return;
6461
- }
6462
-
6463
- // count the number of layers and respect the provided n_gpu_layers
6464
- switch (func_e) {
6465
- case OFFLOAD_FUNC_NOP:
6466
- case OFFLOAD_FUNC_OUT:
6467
- break;
6468
- case OFFLOAD_FUNC:
6469
- if (n_gpu_layers < n_layer) {
6470
- if (il < i_gpu_start) {
6471
- func_e = OFFLOAD_FUNC_NOP;
6472
- }
6473
- }
6474
- break;
6475
- case OFFLOAD_FUNC_FRC:
6476
- if (!lctx.cparams.offload_kqv) {
6477
- func_e = OFFLOAD_FUNC_NOP;
6478
- } break;
6479
- case OFFLOAD_FUNC_KQV:
6480
- if (!lctx.cparams.offload_kqv) {
6481
- func_e = OFFLOAD_FUNC_NOP;
6482
- } else {
6483
- if (n_gpu_layers < n_layer) {
6484
- if (il < i_gpu_start) {
6485
- func_e = OFFLOAD_FUNC_NOP;
6486
- }
6487
- }
6488
- }
6489
- break;
6490
- case OFFLOAD_FUNC_NR:
6491
- if (n_gpu_layers <= n_layer + 0) {
6492
- func_e = OFFLOAD_FUNC_NOP;
6493
- }
6494
- break;
6495
- case OFFLOAD_FUNC_EMB:
6496
- if (!offload_emb || n_gpu_layers < n_layer) {
6497
- func_e = OFFLOAD_FUNC_NOP;
6498
- }
6499
- break;
6500
- default: GGML_ASSERT(false);
6062
+
6063
+ alloc_inp_KQ_mask = true;
6501
6064
  }
6502
6065
 
6503
- offload_func_t func = ggml_offload_nop;
6066
+ if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) {
6067
+ ggml_tallocr_alloc(lctx.alloc, cur);
6504
6068
 
6505
- // this is needed for compatibility with Metal for example
6506
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
6507
- static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc;
6508
- #else
6509
- static offload_func_t ggml_offload_gpu = ggml_offload_nop;
6510
- #endif
6069
+ if (!ggml_tallocr_is_measure(lctx.alloc)) {
6070
+ const int64_t n_ctx = cur->ne[0];
6511
6071
 
6512
- switch (func_e) {
6513
- case OFFLOAD_FUNC_NOP:
6514
- case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break;
6515
- case OFFLOAD_FUNC:
6516
- case OFFLOAD_FUNC_KQV:
6517
- case OFFLOAD_FUNC_FRC:
6518
- case OFFLOAD_FUNC_NR:
6519
- case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
6520
- default: GGML_ASSERT(false);
6521
- }
6072
+ int32_t * data;
6073
+ if (ggml_backend_buffer_is_host(cur->buffer)) {
6074
+ data = (int32_t *) cur->data;
6075
+ } else {
6076
+ lctx.buf_copy.resize(ggml_nbytes(cur));
6077
+ data = (int32_t *) lctx.buf_copy.data();
6078
+ }
6079
+
6080
+ for (int i = 0; i < n_ctx; ++i) {
6081
+ data[i] = lctx.kv_self.cells[i].delta;
6082
+ }
6522
6083
 
6523
- // apply offload function to the tensor
6524
- func(cur);
6084
+ if (data != cur->data) {
6085
+ ggml_backend_tensor_set(cur, data, 0, ggml_nbytes(cur));
6086
+ }
6087
+ }
6525
6088
 
6526
- #ifdef LLAMA_OFFLOAD_DEBUG
6527
- if (worst_case) {
6528
- LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str());
6089
+ alloc_inp_K_shift = true;
6529
6090
  }
6530
- #endif
6531
6091
  };
6532
6092
 
6533
6093
  struct ggml_cgraph * result = NULL;
@@ -6595,27 +6155,6 @@ static struct ggml_cgraph * llama_build_graph(
6595
6155
 
6596
6156
  llm.free();
6597
6157
 
6598
- if (worst_case) {
6599
- int n_non_view_total = 0;
6600
-
6601
- for (int i = 0; i < result->n_nodes; ++i) {
6602
- if (result->nodes[i]->view_src == nullptr) {
6603
- n_non_view_total++;
6604
- }
6605
- }
6606
-
6607
- LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total);
6608
-
6609
- if (n_non_view != n_non_view_total) {
6610
- LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__);
6611
- LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__);
6612
- LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__);
6613
- LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__);
6614
- LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__);
6615
- LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__);
6616
- }
6617
- }
6618
-
6619
6158
  return result;
6620
6159
  }
6621
6160
 
@@ -6661,8 +6200,6 @@ static int llama_decode_internal(
6661
6200
 
6662
6201
  auto & kv_self = lctx.kv_self;
6663
6202
 
6664
- GGML_ASSERT(!!kv_self.ctx);
6665
-
6666
6203
  const int64_t n_embd = hparams.n_embd;
6667
6204
  const int64_t n_vocab = hparams.n_vocab;
6668
6205
 
@@ -6716,12 +6253,10 @@ static int llama_decode_internal(
6716
6253
 
6717
6254
  //printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
6718
6255
 
6719
- ggml_allocr_reset(lctx.alloc);
6256
+ ggml_backend_sched_reset(lctx.sched);
6720
6257
 
6721
6258
  ggml_cgraph * gf = llama_build_graph(lctx, batch);
6722
6259
 
6723
- ggml_allocr_alloc_graph(lctx.alloc, gf);
6724
-
6725
6260
  // the output is always the last tensor in the graph
6726
6261
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
6727
6262
  GGML_ASSERT(strcmp(res->name, "result_output") == 0);
@@ -6733,30 +6268,6 @@ static int llama_decode_internal(
6733
6268
  GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
6734
6269
  }
6735
6270
 
6736
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
6737
- char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc);
6738
- for (int i = 0; i < gf->n_leafs; i++) {
6739
- ggml_tensor * node = gf->leafs[i];
6740
- if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
6741
- ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
6742
- ggml_cuda_copy_to_device(node);
6743
- }
6744
- }
6745
-
6746
- for (int i = 0; i < gf->n_nodes; i++) {
6747
- ggml_tensor * node = gf->nodes[i];
6748
- if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
6749
- ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
6750
- }
6751
- }
6752
-
6753
- // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
6754
- if (!lctx.embedding.empty()) {
6755
- embeddings->backend = GGML_BACKEND_CPU;
6756
- }
6757
- res->backend = GGML_BACKEND_CPU;
6758
- #endif
6759
-
6760
6271
  // LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
6761
6272
 
6762
6273
  // for big prompts, if BLAS is enabled, it is better to use only one thread
@@ -6779,15 +6290,17 @@ static int llama_decode_internal(
6779
6290
  #endif
6780
6291
 
6781
6292
  #ifdef GGML_USE_METAL
6782
- if (ggml_backend_is_metal(lctx.backend)) {
6783
- ggml_backend_metal_set_n_cb(lctx.backend, n_threads);
6293
+ if (ggml_backend_is_metal(lctx.backend_metal)) {
6294
+ ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
6784
6295
  }
6785
6296
  #endif
6786
6297
 
6787
- if (ggml_backend_is_cpu(lctx.backend)) {
6788
- ggml_backend_cpu_set_n_threads(lctx.backend, n_threads);
6298
+ if (lctx.backend_cpu != nullptr) {
6299
+ ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
6789
6300
  }
6790
- ggml_backend_graph_compute(lctx.backend, gf);
6301
+ ggml_backend_sched_graph_compute(lctx.sched, gf);
6302
+
6303
+ // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
6791
6304
 
6792
6305
  #ifdef GGML_USE_MPI
6793
6306
  ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
@@ -6835,30 +6348,33 @@ static int llama_decode_internal(
6835
6348
  logits_out.clear();
6836
6349
  #endif
6837
6350
 
6351
+ ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
6352
+ GGML_ASSERT(res_backend != nullptr);
6838
6353
  if (batch.logits) {
6839
6354
  logits_out.resize(n_vocab * n_tokens);
6840
6355
  for (uint32_t i = 0; i < n_tokens; i++) {
6841
6356
  if (batch.logits[i] == 0) {
6842
6357
  continue;
6843
6358
  }
6844
- ggml_backend_tensor_get(res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
6359
+ ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
6845
6360
  #ifndef NDEBUG
6846
6361
  logits_valid[i] = true;
6847
6362
  #endif
6848
6363
  }
6849
6364
  } else if (lctx.logits_all) {
6850
6365
  logits_out.resize(n_vocab * n_tokens);
6851
- ggml_backend_tensor_get(res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
6366
+ ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
6852
6367
  #ifndef NDEBUG
6853
6368
  std::fill(logits_valid.begin(), logits_valid.end(), true);
6854
6369
  #endif
6855
6370
  } else {
6856
6371
  logits_out.resize(n_vocab);
6857
- ggml_backend_tensor_get(res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
6372
+ ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
6858
6373
  #ifndef NDEBUG
6859
6374
  logits_valid[0] = true;
6860
6375
  #endif
6861
6376
  }
6377
+ ggml_backend_synchronize(res_backend);
6862
6378
  }
6863
6379
 
6864
6380
  // extract embeddings
@@ -6866,7 +6382,9 @@ static int llama_decode_internal(
6866
6382
  auto & embedding_out = lctx.embedding;
6867
6383
 
6868
6384
  embedding_out.resize(n_embd);
6869
- ggml_backend_tensor_get(embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
6385
+ ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
6386
+ ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
6387
+ ggml_backend_synchronize(embeddings_backend);
6870
6388
  }
6871
6389
 
6872
6390
  // measure the performance only for the single-token evals
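The two hunks above replace the blocking ggml_backend_tensor_get calls with the scheduler-aware asynchronous pattern: ask the scheduler which backend produced a node, queue the copy into host memory, then synchronize before the data is used. A small sketch of that pattern in isolation (the helper name and the F32 assumption are illustrative, not part of the diff):

    // sketch only: read an F32 result tensor back to host memory
    #include "ggml.h"
    #include "ggml-backend.h"
    #include <vector>

    static std::vector<float> read_node_f32(ggml_backend_sched_t sched, struct ggml_tensor * node) {
        std::vector<float> out(ggml_nelements(node));

        // the scheduler knows which backend ended up computing this node
        ggml_backend_t backend = ggml_backend_sched_get_node_backend(sched, node);

        // queue a (possibly asynchronous) copy into `out`
        ggml_backend_tensor_get_async(backend, node, out.data(), 0, ggml_nbytes(node));

        // block until the copy has completed before the caller reads `out`
        ggml_backend_synchronize(backend);

        return out;
    }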
@@ -6937,15 +6455,15 @@ static uint8_t llama_token_to_byte(const llama_vocab& vocab, llama_token id) {
6937
6455
  static llama_token llama_byte_to_token(const llama_vocab & vocab, uint8_t ch) {
6938
6456
  static const char * hex = "0123456789ABCDEF";
6939
6457
  switch (llama_vocab_get_type(vocab)) {
6940
- case LLAMA_VOCAB_TYPE_SPM: {
6941
- const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
6942
- return vocab.token_to_id.at(buf);
6943
- }
6944
- case LLAMA_VOCAB_TYPE_BPE: {
6945
- return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
6946
- }
6947
- default:
6948
- GGML_ASSERT(false);
6458
+ case LLAMA_VOCAB_TYPE_SPM: {
6459
+ const char buf[7] = { '<', '0', 'x', hex[ch >> 4], hex[ch & 15], '>', 0 };
6460
+ return vocab.token_to_id.at(buf);
6461
+ }
6462
+ case LLAMA_VOCAB_TYPE_BPE: {
6463
+ return vocab.token_to_id.at(bytes_to_unicode_bpe(ch));
6464
+ }
6465
+ default:
6466
+ GGML_ASSERT(false);
6949
6467
  }
6950
6468
  }
6951
6469
 
@@ -7479,7 +6997,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
7479
6997
  if (match + special_token.length() > raw_text_base_offset + raw_text_base_length) break;
7480
6998
 
7481
6999
  #ifdef PRETOKENIZERDEBUG
7482
- fprintf(stderr, "FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
7000
+ LLAMA_LOG_WARN("FF: (%ld %ld %ld) '%s'\n", raw_text->length(), raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
7483
7001
  #endif
7484
7002
  auto source = std::distance(buffer.begin(), it);
7485
7003
 
@@ -7492,7 +7010,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
7492
7010
  buffer.emplace_after(it, (*raw_text), left_reminder_offset, left_reminder_length);
7493
7011
 
7494
7012
  #ifdef PRETOKENIZERDEBUG
7495
- fprintf(stderr, "FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
7013
+ LLAMA_LOG_WARN("FL: (%ld %ld) '%s'\n", left_reminder_offset, left_reminder_length, raw_text->substr(left_reminder_offset, left_reminder_length).c_str());
7496
7014
  #endif
7497
7015
  it++;
7498
7016
  }
@@ -7508,7 +7026,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
7508
7026
  buffer.emplace_after(it, (*raw_text), right_reminder_offset, right_reminder_length);
7509
7027
 
7510
7028
  #ifdef PRETOKENIZERDEBUG
7511
- fprintf(stderr, "FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
7029
+ LLAMA_LOG_WARN("FR: (%ld %ld) '%s'\n", right_reminder_offset, right_reminder_length, raw_text->substr(right_reminder_offset, right_reminder_length).c_str());
7512
7030
  #endif
7513
7031
 
7514
7032
  it++;
@@ -7524,7 +7042,7 @@ static void tokenizer_st_partition(const llama_vocab & vocab, std::forward_list<
7524
7042
  raw_text_base_length = right_reminder_length;
7525
7043
 
7526
7044
  #ifdef PRETOKENIZERDEBUG
7527
- fprintf(stderr, "RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
7045
+ LLAMA_LOG_WARN("RR: (%ld %ld) '%s'\n", raw_text_base_offset, raw_text_base_length, raw_text->substr(raw_text_base_offset, raw_text_base_length).c_str());
7528
7046
  #endif
7529
7047
  } else {
7530
7048
  if (source == 0) {
@@ -7581,7 +7099,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
7581
7099
  }
7582
7100
 
7583
7101
  #ifdef PRETOKENIZERDEBUG
7584
- fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
7102
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
7585
7103
  #endif
7586
7104
  llm_tokenizer_spm tokenizer(vocab);
7587
7105
  llama_escape_whitespace(raw_text);
@@ -7602,7 +7120,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
7602
7120
  auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
7603
7121
 
7604
7122
  #ifdef PRETOKENIZERDEBUG
7605
- fprintf(stderr,"TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
7123
+ LLAMA_LOG_WARN("TT: (%ld %ld %ld) '%s'\n", raw_text.length(), fragment.offset, fragment.length, raw_text.c_str());
7606
7124
  #endif
7607
7125
  llm_tokenizer_bpe tokenizer(vocab);
7608
7126
  tokenizer.tokenize(raw_text, output);
@@ -8380,39 +7898,59 @@ static void llama_log_softmax(float * array, size_t size) {
8380
7898
  }
8381
7899
  }
8382
7900
 
7901
+ void llama_sample_apply_guidance(
7902
+ struct llama_context * ctx,
7903
+ float * logits,
7904
+ float * logits_guidance,
7905
+ float scale) {
7906
+ GGML_ASSERT(ctx);
7907
+
7908
+ const auto t_start_sample_us = ggml_time_us();
7909
+ const auto n_vocab = llama_n_vocab(llama_get_model(ctx));
7910
+
7911
+ llama_log_softmax(logits, n_vocab);
7912
+ llama_log_softmax(logits_guidance, n_vocab);
7913
+
7914
+ for (int i = 0; i < n_vocab; ++i) {
7915
+ auto & l = logits[i];
7916
+ const auto & g = logits_guidance[i];
7917
+
7918
+ l = scale * (l - g) + g;
7919
+ }
7920
+
7921
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
7922
+ }
7923
+
8383
7924
  void llama_sample_classifier_free_guidance(
8384
7925
  struct llama_context * ctx,
8385
7926
  llama_token_data_array * candidates,
8386
7927
  struct llama_context * guidance_ctx,
8387
7928
  float scale) {
8388
- int64_t t_start_sample_us = ggml_time_us();
8389
-
8390
7929
  GGML_ASSERT(ctx);
7930
+ int64_t t_start_sample_us;
8391
7931
 
8392
- auto n_vocab = llama_n_vocab(llama_get_model(ctx));
7932
+ t_start_sample_us = ggml_time_us();
7933
+ const size_t n_vocab = llama_n_vocab(llama_get_model(ctx));
8393
7934
 
8394
- GGML_ASSERT(n_vocab == (int)candidates->size);
7935
+ GGML_ASSERT(n_vocab == candidates->size);
8395
7936
  GGML_ASSERT(!candidates->sorted);
8396
7937
 
8397
- std::vector<float> logits_base;
8398
- logits_base.reserve(candidates->size);
8399
- for (size_t i = 0; i < candidates->size; ++i) {
8400
- logits_base.push_back(candidates->data[i].logit);
7938
+ std::vector<float> logits_base(n_vocab);
7939
+ for (size_t i = 0; i < n_vocab; ++i) {
7940
+ logits_base[i] = candidates->data[i].logit;
8401
7941
  }
8402
- llama_log_softmax(logits_base.data(), candidates->size);
8403
7942
 
8404
- float* logits_guidance = llama_get_logits(guidance_ctx);
8405
- llama_log_softmax(logits_guidance, n_vocab);
7943
+ float * logits_guidance = llama_get_logits(guidance_ctx);
8406
7944
 
8407
- for (int i = 0; i < n_vocab; ++i) {
8408
- float logit_guidance = logits_guidance[i];
8409
- float logit_base = logits_base[i];
8410
- candidates->data[i].logit = scale * (logit_base - logit_guidance) + logit_guidance;
8411
- }
7945
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
7946
+ llama_sample_apply_guidance(ctx, logits_base.data(), logits_guidance, scale);
7947
+ t_start_sample_us = ggml_time_us();
8412
7948
 
8413
- if (ctx) {
8414
- ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
7949
+ for (size_t i = 0; i < n_vocab; ++i) {
7950
+ candidates->data[i].logit = logits_base[i];
8415
7951
  }
7952
+
7953
+ ctx->t_sample_us += ggml_time_us() - t_start_sample_us;
8416
7954
  }
8417
7955
 
8418
7956
  llama_token llama_sample_token_mirostat(struct llama_context * ctx, llama_token_data_array * candidates, float tau, float eta, int32_t m, float * mu) {
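For reference, the refactor above splits classifier-free guidance into a reusable llama_sample_apply_guidance call that operates on raw logit buffers, blending them as l = scale * (l - g) + g after log-softmax. A minimal usage sketch, assuming a main context and a guidance context have already been created and decoded, and that the matching declaration ships in llama.h for this release (none of which is shown in this diff):

    // minimal sketch, not from the diff: blend base and guidance logits
    #include "llama.h"
    #include <vector>

    static std::vector<float> guided_logits(llama_context * ctx, llama_context * guidance_ctx, float scale) {
        const int n_vocab = llama_n_vocab(llama_get_model(ctx));

        // work on a copy so the context's own logits buffer is left untouched
        const float * base = llama_get_logits(ctx);
        std::vector<float> logits(base, base + n_vocab);

        // note: the guidance logits are log-softmaxed in place by this call
        llama_sample_apply_guidance(ctx, logits.data(), llama_get_logits(guidance_ctx), scale);

        return logits;
    }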
@@ -8836,6 +8374,8 @@ struct quantize_state_internal {
8836
8374
  int n_k_quantized = 0;
8837
8375
  int n_fallback = 0;
8838
8376
 
8377
+ bool has_imatrix = false;
8378
+
8839
8379
  quantize_state_internal(const llama_model & model, const llama_model_quantize_params * params)
8840
8380
  : model(model)
8841
8381
  , params(params)
@@ -8919,9 +8459,23 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8919
8459
  if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
8920
8460
  new_type = GGML_TYPE_Q8_0;
8921
8461
  }
8462
+ else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
8463
+ new_type = GGML_TYPE_Q5_K;
8464
+ }
8922
8465
  else if (new_type != GGML_TYPE_Q8_0) {
8923
8466
  new_type = GGML_TYPE_Q6_K;
8924
8467
  }
8468
+ } else if (ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS || ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS) {
8469
+ if (name.find("attn_v.weight") != std::string::npos) {
8470
+ if (qs.model.hparams.n_gqa() >= 4 || qs.model.hparams.n_expert >= 4) new_type = GGML_TYPE_Q4_K;
8471
+ else new_type = GGML_TYPE_Q2_K;
8472
+ ++qs.i_attention_wv;
8473
+ }
8474
+ else if (name.find("ffn_down") != std::string::npos) {
8475
+ if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q2_K;
8476
+ ++qs.i_feed_forward_w2;
8477
+ }
8478
+ else if (name == "token_embd.weight") new_type = GGML_TYPE_Q2_K;
8925
8479
  } else if (name.find("attn_v.weight") != std::string::npos) {
8926
8480
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
8927
8481
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
@@ -8952,13 +8506,31 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8952
8506
  new_type = GGML_TYPE_Q8_0;
8953
8507
  }
8954
8508
  } else if (name.find("ffn_down") != std::string::npos) {
8509
+ const int n_expert = std::max(1, (int)qs.model.hparams.n_expert);
8510
+ int i_layer, n_layer;
8511
+ if (n_expert == 1) {
8512
+ i_layer = qs.i_feed_forward_w2;
8513
+ n_layer = qs.n_feed_forward_w2;
8514
+ } else {
8515
+ // Believe it or not, "experts" in the FFN of Mixtral-8x7B are not consecutive, but occasionally randomly
8516
+ // sprinkled in the model. Hence, simply dividing i_feed_forward_w2 by n_expert does not work
8517
+ // for getting the current layer as I initially thought, and we need to resort to parsing the
8518
+ // tensor name.
8519
+ n_layer = qs.n_feed_forward_w2 / n_expert;
8520
+ if (sscanf(name.c_str(), "blk.%d.ffn_down", &i_layer) != 1) {
8521
+ throw std::runtime_error(format("Failed to determine layer for tensor %s", name.c_str()));
8522
+ }
8523
+ if (i_layer < 0 || i_layer >= n_layer) {
8524
+ throw std::runtime_error(format("Bad layer %d for tensor %s. Must be in [0, %d)", i_layer, name.c_str(), n_layer));
8525
+ }
8526
+ }
8955
8527
  if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
8956
8528
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
8957
- if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
8529
+ if (i_layer < n_layer/8) new_type = GGML_TYPE_Q4_K;
8958
8530
  }
8959
8531
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
8960
- new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
8961
- : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
8532
+ new_type = i_layer < n_layer/16 ? GGML_TYPE_Q5_K
8533
+ : arch != LLM_ARCH_FALCON || use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q4_K
8962
8534
  : GGML_TYPE_Q3_K;
8963
8535
  }
8964
8536
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
@@ -8966,22 +8538,36 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
8966
8538
  }
8967
8539
  else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
8968
8540
  if (arch == LLM_ARCH_FALCON) {
8969
- new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
8970
- use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
8541
+ new_type = i_layer < n_layer/16 ? GGML_TYPE_Q6_K :
8542
+ use_more_bits(i_layer, n_layer) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
8971
8543
  } else {
8972
- if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
8544
+ if (use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
8973
8545
  }
8974
8546
  }
8975
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
8976
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
8547
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_layer, n_layer)) new_type = GGML_TYPE_Q6_K;
8548
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && i_layer < n_layer/8) {
8977
8549
  new_type = GGML_TYPE_Q5_K;
8978
8550
  }
8551
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_0 || ftype == LLAMA_FTYPE_MOSTLY_Q5_0)
8552
+ && qs.has_imatrix && i_layer < n_layer/8) {
8553
+ // Guard against craziness in the first few ffn_down layers that can happen even with imatrix for Q4_0/Q5_0.
8554
+ // We only do it when an imatrix is provided because a) we want to make sure that one can always get the
8555
+ // same quantization as before imatrix stuff, and b) Q4_1/Q5_1 do go crazy on ffn_down without an imatrix.
8556
+ new_type = ftype == LLAMA_FTYPE_MOSTLY_Q4_0 ? GGML_TYPE_Q4_1 : GGML_TYPE_Q5_1;
8557
+ }
8979
8558
  ++qs.i_feed_forward_w2;
8980
8559
  } else if (name.find("attn_output.weight") != std::string::npos) {
8981
8560
  if (arch != LLM_ARCH_FALCON) {
8982
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
8983
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
8984
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
8561
+ if (qs.model.hparams.n_expert == 8) {
8562
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
8563
+ ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
8564
+ new_type = GGML_TYPE_Q5_K;
8565
+ }
8566
+ } else {
8567
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
8568
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
8569
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
8570
+ }
8985
8571
  } else {
8986
8572
  if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
8987
8573
  }
@@ -9002,7 +8588,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9002
8588
  //}
9003
8589
  bool convert_incompatible_tensor = false;
9004
8590
  if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
9005
- new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
8591
+ new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K ||
8592
+ new_type == GGML_TYPE_IQ2_XS || new_type == GGML_TYPE_IQ2_XXS) {
9006
8593
  int nx = tensor->ne[0];
9007
8594
  int ny = tensor->ne[1];
9008
8595
  if (nx % QK_K != 0) {
@@ -9014,6 +8601,8 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
9014
8601
  }
9015
8602
  if (convert_incompatible_tensor) {
9016
8603
  switch (new_type) {
8604
+ case GGML_TYPE_IQ2_XXS:
8605
+ case GGML_TYPE_IQ2_XS:
9017
8606
  case GGML_TYPE_Q2_K: new_type = GGML_TYPE_Q4_0; break;
9018
8607
  case GGML_TYPE_Q3_K: new_type = GGML_TYPE_Q4_1; break;
9019
8608
  case GGML_TYPE_Q4_K: new_type = GGML_TYPE_Q5_0; break;
@@ -9084,6 +8673,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9084
8673
  if (params->only_copy) {
9085
8674
  ftype = model.ftype;
9086
8675
  }
8676
+ const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
8677
+ if (params->imatrix) {
8678
+ imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
8679
+ if (imatrix_data) {
8680
+ LLAMA_LOG_INFO("================================ Have weights data with %d entries\n",int(imatrix_data->size()));
8681
+ qs.has_imatrix = true;
8682
+ }
8683
+ }
9087
8684
 
9088
8685
  const size_t align = GGUF_DEFAULT_ALIGNMENT;
9089
8686
  struct gguf_context * ctx_out = gguf_init_empty();
@@ -9141,6 +8738,8 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9141
8738
  // placeholder for the meta data
9142
8739
  ::zeros(fout, meta_size);
9143
8740
 
8741
+ std::set<ggml_type> used_iq2;
8742
+
9144
8743
  for (int i = 0; i < ml.n_tensors; ++i) {
9145
8744
  struct ggml_tensor * tensor = ml.get_tensor_meta(i);
9146
8745
 
@@ -9193,6 +8792,35 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9193
8792
  } else {
9194
8793
  const size_t nelements = ggml_nelements(tensor);
9195
8794
 
8795
+ if ((new_type == GGML_TYPE_IQ2_XXS || new_type == GGML_TYPE_IQ2_XS) && used_iq2.find(new_type) == used_iq2.end()) {
8796
+ ggml_init_iq2_quantization(new_type);
8797
+ used_iq2.insert(new_type);
8798
+ }
8799
+
8800
+ const float * imatrix = nullptr;
8801
+ if (imatrix_data) {
8802
+ auto it = imatrix_data->find(tensor->name);
8803
+ if (it == imatrix_data->end()) {
8804
+ LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
8805
+ } else {
8806
+ if (it->second.size() == (size_t)tensor->ne[0]) {
8807
+ imatrix = it->second.data();
8808
+ } else {
8809
+ LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
8810
+ int(it->second.size()), int(tensor->ne[0]), tensor->name);
8811
+ }
8812
+ }
8813
+ }
8814
+ if ((new_type == GGML_TYPE_IQ2_XXS ||
8815
+ new_type == GGML_TYPE_IQ2_XS ||
8816
+ (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
8817
+ LLAMA_LOG_ERROR("\n\n============================================================\n");
8818
+ LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
8819
+ LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
8820
+ LLAMA_LOG_ERROR("============================================================\n\n");
8821
+ throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
8822
+ }
8823
+
9196
8824
  float * f32_data;
9197
8825
 
9198
8826
  if (tensor->type == GGML_TYPE_F32) {
@@ -9213,21 +8841,28 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9213
8841
  new_data = work.data();
9214
8842
  std::array<int64_t, 1 << 4> hist_cur = {};
9215
8843
 
9216
- static const int chunk_size = 32 * 512;
8844
+ const int n_per_row = tensor->ne[0];
8845
+ const int nrows = nelements / n_per_row;
8846
+
8847
+ static const int min_chunk_size = 32 * 512;
8848
+ const int chunk_size = n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row);
8849
+
9217
8850
  const int nchunk = (nelements + chunk_size - 1)/chunk_size;
9218
8851
  const int nthread_use = nthread > 1 ? std::max(1, std::min(nthread, nchunk)) : 1;
9219
8852
  if (nthread_use < 2) {
9220
- new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nelements, hist_cur.data());
8853
+ new_size = ggml_quantize_chunk(new_type, f32_data, new_data, 0, nrows, n_per_row, hist_cur.data(), imatrix);
9221
8854
  } else {
9222
- size_t counter = 0;
8855
+ int counter = 0;
9223
8856
  new_size = 0;
9224
- auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
8857
+ auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, chunk_size,
8858
+ nrows, n_per_row, imatrix]() {
9225
8859
  std::array<int64_t, 1 << 4> local_hist = {};
8860
+ const int nrows_per_chunk = chunk_size / n_per_row;
9226
8861
  size_t local_size = 0;
9227
8862
  while (true) {
9228
8863
  std::unique_lock<std::mutex> lock(mutex);
9229
- size_t first = counter; counter += chunk_size;
9230
- if (first >= nelements) {
8864
+ int first_row = counter; counter += nrows_per_chunk;
8865
+ if (first_row >= nrows) {
9231
8866
  if (local_size > 0) {
9232
8867
  for (int j=0; j<int(local_hist.size()); ++j) {
9233
8868
  hist_cur[j] += local_hist[j];
@@ -9237,8 +8872,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9237
8872
  break;
9238
8873
  }
9239
8874
  lock.unlock();
9240
- size_t last = std::min(nelements, first + chunk_size);
9241
- local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
8875
+ const int this_nrow = std::min(nrows - first_row, nrows_per_chunk);
8876
+ local_size += ggml_quantize_chunk(new_type, f32_data, new_data,
8877
+ first_row * n_per_row, this_nrow, n_per_row, local_hist.data(), imatrix);
9242
8878
  }
9243
8879
  };
9244
8880
  for (int it = 0; it < nthread_use - 1; ++it) {
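The quantization loop above now hands out whole rows per chunk instead of raw element ranges, so the per-row importance matrix can be applied consistently. A worked example of the chunk arithmetic with assumed sizes (4096 columns by 32000 rows; these numbers are only for illustration, not from the diff):

    // worked example of the new row-based chunking (sizes are assumptions)
    #include <cstdio>

    int main() {
        const int n_per_row = 4096;   // columns per row (tensor->ne[0])
        const int nrows     = 32000;  // rows in the tensor
        const long long nelements = (long long) n_per_row * nrows;

        static const int min_chunk_size = 32 * 512; // 16384 values
        // a chunk is a whole number of rows covering at least min_chunk_size values
        const int chunk_size = n_per_row >= min_chunk_size
            ? n_per_row
            : n_per_row * ((min_chunk_size + n_per_row - 1) / n_per_row);

        const int nchunk = (int) ((nelements + chunk_size - 1) / chunk_size);

        // prints: chunk_size = 16384 values (4 rows), nchunk = 8000
        printf("chunk_size = %d values (%d rows), nchunk = %d\n",
               chunk_size, chunk_size / n_per_row, nchunk);
        return 0;
    }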
@@ -9249,7 +8885,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9249
8885
  workers.clear();
9250
8886
  }
9251
8887
 
9252
- LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
8888
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
9253
8889
  int64_t tot_count = 0;
9254
8890
  for (size_t i = 0; i < hist_cur.size(); i++) {
9255
8891
  hist_all[i] += hist_cur[i];
@@ -9257,6 +8893,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9257
8893
  }
9258
8894
 
9259
8895
  if (tot_count > 0) {
8896
+ LLAMA_LOG_INFO(" | hist: ");
9260
8897
  for (size_t i = 0; i < hist_cur.size(); i++) {
9261
8898
  LLAMA_LOG_INFO("%5.3f ", hist_cur[i] / float(nelements));
9262
8899
  }
@@ -9285,6 +8922,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
9285
8922
 
9286
8923
  fout.close();
9287
8924
 
8925
+ for (auto type : used_iq2) {
8926
+ ggml_deinit_iq2_quantization(type);
8927
+ }
8928
+
9288
8929
  gguf_free(ctx_out);
9289
8930
 
9290
8931
  LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
@@ -9342,48 +8983,23 @@ static int llama_apply_lora_from_file_internal(
9342
8983
 
9343
8984
  LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
9344
8985
 
9345
- // create a name -> tensor map of the model to accelerate lookups
9346
- // find the max tensor size to estimate the required temporary buffer size
9347
- size_t max_tensor_size = 0;
9348
- std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
9349
- for (const auto & kv : model.tensors_by_name) {
9350
- model_tensors.insert(kv);
9351
- size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
9352
- max_tensor_size = std::max(max_tensor_size, f32_size);
9353
- }
9354
-
9355
- // create a temporary ggml context to store the lora tensors
9356
- // TODO: use ggml-alloc
9357
- size_t lora_ctx_size = max_tensor_size * 3;
9358
- LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
9359
- std::vector<uint8_t> lora_buf(lora_ctx_size);
9360
-
9361
- struct ggml_init_params params;
9362
- params.mem_size = lora_buf.size();
9363
- params.mem_buffer = lora_buf.data();
9364
- params.no_alloc = false;
9365
-
9366
- using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;
9367
-
9368
- unique_context lora_ctx(nullptr, ggml_free);
9369
- lora_ctx.reset(ggml_init(params));
9370
- std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
9371
-
9372
8986
  // load base model
9373
8987
  std::unique_ptr<llama_model_loader> ml;
9374
-
9375
- if (path_base_model) {
8988
+ if (path_base_model) {
9376
8989
  LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
9377
8990
  ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
9378
- ml->init_mapping(false); // no prefetching
8991
+ ml->init_mapping(/*prefetch*/ false); // no prefetching
9379
8992
  }
9380
8993
 
9381
- // read tensors and apply
9382
- bool warned = false;
9383
- int n_tensors = 0;
9384
-
9385
- std::vector<uint8_t> work_buffer;
8994
+ struct tensor_meta {
8995
+ std::string name;
8996
+ ggml_type type;
8997
+ int32_t ne[2];
8998
+ size_t offset;
8999
+ };
9000
+ std::map<std::string, tensor_meta> tensor_meta_map;
9386
9001
 
9002
+ // load all tensor meta
9387
9003
  while (true) {
9388
9004
  if (fin.tell() == fin.size) {
9389
9005
  // eof
@@ -9396,7 +9012,7 @@ static int llama_apply_lora_from_file_internal(
9396
9012
 
9397
9013
  fin.read_raw(&n_dims, sizeof(n_dims));
9398
9014
  fin.read_raw(&name_len, sizeof(name_len));
9399
- fin.read_raw(&ftype, sizeof(ftype));
9015
+ fin.read_raw(&ftype, sizeof(ftype));
9400
9016
 
9401
9017
  if (n_dims != 1 && n_dims != 2) {
9402
9018
  LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
@@ -9410,31 +9026,23 @@ static int llama_apply_lora_from_file_internal(
9410
9026
 
9411
9027
  std::string name;
9412
9028
  {
9413
- GGML_ASSERT(name_len <= 1024);
9414
- char buf[1024];
9029
+ GGML_ASSERT(name_len < GGML_MAX_NAME);
9030
+ char buf[GGML_MAX_NAME];
9415
9031
  fin.read_raw(buf, name_len);
9416
9032
  name = std::string(buf, name_len);
9417
9033
  }
9418
9034
 
9419
- // check for lora suffix and get the type of tensor
9420
- const std::string lora_suffix = ".lora";
9421
- size_t pos = name.rfind(lora_suffix);
9422
- if (pos == std::string::npos) {
9423
- LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
9424
- return 1;
9035
+ // check for lora suffix
9036
+ std::string lora_suffix;
9037
+ if (name.length() > 6) {
9038
+ lora_suffix = name.substr(name.length() - 6);
9425
9039
  }
9426
-
9427
- std::string lora_type = name.substr(pos + lora_suffix.length());
9428
- std::string base_name = name;
9429
- base_name.erase(pos);
9430
- // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());
9431
-
9432
- if (model_tensors.find(base_name) == model_tensors.end()) {
9433
- LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
9040
+ if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
9041
+ LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
9434
9042
  return 1;
9435
9043
  }
9436
9044
 
9437
- // create ggml tensor
9045
+ // tensor type
9438
9046
  ggml_type wtype;
9439
9047
  switch (ftype) {
9440
9048
  case 0: wtype = GGML_TYPE_F32; break;
@@ -9446,122 +9054,177 @@ static int llama_apply_lora_from_file_internal(
9446
9054
  return false;
9447
9055
  }
9448
9056
  }
9449
- ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
9450
- ggml_set_name(lora_tensor, name.c_str());
9451
9057
 
9452
- // load tensor data
9058
+ // data offset
9453
9059
  size_t offset = fin.tell();
9454
- size_t tensor_data_size = ggml_nbytes(lora_tensor);
9455
9060
  offset = (offset + 31) & -32;
9456
- fin.seek(offset, SEEK_SET);
9457
- fin.read_raw(lora_tensor->data, tensor_data_size);
9458
9061
 
9459
- lora_tensors[name] = lora_tensor;
9062
+ // skip tensor data
9063
+ fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
9460
9064
 
9461
- // check if we have both A and B tensors and apply
9462
- if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
9463
- lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
9065
+ tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
9066
+ }
9464
9067
 
9465
- ggml_tensor * dest_t = model_tensors[base_name];
9068
+ bool warned = false;
9069
+ int n_tensors = 0;
9466
9070
 
9467
- offload_func_t offload_func = ggml_offload_nop;
9468
- offload_func_t offload_func_force_inplace = ggml_offload_nop;
9071
+ // apply
9072
+ ggml_backend_t backend_cpu = ggml_backend_cpu_init();
9073
+ if (backend_cpu == nullptr) {
9074
+ LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
9075
+ return 1;
9076
+ }
9077
+ ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
9469
9078
 
9470
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
9471
- if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
9472
- if (dest_t->type != GGML_TYPE_F16) {
9473
- throw std::runtime_error(format(
9474
- "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models. dest_t->type: %d", __func__, dest_t->type));
9475
- }
9476
- offload_func = ggml_cuda_assign_buffers;
9477
- offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
9478
- }
9479
- #endif // GGML_USE_CUBLAS
9079
+ std::vector<no_init<uint8_t>> read_buf;
9080
+ for (const auto & it : model.tensors_by_name) {
9081
+ const std::string & base_name = it.first;
9082
+ ggml_tensor * model_t = it.second;
9480
9083
 
9481
- ggml_tensor * base_t;
9482
- if (ml) {
9483
- struct gguf_context * ctx_gguf = ml->ctx_gguf;
9084
+ if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
9085
+ tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
9086
+ continue;
9087
+ }
9484
9088
 
9485
- // load from base model
9486
- if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
9487
- LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
9488
- return 1;
9489
- }
9089
+ tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
9090
+ tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
9490
9091
 
9491
- base_t = ml->get_tensor_meta(base_name.c_str());
9492
- ml->load_data_for(base_t);
9493
- } else {
9494
- base_t = dest_t;
9495
- }
9092
+ ggml_init_params lora_init_params = {
9093
+ /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
9094
+ /* .mem_buffer */ nullptr,
9095
+ /* .no_alloc */ true,
9096
+ };
9097
+ ggml_context * lora_ctx = ggml_init(lora_init_params);
9098
+ if (lora_ctx == nullptr) {
9099
+ LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
9100
+ ggml_backend_free(backend_cpu);
9101
+ return 1;
9102
+ }
9496
9103
 
9497
- if (ggml_is_quantized(base_t->type)) {
9498
- if (!warned) {
9499
- LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
9500
- "use a f16 or f32 base model with --lora-base\n", __func__);
9501
- warned = true;
9502
- }
9104
+ // create tensors
9105
+ ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
9106
+ ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
9107
+ ggml_set_name(loraA, metaA.name.c_str());
9108
+ ggml_set_name(loraB, metaB.name.c_str());
9109
+
9110
+ ggml_tensor * base_t;
9111
+ if (ml) {
9112
+ if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
9113
+ LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
9114
+ return 1;
9503
9115
  }
9116
+ base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
9117
+ } else {
9118
+ base_t = ggml_dup_tensor(lora_ctx, model_t);
9119
+ }
9120
+ ggml_set_name(base_t, base_name.c_str());
9504
9121
 
9505
- ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
9506
- GGML_ASSERT(loraA->type == GGML_TYPE_F32);
9507
- ggml_set_name(loraA, "loraA");
9122
+ // allocate in backend buffer
9123
+ ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
9124
+ if (lora_buf == nullptr) {
9125
+ LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
9126
+ return 1;
9127
+ }
9508
9128
 
9509
- ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
9510
- GGML_ASSERT(loraB->type == GGML_TYPE_F32);
9511
- ggml_set_name(loraB, "loraB");
9129
+ // load tensor data
9130
+ auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
9131
+ read_buf.resize(ggml_nbytes(tensor));
9132
+ fin.seek(tensor_meta.offset, SEEK_SET);
9133
+ fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
9134
+ ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
9135
+ };
9136
+ load_tensor(metaA, loraA);
9137
+ load_tensor(metaB, loraB);
9512
9138
 
9513
- if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
9514
- LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
9515
- " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
9516
- return 1;
9517
- }
9139
+ // load base model tensor data
9140
+ if (ml) {
9141
+ ml->load_data_for(base_t);
9142
+ } else {
9143
+ ggml_backend_tensor_copy(model_t, base_t);
9144
+ }
9518
9145
 
9146
+ if (ggml_is_quantized(base_t->type) && !warned) {
9147
+ LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
9148
+ "use a f16 or f32 base model with --lora-base\n", __func__);
9149
+ warned = true;
9150
+ }
9151
+
9152
+ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
9153
+ LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
9154
+ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
9155
+ ggml_free(lora_ctx);
9156
+ ggml_backend_buffer_free(lora_buf);
9157
+ ggml_backend_free(backend_cpu);
9158
+ return 1;
9159
+ }
9160
+
9161
+ auto build_lora_graph = [&]() {
9519
9162
  // w = w + BA*s
9520
- ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB);
9521
- offload_func(BA);
9163
+ ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
9522
9164
  ggml_set_name(BA, "BA");
9523
9165
 
9524
9166
  if (scaling != 1.0f) {
9525
- BA = ggml_scale_inplace(lora_ctx.get(), BA, scaling);
9526
- offload_func(BA);
9167
+ BA = ggml_scale(lora_ctx, BA, scaling);
9527
9168
  ggml_set_name(BA, "BA_scaled");
9528
9169
  }
9529
9170
 
9530
9171
  ggml_tensor * r;
9531
- if (base_t == dest_t) {
9532
- r = ggml_add_inplace(lora_ctx.get(), dest_t, BA);
9533
- offload_func_force_inplace(r);
9534
- ggml_set_name(r, "r_add_inplace");
9535
- }
9536
- else {
9537
- r = ggml_add(lora_ctx.get(), base_t, BA);
9538
- offload_func(r);
9539
- ggml_set_name(r, "r_add");
9172
+ r = ggml_add_inplace(lora_ctx, base_t, BA);
9173
+ ggml_set_name(r, "r_add");
9540
9174
 
9541
- r = ggml_cpy(lora_ctx.get(), r, dest_t);
9542
- offload_func(r);
9543
- ggml_set_name(r, "r_cpy");
9175
+ if (base_t->type != model_t->type) {
9176
+ // convert the result to the model type
9177
+ r = ggml_cast(lora_ctx, r, model_t->type);
9178
+ ggml_set_name(r, "r_cast");
9544
9179
  }
9545
9180
 
9546
- struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get());
9547
- ggml_build_forward_expand(gf, r);
9181
+ return r;
9182
+ };
9183
+
9184
+ ggml_cgraph * gf = ggml_new_graph(lora_ctx);
9185
+ ggml_tensor * r = build_lora_graph();
9186
+ ggml_build_forward_expand(gf, r);
9548
9187
 
9549
- ggml_graph_compute_helper(work_buffer, gf, n_threads);
9188
+ ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
9189
+ if (graph_buf == nullptr) {
9190
+ LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
9191
+ ggml_free(lora_ctx);
9192
+ ggml_backend_buffer_free(lora_buf);
9193
+ ggml_backend_free(backend_cpu);
9194
+ return 1;
9195
+ }
9550
9196
 
9551
- // the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other
9552
- GGML_ASSERT(lora_tensors.size() == 2);
9197
+ ggml_backend_graph_compute(backend_cpu, gf);
9553
9198
 
9554
- // we won't need these tensors again, reset the context to save memory
9555
- lora_ctx.reset(ggml_init(params));
9556
- lora_tensors.clear();
9199
+ ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
9557
9200
 
9558
- n_tensors++;
9559
- if (n_tensors % 4 == 0) {
9560
- LLAMA_LOG_INFO(".");
9561
- }
9201
+ #if 0
9202
+ // TODO: use scheduler with fallback to CPU for less copies between CPU and GPU
9203
+ //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
9204
+
9205
+ // sched compute
9206
+ ggml_build_forward_expand(gf, build_graph());
9207
+ ggml_backend_sched_init_measure(sched, gf);
9208
+
9209
+ // create the graph again, since the previous one was destroyed by the measure
9210
+ ggml_graph_clear(gf);
9211
+ ggml_build_forward_expand(gf, build_graph());
9212
+ ggml_backend_sched_graph_compute(sched, gf);
9213
+ ggml_backend_sched_free(sched);
9214
+ #endif
9215
+
9216
+ ggml_backend_buffer_free(lora_buf);
9217
+ ggml_backend_buffer_free(graph_buf);
9218
+ ggml_free(lora_ctx);
9219
+
9220
+ n_tensors++;
9221
+ if (n_tensors % 4 == 0) {
9222
+ LLAMA_LOG_INFO(".");
9562
9223
  }
9563
9224
  }
9564
9225
 
9226
+ ggml_backend_free(backend_cpu);
9227
+
9565
9228
  const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
9566
9229
  LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
9567
9230
 
@@ -9574,6 +9237,7 @@ static int llama_apply_lora_from_file_internal(
9574
9237
  struct llama_model_params llama_model_default_params() {
9575
9238
  struct llama_model_params result = {
9576
9239
  /*.n_gpu_layers =*/ 0,
9240
+ /*.split_mode =*/ LLAMA_SPLIT_LAYER,
9577
9241
  /*.main_gpu =*/ 0,
9578
9242
  /*.tensor_split =*/ nullptr,
9579
9243
  /*.progress_callback =*/ nullptr,
@@ -9585,7 +9249,8 @@ struct llama_model_params llama_model_default_params() {
9585
9249
  };
9586
9250
 
9587
9251
  #ifdef GGML_USE_METAL
9588
- result.n_gpu_layers = 1;
9252
+ // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
9253
+ result.n_gpu_layers = 999;
9589
9254
  #endif
9590
9255
 
9591
9256
  return result;
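Note: llama_model_params now carries split_mode (defaulting to LLAMA_SPLIT_LAYER), and Metal builds default to offloading every layer (n_gpu_layers = 999). Overriding the defaults at load time looks like this; llama_load_model_from_file, llama_free_model and the model path are assumed from the public llama.h of this release rather than shown in the hunk.

#include "llama.h"

int main() {
    llama_model_params mparams = llama_model_default_params();

    // new in this revision: split_mode defaults to LLAMA_SPLIT_LAYER, and Metal
    // builds default to n_gpu_layers = 999 (offload everything)
    mparams.n_gpu_layers = 33;                 // offload only part of the model
    mparams.split_mode   = LLAMA_SPLIT_NONE;   // keep all offloaded layers on one GPU
    mparams.main_gpu     = 0;

    llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == nullptr) {
        return 1;
    }

    llama_free_model(model);
    return 0;
}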
@@ -9625,6 +9290,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
9625
9290
  /*.quantize_output_tensor =*/ true,
9626
9291
  /*.only_copy =*/ false,
9627
9292
  /*.pure =*/ false,
9293
+ /*.imatrix =*/ nullptr,
9628
9294
  };
9629
9295
 
9630
9296
  return result;
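Note: llama_model_quantize_params gains an imatrix pointer, which stays nullptr unless importance-matrix data has been computed. A minimal call is sketched below; the ftype field, the LLAMA_FTYPE_MOSTLY_Q4_K_M constant and the file names are assumptions, not part of this hunk.

#include "llama.h"

int main() {
    llama_model_quantize_params qparams = llama_model_quantize_default_params();
    qparams.ftype = LLAMA_FTYPE_MOSTLY_Q4_K_M; // target quantization type
    // qparams.imatrix stays nullptr unless an importance matrix has been computed

    // returns 0 on success
    return (int) llama_model_quantize("model-f16.gguf", "model-q4_k_m.gguf", &qparams);
}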
@@ -9775,41 +9441,53 @@ struct llama_context * llama_new_context_with_model(
9775
9441
  GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
9776
9442
  GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
9777
9443
 
9778
- // reserve memory for context buffers
9779
9444
  if (!hparams.vocab_only) {
9780
- // initialize backend
9445
+ // initialize backends
9781
9446
  #ifdef GGML_USE_METAL
9782
9447
  if (model->n_gpu_layers > 0) {
9783
- ctx->backend = ggml_backend_metal_init();
9784
- if (ctx->backend == nullptr) {
9448
+ ctx->backend_metal = ggml_backend_metal_init();
9449
+ if (ctx->backend_metal == nullptr) {
9785
9450
  LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
9451
+ llama_free(ctx);
9452
+ return nullptr;
9786
9453
  }
9454
+ ctx->backends.push_back(ctx->backend_metal);
9787
9455
  }
9788
- #elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
9789
- // for testing only
9456
+ #elif defined(GGML_USE_CUBLAS)
9790
9457
  if (model->n_gpu_layers > 0) {
9791
- ctx->backend = ggml_backend_cuda_init(0);
9792
- if (ctx->backend == nullptr) {
9793
- LLAMA_LOG_ERROR("%s: failed to initialize CUDA backend\n", __func__);
9458
+ // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
9459
+ if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) {
9460
+ ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
9461
+ if (backend == nullptr) {
9462
+ LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
9463
+ llama_free(ctx);
9464
+ return nullptr;
9465
+ }
9466
+ ctx->backends.push_back(backend);
9467
+ } else {
9468
+ // LLAMA_SPLIT_LAYER requires a backend for each GPU
9469
+ for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
9470
+ ggml_backend_t backend = ggml_backend_cuda_init(device);
9471
+ if (backend == nullptr) {
9472
+ LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
9473
+ llama_free(ctx);
9474
+ return nullptr;
9475
+ }
9476
+ ctx->backends.push_back(backend);
9477
+ }
9794
9478
  }
9795
9479
  }
9796
9480
  #endif
9797
-
9798
- if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) {
9799
- ctx->backend = ggml_backend_cpu_init();
9800
- if (ctx->backend == nullptr) {
9801
- LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
9802
- }
9803
- }
9804
-
9805
- if (ctx->backend == nullptr) {
9806
- LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__);
9807
- delete ctx;
9481
+ ctx->backend_cpu = ggml_backend_cpu_init();
9482
+ if (ctx->backend_cpu == nullptr) {
9483
+ LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
9484
+ llama_free(ctx);
9808
9485
  return nullptr;
9809
9486
  }
9487
+ ctx->backends.push_back(ctx->backend_cpu);
9810
9488
 
9811
- if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v,
9812
- cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
9489
+ if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
9490
+ cparams.n_ctx, cparams.offload_kqv)) {
9813
9491
  LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
9814
9492
  llama_free(ctx);
9815
9493
  return nullptr;
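Note: a context now owns a vector of backends (Metal, one CUDA backend per device under LLAMA_SPLIT_LAYER, or only the main GPU for LLAMA_SPLIT_NONE/LLAMA_SPLIT_ROW, plus the CPU backend), and every failure path frees the context and returns nullptr. Callers are unaffected; a sketch of the usual flow, with llama_context_default_params and the parameter values assumed rather than taken from the hunk:

#include "llama.h"

// assumes `model` was loaded as in the earlier sketch
llama_context * make_context(llama_model * model) {
    llama_context_params cparams = llama_context_default_params();
    cparams.n_ctx       = 4096;
    cparams.n_batch     = 512;
    cparams.offload_kqv = true;   // keep the KV cache on the GPU backends when possible

    // returns nullptr if any backend (Metal, CUDA, CPU) or the KV cache fails to initialize
    return llama_new_context_with_model(model, cparams);
}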
@@ -9833,23 +9511,30 @@ struct llama_context * llama_new_context_with_model(
9833
9511
  ggml_type_name(type_v), (float)memory_size_v / (1024.0f * 1024.0f));
9834
9512
  }
9835
9513
 
9836
- // resized during inference
9837
- if (params.logits_all) {
9838
- ctx->logits.reserve(cparams.n_ctx*hparams.n_vocab);
9839
- } else {
9840
- ctx->logits.reserve(hparams.n_vocab);
9841
- }
9514
+ // resized during inference, reserve maximum
9515
+ ctx->logits.reserve(hparams.n_vocab*cparams.n_batch);
9842
9516
 
9843
9517
  if (params.embedding){
9844
9518
  ctx->embedding.resize(hparams.n_embd);
9845
9519
  }
9846
9520
 
9847
9521
  {
9848
- // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
9522
+ // buffer types used for the compute buffer of each backend
9523
+ std::vector<ggml_backend_buffer_type_t> backend_buft;
9524
+ for (auto * backend : ctx->backends) {
9525
+ if (ggml_backend_is_cpu(backend)) {
9526
+ // use host buffers for the CPU backend compute buffer
9527
+ backend_buft.push_back(llama_default_buffer_type_cpu(true));
9528
+ } else {
9529
+ backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
9530
+ }
9531
+ }
9532
+
9533
+ // buffer used to store the computation graph and the tensor meta data
9849
9534
  ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
9850
9535
 
9851
- // create measure allocator
9852
- ctx->alloc = ggml_allocr_new_measure_from_backend(ctx->backend);
9536
+ ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
9537
+ ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
9853
9538
 
9854
9539
  // build worst-case graph
9855
9540
  int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
@@ -9857,50 +9542,19 @@ struct llama_context * llama_new_context_with_model(
9857
9542
  llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
9858
9543
  ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
9859
9544
 
9860
- // measure memory requirements for the graph
9861
- size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf);
9862
-
9863
- LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute_meta.size() + alloc_size) / 1024.0 / 1024.0);
9864
-
9865
- // create allocator again with exact memory requirements
9866
- ggml_allocr_free(ctx->alloc);
9867
-
9868
- ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size);
9869
- ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
9870
- #if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
9871
- if (model->n_gpu_layers > 0) {
9872
- // the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets
9873
- ggml_cuda_set_scratch_size(alloc_size + 64);
9874
- LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
9875
-
9876
- // calculate total VRAM usage
9877
- auto add_tensor = [](const ggml_tensor * t, size_t & size) {
9878
- if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
9879
- size += ggml_nbytes(t);
9880
- }
9881
- };
9882
- size_t model_vram_size = 0;
9883
- for (const auto & kv : model->tensors_by_name) {
9884
- add_tensor(kv.second, model_vram_size);
9885
- }
9886
-
9887
- size_t kv_vram_size = 0;
9888
- for (auto & k : ctx->kv_self.k_l) {
9889
- add_tensor(k, kv_vram_size);
9890
- }
9891
- for (auto & v : ctx->kv_self.v_l) {
9892
- add_tensor(v, kv_vram_size);
9893
- }
9894
-
9895
- size_t ctx_vram_size = alloc_size + kv_vram_size;
9896
- size_t total_vram_size = model_vram_size + ctx_vram_size;
9545
+ // initialize scheduler with the worst-case graph
9546
+ ggml_backend_sched_init_measure(ctx->sched, gf);
9547
+ // note: the number of splits during measure is higher than during inference due to the kv shift
9548
+ int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
9549
+ LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
9550
+ ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
9897
9551
 
9898
- LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
9899
- total_vram_size / 1024.0 / 1024.0,
9900
- model_vram_size / 1024.0 / 1024.0,
9901
- ctx_vram_size / 1024.0 / 1024.0);
9552
+ for (ggml_backend_t backend : ctx->backends) {
9553
+ ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
9554
+ LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
9555
+ ggml_backend_buffer_name(buf),
9556
+ ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
9902
9557
  }
9903
- #endif
9904
9558
  }
9905
9559
  }
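Note: the measured allocator is replaced by ggml_backend_sched, so each backend gets its own compute buffer and the log reports one size per backend. A small helper in the same spirit, using only the scheduler accessors that appear in this hunk (the surrounding program is assumed to have created the scheduler and the backend list):

#include "ggml-backend.h"
#include <cstdio>
#include <vector>

// report the number of graph splits and the compute buffer reserved by the
// scheduler for each backend, mirroring the logging in the hunk above
static void print_sched_info(ggml_backend_sched_t sched,
                             const std::vector<ggml_backend_t> & backends) {
    printf("graph splits: %d\n", ggml_backend_sched_get_n_splits(sched));
    for (ggml_backend_t backend : backends) {
        ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(sched, backend);
        printf("%10s compute buffer size = %8.2f MiB\n",
               ggml_backend_buffer_name(buf),
               ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
    }
}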
9906
9560
 
@@ -9997,9 +9651,8 @@ int32_t llama_model_meta_val_str_by_index(const struct llama_model * model, int3
9997
9651
  }
9998
9652
 
9999
9653
  int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
10000
- return snprintf(buf, buf_size, "%s %s%s %s",
9654
+ return snprintf(buf, buf_size, "%s %s %s",
10001
9655
  llama_model_arch_name(model->arch).c_str(),
10002
- model->hparams.n_expert > 0 ? (std::to_string(model->hparams.n_expert) + "x").c_str() : "",
10003
9656
  llama_model_type_name(model->type),
10004
9657
  llama_model_ftype_name(model->ftype).c_str());
10005
9658
  }
@@ -10021,7 +9674,14 @@ uint64_t llama_model_n_params(const struct llama_model * model) {
10021
9674
  }
10022
9675
 
10023
9676
  struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
10024
- return ggml_get_tensor(model->ctx, name);
9677
+ auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
9678
+ [name](const std::pair<std::string, struct ggml_tensor *> & it) {
9679
+ return it.first == name;
9680
+ });
9681
+ if (it == model->tensors_by_name.end()) {
9682
+ return nullptr;
9683
+ }
9684
+ return it->second;
10025
9685
  }
10026
9686
 
10027
9687
  uint32_t llama_model_quantize(
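Note: llama_get_model_tensor now searches tensors_by_name instead of a single ggml context, since tensors may be spread across several backend buffers, and it returns nullptr for unknown names. A trivial caller; the tensor name below is illustrative only:

#include "llama.h"
#include <cstdio>

static void inspect_tensor(llama_model * model) {
    // hypothetical tensor name; returns nullptr if the model has no such tensor
    ggml_tensor * t = llama_get_model_tensor(model, "token_embd.weight");
    if (t == nullptr) {
        fprintf(stderr, "tensor not found\n");
        return;
    }
    printf("shape: %lld x %lld\n", (long long) t->ne[0], (long long) t->ne[1]);
}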
@@ -10199,19 +9859,18 @@ size_t llama_get_state_size(const struct llama_context * ctx) {
10199
9859
  // for reference, std::mt19937(1337) serializes to 6701 bytes.
10200
9860
  const size_t s_rng_size = sizeof(size_t);
10201
9861
  const size_t s_rng = LLAMA_MAX_RNG_STATE;
10202
- const size_t s_logits_capacity = sizeof(size_t);
10203
9862
  const size_t s_logits_size = sizeof(size_t);
9863
+ // assume worst case for logits although only currently set ones are serialized
10204
9864
  const size_t s_logits = ctx->logits.capacity() * sizeof(float);
10205
9865
  const size_t s_embedding_size = sizeof(size_t);
10206
9866
  const size_t s_embedding = ctx->embedding.size() * sizeof(float);
10207
9867
  const size_t s_kv_size = sizeof(size_t);
10208
9868
  const size_t s_kv_ntok = sizeof(int);
10209
- const size_t s_kv = ggml_backend_buffer_get_size(ctx->kv_self.buf);
9869
+ const size_t s_kv = ctx->kv_self.total_size();
10210
9870
 
10211
9871
  const size_t s_total = (
10212
9872
  + s_rng_size
10213
9873
  + s_rng
10214
- + s_logits_capacity
10215
9874
  + s_logits_size
10216
9875
  + s_logits
10217
9876
  + s_embedding_size
@@ -10280,37 +9939,27 @@ struct llama_data_file_context : llama_data_context {
10280
9939
  static void llama_copy_state_data_internal(struct llama_context * ctx, llama_data_context * data_ctx) {
10281
9940
  // copy rng
10282
9941
  {
10283
- std::stringstream rng_ss;
9942
+ std::ostringstream rng_ss;
10284
9943
  rng_ss << ctx->rng;
10285
9944
 
10286
- const size_t rng_size = rng_ss.str().size();
10287
- char rng_buf[LLAMA_MAX_RNG_STATE];
9945
+ const std::string & rng_str = rng_ss.str();
9946
+ const size_t rng_size = rng_str.size();
10288
9947
 
10289
- memset(&rng_buf[0], 0, LLAMA_MAX_RNG_STATE);
10290
- memcpy(&rng_buf[0], rng_ss.str().data(), rng_ss.str().size());
9948
+ GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
10291
9949
 
10292
- data_ctx->write(&rng_size, sizeof(rng_size));
10293
- data_ctx->write(&rng_buf[0], LLAMA_MAX_RNG_STATE);
9950
+ data_ctx->write(&rng_size, sizeof(rng_size));
9951
+ data_ctx->write(rng_str.data(), rng_size);
10294
9952
  }
10295
9953
 
10296
9954
  // copy logits
10297
9955
  {
10298
- const size_t logits_cap = ctx->logits.capacity();
10299
9956
  const size_t logits_size = ctx->logits.size();
10300
9957
 
10301
- data_ctx->write(&logits_cap, sizeof(logits_cap));
10302
9958
  data_ctx->write(&logits_size, sizeof(logits_size));
10303
9959
 
10304
9960
  if (logits_size) {
10305
9961
  data_ctx->write(ctx->logits.data(), logits_size * sizeof(float));
10306
9962
  }
10307
-
10308
- // If there is a gap between the size and the capacity, write padding
10309
- size_t padding_size = (logits_cap - logits_size) * sizeof(float);
10310
- if (padding_size > 0) {
10311
- std::vector<uint8_t> padding(padding_size, 0); // Create a buffer filled with zeros
10312
- data_ctx->write(padding.data(), padding_size);
10313
- }
10314
9963
  }
10315
9964
 
10316
9965
  // copy embeddings
@@ -10335,7 +9984,7 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
10335
9984
  const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
10336
9985
  const auto n_ctx = cparams.n_ctx;
10337
9986
 
10338
- const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
9987
+ const size_t kv_buf_size = kv_self.total_size();
10339
9988
  const uint32_t kv_head = kv_self.head;
10340
9989
  const uint32_t kv_size = kv_self.size;
10341
9990
  const uint32_t kv_used = kv_self.used;
@@ -10348,46 +9997,19 @@ static void llama_copy_state_data_internal(struct llama_context * ctx, llama_dat
10348
9997
  if (kv_buf_size) {
10349
9998
  const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
10350
9999
 
10351
- ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
10352
- ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
10353
-
10354
- std::vector<struct ggml_tensor *> kout2d(n_layer);
10355
- std::vector<struct ggml_tensor *> vout2d(n_layer);
10356
-
10357
- for (int il = 0; il < (int) n_layer; ++il) {
10358
- kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
10359
- vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
10360
-
10361
- ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
10362
- n_embd_k_gqa, kv_head,
10363
- elt_size*n_embd_k_gqa, 0);
10364
-
10365
- ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
10366
- kv_head, n_embd_v_gqa,
10367
- elt_size*n_ctx, 0);
10368
-
10369
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
10370
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il]));
10371
- }
10372
-
10373
- ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
10374
-
10375
- ggml_backend_graph_compute(ctx->backend, gf);
10376
-
10377
10000
  std::vector<uint8_t> tmp_buf;
10378
10001
  for (int il = 0; il < (int) n_layer; ++il) {
10379
- tmp_buf.resize(ggml_nbytes(kout2d[il]));
10380
- ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size());
10002
+ tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head);
10003
+ ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
10381
10004
  data_ctx->write(tmp_buf.data(), tmp_buf.size());
10382
10005
 
10383
- tmp_buf.resize(ggml_nbytes(vout2d[il]));
10384
- ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size());
10385
- data_ctx->write(tmp_buf.data(), tmp_buf.size());
10006
+ // v is not contiguous, copy row by row
10007
+ tmp_buf.resize(elt_size*kv_head);
10008
+ for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
10009
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
10010
+ data_ctx->write(tmp_buf.data(), tmp_buf.size());
10011
+ }
10386
10012
  }
10387
-
10388
- ggml_free(cpy_ctx);
10389
-
10390
- ggml_backend_buffer_free(buf);
10391
10013
  }
10392
10014
 
10393
10015
  for (uint32_t i = 0; i < kv_size; ++i) {
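Note: the state writer now reads the KV cache directly from the backend tensors: K is contiguous and written as one block per layer, while V is strided by n_ctx elements and therefore written row by row. The per-layer byte count implied by the loops above, as a sketch with the same variable names:

#include <cstddef>
#include <cstdint>

// bytes contributed by one layer of the KV cache in the serialized state,
// matching the loops above (elt_size = ggml_element_size(kv_self.k_l[0]))
static size_t kv_layer_state_size(size_t elt_size,
                                  uint32_t n_embd_k_gqa,
                                  uint32_t n_embd_v_gqa,
                                  uint32_t kv_head) {
    const size_t k_size     = elt_size * n_embd_k_gqa * kv_head;  // one contiguous K block
    const size_t v_row_size = elt_size * kv_head;                 // one V row
    return k_size + v_row_size * n_embd_v_gqa;                    // V written as n_embd_v_gqa rows
}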
@@ -10420,13 +10042,13 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
10420
10042
  // set rng
10421
10043
  {
10422
10044
  size_t rng_size;
10423
- char rng_buf[LLAMA_MAX_RNG_STATE];
10045
+ memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
10424
10046
 
10425
- memcpy(&rng_size, inp, sizeof(rng_size)); inp += sizeof(rng_size);
10426
- memcpy(&rng_buf[0], inp, LLAMA_MAX_RNG_STATE); inp += LLAMA_MAX_RNG_STATE;
10047
+ GGML_ASSERT(rng_size <= LLAMA_MAX_RNG_STATE);
10427
10048
 
10428
- std::stringstream rng_ss;
10429
- rng_ss.str(std::string(&rng_buf[0], rng_size));
10049
+ std::string rng_str((char *)inp, rng_size); inp += rng_size;
10050
+
10051
+ std::istringstream rng_ss(rng_str);
10430
10052
  rng_ss >> ctx->rng;
10431
10053
 
10432
10054
  GGML_ASSERT(!rng_ss.fail());
@@ -10434,20 +10056,18 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
10434
10056
 
10435
10057
  // set logits
10436
10058
  {
10437
- size_t logits_cap;
10438
10059
  size_t logits_size;
10439
10060
 
10440
- memcpy(&logits_cap, inp, sizeof(logits_cap)); inp += sizeof(logits_cap);
10441
10061
  memcpy(&logits_size, inp, sizeof(logits_size)); inp += sizeof(logits_size);
10442
10062
 
10443
- GGML_ASSERT(ctx->logits.capacity() == logits_cap);
10063
+ GGML_ASSERT(ctx->logits.capacity() >= logits_size);
10444
10064
 
10445
10065
  if (logits_size) {
10446
10066
  ctx->logits.resize(logits_size);
10067
+
10447
10068
  memcpy(ctx->logits.data(), inp, logits_size * sizeof(float));
10069
+ inp += logits_size * sizeof(float);
10448
10070
  }
10449
-
10450
- inp += logits_cap * sizeof(float);
10451
10071
  }
10452
10072
 
10453
10073
  // set embeddings
@@ -10486,48 +10106,22 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
10486
10106
  memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
10487
10107
 
10488
10108
  if (kv_buf_size) {
10489
- GGML_ASSERT(ggml_backend_buffer_get_size(kv_self.buf) == kv_buf_size);
10109
+ GGML_ASSERT(kv_self.total_size() == kv_buf_size);
10490
10110
 
10491
10111
  const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
10492
10112
 
10493
- ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
10494
- ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
10495
-
10496
- std::vector<struct ggml_tensor *> kin2d(n_layer);
10497
- std::vector<struct ggml_tensor *> vin2d(n_layer);
10498
-
10499
- for (int il = 0; il < n_layer; ++il) {
10500
- kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
10501
- vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
10502
-
10503
- ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
10504
- n_embd_k_gqa, kv_head,
10505
- elt_size*n_embd_k_gqa, 0);
10506
-
10507
- ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
10508
- kv_head, n_embd_v_gqa,
10509
- elt_size*n_ctx, 0);
10510
-
10511
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
10512
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d));
10513
- }
10514
-
10515
- ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
10516
-
10517
- // load data into the tensors
10518
- for (int il = 0; il < n_layer; ++il) {
10519
- ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il]));
10520
- inp += ggml_nbytes(kin2d[il]);
10521
-
10522
- ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il]));
10523
- inp += ggml_nbytes(vin2d[il]);
10113
+ for (int il = 0; il < (int) n_layer; ++il) {
10114
+ size_t k_size = elt_size*n_embd_k_gqa*kv_head;
10115
+ ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
10116
+ inp += k_size;
10117
+
10118
+ // v is not contiguous, copy row by row
10119
+ size_t v_row_size = elt_size*kv_head;
10120
+ for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
10121
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
10122
+ inp += v_row_size;
10123
+ }
10524
10124
  }
10525
-
10526
- ggml_backend_graph_compute(ctx->backend, gf);
10527
-
10528
- ggml_free(cpy_ctx);
10529
-
10530
- ggml_backend_buffer_free(buf);
10531
10125
  }
10532
10126
 
10533
10127
  ctx->kv_self.head = kv_head;
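Note: with the capacity and padding fields gone, a state blob is simply sized with llama_get_state_size and round-tripped through llama_copy_state_data / llama_set_state_data; blobs written by the previous layout use a different format. A hedged sketch of the round trip:

#include "llama.h"
#include <cstdint>
#include <vector>

// snapshot and restore the full context state (rng, logits, embeddings, KV cache)
static std::vector<uint8_t> save_state(llama_context * ctx) {
    std::vector<uint8_t> state(llama_get_state_size(ctx));     // upper bound
    const size_t written = llama_copy_state_data(ctx, state.data());
    state.resize(written);                                      // actual serialized size
    return state;
}

static void restore_state(llama_context * ctx, std::vector<uint8_t> & state) {
    llama_set_state_data(ctx, state.data());
}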
@@ -10843,6 +10437,8 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
10843
10437
  if (0 <= token && token < llama_n_vocab(model)) {
10844
10438
  switch (llama_vocab_get_type(model->vocab)) {
10845
10439
  case LLAMA_VOCAB_TYPE_SPM: {
10440
+ // NOTE: we accept all unsupported token types,
10441
+ // suppressing them like CONTROL tokens.
10846
10442
  if (llama_is_normal_token(model->vocab, token)) {
10847
10443
  std::string result = model->vocab.id_to_token[token].text;
10848
10444
  llama_unescape_whitespace(result);
@@ -10851,6 +10447,13 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
10851
10447
  }
10852
10448
  memcpy(buf, result.c_str(), result.length());
10853
10449
  return result.length();
10450
+ } else if (llama_is_user_defined_token(model->vocab, token)) {
10451
+ std::string result = model->vocab.id_to_token[token].text;
10452
+ if (length < (int) result.length()) {
10453
+ return -result.length();
10454
+ }
10455
+ memcpy(buf, result.c_str(), result.length());
10456
+ return result.length();
10854
10457
  } else if (llama_is_unknown_token(model->vocab, token)) { // NOLINT
10855
10458
  if (length < 3) {
10856
10459
  return -3;
@@ -10865,14 +10468,12 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
10865
10468
  }
10866
10469
  buf[0] = llama_token_to_byte(model->vocab, token);
10867
10470
  return 1;
10868
- } else {
10869
- // TODO: for now we accept all unsupported token types,
10870
- // suppressing them like CONTROL tokens.
10871
- // GGML_ASSERT(false);
10872
10471
  }
10873
10472
  break;
10874
10473
  }
10875
10474
  case LLAMA_VOCAB_TYPE_BPE: {
10475
+ // NOTE: we accept all unsupported token types,
10476
+ // suppressing them like CONTROL tokens.
10876
10477
  if (llama_is_normal_token(model->vocab, token)) {
10877
10478
  std::string result = model->vocab.id_to_token[token].text;
10878
10479
  result = llama_decode_text(result);
@@ -10881,12 +10482,15 @@ int32_t llama_token_to_piece(const struct llama_model * model, llama_token token
10881
10482
  }
10882
10483
  memcpy(buf, result.c_str(), result.length());
10883
10484
  return result.length();
10485
+ } else if (llama_is_user_defined_token(model->vocab, token)) {
10486
+ std::string result = model->vocab.id_to_token[token].text;
10487
+ if (length < (int) result.length()) {
10488
+ return -result.length();
10489
+ }
10490
+ memcpy(buf, result.c_str(), result.length());
10491
+ return result.length();
10884
10492
  } else if (llama_is_control_token(model->vocab, token)) {
10885
10493
  ;
10886
- } else {
10887
- // TODO: for now we accept all unsupported token types,
10888
- // suppressing them like CONTROL tokens.
10889
- // GGML_ASSERT(false);
10890
10494
  }
10891
10495
  break;
10892
10496
  }
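Note: the SPM and BPE paths now emit USER_DEFINED tokens verbatim instead of silently suppressing them, and, as before, a too-small buffer is reported as the negated required length. A common calling pattern; the four-argument signature matches this revision (later releases add parameters):

#include "llama.h"
#include <string>

// convert one token to text, growing the buffer when llama_token_to_piece
// reports (as a negative value) that more space is needed
static std::string token_to_piece(const llama_model * model, llama_token token) {
    std::string piece(8, '\0');
    int n = llama_token_to_piece(model, token, &piece[0], (int) piece.size());
    if (n < 0) {
        piece.resize(-n);
        n = llama_token_to_piece(model, token, &piece[0], (int) piece.size());
    }
    piece.resize(n < 0 ? 0 : n);
    return piece;
}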
@@ -10998,7 +10602,7 @@ void llama_log_set(ggml_log_callback log_callback, void * user_data) {
10998
10602
  g_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
10999
10603
  g_state.log_callback_user_data = user_data;
11000
10604
  #ifdef GGML_USE_METAL
11001
- ggml_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
10605
+ ggml_backend_metal_log_set_callback(g_state.log_callback, g_state.log_callback_user_data);
11002
10606
  #endif
11003
10607
  }
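Note: log routing is unchanged for callers; only the Metal hook moves to ggml_backend_metal_log_set_callback. Registering a callback still looks like this; the ggml_log_level parameter and the GGML_LOG_LEVEL_INFO constant are assumed from ggml.h of this release:

#include "llama.h"
#include <cstdio>

static void my_log(ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    // forward everything except plain info messages to stderr
    if (level != GGML_LOG_LEVEL_INFO) {
        fputs(text, stderr);
    }
}

int main() {
    llama_log_set(my_log, nullptr);
    // ... load models / run inference; library logs now go through my_log ...
    return 0;
}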
11004
10608