llama_cpp 0.5.0 → 0.5.2

@@ -1,8 +1,3 @@
- // Defines fileno on msys:
- #ifndef _GNU_SOURCE
- #define _GNU_SOURCE
- #endif
-
  #include "llama.h"
 
  #include "ggml.h"
@@ -126,6 +121,9 @@ void replace_all(std::string & s, const std::string & search, const std::string
  }
  s = std::move(result);
  }
+ #ifdef GGML_USE_CPU_HBM
+ #include <hbwmalloc.h>
+ #endif
 
  static void zeros(std::ofstream & file, size_t n) {
  char zero = 0;
@@ -157,6 +155,7 @@ static std::string format(const char * fmt, ...) {
  enum llm_arch {
  LLM_ARCH_LLAMA,
  LLM_ARCH_FALCON,
+ LLM_ARCH_BAICHUAN,
  LLM_ARCH_GPT2,
  LLM_ARCH_GPTJ,
  LLM_ARCH_GPTNEOX,
@@ -171,6 +170,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
  { LLM_ARCH_GPTJ, "gptj" },
  { LLM_ARCH_GPTNEOX, "gptneox" },
  { LLM_ARCH_MPT, "mpt" },
+ { LLM_ARCH_BAICHUAN,"baichuan" },
  };
 
  enum llm_kv {
@@ -311,6 +311,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
  },
  },
+ {
+ LLM_ARCH_BAICHUAN,
+ {
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+ { LLM_TENSOR_OUTPUT, "output" },
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+ },
+ },
  {
  LLM_ARCH_FALCON,
  {
@@ -325,6 +344,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
325
344
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
326
345
  },
327
346
  },
347
+ {
348
+ LLM_ARCH_GPT2,
349
+ {
350
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
351
+ },
352
+ },
353
+ {
354
+ LLM_ARCH_GPTJ,
355
+ {
356
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
357
+ },
358
+ },
359
+ {
360
+ LLM_ARCH_GPTNEOX,
361
+ {
362
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
363
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
364
+ { LLM_TENSOR_OUTPUT, "output" },
365
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
366
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
367
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
368
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
369
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
370
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
371
+ },
372
+ },
373
+ {
374
+ LLM_ARCH_MPT,
375
+ {
376
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
377
+ },
378
+ },
379
+ {
380
+ LLM_ARCH_UNKNOWN,
381
+ {
382
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
383
+ },
384
+ },
328
385
  };
329
386
 
330
387
  static llm_arch llm_arch_from_string(const std::string & name) {
@@ -412,6 +469,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
  #elif GGML_USE_METAL
  # define llama_host_malloc(n) ggml_metal_host_malloc(n)
  # define llama_host_free(data) ggml_metal_host_free(data)
+ #elif GGML_USE_CPU_HBM
+ # define llama_host_malloc(n) hbw_malloc(n)
+ # define llama_host_free(data) if (data != NULL) hbw_free(data)
  #else
  # define llama_host_malloc(n) malloc(n)
  # define llama_host_free(data) free(data)
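
Note: the new GGML_USE_CPU_HBM branch routes host buffers through hbwmalloc, the memkind allocator for high-bandwidth (MCDRAM/HBM) memory, instead of plain malloc. A minimal sketch of the same fallback chain, assuming the hbwmalloc headers and library are present whenever that flag is defined:

// Sketch only: mirrors the llama_host_malloc/llama_host_free selection above.
#include <cstdio>
#include <cstdlib>
#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>   // provided by the memkind package
#endif

static void * host_malloc(size_t n) {
#ifdef GGML_USE_CPU_HBM
    return hbw_malloc(n);   // allocate from high-bandwidth memory
#else
    return malloc(n);       // regular heap fallback
#endif
}

static void host_free(void * data) {
    if (data == NULL) {
        return;
    }
#ifdef GGML_USE_CPU_HBM
    hbw_free(data);
#else
    free(data);
#endif
}

int main() {
    void * buf = host_malloc(1u << 20);
    printf("allocated 1 MiB at %p\n", buf);
    host_free(buf);
    return 0;
}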
@@ -568,16 +628,16 @@ struct llama_mmap {
 
  if (prefetch > 0) {
  // Advise the kernel to preload the mapped memory
- if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
- fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+ if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
+ fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
  strerror(errno));
  }
  }
  if (numa) {
  // advise the kernel not to use readahead
  // (because the next page might not belong on the same node)
- if (madvise(addr, file->size, MADV_RANDOM)) {
- fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+ if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
+ fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
  strerror(errno));
  }
  }
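
Note: switching from madvise to posix_madvise makes the mmap prefetch/readahead hints portable to any POSIX system (madvise is a BSD/Linux extension). A self-contained sketch of the same hints; note that posix_madvise returns the error number rather than setting errno:

// Sketch: POSIX-portable prefetch/readahead hints on an mmap'ed file.
#include <sys/mman.h>
#include <sys/stat.h>
#include <fcntl.h>
#include <unistd.h>
#include <cstdio>
#include <cstring>

int main(int argc, char ** argv) {
    if (argc < 2) return 1;
    int fd = open(argv[1], O_RDONLY);
    if (fd < 0) { perror("open"); return 1; }
    struct stat st;
    if (fstat(fd, &st) != 0) { perror("fstat"); return 1; }

    void * addr = mmap(NULL, (size_t) st.st_size, PROT_READ, MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) { perror("mmap"); return 1; }

    int err = posix_madvise(addr, (size_t) st.st_size, POSIX_MADV_WILLNEED); // ask kernel to prefetch
    if (err != 0) {
        fprintf(stderr, "posix_madvise(WILLNEED) failed: %s\n", strerror(err));
    }
    err = posix_madvise(addr, (size_t) st.st_size, POSIX_MADV_RANDOM);       // disable readahead
    if (err != 0) {
        fprintf(stderr, "posix_madvise(RANDOM) failed: %s\n", strerror(err));
    }

    munmap(addr, (size_t) st.st_size);
    close(fd);
    return 0;
}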
@@ -620,7 +680,6 @@ struct llama_mmap {
  if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
  fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
  llama_format_win_err(GetLastError()).c_str());
- }
  }
  #else
  #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -1446,7 +1505,11 @@ struct llama_model_loader {
  // allocate temp buffer if not using mmap
  if (!use_mmap && cur->data == NULL) {
  GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
- cur->data = malloc(ggml_nbytes(cur));
+ #ifdef GGML_USE_CPU_HBM
+ cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
+ #else
+ cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
+ #endif
  }
 
  load_data_for(cur);
@@ -1600,9 +1663,13 @@ static void llm_load_hparams(
 
  GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
 
- if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
- throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+ if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+ if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
+ throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+ }
  }
+ // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
+ // gpt-j n_rot = rotary_dim
  }
 
  // arch-specific KVs
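
Note: the n_rot sanity check is now restricted to LLaMA and Falcon because other architectures legitimately rotate only part of each head. A worked example of the GPT-NeoX formula mentioned in the comment, with purely illustrative hyperparameters:

// Worked example for the relaxed n_rot check: only LLaMA/Falcon require
// n_rot == n_embd / n_head. The values below are hypothetical GPT-NeoX-style
// hyperparameters, shown only to illustrate the formula.
#include <cstdint>
#include <cstdio>

int main() {
    const uint32_t n_embd     = 4096;
    const uint32_t n_head     = 32;
    const float    rotary_pct = 0.25f;  // gpt-neox style: fraction of each head that is rotated

    const uint32_t head_dim   = n_embd / n_head;                    // 128
    const uint32_t n_rot_neox = (uint32_t)(rotary_pct * head_dim);  // 32: only part of the head gets RoPE

    printf("head_dim = %u, llama/falcon n_rot = %u, gpt-neox n_rot = %u\n",
           head_dim, head_dim, n_rot_neox);
    return 0;
}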
@@ -1631,6 +1698,15 @@ static void llm_load_hparams(
  default: model.type = e_model::MODEL_UNKNOWN;
  }
  } break;
+ case LLM_ARCH_BAICHUAN:
+ {
+ GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
+ switch (hparams.n_layer) {
+ case 32: model.type = e_model::MODEL_7B; break;
+ case 40: model.type = e_model::MODEL_13B; break;
+ default: model.type = e_model::MODEL_UNKNOWN;
+ }
+ } break;
  default: (void)0;
  };
 
@@ -1871,7 +1947,6 @@ static void llm_load_tensors(
1871
1947
  const int64_t n_vocab = hparams.n_vocab;
1872
1948
 
1873
1949
  const auto tn = LLM_TN(model.arch);
1874
-
1875
1950
  switch (model.arch) {
1876
1951
  case LLM_ARCH_LLAMA:
1877
1952
  {
@@ -1914,6 +1989,72 @@ static void llm_load_tensors(
1914
1989
 
1915
1990
  model.layers.resize(n_layer);
1916
1991
 
1992
+ for (uint32_t i = 0; i < n_layer; ++i) {
1993
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
1994
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
1995
+
1996
+ auto & layer = model.layers[i];
1997
+
1998
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
1999
+
2000
+ layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
2001
+ layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
2002
+ layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
2003
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2004
+
2005
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2006
+
2007
+ layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
2008
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
2009
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2010
+
2011
+ if (backend == GGML_BACKEND_GPU) {
2012
+ vram_weights +=
2013
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
2014
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
2015
+ ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
2016
+ }
2017
+ }
2018
+ } break;
2019
+ case LLM_ARCH_BAICHUAN:
2020
+ {
2021
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2022
+ {
2023
+ ggml_backend backend_norm;
2024
+ ggml_backend backend_output;
2025
+
2026
+ if (n_gpu_layers > int(n_layer)) {
2027
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2028
+ // on Windows however this is detrimental unless everything is on the GPU
2029
+ #ifndef _WIN32
2030
+ backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2031
+ #else
2032
+ backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2033
+ #endif // _WIN32
2034
+
2035
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2036
+ } else {
2037
+ backend_norm = GGML_BACKEND_CPU;
2038
+ backend_output = GGML_BACKEND_CPU;
2039
+ }
2040
+
2041
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2042
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2043
+
2044
+ if (backend_norm == GGML_BACKEND_GPU) {
2045
+ vram_weights += ggml_nbytes(model.output_norm);
2046
+ }
2047
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2048
+ vram_weights += ggml_nbytes(model.output);
2049
+ }
2050
+ }
2051
+
2052
+ const uint32_t n_ff = hparams.n_ff;
2053
+
2054
+ const int i_gpu_start = n_layer - n_gpu_layers;
2055
+
2056
+ model.layers.resize(n_layer);
2057
+
1917
2058
  for (uint32_t i = 0; i < n_layer; ++i) {
1918
2059
  const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
1919
2060
  const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
@@ -2071,95 +2212,427 @@ static void llm_load_tensors(
2071
2212
  vram_kv_cache += hparams.kv_size() / 2;
2072
2213
  }
2073
2214
  }
2074
- #elif defined(GGML_USE_CLBLAST)
2075
- const int max_backend_supported_layers = hparams.n_layer + 1;
2076
- const int max_offloadable_layers = hparams.n_layer + 1;
2077
- #endif // GGML_USE_CUBLAS
2215
+ #elif defined(GGML_USE_CLBLAST)
2216
+ const int max_backend_supported_layers = hparams.n_layer + 1;
2217
+ const int max_offloadable_layers = hparams.n_layer + 1;
2218
+ #endif // GGML_USE_CUBLAS
2219
+
2220
+ LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
2221
+ __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2222
+ LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
2223
+ __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
2224
+ #else
2225
+ (void) n_gpu_layers;
2226
+ #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2227
+ }
2228
+
2229
+ // populate `tensors_by_name`
2230
+ for (int i = 0; i < ml.n_tensors; ++i) {
2231
+ struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
2232
+ model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
2233
+ }
2234
+
2235
+ (void) tensor_split;
2236
+ #if defined(GGML_USE_CUBLAS)
2237
+ {
2238
+ ggml_cuda_set_tensor_split(tensor_split);
2239
+ }
2240
+ #endif
2241
+
2242
+ ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
2243
+
2244
+ if (progress_callback) {
2245
+ progress_callback(1.0f, progress_callback_user_data);
2246
+ }
2247
+
2248
+ model.mapping = std::move(ml.mapping);
2249
+
2250
+ // loading time will be recalculate after the first eval, so
2251
+ // we take page faults deferred by mmap() into consideration
2252
+ model.t_load_us = ggml_time_us() - model.t_start_us;
2253
+ }
2254
+
2255
+ static bool llama_model_load(
2256
+ const std::string & fname,
2257
+ llama_model & model,
2258
+ int n_ctx,
2259
+ int n_batch,
2260
+ int n_gpu_layers,
2261
+ int main_gpu,
2262
+ const float * tensor_split,
2263
+ const bool mul_mat_q,
2264
+ float rope_freq_base,
2265
+ float rope_freq_scale,
2266
+ bool low_vram,
2267
+ ggml_type memory_type,
2268
+ bool use_mmap,
2269
+ bool use_mlock,
2270
+ bool vocab_only,
2271
+ llama_progress_callback progress_callback,
2272
+ void *progress_callback_user_data) {
2273
+ try {
2274
+ std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
2275
+
2276
+ llm_load_arch (*ml, model);
2277
+ llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
2278
+ llm_load_vocab (*ml, model);
2279
+
2280
+ llm_load_print_meta(*ml, model);
2281
+
2282
+ if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
2283
+ throw std::runtime_error("vocab size mismatch");
2284
+ }
2285
+
2286
+ if (vocab_only) {
2287
+ LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
2288
+ return true;
2289
+ }
2290
+
2291
+ llm_load_tensors(
2292
+ *ml, model, n_batch, n_gpu_layers,
2293
+ main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
2294
+ use_mlock, progress_callback, progress_callback_user_data);
2295
+ } catch (const std::exception & err) {
2296
+ LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
2297
+ return false;
2298
+ }
2299
+
2300
+ return true;
2301
+ }
2302
+
2303
+ static struct ggml_cgraph * llm_build_llama(
2304
+ llama_context & lctx,
2305
+ const llama_token * tokens,
2306
+ const float * embd,
2307
+ int n_tokens,
2308
+ int n_past) {
2309
+
2310
+ GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
2311
+
2312
+ const int N = n_tokens;
2313
+
2314
+ const auto & model = lctx.model;
2315
+ const auto & hparams = model.hparams;
2316
+
2317
+ const auto & kv_self = lctx.kv_self;
2318
+
2319
+ GGML_ASSERT(!!kv_self.ctx);
2320
+
2321
+ const int64_t n_embd = hparams.n_embd;
2322
+ const int64_t n_layer = hparams.n_layer;
2323
+ const int64_t n_ctx = hparams.n_ctx;
2324
+ const int64_t n_head = hparams.n_head;
2325
+ const int64_t n_head_kv = hparams.n_head_kv;
2326
+ const int64_t n_embd_head = hparams.n_embd_head();
2327
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
2328
+
2329
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
2330
+
2331
+ const float freq_base = hparams.rope_freq_base;
2332
+ const float freq_scale = hparams.rope_freq_scale;
2333
+ const float norm_rms_eps = hparams.f_norm_rms_eps;
2334
+
2335
+ const int n_gpu_layers = model.n_gpu_layers;
2336
+
2337
+ auto & buf_compute = lctx.buf_compute;
2338
+
2339
+ struct ggml_init_params params = {
2340
+ /*.mem_size =*/ buf_compute.size,
2341
+ /*.mem_buffer =*/ buf_compute.data,
2342
+ /*.no_alloc =*/ false,
2343
+ };
2344
+
2345
+ params.no_alloc = true;
2346
+
2347
+ struct ggml_context * ctx0 = ggml_init(params);
2348
+
2349
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
2350
+
2351
+ struct ggml_tensor * cur;
2352
+ struct ggml_tensor * inpL;
2353
+
2354
+ if (tokens) {
2355
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
2356
+
2357
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
2358
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2359
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
2360
+ }
2361
+ ggml_set_name(inp_tokens, "inp_tokens");
2362
+
2363
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
2364
+ } else {
2365
+ #ifdef GGML_USE_MPI
2366
+ GGML_ASSERT(false && "not implemented");
2367
+ #endif
2368
+
2369
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
2370
+
2371
+ ggml_allocr_alloc(lctx.alloc, inpL);
2372
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2373
+ memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
2374
+ }
2375
+ }
2376
+
2377
+ const int i_gpu_start = n_layer - n_gpu_layers;
2378
+ (void) i_gpu_start;
2379
+
2380
+ // offload functions set the tensor output backend to GPU
2381
+ // tensors are GPU-accelerated if any input or the output has been offloaded
2382
+ //
2383
+ // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
2384
+ // in that case ggml_cuda_assign_buffers has no effect
2385
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
2386
+ offload_func_t offload_func_kq = llama_nop;
2387
+ offload_func_t offload_func_v = llama_nop;
2388
+
2389
+ #ifdef GGML_USE_CUBLAS
2390
+ if (n_gpu_layers > n_layer) {
2391
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
2392
+ }
2393
+ if (n_gpu_layers > n_layer + 1) {
2394
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
2395
+ }
2396
+ if (n_gpu_layers > n_layer + 2) {
2397
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
2398
+ }
2399
+ #endif // GGML_USE_CUBLAS
2400
+
2401
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
2402
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
2403
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2404
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
2405
+ }
2406
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
2407
+
2408
+ for (int il = 0; il < n_layer; ++il) {
2409
+ ggml_format_name(inpL, "layer_inp_%d", il);
2410
+
2411
+ offload_func_t offload_func = llama_nop;
2412
+
2413
+ #ifdef GGML_USE_CUBLAS
2414
+ if (il >= i_gpu_start) {
2415
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
2416
+ }
2417
+ #endif // GGML_USE_CUBLAS
2418
+
2419
+ struct ggml_tensor * inpSA = inpL;
2420
+
2421
+ // norm
2422
+ {
2423
+ cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
2424
+ offload_func(cur);
2425
+ ggml_set_name(cur, "rms_norm_0");
2426
+
2427
+ // cur = cur*attn_norm(broadcasted)
2428
+ cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
2429
+ offload_func(cur);
2430
+ ggml_set_name(cur, "attention_norm_0");
2431
+ }
2432
+
2433
+ // self-attention
2434
+ {
2435
+ // compute Q and K and RoPE them
2436
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
2437
+ offload_func_kq(tmpk);
2438
+ ggml_set_name(tmpk, "tmpk");
2439
+
2440
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
2441
+ offload_func_kq(tmpq);
2442
+ ggml_set_name(tmpq, "tmpq");
2443
+
2444
+ struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2445
+ offload_func_kq(Kcur);
2446
+ ggml_set_name(Kcur, "Kcur");
2447
+
2448
+ struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2449
+ offload_func_kq(Qcur);
2450
+ ggml_set_name(Qcur, "Qcur");
2451
+
2452
+ // store key and value to memory
2453
+ {
2454
+ // compute the transposed [N, n_embd] V matrix
2455
+
2456
+ struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
2457
+ offload_func_v(tmpv);
2458
+ ggml_set_name(tmpv, "tmpv");
2459
+
2460
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
2461
+ offload_func_v(Vcur);
2462
+ ggml_set_name(Vcur, "Vcur");
2463
+
2464
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
2465
+ offload_func_kq(k);
2466
+ ggml_set_name(k, "k");
2467
+
2468
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
2469
+ ( n_ctx)*ggml_element_size(kv_self.v),
2470
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
2471
+ offload_func_v(v);
2472
+ ggml_set_name(v, "v");
2473
+
2474
+ // important: storing RoPE-ed version of K in the KV cache!
2475
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
2476
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
2477
+ }
2478
+
2479
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
2480
+ offload_func_kq(Q);
2481
+ ggml_set_name(Q, "Q");
2482
+
2483
+ struct ggml_tensor * K =
2484
+ ggml_view_3d(ctx0, kv_self.k,
2485
+ n_embd_head, n_past + N, n_head_kv,
2486
+ ggml_element_size(kv_self.k)*n_embd_gqa,
2487
+ ggml_element_size(kv_self.k)*n_embd_head,
2488
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
2489
+ offload_func_kq(K);
2490
+ ggml_set_name(K, "K");
2491
+
2492
+ // K * Q
2493
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
2494
+ offload_func_kq(KQ);
2495
+ ggml_set_name(KQ, "KQ");
2496
+
2497
+ // KQ_scaled = KQ / sqrt(n_embd_head)
2498
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
2499
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
2500
+ offload_func_kq(KQ_scaled);
2501
+ ggml_set_name(KQ_scaled, "KQ_scaled");
2502
+
2503
+ // KQ_masked = mask_past(KQ_scaled)
2504
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2505
+ offload_func_kq(KQ_masked);
2506
+ ggml_set_name(KQ_masked, "KQ_masked");
2507
+
2508
+ // KQ = soft_max(KQ_masked)
2509
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
2510
+ offload_func_v(KQ_soft_max);
2511
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
2512
+
2513
+ // split cached V into n_head heads
2514
+ struct ggml_tensor * V =
2515
+ ggml_view_3d(ctx0, kv_self.v,
2516
+ n_past + N, n_embd_head, n_head_kv,
2517
+ ggml_element_size(kv_self.v)*n_ctx,
2518
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
2519
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
2520
+ offload_func_v(V);
2521
+ ggml_set_name(V, "V");
2522
+
2523
+ #if 1
2524
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
2525
+ offload_func_v(KQV);
2526
+ ggml_set_name(KQV, "KQV");
2527
+ #else
2528
+ // make V contiguous in memory to speed up the matmul, however we waste time on the copy
2529
+ // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
2530
+ // is there a better way?
2531
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
2532
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
2533
+ #endif
2534
+
2535
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
2536
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
2537
+ offload_func_v(KQV_merged);
2538
+ ggml_set_name(KQV_merged, "KQV_merged");
2539
+
2540
+ // cur = KQV_merged.contiguous().view(n_embd, N)
2541
+ cur = ggml_cpy(ctx0,
2542
+ KQV_merged,
2543
+ ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
2544
+ offload_func_v(cur);
2545
+ ggml_set_name(cur, "KQV_merged_contiguous");
2546
+
2547
+ // projection (no bias)
2548
+ cur = ggml_mul_mat(ctx0,
2549
+ model.layers[il].wo,
2550
+ cur);
2551
+ offload_func(cur);
2552
+ ggml_set_name(cur, "result_wo");
2553
+ }
2078
2554
 
2079
- LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
2080
- __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2081
- LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
2082
- __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
2083
- #else
2084
- (void) n_gpu_layers;
2085
- #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2086
- }
2555
+ struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
2556
+ offload_func(inpFF);
2557
+ ggml_set_name(inpFF, "inpFF");
2087
2558
 
2088
- // populate `tensors_by_name`
2089
- for (int i = 0; i < ml.n_tensors; ++i) {
2090
- struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
2091
- model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
2092
- }
2559
+ // feed-forward network
2560
+ {
2561
+ // norm
2562
+ {
2563
+ cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
2564
+ offload_func(cur);
2565
+ ggml_set_name(cur, "rms_norm_1");
2093
2566
 
2094
- (void) tensor_split;
2095
- #if defined(GGML_USE_CUBLAS)
2096
- {
2097
- ggml_cuda_set_tensor_split(tensor_split);
2098
- }
2099
- #endif
2567
+ // cur = cur*ffn_norm(broadcasted)
2568
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
2569
+ offload_func(cur);
2570
+ ggml_set_name(cur, "ffn_norm");
2571
+ }
2100
2572
 
2101
- ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
2573
+ struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
2574
+ model.layers[il].w3,
2575
+ cur);
2576
+ offload_func(tmp);
2577
+ ggml_set_name(tmp, "result_w3");
2102
2578
 
2103
- if (progress_callback) {
2104
- progress_callback(1.0f, progress_callback_user_data);
2105
- }
2579
+ cur = ggml_mul_mat(ctx0,
2580
+ model.layers[il].w1,
2581
+ cur);
2582
+ offload_func(cur);
2583
+ ggml_set_name(cur, "result_w1");
2106
2584
 
2107
- model.mapping = std::move(ml.mapping);
2585
+ // SILU activation
2586
+ cur = ggml_silu(ctx0, cur);
2587
+ offload_func(cur);
2588
+ ggml_set_name(cur, "silu");
2108
2589
 
2109
- // loading time will be recalculate after the first eval, so
2110
- // we take page faults deferred by mmap() into consideration
2111
- model.t_load_us = ggml_time_us() - model.t_start_us;
2112
- }
2590
+ cur = ggml_mul(ctx0, cur, tmp);
2591
+ offload_func(cur);
2592
+ ggml_set_name(cur, "silu_x_result_w3");
2113
2593
 
2114
- static bool llama_model_load(
2115
- const std::string & fname,
2116
- llama_model & model,
2117
- int n_ctx,
2118
- int n_batch,
2119
- int n_gpu_layers,
2120
- int main_gpu,
2121
- const float * tensor_split,
2122
- const bool mul_mat_q,
2123
- float rope_freq_base,
2124
- float rope_freq_scale,
2125
- bool low_vram,
2126
- ggml_type memory_type,
2127
- bool use_mmap,
2128
- bool use_mlock,
2129
- bool vocab_only,
2130
- llama_progress_callback progress_callback,
2131
- void *progress_callback_user_data) {
2132
- try {
2133
- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
2594
+ cur = ggml_mul_mat(ctx0,
2595
+ model.layers[il].w2,
2596
+ cur);
2597
+ offload_func(cur);
2598
+ ggml_set_name(cur, "result_w2");
2599
+ }
2134
2600
 
2135
- llm_load_arch (*ml, model);
2136
- llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
2137
- llm_load_vocab (*ml, model);
2601
+ cur = ggml_add(ctx0, cur, inpFF);
2602
+ offload_func(cur);
2603
+ ggml_set_name(cur, "inpFF_+_result_w2");
2138
2604
 
2139
- llm_load_print_meta(*ml, model);
2605
+ // input for next layer
2606
+ inpL = cur;
2607
+ }
2140
2608
 
2141
- if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
2142
- throw std::runtime_error("vocab size mismatch");
2143
- }
2609
+ cur = inpL;
2144
2610
 
2145
- if (vocab_only) {
2146
- LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
2147
- return true;
2148
- }
2611
+ // norm
2612
+ {
2613
+ cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
2614
+ offload_func_nr(cur);
2615
+ ggml_set_name(cur, "rms_norm_2");
2149
2616
 
2150
- llm_load_tensors(
2151
- *ml, model, n_batch, n_gpu_layers,
2152
- main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
2153
- use_mlock, progress_callback, progress_callback_user_data);
2154
- } catch (const std::exception & err) {
2155
- LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
2156
- return false;
2617
+ // cur = cur*norm(broadcasted)
2618
+ cur = ggml_mul(ctx0, cur, model.output_norm);
2619
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
2620
+ ggml_set_name(cur, "result_norm");
2157
2621
  }
2158
2622
 
2159
- return true;
2623
+ // lm_head
2624
+ cur = ggml_mul_mat(ctx0, model.output, cur);
2625
+ ggml_set_name(cur, "result_output");
2626
+
2627
+ ggml_build_forward_expand(gf, cur);
2628
+
2629
+ ggml_free(ctx0);
2630
+
2631
+ return gf;
2160
2632
  }
2161
2633
 
2162
- static struct ggml_cgraph * llm_build_llama(
2634
+
2635
+ static struct ggml_cgraph * llm_build_baichaun(
2163
2636
  llama_context & lctx,
2164
2637
  const llama_token * tokens,
2165
2638
  const float * embd,
@@ -2300,11 +2773,24 @@ static struct ggml_cgraph * llm_build_llama(
  offload_func_kq(tmpq);
  ggml_set_name(tmpq, "tmpq");
 
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+ struct ggml_tensor * Kcur;
+ struct ggml_tensor * Qcur;
+ switch (model.type) {
+ case MODEL_7B:
+ Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+ Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
+ break;
+ case MODEL_13B:
+ Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
+ Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
+ break;
+ default:
+ GGML_ASSERT(false);
+ }
+
  offload_func_kq(Kcur);
  ggml_set_name(Kcur, "Kcur");
 
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
  offload_func_kq(Qcur);
  ggml_set_name(Qcur, "Qcur");
 
@@ -2359,10 +2845,26 @@ static struct ggml_cgraph * llm_build_llama(
  offload_func_kq(KQ_scaled);
  ggml_set_name(KQ_scaled, "KQ_scaled");
 
+ struct ggml_tensor * KQ_masked;
+ struct ggml_tensor * KQ_scaled_alibi;
+
+ switch (model.type) {
+ case MODEL_7B:
+ KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+ break;
+ case MODEL_13B:
+ KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
+ KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+ break;
+ default:
+ GGML_ASSERT(false);
+ }
  // KQ_masked = mask_past(KQ_scaled)
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
- offload_func_kq(KQ_masked);
- ggml_set_name(KQ_masked, "KQ_masked");
+ // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
+ // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
+ // offload_func_kq(KQ_masked);
+ // ggml_set_name(KQ_masked, "KQ_masked");
 
  // KQ = soft_max(KQ_masked)
  struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
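
Note: these two switches (here and in the Kcur/Qcur block above) are what distinguish llm_build_baichaun from llm_build_llama: the 7B variant keeps RoPE, while the 13B variant skips rotation and instead biases the attention scores with ALiBi (the ggml_alibi call above, max_bias 8) before the causal mask. A standalone sketch of that bias, using the power-of-two-head-count form of the ALiBi slope formula and an illustrative head count:

// Sketch only: the linear bias ALiBi adds to each attention score in place of
// positional rotation. The head count is illustrative; ggml_alibi uses an
// extra interpolated slope sequence when n_head is not a power of two.
#include <cmath>
#include <cstdio>

int main() {
    const int   n_head   = 32;    // illustrative
    const float max_bias = 8.0f;  // matches the ggml_alibi call above

    for (int h = 1; h <= 4; ++h) {
        // per-head slope m_h = 2^(-max_bias * h / n_head)
        const float m = powf(2.0f, -max_bias * h / n_head);
        // a query at position i attending to a key at position j (j <= i)
        // has its score shifted by -m * (i - j): distant keys are penalized more
        printf("head %2d: slope = %.6f, bias at distance 16 = %.4f\n", h, m, -m * 16.0f);
    }
    return 0;
}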
@@ -2812,6 +3314,10 @@ static struct ggml_cgraph * llama_build_graph(
  {
  result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
  } break;
+ case LLM_ARCH_BAICHUAN:
+ {
+ result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
+ } break;
  case LLM_ARCH_FALCON:
  {
  result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
@@ -2895,7 +3401,12 @@ static bool llama_eval_internal(
 
  // for big prompts, if BLAS is enabled, it is better to use only one thread
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
- n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+ // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
+ // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
+ // with the BLAS calls. need a better solution
+ if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+ n_threads = std::min(4, n_threads);
+ }
 
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
  struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
@@ -3000,33 +3511,10 @@ static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
3000
3511
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
3001
3512
  }
3002
3513
 
3003
- static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
3004
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
3005
- }
3006
-
3007
- static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
3008
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
3009
- }
3010
-
3011
3514
  static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
3012
3515
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
3013
3516
  }
3014
3517
 
3015
- static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
3016
- GGML_ASSERT(llama_is_control_token(vocab, id));
3017
- return id == vocab.special_bos_id;
3018
- }
3019
-
3020
- static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
3021
- GGML_ASSERT(llama_is_control_token(vocab, id));
3022
- return id == vocab.special_eos_id;
3023
- }
3024
-
3025
- static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
3026
- GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
3027
- return id == vocab.special_pad_id;
3028
- }
3029
-
3030
3518
  static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
3031
3519
  GGML_ASSERT(llama_is_byte_token(vocab, id));
3032
3520
  const auto& token_data = vocab.id_to_token.at(id);
@@ -3087,10 +3575,9 @@ struct llm_tokenizer_spm {
  while (offs < text.size()) {
  llm_symbol sym;
  size_t len = utf8_len(text[offs]);
- GGML_ASSERT(offs + len <= text.size());
  sym.text = text.c_str() + offs;
- sym.n = len;
- offs += len;
+ sym.n = std::min(len, text.size() - offs);
+ offs += sym.n;
  sym.prev = index - 1;
  sym.next = offs == text.size() ? -1 : index + 1;
  index++;
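
Note: this change makes the SPM tokenizer tolerant of input that ends in the middle of a multi-byte UTF-8 sequence: instead of asserting, the symbol length is clamped to what is actually left in the buffer. A small sketch of the behaviour, with a lookup table in the style of llama.cpp's utf8_len:

// Sketch of the tokenizer fix above: instead of asserting that a UTF-8 lead
// byte's declared length fits in the buffer, clamp it so a truncated trailing
// sequence becomes a short symbol rather than an out-of-bounds read.
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>

// length implied by the lead byte's high bits
static size_t utf8_len(char src) {
    const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    return lookup[static_cast<uint8_t>(src) >> 4];
}

int main() {
    std::string text = "abc\xE2\x82";              // U+20AC truncated: lead byte promises 3 bytes
    size_t offs = 3;
    size_t len  = utf8_len(text[offs]);             // 3
    size_t n    = std::min(len, text.size() - offs); // clamped to 2, no overrun
    printf("declared len = %zu, clamped len = %zu\n", len, n);
    return 0;
}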
@@ -3319,9 +3806,15 @@ struct llm_tokenizer_bpe {
  std::string byte_str(1, *j);
  auto token_multibyte = vocab.token_to_id.find(byte_str);
  if (token_multibyte == vocab.token_to_id.end()) {
- fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+ try {
+ llama_token token_byte = llama_byte_to_token(vocab, *j);
+ output.push_back(token_byte);
+ } catch (const std::out_of_range & err) {
+ fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+ }
+ } else {
+ output.push_back((*token_multibyte).second);
  }
- output.push_back((*token_multibyte).second);
  }
  } else {
  output.push_back((*token).second);
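
Note: when a raw byte has no direct entry in the BPE vocabulary, the tokenizer now falls back to llama_byte_to_token before giving up. A hedged sketch of that fallback; the "<0xXX>" byte-token naming follows the SentencePiece-style convention llama.cpp uses elsewhere, and the tiny vocab here is of course hypothetical:

// Sketch only: byte fallback via a "<0xXX>" byte token, throwing
// std::out_of_range when even the byte token is missing (hence the
// try/catch added in the diff above).
#include <cstdint>
#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

using llama_token = int;

static llama_token byte_to_token(const std::map<std::string, llama_token> & token_to_id, uint8_t ch) {
    char buf[8];
    snprintf(buf, sizeof(buf), "<0x%02X>", (unsigned) ch);
    return token_to_id.at(buf);  // throws std::out_of_range if absent
}

int main() {
    std::map<std::string, llama_token> vocab = { { "<0xE2>", 421 } };  // hypothetical ids
    try {
        printf("0xE2 -> %d\n", byte_to_token(vocab, 0xE2));
        printf("0xFF -> %d\n", byte_to_token(vocab, 0xFF));
    } catch (const std::out_of_range &) {
        fprintf(stderr, "ERROR: byte not found in vocab\n");
    }
    return 0;
}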
@@ -3595,7 +4088,7 @@ static void llama_grammar_advance_stack(
3595
4088
  std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
3596
4089
 
3597
4090
  if (stack.empty()) {
3598
- new_stacks.push_back(stack);
4091
+ new_stacks.emplace_back(stack);
3599
4092
  return;
3600
4093
  }
3601
4094
 
@@ -3632,7 +4125,7 @@ static void llama_grammar_advance_stack(
3632
4125
  }
3633
4126
  case LLAMA_GRETYPE_CHAR:
3634
4127
  case LLAMA_GRETYPE_CHAR_NOT:
3635
- new_stacks.push_back(stack);
4128
+ new_stacks.emplace_back(stack);
3636
4129
  break;
3637
4130
  default:
3638
4131
  // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -3797,6 +4290,25 @@ void llama_grammar_free(struct llama_grammar * grammar) {
  delete grammar;
  }
 
+ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
+ llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
+
+ // redirect elements in stacks to point to new rules
+ for (size_t is = 0; is < result->stacks.size(); is++) {
+ for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
+ for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
+ for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
+ if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
+ result->stacks[is][ie] = &result->rules[ir0][ir1];
+ }
+ }
+ }
+ }
+ }
+
+ return result;
+ }
+
  //
  // sampling
  //
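
Note: llama_grammar_copy has to do more than copy the two vectors, because each stack element is a raw pointer into the original grammar's rules; after copying, every pointer is rebased to the element at the same (rule, position) index in the new storage. The same idea on a simplified structure:

// Sketch of the pointer-rebasing problem llama_grammar_copy solves: the stack
// stores raw pointers into `rules`, so a member-wise copy still points at the
// ORIGINAL object's storage until each pointer is rebased by index.
#include <cstdio>
#include <vector>

struct grammar_like {
    std::vector<std::vector<int>> rules;
    std::vector<const int *>      stack;  // pointers into `rules`
};

static grammar_like copy_and_rebase(const grammar_like & src) {
    grammar_like dst{ src.rules, src.stack };
    for (auto & p : dst.stack) {
        for (size_t r = 0; r < src.rules.size(); ++r) {
            for (size_t e = 0; e < src.rules[r].size(); ++e) {
                if (p == &src.rules[r][e]) {
                    p = &dst.rules[r][e];  // same (rule, element) index, new storage
                }
            }
        }
    }
    return dst;
}

int main() {
    grammar_like g;
    g.rules = { { 10, 20 }, { 30 } };
    g.stack = { &g.rules[0][1], &g.rules[1][0] };

    grammar_like c = copy_and_rebase(g);
    printf("copied stack dereferences: %d %d\n", *c.stack[0], *c.stack[1]);  // 20 30
    return 0;
}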
@@ -4388,7 +4900,7 @@ struct llama_logit_info {
4388
4900
  }
4389
4901
  return min_heap;
4390
4902
  }
4391
- float probability_from_logit(float logit) {
4903
+ float probability_from_logit(float logit) const {
4392
4904
  return normalizer * std::exp(logit - max_l);
4393
4905
  }
4394
4906
  };
@@ -4581,7 +5093,16 @@ void llama_beam_search(llama_context * ctx,
  // quantization
  //
 
- static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
+ template <typename T>
+ struct no_init {
+ T value;
+ no_init() { /* do nothing */ }
+ };
+
+ static void llama_convert_tensor_internal(
+ struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
+ const size_t nelements, const int nthread
+ ) {
  if (output.size() < nelements) {
  output.resize(nelements);
  }
@@ -4616,7 +5137,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
  auto blocks_per_thread = nblocks / nthread;
  auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
 
- std::vector<std::thread> workers;
  for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
  auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
  auto thr_elems = thr_blocks * block_size; // number of elements for this thread
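
Note: two small allocation optimizations run through the quantization path in this release: the worker thread pool is now passed in and reused across tensors, and scratch buffers are declared as std::vector<no_init<T>> so that resize() no longer zero-fills memory that is about to be overwritten anyway. A minimal sketch of the no_init trick:

// Minimal sketch of the no_init<T> trick: because the default constructor is
// user-provided (and empty), value-initialization of each element no longer
// zero-fills it, so resizing a large scratch vector skips the memset.
#include <chrono>
#include <cstdint>
#include <cstdio>
#include <vector>

template <typename T>
struct no_init {
    T value;
    no_init() { /* intentionally empty: leaves `value` uninitialized */ }
};

template <typename V>
static double time_resize(size_t n) {
    const auto t0 = std::chrono::steady_clock::now();
    V v;
    v.resize(n);
    const auto t1 = std::chrono::steady_clock::now();
    return std::chrono::duration<double, std::milli>(t1 - t0).count();
}

int main() {
    const size_t n = 256u << 20;  // 256M elements
    printf("resize<uint8_t>:          %.1f ms (zero-fills)\n", time_resize<std::vector<uint8_t>>(n));
    printf("resize<no_init<uint8_t>>: %.1f ms (no fill)\n",    time_resize<std::vector<no_init<uint8_t>>>(n));
    return 0;
}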
@@ -4629,14 +5149,123 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
4629
5149
  qtype.to_float(inbuf, outbuf, nels);
4630
5150
  }
4631
5151
  };
4632
- workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
5152
+ workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
4633
5153
  in_buff_offs += thr_block_bytes;
4634
5154
  out_buff_offs += thr_elems;
4635
5155
  }
4636
- for (auto & worker : workers) {
4637
- worker.join();
5156
+ for (auto & w : workers) { w.join(); }
5157
+ workers.clear();
5158
+ }
5159
+
5160
+ #ifdef GGML_USE_K_QUANTS
5161
+ static ggml_type get_k_quant_type(
5162
+ ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
5163
+ int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
5164
+ ) {
5165
+ const std::string name = ggml_get_name(tensor);
5166
+ // TODO: avoid hardcoded tensor names - use the TN_* constants
5167
+ const auto tn = LLM_TN(model.arch);
5168
+
5169
+ auto use_more_bits = [](int i_layer, int num_layers) -> bool {
5170
+ return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
5171
+ };
5172
+
5173
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
5174
+ int nx = tensor->ne[0];
5175
+ if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
5176
+ new_type = GGML_TYPE_Q8_0;
5177
+ }
5178
+ else if (new_type != GGML_TYPE_Q8_0) {
5179
+ new_type = GGML_TYPE_Q6_K;
5180
+ }
5181
+ } else if (name.find("attn_v.weight") != std::string::npos) {
5182
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
5183
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
5184
+ new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
5185
+ }
5186
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
5187
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
5188
+ use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
5189
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
5190
+ else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
5191
+ (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
5192
+ if (model.type == MODEL_70B) {
5193
+ // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
5194
+ // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
5195
+ // nearly negligible increase in model size by quantizing this tensor with more bits:
5196
+ if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
5197
+ }
5198
+ ++*i_attention_wv;
5199
+ } else if (name.find("ffn_down.weight") != std::string::npos) {
5200
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
5201
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
5202
+ new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
5203
+ : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
5204
+ : GGML_TYPE_Q3_K;
5205
+ }
5206
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
5207
+ new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
5208
+ }
5209
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
5210
+ if (model.arch == LLM_ARCH_FALCON) {
5211
+ new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
5212
+ use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
5213
+ } else {
5214
+ if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
5215
+ }
5216
+ }
5217
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
5218
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
5219
+ new_type = GGML_TYPE_Q5_K;
5220
+ }
5221
+ ++*i_feed_forward_w2;
5222
+ } else if (name.find("attn_output.weight") != std::string::npos) {
5223
+ if (model.arch != LLM_ARCH_FALCON) {
5224
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
5225
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
5226
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
5227
+ } else {
5228
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
5229
+ }
5230
+ }
5231
+ else if (name.find("attn_qkv.weight") != std::string::npos) {
5232
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
5233
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
5234
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
5235
+ }
5236
+ else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
5237
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
5238
+ }
5239
+ // This can be used to reduce the size of the Q5_K_S model.
5240
+ // The associated PPL increase is fully in line with the size reduction
5241
+ //else {
5242
+ // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
5243
+ //}
5244
+ bool convert_incompatible_tensor = false;
5245
+ if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
5246
+ new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
5247
+ int nx = tensor->ne[0];
5248
+ int ny = tensor->ne[1];
5249
+ if (nx % QK_K != 0) {
5250
+ LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
5251
+ convert_incompatible_tensor = true;
5252
+ }
5253
+ }
5254
+ if (convert_incompatible_tensor) {
5255
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
5256
+ new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
5257
+ LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
5258
+ } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
5259
+ new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
5260
+ LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
5261
+ } else {
5262
+ throw std::runtime_error("Unsupported tensor size encountered\n");
5263
+ }
4638
5264
  }
5265
+
5266
+ return new_type;
4639
5267
  }
5268
+ #endif
4640
5269
 
4641
5270
  static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
4642
5271
  ggml_type quantized_type;
@@ -4678,6 +5307,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4678
5307
  llm_load_arch(*ml, model);
4679
5308
  llm_load_hparams(*ml, model, 0, 0, 0);
4680
5309
 
5310
+ if (params->only_copy) {
5311
+ ftype = model.ftype;
5312
+ }
5313
+
4681
5314
  const size_t align = GGUF_DEFAULT_ALIGNMENT;
4682
5315
  struct gguf_context * ctx_out = gguf_init_empty();
4683
5316
 
@@ -4717,16 +5350,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4717
5350
  std::vector<int64_t> hist_all(1 << 4, 0);
4718
5351
 
4719
5352
  std::vector<std::thread> workers;
5353
+ workers.reserve(nthread);
4720
5354
  std::mutex mutex;
4721
5355
 
4722
- auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
4723
- return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
4724
- };
4725
-
4726
5356
  int idx = 0;
4727
5357
 
4728
- std::vector<uint8_t> read_data;
4729
- std::vector<uint8_t> work;
5358
+ std::vector<no_init<uint8_t>> read_data;
5359
+ std::vector<no_init<uint8_t>> work;
5360
+ std::vector<no_init<float>> f32_conv_buf;
4730
5361
 
4731
5362
  // populate the original tensors so we get an initial meta data
4732
5363
  for (int i = 0; i < ml->n_tensors; ++i) {
@@ -4748,7 +5379,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4748
5379
 
4749
5380
  const std::string name = ggml_get_name(tensor);
4750
5381
 
4751
- read_data.resize(ggml_nbytes(tensor));
5382
+ if (read_data.size() < ggml_nbytes(tensor)) {
5383
+ read_data.resize(ggml_nbytes(tensor));
5384
+ }
4752
5385
  tensor->data = read_data.data();
4753
5386
  ml->load_data_for(tensor);
4754
5387
 
@@ -4764,137 +5397,50 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4764
5397
  // quantize only 2D tensors
4765
5398
  quantize &= (tensor->n_dims == 2);
4766
5399
  quantize &= params->quantize_output_tensor || name != "output.weight";
4767
- quantize &= quantized_type != tensor->type;
5400
+ quantize &= !params->only_copy;
4768
5401
 
4769
5402
  enum ggml_type new_type;
4770
5403
  void * new_data;
4771
5404
  size_t new_size;
4772
5405
 
5406
+ if (quantize) {
5407
+ new_type = quantized_type;
5408
+ #ifdef GGML_USE_K_QUANTS
5409
+ new_type = get_k_quant_type(
5410
+ new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
5411
+ );
5412
+ #endif
5413
+ // If we've decided to quantize to the same type the tensor is already
5414
+ // in then there's nothing to do.
5415
+ quantize = tensor->type != new_type;
5416
+ }
4773
5417
  if (!quantize) {
4774
5418
  new_type = tensor->type;
4775
5419
  new_data = tensor->data;
4776
5420
  new_size = ggml_nbytes(tensor);
4777
5421
  LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
4778
5422
  } else {
4779
- new_type = quantized_type;
4780
- #ifdef GGML_USE_K_QUANTS
4781
- // TODO: avoid hardcoded tensor names - use the TN_* constants
4782
- const auto tn = LLM_TN(ml->get_arch());
4783
-
4784
- if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
4785
- int nx = tensor->ne[0];
4786
- if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
4787
- new_type = GGML_TYPE_Q8_0;
4788
- }
4789
- else if (new_type != GGML_TYPE_Q8_0) {
4790
- new_type = GGML_TYPE_Q6_K;
4791
- }
4792
- } else if (name.find("attn_v.weight") != std::string::npos) {
4793
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
4794
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
4795
- new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
4796
- }
4797
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
4798
- else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
4799
- use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
4800
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
4801
- else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
4802
- (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
4803
- if (model.type == MODEL_70B) {
4804
- // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
4805
- // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
4806
- // nearly negligible increase in model size by quantizing this tensor with more bits:
4807
- if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
4808
- }
4809
- ++i_attention_wv;
4810
- } else if (name.find("ffn_down.weight") != std::string::npos) {
4811
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
4812
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
4813
- new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
4814
- : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
4815
- : GGML_TYPE_Q3_K;
4816
- }
4817
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
4818
- new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
4819
- }
4820
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
4821
- if (model.arch == LLM_ARCH_FALCON) {
4822
- new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
4823
- use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
4824
- } else {
4825
- if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
4826
- }
4827
- }
4828
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
4829
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
4830
- new_type = GGML_TYPE_Q5_K;
4831
- }
4832
- ++i_feed_forward_w2;
4833
- } else if (name.find("attn_output.weight") != std::string::npos) {
4834
- if (model.arch != LLM_ARCH_FALCON) {
4835
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
4836
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
4837
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
4838
- } else {
4839
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
4840
- }
4841
- }
4842
- else if (name.find("attn_qkv.weight") != std::string::npos) {
4843
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
4844
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
4845
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
4846
- }
4847
- else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
4848
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
4849
- }
4850
- // This can be used to reduce the size of the Q5_K_S model.
4851
- // The associated PPL increase is fully in line with the size reduction
4852
- //else {
4853
- // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
4854
- //}
4855
- bool convert_incompatible_tensor = false;
4856
- if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
4857
- new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
4858
- int nx = tensor->ne[0];
4859
- int ny = tensor->ne[1];
4860
- if (nx % QK_K != 0) {
4861
- LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
4862
- convert_incompatible_tensor = true;
4863
- }
4864
- }
4865
- if (convert_incompatible_tensor) {
4866
- if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
4867
- new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
4868
- LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
4869
- } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
4870
- new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
4871
- LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
4872
- } else {
4873
- throw std::runtime_error("Unsupported tensor size encountered\n");
4874
- }
4875
- }
4876
- #endif
4877
-
4878
5423
  const size_t nelements = ggml_nelements(tensor);
4879
5424
 
4880
5425
  float * f32_data;
4881
- std::vector<float> f32_conv_buf;
4882
5426
 
4883
5427
  if (tensor->type == GGML_TYPE_F32) {
4884
5428
  f32_data = (float *) tensor->data;
4885
5429
  } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
4886
5430
  throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
4887
5431
  } else {
4888
- llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
5432
+ llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
4889
5433
  f32_data = (float *) f32_conv_buf.data();
4890
5434
  }
4891
5435
 
4892
5436
  LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
4893
5437
  fflush(stdout);
4894
5438
 
4895
- work.resize(nelements * 4); // upper bound on size
5439
+ if (work.size() < nelements * 4) {
5440
+ work.resize(nelements * 4); // upper bound on size
5441
+ }
4896
5442
  new_data = work.data();
4897
- std::vector<int64_t> hist_cur(1 << 4, 0);
5443
+ std::array<int64_t, 1 << 4> hist_cur = {};
4898
5444
 
4899
5445
  static const int chunk_size = 32 * 512;
4900
5446
  const int nchunk = (nelements + chunk_size - 1)/chunk_size;
@@ -4905,13 +5451,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4905
5451
  size_t counter = 0;
4906
5452
  new_size = 0;
4907
5453
  auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
4908
- std::vector<int64_t> local_hist;
5454
+ std::array<int64_t, 1 << 4> local_hist = {};
4909
5455
  size_t local_size = 0;
4910
5456
  while (true) {
4911
5457
  std::unique_lock<std::mutex> lock(mutex);
4912
5458
  size_t first = counter; counter += chunk_size;
4913
5459
  if (first >= nelements) {
4914
- if (!local_hist.empty()) {
5460
+ if (local_size > 0) {
4915
5461
  for (int j=0; j<int(local_hist.size()); ++j) {
4916
5462
  hist_cur[j] += local_hist[j];
4917
5463
  }
@@ -4921,22 +5467,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4921
5467
  }
4922
5468
  lock.unlock();
4923
5469
  size_t last = std::min(nelements, first + chunk_size);
4924
- if (local_hist.empty()) {
4925
- local_hist.resize(hist_cur.size(), 0);
4926
- }
4927
5470
  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
4928
5471
  }
4929
5472
  };
4930
- if ((int) workers.size() < nthread_use - 1) {
4931
- workers.resize(nthread_use - 1);
4932
- }
4933
5473
  for (int it = 0; it < nthread_use - 1; ++it) {
4934
- workers[it] = std::thread(compute);
5474
+ workers.emplace_back(compute);
4935
5475
  }
4936
5476
  compute();
4937
- for (int it = 0; it < nthread_use - 1; ++it) {
4938
- workers[it].join();
4939
- }
5477
+ for (auto & w : workers) { w.join(); }
5478
+ workers.clear();
4940
5479
  }
4941
5480
 
4942
5481
  LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -5279,7 +5818,7 @@ struct llama_context_params llama_context_default_params() {
  /*.seed =*/ LLAMA_DEFAULT_SEED,
  /*.n_ctx =*/ 512,
  /*.n_batch =*/ 512,
- /*.gpu_layers =*/ 0,
+ /*.n_gpu_layers =*/ 0,
  /*.main_gpu =*/ 0,
  /*.tensor_split =*/ nullptr,
  /*.rope_freq_base =*/ 10000.0f,
@@ -5296,6 +5835,10 @@ struct llama_context_params llama_context_default_params() {
  /*.embedding =*/ false,
  };
 
+ #ifdef GGML_USE_METAL
+ result.n_gpu_layers = 1;
+ #endif
+
  return result;
  }
 
@@ -5305,6 +5848,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
  /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
  /*.allow_requantize =*/ false,
  /*.quantize_output_tensor =*/ true,
+ /*.only_copy =*/ false,
  };
 
  return result;
5487
6031
  }
5488
6032
  #endif
5489
6033
  }
5490
- }
5491
6034
 
5492
6035
  #ifdef GGML_USE_METAL
5493
- if (params.n_gpu_layers > 0) {
5494
- // this allocates all Metal resources and memory buffers
6036
+ if (params.n_gpu_layers > 0) {
6037
+ // this allocates all Metal resources and memory buffers
5495
6038
 
5496
- void * data_ptr = NULL;
5497
- size_t data_size = 0;
6039
+ void * data_ptr = NULL;
6040
+ size_t data_size = 0;
5498
6041
 
5499
- if (params.use_mmap) {
5500
- data_ptr = ctx->model.mapping->addr;
5501
- data_size = ctx->model.mapping->size;
5502
- } else {
5503
- data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
5504
- data_size = ggml_get_mem_size (ctx->model.ctx);
5505
- }
6042
+ if (params.use_mmap) {
6043
+ data_ptr = ctx->model.mapping->addr;
6044
+ data_size = ctx->model.mapping->size;
6045
+ } else {
6046
+ data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
6047
+ data_size = ggml_get_mem_size (ctx->model.ctx);
6048
+ }
5506
6049
 
5507
- const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
6050
+ const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
5508
6051
 
5509
- LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
6052
+ LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
5510
6053
 
5511
6054
  #define LLAMA_METAL_CHECK_BUF(result) \
5512
- if (!(result)) { \
5513
- LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
5514
- llama_free(ctx); \
5515
- return NULL; \
5516
- }
6055
+ if (!(result)) { \
6056
+ LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
6057
+ llama_free(ctx); \
6058
+ return NULL; \
6059
+ }
5517
6060
 
5518
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
6061
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
5519
6062
 
5520
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
5521
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
6063
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
6064
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
5522
6065
 
5523
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
6066
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
5524
6067
  #undef LLAMA_METAL_CHECK_BUF
5525
- }
6068
+ }
5526
6069
  #endif
6070
+ }
5527
6071
 
5528
6072
  #ifdef GGML_USE_MPI
5529
6073
  ctx->ctx_mpi = ggml_mpi_init();
@@ -5559,15 +6103,19 @@ void llama_free(struct llama_context * ctx) {
5559
6103
  }
5560
6104
 
5561
6105
  int llama_n_vocab(const struct llama_context * ctx) {
5562
- return ctx->model.vocab.id_to_token.size();
6106
+ return llama_model_n_vocab(&ctx->model);
5563
6107
  }
5564
6108
 
5565
6109
  int llama_n_ctx(const struct llama_context * ctx) {
5566
- return ctx->model.hparams.n_ctx;
6110
+ return llama_model_n_ctx(&ctx->model);
6111
+ }
6112
+
6113
+ int llama_n_ctx_train(const struct llama_context * ctx) {
6114
+ return llama_model_n_ctx_train(&ctx->model);
5567
6115
  }
5568
6116
 
5569
6117
  int llama_n_embd(const struct llama_context * ctx) {
5570
- return ctx->model.hparams.n_embd;
6118
+ return llama_model_n_embd(&ctx->model);
5571
6119
  }
5572
6120
 
5573
6121
  enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
@@ -5582,6 +6130,10 @@ int llama_model_n_ctx(const struct llama_model * model) {
5582
6130
  return model->hparams.n_ctx;
5583
6131
  }
5584
6132
 
6133
+ int llama_model_n_ctx_train(const struct llama_model * model) {
6134
+ return model->hparams.n_ctx_train;
6135
+ }
6136
+
5585
6137
  int llama_model_n_embd(const struct llama_model * model) {
5586
6138
  return model->hparams.n_embd;
5587
6139
  }
@@ -5857,7 +6409,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
5857
6409
  rng_ss.str(std::string(&rng_buf[0], rng_size));
5858
6410
  rng_ss >> ctx->rng;
5859
6411
 
5860
- GGML_ASSERT(rng_ss.fail() == false);
6412
+ GGML_ASSERT(!rng_ss.fail());
5861
6413
  }
5862
6414
 
5863
6415
  // set logits
@@ -6136,7 +6688,7 @@ int llama_tokenize_with_model(
6136
6688
  auto res = llama_tokenize_internal(model->vocab, text, add_bos);
6137
6689
 
6138
6690
  if (n_max_tokens < (int) res.size()) {
6139
- LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
6691
+ // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
6140
6692
  return -((int) res.size());
6141
6693
  }
6142
6694