llama_cpp 0.5.0 → 0.5.2

This diff shows the content of publicly available package versions as released to their public registry; the hunks below cover the llama.cpp source shipped with the package. It is provided for informational purposes only.
@@ -1,8 +1,3 @@
1
- // Defines fileno on msys:
2
- #ifndef _GNU_SOURCE
3
- #define _GNU_SOURCE
4
- #endif
5
-
6
1
  #include "llama.h"
7
2
 
8
3
  #include "ggml.h"
@@ -126,6 +121,9 @@ void replace_all(std::string & s, const std::string & search, const std::string
126
121
  }
127
122
  s = std::move(result);
128
123
  }
124
+ #ifdef GGML_USE_CPU_HBM
125
+ #include <hbwmalloc.h>
126
+ #endif
129
127
 
130
128
  static void zeros(std::ofstream & file, size_t n) {
131
129
  char zero = 0;
@@ -157,6 +155,7 @@ static std::string format(const char * fmt, ...) {
157
155
  enum llm_arch {
158
156
  LLM_ARCH_LLAMA,
159
157
  LLM_ARCH_FALCON,
158
+ LLM_ARCH_BAICHUAN,
160
159
  LLM_ARCH_GPT2,
161
160
  LLM_ARCH_GPTJ,
162
161
  LLM_ARCH_GPTNEOX,
@@ -171,6 +170,7 @@ static std::map<llm_arch, std::string> LLM_ARCH_NAMES = {
171
170
  { LLM_ARCH_GPTJ, "gptj" },
172
171
  { LLM_ARCH_GPTNEOX, "gptneox" },
173
172
  { LLM_ARCH_MPT, "mpt" },
173
+ { LLM_ARCH_BAICHUAN,"baichuan" },
174
174
  };
175
175
 
176
176
  enum llm_kv {
@@ -311,6 +311,25 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
311
311
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
312
312
  },
313
313
  },
314
+ {
315
+ LLM_ARCH_BAICHUAN,
316
+ {
317
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
318
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
319
+ { LLM_TENSOR_OUTPUT, "output" },
320
+ { LLM_TENSOR_ROPE_FREQS, "rope_freqs" },
321
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
322
+ { LLM_TENSOR_ATTN_Q, "blk.%d.attn_q" },
323
+ { LLM_TENSOR_ATTN_K, "blk.%d.attn_k" },
324
+ { LLM_TENSOR_ATTN_V, "blk.%d.attn_v" },
325
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
326
+ { LLM_TENSOR_ATTN_ROT_EMBD, "blk.%d.attn_rot_embd" },
327
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
328
+ { LLM_TENSOR_FFN_GATE, "blk.%d.ffn_gate" },
329
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
330
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
331
+ },
332
+ },
314
333
  {
315
334
  LLM_ARCH_FALCON,
316
335
  {
@@ -325,6 +344,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
325
344
  { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
326
345
  },
327
346
  },
347
+ {
348
+ LLM_ARCH_GPT2,
349
+ {
350
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
351
+ },
352
+ },
353
+ {
354
+ LLM_ARCH_GPTJ,
355
+ {
356
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
357
+ },
358
+ },
359
+ {
360
+ LLM_ARCH_GPTNEOX,
361
+ {
362
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
363
+ { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
364
+ { LLM_TENSOR_OUTPUT, "output" },
365
+ { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
366
+ { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
367
+ { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
368
+ { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
369
+ { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
370
+ { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
371
+ },
372
+ },
373
+ {
374
+ LLM_ARCH_MPT,
375
+ {
376
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
377
+ },
378
+ },
379
+ {
380
+ LLM_ARCH_UNKNOWN,
381
+ {
382
+ { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
383
+ },
384
+ },
328
385
  };
329
386
 
330
387
  static llm_arch llm_arch_from_string(const std::string & name) {
@@ -412,6 +469,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
412
469
  #elif GGML_USE_METAL
413
470
  # define llama_host_malloc(n) ggml_metal_host_malloc(n)
414
471
  # define llama_host_free(data) ggml_metal_host_free(data)
472
+ #elif GGML_USE_CPU_HBM
473
+ # define llama_host_malloc(n) hbw_malloc(n)
474
+ # define llama_host_free(data) if (data != NULL) hbw_free(data)
415
475
  #else
416
476
  # define llama_host_malloc(n) malloc(n)
417
477
  # define llama_host_free(data) free(data)
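Together with the "#include <hbwmalloc.h>" hunk earlier in the file, the block above routes host buffers through libmemkind's high-bandwidth-memory allocator when GGML_USE_CPU_HBM is defined, and falls back to plain malloc/free otherwise. A minimal standalone sketch of the same dispatch (the host_malloc/host_free names are illustrative, not from the source):

// Sketch of the host-allocation dispatch, assuming the memkind/hbwmalloc
// library provides hbw_malloc/hbw_free as used in the hunk above.
#include <cstdio>
#include <cstdlib>

#ifdef GGML_USE_CPU_HBM
#include <hbwmalloc.h>            // high-bandwidth memory allocator (libmemkind)
#  define host_malloc(n) hbw_malloc(n)
#  define host_free(p)   do { if ((p) != NULL) hbw_free(p); } while (0)
#else
#  define host_malloc(n) malloc(n)
#  define host_free(p)   free(p)
#endif

int main() {
    void * buf = host_malloc(1 << 20);   // 1 MiB host buffer
    if (!buf) {
        fprintf(stderr, "host allocation failed\n");
        return 1;
    }
    // ... use the buffer for tensor data ...
    host_free(buf);
    return 0;
}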
@@ -568,16 +628,16 @@ struct llama_mmap {
568
628
 
569
629
  if (prefetch > 0) {
570
630
  // Advise the kernel to preload the mapped memory
571
- if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
572
- fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
631
+ if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
632
+ fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
573
633
  strerror(errno));
574
634
  }
575
635
  }
576
636
  if (numa) {
577
637
  // advise the kernel not to use readahead
578
638
  // (because the next page might not belong on the same node)
579
- if (madvise(addr, file->size, MADV_RANDOM)) {
580
- fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
639
+ if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
640
+ fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
581
641
  strerror(errno));
582
642
  }
583
643
  }
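The madvise() to posix_madvise() change above swaps a BSD/Linux extension for the POSIX-standard interface, which also exists on platforms where MADV_WILLNEED/MADV_RANDOM are not defined. A small self-contained sketch of the same prefetch advice on an arbitrary readable file, with minimal error handling; note that posix_madvise() reports failure through its return value rather than errno:

#include <cstdio>
#include <cstring>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <unistd.h>

int main(int argc, char ** argv) {
    const char * path = argc > 1 ? argv[1] : argv[0];  // any readable file
    int fd = open(path, O_RDONLY);
    if (fd < 0) { perror("open"); return 1; }

    struct stat st{};
    if (fstat(fd, &st) != 0 || st.st_size == 0) { close(fd); return 1; }

    void * addr = mmap(NULL, st.st_size, PROT_READ, MAP_SHARED, fd, 0);
    if (addr == MAP_FAILED) { perror("mmap"); close(fd); return 1; }

    // advise the kernel to start reading the mapping in (prefetch)
    int err = posix_madvise(addr, st.st_size, POSIX_MADV_WILLNEED);
    if (err != 0) {
        fprintf(stderr, "warning: posix_madvise failed: %s\n", strerror(err));
    }

    // ... read the mapping ...
    munmap(addr, st.st_size);
    close(fd);
    return 0;
}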
@@ -620,7 +680,6 @@ struct llama_mmap {
620
680
  if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
621
681
  fprintf(stderr, "warning: PrefetchVirtualMemory failed: %s\n",
622
682
  llama_format_win_err(GetLastError()).c_str());
623
- }
624
683
  }
625
684
  #else
626
685
  #pragma message("warning: You are building for pre-Windows 8; prefetch not supported")
@@ -1446,7 +1505,11 @@ struct llama_model_loader {
1446
1505
  // allocate temp buffer if not using mmap
1447
1506
  if (!use_mmap && cur->data == NULL) {
1448
1507
  GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
1449
- cur->data = malloc(ggml_nbytes(cur));
1508
+ #ifdef GGML_USE_CPU_HBM
1509
+ cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
1510
+ #else
1511
+ cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
1512
+ #endif
1450
1513
  }
1451
1514
 
1452
1515
  load_data_for(cur);
@@ -1600,9 +1663,13 @@ static void llm_load_hparams(
1600
1663
 
1601
1664
  GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
1602
1665
 
1603
- if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
1604
- throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
1666
+ if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
1667
+ if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
1668
+ throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
1669
+ }
1605
1670
  }
1671
+ // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
1672
+ // gpt-j n_rot = rotary_dim
1606
1673
  }
1607
1674
 
1608
1675
  // arch-specific KVs
@@ -1631,6 +1698,15 @@ static void llm_load_hparams(
1631
1698
  default: model.type = e_model::MODEL_UNKNOWN;
1632
1699
  }
1633
1700
  } break;
1701
+ case LLM_ARCH_BAICHUAN:
1702
+ {
1703
+ GGUF_GET_KEY(ctx, hparams.f_norm_rms_eps, gguf_get_val_f32, GGUF_TYPE_FLOAT32, true, kv(LLM_KV_ATTENTION_LAYERNORM_RMS_EPS));
1704
+ switch (hparams.n_layer) {
1705
+ case 32: model.type = e_model::MODEL_7B; break;
1706
+ case 40: model.type = e_model::MODEL_13B; break;
1707
+ default: model.type = e_model::MODEL_UNKNOWN;
1708
+ }
1709
+ } break;
1634
1710
  default: (void)0;
1635
1711
  };
1636
1712
 
@@ -1871,7 +1947,6 @@ static void llm_load_tensors(
1871
1947
  const int64_t n_vocab = hparams.n_vocab;
1872
1948
 
1873
1949
  const auto tn = LLM_TN(model.arch);
1874
-
1875
1950
  switch (model.arch) {
1876
1951
  case LLM_ARCH_LLAMA:
1877
1952
  {
@@ -1914,6 +1989,72 @@ static void llm_load_tensors(
1914
1989
 
1915
1990
  model.layers.resize(n_layer);
1916
1991
 
1992
+ for (uint32_t i = 0; i < n_layer; ++i) {
1993
+ const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
1994
+ const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
1995
+
1996
+ auto & layer = model.layers[i];
1997
+
1998
+ layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
1999
+
2000
+ layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
2001
+ layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
2002
+ layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
2003
+ layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
2004
+
2005
+ layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
2006
+
2007
+ layer.w1 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
2008
+ layer.w2 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
2009
+ layer.w3 = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
2010
+
2011
+ if (backend == GGML_BACKEND_GPU) {
2012
+ vram_weights +=
2013
+ ggml_nbytes(layer.attn_norm) + ggml_nbytes(layer.wq) + ggml_nbytes(layer.wk) +
2014
+ ggml_nbytes(layer.wv) + ggml_nbytes(layer.wo) + ggml_nbytes(layer.ffn_norm) +
2015
+ ggml_nbytes(layer.w1) + ggml_nbytes(layer.w2) + ggml_nbytes(layer.w3);
2016
+ }
2017
+ }
2018
+ } break;
2019
+ case LLM_ARCH_BAICHUAN:
2020
+ {
2021
+ model.tok_embeddings = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
2022
+ {
2023
+ ggml_backend backend_norm;
2024
+ ggml_backend backend_output;
2025
+
2026
+ if (n_gpu_layers > int(n_layer)) {
2027
+ // norm is not performance relevant on its own but keeping it in VRAM reduces data copying
2028
+ // on Windows however this is detrimental unless everything is on the GPU
2029
+ #ifndef _WIN32
2030
+ backend_norm = low_vram ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2031
+ #else
2032
+ backend_norm = low_vram || n_gpu_layers <= (int) n_layer + 2 ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD;
2033
+ #endif // _WIN32
2034
+
2035
+ backend_output = LLAMA_BACKEND_OFFLOAD_SPLIT;
2036
+ } else {
2037
+ backend_norm = GGML_BACKEND_CPU;
2038
+ backend_output = GGML_BACKEND_CPU;
2039
+ }
2040
+
2041
+ model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
2042
+ model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
2043
+
2044
+ if (backend_norm == GGML_BACKEND_GPU) {
2045
+ vram_weights += ggml_nbytes(model.output_norm);
2046
+ }
2047
+ if (backend_output == GGML_BACKEND_GPU_SPLIT) {
2048
+ vram_weights += ggml_nbytes(model.output);
2049
+ }
2050
+ }
2051
+
2052
+ const uint32_t n_ff = hparams.n_ff;
2053
+
2054
+ const int i_gpu_start = n_layer - n_gpu_layers;
2055
+
2056
+ model.layers.resize(n_layer);
2057
+
1917
2058
  for (uint32_t i = 0; i < n_layer; ++i) {
1918
2059
  const ggml_backend backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD; // NOLINT
1919
2060
  const ggml_backend backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : LLAMA_BACKEND_OFFLOAD_SPLIT; // NOLINT
@@ -2071,95 +2212,427 @@ static void llm_load_tensors(
2071
2212
  vram_kv_cache += hparams.kv_size() / 2;
2072
2213
  }
2073
2214
  }
2074
- #elif defined(GGML_USE_CLBLAST)
2075
- const int max_backend_supported_layers = hparams.n_layer + 1;
2076
- const int max_offloadable_layers = hparams.n_layer + 1;
2077
- #endif // GGML_USE_CUBLAS
2215
+ #elif defined(GGML_USE_CLBLAST)
2216
+ const int max_backend_supported_layers = hparams.n_layer + 1;
2217
+ const int max_offloadable_layers = hparams.n_layer + 1;
2218
+ #endif // GGML_USE_CUBLAS
2219
+
2220
+ LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
2221
+ __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2222
+ LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
2223
+ __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
2224
+ #else
2225
+ (void) n_gpu_layers;
2226
+ #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2227
+ }
2228
+
2229
+ // populate `tensors_by_name`
2230
+ for (int i = 0; i < ml.n_tensors; ++i) {
2231
+ struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
2232
+ model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
2233
+ }
2234
+
2235
+ (void) tensor_split;
2236
+ #if defined(GGML_USE_CUBLAS)
2237
+ {
2238
+ ggml_cuda_set_tensor_split(tensor_split);
2239
+ }
2240
+ #endif
2241
+
2242
+ ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
2243
+
2244
+ if (progress_callback) {
2245
+ progress_callback(1.0f, progress_callback_user_data);
2246
+ }
2247
+
2248
+ model.mapping = std::move(ml.mapping);
2249
+
2250
+ // loading time will be recalculate after the first eval, so
2251
+ // we take page faults deferred by mmap() into consideration
2252
+ model.t_load_us = ggml_time_us() - model.t_start_us;
2253
+ }
2254
+
2255
+ static bool llama_model_load(
2256
+ const std::string & fname,
2257
+ llama_model & model,
2258
+ int n_ctx,
2259
+ int n_batch,
2260
+ int n_gpu_layers,
2261
+ int main_gpu,
2262
+ const float * tensor_split,
2263
+ const bool mul_mat_q,
2264
+ float rope_freq_base,
2265
+ float rope_freq_scale,
2266
+ bool low_vram,
2267
+ ggml_type memory_type,
2268
+ bool use_mmap,
2269
+ bool use_mlock,
2270
+ bool vocab_only,
2271
+ llama_progress_callback progress_callback,
2272
+ void *progress_callback_user_data) {
2273
+ try {
2274
+ std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
2275
+
2276
+ llm_load_arch (*ml, model);
2277
+ llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
2278
+ llm_load_vocab (*ml, model);
2279
+
2280
+ llm_load_print_meta(*ml, model);
2281
+
2282
+ if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
2283
+ throw std::runtime_error("vocab size mismatch");
2284
+ }
2285
+
2286
+ if (vocab_only) {
2287
+ LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
2288
+ return true;
2289
+ }
2290
+
2291
+ llm_load_tensors(
2292
+ *ml, model, n_batch, n_gpu_layers,
2293
+ main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
2294
+ use_mlock, progress_callback, progress_callback_user_data);
2295
+ } catch (const std::exception & err) {
2296
+ LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
2297
+ return false;
2298
+ }
2299
+
2300
+ return true;
2301
+ }
2302
+
2303
+ static struct ggml_cgraph * llm_build_llama(
2304
+ llama_context & lctx,
2305
+ const llama_token * tokens,
2306
+ const float * embd,
2307
+ int n_tokens,
2308
+ int n_past) {
2309
+
2310
+ GGML_ASSERT((!tokens && embd) || (tokens && !embd)); // NOLINT
2311
+
2312
+ const int N = n_tokens;
2313
+
2314
+ const auto & model = lctx.model;
2315
+ const auto & hparams = model.hparams;
2316
+
2317
+ const auto & kv_self = lctx.kv_self;
2318
+
2319
+ GGML_ASSERT(!!kv_self.ctx);
2320
+
2321
+ const int64_t n_embd = hparams.n_embd;
2322
+ const int64_t n_layer = hparams.n_layer;
2323
+ const int64_t n_ctx = hparams.n_ctx;
2324
+ const int64_t n_head = hparams.n_head;
2325
+ const int64_t n_head_kv = hparams.n_head_kv;
2326
+ const int64_t n_embd_head = hparams.n_embd_head();
2327
+ const int64_t n_embd_gqa = hparams.n_embd_gqa();
2328
+
2329
+ GGML_ASSERT(n_embd_head == hparams.n_rot);
2330
+
2331
+ const float freq_base = hparams.rope_freq_base;
2332
+ const float freq_scale = hparams.rope_freq_scale;
2333
+ const float norm_rms_eps = hparams.f_norm_rms_eps;
2334
+
2335
+ const int n_gpu_layers = model.n_gpu_layers;
2336
+
2337
+ auto & buf_compute = lctx.buf_compute;
2338
+
2339
+ struct ggml_init_params params = {
2340
+ /*.mem_size =*/ buf_compute.size,
2341
+ /*.mem_buffer =*/ buf_compute.data,
2342
+ /*.no_alloc =*/ false,
2343
+ };
2344
+
2345
+ params.no_alloc = true;
2346
+
2347
+ struct ggml_context * ctx0 = ggml_init(params);
2348
+
2349
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
2350
+
2351
+ struct ggml_tensor * cur;
2352
+ struct ggml_tensor * inpL;
2353
+
2354
+ if (tokens) {
2355
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
2356
+
2357
+ ggml_allocr_alloc(lctx.alloc, inp_tokens);
2358
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2359
+ memcpy(inp_tokens->data, tokens, N*ggml_element_size(inp_tokens));
2360
+ }
2361
+ ggml_set_name(inp_tokens, "inp_tokens");
2362
+
2363
+ inpL = ggml_get_rows(ctx0, model.tok_embeddings, inp_tokens);
2364
+ } else {
2365
+ #ifdef GGML_USE_MPI
2366
+ GGML_ASSERT(false && "not implemented");
2367
+ #endif
2368
+
2369
+ inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N);
2370
+
2371
+ ggml_allocr_alloc(lctx.alloc, inpL);
2372
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2373
+ memcpy(inpL->data, embd, N * n_embd * ggml_element_size(inpL));
2374
+ }
2375
+ }
2376
+
2377
+ const int i_gpu_start = n_layer - n_gpu_layers;
2378
+ (void) i_gpu_start;
2379
+
2380
+ // offload functions set the tensor output backend to GPU
2381
+ // tensors are GPU-accelerated if any input or the output has been offloaded
2382
+ //
2383
+ // with the low VRAM option VRAM scratch is disabled in llama_load_model_internal
2384
+ // in that case ggml_cuda_assign_buffers has no effect
2385
+ offload_func_t offload_func_nr = llama_nop; // nr = non-repeating
2386
+ offload_func_t offload_func_kq = llama_nop;
2387
+ offload_func_t offload_func_v = llama_nop;
2388
+
2389
+ #ifdef GGML_USE_CUBLAS
2390
+ if (n_gpu_layers > n_layer) {
2391
+ offload_func_nr = ggml_cuda_assign_buffers_no_alloc;
2392
+ }
2393
+ if (n_gpu_layers > n_layer + 1) {
2394
+ offload_func_v = ggml_cuda_assign_buffers_no_alloc;
2395
+ }
2396
+ if (n_gpu_layers > n_layer + 2) {
2397
+ offload_func_kq = ggml_cuda_assign_buffers_no_alloc;
2398
+ }
2399
+ #endif // GGML_USE_CUBLAS
2400
+
2401
+ struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
2402
+ ggml_allocr_alloc(lctx.alloc, KQ_scale);
2403
+ if (!ggml_allocr_is_measure(lctx.alloc)) {
2404
+ ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
2405
+ }
2406
+ ggml_set_name(KQ_scale, "1/sqrt(n_embd_head)");
2407
+
2408
+ for (int il = 0; il < n_layer; ++il) {
2409
+ ggml_format_name(inpL, "layer_inp_%d", il);
2410
+
2411
+ offload_func_t offload_func = llama_nop;
2412
+
2413
+ #ifdef GGML_USE_CUBLAS
2414
+ if (il >= i_gpu_start) {
2415
+ offload_func = ggml_cuda_assign_buffers_no_alloc;
2416
+ }
2417
+ #endif // GGML_USE_CUBLAS
2418
+
2419
+ struct ggml_tensor * inpSA = inpL;
2420
+
2421
+ // norm
2422
+ {
2423
+ cur = ggml_rms_norm(ctx0, inpL, norm_rms_eps);
2424
+ offload_func(cur);
2425
+ ggml_set_name(cur, "rms_norm_0");
2426
+
2427
+ // cur = cur*attn_norm(broadcasted)
2428
+ cur = ggml_mul(ctx0, cur, model.layers[il].attn_norm);
2429
+ offload_func(cur);
2430
+ ggml_set_name(cur, "attention_norm_0");
2431
+ }
2432
+
2433
+ // self-attention
2434
+ {
2435
+ // compute Q and K and RoPE them
2436
+ struct ggml_tensor * tmpk = ggml_mul_mat(ctx0, model.layers[il].wk, cur);
2437
+ offload_func_kq(tmpk);
2438
+ ggml_set_name(tmpk, "tmpk");
2439
+
2440
+ struct ggml_tensor * tmpq = ggml_mul_mat(ctx0, model.layers[il].wq, cur);
2441
+ offload_func_kq(tmpq);
2442
+ ggml_set_name(tmpq, "tmpq");
2443
+
2444
+ struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2445
+ offload_func_kq(Kcur);
2446
+ ggml_set_name(Kcur, "Kcur");
2447
+
2448
+ struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2449
+ offload_func_kq(Qcur);
2450
+ ggml_set_name(Qcur, "Qcur");
2451
+
2452
+ // store key and value to memory
2453
+ {
2454
+ // compute the transposed [N, n_embd] V matrix
2455
+
2456
+ struct ggml_tensor * tmpv = ggml_mul_mat(ctx0, model.layers[il].wv, cur);
2457
+ offload_func_v(tmpv);
2458
+ ggml_set_name(tmpv, "tmpv");
2459
+
2460
+ struct ggml_tensor * Vcur = ggml_transpose(ctx0, ggml_reshape_2d(ctx0, tmpv, n_embd_gqa, N));
2461
+ offload_func_v(Vcur);
2462
+ ggml_set_name(Vcur, "Vcur");
2463
+
2464
+ struct ggml_tensor * k = ggml_view_1d(ctx0, kv_self.k, N*n_embd_gqa, (ggml_element_size(kv_self.k)*n_embd_gqa)*(il*n_ctx + n_past));
2465
+ offload_func_kq(k);
2466
+ ggml_set_name(k, "k");
2467
+
2468
+ struct ggml_tensor * v = ggml_view_2d(ctx0, kv_self.v, N, n_embd_gqa,
2469
+ ( n_ctx)*ggml_element_size(kv_self.v),
2470
+ (il*n_ctx)*ggml_element_size(kv_self.v)*n_embd_gqa + n_past*ggml_element_size(kv_self.v));
2471
+ offload_func_v(v);
2472
+ ggml_set_name(v, "v");
2473
+
2474
+ // important: storing RoPE-ed version of K in the KV cache!
2475
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
2476
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
2477
+ }
2478
+
2479
+ struct ggml_tensor * Q = ggml_permute(ctx0, Qcur, 0, 2, 1, 3);
2480
+ offload_func_kq(Q);
2481
+ ggml_set_name(Q, "Q");
2482
+
2483
+ struct ggml_tensor * K =
2484
+ ggml_view_3d(ctx0, kv_self.k,
2485
+ n_embd_head, n_past + N, n_head_kv,
2486
+ ggml_element_size(kv_self.k)*n_embd_gqa,
2487
+ ggml_element_size(kv_self.k)*n_embd_head,
2488
+ ggml_element_size(kv_self.k)*n_embd_gqa*n_ctx*il);
2489
+ offload_func_kq(K);
2490
+ ggml_set_name(K, "K");
2491
+
2492
+ // K * Q
2493
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
2494
+ offload_func_kq(KQ);
2495
+ ggml_set_name(KQ, "KQ");
2496
+
2497
+ // KQ_scaled = KQ / sqrt(n_embd_head)
2498
+ // KQ_scaled shape [n_past + N, N, n_head, 1]
2499
+ struct ggml_tensor * KQ_scaled = ggml_scale_inplace(ctx0, KQ, KQ_scale);
2500
+ offload_func_kq(KQ_scaled);
2501
+ ggml_set_name(KQ_scaled, "KQ_scaled");
2502
+
2503
+ // KQ_masked = mask_past(KQ_scaled)
2504
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2505
+ offload_func_kq(KQ_masked);
2506
+ ggml_set_name(KQ_masked, "KQ_masked");
2507
+
2508
+ // KQ = soft_max(KQ_masked)
2509
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
2510
+ offload_func_v(KQ_soft_max);
2511
+ ggml_set_name(KQ_soft_max, "KQ_soft_max");
2512
+
2513
+ // split cached V into n_head heads
2514
+ struct ggml_tensor * V =
2515
+ ggml_view_3d(ctx0, kv_self.v,
2516
+ n_past + N, n_embd_head, n_head_kv,
2517
+ ggml_element_size(kv_self.v)*n_ctx,
2518
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_head,
2519
+ ggml_element_size(kv_self.v)*n_ctx*n_embd_gqa*il);
2520
+ offload_func_v(V);
2521
+ ggml_set_name(V, "V");
2522
+
2523
+ #if 1
2524
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
2525
+ offload_func_v(KQV);
2526
+ ggml_set_name(KQV, "KQV");
2527
+ #else
2528
+ // make V contiguous in memory to speed up the matmul, however we waste time on the copy
2529
+ // on M1 this is faster for the perplexity computation, but ~5% slower for the single-token generation
2530
+ // is there a better way?
2531
+ struct ggml_tensor * V_cont = ggml_cpy(ctx0, V, ggml_new_tensor_3d(ctx0, kv_self.v->type, n_past + N, n_embd_head, n_head));
2532
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_cont, KQ_soft_max);
2533
+ #endif
2534
+
2535
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
2536
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
2537
+ offload_func_v(KQV_merged);
2538
+ ggml_set_name(KQV_merged, "KQV_merged");
2539
+
2540
+ // cur = KQV_merged.contiguous().view(n_embd, N)
2541
+ cur = ggml_cpy(ctx0,
2542
+ KQV_merged,
2543
+ ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
2544
+ offload_func_v(cur);
2545
+ ggml_set_name(cur, "KQV_merged_contiguous");
2546
+
2547
+ // projection (no bias)
2548
+ cur = ggml_mul_mat(ctx0,
2549
+ model.layers[il].wo,
2550
+ cur);
2551
+ offload_func(cur);
2552
+ ggml_set_name(cur, "result_wo");
2553
+ }
2078
2554
 
2079
- LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n",
2080
- __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
2081
- LLAMA_LOG_INFO("%s: VRAM used: %zu MB\n",
2082
- __func__, (vram_weights + vram_kv_cache + MB - 1) / MB); // round up
2083
- #else
2084
- (void) n_gpu_layers;
2085
- #endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
2086
- }
2555
+ struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpSA);
2556
+ offload_func(inpFF);
2557
+ ggml_set_name(inpFF, "inpFF");
2087
2558
 
2088
- // populate `tensors_by_name`
2089
- for (int i = 0; i < ml.n_tensors; ++i) {
2090
- struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
2091
- model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
2092
- }
2559
+ // feed-forward network
2560
+ {
2561
+ // norm
2562
+ {
2563
+ cur = ggml_rms_norm(ctx0, inpFF, norm_rms_eps);
2564
+ offload_func(cur);
2565
+ ggml_set_name(cur, "rms_norm_1");
2093
2566
 
2094
- (void) tensor_split;
2095
- #if defined(GGML_USE_CUBLAS)
2096
- {
2097
- ggml_cuda_set_tensor_split(tensor_split);
2098
- }
2099
- #endif
2567
+ // cur = cur*ffn_norm(broadcasted)
2568
+ cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
2569
+ offload_func(cur);
2570
+ ggml_set_name(cur, "ffn_norm");
2571
+ }
2100
2572
 
2101
- ml.load_all_data(ctx, progress_callback, progress_callback_user_data, use_mlock ? &model.mlock_mmap : NULL);
2573
+ struct ggml_tensor * tmp = ggml_mul_mat(ctx0,
2574
+ model.layers[il].w3,
2575
+ cur);
2576
+ offload_func(tmp);
2577
+ ggml_set_name(tmp, "result_w3");
2102
2578
 
2103
- if (progress_callback) {
2104
- progress_callback(1.0f, progress_callback_user_data);
2105
- }
2579
+ cur = ggml_mul_mat(ctx0,
2580
+ model.layers[il].w1,
2581
+ cur);
2582
+ offload_func(cur);
2583
+ ggml_set_name(cur, "result_w1");
2106
2584
 
2107
- model.mapping = std::move(ml.mapping);
2585
+ // SILU activation
2586
+ cur = ggml_silu(ctx0, cur);
2587
+ offload_func(cur);
2588
+ ggml_set_name(cur, "silu");
2108
2589
 
2109
- // loading time will be recalculate after the first eval, so
2110
- // we take page faults deferred by mmap() into consideration
2111
- model.t_load_us = ggml_time_us() - model.t_start_us;
2112
- }
2590
+ cur = ggml_mul(ctx0, cur, tmp);
2591
+ offload_func(cur);
2592
+ ggml_set_name(cur, "silu_x_result_w3");
2113
2593
 
2114
- static bool llama_model_load(
2115
- const std::string & fname,
2116
- llama_model & model,
2117
- int n_ctx,
2118
- int n_batch,
2119
- int n_gpu_layers,
2120
- int main_gpu,
2121
- const float * tensor_split,
2122
- const bool mul_mat_q,
2123
- float rope_freq_base,
2124
- float rope_freq_scale,
2125
- bool low_vram,
2126
- ggml_type memory_type,
2127
- bool use_mmap,
2128
- bool use_mlock,
2129
- bool vocab_only,
2130
- llama_progress_callback progress_callback,
2131
- void *progress_callback_user_data) {
2132
- try {
2133
- std::unique_ptr<llama_model_loader> ml(new llama_model_loader(fname, use_mmap));
2594
+ cur = ggml_mul_mat(ctx0,
2595
+ model.layers[il].w2,
2596
+ cur);
2597
+ offload_func(cur);
2598
+ ggml_set_name(cur, "result_w2");
2599
+ }
2134
2600
 
2135
- llm_load_arch (*ml, model);
2136
- llm_load_hparams(*ml, model, n_ctx, rope_freq_base, rope_freq_scale);
2137
- llm_load_vocab (*ml, model);
2601
+ cur = ggml_add(ctx0, cur, inpFF);
2602
+ offload_func(cur);
2603
+ ggml_set_name(cur, "inpFF_+_result_w2");
2138
2604
 
2139
- llm_load_print_meta(*ml, model);
2605
+ // input for next layer
2606
+ inpL = cur;
2607
+ }
2140
2608
 
2141
- if (model.hparams.n_vocab != model.vocab.id_to_token.size()) {
2142
- throw std::runtime_error("vocab size mismatch");
2143
- }
2609
+ cur = inpL;
2144
2610
 
2145
- if (vocab_only) {
2146
- LLAMA_LOG_INFO("%s: vocab only - skipping tensors\n", __func__);
2147
- return true;
2148
- }
2611
+ // norm
2612
+ {
2613
+ cur = ggml_rms_norm(ctx0, cur, norm_rms_eps);
2614
+ offload_func_nr(cur);
2615
+ ggml_set_name(cur, "rms_norm_2");
2149
2616
 
2150
- llm_load_tensors(
2151
- *ml, model, n_batch, n_gpu_layers,
2152
- main_gpu, tensor_split, mul_mat_q, low_vram, memory_type,
2153
- use_mlock, progress_callback, progress_callback_user_data);
2154
- } catch (const std::exception & err) {
2155
- LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
2156
- return false;
2617
+ // cur = cur*norm(broadcasted)
2618
+ cur = ggml_mul(ctx0, cur, model.output_norm);
2619
+ // offload_func_nr(cur); // TODO CPU + GPU mirrored backend
2620
+ ggml_set_name(cur, "result_norm");
2157
2621
  }
2158
2622
 
2159
- return true;
2623
+ // lm_head
2624
+ cur = ggml_mul_mat(ctx0, model.output, cur);
2625
+ ggml_set_name(cur, "result_output");
2626
+
2627
+ ggml_build_forward_expand(gf, cur);
2628
+
2629
+ ggml_free(ctx0);
2630
+
2631
+ return gf;
2160
2632
  }
2161
2633
 
2162
- static struct ggml_cgraph * llm_build_llama(
2634
+
2635
+ static struct ggml_cgraph * llm_build_baichaun(
2163
2636
  llama_context & lctx,
2164
2637
  const llama_token * tokens,
2165
2638
  const float * embd,
@@ -2300,11 +2773,24 @@ static struct ggml_cgraph * llm_build_llama(
2300
2773
  offload_func_kq(tmpq);
2301
2774
  ggml_set_name(tmpq, "tmpq");
2302
2775
 
2303
- struct ggml_tensor * Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2776
+ struct ggml_tensor * Kcur;
2777
+ struct ggml_tensor * Qcur;
2778
+ switch (model.type) {
2779
+ case MODEL_7B:
2780
+ Kcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpk, n_embd_head, n_head_kv, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2781
+ Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2782
+ break;
2783
+ case MODEL_13B:
2784
+ Kcur = ggml_reshape_3d(ctx0, tmpk, n_embd/n_head, n_head, N);
2785
+ Qcur = ggml_reshape_3d(ctx0, tmpq, n_embd/n_head, n_head, N);
2786
+ break;
2787
+ default:
2788
+ GGML_ASSERT(false);
2789
+ }
2790
+
2304
2791
  offload_func_kq(Kcur);
2305
2792
  ggml_set_name(Kcur, "Kcur");
2306
2793
 
2307
- struct ggml_tensor * Qcur = ggml_rope_custom_inplace(ctx0, ggml_reshape_3d(ctx0, tmpq, n_embd_head, n_head, N), n_past, n_embd_head, 0, 0, freq_base, freq_scale);
2308
2794
  offload_func_kq(Qcur);
2309
2795
  ggml_set_name(Qcur, "Qcur");
2310
2796
 
@@ -2359,10 +2845,26 @@ static struct ggml_cgraph * llm_build_llama(
2359
2845
  offload_func_kq(KQ_scaled);
2360
2846
  ggml_set_name(KQ_scaled, "KQ_scaled");
2361
2847
 
2848
+ struct ggml_tensor * KQ_masked;
2849
+ struct ggml_tensor * KQ_scaled_alibi;
2850
+
2851
+ switch (model.type) {
2852
+ case MODEL_7B:
2853
+ KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2854
+ break;
2855
+ case MODEL_13B:
2856
+ KQ_scaled_alibi =ggml_alibi(ctx0, KQ_scaled, n_past, n_head, 8);
2857
+ ggml_set_name(KQ_scaled_alibi, "KQ_scaled_alibi");
2858
+ KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
2859
+ break;
2860
+ default:
2861
+ GGML_ASSERT(false);
2862
+ }
2362
2863
  // KQ_masked = mask_past(KQ_scaled)
2363
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2364
- offload_func_kq(KQ_masked);
2365
- ggml_set_name(KQ_masked, "KQ_masked");
2864
+ // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf_inplace(ctx0, KQ_scaled, n_past);
2865
+ // struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled_alibi, n_past);
2866
+ // offload_func_kq(KQ_masked);
2867
+ // ggml_set_name(KQ_masked, "KQ_masked");
2366
2868
 
2367
2869
  // KQ = soft_max(KQ_masked)
2368
2870
  struct ggml_tensor * KQ_soft_max = ggml_soft_max_inplace(ctx0, KQ_masked);
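For Baichuan-13B the graph builder above skips RoPE and instead biases the attention scores with ggml_alibi(..., n_head, 8) before applying the causal mask: each head h adds a linear penalty proportional to the query/key distance, scaled by a per-head slope m_h. A sketch of the slope schedule this is built from, assuming the standard ALiBi interpolation for head counts that are not a power of two (ggml_alibi's internals may differ in detail):

#include <cmath>
#include <cstdio>
#include <vector>

int main() {
    const int   n_head   = 40;    // Baichuan-13B head count
    const float max_bias = 8.0f;  // last argument passed to ggml_alibi in the hunk

    // nearest power of two <= n_head, used to derive the base slopes
    const int n_head_pow2 = 1 << (int) std::floor(std::log2((float) n_head));

    const float m0 = std::pow(2.0f, -max_bias / n_head_pow2);
    const float m1 = std::pow(2.0f, -(max_bias / 2.0f) / n_head_pow2);

    std::vector<float> slopes(n_head);
    for (int h = 0; h < n_head; ++h) {
        slopes[h] = h < n_head_pow2
            ? std::pow(m0, (float) (h + 1))                       // heads within the power-of-two block
            : std::pow(m1, (float) (2 * (h - n_head_pow2) + 1));  // interpolated slopes for the rest
    }

    for (int h = 0; h < n_head; ++h) {
        printf("head %2d: slope = %g\n", h, slopes[h]);
    }
    return 0;
}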
@@ -2812,6 +3314,10 @@ static struct ggml_cgraph * llama_build_graph(
2812
3314
  {
2813
3315
  result = llm_build_llama(lctx, tokens, embd, n_tokens, n_past);
2814
3316
  } break;
3317
+ case LLM_ARCH_BAICHUAN:
3318
+ {
3319
+ result = llm_build_baichaun(lctx, tokens, embd, n_tokens, n_past);
3320
+ } break;
2815
3321
  case LLM_ARCH_FALCON:
2816
3322
  {
2817
3323
  result = llm_build_falcon(lctx, tokens, embd, n_tokens, n_past);
@@ -2895,7 +3401,12 @@ static bool llama_eval_internal(
2895
3401
 
2896
3402
  // for big prompts, if BLAS is enabled, it is better to use only one thread
2897
3403
  // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
2898
- n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
3404
+ // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
3405
+ // we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
3406
+ // with the BLAS calls. need a better solution
3407
+ if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
3408
+ n_threads = std::min(4, n_threads);
3409
+ }
2899
3410
 
2900
3411
  struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
2901
3412
  struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
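The change above replaces the old "drop to one thread when BLAS will handle a large prompt" rule with a clamp to at most four threads, so the remaining non-matmul ops still run in parallel without spin-waiting against the BLAS calls. The heuristic extracted as a standalone sketch, where cpu_has_blas/cpu_has_gpublas stand in for ggml_cpu_has_blas()/ggml_cpu_has_gpublas():

#include <algorithm>
#include <cstdio>

static int pick_n_threads(int n_tokens, int requested, bool cpu_has_blas, bool cpu_has_gpublas) {
    if (n_tokens >= 32 && cpu_has_blas && !cpu_has_gpublas) {
        return std::min(4, requested);  // leave a few threads for the non-BLAS ops
    }
    return requested;
}

int main() {
    printf("%d\n", pick_n_threads(512, 16, true,  false)); // big prompt, CPU BLAS -> 4
    printf("%d\n", pick_n_threads(1,   16, true,  false)); // single token          -> 16
    printf("%d\n", pick_n_threads(512, 16, false, false)); // no BLAS               -> 16
    return 0;
}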
@@ -3000,33 +3511,10 @@ static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
3000
3511
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
3001
3512
  }
3002
3513
 
3003
- static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
3004
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
3005
- }
3006
-
3007
- static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
3008
- return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
3009
- }
3010
-
3011
3514
  static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
3012
3515
  return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
3013
3516
  }
3014
3517
 
3015
- static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
3016
- GGML_ASSERT(llama_is_control_token(vocab, id));
3017
- return id == vocab.special_bos_id;
3018
- }
3019
-
3020
- static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
3021
- GGML_ASSERT(llama_is_control_token(vocab, id));
3022
- return id == vocab.special_eos_id;
3023
- }
3024
-
3025
- static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
3026
- GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
3027
- return id == vocab.special_pad_id;
3028
- }
3029
-
3030
3518
  static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
3031
3519
  GGML_ASSERT(llama_is_byte_token(vocab, id));
3032
3520
  const auto& token_data = vocab.id_to_token.at(id);
@@ -3087,10 +3575,9 @@ struct llm_tokenizer_spm {
3087
3575
  while (offs < text.size()) {
3088
3576
  llm_symbol sym;
3089
3577
  size_t len = utf8_len(text[offs]);
3090
- GGML_ASSERT(offs + len <= text.size());
3091
3578
  sym.text = text.c_str() + offs;
3092
- sym.n = len;
3093
- offs += len;
3579
+ sym.n = std::min(len, text.size() - offs);
3580
+ offs += sym.n;
3094
3581
  sym.prev = index - 1;
3095
3582
  sym.next = offs == text.size() ? -1 : index + 1;
3096
3583
  index++;
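The tokenizer fix above clamps a symbol's byte length to the bytes actually remaining, so a truncated multi-byte UTF-8 sequence at the end of the input no longer trips the removed assertion. A sketch of the behaviour; the utf8_len() definition here is the usual first-byte lookup and is an assumption, not part of this hunk:

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <string>

static size_t utf8_len(char src) {
    // declared sequence length from the first byte's high nibble
    static const size_t lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 };
    uint8_t highbits = static_cast<uint8_t>(src) >> 4;
    return lookup[highbits];
}

int main() {
    // "\xC3" is the first byte of a 2-byte sequence; the second byte is missing
    std::string text = "abc\xC3";

    size_t offs = 0;
    while (offs < text.size()) {
        size_t len = utf8_len(text[offs]);
        size_t n   = std::min(len, text.size() - offs); // clamp instead of asserting
        printf("symbol at %zu: declared %zu byte(s), taking %zu\n", offs, len, n);
        offs += n;
    }
    return 0;
}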
@@ -3319,9 +3806,15 @@ struct llm_tokenizer_bpe {
3319
3806
  std::string byte_str(1, *j);
3320
3807
  auto token_multibyte = vocab.token_to_id.find(byte_str);
3321
3808
  if (token_multibyte == vocab.token_to_id.end()) {
3322
- fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
3809
+ try {
3810
+ llama_token token_byte = llama_byte_to_token(vocab, *j);
3811
+ output.push_back(token_byte);
3812
+ } catch (const std::out_of_range & err) {
3813
+ fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
3814
+ }
3815
+ } else {
3816
+ output.push_back((*token_multibyte).second);
3323
3817
  }
3324
- output.push_back((*token_multibyte).second);
3325
3818
  }
3326
3819
  } else {
3327
3820
  output.push_back((*token).second);
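The BPE hunk above adds a fallback path: when a raw byte has no direct vocab entry, it tries a byte-token lookup (llama_byte_to_token, which throws std::out_of_range on a miss) before printing the error. A generic sketch of that lookup-then-fallback pattern; the byte_to_token helper and the "<0xXX>" byte-token convention are illustrative assumptions, not taken from this hunk:

#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

using llama_token = int;

static llama_token byte_to_token(const std::map<std::string, llama_token> & vocab, unsigned char byte) {
    char buf[16];
    snprintf(buf, sizeof(buf), "<0x%02X>", byte);
    return vocab.at(buf); // throws std::out_of_range if the byte token is missing
}

int main() {
    std::map<std::string, llama_token> token_to_id = { { "a", 1 }, { "<0xC3>", 259 } };

    const unsigned char bytes[] = { 'a', 0xC3, 0xFF };
    for (unsigned char byte : bytes) {
        std::string byte_str(1, (char) byte);
        auto it = token_to_id.find(byte_str);
        if (it != token_to_id.end()) {
            printf("byte 0x%02X -> token %d (direct)\n", byte, it->second);
        } else {
            try {
                printf("byte 0x%02X -> token %d (byte fallback)\n", byte, byte_to_token(token_to_id, byte));
            } catch (const std::out_of_range &) {
                fprintf(stderr, "ERROR: byte not found in vocab: 0x%02X\n", byte);
            }
        }
    }
    return 0;
}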
@@ -3595,7 +4088,7 @@ static void llama_grammar_advance_stack(
3595
4088
  std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
3596
4089
 
3597
4090
  if (stack.empty()) {
3598
- new_stacks.push_back(stack);
4091
+ new_stacks.emplace_back(stack);
3599
4092
  return;
3600
4093
  }
3601
4094
 
@@ -3632,7 +4125,7 @@ static void llama_grammar_advance_stack(
3632
4125
  }
3633
4126
  case LLAMA_GRETYPE_CHAR:
3634
4127
  case LLAMA_GRETYPE_CHAR_NOT:
3635
- new_stacks.push_back(stack);
4128
+ new_stacks.emplace_back(stack);
3636
4129
  break;
3637
4130
  default:
3638
4131
  // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -3797,6 +4290,25 @@ void llama_grammar_free(struct llama_grammar * grammar) {
3797
4290
  delete grammar;
3798
4291
  }
3799
4292
 
4293
+ struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
4294
+ llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
4295
+
4296
+ // redirect elements in stacks to point to new rules
4297
+ for (size_t is = 0; is < result->stacks.size(); is++) {
4298
+ for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
4299
+ for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
4300
+ for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
4301
+ if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
4302
+ result->stacks[is][ie] = &result->rules[ir0][ir1];
4303
+ }
4304
+ }
4305
+ }
4306
+ }
4307
+ }
4308
+
4309
+ return result;
4310
+ }
4311
+
3800
4312
  //
3801
4313
  // sampling
3802
4314
  //
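llama_grammar_copy above copies the rules and stacks by value and then walks every stack entry, redirecting pointers that still reference the source grammar's rule storage to the element at the same indices in the copy. The same rebasing idea on a plain vector-of-vectors, as a self-contained sketch:

#include <cassert>
#include <vector>

struct holder {
    std::vector<std::vector<int>> rules;
    std::vector<const int *>      stack;   // pointers into `rules`
};

static holder copy_holder(const holder & src) {
    holder dst{ src.rules, src.stack };    // values copied, but pointers still target src
    for (auto & p : dst.stack) {
        for (size_t ir0 = 0; ir0 < src.rules.size(); ir0++) {
            for (size_t ir1 = 0; ir1 < src.rules[ir0].size(); ir1++) {
                if (p == &src.rules[ir0][ir1]) {
                    p = &dst.rules[ir0][ir1]; // rebase onto the copy's own storage
                }
            }
        }
    }
    return dst;
}

int main() {
    holder a;
    a.rules = { { 10, 11 }, { 20 } };
    a.stack = { &a.rules[0][1], &a.rules[1][0] };

    holder b = copy_holder(a);
    assert(*b.stack[0] == 11 && *b.stack[1] == 20);
    assert(b.stack[0] != a.stack[0]);      // b's pointers no longer alias a's storage
    return 0;
}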
@@ -4388,7 +4900,7 @@ struct llama_logit_info {
4388
4900
  }
4389
4901
  return min_heap;
4390
4902
  }
4391
- float probability_from_logit(float logit) {
4903
+ float probability_from_logit(float logit) const {
4392
4904
  return normalizer * std::exp(logit - max_l);
4393
4905
  }
4394
4906
  };
@@ -4581,7 +5093,16 @@ void llama_beam_search(llama_context * ctx,
4581
5093
  // quantization
4582
5094
  //
4583
5095
 
4584
- static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vector<float> & output, const size_t nelements, const int nthread) {
5096
+ template <typename T>
5097
+ struct no_init {
5098
+ T value;
5099
+ no_init() { /* do nothing */ }
5100
+ };
5101
+
5102
+ static void llama_convert_tensor_internal(
5103
+ struct ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
5104
+ const size_t nelements, const int nthread
5105
+ ) {
4585
5106
  if (output.size() < nelements) {
4586
5107
  output.resize(nelements);
4587
5108
  }
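The no_init<T> wrapper introduced above exists to avoid paying for value-initialization: std::vector<float>::resize() zero-fills new elements, which is wasted work for large conversion scratch buffers that are immediately overwritten. A sketch of the trick in isolation:

#include <cstdio>
#include <vector>

template <typename T>
struct no_init {
    T value;
    no_init() { /* intentionally empty: leave `value` uninitialized */ }
};

int main() {
    // resize() value-initializes: 16M floats are zero-filled here
    std::vector<float> zeroed;
    zeroed.resize(16 * 1024 * 1024);

    // the user-provided empty constructor means resize() only default-initializes;
    // the memory is not touched, so only read elements after assigning them
    std::vector<no_init<float>> raw;
    raw.resize(16 * 1024 * 1024);
    raw[0].value = 1.5f;

    printf("%f %f\n", zeroed[0], raw[0].value);
    return 0;
}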
@@ -4616,7 +5137,6 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
4616
5137
  auto blocks_per_thread = nblocks / nthread;
4617
5138
  auto spare_blocks = nblocks - (blocks_per_thread * nthread); // if blocks aren't divisible by thread count
4618
5139
 
4619
- std::vector<std::thread> workers;
4620
5140
  for (auto tnum = 0, in_buff_offs = 0, out_buff_offs = 0; tnum < nthread; tnum++) {
4621
5141
  auto thr_blocks = blocks_per_thread + (tnum == nthread - 1 ? spare_blocks : 0); // num blocks for this thread
4622
5142
  auto thr_elems = thr_blocks * block_size; // number of elements for this thread
@@ -4629,14 +5149,123 @@ static void llama_convert_tensor_internal(struct ggml_tensor * tensor, std::vect
4629
5149
  qtype.to_float(inbuf, outbuf, nels);
4630
5150
  }
4631
5151
  };
4632
- workers.push_back(std::thread(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems));
5152
+ workers.emplace_back(compute, tensor->type, (uint8_t *) tensor->data + in_buff_offs, f32_output + out_buff_offs, thr_elems);
4633
5153
  in_buff_offs += thr_block_bytes;
4634
5154
  out_buff_offs += thr_elems;
4635
5155
  }
4636
- for (auto & worker : workers) {
4637
- worker.join();
5156
+ for (auto & w : workers) { w.join(); }
5157
+ workers.clear();
5158
+ }
5159
+
5160
+ #ifdef GGML_USE_K_QUANTS
5161
+ static ggml_type get_k_quant_type(
5162
+ ggml_type new_type, const ggml_tensor * tensor, const llama_model & model, llama_ftype ftype, int * i_attention_wv,
5163
+ int n_attention_wv, int * i_feed_forward_w2, int n_feed_forward_w2
5164
+ ) {
5165
+ const std::string name = ggml_get_name(tensor);
5166
+ // TODO: avoid hardcoded tensor names - use the TN_* constants
5167
+ const auto tn = LLM_TN(model.arch);
5168
+
5169
+ auto use_more_bits = [](int i_layer, int num_layers) -> bool {
5170
+ return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
5171
+ };
5172
+
5173
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
5174
+ int nx = tensor->ne[0];
5175
+ if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
5176
+ new_type = GGML_TYPE_Q8_0;
5177
+ }
5178
+ else if (new_type != GGML_TYPE_Q8_0) {
5179
+ new_type = GGML_TYPE_Q6_K;
5180
+ }
5181
+ } else if (name.find("attn_v.weight") != std::string::npos) {
5182
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
5183
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
5184
+ new_type = *i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
5185
+ }
5186
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
5187
+ else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
5188
+ use_more_bits(*i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
5189
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && *i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
5190
+ else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
5191
+ (*i_attention_wv < n_attention_wv/8 || *i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
5192
+ if (model.type == MODEL_70B) {
5193
+ // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
5194
+ // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
5195
+ // nearly negligible increase in model size by quantizing this tensor with more bits:
5196
+ if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
5197
+ }
5198
+ ++*i_attention_wv;
5199
+ } else if (name.find("ffn_down.weight") != std::string::npos) {
5200
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
5201
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
5202
+ new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
5203
+ : model.arch != LLM_ARCH_FALCON || use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
5204
+ : GGML_TYPE_Q3_K;
5205
+ }
5206
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
5207
+ new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
5208
+ }
5209
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
5210
+ if (model.arch == LLM_ARCH_FALCON) {
5211
+ new_type = *i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
5212
+ use_more_bits(*i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
5213
+ } else {
5214
+ if (use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
5215
+ }
5216
+ }
5217
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(*i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
5218
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && *i_feed_forward_w2 < 4) {
5219
+ new_type = GGML_TYPE_Q5_K;
5220
+ }
5221
+ ++*i_feed_forward_w2;
5222
+ } else if (name.find("attn_output.weight") != std::string::npos) {
5223
+ if (model.arch != LLM_ARCH_FALCON) {
5224
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
5225
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
5226
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
5227
+ } else {
5228
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
5229
+ }
5230
+ }
5231
+ else if (name.find("attn_qkv.weight") != std::string::npos) {
5232
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
5233
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
5234
+ else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
5235
+ }
5236
+ else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
5237
+ if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
5238
+ }
5239
+ // This can be used to reduce the size of the Q5_K_S model.
5240
+ // The associated PPL increase is fully in line with the size reduction
5241
+ //else {
5242
+ // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
5243
+ //}
5244
+ bool convert_incompatible_tensor = false;
5245
+ if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
5246
+ new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
5247
+ int nx = tensor->ne[0];
5248
+ int ny = tensor->ne[1];
5249
+ if (nx % QK_K != 0) {
5250
+ LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
5251
+ convert_incompatible_tensor = true;
5252
+ }
5253
+ }
5254
+ if (convert_incompatible_tensor) {
5255
+ if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
5256
+ new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
5257
+ LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
5258
+ } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
5259
+ new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
5260
+ LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
5261
+ } else {
5262
+ throw std::runtime_error("Unsupported tensor size encountered\n");
5263
+ }
4638
5264
  }
5265
+
5266
+ return new_type;
4639
5267
  }
5268
+ #endif
4640
5269
 
4641
5270
  static void llama_model_quantize_internal(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
4642
5271
  ggml_type quantized_type;
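get_k_quant_type above hoists the per-tensor k-quant selection out of the quantization loop; its use_more_bits() lambda decides which attn_v/ffn_down layers get a higher-precision type, roughly the first and last eighth of the layers plus every third layer in between. A tiny sketch that prints the schedule for a 32-layer model:

#include <cstdio>

static bool use_more_bits(int i_layer, int num_layers) {
    return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
}

int main() {
    const int n_layer = 32;  // e.g. a 7B LLaMA-style model
    printf("layers getting extra bits out of %d:", n_layer);
    for (int i = 0; i < n_layer; ++i) {
        if (use_more_bits(i, n_layer)) {
            printf(" %d", i);
        }
    }
    printf("\n");
    return 0;
}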
@@ -4678,6 +5307,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4678
5307
  llm_load_arch(*ml, model);
4679
5308
  llm_load_hparams(*ml, model, 0, 0, 0);
4680
5309
 
5310
+ if (params->only_copy) {
5311
+ ftype = model.ftype;
5312
+ }
5313
+
4681
5314
  const size_t align = GGUF_DEFAULT_ALIGNMENT;
4682
5315
  struct gguf_context * ctx_out = gguf_init_empty();
4683
5316
 
@@ -4717,16 +5350,14 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4717
5350
  std::vector<int64_t> hist_all(1 << 4, 0);
4718
5351
 
4719
5352
  std::vector<std::thread> workers;
5353
+ workers.reserve(nthread);
4720
5354
  std::mutex mutex;
4721
5355
 
4722
- auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
4723
- return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
4724
- };
4725
-
4726
5356
  int idx = 0;
4727
5357
 
4728
- std::vector<uint8_t> read_data;
4729
- std::vector<uint8_t> work;
5358
+ std::vector<no_init<uint8_t>> read_data;
5359
+ std::vector<no_init<uint8_t>> work;
5360
+ std::vector<no_init<float>> f32_conv_buf;
4730
5361
 
4731
5362
  // populate the original tensors so we get an initial meta data
4732
5363
  for (int i = 0; i < ml->n_tensors; ++i) {
@@ -4748,7 +5379,9 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4748
5379
 
4749
5380
  const std::string name = ggml_get_name(tensor);
4750
5381
 
4751
- read_data.resize(ggml_nbytes(tensor));
5382
+ if (read_data.size() < ggml_nbytes(tensor)) {
5383
+ read_data.resize(ggml_nbytes(tensor));
5384
+ }
4752
5385
  tensor->data = read_data.data();
4753
5386
  ml->load_data_for(tensor);
4754
5387
 
@@ -4764,137 +5397,50 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4764
5397
  // quantize only 2D tensors
4765
5398
  quantize &= (tensor->n_dims == 2);
4766
5399
  quantize &= params->quantize_output_tensor || name != "output.weight";
4767
- quantize &= quantized_type != tensor->type;
5400
+ quantize &= !params->only_copy;
4768
5401
 
4769
5402
  enum ggml_type new_type;
4770
5403
  void * new_data;
4771
5404
  size_t new_size;
4772
5405
 
5406
+ if (quantize) {
5407
+ new_type = quantized_type;
5408
+ #ifdef GGML_USE_K_QUANTS
5409
+ new_type = get_k_quant_type(
5410
+ new_type, tensor, model, ftype, &i_attention_wv, n_attention_wv, &i_feed_forward_w2, n_feed_forward_w2
5411
+ );
5412
+ #endif
5413
+ // If we've decided to quantize to the same type the tensor is already
5414
+ // in then there's nothing to do.
5415
+ quantize = tensor->type != new_type;
5416
+ }
4773
5417
  if (!quantize) {
4774
5418
  new_type = tensor->type;
4775
5419
  new_data = tensor->data;
4776
5420
  new_size = ggml_nbytes(tensor);
4777
5421
  LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
4778
5422
  } else {
4779
- new_type = quantized_type;
4780
- #ifdef GGML_USE_K_QUANTS
4781
- // TODO: avoid hardcoded tensor names - use the TN_* constants
4782
- const auto tn = LLM_TN(ml->get_arch());
4783
-
4784
- if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
4785
- int nx = tensor->ne[0];
4786
- if (model.arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
4787
- new_type = GGML_TYPE_Q8_0;
4788
- }
4789
- else if (new_type != GGML_TYPE_Q8_0) {
4790
- new_type = GGML_TYPE_Q6_K;
4791
- }
4792
- } else if (name.find("attn_v.weight") != std::string::npos) {
4793
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
4794
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
4795
- new_type = i_attention_wv < 2 ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
4796
- }
4797
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
4798
- else if ((ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) &&
4799
- use_more_bits(i_attention_wv, n_attention_wv)) new_type = GGML_TYPE_Q6_K;
4800
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && i_attention_wv < 4) new_type = GGML_TYPE_Q5_K;
4801
- else if (QK_K == 64 && (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S) &&
4802
- (i_attention_wv < n_attention_wv/8 || i_attention_wv >= 7*n_attention_wv/8)) new_type = GGML_TYPE_Q6_K;
4803
- if (model.type == MODEL_70B) {
4804
- // In the 70B model we have 8 heads sharing the same attn_v weights. As a result, the attn_v.weight tensor is
4805
- // 8x smaller compared to attn_q.weight. Hence, we can get a nice boost in quantization accuracy with
4806
- // nearly negligible increase in model size by quantizing this tensor with more bits:
4807
- if (new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K) new_type = GGML_TYPE_Q5_K;
4808
- }
4809
- ++i_attention_wv;
4810
- } else if (name.find("ffn_down.weight") != std::string::npos) {
4811
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
4812
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
4813
- new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
4814
- : model.arch != LLM_ARCH_FALCON || use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q4_K
4815
- : GGML_TYPE_Q3_K;
4816
- }
4817
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) {
4818
- new_type = model.arch == LLM_ARCH_FALCON ? GGML_TYPE_Q4_K : GGML_TYPE_Q5_K;
4819
- }
4820
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
4821
- if (model.arch == LLM_ARCH_FALCON) {
4822
- new_type = i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
4823
- use_more_bits(i_feed_forward_w2, n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
4824
- } else {
4825
- if (use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
4826
- }
4827
- }
4828
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(i_feed_forward_w2, n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
4829
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && model.arch != LLM_ARCH_FALCON && i_feed_forward_w2 < 4) {
4830
- new_type = GGML_TYPE_Q5_K;
4831
- }
4832
- ++i_feed_forward_w2;
4833
- } else if (name.find("attn_output.weight") != std::string::npos) {
4834
- if (model.arch != LLM_ARCH_FALCON) {
4835
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K ) new_type = GGML_TYPE_Q3_K;
4836
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
4837
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
4838
- } else {
4839
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
4840
- }
4841
- }
4842
- else if (name.find("attn_qkv.weight") != std::string::npos) {
4843
- if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
4844
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
4845
- else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
4846
- }
4847
- else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
4848
- if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
4849
- }
4850
- // This can be used to reduce the size of the Q5_K_S model.
4851
- // The associated PPL increase is fully in line with the size reduction
4852
- //else {
4853
- // if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_S) new_type = GGML_TYPE_Q4_K;
4854
- //}
4855
- bool convert_incompatible_tensor = false;
4856
- if (new_type == GGML_TYPE_Q2_K || new_type == GGML_TYPE_Q3_K || new_type == GGML_TYPE_Q4_K ||
4857
- new_type == GGML_TYPE_Q5_K || new_type == GGML_TYPE_Q6_K) {
4858
- int nx = tensor->ne[0];
4859
- int ny = tensor->ne[1];
4860
- if (nx % QK_K != 0) {
4861
- LLAMA_LOG_WARN("\n\n%s : tensor cols %d x %d are not divisible by %d, required for k-quants\n", __func__, nx, ny, QK_K);
4862
- convert_incompatible_tensor = true;
4863
- }
4864
- }
4865
- if (convert_incompatible_tensor) {
4866
- if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
4867
- new_type = GGML_TYPE_F16; //fall back to F16 instead of just failing.
4868
- LLAMA_LOG_WARN("F16 will be used for this tensor instead.\n");
4869
- } else if (name == tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
4870
- new_type = GGML_TYPE_Q4_0; //fall back to Q4_0 instead of just failing.
4871
- LLAMA_LOG_WARN("Q4_0 will be used for this tensor instead.\n");
4872
- } else {
4873
- throw std::runtime_error("Unsupported tensor size encountered\n");
4874
- }
4875
- }
4876
- #endif
4877
-
4878
5423
  const size_t nelements = ggml_nelements(tensor);
4879
5424
 
4880
5425
  float * f32_data;
4881
- std::vector<float> f32_conv_buf;
4882
5426
 
4883
5427
  if (tensor->type == GGML_TYPE_F32) {
4884
5428
  f32_data = (float *) tensor->data;
4885
5429
  } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
4886
5430
  throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
4887
5431
  } else {
4888
- llama_convert_tensor_internal(tensor, f32_conv_buf, nelements, nthread);
5432
+ llama_convert_tensor_internal(tensor, f32_conv_buf, workers, nelements, nthread);
4889
5433
  f32_data = (float *) f32_conv_buf.data();
4890
5434
  }
4891
5435
 
4892
5436
  LLAMA_LOG_INFO("quantizing to %s .. ", ggml_type_name(new_type));
4893
5437
  fflush(stdout);
4894
5438
 
4895
- work.resize(nelements * 4); // upper bound on size
5439
+ if (work.size() < nelements * 4) {
5440
+ work.resize(nelements * 4); // upper bound on size
5441
+ }
4896
5442
  new_data = work.data();
4897
- std::vector<int64_t> hist_cur(1 << 4, 0);
5443
+ std::array<int64_t, 1 << 4> hist_cur = {};
4898
5444
 
4899
5445
  static const int chunk_size = 32 * 512;
4900
5446
  const int nchunk = (nelements + chunk_size - 1)/chunk_size;
@@ -4905,13 +5451,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4905
5451
  size_t counter = 0;
4906
5452
  new_size = 0;
4907
5453
  auto compute = [&mutex, &counter, &hist_cur, &new_size, new_type, f32_data, new_data, nelements]() {
4908
- std::vector<int64_t> local_hist;
5454
+ std::array<int64_t, 1 << 4> local_hist = {};
4909
5455
  size_t local_size = 0;
4910
5456
  while (true) {
4911
5457
  std::unique_lock<std::mutex> lock(mutex);
4912
5458
  size_t first = counter; counter += chunk_size;
4913
5459
  if (first >= nelements) {
4914
- if (!local_hist.empty()) {
5460
+ if (local_size > 0) {
4915
5461
  for (int j=0; j<int(local_hist.size()); ++j) {
4916
5462
  hist_cur[j] += local_hist[j];
4917
5463
  }
@@ -4921,22 +5467,15 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
4921
5467
  }
4922
5468
  lock.unlock();
4923
5469
  size_t last = std::min(nelements, first + chunk_size);
4924
- if (local_hist.empty()) {
4925
- local_hist.resize(hist_cur.size(), 0);
4926
- }
4927
5470
  local_size += ggml_quantize_chunk(new_type, f32_data, new_data, first, last - first, local_hist.data());
4928
5471
  }
4929
5472
  };
4930
- if ((int) workers.size() < nthread_use - 1) {
4931
- workers.resize(nthread_use - 1);
4932
- }
4933
5473
  for (int it = 0; it < nthread_use - 1; ++it) {
4934
- workers[it] = std::thread(compute);
5474
+ workers.emplace_back(compute);
4935
5475
  }
4936
5476
  compute();
4937
- for (int it = 0; it < nthread_use - 1; ++it) {
4938
- workers[it].join();
4939
- }
5477
+ for (auto & w : workers) { w.join(); }
5478
+ workers.clear();
4940
5479
  }
4941
5480
 
4942
5481
  LLAMA_LOG_INFO("size = %8.2f MB -> %8.2f MB | hist: ", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
@@ -5279,7 +5818,7 @@ struct llama_context_params llama_context_default_params() {
5279
5818
  /*.seed =*/ LLAMA_DEFAULT_SEED,
5280
5819
  /*.n_ctx =*/ 512,
5281
5820
  /*.n_batch =*/ 512,
5282
- /*.gpu_layers =*/ 0,
5821
+ /*.n_gpu_layers =*/ 0,
5283
5822
  /*.main_gpu =*/ 0,
5284
5823
  /*.tensor_split =*/ nullptr,
5285
5824
  /*.rope_freq_base =*/ 10000.0f,
@@ -5296,6 +5835,10 @@ struct llama_context_params llama_context_default_params() {
5296
5835
  /*.embedding =*/ false,
5297
5836
  };
5298
5837
 
5838
+ #ifdef GGML_USE_METAL
5839
+ result.n_gpu_layers = 1;
5840
+ #endif
5841
+
5299
5842
  return result;
5300
5843
  }
5301
5844
 
@@ -5305,6 +5848,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
5305
5848
  /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
5306
5849
  /*.allow_requantize =*/ false,
5307
5850
  /*.quantize_output_tensor =*/ true,
5851
+ /*.only_copy =*/ false,
5308
5852
  };
5309
5853
 
5310
5854
  return result;
@@ -5487,43 +6031,43 @@ struct llama_context * llama_new_context_with_model(
5487
6031
  }
5488
6032
  #endif
5489
6033
  }
5490
- }
5491
6034
 
5492
6035
  #ifdef GGML_USE_METAL
5493
- if (params.n_gpu_layers > 0) {
5494
- // this allocates all Metal resources and memory buffers
6036
+ if (params.n_gpu_layers > 0) {
6037
+ // this allocates all Metal resources and memory buffers
5495
6038
 
5496
- void * data_ptr = NULL;
5497
- size_t data_size = 0;
6039
+ void * data_ptr = NULL;
6040
+ size_t data_size = 0;
5498
6041
 
5499
- if (params.use_mmap) {
5500
- data_ptr = ctx->model.mapping->addr;
5501
- data_size = ctx->model.mapping->size;
5502
- } else {
5503
- data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
5504
- data_size = ggml_get_mem_size (ctx->model.ctx);
5505
- }
6042
+ if (params.use_mmap) {
6043
+ data_ptr = ctx->model.mapping->addr;
6044
+ data_size = ctx->model.mapping->size;
6045
+ } else {
6046
+ data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
6047
+ data_size = ggml_get_mem_size (ctx->model.ctx);
6048
+ }
5506
6049
 
5507
- const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
6050
+ const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
5508
6051
 
5509
- LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
6052
+ LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
5510
6053
 
5511
6054
  #define LLAMA_METAL_CHECK_BUF(result) \
5512
- if (!(result)) { \
5513
- LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
5514
- llama_free(ctx); \
5515
- return NULL; \
5516
- }
6055
+ if (!(result)) { \
6056
+ LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
6057
+ llama_free(ctx); \
6058
+ return NULL; \
6059
+ }
5517
6060
 
5518
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
6061
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
5519
6062
 
5520
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
5521
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
6063
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
6064
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
5522
6065
 
5523
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
6066
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
5524
6067
  #undef LLAMA_METAL_CHECK_BUF
5525
- }
6068
+ }
5526
6069
  #endif
6070
+ }
5527
6071
 
5528
6072
  #ifdef GGML_USE_MPI
5529
6073
  ctx->ctx_mpi = ggml_mpi_init();
@@ -5559,15 +6103,19 @@ void llama_free(struct llama_context * ctx) {
5559
6103
  }
5560
6104
 
5561
6105
  int llama_n_vocab(const struct llama_context * ctx) {
5562
- return ctx->model.vocab.id_to_token.size();
6106
+ return llama_model_n_vocab(&ctx->model);
5563
6107
  }
5564
6108
 
5565
6109
  int llama_n_ctx(const struct llama_context * ctx) {
5566
- return ctx->model.hparams.n_ctx;
6110
+ return llama_model_n_ctx(&ctx->model);
6111
+ }
6112
+
6113
+ int llama_n_ctx_train(const struct llama_context * ctx) {
6114
+ return llama_model_n_ctx_train(&ctx->model);
5567
6115
  }
5568
6116
 
5569
6117
  int llama_n_embd(const struct llama_context * ctx) {
5570
- return ctx->model.hparams.n_embd;
6118
+ return llama_model_n_embd(&ctx->model);
5571
6119
  }
5572
6120
 
5573
6121
  enum llama_vocab_type llama_vocab_type(const struct llama_context * ctx) {
@@ -5582,6 +6130,10 @@ int llama_model_n_ctx(const struct llama_model * model) {
5582
6130
  return model->hparams.n_ctx;
5583
6131
  }
5584
6132
 
6133
+ int llama_model_n_ctx_train(const struct llama_model * model) {
6134
+ return model->hparams.n_ctx_train;
6135
+ }
6136
+
5585
6137
  int llama_model_n_embd(const struct llama_model * model) {
5586
6138
  return model->hparams.n_embd;
5587
6139
  }
@@ -5857,7 +6409,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
5857
6409
  rng_ss.str(std::string(&rng_buf[0], rng_size));
5858
6410
  rng_ss >> ctx->rng;
5859
6411
 
5860
- GGML_ASSERT(rng_ss.fail() == false);
6412
+ GGML_ASSERT(!rng_ss.fail());
5861
6413
  }
5862
6414
 
5863
6415
  // set logits
@@ -6136,7 +6688,7 @@ int llama_tokenize_with_model(
6136
6688
  auto res = llama_tokenize_internal(model->vocab, text, add_bos);
6137
6689
 
6138
6690
  if (n_max_tokens < (int) res.size()) {
6139
- LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
6691
+ // LLAMA_LOG_ERROR("%s: too many tokens\n", __func__);
6140
6692
  return -((int) res.size());
6141
6693
  }
6142
6694
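The final hunk silences the "too many tokens" log: the function already communicates the problem by returning the negative of the required token count, so callers can size the buffer and retry. A sketch of that two-pass calling convention; tokenize() below is a stand-in with the same return convention, not the real llama_tokenize_with_model signature:

#include <cstdio>
#include <cstring>
#include <vector>

using llama_token = int;

// stand-in: "tokenizes" one token per character, honoring the same return convention
static int tokenize(const char * text, llama_token * tokens, int n_max_tokens) {
    const int needed = (int) strlen(text);
    if (n_max_tokens < needed) {
        return -needed;                       // buffer too small: report the required size
    }
    for (int i = 0; i < needed; ++i) {
        tokens[i] = (llama_token) text[i];
    }
    return needed;
}

int main() {
    const char * text = "hello world";

    std::vector<llama_token> tokens(4);       // deliberately too small
    int n = tokenize(text, tokens.data(), (int) tokens.size());
    if (n < 0) {
        tokens.resize(-n);                    // grow to the reported size and retry
        n = tokenize(text, tokens.data(), (int) tokens.size());
    }

    printf("got %d tokens\n", n);
    return 0;
}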