llama_cpp 0.5.0 → 0.5.1

This diff shows the changes between publicly released versions of the package as they appear in their respective public registries. It is provided for informational purposes only.
@@ -13,6 +13,26 @@
 //
 #include <arm_neon.h>
 
+#if !defined(__aarch64__)
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+    return vcombine_s16(a0, b0);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+#endif
+
 #else
 
 #ifdef __wasm_simd128__
@@ -63,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
         float ax = fabsf(x[i]);
         if (ax > amax) { amax = ax; max = x[i]; }
     }
-    if (!amax) { // all zero
+    if (amax < 1e-30f) { // all zero
         for (int i = 0; i < n; ++i) {
             L[i] = 0;
         }
@@ -183,13 +203,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
         int ntry, float alpha) {
     float min = x[0];
     float max = x[0];
-    float sum_x = 0;
-    float sum_x2 = 0;
     for (int i = 1; i < n; ++i) {
         if (x[i] < min) min = x[i];
         if (x[i] > max) max = x[i];
-        sum_x += x[i];
-        sum_x2 += x[i]*x[i];
     }
     if (max == min) {
         for (int i = 0; i < n; ++i) L[i] = 0;
@@ -1070,6 +1086,13 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
 
         }
 
+        if (!max_abs_scale) {
+            memset(&y[i], 0, sizeof(block_q6_K));
+            y[i].d = ggml_fp32_to_fp16(0.f);
+            x += QK_K;
+            continue;
+        }
+
         float iscale = -128.f/max_scale;
         y[i].d = ggml_fp32_to_fp16(1/iscale);
         for (int ib = 0; ib < QK_K/16; ++ib) {
@@ -1306,7 +1329,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
     const uint8x16_t m4 = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x2_t q2bytes;
     uint8_t aux[16];
@@ -1612,7 +1637,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q2bytes;
 
@@ -2060,7 +2087,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
     __m256 acc = _mm256_setzero_ps();
 
-    uint32_t *aux;
+    const uint32_t *aux;
 
     for (int i = 0; i < nb; ++i) {
 
@@ -2070,7 +2097,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const int8_t * restrict q8 = y[i].qs;
 
         // Set up scales
-        aux = (uint32_t *)x[i].scales;
+        aux = (const uint32_t *)x[i].scales;
         __m128i scales128 = _mm_set_epi32(
             ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
             ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
@@ -2596,8 +2623,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         const uint8_t * restrict q4 = x[i].qs;
         const int8_t  * restrict q8 = y[i].qs;
 
-        //int32x4_t isum = mzero;
-
         int32_t sumi1 = 0;
         int32_t sumi2 = 0;
 
@@ -3096,9 +3121,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mone = vdupq_n_u8(1);
     const uint8x16_t mtwo = vdupq_n_u8(2);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
 
@@ -3441,8 +3468,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mh = vdupq_n_u8(16);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
     uint8x16x4_t q5h;
@@ -3660,7 +3689,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
     //const int8x16_t m32s = vdupq_n_s8(32);
 
     const uint8x16_t mone = vdupq_n_u8(3);
@@ -4049,8 +4080,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
-    const int32x4_t vzero = vdupq_n_s32(0);
     const int8x16_t m32s = vdupq_n_s8(32);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     const uint8x16_t mone = vdupq_n_u8(3);
 
@@ -126,6 +126,9 @@ void replace_all(std::string & s, const std::string & search, const std::string
     }
     s = std::move(result);
 }
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
+#endif
 
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
@@ -325,6 +328,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GPT2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTJ,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTNEOX,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM, "output_norm" },
+            { LLM_TENSOR_OUTPUT, "output" },
+            { LLM_TENSOR_ATTN_NORM, "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV, "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT, "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM, "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN, "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP, "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_MPT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_UNKNOWN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
+        },
+    },
 };
 
 static llm_arch llm_arch_from_string(const std::string & name) {
@@ -412,6 +453,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 #elif GGML_USE_METAL
 # define llama_host_malloc(n) ggml_metal_host_malloc(n)
 # define llama_host_free(data) ggml_metal_host_free(data)
+#elif GGML_USE_CPU_HBM
+# define llama_host_malloc(n) hbw_malloc(n)
+# define llama_host_free(data) if (data != NULL) hbw_free(data)
 #else
 # define llama_host_malloc(n) malloc(n)
 # define llama_host_free(data) free(data)
@@ -568,16 +612,16 @@ struct llama_mmap {
 
         if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
-                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+            if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
+                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
         }
         if (numa) {
             // advise the kernel not to use readahead
             // (because the next page might not belong on the same node)
-            if (madvise(addr, file->size, MADV_RANDOM)) {
-                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+            if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
+                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
                         strerror(errno));
             }
         }
@@ -614,7 +658,9 @@ struct llama_mmap {
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch) {
             // Advise the kernel to preload the mapped memory
+
             WIN32_MEMORY_RANGE_ENTRY range;
+
             range.VirtualAddress = addr;
             range.NumberOfBytes = (SIZE_T)size;
             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
@@ -1446,7 +1492,11 @@ struct llama_model_loader {
             // allocate temp buffer if not using mmap
             if (!use_mmap && cur->data == NULL) {
                 GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
-                cur->data = malloc(ggml_nbytes(cur));
+#ifdef GGML_USE_CPU_HBM
+                cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
+#else
+                cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
+#endif
             }
 
             load_data_for(cur);
@@ -1600,9 +1650,13 @@ static void llm_load_hparams(
 
         GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
 
-        if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
-            throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+        if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+            if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
+                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+            }
         }
+        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
+        // gpt-j n_rot = rotary_dim
     }
 
     // arch-specific KVs
@@ -2895,7 +2949,12 @@ static bool llama_eval_internal(
 
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
+    //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
+    //       with the BLAS calls. need a better solution
+    if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+        n_threads = std::min(4, n_threads);
+    }
 
     struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
@@ -3000,33 +3059,10 @@ static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
 }
 
-static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
-}
-
-static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
-}
-
 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }
 
-static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(llama_is_control_token(vocab, id));
-    return id == vocab.special_bos_id;
-}
-
-static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
-    GGML_ASSERT(llama_is_control_token(vocab, id));
-    return id == vocab.special_eos_id;
-}
-
-static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
-    GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
-    return id == vocab.special_pad_id;
-}
-
 static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
@@ -3319,9 +3355,15 @@ struct llm_tokenizer_bpe {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
                     if (token_multibyte == vocab.token_to_id.end()) {
-                        fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                        try {
+                            llama_token token_byte = llama_byte_to_token(vocab, *j);
+                            output.push_back(token_byte);
+                        } catch (const std::out_of_range & err) {
+                            fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                        }
+                    } else {
+                        output.push_back((*token_multibyte).second);
                     }
-                    output.push_back((*token_multibyte).second);
                 }
             } else {
                 output.push_back((*token).second);
@@ -3595,7 +3637,7 @@ static void llama_grammar_advance_stack(
         std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
 
     if (stack.empty()) {
-        new_stacks.push_back(stack);
+        new_stacks.emplace_back(stack);
         return;
     }
 
@@ -3632,7 +3674,7 @@ static void llama_grammar_advance_stack(
         }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
-            new_stacks.push_back(stack);
+            new_stacks.emplace_back(stack);
             break;
         default:
             // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -3797,6 +3839,25 @@ void llama_grammar_free(struct llama_grammar * grammar) {
     delete grammar;
 }
 
+struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
+    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
+
+    // redirect elements in stacks to point to new rules
+    for (size_t is = 0; is < result->stacks.size(); is++) {
+        for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
+            for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
+                for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
+                    if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
+                        result->stacks[is][ie] = &result->rules[ir0][ir1];
+                    }
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
 //
 // sampling
 //
@@ -4388,7 +4449,7 @@ struct llama_logit_info {
         }
         return min_heap;
     }
-    float probability_from_logit(float logit) {
+    float probability_from_logit(float logit) const {
         return normalizer * std::exp(logit - max_l);
     }
 };
@@ -4678,6 +4739,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(*ml, model);
     llm_load_hparams(*ml, model, 0, 0, 0);
 
+    if (params->only_copy) {
+        ftype = model.ftype;
+    }
+
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
@@ -4719,9 +4784,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;
 
+#ifdef GGML_USE_K_QUANTS
     auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
     };
+#endif
 
     int idx = 0;
 
@@ -4764,18 +4831,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor->n_dims == 2);
         quantize &= params->quantize_output_tensor || name != "output.weight";
-        quantize &= quantized_type != tensor->type;
+        quantize &= !params->only_copy;
 
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
 
-        if (!quantize) {
-            new_type = tensor->type;
-            new_data = tensor->data;
-            new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
-        } else {
+        if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
             // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -4874,7 +4936,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
                 }
             }
 #endif
-
+            // If we've decided to quantize to the same type the tensor is already
+            // in then there's nothing to do.
+            quantize = tensor->type != new_type;
+        }
+        if (!quantize) {
+            new_type = tensor->type;
+            new_data = tensor->data;
+            new_size = ggml_nbytes(tensor);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+        } else {
             const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
@@ -5279,7 +5350,7 @@ struct llama_context_params llama_context_default_params() {
         /*.seed =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx =*/ 512,
         /*.n_batch =*/ 512,
-        /*.gpu_layers =*/ 0,
+        /*.n_gpu_layers =*/ 0,
         /*.main_gpu =*/ 0,
         /*.tensor_split =*/ nullptr,
         /*.rope_freq_base =*/ 10000.0f,
@@ -5296,6 +5367,10 @@ struct llama_context_params llama_context_default_params() {
         /*.embedding =*/ false,
     };
 
+#ifdef GGML_USE_METAL
+    result.n_gpu_layers = 1;
+#endif
+
     return result;
 }
 
@@ -5305,6 +5380,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.allow_requantize =*/ false,
         /*.quantize_output_tensor =*/ true,
+        /*.only_copy =*/ false,
     };
 
     return result;
@@ -5487,43 +5563,43 @@ struct llama_context * llama_new_context_with_model(
             }
 #endif
         }
-    }
 
 #ifdef GGML_USE_METAL
-    if (params.n_gpu_layers > 0) {
-        // this allocates all Metal resources and memory buffers
+        if (params.n_gpu_layers > 0) {
+            // this allocates all Metal resources and memory buffers
 
-        void * data_ptr = NULL;
-        size_t data_size = 0;
+            void * data_ptr = NULL;
+            size_t data_size = 0;
 
-        if (params.use_mmap) {
-            data_ptr = ctx->model.mapping->addr;
-            data_size = ctx->model.mapping->size;
-        } else {
-            data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
-            data_size = ggml_get_mem_size (ctx->model.ctx);
-        }
+            if (params.use_mmap) {
+                data_ptr = ctx->model.mapping->addr;
+                data_size = ctx->model.mapping->size;
+            } else {
+                data_ptr = ggml_get_mem_buffer(ctx->model.ctx);
+                data_size = ggml_get_mem_size (ctx->model.ctx);
+            }
 
-        const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+            const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
 
 #define LLAMA_METAL_CHECK_BUF(result) \
-        if (!(result)) { \
-            LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
-            llama_free(ctx); \
-            return NULL; \
-        }
+            if (!(result)) { \
+                LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
+                llama_free(ctx); \
+                return NULL; \
+            }
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
 
-        LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
-    }
+        }
 #endif
+    }
 
 #ifdef GGML_USE_MPI
     ctx->ctx_mpi = ggml_mpi_init();
@@ -5857,7 +5933,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         rng_ss.str(std::string(&rng_buf[0], rng_size));
         rng_ss >> ctx->rng;
 
-        GGML_ASSERT(rng_ss.fail() == false);
+        GGML_ASSERT(!rng_ss.fail());
     }
 
     // set logits
@@ -164,6 +164,7 @@ extern "C" {
         enum llama_ftype ftype; // quantize to this llama_ftype
         bool allow_requantize; // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
+        bool only_copy; // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
     } llama_model_quantize_params;
 
     // grammar types
@@ -409,6 +410,8 @@ extern "C" {
 
     LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
 
+    LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
+
     //
     // Sampling functions
     //
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.5.0'
+  VERSION = '0.5.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1140'
+  LLAMA_CPP_VERSION = 'b1198'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.5.0
+  version: 0.5.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-09-02 00:00:00.000000000 Z
+date: 2023-09-08 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: