llama_cpp 0.5.0 → 0.5.1

@@ -13,6 +13,26 @@
 //
 #include <arm_neon.h>
 
+#if !defined(__aarch64__)
+inline static int32_t vaddvq_s16(int16x8_t v) {
+    return
+        (int32_t)vgetq_lane_s16(v, 0) + (int32_t)vgetq_lane_s16(v, 1) +
+        (int32_t)vgetq_lane_s16(v, 2) + (int32_t)vgetq_lane_s16(v, 3) +
+        (int32_t)vgetq_lane_s16(v, 4) + (int32_t)vgetq_lane_s16(v, 5) +
+        (int32_t)vgetq_lane_s16(v, 6) + (int32_t)vgetq_lane_s16(v, 7);
+}
+
+inline static int16x8_t vpaddq_s16(int16x8_t a, int16x8_t b) {
+    int16x4_t a0 = vpadd_s16(vget_low_s16(a), vget_high_s16(a));
+    int16x4_t b0 = vpadd_s16(vget_low_s16(b), vget_high_s16(b));
+    return vcombine_s16(a0, b0);
+}
+
+inline static int32_t vaddvq_s32(int32x4_t v) {
+    return vgetq_lane_s32(v, 0) + vgetq_lane_s32(v, 1) + vgetq_lane_s32(v, 2) + vgetq_lane_s32(v, 3);
+}
+#endif
+
 #else
 
 #ifdef __wasm_simd128__
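These first hunks patch the bundled k_quants.c. vaddvq_s16, vpaddq_s16, and vaddvq_s32 are AArch64-only horizontal/pairwise-add intrinsics, so the new block backfills them for 32-bit ARM NEON builds. As a rough scalar model of what the vaddvq_s16 shim computes (illustration only, not part of the diff; the helper name is made up):

    #include <cstdint>

    // Horizontal add of 8 signed 16-bit lanes, widening to 32 bits before
    // accumulating -- the same result the fallback above produces lane by lane.
    static int32_t horizontal_add_s16(const int16_t lanes[8]) {
        int32_t sum = 0;
        for (int i = 0; i < 8; ++i) {
            sum += lanes[i];
        }
        return sum;
    }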
@@ -63,7 +83,7 @@ static float make_qx_quants(int n, int nmax, const float * restrict x, int8_t *
         float ax = fabsf(x[i]);
         if (ax > amax) { amax = ax; max = x[i]; }
     }
-    if (!amax) { // all zero
+    if (amax < 1e-30f) { // all zero
         for (int i = 0; i < n; ++i) {
             L[i] = 0;
         }
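The all-zero early-out in make_qx_quants now triggers on effectively-zero maxima as well: comparing against a tiny epsilon instead of exact zero keeps blocks of denormal values from feeding a near-zero divisor into the scale computation. A standalone illustration (not the library code):

    #include <cstdio>

    int main() {
        const float amax = 1e-38f;                                // tiny but non-zero block maximum
        printf("old zero check fires: %d\n", amax == 0.0f);       // 0 -> old code kept going
        printf("new zero check fires: %d\n", amax < 1e-30f);      // 1 -> new code emits an all-zero block
        printf("scale if we divided anyway: %g\n", -1.0f / amax); // ~ -1e38, at the edge of float range
        return 0;
    }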
@@ -183,13 +203,9 @@ static float make_qkx1_quants(int n, int nmax, const float * restrict x, uint8_t
                               int ntry, float alpha) {
     float min = x[0];
     float max = x[0];
-    float sum_x = 0;
-    float sum_x2 = 0;
     for (int i = 1; i < n; ++i) {
         if (x[i] < min) min = x[i];
         if (x[i] > max) max = x[i];
-        sum_x += x[i];
-        sum_x2 += x[i]*x[i];
     }
     if (max == min) {
         for (int i = 0; i < n; ++i) L[i] = 0;
@@ -1070,6 +1086,13 @@ void quantize_row_q6_K_reference(const float * restrict x, block_q6_K * restrict
 
         }
 
+        if (!max_abs_scale) {
+            memset(&y[i], 0, sizeof(block_q6_K));
+            y[i].d = ggml_fp32_to_fp16(0.f);
+            x += QK_K;
+            continue;
+        }
+
         float iscale = -128.f/max_scale;
         y[i].d = ggml_fp32_to_fp16(1/iscale);
         for (int ib = 0; ib < QK_K/16; ++ib) {
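The new early-out in quantize_row_q6_K_reference matters because the very next line divides by max_scale; an all-zero input block would otherwise produce an infinite iscale and garbage quantized values. Minimal standalone illustration of the failure mode the guard avoids (assumes IEEE-754 floats):

    #include <cstdio>

    int main() {
        const float max_scale = 0.0f;            // what an all-zero block produces
        const float iscale = -128.f / max_scale; // -inf under IEEE-754
        printf("iscale = %f\n", iscale);         // prints "-inf"
        return 0;
    }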
@@ -1306,7 +1329,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
     const uint8x16_t m4 = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x2_t q2bytes;
     uint8_t aux[16];
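This hunk, and the similar ones below in the q2/q5/q6 kernels, wrap the zero accumulator in the same feature test that guards its only consumer, vdotq_s32, so builds without the ARM dot-product extension no longer declare an unused variable. Schematic of the pattern (illustration only; assumes an AArch64 NEON target):

    #include <arm_neon.h>
    #include <cstdint>

    // 16-element signed 8-bit dot product: SDOT when available, otherwise a
    // widening multiply plus pairwise-add fallback that never touches vzero.
    static int32_t dot_s8x16(const int8_t * a, const int8_t * b) {
        const int8x16_t va = vld1q_s8(a);
        const int8x16_t vb = vld1q_s8(b);
    #if defined(__ARM_FEATURE_DOTPROD)
        const int32x4_t vzero = vdupq_n_s32(0);      // only needed on this path
        return vaddvq_s32(vdotq_s32(vzero, va, vb));
    #else
        const int16x8_t p0 = vmull_s8(vget_low_s8(va),  vget_low_s8(vb));
        const int16x8_t p1 = vmull_s8(vget_high_s8(va), vget_high_s8(vb));
        return vaddvq_s32(vpaddlq_s16(p0)) + vaddvq_s32(vpaddlq_s16(p1));
    #endif
    }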
@@ -1612,7 +1637,9 @@ void ggml_vec_dot_q2_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m3 = vdupq_n_u8(0x3);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q2bytes;
 
@@ -2060,7 +2087,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
 
     __m256 acc = _mm256_setzero_ps();
 
-    uint32_t *aux;
+    const uint32_t *aux;
 
     for (int i = 0; i < nb; ++i) {
 
@@ -2070,7 +2097,7 @@ void ggml_vec_dot_q3_K_q8_K(const int n, float * restrict s, const void * restri
         const int8_t * restrict q8 = y[i].qs;
 
         // Set up scales
-        aux = (uint32_t *)x[i].scales;
+        aux = (const uint32_t *)x[i].scales;
         __m128i scales128 = _mm_set_epi32(
             ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4),
             ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4),
@@ -2596,8 +2623,6 @@ void ggml_vec_dot_q4_K_q8_K(const int n, float * restrict s, const void * restri
         const uint8_t * restrict q4 = x[i].qs;
         const int8_t  * restrict q8 = y[i].qs;
 
-        //int32x4_t isum = mzero;
-
         int32_t sumi1 = 0;
         int32_t sumi2 = 0;
 
@@ -3096,9 +3121,11 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mone = vdupq_n_u8(1);
     const uint8x16_t mtwo = vdupq_n_u8(2);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
 
@@ -3441,8 +3468,10 @@ void ggml_vec_dot_q5_K_q8_K(const int n, float * restrict s, const void * restri
 #ifdef __ARM_NEON
 
     const uint8x16_t m4b = vdupq_n_u8(0xf);
-    const int32x4_t mzero = vdupq_n_s32(0);
     const uint8x16_t mh = vdupq_n_u8(16);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t mzero = vdupq_n_s32(0);
+#endif
 
     int8x16x4_t q5bytes;
     uint8x16x4_t q5h;
@@ -3660,7 +3689,9 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
+#if defined(__ARM_FEATURE_DOTPROD)
     const int32x4_t vzero = vdupq_n_s32(0);
+#endif
     //const int8x16_t m32s = vdupq_n_s8(32);
 
     const uint8x16_t mone = vdupq_n_u8(3);
@@ -4049,8 +4080,10 @@ void ggml_vec_dot_q6_K_q8_K(const int n, float * restrict s, const void * restri
     float sum = 0;
 
     const uint8x16_t m4b = vdupq_n_u8(0xF);
-    const int32x4_t vzero = vdupq_n_s32(0);
     const int8x16_t m32s = vdupq_n_s8(32);
+#if defined(__ARM_FEATURE_DOTPROD)
+    const int32x4_t vzero = vdupq_n_s32(0);
+#endif
 
     const uint8x16_t mone = vdupq_n_u8(3);
 
@@ -126,6 +126,9 @@ void replace_all(std::string & s, const std::string & search, const std::string
     }
     s = std::move(result);
 }
+#ifdef GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
+#endif
 
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
@@ -325,6 +328,44 @@ static std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES =
             { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
         },
     },
+    {
+        LLM_ARCH_GPT2,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTJ,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_GPTNEOX,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+            { LLM_TENSOR_OUTPUT_NORM,     "output_norm" },
+            { LLM_TENSOR_OUTPUT,          "output" },
+            { LLM_TENSOR_ATTN_NORM,       "blk.%d.attn_norm" },
+            { LLM_TENSOR_ATTN_QKV,        "blk.%d.attn_qkv" },
+            { LLM_TENSOR_ATTN_OUT,        "blk.%d.attn_output" },
+            { LLM_TENSOR_FFN_NORM,        "blk.%d.ffn_norm" },
+            { LLM_TENSOR_FFN_DOWN,        "blk.%d.ffn_down" },
+            { LLM_TENSOR_FFN_UP,          "blk.%d.ffn_up" },
+        },
+    },
+    {
+        LLM_ARCH_MPT,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+        },
+    },
+    {
+        LLM_ARCH_UNKNOWN,
+        {
+            { LLM_TENSOR_TOKEN_EMBD,      "token_embd" },
+        },
+    },
 };
 
 static llm_arch llm_arch_from_string(const std::string & name) {
@@ -412,6 +453,9 @@ static void ggml_graph_compute_helper(std::vector<uint8_t> & buf, ggml_cgraph *
 #elif GGML_USE_METAL
 #  define llama_host_malloc(n)  ggml_metal_host_malloc(n)
 #  define llama_host_free(data) ggml_metal_host_free(data)
+#elif GGML_USE_CPU_HBM
+#  define llama_host_malloc(n)  hbw_malloc(n)
+#  define llama_host_free(data) if (data != NULL) hbw_free(data)
 #else
 #  define llama_host_malloc(n)  malloc(n)
 #  define llama_host_free(data) free(data)
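From here on the hunks are in the bundled llama.cpp. The GGML_USE_CPU_HBM build option routes host buffers through memkind's hbwmalloc API (<hbwmalloc.h>) so they land in high-bandwidth memory on systems that expose it. A hedged sketch of the same allocate/free pairing outside the macro chain above (illustration only):

    #include <cstdlib>
    #ifdef GGML_USE_CPU_HBM
    #include <hbwmalloc.h>
    #endif

    static void * host_alloc(size_t n) {
    #ifdef GGML_USE_CPU_HBM
        return hbw_malloc(n);   // allocate from high-bandwidth memory
    #else
        return malloc(n);
    #endif
    }

    static void host_free(void * p) {
        if (p == NULL) return;
    #ifdef GGML_USE_CPU_HBM
        hbw_free(p);            // must pair with hbw_malloc
    #else
        free(p);
    #endif
    }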
@@ -568,16 +612,16 @@ struct llama_mmap {
 
         if (prefetch > 0) {
             // Advise the kernel to preload the mapped memory
-            if (madvise(addr, std::min(file->size, prefetch), MADV_WILLNEED)) {
-                fprintf(stderr, "warning: madvise(.., MADV_WILLNEED) failed: %s\n",
+            if (posix_madvise(addr, std::min(file->size, prefetch), POSIX_MADV_WILLNEED)) {
+                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_WILLNEED) failed: %s\n",
                         strerror(errno));
             }
         }
         if (numa) {
             // advise the kernel not to use readahead
             // (because the next page might not belong on the same node)
-            if (madvise(addr, file->size, MADV_RANDOM)) {
-                fprintf(stderr, "warning: madvise(.., MADV_RANDOM) failed: %s\n",
+            if (posix_madvise(addr, file->size, POSIX_MADV_RANDOM)) {
+                fprintf(stderr, "warning: posix_madvise(.., POSIX_MADV_RANDOM) failed: %s\n",
                         strerror(errno));
             }
         }
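posix_madvise is the POSIX spelling of the same hint and is available on platforms that lack the BSD/Linux madvise call, which is why the mmap prefetch and NUMA advice switch over here. Minimal usage sketch, not the library code; note that posix_madvise returns the error number directly rather than setting errno:

    #include <sys/mman.h>
    #include <cstddef>
    #include <cstdio>
    #include <cstring>

    static void hint_willneed(void * addr, size_t len) {
        const int err = posix_madvise(addr, len, POSIX_MADV_WILLNEED);
        if (err != 0) {
            fprintf(stderr, "warning: posix_madvise failed: %s\n", strerror(err));
        }
    }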
@@ -614,7 +658,9 @@ struct llama_mmap {
 #if _WIN32_WINNT >= _WIN32_WINNT_WIN8
         if (prefetch) {
             // Advise the kernel to preload the mapped memory
+
             WIN32_MEMORY_RANGE_ENTRY range;
+
             range.VirtualAddress = addr;
             range.NumberOfBytes = (SIZE_T)size;
             if (!PrefetchVirtualMemory(GetCurrentProcess(), 1, &range, 0)) {
@@ -1446,7 +1492,11 @@ struct llama_model_loader {
             // allocate temp buffer if not using mmap
             if (!use_mmap && cur->data == NULL) {
                 GGML_ASSERT(cur->backend != GGML_BACKEND_CPU);
-                cur->data = malloc(ggml_nbytes(cur));
+                #ifdef GGML_USE_CPU_HBM
+                cur->data = (uint8_t*)hbw_malloc(ggml_nbytes(cur));
+                #else
+                cur->data = (uint8_t*)malloc(ggml_nbytes(cur));
+                #endif
             }
 
             load_data_for(cur);
@@ -1600,9 +1650,13 @@ static void llm_load_hparams(
 
         GGUF_GET_KEY(ctx, hparams.n_rot, gguf_get_val_u32, GGUF_TYPE_UINT32, false, kv(LLM_KV_ROPE_DIMENSION_COUNT));
 
-        if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
-            throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+        if (model.arch == LLM_ARCH_LLAMA || model.arch == LLM_ARCH_FALCON) {
+            if (hparams.n_rot != hparams.n_embd / hparams.n_head) {
+                throw std::runtime_error(format("invalid n_rot: %u, expected %u", hparams.n_rot, hparams.n_embd / hparams.n_head));
+            }
         }
+        // gpt-neox n_rot = rotary_pct * (n_embd / n_head)
+        // gpt-j n_rot = rotary_dim
     }
 
     // arch-specific KVs
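The n_rot sanity check is now limited to LLaMA and Falcon, where the rotary dimension must equal the full head size; GPT-NeoX-style models rotate only a fraction of each head and GPT-J stores an explicit rotary dimension. A hedged sketch of the GPT-NeoX relation mentioned in the comment (names are illustrative, not llama.cpp fields):

    #include <cstdint>

    // n_rot for a GPT-NeoX-style model: only rotary_pct of each head is rotated.
    static uint32_t neox_n_rot(uint32_t n_embd, uint32_t n_head, float rotary_pct) {
        return (uint32_t)(rotary_pct * (float)(n_embd / n_head));
    }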
@@ -2895,7 +2949,12 @@ static bool llama_eval_internal(
 
     // for big prompts, if BLAS is enabled, it is better to use only one thread
     // otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
-    n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+    // TODO: this is mostly important for Apple Silicon where CBLAS is still performing very well
+    //       we still need some threads to process all non-mul_mat ops, but not too much to avoid interfering
+    //       with the BLAS calls. need a better solution
+    if (N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas()) {
+        n_threads = std::min(4, n_threads);
+    }
 
     struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
     struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 2];
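Instead of collapsing to a single thread whenever BLAS takes over the large matrix multiplications, evaluation now keeps up to four threads so the remaining non-mul_mat ops still run in parallel without fighting the BLAS threads for cores. Condensed sketch of the decision (illustration only):

    #include <algorithm>

    static int pick_n_threads(int n_tokens, int n_threads, bool has_blas, bool has_gpublas) {
        if (n_tokens >= 32 && has_blas && !has_gpublas) {
            return std::min(4, n_threads);   // leave most cores to the BLAS calls
        }
        return n_threads;
    }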
@@ -3000,33 +3059,10 @@ static bool llama_is_control_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_CONTROL;
 }
 
-static bool llama_is_user_defined_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_USER_DEFINED;
-}
-
-static bool llama_is_unused_token(const llama_vocab & vocab, llama_token id) {
-    return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_UNUSED;
-}
-
 static bool llama_is_byte_token(const llama_vocab & vocab, llama_token id) {
     return vocab.id_to_token[id].type == LLAMA_TOKEN_TYPE_BYTE;
 }
 
-static bool llama_is_bos_token(const llama_vocab & vocab, llama_token id) {
-    GGML_ASSERT(llama_is_control_token(vocab, id));
-    return id == vocab.special_bos_id;
-}
-
-static bool llama_is_eos_token(const llama_vocab & vocab, llama_token id ) {
-    GGML_ASSERT(llama_is_control_token(vocab, id));
-    return id == vocab.special_eos_id;
-}
-
-static bool llama_is_pad_token(const llama_vocab & vocab, llama_token id ) {
-    GGML_ASSERT(id < 0 || llama_is_control_token(vocab, id));
-    return id == vocab.special_pad_id;
-}
-
 static uint8_t llama_token_to_byte(const llama_vocab & vocab, llama_token id) {
     GGML_ASSERT(llama_is_byte_token(vocab, id));
     const auto& token_data = vocab.id_to_token.at(id);
@@ -3319,9 +3355,15 @@ struct llm_tokenizer_bpe {
                     std::string byte_str(1, *j);
                     auto token_multibyte = vocab.token_to_id.find(byte_str);
                     if (token_multibyte == vocab.token_to_id.end()) {
-                        fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                        try {
+                            llama_token token_byte = llama_byte_to_token(vocab, *j);
+                            output.push_back(token_byte);
+                        } catch (const std::out_of_range & err) {
+                            fprintf(stderr,"ERROR: byte not found in vocab: '%s'\n", byte_str.c_str());
+                        }
+                    } else {
+                        output.push_back((*token_multibyte).second);
                     }
-                    output.push_back((*token_multibyte).second);
                 }
             } else {
                 output.push_back((*token).second);
@@ -3595,7 +3637,7 @@ static void llama_grammar_advance_stack(
         std::vector<std::vector<const llama_grammar_element *>> & new_stacks) {
 
     if (stack.empty()) {
-        new_stacks.push_back(stack);
+        new_stacks.emplace_back(stack);
         return;
     }
 
@@ -3632,7 +3674,7 @@ static void llama_grammar_advance_stack(
             }
         case LLAMA_GRETYPE_CHAR:
         case LLAMA_GRETYPE_CHAR_NOT:
-            new_stacks.push_back(stack);
+            new_stacks.emplace_back(stack);
             break;
         default:
             // end of alternate (LLAMA_GRETYPE_END, LLAMA_GRETYPE_ALT) or middle of char range
@@ -3797,6 +3839,25 @@ void llama_grammar_free(struct llama_grammar * grammar) {
     delete grammar;
 }
 
+struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar) {
+    llama_grammar * result = new llama_grammar{ grammar->rules, grammar->stacks, grammar->partial_utf8 };
+
+    // redirect elements in stacks to point to new rules
+    for (size_t is = 0; is < result->stacks.size(); is++) {
+        for (size_t ie = 0; ie < result->stacks[is].size(); ie++) {
+            for (size_t ir0 = 0; ir0 < grammar->rules.size(); ir0++) {
+                for (size_t ir1 = 0; ir1 < grammar->rules[ir0].size(); ir1++) {
+                    if (grammar->stacks[is][ie] == &grammar->rules[ir0][ir1]) {
+                        result->stacks[is][ie] = &result->rules[ir0][ir1];
+                    }
+                }
+            }
+        }
+    }
+
+    return result;
+}
+
 //
 // sampling
 //
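The nested loops in llama_grammar_copy exist because grammar stacks store raw pointers into the rules vectors; after copying the vectors, every stack element has to be re-based from the source grammar's storage to the copy's. A reduced sketch of the same pointer-rebasing idea on a toy structure (illustration only):

    #include <vector>
    #include <cstddef>

    struct mini_grammar {
        std::vector<std::vector<int>>        rules;
        std::vector<std::vector<const int*>> stacks; // pointers into `rules`
    };

    static mini_grammar copy_grammar(const mini_grammar & g) {
        mini_grammar out{ g.rules, g.stacks };
        for (auto & stack : out.stacks) {
            for (auto & elem : stack) {
                // find the rule element the old pointer referred to, point at the copy
                for (size_t r = 0; r < g.rules.size(); ++r) {
                    for (size_t e = 0; e < g.rules[r].size(); ++e) {
                        if (elem == &g.rules[r][e]) {
                            elem = &out.rules[r][e];
                        }
                    }
                }
            }
        }
        return out;
    }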
@@ -4388,7 +4449,7 @@ struct llama_logit_info {
         }
         return min_heap;
     }
-    float probability_from_logit(float logit) {
+    float probability_from_logit(float logit) const {
         return normalizer * std::exp(logit - max_l);
     }
 };
@@ -4678,6 +4739,10 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     llm_load_arch(*ml, model);
     llm_load_hparams(*ml, model, 0, 0, 0);
 
+    if (params->only_copy) {
+        ftype = model.ftype;
+    }
+
     const size_t align = GGUF_DEFAULT_ALIGNMENT;
     struct gguf_context * ctx_out = gguf_init_empty();
 
@@ -4719,9 +4784,11 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
     std::vector<std::thread> workers;
     std::mutex mutex;
 
+#ifdef GGML_USE_K_QUANTS
     auto use_more_bits = [] (int i_layer, int num_layers) -> bool {
         return i_layer < num_layers/8 || i_layer >= 7*num_layers/8 || (i_layer - num_layers/8)%3 == 2;
     };
+#endif
 
     int idx = 0;
 
@@ -4764,18 +4831,13 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         // quantize only 2D tensors
         quantize &= (tensor->n_dims == 2);
         quantize &= params->quantize_output_tensor || name != "output.weight";
-        quantize &= quantized_type != tensor->type;
+        quantize &= !params->only_copy;
 
         enum ggml_type new_type;
         void * new_data;
         size_t new_size;
 
-        if (!quantize) {
-            new_type = tensor->type;
-            new_data = tensor->data;
-            new_size = ggml_nbytes(tensor);
-            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
-        } else {
+        if (quantize) {
             new_type = quantized_type;
 #ifdef GGML_USE_K_QUANTS
             // TODO: avoid hardcoded tensor names - use the TN_* constants
@@ -4874,7 +4936,16 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
             }
         }
 #endif
-
+            // If we've decided to quantize to the same type the tensor is already
+            // in then there's nothing to do.
+            quantize = tensor->type != new_type;
+        }
+        if (!quantize) {
+            new_type = tensor->type;
+            new_data = tensor->data;
+            new_size = ggml_nbytes(tensor);
+            LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+        } else {
             const size_t nelements = ggml_nelements(tensor);
 
             float * f32_data;
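Net effect of the reshuffled quantization logic: only_copy forces a pass-through, and a tensor whose chosen target type already matches its current type is also passed through, so requantizing a model to the format it is already in degenerates to a copy. Condensed sketch of the decision flow (names mirror the surrounding code, but this is not the library function):

    static bool should_quantize(bool eligible_2d_tensor, bool only_copy,
                                int current_type, int target_type) {
        bool quantize = eligible_2d_tensor && !only_copy;
        if (quantize) {
            quantize = (current_type != target_type); // same type -> nothing to do
        }
        return quantize;
    }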
@@ -5279,7 +5350,7 @@ struct llama_context_params llama_context_default_params() {
         /*.seed                        =*/ LLAMA_DEFAULT_SEED,
         /*.n_ctx                       =*/ 512,
         /*.n_batch                     =*/ 512,
-        /*.gpu_layers                  =*/ 0,
+        /*.n_gpu_layers                =*/ 0,
         /*.main_gpu                    =*/ 0,
         /*.tensor_split                =*/ nullptr,
         /*.rope_freq_base              =*/ 10000.0f,
@@ -5296,6 +5367,10 @@ struct llama_context_params llama_context_default_params() {
         /*.embedding                   =*/ false,
     };
 
+#ifdef GGML_USE_METAL
+    result.n_gpu_layers = 1;
+#endif
+
     return result;
 }
 
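Metal builds now default to GPU offload (any non-zero n_gpu_layers enables the Metal path). Callers that want the previous CPU-only behaviour can still override the default explicitly; hedged usage sketch against the llama.h bundled with this gem:

    #include "llama.h"

    struct llama_context_params make_cpu_only_params(void) {
        struct llama_context_params params = llama_context_default_params();
        params.n_gpu_layers = 0;   // opt out of the new Metal default
        return params;
    }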
@@ -5305,6 +5380,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
         /*.ftype                       =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
         /*.allow_requantize            =*/ false,
         /*.quantize_output_tensor      =*/ true,
+        /*.only_copy                   =*/ false,
     };
 
     return result;
@@ -5487,43 +5563,43 @@ struct llama_context * llama_new_context_with_model(
             }
 #endif
         }
-    }
 
 #ifdef GGML_USE_METAL
-    if (params.n_gpu_layers > 0) {
-        // this allocates all Metal resources and memory buffers
+        if (params.n_gpu_layers > 0) {
+            // this allocates all Metal resources and memory buffers
 
-        void * data_ptr  = NULL;
-        size_t data_size = 0;
+            void * data_ptr  = NULL;
+            size_t data_size = 0;
 
-        if (params.use_mmap) {
-            data_ptr  = ctx->model.mapping->addr;
-            data_size = ctx->model.mapping->size;
-        } else {
-            data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
-            data_size = ggml_get_mem_size  (ctx->model.ctx);
-        }
+            if (params.use_mmap) {
+                data_ptr  = ctx->model.mapping->addr;
+                data_size = ctx->model.mapping->size;
+            } else {
+                data_ptr  = ggml_get_mem_buffer(ctx->model.ctx);
+                data_size = ggml_get_mem_size  (ctx->model.ctx);
+            }
 
-        const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
+            const size_t max_size = ggml_get_max_tensor_size(ctx->model.ctx);
 
-        LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
+            LLAMA_LOG_INFO("%s: max tensor size = %8.2f MB\n", __func__, max_size/1024.0/1024.0);
 
 #define LLAMA_METAL_CHECK_BUF(result)                            \
-    if (!(result)) {                                             \
-        LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
-        llama_free(ctx);                                         \
-        return NULL;                                             \
-    }
+            if (!(result)) {                                             \
+                LLAMA_LOG_ERROR("%s: failed to add buffer\n", __func__); \
+                llama_free(ctx);                                         \
+                return NULL;                                             \
+            }
 
-    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "data", data_ptr, data_size, max_size));
 
-    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
-    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.data, ctx->buf_compute.size, 0));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv",   ctx->kv_self.buf.data, ctx->kv_self.buf.size, 0));
 
-    LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
+            LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.data, ctx->buf_alloc.size, 0));
 #undef LLAMA_METAL_CHECK_BUF
-    }
+        }
 #endif
+    }
 
 #ifdef GGML_USE_MPI
     ctx->ctx_mpi = ggml_mpi_init();
@@ -5857,7 +5933,7 @@ size_t llama_set_state_data(struct llama_context * ctx, uint8_t * src) {
         rng_ss.str(std::string(&rng_buf[0], rng_size));
         rng_ss >> ctx->rng;
 
-        GGML_ASSERT(rng_ss.fail() == false);
+        GGML_ASSERT(!rng_ss.fail());
     }
 
     // set logits
@@ -164,6 +164,7 @@ extern "C" {
         enum llama_ftype ftype;      // quantize to this llama_ftype
         bool allow_requantize;       // allow quantizing non-f32/f16 tensors
         bool quantize_output_tensor; // quantize output.weight
+        bool only_copy;              // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
     } llama_model_quantize_params;
 
     // grammar types
@@ -409,6 +410,8 @@ extern "C" {
 
     LLAMA_API void llama_grammar_free(struct llama_grammar * grammar);
 
+    LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar);
+
     //
     // Sampling functions
     //
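Usage sketch for the new llama_grammar_copy declaration: fork an independent grammar state so one sampling branch cannot disturb another, then release it with llama_grammar_free (illustration only):

    #include "llama.h"

    void sample_on_a_branch(const struct llama_grammar * g) {
        struct llama_grammar * branch = llama_grammar_copy(g);
        // ... advance `branch` while sampling a speculative/parallel continuation ...
        llama_grammar_free(branch);
    }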
@@ -3,8 +3,8 @@
 # llama_cpp.rb provides Ruby bindings for the llama.cpp.
 module LLaMACpp
   # The version of llama_cpp.rb you install.
-  VERSION = '0.5.0'
+  VERSION = '0.5.1'
 
   # The version of llama.cpp bundled with llama_cpp.rb.
-  LLAMA_CPP_VERSION = 'b1140'
+  LLAMA_CPP_VERSION = 'b1198'
 end
metadata CHANGED
@@ -1,14 +1,14 @@
 --- !ruby/object:Gem::Specification
 name: llama_cpp
 version: !ruby/object:Gem::Version
-  version: 0.5.0
+  version: 0.5.1
 platform: ruby
 authors:
 - yoshoku
 autorequire:
 bindir: exe
 cert_chain: []
-date: 2023-09-02 00:00:00.000000000 Z
+date: 2023-09-08 00:00:00.000000000 Z
 dependencies: []
 description: llama_cpp.rb provides Ruby bindings for the llama.cpp.
 email: