cui-llama.rn 1.0.7 → 1.0.9

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/ggml-quants.h CHANGED
@@ -127,6 +127,10 @@ void iq2xs_free_impl(enum lm_ggml_type type);
127
127
  void iq3xs_init_impl(int grid_size);
128
128
  void iq3xs_free_impl(int grid_size);
129
129
 
130
+ #if defined(__ARM_FEATURE_SVE)
131
+ extern int lm_ggml_sve_cnt_b;
132
+ #endif
133
+
130
134
  #ifdef __cplusplus
131
135
  }
132
136
  #endif
package/cpp/ggml.c CHANGED
@@ -37,6 +37,9 @@
37
37
  #include <unistd.h>
38
38
  #endif
39
39
 
40
+ #if defined(__ARM_FEATURE_SVE)
41
+ int lm_ggml_sve_cnt_b = 0;
42
+ #endif
40
43
  #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
41
44
  #undef LM_GGML_USE_LLAMAFILE
42
45
  #endif
@@ -53,6 +56,9 @@
53
56
  // disable POSIX deprecation warnings
54
57
  // these functions are never going away, anyway
55
58
  #pragma warning(disable: 4996)
59
+
60
+ // unreachable code because of multiple instances of code after LM_GGML_ABORT
61
+ #pragma warning(disable: 4702)
56
62
  #endif
57
63
 
58
64
  #if defined(_WIN32)
@@ -185,7 +191,7 @@ static void lm_ggml_print_backtrace_symbols(void) {
185
191
  fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
186
192
  }
187
193
  }
188
- #elif defined(__linux__)
194
+ #elif defined(__linux__) && defined(__GLIBC__)
189
195
  #include <execinfo.h>
190
196
  static void lm_ggml_print_backtrace_symbols(void) {
191
197
  // void * trace[100];
@@ -480,9 +486,16 @@ void lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t * x, float * y, int64_t n) {
480
486
  }
481
487
  }
482
488
 
489
+ void lm_ggml_fp32_to_bf16_row_ref(const float * x, lm_ggml_bf16_t * y, int64_t n) {
490
+ for (int i = 0; i < n; i++) {
491
+ y[i] = lm_ggml_compute_fp32_to_bf16(x[i]);
492
+ }
493
+ }
494
+
483
495
  void lm_ggml_fp32_to_bf16_row(const float * x, lm_ggml_bf16_t * y, int64_t n) {
484
496
  int i = 0;
485
497
  #if defined(__AVX512BF16__)
498
+ // subnormals are flushed to zero on this platform
486
499
  for (; i + 32 <= n; i += 32) {
487
500
  _mm512_storeu_si512(
488
501
  (__m512i *)(y + i),
@@ -962,7 +975,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = {
962
975
  .is_quantized = false,
963
976
  .to_float = (lm_ggml_to_float_t) lm_ggml_bf16_to_fp32_row,
964
977
  .from_float = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row,
965
- .from_float_ref = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row,
978
+ .from_float_ref = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row_ref,
966
979
  .vec_dot = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_bf16,
967
980
  .vec_dot_type = LM_GGML_TYPE_BF16,
968
981
  .nrows = 1,
@@ -2302,7 +2315,7 @@ inline static void lm_ggml_vec_abs_f32 (const int n, float * y, const float * x
2302
2315
  inline static void lm_ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
2303
2316
  inline static void lm_ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
2304
2317
  inline static void lm_ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
2305
- inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
2318
+ inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
2306
2319
  inline static void lm_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
2307
2320
  inline static void lm_ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
2308
2321
  inline static void lm_ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
@@ -3551,6 +3564,12 @@ struct lm_ggml_context * lm_ggml_init(struct lm_ggml_init_params params) {
3551
3564
 
3552
3565
  LM_GGML_ASSERT_ALIGNED(ctx->mem_buffer);
3553
3566
 
3567
+ #if defined(__ARM_FEATURE_SVE)
3568
+ if (!lm_ggml_sve_cnt_b) {
3569
+ lm_ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
3570
+ }
3571
+ #endif
3572
+
3554
3573
  LM_GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
3555
3574
 
3556
3575
  lm_ggml_critical_section_end();
@@ -3705,7 +3724,8 @@ static struct lm_ggml_tensor * lm_ggml_new_tensor_impl(
3705
3724
  struct lm_ggml_tensor * view_src,
3706
3725
  size_t view_offs) {
3707
3726
 
3708
- assert(n_dims >= 1 && n_dims <= LM_GGML_MAX_DIMS);
3727
+ LM_GGML_ASSERT(type >= 0 && type < LM_GGML_TYPE_COUNT);
3728
+ LM_GGML_ASSERT(n_dims >= 1 && n_dims <= LM_GGML_MAX_DIMS);
3709
3729
 
3710
3730
  // find the base tensor and absolute offset
3711
3731
  if (view_src != NULL && view_src->view_src != NULL) {
@@ -5358,6 +5378,7 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
5358
5378
  struct lm_ggml_context * ctx,
5359
5379
  struct lm_ggml_tensor * a,
5360
5380
  int n_groups,
5381
+ float eps,
5361
5382
  bool inplace) {
5362
5383
 
5363
5384
  bool is_node = false;
@@ -5368,7 +5389,8 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
5368
5389
 
5369
5390
  struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a);
5370
5391
 
5371
- result->op_params[0] = n_groups;
5392
+ lm_ggml_set_op_params_i32(result, 0, n_groups);
5393
+ lm_ggml_set_op_params_f32(result, 1, eps);
5372
5394
 
5373
5395
  result->op = LM_GGML_OP_GROUP_NORM;
5374
5396
  result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL;
@@ -5380,15 +5402,17 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
5380
5402
  struct lm_ggml_tensor * lm_ggml_group_norm(
5381
5403
  struct lm_ggml_context * ctx,
5382
5404
  struct lm_ggml_tensor * a,
5383
- int n_groups) {
5384
- return lm_ggml_group_norm_impl(ctx, a, n_groups, false);
5405
+ int n_groups,
5406
+ float eps) {
5407
+ return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, false);
5385
5408
  }
5386
5409
 
5387
5410
  struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
5388
5411
  struct lm_ggml_context * ctx,
5389
5412
  struct lm_ggml_tensor * a,
5390
- int n_groups) {
5391
- return lm_ggml_group_norm_impl(ctx, a, n_groups, true);
5413
+ int n_groups,
5414
+ float eps) {
5415
+ return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, true);
5392
5416
  }
5393
5417
 
5394
5418
  // lm_ggml_mul_mat
@@ -12079,10 +12103,11 @@ static void lm_ggml_compute_forward_group_norm_f32(
12079
12103
 
12080
12104
  LM_GGML_TENSOR_UNARY_OP_LOCALS
12081
12105
 
12082
- const float eps = 1e-6f; // TODO: make this a parameter
12083
-
12084
12106
  // TODO: optimize
12085
12107
 
12108
+ float eps;
12109
+ memcpy(&eps, dst->op_params + 1, sizeof(float));
12110
+
12086
12111
  int n_channels = src0->ne[2];
12087
12112
  int n_groups = dst->op_params[0];
12088
12113
  int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
@@ -20650,7 +20675,7 @@ size_t lm_ggml_quantize_chunk(
20650
20675
  case LM_GGML_TYPE_BF16:
20651
20676
  {
20652
20677
  size_t elemsize = sizeof(lm_ggml_bf16_t);
20653
- lm_ggml_fp32_to_bf16_row(src + start, (lm_ggml_bf16_t *)dst + start, n);
20678
+ lm_ggml_fp32_to_bf16_row_ref(src + start, (lm_ggml_bf16_t *)dst + start, n);
20654
20679
  result = n * elemsize;
20655
20680
  } break;
20656
20681
  case LM_GGML_TYPE_F32:
package/cpp/ggml.h CHANGED
@@ -349,6 +349,7 @@ extern "C" {
349
349
  LM_GGML_API lm_ggml_bf16_t lm_ggml_fp32_to_bf16(float);
350
350
  LM_GGML_API float lm_ggml_bf16_to_fp32(lm_ggml_bf16_t); // consider just doing << 16
351
351
  LM_GGML_API void lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t *, float *, int64_t);
352
+ LM_GGML_API void lm_ggml_fp32_to_bf16_row_ref(const float *, lm_ggml_bf16_t *, int64_t);
352
353
  LM_GGML_API void lm_ggml_fp32_to_bf16_row(const float *, lm_ggml_bf16_t *, int64_t);
353
354
 
354
355
  struct lm_ggml_object;
@@ -1139,16 +1140,17 @@ extern "C" {
1139
1140
 
1140
1141
  // group normalize along ne0*ne1*n_groups
1141
1142
  // used in stable-diffusion
1142
- // TODO: eps is hardcoded to 1e-6 for now
1143
1143
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_group_norm(
1144
1144
  struct lm_ggml_context * ctx,
1145
1145
  struct lm_ggml_tensor * a,
1146
- int n_groups);
1146
+ int n_groups,
1147
+ float eps);
1147
1148
 
1148
1149
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
1149
1150
  struct lm_ggml_context * ctx,
1150
1151
  struct lm_ggml_tensor * a,
1151
- int n_groups);
1152
+ int n_groups,
1153
+ float eps);
1152
1154
 
1153
1155
  // a - x
1154
1156
  // b - dy
@@ -1455,7 +1457,6 @@ extern "C" {
1455
1457
  // if mode & 2 == 1, GPT-NeoX style
1456
1458
  //
1457
1459
  // b is an int32 vector with size a->ne[2], it contains the positions
1458
- // c is freq factors (e.g. phi3-128k), (optional)
1459
1460
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope(
1460
1461
  struct lm_ggml_context * ctx,
1461
1462
  struct lm_ggml_tensor * a,
@@ -1472,6 +1473,7 @@ extern "C" {
1472
1473
  int mode);
1473
1474
 
1474
1475
  // custom RoPE
1476
+ // c is freq factors (e.g. phi3-128k), (optional)
1475
1477
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_ext(
1476
1478
  struct lm_ggml_context * ctx,
1477
1479
  struct lm_ggml_tensor * a,
package/cpp/llama-impl.h CHANGED
@@ -24,3 +24,18 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void
24
24
  #define LLAMA_LOG_INFO(...) llama_log_internal(LM_GGML_LOG_LEVEL_INFO , __VA_ARGS__)
25
25
  #define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__)
26
26
  #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
27
+
28
+ //
29
+ // helpers
30
+ //
31
+
32
+ static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
33
+ if (search.empty()) {
34
+ return; // Avoid infinite loop if 'search' is an empty string
35
+ }
36
+ size_t pos = 0;
37
+ while ((pos = s.find(search, pos)) != std::string::npos) {
38
+ s.replace(pos, search.length(), replace);
39
+ pos += replace.length();
40
+ }
41
+ }
package/cpp/llama-vocab.cpp CHANGED
@@ -16,20 +16,6 @@
16
16
  // helpers
17
17
  //
18
18
 
19
- static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
20
- std::string result;
21
- for (size_t pos = 0; ; pos += search.length()) {
22
- auto new_pos = s.find(search, pos);
23
- if (new_pos == std::string::npos) {
24
- result += s.substr(pos, s.size() - pos);
25
- break;
26
- }
27
- result += s.substr(pos, new_pos - pos) + replace;
28
- pos = new_pos;
29
- }
30
- s = std::move(result);
31
- }
32
-
33
19
  LLAMA_ATTRIBUTE_FORMAT(1, 2)
34
20
  static std::string format(const char * fmt, ...) {
35
21
  va_list ap;
@@ -816,6 +802,9 @@ struct llm_tokenizer_ugm {
816
802
  * the best tokenization.
817
803
  */
818
804
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
805
+ // get current size of output (for reversal later)
806
+ size_t output_size = output.size();
807
+
819
808
  // normalize the input first
820
809
  std::string normalized;
821
810
  normalize(text, &normalized);
@@ -895,7 +884,7 @@ struct llm_tokenizer_ugm {
895
884
  }
896
885
 
897
886
  // reverse the output since we added tokens starting from the end of the input
898
- std::reverse(output.begin(), output.end());
887
+ std::reverse(output.begin() + output_size, output.end());
899
888
  }
900
889
 
901
890
  private:
@@ -1444,7 +1433,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
1444
1433
  bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
1445
1434
  return token != -1 && (
1446
1435
  token == llama_token_eos_impl(vocab) ||
1447
- token == llama_token_eot_impl(vocab)
1436
+ token == llama_token_eot_impl(vocab) ||
1437
+ token == llama_token_eom_impl(vocab)
1448
1438
  );
1449
1439
  }
1450
1440
 
@@ -1500,6 +1490,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
1500
1490
  return vocab.special_eot_id;
1501
1491
  }
1502
1492
 
1493
+ llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
1494
+ return vocab.special_eom_id;
1495
+ }
1496
+
1503
1497
  int32_t llama_tokenize_impl(
1504
1498
  const struct llama_vocab & vocab,
1505
1499
  const char * text,
package/cpp/llama-vocab.h CHANGED
@@ -45,6 +45,7 @@ struct llama_vocab {
45
45
  id special_suffix_id = -1;
46
46
  id special_middle_id = -1;
47
47
  id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
48
+ id special_eom_id = -1;
48
49
 
49
50
  // tokenizer flags
50
51
  bool tokenizer_add_space_prefix = false;
@@ -101,6 +102,7 @@ llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
101
102
  llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
102
103
  llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
103
104
  llama_token llama_token_eot_impl (const struct llama_vocab & vocab);
105
+ llama_token llama_token_eom_impl (const struct llama_vocab & vocab);
104
106
 
105
107
  int32_t llama_tokenize_impl(
106
108
  const struct llama_vocab & vocab,