cui-llama.rn 1.0.7 → 1.0.10

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/ggml-metal.m CHANGED
@@ -210,7 +210,7 @@ enum lm_ggml_metal_kernel_type {
  LM_GGML_METAL_KERNEL_TYPE_COUNT
  };

- struct lm_ggml_metal_context {
+ struct lm_ggml_backend_metal_context {
  int n_cb;

  id<MTLDevice> device;
@@ -224,6 +224,10 @@ struct lm_ggml_metal_context {
  bool support_simdgroup_mm;

  bool should_capture_next_compute;
+
+ // abort lm_ggml_metal_graph_compute if callback returns true
+ lm_ggml_abort_callback abort_callback;
+ void * abort_callback_data;
  };

  // MSL code
@@ -289,7 +293,7 @@ static void * lm_ggml_metal_host_malloc(size_t n) {
  return data;
  }

- static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
+ static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(int n_cb) {
  LM_GGML_METAL_LOG_INFO("%s: allocating\n", __func__);

  #if TARGET_OS_OSX && !LM_GGML_METAL_NDEBUG
@@ -306,7 +310,7 @@ static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
  LM_GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);

  // Configure context
- struct lm_ggml_metal_context * ctx = malloc(sizeof(struct lm_ggml_metal_context));
+ struct lm_ggml_backend_metal_context * ctx = calloc(1, sizeof(struct lm_ggml_backend_metal_context));
  ctx->device = device;
  ctx->n_cb = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS);
  ctx->queue = [ctx->device newCommandQueue];
@@ -668,7 +672,7 @@ static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
  return ctx;
  }

- static void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) {
+ static void lm_ggml_metal_free(struct lm_ggml_backend_metal_context * ctx) {
  LM_GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);

  for (int i = 0; i < LM_GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
@@ -734,7 +738,7 @@ static id<MTLBuffer> lm_ggml_metal_get_buffer(struct lm_ggml_tensor * t, size_t
  return nil;
  }

- static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx, const struct lm_ggml_tensor * op) {
+ static bool lm_ggml_metal_supports_op(const struct lm_ggml_backend_metal_context * ctx, const struct lm_ggml_tensor * op) {
  for (size_t i = 0, n = 3; i < n; ++i) {
  if (op->src[i] != NULL && op->src[i]->type == LM_GGML_TYPE_BF16) {
  return false;
@@ -845,7 +849,7 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx,
  }

  static enum lm_ggml_status lm_ggml_metal_graph_compute(
- struct lm_ggml_metal_context * ctx,
+ struct lm_ggml_backend_metal_context * ctx,
  struct lm_ggml_cgraph * gf) {

  @autoreleasepool {
@@ -878,8 +882,11 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
  id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
  command_buffer_builder[cb_idx] = command_buffer;

- // enqueue the command buffers in order to specify their execution order
- [command_buffer enqueue];
+ // always enqueue the first two command buffers
+ // enqueue all of the command buffers if we don't need to abort
+ if (cb_idx < 2 || ctx->abort_callback == NULL) {
+ [command_buffer enqueue];
+ }
  }

  const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
@@ -2229,10 +2236,8 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
  LM_GGML_ASSERT(ne00 % 4 == 0);
  LM_GGML_ASSERT(lm_ggml_is_contiguous(src0));

- //float eps;
- //memcpy(&eps, dst->op_params, sizeof(float));
-
- const float eps = 1e-6f; // TODO: temporarily hardcoded
+ float eps;
+ memcpy(&eps, dst->op_params + 1, sizeof(float));

  const int32_t n_groups = ((int32_t *) dst->op_params)[0];

@@ -2308,7 +2313,7 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
  memcpy(&beta_fast, (int32_t *) dst->op_params + 9, sizeof(float));
  memcpy(&beta_slow, (int32_t *) dst->op_params + 10, sizeof(float));

- const bool is_neox = mode & 2;
+ const bool is_neox = mode & LM_GGML_ROPE_TYPE_NEOX;

  id<MTLComputePipelineState> pipeline = nil;

@@ -2829,7 +2834,9 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(

  [encoder endEncoding];

- [command_buffer commit];
+ if (cb_idx < 2 || ctx->abort_callback == NULL) {
+ [command_buffer commit];
+ }
  });

  // Wait for completion and check status of each command buffer
@@ -2849,6 +2856,23 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(

  return LM_GGML_STATUS_FAILED;
  }
+
+ id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil);
+ if (!next_buffer) {
+ continue;
+ }
+
+ bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
+ if (next_queued) {
+ continue;
+ }
+
+ if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
+ LM_GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
+ return LM_GGML_STATUS_ABORTED;
+ }
+
+ [next_buffer commit];
  }

  if (should_capture) {
@@ -3152,7 +3176,7 @@ LM_GGML_CALL static const char * lm_ggml_backend_metal_name(lm_ggml_backend_t ba
  }

  LM_GGML_CALL static void lm_ggml_backend_metal_free(lm_ggml_backend_t backend) {
- struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+ struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
  lm_ggml_metal_free(ctx);
  free(backend);
  }
@@ -3164,13 +3188,13 @@ LM_GGML_CALL static lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_get_defa
  }

  LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_metal_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
- struct lm_ggml_metal_context * metal_ctx = (struct lm_ggml_metal_context *)backend->context;
+ struct lm_ggml_backend_metal_context * metal_ctx = (struct lm_ggml_backend_metal_context *)backend->context;

  return lm_ggml_metal_graph_compute(metal_ctx, cgraph);
  }

  LM_GGML_CALL static bool lm_ggml_backend_metal_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
- struct lm_ggml_metal_context * metal_ctx = (struct lm_ggml_metal_context *)backend->context;
+ struct lm_ggml_backend_metal_context * metal_ctx = (struct lm_ggml_backend_metal_context *)backend->context;

  return lm_ggml_metal_supports_op(metal_ctx, op);
  }
@@ -3215,9 +3239,9 @@ static lm_ggml_guid_t lm_ggml_backend_metal_guid(void) {
  }

  lm_ggml_backend_t lm_ggml_backend_metal_init(void) {
- struct lm_ggml_metal_context * ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS);
-
+ struct lm_ggml_backend_metal_context * ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS);
  if (ctx == NULL) {
+ LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
  return NULL;
  }

@@ -3239,15 +3263,24 @@ bool lm_ggml_backend_is_metal(lm_ggml_backend_t backend) {
  void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb) {
  LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));

- struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+ struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;

  ctx->n_cb = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS);
  }

+ void lm_ggml_backend_metal_set_abort_callback(lm_ggml_backend_t backend, lm_ggml_abort_callback abort_callback, void * user_data) {
+ LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
+
+ struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
+
+ ctx->abort_callback = abort_callback;
+ ctx->abort_callback_data = user_data;
+ }
+
  bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family) {
  LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));

- struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+ struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;

  return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
  }
@@ -3255,7 +3288,7 @@ bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family
  void lm_ggml_backend_metal_capture_next_compute(lm_ggml_backend_t backend) {
  LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));

- struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+ struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
  ctx->should_capture_next_compute = true;
  }

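Note on the hunks above: the context struct is renamed from lm_ggml_metal_context to lm_ggml_backend_metal_context (and now allocated with calloc so the new fields start zeroed), and an abort mechanism is threaded through graph compute: when a callback is set, only the first two command buffers are enqueued/committed eagerly, and each following buffer is committed only after the callback declines to abort. A minimal sketch of registering such a callback, assuming the new lm_ggml_backend_metal_set_abort_callback declaration is exported by the package's Metal backend header:

    #include <atomic>

    // flag flipped from another thread, e.g. a "stop generation" button
    static std::atomic<bool> g_should_abort{false};

    static bool my_abort_cb(void * /*user_data*/) {
        // returning true makes lm_ggml_metal_graph_compute stop committing
        // the remaining command buffers and return LM_GGML_STATUS_ABORTED
        return g_should_abort.load();
    }

    // after creating the backend:
    //   lm_ggml_backend_t backend = lm_ggml_backend_metal_init();
    //   lm_ggml_backend_metal_set_abort_callback(backend, my_abort_cb, nullptr);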
package/cpp/ggml-quants.c CHANGED
@@ -3818,7 +3818,7 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void
  float sumf = 0;

  #if defined(__ARM_FEATURE_SVE)
- if (svcntb() == QK8_0) {
+ if (lm_ggml_sve_cnt_b == QK8_0) {
  const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
  const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);

@@ -5303,7 +5303,7 @@ void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void
  float sumf = 0;

  #if defined(__ARM_FEATURE_SVE)
- if (svcntb() == QK8_0) {
+ if (lm_ggml_sve_cnt_b == QK8_0) {
  svfloat32_t sumv0 = svdup_n_f32(0.0f);
  svfloat32_t sumv1 = svdup_n_f32(0.0f);

package/cpp/ggml-quants.h CHANGED
@@ -127,6 +127,10 @@ void iq2xs_free_impl(enum lm_ggml_type type);
  void iq3xs_init_impl(int grid_size);
  void iq3xs_free_impl(int grid_size);

+ #if defined(__ARM_FEATURE_SVE)
+ extern int lm_ggml_sve_cnt_b;
+ #endif
+
  #ifdef __cplusplus
  }
  #endif
package/cpp/ggml.c CHANGED
@@ -37,6 +37,9 @@
  #include <unistd.h>
  #endif

+ #if defined(__ARM_FEATURE_SVE)
+ int lm_ggml_sve_cnt_b = 0;
+ #endif
  #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
  #undef LM_GGML_USE_LLAMAFILE
  #endif
@@ -53,6 +56,9 @@
  // disable POSIX deprecation warnings
  // these functions are never going away, anyway
  #pragma warning(disable: 4996)
+
+ // unreachable code because of multiple instances of code after LM_GGML_ABORT
+ #pragma warning(disable: 4702)
  #endif

  #if defined(_WIN32)
@@ -185,7 +191,7 @@ static void lm_ggml_print_backtrace_symbols(void) {
  fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
  }
  }
- #elif defined(__linux__)
+ #elif defined(__linux__) && defined(__GLIBC__)
  #include <execinfo.h>
  static void lm_ggml_print_backtrace_symbols(void) {
  // void * trace[100];
@@ -480,9 +486,16 @@ void lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t * x, float * y, int64_t n) {
  }
  }

+ void lm_ggml_fp32_to_bf16_row_ref(const float * x, lm_ggml_bf16_t * y, int64_t n) {
+ for (int i = 0; i < n; i++) {
+ y[i] = lm_ggml_compute_fp32_to_bf16(x[i]);
+ }
+ }
+
  void lm_ggml_fp32_to_bf16_row(const float * x, lm_ggml_bf16_t * y, int64_t n) {
  int i = 0;
  #if defined(__AVX512BF16__)
+ // subnormals are flushed to zero on this platform
  for (; i + 32 <= n; i += 32) {
  _mm512_storeu_si512(
  (__m512i *)(y + i),
@@ -962,7 +975,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = {
  .is_quantized = false,
  .to_float = (lm_ggml_to_float_t) lm_ggml_bf16_to_fp32_row,
  .from_float = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row,
- .from_float_ref = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row,
+ .from_float_ref = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row_ref,
  .vec_dot = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_bf16,
  .vec_dot_type = LM_GGML_TYPE_BF16,
  .nrows = 1,
@@ -2302,7 +2315,7 @@ inline static void lm_ggml_vec_abs_f32 (const int n, float * y, const float * x
  inline static void lm_ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
  inline static void lm_ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
  inline static void lm_ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
- inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
+ inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
  inline static void lm_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
  inline static void lm_ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
  inline static void lm_ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
@@ -3551,6 +3564,12 @@ struct lm_ggml_context * lm_ggml_init(struct lm_ggml_init_params params) {

  LM_GGML_ASSERT_ALIGNED(ctx->mem_buffer);

+ #if defined(__ARM_FEATURE_SVE)
+ if (!lm_ggml_sve_cnt_b) {
+ lm_ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+ }
+ #endif
+
  LM_GGML_PRINT_DEBUG("%s: context initialized\n", __func__);

  lm_ggml_critical_section_end();
@@ -3705,7 +3724,8 @@ static struct lm_ggml_tensor * lm_ggml_new_tensor_impl(
  struct lm_ggml_tensor * view_src,
  size_t view_offs) {

- assert(n_dims >= 1 && n_dims <= LM_GGML_MAX_DIMS);
+ LM_GGML_ASSERT(type >= 0 && type < LM_GGML_TYPE_COUNT);
+ LM_GGML_ASSERT(n_dims >= 1 && n_dims <= LM_GGML_MAX_DIMS);

  // find the base tensor and absolute offset
  if (view_src != NULL && view_src->view_src != NULL) {
@@ -5358,6 +5378,7 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
  struct lm_ggml_context * ctx,
  struct lm_ggml_tensor * a,
  int n_groups,
+ float eps,
  bool inplace) {

  bool is_node = false;
@@ -5368,7 +5389,8 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(

  struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a);

- result->op_params[0] = n_groups;
+ lm_ggml_set_op_params_i32(result, 0, n_groups);
+ lm_ggml_set_op_params_f32(result, 1, eps);

  result->op = LM_GGML_OP_GROUP_NORM;
  result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL;
@@ -5380,15 +5402,17 @@
  struct lm_ggml_tensor * lm_ggml_group_norm(
  struct lm_ggml_context * ctx,
  struct lm_ggml_tensor * a,
- int n_groups) {
- return lm_ggml_group_norm_impl(ctx, a, n_groups, false);
+ int n_groups,
+ float eps) {
+ return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, false);
  }

  struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
  struct lm_ggml_context * ctx,
  struct lm_ggml_tensor * a,
- int n_groups) {
- return lm_ggml_group_norm_impl(ctx, a, n_groups, true);
+ int n_groups,
+ float eps) {
+ return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, true);
  }

  // lm_ggml_mul_mat
@@ -12079,10 +12103,11 @@ static void lm_ggml_compute_forward_group_norm_f32(

  LM_GGML_TENSOR_UNARY_OP_LOCALS

- const float eps = 1e-6f; // TODO: make this a parameter
-
  // TODO: optimize

+ float eps;
+ memcpy(&eps, dst->op_params + 1, sizeof(float));
+
  int n_channels = src0->ne[2];
  int n_groups = dst->op_params[0];
  int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
@@ -14069,7 +14094,7 @@ static void lm_ggml_compute_forward_rope_f32(
  float corr_dims[2];
  lm_ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

- const bool is_neox = mode & 2;
+ const bool is_neox = mode & LM_GGML_ROPE_TYPE_NEOX;

  const float * freq_factors = NULL;
  if (src2 != NULL) {
@@ -14194,7 +14219,7 @@ static void lm_ggml_compute_forward_rope_f16(
  float corr_dims[2];
  lm_ggml_rope_yarn_corr_dims(n_dims, n_ctx_orig, freq_base, beta_fast, beta_slow, corr_dims);

- const bool is_neox = mode & 2;
+ const bool is_neox = mode & LM_GGML_ROPE_TYPE_NEOX;

  const float * freq_factors = NULL;
  if (src2 != NULL) {
@@ -20650,7 +20675,7 @@ size_t lm_ggml_quantize_chunk(
  case LM_GGML_TYPE_BF16:
  {
  size_t elemsize = sizeof(lm_ggml_bf16_t);
- lm_ggml_fp32_to_bf16_row(src + start, (lm_ggml_bf16_t *)dst + start, n);
+ lm_ggml_fp32_to_bf16_row_ref(src + start, (lm_ggml_bf16_t *)dst + start, n);
  result = n * elemsize;
  } break;
  case LM_GGML_TYPE_F32:
@@ -21104,7 +21129,7 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
  (int64_t) info->ne[2] *
  (int64_t) info->ne[3];

- if (ne % lm_ggml_blck_size(info->type) != 0) {
+ if (lm_ggml_blck_size(info->type) == 0 || ne % lm_ggml_blck_size(info->type) != 0) {
  fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
  __func__, info->name.data, (int) info->type, lm_ggml_type_name(info->type), ne, lm_ggml_blck_size(info->type));
  fclose(file);
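Together with the ggml-quants.c and ggml-quants.h hunks above, this replaces per-call svcntb() queries in the SVE dot-product kernels with a global lm_ggml_sve_cnt_b that lm_ggml_init fills once via prctl. A condensed sketch of the pattern (Linux with SVE only; the helper name is illustrative):

    #if defined(__ARM_FEATURE_SVE)
    #include <sys/prctl.h>

    int lm_ggml_sve_cnt_b = 0; // SVE vector length in bytes, cached once

    static void init_sve_cnt(void) { // illustrative helper
        if (!lm_ggml_sve_cnt_b) {
            // PR_SVE_GET_VL reports the thread's SVE vector length; the low
            // PR_SVE_VL_LEN_MASK bits hold it in bytes, matching svcntb()
            lm_ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
        }
    }
    #endif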
package/cpp/ggml.h CHANGED
@@ -244,6 +244,8 @@
  #define LM_GGML_EXIT_SUCCESS 0
  #define LM_GGML_EXIT_ABORTED 1

+ #define LM_GGML_ROPE_TYPE_NEOX 2
+
  #define LM_GGUF_MAGIC "GGUF"

  #define LM_GGUF_VERSION 3
@@ -349,6 +351,7 @@ extern "C" {
  LM_GGML_API lm_ggml_bf16_t lm_ggml_fp32_to_bf16(float);
  LM_GGML_API float lm_ggml_bf16_to_fp32(lm_ggml_bf16_t); // consider just doing << 16
  LM_GGML_API void lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t *, float *, int64_t);
+ LM_GGML_API void lm_ggml_fp32_to_bf16_row_ref(const float *, lm_ggml_bf16_t *, int64_t);
  LM_GGML_API void lm_ggml_fp32_to_bf16_row(const float *, lm_ggml_bf16_t *, int64_t);

  struct lm_ggml_object;
@@ -1139,16 +1142,17 @@ extern "C" {

  // group normalize along ne0*ne1*n_groups
  // used in stable-diffusion
- // TODO: eps is hardcoded to 1e-6 for now
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_group_norm(
  struct lm_ggml_context * ctx,
  struct lm_ggml_tensor * a,
- int n_groups);
+ int n_groups,
+ float eps);

  LM_GGML_API struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
  struct lm_ggml_context * ctx,
  struct lm_ggml_tensor * a,
- int n_groups);
+ int n_groups,
+ float eps);

  // a - x
  // b - dy
@@ -1451,11 +1455,10 @@ extern "C" {
  struct lm_ggml_tensor * b);

  // rotary position embedding
- // if mode & 1 == 1, skip n_past elements (NOT SUPPORTED)
- // if mode & 2 == 1, GPT-NeoX style
+ // if (mode & 1) - skip n_past elements (NOT SUPPORTED)
+ // if (mode & LM_GGML_ROPE_TYPE_NEOX) - GPT-NeoX style
  //
  // b is an int32 vector with size a->ne[2], it contains the positions
- // c is freq factors (e.g. phi3-128k), (optional)
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope(
  struct lm_ggml_context * ctx,
  struct lm_ggml_tensor * a,
@@ -1472,6 +1475,7 @@ extern "C" {
  int mode);

  // custom RoPE
+ // c is freq factors (e.g. phi3-128k), (optional)
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_ext(
  struct lm_ggml_context * ctx,
  struct lm_ggml_tensor * a,
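For downstream callers, the two API changes in this header look roughly like this (a sketch; ctx, t, and mode are placeholder names):

    // lm_ggml_group_norm now takes eps explicitly instead of a hardcoded 1e-6f
    struct lm_ggml_tensor * out = lm_ggml_group_norm(ctx, t, /*n_groups=*/32, /*eps=*/1e-6f);

    // RoPE style checks should use the new named constant rather than the magic 2
    const bool is_neox = (mode & LM_GGML_ROPE_TYPE_NEOX) != 0;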
package/cpp/grammar-parser.cpp CHANGED
@@ -369,6 +369,9 @@ namespace grammar_parser {
  }
  // Validate the state to ensure that all rules are defined
  for (const auto & rule : state.rules) {
+ if (rule.empty()) {
+ throw std::runtime_error("Undefined rule");
+ }
  for (const auto & elem : rule) {
  if (elem.type == LLAMA_GRETYPE_RULE_REF) {
  // Ensure that the rule at that location exists
package/cpp/llama-impl.h CHANGED
@@ -24,3 +24,18 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void
  #define LLAMA_LOG_INFO(...) llama_log_internal(LM_GGML_LOG_LEVEL_INFO , __VA_ARGS__)
  #define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__)
  #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+ //
+ // helpers
+ //
+
+ static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+ if (search.empty()) {
+ return; // Avoid infinite loop if 'search' is an empty string
+ }
+ size_t pos = 0;
+ while ((pos = s.find(search, pos)) != std::string::npos) {
+ s.replace(pos, search.length(), replace);
+ pos += replace.length();
+ }
+ }
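The helper moves here from the tokenizer source (see the removal below) and gains a guard against an empty search string; behavior for callers is otherwise unchanged. For example:

    std::string s = "hello world world";
    replace_all(s, "world", "there"); // s == "hello there there"
    replace_all(s, "", "x");          // no-op: the empty-search guard returns early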
package/cpp/llama-sampling.cpp CHANGED
@@ -85,14 +85,14 @@ void llama_sample_top_k_impl(struct llama_sampling * smpl, llama_token_data_arra
  constexpr float bucket_low = -10.0f;
  constexpr float bucket_high = 10.0f;
  constexpr float bucket_scale = nbuckets/(bucket_high - bucket_low);
- constexpr float bucker_inter = -bucket_low * bucket_scale;
+ constexpr float bucket_inter = -bucket_low * bucket_scale;

  std::vector<int> bucket_idx(candidates->size);
  std::vector<int> histo(nbuckets, 0);

  for (int i = 0; i < (int)candidates->size; ++i) {
  const float val = candidates->data[i].logit;
- int ib = int(bucket_scale * val + bucker_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
+ int ib = int(bucket_scale * val + bucket_inter); //nbuckets * (val - bucket_low) / (bucket_high - bucket_low);
  ib = std::max(0, std::min(nbuckets-1, ib));
  bucket_idx[i] = ib;
  ++histo[ib];
package/cpp/llama-vocab.cpp CHANGED
@@ -16,20 +16,6 @@
  // helpers
  //

- static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
- std::string result;
- for (size_t pos = 0; ; pos += search.length()) {
- auto new_pos = s.find(search, pos);
- if (new_pos == std::string::npos) {
- result += s.substr(pos, s.size() - pos);
- break;
- }
- result += s.substr(pos, new_pos - pos) + replace;
- pos = new_pos;
- }
- s = std::move(result);
- }
-
  LLAMA_ATTRIBUTE_FORMAT(1, 2)
  static std::string format(const char * fmt, ...) {
  va_list ap;
@@ -424,6 +410,8 @@ struct llm_tokenizer_bpe {
  };
  break;
  case LLAMA_VOCAB_PRE_TYPE_PORO:
+ case LLAMA_VOCAB_PRE_TYPE_BLOOM:
+ case LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH:
  regex_exprs = {
  " ?[^(\\s|.,!?…。,、।۔،)]+",
  };
@@ -816,6 +804,9 @@ struct llm_tokenizer_ugm {
  * the best tokenization.
  */
  void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+ // get current size of output (for reversal later)
+ size_t output_size = output.size();
+
  // normalize the input first
  std::string normalized;
  normalize(text, &normalized);
@@ -895,7 +886,7 @@ struct llm_tokenizer_ugm {
  }

  // reverse the output since we added tokens starting from the end of the input
- std::reverse(output.begin(), output.end());
+ std::reverse(output.begin() + output_size, output.end());
  }

  private:
@@ -1444,7 +1435,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
  bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
  return token != -1 && (
  token == llama_token_eos_impl(vocab) ||
- token == llama_token_eot_impl(vocab)
+ token == llama_token_eot_impl(vocab) ||
+ token == llama_token_eom_impl(vocab)
  );
  }

@@ -1476,11 +1468,11 @@ llama_token llama_token_pad_impl(const struct llama_vocab & vocab) {
  return vocab.special_pad_id;
  }

- int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab) {
+ bool llama_add_bos_token_impl(const struct llama_vocab & vocab) {
  return vocab.tokenizer_add_bos;
  }

- int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab) {
+ bool llama_add_eos_token_impl(const struct llama_vocab & vocab) {
  return vocab.tokenizer_add_eos;
  }

@@ -1500,6 +1492,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
  return vocab.special_eot_id;
  }

+ llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
+ return vocab.special_eom_id;
+ }
+
  int32_t llama_tokenize_impl(
  const struct llama_vocab & vocab,
  const char * text,
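The tokenize change above matters when output already contains tokens from an earlier call: the UGM tokenizer appends this call's tokens back-to-front, so only the newly appended range should be reversed. A standalone illustration of the fix:

    #include <algorithm>
    #include <vector>

    int main() {
        std::vector<int> output = {1, 2};          // tokens from an earlier call
        size_t output_size = output.size();        // remember where this call starts
        output.insert(output.end(), {30, 20, 10}); // this call appends back-to-front
        std::reverse(output.begin() + output_size, output.end());
        // output is now {1, 2, 10, 20, 30}; the earlier tokens keep their order
    }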
package/cpp/llama-vocab.h CHANGED
@@ -45,6 +45,7 @@ struct llama_vocab {
  id special_suffix_id = -1;
  id special_middle_id = -1;
  id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
+ id special_eom_id = -1;

  // tokenizer flags
  bool tokenizer_add_space_prefix = false;
@@ -94,13 +95,14 @@ llama_token llama_token_sep_impl(const struct llama_vocab & vocab);
  llama_token llama_token_nl_impl (const struct llama_vocab & vocab);
  llama_token llama_token_pad_impl(const struct llama_vocab & vocab);

- int32_t llama_add_bos_token_impl(const struct llama_vocab & vocab);
- int32_t llama_add_eos_token_impl(const struct llama_vocab & vocab);
+ bool llama_add_bos_token_impl(const struct llama_vocab & vocab);
+ bool llama_add_eos_token_impl(const struct llama_vocab & vocab);

  llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
  llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
  llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
  llama_token llama_token_eot_impl (const struct llama_vocab & vocab);
+ llama_token llama_token_eom_impl (const struct llama_vocab & vocab);

  int32_t llama_tokenize_impl(
  const struct llama_vocab & vocab,