cui-llama.rn 1.0.6 → 1.0.9

package/cpp/ggml-metal.m CHANGED
@@ -210,7 +210,7 @@ enum lm_ggml_metal_kernel_type {
     LM_GGML_METAL_KERNEL_TYPE_COUNT
 };
 
-struct lm_ggml_metal_context {
+struct lm_ggml_backend_metal_context {
     int n_cb;
 
     id<MTLDevice> device;
@@ -224,6 +224,10 @@ struct lm_ggml_metal_context {
     bool support_simdgroup_mm;
 
     bool should_capture_next_compute;
+
+    // abort lm_ggml_metal_graph_compute if callback returns true
+    lm_ggml_abort_callback abort_callback;
+    void * abort_callback_data;
 };
 
 // MSL code
@@ -289,7 +293,7 @@ static void * lm_ggml_metal_host_malloc(size_t n) {
     return data;
 }
 
-static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
+static struct lm_ggml_backend_metal_context * lm_ggml_metal_init(int n_cb) {
     LM_GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
 
 #if TARGET_OS_OSX && !LM_GGML_METAL_NDEBUG
@@ -306,7 +310,7 @@ static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
     LM_GGML_METAL_LOG_INFO("%s: picking default device: %s\n", __func__, [[device name] UTF8String]);
 
     // Configure context
-    struct lm_ggml_metal_context * ctx = malloc(sizeof(struct lm_ggml_metal_context));
+    struct lm_ggml_backend_metal_context * ctx = malloc(sizeof(struct lm_ggml_backend_metal_context));
     ctx->device = device;
     ctx->n_cb   = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS);
     ctx->queue  = [ctx->device newCommandQueue];
@@ -668,7 +672,7 @@ static struct lm_ggml_metal_context * lm_ggml_metal_init(int n_cb) {
     return ctx;
 }
 
-static void lm_ggml_metal_free(struct lm_ggml_metal_context * ctx) {
+static void lm_ggml_metal_free(struct lm_ggml_backend_metal_context * ctx) {
     LM_GGML_METAL_LOG_INFO("%s: deallocating\n", __func__);
 
     for (int i = 0; i < LM_GGML_METAL_KERNEL_TYPE_COUNT; ++i) {
@@ -734,7 +738,7 @@ static id<MTLBuffer> lm_ggml_metal_get_buffer(struct lm_ggml_tensor * t, size_t
     return nil;
 }
 
-static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx, const struct lm_ggml_tensor * op) {
+static bool lm_ggml_metal_supports_op(const struct lm_ggml_backend_metal_context * ctx, const struct lm_ggml_tensor * op) {
     for (size_t i = 0, n = 3; i < n; ++i) {
         if (op->src[i] != NULL && op->src[i]->type == LM_GGML_TYPE_BF16) {
             return false;
@@ -845,7 +849,7 @@ static bool lm_ggml_metal_supports_op(const struct lm_ggml_metal_context * ctx,
 }
 
 static enum lm_ggml_status lm_ggml_metal_graph_compute(
-        struct lm_ggml_metal_context * ctx,
+        struct lm_ggml_backend_metal_context * ctx,
         struct lm_ggml_cgraph * gf) {
 
     @autoreleasepool {
@@ -878,8 +882,11 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
         id<MTLCommandBuffer> command_buffer = [ctx->queue commandBufferWithUnretainedReferences];
         command_buffer_builder[cb_idx] = command_buffer;
 
-        // enqueue the command buffers in order to specify their execution order
-        [command_buffer enqueue];
+        // always enqueue the first two command buffers
+        // enqueue all of the command buffers if we don't need to abort
+        if (cb_idx < 2 || ctx->abort_callback == NULL) {
+            [command_buffer enqueue];
+        }
     }
 
     const id<MTLCommandBuffer> *command_buffers = command_buffer_builder;
@@ -2229,10 +2236,8 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
                 LM_GGML_ASSERT(ne00 % 4 == 0);
                 LM_GGML_ASSERT(lm_ggml_is_contiguous(src0));
 
-                //float eps;
-                //memcpy(&eps, dst->op_params, sizeof(float));
-
-                const float eps = 1e-6f; // TODO: temporarily hardcoded
+                float eps;
+                memcpy(&eps, dst->op_params + 1, sizeof(float));
 
                 const int32_t n_groups = ((int32_t *) dst->op_params)[0];
 
@@ -2829,7 +2834,9 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
 
         [encoder endEncoding];
 
-        [command_buffer commit];
+        if (cb_idx < 2 || ctx->abort_callback == NULL) {
+            [command_buffer commit];
+        }
     });
 
     // Wait for completion and check status of each command buffer
@@ -2849,6 +2856,23 @@ static enum lm_ggml_status lm_ggml_metal_graph_compute(
 
             return LM_GGML_STATUS_FAILED;
         }
+
+        id<MTLCommandBuffer> next_buffer = (i + 1 < n_cb ? command_buffers[i + 1] : nil);
+        if (!next_buffer) {
+            continue;
+        }
+
+        bool next_queued = ([next_buffer status] != MTLCommandBufferStatusNotEnqueued);
+        if (next_queued) {
+            continue;
+        }
+
+        if (ctx->abort_callback && ctx->abort_callback(ctx->abort_callback_data)) {
+            LM_GGML_METAL_LOG_INFO("%s: command buffer %d aborted", __func__, i);
+            return LM_GGML_STATUS_ABORTED;
+        }
+
+        [next_buffer commit];
     }
 
     if (should_capture) {
@@ -3152,7 +3176,7 @@ LM_GGML_CALL static const char * lm_ggml_backend_metal_name(lm_ggml_backend_t ba
 }
 
 LM_GGML_CALL static void lm_ggml_backend_metal_free(lm_ggml_backend_t backend) {
-    struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
     lm_ggml_metal_free(ctx);
     free(backend);
 }
@@ -3164,13 +3188,13 @@ LM_GGML_CALL static lm_ggml_backend_buffer_type_t lm_ggml_backend_metal_get_defa
 }
 
 LM_GGML_CALL static enum lm_ggml_status lm_ggml_backend_metal_graph_compute(lm_ggml_backend_t backend, struct lm_ggml_cgraph * cgraph) {
-    struct lm_ggml_metal_context * metal_ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * metal_ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     return lm_ggml_metal_graph_compute(metal_ctx, cgraph);
 }
 
 LM_GGML_CALL static bool lm_ggml_backend_metal_supports_op(lm_ggml_backend_t backend, const struct lm_ggml_tensor * op) {
-    struct lm_ggml_metal_context * metal_ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * metal_ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     return lm_ggml_metal_supports_op(metal_ctx, op);
 }
@@ -3215,9 +3239,9 @@ static lm_ggml_guid_t lm_ggml_backend_metal_guid(void) {
 }
 
 lm_ggml_backend_t lm_ggml_backend_metal_init(void) {
-    struct lm_ggml_metal_context * ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS);
-
+    struct lm_ggml_backend_metal_context * ctx = lm_ggml_metal_init(LM_GGML_DEFAULT_N_THREADS);
     if (ctx == NULL) {
+        LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate context\n", __func__);
         return NULL;
     }
 
@@ -3239,15 +3263,24 @@ bool lm_ggml_backend_is_metal(lm_ggml_backend_t backend) {
 void lm_ggml_backend_metal_set_n_cb(lm_ggml_backend_t backend, int n_cb) {
     LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
 
-    struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     ctx->n_cb = MIN(n_cb, LM_GGML_METAL_MAX_BUFFERS);
 }
 
+void lm_ggml_backend_metal_set_abort_callback(lm_ggml_backend_t backend, lm_ggml_abort_callback abort_callback, void * user_data) {
+    LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
+
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
+
+    ctx->abort_callback      = abort_callback;
+    ctx->abort_callback_data = user_data;
+}
+
 bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family) {
     LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
 
-    struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
 
     return [ctx->device supportsFamily:(MTLGPUFamilyApple1 + family - 1)];
 }
@@ -3255,7 +3288,7 @@ bool lm_ggml_backend_metal_supports_family(lm_ggml_backend_t backend, int family
 void lm_ggml_backend_metal_capture_next_compute(lm_ggml_backend_t backend) {
     LM_GGML_ASSERT(lm_ggml_backend_is_metal(backend));
 
-    struct lm_ggml_metal_context * ctx = (struct lm_ggml_metal_context *)backend->context;
+    struct lm_ggml_backend_metal_context * ctx = (struct lm_ggml_backend_metal_context *)backend->context;
     ctx->should_capture_next_compute = true;
 }
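Note: the abort hook is the user-facing piece of the Metal changes above. Only the first two command buffers are enqueued/committed eagerly; each later one is committed only after the callback declines to abort. Below is a minimal sketch of how a host application might wire it up, assuming the `bool (*)(void * data)` shape of `lm_ggml_abort_callback` from ggml.h; the flag and helper names are hypothetical, and the ggml/ggml-metal headers from this package are assumed to be on the include path.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

static atomic_bool g_cancel_requested; // set from another thread to stop compute

// returning true makes lm_ggml_metal_graph_compute stop committing the
// remaining command buffers and return LM_GGML_STATUS_ABORTED
static bool metal_abort_cb(void * data) {
    (void) data;
    return atomic_load(&g_cancel_requested);
}

static lm_ggml_backend_t make_metal_backend(void) {
    lm_ggml_backend_t backend = lm_ggml_backend_metal_init();
    if (backend != NULL) {
        lm_ggml_backend_metal_set_abort_callback(backend, metal_abort_cb, NULL);
    }
    return backend;
}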
package/cpp/ggml-quants.c CHANGED
@@ -3818,7 +3818,7 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void
     float sumf = 0;
 
 #if defined(__ARM_FEATURE_SVE)
-    if (svcntb() == QK8_0) {
+    if (lm_ggml_sve_cnt_b == QK8_0) {
         const svbool_t ptrueh = svptrue_pat_b8(SV_VL16);
         const svbool_t ptruel = svnot_b_z(svptrue_b8(), ptrueh);
 
@@ -5303,7 +5303,7 @@ void lm_ggml_vec_dot_q8_0_q8_0(int n, float * restrict s, size_t bs, const void
     float sumf = 0;
 
 #if defined(__ARM_FEATURE_SVE)
-    if (svcntb() == QK8_0) {
+    if (lm_ggml_sve_cnt_b == QK8_0) {
         svfloat32_t sumv0 = svdup_n_f32(0.0f);
         svfloat32_t sumv1 = svdup_n_f32(0.0f);
 
@@ -6449,22 +6449,22 @@ void lm_ggml_vec_dot_q3_K_q8_K(int n, float * restrict s, size_t bs, const void
             // compute mask for subtraction
             vuint8m1_t qh_m0 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_0 = __riscv_vmseq_vx_u8m1_b8(qh_m0, 0, vl);
-            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_m(vmask_0, q3_0, 0x4, vl);
+            vint8m1_t q3_m0 = __riscv_vsub_vx_i8m1_mu(vmask_0, q3_0, q3_0, 0x4, vl);
             m <<= 1;
 
             vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_1 = __riscv_vmseq_vx_u8m1_b8(qh_m1, 0, vl);
-            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_m(vmask_1, q3_1, 0x4, vl);
+            vint8m1_t q3_m1 = __riscv_vsub_vx_i8m1_mu(vmask_1, q3_1, q3_1, 0x4, vl);
             m <<= 1;
 
             vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_2 = __riscv_vmseq_vx_u8m1_b8(qh_m2, 0, vl);
-            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_m(vmask_2, q3_2, 0x4, vl);
+            vint8m1_t q3_m2 = __riscv_vsub_vx_i8m1_mu(vmask_2, q3_2, q3_2, 0x4, vl);
             m <<= 1;
 
             vuint8m1_t qh_m3 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_3 = __riscv_vmseq_vx_u8m1_b8(qh_m3, 0, vl);
-            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_m(vmask_3, q3_3, 0x4, vl);
+            vint8m1_t q3_m3 = __riscv_vsub_vx_i8m1_mu(vmask_3, q3_3, q3_3, 0x4, vl);
             m <<= 1;
 
             // load Q8 and take product with Q3
@@ -7720,13 +7720,13 @@ void lm_ggml_vec_dot_q5_K_q8_K(int n, float * restrict s, size_t bs, const void
             vint8m1_t q5_a = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vand_vx_u8m1(q5_x, 0x0F, vl));
             vuint8m1_t qh_m1 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_1 = __riscv_vmsne_vx_u8m1_b8(qh_m1, 0, vl);
-            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_m(vmask_1, q5_a, 16, vl);
+            vint8m1_t q5_m1 = __riscv_vadd_vx_i8m1_mu(vmask_1, q5_a, q5_a, 16, vl);
             m <<= 1;
 
             vint8m1_t q5_l = __riscv_vreinterpret_v_u8m1_i8m1(__riscv_vsrl_vx_u8m1(q5_x, 0x04, vl));
             vuint8m1_t qh_m2 = __riscv_vand_vx_u8m1(vqh, m, vl);
             vbool8_t vmask_2 = __riscv_vmsne_vx_u8m1_b8(qh_m2, 0, vl);
-            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_m(vmask_2, q5_l, 16, vl);
+            vint8m1_t q5_m2 = __riscv_vadd_vx_i8m1_mu(vmask_2, q5_l, q5_l, 16, vl);
             m <<= 1;
 
             vint16m2_t v0 = __riscv_vwmul_vv_i16m2(q5_m1, q8_y1, vl);
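Note: the `_m` → `_mu` renames track the RVV intrinsics policy scheme: the plain masked form leaves inactive lanes unspecified on newer toolchains, while the mask-undisturbed (`_mu`) form takes an explicit pass-through operand for them. Passing the source vector itself (e.g. `q3_0`) as the pass-through preserves what these kernels always assumed: inactive lanes keep their original value. A reduced illustration, assuming a toolchain with v0.12-style RVV intrinsics:

#include <riscv_vector.h>

// active lanes (mask set) become q3 - 4; inactive lanes are copied from the
// pass-through operand, here q3 itself, instead of being left undefined
static vint8m1_t sub4_where_masked(vbool8_t mask, vint8m1_t q3, size_t vl) {
    return __riscv_vsub_vx_i8m1_mu(mask, q3, q3, 0x4, vl);
}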
package/cpp/ggml-quants.h CHANGED
@@ -127,6 +127,10 @@ void iq2xs_free_impl(enum lm_ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);
 
+#if defined(__ARM_FEATURE_SVE)
+extern int lm_ggml_sve_cnt_b;
+#endif
+
 #ifdef __cplusplus
 }
 #endif
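Note: exposing `lm_ggml_sve_cnt_b` lets the hot dot-product kernels above compare a cached SVE vector length instead of executing `svcntb()` on every call. The cache is filled once in `lm_ggml_init()` (see the ggml.c hunks below) via the Linux prctl interface; a sketch of the idea, with the helper name invented for illustration:

#if defined(__ARM_FEATURE_SVE)
#include <sys/prctl.h>

extern int lm_ggml_sve_cnt_b; // SVE vector length in bytes, 0 until initialized

// hypothetical helper mirroring what lm_ggml_init() now does
static void init_sve_cnt_b(void) {
    if (!lm_ggml_sve_cnt_b) {
        // PR_SVE_GET_VL returns the vector length in the low bits plus flags
        lm_ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
    }
}
#endif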
package/cpp/ggml.c CHANGED
@@ -37,6 +37,9 @@
 #include <unistd.h>
 #endif
 
+#if defined(__ARM_FEATURE_SVE)
+int lm_ggml_sve_cnt_b = 0;
+#endif
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef LM_GGML_USE_LLAMAFILE
 #endif
@@ -53,6 +56,9 @@
 // disable POSIX deprecation warnings
 // these functions are never going away, anyway
 #pragma warning(disable: 4996)
+
+// unreachable code because of multiple instances of code after LM_GGML_ABORT
+#pragma warning(disable: 4702)
 #endif
 
 #if defined(_WIN32)
@@ -141,7 +147,51 @@ typedef pthread_t lm_ggml_thread_t;
 
 #include <sys/wait.h>
 
-#if defined(__linux__)
+#if defined(__ANDROID__)
+#include <unwind.h>
+#include <dlfcn.h>
+#include <stdio.h>
+
+struct backtrace_state {
+    void ** current;
+    void ** end;
+};
+
+static _Unwind_Reason_Code unwind_callback(struct _Unwind_Context* context, void* arg) {
+    struct backtrace_state * state = (struct backtrace_state *)arg;
+    uintptr_t pc = _Unwind_GetIP(context);
+    if (pc) {
+        if (state->current == state->end) {
+            return _URC_END_OF_STACK;
+        } else {
+            *state->current++ = (void*)pc;
+        }
+    }
+    return _URC_NO_REASON;
+}
+
+static void lm_ggml_print_backtrace_symbols(void) {
+    const int max = 100;
+    void* buffer[max];
+
+    struct backtrace_state state = {buffer, buffer + max};
+    _Unwind_Backtrace(unwind_callback, &state);
+
+    int count = state.current - buffer;
+
+    for (int idx = 0; idx < count; ++idx) {
+        const void * addr = buffer[idx];
+        const char * symbol = "";
+
+        Dl_info info;
+        if (dladdr(addr, &info) && info.dli_sname) {
+            symbol = info.dli_sname;
+        }
+
+        fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
+    }
+}
+#elif defined(__linux__) && defined(__GLIBC__)
 #include <execinfo.h>
 static void lm_ggml_print_backtrace_symbols(void) {
     // void * trace[100];
@@ -436,9 +486,16 @@ void lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t * x, float * y, int64_t n) {
     }
 }
 
+void lm_ggml_fp32_to_bf16_row_ref(const float * x, lm_ggml_bf16_t * y, int64_t n) {
+    for (int i = 0; i < n; i++) {
+        y[i] = lm_ggml_compute_fp32_to_bf16(x[i]);
+    }
+}
+
 void lm_ggml_fp32_to_bf16_row(const float * x, lm_ggml_bf16_t * y, int64_t n) {
     int i = 0;
 #if defined(__AVX512BF16__)
+    // subnormals are flushed to zero on this platform
     for (; i + 32 <= n; i += 32) {
         _mm512_storeu_si512(
             (__m512i *)(y + i),
@@ -918,7 +975,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = {
         .is_quantized = false,
         .to_float = (lm_ggml_to_float_t) lm_ggml_bf16_to_fp32_row,
         .from_float = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row,
-        .from_float_ref = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row,
+        .from_float_ref = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row_ref,
         .vec_dot = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_bf16,
         .vec_dot_type = LM_GGML_TYPE_BF16,
         .nrows = 1,
@@ -2258,7 +2315,7 @@ inline static void lm_ggml_vec_abs_f32 (const int n, float * y, const float * x
 inline static void lm_ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 inline static void lm_ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
 inline static void lm_ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
-inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
+inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void lm_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void lm_ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 inline static void lm_ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
@@ -3507,6 +3564,12 @@ struct lm_ggml_context * lm_ggml_init(struct lm_ggml_init_params params) {
 
     LM_GGML_ASSERT_ALIGNED(ctx->mem_buffer);
 
+#if defined(__ARM_FEATURE_SVE)
+    if (!lm_ggml_sve_cnt_b) {
+        lm_ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+    }
+#endif
+
     LM_GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
 
     lm_ggml_critical_section_end();
@@ -3661,7 +3724,8 @@ static struct lm_ggml_tensor * lm_ggml_new_tensor_impl(
         struct lm_ggml_tensor * view_src,
         size_t view_offs) {
 
-    assert(n_dims >= 1 && n_dims <= LM_GGML_MAX_DIMS);
+    LM_GGML_ASSERT(type >= 0 && type < LM_GGML_TYPE_COUNT);
+    LM_GGML_ASSERT(n_dims >= 1 && n_dims <= LM_GGML_MAX_DIMS);
 
     // find the base tensor and absolute offset
     if (view_src != NULL && view_src->view_src != NULL) {
@@ -5314,6 +5378,7 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
     struct lm_ggml_context * ctx,
     struct lm_ggml_tensor * a,
     int n_groups,
+    float eps,
     bool inplace) {
 
     bool is_node = false;
@@ -5324,7 +5389,8 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
 
     struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a);
 
-    result->op_params[0] = n_groups;
+    lm_ggml_set_op_params_i32(result, 0, n_groups);
+    lm_ggml_set_op_params_f32(result, 1, eps);
 
     result->op = LM_GGML_OP_GROUP_NORM;
     result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL;
@@ -5336,15 +5402,17 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
 struct lm_ggml_tensor * lm_ggml_group_norm(
     struct lm_ggml_context * ctx,
     struct lm_ggml_tensor * a,
-    int n_groups) {
-    return lm_ggml_group_norm_impl(ctx, a, n_groups, false);
+    int n_groups,
+    float eps) {
+    return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, false);
 }
 
 struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
     struct lm_ggml_context * ctx,
    struct lm_ggml_tensor * a,
-    int n_groups) {
-    return lm_ggml_group_norm_impl(ctx, a, n_groups, true);
+    int n_groups,
+    float eps) {
+    return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, true);
 }
 
 // lm_ggml_mul_mat
@@ -12035,10 +12103,11 @@ static void lm_ggml_compute_forward_group_norm_f32(
 
     LM_GGML_TENSOR_UNARY_OP_LOCALS
 
-    const float eps = 1e-6f; // TODO: make this a parameter
-
     // TODO: optimize
 
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+
     int n_channels = src0->ne[2];
     int n_groups = dst->op_params[0];
     int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
@@ -20606,7 +20675,7 @@ size_t lm_ggml_quantize_chunk(
         case LM_GGML_TYPE_BF16:
             {
                 size_t elemsize = sizeof(lm_ggml_bf16_t);
-                lm_ggml_fp32_to_bf16_row(src + start, (lm_ggml_bf16_t *)dst + start, n);
+                lm_ggml_fp32_to_bf16_row_ref(src + start, (lm_ggml_bf16_t *)dst + start, n);
                 result = n * elemsize;
             } break;
         case LM_GGML_TYPE_F32:
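Note: among the smaller ggml.c fixes, the ELU change is a numerical-accuracy one. For x near zero, expf(x) rounds to a value near 1.0f, and the explicit subtraction expf(x) - 1 cancels almost all significant bits; expm1f computes e^x - 1 directly and keeps full precision. A quick comparison:

#include <math.h>
#include <stdio.h>

int main(void) {
    float x = 1e-8f;
    // expf(1e-8f) rounds to 1.0f, so the old formulation loses the result entirely
    printf("expf(x) - 1 = %g\n", expf(x) - 1.0f); // prints 0
    printf("expm1f(x)   = %g\n", expm1f(x));      // prints ~1e-08
    return 0;
}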
package/cpp/ggml.h CHANGED
@@ -349,6 +349,7 @@ extern "C" {
     LM_GGML_API lm_ggml_bf16_t lm_ggml_fp32_to_bf16(float);
     LM_GGML_API float lm_ggml_bf16_to_fp32(lm_ggml_bf16_t); // consider just doing << 16
     LM_GGML_API void lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t *, float *, int64_t);
+    LM_GGML_API void lm_ggml_fp32_to_bf16_row_ref(const float *, lm_ggml_bf16_t *, int64_t);
     LM_GGML_API void lm_ggml_fp32_to_bf16_row(const float *, lm_ggml_bf16_t *, int64_t);
 
     struct lm_ggml_object;
@@ -1139,16 +1140,17 @@ extern "C" {
 
     // group normalize along ne0*ne1*n_groups
     // used in stable-diffusion
-    // TODO: eps is hardcoded to 1e-6 for now
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_group_norm(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor * a,
-            int n_groups);
+            int n_groups,
+            float eps);
 
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor * a,
-            int n_groups);
+            int n_groups,
+            float eps);
 
     // a - x
     // b - dy
@@ -1455,7 +1457,6 @@ extern "C" {
     // if mode & 2 == 1, GPT-NeoX style
     //
     // b is an int32 vector with size a->ne[2], it contains the positions
-    // c is freq factors (e.g. phi3-128k), (optional)
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor * a,
@@ -1472,6 +1473,7 @@ extern "C" {
             int mode);
 
     // custom RoPE
+    // c is freq factors (e.g. phi3-128k), (optional)
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_ext(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor * a,
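Note: this is a breaking API change for graph-building code; callers of `lm_ggml_group_norm` must now supply `eps` explicitly, and 1e-6f reproduces the value the kernels previously hardcoded. A usage sketch, assuming an existing `ctx` and input tensor `inp`:

// before: lm_ggml_group_norm(ctx, inp, 32);
// after : eps is an explicit parameter
struct lm_ggml_tensor * out = lm_ggml_group_norm(ctx, inp, /*n_groups =*/ 32, /*eps =*/ 1e-6f);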
package/cpp/llama-impl.h CHANGED
@@ -24,3 +24,18 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void
 #define LLAMA_LOG_INFO(...) llama_log_internal(LM_GGML_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+//
+// helpers
+//
+
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
+    }
+}
package/cpp/llama-vocab.cpp CHANGED
@@ -16,20 +16,6 @@
 // helpers
 //
 
-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
-
 LLAMA_ATTRIBUTE_FORMAT(1, 2)
 static std::string format(const char * fmt, ...) {
     va_list ap;
@@ -816,6 +802,9 @@ struct llm_tokenizer_ugm {
      * the best tokenization.
      */
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        // get current size of output (for reversal later)
+        size_t output_size = output.size();
+
         // normalize the input first
         std::string normalized;
         normalize(text, &normalized);
@@ -895,7 +884,7 @@ struct llm_tokenizer_ugm {
         }
 
         // reverse the output since we added tokens starting from the end of the input
-        std::reverse(output.begin(), output.end());
+        std::reverse(output.begin() + output_size, output.end());
     }
 
 private:
@@ -1444,7 +1433,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
     return token != -1 && (
         token == llama_token_eos_impl(vocab) ||
-        token == llama_token_eot_impl(vocab)
+        token == llama_token_eot_impl(vocab) ||
+        token == llama_token_eom_impl(vocab)
     );
 }
 
@@ -1500,6 +1490,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
     return vocab.special_eot_id;
 }
 
+llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
+    return vocab.special_eom_id;
+}
+
 int32_t llama_tokenize_impl(
         const struct llama_vocab & vocab,
         const char * text,
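Note: the practical effect of the new EOM plumbing is on stop-token handling: `llama_token_is_eog_impl` now treats the end-of-message token (used by some chat templates, e.g. Llama 3.1's <|eom_id|>) the same as EOS/EOT, so a typical generation loop stops on it with no extra checks. A sketch against the public llama.h API, with the sampling call left abstract (hypothetical helper):

// llama_token_is_eog() wraps llama_token_is_eog_impl(); after this change it
// also returns true for the EOM token
for (;;) {
    llama_token tok = sample_next_token(); // hypothetical sampler for illustration
    if (llama_token_is_eog(model, tok)) {
        break; // stop on EOS, EOT, or (new) EOM
    }
    // ... accept/emit the token ...
}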
package/cpp/llama-vocab.h CHANGED
@@ -45,6 +45,7 @@ struct llama_vocab {
     id special_suffix_id = -1;
     id special_middle_id = -1;
     id special_eot_id    = -1; // TODO: move above after "eos_id", and here add "file separator" token
+    id special_eom_id    = -1;
 
     // tokenizer flags
     bool tokenizer_add_space_prefix = false;
@@ -101,6 +102,7 @@ llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
 llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_eot_impl   (const struct llama_vocab & vocab);
+llama_token llama_token_eom_impl   (const struct llama_vocab & vocab);
 
 int32_t llama_tokenize_impl(
         const struct llama_vocab & vocab,