cui-llama.rn 1.1.5 → 1.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/ggml-metal.m CHANGED
@@ -13,13 +13,16 @@
  #define MAX(a, b) ((a) > (b) ? (a) : (b))

  #ifdef LM_GGML_METAL_NDEBUG
+ #define LM_GGML_METAL_LOG(...)
  #define LM_GGML_METAL_LOG_INFO(...)
  #define LM_GGML_METAL_LOG_WARN(...)
  #define LM_GGML_METAL_LOG_ERROR(...)
  #else
+ #define LM_GGML_METAL_LOG(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_NONE, __VA_ARGS__)
  #define LM_GGML_METAL_LOG_INFO(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_INFO, __VA_ARGS__)
  #define LM_GGML_METAL_LOG_WARN(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_WARN, __VA_ARGS__)
  #define LM_GGML_METAL_LOG_ERROR(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+ #define LM_GGML_METAL_LOG_DEBUG(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
  #endif

  #define UNUSED(x) (void)(x)
@@ -3183,7 +3186,7 @@ static void lm_ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_
  #ifndef LM_GGML_METAL_NDEBUG
  #if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
  if (@available(macOS 10.12, iOS 16.0, *)) {
- LM_GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)",
+ LM_GGML_METAL_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
  __func__,
  size_aligned / 1024.0 / 1024.0,
  device.currentAllocatedSize / 1024.0 / 1024.0,
@@ -3191,8 +3194,6 @@ static void lm_ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_

  if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
  LM_GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
- } else {
- LM_GGML_METAL_LOG_INFO("\n");
  }
  } else {
  LM_GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
@@ -3224,15 +3225,19 @@ LM_GGML_CALL static lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_type_a
  ctx->n_buffers = 1;

  if (ctx->all_data != NULL) {
- ctx->buffers[0].data = ctx->all_data;
- ctx->buffers[0].size = size;
- ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
- length:size_aligned
- options:MTLResourceStorageModeShared
- deallocator:nil];
+ ctx->buffers[0].data = ctx->all_data;
+ ctx->buffers[0].size = size;
+ ctx->buffers[0].metal = nil;
+
+ if (size_aligned > 0) {
+ ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
+ length:size_aligned
+ options:MTLResourceStorageModeShared
+ deallocator:nil];
+ }
  }

- if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) {
+ if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
  LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
  free(ctx);
  lm_ggml_backend_metal_free_device();
@@ -3309,14 +3314,17 @@ LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_from_ptr(void

  // the buffer fits into the max buffer size allowed by the device
  if (size_aligned <= device.maxBufferLength) {
- ctx->buffers[ctx->n_buffers].data = data;
- ctx->buffers[ctx->n_buffers].size = size;
+ ctx->buffers[ctx->n_buffers].data = data;
+ ctx->buffers[ctx->n_buffers].size = size;
+ ctx->buffers[ctx->n_buffers].metal = nil;

- ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
+ if (size_aligned > 0) {
+ ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];

- if (ctx->buffers[ctx->n_buffers].metal == nil) {
- LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
- return false;
+ if (ctx->buffers[ctx->n_buffers].metal == nil) {
+ LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
+ return false;
+ }
  }

  lm_ggml_backend_metal_log_allocated_size(device, size_aligned);
@@ -3332,14 +3340,17 @@ LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_from_ptr(void
  for (size_t i = 0; i < size; i += size_step) {
  const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);

- ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
- ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+ ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
+ ctx->buffers[ctx->n_buffers].size = size_step_aligned;
+ ctx->buffers[ctx->n_buffers].metal = nil;

- ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
+ if (size_step_aligned > 0) {
+ ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];

- if (ctx->buffers[ctx->n_buffers].metal == nil) {
- LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
- return false;
+ if (ctx->buffers[ctx->n_buffers].metal == nil) {
+ LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
+ return false;
+ }
  }

  lm_ggml_backend_metal_log_allocated_size(device, size_step_aligned);
package/cpp/ggml-quants.c CHANGED
@@ -230,6 +230,12 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )

  return _mm_packus_epi16( bytes1, bytes2);
  }
+
+ static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
+ const __m128i ax = _mm_sign_epi8(x, x);
+ const __m128i sy = _mm_sign_epi8(y, x);
+ return _mm_maddubs_epi16(ax, sy);
+ }
  #endif
  #elif defined(__SSSE3__)
  // horizontally add 4x4 floats
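The relocated mul_add_epi8_sse helper is easier to follow with a plain-C reference. The sketch below is ours, not part of the package: _mm_sign_epi8 copies the sign of x onto y (zeroing lanes where x is 0), so the unsigned-by-signed _mm_maddubs_epi16 effectively computes x*y per lane and sums adjacent pairs into saturated int16 results.

```c
#include <stdint.h>

/* Hypothetical scalar equivalent of mul_add_epi8_sse for one 128-bit vector:
 * out[i] = saturate_i16(x[2i]*y[2i] + x[2i+1]*y[2i+1]). */
static void mul_add_epi8_scalar_ref(const int8_t x[16], const int8_t y[16], int16_t out[8]) {
    for (int i = 0; i < 8; ++i) {
        int32_t sum = (int32_t) x[2*i + 0] * y[2*i + 0]
                    + (int32_t) x[2*i + 1] * y[2*i + 1];
        if (sum >  32767) { sum =  32767; }   // _mm_maddubs_epi16 saturates the pair sum
        if (sum < -32768) { sum = -32768; }
        out[i] = (int16_t) sum;
    }
}
```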
@@ -4206,37 +4212,37 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void

  sumf = hsum_float_8(acc);
  #elif defined(__AVX__)
- // Initialize accumulator with zeros
- __m256 acc = _mm256_setzero_ps();
-
- // Main loop
- for (; ib < nb; ++ib) {
- // Compute combined scale for the block
- const __m256 d = _mm256_set1_ps( LM_GGML_FP16_TO_FP32(x[ib].d) * LM_GGML_FP16_TO_FP32(y[ib].d) );
-
- const __m128i lowMask = _mm_set1_epi8(0xF);
- const __m128i off = _mm_set1_epi8(8);
-
- const __m128i tmp = _mm_loadu_si128((const __m128i *)x[ib].qs);
-
- __m128i bx_0 = _mm_and_si128(lowMask, tmp);
- __m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
- bx_0 = _mm_sub_epi8(bx_0, off);
- const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
-
- bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
- by_0 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16));
- bx_0 = _mm_sub_epi8(bx_0, off);
- const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0);
+ const __m128i mone = _mm_set1_epi16(1);

- // Convert int32_t to float
- __m256 p = _mm256_cvtepi32_ps(MM256_SET_M128I(i32_0, i32_1));
+ __m256 accum1 = _mm256_setzero_ps();
+ __m256 accum2 = _mm256_setzero_ps();
+ for (; ib + 1 < nb; ib += 2) {
+ const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
+ const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
+ const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
+ const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
+ const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
+ const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);

- // Apply the scale, and accumulate
- acc = _mm256_add_ps(_mm256_mul_ps( d, p ), acc);
+ const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8));
+ const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
+ const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
+ const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
+ const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
+ const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
+ const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
+ const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
+ const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
+ const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
+ const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
+ const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
+ accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(LM_GGML_FP16_TO_FP32(y[ib + 0].d)*LM_GGML_FP16_TO_FP32(x[ib + 0].d)),
+ _mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
+ accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(LM_GGML_FP16_TO_FP32(y[ib + 1].d)*LM_GGML_FP16_TO_FP32(x[ib + 1].d)),
+ _mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
  }

- sumf = hsum_float_8(acc);
+ sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
  #elif defined(__SSSE3__)
  // set constants
  const __m128i lowMask = _mm_set1_epi8(0xF);
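As a reading aid for the rewritten __AVX__ branch above (this is our sketch, not code from the package): each q4_0 block packs 32 weights as 4-bit values offset by 8 plus an fp16 scale, and each q8_0 block holds 32 int8 values plus its own scale, so per block the kernel computes d_x * d_y * Σ (q4 - 8) * q8; the new loop simply does this for two blocks per iteration. A hedged scalar reference, assuming the usual ggml q4_0 nibble layout (low nibbles hold elements 0..15, high nibbles hold elements 16..31):

```c
#include <stdint.h>

/* Scalar reference for one q4_0 x q8_0 block dot product; d4 and d8 are the
 * block scales already converted from fp16 to float. */
static float q4_0_q8_0_block_dot_ref(const uint8_t qs4[16], float d4,
                                     const int8_t  qs8[32], float d8) {
    int32_t sumi = 0;
    for (int j = 0; j < 16; ++j) {
        const int x0 = (qs4[j] & 0x0F) - 8;   // low nibble  -> element j
        const int x1 = (qs4[j] >> 4)   - 8;   // high nibble -> element j + 16
        sumi += x0 * qs8[j] + x1 * qs8[j + 16];
    }
    return d4 * d8 * (float) sumi;            // apply the combined block scale
}
```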
@@ -11819,15 +11825,6 @@ void lm_ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const voi
  #endif
  }

-
- #if defined(__AVX__)
- static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
- const __m128i ax = _mm_sign_epi8(x, x);
- const __m128i sy = _mm_sign_epi8(y, x);
- return _mm_maddubs_epi16(ax, sy);
- }
- #endif
-
  #if defined(__AVX2__)
  static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
  const __m256i ax = _mm256_sign_epi8(x, x);
package/cpp/ggml.h CHANGED
@@ -564,10 +564,11 @@ extern "C" {
  };

  enum lm_ggml_log_level {
- LM_GGML_LOG_LEVEL_ERROR = 2,
- LM_GGML_LOG_LEVEL_WARN = 3,
- LM_GGML_LOG_LEVEL_INFO = 4,
- LM_GGML_LOG_LEVEL_DEBUG = 5
+ LM_GGML_LOG_LEVEL_NONE = 0,
+ LM_GGML_LOG_LEVEL_INFO = 1,
+ LM_GGML_LOG_LEVEL_WARN = 2,
+ LM_GGML_LOG_LEVEL_ERROR = 3,
+ LM_GGML_LOG_LEVEL_DEBUG = 4,
  };

  enum lm_ggml_tensor_flag {
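Because the enum values above are renumbered (ERROR moves from 2 to 3, INFO from 4 to 1, and a new NONE = 0 is used by the plain LLAMA_LOG / LM_GGML_METAL_LOG macros introduced in this release), callbacks that compare raw integers against the old numbering will misclassify messages. A minimal sketch of a level-aware log sink, assuming only the enum shown above; the function name, the include path, and how it gets registered are ours, not taken from the package:

```c
#include <stdio.h>
#include "ggml.h"   // assumed include path within this package, for enum lm_ggml_log_level

/* Hypothetical log sink that switches on the renumbered levels instead of
 * comparing raw integers; NONE is treated as plain, unprefixed output. */
static void example_log_sink(enum lm_ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    switch (level) {
        case LM_GGML_LOG_LEVEL_ERROR: fprintf(stderr, "E: %s", text); break;
        case LM_GGML_LOG_LEVEL_WARN:  fprintf(stderr, "W: %s", text); break;
        case LM_GGML_LOG_LEVEL_INFO:  fprintf(stderr, "I: %s", text); break;
        case LM_GGML_LOG_LEVEL_DEBUG: fprintf(stderr, "D: %s", text); break;
        case LM_GGML_LOG_LEVEL_NONE:  fprintf(stderr, "%s",    text); break;
        default: break;
    }
}
```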
package/cpp/llama-impl.h CHANGED
@@ -24,6 +24,7 @@ LLAMA_ATTRIBUTE_FORMAT(2, 3)
  void llama_log_internal (lm_ggml_log_level level, const char * format, ...);
  void llama_log_callback_default(lm_ggml_log_level level, const char * text, void * user_data);

+ #define LLAMA_LOG(...) llama_log_internal(LM_GGML_LOG_LEVEL_NONE , __VA_ARGS__)
  #define LLAMA_LOG_INFO(...) llama_log_internal(LM_GGML_LOG_LEVEL_INFO , __VA_ARGS__)
  #define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__)
  #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
@@ -346,14 +346,6 @@ void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler
  p->samplers.push_back(smpl);
  }

- llama_sampler_timings llama_sampler_chain_timings(struct llama_sampler * chain) {
- auto * p = (llama_sampler_chain *) chain->ctx;
- struct llama_sampler_timings result = {
- p -> t_sample_us,
- p -> n_sample
- };
- return result;
- }

  struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) {
  const auto * p = (const llama_sampler_chain *) chain->ctx;