cui-llama.rn 1.1.5 → 1.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +1 -0
- package/android/src/main/jni.cpp +1 -2
- package/cpp/common.cpp +157 -53
- package/cpp/common.h +11 -3
- package/cpp/ggml-metal.m +33 -22
- package/cpp/ggml-quants.c +33 -36
- package/cpp/ggml.h +5 -4
- package/cpp/llama-impl.h +1 -0
- package/cpp/llama-sampling.cpp +0 -8
- package/cpp/llama.cpp +519 -34
- package/cpp/llama.h +0 -17
- package/cpp/log.cpp +401 -0
- package/cpp/log.h +85 -703
- package/cpp/rn-llama.hpp +7 -10
- package/cpp/sampling.cpp +1 -5
- package/cpp/sgemm.cpp +38 -0
- package/package.json +1 -1
package/cpp/ggml-metal.m
CHANGED
@@ -13,13 +13,16 @@
|
|
13
13
|
#define MAX(a, b) ((a) > (b) ? (a) : (b))
|
14
14
|
|
15
15
|
#ifdef LM_GGML_METAL_NDEBUG
|
16
|
+
#define LM_GGML_METAL_LOG(...)
|
16
17
|
#define LM_GGML_METAL_LOG_INFO(...)
|
17
18
|
#define LM_GGML_METAL_LOG_WARN(...)
|
18
19
|
#define LM_GGML_METAL_LOG_ERROR(...)
|
19
20
|
#else
|
21
|
+
#define LM_GGML_METAL_LOG(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_NONE, __VA_ARGS__)
|
20
22
|
#define LM_GGML_METAL_LOG_INFO(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_INFO, __VA_ARGS__)
|
21
23
|
#define LM_GGML_METAL_LOG_WARN(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_WARN, __VA_ARGS__)
|
22
24
|
#define LM_GGML_METAL_LOG_ERROR(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
25
|
+
#define LM_GGML_METAL_LOG_DEBUG(...) lm_ggml_metal_log(LM_GGML_LOG_LEVEL_DEBUG, __VA_ARGS__)
|
23
26
|
#endif
|
24
27
|
|
25
28
|
#define UNUSED(x) (void)(x)
|
@@ -3183,7 +3186,7 @@ static void lm_ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_
|
|
3183
3186
|
#ifndef LM_GGML_METAL_NDEBUG
|
3184
3187
|
#if TARGET_OS_OSX || (TARGET_OS_IOS && __clang_major__ >= 15)
|
3185
3188
|
if (@available(macOS 10.12, iOS 16.0, *)) {
|
3186
|
-
|
3189
|
+
LM_GGML_METAL_LOG_DEBUG("%s: allocated buffer, size = %8.2f MiB, (%8.2f / %8.2f)\n",
|
3187
3190
|
__func__,
|
3188
3191
|
size_aligned / 1024.0 / 1024.0,
|
3189
3192
|
device.currentAllocatedSize / 1024.0 / 1024.0,
|
@@ -3191,8 +3194,6 @@ static void lm_ggml_backend_metal_log_allocated_size(id<MTLDevice> device, size_
|
|
3191
3194
|
|
3192
3195
|
if (device.currentAllocatedSize > device.recommendedMaxWorkingSetSize) {
|
3193
3196
|
LM_GGML_METAL_LOG_WARN("%s: warning: current allocated size is greater than the recommended max working set size\n", __func__);
|
3194
|
-
} else {
|
3195
|
-
LM_GGML_METAL_LOG_INFO("\n");
|
3196
3197
|
}
|
3197
3198
|
} else {
|
3198
3199
|
LM_GGML_METAL_LOG_INFO("%s: allocated buffer, size = %8.2f MiB, (%8.2f)\n",
|
@@ -3224,15 +3225,19 @@ LM_GGML_CALL static lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_type_a
|
|
3224
3225
|
ctx->n_buffers = 1;
|
3225
3226
|
|
3226
3227
|
if (ctx->all_data != NULL) {
|
3227
|
-
ctx->buffers[0].data
|
3228
|
-
ctx->buffers[0].size
|
3229
|
-
ctx->buffers[0].metal =
|
3230
|
-
|
3231
|
-
|
3232
|
-
|
3228
|
+
ctx->buffers[0].data = ctx->all_data;
|
3229
|
+
ctx->buffers[0].size = size;
|
3230
|
+
ctx->buffers[0].metal = nil;
|
3231
|
+
|
3232
|
+
if (size_aligned > 0) {
|
3233
|
+
ctx->buffers[0].metal = [device newBufferWithBytesNoCopy:ctx->all_data
|
3234
|
+
length:size_aligned
|
3235
|
+
options:MTLResourceStorageModeShared
|
3236
|
+
deallocator:nil];
|
3237
|
+
}
|
3233
3238
|
}
|
3234
3239
|
|
3235
|
-
if (ctx->all_data == NULL || ctx->buffers[0].metal == nil) {
|
3240
|
+
if (size_aligned > 0 && (ctx->all_data == NULL || ctx->buffers[0].metal == nil)) {
|
3236
3241
|
LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
|
3237
3242
|
free(ctx);
|
3238
3243
|
lm_ggml_backend_metal_free_device();
|
@@ -3309,14 +3314,17 @@ LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_from_ptr(void
|
|
3309
3314
|
|
3310
3315
|
// the buffer fits into the max buffer size allowed by the device
|
3311
3316
|
if (size_aligned <= device.maxBufferLength) {
|
3312
|
-
ctx->buffers[ctx->n_buffers].data
|
3313
|
-
ctx->buffers[ctx->n_buffers].size
|
3317
|
+
ctx->buffers[ctx->n_buffers].data = data;
|
3318
|
+
ctx->buffers[ctx->n_buffers].size = size;
|
3319
|
+
ctx->buffers[ctx->n_buffers].metal = nil;
|
3314
3320
|
|
3315
|
-
|
3321
|
+
if (size_aligned > 0) {
|
3322
|
+
ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:data length:size_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
3316
3323
|
|
3317
|
-
|
3318
|
-
|
3319
|
-
|
3324
|
+
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
3325
|
+
LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_aligned / 1024.0 / 1024.0);
|
3326
|
+
return false;
|
3327
|
+
}
|
3320
3328
|
}
|
3321
3329
|
|
3322
3330
|
lm_ggml_backend_metal_log_allocated_size(device, size_aligned);
|
@@ -3332,14 +3340,17 @@ LM_GGML_CALL lm_ggml_backend_buffer_t lm_ggml_backend_metal_buffer_from_ptr(void
|
|
3332
3340
|
for (size_t i = 0; i < size; i += size_step) {
|
3333
3341
|
const size_t size_step_aligned = (i + size_view <= size) ? size_view : (size_aligned - i);
|
3334
3342
|
|
3335
|
-
ctx->buffers[ctx->n_buffers].data
|
3336
|
-
ctx->buffers[ctx->n_buffers].size
|
3343
|
+
ctx->buffers[ctx->n_buffers].data = (void *) ((uint8_t *) data + i);
|
3344
|
+
ctx->buffers[ctx->n_buffers].size = size_step_aligned;
|
3345
|
+
ctx->buffers[ctx->n_buffers].metal = nil;
|
3337
3346
|
|
3338
|
-
|
3347
|
+
if (size_step_aligned > 0) {
|
3348
|
+
ctx->buffers[ctx->n_buffers].metal = [device newBufferWithBytesNoCopy:(void *) ((uint8_t *) data + i) length:size_step_aligned options:MTLResourceStorageModeShared deallocator:nil];
|
3339
3349
|
|
3340
|
-
|
3341
|
-
|
3342
|
-
|
3350
|
+
if (ctx->buffers[ctx->n_buffers].metal == nil) {
|
3351
|
+
LM_GGML_METAL_LOG_ERROR("%s: error: failed to allocate buffer, size = %8.2f MiB\n", __func__, size_step_aligned / 1024.0 / 1024.0);
|
3352
|
+
return false;
|
3353
|
+
}
|
3343
3354
|
}
|
3344
3355
|
|
3345
3356
|
lm_ggml_backend_metal_log_allocated_size(device, size_step_aligned);
|
package/cpp/ggml-quants.c
CHANGED
@@ -230,6 +230,12 @@ static inline __m128i packNibbles( __m128i bytes1, __m128i bytes2 )
|
|
230
230
|
|
231
231
|
return _mm_packus_epi16( bytes1, bytes2);
|
232
232
|
}
|
233
|
+
|
234
|
+
static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
|
235
|
+
const __m128i ax = _mm_sign_epi8(x, x);
|
236
|
+
const __m128i sy = _mm_sign_epi8(y, x);
|
237
|
+
return _mm_maddubs_epi16(ax, sy);
|
238
|
+
}
|
233
239
|
#endif
|
234
240
|
#elif defined(__SSSE3__)
|
235
241
|
// horizontally add 4x4 floats
|
@@ -4206,37 +4212,37 @@ void lm_ggml_vec_dot_q4_0_q8_0(int n, float * restrict s, size_t bs, const void
|
|
4206
4212
|
|
4207
4213
|
sumf = hsum_float_8(acc);
|
4208
4214
|
#elif defined(__AVX__)
|
4209
|
-
|
4210
|
-
__m256 acc = _mm256_setzero_ps();
|
4211
|
-
|
4212
|
-
// Main loop
|
4213
|
-
for (; ib < nb; ++ib) {
|
4214
|
-
// Compute combined scale for the block
|
4215
|
-
const __m256 d = _mm256_set1_ps( LM_GGML_FP16_TO_FP32(x[ib].d) * LM_GGML_FP16_TO_FP32(y[ib].d) );
|
4216
|
-
|
4217
|
-
const __m128i lowMask = _mm_set1_epi8(0xF);
|
4218
|
-
const __m128i off = _mm_set1_epi8(8);
|
4219
|
-
|
4220
|
-
const __m128i tmp = _mm_loadu_si128((const __m128i *)x[ib].qs);
|
4221
|
-
|
4222
|
-
__m128i bx_0 = _mm_and_si128(lowMask, tmp);
|
4223
|
-
__m128i by_0 = _mm_loadu_si128((const __m128i *)y[ib].qs);
|
4224
|
-
bx_0 = _mm_sub_epi8(bx_0, off);
|
4225
|
-
const __m128i i32_0 = mul_sum_i8_pairs(bx_0, by_0);
|
4226
|
-
|
4227
|
-
bx_0 = _mm_and_si128(lowMask, _mm_srli_epi64(tmp, 4));
|
4228
|
-
by_0 = _mm_loadu_si128((const __m128i *)(y[ib].qs + 16));
|
4229
|
-
bx_0 = _mm_sub_epi8(bx_0, off);
|
4230
|
-
const __m128i i32_1 = mul_sum_i8_pairs(bx_0, by_0);
|
4215
|
+
const __m128i mone = _mm_set1_epi16(1);
|
4231
4216
|
|
4232
|
-
|
4233
|
-
|
4217
|
+
__m256 accum1 = _mm256_setzero_ps();
|
4218
|
+
__m256 accum2 = _mm256_setzero_ps();
|
4219
|
+
for (; ib + 1 < nb; ib += 2) {
|
4220
|
+
const __m128i q4bits_1 = _mm_loadu_si128((const __m128i *)x[ib + 0].qs);
|
4221
|
+
const __m128i q4bits_2 = _mm_loadu_si128((const __m128i *)x[ib + 1].qs);
|
4222
|
+
const __m128i q8b_1_0 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs);
|
4223
|
+
const __m128i q8b_1_1 = _mm_loadu_si128((const __m128i *)y[ib + 0].qs + 1);
|
4224
|
+
const __m128i q8b_2_0 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs);
|
4225
|
+
const __m128i q8b_2_1 = _mm_loadu_si128((const __m128i *)y[ib + 1].qs + 1);
|
4234
4226
|
|
4235
|
-
|
4236
|
-
|
4227
|
+
const __m128i q4b_1_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_1), _mm_set1_epi8(8));
|
4228
|
+
const __m128i q4b_1_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_1, 4)), _mm_set1_epi8(8));
|
4229
|
+
const __m128i q4b_2_0 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), q4bits_2), _mm_set1_epi8(8));
|
4230
|
+
const __m128i q4b_2_1 = _mm_sub_epi8(_mm_and_si128(_mm_set1_epi8(15), _mm_srli_epi16(q4bits_2, 4)), _mm_set1_epi8(8));
|
4231
|
+
const __m128i p16_1_0 = mul_add_epi8_sse(q4b_1_0, q8b_1_0);
|
4232
|
+
const __m128i p16_1_1 = mul_add_epi8_sse(q4b_1_1, q8b_1_1);
|
4233
|
+
const __m128i p16_2_0 = mul_add_epi8_sse(q4b_2_0, q8b_2_0);
|
4234
|
+
const __m128i p16_2_1 = mul_add_epi8_sse(q4b_2_1, q8b_2_1);
|
4235
|
+
const __m128i p_1_0 = _mm_madd_epi16(p16_1_0, mone);
|
4236
|
+
const __m128i p_1_1 = _mm_madd_epi16(p16_1_1, mone);
|
4237
|
+
const __m128i p_2_0 = _mm_madd_epi16(p16_2_0, mone);
|
4238
|
+
const __m128i p_2_1 = _mm_madd_epi16(p16_2_1, mone);
|
4239
|
+
accum1 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(LM_GGML_FP16_TO_FP32(y[ib + 0].d)*LM_GGML_FP16_TO_FP32(x[ib + 0].d)),
|
4240
|
+
_mm256_cvtepi32_ps(MM256_SET_M128I(p_1_1, p_1_0))), accum1);
|
4241
|
+
accum2 = _mm256_add_ps(_mm256_mul_ps(_mm256_set1_ps(LM_GGML_FP16_TO_FP32(y[ib + 1].d)*LM_GGML_FP16_TO_FP32(x[ib + 1].d)),
|
4242
|
+
_mm256_cvtepi32_ps(MM256_SET_M128I(p_2_1, p_2_0))), accum2);
|
4237
4243
|
}
|
4238
4244
|
|
4239
|
-
sumf = hsum_float_8(
|
4245
|
+
sumf = hsum_float_8(_mm256_add_ps(accum1, accum2));
|
4240
4246
|
#elif defined(__SSSE3__)
|
4241
4247
|
// set constants
|
4242
4248
|
const __m128i lowMask = _mm_set1_epi8(0xF);
|
@@ -11819,15 +11825,6 @@ void lm_ggml_vec_dot_iq3_s_q8_K (int n, float * restrict s, size_t bs, const voi
|
|
11819
11825
|
#endif
|
11820
11826
|
}
|
11821
11827
|
|
11822
|
-
|
11823
|
-
#if defined(__AVX__)
|
11824
|
-
static inline __m128i mul_add_epi8_sse(const __m128i x, const __m128i y) {
|
11825
|
-
const __m128i ax = _mm_sign_epi8(x, x);
|
11826
|
-
const __m128i sy = _mm_sign_epi8(y, x);
|
11827
|
-
return _mm_maddubs_epi16(ax, sy);
|
11828
|
-
}
|
11829
|
-
#endif
|
11830
|
-
|
11831
11828
|
#if defined(__AVX2__)
|
11832
11829
|
static inline __m256i mul_add_epi8(const __m256i x, const __m256i y) {
|
11833
11830
|
const __m256i ax = _mm256_sign_epi8(x, x);
|
package/cpp/ggml.h
CHANGED
@@ -564,10 +564,11 @@ extern "C" {
|
|
564
564
|
};
|
565
565
|
|
566
566
|
enum lm_ggml_log_level {
|
567
|
-
|
568
|
-
|
569
|
-
|
570
|
-
|
567
|
+
LM_GGML_LOG_LEVEL_NONE = 0,
|
568
|
+
LM_GGML_LOG_LEVEL_INFO = 1,
|
569
|
+
LM_GGML_LOG_LEVEL_WARN = 2,
|
570
|
+
LM_GGML_LOG_LEVEL_ERROR = 3,
|
571
|
+
LM_GGML_LOG_LEVEL_DEBUG = 4,
|
571
572
|
};
|
572
573
|
|
573
574
|
enum lm_ggml_tensor_flag {
|
package/cpp/llama-impl.h
CHANGED
@@ -24,6 +24,7 @@ LLAMA_ATTRIBUTE_FORMAT(2, 3)
|
|
24
24
|
void llama_log_internal (lm_ggml_log_level level, const char * format, ...);
|
25
25
|
void llama_log_callback_default(lm_ggml_log_level level, const char * text, void * user_data);
|
26
26
|
|
27
|
+
#define LLAMA_LOG(...) llama_log_internal(LM_GGML_LOG_LEVEL_NONE , __VA_ARGS__)
|
27
28
|
#define LLAMA_LOG_INFO(...) llama_log_internal(LM_GGML_LOG_LEVEL_INFO , __VA_ARGS__)
|
28
29
|
#define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__)
|
29
30
|
#define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
|
package/cpp/llama-sampling.cpp
CHANGED
@@ -346,14 +346,6 @@ void llama_sampler_chain_add(struct llama_sampler * chain, struct llama_sampler
|
|
346
346
|
p->samplers.push_back(smpl);
|
347
347
|
}
|
348
348
|
|
349
|
-
llama_sampler_timings llama_sampler_chain_timings(struct llama_sampler * chain) {
|
350
|
-
auto * p = (llama_sampler_chain *) chain->ctx;
|
351
|
-
struct llama_sampler_timings result = {
|
352
|
-
p -> t_sample_us,
|
353
|
-
p -> n_sample
|
354
|
-
};
|
355
|
-
return result;
|
356
|
-
}
|
357
349
|
|
358
350
|
struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i) {
|
359
351
|
const auto * p = (const llama_sampler_chain *) chain->ctx;
|