cui-llama.rn 1.0.7 → 1.0.9
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/jni.cpp +1 -1
- package/cpp/common.cpp +67 -28
- package/cpp/common.h +23 -4
- package/cpp/ggml-aarch64.c +16 -14
- package/cpp/ggml-backend.c +15 -10
- package/cpp/ggml-impl.h +4 -6
- package/cpp/ggml-metal.h +2 -0
- package/cpp/ggml-metal.m +54 -21
- package/cpp/ggml-quants.c +2 -2
- package/cpp/ggml-quants.h +4 -0
- package/cpp/ggml.c +37 -12
- package/cpp/ggml.h +6 -4
- package/cpp/llama-impl.h +15 -0
- package/cpp/llama-vocab.cpp +10 -16
- package/cpp/llama-vocab.h +2 -0
- package/cpp/llama.cpp +432 -265
- package/cpp/llama.h +4 -1
- package/cpp/rn-llama.hpp +3 -4
- package/package.json +1 -1
package/cpp/ggml-quants.h
CHANGED
@@ -127,6 +127,10 @@ void iq2xs_free_impl(enum lm_ggml_type type);
 void iq3xs_init_impl(int grid_size);
 void iq3xs_free_impl(int grid_size);
 
+#if defined(__ARM_FEATURE_SVE)
+extern int lm_ggml_sve_cnt_b;
+#endif
+
 #ifdef __cplusplus
 }
 #endif
package/cpp/ggml.c
CHANGED
@@ -37,6 +37,9 @@
 #include <unistd.h>
 #endif
 
+#if defined(__ARM_FEATURE_SVE)
+int lm_ggml_sve_cnt_b = 0;
+#endif
 #if defined(__ARM_FEATURE_SVE) || defined(__ARM_FEATURE_MATMUL_INT8)
 #undef LM_GGML_USE_LLAMAFILE
 #endif
@@ -53,6 +56,9 @@
 // disable POSIX deprecation warnings
 // these functions are never going away, anyway
 #pragma warning(disable: 4996)
+
+// unreachable code because of multiple instances of code after LM_GGML_ABORT
+#pragma warning(disable: 4702)
 #endif
 
 #if defined(_WIN32)
@@ -185,7 +191,7 @@ static void lm_ggml_print_backtrace_symbols(void) {
         fprintf(stderr, "%d: %p %s\n", idx, addr, symbol);
     }
 }
-#elif defined(__linux__)
+#elif defined(__linux__) && defined(__GLIBC__)
 #include <execinfo.h>
 static void lm_ggml_print_backtrace_symbols(void) {
     // void * trace[100];
@@ -480,9 +486,16 @@ void lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t * x, float * y, int64_t n) {
     }
 }
 
+void lm_ggml_fp32_to_bf16_row_ref(const float * x, lm_ggml_bf16_t * y, int64_t n) {
+    for (int i = 0; i < n; i++) {
+        y[i] = lm_ggml_compute_fp32_to_bf16(x[i]);
+    }
+}
+
 void lm_ggml_fp32_to_bf16_row(const float * x, lm_ggml_bf16_t * y, int64_t n) {
   int i = 0;
 #if defined(__AVX512BF16__)
+  // subnormals are flushed to zero on this platform
   for (; i + 32 <= n; i += 32) {
         _mm512_storeu_si512(
             (__m512i *)(y + i),
@@ -962,7 +975,7 @@ static const lm_ggml_type_traits_t type_traits[LM_GGML_TYPE_COUNT] = {
         .is_quantized = false,
         .to_float = (lm_ggml_to_float_t) lm_ggml_bf16_to_fp32_row,
         .from_float = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row,
-        .from_float_ref = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row,
+        .from_float_ref = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row_ref,
         .vec_dot = (lm_ggml_vec_dot_t) lm_ggml_vec_dot_bf16,
         .vec_dot_type = LM_GGML_TYPE_BF16,
         .nrows = 1,
@@ -2302,7 +2315,7 @@ inline static void lm_ggml_vec_abs_f32 (const int n, float * y, const float * x
 inline static void lm_ggml_vec_sgn_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : ((x[i] < 0.f) ? -1.f : 0.f); }
 inline static void lm_ggml_vec_step_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? 1.f : 0.f; }
 inline static void lm_ggml_vec_tanh_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = tanhf(x[i]); }
-inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expf(x[i])-1; }
+inline static void lm_ggml_vec_elu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : expm1f(x[i]); }
 inline static void lm_ggml_vec_relu_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = (x[i] > 0.f) ? x[i] : 0.f; }
 inline static void lm_ggml_vec_leaky_relu_f32 (const int n, float * y, const float * x, const float ns) { for (int i = 0; i < n; ++i) y[i] = ((x[i] > 0.f) ? x[i] : 0.f) + ns * ((x[i] < 0.0f) ? x[i] : 0.f); }
 inline static void lm_ggml_vec_sigmoid_f32 (const int n, float * y, const float * x) { for (int i = 0; i < n; ++i) y[i] = 1.f / (1.f + expf(-x[i])); }
@@ -3551,6 +3564,12 @@ struct lm_ggml_context * lm_ggml_init(struct lm_ggml_init_params params) {
 
     LM_GGML_ASSERT_ALIGNED(ctx->mem_buffer);
 
+#if defined(__ARM_FEATURE_SVE)
+    if (!lm_ggml_sve_cnt_b) {
+        lm_ggml_sve_cnt_b = PR_SVE_VL_LEN_MASK & prctl(PR_SVE_GET_VL);
+    }
+#endif
+
     LM_GGML_PRINT_DEBUG("%s: context initialized\n", __func__);
 
     lm_ggml_critical_section_end();
@@ -3705,7 +3724,8 @@ static struct lm_ggml_tensor * lm_ggml_new_tensor_impl(
         struct lm_ggml_tensor * view_src,
         size_t view_offs) {
 
-
+    LM_GGML_ASSERT(type >= 0 && type < LM_GGML_TYPE_COUNT);
+    LM_GGML_ASSERT(n_dims >= 1 && n_dims <= LM_GGML_MAX_DIMS);
 
     // find the base tensor and absolute offset
     if (view_src != NULL && view_src->view_src != NULL) {
@@ -5358,6 +5378,7 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
     struct lm_ggml_context * ctx,
     struct lm_ggml_tensor * a,
     int n_groups,
+    float eps,
     bool inplace) {
 
     bool is_node = false;
@@ -5368,7 +5389,8 @@ static struct lm_ggml_tensor * lm_ggml_group_norm_impl(
 
     struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a);
 
-    result->op_params[0] = n_groups;
+    lm_ggml_set_op_params_i32(result, 0, n_groups);
+    lm_ggml_set_op_params_f32(result, 1, eps);
 
     result->op = LM_GGML_OP_GROUP_NORM;
     result->grad = is_node ? lm_ggml_dup_tensor(ctx, result) : NULL;
@@ -5380,15 +5402,17 @@ struct lm_ggml_tensor * lm_ggml_group_norm(
 struct lm_ggml_tensor * lm_ggml_group_norm(
     struct lm_ggml_context * ctx,
     struct lm_ggml_tensor * a,
-    int n_groups) {
-    return lm_ggml_group_norm_impl(ctx, a, n_groups, false);
+    int n_groups,
+    float eps) {
+    return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, false);
 }
 
 struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
     struct lm_ggml_context * ctx,
     struct lm_ggml_tensor * a,
-    int n_groups) {
-    return lm_ggml_group_norm_impl(ctx, a, n_groups, true);
+    int n_groups,
+    float eps) {
+    return lm_ggml_group_norm_impl(ctx, a, n_groups, eps, true);
 }
 
 // lm_ggml_mul_mat
@@ -12079,10 +12103,11 @@ static void lm_ggml_compute_forward_group_norm_f32(
 
     LM_GGML_TENSOR_UNARY_OP_LOCALS
 
-    const float eps = 1e-6f; // TODO: make this a parameter
-
     // TODO: optimize
 
+    float eps;
+    memcpy(&eps, dst->op_params + 1, sizeof(float));
+
     int n_channels = src0->ne[2];
     int n_groups = dst->op_params[0];
     int n_channels_per_group = (n_channels + n_groups - 1) / n_groups;
@@ -20650,7 +20675,7 @@ size_t lm_ggml_quantize_chunk(
         case LM_GGML_TYPE_BF16:
            {
                 size_t elemsize = sizeof(lm_ggml_bf16_t);
-                lm_ggml_fp32_to_bf16_row(src + start, (lm_ggml_bf16_t *)dst + start, n);
+                lm_ggml_fp32_to_bf16_row_ref(src + start, (lm_ggml_bf16_t *)dst + start, n);
                 result = n * elemsize;
             } break;
         case LM_GGML_TYPE_F32:
package/cpp/ggml.h
CHANGED
@@ -349,6 +349,7 @@ extern "C" {
     LM_GGML_API lm_ggml_bf16_t lm_ggml_fp32_to_bf16(float);
     LM_GGML_API float lm_ggml_bf16_to_fp32(lm_ggml_bf16_t); // consider just doing << 16
     LM_GGML_API void lm_ggml_bf16_to_fp32_row(const lm_ggml_bf16_t *, float *, int64_t);
+    LM_GGML_API void lm_ggml_fp32_to_bf16_row_ref(const float *, lm_ggml_bf16_t *, int64_t);
     LM_GGML_API void lm_ggml_fp32_to_bf16_row(const float *, lm_ggml_bf16_t *, int64_t);
 
     struct lm_ggml_object;
@@ -1139,16 +1140,17 @@ extern "C" {
 
     // group normalize along ne0*ne1*n_groups
     // used in stable-diffusion
-    // TODO: eps is hardcoded to 1e-6 for now
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_group_norm(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor * a,
-            int n_groups);
+            int n_groups,
+            float eps);
 
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_group_norm_inplace(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor * a,
-            int n_groups);
+            int n_groups,
+            float eps);
 
     // a - x
     // b - dy
@@ -1455,7 +1457,6 @@ extern "C" {
     // if mode & 2 == 1, GPT-NeoX style
     //
     // b is an int32 vector with size a->ne[2], it contains the positions
-    // c is freq factors (e.g. phi3-128k), (optional)
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor * a,
@@ -1472,6 +1473,7 @@ extern "C" {
             int mode);
 
     // custom RoPE
+    // c is freq factors (e.g. phi3-128k), (optional)
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_ext(
             struct lm_ggml_context * ctx,
             struct lm_ggml_tensor * a,
package/cpp/llama-impl.h
CHANGED
@@ -24,3 +24,18 @@ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void
 #define LLAMA_LOG_INFO(...) llama_log_internal(LM_GGML_LOG_LEVEL_INFO , __VA_ARGS__)
 #define LLAMA_LOG_WARN(...) llama_log_internal(LM_GGML_LOG_LEVEL_WARN , __VA_ARGS__)
 #define LLAMA_LOG_ERROR(...) llama_log_internal(LM_GGML_LOG_LEVEL_ERROR, __VA_ARGS__)
+
+//
+// helpers
+//
+
+static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+    if (search.empty()) {
+        return; // Avoid infinite loop if 'search' is an empty string
+    }
+    size_t pos = 0;
+    while ((pos = s.find(search, pos)) != std::string::npos) {
+        s.replace(pos, search.length(), replace);
+        pos += replace.length();
+    }
+}
package/cpp/llama-vocab.cpp
CHANGED
@@ -16,20 +16,6 @@
 // helpers
 //
 
-static void replace_all(std::string & s, const std::string & search, const std::string & replace) {
-    std::string result;
-    for (size_t pos = 0; ; pos += search.length()) {
-        auto new_pos = s.find(search, pos);
-        if (new_pos == std::string::npos) {
-            result += s.substr(pos, s.size() - pos);
-            break;
-        }
-        result += s.substr(pos, new_pos - pos) + replace;
-        pos = new_pos;
-    }
-    s = std::move(result);
-}
-
 LLAMA_ATTRIBUTE_FORMAT(1, 2)
 static std::string format(const char * fmt, ...) {
     va_list ap;
@@ -816,6 +802,9 @@ struct llm_tokenizer_ugm {
      * the best tokenization.
      */
     void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
+        // get current size of output (for reversal later)
+        size_t output_size = output.size();
+
         // normalize the input first
         std::string normalized;
         normalize(text, &normalized);
@@ -895,7 +884,7 @@ struct llm_tokenizer_ugm {
         }
 
         // reverse the output since we added tokens starting from the end of the input
-        std::reverse(output.begin(), output.end());
+        std::reverse(output.begin() + output_size, output.end());
     }
 
 private:
@@ -1444,7 +1433,8 @@ llama_token_attr llama_token_get_attr_impl(const struct llama_vocab & vocab, lla
 bool llama_token_is_eog_impl(const struct llama_vocab & vocab, llama_token token) {
     return token != -1 && (
         token == llama_token_eos_impl(vocab) ||
-        token == llama_token_eot_impl(vocab)
+        token == llama_token_eot_impl(vocab) ||
+        token == llama_token_eom_impl(vocab)
     );
 }
 
@@ -1500,6 +1490,10 @@ llama_token llama_token_eot_impl(const struct llama_vocab & vocab) {
     return vocab.special_eot_id;
 }
 
+llama_token llama_token_eom_impl(const struct llama_vocab & vocab) {
+    return vocab.special_eom_id;
+}
+
 int32_t llama_tokenize_impl(
         const struct llama_vocab & vocab,
         const char * text,
package/cpp/llama-vocab.h
CHANGED
@@ -45,6 +45,7 @@ struct llama_vocab {
     id special_suffix_id = -1;
     id special_middle_id = -1;
     id special_eot_id = -1; // TODO: move above after "eos_id", and here add "file separator" token
+    id special_eom_id = -1;
 
     // tokenizer flags
     bool tokenizer_add_space_prefix = false;
@@ -101,6 +102,7 @@ llama_token llama_token_prefix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_middle_impl(const struct llama_vocab & vocab);
 llama_token llama_token_suffix_impl(const struct llama_vocab & vocab);
 llama_token llama_token_eot_impl (const struct llama_vocab & vocab);
+llama_token llama_token_eom_impl (const struct llama_vocab & vocab);
 
 int32_t llama_tokenize_impl(
         const struct llama_vocab & vocab,