cui-llama.rn 1.3.3 → 1.3.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/CMakeLists.txt +5 -7
- package/android/src/main/java/com/rnllama/LlamaContext.java +4 -4
- package/android/src/main/jni.cpp +9 -9
- package/cpp/common.cpp +21 -40
- package/cpp/common.h +21 -12
- package/cpp/ggml-backend-impl.h +38 -20
- package/cpp/ggml-backend-reg.cpp +216 -87
- package/cpp/ggml-backend.h +1 -0
- package/cpp/ggml-common.h +42 -48
- package/cpp/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +591 -152
- package/cpp/ggml-cpu-aarch64.h +2 -26
- package/cpp/ggml-cpu-traits.cpp +36 -0
- package/cpp/ggml-cpu-traits.h +38 -0
- package/cpp/ggml-cpu.c +14122 -13971
- package/cpp/ggml-cpu.cpp +618 -715
- package/cpp/ggml-cpu.h +0 -17
- package/cpp/ggml-impl.h +6 -6
- package/cpp/ggml-metal.m +482 -24
- package/cpp/ggml-quants.c +0 -9
- package/cpp/ggml-threading.h +4 -2
- package/cpp/ggml.c +132 -43
- package/cpp/ggml.h +44 -13
- package/cpp/llama-sampling.cpp +35 -90
- package/cpp/llama-vocab.cpp +2 -1
- package/cpp/llama.cpp +737 -233
- package/cpp/llama.h +20 -16
- package/cpp/sampling.cpp +11 -16
- package/cpp/speculative.cpp +4 -0
- package/cpp/unicode.cpp +51 -51
- package/cpp/unicode.h +9 -10
- package/lib/commonjs/index.js +38 -1
- package/lib/commonjs/index.js.map +1 -1
- package/lib/module/index.js +36 -0
- package/lib/module/index.js.map +1 -1
- package/lib/typescript/NativeRNLlama.d.ts +2 -3
- package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
- package/lib/typescript/index.d.ts +36 -2
- package/lib/typescript/index.d.ts.map +1 -1
- package/package.json +1 -1
- package/src/NativeRNLlama.ts +3 -3
- package/src/index.ts +46 -2
- package/cpp/amx/amx.cpp +0 -196
- package/cpp/amx/amx.h +0 -20
- package/cpp/amx/common.h +0 -101
- package/cpp/amx/mmq.cpp +0 -2524
- package/cpp/amx/mmq.h +0 -16
- package/cpp/ggml-aarch64.c +0 -129
- package/cpp/ggml-aarch64.h +0 -19
package/cpp/ggml-quants.c
CHANGED
@@ -5220,15 +5220,6 @@ bool lm_ggml_validate_row_data(enum lm_ggml_type type, const void * data, size_t
             {
                 VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
             } break;
-        case LM_GGML_TYPE_Q4_0_4_4:
-        case LM_GGML_TYPE_Q4_0_4_8:
-            {
-                VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4);
-            } break;
-        case LM_GGML_TYPE_Q4_0_8_8:
-            {
-                VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
-            } break;
 
         case LM_GGML_TYPE_I8:
         case LM_GGML_TYPE_I16:
package/cpp/ggml-threading.h
CHANGED
@@ -1,11 +1,13 @@
 #pragma once
 
+#include "ggml.h"
+
 #ifdef __cplusplus
 extern "C" {
 #endif
 
-void lm_ggml_critical_section_start(void);
-void lm_ggml_critical_section_end(void);
+LM_GGML_API void lm_ggml_critical_section_start(void);
+LM_GGML_API void lm_ggml_critical_section_end(void);
 
 #ifdef __cplusplus
 }
package/cpp/ggml.c
CHANGED
@@ -8,7 +8,10 @@
 
 // FIXME: required here for quantization functions
 #include "ggml-quants.h"
-
+
+#ifdef LM_GGML_USE_CPU_HBM
+#include <hbwmalloc.h>
+#endif
 
 #if defined(_MSC_VER) || defined(__MINGW32__)
 #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -801,32 +804,23 @@ static const struct lm_ggml_type_traits type_traits[LM_GGML_TYPE_COUNT] = {
         .to_float                 = (lm_ggml_to_float_t) lm_ggml_bf16_to_fp32_row,
         .from_float_ref           = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row_ref,
     },
-    [ …
-        .type_name                = " …
-        .blck_size                = …
-        . …
-        . …
-        .is_quantized             = true,
-        .to_float                 = NULL,
-        .from_float_ref           = NULL,
+    [31] = { // LM_GGML_TYPE_Q4_0_4_4
+        .type_name                = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
     },
-    [ …
-        .type_name                = " …
-        .blck_size                = …
-        . …
-        . …
-        .is_quantized             = true,
-        .to_float                 = NULL,
-        .from_float_ref           = NULL,
+    [32] = { // LM_GGML_TYPE_Q4_0_4_8
+        .type_name                = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
     },
-    [ …
-        .type_name                = " …
-        .blck_size                = …
-        . …
-        . …
-        .is_quantized             = true,
-        .to_float                 = NULL,
-        .from_float_ref           = NULL,
+    [33] = { // LM_GGML_TYPE_Q4_0_8_8
+        .type_name                = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
     },
     [LM_GGML_TYPE_TQ1_0] = {
         .type_name                = "tq1_0",
@@ -844,14 +838,23 @@ static const struct lm_ggml_type_traits type_traits[LM_GGML_TYPE_COUNT] = {
         .to_float                 = (lm_ggml_to_float_t) dequantize_row_tq2_0,
         .from_float_ref           = (lm_ggml_from_float_t) quantize_row_tq2_0_ref,
     },
-    [ …
-        .type_name                = " …
-        .blck_size                = …
-        . …
-        . …
-        …
-        …
-        . …
+    [36] = { // LM_GGML_TYPE_IQ4_NL_4_4
+        .type_name                = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+    },
+    [37] = { // LM_GGML_TYPE_IQ4_NL_4_8
+        .type_name                = "TYPE_IQ4_NL_4_8 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
+    },
+    [38] = { // LM_GGML_TYPE_IQ4_NL_8_8
+        .type_name                = "TYPE_IQ4_NL_8_8 REMOVED, use IQ4_NL with runtime repacking",
+        .blck_size                = 0,
+        .type_size                = 0,
+        .is_quantized             = false,
     },
 };
 
@@ -963,6 +966,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
     "POOL_2D_BACK",
     "UPSCALE",
     "PAD",
+    "PAD_REFLECT_1D",
     "ARANGE",
     "TIMESTEP_EMBEDDING",
     "ARGSORT",
@@ -996,7 +1000,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
     "OPT_STEP_ADAMW",
 };
 
-static_assert(LM_GGML_OP_COUNT == 81, "LM_GGML_OP_COUNT != 81");
+static_assert(LM_GGML_OP_COUNT == 82, "LM_GGML_OP_COUNT != 82");
 
 static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
     "none",
@@ -1058,6 +1062,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
     "pool_2d_back(x)",
     "upscale(x)",
     "pad(x)",
+    "pad_reflect_1d(x)",
     "arange(start, stop, step)",
     "timestep_embedding(timesteps, dim, max_period)",
     "argsort(x)",
@@ -1091,7 +1096,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
     "adamw(x)",
 };
 
-static_assert(LM_GGML_OP_COUNT == 81, "LM_GGML_OP_COUNT != 81");
+static_assert(LM_GGML_OP_COUNT == 82, "LM_GGML_OP_COUNT != 82");
 
 static_assert(LM_GGML_OP_POOL_COUNT == 2, "LM_GGML_OP_POOL_COUNT != 2");
 
@@ -1281,9 +1286,6 @@ enum lm_ggml_type lm_ggml_ftype_to_lm_ggml_type(enum lm_ggml_ftype ftype) {
         case LM_GGML_FTYPE_MOSTLY_IQ4_XS:   wtype = LM_GGML_TYPE_IQ4_XS;   break;
         case LM_GGML_FTYPE_MOSTLY_IQ3_S:    wtype = LM_GGML_TYPE_IQ3_S;    break;
         case LM_GGML_FTYPE_MOSTLY_IQ2_S:    wtype = LM_GGML_TYPE_IQ2_S;    break;
-        case LM_GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = LM_GGML_TYPE_Q4_0_4_4; break;
-        case LM_GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = LM_GGML_TYPE_Q4_0_4_8; break;
-        case LM_GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = LM_GGML_TYPE_Q4_0_8_8; break;
         case LM_GGML_FTYPE_UNKNOWN:         wtype = LM_GGML_TYPE_COUNT;    break;
         case LM_GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = LM_GGML_TYPE_COUNT; break;
     }
@@ -3528,15 +3530,18 @@ static struct lm_ggml_tensor * lm_ggml_rope_impl(
         LM_GGML_ASSERT(c->ne[0] >= n_dims / 2);
     }
 
+    int sections[4] = {0, 0, 0, 0};
+
     struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a);
 
-    int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+    int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
     memcpy(params +  5, &freq_base,    sizeof(float));
     memcpy(params +  6, &freq_scale,   sizeof(float));
     memcpy(params +  7, &ext_factor,   sizeof(float));
     memcpy(params +  8, &attn_factor,  sizeof(float));
     memcpy(params +  9, &beta_fast,    sizeof(float));
     memcpy(params + 10, &beta_slow,    sizeof(float));
+    memcpy(params + 11, &sections,     sizeof(int)*4);
     lm_ggml_set_op_params(result, params, sizeof(params));
 
     result->op     = LM_GGML_OP_ROPE;
@@ -3558,6 +3563,53 @@ struct lm_ggml_tensor * lm_ggml_rope(
     );
 }
 
+struct lm_ggml_tensor * lm_ggml_rope_multi(
+        struct lm_ggml_context * ctx,
+        struct lm_ggml_tensor  * a,
+        struct lm_ggml_tensor  * b,
+        struct lm_ggml_tensor  * c,
+        int                      n_dims,
+        int                      sections[4],
+        int                      mode,
+        int                      n_ctx_orig,
+        float                    freq_base,
+        float                    freq_scale,
+        float                    ext_factor,
+        float                    attn_factor,
+        float                    beta_fast,
+        float                    beta_slow) {
+    // Multimodal Rotary Position Embedding
+    LM_GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
+
+    LM_GGML_ASSERT(lm_ggml_is_vector(b));
+    LM_GGML_ASSERT(b->type == LM_GGML_TYPE_I32);
+    LM_GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
+
+    if (c) {
+        LM_GGML_ASSERT(c->type == LM_GGML_TYPE_F32);
+        LM_GGML_ASSERT(c->ne[0] >= n_dims / 2);
+    }
+
+    struct lm_ggml_tensor * result = lm_ggml_dup_tensor(ctx, a);
+
+    int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
+    memcpy(params +  5, &freq_base,    sizeof(float));
+    memcpy(params +  6, &freq_scale,   sizeof(float));
+    memcpy(params +  7, &ext_factor,   sizeof(float));
+    memcpy(params +  8, &attn_factor,  sizeof(float));
+    memcpy(params +  9, &beta_fast,    sizeof(float));
+    memcpy(params + 10, &beta_slow,    sizeof(float));
+    memcpy(&params[11], sections, sizeof(int)*4);
+    lm_ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = LM_GGML_OP_ROPE;
+    result->src[0] = a;
+    result->src[1] = b;
+    result->src[2] = c;
+
+    return result;
+}
+
 struct lm_ggml_tensor * lm_ggml_rope_inplace(
         struct lm_ggml_context * ctx,
         struct lm_ggml_tensor  * a,
@@ -4110,6 +4162,37 @@ struct lm_ggml_tensor * lm_ggml_pad(
     return result;
 }
 
+// lm_ggml_pad_reflect_1d
+
+struct lm_ggml_tensor * lm_ggml_pad_reflect_1d(
+        struct lm_ggml_context * ctx,
+        struct lm_ggml_tensor  * a,
+        int                      p0,
+        int                      p1) {
+    LM_GGML_ASSERT(p0 >= 0);
+    LM_GGML_ASSERT(p1 >= 0);
+
+    LM_GGML_ASSERT(p0 < a->ne[0]); // padding length on each size must be less than the
+    LM_GGML_ASSERT(p1 < a->ne[0]); // existing length of the dimension being padded
+
+    LM_GGML_ASSERT(lm_ggml_is_contiguous(a));
+    LM_GGML_ASSERT(a->type == LM_GGML_TYPE_F32);
+
+    struct lm_ggml_tensor * result = lm_ggml_new_tensor_4d(ctx, a->type,
+            a->ne[0] + p0 + p1,
+            a->ne[1],
+            a->ne[2],
+            a->ne[3]);
+
+    int32_t params[] = { p0, p1 };
+    lm_ggml_set_op_params(result, params, sizeof(params));
+
+    result->op     = LM_GGML_OP_PAD_REFLECT_1D;
+    result->src[0] = a;
+
+    return result;
+}
+
 // lm_ggml_arange
 
 struct lm_ggml_tensor * lm_ggml_arange(
@@ -6284,9 +6367,6 @@ size_t lm_ggml_quantize_chunk(
         case LM_GGML_TYPE_IQ1_M:    result = quantize_iq1_m  (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case LM_GGML_TYPE_IQ4_NL:   result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case LM_GGML_TYPE_IQ4_XS:   result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case LM_GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case LM_GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
-        case LM_GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
         case LM_GGML_TYPE_F16:
             {
                 size_t elemsize = sizeof(lm_ggml_fp16_t);
@@ -6818,7 +6898,16 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
                 (int64_t) info->ne[2] *
                 (int64_t) info->ne[3];
 
-            if (lm_ggml_blck_size(info->type) == 0 …
+            if (lm_ggml_blck_size(info->type) == 0 ) {
+                // this tensor type support have been removed:
+                fprintf(stderr, "%s: tensor '%s' of type %d: %s\n",
+                        __func__, info->name.data, (int) info->type, lm_ggml_type_name(info->type));
+                fclose(file);
+                lm_gguf_free(ctx);
+                return NULL;
+            }
+
+            if (ne % lm_ggml_blck_size(info->type) != 0) {
                 fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
                         __func__, info->name.data, (int) info->type, lm_ggml_type_name(info->type), ne, lm_ggml_blck_size(info->type));
                 fclose(file);
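The hunks above introduce the `LM_GGML_OP_PAD_REFLECT_1D` op and its graph builder `lm_ggml_pad_reflect_1d`. A minimal caller-side sketch, not part of the package and assuming the usual lm_-prefixed ggml context API, of how the new builder could be placed in a graph:

```c
#include "ggml.h"

int main(void) {
    // small scratch context; sizes here are illustrative
    struct lm_ggml_init_params ip = {
        /* .mem_size   = */ 16 * 1024 * 1024,
        /* .mem_buffer = */ NULL,
        /* .no_alloc   = */ false,
    };
    struct lm_ggml_context * ctx = lm_ggml_init(ip);

    // 1-D signal of 8 samples, reflect-padded by 2 on the left and 3 on the right
    struct lm_ggml_tensor * x = lm_ggml_new_tensor_1d(ctx, LM_GGML_TYPE_F32, 8);
    struct lm_ggml_tensor * y = lm_ggml_pad_reflect_1d(ctx, x, 2, 3);

    // per the lm_ggml_new_tensor_4d call in the hunk, y->ne[0] == 8 + 2 + 3
    (void) y;

    lm_ggml_free(ctx);
    return 0;
}
```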
package/cpp/ggml.h
CHANGED
@@ -238,7 +238,9 @@
 #define LM_GGML_EXIT_SUCCESS 0
 #define LM_GGML_EXIT_ABORTED 1
 
-#define LM_GGML_ROPE_TYPE_NEOX 2
+#define LM_GGML_ROPE_TYPE_NEOX   2
+#define LM_GGML_ROPE_TYPE_MROPE  8
+#define LM_GGML_ROPE_TYPE_VISION 24
 
 #define LM_GGUF_MAGIC "GGUF"
 
@@ -385,15 +387,15 @@ extern "C" {
         LM_GGML_TYPE_F64     = 28,
         LM_GGML_TYPE_IQ1_M   = 29,
         LM_GGML_TYPE_BF16    = 30,
-        LM_GGML_TYPE_Q4_0_4_4 = 31,
-        LM_GGML_TYPE_Q4_0_4_8 = 32,
-        LM_GGML_TYPE_Q4_0_8_8 = 33,
+        // LM_GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
+        // LM_GGML_TYPE_Q4_0_4_8 = 32,
+        // LM_GGML_TYPE_Q4_0_8_8 = 33,
         LM_GGML_TYPE_TQ1_0   = 34,
         LM_GGML_TYPE_TQ2_0   = 35,
-        LM_GGML_TYPE_IQ4_NL_4_4 = 36,
+        // LM_GGML_TYPE_IQ4_NL_4_4 = 36,
         // LM_GGML_TYPE_IQ4_NL_4_8 = 37,
         // LM_GGML_TYPE_IQ4_NL_8_8 = 38,
-        LM_GGML_TYPE_COUNT,
+        LM_GGML_TYPE_COUNT   = 39,
     };
 
     // precision
@@ -434,9 +436,6 @@ extern "C" {
         LM_GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
         LM_GGML_FTYPE_MOSTLY_IQ1_M  = 23, // except 1d tensors
         LM_GGML_FTYPE_MOSTLY_BF16   = 24, // except 1d tensors
-        LM_GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
-        LM_GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
-        LM_GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
     };
 
     // available tensor operations:
@@ -500,6 +499,7 @@ extern "C" {
         LM_GGML_OP_POOL_2D_BACK,
         LM_GGML_OP_UPSCALE, // nearest interpolate
         LM_GGML_OP_PAD,
+        LM_GGML_OP_PAD_REFLECT_1D,
         LM_GGML_OP_ARANGE,
         LM_GGML_OP_TIMESTEP_EMBEDDING,
         LM_GGML_OP_ARGSORT,
@@ -1446,6 +1446,22 @@ extern "C" {
             float                 beta_fast,
             float                 beta_slow);
 
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_multi(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,
+            struct lm_ggml_tensor  * b,
+            struct lm_ggml_tensor  * c,
+            int                      n_dims,
+            int                      sections[4],
+            int                      mode,
+            int                      n_ctx_orig,
+            float                    freq_base,
+            float                    freq_scale,
+            float                    ext_factor,
+            float                    attn_factor,
+            float                    beta_fast,
+            float                    beta_slow);
+
     // in-place, returns view(a)
     LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_ext_inplace(
             struct lm_ggml_context * ctx,
@@ -1696,6 +1712,13 @@ extern "C" {
             int                   p2,
             int                   p3);
 
+    // pad each dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
+    LM_GGML_API struct lm_ggml_tensor * lm_ggml_pad_reflect_1d(
+            struct lm_ggml_context * ctx,
+            struct lm_ggml_tensor  * a,
+            int                      p0,
+            int                      p1);
+
     // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
     // timesteps: [N,]
     // return: [N, dim]
@@ -2198,11 +2221,19 @@ extern "C" {
     LM_GGML_API size_t  lm_gguf_get_meta_size(const struct lm_gguf_context * ctx);
     LM_GGML_API void    lm_gguf_get_meta_data(const struct lm_gguf_context * ctx, void * data);
 
-#ifdef __cplusplus
-    // restrict not standard in C++
-#define LM_GGML_RESTRICT
+#ifdef __cplusplus
+    // restrict not standard in C++
+#    if defined(__GNUC__)
+#        define LM_GGML_RESTRICT __restrict__
+#    elif defined(__clang__)
+#        define LM_GGML_RESTRICT __restrict
+#    elif defined(_MSC_VER)
+#        define LM_GGML_RESTRICT __restrict
+#    else
+#        define LM_GGML_RESTRICT
+#    endif
 #else
-#define LM_GGML_RESTRICT restrict
+#    define LM_GGML_RESTRICT restrict
 #endif
 typedef void (*lm_ggml_to_float_t)  (const void  * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int64_t k);
 typedef void (*lm_ggml_from_float_t)(const float * LM_GGML_RESTRICT x, void  * LM_GGML_RESTRICT y, int64_t k);
package/cpp/llama-sampling.cpp
CHANGED
@@ -1397,19 +1397,15 @@ struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab
 // penalties
 
 struct llama_sampler_penalties {
-    const int32_t     n_vocab;
-    const llama_token special_eos_id;
-    const llama_token linefeed_id;
-
     const int32_t penalty_last_n;
     const float   penalty_repeat;
     const float   penalty_freq;
     const float   penalty_present;
 
-    const bool    penalize_nl;
-    const bool    ignore_eos;
-
     ring_buffer<llama_token> prev;
+
+    // a frequency map to count token occurrences
+    std::unordered_map<llama_token, int> token_count;
 };
 
 static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) {
@@ -1422,76 +1418,50 @@ static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_to
         return;
     }
 
-    ctx-> …
-}
-
-static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
-    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
+    ctx->token_count[token]++;
 
-    if …
-
+    // if the ring buffer is full, remove the oldest token
+    if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) {
+        const auto old = ctx->prev.front();
 
-
-        if ( …
-
-        } else {
-            // else, search for the special EOS token
-            for (size_t i = 0; i < cur_p->size; ++i) {
-                if (cur_p->data[i].id == ctx->special_eos_id) {
-                    cur_p->data[i].logit = -INFINITY;
-                    break;
-                }
-            }
+        ctx->token_count[old]--;
+        if (ctx->token_count[old] == 0) {
+            ctx->token_count.erase(old);
         }
     }
 
-     …
-        (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
-        return;
-    }
-
-    bool nl_found = false;
-    size_t nl_idx = 0;
-    float nl_logit = -INFINITY;
-    if (!ctx->penalize_nl) {
-        assert(ctx->linefeed_id >= 0);
+    ctx->prev.push_back(token);
 
-
-
-
-
-
-    } else {
-        // else, search for the linefeed token
-        for (size_t i = 0; i < cur_p->size; ++i) {
-            if (cur_p->data[i].id == ctx->linefeed_id) {
-                nl_found = true;
-                nl_idx = i;
-                nl_logit = cur_p->data[i].logit;
-                break;
-            }
-        }
-    }
+#if 0
+    // sanity check
+    std::unordered_map<llama_token, int> tmp;
+    for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
+        tmp[ctx->prev.rat(i)]++;
     }
 
-
-
-
-
+    assert(ctx->token_count == tmp);
+#endif
+}
+
+static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
+    auto * ctx = (llama_sampler_penalties *) smpl->ctx;
 
-
-
+    if ((ctx->penalty_last_n == 0) ||
+        (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
+        return;
     }
 
     // Apply frequency and presence penalties to the cur_p
     for (size_t i = 0; i < cur_p->size; ++i) {
-        const auto token_iter = token_count.find(cur_p->data[i].id);
-        if (token_iter == token_count.end()) {
+        const auto token_iter = ctx->token_count.find(cur_p->data[i].id);
+        if (token_iter == ctx->token_count.end()) {
            continue;
        }
 
        const int count = token_iter->second;
 
+        assert(count > 0 && count <= ctx->penalty_last_n);
+
        // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
        // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
        if (cur_p->data[i].logit <= 0) {
@@ -1504,30 +1474,21 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok
     }
 
     cur_p->sorted = false;
-
-    if (!ctx->penalize_nl && nl_found) {
-        // restore the logit of the newline token if it was penalized
-        cur_p->data[nl_idx].logit = nl_logit;
-    }
 }
 
 static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
     auto * ctx = (llama_sampler_penalties *) smpl->ctx;
     ctx->prev.clear();
+    ctx->token_count.clear();
 }
 
 static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) {
     const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
     auto * result = llama_sampler_init_penalties(
-        ctx->n_vocab,
-        ctx->special_eos_id,
-        ctx->linefeed_id,
         ctx->penalty_last_n,
         ctx->penalty_repeat,
         ctx->penalty_freq,
-        ctx->penalty_present,
-        ctx->penalize_nl,
-        ctx->ignore_eos);
+        ctx->penalty_present);
 
     // copy the state
     {
@@ -1553,38 +1514,21 @@ static struct llama_sampler_i llama_sampler_penalties_i = {
 };
 
 struct llama_sampler * llama_sampler_init_penalties(
-        int32_t n_vocab,
-        llama_token special_eos_id,
-        llama_token linefeed_id,
         int32_t penalty_last_n,
         float penalty_repeat,
         float penalty_freq,
-        float penalty_present,
-        bool penalize_nl,
-        bool ignore_eos) {
-    if (linefeed_id == LLAMA_TOKEN_NULL) {
-        penalize_nl = true;
-    }
-
-    if (special_eos_id == LLAMA_TOKEN_NULL) {
-        ignore_eos = false;
-    }
-
+        float penalty_present) {
     penalty_last_n = std::max(penalty_last_n, 0);
 
     return new llama_sampler {
         /* .iface = */ &llama_sampler_penalties_i,
         /* .ctx   = */ new llama_sampler_penalties {
-            /* .n_vocab         = */ n_vocab,
-            /* .special_eos_id  = */ special_eos_id,
-            /* .linefeed_id     = */ linefeed_id,
             /* .penalty_last_n  = */ penalty_last_n,
             /* .penalty_repeat  = */ penalty_repeat,
             /* .penalty_freq    = */ penalty_freq,
             /* .penalty_present = */ penalty_present,
-            /* .penalize_nl     = */ penalize_nl,
-            /* .ignore_eos      = */ ignore_eos,
             /* .prev            = */ ring_buffer<llama_token>(penalty_last_n),
+            /* .token_count     = */ {},
         },
     };
 }
@@ -1612,7 +1556,8 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
         if (word.find(str) != std::string::npos) {
             token_sequences.emplace(token_id, std::vector<llama_token>());
         } else {
-            size_t word_len = word.size() …
+            size_t word_len = word.size();
+            size_t str_len = str.size();
             size_t pos = -1;
             while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
                 bool match = true;
package/cpp/llama-vocab.cpp
CHANGED
@@ -418,6 +418,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
             case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
             case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
             case LLAMA_VOCAB_PRE_TYPE_EXAONE:
+            case LLAMA_VOCAB_PRE_TYPE_MINERVA:
                 regex_exprs = {
                     "\\p{N}",
                     "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -737,7 +738,7 @@ struct llm_tokenizer_wpm_session {
         std::vector<std::string> words(1, "");
 
         for (const uint32_t cpt : cpts_nfd) {
-            const auto flags = …
+            const auto flags = unicode_cpt_flags_from_cpt(cpt);
 
             if (flags.is_whitespace) {
                 if (words.back().size()) { // finish previous word if any
|