cui-llama.rn 1.3.3 → 1.3.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (48)
  1. package/android/src/main/CMakeLists.txt +5 -7
  2. package/android/src/main/java/com/rnllama/LlamaContext.java +4 -4
  3. package/android/src/main/jni.cpp +9 -9
  4. package/cpp/common.cpp +21 -40
  5. package/cpp/common.h +21 -12
  6. package/cpp/ggml-backend-impl.h +38 -20
  7. package/cpp/ggml-backend-reg.cpp +216 -87
  8. package/cpp/ggml-backend.h +1 -0
  9. package/cpp/ggml-common.h +42 -48
  10. package/cpp/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +591 -152
  11. package/cpp/ggml-cpu-aarch64.h +2 -26
  12. package/cpp/ggml-cpu-traits.cpp +36 -0
  13. package/cpp/ggml-cpu-traits.h +38 -0
  14. package/cpp/ggml-cpu.c +14122 -13971
  15. package/cpp/ggml-cpu.cpp +618 -715
  16. package/cpp/ggml-cpu.h +0 -17
  17. package/cpp/ggml-impl.h +6 -6
  18. package/cpp/ggml-metal.m +482 -24
  19. package/cpp/ggml-quants.c +0 -9
  20. package/cpp/ggml-threading.h +4 -2
  21. package/cpp/ggml.c +132 -43
  22. package/cpp/ggml.h +44 -13
  23. package/cpp/llama-sampling.cpp +35 -90
  24. package/cpp/llama-vocab.cpp +2 -1
  25. package/cpp/llama.cpp +737 -233
  26. package/cpp/llama.h +20 -16
  27. package/cpp/sampling.cpp +11 -16
  28. package/cpp/speculative.cpp +4 -0
  29. package/cpp/unicode.cpp +51 -51
  30. package/cpp/unicode.h +9 -10
  31. package/lib/commonjs/index.js +38 -1
  32. package/lib/commonjs/index.js.map +1 -1
  33. package/lib/module/index.js +36 -0
  34. package/lib/module/index.js.map +1 -1
  35. package/lib/typescript/NativeRNLlama.d.ts +2 -3
  36. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  37. package/lib/typescript/index.d.ts +36 -2
  38. package/lib/typescript/index.d.ts.map +1 -1
  39. package/package.json +1 -1
  40. package/src/NativeRNLlama.ts +3 -3
  41. package/src/index.ts +46 -2
  42. package/cpp/amx/amx.cpp +0 -196
  43. package/cpp/amx/amx.h +0 -20
  44. package/cpp/amx/common.h +0 -101
  45. package/cpp/amx/mmq.cpp +0 -2524
  46. package/cpp/amx/mmq.h +0 -16
  47. package/cpp/ggml-aarch64.c +0 -129
  48. package/cpp/ggml-aarch64.h +0 -19
package/cpp/ggml-quants.c CHANGED
@@ -5220,15 +5220,6 @@ bool lm_ggml_validate_row_data(enum lm_ggml_type type, const void * data, size_t
5220
5220
  {
5221
5221
  VALIDATE_ROW_DATA_D_F16_IMPL(block_iq4_nl, data, nb);
5222
5222
  } break;
5223
- case LM_GGML_TYPE_Q4_0_4_4:
5224
- case LM_GGML_TYPE_Q4_0_4_8:
5225
- {
5226
- VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x4, data, nbytes / sizeof(block_q4_0x4), 4);
5227
- } break;
5228
- case LM_GGML_TYPE_Q4_0_8_8:
5229
- {
5230
- VALIDATE_ROW_DATA_DVEC_F16_IMPL(block_q4_0x8, data, nbytes / sizeof(block_q4_0x8), 8);
5231
- } break;
5232
5223
 
5233
5224
  case LM_GGML_TYPE_I8:
5234
5225
  case LM_GGML_TYPE_I16:
@@ -1,11 +1,13 @@
1
1
  #pragma once
2
2
 
3
+ #include "ggml.h"
4
+
3
5
  #ifdef __cplusplus
4
6
  extern "C" {
5
7
  #endif
6
8
 
7
- void lm_ggml_critical_section_start(void);
8
- void lm_ggml_critical_section_end(void);
9
+ LM_GGML_API void lm_ggml_critical_section_start(void);
10
+ LM_GGML_API void lm_ggml_critical_section_end(void);
9
11
 
10
12
  #ifdef __cplusplus
11
13
  }
package/cpp/ggml.c CHANGED
@@ -8,7 +8,10 @@
8
8
 
9
9
  // FIXME: required here for quantization functions
10
10
  #include "ggml-quants.h"
11
- #include "ggml-aarch64.h"
11
+
12
+ #ifdef LM_GGML_USE_CPU_HBM
13
+ #include <hbwmalloc.h>
14
+ #endif
12
15
 
13
16
  #if defined(_MSC_VER) || defined(__MINGW32__)
14
17
  #include <malloc.h> // using malloc.h with MSC/MINGW
@@ -801,32 +804,23 @@ static const struct lm_ggml_type_traits type_traits[LM_GGML_TYPE_COUNT] = {
801
804
  .to_float = (lm_ggml_to_float_t) lm_ggml_bf16_to_fp32_row,
802
805
  .from_float_ref = (lm_ggml_from_float_t) lm_ggml_fp32_to_bf16_row_ref,
803
806
  },
804
- [LM_GGML_TYPE_Q4_0_4_4] = {
805
- .type_name = "q4_0_4x4",
806
- .blck_size = QK4_0,
807
- .blck_size_interleave = 4,
808
- .type_size = sizeof(block_q4_0),
809
- .is_quantized = true,
810
- .to_float = NULL,
811
- .from_float_ref = NULL,
807
+ [31] = { // LM_GGML_TYPE_Q4_0_4_4
808
+ .type_name = "TYPE_Q4_0_4_4 REMOVED, use Q4_0 with runtime repacking",
809
+ .blck_size = 0,
810
+ .type_size = 0,
811
+ .is_quantized = false,
812
812
  },
813
- [LM_GGML_TYPE_Q4_0_4_8] = {
814
- .type_name = "q4_0_4x8",
815
- .blck_size = QK4_0,
816
- .blck_size_interleave = 8,
817
- .type_size = sizeof(block_q4_0),
818
- .is_quantized = true,
819
- .to_float = NULL,
820
- .from_float_ref = NULL,
813
+ [32] = { // LM_GGML_TYPE_Q4_0_4_8
814
+ .type_name = "TYPE_Q4_0_4_8 REMOVED, use Q4_0 with runtime repacking",
815
+ .blck_size = 0,
816
+ .type_size = 0,
817
+ .is_quantized = false,
821
818
  },
822
- [LM_GGML_TYPE_Q4_0_8_8] = {
823
- .type_name = "q4_0_8x8",
824
- .blck_size = QK4_0,
825
- .blck_size_interleave = 8,
826
- .type_size = sizeof(block_q4_0),
827
- .is_quantized = true,
828
- .to_float = NULL,
829
- .from_float_ref = NULL,
819
+ [33] = { // LM_GGML_TYPE_Q4_0_8_8
820
+ .type_name = "TYPE_Q4_0_8_8 REMOVED, use Q4_0 with runtime repacking",
821
+ .blck_size = 0,
822
+ .type_size = 0,
823
+ .is_quantized = false,
830
824
  },
831
825
  [LM_GGML_TYPE_TQ1_0] = {
832
826
  .type_name = "tq1_0",
@@ -844,14 +838,23 @@ static const struct lm_ggml_type_traits type_traits[LM_GGML_TYPE_COUNT] = {
844
838
  .to_float = (lm_ggml_to_float_t) dequantize_row_tq2_0,
845
839
  .from_float_ref = (lm_ggml_from_float_t) quantize_row_tq2_0_ref,
846
840
  },
847
- [LM_GGML_TYPE_IQ4_NL_4_4] = {
848
- .type_name = "iq4_nl_4x4",
849
- .blck_size = QK4_NL,
850
- .blck_size_interleave = 4,
851
- .type_size = sizeof(block_iq4_nl),
852
- .is_quantized = true,
853
- .to_float = NULL,
854
- .from_float_ref = NULL,
841
+ [36] = { // LM_GGML_TYPE_IQ4_NL_4_4
842
+ .type_name = "TYPE_IQ4_NL_4_4 REMOVED, use IQ4_NL with runtime repacking",
843
+ .blck_size = 0,
844
+ .type_size = 0,
845
+ .is_quantized = false,
846
+ },
847
+ [37] = { // LM_GGML_TYPE_IQ4_NL_4_8
848
+ .type_name = "TYPE_IQ4_NL_4_8 REMOVED, use IQ4_NL with runtime repacking",
849
+ .blck_size = 0,
850
+ .type_size = 0,
851
+ .is_quantized = false,
852
+ },
853
+ [38] = { // LM_GGML_TYPE_IQ4_NL_8_8
854
+ .type_name = "TYPE_IQ4_NL_8_8 REMOVED, use IQ4_NL with runtime repacking",
855
+ .blck_size = 0,
856
+ .type_size = 0,
857
+ .is_quantized = false,
855
858
  },
856
859
  };
857
860
 
@@ -963,6 +966,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
963
966
  "POOL_2D_BACK",
964
967
  "UPSCALE",
965
968
  "PAD",
969
+ "PAD_REFLECT_1D",
966
970
  "ARANGE",
967
971
  "TIMESTEP_EMBEDDING",
968
972
  "ARGSORT",
@@ -996,7 +1000,7 @@ static const char * LM_GGML_OP_NAME[LM_GGML_OP_COUNT] = {
996
1000
  "OPT_STEP_ADAMW",
997
1001
  };
998
1002
 
999
- static_assert(LM_GGML_OP_COUNT == 81, "LM_GGML_OP_COUNT != 81");
1003
+ static_assert(LM_GGML_OP_COUNT == 82, "LM_GGML_OP_COUNT != 82");
1000
1004
 
1001
1005
  static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
1002
1006
  "none",
@@ -1058,6 +1062,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
1058
1062
  "pool_2d_back(x)",
1059
1063
  "upscale(x)",
1060
1064
  "pad(x)",
1065
+ "pad_reflect_1d(x)",
1061
1066
  "arange(start, stop, step)",
1062
1067
  "timestep_embedding(timesteps, dim, max_period)",
1063
1068
  "argsort(x)",
@@ -1091,7 +1096,7 @@ static const char * LM_GGML_OP_SYMBOL[LM_GGML_OP_COUNT] = {
1091
1096
  "adamw(x)",
1092
1097
  };
1093
1098
 
1094
- static_assert(LM_GGML_OP_COUNT == 81, "LM_GGML_OP_COUNT != 81");
1099
+ static_assert(LM_GGML_OP_COUNT == 82, "LM_GGML_OP_COUNT != 82");
1095
1100
 
1096
1101
  static_assert(LM_GGML_OP_POOL_COUNT == 2, "LM_GGML_OP_POOL_COUNT != 2");
1097
1102
 
@@ -1281,9 +1286,6 @@ enum lm_ggml_type lm_ggml_ftype_to_lm_ggml_type(enum lm_ggml_ftype ftype) {
1281
1286
  case LM_GGML_FTYPE_MOSTLY_IQ4_XS: wtype = LM_GGML_TYPE_IQ4_XS; break;
1282
1287
  case LM_GGML_FTYPE_MOSTLY_IQ3_S: wtype = LM_GGML_TYPE_IQ3_S; break;
1283
1288
  case LM_GGML_FTYPE_MOSTLY_IQ2_S: wtype = LM_GGML_TYPE_IQ2_S; break;
1284
- case LM_GGML_FTYPE_MOSTLY_Q4_0_4_4: wtype = LM_GGML_TYPE_Q4_0_4_4; break;
1285
- case LM_GGML_FTYPE_MOSTLY_Q4_0_4_8: wtype = LM_GGML_TYPE_Q4_0_4_8; break;
1286
- case LM_GGML_FTYPE_MOSTLY_Q4_0_8_8: wtype = LM_GGML_TYPE_Q4_0_8_8; break;
1287
1289
  case LM_GGML_FTYPE_UNKNOWN: wtype = LM_GGML_TYPE_COUNT; break;
1288
1290
  case LM_GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = LM_GGML_TYPE_COUNT; break;
1289
1291
  }
@@ -3528,15 +3530,18 @@ static struct lm_ggml_tensor * lm_ggml_rope_impl(
3528
3530
  LM_GGML_ASSERT(c->ne[0] >= n_dims / 2);
3529
3531
  }
3530
3532
 
3533
+ int sections[4] = {0, 0, 0, 0};
3534
+
3531
3535
  struct lm_ggml_tensor * result = inplace ? lm_ggml_view_tensor(ctx, a) : lm_ggml_dup_tensor(ctx, a);
3532
3536
 
3533
- int32_t params[11] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
3537
+ int32_t params[15] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
3534
3538
  memcpy(params + 5, &freq_base, sizeof(float));
3535
3539
  memcpy(params + 6, &freq_scale, sizeof(float));
3536
3540
  memcpy(params + 7, &ext_factor, sizeof(float));
3537
3541
  memcpy(params + 8, &attn_factor, sizeof(float));
3538
3542
  memcpy(params + 9, &beta_fast, sizeof(float));
3539
3543
  memcpy(params + 10, &beta_slow, sizeof(float));
3544
+ memcpy(params + 11, &sections, sizeof(int)*4);
3540
3545
  lm_ggml_set_op_params(result, params, sizeof(params));
3541
3546
 
3542
3547
  result->op = LM_GGML_OP_ROPE;
@@ -3558,6 +3563,53 @@ struct lm_ggml_tensor * lm_ggml_rope(
3558
3563
  );
3559
3564
  }
3560
3565
 
3566
+ struct lm_ggml_tensor * lm_ggml_rope_multi(
3567
+ struct lm_ggml_context * ctx,
3568
+ struct lm_ggml_tensor * a,
3569
+ struct lm_ggml_tensor * b,
3570
+ struct lm_ggml_tensor * c,
3571
+ int n_dims,
3572
+ int sections[4],
3573
+ int mode,
3574
+ int n_ctx_orig,
3575
+ float freq_base,
3576
+ float freq_scale,
3577
+ float ext_factor,
3578
+ float attn_factor,
3579
+ float beta_fast,
3580
+ float beta_slow) {
3581
+ // Multimodal Rotary Position Embedding
3582
+ LM_GGML_ASSERT((mode & 1) == 0 && "mode & 1 == 1 is no longer supported");
3583
+
3584
+ LM_GGML_ASSERT(lm_ggml_is_vector(b));
3585
+ LM_GGML_ASSERT(b->type == LM_GGML_TYPE_I32);
3586
+ LM_GGML_ASSERT(a->ne[2] * 4 == b->ne[0]); // mrope expecting 4 position ids per token
3587
+
3588
+ if (c) {
3589
+ LM_GGML_ASSERT(c->type == LM_GGML_TYPE_F32);
3590
+ LM_GGML_ASSERT(c->ne[0] >= n_dims / 2);
3591
+ }
3592
+
3593
+ struct lm_ggml_tensor * result = lm_ggml_dup_tensor(ctx, a);
3594
+
3595
+ int32_t params[11 + 4] = { /*n_past*/ 0, n_dims, mode, /*n_ctx*/ 0, n_ctx_orig };
3596
+ memcpy(params + 5, &freq_base, sizeof(float));
3597
+ memcpy(params + 6, &freq_scale, sizeof(float));
3598
+ memcpy(params + 7, &ext_factor, sizeof(float));
3599
+ memcpy(params + 8, &attn_factor, sizeof(float));
3600
+ memcpy(params + 9, &beta_fast, sizeof(float));
3601
+ memcpy(params + 10, &beta_slow, sizeof(float));
3602
+ memcpy(&params[11], sections, sizeof(int)*4);
3603
+ lm_ggml_set_op_params(result, params, sizeof(params));
3604
+
3605
+ result->op = LM_GGML_OP_ROPE;
3606
+ result->src[0] = a;
3607
+ result->src[1] = b;
3608
+ result->src[2] = c;
3609
+
3610
+ return result;
3611
+ }
3612
+
3561
3613
  struct lm_ggml_tensor * lm_ggml_rope_inplace(
3562
3614
  struct lm_ggml_context * ctx,
3563
3615
  struct lm_ggml_tensor * a,
@@ -4110,6 +4162,37 @@ struct lm_ggml_tensor * lm_ggml_pad(
4110
4162
  return result;
4111
4163
  }
4112
4164
 
4165
+ // lm_ggml_pad_reflect_1d
4166
+
4167
+ struct lm_ggml_tensor * lm_ggml_pad_reflect_1d(
4168
+ struct lm_ggml_context * ctx,
4169
+ struct lm_ggml_tensor * a,
4170
+ int p0,
4171
+ int p1) {
4172
+ LM_GGML_ASSERT(p0 >= 0);
4173
+ LM_GGML_ASSERT(p1 >= 0);
4174
+
4175
+ LM_GGML_ASSERT(p0 < a->ne[0]); // padding length on each side must be less than the
4176
+ LM_GGML_ASSERT(p1 < a->ne[0]); // existing length of the dimension being padded
4177
+
4178
+ LM_GGML_ASSERT(lm_ggml_is_contiguous(a));
4179
+ LM_GGML_ASSERT(a->type == LM_GGML_TYPE_F32);
4180
+
4181
+ struct lm_ggml_tensor * result = lm_ggml_new_tensor_4d(ctx, a->type,
4182
+ a->ne[0] + p0 + p1,
4183
+ a->ne[1],
4184
+ a->ne[2],
4185
+ a->ne[3]);
4186
+
4187
+ int32_t params[] = { p0, p1 };
4188
+ lm_ggml_set_op_params(result, params, sizeof(params));
4189
+
4190
+ result->op = LM_GGML_OP_PAD_REFLECT_1D;
4191
+ result->src[0] = a;
4192
+
4193
+ return result;
4194
+ }
4195
+
4113
4196
  // lm_ggml_arange
4114
4197
 
4115
4198
  struct lm_ggml_tensor * lm_ggml_arange(
@@ -6284,9 +6367,6 @@ size_t lm_ggml_quantize_chunk(
6284
6367
  case LM_GGML_TYPE_IQ1_M: result = quantize_iq1_m (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6285
6368
  case LM_GGML_TYPE_IQ4_NL: result = quantize_iq4_nl (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6286
6369
  case LM_GGML_TYPE_IQ4_XS: result = quantize_iq4_xs (src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6287
- case LM_GGML_TYPE_Q4_0_4_4: result = quantize_q4_0_4x4(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6288
- case LM_GGML_TYPE_Q4_0_4_8: result = quantize_q4_0_4x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6289
- case LM_GGML_TYPE_Q4_0_8_8: result = quantize_q4_0_8x8(src + start, (char *) dst + start_row * row_size, nrows, n_per_row, imatrix); break;
6290
6370
  case LM_GGML_TYPE_F16:
6291
6371
  {
6292
6372
  size_t elemsize = sizeof(lm_ggml_fp16_t);
@@ -6818,7 +6898,16 @@ struct lm_gguf_context * lm_gguf_init_from_file(const char * fname, struct lm_gg
6818
6898
  (int64_t) info->ne[2] *
6819
6899
  (int64_t) info->ne[3];
6820
6900
 
6821
- if (lm_ggml_blck_size(info->type) == 0 || ne % lm_ggml_blck_size(info->type) != 0) {
6901
+ if (lm_ggml_blck_size(info->type) == 0 ) {
6902
+ // support for this tensor type has been removed:
6903
+ fprintf(stderr, "%s: tensor '%s' of type %d: %s\n",
6904
+ __func__, info->name.data, (int) info->type, lm_ggml_type_name(info->type));
6905
+ fclose(file);
6906
+ lm_gguf_free(ctx);
6907
+ return NULL;
6908
+ }
6909
+
6910
+ if (ne % lm_ggml_blck_size(info->type) != 0) {
6822
6911
  fprintf(stderr, "%s: tensor '%s' of type %d (%s) number of elements (%" PRId64 ") is not a multiple of block size (%" PRId64 ")\n",
6823
6912
  __func__, info->name.data, (int) info->type, lm_ggml_type_name(info->type), ne, lm_ggml_blck_size(info->type));
6824
6913
  fclose(file);
package/cpp/ggml.h CHANGED
@@ -238,7 +238,9 @@
238
238
  #define LM_GGML_EXIT_SUCCESS 0
239
239
  #define LM_GGML_EXIT_ABORTED 1
240
240
 
241
- #define LM_GGML_ROPE_TYPE_NEOX 2
241
+ #define LM_GGML_ROPE_TYPE_NEOX 2
242
+ #define LM_GGML_ROPE_TYPE_MROPE 8
243
+ #define LM_GGML_ROPE_TYPE_VISION 24
242
244
 
243
245
  #define LM_GGUF_MAGIC "GGUF"
244
246
 
@@ -385,15 +387,15 @@ extern "C" {
385
387
  LM_GGML_TYPE_F64 = 28,
386
388
  LM_GGML_TYPE_IQ1_M = 29,
387
389
  LM_GGML_TYPE_BF16 = 30,
388
- LM_GGML_TYPE_Q4_0_4_4 = 31,
389
- LM_GGML_TYPE_Q4_0_4_8 = 32,
390
- LM_GGML_TYPE_Q4_0_8_8 = 33,
390
+ // LM_GGML_TYPE_Q4_0_4_4 = 31, support has been removed from gguf files
391
+ // LM_GGML_TYPE_Q4_0_4_8 = 32,
392
+ // LM_GGML_TYPE_Q4_0_8_8 = 33,
391
393
  LM_GGML_TYPE_TQ1_0 = 34,
392
394
  LM_GGML_TYPE_TQ2_0 = 35,
393
- LM_GGML_TYPE_IQ4_NL_4_4 = 36,
395
+ // LM_GGML_TYPE_IQ4_NL_4_4 = 36,
394
396
  // LM_GGML_TYPE_IQ4_NL_4_8 = 37,
395
397
  // LM_GGML_TYPE_IQ4_NL_8_8 = 38,
396
- LM_GGML_TYPE_COUNT,
398
+ LM_GGML_TYPE_COUNT = 39,
397
399
  };
398
400
 
399
401
  // precision
@@ -434,9 +436,6 @@ extern "C" {
434
436
  LM_GGML_FTYPE_MOSTLY_IQ4_XS = 22, // except 1d tensors
435
437
  LM_GGML_FTYPE_MOSTLY_IQ1_M = 23, // except 1d tensors
436
438
  LM_GGML_FTYPE_MOSTLY_BF16 = 24, // except 1d tensors
437
- LM_GGML_FTYPE_MOSTLY_Q4_0_4_4 = 25, // except 1d tensors
438
- LM_GGML_FTYPE_MOSTLY_Q4_0_4_8 = 26, // except 1d tensors
439
- LM_GGML_FTYPE_MOSTLY_Q4_0_8_8 = 27, // except 1d tensors
440
439
  };
441
440
 
442
441
  // available tensor operations:
@@ -500,6 +499,7 @@ extern "C" {
500
499
  LM_GGML_OP_POOL_2D_BACK,
501
500
  LM_GGML_OP_UPSCALE, // nearest interpolate
502
501
  LM_GGML_OP_PAD,
502
+ LM_GGML_OP_PAD_REFLECT_1D,
503
503
  LM_GGML_OP_ARANGE,
504
504
  LM_GGML_OP_TIMESTEP_EMBEDDING,
505
505
  LM_GGML_OP_ARGSORT,
@@ -1446,6 +1446,22 @@ extern "C" {
1446
1446
  float beta_fast,
1447
1447
  float beta_slow);
1448
1448
 
1449
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_multi(
1450
+ struct lm_ggml_context * ctx,
1451
+ struct lm_ggml_tensor * a,
1452
+ struct lm_ggml_tensor * b,
1453
+ struct lm_ggml_tensor * c,
1454
+ int n_dims,
1455
+ int sections[4],
1456
+ int mode,
1457
+ int n_ctx_orig,
1458
+ float freq_base,
1459
+ float freq_scale,
1460
+ float ext_factor,
1461
+ float attn_factor,
1462
+ float beta_fast,
1463
+ float beta_slow);
1464
+
1449
1465
  // in-place, returns view(a)
1450
1466
  LM_GGML_API struct lm_ggml_tensor * lm_ggml_rope_ext_inplace(
1451
1467
  struct lm_ggml_context * ctx,
@@ -1696,6 +1712,13 @@ extern "C" {
1696
1712
  int p2,
1697
1713
  int p3);
1698
1714
 
1715
+ // pad the first dimension with reflection: [a, b, c, d] -> [b, a, b, c, d, c]
1716
+ LM_GGML_API struct lm_ggml_tensor * lm_ggml_pad_reflect_1d(
1717
+ struct lm_ggml_context * ctx,
1718
+ struct lm_ggml_tensor * a,
1719
+ int p0,
1720
+ int p1);
1721
+
1699
1722
  // Ref: https://github.com/CompVis/stable-diffusion/blob/main/ldm/modules/diffusionmodules/util.py#L151
1700
1723
  // timesteps: [N,]
1701
1724
  // return: [N, dim]
@@ -2198,11 +2221,19 @@ extern "C" {
2198
2221
  LM_GGML_API size_t lm_gguf_get_meta_size(const struct lm_gguf_context * ctx);
2199
2222
  LM_GGML_API void lm_gguf_get_meta_data(const struct lm_gguf_context * ctx, void * data);
2200
2223
 
2201
- #ifdef __cplusplus
2202
- // restrict not standard in C++
2203
- #define LM_GGML_RESTRICT
2224
+ #ifdef __cplusplus
2225
+ // restrict not standard in C++
2226
+ # if defined(__GNUC__)
2227
+ # define LM_GGML_RESTRICT __restrict__
2228
+ # elif defined(__clang__)
2229
+ # define LM_GGML_RESTRICT __restrict
2230
+ # elif defined(_MSC_VER)
2231
+ # define LM_GGML_RESTRICT __restrict
2232
+ # else
2233
+ # define LM_GGML_RESTRICT
2234
+ # endif
2204
2235
  #else
2205
- #define LM_GGML_RESTRICT restrict
2236
+ # define LM_GGML_RESTRICT restrict
2206
2237
  #endif
2207
2238
  typedef void (*lm_ggml_to_float_t) (const void * LM_GGML_RESTRICT x, float * LM_GGML_RESTRICT y, int64_t k);
2208
2239
  typedef void (*lm_ggml_from_float_t)(const float * LM_GGML_RESTRICT x, void * LM_GGML_RESTRICT y, int64_t k);
@@ -1397,19 +1397,15 @@ struct llama_sampler * llama_sampler_init_grammar_impl(const struct llama_vocab
1397
1397
  // penalties
1398
1398
 
1399
1399
  struct llama_sampler_penalties {
1400
- const int32_t n_vocab;
1401
- const llama_token special_eos_id;
1402
- const llama_token linefeed_id;
1403
-
1404
1400
  const int32_t penalty_last_n;
1405
1401
  const float penalty_repeat;
1406
1402
  const float penalty_freq;
1407
1403
  const float penalty_present;
1408
1404
 
1409
- const bool penalize_nl;
1410
- const bool ignore_eos;
1411
-
1412
1405
  ring_buffer<llama_token> prev;
1406
+
1407
+ // a frequency map to count token occurrences
1408
+ std::unordered_map<llama_token, int> token_count;
1413
1409
  };
1414
1410
 
1415
1411
  static const char * llama_sampler_penalties_name(const struct llama_sampler * /*smpl*/) {
@@ -1422,76 +1418,50 @@ static void llama_sampler_penalties_accept(struct llama_sampler * smpl, llama_to
1422
1418
  return;
1423
1419
  }
1424
1420
 
1425
- ctx->prev.push_back(token);
1426
- }
1427
-
1428
- static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
1429
- auto * ctx = (llama_sampler_penalties *) smpl->ctx;
1421
+ ctx->token_count[token]++;
1430
1422
 
1431
- if (ctx->ignore_eos) {
1432
- assert(ctx->special_eos_id >= 0);
1423
+ // if the ring buffer is full, remove the oldest token
1424
+ if (ctx->prev.size() >= (size_t) ctx->penalty_last_n) {
1425
+ const auto old = ctx->prev.front();
1433
1426
 
1434
- // optimistically check if the candidates are not yet sorted/shuffled/truncated
1435
- if (cur_p->size > (size_t) ctx->special_eos_id && cur_p->data[ctx->special_eos_id].id == ctx->special_eos_id) {
1436
- cur_p->data[ctx->special_eos_id].logit = -INFINITY;
1437
- } else {
1438
- // else, search for the special EOS token
1439
- for (size_t i = 0; i < cur_p->size; ++i) {
1440
- if (cur_p->data[i].id == ctx->special_eos_id) {
1441
- cur_p->data[i].logit = -INFINITY;
1442
- break;
1443
- }
1444
- }
1427
+ ctx->token_count[old]--;
1428
+ if (ctx->token_count[old] == 0) {
1429
+ ctx->token_count.erase(old);
1445
1430
  }
1446
1431
  }
1447
1432
 
1448
- if ((ctx->penalty_last_n == 0) ||
1449
- (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
1450
- return;
1451
- }
1452
-
1453
- bool nl_found = false;
1454
- size_t nl_idx = 0;
1455
- float nl_logit = -INFINITY;
1456
- if (!ctx->penalize_nl) {
1457
- assert(ctx->linefeed_id >= 0);
1433
+ ctx->prev.push_back(token);
1458
1434
 
1459
- // optimistically check if the candidates are not yet sorted/shuffled/truncated
1460
- if (cur_p->size > (size_t) ctx->linefeed_id && cur_p->data[ctx->linefeed_id].id == ctx->linefeed_id) {
1461
- nl_found = true;
1462
- nl_idx = ctx->linefeed_id;
1463
- nl_logit = cur_p->data[ctx->linefeed_id].logit;
1464
- } else {
1465
- // else, search for the linefeed token
1466
- for (size_t i = 0; i < cur_p->size; ++i) {
1467
- if (cur_p->data[i].id == ctx->linefeed_id) {
1468
- nl_found = true;
1469
- nl_idx = i;
1470
- nl_logit = cur_p->data[i].logit;
1471
- break;
1472
- }
1473
- }
1474
- }
1435
+ #if 0
1436
+ // sanity check
1437
+ std::unordered_map<llama_token, int> tmp;
1438
+ for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
1439
+ tmp[ctx->prev.rat(i)]++;
1475
1440
  }
1476
1441
 
1477
- // Create a frequency map to count occurrences of each token in last_tokens
1478
- // TODO: optimize this by maintaining the token count in the sampler context
1479
- using llama_token_cnt = std::unordered_map<llama_token, int>;
1480
- llama_token_cnt token_count;
1442
+ assert(ctx->token_count == tmp);
1443
+ #endif
1444
+ }
1445
+
1446
+ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_token_data_array * cur_p) {
1447
+ auto * ctx = (llama_sampler_penalties *) smpl->ctx;
1481
1448
 
1482
- for (int i = 0; i < std::min<int>(ctx->penalty_last_n, ctx->prev.size()); ++i) {
1483
- token_count[ctx->prev.rat(i)]++;
1449
+ if ((ctx->penalty_last_n == 0) ||
1450
+ (ctx->penalty_repeat == 1.0f && ctx->penalty_freq == 0.0f && ctx->penalty_present == 0.0f)) {
1451
+ return;
1484
1452
  }
1485
1453
 
1486
1454
  // Apply frequency and presence penalties to the cur_p
1487
1455
  for (size_t i = 0; i < cur_p->size; ++i) {
1488
- const auto token_iter = token_count.find(cur_p->data[i].id);
1489
- if (token_iter == token_count.end()) {
1456
+ const auto token_iter = ctx->token_count.find(cur_p->data[i].id);
1457
+ if (token_iter == ctx->token_count.end()) {
1490
1458
  continue;
1491
1459
  }
1492
1460
 
1493
1461
  const int count = token_iter->second;
1494
1462
 
1463
+ assert(count > 0 && count <= ctx->penalty_last_n);
1464
+
1495
1465
  // The academic publication that described this technique actually just only divided, but that would cause tokens with negative logits to become more likely, which is obviously wrong.
1496
1466
  // This is common fix for this problem, which is to multiply by the penalty instead of dividing.
1497
1467
  if (cur_p->data[i].logit <= 0) {
@@ -1504,30 +1474,21 @@ static void llama_sampler_penalties_apply(struct llama_sampler * smpl, llama_tok
1504
1474
  }
1505
1475
 
1506
1476
  cur_p->sorted = false;
1507
-
1508
- if (!ctx->penalize_nl && nl_found) {
1509
- // restore the logit of the newline token if it was penalized
1510
- cur_p->data[nl_idx].logit = nl_logit;
1511
- }
1512
1477
  }
1513
1478
 
1514
1479
  static void llama_sampler_penalties_reset(struct llama_sampler * smpl) {
1515
1480
  auto * ctx = (llama_sampler_penalties *) smpl->ctx;
1516
1481
  ctx->prev.clear();
1482
+ ctx->token_count.clear();
1517
1483
  }
1518
1484
 
1519
1485
  static struct llama_sampler * llama_sampler_penalties_clone(const struct llama_sampler * smpl) {
1520
1486
  const auto * ctx = (const llama_sampler_penalties *) smpl->ctx;
1521
1487
  auto * result = llama_sampler_init_penalties(
1522
- ctx->n_vocab,
1523
- ctx->special_eos_id,
1524
- ctx->linefeed_id,
1525
1488
  ctx->penalty_last_n,
1526
1489
  ctx->penalty_repeat,
1527
1490
  ctx->penalty_freq,
1528
- ctx->penalty_present,
1529
- ctx->penalize_nl,
1530
- ctx->ignore_eos);
1491
+ ctx->penalty_present);
1531
1492
 
1532
1493
  // copy the state
1533
1494
  {
@@ -1553,38 +1514,21 @@ static struct llama_sampler_i llama_sampler_penalties_i = {
1553
1514
  };
1554
1515
 
1555
1516
  struct llama_sampler * llama_sampler_init_penalties(
1556
- int32_t n_vocab,
1557
- llama_token special_eos_id,
1558
- llama_token linefeed_id,
1559
1517
  int32_t penalty_last_n,
1560
1518
  float penalty_repeat,
1561
1519
  float penalty_freq,
1562
- float penalty_present,
1563
- bool penalize_nl,
1564
- bool ignore_eos) {
1565
- if (linefeed_id == LLAMA_TOKEN_NULL) {
1566
- penalize_nl = true;
1567
- }
1568
-
1569
- if (special_eos_id == LLAMA_TOKEN_NULL) {
1570
- ignore_eos = false;
1571
- }
1572
-
1520
+ float penalty_present) {
1573
1521
  penalty_last_n = std::max(penalty_last_n, 0);
1574
1522
 
1575
1523
  return new llama_sampler {
1576
1524
  /* .iface = */ &llama_sampler_penalties_i,
1577
1525
  /* .ctx = */ new llama_sampler_penalties {
1578
- /* .n_vocab = */ n_vocab,
1579
- /* .special_eos_id = */ special_eos_id,
1580
- /* .linefeed_id = */ linefeed_id,
1581
1526
  /* .penalty_last_n = */ penalty_last_n,
1582
1527
  /* .penalty_repeat = */ penalty_repeat,
1583
1528
  /* .penalty_freq = */ penalty_freq,
1584
1529
  /* .penalty_present = */ penalty_present,
1585
- /* .penalize_nl = */ penalize_nl,
1586
- /* .ignore_eos = */ ignore_eos,
1587
1530
  /* .prev = */ ring_buffer<llama_token>(penalty_last_n),
1531
+ /* .token_count = */ {},
1588
1532
  },
1589
1533
  };
1590
1534
  }
@@ -1612,7 +1556,8 @@ static void get_overlapping_token_sequences(const llama_vocab & vocab, const std
1612
1556
  if (word.find(str) != std::string::npos) {
1613
1557
  token_sequences.emplace(token_id, std::vector<llama_token>());
1614
1558
  } else {
1615
- size_t word_len = word.size(), str_len = str.size();
1559
+ size_t word_len = word.size();
1560
+ size_t str_len = str.size();
1616
1561
  size_t pos = -1;
1617
1562
  while ((pos = word.find(str[0], pos + 1)) != std::string::npos) {
1618
1563
  bool match = true;
@@ -418,6 +418,7 @@ struct llm_tokenizer_bpe : llm_tokenizer {
418
418
  case LLAMA_VOCAB_PRE_TYPE_SMOLLM:
419
419
  case LLAMA_VOCAB_PRE_TYPE_CODESHELL:
420
420
  case LLAMA_VOCAB_PRE_TYPE_EXAONE:
421
+ case LLAMA_VOCAB_PRE_TYPE_MINERVA:
421
422
  regex_exprs = {
422
423
  "\\p{N}",
423
424
  "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
@@ -737,7 +738,7 @@ struct llm_tokenizer_wpm_session {
737
738
  std::vector<std::string> words(1, "");
738
739
 
739
740
  for (const uint32_t cpt : cpts_nfd) {
740
- const auto flags = unicode_cpt_flags(cpt);
741
+ const auto flags = unicode_cpt_flags_from_cpt(cpt);
741
742
 
742
743
  if (flags.is_whitespace) {
743
744
  if (words.back().size()) { // finish previous word if any