cui-llama.rn 1.3.3 → 1.3.5

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (54)
  1. package/android/src/main/CMakeLists.txt +5 -7
  2. package/android/src/main/java/com/rnllama/LlamaContext.java +4 -4
  3. package/android/src/main/jni.cpp +9 -9
  4. package/cpp/common.cpp +28 -44
  5. package/cpp/common.h +35 -14
  6. package/cpp/ggml-alloc.c +0 -1
  7. package/cpp/ggml-backend-impl.h +38 -20
  8. package/cpp/ggml-backend-reg.cpp +246 -92
  9. package/cpp/ggml-backend.h +1 -0
  10. package/cpp/ggml-common.h +42 -48
  11. package/cpp/{ggml-cpu-aarch64.c → ggml-cpu-aarch64.cpp} +642 -223
  12. package/cpp/ggml-cpu-aarch64.h +2 -26
  13. package/cpp/ggml-cpu-traits.cpp +36 -0
  14. package/cpp/ggml-cpu-traits.h +38 -0
  15. package/cpp/ggml-cpu.c +14122 -13971
  16. package/cpp/ggml-cpu.cpp +627 -715
  17. package/cpp/ggml-cpu.h +0 -17
  18. package/cpp/ggml-impl.h +22 -6
  19. package/cpp/ggml-metal.m +482 -24
  20. package/cpp/ggml-quants.c +0 -9
  21. package/cpp/ggml-threading.h +4 -2
  22. package/cpp/ggml.c +284 -178
  23. package/cpp/ggml.h +73 -25
  24. package/cpp/llama-grammar.cpp +15 -15
  25. package/cpp/llama-grammar.h +2 -5
  26. package/cpp/llama-sampling.cpp +35 -90
  27. package/cpp/llama-vocab.cpp +7 -2
  28. package/cpp/llama-vocab.h +1 -1
  29. package/cpp/llama.cpp +1782 -586
  30. package/cpp/llama.h +20 -19
  31. package/cpp/sampling.cpp +11 -16
  32. package/cpp/sgemm.cpp +265 -258
  33. package/cpp/sgemm.h +2 -2
  34. package/cpp/speculative.cpp +4 -0
  35. package/cpp/unicode.cpp +51 -51
  36. package/cpp/unicode.h +9 -10
  37. package/lib/commonjs/index.js +38 -1
  38. package/lib/commonjs/index.js.map +1 -1
  39. package/lib/module/index.js +36 -0
  40. package/lib/module/index.js.map +1 -1
  41. package/lib/typescript/NativeRNLlama.d.ts +2 -3
  42. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  43. package/lib/typescript/index.d.ts +36 -2
  44. package/lib/typescript/index.d.ts.map +1 -1
  45. package/package.json +1 -1
  46. package/src/NativeRNLlama.ts +3 -3
  47. package/src/index.ts +46 -2
  48. package/cpp/amx/amx.cpp +0 -196
  49. package/cpp/amx/amx.h +0 -20
  50. package/cpp/amx/common.h +0 -101
  51. package/cpp/amx/mmq.cpp +0 -2524
  52. package/cpp/amx/mmq.h +0 -16
  53. package/cpp/ggml-aarch64.c +0 -129
  54. package/cpp/ggml-aarch64.h +0 -19
package/cpp/llama.h CHANGED
@@ -105,12 +105,15 @@ extern "C" {
         LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH  = 24,
         LLAMA_VOCAB_PRE_TYPE_EXAONE        = 25,
         LLAMA_VOCAB_PRE_TYPE_CHAMELEON     = 26,
+        LLAMA_VOCAB_PRE_TYPE_MINERVA       = 27,
     };

     enum llama_rope_type {
-        LLAMA_ROPE_TYPE_NONE = -1,
-        LLAMA_ROPE_TYPE_NORM = 0,
-        LLAMA_ROPE_TYPE_NEOX = LM_GGML_ROPE_TYPE_NEOX,
+        LLAMA_ROPE_TYPE_NONE   = -1,
+        LLAMA_ROPE_TYPE_NORM   = 0,
+        LLAMA_ROPE_TYPE_NEOX   = LM_GGML_ROPE_TYPE_NEOX,
+        LLAMA_ROPE_TYPE_MROPE  = LM_GGML_ROPE_TYPE_MROPE,
+        LLAMA_ROPE_TYPE_VISION = LM_GGML_ROPE_TYPE_VISION,
     };

     enum llama_token_type { //TODO: remove, required until per token attributes are available from GGUF file
@@ -172,9 +175,9 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_IQ4_XS        = 30, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ1_M         = 31, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_BF16          = 32, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_4_4      = 33, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_4_8      = 34, // except 1d tensors
-        LLAMA_FTYPE_MOSTLY_Q4_0_8_8      = 35, // except 1d tensors
+        //LLAMA_FTYPE_MOSTLY_Q4_0_4_4    = 33, // removed from gguf files, use Q4_0 and runtime repack
+        //LLAMA_FTYPE_MOSTLY_Q4_0_4_8    = 34, // removed from gguf files, use Q4_0 and runtime repack
+        //LLAMA_FTYPE_MOSTLY_Q4_0_8_8    = 35, // removed from gguf files, use Q4_0 and runtime repack
         LLAMA_FTYPE_MOSTLY_TQ1_0        = 36, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_TQ2_0        = 37, // except 1d tensors

@@ -186,7 +189,8 @@ extern "C" {
         LLAMA_ROPE_SCALING_TYPE_NONE        = 0,
         LLAMA_ROPE_SCALING_TYPE_LINEAR      = 1,
         LLAMA_ROPE_SCALING_TYPE_YARN        = 2,
-        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE   = LLAMA_ROPE_SCALING_TYPE_YARN,
+        LLAMA_ROPE_SCALING_TYPE_LONGROPE    = 3,
+        LLAMA_ROPE_SCALING_TYPE_MAX_VALUE   = LLAMA_ROPE_SCALING_TYPE_LONGROPE,
     };

     enum llama_pooling_type {
@@ -455,6 +459,7 @@ extern "C" {
     // Functions to access the model's GGUF metadata scalar values
     // - The functions return the length of the string on success, or -1 on failure
     // - The output string is always null-terminated and cleared on failure
+    // - When retrieving a string, an extra byte must be allocated to account for the null terminator
     // - GGUF array values are not supported by these functions

     // Get metadata value as a string by key name
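
The added comment documents a buffer-sizing requirement. A minimal sketch (not part of the diff) of the two-call pattern it implies, assuming these functions follow the snprintf convention of accepting a null buffer with size 0 to query the length:

    // query the length first, then allocate length + 1 so the '\0' fits
    int32_t len = llama_model_meta_val_str(model, "general.name", nullptr, 0);
    if (len >= 0) {
        std::vector<char> buf(len + 1);  // extra byte for the null terminator
        llama_model_meta_val_str(model, "general.name", buf.data(), buf.size());
        printf("general.name = %s\n", buf.data());
    }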
@@ -478,9 +483,6 @@ extern "C" {
     // Returns the total number of parameters in the model
     LLAMA_API uint64_t llama_model_n_params(const struct llama_model * model);

-    // Get a llama model tensor
-    LLAMA_API struct lm_ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name);
-
     // Returns true if the model contains an encoder that requires llama_encode() call
     LLAMA_API bool llama_model_has_encoder(const struct llama_model * model);

@@ -991,6 +993,9 @@ extern "C" {
             char * buf,
             int32_t length);

+    // Get list of built-in chat templates
+    LLAMA_API int32_t llama_chat_builtin_templates(const char ** output, size_t len);
+
     //
     // Sampling API
     //
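
A hedged usage sketch for the new entry point (not from the diff); it assumes the function returns the total number of built-in templates regardless of len, so a first call with len == 0 can size the output array:

    int32_t n = llama_chat_builtin_templates(nullptr, 0);  // assumed: returns total count
    std::vector<const char *> names(n);
    llama_chat_builtin_templates(names.data(), names.size());
    for (const char * name : names) {
        printf("built-in chat template: %s\n", name);
    }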
@@ -1132,16 +1137,12 @@ extern "C" {
             const char * grammar_str,
             const char * grammar_root);

+    /// NOTE: Avoid using on the full vocabulary as searching for repeated tokens can become slow. For example, apply top-k or top-p sampling first.
     LLAMA_API struct llama_sampler * llama_sampler_init_penalties(
-            int32_t   n_vocab,         // llama_n_vocab()
-        llama_token   special_eos_id,  // llama_token_eos()
-        llama_token   linefeed_id,     // llama_token_nl()
-            int32_t   penalty_last_n,  // last n tokens to penalize (0 = disable penalty, -1 = context size)
-              float   penalty_repeat,  // 1.0 = disabled
-              float   penalty_freq,    // 0.0 = disabled
-              float   penalty_present, // 0.0 = disabled
-               bool   penalize_nl,     // consider newlines as a repeatable token
-               bool   ignore_eos);     // ignore the end-of-sequence token
+            int32_t   penalty_last_n,  // last n tokens to penalize (0 = disable penalty, -1 = context size)
+              float   penalty_repeat,  // 1.0 = disabled
+              float   penalty_freq,    // 0.0 = disabled
+              float   penalty_present); // 0.0 = disabled

     /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982
     LLAMA_API struct llama_sampler * llama_sampler_init_dry(
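
The penalties sampler no longer takes the vocabulary, EOS, or newline tokens, and the penalize_nl / ignore_eos flags are gone. A migration sketch for a 1.3.3-era call site (the numeric values are illustrative, not taken from the diff):

    // before (1.3.3): llama_sampler_init_penalties(llama_n_vocab(model),
    //     llama_token_eos(model), llama_token_nl(model),
    //     64, 1.1f, 0.0f, 0.0f, false, true);
    // after (1.3.5): only the four penalty parameters remain
    llama_sampler_chain_add(chain, llama_sampler_init_penalties(
            /* penalty_last_n  */ 64,      // 0 = disable, -1 = context size
            /* penalty_repeat  */ 1.1f,    // 1.0 = disabled
            /* penalty_freq    */ 0.0f,    // 0.0 = disabled
            /* penalty_present */ 0.0f));  // 0.0 = disabled

Per the new NOTE, placing this sampler after top-k or top-p in the chain keeps the repeated-token search cheap.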
package/cpp/sampling.cpp CHANGED
@@ -161,32 +161,20 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
             params.logit_bias.size(),
             params.logit_bias.data()));

-    llama_sampler_chain_add(result->chain,
-            llama_sampler_init_penalties(
-                llama_n_vocab  (model),
-                llama_token_eos(model),
-                llama_token_nl (model),
-                params.penalty_last_n,
-                params.penalty_repeat,
-                params.penalty_freq,
-                params.penalty_present,
-                params.penalize_nl,
-                params.ignore_eos));
-
     if (params.mirostat == 0) {
         for (const auto & cnstr : params.samplers) {
             switch (cnstr) {
-            case COMMON_SAMPLER_TYPE_DRY:
+                case COMMON_SAMPLER_TYPE_DRY:
                     {
-                        std::vector<const char*> c_breakers;
+                        std::vector<const char *> c_breakers;
                         c_breakers.reserve(params.dry_sequence_breakers.size());
-                        for (const auto& str : params.dry_sequence_breakers) {
+                        for (const auto & str : params.dry_sequence_breakers) {
                             c_breakers.push_back(str.c_str());
                         }

                         llama_sampler_chain_add(result->chain, llama_sampler_init_dry      (model, params.dry_multiplier, params.dry_base, params.dry_allowed_length, params.dry_penalty_last_n, c_breakers.data(), c_breakers.size()));
                     }
-                break;
+                    break;
                 case COMMON_SAMPLER_TYPE_TOP_K:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_top_k    (params.top_k));
                     break;
@@ -208,6 +196,9 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, co
                 case COMMON_SAMPLER_TYPE_INFILL:
                     llama_sampler_chain_add(result->chain, llama_sampler_init_infill   (model));
                     break;
+                case COMMON_SAMPLER_TYPE_PENALTIES:
+                    llama_sampler_chain_add(result->chain, llama_sampler_init_penalties(params.penalty_last_n, params.penalty_repeat, params.penalty_freq, params.penalty_present));
+                    break;
                 default:
                     LM_GGML_ASSERT(false && "unknown sampler type");
             }
@@ -415,6 +406,7 @@ char common_sampler_type_to_chr(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return 't';
         case COMMON_SAMPLER_TYPE_XTC:         return 'x';
         case COMMON_SAMPLER_TYPE_INFILL:      return 'i';
+        case COMMON_SAMPLER_TYPE_PENALTIES:   return 'e';
         default : return '?';
     }
 }
@@ -429,6 +421,7 @@ std::string common_sampler_type_to_str(enum common_sampler_type cnstr) {
         case COMMON_SAMPLER_TYPE_TEMPERATURE: return "temperature";
         case COMMON_SAMPLER_TYPE_XTC:         return "xtc";
         case COMMON_SAMPLER_TYPE_INFILL:      return "infill";
+        case COMMON_SAMPLER_TYPE_PENALTIES:   return "penalties";
         default : return "";
     }
 }
@@ -443,6 +436,7 @@ std::vector<common_sampler_type> common_sampler_types_from_names(const std::vect
         { "temperature", COMMON_SAMPLER_TYPE_TEMPERATURE },
         { "xtc",         COMMON_SAMPLER_TYPE_XTC },
         { "infill",      COMMON_SAMPLER_TYPE_INFILL },
+        { "penalties",   COMMON_SAMPLER_TYPE_PENALTIES },
     };

     // since samplers names are written multiple ways
@@ -489,6 +483,7 @@ std::vector<common_sampler_type> common_sampler_types_from_chars(const std::stri
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_TEMPERATURE), COMMON_SAMPLER_TYPE_TEMPERATURE },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_XTC),         COMMON_SAMPLER_TYPE_XTC },
         { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_INFILL),      COMMON_SAMPLER_TYPE_INFILL },
+        { common_sampler_type_to_chr(COMMON_SAMPLER_TYPE_PENALTIES),   COMMON_SAMPLER_TYPE_PENALTIES },
     };

     std::vector<common_sampler_type> samplers;
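
With penalties now an ordinary chain member, it is selected like any other sampler. A small round-trip sketch through the helpers shown above (the allow_alt_names parameter is an assumption; the hunk header truncates the full signature):

    std::vector<common_sampler_type> types =
            common_sampler_types_from_names({ "penalties" }, /* allow_alt_names */ true);
    // types[0] == COMMON_SAMPLER_TYPE_PENALTIES
    printf("%s -> '%c'\n",
            common_sampler_type_to_str(types[0]).c_str(),  // "penalties"
            common_sampler_type_to_chr(types[0]));         // 'e'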