cui-llama.rn 1.4.6 → 1.5.0

Files changed (80)
  1. package/android/src/main/CMakeLists.txt +9 -2
  2. package/android/src/main/jni.cpp +52 -34
  3. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  4. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  5. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  6. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  9. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  10. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  11. package/cpp/binary-ops.cpp +158 -0
  12. package/cpp/binary-ops.h +16 -0
  13. package/cpp/chat.cpp +1769 -1779
  14. package/cpp/chat.h +9 -1
  15. package/cpp/common.cpp +20 -522
  16. package/cpp/common.h +13 -36
  17. package/cpp/cpu-common.h +72 -0
  18. package/cpp/ggml-common.h +12 -6
  19. package/cpp/ggml-cpu-aarch64.cpp +1557 -80
  20. package/cpp/ggml-cpu-impl.h +2 -21
  21. package/cpp/ggml-cpu-quants.c +904 -405
  22. package/cpp/ggml-cpu.c +909 -13237
  23. package/cpp/ggml-impl.h +50 -23
  24. package/cpp/ggml-metal-impl.h +77 -3
  25. package/cpp/ggml-metal.m +794 -580
  26. package/cpp/ggml.c +92 -3
  27. package/cpp/ggml.h +29 -5
  28. package/cpp/gguf.cpp +1 -0
  29. package/cpp/llama-adapter.cpp +55 -20
  30. package/cpp/llama-adapter.h +11 -9
  31. package/cpp/llama-arch.cpp +217 -16
  32. package/cpp/llama-arch.h +25 -0
  33. package/cpp/llama-batch.h +2 -2
  34. package/cpp/llama-chat.cpp +54 -2
  35. package/cpp/llama-chat.h +3 -0
  36. package/cpp/llama-context.cpp +2294 -1238
  37. package/cpp/llama-context.h +214 -77
  38. package/cpp/llama-cparams.h +1 -0
  39. package/cpp/llama-graph.cpp +1695 -0
  40. package/cpp/llama-graph.h +592 -0
  41. package/cpp/llama-hparams.cpp +8 -0
  42. package/cpp/llama-hparams.h +17 -0
  43. package/cpp/llama-io.cpp +15 -0
  44. package/cpp/llama-io.h +35 -0
  45. package/cpp/llama-kv-cache.cpp +965 -303
  46. package/cpp/llama-kv-cache.h +145 -151
  47. package/cpp/llama-memory.cpp +1 -0
  48. package/cpp/llama-memory.h +21 -0
  49. package/cpp/llama-mmap.cpp +1 -1
  50. package/cpp/llama-model-loader.cpp +10 -5
  51. package/cpp/llama-model-loader.h +5 -3
  52. package/cpp/llama-model.cpp +9194 -201
  53. package/cpp/llama-model.h +40 -1
  54. package/cpp/llama-sampling.cpp +5 -0
  55. package/cpp/llama-vocab.cpp +36 -5
  56. package/cpp/llama.cpp +51 -9984
  57. package/cpp/llama.h +102 -22
  58. package/cpp/log.cpp +34 -0
  59. package/cpp/minja/chat-template.hpp +15 -7
  60. package/cpp/minja/minja.hpp +120 -94
  61. package/cpp/ops.cpp +8723 -0
  62. package/cpp/ops.h +128 -0
  63. package/cpp/rn-llama.cpp +44 -53
  64. package/cpp/rn-llama.h +2 -12
  65. package/cpp/sampling.cpp +3 -0
  66. package/cpp/sgemm.cpp +533 -88
  67. package/cpp/simd-mappings.h +888 -0
  68. package/cpp/speculative.cpp +4 -4
  69. package/cpp/unary-ops.cpp +186 -0
  70. package/cpp/unary-ops.h +28 -0
  71. package/cpp/vec.cpp +258 -0
  72. package/cpp/vec.h +802 -0
  73. package/ios/CMakeLists.txt +5 -2
  74. package/ios/RNLlama.mm +2 -2
  75. package/ios/RNLlamaContext.mm +40 -24
  76. package/package.json +1 -1
  77. package/src/NativeRNLlama.ts +6 -4
  78. package/src/index.ts +3 -1
  79. package/cpp/chat-template.hpp +0 -529
  80. package/cpp/minja.hpp +0 -2915
package/cpp/common.h CHANGED
@@ -132,10 +132,6 @@ struct common_grammar_trigger {
     common_grammar_trigger_type type;
     std::string value;
     llama_token token = LLAMA_TOKEN_NULL;
-
-    // T can only be nlohmann::ordered_json
-    template <class T> T to_json() const;
-    template <class T> static common_grammar_trigger from_json(const T & in);
 };
 
 // sampling parameters
@@ -195,6 +191,13 @@ struct common_params_sampling {
     std::string print() const;
 };
 
+struct common_params_model {
+    std::string path = "";    // model local path // NOLINT
+    std::string url = "";     // model url to download // NOLINT
+    std::string hf_repo = ""; // HF repo // NOLINT
+    std::string hf_file = ""; // HF file // NOLINT
+};
+
 struct common_params_speculative {
     std::vector<lm_ggml_backend_dev_t> devices; // devices to use for offloading
 
@@ -208,19 +211,11 @@ struct common_params_speculative {
     struct cpu_params cpuparams;
     struct cpu_params cpuparams_batch;
 
-    std::string hf_repo = "";   // HF repo // NOLINT
-    std::string hf_file = "";   // HF file // NOLINT
-
-    std::string model = "";     // draft model for speculative decoding // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 };
 
 struct common_params_vocoder {
-    std::string hf_repo = "";   // HF repo // NOLINT
-    std::string hf_file = "";   // HF file // NOLINT
-
-    std::string model = "";     // model path // NOLINT
-    std::string model_url = ""; // model url to download // NOLINT
+    struct common_params_model model;
 
     std::string speaker_file = ""; // speaker file path // NOLINT
 
@@ -282,12 +277,10 @@ struct common_params {
     struct common_params_speculative speculative;
     struct common_params_vocoder vocoder;
 
-    std::string model = "";       // model path // NOLINT
+    struct common_params_model model;
+
     std::string model_alias = ""; // model alias // NOLINT
-    std::string model_url = "";   // model url to download // NOLINT
     std::string hf_token = "";    // HF token // NOLINT
-    std::string hf_repo = "";     // HF repo // NOLINT
-    std::string hf_file = "";     // HF file // NOLINT
     std::string prompt = "";      // NOLINT
     std::string system_prompt = ""; // NOLINT
     std::string prompt_file = ""; // store the external prompt file name // NOLINT
@@ -301,6 +294,7 @@ struct common_params {
     std::vector<std::string> in_files;   // all input files
     std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
     std::vector<llama_model_kv_override> kv_overrides;
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
 
     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_adapter_lora_apply)
     std::vector<common_adapter_lora_info> lora_adapters; // lora adapter path with user defined scale
@@ -362,7 +356,7 @@ struct common_params {
     common_conversation_mode conversation_mode = COMMON_CONVERSATION_MODE_AUTO;
 
     // multimodal models (see examples/llava)
-    std::string mmproj = ""; // path to multimodal projector // NOLINT
+    struct common_params_model mmproj;
     std::vector<std::string> image; // path to image file(s)
 
     // embedding
@@ -561,23 +555,6 @@ struct llama_model_params common_model_params_to_llama ( common_params
 struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);
 
-struct llama_model * common_load_model_from_url(
-    const std::string & model_url,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-struct llama_model * common_load_model_from_hf(
-    const std::string & repo,
-    const std::string & remote_path,
-    const std::string & local_path,
-    const std::string & hf_token,
-    const struct llama_model_params & params);
-
-std::pair<std::string, std::string> common_get_hf_file(
-    const std::string & hf_repo_with_tag,
-    const std::string & hf_token);
-
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
@@ -0,0 +1,72 @@
1
+ #pragma once
2
+
3
+ #include "ggml.h"
4
+ #include "ggml-cpu-traits.h"
5
+ #include "ggml-cpu-impl.h"
6
+ #include "ggml-impl.h"
7
+
8
+ #ifdef __cplusplus
9
+
10
+ #include <utility>
11
+
12
+ // convenience functions/macros for use in template calls
13
+ // note: these won't be required after the 'traits' lookup table is used.
14
+ static inline lm_ggml_fp16_t f32_to_f16(float x) {
15
+ return LM_GGML_FP32_TO_FP16(x);
16
+ }
17
+
18
+ static inline float f16_to_f32(lm_ggml_fp16_t x) {
19
+ return LM_GGML_FP16_TO_FP32(x);
20
+ }
21
+
22
+ static inline lm_ggml_bf16_t f32_to_bf16(float x) {
23
+ return LM_GGML_FP32_TO_BF16(x);
24
+ }
25
+
26
+ static inline float bf16_to_f32(lm_ggml_bf16_t x) {
27
+ return LM_GGML_BF16_TO_FP32(x);
28
+ }
29
+
30
+ static inline float f32_to_f32(float x) {
31
+ return x;
32
+ }
33
+
34
+ // TODO - merge this into the traits table, after using row-based conversions
35
+ template <class T>
36
+ struct type_conversion_table;
37
+
38
+ template <>
39
+ struct type_conversion_table<lm_ggml_fp16_t> {
40
+ static constexpr float (*to_f32)(lm_ggml_fp16_t) = f16_to_f32;
41
+ static constexpr lm_ggml_fp16_t (*from_f32)(float) = f32_to_f16;
42
+ };
43
+
44
+ template <>
45
+ struct type_conversion_table<float> {
46
+ static constexpr float (*to_f32)(float) = f32_to_f32;
47
+ static constexpr float (*from_f32)(float) = f32_to_f32;
48
+ };
49
+
50
+ template <>
51
+ struct type_conversion_table<lm_ggml_bf16_t> {
52
+ static constexpr float (*to_f32)(lm_ggml_bf16_t) = bf16_to_f32;
53
+ static constexpr lm_ggml_bf16_t (*from_f32)(float) = f32_to_bf16;
54
+ };
55
+
56
+ static std::pair<int64_t, int64_t> get_thread_range(const struct lm_ggml_compute_params * params, const struct lm_ggml_tensor * src0) {
57
+ const int64_t ith = params->ith;
58
+ const int64_t nth = params->nth;
59
+
60
+ const int64_t nr = lm_ggml_nrows(src0);
61
+
62
+ // rows per thread
63
+ const int64_t dr = (nr + nth - 1)/nth;
64
+
65
+ // row range for this thread
66
+ const int64_t ir0 = dr*ith;
67
+ const int64_t ir1 = MIN(ir0 + dr, nr);
68
+
69
+ return {ir0, ir1};
70
+ }
71
+
72
+ #endif
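This new header centralizes two idioms that recur across the split-out op files (ops.cpp, unary-ops.cpp, binary-ops.cpp, vec.cpp): compile-time element-type conversion through f32, and a uniform row partition across threads. A sketch of how a templated kernel might combine them; apply_unary_rows is a hypothetical name, and it assumes a contiguous 2-D tensor plus the declarations from cpu-common.h above:

// Hypothetical row kernel; T is float, lm_ggml_fp16_t, or lm_ggml_bf16_t.
template <class T, float (*op)(float)>
static void apply_unary_rows(const lm_ggml_compute_params * params,
                             const lm_ggml_tensor * src, lm_ggml_tensor * dst) {
    const auto [ir0, ir1] = get_thread_range(params, src); // rows for this thread
    const int64_t ne0 = src->ne[0];                        // elements per row

    for (int64_t ir = ir0; ir < ir1; ++ir) {
        const T * x = (const T *)((const char *) src->data + ir*src->nb[1]);
        T       * y = (T *)((char *) dst->data + ir*dst->nb[1]);
        for (int64_t i = 0; i < ne0; ++i) {
            // round-trip through f32 via the compile-time conversion table
            y[i] = type_conversion_table<T>::from_f32(op(type_conversion_table<T>::to_f32(x[i])));
        }
    }
}

The split in get_thread_range is plain ceiling division: with nr = 10 rows and nth = 4 threads, dr = ceil(10/4) = 3, so the threads cover rows [0,3), [3,6), [6,9), and [9,10).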
package/cpp/ggml-common.h CHANGED
@@ -158,6 +158,12 @@ typedef sycl::half2 lm_ggml_half2;
 
 #endif // LM_GGML_COMMON_DECL_CUDA || LM_GGML_COMMON_DECL_HIP
 
+#ifdef _MSC_VER
+#define LM_GGML_EXTENSION
+#else // _MSC_VER
+#define LM_GGML_EXTENSION __extension__
+#endif // _MSC_VER
+
 #define QK4_0 32
 typedef struct {
     lm_ggml_half d; // delta
@@ -167,7 +173,7 @@ static_assert(sizeof(block_q4_0) == sizeof(lm_ggml_half) + QK4_0 / 2, "wrong q4_
 
 #define QK4_1 32
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d; // delta
             lm_ggml_half m; // min
@@ -188,7 +194,7 @@ static_assert(sizeof(block_q5_0) == sizeof(lm_ggml_half) + sizeof(uint32_t) + QK
 
 #define QK5_1 32
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d; // delta
             lm_ggml_half m; // min
@@ -209,7 +215,7 @@ static_assert(sizeof(block_q8_0) == sizeof(lm_ggml_half) + QK8_0, "wrong q8_0 bl
 
 #define QK8_1 32
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d; // delta
             lm_ggml_half s; // d * sum(qs[i])
@@ -250,7 +256,7 @@ static_assert(sizeof(block_tq2_0) == sizeof(lm_ggml_half) + QK_K / 4, "wrong tq2
 typedef struct {
     uint8_t scales[QK_K/16]; // scales and mins, quantized with 4 bits
     uint8_t qs[QK_K/4];      // quants
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d;    // super-block scale for quantized scales
             lm_ggml_half dmin; // super-block scale for quantized mins
@@ -277,7 +283,7 @@ static_assert(sizeof(block_q3_K) == sizeof(lm_ggml_half) + QK_K / 4 + QK_K / 8 +
 // weight is represented as x = a * q + b
 // Effectively 4.5 bits per weight
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d;    // super-block scale for quantized scales
             lm_ggml_half dmin; // super-block scale for quantized mins
@@ -294,7 +300,7 @@ static_assert(sizeof(block_q4_K) == 2*sizeof(lm_ggml_half) + K_SCALE_SIZE + QK_K
 // weight is represented as x = a * q + b
 // Effectively 5.5 bits per weight
 typedef struct {
-    union {
+    LM_GGML_EXTENSION union {
         struct {
             lm_ggml_half d;    // super-block scale for quantized scales
             lm_ggml_half dmin; // super-block scale for quantized mins
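The new LM_GGML_EXTENSION macro exists because these quantized block types place an anonymous struct inside a union, which strict ISO C treats as a language extension: GCC and Clang flag it under -Wpedantic unless the declaration is wrapped in __extension__, while MSVC accepts the construct without any keyword, so the macro expands to nothing there. A minimal illustration; block_demo is a made-up type, not from the diff:

#include <stdint.h>

#ifdef _MSC_VER
#define LM_GGML_EXTENSION
#else
#define LM_GGML_EXTENSION __extension__
#endif

typedef struct {
    LM_GGML_EXTENSION union {
        struct {            // anonymous struct: the extension being silenced
            uint16_t d;     // delta
            uint16_t m;     // min
        };
        uint32_t dm;        // same 4 bytes, loaded/stored as one word
    };
} block_demo;

The overlay is what makes the union worthwhile: the two half-precision fields can be copied or compared as a single 32-bit access, and the macro keeps that pattern building warning-clean across compilers.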