cui-llama.rn 1.3.5 → 1.4.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (80)
  1. package/README.md +22 -1
  2. package/android/src/main/CMakeLists.txt +25 -20
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +31 -9
  4. package/android/src/main/java/com/rnllama/RNLlama.java +98 -0
  5. package/android/src/main/jni-utils.h +94 -0
  6. package/android/src/main/jni.cpp +108 -37
  7. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +15 -0
  8. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +15 -0
  9. package/cpp/common.cpp +1982 -1965
  10. package/cpp/common.h +665 -657
  11. package/cpp/ggml-backend-reg.cpp +5 -0
  12. package/cpp/ggml-backend.cpp +5 -2
  13. package/cpp/ggml-cpp.h +1 -0
  14. package/cpp/ggml-cpu-aarch64.cpp +6 -1
  15. package/cpp/ggml-cpu-quants.c +5 -1
  16. package/cpp/ggml-cpu.c +14122 -14122
  17. package/cpp/ggml-cpu.cpp +627 -627
  18. package/cpp/ggml-impl.h +11 -16
  19. package/cpp/ggml-metal-impl.h +288 -0
  20. package/cpp/ggml-metal.m +2 -2
  21. package/cpp/ggml-opt.cpp +854 -0
  22. package/cpp/ggml-opt.h +216 -0
  23. package/cpp/ggml.c +0 -1276
  24. package/cpp/ggml.h +0 -140
  25. package/cpp/gguf.cpp +1325 -0
  26. package/cpp/gguf.h +202 -0
  27. package/cpp/llama-adapter.cpp +346 -0
  28. package/cpp/llama-adapter.h +73 -0
  29. package/cpp/llama-arch.cpp +1434 -0
  30. package/cpp/llama-arch.h +395 -0
  31. package/cpp/llama-batch.cpp +368 -0
  32. package/cpp/llama-batch.h +88 -0
  33. package/cpp/llama-chat.cpp +567 -0
  34. package/cpp/llama-chat.h +51 -0
  35. package/cpp/llama-context.cpp +1771 -0
  36. package/cpp/llama-context.h +128 -0
  37. package/cpp/llama-cparams.cpp +1 -0
  38. package/cpp/llama-cparams.h +37 -0
  39. package/cpp/llama-cpp.h +30 -0
  40. package/cpp/llama-grammar.cpp +1 -0
  41. package/cpp/llama-grammar.h +3 -1
  42. package/cpp/llama-hparams.cpp +71 -0
  43. package/cpp/llama-hparams.h +140 -0
  44. package/cpp/llama-impl.cpp +167 -0
  45. package/cpp/llama-impl.h +16 -136
  46. package/cpp/llama-kv-cache.cpp +718 -0
  47. package/cpp/llama-kv-cache.h +218 -0
  48. package/cpp/llama-mmap.cpp +589 -0
  49. package/cpp/llama-mmap.h +67 -0
  50. package/cpp/llama-model-loader.cpp +1011 -0
  51. package/cpp/llama-model-loader.h +158 -0
  52. package/cpp/llama-model.cpp +2202 -0
  53. package/cpp/llama-model.h +391 -0
  54. package/cpp/llama-sampling.cpp +117 -4
  55. package/cpp/llama-vocab.cpp +21 -28
  56. package/cpp/llama-vocab.h +13 -1
  57. package/cpp/llama.cpp +12547 -23528
  58. package/cpp/llama.h +31 -6
  59. package/cpp/rn-llama.hpp +90 -87
  60. package/cpp/sgemm.cpp +776 -70
  61. package/cpp/sgemm.h +14 -14
  62. package/cpp/unicode.cpp +6 -0
  63. package/ios/RNLlama.mm +47 -0
  64. package/ios/RNLlamaContext.h +3 -1
  65. package/ios/RNLlamaContext.mm +71 -14
  66. package/jest/mock.js +15 -3
  67. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  68. package/lib/commonjs/index.js +33 -37
  69. package/lib/commonjs/index.js.map +1 -1
  70. package/lib/module/NativeRNLlama.js.map +1 -1
  71. package/lib/module/index.js +31 -35
  72. package/lib/module/index.js.map +1 -1
  73. package/lib/typescript/NativeRNLlama.d.ts +26 -6
  74. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  75. package/lib/typescript/index.d.ts +21 -36
  76. package/lib/typescript/index.d.ts.map +1 -1
  77. package/llama-rn.podspec +4 -18
  78. package/package.json +2 -3
  79. package/src/NativeRNLlama.ts +32 -13
  80. package/src/index.ts +52 -47
@@ -574,4 +574,9 @@ void lm_ggml_backend_load_all_from_path(const char * dir_path) {
574
574
  lm_ggml_backend_load_best("opencl", silent, dir_path);
575
575
  lm_ggml_backend_load_best("musa", silent, dir_path);
576
576
  lm_ggml_backend_load_best("cpu", silent, dir_path);
577
+ // check the environment variable LM_GGML_BACKEND_PATH to load an out-of-tree backend
578
+ const char * backend_path = std::getenv("LM_GGML_BACKEND_PATH");
579
+ if (backend_path) {
580
+ lm_ggml_backend_load(backend_path);
581
+ }
577
582
  }
@@ -764,7 +764,7 @@ static int lm_ggml_backend_sched_backend_id_from_cur(lm_ggml_backend_sched_t sch
764
764
  if (tensor->op != LM_GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == LM_GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
765
765
  int src_backend_id = lm_ggml_backend_sched_backend_from_buffer(sched, src, tensor);
766
766
  // check if a backend with higher prio wants to offload the op
767
- if (src_backend_id == sched->n_backends - 1) {
767
+ if (src_backend_id == sched->n_backends - 1 && lm_ggml_backend_buffer_is_host(src->buffer)) {
768
768
  for (int b = 0; b < src_backend_id; b++) {
769
769
  if (lm_ggml_backend_supports_op(sched->backends[b], tensor) && lm_ggml_backend_offload_op(sched->backends[b], tensor)) {
770
770
  SET_CAUSE(tensor, "1.off");
@@ -795,9 +795,12 @@ static void lm_ggml_backend_sched_print_assignments(lm_ggml_backend_sched_t sche
795
795
  for (int i = 0; i < graph->n_nodes; i++) {
796
796
  if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
797
797
  lm_ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
798
- LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs: ", cur_split, lm_ggml_backend_name(split_backend),
798
+ LM_GGML_LOG_DEBUG("\n## SPLIT #%d: %s # %d inputs", cur_split, lm_ggml_backend_name(split_backend),
799
799
  sched->splits[cur_split].n_inputs);
800
800
  for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
801
+ if (j == 0) {
802
+ LM_GGML_LOG_DEBUG(": ");
803
+ }
801
804
  LM_GGML_LOG_DEBUG("[%s (%5.5s)] ", sched->splits[cur_split].inputs[j]->name,
802
805
  fmt_size(lm_ggml_nbytes(sched->splits[cur_split].inputs[j])));
803
806
  }
package/cpp/ggml-cpp.h CHANGED
@@ -7,6 +7,7 @@
7
7
  #include "ggml.h"
8
8
  #include "ggml-alloc.h"
9
9
  #include "ggml-backend.h"
10
+ #include "gguf.h"
10
11
  #include <memory>
11
12
 
12
13
  // Smart pointers for ggml types
@@ -194,9 +194,12 @@ static inline __m256i sum_i16_pairs_int32x8(const __m256i x) {
194
194
  }
195
195
 
196
196
  static inline __m256i mul_sum_us8_pairs_int32x8(const __m256i ax, const __m256i sy) {
197
- #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
197
+ #if defined(__AVX512VNNI__) && defined(__AVX512VL__)
198
198
  const __m256i zero = _mm256_setzero_si256();
199
199
  return _mm256_dpbusd_epi32(zero, ax, sy);
200
+ #elif defined(__AVXVNNI__)
201
+ const __m256i zero = _mm256_setzero_si256();
202
+ return _mm256_dpbusd_avx_epi32(zero, ax, sy);
200
203
  #else
201
204
  // Perform multiplication and create 16-bit values
202
205
  const __m256i dot = _mm256_maddubs_epi16(ax, sy);
@@ -4166,6 +4169,8 @@ static lm_ggml_backend_buffer_t lm_ggml_backend_cpu_aarch64_buffer_type_alloc_bu
4166
4169
  buffer->buft = buft;
4167
4170
  buffer->iface.init_tensor = lm_ggml_backend_cpu_aarch64_buffer_init_tensor;
4168
4171
  buffer->iface.set_tensor = lm_ggml_backend_cpu_aarch64_buffer_set_tensor;
4172
+ buffer->iface.get_tensor = nullptr;
4173
+ buffer->iface.cpy_tensor = nullptr;
4169
4174
  return buffer;
4170
4175
  }
4171
4176
 
@@ -103,10 +103,14 @@ static inline __m256 sum_i16_pairs_float(const __m256i x) {
103
103
  }
104
104
 
105
105
  static inline __m256 mul_sum_us8_pairs_float(const __m256i ax, const __m256i sy) {
106
- #if defined(__AVXVNNI__) || (defined(__AVX512VNNI__) && defined(__AVX512VL__))
106
+ #if defined(__AVX512VNNI__) && defined(__AVX512VL__)
107
107
  const __m256i zero = _mm256_setzero_si256();
108
108
  const __m256i summed_pairs = _mm256_dpbusd_epi32(zero, ax, sy);
109
109
  return _mm256_cvtepi32_ps(summed_pairs);
110
+ #elif defined(__AVXVNNI__)
111
+ const __m256i zero = _mm256_setzero_si256();
112
+ const __m256i summed_pairs = _mm256_dpbusd_avx_epi32(zero, ax, sy);
113
+ return _mm256_cvtepi32_ps(summed_pairs);
110
114
  #else
111
115
  // Perform multiplication and create 16-bit values
112
116
  const __m256i dot = _mm256_maddubs_epi16(ax, sy);