cui-llama.rn 1.4.0 → 1.4.2

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (108)
  1. package/README.md +4 -23
  2. package/android/build.gradle +12 -3
  3. package/android/src/main/CMakeLists.txt +13 -7
  4. package/android/src/main/java/com/rnllama/LlamaContext.java +27 -20
  5. package/android/src/main/java/com/rnllama/RNLlama.java +5 -1
  6. package/android/src/main/jni.cpp +15 -12
  7. package/android/src/main/jniLibs/arm64-v8a/librnllama.so +0 -0
  8. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8.so +0 -0
  9. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2.so +0 -0
  10. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod.so +0 -0
  11. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_dotprod_i8mm.so +0 -0
  12. package/android/src/main/jniLibs/arm64-v8a/librnllama_v8_2_i8mm.so +0 -0
  13. package/android/src/main/jniLibs/x86_64/librnllama.so +0 -0
  14. package/android/src/main/jniLibs/x86_64/librnllama_x86_64.so +0 -0
  15. package/cpp/README.md +1 -1
  16. package/cpp/common.cpp +158 -267
  17. package/cpp/common.h +46 -12
  18. package/cpp/ggml-alloc.c +1042 -1037
  19. package/cpp/ggml-backend-impl.h +255 -256
  20. package/cpp/ggml-backend-reg.cpp +582 -582
  21. package/cpp/ggml-backend.cpp +2002 -2002
  22. package/cpp/ggml-backend.h +354 -352
  23. package/cpp/ggml-common.h +1853 -1853
  24. package/cpp/ggml-cpp.h +39 -39
  25. package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
  26. package/cpp/ggml-cpu-aarch64.h +8 -8
  27. package/cpp/ggml-cpu-impl.h +386 -386
  28. package/cpp/ggml-cpu-quants.c +10920 -10839
  29. package/cpp/ggml-cpu-traits.cpp +36 -36
  30. package/cpp/ggml-cpu-traits.h +38 -38
  31. package/cpp/ggml-cpu.c +329 -60
  32. package/cpp/ggml-cpu.cpp +10 -2
  33. package/cpp/ggml-cpu.h +135 -135
  34. package/cpp/ggml-impl.h +567 -567
  35. package/cpp/ggml-metal-impl.h +17 -17
  36. package/cpp/ggml-metal.m +4884 -4884
  37. package/cpp/ggml-quants.c +5238 -5238
  38. package/cpp/ggml-threading.h +14 -14
  39. package/cpp/ggml.c +6514 -6448
  40. package/cpp/ggml.h +2194 -2163
  41. package/cpp/gguf.cpp +1329 -1325
  42. package/cpp/gguf.h +202 -202
  43. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  44. package/cpp/json-schema-to-grammar.h +8 -8
  45. package/cpp/json.hpp +24766 -24766
  46. package/cpp/llama-adapter.cpp +347 -346
  47. package/cpp/llama-adapter.h +74 -73
  48. package/cpp/llama-arch.cpp +1487 -1434
  49. package/cpp/llama-arch.h +400 -395
  50. package/cpp/llama-batch.cpp +368 -368
  51. package/cpp/llama-batch.h +88 -88
  52. package/cpp/llama-chat.cpp +578 -567
  53. package/cpp/llama-chat.h +52 -51
  54. package/cpp/llama-context.cpp +1775 -1771
  55. package/cpp/llama-context.h +128 -128
  56. package/cpp/llama-cparams.cpp +1 -1
  57. package/cpp/llama-cparams.h +37 -37
  58. package/cpp/llama-cpp.h +30 -30
  59. package/cpp/llama-grammar.cpp +1139 -1139
  60. package/cpp/llama-grammar.h +143 -143
  61. package/cpp/llama-hparams.cpp +71 -71
  62. package/cpp/llama-hparams.h +139 -140
  63. package/cpp/llama-impl.cpp +167 -167
  64. package/cpp/llama-impl.h +61 -61
  65. package/cpp/llama-kv-cache.cpp +718 -718
  66. package/cpp/llama-kv-cache.h +218 -218
  67. package/cpp/llama-mmap.cpp +2 -1
  68. package/cpp/llama-mmap.h +67 -67
  69. package/cpp/llama-model-loader.cpp +1124 -1011
  70. package/cpp/llama-model-loader.h +167 -158
  71. package/cpp/llama-model.cpp +3997 -2202
  72. package/cpp/llama-model.h +370 -391
  73. package/cpp/llama-sampling.cpp +2408 -2406
  74. package/cpp/llama-sampling.h +32 -48
  75. package/cpp/llama-vocab.cpp +3247 -1982
  76. package/cpp/llama-vocab.h +125 -182
  77. package/cpp/llama.cpp +416 -2886
  78. package/cpp/llama.h +1323 -1285
  79. package/cpp/log.cpp +401 -401
  80. package/cpp/log.h +121 -121
  81. package/cpp/rn-llama.cpp +822 -0
  82. package/cpp/rn-llama.h +123 -0
  83. package/cpp/rn-llama.hpp +18 -12
  84. package/cpp/sampling.cpp +505 -500
  85. package/cpp/sgemm.cpp +2597 -2597
  86. package/cpp/speculative.cpp +277 -274
  87. package/cpp/speculative.h +28 -28
  88. package/cpp/unicode.cpp +2 -3
  89. package/ios/CMakeLists.txt +99 -0
  90. package/ios/RNLlama.h +5 -1
  91. package/ios/RNLlama.mm +2 -2
  92. package/ios/RNLlamaContext.h +8 -1
  93. package/ios/RNLlamaContext.mm +15 -11
  94. package/ios/rnllama.xcframework/Info.plist +74 -0
  95. package/jest/mock.js +3 -2
  96. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  97. package/lib/commonjs/index.js +4 -2
  98. package/lib/commonjs/index.js.map +1 -1
  99. package/lib/module/NativeRNLlama.js.map +1 -1
  100. package/lib/module/index.js +4 -2
  101. package/lib/module/index.js.map +1 -1
  102. package/lib/typescript/NativeRNLlama.d.ts +5 -1
  103. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  104. package/lib/typescript/index.d.ts.map +1 -1
  105. package/llama-rn.podspec +8 -2
  106. package/package.json +5 -2
  107. package/src/NativeRNLlama.ts +5 -1
  108. package/src/index.ts +9 -2
@@ -1,128 +1,128 @@
1
- #pragma once
2
-
3
- #include "llama.h"
4
- #include "llama-batch.h"
5
- #include "llama-cparams.h"
6
- #include "llama-model.h"
7
- #include "llama-kv-cache.h"
8
- #include "llama-adapter.h"
9
-
10
- #include "ggml-cpp.h"
11
-
12
- #include <map>
13
- #include <unordered_map>
14
- #include <vector>
15
- #include <set>
16
-
17
- struct llama_context {
18
- llama_context(const llama_model & model)
19
- : model(model)
20
- , t_start_us(model.t_start_us)
21
- , t_load_us(model.t_load_us) {}
22
-
23
- const struct llama_model & model;
24
-
25
- struct llama_cparams cparams;
26
- struct llama_sbatch sbatch; // TODO: revisit if needed
27
- struct llama_kv_cache kv_self;
28
- struct llama_control_vector cvec;
29
-
30
- std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
31
-
32
- std::vector<lm_ggml_backend_ptr> backends;
33
- std::vector<std::pair<lm_ggml_backend_t, lm_ggml_backend_set_n_threads_t>> set_n_threads_fns;
34
-
35
- lm_ggml_backend_t backend_cpu = nullptr;
36
-
37
- lm_ggml_threadpool_t threadpool = nullptr;
38
- lm_ggml_threadpool_t threadpool_batch = nullptr;
39
-
40
- bool has_evaluated_once = false;
41
-
42
- mutable int64_t t_start_us;
43
- mutable int64_t t_load_us;
44
- mutable int64_t t_p_eval_us = 0;
45
- mutable int64_t t_eval_us = 0;
46
-
47
- mutable int64_t t_compute_start_us = 0;
48
- mutable int64_t n_queued_tokens = 0;
49
-
50
- mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
51
- mutable int32_t n_eval = 0; // number of eval calls
52
-
53
- // host buffer for the model output (logits and embeddings)
54
- lm_ggml_backend_buffer_ptr buf_output;
55
-
56
- // decode output (2-dimensional array: [n_outputs][n_vocab])
57
- size_t logits_size = 0; // capacity (of floats) for logits
58
- float * logits = nullptr;
59
-
60
- std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
61
- size_t output_size = 0; // capacity (of tokens positions) for the output buffers
62
- int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
63
-
64
- bool logits_all = false;
65
-
66
- // embeddings output (2-dimensional array: [n_outputs][n_embd])
67
- // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
68
- size_t embd_size = 0; // capacity (of floats) for embeddings
69
- float * embd = nullptr;
70
-
71
- // sequence embeddings output (map of [n_embd] vectors)
72
- // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
73
- std::map<llama_seq_id, std::vector<float>> embd_seq;
74
-
75
- // whether we are computing encoder output or decoder output
76
- bool is_encoding = false;
77
-
78
- // TODO: find a better way to accommodate mutli-dimension position encoding methods
79
- // number of position id each token get, 1 for each token in most cases.
80
- // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
81
- int n_pos_per_token = 1;
82
-
83
- // output of the encoder part of the encoder-decoder models
84
- std::vector<float> embd_enc;
85
- std::vector<std::set<llama_seq_id>> seq_ids_enc;
86
-
87
- // memory buffers used to evaluate the model
88
- std::vector<uint8_t> buf_compute_meta;
89
- lm_ggml_backend_sched_ptr sched;
90
-
91
- lm_ggml_abort_callback abort_callback = nullptr;
92
- void * abort_callback_data = nullptr;
93
-
94
- // input tensors
95
- struct lm_ggml_tensor * inp_tokens; // I32 [n_batch]
96
- struct lm_ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
97
- struct lm_ggml_tensor * inp_pos; // I32 [n_batch]
98
- struct lm_ggml_tensor * inp_out_ids; // I32 [n_outputs]
99
- struct lm_ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
100
- struct lm_ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
101
- struct lm_ggml_tensor * inp_K_shift; // I32 [kv_size]
102
- struct lm_ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
103
- struct lm_ggml_tensor * inp_cls; // I32 [n_batch]
104
- struct lm_ggml_tensor * inp_s_copy; // I32 [kv_size]
105
- struct lm_ggml_tensor * inp_s_mask; // F32 [1, n_kv]
106
- struct lm_ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
107
- struct lm_ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
108
- struct lm_ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
109
- struct lm_ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
110
- };
111
-
112
- // TODO: make these methods of llama_context
113
- void llama_set_k_shift(struct llama_context & lctx);
114
-
115
- void llama_set_s_copy(struct llama_context & lctx);
116
-
117
- void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch);
118
-
119
- // Make sure enough space is available for outputs.
120
- // Returns max number of outputs for which space was reserved.
121
- size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs);
122
-
123
- // make the outputs have the same order they had in the user-provided batch
124
- void llama_output_reorder(struct llama_context & ctx);
125
-
126
- // For internal test use
127
- // TODO: remove
128
- const std::vector<std::pair<std::string, struct lm_ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx);
1
+ #pragma once
2
+
3
+ #include "llama.h"
4
+ #include "llama-batch.h"
5
+ #include "llama-cparams.h"
6
+ #include "llama-model.h"
7
+ #include "llama-kv-cache.h"
8
+ #include "llama-adapter.h"
9
+
10
+ #include "ggml-cpp.h"
11
+
12
+ #include <map>
13
+ #include <unordered_map>
14
+ #include <vector>
15
+ #include <set>
16
+
17
+ struct llama_context {
18
+ llama_context(const llama_model & model)
19
+ : model(model)
20
+ , t_start_us(model.t_start_us)
21
+ , t_load_us(model.t_load_us) {}
22
+
23
+ const struct llama_model & model;
24
+
25
+ struct llama_cparams cparams;
26
+ struct llama_sbatch sbatch; // TODO: revisit if needed
27
+ struct llama_kv_cache kv_self;
28
+ struct llama_adapter_cvec cvec;
29
+
30
+ std::unordered_map<struct llama_adapter_lora *, float> lora;
31
+
32
+ std::vector<lm_ggml_backend_ptr> backends;
33
+ std::vector<std::pair<lm_ggml_backend_t, lm_ggml_backend_set_n_threads_t>> set_n_threads_fns;
34
+
35
+ lm_ggml_backend_t backend_cpu = nullptr;
36
+
37
+ lm_ggml_threadpool_t threadpool = nullptr;
38
+ lm_ggml_threadpool_t threadpool_batch = nullptr;
39
+
40
+ bool has_evaluated_once = false;
41
+
42
+ mutable int64_t t_start_us;
43
+ mutable int64_t t_load_us;
44
+ mutable int64_t t_p_eval_us = 0;
45
+ mutable int64_t t_eval_us = 0;
46
+
47
+ mutable int64_t t_compute_start_us = 0;
48
+ mutable int64_t n_queued_tokens = 0;
49
+
50
+ mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
51
+ mutable int32_t n_eval = 0; // number of eval calls
52
+
53
+ // host buffer for the model output (logits and embeddings)
54
+ lm_ggml_backend_buffer_ptr buf_output;
55
+
56
+ // decode output (2-dimensional array: [n_outputs][n_vocab])
57
+ size_t logits_size = 0; // capacity (of floats) for logits
58
+ float * logits = nullptr;
59
+
60
+ std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
61
+ size_t output_size = 0; // capacity (of tokens positions) for the output buffers
62
+ int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
63
+
64
+ bool logits_all = false;
65
+
66
+ // embeddings output (2-dimensional array: [n_outputs][n_embd])
67
+ // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
68
+ size_t embd_size = 0; // capacity (of floats) for embeddings
69
+ float * embd = nullptr;
70
+
71
+ // sequence embeddings output (map of [n_embd] vectors)
72
+ // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
73
+ std::map<llama_seq_id, std::vector<float>> embd_seq;
74
+
75
+ // whether we are computing encoder output or decoder output
76
+ bool is_encoding = false;
77
+
78
+ // TODO: find a better way to accommodate mutli-dimension position encoding methods
79
+ // number of position id each token get, 1 for each token in most cases.
80
+ // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
81
+ int n_pos_per_token = 1;
82
+
83
+ // output of the encoder part of the encoder-decoder models
84
+ std::vector<float> embd_enc;
85
+ std::vector<std::set<llama_seq_id>> seq_ids_enc;
86
+
87
+ // memory buffers used to evaluate the model
88
+ std::vector<uint8_t> buf_compute_meta;
89
+ lm_ggml_backend_sched_ptr sched;
90
+
91
+ lm_ggml_abort_callback abort_callback = nullptr;
92
+ void * abort_callback_data = nullptr;
93
+
94
+ // input tensors
95
+ struct lm_ggml_tensor * inp_tokens; // I32 [n_batch]
96
+ struct lm_ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
97
+ struct lm_ggml_tensor * inp_pos; // I32 [n_batch]
98
+ struct lm_ggml_tensor * inp_out_ids; // I32 [n_outputs]
99
+ struct lm_ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
100
+ struct lm_ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
101
+ struct lm_ggml_tensor * inp_K_shift; // I32 [kv_size]
102
+ struct lm_ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
103
+ struct lm_ggml_tensor * inp_cls; // I32 [n_batch]
104
+ struct lm_ggml_tensor * inp_s_copy; // I32 [kv_size]
105
+ struct lm_ggml_tensor * inp_s_mask; // F32 [1, n_kv]
106
+ struct lm_ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
107
+ struct lm_ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
108
+ struct lm_ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
109
+ struct lm_ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
110
+ };
111
+
112
+ // TODO: make these methods of llama_context
113
+ void llama_set_k_shift(struct llama_context & lctx);
114
+
115
+ void llama_set_s_copy(struct llama_context & lctx);
116
+
117
+ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch);
118
+
119
+ // Make sure enough space is available for outputs.
120
+ // Returns max number of outputs for which space was reserved.
121
+ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs);
122
+
123
+ // make the outputs have the same order they had in the user-provided batch
124
+ void llama_output_reorder(struct llama_context & ctx);
125
+
126
+ // For internal test use
127
+ // TODO: remove
128
+ const std::vector<std::pair<std::string, struct lm_ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx);
@@ -1 +1 @@
1
- #include "llama-cparams.h"
1
+ #include "llama-cparams.h"
@@ -1,37 +1,37 @@
1
- #pragma once
2
-
3
- #include "llama.h"
4
-
5
- #include <cstdint>
6
-
7
- struct llama_cparams {
8
- uint32_t n_ctx; // context size used during inference
9
- uint32_t n_batch;
10
- uint32_t n_ubatch;
11
- uint32_t n_seq_max;
12
- int n_threads; // number of threads to use for generation
13
- int n_threads_batch; // number of threads to use for batch processing
14
-
15
- float rope_freq_base;
16
- float rope_freq_scale;
17
-
18
- uint32_t n_ctx_orig_yarn;
19
- // These hyperparameters are not exposed in GGUF, because all
20
- // existing YaRN models use the same values for them.
21
- float yarn_ext_factor;
22
- float yarn_attn_factor;
23
- float yarn_beta_fast;
24
- float yarn_beta_slow;
25
- float defrag_thold;
26
-
27
- bool embeddings;
28
- bool causal_attn;
29
- bool offload_kqv;
30
- bool flash_attn;
31
- bool no_perf;
32
-
33
- enum llama_pooling_type pooling_type;
34
-
35
- lm_ggml_backend_sched_eval_callback cb_eval;
36
- void * cb_eval_user_data;
37
- };
1
+ #pragma once
2
+
3
+ #include "llama.h"
4
+
5
+ #include <cstdint>
6
+
7
+ struct llama_cparams {
8
+ uint32_t n_ctx; // context size used during inference
9
+ uint32_t n_batch;
10
+ uint32_t n_ubatch;
11
+ uint32_t n_seq_max;
12
+ int n_threads; // number of threads to use for generation
13
+ int n_threads_batch; // number of threads to use for batch processing
14
+
15
+ float rope_freq_base;
16
+ float rope_freq_scale;
17
+
18
+ uint32_t n_ctx_orig_yarn;
19
+ // These hyperparameters are not exposed in GGUF, because all
20
+ // existing YaRN models use the same values for them.
21
+ float yarn_ext_factor;
22
+ float yarn_attn_factor;
23
+ float yarn_beta_fast;
24
+ float yarn_beta_slow;
25
+ float defrag_thold;
26
+
27
+ bool embeddings;
28
+ bool causal_attn;
29
+ bool offload_kqv;
30
+ bool flash_attn;
31
+ bool no_perf;
32
+
33
+ enum llama_pooling_type pooling_type;
34
+
35
+ lm_ggml_backend_sched_eval_callback cb_eval;
36
+ void * cb_eval_user_data;
37
+ };
package/cpp/llama-cpp.h CHANGED
@@ -1,30 +1,30 @@
1
- #pragma once
2
-
3
- #ifndef __cplusplus
4
- #error "This header is for C++ only"
5
- #endif
6
-
7
- #include <memory>
8
-
9
- #include "llama.h"
10
-
11
- struct llama_model_deleter {
12
- void operator()(llama_model * model) { llama_model_free(model); }
13
- };
14
-
15
- struct llama_context_deleter {
16
- void operator()(llama_context * context) { llama_free(context); }
17
- };
18
-
19
- struct llama_sampler_deleter {
20
- void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
21
- };
22
-
23
- struct llama_lora_adapter_deleter {
24
- void operator()(llama_lora_adapter * lora_adapter) { llama_lora_adapter_free(lora_adapter); }
25
- };
26
-
27
- typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
28
- typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
29
- typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
30
- typedef std::unique_ptr<llama_lora_adapter, llama_lora_adapter_deleter> llama_lora_adapter_ptr;
1
+ #pragma once
2
+
3
+ #ifndef __cplusplus
4
+ #error "This header is for C++ only"
5
+ #endif
6
+
7
+ #include <memory>
8
+
9
+ #include "llama.h"
10
+
11
+ struct llama_model_deleter {
12
+ void operator()(llama_model * model) { llama_model_free(model); }
13
+ };
14
+
15
+ struct llama_context_deleter {
16
+ void operator()(llama_context * context) { llama_free(context); }
17
+ };
18
+
19
+ struct llama_sampler_deleter {
20
+ void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
21
+ };
22
+
23
+ struct llama_adapter_lora_deleter {
24
+ void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
25
+ };
26
+
27
+ typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
28
+ typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
29
+ typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
30
+ typedef std::unique_ptr<llama_adapter_lora, llama_adapter_lora_deleter> llama_adapter_lora_ptr;