cui-llama.rn 1.3.6 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (101)
  1. package/README.md +22 -1
  2. package/android/src/main/CMakeLists.txt +25 -26
  3. package/android/src/main/java/com/rnllama/LlamaContext.java +31 -9
  4. package/android/src/main/java/com/rnllama/RNLlama.java +98 -0
  5. package/android/src/main/jni-utils.h +94 -0
  6. package/android/src/main/jni.cpp +133 -63
  7. package/android/src/newarch/java/com/rnllama/RNLlamaModule.java +15 -0
  8. package/android/src/oldarch/java/com/rnllama/RNLlamaModule.java +15 -0
  9. package/cpp/common.cpp +2085 -1982
  10. package/cpp/common.h +696 -664
  11. package/cpp/ggml-alloc.c +1042 -1037
  12. package/cpp/ggml-backend-impl.h +255 -256
  13. package/cpp/ggml-backend-reg.cpp +582 -582
  14. package/cpp/ggml-backend.cpp +2002 -2002
  15. package/cpp/ggml-backend.h +354 -352
  16. package/cpp/ggml-common.h +1853 -1853
  17. package/cpp/ggml-cpp.h +39 -39
  18. package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
  19. package/cpp/ggml-cpu-aarch64.h +8 -8
  20. package/cpp/ggml-cpu-impl.h +386 -386
  21. package/cpp/ggml-cpu-quants.c +10920 -10839
  22. package/cpp/ggml-cpu-traits.cpp +36 -36
  23. package/cpp/ggml-cpu-traits.h +38 -38
  24. package/cpp/ggml-cpu.c +14391 -14122
  25. package/cpp/ggml-cpu.cpp +635 -627
  26. package/cpp/ggml-cpu.h +135 -135
  27. package/cpp/ggml-impl.h +567 -567
  28. package/cpp/ggml-metal-impl.h +288 -0
  29. package/cpp/ggml-metal.m +4884 -4884
  30. package/cpp/ggml-opt.cpp +854 -0
  31. package/cpp/ggml-opt.h +216 -0
  32. package/cpp/ggml-quants.c +5238 -5238
  33. package/cpp/ggml-threading.h +14 -14
  34. package/cpp/ggml.c +6514 -6448
  35. package/cpp/ggml.h +2194 -2163
  36. package/cpp/gguf.cpp +1329 -1325
  37. package/cpp/gguf.h +202 -202
  38. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  39. package/cpp/json-schema-to-grammar.h +8 -8
  40. package/cpp/json.hpp +24766 -24766
  41. package/cpp/llama-adapter.cpp +347 -346
  42. package/cpp/llama-adapter.h +74 -73
  43. package/cpp/llama-arch.cpp +1487 -1434
  44. package/cpp/llama-arch.h +400 -395
  45. package/cpp/llama-batch.cpp +368 -368
  46. package/cpp/llama-batch.h +88 -88
  47. package/cpp/llama-chat.cpp +578 -567
  48. package/cpp/llama-chat.h +52 -51
  49. package/cpp/llama-context.cpp +1775 -1771
  50. package/cpp/llama-context.h +128 -128
  51. package/cpp/llama-cparams.cpp +1 -1
  52. package/cpp/llama-cparams.h +37 -37
  53. package/cpp/llama-cpp.h +30 -30
  54. package/cpp/llama-grammar.cpp +1139 -1139
  55. package/cpp/llama-grammar.h +143 -143
  56. package/cpp/llama-hparams.cpp +71 -71
  57. package/cpp/llama-hparams.h +139 -140
  58. package/cpp/llama-impl.cpp +167 -167
  59. package/cpp/llama-impl.h +61 -61
  60. package/cpp/llama-kv-cache.cpp +718 -718
  61. package/cpp/llama-kv-cache.h +218 -218
  62. package/cpp/llama-mmap.cpp +590 -589
  63. package/cpp/llama-mmap.h +67 -67
  64. package/cpp/llama-model-loader.cpp +1124 -1011
  65. package/cpp/llama-model-loader.h +167 -158
  66. package/cpp/llama-model.cpp +3997 -2202
  67. package/cpp/llama-model.h +370 -391
  68. package/cpp/llama-sampling.cpp +2408 -2406
  69. package/cpp/llama-sampling.h +32 -48
  70. package/cpp/llama-vocab.cpp +3247 -1982
  71. package/cpp/llama-vocab.h +125 -182
  72. package/cpp/llama.cpp +10077 -12544
  73. package/cpp/llama.h +1323 -1285
  74. package/cpp/log.cpp +401 -401
  75. package/cpp/log.h +121 -121
  76. package/cpp/rn-llama.hpp +123 -116
  77. package/cpp/sampling.cpp +505 -500
  78. package/cpp/sgemm.cpp +2597 -2597
  79. package/cpp/sgemm.h +14 -14
  80. package/cpp/speculative.cpp +277 -274
  81. package/cpp/speculative.h +28 -28
  82. package/cpp/unicode.cpp +2 -3
  83. package/ios/RNLlama.mm +47 -0
  84. package/ios/RNLlamaContext.h +3 -1
  85. package/ios/RNLlamaContext.mm +71 -14
  86. package/jest/mock.js +15 -3
  87. package/lib/commonjs/NativeRNLlama.js.map +1 -1
  88. package/lib/commonjs/index.js +33 -37
  89. package/lib/commonjs/index.js.map +1 -1
  90. package/lib/module/NativeRNLlama.js.map +1 -1
  91. package/lib/module/index.js +31 -35
  92. package/lib/module/index.js.map +1 -1
  93. package/lib/typescript/NativeRNLlama.d.ts +26 -6
  94. package/lib/typescript/NativeRNLlama.d.ts.map +1 -1
  95. package/lib/typescript/index.d.ts +21 -36
  96. package/lib/typescript/index.d.ts.map +1 -1
  97. package/llama-rn.podspec +4 -18
  98. package/package.json +2 -3
  99. package/src/NativeRNLlama.ts +32 -13
  100. package/src/index.ts +52 -47
  101. package/cpp/llama.cpp.rej +0 -23
@@ -1,128 +1,128 @@
1
- #pragma once
2
-
3
- #include "llama.h"
4
- #include "llama-batch.h"
5
- #include "llama-cparams.h"
6
- #include "llama-model.h"
7
- #include "llama-kv-cache.h"
8
- #include "llama-adapter.h"
9
-
10
- #include "ggml-cpp.h"
11
-
12
- #include <map>
13
- #include <unordered_map>
14
- #include <vector>
15
- #include <set>
16
-
17
- struct llama_context {
18
- llama_context(const llama_model & model)
19
- : model(model)
20
- , t_start_us(model.t_start_us)
21
- , t_load_us(model.t_load_us) {}
22
-
23
- const struct llama_model & model;
24
-
25
- struct llama_cparams cparams;
26
- struct llama_sbatch sbatch; // TODO: revisit if needed
27
- struct llama_kv_cache kv_self;
28
- struct llama_control_vector cvec;
29
-
30
- std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
31
-
32
- std::vector<lm_ggml_backend_ptr> backends;
33
- std::vector<std::pair<lm_ggml_backend_t, lm_ggml_backend_set_n_threads_t>> set_n_threads_fns;
34
-
35
- lm_ggml_backend_t backend_cpu = nullptr;
36
-
37
- lm_ggml_threadpool_t threadpool = nullptr;
38
- lm_ggml_threadpool_t threadpool_batch = nullptr;
39
-
40
- bool has_evaluated_once = false;
41
-
42
- mutable int64_t t_start_us;
43
- mutable int64_t t_load_us;
44
- mutable int64_t t_p_eval_us = 0;
45
- mutable int64_t t_eval_us = 0;
46
-
47
- mutable int64_t t_compute_start_us = 0;
48
- mutable int64_t n_queued_tokens = 0;
49
-
50
- mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
51
- mutable int32_t n_eval = 0; // number of eval calls
52
-
53
- // host buffer for the model output (logits and embeddings)
54
- lm_ggml_backend_buffer_ptr buf_output;
55
-
56
- // decode output (2-dimensional array: [n_outputs][n_vocab])
57
- size_t logits_size = 0; // capacity (of floats) for logits
58
- float * logits = nullptr;
59
-
60
- std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
61
- size_t output_size = 0; // capacity (of tokens positions) for the output buffers
62
- int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
63
-
64
- bool logits_all = false;
65
-
66
- // embeddings output (2-dimensional array: [n_outputs][n_embd])
67
- // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
68
- size_t embd_size = 0; // capacity (of floats) for embeddings
69
- float * embd = nullptr;
70
-
71
- // sequence embeddings output (map of [n_embd] vectors)
72
- // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
73
- std::map<llama_seq_id, std::vector<float>> embd_seq;
74
-
75
- // whether we are computing encoder output or decoder output
76
- bool is_encoding = false;
77
-
78
- // TODO: find a better way to accommodate mutli-dimension position encoding methods
79
- // number of position id each token get, 1 for each token in most cases.
80
- // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
81
- int n_pos_per_token = 1;
82
-
83
- // output of the encoder part of the encoder-decoder models
84
- std::vector<float> embd_enc;
85
- std::vector<std::set<llama_seq_id>> seq_ids_enc;
86
-
87
- // memory buffers used to evaluate the model
88
- std::vector<uint8_t> buf_compute_meta;
89
- lm_ggml_backend_sched_ptr sched;
90
-
91
- lm_ggml_abort_callback abort_callback = nullptr;
92
- void * abort_callback_data = nullptr;
93
-
94
- // input tensors
95
- struct lm_ggml_tensor * inp_tokens; // I32 [n_batch]
96
- struct lm_ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
97
- struct lm_ggml_tensor * inp_pos; // I32 [n_batch]
98
- struct lm_ggml_tensor * inp_out_ids; // I32 [n_outputs]
99
- struct lm_ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
100
- struct lm_ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
101
- struct lm_ggml_tensor * inp_K_shift; // I32 [kv_size]
102
- struct lm_ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
103
- struct lm_ggml_tensor * inp_cls; // I32 [n_batch]
104
- struct lm_ggml_tensor * inp_s_copy; // I32 [kv_size]
105
- struct lm_ggml_tensor * inp_s_mask; // F32 [1, n_kv]
106
- struct lm_ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
107
- struct lm_ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
108
- struct lm_ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
109
- struct lm_ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
110
- };
111
-
112
- // TODO: make these methods of llama_context
113
- void llama_set_k_shift(struct llama_context & lctx);
114
-
115
- void llama_set_s_copy(struct llama_context & lctx);
116
-
117
- void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch);
118
-
119
- // Make sure enough space is available for outputs.
120
- // Returns max number of outputs for which space was reserved.
121
- size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs);
122
-
123
- // make the outputs have the same order they had in the user-provided batch
124
- void llama_output_reorder(struct llama_context & ctx);
125
-
126
- // For internal test use
127
- // TODO: remove
128
- const std::vector<std::pair<std::string, struct lm_ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx);
1
+ #pragma once
2
+
3
+ #include "llama.h"
4
+ #include "llama-batch.h"
5
+ #include "llama-cparams.h"
6
+ #include "llama-model.h"
7
+ #include "llama-kv-cache.h"
8
+ #include "llama-adapter.h"
9
+
10
+ #include "ggml-cpp.h"
11
+
12
+ #include <map>
13
+ #include <unordered_map>
14
+ #include <vector>
15
+ #include <set>
16
+
17
+ struct llama_context {
18
+ llama_context(const llama_model & model)
19
+ : model(model)
20
+ , t_start_us(model.t_start_us)
21
+ , t_load_us(model.t_load_us) {}
22
+
23
+ const struct llama_model & model;
24
+
25
+ struct llama_cparams cparams;
26
+ struct llama_sbatch sbatch; // TODO: revisit if needed
27
+ struct llama_kv_cache kv_self;
28
+ struct llama_adapter_cvec cvec;
29
+
30
+ std::unordered_map<struct llama_adapter_lora *, float> lora;
31
+
32
+ std::vector<lm_ggml_backend_ptr> backends;
33
+ std::vector<std::pair<lm_ggml_backend_t, lm_ggml_backend_set_n_threads_t>> set_n_threads_fns;
34
+
35
+ lm_ggml_backend_t backend_cpu = nullptr;
36
+
37
+ lm_ggml_threadpool_t threadpool = nullptr;
38
+ lm_ggml_threadpool_t threadpool_batch = nullptr;
39
+
40
+ bool has_evaluated_once = false;
41
+
42
+ mutable int64_t t_start_us;
43
+ mutable int64_t t_load_us;
44
+ mutable int64_t t_p_eval_us = 0;
45
+ mutable int64_t t_eval_us = 0;
46
+
47
+ mutable int64_t t_compute_start_us = 0;
48
+ mutable int64_t n_queued_tokens = 0;
49
+
50
+ mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
51
+ mutable int32_t n_eval = 0; // number of eval calls
52
+
53
+ // host buffer for the model output (logits and embeddings)
54
+ lm_ggml_backend_buffer_ptr buf_output;
55
+
56
+ // decode output (2-dimensional array: [n_outputs][n_vocab])
57
+ size_t logits_size = 0; // capacity (of floats) for logits
58
+ float * logits = nullptr;
59
+
60
+ std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
61
+ size_t output_size = 0; // capacity (of tokens positions) for the output buffers
62
+ int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
63
+
64
+ bool logits_all = false;
65
+
66
+ // embeddings output (2-dimensional array: [n_outputs][n_embd])
67
+ // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
68
+ size_t embd_size = 0; // capacity (of floats) for embeddings
69
+ float * embd = nullptr;
70
+
71
+ // sequence embeddings output (map of [n_embd] vectors)
72
+ // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
73
+ std::map<llama_seq_id, std::vector<float>> embd_seq;
74
+
75
+ // whether we are computing encoder output or decoder output
76
+ bool is_encoding = false;
77
+
78
+ // TODO: find a better way to accommodate mutli-dimension position encoding methods
79
+ // number of position id each token get, 1 for each token in most cases.
80
+ // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
81
+ int n_pos_per_token = 1;
82
+
83
+ // output of the encoder part of the encoder-decoder models
84
+ std::vector<float> embd_enc;
85
+ std::vector<std::set<llama_seq_id>> seq_ids_enc;
86
+
87
+ // memory buffers used to evaluate the model
88
+ std::vector<uint8_t> buf_compute_meta;
89
+ lm_ggml_backend_sched_ptr sched;
90
+
91
+ lm_ggml_abort_callback abort_callback = nullptr;
92
+ void * abort_callback_data = nullptr;
93
+
94
+ // input tensors
95
+ struct lm_ggml_tensor * inp_tokens; // I32 [n_batch]
96
+ struct lm_ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
97
+ struct lm_ggml_tensor * inp_pos; // I32 [n_batch]
98
+ struct lm_ggml_tensor * inp_out_ids; // I32 [n_outputs]
99
+ struct lm_ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
100
+ struct lm_ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
101
+ struct lm_ggml_tensor * inp_K_shift; // I32 [kv_size]
102
+ struct lm_ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
103
+ struct lm_ggml_tensor * inp_cls; // I32 [n_batch]
104
+ struct lm_ggml_tensor * inp_s_copy; // I32 [kv_size]
105
+ struct lm_ggml_tensor * inp_s_mask; // F32 [1, n_kv]
106
+ struct lm_ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
107
+ struct lm_ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
108
+ struct lm_ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
109
+ struct lm_ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
110
+ };
111
+
112
+ // TODO: make these methods of llama_context
113
+ void llama_set_k_shift(struct llama_context & lctx);
114
+
115
+ void llama_set_s_copy(struct llama_context & lctx);
116
+
117
+ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch);
118
+
119
+ // Make sure enough space is available for outputs.
120
+ // Returns max number of outputs for which space was reserved.
121
+ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs);
122
+
123
+ // make the outputs have the same order they had in the user-provided batch
124
+ void llama_output_reorder(struct llama_context & ctx);
125
+
126
+ // For internal test use
127
+ // TODO: remove
128
+ const std::vector<std::pair<std::string, struct lm_ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx);
@@ -1 +1 @@
1
- #include "llama-cparams.h"
1
+ #include "llama-cparams.h"
@@ -1,37 +1,37 @@
1
- #pragma once
2
-
3
- #include "llama.h"
4
-
5
- #include <cstdint>
6
-
7
- struct llama_cparams {
8
- uint32_t n_ctx; // context size used during inference
9
- uint32_t n_batch;
10
- uint32_t n_ubatch;
11
- uint32_t n_seq_max;
12
- int n_threads; // number of threads to use for generation
13
- int n_threads_batch; // number of threads to use for batch processing
14
-
15
- float rope_freq_base;
16
- float rope_freq_scale;
17
-
18
- uint32_t n_ctx_orig_yarn;
19
- // These hyperparameters are not exposed in GGUF, because all
20
- // existing YaRN models use the same values for them.
21
- float yarn_ext_factor;
22
- float yarn_attn_factor;
23
- float yarn_beta_fast;
24
- float yarn_beta_slow;
25
- float defrag_thold;
26
-
27
- bool embeddings;
28
- bool causal_attn;
29
- bool offload_kqv;
30
- bool flash_attn;
31
- bool no_perf;
32
-
33
- enum llama_pooling_type pooling_type;
34
-
35
- lm_ggml_backend_sched_eval_callback cb_eval;
36
- void * cb_eval_user_data;
37
- };
1
+ #pragma once
2
+
3
+ #include "llama.h"
4
+
5
+ #include <cstdint>
6
+
7
+ struct llama_cparams {
8
+ uint32_t n_ctx; // context size used during inference
9
+ uint32_t n_batch;
10
+ uint32_t n_ubatch;
11
+ uint32_t n_seq_max;
12
+ int n_threads; // number of threads to use for generation
13
+ int n_threads_batch; // number of threads to use for batch processing
14
+
15
+ float rope_freq_base;
16
+ float rope_freq_scale;
17
+
18
+ uint32_t n_ctx_orig_yarn;
19
+ // These hyperparameters are not exposed in GGUF, because all
20
+ // existing YaRN models use the same values for them.
21
+ float yarn_ext_factor;
22
+ float yarn_attn_factor;
23
+ float yarn_beta_fast;
24
+ float yarn_beta_slow;
25
+ float defrag_thold;
26
+
27
+ bool embeddings;
28
+ bool causal_attn;
29
+ bool offload_kqv;
30
+ bool flash_attn;
31
+ bool no_perf;
32
+
33
+ enum llama_pooling_type pooling_type;
34
+
35
+ lm_ggml_backend_sched_eval_callback cb_eval;
36
+ void * cb_eval_user_data;
37
+ };
package/cpp/llama-cpp.h CHANGED
@@ -1,30 +1,30 @@
1
- #pragma once
2
-
3
- #ifndef __cplusplus
4
- #error "This header is for C++ only"
5
- #endif
6
-
7
- #include <memory>
8
-
9
- #include "llama.h"
10
-
11
- struct llama_model_deleter {
12
- void operator()(llama_model * model) { llama_model_free(model); }
13
- };
14
-
15
- struct llama_context_deleter {
16
- void operator()(llama_context * context) { llama_free(context); }
17
- };
18
-
19
- struct llama_sampler_deleter {
20
- void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
21
- };
22
-
23
- struct llama_lora_adapter_deleter {
24
- void operator()(llama_lora_adapter * lora_adapter) { llama_lora_adapter_free(lora_adapter); }
25
- };
26
-
27
- typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
28
- typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
29
- typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
30
- typedef std::unique_ptr<llama_lora_adapter, llama_lora_adapter_deleter> llama_lora_adapter_ptr;
1
+ #pragma once
2
+
3
+ #ifndef __cplusplus
4
+ #error "This header is for C++ only"
5
+ #endif
6
+
7
+ #include <memory>
8
+
9
+ #include "llama.h"
10
+
11
+ struct llama_model_deleter {
12
+ void operator()(llama_model * model) { llama_model_free(model); }
13
+ };
14
+
15
+ struct llama_context_deleter {
16
+ void operator()(llama_context * context) { llama_free(context); }
17
+ };
18
+
19
+ struct llama_sampler_deleter {
20
+ void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
21
+ };
22
+
23
+ struct llama_adapter_lora_deleter {
24
+ void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
25
+ };
26
+
27
+ typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
28
+ typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
29
+ typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
30
+ typedef std::unique_ptr<llama_adapter_lora, llama_adapter_lora_deleter> llama_adapter_lora_ptr;