cui-llama.rn 1.4.0 → 1.4.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (73)
  1. package/android/src/main/jni.cpp +9 -9
  2. package/cpp/common.cpp +163 -60
  3. package/cpp/common.h +43 -12
  4. package/cpp/ggml-alloc.c +1042 -1037
  5. package/cpp/ggml-backend-impl.h +255 -256
  6. package/cpp/ggml-backend-reg.cpp +582 -582
  7. package/cpp/ggml-backend.cpp +2002 -2002
  8. package/cpp/ggml-backend.h +354 -352
  9. package/cpp/ggml-common.h +1853 -1853
  10. package/cpp/ggml-cpp.h +39 -39
  11. package/cpp/ggml-cpu-aarch64.cpp +4247 -4247
  12. package/cpp/ggml-cpu-aarch64.h +8 -8
  13. package/cpp/ggml-cpu-impl.h +386 -386
  14. package/cpp/ggml-cpu-quants.c +10920 -10839
  15. package/cpp/ggml-cpu-traits.cpp +36 -36
  16. package/cpp/ggml-cpu-traits.h +38 -38
  17. package/cpp/ggml-cpu.c +329 -60
  18. package/cpp/ggml-cpu.cpp +10 -2
  19. package/cpp/ggml-cpu.h +135 -135
  20. package/cpp/ggml-impl.h +567 -567
  21. package/cpp/ggml-metal-impl.h +17 -17
  22. package/cpp/ggml-metal.m +4884 -4884
  23. package/cpp/ggml-quants.c +5238 -5238
  24. package/cpp/ggml-threading.h +14 -14
  25. package/cpp/ggml.c +6514 -6448
  26. package/cpp/ggml.h +2194 -2163
  27. package/cpp/gguf.cpp +1329 -1325
  28. package/cpp/gguf.h +202 -202
  29. package/cpp/json-schema-to-grammar.cpp +1045 -1045
  30. package/cpp/json-schema-to-grammar.h +8 -8
  31. package/cpp/json.hpp +24766 -24766
  32. package/cpp/llama-adapter.cpp +347 -346
  33. package/cpp/llama-adapter.h +74 -73
  34. package/cpp/llama-arch.cpp +1487 -1434
  35. package/cpp/llama-arch.h +400 -395
  36. package/cpp/llama-batch.cpp +368 -368
  37. package/cpp/llama-batch.h +88 -88
  38. package/cpp/llama-chat.cpp +578 -567
  39. package/cpp/llama-chat.h +52 -51
  40. package/cpp/llama-context.cpp +1775 -1771
  41. package/cpp/llama-context.h +128 -128
  42. package/cpp/llama-cparams.cpp +1 -1
  43. package/cpp/llama-cparams.h +37 -37
  44. package/cpp/llama-cpp.h +30 -30
  45. package/cpp/llama-grammar.cpp +1139 -1139
  46. package/cpp/llama-grammar.h +143 -143
  47. package/cpp/llama-hparams.cpp +71 -71
  48. package/cpp/llama-hparams.h +139 -140
  49. package/cpp/llama-impl.cpp +167 -167
  50. package/cpp/llama-impl.h +61 -61
  51. package/cpp/llama-kv-cache.cpp +718 -718
  52. package/cpp/llama-kv-cache.h +218 -218
  53. package/cpp/llama-mmap.cpp +2 -1
  54. package/cpp/llama-mmap.h +67 -67
  55. package/cpp/llama-model-loader.cpp +1124 -1011
  56. package/cpp/llama-model-loader.h +167 -158
  57. package/cpp/llama-model.cpp +3997 -2202
  58. package/cpp/llama-model.h +370 -391
  59. package/cpp/llama-sampling.cpp +2408 -2406
  60. package/cpp/llama-sampling.h +32 -48
  61. package/cpp/llama-vocab.cpp +3247 -1982
  62. package/cpp/llama-vocab.h +125 -182
  63. package/cpp/llama.cpp +416 -2886
  64. package/cpp/llama.h +1323 -1285
  65. package/cpp/log.cpp +401 -401
  66. package/cpp/log.h +121 -121
  67. package/cpp/rn-llama.hpp +18 -12
  68. package/cpp/sampling.cpp +505 -500
  69. package/cpp/sgemm.cpp +2597 -2597
  70. package/cpp/speculative.cpp +277 -274
  71. package/cpp/speculative.h +28 -28
  72. package/cpp/unicode.cpp +2 -3
  73. package/package.json +1 -1
@@ -1,128 +1,128 @@
1
- #pragma once
2
-
3
- #include "llama.h"
4
- #include "llama-batch.h"
5
- #include "llama-cparams.h"
6
- #include "llama-model.h"
7
- #include "llama-kv-cache.h"
8
- #include "llama-adapter.h"
9
-
10
- #include "ggml-cpp.h"
11
-
12
- #include <map>
13
- #include <unordered_map>
14
- #include <vector>
15
- #include <set>
16
-
17
- struct llama_context {
18
- llama_context(const llama_model & model)
19
- : model(model)
20
- , t_start_us(model.t_start_us)
21
- , t_load_us(model.t_load_us) {}
22
-
23
- const struct llama_model & model;
24
-
25
- struct llama_cparams cparams;
26
- struct llama_sbatch sbatch; // TODO: revisit if needed
27
- struct llama_kv_cache kv_self;
28
- struct llama_control_vector cvec;
29
-
30
- std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
31
-
32
- std::vector<lm_ggml_backend_ptr> backends;
33
- std::vector<std::pair<lm_ggml_backend_t, lm_ggml_backend_set_n_threads_t>> set_n_threads_fns;
34
-
35
- lm_ggml_backend_t backend_cpu = nullptr;
36
-
37
- lm_ggml_threadpool_t threadpool = nullptr;
38
- lm_ggml_threadpool_t threadpool_batch = nullptr;
39
-
40
- bool has_evaluated_once = false;
41
-
42
- mutable int64_t t_start_us;
43
- mutable int64_t t_load_us;
44
- mutable int64_t t_p_eval_us = 0;
45
- mutable int64_t t_eval_us = 0;
46
-
47
- mutable int64_t t_compute_start_us = 0;
48
- mutable int64_t n_queued_tokens = 0;
49
-
50
- mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
51
- mutable int32_t n_eval = 0; // number of eval calls
52
-
53
- // host buffer for the model output (logits and embeddings)
54
- lm_ggml_backend_buffer_ptr buf_output;
55
-
56
- // decode output (2-dimensional array: [n_outputs][n_vocab])
57
- size_t logits_size = 0; // capacity (of floats) for logits
58
- float * logits = nullptr;
59
-
60
- std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
61
- size_t output_size = 0; // capacity (of tokens positions) for the output buffers
62
- int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
63
-
64
- bool logits_all = false;
65
-
66
- // embeddings output (2-dimensional array: [n_outputs][n_embd])
67
- // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
68
- size_t embd_size = 0; // capacity (of floats) for embeddings
69
- float * embd = nullptr;
70
-
71
- // sequence embeddings output (map of [n_embd] vectors)
72
- // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
73
- std::map<llama_seq_id, std::vector<float>> embd_seq;
74
-
75
- // whether we are computing encoder output or decoder output
76
- bool is_encoding = false;
77
-
78
- // TODO: find a better way to accommodate mutli-dimension position encoding methods
79
- // number of position id each token get, 1 for each token in most cases.
80
- // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
81
- int n_pos_per_token = 1;
82
-
83
- // output of the encoder part of the encoder-decoder models
84
- std::vector<float> embd_enc;
85
- std::vector<std::set<llama_seq_id>> seq_ids_enc;
86
-
87
- // memory buffers used to evaluate the model
88
- std::vector<uint8_t> buf_compute_meta;
89
- lm_ggml_backend_sched_ptr sched;
90
-
91
- lm_ggml_abort_callback abort_callback = nullptr;
92
- void * abort_callback_data = nullptr;
93
-
94
- // input tensors
95
- struct lm_ggml_tensor * inp_tokens; // I32 [n_batch]
96
- struct lm_ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
97
- struct lm_ggml_tensor * inp_pos; // I32 [n_batch]
98
- struct lm_ggml_tensor * inp_out_ids; // I32 [n_outputs]
99
- struct lm_ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
100
- struct lm_ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
101
- struct lm_ggml_tensor * inp_K_shift; // I32 [kv_size]
102
- struct lm_ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
103
- struct lm_ggml_tensor * inp_cls; // I32 [n_batch]
104
- struct lm_ggml_tensor * inp_s_copy; // I32 [kv_size]
105
- struct lm_ggml_tensor * inp_s_mask; // F32 [1, n_kv]
106
- struct lm_ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
107
- struct lm_ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
108
- struct lm_ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
109
- struct lm_ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
110
- };
111
-
112
- // TODO: make these methods of llama_context
113
- void llama_set_k_shift(struct llama_context & lctx);
114
-
115
- void llama_set_s_copy(struct llama_context & lctx);
116
-
117
- void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch);
118
-
119
- // Make sure enough space is available for outputs.
120
- // Returns max number of outputs for which space was reserved.
121
- size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs);
122
-
123
- // make the outputs have the same order they had in the user-provided batch
124
- void llama_output_reorder(struct llama_context & ctx);
125
-
126
- // For internal test use
127
- // TODO: remove
128
- const std::vector<std::pair<std::string, struct lm_ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx);
1
+ #pragma once
2
+
3
+ #include "llama.h"
4
+ #include "llama-batch.h"
5
+ #include "llama-cparams.h"
6
+ #include "llama-model.h"
7
+ #include "llama-kv-cache.h"
8
+ #include "llama-adapter.h"
9
+
10
+ #include "ggml-cpp.h"
11
+
12
+ #include <map>
13
+ #include <unordered_map>
14
+ #include <vector>
15
+ #include <set>
16
+
17
+ struct llama_context {
18
+ llama_context(const llama_model & model)
19
+ : model(model)
20
+ , t_start_us(model.t_start_us)
21
+ , t_load_us(model.t_load_us) {}
22
+
23
+ const struct llama_model & model;
24
+
25
+ struct llama_cparams cparams;
26
+ struct llama_sbatch sbatch; // TODO: revisit if needed
27
+ struct llama_kv_cache kv_self;
28
+ struct llama_adapter_cvec cvec;
29
+
30
+ std::unordered_map<struct llama_adapter_lora *, float> lora;
31
+
32
+ std::vector<lm_ggml_backend_ptr> backends;
33
+ std::vector<std::pair<lm_ggml_backend_t, lm_ggml_backend_set_n_threads_t>> set_n_threads_fns;
34
+
35
+ lm_ggml_backend_t backend_cpu = nullptr;
36
+
37
+ lm_ggml_threadpool_t threadpool = nullptr;
38
+ lm_ggml_threadpool_t threadpool_batch = nullptr;
39
+
40
+ bool has_evaluated_once = false;
41
+
42
+ mutable int64_t t_start_us;
43
+ mutable int64_t t_load_us;
44
+ mutable int64_t t_p_eval_us = 0;
45
+ mutable int64_t t_eval_us = 0;
46
+
47
+ mutable int64_t t_compute_start_us = 0;
48
+ mutable int64_t n_queued_tokens = 0;
49
+
50
+ mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
51
+ mutable int32_t n_eval = 0; // number of eval calls
52
+
53
+ // host buffer for the model output (logits and embeddings)
54
+ lm_ggml_backend_buffer_ptr buf_output;
55
+
56
+ // decode output (2-dimensional array: [n_outputs][n_vocab])
57
+ size_t logits_size = 0; // capacity (of floats) for logits
58
+ float * logits = nullptr;
59
+
60
+ std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
61
+ size_t output_size = 0; // capacity (of tokens positions) for the output buffers
62
+ int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
63
+
64
+ bool logits_all = false;
65
+
66
+ // embeddings output (2-dimensional array: [n_outputs][n_embd])
67
+ // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
68
+ size_t embd_size = 0; // capacity (of floats) for embeddings
69
+ float * embd = nullptr;
70
+
71
+ // sequence embeddings output (map of [n_embd] vectors)
72
+ // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
73
+ std::map<llama_seq_id, std::vector<float>> embd_seq;
74
+
75
+ // whether we are computing encoder output or decoder output
76
+ bool is_encoding = false;
77
+
78
+ // TODO: find a better way to accommodate mutli-dimension position encoding methods
79
+ // number of position id each token get, 1 for each token in most cases.
80
+ // when using m-rope, it will be 3 position ids per token to representing 3 dimension coordinate.
81
+ int n_pos_per_token = 1;
82
+
83
+ // output of the encoder part of the encoder-decoder models
84
+ std::vector<float> embd_enc;
85
+ std::vector<std::set<llama_seq_id>> seq_ids_enc;
86
+
87
+ // memory buffers used to evaluate the model
88
+ std::vector<uint8_t> buf_compute_meta;
89
+ lm_ggml_backend_sched_ptr sched;
90
+
91
+ lm_ggml_abort_callback abort_callback = nullptr;
92
+ void * abort_callback_data = nullptr;
93
+
94
+ // input tensors
95
+ struct lm_ggml_tensor * inp_tokens; // I32 [n_batch]
96
+ struct lm_ggml_tensor * inp_embd; // F32 [n_embd, n_batch]
97
+ struct lm_ggml_tensor * inp_pos; // I32 [n_batch]
98
+ struct lm_ggml_tensor * inp_out_ids; // I32 [n_outputs]
99
+ struct lm_ggml_tensor * inp_KQ_mask; // F32 [kv_size, n_batch]
100
+ struct lm_ggml_tensor * inp_KQ_mask_swa; // F32 [kv_size, n_batch]
101
+ struct lm_ggml_tensor * inp_K_shift; // I32 [kv_size]
102
+ struct lm_ggml_tensor * inp_mean; // F32 [n_batch, n_batch]
103
+ struct lm_ggml_tensor * inp_cls; // I32 [n_batch]
104
+ struct lm_ggml_tensor * inp_s_copy; // I32 [kv_size]
105
+ struct lm_ggml_tensor * inp_s_mask; // F32 [1, n_kv]
106
+ struct lm_ggml_tensor * inp_s_seq; // I32 [n_kv, n_batch]
107
+ struct lm_ggml_tensor * inp_pos_bucket; // I32 [n_batch|n_kv, n_batch]
108
+ struct lm_ggml_tensor * inp_embd_enc; // F32 [n_embd, n_outputs_enc]
109
+ struct lm_ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
110
+ };
111
+
112
+ // TODO: make these methods of llama_context
113
+ void llama_set_k_shift(struct llama_context & lctx);
114
+
115
+ void llama_set_s_copy(struct llama_context & lctx);
116
+
117
+ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch);
118
+
119
+ // Make sure enough space is available for outputs.
120
+ // Returns max number of outputs for which space was reserved.
121
+ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs);
122
+
123
+ // make the outputs have the same order they had in the user-provided batch
124
+ void llama_output_reorder(struct llama_context & ctx);
125
+
126
+ // For internal test use
127
+ // TODO: remove
128
+ const std::vector<std::pair<std::string, struct lm_ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx);
@@ -1 +1 @@
1
- #include "llama-cparams.h"
1
+ #include "llama-cparams.h"
@@ -1,37 +1,37 @@
1
- #pragma once
2
-
3
- #include "llama.h"
4
-
5
- #include <cstdint>
6
-
7
- struct llama_cparams {
8
- uint32_t n_ctx; // context size used during inference
9
- uint32_t n_batch;
10
- uint32_t n_ubatch;
11
- uint32_t n_seq_max;
12
- int n_threads; // number of threads to use for generation
13
- int n_threads_batch; // number of threads to use for batch processing
14
-
15
- float rope_freq_base;
16
- float rope_freq_scale;
17
-
18
- uint32_t n_ctx_orig_yarn;
19
- // These hyperparameters are not exposed in GGUF, because all
20
- // existing YaRN models use the same values for them.
21
- float yarn_ext_factor;
22
- float yarn_attn_factor;
23
- float yarn_beta_fast;
24
- float yarn_beta_slow;
25
- float defrag_thold;
26
-
27
- bool embeddings;
28
- bool causal_attn;
29
- bool offload_kqv;
30
- bool flash_attn;
31
- bool no_perf;
32
-
33
- enum llama_pooling_type pooling_type;
34
-
35
- lm_ggml_backend_sched_eval_callback cb_eval;
36
- void * cb_eval_user_data;
37
- };
1
+ #pragma once
2
+
3
+ #include "llama.h"
4
+
5
+ #include <cstdint>
6
+
7
+ struct llama_cparams {
8
+ uint32_t n_ctx; // context size used during inference
9
+ uint32_t n_batch;
10
+ uint32_t n_ubatch;
11
+ uint32_t n_seq_max;
12
+ int n_threads; // number of threads to use for generation
13
+ int n_threads_batch; // number of threads to use for batch processing
14
+
15
+ float rope_freq_base;
16
+ float rope_freq_scale;
17
+
18
+ uint32_t n_ctx_orig_yarn;
19
+ // These hyperparameters are not exposed in GGUF, because all
20
+ // existing YaRN models use the same values for them.
21
+ float yarn_ext_factor;
22
+ float yarn_attn_factor;
23
+ float yarn_beta_fast;
24
+ float yarn_beta_slow;
25
+ float defrag_thold;
26
+
27
+ bool embeddings;
28
+ bool causal_attn;
29
+ bool offload_kqv;
30
+ bool flash_attn;
31
+ bool no_perf;
32
+
33
+ enum llama_pooling_type pooling_type;
34
+
35
+ lm_ggml_backend_sched_eval_callback cb_eval;
36
+ void * cb_eval_user_data;
37
+ };
package/cpp/llama-cpp.h CHANGED
@@ -1,30 +1,30 @@
1
- #pragma once
2
-
3
- #ifndef __cplusplus
4
- #error "This header is for C++ only"
5
- #endif
6
-
7
- #include <memory>
8
-
9
- #include "llama.h"
10
-
11
- struct llama_model_deleter {
12
- void operator()(llama_model * model) { llama_model_free(model); }
13
- };
14
-
15
- struct llama_context_deleter {
16
- void operator()(llama_context * context) { llama_free(context); }
17
- };
18
-
19
- struct llama_sampler_deleter {
20
- void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
21
- };
22
-
23
- struct llama_lora_adapter_deleter {
24
- void operator()(llama_lora_adapter * lora_adapter) { llama_lora_adapter_free(lora_adapter); }
25
- };
26
-
27
- typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
28
- typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
29
- typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
30
- typedef std::unique_ptr<llama_lora_adapter, llama_lora_adapter_deleter> llama_lora_adapter_ptr;
1
+ #pragma once
2
+
3
+ #ifndef __cplusplus
4
+ #error "This header is for C++ only"
5
+ #endif
6
+
7
+ #include <memory>
8
+
9
+ #include "llama.h"
10
+
11
+ struct llama_model_deleter {
12
+ void operator()(llama_model * model) { llama_model_free(model); }
13
+ };
14
+
15
+ struct llama_context_deleter {
16
+ void operator()(llama_context * context) { llama_free(context); }
17
+ };
18
+
19
+ struct llama_sampler_deleter {
20
+ void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
21
+ };
22
+
23
+ struct llama_adapter_lora_deleter {
24
+ void operator()(llama_adapter_lora * adapter) { llama_adapter_lora_free(adapter); }
25
+ };
26
+
27
+ typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
28
+ typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
29
+ typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
30
+ typedef std::unique_ptr<llama_adapter_lora, llama_adapter_lora_deleter> llama_adapter_lora_ptr;