cui-llama.rn 1.3.5 → 1.3.6

This diff shows the changes between publicly released package versions as they appear in their respective public registries, and is provided for informational purposes only.
Files changed (52)
  1. package/android/src/main/CMakeLists.txt +14 -8
  2. package/android/src/main/jni.cpp +38 -37
  3. package/cpp/common.cpp +43 -26
  4. package/cpp/common.h +18 -11
  5. package/cpp/ggml-backend-reg.cpp +5 -0
  6. package/cpp/ggml-backend.cpp +5 -2
  7. package/cpp/ggml-cpp.h +1 -0
  8. package/cpp/ggml-cpu-aarch64.cpp +6 -1
  9. package/cpp/ggml-cpu-quants.c +5 -1
  10. package/cpp/ggml-impl.h +11 -16
  11. package/cpp/ggml-metal.m +2 -2
  12. package/cpp/ggml.c +0 -1276
  13. package/cpp/ggml.h +0 -140
  14. package/cpp/gguf.cpp +1325 -0
  15. package/cpp/gguf.h +202 -0
  16. package/cpp/llama-adapter.cpp +346 -0
  17. package/cpp/llama-adapter.h +73 -0
  18. package/cpp/llama-arch.cpp +1434 -0
  19. package/cpp/llama-arch.h +395 -0
  20. package/cpp/llama-batch.cpp +368 -0
  21. package/cpp/llama-batch.h +88 -0
  22. package/cpp/llama-chat.cpp +567 -0
  23. package/cpp/llama-chat.h +51 -0
  24. package/cpp/llama-context.cpp +1771 -0
  25. package/cpp/llama-context.h +128 -0
  26. package/cpp/llama-cparams.cpp +1 -0
  27. package/cpp/llama-cparams.h +37 -0
  28. package/cpp/llama-cpp.h +30 -0
  29. package/cpp/llama-grammar.cpp +1 -0
  30. package/cpp/llama-grammar.h +3 -1
  31. package/cpp/llama-hparams.cpp +71 -0
  32. package/cpp/llama-hparams.h +140 -0
  33. package/cpp/llama-impl.cpp +167 -0
  34. package/cpp/llama-impl.h +16 -136
  35. package/cpp/llama-kv-cache.cpp +718 -0
  36. package/cpp/llama-kv-cache.h +218 -0
  37. package/cpp/llama-mmap.cpp +589 -0
  38. package/cpp/llama-mmap.h +67 -0
  39. package/cpp/llama-model-loader.cpp +1011 -0
  40. package/cpp/llama-model-loader.h +158 -0
  41. package/cpp/llama-model.cpp +2202 -0
  42. package/cpp/llama-model.h +391 -0
  43. package/cpp/llama-sampling.cpp +117 -4
  44. package/cpp/llama-vocab.cpp +21 -28
  45. package/cpp/llama-vocab.h +13 -1
  46. package/cpp/llama.cpp +8437 -19421
  47. package/cpp/llama.cpp.rej +23 -0
  48. package/cpp/llama.h +31 -6
  49. package/cpp/rn-llama.hpp +39 -37
  50. package/cpp/sgemm.cpp +776 -70
  51. package/cpp/unicode.cpp +6 -0
  52. package/package.json +1 -1
package/cpp/llama-context.h
@@ -0,0 +1,128 @@
+ #pragma once
+
+ #include "llama.h"
+ #include "llama-batch.h"
+ #include "llama-cparams.h"
+ #include "llama-model.h"
+ #include "llama-kv-cache.h"
+ #include "llama-adapter.h"
+
+ #include "ggml-cpp.h"
+
+ #include <map>
+ #include <unordered_map>
+ #include <vector>
+ #include <set>
+
+ struct llama_context {
+     llama_context(const llama_model & model)
+         : model(model)
+         , t_start_us(model.t_start_us)
+         , t_load_us(model.t_load_us) {}
+
+     const struct llama_model & model;
+
+     struct llama_cparams cparams;
+     struct llama_sbatch sbatch; // TODO: revisit if needed
+     struct llama_kv_cache kv_self;
+     struct llama_control_vector cvec;
+
+     std::unordered_map<struct llama_lora_adapter *, float> lora_adapters;
+
+     std::vector<lm_ggml_backend_ptr> backends;
+     std::vector<std::pair<lm_ggml_backend_t, lm_ggml_backend_set_n_threads_t>> set_n_threads_fns;
+
+     lm_ggml_backend_t backend_cpu = nullptr;
+
+     lm_ggml_threadpool_t threadpool = nullptr;
+     lm_ggml_threadpool_t threadpool_batch = nullptr;
+
+     bool has_evaluated_once = false;
+
+     mutable int64_t t_start_us;
+     mutable int64_t t_load_us;
+     mutable int64_t t_p_eval_us = 0;
+     mutable int64_t t_eval_us = 0;
+
+     mutable int64_t t_compute_start_us = 0;
+     mutable int64_t n_queued_tokens = 0;
+
+     mutable int32_t n_p_eval = 0; // number of tokens in eval calls for the prompt (with batch size > 1)
+     mutable int32_t n_eval = 0; // number of eval calls
+
+     // host buffer for the model output (logits and embeddings)
+     lm_ggml_backend_buffer_ptr buf_output;
+
+     // decode output (2-dimensional array: [n_outputs][n_vocab])
+     size_t logits_size = 0; // capacity (of floats) for logits
+     float * logits = nullptr;
+
+     std::vector<int32_t> output_ids; // map batch token positions to ids of the logits and embd buffers
+     size_t output_size = 0; // capacity (of token positions) for the output buffers
+     int32_t n_outputs = 0; // number of actually-used outputs in the current ubatch or last logical batch
+
+     bool logits_all = false;
+
+     // embeddings output (2-dimensional array: [n_outputs][n_embd])
+     // populated only when pooling_type == LLAMA_POOLING_TYPE_NONE
+     size_t embd_size = 0; // capacity (of floats) for embeddings
+     float * embd = nullptr;
+
+     // sequence embeddings output (map of [n_embd] vectors)
+     // populated only when pooling_type != LLAMA_POOLING_TYPE_NONE
+     std::map<llama_seq_id, std::vector<float>> embd_seq;
+
+     // whether we are computing encoder output or decoder output
+     bool is_encoding = false;
+
+     // TODO: find a better way to accommodate multi-dimensional position encoding methods
+     // number of position ids each token gets; 1 for each token in most cases.
+     // when using m-rope, there are 3 position ids per token, representing a 3-dimensional coordinate.
+     int n_pos_per_token = 1;
+
+     // output of the encoder part of the encoder-decoder models
+     std::vector<float> embd_enc;
+     std::vector<std::set<llama_seq_id>> seq_ids_enc;
+
+     // memory buffers used to evaluate the model
+     std::vector<uint8_t> buf_compute_meta;
+     lm_ggml_backend_sched_ptr sched;
+
+     lm_ggml_abort_callback abort_callback = nullptr;
+     void * abort_callback_data = nullptr;
+
+     // input tensors
+     struct lm_ggml_tensor * inp_tokens;        // I32 [n_batch]
+     struct lm_ggml_tensor * inp_embd;          // F32 [n_embd, n_batch]
+     struct lm_ggml_tensor * inp_pos;           // I32 [n_batch]
+     struct lm_ggml_tensor * inp_out_ids;       // I32 [n_outputs]
+     struct lm_ggml_tensor * inp_KQ_mask;       // F32 [kv_size, n_batch]
+     struct lm_ggml_tensor * inp_KQ_mask_swa;   // F32 [kv_size, n_batch]
+     struct lm_ggml_tensor * inp_K_shift;       // I32 [kv_size]
+     struct lm_ggml_tensor * inp_mean;          // F32 [n_batch, n_batch]
+     struct lm_ggml_tensor * inp_cls;           // I32 [n_batch]
+     struct lm_ggml_tensor * inp_s_copy;        // I32 [kv_size]
+     struct lm_ggml_tensor * inp_s_mask;        // F32 [1, n_kv]
+     struct lm_ggml_tensor * inp_s_seq;         // I32 [n_kv, n_batch]
+     struct lm_ggml_tensor * inp_pos_bucket;    // I32 [n_batch|n_kv, n_batch]
+     struct lm_ggml_tensor * inp_embd_enc;      // F32 [n_embd, n_outputs_enc]
+     struct lm_ggml_tensor * inp_KQ_mask_cross; // F32 [n_outputs_enc, n_batch]
+ };
+
+ // TODO: make these methods of llama_context
+ void llama_set_k_shift(struct llama_context & lctx);
+
+ void llama_set_s_copy(struct llama_context & lctx);
+
+ void llama_set_inputs(llama_context & lctx, const llama_ubatch & ubatch);
+
+ // Make sure enough space is available for outputs.
+ // Returns max number of outputs for which space was reserved.
+ size_t llama_output_reserve(struct llama_context & lctx, size_t n_outputs);
+
+ // make the outputs have the same order they had in the user-provided batch
+ void llama_output_reorder(struct llama_context & ctx);
+
+ // For internal test use
+ // TODO: remove
+ const std::vector<std::pair<std::string, struct lm_ggml_tensor *>> & llama_internal_get_tensor_map(struct llama_context * ctx);
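A note on the output bookkeeping above: only tokens whose batch entry requested logits get a row in buf_output, and output_ids maps batch positions to those rows; on the caller side this is the bookkeeping behind the public llama_get_logits_ith() accessor. A minimal, hypothetical sketch (assumes a batch was already decoded successfully and that logits were requested for its last token):

    // Hypothetical sketch: fetch the logits row stored for the last token of a
    // decoded batch. Only tokens with batch.logits[i] != 0 occupy a row in the
    // output buffer described by llama_context above.
    #include "llama.h"

    const float * last_token_logits(llama_context * ctx, const llama_batch & batch) {
        // assumes llama_decode(ctx, batch) returned 0 and the last token asked for logits
        return llama_get_logits_ith(ctx, batch.n_tokens - 1);
    }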
package/cpp/llama-cparams.cpp
@@ -0,0 +1 @@
+ #include "llama-cparams.h"
package/cpp/llama-cparams.h
@@ -0,0 +1,37 @@
+ #pragma once
+
+ #include "llama.h"
+
+ #include <cstdint>
+
+ struct llama_cparams {
+     uint32_t n_ctx; // context size used during inference
+     uint32_t n_batch;
+     uint32_t n_ubatch;
+     uint32_t n_seq_max;
+     int n_threads; // number of threads to use for generation
+     int n_threads_batch; // number of threads to use for batch processing
+
+     float rope_freq_base;
+     float rope_freq_scale;
+
+     uint32_t n_ctx_orig_yarn;
+     // These hyperparameters are not exposed in GGUF, because all
+     // existing YaRN models use the same values for them.
+     float yarn_ext_factor;
+     float yarn_attn_factor;
+     float yarn_beta_fast;
+     float yarn_beta_slow;
+     float defrag_thold;
+
+     bool embeddings;
+     bool causal_attn;
+     bool offload_kqv;
+     bool flash_attn;
+     bool no_perf;
+
+     enum llama_pooling_type pooling_type;
+
+     lm_ggml_backend_sched_eval_callback cb_eval;
+     void * cb_eval_user_data;
+ };
package/cpp/llama-cpp.h
@@ -0,0 +1,30 @@
+ #pragma once
+
+ #ifndef __cplusplus
+ #error "This header is for C++ only"
+ #endif
+
+ #include <memory>
+
+ #include "llama.h"
+
+ struct llama_model_deleter {
+     void operator()(llama_model * model) { llama_model_free(model); }
+ };
+
+ struct llama_context_deleter {
+     void operator()(llama_context * context) { llama_free(context); }
+ };
+
+ struct llama_sampler_deleter {
+     void operator()(llama_sampler * sampler) { llama_sampler_free(sampler); }
+ };
+
+ struct llama_lora_adapter_deleter {
+     void operator()(llama_lora_adapter * lora_adapter) { llama_lora_adapter_free(lora_adapter); }
+ };
+
+ typedef std::unique_ptr<llama_model, llama_model_deleter> llama_model_ptr;
+ typedef std::unique_ptr<llama_context, llama_context_deleter> llama_context_ptr;
+ typedef std::unique_ptr<llama_sampler, llama_sampler_deleter> llama_sampler_ptr;
+ typedef std::unique_ptr<llama_lora_adapter, llama_lora_adapter_deleter> llama_lora_adapter_ptr;
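llama-cpp.h adds nothing beyond RAII wrappers for the existing C handles: each unique_ptr alias calls the matching *_free function from llama.h. A small, hypothetical usage sketch (the raw pointers are assumed to come from the usual llama.h loaders, which are not shown):

    // Hypothetical sketch: holding raw llama.h handles in the RAII typedefs
    // defined above so the matching *_free call runs automatically.
    #include "llama-cpp.h"

    void hold_handles(llama_model * model_raw, llama_context * ctx_raw) {
        llama_model_ptr   model(model_raw); // llama_model_free() on scope exit
        llama_context_ptr ctx(ctx_raw);     // llama_free() on scope exit

        // use model.get() / ctx.get() with the rest of the C API here;
        // call ctx.release() if ownership must be handed back to a caller
    }   // deleters run here: ctx first, then model (reverse declaration order)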
package/cpp/llama-grammar.cpp
@@ -1,5 +1,6 @@
  #include "llama-grammar.h"

+ #include "llama-impl.h"
  #include "llama-vocab.h"
  #include "llama-sampling.h"

package/cpp/llama-grammar.h
@@ -1,8 +1,10 @@
  #pragma once

- #include "llama-impl.h"
+ #include "llama.h"

  #include <map>
+ #include <string>
+ #include <vector>

  struct llama_vocab;

package/cpp/llama-hparams.cpp
@@ -0,0 +1,71 @@
+ #include "llama-hparams.h"
+
+ #include "ggml.h"
+
+ uint32_t llama_hparams::n_head(uint32_t il) const {
+     if (il < n_layer) {
+         return n_head_arr[il];
+     }
+
+     LM_GGML_ABORT("fatal error");
+ }
+
+ uint32_t llama_hparams::n_head_kv(uint32_t il) const {
+     if (il < n_layer) {
+         return n_head_kv_arr[il];
+     }
+
+     LM_GGML_ABORT("fatal error");
+ }
+
+ uint32_t llama_hparams::n_ff(uint32_t il) const {
+     if (il < n_layer) {
+         return n_ff_arr[il];
+     }
+
+     LM_GGML_ABORT("fatal error");
+ }
+
+ uint32_t llama_hparams::n_gqa(uint32_t il) const {
+     const uint32_t n_head = this->n_head(il);
+     const uint32_t n_head_kv = this->n_head_kv(il);
+
+     if (n_head_kv == 0) {
+         return 0;
+     }
+
+     return n_head/n_head_kv;
+ }
+
+ uint32_t llama_hparams::n_embd_k_gqa(uint32_t il) const {
+     const uint32_t n_head_kv = this->n_head_kv(il);
+
+     return n_embd_head_k * n_head_kv;
+ }
+
+ uint32_t llama_hparams::n_embd_v_gqa(uint32_t il) const {
+     const uint32_t n_head_kv = this->n_head_kv(il);
+
+     return n_embd_head_v * n_head_kv;
+ }
+
+ uint32_t llama_hparams::n_embd_k_s() const {
+     if (wkv_head_size != 0) {
+         // for RWKV models
+         return 2 * n_embd;
+     }
+
+     // TODO: maybe support other convolution strides than 1
+     // NOTE: since the first column of the conv_state is shifted out each time, it's not actually needed
+     return (ssm_d_conv > 0 ? ssm_d_conv - 1 : 0) * ssm_d_inner;
+ }
+
+ uint32_t llama_hparams::n_embd_v_s() const {
+     if (wkv_head_size != 0) {
+         // corresponds to RWKV's wkv_states size
+         return n_embd * wkv_head_size;
+     }
+
+     // corresponds to Mamba's ssm_states size
+     return ssm_d_state * ssm_d_inner;
+ }
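The per-layer accessors above are simple arithmetic over the hparams arrays: n_gqa() is the number of query heads sharing each KV head, and n_embd_k_gqa() / n_embd_v_gqa() give the per-token K/V width for a layer. A worked example with made-up values (not taken from any particular model):

    // Hypothetical sketch: how the GQA accessors above combine, using invented numbers.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const uint32_t n_head        = 32;  // query heads on this layer
        const uint32_t n_head_kv     = 8;   // key/value heads on this layer
        const uint32_t n_embd_head_k = 128; // per-head key dimension

        const uint32_t n_gqa        = n_head / n_head_kv;        // 4 query heads share each KV head
        const uint32_t n_embd_k_gqa = n_embd_head_k * n_head_kv; // 1024 floats of K per token per layer

        std::printf("n_gqa = %u, n_embd_k_gqa = %u\n", n_gqa, n_embd_k_gqa);
        return 0;
    }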
package/cpp/llama-hparams.h
@@ -0,0 +1,140 @@
+ #pragma once
+
+ #include "llama.h"
+
+ #include <array>
+
+ // bump if necessary
+ #define LLAMA_MAX_LAYERS 512
+ #define LLAMA_MAX_EXPERTS 256 // DeepSeekV3
+
+ enum llama_expert_gating_func_type {
+     LLAMA_EXPERT_GATING_FUNC_TYPE_NONE = 0,
+     LLAMA_EXPERT_GATING_FUNC_TYPE_SOFTMAX = 1,
+     LLAMA_EXPERT_GATING_FUNC_TYPE_SIGMOID = 2,
+ };
+
+ struct llama_hparams_posnet {
+     uint32_t n_embd;
+     uint32_t n_layer;
+ };
+
+ struct llama_hparams_convnext {
+     uint32_t n_embd;
+     uint32_t n_layer;
+ };
+
+ struct llama_hparams {
+     bool vocab_only;
+     bool rope_finetuned;
+     bool use_par_res;
+     bool swin_norm;
+
+     uint32_t n_vocab = 0;
+     uint32_t n_ctx_train; // context size the model was trained on
+     uint32_t n_embd;
+     uint32_t n_embd_features = 0;
+     uint32_t n_layer;
+     uint32_t n_rot;
+     uint32_t n_swa = 0; // sliding window attention (SWA)
+     uint32_t n_embd_head_k; // dimension of keys (d_k). d_q is assumed to be the same, but there are n_head q heads, and only n_head_kv k-v heads
+     uint32_t n_embd_head_v; // dimension of values (d_v) aka n_embd_head
+     uint32_t n_expert = 0;
+     uint32_t n_expert_used = 0;
+     uint32_t n_vocab_type = 0; // for BERT-style token types
+     uint32_t n_rel_attn_bkts = 0;
+
+     // for WavTokenizer
+     struct llama_hparams_posnet posnet;
+     struct llama_hparams_convnext convnext;
+
+     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_arr;
+     std::array<uint32_t, LLAMA_MAX_LAYERS> n_head_kv_arr;
+     std::array<uint32_t, LLAMA_MAX_LAYERS> n_ff_arr;
+
+     uint32_t n_layer_dense_lead = 0;
+     uint32_t n_lora_q = 0;
+     uint32_t n_lora_kv = 0;
+     uint32_t n_ff_exp = 0;
+     uint32_t n_ff_shexp = 0;
+     uint32_t n_expert_shared = 0;
+     uint32_t n_norm_groups = 0;
+
+     float expert_weights_scale = 0.0;
+     bool expert_weights_norm = false;
+     uint32_t expert_gating_func = LLAMA_EXPERT_GATING_FUNC_TYPE_NONE;
+
+     float f_norm_eps;
+     float f_norm_rms_eps;
+     float f_norm_group_eps;
+
+     float f_attn_logit_softcapping = 50.0f;
+     float f_final_logit_softcapping = 30.0f;
+
+     // for RWKV
+     uint32_t rescale_every_n_layers = 0;
+     uint32_t time_mix_extra_dim = 0;
+     uint32_t time_decay_extra_dim = 0;
+     uint32_t wkv_head_size = 0;
+
+     float rope_attn_factor = 1.0f;
+     float rope_freq_base_train;
+     float rope_freq_scale_train;
+     uint32_t n_ctx_orig_yarn;
+     float rope_yarn_log_mul;
+
+     std::array<int, 4> rope_sections;
+
+     // for State Space Models
+     uint32_t ssm_d_conv = 0;
+     uint32_t ssm_d_inner = 0;
+     uint32_t ssm_d_state = 0;
+     uint32_t ssm_dt_rank = 0;
+
+     bool ssm_dt_b_c_rms = false;
+
+     float f_clamp_kqv = 0.0f;
+     float f_max_alibi_bias = 0.0f;
+     float f_logit_scale = 0.0f;
+
+     // Additional scale factors (Granite/Granite MoE)
+     float f_residual_scale = 0.0f;
+     float f_embedding_scale = 0.0f;
+     float f_attention_scale = 0.0f;
+
+     bool causal_attn = true;
+     bool use_alibi = false;
+     bool attn_soft_cap = false;
+
+     // needed by encoder-decoder models (e.g. T5, FLAN-T5)
+     // ref: https://github.com/ggerganov/llama.cpp/pull/8141
+     llama_token dec_start_token_id = LLAMA_TOKEN_NULL;
+
+     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_NONE;
+     enum llama_rope_type rope_type = LLAMA_ROPE_TYPE_NONE;
+     enum llama_rope_scaling_type rope_scaling_type_train = LLAMA_ROPE_SCALING_TYPE_NONE;
+
+     uint32_t n_head(uint32_t il = 0) const;
+
+     uint32_t n_head_kv(uint32_t il = 0) const;
+
+     uint32_t n_ff(uint32_t il = 0) const;
+
+     uint32_t n_gqa(uint32_t il = 0) const;
+
+     // dimension of key embeddings across all k-v heads
+     uint32_t n_embd_k_gqa(uint32_t il = 0) const;
+
+     // dimension of value embeddings across all k-v heads
+     uint32_t n_embd_v_gqa(uint32_t il = 0) const;
+
+     // dimension of the rolling state embeddings
+     // corresponds to Mamba's conv_states size or RWKV's token_shift states size
+     uint32_t n_embd_k_s() const;
+
+     // dimension of the recurrent state embeddings
+     uint32_t n_embd_v_s() const;
+ };
+
+ static_assert(std::is_trivially_copyable<llama_hparams>::value, "llama_hparams must be trivially copyable");
+
package/cpp/llama-impl.cpp
@@ -0,0 +1,167 @@
+ #include "llama-impl.h"
+
+ #include "gguf.h"
+ #include "llama.h"
+
+ #include <cinttypes>
+ #include <climits>
+ #include <cstdarg>
+ #include <cstring>
+ #include <vector>
+ #include <sstream>
+
+ struct llama_logger_state {
+     lm_ggml_log_callback log_callback = llama_log_callback_default;
+     void * log_callback_user_data = nullptr;
+ };
+
+ static llama_logger_state g_logger_state;
+
+ time_meas::time_meas(int64_t & t_acc, bool disable) : t_start_us(disable ? -1 : lm_ggml_time_us()), t_acc(t_acc) {}
+
+ time_meas::~time_meas() {
+     if (t_start_us >= 0) {
+         t_acc += lm_ggml_time_us() - t_start_us;
+     }
+ }
+
+ void llama_log_set(lm_ggml_log_callback log_callback, void * user_data) {
+     lm_ggml_log_set(log_callback, user_data);
+     g_logger_state.log_callback = log_callback ? log_callback : llama_log_callback_default;
+     g_logger_state.log_callback_user_data = user_data;
+ }
+
+ static void llama_log_internal_v(lm_ggml_log_level level, const char * format, va_list args) {
+     va_list args_copy;
+     va_copy(args_copy, args);
+     char buffer[128];
+     int len = vsnprintf(buffer, 128, format, args);
+     if (len < 128) {
+         g_logger_state.log_callback(level, buffer, g_logger_state.log_callback_user_data);
+     } else {
+         char * buffer2 = new char[len + 1];
+         vsnprintf(buffer2, len + 1, format, args_copy);
+         buffer2[len] = 0;
+         g_logger_state.log_callback(level, buffer2, g_logger_state.log_callback_user_data);
+         delete[] buffer2;
+     }
+     va_end(args_copy);
+ }
+
+ void llama_log_internal(lm_ggml_log_level level, const char * format, ...) {
+     va_list args;
+     va_start(args, format);
+     llama_log_internal_v(level, format, args);
+     va_end(args);
+ }
+
+ void llama_log_callback_default(lm_ggml_log_level level, const char * text, void * user_data) {
+     (void) level;
+     (void) user_data;
+     fputs(text, stderr);
+     fflush(stderr);
+ }
+
+ void replace_all(std::string & s, const std::string & search, const std::string & replace) {
+     if (search.empty()) {
+         return;
+     }
+     std::string builder;
+     builder.reserve(s.length());
+     size_t pos = 0;
+     size_t last_pos = 0;
+     while ((pos = s.find(search, last_pos)) != std::string::npos) {
+         builder.append(s, last_pos, pos - last_pos);
+         builder.append(replace);
+         last_pos = pos + search.length();
+     }
+     builder.append(s, last_pos, std::string::npos);
+     s = std::move(builder);
+ }
+
+ std::string format(const char * fmt, ...) {
+     va_list ap;
+     va_list ap2;
+     va_start(ap, fmt);
+     va_copy(ap2, ap);
+     int size = vsnprintf(NULL, 0, fmt, ap);
+     LM_GGML_ASSERT(size >= 0 && size < INT_MAX); // NOLINT
+     std::vector<char> buf(size + 1);
+     int size2 = vsnprintf(buf.data(), size + 1, fmt, ap2);
+     LM_GGML_ASSERT(size2 == size);
+     va_end(ap2);
+     va_end(ap);
+     return std::string(buf.data(), size);
+ }
+
+ std::string llama_format_tensor_shape(const std::vector<int64_t> & ne) {
+     char buf[256];
+     snprintf(buf, sizeof(buf), "%5" PRId64, ne.at(0));
+     for (size_t i = 1; i < ne.size(); i++) {
+         snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, ne.at(i));
+     }
+     return buf;
+ }
+
+ std::string llama_format_tensor_shape(const struct lm_ggml_tensor * t) {
+     char buf[256];
+     snprintf(buf, sizeof(buf), "%5" PRId64, t->ne[0]);
+     for (int i = 1; i < LM_GGML_MAX_DIMS; i++) {
+         snprintf(buf + strlen(buf), sizeof(buf) - strlen(buf), ", %5" PRId64, t->ne[i]);
+     }
+     return buf;
+ }
+
+ static std::string lm_gguf_data_to_str(enum lm_gguf_type type, const void * data, int i) {
+     switch (type) {
+         case LM_GGUF_TYPE_UINT8: return std::to_string(((const uint8_t *)data)[i]);
+         case LM_GGUF_TYPE_INT8: return std::to_string(((const int8_t *)data)[i]);
+         case LM_GGUF_TYPE_UINT16: return std::to_string(((const uint16_t *)data)[i]);
+         case LM_GGUF_TYPE_INT16: return std::to_string(((const int16_t *)data)[i]);
+         case LM_GGUF_TYPE_UINT32: return std::to_string(((const uint32_t *)data)[i]);
+         case LM_GGUF_TYPE_INT32: return std::to_string(((const int32_t *)data)[i]);
+         case LM_GGUF_TYPE_UINT64: return std::to_string(((const uint64_t *)data)[i]);
+         case LM_GGUF_TYPE_INT64: return std::to_string(((const int64_t *)data)[i]);
+         case LM_GGUF_TYPE_FLOAT32: return std::to_string(((const float *)data)[i]);
+         case LM_GGUF_TYPE_FLOAT64: return std::to_string(((const double *)data)[i]);
+         case LM_GGUF_TYPE_BOOL: return ((const bool *)data)[i] ? "true" : "false";
+         default: return format("unknown type %d", type);
+     }
+ }
+
+ std::string lm_gguf_kv_to_str(const struct lm_gguf_context * ctx_gguf, int i) {
+     const enum lm_gguf_type type = lm_gguf_get_kv_type(ctx_gguf, i);
+
+     switch (type) {
+         case LM_GGUF_TYPE_STRING:
+             return lm_gguf_get_val_str(ctx_gguf, i);
+         case LM_GGUF_TYPE_ARRAY:
+             {
+                 const enum lm_gguf_type arr_type = lm_gguf_get_arr_type(ctx_gguf, i);
+                 int arr_n = lm_gguf_get_arr_n(ctx_gguf, i);
+                 const void * data = arr_type == LM_GGUF_TYPE_STRING ? nullptr : lm_gguf_get_arr_data(ctx_gguf, i);
+                 std::stringstream ss;
+                 ss << "[";
+                 for (int j = 0; j < arr_n; j++) {
+                     if (arr_type == LM_GGUF_TYPE_STRING) {
+                         std::string val = lm_gguf_get_arr_str(ctx_gguf, i, j);
+                         // escape quotes
+                         replace_all(val, "\\", "\\\\");
+                         replace_all(val, "\"", "\\\"");
+                         ss << '"' << val << '"';
+                     } else if (arr_type == LM_GGUF_TYPE_ARRAY) {
+                         ss << "???";
+                     } else {
+                         ss << lm_gguf_data_to_str(arr_type, data, j);
+                     }
+                     if (j < arr_n - 1) {
+                         ss << ", ";
+                     }
+                 }
+                 ss << "]";
+                 return ss.str();
+             }
+         default:
+             return lm_gguf_data_to_str(type, lm_gguf_get_val_data(ctx_gguf, i), 0);
+     }
+ }
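The logging half of llama-impl.cpp is the main user-facing piece here: llama_log_set() installs a callback with the same shape as llama_log_callback_default(), and a null callback falls back to the stderr default. A hypothetical sketch of routing log output to a caller-owned FILE *:

    // Hypothetical sketch: send llama.cpp log text to a custom sink via llama_log_set().
    #include <cstdio>
    #include "llama.h"

    static void log_to_file(lm_ggml_log_level level, const char * text, void * user_data) {
        (void) level; // could be used to filter by severity
        std::fputs(text, static_cast<std::FILE *>(user_data));
        std::fflush(static_cast<std::FILE *>(user_data));
    }

    void install_logger(std::FILE * sink) {
        llama_log_set(log_to_file, sink); // llama_log_set(nullptr, ...) restores the default stderr logger
    }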