cui-llama.rn 1.2.2 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/common.h CHANGED
@@ -24,12 +24,12 @@

  #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

- struct llama_lora_adapter_info {
+ struct common_lora_adapter_info {
  std::string path;
  float scale;
  };

- struct llama_lora_adapter_container : llama_lora_adapter_info {
+ struct common_lora_adapter_container : common_lora_adapter_info {
  struct llama_lora_adapter * adapter;
  };

@@ -39,7 +39,7 @@ extern char const * LLAMA_COMMIT;
  extern char const * LLAMA_COMPILER;
  extern char const * LLAMA_BUILD_TARGET;

- struct llama_control_vector_load_info;
+ struct common_control_vector_load_info;

  #define print_build_info() do { \
  fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
@@ -93,15 +93,16 @@ enum llama_example {
  LLAMA_EXAMPLE_COUNT,
  };

- enum gpt_sampler_type {
- GPT_SAMPLER_TYPE_NONE = 0,
- GPT_SAMPLER_TYPE_TOP_K = 1,
- GPT_SAMPLER_TYPE_TOP_P = 2,
- GPT_SAMPLER_TYPE_MIN_P = 3,
- GPT_SAMPLER_TYPE_TFS_Z = 4,
- GPT_SAMPLER_TYPE_TYPICAL_P = 5,
- GPT_SAMPLER_TYPE_TEMPERATURE = 6,
- GPT_SAMPLER_TYPE_XTC = 7,
+ enum common_sampler_type {
+ COMMON_SAMPLER_TYPE_NONE = 0,
+ COMMON_SAMPLER_TYPE_TOP_K = 1,
+ COMMON_SAMPLER_TYPE_TOP_P = 2,
+ COMMON_SAMPLER_TYPE_MIN_P = 3,
+ COMMON_SAMPLER_TYPE_TFS_Z = 4,
+ COMMON_SAMPLER_TYPE_TYPICAL_P = 5,
+ COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
+ COMMON_SAMPLER_TYPE_XTC = 7,
+ COMMON_SAMPLER_TYPE_INFILL = 8,
  };

  // dimensionality reduction methods, used by cvector-generator
@@ -111,7 +112,7 @@ enum dimre_method {
  };

  // sampler parameters
- struct gpt_sampler_params {
+ struct common_sampler_params {
  uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

  int32_t n_prev = 64; // number of previous tokens to remember
@@ -120,6 +121,8 @@ struct gpt_sampler_params {
  int32_t top_k = 40; // <= 0 to use vocab size
  float top_p = 0.95f; // 1.0 = disabled
  float min_p = 0.05f; // 0.0 = disabled
+ float xtc_probability = 0.00f; // 0.0 = disabled
+ float xtc_threshold = 0.10f; // > 0.5 disables XTC
  float tfs_z = 1.00f; // 1.0 = disabled
  float xtc_t = 0.0f; // 0.0 = disabled
  float xtc_p = 0.0f;
@@ -138,14 +141,15 @@ struct gpt_sampler_params {
  bool ignore_eos = false;
  bool no_perf = false; // disable performance metrics

- std::vector<enum gpt_sampler_type> samplers = {
- GPT_SAMPLER_TYPE_TOP_K,
- GPT_SAMPLER_TYPE_TFS_Z,
- GPT_SAMPLER_TYPE_TYPICAL_P,
- GPT_SAMPLER_TYPE_TOP_P,
- GPT_SAMPLER_TYPE_MIN_P,
- GPT_SAMPLER_TYPE_TEMPERATURE,
- GPT_SAMPLER_TYPE_XTC
+
+ std::vector<enum common_sampler_type> samplers = {
+ COMMON_SAMPLER_TYPE_TOP_K,
+ COMMON_SAMPLER_TYPE_TFS_Z,
+ COMMON_SAMPLER_TYPE_TYPICAL_P,
+ COMMON_SAMPLER_TYPE_TOP_P,
+ COMMON_SAMPLER_TYPE_MIN_P,
+ COMMON_SAMPLER_TYPE_XTC,
+ COMMON_SAMPLER_TYPE_TEMPERATURE,
  };

  std::string grammar; // optional BNF-like grammar to constrain sampling
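
For callers migrating off gpt_sampler_params, the renamed struct keeps its layout and gains the two XTC knobs added above. A minimal configuration sketch, assuming "common.h" from this package's cpp/ sources is on the include path (the values and sampler order are illustrative, not recommended defaults):

    #include "common.h"  // assumed include path for this package's cpp/ sources

    // Field names match the common_sampler_params hunk above.
    static common_sampler_params make_sampler_params() {
        common_sampler_params sparams;
        sparams.top_k           = 40;
        sparams.top_p           = 0.95f;
        sparams.min_p           = 0.05f;
        sparams.xtc_probability = 0.50f;  // 0.0 disables XTC entirely
        sparams.xtc_threshold   = 0.10f;  // values > 0.5 also disable XTC
        // Override the sampler chain; the new default runs XTC before temperature.
        sparams.samplers = {
            COMMON_SAMPLER_TYPE_TOP_K,
            COMMON_SAMPLER_TYPE_MIN_P,
            COMMON_SAMPLER_TYPE_XTC,
            COMMON_SAMPLER_TYPE_TEMPERATURE,
        };
        return sparams;
    }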
@@ -156,7 +160,7 @@ struct gpt_sampler_params {
  std::string print() const;
  };

- struct gpt_params {
+ struct common_params {

  void * progress_callback_user_data = nullptr;
  llama_progress_callback progress_callback = nullptr;
@@ -202,7 +206,7 @@ struct gpt_params {
  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
  enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

- struct gpt_sampler_params sparams;
+ struct common_sampler_params sparams;

  std::string model = ""; // model path // NOLINT
  std::string model_draft = ""; // draft model for speculative decoding // NOLINT
@@ -227,9 +231,9 @@ struct gpt_params {
  std::vector<llama_model_kv_override> kv_overrides;

  bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
- std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+ std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale

- std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+ std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale

  int32_t verbosity = 0;
  int32_t control_vector_layer_start = -1; // layer range for control vector
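
The common_params members above only change their element types, so caller code needs nothing beyond the llama_ to common_ rename. A hedged sketch of populating them (the file paths are placeholders):

    #include "common.h"  // assumed include path

    // Fill the renamed vectors on common_params; member names come from the hunk above.
    static void add_adapters(common_params & params) {
        common_lora_adapter_info lora;
        lora.path  = "adapters/style.gguf";   // placeholder path
        lora.scale = 0.8f;
        params.lora_adapters.push_back(lora);

        common_control_vector_load_info cvec;
        cvec.strength = 0.5f;
        cvec.fname    = "vectors/calm.gguf";  // placeholder path
        params.control_vectors.push_back(cvec);
    }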
@@ -296,12 +300,12 @@ struct gpt_params {
  int32_t port = 8080; // server listens on this network port
  int32_t timeout_read = 600; // http read timeout in seconds
  int32_t timeout_write = timeout_read; // http write timeout in seconds
- int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+ int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+ int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting

  std::string hostname = "127.0.0.1";
  std::string public_path = ""; // NOLINT
  std::string chat_template = ""; // NOLINT
- std::string system_prompt = ""; // NOLINT
  bool enable_chat_template = true;

  std::vector<std::string> api_keys;
@@ -367,19 +371,32 @@ struct gpt_params {

  // call once at the start of a program if it uses libcommon
  // initializes the logging system and prints info about the build
- void gpt_init();
+ void common_init();

- std::string gpt_params_get_system_info(const gpt_params & params);
+ std::string common_params_get_system_info(const common_params & params);

- bool parse_cpu_range(const std::string& range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
- bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
- void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+ bool parse_cpu_range(const std::string & range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
+ bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
+ void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
  bool set_process_priority(enum lm_ggml_sched_priority prio);

  //
  // String utils
  //

+ #ifdef __GNUC__
+ #ifdef __MINGW32__
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+ #else
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+ #endif
+ #else
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+ #endif
+
+ LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+ std::string string_format(const char * fmt, ...);
+
  std::vector<std::string> string_split(std::string input, char separator);

  std::string string_strip(const std::string & str);
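
The new LLAMA_COMMON_ATTRIBUTE_FORMAT macro lets GCC/Clang type-check string_format arguments the same way they check printf. A small usage sketch (the message text is illustrative):

    #include "common.h"  // assumed include path
    #include <string>

    // string_format is declared above with LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2),
    // so mismatched format arguments produce a compiler warning.
    static std::string describe_buffer(const char * name, size_t size_bytes) {
        return string_format("%s: %zu bytes (%.2f MiB)",
                             name, size_bytes, size_bytes / 1024.0 / 1024.0);
    }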
@@ -423,29 +440,29 @@ std::string fs_get_cache_file(const std::string & filename);
  // Model utils
  //

- struct llama_init_result {
+ struct common_init_result {
  struct llama_model * model = nullptr;
  struct llama_context * context = nullptr;
- std::vector<llama_lora_adapter_container> lora_adapters;
+ std::vector<common_lora_adapter_container> lora_adapters;
  };

- struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
+ struct common_init_result common_init_from_params(common_params & params);

- struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
- struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
+ struct llama_model_params common_model_params_to_llama (const common_params & params);
+ struct llama_context_params common_context_params_to_llama(const common_params & params);
  struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);

- struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
- struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+ struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+ struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

  // clear LoRA adapters from context, then apply new list of adapters
- void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);

  // Batch utils

- void llama_batch_clear(struct llama_batch & batch);
+ void common_batch_clear(struct llama_batch & batch);

- void llama_batch_add(
+ void common_batch_add(
  struct llama_batch & batch,
  llama_token id,
  llama_pos pos,
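
The model and batch helpers keep their old parameter lists under the new common_ names. A hedged load-and-decode sketch: the model path reuses the DEFAULT_MODEL_PATH placeholder, and the trailing common_batch_add arguments (sequence ids, logits flag) follow the upstream llama.cpp signature, which this hunk truncates; model/context cleanup is omitted for brevity.

    #include "common.h"  // assumed include path
    #include "llama.h"

    // Load a model/context from common_params, then queue one token for decoding.
    static bool decode_one_token(common_params & params, llama_token tok) {
        params.model = "models/7B/ggml-model-f16.gguf";  // placeholder path
        common_init_result init = common_init_from_params(params);
        if (init.model == nullptr || init.context == nullptr) {
            return false;
        }

        llama_batch batch = llama_batch_init(/*n_tokens*/ 512, /*embd*/ 0, /*n_seq_max*/ 1);
        common_batch_clear(batch);
        // seq id list and logits flag as in upstream llama.cpp common_batch_add
        common_batch_add(batch, tok, /*pos*/ 0, {0}, /*logits*/ true);

        const bool ok = llama_decode(init.context, batch) == 0;
        llama_batch_free(batch);
        return ok;
    }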
@@ -458,13 +475,13 @@ void llama_batch_add(

  // tokenizes a string into a vector of tokens
  // should work similar to Python's `tokenizer.encode`
- std::vector<llama_token> llama_tokenize(
+ std::vector<llama_token> common_tokenize(
  const struct llama_context * ctx,
  const std::string & text,
  bool add_special,
  bool parse_special = false);

- std::vector<llama_token> llama_tokenize(
+ std::vector<llama_token> common_tokenize(
  const struct llama_model * model,
  const std::string & text,
  bool add_special,
@@ -472,7 +489,7 @@ std::vector<llama_token> llama_tokenize(

  // tokenizes a token into a piece, optionally renders special/control tokens
  // should work similar to Python's `tokenizer.id_to_piece`
- std::string llama_token_to_piece(
+ std::string common_token_to_piece(
  const struct llama_context * ctx,
  llama_token token,
  bool special = true);
@@ -480,7 +497,7 @@ std::string llama_token_to_piece(
  // detokenizes a vector of tokens into a string
  // should work similar to Python's `tokenizer.decode`
  // optionally renders special/control tokens
- std::string llama_detokenize(
+ std::string common_detokenize(
  llama_context * ctx,
  const std::vector<llama_token> & tokens,
  bool special = true);
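
The tokenizer helpers are a pure rename. A round-trip sketch using the three declarations above (the llama_context is assumed to come from common_init_from_params):

    #include "common.h"  // assumed include path
    #include <string>
    #include <vector>

    // Encode, inspect, and decode with the renamed helpers declared above.
    static std::string round_trip(llama_context * ctx, const std::string & text) {
        std::vector<llama_token> toks = common_tokenize(ctx, text, /*add_special*/ true);
        for (llama_token t : toks) {
            // common_token_to_piece renders one token, special tokens included by default
            (void) common_token_to_piece(ctx, t);
        }
        return common_detokenize(ctx, toks, /*special*/ true);
    }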
@@ -490,31 +507,31 @@ std::string llama_detokenize(
  //

  // same with llama_chat_message, but uses std::string
- struct llama_chat_msg {
+ struct common_chat_msg {
  std::string role;
  std::string content;
  };

  // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
- bool llama_chat_verify_template(const std::string & tmpl);
+ bool common_chat_verify_template(const std::string & tmpl);

  // CPP wrapper for llama_chat_apply_template
  // If the built-in template is not supported, we default to chatml
  // If the custom "tmpl" is not supported, we throw an error
- std::string llama_chat_apply_template(const struct llama_model * model,
+ std::string common_chat_apply_template(const struct llama_model * model,
  const std::string & tmpl,
- const std::vector<llama_chat_msg> & chat,
+ const std::vector<common_chat_msg> & chat,
  bool add_ass);

  // Format single message, while taking into account the position of that message in chat history
- std::string llama_chat_format_single(const struct llama_model * model,
+ std::string common_chat_format_single(const struct llama_model * model,
  const std::string & tmpl,
- const std::vector<llama_chat_msg> & past_msg,
- const llama_chat_msg & new_msg,
+ const std::vector<common_chat_msg> & past_msg,
+ const common_chat_msg & new_msg,
  bool add_ass);

  // Returns an example of formatted chat
- std::string llama_chat_format_example(const struct llama_model * model,
+ std::string common_chat_format_example(const struct llama_model * model,
  const std::string & tmpl);

  //
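
The chat helpers keep their shapes as well. The sketch below builds a short history with common_chat_msg and formats the next user turn; passing an empty template string is assumed to select the model's built-in chat template, as in upstream llama.cpp:

    #include "common.h"  // assumed include path
    #include <string>
    #include <vector>

    // Format a new user message against the existing history using the renamed API.
    static std::string format_next_turn(const llama_model * model) {
        std::vector<common_chat_msg> history = {
            { "system", "You are a concise assistant." },
            { "user",   "Hello!" },
        };
        common_chat_msg next = { "user", "What changed in 1.2.4?" };
        // empty tmpl: assumed to mean "use the model's built-in template"
        return common_chat_format_single(model, /*tmpl*/ "", history, next, /*add_ass*/ true);
    }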
@@ -522,31 +539,31 @@ std::string llama_chat_format_example(const struct llama_model * model,
  //

  // Dump the KV cache view with the number of sequences per cell.
- void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);

  // Dump the KV cache view showing individual sequences in each cell (long output).
- void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

  //
  // Embedding utils
  //

- void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

- float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);

  //
  // Control vector utils
  //

- struct llama_control_vector_data {
+ struct common_control_vector_data {
  int n_embd;

  // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
  std::vector<float> data;
  };

- struct llama_control_vector_load_info {
+ struct common_control_vector_load_info {
  float strength;

  std::string fname;
@@ -554,7 +571,7 @@ struct llama_control_vector_load_info {

  // Load control vectors, scale each by strength, and add them together.
  // On error, returns {-1, empty}
- llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);

  //
  // Split utils
@@ -573,5 +590,5 @@ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std
  void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);

  void yaml_dump_non_result_info(
- FILE * stream, const gpt_params & params, const llama_context * lctx,
+ FILE * stream, const common_params & params, const llama_context * lctx,
  const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
package/cpp/ggml-alloc.c CHANGED
@@ -14,7 +14,7 @@

  //#define LM_GGML_ALLOCATOR_DEBUG

- //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
+ //#define AT_PRINTF(...) LM_GGML_LOG_DEBUG(__VA_ARGS__)
  #define AT_PRINTF(...)


@@ -89,7 +89,7 @@ void lm_ggml_tallocr_alloc(struct lm_ggml_tallocr * talloc, struct lm_ggml_tenso
  size = LM_GGML_PAD(size, talloc->alignment);

  if (talloc->offset + size > lm_ggml_backend_buffer_get_size(talloc->buffer)) {
- fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+ LM_GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
  __func__, tensor->name, size, lm_ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
  LM_GGML_ABORT("not enough space in the buffer");
  }
@@ -172,7 +172,7 @@ static size_t lm_ggml_dyn_tallocr_alloc(struct lm_ggml_dyn_tallocr * alloc, size
  best_fit_block = alloc->n_free_blocks - 1;
  } else {
  // this should never happen
- fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+ LM_GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
  __func__, size, max_avail);
  LM_GGML_ABORT("not enough space in the buffer");
  }
@@ -209,16 +209,16 @@ static size_t lm_ggml_dyn_tallocr_alloc(struct lm_ggml_dyn_tallocr * alloc, size
  }
  }
  }
- fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+ LM_GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
  for (int i = 0; i < 1024; i++) {
  if (alloc->allocated_tensors[i].tensor) {
- fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+ LM_GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
  alloc->allocated_tensors[i].offset,
  alloc->allocated_tensors[i].offset + lm_ggml_nbytes(alloc->allocated_tensors[i].tensor),
  lm_ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
  }
  }
- fprintf(stderr, "\n");
+ LM_GGML_LOG_DEBUG("\n");
  }
  #endif

@@ -348,7 +348,6 @@ struct tensor_alloc {
  };

  struct leaf_alloc {
- int buffer_id;
  struct tensor_alloc leaf;
  };

@@ -740,7 +739,6 @@ bool lm_ggml_gallocr_reserve_n(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph *
  for (int i = 0; i < graph->n_leafs; i++) {
  struct lm_ggml_tensor * leaf = graph->leafs[i];
  struct hash_node * hn = lm_ggml_gallocr_hash_get(galloc, leaf);
- galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
  if (leaf->view_src || leaf->data) {
  galloc->leaf_allocs[i].leaf.buffer_id = -1;
  galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
@@ -768,13 +766,13 @@ bool lm_ggml_gallocr_reserve_n(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph *
  // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
  if (new_size > cur_size || galloc->buffers[i] == NULL) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+ LM_GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
  #endif

  lm_ggml_backend_buffer_free(galloc->buffers[i]);
  galloc->buffers[i] = lm_ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
  if (galloc->buffers[i] == NULL) {
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), new_size);
+ LM_GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), new_size);
  return false;
  }
  lm_ggml_backend_buffer_set_usage(galloc->buffers[i], LM_GGML_BACKEND_BUFFER_USAGE_COMPUTE);
@@ -825,14 +823,14 @@ static bool lm_ggml_gallocr_node_needs_realloc(lm_ggml_gallocr_t galloc, struct
  static bool lm_ggml_gallocr_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph) {
  if (galloc->n_nodes != graph->n_nodes) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
+ LM_GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
  #endif
  return true;
  }

  if (galloc->n_leafs != graph->n_leafs) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+ LM_GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
  #endif
  return true;
  }
@@ -843,7 +841,7 @@ static bool lm_ggml_gallocr_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_gg

  if (!lm_ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
+ LM_GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
  #endif
  return true;
  }
@@ -855,7 +853,7 @@ static bool lm_ggml_gallocr_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_gg
  }
  if (!lm_ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
+ LM_GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
  #endif
  return true;
  }
@@ -869,14 +867,14 @@ bool lm_ggml_gallocr_alloc_graph(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph
  if (lm_ggml_gallocr_needs_realloc(galloc, graph)) {
  if (galloc->n_buffers == 1) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
+ LM_GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
  #endif
  if (!lm_ggml_gallocr_reserve(galloc, graph)) {
  return false;
  }
  } else {
  #ifndef NDEBUG
- fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
+ LM_GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
  #endif
  return false;
  }
@@ -940,7 +938,7 @@ static bool alloc_tensor_range(struct lm_ggml_context * ctx,
  lm_ggml_backend_buffer_t buffer = lm_ggml_backend_buft_alloc_buffer(buft, size);
  if (buffer == NULL) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(buft), size);
+ LM_GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(buft), size);
  #endif
  for (size_t i = 0; i < *n_buffers; i++) {
  lm_ggml_backend_buffer_free((*buffers)[i]);
@@ -990,7 +988,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_g
  }

  if (this_size > max_size) {
- fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+ LM_GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
  __func__, t->name,
  lm_ggml_backend_buft_name(buft),
  this_size, max_size);
@@ -1022,7 +1020,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_g

  if (n_buffers == 0) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+ LM_GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
  #endif
  return NULL;
  }
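
The ggml-alloc.c changes route allocator diagnostics through the LM_GGML_LOG_* macros instead of printing to stderr directly. Assuming this fork mirrors upstream ggml's ggml_log_set callback registration under the lm_ prefix (not shown in this diff), an embedder could capture those messages roughly like this:

    #include <cstdio>
    #include "ggml.h"  // assumed include path for the lm_ggml_* declarations

    // Hypothetical log sink: forwards allocator errors somewhere useful.
    // lm_ggml_log_set, lm_ggml_log_level, and LM_GGML_LOG_LEVEL_ERROR are assumed
    // to mirror upstream ggml's ggml_log_set / ggml_log_level under the lm_ prefix.
    static void my_log_cb(enum lm_ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        if (level == LM_GGML_LOG_LEVEL_ERROR) {
            fprintf(stderr, "[ggml-error] %s", text);
        }
    }

    static void install_log_sink() {
        lm_ggml_log_set(my_log_cb, /*user_data*/ nullptr);
    }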