cui-llama.rn 1.2.2 → 1.2.4

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/cpp/common.h CHANGED
@@ -24,12 +24,12 @@

  #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

- struct llama_lora_adapter_info {
+ struct common_lora_adapter_info {
  std::string path;
  float scale;
  };

- struct llama_lora_adapter_container : llama_lora_adapter_info {
+ struct common_lora_adapter_container : common_lora_adapter_info {
  struct llama_lora_adapter * adapter;
  };

@@ -39,7 +39,7 @@ extern char const * LLAMA_COMMIT;
  extern char const * LLAMA_COMPILER;
  extern char const * LLAMA_BUILD_TARGET;

- struct llama_control_vector_load_info;
+ struct common_control_vector_load_info;

  #define print_build_info() do { \
  fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \
@@ -93,15 +93,16 @@ enum llama_example {
  LLAMA_EXAMPLE_COUNT,
  };

- enum gpt_sampler_type {
- GPT_SAMPLER_TYPE_NONE = 0,
- GPT_SAMPLER_TYPE_TOP_K = 1,
- GPT_SAMPLER_TYPE_TOP_P = 2,
- GPT_SAMPLER_TYPE_MIN_P = 3,
- GPT_SAMPLER_TYPE_TFS_Z = 4,
- GPT_SAMPLER_TYPE_TYPICAL_P = 5,
- GPT_SAMPLER_TYPE_TEMPERATURE = 6,
- GPT_SAMPLER_TYPE_XTC = 7,
+ enum common_sampler_type {
+ COMMON_SAMPLER_TYPE_NONE = 0,
+ COMMON_SAMPLER_TYPE_TOP_K = 1,
+ COMMON_SAMPLER_TYPE_TOP_P = 2,
+ COMMON_SAMPLER_TYPE_MIN_P = 3,
+ COMMON_SAMPLER_TYPE_TFS_Z = 4,
+ COMMON_SAMPLER_TYPE_TYPICAL_P = 5,
+ COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
+ COMMON_SAMPLER_TYPE_XTC = 7,
+ COMMON_SAMPLER_TYPE_INFILL = 8,
  };

  // dimensionality reduction methods, used by cvector-generator
@@ -111,7 +112,7 @@ enum dimre_method {
  };

  // sampler parameters
- struct gpt_sampler_params {
+ struct common_sampler_params {
  uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

  int32_t n_prev = 64; // number of previous tokens to remember
@@ -120,6 +121,8 @@ struct gpt_sampler_params {
  int32_t top_k = 40; // <= 0 to use vocab size
  float top_p = 0.95f; // 1.0 = disabled
  float min_p = 0.05f; // 0.0 = disabled
+ float xtc_probability = 0.00f; // 0.0 = disabled
+ float xtc_threshold = 0.10f; // > 0.5 disables XTC
  float tfs_z = 1.00f; // 1.0 = disabled
  float xtc_t = 0.0f; // 0.0 = disabled
  float xtc_p = 0.0f;
@@ -138,14 +141,15 @@ struct gpt_sampler_params {
  bool ignore_eos = false;
  bool no_perf = false; // disable performance metrics

- std::vector<enum gpt_sampler_type> samplers = {
- GPT_SAMPLER_TYPE_TOP_K,
- GPT_SAMPLER_TYPE_TFS_Z,
- GPT_SAMPLER_TYPE_TYPICAL_P,
- GPT_SAMPLER_TYPE_TOP_P,
- GPT_SAMPLER_TYPE_MIN_P,
- GPT_SAMPLER_TYPE_TEMPERATURE,
- GPT_SAMPLER_TYPE_XTC
+
+ std::vector<enum common_sampler_type> samplers = {
+ COMMON_SAMPLER_TYPE_TOP_K,
+ COMMON_SAMPLER_TYPE_TFS_Z,
+ COMMON_SAMPLER_TYPE_TYPICAL_P,
+ COMMON_SAMPLER_TYPE_TOP_P,
+ COMMON_SAMPLER_TYPE_MIN_P,
+ COMMON_SAMPLER_TYPE_XTC,
+ COMMON_SAMPLER_TYPE_TEMPERATURE,
  };

  std::string grammar; // optional BNF-like grammar to constrain sampling
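
For callers migrating off gpt_sampler_params, the renamed struct keeps its layout and gains the two XTC knobs added above. A minimal configuration sketch, assuming "common.h" from this package's cpp/ sources is on the include path (the values and sampler order are illustrative, not recommended defaults):

    #include "common.h"  // assumed include path for this package's cpp/ sources

    // Field names match the common_sampler_params hunk above.
    static common_sampler_params make_sampler_params() {
        common_sampler_params sparams;
        sparams.top_k           = 40;
        sparams.top_p           = 0.95f;
        sparams.min_p           = 0.05f;
        sparams.xtc_probability = 0.50f;  // 0.0 disables XTC entirely
        sparams.xtc_threshold   = 0.10f;  // values > 0.5 also disable XTC
        // Override the sampler chain; the new default runs XTC before temperature.
        sparams.samplers = {
            COMMON_SAMPLER_TYPE_TOP_K,
            COMMON_SAMPLER_TYPE_MIN_P,
            COMMON_SAMPLER_TYPE_XTC,
            COMMON_SAMPLER_TYPE_TEMPERATURE,
        };
        return sparams;
    }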
@@ -156,7 +160,7 @@ struct gpt_sampler_params {
  std::string print() const;
  };

- struct gpt_params {
+ struct common_params {

  void * progress_callback_user_data = nullptr;
  llama_progress_callback progress_callback = nullptr;
@@ -202,7 +206,7 @@ struct gpt_params {
  enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
  enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

- struct gpt_sampler_params sparams;
+ struct common_sampler_params sparams;

  std::string model = ""; // model path // NOLINT
  std::string model_draft = ""; // draft model for speculative decoding // NOLINT
@@ -227,9 +231,9 @@ struct gpt_params {
  std::vector<llama_model_kv_override> kv_overrides;

  bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
- std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+ std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale

- std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+ std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale

  int32_t verbosity = 0;
  int32_t control_vector_layer_start = -1; // layer range for control vector
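
The common_params members above only change their element types, so caller code needs nothing beyond the llama_ to common_ rename. A hedged sketch of populating them (the file paths are placeholders):

    #include "common.h"  // assumed include path

    // Fill the renamed vectors on common_params; member names come from the hunk above.
    static void add_adapters(common_params & params) {
        common_lora_adapter_info lora;
        lora.path  = "adapters/style.gguf";   // placeholder path
        lora.scale = 0.8f;
        params.lora_adapters.push_back(lora);

        common_control_vector_load_info cvec;
        cvec.strength = 0.5f;
        cvec.fname    = "vectors/calm.gguf";  // placeholder path
        params.control_vectors.push_back(cvec);
    }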
@@ -296,12 +300,12 @@ struct gpt_params {
  int32_t port = 8080; // server listens on this network port
  int32_t timeout_read = 600; // http read timeout in seconds
  int32_t timeout_write = timeout_read; // http write timeout in seconds
- int n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+ int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+ int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting

  std::string hostname = "127.0.0.1";
  std::string public_path = ""; // NOLINT
  std::string chat_template = ""; // NOLINT
- std::string system_prompt = ""; // NOLINT
  bool enable_chat_template = true;

  std::vector<std::string> api_keys;
@@ -367,19 +371,32 @@ struct gpt_params {

  // call once at the start of a program if it uses libcommon
  // initializes the logging system and prints info about the build
- void gpt_init();
+ void common_init();

- std::string gpt_params_get_system_info(const gpt_params & params);
+ std::string common_params_get_system_info(const common_params & params);

- bool parse_cpu_range(const std::string& range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
- bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
- void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+ bool parse_cpu_range(const std::string & range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
+ bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
+ void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
  bool set_process_priority(enum lm_ggml_sched_priority prio);

  //
  // String utils
  //

+ #ifdef __GNUC__
+ #ifdef __MINGW32__
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+ #else
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+ #endif
+ #else
+ #define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+ #endif
+
+ LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+ std::string string_format(const char * fmt, ...);
+
  std::vector<std::string> string_split(std::string input, char separator);

  std::string string_strip(const std::string & str);
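
The new LLAMA_COMMON_ATTRIBUTE_FORMAT macro lets GCC/Clang type-check string_format arguments the same way they check printf. A small usage sketch (the message text is illustrative):

    #include "common.h"  // assumed include path
    #include <string>

    // string_format is declared above with LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2),
    // so mismatched format arguments produce a compiler warning.
    static std::string describe_buffer(const char * name, size_t size_bytes) {
        return string_format("%s: %zu bytes (%.2f MiB)",
                             name, size_bytes, size_bytes / 1024.0 / 1024.0);
    }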
@@ -423,29 +440,29 @@ std::string fs_get_cache_file(const std::string & filename);
  // Model utils
  //

- struct llama_init_result {
+ struct common_init_result {
  struct llama_model * model = nullptr;
  struct llama_context * context = nullptr;
- std::vector<llama_lora_adapter_container> lora_adapters;
+ std::vector<common_lora_adapter_container> lora_adapters;
  };

- struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
+ struct common_init_result common_init_from_params(common_params & params);

- struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params);
- struct llama_context_params llama_context_params_from_gpt_params (const gpt_params & params);
+ struct llama_model_params common_model_params_to_llama (const common_params & params);
+ struct llama_context_params common_context_params_to_llama(const common_params & params);
  struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);

- struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
- struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+ struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+ struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

  // clear LoRA adapters from context, then apply new list of adapters
- void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+ void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);

  // Batch utils

- void llama_batch_clear(struct llama_batch & batch);
+ void common_batch_clear(struct llama_batch & batch);

- void llama_batch_add(
+ void common_batch_add(
  struct llama_batch & batch,
  llama_token id,
  llama_pos pos,
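
The model and batch helpers keep their old parameter lists under the new common_ names. A hedged load-and-decode sketch: the model path reuses the DEFAULT_MODEL_PATH placeholder, and the trailing common_batch_add arguments (sequence ids, logits flag) follow the upstream llama.cpp signature, which this hunk truncates; model/context cleanup is omitted for brevity.

    #include "common.h"  // assumed include path
    #include "llama.h"

    // Load a model/context from common_params, then queue one token for decoding.
    static bool decode_one_token(common_params & params, llama_token tok) {
        params.model = "models/7B/ggml-model-f16.gguf";  // placeholder path
        common_init_result init = common_init_from_params(params);
        if (init.model == nullptr || init.context == nullptr) {
            return false;
        }

        llama_batch batch = llama_batch_init(/*n_tokens*/ 512, /*embd*/ 0, /*n_seq_max*/ 1);
        common_batch_clear(batch);
        // seq id list and logits flag as in upstream llama.cpp common_batch_add
        common_batch_add(batch, tok, /*pos*/ 0, {0}, /*logits*/ true);

        const bool ok = llama_decode(init.context, batch) == 0;
        llama_batch_free(batch);
        return ok;
    }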
@@ -458,13 +475,13 @@ void llama_batch_add(

  // tokenizes a string into a vector of tokens
  // should work similar to Python's `tokenizer.encode`
- std::vector<llama_token> llama_tokenize(
+ std::vector<llama_token> common_tokenize(
  const struct llama_context * ctx,
  const std::string & text,
  bool add_special,
  bool parse_special = false);

- std::vector<llama_token> llama_tokenize(
+ std::vector<llama_token> common_tokenize(
  const struct llama_model * model,
  const std::string & text,
  bool add_special,
@@ -472,7 +489,7 @@ std::vector<llama_token> llama_tokenize(

  // tokenizes a token into a piece, optionally renders special/control tokens
  // should work similar to Python's `tokenizer.id_to_piece`
- std::string llama_token_to_piece(
+ std::string common_token_to_piece(
  const struct llama_context * ctx,
  llama_token token,
  bool special = true);
@@ -480,7 +497,7 @@ std::string llama_token_to_piece(
  // detokenizes a vector of tokens into a string
  // should work similar to Python's `tokenizer.decode`
  // optionally renders special/control tokens
- std::string llama_detokenize(
+ std::string common_detokenize(
  llama_context * ctx,
  const std::vector<llama_token> & tokens,
  bool special = true);
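
The tokenizer helpers are a pure rename. A round-trip sketch using the three declarations above (the llama_context is assumed to come from common_init_from_params):

    #include "common.h"  // assumed include path
    #include <string>
    #include <vector>

    // Encode, inspect, and decode with the renamed helpers declared above.
    static std::string round_trip(llama_context * ctx, const std::string & text) {
        std::vector<llama_token> toks = common_tokenize(ctx, text, /*add_special*/ true);
        for (llama_token t : toks) {
            // common_token_to_piece renders one token, special tokens included by default
            (void) common_token_to_piece(ctx, t);
        }
        return common_detokenize(ctx, toks, /*special*/ true);
    }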
@@ -490,31 +507,31 @@ std::string llama_detokenize(
  //

  // same with llama_chat_message, but uses std::string
- struct llama_chat_msg {
+ struct common_chat_msg {
  std::string role;
  std::string content;
  };

  // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
- bool llama_chat_verify_template(const std::string & tmpl);
+ bool common_chat_verify_template(const std::string & tmpl);

  // CPP wrapper for llama_chat_apply_template
  // If the built-in template is not supported, we default to chatml
  // If the custom "tmpl" is not supported, we throw an error
- std::string llama_chat_apply_template(const struct llama_model * model,
+ std::string common_chat_apply_template(const struct llama_model * model,
  const std::string & tmpl,
- const std::vector<llama_chat_msg> & chat,
+ const std::vector<common_chat_msg> & chat,
  bool add_ass);

  // Format single message, while taking into account the position of that message in chat history
- std::string llama_chat_format_single(const struct llama_model * model,
+ std::string common_chat_format_single(const struct llama_model * model,
  const std::string & tmpl,
- const std::vector<llama_chat_msg> & past_msg,
- const llama_chat_msg & new_msg,
+ const std::vector<common_chat_msg> & past_msg,
+ const common_chat_msg & new_msg,
  bool add_ass);

  // Returns an example of formatted chat
- std::string llama_chat_format_example(const struct llama_model * model,
+ std::string common_chat_format_example(const struct llama_model * model,
  const std::string & tmpl);

  //
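
The chat helpers keep their shapes as well. The sketch below builds a short history with common_chat_msg and formats the next user turn; passing an empty template string is assumed to select the model's built-in chat template, as in upstream llama.cpp:

    #include "common.h"  // assumed include path
    #include <string>
    #include <vector>

    // Format a new user message against the existing history using the renamed API.
    static std::string format_next_turn(const llama_model * model) {
        std::vector<common_chat_msg> history = {
            { "system", "You are a concise assistant." },
            { "user",   "Hello!" },
        };
        common_chat_msg next = { "user", "What changed in 1.2.4?" };
        // empty tmpl: assumed to mean "use the model's built-in template"
        return common_chat_format_single(model, /*tmpl*/ "", history, next, /*add_ass*/ true);
    }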
@@ -522,31 +539,31 @@ std::string llama_chat_format_example(const struct llama_model * model,
  //

  // Dump the KV cache view with the number of sequences per cell.
- void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+ void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);

  // Dump the KV cache view showing individual sequences in each cell (long output).
- void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+ void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

  //
  // Embedding utils
  //

- void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+ void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

- float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+ float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);

  //
  // Control vector utils
  //

- struct llama_control_vector_data {
+ struct common_control_vector_data {
  int n_embd;

  // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
  std::vector<float> data;
  };

- struct llama_control_vector_load_info {
+ struct common_control_vector_load_info {
  float strength;

  std::string fname;
@@ -554,7 +571,7 @@ struct llama_control_vector_load_info {

  // Load control vectors, scale each by strength, and add them together.
  // On error, returns {-1, empty}
- llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+ common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);

  //
  // Split utils
@@ -573,5 +590,5 @@ void yaml_dump_vector_int (FILE * stream, const char * prop_name, const std
  void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);

  void yaml_dump_non_result_info(
- FILE * stream, const gpt_params & params, const llama_context * lctx,
+ FILE * stream, const common_params & params, const llama_context * lctx,
  const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
package/cpp/ggml-alloc.c CHANGED
@@ -14,7 +14,7 @@

  //#define LM_GGML_ALLOCATOR_DEBUG

- //#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
+ //#define AT_PRINTF(...) LM_GGML_LOG_DEBUG(__VA_ARGS__)
  #define AT_PRINTF(...)


@@ -89,7 +89,7 @@ void lm_ggml_tallocr_alloc(struct lm_ggml_tallocr * talloc, struct lm_ggml_tenso
  size = LM_GGML_PAD(size, talloc->alignment);

  if (talloc->offset + size > lm_ggml_backend_buffer_get_size(talloc->buffer)) {
- fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+ LM_GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
  __func__, tensor->name, size, lm_ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
  LM_GGML_ABORT("not enough space in the buffer");
  }
@@ -172,7 +172,7 @@ static size_t lm_ggml_dyn_tallocr_alloc(struct lm_ggml_dyn_tallocr * alloc, size
  best_fit_block = alloc->n_free_blocks - 1;
  } else {
  // this should never happen
- fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+ LM_GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
  __func__, size, max_avail);
  LM_GGML_ABORT("not enough space in the buffer");
  }
@@ -209,16 +209,16 @@ static size_t lm_ggml_dyn_tallocr_alloc(struct lm_ggml_dyn_tallocr * alloc, size
  }
  }
  }
- fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+ LM_GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
  for (int i = 0; i < 1024; i++) {
  if (alloc->allocated_tensors[i].tensor) {
- fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+ LM_GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
  alloc->allocated_tensors[i].offset,
  alloc->allocated_tensors[i].offset + lm_ggml_nbytes(alloc->allocated_tensors[i].tensor),
  lm_ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
  }
  }
- fprintf(stderr, "\n");
+ LM_GGML_LOG_DEBUG("\n");
  }
  #endif

@@ -348,7 +348,6 @@ struct tensor_alloc {
  };

  struct leaf_alloc {
- int buffer_id;
  struct tensor_alloc leaf;
  };

@@ -740,7 +739,6 @@ bool lm_ggml_gallocr_reserve_n(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph *
  for (int i = 0; i < graph->n_leafs; i++) {
  struct lm_ggml_tensor * leaf = graph->leafs[i];
  struct hash_node * hn = lm_ggml_gallocr_hash_get(galloc, leaf);
- galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
  if (leaf->view_src || leaf->data) {
  galloc->leaf_allocs[i].leaf.buffer_id = -1;
  galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
@@ -768,13 +766,13 @@ bool lm_ggml_gallocr_reserve_n(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph *
  // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
  if (new_size > cur_size || galloc->buffers[i] == NULL) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+ LM_GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
  #endif

  lm_ggml_backend_buffer_free(galloc->buffers[i]);
  galloc->buffers[i] = lm_ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
  if (galloc->buffers[i] == NULL) {
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), new_size);
+ LM_GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), new_size);
  return false;
  }
  lm_ggml_backend_buffer_set_usage(galloc->buffers[i], LM_GGML_BACKEND_BUFFER_USAGE_COMPUTE);
@@ -825,14 +823,14 @@ static bool lm_ggml_gallocr_node_needs_realloc(lm_ggml_gallocr_t galloc, struct
  static bool lm_ggml_gallocr_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph) {
  if (galloc->n_nodes != graph->n_nodes) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
+ LM_GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
  #endif
  return true;
  }

  if (galloc->n_leafs != graph->n_leafs) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+ LM_GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
  #endif
  return true;
  }
@@ -843,7 +841,7 @@ static bool lm_ggml_gallocr_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_gg

  if (!lm_ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
+ LM_GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
  #endif
  return true;
  }
@@ -855,7 +853,7 @@ static bool lm_ggml_gallocr_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_gg
  }
  if (!lm_ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
+ LM_GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
  #endif
  return true;
  }
@@ -869,14 +867,14 @@ bool lm_ggml_gallocr_alloc_graph(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph
  if (lm_ggml_gallocr_needs_realloc(galloc, graph)) {
  if (galloc->n_buffers == 1) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
+ LM_GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
  #endif
  if (!lm_ggml_gallocr_reserve(galloc, graph)) {
  return false;
  }
  } else {
  #ifndef NDEBUG
- fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
+ LM_GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
  #endif
  return false;
  }
@@ -940,7 +938,7 @@ static bool alloc_tensor_range(struct lm_ggml_context * ctx,
  lm_ggml_backend_buffer_t buffer = lm_ggml_backend_buft_alloc_buffer(buft, size);
  if (buffer == NULL) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(buft), size);
+ LM_GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(buft), size);
  #endif
  for (size_t i = 0; i < *n_buffers; i++) {
  lm_ggml_backend_buffer_free((*buffers)[i]);
@@ -990,7 +988,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_g
  }

  if (this_size > max_size) {
- fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+ LM_GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
  __func__, t->name,
  lm_ggml_backend_buft_name(buft),
  this_size, max_size);
@@ -1022,7 +1020,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_g

  if (n_buffers == 0) {
  #ifndef NDEBUG
- fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+ LM_GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
  #endif
  return NULL;
  }
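
The ggml-alloc.c changes route allocator diagnostics through the LM_GGML_LOG_* macros instead of printing to stderr directly. Assuming this fork mirrors upstream ggml's ggml_log_set callback registration under the lm_ prefix (not shown in this diff), an embedder could capture those messages roughly like this:

    #include <cstdio>
    #include "ggml.h"  // assumed include path for the lm_ggml_* declarations

    // Hypothetical log sink: forwards allocator errors somewhere useful.
    // lm_ggml_log_set, lm_ggml_log_level, and LM_GGML_LOG_LEVEL_ERROR are assumed
    // to mirror upstream ggml's ggml_log_set / ggml_log_level under the lm_ prefix.
    static void my_log_cb(enum lm_ggml_log_level level, const char * text, void * user_data) {
        (void) user_data;
        if (level == LM_GGML_LOG_LEVEL_ERROR) {
            fprintf(stderr, "[ggml-error] %s", text);
        }
    }

    static void install_log_sink() {
        lm_ggml_log_set(my_log_cb, /*user_data*/ nullptr);
    }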