cui-llama.rn 1.2.2 → 1.2.4
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/android/src/main/java/com/rnllama/LlamaContext.java +5 -2
- package/android/src/main/jni.cpp +7 -7
- package/cpp/common.cpp +81 -63
- package/cpp/common.h +79 -62
- package/cpp/ggml-alloc.c +17 -19
- package/cpp/ggml-backend.cpp +59 -24
- package/cpp/ggml-impl.h +8 -0
- package/cpp/ggml.c +65 -23
- package/cpp/ggml.h +1 -0
- package/cpp/json-schema-to-grammar.cpp +1 -1
- package/cpp/llama-sampling.cpp +366 -24
- package/cpp/llama-sampling.h +3 -2
- package/cpp/llama-vocab.cpp +33 -9
- package/cpp/llama-vocab.h +30 -11
- package/cpp/llama.cpp +471 -387
- package/cpp/llama.h +52 -21
- package/cpp/log.cpp +50 -50
- package/cpp/log.h +18 -18
- package/cpp/rn-llama.hpp +23 -22
- package/cpp/sampling.cpp +110 -119
- package/cpp/sampling.h +20 -20
- package/package.json +1 -1
package/cpp/common.h
CHANGED
@@ -24,12 +24,12 @@

 #define DEFAULT_MODEL_PATH "models/7B/ggml-model-f16.gguf"

-struct llama_lora_adapter_info {
+struct common_lora_adapter_info {
     std::string path;
     float scale;
 };

-struct llama_lora_adapter_container : llama_lora_adapter_info {
+struct common_lora_adapter_container : common_lora_adapter_info {
     struct llama_lora_adapter * adapter;
 };

@@ -39,7 +39,7 @@ extern char const * LLAMA_COMMIT;
 extern char const * LLAMA_COMPILER;
 extern char const * LLAMA_BUILD_TARGET;

-struct llama_control_vector_load_info;
+struct common_control_vector_load_info;

 #define print_build_info() do {                                                            \
     fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);  \
@@ -93,15 +93,16 @@ enum llama_example {
     LLAMA_EXAMPLE_COUNT,
 };

-enum gpt_sampler_type {
-    GPT_SAMPLER_TYPE_NONE        = 0,
-    GPT_SAMPLER_TYPE_TOP_K       = 1,
-    GPT_SAMPLER_TYPE_TOP_P       = 2,
-    GPT_SAMPLER_TYPE_MIN_P       = 3,
-    GPT_SAMPLER_TYPE_TFS_Z       = 4,
-    GPT_SAMPLER_TYPE_TYPICAL_P   = 5,
-    GPT_SAMPLER_TYPE_TEMPERATURE = 6,
-    GPT_SAMPLER_TYPE_XTC         = 7,
+enum common_sampler_type {
+    COMMON_SAMPLER_TYPE_NONE        = 0,
+    COMMON_SAMPLER_TYPE_TOP_K       = 1,
+    COMMON_SAMPLER_TYPE_TOP_P       = 2,
+    COMMON_SAMPLER_TYPE_MIN_P       = 3,
+    COMMON_SAMPLER_TYPE_TFS_Z       = 4,
+    COMMON_SAMPLER_TYPE_TYPICAL_P   = 5,
+    COMMON_SAMPLER_TYPE_TEMPERATURE = 6,
+    COMMON_SAMPLER_TYPE_XTC         = 7,
+    COMMON_SAMPLER_TYPE_INFILL      = 8,
 };

 // dimensionality reduction methods, used by cvector-generator
@@ -111,7 +112,7 @@ enum dimre_method {
 };

 // sampler parameters
-struct gpt_sampler_params {
+struct common_sampler_params {
     uint32_t seed = LLAMA_DEFAULT_SEED; // the seed used to initialize llama_sampler

     int32_t n_prev = 64; // number of previous tokens to remember
@@ -120,6 +121,8 @@ struct gpt_sampler_params {
     int32_t top_k = 40;    // <= 0 to use vocab size
     float   top_p = 0.95f; // 1.0 = disabled
     float   min_p = 0.05f; // 0.0 = disabled
+    float   xtc_probability = 0.00f; // 0.0 = disabled
+    float   xtc_threshold   = 0.10f; // > 0.5 disables XTC
     float   tfs_z = 1.00f; // 1.0 = disabled
     float   xtc_t = 0.0f;  // 0.0 = disabled
     float   xtc_p = 0.0f;
@@ -138,14 +141,15 @@ struct gpt_sampler_params {
     bool ignore_eos = false;
     bool no_perf    = false; // disable performance metrics

-    std::vector<enum gpt_sampler_type> samplers = {
-        GPT_SAMPLER_TYPE_TOP_K,
-        GPT_SAMPLER_TYPE_TFS_Z,
-        GPT_SAMPLER_TYPE_TYPICAL_P,
-        GPT_SAMPLER_TYPE_TOP_P,
-        GPT_SAMPLER_TYPE_MIN_P,
-        GPT_SAMPLER_TYPE_XTC,
-        GPT_SAMPLER_TYPE_TEMPERATURE
+
+    std::vector<enum common_sampler_type> samplers = {
+        COMMON_SAMPLER_TYPE_TOP_K,
+        COMMON_SAMPLER_TYPE_TFS_Z,
+        COMMON_SAMPLER_TYPE_TYPICAL_P,
+        COMMON_SAMPLER_TYPE_TOP_P,
+        COMMON_SAMPLER_TYPE_MIN_P,
+        COMMON_SAMPLER_TYPE_XTC,
+        COMMON_SAMPLER_TYPE_TEMPERATURE,
     };

     std::string grammar; // optional BNF-like grammar to constrain sampling
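
The XTC additions above pair two new knobs (`xtc_probability`, `xtc_threshold`) with a `COMMON_SAMPLER_TYPE_XTC` stage in the default sampler chain. As a rough illustration only — the struct, enum values, and default chain come from the hunks above, while the field values and the `configure_xtc` helper are hypothetical — a caller could tune XTC like this:

```cpp
#include "common.h"

// Hypothetical helper: enables XTC in a common_sampler_params instance.
// XTC (as usually described) randomly removes the most probable candidates
// above a threshold, steering generation away from stock continuations.
void configure_xtc(common_sampler_params & sparams) {
    sparams.xtc_probability = 0.5f; // chance that an XTC step is applied at all
    sparams.xtc_threshold   = 0.1f; // tokens with p > 0.1 become removal candidates
    // Default chain from the diff: XTC runs after the truncation samplers
    // and before temperature.
    sparams.samplers = {
        COMMON_SAMPLER_TYPE_TOP_K,
        COMMON_SAMPLER_TYPE_TFS_Z,
        COMMON_SAMPLER_TYPE_TYPICAL_P,
        COMMON_SAMPLER_TYPE_TOP_P,
        COMMON_SAMPLER_TYPE_MIN_P,
        COMMON_SAMPLER_TYPE_XTC,
        COMMON_SAMPLER_TYPE_TEMPERATURE,
    };
}
```

Note the comment on `xtc_threshold` in the diff: values above 0.5 effectively disable XTC, since at most one token can then clear the threshold and there is nothing left to remove.
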
@@ -156,7 +160,7 @@ struct gpt_sampler_params {
     std::string print() const;
 };

-struct gpt_params {
+struct common_params {

     void * progress_callback_user_data = nullptr;
     llama_progress_callback progress_callback = nullptr;
@@ -202,7 +206,7 @@ struct gpt_params {
     enum llama_pooling_type pooling_type = LLAMA_POOLING_TYPE_UNSPECIFIED; // pooling type for embeddings
     enum llama_attention_type attention_type = LLAMA_ATTENTION_TYPE_UNSPECIFIED; // attention type for embeddings

-    struct gpt_sampler_params sparams;
+    struct common_sampler_params sparams;

     std::string model = ""; // model path // NOLINT
     std::string model_draft = ""; // draft model for speculative decoding // NOLINT
@@ -227,9 +231,9 @@ struct gpt_params {
     std::vector<llama_model_kv_override> kv_overrides;

     bool lora_init_without_apply = false; // only load lora to memory, but do not apply it to ctx (user can manually apply lora later using llama_lora_adapter_apply)
-    std::vector<llama_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale
+    std::vector<common_lora_adapter_info> lora_adapters; // lora adapter path with user defined scale

-    std::vector<llama_control_vector_load_info> control_vectors; // control vector with user defined scale
+    std::vector<common_control_vector_load_info> control_vectors; // control vector with user defined scale

     int32_t verbosity = 0;
     int32_t control_vector_layer_start = -1; // layer range for control vector
@@ -296,12 +300,12 @@ struct gpt_params {
     int32_t port           = 8080;         // server listens on this network port
     int32_t timeout_read   = 600;          // http read timeout in seconds
     int32_t timeout_write  = timeout_read; // http write timeout in seconds
-    int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_threads_http = -1;           // number of threads to process HTTP requests (TODO: support threadpool)
+    int32_t n_cache_reuse  = 0;            // min chunk size to reuse from the cache via KV shifting

     std::string hostname      = "127.0.0.1";
     std::string public_path   = ""; // NOLINT
     std::string chat_template = ""; // NOLINT
-    std::string system_prompt = ""; // NOLINT
     bool enable_chat_template = true;

     std::vector<std::string> api_keys;
@@ -367,19 +371,32 @@ struct gpt_params {

 // call once at the start of a program if it uses libcommon
 // initializes the logging system and prints info about the build
-void gpt_init();
+void common_init();

-std::string gpt_params_get_system_info(const gpt_params & params);
+std::string common_params_get_system_info(const common_params & params);

-bool parse_cpu_range(const std::string& range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
-bool parse_cpu_mask(const std::string& mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
-void postprocess_cpu_params(cpu_params& cpuparams, const cpu_params* role_model = nullptr);
+bool parse_cpu_range(const std::string & range, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
+bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[LM_GGML_MAX_N_THREADS]);
+void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
 bool set_process_priority(enum lm_ggml_sched_priority prio);

 //
 // String utils
 //

+#ifdef __GNUC__
+#ifdef __MINGW32__
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(gnu_printf, __VA_ARGS__)))
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...) __attribute__((format(printf, __VA_ARGS__)))
+#endif
+#else
+#define LLAMA_COMMON_ATTRIBUTE_FORMAT(...)
+#endif
+
+LLAMA_COMMON_ATTRIBUTE_FORMAT(1, 2)
+std::string string_format(const char * fmt, ...);
+
 std::vector<std::string> string_split(std::string input, char separator);

 std::string string_strip(const std::string & str);
@@ -423,29 +440,29 @@ std::string fs_get_cache_file(const std::string & filename);
 // Model utils
 //

-struct llama_init_result {
+struct common_init_result {
     struct llama_model   * model   = nullptr;
     struct llama_context * context = nullptr;
-    std::vector<llama_lora_adapter_container> lora_adapters;
+    std::vector<common_lora_adapter_container> lora_adapters;
 };

-struct llama_init_result llama_init_from_gpt_params(gpt_params & params);
+struct common_init_result common_init_from_params(common_params & params);

-struct llama_model_params   llama_model_params_from_gpt_params  (const gpt_params & params);
-struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & params);
+struct llama_model_params   common_model_params_to_llama  (const common_params & params);
+struct llama_context_params common_context_params_to_llama(const common_params & params);
 struct lm_ggml_threadpool_params lm_ggml_threadpool_params_from_cpu_params(const cpu_params & params);

-struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
-struct llama_model * llama_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_url(const char * model_url, const char * path_model, const char * hf_token, const struct llama_model_params & params);
+struct llama_model * common_load_model_from_hf(const char * repo, const char * file, const char * path_model, const char * hf_token, const struct llama_model_params & params);

 // clear LoRA adapters from context, then apply new list of adapters
-void llama_lora_adapters_apply(struct llama_context * ctx, std::vector<llama_lora_adapter_container> & lora_adapters);
+void common_lora_adapters_apply(struct llama_context * ctx, std::vector<common_lora_adapter_container> & lora_adapters);

 // Batch utils

-void llama_batch_clear(struct llama_batch & batch);
+void common_batch_clear(struct llama_batch & batch);

-void llama_batch_add(
+void common_batch_add(
         struct llama_batch & batch,
         llama_token id,
         llama_pos pos,
@@ -458,13 +475,13 @@ void llama_batch_add(

 // tokenizes a string into a vector of tokens
 // should work similar to Python's `tokenizer.encode`
-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
         const struct llama_context * ctx,
         const std::string & text,
         bool add_special,
         bool parse_special = false);

-std::vector<llama_token> llama_tokenize(
+std::vector<llama_token> common_tokenize(
         const struct llama_model * model,
         const std::string & text,
         bool add_special,
@@ -472,7 +489,7 @@ std::vector<llama_token> llama_tokenize(

 // tokenizes a token into a piece, optionally renders special/control tokens
 // should work similar to Python's `tokenizer.id_to_piece`
-std::string llama_token_to_piece(
+std::string common_token_to_piece(
         const struct llama_context * ctx,
         llama_token token,
         bool special = true);
@@ -480,7 +497,7 @@ std::string llama_token_to_piece(
 // detokenizes a vector of tokens into a string
 // should work similar to Python's `tokenizer.decode`
 // optionally renders special/control tokens
-std::string llama_detokenize(
+std::string common_detokenize(
         llama_context * ctx,
         const std::vector<llama_token> & tokens,
         bool special = true);
@@ -490,31 +507,31 @@ std::string llama_detokenize(
 //

 // same with llama_chat_message, but uses std::string
-struct llama_chat_msg {
+struct common_chat_msg {
     std::string role;
     std::string content;
 };

 // Check if the template supplied via "--chat-template" is supported or not. Returns true if it's valid
-bool llama_chat_verify_template(const std::string & tmpl);
+bool common_chat_verify_template(const std::string & tmpl);

 // CPP wrapper for llama_chat_apply_template
 // If the built-in template is not supported, we default to chatml
 // If the custom "tmpl" is not supported, we throw an error
-std::string llama_chat_apply_template(const struct llama_model * model,
+std::string common_chat_apply_template(const struct llama_model * model,
         const std::string & tmpl,
-        const std::vector<llama_chat_msg> & chat,
+        const std::vector<common_chat_msg> & chat,
         bool add_ass);

 // Format single message, while taking into account the position of that message in chat history
-std::string llama_chat_format_single(const struct llama_model * model,
+std::string common_chat_format_single(const struct llama_model * model,
         const std::string & tmpl,
-        const std::vector<llama_chat_msg> & past_msg,
-        const llama_chat_msg & new_msg,
+        const std::vector<common_chat_msg> & past_msg,
+        const common_chat_msg & new_msg,
         bool add_ass);

 // Returns an example of formatted chat
-std::string llama_chat_format_example(const struct llama_model * model,
+std::string common_chat_format_example(const struct llama_model * model,
         const std::string & tmpl);

 //
@@ -522,31 +539,31 @@ std::string llama_chat_format_example(const struct llama_model * model,
 //

 // Dump the KV cache view with the number of sequences per cell.
-void llama_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);
+void common_kv_cache_dump_view(const llama_kv_cache_view & view, int row_size = 80);

 // Dump the KV cache view showing individual sequences in each cell (long output).
-void llama_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);
+void common_kv_cache_dump_view_seqs(const llama_kv_cache_view & view, int row_size = 40);

 //
 // Embedding utils
 //

-void llama_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);
+void common_embd_normalize(const float * inp, float * out, int n, int embd_norm = 2);

-float llama_embd_similarity_cos(const float * embd1, const float * embd2, int n);
+float common_embd_similarity_cos(const float * embd1, const float * embd2, int n);

 //
 // Control vector utils
 //

-struct llama_control_vector_data {
+struct common_control_vector_data {
     int n_embd;

     // stores data for layers [1, n_layer] where n_layer = data.size() / n_embd
     std::vector<float> data;
 };

-struct llama_control_vector_load_info {
+struct common_control_vector_load_info {
     float strength;

     std::string fname;
@@ -554,7 +571,7 @@ struct llama_control_vector_load_info {

 // Load control vectors, scale each by strength, and add them together.
 // On error, returns {-1, empty}
-llama_control_vector_data llama_control_vector_load(const std::vector<llama_control_vector_load_info> & load_infos);
+common_control_vector_data common_control_vector_load(const std::vector<common_control_vector_load_info> & load_infos);

 //
 // Split utils
@@ -573,5 +590,5 @@ void yaml_dump_vector_int    (FILE * stream, const char * prop_name, const std
 void yaml_dump_string_multiline(FILE * stream, const char * prop_name, const char * data);

 void yaml_dump_non_result_info(
-    FILE * stream, const gpt_params & params, const llama_context * lctx,
+    FILE * stream, const common_params & params, const llama_context * lctx,
     const std::string & timestamp, const std::vector<int> & prompt_tokens, const char * model_desc);
package/cpp/ggml-alloc.c
CHANGED
@@ -14,7 +14,7 @@

 //#define LM_GGML_ALLOCATOR_DEBUG

-//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
+//#define AT_PRINTF(...) LM_GGML_LOG_DEBUG(__VA_ARGS__)
 #define AT_PRINTF(...)


@@ -89,7 +89,7 @@ void lm_ggml_tallocr_alloc(struct lm_ggml_tallocr * talloc, struct lm_ggml_tenso
     size = LM_GGML_PAD(size, talloc->alignment);

     if (talloc->offset + size > lm_ggml_backend_buffer_get_size(talloc->buffer)) {
-        fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+        LM_GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
                 __func__, tensor->name, size, lm_ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
         LM_GGML_ABORT("not enough space in the buffer");
     }
@@ -172,7 +172,7 @@ static size_t lm_ggml_dyn_tallocr_alloc(struct lm_ggml_dyn_tallocr * alloc, size
             best_fit_block = alloc->n_free_blocks - 1;
         } else {
             // this should never happen
-            fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+            LM_GGML_LOG_ERROR("%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
                     __func__, size, max_avail);
             LM_GGML_ABORT("not enough space in the buffer");
         }
@@ -209,16 +209,16 @@ static size_t lm_ggml_dyn_tallocr_alloc(struct lm_ggml_dyn_tallocr * alloc, size
                 }
             }
         }
-        fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+        LM_GGML_LOG_DEBUG("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
         for (int i = 0; i < 1024; i++) {
             if (alloc->allocated_tensors[i].tensor) {
-                fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+                LM_GGML_LOG_DEBUG("%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
                         alloc->allocated_tensors[i].offset,
                         alloc->allocated_tensors[i].offset + lm_ggml_nbytes(alloc->allocated_tensors[i].tensor),
                         lm_ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
             }
         }
-        fprintf(stderr, "\n");
+        LM_GGML_LOG_DEBUG("\n");
     }
 #endif

@@ -348,7 +348,6 @@ struct tensor_alloc {
 };

 struct leaf_alloc {
-    int buffer_id;
     struct tensor_alloc leaf;
 };

@@ -740,7 +739,6 @@ bool lm_ggml_gallocr_reserve_n(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph *
     for (int i = 0; i < graph->n_leafs; i++) {
         struct lm_ggml_tensor * leaf = graph->leafs[i];
         struct hash_node * hn = lm_ggml_gallocr_hash_get(galloc, leaf);
-        galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
         if (leaf->view_src || leaf->data) {
             galloc->leaf_allocs[i].leaf.buffer_id = -1;
             galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
@@ -768,13 +766,13 @@ bool lm_ggml_gallocr_reserve_n(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph *
         // even if there are no tensors allocated in this buffer, we still need to allocate it to initialize views
         if (new_size > cur_size || galloc->buffers[i] == NULL) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+            LM_GGML_LOG_DEBUG("%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
 #endif

             lm_ggml_backend_buffer_free(galloc->buffers[i]);
             galloc->buffers[i] = lm_ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
             if (galloc->buffers[i] == NULL) {
-                fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), new_size);
+                LM_GGML_LOG_ERROR("%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(galloc->bufts[i]), new_size);
                 return false;
             }
             lm_ggml_backend_buffer_set_usage(galloc->buffers[i], LM_GGML_BACKEND_BUFFER_USAGE_COMPUTE);
@@ -825,14 +823,14 @@ static bool lm_ggml_gallocr_node_needs_realloc(lm_ggml_gallocr_t galloc, struct
 static bool lm_ggml_gallocr_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph * graph) {
     if (galloc->n_nodes != graph->n_nodes) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
+        LM_GGML_LOG_DEBUG("%s: graph has different number of nodes\n", __func__);
 #endif
         return true;
     }

     if (galloc->n_leafs != graph->n_leafs) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+        LM_GGML_LOG_DEBUG("%s: graph has different number of leafs\n", __func__);
 #endif
         return true;
     }
@@ -843,7 +841,7 @@ static bool lm_ggml_gallocr_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_gg

         if (!lm_ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
+            LM_GGML_LOG_DEBUG("%s: node %s is not valid\n", __func__, node->name);
 #endif
             return true;
         }
@@ -855,7 +853,7 @@ static bool lm_ggml_gallocr_needs_realloc(lm_ggml_gallocr_t galloc, struct lm_gg
             }
             if (!lm_ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
 #ifndef NDEBUG
-                fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
+                LM_GGML_LOG_DEBUG("%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
 #endif
                 return true;
             }
@@ -869,14 +867,14 @@ bool lm_ggml_gallocr_alloc_graph(lm_ggml_gallocr_t galloc, struct lm_ggml_cgraph
     if (lm_ggml_gallocr_needs_realloc(galloc, graph)) {
         if (galloc->n_buffers == 1) {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
+            LM_GGML_LOG_DEBUG("%s: reallocating buffers automatically\n", __func__);
 #endif
             if (!lm_ggml_gallocr_reserve(galloc, graph)) {
                 return false;
             }
         } else {
 #ifndef NDEBUG
-            fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
+            LM_GGML_LOG_DEBUG("%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
 #endif
             return false;
         }
@@ -940,7 +938,7 @@ static bool alloc_tensor_range(struct lm_ggml_context * ctx,
     lm_ggml_backend_buffer_t buffer = lm_ggml_backend_buft_alloc_buffer(buft, size);
     if (buffer == NULL) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(buft), size);
+        LM_GGML_LOG_DEBUG("%s: failed to allocate %s buffer of size %zu\n", __func__, lm_ggml_backend_buft_name(buft), size);
 #endif
         for (size_t i = 0; i < *n_buffers; i++) {
             lm_ggml_backend_buffer_free((*buffers)[i]);
@@ -990,7 +988,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_g
     }

     if (this_size > max_size) {
-        fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+        LM_GGML_LOG_ERROR("%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
                __func__, t->name,
                lm_ggml_backend_buft_name(buft),
                this_size, max_size);
@@ -1022,7 +1020,7 @@ lm_ggml_backend_buffer_t lm_ggml_backend_alloc_ctx_tensors_from_buft(struct lm_g

     if (n_buffers == 0) {
 #ifndef NDEBUG
-        fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+        LM_GGML_LOG_DEBUG("%s: all tensors in the context are already allocated\n", __func__);
 #endif
         return NULL;
     }
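
The ggml-alloc.c changes belong to the release-wide logging refactor visible in the summary above (`ggml-impl.h +8`, `ggml.h +1`, `log.cpp`, `log.h`): direct `fprintf(stderr, ...)` calls become `LM_GGML_LOG_DEBUG`/`LM_GGML_LOG_ERROR` macros, so allocator diagnostics flow through the ggml logging system instead of being hardwired to stderr. That matters for a React Native wrapper, where stderr output is effectively invisible. A sketch of how a host application might capture these logs — `lm_ggml_log_set` and the callback signature are assumptions based on the upstream ggml API, not shown in this diff:

```cpp
#include "ggml.h"

#include <cstdio>

// Assumed callback shape: (level, message text, user_data).
static void on_ggml_log(enum lm_ggml_log_level level, const char * text, void * user_data) {
    (void) user_data;
    if (level == LM_GGML_LOG_LEVEL_ERROR) {
        // Forward to the app's own logging facility instead of raw stderr.
        std::fprintf(stderr, "[ggml] %s", text);
    }
}

int main() {
    // Route ggml logs (including the allocator messages above, e.g.
    // "not enough space in the buffer") through the callback.
    lm_ggml_log_set(on_ggml_log, /*user_data=*/nullptr);
    return 0;
}
```

The `leaf_alloc` change is unrelated to logging: the per-leaf `buffer_id` field appears redundant with the `buffer_id` already stored in the nested `tensor_alloc`, and is simply dropped.
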