@fugood/llama.node 0.0.1-alpha.3 → 0.1.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/CMakeLists.txt +36 -7
- package/README.md +9 -0
- package/bin/darwin/arm64/default.metallib +0 -0
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/default.metallib +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/lib/binding.js +18 -1
- package/lib/binding.ts +22 -2
- package/lib/index.ts +2 -2
- package/package.json +15 -3
- package/src/LlamaCompletionWorker.cpp +5 -1
- package/src/LlamaCompletionWorker.h +4 -0
- package/src/LlamaContext.cpp +18 -1
- package/src/common.hpp +11 -7
- package/src/llama.cpp/CMakeLists.txt +13 -7
- package/src/llama.cpp/common/common.cpp +221 -173
- package/src/llama.cpp/common/common.h +19 -8
- package/src/llama.cpp/common/json-schema-to-grammar.h +4 -0
- package/src/llama.cpp/common/log.h +2 -2
- package/src/llama.cpp/common/sampling.cpp +17 -1
- package/src/llama.cpp/common/sampling.h +28 -20
- package/src/llama.cpp/examples/batched-bench/batched-bench.cpp +17 -11
- package/src/llama.cpp/examples/eval-callback/eval-callback.cpp +5 -5
- package/src/llama.cpp/examples/finetune/finetune.cpp +1 -1
- package/src/llama.cpp/examples/gguf-split/gguf-split.cpp +15 -4
- package/src/llama.cpp/examples/imatrix/imatrix.cpp +72 -39
- package/src/llama.cpp/examples/llama-bench/llama-bench.cpp +42 -3
- package/src/llama.cpp/examples/llava/clip.cpp +74 -23
- package/src/llama.cpp/examples/llava/llava-cli.cpp +37 -28
- package/src/llama.cpp/examples/lookup/lookup-stats.cpp +0 -1
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -1
- package/src/llama.cpp/examples/main/main.cpp +10 -8
- package/src/llama.cpp/examples/perplexity/perplexity.cpp +175 -55
- package/src/llama.cpp/examples/quantize/CMakeLists.txt +1 -1
- package/src/llama.cpp/examples/quantize/quantize.cpp +74 -47
- package/src/llama.cpp/examples/quantize-stats/quantize-stats.cpp +1 -1
- package/src/llama.cpp/examples/server/server.cpp +97 -86
- package/src/llama.cpp/examples/server/utils.hpp +17 -15
- package/src/llama.cpp/ggml-backend.c +7 -5
- package/src/llama.cpp/ggml-impl.h +339 -4
- package/src/llama.cpp/ggml-kompute.cpp +7 -0
- package/src/llama.cpp/ggml-opencl.cpp +1 -0
- package/src/llama.cpp/ggml-quants.c +302 -293
- package/src/llama.cpp/ggml-sycl.cpp +28 -16
- package/src/llama.cpp/ggml-vulkan-shaders.hpp +46843 -39205
- package/src/llama.cpp/ggml-vulkan.cpp +951 -263
- package/src/llama.cpp/ggml.c +1469 -116
- package/src/llama.cpp/ggml.h +37 -7
- package/src/llama.cpp/llama.cpp +969 -432
- package/src/llama.cpp/llama.h +46 -14
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf-update.txt +2 -0
- package/src/llama.cpp/requirements/requirements-convert-hf-to-gguf.txt +0 -1
- package/src/llama.cpp/requirements/requirements-convert.txt +2 -2
- package/src/llama.cpp/requirements.txt +1 -0
- package/src/llama.cpp/sgemm.cpp +134 -103
- package/src/llama.cpp/sgemm.h +4 -2
- package/src/llama.cpp/tests/CMakeLists.txt +96 -36
- package/src/llama.cpp/tests/test-backend-ops.cpp +56 -6
- package/src/llama.cpp/tests/test-chat-template.cpp +4 -0
- package/src/llama.cpp/tests/test-grammar-integration.cpp +225 -136
- package/src/llama.cpp/tests/test-json-schema-to-grammar.cpp +1 -0
- package/src/llama.cpp/tests/test-tokenizer-0.cpp +292 -0
- package/src/llama.cpp/tests/{test-tokenizer-1-llama.cpp → test-tokenizer-1-spm.cpp} +1 -1
- package/src/llama.cpp/unicode-data.cpp +1188 -656
- package/src/llama.cpp/unicode-data.h +4 -3
- package/src/llama.cpp/unicode.cpp +590 -49
- package/src/llama.cpp/unicode.h +6 -3
- package/src/llama.cpp/tests/test-tokenizer-0-falcon.cpp +0 -187
- package/src/llama.cpp/tests/test-tokenizer-0-llama.cpp +0 -190

package/src/llama.cpp/examples/llama-bench/llama-bench.cpp

@@ -174,9 +174,11 @@ struct cmd_params {
     std::vector<llama_split_mode> split_mode;
     std::vector<int> main_gpu;
     std::vector<bool> no_kv_offload;
+    std::vector<bool> flash_attn;
     std::vector<std::vector<float>> tensor_split;
     std::vector<bool> use_mmap;
     std::vector<bool> embeddings;
+    ggml_numa_strategy numa;
     int reps;
     bool verbose;
     output_formats output_format;
@@ -195,9 +197,11 @@ static const cmd_params cmd_params_defaults = {
     /* split_mode */ {LLAMA_SPLIT_MODE_LAYER},
     /* main_gpu */ {0},
     /* no_kv_offload */ {false},
+    /* flash_attn */ {false},
     /* tensor_split */ {std::vector<float>(llama_max_devices(), 0.0f)},
     /* use_mmap */ {true},
     /* embeddings */ {false},
+    /* numa */ GGML_NUMA_STRATEGY_DISABLED,
     /* reps */ 5,
     /* verbose */ false,
     /* output_format */ MARKDOWN
@@ -220,7 +224,9 @@ static void print_usage(int /* argc */, char ** argv) {
     printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
     printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
+    printf(" -fa, --flash-attn <0|1> (default: %s)\n", join(cmd_params_defaults.flash_attn, ",").c_str());
     printf(" -mmp, --mmap <0|1> (default: %s)\n", join(cmd_params_defaults.use_mmap, ",").c_str());
+    printf(" --numa <distribute|isolate|numactl> (default: disabled)\n");
     printf(" -embd, --embeddings <0|1> (default: %s)\n", join(cmd_params_defaults.embeddings, ",").c_str());
     printf(" -ts, --tensor-split <ts0/ts1/..> (default: 0)\n");
     printf(" -r, --repetitions <n> (default: %d)\n", cmd_params_defaults.reps);
@@ -393,6 +399,24 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
             }
             auto p = split<bool>(argv[i], split_delim);
             params.no_kv_offload.insert(params.no_kv_offload.end(), p.begin(), p.end());
+        } else if (arg == "--numa") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            } else {
+                std::string value(argv[i]);
+                /**/ if (value == "distribute" || value == "" ) { params.numa = GGML_NUMA_STRATEGY_DISTRIBUTE; }
+                else if (value == "isolate") { params.numa = GGML_NUMA_STRATEGY_ISOLATE; }
+                else if (value == "numactl") { params.numa = GGML_NUMA_STRATEGY_NUMACTL; }
+                else { invalid_param = true; break; }
+            }
+        } else if (arg == "-fa" || arg == "--flash-attn") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto p = split<bool>(argv[i], split_delim);
+            params.flash_attn.insert(params.flash_attn.end(), p.begin(), p.end());
         } else if (arg == "-mmp" || arg == "--mmap") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -477,6 +501,7 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.split_mode.empty()) { params.split_mode = cmd_params_defaults.split_mode; }
     if (params.main_gpu.empty()) { params.main_gpu = cmd_params_defaults.main_gpu; }
     if (params.no_kv_offload.empty()){ params.no_kv_offload = cmd_params_defaults.no_kv_offload; }
+    if (params.flash_attn.empty()) { params.flash_attn = cmd_params_defaults.flash_attn; }
     if (params.tensor_split.empty()) { params.tensor_split = cmd_params_defaults.tensor_split; }
     if (params.use_mmap.empty()) { params.use_mmap = cmd_params_defaults.use_mmap; }
     if (params.embeddings.empty()) { params.embeddings = cmd_params_defaults.embeddings; }
@@ -498,6 +523,7 @@ struct cmd_params_instance {
     llama_split_mode split_mode;
     int main_gpu;
     bool no_kv_offload;
+    bool flash_attn;
     std::vector<float> tensor_split;
     bool use_mmap;
     bool embeddings;
@@ -532,6 +558,7 @@ struct cmd_params_instance {
         cparams.type_k = type_k;
         cparams.type_v = type_v;
         cparams.offload_kqv = !no_kv_offload;
+        cparams.flash_attn = flash_attn;
         cparams.embeddings = embeddings;

         return cparams;
@@ -554,6 +581,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & tk : params.type_k)
     for (const auto & tv : params.type_v)
     for (const auto & nkvo : params.no_kv_offload)
+    for (const auto & fa : params.flash_attn)
     for (const auto & nt : params.n_threads) {
         for (const auto & n_prompt : params.n_prompt) {
             if (n_prompt == 0) {
@@ -572,6 +600,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .split_mode = */ sm,
                 /* .main_gpu = */ mg,
                 /* .no_kv_offload= */ nkvo,
+                /* .flash_attn = */ fa,
                 /* .tensor_split = */ ts,
                 /* .use_mmap = */ mmp,
                 /* .embeddings = */ embd,
@@ -596,6 +625,7 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
                 /* .split_mode = */ sm,
                 /* .main_gpu = */ mg,
                 /* .no_kv_offload= */ nkvo,
+                /* .flash_attn = */ fa,
                 /* .tensor_split = */ ts,
                 /* .use_mmap = */ mmp,
                 /* .embeddings = */ embd,
@@ -633,6 +663,7 @@ struct test {
     llama_split_mode split_mode;
     int main_gpu;
     bool no_kv_offload;
+    bool flash_attn;
     std::vector<float> tensor_split;
     bool use_mmap;
     bool embeddings;
@@ -657,6 +688,7 @@ struct test {
        split_mode = inst.split_mode;
        main_gpu = inst.main_gpu;
        no_kv_offload = inst.no_kv_offload;
+       flash_attn = inst.flash_attn;
        tensor_split = inst.tensor_split;
        use_mmap = inst.use_mmap;
        embeddings = inst.embeddings;
@@ -731,7 +763,7 @@ struct test {
         "n_batch", "n_ubatch",
         "n_threads", "type_k", "type_v",
         "n_gpu_layers", "split_mode",
-        "main_gpu", "no_kv_offload",
+        "main_gpu", "no_kv_offload", "flash_attn",
         "tensor_split", "use_mmap", "embeddings",
         "n_prompt", "n_gen", "test_time",
         "avg_ns", "stddev_ns",
@@ -753,7 +785,7 @@ struct test {
         }
         if (field == "cuda" || field == "opencl" || field == "vulkan" || field == "kompute" || field == "metal" ||
             field == "gpu_blas" || field == "blas" || field == "sycl" ||field == "f16_kv" || field == "no_kv_offload" ||
-            field == "use_mmap" || field == "embeddings") {
+            field == "flash_attn" || field == "use_mmap" || field == "embeddings") {
             return BOOL;
         }
         if (field == "avg_ts" || field == "stddev_ts") {
@@ -787,7 +819,7 @@ struct test {
             std::to_string(n_batch), std::to_string(n_ubatch),
             std::to_string(n_threads), ggml_type_name(type_k), ggml_type_name(type_v),
             std::to_string(n_gpu_layers), split_mode_str(split_mode),
-            std::to_string(main_gpu), std::to_string(no_kv_offload),
+            std::to_string(main_gpu), std::to_string(no_kv_offload), std::to_string(flash_attn),
             tensor_split_str, std::to_string(use_mmap), std::to_string(embeddings),
             std::to_string(n_prompt), std::to_string(n_gen), test_time,
             std::to_string(avg_ns()), std::to_string(stdev_ns()),
@@ -955,6 +987,9 @@ struct markdown_printer : public printer {
         if (field == "no_kv_offload") {
             return "nkvo";
         }
+        if (field == "flash_attn") {
+            return "fa";
+        }
         if (field == "use_mmap") {
             return "mmap";
         }
@@ -1001,6 +1036,9 @@ struct markdown_printer : public printer {
         if (params.no_kv_offload.size() > 1 || params.no_kv_offload != cmd_params_defaults.no_kv_offload) {
             fields.emplace_back("no_kv_offload");
         }
+        if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
+            fields.emplace_back("flash_attn");
+        }
         if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
             fields.emplace_back("tensor_split");
         }
@@ -1191,6 +1229,7 @@ int main(int argc, char ** argv) {
         llama_log_set(llama_null_log_callback, NULL);
     }
     llama_backend_init();
+    llama_numa_init(params.numa);

     // initialize printer
     std::unique_ptr<printer> p;
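
The llama-bench changes above are the user-visible side of two new knobs in the bundled llama.cpp: a flash_attn flag on llama_context_params and an explicit NUMA strategy passed to llama_numa_init. Below is a minimal sketch of how a consumer of the vendored C API might exercise both; the model path is illustrative and the snippet assumes the llama.h shipped in this package version.

    #include "llama.h"

    int main() {
        llama_backend_init();
        // new in this revision: pick a NUMA strategy up front (mirrors llama-bench's --numa flag)
        llama_numa_init(GGML_NUMA_STRATEGY_DISTRIBUTE);

        llama_model_params mparams = llama_model_default_params();
        llama_model * model = llama_load_model_from_file("model.gguf", mparams); // illustrative path
        if (model == NULL) {
            llama_backend_free();
            return 1;
        }

        llama_context_params cparams = llama_context_default_params();
        cparams.flash_attn = true; // the field llama-bench now toggles with -fa
        llama_context * ctx = llama_new_context_with_model(model, cparams);

        // ... tokenize, llama_decode() and sample as usual ...

        llama_free(ctx);
        llama_free_model(model);
        llama_backend_free();
        return 0;
    }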

package/src/llama.cpp/examples/llava/clip.cpp

@@ -104,6 +104,7 @@ static std::string format(const char * fmt, ...) {
 #define TN_POS_EMBD "%s.position_embd.weight"
 #define TN_CLASS_EMBD "v.class_embd"
 #define TN_PATCH_EMBD "v.patch_embd.weight"
+#define TN_PATCH_BIAS "v.patch_embd.bias"
 #define TN_ATTN_K "%s.blk.%d.attn_k.%s"
 #define TN_ATTN_Q "%s.blk.%d.attn_q.%s"
 #define TN_ATTN_V "%s.blk.%d.attn_v.%s"
@@ -425,6 +426,7 @@ struct clip_vision_model {
     // embeddings
     struct ggml_tensor * class_embedding;
     struct ggml_tensor * patch_embeddings;
+    struct ggml_tensor * patch_bias;
     struct ggml_tensor * position_embeddings;

     struct ggml_tensor * pre_ln_w;
@@ -501,6 +503,11 @@ struct clip_ctx {
     bool use_gelu = false;
     int32_t ftype = 1;

+    bool has_class_embedding = true;
+    bool has_pre_norm = true;
+    bool has_post_norm = false;
+    bool has_patch_bias = false;
+
     struct gguf_context * ctx_gguf;
     struct ggml_context * ctx_data;

@@ -526,7 +533,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
     const int num_patches_per_side = image_size / patch_size; GGML_UNUSED(num_patches_per_side);
-    const int num_positions = num_patches + 1;
+    const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);
     const int hidden_size = hparams.hidden_size;
     const int n_head = hparams.n_head;
     const int d_head = hidden_size / n_head;
@@ -557,16 +564,23 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     inp = ggml_reshape_3d(ctx0, inp, num_patches, hidden_size, batch_size);
     inp = ggml_cont(ctx0, ggml_permute(ctx0, inp, 1, 0, 2, 3));

-
-
-
-
+    if (ctx->has_patch_bias) {
+        // inp = ggml_add(ctx0, inp, ggml_repeat(ctx0, model.patch_bias, inp));
+        inp = ggml_add(ctx0, inp, model.patch_bias);
+    }

-
-
+    // concat class_embeddings and patch_embeddings
+    struct ggml_tensor * embeddings = inp;
+    if (ctx->has_class_embedding) {
+        embeddings = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, hidden_size, num_positions, batch_size);
+        ggml_set_name(embeddings, "embeddings");
+        ggml_set_input(embeddings);
+        embeddings = ggml_acc(ctx0, embeddings, model.class_embedding,
+            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], 0);
+        embeddings = ggml_acc(ctx0, embeddings, inp,
+            embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);
+    }

-    embeddings = ggml_acc(ctx0, embeddings, inp,
-        embeddings->nb[1], embeddings->nb[2], embeddings->nb[3], model.class_embedding->nb[1]);

     struct ggml_tensor * positions = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, num_positions);
     ggml_set_name(positions, "positions");
@@ -576,7 +590,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
     ggml_add(ctx0, embeddings, ggml_get_rows(ctx0, model.position_embeddings, positions));

     // pre-layernorm
-    {
+    if (ctx->has_pre_norm) {
         embeddings = ggml_norm(ctx0, embeddings, eps);
         ggml_set_name(embeddings, "pre_ln");

@@ -664,6 +678,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
         embeddings = cur;
     }

+    // post-layernorm
+    if (ctx->has_post_norm) {
+        embeddings = ggml_norm(ctx0, embeddings, eps);
+        ggml_set_name(embeddings, "post_ln");
+
+        embeddings = ggml_add(ctx0, ggml_mul(ctx0, embeddings, model.post_ln_w), model.post_ln_b);
+    }
+
     // llava projector
     {
         embeddings = ggml_reshape_2d(ctx0, embeddings, embeddings->ne[0], embeddings->ne[1]);
@@ -1148,12 +1170,39 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {

     }

+    try {
+        vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
+        new_clip->has_class_embedding = true;
+    } catch (const std::exception& e) {
+        new_clip->has_class_embedding = false;
+    }
+
+    try {
+        vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
+        vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
+        new_clip->has_pre_norm = true;
+    } catch (std::exception & e) {
+        new_clip->has_pre_norm = false;
+    }
+
+    try {
+        vision_model.post_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "weight"));
+        vision_model.post_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_POST, "v", "bias"));
+        new_clip->has_post_norm = true;
+    } catch (std::exception & e) {
+        new_clip->has_post_norm = false;
+    }
+
+    try {
+        vision_model.patch_bias = get_tensor(new_clip->ctx_data, TN_PATCH_BIAS);
+        new_clip->has_patch_bias = true;
+    } catch (std::exception & e) {
+        new_clip->has_patch_bias = false;
+    }
+
     try {
         vision_model.patch_embeddings = get_tensor(new_clip->ctx_data, TN_PATCH_EMBD);
-        vision_model.class_embedding = get_tensor(new_clip->ctx_data, TN_CLASS_EMBD);
         vision_model.position_embeddings = get_tensor(new_clip->ctx_data, format(TN_POS_EMBD, "v"));
-        vision_model.pre_ln_w = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "weight"));
-        vision_model.pre_ln_b = get_tensor(new_clip->ctx_data, format(TN_LN_PRE, "v", "bias"));
     } catch(const std::exception& e) {
         LOG_TEE("%s: failed to load vision model tensors\n", __func__);
     }
@@ -1325,7 +1374,7 @@ bool clip_image_load_from_bytes(const unsigned char * bytes, size_t bytes_length
 }

 // Linear interpolation between two points
-inline float
+inline float clip_lerp(float s, float e, float t) {
     return s + (e - s) * t;
 }
 // Bilinear resize function
@@ -1347,17 +1396,17 @@ static void bilinear_resize(const clip_image_u8& src, clip_image_u8& dst, int ta
             float y_lerp = py - y_floor;

             for (int c = 0; c < 3; c++) {
-                float top =
+                float top = clip_lerp(
                     static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
                     static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
                     x_lerp
                 );
-                float bottom =
+                float bottom = clip_lerp(
                     static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
                     static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
                     x_lerp
                 );
-                dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(
+                dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(clip_lerp(top, bottom, y_lerp));
             }
         }
 }
@@ -1797,7 +1846,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int image_size = hparams.image_size;
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size / patch_size) * (image_size / patch_size));
-    const int num_positions = num_patches + 1;
+    const int num_positions = num_patches + (ctx->has_class_embedding ? 1 : 0);

     {
         struct ggml_tensor * inp_raw = ggml_graph_get_tensor(gf, "inp_raw");
@@ -1825,12 +1874,14 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }

     {
-
+        if (ctx->has_class_embedding) {
+            struct ggml_tensor * embeddings = ggml_graph_get_tensor(gf, "embeddings");

-
-
-
-
+            void* zero_mem = malloc(ggml_nbytes(embeddings));
+            memset(zero_mem, 0, ggml_nbytes(embeddings));
+            ggml_backend_tensor_set(embeddings, zero_mem, 0, ggml_nbytes(embeddings));
+            free(zero_mem);
+        }
     }

     {
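
A quick sanity check of the num_positions change above: for a vision tower with, say, a 336-pixel input and 14-pixel patches (values used here purely as an illustration), num_patches is (336/14)^2 = 576, so num_positions becomes 577 when the model has a class embedding and 576 when it does not. Both clip_image_build_graph and clip_image_batch_encode now apply the same ternary, so the graph and the encode path stay consistent for models that lack v.class_embd.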

package/src/llama.cpp/examples/llava/llava-cli.cpp

@@ -113,11 +113,11 @@ struct llava_context {
 };

 static void show_additional_info(int /*argc*/, char ** argv) {
-    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
+    LOG_TEE("\n example usage: %s -m <llava-v1.5-7b/ggml-model-q5_k.gguf> --mmproj <llava-v1.5-7b/mmproj-model-f16.gguf> --image <path/to/an/image.jpg> --image <path/to/another/image.jpg> [--temp 0.1] [-p \"describe the image in detail.\"]\n", argv[0]);
     LOG_TEE(" note: a lower temperature value like 0.1 is recommended for better quality.\n");
 }

-static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params) {
+static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_params * params, const std::string & fname) {

     // load and preprocess the image
     llava_image_embed * embed = NULL;
@@ -133,9 +133,9 @@ static struct llava_image_embed * load_image(llava_context * ctx_llava, gpt_para
         }
         params->prompt = remove_image_from_prompt(prompt);
     } else {
-        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads,
+        embed = llava_image_embed_make_with_filename(ctx_llava->ctx_clip, params->n_threads, fname.c_str());
         if (!embed) {
-
+            fprintf(stderr, "%s: is %s really an image file?\n", __func__, fname.c_str());
             return NULL;
         }
     }
@@ -207,17 +207,7 @@ static void process_prompt(struct llava_context * ctx_llava, struct llava_image_
     printf("\n");
 }

-
-static struct llava_context * llava_init(gpt_params * params) {
-    const char * clip_path = params->mmproj.c_str();
-
-    auto prompt = params->prompt;
-    if (prompt.empty()) {
-        prompt = "describe the image in detail.";
-    }
-
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
-
+static struct llama_model * llava_init(gpt_params * params) {
     llama_backend_init();
     llama_numa_init(params->numa);

@@ -228,6 +218,19 @@ static struct llava_context * llava_init(gpt_params * params) {
         LOG_TEE("%s: error: unable to load model\n" , __func__);
         return NULL;
     }
+    return model;
+}
+
+static struct llava_context * llava_init_context(gpt_params * params, llama_model * model) {
+    const char * clip_path = params->mmproj.c_str();
+
+    auto prompt = params->prompt;
+    if (prompt.empty()) {
+        prompt = "describe the image in detail.";
+    }
+
+    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 1);
+

     llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
     ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
@@ -286,24 +289,30 @@ int main(int argc, char ** argv) {
         show_additional_info(argc, argv);
         return 1;
     }
-
-
-
-        LOG_TEE("%s: error: failed to init llava\n", __func__);
+    auto model = llava_init(&params);
+    if (model == NULL) {
+        fprintf(stderr, "%s: error: failed to init llava model\n", __func__);
         return 1;
     }

-    auto
-
-        return 1;
-    }
+    for (auto & image : params.image) {
+        auto ctx_llava = llava_init_context(&params, model);

-
-
+        auto image_embed = load_image(ctx_llava, &params, image);
+        if (!image_embed) {
+            std::cerr << "error: failed to load image " << image << ". Terminating\n\n";
+            return 1;
+        }
+
+        // process the prompt
+        process_prompt(ctx_llava, image_embed, &params, params.prompt);

-
+        llama_print_timings(ctx_llava->ctx_llama);
+        llava_image_embed_free(image_embed);
+        ctx_llava->model = NULL;
+        llava_free(ctx_llava);
+    }
+    llama_free_model(model);

-    llava_image_embed_free(image_embed);
-    llava_free(ctx_llava);
     return 0;
 }
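
With this split, llava_init loads the llama_model once and llava_init_context rebuilds the clip and llama contexts for each entry in params.image, so a single invocation of the example binary (built as llava-cli at this revision) can describe several images in sequence, matching the updated usage string in show_additional_info, e.g. (paths illustrative): llava-cli -m ggml-model-q5_k.gguf --mmproj mmproj-model-f16.gguf --image first.jpg --image second.jpg -p "describe the image in detail."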

package/src/llama.cpp/examples/main/main.cpp

@@ -240,7 +240,6 @@ int main(int argc, char ** argv) {
                 return 1;
             }
             session_tokens.resize(n_token_count_out);
-            llama_set_rng_seed(ctx, params.seed);
             LOG_TEE("%s: loaded a session with prompt size of %d tokens\n", __func__, (int)session_tokens.size());
         }
     }
@@ -325,7 +324,7 @@ int main(int argc, char ** argv) {
         log_tostr(embd_inp.empty()), n_matching_session_tokens, embd_inp.size(), session_tokens.size(), embd_inp.size());

     // if we will use the cache for the full prompt without reaching the end of the cache, force
-    // reevaluation of the last token
+    // reevaluation of the last token to recalculate the cached logits
     if (!embd_inp.empty() && n_matching_session_tokens == embd_inp.size() && session_tokens.size() > embd_inp.size()) {
         LOGLN("recalculate the cached logits (do): session_tokens.resize( %zu )", embd_inp.size() - 1);

@@ -363,6 +362,9 @@ int main(int argc, char ** argv) {
         params.interactive_first = true;
         params.antiprompt.emplace_back("<|im_start|>user\n");
     }
+    else if (params.conversation) {
+        params.interactive_first = true;
+    }

     // enable interactive mode if interactive start is specified
     if (params.interactive_first) {
@@ -545,7 +547,7 @@ int main(int argc, char ** argv) {
            // if we run out of context:
            // - take the n_keep first tokens from the original prompt (via n_past)
            // - take half of the last (n_ctx - n_keep) tokens and recompute the logits in batches
-           if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset)
+           if (n_past + (int) embd.size() + std::max<int>(0, guidance_offset) >= n_ctx) {
               if (params.n_predict == -2) {
                   LOG_TEE("\n\n%s: context full and n_predict == -%d => stopping\n", __func__, params.n_predict);
                   break;
@@ -734,7 +736,7 @@ int main(int argc, char ** argv) {
        // display text
        if (input_echo && display) {
            for (auto id : embd) {
-               const std::string token_str = llama_token_to_piece(ctx, id);
+               const std::string token_str = llama_token_to_piece(ctx, id, !params.conversation);
                printf("%s", token_str.c_str());

                if (embd.size() > 1) {
@@ -797,7 +799,7 @@ int main(int argc, char ** argv) {

        // deal with end of generation tokens in interactive mode
        if (llama_token_is_eog(model, llama_sampling_last(ctx_sampling))) {
-           LOG("found
+           LOG("found an EOG token\n");

            if (params.interactive) {
                if (!params.antiprompt.empty()) {
@@ -817,7 +819,7 @@ int main(int argc, char ** argv) {
        if (n_past > 0 && is_interacting) {
            LOG("waiting for user input\n");

-           if (params.instruct || params.chatml) {
+           if (params.conversation || params.instruct || params.chatml) {
               printf("\n> ");
           }

@@ -827,7 +829,7 @@ int main(int argc, char ** argv) {
           }

           std::string buffer;
-          if (!params.input_prefix.empty()) {
+          if (!params.input_prefix.empty() && !params.conversation) {
              LOG("appending input prefix: '%s'\n", params.input_prefix.c_str());
              printf("%s", params.input_prefix.c_str());
          }
@@ -851,7 +853,7 @@ int main(int argc, char ** argv) {
           // Entering a empty line lets the user pass control back
           if (buffer.length() > 1) {
               // append input suffix if any
-              if (!params.input_suffix.empty()) {
+              if (!params.input_suffix.empty() && !params.conversation) {
                  LOG("appending input suffix: '%s'\n", params.input_suffix.c_str());
                  printf("%s", params.input_suffix.c_str());
              }