@fugood/llama.node 0.4.6 → 0.5.0
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/darwin/arm64/llama-node.node +0 -0
- package/bin/darwin/x64/llama-node.node +0 -0
- package/bin/linux/arm64/llama-node.node +0 -0
- package/bin/linux/x64/llama-node.node +0 -0
- package/bin/linux-cuda/arm64/llama-node.node +0 -0
- package/bin/linux-cuda/x64/llama-node.node +0 -0
- package/bin/linux-vulkan/arm64/llama-node.node +0 -0
- package/bin/linux-vulkan/x64/llama-node.node +0 -0
- package/bin/win32/arm64/llama-node.node +0 -0
- package/bin/win32/arm64/node.lib +0 -0
- package/bin/win32/x64/llama-node.node +0 -0
- package/bin/win32/x64/node.lib +0 -0
- package/bin/win32-vulkan/arm64/llama-node.node +0 -0
- package/bin/win32-vulkan/arm64/node.lib +0 -0
- package/bin/win32-vulkan/x64/llama-node.node +0 -0
- package/bin/win32-vulkan/x64/node.lib +0 -0
- package/lib/binding.ts +22 -4
- package/lib/index.js +42 -18
- package/lib/index.ts +57 -23
- package/package.json +1 -1
- package/src/LlamaCompletionWorker.cpp +22 -381
- package/src/LlamaCompletionWorker.h +2 -4
- package/src/LlamaContext.cpp +40 -100
- package/src/LlamaContext.h +1 -0
- package/src/TokenizeWorker.cpp +33 -4
- package/src/TokenizeWorker.h +2 -5
- package/src/common.hpp +389 -0
- package/src/llama.cpp/.github/workflows/build.yml +2 -2
- package/src/llama.cpp/.github/workflows/release.yml +152 -129
- package/src/llama.cpp/.github/workflows/winget.yml +42 -0
- package/src/llama.cpp/common/arg.cpp +14 -13
- package/src/llama.cpp/common/common.cpp +4 -75
- package/src/llama.cpp/common/common.h +7 -12
- package/src/llama.cpp/examples/lookahead/lookahead.cpp +0 -13
- package/src/llama.cpp/examples/lookup/lookup.cpp +0 -11
- package/src/llama.cpp/examples/parallel/parallel.cpp +0 -9
- package/src/llama.cpp/examples/retrieval/retrieval.cpp +6 -6
- package/src/llama.cpp/examples/simple/simple.cpp +1 -1
- package/src/llama.cpp/examples/simple-chat/simple-chat.cpp +2 -2
- package/src/llama.cpp/examples/sycl/run-llama2.sh +4 -4
- package/src/llama.cpp/examples/sycl/run-llama3.sh +28 -0
- package/src/llama.cpp/examples/sycl/win-run-llama2.bat +1 -1
- package/src/llama.cpp/examples/sycl/win-run-llama3.bat +9 -0
- package/src/llama.cpp/ggml/include/ggml-opt.h +2 -0
- package/src/llama.cpp/ggml/include/ggml.h +11 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.cpp +274 -0
- package/src/llama.cpp/ggml/src/ggml-cann/aclnn_ops.h +27 -0
- package/src/llama.cpp/ggml/src/ggml-cann/ggml-cann.cpp +18 -2
- package/src/llama.cpp/ggml/src/ggml-cpu/ggml-cpu.c +1 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/ops.cpp +107 -0
- package/src/llama.cpp/ggml/src/ggml-cpu/vec.h +16 -0
- package/src/llama.cpp/ggml/src/ggml-musa/CMakeLists.txt +8 -2
- package/src/llama.cpp/ggml/src/ggml-opencl/ggml-opencl.cpp +315 -155
- package/src/llama.cpp/ggml/src/ggml-opt.cpp +5 -0
- package/src/llama.cpp/ggml/src/ggml-sycl/ggml-sycl.cpp +43 -12
- package/src/llama.cpp/ggml/src/ggml-vulkan/ggml-vulkan.cpp +171 -112
- package/src/llama.cpp/ggml/src/ggml.c +64 -18
- package/src/llama.cpp/include/llama.h +24 -124
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_hf_to_gguf_update.txt +5 -1
- package/src/llama.cpp/requirements/requirements-convert_lora_to_gguf.txt +2 -0
- package/src/llama.cpp/src/llama-batch.cpp +3 -1
- package/src/llama.cpp/src/llama-context.cpp +60 -110
- package/src/llama.cpp/src/llama-graph.cpp +137 -233
- package/src/llama.cpp/src/llama-graph.h +49 -7
- package/src/llama.cpp/src/llama-hparams.cpp +17 -1
- package/src/llama.cpp/src/llama-hparams.h +34 -5
- package/src/llama.cpp/src/llama-kv-cache.cpp +654 -321
- package/src/llama.cpp/src/llama-kv-cache.h +201 -85
- package/src/llama.cpp/src/llama-memory.h +3 -2
- package/src/llama.cpp/src/llama-model.cpp +273 -94
- package/src/llama.cpp/src/llama-model.h +4 -1
- package/src/llama.cpp/tests/test-arg-parser.cpp +1 -1
- package/src/llama.cpp/tools/llama-bench/llama-bench.cpp +1 -0
- package/src/llama.cpp/tools/mtmd/CMakeLists.txt +13 -2
- package/src/llama.cpp/tools/mtmd/clip-impl.h +108 -11
- package/src/llama.cpp/tools/mtmd/clip.cpp +466 -88
- package/src/llama.cpp/tools/mtmd/clip.h +6 -4
- package/src/llama.cpp/tools/mtmd/miniaudio.h +93468 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.cpp +855 -0
- package/src/llama.cpp/tools/mtmd/mtmd-audio.h +62 -0
- package/src/llama.cpp/tools/mtmd/mtmd-cli.cpp +21 -14
- package/src/llama.cpp/tools/mtmd/mtmd-helper.cpp +36 -49
- package/src/llama.cpp/tools/mtmd/mtmd.cpp +362 -98
- package/src/llama.cpp/tools/mtmd/mtmd.h +52 -21
- package/src/llama.cpp/tools/run/run.cpp +2 -2
- package/src/llama.cpp/tools/server/server.cpp +158 -47
- package/src/llama.cpp/tools/server/utils.hpp +71 -43
- package/src/llama.cpp/tools/tts/tts.cpp +4 -2
package/src/llama.cpp/tools/mtmd/clip.cpp

@@ -35,6 +35,7 @@ struct clip_logger_state g_logger_state = {GGML_LOG_LEVEL_CONT, clip_log_callbac
 
 enum ffn_op_type {
     FFN_GELU,
+    FFN_GELU_ERF,
     FFN_SILU,
     FFN_GELU_QUICK,
 };
@@ -165,6 +166,9 @@ enum patch_merge_type {
 };
 
 struct clip_hparams {
+    bool has_vision = false;
+    bool has_audio = false;
+
     int32_t image_size;
     int32_t patch_size;
     int32_t n_embd;
@@ -191,6 +195,10 @@ struct clip_hparams {
     int32_t attn_window_size = 0;
     int32_t n_wa_pattern = 0;
     int32_t spatial_merge_size = 0;
+
+    // audio
+    int32_t n_mel_bins = 0; // whisper preprocessor
+    int32_t proj_stack_factor = 0; // ultravox
 };
 
 struct clip_layer {
@@ -332,6 +340,14 @@ struct clip_vision_model {
     // pixtral
     ggml_tensor * token_embd_img_break = nullptr;
     ggml_tensor * mm_patch_merger_w = nullptr;
+
+    // ultravox / whisper encoder
+    ggml_tensor * conv1d_1_w = nullptr;
+    ggml_tensor * conv1d_1_b = nullptr;
+    ggml_tensor * conv1d_2_w = nullptr;
+    ggml_tensor * conv1d_2_b = nullptr;
+    ggml_tensor * mm_norm_pre_w = nullptr;
+    ggml_tensor * mm_norm_mid_w = nullptr;
 };
 
 struct clip_ctx {
@@ -359,9 +375,12 @@ struct clip_ctx {
     int max_nodes = 8192;
     ggml_backend_sched_ptr sched;
 
-    clip_image_size load_image_size;
+    // for debugging
+    bool debug_graph = false;
+    std::vector<ggml_tensor *> debug_print_tensors;
 
     clip_ctx(clip_context_params & ctx_params) {
+        debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
         backend_cpu = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
         if (!backend_cpu) {
             throw std::runtime_error("failed to initialize CPU backend");
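Note on the hunk above: graph debugging is now a runtime toggle rather than a compile-time one. Setting the MTMD_DEBUG_GRAPH environment variable makes every cb() checkpoint copy its tensor into debug_print_tensors, which clip_image_batch_encode() prints after compute (see the hunks further down). A minimal standalone sketch of the same check-once pattern:

    #include <cstdlib>

    // Mirrors clip_ctx in the diff: read the environment once at construction,
    // then let hot paths branch on a cached bool instead of calling getenv()
    // on every callback.
    struct ctx_debug {
        bool debug_graph = false;
        ctx_debug() {
            debug_graph = std::getenv("MTMD_DEBUG_GRAPH") != nullptr;
        }
    };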
@@ -440,7 +459,7 @@ struct clip_graph {
         };
         ctx0_ptr.reset(ggml_init(params));
         ctx0 = ctx0_ptr.get();
-        gf = ggml_new_graph(ctx0);
+        gf = ggml_new_graph_custom(ctx0, ctx->max_nodes, false);
     }
 
     ggml_cgraph * build_siglip() {
@@ -522,7 +541,7 @@ struct clip_graph {
         ggml_set_input(pos_w);
 
         auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
-            return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta);
+            return build_rope_2d(ctx0, cur, pos_h, pos_w, hparams.rope_theta, true);
         };
 
         ggml_tensor * inp = build_inp();
@@ -936,6 +955,101 @@ struct clip_graph {
         return gf;
     }
 
+    ggml_cgraph * build_llama4() {
+        GGML_ASSERT(model.class_embedding != nullptr);
+        GGML_ASSERT(model.position_embeddings != nullptr);
+
+        const int n_pos = n_patches + 1; // +1 for [CLS]
+
+        // 2D input positions
+        ggml_tensor * pos_h = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(pos_h, "pos_h");
+        ggml_set_input(pos_h);
+
+        ggml_tensor * pos_w = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_pos);
+        ggml_set_name(pos_w, "pos_w");
+        ggml_set_input(pos_w);
+
+        ggml_tensor * inp = build_inp_raw();
+
+        // Llama4UnfoldConvolution
+        {
+            ggml_tensor * kernel = ggml_reshape_4d(ctx0, model.patch_embeddings_0,
+                                                    patch_size, patch_size, 3, n_embd);
+            inp = ggml_im2col(ctx0, kernel, inp, patch_size, patch_size, 0, 0, 1, 1, true, inp->type);
+            inp = ggml_mul_mat(ctx0, model.patch_embeddings_0, inp);
+            inp = ggml_reshape_2d(ctx0, inp, n_embd, n_patches);
+            cb(inp, "patch_conv", -1);
+        }
+
+        // add CLS token
+        inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+
+        // build ViT with 2D position embeddings
+        auto add_pos = [&](ggml_tensor * cur, const clip_layer &) {
+            // first half is X axis and second half is Y axis
+            // ref: https://github.com/huggingface/transformers/blob/40a493c7ed4f19f08eadb0639cf26d49bfa5e180/src/transformers/models/llama4/modeling_llama4.py#L1312
+            // ref: https://github.com/Blaizzy/mlx-vlm/blob/a57156aa87b33cca6e5ee6cfc14dd4ef8f611be6/mlx_vlm/models/llama4/vision.py#L441
+            return build_rope_2d(ctx0, cur, pos_w, pos_h, hparams.rope_theta, false);
+        };
+        ggml_tensor * cur = build_vit(
+                                inp, n_pos,
+                                NORM_TYPE_NORMAL,
+                                hparams.ffn_op,
+                                model.position_embeddings,
+                                add_pos);
+
+        // remove CLS token
+        cur = ggml_view_2d(ctx0, cur,
+            n_embd, n_patches,
+            ggml_row_size(cur->type, n_embd), 0);
+
+        // pixel shuffle
+        // based on Llama4VisionPixelShuffleMLP
+        // https://github.com/huggingface/transformers/blob/2932f318a20d9e54cc7aea052e040164d85de7d6/src/transformers/models/llama4/modeling_llama4.py#L1151
+        {
+            const int scale_factor = model.hparams.proj_scale_factor;
+            const int bsz = 1; // batch size, always 1 for now since we don't support batching
+            GGML_ASSERT(scale_factor > 0);
+            GGML_ASSERT(n_patches_x == n_patches_y); // llama4 only supports square images
+            cur = ggml_reshape_4d(ctx0, cur,
+                n_embd * scale_factor,
+                n_patches_x / scale_factor,
+                n_patches_y,
+                bsz);
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            cur = ggml_reshape_4d(ctx0, ggml_cont(ctx0, cur),
+                n_embd * scale_factor * scale_factor,
+                n_patches_x / scale_factor,
+                n_patches_y / scale_factor,
+                bsz);
+            cur = ggml_permute(ctx0, cur, 0, 2, 1, 3);
+            // flatten to 2D
+            cur = ggml_reshape_2d(ctx0, ggml_cont(ctx0, cur),
+                n_embd * scale_factor * scale_factor,
+                n_patches / scale_factor / scale_factor);
+            cb(cur, "pixel_shuffle", -1);
+        }
+
+        // based on Llama4VisionMLP2 (always uses GELU activation, no bias)
+        {
+            cur = ggml_mul_mat(ctx0, model.mm_model_mlp_1_w, cur);
+            cur = ggml_gelu(ctx0, cur);
+            cur = ggml_mul_mat(ctx0, model.mm_model_mlp_2_w, cur);
+            cur = ggml_gelu(ctx0, cur);
+            cb(cur, "adapter_mlp", -1);
+        }
+
+        // Llama4MultiModalProjector
+        cur = ggml_mul_mat(ctx0, model.mm_model_proj, cur);
+        cb(cur, "projected", -1);
+
+        // build the graph
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
     // this graph is used by llava, granite and glm
     // due to having embedding_stack (used by granite), we cannot reuse build_vit
     ggml_cgraph * build_llava() {
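The pixel-shuffle block in build_llama4() trades spatial resolution for channel width: the projector sees scale_factor-squared fewer tokens, each scale_factor-squared times wider, with the total element count unchanged. A quick shape walk-through (plain C++; the numbers are illustrative, not taken from any particular checkpoint):

    #include <cstdio>

    int main() {
        int n_embd       = 1408;                  // illustrative embedding width
        int n_patches_x  = 24, n_patches_y = 24;  // square grid, as the diff asserts
        int scale_factor = 2;                     // hparams.proj_scale_factor

        int n_patches = n_patches_x * n_patches_y;                 // 576 tokens in
        int n_out     = n_patches / (scale_factor * scale_factor); // 144 tokens out
        int d_out     = n_embd * scale_factor * scale_factor;      // 5632 dims per token

        // only the token/channel split moves; the element count is invariant
        std::printf("%d == %d\n", n_patches * n_embd, n_out * d_out);
        return 0;
    }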
@@ -1310,16 +1424,118 @@ struct clip_graph {
         return gf;
     }
 
+    // whisper encoder with custom projector
+    ggml_cgraph * build_whisper_enc() {
+        const int n_frames = img.nx;
+        const int n_pos = n_frames / 2;
+        GGML_ASSERT(model.position_embeddings->ne[1] >= n_pos);
+
+        ggml_tensor * inp = build_inp_raw(1);
+
+        // conv1d block
+        {
+            // convolution + gelu
+            ggml_tensor * cur = ggml_conv_1d_ph(ctx0, model.conv1d_1_w, inp, 1, 1);
+            cur = ggml_add(ctx0, cur, model.conv1d_1_b);
+
+            cur = ggml_gelu_erf(ctx0, cur);
+
+            cur = ggml_conv_1d_ph(ctx0, model.conv1d_2_w, cur, 2, 1);
+            cur = ggml_add(ctx0, cur, model.conv1d_2_b);
+
+            cur = ggml_gelu_erf(ctx0, cur);
+            // transpose
+            inp = ggml_cont(ctx0, ggml_transpose(ctx0, cur));
+            cb(inp, "after_conv1d", -1);
+        }
+
+        // sanity check (only check one layer, but it should be the same for all)
+        GGML_ASSERT(model.layers[0].ln_1_w && model.layers[0].ln_1_b);
+        GGML_ASSERT(model.layers[0].ln_2_w && model.layers[0].ln_2_b);
+        GGML_ASSERT(model.layers[0].q_b);
+        GGML_ASSERT(model.layers[0].v_b);
+        GGML_ASSERT(!model.layers[0].k_b); // no bias for k
+        GGML_ASSERT(model.post_ln_w && model.post_ln_b);
+
+        ggml_tensor * pos_embd_selected = ggml_view_2d(
+            ctx0, model.position_embeddings,
+            model.position_embeddings->ne[0], n_pos,
+            model.position_embeddings->nb[1], 0
+        );
+        ggml_tensor * cur = build_vit(
+                                inp, n_pos,
+                                NORM_TYPE_NORMAL,
+                                hparams.ffn_op,
+                                pos_embd_selected,
+                                nullptr);
+
+        cb(cur, "after_transformer", -1);
+
+        // StackAudioFrames
+        // https://huggingface.co/fixie-ai/ultravox-v0_5-llama-3_2-1b/blob/main/ultravox_model.py
+        {
+            int64_t stride = n_embd * hparams.proj_stack_factor;
+            int64_t padded_len = GGML_PAD(ggml_nelements(cur), stride);
+            int64_t pad = padded_len - ggml_nelements(cur);
+            if (pad > 0) {
+                cur = ggml_view_1d(ctx0, cur, ggml_nelements(cur), 0);
+                cur = ggml_pad(ctx0, cur, pad, 0, 0, 0);
+            }
+            cur = ggml_view_2d(ctx0, cur, stride, padded_len / stride,
+                                ggml_row_size(cur->type, stride), 0);
+        }
+
+        cb(cur, "after_stacked", -1);
+
+        // UltravoxProjector
+        {
+            // pre-norm
+            cur = ggml_rms_norm(ctx0, cur, 1e-6);
+            cur = ggml_mul(ctx0, cur, model.mm_norm_pre_w);
+
+            // ffn in
+            cur = ggml_mul_mat(ctx0, model.mm_1_w, cur);
+
+            // swiglu
+            {
+                int64_t split_point = cur->ne[0] / 2;
+                ggml_tensor * x0 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], 0));
+                ggml_tensor * x1 = ggml_cont(ctx0, ggml_view_2d(ctx0, cur, split_point, cur->ne[1], cur->nb[1], split_point * ggml_element_size(cur)));
+
+                // see SwiGLU in ultravox_model.py, the second half passed through is silu, not the first half
+                x1 = ggml_silu(ctx0, x1);
+                cur = ggml_mul(ctx0, x0, x1);
+            }
+
+            // mid-norm
+            cur = ggml_rms_norm(ctx0, cur, 1e-6);
+            cur = ggml_mul(ctx0, cur, model.mm_norm_mid_w);
+
+            // ffn out
+            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
+        }
+
+        cb(cur, "projected", -1);
+
+        ggml_build_forward_expand(gf, cur);
+
+        return gf;
+    }
+
 private:
     //
     // utility functions
     //
 
-    void cb(ggml_tensor * cur, const char * name, int il) const {
-        // TODO: implement this
-        GGML_UNUSED(cur);
-        GGML_UNUSED(name);
-        GGML_UNUSED(il);
+    void cb(ggml_tensor * cur0, const char * name, int il) const {
+        if (ctx->debug_graph) {
+            ggml_tensor * cur = ggml_cpy(ctx0, cur0, ggml_dup_tensor(ctx0, cur0));
+            std::string cur_name = il >= 0 ? std::string(name) + "_" + std::to_string(il) : name;
+            ggml_set_name(cur, cur_name.c_str());
+            ggml_set_output(cur);
+            ggml_build_forward_expand(gf, cur);
+            ctx->debug_print_tensors.push_back(cur);
+        }
     }
 
     // build vision transformer (ViT) cgraph
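StackAudioFrames in build_whisper_enc() is a pad-and-reinterpret: the encoder output is flattened, padded up to a multiple of n_embd * proj_stack_factor, then viewed as rows that each stack proj_stack_factor consecutive frames. The arithmetic in isolation (illustrative values; pad_to plays the role of GGML_PAD):

    #include <cstdint>
    #include <cstdio>

    // round x up to a multiple of n, like GGML_PAD in the diff
    static int64_t pad_to(int64_t x, int64_t n) { return (x + n - 1) / n * n; }

    int main() {
        int64_t n_embd = 1280, stack_factor = 8, n_frames = 301; // illustrative

        int64_t stride     = n_embd * stack_factor;  // elements per stacked row
        int64_t n_elem     = n_embd * n_frames;
        int64_t padded_len = pad_to(n_elem, stride); // what ggml_pad grows the buffer to
        int64_t n_rows     = padded_len / stride;    // rows handed to the projector

        std::printf("pad by %lld -> %lld rows of %lld\n",
                    (long long)(padded_len - n_elem), (long long)n_rows, (long long)stride);
        return 0;
    }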
@@ -1460,8 +1676,8 @@ private:
         return inp;
     }
 
-    ggml_tensor * build_inp_raw() {
-        ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, 3);
+    ggml_tensor * build_inp_raw(int channels = 3) {
+        ggml_tensor * inp_raw = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, img.nx, img.ny, channels);
         ggml_set_name(inp_raw, "inp_raw");
         ggml_set_input(inp_raw);
         return inp_raw;
@@ -1539,6 +1755,11 @@ private:
                     cur = ggml_gelu(ctx0, cur);
                     cb(cur, "ffn_gelu", il);
                 } break;
+            case FFN_GELU_ERF:
+                {
+                    cur = ggml_gelu_erf(ctx0, cur);
+                    cb(cur, "ggml_gelu_erf", il);
+                } break;
             case FFN_GELU_QUICK:
                 {
                     cur = ggml_gelu_quick(ctx0, cur);
@@ -1630,9 +1851,10 @@ private:
     static ggml_tensor * build_rope_2d(
         ggml_context * ctx0,
         ggml_tensor * cur,
-        ggml_tensor * pos_h,
-        ggml_tensor * pos_w,
-        const float freq_base
+        ggml_tensor * pos_a, // first half
+        ggml_tensor * pos_b, // second half
+        const float freq_base,
+        const bool interleave_freq
     ) {
         const int64_t n_dim = cur->ne[0];
         const int64_t n_head = cur->ne[1];
@@ -1646,7 +1868,9 @@ private:
         // ^ don't ask me why, it's math! -2(2i) / n_dim == -2i / (n_dim/2)
         // then for the second half, we use freq_scale to shift the inv_freq
         // ^ why? replace (2i) with (2i+1) in the above equation
-        const float freq_scale_odd = std::pow(freq_base, (float)-2/n_dim);
+        const float freq_scale_odd = interleave_freq
+                                        ? std::pow(freq_base, (float)-2/n_dim)
+                                        : 1.0;
 
         // first half
         ggml_tensor * first;
@@ -1659,7 +1883,7 @@ private:
         first = ggml_rope_ext(
             ctx0,
             first,
-            pos_h,      // positions
+            pos_a,      // positions
             nullptr,    // freq factors
             n_dim/2,    // n_dims
             0, 0, freq_base,
@@ -1679,7 +1903,7 @@ private:
         second = ggml_rope_ext(
             ctx0,
             second,
-            pos_w,      // positions
+            pos_b,      // positions
             nullptr,    // freq factors
             n_dim/2,    // n_dims
             0, 0, freq_base,
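The new interleave_freq flag captures the one behavioral difference between the two build_rope_2d() callers: pixtral (true) emulates interleaved frequencies, so the second half runs at an inv_freq shifted by pow(freq_base, -2/n_dim), while llama4 (false) splits the head dimension into an x half and a y half on identical frequencies, driven only by different position tensors. The shift itself, in isolation:

    #include <cmath>
    #include <cstdio>

    int main() {
        const float freq_base = 10000.0f;
        const int   n_dim     = 8; // illustrative head dimension

        // "replace (2i) with (2i+1)":
        // pow(b, -2(2i+1)/n) == pow(b, -2(2i)/n) * pow(b, -2/n)
        const float scale_interleaved = std::pow(freq_base, -2.0f / n_dim);
        const float scale_split       = 1.0f; // llama4: both halves identical

        std::printf("interleaved: %g, split: %g\n", scale_interleaved, scale_split);
        return 0;
    }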
@@ -1723,6 +1947,14 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 res = graph.build_internvl();
             } break;
+        case PROJECTOR_TYPE_LLAMA4:
+            {
+                res = graph.build_llama4();
+            } break;
+        case PROJECTOR_TYPE_ULTRAVOX:
+            {
+                res = graph.build_whisper_enc();
+            } break;
         default:
             {
                 res = graph.build_llava();
@@ -1806,18 +2038,30 @@ struct clip_model_loader {
 
         // other hparams
         {
-
-
-
-
-            get_u32(
-            get_u32(
-            get_u32(
-
-            get_u32(
-
-
-
+            get_bool(KEY_HAS_AUDIO_ENC, hparams.has_audio, false);
+            get_bool(KEY_HAS_VISION_ENC, hparams.has_vision, false);
+
+            const char * prefix = hparams.has_vision ? "vision" : "audio";
+            get_u32(string_format(KEY_N_EMBD, prefix), hparams.n_embd);
+            get_u32(string_format(KEY_N_HEAD, prefix), hparams.n_head);
+            get_u32(string_format(KEY_N_FF, prefix), hparams.n_ff);
+            get_u32(string_format(KEY_N_BLOCK, prefix), hparams.n_layer);
+            get_u32(string_format(KEY_PROJ_DIM, prefix), hparams.projection_dim);
+            get_f32(string_format(KEY_LAYER_NORM_EPS, prefix), hparams.eps);
+
+            if (hparams.has_vision) {
+                get_u32(KEY_IMAGE_SIZE, hparams.image_size);
+                get_u32(KEY_PATCH_SIZE, hparams.patch_size);
+                get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
+                get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
+                get_i32(KEY_MINICPMV_VERSION, ctx_clip.minicpmv_version, false); // legacy
+
+            } else if (hparams.has_audio) {
+                get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
+
+            } else {
+                throw std::runtime_error(string_format("%s: neither vision nor audio encoder is present\n", __func__));
+            }
 
             // default warmup value
             hparams.warmup_image_size = hparams.image_size;
@@ -1855,7 +2099,7 @@ struct clip_model_loader {
             }
         }
 
-        {
+        if (hparams.has_vision) {
             int idx_mean = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_MEAN);
             int idx_std = gguf_find_key(ctx_gguf.get(), KEY_IMAGE_STD);
             GGML_ASSERT(idx_mean >= 0 && "image_mean not found");
@@ -1926,24 +2170,56 @@ struct clip_model_loader {
                     hparams.warmup_image_size = hparams.patch_size * 8;
                     get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
                 } break;
+            case PROJECTOR_TYPE_LLAMA4:
+                {
+                    hparams.rope_theta = 10000.0f;
+                    get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor);
+
+                    // borrowed from llava-1.6
+                    const int isize = hparams.image_size;
+                    hparams.image_grid_pinpoints = {
+                        isize,   isize*2, // 336, 672
+                        isize*2, isize,   // 672, 336
+                        isize*2, isize*2, // 672, 672
+                        isize*3, isize,   // 1008, 336
+                        isize,   isize*3, // 336, 1008
+                    };
+                } break;
+            case PROJECTOR_TYPE_ULTRAVOX:
+                {
+                    get_u32(KEY_A_PROJ_STACK_FACTOR, hparams.proj_stack_factor);
+                    if (hparams.n_mel_bins != 128) {
+                        throw std::runtime_error(string_format("%s: only 128 mel bins are supported for ultravox\n", __func__));
+                    }
+                    hparams.ffn_op = FFN_GELU_ERF;
+                    log_ffn_op = "gelu_erf"; // temporary solution for logging
+                } break;
             default:
                 break;
         }
 
         LOG_INF("%s: projector: %s\n", __func__, proj_type.c_str());
+        LOG_INF("%s: has_vision_encoder: %d\n", __func__, hparams.has_vision);
+        LOG_INF("%s: has_audio_encoder: %d\n", __func__, hparams.has_audio);
         LOG_INF("%s: n_embd: %d\n", __func__, hparams.n_embd);
         LOG_INF("%s: n_head: %d\n", __func__, hparams.n_head);
         LOG_INF("%s: n_ff: %d\n", __func__, hparams.n_ff);
         LOG_INF("%s: n_layer: %d\n", __func__, hparams.n_layer);
+        LOG_INF("%s: ffn_op: %s\n", __func__, log_ffn_op.c_str());
         LOG_INF("%s: projection_dim: %d\n", __func__, hparams.projection_dim);
-        LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size);
-        LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
         LOG_INF("\n");
-
-
-
-
-
+        if (hparams.has_vision) {
+            LOG_INF("%s: image_size: %d\n", __func__, hparams.image_size);
+            LOG_INF("%s: patch_size: %d\n", __func__, hparams.patch_size);
+            LOG_INF("%s: has_llava_proj: %d\n", __func__, ctx_clip.has_llava_projector);
+            LOG_INF("%s: minicpmv_version: %d\n", __func__, ctx_clip.minicpmv_version);
+            LOG_INF("%s: proj_scale_factor: %d\n", __func__, hparams.proj_scale_factor);
+            LOG_INF("%s: n_wa_pattern: %d\n", __func__, hparams.n_wa_pattern);
+        } else if (hparams.has_audio) {
+            LOG_INF("%s: n_mel_bins: %d\n", __func__, hparams.n_mel_bins);
+            LOG_INF("%s: proj_stack_factor: %d\n", __func__, hparams.proj_stack_factor);
+        }
+        LOG_INF("\n");
         LOG_INF("%s: model size: %.2f MiB\n", __func__, model_size / 1024.0 / 1024.0);
         LOG_INF("%s: metadata size: %.2f MiB\n", __func__, ggml_get_mem_size(ctx_meta.get()) / 1024.0 / 1024.0);
     }
@@ -1954,6 +2230,9 @@ struct clip_model_loader {
         std::map<std::string, size_t> tensor_offset;
         std::vector<ggml_tensor *> tensors_to_load;
 
+        // TODO @ngxson : support both audio and video in the future
+        const char * prefix = hparams.has_audio ? "a" : "v";
+
         // get offsets
         for (int64_t i = 0; i < gguf_get_n_tensors(ctx_gguf.get()); ++i) {
             const char * name = gguf_get_tensor_name(ctx_gguf.get(), i);
@@ -1991,47 +2270,47 @@ struct clip_model_loader {
 
         vision_model.class_embedding = get_tensor(TN_CLASS_EMBD, false);
 
-        vision_model.pre_ln_w = get_tensor(string_format(TN_LN_PRE,
-        vision_model.pre_ln_b = get_tensor(string_format(TN_LN_PRE,
+        vision_model.pre_ln_w = get_tensor(string_format(TN_LN_PRE, prefix, "weight"), false);
+        vision_model.pre_ln_b = get_tensor(string_format(TN_LN_PRE, prefix, "bias"), false);
 
-        vision_model.post_ln_w = get_tensor(string_format(TN_LN_POST,
-        vision_model.post_ln_b = get_tensor(string_format(TN_LN_POST,
+        vision_model.post_ln_w = get_tensor(string_format(TN_LN_POST, prefix, "weight"), false);
+        vision_model.post_ln_b = get_tensor(string_format(TN_LN_POST, prefix, "bias"), false);
 
         vision_model.patch_bias = get_tensor(TN_PATCH_BIAS, false);
         vision_model.patch_embeddings_0 = get_tensor(TN_PATCH_EMBD, false);
         vision_model.patch_embeddings_1 = get_tensor(TN_PATCH_EMBD_1, false);
 
-        vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD,
+        vision_model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
 
         // layers
         vision_model.layers.resize(hparams.n_layer);
         for (int il = 0; il < hparams.n_layer; ++il) {
             auto & layer = vision_model.layers[il];
-            layer.k_w = get_tensor(string_format(TN_ATTN_K,
-            layer.q_w = get_tensor(string_format(TN_ATTN_Q,
-            layer.v_w = get_tensor(string_format(TN_ATTN_V,
-            layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT,
-            layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM,
-            layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM,
-            layer.ln_1_w = get_tensor(string_format(TN_LN_1,
-            layer.ln_2_w = get_tensor(string_format(TN_LN_2,
-            layer.ls_1_w = get_tensor(string_format(TN_LS_1,
-            layer.ls_2_w = get_tensor(string_format(TN_LS_2,
-
-            layer.k_b = get_tensor(string_format(TN_ATTN_K,
-            layer.q_b = get_tensor(string_format(TN_ATTN_Q,
-            layer.v_b = get_tensor(string_format(TN_ATTN_V,
-            layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT,
-            layer.ln_1_b = get_tensor(string_format(TN_LN_1,
-            layer.ln_2_b = get_tensor(string_format(TN_LN_2,
+            layer.k_w = get_tensor(string_format(TN_ATTN_K, prefix, il, "weight"));
+            layer.q_w = get_tensor(string_format(TN_ATTN_Q, prefix, il, "weight"));
+            layer.v_w = get_tensor(string_format(TN_ATTN_V, prefix, il, "weight"));
+            layer.o_w = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "weight"));
+            layer.k_norm = get_tensor(string_format(TN_ATTN_K_NORM, prefix, il, "weight"), false);
+            layer.q_norm = get_tensor(string_format(TN_ATTN_Q_NORM, prefix, il, "weight"), false);
+            layer.ln_1_w = get_tensor(string_format(TN_LN_1, prefix, il, "weight"), false);
+            layer.ln_2_w = get_tensor(string_format(TN_LN_2, prefix, il, "weight"), false);
+            layer.ls_1_w = get_tensor(string_format(TN_LS_1, prefix, il, "weight"), false); // no bias
+            layer.ls_2_w = get_tensor(string_format(TN_LS_2, prefix, il, "weight"), false); // no bias
+
+            layer.k_b = get_tensor(string_format(TN_ATTN_K, prefix, il, "bias"), false);
+            layer.q_b = get_tensor(string_format(TN_ATTN_Q, prefix, il, "bias"), false);
+            layer.v_b = get_tensor(string_format(TN_ATTN_V, prefix, il, "bias"), false);
+            layer.o_b = get_tensor(string_format(TN_ATTN_OUTPUT, prefix, il, "bias"), false);
+            layer.ln_1_b = get_tensor(string_format(TN_LN_1, prefix, il, "bias"), false);
+            layer.ln_2_b = get_tensor(string_format(TN_LN_2, prefix, il, "bias"), false);
 
             // ffn
-            layer.ff_up_w = get_tensor(string_format(TN_FFN_UP,
-            layer.ff_up_b = get_tensor(string_format(TN_FFN_UP,
-            layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE,
-            layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE,
-            layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN,
-            layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN,
+            layer.ff_up_w = get_tensor(string_format(TN_FFN_UP, prefix, il, "weight"));
+            layer.ff_up_b = get_tensor(string_format(TN_FFN_UP, prefix, il, "bias"), false);
+            layer.ff_gate_w = get_tensor(string_format(TN_FFN_GATE, prefix, il, "weight"), false);
+            layer.ff_gate_b = get_tensor(string_format(TN_FFN_GATE, prefix, il, "bias"), false);
+            layer.ff_down_w = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "weight"));
+            layer.ff_down_b = get_tensor(string_format(TN_FFN_DOWN, prefix, il, "bias"), false);
 
             // some models already exported with legacy (incorrect) naming which is quite messy, let's fix it here
             // note: Qwen model converted from the old surgery script has n_ff = 0, so we cannot use n_ff to check!
@@ -2173,6 +2452,17 @@ struct clip_model_loader {
                     vision_model.mm_input_norm_w = get_tensor(TN_MM_INP_NORM, false);
                     vision_model.mm_patch_merger_w = get_tensor(TN_MM_PATCH_MERGER, false);
                 } break;
+            case PROJECTOR_TYPE_ULTRAVOX:
+                {
+                    vision_model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
+                    vision_model.conv1d_1_b = get_tensor(string_format(TN_CONV1D, 1, "bias"));
+                    vision_model.conv1d_2_w = get_tensor(string_format(TN_CONV1D, 2, "weight"));
+                    vision_model.conv1d_2_b = get_tensor(string_format(TN_CONV1D, 2, "bias"));
+                    vision_model.mm_1_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 1, "weight"));
+                    vision_model.mm_2_w = get_tensor(string_format(TN_MM_AUDIO_MLP, 2, "weight"));
+                    vision_model.mm_norm_pre_w = get_tensor(string_format(TN_MM_NORM_PRE, "weight"));
+                    vision_model.mm_norm_mid_w = get_tensor(string_format(TN_MM_NORM_MID, "weight"));
+                } break;
             case PROJECTOR_TYPE_INTERNVL:
                 {
                     vision_model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
@@ -2182,6 +2472,12 @@ struct clip_model_loader {
                     vision_model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
                     vision_model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
                 } break;
+            case PROJECTOR_TYPE_LLAMA4:
+                {
+                    vision_model.mm_model_proj = get_tensor(TN_MM_PROJECTOR);
+                    vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
+                    vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 2, "weight"));
+                } break;
             default:
                 GGML_ASSERT(false && "unknown projector type");
         }
@@ -2224,13 +2520,19 @@ struct clip_model_loader {
     }
 
     void alloc_compute_meta() {
+        const auto & hparams = ctx_clip.vision_model.hparams;
         ctx_clip.buf_compute_meta.resize(ctx_clip.max_nodes * ggml_tensor_overhead() + ggml_graph_overhead());
 
         // create a fake batch
        clip_image_f32_batch batch;
         clip_image_f32_ptr img(clip_image_f32_init());
-
-
+        if (hparams.has_vision) {
+            img->nx = hparams.warmup_image_size;
+            img->ny = hparams.warmup_image_size;
+        } else {
+            img->nx = 1024; // TODO @ngxson : use a better default
+            img->ny = hparams.n_mel_bins;
+        }
         img->buf.resize(img->nx * img->ny * 3);
         batch.entries.push_back(std::move(img));
 
@@ -2328,14 +2630,6 @@ struct clip_ctx * clip_init(const char * fname, struct clip_context_params ctx_p
     return ctx_clip;
 }
 
-void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
-    ctx_clip->load_image_size = *load_image_size; // copy
-}
-
-struct clip_image_size * clip_get_load_image_size(struct clip_ctx * ctx_clip) {
-    return &ctx_clip->load_image_size;
-}
-
 struct clip_image_size * clip_image_size_init() {
     struct clip_image_size * load_image_size = new struct clip_image_size();
     load_image_size->width = 448;
@@ -2849,7 +3143,7 @@ private:
 
     // used by llava 1.6 with custom list of pinpoints
     static clip_image_size select_best_resolution(const std::vector<int32_t> & pinpoints, const clip_image_size & original_size) {
-        std::vector<clip_image_size> possible_resolutions;
+        std::vector<clip_image_size> possible_resolutions; // TODO @ngxson : construct this inside hparams, not here
         for (size_t i = 0; i < pinpoints.size(); i += 2) {
             possible_resolutions.push_back(clip_image_size{pinpoints[i], pinpoints[i+1]});
         }
@@ -2916,12 +3210,6 @@ private:
     }
 };
 
-// TODO @ngxson : decprecate the load_image_size singleton pattern
-int clip_uhd_num_image_embeds_col(struct clip_ctx * ctx_clip) {
-    const auto inst = llava_uhd::get_slice_instructions(ctx_clip, ctx_clip->load_image_size);
-    return inst.grid_size.width;
-}
-
 // returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
 // res_imgs memory is being allocated here, previous allocations will be freed if found
 bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
@@ -2943,9 +3231,12 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
             normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
             res_imgs->entries.push_back(std::move(res));
         }
+
+        res_imgs->grid_x = inst.grid_size.width;
+        res_imgs->grid_y = inst.grid_size.height;
         return true;
-    }
-    else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
+
+    } else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
         clip_image_u8 resized;
         auto patch_size = params.patch_size * 2;
         auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, patch_size, params.image_size);
@@ -2971,8 +3262,8 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
         res_imgs->entries.push_back(std::move(img_f32));
         return true;
-    }
-    else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
+
+    } else if (ctx->proj_type == PROJECTOR_TYPE_PIXTRAL) {
         clip_image_u8 resized_image;
         auto new_size = image_manipulation::calc_size_preserved_ratio(original_size, params.patch_size, params.image_size);
         image_manipulation::bilinear_resize(*img, resized_image, new_size.width, new_size.height);
@@ -2980,6 +3271,22 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
         normalize_image_u8_to_f32(resized_image, *img_f32, ctx->image_mean, ctx->image_std);
         res_imgs->entries.push_back(std::move(img_f32));
         return true;
+
+    } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
+        GGML_ASSERT(!params.image_grid_pinpoints.empty());
+        auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
+        std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
+
+        for (size_t i = 0; i < imgs.size(); ++i) {
+            clip_image_f32_ptr res(clip_image_f32_init());
+            normalize_image_u8_to_f32(*imgs[i], *res, ctx->image_mean, ctx->image_std);
+            res_imgs->entries.push_back(std::move(res));
+        }
+
+        res_imgs->grid_x = inst.grid_size.width;
+        res_imgs->grid_y = inst.grid_size.height;
+        return true;
+
     }
 
     // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
@@ -3098,6 +3405,7 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
     const auto & params = ctx->vision_model.hparams;
 
     int n_patches = (params.image_size / params.patch_size) * (params.image_size / params.patch_size);
+    int scale_factor = ctx->vision_model.hparams.proj_scale_factor;
 
     if (ctx->proj_type == PROJECTOR_TYPE_LDP
             || ctx->proj_type == PROJECTOR_TYPE_LDPV2
@@ -3136,6 +3444,12 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
         int n_patches_x = img->nx / params.patch_size / (n_merge > 0 ? n_merge : 1);
         int n_patches_y = img->ny / params.patch_size / (n_merge > 0 ? n_merge : 1);
         n_patches = n_patches_y*n_patches_x + n_patches_y - 1; // + one [IMG_BREAK] per row, except the last row
+    } else if (ctx->proj_type == PROJECTOR_TYPE_LLAMA4) {
+        n_patches /= (scale_factor * scale_factor);
+    } else if (ctx->proj_type == PROJECTOR_TYPE_ULTRAVOX) {
+        const int proj_stack_factor = ctx->vision_model.hparams.proj_stack_factor;
+        const int n_len = CLIP_ALIGN(img->nx, proj_stack_factor);
+        n_patches = n_len / proj_stack_factor / 2;
    }
 
     return n_patches;
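For ULTRAVOX the output token count follows directly from the two stages earlier in the pipeline: align the mel frame count up to the stack factor, divide by it, then halve for the stride-2 conv1d. Checked in isolation (illustrative values; align_up stands in for CLIP_ALIGN):

    #include <cstdio>

    // round x up to a multiple of n, the role CLIP_ALIGN plays in the diff
    static int align_up(int x, int n) { return (x + n - 1) / n * n; }

    int main() {
        int n_frames          = 3000; // illustrative: ~30 s at 100 mel frames/s
        int proj_stack_factor = 8;

        int n_len    = align_up(n_frames, proj_stack_factor);
        int n_tokens = n_len / proj_stack_factor / 2; // /2 for the stride-2 conv1d

        std::printf("%d mel frames -> %d embedding tokens\n", n_frames, n_tokens);
        return 0;
    }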
@@ -3247,6 +3561,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     }
 
     // build the inference graph
+    ctx->debug_print_tensors.clear();
     ggml_backend_sched_reset(ctx->sched.get());
     ggml_cgraph * gf = clip_image_build_graph(ctx, imgs);
     ggml_backend_sched_alloc_graph(ctx->sched.get(), gf);
@@ -3261,8 +3576,8 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     const int patch_size = hparams.patch_size;
     const int num_patches = ((image_size_width / patch_size) * (image_size_height / patch_size));
     const int n_pos = num_patches + (model.class_embedding ? 1 : 0);
-    const int pos_w = ctx->load_image_size.width / patch_size;
-    const int pos_h = ctx->load_image_size.height / patch_size;
+    const int pos_w = image_size_width / patch_size;
+    const int pos_h = image_size_height / patch_size;
 
     const bool use_window_attn = hparams.n_wa_pattern > 0; // for qwen2.5vl
@@ -3292,7 +3607,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     };
 
     // set input pixel values
-    {
+    if (!imgs.is_audio) {
         size_t nelem = 0;
         for (const auto & img : imgs.entries) {
             nelem += img->nx * img->ny * 3;
@@ -3329,6 +3644,16 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
             }
         }
         set_input_f32("inp_raw", inp_raw);
+
+    } else {
+        // audio input
+        GGML_ASSERT(imgs.entries.size() == 1);
+        const auto & mel_inp = imgs.entries[0];
+        const int n_step = mel_inp->nx;
+        const int n_mel = mel_inp->ny;
+        std::vector<float> inp_raw(n_step * n_mel);
+        std::memcpy(inp_raw.data(), mel_inp->buf.data(), n_step * n_mel * sizeof(float));
+        set_input_f32("inp_raw", inp_raw);
     }
 
     // set input per projector
@@ -3525,9 +3850,27 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_GEMMA3:
         case PROJECTOR_TYPE_IDEFICS3:
         case PROJECTOR_TYPE_INTERNVL:
+        case PROJECTOR_TYPE_ULTRAVOX:
            {
                 // do nothing
             } break;
+        case PROJECTOR_TYPE_LLAMA4:
+            {
+                // set the 2D positions
+                int n_patches_per_col = image_size_width / patch_size;
+                std::vector<int> pos_data(num_patches + 1, 0); // +1 for the [CLS] token
+                // last pos is always kept 0, it's for CLS
+                // dimension H
+                for (int i = 0; i < num_patches; i++) {
+                    pos_data[i] = (i / n_patches_per_col) + 1;
+                }
+                set_input_i32("pos_h", pos_data);
+                // dimension W
+                for (int i = 0; i < num_patches; i++) {
+                    pos_data[i] = (i % n_patches_per_col) + 1;
+                }
+                set_input_i32("pos_w", pos_data);
+            } break;
         default:
             GGML_ABORT("Unknown projector type");
     }
@@ -3548,6 +3891,18 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         return false;
     }
 
+    // print debug nodes
+    if (ctx->debug_graph) {
+        LOG_INF("\n\n---\n\n");
+        LOG_INF("\n\nDebug graph:\n\n");
+        for (ggml_tensor * t : ctx->debug_print_tensors) {
+            std::vector<uint8_t> data(ggml_nbytes(t));
+            ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));
+            print_tensor_shape(t);
+            print_tensor_data(t, data.data(), 3);
+        }
+    }
+
     // the last node is the embedding tensor
     ggml_tensor * embeddings = ggml_graph_node(gf, -1);
 
@@ -3594,8 +3949,12 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         return ctx->vision_model.mm_input_proj_w->ne[0];
     case PROJECTOR_TYPE_IDEFICS3:
         return ctx->vision_model.projection->ne[1];
+    case PROJECTOR_TYPE_ULTRAVOX:
+        return ctx->vision_model.mm_2_w->ne[1];
     case PROJECTOR_TYPE_INTERNVL:
         return ctx->vision_model.mm_3_w->ne[1];
+    case PROJECTOR_TYPE_LLAMA4:
+        return ctx->vision_model.mm_model_proj->ne[1];
     default:
         GGML_ABORT("Unknown projector type");
     }
@@ -3624,6 +3983,14 @@ bool clip_is_gemma3(const struct clip_ctx * ctx) {
     return ctx->proj_type == PROJECTOR_TYPE_GEMMA3;
 }
 
+bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
+    return ctx->vision_model.hparams.has_vision;
+}
+
+bool clip_has_audio_encoder(const struct clip_ctx * ctx) {
+    return ctx->vision_model.hparams.has_audio;
+}
+
 bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
     clip_image_f32 clip_img;
     clip_img.buf.resize(h * w * 3);
@@ -3644,3 +4011,14 @@ bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img,
 projector_type clip_get_projector_type(const struct clip_ctx * ctx) {
     return ctx->proj_type;
 }
+
+void clip_image_f32_batch_add_mel(struct clip_image_f32_batch * batch, int n_mel, int n_frames, float * mel) {
+    clip_image_f32 * audio = new clip_image_f32;
+    audio->nx = n_frames;
+    audio->ny = n_mel;
+    audio->buf.resize(n_frames * n_mel);
+    std::memcpy(audio->buf.data(), mel, n_frames * n_mel * sizeof(float));
+
+    batch->entries.push_back(clip_image_f32_ptr(audio));
+    batch->is_audio = true;
+}